#!/usr/bin/perl
use CGI qw/:cgi-lib/;
use CGI::Carp qw(fatalsToBrowser);
use LWP::UserAgent;
use HTTP::Request;
use HTML::LinkExtor;

print "Content-type: text/html\n\n";
#####################################################
# Website Crawler v. 1.0.0
# This script was created by Eli at BlueHatSEO.com
# For further instructions on how to use it,
# go to https://www.bluehatseo.com/complete-guide-to-scraping-pt-2-crawling/
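# Usage: run it from your cgi-bin or from a shell (e.g. "perl crawler.pl",
# where crawler.pl is whatever you name this file); discovered URLs
# are collected in pages.txt alongside the script.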
#####################
# Domain name. Do not include the trailing slash.
# To limit the crawl to a single subdomain, include it (e.g. the www.).
# To include all subdomains, just leave this as domain.com
$domain="www.test.com";
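# e.g. $domain="example.com";     # crawls example.com plus all of its subdomains
# e.g. $domain="www.example.com"; # crawls only the www subdomain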
#####################
# Do you want the script to crawl dynamic pages?
# Dynamic pages are those whose URLs contain a ?
# 0=No 1=Yes
$crawldynamic=0;
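# e.g. $crawldynamic=1; # also queue URLs that contain a query string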
#####################
# How long (in seconds) would you like
# the script to pause between pages
$waittime=0;
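# e.g. $waittime=2; # pause two seconds between fetches to go easy on the server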
#####################
# Stop after this many pages have been added to the queue.
$stopcount=10;
#############################
# Begin The Crawl
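# Seed the queue file with the start page.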
open(OUTF,">pages.txt") or die "Cannot write pages.txt: $!";
print OUTF "http://$domain/\n";
close(OUTF);
$count=0;
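# pages.txt doubles as the work queue: this loop reads it from the top
# while &list appends newly discovered URLs to the bottom.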
open(INF,"pages.txt") or die "Cannot read pages.txt: $!";
while(<INF>){
$URL=$_;
chomp($URL);
$browser = LWP::UserAgent->new();
$browser->timeout(10);
my $request = HTTP::Request->new(GET => $URL);
my $response = $browser->request($request);
if ($response->is_error()) {printf "%s\n", $response->status_line; next;}
$contents = $response->content();
my ($page_parser) = HTML::LinkExtor->new(undef, $URL);
$page_parser->parse($contents)->eof;
@links = $page_parser->links;
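# Each link is an arrayref of ($tag, $attr => $url, ...); because a base URL
# was passed to HTML::LinkExtor->new, $$link[2] is already an absolute URL.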
foreach $link (@links) {
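# Skip stylesheets, scripts, images, feeds, trackbacks, and in-page anchors,
# and keep only links that stay on the target domain.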
$$link[2] =~ tr/A-Z/a-z/;
if(($$link[2] !~ /\.css/i)
&& ($$link[2] !~ /\/feed\//i)
&& ($$link[2] !~ /\/rss\//i)
&& ($$link[2] !~ /wp-comments-post\.php/i)
&& ($$link[2] !~ /#/)
&& ($$link[2] !~ /\.js/i)
&& ($$link[2] !~ /xmlrpc\.php/i)
&& ($$link[2] !~ /\/trackback\//i)
&& ($$link[2] !~ /\.jpg/i)
&& ($$link[2] !~ /\.jpeg/i)
&& ($$link[2] !~ /\.gif/i)
&& ($$link[2] !~ /\.ico/i)
&& ($$link[2] !~ /\.swf/i)
&& ($$link[2] =~ /\Q$domain\E/i)){
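# Static URLs are always queued; URLs containing a ? are queued
# only when $crawldynamic is set to 1.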
if(($crawldynamic == 1) || ($$link[2] !~ /\?/)){
&list;
}
}
}
sleep($waittime);
}
close(INF);
print qq~
Finished!~;
sub list {
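# Append $$link[2] to pages.txt unless it is already queued,
# with or without a trailing slash.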
$found=0;
open(PFILE,"pages.txt") or die "Cannot read pages.txt: $!";
@pages=<PFILE>;
close(PFILE);
foreach $line (@pages){
chomp($line);
if(($line eq "$$link[2]") or ($line eq "$$link[2]/")){
$found=1;
last;
}
}
if($found == 0){
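# New URL: append it to the queue and report it.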
open(OUTF,">>pages.txt") or die "Cannot append to pages.txt: $!";
print OUTF "$$link[2]\n";
close(OUTF);
print qq~$count. Adding Page: $$link[2]\n~;
$count++;
if($count >= $stopcount){
print qq~
Finished!~;
exit;
}
}
}