#!/usr/bin/perl
print "Content-type: text/html\n\n";
use CGI qw/:cgi-lib/;
use CGI::Carp qw(fatalsToBrowser);
use LWP::Simple;
use LWP::UserAgent;
use HTTP::Request;
use HTTP::Response;
use HTML::LinkExtor;
#####################################################
# Website Crawler v. 1.0.0
# This script was created by Eli at BlueHatSEO.com
# For further instructions on how to use this
# Go to https://www.bluehatseo.com/complete-guide-to-scraping-pt-2-crawling/

#####################
# Domain name. Do not include the trailing slash.
# Be sure to limit the subdomain by including the www.
# To include all subdomains just leave this as domain.com
$domain="www.test.com";

#####################
# Do you want the script to crawl dynamic pages?
# Dynamic pages include the ?
# 0=No 1=Yes
$crawldynamic=0;

#####################
# How long (in seconds) would you like
# the script to pause between pages?
$waittime=0;

#####################
# How many pages would you like the script to stop after?
$stopcount=10;

#############################
# Begin The Crawl

# Seed the queue with the root URL. pages.txt doubles as the
# crawl queue: lines appended by &list below are picked up by
# the while loop as it reads the file.
open(OUTF,">pages.txt");
print OUTF "http://$domain/\n";
close(OUTF);

$count=0;
open(INF,"pages.txt");
while(<INF>){
    chomp($URL=$_);
    $browser = LWP::UserAgent->new();
    $browser->timeout(10);
    my $request = HTTP::Request->new(GET => $URL);
    my $response = $browser->request($request);
    if ($response->is_error()) {printf "%s\n", $response->status_line;}
    $contents = $response->content();

    # Extract every link on the page, resolved against $URL.
    my ($page_parser) = HTML::LinkExtor->new(undef, $URL);
    $page_parser->parse($contents)->eof;
    @links = $page_parser->links;
    foreach $link (@links) {
        $$link[2] =~ tr/A-Z/a-z/;
        # Skip stylesheets, scripts, feeds, trackbacks, images, and
        # other non-page URLs; only follow links on our own domain.
        if(($$link[2] !~ /\.css/i) && ($$link[2] !~ /\/feed\//i) &&
           ($$link[2] !~ /\/rss\//i) && ($$link[2] !~ /wp-comments-post\.php/i) &&
           ($$link[2] !~ /#/) && ($$link[2] !~ /\.js/i) &&
           ($$link[2] !~ /xmlrpc\.php/i) && ($$link[2] !~ /\/trackback\//i) &&
           ($$link[2] !~ /\.jpg/i) && ($$link[2] !~ /\.jpeg/i) &&
           ($$link[2] !~ /\.gif/i) && ($$link[2] !~ /\.ico/i) &&
           ($$link[2] !~ /\.swf/i) && ($$link[2] =~ /$domain/i)){
            # Queue the link unless it is dynamic and dynamic
            # crawling is turned off.
            if(($crawldynamic == 1) || ($$link[2] !~ /\?/)){
                &list;
            }
        }
    }
    sleep($waittime);
}
close(INF);
print qq~\nFinished!~;

# Add a newly found link to pages.txt unless it is already queued.
sub list {
    $found=0;
    open(PFILE,"pages.txt");
    @pages=<PFILE>;
    close(PFILE);
    foreach $line (@pages){
        chomp($line);
        if(($line eq "$$link[2]") or ($line eq "$$link[2]/")){
            $found=1;
            last;
        }
    }
    if($found == 0){
        open(OUTF,">>pages.txt");
        print OUTF "$$link[2]\n";
        close(OUTF);
        print qq~$count. Adding Page: $$link[2]\n~;
        $count++;
        if($count >= $stopcount){
            print qq~\nFinished!~;
            exit;
        }
    }
}
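
#############################
# Optional: faster duplicate check.
# The list sub above re-reads pages.txt from disk for every link
# it examines, which gets slow as the queue grows. The sub below
# is a minimal in-memory sketch of the same check, not part of
# the original script: the sub name list_fast and the %seen hash
# are illustrative additions. To try it, seed %seen with
# "http://$domain/" right after pages.txt is created, and call
# &list_fast in place of &list inside the link loop.
sub list_fast {
    # Treat "url" and "url/" as the same page, as list does.
    if(!$seen{$$link[2]} && !$seen{"$$link[2]/"}){
        $seen{$$link[2]}=1;
        open(OUTF,">>pages.txt");
        print OUTF "$$link[2]\n";
        close(OUTF);
        print qq~$count. Adding Page: $$link[2]\n~;
        $count++;
        if($count >= $stopcount){
            print qq~\nFinished!~;
            exit;
        }
    }
}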