#!/usr/bin/perl
use CGI qw/:cgi-lib/;
use CGI::Carp qw(fatalsToBrowser);
use LWP::UserAgent;
use HTTP::Request;
use HTML::LinkExtor;

print "Content-type: text/html\n\n";
#####################################################
# Website Crawler v. 1.0.0
# This script was created by Eli at BlueHatSEO.com
# For further instructions on how to use it,
# go to https://www.bluehatseo.com/complete-guide-to-scraping-pt-2-crawling/
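# Usage: run it from your cgi-bin or from a shell (e.g. "perl crawler.pl",
# where crawler.pl is whatever you name this file); discovered URLs
# are collected in pages.txt alongside the script.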
#####################
# Domain name. Do not include the trailing slash.
# To limit the crawl to a single subdomain, include it (e.g. the www.).
# To include all subdomains, just leave this as domain.com
$domain="www.test.com";
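# e.g. $domain="example.com";     # crawls example.com plus all of its subdomains
# e.g. $domain="www.example.com"; # crawls only the www subdomain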
#####################
# Do you want the script to crawl dynamic pages?
# Dynamic pages are those whose URLs contain a ?
# 0=No 1=Yes
$crawldynamic=0;
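# e.g. $crawldynamic=1; # also queue URLs that contain a query string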
#####################
# How long (in seconds) would you like
# the script to pause between pages
$waittime=0;
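# e.g. $waittime=2; # pause two seconds between fetches to go easy on the server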
#####################
# Stop after this many pages have been added to the queue.
$stopcount=10;
#############################
# Begin The Crawl
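# Seed the queue file with the start page.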
open(OUTF,">pages.txt") or die "Cannot write pages.txt: $!";
print OUTF "http://$domain/\n";
close(OUTF);
$count=0;
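# pages.txt doubles as the work queue: this loop reads it from the top
# while &list appends newly discovered URLs to the bottom.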
open(INF,"pages.txt") or die "Cannot read pages.txt: $!";
while(<INF>){
$URL=$_;
chomp($URL);
$browser = LWP::UserAgent->new();
$browser->timeout(10);
my $request = HTTP::Request->new(GET => $URL);
my $response = $browser->request($request);
if ($response->is_error()) {printf "%s\n", $response->status_line; next;}
$contents = $response->content();
my ($page_parser) = HTML::LinkExtor->new(undef, $URL);
$page_parser->parse($contents)->eof;
@links = $page_parser->links;
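# Each link is an arrayref of ($tag, $attr => $url, ...); because a base URL
# was passed to HTML::LinkExtor->new, $$link[2] is already an absolute URL.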
foreach $link (@links) {
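# Skip stylesheets, scripts, images, feeds, trackbacks, and in-page anchors,
# and keep only links that stay on the target domain.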
$$link[2] =~ tr/A-Z/a-z/;
if(($$link[2] !~ /\.css/i)
&& ($$link[2] !~ /\/feed\//i)
&& ($$link[2] !~ /\/rss\//i)
&& ($$link[2] !~ /wp-comments-post\.php/i)
&& ($$link[2] !~ /#/)
&& ($$link[2] !~ /\.js/i)
&& ($$link[2] !~ /xmlrpc\.php/i)
&& ($$link[2] !~ /\/trackback\//i)
&& ($$link[2] !~ /\.jpg/i)
&& ($$link[2] !~ /\.jpeg/i)
&& ($$link[2] !~ /\.gif/i)
&& ($$link[2] !~ /\.ico/i)
&& ($$link[2] !~ /\.swf/i)
&& ($$link[2] =~ /\Q$domain\E/i)){
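# Static URLs are always queued; URLs containing a ? are queued
# only when $crawldynamic is set to 1.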
if(($crawldynamic == 1) || ($$link[2] !~ /\?/)){
&list;
}
}
}
sleep($waittime);
}
close(INF);
print qq~
Finished!~;
sub list {
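# Append $$link[2] to pages.txt unless it is already queued,
# with or without a trailing slash.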
$found=0;
open(PFILE,"pages.txt") or die "Cannot read pages.txt: $!";
@pages=<PFILE>;
close(PFILE);
foreach $line (@pages){
chomp($line);
if(($line eq "$$link[2]") or ($line eq "$$link[2]/")){
$found=1;
last;
}
}
if($found == 0){
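# New URL: append it to the queue and report it.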
open(OUTF,">>pages.txt") or die "Cannot append to pages.txt: $!";
print OUTF "$$link[2]\n";
close(OUTF);
print qq~$count. Adding Page: $$link[2]\n~;
$count++;
if($count >= $stopcount){
print qq~
Finished!~;
exit;
}
}
}