I tried to find some software that would scan the robots.txt of 500+ websites and show them all together, or something like that. I even found some Perl code for a robots.txt collector that lists robots.txt by site, but I don't know Perl. Basically I need to find out which sites are using Disallow: /. Does somebody know a good trick or piece of software to do this?

PERL CODE:

```perl
use Socket;

@sites = qw( URL );

foreach $site (@sites) {
    # resolve host name
    $iaddr = inet_aton($site);
    unless ($iaddr) {
        print "Can't resolve $site\n";
        next;
    }
    $paddr = sockaddr_in(80, $iaddr);
    $proto = getprotobyname("tcp");
    socket(SOCK, PF_INET, SOCK_STREAM, $proto);
    connect(SOCK, $paddr);

    # send a plain HTTP request for /robots.txt over the raw socket
    select SOCK;
    $| = 1;
    print "GET /robots.txt HTTP/1.1\r\nHost: $site\r\nConnection: close\r\n\r\n";
    @response = <SOCK>;
    $| = 0;
    select STDOUT;

    print "Response from $site was ", @response + 0, " lines\n";

    # save the raw response to <site>.robo
    open(FH, ">$site.robo");
    print FH @response;
    close FH;
    close SOCK;
}
```
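For reference, if the only thing you need is the list of sites whose robots.txt blocks everything, a short LWP::UserAgent script can do the check directly. This is only a rough sketch: the site list and the timeout value are placeholders, and it simply flags any robots.txt that contains a bare Disallow: / line, without checking which User-agent block that line belongs to.

```perl
#!/usr/bin/perl
use strict;
use warnings;
use LWP::UserAgent;

# Placeholder list of sites to check; replace with your own 500+ URLs.
my @sites = (
    'http://www.site1.com/',
    'http://www.site2.com/',
);

my $ua = LWP::UserAgent->new(timeout => 15);

foreach my $site (@sites) {
    my $response = $ua->get($site . 'robots.txt');
    next unless $response->is_success;

    # Flag the site only if a line reads exactly "Disallow: /"
    # (everything blocked), not "Disallow: /somepath".
    if ($response->decoded_content =~ m{^\s*Disallow:\s*/\s*$}mi) {
        print "$site blocks everything (Disallow: /)\n";
    }
}
```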
Here you go:

```perl
#!/usr/bin/perl
use LWP::UserAgent;
use CGI;
use strict;

# ===========================
# CONFIG SECTION
my $my_user_agent = 'Mozilla/5.0 (Windows; U; MSIE 9.0; Windows NT 9.0; en-US)';
my $path_to_log = "/some/folder/to/log.txt";        # Not needed if you do not want to log anything
my $path_to_robots = "/some/folder/to/robots.txt";  # Not needed if you do not want all collected robots.txt files stored in one file
my $path_to_urls = "/some/folder/to/urls.txt";      # Required: url list that holds the urls to be checked for robots.txt files
my $write_log_file = 1;          # Set to 0 if you do not want a log file created
my $write_robots_file = 1;       # Set to 0 if you do not want all collected robots.txt files stored in the $path_to_robots file
my $print_to_screen = 1;         # Set to 0 if you do not want output printed to the screen
my $print_to_browser_screen = 1; # Set to 0 if you do not want output printed to the browser screen
# END OF CONFIG SECTION
# ===========================

my @urls = &read_urls;

my $browser = LWP::UserAgent->new(keep_alive => 1);
$browser->agent($my_user_agent);
$browser->default_header(
    'Accept' => "image/jpeg, application/x-ms-application, image/gif, application/xaml+xml, image/pjpeg, application/x-ms-xbap, application/x-shockwave-flash, */*",
    'Accept-Language' => "en-US",
    'Keep-Alive' => "300",
    'Connection' => "keep-alive"
);

if ($print_to_browser_screen == 1) {
    print "Content-type: text/html\n\n";
}

if ($write_log_file == 1) {
    &ResetFile($path_to_log);
}
if ($write_robots_file == 1) {
    &ResetFile($path_to_robots);
}

foreach (@urls) {
    $_ =~ s/\n//g;
    if (($_) && ($_ ne "")) {
        &FetchRobotsUrl($_);
    }
}

if (($print_to_screen == 1) or ($print_to_browser_screen == 1)) {
    my $add_string = "";
    if ($print_to_browser_screen == 1) {
        $add_string = "<br>";
    }
    print "The End.".$add_string."\n";
}

exit;

sub read_urls {
    open(URLS, "<$path_to_urls");
    my @temp_urls = <URLS>;
    close URLS;
    return @temp_urls;
}

sub FetchRobotsUrl {
    my ($whichurltograb) = $_[0];
    $whichurltograb .= "robots.txt";

    my $response = $browser->get($whichurltograb);
    if ($response->is_success) {
        if ($write_robots_file == 1) {
            &add_to_robots_file($response->content, $whichurltograb);
        }
        if ($write_log_file == 1) {
            &add_to_log("+++ Found robots.txt on ".$whichurltograb);
        }
        if (($print_to_screen == 1) or ($print_to_browser_screen == 1)) {
            my $add_string = "";
            if ($print_to_browser_screen == 1) {
                $add_string = "<br>";
            }
            print $add_string."\n";
            print "+++ Found robots.txt on ".$whichurltograb.$add_string."\n";
            print "========================================".$add_string."\n";
            print $whichurltograb.$add_string."\n";
            print "========================================".$add_string."\n";
            my $print_content = $response->content;
            $print_content =~ s/\n/<br>\n/g;
            print $print_content.$add_string."\n";
            print "========================================".$add_string."\n";
            print $add_string."\n";
        }
    } else {
        if ($write_log_file == 1) {
            &add_to_log("--- No robots.txt on ".$whichurltograb);
        }
        if (($print_to_screen == 1) or ($print_to_browser_screen == 1)) {
            my $add_string = "";
            if ($print_to_browser_screen == 1) {
                $add_string = "<br>";
            }
            print $add_string."\n";
            print "--- No robots.txt on ".$whichurltograb.$add_string."\n";
            print $add_string."\n";
        }
    }
}

sub add_to_log {
    my ($whattoaddtolog) = $_[0];
    my ($sec,$min,$hour,$mday,$mon,$year,$wday,$yday,$isdst) = localtime(time);
    my $datetime = sprintf("%4d-%02d-%02d %02d:%02d:%02d", $year+1900, $mon+1, $mday, $hour, $min, $sec);
    open(LOG, ">>$path_to_log");
    print LOG $datetime." | ".$whattoaddtolog;
    print LOG "\015\012";
    close LOG;
}

sub add_to_robots_file {
    my ($whattoaddtofile) = $_[0];
    my ($whichurlwasthis) = $_[1];
    open(LOG, ">>$path_to_robots");
    print LOG "\015\012";
    print LOG "========================================\015\012";
    print LOG $whichurlwasthis."\015\012";
    print LOG "========================================\015\012";
    print LOG $whattoaddtofile."\015\012";
    print LOG "========================================\015\012";
    print LOG "\015\012";
    close LOG;
}

sub ResetFile {
    my ($whichfiletoreset) = $_[0];
    open(LOG, ">$whichfiletoreset");
    close LOG;
}
```

Pay attention to the CONFIG section at the beginning, as there are several things to configure. There is an explanation on each row, so I guess it is all clear. I wrote the script, so let me know if you have any questions. I tested it and it seems to be working fine.

Your domain name URLs need to be in that url file, one below the other, for example:

```
http://www.site1.com/
http://www.site2.com/
...
http://www.site837.com/
```

Make sure each url starts with http:// and ends with a /

The script as it is will identify itself as Internet Explorer 9.0, or you can change the $my_user_agent variable to anything you wish.

Cheers!