How to identify evil bots?

Discussion in 'Apache' started by Owlcroft, Dec 28, 2004.

  1. #1
    Now that I finally have a site doing some modest volume, I have begun paying a little attention to my weblogs--something I never before felt any need or inclination to do.

    So, I find (via Webalizer) that I have at least 30 (that's as deep as their list table for this goes) IPs that have each--in most instances on but a single visit--given hundreds of thousands or millions of hits, sucked up dozens or hundreds of megabytes, and yet taken only 2 or 3 files.

    I have assembled a table of supposedly up-to-date listings of known legitimate-searchbot IPs, and so far down as I had patience to check, none of the IPs involved was on those lists--leading me to believe that these are email harvesters or worse.

    My question is how does one tell who is what from such logs? I don't want to block IPs on the chance that they're black hats if I run a risk of blocking a legitimate searchbot. What do people do to deal with these issues?
     
    Owlcroft, Dec 28, 2004 IP
  2. flawebworks

    flawebworks Tech Services

    Messages:
    991
    Likes Received:
    36
    Best Answers:
    1
    Trophy Points:
    78
    #2
    I don't have IP addresses for you, but you can be a bit safer with this list:

    # Deny requests whose User-Agent starts with the name of a known
    # site-downloader / offline-browser tool.
    #
    # How the chain works:
    #   * Every condition except the last carries [OR]; a condition without
    #     [OR] is ANDed with whatever follows it, so a missing flag in the
    #     middle of the list silently disables most of the entries.
    #   * The last condition must NOT carry [OR], and the list must be closed
    #     by its own RewriteRule; otherwise the conditions attach to the next
    #     RewriteRule that appears further down the file.
    #   * [NC] makes the match case-insensitive. The patterns are anchored
    #     with ^ only, so a trailing ".*" is unnecessary.
    #
    # NOTE(review): ia_archiver and Scooter-W3 look like search/archive
    # crawlers rather than download tools -- confirm you really want them
    # blocked.
    RewriteEngine On
    RewriteCond %{HTTP_USER_AGENT} ^JetCar [NC,OR]
    # ^FlashGe also covers "FlashGet", so one entry serves for both.
    RewriteCond %{HTTP_USER_AGENT} ^FlashGe [NC,OR]
    RewriteCond %{HTTP_USER_AGENT} ^Teleport [NC,OR]
    RewriteCond %{HTTP_USER_AGENT} ^NetAnts [NC,OR]
    RewriteCond %{HTTP_USER_AGENT} ^WebZIP [NC,OR]
    RewriteCond %{HTTP_USER_AGENT} ^Offline [NC,OR]
    RewriteCond %{HTTP_USER_AGENT} ^GetRight [NC,OR]
    RewriteCond %{HTTP_USER_AGENT} ^Go!Zilla [NC,OR]
    RewriteCond %{HTTP_USER_AGENT} ^GoZilla [NC,OR]
    RewriteCond %{HTTP_USER_AGENT} ^WebReaper [NC,OR]
    RewriteCond %{HTTP_USER_AGENT} ^ia_archiver [NC,OR]
    RewriteCond %{HTTP_USER_AGENT} ^wget [NC,OR]
    RewriteCond %{HTTP_USER_AGENT} ^HTTrack [NC,OR]
    RewriteCond %{HTTP_USER_AGENT} ^WebStripper [NC,OR]
    RewriteCond %{HTTP_USER_AGENT} ^WebCapture [NC,OR]
    RewriteCond %{HTTP_USER_AGENT} ^Scooter-W3 [NC,OR]
    RewriteCond %{HTTP_USER_AGENT} ^WebCopier [NC,OR]
    RewriteCond %{HTTP_USER_AGENT} ^Webdupe [NC,OR]
    RewriteCond %{HTTP_USER_AGENT} ^Pockey [NC,OR]
    RewriteCond %{HTTP_USER_AGENT} ^DiscoPump [NC,OR]
    RewriteCond %{HTTP_USER_AGENT} ^InternetNinja [NC]
    # "-" means no substitution; [F] answers 403 Forbidden and ends processing.
    RewriteRule ^ - [F]



    # Deny (403) any request whose User-Agent begins with one of the names
    # below (site rippers, e-mail harvesters, mass downloaders).
    #
    # Rules for maintaining the list:
    #   * Every condition except the LAST one carries [OR].
    #   * The last condition must not carry [OR]: a trailing [OR] leaves the
    #     chain open and mod_rewrite then applies the rule even when none of
    #     the conditions matched, i.e. every visitor gets a 403.
    #   * RewriteCond lines accumulate onto the next RewriteRule, so keep
    #     nothing but conditions between "RewriteEngine On" and the rule, and
    #     do not leave stray RewriteCond lines above this list without a
    #     RewriteRule of their own.
    #   * Spaces inside a pattern are escaped with a backslash; patterns
    #     without [NC] are case-sensitive.
    RewriteEngine On
    RewriteCond %{HTTP_USER_AGENT} ^BlackWidow [OR]
    RewriteCond %{HTTP_USER_AGENT} ^Bot\ mailto:craftbot@yahoo.com [OR]
    RewriteCond %{HTTP_USER_AGENT} ^ChinaClaw [OR]
    RewriteCond %{HTTP_USER_AGENT} ^Custo [OR]
    RewriteCond %{HTTP_USER_AGENT} ^DISCo [OR]
    RewriteCond %{HTTP_USER_AGENT} ^Download\ Demon [OR]
    RewriteCond %{HTTP_USER_AGENT} ^eCatch [OR]
    RewriteCond %{HTTP_USER_AGENT} ^EirGrabber [OR]
    RewriteCond %{HTTP_USER_AGENT} ^EmailSiphon [OR]
    RewriteCond %{HTTP_USER_AGENT} ^EmailWolf [OR]
    RewriteCond %{HTTP_USER_AGENT} ^Express\ WebPictures [OR]
    RewriteCond %{HTTP_USER_AGENT} ^ExtractorPro [OR]
    RewriteCond %{HTTP_USER_AGENT} ^EyeNetIE [OR]
    RewriteCond %{HTTP_USER_AGENT} ^FlashGet [OR]
    RewriteCond %{HTTP_USER_AGENT} ^GetRight [OR]
    RewriteCond %{HTTP_USER_AGENT} ^GetWeb! [OR]
    RewriteCond %{HTTP_USER_AGENT} ^Go!Zilla [OR]
    RewriteCond %{HTTP_USER_AGENT} ^Go-Ahead-Got-It [OR]
    RewriteCond %{HTTP_USER_AGENT} ^GrabNet [OR]
    RewriteCond %{HTTP_USER_AGENT} ^Grafula [OR]
    RewriteCond %{HTTP_USER_AGENT} ^HMView [OR]
    RewriteCond %{HTTP_USER_AGENT} ^HTTrack [NC,OR]
    RewriteCond %{HTTP_USER_AGENT} ^Image\ Stripper [OR]
    RewriteCond %{HTTP_USER_AGENT} ^Image\ Sucker [OR]
    RewriteCond %{HTTP_USER_AGENT} ^Indy\ Library [NC,OR]
    RewriteCond %{HTTP_USER_AGENT} ^InterGET [OR]
    RewriteCond %{HTTP_USER_AGENT} ^Internet\ Ninja [OR]
    RewriteCond %{HTTP_USER_AGENT} ^JetCar [OR]
    RewriteCond %{HTTP_USER_AGENT} ^JOC\ Web\ Spider [OR]
    RewriteCond %{HTTP_USER_AGENT} ^larbin [OR]
    RewriteCond %{HTTP_USER_AGENT} ^LeechFTP [OR]
    RewriteCond %{HTTP_USER_AGENT} ^Mass\ Downloader [OR]
    RewriteCond %{HTTP_USER_AGENT} ^MIDown\ tool [OR]
    RewriteCond %{HTTP_USER_AGENT} ^Mister\ PiX [OR]
    RewriteCond %{HTTP_USER_AGENT} ^Navroad [OR]
    RewriteCond %{HTTP_USER_AGENT} ^NearSite [OR]
    RewriteCond %{HTTP_USER_AGENT} ^NetAnts [OR]
    RewriteCond %{HTTP_USER_AGENT} ^NetSpider [OR]
    RewriteCond %{HTTP_USER_AGENT} ^Net\ Vampire [OR]
    RewriteCond %{HTTP_USER_AGENT} ^NetZIP [OR]
    RewriteCond %{HTTP_USER_AGENT} ^Octopus [OR]
    RewriteCond %{HTTP_USER_AGENT} ^Offline\ Explorer [OR]
    RewriteCond %{HTTP_USER_AGENT} ^Offline\ Navigator [OR]
    RewriteCond %{HTTP_USER_AGENT} ^PageGrabber [OR]
    RewriteCond %{HTTP_USER_AGENT} ^Papa\ Foto [OR]
    RewriteCond %{HTTP_USER_AGENT} ^pavuk [OR]
    RewriteCond %{HTTP_USER_AGENT} ^pcBrowser [OR]
    RewriteCond %{HTTP_USER_AGENT} ^RealDownload [OR]
    RewriteCond %{HTTP_USER_AGENT} ^ReGet [OR]
    RewriteCond %{HTTP_USER_AGENT} ^SiteSnagger [OR]
    RewriteCond %{HTTP_USER_AGENT} ^SmartDownload [OR]
    RewriteCond %{HTTP_USER_AGENT} ^SuperBot [OR]
    RewriteCond %{HTTP_USER_AGENT} ^SuperHTTP [OR]
    RewriteCond %{HTTP_USER_AGENT} ^Surfbot [OR]
    RewriteCond %{HTTP_USER_AGENT} ^tAkeOut [OR]
    RewriteCond %{HTTP_USER_AGENT} ^Teleport\ Pro [OR]
    RewriteCond %{HTTP_USER_AGENT} ^VoidEYE [OR]
    RewriteCond %{HTTP_USER_AGENT} ^Web\ Image\ Collector [OR]
    RewriteCond %{HTTP_USER_AGENT} ^Web\ Sucker [OR]
    RewriteCond %{HTTP_USER_AGENT} ^WebAuto [OR]
    RewriteCond %{HTTP_USER_AGENT} ^WebCopier [OR]
    RewriteCond %{HTTP_USER_AGENT} ^WebFetch [OR]
    RewriteCond %{HTTP_USER_AGENT} ^WebGo\ IS [OR]
    RewriteCond %{HTTP_USER_AGENT} ^WebLeacher [OR]
    RewriteCond %{HTTP_USER_AGENT} ^WebReaper [OR]
    RewriteCond %{HTTP_USER_AGENT} ^WebSauger [OR]
    RewriteCond %{HTTP_USER_AGENT} ^Website\ eXtractor [OR]
    RewriteCond %{HTTP_USER_AGENT} ^Website\ Quester [OR]
    RewriteCond %{HTTP_USER_AGENT} ^WebStripper [OR]
    RewriteCond %{HTTP_USER_AGENT} ^WebWhacker [OR]
    RewriteCond %{HTTP_USER_AGENT} ^WebZIP [OR]
    RewriteCond %{HTTP_USER_AGENT} ^Wget [OR]
    RewriteCond %{HTTP_USER_AGENT} ^Widow [OR]
    RewriteCond %{HTTP_USER_AGENT} ^WWWOFFLE [OR]
    RewriteCond %{HTTP_USER_AGENT} ^Xaldon\ WebSpider [OR]
    # The remaining entries are matched case-insensitively. A trailing ".*"
    # adds nothing to a pattern anchored only at the start, so it is omitted.
    RewriteCond %{HTTP_USER_AGENT} ^GoZilla [NC,OR]
    RewriteCond %{HTTP_USER_AGENT} ^WebCapture [NC,OR]
    RewriteCond %{HTTP_USER_AGENT} ^Webdupe [NC,OR]
    RewriteCond %{HTTP_USER_AGENT} ^Pockey [NC,OR]
    RewriteCond %{HTTP_USER_AGENT} ^DiscoPump [NC,OR]
    # Last condition: no [OR]. The dot is escaped so it matches a literal ".".
    RewriteCond %{HTTP_USER_AGENT} ^InternetSeer\.com [NC]
    # "-" = no substitution; [F] returns 403 Forbidden, [L] stops processing.
    RewriteRule .* - [F,L]


    # Conditions for the rule that follows: trigger when the request comes
    # from one of these referrer sites, or when the requested URI contains
    # "FormMail" (probes for the FormMail script).
    #
    # This is ONE OR-chain. Every line except the last carries [OR]; a line
    # without [OR] in the middle would split the list into two groups that
    # must BOTH match, so most single referrers would never trigger the rule.
    #
    # Pattern notes:
    #   * A Referer header is a full URL, so each pattern starts with the
    #     scheme; a pattern beginning "^www." can never match.
    #   * Dots are escaped so they match a literal "." only.
    #   * No trailing "/*" or "com*": "*" repeats the preceding character and
    #     is not a wildcard. A prefix match on the host is sufficient.
    #   * Host names are case-insensitive, hence [NC].
    #   * Each site is listed once.
    RewriteCond %{HTTP_REFERER} ^http://www\.hostitcheap\.com [NC,OR]
    RewriteCond %{HTTP_REFERER} ^http://www\.bravespider\.com [NC,OR]
    RewriteCond %{HTTP_REFERER} ^http://www\.bigweblist\.com [NC,OR]
    RewriteCond %{HTTP_REFERER} ^http://www\.weblinkvalidator\.com [NC,OR]
    RewriteCond %{HTTP_REFERER} ^http://traffixer\.com [NC,OR]
    RewriteCond %{HTTP_REFERER} ^http://www\.youradultpaysite\.com [NC,OR]
    RewriteCond %{HTTP_REFERER} ^http://www\.paysiteprofits\.com [NC,OR]
    RewriteCond %{HTTP_REFERER} ^http://www\.hotlivewebcams\.com [NC,OR]
    RewriteCond %{HTTP_REFERER} ^http://www\.addresses\.com [NC,OR]
    RewriteCond %{HTTP_REFERER} ^http://www\.business-socket\.com [NC,OR]
    RewriteCond %{HTTP_REFERER} ^http://www\.datashaping\.com [NC,OR]
    RewriteCond %{HTTP_REFERER} ^http://cheapweb\.biz [NC,OR]
    # Last condition of the chain: deliberately no [OR].
    RewriteCond %{REQUEST_URI} FormMail

    (I think this line here is a note to myself):
    # The pattern "/*$" (zero or more slashes at the end of the path) matches
    # every request path, so this rule is gated only by the RewriteCond lines
    # that precede it.
    # [R] sends an external redirect to the URL given here (a placeholder --
    # substitute a real destination); [L] stops further rule processing.
    RewriteRule /*$ http://www.site-you-are-sending-the-bot-to.com [L,R]
    # Alternative: answer with 403 Forbidden instead of redirecting.
    # RewriteRule ^.*$ - [F]
     
    flawebworks, Dec 28, 2004 IP