Hi, I've decided I'm going to make a search engine. No, it's not going to be the same kind of search engine as Google or Yahoo, since I have no chance of competing with them. I was thinking of using the Yahoo API, but it might get a little tricky with what I want to do. I'd like to have it only index the pages of the sites I input. How would you go about doing this, and where should I start? Can someone point me in the right direction? Thanks!
For that you'd be better off making your own search bot, since you've already got the list of sites you want. Then update the content in your database from your search bot's spidering.
If you're having trouble accessing good APIs, you'll probably end up utilizing cURL to scrape/crawl web pages, with the Snoopy PHP class being one of the most popular that I know of. It simulates a web browser and automates the task of retrieving web page content and posting forms. This will help you generate an index for the specific sites, and with further logic/coding you can analyze the key page elements for your results hierarchy. Here's a funky and simple cURL snippet to start you off: <?php $curl = curl_init(); curl_setopt ($curl, CURLOPT_URL, "http://www.php.net"); curl_exec ($curl); curl_close ($curl); ?> PHP: 1/ curl_init() initializes the cURL session 2/ curl_setopt sets the URL you want to load, e.g. ("http://www.php.net") — though you can put any URL in the $curl, CURLOPT_URL section 3/ curl_exec executes the retrieval process and prints the page 4/ curl_close closes the cURL session. You're going to need to emulate a browser, and maybe include a proxy element to prevent your scraper/crawler from being IP-blocked. Google "curl php code example" and you'll find some nifty versions of cURL usage; also check out XPath and other tools around the net as well. As an alternative to building your own web crawler / search engine, there are some open-source search engine scripts available to jump-start you in the right direction; you might be able to find something here: sourceforge.net/ ROOFIS
How could I use that to search only specified sites in Google? Or is it possible to do that, i.e. only show results from the specified websites for the search term entered?
Make sure you support robots.txt early on - otherwise you may eventually end up in court over it. Utterly ridiculous, I know, but it happens.
I have a crudely made spider that will save sites into a database from their meta tags and then redirect to one of the links on the site. It breaks a lot, but it manages to redirect to another saved link in the database. If you can make anything out of it then you're welcome... but again, this is something I made in around half an hour. If I wanted to really make it good I would have a secondary cron that cross-referenced it to Alexa, and I would put less emphasis on subdomains and non-index pages.

<?php
// Crude spider (bot.php): fetch the page named by ?url=, scrape its title /
// meta description / keywords / h1 / h2, store them in the `Spider2` table,
// queue any new outbound .com domains in `Spider`, then redirect to a random
// link on the page so the crawl continues.
//
// NOTE(review): the mysql_* extension is deprecated (removed in PHP 7) —
// this should be migrated to mysqli or PDO with prepared statements.

$conn = mysql_connect("localhost", "removed", "removed");
if (!$conn) {
    die('Could not connect: ' . mysql_error());
}
// BUG FIX: the original selected the "removed" database and then immediately
// re-selected "video", so only "video" ever took effect — select it once.
mysql_select_db("video", $conn);

// Remember the last queue row (the original loop kept only the final row).
$id = $time = $newurl = null;
$result = mysql_query("SELECT * FROM Spider");
while ($row = mysql_fetch_array($result)) {
    $id     = $row['id'];
    $time   = $row['time'];
    $newurl = $row['url'];
}

// One random queued URL, used as a fallback target when the current fetch
// fails. (The original looped over every row of ORDER BY RAND() keeping only
// the last one; LIMIT 1 is equivalent and cheaper.)
$newurl2 = '';
$result2 = mysql_query("SELECT url FROM Spider ORDER BY RAND() LIMIT 1");
if ($row2 = mysql_fetch_array($result2)) {
    $newurl2 = $row2['url'];
}

// SECURITY FIX: the original interpolated $_GET['url'] straight into SQL.
$url      = isset($_GET['url']) ? $_GET['url'] : '';
$safe_url = mysql_real_escape_string($url);
$url2     = "http://www." . $url;
$page     = '';

// BUG FIX: "DELETE * FROM" is not valid SQL — the correct form is "DELETE FROM".
mysql_query("DELETE FROM `Spider` WHERE `url` = '" . $safe_url . "'");

// If the page can't be opened, bounce to a random queued URL instead of stopping.
$fh = fopen($url2, 'r')
    or die('Unable to open file.<meta http-equiv="refresh" content="2;url=http://removed.com/bot.php?url=' . $newurl2 . '"/>');
while (!feof($fh)) {
    $page .= fread($fh, 1048576);
}
fclose($fh);

// Normalise the markup so the naive string scraping below has a fighting
// chance: lower-case everything, unify quotes (this is also why the original's
// single-quoted fallback branches were dead code), and collapse the
// self-closing-tag variants down to plain '>'.
$page = strtolower($page);
$page = str_replace('\'', '"', $page);
$page = str_replace(
    array(' />', ' "/>', '" />', '""/>', '< ', '/>'),
    array('>',   '">',   '">',   '">',   '<',  '>'),
    $page
);

// Return the text between the first $start marker and the following $end
// marker, or '' when the markers are absent.
function first_between($haystack, $start, $end)
{
    $parts = explode($start, $haystack);
    if (!isset($parts[1])) {
        return '';
    }
    $tail = explode($end, $parts[1]);
    return $tail[0];
}

$title = first_between($page, '<title>', '</title>');
$h1    = first_between($page, '<h1>', '</h1>');
$h2    = first_between($page, '<h2>', '</h2>');

// <meta name="description" content="..."> plus the reversed-attribute form.
// (The original's reversed-order fallbacks exploded on '/>' AFTER '/>' had
// already been rewritten to '>', so they could never match — replaced with a
// regex that handles <meta content="..." name="description">.)
$description = first_between($page, '<meta name="description" content="', '">');
if ($description === '' && preg_match('/<meta content="([^"]*)"[^>]*name="description"/', $page, $m)) {
    $description = $m[1];
}

// Same reversed-attribute fallback for the title.
if ($title === '' && preg_match('/<meta content="([^"]*)"[^>]*name="title"/', $page, $m)) {
    $title = $m[1];
}

// Keywords: normal form first, then the reversed-attribute form. (The
// original's fallback clobbered the description's $Xpage* variables and read
// the never-assigned $Xpage2b.)
$keywords_raw = first_between($page, '<meta name="keywords" content="', '">');
if ($keywords_raw === '' && preg_match('/<meta content="([^"]*)"[^>]*name="keywords"/', $page, $m)) {
    $keywords_raw = $m[1];
}
$keyword_list  = explode(',', $keywords_raw);
$keyword_total = substr_count($keywords_raw, ',') + 1;

echo "Title: " . $title . "<br>Description: " . $description
   . "<br>Keywords: " . $keyword_total . "<br>H1: " . $h1 . "<br>H2: " . $h2 . "<br>";

$h1 = strip_tags($h1);
$h2 = strip_tags($h2);
for ($i = 0; $i < $keyword_total; $i++) {
    echo "Keyword " . ($i + 1) . ": " . $keyword_list[$i] . "<br>";
}

// The crawled domain normalised to "name.com". (BUG FIX: the original used
// $urlpieces2 here, which was only assigned later in the link loop and was
// therefore undefined at this point.)
$domain_base = str_replace(
    array('http://', 'https://', 'ftp://', 'www.', 'wwww.', 'http:///'),
    '',
    $url
);
$domain_parts = explode('.com', $domain_base);
$domain       = $domain_parts[0] . '.com';
$safe_domain  = mysql_real_escape_string($domain);

// Move the page out of the `Spider` queue and into the `Spider2` results table.
$q = mysql_query("SELECT 1 FROM `Spider` WHERE `url` = '" . $safe_url . "'");
if (mysql_num_rows($q)) {
    mysql_query("DELETE FROM `Spider` WHERE `url` = '" . $safe_url . "'");
    $q2 = mysql_query("SELECT 1 FROM `Spider2` WHERE `url` = '" . $safe_domain . "'");
    if (!mysql_num_rows($q2)) {
        // Pad the keyword columns so missing keywords insert as empty strings
        // (the original indexed $page2ba[0..5] unconditionally).
        $kw  = array_pad($keyword_list, 6, '');
        $esc = 'mysql_real_escape_string';
        mysql_query(
            "INSERT INTO `Spider2` VALUES ('',"
            . "'" . $safe_url . "',"
            . "'" . $esc($time) . "',"
            . "'" . $esc($description) . "',"
            . "'" . $esc($kw[0]) . "','" . $esc($kw[1]) . "','" . $esc($kw[2]) . "',"
            . "'" . $esc($kw[3]) . "','" . $esc($kw[4]) . "','" . $esc($kw[5]) . "',"
            . "'" . $esc($h1) . "','" . $esc($h2) . "')"
        ) or die(mysql_error());
    }
}

// Re-fetch the page with cURL (pretending to be Googlebot) and walk its links.
$target_url = $url;
$userAgent  = 'Googlebot/2.1 (http://www.googlebot.com/bot.html)';

$ch = curl_init();
curl_setopt($ch, CURLOPT_USERAGENT, $userAgent);
curl_setopt($ch, CURLOPT_URL, $target_url);
curl_setopt($ch, CURLOPT_FAILONERROR, true);
curl_setopt($ch, CURLOPT_FOLLOWLOCATION, true);
curl_setopt($ch, CURLOPT_AUTOREFERER, true);
curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
curl_setopt($ch, CURLOPT_TIMEOUT, 10);
$html = curl_exec($ch);
if (!$html) {
    echo "<br />cURL error number:" . curl_errno($ch);
    echo "<br />cURL error:" . curl_error($ch);
    exit;
}

// Parse the HTML and collect every anchor in the body.
$dom = new DOMDocument();
@$dom->loadHTML($html);
$xpath = new DOMXPath($dom);
$hrefs = $xpath->evaluate("/html/body//a");

// BUG FIX: pick the random link index once, and within bounds — the original
// called rand(0, $hrefs->length) inside the loop (item(length) is null).
$rand = ($hrefs->length > 0) ? rand(0, $hrefs->length - 1) : -1;

for ($i = 0; $i < $hrefs->length; $i++) {
    $link   = $hrefs->item($i)->getAttribute('href');
    $firstc = substr($link, 0, 1);
    $pos    = strpos($link, $target_url);

    // Only follow absolute, off-site links.
    $is_absolute = (strstr($link, 'http://') !== false) || (strstr($link, 'www.') !== false);

    $pieces = explode('.com', $link);
    $base   = str_replace(
        array('http://', 'https://', 'ftp://', 'www.', 'wwww.', 'http:///'),
        '',
        $pieces[0]
    );

    // (The original also compared against the undefined $urlpieces3[$amountP];
    // that condition was vacuous and has been dropped.)
    if ($firstc != '/' && $pos === false && $is_absolute && $base . '.com' !== 'google.com') {
        echo "<br />$base.com";

        $safe_base = mysql_real_escape_string($base);
        $r = mysql_query("SELECT 1 FROM `Spider` WHERE `url` = '" . $safe_base . ".com'");
        if (!mysql_num_rows($r)) {
            $r2 = mysql_query("SELECT 1 FROM `Spider2` WHERE `url` = '" . $safe_base . ".com'");
            if (!mysql_num_rows($r2)) {
                mysql_query(
                    "INSERT INTO `Spider` VALUES ('','" . time() . "','" . $safe_base . ".com')"
                ) or die(mysql_error());
            }
        }

        // BUG FIX: the original wrote if ($i = $rand) — assignment where a
        // comparison was intended, so it redirected on every qualifying link.
        if ($i == $rand) {
            echo '<meta http-equiv="refresh" content="2;url=http://removed.com/bot.php?url=' . $base . '.com" />';
        }
        // BUG FIX: likewise ($hrefs->length = 1) — '==' intended (and a
        // DOMNodeList length is read-only anyway).
        if ($hrefs->length == 1) {
            echo '<meta http-equiv="refresh" content="2;url=http://removed.com/bot.php?url=' . $newurl2 . '.com"/>';
        }
    }
}
?>