I have some code I am getting an issue with: it's not submitting the correct links... It's just submitting the same link over and over again. function storeLink($url,$gathered_from) { $query = "INSERT INTO pages (url, gathered_from) VALUES ('$url', '$gathered_from')"; mysql_query($query) or die('Error, insert query failed'); { $dir = dir("/"); //List files in images directory while (($url1 = $dir->read()) !== false) { if(is_dir($url1)) { echo subdirs($url1); } } } if ($url === $url1) { return ; } else { return storeLink($url,$gathered_from); } } $target_url = "mysite.com"; $userAgent = 'somebot/2.1 (http://www.somebot.com/bot.html)'; // make the cURL request to $target_url $ch = curl_init(); curl_setopt($ch, CURLOPT_USERAGENT, $userAgent); curl_setopt($ch, CURLOPT_URL, $target_url); curl_setopt($ch, CURLOPT_FAILONERROR, true); curl_setopt($ch, CURLOPT_FOLLOWLOCATION, 1); curl_setopt($ch, CURLOPT_AUTOREFERER, true); curl_setopt($ch, CURLOPT_RETURNTRANSFER,true); curl_setopt($ch, CURLOPT_TIMEOUT, 1000); $html= curl_exec($ch); if (!$html) { echo "<br />cURL error number:" .curl_errno($ch); echo "<br />cURL error:" . curl_error($ch); exit; } // parse the html into a DOMDocument $dom = new DOMDocument(); @$dom->loadHTML($html); // grab all the links on the page $xpath = new DOMXPath($dom); $hrefs = $xpath->evaluate("/html/body//a"); for ($i = 0; $i < $hrefs->length; $i++) { $href = $hrefs->item($i); $url = $href->getAttribute('href'); storeLink($url,$target_url); echo "<br />Link stored: $url"; Code (markup): I don't know if this will help or not.
But here is where it does it with a single page - function storeLink($url,$gathered_from) { $query = "INSERT INTO pages (url, gathered_from) VALUES ('$url', '$gathered_from')"; mysql_query($query) or die('Error, insert query failed'); } $target_url = "mysite.com"; $userAgent = 'somebot/2.1 (http://www.somebot.com/bot.html)'; // make the cURL request to $target_url $ch = curl_init(); curl_setopt($ch, CURLOPT_USERAGENT, $userAgent); curl_setopt($ch, CURLOPT_URL, $target_url); curl_setopt($ch, CURLOPT_FAILONERROR, true); curl_setopt($ch, CURLOPT_FOLLOWLOCATION, 1); curl_setopt($ch, CURLOPT_AUTOREFERER, true); curl_setopt($ch, CURLOPT_RETURNTRANSFER,true); curl_setopt($ch, CURLOPT_TIMEOUT, 1000); $html= curl_exec($ch); if (!$html) { echo "<br />cURL error number:" .curl_errno($ch); echo "<br />cURL error:" . curl_error($ch); exit; } // parse the html into a DOMDocument $dom = new DOMDocument(); @$dom->loadHTML($html); // grab all the links on the page $xpath = new DOMXPath($dom); $hrefs = $xpath->evaluate("/html/body//a"); for ($i = 0; $i < $hrefs->length; $i++) { $href = $hrefs->item($i); $url = $href->getAttribute('href'); storeLink($url,$target_url); echo "<br />Link stored: $url"; Code (markup): My goal is to do it for a full website.
That's exactly what you're telling it to do. It works, you just need to write a lot more code to recursively go through and store what you want throughout the site. Run this to get an idea of what you're getting yourself into ... $target_url = 'someurl'; $userAgent = 'somebot/2.1 (http://www.somebot.com/bot.html)'; // make the cURL request to $target_url $ch = curl_init(); curl_setopt($ch, CURLOPT_USERAGENT, $userAgent); curl_setopt($ch, CURLOPT_URL, $target_url); curl_setopt($ch, CURLOPT_FAILONERROR, true); curl_setopt($ch, CURLOPT_FOLLOWLOCATION, 1); curl_setopt($ch, CURLOPT_AUTOREFERER, true); curl_setopt($ch, CURLOPT_RETURNTRANSFER,true); curl_setopt($ch, CURLOPT_TIMEOUT, 1000); $html= curl_exec($ch); if (!$html) { echo "<br />cURL error number:" .curl_errno($ch); echo "<br />cURL error:" . curl_error($ch); exit; } // parse the html into a DOMDocument $dom = new DOMDocument(); @$dom->loadHTML($html); // grab all the links on the page $xpath = new DOMXPath($dom); $hrefs = $xpath->evaluate("/html/body//a"); for ($i = 0; $i < $hrefs->length; $i++) { $href = $hrefs->item($i); $url = $href->getAttribute('href'); echo $url . '<br />'; echo $target_url . '<br />'; //storeLink($url,$target_url); //echo "<br />Link stored: $url"; } Code (markup): Curl doesn't fetch external resources; it only does what it's told. You gave it one link. It's fetching one link. Try using this instead ... http://code.google.com/p/phpquery/
Right now the code above catches one link and repeats that same link over and over again till I run out of memory. That's the part I am asking for help with: the recursion. I need to walk up through the links and not submit the same links over and over again. That is the part I am asking for help on. I do know about the link you sent me, but that is not the question I asked. I appreciate you trying, but if I was going to use that then I would never have posted a question about the recursive part.
Have you thought of using wget ? wget -r -l 1 http://mysite.com Code (markup): This should give you a good start ........ <?php class HereYouGo{ public $link_array = array(); function getLink($url){ $target_url = $url; $userAgent = 'somebot/2.1 (http://www.somebot.com/bot.html)'; // make the cURL request to $target_url $ch = curl_init(); curl_setopt($ch, CURLOPT_USERAGENT, $userAgent); curl_setopt($ch, CURLOPT_URL, $target_url); curl_setopt($ch, CURLOPT_FAILONERROR, true); curl_setopt($ch, CURLOPT_FOLLOWLOCATION, 1); curl_setopt($ch, CURLOPT_AUTOREFERER, true); curl_setopt($ch, CURLOPT_RETURNTRANSFER,true); curl_setopt($ch, CURLOPT_TIMEOUT, 1000); $html= curl_exec($ch); if (!$html) { echo "<br />cURL error number:" .curl_errno($ch); echo "<br />cURL error:" . curl_error($ch); exit; } // parse the html into a DOMDocument $dom = new DOMDocument(); @$dom->loadHTML($html); // grab all the on the page $xpath = new DOMXPath($dom); $hrefs = $xpath->evaluate("/html/body//a"); for ($i = 0; $i < $hrefs->length; $i++) { $href = $hrefs->item($i); $url = $href->getAttribute('href'); ///////////////////////////////////////////// $error=0; if (strpos($url,'http:') !== false) { $error=1; }elseif(strpos($url,'mailto:') !== false){ $error=1; } if(!$error){ $this->link_array[] = $url; } ////////////////////////////////////////////// } } function __construct(){ $site = 'SOME URL'; $this->getLink($site); foreach ($this->link_array as $key => $value) { $this->getLink($site . $value); } //print the links print_r($this->link_array); } } new HereYouGo; Code (markup):