Hello everyone, I found a simple link crawler script that serves my purposes. It looks at a URL and returns all the links. However, I'm having trouble formatting the results. I would like to put some basic HTML around each link, such as <br>, <p>, etc. I've tried inserting $str code and echo statements, but I'm just not very good at working with arrays. Note: I plan on using this within my site; the code just points to reddit as an example. Thanks in advance.
<?php
/**
 * Minimal link crawler.
 *
 * Downloads a single page with cURL, parses it with DOMDocument and exposes
 * the href of every <a> element through get('links').
 */
class DOM_Crawler
{
    /** @var string Entity-encoded markup of the fetched page ('' when nothing was downloaded). */
    protected $markup = "";

    /** @var DOMDocument|null Document built from $markup. */
    protected $dom = null;

    /** @var string|null "scheme://[user[:pass]@]host[:port]" of the crawled URL ('' when it has no scheme/host). */
    protected $base_url = null;

    /**
     * Fetch and parse the page at $url.
     *
     * @param string $url Absolute URL of the page to crawl.
     */
    public function __construct ($url)
    {
        $this->base_url = $this->_base_url($url);
        $this->markup = $this->_load_markup($url);
        $this->markup = $this->_prepare_markup($this->markup);
        $this->_init_dom();
    }

    /**
     * Download the raw body of $url.
     *
     * @param string $url
     * @return string Response body, or '' when the transfer failed.
     */
    protected function _load_markup ($url)
    {
        $ch = curl_init();
        $timeout = 10;
        curl_setopt($ch, CURLOPT_URL, $url);
        curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
        curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, $timeout);
        $contents = curl_exec($ch);
        curl_close($ch);

        // curl_exec() returns false on failure; normalise to '' so the rest of
        // the pipeline only ever deals with strings.
        return is_string($contents) ? $contents : '';
    }

    /**
     * Turn downloaded markup into an ASCII-only string DOMDocument can parse
     * regardless of the page's original encoding.
     *
     * Non-ASCII characters become numeric entities, and a Content-Type <meta>
     * is injected right after a literal <head> tag (any letter case).
     *
     * @param string $content Raw markup.
     * @param string $encod   Source encoding; detected from $content when empty.
     * @return string Prepared markup, or '' when $content is empty.
     */
    protected function _prepare_markup ($content, $encod = '')
    {
        mb_detect_order("ASCII,UTF-8,ISO-8859-1,windows-1252,iso-8859-15");

        if (empty($content)) {
            return '';
        }

        if (empty($encod)) {
            $encod = mb_detect_encoding($content);
            if ($encod === false) {
                // Detection failed; UTF-8 is the least surprising fallback.
                $encod = 'UTF-8';
            }
        }

        // The 'HTML-ENTITIES' pseudo-encoding is deprecated since PHP 8.2.
        // Converting to UTF-8 and numeric-encoding everything above ASCII
        // yields markup that parses to the same DOM.
        $content = mb_convert_encoding($content, 'UTF-8', $encod);
        $content = mb_encode_numericentity($content, array(0x80, 0x10FFFF, 0, 0x1FFFFF), 'UTF-8');

        // $content is pure ASCII from here on, so byte-wise functions are safe.
        $headpos = stripos($content, '<head>');
        if ($headpos !== false) {
            $meta = '<meta http-equiv="Content-Type" content="text/html; charset=' . $encod . '">';
            $content = substr_replace($content, $meta, $headpos + 6, 0);
        }

        return $content;
    }

    /**
     * Build $this->dom from $this->markup.
     *
     * The document is left empty when there is no markup, because
     * DOMDocument::loadHTML() rejects an empty string.
     */
    protected function _init_dom ()
    {
        $this->dom = new DOMDocument();
        if (empty($this->markup)) {
            return;
        }

        // Real-world HTML is rarely valid; collect libxml's complaints
        // internally instead of emitting a PHP warning for each one.
        $previous = libxml_use_internal_errors(true);
        $this->dom->loadHTML($this->markup);
        libxml_clear_errors();
        libxml_use_internal_errors($previous);

        $this->dom->normalizeDocument();
    }

    /**
     * Dispatch to the protected "_get_{$type}" extractor.
     *
     * @param string $type Extractor name, e.g. 'links'.
     * @return mixed Whatever the extractor returns, or null when no such extractor exists.
     */
    public function get ($type)
    {
        $method = "_get_{$type}";
        if (method_exists($this, $method)) {
            // call_user_method() was removed in PHP 7.0; a variable method
            // call does the same thing on every PHP version.
            return $this->$method();
        }
        return null;
    }

    /**
     * Collect the href of every <a> element in the document.
     *
     * Fragment-only links ("#...") and empty hrefs are skipped. Root-relative
     * links ("/path") are prefixed with the base URL, protocol-relative links
     * ("//host/path") with the base URL's scheme. Everything else is returned
     * as written in the page.
     *
     * @return array|false List of link strings, or false when no markup was loaded.
     */
    protected function _get_links ()
    {
        if (empty($this->markup)) {
            return false;
        }

        $links = array();
        foreach ($this->dom->getElementsByTagName('a') as $anchor) {
            $href = trim($anchor->getAttribute('href'));
            if ($href === '' || $href[0] === '#') {
                continue;
            }

            if (strncmp($href, '//', 2) === 0) {
                // Protocol-relative: it already names a host and only lacks the scheme.
                $scheme = parse_url((string) $this->base_url, PHP_URL_SCHEME);
                $links[] = ($scheme ? $scheme : 'http') . ':' . $href;
            } else if ($href[0] === '/') {
                $links[] = $this->base_url . $href;
            } else {
                $links[] = $href;
            }
        }

        return $links;
    }

    /**
     * Reduce a URL to "scheme://[user[:pass]@]host[:port]".
     *
     * @param string $url
     * @return string Base URL, or '' when $url has no scheme or host.
     */
    protected function _base_url ($url)
    {
        $parsed_url = parse_url($url);
        if (!is_array($parsed_url) || empty($parsed_url['scheme']) || empty($parsed_url['host'])) {
            return '';
        }

        $base_url = $parsed_url['scheme'] . '://';
        if (!empty($parsed_url['user'])) {
            $base_url .= $parsed_url['user'];
            if (!empty($parsed_url['pass'])) {
                $base_url .= ':' . $parsed_url['pass'];
            }
            $base_url .= '@';
        }
        $base_url .= $parsed_url['host'];
        if (!empty($parsed_url['port'])) {
            $base_url .= ':' . $parsed_url['port'];
        }

        return $base_url;
    }
}

// Breadth-first crawl starting from the seed URL. Stops once at least 100
// distinct links are known, or when every known link has been visited.
$links = array('http://www.reddit.com/');
for ($i = 0; isset($links[$i]) && count($links) < 100; $i++) {
    $crawler = new DOM_Crawler($links[$i]);
    $links = _merge_unique($links, $crawler->get('links'));
}

// To emit HTML instead of a raw dump, loop over $links and echo each one
// through htmlspecialchars() inside whatever tags are wanted.
print_r($links);

/**
 * Append the values of $arr2 to $arr, dropping duplicates.
 *
 * Existing entries of $arr keep their positions, so a caller iterating $arr
 * by index can continue where it left off. Non-array arguments (such as the
 * false that get('links') returns for an empty page) count as empty lists.
 *
 * @param array|mixed $arr  Values collected so far.
 * @param array|mixed $arr2 Values to add.
 * @return array Sequentially indexed list of distinct values.
 */
function _merge_unique ($arr, $arr2)
{
    if (!is_array($arr)) {
        $arr = array();
    }
    if (!is_array($arr2)) {
        $arr2 = array();
    }

    return array_values(array_unique(array_merge($arr, $arr2)));
}
?>
Could you not just do this? $f = new DOM_Crawler('http://www.reddit.com/'); $links = array_unique($f->get('links')); echo '<pre>'.print_r($links, true).'</pre>'; PHP: