Help With Formatting the Returned Results

Discussion in 'PHP' started by iPhrankie, Oct 31, 2009.

  1. #1
    Hello Everyone,

    I found a simple link crawler script that serves my purposes. It looks at a URL and returns all the links. However, I'm having trouble formatting the results. I would like to put some basic HTML around each link, such as <br>, <p>, etc.

    I've tried inserting $str code and echoes. I'm just not very good at working with arrays.

    Note: I plan on using this within my site. The code just points to reddit as an example.

    Thanks in advance.
     
    iPhrankie, Oct 31, 2009 IP
  2. iPhrankie

    iPhrankie Peon

    Messages:
    2
    Likes Received:
    0
    Best Answers:
    0
    Trophy Points:
    0
    #2
    <?php
    /**
     * Downloads a single HTML page and extracts information from it through
     * the DOM extension.
     *
     * Usage:
     *   $crawler = new DOM_Crawler('http://example.com/');
     *   $links   = $crawler->get('links');
     */
    class DOM_Crawler {

        /** HTML of the fetched page after normalisation; '' when nothing could be fetched. */
        protected $markup = "";

        /** DOMDocument built from $markup (left empty when $markup is empty). */
        protected $dom = null;

        /** "scheme://[user[:pass]@]host[:port]" of the crawled URL, or '' when it has no scheme/host. */
        protected $base_url = null;

        /**
         * Fetches $url, normalises its markup and parses it into a DOM tree.
         *
         * @param string $url Absolute URL of the page to crawl.
         */
        public function __construct ($url)
        {
            $this->base_url = $this->_base_url($url);
            $this->markup = $this->_load_markup($url);
            $this->markup = $this->_prepare_markup($this->markup);
            $this->_init_dom();
        }

        /**
         * Downloads the page body with cURL.
         *
         * @param string $url URL to download.
         * @return string Response body, or '' when the transfer failed.
         */
        protected function _load_markup ($url)
        {
            $ch = curl_init();
            $timeout = 10;
            curl_setopt($ch, CURLOPT_URL, $url);
            curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
            curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, $timeout);
            $contents = curl_exec($ch);
            curl_close($ch);

            // curl_exec() returns false on failure; normalise so callers only ever see a string.
            return is_string($contents) ? $contents : '';
        }

        /**
         * Makes the markup safe for DOMDocument::loadHTML(): announces the
         * character set in <head> and turns every non-ASCII character into a
         * numeric entity.
         *
         * @param string $content Raw HTML.
         * @param string $encod   Source encoding; detected from $content when empty.
         * @return string Prepared HTML, or '' when $content is empty.
         */
        protected function _prepare_markup ($content, $encod='')
        {
            mb_detect_order("ASCII,UTF-8,ISO-8859-1,windows-1252,iso-8859-15");
            if (empty($content))
            {
                return '';
            }

            if (empty($encod))
            {
                $encod = mb_detect_encoding($content);
                if ($encod === false)
                {
                    // Detection can fail on mixed/invalid input; fall back to the most common encoding.
                    $encod = 'UTF-8';
                }
            }

            // Case-insensitive so <head>, <HEAD> and <Head> are all recognised.
            $headpos = mb_stripos($content, '<head>');
            if ($headpos !== false)
            {
                $headpos += 6; // length of "<head>"
                $content = mb_substr($content, 0, $headpos)
                    . '<meta http-equiv="Content-Type" content="text/html; charset=' . $encod . '">'
                    . mb_substr($content, $headpos);
            }

            // Replaces mb_convert_encoding(..., 'HTML-ENTITIES', ...), which is
            // deprecated as of PHP 8.2: encode every code point >= 0x80 as &#NNN;.
            return mb_encode_numericentity($content, array(0x80, 0x10FFFF, 0, 0x1FFFFF), $encod);
        }

        /**
         * Builds $this->dom from $this->markup.
         *
         * Real-world HTML is rarely valid, so libxml parse warnings are
         * collected internally and discarded instead of being emitted.
         */
        protected function _init_dom ()
        {
            $this->dom = new DOMDocument;
            if (empty($this->markup))
            {
                // loadHTML() rejects an empty string (warning, or ValueError on PHP 8).
                return;
            }

            $previous = libxml_use_internal_errors(true);
            $this->dom->loadHTML($this->markup);
            libxml_clear_errors();
            libxml_use_internal_errors($previous);

            $this->dom->normalizeDocument();
        }

        /**
         * Dispatches to the matching _get_<type>() extractor.
         *
         * @param string $type Extractor name, e.g. 'links'.
         * @return mixed The extractor's result, or null when no such extractor exists.
         */
        public function get ($type)
        {
            $method = "_get_{$type}";
            if (method_exists($this, $method))
            {
                // call_user_method() was removed in PHP 7; a variable method call is equivalent.
                return $this->$method();
            }
            return null;
        }

        /**
         * Collects the targets of all <a> elements on the page.
         *
         * Root-relative links ("/path") are prefixed with the base URL,
         * protocol-relative links ("//host/path") with the page's scheme,
         * and same-page fragments ("#id") and anchors without href are skipped.
         * Everything else is returned as written in the page.
         *
         * @return array|false List of link strings (possibly empty), or false
         *                     when the page has no markup.
         */
        protected function _get_links ()
        {
            if (empty($this->markup))
            {
                return false;
            }

            $links = array();
            $scheme = parse_url((string) $this->base_url, PHP_URL_SCHEME);
            if (empty($scheme))
            {
                $scheme = 'http';
            }

            foreach ($this->dom->getElementsByTagName('a') as $anchor)
            {
                $href = trim($anchor->getAttribute('href'));
                if ($href === '' || mb_substr($href, 0, 1) === '#')
                {
                    continue;
                }

                if (mb_substr($href, 0, 2) === '//')
                {
                    // Must be tested before the single-slash case below.
                    $links[] = $scheme . ':' . $href;
                }
                else if (mb_substr($href, 0, 1) === '/')
                {
                    $links[] = $this->base_url . $href;
                }
                else
                {
                    $links[] = $href;
                }
            }
            return $links;
        }

        /**
         * Reduces a URL to "scheme://[user[:pass]@]host[:port]".
         *
         * @param string $url URL to reduce.
         * @return string The base URL, or '' when $url has no scheme or host
         *                (e.g. "mailto:" or "javascript:" links).
         */
        protected function _base_url ($url)
        {
            $parsed_url = parse_url($url);
            if ($parsed_url === false || empty($parsed_url['scheme']) || empty($parsed_url['host']))
            {
                return '';
            }

            $base_url = $parsed_url['scheme'] . '://';
            if (!empty($parsed_url['user']))
            {
                $base_url .= $parsed_url['user'];
                if (!empty($parsed_url['pass']))
                {
                    $base_url .= ':' . $parsed_url['pass'];
                }
                $base_url .= '@';
            }
            $base_url .= $parsed_url['host'];
            if (!empty($parsed_url['port']))
            {
                $base_url .= ':' . $parsed_url['port'];
            }
            return $base_url;
        }
    }

    // Start from a seed URL and keep crawling $links[$i] in index order until
    // 100 links have been collected or there is no further entry to visit.
    $links = array('http://www.reddit.com/');
    for ($i = 0; count($links) < 100 && isset($links[$i]); $i++)
    {
        $crawler = new DOM_Crawler($links[$i]);
        $found = $crawler->get('links');

        // get('links') returns false for an empty page; only merge real, non-empty results.
        if (is_array($found) && !empty($found))
        {
            $links = _merge_unique($links, $found);
        }
    }
    print_r($links);

    /**
     * Appends to $arr every value of $arr2 that $arr does not already contain.
     *
     * The order of $arr is preserved and new values are added at the end in
     * the order they appear in $arr2. Comparison is strict (===).
     *
     * @param array $arr  Values collected so far.
     * @param mixed $arr2 Values to add; anything that is not an array is ignored.
     * @return array $arr extended with the new values.
     */
    function _merge_unique ($arr, $arr2) {
        if (!is_array($arr2)) {
            return $arr;
        }

        foreach ($arr2 as $value) {
            if (!in_array($value, $arr, true)) {
                $arr[] = $value;
            }
        }

        return $arr;
    }
    ?>
     
    iPhrankie, Oct 31, 2009 IP
  3. JAY6390

    JAY6390 Peon

    Messages:
    918
    Likes Received:
    31
    Best Answers:
    0
    Trophy Points:
    0
    #3
    Could you not just do this?
    // Crawl one page, drop duplicate links and print them inside a <pre> block.
    $f = new DOM_Crawler('http://www.reddit.com/');
    $found = $f->get('links');
    // get('links') can return false (empty page), which array_unique() would reject.
    $links = is_array($found) ? array_unique($found) : array();
    // The URLs come from a remote page, so escape them before echoing into HTML.
    echo '<pre>'.htmlspecialchars(print_r($links, true), ENT_QUOTES, 'UTF-8').'</pre>';
    PHP:
     
    JAY6390, Oct 31, 2009 IP