Hello, I need to parse HTML documents to get the "values" of all HTML tags and their attributes. For example, if I have: <a href="url_here" title="title_here"> text_here </a> then I need these 3 values: array( 'href' => 'url_here', 'title' => 'title_here', 'tag_value' => 'text_here' ); How can I do this? Please let me know, and if possible, please give me some code. Regards
I found some code here: http://www.weberdev.com/get_example-1817.html and modified it to return the value between the anchor tags too (it did only return the attributes). Here is the code:
<?php
/*
 * parseHtml.php
 * My notation of variables:
 * i_ = integer, ex: i_count
 * a_ = array, a_html
 * b_ = boolean,
 * s_ = string
 *
 * What it does:
 * - parses a html string and get the tags
 * - exceptions: html tags like <br> <hr> </a>, etc
 * - At the end, the array will look like this:
 * ["IMG"][0]["SRC"] = "xxx"
 * ["IMG"][1]["SRC"] = "xxx"
 * ["IMG"][1]["ALT"] = "xxx"
 * ["A"][0]["HREF"] = "xxx"
 *
 */
$html = '<a href="url_here" title="title_here"> text_here </a>';
$tags = parseHtml($html);
print_r ($tags);

/**
 * Collect the attributes of every tag in an HTML string.
 *
 * Tags without any attribute (<br>, <hr>, closing tags, ...) are skipped, and
 * so is anything whose name does not start with a letter (comments, doctype).
 * Tag and attribute names are upper-cased. Attribute values are returned
 * exactly as written in the markup, i.e. INCLUDING their surrounding quotes
 * ('"url_here"', not 'url_here'); an attribute without a value maps to ''.
 * For <a> tags the raw text up to the next closing </a> is stored under the
 * extra key "VALUE" ('' when the anchor is never closed).
 *
 * This is a simple scanner, not an HTML parser: a '>' inside a quoted
 * attribute value still ends the tag.
 *
 * @param string $s_str HTML source to scan
 * @return array        $a_html[TAGNAME][occurrence][ATTRIBUTE] = raw value
 */
function parseHtml( $s_str )
{
    $a_html   = array();
    $i_offset = 0;

    // Search for a tag in string
    while( ($i_open = strpos($s_str, '<', $i_offset)) !== false )
    {
        $i_close = strpos($s_str, '>', $i_open + 1);
        if( $i_close === false )
        {
            // Unterminated tag: nothing more to parse. Without this guard the
            // search offset would fall back to 0 and the loop would never end.
            break;
        }
        $i_offset = $i_close + 1;

        // Get everything into tag and split it into name and attribute part.
        // No attribute part (or a name not starting with a letter) => skip.
        $s_inner = substr($s_str, $i_open + 1, $i_close - $i_open - 1);
        if( !preg_match('/^([A-Za-z][^\s\/]*)\s+(.+)$/s', (string)$s_inner, $a_parts) )
        {
            continue;
        }
        $s_tagName = strtoupper($a_parts[1]);

        // name, optionally followed by = and a double-quoted, single-quoted
        // or bare value. Splitting on the FIRST '=' only keeps values such as
        // "page.php?a=b" intact, and quoted values may contain spaces.
        // '/' is excluded from names so the slash of <img ... /> is ignored.
        $s_attrPattern = '/([^\s=\/]+)(?:\s*=\s*("[^"]*"|\'[^\']*\'|\S*))?/';
        if( !preg_match_all($s_attrPattern, $a_parts[2], $a_attrs, PREG_SET_ORDER) )
        {
            continue;
        }

        // Without this, we will mess up the array
        $i_arrayCounter = isset($a_html[$s_tagName]) ? count($a_html[$s_tagName]) : 0;

        if( $s_tagName == "A" )
        {
            // Text between <a ...> and the next </a>, matched case-insensitively.
            $s_value = '';
            if( preg_match('/<\/a\s*>/i', $s_str, $a_end, PREG_OFFSET_CAPTURE, $i_close + 1) )
            {
                $s_value = (string)substr($s_str, $i_close + 1, $a_end[0][1] - $i_close - 1);
            }
            $a_html[$s_tagName][$i_arrayCounter]["VALUE"] = $s_value;
        }

        // get the tag options, like src="http://".
        // Here, the option name is 'src' and the stored value is '"http://"'
        foreach( $a_attrs as $a_attr )
        {
            $s_tagTokOption = strtoupper($a_attr[1]);
            $s_tagTokValue  = isset($a_attr[2]) ? $a_attr[2] : '';
            $a_html[$s_tagName][$i_arrayCounter][$s_tagTokOption] = $s_tagTokValue;
        }
    }

    return $a_html;
}
?>
PHP: There are probably better ways of doing this depending on exactly what it is you are trying to achieve.
<?php
if( !defined( 'MODE_HTML' ) )  define( 'MODE_HTML', 1 );   # Work in HTML source mode
if( !defined( 'MODE_FILE' ) )  define( 'MODE_FILE', 2 );   # Work in FILE/URL source mode
if( !defined( 'ERROR_NONE' ) ) define( 'ERROR_NONE', 0 );  # No errors occured
if( !defined( 'ERROR_DOM' ) )  define( 'ERROR_DOM', -1 );  # DOMDocument class not available
if( !defined( 'ERROR_MODE' ) ) define( 'ERROR_MODE', -2 ); # No valid mode passed to getLinks
if( !defined( 'ERROR_LOAD' ) ) define( 'ERROR_LOAD', -3 ); # Not able to load source
if( !defined( 'ERROR_LINK' ) ) define( 'ERROR_LINK', -4 ); # No links found in document

/**
 * getLinks - search for links in document
 *
 * Loads $source into a DOMDocument and collects every <a> element that has an
 * href or a title attribute. Each collected entry is an array with the keys
 * 'href', 'title', 'text' (the element's nodeValue) and 'node' (the element
 * itself). A missing attribute is reported as an empty string.
 *
 * @param string      $source HTML markup (MODE_HTML) or file name / URL (MODE_FILE)
 * @param int         $mode   MODE_HTML or MODE_FILE
 * @param array       $links  receives the collected links; always reset to array( ) first
 * @param DOMDocument $dom    receives the document object once one has been created
 * @return int ERROR_NONE when at least one link was collected, ERROR_DOM when
 *             the DOMDocument class is missing, ERROR_MODE for an unknown
 *             $mode, ERROR_LOAD when $source is empty or cannot be loaded,
 *             ERROR_LINK when the document contains no matching link
 *
 * <code>
 * #search for links in source of http://interviolet.com
 * getLinks( "http://interviolet.com", MODE_FILE, $links, $dom );
 * #search for links in the source '<a href="http://interviolet.com" title="TheTitle">Interviolet</a>'
 * getLinks( "<a href=\"http://interviolet.com\" title=\"TheTitle\">Interviolet</a>", MODE_HTML, $links, $dom );
 * </code>
 */
function getLinks( $source, $mode, &$links = null, &$dom = null )
{
    $links = array( );                      # Create link array for storage

    if( !class_exists( 'DOMDocument' ) )    # Check for DOMDocument class
        return ERROR_DOM;

    $dom = new DOMDocument( );              # Create new DOMDocument

    # loadHTML( ) / loadHTMLFile( ) throw on an empty argument in PHP 8, and
    # '@' does not silence exceptions, so an empty source is rejected up front.
    $empty = !is_string( $source ) || $source === '';

    switch( $mode )                         # Load HTML into DOM object
    {
        case MODE_HTML:
            # '@' is deliberate: real-world markup triggers parser warnings
            if( $empty || !@$dom->loadHTML( $source ) )
                return ERROR_LOAD;
        break;

        case MODE_FILE:
            if( $empty || !@$dom->loadHTMLFile( $source ) )
                return ERROR_LOAD;
        break;

        default:
            return ERROR_MODE;
    }

    # getElementsByTagName( ) always returns a (possibly empty) node list, so
    # "no links" has to be detected from what was actually collected.
    foreach( $dom->getElementsByTagName( 'a' ) as $tag )   # Find links in document
    {
        if( $tag->hasAttribute( 'href' ) || $tag->hasAttribute( 'title' ) ) # If the current link is valid
        {
            $links[ ] = array               # Store the current link
            (
                'href'  => $tag->getAttribute( 'href' ),
                'title' => $tag->getAttribute( 'title' ),
                'text'  => $tag->nodeValue,
                'node'  => $tag             # Store DOMNode in results too ...
            );
        }
    }

    return $links ? ERROR_NONE : ERROR_LINK;
}

switch( getLinks( 'http://forums.digitalpoint.com/showthread.php?t=863580', MODE_FILE, $links, $dom ) )
{
    case ERROR_NONE:
        foreach( $links as $index => $link )
        {
            printf( "--------------------#%03d--------------------\r\n", $index );
            foreach( $link as $key => $value )
            {
                # Compare against '' rather than testing truthiness so that a
                # value of "0" is printed instead of being shown as (null).
                if( $key != 'node' )
                    printf( "%s\t\t%s\r\n", $key, $value !== '' ? $value : '(null)' );
            }
            printf( "--------------------#%03d--------------------\r\n\r\n", $index );
        }
    break;

    case ERROR_DOM:
        print( "PHP DOM extension not loaded\r\n" );
    break;

    case ERROR_MODE:
        print( "No valid mode passed to function\r\n" );
    break;

    case ERROR_LOAD:
        print( "Failed to load document\r\n" );
    break;

    case ERROR_LINK:
        print( "No links found in document\r\n" );
    break;
}
?>
PHP: