<?php
/**
 * This file should be run by Cron
 * Get latest news from RSS feed, parse content and save to DB
 */
	// Cron job: lift the execution time limit, downloading and parsing can be slow.
	set_time_limit( 0 );
	require( 'vendors/simplepie.php' );
	require( 'vendors/simple_html_dom.php' );
	require( 'vendors/http.class.php' );

	define('DS', DIRECTORY_SEPARATOR);

	// Database settings.
	// NOTE(review): credentials are hard-coded in the source file; consider
	// moving them to configuration that is kept out of version control.
	// Alternative host that was used before (unix socket): 'localhost:/tmp/mysql.sock'
	$host =     '127.0.0.1:3306';
	$database = 'lifesci4_lsw';
	$user =     'lifesci4_lsw';
	$password = 'QwEAsDZxCRfV1234';

	// Open the server connection, then select the schema; abort on either failure.
	$connexion = mysql_connect( $host, $user, $password );
	if( !$connexion ) {
		die( 'Echec de la connexion au serveur de donnees: ' . mysql_error() );
	}
	$db = mysql_select_db( $database, $connexion );
	if( !$db ) {
		die( 'Echec de la connexion a la base de donnees: ' . mysql_error() );
	}

	// Shared HTTP client; parseHTML() reads it from $GLOBALS to download pages.
	$GLOBALS['httpObj'] = new http( false, false );

	// Application root, only used to locate the cache file below.
	// Earlier variants derived it from $_SERVER['SCRIPT_FILENAME'] / $_SERVER['SCRIPT_NAME']
	// or used "D:\\Domains\\lifesciencesworld.com\\wwwroot\\app\\".
	$root = "/home2/lifesci4/public_html/app/";
	// File holding the serialized list of feed URLs seen on previous runs.
	$cachefile = implode( DS, array( $root . "tmp", "cache", "tasks", "previousFeedItems" ) );
/**
 * Load the RSS feed(s) through SimplePie (its own cache is disabled) and
 * build two arrays that share the same integer keys:
 *   $feeddata  - title, url1 and created date of every accepted item
 *   $feedlinks - only the URL of every accepted item
 */
	$feeds = array(
				'http://www.eurekalert.org/rss.xml'
	);
	$feed = new SimplePie();
	$feed->set_feed_url( $feeds );
	$feed->handle_content_type();
	$feed->enable_cache( false );
	$feed->init();

	$feedlinks = array();
	$feeddata = array();
	// Items whose title contains one of these expressions are ignored.
	$blacklist = array( 'to present', 'to Present' );
	foreach( $feed->get_items() as $item ) {
		$title = $item->get_title();

		$unwanted = false;
		foreach( $blacklist as $expression ) {
			if( mb_strpos( $title, $expression, 0, 'UTF-8' ) !== false ) {
				$unwanted = true;
				break;
			}
		}
		if( $unwanted ) {
			continue;
		}

		// The permalink is stored with HTML entities decoded.
		$permalink = htmlspecialchars_decode( $item->get_permalink() );
		// Both arrays are appended in lockstep so their keys stay aligned.
		$feeddata[] = array(
			'title' => $title,
			'url1' => $permalink,
			'created' => $item->get_date( "Y-m-d H:i:s" )
		);
		$feedlinks[] = $permalink;
	}
// Remove duplicate URLs. array_unique() keeps the original keys, which are
// needed afterwards to look up the full record in $feeddata.
	$feedlinks = array_unique( $feedlinks );
// Compare against the URLs remembered from previous runs (serialized array in $cachefile).
	if( !file_exists( $cachefile ) ) {
		// First run: every link is new; remember all of them.
		file_put_contents( $cachefile, serialize( $feedlinks ) );
	}else {
		$previtems = unserialize( file_get_contents( $cachefile ) );
		if( !is_array( $previtems ) ) {
			// Unreadable, empty or corrupt cache: start over with an empty history
			// instead of handing a non-array to array_diff(), which would make
			// every run stop here until the file is removed by hand.
			$previtems = array();
		}
		$feedlinks = array_diff( $feedlinks, $previtems ); // keep only unseen URLs (keys preserved)
		if( !$feedlinks ) {
			// Nothing new in the feed: stop silently.
			die( '' );
		}
		$previtems = array_merge( $feedlinks, $previtems ); // newest first
		$previtems = array_slice( $previtems, 0, 260 ); // cap the history, dropping the oldest entries
		file_put_contents( $cachefile, serialize( $previtems ) );
		unset( $previtems );
	}
// Replace every remaining URL by its full feed record; $feedlinks and
// $feeddata share the same keys.
	foreach( $feedlinks as $key => $value ) {
		$feedlinks[$key] = $feeddata[$key];
	}
// Set the connection encoding. mysql_set_charset() is preferred when available;
// the plain "SET NAMES" query is kept as a fallback.
	if( function_exists( 'mysql_set_charset' ) ) {
		mysql_set_charset( 'utf8', $connexion );
	}else {
		mysql_query( "SET NAMES 'utf8'", $connexion );
	}
// Display name stored in the "source" column, per news host.
	$knownSources = array(
		'prnewswire.com' => 'PR Newswire',
		'eurekalert.org' => 'Eurek Alert',
	);
// Download each article, extract its text and save it to the DB
	foreach( $feedlinks as $feedlink ) {
		$newsitem = $feedlink;
// $newsHost (lower-cased host without "www.") selects the source name and the
// site-specific extraction done by html2txt() via parseHTML()
		$newsHost = str_replace( 'www.', '', strtolower( parse_url( $newsitem['url1'], PHP_URL_HOST ) ) );
		// parseHTML() returns false when no text could be extracted; the item is
		// still stored, with an empty text.
		$newsitem['text'] = parseHTML( $newsitem['url1'], $newsHost );
		$newsitem['priority_until'] = $newsitem['created'];
		$newsitem['modified'] = $newsitem['created'];
		$newsitem['source'] = isset( $knownSources[$newsHost] ) ? $knownSources[$newsHost] : '';
		$newsitem['is_preformatted'] = 0;
		// Every value originates from the remote feed or page, so each string is
		// escaped before being placed in the statement.
		$query = "INSERT INTO news 
						(title,created,priority_until,modified,is_preformatted,text,source,url1) 
						VALUES (
							'".mysqlEscape( $newsitem['title'], $connexion )."',
							'".mysqlEscape( $newsitem['created'], $connexion )."',
							'".mysqlEscape( $newsitem['priority_until'], $connexion )."',
							'".mysqlEscape( $newsitem['modified'], $connexion )."',
							'".(int)$newsitem['is_preformatted']."',
							'".mysqlEscape( $newsitem['text'], $connexion )."',
							'".mysqlEscape( $newsitem['source'], $connexion )."',
							'".mysqlEscape( $newsitem['url1'], $connexion )."'
						)";
		$result = mysql_query( $query, $connexion );
		if( !$result ) {
			$message  = 'Invalid query: ' . mysql_error() . "\n";
			$message .= 'Whole query: ' . $query;
			die( $message );
		}
		unset( $newsitem );
	}
// Close DB Connection
	mysql_close( $connexion );
	echo count( $feedlinks ), " links parsed.";
	unset( $feedlinks, $feeddata );
/**
 * Download an article page and return its cleaned-up body markup.
 *
 * The page is fetched with the shared client in $GLOBALS['httpObj']. Only the
 * part following the first '<div class="entry">' marker (up to the next
 * occurrence of that marker, if any) is handed to html2txt().
 *
 * @param string $url      Address of the article page.
 * @param string $newsHost Host identifier forwarded to html2txt().
 * @return string|false    The extracted markup, or false when the download
 *                         failed, the marker is missing or nothing was left
 *                         after cleaning.
 */
	function parseHTML( $url, $newsHost ) {
		$content = $GLOBALS['httpObj']->download( $url );
		if( !is_string( $content ) || $content === '' ) {
			// Nothing was downloaded.
			return false;
		}

		$pieces = explode( '<div class="entry">', $content );
		if( !isset( $pieces[1] ) ) {
			// Marker not present: there is no article body to extract.
			return false;
		}

		$desc = html2txt( $pieces[1], $newsHost ); // Format html text here
		if( is_string( $desc ) && strlen( $desc ) ) {
			return $desc;
		}
		return false;
	}

	/**
	 * Reduce a downloaded article fragment to the markup that gets stored.
	 *
	 * Steps: site-specific extraction of the article container (prnewswire.com
	 * and eurekalert.org), removal of HTML comments, target="_blank" on every
	 * link, whitespace collapsing outside <pre> blocks, removal of script/style
	 * elements and of <!...--> sections, dash normalisation and a final
	 * character-set conversion.
	 *
	 * Whenever the markup cannot be parsed or the expected container is not
	 * found, an empty string is returned instead of calling methods on a
	 * missing node.
	 *
	 * @param string $document HTML fragment to clean.
	 * @param string $newsHost Lower-cased host name without "www."; selects the
	 *                         site-specific extraction.
	 * @return string Cleaned markup, '' when nothing usable could be extracted.
	 */
	function html2txt( $document, $newsHost ){
		if( !is_string( $document ) || trim( $document ) === '' ) {
			return '';
		}

		if( $newsHost == 'prnewswire.com' ) {
			$domObj = str_get_html( $document, true );
			if( !is_object( $domObj ) ) {
				return '';
			}
			// Keep only the inside of the article container.
			$container = $domObj->find( 'div[class=col-1 topics]', 0 );
			$document = is_object( $container ) ? $container->innertext : '';
			$domObj->clear();

			$domObj = str_get_html( $document, true );
			if( !is_object( $domObj ) ) {
				return '';
			}
			// Drop the first occurrence of each decoration block, when present.
			foreach( array( 'div.more', 'div.horizontalline', 'div.featured', 'h1' ) as $selector ) {
				$node = $domObj->find( $selector, 0 );
				if( is_object( $node ) ) {
					$node->outertext = '';
				}
			}
			$document = $domObj->save();
			$domObj->clear();
		}elseif( $newsHost == 'eurekalert.org' ) {
			$domObj = str_get_html( $document, true );
			if( !is_object( $domObj ) ) {
				return '';
			}
			// Wrap everything between the third <br> and the first <hr> in
			// <div id="wrapper"> so it can be selected after re-parsing.
			$thirdBreak = $domObj->find( 'br', 2 );
			if( is_object( $thirdBreak ) ) {
				$thirdBreak->outertext = $thirdBreak->outertext . '<div id="wrapper">';
			}
			$firstRule = $domObj->find( 'hr', 0 );
			if( is_object( $firstRule ) ) {
				$firstRule->outertext = '</div>' . $firstRule->outertext;
			}
			$document = $domObj->save();
			$domObj->clear();

			$domObj = str_get_html( $document, true );
			if( !is_object( $domObj ) ) {
				return '';
			}
			foreach( $domObj->find( 'table' ) as $tempTable ) {
				$tempTable->outertext = '';
			}
			$titleNode = $domObj->find( 'h1.title', 0 );
			if( is_object( $titleNode ) ) {
				$titleNode->outertext = '';
			}
			$wrapper = $domObj->find( 'div#wrapper', 0 );
			$document = is_object( $wrapper ) ? trim( $wrapper->innertext ) : '';
			$domObj->clear();
		}

		$domObj = str_get_html( $document, true );
		if( !is_object( $domObj ) ) {
			return '';
		}
// Remove the comments
		foreach( $domObj->find( 'comment' ) as $tempVar ) {
			$tempVar->outertext = '';
		}
// Make every link open in a new window
		foreach( $domObj->find( 'a' ) as $tempVar ) {
			$tempVar->target = '_blank';
		}
		$document = $domObj->save();
		$domObj->clear();
		
// Collapse whitespace in the document. <pre> blocks are first swapped for
// numbered placeholders (pretagssave = 1), then restored (pretagssave = 0),
// so their content is left untouched.
		$GLOBALS['pretags'] = array();
		$GLOBALS['pretagssave'] = 1;
		$document = preg_replace_callback( '@<pre[^>]*?>.*?</pre>@si', 'preTagReplaceCallback', $document );
		$document = preg_replace( '/\\s\\s*/', ' ', $document );
		$GLOBALS['pretagssave'] = 0;
		$document = preg_replace_callback( '@<pre[^>]*?>.*?</pre>@si', 'preTagReplaceCallback', $document );
		$GLOBALS['pretags'] = array();

										/* '@<[\/\!]*?[^<>]*?>@si',            // Strip out HTML tags */
		$search = array('@<script[^>]*?>.*?</script>@si',  // Strip out javascript
						 '@<style[^>]*?>.*?</style>@siU',    // Strip style tags properly
						 '@<![\s\S]*?--[ \t\n\r]*>@'         // Strip multi-line comments including CDATA
		);
		$text = preg_replace( $search, '', $document );
		$text = trim( $text );
		$text = str_ireplace( array( '–', '—' ), ' - ', $text );
		// NOTE(review): this treats the markup as ISO-8859-1; a page that is
		// already UTF-8 would end up double-encoded - confirm the source encoding.
		$text = mb_convert_encoding( $text, 'UTF-8', 'ISO-8859-1' );
		return $text;
	}

	/**
	 * preg_replace_callback() handler that protects <pre> blocks.
	 *
	 * While $GLOBALS['pretagssave'] is truthy the matched block is stored in
	 * $GLOBALS['pretags'] and replaced by a placeholder '<pre>N</pre>', N being
	 * its 1-based position in that list. Otherwise the number is read back from
	 * the placeholder and the stored block is returned.
	 *
	 * @param array $m Match array; $m[0] is the complete <pre>...</pre> match.
	 * @return string  Placeholder (save mode) or original block (restore mode).
	 */
	function preTagReplaceCallback( $m ) {
		$matched = $m[0];

		if( !$GLOBALS['pretagssave'] ) {
			// Restore mode: '<pre>3</pre>' -> '3</pre>' -> 3
			$position = (int)str_replace( '<pre>', '', $matched );
			return $GLOBALS['pretags'][$position - 1];
		}

		// Save mode: remember the block and hand out its placeholder.
		$GLOBALS['pretags'][] = $matched;
		return '<pre>' . count( $GLOBALS['pretags'] ) . '</pre>';
	}
	
/**
 * Escape a value for use inside a quoted SQL string literal.
 *
 * Uses the first escaping routine that exists, in this order:
 * mysql_real_escape_string() (receives the connection), mysql_escape_string(),
 * addslashes().
 *
 * @param string   $str Value to escape.
 * @param resource $con MySQL connection, only used by mysql_real_escape_string().
 * @return string       The escaped value.
 */
	function mysqlEscape( $str, $con ) {
		if( function_exists( 'mysql_real_escape_string' ) ) {
			return mysql_real_escape_string( $str, $con );
		}

		// Fallbacks that do not take the connection into account.
		return function_exists( 'mysql_escape_string' )
			? mysql_escape_string( $str )
			: addslashes( $str );
	}
?>