diff --git a/bridges/Dilbert.php b/bridges/Dilbert.php new file mode 100644 index 00000000..cf741a31 --- /dev/null +++ b/bridges/Dilbert.php @@ -0,0 +1,43 @@ +returnError('Could not request Dilbert.', 404); + + foreach($html->find('div.STR_Image') as $element) { + $item = new Item(); + $href = $element->find('a',0)->href; + $item->uri = 'http://dilbert.com' . $href; + $content = str_replace('src="/', 'src="http://dilbert.com/',$element->innertext); + $content = str_replace('href="/', 'href="http://dilbert.com/',$content); + $item->content = $content; + $time = strtotime(substr($href, (strrpos($href, "/", -10) + 1), 10)); + $item->title = date("d/m/Y", $time); + $item->timestamp = $time; + $this->items[] = $item; + } + } + + public function getName(){ + return 'Dilbert'; + } + + public function getURI(){ + return 'http://dilbert.com'; + } + + public function getDescription(){ + return 'Dilbert via rss-bridge'; + } + + public function getCacheDuration(){ + return 14400; // 4 hours + } +} + diff --git a/bridges/Freenews.php b/bridges/Freenews.php new file mode 100644 index 00000000..dad8eb6d --- /dev/null +++ b/bridges/Freenews.php @@ -0,0 +1,69 @@ +uri = RUBRIQUE.$param['id']; + } + $html = file_get_html($this->getURI()) or $this->returnError('Could not request '.$this->getURI(), 404); +// $this->message("loaded HTML from ".$this->getURI()); + // customize name + $this->name = $html->find('title', 0)->innertext; + foreach($html->find('.news_line') as $newsLines) { + $this->parseLine($newsLines); + } + } + + public function parseLine($newsLines) { + foreach($newsLines->find('span') as $newsSpan) { + foreach($newsSpan->find('a') as $newsLink) { + $item = new Item(); + $item->title = trim($newsLink->title); + $item->uri = FREENEWS.$newsLink->href; + // now load that uri from cache + $articlePage = str_get_html($this->get_cached($item->uri)); + $content = $articlePage->find('.chapo', 0); + foreach($content->find('img') as $image) { + $image->src = FREENEWS.$image->src; + 
} + $redaction = $articlePage->find('.redac', 0); + $rubrique = $redaction->find('a', 0); + $auteur = $redaction->find('a', 1); + $item->content = $content->innertext; + $item->name = $auteur->innertext; + // format should parse 2014-03-25T16:21:20Z. But, according to http://stackoverflow.com/a/10478469, it is not that simple + $item->timestamp = DateTime::createFromFormat('Y-m-d\TH:i:s+', $redaction->title)->getTimestamp(); + $this->items[] = $item; + // return after first link, as there are hidden treasures in those pages + return; + } + } + } + + public function getName(){ + return $this->name; + } + + public function getURI(){ + return $this->uri; + } + + public function getCacheDuration(){ + return 3600; // 1 hour + } + public function getDescription(){ + return "Un site d'actualité pour les freenautes (mais ne parlant pas que de la freebox). par rss-bridge"; + } +} diff --git a/bridges/Gawker.php b/bridges/Gawker.php new file mode 100644 index 00000000..4d135220 --- /dev/null +++ b/bridges/Gawker.php @@ -0,0 +1,89 @@ +uri = $param['site']; + } + $html = file_get_html($this->getURI()) or $this->returnError('Could not request '.$this->getURI(), 404); + $this->message("loaded HTML from ".$this->getURI()); + // customize name + $this->name = $html->find('title', 0)->innertext; + foreach($html->find('.main-column') as $content) { + $this->parseContent($content); + } + } + + public function parseContent($content) { + foreach($content->find('.headline') as $headline) { + foreach($headline->find('a') as $articleLink) { + // notice we only use article from this gawker site (as gawker like to see us visit other sites) + if(strpos($articleLink->href, $this->getURI())>=0) { + $this->parseLink($articleLink); + } + } + } + } + + public function parseLink($infoLink) { + $item = new Item(); + $item->uri = $infoLink->href; + $item->title = $infoLink->innertext; + try { + // now load that uri from cache +// $this->message("loading page ".$item->uri); + $articlePage = 
str_get_html($this->get_cached($item->uri)); + if(is_object($articlePage)) { + $content = $articlePage->find('.post-content', 0); + $this->defaultImageSrcTo($content, $this->getURI()); + $item->content = $content->innertext; + // http://stackoverflow.com/q/22715928/15619 + $publishtime = $articlePage->find('.publish-time', 0)->getAttribute("data-publishtime"); + // don't know what I'm doing there, but http://www.epochconverter.com/programming/functions-php.php#epoch2date recommends it + $item->timestamp = $this->js_to_unix_timestamp($publishtime); + $vcard = $articlePage->find('.vcard', 0); + if(is_object($vcard)) { + $item->name = $vcard->find('a', 0)->innertext; + } + } else { + throw new Exception("cache content for ".$item->uri." is NOT a Simple DOM parser object !"); + } + } catch(Exception $e) { + $this->message("obtaining ".$item->uri." resulted in exception ".$e->getMessage().". Deleting cached page ..."); + // maybe file is incorrect. it should be discarded from cache + $this->remove_from_cache($item->url); + $item->content = $e->getMessage(); + } + $this->items[] = $item; + } + + function js_to_unix_timestamp($jsTimestamp){ + return $jsTimestamp/1000; + } + + public function getName(){ + return $this->name; + } + + public function getURI(){ + return $this->uri; + } + + public function getCacheDuration(){ + return 3600; // 1h + } + public function getDescription(){ + return "Gawker press blog content."; + } +} diff --git a/bridges/Les400Culs.php b/bridges/Les400Culs.php new file mode 100644 index 00000000..1dd9e3e4 --- /dev/null +++ b/bridges/Les400Culs.php @@ -0,0 +1,63 @@ +getURI()) or $this->returnError('Could not request '.$this->getURI(), 404); + + foreach($html->find('#alpha-inner') as $articles) { + foreach($articles->find('div.entry') as $article) { + $header = $article->find('h3.entry-header a', 0); + $content = $article->find('div.entry-content', 0); + + + $item = new Item(); + $item->title = trim($header->innertext); + $item->uri = 
$header->href; + $item->name = "Agnès Girard"; + // date is stored outside this node ! + $dateHeader = $article->prev_sibling(); + // http://stackoverflow.com/a/6239199/15619 (strtotime reads slash-separated dates in the american m/d/Y order, so the d/m/Y format is parsed explicitly) + $item->timestamp = DateTime::createFromFormat('d/m/Y', $dateHeader->innertext)->getTimestamp(); + + + $linkForMore = $content->find('p.entry-more-link a',0); + if($linkForMore==null) { + $item->content = $content->innertext; + } else { + $pageAddress = $linkForMore->href; + $articlePage = str_get_html($this->get_cached($linkForMore->href)); + if($articlePage==null) { + $item->content = $content->innertext."\n

".$linkForMore->outertext."

"; + } else { + // TODO use some caching there ! + $fullContent = $articlePage->find('div.entry-content', 0); + $item->content = $fullContent->innertext; + } + } + $this->items[] = $item; + } + } + } + + public function getName(){ + return 'Les 400 Culs'; + } + + public function getURI(){ + return SEXE; + } + + public function getCacheDuration(){ + return 7200; // 2h hours + } + public function getDescription(){ + return "La planète sexe, vue et racontée par Agnès Giard. Et par rss-bridge"; + } +} diff --git a/bridges/LesJoiesDuCode.php b/bridges/LesJoiesDuCode.php new file mode 100644 index 00000000..c5a74b8e --- /dev/null +++ b/bridges/LesJoiesDuCode.php @@ -0,0 +1,55 @@ +returnError('Could not request LesJoiesDuCode.', 404); + + foreach($html->find('div.post') as $element) { + $item = new Item(); + $temp = $element->find('h3 a', 0); + + $titre = $temp->innertext; + $url = $temp->href; + + $temp = $element->find('div.bodytype', 0); + $content = $temp->innertext; + + $auteur = $temp->find('.c1 em', 0); + $pos = strpos($auteur->innertext, "by"); + + if($pos > 0) + { + $auteur = trim(str_replace("*/", "", substr($auteur->innertext, ($pos + 2)))); + $item->name = $auteur; + } + + + $item->content .= trim($content); + $item->uri = $url; + $item->title = trim($titre); + + $this->items[] = $item; + } + } + + public function getName(){ + return 'Les Joies Du Code'; + } + + public function getURI(){ + return 'http://lesjoiesducode.fr/'; + } + + public function getCacheDuration(){ + return 7200; // 2h hours + } + public function getDescription(){ + return "Les Joies Du Code via rss-bridge"; + } +} diff --git a/bridges/Sexactu.php b/bridges/Sexactu.php new file mode 100644 index 00000000..20ba5f94 --- /dev/null +++ b/bridges/Sexactu.php @@ -0,0 +1,87 @@ +getURI()) or $this->returnError('Could not request '.$this->getURI(), 404); + + foreach($html->find('.content-holder') as $contentHolder) { + // only use first list as second one only contains pages numbers + $articles = 
$contentHolder->find('ul', 0); + foreach($articles->find('li') as $element) { + // if you ask about that method_exists, there seems to be a bug in simple html dom + // see stackoverflow for more details : http://stackoverflow.com/a/10828479/15619 + if(is_object($element)) { + $item = new Item(); + // various metadata + $titleBlock = $element->find('.title-holder', 0); + if(is_object($titleBlock)) { + $titleDetails = $titleBlock->find('.article-title',0); + $titleData = $titleDetails->find('h2', 0)->find('a',0); + $titleTimestamp =$titleDetails->find('h4',0); + $item->title = $this->correctCase(trim($titleData->innertext)); + $item->uri = GQ.$titleData->href; + + // Fugly date parsing due to the fact my DNS-323 doesn't support php intl extension + $dateText = $titleTimestamp->innertext; + $dateText = substr($dateText, strpos($dateText,',')+1); + $dateText = str_replace($find, $replace, strtolower($dateText)); + $date = strtotime($dateText); + $item->timestamp = $date; + + $item->name = "Maïa Mazaurette"; + $elementText = $element->find('.text-container', 0); + // don't forget to replace images server url with gq one + foreach($elementText->find('img') as $image) { + $image->src = GQ.$image->src; + } + $item->content = $elementText->innertext; + $this->items[] = $item; + } + + } + + } + } + } + + public function getName(){ + return 'Sexactu'; + } + + public function getURI(){ + return GQ.'/sexactu'; + } + + public function getCacheDuration(){ + return 7200; // 2h hours + } + public function getDescription(){ + return "Sexactu via rss-bridge"; + } + + public function correctCase($str) { + $sentences=explode('.', mb_strtolower($str, "UTF-8")); + $str=""; + $sep=""; + foreach ($sentences as $sentence) + { + //upper case first char + $sentence=ucfirst(trim($sentence)); + + //append sentence to output + $str=$str.$sep.$sentence; + $sep=". 
"; + } + return $str; + } +} diff --git a/bridges/WorldOfTanks.php b/bridges/WorldOfTanks.php new file mode 100644 index 00000000..1a6e6257 --- /dev/null +++ b/bridges/WorldOfTanks.php @@ -0,0 +1,63 @@ +lang = $param['lang']; + } + if(empty($param['category'])) { + $this->uri = WORLD_OF_TANKS.$this->lang.NEWS; + } else { + $this->uri = WORLD_OF_TANKS.$this->lang.NEWS.$param['category']."/"; + } + $html = file_get_html($this->getURI()) or $this->returnError('Could not request '.$this->getURI(), 404); + $this->message("loaded HTML from ".$this->getURI()); + // customize name + $this->name = $html->find('title', 0)->innertext; + foreach($html->find('.b-imgblock_ico') as $infoLink) { + $this->parseLine($infoLink); + } + } + + public function parseLine($infoLink) { + $item = new Item(); + $item->uri = WORLD_OF_TANKS.$infoLink->href; + // now load that uri from cache +// $this->message("loading page ".$item->uri); + $articlePage = str_get_html($this->get_cached($item->uri)); + $content = $articlePage->find('.l-content', 0); + $this->defaultImageSrcTo($content, WORLD_OF_TANKS); + $item->title = $content->find('h1', 0)->innertext; + $item->content = $content->find('.b-content', 0)->innertext; +// $item->name = $auteur->innertext; + $item->timestamp = $content->find('.b-statistic_time', 0)->getAttribute("data-timestamp"); + $this->items[] = $item; + } + + public function getName(){ + return $this->name; + } + + public function getURI(){ + return $this->uri; + } + + public function getCacheDuration(){ + return 3600; // 2h hours + } + public function getDescription(){ + return "Toutes les actualités les plus brulantes de ce simulateur de destruction d'acier."; + } +} diff --git a/index.php b/index.php index 352cbb03..a5fa188c 100644 --- a/index.php +++ b/index.php @@ -85,9 +85,11 @@ try{ // Data retrieval $bridge = Bridge::create($bridge); - $bridge - ->setCache($cache) // Comment this lign for avoid cache use - ->setDatas($_REQUEST); + if(isset($_REQUEST["disable_cache"])) { 
+ } else { + $bridge->setCache($cache); // just add disable_cache to your query to disable caching + } + $bridge->setDatas($_REQUEST); // Data transformation $format = Format::create($format); diff --git a/lib/Bridge.php b/lib/Bridge.php index 8620107f..41b5e498 100644 --- a/lib/Bridge.php +++ b/lib/Bridge.php @@ -71,6 +71,102 @@ abstract class BridgeAbstract implements BridgeInterface{ return $this; } + + /** + * Set default image SRC attribute to point on given server when none is provided (that's to say when image src starts with '/') + */ + public function defaultImageSrcTo($content, $server) { + foreach($content->find('img') as $image) { + if(strpos($image->src, '/')==0) { + $image->src = $server.$image->src; + } + } + } +} + +/** + * Extension of BridgeAbstract allowing caching of files downloaded over http. + * This is especially useful for sites from Gawker or Liberation networks, which allow page excerpts to be viewed together on index, while full pages have to be downloaded + * separately. + * This class mainly provides a get_cached method which will download the file from its remote location when no local copy exists. + * TODO allow file cache invalidation by touching files on access, and removing files/directories which have not been touched since ... a long time + * After all, rss-bridge is not respaw, isn't it ? + */ +abstract class HttpCachingBridgeAbstract extends BridgeAbstract { + + /** + * Maintain locally cached versions of pages to download to avoid multiple downloads. + * A file path is generated from the url by stripping the "http://" or "https://" prefix and replacing "?", "&" and "=" by "/", and the file is saved below the cache/pages directory + * @param url url to cache + * @return content of file as string + */ + public function get_cached($url) { + $simplified_url = str_replace(["http://", "https://", "?", "&", "="], ["", "", "/", "/", "/"], $url); + // TODO build this from the variable given to Cache + $pageCacheDir = __DIR__ . 
'/../cache/'."pages/"; + $filename = $pageCacheDir.$simplified_url; + if (substr($filename, -1) == '/') { + $filename = $filename."index.html"; + } + if(file_exists($filename)) { +// $this->message("loading cached file from ".$filename." for page at url ".$url); + // TODO touch file and its parent, and try to do neighbour deletion + $this->refresh_in_cache($pageCacheDir, $filename); + } else { +// $this->message("we have no local copy of ".$url." Downloading to ".$filename); + $dir = substr($filename, 0, strrpos($filename, '/')); + if(!is_dir($dir)) { +// $this->message("creating directories for ".$dir); + mkdir($dir, 0777, true); + } + $this->download_remote($url, $filename); + } + return file_get_contents($filename); + } + + private function refresh_in_cache($pageCacheDir, $filename) { + $currentPath = $filename; + while(!$pageCacheDir==$currentPath) { + touch($currentPath); + $currentPath = dirname($currentPath); + } + } + + public function download_remote($url , $save_path) { + $f = fopen( $save_path , 'w+'); + if($f) { + $handle = fopen($url , "rb"); + if($handle) { + while (!feof($handle)) { + $contents = fread($handle, 8192); + if($contents) { + fwrite($f , $contents); + } + } + fclose($handle); + } + fclose($f); + } + } + + public function remove_from_cache($url) { + $simplified_url = str_replace(["http://", "https://", "?", "&", "="], ["", "", "/", "/", "/"], $url); + // TODO build this from the variable given to Cache + $pageCacheDir = __DIR__ . '/../cache/'."pages/"; + $filename = realpath($pageCacheDir.$simplified_url); + $this->message("removing from cache \"".$filename."\" WELL, NOT REALLY"); + // filename is NO GOOD +// unlink($filename); + } + + public function message($text) { + $backtrace = debug_backtrace(DEBUG_BACKTRACE_IGNORE_ARGS, 3); + $calling = $backtrace[2]; + $message = $calling["file"].":".$calling["line"] + ." class ".get_class($this)."->".$calling["function"] + ." 
- ".$text; + error_log($message); + } } class Bridge{ @@ -92,9 +188,9 @@ class Bridge{ } $pathBridge = self::getDir() . $nameBridge . '.php'; - + if( !file_exists($pathBridge) ){ - throw new \Exception('The bridge you looking for does not exist.'); + throw new \Exception('The bridge you looking for does not exist. It should be at path '.$pathBridge); } require_once $pathBridge;