From 4857cdbedc4639403d2cfc731a3a191908dbee98 Mon Sep 17 00:00:00 2001 From: Nicolas Delsaux Date: Tue, 4 Feb 2014 17:53:59 +0100 Subject: [PATCH 01/12] ajout des scriptsd e Superbaillot --- bridges/Dilbert.php | 43 ++++++++++++++++++++++++++++ bridges/LesJoiesDuCode.php | 57 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 100 insertions(+) create mode 100644 bridges/Dilbert.php create mode 100644 bridges/LesJoiesDuCode.php diff --git a/bridges/Dilbert.php b/bridges/Dilbert.php new file mode 100644 index 00000000..893747b6 --- /dev/null +++ b/bridges/Dilbert.php @@ -0,0 +1,43 @@ +returnError('Could not request Dilbert.', 404); + + foreach($html->find('div.STR_Image') as $element) { + $item = new Item(); + $href = $element->find('a',0)->href; + $item->uri = 'http://dilbert.com' . $href; + $content = str_replace('src="/', 'src="http://dilbert.com/',$element->innertext); + $content = str_replace('href="/', 'href="http://dilbert.com/',$content); + $item->content = $content; + $time = strtotime(substr($href, (strrpos($href, "/", -10) + 1), 10)); + $item->title = date("d/m/Y", $time); + $item->timestamp = $time; + $this->items[] = $item; + } + } + + public function getName(){ + return 'Dilbert'; + } + + public function getURI(){ + return 'http://dilbert.com'; + } + + public function getDescription(){ + return 'Dilbert via rss-bridge'; + } + + public function getCacheDuration(){ + return 14400; // 4 hours + } +} +?> diff --git a/bridges/LesJoiesDuCode.php b/bridges/LesJoiesDuCode.php new file mode 100644 index 00000000..a14d9f6d --- /dev/null +++ b/bridges/LesJoiesDuCode.php @@ -0,0 +1,57 @@ +returnError('Could not request LesJoiesDuCode.', 404); + + foreach($html->find('div.post') as $element) { + $item = new Item(); + $temp = $element->find('h3 a', 0); + + $titre = $temp->innertext; + $url = $temp->href; + + $temp = $element->find('div.bodytype', 0); + $content = $temp->innertext; + + $auteur = $temp->find('.c1 em', 0); + $pos = strpos($auteur->innertext, "by"); + + if($pos > 0) + { + $auteur = trim(str_replace("*/", "", substr($auteur->innertext, ($pos + 2)))); + $item->name = $auteur; + } + + + $item->content .= trim($content); + $item->uri = $url; + $item->title = trim($titre); + + $this->items[] = $item; + } + } + + public function getName(){ + return 'Les Joies Du Code'; + } + + public function getURI(){ + return 'http://lesjoiesducode.fr/'; + } + + public function getCacheDuration(){ + return 7200; // 2h hours + } + public function getDescription(){ + return "Les Joies Du Code via rss-bridge"; + } +} +?> + From 13232266778c61e52aab3b37ee7874bfa574f5d2 Mon Sep 17 00:00:00 2001 From: Nicolas Delsaux Date: Tue, 4 Feb 2014 17:54:18 +0100 Subject: [PATCH 02/12] ajout du script pour Sexactu --- bridges/Sexactu.php | 58 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 58 insertions(+) create mode 100644 bridges/Sexactu.php diff --git a/bridges/Sexactu.php b/bridges/Sexactu.php new file mode 100644 index 00000000..5abb0325 --- /dev/null +++ b/bridges/Sexactu.php @@ -0,0 +1,58 @@ +returnError('Could not request http://www.gqmagazine.fr/sexactu.', 404); + + foreach($html->find('div.content-holder ul li') as $element) { + $item = new Item(); + $temp = $element->find('h3 a', 0); + + $titreElement = $element->find('.title-holder .article-title a'); + $titre = $titreElement-> + $url = $temp->href; + + $temp = $element->find('div.text-container', 0); + $content = $temp->innertext; + + $auteur = $temp->find('.c1 em', 0); + $pos = strpos($auteur->innertext, "by"); + + if($pos > 0) + { + $auteur = trim(str_replace("*/", "", substr($auteur->innertext, ($pos + 2)))); + $item->name = $auteur; + } + + + $item->content .= trim($content); + $item->uri = $url; + $item->title = trim($titre); + + $this->items[] = $item; + } + } + + public function getName(){ + return 'Sexactu'; + } + + public function getURI(){ + return 'http://http://www.gqmagazine.fr/sexactu/'; + } + + public function getCacheDuration(){ + return 7200; // 2h hours + } + public function getDescription(){ + return "Sexactu via rss-bridge"; + } +} +?> + From 4f1d4137d4221d12b3a59987bc6adb7e47b97228 Mon Sep 17 00:00:00 2001 From: Nicolas Delsaux Date: Tue, 4 Feb 2014 18:00:11 +0100 Subject: [PATCH 03/12] premire modification qui ne marche pas --- bridges/Sexactu.php | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/bridges/Sexactu.php b/bridges/Sexactu.php index 5abb0325..37b16c18 100644 --- a/bridges/Sexactu.php +++ b/bridges/Sexactu.php @@ -12,17 +12,16 @@ class LesJoiesDuCodeBridge extends BridgeAbstract{ foreach($html->find('div.content-holder ul li') as $element) { $item = new Item(); - $temp = $element->find('h3 a', 0); $titreElement = $element->find('.title-holder .article-title a'); - $titre = $titreElement-> - $url = $temp->href; + $titre = $titreElement->innertext + $url = $titreElement->href; $temp = $element->find('div.text-container', 0); $content = $temp->innertext; - $auteur = $temp->find('.c1 em', 0); - $pos = strpos($auteur->innertext, "by"); + $auteur = $element->find('div.header-holder', 0); + $pos = strpos($auteur->innertext, "par"); if($pos > 0) { From dbe9ae44dfdf307b44c65b554c98c5f40774ff77 Mon Sep 17 00:00:00 2001 From: Nicolas Delsaux Date: Wed, 5 Feb 2014 10:16:45 +0100 Subject: [PATCH 04/12] Correctiond e quelques boulettes, mais ca ne marche toujours pas --- bridges/Sexactu.php | 29 ++++++++--------------------- 1 file changed, 8 insertions(+), 21 deletions(-) diff --git a/bridges/Sexactu.php b/bridges/Sexactu.php index 37b16c18..9153afc7 100644 --- a/bridges/Sexactu.php +++ b/bridges/Sexactu.php @@ -5,7 +5,7 @@ * @description Sexactu via rss-bridge * @update 04/02/2014 */ -class LesJoiesDuCodeBridge extends BridgeAbstract{ +class SexactuBridge extends BridgeAbstract{ public function collectData(array $param){ $html = file_get_html('http://http://www.gqmagazine.fr/sexactu') or $this->returnError('Could not request http://www.gqmagazine.fr/sexactu.', 404); @@ -13,27 +13,14 @@ class LesJoiesDuCodeBridge extends BridgeAbstract{ foreach($html->find('div.content-holder ul li') as $element) { $item = new Item(); - $titreElement = $element->find('.title-holder .article-title a'); - $titre = $titreElement->innertext - $url = $titreElement->href; - - $temp = $element->find('div.text-container', 0); - $content = $temp->innertext; - - $auteur = $element->find('div.header-holder', 0); - $pos = strpos($auteur->innertext, "par"); - - if($pos > 0) - { - $auteur = trim(str_replace("*/", "", substr($auteur->innertext, ($pos + 2)))); - $item->name = $auteur; - } - - - $item->content .= trim($content); - $item->uri = $url; - $item->title = trim($titre); + // various metadata + $titleBock = $element->find('title-holder'); + $titleData = $titleBlock->find('article-title h2 a'); + $item->title = trim($titleData->innertext); + $item->uri = $titleData->href; + $item->name = "Maïa Mazaurette"; + $item->content = $element->find('text-container')->innertext; $this->items[] = $item; } } From 1644a855ee07daacdc386d10e50ce3e457d0a734 Mon Sep 17 00:00:00 2001 From: Nicolas Delsaux Date: Tue, 18 Feb 2014 11:55:47 +0100 Subject: [PATCH 05/12] Finally, a working Seactu bridge (I ahve however a bug with trailing whitespaces --- bridges/Sexactu.php | 53 ++++++++++++++++++++++++++++++--------------- 1 file changed, 36 insertions(+), 17 deletions(-) diff --git a/bridges/Sexactu.php b/bridges/Sexactu.php index 9153afc7..a44df2b8 100644 --- a/bridges/Sexactu.php +++ b/bridges/Sexactu.php @@ -5,23 +5,40 @@ * @description Sexactu via rss-bridge * @update 04/02/2014 */ -class SexactuBridge extends BridgeAbstract{ +define("GQ", "http://www.gqmagazine.fr"); +class Sexactu extends BridgeAbstract{ public function collectData(array $param){ - $html = file_get_html('http://http://www.gqmagazine.fr/sexactu') or $this->returnError('Could not request http://www.gqmagazine.fr/sexactu.', 404); - - foreach($html->find('div.content-holder ul li') as $element) { - $item = new Item(); - - // various metadata - $titleBock = $element->find('title-holder'); - $titleData = $titleBlock->find('article-title h2 a'); - - $item->title = trim($titleData->innertext); - $item->uri = $titleData->href; - $item->name = "Maïa Mazaurette"; - $item->content = $element->find('text-container')->innertext; - $this->items[] = $item; + $html = file_get_html($this->getURI()) or $this->returnError('Could not request '.$this->getURI(), 404); + + foreach($html->find('.content-holder') as $contentHolder) { + // only use first list as second one only contains pages numbers + $articles = $contentHolder->find('ul', 0); + foreach($articles->find('li') as $element) { + // if you ask about that method_exists, there seems to be a bug in simple html dom + // see stackoverflow for more details : http://stackoverflow.com/a/10828479/15619 + if(is_object($element)) { + $item = new Item(); + // various metadata + $titleBlock = $element->find('.title-holder', 0); + if(is_object($titleBlock)) { + $titleData = $titleBlock->find('.article-title',0)->find('h2', 0)->find('a',0); + $item->title = trim($titleData->innertext); + $item->uri = GQ.$titleData->href; + + $item->name = "Maïa Mazaurette"; + $elementText = $element->find('.text-container', 0); + // don't forget to replace images server url with gq one + foreach($elementText->find('img') as $image) { + $image->src = GQ.$image->src; + } + $item->content = $elementText->innertext; + $this->items[] = $item; + } + + } + + } } } @@ -30,7 +47,7 @@ class SexactuBridge extends BridgeAbstract{ } public function getURI(){ - return 'http://http://www.gqmagazine.fr/sexactu/'; + return GQ.'/sexactu'; } public function getCacheDuration(){ @@ -40,5 +57,7 @@ class SexactuBridge extends BridgeAbstract{ return "Sexactu via rss-bridge"; } } -?> + +// what did you do Seb ? WHAT DID YOU DO ???? +// seems like bridge should not incldue php close ?> From 5f150d3ae53c9c9597c19a9181915f59eb06226e Mon Sep 17 00:00:00 2001 From: Nicolas Delsaux Date: Tue, 18 Feb 2014 11:58:29 +0100 Subject: [PATCH 06/12] Outputting path to expected bridge should help newcomers feeling welcomed, no ? --- lib/Bridge.php | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lib/Bridge.php b/lib/Bridge.php index 7b6c4751..dbff16b2 100644 --- a/lib/Bridge.php +++ b/lib/Bridge.php @@ -92,9 +92,9 @@ class Bridge{ } $pathBridge = self::getDir() . $nameBridge . '.php'; - + if( !file_exists($pathBridge) ){ - throw new \Exception('The bridge you looking for does not exist.'); + throw new \Exception('The bridge you looking for does not exist. It should be at path '.$pathBridge); } require_once $pathBridge; From f7976419ae3924d16c1eaf899a86df553c964ed6 Mon Sep 17 00:00:00 2001 From: Nicolas Delsaux Date: Thu, 20 Feb 2014 08:42:40 +0100 Subject: [PATCH 07/12] fixed title formatting --- bridges/Sexactu.php | 38 +++++++++++++++++++++++++++++++------- 1 file changed, 31 insertions(+), 7 deletions(-) diff --git a/bridges/Sexactu.php b/bridges/Sexactu.php index a44df2b8..20ba5f94 100644 --- a/bridges/Sexactu.php +++ b/bridges/Sexactu.php @@ -9,7 +9,10 @@ define("GQ", "http://www.gqmagazine.fr"); class Sexactu extends BridgeAbstract{ public function collectData(array $param){ - $html = file_get_html($this->getURI()) or $this->returnError('Could not request '.$this->getURI(), 404); +$find = array('janvier', 'février', 'mars', 'avril', 'mai', 'juin', 'juillet', 'août', 'septembre', 'novembre', 'décembre'); +$replace = array('January', 'February', 'March', 'April', 'May', 'June', 'July', 'August', 'September', 'October', 'November', 'December'); + + $html = file_get_html($this->getURI()) or $this->returnError('Could not request '.$this->getURI(), 404); foreach($html->find('.content-holder') as $contentHolder) { // only use first list as second one only contains pages numbers @@ -22,10 +25,19 @@ class Sexactu extends BridgeAbstract{ // various metadata $titleBlock = $element->find('.title-holder', 0); if(is_object($titleBlock)) { - $titleData = $titleBlock->find('.article-title',0)->find('h2', 0)->find('a',0); - $item->title = trim($titleData->innertext); + $titleDetails = $titleBlock->find('.article-title',0); + $titleData = $titleDetails->find('h2', 0)->find('a',0); + $titleTimestamp =$titleDetails->find('h4',0); + $item->title = $this->correctCase(trim($titleData->innertext)); $item->uri = GQ.$titleData->href; + // Fugly date parsing due to the fact my DNS-323 doesn't support php intl extension + $dateText = $titleTimestamp->innertext; + $dateText = substr($dateText, strpos($dateText,',')+1); + $dateText = str_replace($find, $replace, strtolower($dateText)); + $date = strtotime($dateText); + $item->timestamp = $date; + $item->name = "Maïa Mazaurette"; $elementText = $element->find('.text-container', 0); // don't forget to replace images server url with gq one @@ -56,8 +68,20 @@ class Sexactu extends BridgeAbstract{ public function getDescription(){ return "Sexactu via rss-bridge"; } + + public function correctCase($str) { + $sentences=explode('.', mb_strtolower($str, "UTF-8")); + $str=""; + $sep=""; + foreach ($sentences as $sentence) + { + //upper case first char + $sentence=ucfirst(trim($sentence)); + + //append sentence to output + $str=$str.$sep.$sentence; + $sep=". "; + } + return $str; + } } - -// what did you do Seb ? WHAT DID YOU DO ???? -// seems like bridge should not incldue php close ?> - From 79e4e9fdea5c56ba91970cb8f0c763de8a3adaeb Mon Sep 17 00:00:00 2001 From: Nicolas Delsaux Date: Thu, 20 Feb 2014 08:43:14 +0100 Subject: [PATCH 08/12] added a bridge for one Liberation blog that could easily be extended for others --- bridges/Les400Culs.php | 98 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 98 insertions(+) create mode 100644 bridges/Les400Culs.php diff --git a/bridges/Les400Culs.php b/bridges/Les400Culs.php new file mode 100644 index 00000000..40e1f69b --- /dev/null +++ b/bridges/Les400Culs.php @@ -0,0 +1,98 @@ +getURI()) or $this->returnError('Could not request '.$this->getURI(), 404); + + foreach($html->find('#alpha-inner') as $articles) { + foreach($articles->find('div.entry') as $article) { + $header = $article->find('h3.entry-header a', 0); + $content = $article->find('div.entry-content', 0); + + + $item = new Item(); + $item->title = trim($header->innertext); + $item->uri = $header->href; + $item->name = "Agnès Girard"; + // date is stored outside this node ! + $dateHeader = $article->prev_sibling(); + // http://stackoverflow.com/a/6239199/15619 (strtotime is typical amercian bullshit) + $item->timestamp = DateTime::createFromFormat('d/m/Y', $dateHeader->innertext)->getTimestamp(); + + + $linkForMore = $content->find('p.entry-more-link a',0); + if($linkForMore==null) { + $item->content = $content->innertext; + } else { + $pageAddress = $linkForMore->href; + $articlePage = str_get_html($this->get_cached($linkForMore->href)); + if($articlePage==null) { + $item->content = $content->innertext."\n

".$linkForMore->outertext."

"; + } else { + // TODO use some caching there ! + $fullContent = $articlePage->find('div.entry-content', 0); + $item->content = $fullContent->innertext; + } + } + $this->items[] = $item; + } + } + } + + public function getName(){ + return 'Les 400 Culs'; + } + + public function getURI(){ + return SEXE; + } + + public function getCacheDuration(){ + return 7200; // 2h hours + } + public function getDescription(){ + return "La planète sexe, vue et racontée par Agnès Giard. Et par rss-bridge"; + } + + /** + * Maintain locally cached versions of pages to download to avoid multiple doiwnloads. + * A file name is generated by replacing all "/" by "_", and the file is saved below this bridge cache + * @param url url to cache + * @return content of file as string + */ + public function get_cached($url) { + $simplified_url = str_replace(["http://", "https://", "?", "&"], ["", "", "/", "/"], $url); + $filename = __DIR__ . '/../cache/'."pages/".$simplified_url; + if (substr($filename, -1) == '/') { + $filename = $filename."index.html"; + } + if(!file_exists($filename)) { + error_log("we have no local copy of ".$url." Downloading !"); + $dir = substr($filename, 0, strrpos($filename, '/')); + if(!is_dir($dir)) { + mkdir($dir, 0777, true); + } + $this->download_remote($url, $filename); + } + return file_get_contents($filename); + } + + public function download_remote($url , $save_path) { + $f = fopen( $save_path , 'w+'); + $handle = fopen($url , "rb"); + while (!feof($handle)) { + $contents = fread($handle, 8192); + fwrite($f , $contents); + } + fclose($handle); + fclose($f); + } + +} From ef0ce7d6691fc4e9d1a180df8f15d7e6b92767b1 Mon Sep 17 00:00:00 2001 From: Nicolas Delsaux Date: Thu, 20 Feb 2014 08:43:55 +0100 Subject: [PATCH 09/12] Strangely, those bridges didn't seems to work exactly as expected --- bridges/Dilbert.php | 4 ++-- bridges/LesJoiesDuCode.php | 4 +--- 2 files changed, 3 insertions(+), 5 deletions(-) diff --git a/bridges/Dilbert.php b/bridges/Dilbert.php index 893747b6..cf741a31 100644 --- a/bridges/Dilbert.php +++ b/bridges/Dilbert.php @@ -5,7 +5,7 @@ * @description The Unofficial Dilbert Daily Comic Strip RSS Feed via rss-bridge * @update 16/10/2013 */ -class DilbertBridge extends BridgeAbstract{ +class Dilbert extends BridgeAbstract{ public function collectData(array $param){ $html = file_get_html('http://dilbert.com/strips/') or $this->returnError('Could not request Dilbert.', 404); @@ -40,4 +40,4 @@ class DilbertBridge extends BridgeAbstract{ return 14400; // 4 hours } } -?> + diff --git a/bridges/LesJoiesDuCode.php b/bridges/LesJoiesDuCode.php index a14d9f6d..c5a74b8e 100644 --- a/bridges/LesJoiesDuCode.php +++ b/bridges/LesJoiesDuCode.php @@ -5,7 +5,7 @@ * @description LesJoiesDuCode via rss-bridge * @update 30/01/2014 */ -class LesJoiesDuCodeBridge extends BridgeAbstract{ +class LesJoiesDuCode extends BridgeAbstract{ public function collectData(array $param){ $html = file_get_html('http://lesjoiesducode.fr/') or $this->returnError('Could not request LesJoiesDuCode.', 404); @@ -53,5 +53,3 @@ class LesJoiesDuCodeBridge extends BridgeAbstract{ return "Les Joies Du Code via rss-bridge"; } } -?> - From 62a5265433ecfdba17d71c3e14b02a1ec551875e Mon Sep 17 00:00:00 2001 From: Nicolas Delsaux Date: Thu, 20 Feb 2014 12:00:50 +0100 Subject: [PATCH 10/12] updated description --- bridges/Les400Culs.php | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/bridges/Les400Culs.php b/bridges/Les400Culs.php index 40e1f69b..aea233bb 100644 --- a/bridges/Les400Culs.php +++ b/bridges/Les400Culs.php @@ -1,9 +1,9 @@ Date: Mon, 3 Mar 2014 11:40:29 +0100 Subject: [PATCH 11/12] Revert "updated description" This reverts commit 62a5265433ecfdba17d71c3e14b02a1ec551875e. --- bridges/Les400Culs.php | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/bridges/Les400Culs.php b/bridges/Les400Culs.php index aea233bb..40e1f69b 100644 --- a/bridges/Les400Culs.php +++ b/bridges/Les400Culs.php @@ -1,9 +1,9 @@ Date: Mon, 3 Mar 2014 11:40:49 +0100 Subject: [PATCH 12/12] Revert "added a bridge for one Liberation blog that could easily be extended for others" This reverts commit 79e4e9fdea5c56ba91970cb8f0c763de8a3adaeb. --- bridges/Les400Culs.php | 98 ------------------------------------------ 1 file changed, 98 deletions(-) delete mode 100644 bridges/Les400Culs.php diff --git a/bridges/Les400Culs.php b/bridges/Les400Culs.php deleted file mode 100644 index 40e1f69b..00000000 --- a/bridges/Les400Culs.php +++ /dev/null @@ -1,98 +0,0 @@ -getURI()) or $this->returnError('Could not request '.$this->getURI(), 404); - - foreach($html->find('#alpha-inner') as $articles) { - foreach($articles->find('div.entry') as $article) { - $header = $article->find('h3.entry-header a', 0); - $content = $article->find('div.entry-content', 0); - - - $item = new Item(); - $item->title = trim($header->innertext); - $item->uri = $header->href; - $item->name = "Agnès Girard"; - // date is stored outside this node ! - $dateHeader = $article->prev_sibling(); - // http://stackoverflow.com/a/6239199/15619 (strtotime is typical amercian bullshit) - $item->timestamp = DateTime::createFromFormat('d/m/Y', $dateHeader->innertext)->getTimestamp(); - - - $linkForMore = $content->find('p.entry-more-link a',0); - if($linkForMore==null) { - $item->content = $content->innertext; - } else { - $pageAddress = $linkForMore->href; - $articlePage = str_get_html($this->get_cached($linkForMore->href)); - if($articlePage==null) { - $item->content = $content->innertext."\n

".$linkForMore->outertext."

"; - } else { - // TODO use some caching there ! - $fullContent = $articlePage->find('div.entry-content', 0); - $item->content = $fullContent->innertext; - } - } - $this->items[] = $item; - } - } - } - - public function getName(){ - return 'Les 400 Culs'; - } - - public function getURI(){ - return SEXE; - } - - public function getCacheDuration(){ - return 7200; // 2h hours - } - public function getDescription(){ - return "La planète sexe, vue et racontée par Agnès Giard. Et par rss-bridge"; - } - - /** - * Maintain locally cached versions of pages to download to avoid multiple doiwnloads. - * A file name is generated by replacing all "/" by "_", and the file is saved below this bridge cache - * @param url url to cache - * @return content of file as string - */ - public function get_cached($url) { - $simplified_url = str_replace(["http://", "https://", "?", "&"], ["", "", "/", "/"], $url); - $filename = __DIR__ . '/../cache/'."pages/".$simplified_url; - if (substr($filename, -1) == '/') { - $filename = $filename."index.html"; - } - if(!file_exists($filename)) { - error_log("we have no local copy of ".$url." Downloading !"); - $dir = substr($filename, 0, strrpos($filename, '/')); - if(!is_dir($dir)) { - mkdir($dir, 0777, true); - } - $this->download_remote($url, $filename); - } - return file_get_contents($filename); - } - - public function download_remote($url , $save_path) { - $f = fopen( $save_path , 'w+'); - $handle = fopen($url , "rb"); - while (!feof($handle)) { - $contents = fread($handle, 8192); - fwrite($f , $contents); - } - fclose($handle); - fclose($f); - } - -}