diff --git a/bridges/CADBridge.php b/bridges/CADBridge.php
index 47ff165a..eb05fd16 100644
--- a/bridges/CADBridge.php
+++ b/bridges/CADBridge.php
@@ -1,12 +1,22 @@
collectExpandableDatas('http://cdn2.cad-comic.com/rss.xml');
+ }
+
+ protected function parseItem($newsItem){
+ $item = $this->parseRSS_2_0_Item($newsItem);
+ $item['content'] = $this->CADExtractContent($item['uri']);
+ return $item;
+ }
+
private function CADExtractContent($url) {
- $html3 = $this->getSimpleHTMLDOM($url);
+ $html3 = $this->get_cached($url);
// The request might fail due to missing https support or wrong URL
if($html3 == false)
@@ -32,33 +42,6 @@ class CADBridge extends BridgeAbstract{
return '';
}
- public function collectData(){
- function CADUrl($string) {
- $html2 = explode("\"", $string);
- $string = $html2[1];
- if (substr($string,0,4) != 'http')
- return 'notanurl';
- return $string;
- }
-
- $html = $this->getSimpleHTMLDOM('http://cdn2.cad-comic.com/rss.xml') or $this->returnServerError('Could not request CAD.');
- $limit = 0;
-
- foreach($html->find('item') as $element) {
- if($limit < 5) {
- $item = array();
- $item['title'] = $element->find('title', 0)->innertext;
- $item['uri'] = CADUrl($element->find('description', 0)->innertext);
- if ($item['uri'] != 'notanurl') {
- $item['timestamp'] = strtotime($element->find('pubDate', 0)->plaintext);
- $item['content'] = $this->CADExtractContent($item['uri']);
- $this->items[] = $item;
- $limit++;
- }
- }
- }
- }
-
public function getCacheDuration(){
return 3600*2; // 2 hours
}
diff --git a/bridges/CommonDreamsBridge.php b/bridges/CommonDreamsBridge.php
index 446a6df0..e621db41 100644
--- a/bridges/CommonDreamsBridge.php
+++ b/bridges/CommonDreamsBridge.php
@@ -1,39 +1,26 @@
collectExpandableDatas('http://www.commondreams.org/rss.xml');
+ }
+
+ protected function parseItem($newsItem){
+ $item = $this->parseRSS_2_0_Item($newsItem);
+ $item['content'] = $this->CommonDreamsExtractContent($item['uri']);
+ return $item;
+ }
+
private function CommonDreamsExtractContent($url) {
- $html3 = $this->getSimpleHTMLDOM($url);
+ $html3 = $this->get_cached($url);
$text = $html3->find('div[class=field--type-text-with-summary]', 0)->innertext;
$html3->clear();
unset ($html3);
return $text;
}
-
- public function collectData(){
-
- function CommonDreamsUrl($string) {
- $html2 = explode(" ", $string);
- $string = $html2[2] . "/node/" . $html2[0];
- return $string;
- }
-
- $html = $this->getSimpleHTMLDOM('http://www.commondreams.org/rss.xml') or $this->returnServerError('Could not request CommonDreams.');
- $limit = 0;
- foreach($html->find('item') as $element) {
- if($limit < 4) {
- $item = array();
- $item['title'] = $element->find('title', 0)->innertext;
- $item['uri'] = CommonDreamsUrl($element->find('guid', 0)->innertext);
- $item['timestamp'] = strtotime($element->find('pubDate', 0)->plaintext);
- $item['content'] = $this->CommonDreamsExtractContent($item['uri']);
- $this->items[] = $item;
- $limit++;
- }
- }
- }
}
diff --git a/bridges/DauphineLibereBridge.php b/bridges/DauphineLibereBridge.php
index 143a6c0a..d8e10ddb 100644
--- a/bridges/DauphineLibereBridge.php
+++ b/bridges/DauphineLibereBridge.php
@@ -1,10 +1,10 @@
array(
@@ -30,41 +30,31 @@ class DauphineLibereBridge extends BridgeAbstract {
)
));
- private function ExtractContent($url, $context) {
- $html2 = $this->getSimpleHTMLDOM($url);
- $text = $html2->find('div.column', 0)->innertext;
- $text = preg_replace('@@si', '', $text);
- return $text;
- }
+ public function collectData(){
+ $url = self::URI . 'rss';
- public function collectData(){
+ if (empty($this->getInput('u'))) {
+ $url = self::URI . $this->getInput('u') . '/rss';
+ }
- $context = stream_context_create($opts);
+ $this->collectExpandableDatas($url);
+ }
- if (empty($this->getInput('u'))) {
- $html = $this->getSimpleHTMLDOM(self::URI.$this->getInput('u').'/rss')
- or $this->returnServerError('Could not request DauphineLibere.');
- } else {
- $html = $this->getSimpleHTMLDOM(self::URI.'rss')
- or $this->returnServerError('Could not request DauphineLibere.');
- }
- $limit = 0;
+ protected function parseItem($newsItem){
+ $item = $this->parseRSS_2_0_Item($newsItem);
+ $item['content'] = $this->ExtractContent($item['uri']);
+ return $item;
+ }
- foreach($html->find('item') as $element) {
- if($limit < 10) {
- $item = array();
- $item['title'] = $element->find('title', 0)->innertext;
- $item['uri'] = $element->find('guid', 0)->plaintext;
- $item['timestamp'] = strtotime($element->find('pubDate', 0)->plaintext);
- $item['content'] = $this->ExtractContent($item['uri'], $context);
- $this->items[] = $item;
- $limit++;
- }
- }
- }
+ private function ExtractContent($url) {
+ $html2 = $this->getSimpleHTMLDOM($url);
+ $text = $html2->find('div.column', 0)->innertext;
+ $text = preg_replace('@@si', '', $text);
+ return $text;
+ }
- public function getCacheDuration(){
- return 3600*2; // 2 hours
- }
+ public function getCacheDuration(){
+ return 3600*2; // 2 hours
+ }
}
?>
diff --git a/bridges/DeveloppezDotComBridge.php b/bridges/DeveloppezDotComBridge.php
index 48e29741..52e52db1 100644
--- a/bridges/DeveloppezDotComBridge.php
+++ b/bridges/DeveloppezDotComBridge.php
@@ -1,11 +1,21 @@
collectExpandableDatas(self::URI . 'index/rss');
+ }
+
+ protected function parseItem($newsItem){
+ $item = $this->parseRSS_2_0_Item($newsItem);
+ $item['content'] = $this->DeveloppezDotComExtractContent($item['uri']);
+ return $item;
+ }
+
private function DeveloppezDotComStripCDATA($string) {
$string = str_replace('', '', $string);
@@ -32,31 +42,12 @@ class DeveloppezDotComBridge extends BridgeAbstract{
}
private function DeveloppezDotComExtractContent($url) {
- $articleHTMLContent = $this->getSimpleHTMLDOM($url);
+ $articleHTMLContent = $this->get_cached($url);
$text = $this->convert_smart_quotes($articleHTMLContent->find('div.content', 0)->innertext);
$text = utf8_encode($text);
return trim($text);
}
- public function collectData(){
- $rssFeed = $this->getSimpleHTMLDOM(self::URI.'index/rss')
- or $this->returnServerError('Could not request '.self::URI.'index/rss');
- $limit = 0;
-
- foreach($rssFeed->find('item') as $element) {
- if($limit < 10) {
- $item = array();
- $item['title'] = $this->DeveloppezDotComStripCDATA($element->find('title', 0)->innertext);
- $item['uri'] = $this->DeveloppezDotComStripCDATA($element->find('guid', 0)->plaintext);
- $item['timestamp'] = strtotime($element->find('pubDate', 0)->plaintext);
- $content = $this->DeveloppezDotComExtractContent($item['uri']);
- $item['content'] = strlen($content) ? $content : $element->description; //In case of it is a tutorial, we just keep the original description
- $this->items[] = $item;
- $limit++;
- }
- }
- }
-
public function getCacheDuration(){
return 1800; // 30min
}
diff --git a/bridges/FuturaSciencesBridge.php b/bridges/FuturaSciencesBridge.php
index e4c8471f..beff9c8a 100644
--- a/bridges/FuturaSciencesBridge.php
+++ b/bridges/FuturaSciencesBridge.php
@@ -1,5 +1,5 @@
getInput('feed') . '.xml';
+ $this->collectExpandableDatas($url);
+ }
- function StripCDATA($string) {
- $string = str_replace('', '', $string);
- return $string;
- }
+ protected function parseItem($newsItem){
+ $item = $this->parseRSS_2_0_Item($newsItem);
+ $item['uri'] = str_replace('#xtor=RSS-8', '', $item['uri']);
+ $article = $this->get_cached($item['uri'])
+ or $this->returnServerError('Could not request Futura-Sciences: ' . $item['uri']);
+ $item['content'] = $this->ExtractArticleContent($article);
+ $item['author'] = empty($this->ExtractAuthor($article)) ? $item['author'] : $this->ExtractAuthor($article);
+ return $item;
+ }
- function StripWithDelimiters($string, $start, $end) {
- while (strpos($string, $start) !== false) {
- $section_to_remove = substr($string, strpos($string, $start));
- $section_to_remove = substr($section_to_remove, 0, strpos($section_to_remove, $end) + strlen($end));
+ function StripWithDelimiters($string, $start, $end) {
+ while (strpos($string, $start) !== false) {
+ $section_to_remove = substr($string, strpos($string, $start));
+ $section_to_remove = substr($section_to_remove, 0, strpos($section_to_remove, $end) + strlen($end));
+ $string = str_replace($section_to_remove, '', $string);
+ } return $string;
+ }
+
+ function StripRecursiveHTMLSection($string, $tag_name, $tag_start) {
+ $open_tag = '<'.$tag_name;
+ $close_tag = ''.$tag_name.'>';
+ $close_tag_length = strlen($close_tag);
+ if (strpos($tag_start, $open_tag) === 0) {
+ while (strpos($string, $tag_start) !== false) {
+ $max_recursion = 100;
+ $section_to_remove = null;
+ $section_start = strpos($string, $tag_start);
+ $search_offset = $section_start;
+ do {
+ $max_recursion--;
+ $section_end = strpos($string, $close_tag, $search_offset);
+ $search_offset = $section_end + $close_tag_length;
+ $section_to_remove = substr($string, $section_start, $section_end - $section_start + $close_tag_length);
+ $open_tag_count = substr_count($section_to_remove, $open_tag);
+ $close_tag_count = substr_count($section_to_remove, $close_tag);
+ } while ($open_tag_count > $close_tag_count && $max_recursion > 0);
$string = str_replace($section_to_remove, '', $string);
- } return $string;
- }
-
- function StripRecursiveHTMLSection($string, $tag_name, $tag_start) {
- $open_tag = '<'.$tag_name;
- $close_tag = ''.$tag_name.'>';
- $close_tag_length = strlen($close_tag);
- if (strpos($tag_start, $open_tag) === 0) {
- while (strpos($string, $tag_start) !== false) {
- $max_recursion = 100;
- $section_to_remove = null;
- $section_start = strpos($string, $tag_start);
- $search_offset = $section_start;
- do {
- $max_recursion--;
- $section_end = strpos($string, $close_tag, $search_offset);
- $search_offset = $section_end + $close_tag_length;
- $section_to_remove = substr($string, $section_start, $section_end - $section_start + $close_tag_length);
- $open_tag_count = substr_count($section_to_remove, $open_tag);
- $close_tag_count = substr_count($section_to_remove, $close_tag);
- } while ($open_tag_count > $close_tag_count && $max_recursion > 0);
- $string = str_replace($section_to_remove, '', $string);
- }
- }
- return $string;
- }
-
- // Extracts the author from an article or element
- function ExtractAuthor($article, $element){
- $article_author = $article->find('span.author', 0);
- if($article_author){
- $authorname = trim(str_replace(', Futura-Sciences', '', $article_author->plaintext));
- if(empty($authorname)){
- $element_author = $element->find('author', 0);
- if($element_author)
- $authorname = StripCDATA($element_author->plaintext);
- else
- return '';
- }
- return $authorname;
- }
- return '';
- }
-
- $url = $this->getURI().'rss/'.$this->getInput('feed').'.xml';
-
- $html = $this->getSimpleHTMLDOM($url)
- or $this->returnServerError('Could not request Futura-Sciences: '.$url);
- $limit = 0;
-
- foreach($html->find('item') as $element) {
- if ($limit < 10) {
- $article_url = str_replace('#xtor=RSS-8', '', StripCDATA($element->find('guid', 0)->plaintext));
- $article = $this->getSimpleHTMLDOM($article_url) or $this->returnServerError('Could not request Futura-Sciences: '.$article_url);
- $contents = $article->find('div.content', 0)->innertext;
-
- foreach (array(
- '
'); - $contents = StripWithDelimiters($contents, 'fs:definition="', '"'); - $contents = StripWithDelimiters($contents, 'fs:xt:clicktype="', '"'); - $contents = StripWithDelimiters($contents, 'fs:xt:clickname="', '"'); - - $item = array(); - $item['author'] = ExtractAuthor($article, $element); - $item['uri'] = $article_url; - $item['title'] = StripCDATA($element->find('title', 0)->innertext); - $item['timestamp'] = strtotime(StripCDATA($element->find('pubDate', 0)->plaintext)); - $item['content'] = trim($contents); - $this->items[] = $item; - $limit++; } } + return $string; + } + function ExtractArticleContent($article){ + $contents = $article->find('div.content', 0)->innertext; + + foreach (array( + '
'); + $contents = $this->StripWithDelimiters($contents, 'fs:definition="', '"'); + $contents = $this->StripWithDelimiters($contents, 'fs:xt:clicktype="', '"'); + $contents = $this->StripWithDelimiters($contents, 'fs:xt:clickname="', '"'); + + return $contents; + } + + // Extracts the author from an article or element + function ExtractAuthor($article){ + $article_author = $article->find('span.author', 0); + if($article_author){ + return trim(str_replace(', Futura-Sciences', '', $article_author->plaintext)); + } + return ''; } } diff --git a/bridges/LeJournalDuGeekBridge.php b/bridges/LeJournalDuGeekBridge.php index e08f4193..dd0c444c 100644 --- a/bridges/LeJournalDuGeekBridge.php +++ b/bridges/LeJournalDuGeekBridge.php @@ -1,19 +1,23 @@ ', '', $string); - return $string; + public function collectData(){ + $this->collectExpandableDatas(self::URI . 'rss'); + } + + protected function parseItem($newsItem){ + $item = $this->parseRSS_2_0_Item($newsItem); + $item['content'] = $this->LeJournalDuGeekExtractContent($item['uri']); + return $item; } private function LeJournalDuGeekExtractContent($url) { - $articleHTMLContent = $this->getSimpleHTMLDOM($url); + $articleHTMLContent = $this->get_cached($url); $text = $articleHTMLContent->find('div.post-content', 0)->innertext; foreach($articleHTMLContent->find('a.more') as $element) { @@ -34,24 +38,6 @@ class LeJournalDuGeekBridge extends BridgeAbstract{ return $text; } - public function collectData(){ - $rssFeed = $this->getSimpleHTMLDOM(self::URI.'rss') - or $this->returnServerError('Could not request '.self::URI.'/rss'); - $limit = 0; - - foreach($rssFeed->find('item') as $element) { - if($limit < 5) { - $item = array(); - $item['title'] = $this->LeJournalDuGeekStripCDATA($element->find('title', 0)->innertext); - $item['uri'] = $this->LeJournalDuGeekStripCDATA($element->find('guid', 0)->plaintext); - $item['timestamp'] = strtotime($element->find('pubDate', 0)->plaintext); - $item['content'] = $this->LeJournalDuGeekExtractContent($item['uri']); - $this->items[] = $item; - $limit++; - } - } - } - public function getCacheDuration(){ return 1800; // 30min } diff --git a/bridges/LeMondeInformatiqueBridge.php b/bridges/LeMondeInformatiqueBridge.php index 8fd1daa3..e361ea80 100644 --- a/bridges/LeMondeInformatiqueBridge.php +++ b/bridges/LeMondeInformatiqueBridge.php @@ -1,60 +1,42 @@ collectExpandableDatas(self::URI . 'rss/rss.xml'); + } - function StripCDATA($string) { - $string = str_replace('', '', $string); - return $string; - } + protected function parseItem($newsItem){ + $item = $this->parseRSS_1_0_Item($newsItem); + $article_html = $this->get_cached($item['uri']) + or $this->returnServerError('Could not request LeMondeInformatique: ' . $item['uri']); + $item['content'] = $this->CleanArticle($article_html->find('div#article', 0)->innertext); + $item['title'] = $article_html->find('h1.cleanprint-title', 0)->plaintext; + return $item; + } - function StripWithDelimiters($string, $start, $end) { - while (strpos($string, $start) !== false) { - $section_to_remove = substr($string, strpos($string, $start)); - $section_to_remove = substr($section_to_remove, 0, strpos($section_to_remove, $end) + strlen($end)); - $string = str_replace($section_to_remove, '', $string); - } return $string; - } + function StripCDATA($string) { + $string = str_replace('', '', $string); + return $string; + } - function CleanArticle($article_html) { - $article_html = StripWithDelimiters($article_html, ''); + $article_html = $this->StripWithDelimiters($article_html, '
'.$html2->find('span.sub_title', 0)->innertext.'
' .''.$premium_article->innertext.'
'; return $text; } - - public function collectData(){ - $html = $this->getSimpleHTMLDOM(self::URI.'rss/news.xml') or $this->returnServerError('Could not request NextInpact.'); - $limit = 0; - - foreach($html->find('item') as $element) { - if($limit < 3) { - $item = array(); - $item['title'] = $this->StripCDATA($element->find('title', 0)->innertext); - $item['uri'] = $this->StripCDATA($element->find('guid', 0)->plaintext); - $item['author'] = $this->StripCDATA($element->find('creator', 0)->innertext); - $item['timestamp'] = strtotime($element->find('pubDate', 0)->plaintext); - $item['content'] = $this->ExtractContent($item['uri']); - $this->items[] = $item; - $limit++; - } - } - } } diff --git a/bridges/NextgovBridge.php b/bridges/NextgovBridge.php index ee4f2996..dee8c370 100644 --- a/bridges/NextgovBridge.php +++ b/bridges/NextgovBridge.php @@ -1,5 +1,5 @@ collectExpandableDatas(self::URI . 'rss/' . $this->getInput('category') . '/'); + } - function ExtractFromDelimiters($string, $start, $end) { - if (strpos($string, $start) !== false) { - $section_retrieved = substr($string, strpos($string, $start) + strlen($start)); - $section_retrieved = substr($section_retrieved, 0, strpos($section_retrieved, $end)); - return $section_retrieved; - } return false; - } + protected function parseItem($newsItem){ + $item = $this->parseRSS_2_0_Item($newsItem); - function StripWithDelimiters($string, $start, $end) { - while (strpos($string, $start) !== false) { - $section_to_remove = substr($string, strpos($string, $start)); - $section_to_remove = substr($section_to_remove, 0, strpos($section_to_remove, $end) + strlen($end)); - $string = str_replace($section_to_remove, '', $string); - } return $string; - } + $item['content'] = ''; - $category = $this->getInput('category'); - $url = $this->getURI().'rss/'.$category.'/'; - $html = $this->getSimpleHTMLDOM($url) or $this->returnServerError('Could not request Nextgov: '.$url); - $limit = 0; - - foreach ($html->find('item') as $element) { - if ($limit >= 10) { - break; + $namespaces = $newsItem->getNamespaces(true); + if(isset($namespaces['media'])){ + $media = $newsItem->children($namespaces['media']); + if(isset($media->content)){ + $attributes = $media->content->attributes(); + $item['content'] = ''.$article_subtitle.'
' + .trim($contents); } } diff --git a/bridges/NiceMatinBridge.php b/bridges/NiceMatinBridge.php index 3c189090..0f9d011a 100644 --- a/bridges/NiceMatinBridge.php +++ b/bridges/NiceMatinBridge.php @@ -1,13 +1,23 @@ collectExpandableDatas(self::URI . 'derniere-minute/rss'); + } + + protected function parseItem($newsItem){ + $item = $this->parseRSS_2_0_Item($newsItem); + $item['content'] = $this->NiceMatinExtractContent($item['uri']); + return $item; + } + private function NiceMatinExtractContent($url) { - $html = $this->getSimpleHTMLDOM($url); + $html = $this->get_cached($url); if(!$html) return 'Could not acquire content from url: ' . $url . '!'; @@ -19,29 +29,4 @@ class NiceMatinBridge extends BridgeAbstract{ $text = strip_tags($text, '