This commit is contained in:
Nicolas Delsaux 2014-05-26 15:21:02 +00:00
commit 9ba525092d
7 changed files with 388 additions and 5 deletions

43
bridges/Dilbert.php Normal file
View File

@ -0,0 +1,43 @@
<?php
/**
*
* @name Dilbert Daily Strip
* @description The Unofficial Dilbert Daily Comic Strip RSS Feed via rss-bridge
* @update 16/10/2013
*/
class Dilbert extends BridgeAbstract{

    /**
     * Scrapes the dilbert.com strip listing and appends one Item per strip to $this->items.
     *
     * @param array $param request parameters (not read by this bridge)
     */
    public function collectData(array $param){
        $html = file_get_html('http://dilbert.com/strips/') or $this->returnError('Could not request Dilbert.', 404);

        foreach($html->find('div.STR_Image') as $element) {
            $link = $element->find('a', 0);
            // A strip container without a link cannot yield a URI or a date:
            // skip it instead of dereferencing null and aborting the whole feed.
            if(!is_object($link)) {
                continue;
            }
            $href = $link->href;

            $item = new Item();
            $item->uri = 'http://dilbert.com' . $href;

            // Turn site-relative src/href attributes into absolute ones so the
            // embedded markup still resolves once it is displayed outside dilbert.com.
            $item->content = str_replace(
                array('src="/', 'href="/'),
                array('src="http://dilbert.com/', 'href="http://dilbert.com/'),
                $element->innertext
            );

            // Take the 10 characters following the last "/" located at least
            // 10 characters before the end of the href (presumably a YYYY-MM-DD date).
            $time = strtotime(substr($href, (strrpos($href, "/", -10) + 1), 10));
            $item->title = date("d/m/Y", $time);
            $item->timestamp = $time;

            $this->items[] = $item;
        }
    }

    /** @return string display name of the bridge */
    public function getName(){
        return 'Dilbert';
    }

    /** @return string address of the source site */
    public function getURI(){
        return 'http://dilbert.com';
    }

    /** @return string short description of the feed */
    public function getDescription(){
        return 'Dilbert via rss-bridge';
    }

    /** @return int cache lifetime in seconds */
    public function getCacheDuration(){
        return 14400; // 4 hours
    }
}

69
bridges/Freenews.php Normal file
View File

@ -0,0 +1,69 @@
<?php
/**
*
* @name FreeNews
* @description Un site d'actualité pour les freenautes (mais ne parlant pas que de la freebox). Ne rentrez pas d'id si vous voulez accéder aux actualités générales.
* @update 26/03/2014
* @use1(id="Id de la rubrique (sans le '-')")
*/
// Root URL of the Freenews site; also used as prefix for article links and image sources.
define('FREENEWS', 'http://www.freenews.fr/');
// Listing page used when no rubric id is supplied.
define('NEWS', FREENEWS.'spip.php?page=news');
// Rubric listing URL prefix; the rubric id is appended right after the trailing '-'.
define('RUBRIQUE', FREENEWS.'spip.php?page=rubrique&id_rubrique=-');
class FreeNews extends HttpCachingBridgeAbstract{
    // Listing page to scrape; switched to a rubric page when an id is supplied.
    private $uri = NEWS;
    // Feed name; replaced by the <title> of the scraped page when available.
    private $name = 'Freenews';

    /**
     * Loads the listing page (general news, or the rubric given by $param['id'])
     * and turns each news line into an Item.
     *
     * @param array $param request parameters; the optional 'id' entry selects a rubric
     */
    public function collectData(array $param){
        if (!empty($param['id'])) {
            $this->uri = RUBRIQUE.$param['id'];
        }
        $html = file_get_html($this->getURI()) or $this->returnError('Could not request '.$this->getURI(), 404);

        // customize name, keeping the default when the page has no <title>
        $pageTitle = $html->find('title', 0);
        if (is_object($pageTitle)) {
            $this->name = $pageTitle->innertext;
        }

        foreach($html->find('.news_line') as $newsLines) {
            $this->parseLine($newsLines);
        }
    }

    /**
     * Builds one Item from the first link found inside the spans of a news line.
     * The linked article is fetched through the page cache to obtain content,
     * author and publication date.
     *
     * @param object $newsLines DOM node of one ".news_line" element
     */
    public function parseLine($newsLines) {
        foreach($newsLines->find('span') as $newsSpan) {
            foreach($newsSpan->find('a') as $newsLink) {
                $item = new Item();
                $item->title = trim($newsLink->title);
                $item->uri = FREENEWS.$newsLink->href;

                // now load that uri from cache; when the article cannot be loaded
                // or parsed, the item is still emitted with its title and uri
                $articlePage = str_get_html($this->get_cached($item->uri));
                if (is_object($articlePage)) {
                    $this->fillFromArticle($item, $articlePage);
                }

                $this->items[] = $item;
                // return after first link, as there are hidden treasures in those pages
                return;
            }
        }
    }

    /**
     * Copies content, author and timestamp from an article page into $item.
     * Every piece is optional: a missing node leaves the matching field unset
     * instead of triggering a fatal error on a null dereference.
     *
     * @param object $item        item being populated
     * @param object $articlePage parsed article page
     */
    private function fillFromArticle($item, $articlePage) {
        $content = $articlePage->find('.chapo', 0);
        if (is_object($content)) {
            // image sources are prefixed with the site root before the markup is read
            foreach($content->find('img') as $image) {
                $image->src = FREENEWS.$image->src;
            }
            $item->content = $content->innertext;
        }

        $redaction = $articlePage->find('.redac', 0);
        if (!is_object($redaction)) {
            return;
        }

        // the second link of the ".redac" block holds the author
        $auteur = $redaction->find('a', 1);
        if (is_object($auteur)) {
            $item->name = $auteur->innertext;
        }

        // format should parse 2014-03-25T16:21:20Z. But, according to http://stackoverflow.com/a/10478469, it is not that simple
        // (the trailing "+" lets createFromFormat accept whatever follows the seconds)
        $date = DateTime::createFromFormat('Y-m-d\TH:i:s+', $redaction->title);
        if ($date !== false) {
            $item->timestamp = $date->getTimestamp();
        }
    }

    /** @return string feed name (page title once collectData() has run) */
    public function getName(){
        return $this->name;
    }

    /** @return string address of the page being scraped */
    public function getURI(){
        return $this->uri;
    }

    /** @return int cache lifetime in seconds */
    public function getCacheDuration(){
        return 3600; // 1 hour
    }

    /** @return string short description of the feed */
    public function getDescription(){
        return "Un site d'actualité pour les freenautes (mais ne parlant pas que de la freebox). par rss-bridge";
    }
}

63
bridges/Les400Culs.php Normal file
View File

@ -0,0 +1,63 @@
<?php
/**
*
* @name Les 400 Culs
* @description La planète sexe vue par Agnès Girard via rss-bridge
* @update 20/02/2014
*/
// Base URL of the blog scraped by the bridge declared below (returned by its getURI()).
define("SEXE", "http://sexes.blogs.liberation.fr");
class Les400Culs extends HttpCachingBridgeAbstract{

    /**
     * Scrapes the blog index and appends one Item per entry to $this->items.
     * Entries that only show an excerpt are completed with the full article,
     * fetched through the page cache.
     *
     * @param array $param request parameters (not read by this bridge)
     */
    public function collectData(array $param){
        $html = file_get_html($this->getURI()) or $this->returnError('Could not request '.$this->getURI(), 404);

        foreach($html->find('#alpha-inner') as $articles) {
            foreach($articles->find('div.entry') as $article) {
                $header = $article->find('h3.entry-header a', 0);
                $content = $article->find('div.entry-content', 0);
                // an entry without a title link or a body cannot be turned into an item
                if(!is_object($header) || !is_object($content)) {
                    continue;
                }

                $item = new Item();
                $item->title = trim($header->innertext);
                $item->uri = $header->href;
                // spelling aligned with getDescription(): the author is Agnès Giard
                $item->name = "Agnès Giard";

                // date is stored outside this node !
                $dateHeader = $article->prev_sibling();
                if(is_object($dateHeader)) {
                    // http://stackoverflow.com/a/6239199/15619 (strtotime would read d/m/Y as m/d/Y)
                    // The leading "!" resets the unparsed fields, so the timestamp is
                    // midnight of that day rather than the time the feed was generated.
                    $date = DateTime::createFromFormat('!d/m/Y', trim($dateHeader->innertext));
                    if($date !== false) {
                        $item->timestamp = $date->getTimestamp();
                    }
                }

                $item->content = $this->resolveContent($content);
                $this->items[] = $item;
            }
        }
    }

    /**
     * Returns the best available markup for an entry: the excerpt itself when
     * there is no "read more" link, the full article when it can be loaded, and
     * otherwise the excerpt followed by the "read more" link.
     *
     * @param object $content DOM node of the entry's "div.entry-content"
     * @return string HTML content for the item
     */
    private function resolveContent($content) {
        $linkForMore = $content->find('p.entry-more-link a', 0);
        if(!is_object($linkForMore)) {
            return $content->innertext;
        }

        $articlePage = str_get_html($this->get_cached($linkForMore->href));
        $fullContent = is_object($articlePage) ? $articlePage->find('div.entry-content', 0) : null;
        if(!is_object($fullContent)) {
            return $content->innertext."\n<p>".$linkForMore->outertext."</p>";
        }
        return $fullContent->innertext;
    }

    /** @return string display name of the bridge */
    public function getName(){
        return 'Les 400 Culs';
    }

    /** @return string address of the source site */
    public function getURI(){
        return SEXE;
    }

    /** @return int cache lifetime in seconds */
    public function getCacheDuration(){
        return 7200; // 2 hours
    }

    /** @return string short description of the feed */
    public function getDescription(){
        return "La planète sexe, vue et racontée par Agnès Giard. Et par rss-bridge";
    }
}

View File

@ -0,0 +1,55 @@
<?php
/**
*
* @name Les Joies Du Code
* @description LesJoiesDuCode via rss-bridge
* @update 30/01/2014
*/
class LesJoiesDuCode extends BridgeAbstract{

    /**
     * Scrapes the home page of lesjoiesducode.fr and appends one Item per post
     * to $this->items.
     *
     * @param array $param request parameters (not read by this bridge)
     */
    public function collectData(array $param){
        $html = file_get_html('http://lesjoiesducode.fr/') or $this->returnError('Could not request LesJoiesDuCode.', 404);

        foreach($html->find('div.post') as $element) {
            $titleLink = $element->find('h3 a', 0);
            $body = $element->find('div.bodytype', 0);
            // a post without a title link or a body cannot be turned into an item
            if(!is_object($titleLink) || !is_object($body)) {
                continue;
            }

            $item = new Item();
            $item->uri = $titleLink->href;
            $item->title = trim($titleLink->innertext);
            // plain assignment: the item is new, there is nothing to append to
            $item->content = trim($body->innertext);

            // the author is whatever follows "by" in the ".c1 em" node,
            // minus a closing "*/" marker; the node itself is optional
            $credit = $body->find('.c1 em', 0);
            if(is_object($credit)) {
                $pos = strpos($credit->innertext, "by");
                // strict test so that "by" at offset 0 is still recognised
                if($pos !== false) {
                    $item->name = trim(str_replace("*/", "", substr($credit->innertext, ($pos + 2))));
                }
            }

            $this->items[] = $item;
        }
    }

    /** @return string display name of the bridge */
    public function getName(){
        return 'Les Joies Du Code';
    }

    /** @return string address of the source site */
    public function getURI(){
        return 'http://lesjoiesducode.fr/';
    }

    /** @return int cache lifetime in seconds */
    public function getCacheDuration(){
        return 7200; // 2 hours
    }

    /** @return string short description of the feed */
    public function getDescription(){
        return "Les Joies Du Code via rss-bridge";
    }
}

87
bridges/Sexactu.php Normal file
View File

@ -0,0 +1,87 @@
<?php
/**
*
* @name Sexactu
* @description Sexactu via rss-bridge
* @update 04/02/2014
*/
// Root URL of the GQ Magazine site; prefixed to article links, image sources and the bridge URI.
define("GQ", "http://www.gqmagazine.fr");
class Sexactu extends BridgeAbstract{

    /**
     * Scrapes the Sexactu listing and appends one Item per article to $this->items.
     *
     * @param array $param request parameters (not read by this bridge)
     */
    public function collectData(array $param){
        // French month names and their English counterparts. Both lists must stay
        // index-aligned (12 entries each): str_replace() pairs them by position.
        $find = array('janvier', 'février', 'mars', 'avril', 'mai', 'juin', 'juillet', 'août', 'septembre', 'octobre', 'novembre', 'décembre');
        $replace = array('January', 'February', 'March', 'April', 'May', 'June', 'July', 'August', 'September', 'October', 'November', 'December');

        $html = file_get_html($this->getURI()) or $this->returnError('Could not request '.$this->getURI(), 404);

        foreach($html->find('.content-holder') as $contentHolder) {
            // only use first list as second one only contains pages numbers
            $articles = $contentHolder->find('ul', 0);
            if(!is_object($articles)) {
                continue;
            }

            foreach($articles->find('li') as $element) {
                // if you ask about that is_object, there seems to be a bug in simple html dom
                // see stackoverflow for more details : http://stackoverflow.com/a/10828479/15619
                if(!is_object($element)) {
                    continue;
                }

                // various metadata; list entries without a complete title block are skipped
                $titleBlock = $element->find('.title-holder', 0);
                if(!is_object($titleBlock)) {
                    continue;
                }
                $titleDetails = $titleBlock->find('.article-title', 0);
                $titleHeading = is_object($titleDetails) ? $titleDetails->find('h2', 0) : null;
                $titleData = is_object($titleHeading) ? $titleHeading->find('a', 0) : null;
                if(!is_object($titleData)) {
                    continue;
                }

                $item = new Item();
                $item->title = $this->correctCase(trim($titleData->innertext));
                $item->uri = GQ.$titleData->href;
                $item->name = "Maïa Mazaurette";

                // Fugly date parsing due to the fact my DNS-323 doesn't support php intl extension
                $titleTimestamp = $titleDetails->find('h4', 0);
                if(is_object($titleTimestamp)) {
                    $dateText = $titleTimestamp->innertext;
                    // keep what follows the first comma; without a comma use the whole text
                    $commaPos = strpos($dateText, ',');
                    if($commaPos !== false) {
                        $dateText = substr($dateText, $commaPos + 1);
                    }
                    // multibyte-aware lowercasing so accented month names ("Février", "Août") still match
                    $dateText = str_replace($find, $replace, mb_strtolower($dateText, "UTF-8"));
                    $date = strtotime($dateText);
                    if($date !== false) {
                        $item->timestamp = $date;
                    }
                }

                $elementText = $element->find('.text-container', 0);
                if(is_object($elementText)) {
                    // don't forget to replace images server url with gq one
                    foreach($elementText->find('img') as $image) {
                        $image->src = GQ.$image->src;
                    }
                    $item->content = $elementText->innertext;
                }

                $this->items[] = $item;
            }
        }
    }

    /** @return string display name of the bridge */
    public function getName(){
        return 'Sexactu';
    }

    /** @return string address of the scraped listing */
    public function getURI(){
        return GQ.'/sexactu';
    }

    /** @return int cache lifetime in seconds */
    public function getCacheDuration(){
        return 7200; // 2 hours
    }

    /** @return string short description of the feed */
    public function getDescription(){
        return "Sexactu via rss-bridge";
    }

    /**
     * Converts a string to sentence case: everything is lowercased, the text is
     * split on ".", each piece is trimmed and gets an uppercase first character,
     * and the pieces are joined back with ". ".
     *
     * @param string $str UTF-8 text to normalise
     * @return string the text in sentence case
     */
    public function correctCase($str) {
        $sentences = explode('.', mb_strtolower($str, "UTF-8"));
        $str = "";
        $sep = "";
        foreach ($sentences as $sentence) {
            $sentence = trim($sentence);
            // upper case first char; done with mb_* because ucfirst() works on
            // bytes and leaves an accented first letter untouched
            $first = mb_substr($sentence, 0, 1, "UTF-8");
            $rest = mb_substr($sentence, 1, mb_strlen($sentence, "UTF-8"), "UTF-8");
            // append sentence to output
            $str = $str.$sep.mb_strtoupper($first, "UTF-8").$rest;
            $sep = ". ";
        }
        return $str;
    }
}

View File

@ -85,9 +85,11 @@ try{
// Data retrieval // Data retrieval
$bridge = Bridge::create($bridge); $bridge = Bridge::create($bridge);
$bridge if(isset($_REQUEST["disable_cache"])) {
->setCache($cache) // Comment this lign for avoid cache use } else {
->setDatas($_REQUEST); $bridge->setCache($cache); // just add disable cache to your query to disable caching
}
$bridge->setDatas($_REQUEST);
// Data transformation // Data transformation
$format = Format::create($format); $format = Format::create($format);

View File

@ -73,6 +73,70 @@ abstract class BridgeAbstract implements BridgeInterface{
} }
} }
/**
 * Extension of BridgeAbstract allowing caching of files downloaded over http.
 * This is especially useful for sites from Gawker or Liberation networks, which allow page excerpts to be viewed together on index, while full pages have to be downloaded
 * separately.
 * This class mainly provides a get_cached method which will download the file from its remote location when no local copy exists.
 * TODO allow file cache invalidation by removing files/directories which have not been touched since ... a long time
 * After all, rss-bridge is not respaw, isn't it ?
 */
abstract class HttpCachingBridgeAbstract extends BridgeAbstract {

    /**
     * Maintain locally cached versions of pages to download to avoid multiple downloads.
     * The file name is derived from the url: the scheme is dropped and "?", "&" and "="
     * become "/", so each url maps to a path below the page cache directory.
     *
     * @param string $url url to cache
     * @return string|false content of the file, or false when it could neither be read nor downloaded
     */
    public function get_cached($url) {
        $simplified_url = str_replace(["http://", "https://", "?", "&", "="], ["", "", "/", "/", "/"], $url);
        // urls come from scraped pages: neutralise ".." so a crafted link cannot
        // make the cache read or write outside its own directory
        $simplified_url = str_replace('..', '_', $simplified_url);

        // TODO build this from the variable given to Cache
        $pageCacheDir = __DIR__ . '/../cache/'."pages/";
        $filename = $pageCacheDir.$simplified_url;
        if (substr($filename, -1) == '/') {
            $filename = $filename."index.html";
        }

        if(file_exists($filename)) {
            // Touch the file and every directory up to (not including) the cache root,
            // so that a later clean-up can spot entries that are no longer used.
            // dirname() never returns a trailing slash, hence the rtrim() on the root.
            $cacheRoot = rtrim($pageCacheDir, '/');
            $currentPath = $filename;
            while($currentPath !== $cacheRoot) {
                touch($currentPath);
                $parent = dirname($currentPath);
                if($parent === $currentPath) {
                    // reached the top of the path without meeting the cache root: stop
                    break;
                }
                $currentPath = $parent;
            }
        } else {
            $dir = dirname($filename);
            if(!is_dir($dir)) {
                mkdir($dir, 0777, true);
            }
            if(!$this->download_remote($url, $filename)) {
                // nothing was written: report the failure instead of reading a missing file
                return false;
            }
        }
        return file_get_contents($filename);
    }

    /**
     * Downloads $url into the local file $save_path.
     * The remote stream is opened first, so a failed request does not leave an
     * empty file behind that would later be served as a valid cache entry.
     *
     * @param string $url       remote location to read
     * @param string $save_path local file to (over)write
     * @return bool true when the file has been written, false otherwise
     */
    public function download_remote($url , $save_path) {
        $handle = fopen($url , "rb");
        if($handle === false) {
            return false;
        }
        $f = fopen($save_path , 'w+');
        if($f === false) {
            fclose($handle);
            return false;
        }

        $copied = stream_copy_to_stream($handle, $f);

        fclose($handle);
        fclose($f);

        if($copied === false) {
            // do not keep a truncated copy in the cache
            unlink($save_path);
            return false;
        }
        return true;
    }

    /**
     * Writes $text to the PHP error log, prefixed with the file, line and function
     * found in the third frame of the current backtrace and with the class of this bridge.
     *
     * @param string $text message to log
     */
    public function message($text) {
        $backtrace = debug_backtrace(DEBUG_BACKTRACE_IGNORE_ARGS, 3);
        $calling = $backtrace[2];
        $message = $calling["file"].":".$calling["line"]
            ." class ".get_class($this)."->".$calling["function"]
            ." - ".$text;
        error_log($message);
    }
}
class Bridge{ class Bridge{
static protected $dirBridge; static protected $dirBridge;
@ -94,7 +158,7 @@ class Bridge{
$pathBridge = self::getDir() . $nameBridge . '.php'; $pathBridge = self::getDir() . $nameBridge . '.php';
if( !file_exists($pathBridge) ){ if( !file_exists($pathBridge) ){
throw new \Exception('The bridge you looking for does not exist.'); throw new \Exception('The bridge you looking for does not exist. It should be at path '.$pathBridge);
} }
require_once $pathBridge; require_once $pathBridge;