Extend support for wordpress.com

Refactor the code to make it more readable.
Change parameters to be more flexible.
Crawl each article page to retrieve its full content.
Add a content-cleaning method that removes scripts (bye bye Google tracking).
This commit is contained in:
Alexis Degrugillier 2014-04-25 19:36:33 -04:00
parent 077f0e3fb1
commit cdca40c2bb

View File

@ -5,62 +5,75 @@
*
* @name Wordpress Bridge
* @description Returns the newest articles of a blog hosted on wordpress
* @use1(s="subdomain", f="folder")
* @use1(url="blog URL", name="blog name")
*/
class WordpressBridge extends BridgeAbstract{

    /** @var string|null Blog URL to crawl, read from the "url" request parameter. */
    private $url;

    /** @var string|null Blog display name, read from the "name" request parameter. */
    private $name;

    /**
     * Builds one item per post linked from the blog's listing page.
     *
     * Every element matching ".post" on the listing page contributes the href
     * of its first anchor; that article page is then fetched and parsed by
     * getDetails() so the item carries the full article content.
     *
     * NOTE(review): returnError() and $this->items are not defined in this
     * class; they presumably come from BridgeAbstract, and returnError() is
     * presumably terminal (throws or exits) - confirm against the parent.
     *
     * @param array $param Request parameters; "url" is required, "name" is optional.
     */
    public function collectData(array $param){
        $this->processParams($param);

        if (!$this->hasUrl()) {
            $this->returnError('You must specify a URL', 400);
        }

        // The URL is user input handed to a fetch helper; only allow web URLs
        // so local paths and other stream wrappers are never requested.
        if (!$this->isHttpUrl($this->url)) {
            $this->returnError('The URL must start with http:// or https://', 400);
        }

        $html = file_get_html($this->url) or $this->returnError("Could not request {$this->url}.", 404);

        foreach($html->find('.post') as $article) {
            $link = $article->find('a', 0);

            // Skip posts with no usable absolute link instead of failing on a
            // null node or requesting something that is not a web page.
            if (!$link || !$this->isHttpUrl($link->href)) {
                continue;
            }

            $this->items[] = $this->getDetails($link->href);
        }
    }

    /**
     * Fetches a single article page and converts it into an item.
     *
     * The title is the first "h1" inside the first ".post" element, and the
     * content is the first ".entry-content" or ".entry" element inside it,
     * with scripts stripped. A missing title or content node yields an empty
     * string rather than a fatal error.
     *
     * @param string $uri Absolute URL of the article.
     * @return \Item
     */
    private function getDetails($uri) {
        $html = file_get_html($uri);
        if (!$html) {
            // Report the failure the same way collectData() does instead of
            // silently ending the script.
            $this->returnError("Could not request {$uri}.", 404);
        }

        $article = $html->find('.post', 0);
        if (!$article) {
            $this->returnError("Could not find an article at {$uri}.", 404);
        }

        $title = $article->find('h1', 0);
        $body = $article->find('.entry-content,.entry', 0);

        $item = new \Item();
        $item->uri = $uri;
        $item->title = $title ? $title->innertext : '';
        $item->content = $body ? $this->clearContent($body->innertext) : '';
        $item->timestamp = $this->getDate($uri);

        return $item;
    }

    /**
     * Removes every <script>...</script> element from an HTML fragment.
     *
     * The match is case-insensitive, spans line breaks and is non-greedy, so
     * each script element is removed on its own and the markup between two
     * script elements is kept.
     *
     * @param string $content HTML fragment.
     * @return string The fragment without script elements.
     */
    private function clearContent($content) {
        $cleaned = preg_replace('/<script\b[^>]*>.*?<\/script\s*>/is', '', $content);

        // preg_replace() returns null when the regex engine fails; keep the
        // article text rather than replacing it with null.
        return $cleaned === null ? $content : $cleaned;
    }

    /**
     * Derives a Unix timestamp from the "YYYY/MM/DD" segment of an article URL.
     *
     * Falls back to the current time when the URL carries no such segment or
     * the segment cannot be parsed as a date. The date is interpreted in
     * PHP's default timezone.
     *
     * @param string $uri Article URL.
     * @return string Unix timestamp, as a string of digits.
     */
    private function getDate($uri) {
        if (preg_match('/\d{4}\/\d{2}\/\d{2}/', $uri, $matches)) {
            try {
                $date = new \DateTime($matches[0]);
                return $date->format('U');
            } catch (\Exception $e) {
                // Digits that look like a date but are not one (e.g. month 13);
                // use the fallback below.
            }
        }

        return (string) time();
    }

    /**
     * Returns the bridge name, prefixed with the blog name when one was given.
     *
     * @return string
     */
    public function getName(){
        if (empty($this->name)) {
            return 'Wordpress.com Bridge';
        }

        return "{$this->name} - Wordpress.com Bridge";
    }

    /**
     * Returns the blog URL supplied in the request parameters.
     *
     * @return string|null Null until collectData() has processed the parameters.
     */
    public function getURI(){
        return $this->url;
    }

    /**
     * Returns how long collected data may be cached, in seconds.
     *
     * @return int
     */
    public function getCacheDuration(){
        return 21600; // 6 hours
    }

    /**
     * Tells whether a non-empty blog URL was supplied.
     *
     * @return bool
     */
    private function hasUrl(){
        return !empty($this->url);
    }

    /**
     * Tells whether a value is a string starting with http:// or https://.
     *
     * @param mixed $value Candidate URL.
     * @return bool
     */
    private function isHttpUrl($value){
        return is_string($value) && preg_match('#^https?://#i', $value) === 1;
    }

    /**
     * Copies the supported request parameters into the bridge's state.
     *
     * Missing keys are stored as null so later checks do not trip over
     * undefined indexes.
     *
     * @param array $param Request parameters ("url", "name").
     */
    private function processParams($param){
        $this->url = isset($param['url']) ? $param['url'] : null;
        $this->name = isset($param['name']) ? $param['name'] : null;
    }
}