Extend support for WordPress.com

Refactor the code to make it more readable.
Change parameters to be more flexible.
Add crawling through articles to get the full content of each one.
Add a content-cleaning method to remove scripts (bye bye Google tracking).
This commit is contained in:
Alexis Degrugillier 2014-04-25 19:36:33 -04:00
parent 077f0e3fb1
commit cdca40c2bb

View File

@ -5,62 +5,75 @@
* *
* @name Wordpress Bridge * @name Wordpress Bridge
* @description Returns the newest articles of a blog hosted on wordpress * @description Returns the newest articles of a blog hosted on wordpress
* @use1(s="subdomain", f="folder") * @use1(url="blog URL", name="blog name")
*/ */
class WordpressBridge extends BridgeAbstract{ class WordpressBridge extends BridgeAbstract{
private $subdomain; private $url;
private $folder; private $name;
public function collectData(array $param){ public function collectData(array $param){
$this->processParams($param); $this->processParams($param);
if (!$this->hasSubdomain()) { if (!$this->hasUrl()) {
$this->returnError('You must specify a subdomain', 400); $this->returnError('You must specify a URL', 400);
} }
$html = file_get_html($this->getSiteURI()) or $this->returnError("Could not request {$this->getSiteURI()}.", 404); $html = file_get_html($this->url) or $this->returnError("Could not request {$this->url}.", 404);
foreach($html->find('.post') as $article) { foreach($html->find('.post') as $article) {
$uri = $article->find('a',0)->href;
$this->items[] = $this->getDetails($uri);
}
}
private function getDetails($uri) {
$html = file_get_html($uri) or exit;
$item = new \Item(); $item = new \Item();
$uri = $article->find('a[rel=bookmark]',0)->href; $article = $html->find('.post', 0);
$item->uri = $uri; $item->uri = $uri;
$item->title = $article->find('h2',0)->innertext; $item->title = $article->find('h1',0)->innertext;
$item->content = $article->find('.entry',0)->innertext; $item->content = $this->clearContent($article->find('.entry-content,.entry',0)->innertext);
$item->timestamp = $this->getDate($uri);
return $item;
}
private function clearContent($content) {
$content = preg_replace('/<script.*\/script>/', '', $content);
return $content;
}
private function getDate($uri) {
preg_match('/\d{4}\/\d{2}\/\d{2}/', $uri, $matches); preg_match('/\d{4}\/\d{2}\/\d{2}/', $uri, $matches);
$date = new \DateTime($matches[0]); $date = new \DateTime($matches[0]);
$item->timestamp = $date->format('U'); return $date->format('U');
$this->items[] = $item;
}
} }
public function getName(){ public function getName(){
return 'Wordpress.com Bridge'; return "{$this->name} - Wordpress.com Bridge";
} }
public function getURI(){ public function getURI(){
return 'http://%s.wordpress.com/%s'; return $this->url;
} }
public function getCacheDuration(){ public function getCacheDuration(){
return 21600; // 6 hours return 21600; // 6 hours
} }
private function getSiteURI(){ private function hasUrl(){
return sprintf($this->getURI(), $this->subdomain, $this->folder); if (empty($this->url)){
}
private function hasSubdomain(){
if (empty($this->subdomain)){
return false; return false;
} }
return true; return true;
} }
private function processParams($param){ private function processParams($param){
$this->subdomain = $param['s']; $this->url = $param['url'];
$this->folder = $param['f']; $this->name = $param['name'];
} }
} }