From 077f0e3fb181b27d27271efa2d419a75bc211c2d Mon Sep 17 00:00:00 2001 From: Alexis Degrugillier Date: Wed, 23 Apr 2014 21:04:43 -0400 Subject: [PATCH 1/3] Add a bridge to blogs hosted on wordpress.com --- bridges/WordPressBridge.php | 66 +++++++++++++++++++++++++++++++++++++ 1 file changed, 66 insertions(+) create mode 100644 bridges/WordPressBridge.php diff --git a/bridges/WordPressBridge.php b/bridges/WordPressBridge.php new file mode 100644 index 00000000..53d7df08 --- /dev/null +++ b/bridges/WordPressBridge.php @@ -0,0 +1,66 @@ +processParams($param); + + if (!$this->hasSubdomain()) { + $this->returnError('You must specify a subdomain', 400); + } + + $html = file_get_html($this->getSiteURI()) or $this->returnError("Could not request {$this->getSiteURI()}.", 404); + + foreach($html->find('.post') as $article) { + $item = new \Item(); + + $uri = $article->find('a[rel=bookmark]',0)->href; + $item->uri = $uri; + $item->title = $article->find('h2',0)->innertext; + $item->content = $article->find('.entry',0)->innertext; + preg_match('/\d{4}\/\d{2}\/\d{2}/', $uri, $matches); + $date = new \DateTime($matches[0]); + $item->timestamp = $date->format('U'); + $this->items[] = $item; + } + } + + public function getName(){ + return 'Wordpress.com Bridge'; + } + + public function getURI(){ + return 'http://%s.wordpress.com/%s'; + } + + public function getCacheDuration(){ + return 21600; // 6 hours + } + + private function getSiteURI(){ + return sprintf($this->getURI(), $this->subdomain, $this->folder); + } + + private function hasSubdomain(){ + if (empty($this->subdomain)){ + return false; + } + return true; + } + + private function processParams($param){ + $this->subdomain = $param['s']; + $this->folder = $param['f']; + } + +} From cdca40c2bb663c1aab02eeea158cfa7807649726 Mon Sep 17 00:00:00 2001 From: Alexis Degrugillier Date: Fri, 25 Apr 2014 19:36:33 -0400 Subject: [PATCH 2/3] Extend support for wordpress.com Refactor the code to make it more readable. 
Change parameters to be more flexible. Add crawling through articles to get their full content. Add a cleaning method on content to remove scripts (bye bye google tracking) --- bridges/WordPressBridge.php | 65 ++++++++++++++++++++++--------------- 1 file changed, 39 insertions(+), 26 deletions(-) diff --git a/bridges/WordPressBridge.php b/bridges/WordPressBridge.php index 53d7df08..fcb2a0dd 100644 --- a/bridges/WordPressBridge.php +++ b/bridges/WordPressBridge.php @@ -5,62 +5,75 @@ * * @name Wordpress Bridge * @description Returns the newest articles of a blog hosted on wordpress -* @use1(s="subdomain", f="folder") +* @use1(url="blog URL", name="blog name") */ class WordpressBridge extends BridgeAbstract{ - private $subdomain; - private $folder; + private $url; + private $name; public function collectData(array $param){ $this->processParams($param); - if (!$this->hasSubdomain()) { - $this->returnError('You must specify a subdomain', 400); + if (!$this->hasUrl()) { + $this->returnError('You must specify a URL', 400); } - $html = file_get_html($this->getSiteURI()) or $this->returnError("Could not request {$this->getSiteURI()}.", 404); + $html = file_get_html($this->url) or $this->returnError("Could not request {$this->url}.", 404); foreach($html->find('.post') as $article) { - $item = new \Item(); - - $uri = $article->find('a[rel=bookmark]',0)->href; - $item->uri = $uri; - $item->title = $article->find('h2',0)->innertext; - $item->content = $article->find('.entry',0)->innertext; - preg_match('/\d{4}\/\d{2}\/\d{2}/', $uri, $matches); - $date = new \DateTime($matches[0]); - $item->timestamp = $date->format('U'); - $this->items[] = $item; + $uri = $article->find('a',0)->href; + $this->items[] = $this->getDetails($uri); } } + private function getDetails($uri) { + $html = file_get_html($uri) or exit; + + $item = new \Item(); + + $article = $html->find('.post', 0); + $item->uri = $uri; + $item->title = $article->find('h1',0)->innertext; + $item->content = 
$this->clearContent($article->find('.entry-content,.entry',0)->innertext); + $item->timestamp = $this->getDate($uri); + + return $item; + } + + private function clearContent($content) { + $content = preg_replace('//', '', $content); + return $content; + } + + private function getDate($uri) { + preg_match('/\d{4}\/\d{2}\/\d{2}/', $uri, $matches); + $date = new \DateTime($matches[0]); + return $date->format('U'); + } + public function getName(){ - return 'Wordpress.com Bridge'; + return "{$this->name} - Wordpress.com Bridge"; } public function getURI(){ - return 'http://%s.wordpress.com/%s'; + return $this->url; } public function getCacheDuration(){ return 21600; // 6 hours } - private function getSiteURI(){ - return sprintf($this->getURI(), $this->subdomain, $this->folder); - } - - private function hasSubdomain(){ - if (empty($this->subdomain)){ + private function hasUrl(){ + if (empty($this->url)){ return false; } return true; } private function processParams($param){ - $this->subdomain = $param['s']; - $this->folder = $param['f']; + $this->url = $param['url']; + $this->name = $param['name']; } } From 21d81068ce88b3cf6479871a5d00d4ea587428c2 Mon Sep 17 00:00:00 2001 From: Alexis Degrugillier Date: Sun, 4 May 2014 07:26:10 -0400 Subject: [PATCH 3/3] Fix indent --- bridges/WordPressBridge.php | 53 +++++++++++++++++++------------------ 1 file changed, 27 insertions(+), 26 deletions(-) diff --git a/bridges/WordPressBridge.php b/bridges/WordPressBridge.php index fcb2a0dd..274f3809 100644 --- a/bridges/WordPressBridge.php +++ b/bridges/WordPressBridge.php @@ -1,18 +1,19 @@ processParams($param); if (!$this->hasUrl()) { @@ -21,11 +22,11 @@ class WordpressBridge extends BridgeAbstract{ $html = file_get_html($this->url) or $this->returnError("Could not request {$this->url}.", 404); - foreach($html->find('.post') as $article) { - $uri = $article->find('a',0)->href; + foreach ($html->find('.post') as $article) { + $uri = $article->find('a', 0)->href; $this->items[] = 
$this->getDetails($uri); } - } + } private function getDetails($uri) { $html = file_get_html($uri) or exit; @@ -34,8 +35,8 @@ class WordpressBridge extends BridgeAbstract{ $article = $html->find('.post', 0); $item->uri = $uri; - $item->title = $article->find('h1',0)->innertext; - $item->content = $this->clearContent($article->find('.entry-content,.entry',0)->innertext); + $item->title = $article->find('h1', 0)->innertext; + $item->content = $this->clearContent($article->find('.entry-content,.entry', 0)->innertext); $item->timestamp = $this->getDate($uri); return $item; @@ -52,26 +53,26 @@ class WordpressBridge extends BridgeAbstract{ return $date->format('U'); } - public function getName(){ - return "{$this->name} - Wordpress.com Bridge"; - } + public function getName() { + return "{$this->name} - Wordpress.com Bridge"; + } - public function getURI(){ - return $this->url; - } + public function getURI() { + return $this->url; + } - public function getCacheDuration(){ - return 21600; // 6 hours - } + public function getCacheDuration() { + return 21600; // 6 hours + } - private function hasUrl(){ - if (empty($this->url)){ + private function hasUrl() { + if (empty($this->url)) { return false; } return true; } - private function processParams($param){ + private function processParams($param) { $this->url = $param['url']; $this->name = $param['name']; }