From 85ac9001d6c700658664c0e5cab9f27f80ee93b8 Mon Sep 17 00:00:00 2001 From: LogMANOriginal Date: Tue, 13 Feb 2018 21:46:33 +0100 Subject: [PATCH 1/2] [IPBBridge] Add bridge (#564) This bridge returns feeds for any URI that is compatible with the IPB implementation (currently 4.x). Older versions might work, but there is no guarantee. Only forum and topic URIs are supported! The bridge automatically checks if natural feeds are available (by adding '.xml' to the URI). If so the feed is returned. Otherwise the bridge will attempt to identify the content type and build a feed accordingly. Valid URIs are forums and topics. For forums the first page is returned, for topics the last one. Elements are ordered such that the latest entry is returned first (newest-to-oldest) The optional parameter '&limit=' specifies how many pages should be loaded (default: 1). Topics are loaded in reverse order. => Does not work with forums! Images are provided as enclosures and scaled to a max-size of 400x400 pixels by default (Except for natural feeds). The content is filtered before being returned: - Unnecessary tags are removed (iframes, etc...) 
- Styles for blockquotes are restored (grey background) Closes #507 --- bridges/IPBBridge.php | 307 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 307 insertions(+) create mode 100644 bridges/IPBBridge.php diff --git a/bridges/IPBBridge.php b/bridges/IPBBridge.php new file mode 100644 index 00000000..f3fa14f4 --- /dev/null +++ b/bridges/IPBBridge.php @@ -0,0 +1,307 @@ + array( + 'name' => 'URI', + 'type' => 'text', + 'required' => true, + 'title' => 'Insert forum, subforum or topic URI', + 'exampleValue' => 'https://invisioncommunity.com/forums/forum/499-feedback-and-ideas/' + ), + 'limit' => array( + 'name' => 'Limit', + 'type' => 'number', + 'required' => false, + 'title' => 'Specify how many pages should be fetched (-1: all)', + 'defaultValue' => 1 + ) + ) + ); + const CACHE_TIMEOUT = 3600; + + // Constants for internal use + const FORUM_TYPE_LIST_FILTER = '.cForumTopicTable'; + const FORUM_TYPE_TABLE_FILTER = '#forum_table'; + + const TOPIC_TYPE_ARTICLE = 'article'; + const TOPIC_TYPE_DIV = 'div.post_block'; + + public function getURI(){ + return $this->getInput('uri') ?: parent::getURI(); + } + + public function collectData(){ + // The URI cannot be the mainpage (or anything related) + switch(parse_url($this->getInput('uri'), PHP_URL_PATH)) { + case null: + case '/index.php': + returnClientError('Provided URI is invalid!'); + break; + default: + break; + } + + // Sanitize the URI (because else it won't work) + $uri = rtrim($this->getInput('uri'), '/'); // No trailing slashes! + + // Forums might provide feeds, though that's optional *facepalm* + // Let's check if there is a valid feed available + $headers = get_headers($uri . '.xml'); + + if($headers[0] === 'HTTP/1.1 200 OK') { // Heureka! It's a valid feed! + return $this->collectExpandableDatas($uri); + } + + // No valid feed, so do it the hard way + $html = getSimpleHTMLDOM($uri) + or returnServerError('Could not request ' . $this->getInput('uri') . 
'!'); + + $limit = $this->getInput('limit'); + + // Determine if this is a topic or a forum + switch(true) { + case $this->isTopic($html): + $this->collectTopic($html, $limit); + break; + case $this->isForum($html): + $this->collectForum($html); + break; + default: + returnClientError('Unknown type!'); + break; + } + } + + private function isForum($html){ + return !is_null($html->find('div[data-controller*=forums.front.forum.forumPage]', 0)) + || !is_null($html->find(static::FORUM_TYPE_TABLE_FILTER, 0)); + } + + private function isTopic($html){ + return !is_null($html->find('div[data-controller*=core.front.core.commentFeed]', 0)) + || !is_null($html->find(static::TOPIC_TYPE_DIV, 0)); + } + + private function collectForum($html){ + // There are multiple forum designs in use (depends on version?) + // 1 - Uses an ordered list (based on https://invisioncommunity.com/forums) + // 2 - Uses a table (based on https://onehallyu.com) + + switch(true) { + case !is_null($html->find(static::FORUM_TYPE_LIST_FILTER, 0)): + $this->collectForumList($html); + break; + case !is_null($html->find(static::FORUM_TYPE_TABLE_FILTER, 0)): + $this->collectForumTable($html); + break; + default: + returnClientError('Unknown forum format!'); + break; + } + } + + private function collectForumList($html){ + foreach($html->find(static::FORUM_TYPE_LIST_FILTER, 0)->children() as $row) { + // Columns: Title, Statistics, Last modified + $item = array(); + + $item['uri'] = $row->find('a', 0)->href; + $item['title'] = $row->find('a', 0)->title; + $item['author'] = $row->find('a', 1)->innertext; + $item['timestamp'] = strtotime($row->find('time', 0)->getAttribute('datetime')); + + $this->items[] = $item; + } + } + + private function collectForumTable($html){ + foreach($html->find(static::FORUM_TYPE_TABLE_FILTER, 0)->children() as $row) { + // Columns: Icon, Content, Preview, Statistics, Last modified + $item = array(); + + // Skip header row + if(!is_null($row->find('th', 0))) continue; + + $item['uri'] 
= $row->find('a', 0)->href; + $item['title'] = $row->find('.title', 0)->plaintext; + $item['timestamp'] = strtotime($row->find('[itemprop=dateCreated]', 0)->plaintext); + + $this->items[] = $item; + } + } + + private function collectTopic($html, $limit){ + // There are multiple topic designs in use (depends on version?) + // 1 - Uses articles (based on https://invisioncommunity.com/forums) + // 2 - Uses divs (based on https://onehallyu.com) + + switch(true) { + case !is_null($html->find(static::TOPIC_TYPE_ARTICLE, 0)): + $this->collectTopicHistory($html, $limit, 'collectTopicArticle'); + break; + case !is_null($html->find(static::TOPIC_TYPE_DIV, 0)): + $this->collectTopicHistory($html, $limit, 'collectTopicDiv'); + break; + default: + returnClientError('Unknown topic format!'); + break; + } + } + + private function collectTopicHistory($html, $limit, $callback){ + // Make sure the callback is valid! + if(!method_exists($this, $callback)) + returnServerError('Unknown function (\'' . $callback . '\')!'); + + $next = null; // Holds the URI of the next page + + do { + // Skip loading HTML on first iteration + if(!is_null($next)) { + $html = getSimpleHTMLDOMCached($next); + } + + $next = $this->$callback($html, is_null($next)); + $limit--; + } while(!is_null($next) && $limit <> 0); + } + + private function collectTopicArticle($html, $firstrun = true){ + $title = $html->find('h1.ipsType_pageTitle', 0)->plaintext; + + // Are we on last page? 
+ if($firstrun && !is_null($html->find('.ipsPagination', 0))) { + $last = $html->find('.ipsPagination_last a', 0)->{'data-page'}; + $active = $html->find('.ipsPagination_active a', 0)->{'data-page'}; + + if($active !== $last) { + // Load last page into memory (cached) + $html = getSimpleHTMLDOMCached($html->find('.ipsPagination_last a', 0)->href); + } + } + + foreach(array_reverse($html->find(static::TOPIC_TYPE_ARTICLE)) as $article) { + $item = array(); + + $item['uri'] = $article->find('time', 0)->parent()->href; + $item['author'] = $article->find('aside a', 0)->plaintext; + $item['title'] = $item['author'] . ' - ' . $title; + $item['timestamp'] = strtotime($article->find('time', 0)->getAttribute('datetime')); + + $content = $article->find('[data-role=commentContent]', 0); + $content = $this->scaleImages($content); + $item['content'] = $this->fixContent($content); + $item['enclosures'] = $this->findImages($article->find('[data-role=commentContent]', 0)) ?: null; + + $this->items[] = $item; + } + + // Return whatever page comes next (previous, as we add in inverse order) + // Do we have a previous page? (inactive means no) + if(!is_null($html->find('li[class=ipsPagination_prev ipsPagination_inactive]', 0))) { + return null; // No, or no more + } elseif(!is_null($html->find('li[class=ipsPagination_prev]', 0))) { + return $html->find('.ipsPagination_prev a', 0)->href; + } + + return null; + } + + private function collectTopicDiv($html, $firstrun = true){ + $title = $html->find('h1.ipsType_pagetitle', 0)->plaintext; + + // Are we on last page? 
+ if($firstrun && !is_null($html->find('.pagination', 0))) { + + $active = $html->find('li[class=page active]', 0)->plaintext; + + // There are two ways the 'last' page is displayed: + // - With a distict 'last' button (only if there are enough pages) + // - With a button for each page (use last button) + if(!is_null($html->find('li.last', 0))) { + $last = $html->find('li.last a', 0); + } else { + $last = $html->find('li[class=page] a', -1); + } + + if($active !== $last->plaintext) { + // Load last page into memory (cached) + $html = getSimpleHTMLDOMCached($last->href); + } + } + + foreach(array_reverse($html->find(static::TOPIC_TYPE_DIV)) as $article) { + $item = array(); + + $item['uri'] = $article->find('a[rel=bookmark]', 0)->href; + $item['author'] = $article->find('.author', 0)->plaintext; + $item['title'] = $item['author'] . ' - ' . $title; + $item['timestamp'] = strtotime($article->find('.published', 0)->getAttribute('title')); + + $content = $article->find('[itemprop=commentText]', 0); + $content = $this->scaleImages($content); + $item['content'] = $this->fixContent($content); + + $item['enclosures'] = $this->findImages($article->find('.post_body', 0)) ?: null; + + $this->items[] = $item; + } + + // Return whatever page comes next (previous, as we add in inverse order) + // Do we have a previous page? 
+ if(!is_null($html->find('li.prev', 0))) { + return $html->find('li.prev a', 0)->href; + } + + return null; + } + + /** Returns all images from the provide HTML DOM */ + private function findImages($html){ + $images = array(); + + foreach($html->find('img') as $img) { + $images[] = $img->src; + } + + return $images; + } + + /** Sets the maximum width and height for all images */ + private function scaleImages($html, $width = 400, $height = 400){ + foreach($html->find('img') as $img) { + $img->style = "max-width: {$width}px; max-height: {$height}px;"; + } + + return $html; + } + + /** Removes all unnecessary tags and adds formatting */ + private function fixContent($html){ + + // Restore quote highlighting + foreach($html->find('blockquote') as $quote) { + $quote->style = <<innertext, + '



    • ' + ); + + return $content; + } +} From ef402bb5c3f953300eaf559bf0e4f54b2628f68f Mon Sep 17 00:00:00 2001 From: sysadminstory Date: Wed, 14 Feb 2018 12:03:44 +0100 Subject: [PATCH 2/2] [DealabsBride] Fix for the new site (#595) * [DealabsBride] Fix for the new site --- bridges/DealabsBridge.php | 372 +++++++++++++++++++++++++++----------- 1 file changed, 263 insertions(+), 109 deletions(-) diff --git a/bridges/DealabsBridge.php b/bridges/DealabsBridge.php index d6b1e661..cd020171 100644 --- a/bridges/DealabsBridge.php +++ b/bridges/DealabsBridge.php @@ -2,8 +2,7 @@ class DealabsBridge extends BridgeAbstract { const NAME = 'Dealabs search bridge'; const URI = 'https://www.dealabs.com/'; - const DESCRIPTION = 'Return the Dealabs search result using keywords, - with/without expired deals, with/without shop deals and by category'; + const DESCRIPTION = 'Return the Dealabs search result using keywords'; const MAINTAINER = 'sysadminstory'; const PARAMETERS = array( array ( 'q' => array( @@ -11,97 +10,56 @@ class DealabsBridge extends BridgeAbstract { 'type' => 'text', 'required' => true ), - 'expired_choice' => array( - 'name' => 'Afficher deals expirés', - 'type' => 'checkbox' + 'hide_expired' => array( + 'name' => 'Masquer les éléments expirés', + 'type' => 'checkbox', + 'required' => 'true' ), - 'instore_choice' => array( - 'name' => 'Afficher deals en magasin', - 'type' => 'checkbox' + 'hide_local' => array( + 'name' => 'Masquer les deals locaux', + 'type' => 'checkbox', + 'title' => 'Masquer les deals en magasins physiques', + 'required' => 'true' + ), + 'priceFrom' => array( + 'name' => 'Prix minimum', + 'type' => 'text', + 'title' => 'Prix mnimum en euros', + 'required' => 'false', + 'defaultValue' => '' + ), + 'priceTo' => array( + 'name' => 'Prix maximum', + 'type' => 'text', + 'title' => 'Prix maximum en euros', + 'required' => 'false', + 'defaultValue' => '' ), - 'cat' => array( - 'name' => 'Catégorie', - 'type' => 'list', - 'values' => array( - 'Toutes 
les catégories' => '', - 'High-tech' => array( - 'Tous' => 'c2', - 'Informatique' => 's3', - 'Téléphonie' => 's4', - 'Accessoires, consommables' => 's6', - 'Gadgets' => 's8', - 'Applications, logiciels' => 's46' - ), - 'Audiovisuel' => array( - 'Tous' => 'c5', - 'Image et son' => 's9', - 'Photo, caméscopes' => 's10', - 'CD, DVD, Blu-ray' => 's11', - 'Jeux vidéo, consoles' => 's12' - ), - 'Loisirs' => array( - 'Tous' => 'c7', - 'Jeux, jouets' => 's13', - 'Livres, papeterie' => 's14', - 'Plein air' => 's15', - 'Sport' => 's35', - 'Auto/Moto, accessoires' => 's37', - 'Animaux, accessoires' => 's47', - 'Instruments de musique' => 's48' - ), - 'Mode' => array( - 'Tous' => 'c16', - 'Homme' => 's17', - 'Femme' => 's18', - 'Mixte' => 's50', - 'Enfants' => 's19', - 'Puériculture' => 's36', - 'Beauté, santé' => 's21', - 'Bijoux, accessoires' => 's20', - 'Bagagerie' => 's38' - ), - 'Maison' => array( - 'Tous' => 'c23', - 'Meuble, literie, déco' => 's24', - 'Cuisine, art de la table' => 's25', - 'Électroménager' => 's26', - 'Bricolage' => 's27', - 'Jardin' => 's28' - ), - 'Services' => array( - 'Tous' => 'c51', - 'Voyages' => 's57', - 'Hébergement, restauration' => 's52', - 'Sorties' => 's53', - 'Presse' => 's24', - 'Bien-être' => 's55', - 'Transport, expédition' => 's56', - 'Autres' => 's58' - ), - 'Épicerie' => 'c31' - - ) - ) - - )); const CACHE_TIMEOUT = 3600; public function collectData(){ $q = $this->getInput('q'); + $hide_expired = $this->getInput('hide_expired'); + $hide_local = $this->getInput('hide_local'); + $priceFrom = $this->getInput('priceFrom'); + $priceTo = $this->getInput('priceTo'); - $expired_choice = $this->getInput('expired_choice'); - $instore_choice = $this->getInput('instore_choice'); - $cat_subcat = $this->getInput('cat'); + /* Even if the original website uses POST with the search page, GET works too */ $html = getSimpleHTMLDOM(self::URI - . '/search/?q=' + . '/search/advanced?q=' . urlencode($q) - . '&hide_expired=' - . $expired_choice - . 
'&hide_instore=' - . $instore_choice - . '&' . $this->getCatSubcatParam($cat_subcat)) + . '&hide_expired='. $hide_expired + . '&hide_local='. $hide_local + . '&priceFrom='. urlencode($priceFrom) + . '&priceTo='. urlencode($priceTo) + /* Some default parameters + * search_fields : Search in Titres & Descriptions & Codes + * sort_by : Sort the search by new deals + * time_frame : Search will not be on a limited timeframe + */ + . '&search_fields[]=1&search_fields[]=2&search_fields[]=3&sort_by=new&time_frame=0') or returnServerError('Could not request Dealabs.'); $list = $html->find('article'); if($list === null) { @@ -110,24 +68,232 @@ class DealabsBridge extends BridgeAbstract { foreach($list as $deal) { $item = array(); - $item['uri'] = $deal->find('a.title', 0)->href; - $item['title'] = $deal->find('a.title', 0)->plaintext; - $item['author'] = $deal->find('a.poster_link', 0)->plaintext; - $item['content'] = '
      ' - . $deal->find('div.image_part', 0)->outertext + $item['uri'] = $deal->find('div[class=threadGrid-title]', 0)->find('a', 0)->href; + $item['title'] = $deal->find( + 'a[class=cept-tt thread-link linkPlain space--r-1 size--all-s size--fromW3-m]', 0 + )->plaintext; + $item['author'] = $deal->find('span.thread-username', 0)->plaintext; + $item['content'] = '

      ' + . $deal->find('a[class=cept-tt thread-link linkPlain space--r-1 size--all-s size--fromW3-m]', 0)->innertext + . '

      ' + . $this->getPrix($deal) + . $this->getReduction($deal) + . $this->getExpedition($deal) + . $this->getLivraison($deal) + . $this->getOrigine($deal) + . $deal->find( + 'div[class=cept-description-container overflow--wrap-break size--all-s size--fromW3-m]', 0 + )->innertext . '
      ' - . $deal->find('a.title', 0)->outertext - . $deal->find('p.description', 0)->outertext - . '' - . $deal->find('div.vote_part', 0)->outertext + . $deal->find('div[class=flex flex--align-c flex--justify-space-between space--b-2]', 0)->children(0)->outertext . '
      '; - $item['timestamp'] = $this->relativeDateToTimestamp( - $deal->find('p.date_deal', 0)->plaintext); + $dealDateDiv = $deal->find('div[class=size--all-s flex flex--wrap flex--justify-e flex--grow-1]', 0) + ->find('span[class=hide--toW3]'); + $itemDate = end($dealDateDiv)->plaintext; + if(substr( $itemDate, 0, 6 ) === 'il y a') { + $item['timestamp'] = $this->relativeDateToTimestamp($itemDate); + } else { + $item['timestamp'] = $this->parseDate($itemDate); + } $this->items[] = $item; } } + /** + * Get the Price from a Deal if it exists + * @return string String of the deal price + */ + private function getPrix($deal) + { + if($deal->find( + 'span[class*=thread-price]', 0) != null) { + return '
      Prix : ' + . $deal->find( + 'span[class*=thread-price]', 0 + )->plaintext + . '
      '; + } else { + return ''; + } + } + + + /** + * Get the Shipping costs from a Deal if it exists + * @return string String of the deal shipping Cost + */ + private function getLivraison($deal) + { + if($deal->find('span[class*=cept-shipping-price]', 0) != null) { + if($deal->find('span[class*=cept-shipping-price]', 0)->children(0) != null) { + return '
      Livraison : ' + . $deal->find('span[class*=cept-shipping-price]', 0)->children(0)->innertext + . '
      '; + } else { + return '
      Livraison : ' + . $deal->find('span[class*=cept-shipping-price]', 0)->innertext + . '
      '; + } + } else { + return ''; + } + } + + /** + * Get the source of a Deal if it exists + * @return string String of the deal source + */ + private function getOrigine($deal) + { + if($deal->find('a[class=text--color-greyShade]', 0) != null) { + return '
      Origine : ' + . $deal->find('a[class=text--color-greyShade]', 0)->outertext + . '
      '; + } else { + return ''; + } + } + + /** + * Get the original Price and discout from a Deal if it exists + * @return string String of the deal original price and discount + */ + private function getReduction($deal) + { + if($deal->find('span[class*=mute--text text--lineThrough]', 0) != null) { + return '
      Réduction : ' + . $deal->find( + 'span[class*=mute--text text--lineThrough]', 0 + )->plaintext + . ' ' + . $deal->find('span[class=space--ml-1 size--all-l size--fromW3-xl]', 0)->plaintext + . '
      '; + } else { + return ''; + } + } + + /** + * Get the Picture URL from a Deal if it exists + * @return string String of the deal Picture URL + */ + private function getImage($deal) + { + + $selectorLazy = implode( + ' ', /* Notice this is a space! */ + array( + 'thread-image', + 'width--all-auto', + 'height--all-auto', + 'imgFrame-img', + 'cept-thread-img', + 'img--dummy', + 'js-lazy-img' + ) + ); + + $selectorPlain = implode( + ' ', /* Notice this is a space! */ + array( + 'thread-image', + 'width--all-auto', + 'height--all-auto', + 'imgFrame-img', + 'cept-thread-img' + ) + ); + if($deal->find('img[class='. $selectorLazy .']', 0) != null) { + return json_decode( + html_entity_decode( + $deal->find('img[class='. $selectorLazy .']', 0) + ->getAttribute('data-lazy-img')))->{'src'}; + } else { + return $deal->find('img[class='. $selectorPlain .']', 0 )->src; + } + } + + /** + * Get the originating country from a Deal if it existsa + * @return string String of the deal originating country + */ + private function getExpedition($deal) + { + $selector = implode( + ' ', /* Notice this is a space! */ + array( + 'meta-ribbon', + 'overflow--wrap-off', + 'space--l-3', + 'text--color-greyShade' + ) + ); + if($deal->find('span[class='. $selector .']', 0) != null) { + return '
      ' + . $deal->find('span[class='. $selector .']', 0)->children(2)->plaintext + . '
      '; + } else { + return ''; + } + } + + /** + * Transforms a French date into a timestam + * @return int timestamp of the input date + */ + private function parseDate($string) + { + $month_fr = array( + 'janvier', + 'février', + 'mars', + 'avril', + 'mai', + 'juin', + 'juillet', + 'août', + 'septembre', + 'octobre', + 'novembre', + 'décembre' + ); + $month_en = array( + 'January', + 'February', + 'March', + 'April', + 'May', + 'June', + 'July', + 'August', + 'September', + 'October', + 'November', + 'December' + ); + $date_str = trim(str_replace($month_fr, $month_en, $string)); + + if(!preg_match('/[0-9]{4}/', $string)) { + $date_str .= ' ' . date('Y'); + } + $date_str .= ' 00:00'; + + $date = DateTime::createFromFormat('j F Y H:i', $date_str); + return $date->getTimestamp(); + } + + /** + * Transforms a relate French date into a timestam + * @return int timestamp of the input date + */ private function relativeDateToTimestamp($str) { $date = new DateTime(); $search = array( @@ -137,7 +303,8 @@ class DealabsBridge extends BridgeAbstract { 'jour', 'jours', 'mois', - 'ans' + 'ans', + 'et ' ); $replace = array( '-', @@ -145,25 +312,12 @@ class DealabsBridge extends BridgeAbstract { 'hour', 'day', 'month', - 'year' + 'year', + '' ); $date->modify(str_replace($search, $replace, $str)); return $date->getTimestamp(); } - private function getCatSubcatParam($str) { - if(strlen($str) >= 2) { - if(substr($str, 0, 1) == 'c') { - $var_name = 'cat[]'; - } else if(substr($str, 0, 1) == 's') { - $var_name = 'sub_cat[]'; - } - $value = substr($str, 1); - return $var_name .'='. $value; - } else { - return ''; - } - } - }