From 00dd81a8aadad1ec94e955255abc2dab151222da Mon Sep 17 00:00:00 2001 From: ORelio Date: Wed, 25 Mar 2020 20:40:17 +0100 Subject: [PATCH 01/11] [DarkReading] Hide dummy articles --- bridges/DarkReadingBridge.php | 2 ++ 1 file changed, 2 insertions(+) diff --git a/bridges/DarkReadingBridge.php b/bridges/DarkReadingBridge.php index 3baaad75..6ab83e9c 100644 --- a/bridges/DarkReadingBridge.php +++ b/bridges/DarkReadingBridge.php @@ -53,6 +53,8 @@ class DarkReadingBridge extends FeedExpander { protected function parseItem($newsItem){ $item = parent::parseItem($newsItem); + if (empty($item['content'])) + return null; //ignore dummy articles $article = getSimpleHTMLDOMCached($item['uri']) or returnServerError('Could not request Dark Reading: ' . $item['uri']); $item['content'] = $this->extractArticleContent($article); From 062dd7f8a58782bbb841bdfeb0384f836410e786 Mon Sep 17 00:00:00 2001 From: ORelio Date: Wed, 25 Mar 2020 21:09:43 +0100 Subject: [PATCH 02/11] [FuturaSciences] Strip inline scripts from content --- bridges/FuturaSciencesBridge.php | 1 + 1 file changed, 1 insertion(+) diff --git a/bridges/FuturaSciencesBridge.php b/bridges/FuturaSciencesBridge.php index 772f4438..5859bc41 100644 --- a/bridges/FuturaSciencesBridge.php +++ b/bridges/FuturaSciencesBridge.php @@ -129,6 +129,7 @@ class FuturaSciencesBridge extends FeedExpander { $contents = stripWithDelimiters($contents, 'fs:xt:clickname="', '"'); $contents = StripWithDelimiters($contents, '
', $string); + $string = preg_replace('/\!\[([^\]]*)\]\(([^\) ]+)(?: [^\)]+)?\)/', '$1', $string); $string = preg_replace('/\[([^\]]+)\]\(([^\)]+)\)/', '$1', $string); $string = preg_replace('/\*\*(.*)\*\*/U', '$1', $string); $string = preg_replace('/\*(.*)\*/U', '$1', $string); From 90e9c9962a3d9cf7c98173b1fd4e1e48adf9bca2 Mon Sep 17 00:00:00 2001 From: ORelio Date: Wed, 25 Mar 2020 23:57:22 +0100 Subject: [PATCH 05/11] [TheHackerNews] Fix Author name cleanup --- bridges/TheHackerNewsBridge.php | 1 + 1 file changed, 1 insertion(+) diff --git a/bridges/TheHackerNewsBridge.php b/bridges/TheHackerNewsBridge.php index 687b620c..1e710b31 100644 --- a/bridges/TheHackerNewsBridge.php +++ b/bridges/TheHackerNewsBridge.php @@ -17,6 +17,7 @@ class TheHackerNewsBridge extends BridgeAbstract { $article_url = $element->find('a.story-link', 0)->href; $article_author = trim($element->find('i.icon-user', 0)->parent()->plaintext); + $article_author = str_replace('', '', $article_author); $article_title = $element->find('h2.home-title', 0)->plaintext; //Date without time From 8b173b88740924d20850ad3f39c71379fb2512f2 Mon Sep 17 00:00:00 2001 From: ORelio Date: Thu, 26 Mar 2020 23:05:19 +0100 Subject: [PATCH 06/11] [LeMondeInformatique] Remove encoding conversion Was previously needed due to actual encoding on the page being inconsistent with encoding specified in tag --- bridges/LeMondeInformatiqueBridge.php | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/bridges/LeMondeInformatiqueBridge.php b/bridges/LeMondeInformatiqueBridge.php index 45aa6075..b85a9631 100644 --- a/bridges/LeMondeInformatiqueBridge.php +++ b/bridges/LeMondeInformatiqueBridge.php @@ -26,8 +26,8 @@ class LeMondeInformatiqueBridge extends FeedExpander { //No response header sets the encoding, explicit conversion is needed or subsequent xml_encode() will fail $content_node = $article_html->find('div.col-primary, div.col-sm-9', 0); - $item['content'] = utf8_encode($this->cleanArticle($content_node->innertext)); - $item['author'] = utf8_encode($article_html->find('div.author-infos', 0)->find('b', 0)->plaintext); + $item['content'] = $this->cleanArticle($content_node->innertext); + $item['author'] = $article_html->find('div.author-infos', 0)->find('b', 0)->plaintext; return $item; } From efd1abfab193e314306a19d2495632b2a26bf1f5 Mon Sep 17 00:00:00 2001 From: ORelio Date: Fri, 15 May 2020 14:05:13 +0200 Subject: [PATCH 07/11] [AnimeUltime] Remove encoding conversion Was previously needed due to encoding on the page being incorrect --- bridges/AnimeUltimeBridge.php | 1 - 1 file changed, 1 deletion(-) diff --git a/bridges/AnimeUltimeBridge.php b/bridges/AnimeUltimeBridge.php index bc1dd7bc..c83d6ddb 100644 --- a/bridges/AnimeUltimeBridge.php +++ b/bridges/AnimeUltimeBridge.php @@ -102,7 +102,6 @@ class AnimeUltimeBridge extends BridgeAbstract { $item_description = defaultLinkTo($item_description, self::URI); $item_description = str_replace("\r", '', $item_description); $item_description = str_replace("\n", '', $item_description); - $item_description = utf8_encode($item_description); //Build and add final item $item = array(); From 66a009b8fb634bade38319376d056ca5cba1c800 Mon Sep 17 00:00:00 2001 From: ORelio Date: Sat, 23 May 2020 19:20:39 +0200 Subject: [PATCH 08/11] [FuturaSciences] Fix content extraction --- bridges/FuturaSciencesBridge.php | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/bridges/FuturaSciencesBridge.php b/bridges/FuturaSciencesBridge.php index 5859bc41..59b4734e 100644 --- a/bridges/FuturaSciencesBridge.php +++ b/bridges/FuturaSciencesBridge.php @@ -92,11 +92,12 @@ class FuturaSciencesBridge extends FeedExpander { $author = $this->extractAuthor($article); if (!empty($author)) $item['author'] = $author; + unset($article); return $item; } private function extractArticleContent($article){ - $contents = $article->find('section.article-text-classic', 0)->innertext; + $contents = $article->find('section.article-text', 1)->innertext; $headline = trim($article->find('p.description', 0)->plaintext); if(!empty($headline)) $headline = '

' . $headline . '

'; From 45e247b9d092f910a0b9ed03d8cbad1a4ed19554 Mon Sep 17 00:00:00 2001 From: ORelio Date: Sat, 23 May 2020 19:21:48 +0200 Subject: [PATCH 09/11] [FuturaSciences] Fix unneeded unset() --- bridges/FuturaSciencesBridge.php | 1 - 1 file changed, 1 deletion(-) diff --git a/bridges/FuturaSciencesBridge.php b/bridges/FuturaSciencesBridge.php index 59b4734e..79c05880 100644 --- a/bridges/FuturaSciencesBridge.php +++ b/bridges/FuturaSciencesBridge.php @@ -92,7 +92,6 @@ class FuturaSciencesBridge extends FeedExpander { $author = $this->extractAuthor($article); if (!empty($author)) $item['author'] = $author; - unset($article); return $item; } From f0e6298cab91e7c669aaa242681adb697783c49c Mon Sep 17 00:00:00 2001 From: ORelio Date: Fri, 7 Aug 2020 15:09:21 +0200 Subject: [PATCH 10/11] [GBAtemp] Fix tutorial mode URL extraction --- bridges/GBAtempBridge.php | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bridges/GBAtempBridge.php b/bridges/GBAtempBridge.php index 48a7f851..b1a86ad9 100644 --- a/bridges/GBAtempBridge.php +++ b/bridges/GBAtempBridge.php @@ -113,7 +113,7 @@ class GBAtempBridge extends BridgeAbstract { break; case 'T': foreach($html->find('li.portal-tutorial') as $tutorialItem) { - $url = self::URI . $tutorialItem->find('a', 0)->href; + $url = self::URI . $tutorialItem->find('a', 1)->href; $title = $tutorialItem->find('a', 0)->plaintext; $time = $this->findItemDate($tutorialItem); $author = $tutorialItem->find('a.username', 0)->plaintext; From c642652fea6fd7ea27eb264c88aeef3d5d65ef0e Mon Sep 17 00:00:00 2001 From: ORelio Date: Fri, 7 Aug 2020 15:19:14 +0200 Subject: [PATCH 11/11] [GBAtemp] Fix tutorial mode Title extraction --- bridges/GBAtempBridge.php | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bridges/GBAtempBridge.php b/bridges/GBAtempBridge.php index b1a86ad9..e0841950 100644 --- a/bridges/GBAtempBridge.php +++ b/bridges/GBAtempBridge.php @@ -114,7 +114,7 @@ class GBAtempBridge extends BridgeAbstract { case 'T': foreach($html->find('li.portal-tutorial') as $tutorialItem) { $url = self::URI . $tutorialItem->find('a', 1)->href; - $title = $tutorialItem->find('a', 0)->plaintext; + $title = $tutorialItem->find('a', 1)->plaintext; $time = $this->findItemDate($tutorialItem); $author = $tutorialItem->find('a.username', 0)->plaintext; $content = $this->fetchPostContent($url, self::URI);