From 0de2db853f139e6f7c11dcd4cf402e72660d20ef Mon Sep 17 00:00:00 2001 From: Corentin Garcia Date: Wed, 30 Jun 2021 12:14:25 +0200 Subject: [PATCH] [NYTBridge] Fix article parsing (#2106) Co-authored-by: podiki --- bridges/NYTBridge.php | 23 +++++++++++++++++------ 1 file changed, 17 insertions(+), 6 deletions(-) diff --git a/bridges/NYTBridge.php b/bridges/NYTBridge.php index 687d0889..15fded3a 100644 --- a/bridges/NYTBridge.php +++ b/bridges/NYTBridge.php @@ -4,22 +4,33 @@ class NYTBridge extends FeedExpander { const MAINTAINER = 'IceWreck'; const NAME = 'New York Times Bridge'; const URI = 'https://www.nytimes.com/'; - const CACHE_TIMEOUT = 3600; + const CACHE_TIMEOUT = 900; // 15 minutes const DESCRIPTION = 'RSS feed for the New York Times'; public function collectData(){ - $this->collectExpandableDatas('https://rss.nytimes.com/services/xml/rss/nyt/HomePage.xml', 15); + $this->collectExpandableDatas('https://rss.nytimes.com/services/xml/rss/nyt/HomePage.xml', 40); } protected function parseItem($newsItem){ $item = parent::parseItem($newsItem); + $article = ''; + // $articlePage gets the entire page's contents $articlePage = getSimpleHTMLDOM($newsItem->link); + + // handle subtitle + $subtitle = $articlePage->find('p.css-w6ymp8', 0); + if ($subtitle != null) { + $article .= '' . $subtitle->plaintext . ''; + } + // figure contain's the main article image - $article = $articlePage->find('figure', 0); - // p > css-exrw3m has the actual article - foreach($articlePage->find('p.css-exrw3m') as $element) - $article = $article . $element; + $article .= $articlePage->find('figure', 0) . '
'; + + // section.meteredContent has the actual article + foreach($articlePage->find('section.meteredContent p') as $element) + $article .= '' . $element . ''; + $item['content'] = $article; return $item; }