<?php

/**
 * ZeitBridge - expands the zeit.de news feeds into full-text feeds.
 *
 * Provenance (recovered from a git patch whose line structure had been flattened):
 *   commit  9d871e8a45a92b3152e1c1987bf8f9034a87c31c
 *   author  Mynacol
 *   date    Wed, 21 Sep 2022 22:24:11 +0200
 *   subject [PATCH] [ZeitBridge] Add bridge for zeit.de (#3056)
 *   path    bridges/ZeitBridge.php (new file, index 00000000..d398ede0, 139 insertions)
 *
 * Commit message:
 *   - [ZeitBridge] Add bridge for zeit.de
 *     New bridge expanding the feeds of zeit.de to full-text ones.
 *     Circumvents cookie banners and Z+ premium article paywalls.
 *   - [ZeitBridge] Formatting
 *
 * NOTE(review): the start of the original file (opening tag, class declaration,
 * any class constants placed before PARAMETERS, and the opening of the
 * PARAMETERS array) was lost in extraction. The class name is taken from the
 * file path, the 'category' key from the getInput('category') call below, and
 * the array nesting from the surviving closing brackets. The parent class is
 * inferred from the parent::parseItem() and collectExpandableDatas() calls -
 * confirm against the upstream file.
 */
class ZeitBridge extends FeedExpander
{
    // NOTE(review): constants that originally preceded PARAMETERS (bridge name,
    // URI, description, maintainer, ... - whatever the bridge declared) could not
    // be recovered and are deliberately not invented here. Restore them from the
    // upstream file before using this class.

    /**
     * User-selectable inputs: the feed to expand and how many articles to return.
     * Each 'values' entry maps the label shown to the user onto the feed URL that
     * collectData() hands to collectExpandableDatas().
     */
    const PARAMETERS = [[
        'category' => [
            'name' => 'Category',
            'type' => 'list',
            'values' => [
                'Startseite'
                    => 'https://newsfeed.zeit.de/index',
                'Politik'
                    => 'https://newsfeed.zeit.de/politik/index',
                'Wirtschaft'
                    => 'https://newsfeed.zeit.de/wirtschaft/index',
                'Gesellschaft'
                    => 'https://newsfeed.zeit.de/gesellschaft/index',
                'Kultur'
                    => 'https://newsfeed.zeit.de/kultur/index',
                'Wissen'
                    => 'https://newsfeed.zeit.de/wissen/index',
                'Digital'
                    => 'https://newsfeed.zeit.de/digital/index',
                'ZEIT Campus ONLINE'
                    => 'https://newsfeed.zeit.de/campus/index',
                'ZEIT ONLINE Arbeit'
                    => 'https://newsfeed.zeit.de/arbeit/index',
                'ZEIT Magazin ONLINE'
                    => 'https://newsfeed.zeit.de/zeit-magazin/index',
                'Entdecken'
                    => 'https://newsfeed.zeit.de/entdecken/index',
                'Mobilität'
                    => 'https://newsfeed.zeit.de/mobilitaet/index',
                'Sport'
                    => 'https://newsfeed.zeit.de/sport/index',
                'Alle Inhalte'
                    => 'https://newsfeed.zeit.de/all'
            ]
        ],
        'limit' => [
            'name' => 'Limit',
            'type' => 'number',
            'required' => false,
            'title' => 'Specify number of full articles to return',
            'defaultValue' => 5
        ]
    ]];

    /** Fallback article count used when the 'limit' input is empty or 0. */
    const LIMIT = 5;

    /**
     * Hands the selected feed URL and the article limit to the inherited
     * collectExpandableDatas().
     */
    public function collectData()
    {
        $this->collectExpandableDatas(
            $this->getInput('category'),
            // ?: (not ??) on purpose: an empty or zero limit also falls back to LIMIT.
            $this->getInput('limit') ?: static::LIMIT
        );
    }

    /**
     * Turns one feed entry into an item and enriches it with the article page.
     *
     * The entry is first converted by parent::parseItem(); the article behind
     * $item['uri'] is then downloaded and passed to parseArticle(). If the
     * download yields no DOM, the item is returned as the parent produced it
     * (plus an empty 'enclosures' list) instead of failing on a method call
     * against a non-object.
     *
     * @param mixed $item Feed entry, forwarded unchanged to parent::parseItem().
     * @return array The item array, enriched when the article could be loaded.
     */
    protected function parseItem($item)
    {
        $item = parent::parseItem($item);
        $item['enclosures'] = [];

        $headers = [
            // Consent cookie carrying a timestamp; presumably what suppresses the
            // cookie banner mentioned in the commit message. The literal "Z" marks
            // the value as UTC, so it is formatted with gmdate() rather than the
            // server-local date().
            'Cookie: zonconsent=' . gmdate('Y-m-d\TH:i:s.v\Z'),
            // Crawler user agent; presumably what yields the full text of
            // Z+ articles - NOTE(review): confirm.
            'User-Agent: Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)',
        ];

        $article = getSimpleHTMLDOM($item['uri'], $headers);
        if (!$article) {
            return $item;
        }

        // Multi-page articles link to a one-page view; prefer it when present.
        if ($article->find('a[href="' . $item['uri'] . '/komplettansicht"]', 0)) {
            $onePageUri = $item['uri'] . '/komplettansicht';
            $onePage = getSimpleHTMLDOM($onePageUri, $headers);
            // Switch only if the one-page view really loaded; otherwise keep the
            // first page and the original URI.
            if ($onePage) {
                $item['uri'] = $onePageUri;
                $article = $onePage;
            }
        }

        $article = defaultLinkTo($article, $item['uri']);

        return $this->parseArticle($item, $article);
    }

    /**
     * Extracts enclosures, author and content from an article DOM into $item.
     *
     * Only the <main> element is considered. If it is missing, or the cleaned
     * markup cannot be re-parsed, the item is returned without article data.
     *
     * @param array $item    Item to enrich; 'enclosures' is appended to, 'content'
     *                       is appended to (created when absent), 'author' is set
     *                       when an author element is found.
     * @param mixed $article Article DOM as returned by defaultLinkTo().
     * @return array The enriched item.
     */
    private function parseArticle($item, $article)
    {
        // The feed entry may not have supplied any content; make the later .= safe.
        $item['content'] = $item['content'] ?? '';

        $main = $article->find('main', 0);
        if (!$main) {
            return $item;
        }

        // remove known bad elements
        $badSelector = 'aside, .visually-hidden, .carousel-container, #tickaroo-liveblog, '
            . '.zplus-badge, .article-heading__container--podcast';
        foreach ($main->find($badSelector) as $bad) {
            $bad->remove();
        }

        // reload html, as remove() is buggy
        $article = str_get_html($main->outertext);
        if (!$article) {
            return $item;
        }

        // podcast audio, if available
        $podcast = $article->find('.article-heading__podcast audio[src]', 0);
        if ($podcast) {
            $item['enclosures'][] = $podcast->src;
        }

        // full res images: promote data-src to src so the images collected into
        // the content below point at the same URL as the enclosure
        foreach ($article->find('img[data-src]') as $img) {
            $img->src = $img->getAttribute('data-src');
            $item['enclosures'][] = $img->src;
        }

        // authors: schema.org Person markup first, generic source line as fallback
        $authors = $article->find('*[itemtype*="schema.org/Person"]');
        if (!$authors) {
            $authors = $article->find('.metadata__source');
        }
        // Use the visible text of each node; joining the nodes directly would put
        // their HTML markup into the author field.
        $names = [];
        foreach ($authors as $author) {
            $name = trim($author->plaintext);
            if ($name !== '') {
                $names[] = $name;
            }
        }
        if ($names) {
            $item['author'] = implode(', ', $names);
        }

        // header image
        $headerimg = $article->find('*[data-ct-row="headerimage"]', 0)
            ?? $article->find('header', 0);
        if ($headerimg) {
            $item['content'] .= implode('', $headerimg->find('img[src], figcaption'));
        }

        // article content: keep only text, headings, captions and images per page
        foreach ($article->find('.article-page') as $page) {
            $item['content'] .= implode('', $page->find('p, h2, figcaption, img[src]'));
        }

        return $item;
    }
}