[CNet] Rewrite bridge (#3764) (#3770)

Bridge was broken.
Full bridge rewrite using Sitemap as source.
This commit is contained in:
ORelio 2023-10-18 19:13:33 +02:00 committed by GitHub
parent 7533ef12e3
commit 9056106c2d
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 87 additions and 87 deletions

View File

@ -1,6 +1,6 @@
<?php <?php
class CNETBridge extends BridgeAbstract class CNETBridge extends SitemapBridge
{ {
const MAINTAINER = 'ORelio'; const MAINTAINER = 'ORelio';
const NAME = 'CNET News'; const NAME = 'CNET News';
@ -14,101 +14,101 @@ class CNETBridge extends BridgeAbstract
'type' => 'list', 'type' => 'list',
'values' => [ 'values' => [
'All articles' => '', 'All articles' => '',
'Apple' => 'apple', 'Tech' => 'tech',
'Google' => 'google', 'Money' => 'personal-finance',
'Microsoft' => 'tags-microsoft', 'Home' => 'home',
'Computers' => 'topics-computers', 'Wellness' => 'health',
'Mobile' => 'topics-mobile', 'Energy' => 'home/energy-and-utilities',
'Sci-Tech' => 'topics-sci-tech', 'Deals' => 'deals',
'Security' => 'topics-security', 'Computing' => 'tech/computing',
'Internet' => 'topics-internet', 'Mobile' => 'tech/mobile',
'Tech Industry' => 'topics-tech-industry' 'Science' => 'science',
] 'Services' => 'tech/services-and-software'
] ]
],
'limit' => self::LIMIT
] ]
]; ];
/**
 * Trim and sanitise the HTML of a single article body.
 *
 * The markup is cut so that it starts at the first `<p>` or `<figure`
 * (whichever comes first), root-relative links are prefixed with the
 * bridge URI, and a handful of widgets/wrappers are removed.
 *
 * @param string $article_html Raw article markup.
 * @return string Cleaned article markup.
 */
private function cleanArticle($article_html)
{
    // strpos() yields false when the needle is missing. Comparing that false
    // against an integer offset with `<` silently resolved to offset 0 whenever
    // only one of the two markers was present, so nothing was trimmed. Keep only
    // the offsets that were really found and start at the earliest one.
    $found_offsets = array_filter(
        [strpos($article_html, '<p>'), strpos($article_html, '<figure')],
        static fn ($position) => $position !== false
    );
    if (!empty($found_offsets)) {
        $article_html = substr($article_html, min($found_offsets));
    }

    // Turn root-relative links into absolute ones.
    // NOTE(review): relies on self::URI ending with a slash - confirm against the class constant.
    $article_html = str_replace('href="/', 'href="' . self::URI, $article_html);

    // Drop zero-height attributes and unwrap <noscript> content (tags removed, content kept).
    $article_html = str_replace([' height="0"', '<noscript>', '</noscript>'], '', $article_html);

    // Remove whole elements that add nothing to a feed item.
    // Presumably stripWithDelimiters() deletes everything from the start
    // delimiter through the end delimiter - verify against the helper.
    $stripped_sections = [
        ['<a class="clickToEnlarge', '</a>'],
        ['<span class="nowPlaying', '</span>'],
        ['<span class="duration', '</span>'],
        ['<script', '</script>'],
        ['<svg', '</svg>'],
    ];
    foreach ($stripped_sections as [$start_delimiter, $end_delimiter]) {
        $article_html = stripWithDelimiters($article_html, $start_delimiter, $end_delimiter);
    }

    return $article_html;
}
public function collectData() public function collectData()
{ {
// Retrieve and check user input $topic = $this->getInput('topic');
$topic = str_replace('-', '/', $this->getInput('topic')); $limit = $this->getInput('limit');
if (!empty($topic) && (substr_count($topic, '/') > 1 || !ctype_alpha(str_replace('/', '', $topic)))) { $limit = empty($limit) ? 10 : $limit;
returnClientError('Invalid topic: ' . $topic);
}
// Retrieve webpage $url_pattern = empty($topic) ? '' : self::URI . $topic;
$pageUrl = self::URI . (empty($topic) ? 'news/' : $topic . '/'); $sitemap_latest = self::URI . 'sitemaps/article/' . date('Y/m') . '.xml';
$html = getSimpleHTMLDOM($pageUrl); $sitemap_previous = self::URI . 'sitemaps/article/' . date('Y/m', strtotime('last day of previous month')) . '.xml';
// Process articles $links = array_merge(
foreach ($html->find('div.assetBody, div.riverPost') as $element) { $this->sitemapXmlToList($this->getSitemapXml($sitemap_latest, true), $url_pattern, $limit),
if (count($this->items) >= 10) { $this->sitemapXmlToList($this->getSitemapXml($sitemap_previous, true), $url_pattern, $limit)
break;
}
$article_title = trim($element->find('h2, h3', 0)->plaintext);
$article_uri = self::URI . substr($element->find('a', 0)->href, 1);
$article_thumbnail = $element->parent()->find('img[src]', 0)->src;
$article_timestamp = strtotime($element->find('time.assetTime, div.timeAgo', 0)->plaintext);
$article_author = trim($element->find('a[rel=author], a.name', 0)->plaintext);
$article_content = '<p><b>' . trim($element->find('p.dek', 0)->plaintext) . '</b></p>';
if (is_null($article_thumbnail)) {
$article_thumbnail = extractFromDelimiters($element->innertext, '<img src="', '"');
}
if (!empty($article_title) && !empty($article_uri) && strpos($article_uri, self::URI . 'news/') !== false) {
$article_html = getSimpleHTMLDOMCached($article_uri) or $article_html = null;
if (!is_null($article_html)) {
if (empty($article_thumbnail)) {
$article_thumbnail = $article_html->find('div.originalImage', 0);
}
if (empty($article_thumbnail)) {
$article_thumbnail = $article_html->find('span.imageContainer', 0);
}
if (is_object($article_thumbnail)) {
$article_thumbnail = $article_thumbnail->find('img', 0)->src;
}
$article_content .= trim(
$this->cleanArticle(
extractFromDelimiters(
$article_html,
'<article',
'<footer'
)
)
); );
if ($limit > 0 && count($links) > $limit) {
$links = array_slice($links, 0, $limit);
}
if (empty($links)) {
returnClientError('Failed to retrieve article list');
}
foreach ($links as $article_uri) {
$article_dom = convertLazyLoading(getSimpleHTMLDOMCached($article_uri));
$title = trim($article_dom->find('h1', 0)->plaintext);
$author = $article_dom->find('span.c-assetAuthor_name', 0)->plaintext;
$headline = $article_dom->find('p.c-contentHeader_description', 0);
$content = $article_dom->find('div.c-pageArticle_content, div.single-article__content, div.article-main-body', 0);
$date = null;
$enclosure = null;
foreach ($article_dom->find('script[type=application/ld+json]') as $ldjson) {
$datePublished = extractFromDelimiters($ldjson->innertext, '"datePublished":"', '"');
if ($datePublished !== false) {
$date = strtotime($datePublished);
}
$imageObject = extractFromDelimiters($ldjson->innertext, 'ImageObject","url":"', '"');
if ($imageObject !== false) {
$enclosure = $imageObject;
}
}
foreach ($content->find('div.c-shortcodeGallery') as $cleanup) {
$cleanup->outertext = '';
}
foreach ($content->find('figure') as $figure) {
$img = $figure->find('img', 0);
if ($img) {
$figure->outertext = $img->outertext;
}
}
$content = $content->innertext;
if ($enclosure) {
$content = "<div><img src=\"$enclosure\" /></div>" . $content;
}
if ($headline) {
$content = '<p><b>' . $headline->plaintext . '</b></p><br />' . $content;
} }
$item = []; $item = [];
$item['uri'] = $article_uri; $item['uri'] = $article_uri;
$item['title'] = $article_title; $item['title'] = $title;
$item['author'] = $article_author; $item['author'] = $author;
$item['timestamp'] = $article_timestamp; $item['content'] = $content;
$item['enclosures'] = [$article_thumbnail];
$item['content'] = $article_content; if (!is_null($date)) {
$item['timestamp'] = $date;
}
if (!is_null($enclosure)) {
$item['enclosures'] = [$enclosure];
}
$this->items[] = $item; $this->items[] = $item;
} }
} }
} }
}

View File

@ -131,7 +131,7 @@ class SitemapBridge extends CssSelectorBridge
foreach ($sitemap->find('sitemap') as $nested_sitemap) { foreach ($sitemap->find('sitemap') as $nested_sitemap) {
$url = $nested_sitemap->find('loc'); $url = $nested_sitemap->find('loc');
if (!empty($url)) { if (!empty($url)) {
$url = $url[0]->plaintext; $url = trim($url[0]->plaintext);
if (str_ends_with(strtolower($url), '.xml')) { if (str_ends_with(strtolower($url), '.xml')) {
$nested_sitemap_xml = $this->getSitemapXml($url, true); $nested_sitemap_xml = $this->getSitemapXml($url, true);
$nested_sitemap_links = $this->sitemapXmlToList($nested_sitemap_xml, $url_pattern, null, true); $nested_sitemap_links = $this->sitemapXmlToList($nested_sitemap_xml, $url_pattern, null, true);
@ -148,8 +148,8 @@ class SitemapBridge extends CssSelectorBridge
$url = $item->find('loc'); $url = $item->find('loc');
$lastmod = $item->find('lastmod'); $lastmod = $item->find('lastmod');
if (!empty($url) && !empty($lastmod)) { if (!empty($url) && !empty($lastmod)) {
$url = $url[0]->plaintext; $url = trim($url[0]->plaintext);
$lastmod = $lastmod[0]->plaintext; $lastmod = trim($lastmod[0]->plaintext);
$timestamp = strtotime($lastmod); $timestamp = strtotime($lastmod);
if (empty($url_pattern) || preg_match('/' . $url_pattern . '/', $url) === 1) { if (empty($url_pattern) || preg_match('/' . $url_pattern . '/', $url) === 1) {
$links[$url] = $timestamp; $links[$url] = $timestamp;