diff --git a/bridges/CNETBridge.php b/bridges/CNETBridge.php index 34442abd..4a63c847 100644 --- a/bridges/CNETBridge.php +++ b/bridges/CNETBridge.php @@ -1,6 +1,6 @@ 'list', 'values' => [ 'All articles' => '', - 'Apple' => 'apple', - 'Google' => 'google', - 'Microsoft' => 'tags-microsoft', - 'Computers' => 'topics-computers', - 'Mobile' => 'topics-mobile', - 'Sci-Tech' => 'topics-sci-tech', - 'Security' => 'topics-security', - 'Internet' => 'topics-internet', - 'Tech Industry' => 'topics-tech-industry' + 'Tech' => 'tech', + 'Money' => 'personal-finance', + 'Home' => 'home', + 'Wellness' => 'health', + 'Energy' => 'home/energy-and-utilities', + 'Deals' => 'deals', + 'Computing' => 'tech/computing', + 'Mobile' => 'tech/mobile', + 'Science' => 'science', + 'Services' => 'tech/services-and-software' ] - ] + ], + 'limit' => self::LIMIT ] ]; - private function cleanArticle($article_html) - { - $offset_p = strpos($article_html, '

'); - $offset_figure = strpos($article_html, '', '', $article_html); - $article_html = str_replace('', '', $article_html); - $article_html = StripWithDelimiters($article_html, ''); - $article_html = stripWithDelimiters($article_html, 'find('div.originalImage', 0); - } - if (empty($article_thumbnail)) { - $article_thumbnail = $article_html->find('span.imageContainer', 0); - } - if (is_object($article_thumbnail)) { - $article_thumbnail = $article_thumbnail->find('img', 0)->src; - } - - $article_content .= trim( - $this->cleanArticle( - extractFromDelimiters( - $article_html, - 'find('script[type=application/ld+json]') as $ldjson) { + $datePublished = extractFromDelimiters($ldjson->innertext, '"datePublished":"', '"'); + if ($datePublished !== false) { + $date = strtotime($datePublished); + } + $imageObject = extractFromDelimiters($ldjson->innertext, 'ImageObject","url":"', '"'); + if ($imageObject !== false) { + $enclosure = $imageObject; } - - $item = []; - $item['uri'] = $article_uri; - $item['title'] = $article_title; - $item['author'] = $article_author; - $item['timestamp'] = $article_timestamp; - $item['enclosures'] = [$article_thumbnail]; - $item['content'] = $article_content; - $this->items[] = $item; } + + foreach ($content->find('div.c-shortcodeGallery') as $cleanup) { + $cleanup->outertext = ''; + } + + foreach ($content->find('figure') as $figure) { + $img = $figure->find('img', 0); + if ($img) { + $figure->outertext = $img->outertext; + } + } + + $content = $content->innertext; + + if ($enclosure) { + $content = "

" . $content; + } + + if ($headline) { + $content = '

' . $headline->plaintext . '


' . $content; + } + + $item = []; + $item['uri'] = $article_uri; + $item['title'] = $title; + $item['author'] = $author; + $item['content'] = $content; + + if (!is_null($date)) { + $item['timestamp'] = $date; + } + + if (!is_null($enclosure)) { + $item['enclosures'] = [$enclosure]; + } + + $this->items[] = $item; } } } diff --git a/bridges/SitemapBridge.php b/bridges/SitemapBridge.php index bdf662ee..bbbb3e16 100644 --- a/bridges/SitemapBridge.php +++ b/bridges/SitemapBridge.php @@ -131,7 +131,7 @@ class SitemapBridge extends CssSelectorBridge foreach ($sitemap->find('sitemap') as $nested_sitemap) { $url = $nested_sitemap->find('loc'); if (!empty($url)) { - $url = $url[0]->plaintext; + $url = trim($url[0]->plaintext); if (str_ends_with(strtolower($url), '.xml')) { $nested_sitemap_xml = $this->getSitemapXml($url, true); $nested_sitemap_links = $this->sitemapXmlToList($nested_sitemap_xml, $url_pattern, null, true); @@ -148,8 +148,8 @@ class SitemapBridge extends CssSelectorBridge $url = $item->find('loc'); $lastmod = $item->find('lastmod'); if (!empty($url) && !empty($lastmod)) { - $url = $url[0]->plaintext; - $lastmod = $lastmod[0]->plaintext; + $url = trim($url[0]->plaintext); + $lastmod = trim($lastmod[0]->plaintext); $timestamp = strtotime($lastmod); if (empty($url_pattern) || preg_match('/' . $url_pattern . '/', $url) === 1) { $links[$url] = $timestamp;