diff --git a/bridges/CNETBridge.php b/bridges/CNETBridge.php
index 34442abd..4a63c847 100644
--- a/bridges/CNETBridge.php
+++ b/bridges/CNETBridge.php
@@ -1,6 +1,6 @@
'list',
'values' => [
'All articles' => '',
- 'Apple' => 'apple',
- 'Google' => 'google',
- 'Microsoft' => 'tags-microsoft',
- 'Computers' => 'topics-computers',
- 'Mobile' => 'topics-mobile',
- 'Sci-Tech' => 'topics-sci-tech',
- 'Security' => 'topics-security',
- 'Internet' => 'topics-internet',
- 'Tech Industry' => 'topics-tech-industry'
+ 'Tech' => 'tech',
+ 'Money' => 'personal-finance',
+ 'Home' => 'home',
+ 'Wellness' => 'health',
+ 'Energy' => 'home/energy-and-utilities',
+ 'Deals' => 'deals',
+ 'Computing' => 'tech/computing',
+ 'Mobile' => 'tech/mobile',
+ 'Science' => 'science',
+ 'Services' => 'tech/services-and-software'
]
- ]
+ ],
+ 'limit' => self::LIMIT
]
];
- private function cleanArticle($article_html)
- {
- $offset_p = strpos($article_html, '
');
- $offset_figure = strpos($article_html, '', '', $article_html);
- $article_html = str_replace('', '', $article_html);
- $article_html = StripWithDelimiters($article_html, '');
- $article_html = stripWithDelimiters($article_html, 'find('div.originalImage', 0);
- }
- if (empty($article_thumbnail)) {
- $article_thumbnail = $article_html->find('span.imageContainer', 0);
- }
- if (is_object($article_thumbnail)) {
- $article_thumbnail = $article_thumbnail->find('img', 0)->src;
- }
-
- $article_content .= trim(
- $this->cleanArticle(
- extractFromDelimiters(
- $article_html,
- 'find('script[type=application/ld+json]') as $ldjson) {
+ $datePublished = extractFromDelimiters($ldjson->innertext, '"datePublished":"', '"');
+ if ($datePublished !== false) {
+ $date = strtotime($datePublished);
+ }
+ $imageObject = extractFromDelimiters($ldjson->innertext, 'ImageObject","url":"', '"');
+ if ($imageObject !== false) {
+ $enclosure = $imageObject;
}
-
- $item = [];
- $item['uri'] = $article_uri;
- $item['title'] = $article_title;
- $item['author'] = $article_author;
- $item['timestamp'] = $article_timestamp;
- $item['enclosures'] = [$article_thumbnail];
- $item['content'] = $article_content;
- $this->items[] = $item;
}
+
+ foreach ($content->find('div.c-shortcodeGallery') as $cleanup) {
+ $cleanup->outertext = '';
+ }
+
+ foreach ($content->find('figure') as $figure) {
+ $img = $figure->find('img', 0);
+ if ($img) {
+ $figure->outertext = $img->outertext;
+ }
+ }
+
+ $content = $content->innertext;
+
+ if ($enclosure) {
+ $content = "" . $content;
+ }
+
+ if ($headline) {
+ $content = '' . $headline->plaintext . '
' . $content;
+ }
+
+ $item = [];
+ $item['uri'] = $article_uri;
+ $item['title'] = $title;
+ $item['author'] = $author;
+ $item['content'] = $content;
+
+ if (!is_null($date)) {
+ $item['timestamp'] = $date;
+ }
+
+ if (!is_null($enclosure)) {
+ $item['enclosures'] = [$enclosure];
+ }
+
+ $this->items[] = $item;
}
}
}
diff --git a/bridges/SitemapBridge.php b/bridges/SitemapBridge.php
index bdf662ee..bbbb3e16 100644
--- a/bridges/SitemapBridge.php
+++ b/bridges/SitemapBridge.php
@@ -131,7 +131,7 @@ class SitemapBridge extends CssSelectorBridge
foreach ($sitemap->find('sitemap') as $nested_sitemap) {
$url = $nested_sitemap->find('loc');
if (!empty($url)) {
- $url = $url[0]->plaintext;
+ $url = trim($url[0]->plaintext);
if (str_ends_with(strtolower($url), '.xml')) {
$nested_sitemap_xml = $this->getSitemapXml($url, true);
$nested_sitemap_links = $this->sitemapXmlToList($nested_sitemap_xml, $url_pattern, null, true);
@@ -148,8 +148,8 @@ class SitemapBridge extends CssSelectorBridge
$url = $item->find('loc');
$lastmod = $item->find('lastmod');
if (!empty($url) && !empty($lastmod)) {
- $url = $url[0]->plaintext;
- $lastmod = $lastmod[0]->plaintext;
+ $url = trim($url[0]->plaintext);
+ $lastmod = trim($lastmod[0]->plaintext);
$timestamp = strtotime($lastmod);
if (empty($url_pattern) || preg_match('/' . $url_pattern . '/', $url) === 1) {
$links[$url] = $timestamp;