From d0b1789560fbb7c6dbf4e02b7a83421f9fc20f6e Mon Sep 17 00:00:00 2001 From: tillcash Date: Sun, 10 Aug 2025 15:49:27 +0530 Subject: [PATCH] [CybernewsBridge] add bridge --- bridges/CybernewsBridge.php | 104 ++++++++++++++++++++++++++++++++++++ 1 file changed, 104 insertions(+) create mode 100644 bridges/CybernewsBridge.php diff --git a/bridges/CybernewsBridge.php b/bridges/CybernewsBridge.php new file mode 100644 index 00000000..e4cdd7b9 --- /dev/null +++ b/bridges/CybernewsBridge.php @@ -0,0 +1,104 @@ +url as $entry) { + $url = trim((string) $entry->loc); + $lastmod = trim((string) $entry->lastmod); + + if (!$url) { + continue; + } + + $pathParts = explode('/', trim(parse_url($url, PHP_URL_PATH), '/')); + $category = isset($pathParts[0]) && $pathParts[0] !== '' ? $pathParts[0] : ''; + + // Skip non-English versions of articles? + if (in_array($category, ['nl', 'de'])) { + continue; + } + + $namespaces = $entry->getNamespaces(true); + $title = ''; + if (isset($namespaces['news'])) { + $news = $entry->children($namespaces['news'])->news; + if ($news) { + $title = trim((string) $news->title); + } + } + + if (!$title) { + continue; + } + + $this->items[] = [ + 'title' => $title, + 'uri' => $url, + 'uid' => $url, + 'timestamp' => strtotime($lastmod), + 'categories' => $category ? 
[$category] : [],
+                'content' => $this->fetchFullArticle($url),
+            ];
+
+            if (++$articleCount >= self::MAX_ARTICLES) {
+                break;
+            }
+        }
+    }
+
+    /**
+     * Build the HTML content of a feed item from the article page.
+     *
+     * Loads the page through getSimpleHTMLDOMCached(), takes the first
+     * <article> element, removes scripts, styles and the div classes listed
+     * below, copies lazy-load "data-src" attributes into "src" on images, and
+     * returns the inner HTML of the article element.
+     *
+     * @param string $url Absolute URL of the article page.
+     *
+     * @return string Inner HTML of the article, or a short plain-text failure
+     *                message when the page cannot be fetched or contains no
+     *                <article> element.
+     */
+    private function fetchFullArticle(string $url): string
+    {
+        try {
+            $html = getSimpleHTMLDOMCached($url);
+        } catch (\Exception $e) {
+            // The fetch helper can raise on failed requests (NOTE(review):
+            // confirm against the RSS-Bridge version in use). Without this
+            // guard a single unreachable article would abort the whole feed
+            // instead of falling back to the placeholder text below.
+            return 'Failed to fetch article content';
+        }
+
+        if (!$html) {
+            return 'Failed to fetch article content';
+        }
+
+        $article = $html->find('article', 0);
+        if (!$article) {
+            return 'Failed to parse article content';
+        }
+
+        // Elements that should not end up in the feed item.
+        $removeSelectors = [
+            'script',
+            'style',
+            'div.links-bar',
+            'div.google-news-cta',
+            'div.a-wrapper',
+            'div.embed_youtube',
+        ];
+        foreach ($removeSelectors as $selector) {
+            foreach ($article->find($selector) as $element) {
+                // Blanking outertext drops the node from the serialized output.
+                $element->outertext = '';
+            }
+        }
+
+        // Lazy-loaded images keep their URL in data-src; move it into src so
+        // the image is usable without the site's JavaScript.
+        foreach ($article->find('img') as $img) {
+            if (!empty($img->{'data-src'})) {
+                $img->src = $img->{'data-src'};
+                unset($img->{'data-src'});
+            }
+        }
+
+        // Cast keeps the declared string return type honest.
+        return (string) $article->innertext;
+    }
+}