From 9ffd89f4e3cb213921e0656e1819b2941157a07a Mon Sep 17 00:00:00 2001 From: Phantop Date: Fri, 4 Oct 2024 22:02:40 -0400 Subject: [PATCH] [ArsTechnicaBridge] Refactor to website restructuring --- bridges/ArsTechnicaBridge.php | 72 ++++++++++++++--------------------- 1 file changed, 28 insertions(+), 44 deletions(-) diff --git a/bridges/ArsTechnicaBridge.php b/bridges/ArsTechnicaBridge.php index fcb1bd4f..8bf7e41a 100644 --- a/bridges/ArsTechnicaBridge.php +++ b/bridges/ArsTechnicaBridge.php @@ -7,24 +7,24 @@ class ArsTechnicaBridge extends FeedExpander const URI = 'https://arstechnica.com/'; const DESCRIPTION = 'Returns the latest articles from Ars Technica'; const PARAMETERS = [[ - 'section' => [ - 'name' => 'Site section', - 'type' => 'list', - 'defaultValue' => 'index', - 'values' => [ - 'All' => 'index', - 'Apple' => 'apple', - 'Board Games' => 'cardboard', - 'Cars' => 'cars', - 'Features' => 'features', - 'Gaming' => 'gaming', - 'Information Technology' => 'technology-lab', - 'Science' => 'science', - 'Staff Blogs' => 'staff-blogs', - 'Tech Policy' => 'tech-policy', - 'Tech' => 'gadgets', - ] + 'section' => [ + 'name' => 'Site section', + 'type' => 'list', + 'defaultValue' => 'index', + 'values' => [ + 'All' => 'index', + 'Apple' => 'apple', + 'Board Games' => 'cardboard', + 'Cars' => 'cars', + 'Features' => 'features', + 'Gaming' => 'gaming', + 'Information Technology' => 'technology-lab', + 'Science' => 'science', + 'Staff Blogs' => 'staff-blogs', + 'Tech Policy' => 'tech-policy', + 'Tech' => 'gadgets', ] + ] ]]; public function collectData() @@ -36,39 +36,23 @@ class ArsTechnicaBridge extends FeedExpander protected function parseItem(array $item) { $item_html = getSimpleHTMLDOMCached($item['uri']); - $item_html = defaultLinkTo($item_html, self::URI); - $item['content'] = $item_html->find('.article-content', 0); + $parsely = $item_html->find('[name="parsely-page"]', 0)->content; + $parsely_json = Json::decode(html_entity_decode($parsely)); - $parsely = $item_html->find('[name="parsely-page"]', 0); - $parsely_json = json_decode(html_entity_decode($parsely->content), true); $item['categories'] = $parsely_json['tags']; - - $pages = $item_html->find('nav.page-numbers > .numbers > a', -2); - if (null !== $pages) { - for ($i = 2; $i <= $pages->innertext; $i++) { - $page_url = $item['uri'] . '&page=' . $i; - $page_html = getSimpleHTMLDOMCached($page_url); - $page_html = defaultLinkTo($page_html, self::URI); - $item['content'] .= $page_html->find('.article-content', 0); - } - $item['content'] = str_get_html($item['content']); - } - - // remove various ars advertising - $item['content']->find('#social-left', 0)->remove(); - foreach ($item['content']->find('.ars-component-buy-box') as $ad) { - $ad->remove(); - } - foreach ($item['content']->find('.ad_wrapper') as $ad) { - $ad->remove(); - } - foreach ($item['content']->find('.sidebar') as $ad) { - $ad->remove(); + $item['comments'] = $item_html->find('#comments a', 0)->href; + $item['content'] = ''; + foreach ($item_html->find('.post-content') as $content) { + $item['content'] .= $content; } $item['content'] = backgroundToImg($item['content']); - $item['uid'] = explode('=', $item['uri'])[1]; + // remove various ars advertising + $sel = '#social-left, .ars-component-buy-box, .ad_wrapper, .sidebar, .toc-container, .ars-gallery-caption-arrow'; + foreach ($item['content']->find($sel) as $ad) { + $ad->remove(); + } return $item; }