[ArsTechnicaBridge] Refactor to website restructuring

This commit is contained in:
Phantop 2024-10-04 22:02:40 -04:00
parent 80c43f10d8
commit 9ffd89f4e3

View File

@@ -7,24 +7,24 @@ class ArsTechnicaBridge extends FeedExpander
const URI = 'https://arstechnica.com/'; const URI = 'https://arstechnica.com/';
const DESCRIPTION = 'Returns the latest articles from Ars Technica'; const DESCRIPTION = 'Returns the latest articles from Ars Technica';
const PARAMETERS = [[ const PARAMETERS = [[
'section' => [ 'section' => [
'name' => 'Site section', 'name' => 'Site section',
'type' => 'list', 'type' => 'list',
'defaultValue' => 'index', 'defaultValue' => 'index',
'values' => [ 'values' => [
'All' => 'index', 'All' => 'index',
'Apple' => 'apple', 'Apple' => 'apple',
'Board Games' => 'cardboard', 'Board Games' => 'cardboard',
'Cars' => 'cars', 'Cars' => 'cars',
'Features' => 'features', 'Features' => 'features',
'Gaming' => 'gaming', 'Gaming' => 'gaming',
'Information Technology' => 'technology-lab', 'Information Technology' => 'technology-lab',
'Science' => 'science', 'Science' => 'science',
'Staff Blogs' => 'staff-blogs', 'Staff Blogs' => 'staff-blogs',
'Tech Policy' => 'tech-policy', 'Tech Policy' => 'tech-policy',
'Tech' => 'gadgets', 'Tech' => 'gadgets',
]
] ]
]
]]; ]];
public function collectData() public function collectData()
@@ -36,39 +36,23 @@ class ArsTechnicaBridge extends FeedExpander
protected function parseItem(array $item) protected function parseItem(array $item)
{ {
$item_html = getSimpleHTMLDOMCached($item['uri']); $item_html = getSimpleHTMLDOMCached($item['uri']);
$item_html = defaultLinkTo($item_html, self::URI); $parsely = $item_html->find('[name="parsely-page"]', 0)->content;
$item['content'] = $item_html->find('.article-content', 0); $parsely_json = Json::decode(html_entity_decode($parsely));
$parsely = $item_html->find('[name="parsely-page"]', 0);
$parsely_json = json_decode(html_entity_decode($parsely->content), true);
$item['categories'] = $parsely_json['tags']; $item['categories'] = $parsely_json['tags'];
$item['comments'] = $item_html->find('#comments a', 0)->href;
$pages = $item_html->find('nav.page-numbers > .numbers > a', -2); $item['content'] = '';
if (null !== $pages) { foreach ($item_html->find('.post-content') as $content) {
for ($i = 2; $i <= $pages->innertext; $i++) { $item['content'] .= $content;
$page_url = $item['uri'] . '&page=' . $i;
$page_html = getSimpleHTMLDOMCached($page_url);
$page_html = defaultLinkTo($page_html, self::URI);
$item['content'] .= $page_html->find('.article-content', 0);
}
$item['content'] = str_get_html($item['content']);
}
// remove various ars advertising
$item['content']->find('#social-left', 0)->remove();
foreach ($item['content']->find('.ars-component-buy-box') as $ad) {
$ad->remove();
}
foreach ($item['content']->find('.ad_wrapper') as $ad) {
$ad->remove();
}
foreach ($item['content']->find('.sidebar') as $ad) {
$ad->remove();
} }
$item['content'] = backgroundToImg($item['content']); $item['content'] = backgroundToImg($item['content']);
$item['uid'] = explode('=', $item['uri'])[1]; // remove various ars advertising
$sel = '#social-left, .ars-component-buy-box, .ad_wrapper, .sidebar, .toc-container, .ars-gallery-caption-arrow';
foreach ($item['content']->find($sel) as $ad) {
$ad->remove();
}
return $item; return $item;
} }