diff --git a/bridges/ArsTechnicaBridge.php b/bridges/ArsTechnicaBridge.php index fcb1bd4f..ac722dc9 100644 --- a/bridges/ArsTechnicaBridge.php +++ b/bridges/ArsTechnicaBridge.php @@ -37,39 +37,82 @@ class ArsTechnicaBridge extends FeedExpander { $item_html = getSimpleHTMLDOMCached($item['uri']); $item_html = defaultLinkTo($item_html, self::URI); - $item['content'] = $item_html->find('.article-content', 0); + + $content = ''; + $header = $item_html->find('article header', 0); + $leading = $header->find('p[class*=leading]', 0); + if ($leading != null) { + $content .= '

' . $leading->innertext . '

'; + } + $intro_image = $header->find('img.intro-image', 0); + if ($intro_image != null) { + $content .= '
' . $intro_image; + + $image_caption = $header->find('.caption .caption-content', 0); + if ($image_caption != null) { + $content .= '
' . $image_caption->innertext . '
'; + } + $content .= '
'; + } + + foreach ($item_html->find('.post-content') as $content_tag) { + $content .= $content_tag->innertext; + } + + $item['content'] = str_get_html($content); $parsely = $item_html->find('[name="parsely-page"]', 0); $parsely_json = json_decode(html_entity_decode($parsely->content), true); $item['categories'] = $parsely_json['tags']; - $pages = $item_html->find('nav.page-numbers > .numbers > a', -2); - if (null !== $pages) { - for ($i = 2; $i <= $pages->innertext; $i++) { - $page_url = $item['uri'] . '&page=' . $i; - $page_html = getSimpleHTMLDOMCached($page_url); - $page_html = defaultLinkTo($page_html, self::URI); - $item['content'] .= $page_html->find('.article-content', 0); + // Some lightboxes are nested in figures. I'd guess that's a + // bug in the website + foreach ($item['content']->find('figure div div.ars-lightbox') as $weird_lightbox) { + $weird_lightbox->parent->parent->outertext = $weird_lightbox; + } + + // It's easier to reconstruct the whole thing than remove + // duplicate reactive tags + foreach ($item['content']->find('.ars-lightbox') as $lightbox) { + $lightbox_content = ''; + foreach ($lightbox->find('.ars-lightbox-item') as $lightbox_item) { + $img = $lightbox_item->find('img', 0); + if ($img != null) { + $lightbox_content .= '
' . $img; + $caption = $lightbox_item->find('div.pswp-caption-content', 0); + if ($caption != null) { + $credit = $lightbox_item->find('div.ars-gallery-caption-credit', 0); + if ($credit != null) { + $credit->innertext = 'Credit: ' . $credit->innertext; + } + $lightbox_content .= '
' . $caption->innertext . '
'; + } + $lightbox_content .= '
'; + } } - $item['content'] = str_get_html($item['content']); + $lightbox->innertext = $lightbox_content; } // remove various ars advertising - $item['content']->find('#social-left', 0)->remove(); - foreach ($item['content']->find('.ars-component-buy-box') as $ad) { + foreach ($item['content']->find('.ars-interlude-container') as $ad) { $ad->remove(); } - foreach ($item['content']->find('.ad_wrapper') as $ad) { - $ad->remove(); + foreach ($item['content']->find('.toc-container') as $toc) { + $toc->remove(); } - foreach ($item['content']->find('.sidebar') as $ad) { - $ad->remove(); + + // Mostly YouTube videos + $iframes = $item['content']->find('iframe'); + foreach ($iframes as $iframe) { + $iframe->outertext = '' . $iframe->src . ''; + } + // This fixed padding around the former iframes and actual inline videos + foreach ($item['content']->find('div[style*=aspect-ratio]') as $styled) { + $styled->removeAttribute('style'); } $item['content'] = backgroundToImg($item['content']); - - $item['uid'] = explode('=', $item['uri'])[1]; - + $item['uid'] = strval($parsely_json['post_id']); return $item; } }