diff --git a/bridges/ArsTechnicaBridge.php b/bridges/ArsTechnicaBridge.php
index fcb1bd4f..ac722dc9 100644
--- a/bridges/ArsTechnicaBridge.php
+++ b/bridges/ArsTechnicaBridge.php
@@ -37,39 +37,82 @@ class ArsTechnicaBridge extends FeedExpander
{
$item_html = getSimpleHTMLDOMCached($item['uri']);
$item_html = defaultLinkTo($item_html, self::URI);
- $item['content'] = $item_html->find('.article-content', 0);
+
+ $content = '';
+ $header = $item_html->find('article header', 0);
+ $leading = $header->find('p[class*=leading]', 0);
+ if ($leading != null) {
+ $content .= '
' . $leading->innertext . '
';
+ }
+ $intro_image = $header->find('img.intro-image', 0);
+ if ($intro_image != null) {
+ $content .= '' . $intro_image;
+
+ $image_caption = $header->find('.caption .caption-content', 0);
+ if ($image_caption != null) {
+ $content .= '' . $image_caption->innertext . '';
+ }
+ $content .= '';
+ }
+
+ foreach ($item_html->find('.post-content') as $content_tag) {
+ $content .= $content_tag->innertext;
+ }
+
+ $item['content'] = str_get_html($content);
$parsely = $item_html->find('[name="parsely-page"]', 0);
$parsely_json = json_decode(html_entity_decode($parsely->content), true);
$item['categories'] = $parsely_json['tags'];
- $pages = $item_html->find('nav.page-numbers > .numbers > a', -2);
- if (null !== $pages) {
- for ($i = 2; $i <= $pages->innertext; $i++) {
- $page_url = $item['uri'] . '&page=' . $i;
- $page_html = getSimpleHTMLDOMCached($page_url);
- $page_html = defaultLinkTo($page_html, self::URI);
- $item['content'] .= $page_html->find('.article-content', 0);
+ // Some lightboxes are nested in figures. I'd guess that's a
+ // bug in the website
+ foreach ($item['content']->find('figure div div.ars-lightbox') as $weird_lightbox) {
+ $weird_lightbox->parent->parent->outertext = $weird_lightbox;
+ }
+
+ // It's easier to reconstruct the whole thing than remove
+ // duplicate reactive tags
+ foreach ($item['content']->find('.ars-lightbox') as $lightbox) {
+ $lightbox_content = '';
+ foreach ($lightbox->find('.ars-lightbox-item') as $lightbox_item) {
+ $img = $lightbox_item->find('img', 0);
+ if ($img != null) {
+ $lightbox_content .= '' . $img;
+ $caption = $lightbox_item->find('div.pswp-caption-content', 0);
+ if ($caption != null) {
+ $credit = $lightbox_item->find('div.ars-gallery-caption-credit', 0);
+ if ($credit != null) {
+ $credit->innertext = 'Credit: ' . $credit->innertext;
+ }
+ $lightbox_content .= '' . $caption->innertext . '';
+ }
+ $lightbox_content .= '';
+ }
}
- $item['content'] = str_get_html($item['content']);
+ $lightbox->innertext = $lightbox_content;
}
// remove various ars advertising
- $item['content']->find('#social-left', 0)->remove();
- foreach ($item['content']->find('.ars-component-buy-box') as $ad) {
+ foreach ($item['content']->find('.ars-interlude-container') as $ad) {
$ad->remove();
}
- foreach ($item['content']->find('.ad_wrapper') as $ad) {
- $ad->remove();
+ foreach ($item['content']->find('.toc-container') as $toc) {
+ $toc->remove();
}
- foreach ($item['content']->find('.sidebar') as $ad) {
- $ad->remove();
+
+ // Mostly YouTube videos
+ $iframes = $item['content']->find('iframe');
+ foreach ($iframes as $iframe) {
+ $iframe->outertext = '' . $iframe->src . '';
+ }
+ // This fixes the padding around the former iframes and actual inline videos
+ foreach ($item['content']->find('div[style*=aspect-ratio]') as $styled) {
+ $styled->removeAttribute('style');
}
$item['content'] = backgroundToImg($item['content']);
-
- $item['uid'] = explode('=', $item['uri'])[1];
-
+ $item['uid'] = strval($parsely_json['post_id']);
return $item;
}
}