From 8bcf4ebfbf220adbd65becdc824f0832c80424f9 Mon Sep 17 00:00:00 2001 From: csisoap <33269526+csisoap@users.noreply.github.com> Date: Fri, 1 Oct 2021 20:39:36 +0700 Subject: [PATCH] [NationalGeographicBridge] Rewrite bridge (#2177) - All the option will be preserved. - Add timestamp, author's name included with full article. --- bridges/NationalGeographicBridge.php | 316 ++++++++++++++++++++------- 1 file changed, 242 insertions(+), 74 deletions(-) diff --git a/bridges/NationalGeographicBridge.php b/bridges/NationalGeographicBridge.php index dfccd25c..458a26da 100644 --- a/bridges/NationalGeographicBridge.php +++ b/bridges/NationalGeographicBridge.php @@ -6,11 +6,12 @@ class NationalGeographicBridge extends BridgeAbstract { const PARAMETER_FULL_ARTICLE = 'full'; const TOPIC_MAGAZINE = 'Magazine'; const TOPIC_LATEST_STORIES = 'Latest Stories'; + const CACHE_TIMEOUT = 900; //15 min const NAME = 'National Geographic'; const URI = 'https://www.nationalgeographic.com/'; const DESCRIPTION = 'Fetches the latest articles from the National Geographic Magazine'; - const MAINTAINER = 'logmanoriginal'; + const MAINTAINER = 'csisoap'; const PARAMETERS = array( self::CONTEXT_BY_TOPIC => array( self::PARAMETER_TOPIC => array( @@ -28,12 +29,22 @@ class NationalGeographicBridge extends BridgeAbstract { self::PARAMETER_FULL_ARTICLE => array( 'name' => 'Full Article', 'type' => 'checkbox', - 'title' => 'Enable to load full articles (takes longer)' + 'title' => 'Enable to load full articles and other infos (takes longer)' ) ) ); private $topicName = ''; + const CONTEXT = 'eyJjb250ZW50VHlwZSI6IlVuaXNvbkh1YiIsInZhcmlhYmxlcyI6eyJsb2NhdG9yIjoiL3BhZ2VzL3 + RvcGljL2xhdGVzdC1zdG9yaWVzIiwicG9ydGZvbGlvIjoibmF0Z2VvIiwicXVlcn + lUeXBlIjoiTE9DQVRPUiJ9LCJtb2R1bGVJZCI6bnVsbH0'; + const LATEST_STORIES_ID = array( + '1df278bb-0e3d-4a67-a0ce-8fae48392822-f2-m1' + ); + const MAGAZINE_ID = array( + '94d87d74-f41a-4a32-9acd-b591ba2df288-f2-m1', + '94d87d74-f41a-4a32-9acd-b591ba2df288-f5-m2', + ); public function getURI() { switch ($this->queriedContext) { @@ -46,9 +57,16 @@ class NationalGeographicBridge extends BridgeAbstract { } } + private function getAPIURL($id) { + $context = preg_replace('/\s*/m', '', self::CONTEXT); + $url = 'https://www.nationalgeographic.com/proxy/hub?context=' + . $context . '&id=' . $id + . '&moduleType=InfiniteFeedModule&_xhr=pageContent'; + return $url; + } + public function collectData() { $this->topicName = $this->getTopicName($this->getInput(self::PARAMETER_TOPIC)); - switch($this->topicName) { case self::TOPIC_MAGAZINE: { return $this->collectMagazine(); @@ -78,28 +96,35 @@ class NationalGeographicBridge extends BridgeAbstract { } private function collectMagazine() { - $uri = $this->getURI(); + $stories = array(); - $html = getSimpleHTMLDOM($uri) - or returnServerError('Could not request ' . $uri); + foreach(self::MAGAZINE_ID as $id) { + $uri = $this->getAPIURL($id); - $script = $html->find('#lead-component script')[0]; + $json_raw = getContents($uri); - $json = json_decode($script->innertext, true); + $json = json_decode($json_raw, true)['tiles']; + $stories = array_merge($json, $stories); + } - // This is probably going to break in the future, fix it then :) - foreach($json['body']['0']['multilayout_promo_beta']['stories'] as $story) { + foreach($stories as $story) { $this->addStory($story); } } private function collectLatestStories() { - $uri = self::URI . 'latest-stories/_jcr_content/content/hubfeed.promo-hub-feed-all-stories.json'; + $stories = array(); - $json_raw = getContents($uri) - or returnServerError('Could not request ' . $uri); + foreach(self::LATEST_STORIES_ID as $id) { + $uri = $this->getAPIURL($id); - foreach(json_decode($json_raw, true) as $story) { + $json_raw = getContents($uri); + + $json = json_decode($json_raw, true)['tiles']; + $stories = array_merge($stories, $json); + } + + foreach($stories as $story) { $this->addStory($story); } } @@ -107,88 +132,231 @@ class NationalGeographicBridge extends BridgeAbstract { private function addStory($story) { $title = 'Unknown title'; $content = ''; + $story_type = ''; + $uri = ''; - foreach($story['components'] as $component) { - switch($component['content_type']) { - case 'title': { - $title = $component['title']['text']; - } break; - case 'dek': { - $content = $component['dek']['text']; - } break; - } + foreach($story['ctas'] as $component) { + $uri = $component['url']; + $story_type = $component['icon']; } $item = array(); - - $item['uri'] = $story['uri']; - $item['title'] = $title; + if(isset($story['description'])) { + $content = '

' . $story['description'] . '

'; + } + $title = $story['title']; + $item['uri'] = $uri; + $item['title'] = $story['title']; // if full article is requested! - if ($this->getInput(self::PARAMETER_FULL_ARTICLE)) - $item['content'] = $this->getFullArticle($item['uri']); - else + if ($this->getInput(self::PARAMETER_FULL_ARTICLE)) { + if($story_type != 'interactive') { + /* Nat Geo doesn't provided much info about interactive page + * and it requires JS to load the interactive. + */ + $article_data = $this->getFullArticle($item['uri']); + $item['timestamp'] = $article_data['published_date']; + $item['author'] = $article_data['authors']; + $item['content'] = $content . $article_data['content']; + } else { + $item['content'] = $content; + } + } else $item['content'] = $content; - if (isset($story['promo_image'])) { - switch($story['promo_image']['content_type']) { - case 'image': { - $item['enclosures'][] = $story['promo_image']['image']['uri']; - } break; - } - } + $image = $story['img']; + $item['enclosures'][] = $image['src']; - if (isset($story['lead_media'])) { - $media = $story['lead_media']; - switch($media['content_type']) { - case 'image': { - // Don't add if promo_image was added - if (empty($item['enclosures'])) - $item['enclosures'][] = $media['image']['uri']; - } break; - case 'image_gallery': { - foreach($media['image_gallery']['images'] as $image) { - $item['enclosures'][] = $image['uri']; - } - } break; - } + $tags = $story['tags']; + foreach($tags as $tag) { + $tag_name = $tag['name']; + $item['categories'][] = $tag_name; } $this->items[] = $item; } + private function filterArticleData($data) { + $article_module = array_filter( + $data, function ($item) { + if(isset($item['id']) && $item['id'] == 'natgeo-template1-frame-1') { + return true; + } + } + ); + + $article_data = array_reduce( + $article_module, + function (array $carry, array $item) { + $module = $item['mods']; + return array_merge( + $carry, + array_filter( + $module, function ($data) { + return $data['id'] == 'natgeo-template1-frame-1-module-1'; + } + ) + ); + }, + array() + ); + + return $article_data[0]; + } + + private function handleImages($image_module, $image_type) { + $image_alt = ''; + $image_credit = ''; + $image_src = ''; + $image_caption = ''; + $caption = ''; + switch($image_type) { + case 'image': + case 'imagegroup': + $image = $image_module['image']; + $image_src = $image['src']; + if(isset($image_module['alt'])) { + $image_alt = $image_module['alt']; + } elseif(isset($image['altText'])) { + $image_alt = $image['altText']; + } + if(isset($image['crdt'])) { + $image_credit = $image['crdt']; + } + $caption = (isset($image_module['caption']) ? $image_module['caption'] : ''); + break; + case 'photogallery': + $image_credit = (isset($image_module['caption']['credit']) ? $image_module['caption']['credit'] : ''); + $caption = $image_module['caption']['text']; + $image_src = $image_module['img']['src']; + $image_alt = $image_module['img']['altText']; + break; + case 'video': + $image_credit = (isset($image_module['credit']) ? $image_module['credit'] : ''); + $description = (isset($image_module['description']) ? $image_module['description'] : ''); + $caption = $description . ' Video can be watched on the article\'s page'; + $image = $image_module['image']; + $image_alt = $image['altText']; + $image_src = $image['src']; + } + + $image_caption = $caption . ' ' . $image_credit + . '. Notes: Some image may have copyrighted on it.'; + $wrapper = << +{$image_alt} +
$image_caption
+ +EOD; + return $wrapper; + } + private function getFullArticle($uri) { - $html = getSimpleHTMLDOMCached($uri) + $html = getContents($uri) or returnServerError('Could not load ' . $uri); - $html = defaultLinkTo($html, $uri); + $scriptRegex = '/window\[\'__natgeo__\'\]=(.*);<\/script>/'; - $content = ''; + preg_match($scriptRegex, $html, $matches, PREG_OFFSET_CAPTURE, 0); - foreach($html->find(' - .content > .smartbody.text, - .content > .section.image script[type="text/json"], - .content > .section.image span[itemprop="caption"], - .content > .section.inline script[type="text/json"] - ') as $element) { - if ($element->tag === 'script') { - $json = json_decode($element->innertext, true); - if (isset($json['src'])) { - $content .= '' . $json['alt'] . ''; - } elseif (isset($json['galleryType']) && isset($json['endpoint'])) { - $doc = getContents($json['endpoint']) - or returnServerError('Could not load ' . $json['endpoint']); - $json = json_decode($doc, true); - foreach($json['items'] as $item) { - $content .= '

' . $item['caption'] . '

'; - $content .= '' . $item['caption'] . ''; - } - } + $json = json_decode($matches[1][0], true); + + $unfiltered_data = $json['page']['content']['article']['frms']; + $filtered_data = $this->filterArticleData($unfiltered_data); + + $article = $filtered_data['edgs'][0]; + + $contributors = $article['cntrbGrp']; + $authors = array(); + if(count($contributors) > 0) { + $authors = $contributors[0]['contributors']; + } + + $authors_name = ''; + $counter = 0; + foreach($authors as $author) { + $counter++; + if($counter == count($authors)) { + $authors_name .= $author['displayName']; } else { - $content .= $element->outertext; + $authors_name .= $author['displayName'] . ', '; } } - return $content; + $published_date = $article['pbDt']; + $article_body = $article['bdy']; + $content = ''; + + foreach($article_body as $body) { + switch($body['type']) { + case 'p': + $content .= '

' . $body['cntnt']['mrkup'] . '

'; + break; + case 'h2': + $content .= '

' . $body['cntnt']['mrkup'] . '

'; + break; + case 'inline': + $module = $body['cntnt']; + if(empty($module)) + continue 2; + switch($module['cmsType']) { + case 'image': + $content .= $this->handleImages($module, $module['cmsType']); + break; + case 'imagegroup': + $images = $module['images']; + foreach($images as $image) { + $content .= $this->handleImages($image, $module['cmsType']); + } + break; + case 'editorsNote': + $content .= $module['note']; + break; + case 'listicle': + $content .= '

' . $module['title'] . '

'; + if(isset($module['image'])) { + $content .= $this->handleImages($module['image'], $module['image']['cmsType']); + } + $content .= '

' . (isset($module['text']) ? $module['text'] : '') . '

'; + break; + case 'photogallery': + $gallery = $body['cntnt']['media']; + foreach($gallery as $image) { + $content .= $this->handleImages($image, $module['cmsType']); + } + break; + case 'video': + $content .= $this->handleImages($module, $module['cmsType']); + break; + case 'pullquote'; + $quote = $module['quote']; + $author_name = ''; + $authors = (isset($module['byLineProps']['authors']) ? $module['byLineProps']['authors'] : array()); + foreach($authors as $author) { + $author_desc = (isset($author['authorDesc']) ? $author['authorDesc'] : ''); + $author_name .= $author['displayName'] . ', ' . $author_desc; + } + $content .= << +
+

$quote

+
+
$author_name
+ +EOD; + break; + } + break; + case 'ul': + $content .= $body['cntnt']['mrkup'] . '
'; + break; + } + } + + return array( + 'content' => $content, + 'published_date' => $published_date, + 'authors' => $authors_name + ); } }