diff --git a/bridges/RobinhoodSnacksBridge.php b/bridges/RobinhoodSnacksBridge.php index 313c6386..0f2eac83 100644 --- a/bridges/RobinhoodSnacksBridge.php +++ b/bridges/RobinhoodSnacksBridge.php @@ -7,22 +7,107 @@ class RobinhoodSnacksBridge extends BridgeAbstract { const CACHE_TIMEOUT = 86400; // 24h const DESCRIPTION = 'Returns newsletters from Robinhood Snacks'; + // Work around 403 by pretending to be a legit browser + const FAKE_HEADERS = array( + 'User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:100.0) Gecko/20100101 Firefox/100.0', + 'Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8', + 'Accept-Language: es-ES,en-US;q=0.7,en;q=0.3', + 'Accept-Encoding: gzip, deflate, br', + 'Connection: keep-alive', + 'Upgrade-Insecure-Requests: 1', + 'Sec-Fetch-Dest: document', + 'Sec-Fetch-Mode: navigate', + 'Sec-Fetch-Site: none', + 'Sec-Fetch-User: ?1', + 'Pragma: no-cache', + 'Cache-Control: no-cache', + 'TE: trailers' + ); + public function collectData() { - $html = getSimpleHTMLDOM(self::URI); + $html = getSimpleHTMLDOM(self::URI, self::FAKE_HEADERS); + $html = defaultLinkTo($html, $this->getURI()); - $elements = $html->find('#__next > div > div > div > div > div > a'); + $elements = $html->find('#__next > div > div > div > div > a'); foreach ($elements as $element) { if ($element->href === 'https://snacks.robinhood.com/newsletters/page/2/') { continue; } + $content = $element->find('div > div', 2); + + // Remove element that is not parsed (span with weekly tag) + $unwanted_selector = 'span'; + foreach($content->find($unwanted_selector) as $found) { + $found->outertext = ''; + } + + $title = $content->find('div', 0)->innertext; + $timestamp = strtotime($content->find('div', 1)->innertext); + $uri = $element->href; + $this->items[] = array( - 'uri' => $element->href, - 'title' => $element->find('div > div', 3)->plaintext, - 'content' => $element->find('div > div', 4)->plaintext, + 'uri' => $uri, + 'title' => $title, + 'timestamp' => $timestamp, + 'content' => self::getArticleContent($uri) ); } } + + private function getArticleContent($uri) + { + $article_html = getSimpleHTMLDOMCached($uri, self::CACHE_TIMEOUT, self::FAKE_HEADERS); + if(!$article_html) { + return ''; + } + + $content = $article_html->find('#__next > div > div > div > span', 0); + $content->removeChild($content->find('div', 0)); + $content->removeChild($content->find('h1', 0)); + $content->removeChild($content->find('img', 1)); + + // Remove elements that are not part of article content + $unwanted_selector = 'style'; + foreach($content->find($unwanted_selector) as $found) { + $found->outertext = ''; + } + + // Images cleanup + $already_displayed_pictures = array(); + foreach($content->find('img') as $found) { + // Skip loader images + if (str_contains($found->src, 'data:image/gif;base64')) { + $found->outertext = ''; + continue; + } + + // Skip multiple images with same src + // and remove duplicated image description + if (in_array($found->src, $already_displayed_pictures)) { + $found->parent->parent->parent->outertext = ''; + $found->parent->parent->parent->nextSibling()->nextSibling()->outertext = ''; + continue; + } + + // Remove srcset attribute + $found->removeAttribute('srcset'); + + // If relative img, fix path + if (str_starts_with($found->src, '/_next')) { + $found->setAttribute('src', 'https://snacks.robinhood.com' . $found->getAttribute('src')); + } + + $already_displayed_pictures[] = $found->src; + } + + $content_text = $content->innertext; + + // Remove noscript tag to display images + $content_text = str_replace('