diff --git a/bridges/AnthropicBridge.php b/bridges/AnthropicBridge.php deleted file mode 100644 index 1272d35f..00000000 --- a/bridges/AnthropicBridge.php +++ /dev/null @@ -1,147 +0,0 @@ - [ - 'limit' => [ - 'name' => 'Limit', - 'type' => 'number', - 'required' => true, - 'defaultValue' => 10 - ], - ] - ]; - - public function collectData() - { - // Anthropic sometimes returns 500 for no reason. The contents are still there. - $html = $this->getHTMLIgnoreError(self::URI . '/research'); - $limit = $this->getInput('limit'); - - $page_data = $this->extractPageData($html); - $pages = $this->parsePageData($page_data); - for ($i = 0; $i < min(count($pages), $limit); $i++) { - $page = $pages[$i]; - $page['content'] = $this->parsePage($page['uri']); - $this->items[] = $page; - } - } - - private function getHTMLIgnoreError($url, $ttl = null) - { - if ($ttl != null) { - $cacheKey = 'pages_' . $url; - $content = $this->cache->get($cacheKey); - if ($content) { - return str_get_html($content); - } - } - - try { - $content = getContents($url); - } catch (HttpException $e) { - $content = $e->response->getBody(); - } - if ($ttl != null) { - $this->cache->set($cacheKey, $content, $ttl); - } - return str_get_html($content); - } - - private function extractPageData($html) - { - foreach ($html->find('script') as $script) { - $js_code = $script->innertext; - if (!str_starts_with($js_code, 'self.__next_f.push(')) { - continue; - } - $push_data = (string)json_decode(mb_substr($js_code, 22, mb_strlen($js_code) - 2 - 22)); - $square_bracket = mb_strpos($push_data, '['); - $push_array = json_decode(mb_substr($push_data, $square_bracket), true); - if ($push_array == null || count($push_array) < 4) { - continue; - } - $page_data = $push_array[3]; - if ($page_data != null && array_key_exists('page', $page_data)) { - return $page_data; - } - } - } - - private function parsePageData($page_data) - { - $result = []; - foreach ($page_data['page']['sections'] as $section) { - if ( - !array_key_exists('internalName', $section) || - $section['internalName'] != 'Research Teams' - ) { - continue; - } - foreach ($section['tabPages'] as $tabPage) { - if ($tabPage['label'] != 'Overview') { - continue; - } - foreach ($tabPage['sections'] as $section1) { - if ( - !array_key_exists('title', $section1) - || $section1['title'] != 'Publications' - ) { - continue; - } - foreach ($section1['posts'] as $post) { - $enc = []; - if ($post['cta'] != null && array_key_exists('url', $post['cta'])) { - $enc = [$post['cta']['url']]; - } - $result[] = [ - 'title' => $post['title'], - 'timestamp' => $post['publishedOn'], - 'uri' => self::URI . '/research/' . $post['slug']['current'], - 'categories' => array_map( - fn($s) => $s['label'], - $post['subjects'], - ), - 'enclosures' => $enc, - ]; - } - break; - } - break; - } - break; - } - return $result; - } - - private function parsePage($url) - { - // Again, 500 for no reason. - $html = $this->getHTMLIgnoreError($url, 7 * 24 * 60 * 60); - - $content = ''; - - // Main content - $main = $html->find('div[class*="PostDetail_post-detail"] > article', 0); - - // Mostly YouTube videos - $iframes = $main->find('iframe'); - foreach ($iframes as $iframe) { - $iframe->parent->removeAttribute('style'); - $iframe->outertext = '' . $iframe->src . ''; - } - - $main = convertLazyLoading($main); - $main = defaultLinkTo($main, self::URI); - $content .= $main; - return $content; - } -}