From e2460ead189deda3271347e74fe8c40c7840d9b8 Mon Sep 17 00:00:00 2001
From: Joseph
Date: Fri, 28 Jun 2019 13:45:27 +0000
Subject: [PATCH] [InternetArchiveBridge] Add new bridge (#1186)
---
bridges/InternetArchiveBridge.php | 293 ++++++++++++++++++++++++++++++
1 file changed, 293 insertions(+)
create mode 100644 bridges/InternetArchiveBridge.php
diff --git a/bridges/InternetArchiveBridge.php b/bridges/InternetArchiveBridge.php
new file mode 100644
index 00000000..a578075f
--- /dev/null
+++ b/bridges/InternetArchiveBridge.php
@@ -0,0 +1,293 @@
+ array(
+ 'username' => array(
+ 'name' => 'Username',
+ 'type' => 'text',
+ 'required' => true,
+ 'exampleValue' => '@verifiedjoseph',
+ ),
+ 'content' => array(
+ 'name' => 'Content',
+ 'type' => 'list',
+ 'values' => array(
+ 'Uploads' => 'uploads',
+ 'Posts' => 'posts',
+ 'Reviews' => 'reviews',
+ 'Collections' => 'collections',
+ 'Web Archives' => 'web-archive',
+ ),
+ 'defaultValue' => 'uploads',
+ )
+ )
+ );
+
+ // 900 seconds. Not referenced by the methods visible in this chunk;
+ // presumably read by the parent bridge class as the feed cache lifetime - confirm.
+ const CACHE_TIMEOUT = 900; // 15 mins
+
+ // Class attribute values of result divs that collectData() ignores.
+ // collectData() compares the whole class attribute string against these entries.
+ private $skipClasses = array(
+ 'item-ia mobile-header hidden-tiles',
+ 'item-ia account-ia'
+ );
+
+ /**
+  * Request the page returned by getURI() and fill $this->items.
+  *
+  * When the 'content' input is 'posts' the page is handed to processPosts().
+  * Otherwise every 'div.results > div[data-id]' element is dispatched on its
+  * class attribute to the matching process*() method. For every content type
+  * except 'reviews' the item is then merged with processHiddenDetails().
+  *
+  * Results whose class is listed in $skipClasses are ignored. Results that no
+  * process*() method handles produce no feed item.
+  */
+ public function collectData() {
+     $uri = $this->getURI();
+
+     $html = getSimpleHTMLDOM($uri)
+         or returnServerError('Could not request: ' . $uri);
+
+     $html = defaultLinkTo($html, $uri);
+
+     $content = $this->getInput('content');
+
+     if ($content === 'posts') {
+         $this->items = $this->processPosts($html);
+         return;
+     }
+
+     // Position of the 'div.details-ia.hidden-tiles' element that
+     // processHiddenDetails() reads for the current result.
+     $detailsDivNumber = 0;
+
+     foreach ($html->find('div.results > div[data-id]') as $result) {
+         // Strict comparison: only an exact class string match is skipped.
+         if (in_array($result->class, $this->skipClasses, true)) {
+             continue;
+         }
+
+         $item = array();
+
+         switch ($result->class) {
+             case 'item-ia':
+                 switch ($content) {
+                     case 'reviews':
+                         $item = $this->processReview($result);
+                         break;
+                     case 'uploads':
+                         $item = $this->processUpload($result);
+                         break;
+                 }
+                 break;
+             case 'item-ia url-item':
+                 $item = $this->processWebArchives($result);
+                 break;
+             case 'item-ia collection-ia':
+                 $item = $this->processCollection($result);
+                 break;
+         }
+
+         // An unhandled class leaves $item empty; do not emit a blank feed
+         // entry or pass an item without 'content' to processHiddenDetails().
+         if (!empty($item)) {
+             if ($content !== 'reviews') {
+                 $hiddenDetails = $this->processHiddenDetails($html, $detailsDivNumber, $item);
+                 $item = array_merge($item, $hiddenDetails);
+             }
+
+             $this->items[] = $item;
+         }
+
+         // Advanced for every non-skipped result, handled or not.
+         $detailsDivNumber++;
+     }
+ }
+
+ /**
+  * Build the account page address from the 'username' and 'content' inputs.
+  *
+  * Falls back to the parent implementation when either input is null.
+  *
+  * @return string
+  */
+ public function getURI() {
+     if (is_null($this->getInput('username')) || is_null($this->getInput('content'))) {
+         return parent::getURI();
+     }
+
+     $account = $this->processUsername();
+     $tab = $this->getInput('content');
+
+     return self::URI . '/details/' . $account . '&tab=' . $tab;
+ }
+
+ /**
+  * Build the feed name as "<content label> - <username> - Internet Archive".
+  *
+  * The label is the key of the 'content' list in PARAMETERS whose value
+  * equals the 'content' input. Falls back to the parent implementation when
+  * either input is null.
+  *
+  * @return string
+  */
+ public function getName() {
+     if (is_null($this->getInput('username')) || is_null($this->getInput('content'))) {
+         return parent::getName();
+     }
+
+     // Invert label => value so the selected value looks up its label.
+     $labelsByValue = array_flip(self::PARAMETERS['Account']['content']['values']);
+     $label = $labelsByValue[$this->getInput('content')];
+
+     return $label . ' - ' . $this->processUsername() . ' - Internet Archive';
+ }
+
+ /**
+  * Return the 'username' input, prefixed with '@' unless it already starts with one.
+  *
+  * @return string
+  */
+ private function processUsername() {
+     $username = $this->getInput('username');
+     $alreadyPrefixed = substr($username, 0, 1) === '@';
+
+     return $alreadyPrefixed ? $username : '@' . $username;
+ }
+
+ /**
+  * Build a feed item from one upload result element.
+  *
+  * @param object $result Element matched by 'div.results > div[data-id]' in collectData()
+  * @return array Item with title, timestamp, uri, content, enclosures and, when present, author
+  */
+ private function processUpload($result) {
+
+     $item = array();
+
+     $collection = $result->find('a.stealth', 0);
+     // NOTE(review): collectData() already runs defaultLinkTo() over the page;
+     // confirm these hrefs are still relative before prefixing self::URI.
+     $collectionLink = self::URI . $collection->href;
+     $collectionTitle = $collection->find('div.item-parent-ttl', 0)->plaintext;
+
+     $item['title'] = trim($result->find('div.ttl', 0)->innertext);
+     $item['timestamp'] = strtotime($result->find('div.hidden-tiles.pubdate.C.C3', 0)->children(0)->plaintext);
+     $item['uri'] = self::URI . $result->find('div.item-ttl.C.C2 > a', 0)->href;
+
+     // The author is read from the third child of the byline div, when there is one.
+     if ($result->find('div.by.C.C4', 0)->children(2)) {
+         $item['author'] = $result->find('div.by.C.C4', 0)->children(2)->plaintext;
+     }
+
+     // Heredoc body and terminator start directly after the diff marker so the
+     // terminator sits at column 0 of the PHP file.
+     $item['content'] = <<<EOD
+<p>Media Type: {$result->attr['data-mediatype']}<br>
+Collection: <a href="{$collectionLink}">{$collectionTitle}</a></p>
+EOD;
+
+     $item['enclosures'][] = self::URI . $result->find('img.item-img', 0)->source;
+
+     return $item;
+ }
+
+ /**
+  * Build a feed item from one review result element.
+  *
+  * @param object $result Element matched by 'div.results > div[data-id]' in collectData()
+  * @return array Item with title, timestamp, uri, content, enclosures and, when present, author
+  */
+ private function processReview($result) {
+
+     $item = array();
+
+     $item['title'] = trim($result->find('div.ttl', 0)->innertext);
+     $item['timestamp'] = strtotime($result->find('div.hidden-tiles.pubdate.C.C3', 0)->children(0)->plaintext);
+     $item['uri'] = $result->find('div.review-title', 0)->children(0)->href;
+
+     // The author is read from the third child of the byline div, when there is one.
+     if ($result->find('div.by.C.C4', 0)->children(2)) {
+         $item['author'] = $result->find('div.by.C.C4', 0)->children(2)->plaintext;
+     }
+
+     $subject = $result->find('div.review-title', 0)->plaintext;
+     // Second child of the review div; presumably the review body text - confirm.
+     $body = $result->find('div.hidden-lists.review', 0)->children(1)->plaintext;
+
+     // Heredoc body and terminator start directly after the diff marker so the
+     // terminator sits at column 0 of the PHP file.
+     $item['content'] = <<<EOD
+<p><strong>Subject</strong>: {$subject}</p>
+<p>{$body}</p>
+EOD;
+
+     $item['enclosures'][] = self::URI . $result->find('img.item-img', 0)->source;
+
+     return $item;
+ }
+
+ /**
+  * Build a feed item from one web archive result element.
+  *
+  * @param object $result Element matched by 'div.results > div[data-id]' in collectData()
+  * @return array Item with title, timestamp, uri, content and enclosures
+  */
+ private function processWebArchives($result) {
+
+     $item = array();
+
+     $archivedTitle = $result->find('div.ttl', 0)->plaintext;
+
+     $item['title'] = trim($archivedTitle);
+     $item['timestamp'] = strtotime($result->find('div.hidden-lists', 0)->children(0)->plaintext);
+     $item['uri'] = $result->find('div.item-ttl.C.C2 > a', 0)->href;
+
+     // Heredoc body and terminator start directly after the diff marker so the
+     // terminator sits at column 0 of the PHP file.
+     $item['content'] = <<<EOD
+<p>{$this->processUsername()} archived <a href="{$item['uri']}">{$archivedTitle}</a></p>
+EOD;
+
+     $item['enclosures'][] = $result->find('img.item-img', 0)->source;
+
+     return $item;
+ }
+
+ /**
+  * Build a feed item from one collection result element.
+  *
+  * The title is the collection name followed by the lower-cased item count
+  * text in parentheses. An enclosure is added only when the element contains
+  * an 'img.item-img'.
+  *
+  * @param object $result Element matched by 'div.results > div[data-id]' in collectData()
+  * @return array Item with title, timestamp, uri, content and, when present, enclosures
+  */
+ private function processCollection($result) {
+     $name = trim($result->find('div.collection-title.C.C2', 0)->children(0)->plaintext);
+     $count = strtolower(trim($result->find('div.num-items.topinblock', 0)->plaintext));
+     $published = $result->find('div.hidden-tiles.pubdate.C.C3', 0)->children(0)->plaintext;
+
+     $item = array(
+         'title' => $name . ' (' . $count . ')',
+         'timestamp' => strtotime($published),
+         'uri' => $result->find('div.collection-title.C.C2 > a', 0)->href,
+         'content' => '',
+     );
+
+     $thumbnail = $result->find('img.item-img', 0);
+
+     if ($thumbnail) {
+         $item['enclosures'][] = self::URI . $thumbnail->source;
+     }
+
+     return $item;
+ }
+
+ /**
+  * Add the description and categories from a hidden details div to an item.
+  *
+  * Looks up the 'div.details-ia.hidden-tiles' element at position
+  * $detailsDivNumber in the page. When it exists, the text of the first child
+  * of its 'div.C234' is prepended to the item content as a paragraph, and the
+  * remaining text of that div is split on commas into 'categories'.
+  * The item is returned unchanged when the details div is missing.
+  *
+  * @param object $html Parsed page passed in by collectData()
+  * @param int $detailsDivNumber Zero-based position of the details div to read
+  * @param array $item Item built by one of the process*() methods
+  * @return array The item, with 'content' updated and 'categories' possibly added
+  */
+ private function processHiddenDetails($html, $detailsDivNumber, $item) {
+     $detailsDiv = $html->find('div.details-ia.hidden-tiles', $detailsDivNumber);
+
+     if (!$detailsDiv) {
+         return $item;
+     }
+
+     $description = '';
+     $infoDiv = $detailsDiv->find('div.C234', 0);
+
+     if ($infoDiv) {
+         $descriptionNode = $infoDiv->children(0);
+
+         if ($descriptionNode) {
+             $description = $descriptionNode->plaintext;
+
+             // Blank the description so the div's remaining plaintext is only
+             // the text that follows it.
+             $descriptionNode->innertext = '';
+         }
+
+         $topics = trim($infoDiv->plaintext);
+
+         if ($topics !== '') {
+             // Drops the first 7 characters; presumably a "Topics:" label - confirm.
+             $topics = trim((string)substr($topics, 7));
+
+             // Trim each entry and drop empty ones so no blank category is emitted.
+             $categories = array_values(array_filter(array_map('trim', explode(',', $topics)), 'strlen'));
+
+             if (!empty($categories)) {
+                 $item['categories'] = $categories;
+             }
+         }
+     }
+
+     $existingContent = isset($item['content']) ? $item['content'] : '';
+
+     if ($description !== '') {
+         $item['content'] = '<p>' . $description . '</p>' . $existingContent;
+     } else {
+         $item['content'] = $existingContent;
+     }
+
+     return $item;
+ }
+
+ /**
+  * Build feed items from the rows of 'table.forumTable'.
+  *
+  * The first row is skipped. For each remaining row the linked post page is
+  * fetched (cached for 3600 seconds) and the inner HTML of its
+  * 'div.box.well.well-sm', minus the first h1 and h2, becomes the item
+  * content, followed by a footer of links. Stops after 10 items.
+  *
+  * @param object $html Parsed page passed in by collectData()
+  * @return array List of items with title, timestamp, uri and content
+  */
+ private function processPosts($html) {
+
+     $items = array();
+
+     foreach ($html->find('table.forumTable > tr') as $index => $tr) {
+         // Row 0 is skipped; presumably the table header - confirm.
+         if ($index === 0) {
+             continue;
+         }
+
+         $item = array();
+
+         $postDate = $tr->find('td', 4)->children(0)->plaintext;
+
+         $item['title'] = $tr->find('td', 0)->plaintext;
+         $item['timestamp'] = strtotime($postDate);
+         $item['uri'] = $tr->find('td', 0)->children(0)->href;
+
+         $forum = $tr->find('td', 2)->children(0);
+
+         // Heredoc bodies and terminators start directly after the diff marker
+         // so each terminator sits at column 0 of the PHP file.
+         $formLink = <<<EOD
+<a href="{$forum->href}">{$forum->plaintext}</a>
+EOD;
+
+         $postPageHtml = getSimpleHTMLDOMCached($item['uri'], 3600)
+             or returnServerError('Could not request: ' . $item['uri']);
+
+         $postPageHtml = defaultLinkTo($postPageHtml, $this->getURI());
+
+         $post = $postPageHtml->find('div.box.well.well-sm', 0);
+
+         $footer = array();
+
+         // NOTE(review): assumes the first anchor in the post box is the reply
+         // link - confirm against the page markup.
+         $replyAnchor = $post->find('a', 0);
+
+         if ($replyAnchor) {
+             $footer[] = <<<EOD
+<a href="{$replyAnchor->href}">Reply</a>
+EOD;
+         }
+
+         $parentAnchor = $post->find('a', 1);
+
+         // Compare the anchor text; an assignment here would always be truthy
+         // and would overwrite the anchor's text.
+         if ($parentAnchor && trim($parentAnchor->innertext) === 'See parent post') {
+             $footer[] = <<<EOD
+<a href="{$parentAnchor->href}">View parent post</a>
+EOD;
+         }
+
+         $footer[] = 'Posted in ' . $formLink . ' on ' . $postDate;
+
+         // Remove the first h1 and h2 from the post box when they exist.
+         foreach (array('h1', 'h2') as $tag) {
+             $heading = $post->find($tag, 0);
+
+             if ($heading) {
+                 $heading->outertext = '';
+             }
+         }
+
+         // Joining only the parts that exist avoids dangling ' - ' separators.
+         $footerHtml = implode(' - ', $footer);
+
+         $item['content'] = <<<EOD
+{$post->innertext}<p>{$footerHtml}</p>
+EOD;
+
+         $items[] = $item;
+
+         if (count($items) >= 10) {
+             break;
+         }
+     }
+
+     return $items;
+ }
+}