mirror of
https://github.com/RSS-Bridge/rss-bridge.git
synced 2025-04-09 16:38:50 +00:00
[GoogleSearch] fix: improve bridge (#2848)
Sort properly by date. Fix php errors. Improve the date parsing logic. Improve the content parsing by also including those with · as separator
This commit is contained in:
parent
e7aebb223d
commit
9d5f59e2db
@ -1,19 +1,12 @@
|
|||||||
<?php
|
<?php
|
||||||
/**
|
|
||||||
* Returns the 100 most recent links in results in past year, sorting by date (most recent first).
|
|
||||||
* Example:
|
|
||||||
* http://www.google.com/search?q=sebsauvage&num=100&complete=0&tbs=qdr:y,sbd:1
|
|
||||||
* complete=0&num=100 : get 100 results
|
|
||||||
* qdr:y : in past year
|
|
||||||
* sbd:1 : sort by date (will only work if qdr: is specified)
|
|
||||||
*/
|
|
||||||
class GoogleSearchBridge extends BridgeAbstract {
|
class GoogleSearchBridge extends BridgeAbstract {
|
||||||
|
|
||||||
const MAINTAINER = 'sebsauvage';
|
const MAINTAINER = 'sebsauvage';
|
||||||
const NAME = 'Google search';
|
const NAME = 'Google search';
|
||||||
const URI = 'https://www.google.com/';
|
const URI = 'https://www.google.com/';
|
||||||
const CACHE_TIMEOUT = 1800; // 30min
|
const CACHE_TIMEOUT = 1800; // 30min
|
||||||
const DESCRIPTION = 'Returns most recent results from Google search.';
|
const DESCRIPTION = 'Returns max 100 results from the past year.';
|
||||||
|
|
||||||
const PARAMETERS = array(array(
|
const PARAMETERS = array(array(
|
||||||
'q' => array(
|
'q' => array(
|
||||||
@ -24,43 +17,70 @@ class GoogleSearchBridge extends BridgeAbstract {
|
|||||||
));
|
));
|
||||||
|
|
||||||
public function collectData(){
|
public function collectData(){
|
||||||
$header = array('Accept-language: en-US');
|
$dom = getSimpleHTMLDOM($this->getURI(), ['Accept-language: en-US']);
|
||||||
$html = getSimpleHTMLDOM($this->getURI(), $header)
|
if (!$dom) {
|
||||||
or returnServerError('No results for this query.');
|
returnServerError('No results for this query.');
|
||||||
|
|
||||||
$emIsRes = $html->find('div[id=res]', 0);
|
|
||||||
|
|
||||||
if(!is_null($emIsRes)) {
|
|
||||||
foreach($emIsRes->find('div[class~=g]') as $element) {
|
|
||||||
$item = array();
|
|
||||||
|
|
||||||
$t = $element->find('a[href]', 0)->href;
|
|
||||||
$item['uri'] = htmlspecialchars_decode($t);
|
|
||||||
$item['title'] = $element->find('h3', 0)->plaintext;
|
|
||||||
$resultComponents = explode(' — ', $element->find('div[data-content-feature=1]', 0)->plaintext);
|
|
||||||
$item['content'] = $resultComponents[1];
|
|
||||||
|
|
||||||
if(strpos($resultComponents[0], 'day') === true) {
|
|
||||||
$daysago = explode(' ', $resultComponents[0])[0];
|
|
||||||
$item['timestamp'] = date('d M Y', strtotime('-' . $daysago . ' days'));
|
|
||||||
} else {
|
|
||||||
$item['timestamp'] = $resultComponents[0];
|
|
||||||
}
|
|
||||||
|
|
||||||
$this->items[] = $item;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
$result = $dom->find('div[id=res]', 0);
|
||||||
|
|
||||||
|
if(!$result) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
foreach ($result->find('div[class~=g]') as $element) {
|
||||||
|
$item = [];
|
||||||
|
|
||||||
|
$url = $element->find('a[href]', 0)->href;
|
||||||
|
$item['uri'] = htmlspecialchars_decode($url);
|
||||||
|
$item['title'] = $element->find('h3', 0)->plaintext;
|
||||||
|
|
||||||
|
$resultDom = $element->find('div[data-content-feature=1]', 0);
|
||||||
|
if ($resultDom) {
|
||||||
|
// Split by — or ·
|
||||||
|
$resultParts = preg_split('/( — | · )/', $resultDom->plaintext);
|
||||||
|
$resultDate = trim($resultParts[0]);
|
||||||
|
$resultContent = trim($resultParts[1] ?? '');
|
||||||
|
} else {
|
||||||
|
// Some search results don't have this particular dom identifier
|
||||||
|
$resultDate = null;
|
||||||
|
$resultContent = null;
|
||||||
|
}
|
||||||
|
|
||||||
|
if ($resultDate) {
|
||||||
|
try {
|
||||||
|
$createdAt = new \DateTime($resultDate);
|
||||||
|
// Set to midnight for consistent datetime
|
||||||
|
$createdAt->setTime(0, 0);
|
||||||
|
$item['timestamp'] = $createdAt->format('U');
|
||||||
|
} catch (\Exception $e) {
|
||||||
|
$item['timestamp'] = 0;
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
$item['timestamp'] = 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
if ($resultContent) {
|
||||||
|
$item['content'] = $resultContent;
|
||||||
|
}
|
||||||
|
|
||||||
|
$this->items[] = $item;
|
||||||
|
}
|
||||||
|
// Sort by descending date
|
||||||
usort($this->items, function($a, $b) {
|
usort($this->items, function($a, $b) {
|
||||||
return $a['timestamp'] < $b['timestamp'];
|
return $b['timestamp'] <=> $a['timestamp'];
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
public function getURI() {
|
public function getURI() {
|
||||||
if (!is_null($this->getInput('q'))) {
|
if ($this->getInput('q')) {
|
||||||
return self::URI
|
$queryParameters = [
|
||||||
. 'search?q='
|
'q' => $this->getInput('q'),
|
||||||
. urlencode($this->getInput('q'))
|
'hl' => 'en',
|
||||||
. '&hl=en&num=100&complete=0&tbs=qdr:y,sbd:1';
|
'num' => '100', // get 100 results
|
||||||
|
'complete' => '0',
|
||||||
|
'tbs' => 'qdr:y,sbd:1' // in past year, sort by date
|
||||||
|
];
|
||||||
|
return sprintf('https://www.google.com/search?%s', http_build_query($queryParameters));
|
||||||
}
|
}
|
||||||
|
|
||||||
return parent::getURI();
|
return parent::getURI();
|
||||||
|
Loading…
Reference in New Issue
Block a user