mirror of
https://github.com/RSS-Bridge/rss-bridge.git
synced 2025-04-16 11:50:55 +00:00
fix(reuters): tweak, try to avoid antibot (#4515)
This commit is contained in:
parent
d6a9da1cc8
commit
95af1ffddf
@ -5,7 +5,7 @@ class ReutersBridge extends BridgeAbstract
|
|||||||
const MAINTAINER = 'hollowleviathan, spraynard, csisoap';
|
const MAINTAINER = 'hollowleviathan, spraynard, csisoap';
|
||||||
const NAME = 'Reuters Bridge';
|
const NAME = 'Reuters Bridge';
|
||||||
const URI = 'https://www.reuters.com';
|
const URI = 'https://www.reuters.com';
|
||||||
const CACHE_TIMEOUT = 1800; // 30min
|
const CACHE_TIMEOUT = 3600; // 1h
|
||||||
const DESCRIPTION = 'Returns news from Reuters';
|
const DESCRIPTION = 'Returns news from Reuters';
|
||||||
|
|
||||||
private $feedName = self::NAME;
|
private $feedName = self::NAME;
|
||||||
@ -142,6 +142,98 @@ class ReutersBridge extends BridgeAbstract
|
|||||||
'wire'
|
'wire'
|
||||||
];
|
];
|
||||||
|
|
||||||
|
/**
 * Requests the configured Reuters section and adds its newest stories to the feed.
 *
 * The decoded payload is read in one of two shapes, selected by $this->useWireAPI:
 * the Wire shape ('wireitems' / 'wire_name', run through processData()) or the
 * regular shape ('result' / 'arcResult'). Stories are ordered newest first by
 * 'published_time', limited to 20 and passed to addStories() one by one.
 */
public function collectData()
{
    $endpoint = $this->getSectionEndpoint();
    $url = $this->getAPIURL($endpoint, 'section');
    $json = getContents($url);
    $data = Json::decode($json);

    $stories = [];
    $section_name = '';
    if ($this->useWireAPI) {
        $reuters_wireitems = $data['wireitems'];
        $section_name = $data['wire_name'];
        $processedData = $this->processData($reuters_wireitems);

        // Merge all articles from the Editor's Highlight section into the existing array of templates.
        // reset() yields false for an empty list, so make sure there is an entry before reading 'type'.
        $top_section = reset($processedData);
        if (is_array($top_section) && ($top_section['type'] ?? null) === 'headlines') {
            $top_section = array_shift($processedData);
            $articles = $top_section['headlines'];
            $processedData = array_merge($articles, $processedData);
        }
        $stories = $processedData;
    } else {
        $section_name = $data['result']['section']['name'];
        // Prefer 'arcResult' when present; fall back to an empty list so the sort below always gets an array.
        $stories = $data['arcResult']['articles'] ?? $data['result']['articles'] ?? [];
    }
    $this->feedName = $section_name . ' | Reuters';

    // Newest first. Stories without a 'published_time' compare as '' and therefore sort last.
    usort($stories, static function ($story1, $story2) {
        return ($story2['published_time'] ?? '') <=> ($story1['published_time'] ?? '');
    });

    // Only the 20 most recent stories are emitted.
    $stories = array_slice($stories, 0, 20);

    foreach ($stories as $story) {
        $author = '';
        $category = [];
        // Not every story shape is guaranteed to carry these keys; default to empty values.
        $content = $story['description'] ?? '';
        $timestamp = $story['published_time'] ?? '';
        $source_type = '';

        if ($this->useWireAPI) {
            $article_uri = $story['template_action']['api_path'];
            $title = $story['story']['hed'];
            $url = $story['template_action']['url'];
        } else {
            $url = self::URI . $story['canonical_url'];
            $title = $story['title'];
            $article_uri = $story['canonical_url'];
            $source_type = $story['source']['name'] ?? '';
        }

        // Some articles cause unexpected behaviour, such as a redirect to another site instead of the API.
        // The source type is checked to avoid this; only the regular API provides it, the Wire API does not.
        if (!$this->useWireAPI && $source_type !== 'Package') {
            $author = $this->handleAuthorName($story['authors'] ?? []);
            if (isset($story['thumbnail'])) {
                $content .= $this->handleImage([$story['thumbnail']]);
            }
            if (isset($story['primary_section']['name'])) {
                $category = [$story['primary_section']['name']];
            }
        } else {
            // Content and timestamp stay as taken from the section listing; only the
            // author and category are read from the article detail.
            $content_detail = $this->getArticle($article_uri);
            $author = $content_detail['author'];

            // getArticle() may hand back '' for 'category'; keep $category a list in every case.
            // NOTE(review): addStories() presumably expects a list of category names - confirm.
            $detail_category = $content_detail['category'];
            if (is_array($detail_category)) {
                $category = $detail_category;
            } elseif ($detail_category !== '' && $detail_category !== null) {
                $category = [$detail_category];
            }
        }

        $this->addStories($title, $content, $timestamp, $author, $url, $category);
    }
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Takes in data from Reuters Wire API and
|
* Takes in data from Reuters Wire API and
|
||||||
* creates structured data in the form of a list
|
* creates structured data in the form of a list
|
||||||
@ -280,6 +372,14 @@ class ReutersBridge extends BridgeAbstract
|
|||||||
|
|
||||||
private function getArticle($feed_uri, $is_article_uid = false)
|
private function getArticle($feed_uri, $is_article_uid = false)
|
||||||
{
|
{
|
||||||
|
// Temporary fix: return an empty article without requesting it, to try to avoid Reuters' anti-bot protection.
|
||||||
|
return [
|
||||||
|
'content' => '',
|
||||||
|
'author' => '',
|
||||||
|
'category' => '',
|
||||||
|
'images' => '',
|
||||||
|
'published_at' => ''
|
||||||
|
];
|
||||||
// This will make another request to API to get full detail of article and author's name.
|
// This will make another request to API to get full detail of article and author's name.
|
||||||
$url = $this->getAPIURL($feed_uri, 'article', $is_article_uid);
|
$url = $this->getAPIURL($feed_uri, 'article', $is_article_uid);
|
||||||
|
|
||||||
@ -493,90 +593,4 @@ EOD;
|
|||||||
{
|
{
|
||||||
return $this->feedName;
|
return $this->feedName;
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
 * Loads the configured Reuters section and turns every story in it into a feed item.
 *
 * Depending on $this->useWireAPI the payload is read in Wire shape ('wireitems',
 * 'wire_name') or in the regular shape ('result', 'arcResult'). Regular stories whose
 * source is not 'Package' are built from the section listing itself; every other
 * story is completed through getArticle().
 */
public function collectData()
{
    $sectionUrl = $this->getAPIURL($this->getSectionEndpoint(), 'section');
    $data = Json::decode(getContents($sectionUrl));

    if ($this->useWireAPI) {
        $wireItems = $data['wireitems'];
        $sectionTitle = $data['wire_name'];
        $stories = $this->processData($wireItems);

        // A leading 'headlines' template is unpacked so its articles sit in front of the remaining templates.
        $firstTemplate = reset($stories);
        if ($firstTemplate['type'] == 'headlines') {
            $headlineTemplate = array_shift($stories);
            $stories = array_merge($headlineTemplate['headlines'], $stories);
        }
    } else {
        $sectionTitle = $data['result']['section']['name'];
        $stories = isset($data['arcResult']['articles'])
            ? $data['arcResult']['articles']
            : $data['result']['articles'];
    }
    $this->feedName = $sectionTitle . ' | Reuters';

    foreach ($stories as $entry) {
        $sourceName = '';

        if ($this->useWireAPI) {
            $storyId = $entry['story']['usn'];
            $articlePath = $entry['template_action']['api_path'];
            $headline = $entry['story']['hed'];
            $link = $entry['template_action']['url'];
        } else {
            $storyId = $entry['id'];
            $link = self::URI . $entry['canonical_url'];
            $headline = $entry['title'];
            $articlePath = $entry['canonical_url'];
            $sourceName = $entry['source']['name'];
        }

        // Wire stories and 'Package' stories are resolved through a separate article lookup;
        // the source name is only available on the regular API.
        $needsArticleLookup = $this->useWireAPI || $sourceName == 'Package';

        if ($needsArticleLookup) {
            $detail = $this->getArticle($articlePath);
            $body = defaultLinkTo($detail['content'], $this->getURI());

            $writer = $detail['author'];
            $pictures = $detail['images'];
            $tags = $detail['category'];
            $text = $body . ' ' . $pictures;
            $publishedAt = $detail['published_at'];
        } else {
            $writer = $this->handleAuthorName($entry['authors'] ?? []);
            $publishedAt = $entry['published_time'];
            $thumbnailHtml = isset($entry['thumbnail'])
                ? $this->handleImage([$entry['thumbnail']])
                : '';
            $text = $entry['description'] . $thumbnailHtml;
            $tags = isset($entry['primary_section']['name'])
                ? [$entry['primary_section']['name']]
                : [];
        }

        $this->addStories($headline, $text, $publishedAt, $writer, $link, $tags);
    }
}
|
|
||||||
}
|
}
|
||||||
|
Loading…
Reference in New Issue
Block a user