[FuturaSciences] Improve content extraction (#2317)

- Fix removal of the tracking fragment from item URLs (it is now URL-encoded as `#xtor%3DRSS-8`)
- Fix images broken by the new lazy-loading mechanism (copy `data-src` into `src`)
- Remove the headline; articles no longer have one
- Improve article cleanup (unwrap tooltip links and remove unwanted divs by class, keeping `didyouknow` blocks)
This commit is contained in:
ORelio 2021-10-29 22:24:19 +02:00 committed by GitHub
parent 970bdd45f9
commit 547829f971
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

View File

@ -85,7 +85,7 @@ class FuturaSciencesBridge extends FeedExpander {
protected function parseItem($newsItem){ protected function parseItem($newsItem){
$item = parent::parseItem($newsItem); $item = parent::parseItem($newsItem);
$item['uri'] = str_replace('#xtor=RSS-8', '', $item['uri']); $item['uri'] = str_replace('#xtor%3DRSS-8', '', $item['uri']);
$article = getSimpleHTMLDOMCached($item['uri']) $article = getSimpleHTMLDOMCached($item['uri'])
or returnServerError('Could not request Futura-Sciences: ' . $item['uri']); or returnServerError('Could not request Futura-Sciences: ' . $item['uri']);
$item['content'] = $this->extractArticleContent($article); $item['content'] = $this->extractArticleContent($article);
@ -96,30 +96,46 @@ class FuturaSciencesBridge extends FeedExpander {
} }
private function extractArticleContent($article){ private function extractArticleContent($article){
$contents = $article->find('section.article-text', 1)->innertext; $contents = $article->find('section.article-text', 1);
$headline = trim($article->find('p.description', 0)->plaintext);
if(!empty($headline)) foreach($contents->find('img') as $img) {
$headline = '<p><b>' . $headline . '</b></p>'; if(!empty($img->getAttribute('data-src'))) {
$img->src = $img->getAttribute('data-src');
}
}
foreach($contents->find('a.tooltip-link') as $a) {
$a->outertext = $a->plaintext;
}
foreach(array( foreach(array(
'<div class="clear', 'clear',
'<div class="sharebar2', 'sharebar2',
'<div class="diaporamafullscreen"', 'diaporamafullscreen',
'<div class="module social-button', 'module.social-button',
'<div class="module social-share', 'module.social-share',
'<div style="margin-bottom:10px;" class="noprint"', 'ficheprevnext',
'<div class="ficheprevnext', 'addthis_toolbox',
'<div class="bar noprint', 'noprint',
'<div class="toolbar noprint', 'hubbottom',
'<div class="addthis_toolbox', 'hubbottom2'
'<div class="noprint', ) as $div_class_remove) {
'<div class="bg bglight border border-full noprint', foreach($contents->find('div.' . $div_class_remove) as $div) {
'<div class="httplogbar-wrapper noprint', $keep_div = false;
'<div id="forumcomments', foreach(array(
'<div ng-if="active"' 'didyouknow'
) as $div_start) { ) as $div_class_dont_remove) {
$contents = stripRecursiveHTMLSection($contents, 'div', $div_start); if(strpos($div->getAttribute('class'), $div_class_dont_remove) !== false) {
$keep_div = true;
} }
}
if(!$keep_div) {
$div->outertext = '';
}
}
}
$contents = $contents->innertext;
$contents = stripWithDelimiters($contents, '<hr ', '/>'); $contents = stripWithDelimiters($contents, '<hr ', '/>');
$contents = stripWithDelimiters($contents, '<p class="content-date', '</p>'); $contents = stripWithDelimiters($contents, '<p class="content-date', '</p>');
@ -131,7 +147,7 @@ class FuturaSciencesBridge extends FeedExpander {
$contents = stripWithDelimiters($contents, '<script ', '</script>'); $contents = stripWithDelimiters($contents, '<script ', '</script>');
$contents = stripWithDelimiters($contents, '<script>', '</script>'); $contents = stripWithDelimiters($contents, '<script>', '</script>');
return $headline . trim($contents); return trim($contents);
} }
// Extracts the author from an article or element // Extracts the author from an article or element