fix(FeedParser): scrape out content from rss content:encoded (#4178)

* fix(FeedParser): parse content module from rss2

* refactor
This commit is contained in:
Dag 2024-07-31 19:04:07 +02:00 committed by GitHub
parent e55e9b8fac
commit b8a9f34527
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
5 changed files with 39 additions and 20 deletions

View File

@@ -418,7 +418,16 @@ See `formats/PlaintextFormat.php` for an example.
These commands require that you have installed the dev dependencies in `composer.json`.
Run all tests:
./vendor/bin/phpunit
Run a single test class:
./vendor/bin/phpunit --filter UrlTest
Run linter:
./vendor/bin/phpcs --standard=phpcs.xml --warning-severity=0 --extensions=php -p ./
https://github.com/squizlabs/PHP_CodeSniffer/wiki

View File

@@ -112,15 +112,6 @@ class DisplayAction implements ActionInterface
$input = array_diff_key($requestArray, array_fill_keys($remove, ''));
$bridge->setInput($input);
$bridge->collectData();
- $items = $bridge->getItems();
- if (isset($items[0]) && is_array($items[0])) {
- $feedItems = [];
- foreach ($items as $item) {
- $feedItems[] = FeedItem::fromArray($item);
- }
- $items = $feedItems;
- }
- $feed = $bridge->getFeed();
} catch (\Exception $e) {
// Probably an exception inside a bridge
if ($e instanceof HttpException) {
@@ -154,6 +145,16 @@ class DisplayAction implements ActionInterface
}
}
+ $items = $bridge->getItems();
+ if (isset($items[0]) && is_array($items[0])) {
+ $feedItems = [];
+ foreach ($items as $item) {
+ $feedItems[] = FeedItem::fromArray($item);
+ }
+ $items = $feedItems;
+ }
+ $feed = $bridge->getFeed();
$formatFactory = new FormatFactory();
$format = $formatFactory->create($format);

View File

@@ -186,21 +186,26 @@ class FeedItem
}
/**
- * @param string|object $content The item content as text or simple_html_dom object.
+ * @param string|array|\simple_html_dom|\simple_html_dom_node $content The item content
*/
public function setContent($content)
{
$this->content = null;
if (
$content instanceof simple_html_dom
|| $content instanceof simple_html_dom_node
) {
$content = (string) $content;
} elseif (is_array($content)) {
// Assuming this is the rss2.0 content module
$content = $content['encoded'] ?? '';
}
if (is_string($content)) {
$this->content = $content;
} else {
- Debug::log(sprintf('Feed content must be a string but got %s', gettype($content)));
+ Debug::log(sprintf('Unable to convert feed content to string: %s', gettype($content)));
}
}

View File

@@ -167,8 +167,9 @@ final class FeedParser
if (isset($namespaces['media'])) {
$media = $feedItem->children($namespaces['media']);
}
foreach ($namespaces as $namespaceName => $namespaceUrl) {
- if (in_array($namespaceName, ['', 'content', 'media'])) {
+ if (in_array($namespaceName, ['', 'media'])) {
continue;
}
$item[$namespaceName] = $this->parseModule($feedItem, $namespaceName, $namespaceUrl);

View File

@@ -8,6 +8,13 @@ use PHPUnit\Framework\TestCase;
class FeedParserTest extends TestCase
{
private \FeedParser $sut;
public function setUp(): void
{
$this->sut = new \FeedParser();
}
public function testRss1()
{
$xml = <<<XML
@@ -37,8 +44,7 @@ class FeedParserTest extends TestCase
</rdf:RDF>
XML;
- $sut = new \FeedParser();
- $feed = $sut->parseFeed($xml);
+ $feed = $this->sut->parseFeed($xml);
$this->assertSame('hello feed', $feed['title']);
$this->assertSame('http://meerkat.oreillynet.com', $feed['uri']);
@@ -74,8 +80,7 @@ class FeedParserTest extends TestCase
</rss>
XML;
- $sut = new \FeedParser();
- $feed = $sut->parseFeed($xml);
+ $feed = $this->sut->parseFeed($xml);
$this->assertSame('hello feed', $feed['title']);
$this->assertSame('https://example.com/', $feed['uri']);
@@ -111,8 +116,7 @@ class FeedParserTest extends TestCase
</feed>
XML;
- $sut = new \FeedParser();
- $feed = $sut->parseFeed($xml);
+ $feed = $this->sut->parseFeed($xml);
$this->assertSame('hello feed', $feed['title']);
$this->assertSame('https://example.com/1', $feed['uri']);
@@ -151,8 +155,7 @@ class FeedParserTest extends TestCase
</rss>
XML;
- $sut = new \FeedParser();
- $feed = $sut->parseFeed($xml);
+ $feed = $this->sut->parseFeed($xml);
$expected = [
'title' => '',
'uri' => '',