From c7b5a3f5b3021314e89fa18364cfaa6c70cc7eb6 Mon Sep 17 00:00:00 2001
From: ORelio
Date: Thu, 20 Mar 2025 15:11:32 +0100
Subject: [PATCH] FeedExpander: Remove trailing content in XML

- Move preprocessing code into overridable prepareXml()
- Auto-remove trailing data after root xml node
---
 lib/FeedExpander.php | 45 +++++++++++++++++++++++++++++++++++++--------
 1 file changed, 37 insertions(+), 8 deletions(-)

diff --git a/lib/FeedExpander.php b/lib/FeedExpander.php
index ef001af1..f4fc0891 100644
--- a/lib/FeedExpander.php
+++ b/lib/FeedExpander.php
@@ -22,14 +22,7 @@ abstract class FeedExpander extends BridgeAbstract
         if ($xmlString === '') {
             throw new \Exception(sprintf('Unable to parse xml from `%s` because we got the empty string', $url), 10);
         }
-        // prepare/massage the xml to make it more acceptable
-        $problematicStrings = [
-            '&nbsp;',
-            '&raquo;',
-            '&rsquo;',
-        ];
-        $xmlString = str_replace($problematicStrings, '', $xmlString);
-
+        $xmlString = $this->prepareXml($xmlString);
         $feedParser = new FeedParser();
         try {
             $this->feed = $feedParser->parseFeed($xmlString);
@@ -59,6 +52,42 @@ abstract class FeedExpander extends BridgeAbstract
         return $item;
     }
 
+    /**
+     * Prepare XML document to make it more acceptable by the parser
+     * This method can be overridden by bridges to change this behavior
+     *
+     * @return string
+     */
+    protected function prepareXml($xmlString)
+    {
+        // Remove problematic escape sequences
+        $problematicStrings = [
+            '&nbsp;',
+            '&raquo;',
+            '&rsquo;',
+        ];
+        $xmlString = str_replace($problematicStrings, '', $xmlString);
+
+        // Remove extra content at the end of the document, if any.
+        // First retrieve tag name of root node, which is the first node following <?xml prolog,
+        // Then find the last matching </node> in xml string and strip anything beyond that.
+        if (preg_match('/(?:<\?xml[^>]*\?>[^<]*<)([^ "\'>]+)/i', $xmlString, $matches) === 1)
+        {
+            $root_node_tag = $matches[1];
+            $last_closing_occurrence = strripos($xmlString, '</' . $root_node_tag);
+            if ($last_closing_occurrence !== false)
+            {
+                $closing_node_end = strpos($xmlString, '>', $last_closing_occurrence);
+                if ($closing_node_end !== false)
+                {
+                    $xmlString = substr($xmlString, 0, $closing_node_end + 1);
+                }
+            }
+        }
+
+        return $xmlString;
+    }
+
     public function getURI()
     {
         return $this->feed['uri'] ?? parent::getURI();