From e30698f12ff3a11931e87600d1f2d2e6954b2209 Mon Sep 17 00:00:00 2001 From: Mynacol Date: Sun, 17 Aug 2025 11:57:00 +0000 Subject: [PATCH] [GolemBridge] Add multi-page headings On multi-page articles like [1], some paragraph headers were missing because they are headers of the article pages. These headers were previously removed in c5f586497f3d23be61a6e8a5fe0f948f98a5b2f6 for being redundant with the original header. The article at [1] proves us wrong, but I added logic to ignore truly duplicate headers. [1] https://www.golem.de/news/es-muss-nicht-immer-apple-sein-fuenf-ueberzeugende-airpods-pro-alternativen-im-test-2508-195000.html --- bridges/GolemBridge.php | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/bridges/GolemBridge.php b/bridges/GolemBridge.php index a3ce82ac..48e00310 100644 --- a/bridges/GolemBridge.php +++ b/bridges/GolemBridge.php @@ -132,13 +132,22 @@ class GolemBridge extends FeedExpander // delete known bad elements foreach ( $article->find('div[id*="adtile"], #job-market, #seminars, iframe, - .gbox_affiliate, div.toc') as $bad + .gbox_affiliate, div.toc') as $bad ) { $bad->remove(); } // reload html, as remove() is buggy $article = str_get_html($article->outertext); + // Add multipage headers, but only if they are different to the article header + $firstHeader = $page->find('.table-jtoc td', 0); + if (isset($firstHeader)) { + $firstHeader = html_entity_decode($firstHeader->title); + } + $multipageHeader = $article->find('header.paged-cluster-header h1', 0); + if (isset($multipageHeader) && $multipageHeader->plaintext !== $firstHeader) { + $item .= $multipageHeader; + } $header = $article->find('header', 0); foreach ($header->find('p, figure') as $element) {