mirror of
https://github.com/RSS-Bridge/rss-bridge.git
synced 2025-04-05 17:19:37 +00:00
[CssSelector/Sitemap] Minor fixes (#3719)
- Apply title_cleanup to title from metadata (#3717)
- Metadata: Fix ld+json object/array confusion
- Sitemap: Also try the /sitemap.xml well-known URL
This commit is contained in:
parent
2172df9fa2
commit
3557e5ffd4
@ -91,7 +91,7 @@ class CssSelectorBridge extends BridgeAbstract
|
|||||||
$limit = $this->getInput('limit') ?? 10;
|
$limit = $this->getInput('limit') ?? 10;
|
||||||
|
|
||||||
$html = defaultLinkTo(getSimpleHTMLDOM($url), $url);
|
$html = defaultLinkTo(getSimpleHTMLDOM($url), $url);
|
||||||
$this->feedName = $this->getPageTitle($html, $title_cleanup);
|
$this->feedName = $this->titleCleanup($this->getPageTitle($html), $title_cleanup);
|
||||||
$items = $this->htmlFindEntries($html, $url_selector, $url_pattern, $limit, $content_cleanup);
|
$items = $this->htmlFindEntries($html, $url_selector, $url_pattern, $limit, $content_cleanup);
|
||||||
|
|
||||||
if (empty($content_selector)) {
|
if (empty($content_selector)) {
|
||||||
@ -139,17 +139,27 @@ class CssSelectorBridge extends BridgeAbstract
|
|||||||
/**
|
/**
|
||||||
* Retrieve title from webpage URL or DOM
|
* Retrieve title from webpage URL or DOM
|
||||||
* @param string|object $page URL or DOM to retrieve title from
|
* @param string|object $page URL or DOM to retrieve title from
|
||||||
* @param string $title_cleanup optional string to remove from webpage title, e.g. " | BlogName"
|
|
||||||
* @return string Webpage title
|
* @return string Webpage title
|
||||||
*/
|
*/
|
||||||
protected function getPageTitle($page, $title_cleanup = null)
|
protected function getPageTitle($page)
|
||||||
{
|
{
|
||||||
if (is_string($page)) {
|
if (is_string($page)) {
|
||||||
$page = getSimpleHTMLDOMCached($page);
|
$page = getSimpleHTMLDOMCached($page);
|
||||||
}
|
}
|
||||||
$title = html_entity_decode($page->find('title', 0)->plaintext);
|
$title = html_entity_decode($page->find('title', 0)->plaintext);
|
||||||
if (!empty($title)) {
|
return $title;
|
||||||
$title = trim(str_replace($title_cleanup, '', $title));
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Clean Article title. Remove constant part that appears in every title such as blog name.
|
||||||
|
* @param string $title Title to clean, e.g. "Article Name | BlogName"
|
||||||
|
* @param string $title_cleanup string to remove from webpage title, e.g. " | BlogName"
|
||||||
|
* @return string Cleaned Title
|
||||||
|
*/
|
||||||
|
protected function titleCleanup($title, $title_cleanup)
|
||||||
|
{
|
||||||
|
if (!empty($title) && !empty($title_cleanup)) {
|
||||||
|
return trim(str_replace($title_cleanup, '', $title));
|
||||||
}
|
}
|
||||||
return $title;
|
return $title;
|
||||||
}
|
}
|
||||||
@ -270,6 +280,8 @@ class CssSelectorBridge extends BridgeAbstract
|
|||||||
$item['title'] = $article_title;
|
$item['title'] = $article_title;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
$item['title'] = $this->titleCleanup($item['title'], $title_cleanup);
|
||||||
|
|
||||||
$article_content = $entry_html->find($content_selector);
|
$article_content = $entry_html->find($content_selector);
|
||||||
|
|
||||||
if (!empty($article_content)) {
|
if (!empty($article_content)) {
|
||||||
@ -484,7 +496,7 @@ class CssSelectorBridge extends BridgeAbstract
|
|||||||
// Now we can check for desired field in JSON and populate $item accordingly
|
// Now we can check for desired field in JSON and populate $item accordingly
|
||||||
if (isset($json_root[$field])) {
|
if (isset($json_root[$field])) {
|
||||||
$field_value = $json_root[$field];
|
$field_value = $json_root[$field];
|
||||||
if (is_array($field_value)) {
|
if (is_array($field_value) && isset($field_value[0])) {
|
||||||
$field_value = $field_value[0]; // Different versions of the same enclosure? Take the first one
|
$field_value = $field_value[0]; // Different versions of the same enclosure? Take the first one
|
||||||
}
|
}
|
||||||
if (is_string($field_value) && !empty($field_value)) {
|
if (is_string($field_value) && !empty($field_value)) {
|
||||||
|
@ -73,7 +73,7 @@ class SitemapBridge extends CssSelectorBridge
|
|||||||
$discard_thumbnail = $this->getInput('discard_thumbnail');
|
$discard_thumbnail = $this->getInput('discard_thumbnail');
|
||||||
$limit = $this->getInput('limit');
|
$limit = $this->getInput('limit');
|
||||||
|
|
||||||
$this->feedName = $this->getPageTitle($url, $title_cleanup);
|
$this->feedName = $this->titleCleanup($this->getPageTitle($url), $title_cleanup);
|
||||||
$sitemap_url = empty($site_map) ? $url : $site_map;
|
$sitemap_url = empty($site_map) ? $url : $site_map;
|
||||||
$sitemap_xml = $this->getSitemapXml($sitemap_url, !empty($site_map));
|
$sitemap_xml = $this->getSitemapXml($sitemap_url, !empty($site_map));
|
||||||
$links = $this->sitemapXmlToList($sitemap_xml, $url_pattern, empty($limit) ? 10 : $limit);
|
$links = $this->sitemapXmlToList($sitemap_xml, $url_pattern, empty($limit) ? 10 : $limit);
|
||||||
@ -103,7 +103,13 @@ class SitemapBridge extends CssSelectorBridge
|
|||||||
$robots_txt = getSimpleHTMLDOM(urljoin($url, '/robots.txt'))->outertext;
|
$robots_txt = getSimpleHTMLDOM(urljoin($url, '/robots.txt'))->outertext;
|
||||||
preg_match('/Sitemap: ([^ ]+)/', $robots_txt, $matches);
|
preg_match('/Sitemap: ([^ ]+)/', $robots_txt, $matches);
|
||||||
if (empty($matches)) {
|
if (empty($matches)) {
|
||||||
returnClientError('Failed to determine Sitemap from robots.txt. Try setting it manually.');
|
$sitemap = getSimpleHTMLDOM(urljoin($url, '/sitemap.xml'));
|
||||||
|
if (!empty($sitemap->find('urlset, sitemap'))) {
|
||||||
|
$url = urljoin($url, '/sitemap.xml');
|
||||||
|
return $sitemap;
|
||||||
|
} else {
|
||||||
|
returnClientError('Failed to locate Sitemap from /robots.txt or /sitemap.xml. Try setting it manually.');
|
||||||
|
}
|
||||||
}
|
}
|
||||||
$url = $matches[1];
|
$url = $matches[1];
|
||||||
}
|
}
|
||||||
|
Loading…
Reference in New Issue
Block a user