diff --git a/bridges/CssSelectorBridge.php b/bridges/CssSelectorBridge.php
index 5b2625a9..ac3261bf 100644
--- a/bridges/CssSelectorBridge.php
+++ b/bridges/CssSelectorBridge.php
@@ -276,7 +276,7 @@ class CssSelectorBridge extends BridgeAbstract
}
$entry_html = getSimpleHTMLDOMCached($entry_url);
- $item = $this->entryHtmlRetrieveMetadata($entry_html);
+ $item = html_find_seo_metadata($entry_html);
if (empty($item['uri'])) {
$item['uri'] = $entry_url;
@@ -306,253 +306,4 @@ class CssSelectorBridge extends BridgeAbstract
return $item;
}
-
- /**
- * Retrieve metadata from entry HTML: title, author, date published, etc. from metadata intended for social media embeds and SEO
- * @param obj $entry_html DOM object representing the webpage HTML
- * @return array Entry data collected from Metadata
- */
- protected function entryHtmlRetrieveMetadata($entry_html)
- {
- $item = [];
-
- // == First source of metadata: Meta tags ==
- // Facebook Open Graph (og:KEY) - https://developers.facebook.com/docs/sharing/webmasters
- // Twitter (twitter:KEY) - https://developer.twitter.com/en/docs/twitter-for-websites/cards/guides/getting-started
- // Standard meta tags - https://www.w3schools.com/tags/tag_meta.asp
- // Standard time tag - https://developer.mozilla.org/en-US/docs/Web/HTML/Element/time
-
- // Each Entry field mapping defines a list of possible tags names that contains the expected value
- static $meta_mappings = [
- //
- //
- //
- //
- //
- //
- 'uri' => [
- 'og:url',
- 'twitter:url',
- 'canonical',
- ],
- 'title' => [
- 'og:title',
- 'twitter:title',
- ],
- 'content' => [
- 'og:description',
- 'twitter:description',
- 'description',
- ],
- 'timestamp' => [
- 'article:published_time',
- 'og:article:published_time',
- 'releaseDate',
- 'releasedate',
- 'article:modified_time',
- 'og:article:modified_time',
- 'lastModified',
- 'lastmodified',
- 'time',
- ],
- 'enclosures' => [
- 'og:image:secure_url',
- 'og:image:url',
- 'og:image',
- 'twitter:image',
- 'thumbnailImg',
- 'thumbnailimg',
- ],
- 'author' => [
- 'article:author',
- 'og:article:author',
- 'author',
- 'article:author:username',
- 'profile:first_name',
- 'profile:last_name',
- 'article:author:first_name',
- 'article:author:last_name',
- 'twitter:creator',
- ],
- ];
-
- $author_first_name = null;
- $author_last_name = null;
-
- // For each Entry property, look for corresponding HTML tags using a list of candidates
- foreach ($meta_mappings as $property => $field_list) {
- foreach ($field_list as $field) {
- // Look for HTML meta tag
- $element = null;
- if ($field === 'canonical') {
- $element = $entry_html->find('link[rel=canonical]');
- } else if ($field === 'time') {
- $element = $entry_html->find('time[datetime]');
- } else {
- $element = $entry_html->find("meta[property=$field], meta[name=$field]");
- }
- // Found something? Extract the value and populate Entry field
- if (!empty($element)) {
- $element = $element[0];
- $field_value = '';
- if ($field === 'canonical') {
- $field_value = $element->href;
- } else if ($field === 'time') {
- $field_value = $element->datetime;
- } else {
- $field_value = $element->content;
- }
- if (!empty($field_value)) {
- if ($field === 'article:author:first_name' || $field === 'profile:first_name') {
- $author_first_name = $field_value;
- } else if ($field === 'article:author:last_name' || $field === 'profile:last_name') {
- $author_last_name = $field_value;
- } else {
- $item[$property] = $field_value;
- break; // Stop on first match, e.g. og:url has priority over canonical url.
- }
- }
- }
- }
- }
-
- // Populate author from first name and last name if all we have is nothing or Twitter @username
- if ((!isset($item['author']) || $item['author'][0] === '@') && (is_string($author_first_name) || is_string($author_last_name))) {
- $author = '';
- if (is_string($author_first_name)) {
- $author = $author_first_name;
- }
- if (is_string($author_last_name)) {
- $author = $author . ' ' . $author_last_name;
- }
- $item['author'] = trim($author);
- }
-
- // == Second source of metadata: Embedded JSON ==
- // JSON linked data - https://www.w3.org/TR/2014/REC-json-ld-20140116/
- // JSON linked data is COMPLEX and MAY BE LESS RELIABLE than tags. Used for fields not found as tags.
- // The implementation below will load all ld+json we can understand and attempt to extract relevant information.
-
- // ld+json object types that hold article metadata
- // Each mapping define item fields and a list of possible JSON field for this field
- // Each candiate JSON field is either a string (field name) or a list (path to nested field)
- static $ldjson_article_types = ['webpage', 'article', 'newsarticle', 'blogposting'];
- static $ldjson_article_mappings = [
- 'uri' => ['url', 'mainEntityOfPage'],
- 'title' => ['headline'],
- 'content' => ['description'],
- 'timestamp' => ['dateModified', 'datePublished'],
- 'enclosures' => ['image'],
- 'author' => [['author', 'name'], ['author', '@id'], 'author'],
- ];
-
- // ld+json object types that hold author metadata
- $ldjson_author_types = ['person', 'organization'];
- $ldjson_author_mappings = []; // ID => Name
- $ldjson_author_id = null;
-
- // Utility function for checking if JSON array matches one of the desired ld+json object types
- // A JSON object may have a single ld+json @type as a string OR several types at once as a list
- $ldjson_is_of_type = function ($json, $allowed_types) {
- if (isset($json['@type'])) {
- $json_types = $json['@type'];
- if (!is_array($json_types)) {
- $json_types = [ $json_types ];
- }
- foreach ($json_types as $item_type) {
- if (in_array(strtolower($item_type), $allowed_types)) {
- return true;
- }
- }
- }
- return false;
- };
-
- // Process ld+json objects embedded in the HTML DOM
- foreach ($entry_html->find('script[type=application/ld+json]') as $html_ldjson_node) {
- $json_raw = json_decode($html_ldjson_node->innertext, true);
- if (is_array($json_raw)) {
- // The JSON we just loaded may contain directly a single ld+json object AND/OR several ones under the '@graph' key
- $json_items = [ $json_raw ];
- if (isset($json_raw['@graph'])) {
- foreach ($json_raw['@graph'] as $json_raw_sub_item) {
- $json_items[] = $json_raw_sub_item;
- }
- }
- // Now that we have a list of distinct JSON items, we can process them individually
- foreach ($json_items as $json) {
- // JSON item that holds an ld+json Article object (or a variant)
- if ($ldjson_is_of_type($json, $ldjson_article_types)) {
- // For each item property, look for corresponding JSON fields and populate the item
- foreach ($ldjson_article_mappings as $property => $field_list) {
- // Skip fields already found as tags, except Twitter @username (because we might find a better name)
- if (!isset($item[$property]) || ($property === 'author' && $item['author'][0] === '@')) {
- foreach ($field_list as $field) {
- $json_root = $json;
- // If necessary, navigate inside the JSON object to access a nested field
- if (is_array($field)) {
- // At this point, $field = ['author', 'name'] and $json_root = {"author": {"name": "John Doe"}}
- $json_navigate_ok = true;
- while (count($field) > 1) {
- $sub_field = array_shift($field);
- if (array_key_exists($sub_field, $json_root)) {
- $json_root = $json_root[$sub_field];
- if (array_is_list($json_root) && count($json_root) === 1) {
- $json_root = $json_root[0]; // Unwrap list of single item e.g. {"author":[{"name":"John Doe"}]}
- }
- } else {
- // Desired path not found in JSON, stop navigating
- $json_navigate_ok = false;
- break;
- }
- }
- if (!$json_navigate_ok) {
- continue; //Desired path not found in JSON, skip this field
- }
- $field = $field[0];
- // At this point, $field = "name" and $json_root = {"name": "John Doe"}
- }
- // Now we can check for desired field in JSON and populate $item accordingly
- if (isset($json_root[$field])) {
- $field_value = $json_root[$field];
- if (is_array($field_value) && isset($field_value[0])) {
- $field_value = $field_value[0]; // Different versions of the same enclosure? Take the first one
- }
- if (is_string($field_value) && !empty($field_value)) {
- if ($property === 'author' && $field === '@id') {
- $ldjson_author_id = $field_value; // Author is referred to by its ID: We'll see later if we can resolve it
- } else {
- $item[$property] = $field_value;
- break; // Stop on first match, e.g. {"author":{"name":"John Doe"}} has priority over {"author":"John Doe"}
- }
- }
- }
- }
- }
- }
- // JSON item that holds an ld+json Author object (or a variant)
- } else if ($ldjson_is_of_type($json, $ldjson_author_types)) {
- if (isset($json['@id']) && isset($json['name'])) {
- $ldjson_author_mappings[$json['@id']] = $json['name'];
- }
- }
- }
- }
- }
-
- // Attempt to resolve ld+json author if all we have is nothing or Twitter @username
- if ((!isset($item['author']) || $item['author'][0] === '@') && !is_null($ldjson_author_id) && isset($ldjson_author_mappings[$ldjson_author_id])) {
- $item['author'] = $ldjson_author_mappings[$ldjson_author_id];
- }
-
- // Adjust item field types
- if (isset($item['enclosures'])) {
- $item['enclosures'] = [ $item['enclosures'] ];
- }
- if (isset($item['timestamp'])) {
- $item['timestamp'] = strtotime($item['timestamp']);
- }
-
- return $item;
- }
}
diff --git a/lib/bootstrap.php b/lib/bootstrap.php
index fe2069d3..9a69d756 100644
--- a/lib/bootstrap.php
+++ b/lib/bootstrap.php
@@ -26,6 +26,7 @@ $files = [
__DIR__ . '/../lib/http.php',
__DIR__ . '/../lib/logger.php',
__DIR__ . '/../lib/url.php',
+ __DIR__ . '/../lib/seotags.php',
// Vendor
__DIR__ . '/../vendor/parsedown/Parsedown.php',
__DIR__ . '/../vendor/php-urljoin/src/urljoin.php',
diff --git a/lib/seotags.php b/lib/seotags.php
new file mode 100644
index 00000000..52657ff9
--- /dev/null
+++ b/lib/seotags.php
@@ -0,0 +1,256 @@
+ tags names that contains the expected value
+ // There are various source candidates per type of data, listed from most reliable to least reliable
+ static $meta_mappings = [
+ //
+ //
+ //
+ //
+ //
+ //
+ //
+ 'uri' => [
+ 'og:url',
+ 'twitter:url',
+ 'canonical',
+ ],
+ 'title' => [
+ 'og:title',
+ 'twitter:title',
+ ],
+ 'content' => [
+ 'og:description',
+ 'twitter:description',
+ 'description',
+ ],
+ 'timestamp' => [
+ 'article:published_time',
+ 'og:article:published_time',
+ 'releaseDate',
+ 'releasedate',
+ 'article:modified_time',
+ 'og:article:modified_time',
+ 'lastModified',
+ 'lastmodified',
+ 'time',
+ ],
+ 'enclosures' => [
+ 'og:image:secure_url',
+ 'og:image:url',
+ 'og:image',
+ 'twitter:image',
+ 'thumbnailImg',
+ 'thumbnailimg',
+ ],
+ 'author' => [
+ 'article:author',
+ 'og:article:author',
+ 'author',
+ 'article:author:username',
+ 'profile:first_name',
+ 'profile:last_name',
+ 'article:author:first_name',
+ 'article:author:last_name',
+ 'twitter:creator',
+ ],
+ ];
+
+ $author_first_name = null;
+ $author_last_name = null;
+
+ // For each Entry property, look for corresponding HTML tags using a list of candidates
+ foreach ($meta_mappings as $property => $field_list) {
+ foreach ($field_list as $field) {
+ // Look for HTML meta tag
+ $element = null;
+ if ($field === 'canonical') {
+ $element = $html->find('link[rel=canonical]');
+ } else if ($field === 'time') {
+ $element = $html->find('time[datetime]');
+ } else {
+ $element = $html->find("meta[property=$field], meta[name=$field]");
+ }
+ // Found something? Extract the value and populate Entry field
+ if (!empty($element)) {
+ $element = $element[0];
+ $field_value = '';
+ if ($field === 'canonical') {
+ $field_value = $element->href;
+ } else if ($field === 'time') {
+ $field_value = $element->datetime;
+ } else {
+ $field_value = $element->content;
+ }
+ if (!empty($field_value)) {
+ if ($field === 'article:author:first_name' || $field === 'profile:first_name') {
+ $author_first_name = $field_value;
+ } else if ($field === 'article:author:last_name' || $field === 'profile:last_name') {
+ $author_last_name = $field_value;
+ } else {
+ $item[$property] = $field_value;
+ break; // Stop on first match, e.g. og:url has priority over canonical url.
+ }
+ }
+ }
+ }
+ }
+
+ // Populate author from first name and last name if all we have is nothing or Twitter @username
+ if ((!isset($item['author']) || $item['author'][0] === '@') && (is_string($author_first_name) || is_string($author_last_name))) {
+ $author = '';
+ if (is_string($author_first_name)) {
+ $author = $author_first_name;
+ }
+ if (is_string($author_last_name)) {
+ $author = $author . ' ' . $author_last_name;
+ }
+ $item['author'] = trim($author);
+ }
+
+ // == Second source of metadata: Embedded JSON ==
+ // JSON linked data - https://www.w3.org/TR/2014/REC-json-ld-20140116/
+ // JSON linked data is COMPLEX and MAY BE LESS RELIABLE than tags. Used for fields not found as tags.
+ // The implementation below will load all ld+json we can understand and attempt to extract relevant information.
+
+ // ld+json object types that hold article metadata
+ // Each mapping defines an item field and the list of possible JSON fields for that item field
+ // Each candidate JSON field is either a string (field name) or a list (path to nested field)
+ static $ldjson_article_types = ['webpage', 'article', 'newsarticle', 'blogposting'];
+ static $ldjson_article_mappings = [
+ 'uri' => ['url', 'mainEntityOfPage'],
+ 'title' => ['headline'],
+ 'content' => ['description'],
+ 'timestamp' => ['dateModified', 'datePublished'],
+ 'enclosures' => ['image'],
+ 'author' => [['author', 'name'], ['author', '@id'], 'author'],
+ ];
+
+ // ld+json object types that hold author metadata
+ $ldjson_author_types = ['person', 'organization'];
+ $ldjson_author_mappings = []; // ID => Name
+ $ldjson_author_id = null;
+
+ // Utility function for checking if JSON array matches one of the desired ld+json object types
+ // A JSON object may have a single ld+json @type as a string OR several types at once as a list
+ $ldjson_is_of_type = function ($json, $allowed_types) {
+ if (isset($json['@type'])) {
+ $json_types = $json['@type'];
+ if (!is_array($json_types)) {
+ $json_types = [ $json_types ];
+ }
+ foreach ($json_types as $item_type) {
+ if (in_array(strtolower($item_type), $allowed_types)) {
+ return true;
+ }
+ }
+ }
+ return false;
+ };
+
+ // Process ld+json objects embedded in the HTML DOM
+ foreach ($html->find('script[type=application/ld+json]') as $html_ldjson_node) {
+ $json_raw = json_decode($html_ldjson_node->innertext, true);
+ if (is_array($json_raw)) {
+ // The JSON we just loaded may contain directly a single ld+json object AND/OR several ones under the '@graph' key
+ $json_items = [ $json_raw ];
+ if (isset($json_raw['@graph'])) {
+ foreach ($json_raw['@graph'] as $json_raw_sub_item) {
+ $json_items[] = $json_raw_sub_item;
+ }
+ }
+ // Now that we have a list of distinct JSON items, we can process them individually
+ foreach ($json_items as $json) {
+ // JSON item that holds an ld+json Article object (or a variant)
+ if ($ldjson_is_of_type($json, $ldjson_article_types)) {
+ // For each item property, look for corresponding JSON fields and populate the item
+ foreach ($ldjson_article_mappings as $property => $field_list) {
+ // Skip fields already found as tags, except Twitter @username (because we might find a better name)
+ if (!isset($item[$property]) || ($property === 'author' && $item['author'][0] === '@')) {
+ foreach ($field_list as $field) {
+ $json_root = $json;
+ // If necessary, navigate inside the JSON object to access a nested field
+ if (is_array($field)) {
+ // At this point, $field = ['author', 'name'] and $json_root = {"author": {"name": "John Doe"}}
+ $json_navigate_ok = true;
+ while (count($field) > 1) {
+ $sub_field = array_shift($field);
+ if (array_key_exists($sub_field, $json_root)) {
+ $json_root = $json_root[$sub_field];
+ if (array_is_list($json_root) && count($json_root) === 1) {
+ $json_root = $json_root[0]; // Unwrap list of single item e.g. {"author":[{"name":"John Doe"}]}
+ }
+ } else {
+ // Desired path not found in JSON, stop navigating
+ $json_navigate_ok = false;
+ break;
+ }
+ }
+ if (!$json_navigate_ok) {
+ continue; //Desired path not found in JSON, skip this field
+ }
+ $field = $field[0];
+ // At this point, $field = "name" and $json_root = {"name": "John Doe"}
+ }
+ // Now we can check for desired field in JSON and populate $item accordingly
+ if (isset($json_root[$field])) {
+ $field_value = $json_root[$field];
+ if (is_array($field_value) && isset($field_value[0])) {
+ $field_value = $field_value[0]; // Different versions of the same enclosure? Take the first one
+ }
+ if (is_string($field_value) && !empty($field_value)) {
+ if ($property === 'author' && $field === '@id') {
+ $ldjson_author_id = $field_value; // Author is referred to by its ID: We'll see later if we can resolve it
+ } else {
+ $item[$property] = $field_value;
+ break; // Stop on first match, e.g. {"author":{"name":"John Doe"}} has priority over {"author":"John Doe"}
+ }
+ }
+ }
+ }
+ }
+ }
+ // JSON item that holds an ld+json Author object (or a variant)
+ } else if ($ldjson_is_of_type($json, $ldjson_author_types)) {
+ if (isset($json['@id']) && isset($json['name'])) {
+ $ldjson_author_mappings[$json['@id']] = $json['name'];
+ }
+ }
+ }
+ }
+ }
+
+ // Attempt to resolve ld+json author if all we have is nothing or Twitter @username
+ if ((!isset($item['author']) || $item['author'][0] === '@') && !is_null($ldjson_author_id) && isset($ldjson_author_mappings[$ldjson_author_id])) {
+ $item['author'] = $ldjson_author_mappings[$ldjson_author_id];
+ }
+
+ // Adjust item field types
+ if (isset($item['enclosures'])) {
+ $item['enclosures'] = [ $item['enclosures'] ];
+ }
+ if (isset($item['timestamp'])) {
+ $item['timestamp'] = strtotime($item['timestamp']);
+ }
+
+ return $item;
+}