diff --git a/lib/html.php b/lib/html.php
index eeaf2b32..71a3a29e 100644
--- a/lib/html.php
+++ b/lib/html.php
@@ -226,6 +226,63 @@ function defaultLinkTo($dom, $url)
return $dom;
}
+/**
+ * Parse a srcset HTML attribute value and return size => URL mappings
+ * Srcset contains a list of image URLs with associated size specified as size (e.g. 1024w) or scale (e.g. 2x)
+ * The web browser should pick the most appropriate image depending on screen size and/or pixel density
+ *
+ * This function takes a srcset string such as the following:
+ *   header640.png 640w, header960.png 960w, header1024.png 1024w
+ *
+ * Returns an array such as the following:
+ *   [
+ *     '640w' => 'header640.png',
+ *     '960w' => 'header960.png',
+ *     '1024w' => 'header1024.png'
+ *   ]
+ *
+ * Fractional pixel densities (e.g. 1.5x) are supported. Entries without any size descriptor are ignored.
+ *
+ * @param string $srcset Content of srcset html attribute
+ * @param bool $return_largest_url Instead of returning an array, return URL for the largest entry
+ * @return array|string|null { size => url } array, or largest entry URL if requested (null if no entry found)
+ */
+function parseSrcset(string $srcset, bool $return_largest_url = false)
+{
+    // The srcset format is more tricky to parse than it seems:
+    // URLs may contain commas, and space after comma is not mandatory, so the following is valid:
+    //   image.png?resize=640,640 640w,image.png?resize=960,960 960w,image.png?resize=1024,1024 1024w
+    // Since splitting by space or comma will not work, there is a precise algorithm to parse srcset attribute:
+    //   https://html.spec.whatwg.org/multipage/images.html#parse-a-srcset-attribute
+    // To summarize, each srcset entry has the following format:
+    //   1. Leading spaces and comma. Zero or more spaces, zero or at most one comma
+    //   2. Any amount of characters up to the next whitespace (space, tab, newline...): This is the URL
+    //   3. A nonnegative number followed by lowercase w, x or h: This is the image size.
+    //      The number may be fractional for pixel densities (e.g. 1.5x or .5x)
+    // We parse the srcset entries using a regex to mimic the above parser/tokenizer behavior.
+    $pattern = '/\s*,?\s*(\S+)\s+((?:[0-9]+(?:\.[0-9]+)?|\.[0-9]+)[wxh])/';
+    $entries = [];
+    if (preg_match_all($pattern, $srcset, $matches, PREG_SET_ORDER)) {
+        foreach ($matches as $match) {
+            $entries[$match[2]] = html_entity_decode($match[1]);
+        }
+    }
+    if (!$return_largest_url) {
+        return $entries;
+    }
+    $largest_image_url = null;
+    $largest_image_size = -1;
+    foreach ($entries as $size => $url) {
+        // Strip the trailing unit (w, x or h); compare as float so that 1.5x ranks above 1x
+        $size_value = (float) substr($size, 0, -1);
+        if ($size_value > $largest_image_size) {
+            $largest_image_size = $size_value;
+            $largest_image_url = $url;
+        }
+    }
+    return $largest_image_url;
+}
+
/**
* Convert lazy-loading images and frames (video embeds) into static elements
*
@@ -244,28 +301,18 @@ function convertLazyLoading($dom)
$dom = str_get_html($dom);
}
- // Retrieve image URL from srcset attribute
- // https://developer.mozilla.org/en-US/docs/Web/API/HTMLImageElement/srcset
- // Example: convert "header640.png 640w, header960.png 960w, header1024.png 1024w" to "header1024.png"
- $srcset_to_src = function ($srcset) {
- $sources = explode(',', $srcset);
- $last_entry = trim($sources[array_key_last($sources)]);
- $url = explode(' ', $last_entry)[0];
- return $url;
- };
-
// Process standalone images, embeds and picture sources
foreach ($dom->find('img, iframe, source') as $img) {
if (!empty($img->getAttribute('data-src'))) {
$img->src = $img->getAttribute('data-src');
} elseif (!empty($img->getAttribute('data-srcset'))) {
- $img->src = $srcset_to_src($img->getAttribute('data-srcset'));
+ $img->src = parseSrcset($img->getAttribute('data-srcset'));
} elseif (!empty($img->getAttribute('data-lazy-src'))) {
$img->src = $img->getAttribute('data-lazy-src');
} elseif (!empty($img->getAttribute('data-orig-file'))) {
$img->src = $img->getAttribute('data-orig-file');
} elseif (!empty($img->getAttribute('srcset'))) {
- $img->src = $srcset_to_src($img->getAttribute('srcset'));
+ $img->src = parseSrcset($img->getAttribute('srcset'));
} else {
continue; // Proceed to next element without removing attributes
}