From e6f12c073416eba6fc620f0ff38a343bda428280 Mon Sep 17 00:00:00 2001 From: Simounet Date: Wed, 11 Jul 2018 19:57:34 +0200 Subject: [PATCH] More robust srcset image attribute handling Linked to HTMLawed PR https://github.com/kesar/HTMLawed/pull/17 --- .../CoreBundle/Helper/DownloadImages.php | 11 +++++--- .../CoreBundle/Helper/DownloadImagesTest.php | 25 +++++++++++++++++++ 2 files changed, 33 insertions(+), 3 deletions(-) diff --git a/src/Wallabag/CoreBundle/Helper/DownloadImages.php b/src/Wallabag/CoreBundle/Helper/DownloadImages.php index f91cdf5eb..487a3a238 100644 --- a/src/Wallabag/CoreBundle/Helper/DownloadImages.php +++ b/src/Wallabag/CoreBundle/Helper/DownloadImages.php @@ -185,7 +185,7 @@ class DownloadImages * * @return array An array of urls */ - protected function getSrcsetUrls(Crawler $imagesCrawler) + private function getSrcsetUrls(Crawler $imagesCrawler) { $urls = []; $iterator = $imagesCrawler @@ -193,9 +193,14 @@ class DownloadImages while ($iterator->valid()) { $srcsetAttribute = $iterator->current()->getAttribute('srcset'); if ('' !== $srcsetAttribute) { - $srcset = array_map('trim', explode(',', $srcsetAttribute)); + // Couldn't start with " OR ' OR a white space + // Could be one or more white space + // Must be one or more digits followed by w OR x + $pattern = "/(?:[^\"'\s]+\s*(?:\d+[wx])+)/"; + preg_match_all($pattern, $srcsetAttribute, $matches); + $srcset = call_user_func_array('array_merge', $matches); $srcsetUrls = array_map(function ($src) { - return explode(' ', $src)[0]; + return trim(explode(' ', $src, 2)[0]); }, $srcset); $urls = array_merge($srcsetUrls, $urls); } diff --git a/tests/Wallabag/CoreBundle/Helper/DownloadImagesTest.php b/tests/Wallabag/CoreBundle/Helper/DownloadImagesTest.php index faa803fa0..cda5f8431 100644 --- a/tests/Wallabag/CoreBundle/Helper/DownloadImagesTest.php +++ b/tests/Wallabag/CoreBundle/Helper/DownloadImagesTest.php @@ -205,6 +205,31 @@ class DownloadImagesTest extends TestCase $this->assertNotContains('http://piketty.blog.lemonde.fr/', $res, 'Image srcset attribute were not replaced'); } + public function testProcessImageWithTrickySrcset() + { + $client = new Client(); + + $mock = new Mock([ + new Response(200, ['content-type' => 'image/jpeg'], Stream::factory(file_get_contents(__DIR__ . '/../fixtures/image-no-content-type.jpg'))), + new Response(200, ['content-type' => 'image/jpeg'], Stream::factory(file_get_contents(__DIR__ . '/../fixtures/image-no-content-type.jpg'))), + new Response(200, ['content-type' => 'image/jpeg'], Stream::factory(file_get_contents(__DIR__ . '/../fixtures/image-no-content-type.jpg'))), + ]); + + $client->getEmitter()->attach($mock); + + $logHandler = new TestHandler(); + $logger = new Logger('test', [$logHandler]); + + $download = new DownloadImages($client, sys_get_temp_dir() . '/wallabag_test', 'http://wallabag.io/', $logger); + $res = $download->processHtml(123, '
', 'https://css-tricks.com/the-critical-request/'); + + $this->assertNotContains('f_auto,q_auto', $res, 'Image srcset attribute were not replaced'); + } + public function testProcessImageWithNullPath() { $client = new Client();