From fd9d6b58d5b0fe6c21168f039678b5721b41fcea Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?No=C3=A9mi=20V=C3=A1nyi?= Date: Sat, 22 Jan 2022 11:59:21 +0100 Subject: [PATCH] Add scheme to img_src and thumbnail_src if missing from URL Closes #3092 --- searx/results.py | 10 ++++++++-- searx/utils.py | 12 +++++++++++- 2 files changed, 19 insertions(+), 3 deletions(-) diff --git a/searx/results.py b/searx/results.py index 6502ddcd..1b1eccbe 100644 --- a/searx/results.py +++ b/searx/results.py @@ -6,6 +6,7 @@ from urllib.parse import urlparse, unquote from searx import logger from searx.engines import engines from searx.metrology.error_recorder import record_error +from searx.utils import add_scheme_to_url from searx import settings @@ -240,10 +241,15 @@ class ResultContainer: result['parsed_url'] = urlparse(result['url']) # if the result has no scheme, use http as default - if not result['parsed_url'].scheme: - result['parsed_url'] = result['parsed_url']._replace(scheme="http") + if not result['parsed_url'].scheme or result['parsed_url'].scheme == '': + result['parsed_url'] = result['parsed_url']._replace(scheme='http') result['url'] = result['parsed_url'].geturl() + if 'thumbnail_src' in result: + result['thumbnail_src'] = add_scheme_to_url(result['thumbnail_src']) + if 'img_src' in result: + result['img_src'] = add_scheme_to_url(result['img_src']) + result['engines'] = set([result['engine']]) # strip multiple spaces and cariage returns from content diff --git a/searx/utils.py b/searx/utils.py index 55a386bd..9aea9bb0 100644 --- a/searx/utils.py +++ b/searx/utils.py @@ -7,7 +7,7 @@ from numbers import Number from os.path import splitext, join from random import choice from html.parser import HTMLParser -from urllib.parse import urljoin, urlparse +from urllib.parse import urljoin, urlparse, urlunparse from lxml import html from lxml.etree import ElementBase, XPath, XPathError, XPathSyntaxError, _ElementStringResult, _ElementUnicodeResult @@ -214,6 +214,16 @@ def 
normalize_url(url, base_url): return url +def add_scheme_to_url(url, scheme="https"): + """Add scheme to URL: if scheme is missing from the URL, then add it.""" + + parsed = urlparse(url) + if parsed.scheme == '': + parsed_with_scheme = parsed._replace(scheme=scheme) + return urlunparse(parsed_with_scheme) + return url + + def extract_url(xpath_results, base_url): """Extract and normalize URL from lxml Element