mirror of https://github.com/searx/searx
Add scheme to img_src and thumbnail_src if missing from URL
Closes #3092
This commit is contained in:
parent
148090df12
commit
fd9d6b58d5
|
@ -6,6 +6,7 @@ from urllib.parse import urlparse, unquote
|
||||||
from searx import logger
|
from searx import logger
|
||||||
from searx.engines import engines
|
from searx.engines import engines
|
||||||
from searx.metrology.error_recorder import record_error
|
from searx.metrology.error_recorder import record_error
|
||||||
|
from searx.utils import add_scheme_to_url
|
||||||
from searx import settings
|
from searx import settings
|
||||||
|
|
||||||
|
|
||||||
|
@ -240,10 +241,15 @@ class ResultContainer:
|
||||||
result['parsed_url'] = urlparse(result['url'])
|
result['parsed_url'] = urlparse(result['url'])
|
||||||
|
|
||||||
# if the result has no scheme, use http as default
|
# if the result has no scheme, use http as default
|
||||||
if not result['parsed_url'].scheme:
|
if not result['parsed_url'].scheme or result['parsed_url'].scheme == '':
|
||||||
result['parsed_url'] = result['parsed_url']._replace(scheme="http")
|
result['parsed_url'] = result['parsed_url']._replace(scheme='http')
|
||||||
result['url'] = result['parsed_url'].geturl()
|
result['url'] = result['parsed_url'].geturl()
|
||||||
|
|
||||||
|
if 'thumbnail_src' in result:
|
||||||
|
result['thumbnail_src'] = add_scheme_to_url(result['thumbnail_src'])
|
||||||
|
if 'img_src' in result:
|
||||||
|
result['img_src'] = add_scheme_to_url(result['img_src'])
|
||||||
|
|
||||||
result['engines'] = set([result['engine']])
|
result['engines'] = set([result['engine']])
|
||||||
|
|
||||||
# strip multiple spaces and carriage returns from content
|
# strip multiple spaces and carriage returns from content
|
||||||
|
|
|
@ -7,7 +7,7 @@ from numbers import Number
|
||||||
from os.path import splitext, join
|
from os.path import splitext, join
|
||||||
from random import choice
|
from random import choice
|
||||||
from html.parser import HTMLParser
|
from html.parser import HTMLParser
|
||||||
from urllib.parse import urljoin, urlparse
|
from urllib.parse import urljoin, urlparse, urlunparse
|
||||||
|
|
||||||
from lxml import html
|
from lxml import html
|
||||||
from lxml.etree import ElementBase, XPath, XPathError, XPathSyntaxError, _ElementStringResult, _ElementUnicodeResult
|
from lxml.etree import ElementBase, XPath, XPathError, XPathSyntaxError, _ElementStringResult, _ElementUnicodeResult
|
||||||
|
@ -214,6 +214,16 @@ def normalize_url(url, base_url):
|
||||||
return url
|
return url
|
||||||
|
|
||||||
|
|
||||||
|
def add_scheme_to_url(url, scheme="https"):
    """Add a scheme to a URL when it has none.

    A URL that already carries a scheme (``http:``, ``data:``, ...) is
    returned unchanged.  A protocol-relative URL such as
    ``//example.com/a.png`` becomes ``https://example.com/a.png``.

    Args:
        url: The URL to check.  A falsy value (empty string or ``None``)
            is returned as-is.
        scheme: The scheme to prepend when ``url`` has none.  Defaults
            to ``"https"``.

    Returns:
        ``url`` with ``scheme`` applied if it was missing, otherwise the
        original ``url``.
    """
    # An empty URL has nothing to attach a scheme to; without this guard
    # it would be rewritten to the bogus, truthy value "https://".
    if not url:
        return url

    parsed = urlparse(url)
    if parsed.scheme == '':
        return urlunparse(parsed._replace(scheme=scheme))
    return url
|
||||||
|
|
||||||
|
|
||||||
def extract_url(xpath_results, base_url):
|
def extract_url(xpath_results, base_url):
|
||||||
"""Extract and normalize URL from lxml Element
|
"""Extract and normalize URL from lxml Element
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue