mirror of https://github.com/searx/searx
pick engine fixes (#3306)
* [fix] google engine: results XPath

* [fix] google & youtube - set EU consent cookie

  This changes the previous bypass method for the Google consent, which used
  ``ucbcb=1`` (6face215b8), to accepting the consent with ``CONSENT=YES+``.
  The youtube_noapi and google engines have a similar API, at least for the
  consent [1].

  Get the CONSENT cookie from a google request::

      curl -i "https://www.google.com/search?q=time&tbm=isch" \
           -A "Mozilla/5.0 (X11; Linux i686; rv:102.0) Gecko/20100101 Firefox/102.0" \
           | grep -i consent
      ...
      location: https://consent.google.com/m?continue=https://www.google.com/search?q%3Dtime%26tbm%3Disch&gl=DE&m=0&pc=irp&uxe=eomtm&hl=en-US&src=1
      set-cookie: CONSENT=PENDING+936; expires=Wed, 24-Jul-2024 11:26:20 GMT; path=/; domain=.google.com; Secure
      ...

  PENDING & YES [2]:

      Google changed the way consent to the YouTube cookie agreement is
      collected in EU countries.  Instead of showing a popup on the website,
      YouTube redirects the user to a new webpage on the consent.youtube.com
      domain ...  The fix for this is to send a CONSENT cookie with the value
      YES+ with every YouTube request.

  [1] https://github.com/iv-org/invidious/pull/2207
  [2] https://github.com/TeamNewPipe/NewPipeExtractor/issues/592

  Closes: https://github.com/searxng/searxng/issues/1432

* [fix] sjp engine - convert engine name to a latin1 compliant name

  The engine name is not only a *name*, it is also an identifier that is used
  in logs, HTTP headers and more.  Unicode characters in the name of an engine
  can cause various issues.

  Closes: https://github.com/searxng/searxng/issues/1544

  Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>

* [fix] engine tineye: handle the 422 response for unsupported image formats

  Closes: https://github.com/searxng/searxng/issues/1449

  Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>

* bypass google consent with ucbcb=1

* [mod] add Lingva translate engine

  Add the lingva engine (which grabs data from Google Translate).  Results
  from Lingva are added to the infobox results.

* openstreetmap engine: return the localized name

  For example: display "Tokyo" instead of "東京都" when the language is
  English.

* [fix] engines/openstreetmap.py typo: user_langage --> user_language

  Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>

* Wikidata engine: ignore dummy entities

* Wikidata engine: minor change of the SPARQL request

  The engine can be slow, especially when the query won't return any answer.
  See https://www.mediawiki.org/wiki/Wikidata_Query_Service/User_Manual/MWAPI#Find_articles_in_Wikipedia_speaking_about_cheese_and_see_which_Wikibase_items_they_correspond_to

Co-authored-by: Léon Tiekötter <leon@tiekoetter.com>
Co-authored-by: Emilien Devos <contact@emiliendevos.be>
Co-authored-by: Markus Heiser <markus.heiser@darmarit.de>
Co-authored-by: Emilien Devos <github@emiliendevos.be>
Co-authored-by: ta <alt3753.7@gmail.com>
Co-authored-by: Alexandre Flament <alex@al-f.net>
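A note on the consent changes described above: for each Google-family engine the
fix amounts to two one-line additions in the engine's ``request(query, params)``
hook, both visible in the diff below. A minimal sketch of the pattern (the
``request()`` signature and the ``params`` dict are the standard searx engine
interface)::

    from urllib.parse import urlencode

    def request(query, params):
        # build the search URL with the extra consent query parameter ...
        params['url'] = 'https://www.google.com/search?' + urlencode({'q': query, 'ucbcb': 1})
        # ... and pre-accept the EU consent interstitial, so the request is not
        # redirected to consent.google.com
        params['cookies']['CONSENT'] = "YES+"
        return params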
This commit is contained in:
parent 85034b49ef
commit 05fe2ee093
@@ -108,8 +108,8 @@ filter_mapping = {
 # specific xpath variables
 # ------------------------
 
-# google results are grouped into <div class="g ..." ../>
-results_xpath = '//div[@id="search"]//div[contains(@class, "g ")]'
+# google results are grouped into <div class="jtfYYd ..." ../>
+results_xpath = '//div[contains(@class, "jtfYYd")]'
 results_xpath_mobile_ui = '//div[contains(@class, "g ")]'
 
 # google *sections* are no usual *results*, we ignore them
@@ -223,6 +223,7 @@ def request(query, params):
         'oe': "utf8",
         'start': offset,
         'filter': '0',
+        'ucbcb': 1,
         **additional_parameters,
     })
 
@@ -235,6 +236,7 @@ def request(query, params):
     params['url'] = query_url
 
     logger.debug("HTTP header Accept-Language --> %s", lang_info.get('Accept-Language'))
+    params['cookies']['CONSENT'] = "YES+"
     params['headers'].update(lang_info['headers'])
     if use_mobile_ui:
         params['headers']['Accept'] = '*/*'
@@ -109,6 +109,7 @@ def request(query, params):
         **lang_info['params'],
         'ie': "utf8",
         'oe': "utf8",
+        'ucbcb': 1,
         'num': 30,
     })
 
@@ -121,6 +122,7 @@ def request(query, params):
     params['url'] = query_url
 
     logger.debug("HTTP header Accept-Language --> %s", lang_info.get('Accept-Language'))
+    params['cookies']['CONSENT'] = "YES+"
     params['headers'].update(lang_info['headers'])
     params['headers']['Accept'] = (
         'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8'
@@ -104,6 +104,7 @@ def request(query, params):
         **lang_info['params'],
         'ie': "utf8",
         'oe': "utf8",
+        'ucbcb': 1,
         'gl': lang_info['country'],
     }) + ('&ceid=%s' % ceid) # ceid includes a ':' character which must not be urlencoded
 
@@ -111,10 +112,12 @@ def request(query, params):
     params['url'] = query_url
 
     logger.debug("HTTP header Accept-Language --> %s", lang_info.get('Accept-Language'))
+
+    params['cookies']['CONSENT'] = "YES+"
     params['headers'].update(lang_info['headers'])
     params['headers']['Accept'] = (
         'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8'
     )
 
     return params
 
@@ -0,0 +1,69 @@
+# SPDX-License-Identifier: AGPL-3.0-or-later
+"""
+Google Play Apps
+"""
+
+from urllib.parse import urlencode
+from lxml import html
+from searx.utils import (
+    eval_xpath,
+    extract_url,
+    extract_text,
+    eval_xpath_list,
+    eval_xpath_getindex,
+)
+
+about = {
+    "website": "https://play.google.com/",
+    "wikidata_id": "Q79576",
+    "use_official_api": False,
+    "require_api_key": False,
+    "results": "HTML",
+}
+
+categories = ["files", "apps"]
+search_url = "https://play.google.com/store/search?{query}&c=apps&ucbcb=1"
+
+
+def request(query, params):
+    params["url"] = search_url.format(query=urlencode({"q": query}))
+    params['cookies']['CONSENT'] = "YES+"
+
+    return params
+
+
+def response(resp):
+    results = []
+
+    dom = html.fromstring(resp.text)
+
+    if eval_xpath(dom, '//div[@class="v6DsQb"]'):
+        return []
+
+    spot = eval_xpath_getindex(dom, '//div[@class="ipRz4"]', 0, None)
+    if spot is not None:
+        url = extract_url(eval_xpath(spot, './a[@class="Qfxief"]/@href'), search_url)
+        title = extract_text(eval_xpath(spot, './/div[@class="vWM94c"]'))
+        content = extract_text(eval_xpath(spot, './/div[@class="LbQbAe"]'))
+        img = extract_text(eval_xpath(spot, './/img[@class="T75of bzqKMd"]/@src'))
+
+        results.append({"url": url, "title": title, "content": content, "img_src": img})
+
+    more = eval_xpath_list(dom, '//c-wiz[@jsrenderer="RBsfwb"]//div[@role="listitem"]', min_len=1)
+    for result in more:
+        url = extract_url(eval_xpath(result, ".//a/@href"), search_url)
+        title = extract_text(eval_xpath(result, './/span[@class="DdYX5"]'))
+        content = extract_text(eval_xpath(result, './/span[@class="wMUdtb"]'))
+        img = extract_text(
+            eval_xpath(
+                result,
+                './/img[@class="T75of stzEZd" or @class="T75of etjhNc Q8CSx "]/@src',
+            )
+        )
+
+        results.append({"url": url, "title": title, "content": content, "img_src": img})
+
+    for suggestion in eval_xpath_list(dom, '//c-wiz[@jsrenderer="qyd4Kb"]//div[@class="ULeU3b neq64b"]'):
+        results.append({"suggestion": extract_text(eval_xpath(suggestion, './/div[@class="Epkrse "]'))})
+
+    return results
@@ -85,13 +85,13 @@ def request(query, params):
     # subdomain is: scholar.google.xy
     lang_info['subdomain'] = lang_info['subdomain'].replace("www.", "scholar.")
 
-    query_url = 'https://'+ lang_info['subdomain'] + '/scholar' + "?" + urlencode({
-        'q': query,
-        **lang_info['params'],
-        'ie': "utf8",
-        'oe': "utf8",
-        'start' : offset,
-    })
+    query_url = (
+        'https://'
+        + lang_info['subdomain']
+        + '/scholar'
+        + "?"
+        + urlencode({'q': query, **lang_info['params'], 'ie': "utf8", 'oe': "utf8", 'start': offset, 'ucbcb': 1})
+    )
 
     query_url += time_range_url(params)
 
@@ -99,6 +99,7 @@ def request(query, params):
     params['url'] = query_url
 
     logger.debug("HTTP header Accept-Language --> %s", lang_info.get('Accept-Language'))
+    params['cookies']['CONSENT'] = "YES+"
     params['headers'].update(lang_info['headers'])
     params['headers']['Accept'] = (
         'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8'
@@ -125,6 +125,7 @@ def request(query, params):
         'q': query,
         'tbm': "vid",
         **lang_info['params'],
+        'ucbcb': 1,
         'ie': "utf8",
         'oe': "utf8",
     })
@@ -138,6 +139,7 @@ def request(query, params):
     params['url'] = query_url
 
     logger.debug("HTTP header Accept-Language --> %s", lang_info.get('Accept-Language'))
+    params['cookies']['CONSENT'] = "YES+"
     params['headers'].update(lang_info['headers'])
     params['headers']['Accept'] = (
         'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8'
@@ -0,0 +1,68 @@
+# SPDX-License-Identifier: AGPL-3.0-or-later
+# lint: pylint
+"""Lingva (alternative Google Translate frontend)"""
+
+from json import loads
+
+about = {
+    "website": 'https://lingva.ml',
+    "wikidata_id": None,
+    "official_api_documentation": 'https://github.com/thedaviddelta/lingva-translate#public-apis',
+    "use_official_api": True,
+    "require_api_key": False,
+    "results": 'JSON',
+}
+
+engine_type = 'online_dictionary'
+categories = ['general']
+
+url = "https://lingva.ml"
+search_url = "{url}/api/v1/{from_lang}/{to_lang}/{query}"
+
+
+def request(_query, params):
+    params['url'] = search_url.format(
+        url=url, from_lang=params['from_lang'][1], to_lang=params['to_lang'][1], query=params['query']
+    )
+    return params
+
+
+def response(resp):
+    results = []
+
+    result = loads(resp.text)
+    info = result["info"]
+    from_to_prefix = "%s-%s " % (resp.search_params['from_lang'][1], resp.search_params['to_lang'][1])
+
+    if "typo" in info:
+        results.append({"suggestion": from_to_prefix + info["typo"]})
+
+    if 'definitions' in info:  # pylint: disable=too-many-nested-blocks
+        for definition in info['definitions']:
+            if 'list' in definition:
+                for item in definition['list']:
+                    if 'synonyms' in item:
+                        for synonym in item['synonyms']:
+                            results.append({"suggestion": from_to_prefix + synonym})
+
+    infobox = ""
+
+    for translation in info["extraTranslations"]:
+        infobox += f"<b>{translation['type']}</b>"
+
+        for word in translation["list"]:
+            infobox += f"<dl><dt>{word['word']}</dt>"
+
+            for meaning in word["meanings"]:
+                infobox += f"<dd>{meaning}</dd>"
+
+            infobox += "</dl>"
+
+    results.append(
+        {
+            'infobox': result["translation"],
+            'content': infobox,
+        }
+    )
+
+    return results
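For reference, the endpoint wired up above can also be queried outside of searx.
A minimal sketch, assuming the public lingva.ml instance is reachable and using
only the URL layout and JSON keys the engine reads::

    import json
    from urllib.parse import quote
    from urllib.request import urlopen

    def lingva_translate(from_lang, to_lang, text, base_url="https://lingva.ml"):
        # same layout as the engine's search_url: {url}/api/v1/{from_lang}/{to_lang}/{query}
        with urlopen(f"{base_url}/api/v1/{from_lang}/{to_lang}/{quote(text)}") as resp:
            data = json.load(resp)
        # 'translation' feeds the infobox, 'info' holds typo/definitions/extraTranslations
        return data["translation"], data.get("info", {})

    # e.g. lingva_translate("en", "pl", "cheese") should yield something like ("ser", {...})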
@@ -30,6 +30,7 @@ about = {
 # engine dependent config
 categories = ['map']
 paging = False
+language_support = True
 
 # search-url
 base_url = 'https://nominatim.openstreetmap.org/'
@@ -141,6 +142,9 @@ def request(query, params):
     params['url'] = base_url + search_string.format(query=urlencode({'q': query}))
     params['route'] = route_re.match(query)
     params['headers']['User-Agent'] = searx_useragent()
+
+    accept_language = 'en' if params['language'] == 'all' else params['language']
+    params['headers']['Accept-Language'] = accept_language
     return params
 
 
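The Accept-Language header added above is what produces the localized names
mentioned in the commit message ("Tokyo" instead of "東京都"). A quick way to see
the effect against the public Nominatim search API, as a sketch that is not part
of the commit::

    import json
    from urllib.parse import quote
    from urllib.request import Request, urlopen

    def display_name(query, lang):
        # plain Nominatim search request; only the Accept-Language header varies
        req = Request(
            "https://nominatim.openstreetmap.org/search?format=json&q=" + quote(query),
            headers={"Accept-Language": lang, "User-Agent": "searx-doc-example"},
        )
        with urlopen(req) as resp:
            return json.load(resp)[0]["display_name"]

    # display_name("Tokyo", "en") returns an English-style name,
    # display_name("Tokyo", "ja") the Japanese one ("東京都, 日本" style).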
@@ -200,7 +204,7 @@ def get_wikipedia_image(raw_value):
     return get_external_url('wikimedia_image', raw_value)
 
 
-def fetch_wikidata(nominatim_json, user_langage):
+def fetch_wikidata(nominatim_json, user_language):
     """Update nominatim_json using the result of an unique to wikidata
 
     For result in nominatim_json:
@@ -221,9 +225,10 @@ def fetch_wikidata(nominatim_json, user_langage):
         wd_to_results.setdefault(wd_id, []).append(result)
 
     if wikidata_ids:
+        user_language = 'en' if user_language == 'all' else user_language.split('-')[0]
         wikidata_ids_str = " ".join(wikidata_ids)
         query = wikidata_image_sparql.replace('%WIKIDATA_IDS%', sparql_string_escape(wikidata_ids_str)).replace(
-            '%LANGUAGE%', sparql_string_escape(user_langage)
+            '%LANGUAGE%', sparql_string_escape(user_language)
         )
         wikidata_json = send_wikidata_query(query)
         for wd_result in wikidata_json.get('results', {}).get('bindings', {}):
@@ -238,7 +243,7 @@ def fetch_wikidata(nominatim_json, user_langage):
             # overwrite wikipedia link
             wikipedia_name = wd_result.get('wikipediaName', {}).get('value')
             if wikipedia_name:
-                result['extratags']['wikipedia'] = user_langage + ':' + wikipedia_name
+                result['extratags']['wikipedia'] = user_language + ':' + wikipedia_name
             # get website if not already defined
             website = wd_result.get('website', {}).get('value')
             if (
@@ -1,6 +1,8 @@
 # SPDX-License-Identifier: AGPL-3.0-or-later
-"""Słownik Języka Polskiego (general)
+# lint: pylint
+"""Słownik Języka Polskiego
+
+Dictionary of the polish language from PWN (sjp.pwn)
 """
 
 from lxml.html import fromstring
@@ -2,10 +2,12 @@
 Tineye - Reverse search images
 """
 
-from json import loads
 from urllib.parse import urlencode
 
 from datetime import datetime
+from flask_babel import gettext
+
+from searx import logger
 
 about = {
     "website": "https://tineye.com",
@@ -18,13 +20,29 @@ about = {
 
 categories = ['images']
 paging = True
 
 safesearch = False
 
 base_url = 'https://tineye.com'
 search_string = '/result_json/?page={page}&{query}'
 
+logger = logger.getChild('tineye')
+
+FORMAT_NOT_SUPPORTED = gettext(
+    "Could not read that image url. This may be due to an unsupported file"
+    " format. TinEye only supports images that are JPEG, PNG, GIF, BMP, TIFF or WebP."
+)
+"""TinEye error message"""
+
+NO_SIGNATURE_ERROR = gettext(
+    "The image is too simple to find matches. TinEye requires a basic level of"
+    " visual detail to successfully identify matches."
+)
+"""TinEye error message"""
+
+DOWNLOAD_ERROR = gettext("The image could not be downloaded.")
+"""TinEye error message"""
+
 
 def request(query, params):
     params['url'] = base_url +\
@@ -40,47 +58,147 @@ def request(query, params):
         'TE': 'trailers',
     })
 
+    query = urlencode({'url': query})
+
+    # see https://github.com/TinEye/pytineye/blob/main/pytineye/api.py
+    params['url'] = base_url + search_string.format(query=query, page=params['pageno'])
+
     return params
 
 
+def parse_tineye_match(match_json):
+    """Takes parsed JSON from the API server and turns it into a :py:obj:`dict`
+    object.
+
+    Attributes `(class Match) <https://github.com/TinEye/pytineye/blob/main/pytineye/api.py>`__
+
+    - `image_url`, link to the result image.
+    - `domain`, domain this result was found on.
+    - `score`, a number (0 to 100) that indicates how closely the images match.
+    - `width`, image width in pixels.
+    - `height`, image height in pixels.
+    - `size`, image area in pixels.
+    - `format`, image format.
+    - `filesize`, image size in bytes.
+    - `overlay`, overlay URL.
+    - `tags`, whether this match belongs to a collection or stock domain.
+
+    - `backlinks`, a list of Backlink objects pointing to the original websites
+      and image URLs. List items are instances of :py:obj:`dict`, (`Backlink
+      <https://github.com/TinEye/pytineye/blob/main/pytineye/api.py>`__):
+
+      - `url`, the image URL to the image.
+      - `backlink`, the original website URL.
+      - `crawl_date`, the date the image was crawled.
+
+    """
+
+    # HINT: there exists an alternative backlink dict in the domains list / e.g.::
+    #
+    #     match_json['domains'][0]['backlinks']
+
+    backlinks = []
+    if "backlinks" in match_json:
+
+        for backlink_json in match_json["backlinks"]:
+            if not isinstance(backlink_json, dict):
+                continue
+
+            crawl_date = backlink_json.get("crawl_date")
+            if crawl_date:
+                crawl_date = datetime.fromisoformat(crawl_date[:-3])
+            else:
+                crawl_date = datetime.min
+
+            backlinks.append({
+                'url': backlink_json.get("url"),
+                'backlink': backlink_json.get("backlink"),
+                'crawl_date': crawl_date,
+                'image_name': backlink_json.get("image_name")}
+            )
+
+    return {
+        'image_url': match_json.get("image_url"),
+        'domain': match_json.get("domain"),
+        'score': match_json.get("score"),
+        'width': match_json.get("width"),
+        'height': match_json.get("height"),
+        'size': match_json.get("size"),
+        'image_format': match_json.get("format"),
+        'filesize': match_json.get("filesize"),
+        'overlay': match_json.get("overlay"),
+        'tags': match_json.get("tags"),
+        'backlinks': backlinks,
+    }
+
+
 def response(resp):
+    """Parse HTTP response from TinEye."""
     results = []
-    # Define wanted results
-    json_data = loads(resp.text)
-    number_of_results = json_data['num_matches']
 
-    for i in json_data['matches']:
-        image_format = i['format']
-        width = i['width']
-        height = i['height']
-        thumbnail_src = i['image_url']
-        backlink = i['domains'][0]['backlinks'][0]
-
-        url = backlink['backlink']
-        source = backlink['url']
-        title = backlink['image_name']
-        img_src = backlink['url']
-
-        # Get and convert published date
-        api_date = backlink['crawl_date'][:-3]
-        publishedDate = datetime.fromisoformat(api_date)
-
-        # Append results
-        results.append({
-            'template': 'images.html',
-            'url': url,
-            'thumbnail_src': thumbnail_src,
-            'source': source,
-            'title': title,
-            'img_src': img_src,
-            'format': image_format,
-            'widht': width,
-            'height': height,
-            'publishedDate': publishedDate,
-        })
-
-    # Append number of results
-    results.append({'number_of_results': number_of_results})
+    try:
+        json_data = resp.json()
+    except Exception as exc:  # pylint: disable=broad-except
+        msg = "can't parse JSON response // %s" % exc
+        logger.error(msg)
+        json_data = {'error': msg}
+
+    # handle error codes from Tineye
+
+    if resp.is_error:
+        if resp.status_code in (400, 422):
+
+            message = 'HTTP status: %s' % resp.status_code
+            error = json_data.get('error')
+            s_key = json_data.get('suggestions', {}).get('key', '')
+
+            if error and s_key:
+                message = "%s (%s)" % (error, s_key)
+            elif error:
+                message = error
+
+            if s_key == "Invalid image URL":
+                # test https://docs.searxng.org/_static/searxng-wordmark.svg
+                message = FORMAT_NOT_SUPPORTED
+            elif s_key == 'NO_SIGNATURE_ERROR':
+                # test https://pngimg.com/uploads/dot/dot_PNG4.png
+                message = NO_SIGNATURE_ERROR
+            elif s_key == 'Download Error':
+                # test https://notexists
+                message = DOWNLOAD_ERROR
+
+            logger.error(message)
+
+        return results
+
+    resp.raise_for_status()
+
+    # append results from matches
+    for match_json in json_data['matches']:
+
+        tineye_match = parse_tineye_match(match_json)
+        if not tineye_match['backlinks']:
+            continue
+
+        backlink = tineye_match['backlinks'][0]
+        results.append(
+            {
+                'template': 'images.html',
+                'url': backlink['backlink'],
+                'thumbnail_src': tineye_match['image_url'],
+                'source': backlink['url'],
+                'title': backlink['image_name'],
+                'img_src': backlink['url'],
+                'format': tineye_match['image_format'],
+                'widht': tineye_match['width'],
+                'height': tineye_match['height'],
+                'publishedDate': backlink['crawl_date'],
+            }
+        )
+
+    # append number of results
+    number_of_results = json_data.get('num_matches')
+    if number_of_results:
+        results.append({'number_of_results': number_of_results})
 
     return results
@@ -64,6 +64,7 @@ WHERE
       mwapi:language "%LANGUAGE%".
       ?item wikibase:apiOutputItem mwapi:item.
   }
+  hint:Prior hint:runFirst "true".
 
   %WHERE%
 
@@ -92,6 +93,12 @@ WHERE {
 }
 """
 
+# see the property "dummy value" of https://www.wikidata.org/wiki/Q2013 (Wikidata)
+# hard coded here to avoid to an additional SPARQL request when the server starts
+DUMMY_ENTITY_URLS = set(
+    "http://www.wikidata.org/entity/" + wid for wid in ("Q4115189", "Q13406268", "Q15397819", "Q17339402")
+)
+
 
 # https://www.w3.org/TR/sparql11-query/#rSTRING_LITERAL1
 # https://lists.w3.org/Archives/Public/public-rdf-dawg/2011OctDec/0175.html
@@ -173,7 +180,7 @@ def response(resp):
     for result in jsonresponse.get('results', {}).get('bindings', []):
         attribute_result = {key: value['value'] for key, value in result.items()}
         entity_url = attribute_result['item']
-        if entity_url not in seen_entities:
+        if entity_url not in seen_entities and entity_url not in DUMMY_ENTITY_URLS:
             seen_entities.add(entity_url)
             results += get_results(attribute_result, attributes, language)
         else:
@@ -3,7 +3,6 @@
 Youtube (Videos)
 """
 
-from datetime import datetime
 from functools import reduce
 from json import loads, dumps
 from urllib.parse import quote_plus
@@ -26,7 +25,7 @@ time_range_support = True
 
 # search-url
 base_url = 'https://www.youtube.com/results'
-search_url = base_url + '?search_query={query}&page={page}'
+search_url = base_url + '?search_query={query}&page={page}&ucbcb=1'
 time_range_url = '&sp=EgII{time_range}%253D%253D'
 # the key seems to be constant
 next_page_url = 'https://www.youtube.com/youtubei/v1/search?key=AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8'
@@ -44,6 +43,7 @@ base_youtube_url = 'https://www.youtube.com/watch?v='
 
 # do search-request
 def request(query, params):
+    params['cookies']['CONSENT'] = "YES+"
     if not params['engine_data'].get('next_page_token'):
         params['url'] = search_url.format(query=quote_plus(query), page=params['pageno'])
         if params['time_range'] in time_range_dict:
@@ -57,7 +57,6 @@ def request(query, params):
         })
         params['headers']['Content-Type'] = 'application/json'
 
-        params['headers']['Cookie'] = "CONSENT=YES+cb.%s-17-p0.en+F+941;" % datetime.now().strftime("%Y%m%d")
     return params
 
 
@@ -787,17 +787,23 @@ engines:
     shortcut : loc
     categories : images
 
-  - name : lobste.rs
-    engine : xpath
-    search_url : https://lobste.rs/search?utf8=%E2%9C%93&q={query}&what=stories&order=relevance
-    results_xpath : //li[contains(@class, "story")]
-    url_xpath : .//a[@class="u-url"]/@href
-    title_xpath : .//a[@class="u-url"]
-    content_xpath : .//a[@class="domain"]
-    categories : it
-    shortcut : lo
-    timeout : 5.0
-    disabled: True
+  - name: lingva
+    engine: lingva
+    shortcut: lv
+    # set lingva instance in url, by default it will use the official instance
+    # url: https://lingva.ml
+
+  - name: lobste.rs
+    engine: xpath
+    search_url: https://lobste.rs/search?utf8=%E2%9C%93&q={query}&what=stories&order=relevance
+    results_xpath: //li[contains(@class, "story")]
+    url_xpath: .//a[@class="u-url"]/@href
+    title_xpath: .//a[@class="u-url"]
+    content_xpath: .//a[@class="domain"]
+    categories: it
+    shortcut: lo
+    timeout: 5.0
+    disabled: true
     about:
       website: https://lobste.rs/
       wikidata_id: Q60762874
@@ -1632,7 +1638,7 @@ engines:
     require_api_key: false
     results: HTML
 
-  - name: słownik języka polskiego
+  - name: sjp.pwn
     engine: sjp
     shortcut: sjp
     base_url: https://sjp.pwn.pl/
 
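The rename above ("słownik języka polskiego" to "sjp.pwn") is what makes the
engine identifier latin1 compliant. A small sketch of why the old name was a
problem, assuming the name ends up in latin-1 contexts such as HTTP headers and
log records, as the commit message notes::

    for name in ("słownik języka polskiego", "sjp.pwn"):
        try:
            name.encode("latin-1")
            print(repr(name), "is latin-1 safe")
        except UnicodeEncodeError as err:
            print(repr(name), "is not latin-1 safe:", err)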