From 2719fd2526a4aae2985df487ceda8714f1830d02 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?No=C3=A9mi=20V=C3=A1nyi?= Date: Mon, 6 Jun 2022 00:18:33 +0200 Subject: [PATCH] Pick pass cookies from searxng (#3252) * [enh] Allow passing headers/cookies from settings.yml Example: - engine: xpath - search_url: example.org - headers: {'example_header': 'example_header'} - cookies: {'safesearch': 'off'} * [fix] Update only cookies/headers * [enh] XPath engine - add time range support Signed-off-by: Markus Heiser * [enh] XPath engine - add time safe-search support Signed-off-by: Markus Heiser Co-authored-by: Allen <64094914+allendema@users.noreply.github.com> Co-authored-by: Markus Heiser --- searx/engines/xpath.py | 96 +++++++++++++++++++++++++++++++++++++++--- 1 file changed, 90 insertions(+), 6 deletions(-) diff --git a/searx/engines/xpath.py b/searx/engines/xpath.py index 9c2929c4..92423302 100644 --- a/searx/engines/xpath.py +++ b/searx/engines/xpath.py @@ -5,6 +5,7 @@ from urllib.parse import urlencode from searx.utils import extract_text, extract_url, eval_xpath, eval_xpath_list search_url = None +lang_all = 'en' url_xpath = None content_xpath = None title_xpath = None @@ -16,15 +17,71 @@ cached_xpath = '' cached_url = '' soft_max_redirects = 0 -# parameters for engines with paging support -# -# number of results on each page -# (only needed if the site requires not a page number, but an offset) +cookies = {} +headers = {} +'''Some engines might offer different results based on cookies or headers. +Possible use-case: To set safesearch cookie or header to moderate.''' + +paging = False +'''Engine supports paging [True or False].''' + page_size = 1 # number of the first page (usually 0 or 1) first_page_num = 1 +time_range_support = False +'''Engine supports search time range.''' + +time_range_url = '&hours={time_range_val}' +'''Time range URL parameter in the :py:obj:`search_url`. If no time range is +requested by the user, the URL parameter is an empty string. 
The +``{time_range_val}`` replacement is taken from the :py:obj:`time_range_map`. + +.. code:: yaml + + time_range_url : '&days={time_range_val}' +''' + +time_range_map = { + 'day': 24, + 'week': 24 * 7, + 'month': 24 * 30, + 'year': 24 * 365, +} +'''Maps time range value from user to ``{time_range_val}`` in +:py:obj:`time_range_url`. + +.. code:: yaml + + time_range_map: + day: 1 + week: 7 + month: 30 + year: 365 +''' + +safe_search_support = False +'''Engine supports safe-search.''' + +safe_search_map = { + 0: '&filter=none', + 1: '&filter=moderate', + 2: '&filter=strict' +} +'''Maps safe-search value to ``{safe_search}`` in :py:obj:`search_url`. + +.. code:: yaml + + safesearch: true + safe_search_map: + 0: '&filter=none' + 1: '&filter=moderate' + 2: '&filter=strict' + +''' + + def request(query, params): query = urlencode({'q': query})[2:] @@ -32,8 +89,35 @@ def request(query, params): if paging and search_url.find('{pageno}') >= 0: fp['pageno'] = (params['pageno'] - 1) * page_size + first_page_num - params['url'] = search_url.format(**fp) - params['query'] = query + safe_search = '' + if params['safesearch']: + safe_search = safe_search_map[params['safesearch']] + + lang = lang_all + if params['language'] != 'all': + lang = params['language'][:2] + + time_range = '' + if params.get('time_range'): + time_range_val = time_range_map.get(params.get('time_range')) + time_range = time_range_url.format(time_range_val=time_range_val) + + safe_search = '' + if params['safesearch']: + safe_search = safe_search_map[params['safesearch']] + + fargs = { + 'query': urlencode({'q': query})[2:], + 'lang': lang, + 'pageno': (params['pageno'] - 1) * page_size + first_page_num, + 'time_range': time_range, + 'safe_search': safe_search, + } + + params['cookies'].update(cookies) + params['headers'].update(headers) + + params['url'] = search_url.format(**fargs) params['soft_max_redirects'] = soft_max_redirects return params