From 2dbc0de0cd000459ebfdb3d015be8684e737e95c Mon Sep 17 00:00:00 2001 From: Alexandre Flament Date: Tue, 22 Sep 2020 13:59:27 +0200 Subject: [PATCH] [mod] add searx/webadapter.py * move searx.search.get_search_query_from_webapp to searx.webadapter * move searx.query.SearchQuery to searx.search --- searx/query.py | 20 ---- searx/search.py | 186 +++++--------------------------------- searx/webadapter.py | 162 +++++++++++++++++++++++++++++++++ searx/webapp.py | 3 +- tests/unit/test_search.py | 67 +++++++------- utils/standalone_searx.py | 3 +- 6 files changed, 221 insertions(+), 220 deletions(-) create mode 100644 searx/webadapter.py diff --git a/searx/query.py b/searx/query.py index 7c2b0a24..9e2af0c4 100644 --- a/searx/query.py +++ b/searx/query.py @@ -178,23 +178,3 @@ class RawTextQuery: def getFullQuery(self): # get full querry including whitespaces return ''.join(self.query_parts) - - -class SearchQuery: - """container for all the search parameters (query, language, etc...)""" - - def __init__(self, query, engines, categories, lang, safesearch, pageno, time_range, - timeout_limit=None, preferences=None, external_bang=None): - self.query = query - self.engines = engines - self.categories = categories - self.lang = lang - self.safesearch = safesearch - self.pageno = pageno - self.time_range = None if time_range in ('', 'None', None) else time_range - self.timeout_limit = timeout_limit - self.preferences = preferences - self.external_bang = external_bang - - def __str__(self): - return self.query + ";" + str(self.engines) diff --git a/searx/search.py b/searx/search.py index 96fffd56..d3b131d4 100644 --- a/searx/search.py +++ b/searx/search.py @@ -16,26 +16,20 @@ along with searx. If not, see < http://www.gnu.org/licenses/ >. ''' import gc -import sys import threading from time import time from uuid import uuid4 from _thread import start_new_thread -from flask_babel import gettext import requests.exceptions import searx.poolrequests as requests_lib -from searx.engines import ( - categories, engines, settings -) +from searx.engines import engines, settings from searx.answerers import ask from searx.external_bang import get_bang_url from searx.utils import gen_useragent -from searx.query import RawTextQuery, SearchQuery, VALID_LANGUAGE_CODE from searx.results import ResultContainer from searx import logger from searx.plugins import plugins -from searx.exceptions import SearxParameterException logger = logger.getChild('search') @@ -53,6 +47,26 @@ else: exit(1) +class SearchQuery: + """container for all the search parameters (query, language, etc...)""" + + def __init__(self, query, engines, categories, lang, safesearch, pageno, time_range, + timeout_limit=None, preferences=None, external_bang=None): + self.query = query + self.engines = engines + self.categories = categories + self.lang = lang + self.safesearch = safesearch + self.pageno = pageno + self.time_range = None if time_range in ('', 'None', None) else time_range + self.timeout_limit = timeout_limit + self.preferences = preferences + self.external_bang = external_bang + + def __str__(self): + return self.query + ";" + str(self.engines) + + def send_http_request(engine, request_params): # create dictionary which contain all # informations about the request @@ -247,164 +261,6 @@ def default_request_params(): } -# remove duplicate queries. -# FIXME: does not fix "!music !soundcloud", because the categories are 'none' and 'music' -def deduplicate_query_engines(query_engines): - uniq_query_engines = {q["category"] + '|' + q["name"]: q for q in query_engines} - return uniq_query_engines.values() - - -def get_search_query_from_webapp(preferences, form): - # no text for the query ? - if not form.get('q'): - raise SearxParameterException('q', '') - - # set blocked engines - disabled_engines = preferences.engines.get_disabled() - - # parse query, if tags are set, which change - # the serch engine or search-language - raw_text_query = RawTextQuery(form['q'], disabled_engines) - - # set query - query = raw_text_query.getQuery() - - # get and check page number - pageno_param = form.get('pageno', '1') - if not pageno_param.isdigit() or int(pageno_param) < 1: - raise SearxParameterException('pageno', pageno_param) - query_pageno = int(pageno_param) - - # get language - # set specific language if set on request, query or preferences - # TODO support search with multible languages - if len(raw_text_query.languages): - query_lang = raw_text_query.languages[-1] - elif 'language' in form: - query_lang = form.get('language') - else: - query_lang = preferences.get_value('language') - - # check language - if not VALID_LANGUAGE_CODE.match(query_lang): - raise SearxParameterException('language', query_lang) - - # get safesearch - if 'safesearch' in form: - query_safesearch = form.get('safesearch') - # first check safesearch - if not query_safesearch.isdigit(): - raise SearxParameterException('safesearch', query_safesearch) - query_safesearch = int(query_safesearch) - else: - query_safesearch = preferences.get_value('safesearch') - - # safesearch : second check - if query_safesearch < 0 or query_safesearch > 2: - raise SearxParameterException('safesearch', query_safesearch) - - # get time_range - query_time_range = form.get('time_range') - - # check time_range - if query_time_range not in ('None', None, '', 'day', 'week', 'month', 'year'): - raise SearxParameterException('time_range', query_time_range) - - # query_engines - query_engines = raw_text_query.engines - - # timeout_limit - query_timeout = raw_text_query.timeout_limit - if query_timeout is None and 'timeout_limit' in form: - raw_time_limit = form.get('timeout_limit') - if raw_time_limit in ['None', '']: - raw_time_limit = None - else: - try: - query_timeout = float(raw_time_limit) - except ValueError: - raise SearxParameterException('timeout_limit', raw_time_limit) - - # query_categories - query_categories = [] - - # if engines are calculated from query, - # set categories by using that informations - if query_engines and raw_text_query.specific: - additional_categories = set() - for engine in query_engines: - if 'from_bang' in engine and engine['from_bang']: - additional_categories.add('none') - else: - additional_categories.add(engine['category']) - query_categories = list(additional_categories) - - # otherwise, using defined categories to - # calculate which engines should be used - else: - # set categories/engines - load_default_categories = True - for pd_name, pd in form.items(): - if pd_name == 'categories': - query_categories.extend(categ for categ in map(str.strip, pd.split(',')) if categ in categories) - elif pd_name == 'engines': - pd_engines = [{'category': engines[engine].categories[0], - 'name': engine} - for engine in map(str.strip, pd.split(',')) if engine in engines] - if pd_engines: - query_engines.extend(pd_engines) - load_default_categories = False - elif pd_name.startswith('category_'): - category = pd_name[9:] - - # if category is not found in list, skip - if category not in categories: - continue - - if pd != 'off': - # add category to list - query_categories.append(category) - elif category in query_categories: - # remove category from list if property is set to 'off' - query_categories.remove(category) - - if not load_default_categories: - if not query_categories: - query_categories = list(set(engine['category'] - for engine in query_engines)) - else: - # if no category is specified for this search, - # using user-defined default-configuration which - # (is stored in cookie) - if not query_categories: - cookie_categories = preferences.get_value('categories') - for ccateg in cookie_categories: - if ccateg in categories: - query_categories.append(ccateg) - - # if still no category is specified, using general - # as default-category - if not query_categories: - query_categories = ['general'] - - # using all engines for that search, which are - # declared under the specific categories - for categ in query_categories: - query_engines.extend({'category': categ, - 'name': engine.name} - for engine in categories[categ] - if (engine.name, categ) not in disabled_engines) - - query_engines = deduplicate_query_engines(query_engines) - external_bang = raw_text_query.external_bang - - return (SearchQuery(query, query_engines, query_categories, - query_lang, query_safesearch, query_pageno, - query_time_range, query_timeout, preferences, - external_bang=external_bang), - raw_text_query) - - class Search: """Search information container""" diff --git a/searx/webadapter.py b/searx/webadapter.py new file mode 100644 index 00000000..cad834bb --- /dev/null +++ b/searx/webadapter.py @@ -0,0 +1,162 @@ +from searx.exceptions import SearxParameterException +from searx.query import RawTextQuery, VALID_LANGUAGE_CODE +from searx.engines import categories, engines +from searx.search import SearchQuery + + +# remove duplicate queries. +# FIXME: does not fix "!music !soundcloud", because the categories are 'none' and 'music' +def deduplicate_query_engines(query_engines): + uniq_query_engines = {q["category"] + '|' + q["name"]: q for q in query_engines} + return uniq_query_engines.values() + + +def get_search_query_from_webapp(preferences, form): + # no text for the query ? + if not form.get('q'): + raise SearxParameterException('q', '') + + # set blocked engines + disabled_engines = preferences.engines.get_disabled() + + # parse query, if tags are set, which change + # the serch engine or search-language + raw_text_query = RawTextQuery(form['q'], disabled_engines) + + # set query + query = raw_text_query.getQuery() + + # get and check page number + pageno_param = form.get('pageno', '1') + if not pageno_param.isdigit() or int(pageno_param) < 1: + raise SearxParameterException('pageno', pageno_param) + query_pageno = int(pageno_param) + + # get language + # set specific language if set on request, query or preferences + # TODO support search with multible languages + if len(raw_text_query.languages): + query_lang = raw_text_query.languages[-1] + elif 'language' in form: + query_lang = form.get('language') + else: + query_lang = preferences.get_value('language') + + # check language + if not VALID_LANGUAGE_CODE.match(query_lang): + raise SearxParameterException('language', query_lang) + + # get safesearch + if 'safesearch' in form: + query_safesearch = form.get('safesearch') + # first check safesearch + if not query_safesearch.isdigit(): + raise SearxParameterException('safesearch', query_safesearch) + query_safesearch = int(query_safesearch) + else: + query_safesearch = preferences.get_value('safesearch') + + # safesearch : second check + if query_safesearch < 0 or query_safesearch > 2: + raise SearxParameterException('safesearch', query_safesearch) + + # get time_range + query_time_range = form.get('time_range') + + # check time_range + if query_time_range not in ('None', None, '', 'day', 'week', 'month', 'year'): + raise SearxParameterException('time_range', query_time_range) + + # query_engines + query_engines = raw_text_query.engines + + # timeout_limit + query_timeout = raw_text_query.timeout_limit + if query_timeout is None and 'timeout_limit' in form: + raw_time_limit = form.get('timeout_limit') + if raw_time_limit in ['None', '']: + raw_time_limit = None + else: + try: + query_timeout = float(raw_time_limit) + except ValueError: + raise SearxParameterException('timeout_limit', raw_time_limit) + + # query_categories + query_categories = [] + + # if engines are calculated from query, + # set categories by using that informations + if query_engines and raw_text_query.specific: + additional_categories = set() + for engine in query_engines: + if 'from_bang' in engine and engine['from_bang']: + additional_categories.add('none') + else: + additional_categories.add(engine['category']) + query_categories = list(additional_categories) + + # otherwise, using defined categories to + # calculate which engines should be used + else: + # set categories/engines + load_default_categories = True + for pd_name, pd in form.items(): + if pd_name == 'categories': + query_categories.extend(categ for categ in map(str.strip, pd.split(',')) if categ in categories) + elif pd_name == 'engines': + pd_engines = [{'category': engines[engine].categories[0], + 'name': engine} + for engine in map(str.strip, pd.split(',')) if engine in engines] + if pd_engines: + query_engines.extend(pd_engines) + load_default_categories = False + elif pd_name.startswith('category_'): + category = pd_name[9:] + + # if category is not found in list, skip + if category not in categories: + continue + + if pd != 'off': + # add category to list + query_categories.append(category) + elif category in query_categories: + # remove category from list if property is set to 'off' + query_categories.remove(category) + + if not load_default_categories: + if not query_categories: + query_categories = list(set(engine['category'] + for engine in query_engines)) + else: + # if no category is specified for this search, + # using user-defined default-configuration which + # (is stored in cookie) + if not query_categories: + cookie_categories = preferences.get_value('categories') + for ccateg in cookie_categories: + if ccateg in categories: + query_categories.append(ccateg) + + # if still no category is specified, using general + # as default-category + if not query_categories: + query_categories = ['general'] + + # using all engines for that search, which are + # declared under the specific categories + for categ in query_categories: + query_engines.extend({'category': categ, + 'name': engine.name} + for engine in categories[categ] + if (engine.name, categ) not in disabled_engines) + + query_engines = deduplicate_query_engines(query_engines) + external_bang = raw_text_query.external_bang + + return (SearchQuery(query, query_engines, query_categories, + query_lang, query_safesearch, query_pageno, + query_time_range, query_timeout, preferences, + external_bang=external_bang), + raw_text_query) diff --git a/searx/webapp.py b/searx/webapp.py index bba37cce..ad937291 100755 --- a/searx/webapp.py +++ b/searx/webapp.py @@ -67,10 +67,11 @@ from searx.webutils import ( get_static_files, get_result_templates, get_themes, prettify_url, new_hmac ) +from searx.webadapter import get_search_query_from_webapp from searx.utils import html_to_text, gen_useragent, dict_subset, match_language from searx.version import VERSION_STRING from searx.languages import language_codes as languages -from searx.search import SearchWithPlugins, get_search_query_from_webapp +from searx.search import SearchWithPlugins from searx.query import RawTextQuery from searx.autocomplete import searx_bang, backends as autocomplete_backends from searx.plugins import plugins diff --git a/tests/unit/test_search.py b/tests/unit/test_search.py index a15d2c89..d2322b20 100644 --- a/tests/unit/test_search.py +++ b/tests/unit/test_search.py @@ -5,6 +5,7 @@ from searx.preferences import Preferences from searx.engines import engines import searx.search +from searx.search import SearchQuery SAFESEARCH = 0 @@ -40,53 +41,53 @@ class SearchTestCase(SearxTestCase): def test_timeout_simple(self): searx.search.max_request_timeout = None - search_query = searx.query.SearchQuery('test', [{'category': 'general', 'name': PUBLIC_ENGINE_NAME}], - ['general'], 'en-US', SAFESEARCH, PAGENO, None, None, - preferences=Preferences(['oscar'], ['general'], engines, [])) + search_query = SearchQuery('test', [{'category': 'general', 'name': PUBLIC_ENGINE_NAME}], + ['general'], 'en-US', SAFESEARCH, PAGENO, None, None, + preferences=Preferences(['oscar'], ['general'], engines, [])) search = searx.search.Search(search_query) search.search() self.assertEqual(search.actual_timeout, 3.0) def test_timeout_query_above_default_nomax(self): searx.search.max_request_timeout = None - search_query = searx.query.SearchQuery('test', [{'category': 'general', 'name': PUBLIC_ENGINE_NAME}], - ['general'], 'en-US', SAFESEARCH, PAGENO, None, 5.0, - preferences=Preferences(['oscar'], ['general'], engines, [])) + search_query = SearchQuery('test', [{'category': 'general', 'name': PUBLIC_ENGINE_NAME}], + ['general'], 'en-US', SAFESEARCH, PAGENO, None, 5.0, + preferences=Preferences(['oscar'], ['general'], engines, [])) search = searx.search.Search(search_query) search.search() self.assertEqual(search.actual_timeout, 3.0) def test_timeout_query_below_default_nomax(self): searx.search.max_request_timeout = None - search_query = searx.query.SearchQuery('test', [{'category': 'general', 'name': PUBLIC_ENGINE_NAME}], - ['general'], 'en-US', SAFESEARCH, PAGENO, None, 1.0, - preferences=Preferences(['oscar'], ['general'], engines, [])) + search_query = SearchQuery('test', [{'category': 'general', 'name': PUBLIC_ENGINE_NAME}], + ['general'], 'en-US', SAFESEARCH, PAGENO, None, 1.0, + preferences=Preferences(['oscar'], ['general'], engines, [])) search = searx.search.Search(search_query) search.search() self.assertEqual(search.actual_timeout, 1.0) def test_timeout_query_below_max(self): searx.search.max_request_timeout = 10.0 - search_query = searx.query.SearchQuery('test', [{'category': 'general', 'name': PUBLIC_ENGINE_NAME}], - ['general'], 'en-US', SAFESEARCH, PAGENO, None, 5.0, - preferences=Preferences(['oscar'], ['general'], engines, [])) + search_query = SearchQuery('test', [{'category': 'general', 'name': PUBLIC_ENGINE_NAME}], + ['general'], 'en-US', SAFESEARCH, PAGENO, None, 5.0, + preferences=Preferences(['oscar'], ['general'], engines, [])) search = searx.search.Search(search_query) search.search() self.assertEqual(search.actual_timeout, 5.0) def test_timeout_query_above_max(self): searx.search.max_request_timeout = 10.0 - search_query = searx.query.SearchQuery('test', [{'category': 'general', 'name': PUBLIC_ENGINE_NAME}], - ['general'], 'en-US', SAFESEARCH, PAGENO, None, 15.0, - preferences=Preferences(['oscar'], ['general'], engines, [])) + search_query = SearchQuery('test', [{'category': 'general', 'name': PUBLIC_ENGINE_NAME}], + ['general'], 'en-US', SAFESEARCH, PAGENO, None, 15.0, + preferences=Preferences(['oscar'], ['general'], engines, [])) search = searx.search.Search(search_query) search.search() self.assertEqual(search.actual_timeout, 10.0) def test_query_private_engine_without_token(self): - search_query = searx.query.SearchQuery('test', [{'category': 'general', 'name': PRIVATE_ENGINE_NAME}], - ['general'], 'en-US', SAFESEARCH, PAGENO, None, 2.0, - preferences=Preferences(['oscar'], ['general'], engines, [])) + search_query = SearchQuery('test', [{'category': 'general', 'name': PRIVATE_ENGINE_NAME}], + ['general'], 'en-US', SAFESEARCH, PAGENO, None, 2.0, + preferences=Preferences(['oscar'], ['general'], engines, [])) search = searx.search.Search(search_query) results = search.search() self.assertEqual(results.results_length(), 0) @@ -94,9 +95,9 @@ class SearchTestCase(SearxTestCase): def test_query_private_engine_with_incorrect_token(self): preferences_with_tokens = Preferences(['oscar'], ['general'], engines, []) preferences_with_tokens.parse_dict({'tokens': 'bad-token'}) - search_query = searx.query.SearchQuery('test', [{'category': 'general', 'name': PRIVATE_ENGINE_NAME}], - ['general'], 'en-US', SAFESEARCH, PAGENO, None, 2.0, - preferences=preferences_with_tokens) + search_query = SearchQuery('test', [{'category': 'general', 'name': PRIVATE_ENGINE_NAME}], + ['general'], 'en-US', SAFESEARCH, PAGENO, None, 2.0, + preferences=preferences_with_tokens) search = searx.search.Search(search_query) results = search.search() self.assertEqual(results.results_length(), 0) @@ -104,28 +105,28 @@ class SearchTestCase(SearxTestCase): def test_query_private_engine_with_correct_token(self): preferences_with_tokens = Preferences(['oscar'], ['general'], engines, []) preferences_with_tokens.parse_dict({'tokens': 'my-token'}) - search_query = searx.query.SearchQuery('test', [{'category': 'general', 'name': PRIVATE_ENGINE_NAME}], - ['general'], 'en-US', SAFESEARCH, PAGENO, None, 2.0, - preferences=preferences_with_tokens) + search_query = SearchQuery('test', [{'category': 'general', 'name': PRIVATE_ENGINE_NAME}], + ['general'], 'en-US', SAFESEARCH, PAGENO, None, 2.0, + preferences=preferences_with_tokens) search = searx.search.Search(search_query) results = search.search() self.assertEqual(results.results_length(), 1) def test_external_bang(self): - search_query = searx.query.SearchQuery('yes yes', - [{'category': 'general', 'name': PUBLIC_ENGINE_NAME}], - ['general'], 'en-US', SAFESEARCH, PAGENO, None, None, - preferences=Preferences(['oscar'], ['general'], engines, [],), - external_bang="yt") + search_query = SearchQuery('yes yes', + [{'category': 'general', 'name': PUBLIC_ENGINE_NAME}], + ['general'], 'en-US', SAFESEARCH, PAGENO, None, None, + preferences=Preferences(['oscar'], ['general'], engines, [],), + external_bang="yt") search = searx.search.Search(search_query) results = search.search() # For checking if the user redirected with the youtube external bang self.assertTrue(results.redirect_url is not None) - search_query = searx.query.SearchQuery('youtube never gonna give you up', - [{'category': 'general', 'name': PUBLIC_ENGINE_NAME}], - ['general'], 'en-US', SAFESEARCH, PAGENO, None, None, - preferences=Preferences(['oscar'], ['general'], engines, []),) + search_query = SearchQuery('youtube never gonna give you up', + [{'category': 'general', 'name': PUBLIC_ENGINE_NAME}], + ['general'], 'en-US', SAFESEARCH, PAGENO, None, None, + preferences=Preferences(['oscar'], ['general'], engines, []),) search = searx.search.Search(search_query) results = search.search() diff --git a/utils/standalone_searx.py b/utils/standalone_searx.py index d43b474d..2ca3fea5 100755 --- a/utils/standalone_searx.py +++ b/utils/standalone_searx.py @@ -31,6 +31,7 @@ import searx.query import searx.search import searx.engines import searx.preferences +import searx.webadapter import argparse searx.engines.initialize_engines(settings['engines']) @@ -64,7 +65,7 @@ form = { preferences = searx.preferences.Preferences(['oscar'], searx.engines.categories.keys(), searx.engines.engines, []) preferences.key_value_settings['safesearch'].parse(args.safesearch) -search_query, raw_text_query = searx.search.get_search_query_from_webapp(preferences, form) +search_query, raw_text_query = searx.webadapter.get_search_query_from_webapp(preferences, form) search = searx.search.Search(search_query) result_container = search.search()