[enh] Improve ranking based on language (#3053)

Add configurable setting to rank search results higher when part of the
domain (e.g. 'en' in 'en.wikipedia.org' or 'de' in 'beispiel.de')
matches the selected search language. Does not apply to e.g. 'be' in
'youtube.com'.

Closes #206
This commit is contained in:
Finn 2021-11-15 20:31:22 +01:00 committed by GitHub
parent a880920dc7
commit 8c3454fd1b
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 18 additions and 9 deletions

View File

@ -6,6 +6,7 @@ from urllib.parse import urlparse, unquote
from searx import logger from searx import logger
from searx.engines import engines from searx.engines import engines
from searx.metrology.error_recorder import record_error from searx.metrology.error_recorder import record_error
from searx import settings
CONTENT_LEN_IGNORED_CHARS_REGEX = re.compile(r'[,;:!?\./\\\\ ()-_]', re.M | re.U) CONTENT_LEN_IGNORED_CHARS_REGEX = re.compile(r'[,;:!?\./\\\\ ()-_]', re.M | re.U)
@ -129,13 +130,18 @@ def merge_two_infoboxes(infobox1, infobox2):
infobox1['content'] = content2 infobox1['content'] = content2
def result_score(result): def result_score(result, language):
weight = 1.0 weight = 1.0
for result_engine in result['engines']: for result_engine in result['engines']:
if hasattr(engines[result_engine], 'weight'): if hasattr(engines[result_engine], 'weight'):
weight *= float(engines[result_engine].weight) weight *= float(engines[result_engine].weight)
if settings['search']['prefer_configured_language']:
domain_parts = result['parsed_url'].netloc.split('.')
if language in domain_parts:
weight *= 1.1
occurences = len(result['positions']) occurences = len(result['positions'])
return sum((occurences * weight) / position for position in result['positions']) return sum((occurences * weight) / position for position in result['positions'])
@ -145,9 +151,10 @@ class ResultContainer:
"""docstring for ResultContainer""" """docstring for ResultContainer"""
__slots__ = '_merged_results', 'infoboxes', 'suggestions', 'answers', 'corrections', '_number_of_results',\ __slots__ = '_merged_results', 'infoboxes', 'suggestions', 'answers', 'corrections', '_number_of_results',\
'_ordered', 'paging', 'unresponsive_engines', 'timings', 'redirect_url', 'engine_data' '_ordered', 'paging', 'unresponsive_engines', 'timings', 'redirect_url', 'engine_data',\
'_language'
def __init__(self): def __init__(self, language):
super().__init__() super().__init__()
self._merged_results = [] self._merged_results = []
self.infoboxes = [] self.infoboxes = []
@ -161,6 +168,7 @@ class ResultContainer:
self.unresponsive_engines = set() self.unresponsive_engines = set()
self.timings = [] self.timings = []
self.redirect_url = None self.redirect_url = None
self._language = language.lower().split('-')[0]
def extend(self, engine_name, results): def extend(self, engine_name, results):
standard_result_count = 0 standard_result_count = 0
@ -299,7 +307,7 @@ class ResultContainer:
def order_results(self): def order_results(self):
for result in self._merged_results: for result in self._merged_results:
score = result_score(result) score = result_score(result, self._language)
result['score'] = score result['score'] = score
with RLock(): with RLock():
for result_engine in result['engines']: for result_engine in result['engines']:

View File

@ -66,7 +66,7 @@ class Search:
# init vars # init vars
super().__init__() super().__init__()
self.search_query = search_query self.search_query = search_query
self.result_container = ResultContainer() self.result_container = ResultContainer(search_query.lang)
self.start_time = None self.start_time = None
self.actual_timeout = None self.actual_timeout = None

View File

@ -19,6 +19,7 @@ search:
default_lang : "" # Default search language - leave blank to detect from browser information or use codes from 'languages.py' default_lang : "" # Default search language - leave blank to detect from browser information or use codes from 'languages.py'
ban_time_on_fail : 5 # ban time in seconds after engine errors ban_time_on_fail : 5 # ban time in seconds after engine errors
max_ban_time_on_fail : 120 # max ban time in seconds after engine errors max_ban_time_on_fail : 120 # max ban time in seconds after engine errors
prefer_configured_language: False # increase weight of results in configured language in ranking
server: server:
port : 8888 port : 8888

View File

@ -20,22 +20,22 @@ def fake_result(url='https://aa.bb/cc?dd=ee#ff',
class ResultContainerTestCase(SearxTestCase): class ResultContainerTestCase(SearxTestCase):
def test_empty(self): def test_empty(self):
c = ResultContainer() c = ResultContainer("en-US")
self.assertEqual(c.get_ordered_results(), []) self.assertEqual(c.get_ordered_results(), [])
def test_one_result(self): def test_one_result(self):
c = ResultContainer() c = ResultContainer("en-US")
c.extend('wikipedia', [fake_result()]) c.extend('wikipedia', [fake_result()])
self.assertEqual(c.results_length(), 1) self.assertEqual(c.results_length(), 1)
def test_one_suggestion(self): def test_one_suggestion(self):
c = ResultContainer() c = ResultContainer("en-US")
c.extend('wikipedia', [fake_result(suggestion=True)]) c.extend('wikipedia', [fake_result(suggestion=True)])
self.assertEqual(len(c.suggestions), 1) self.assertEqual(len(c.suggestions), 1)
self.assertEqual(c.results_length(), 0) self.assertEqual(c.results_length(), 0)
def test_result_merge(self): def test_result_merge(self):
c = ResultContainer() c = ResultContainer("en-US")
c.extend('wikipedia', [fake_result()]) c.extend('wikipedia', [fake_result()])
c.extend('wikidata', [fake_result(), fake_result(url='https://example.com/')]) c.extend('wikidata', [fake_result(), fake_result(url='https://example.com/')])
self.assertEqual(c.results_length(), 2) self.assertEqual(c.results_length(), 2)