[enh] Improve ranking based on language (#3053)
Add a configurable setting that ranks search results higher when a part of the domain (e.g. 'en' in 'en.wikipedia.org' or 'de' in 'beispiel.de') matches the selected search language. Only whole domain parts are compared, so it does not apply to e.g. 'be' inside 'youtube.com'. Closes #206
parent a880920dc7, commit 8c3454fd1b
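The matching described above compares whole, dot-separated parts of the host name against the selected language code. A minimal standalone sketch of that check (not part of the commit; the helper name is made up for illustration):

from urllib.parse import urlparse

def matches_language(url, language):
    # Compare the language code against whole domain parts only:
    # 'en' matches 'en.wikipedia.org', but 'be' does not match
    # 'youtube.com', because substrings inside a part never count.
    domain_parts = urlparse(url).netloc.split('.')
    return language in domain_parts

print(matches_language('https://en.wikipedia.org/wiki/Foo', 'en'))  # True
print(matches_language('https://www.youtube.com/watch', 'be'))      # False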
@@ -6,6 +6,7 @@ from urllib.parse import urlparse, unquote
 from searx import logger
 from searx.engines import engines
 from searx.metrology.error_recorder import record_error
+from searx import settings


 CONTENT_LEN_IGNORED_CHARS_REGEX = re.compile(r'[,;:!?\./\\\\ ()-_]', re.M | re.U)
@@ -129,13 +130,18 @@ def merge_two_infoboxes(infobox1, infobox2):
             infobox1['content'] = content2


-def result_score(result):
+def result_score(result, language):
     weight = 1.0

     for result_engine in result['engines']:
         if hasattr(engines[result_engine], 'weight'):
             weight *= float(engines[result_engine].weight)

+    if settings['search']['prefer_configured_language']:
+        domain_parts = result['parsed_url'].netloc.split('.')
+        if language in domain_parts:
+            weight *= 1.1
+
     occurences = len(result['positions'])

     return sum((occurences * weight) / position for position in result['positions'])
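With the boost in place, result_score() keeps the same aggregation: each result is scored as sum((occurences * weight) / position for position in positions), and a language match only scales the weight by 1.1. A small worked example of the formula (standalone sketch, not the actual searx code path):

def score(positions, weight):
    # Same formula as result_score(): results found by more engines
    # (more positions) and at better ranks (smaller position numbers)
    # score higher; the weight scales the whole sum.
    occurences = len(positions)
    return sum((occurences * weight) / position for position in positions)

print(score([1, 3], 1.0))  # ~2.67 without the language boost
print(score([1, 3], 1.1))  # ~2.93 for the same result with the 1.1 boost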
@@ -145,9 +151,10 @@ class ResultContainer:
     """docstring for ResultContainer"""

     __slots__ = '_merged_results', 'infoboxes', 'suggestions', 'answers', 'corrections', '_number_of_results',\
-        '_ordered', 'paging', 'unresponsive_engines', 'timings', 'redirect_url', 'engine_data'
+        '_ordered', 'paging', 'unresponsive_engines', 'timings', 'redirect_url', 'engine_data',\
+        '_language'

-    def __init__(self):
+    def __init__(self, language):
         super().__init__()
         self._merged_results = []
         self.infoboxes = []
@@ -161,6 +168,7 @@ class ResultContainer:
         self.unresponsive_engines = set()
         self.timings = []
         self.redirect_url = None
+        self._language = language.lower().split('-')[0]

     def extend(self, engine_name, results):
         standard_result_count = 0
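The container stores only the primary language subtag, so a region-specific UI language still matches plain two-letter domain parts. A quick standalone illustration of the normalization used in __init__:

# 'en-US', 'en-GB' and 'en' all normalize to 'en', which is what gets
# compared against the domain parts in result_score().
for lang in ('en-US', 'de-DE', 'fr'):
    print(lang, '->', lang.lower().split('-')[0])
# en-US -> en
# de-DE -> de
# fr -> fr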
@@ -299,7 +307,7 @@ class ResultContainer:

     def order_results(self):
         for result in self._merged_results:
-            score = result_score(result)
+            score = result_score(result, self._language)
             result['score'] = score
             with RLock():
                 for result_engine in result['engines']:
@@ -66,7 +66,7 @@ class Search:
         # init vars
         super().__init__()
         self.search_query = search_query
-        self.result_container = ResultContainer()
+        self.result_container = ResultContainer(search_query.lang)
         self.start_time = None
         self.actual_timeout = None

@@ -19,6 +19,7 @@ search:
     default_lang : "" # Default search language - leave blank to detect from browser information or use codes from 'languages.py'
     ban_time_on_fail : 5 # ban time in seconds after engine errors
     max_ban_time_on_fail : 120 # max ban time in seconds after engine errors
+    prefer_configured_language: False # increase weight of results in configured language in ranking

 server:
     port : 8888
@@ -20,22 +20,22 @@ def fake_result(url='https://aa.bb/cc?dd=ee#ff',
 class ResultContainerTestCase(SearxTestCase):

     def test_empty(self):
-        c = ResultContainer()
+        c = ResultContainer("en-US")
         self.assertEqual(c.get_ordered_results(), [])

     def test_one_result(self):
-        c = ResultContainer()
+        c = ResultContainer("en-US")
         c.extend('wikipedia', [fake_result()])
         self.assertEqual(c.results_length(), 1)

     def test_one_suggestion(self):
-        c = ResultContainer()
+        c = ResultContainer("en-US")
         c.extend('wikipedia', [fake_result(suggestion=True)])
         self.assertEqual(len(c.suggestions), 1)
         self.assertEqual(c.results_length(), 0)

     def test_result_merge(self):
-        c = ResultContainer()
+        c = ResultContainer("en-US")
         c.extend('wikipedia', [fake_result()])
         c.extend('wikidata', [fake_result(), fake_result(url='https://example.com/')])
         self.assertEqual(c.results_length(), 2)
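The updated tests only cover the new constructor argument. A hedged sketch of how the boost itself might be exercised, assuming fake_result() accepts a url keyword as above, that the test module has access to searx.settings, and that toggling the setting in place is acceptable in this suite (the test name and approach are illustrative, not part of the commit):

def test_prefer_configured_language(self):
    # Hypothetical test: with the setting enabled, a result whose domain
    # contains the configured language part ('en' in 'en.wikipedia.org')
    # should outrank an otherwise equal result without it.
    # Assumes: from searx import settings
    settings['search']['prefer_configured_language'] = True
    try:
        c = ResultContainer("en-US")
        c.extend('wikipedia', [fake_result(url='https://en.wikipedia.org/aa')])
        c.extend('wikidata', [fake_result(url='https://example.com/aa')])
        ordered = c.get_ordered_results()
        self.assertEqual(ordered[0]['parsed_url'].netloc, 'en.wikipedia.org')
    finally:
        settings['search']['prefer_configured_language'] = False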