mirror of https://github.com/searx/searx
[fix] compare unquoted URL paths to avoid duplicate results
This commit is contained in:
parent
cebf5868b3
commit
a07b2b514c
|
@ -21,7 +21,7 @@ import sys
|
|||
from imp import load_source
|
||||
from itertools import izip_longest, chain
|
||||
from operator import itemgetter
|
||||
from urlparse import urlparse
|
||||
from urlparse import urlparse, unquote
|
||||
from datetime import datetime
|
||||
import grequests
|
||||
from flask.ext.babel import gettext
|
||||
|
@ -153,7 +153,9 @@ def score_results(results):
|
|||
results = []
|
||||
# deduplication + scoring
|
||||
for i, res in enumerate(flat_res):
|
||||
|
||||
res['parsed_url'] = urlparse(res['url'])
|
||||
|
||||
res['host'] = res['parsed_url'].netloc
|
||||
|
||||
if res['host'].startswith('www.'):
|
||||
|
@ -172,7 +174,7 @@ def score_results(results):
|
|||
p1 = res['parsed_url'].path[:-1] if res['parsed_url'].path.endswith('/') else res['parsed_url'].path # noqa
|
||||
p2 = new_res['parsed_url'].path[:-1] if new_res['parsed_url'].path.endswith('/') else new_res['parsed_url'].path # noqa
|
||||
if res['host'] == new_res['host'] and\
|
||||
p1 == p2 and\
|
||||
unquote(p1) == unquote(p2) and\
|
||||
res['parsed_url'].query == new_res['parsed_url'].query and\
|
||||
res.get('template') == new_res.get('template'):
|
||||
duplicated = new_res
|
||||
|
@ -222,6 +224,10 @@ def search(query, request, selected_engines, pageno=1, lang='all'):
|
|||
request_params['language'] = lang
|
||||
request_params = engine.request(query.encode('utf-8'), request_params)
|
||||
|
||||
if request_params['url'] is None:
|
||||
# TODO add support of offline engines
|
||||
pass
|
||||
|
||||
callback = make_callback(
|
||||
selected_engine['name'],
|
||||
results,
|
||||
|
|
Loading…
Reference in New Issue