mirror of https://github.com/searx/searx
[fix] compare unquoted URL paths to avoid duplicate results
This commit is contained in:
parent
cebf5868b3
commit
a07b2b514c
|
@ -21,7 +21,7 @@ import sys
|
||||||
from imp import load_source
|
from imp import load_source
|
||||||
from itertools import izip_longest, chain
|
from itertools import izip_longest, chain
|
||||||
from operator import itemgetter
|
from operator import itemgetter
|
||||||
from urlparse import urlparse
|
from urlparse import urlparse, unquote
|
||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
import grequests
|
import grequests
|
||||||
from flask.ext.babel import gettext
|
from flask.ext.babel import gettext
|
||||||
|
@ -153,7 +153,9 @@ def score_results(results):
|
||||||
results = []
|
results = []
|
||||||
# deduplication + scoring
|
# deduplication + scoring
|
||||||
for i, res in enumerate(flat_res):
|
for i, res in enumerate(flat_res):
|
||||||
|
|
||||||
res['parsed_url'] = urlparse(res['url'])
|
res['parsed_url'] = urlparse(res['url'])
|
||||||
|
|
||||||
res['host'] = res['parsed_url'].netloc
|
res['host'] = res['parsed_url'].netloc
|
||||||
|
|
||||||
if res['host'].startswith('www.'):
|
if res['host'].startswith('www.'):
|
||||||
|
@ -172,7 +174,7 @@ def score_results(results):
|
||||||
p1 = res['parsed_url'].path[:-1] if res['parsed_url'].path.endswith('/') else res['parsed_url'].path # noqa
|
p1 = res['parsed_url'].path[:-1] if res['parsed_url'].path.endswith('/') else res['parsed_url'].path # noqa
|
||||||
p2 = new_res['parsed_url'].path[:-1] if new_res['parsed_url'].path.endswith('/') else new_res['parsed_url'].path # noqa
|
p2 = new_res['parsed_url'].path[:-1] if new_res['parsed_url'].path.endswith('/') else new_res['parsed_url'].path # noqa
|
||||||
if res['host'] == new_res['host'] and\
|
if res['host'] == new_res['host'] and\
|
||||||
p1 == p2 and\
|
unquote(p1) == unquote(p2) and\
|
||||||
res['parsed_url'].query == new_res['parsed_url'].query and\
|
res['parsed_url'].query == new_res['parsed_url'].query and\
|
||||||
res.get('template') == new_res.get('template'):
|
res.get('template') == new_res.get('template'):
|
||||||
duplicated = new_res
|
duplicated = new_res
|
||||||
|
@ -222,6 +224,10 @@ def search(query, request, selected_engines, pageno=1, lang='all'):
|
||||||
request_params['language'] = lang
|
request_params['language'] = lang
|
||||||
request_params = engine.request(query.encode('utf-8'), request_params)
|
request_params = engine.request(query.encode('utf-8'), request_params)
|
||||||
|
|
||||||
|
if request_params['url'] is None:
|
||||||
|
# TODO add support of offline engines
|
||||||
|
pass
|
||||||
|
|
||||||
callback = make_callback(
|
callback = make_callback(
|
||||||
selected_engine['name'],
|
selected_engine['name'],
|
||||||
results,
|
results,
|
||||||
|
|
Loading…
Reference in New Issue