[fix] compare unquoted URL paths to avoid duplicate results

This commit is contained in:
Adam Tauber 2014-07-05 17:33:19 +02:00
parent cebf5868b3
commit a07b2b514c
1 changed file with 8 additions and 2 deletions

View File

@@ -21,7 +21,7 @@ import sys
from imp import load_source from imp import load_source
from itertools import izip_longest, chain from itertools import izip_longest, chain
from operator import itemgetter from operator import itemgetter
from urlparse import urlparse from urlparse import urlparse, unquote
from datetime import datetime from datetime import datetime
import grequests import grequests
from flask.ext.babel import gettext from flask.ext.babel import gettext
@@ -153,7 +153,9 @@ def score_results(results):
results = [] results = []
# deduplication + scoring # deduplication + scoring
for i, res in enumerate(flat_res): for i, res in enumerate(flat_res):
res['parsed_url'] = urlparse(res['url']) res['parsed_url'] = urlparse(res['url'])
res['host'] = res['parsed_url'].netloc res['host'] = res['parsed_url'].netloc
if res['host'].startswith('www.'): if res['host'].startswith('www.'):
@@ -172,7 +174,7 @@ def score_results(results):
p1 = res['parsed_url'].path[:-1] if res['parsed_url'].path.endswith('/') else res['parsed_url'].path # noqa p1 = res['parsed_url'].path[:-1] if res['parsed_url'].path.endswith('/') else res['parsed_url'].path # noqa
p2 = new_res['parsed_url'].path[:-1] if new_res['parsed_url'].path.endswith('/') else new_res['parsed_url'].path # noqa p2 = new_res['parsed_url'].path[:-1] if new_res['parsed_url'].path.endswith('/') else new_res['parsed_url'].path # noqa
if res['host'] == new_res['host'] and\ if res['host'] == new_res['host'] and\
p1 == p2 and\ unquote(p1) == unquote(p2) and\
res['parsed_url'].query == new_res['parsed_url'].query and\ res['parsed_url'].query == new_res['parsed_url'].query and\
res.get('template') == new_res.get('template'): res.get('template') == new_res.get('template'):
duplicated = new_res duplicated = new_res
@@ -222,6 +224,10 @@ def search(query, request, selected_engines, pageno=1, lang='all'):
request_params['language'] = lang request_params['language'] = lang
request_params = engine.request(query.encode('utf-8'), request_params) request_params = engine.request(query.encode('utf-8'), request_params)
if request_params['url'] is None:
# TODO add support of offline engines
pass
callback = make_callback( callback = make_callback(
selected_engine['name'], selected_engine['name'],
results, results,