From 70cbc09e9390d02686882786c20c201b3a08edef Mon Sep 17 00:00:00 2001
From: asciimoo
Date: Sat, 19 Oct 2013 17:36:44 +0200
Subject: [PATCH] [enh] better url comparison

Parse every result URL with urlparse and treat two results as duplicates
when their netloc and path components match, instead of requiring the
whole URL string to be equal.

When a duplicate is merged and the already-kept result is not https but
the newly seen one is, switch the kept result to https. ParseResult is an
immutable named tuple, so the scheme is changed with _replace() and the
'url' field is rebuilt from the updated tuple with geturl().

---
 searx/engines/__init__.py | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/searx/engines/__init__.py b/searx/engines/__init__.py
index cdf667e3..078188cd 100644
--- a/searx/engines/__init__.py
+++ b/searx/engines/__init__.py
@@ -22,6 +22,7 @@ from imp import load_source
 import grequests
 from itertools import izip_longest, chain
 from operator import itemgetter
+from urlparse import urlparse
 
 engine_dir = dirname(realpath(__file__))
 
@@ -87,16 +88,23 @@ def search(query, request, selected_engines):
     results = []
     # deduplication + scoring
     for i,res in enumerate(flat_res):
+        res['parsed_url'] = urlparse(res['url'])
         score = flat_len - i
         duplicated = False
         for new_res in results:
-            if res['url'] == new_res['url']:
+            if res['parsed_url'].netloc == new_res['parsed_url'].netloc and\
+               res['parsed_url'].path == new_res['parsed_url'].path:
                 duplicated = new_res
                 break
         if duplicated:
             if len(res.get('content', '')) > len(duplicated.get('content', '')):
                 duplicated['content'] = res['content']
             duplicated['score'] += score
+            if duplicated['parsed_url'].scheme == 'https':
+                continue
+            elif res['parsed_url'].scheme == 'https':
+                duplicated['parsed_url'] = duplicated['parsed_url']._replace(scheme='https')
+                duplicated['url'] = duplicated['parsed_url'].geturl()
         else:
             res['score'] = score
             results.append(res)