From 4d36aee57b99d6c72e5c96d05a76e6aa534ed664 Mon Sep 17 00:00:00 2001 From: Markus Heiser Date: Sat, 16 Oct 2021 21:56:08 +0200 Subject: [PATCH] [fix] engine - yahoo: rewrite and fix issues Languages are supported by mapping the language to a domain. If domain is not found in :py:obj:`lang2domain` URL ``.search.yahoo.com`` is used. Closes #3020 Signed-off-by: Markus Heiser --- searx/data/engines_languages.json | 4 +- searx/engines/yahoo.py | 184 +++++++++++++++--------------- 2 files changed, 95 insertions(+), 93 deletions(-) diff --git a/searx/data/engines_languages.json b/searx/data/engines_languages.json index 16e1de0e..ade3e146 100644 --- a/searx/data/engines_languages.json +++ b/searx/data/engines_languages.json @@ -28582,7 +28582,7 @@ "sv", "th", "tr", - "zh-CHS", - "zh-CHT" + "zh_chs", + "zh_cht" ] } \ No newline at end of file diff --git a/searx/engines/yahoo.py b/searx/engines/yahoo.py index 26a4469b..71f93225 100644 --- a/searx/engines/yahoo.py +++ b/searx/engines/yahoo.py @@ -3,9 +3,18 @@ Yahoo (Web) """ -from urllib.parse import unquote, urlencode +from urllib.parse import ( + unquote, + urlencode, +) from lxml import html -from searx.utils import extract_text, extract_url, match_language, eval_xpath + +from searx.utils import ( + eval_xpath_getindex, + eval_xpath_list, + extract_text, + match_language, +) # about about = { @@ -21,29 +30,75 @@ about = { categories = ['general'] paging = True time_range_support = True +supported_languages_url = 'https://search.yahoo.com/preferences/languages' +"""Supported languages are read from Yahoo preference page.""" -# search-url -base_url = 'https://search.yahoo.com/' -search_url = 'search?{query}&b={offset}&fl=1&vl=lang_{lang}' -search_url_with_time = 'search?{query}&b={offset}&fl=1&vl=lang_{lang}&age={age}&btf={btf}&fr2=time' +time_range_dict = { + 'day': ('1d', 'd'), + 'week': ('1w', 'w'), + 'month': ('1m', 'm'), +} -supported_languages_url = 'https://search.yahoo.com/web/advanced' +language_aliases = { + 'zh-HK': 'zh_chs', + 'zh-CN': 'zh_chs', # dead since 2015 / routed to hk.search.yahoo.com + 'zh-TW': 'zh_cht', +} -# specific xpath variables -results_xpath = "//div[contains(concat(' ', normalize-space(@class), ' '), ' Sr ')]" -url_xpath = './/h3/a/@href' -title_xpath = './/h3/a' -content_xpath = './/div[contains(@class, "compText")]' -suggestion_xpath = "//div[contains(concat(' ', normalize-space(@class), ' '), ' AlsoTry ')]//a" +lang2domain = { + 'zh_chs': 'hk.search.yahoo.com', + 'zh_cht': 'tw.search.yahoo.com', + 'en': 'search.yahoo.com', -time_range_dict = {'day': ['1d', 'd'], - 'week': ['1w', 'w'], - 'month': ['1m', 'm']} + 'bg': 'search.yahoo.com', + 'cs': 'search.yahoo.com', + 'da': 'search.yahoo.com', + 'el': 'search.yahoo.com', + 'et': 'search.yahoo.com', + 'he': 'search.yahoo.com', + 'hr': 'search.yahoo.com', + 'ja': 'search.yahoo.com', + 'ko': 'search.yahoo.com', + 'sk': 'search.yahoo.com', + 'sl': 'search.yahoo.com', -language_aliases = {'zh-CN': 'zh-CHS', 'zh-TW': 'zh-CHT', 'zh-HK': 'zh-CHT'} +} + + +def _get_language(params): + + lang = language_aliases.get(params['language']) + if lang is None: + lang = match_language( + params['language'], supported_languages, language_aliases + ) + lang = lang.split('-')[0] + return lang + + +def request(query, params): + """build request""" + offset = (params['pageno'] - 1) * 7 + 1 + lang = _get_language(params) + age, btf = time_range_dict.get(params['time_range'], ('', '')) + + args = urlencode({ + 'p': query, + 'ei': 'UTF-8', + 'fl': 1, + 'vl': 'lang_' + lang, + 'btf': btf, + 'fr2': 'time', + 'age': age, + 'b': offset, + 'xargs': 0, + }) + + domain = lang2domain.get(lang, '%s.search.yahoo.com' % lang) + params['url'] = 'https://%s/search?%s' % (domain, args) + return params -# remove yahoo-specific tracking-url def parse_url(url_string): endings = ['/RS', '/RK'] endpositions = [] @@ -61,86 +116,37 @@ def parse_url(url_string): return unquote(url_string[start:end]) -def _get_url(query, offset, language, time_range): - if time_range in time_range_dict: - return base_url + search_url_with_time.format(offset=offset, - query=urlencode({'p': query}), - lang=language, - age=time_range_dict[time_range][0], - btf=time_range_dict[time_range][1]) - return base_url + search_url.format(offset=offset, - query=urlencode({'p': query}), - lang=language) - - -def _get_language(params): - if params['language'] == 'all': - return 'en' - - language = match_language(params['language'], supported_languages, language_aliases) - if language not in language_aliases.values(): - language = language.split('-')[0] - language = language.replace('-', '_').lower() - - return language - - -# do search-request -def request(query, params): - if params['time_range'] and params['time_range'] not in time_range_dict: - return params - - offset = (params['pageno'] - 1) * 10 + 1 - language = _get_language(params) - - params['url'] = _get_url(query, offset, language, params['time_range']) - - # TODO required? - params['cookies']['sB'] = 'fl=1&vl=lang_{lang}&sh=1&rw=new&v=1'\ - .format(lang=language) - - return params - - -# get response from search-request def response(resp): results = [] - dom = html.fromstring(resp.text) - try: - results_num = int(eval_xpath(dom, '//div[@class="compPagination"]/span[last()]/text()')[0] - .split()[0].replace(',', '')) - results.append({'number_of_results': results_num}) - except: - pass - - # parse results - for result in eval_xpath(dom, results_xpath): - try: - url = parse_url(extract_url(eval_xpath(result, url_xpath), search_url)) - title = extract_text(eval_xpath(result, title_xpath)[0]) - except: + for result in eval_xpath_list(dom, '//div[contains(@class,"algo-sr")]'): + url = eval_xpath_getindex(result, './/h3/a/@href', 0, default=None) + if url is None: continue + url = parse_url(url) - content = extract_text(eval_xpath(result, content_xpath)[0]) + title = eval_xpath_getindex(result, './/h3/a', 0, default=None) + if title is None: + continue + offset = len(extract_text(title.xpath('span'))) + title = extract_text(title)[offset:] + + content = eval_xpath_getindex( + result, './/div[contains(@class, "compText")]', 0, default='' + ) + if content: + content = extract_text(content) # append result results.append({'url': url, 'title': title, 'content': content}) - # if no suggestion found, return results - suggestions = eval_xpath(dom, suggestion_xpath) - if not suggestions: - return results - - # parse suggestion - for suggestion in suggestions: + for suggestion in eval_xpath_list(dom, '//div[contains(@class, "AlsoTry")]'): # append suggestion results.append({'suggestion': extract_text(suggestion)}) - # return results return results @@ -148,13 +154,9 @@ def response(resp): def _fetch_supported_languages(resp): supported_languages = [] dom = html.fromstring(resp.text) - options = eval_xpath(dom, '//div[@id="yschlang"]/span/label/input') - for option in options: - code_parts = eval_xpath(option, './@value')[0][5:].split('_') - if len(code_parts) == 2: - code = code_parts[0] + '-' + code_parts[1].upper() - else: - code = code_parts[0] - supported_languages.append(code) + offset = len('lang_') + + for val in eval_xpath_list(dom, '//div[contains(@class, "lang-item")]/input/@value'): + supported_languages.append(val[offset:]) return supported_languages