diff --git a/Makefile b/Makefile index fbf5f94b..4e451b7c 100644 --- a/Makefile +++ b/Makefile @@ -195,7 +195,8 @@ PYLINT_FILES=\ searx/engines/google_videos.py \ searx/engines/google_images.py \ searx/engines/mediathekviewweb.py \ - utils/fetch_external_bangs.py + utils/fetch_external_bangs.py \ + searx/engines/google_scholar.py test.pylint: pyenvinstall $(call cmd,pylint,$(PYLINT_FILES)) diff --git a/searx/data/engines_languages.json b/searx/data/engines_languages.json index 324e4f3a..e6b41480 100644 --- a/searx/data/engines_languages.json +++ b/searx/data/engines_languages.json @@ -25176,6 +25176,146 @@ "name": "\u4e2d\u6587 (\u7e41\u9ad4)" } }, + "google scholar": { + "af": { + "name": "Afrikaans" + }, + "ar": { + "name": "\u0627\u0644\u0639\u0631\u0628\u064a\u0629" + }, + "be": { + "name": "\u0431\u0435\u043b\u0430\u0440\u0443\u0441\u043a\u0430\u044f" + }, + "bg": { + "name": "\u0431\u044a\u043b\u0433\u0430\u0440\u0441\u043a\u0438" + }, + "ca": { + "name": "catal\u00e0" + }, + "cs": { + "name": "\u010de\u0161tina" + }, + "da": { + "name": "dansk" + }, + "de": { + "name": "Deutsch" + }, + "el": { + "name": "\u0395\u03bb\u03bb\u03b7\u03bd\u03b9\u03ba\u03ac" + }, + "en": { + "name": "English" + }, + "eo": { + "name": "esperanto" + }, + "es": { + "name": "espa\u00f1ol" + }, + "et": { + "name": "eesti" + }, + "fa": { + "name": "\u0641\u0627\u0631\u0633\u06cc" + }, + "fi": { + "name": "suomi" + }, + "fr": { + "name": "fran\u00e7ais" + }, + "hi": { + "name": "\u0939\u093f\u0928\u094d\u0926\u0940" + }, + "hr": { + "name": "hrvatski" + }, + "hu": { + "name": "magyar" + }, + "hy": { + "name": "\u0570\u0561\u0575\u0565\u0580\u0565\u0576" + }, + "id": { + "name": "Indonesia" + }, + "is": { + "name": "\u00edslenska" + }, + "it": { + "name": "italiano" + }, + "iw": { + "name": "\u05e2\u05d1\u05e8\u05d9\u05ea" + }, + "ja": { + "name": "\u65e5\u672c\u8a9e" + }, + "ko": { + "name": "\ud55c\uad6d\uc5b4" + }, + "lt": { + "name": "lietuvi\u0173" + }, + "lv": { + "name": "latvie\u0161u" + }, + "nl": { + "name": "Nederlands" + }, + "no": { + "name": "norsk" + }, + "pl": { + "name": "polski" + }, + "pt": { + "name": "portugu\u00eas" + }, + "ro": { + "name": "rom\u00e2n\u0103" + }, + "ru": { + "name": "\u0440\u0443\u0441\u0441\u043a\u0438\u0439" + }, + "sk": { + "name": "sloven\u010dina" + }, + "sl": { + "name": "sloven\u0161\u010dina" + }, + "sr": { + "name": "\u0441\u0440\u043f\u0441\u043a\u0438" + }, + "sv": { + "name": "svenska" + }, + "sw": { + "name": "Kiswahili" + }, + "th": { + "name": "\u0e44\u0e17\u0e22" + }, + "tl": { + "name": "Filipino" + }, + "tr": { + "name": "T\u00fcrk\u00e7e" + }, + "uk": { + "name": "\u0443\u043a\u0440\u0430\u0457\u043d\u0441\u044c\u043a\u0430" + }, + "vi": { + "name": "Ti\u1ebfng Vi\u1ec7t" + }, + "zh-CN": { + "name": "\u4e2d\u6587 (\u7b80\u4f53)" + }, + "zh-TW": { + "name": "\u4e2d\u6587 (\u7e41\u9ad4)" + } + }, "google videos": { "af": { "name": "Afrikaans" diff --git a/searx/engines/google_scholar.py b/searx/engines/google_scholar.py new file mode 100644 index 00000000..86bb0105 --- /dev/null +++ b/searx/engines/google_scholar.py @@ -0,0 +1,157 @@ +# SPDX-License-Identifier: AGPL-3.0-or-later +"""Google (Scholar) + +For detailed description of the *REST-full* API see: `Query Parameter +Definitions`_. + +.. _Query Parameter Definitions: + https://developers.google.com/custom-search/docs/xml_results#WebSearch_Query_Parameter_Definitions +""" + +# pylint: disable=invalid-name, missing-function-docstring + +from urllib.parse import urlencode +from datetime import datetime +from lxml import html +from searx import logger + +from searx.utils import ( + eval_xpath, + eval_xpath_list, + extract_text, +) + +from searx.engines.google import ( + get_lang_info, + time_range_dict, + detect_google_sorry, +) + +# pylint: disable=unused-import +from searx.engines.google import ( + supported_languages_url, + _fetch_supported_languages, +) +# pylint: enable=unused-import + +# about +about = { + "website": 'https://scholar.google.com', + "wikidata_id": 'Q494817', + "official_api_documentation": 'https://developers.google.com/custom-search', + "use_official_api": False, + "require_api_key": False, + "results": 'HTML', +} + +# engine dependent config +categories = ['science'] +paging = True +language_support = True +use_locale_domain = True +time_range_support = True +safesearch = False + +logger = logger.getChild('google scholar') + +def time_range_url(params): + """Returns a URL query component for a google-Scholar time range based on + ``params['time_range']``. Google-Scholar does only support ranges in years. + To have any effect, all the Searx ranges (*day*, *week*, *month*, *year*) + are mapped to *year*. If no range is set, an empty string is returned. + Example:: + + &as_ylo=2019 + """ + # as_ylo=2016&as_yhi=2019 + ret_val = '' + if params['time_range'] in time_range_dict: + ret_val= urlencode({'as_ylo': datetime.now().year -1 }) + return '&' + ret_val + + +def request(query, params): + """Google-Scholar search request""" + + offset = (params['pageno'] - 1) * 10 + lang_info = get_lang_info( + # pylint: disable=undefined-variable + + + # params, {}, language_aliases + + params, supported_languages, language_aliases + ) + # subdomain is: scholar.google.xy + lang_info['subdomain'] = lang_info['subdomain'].replace("www.", "scholar.") + + query_url = 'https://'+ lang_info['subdomain'] + '/scholar' + "?" + urlencode({ + 'q': query, + 'hl': lang_info['hl'], + 'lr': lang_info['lr'], + 'ie': "utf8", + 'oe': "utf8", + 'start' : offset, + }) + + query_url += time_range_url(params) + + logger.debug("query_url --> %s", query_url) + params['url'] = query_url + + logger.debug("HTTP header Accept-Language --> %s", lang_info['Accept-Language']) + params['headers']['Accept-Language'] = lang_info['Accept-Language'] + params['headers']['Accept'] = ( + 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8' + ) + + #params['google_subdomain'] = subdomain + return params + +def response(resp): + """Get response from google's search request""" + results = [] + + detect_google_sorry(resp) + + # which subdomain ? + # subdomain = resp.search_params.get('google_subdomain') + + # convert the text to dom + dom = html.fromstring(resp.text) + + # parse results + for result in eval_xpath_list(dom, '//div[@class="gs_ri"]'): + + title = extract_text(eval_xpath(result, './h3[1]//a')) + + if not title: + # this is a [ZITATION] block + continue + + url = eval_xpath(result, './h3[1]//a/@href')[0] + content = extract_text(eval_xpath(result, './div[@class="gs_rs"]')) or '' + + pub_info = extract_text(eval_xpath(result, './div[@class="gs_a"]')) + if pub_info: + content += "[%s]" % pub_info + + pub_type = extract_text(eval_xpath(result, './/span[@class="gs_ct1"]')) + if pub_type: + title = title + " " + pub_type + + results.append({ + 'url': url, + 'title': title, + 'content': content, + }) + + # parse suggestion + for suggestion in eval_xpath(dom, '//div[contains(@class, "gs_qsuggest_wrap")]//li//a'): + # append suggestion + results.append({'suggestion': extract_text(suggestion)}) + + for correction in eval_xpath(dom, '//div[@class="gs_r gs_pda"]/a'): + results.append({'correction': extract_text(correction)}) + + return results diff --git a/searx/settings.yml b/searx/settings.yml index 581f1934..e45afb59 100644 --- a/searx/settings.yml +++ b/searx/settings.yml @@ -535,25 +535,8 @@ engines: # android: *test_android - name : google scholar - engine : xpath - paging : True - search_url : https://scholar.google.com/scholar?start={pageno}&q={query}&hl=en&as_sdt=0,5&as_vis=1 - results_xpath : //div[contains(@class, "gs_r")]/div[@class="gs_ri"] - url_xpath : .//h3/a/@href - title_xpath : .//h3/a - content_xpath : .//div[@class="gs_rs"] - suggestion_xpath : //div[@id="gs_res_ccl_top"]//a/b - page_size : 10 - first_page_num : 0 - categories : science + engine : google_scholar shortcut : gos - about: - website: https://scholar.google.com/ - wikidata_id: Q494817 - official_api_documentation: - use_official_api: false - require_api_key: false - results: HTML - name : google play apps engine : xpath