From 988cf38196cc26aa1473c7ece024ee178a856655 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?No=C3=A9mi=20V=C3=A1nyi?=
Date: Sat, 17 Feb 2018 21:36:34 +0100
Subject: [PATCH] fix Microsoft Academic engine

---
Reviewer note: the new-file hunk below was edited by hand after the patch was
generated (category default, paging offset, empty source list, docstrings).
The hunk header and diffstat were updated to match, but the blob id on the
"index" line still refers to the originally generated content.

 searx/engines/microsoft_academic.py | 86 +++++++++++++++++++++++++++++
 searx/settings.yml                  | 10 +--
 2 files changed, 87 insertions(+), 9 deletions(-)
 create mode 100644 searx/engines/microsoft_academic.py

diff --git a/searx/engines/microsoft_academic.py b/searx/engines/microsoft_academic.py
new file mode 100644
index 00000000..9387b08d
--- /dev/null
+++ b/searx/engines/microsoft_academic.py
@@ -0,0 +1,86 @@
+"""
+Microsoft Academic (Science)
+
+@website https://academic.microsoft.com
+@provide-api yes
+@using-api no
+@results JSON
+@stable no
+@parse url, title, content
+"""
+
+from datetime import datetime
+from json import loads
+from uuid import uuid4
+
+from searx.url_utils import urlencode
+from searx.utils import html_to_text
+
+# module default; matches the category used by the settings.yml entry
+categories = ['science']
+paging = True
+# results requested per page, also the step of the 'Offset' field
+page_size = 10
+result_url = 'https://academic.microsoft.com/api/search/GetEntityResults?{query}'
+
+
+def request(query, params):
+    """Fill ``params`` with the POST request for one result page and return it."""
+    correlation_id = uuid4()
+    msacademic = uuid4()
+    time_now = datetime.now()
+
+    params['url'] = result_url.format(query=urlencode({'correlationId': correlation_id}))
+    params['cookies']['msacademic'] = str(msacademic)
+    params['cookies']['ai_user'] = 'vhd0H|{now}'.format(now=str(time_now))
+    params['method'] = 'POST'
+    params['data'] = {
+        'Query': '@{query}@'.format(query=query),
+        'Limit': page_size,
+        # 'Offset' is assumed to count results, as in the replaced json_engine
+        # entry (limit and page_size were both 8), so skip whole pages
+        'Offset': (params['pageno'] - 1) * page_size,
+        'Filters': '',
+        'OrderBy': '',
+        'SortAscending': False,
+    }
+
+    return params
+
+
+def response(resp):
+    """Convert the JSON body of ``resp`` into a list of url/title/content dicts."""
+    results = []
+    response_data = loads(resp.text)
+
+    for result in response_data['results']:
+        url = _get_url(result)
+        title = result['e']['dn']
+        content = _get_content(result)
+        results.append({
+            'url': url,
+            'title': html_to_text(title),
+            'content': html_to_text(content),
+        })
+
+    return results
+
+
+def _get_url(result):
+    """Return the first source URL of ``result``, else its detail page URL."""
+    sources = result['e'].get('s')
+    # 's' may be missing or empty; both cases fall back to the detail page
+    if sources:
+        return sources[0]['u']
+    return 'https://academic.microsoft.com/#/detail/{pid}'.format(pid=result['id'])
+
+
+def _get_content(result):
+    """Return the 'd' text of ``result``, cut to 300 characters plus '...'."""
+    if 'd' in result['e']:
+        content = result['e']['d']
+        if len(content) > 300:
+            return content[:300] + '...'
+        return content
+
+    return ''
diff --git a/searx/settings.yml b/searx/settings.yml
index e819eada..3903bbf1 100644
--- a/searx/settings.yml
+++ b/searx/settings.yml
@@ -398,15 +398,7 @@ engines:
     shortcut : lo

   - name : microsoft academic
-    engine : json_engine
-    paging : True
-    search_url : https://academic.microsoft.com/api/search/GetEntityResults?query=%40{query}%40&filters=&offset={pageno}&limit=8&correlationId=undefined
-    results_query : results
-    url_query : u
-    title_query : dn
-    content_query : d
-    page_size : 8
-    first_page_num : 0
+    engine : microsoft_academic
     categories : science
     shortcut : ma
