fix Microsoft Academic engine

This commit is contained in:
Noémi Ványi 2021-03-15 20:21:28 +01:00
parent f97b4ff7b6
commit 8158d8654a
1 changed file with 25 additions and 32 deletions

View File

@@ -3,10 +3,7 @@
Microsoft Academic (Science) Microsoft Academic (Science)
""" """
from datetime import datetime from json import dumps, loads
from json import loads
from uuid import uuid4
from urllib.parse import urlencode
from searx.utils import html_to_text from searx.utils import html_to_text
# about # about
@@ -21,26 +18,25 @@ about = {
categories = ['images'] categories = ['images']
paging = True paging = True
result_url = 'https://academic.microsoft.com/api/search/GetEntityResults?{query}' search_url = 'https://academic.microsoft.com/api/search'
_paper_url = 'https://academic.microsoft.com/paper/{id}/reference'
def request(query, params): def request(query, params):
correlation_id = uuid4() params['url'] = search_url
msacademic = uuid4()
time_now = datetime.now()
params['url'] = result_url.format(query=urlencode({'correlationId': correlation_id}))
params['cookies']['msacademic'] = str(msacademic)
params['cookies']['ai_user'] = 'vhd0H|{now}'.format(now=str(time_now))
params['method'] = 'POST' params['method'] = 'POST'
params['data'] = { params['headers']['content-type'] = 'application/json; charset=utf-8'
'Query': '@{query}@'.format(query=query), params['data'] = dumps({
'Limit': 10, 'query': query,
'Offset': params['pageno'] - 1, 'queryExpression': '',
'Filters': '', 'filters': [],
'OrderBy': '', 'orderBy': 0,
'SortAscending': False, 'skip': (params['pageno'] - 1) * 10,
} 'sortAscending': True,
'take': 10,
'includeCitationContexts': False,
'profileId': '',
})
return params return params
@@ -51,10 +47,13 @@ def response(resp):
if not response_data: if not response_data:
return results return results
for result in response_data['results']: for result in response_data['pr']:
url = _get_url(result) if 'dn' not in result['paper']:
title = result['e']['dn'] continue
content = _get_content(result)
title = result['paper']['dn']
content = _get_content(result['paper'])
url = _paper_url.format(id=result['paper']['id'])
results.append({ results.append({
'url': url, 'url': url,
'title': html_to_text(title), 'title': html_to_text(title),
@@ -64,15 +63,9 @@ def response(resp):
return results return results
def _get_url(result):
if 's' in result['e']:
return result['e']['s'][0]['u']
return 'https://academic.microsoft.com/#/detail/{pid}'.format(pid=result['id'])
def _get_content(result): def _get_content(result):
if 'd' in result['e']: if 'd' in result:
content = result['e']['d'] content = result['d']
if len(content) > 300: if len(content) > 300:
return content[:300] + '...' return content[:300] + '...'
return content return content