From f231d79a5ddd2ff211e401fe6ea2250325df116f Mon Sep 17 00:00:00 2001 From: Markus Heiser Date: Fri, 4 Mar 2022 22:00:59 +0100 Subject: [PATCH] [fix] engine: Semantic Scholar (Science) // rework & fix Signed-off-by: Markus Heiser --- searx/engines/semantic_scholar.py | 58 +++++++++++++++++++++++-------- searx/settings.yml | 7 ---- 2 files changed, 44 insertions(+), 21 deletions(-) diff --git a/searx/engines/semantic_scholar.py b/searx/engines/semantic_scholar.py index 2fccf152..aa750ecc 100644 --- a/searx/engines/semantic_scholar.py +++ b/searx/engines/semantic_scholar.py @@ -1,15 +1,23 @@ # SPDX-License-Identifier: AGPL-3.0-or-later -""" - Semantic Scholar (Science) +# lint: pylint +"""Semantic Scholar (Science) """ from json import dumps, loads - -from searx import logger +from datetime import datetime -logger = logger.getChild('semantic scholar') +about = { + "website": 'https://www.semanticscholar.org/', + "wikidata_id": 'Q22908627', + "official_api_documentation": 'https://api.semanticscholar.org/', + "use_official_api": True, + "require_api_key": False, + "results": 'JSON', +} +paging = True search_url = 'https://www.semanticscholar.org/api/1/search' +paper_url = 'https://www.semanticscholar.org/paper' def request(query, params): @@ -35,15 +43,37 @@ def request(query, params): def response(resp): res = loads(resp.text) results = [] - for result in res['results']: - if 'primaryPaperLink' not in result or 'url' not in result['primaryPaperLink']: - logger.debug('ignore result because of missing link: %s', result) - continue - results.append({ - 'url': result['primaryPaperLink']['url'], - 'title': result['title']['text'], - 'content': result['paperAbstractTruncated'] - }) + for result in res['results']: + item = {} + metadata = [] + + url = result.get('primaryPaperLink', {}).get('url') + if not url and result.get('links'): + url = result.get('links')[0] + if not url: + alternatePaperLinks = result.get('alternatePaperLinks') + if alternatePaperLinks: + url = alternatePaperLinks[0].get('url') + if not url: + url = paper_url + '/%s' % result['id'] + + item['url'] = url + + item['title'] = result['title']['text'] + item['content'] = result['paperAbstract']['text'] + + metadata = result.get('fieldsOfStudy') or [] + venue = result.get('venue', {}).get('text') + if venue: + metadata.append(venue) + if metadata: + item['metadata'] = ', '.join(metadata) + + pubDate = result.get('pubDate') + if pubDate: + item['publishedDate'] = datetime.strptime(pubDate, "%Y-%m-%d") + + results.append(item) return results diff --git a/searx/settings.yml b/searx/settings.yml index 8999194c..f4fd30fa 100644 --- a/searx/settings.yml +++ b/searx/settings.yml @@ -1133,13 +1133,6 @@ engines: disabled : True shortcut : se categories : science - about: - website: https://www.semanticscholar.org/ - wikidata_id: Q22908627 - official_api_documentation: https://api.semanticscholar.org/ - use_official_api: false - require_api_key: false - results: JSON # Spotify needs API credentials # - name : spotify