From 3e3672e0790266fc7f2482fdd854d7789a915d4d Mon Sep 17 00:00:00 2001
From: jibe-b
Date: Sat, 23 Sep 2017 14:16:06 +0200
Subject: [PATCH] [add] arxiv engine

---
 searx/engines/arxiv.py           | 90 ++++++++++++++++++++++++++++++++
 searx/settings.yml               |  6 +++
 tests/unit/engines/test_arxiv.py | 58 ++++++++++++++++++++
 3 files changed, 154 insertions(+)
 create mode 100644 searx/engines/arxiv.py
 create mode 100644 tests/unit/engines/test_arxiv.py

diff --git a/searx/engines/arxiv.py b/searx/engines/arxiv.py
new file mode 100644
--- /dev/null
+++ b/searx/engines/arxiv.py
@@ -0,0 +1,90 @@
+#!/usr/bin/env python
+
+"""
+ ArXiV (Scientific preprints)
+ @website     https://arxiv.org
+ @provide-api yes (export.arxiv.org/api/query)
+ @using-api   yes
+ @results     XML-RSS
+ @stable      yes
+ @parse       url, title, publishedDate, content
+ More info on api: https://arxiv.org/help/api/user-manual
+"""
+
+from lxml import html
+from datetime import datetime
+from searx.url_utils import urlencode
+
+
+categories = ['science']
+
+base_url = 'http://export.arxiv.org/api/query?search_query=all:'\
+           + '{query}&start={offset}&max_results={number_of_results}'
+
+# engine dependent config
+number_of_results = 10
+
+
+def request(query, params):
+    """Build the arXiv API URL for the requested result page.
+
+    query  -- search terms entered by the user
+    params -- request parameters; 'pageno' is read and 'url' is set
+
+    Returns the same params mapping with 'url' filled in.
+    """
+    offset = (params['pageno'] - 1) * number_of_results
+
+    # Percent-encode the user input so that characters such as '&', '#'
+    # or spaces cannot break or extend the query string.
+    encoded_query = urlencode({'q': query})[len('q='):]
+
+    string_args = dict(query=encoded_query,
+                       offset=offset,
+                       number_of_results=number_of_results)
+
+    params['url'] = base_url.format(**string_args)
+
+    return params
+
+
+def response(resp):
+    """Parse an arXiv Atom feed into a list of result dicts.
+
+    resp -- response object whose 'text' attribute holds the feed
+
+    Each result has the keys 'url', 'title', 'publishedDate' and 'content'.
+    """
+    results = []
+
+    search_results = html.fromstring(resp.text.encode('utf-8')).xpath('//entry')
+
+    for entry in search_results:
+        title = entry.xpath('.//title')[0].text
+
+        url = entry.xpath('.//id')[0].text
+
+        # an empty summary element has no text; keep content a string
+        content = entry.xpath('.//summary')[0].text or ''
+
+        # If a doi is available, add it to the snippet. A link element has
+        # no text, so the value is read from its href attribute.
+        doi_links = entry.xpath('.//link[@title="doi"]')
+        doi = doi_links[0].get('href') if doi_links else None
+        if doi:
+            content = 'DOI: ' + doi + ' Abstract: ' + content
+
+        if len(content) > 300:
+            content = content[0:300] + "..."
+        # TODO: center snippet on query term
+
+        publishedDate = datetime.strptime(entry.xpath('.//published')[0].text, '%Y-%m-%dT%H:%M:%SZ')
+
+        res_dict = {'url': url,
+                    'title': title,
+                    'publishedDate': publishedDate,
+                    'content': content}
+
+        results.append(res_dict)
+
+    return results
diff --git a/searx/settings.yml b/searx/settings.yml
index 6f44a3b8..54b2b2e6 100644
--- a/searx/settings.yml
+++ b/searx/settings.yml
@@ -60,6 +60,12 @@ engines:
     disabled : True
     shortcut : ai
 
+  - name : arxiv
+    engine : arxiv
+    shortcut : arx
+    categories : science
+    timeout : 4.0
+
   - name : base
     engine : base
     shortcut : bs
diff --git a/tests/unit/engines/test_arxiv.py b/tests/unit/engines/test_arxiv.py
new file mode 100644
--- /dev/null
+++ b/tests/unit/engines/test_arxiv.py
@@ -0,0 +1,58 @@
+# -*- coding: utf-8 -*-
+from collections import defaultdict
+import mock
+from searx.engines import arxiv
+from searx.testing import SearxTestCase
+
+
+class TestArxivEngine(SearxTestCase):
+
+    def test_request(self):
+        query = 'test_query'
+        dicto = defaultdict(dict)
+        dicto['pageno'] = 1
+        params = arxiv.request(query, dicto)
+        self.assertIn('url', params)
+        self.assertIn('export.arxiv.org/api/', params['url'])
+
+    def test_response(self):
+        self.assertRaises(AttributeError, arxiv.response, None)
+        self.assertRaises(AttributeError, arxiv.response, [])
+        self.assertRaises(AttributeError, arxiv.response, '')
+        self.assertRaises(AttributeError, arxiv.response, '[]')
+
+        response = mock.Mock(text='''<?xml version="1.0" encoding="UTF-8"?>
+<feed xmlns="http://www.w3.org/2005/Atom"></feed>''')
+        self.assertEqual(arxiv.response(response), [])
+
+        xml_mock = '''<?xml version="1.0" encoding="UTF-8"?>
+<feed xmlns="http://www.w3.org/2005/Atom">
+  <title type="html">ArXiv Query: search_query=all:test_query&amp;id_list=&amp;start=0&amp;max_results=1</title>
+  <id>http://arxiv.org/api/1</id>
+  <updated>2000-01-21T00:00:00-01:00</updated>
+  <opensearch:totalResults xmlns:opensearch="http://a9.com/-/spec/opensearch/1.1/">1</opensearch:totalResults>
+  <opensearch:startIndex xmlns:opensearch="http://a9.com/-/spec/opensearch/1.1/">0</opensearch:startIndex>
+  <opensearch:itemsPerPage xmlns:opensearch="http://a9.com/-/spec/opensearch/1.1/">1</opensearch:itemsPerPage>
+  <entry>
+    <id>http://arxiv.org/1</id>
+    <updated>2000-01-01T00:00:01Z</updated>
+    <published>2000-01-01T00:00:01Z</published>
+    <title>Mathematical proof.</title>
+    <summary>Mathematical formula.</summary>
+    <author>
+      <name>A. B.</name>
+    </author>
+    <link href="http://arxiv.org/1" rel="alternate" type="text/html"/>
+    <link title="pdf" href="http://arxiv.org/1" rel="related" type="application/pdf"/>
+    <category term="math.QA" scheme="http://arxiv.org/schemas/atom"/>
+    <category term="1" scheme="http://arxiv.org/schemas/atom"/>
+  </entry>
+</feed>
+'''
+
+        response = mock.Mock(text=xml_mock)
+        results = arxiv.response(response)
+        self.assertEqual(type(results), list)
+        self.assertEqual(len(results), 1)
+        self.assertEqual(results[0]['title'], 'Mathematical proof.')
+        self.assertEqual(results[0]['content'], 'Mathematical formula.')