From 092e15cc1db2faa6c2fffb6163c5147d386bf948 Mon Sep 17 00:00:00 2001
From: jibe-b
Date: Tue, 29 Mar 2016 11:59:16 +0200
Subject: [PATCH] [enh] Add BASE engine in category Science. Basic and advanced search.

---
Review notes (this section is ignored by git-am):

 * settings.yml: the merge-conflict markers that had been committed around
   the new "base" entry are removed; the "arch linux wiki" entry stays
   untouched and "base" is added after it.
 * base.py, request(): shortcuts are translated in a single regex pass that
   only matches at the start of a word. The former per-key re.sub() loop
   rewrote 'date:' again inside 'hdate:' / 'dchdate:' / 'dcdate:'.
 * base.py, response(): url/title/date are reset for every entry (they used
   to leak from the previous entry or raise NameError), fields without a
   name attribute or without text are skipped, entries without link or
   title are dropped, and date parsing only catches ValueError.
 * The leading whitespace of the context lines in settings.yml and
   webapp.py was reconstructed; check it against the tree before applying.

 searx/engines/base.py | 143 ++++++++++++++++++++++++++++++++++++++++++
 searx/settings.yml    |   4 ++
 searx/webapp.py       |  24 +++++----
 3 files changed, 161 insertions(+), 10 deletions(-)
 create mode 100755 searx/engines/base.py

diff --git a/searx/engines/base.py b/searx/engines/base.py
new file mode 100755
index 00000000..66491d39
--- /dev/null
+++ b/searx/engines/base.py
@@ -0,0 +1,143 @@
+#!/usr/bin/env python
+
+"""
+ BASE (Scholar publications)
+
+ @website     https://base-search.net
+ @provide-api yes with authorization (https://api.base-search.net/)
+
+ @using-api   yes
+ @results     XML
+ @stable      ?
+ @parse       url, title, publishedDate, content
+ More info on api: http://base-search.net/about/download/base_interface.pdf
+"""
+
+from lxml import etree
+from urllib import urlencode
+from searx.utils import searx_useragent
+from cgi import escape
+from datetime import datetime
+import re
+
+
+categories = ['science']
+
+base_url = 'https://api.base-search.net/cgi-bin/BaseHttpSearchInterface.fcgi'\
+           + '?func=PerformSearch&{query}&boost=oa&hits={hits}&offset={offset}'
+
+# engine dependent config
+paging = True
+number_of_results = 10
+
+# shortcuts for advanced search
+shorcut_dict = {
+    # user-friendly keywords
+    'format:': 'dcformat:',
+    'author:': 'dccreator:',
+    'collection:': 'dccollection:',
+    'hdate:': 'dchdate:',
+    'contributor:': 'dccontributor:',
+    'coverage:': 'dccoverage:',
+    'date:': 'dcdate:',
+    'abstract:': 'dcdescription:',
+    'urls:': 'dcidentifier:',
+    'language:': 'dclanguage:',
+    'publisher:': 'dcpublisher:',
+    'relation:': 'dcrelation:',
+    'rights:': 'dcrights:',
+    'source:': 'dcsource:',
+    'subject:': 'dcsubject:',
+    'title:': 'dctitle:',
+    'type:': 'dcdctype:'
+}
+
+# matches a shortcut at the start of a word only, so that 'date:' is not
+# rewritten again inside 'hdate:' or inside an already translated 'dcdate:'
+shortcut_re = re.compile(r'\b(?:' + '|'.join(map(re.escape, shorcut_dict)) + ')')
+
+# date formats returned by the BASE API, tried in this order
+date_formats = ['%Y-%m-%dT%H:%M:%SZ', '%Y-%m-%d', '%Y-%m', '%Y']
+
+# maximum number of characters of the description shown in a result
+content_max_length = 300
+
+
+def request(query, params):
+    # replace shortcuts with API advanced search keywords in a single pass
+    query = shortcut_re.sub(lambda match: shorcut_dict[match.group(0)], query)
+
+    # basic search
+    offset = (params['pageno'] - 1) * number_of_results
+
+    string_args = dict(query=urlencode({'query': query}),
+                       offset=offset,
+                       hits=number_of_results)
+
+    params['url'] = base_url.format(**string_args)
+
+    params['headers']['User-Agent'] = searx_useragent()
+    return params
+
+
+def parse_date(date):
+    # dates returned by the BASE API come in several formats
+    if not date:
+        return None
+
+    for date_format in date_formats:
+        try:
+            return datetime.strptime(date, date_format)
+        except ValueError:
+            pass
+
+    return None
+
+
+def response(resp):
+    results = []
+
+    search_results = etree.XML(resp.content)
+
+    for entry in search_results.xpath('./result/doc'):
+        # reset every field so that nothing leaks in from the previous entry
+        url = None
+        title = None
+        date = None
+        content = "No description available"
+
+        for item in entry:
+            name = item.attrib.get("name")
+            text = item.text
+            if not text:
+                continue
+
+            if name == "dcdate":
+                date = text
+
+            elif name == "dctitle":
+                title = text
+
+            elif name == "dclink":
+                url = text
+
+            elif name == "dcdescription":
+                content = escape(text[:content_max_length])
+                if len(text) > content_max_length:
+                    content += "..."
+
+        # a result without link or title can not be displayed
+        if url is None or title is None:
+            continue
+
+        res_dict = {'url': url,
+                    'title': title,
+                    'content': content}
+
+        publishedDate = parse_date(date)
+        if publishedDate is not None:
+            res_dict['publishedDate'] = publishedDate
+
+        results.append(res_dict)
+
+    return results
diff --git a/searx/settings.yml b/searx/settings.yml
index 5ef74d95..994106bf 100644
--- a/searx/settings.yml
+++ b/searx/settings.yml
@@ -34,9 +34,13 @@ outgoing: # communication with search engines
 #        - 1.1.1.2
 
 engines:
   - name : arch linux wiki
     engine : archlinux
     shortcut : al
+
+  - name : base
+    engine : base
+    shortcut : bs
 
   - name : wikipedia
     engine : mediawiki
diff --git a/searx/webapp.py b/searx/webapp.py
index 66ba65a2..2b665d96 100644
--- a/searx/webapp.py
+++ b/searx/webapp.py
@@ -408,17 +408,21 @@ def index():
 
         # TODO, check if timezone is calculated right
         if 'publishedDate' in result:
-            result['pubdate'] = result['publishedDate'].strftime('%Y-%m-%d %H:%M:%S%z')
-            if result['publishedDate'].replace(tzinfo=None) >= datetime.now() - timedelta(days=1):
-                timedifference = datetime.now() - result['publishedDate'].replace(tzinfo=None)
-                minutes = int((timedifference.seconds / 60) % 60)
-                hours = int(timedifference.seconds / 60 / 60)
-                if hours == 0:
-                    result['publishedDate'] = gettext(u'{minutes} minute(s) ago').format(minutes=minutes)
-                else:
-                    result['publishedDate'] = gettext(u'{hours} hour(s), {minutes} minute(s) ago').format(hours=hours, minutes=minutes)  # noqa
+            try:  # test if publishedDate >= 1900 (datetime module bug)
+                result['pubdate'] = result['publishedDate'].strftime('%Y-%m-%d %H:%M:%S%z')
+            except ValueError:
+                result['publishedDate'] = None
             else:
-                result['publishedDate'] = format_date(result['publishedDate'])
+                if result['publishedDate'].replace(tzinfo=None) >= datetime.now() - timedelta(days=1):
+                    timedifference = datetime.now() - result['publishedDate'].replace(tzinfo=None)
+                    minutes = int((timedifference.seconds / 60) % 60)
+                    hours = int(timedifference.seconds / 60 / 60)
+                    if hours == 0:
+                        result['publishedDate'] = gettext(u'{minutes} minute(s) ago').format(minutes=minutes)
+                    else:
+                        result['publishedDate'] = gettext(u'{hours} hour(s), {minutes} minute(s) ago').format(hours=hours, minutes=minutes)  # noqa
+                else:
+                    result['publishedDate'] = format_date(result['publishedDate'])
 
     if search.request_data.get('format') == 'json':
         return Response(json.dumps({'query': search.query,