From 45d15bd6f0d64c9e10dde31cec239936ee553dd6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fran=C3=A7ois=20Revol?= Date: Fri, 6 Jan 2017 00:22:43 +0100 Subject: [PATCH] Add framalibre engine framalibre.org is a catalogue of Free Software, edited by Framasoft. For now we pass the thumbnail as img_src as it doesn't seem to be used for IT... --- searx/engines/framalibre.py | 72 ++++++++++++++++++ searx/settings.yml | 5 ++ tests/unit/engines/test_framalibre.py | 103 ++++++++++++++++++++++++++ 3 files changed, 180 insertions(+) create mode 100644 searx/engines/framalibre.py create mode 100644 tests/unit/engines/test_framalibre.py diff --git a/searx/engines/framalibre.py b/searx/engines/framalibre.py new file mode 100644 index 00000000..e8d1d8aa --- /dev/null +++ b/searx/engines/framalibre.py @@ -0,0 +1,72 @@ +""" + FramaLibre (It) + + @website https://framalibre.org/ + @provide-api no + + @using-api no + @results HTML + @stable no (HTML can change) + @parse url, title, content, thumbnail, img_src +""" + +from urlparse import urljoin +from cgi import escape +from urllib import urlencode +from lxml import html +from searx.engines.xpath import extract_text +from dateutil import parser + +# engine dependent config +categories = ['it'] +paging = True + +# search-url +base_url = 'https://framalibre.org/' +search_url = base_url + 'recherche-par-crit-res?{query}&page={offset}' + +# specific xpath variables +results_xpath = '//div[@class="nodes-list-row"]/div[contains(@typeof,"sioc:Item")]' +link_xpath = './/h3[@class="node-title"]/a[@href]' +thumbnail_xpath = './/img[@class="media-object img-responsive"]/@src' +content_xpath = './/div[@class="content"]//p' + + +# do search-request +def request(query, params): + offset = (params['pageno'] - 1) + params['url'] = search_url.format(query=urlencode({'keys': query}), + offset=offset) + + return params + + +# get response from search-request +def response(resp): + results = [] + + dom = html.fromstring(resp.text) + + # parse results + for result in dom.xpath(results_xpath): + link = result.xpath(link_xpath)[0] + href = urljoin(base_url, link.attrib.get('href')) + # there's also a span (class="rdf-meta element-hidden" property="dc:title")'s content property for this... + title = escape(extract_text(link)) + thumbnail_tags = result.xpath(thumbnail_xpath) + thumbnail = None + if len(thumbnail_tags) > 0: + thumbnail = extract_text(thumbnail_tags[0]) + if thumbnail[0] == '/': + thumbnail = base_url + thumbnail + content = escape(extract_text(result.xpath(content_xpath))) + + # append result + results.append({'url': href, + 'title': title, + 'thumbnail': thumbnail, + 'img_src': thumbnail, + 'content': content}) + + # return results + return results diff --git a/searx/settings.yml b/searx/settings.yml index e00cce3d..580e4f11 100644 --- a/searx/settings.yml +++ b/searx/settings.yml @@ -465,6 +465,11 @@ engines: shortcut : scc disabled : True + - name : framalibre + engine : framalibre + shortcut : frl + disabled : True + # - name : searx # engine : searx_engine # shortcut : se diff --git a/tests/unit/engines/test_framalibre.py b/tests/unit/engines/test_framalibre.py new file mode 100644 index 00000000..85099637 --- /dev/null +++ b/tests/unit/engines/test_framalibre.py @@ -0,0 +1,103 @@ +# -*- coding: utf-8 -*- +from collections import defaultdict +import mock +from searx.engines import framalibre +from searx.testing import SearxTestCase + + +class TestFramalibreEngine(SearxTestCase): + + def test_request(self): + query = 'test_query' + dicto = defaultdict(dict) + dicto['pageno'] = 0 + params = framalibre.request(query, dicto) + self.assertTrue('url' in params) + self.assertTrue(query in params['url']) + self.assertTrue('framalibre.org' in params['url']) + + def test_response(self): + self.assertRaises(AttributeError, framalibre.response, None) + self.assertRaises(AttributeError, framalibre.response, []) + self.assertRaises(AttributeError, framalibre.response, '') + self.assertRaises(AttributeError, framalibre.response, '[]') + + response = mock.Mock(text='{}') + self.assertEqual(framalibre.response(response), []) + + response = mock.Mock(text='{"data": []}') + self.assertEqual(framalibre.response(response), []) + + html = u""" +
+
+
+
+ +
+
+

Gogs

+ +
+
+ +
+
+
+
+
+
+
+
+
+
+
+
+
+
+

Gogs est une interface web basée sur git et une bonne alternative à GitHub.

+
+
+
+
+ +
+
+ """ + response = mock.Mock(text=html) + results = framalibre.response(response) + self.assertEqual(type(results), list) + self.assertEqual(len(results), 1) + self.assertEqual(results[0]['title'], 'Gogs') + self.assertEqual(results[0]['url'], + 'https://framalibre.org/content/gogs') + self.assertEqual(results[0]['content'], + u"Gogs est une interface web basée sur git et une bonne alternative à GitHub.")