Merge pull request #208 from pointhi/new_engines

add 1x.com engine, improve yacy-engine
2024-12-12 08:46:26 +01:00 · 2015-02-01 14:07:34 +01:00 · 2015-02-01 14:07:34 +01:00 · 03137eebd9
commit 03137eebd9
parent 4a20fc202e a605d0ae69
5 changed files with 157 additions and 13 deletions
--- a/searx/engines/www1x.py
+++ b/searx/engines/www1x.py
@ -0,0 +1,82 @@
 ## 1x (Images)
 #
 # @website     http://1x.com/
 # @provide-api no
 #
 # @using-api   no
 # @results     HTML
 # @stable      no (HTML can change)
 # @parse       url, title, thumbnail, img_src, content
 from urllib import urlencode
 from urlparse import urljoin
 from lxml import html
 import string
 import re
 # engine settings: results are listed under the image category and
 # paging is switched off for this engine
 categories = ['images']
 paging = False
 # address of the site and of the search backend; '{query}' is a
 # placeholder that is filled in when a request is built
 base_url = 'http://1x.com'
 search_url = base_url + '/backend/search.php?{query}'
 # do search-request
 def request(query, params):
    """Fill in the request URL for a 1x.com search.

    :param query: search term; it is url-encoded as the ``q`` argument
    :param params: request parameter dict, updated in place
    :returns: the same ``params`` dict with ``params['url']`` set
    """
    encoded_query = urlencode({'q': query})
    params['url'] = search_url.format(query=encoded_query)
    return params
 # get response from search-request
 def response(resp):
    """Turn the 1x.com search response into a list of image results.

    The response text is cut at anchor boundaries and every
    ``<a ...>...</a>`` fragment is parsed on its own with lxml. A fragment
    becomes a result only if its absolute url contains ``/photo/`` and it
    wraps an ``<img>`` that has a ``src`` attribute; all other anchors are
    skipped.

    :param resp: response object; the body is read from ``resp.text``
    :returns: list of result dicts with the keys ``url``, ``title``,
              ``img_src``, ``content``, ``thumbnail_src`` and ``template``
    """
    results = []

    # get links from result-text; the capturing group makes re.split keep
    # the '<a' and '</a>' delimiters as separate list items
    results_parts = re.split('(</a>|<a)', resp.text)

    cur_element = ''

    # iterate over link parts
    for result_part in results_parts:
        # an opening tag starts a fresh fragment
        if result_part == '<a':
            cur_element = result_part
            continue

        cur_element += result_part

        # keep collecting until the anchor is closed
        if result_part != '</a>':
            continue

        # fix xml-error (str method instead of string.replace, which only
        # exists in Python 2)
        cur_element = cur_element.replace('"></a>', '"/></a>')

        dom = html.fromstring(cur_element)
        link = dom.xpath('//a')[0]

        url = urljoin(base_url, link.attrib.get('href'))

        # check if the url points to a photo; this is tested before the
        # image lookup so that unrelated anchors cannot break the parsing
        if '/photo/' not in url:
            continue

        # a photo link without a usable preview image cannot be shown as an
        # image result, so skip it instead of failing with an IndexError
        images = link.xpath('.//img')
        if not images or not images[0].attrib.get('src'):
            continue

        title = link.attrib.get('title', '')
        thumbnail_src = urljoin(base_url, images[0].attrib['src'])
        # TODO: get image with higher resolution
        img_src = thumbnail_src

        # append result
        results.append({'url': url,
                        'title': title,
                        'img_src': img_src,
                        'content': '',
                        'thumbnail_src': thumbnail_src,
                        'template': 'images.html'})

    # return results
    return results
--- a/searx/engines/yacy.py
+++ b/searx/engines/yacy.py
@ -68,9 +68,18 @@ def response(resp):
    search_results = raw_search_results.get('channels', {})[0].get('items', [])
-    if resp.search_params['category'] == 'general':
+    for result in search_results:
        # parse image results
        if result.get('image'):
            # append result
            results.append({'url': result['url'],
                            'title': result['title'],
                            'content': '',
                            'img_src': result['image'],
                            'template': 'images.html'})
        # parse general results
-        for result in search_results:
+        else:
            publishedDate = parser.parse(result['pubDate'])
            # append result
@ -79,17 +88,7 @@ def response(resp):
                            'content': result['description'],
                            'publishedDate': publishedDate})
-    elif resp.search_params['category'] == 'images':
+        #TODO parse video, audio and file results
        # parse image results
        for result in search_results:
            # append result
            results.append({'url': result['url'],
                            'title': result['title'],
                            'content': '',
                            'img_src': result['image'],
                            'template': 'images.html'})
    #TODO parse video, audio and file results
    # return results
    return results
--- a/searx/settings.yml
+++ b/searx/settings.yml
@ -83,6 +83,11 @@ engines:
    engine : www500px
    shortcut : px
  - name : 1x
    engine : www1x
    shortcut : 1x
    disabled : True
  - name : flickr
    categories : images
    shortcut : fl
--- a/searx/tests/engines/test_www1x.py
+++ b/searx/tests/engines/test_www1x.py
@ -0,0 +1,57 @@
 from collections import defaultdict
 import mock
 from searx.engines import www1x
 from searx.testing import SearxTestCase
 class TestWww1xEngine(SearxTestCase):
    """Tests for request building and response parsing of the 1x engine."""

    def test_request(self):
        """The request url has to contain the search term and the host."""
        search_term = 'test_query'
        built = www1x.request(search_term, defaultdict(dict))
        self.assertTrue('url' in built)
        target = built['url']
        self.assertTrue(search_term in target)
        self.assertTrue('1x.com' in target)

    def test_response(self):
        """Only anchors that point to a photo page end up in the results."""
        # none of these objects has a ``text`` attribute
        for invalid in (None, [], '', '[]'):
            self.assertRaises(AttributeError, www1x.response, invalid)

        # a page without any anchor gives no results
        empty_page = mock.Mock(text='<html></html>')
        self.assertEqual(www1x.response(empty_page), [])

        # one photo link and one member link; only the first one counts
        payload = """
        <?xml version="1.0" encoding="UTF-8"?><!DOCTYPE characters
        [
        <!ELEMENT characters (character*) >
        <!ELEMENT character  (#PCDATA   ) >
        <!ENTITY iexcl   "&#161;" >
        <!ENTITY cent    "&#162;" >
        <!ENTITY pound   "&#163;" >
        ]
        ><root><searchresult><![CDATA[<table border="0" cellpadding="0" cellspacing="0" width="100%">
        <tr>
            <td style="min-width: 220px;" valign="top">
                <div style="font-size: 30px; margin: 0px 0px 20px 0px;">Photos</div>
                <div>
                    <a href="/photo/123456" class="dynamiclink">
 <img border="0" class="searchresult" src="/images/user/testimage-123456.jpg" style="width: 125px; height: 120px;">
                    </a>
                    <a title="sjoerd lammers street photography" href="/member/sjoerdlammers" class="dynamiclink">
 <img border="0" class="searchresult" src="/images/profile/60c48b394c677d2fa4d9e7d263aabf44-square.jpg">
                    </a>
                </div>
            </td>
        </table>
        ]]></searchresult></root>
        """
        parsed = www1x.response(mock.Mock(text=payload))
        self.assertEqual(type(parsed), list)
        self.assertEqual(len(parsed), 1)

        photo = parsed[0]
        self.assertEqual(photo['url'], 'http://1x.com/photo/123456')
        self.assertEqual(photo['thumbnail_src'], 'http://1x.com/images/user/testimage-123456.jpg')
        self.assertEqual(photo['content'], '')
        self.assertEqual(photo['template'], 'images.html')
--- a/searx/tests/test_engines.py
+++ b/searx/tests/test_engines.py
@ -1,2 +1,3 @@
 from searx.tests.engines.test_dummy import *  # noqa
 from searx.tests.engines.test_github import *  # noqa
 from searx.tests.engines.test_www1x import *  # noqa