diff --git a/searx/engines/twitter.py b/searx/engines/twitter.py index bd9a8c2f..0e35e618 100644 --- a/searx/engines/twitter.py +++ b/searx/engines/twitter.py @@ -13,8 +13,8 @@ from urlparse import urljoin from urllib import urlencode from lxml import html -from cgi import escape from datetime import datetime +from searx.engines.xpath import extract_text # engine dependent config categories = ['social media'] @@ -22,12 +22,12 @@ language_support = True # search-url base_url = 'https://twitter.com/' -search_url = base_url+'search?' +search_url = base_url + 'search?' # specific xpath variables results_xpath = '//li[@data-item-type="tweet"]' link_xpath = './/small[@class="time"]//a' -title_xpath = './/span[@class="username js-action-profile-name"]//text()' +title_xpath = './/span[@class="username js-action-profile-name"]' content_xpath = './/p[@class="js-tweet-text tweet-text"]' timestamp_xpath = './/span[contains(@class,"_timestamp")]' @@ -39,6 +39,8 @@ def request(query, params): # set language if specified if params['language'] != 'all': params['cookies']['lang'] = params['language'].split('_')[0] + else: + params['cookies']['lang'] = 'en' return params @@ -53,8 +55,9 @@ def response(resp): for tweet in dom.xpath(results_xpath): link = tweet.xpath(link_xpath)[0] url = urljoin(base_url, link.attrib.get('href')) - title = ''.join(tweet.xpath(title_xpath)) - content = escape(html.tostring(tweet.xpath(content_xpath)[0], method='text', encoding='UTF-8').decode("utf-8")) + title = extract_text(tweet.xpath(title_xpath)) + content = extract_text(tweet.xpath(content_xpath)[0]) + pubdate = tweet.xpath(timestamp_xpath) if len(pubdate) > 0: timestamp = float(pubdate[0].attrib.get('data-time')) diff --git a/searx/tests/engines/test_twitter.py b/searx/tests/engines/test_twitter.py new file mode 100644 index 00000000..b444b48e --- /dev/null +++ b/searx/tests/engines/test_twitter.py @@ -0,0 +1,502 @@ +# -*- coding: utf-8 -*- +from collections import defaultdict +import mock +from searx.engines import twitter +from searx.testing import SearxTestCase + + +class TestTwitterEngine(SearxTestCase): + + def test_request(self): + query = 'test_query' + dicto = defaultdict(dict) + dicto['pageno'] = 0 + dicto['language'] = 'fr_FR' + params = twitter.request(query, dicto) + self.assertIn('url', params) + self.assertIn(query, params['url']) + self.assertIn('twitter.com', params['url']) + self.assertIn('cookies', params) + self.assertIn('lang', params['cookies']) + self.assertIn('fr', params['cookies']['lang']) + + dicto['language'] = 'all' + params = twitter.request(query, dicto) + self.assertIn('cookies', params) + self.assertIn('lang', params['cookies']) + self.assertIn('en', params['cookies']['lang']) + + def test_response(self): + self.assertRaises(AttributeError, twitter.response, None) + self.assertRaises(AttributeError, twitter.response, []) + self.assertRaises(AttributeError, twitter.response, '') + self.assertRaises(AttributeError, twitter.response, '[]') + + response = mock.Mock(text='') + self.assertEqual(twitter.response(response), []) + + html = """ +
  • +
    +
    +
    +
    + +

    + This is the content étude à€ + + +

    +
    +
    + +
    +
    +
  • + """ + response = mock.Mock(text=html) + results = twitter.response(response) + self.assertEqual(type(results), list) + self.assertEqual(len(results), 1) + self.assertEqual(results[0]['title'], '@TitleName') + self.assertEqual(results[0]['url'], 'https://twitter.com/this.is.the.url') + self.assertIn(u'This is the content', results[0]['content']) + # self.assertIn(u'This is the content étude à€', results[0]['content']) + + html = """ +
  • +
    +
    +
    +
    + +

    + This is the content étude à€ + + +

    +
    +
    + +
    +
    +
  • + """ + response = mock.Mock(text=html) + results = twitter.response(response) + self.assertEqual(type(results), list) + self.assertEqual(len(results), 1) + self.assertEqual(results[0]['title'], '@TitleName') + self.assertEqual(results[0]['url'], 'https://twitter.com/this.is.the.url') + self.assertIn(u'This is the content', results[0]['content']) + + html = """ +
  • +
    + +
    + + this.meta.com + + + + +
    +

    + This should be the content.

    +
    +
  • + """ + response = mock.Mock(text=html) + results = twitter.response(response) + self.assertEqual(type(results), list) + self.assertEqual(len(results), 0) diff --git a/searx/tests/test_engines.py b/searx/tests/test_engines.py index 067616f0..ccef2890 100644 --- a/searx/tests/test_engines.py +++ b/searx/tests/test_engines.py @@ -23,6 +23,7 @@ from searx.tests.engines.test_searchcode_code import * # noqa from searx.tests.engines.test_searchcode_doc import * # noqa from searx.tests.engines.test_soundcloud import * # noqa from searx.tests.engines.test_stackoverflow import * # noqa +from searx.tests.engines.test_twitter import * # noqa from searx.tests.engines.test_vimeo import * # noqa from searx.tests.engines.test_www500px import * # noqa from searx.tests.engines.test_youtube import * # noqa