[enh] removing result html tags

2025-01-30 17:54:50 +01:00 · 2013-11-09 18:39:20 +01:00 · 2013-11-09 18:39:20 +01:00 · 17bf00ee42
commit 17bf00ee42
parent 14a53e3430
4 changed files with 7 additions and 6 deletions
--- a/searx/engines/duckduckgo.py
+++ b/searx/engines/duckduckgo.py
@@ -1,5 +1,6 @@
 from json import loads
 from urllib import urlencode
+from searx.utils import html_to_text

 url = 'https://duckduckgo.com/'
 search_url = url + 'd.js?{query}&l=us-en&p=1&s=0'
@@ -16,7 +17,7 @@ def response(resp):
        if not r.get('t'):
            continue
        results.append({'title': r['t']
-                       ,'content': r['a']
+                       ,'content': html_to_text(r['a'])
                       ,'url': r['u']
                       })
    return results
--- a/searx/engines/startpage.py
+++ b/searx/engines/startpage.py
@@ -1,4 +1,4 @@
-from urllib import quote
+from urllib import urlencode
 from lxml import html
 from urlparse import urlparse
 from cgi import escape
@@ -8,7 +8,7 @@ search_url = base_url+'do/search'

 def request(query, params):
    global search_url
-    query = quote(query.replace(' ', '+'), safe='+')
+    query = urlencode({'q': query})[2:]
    params['url'] = search_url
    params['method'] = 'POST'
    params['data'] = {'query': query}
--- a/searx/engines/twitter.py
+++ b/searx/engines/twitter.py
@@ -1,6 +1,7 @@
 from urlparse import urljoin
 from urllib import urlencode
 from lxml import html
+from cgi import escape

 categories = ['social media']

@@ -21,6 +22,6 @@ def response(resp):
        link = tweet.xpath('.//small[@class="time"]//a')[0]
        url = urljoin(base_url, link.attrib.get('href'))
        title = ''.join(tweet.xpath('.//span[@class="username js-action-profile-name"]//text()'))
-        content = ''.join(map(html.tostring, tweet.xpath('.//p[@class="js-tweet-text tweet-text"]//*')))
+        content = escape(''.join(tweet.xpath('.//p[@class="js-tweet-text tweet-text"]//text()')))
        results.append({'url': url, 'title': title, 'content': content})
    return results
--- a/searx/engines/xpath.py
+++ b/searx/engines/xpath.py
@@ -46,12 +46,11 @@ def request(query, params):
 def response(resp):
    results = []
    dom = html.fromstring(resp.text)
-    query = resp.search_params['query']
    if results_xpath:
        for result in dom.xpath(results_xpath):
            url = extract_url(result.xpath(url_xpath))
            title = ' '.join(result.xpath(title_xpath))
-            content = escape(' '.join(result.xpath(content_xpath))).replace(query, '<b>{0}</b>'.format(query))
+            content = escape(' '.join(result.xpath(content_xpath)))
            results.append({'url': url, 'title': title, 'content': content})
    else:
        for content, url, title in zip(dom.xpath(content_xpath), map(extract_url, dom.xpath(url_xpath)), dom.xpath(title_xpath)):