Browse Source

[enh] Add onions category with Ahmia, Not Evil and Torch

Xpath engine and results template changed to account for the fact that
archive.org doesn't cache .onions, though some onion engines might have
their own cache.

Disabled by default. Can be enabled by setting the SOCKS proxies to
wherever Tor is listening and setting using_tor_proxy to True.

Requires Tor and updating packages.

To avoid manually adding the timeout on each engine, you can set
extra_proxy_timeout to account for Tor's (or whatever proxy used) extra
time.
tags/v0.18.0
a01200356 5 years ago
committed by Marc Abonce Seguin
parent
commit
c3daa08537
11 changed files with 399 additions and 14 deletions
  1. +1
    -0
      .gitignore
  2. +13
    -1
      searx/engines/__init__.py
  3. +82
    -0
      searx/engines/ahmia.py
  4. +64
    -0
      searx/engines/not_evil.py
  5. +29
    -7
      searx/engines/xpath.py
  6. +28
    -3
      searx/settings.yml
  7. +6
    -1
      searx/templates/legacy/result_templates/default.html
  8. +10
    -2
      searx/templates/oscar/macros.html
  9. +1
    -0
      searx/webapp.py
  10. +121
    -0
      tests/unit/engines/test_xpath.py
  11. +44
    -0
      tests/unit/test_engines_init.py

+ 1
- 0
.gitignore View File

@@ -15,6 +15,7 @@ setup.cfg
*.pyc
*/*.pyc
*~
*.swp

/node_modules



+ 13
- 1
searx/engines/__init__.py View File

@@ -142,6 +142,17 @@ def load_engine(engine_data):
engine.stats['page_load_time'] = 0
engine.stats['page_load_count'] = 0

# tor related settings
if settings['outgoing'].get('using_tor_proxy'):
# use onion url if using tor.
if hasattr(engine, 'onion_url'):
engine.search_url = engine.onion_url + getattr(engine, 'search_path', '')
elif 'onions' in engine.categories:
# exclude onion engines if not using tor.
return None

engine.timeout += settings['outgoing'].get('extra_proxy_timeout', 0)

for category_name in engine.categories:
categories.setdefault(category_name, []).append(engine)

@@ -252,8 +263,9 @@ def get_engines_stats(preferences):


def load_engines(engine_list):
global engines
global engines, engine_shortcuts
engines.clear()
engine_shortcuts.clear()
for engine_data in engine_list:
engine = load_engine(engine_data)
if engine is not None:


+ 82
- 0
searx/engines/ahmia.py View File

@@ -0,0 +1,82 @@
"""
Ahmia (Onions)

@website http://msydqstlz2kzerdg.onion
@provide-api no

@using-api no
@results HTML
@stable no
@parse url, title, content
"""

from urllib.parse import urlencode, urlparse, parse_qs
from lxml.html import fromstring
from searx.engines.xpath import extract_url, extract_text

# engine config
categories = ['onions']
paging = True
page_size = 10

# search url (Ahmia's v2 onion address)
search_url = 'http://msydqstlz2kzerdg.onion/search/?{query}'
time_range_support = True
# maps searx time-range names to Ahmia's "days back" ('d') parameter
time_range_dict = {'day': 1,
                   'week': 7,
                   'month': 30}

# xpaths
results_xpath = '//li[@class="result"]'
url_xpath = './h4/a/@href'
title_xpath = './h4/a[1]'
content_xpath = './/p[1]'
correction_xpath = '//*[@id="didYouMean"]//a'
number_of_results_xpath = '//*[@id="totalResults"]'


def request(query, params):
    """Build the Ahmia search request URL.

    Ahmia has no paging parameter; paging is emulated in response() by
    slicing the full result list, so only the query (and optionally the
    time range) is encoded here.
    """
    params['url'] = search_url.format(query=urlencode({'q': query}))

    # use .get() so a params dict without a 'time_range' key cannot
    # raise KeyError (e.g. when called outside the normal search flow)
    if params.get('time_range') in time_range_dict:
        params['url'] += '&' + urlencode({'d': time_range_dict[params['time_range']]})

    return params


def response(resp):
    """Parse Ahmia's HTML results page into searx result dicts.

    Ahmia returns every result on a single page, so paging is emulated
    here by slicing the full result list according to 'pageno'. Also
    extracts spelling corrections and the total result count when the
    page provides them.
    """
    results = []
    dom = fromstring(resp.text)

    # trim results so there aren't way too many at once
    first_result_index = page_size * (resp.search_params.get('pageno', 1) - 1)
    all_results = dom.xpath(results_xpath)
    trimmed_results = all_results[first_result_index:first_result_index + page_size]

    # get results
    for result in trimmed_results:
        # Ahmia links point at its own redirector; pull the actual
        # target out of the 'redirect_url' query parameter
        raw_url = extract_url(result.xpath(url_xpath), search_url)
        cleaned_url = parse_qs(urlparse(raw_url).query).get('redirect_url', [''])[0]

        title = extract_text(result.xpath(title_xpath))
        content = extract_text(result.xpath(content_xpath))

        results.append({'url': cleaned_url,
                        'title': title,
                        'content': content,
                        'is_onion': True})

    # get spelling corrections
    for correction in dom.xpath(correction_xpath):
        results.append({'correction': extract_text(correction)})

    # get number of results
    number_of_results = dom.xpath(number_of_results_xpath)
    if number_of_results:
        try:
            results.append({'number_of_results': int(extract_text(number_of_results))})
        except (ValueError, TypeError):
            # the counter is informational only; a bare 'except' here
            # would also swallow KeyboardInterrupt/SystemExit, so catch
            # only the conversion failures int() can actually raise
            pass

    return results

+ 64
- 0
searx/engines/not_evil.py View File

@@ -0,0 +1,64 @@
"""
not Evil (Onions)

@website http://hss3uro2hsxfogfq.onion
@provide-api yes (http://hss3uro2hsxfogfq.onion/api.htm)

@using-api no
@results HTML
@stable no
@parse url, title, content
"""

from urllib.parse import urlencode
from lxml import html
from searx.engines.xpath import extract_text

# engine dependent config
categories = ['onions']
paging = True
page_size = 20

# search-url
base_url = 'http://hss3uro2hsxfogfq.onion/'
search_url = 'index.php?{query}&hostLimit=20&start={pageno}&numRows={page_size}'

# specific xpath variables
results_xpath = '//*[@id="content"]/div/p'
url_xpath = './span[1]'
title_xpath = './a[1]'
content_xpath = './text()'


# do search-request
def request(query, params):
    """Fill in the request URL for a not Evil search."""
    # not Evil paginates by result offset rather than page number,
    # so convert pageno into a zero-based start index
    start = (params['pageno'] - 1) * page_size

    query_string = search_url.format(query=urlencode({'q': query}),
                                     pageno=start,
                                     page_size=page_size)
    params['url'] = base_url + query_string

    return params


# get response from search-request
def response(resp):
    """Extract url/title/content results from a not Evil results page."""
    # requests guesses the wrong encoding for this page, so force UTF-8
    # before parsing
    resp.encoding = 'utf8'
    dom = html.fromstring(resp.text)

    results = []
    for entry in dom.xpath(results_xpath):
        results.append({
            'url': extract_text(entry.xpath(url_xpath)[0]),
            'title': extract_text(entry.xpath(title_xpath)[0]),
            'content': extract_text(entry.xpath(content_xpath)),
            'is_onion': True,
        })

    return results

+ 29
- 7
searx/engines/xpath.py View File

@@ -10,6 +10,8 @@ thumbnail_xpath = False
paging = False
suggestion_xpath = ''
results_xpath = ''
cached_xpath = ''
cached_url = ''

# parameters for engines with paging support
#
@@ -36,6 +38,8 @@ def request(query, params):
def response(resp):
results = []
dom = html.fromstring(resp.text)
is_onion = True if 'onions' in categories else False

if results_xpath:
for result in eval_xpath(dom, results_xpath):
url = extract_url(eval_xpath(result, url_xpath), search_url)
@@ -49,15 +53,33 @@ def response(resp):
if len(thumbnail_xpath_result) > 0:
tmp_result['img_src'] = extract_url(thumbnail_xpath_result, search_url)

# add alternative cached url if available
if cached_xpath:
tmp_result['cached_url'] = cached_url + extract_text(result.xpath(cached_xpath))

if is_onion:
tmp_result['is_onion'] = True

results.append(tmp_result)
else:
for url, title, content in zip(
(extract_url(x, search_url) for
x in eval_xpath(dom, url_xpath)),
map(extract_text, eval_xpath(dom, title_xpath)),
map(extract_text, eval_xpath(dom, content_xpath))
):
results.append({'url': url, 'title': title, 'content': content})
if cached_xpath:
for url, title, content, cached in zip(
(extract_url(x, search_url) for
x in dom.xpath(url_xpath)),
map(extract_text, dom.xpath(title_xpath)),
map(extract_text, dom.xpath(content_xpath)),
map(extract_text, dom.xpath(cached_xpath))
):
results.append({'url': url, 'title': title, 'content': content,
'cached_url': cached_url + cached, 'is_onion': is_onion})
else:
for url, title, content in zip(
(extract_url(x, search_url) for
x in dom.xpath(url_xpath)),
map(extract_text, dom.xpath(title_xpath)),
map(extract_text, dom.xpath(content_xpath))
):
results.append({'url': url, 'title': title, 'content': content, 'is_onion': is_onion})

if not suggestion_xpath:
return results


+ 28
- 3
searx/settings.yml View File

@@ -60,8 +60,10 @@ outgoing: # communication with search engines
# see http://docs.python-requests.org/en/latest/user/advanced/#proxies
# SOCKS proxies are also supported: see http://requests.readthedocs.io/en/master/user/advanced/#socks
# proxies :
# http : http://127.0.0.1:8080
# https: http://127.0.0.1:8080
# http : socks5h://127.0.0.1:9050
# https: socks5h://127.0.0.1:9050
# using_tor_proxy : True
# extra_proxy_timeout : 10.0 # Extra seconds to add in order to account for the time taken by the proxy
# uncomment below section only if you have more than one network interface
# which can be the source of outgoing search requests
# source_ips:
@@ -89,6 +91,12 @@ engines:
shortcut: apkm
disabled: True

# Requires Tor
- name : ahmia
engine : ahmia
categories : onions
shortcut : ah

- name : arch linux wiki
engine : archlinux
shortcut : al
@@ -185,7 +193,7 @@ engines:
- name : deviantart
engine : deviantart
shortcut : da
timeout: 3.0
timeout : 3.0

- name : ddg definitions
engine : duckduckgo_definitions
@@ -514,6 +522,11 @@ engines:
timeout: 5.0
shortcut : npm

# Requires Tor
- name : not evil
engine : not_evil
shortcut : ne

- name : nyaa
engine : nyaa
shortcut : nt
@@ -698,6 +711,18 @@ engines:
url: https://torrentz2.eu/
timeout : 3.0

# Requires Tor
- name : torch
engine : xpath
paging : True
search_url : http://xmh57jrknzkhv6y3ls3ubitzfqnkrwxhopf5aygthi7d6rplyvk3noyd.onion/cgi-bin/omega/omega?P={query}&DEFAULTOP=and
results_xpath : //table//tr
url_xpath : ./td[2]/a
title_xpath : ./td[2]/b
content_xpath : ./td[2]/small
categories : onions
shortcut : tch

- name : twitter
engine : twitter
shortcut : tw


+ 6
- 1
searx/templates/legacy/result_templates/default.html View File

@@ -1,6 +1,11 @@
<div class="result {{ result.class }}{% for e in result.engines %} {{ e }}{% endfor %}">
<h3 class="result_title">{% if "icon_"~result.engine~".ico" in favicons %}<img width="14" height="14" class="favicon" src="{{ url_for('static', filename='img/icons/icon_'+result.engine+'.ico') }}" alt="{{result.engine}}" />{% endif %}<a href="{{ result.url }}" {% if results_on_new_tab %}target="_blank" rel="noopener noreferrer"{% else %}rel="noreferrer"{% endif %}>{{ result.title|safe }}</a></h3>
<p class="url">{{ result.pretty_url }}&lrm; <a class="cache_link" href="https://web.archive.org/web/{{ result.url }}" {% if results_on_new_tab %}target="_blank" rel="noopener noreferrer"{% else %}rel="noreferrer"{% endif %}>{{ _('cached') }}</a>
<p class="url">{{ result.pretty_url }}&lrm;
{% if result.cached_url %}
<a class="cache_link" href="{{ result.cached_url }}" {% if results_on_new_tab %}target="_blank" rel="noopener noreferrer"{% else %}rel="noreferrer"{% endif %}>{{ _('cached') }}</a>
{% elif not result.is_onion %}
<a class="cache_link" href="https://web.archive.org/web/{{ result.url }}" {% if results_on_new_tab %}target="_blank" rel="noopener noreferrer"{% else %}rel="noreferrer"{% endif %}>{{ _('cached') }}</a>
{% endif %}
{% if result.publishedDate %}<span class="published_date">{{ result.publishedDate }}</span>{% endif %}</p>
<p class="content">{% if result.img_src %}<img src="{{ image_proxify(result.img_src) }}" class="image" />{% endif %}{% if result.content %}{{ result.content|safe }}<br class="last"/>{% endif %}</p>
</div>

+ 10
- 2
searx/templates/oscar/macros.html View File

@@ -32,7 +32,11 @@
<span class="label label-default">{{ engine }}</span>
{%- endfor -%}
{%- if result.url -%}
<small>{{ result_link("https://web.archive.org/web/" + result.url, icon('link') + _('cached'), "text-info", id) }}</small>
{% if result.cached_url %}
<small>{{ result_link(result.cached_url, icon('link') + _('cached'), "text-info", id) }}</small>
{% elif not result.is_onion %}
<small>{{ result_link("https://web.archive.org/web/" + result.url, icon('link') + _('cached'), "text-info", id) }}</small>
{% endif %}
{%- endif -%}
{%- if proxify -%}
<small>{{ result_link(proxify(result.url), icon('sort') + _('proxied'), "text-info", id) }}</small>
@@ -50,7 +54,11 @@
<span class="label label-default">{{ engine }}</span>
{%- endfor %}
{%- if result.url -%}
<small>{{ result_link("https://web.archive.org/web/" + result.url, icon('link') + _('cached'), "text-info", id) }}</small>
{% if result.cached_url %}
<small>{{ result_link(result.cached_url, icon('link') + _('cached'), "text-info", id) }}</small>
{% elif not result.is_onion %}
<small>{{ result_link("https://web.archive.org/web/" + result.url, icon('link') + _('cached'), "text-info", id) }}</small>
{% endif %}
{%- endif -%}
{% if proxify -%}
<small>{{ result_link(proxify(result.url), icon('sort') + _('proxied'), "text-info", id) }}</small>


+ 1
- 0
searx/webapp.py View File

@@ -146,6 +146,7 @@ _category_names = (gettext('files'),
gettext('it'),
gettext('news'),
gettext('map'),
gettext('onions'),
gettext('science'))

outgoing_proxies = settings['outgoing'].get('proxies') or None


+ 121
- 0
tests/unit/engines/test_xpath.py View File

@@ -0,0 +1,121 @@
# -*- coding: utf-8 -*-
from collections import defaultdict
import mock
from searx.engines import xpath
from searx.testing import SearxTestCase


class TestXpathEngine(SearxTestCase):
    """Unit tests for the generic xpath engine.

    NOTE: these tests mutate module-level attributes of
    searx.engines.xpath (search_url, *_xpath, categories, paging);
    test_response restores the attributes it changes when done.
    """

    def test_request(self):
        """request() interpolates the query, and pageno when paging is on."""
        xpath.search_url = 'https://url.com/{query}'
        xpath.categories = []
        xpath.paging = False
        query = 'test_query'
        dicto = defaultdict(dict)
        params = xpath.request(query, dicto)
        self.assertIn('url', params)
        # assertEquals is a deprecated alias of assertEqual
        self.assertEqual('https://url.com/test_query', params['url'])

        xpath.search_url = 'https://url.com/q={query}&p={pageno}'
        xpath.paging = True
        query = 'test_query'
        dicto = defaultdict(dict)
        dicto['pageno'] = 1
        params = xpath.request(query, dicto)
        self.assertIn('url', params)
        self.assertEqual('https://url.com/q=test_query&p=1', params['url'])

    def test_response(self):
        """response() parses results with and without results_xpath,
        including cached urls and the onions category flag."""
        # without results_xpath
        xpath.url_xpath = '//div[@class="search_result"]//a[@class="result"]/@href'
        xpath.title_xpath = '//div[@class="search_result"]//a[@class="result"]'
        xpath.content_xpath = '//div[@class="search_result"]//p[@class="content"]'

        self.assertRaises(AttributeError, xpath.response, None)
        self.assertRaises(AttributeError, xpath.response, [])
        self.assertRaises(AttributeError, xpath.response, '')
        self.assertRaises(AttributeError, xpath.response, '[]')

        response = mock.Mock(text='<html></html>')
        self.assertEqual(xpath.response(response), [])

        html = u"""
        <div>
            <div class="search_result">
                <a class="result" href="https://result1.com">Result 1</a>
                <p class="content">Content 1</p>
                <a class="cached" href="https://cachedresult1.com">Cache</a>
            </div>
            <div class="search_result">
                <a class="result" href="https://result2.com">Result 2</a>
                <p class="content">Content 2</p>
                <a class="cached" href="https://cachedresult2.com">Cache</a>
            </div>
        </div>
        """
        response = mock.Mock(text=html)
        results = xpath.response(response)
        self.assertEqual(type(results), list)
        self.assertEqual(len(results), 2)
        self.assertEqual(results[0]['title'], 'Result 1')
        self.assertEqual(results[0]['url'], 'https://result1.com/')
        self.assertEqual(results[0]['content'], 'Content 1')
        self.assertEqual(results[1]['title'], 'Result 2')
        self.assertEqual(results[1]['url'], 'https://result2.com/')
        self.assertEqual(results[1]['content'], 'Content 2')

        # with cached urls, without results_xpath
        xpath.cached_xpath = '//div[@class="search_result"]//a[@class="cached"]/@href'
        results = xpath.response(response)
        self.assertEqual(type(results), list)
        self.assertEqual(len(results), 2)
        self.assertEqual(results[0]['cached_url'], 'https://cachedresult1.com')
        self.assertEqual(results[1]['cached_url'], 'https://cachedresult2.com')
        self.assertFalse(results[0].get('is_onion', False))

        # results are onion urls (no results_xpath)
        xpath.categories = ['onions']
        results = xpath.response(response)
        self.assertTrue(results[0]['is_onion'])

        # with results_xpath
        xpath.results_xpath = '//div[@class="search_result"]'
        xpath.url_xpath = './/a[@class="result"]/@href'
        xpath.title_xpath = './/a[@class="result"]'
        xpath.content_xpath = './/p[@class="content"]'
        xpath.cached_xpath = None
        xpath.categories = []

        self.assertRaises(AttributeError, xpath.response, None)
        self.assertRaises(AttributeError, xpath.response, [])
        self.assertRaises(AttributeError, xpath.response, '')
        self.assertRaises(AttributeError, xpath.response, '[]')

        response = mock.Mock(text='<html></html>')
        self.assertEqual(xpath.response(response), [])

        response = mock.Mock(text=html)
        results = xpath.response(response)
        self.assertEqual(type(results), list)
        self.assertEqual(len(results), 2)
        self.assertEqual(results[0]['title'], 'Result 1')
        self.assertEqual(results[0]['url'], 'https://result1.com/')
        self.assertEqual(results[0]['content'], 'Content 1')
        self.assertEqual(results[1]['title'], 'Result 2')
        self.assertEqual(results[1]['url'], 'https://result2.com/')
        self.assertEqual(results[1]['content'], 'Content 2')

        # with cached urls, with results_xpath
        xpath.cached_xpath = './/a[@class="cached"]/@href'
        results = xpath.response(response)
        self.assertEqual(type(results), list)
        self.assertEqual(len(results), 2)
        self.assertEqual(results[0]['cached_url'], 'https://cachedresult1.com')
        self.assertEqual(results[1]['cached_url'], 'https://cachedresult2.com')
        self.assertFalse(results[0].get('is_onion', False))

        # results are onion urls (with results_xpath)
        xpath.categories = ['onions']
        results = xpath.response(response)
        self.assertTrue(results[0]['is_onion'])

        # reset mutated module state so other test modules that import
        # searx.engines.xpath are unaffected
        xpath.results_xpath = ''
        xpath.cached_xpath = ''
        xpath.categories = []

+ 44
- 0
tests/unit/test_engines_init.py View File

@@ -0,0 +1,44 @@
from searx.testing import SearxTestCase
from searx import settings, engines


class TestEnginesInit(SearxTestCase):
    """Tests for engine initialization with respect to Tor/onion settings."""

    @classmethod
    def tearDownClass(cls):
        # restore the global settings mutated by the tests below so that
        # test modules running afterwards see the defaults again
        settings['outgoing']['using_tor_proxy'] = False
        settings['outgoing']['extra_proxy_timeout'] = 0

    def test_initialize_engines_default(self):
        """Two plain engines load normally with default settings."""
        engine_list = [{'engine': 'dummy', 'name': 'engine1', 'shortcut': 'e1'},
                       {'engine': 'dummy', 'name': 'engine2', 'shortcut': 'e2'}]

        engines.initialize_engines(engine_list)
        self.assertEqual(len(engines.engines), 2)
        self.assertIn('engine1', engines.engines)
        self.assertIn('engine2', engines.engines)

    def test_initialize_engines_exclude_onions(self):
        """Without a Tor proxy, engines in the 'onions' category are dropped."""
        settings['outgoing']['using_tor_proxy'] = False
        engine_list = [{'engine': 'dummy', 'name': 'engine1', 'shortcut': 'e1', 'categories': 'general'},
                       {'engine': 'dummy', 'name': 'engine2', 'shortcut': 'e2', 'categories': 'onions'}]

        engines.initialize_engines(engine_list)
        self.assertEqual(len(engines.engines), 1)
        self.assertIn('engine1', engines.engines)
        self.assertNotIn('onions', engines.categories)

    def test_initialize_engines_include_onions(self):
        """With a Tor proxy, onion engines load, onion_url replaces the
        search url, and extra_proxy_timeout is added to engine timeouts."""
        settings['outgoing']['using_tor_proxy'] = True
        settings['outgoing']['extra_proxy_timeout'] = 100.0
        engine_list = [{'engine': 'dummy', 'name': 'engine1', 'shortcut': 'e1', 'categories': 'general',
                        'timeout': 20.0, 'onion_url': 'http://engine1.onion'},
                       {'engine': 'dummy', 'name': 'engine2', 'shortcut': 'e2', 'categories': 'onions'}]

        engines.initialize_engines(engine_list)
        self.assertEqual(len(engines.engines), 2)
        self.assertIn('engine1', engines.engines)
        self.assertIn('engine2', engines.engines)
        self.assertIn('onions', engines.categories)
        # 20.0 base timeout + 100.0 extra_proxy_timeout
        self.assertIn('http://engine1.onion', engines.engines['engine1'].search_url)
        self.assertEqual(engines.engines['engine1'].timeout, 120.0)

Loading…
Cancel
Save