# SPDX-License-Identifier: AGPL-3.0-or-later
'''Generic *XPath* engine: loads a result page built from the formatted
:py:obj:`search_url` and extracts results with the configured XPath selectors.
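
A minimal configuration sketch for ``settings.yml``; the engine name, URL and
selectors are illustrative, not a real site:

.. code:: yaml

  - name : example
    engine : xpath
    paging : True
    search_url : https://example.org/?q={query}&p={pageno}
    url_xpath : //article/a/@href
    title_xpath : //article/h4
    content_xpath : //article/p
'''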

from urllib.parse import urlencode

from lxml import html

from searx.utils import extract_text, extract_url, eval_xpath, eval_xpath_list

search_url = None
'''Search URL of the engine.  The placeholders ``{query}``, ``{pageno}``,
``{lang}``, ``{time_range}`` and ``{safe_search}`` are filled in
:py:obj:`request`.'''

lang_all = 'en'
'''Replacement for ``{lang}`` when the user selects *all* languages.'''

url_xpath = None
'''XPath selector of the result URL.'''

content_xpath = None
'''XPath selector of the result content.'''

title_xpath = None
'''XPath selector of the result title.'''

thumbnail_xpath = False
'''XPath selector of a result thumbnail (optional).'''

suggestion_xpath = ''
'''XPath selector of suggestion items (optional).'''

results_xpath = ''
'''XPath selector of the result items; when empty, the url/title/content
selectors are applied to the whole page (see :py:obj:`response`).'''

cached_xpath = ''
cached_url = ''
'''XPath selector and URL prefix of an alternative *cached* link (optional).'''

soft_max_redirects = 0
'''Maximum redirects, soft limit.'''

cookies = {}
headers = {}
'''Some engines might offer different results based on cookies or headers.
Possible use-case: to set a safesearch cookie or header to moderate.
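
A minimal sketch, assuming the target site honors a ``safesearch`` cookie;
the engine name and values are illustrative, not from a real engine:

.. code:: yaml

  - name : example
    engine : xpath
    cookies :
      safesearch : 'moderate'
'''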

paging = False
'''Engine supports paging [True or False].'''

page_size = 1
'''Number of results on each page.'''

first_page_num = 1
'''Number of the first page (usually 0 or 1).'''
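
# For example, with page_size = 10 and first_page_num = 0 (hypothetical
# values), page 2 is sent as {pageno} = (2 - 1) * 10 + 0 = 10, i.e. the
# zero-based offset of the first result on that page.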

time_range_support = False
'''Engine supports search time range.'''

time_range_url = '&hours={time_range_val}'
'''Time range URL parameter in the :py:obj:`search_url`.  If no time range is
requested by the user, the URL parameter is an empty string.  The
``{time_range_val}`` replacement is taken from the :py:obj:`time_range_map`.

.. code:: yaml

  time_range_url : '&days={time_range_val}'
'''

time_range_map = {
    'day': 24,
    'week': 24 * 7,
    'month': 24 * 30,
    'year': 24 * 365,
}
'''Maps time range value from user to ``{time_range_val}`` in
:py:obj:`time_range_url`.

.. code:: yaml

  time_range_map:
    day: 1
    week: 7
    month: 30
    year: 365
'''

safe_search_support = False
'''Engine supports safe-search.'''

safe_search_map = {
    0: '&filter=none',
    1: '&filter=moderate',
    2: '&filter=strict',
}
'''Maps safe-search value to ``{safe_search}`` in :py:obj:`search_url`.

.. code:: yaml

  safesearch: true
  safe_search_map:
    0: '&filter=none'
    1: '&filter=moderate'
    2: '&filter=strict'
'''


def request(query, params):
    '''Build request parameters (see :py:obj:`search_url`).'''
    # fill {lang}: fall back to lang_all when the user selected 'all'
    lang = lang_all
    if params['language'] != 'all':
        lang = params['language'][:2]

    # fill {time_range}: empty string when no time range was requested
    time_range = ''
    if params.get('time_range'):
        time_range_val = time_range_map.get(params.get('time_range'))
        time_range = time_range_url.format(time_range_val=time_range_val)

    # fill {safe_search}: empty string when safe-search is off (0)
    safe_search = ''
    if params['safesearch']:
        safe_search = safe_search_map[params['safesearch']]

    fargs = {
        'query': urlencode({'q': query})[2:],
        'lang': lang,
        'pageno': (params['pageno'] - 1) * page_size + first_page_num,
        'time_range': time_range,
        'safe_search': safe_search,
    }
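
    # A worked example with hypothetical values (not a real engine): given
    # search_url = 'https://example.org/?q={query}&p={pageno}{time_range}',
    # query 'test', pageno 2, time range 'week', page_size = 1 and
    # first_page_num = 1, search_url.format(**fargs) below yields
    # 'https://example.org/?q=test&p=2&hours=168'.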

    params['cookies'].update(cookies)
    params['headers'].update(headers)

    params['url'] = search_url.format(**fargs)
    params['soft_max_redirects'] = soft_max_redirects

    return params


def response(resp):
    '''Scrape *results* from the HTML response (see :py:obj:`results_xpath`).'''
    results = []
    dom = html.fromstring(resp.text)
    # `categories` is not defined in this module; the engine loader injects it
    is_onion = 'onions' in categories  # pylint: disable=undefined-variable
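
    # Two extraction modes: with results_xpath each result is one DOM node and
    # url/title/content are evaluated relative to it; without it, url_xpath,
    # title_xpath and content_xpath select parallel flat lists over the whole
    # page, which are zipped together below.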
    if results_xpath:
        for result in eval_xpath_list(dom, results_xpath):
            url = extract_url(eval_xpath_list(result, url_xpath, min_len=1), search_url)
            title = extract_text(eval_xpath_list(result, title_xpath, min_len=1))
            content = extract_text(eval_xpath_list(result, content_xpath))
            tmp_result = {'url': url, 'title': title, 'content': content}

            # add thumbnail if available
            if thumbnail_xpath:
                thumbnail_xpath_result = eval_xpath_list(result, thumbnail_xpath)
                if len(thumbnail_xpath_result) > 0:
                    tmp_result['img_src'] = extract_url(thumbnail_xpath_result, search_url)

            # add alternative cached url if available
            if cached_xpath:
                tmp_result['cached_url'] = cached_url + extract_text(
                    eval_xpath_list(result, cached_xpath, min_len=1)
                )

            if is_onion:
                tmp_result['is_onion'] = True

            results.append(tmp_result)

    elif cached_xpath:
        for url, title, content, cached in zip(
            (extract_url(x, search_url) for x in eval_xpath_list(dom, url_xpath)),
            map(extract_text, eval_xpath_list(dom, title_xpath)),
            map(extract_text, eval_xpath_list(dom, content_xpath)),
            map(extract_text, eval_xpath_list(dom, cached_xpath)),
        ):
            results.append({
                'url': url,
                'title': title,
                'content': content,
                'cached_url': cached_url + cached,
                'is_onion': is_onion,
            })

    else:
        for url, title, content in zip(
            (extract_url(x, search_url) for x in eval_xpath_list(dom, url_xpath)),
            map(extract_text, eval_xpath_list(dom, title_xpath)),
            map(extract_text, eval_xpath_list(dom, content_xpath)),
        ):
            results.append({'url': url, 'title': title, 'content': content, 'is_onion': is_onion})

    if not suggestion_xpath:
        return results

    for suggestion in eval_xpath(dom, suggestion_xpath):
        results.append({'suggestion': extract_text(suggestion)})

    return results