From f0ca1c34833e2c0c79af68e699e646d77167a269 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?No=C3=A9mi=20V=C3=A1nyi?=
Date: Tue, 8 Sep 2020 09:51:53 +0200
Subject: [PATCH] [enh] Add command line engines: git grep, find, etc. (#2128)

A new "base" engine called command is introduced. It is the foundation for all
command line engines for now. You can use this engine to create your own command
line engine.

Add some engines (commented out to make sure no one enables anything accidentally):
 * git grep: This engine lets you grep in the searx repo.
 * locate: If locate is installed and initialized, you can search on the FS.
 * find: You can find files with a specific name from where you started searx.
 * pattern search in files: This engine utilizes the command fgrep.
 * regex search in files: This engine runs `grep` to find a file based on its contents.
---
Notes (not part of the commit message):

  NOTE(review): this copy of the patch was rebuilt from a text dump in which the
  line breaks and the indentation were lost. Line breaks were restored from the
  '+' markers, indentation from the syntax (the spacing inside the commented
  YAML and inside the test fixtures is a best guess). The dump apparently also
  dropped everything that looked like markup: the e-mail address of the From:
  header and of the "Author:" lines in the git log fixture, and the HTML tags
  of the key-value.html hunk. Confirm these against the upstream commit before
  applying: the key-value.html hunk and test_regex_parsing_command_engine are
  incomplete without them. The index lines still carry the original blob ids.

  Changes compared to the original commit:
  - command.py: init() compiled the regexes with re.compile although "re" was
    never imported, every parse_regex configuration raised NameError.
  - command.py: __parse_single_result returns None for output it can not parse,
    the caller tests "is None" and used to let the empty dict through.
  - command.py: the incomplete tail of the output always replaces "leftover",
    a stale tail was prepended again when a chunk ended with the separator.
  - command.py: the path check of query_type "path" compares path components
    (commonpath) instead of characters (commonprefix).
  - command.py: placeholder in the "skipped result" log message, message for the
    ValueError of an incomplete delimiter, no IndexError on an empty query.
  - test_command.py: options are reset before every test, the regex test
    compiles its patterns with init(), the enum test configures a delimiter,
    the forbidden paths loop uses its loop variable.

 searx/engines/command.py                      | 214 +++++++++++++++
 searx/settings.yml                            |  71 +++++
 .../oscar/result_templates/key-value.html     |   2 +-
 tests/unit/engines/test_command.py            | 254 ++++++++++++++++++
 4 files changed, 540 insertions(+), 1 deletion(-)
 create mode 100644 searx/engines/command.py
 create mode 100644 tests/unit/engines/test_command.py

diff --git a/searx/engines/command.py b/searx/engines/command.py
new file mode 100644
index 00000000..b9e672ff
--- /dev/null
+++ b/searx/engines/command.py
@@ -0,0 +1,214 @@
+'''
+searx is free software: you can redistribute it and/or modify
+it under the terms of the GNU Affero General Public License as published by
+the Free Software Foundation, either version 3 of the License, or
+(at your option) any later version.
+
+searx is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU Affero General Public License for more details.
+
+You should have received a copy of the GNU Affero General Public License
+along with searx. If not, see < http://www.gnu.org/licenses/ >.
+'''
+
+
+from os.path import expanduser, isabs, realpath, commonpath, commonprefix
+from re import MULTILINE, compile as re_compile, search as re_search
+from shlex import split as shlex_split
+from subprocess import Popen, PIPE
+from time import time
+from threading import Thread
+
+from searx import logger
+
+
+offline = True
+paging = True
+command = []
+delimiter = {}
+parse_regex = {}
+query_type = ''
+query_enum = []
+environment_variables = {}
+working_dir = realpath('.')
+result_separator = '\n'
+result_template = 'key-value.html'
+timeout = 4.0
+
+_command_logger = logger.getChild('command')
+_compiled_parse_regex = {}
+
+
+def init(engine_settings):
+    """ Validates the engine settings and stores them in the module level options
+
+    Raises ValueError if the parsing options are invalid or the command is missing.
+    """
+
+    check_parsing_options(engine_settings)
+
+    if 'command' not in engine_settings:
+        raise ValueError('engine command : missing configuration key: command')
+
+    global command, working_dir, result_template, delimiter, parse_regex, timeout, environment_variables
+
+    command = engine_settings['command']
+
+    if 'working_dir' in engine_settings:
+        working_dir = engine_settings['working_dir']
+        if not isabs(engine_settings['working_dir']):
+            working_dir = realpath(working_dir)
+
+    if 'parse_regex' in engine_settings:
+        parse_regex = engine_settings['parse_regex']
+        for result_key, regex in parse_regex.items():
+            _compiled_parse_regex[result_key] = re_compile(regex, flags=MULTILINE)
+    if 'delimiter' in engine_settings:
+        delimiter = engine_settings['delimiter']
+
+    if 'environment_variables' in engine_settings:
+        environment_variables = engine_settings['environment_variables']
+
+
+def search(query, params):
+    """ Runs the configured command and returns the results of the page params['pageno']
+
+    The output is read in a separate thread which is joined for at most "timeout"
+    seconds: the results collected by then are returned.
+    """
+
+    cmd = _get_command_to_run(query)
+    if not cmd:
+        return []
+
+    results = []
+    reader_thread = Thread(target=_get_results_from_process, args=(results, cmd, params['pageno']))
+    reader_thread.start()
+    reader_thread.join(timeout=timeout)
+
+    return results
+
+
+def _get_command_to_run(query):
+    """ Returns the command with every '{{QUERY}}' item replaced by the shell-split query
+
+    Raises ValueError if the query is not allowed by the configured query_type.
+    """
+
+    params = shlex_split(query.decode('utf-8'))
+    __check_query_params(params)
+
+    cmd = []
+    for c in command:
+        if c == '{{QUERY}}':
+            cmd.extend(params)
+        else:
+            cmd.append(c)
+
+    return cmd
+
+
+def _get_results_from_process(results, cmd, pageno):
+    """ Runs cmd and appends the parsed results of the page pageno to results
+
+    Raises RuntimeError if the command exits with a non-zero return code.
+    """
+
+    leftover = ''
+    count = 0
+    start, end = __get_results_limits(pageno)
+    with Popen(cmd, stdout=PIPE, stderr=PIPE, env=environment_variables) as process:
+        line = process.stdout.readline()
+        while line:
+            buf = leftover + line.decode('utf-8')
+            raw_results = buf.split(result_separator)
+            # the last element is the still incomplete result: it always replaces
+            # leftover (even if it is empty), otherwise old data is prepended again
+            leftover = raw_results.pop()
+
+            for raw_result in raw_results:
+                result = __parse_single_result(raw_result)
+                if result is None:
+                    _command_logger.debug('skipped result: %s', raw_result)
+                    continue
+
+                if start <= count <= end:
+                    result['template'] = result_template
+                    results.append(result)
+
+                count += 1
+                if end < count:
+                    return results
+
+            line = process.stdout.readline()
+
+        return_code = process.wait(timeout=timeout)
+        if return_code != 0:
+            raise RuntimeError('non-zero return code when running command', cmd, return_code)
+
+
+def __get_results_limits(pageno):
+    """ Returns the indexes of the first and the last result of the page (10 results per page) """
+
+    start = (pageno - 1) * 10
+    end = start + 9
+    return start, end
+
+
+def __check_query_params(params):
+    """ Raises ValueError if params are not allowed by the configured query_type """
+
+    if not query_type or not params:
+        return
+
+    if query_type == 'path':
+        query_path = realpath(expanduser(params[-1]))
+        base_dir = realpath(working_dir)
+        # commonprefix compares characters, it would accept '/srv/data-private'
+        # for '/srv/data': commonpath compares whole path components
+        if commonpath([query_path, base_dir]) != base_dir:
+            raise ValueError('requested path is outside of configured working directory')
+    elif query_type == 'enum' and len(query_enum) > 0:
+        for param in params:
+            if param not in query_enum:
+                raise ValueError('submitted query params is not allowed', param, 'allowed params:', query_enum)
+
+
+def check_parsing_options(engine_settings):
+    """ Checks if delimiter based parsing or regex parsing is configured correctly """
+
+    if 'delimiter' not in engine_settings and 'parse_regex' not in engine_settings:
+        raise ValueError('failed to init settings for parsing lines: missing delimiter or parse_regex')
+    if 'delimiter' in engine_settings and 'parse_regex' in engine_settings:
+        raise ValueError('failed to init settings for parsing lines: too many settings')
+
+    if 'delimiter' in engine_settings:
+        if 'chars' not in engine_settings['delimiter'] or 'keys' not in engine_settings['delimiter']:
+            raise ValueError('failed to init settings for parsing lines: delimiter requires chars and keys')
+
+
+def __parse_single_result(raw_result):
+    """ Parses command line output based on configuration
+
+    Returns None if raw_result does not match the configured delimiter or regexes.
+    """
+
+    result = {}
+
+    if delimiter:
+        elements = raw_result.split(delimiter['chars'], maxsplit=len(delimiter['keys']) - 1)
+        if len(elements) != len(delimiter['keys']):
+            return None
+        for key, element in zip(delimiter['keys'], elements):
+            result[key] = element
+
+    if parse_regex:
+        for result_key, regex in _compiled_parse_regex.items():
+            found = regex.search(raw_result)
+            if not found:
+                return None
+            result[result_key] = found.group(0)
+
+    return result
diff --git a/searx/settings.yml b/searx/settings.yml
index d6ea5317..9140522c 100644
--- a/searx/settings.yml
+++ b/searx/settings.yml
@@ -920,6 +920,77 @@ engines:
 #    shortcut : uw
 #    base_url : 'http://doc.ubuntu-fr.org'
 
+# Be careful when enabling this engine if you are
+# running a public instance. Do not expose any sensitive
+# information. You can restrict access by configuring a list
+# of access tokens under tokens.
+#  - name: git grep
+#    engine: command
+#    command: ['git', 'grep', '{{QUERY}}']
+#    shortcut: gg
+#    tokens: []
+#    disabled: True
+#    delimiter:
+#        chars: ':'
+#        keys: ['filepath', 'code']
+
+# Be careful when enabling this engine if you are
+# running a public instance. Do not expose any sensitive
+# information. You can restrict access by configuring a list
+# of access tokens under tokens.
+#  - name: locate
+#    engine: command
+#    command: ['locate', '{{QUERY}}']
+#    shortcut: loc
+#    tokens: []
+#    disabled: True
+#    delimiter:
+#        chars: ' '
+#        keys: ['line']
+
+# Be careful when enabling this engine if you are
+# running a public instance. Do not expose any sensitive
+# information. You can restrict access by configuring a list
+# of access tokens under tokens.
+#  - name: find
+#    engine: command
+#    command: ['find', '.', '-name', '{{QUERY}}']
+#    query_type: path
+#    shortcut: fnd
+#    tokens: []
+#    disabled: True
+#    delimiter:
+#        chars: ' '
+#        keys: ['line']
+
+# Be careful when enabling this engine if you are
+# running a public instance. Do not expose any sensitive
+# information. You can restrict access by configuring a list
+# of access tokens under tokens.
+#  - name: pattern search in files
+#    engine: command
+#    command: ['fgrep', '{{QUERY}}']
+#    shortcut: fgr
+#    tokens: []
+#    disabled: True
+#    delimiter:
+#        chars: ' '
+#        keys: ['line']
+
+# Be careful when enabling this engine if you are
+# running a public instance. Do not expose any sensitive
+# information. You can restrict access by configuring a list
+# of access tokens under tokens.
+#  - name: regex search in files
+#    engine: command
+#    command: ['grep', '{{QUERY}}']
+#    shortcut: gr
+#    tokens: []
+#    disabled: True
+#    delimiter:
+#        chars: ' '
+#        keys: ['line']
+
 locales:
     en : English
     ar : العَرَبِيَّة (Arabic)
diff --git a/searx/templates/oscar/result_templates/key-value.html b/searx/templates/oscar/result_templates/key-value.html
index 67c748e7..d5c56a18 100644
--- a/searx/templates/oscar/result_templates/key-value.html
+++ b/searx/templates/oscar/result_templates/key-value.html
@@ -6,7 +6,7 @@
     {% continue %}
     {% endif %}
-        {{ key|upper }}: {{ value }}
+        {{ key|upper }}: {{ value|truncate }}
     {% endfor %}
diff --git a/tests/unit/engines/test_command.py b/tests/unit/engines/test_command.py
new file mode 100644
index 00000000..0aa1c620
--- /dev/null
+++ b/tests/unit/engines/test_command.py
@@ -0,0 +1,254 @@
+'''
+searx is free software: you can redistribute it and/or modify
+it under the terms of the GNU Affero General Public License as published by
+the Free Software Foundation, either version 3 of the License, or
+(at your option) any later version.
+
+searx is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU Affero General Public License for more details.
+
+You should have received a copy of the GNU Affero General Public License
+along with searx. If not, see < http://www.gnu.org/licenses/ >.
+
+'''
+
+from sys import version_info
+
+from searx.engines import command as command_engine
+from searx.testing import SearxTestCase
+
+
+class TestCommandEngine(SearxTestCase):
+    def setUp(self):
+        super().setUp()
+        # the engine keeps its options in module globals: restore the defaults
+        # so that the tests do not depend on the order they are executed in
+        command_engine.delimiter = {}
+        command_engine.parse_regex = {}
+        command_engine._compiled_parse_regex.clear()
+        command_engine.query_type = ''
+        command_engine.query_enum = []
+        command_engine.result_separator = '\n'
+
+    def test_basic_seq_command_engine(self):
+        ls_engine = command_engine
+        ls_engine.command = ['seq', '{{QUERY}}']
+        ls_engine.delimiter = {'chars': ' ', 'keys': ['number']}
+        expected_results = [
+            {'number': '1', 'template': 'key-value.html'},
+            {'number': '2', 'template': 'key-value.html'},
+            {'number': '3', 'template': 'key-value.html'},
+            {'number': '4', 'template': 'key-value.html'},
+            {'number': '5', 'template': 'key-value.html'},
+        ]
+        results = ls_engine.search('5'.encode('utf-8'), {'pageno': 1})
+        self.assertEqual(results, expected_results)
+
+    def test_delimiter_parsing_command_engine(self):
+        searx_logs = '''DEBUG:searx.webapp:static directory is /home/n/p/searx/searx/static
+DEBUG:searx.webapp:templates directory is /home/n/p/searx/searx/templates
+DEBUG:searx.engines:soundcloud engine: Starting background initialization
+DEBUG:searx.engines:wolframalpha engine: Starting background initialization
+DEBUG:searx.engines:locate engine: Starting background initialization
+DEBUG:searx.engines:regex search in files engine: Starting background initialization
+DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): www.wolframalpha.com
+DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): soundcloud.com
+DEBUG:searx.engines:find engine: Starting background initialization
+DEBUG:searx.engines:pattern search in files engine: Starting background initialization
+DEBUG:searx.webapp:starting webserver on 127.0.0.1:8888
+WARNING:werkzeug: * Debugger is active!
+INFO:werkzeug: * Debugger PIN: 299-578-362'''
+        echo_engine = command_engine
+        echo_engine.command = ['echo', searx_logs]
+        echo_engine.delimiter = {'chars': ':', 'keys': ['level', 'component', 'message']}
+
+        expected_results_by_page = [
+            [
+                {
+                    'component': 'searx.webapp',
+                    'message': 'static directory is /home/n/p/searx/searx/static',
+                    'template': 'key-value.html',
+                    'level': 'DEBUG',
+                },
+                {
+                    'component': 'searx.webapp',
+                    'message': 'templates directory is /home/n/p/searx/searx/templates',
+                    'template': 'key-value.html',
+                    'level': 'DEBUG',
+                },
+                {
+                    'component': 'searx.engines',
+                    'message': 'soundcloud engine: Starting background initialization',
+                    'template': 'key-value.html',
+                    'level': 'DEBUG',
+                },
+                {
+                    'component': 'searx.engines',
+                    'message': 'wolframalpha engine: Starting background initialization',
+                    'template': 'key-value.html',
+                    'level': 'DEBUG',
+                },
+                {
+                    'component': 'searx.engines',
+                    'message': 'locate engine: Starting background initialization',
+                    'template': 'key-value.html',
+                    'level': 'DEBUG',
+                },
+                {
+                    'component': 'searx.engines',
+                    'message': 'regex search in files engine: Starting background initialization',
+                    'template': 'key-value.html',
+                    'level': 'DEBUG',
+                },
+                {
+                    'component': 'urllib3.connectionpool',
+                    'message': 'Starting new HTTPS connection (1): www.wolframalpha.com',
+                    'template': 'key-value.html',
+                    'level': 'DEBUG',
+                },
+                {
+                    'component': 'urllib3.connectionpool',
+                    'message': 'Starting new HTTPS connection (1): soundcloud.com',
+                    'template': 'key-value.html',
+                    'level': 'DEBUG',
+                },
+                {
+                    'component': 'searx.engines',
+                    'message': 'find engine: Starting background initialization',
+                    'template': 'key-value.html',
+                    'level': 'DEBUG',
+                },
+                {
+                    'component': 'searx.engines',
+                    'message': 'pattern search in files engine: Starting background initialization',
+                    'template': 'key-value.html',
+                    'level': 'DEBUG',
+                },
+
+            ],
+            [
+                {
+                    'component': 'searx.webapp',
+                    'message': 'starting webserver on 127.0.0.1:8888',
+                    'template': 'key-value.html',
+                    'level': 'DEBUG',
+                },
+                {
+                    'component': 'werkzeug',
+                    'message': ' * Debugger is active!',
+                    'template': 'key-value.html',
+                    'level': 'WARNING',
+                },
+                {
+                    'component': 'werkzeug',
+                    'message': ' * Debugger PIN: 299-578-362',
+                    'template': 'key-value.html',
+                    'level': 'INFO',
+                },
+            ],
+
+        ]
+
+        for i in [0, 1]:
+            results = echo_engine.search(''.encode('utf-8'), {'pageno': i + 1})
+            self.assertEqual(results, expected_results_by_page[i])
+
+    def test_regex_parsing_command_engine(self):
+        txt = '''commit 35f9a8c81d162a361b826bbcd4a1081a4fbe76a7
+Author: Noémi Ványi 
+Date: Tue Oct 15 11:31:33 2019 +0200
+
+first interesting message
+
+commit 6c3c206316153ccc422755512bceaa9ab0b14faa
+Author: Noémi Ványi 
+Date: Mon Oct 14 17:10:08 2019 +0200
+
+second interesting message
+
+commit d8594d2689b4d5e0d2f80250223886c3a1805ef5
+Author: Noémi Ványi 
+Date: Mon Oct 14 14:45:05 2019 +0200
+
+third interesting message
+
+commit '''
+        git_log_engine = command_engine
+        git_log_engine.command = ['echo', txt]
+        git_log_engine.result_separator = '\n\ncommit '
+        git_log_engine.delimiter = {}
+        git_log_engine.parse_regex = {
+            'commit': r'\w{40}',
+            'author': r'[\w* ]* <\w*@?\w*\.?\w*>',
+            'date': 'Date: .*',
+            'message': '\n\n.*$'
+        }
+        git_log_engine.init({'command': git_log_engine.command, 'parse_regex': git_log_engine.parse_regex})
+        expected_results = [
+            {
+                'commit': '35f9a8c81d162a361b826bbcd4a1081a4fbe76a7',
+                'author': ' Noémi Ványi ',
+                'date': 'Date: Tue Oct 15 11:31:33 2019 +0200',
+                'message': '\n\nfirst interesting message',
+                'template': 'key-value.html',
+            },
+            {
+                'commit': '6c3c206316153ccc422755512bceaa9ab0b14faa',
+                'author': ' Noémi Ványi ',
+                'date': 'Date: Mon Oct 14 17:10:08 2019 +0200',
+                'message': '\n\nsecond interesting message',
+                'template': 'key-value.html',
+            },
+            {
+                'commit': 'd8594d2689b4d5e0d2f80250223886c3a1805ef5',
+                'author': ' Noémi Ványi ',
+                'date': 'Date: Mon Oct 14 14:45:05 2019 +0200',
+                'message': '\n\nthird interesting message',
+                'template': 'key-value.html',
+            },
+
+        ]
+
+        results = git_log_engine.search(''.encode('utf-8'), {'pageno': 1})
+        self.assertEqual(results, expected_results)
+
+    def test_working_dir_path_query(self):
+        ls_engine = command_engine
+        ls_engine.command = ['ls', '{{QUERY}}']
+        ls_engine.result_separator = '\n'
+        ls_engine.delimiter = {'chars': ' ', 'keys': ['file']}
+        ls_engine.query_type = 'path'
+
+        results = ls_engine.search('.'.encode(), {'pageno': 1})
+        self.assertTrue(len(results) != 0)
+
+        forbidden_paths = [
+            '..',
+            '../..',
+            './..',
+            '~',
+            '/var',
+        ]
+        for forbidden_path in forbidden_paths:
+            self.assertRaises(ValueError, ls_engine.search, forbidden_path.encode(), {'pageno': 1})
+
+    def test_enum_queries(self):
+        echo_engine = command_engine
+        echo_engine.command = ['echo', '{{QUERY}}']
+        echo_engine.delimiter = {'chars': ' ', 'keys': ['line']}
+        echo_engine.query_type = 'enum'
+        echo_engine.query_enum = ['i-am-allowed-to-say-this', 'and-that']
+
+        for allowed in echo_engine.query_enum:
+            results = echo_engine.search(allowed.encode(), {'pageno': 1})
+            self.assertTrue(len(results) != 0)
+
+        forbidden_queries = [
+            'forbidden',
+            'banned',
+            'prohibited',
+        ]
+        for forbidden in forbidden_queries:
+            self.assertRaises(ValueError, echo_engine.search, forbidden.encode(), {'pageno': 1})