mirror of https://github.com/searx/searx
[mod][fix] https rewrite refactor ++ fixes
This commit is contained in:
parent
813247b37a
commit
f141773814
|
@ -16,6 +16,7 @@ along with searx. If not, see < http://www.gnu.org/licenses/ >.
|
||||||
'''
|
'''
|
||||||
|
|
||||||
import re
|
import re
|
||||||
|
from urlparse import urlparse
|
||||||
from lxml import etree
|
from lxml import etree
|
||||||
from os import listdir
|
from os import listdir
|
||||||
from os.path import isfile, isdir, join
|
from os.path import isfile, isdir, join
|
||||||
|
@ -86,15 +87,23 @@ def load_single_https_ruleset(filepath):
|
||||||
|
|
||||||
# TODO hack, which convert a javascript regex group
|
# TODO hack, which convert a javascript regex group
|
||||||
# into a valid python regex group
|
# into a valid python regex group
|
||||||
rule_from = ruleset.attrib.get('from').replace('$', '\\')
|
rule_from = ruleset.attrib['from'].replace('$', '\\')
|
||||||
rule_to = ruleset.attrib.get('to').replace('$', '\\')
|
if rule_from.endswith('\\'):
|
||||||
|
rule_from = rule_from[:-1]+'$'
|
||||||
|
rule_to = ruleset.attrib['to'].replace('$', '\\')
|
||||||
|
if rule_to.endswith('\\'):
|
||||||
|
rule_to = rule_to[:-1]+'$'
|
||||||
|
|
||||||
# TODO, not working yet because of the hack above,
|
# TODO, not working yet because of the hack above,
|
||||||
# currently doing that in webapp.py
|
# currently doing that in webapp.py
|
||||||
# rule_from_rgx = re.compile(rule_from, re.I)
|
# rule_from_rgx = re.compile(rule_from, re.I)
|
||||||
|
|
||||||
# append rule
|
# append rule
|
||||||
rules.append((rule_from, rule_to))
|
try:
|
||||||
|
rules.append((re.compile(rule_from, re.I | re.U), rule_to))
|
||||||
|
except:
|
||||||
|
# TODO log regex error
|
||||||
|
continue
|
||||||
|
|
||||||
# this child define an exclusion
|
# this child define an exclusion
|
||||||
elif ruleset.tag == 'exclusion':
|
elif ruleset.tag == 'exclusion':
|
||||||
|
@ -143,3 +152,56 @@ def load_https_rules(rules_path):
|
||||||
https_rules.append(ruleset)
|
https_rules.append(ruleset)
|
||||||
|
|
||||||
print(' * {n} https-rules loaded'.format(n=len(https_rules)))
|
print(' * {n} https-rules loaded'.format(n=len(https_rules)))
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
def https_url_rewrite(result):
|
||||||
|
skip_https_rewrite = False
|
||||||
|
# check if HTTPS rewrite is possible
|
||||||
|
for target, rules, exclusions in https_rules:
|
||||||
|
|
||||||
|
# check if target regex match with url
|
||||||
|
if target.match(result['parsed_url'].netloc):
|
||||||
|
# process exclusions
|
||||||
|
for exclusion in exclusions:
|
||||||
|
# check if exclusion match with url
|
||||||
|
if exclusion.match(result['url']):
|
||||||
|
skip_https_rewrite = True
|
||||||
|
break
|
||||||
|
|
||||||
|
# skip https rewrite if required
|
||||||
|
if skip_https_rewrite:
|
||||||
|
break
|
||||||
|
|
||||||
|
# process rules
|
||||||
|
for rule in rules:
|
||||||
|
try:
|
||||||
|
new_result_url = rule[0].sub(rule[1], result['url'])
|
||||||
|
except:
|
||||||
|
break
|
||||||
|
|
||||||
|
# parse new url
|
||||||
|
new_parsed_url = urlparse(new_result_url)
|
||||||
|
|
||||||
|
# continiue if nothing was rewritten
|
||||||
|
if result['url'] == new_result_url:
|
||||||
|
continue
|
||||||
|
|
||||||
|
# get domainname from result
|
||||||
|
# TODO, does only work correct with TLD's like
|
||||||
|
# asdf.com, not for asdf.com.de
|
||||||
|
# TODO, using publicsuffix instead of this rewrite rule
|
||||||
|
old_result_domainname = '.'.join(
|
||||||
|
result['parsed_url'].hostname.split('.')[-2:])
|
||||||
|
new_result_domainname = '.'.join(
|
||||||
|
new_parsed_url.hostname.split('.')[-2:])
|
||||||
|
|
||||||
|
# check if rewritten hostname is the same,
|
||||||
|
# to protect against wrong or malicious rewrite rules
|
||||||
|
if old_result_domainname == new_result_domainname:
|
||||||
|
# set new url
|
||||||
|
result['url'] = new_result_url
|
||||||
|
|
||||||
|
# target has matched, do not search over the other rules
|
||||||
|
break
|
||||||
|
return result
|
||||||
|
|
|
@ -89,7 +89,7 @@
|
||||||
<rule from="^http://([aiw]\d|api|wis)\.sndcdn\.com/"
|
<rule from="^http://([aiw]\d|api|wis)\.sndcdn\.com/"
|
||||||
to="https://$1.sndcdn.com/" />
|
to="https://$1.sndcdn.com/" />
|
||||||
|
|
||||||
<rule from="^http://((?:api|backstage|blog|connect|developers|ec-media|eventlogger|help-assets|media|visuals|w|www)\.)?soundcloud\.com/"
|
<rule from="^http://((?:api|backstage|blog|connect|developers|ec-media|eventlogger|help-assets|media|visuals|w|www)\.|)soundcloud\.com/"
|
||||||
to="https://$1soundcloud.com/" />
|
to="https://$1soundcloud.com/" />
|
||||||
|
|
||||||
<rule from="^https?://scbackstage\.wpengine\.netdna-cdn\.com/"
|
<rule from="^https?://scbackstage\.wpengine\.netdna-cdn\.com/"
|
||||||
|
|
|
@ -41,15 +41,12 @@ from searx.utils import (
|
||||||
UnicodeWriter, highlight_content, html_to_text, get_themes
|
UnicodeWriter, highlight_content, html_to_text, get_themes
|
||||||
)
|
)
|
||||||
from searx.version import VERSION_STRING
|
from searx.version import VERSION_STRING
|
||||||
from searx.https_rewrite import https_rules
|
|
||||||
from searx.languages import language_codes
|
from searx.languages import language_codes
|
||||||
|
from searx.https_rewrite import https_url_rewrite
|
||||||
from searx.search import Search
|
from searx.search import Search
|
||||||
from searx.query import Query
|
from searx.query import Query
|
||||||
from searx.autocomplete import backends as autocomplete_backends
|
from searx.autocomplete import backends as autocomplete_backends
|
||||||
|
|
||||||
from urlparse import urlparse
|
|
||||||
import re
|
|
||||||
|
|
||||||
|
|
||||||
static_path, templates_path, themes =\
|
static_path, templates_path, themes =\
|
||||||
get_themes(settings['themes_path']
|
get_themes(settings['themes_path']
|
||||||
|
@ -215,59 +212,7 @@ def index():
|
||||||
if settings['server']['https_rewrite']\
|
if settings['server']['https_rewrite']\
|
||||||
and result['parsed_url'].scheme == 'http':
|
and result['parsed_url'].scheme == 'http':
|
||||||
|
|
||||||
skip_https_rewrite = False
|
result = https_url_rewrite(result)
|
||||||
|
|
||||||
# check if HTTPS rewrite is possible
|
|
||||||
for target, rules, exclusions in https_rules:
|
|
||||||
|
|
||||||
# check if target regex match with url
|
|
||||||
if target.match(result['url']):
|
|
||||||
# process exclusions
|
|
||||||
for exclusion in exclusions:
|
|
||||||
# check if exclusion match with url
|
|
||||||
if exclusion.match(result['url']):
|
|
||||||
skip_https_rewrite = True
|
|
||||||
break
|
|
||||||
|
|
||||||
# skip https rewrite if required
|
|
||||||
if skip_https_rewrite:
|
|
||||||
break
|
|
||||||
|
|
||||||
# process rules
|
|
||||||
for rule in rules:
|
|
||||||
try:
|
|
||||||
# TODO, precompile rule
|
|
||||||
p = re.compile(rule[0])
|
|
||||||
|
|
||||||
# rewrite url if possible
|
|
||||||
new_result_url = p.sub(rule[1], result['url'])
|
|
||||||
except:
|
|
||||||
break
|
|
||||||
|
|
||||||
# parse new url
|
|
||||||
new_parsed_url = urlparse(new_result_url)
|
|
||||||
|
|
||||||
# continiue if nothing was rewritten
|
|
||||||
if result['url'] == new_result_url:
|
|
||||||
continue
|
|
||||||
|
|
||||||
# get domainname from result
|
|
||||||
# TODO, does only work correct with TLD's like
|
|
||||||
# asdf.com, not for asdf.com.de
|
|
||||||
# TODO, using publicsuffix instead of this rewrite rule
|
|
||||||
old_result_domainname = '.'.join(
|
|
||||||
result['parsed_url'].hostname.split('.')[-2:])
|
|
||||||
new_result_domainname = '.'.join(
|
|
||||||
new_parsed_url.hostname.split('.')[-2:])
|
|
||||||
|
|
||||||
# check if rewritten hostname is the same,
|
|
||||||
# to protect against wrong or malicious rewrite rules
|
|
||||||
if old_result_domainname == new_result_domainname:
|
|
||||||
# set new url
|
|
||||||
result['url'] = new_result_url
|
|
||||||
|
|
||||||
# target has matched, do not search over the other rules
|
|
||||||
break
|
|
||||||
|
|
||||||
if search.request_data.get('format', 'html') == 'html':
|
if search.request_data.get('format', 'html') == 'html':
|
||||||
if 'content' in result:
|
if 'content' in result:
|
||||||
|
|
Loading…
Reference in New Issue