Merge pull request #621 from stepshal/anomalous-backslash-in-string

Fix anomalous backslash in string
This commit is contained in:
Adam Tauber 2016-07-18 22:27:17 +02:00 committed by GitHub
commit aa09f963eb
21 changed files with 47 additions and 47 deletions

View File

@@ -9,7 +9,7 @@ categories = []
url = 'https://download.finance.yahoo.com/d/quotes.csv?e=.csv&f=sl1d1t1&s={query}=X' url = 'https://download.finance.yahoo.com/d/quotes.csv?e=.csv&f=sl1d1t1&s={query}=X'
weight = 100 weight = 100
parser_re = re.compile(u'.*?(\d+(?:\.\d+)?) ([^.0-9]+) (?:in|to) ([^.0-9]+)', re.I) # noqa parser_re = re.compile(u'.*?(\\d+(?:\\.\\d+)?) ([^.0-9]+) (?:in|to) ([^.0-9]+)', re.I) # noqa
db = 1 db = 1

View File

@@ -47,7 +47,7 @@ def response(resp):
dom = html.fromstring(resp.text) dom = html.fromstring(resp.text)
regex = re.compile('\/200H\/') regex = re.compile(r'\/200H\/')
# parse results # parse results
for result in dom.xpath('//div[contains(@class, "tt-a tt-fh")]'): for result in dom.xpath('//div[contains(@class, "tt-a tt-fh")]'):

View File

@@ -300,9 +300,9 @@ def parse_map_detail(parsed_url, result, google_hostname):
results = [] results = []
# try to parse the geoloc # try to parse the geoloc
m = re.search('@([0-9\.]+),([0-9\.]+),([0-9]+)', parsed_url.path) m = re.search(r'@([0-9\.]+),([0-9\.]+),([0-9]+)', parsed_url.path)
if m is None: if m is None:
m = re.search('ll\=([0-9\.]+),([0-9\.]+)\&z\=([0-9]+)', parsed_url.query) m = re.search(r'll\=([0-9\.]+),([0-9\.]+)\&z\=([0-9]+)', parsed_url.query)
if m is not None: if m is not None:
# geoloc found (ignored) # geoloc found (ignored)

View File

@@ -68,15 +68,15 @@ def response(resp):
url = link.attrib.get('href') url = link.attrib.get('href')
# block google-ad url's # block google-ad url's
if re.match("^http(s|)://(www\.)?google\.[a-z]+/aclk.*$", url): if re.match(r"^http(s|)://(www\.)?google\.[a-z]+/aclk.*$", url):
continue continue
# block startpage search url's # block startpage search url's
if re.match("^http(s|)://(www\.)?startpage\.com/do/search\?.*$", url): if re.match(r"^http(s|)://(www\.)?startpage\.com/do/search\?.*$", url):
continue continue
# block ixquick search url's # block ixquick search url's
if re.match("^http(s|)://(www\.)?ixquick\.com/do/search\?.*$", url): if re.match(r"^http(s|)://(www\.)?ixquick\.com/do/search\?.*$", url):
continue continue
title = escape(extract_text(link)) title = escape(extract_text(link))
@@ -89,7 +89,7 @@ def response(resp):
published_date = None published_date = None
# check if search result starts with something like: "2 Sep 2014 ... " # check if search result starts with something like: "2 Sep 2014 ... "
if re.match("^([1-9]|[1-2][0-9]|3[0-1]) [A-Z][a-z]{2} [0-9]{4} \.\.\. ", content): if re.match(r"^([1-9]|[1-2][0-9]|3[0-1]) [A-Z][a-z]{2} [0-9]{4} \.\.\. ", content):
date_pos = content.find('...') + 4 date_pos = content.find('...') + 4
date_string = content[0:date_pos - 5] date_string = content[0:date_pos - 5]
published_date = parser.parse(date_string, dayfirst=True) published_date = parser.parse(date_string, dayfirst=True)
@@ -98,7 +98,7 @@ def response(resp):
content = content[date_pos:] content = content[date_pos:]
# check if search result starts with something like: "5 days ago ... " # check if search result starts with something like: "5 days ago ... "
elif re.match("^[0-9]+ days? ago \.\.\. ", content): elif re.match(r"^[0-9]+ days? ago \.\.\. ", content):
date_pos = content.find('...') + 4 date_pos = content.find('...') + 4
date_string = content[0:date_pos - 5] date_string = content[0:date_pos - 5]

View File

@@ -25,10 +25,10 @@ base_url = 'https://swisscows.ch/'
search_string = '?{query}&page={page}' search_string = '?{query}&page={page}'
# regex # regex
regex_json = re.compile('initialData: {"Request":(.|\n)*},\s*environment') regex_json = re.compile(r'initialData: {"Request":(.|\n)*},\s*environment')
regex_json_remove_start = re.compile('^initialData:\s*') regex_json_remove_start = re.compile(r'^initialData:\s*')
regex_json_remove_end = re.compile(',\s*environment$') regex_json_remove_end = re.compile(r',\s*environment$')
regex_img_url_remove_start = re.compile('^https?://i\.swisscows\.ch/\?link=') regex_img_url_remove_start = re.compile(r'^https?://i\.swisscows\.ch/\?link=')
# do search-request # do search-request

View File

@@ -48,7 +48,7 @@ def response(resp):
return [] return []
# regular expression for parsing torrent size strings # regular expression for parsing torrent size strings
size_re = re.compile('Size:\s*([\d.]+)(TB|GB|MB|B)', re.IGNORECASE) size_re = re.compile(r'Size:\s*([\d.]+)(TB|GB|MB|B)', re.IGNORECASE)
# processing the results, two rows at a time # processing the results, two rows at a time
for i in xrange(0, len(rows), 2): for i in xrange(0, len(rows), 2):

View File

@@ -41,7 +41,7 @@ def response(resp):
results = [] results = []
dom = html.fromstring(resp.text) dom = html.fromstring(resp.text)
regex = re.compile('3\.jpg.*$') regex = re.compile(r'3\.jpg.*$')
# parse results # parse results
for result in dom.xpath('//div[@class="photo"]'): for result in dom.xpath('//div[@class="photo"]'):

View File

@@ -55,7 +55,7 @@ def request(query, params):
def sanitize_url(url): def sanitize_url(url):
if ".yahoo.com/" in url: if ".yahoo.com/" in url:
return re.sub(u"\;\_ylt\=.+$", "", url) return re.sub(u"\\;\\_ylt\\=.+$", "", url)
else: else:
return url return url

View File

@@ -87,7 +87,7 @@ def load_single_https_ruleset(rules_path):
# convert host-rule to valid regex # convert host-rule to valid regex
host = ruleset.attrib.get('host')\ host = ruleset.attrib.get('host')\
.replace('.', '\.').replace('*', '.*') .replace('.', r'\.').replace('*', '.*')
# append to host list # append to host list
hosts.append(host) hosts.append(host)

View File

@@ -5,7 +5,7 @@ from threading import RLock
from urlparse import urlparse, unquote from urlparse import urlparse, unquote
from searx.engines import engines from searx.engines import engines
CONTENT_LEN_IGNORED_CHARS_REGEX = re.compile('[,;:!?\./\\\\ ()-_]', re.M | re.U) CONTENT_LEN_IGNORED_CHARS_REGEX = re.compile(r'[,;:!?\./\\\\ ()-_]', re.M | re.U)
WHITESPACE_REGEX = re.compile('( |\t|\n)+', re.M | re.U) WHITESPACE_REGEX = re.compile('( |\t|\n)+', re.M | re.U)

View File

@@ -63,7 +63,7 @@ def highlight_content(content, query):
regex_parts = [] regex_parts = []
for chunk in query.split(): for chunk in query.split():
if len(chunk) == 1: if len(chunk) == 1:
regex_parts.append(u'\W+{0}\W+'.format(re.escape(chunk))) regex_parts.append(u'\\W+{0}\\W+'.format(re.escape(chunk)))
else: else:
regex_parts.append(u'{0}'.format(re.escape(chunk))) regex_parts.append(u'{0}'.format(re.escape(chunk)))
query_regex = u'({0})'.format('|'.join(regex_parts)) query_regex = u'({0})'.format('|'.join(regex_parts))

View File

@@ -62,7 +62,7 @@ class TestDailymotionEngine(SearxTestCase):
self.assertEqual(results[0]['content'], 'Description') self.assertEqual(results[0]['content'], 'Description')
self.assertIn('x2fit7q', results[0]['embedded']) self.assertIn('x2fit7q', results[0]['embedded'])
json = """ json = r"""
{"toto":[ {"toto":[
{"id":200,"name":"Artist Name", {"id":200,"name":"Artist Name",
"link":"http:\/\/www.dailymotion.com\/artist\/1217","type":"artist"} "link":"http:\/\/www.dailymotion.com\/artist\/1217","type":"artist"}

View File

@@ -27,7 +27,7 @@ class TestDeezerEngine(SearxTestCase):
response = mock.Mock(text='{"data": []}') response = mock.Mock(text='{"data": []}')
self.assertEqual(deezer.response(response), []) self.assertEqual(deezer.response(response), [])
json = """ json = r"""
{"data":[ {"data":[
{"id":100, "title":"Title of track", {"id":100, "title":"Title of track",
"link":"https:\/\/www.deezer.com\/track\/1094042","duration":232, "link":"https:\/\/www.deezer.com\/track\/1094042","duration":232,
@@ -45,7 +45,7 @@ class TestDeezerEngine(SearxTestCase):
self.assertEqual(results[0]['content'], 'Artist Name • Album Title • Title of track') self.assertEqual(results[0]['content'], 'Artist Name • Album Title • Title of track')
self.assertTrue('100' in results[0]['embedded']) self.assertTrue('100' in results[0]['embedded'])
json = """ json = r"""
{"data":[ {"data":[
{"id":200,"name":"Artist Name", {"id":200,"name":"Artist Name",
"link":"https:\/\/www.deezer.com\/artist\/1217","type":"artist"} "link":"https:\/\/www.deezer.com\/artist\/1217","type":"artist"}

View File

@@ -27,7 +27,7 @@ class TestFlickrEngine(SearxTestCase):
response = mock.Mock(text='{"data": []}') response = mock.Mock(text='{"data": []}')
self.assertEqual(flickr.response(response), []) self.assertEqual(flickr.response(response), [])
json = """ json = r"""
{ "photos": { "page": 1, "pages": "41001", "perpage": 100, "total": "4100032", { "photos": { "page": 1, "pages": "41001", "perpage": 100, "total": "4100032",
"photo": [ "photo": [
{ "id": "15751017054", "owner": "66847915@N08", { "id": "15751017054", "owner": "66847915@N08",
@@ -55,7 +55,7 @@ class TestFlickrEngine(SearxTestCase):
self.assertTrue('Owner' in results[0]['content']) self.assertTrue('Owner' in results[0]['content'])
self.assertTrue('Description' in results[0]['content']) self.assertTrue('Description' in results[0]['content'])
json = """ json = r"""
{ "photos": { "page": 1, "pages": "41001", "perpage": 100, "total": "4100032", { "photos": { "page": 1, "pages": "41001", "perpage": 100, "total": "4100032",
"photo": [ "photo": [
{ "id": "15751017054", "owner": "66847915@N08", { "id": "15751017054", "owner": "66847915@N08",
@@ -79,7 +79,7 @@ class TestFlickrEngine(SearxTestCase):
self.assertTrue('Owner' in results[0]['content']) self.assertTrue('Owner' in results[0]['content'])
self.assertTrue('Description' in results[0]['content']) self.assertTrue('Description' in results[0]['content'])
json = """ json = r"""
{ "photos": { "page": 1, "pages": "41001", "perpage": 100, "total": "4100032", { "photos": { "page": 1, "pages": "41001", "perpage": 100, "total": "4100032",
"photo": [ "photo": [
{ "id": "15751017054", "owner": "66847915@N08", { "id": "15751017054", "owner": "66847915@N08",
@@ -103,7 +103,7 @@ class TestFlickrEngine(SearxTestCase):
self.assertTrue('Owner' in results[0]['content']) self.assertTrue('Owner' in results[0]['content'])
self.assertTrue('Description' in results[0]['content']) self.assertTrue('Description' in results[0]['content'])
json = """ json = r"""
{ "photos": { "page": 1, "pages": "41001", "perpage": 100, "total": "4100032", { "photos": { "page": 1, "pages": "41001", "perpage": 100, "total": "4100032",
"photo": [ "photo": [
{ "id": "15751017054", "owner": "66847915@N08", { "id": "15751017054", "owner": "66847915@N08",
@@ -130,7 +130,7 @@ class TestFlickrEngine(SearxTestCase):
self.assertEqual(type(results), list) self.assertEqual(type(results), list)
self.assertEqual(len(results), 0) self.assertEqual(len(results), 0)
json = """ json = r"""
{"toto":[ {"toto":[
{"id":200,"name":"Artist Name", {"id":200,"name":"Artist Name",
"link":"http:\/\/www.flickr.com\/artist\/1217","type":"artist"} "link":"http:\/\/www.flickr.com\/artist\/1217","type":"artist"}

View File

@@ -316,7 +316,7 @@ class TestFlickrNoapiEngine(SearxTestCase):
self.assertEqual(len(results), 0) self.assertEqual(len(results), 0)
# garbage test # garbage test
json = """ json = r"""
{"toto":[ {"toto":[
{"id":200,"name":"Artist Name", {"id":200,"name":"Artist Name",
"link":"http:\/\/www.flickr.com\/artist\/1217","type":"artist"} "link":"http:\/\/www.flickr.com\/artist\/1217","type":"artist"}

View File

@@ -33,23 +33,23 @@ class TestInaEngine(SearxTestCase):
<div class=\\"search-results--list\\"><div class=\\"media\\">\\n\ <div class=\\"search-results--list\\"><div class=\\"media\\">\\n\
\\t\\t\\t\\t<a class=\\"media-left media-video premium xiti_click_action\\" \ \\t\\t\\t\\t<a class=\\"media-left media-video premium xiti_click_action\\" \
data-xiti-params=\\"recherche_v4::resultats_conference_de_presse_du_general_de_gaulle::N\\" \ data-xiti-params=\\"recherche_v4::resultats_conference_de_presse_du_general_de_gaulle::N\\" \
href=\\"\/video\/CAF89035682\/conference-de-presse-du-general-de-gaulle-video.html\\">\\n\ href=\\"\\/video\\/CAF89035682\\/conference-de-presse-du-general-de-gaulle-video.html\\">\\n\
<img src=\\"https:\/\/www.ina.fr\/images_v2\/140x105\/CAF89035682.jpeg\\" \ <img src=\\"https:\\/\\/www.ina.fr\\/images_v2\\/140x105\\/CAF89035682.jpeg\\" \
alt=\\"Conf\\u00e9rence de presse du G\\u00e9n\\u00e9ral de Gaulle \\">\\n\ alt=\\"Conf\\u00e9rence de presse du G\\u00e9n\\u00e9ral de Gaulle \\">\\n\
\\t\\t\\t\\t\\t<\/a>\\n\ \\t\\t\\t\\t\\t<\\/a>\\n\
\\t\\t\\t\\t\\t<div class=\\"media-body\\">\\n\\t\\t\\t\\t\\t\\t<h3 class=\\"h3--title media-heading\\">\\n\ \\t\\t\\t\\t\\t<div class=\\"media-body\\">\\n\\t\\t\\t\\t\\t\\t<h3 class=\\"h3--title media-heading\\">\\n\
\\t\\t\\t\\t\\t\\t\\t<a class=\\"xiti_click_action\\" \ \\t\\t\\t\\t\\t\\t\\t<a class=\\"xiti_click_action\\" \
data-xiti-params=\\"recherche_v4::resultats_conference_de_presse_du_general_de_gaulle::N\\" \ data-xiti-params=\\"recherche_v4::resultats_conference_de_presse_du_general_de_gaulle::N\\" \
href=\\"\/video\/CAF89035682\/conference-de-presse-du-general-de-gaulle-video.html\\">\ href=\\"\\/video\\/CAF89035682\\/conference-de-presse-du-general-de-gaulle-video.html\\">\
Conf\\u00e9rence de presse du G\\u00e9n\\u00e9ral de Gaulle <\/a>\\n\ Conf\\u00e9rence de presse du G\\u00e9n\\u00e9ral de Gaulle <\\/a>\\n\
<\/h3>\\n\ <\\/h3>\\n\
<div class=\\"media-body__info\\">\\n<span class=\\"broadcast\\">27\/11\/1967<\/span>\\n\ <div class=\\"media-body__info\\">\\n<span class=\\"broadcast\\">27\\/11\\/1967<\\/span>\\n\
<span class=\\"views\\">29321 vues<\/span>\\n\ <span class=\\"views\\">29321 vues<\\/span>\\n\
<span class=\\"duration\\">01h 33m 07s<\/span>\\n\ <span class=\\"duration\\">01h 33m 07s<\\/span>\\n\
<\/div>\\n\ <\\/div>\\n\
<p class=\\"media-body__summary\\">VERSION INTEGRALE DE LA CONFERENCE DE PRESSE DU GENERAL DE GAULLE . \ <p class=\\"media-body__summary\\">VERSION INTEGRALE DE LA CONFERENCE DE PRESSE DU GENERAL DE GAULLE . \
- PA le Pr\\u00e9sident DE GAULLE : il ouvre les bras et s'assied. DP journalis...<\/p>\\n\ - PA le Pr\\u00e9sident DE GAULLE : il ouvre les bras et s'assied. DP journalis...<\\/p>\\n\
<\/div>\\n<\/div><!-- \/.media -->\\n" <\\/div>\\n<\\/div><!-- \\/.media -->\\n"
} }
""" """
response = mock.Mock(text=json) response = mock.Mock(text=json)

View File

@@ -118,7 +118,7 @@ class TestMediawikiEngine(SearxTestCase):
self.assertEqual(type(results), list) self.assertEqual(type(results), list)
self.assertEqual(len(results), 0) self.assertEqual(len(results), 0)
json = """ json = r"""
{"toto":[ {"toto":[
{"id":200,"name":"Artist Name", {"id":200,"name":"Artist Name",
"link":"http:\/\/www.mediawiki.com\/artist\/1217","type":"artist"} "link":"http:\/\/www.mediawiki.com\/artist\/1217","type":"artist"}

View File

@@ -55,7 +55,7 @@ class TestMixcloudEngine(SearxTestCase):
self.assertEqual(results[0]['content'], 'User') self.assertEqual(results[0]['content'], 'User')
self.assertTrue('http://www.mixcloud.com/user/this-is-the-url/' in results[0]['embedded']) self.assertTrue('http://www.mixcloud.com/user/this-is-the-url/' in results[0]['embedded'])
json = """ json = r"""
{"toto":[ {"toto":[
{"id":200,"name":"Artist Name", {"id":200,"name":"Artist Name",
"link":"http:\/\/www.mixcloud.com\/artist\/1217","type":"artist"} "link":"http:\/\/www.mixcloud.com\/artist\/1217","type":"artist"}

View File

@@ -63,7 +63,7 @@ class TestSearchcodeCodeEngine(SearxTestCase):
self.assertEqual(results[0]['repository'], 'https://repo') self.assertEqual(results[0]['repository'], 'https://repo')
self.assertEqual(results[0]['code_language'], 'cpp') self.assertEqual(results[0]['code_language'], 'cpp')
json = """ json = r"""
{"toto":[ {"toto":[
{"id":200,"name":"Artist Name", {"id":200,"name":"Artist Name",
"link":"http:\/\/www.searchcode_code.com\/artist\/1217","type":"artist"} "link":"http:\/\/www.searchcode_code.com\/artist\/1217","type":"artist"}

View File

@@ -61,7 +61,7 @@ class TestSearchcodeDocEngine(SearxTestCase):
self.assertIn('test', results[0]['content']) self.assertIn('test', results[0]['content'])
self.assertIn('Description', results[0]['content']) self.assertIn('Description', results[0]['content'])
json = """ json = r"""
{"toto":[ {"toto":[
{"id":200,"name":"Artist Name", {"id":200,"name":"Artist Name",
"link":"http:\/\/www.searchcode_doc.com\/artist\/1217","type":"artist"} "link":"http:\/\/www.searchcode_doc.com\/artist\/1217","type":"artist"}

View File

@@ -28,7 +28,7 @@ class TestWolframAlphaNoAPIEngine(SearxTestCase):
request = Request(headers={'Referer': referer_url}) request = Request(headers={'Referer': referer_url})
# test failure # test failure
json = ''' json = r'''
{"queryresult" : { {"queryresult" : {
"success" : false, "success" : false,
"error" : false, "error" : false,
@@ -42,7 +42,7 @@ class TestWolframAlphaNoAPIEngine(SearxTestCase):
self.assertEqual(wolframalpha_noapi.response(response), []) self.assertEqual(wolframalpha_noapi.response(response), [])
# test basic case # test basic case
json = ''' json = r'''
{"queryresult" : { {"queryresult" : {
"success" : true, "success" : true,
"error" : false, "error" : false,
@@ -143,7 +143,7 @@ class TestWolframAlphaNoAPIEngine(SearxTestCase):
self.assertEqual('Wolfram|Alpha', results[1]['title']) self.assertEqual('Wolfram|Alpha', results[1]['title'])
# test calc # test calc
json = """ json = r"""
{"queryresult" : { {"queryresult" : {
"success" : true, "success" : true,
"error" : false, "error" : false,