2018-01-17 15:28:07 +01:00
|
|
|
#! /usr/bin/env python
|
|
|
|
|
2020-04-21 23:07:32 +02:00
|
|
|
# Use the following command to ensure the right encoding:
|
|
|
|
# python generate-domains-blacklist.py -o list.txt.tmp && mv -f list.txt.tmp list
|
|
|
|
|
2018-01-17 15:28:07 +01:00
|
|
|
|
|
|
|
import argparse
|
|
|
|
import re
|
|
|
|
import sys
|
2020-04-21 23:07:32 +02:00
|
|
|
from io import StringIO
|
|
|
|
from fnmatch import fnmatch
|
2018-01-17 15:28:07 +01:00
|
|
|
|
2019-05-22 10:15:08 +02:00
|
|
|
try:
|
|
|
|
import urllib2 as urllib
|
2019-08-27 18:25:47 +02:00
|
|
|
|
2019-05-22 10:15:08 +02:00
|
|
|
URLLIB_NEW = False
|
|
|
|
except (ImportError, ModuleNotFoundError):
|
|
|
|
import urllib.request as urllib
|
|
|
|
from urllib.request import Request
|
2019-08-27 18:25:47 +02:00
|
|
|
|
2019-05-22 10:15:08 +02:00
|
|
|
URLLIB_NEW = True
|
2018-01-17 15:28:07 +01:00
|
|
|
|
2019-08-27 18:25:47 +02:00
|
|
|
|
2019-02-15 00:03:02 +01:00
|
|
|
def parse_time_restricted_list(content):
    """Extract names and optional time-restriction labels from list text.

    Every line is stripped and lowercased. Blank lines and lines starting
    with ``#`` are skipped, and a trailing ``# comment`` is removed. What
    remains must be a name (``*`` wildcards allowed), optionally followed
    by an ``@label`` token; lines of any other shape are ignored.

    Returns a tuple ``(names, time_restrictions)``: the set of all names
    found, and a dict mapping a name to its ``@label`` for those lines
    that carried one.
    """
    comment_rx = re.compile(r"^(#|$)")
    inline_comment_rx = re.compile(r"\s*#\s*[a-z0-9-].*$")
    entry_rx = re.compile(r"^([*a-z0-9.-]+)\s*(@\S+)?$")

    names = set()
    time_restrictions = {}
    for raw_line in content.splitlines():
        entry = raw_line.strip().lower()
        # Blank lines and whole-line comments carry no entry.
        if comment_rx.match(entry):
            continue
        found = entry_rx.match(inline_comment_rx.sub("", entry))
        if found is None:
            continue
        name, label = found.groups()
        names.add(name)
        if label:
            time_restrictions[name] = label
    return names, time_restrictions
|
|
|
|
|
|
|
|
|
|
|
|
def parse_trusted_list(content):
    """Parse a trusted list, keeping only the names.

    The text is parsed with ``parse_time_restricted_list``; any time
    restriction labels it reports are dropped, so the second element of
    the returned ``(names, time_restrictions)`` tuple is always an empty
    dict.
    """
    parsed_names = parse_time_restricted_list(content)[0]
    return parsed_names, {}
|
2019-02-14 23:27:19 +01:00
|
|
|
|
|
|
|
|
2018-02-19 16:38:06 +01:00
|
|
|
def parse_list(content, trusted=False):
    """Extract domain names from the text of a block list.

    Args:
        content: full text of the list.
        trusted: when true, the text is handed to ``parse_trusted_list``
            and its result is returned unchanged.

    Returns:
        A tuple ``(names, time_restrictions)``. For untrusted lists
        ``names`` is the set of lowercased domain names recognised in any
        of the supported line shapes and ``time_restrictions`` is always
        an empty dict.
    """
    # Decide this first: trusted lists never use the patterns below, so
    # there is no point compiling them.
    if trusted:
        return parse_trusted_list(content)

    rx_comment = re.compile(r"^(#|$)")
    rx_inline_comment = re.compile(r"\s*#\s*[a-z0-9-].*$")
    # "||name^" rules, optionally prefixed by "@" and suffixed by
    # "$popup" or "$third-party".
    rx_u = re.compile(
        r"^@*\|\|([a-z0-9][a-z0-9.-]*[.][a-z]{2,})\^?(\$(popup|third-party))?$")
    # A bare domain name on its own line.
    rx_l = re.compile(r"^([a-z0-9][a-z0-9.-]*[.][a-z]{2,})$")
    # "<dotted-quad> <name>" lines.
    rx_h = re.compile(
        r"^[0-9]{1,3}[.][0-9]{1,3}[.][0-9]{1,3}[.][0-9]{1,3}\s+([a-z0-9][a-z0-9.-]*[.][a-z]{2,})$"
    )
    # Quoted CSV whose second field is the name.
    rx_mdl = re.compile(r'^"[^"]+","([a-z0-9][a-z0-9.-]*[.][a-z]{2,})",')
    # CSV starting with the name and containing a date-like field.
    rx_b = re.compile(r"^([a-z0-9][a-z0-9.-]*[.][a-z]{2,}),.+,[0-9: /-]+,")
    # "address=/name/..." lines.
    rx_dq = re.compile(r"^address=/([a-z0-9][a-z0-9.-]*[.][a-z]{2,})/.")

    names = set()
    time_restrictions = {}
    rx_set = [rx_u, rx_l, rx_h, rx_mdl, rx_b, rx_dq]
    for line in content.splitlines():
        line = line.strip().lower()
        if rx_comment.match(line):
            continue
        line = rx_inline_comment.sub("", line)
        for rx in rx_set:
            matches = rx.match(line)
            if matches:
                names.add(matches.group(1))
                # The patterns describe mutually exclusive line shapes,
                # so the first hit is the only possible one.
                break
    return names, time_restrictions
|
|
|
|
|
|
|
|
|
2020-04-21 23:07:32 +02:00
|
|
|
# basic check if the line contains any glob specific characters
|
|
|
|
def is_glob(line):
    """Tell whether ``line`` contains any of the characters ``*[]?``."""
    # "=" is ignored for now.
    for marker in "*[]?":
        if marker in line:
            return True
    return False
|
|
|
|
|
|
|
|
|
|
|
|
def get_lines_with_globs(names):
    """Return the set of entries in ``names`` for which ``is_glob`` is true."""
    return {candidate for candidate in names if is_glob(candidate)}
|
|
|
|
|
|
|
|
|
2019-02-15 00:03:02 +01:00
|
|
|
def print_restricted_name(name, time_restrictions):
    """Print one time-restricted entry to standard output.

    Writes ``name<TAB>label`` when ``name`` has a label in
    ``time_restrictions``; otherwise writes a ``# ignored`` comment line
    saying the label is missing.
    """
    if name not in time_restrictions:
        message = (
            "# ignored: [{}] was in the time-restricted list, "
            "but without a time restriction label"
        )
        print(message.format(name))
        return
    print("{}\t{}".format(name, time_restrictions[name]))
|
2018-01-17 15:28:07 +01:00
|
|
|
|
|
|
|
|
2018-02-19 14:38:43 +01:00
|
|
|
def load_from_url(url):
    """Fetch the text behind ``url`` and report whether it is trusted.

    A progress line is written to standard error before the request is
    made. The request timeout is taken from the module-level
    ``args.timeout``.

    Returns:
        A tuple ``(content, trusted)``. ``trusted`` is true only when the
        URL scheme is ``file``. When ``URLLIB_NEW`` is set the payload is
        decoded as UTF-8, with undecodable bytes replaced.

    Raises:
        Exception: if the URL cannot be opened, or if a non-``file`` URL
            answers with a status code other than 200.
    """
    sys.stderr.write("Loading data from [{}]\n".format(url))
    req = urllib.Request(url=url, headers={"User-Agent": "dnscrypt-proxy"})

    # The request object exposes its scheme through an attribute or a
    # method depending on which urllib flavour was imported.
    req_type = req.type if URLLIB_NEW else req.get_type()
    trusted = req_type == "file"

    try:
        response = urllib.urlopen(req, timeout=int(args.timeout))
    except urllib.URLError as err:
        raise Exception("[{}] could not be loaded: {}\n".format(url, err))

    try:
        if not trusted and response.getcode() != 200:
            raise Exception("[{}] returned HTTP code {}\n".format(
                url, response.getcode()))
        content = response.read()
    finally:
        # Always release the underlying socket / file handle, including
        # when the status check or the read fails.
        response.close()

    if URLLIB_NEW:
        content = content.decode("utf-8", errors="replace")

    return content, trusted
|
2018-01-17 15:28:07 +01:00
|
|
|
|
|
|
|
|
|
|
|
def name_cmp(name):
    """Return ``name`` with its dot-separated labels in reverse order.

    For example ``"www.example.com"`` becomes ``"com.example.www"``.
    """
    return ".".join(name.split(".")[::-1])
|
|
|
|
|
|
|
|
|
|
|
|
def has_suffix(names, name):
    """Tell whether a proper dot-suffix of ``name`` is present in ``names``.

    ``name`` itself is not tested. Candidates are formed by dropping one
    leading label at a time; the last candidate is the empty string.
    """
    labels = name.split(".")
    for start in range(1, len(labels) + 1):
        if ".".join(labels[start:]) in names:
            return True
    return False
|
|
|
|
|
|
|
|
|
2020-04-21 23:07:32 +02:00
|
|
|
# check if a line matches with any of the collected globs:
|
|
|
|
def covered_by_glob(line, glob_list):
    """Tell whether ``line`` matches any pattern in ``glob_list``.

    A ``line`` that is itself a member of ``glob_list`` is never reported
    as covered. Matching is done with ``fnmatch``.
    """
    if line in glob_list:
        return False
    return any(fnmatch(line, pattern) for pattern in glob_list)
|
|
|
|
|
|
|
|
|
2018-01-17 15:28:07 +01:00
|
|
|
def whitelist_from_url(url):
    """Load the set of names to whitelist from ``url``.

    Returns an empty set when ``url`` is empty or ``None``. Otherwise the
    content is fetched with ``load_from_url`` and parsed with
    ``parse_list``; only the names are returned.
    """
    if url:
        content, trusted = load_from_url(url)
        return parse_list(content, trusted)[0]
    return set()
|
2018-01-17 15:28:07 +01:00
|
|
|
|
|
|
|
|
2019-08-27 18:25:47 +02:00
|
|
|
def blacklists_from_config_file(
    file, whitelist, time_restricted_url, ignore_retrieval_failure, output_file
):
    """Build the unified blacklist and write it to stdout or to a file.

    Args:
        file: path of the configuration file. Every line that is neither
            blank nor starting with ``#`` is used as a source URL.
        whitelist: URL or local path of the whitelist; may be empty.
        time_restricted_url: URL or local path of the time-restricted
            list; may be empty.
        ignore_retrieval_failure: when false, the first source that fails
            to load or parse terminates the program with exit status 1.
            The error message is written to standard error either way.
        output_file: path of the file to write. When falsy, everything is
            printed to standard output instead.

    The time-based section is emitted first, followed by one section per
    source. Within a source, names covered by a local glob, names already
    emitted or covered by a shorter listed name, and whitelisted names are
    counted but not printed.
    """
    from contextlib import redirect_stdout

    blacklists = {}
    whitelisted_names = set()
    all_globs = set()
    all_names = set()
    unique_names = set()

    # Load conf & blacklists
    with open(file) as fd:
        for line in fd:
            line = line.strip()
            if not line or line.startswith("#"):
                continue
            url = line
            try:
                content, trusted = load_from_url(url)
                names, _time_restrictions = parse_list(content, trusted)
                blacklists[url] = names
                all_names |= names
                # only check local files for globs:
                if trusted:
                    all_globs |= get_lines_with_globs(names)
            except Exception as e:
                sys.stderr.write(str(e))
                if not ignore_retrieval_failure:
                    sys.exit(1)

    # Redirect output to output_file if provided. This is set up before
    # anything is printed so the time-based section is captured too.
    output = StringIO() if output_file else sys.stdout

    # Time-based blacklist
    if time_restricted_url and not re.match(r"^[a-z0-9]+:", time_restricted_url):
        time_restricted_url = "file:" + time_restricted_url

    if time_restricted_url:
        time_restricted_content, _trusted = load_from_url(time_restricted_url)
        time_restricted_names, time_restrictions = parse_time_restricted_list(
            time_restricted_content
        )

        if time_restricted_names:
            print("########## Time-based blacklist ##########\n", file=output)
            # print_restricted_name() writes to sys.stdout, so point
            # stdout at `output` while it runs.
            with redirect_stdout(output):
                for name in time_restricted_names:
                    print_restricted_name(name, time_restrictions)

        # Time restricted names should be whitelisted, or they could be always blocked
        whitelisted_names |= time_restricted_names

    # Whitelist
    if whitelist and not re.match(r"^[a-z0-9]+:", whitelist):
        whitelist = "file:" + whitelist

    whitelisted_names |= whitelist_from_url(whitelist)

    # Process blacklists
    for url, names in blacklists.items():
        print("########## Blacklist from {} ##########\n".format(url), file=output)
        ignored, whitelisted, glob_ignored = 0, 0, 0
        list_names = []
        for name in names:
            if covered_by_glob(name, all_globs):
                glob_ignored += 1
            elif has_suffix(all_names, name) or name in unique_names:
                ignored += 1
            elif has_suffix(whitelisted_names, name) or name in whitelisted_names:
                whitelisted += 1
            else:
                list_names.append(name)
                unique_names.add(name)

        list_names.sort(key=name_cmp)
        if ignored:
            print("# Ignored duplicates: {}\n".format(ignored), file=output)
        if whitelisted:
            print("# Ignored entries due to the whitelist: {}\n".format(whitelisted), file=output)
        if glob_ignored:
            print("# Ignored entries due to globs in local-additions: {}\n".format(glob_ignored), file=output)
        for name in list_names:
            print(name, file=output)
        print("\n\n", file=output)

    # if provided, save content from output buffer to file all at once
    if output_file:
        with open(output_file, "w", encoding="utf8") as f:
            f.write(output.getvalue())
        output.close()
|
2018-01-17 15:28:07 +01:00
|
|
|
|
|
|
|
|
2019-02-14 23:27:19 +01:00
|
|
|
# Command-line interface. ``args`` is kept as a module-level name because
# load_from_url() reads ``args.timeout``.
argp = argparse.ArgumentParser(
    description="Create a unified blacklist from a set of local and remote files"
)
argp.add_argument(
    "-c",
    "--config",
    default="domains-blacklist.conf",
    help="file containing blacklist sources",
)
argp.add_argument(
    "-w",
    "--whitelist",
    default="domains-whitelist.txt",
    help="file containing a set of names to exclude from the blacklist",
)
argp.add_argument(
    "-r",
    "--time-restricted",
    default="domains-time-restricted.txt",
    help="file containing a set of names to be time restricted",
)
argp.add_argument(
    "-i",
    "--ignore-retrieval-failure",
    action="store_true",
    help="generate list even if some urls couldn't be retrieved",
)
argp.add_argument(
    "-o",
    "--output-file",
    default=None,
    help="save generated blacklist to a text file with the provided file name",
)
# type=int makes argparse reject a non-numeric timeout up front, instead
# of the value failing later when it is converted at the point of use.
argp.add_argument(
    "-t",
    "--timeout",
    default=30,
    type=int,
    help="URL open timeout",
)
args = argp.parse_args()

conf = args.config
whitelist = args.whitelist
time_restricted = args.time_restricted
ignore_retrieval_failure = args.ignore_retrieval_failure
output_file = args.output_file

blacklists_from_config_file(
    conf, whitelist, time_restricted, ignore_retrieval_failure, output_file)
|