#! /usr/bin/env python3

# Run with: python generate-domains-blacklist.py > list.txt.tmp && mv -f list.txt.tmp list

from __future__ import print_function
import argparse
import re
import sys
import fnmatch
# Python 2 ships the URL-opening API as `urllib2`; Python 3 provides it as
# `urllib.request`. Either one is bound to the name `urllib` so the rest of
# the script can use urllib.Request / urllib.urlopen / urllib.URLError.
# URLLIB_NEW records which flavour was imported.
try:
    import urllib2 as urllib

    URLLIB_NEW = False
except ImportError:
    # ModuleNotFoundError is a subclass of ImportError and is not defined
    # before Python 3.6, so naming it in the except clause would itself raise
    # NameError there. Catching ImportError covers both cases.
    import urllib.request as urllib
    from urllib.request import Request

    URLLIB_NEW = True


def parse_trusted_list(content):
    """Parse the text of a trusted list into names, time labels and globs.

    Every line is stripped and lower-cased. Lines that are empty or start
    with ``#`` are skipped, and a trailing ``# comment`` is removed. A line
    recognised by ``is_glob`` is stored verbatim as both a glob and a name.
    Any other line must have the form ``name`` or ``name @label``; lines that
    do not fit are dropped silently.

    Returns:
        A tuple ``(names, time_restrictions, globs)``: ``names`` and
        ``globs`` are sets of strings, ``time_restrictions`` maps a name to
        its ``@label`` string.
    """
    skip_line = re.compile(r"^(#|$)")
    trailing_comment = re.compile(r"\s*#\s*[a-z0-9-].*$")
    entry = re.compile(r"^([*a-z0-9.-]+)\s*(@\S+)?$")

    names = set()
    time_restrictions = {}
    globs = set()

    for raw_line in content.splitlines():
        text = raw_line.strip().lower()
        if skip_line.match(text):
            continue
        text = trailing_comment.sub("", text).strip()

        if is_glob(text):
            # Glob patterns are kept as-is; no "@label" is split off them.
            globs.add(text)
            names.add(text)
            continue

        found = entry.match(text)
        if found is None:
            continue
        name, label = found.group(1), found.group(2)
        names.add(name)
        if label:
            time_restrictions[name] = label

    return names, time_restrictions, globs


def parse_list(content, trusted=False):
    """Extract domain names from the text of a block list.

    With ``trusted`` set, the work is delegated to ``parse_trusted_list``.
    Otherwise every line is stripped and lower-cased, blank lines and lines
    starting with ``#`` are skipped, a trailing ``# comment`` is removed, and
    the remainder is tried against each supported line format. Lines that fit
    none of the formats are ignored.

    Returns:
        A tuple ``(names, time_restrictions, globs)``. For untrusted content
        only ``names`` is ever populated; the other two stay empty.
    """
    if trusted:
        return parse_trusted_list(content)

    skip_line = re.compile(r"^(#|$)")
    trailing_comment = re.compile(r"\s*#\s*[a-z0-9-].*$")
    line_formats = (
        # "||name^", optionally "@"-prefixed and "$popup"/"$third-party"-suffixed
        re.compile(
            r"^@*\|\|([a-z0-9][a-z0-9.-]*[.][a-z]{2,})\^?(\$(popup|third-party))?$"),
        # a bare name on its own line
        re.compile(r"^([a-z0-9][a-z0-9.-]*[.][a-z]{2,})$"),
        # dotted-quad address, whitespace, then the name
        re.compile(
            r"^[0-9]{1,3}[.][0-9]{1,3}[.][0-9]{1,3}[.][0-9]{1,3}\s+([a-z0-9][a-z0-9.-]*[.][a-z]{2,})$"
        ),
        # quoted comma-separated fields with the name in the second field
        re.compile(r'^"[^"]+","([a-z0-9][a-z0-9.-]*[.][a-z]{2,})",'),
        # name first, followed by further comma-separated fields
        re.compile(r"^([a-z0-9][a-z0-9.-]*[.][a-z]{2,}),.+,[0-9: /-]+,"),
        # "address=/name/..."
        re.compile(r"^address=/([a-z0-9][a-z0-9.-]*[.][a-z]{2,})/."),
    )

    names = set()
    for raw_line in content.splitlines():
        text = raw_line.strip().lower()
        if skip_line.match(text):
            continue
        text = trailing_comment.sub("", text).strip()
        hits = (line_format.match(text) for line_format in line_formats)
        names.update(hit.group(1) for hit in hits if hit)

    return names, {}, set()


def print_restricted_name(output_fd, name, time_restrictions):
    """Write one time-restricted entry to ``output_fd``.

    A name that has an entry in ``time_restrictions`` is written as
    ``name<TAB>label``. A name without one is written as a ``# ignored``
    comment line instead.
    """
    if name not in time_restrictions:
        text = (
            "# ignored: [{}] was in the time-restricted list, "
            "but without a time restriction label".format(name)
        )
    else:
        text = "{}\t{}".format(name, time_restrictions[name])
    print(text, file=output_fd, end='\n')


def load_from_url(url):
    """Fetch ``url`` and return ``(content, trusted)``.

    A progress line is written to stderr first. The request is sent with a
    ``dnscrypt-proxy`` User-Agent header and the timeout is read from the
    module-level ``args.timeout``. A source is considered trusted when its
    URL scheme is ``file``; for any other scheme the response code must be
    200.

    Returns:
        ``content`` as returned by ``response.read()`` (decoded as UTF-8 with
        replacement of invalid bytes when running on Python 3) and the
        boolean ``trusted`` flag.

    Raises:
        Exception: if the URL cannot be opened, or an untrusted source
            answers with a code other than 200.
    """
    sys.stderr.write("Loading data from [{}]\n".format(url))
    req = urllib.Request(url=url, headers={"User-Agent": "dnscrypt-proxy"})

    # The Request object exposes its scheme as an attribute on Python 3 and
    # through a getter on Python 2.
    scheme = req.type if URLLIB_NEW else req.get_type()
    trusted = scheme == "file"

    try:
        response = urllib.urlopen(req, timeout=int(args.timeout))
    except urllib.URLError as err:
        raise Exception("[{}] could not be loaded: {}\n".format(url, err))

    # Always release the connection / file handle, including when the status
    # check or the read fails.
    try:
        if not trusted and response.getcode() != 200:
            raise Exception("[{}] returned HTTP code {}\n".format(
                url, response.getcode()))
        content = response.read()
    finally:
        response.close()

    if URLLIB_NEW:
        content = content.decode("utf-8", errors="replace")

    return content, trusted


def name_cmp(name):
    """Return ``name`` with its dot-separated labels in reverse order.

    Used as a sort key so that names are ordered by their rightmost label
    first, e.g. ``www.example.com`` becomes ``com.example.www``.
    """
    return ".".join(reversed(name.split(".")))


def is_glob(pattern):
    """Tell whether ``pattern`` should be treated as a glob pattern.

    A pattern qualifies when it contains ``?`` or ``[``, or a ``*`` that is
    not the first character and is either followed by more characters or
    directly preceded by a dot. A ``*`` in first position alone does not
    make the pattern a glob. A qualifying pattern is also run through
    ``fnmatch`` once; if that raises, the pattern is rejected.

    Returns:
        True if the pattern is a usable glob, False otherwise.
    """
    last = len(pattern) - 1
    maybe_glob = False
    for i, c in enumerate(pattern):
        if c in "?[":
            maybe_glob = True
        elif c == "*" and i != 0 and (i < last or pattern[i - 1] == "."):
            maybe_glob = True
        if maybe_glob:
            break

    if not maybe_glob:
        return False

    try:
        fnmatch.fnmatch("example", pattern)
    except Exception:
        # A pattern fnmatch cannot process is not a glob. Exception, not a
        # bare except, so KeyboardInterrupt/SystemExit are never swallowed.
        return False
    return True


def covered_by_glob(globs, name):
    """Tell whether ``name`` is matched by one of the patterns in ``globs``.

    A name that is itself one of the patterns is never reported as covered.
    A pattern that ``fnmatch`` cannot process is skipped.

    Returns:
        True if some pattern other than the name itself matches it.
    """
    if name in globs:
        return False
    for glob in globs:
        try:
            if fnmatch.fnmatch(name, glob):
                return True
        except Exception:
            # Best effort: an unusable pattern is skipped. Exception, not a
            # bare except, so KeyboardInterrupt/SystemExit still propagate.
            continue
    return False


def has_suffix(names, name):
    """Tell whether a proper dot-suffix of ``name`` is contained in ``names``.

    Leading labels are removed one at a time and each remaining suffix is
    looked up in ``names``. The full name itself is never tested; the last
    candidate tested is the empty string.
    """
    labels = name.split(".")
    return any(
        ".".join(labels[start:]) in names
        for start in range(1, len(labels) + 1)
    )


def whitelist_from_url(url):
    """Return the set of names listed at ``url``.

    An empty or missing ``url`` yields an empty set without loading
    anything. Otherwise the content is fetched with ``load_from_url`` and
    parsed with ``parse_list``; only the names are kept.
    """
    if url:
        content, trusted = load_from_url(url)
        return parse_list(content, trusted)[0]
    return set()


def blacklists_from_config_file(
    file, whitelist, time_restricted_url, ignore_retrieval_failure, output_file
):
    """Build the unified blacklist and write it out.

    Args:
        file: path of the configuration file; each line that is not blank
            and does not start with ``#`` is taken as a source URL.
        whitelist: path or URL of the whitelist (may be empty). A value
            without a ``scheme:`` prefix is turned into a ``file:`` URL.
        time_restricted_url: path or URL of the time-restricted list (may be
            empty), given a ``file:`` prefix in the same way.
        ignore_retrieval_failure: when false, the first source that cannot be
            loaded or parsed ends the program with exit status 1; when true,
            the error is written to stderr and the source is skipped.
        output_file: path of the file to write; when empty, the list is
            written to ``sys.stdout``.

    The output starts with the time-restricted entries (if any), followed by
    one section per source. Within a section, a name is left out when it is
    covered by a glob pattern, when a suffix of it is listed in any source or
    it was already emitted by an earlier section, or when it (or a suffix of
    it) is whitelisted. The counts of omitted names are written as comments.
    """
    blacklists = {}
    whitelisted_names = set()
    all_names = set()
    unique_names = set()
    all_globs = set()

    # Load conf & blacklists
    with open(file) as fd:
        for line in fd:
            line = line.strip()
            if line.startswith("#") or line == "":
                continue
            url = line
            try:
                content, trusted = load_from_url(url)
                names, _time_restrictions, globs = parse_list(content, trusted)
                blacklists[url] = names
                all_names |= names
                all_globs |= globs
            except Exception as e:
                sys.stderr.write(str(e))
                if not ignore_retrieval_failure:
                    # sys.exit, not the interactive-only `exit` helper, which
                    # is missing when Python runs without the site module.
                    sys.exit(1)

    # Time-based blacklist
    if time_restricted_url and not re.match(r"^[a-z0-9]+:", time_restricted_url):
        time_restricted_url = "file:" + time_restricted_url

    # Only a file opened here is ours to close; sys.stdout stays open.
    owns_output = bool(output_file)
    output_fd = open(output_file, "w") if owns_output else sys.stdout

    try:
        if time_restricted_url:
            time_restricted_content, _trusted = load_from_url(time_restricted_url)
            time_restricted_names, time_restrictions, _globs = parse_trusted_list(
                time_restricted_content)

            if time_restricted_names:
                print("########## Time-based blacklist ##########\n",
                      file=output_fd, end='\n')
                for name in time_restricted_names:
                    print_restricted_name(output_fd, name, time_restrictions)

            # Time restricted names should be whitelisted, or they could be always blocked
            whitelisted_names |= time_restricted_names

        # Whitelist
        if whitelist and not re.match(r"^[a-z0-9]+:", whitelist):
            whitelist = "file:" + whitelist

        whitelisted_names |= whitelist_from_url(whitelist)

        # Process blacklists
        for url, names in blacklists.items():
            print("\n\n########## Blacklist from {} ##########\n".format(
                url), file=output_fd, end='\n')
            ignored, glob_ignored, whitelisted = 0, 0, 0
            list_names = []
            for name in names:
                if covered_by_glob(all_globs, name):
                    glob_ignored += 1
                elif has_suffix(all_names, name) or name in unique_names:
                    ignored += 1
                elif has_suffix(whitelisted_names, name) or name in whitelisted_names:
                    whitelisted += 1
                else:
                    list_names.append(name)
                    unique_names.add(name)

            list_names.sort(key=name_cmp)
            if ignored:
                print("# Ignored duplicates: {}".format(
                    ignored), file=output_fd, end='\n')
            if glob_ignored:
                print("# Ignored due to overlapping local patterns: {}".format(
                    glob_ignored), file=output_fd, end='\n')
            if whitelisted:
                print(
                    "# Ignored entries due to the whitelist: {}".format(whitelisted),
                    file=output_fd, end='\n')
            if ignored or glob_ignored or whitelisted:
                print(file=output_fd, end='\n')
            for name in list_names:
                print(name, file=output_fd, end='\n')
    finally:
        # Runs on errors too, so a partly written output file is not leaked.
        if owns_output:
            output_fd.close()
        else:
            output_fd.flush()


# Command-line interface. The parsed namespace is stored in the module-level
# name `args`, which load_from_url reads for the URL timeout.
argp = argparse.ArgumentParser(
    description="Create a unified blacklist from a set of local and remote files"
)
argp.add_argument(
    "-c",
    "--config",
    default="domains-blacklist.conf",
    help="file containing blacklist sources",
)
argp.add_argument(
    "-w",
    "--whitelist",
    default="domains-whitelist.txt",
    help="file containing a set of names to exclude from the blacklist",
)
argp.add_argument(
    "-r",
    "--time-restricted",
    default="domains-time-restricted.txt",
    help="file containing a set of names to be time restricted",
)
argp.add_argument(
    "-i",
    "--ignore-retrieval-failure",
    action="store_true",
    help="generate list even if some urls couldn't be retrieved",
)
argp.add_argument(
    "-o",
    "--output-file",
    default=None,
    help="save generated blacklist to a text file with the provided file name",
)
# type=int makes argparse reject a non-numeric timeout at startup. Without it
# the bad value only fails inside the per-URL error handling, where
# --ignore-retrieval-failure would skip every source silently.
argp.add_argument(
    "-t", "--timeout", default=30, type=int, help="URL open timeout")

args = argp.parse_args()

conf = args.config
whitelist = args.whitelist
time_restricted = args.time_restricted
ignore_retrieval_failure = args.ignore_retrieval_failure
output_file = args.output_file

blacklists_from_config_file(
    conf, whitelist, time_restricted, ignore_retrieval_failure, output_file)