mirror of
https://github.com/DNSCrypt/dnscrypt-proxy.git
synced 2025-01-14 02:25:52 +01:00
Improve generate-domains-blacklist.py to remove redundant lines (#1184)
* Improve script to remove redundant lines
  Let the script remove those lines that are already covered by glob patterns.
* Add optional "-o OUTPUT_FILE" argument
  This ensures that UTF-8 is used. The previous redirect-to-file behaviour is maintained, because "default=None" is used for the -o argument. The formatting was also adjusted slightly to avoid newlines at the beginning of the file.
* Improve glob matching
  - rename regexes to globs
  - only check trusted (local) files for globs
  - use fnmatch instead of manually converting globs into regular expressions and matching them
  - modify the is_glob function to check only for the following characters: * [ ] ?
  - improve the get_lines_with_globs function by using the native filter and lambda functions
  - improve the covered_by_glob function by checking whether the line is part of glob_list, instead of calling is_glob again
  - print "ignored entries due to globs in local-additions" to the output as well, to better differentiate them from other duplicates
This commit is contained in:
parent
9519472bbe
commit
58871de725
@ -1,10 +1,14 @@
|
||||
#! /usr/bin/env python
|
||||
|
||||
# run with python generate-domains-blacklist.py > list.txt.tmp && mv -f list.txt.tmp list
|
||||
# Use the following command to ensure the right encoding:
|
||||
# python generate-domains-blacklist.py -o list.txt.tmp && mv -f list.txt.tmp list
|
||||
|
||||
|
||||
import argparse
|
||||
import re
|
||||
import sys
|
||||
from io import StringIO
|
||||
from fnmatch import fnmatch
|
||||
|
||||
try:
|
||||
import urllib2 as urllib
|
||||
@ -81,6 +85,16 @@ def parse_list(content, trusted=False):
|
||||
return names, time_restrictions
|
||||
|
||||
|
||||
# basic check if the line contains any glob specific characters
|
||||
def is_glob(line):
    """Return True if *line* contains at least one glob metacharacter.

    The characters checked are ``*``, ``[``, ``]`` and ``?``.
    """
    # "=" is deliberately not treated as a glob character for now
    for marker in "*[]?":
        if marker in line:
            return True
    return False
|
||||
|
||||
|
||||
def get_lines_with_globs(names):
    """Return the set of entries in *names* that is_glob() reports as globs."""
    return {entry for entry in names if is_glob(entry)}
|
||||
|
||||
|
||||
def print_restricted_name(name, time_restrictions):
|
||||
if name in time_restrictions:
|
||||
print("{}\t{}".format(name, time_restrictions[name]))
|
||||
@ -115,7 +129,7 @@ def load_from_url(url):
|
||||
if URLLIB_NEW:
|
||||
content = content.decode("utf-8", errors="replace")
|
||||
|
||||
return (content, trusted)
|
||||
return content, trusted
|
||||
|
||||
|
||||
def name_cmp(name):
|
||||
@ -134,6 +148,17 @@ def has_suffix(names, name):
|
||||
return False
|
||||
|
||||
|
||||
# check if a line matches with any of the collected globs:
|
||||
def covered_by_glob(line, glob_list):
    """Return True if *line* is matched by some pattern in *glob_list*.

    A line that is itself a member of *glob_list* is never reported as
    covered, so the glob entries themselves are not dropped.
    """
    return line not in glob_list and any(
        fnmatch(line, pattern) for pattern in glob_list
    )
|
||||
|
||||
|
||||
def whitelist_from_url(url):
|
||||
if not url:
|
||||
return set()
|
||||
@ -144,10 +169,11 @@ def whitelist_from_url(url):
|
||||
|
||||
|
||||
def blacklists_from_config_file(
|
||||
file, whitelist, time_restricted_url, ignore_retrieval_failure
|
||||
file, whitelist, time_restricted_url, ignore_retrieval_failure, output_file
|
||||
):
|
||||
blacklists = {}
|
||||
whitelisted_names = set()
|
||||
all_globs = set()
|
||||
all_names = set()
|
||||
unique_names = set()
|
||||
|
||||
@ -163,6 +189,10 @@ def blacklists_from_config_file(
|
||||
names, _time_restrictions = parse_list(content, trusted)
|
||||
blacklists[url] = names
|
||||
all_names |= names
|
||||
# only check local files for globs:
|
||||
if trusted:
|
||||
all_globs |= get_lines_with_globs(names)
|
||||
|
||||
except Exception as e:
|
||||
sys.stderr.write(str(e))
|
||||
if not ignore_retrieval_failure:
|
||||
@ -192,13 +222,18 @@ def blacklists_from_config_file(
|
||||
|
||||
whitelisted_names |= whitelist_from_url(whitelist)
|
||||
|
||||
# redirect output to output_file if provided
|
||||
output = StringIO() if output_file else sys.stdout
|
||||
|
||||
# Process blacklists
|
||||
for url, names in blacklists.items():
|
||||
print("\n\n########## Blacklist from {} ##########\n".format(url))
|
||||
ignored, whitelisted = 0, 0
|
||||
print("########## Blacklist from {} ##########\n".format(url), file=output)
|
||||
ignored, whitelisted, glob_ignored = 0, 0, 0
|
||||
list_names = list()
|
||||
for name in names:
|
||||
if has_suffix(all_names, name) or name in unique_names:
|
||||
if covered_by_glob(name, all_globs):
|
||||
glob_ignored = glob_ignored + 1
|
||||
elif has_suffix(all_names, name) or name in unique_names:
|
||||
ignored = ignored + 1
|
||||
elif has_suffix(whitelisted_names, name) or name in whitelisted_names:
|
||||
whitelisted = whitelisted + 1
|
||||
@ -208,11 +243,21 @@ def blacklists_from_config_file(
|
||||
|
||||
list_names.sort(key=name_cmp)
|
||||
if ignored:
|
||||
print("# Ignored duplicates: {}\n".format(ignored))
|
||||
print("# Ignored duplicates: {}\n".format(ignored), file=output)
|
||||
if whitelisted:
|
||||
print("# Ignored entries due to the whitelist: {}\n".format(whitelisted))
|
||||
print("# Ignored entries due to the whitelist: {}\n".format(whitelisted), file=output)
|
||||
if glob_ignored:
|
||||
print("# Ignored entries due to globs in local-additions: {}\n".format(glob_ignored), file=output)
|
||||
for name in list_names:
|
||||
print(name)
|
||||
print(name, file=output)
|
||||
print("\n\n", file=output)
|
||||
|
||||
# if provided, save content from output buffer to file all at once
|
||||
if output_file:
|
||||
f = open(output_file, "w", encoding='utf8')
|
||||
f.write(output.getvalue())
|
||||
f.close()
|
||||
output.close()
|
||||
|
||||
|
||||
argp = argparse.ArgumentParser(
|
||||
@ -242,6 +287,12 @@ argp.add_argument(
|
||||
action="store_true",
|
||||
help="generate list even if some urls couldn't be retrieved",
|
||||
)
|
||||
argp.add_argument(
|
||||
"-o",
|
||||
"--output-file",
|
||||
default=None,
|
||||
help="save generated blacklist to a text file with the provided file name",
|
||||
)
|
||||
argp.add_argument("-t", "--timeout", default=30, help="URL open timeout")
|
||||
args = argp.parse_args()
|
||||
|
||||
@ -249,6 +300,7 @@ conf = args.config
|
||||
whitelist = args.whitelist
|
||||
time_restricted = args.time_restricted
|
||||
ignore_retrieval_failure = args.ignore_retrieval_failure
|
||||
output_file = args.output_file
|
||||
|
||||
blacklists_from_config_file(
|
||||
conf, whitelist, time_restricted, ignore_retrieval_failure)
|
||||
conf, whitelist, time_restricted, ignore_retrieval_failure, output_file)
|
||||
|
Loading…
Reference in New Issue
Block a user