Revert "Improve generate-domains-blacklist.py to remove redundant lines (#1184)"

This reverts commit 58871de725.
This commit is contained in:
Frank Denis 2020-04-21 23:08:40 +02:00
parent 58871de725
commit dcd6f8448d
1 changed file with 10 additions and 62 deletions

View File

@ -1,14 +1,10 @@
#! /usr/bin/env python #! /usr/bin/env python
# Use the following command ensure the right encoding: # run with python generate-domains-blacklist.py > list.txt.tmp && mv -f list.txt.tmp list
# python generate-domains-blacklist.py -o list.txt.tmp && mv -f list.txt.tmp list
import argparse import argparse
import re import re
import sys import sys
from io import StringIO
from fnmatch import fnmatch
try: try:
import urllib2 as urllib import urllib2 as urllib
@ -85,16 +81,6 @@ def parse_list(content, trusted=False):
return names, time_restrictions return names, time_restrictions
def is_glob(line):
    """Return True if `line` contains any glob-specific character.

    This is only a basic character check: a line counts as a glob as soon
    as it contains one of ``*``, ``[``, ``]`` or ``?``.
    """
    glob_chars = "*[]?"  # ignore = for now
    return any(char in line for char in glob_chars)
def get_lines_with_globs(names):
    """Return the set of entries in `names` for which `is_glob` is true."""
    return {line for line in names if is_glob(line)}
def print_restricted_name(name, time_restrictions): def print_restricted_name(name, time_restrictions):
if name in time_restrictions: if name in time_restrictions:
print("{}\t{}".format(name, time_restrictions[name])) print("{}\t{}".format(name, time_restrictions[name]))
@ -129,7 +115,7 @@ def load_from_url(url):
if URLLIB_NEW: if URLLIB_NEW:
content = content.decode("utf-8", errors="replace") content = content.decode("utf-8", errors="replace")
return content, trusted return (content, trusted)
def name_cmp(name): def name_cmp(name):
@ -148,17 +134,6 @@ def has_suffix(names, name):
return False return False
def covered_by_glob(line, glob_list):
    """Return True if `line` matches any of the collected globs.

    A line that is itself a member of `glob_list` is never reported as
    covered; otherwise it would match its own pattern.
    """
    # ignore lines that are part of the glob_list
    if line in glob_list:
        return False
    return any(fnmatch(line, glob) for glob in glob_list)
def whitelist_from_url(url): def whitelist_from_url(url):
if not url: if not url:
return set() return set()
@ -169,11 +144,10 @@ def whitelist_from_url(url):
def blacklists_from_config_file( def blacklists_from_config_file(
file, whitelist, time_restricted_url, ignore_retrieval_failure, output_file file, whitelist, time_restricted_url, ignore_retrieval_failure
): ):
blacklists = {} blacklists = {}
whitelisted_names = set() whitelisted_names = set()
all_globs = set()
all_names = set() all_names = set()
unique_names = set() unique_names = set()
@ -189,10 +163,6 @@ def blacklists_from_config_file(
names, _time_restrictions = parse_list(content, trusted) names, _time_restrictions = parse_list(content, trusted)
blacklists[url] = names blacklists[url] = names
all_names |= names all_names |= names
# only check local files for globs:
if trusted:
all_globs |= get_lines_with_globs(names)
except Exception as e: except Exception as e:
sys.stderr.write(str(e)) sys.stderr.write(str(e))
if not ignore_retrieval_failure: if not ignore_retrieval_failure:
@ -222,18 +192,13 @@ def blacklists_from_config_file(
whitelisted_names |= whitelist_from_url(whitelist) whitelisted_names |= whitelist_from_url(whitelist)
# redirect output to output_file if provided
output = StringIO() if output_file else sys.stdout
# Process blacklists # Process blacklists
for url, names in blacklists.items(): for url, names in blacklists.items():
print("########## Blacklist from {} ##########\n".format(url), file=output) print("\n\n########## Blacklist from {} ##########\n".format(url))
ignored, whitelisted, glob_ignored = 0, 0, 0 ignored, whitelisted = 0, 0
list_names = list() list_names = list()
for name in names: for name in names:
if covered_by_glob(name, all_globs): if has_suffix(all_names, name) or name in unique_names:
glob_ignored = glob_ignored + 1
elif has_suffix(all_names, name) or name in unique_names:
ignored = ignored + 1 ignored = ignored + 1
elif has_suffix(whitelisted_names, name) or name in whitelisted_names: elif has_suffix(whitelisted_names, name) or name in whitelisted_names:
whitelisted = whitelisted + 1 whitelisted = whitelisted + 1
@ -243,21 +208,11 @@ def blacklists_from_config_file(
list_names.sort(key=name_cmp) list_names.sort(key=name_cmp)
if ignored: if ignored:
print("# Ignored duplicates: {}\n".format(ignored), file=output) print("# Ignored duplicates: {}\n".format(ignored))
if whitelisted: if whitelisted:
print("# Ignored entries due to the whitelist: {}\n".format(whitelisted), file=output) print("# Ignored entries due to the whitelist: {}\n".format(whitelisted))
if glob_ignored:
print("# Ignored entries due to globs in local-additions: {}\n".format(glob_ignored), file=output)
for name in list_names: for name in list_names:
print(name, file=output) print(name)
print("\n\n", file=output)
# if provided, save content from output buffer to file all at once
if output_file:
f = open(output_file, "w", encoding='utf8')
f.write(output.getvalue())
f.close()
output.close()
argp = argparse.ArgumentParser( argp = argparse.ArgumentParser(
@ -287,12 +242,6 @@ argp.add_argument(
action="store_true", action="store_true",
help="generate list even if some urls couldn't be retrieved", help="generate list even if some urls couldn't be retrieved",
) )
argp.add_argument(
"-o",
"--output-file",
default=None,
help="save generated blacklist to a text file with the provided file name",
)
argp.add_argument("-t", "--timeout", default=30, help="URL open timeout") argp.add_argument("-t", "--timeout", default=30, help="URL open timeout")
args = argp.parse_args() args = argp.parse_args()
@ -300,7 +249,6 @@ conf = args.config
whitelist = args.whitelist whitelist = args.whitelist
time_restricted = args.time_restricted time_restricted = args.time_restricted
ignore_retrieval_failure = args.ignore_retrieval_failure ignore_retrieval_failure = args.ignore_retrieval_failure
output_file = args.output_file
blacklists_from_config_file( blacklists_from_config_file(
conf, whitelist, time_restricted, ignore_retrieval_failure, output_file) conf, whitelist, time_restricted, ignore_retrieval_failure)