diff --git a/utils/generate-domains-blacklists/generate-domains-blacklist.py b/utils/generate-domains-blacklists/generate-domains-blacklist.py
index 43eba48b..387cc250 100755
--- a/utils/generate-domains-blacklists/generate-domains-blacklist.py
+++ b/utils/generate-domains-blacklists/generate-domains-blacklist.py
@@ -1,10 +1,14 @@
 #! /usr/bin/env python
 
-# run with python generate-domains-blacklist.py > list.txt.tmp && mv -f list.txt.tmp list
+# Use the following command to ensure the right encoding:
+# python generate-domains-blacklist.py -o list.txt.tmp && mv -f list.txt.tmp list
+
 
 import argparse
 import re
 import sys
+from io import StringIO
+from fnmatch import fnmatch
 
 try:
     import urllib2 as urllib
@@ -81,6 +85,16 @@ def parse_list(content, trusted=False):
     return names, time_restrictions
 
 
+# basic check whether the line contains any glob-specific characters
+def is_glob(line):
+    glob_chars = "*[]?"  # ignore "=" for now
+    return any(char in line for char in glob_chars)
+
+
+def get_lines_with_globs(names):
+    return set(filter(lambda line: is_glob(line), names))
+
+
 def print_restricted_name(name, time_restrictions):
     if name in time_restrictions:
         print("{}\t{}".format(name, time_restrictions[name]))
@@ -115,7 +129,7 @@ def load_from_url(url):
     if URLLIB_NEW:
         content = content.decode("utf-8", errors="replace")
 
-    return (content, trusted)
+    return content, trusted
 
 
 def name_cmp(name):
@@ -134,6 +148,17 @@ def has_suffix(names, name):
     return False
 
 
+# check whether a line matches any of the collected globs:
+def covered_by_glob(line, glob_list):
+    # ignore lines that are themselves part of the glob_list
+    if line not in glob_list:
+        for glob in glob_list:
+            if fnmatch(line, glob):
+                return True
+
+    return False
+
+
 def whitelist_from_url(url):
     if not url:
         return set()
@@ -144,10 +169,11 @@
 
 
 def blacklists_from_config_file(
-    file, whitelist, time_restricted_url, ignore_retrieval_failure
+    file, whitelist, time_restricted_url, ignore_retrieval_failure, output_file
 ):
     blacklists = {}
     whitelisted_names = set()
+    all_globs = set()
     all_names = set()
     unique_names = set()
 
@@ -163,6 +189,10 @@
             names, _time_restrictions = parse_list(content, trusted)
             blacklists[url] = names
             all_names |= names
+            # only collect globs from local (trusted) files:
+            if trusted:
+                all_globs |= get_lines_with_globs(names)
+
         except Exception as e:
             sys.stderr.write(str(e))
             if not ignore_retrieval_failure:
@@ -192,13 +222,18 @@
     whitelisted_names |= whitelist_from_url(whitelist)
 
+    # redirect output to output_file if provided
+    output = StringIO() if output_file else sys.stdout
+
     # Process blacklists
     for url, names in blacklists.items():
-        print("\n\n########## Blacklist from {} ##########\n".format(url))
-        ignored, whitelisted = 0, 0
+        print("########## Blacklist from {} ##########\n".format(url), file=output)
+        ignored, whitelisted, glob_ignored = 0, 0, 0
         list_names = list()
         for name in names:
-            if has_suffix(all_names, name) or name in unique_names:
+            if covered_by_glob(name, all_globs):
+                glob_ignored = glob_ignored + 1
+            elif has_suffix(all_names, name) or name in unique_names:
                 ignored = ignored + 1
             elif has_suffix(whitelisted_names, name) or name in whitelisted_names:
                 whitelisted = whitelisted + 1
             else:
@@ -208,11 +243,21 @@
         list_names.sort(key=name_cmp)
 
         if ignored:
-            print("# Ignored duplicates: {}\n".format(ignored))
+            print("# Ignored duplicates: {}\n".format(ignored), file=output)
         if whitelisted:
-            print("# Ignored entries due to the whitelist: {}\n".format(whitelisted))
+            print("# Ignored entries due to the whitelist: {}\n".format(whitelisted), file=output)
+        if glob_ignored:
+            print("# Ignored entries due to globs in local-additions: {}\n".format(glob_ignored), file=output)
         for name in list_names:
-            print(name)
+            print(name, file=output)
+        print("\n\n", file=output)
+
+    # if an output file was provided, save the buffered content to it all at once
+    if output_file:
+        f = open(output_file, "w", encoding="utf-8")
+        f.write(output.getvalue())
+        f.close()
+        output.close()
 
 
 argp = argparse.ArgumentParser(
@@ -242,6 +287,12 @@
     action="store_true",
     help="generate list even if some urls couldn't be retrieved",
 )
+argp.add_argument(
+    "-o",
+    "--output-file",
+    default=None,
+    help="save generated blacklist to a text file with the provided file name",
+)
 argp.add_argument("-t", "--timeout", default=30, help="URL open timeout")
 
 args = argp.parse_args()
@@ -249,6 +300,7 @@
 conf = args.config
 whitelist = args.whitelist
 time_restricted = args.time_restricted
 ignore_retrieval_failure = args.ignore_retrieval_failure
+output_file = args.output_file
 blacklists_from_config_file(
-    conf, whitelist, time_restricted, ignore_retrieval_failure)
+    conf, whitelist, time_restricted, ignore_retrieval_failure, output_file)
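
Illustration (not part of the patch): a minimal, self-contained sketch of the glob matching this change relies on. It mirrors the added covered_by_glob() helper using the standard-library fnmatch; the domains and patterns below are made-up examples.

    from fnmatch import fnmatch

    # globs collected from a trusted local list such as local-additions
    globs = {"ads.*", "tracker?.example.com"}

    def covered(line, glob_list):
        # a glob line never suppresses itself
        if line in glob_list:
            return False
        return any(fnmatch(line, glob) for glob in glob_list)

    print(covered("ads.example.com", globs))       # True  -> counted as glob_ignored
    print(covered("tracker1.example.com", globs))  # True  -> counted as glob_ignored
    print(covered("news.example.com", globs))      # False -> kept in the output
    print(covered("ads.*", globs))                 # False -> glob entries themselves are kept

Note that fnmatch treats "*", "?" and "[...]" as wildcards, which is exactly the character set is_glob() scans for ("=" is deliberately ignored for now).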
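
Similarly, a hedged sketch of the buffering pattern behind the new -o/--output-file option: output is collected in a StringIO and written to disk in one go, which also pins the file encoding to UTF-8. The file name here is a made-up example, and a context manager replaces the patch's explicit open()/close(); the behavior is equivalent.

    import sys
    from io import StringIO

    output_file = "list.txt"  # e.g. args.output_file; None would mean plain stdout
    output = StringIO() if output_file else sys.stdout

    print("# Generated blacklist", file=output)
    print("example.com", file=output)

    if output_file:
        with open(output_file, "w", encoding="utf-8") as f:
            f.write(output.getvalue())
        output.close()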