
Improve generate-domains-blacklist.py to remove redundant lines (#1184)

* Improve script to remove redundant lines

Let the script remove lines that are already covered by existing pattern entries.
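A minimal sketch of the intended effect (hypothetical entries; the matching shown uses fnmatch, which the final version of this change settles on):

from fnmatch import fnmatch

# hypothetical entries: one glob collected from a trusted local file,
# plus two plain names from downloaded lists
globs = {"ads.*"}
names = ["ads.example.com", "tracker.example.net"]

# keep only the names that no glob already covers
kept = [n for n in names if not any(fnmatch(n, g) for g in globs)]
print(kept)  # ['tracker.example.net']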

* add optional "-o OUTPUT_FILE" argument

This ensures that UTF-8 is used when writing the output file.
The previous redirect-to-file workflow keeps working, because "default=None" is used for the -o argument.

I also adjusted the formatting slightly to avoid newlines at the beginning of the file.
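A standalone sketch of the buffering pattern this introduces (the file name here is hypothetical, not part of the commit): output accumulates in a StringIO buffer and is written to the file with an explicit UTF-8 encoding, while output_file=None keeps the old write-to-stdout behavior intact:

import sys
from io import StringIO

output_file = "list.txt"  # hypothetical; with None, output goes to stdout as before
output = StringIO() if output_file else sys.stdout

print("example.com", file=output)

if output_file:
    # flush the buffered content to disk in one go, with explicit UTF-8 encoding
    with open(output_file, "w", encoding="utf8") as f:
        f.write(output.getvalue())
    output.close()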

* improve glob matching

- rename regexes to globs
- only check trusted (local) files for globs
- use fnmatch instead of manually converting globs into regular expressions and matching them
- modify the is_glob function to check only for the following characters: * [ ] ?
- improve the get_lines_with_globs function by using the native filter and lambda functions
- improve the covered_by_glob function by checking whether the line is part of glob_list, instead of calling is_glob again
- print "ignored entries due to globs in local-additions" to the output as well, to better differentiate these from other duplicates (see the sketch after this list)
Authored by Huhni on 2020-04-21 23:07:32 +02:00, committed by GitHub
parent 9519472bbe
commit 58871de725


@@ -1,10 +1,14 @@
 #! /usr/bin/env python
-# run with python generate-domains-blacklist.py > list.txt.tmp && mv -f list.txt.tmp list
+# Use the following command to ensure the right encoding:
+# python generate-domains-blacklist.py -o list.txt.tmp && mv -f list.txt.tmp list

 import argparse
 import re
 import sys
+from io import StringIO
+from fnmatch import fnmatch

 try:
     import urllib2 as urllib
@@ -81,6 +85,16 @@ def parse_list(content, trusted=False):
     return names, time_restrictions


+# basic check if the line contains any glob specific characters
+def is_glob(line):
+    glob_chars = "*[]?"  # ignore = for now
+    return any(char in line for char in glob_chars)
+
+
+def get_lines_with_globs(names):
+    return set(filter(lambda line: is_glob(line), names))
+
+
 def print_restricted_name(name, time_restrictions):
     if name in time_restrictions:
         print("{}\t{}".format(name, time_restrictions[name]))
@@ -115,7 +129,7 @@ def load_from_url(url):
     if URLLIB_NEW:
         content = content.decode("utf-8", errors="replace")

-    return (content, trusted)
+    return content, trusted


 def name_cmp(name):
@@ -134,6 +148,17 @@ def has_suffix(names, name):
     return False


+# check if a line matches any of the collected globs
+def covered_by_glob(line, glob_list):
+    # ignore lines that are part of the glob_list
+    if line not in glob_list:
+        for glob in glob_list:
+            if fnmatch(line, glob):
+                return True
+    return False
+
+
 def whitelist_from_url(url):
     if not url:
         return set()
@@ -144,10 +169,11 @@ def whitelist_from_url(url):


 def blacklists_from_config_file(
-    file, whitelist, time_restricted_url, ignore_retrieval_failure
+    file, whitelist, time_restricted_url, ignore_retrieval_failure, output_file
 ):
     blacklists = {}
     whitelisted_names = set()
+    all_globs = set()
     all_names = set()
     unique_names = set()
@@ -163,6 +189,10 @@ def blacklists_from_config_file(
             names, _time_restrictions = parse_list(content, trusted)
             blacklists[url] = names
             all_names |= names
+            # only check local files for globs:
+            if trusted:
+                all_globs |= get_lines_with_globs(names)
         except Exception as e:
             sys.stderr.write(str(e))
             if not ignore_retrieval_failure:
@@ -192,13 +222,18 @@ def blacklists_from_config_file(
     whitelisted_names |= whitelist_from_url(whitelist)

+    # redirect output to output_file if provided
+    output = StringIO() if output_file else sys.stdout
+
     # Process blacklists
     for url, names in blacklists.items():
-        print("\n\n########## Blacklist from {} ##########\n".format(url))
-        ignored, whitelisted = 0, 0
+        print("########## Blacklist from {} ##########\n".format(url), file=output)
+        ignored, whitelisted, glob_ignored = 0, 0, 0
         list_names = list()
         for name in names:
-            if has_suffix(all_names, name) or name in unique_names:
+            if covered_by_glob(name, all_globs):
+                glob_ignored = glob_ignored + 1
+            elif has_suffix(all_names, name) or name in unique_names:
                 ignored = ignored + 1
             elif has_suffix(whitelisted_names, name) or name in whitelisted_names:
                 whitelisted = whitelisted + 1
@@ -208,11 +243,21 @@ def blacklists_from_config_file(
         list_names.sort(key=name_cmp)
         if ignored:
-            print("# Ignored duplicates: {}\n".format(ignored))
+            print("# Ignored duplicates: {}\n".format(ignored), file=output)
         if whitelisted:
-            print("# Ignored entries due to the whitelist: {}\n".format(whitelisted))
+            print("# Ignored entries due to the whitelist: {}\n".format(whitelisted), file=output)
+        if glob_ignored:
+            print("# Ignored entries due to globs in local-additions: {}\n".format(glob_ignored), file=output)
         for name in list_names:
-            print(name)
+            print(name, file=output)
+        print("\n\n", file=output)
+
+    # if provided, save content from output buffer to file all at once
+    if output_file:
+        f = open(output_file, "w", encoding='utf8')
+        f.write(output.getvalue())
+        f.close()
+        output.close()


 argp = argparse.ArgumentParser(
@@ -242,6 +287,12 @@ argp.add_argument(
     action="store_true",
     help="generate list even if some urls couldn't be retrieved",
 )
+argp.add_argument(
+    "-o",
+    "--output-file",
+    default=None,
+    help="save generated blacklist to a text file with the provided file name",
+)
 argp.add_argument("-t", "--timeout", default=30, help="URL open timeout")

 args = argp.parse_args()
@@ -249,6 +300,7 @@ conf = args.config
 whitelist = args.whitelist
 time_restricted = args.time_restricted
 ignore_retrieval_failure = args.ignore_retrieval_failure
+output_file = args.output_file

 blacklists_from_config_file(
-    conf, whitelist, time_restricted, ignore_retrieval_failure)
+    conf, whitelist, time_restricted, ignore_retrieval_failure, output_file)