Format

2024-12-28 00:20:13 +01:00 · 2019-08-27 18:25:47 +02:00 · 2019-08-27 18:25:47 +02:00 · 5f29677400
commit 5f29677400
parent b543ccabdb
1 changed file with 58 additions and 35 deletions
--- a/utils/generate-domains-blacklists/generate-domains-blacklist.py
+++ b/utils/generate-domains-blacklists/generate-domains-blacklist.py
@@ -8,16 +8,19 @@ import sys
 try:
    import urllib2 as urllib
    URLLIB_NEW = False
 except (ImportError, ModuleNotFoundError):
    import urllib.request as urllib
    from urllib.request import Request
    URLLIB_NEW = True
 def parse_time_restricted_list(content):
-    rx_comment = re.compile(r'^(#|$)')
+    rx_comment = re.compile(r"^(#|$)")
-    rx_inline_comment = re.compile(r'\s*#\s*[a-z0-9-].*$')
+    rx_inline_comment = re.compile(r"\s*#\s*[a-z0-9-].*$")
-    rx_trusted = re.compile(r'^([*a-z0-9.-]+)\s*(@\S+)?$')
+    rx_trusted = re.compile(r"^([*a-z0-9.-]+)\s*(@\S+)?$")
    names = set()
    time_restrictions = {}
@@ -26,7 +29,7 @@ def parse_time_restricted_list(content):
        line = str.lower(str.strip(line))
        if rx_comment.match(line):
            continue
-        line = rx_inline_comment.sub('', line)
+        line = rx_inline_comment.sub("", line)
        for rx in rx_set:
            matches = rx.match(line)
            if not matches:
@@ -46,16 +49,16 @@ def parse_trusted_list(content):
 def parse_list(content, trusted=False):
-    rx_comment = re.compile(r'^(#|$)')
+    rx_comment = re.compile(r"^(#|$)")
-    rx_inline_comment = re.compile(r'\s*#\s*[a-z0-9-].*$')
+    rx_inline_comment = re.compile(r"\s*#\s*[a-z0-9-].*$")
-    rx_u = re.compile(
+    rx_u = re.compile(r"^@*\|\|([a-z0-9.-]+[.][a-z]{2,})\^?(\$(popup|third-party))?$")
-        r'^@*\|\|([a-z0-9.-]+[.][a-z]{2,})\^?(\$(popup|third-party))?$')
+    rx_l = re.compile(r"^([a-z0-9.-]+[.][a-z]{2,})$")
    rx_l = re.compile(r'^([a-z0-9.-]+[.][a-z]{2,})$')
    rx_h = re.compile(
-        r'^[0-9]{1,3}[.][0-9]{1,3}[.][0-9]{1,3}[.][0-9]{1,3}\s+([a-z0-9.-]+[.][a-z]{2,})$')
+        r"^[0-9]{1,3}[.][0-9]{1,3}[.][0-9]{1,3}[.][0-9]{1,3}\s+([a-z0-9.-]+[.][a-z]{2,})$"
    )
    rx_mdl = re.compile(r'^"[^"]+","([a-z0-9.-]+[.][a-z]{2,})",')
-    rx_b = re.compile(r'^([a-z0-9.-]+[.][a-z]{2,}),.+,[0-9: /-]+,')
+    rx_b = re.compile(r"^([a-z0-9.-]+[.][a-z]{2,}),.+,[0-9: /-]+,")
-    rx_dq = re.compile(r'^address=/([a-z0-9.-]+[.][a-z]{2,})/.')
+    rx_dq = re.compile(r"^address=/([a-z0-9.-]+[.][a-z]{2,})/.")
    if trusted:
        return parse_trusted_list(content)
@@ -67,7 +70,7 @@ def parse_list(content, trusted=False):
        line = str.lower(str.strip(line))
        if rx_comment.match(line):
            continue
-        line = rx_inline_comment.sub('', line)
+        line = rx_inline_comment.sub("", line)
        for rx in rx_set:
            matches = rx.match(line)
            if not matches:
@@ -81,8 +84,10 @@ def print_restricted_name(name, time_restrictions):
    if name in time_restrictions:
        print("{}\t{}".format(name, time_restrictions[name]))
    else:
-        print("# ignored: [{}] was in the time-restricted list, "
+        print(
-              "but without a time restriction label".format(name))
+            "# ignored: [{}] was in the time-restricted list, "
            "but without a time restriction label".format(name)
        )
 def load_from_url(url):
@@ -103,11 +108,10 @@ def load_from_url(url):
    except urllib.URLError as err:
        raise Exception("[{}] could not be loaded: {}\n".format(url, err))
    if trusted is False and response.getcode() != 200:
-        raise Exception("[{}] returned HTTP code {}\n".format(
+        raise Exception("[{}] returned HTTP code {}\n".format(url, response.getcode()))
            url, response.getcode()))
    content = response.read()
    if URLLIB_NEW:
-        content = content.decode('utf-8', errors='replace')
+        content = content.decode("utf-8", errors="replace")
    return (content, trusted)
@@ -137,7 +141,9 @@ def whitelist_from_url(url):
    return names
-def blacklists_from_config_file(file, whitelist, time_restricted_url, ignore_retrieval_failure):
+def blacklists_from_config_file(
    file, whitelist, time_restricted_url, ignore_retrieval_failure
 ):
    blacklists = {}
    whitelisted_names = set()
    all_names = set()
@@ -161,13 +167,14 @@ def blacklists_from_config_file(file, whitelist, time_restricted_url, ignore_ret
                    exit(1)
    # Time-based blacklist
-    if time_restricted_url and not re.match(r'^[a-z0-9]+:', time_restricted_url):
+    if time_restricted_url and not re.match(r"^[a-z0-9]+:", time_restricted_url):
        time_restricted_url = "file:" + time_restricted_url
    if time_restricted_url:
        time_restricted_content, _trusted = load_from_url(time_restricted_url)
        time_restricted_names, time_restrictions = parse_time_restricted_list(
-            time_restricted_content)
+            time_restricted_content
        )
        if time_restricted_names:
            print("########## Time-based blacklist ##########\n")
@@ -178,7 +185,7 @@ def blacklists_from_config_file(file, whitelist, time_restricted_url, ignore_ret
        whitelisted_names |= time_restricted_names
    # Whitelist
-    if whitelist and not re.match(r'^[a-z0-9]+:', whitelist):
+    if whitelist and not re.match(r"^[a-z0-9]+:", whitelist):
        whitelist = "file:" + whitelist
    whitelisted_names |= whitelist_from_url(whitelist)
@@ -207,17 +214,33 @@ def blacklists_from_config_file(file, whitelist, time_restricted_url, ignore_ret
 argp = argparse.ArgumentParser(
-    description="Create a unified blacklist from a set of local and remote files")
+    description="Create a unified blacklist from a set of local and remote files"
-argp.add_argument("-c", "--config", default="domains-blacklist.conf",
+)
-                  help="file containing blacklist sources")
+argp.add_argument(
-argp.add_argument("-w", "--whitelist", default="domains-whitelist.txt",
+    "-c",
-                  help="file containing a set of names to exclude from the blacklist")
+    "--config",
-argp.add_argument("-r", "--time-restricted", default="domains-time-restricted.txt",
+    default="domains-blacklist.conf",
-                  help="file containing a set of names to be time restricted")
+    help="file containing blacklist sources",
-argp.add_argument("-i", "--ignore-retrieval-failure", action='store_true',
+)
-                  help="generate list even if some urls couldn't be retrieved")
+argp.add_argument(
-argp.add_argument("-t", "--timeout", default=30,
+    "-w",
-                  help="URL open timeout")
+    "--whitelist",
    default="domains-whitelist.txt",
    help="file containing a set of names to exclude from the blacklist",
 )
 argp.add_argument(
    "-r",
    "--time-restricted",
    default="domains-time-restricted.txt",
    help="file containing a set of names to be time restricted",
 )
 argp.add_argument(
    "-i",
    "--ignore-retrieval-failure",
    action="store_true",
    help="generate list even if some urls couldn't be retrieved",
 )
 argp.add_argument("-t", "--timeout", default=30, help="URL open timeout")
 args = argp.parse_args()
 conf = args.config
@@ -225,5 +248,5 @@ whitelist = args.whitelist
 time_restricted = args.time_restricted
 ignore_retrieval_failure = args.ignore_retrieval_failure
-blacklists_from_config_file(
+blacklists_from_config_file(conf, whitelist, time_restricted, ignore_retrieval_failure)
-    conf, whitelist, time_restricted, ignore_retrieval_failure)
+