From 9b701d81215e78f88070db96eafdcbf5197e5726 Mon Sep 17 00:00:00 2001 From: Alexandre L Date: Mon, 19 Feb 2018 14:38:43 +0100 Subject: [PATCH] Support time-based blacklist from domains-time-restricted.txt * Modified list_from_url to load_from_url to avoid reading the `time_restricted` file twice (1 for output, 1 for whitelist) --- .../generate-domains-blacklist.py | 33 +++++++++++++++---- 1 file changed, 26 insertions(+), 7 deletions(-) diff --git a/utils/generate-domains-blacklists/generate-domains-blacklist.py b/utils/generate-domains-blacklists/generate-domains-blacklist.py index 1c579a87..605ce666 100755 --- a/utils/generate-domains-blacklists/generate-domains-blacklist.py +++ b/utils/generate-domains-blacklists/generate-domains-blacklist.py @@ -36,7 +36,7 @@ def parse_blacklist(content, trusted=False): return names -def list_from_url(url): +def load_from_url(url): sys.stderr.write("Loading data from [{}]\n".format(url)) req = urllib2.Request(url) trusted = False @@ -51,7 +51,7 @@ def list_from_url(url): raise Exception("[{}] returned HTTP code {}\n".format(url, response.getcode())) content = response.read() - return parse_blacklist(content, trusted) + return (content, trusted) def name_cmp(name): @@ -74,19 +74,34 @@ def whitelist_from_url(url): if not url: return set() - return list_from_url(url) + return parse_blacklist(*load_from_url(url)) -def blacklists_from_config_file(file, whitelist, ignore_retrieval_failure): +def blacklists_from_config_file(file, whitelist, time_restricted, ignore_retrieval_failure): blacklists = {} + whitelisted_names = set() all_names = set() unique_names = set() + # Load time-based blacklist + if time_restricted and not re.match(r'^[a-z0-9]+:', time_restricted): + time_restricted = "file:" + time_restricted + + time_restricted_fetched = load_from_url(time_restricted) + + print("########## Time-based blacklist ##########\n") + print(time_restricted_fetched[0].replace('\r', '')) # Comments are not removed from output ; remove \r not removed by urllib2 + + # Time restricted names are supposed to be whitelisted, or that's useless + whitelisted_names |= parse_blacklist(*time_restricted_fetched) + + # Whitelist if whitelist and not re.match(r'^[a-z0-9]+:', whitelist): whitelist = "file:" + whitelist - whitelisted_names = whitelist_from_url(whitelist) + whitelisted_names |= whitelist_from_url(whitelist) + # Load conf & blacklists with open(file) as fd: for line in fd: line = str.strip(line) @@ -94,7 +109,7 @@ def blacklists_from_config_file(file, whitelist, ignore_retrieval_failure): continue url = line try: - names = list_from_url(url) + names = parse_blacklist(*load_from_url(url)) blacklists[url] = names all_names |= names except Exception as e: @@ -102,6 +117,7 @@ def blacklists_from_config_file(file, whitelist, ignore_retrieval_failure): if not ignore_retrieval_failure: exit(1) + # Process blacklists for url, names in blacklists.items(): print("\n\n########## Blacklist from {} ##########\n".format(url)) ignored, whitelisted = 0, 0 @@ -129,6 +145,8 @@ argp.add_argument("-c", "--config", default="domains-blacklist.conf", help="file containing blacklist sources") argp.add_argument("-w", "--whitelist", default="domains-whitelist.txt", help="file containing a set of names to exclude from the blacklist") +argp.add_argument("-r", "--time-restricted", default="domains-time-restricted.txt", + help="file containing a set of names to be time restricted") argp.add_argument("-i", "--ignore-retrieval-failure", action='store_true', help="generate list even if some urls couldn't be retrieved") argp.add_argument("-t", "--timeout", default=30, @@ -137,6 +155,7 @@ args = argp.parse_args() conf = args.config whitelist = args.whitelist +time_restricted = args.time_restricted ignore_retrieval_failure = args.ignore_retrieval_failure -blacklists_from_config_file(conf, whitelist, ignore_retrieval_failure) +blacklists_from_config_file(conf, whitelist, time_restricted, ignore_retrieval_failure)