Support time-based blacklist from domains-time-restricted.txt

* Modified list_from_url to load_from_url to avoid reading the `time_restricted` file twice (1 for output, 1 for whitelist)
This commit is contained in:
Alexandre L 2018-02-19 14:38:43 +01:00 committed by GitHub
parent ac0f0d2ff0
commit 9b701d8121
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
1 changed files with 26 additions and 7 deletions

View File

@ -36,7 +36,7 @@ def parse_blacklist(content, trusted=False):
return names return names
def list_from_url(url): def load_from_url(url):
sys.stderr.write("Loading data from [{}]\n".format(url)) sys.stderr.write("Loading data from [{}]\n".format(url))
req = urllib2.Request(url) req = urllib2.Request(url)
trusted = False trusted = False
@ -51,7 +51,7 @@ def list_from_url(url):
raise Exception("[{}] returned HTTP code {}\n".format(url, response.getcode())) raise Exception("[{}] returned HTTP code {}\n".format(url, response.getcode()))
content = response.read() content = response.read()
return parse_blacklist(content, trusted) return (content, trusted)
def name_cmp(name): def name_cmp(name):
@ -74,19 +74,34 @@ def whitelist_from_url(url):
if not url: if not url:
return set() return set()
return list_from_url(url) return parse_blacklist(*load_from_url(url))
def blacklists_from_config_file(file, whitelist, ignore_retrieval_failure): def blacklists_from_config_file(file, whitelist, time_restricted, ignore_retrieval_failure):
blacklists = {} blacklists = {}
whitelisted_names = set()
all_names = set() all_names = set()
unique_names = set() unique_names = set()
# Load time-based blacklist
if time_restricted and not re.match(r'^[a-z0-9]+:', time_restricted):
time_restricted = "file:" + time_restricted
time_restricted_fetched = load_from_url(time_restricted)
print("########## Time-based blacklist ##########\n")
print(time_restricted_fetched[0].replace('\r', '')) # Comments are not removed from output ; remove \r not removed by urllib2
# Time restricted names are supposed to be whitelisted, or that's useless
whitelisted_names |= parse_blacklist(*time_restricted_fetched)
# Whitelist
if whitelist and not re.match(r'^[a-z0-9]+:', whitelist): if whitelist and not re.match(r'^[a-z0-9]+:', whitelist):
whitelist = "file:" + whitelist whitelist = "file:" + whitelist
whitelisted_names = whitelist_from_url(whitelist) whitelisted_names |= whitelist_from_url(whitelist)
# Load conf & blacklists
with open(file) as fd: with open(file) as fd:
for line in fd: for line in fd:
line = str.strip(line) line = str.strip(line)
@ -94,7 +109,7 @@ def blacklists_from_config_file(file, whitelist, ignore_retrieval_failure):
continue continue
url = line url = line
try: try:
names = list_from_url(url) names = parse_blacklist(*load_from_url(url))
blacklists[url] = names blacklists[url] = names
all_names |= names all_names |= names
except Exception as e: except Exception as e:
@ -102,6 +117,7 @@ def blacklists_from_config_file(file, whitelist, ignore_retrieval_failure):
if not ignore_retrieval_failure: if not ignore_retrieval_failure:
exit(1) exit(1)
# Process blacklists
for url, names in blacklists.items(): for url, names in blacklists.items():
print("\n\n########## Blacklist from {} ##########\n".format(url)) print("\n\n########## Blacklist from {} ##########\n".format(url))
ignored, whitelisted = 0, 0 ignored, whitelisted = 0, 0
@ -129,6 +145,8 @@ argp.add_argument("-c", "--config", default="domains-blacklist.conf",
help="file containing blacklist sources") help="file containing blacklist sources")
argp.add_argument("-w", "--whitelist", default="domains-whitelist.txt", argp.add_argument("-w", "--whitelist", default="domains-whitelist.txt",
help="file containing a set of names to exclude from the blacklist") help="file containing a set of names to exclude from the blacklist")
argp.add_argument("-r", "--time-restricted", default="domains-time-restricted.txt",
help="file containing a set of names to be time restricted")
argp.add_argument("-i", "--ignore-retrieval-failure", action='store_true', argp.add_argument("-i", "--ignore-retrieval-failure", action='store_true',
help="generate list even if some urls couldn't be retrieved") help="generate list even if some urls couldn't be retrieved")
argp.add_argument("-t", "--timeout", default=30, argp.add_argument("-t", "--timeout", default=30,
@ -137,6 +155,7 @@ args = argp.parse_args()
conf = args.config conf = args.config
whitelist = args.whitelist whitelist = args.whitelist
time_restricted = args.time_restricted
ignore_retrieval_failure = args.ignore_retrieval_failure ignore_retrieval_failure = args.ignore_retrieval_failure
blacklists_from_config_file(conf, whitelist, ignore_retrieval_failure) blacklists_from_config_file(conf, whitelist, time_restricted, ignore_retrieval_failure)