2021-03-16 08:59:20 +01:00
|
|
|
# Prints the feed links (Atom, RSS or JSON) advertised by the web pages at the given list of URLs.
|
|
|
|
# Usage: pass the path to a text file containing one URL per line as the only command-line argument.
|
|
|
|
|
|
|
|
import re
|
|
|
|
import sys
|
|
|
|
import urllib.request
|
|
|
|
from urllib.parse import urljoin
|
|
|
|
|
|
|
|
# Matches <link> tags whose type attribute announces an Atom, RSS or JSON feed.
regexp_link = re.compile(
    r'<link[^>]+type="application\/(?:atom\+xml|rss\+xml|feed\+json|json)"[^>]*>'
)

# Captures the value of a double-quoted href attribute inside a matched tag.
regexp_href = re.compile(r'href="([^"]+)"')


def extract_feed_links(html, base_url):
    """Return the feed URLs advertised by ``<link>`` tags in *html*.

    Every ``href`` found inside a matching ``<link>`` tag is resolved against
    *base_url* with ``urljoin``, so root-relative (``/feed``), page-relative
    (``feed.xml``) and protocol-relative (``//host/feed``) references all
    come back as absolute URLs; already absolute references are returned
    unchanged.

    Args:
        html: Text of the downloaded page.
        base_url: URL the page was downloaded from.

    Returns:
        List of absolute feed URLs in the order they appear in *html*.
    """
    feed_links = []
    for link_tag in regexp_link.findall(html):
        for href in regexp_href.findall(link_tag):
            feed_links.append(urljoin(base_url, href))
    return feed_links


def main():
    """Read URLs from the file named by the first argument and print feeds.

    Each non-empty line of the file is treated as a URL; the page is
    downloaded, decoded as UTF-8 and scanned for feed ``<link>`` tags. Pages
    that cannot be downloaded or decoded are skipped (best effort), with a
    short notice on stderr so that stdout carries feed links only.
    """
    if len(sys.argv) < 2:
        sys.exit("Expected the path to a file with one URL per line as the only argument.")

    urls_file = sys.argv[1]

    with open(urls_file) as f:
        urls_lines = [line.rstrip() for line in f]

    for url in urls_lines:
        # Blank lines cannot be fetched; skip them quietly.
        if not url:
            continue

        # Download HTML data; the context manager closes the connection.
        try:
            with urllib.request.urlopen(url) as url_response:
                html = url_response.read().decode("utf-8")
        except Exception as error:
            # Best effort: one bad URL must not abort the whole run, but
            # Ctrl-C (KeyboardInterrupt) is deliberately not caught here.
            print(f"Skipping {url}: {error}", file=sys.stderr)
            continue

        # Search for feeds with regexps.
        for feed_link in extract_feed_links(html, url):
            print(feed_link)


if __name__ == "__main__":
    main()
|