Scraper for discovering feeds in HTML sites

This commit is contained in:
Martin Rotter 2021-03-16 08:59:20 +01:00
parent 3ac28d135c
commit 24df33dd89
3 changed files with 30 additions and 2 deletions

View File

@ -30,7 +30,7 @@
<url type="donation">https://martinrotter.github.io/donate/</url>
<content_rating type="oars-1.1" />
<releases>
<release version="3.9.0" date="2021-03-15"/>
<release version="3.9.0" date="2021-03-16"/>
</releases>
<content_rating type="oars-1.0">
<content_attribute id="violence-cartoon">none</content_attribute>

@ -1 +1 @@
Subproject commit 9c10723bfbaf6cb85107d6ee16e0324e9e487749
Subproject commit 47f4125753452eff8800dbd6600c5a05540b15d9

View File

@ -0,0 +1,28 @@
# Produces the list of links to XML feeds as extracted from an input list of generic URLs.
# This script expects the file path of the URL list as its only command-line parameter.
import re
import sys
import urllib.request
from urllib.parse import urljoin

# <link> tags that advertise a feed (Atom, RSS, or JSON feed) via their MIME type.
# Compiled once at module level so the inner loops reuse the compiled patterns.
_LINK_RE = re.compile(r'<link[^>]+type="application/(?:atom\+xml|rss\+xml|feed\+json|json)"[^>]*>')
_HREF_RE = re.compile(r'href="([^"]+)"')


def extract_feed_links(base_url, html):
    """Return the absolute feed URLs advertised in *html*.

    Every href found inside a feed <link> tag is resolved against
    *base_url* with urljoin, which correctly handles absolute,
    root-relative ("/feed.xml"), and relative ("feed.xml") hrefs.
    (The previous version only normalized hrefs starting with "/";
    plain relative hrefs were printed verbatim and thus unusable.)
    """
    links = []
    for link_tag in _LINK_RE.findall(html):
        for href in _HREF_RE.findall(link_tag):
            links.append(urljoin(base_url, href))
    return links


def main():
    """Read one URL per line from the file given in argv[1] and print feed links."""
    urls_file = sys.argv[1]
    with open(urls_file) as f:
        urls = [line.rstrip() for line in f]
    for url in urls:
        # Download the HTML page; assumes UTF-8 encoding — TODO confirm
        # (the original made the same assumption).
        html = urllib.request.urlopen(url).read().decode("utf-8")
        for feed_url in extract_feed_links(url, html):
            print(feed_url)


if __name__ == "__main__":
    main()