mirror of
https://github.com/martinrotter/rssguard.git
synced 2025-01-19 12:22:47 +01:00
scraper for searching of feeds in html sites
This commit is contained in:
parent
3ac28d135c
commit
24df33dd89
@ -30,7 +30,7 @@
|
||||
<url type="donation">https://martinrotter.github.io/donate/</url>
|
||||
<content_rating type="oars-1.1" />
|
||||
<releases>
|
||||
<release version="3.9.0" date="2021-03-15"/>
|
||||
<release version="3.9.0" date="2021-03-16"/>
|
||||
</releases>
|
||||
<content_rating type="oars-1.0">
|
||||
<content_attribute id="violence-cartoon">none</content_attribute>
|
||||
|
@ -1 +1 @@
|
||||
Subproject commit 9c10723bfbaf6cb85107d6ee16e0324e9e487749
|
||||
Subproject commit 47f4125753452eff8800dbd6600c5a05540b15d9
|
28
resources/scripts/scrapers/search-xml-feeds.py
Executable file
28
resources/scripts/scrapers/search-xml-feeds.py
Executable file
@ -0,0 +1,28 @@
|
||||
# Prints links to the feeds (RSS, Atom or JSON) advertised by the HTML pages behind the given list of generic URLs.
|
||||
# This script expects the path to a text file with one URL per line as its only command-line argument.
|
||||
|
||||
import re
|
||||
import sys
|
||||
import urllib.request
|
||||
from urllib.parse import urljoin
|
||||
|
||||
# Matches a whole <link ...> tag whose "type" attribute advertises a feed.
# Raw strings keep "\+" intact for the regex engine without relying on
# Python passing unknown string escapes through (a SyntaxWarning on 3.12+).
regexp_link = re.compile(
    r'<link[^>]+type="application/(?:atom\+xml|rss\+xml|feed\+json|json)"[^>]*>'
)

# Captures the value of a double-quoted href attribute inside a matched tag.
regexp_href = re.compile(r'href="([^"]+)"')


def extract_feed_links(page_url, html):
    """Return absolute URLs of all feeds advertised in *html*.

    Args:
        page_url: URL the HTML was downloaded from; used as the base for
            resolving relative links.
        html: Text of the HTML document.

    Returns:
        List of feed URLs in document order. Root-relative ("/feed"),
        document-relative ("feed.xml") and protocol-relative ("//host/feed")
        hrefs are resolved against *page_url*; absolute hrefs are returned
        unchanged.
    """
    return [
        # urljoin covers every relative form, not only links starting with "/".
        urljoin(page_url, href)
        for link_tag in regexp_link.findall(html)
        for href in regexp_href.findall(link_tag)
    ]


def main(urls_file):
    """Print feed links found on every page listed in *urls_file*.

    Args:
        urls_file: Path to a text file with one URL per line.

    Raises:
        OSError: If the file cannot be read or a page cannot be downloaded
            (urllib.error.URLError is a subclass of OSError).
    """
    with open(urls_file) as f:
        urls_lines = [line.strip() for line in f]

    for url in urls_lines:
        # Blank lines (e.g. a trailing newline) are not URLs; skip them
        # instead of letting urlopen("") raise ValueError.
        if not url:
            continue

        # Download HTML data; the context manager closes the connection.
        with urllib.request.urlopen(url) as url_response:
            # The markup searched for is plain ASCII, so undecodable bytes
            # elsewhere in the page are replaced rather than aborting the run.
            html = url_response.read().decode("utf-8", errors="replace")

        # Search for feeds with regexps.
        for link_xml_feed in extract_feed_links(url, html):
            print(link_xml_feed)


if __name__ == "__main__":
    main(sys.argv[1])
|
Loading…
Reference in New Issue
Block a user