mirror of
https://github.com/martinrotter/rssguard.git
synced 2025-01-31 17:44:52 +01:00
Scraper that searches HTML sites for feed links
This commit is contained in:
parent
3ac28d135c
commit
24df33dd89
@ -30,7 +30,7 @@
|
|||||||
<url type="donation">https://martinrotter.github.io/donate/</url>
|
<url type="donation">https://martinrotter.github.io/donate/</url>
|
||||||
<content_rating type="oars-1.1" />
|
<content_rating type="oars-1.1" />
|
||||||
<releases>
|
<releases>
|
||||||
<release version="3.9.0" date="2021-03-15"/>
|
<release version="3.9.0" date="2021-03-16"/>
|
||||||
</releases>
|
</releases>
|
||||||
<content_rating type="oars-1.0">
|
<content_rating type="oars-1.0">
|
||||||
<content_attribute id="violence-cartoon">none</content_attribute>
|
<content_attribute id="violence-cartoon">none</content_attribute>
|
||||||
|
@ -1 +1 @@
|
|||||||
Subproject commit 9c10723bfbaf6cb85107d6ee16e0324e9e487749
|
Subproject commit 47f4125753452eff8800dbd6600c5a05540b15d9
|
28
resources/scripts/scrapers/search-xml-feeds.py
Executable file
28
resources/scripts/scrapers/search-xml-feeds.py
Executable file
@ -0,0 +1,28 @@
|
|||||||
|
# Produces the list of links to XML feeds as extracted from input list of generic URLs.
# This script expects to have the file path passed as the only input parameter.

import re
import sys
import urllib.request
from urllib.parse import urljoin

# <link> tags that advertise an ATOM/RSS/JSON feed, and the href attribute inside them.
# Compiled once at module level since they are reused for every URL.
REGEXP_LINK = re.compile(r"<link[^>]+type=\"application/(?:atom\+xml|rss\+xml|feed\+json|json)\"[^>]*>")
REGEXP_HREF = re.compile(r"href=\"([^\"]+)\"")


def extract_feed_links(html, base_url):
    """Return feed URLs advertised in *html*, resolved against *base_url*.

    urljoin leaves absolute URLs untouched and resolves both "/path" and
    "relative.xml" hrefs (the original code only handled the leading-"/"
    form and printed other relative hrefs unresolved).
    """
    links = []
    for link_tag in REGEXP_LINK.findall(html):
        for href in REGEXP_HREF.findall(link_tag):
            links.append(urljoin(base_url, href))
    return links


def main():
    """Read URLs (one per line) from the file named by argv[1]; print found feed links."""
    urls_file = sys.argv[1]

    with open(urls_file) as f:
        urls_lines = [line.rstrip() for line in f]

    for url in urls_lines:
        if not url:
            # Skip blank lines in the input file instead of crashing on urlopen("").
            continue

        # Download HTML data; the with-block closes the connection when done.
        with urllib.request.urlopen(url) as url_response:
            html = url_response.read().decode("utf-8")

        # Search for XML feeds with regexps.
        for link in extract_feed_links(html, url):
            print(link)


if __name__ == "__main__":
    main()
|
Loading…
x
Reference in New Issue
Block a user