Scraper for discovering feeds in HTML sites

This commit is contained in:
Martin Rotter 2021-03-16 08:59:20 +01:00
parent 3ac28d135c
commit 24df33dd89
3 changed files with 30 additions and 2 deletions

View File

@ -30,7 +30,7 @@
<url type="donation">https://martinrotter.github.io/donate/</url>
<content_rating type="oars-1.1" />
<releases>
<release version="3.9.0" date="2021-03-15"/>
<release version="3.9.0" date="2021-03-16"/>
</releases>
<content_rating type="oars-1.0">
<content_attribute id="violence-cartoon">none</content_attribute>

@ -1 +1 @@
Subproject commit 9c10723bfbaf6cb85107d6ee16e0324e9e487749
Subproject commit 47f4125753452eff8800dbd6600c5a05540b15d9

View File

@ -0,0 +1,28 @@
# Produces the list of links to XML feeds as extracted from an input list of generic URLs.
# This script expects the file path of the URL list as its only command-line parameter.
import re
import sys
import urllib.request
from urllib.parse import urljoin

# <link> tags that advertise a feed (Atom, RSS, or JSON feed) via their MIME type.
# Compiled once at module level so the inner loops reuse the compiled patterns.
_LINK_RE = re.compile(r'<link[^>]+type="application/(?:atom\+xml|rss\+xml|feed\+json|json)"[^>]*>')
_HREF_RE = re.compile(r'href="([^"]+)"')


def extract_feed_links(base_url, html):
    """Return the absolute feed URLs advertised in *html*.

    Every href found inside a feed <link> tag is resolved against
    *base_url* with urljoin, which correctly handles absolute,
    root-relative ("/feed.xml"), and relative ("feed.xml") hrefs.
    (The previous version only normalized hrefs starting with "/";
    plain relative hrefs were printed verbatim and thus unusable.)
    """
    links = []
    for link_tag in _LINK_RE.findall(html):
        for href in _HREF_RE.findall(link_tag):
            links.append(urljoin(base_url, href))
    return links


def main():
    """Read one URL per line from the file given in argv[1] and print feed links."""
    urls_file = sys.argv[1]
    with open(urls_file) as f:
        urls = [line.rstrip() for line in f]
    for url in urls:
        # Download the HTML page; assumes UTF-8 encoding — TODO confirm
        # (the original made the same assumption).
        html = urllib.request.urlopen(url).read().decode("utf-8")
        for feed_url in extract_feed_links(url, html):
            print(feed_url)


if __name__ == "__main__":
    main()