mirror of
https://github.com/martinrotter/rssguard.git
synced 2025-01-31 17:44:52 +01:00
Scraper that searches HTML sites for feed links
This commit is contained in:
parent
3ac28d135c
commit
24df33dd89
@ -30,7 +30,7 @@
|
|||||||
<url type="donation">https://martinrotter.github.io/donate/</url>
|
<url type="donation">https://martinrotter.github.io/donate/</url>
|
||||||
<content_rating type="oars-1.1" />
|
<content_rating type="oars-1.1" />
|
||||||
<releases>
|
<releases>
|
||||||
<release version="3.9.0" date="2021-03-15"/>
|
<release version="3.9.0" date="2021-03-16"/>
|
||||||
</releases>
|
</releases>
|
||||||
<content_rating type="oars-1.0">
|
<content_rating type="oars-1.0">
|
||||||
<content_attribute id="violence-cartoon">none</content_attribute>
|
<content_attribute id="violence-cartoon">none</content_attribute>
|
||||||
|
@ -1 +1 @@
|
|||||||
Subproject commit 9c10723bfbaf6cb85107d6ee16e0324e9e487749
|
Subproject commit 47f4125753452eff8800dbd6600c5a05540b15d9
|
28
resources/scripts/scrapers/search-xml-feeds.py
Executable file
28
resources/scripts/scrapers/search-xml-feeds.py
Executable file
@ -0,0 +1,28 @@
|
|||||||
|
# Produces the list of links to XML feeds as extracted from input list of generic URLs.
# This script expects to have the file path passed as the only input parameter.

import re
import sys
import urllib.request
from urllib.parse import urljoin

# <link> tags that advertise an ATOM/RSS/JSON feed, and the href attribute inside them.
# Compiled once at module level since they are reused for every URL.
REGEXP_LINK = re.compile(r"<link[^>]+type=\"application/(?:atom\+xml|rss\+xml|feed\+json|json)\"[^>]*>")
REGEXP_HREF = re.compile(r"href=\"([^\"]+)\"")


def extract_feed_links(html, base_url):
    """Return feed URLs advertised in *html*, resolved against *base_url*.

    urljoin leaves absolute URLs untouched and resolves both "/path" and
    "relative.xml" hrefs (the original code only handled the leading-"/"
    form and printed other relative hrefs unresolved).
    """
    links = []
    for link_tag in REGEXP_LINK.findall(html):
        for href in REGEXP_HREF.findall(link_tag):
            links.append(urljoin(base_url, href))
    return links


def main():
    """Read URLs (one per line) from the file named by argv[1]; print found feed links."""
    urls_file = sys.argv[1]

    with open(urls_file) as f:
        urls_lines = [line.rstrip() for line in f]

    for url in urls_lines:
        if not url:
            # Skip blank lines in the input file instead of crashing on urlopen("").
            continue

        # Download HTML data; the with-block closes the connection when done.
        with urllib.request.urlopen(url) as url_response:
            html = url_response.read().decode("utf-8")

        # Search for XML feeds with regexps.
        for link in extract_feed_links(html, url):
            print(link)


if __name__ == "__main__":
    main()
|
Loading…
x
Reference in New Issue
Block a user