rssguard/resources/scripts/scrapers/search-xml-feeds.py

# Produces the list of links to XML feeds as extracted from input list of generic URLs.
# This script expects to have the file path passed as the only input parameter

import re
import sys
import urllib.request
from urllib.parse import urljoin

urls_file = sys.argv[1]

with open(urls_file) as f:
  urls_lines = [line.rstrip() for line in f]

regexp_link = re.compile("<link[^>]+type=\"application\/(?:atom\+xml|rss\+xml|feed\+json|json)\"[^>]*>")
regexp_href = re.compile("href=\"([^\"]+)\"")

for url in urls_lines:
  # Download HTML data.
  try:
    url_response = urllib.request.urlopen(url)
    html =  url_response.read().decode("utf-8")
  except:
    continue

  # Search for XML feeds with regexps.
  for link_tag in re.findall(regexp_link, html):
    for link_xml_feed in re.findall(regexp_href, link_tag):
      if link_xml_feed.startswith("/"):
        print(urljoin(url, "/") + link_xml_feed[1:])
      else:
        print(link_xml_feed)
scraper for searching of feeds in html sites 2021-03-16 08:59:20 +01:00			`# Produces the list of links to XML feeds as extracted from input list of generic URLs.`
			`# This script expects to have the file path passed as the only input parameter`

			`import re`
			`import sys`
			`import urllib.request`
			`from urllib.parse import urljoin`

			`urls_file = sys.argv[1]`

			`with open(urls_file) as f:`
			`urls_lines = [line.rstrip() for line in f]`

			`regexp_link = re.compile("<link[^>]+type=\"application\/(?:atom\+xml\|rss\+xml\|feed\+json\|json)\"[^>]*>")`
			`regexp_href = re.compile("href=\"([^\"]+)\"")`

			`for url in urls_lines:`
			`# Download HTML data.`
scraper for searching of feeds in html sites 2021-03-16 09:02:21 +01:00			`try:`
			`url_response = urllib.request.urlopen(url)`
			`html = url_response.read().decode("utf-8")`
			`except:`
			`continue`

scraper for searching of feeds in html sites 2021-03-16 08:59:20 +01:00			`# Search for XML feeds with regexps.`
			`for link_tag in re.findall(regexp_link, html):`
			`for link_xml_feed in re.findall(regexp_href, link_tag):`
			`if link_xml_feed.startswith("/"):`
			`print(urljoin(url, "/") + link_xml_feed[1:])`
			`else:`
			`print(link_xml_feed)`