# Downloads full (HTML) articles for an ATOM or RSS 2.0 feed and replaces the original articles.
#
# The script uses only the Python 3 standard library, so there are no extra
# dependencies to install. Requires Python 3.7+ (for sys.stdin.reconfigure).
#
# You must provide raw ATOM or RSS 2.0 UTF-8 feed XML data on standard input, for example with curl:
# curl 'http://rss.cnn.com/rss/edition.rss' | python ./scrape-full-articles.py "4"
#
# The script accepts one optional command line argument, the number of parallel
# threads (default 1):
# scrape-full-articles.py [NUMBER-OF-PARALLEL-THREADS]

import json
import sys
import urllib.request
import xml.etree.ElementTree as ET
from concurrent.futures import ThreadPoolExecutor

# Globals.
atom_ns = {"atom": "http://www.w3.org/2005/Atom"}
article_parser_url = "https://us-central1-technews-251304.cloudfunctions.net/article-parser?url="
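# The article-parser endpoint above is an external web service: it takes an
# article URL as a query parameter and returns the extracted full-text HTML
# as JSON.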


# Methods.
def process_article(article, is_rss, is_atom):
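    """Replace one article's content in place with the scraped full text.

    Any failure (missing XML element, network error, malformed JSON) is
    swallowed and the article is left unchanged.
    """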
    try:
        # Extract the article link.
        scraped_article = ""

        if is_rss:
            article_link = article.find("link").text
        elif is_atom:
            article_link = article.find("atom:link", atom_ns).attrib['href']

        # Scrape with article-parser.
        link = article_parser_url + article_link

        response = urllib.request.urlopen(link)
        text = response.read().decode("utf-8")
        js = json.loads(text)

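        # A successful article-parser response has "error" == 0 and carries
        # the extracted full text under data.content.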
        if int(js["error"]) == 0:
            scraped_article = js["data"]["content"]

        # Save the scraped data back into the feed element.
        if scraped_article:
            if is_rss:
                article.find("description").text = scraped_article
            elif is_atom:
                article.find("atom:content", atom_ns).text = scraped_article
    except Exception:
        # Scraping is best-effort: leave the article unchanged on any failure.
        pass


def main():
    # Number of parallel worker threads, taken from the optional first argument.
    no_threads = int(sys.argv[1]) if len(sys.argv) >= 2 else 1

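    # Read the whole feed from stdin as UTF-8.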
    sys.stdin.reconfigure(encoding="utf-8")

    #feed_data = urllib.request.urlopen("https://dilbert.com/feed").read()
    feed_data = sys.stdin.read()
    feed_document = ET.fromstring(feed_data)

    # Determine the feed type from the root element. ElementTree reports
    # namespaced tags in "{namespace}tag" form, hence the expanded ATOM name.
    is_rss = feed_document.tag == "rss"
    is_atom = feed_document.tag == "{http://www.w3.org/2005/Atom}feed"

    if not is_rss and not is_atom:
        sys.exit("Input is neither an ATOM nor an RSS 2.0 feed.")

    # Extract articles.
    if is_rss:
        feed_articles = feed_document.findall(".//item")
    elif is_atom:
        feed_articles = feed_document.findall(".//atom:entry", atom_ns)

    # Scrape articles.
    if no_threads > 1:
        with ThreadPoolExecutor(max_workers=no_threads) as executor:
            futures = []
            for article in feed_articles:
                futures.append(
                    executor.submit(process_article, article, is_rss, is_atom))
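            # result() blocks until the future finishes and re-raises any
            # exception that escaped process_article.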
            for future in futures:
                future.result()
    else:
        for article in feed_articles:
            process_article(article, is_rss, is_atom)
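
    # Emit the updated feed as XML on stdout.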
    print(ET.tostring(feed_document, encoding="unicode"))


if __name__ == '__main__':
    main()