mirror of https://github.com/martinrotter/rssguard.git
synced 2025-01-19 12:22:47 +01:00
Another script
This commit is contained in:
parent 93d673ef74
commit 6c45d020f4
55  resources/scripts/scrapers/scrape-as-rss2.py  (executable file)
@@ -0,0 +1,55 @@
# Downloads full articles for an RSS 2.0 feed and replaces the original article descriptions.
#
# All modules used here ship with the Python 3 standard library, so no extra
# packages need to be installed.
#
# You must provide raw RSS 2.0 UTF-8 feed XML data on standard input, for example with curl:
#   curl 'http://rss.cnn.com/rss/edition.rss' | python3 ./scrape-as-rss2.py "4"
#
# You must provide one command line argument:
#   scrape-as-rss2.py [NUMBER-OF-PARALLEL-THREADS]

import json
import sys
import urllib.request
import xml.etree.ElementTree as ET

# Number of worker threads, taken from the first (and only) command line argument.
no_threads = int(sys.argv[1])

if no_threads > 1:
    from concurrent.futures import ThreadPoolExecutor

# Read the whole RSS 2.0 feed from standard input and parse it.
sys.stdin.reconfigure(encoding='utf-8')
rss_data = sys.stdin.read()
rss_document = ET.fromstring(rss_data)

def process_article(article):
    try:
        # Ask the article-parser service for the full text of the item's link.
        link = "https://us-central1-technews-251304.cloudfunctions.net/article-parser?url=" + article.find("link").text
        response = urllib.request.urlopen(link)
        text = response.read().decode("utf-8")
        js = json.loads(text)

        # On success, replace the item's short description with the full article content.
        if int(js["error"]) == 0:
            article.find("description").text = js["data"]["content"]
    except Exception:
        # If the download or parsing fails, keep the original description.
        pass
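# Note: the response format is inferred from the code above, not from any API
# documentation; the article-parser endpoint appears to return JSON roughly like
#   {"error": 0, "data": {"content": "<full article HTML>"}}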

# Scrape articles, in parallel if more than one thread was requested.
if no_threads > 1:
    with ThreadPoolExecutor(max_workers=no_threads) as executor:
        futures = []
        for article in rss_document.findall(".//item"):
            futures.append(executor.submit(process_article, article))
        for future in futures:
            future.result()
else:
    for article in rss_document.findall(".//item"):
        process_article(article)

# Print the updated feed back to standard output as XML.
print(ET.tostring(rss_document, encoding="unicode"))
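If asyncio were preferred over concurrent.futures for the parallel branch, a roughly equivalent sketch could look like the following. This is illustrative only and not part of the commit: scrape_all is a hypothetical helper that reuses process_article, rss_document and no_threads from the script above.

import asyncio

async def scrape_all(items, max_workers):
    # Run the blocking process_article() calls in the default thread pool,
    # at most max_workers at a time.
    loop = asyncio.get_running_loop()
    sem = asyncio.Semaphore(max_workers)

    async def scrape_one(article):
        async with sem:
            await loop.run_in_executor(None, process_article, article)

    await asyncio.gather(*(scrape_one(a) for a in items))

# Would replace the ThreadPoolExecutor branch:
# asyncio.run(scrape_all(rss_document.findall(".//item"), no_threads))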