mirror of
https://github.com/martinrotter/rssguard.git
synced 2025-01-19 12:22:47 +01:00
Another script
This commit is contained in:
parent
93d673ef74
commit
6c45d020f4
55
resources/scripts/scrapers/scrape-as-rss2.py
Executable file
55
resources/scripts/scrapers/scrape-as-rss2.py
Executable file
@ -0,0 +1,55 @@
|
|||||||
|
# Downloads full articles for RSS 2.0 feed and replaces original articles.
|
||||||
|
#
|
||||||
|
# Make sure to have all dependencies installed:
|
||||||
|
# (asyncio ships with Python 3; no pip install is needed for the parallel version of the script)
|
||||||
|
#
|
||||||
|
# You must provide raw RSS 2.0 UTF-8 feed XML data as input, for example with curl:
|
||||||
|
# curl 'http://rss.cnn.com/rss/edition.rss' | python ./scrape-as-rss2.py "4"
|
||||||
|
#
|
||||||
|
# The script takes one command line argument:
|
||||||
|
# scrape-as-rss2.py [NUMBER-OF-PARALLEL-THREADS]
|
||||||
|
|
||||||
|
import json
|
||||||
|
import re
|
||||||
|
import sys
|
||||||
|
import time
|
||||||
|
import html
|
||||||
|
import urllib.request
|
||||||
|
import distutils.util
|
||||||
|
import xml.etree.ElementTree as ET
|
||||||
|
|
||||||
|
# The number of worker threads is taken from the first command line argument.
# When the argument is omitted, fall back to 1 (sequential processing) instead
# of failing with an IndexError.
try:
    no_threads = int(sys.argv[1]) if len(sys.argv) > 1 else 1
except ValueError:
    # sys.exit() with a string prints it to stderr and exits with status 1.
    sys.exit("NUMBER-OF-PARALLEL-THREADS must be an integer, got: " + repr(sys.argv[1]))

if no_threads > 1:
    # These modules are only needed by the parallel code path.
    import asyncio
    from concurrent.futures import ThreadPoolExecutor

# The feed is expected on stdin as UTF-8, independent of the platform locale.
sys.stdin.reconfigure(encoding='utf-8')
rss_data = sys.stdin.read()
rss_document = ET.fromstring(rss_data)
|
||||||
|
|
||||||
|
def process_article(article):
    """Replace an RSS item's description with the full article text.

    The URL in the item's ``<link>`` element is sent to the remote
    article-parser service. When the service reports no error, the text of
    the item's ``<description>`` element is overwritten in place with the
    returned content.

    This is deliberately best-effort: a missing element, a network failure or
    a malformed response leaves the item untouched, so the rest of the feed
    can still be emitted.

    Args:
        article: An ``xml.etree.ElementTree.Element`` for one ``<item>``.

    Returns:
        None. The element is modified in place.
    """
    # Imported locally so the function does not rely on ``urllib.parse`` being
    # loaded as a side effect of ``import urllib.request``.
    from urllib.parse import quote

    link_element = article.find("link")
    description = article.find("description")
    if link_element is None or description is None:
        # Nothing to fetch, or nowhere to store the result: skip the request.
        return

    article_url = (link_element.text or "").strip()
    if not article_url:
        return

    # Percent-encode the article URL so that characters such as "&", "#" or
    # spaces stay inside the "url" query parameter instead of corrupting it.
    request_url = (
        "https://us-central1-technews-251304.cloudfunctions.net/article-parser?url="
        + quote(article_url, safe="")
    )

    try:
        # The context manager closes the HTTP response even if reading fails.
        with urllib.request.urlopen(request_url) as response:
            payload = json.loads(response.read().decode("utf-8"))

        if int(payload["error"]) == 0:
            content = payload["data"]["content"]
            # Only text can be stored in an element; anything else would break
            # serialization of the whole document later on.
            if isinstance(content, str) and content:
                description.text = content
    except Exception:
        # Best-effort: keep the original description on any failure. Unlike a
        # bare "except:", this lets KeyboardInterrupt/SystemExit propagate.
        return
|
||||||
|
|
||||||
|
# Scrape articles.
articles = rss_document.findall(".//item")

if no_threads > 1:
    # Downloads are I/O-bound, so a thread pool lets the network waits overlap.
    with ThreadPoolExecutor(max_workers=no_threads) as executor:
        futures = [executor.submit(process_article, article) for article in articles]

        # Wait for every download; result() re-raises anything a worker raised.
        for future in futures:
            future.result()
else:
    for article in articles:
        process_article(article)

# Emit the feed as UTF-8 regardless of the platform locale, mirroring how
# stdin is read; otherwise print() can fail on characters the locale's
# encoding cannot represent.
sys.stdout.reconfigure(encoding="utf-8")
print(ET.tostring(rss_document, encoding="unicode"))
|
Loading…
Reference in New Issue
Block a user