diff --git a/resources/desktop/com.github.rssguard.appdata.xml b/resources/desktop/com.github.rssguard.appdata.xml
index 414a44f21..4d3f9db79 100644
--- a/resources/desktop/com.github.rssguard.appdata.xml
+++ b/resources/desktop/com.github.rssguard.appdata.xml
@@ -30,7 +30,7 @@
 https://martinrotter.github.io/donate/
-
+
 none
diff --git a/resources/scripts/7za b/resources/scripts/7za
index 9c10723bf..47f412575 160000
--- a/resources/scripts/7za
+++ b/resources/scripts/7za
@@ -1 +1 @@
-Subproject commit 9c10723bfbaf6cb85107d6ee16e0324e9e487749
+Subproject commit 47f4125753452eff8800dbd6600c5a05540b15d9
diff --git a/resources/scripts/scrapers/scrape-rss2.py b/resources/scripts/scrapers/scrape-rss2.py
new file mode 100755
index 000000000..63ab40eb7
--- /dev/null
+++ b/resources/scripts/scrapers/scrape-rss2.py
@@ -0,0 +1,56 @@
# Downloads full articles for RSS 2.0 feed and replaces original articles.
#
# Make sure to have all dependencies installed:
#   pip3 install newspaper3k
#   pip3 install asyncio (if using parallel version of the script)
#
# You must provide raw RSS 2.0 UTF-8 feed XML data as input, for example with curl:
#   curl 'http://rss.cnn.com/rss/edition.rss' | python ./scrape-rss2.py "4"
#
# You must provide one command line argument:
#   scrape-rss2.py [NUMBER-OF-PARALLEL-THREADS]

import json
import re
import sys
import time
import html
import requests
import distutils.util
import xml.etree.ElementTree as ET
from newspaper import Article
# NOTE(review): json, re, time, html, requests and distutils.util are not used
# by the visible code — confirm before removing.

# Number of worker threads, from the single required CLI argument.
no_threads = int(sys.argv[1])

if no_threads > 1:
    import asyncio
    from concurrent.futures import ThreadPoolExecutor

# Read the whole feed from stdin as UTF-8 and parse it into an element tree.
sys.stdin.reconfigure(encoding='utf-8')
rss_data = sys.stdin.read()
rss_document = ET.fromstring(rss_data)

def process_article(article):
    """Replace one <item>'s <description> with the scraped full-article HTML.

    Best-effort: on any scraping/parsing failure the item is left untouched.
    """
    link = article.find("link")
    description = article.find("description")

    # Guard explicitly instead of letting AttributeError be swallowed:
    # items without a usable <link> or a <description> are skipped as-is.
    if link is None or description is None or not link.text:
        return

    try:
        scraped = Article(link.text, keep_article_html = True)
        scraped.download()
        scraped.parse()
        description.text = scraped.article_html
    except Exception:
        # Narrowed from a bare `except:` so KeyboardInterrupt/SystemExit
        # still propagate; network/parse errors keep the original summary.
        pass

# Scrape articles.
# Scrape every feed <item>, fanning out across worker threads when requested.
items = rss_document.findall(".//item")

if no_threads > 1:
    with ThreadPoolExecutor(max_workers = no_threads) as pool:
        # Drain the map iterator so every scrape completes (and any raised
        # exception propagates) before the document is serialized.
        for _ in pool.map(process_article, items):
            pass
else:
    for item in items:
        process_article(item)

# Emit the (possibly rewritten) feed as text on stdout.
print(ET.tostring(rss_document, encoding = "unicode"))