From d9330210da1b6bb92e4153c86d8a3d89dc29f0ac Mon Sep 17 00:00:00 2001 From: Martin Rotter Date: Tue, 18 Oct 2022 06:29:20 +0200 Subject: [PATCH] fix full site scraper! --- resources/desktop/com.github.rssguard.appdata.xml | 2 +- resources/scripts/scrapers/scrape-full-articles.py | 7 +++---- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/resources/desktop/com.github.rssguard.appdata.xml b/resources/desktop/com.github.rssguard.appdata.xml index 651d60dc3..01c9643f5 100644 --- a/resources/desktop/com.github.rssguard.appdata.xml +++ b/resources/desktop/com.github.rssguard.appdata.xml @@ -24,7 +24,7 @@ https://github.com/sponsors/martinrotter - + none diff --git a/resources/scripts/scrapers/scrape-full-articles.py b/resources/scripts/scrapers/scrape-full-articles.py index 60616299e..466530c1e 100644 --- a/resources/scripts/scrapers/scrape-full-articles.py +++ b/resources/scripts/scrapers/scrape-full-articles.py @@ -16,8 +16,7 @@ import xml.etree.ElementTree as ET # Globals. atom_ns = {"atom": "http://www.w3.org/2005/Atom"} -article_parser_url = "https://demos.pwshub.com/article-parser?url=" - +article_parser_url = "https://extract-article.deta.dev/?url=" # Methods. def process_article(article, is_rss, is_atom): @@ -59,7 +58,7 @@ def main(): sys.stdin.reconfigure(encoding="utf-8") - #feed_data = urllib.request.urlopen("https://dilbert.com/feed").read() + #feed_data = urllib.request.urlopen("http://feeds.hanselman.com/ScottHanselman").read() feed_data = sys.stdin.read() feed_document = ET.fromstring(feed_data) @@ -89,7 +88,7 @@ def main(): for article in feed_articles: process_article(article, is_rss, is_atom) - print(ET.tostring(feed_document, encoding="unicode")) + print(ET.tostring(feed_document).decode()) if __name__ == '__main__':