fix full site scraper!

Martin Rotter 2022-10-18 06:29:20 +02:00
parent 577f1f9884
commit d9330210da
2 changed files with 4 additions and 5 deletions


@@ -24,7 +24,7 @@
   <url type="donation">https://github.com/sponsors/martinrotter</url>
   <content_rating type="oars-1.1" />
   <releases>
-    <release version="4.2.5" date="2022-10-14"/>
+    <release version="4.2.5" date="2022-10-18"/>
   </releases>
   <content_rating type="oars-1.0">
     <content_attribute id="violence-cartoon">none</content_attribute>


@@ -16,8 +16,7 @@ import xml.etree.ElementTree as ET
 # Globals.
 atom_ns = {"atom": "http://www.w3.org/2005/Atom"}
-article_parser_url = "https://demos.pwshub.com/article-parser?url="
+article_parser_url = "https://extract-article.deta.dev/?url="
 # Methods.
 def process_article(article, is_rss, is_atom):
@@ -59,7 +58,7 @@ def main():
     sys.stdin.reconfigure(encoding="utf-8")
-    #feed_data = urllib.request.urlopen("https://dilbert.com/feed").read()
+    #feed_data = urllib.request.urlopen("http://feeds.hanselman.com/ScottHanselman").read()
     feed_data = sys.stdin.read()
     feed_document = ET.fromstring(feed_data)
@@ -89,7 +88,7 @@ def main():
     for article in feed_articles:
         process_article(article, is_rss, is_atom)
-    print(ET.tostring(feed_document, encoding="unicode"))
+    print(ET.tostring(feed_document).decode())
 if __name__ == '__main__':
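
For context, the hunks above touch three spots of the scraper script: the article-parser endpoint, a commented-out sample feed, and the final serialization of the rewritten feed. The sketch below shows the overall flow such a script follows; the JSON shape returned by the parser service ({"data": {"content": ...}}) and the simplified element handling standing in for the real process_article() are assumptions, not taken from this diff.

#!/usr/bin/python3
# Minimal sketch of the full-site-scraper flow; the article-parser response
# fields and the element handling are assumptions, not part of this commit.
import json
import sys
import urllib.parse
import urllib.request
import xml.etree.ElementTree as ET

atom_ns = {"atom": "http://www.w3.org/2005/Atom"}
article_parser_url = "https://extract-article.deta.dev/?url="

def scrape_full_text(link):
    # Ask the article-parser service for the readable body of the linked page.
    request_url = article_parser_url + urllib.parse.quote(link, safe="")
    with urllib.request.urlopen(request_url) as response:
        payload = json.loads(response.read().decode("utf-8"))
    # Assumed response shape: {"data": {"content": "<p>...</p>", ...}}.
    return payload.get("data", {}).get("content", "")

def main():
    sys.stdin.reconfigure(encoding="utf-8")
    feed_document = ET.fromstring(sys.stdin.read())

    # Root tag "{http://www.w3.org/2005/Atom}feed" means Atom, otherwise RSS.
    is_atom = feed_document.tag.endswith("feed")
    if is_atom:
        feed_articles = feed_document.findall("atom:entry", atom_ns)
    else:
        feed_articles = feed_document.findall("channel/item")

    for article in feed_articles:
        if is_atom:
            link = article.find("atom:link", atom_ns).get("href")
            target = article.find("atom:content", atom_ns)
            if target is None:
                target = ET.SubElement(article, "{http://www.w3.org/2005/Atom}content")
        else:
            link = article.findtext("link")
            target = article.find("description")
            if target is None:
                target = ET.SubElement(article, "description")
        # Replace the short summary with the scraped full article body.
        target.text = scrape_full_text(link)

    # Serialize to ASCII-safe bytes and decode, matching the last hunk.
    print(ET.tostring(feed_document).decode())

if __name__ == '__main__':
    main()

One likely motivation for the print() change: ET.tostring() without an encoding argument returns ASCII-safe bytes (non-ASCII characters become numeric character references), so decoding and printing cannot hit a console-encoding error, whereas encoding="unicode" emits the raw characters directly.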