fix full site scraper!
This commit is contained in:
parent
577f1f9884
commit
d9330210da
@@ -24,7 +24,7 @@
|
|||||||
<url type="donation">https://github.com/sponsors/martinrotter</url>
|
<url type="donation">https://github.com/sponsors/martinrotter</url>
|
||||||
<content_rating type="oars-1.1" />
|
<content_rating type="oars-1.1" />
|
||||||
<releases>
|
<releases>
|
||||||
<release version="4.2.5" date="2022-10-14"/>
|
<release version="4.2.5" date="2022-10-18"/>
|
||||||
</releases>
|
</releases>
|
||||||
<content_rating type="oars-1.0">
|
<content_rating type="oars-1.0">
|
||||||
<content_attribute id="violence-cartoon">none</content_attribute>
|
<content_attribute id="violence-cartoon">none</content_attribute>
|
||||||
|
@@ -16,8 +16,7 @@ import xml.etree.ElementTree as ET
|
|||||||
|
|
||||||
# Globals.
|
# Globals.
|
||||||
atom_ns = {"atom": "http://www.w3.org/2005/Atom"}
|
atom_ns = {"atom": "http://www.w3.org/2005/Atom"}
|
||||||
article_parser_url = "https://demos.pwshub.com/article-parser?url="
|
article_parser_url = "https://extract-article.deta.dev/?url="
|
||||||
|
|
||||||
|
|
||||||
# Methods.
|
# Methods.
|
||||||
def process_article(article, is_rss, is_atom):
|
def process_article(article, is_rss, is_atom):
|
||||||
@@ -59,7 +58,7 @@ def main():
|
|||||||
|
|
||||||
sys.stdin.reconfigure(encoding="utf-8")
|
sys.stdin.reconfigure(encoding="utf-8")
|
||||||
|
|
||||||
#feed_data = urllib.request.urlopen("https://dilbert.com/feed").read()
|
#feed_data = urllib.request.urlopen("http://feeds.hanselman.com/ScottHanselman").read()
|
||||||
feed_data = sys.stdin.read()
|
feed_data = sys.stdin.read()
|
||||||
feed_document = ET.fromstring(feed_data)
|
feed_document = ET.fromstring(feed_data)
|
||||||
|
|
||||||
@@ -89,7 +88,7 @@ def main():
|
|||||||
for article in feed_articles:
|
for article in feed_articles:
|
||||||
process_article(article, is_rss, is_atom)
|
process_article(article, is_rss, is_atom)
|
||||||
|
|
||||||
print(ET.tostring(feed_document, encoding="unicode"))
|
print(ET.tostring(feed_document).decode())
|
||||||
|
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
|
Loading…
x
Reference in New Issue
Block a user