From 2c1bc5a937f5508d61a7cdd0493aba0e45b546f2 Mon Sep 17 00:00:00 2001 From: Martin Rotter Date: Fri, 16 Apr 2021 11:03:13 +0200 Subject: [PATCH] better in the news --- resources/scripts/scrapers/wiki-inthenews.py | 21 ++++++++++---------- 1 file changed, 10 insertions(+), 11 deletions(-) diff --git a/resources/scripts/scrapers/wiki-inthenews.py b/resources/scripts/scrapers/wiki-inthenews.py index d95847f54..d7988c1ae 100755 --- a/resources/scripts/scrapers/wiki-inthenews.py +++ b/resources/scripts/scrapers/wiki-inthenews.py @@ -3,7 +3,7 @@ import urllib.request import re import json -from html.parser import HTMLParser +from newspaper import Article url = "https://en.wikipedia.org/wiki/Main_Page" response = urllib.request.urlopen(url) @@ -15,21 +15,20 @@ articles_li = re.findall("
  • ([\S\n\t\v ]+?)<\/li>", text_li) # Iterate all articles and generate JSON feed entries. wiki_base_url = "https://en.wikipedia.org" -class HTMLFilter(HTMLParser): - text = "" - def handle_data(self, data): - self.text += data json_feed = "{{\"title\": \"Wikipedia - In the news\", \"items\": [{items}]}}" items = list() for article in articles_li: - article_url = json.dumps(wiki_base_url + re.search("^.+?href=\"(.+?)\"", article).group(1)) - f = HTMLFilter() - f.feed(article) - f.text - article_title = json.dumps(f.text) - article_html = json.dumps("
    {}
    ".format(article)) + article_url = wiki_base_url + re.search("^.+?href=\"(.+?)\"", article).group(1) + + f = Article(article_url, keep_article_html = True) + f.download() + f.parse() + + article_url = json.dumps(article_url) + article_title = json.dumps(f.title) + article_html = json.dumps(f.article_html) items.append("{{\"title\": {title}, \"content_html\": {html}, \"url\": {url}}}".format(title=article_title, html=article_html, url=article_url))