From 2c1bc5a937f5508d61a7cdd0493aba0e45b546f2 Mon Sep 17 00:00:00 2001
From: Martin Rotter <rotter@praktik.cz>
Date: Fri, 16 Apr 2021 11:03:13 +0200
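Subject: [PATCH] wiki-inthenews: build feed items from linked articles

Replace the hand-rolled HTMLParser-based text filter with newspaper's
Article: each "In the news" link is downloaded and parsed, and the
parsed title and article HTML are used for the JSON feed item instead
of the raw <li> markup from the Wikipedia main page.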

---
 resources/scripts/scrapers/wiki-inthenews.py | 21 ++++++++++----------
 1 file changed, 10 insertions(+), 11 deletions(-)
diff --git a/resources/scripts/scrapers/wiki-inthenews.py b/resources/scripts/scrapers/wiki-inthenews.py
index d95847f54..d7988c1ae 100755
--- a/resources/scripts/scrapers/wiki-inthenews.py
+++ b/resources/scripts/scrapers/wiki-inthenews.py
@@ -3,7 +3,7 @@
 import urllib.request
 import re
 import json
-from html.parser import HTMLParser
+from newspaper import Article
 
 url = "https://en.wikipedia.org/wiki/Main_Page"
 response = urllib.request.urlopen(url)
@@ -15,21 +15,20 @@ articles_li = re.findall("<li>([\S\n\t\v ]+?)<\/li>", text_li)
 # Iterate all articles and generate JSON feed entries.
 wiki_base_url = "https://en.wikipedia.org"
 
-class HTMLFilter(HTMLParser):
-  text = ""
-  def handle_data(self, data):
-      self.text += data
 
 json_feed = "{{\"title\": \"Wikipedia - In the news\", \"items\": [{items}]}}"
 items = list()
 
 for article in articles_li:
-  article_url = json.dumps(wiki_base_url + re.search("^.+?href=\"(.+?)\"", article).group(1))
-  f = HTMLFilter()
-  f.feed(article)
-  f.text
-  article_title = json.dumps(f.text)
-  article_html = json.dumps("<div>{}</div>".format(article))
+  article_url = wiki_base_url + re.search("^.+?href=\"(.+?)\"", article).group(1)
+
+  f = Article(article_url, keep_article_html = True)
+  f.download()
+  f.parse()
+
+  article_url = json.dumps(article_url)
+  article_title = json.dumps(f.title)
+  article_html = json.dumps(f.article_html)
   items.append("{{\"title\": {title}, \"content_html\": {html}, \"url\": {url}}}".format(title=article_title,
                                                                                          html=article_html,
                                                                                          url=article_url))