# rssguard/resources/scripts/scrapers/wiki-inthenews.py

# Obtains today's articles from Wikipedia's "In the news" section and prints them as a JSON feed.

import urllib.request
import re
import json
from html.parser import HTMLParser

# Download the Wikipedia main page. The "with" block closes the HTTP
# response as soon as the body has been read.
url = "https://en.wikipedia.org/wiki/Main_Page"
with urllib.request.urlopen(url) as response:
    text = response.read().decode("utf-8")

# The patterns are raw strings so that "\S" reaches the regex engine untouched
# instead of being an invalid string escape (a SyntaxWarning on recent Python
# versions). The character class [\S\n\t\v ] lets the lazy match run across
# line breaks.
news_match = re.search(r"In the news[\S\n\t\v ]+?<ul>([\S\n\t\v ]+?)</ul>", text)
if news_match is None:
    # Fail with a readable message instead of an AttributeError on None.
    raise RuntimeError("Could not find the \"In the news\" list on " + url)

# Inner HTML of the first <ul> matched after the text "In the news", and the
# inner HTML of each <li> inside it.
text_li = news_match.group(1)
articles_li = re.findall(r"<li>([\S\n\t\v ]+?)</li>", text_li)

# Base URL of the site; link targets scraped from the page are appended to it
# further down.
wiki_base_url = "https://en.wikipedia.org"

class HTMLFilter(HTMLParser):
    """HTML parser that keeps only the character data it is handed.

    Only ``handle_data`` is overridden, so tags themselves contribute nothing;
    after feeding a fragment the collected text is available in ``text``.
    """

    # Starting value shared through the class; an instance gets its own
    # attribute the first time data is appended.
    text = ""

    def handle_data(self, data):
        """Append one run of character data reported by the parser."""
        self.text = "".join((self.text, data))

# Turn every scraped <li> entry into a JSON Feed item and print the whole feed.
# The document is assembled as plain Python objects and serialized once with
# json.dumps(); with the default separators this yields the same text as
# formatting the JSON by hand, while all escaping happens in one place.
href_pattern = re.compile(r'^.+?href="(.+?)"')
items = []

for article in articles_li:
    # The first link found in the entry supplies the item's URL.
    href_match = href_pattern.search(article)
    # An entry without any link must not abort the whole feed; such an item
    # simply points at the site root.
    article_path = href_match.group(1) if href_match else ""

    # Strip the markup to obtain a plain-text title.
    f = HTMLFilter()
    f.feed(article)
    f.close()  # Flush any text the parser may still be buffering.

    items.append({
        "title": f.text,
        "content_html": "<div>{}</div>".format(article),
        "url": wiki_base_url + article_path,
    })

json_feed = json.dumps({"title": "Wikipedia - In the news", "items": items})

print(json_feed)