# rssguard/resources/scripts/scrapers/wiki-qotd.py
# Downloads all quotes of the day.
import urllib.request
import json
from bs4 import BeautifulSoup
url = "https://en.wikiquote.org/wiki/Wikiquote:Quote_of_the_day/Complete_list"
response = urllib.request.urlopen(url)
text = response.read().decode("utf-8")
soup = BeautifulSoup(text, 'html.parser')
lists = soup.find_all("ul")
items = list()
json_feed = "{{\"title\": {title}, \"items\": [{items}]}}"
for lst in lists:
try:
last_link = lst.find_all("a")[-1]
quote_link = last_link.get("href")
if quote_link.startswith("/"):
quote_link = "https://en.wikiquote.org" + quote_link
quote_author = last_link.get_text()
2021-05-20 10:14:11 +02:00
if not quote_author:
continue
quote_text = "<span>" + lst.find("li").decode_contents() + "</span>"
quote_heading = lst.find("li")
quote_heading = (quote_heading.get_text()[:75] + '...') if len(quote_heading) > 75 else quote_heading.get_text()
quote_heading = quote_heading.split(" ~")[0]
2021-05-20 10:02:10 +02:00
items.append("{{\"title\": {title}, \"authors\": [{{\"name\": {author}}}], \"content_html\": {html}, \"url\": {url}, \"date_published\": {date}}}".format(
title = json.dumps(quote_heading),
html = json.dumps(quote_text),
url = json.dumps(quote_link),
author = json.dumps(quote_author),
date = json.dumps("2020-12-31T08:00:00")))
except:
continue
json_feed = json_feed.format(title = json.dumps(soup.title.text), items = ", ".join(items))
print(json_feed)