This commit is contained in:
Martin Rotter 2021-03-27 20:25:07 +01:00 committed by Martin Rotter
parent e7b1c4b6d9
commit 16e8d9a42f
2 changed files with 32 additions and 1 deletions

@ -1 +1 @@
Subproject commit 47f4125753452eff8800dbd6600c5a05540b15d9
Subproject commit 9c10723bfbaf6cb85107d6ee16e0324e9e487749

View File

@ -0,0 +1,31 @@
# Obtains articles from "connect.nj.com"
# Sample input file whose contents must be provided as stdin: "https://connect.nj.com/staff/bklapisch/posts.html"
import json
import re
import sys
import html
import urllib.request
from html.parser import HTMLParser
from bs4 import BeautifulSoup
from datetime import datetime
def _article_to_item(article):
    """Convert one ``<article>`` element into a JSON Feed item dict.

    Args:
        article: A BeautifulSoup tag for a single ``<article>`` element.

    Returns:
        A dict with the keys ``title``, ``content_html``, ``url`` and
        ``date_published`` (in that order), or ``None`` when the article
        lacks the heading link or the ``<time datetime=...>`` needed to
        build a usable item.
    """
    heading = article.find("h2")
    link = heading.a if heading is not None else None
    time_tag = article.find("time")
    if link is None or time_tag is None:
        return None

    url = link.get("href")
    published = time_tag.get("datetime")
    if url is None or published is None:
        return None

    summary = article.find("p")
    return {
        # NOTE(review): this strips every space from the title; the replaced
        # character may originally have been something else (e.g. a tab or a
        # non-breaking space) -- confirm against real page markup.
        "title": heading.text.replace(" ", ""),
        # A missing <p> yields an empty body instead of the literal "None".
        "content_html": "" if summary is None else str(summary),
        "url": url,
        "date_published": published,
    }


# The page is piped in on stdin; force UTF-8 regardless of the locale.
sys.stdin.reconfigure(encoding='utf-8')
input_data = sys.stdin.read()
soup = BeautifulSoup(input_data, 'html.parser')

# Articles that cannot be converted are skipped so that one malformed entry
# does not abort the whole feed.
items = [
    item
    for item in map(_article_to_item, soup.find_all("article"))
    if item is not None
]

# Pages without a <title> get an empty feed title rather than crashing.
feed_title = soup.title.text if soup.title is not None else ""

# Serialize in one step; key order and default separators match the layout
# the script has always emitted: {"title": ..., "items": [...]}.
json_feed = json.dumps({"title": feed_title, "items": items})
print(json_feed)