From 16e8d9a42ff85a39b6eb751b15023d3baf66d7c3 Mon Sep 17 00:00:00 2001
From: Martin Rotter
Date: Sat, 27 Mar 2021 20:25:07 +0100
Subject: [PATCH] scraper

---
 resources/scripts/7za               |  2 +-
 resources/scripts/scrapers/njcom.py | 64 +++++++++++++++++++++++++++++
 2 files changed, 65 insertions(+), 1 deletion(-)
 create mode 100644 resources/scripts/scrapers/njcom.py

diff --git a/resources/scripts/7za b/resources/scripts/7za
index 47f412575..9c10723bf 160000
--- a/resources/scripts/7za
+++ b/resources/scripts/7za
@@ -1 +1 @@
-Subproject commit 47f4125753452eff8800dbd6600c5a05540b15d9
+Subproject commit 9c10723bfbaf6cb85107d6ee16e0324e9e487749
diff --git a/resources/scripts/scrapers/njcom.py b/resources/scripts/scrapers/njcom.py
new file mode 100644
index 000000000..3d0f8aa39
--- /dev/null
+++ b/resources/scripts/scrapers/njcom.py
@@ -0,0 +1,64 @@
+# Obtains articles from "connect.nj.com".
+#
+# The HTML of a staff posts page must be provided on stdin, for example the
+# contents of "https://connect.nj.com/staff/bklapisch/posts.html".
+# A JSON document with the keys "title" and "items" is printed to stdout.
+
+import json
+import re
+import sys
+import html
+import urllib.request
+from html.parser import HTMLParser
+from bs4 import BeautifulSoup
+from datetime import datetime
+
+
+def article_to_item(article):
+    """Build the feed item dict for one <article> element.
+
+    Returns None when the article has no <h2><a href="..."> link, so a single
+    incomplete article is skipped instead of aborting the whole feed.
+    """
+    heading = article.find("h2")
+    link = heading.a if heading is not None else None
+    url = link.get("href") if link is not None else None
+    if url is None:
+        return None
+
+    summary = article.find("p")
+    item = {
+        # Normalize whitespace (non-breaking spaces included) rather than
+        # deleting spaces, which would fuse the headline words together.
+        "title": " ".join(heading.text.split()),
+        # str(None) would otherwise put the literal text "None" in the feed.
+        "content_html": "" if summary is None else str(summary),
+        "url": url,
+    }
+
+    # The publication date is optional; only emit it when the page has one.
+    stamp = article.find("time")
+    if stamp is not None and stamp.has_attr("datetime"):
+        item["date_published"] = stamp["datetime"]
+
+    return item
+
+
+def main():
+    """Read the page HTML from stdin and print the JSON feed to stdout."""
+    sys.stdin.reconfigure(encoding="utf-8")
+    soup = BeautifulSoup(sys.stdin.read(), "html.parser")
+
+    candidates = (article_to_item(a) for a in soup.find_all("article"))
+    items = [item for item in candidates if item is not None]
+
+    # Pages without a <title> element still produce a feed.
+    feed_title = soup.title.text if soup.title is not None else ""
+
+    # json.dumps' default separators give the same '"key": value, ...' layout
+    # as a hand-written '{"title": ..., "items": [...]}' template would.
+    print(json.dumps({"title": feed_title, "items": items}))
+
+
+if __name__ == "__main__":
+    main()