mirror of
https://github.com/martinrotter/rssguard.git
synced 2025-01-19 12:22:47 +01:00
new scraper
This commit is contained in:
parent
5ecd47daec
commit
425bcfe1b3
33
resources/scripts/scrapers/product-hunt.py
Executable file
33
resources/scripts/scrapers/product-hunt.py
Executable file
@ -0,0 +1,33 @@
|
||||
# Obtains the posts listed on a Product Hunt topic page and prints them as a JSON feed.
|
||||
# Sample input file whose contents must be provided as stdin: "https://www.producthunt.com/topics/XXXX"
|
||||
|
||||
import json
|
||||
import re
|
||||
import sys
|
||||
import urllib.request
|
||||
from html.parser import HTMLParser
|
||||
from bs4 import BeautifulSoup
|
||||
from datetime import datetime
|
||||
|
||||
# Scrapes a Product Hunt topic page (HTML supplied on stdin) and prints a
# JSON Feed document with one item per post to stdout.


def _build_item(content, published):
    """Return a feed item dict for one post element, or None if it is incomplete.

    content   -- the parsed "styles_content__*" div of a single post.
    published -- timestamp string stored as the item's "date_published".

    A post is expected to hold an <h3> with the title and a <p> whose first
    link points at the post; elements missing any of these are skipped
    instead of aborting the whole feed with an AttributeError/KeyError.
    """
    heading = content.h3
    paragraph = content.p
    link = paragraph.a if paragraph is not None else None
    href = link.get("href") if link is not None else None

    if heading is None or not href:
        return None

    return {
        "title": heading.text,
        "content_html": paragraph.text,
        # Links are concatenated onto the site origin, as hrefs are assumed
        # to be site-relative (e.g. "/posts/...") -- TODO confirm.
        "url": "https://www.producthunt.com" + href,
        "date_published": published,
    }


input_data = sys.stdin.read()
soup = BeautifulSoup(input_data, 'html.parser')

# Class names are matched by prefix; the part after "__" is presumably a
# generated suffix. Both patterns are compiled once, outside the loops.
regex_batch = re.compile(r'^styles_postContent__.+$')
regex_single = re.compile(r'^styles_content__.+$')

# The script does not extract a publication time from the page, so every item
# is stamped with the time of the scrape. astimezone() attaches the local UTC
# offset so the value is a complete RFC 3339 timestamp.
article_time = datetime.now().astimezone().isoformat()

items = []

for post_content in soup.find_all("div", {"class": regex_batch}):
    for content in post_content.find_all("div", {"class": regex_single}):
        item = _build_item(content, article_time)

        if item is not None:
            items.append(item)

# A page without <title> still yields a valid feed rather than a crash.
feed_title = soup.title.text if soup.title is not None else "Product Hunt"

# Serialise the whole structure in one go; default json.dumps separators
# produce the same layout the hand-built template used to.
json_feed = json.dumps({"title": feed_title, "items": items})
print(json_feed)
|
Loading…
Reference in New Issue
Block a user