# Recovered from git patch 4667f71f63ba93cc02d41ef7f4619e37b216fcd7
# ("[PATCH] abclinuxu scraper", Martin Rotter, Fri, 5 Feb 2021 09:56:49 +0100),
# which created resources/scripts/scrapers/abc-my-comments.py (mode 100755).
#
# Obtains Abclinuxu's "mé komentáře" ("my comments") page as a JSON feed.
#
# Usage: abc-my-comments.py <uid>
#
# The feed is printed to standard output as a single JSON object of the form
# {"title": ..., "items": [{"title", "content_html", "url", "date_published"}]}.

import json
import re
import sys
import urllib.parse
import urllib.request
from datetime import datetime
from html.parser import HTMLParser

BASE_URL = "https://www.abclinuxu.cz"
HISTORY_URL = BASE_URL + "/History?type=comments&uid={uid}"

# Format of the timestamps captured by ROW_RE, e.g. "5.2.2021 09:56".
DATE_FORMAT = "%d.%m.%Y %H:%M"

# NOTE(review): the copy this script was restored from had the HTML tags inside
# the three patterns below stripped out (only the text between tags and the
# escaped "<\/table>" survived). The tag parts -- <h1>, <table ...>, <tr>, </a>
# and </tr> -- are reconstructions; confirm them against the live page markup.
TITLE_RE = re.compile(r'<h1\b[^>]*>(.+?)</h1>')
TABLE_RE = re.compile(r'<table\b[^>]*>(.+?)</table>', re.S)
# Groups: 1 = link target (site-relative), 2 = link label (may contain markup),
# 3 = timestamp in DATE_FORMAT.
ROW_RE = re.compile(
    r'<tr\b[^>]*>.+?href="(.+?)">(.+?)</a>.+?"td-datum">.+?'
    r'(\d{1,2}\.\d{1,2}\.\d{4} \d{2}:\d{2}).+?</tr>',
    re.S,
)


class HTMLFilter(HTMLParser):
    """Collect only the text content of the HTML fed to it in ``text``."""

    def __init__(self):
        super().__init__()
        # Per-instance accumulator for the text seen so far.
        self.text = ""

    def handle_data(self, data):
        self.text += data


def strip_tags(fragment):
    """Return the plain text of an HTML fragment, with tags removed.

    Character references are decoded by the parser (its default behavior).
    """
    parser = HTMLFilter()
    parser.feed(fragment)
    # Without close() the parser keeps back trailing text that ends in an
    # unterminated "&..." (for example "AT&T"), so it would be lost.
    parser.close()
    return parser.text


def build_feed(page_html):
    """Turn the HTML of a comments-history page into a JSON-feed dictionary.

    Args:
        page_html: Decoded HTML text of the page.

    Returns:
        A dict with the page heading under "title" and one entry per matched
        table row under "items".

    Raises:
        ValueError: If the heading or the table cannot be located, or a
            captured timestamp does not parse with DATE_FORMAT.
    """
    heading = TITLE_RE.search(page_html)
    if heading is None:
        raise ValueError("page heading not found")

    table = TABLE_RE.search(page_html)
    if table is None:
        raise ValueError("comments table not found")

    items = []
    for row in ROW_RE.finditer(table.group(1)):
        link, label_html, stamp = row.groups()
        label = strip_tags(label_html)
        items.append({
            "title": label,
            # The page offers no separate body here, so the title is reused.
            "content_html": label,
            "url": BASE_URL + link,
            # Naive timestamp: the page text carries no timezone information.
            "date_published": datetime.strptime(stamp, DATE_FORMAT).isoformat(),
        })

    return {"title": heading.group(1), "items": items}


def fetch_page(uid):
    """Download the comments-history page for *uid* and return it as text."""
    url = HISTORY_URL.format(uid=urllib.parse.quote(uid, safe=""))
    with urllib.request.urlopen(url) as response:
        return response.read().decode("utf-8")


def main(argv):
    """Command-line entry point; returns the process exit status.

    Args:
        argv: Full argument vector; ``argv[1]`` is the user id to query.
    """
    if len(argv) < 2:
        program = argv[0] if argv else "abc-my-comments.py"
        print("usage: {} <uid>".format(program), file=sys.stderr)
        return 2

    try:
        feed = build_feed(fetch_page(argv[1]))
    except ValueError as error:
        print("abc-my-comments: {}".format(error), file=sys.stderr)
        return 1

    # Serialising the whole structure at once guarantees valid JSON even when
    # the page heading contains quotes or backslashes.
    print(json.dumps(feed))
    return 0


if __name__ == "__main__":
    sys.exit(main(sys.argv))