"""Embed linked images into the items of a Funnyjunk.com RSS feed.

The feed XML is read from stdin.  For every ``<item>`` in the first
``<channel>``, the page referenced by the item's ``<link>`` is downloaded and
every GIF/PNG/JPG hyperlink found in it is appended to the item as an
``<enclosure url="..." type="image/...">`` element.  The modified feed is
written to stdout.
"""

# Embeds GIF/PNG/JPG images into individual messages from Funnyjunk.com
# Sample input file whose contents must be provided as stdin: "https://funnyjunk.com/rss/most_popular.rss"

import json
import re
import sys
import urllib.request
import xml.etree.ElementTree as ET
from datetime import datetime

# Seconds to wait for a single article page before giving up on it.
FETCH_TIMEOUT_SECONDS = 30.0

# Captures (full URL, extension) for every https image hyperlink.  The quote
# character is excluded from the URL body so that a match can never run past
# the end of the href attribute into a following attribute of the same tag.
IMAGE_LINK_PATTERN = re.compile(r'href="(https://[^<>"]+\.(gif|png|jpg))"')

# File extension -> registered media type ("image/jpg" is not a valid type).
MIME_TYPES = {
    "gif": "image/gif",
    "png": "image/png",
    "jpg": "image/jpeg",
}


def fetch_page(url: str, timeout: float = FETCH_TIMEOUT_SECONDS) -> str:
    """Download *url* and return its body as text.

    Args:
        url: Absolute http(s) URL of the page to download.
        timeout: Maximum number of seconds to wait for the server.

    Returns:
        The response body decoded with the charset announced by the server
        (UTF-8 when none is announced or the announced one is unknown).
        Undecodable bytes are replaced instead of raising.

    Raises:
        ValueError: If *url* is not an http(s) URL.
        OSError: If the download fails (includes ``urllib.error.URLError``
            and socket timeouts).
    """
    # The link comes from the feed, i.e. from untrusted input: do not let it
    # point urlopen() at file:, ftp: or other local/unexpected schemes.
    if not url.lower().startswith(("http://", "https://")):
        raise ValueError(f"unsupported URL scheme: {url!r}")

    # The context manager closes the connection even if reading fails.
    with urllib.request.urlopen(url, timeout=timeout) as response:
        raw = response.read()
        charset = response.headers.get_content_charset() or "utf-8"

    try:
        return raw.decode(charset, errors="replace")
    except LookupError:
        # The server announced a codec Python does not know.
        return raw.decode("utf-8", errors="replace")


def find_image_links(html: str):
    """Return the unique ``(url, extension)`` pairs linked from *html*.

    Pairs are returned in order of first appearance; repeated links to the
    same image are reported once.
    """
    return list(dict.fromkeys(IMAGE_LINK_PATTERN.findall(html)))


def embed_enclosures(feed_xml: str, fetch=fetch_page) -> str:
    """Add an ``<enclosure>`` per linked image to every item of *feed_xml*.

    Args:
        feed_xml: The RSS document as text.
        fetch: Callable taking a URL and returning the page text.  It may
            raise ``OSError`` or ``ValueError`` to signal a failed download.

    Returns:
        The feed, serialised as a unicode string, with the enclosures added.
        Items without a link, and items whose page cannot be downloaded, are
        left unchanged; a feed without a ``<channel>`` is returned as parsed.

    Raises:
        xml.etree.ElementTree.ParseError: If *feed_xml* is not well-formed.
    """
    root = ET.fromstring(feed_xml)
    channel = root.find("channel")

    # Snapshot the items first: the loop below adds children to them, and
    # ElementTree does not define what iter() does when the tree is modified
    # while it is being walked.
    items = list(channel.iter("item")) if channel is not None else []

    for item in items:
        link = (item.findtext("link") or "").strip()
        if not link:
            continue

        try:
            html = fetch(link)
        except (OSError, ValueError) as err:
            # One unreachable article must not discard the whole feed.
            print(f"Skipping {link}: {err}", file=sys.stderr)
            continue

        for url, extension in find_image_links(html):
            enclosure = ET.SubElement(item, "enclosure")
            enclosure.set("url", url)
            enclosure.set("type", MIME_TYPES[extension])

    return ET.tostring(root, encoding="unicode")


def main() -> None:
    """Read the feed from stdin and print the enriched feed to stdout."""
    print(embed_enclosures(sys.stdin.read()))


if __name__ == "__main__":
    main()