mirror of
https://github.com/martinrotter/rssguard.git
synced 2025-01-29 08:39:41 +01:00
scraper
This commit is contained in:
parent
a3e2e8fa77
commit
7446d7323b
27
resources/scripts/scrapers/funnyjunk-embed-gifs.py
Normal file
27
resources/scripts/scrapers/funnyjunk-embed-gifs.py
Normal file
@ -0,0 +1,27 @@
|
|||||||
|
# Embeds GIFs into individual messages from Funnyjunk.com
|
||||||
|
# Sample input: the contents of an RSS feed such as "https://funnyjunk.com/rss/most_popular.rss" must be provided on stdin.
|
||||||
|
|
||||||
|
import json
|
||||||
|
import re
|
||||||
|
import sys
|
||||||
|
import urllib.request
|
||||||
|
import xml.etree.ElementTree as ET
|
||||||
|
from datetime import datetime
|
||||||
|
|
||||||
|
# Matches an href attribute pointing at an https-hosted GIF, PNG or JPG file.
# Group 1 is the full URL, group 2 the file extension. The URL character
# class excludes the double quote so that a match cannot run past the end of
# one attribute value into later attributes of the same tag.
pattern = re.compile(r'href="(https://[^<>"]+\.(gif|png|jpg))"')

# Media types for the extensions captured by `pattern`; note that the
# registered type for .jpg files is "image/jpeg", not "image/jpg".
_MIME_BY_EXTENSION = {
    "gif": "image/gif",
    "png": "image/png",
    "jpg": "image/jpeg",
}


def _fetch_page(url):
    """Download *url* and return its body decoded as UTF-8 text.

    The response is closed as soon as the body has been read. Bytes that are
    not valid UTF-8 are replaced rather than aborting the whole run. Network
    errors raised by ``urllib`` propagate to the caller.
    """
    with urllib.request.urlopen(url) as response:
        return response.read().decode("utf-8", errors="replace")


def _find_images(html):
    """Return ``(url, mime_type)`` pairs for every image link in *html*."""
    return [(url, _MIME_BY_EXTENSION[ext]) for url, ext in pattern.findall(html)]


def embed_images(feed_xml, fetch=_fetch_page):
    """Add an ``<enclosure>`` to each feed item for every image on its page.

    Args:
        feed_xml: RSS document as a string.
        fetch: Callable taking a URL and returning that page's text.
            Defaults to a real HTTP download.

    Returns:
        The feed serialized back to a string. Items without a usable
        ``<link>`` are left untouched, and a document without a
        ``<channel>`` element is returned without any added enclosures.

    Raises:
        xml.etree.ElementTree.ParseError: If *feed_xml* is not well-formed.
    """
    tree = ET.fromstring(feed_xml)
    channel = tree.find("channel")

    if channel is not None:
        # Materialize the item list first: enclosures are appended to the
        # items below, and the tree should not change while it is traversed.
        for item in list(channel.iter("item")):
            link = (item.findtext("link") or "").strip()
            if not link:
                continue

            for url, mime_type in _find_images(fetch(link)):
                enclosure = ET.SubElement(item, "enclosure")
                enclosure.set("url", url)
                enclosure.set("type", mime_type)

    return ET.tostring(tree, encoding="unicode")


if __name__ == "__main__":
    print(embed_images(sys.stdin.read()))
|
Loading…
x
Reference in New Issue
Block a user