mirror of
https://github.com/martinrotter/rssguard.git
synced 2025-01-17 03:07:03 +01:00
scraper
This commit is contained in:
parent
a3e2e8fa77
commit
7446d7323b
27
resources/scripts/scrapers/funnyjunk-embed-gifs.py
Normal file
27
resources/scripts/scrapers/funnyjunk-embed-gifs.py
Normal file
@ -0,0 +1,27 @@
|
||||
# Embeds GIFs into individual messages from Funnyjunk.com
|
||||
# Sample input file whose contents must be provided as stdin: "https://funnyjunk.com/rss/most_popular.rss"
|
||||
|
||||
import json
|
||||
import re
|
||||
import sys
|
||||
import urllib.request
|
||||
import xml.etree.ElementTree as ET
|
||||
from datetime import datetime
|
||||
|
||||
input_data = sys.stdin.read()
|
||||
|
||||
tree = ET.fromstring(input_data)
|
||||
pattern = re.compile("href=\"(https://[^<>]+\.(gif|png|jpg))\"")
|
||||
|
||||
for ite in tree.find("channel").iter("item"):
|
||||
link = ite.find("link").text
|
||||
#print(link)
|
||||
response = urllib.request.urlopen(link)
|
||||
text = response.read().decode("utf-8")
|
||||
for pic_link in re.findall(pattern, text):
|
||||
new = ET.SubElement(ite, "enclosure")
|
||||
new.set("url", pic_link[0])
|
||||
new.set("type", "image/" + pic_link[1])
|
||||
#print(ET.tostring(ite, encoding="unicode"))
|
||||
|
||||
print(ET.tostring(tree, encoding="unicode"))
|
Loading…
Reference in New Issue
Block a user