Parse HTML instead of RSS (to get more content)

This commit is contained in:
octt 2022-08-15 22:32:34 +00:00
parent 1bfd002a03
commit 60d5b90866

View File

@ -4,6 +4,7 @@ import feedparser
import os import os
import urllib.request import urllib.request
import email, smtplib, ssl import email, smtplib, ssl
from bs4 import BeautifulSoup
from email import encoders from email import encoders
from email.mime.base import MIMEBase from email.mime.base import MIMEBase
from email.mime.multipart import MIMEMultipart from email.mime.multipart import MIMEMultipart
@ -18,60 +19,97 @@ Password = "Example"
Server = "smtp.example.com" Server = "smtp.example.com"
Port = 465 Port = 465
OnlyOwnPosts = False
MailSend = False
LocalSave = True LocalSave = True
MailSend = True NoSpacesFile = False
StripWS = '\t\r\n'
def MakePathStr(Str, NoSpaces=None):
    """Return Str with characters that are unsafe in file names replaced by '_'.

    The characters ``< > : " / \\ | ? *`` are always replaced. Spaces are
    replaced as well when NoSpaces is true.

    Args:
        Str: The text to turn into a path component.
        NoSpaces: Whether spaces are also replaced. When left as None the
            module-level NoSpacesFile setting decides.

    Returns:
        A copy of Str with every unsafe character replaced by an underscore.
    """
    if NoSpaces is None:
        # Fall back to the script-wide configuration flag.
        NoSpaces = NoSpacesFile
    Unsafe = '<>:"/\\|?*'
    if NoSpaces:
        Unsafe += ' '
    # One translation pass instead of one .replace() call per character.
    return Str.translate(str.maketrans(Unsafe, '_' * len(Unsafe)))
def Main(): def Main():
Feeds = [feedparser.parse(URL)['entries'] for URL in URLs] for URL in URLs:
for Feed in Feeds: URL = URL.rstrip('/').rstrip('/with_replies') + '/with_replies'
Usertag = f"{URL.split('/')[-2]}@{URL.split('/')[-3]}"
try:
Response = urllib.request.urlopen(URL)
Data = Response.read()
Soup = BeautifulSoup(Data, 'html.parser')
Feed = Soup.find_all('div', class_='entry')
Feed.reverse() # Order from oldest to newest Feed.reverse() # Order from oldest to newest
for Entry in Feed: for Entry in Feed:
Attached = ''
GlobalId = Entry.find('a', class_='u-url')['href'].lstrip('https://').lstrip('http://')
LocalId = GlobalId.split('/')[-1]
Username = Entry.find('a', class_='status__display-name').get_text().strip(StripWS)
Content = Entry.find('div', class_='e-content')
StatusPrepend = Entry.find('div', class_='status__prepend')
StatusPrepend = StatusPrepend.get_text().strip(StripWS)[len(Username):] if StatusPrepend else ''
StatusPrepend = ' ' + StatusPrepend.strip(StripWS) if StatusPrepend else ''
if OnlyOwnPosts and StatusPrepend:
continue
Title = Content.get_text().strip(StripWS)
Title = f"{Usertag}{StatusPrepend}: {Title[:32]}..."
for Emoji in Entry.find_all('img', class_='custom-emoji'):
Emoji['style'] = 'max-height:1em;'
Attachments = Entry.find('ul', class_='attachment-list__list')
if Attachments:
for Attachment in Attachments:
Href, Alt = '', ''
Attachment = str(Attachment).strip(StripWS).replace("'",'"').split('"')
for i,e in enumerate(Attachment):
if e.endswith('<a href='):
Href = Attachment[i+1]
elif e.endswith('title='):
Alt = "'".join(Attachment[i+1:-1])
if Href:
Response = urllib.request.urlopen(Href)
Data = Response.read()
Type = 'img' if Href.lower().endswith(('.png','.jpg','.jpeg')) else 'img'
Mime = f"image/{Href.lower().split('.')[-1]}"
Opening = f'<{Type} alt="{Alt}" title="{Alt}"' if Type == 'img' else f'<{Type} controls'
Closing = '>' if Type == 'img' else f"></{Type}>"
Attached += f"""{Opening} style="max-width:100%;max-height:100vh;" src="data:{Mime};base64,{base64.b64encode(Data).decode()}"{Closing}\n"""
Entry.find('img', class_='u-photo account__avatar')['style'] = 'max-height:2em;'#'display:none; visibility:hidden;'
Entry.find('div', class_='status__action-bar').replace_with('')
if os.path.isfile('MastodonFeedToHTML.db'): if os.path.isfile('MastodonFeedToHTML.db'):
with open('MastodonFeedToHTML.db', 'r') as Db: with open('MastodonFeedToHTML.db', 'r') as Db:
if Entry['id'] in Db.read().splitlines(): if GlobalId in Db.read().splitlines():
continue pass #continue
try: print(f"-> {LocalId} - {Title}")
print(f"{Entry['id']} - {Entry['title']}")
Attached = ''
HTML = f"""\ HTML = f"""\
<h1>{Entry['title']}</h1> <h1>{Title}</h1>
<div id="content"> <div>
{Entry['summary']} {Entry}
{{ Replace:Attached }} {{ Replace:Attached }}
</div> </div>
<br><hr><br> <br><hr><br>
<p>Published on {Entry['published']}</p>
<p>From <a href="{Entry['link']}">{Entry['link']}</a></p>
<br>
<h3>JSON dump</h3>
<div style="overflow-x:scroll;">
<xmp>
{Entry}
</xmp>
</div>
<p><i>Via <a href="https://gitlab.com/-/snippets/2388397">https://gitlab.com/-/snippets/2388397</a></i></p> <p><i>Via <a href="https://gitlab.com/-/snippets/2388397">https://gitlab.com/-/snippets/2388397</a></i></p>
""" """
#print(HTML)
if MailSend:
Message = MIMEMultipart() Message = MIMEMultipart()
Message['From'] = Sender Message['From'] = Sender
Message['To'] = ', '.join(Receivers) Message['To'] = ', '.join(Receivers)
Message['Subject'] = Entry['title'] Message['Subject'] = Entry['title']
Message.attach(MIMEText(HTML.replace('{ Replace:Attached }', ''), 'html')) Message.attach(MIMEText(HTML.replace('{ Replace:Attached }', ''), 'html'))
for Link in Entry['links']: for Link in []:#Entry['links']:
if Link['type'].startswith(('audio/', 'image/', 'video/')): if Link['type'].startswith(('audio/', 'image/', 'video/')):
Response = urllib.request.urlopen(Link['href']) Response = urllib.request.urlopen(Link['href'])
Data = Response.read() Data = Response.read()
@ -79,6 +117,8 @@ def Main():
Opening = f"<{Type}" if Type == 'img' else f"<{Type} controls" Opening = f"<{Type}" if Type == 'img' else f"<{Type} controls"
Closing = '>' if Type == 'img' else f"></{Type}>" Closing = '>' if Type == 'img' else f"></{Type}>"
Attached += f"""{Opening} style="max-width:100%;max-height:100vh;" src="data:{Link['type']};base64,{base64.b64encode(Data).decode()}"{Closing}\n""" Attached += f"""{Opening} style="max-width:100%;max-height:100vh;" src="data:{Link['type']};base64,{base64.b64encode(Data).decode()}"{Closing}\n"""
if MailSend:
File = MIMEBase(Link['type'].split('/')[0], Link['type'].split('/')[1]) File = MIMEBase(Link['type'].split('/')[0], Link['type'].split('/')[1])
File.set_payload(Data) File.set_payload(Data)
encoders.encode_base64(File) encoders.encode_base64(File)
@ -94,15 +134,15 @@ def Main():
Client.sendmail(Sender, Receivers, Message.as_string()) Client.sendmail(Sender, Receivers, Message.as_string())
if LocalSave: if LocalSave:
LocalBackupDir = MakePathStr(Entry['title_detail']['base'].lstrip('https://').lstrip('http://')) LocalBackupDir = MakePathStr(Usertag)
if not os.path.isdir(LocalBackupDir): if not os.path.isdir(LocalBackupDir):
os.mkdir(LocalBackupDir) os.mkdir(LocalBackupDir)
FileName = MakePathStr(f"{Entry['id'].split('/')[-1]} - {Entry['title']}") FileName = MakePathStr(f"{GlobalId.split('/')[-1]} - {Title}")
with open(f'{LocalBackupDir}/{FileName}.html', 'w') as File: with open(f'{LocalBackupDir}/{FileName}.html', 'w') as File:
File.write(HTML.replace('{ Replace:Attached }', Attached)) File.write(HTML.replace('{ Replace:Attached }', Attached))
with open('MastodonFeedToHTML.db', 'a') as Db: with open('MastodonFeedToHTML.db', 'a') as Db:
Db.write(Entry['id'] + '\n') pass #Db.write(GlobalId + '\n')
except Exception: except Exception:
raise raise