Parse HTML instead of RSS (to get more content)

This commit is contained in:
octt 2022-08-15 22:32:34 +00:00
parent 1bfd002a03
commit 60d5b90866
1 changed file with 87 additions and 47 deletions

View File

@ -4,6 +4,7 @@ import feedparser
import os import os
import urllib.request import urllib.request
import email, smtplib, ssl import email, smtplib, ssl
from bs4 import BeautifulSoup
from email import encoders from email import encoders
from email.mime.base import MIMEBase from email.mime.base import MIMEBase
from email.mime.multipart import MIMEMultipart from email.mime.multipart import MIMEMultipart
@ -18,60 +19,97 @@ Password = "Example"
Server = "smtp.example.com" Server = "smtp.example.com"
Port = 465 Port = 465
OnlyOwnPosts = False
MailSend = False
LocalSave = True LocalSave = True
MailSend = True NoSpacesFile = False
StripWS = '\t\r\n'
def MakePathStr(Str, NoSpaces=None):
    """Return Str with characters that are unsafe in file names replaced by '_'.

    The characters ``< > : " / \\ | ? *`` are always replaced. Spaces are
    replaced as well when NoSpaces is true.

    Args:
        Str: Text to turn into a file or directory name.
        NoSpaces: Whether to also replace spaces. When left as None, the
            module-level NoSpacesFile setting decides.

    Returns:
        The sanitized string; an empty input yields an empty string.
    """
    if NoSpaces is None:
        # Fall back to the script-wide configuration flag.
        NoSpaces = NoSpacesFile
    Forbidden = '<>:"/\\|?*'
    if NoSpaces:
        Forbidden += ' '
    # One translation pass instead of one .replace() call per character.
    return Str.translate(str.maketrans(Forbidden, '_' * len(Forbidden)))
def Main(): def Main():
Feeds = [feedparser.parse(URL)['entries'] for URL in URLs] for URL in URLs:
for Feed in Feeds: URL = URL.rstrip('/').rstrip('/with_replies') + '/with_replies'
Feed.reverse() # Order from oldest to newest Usertag = f"{URL.split('/')[-2]}@{URL.split('/')[-3]}"
for Entry in Feed: try:
if os.path.isfile('MastodonFeedToHTML.db'): Response = urllib.request.urlopen(URL)
with open('MastodonFeedToHTML.db', 'r') as Db: Data = Response.read()
if Entry['id'] in Db.read().splitlines(): Soup = BeautifulSoup(Data, 'html.parser')
continue Feed = Soup.find_all('div', class_='entry')
Feed.reverse() # Order from oldest to newest
try: for Entry in Feed:
print(f"{Entry['id']} - {Entry['title']}")
Attached = '' Attached = ''
HTML = f"""\ GlobalId = Entry.find('a', class_='u-url')['href'].lstrip('https://').lstrip('http://')
<h1>{Entry['title']}</h1> LocalId = GlobalId.split('/')[-1]
Username = Entry.find('a', class_='status__display-name').get_text().strip(StripWS)
Content = Entry.find('div', class_='e-content')
StatusPrepend = Entry.find('div', class_='status__prepend')
StatusPrepend = StatusPrepend.get_text().strip(StripWS)[len(Username):] if StatusPrepend else ''
StatusPrepend = ' ' + StatusPrepend.strip(StripWS) if StatusPrepend else ''
if OnlyOwnPosts and StatusPrepend:
continue
Title = Content.get_text().strip(StripWS)
Title = f"{Usertag}{StatusPrepend}: {Title[:32]}..."
for Emoji in Entry.find_all('img', class_='custom-emoji'):
Emoji['style'] = 'max-height:1em;'
Attachments = Entry.find('ul', class_='attachment-list__list')
if Attachments:
for Attachment in Attachments:
Href, Alt = '', ''
Attachment = str(Attachment).strip(StripWS).replace("'",'"').split('"')
for i,e in enumerate(Attachment):
if e.endswith('<a href='):
Href = Attachment[i+1]
elif e.endswith('title='):
Alt = "'".join(Attachment[i+1:-1])
if Href:
Response = urllib.request.urlopen(Href)
Data = Response.read()
Type = 'img' if Href.lower().endswith(('.png','.jpg','.jpeg')) else 'img'
Mime = f"image/{Href.lower().split('.')[-1]}"
Opening = f'<{Type} alt="{Alt}" title="{Alt}"' if Type == 'img' else f'<{Type} controls'
Closing = '>' if Type == 'img' else f"></{Type}>"
Attached += f"""{Opening} style="max-width:100%;max-height:100vh;" src="data:{Mime};base64,{base64.b64encode(Data).decode()}"{Closing}\n"""
Entry.find('img', class_='u-photo account__avatar')['style'] = 'max-height:2em;'#'display:none; visibility:hidden;'
Entry.find('div', class_='status__action-bar').replace_with('')
<div id="content"> if os.path.isfile('MastodonFeedToHTML.db'):
{Entry['summary']} with open('MastodonFeedToHTML.db', 'r') as Db:
if GlobalId in Db.read().splitlines():
pass #continue
print(f"-> {LocalId} - {Title}")
HTML = f"""\
<h1>{Title}</h1>
<div>
{Entry}
{{ Replace:Attached }} {{ Replace:Attached }}
</div> </div>
<br><hr><br> <br><hr><br>
<p>Published on {Entry['published']}</p>
<p>From <a href="{Entry['link']}">{Entry['link']}</a></p>
<br>
<h3>JSON dump</h3>
<div style="overflow-x:scroll;">
<xmp>
{Entry}
</xmp>
</div>
<p><i>Via <a href="https://gitlab.com/-/snippets/2388397">https://gitlab.com/-/snippets/2388397</a></i></p> <p><i>Via <a href="https://gitlab.com/-/snippets/2388397">https://gitlab.com/-/snippets/2388397</a></i></p>
""" """
Message = MIMEMultipart() #print(HTML)
Message['From'] = Sender
Message['To'] = ', '.join(Receivers)
Message['Subject'] = Entry['title']
Message.attach(MIMEText(HTML.replace('{ Replace:Attached }', ''), 'html'))
for Link in Entry['links']: if MailSend:
Message = MIMEMultipart()
Message['From'] = Sender
Message['To'] = ', '.join(Receivers)
Message['Subject'] = Entry['title']
Message.attach(MIMEText(HTML.replace('{ Replace:Attached }', ''), 'html'))
for Link in []:#Entry['links']:
if Link['type'].startswith(('audio/', 'image/', 'video/')): if Link['type'].startswith(('audio/', 'image/', 'video/')):
Response = urllib.request.urlopen(Link['href']) Response = urllib.request.urlopen(Link['href'])
Data = Response.read() Data = Response.read()
@ -79,14 +117,16 @@ def Main():
Opening = f"<{Type}" if Type == 'img' else f"<{Type} controls" Opening = f"<{Type}" if Type == 'img' else f"<{Type} controls"
Closing = '>' if Type == 'img' else f"></{Type}>" Closing = '>' if Type == 'img' else f"></{Type}>"
Attached += f"""{Opening} style="max-width:100%;max-height:100vh;" src="data:{Link['type']};base64,{base64.b64encode(Data).decode()}"{Closing}\n""" Attached += f"""{Opening} style="max-width:100%;max-height:100vh;" src="data:{Link['type']};base64,{base64.b64encode(Data).decode()}"{Closing}\n"""
File = MIMEBase(Link['type'].split('/')[0], Link['type'].split('/')[1])
File.set_payload(Data) if MailSend:
encoders.encode_base64(File) File = MIMEBase(Link['type'].split('/')[0], Link['type'].split('/')[1])
File.add_header( File.set_payload(Data)
"Content-Disposition", encoders.encode_base64(File)
f"attachment; filename= {Link['href'].split('/')[-1]}", File.add_header(
) "Content-Disposition",
Message.attach(File) f"attachment; filename= {Link['href'].split('/')[-1]}",
)
Message.attach(File)
if MailSend: if MailSend:
with smtplib.SMTP_SSL(Server, Port, context=ssl.create_default_context()) as Client: with smtplib.SMTP_SSL(Server, Port, context=ssl.create_default_context()) as Client:
@ -94,18 +134,18 @@ def Main():
Client.sendmail(Sender, Receivers, Message.as_string()) Client.sendmail(Sender, Receivers, Message.as_string())
if LocalSave: if LocalSave:
LocalBackupDir = MakePathStr(Entry['title_detail']['base'].lstrip('https://').lstrip('http://')) LocalBackupDir = MakePathStr(Usertag)
if not os.path.isdir(LocalBackupDir): if not os.path.isdir(LocalBackupDir):
os.mkdir(LocalBackupDir) os.mkdir(LocalBackupDir)
FileName = MakePathStr(f"{Entry['id'].split('/')[-1]} - {Entry['title']}") FileName = MakePathStr(f"{GlobalId.split('/')[-1]} - {Title}")
with open(f'{LocalBackupDir}/{FileName}.html', 'w') as File: with open(f'{LocalBackupDir}/{FileName}.html', 'w') as File:
File.write(HTML.replace('{ Replace:Attached }', Attached)) File.write(HTML.replace('{ Replace:Attached }', Attached))
with open('MastodonFeedToHTML.db', 'a') as Db: with open('MastodonFeedToHTML.db', 'a') as Db:
Db.write(Entry['id'] + '\n') pass #Db.write(GlobalId + '\n')
except Exception: except Exception:
raise raise
if __name__ == '__main__': if __name__ == '__main__':
Main() Main()