mirror of https://gitlab.com/octospacc/bottocto
Parse HTML instead of RSS (to get more content)
This commit is contained in:
parent
1bfd002a03
commit
60d5b90866
|
@ -4,6 +4,7 @@ import feedparser
|
||||||
import os
|
import os
|
||||||
import urllib.request
|
import urllib.request
|
||||||
import email, smtplib, ssl
|
import email, smtplib, ssl
|
||||||
|
from bs4 import BeautifulSoup
|
||||||
from email import encoders
|
from email import encoders
|
||||||
from email.mime.base import MIMEBase
|
from email.mime.base import MIMEBase
|
||||||
from email.mime.multipart import MIMEMultipart
|
from email.mime.multipart import MIMEMultipart
|
||||||
|
@ -18,60 +19,97 @@ Password = "Example"
|
||||||
Server = "smtp.example.com"
|
Server = "smtp.example.com"
|
||||||
Port = 465
|
Port = 465
|
||||||
|
|
||||||
|
OnlyOwnPosts = False
|
||||||
|
MailSend = False
|
||||||
LocalSave = True
|
LocalSave = True
|
||||||
MailSend = True
|
NoSpacesFile = False
|
||||||
|
|
||||||
|
|
||||||
|
StripWS = '\t\r\n'
|
||||||
|
|
||||||
def MakePathStr(Str):
|
def MakePathStr(Str):
|
||||||
for c in ('<>:"/\\|?* '):
|
for c in ('<>:"/\\|?*'):
|
||||||
Str = Str.replace(c, '_')
|
Str = Str.replace(c, '_')
|
||||||
|
if NoSpacesFile:
|
||||||
|
Str = Str.replace(' ', '_')
|
||||||
return Str
|
return Str
|
||||||
|
|
||||||
def Main():
|
def Main():
|
||||||
Feeds = [feedparser.parse(URL)['entries'] for URL in URLs]
|
for URL in URLs:
|
||||||
for Feed in Feeds:
|
URL = URL.rstrip('/').rstrip('/with_replies') + '/with_replies'
|
||||||
Feed.reverse() # Order from oldest to newest
|
Usertag = f"{URL.split('/')[-2]}@{URL.split('/')[-3]}"
|
||||||
for Entry in Feed:
|
try:
|
||||||
if os.path.isfile('MastodonFeedToHTML.db'):
|
Response = urllib.request.urlopen(URL)
|
||||||
with open('MastodonFeedToHTML.db', 'r') as Db:
|
Data = Response.read()
|
||||||
if Entry['id'] in Db.read().splitlines():
|
Soup = BeautifulSoup(Data, 'html.parser')
|
||||||
continue
|
Feed = Soup.find_all('div', class_='entry')
|
||||||
|
Feed.reverse() # Order from oldest to newest
|
||||||
|
|
||||||
try:
|
for Entry in Feed:
|
||||||
print(f"{Entry['id']} - {Entry['title']}")
|
|
||||||
Attached = ''
|
Attached = ''
|
||||||
HTML = f"""\
|
GlobalId = Entry.find('a', class_='u-url')['href'].lstrip('https://').lstrip('http://')
|
||||||
<h1>{Entry['title']}</h1>
|
LocalId = GlobalId.split('/')[-1]
|
||||||
|
Username = Entry.find('a', class_='status__display-name').get_text().strip(StripWS)
|
||||||
|
Content = Entry.find('div', class_='e-content')
|
||||||
|
StatusPrepend = Entry.find('div', class_='status__prepend')
|
||||||
|
StatusPrepend = StatusPrepend.get_text().strip(StripWS)[len(Username):] if StatusPrepend else ''
|
||||||
|
StatusPrepend = ' ' + StatusPrepend.strip(StripWS) if StatusPrepend else ''
|
||||||
|
if OnlyOwnPosts and StatusPrepend:
|
||||||
|
continue
|
||||||
|
Title = Content.get_text().strip(StripWS)
|
||||||
|
Title = f"{Usertag}{StatusPrepend}: {Title[:32]}..."
|
||||||
|
for Emoji in Entry.find_all('img', class_='custom-emoji'):
|
||||||
|
Emoji['style'] = 'max-height:1em;'
|
||||||
|
Attachments = Entry.find('ul', class_='attachment-list__list')
|
||||||
|
if Attachments:
|
||||||
|
for Attachment in Attachments:
|
||||||
|
Href, Alt = '', ''
|
||||||
|
Attachment = str(Attachment).strip(StripWS).replace("'",'"').split('"')
|
||||||
|
for i,e in enumerate(Attachment):
|
||||||
|
if e.endswith('<a href='):
|
||||||
|
Href = Attachment[i+1]
|
||||||
|
elif e.endswith('title='):
|
||||||
|
Alt = "'".join(Attachment[i+1:-1])
|
||||||
|
if Href:
|
||||||
|
Response = urllib.request.urlopen(Href)
|
||||||
|
Data = Response.read()
|
||||||
|
Type = 'img' if Href.lower().endswith(('.png','.jpg','.jpeg')) else 'img'
|
||||||
|
Mime = f"image/{Href.lower().split('.')[-1]}"
|
||||||
|
Opening = f'<{Type} alt="{Alt}" title="{Alt}"' if Type == 'img' else f'<{Type} controls'
|
||||||
|
Closing = '>' if Type == 'img' else f"></{Type}>"
|
||||||
|
Attached += f"""{Opening} style="max-width:100%;max-height:100vh;" src="data:{Mime};base64,{base64.b64encode(Data).decode()}"{Closing}\n"""
|
||||||
|
Entry.find('img', class_='u-photo account__avatar')['style'] = 'max-height:2em;'#'display:none; visibility:hidden;'
|
||||||
|
Entry.find('div', class_='status__action-bar').replace_with('')
|
||||||
|
|
||||||
<div id="content">
|
if os.path.isfile('MastodonFeedToHTML.db'):
|
||||||
{Entry['summary']}
|
with open('MastodonFeedToHTML.db', 'r') as Db:
|
||||||
|
if GlobalId in Db.read().splitlines():
|
||||||
|
pass #continue
|
||||||
|
|
||||||
|
print(f"-> {LocalId} - {Title}")
|
||||||
|
HTML = f"""\
|
||||||
|
<h1>{Title}</h1>
|
||||||
|
|
||||||
|
<div>
|
||||||
|
{Entry}
|
||||||
|
|
||||||
{{ Replace:Attached }}
|
{{ Replace:Attached }}
|
||||||
</div>
|
</div>
|
||||||
|
|
||||||
<br><hr><br>
|
<br><hr><br>
|
||||||
|
|
||||||
<p>Published on {Entry['published']}</p>
|
|
||||||
<p>From <a href="{Entry['link']}">{Entry['link']}</a></p>
|
|
||||||
|
|
||||||
<br>
|
|
||||||
|
|
||||||
<h3>JSON dump</h3>
|
|
||||||
<div style="overflow-x:scroll;">
|
|
||||||
<xmp>
|
|
||||||
{Entry}
|
|
||||||
</xmp>
|
|
||||||
</div>
|
|
||||||
|
|
||||||
<p><i>Via <a href="https://gitlab.com/-/snippets/2388397">https://gitlab.com/-/snippets/2388397</a></i></p>
|
<p><i>Via <a href="https://gitlab.com/-/snippets/2388397">https://gitlab.com/-/snippets/2388397</a></i></p>
|
||||||
"""
|
"""
|
||||||
Message = MIMEMultipart()
|
#print(HTML)
|
||||||
Message['From'] = Sender
|
|
||||||
Message['To'] = ', '.join(Receivers)
|
|
||||||
Message['Subject'] = Entry['title']
|
|
||||||
Message.attach(MIMEText(HTML.replace('{ Replace:Attached }', ''), 'html'))
|
|
||||||
|
|
||||||
for Link in Entry['links']:
|
if MailSend:
|
||||||
|
Message = MIMEMultipart()
|
||||||
|
Message['From'] = Sender
|
||||||
|
Message['To'] = ', '.join(Receivers)
|
||||||
|
Message['Subject'] = Entry['title']
|
||||||
|
Message.attach(MIMEText(HTML.replace('{ Replace:Attached }', ''), 'html'))
|
||||||
|
|
||||||
|
for Link in []:#Entry['links']:
|
||||||
if Link['type'].startswith(('audio/', 'image/', 'video/')):
|
if Link['type'].startswith(('audio/', 'image/', 'video/')):
|
||||||
Response = urllib.request.urlopen(Link['href'])
|
Response = urllib.request.urlopen(Link['href'])
|
||||||
Data = Response.read()
|
Data = Response.read()
|
||||||
|
@ -79,14 +117,16 @@ def Main():
|
||||||
Opening = f"<{Type}" if Type == 'img' else f"<{Type} controls"
|
Opening = f"<{Type}" if Type == 'img' else f"<{Type} controls"
|
||||||
Closing = '>' if Type == 'img' else f"></{Type}>"
|
Closing = '>' if Type == 'img' else f"></{Type}>"
|
||||||
Attached += f"""{Opening} style="max-width:100%;max-height:100vh;" src="data:{Link['type']};base64,{base64.b64encode(Data).decode()}"{Closing}\n"""
|
Attached += f"""{Opening} style="max-width:100%;max-height:100vh;" src="data:{Link['type']};base64,{base64.b64encode(Data).decode()}"{Closing}\n"""
|
||||||
File = MIMEBase(Link['type'].split('/')[0], Link['type'].split('/')[1])
|
|
||||||
File.set_payload(Data)
|
if MailSend:
|
||||||
encoders.encode_base64(File)
|
File = MIMEBase(Link['type'].split('/')[0], Link['type'].split('/')[1])
|
||||||
File.add_header(
|
File.set_payload(Data)
|
||||||
"Content-Disposition",
|
encoders.encode_base64(File)
|
||||||
f"attachment; filename= {Link['href'].split('/')[-1]}",
|
File.add_header(
|
||||||
)
|
"Content-Disposition",
|
||||||
Message.attach(File)
|
f"attachment; filename= {Link['href'].split('/')[-1]}",
|
||||||
|
)
|
||||||
|
Message.attach(File)
|
||||||
|
|
||||||
if MailSend:
|
if MailSend:
|
||||||
with smtplib.SMTP_SSL(Server, Port, context=ssl.create_default_context()) as Client:
|
with smtplib.SMTP_SSL(Server, Port, context=ssl.create_default_context()) as Client:
|
||||||
|
@ -94,18 +134,18 @@ def Main():
|
||||||
Client.sendmail(Sender, Receivers, Message.as_string())
|
Client.sendmail(Sender, Receivers, Message.as_string())
|
||||||
|
|
||||||
if LocalSave:
|
if LocalSave:
|
||||||
LocalBackupDir = MakePathStr(Entry['title_detail']['base'].lstrip('https://').lstrip('http://'))
|
LocalBackupDir = MakePathStr(Usertag)
|
||||||
if not os.path.isdir(LocalBackupDir):
|
if not os.path.isdir(LocalBackupDir):
|
||||||
os.mkdir(LocalBackupDir)
|
os.mkdir(LocalBackupDir)
|
||||||
FileName = MakePathStr(f"{Entry['id'].split('/')[-1]} - {Entry['title']}")
|
FileName = MakePathStr(f"{GlobalId.split('/')[-1]} - {Title}")
|
||||||
with open(f'{LocalBackupDir}/{FileName}.html', 'w') as File:
|
with open(f'{LocalBackupDir}/{FileName}.html', 'w') as File:
|
||||||
File.write(HTML.replace('{ Replace:Attached }', Attached))
|
File.write(HTML.replace('{ Replace:Attached }', Attached))
|
||||||
|
|
||||||
with open('MastodonFeedToHTML.db', 'a') as Db:
|
with open('MastodonFeedToHTML.db', 'a') as Db:
|
||||||
Db.write(Entry['id'] + '\n')
|
pass #Db.write(GlobalId + '\n')
|
||||||
|
|
||||||
except Exception:
|
except Exception:
|
||||||
raise
|
raise
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
Main()
|
Main()
|
||||||
|
|
Loading…
Reference in New Issue