Parse HTML instead of RSS (to get more content)

2025-06-05 22:19:22 +02:00 · 2022-08-15 22:32:34 +00:00
parent 1bfd002a03
commit 60d5b90866
1 changed file with 87 additions and 47 deletions
--- a/MastodonFeedHTML.py
+++ b/MastodonFeedHTML.py
@@ -4,6 +4,7 @@ import feedparser
 import os
 import urllib.request
 import email, smtplib, ssl
+from bs4 import BeautifulSoup
 from email import encoders
 from email.mime.base import MIMEBase
 from email.mime.multipart import MIMEMultipart
@@ -18,60 +19,97 @@ Password = "Example"
 Server = "smtp.example.com"
 Port = 465

+OnlyOwnPosts = False
+MailSend = False
 LocalSave = True
-MailSend = True
+NoSpacesFile = False


+StripWS = '\t\r\n'
+
 def MakePathStr(Str):
-	for c in ('<>:"/\\|?* '):
+	for c in ('<>:"/\\|?*'):
 		Str = Str.replace(c, '_')
+	if NoSpacesFile:
+		Str = Str.replace(' ', '_')
 	return Str

 def Main():
-	Feeds = [feedparser.parse(URL)['entries'] for URL in URLs]
-	for Feed in Feeds:
-		Feed.reverse() # Order from oldest to newest
-		for Entry in Feed:
-			if os.path.isfile('MastodonFeedToHTML.db'):
-				with open('MastodonFeedToHTML.db', 'r') as Db:
-					if Entry['id'] in Db.read().splitlines():
-						continue
+	for URL in URLs:
+		URL = URL.rstrip('/').rstrip('/with_replies') + '/with_replies'
+		Usertag = f"{URL.split('/')[-2]}@{URL.split('/')[-3]}"
+		try:
+			Response = urllib.request.urlopen(URL)
+			Data = Response.read()
+			Soup = BeautifulSoup(Data, 'html.parser')
+			Feed = Soup.find_all('div', class_='entry')
+			Feed.reverse() # Order from oldest to newest

-			try:
-				print(f"{Entry['id']} - {Entry['title']}")
+			for Entry in Feed:
 				Attached = ''
-				HTML = f"""\
-<h1>{Entry['title']}</h1>
+				GlobalId = Entry.find('a', class_='u-url')['href'].lstrip('https://').lstrip('http://')
+				LocalId = GlobalId.split('/')[-1]
+				Username = Entry.find('a', class_='status__display-name').get_text().strip(StripWS)
+				Content = Entry.find('div', class_='e-content')
+				StatusPrepend = Entry.find('div', class_='status__prepend')
+				StatusPrepend = StatusPrepend.get_text().strip(StripWS)[len(Username):] if StatusPrepend else ''
+				StatusPrepend = ' ' + StatusPrepend.strip(StripWS) if StatusPrepend else ''
+				if OnlyOwnPosts and StatusPrepend:
+					continue
+				Title = Content.get_text().strip(StripWS)
+				Title = f"{Usertag}{StatusPrepend}: {Title[:32]}..."
+				for Emoji in Entry.find_all('img', class_='custom-emoji'):
+					Emoji['style'] = 'max-height:1em;'
+				Attachments = Entry.find('ul', class_='attachment-list__list')
+				if Attachments:
+					for Attachment in Attachments:
+						Href, Alt = '', ''
+						Attachment = str(Attachment).strip(StripWS).replace("'",'"').split('"')
+						for i,e in enumerate(Attachment):
+							if e.endswith('<a href='):
+								Href = Attachment[i+1]
+							elif e.endswith('title='):
+								Alt = "'".join(Attachment[i+1:-1])
+						if Href:
+							Response = urllib.request.urlopen(Href)
+							Data = Response.read()
+							Type = 'img' if Href.lower().endswith(('.png','.jpg','.jpeg')) else 'img'
+							Mime = f"image/{Href.lower().split('.')[-1]}"
+							Opening = f'<{Type} alt="{Alt}" title="{Alt}"' if Type == 'img' else f'<{Type} controls'
+							Closing = '>' if Type == 'img' else f"></{Type}>"
+							Attached += f"""{Opening} style="max-width:100%;max-height:100vh;" src="data:{Mime};base64,{base64.b64encode(Data).decode()}"{Closing}\n"""
+				Entry.find('img', class_='u-photo account__avatar')['style'] = 'max-height:2em;'#'display:none; visibility:hidden;'
+				Entry.find('div', class_='status__action-bar').replace_with('')

-<div id="content">
-{Entry['summary']}
+				if os.path.isfile('MastodonFeedToHTML.db'):
+					with open('MastodonFeedToHTML.db', 'r') as Db:
+						if GlobalId in Db.read().splitlines():
+							pass #continue
+
+				print(f"-> {LocalId} - {Title}")
+				HTML = f"""\
+<h1>{Title}</h1>
+
+<div>
+{Entry}

 {{ Replace:Attached }}
 </div>

 <br><hr><br>

-<p>Published on {Entry['published']}</p>
-<p>From <a href="{Entry['link']}">{Entry['link']}</a></p>
-
-<br>
-
-<h3>JSON dump</h3>
-<div style="overflow-x:scroll;">
-<xmp>
-{Entry}
-</xmp>
-</div>
-
 <p><i>Via <a href="https://gitlab.com/-/snippets/2388397">https://gitlab.com/-/snippets/2388397</a></i></p>
 """
-				Message = MIMEMultipart()
-				Message['From'] = Sender
-				Message['To'] = ', '.join(Receivers)
-				Message['Subject'] = Entry['title']
-				Message.attach(MIMEText(HTML.replace('{ Replace:Attached }', ''), 'html'))
+				#print(HTML)

-				for Link in Entry['links']:
+				if MailSend:
+					Message = MIMEMultipart()
+					Message['From'] = Sender
+					Message['To'] = ', '.join(Receivers)
+					Message['Subject'] = Entry['title']
+					Message.attach(MIMEText(HTML.replace('{ Replace:Attached }', ''), 'html'))
+
+				for Link in []:#Entry['links']:
 					if Link['type'].startswith(('audio/', 'image/', 'video/')):
 						Response = urllib.request.urlopen(Link['href'])
 						Data = Response.read()
@@ -79,14 +117,16 @@ def Main():
 						Opening = f"<{Type}" if Type == 'img' else f"<{Type} controls"
 						Closing = '>' if Type == 'img' else f"></{Type}>"
 						Attached += f"""{Opening} style="max-width:100%;max-height:100vh;" src="data:{Link['type']};base64,{base64.b64encode(Data).decode()}"{Closing}\n"""
-						File = MIMEBase(Link['type'].split('/')[0], Link['type'].split('/')[1])
-						File.set_payload(Data)
-						encoders.encode_base64(File)
-						File.add_header(
-							"Content-Disposition",
-							f"attachment; filename= {Link['href'].split('/')[-1]}",
-						)
-						Message.attach(File)
+
+						if MailSend:
+							File = MIMEBase(Link['type'].split('/')[0], Link['type'].split('/')[1])
+							File.set_payload(Data)
+							encoders.encode_base64(File)
+							File.add_header(
+								"Content-Disposition",
+								f"attachment; filename= {Link['href'].split('/')[-1]}",
+							)
+							Message.attach(File)

 				if MailSend:
 					with smtplib.SMTP_SSL(Server, Port, context=ssl.create_default_context()) as Client:
@@ -94,18 +134,18 @@ def Main():
 						Client.sendmail(Sender, Receivers, Message.as_string())

 				if LocalSave:
-					LocalBackupDir = MakePathStr(Entry['title_detail']['base'].lstrip('https://').lstrip('http://'))
+					LocalBackupDir = MakePathStr(Usertag)
 					if not os.path.isdir(LocalBackupDir):
 						os.mkdir(LocalBackupDir)
-					FileName = MakePathStr(f"{Entry['id'].split('/')[-1]} - {Entry['title']}")
+					FileName = MakePathStr(f"{GlobalId.split('/')[-1]} - {Title}")
 					with open(f'{LocalBackupDir}/{FileName}.html', 'w') as File:
 						File.write(HTML.replace('{ Replace:Attached }', Attached))

 				with open('MastodonFeedToHTML.db', 'a') as Db:
-					Db.write(Entry['id'] + '\n')
+					pass #Db.write(GlobalId + '\n')

-			except Exception:
-				raise
+		except Exception:
+			raise

 if __name__ == '__main__':
 	Main()