Handle items older than last 20; TODO: Fix order

octo@swiss 2022-08-22 21:12:05 +02:00
parent ad45d852ee
commit 491b99f339
3 changed files with 118 additions and 96 deletions
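
Summary of the change: scraping of a single profile page moves from HandleFeed into the new HandleURL, which also picks up the page's "load-more" link (a '?max_id=' URL pointing at older posts) and reports whether the page's oldest entry was still unseen. HandleFeed keeps following those links while that is the case, so posts older than the ~20 shown on the first page are now picked up; as the TODO in the code notes, pages are still processed newest-page-first, so ordering across pages remains wrong for now.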

.gitignore (vendored): 1 addition

@@ -1,2 +1,3 @@
 *.pyc
 *.db
+Config.py
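The newly ignored Config.py presumably holds the per-user settings the script reads but never defines itself (Feeds, LoopTime, AppName, UserAgent, and the Mail* credentials), so keeping it out of version control is the point of this change.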

MastodonFeedHTML.py

@@ -1,6 +1,5 @@
 #!/usr/bin/env python3
 import base64
-import feedparser
 import os
 import time
 import email, smtplib, ssl
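(The hunk counts show exactly one import dropped; feedparser is the only one unused by the BeautifulSoup-based scraping below, so it is presumably the removed line.)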
@@ -22,49 +21,77 @@ def MakePathStr(Str):
     Str = Str.replace(' ', '_')
     return Str
 
+def HandleFeedsList(List):
+    for Feed in List:
+        print(f"[I] Handling Feed ->\n: {Feed}")
+        HandleFeed(
+            URLs=SureList(Feed['URLs']),
+            IncludeRetoots=Feed['IncludeRetoots'] if 'IncludeRetoots' in Feed else True,
+            IncludeReplies=Feed['IncludeReplies'] if 'IncludeReplies' in Feed else True,
+            LocalSave=Feed['LocalSave'] if 'LocalSave' in Feed else True,
+            SendMail=Feed['SendMail'] if 'SendMail' in Feed else True if 'To' in Feed and Feed['To'] else False,
+            MailTo=SureList(Feed['MailTo']) if 'MailTo' in Feed and Feed['MailTo'] else [])
+
 def HandleFeed(URLs, IncludeRetoots, IncludeReplies, LocalSave, SendMail, MailTo):
     for URL in URLs:
         if not (LocalSave or SendMail):
             print("[I] Partial dry-run for this URL (LocalSave and SendMail are disabled).")
         URL = URL.removesuffix('/').removesuffix('/with_replies') + '/with_replies'
         Usertag = f"{URL.split('/')[-2]}@{URL.split('/')[-3]}"
-        try:
-            Response = urlopen(Request(URL, headers={'User-Agent':UserAgent}))
-            Data = Response.read()
-            Soup = BeautifulSoup(Data, 'html.parser')
-            Feed = Soup.find_all('div', class_='entry')
-            Feed.reverse() # Order from oldest to newest
-            for Entry in Feed:
-                Attached = ''
-                GlobalId = Entry.find('a', class_='u-url')
-                if GlobalId:
-                    GlobalId = GlobalId['href'].removeprefix('https://').removeprefix('http://')
-                else:
-                    continue
-                LocalId = GlobalId.split('/')[-1]
-                Username = Entry.find('a', class_='status__display-name').get_text().strip()
-                Content = Entry.find('div', class_='e-content')
-                StatusPrepend = Entry.find('div', class_='status__prepend')
-                StatusPrepend = StatusPrepend.get_text().strip()[len(Username):] if StatusPrepend else ''
-                StatusPrepend = ' ' + StatusPrepend.strip() if StatusPrepend else ''
-                if not IncludeRetoots and StatusPrepend:
-                    continue
-                if not StatusPrepend and IncludeReplies and Entry.find('i', class_='fa-reply-all'):
-                    StatusPrepend = ' replied'
-                Title = Content.get_text().strip()
-                Title = f"{Usertag}{StatusPrepend}: {Title[:32]}..."
-                for Emoji in Entry.find_all('img', class_='custom-emoji'): # Custom emojis in text
-                    Emoji['style'] = 'max-height:1em;'
-                Entry.find('img', class_='u-photo account__avatar')['style'] = 'max-height:4em;' # Profile pics
-                Entry.find('div', class_='status__action-bar').replace_with('')
-                if os.path.isfile(f'{AppName}.db'):
-                    with open(f'{AppName}.db', 'r') as Db:
-                        if f'{Usertag} {GlobalId}' in Db.read().splitlines():
-                            continue
-                print(f"-> {LocalId} - {Title}")
-                HTML = f"""\
+        LastEntryIsNew, PageOlder = HandleURL(URL, Usertag, IncludeRetoots, IncludeReplies, LocalSave, SendMail, MailTo)
+        while LastEntryIsNew and PageOlder: # TODO: Fix this, make a single run for all items / start from oldest, otherwise order is messed up
+            LastEntryIsNew, PageOlder = HandleURL(PageOlder, Usertag, IncludeRetoots, IncludeReplies, LocalSave, SendMail, MailTo)
+
+def HandleURL(URL, Usertag, IncludeRetoots, IncludeReplies, LocalSave, SendMail, MailTo):
+    LastEntryIsNew = False
+    PageOlder = ''
+    try:
+        Response = urlopen(Request(URL, headers={'User-Agent':UserAgent}))
+        Data = Response.read()
+        Soup = BeautifulSoup(Data, 'html.parser')
+        Feed = Soup.find_all('div', class_='entry')
+        Feed.reverse() # Order from oldest to newest
+        Index = 0
+        for Entry in Feed:
+            Attached = ''
+            Anchor = Entry.find('a', class_='u-url')
+            if Anchor:
+                GlobalId = Anchor['href'].removeprefix('https://').removeprefix('http://')
+                Index += 1
+            else:
+                Anchor = Entry.find('a', class_='load-more')
+                if Anchor:
+                    if '?max_id=' in Anchor['href']:
+                        PageOlder = Anchor['href']
+                continue
+            if os.path.isfile(f'{AppName}.db'):
+                with open(f'{AppName}.db', 'r') as Db:
+                    if f'{Usertag} {GlobalId}' in Db.read().splitlines():
+                        continue
+            if Index == 1:
+                LastEntryIsNew = True
+            LocalId = GlobalId.split('/')[-1]
+            Username = Entry.find('a', class_='status__display-name').get_text().strip()
+            Content = Entry.find('div', class_='e-content')
+            StatusPrepend = Entry.find('div', class_='status__prepend')
+            StatusPrepend = StatusPrepend.get_text().strip()[len(Username):] if StatusPrepend else ''
+            StatusPrepend = ' ' + StatusPrepend.strip() if StatusPrepend else ''
+            if not IncludeRetoots and StatusPrepend:
+                continue
+            if not StatusPrepend and IncludeReplies and Entry.find('i', class_='fa-reply-all'):
+                StatusPrepend = ' replied'
+            Title = Content.get_text().strip()
+            Title = f"{Usertag}{StatusPrepend}: {Title[:32]}..."
+            for Emoji in Entry.find_all('img', class_='custom-emoji'): # Custom emojis in text
+                Emoji['style'] = 'max-height:1em;'
+            Entry.find('img', class_='u-photo account__avatar')['style'] = 'max-height:4em;' # Profile pics
+            Entry.find('div', class_='status__action-bar').replace_with('')
+            print(f"-> {LocalId} - {Title}")
+            HTML = f"""\
 <h1>{Title}</h1>
 <div class="AppName-content" style="word-wrap:break-word;">
@@ -77,76 +104,67 @@ def HandleFeed(URLs, IncludeRetoots, IncludeReplies, LocalSave, SendMail, MailTo
 <p><i>Via <a href="https://gitlab.com/octospacc/bottocto/-/tree/main/MastodonFeedHTML">https://gitlab.com/octospacc/bottocto/-/tree/main/MastodonFeedHTML</a></i></p>
 """
-                if SendMail:
-                    Message = MIMEMultipart()
-                    Message['From'] = MailUsername
-                    Message['To'] = ', '.join(MailTo)
-                    Message['Subject'] = Title
-                    Message.attach(MIMEText(HTML.replace('{ Replace:Attached }', ''), 'html'))
-                Attachments = Entry.find('ul', class_='attachment-list__list')
-                if Attachments:
-                    for Attachment in Attachments:
-                        Href, Alt = '', ''
-                        Attachment = str(Attachment).strip().replace("'",'"').split('"')
-                        for i,e in enumerate(Attachment):
-                            if e.endswith('<a href='):
-                                Href = Attachment[i+1]
-                            elif e.endswith('title='):
-                                Alt = "'".join(Attachment[i+1:-1])
-                        if Href:
-                            Response = urlopen(Request(Href, headers={'User-Agent':UserAgent}))
-                            Data = Response.read()
-                            Mime = Response.info().get_content_type()
-                            if LocalSave:
-                                Tag = 'img' if Mime.split('/')[0] == 'image' else Mime.split('/')[0]
-                                Opening = f'<{Tag} alt="{Alt}" title="{Alt}"' if Tag == 'img' else f'<{Tag} controls'
-                                Closing = '>' if Tag == 'img' else f"></{Tag}>"
-                                Attached += f"""{Opening} style="max-width:100%; max-height:100vh;" src="data:{Mime};base64,{base64.b64encode(Data).decode()}"{Closing}\n"""
-                            if SendMail:
-                                File = MIMEBase(Mime.split('/')[0], Mime.split('/')[1])
-                                File.set_payload(Data)
-                                encoders.encode_base64(File)
-                                File.add_header(
-                                    "Content-Disposition",
-                                    f"attachment; filename={Href.split('/')[-1]}")
-                                Message.attach(File)
-                if SendMail:
-                    with smtplib.SMTP_SSL(MailServer, MailPort, context=ssl.create_default_context()) as Client:
-                        Client.login(MailUsername, MailPassword)
-                        Client.sendmail(MailUsername, MailTo, Message.as_string())
-                        time.sleep(MailSleep)
-                if LocalSave:
-                    LocalBackupDir = MakePathStr(Usertag)
-                    if not os.path.isdir(LocalBackupDir):
-                        os.mkdir(LocalBackupDir)
-                    FileName = MakePathStr(f"{GlobalId.split('/')[-1]} - {Title}")
-                    with open(f'{LocalBackupDir}/{FileName}.html', 'w') as File:
-                        File.write(HTML.replace('{ Replace:Attached }', Attached))
-                with open(f'{AppName}.db', 'a') as Db:
-                    Db.write(f'{Usertag} {GlobalId}' + '\n')
-        except Exception:
-            raise
-
-def HandleFeedList(List):
-    for Feed in List:
-        print(f"[I] Handling Feed ->\n: {Feed}")
-        HandleFeed(
-            URLs=SureList(Feed['URLs']),
-            IncludeRetoots=Feed['IncludeRetoots'] if 'IncludeRetoots' in Feed else True,
-            IncludeReplies=Feed['IncludeReplies'] if 'IncludeReplies' in Feed else True,
-            LocalSave=Feed['LocalSave'] if 'LocalSave' in Feed else True,
-            SendMail=Feed['SendMail'] if 'SendMail' in Feed else True if 'To' in Feed and Feed['To'] else False,
-            MailTo=SureList(Feed['MailTo']) if 'MailTo' in Feed and Feed['MailTo'] else [])
+            if SendMail:
+                Message = MIMEMultipart()
+                Message['From'] = MailUsername
+                Message['To'] = ', '.join(MailTo)
+                Message['Subject'] = Title
+                Message.attach(MIMEText(HTML.replace('{ Replace:Attached }', ''), 'html'))
+            Attachments = Entry.find('ul', class_='attachment-list__list')
+            if Attachments and (LocalSave or SendMail):
+                for Attachment in Attachments:
+                    Href, Alt = '', ''
+                    Attachment = str(Attachment).strip().replace("'",'"').split('"')
+                    for i,e in enumerate(Attachment):
+                        if e.endswith('<a href='):
+                            Href = Attachment[i+1]
+                        elif e.endswith('title='):
+                            Alt = "'".join(Attachment[i+1:-1])
+                    if Href:
+                        Response = urlopen(Request(Href, headers={'User-Agent':UserAgent}))
+                        Data = Response.read()
+                        Mime = Response.info().get_content_type()
+                        if LocalSave:
+                            Tag = 'img' if Mime.split('/')[0] == 'image' else Mime.split('/')[0]
+                            Opening = f'<{Tag} alt="{Alt}" title="{Alt}"' if Tag == 'img' else f'<{Tag} controls'
+                            Closing = '>' if Tag == 'img' else f"></{Tag}>"
+                            Attached += f"""{Opening} style="max-width:100%; max-height:100vh;" src="data:{Mime};base64,{base64.b64encode(Data).decode()}"{Closing}\n"""
+                        if SendMail:
+                            File = MIMEBase(Mime.split('/')[0], Mime.split('/')[1])
+                            File.set_payload(Data)
+                            encoders.encode_base64(File)
+                            File.add_header(
+                                "Content-Disposition",
+                                f"attachment; filename={Href.split('/')[-1]}")
+                            Message.attach(File)
+            if SendMail:
+                with smtplib.SMTP_SSL(MailServer, MailPort, context=ssl.create_default_context()) as Client:
+                    Client.login(MailUsername, MailPassword)
+                    Client.sendmail(MailUsername, MailTo, Message.as_string())
+                    time.sleep(MailSleep)
+            if LocalSave:
+                LocalBackupDir = MakePathStr(Usertag)
+                if not os.path.isdir(LocalBackupDir):
+                    os.mkdir(LocalBackupDir)
+                FileName = MakePathStr(f"{GlobalId.split('/')[-1]} - {Title}")
+                with open(f'{LocalBackupDir}/{FileName}.html', 'w') as File:
+                    File.write(HTML.replace('{ Replace:Attached }', Attached))
+            with open(f'{AppName}.db', 'a') as Db:
+                Db.write(f'{Usertag} {GlobalId}' + '\n')
+    except Exception:
+        raise
+    return LastEntryIsNew, PageOlder
 
 if __name__ == '__main__':
     while True:
         print("[I] Scraping...")
-        HandleFeedList(Feeds)
+        HandleFeedsList(Feeds)
         if LoopTime <= 0:
             exit()
         print(f"[I] Sleeping for {LoopTime}s...")

New file: shell launcher (filename not shown in this view)

@@ -0,0 +1,3 @@
+#!/bin/sh
+cd "$( dirname "$( realpath "$0" )" )"
+./MastodonFeedHTML.py
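
The cd into the script's own directory lets this launcher be invoked from any working directory (from cron, for instance) while MastodonFeedHTML.py still finds Config.py, its .db state file, and the per-user backup folders, all of which it opens via relative paths.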