Handle items older than last 20; TODO: Fix order

This commit is contained in:
octo@swiss 2022-08-22 21:12:05 +02:00
parent ad45d852ee
commit 491b99f339
3 changed files with 118 additions and 96 deletions

1
.gitignore vendored
View File

@@ -1,2 +1,3 @@
*.pyc *.pyc
*.db
Config.py Config.py

View File

@@ -1,6 +1,5 @@
#!/usr/bin/env python3 #!/usr/bin/env python3
import base64 import base64
import feedparser
import os import os
import time import time
import email, smtplib, ssl import email, smtplib, ssl
@@ -22,49 +21,77 @@ def MakePathStr(Str):
Str = Str.replace(' ', '_') Str = Str.replace(' ', '_')
return Str return Str
def HandleFeedsList(List):
for Feed in List:
print(f"[I] Handling Feed ->\n: {Feed}")
HandleFeed(
URLs=SureList(Feed['URLs']),
IncludeRetoots=Feed['IncludeRetoots'] if 'IncludeRetoots' in Feed else True,
IncludeReplies=Feed['IncludeReplies'] if 'IncludeReplies' in Feed else True,
LocalSave=Feed['LocalSave'] if 'LocalSave' in Feed else True,
SendMail=Feed['SendMail'] if 'SendMail' in Feed else True if 'To' in Feed and Feed['To'] else False,
MailTo=SureList(Feed['MailTo']) if 'MailTo' in Feed and Feed['MailTo'] else [])
def HandleFeed(URLs, IncludeRetoots, IncludeReplies, LocalSave, SendMail, MailTo): def HandleFeed(URLs, IncludeRetoots, IncludeReplies, LocalSave, SendMail, MailTo):
for URL in URLs: for URL in URLs:
if not (LocalSave or SendMail):
print("[I] Partial dry-run for this URL (LocalSave and SendMail are disabled).")
URL = URL.removesuffix('/').removesuffix('/with_replies') + '/with_replies' URL = URL.removesuffix('/').removesuffix('/with_replies') + '/with_replies'
Usertag = f"{URL.split('/')[-2]}@{URL.split('/')[-3]}" Usertag = f"{URL.split('/')[-2]}@{URL.split('/')[-3]}"
try: LastEntryIsNew, PageOlder = HandleURL(URL, Usertag, IncludeRetoots, IncludeReplies, LocalSave, SendMail, MailTo)
Response = urlopen(Request(URL, headers={'User-Agent':UserAgent})) while LastEntryIsNew and PageOlder: # TODO: Fix this, make a single run for all items / start from oldest, otherwise order is messed up
Data = Response.read() LastEntryIsNew, PageOlder = HandleURL(PageOlder, Usertag, IncludeRetoots, IncludeReplies, LocalSave, SendMail, MailTo)
Soup = BeautifulSoup(Data, 'html.parser')
Feed = Soup.find_all('div', class_='entry')
Feed.reverse() # Order from oldest to newest
for Entry in Feed: def HandleURL(URL, Usertag, IncludeRetoots, IncludeReplies, LocalSave, SendMail, MailTo):
Attached = '' LastEntryIsNew = False
GlobalId = Entry.find('a', class_='u-url') PageOlder = ''
if GlobalId: try:
GlobalId = GlobalId['href'].removeprefix('https://').removeprefix('http://') Response = urlopen(Request(URL, headers={'User-Agent':UserAgent}))
else: Data = Response.read()
continue Soup = BeautifulSoup(Data, 'html.parser')
Feed = Soup.find_all('div', class_='entry')
Feed.reverse() # Order from oldest to newest
if os.path.isfile(f'{AppName}.db'): Index = 0
with open(f'{AppName}.db', 'r') as Db: for Entry in Feed:
if f'{Usertag} {GlobalId}' in Db.read().splitlines(): Attached = ''
continue Anchor = Entry.find('a', class_='u-url')
if Anchor:
GlobalId = Anchor['href'].removeprefix('https://').removeprefix('http://')
Index += 1
else:
Anchor = Entry.find('a', class_='load-more')
if Anchor:
if '?max_id=' in Anchor['href']:
PageOlder = Anchor['href']
continue
LocalId = GlobalId.split('/')[-1] if os.path.isfile(f'{AppName}.db'):
Username = Entry.find('a', class_='status__display-name').get_text().strip() with open(f'{AppName}.db', 'r') as Db:
Content = Entry.find('div', class_='e-content') if f'{Usertag} {GlobalId}' in Db.read().splitlines():
StatusPrepend = Entry.find('div', class_='status__prepend') continue
StatusPrepend = StatusPrepend.get_text().strip()[len(Username):] if StatusPrepend else ''
StatusPrepend = ' ' + StatusPrepend.strip() if StatusPrepend else ''
if not IncludeRetoots and StatusPrepend:
continue
if not StatusPrepend and IncludeReplies and Entry.find('i', class_='fa-reply-all'):
StatusPrepend = ' replied'
Title = Content.get_text().strip()
Title = f"{Usertag}{StatusPrepend}: {Title[:32]}..."
for Emoji in Entry.find_all('img', class_='custom-emoji'): # Custom emojis in text
Emoji['style'] = 'max-height:1em;'
Entry.find('img', class_='u-photo account__avatar')['style'] = 'max-height:4em;' # Profile pics
Entry.find('div', class_='status__action-bar').replace_with('')
print(f"-> {LocalId} - {Title}") if Index == 1:
HTML = f"""\ LastEntryIsNew = True
LocalId = GlobalId.split('/')[-1]
Username = Entry.find('a', class_='status__display-name').get_text().strip()
Content = Entry.find('div', class_='e-content')
StatusPrepend = Entry.find('div', class_='status__prepend')
StatusPrepend = StatusPrepend.get_text().strip()[len(Username):] if StatusPrepend else ''
StatusPrepend = ' ' + StatusPrepend.strip() if StatusPrepend else ''
if not IncludeRetoots and StatusPrepend:
continue
if not StatusPrepend and IncludeReplies and Entry.find('i', class_='fa-reply-all'):
StatusPrepend = ' replied'
Title = Content.get_text().strip()
Title = f"{Usertag}{StatusPrepend}: {Title[:32]}..."
for Emoji in Entry.find_all('img', class_='custom-emoji'): # Custom emojis in text
Emoji['style'] = 'max-height:1em;'
Entry.find('img', class_='u-photo account__avatar')['style'] = 'max-height:4em;' # Profile pics
Entry.find('div', class_='status__action-bar').replace_with('')
print(f"-> {LocalId} - {Title}")
HTML = f"""\
<h1>{Title}</h1> <h1>{Title}</h1>
<div class="AppName-content" style="word-wrap:break-word;"> <div class="AppName-content" style="word-wrap:break-word;">
@@ -77,76 +104,67 @@ def HandleFeed(URLs, IncludeRetoots, IncludeReplies, LocalSave, SendMail, MailTo
<p><i>Via <a href="https://gitlab.com/octospacc/bottocto/-/tree/main/MastodonFeedHTML">https://gitlab.com/octospacc/bottocto/-/tree/main/MastodonFeedHTML</a></i></p> <p><i>Via <a href="https://gitlab.com/octospacc/bottocto/-/tree/main/MastodonFeedHTML">https://gitlab.com/octospacc/bottocto/-/tree/main/MastodonFeedHTML</a></i></p>
""" """
if SendMail: if SendMail:
Message = MIMEMultipart() Message = MIMEMultipart()
Message['From'] = MailUsername Message['From'] = MailUsername
Message['To'] = ', '.join(MailTo) Message['To'] = ', '.join(MailTo)
Message['Subject'] = Title Message['Subject'] = Title
Message.attach(MIMEText(HTML.replace('{ Replace:Attached }', ''), 'html')) Message.attach(MIMEText(HTML.replace('{ Replace:Attached }', ''), 'html'))
Attachments = Entry.find('ul', class_='attachment-list__list') Attachments = Entry.find('ul', class_='attachment-list__list')
if Attachments: if Attachments and (LocalSave or SendMail):
for Attachment in Attachments: for Attachment in Attachments:
Href, Alt = '', '' Href, Alt = '', ''
Attachment = str(Attachment).strip().replace("'",'"').split('"') Attachment = str(Attachment).strip().replace("'",'"').split('"')
for i,e in enumerate(Attachment): for i,e in enumerate(Attachment):
if e.endswith('<a href='): if e.endswith('<a href='):
Href = Attachment[i+1] Href = Attachment[i+1]
elif e.endswith('title='): elif e.endswith('title='):
Alt = "'".join(Attachment[i+1:-1]) Alt = "'".join(Attachment[i+1:-1])
if Href: if Href:
Response = urlopen(Request(Href, headers={'User-Agent':UserAgent})) Response = urlopen(Request(Href, headers={'User-Agent':UserAgent}))
Data = Response.read() Data = Response.read()
Mime = Response.info().get_content_type() Mime = Response.info().get_content_type()
if LocalSave: if LocalSave:
Tag = 'img' if Mime.split('/')[0] == 'image' else Mime.split('/')[0] Tag = 'img' if Mime.split('/')[0] == 'image' else Mime.split('/')[0]
Opening = f'<{Tag} alt="{Alt}" title="{Alt}"' if Tag == 'img' else f'<{Tag} controls' Opening = f'<{Tag} alt="{Alt}" title="{Alt}"' if Tag == 'img' else f'<{Tag} controls'
Closing = '>' if Tag == 'img' else f"></{Tag}>" Closing = '>' if Tag == 'img' else f"></{Tag}>"
Attached += f"""{Opening} style="max-width:100%; max-height:100vh;" src="data:{Mime};base64,{base64.b64encode(Data).decode()}"{Closing}\n""" Attached += f"""{Opening} style="max-width:100%; max-height:100vh;" src="data:{Mime};base64,{base64.b64encode(Data).decode()}"{Closing}\n"""
if SendMail: if SendMail:
File = MIMEBase(Mime.split('/')[0], Mime.split('/')[1]) File = MIMEBase(Mime.split('/')[0], Mime.split('/')[1])
File.set_payload(Data) File.set_payload(Data)
encoders.encode_base64(File) encoders.encode_base64(File)
File.add_header( File.add_header(
"Content-Disposition", "Content-Disposition",
f"attachment; filename={Href.split('/')[-1]}") f"attachment; filename={Href.split('/')[-1]}")
Message.attach(File) Message.attach(File)
if SendMail: if SendMail:
with smtplib.SMTP_SSL(MailServer, MailPort, context=ssl.create_default_context()) as Client: with smtplib.SMTP_SSL(MailServer, MailPort, context=ssl.create_default_context()) as Client:
Client.login(MailUsername, MailPassword) Client.login(MailUsername, MailPassword)
Client.sendmail(MailUsername, MailTo, Message.as_string()) Client.sendmail(MailUsername, MailTo, Message.as_string())
time.sleep(MailSleep) time.sleep(MailSleep)
if LocalSave: if LocalSave:
LocalBackupDir = MakePathStr(Usertag) LocalBackupDir = MakePathStr(Usertag)
if not os.path.isdir(LocalBackupDir): if not os.path.isdir(LocalBackupDir):
os.mkdir(LocalBackupDir) os.mkdir(LocalBackupDir)
FileName = MakePathStr(f"{GlobalId.split('/')[-1]} - {Title}") FileName = MakePathStr(f"{GlobalId.split('/')[-1]} - {Title}")
with open(f'{LocalBackupDir}/{FileName}.html', 'w') as File: with open(f'{LocalBackupDir}/{FileName}.html', 'w') as File:
File.write(HTML.replace('{ Replace:Attached }', Attached)) File.write(HTML.replace('{ Replace:Attached }', Attached))
with open(f'{AppName}.db', 'a') as Db: with open(f'{AppName}.db', 'a') as Db:
Db.write(f'{Usertag} {GlobalId}' + '\n') Db.write(f'{Usertag} {GlobalId}' + '\n')
except Exception: return LastEntryIsNew, PageOlder
raise
def HandleFeedList(List): except Exception:
for Feed in List: raise
print(f"[I] Handling Feed ->\n: {Feed}")
HandleFeed(
URLs=SureList(Feed['URLs']),
IncludeRetoots=Feed['IncludeRetoots'] if 'IncludeRetoots' in Feed else True,
IncludeReplies=Feed['IncludeReplies'] if 'IncludeReplies' in Feed else True,
LocalSave=Feed['LocalSave'] if 'LocalSave' in Feed else True,
SendMail=Feed['SendMail'] if 'SendMail' in Feed else True if 'To' in Feed and Feed['To'] else False,
MailTo=SureList(Feed['MailTo']) if 'MailTo' in Feed and Feed['MailTo'] else [])
if __name__ == '__main__': if __name__ == '__main__':
while True: while True:
print("[I] Scraping...") print("[I] Scraping...")
HandleFeedList(Feeds) HandleFeedsList(Feeds)
if LoopTime <= 0: if LoopTime <= 0:
exit() exit()
print(f"[I] Sleeping for {LoopTime}s...") print(f"[I] Sleeping for {LoopTime}s...")

View File

@@ -0,0 +1,3 @@
#!/bin/sh
cd "$( dirname "$( realpath "$0" )" )"
./MastodonFeedHTML.py