2022-08-16 00:27:27 +02:00
|
|
|
#!/usr/bin/env python3
|
|
|
|
import base64
|
|
|
|
import os
|
2022-08-16 12:50:39 +02:00
|
|
|
import time
|
2022-08-16 00:27:27 +02:00
|
|
|
import email, smtplib, ssl
|
2022-08-16 00:32:34 +02:00
|
|
|
from bs4 import BeautifulSoup
|
2022-08-16 00:27:27 +02:00
|
|
|
from email import encoders
|
|
|
|
from email.mime.base import MIMEBase
|
|
|
|
from email.mime.multipart import MIMEMultipart
|
|
|
|
from email.mime.text import MIMEText
|
2022-08-16 13:23:56 +02:00
|
|
|
from urllib.request import urlopen, Request
|
|
|
|
from Config import *
|
2022-08-16 00:27:27 +02:00
|
|
|
|
2022-08-16 12:50:39 +02:00
|
|
|
def SureList(Item):
	"""Return Item unchanged if it is a list, otherwise wrap it in one.

	Used to let config values be written either as a scalar or as a list.
	Uses isinstance() instead of the original type() == list comparison,
	so list subclasses are also passed through unchanged.
	"""
	return Item if isinstance(Item, list) else [Item]
|
|
|
|
|
2022-08-16 00:27:27 +02:00
|
|
|
def MakePathStr(Str):
	"""Sanitize Str for use as a file or directory name.

	Replaces the characters that are reserved on common filesystems
	(the Windows set <>:"/\\|?*) with underscores; when the SpacesInFiles
	config flag is false, spaces are replaced with underscores too.
	"""
	# Single C-level pass instead of nine chained .replace() calls.
	Str = Str.translate(str.maketrans({c: '_' for c in '<>:"/\\|?*'}))
	if not SpacesInFiles:
		Str = Str.replace(' ', '_')
	return Str
|
|
|
|
|
2022-08-22 23:19:33 +02:00
|
|
|
def SleepPrint(s):
	"""Announce the pause duration on stdout, then block for s seconds."""
	Notice = f"[I] Sleeping for {s}s..."
	print(Notice)
	time.sleep(s)
|
|
|
|
|
2022-08-22 21:12:05 +02:00
|
|
|
def HandleFeedsList(List):
	"""Process every feed definition in List, filling in config defaults.

	Each Feed is a dict; missing keys fall back to permissive defaults
	(retoots and replies included, local save enabled, mail enabled when
	recipients are configured).
	"""
	for Feed in List:
		print(f"[I] Handling Feed ->\n: {Feed}")
		# NOTE(review): the SendMail default checks a 'To' key while the
		# recipient list is read from 'MailTo' — looks inconsistent; kept
		# as-is to preserve behavior, but confirm against Config examples.
		HandleFeed(
			URLs=SureList(Feed['URLs']),
			IncludeRetoots=Feed.get('IncludeRetoots', True),
			IncludeReplies=Feed.get('IncludeReplies', True),
			LocalSave=Feed.get('LocalSave', True),
			SendMail=Feed.get('SendMail', bool(Feed.get('To'))),
			MailTo=SureList(Feed['MailTo']) if Feed.get('MailTo') else [])
|
|
|
|
|
2022-08-16 12:50:39 +02:00
|
|
|
def HandleFeed(URLs, IncludeRetoots, IncludeReplies, LocalSave, SendMail, MailTo):
	"""Scrape each profile URL, walking back through older pages.

	The first pass probes pages newest-to-oldest (IsFirstRun=True) and
	collects the pages that still contain unseen entries, bounded by
	MaxPagesRecursion; those pages are then replayed oldest-first so
	items end up stored/mailed in chronological order.
	"""
	for URL in URLs:
		if not (LocalSave or SendMail):
			print("[I] Partial dry-run for this URL (LocalSave and SendMail are disabled).")
		# Normalize to the '.../with_replies' profile page exactly once.
		URL = URL.removesuffix('/').removesuffix('/with_replies') + '/with_replies'
		Usertag = f"{URL.split('/')[-2]}@{URL.split('/')[-3]}"
		PendingPages = []
		# Probe the newest page first.
		NewOnPage, NextOlder = HandleURL(True, URL, Usertag, IncludeRetoots, IncludeReplies, LocalSave, SendMail, MailTo)
		if NewOnPage and NextOlder:
			PendingPages.append(NextOlder)
		# Keep probing older pages while they still contain unseen entries.
		while NewOnPage and NextOlder and len(PendingPages) < MaxPagesRecursion:
			NewOnPage, NextOlder = HandleURL(True, NextOlder, Usertag, IncludeRetoots, IncludeReplies, LocalSave, SendMail, MailTo)
			if NewOnPage and NextOlder and MaxPagesRecursion:
				PendingPages.append(NextOlder)
		# Replay collected pages oldest-first for in-order processing.
		for Page in reversed(PendingPages):
			HandleURL(False, Page, Usertag, IncludeRetoots, IncludeReplies, LocalSave, SendMail, MailTo)
|
|
|
|
|
|
|
|
def HandleURL(IsFirstRun, URL, Usertag, IncludeRetoots, IncludeReplies, LocalSave, SendMail, MailTo):
	"""Fetch and process a single HTML feed page of a Mastodon profile.

	Parses the server-rendered page (no API), walks the entries from oldest
	to newest, skips entries already recorded in the '<AppName>.db' dedup
	file, and for each new entry optionally sends it by mail and/or saves
	it as a local HTML file (attachments inlined as base64 data URIs).

	IsFirstRun=True is "probe" mode: only checks whether the oldest entry
	of the page is unseen (to decide if older pages must be visited) and
	breaks out without processing items.

	Returns (LastEntryIsNew, PageOlder):
		LastEntryIsNew - True if the oldest entry on the page was not in the db;
		PageOlder - URL of the next older page ('' when no pagination link found).
	"""
	LastEntryIsNew = False
	PageOlder = ''
	try:
		print(f"-> Page: {URL}")
		# UserAgent comes from Config; some instances block the default urllib UA.
		Response = urlopen(Request(URL, headers={'User-Agent':UserAgent}))
		Data = Response.read()
		Soup = BeautifulSoup(Data, 'html.parser')
		Feed = Soup.find_all('div', class_='entry')
		Feed.reverse() # Order from oldest to newest

		Index = 0 # Counts real status entries (ones with a permalink), 1-based
		for Entry in Feed:
			Attached = '' # Accumulates inline HTML for downloaded attachments
			Anchor = Entry.find('a', class_='u-url')
			if Anchor:
				# Permalink without scheme, used as the entry's global id in the db.
				GlobalId = Anchor['href'].removeprefix('https://').removeprefix('http://')
				Index += 1
			else:
				# Not a status entry; it may be the "load more" pagination link.
				Anchor = Entry.find('a', class_='load-more')
				if Anchor:
					if '?max_id=' in Anchor['href']:
						PageOlder = Anchor['href']
				continue

			# Skip entries already handled in a previous run
			# (db holds one "Usertag GlobalId" line per handled entry).
			if os.path.isfile(f'{AppName}.db'):
				with open(f'{AppName}.db', 'r') as Db:
					if f'{Usertag} {GlobalId}' in Db.read().splitlines():
						continue

			# Reaching here with Index == 1 means even the oldest entry on the
			# page is unseen, so older pages might hold more unseen entries.
			if Index == 1:
				LastEntryIsNew = True
				if IsFirstRun:
					# Probe mode: the answer is known, no need to process items.
					break

			LocalId = GlobalId.split('/')[-1]
			Username = Entry.find('a', class_='status__display-name').get_text().strip()
			Content = Entry.find('div', class_='e-content')
			# status__prepend holds the "<name> boosted"-style banner;
			# strip the display name off its text, keep the rest.
			StatusPrepend = Entry.find('div', class_='status__prepend')
			StatusPrepend = StatusPrepend.get_text().strip()[len(Username):] if StatusPrepend else ''
			StatusPrepend = ' ' + StatusPrepend.strip() if StatusPrepend else ''
			if not IncludeRetoots and StatusPrepend:
				continue
			if not StatusPrepend and IncludeReplies and Entry.find('i', class_='fa-reply-all'):
				StatusPrepend = ' replied'
			Title = Content.get_text().strip()
			Title = f"{Usertag}{StatusPrepend}: {Title[:32]}..."
			# Shrink emojis and avatars so the exported HTML renders sanely.
			for Emoji in Entry.find_all('img', class_='custom-emoji'): # Custom emojis in text
				Emoji['style'] = 'max-height:1em;'
			Entry.find('img', class_='u-photo account__avatar')['style'] = 'max-height:4em;' # Profile pics
			# Drop the reply/boost/favourite toolbar, useless in an export.
			Entry.find('div', class_='status__action-bar').replace_with('')

			print(f"-> Item: {LocalId} - {Title}")
			# '{ Replace:Attached }' is a placeholder swapped in later: with ''
			# for mail (attachments travel as MIME parts) and with inline data
			# URIs for the local file copy.
			HTML = f"""\
<h1>{Title}</h1>
<div class="AppName-content" style="word-wrap:break-word;">
{Entry}
{{ Replace:Attached }}
</div>
<br><hr><br>
<p><i>Via <a href="https://gitlab.com/octospacc/bottocto/-/tree/main/MastodonFeedHTML">https://gitlab.com/octospacc/bottocto/-/tree/main/MastodonFeedHTML</a></i></p>
"""

			if SendMail:
				Message = MIMEMultipart()
				Message['From'] = MailUsername
				Message['To'] = ', '.join(MailTo)
				Message['Subject'] = Title
				Message.attach(MIMEText(HTML.replace('{ Replace:Attached }', ''), 'html'))

			Attachments = Entry.find('ul', class_='attachment-list__list')
			if Attachments and (LocalSave or SendMail):
				for Attachment in Attachments:
					Href, Alt = '', ''
					# Poor man's attribute parsing: normalize quotes, split on
					# them, then read the value following 'href=' / 'title='.
					Attachment = str(Attachment).strip().replace("'",'"').split('"')
					for i,e in enumerate(Attachment):
						if e.endswith('<a href='):
							Href = Attachment[i+1]
						elif e.endswith('title='):
							Alt = "'".join(Attachment[i+1:-1])
					if Href:
						Response = urlopen(Request(Href, headers={'User-Agent':UserAgent}))
						Data = Response.read()
						Mime = Response.info().get_content_type()
						if LocalSave:
							# Images become <img>; anything else (video/audio)
							# a media tag of the same name with controls.
							Tag = 'img' if Mime.split('/')[0] == 'image' else Mime.split('/')[0]
							Opening = f'<{Tag} alt="{Alt}" title="{Alt}"' if Tag == 'img' else f'<{Tag} controls'
							Closing = '>' if Tag == 'img' else f"></{Tag}>"
							Attached += f"""{Opening} style="max-width:100%; max-height:100vh;" src="data:{Mime};base64,{base64.b64encode(Data).decode()}"{Closing}\n"""
						if SendMail:
							File = MIMEBase(Mime.split('/')[0], Mime.split('/')[1])
							File.set_payload(Data)
							encoders.encode_base64(File)
							File.add_header(
								"Content-Disposition",
								f"attachment; filename={Href.split('/')[-1]}")
							Message.attach(File)

			if SendMail:
				# Mail server/credentials and the MailSleep delay come from Config.
				with smtplib.SMTP_SSL(MailServer, MailPort, context=ssl.create_default_context()) as Client:
					Client.login(MailUsername, MailPassword)
					Client.sendmail(MailUsername, MailTo, Message.as_string())
				SleepPrint(MailSleep)
			if LocalSave:
				# One directory per user; file name derived from id + title.
				LocalBackupDir = MakePathStr(Usertag)
				if not os.path.isdir(LocalBackupDir):
					os.mkdir(LocalBackupDir)
				FileName = MakePathStr(f"{GlobalId.split('/')[-1]} - {Title}")
				with open(f'{LocalBackupDir}/{FileName}.html', 'w') as File:
					File.write(HTML.replace('{ Replace:Attached }', Attached))

			# Record the entry as handled only after save/mail completed.
			with open(f'{AppName}.db', 'a') as Db:
				Db.write(f'{Usertag} {GlobalId}' + '\n')

			SleepPrint(ItemSleep)
		SleepPrint(PageSleep)

		return LastEntryIsNew, PageOlder

	except Exception:
		# NOTE(review): re-raising unchanged makes this try/except a no-op;
		# presumably a placeholder for future per-page error recovery.
		raise
|
2022-08-16 12:50:39 +02:00
|
|
|
|
2022-08-16 00:27:27 +02:00
|
|
|
if __name__ == '__main__':
	# Main loop: scrape all configured feeds, then either exit (one-shot
	# mode) or sleep and repeat, depending on the LoopTime config value.
	while True:
		print("[I] Scraping...")
		HandleFeedsList(Feeds)
		# LoopTime <= 0 means run once and quit.
		if LoopTime <= 0:
			exit()
		SleepPrint(LoopTime)
|