Handle items older than last 20; TODO: Fix order

octo@swiss 2022-08-22 21:12:05 +02:00
parent ad45d852ee
commit 491b99f339
3 changed files with 118 additions and 96 deletions
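
Context for the change: a Mastodon profile's HTML view lists only the most recent entries (about 20), and older ones sit behind a "load-more" link whose href carries a ?max_id= cursor to the next older page. The diff below follows that cursor page by page. A minimal standalone sketch of the same pattern, for illustration only (FetchPage, FetchAllPages, and the User-Agent string are assumptions, not names from this repo):

#!/usr/bin/env python3
# Sketch: cursor-based pagination over a Mastodon profile's HTML feed,
# mirroring the load-more/?max_id= handling added in this commit.
from urllib.request import urlopen, Request
from bs4 import BeautifulSoup

def FetchPage(URL):
    # Fetch one profile page; return its entries and the URL of the
    # next (older) page, taken from the 'load-more' anchor if present.
    Data = urlopen(Request(URL, headers={'User-Agent': 'Mozilla/5.0'})).read()
    Soup = BeautifulSoup(Data, 'html.parser')
    Entries = Soup.find_all('div', class_='entry')
    Older = ''
    Anchor = Soup.find('a', class_='load-more')
    if Anchor and '?max_id=' in Anchor['href']:
        Older = Anchor['href']  # cursor pointing at the page of older toots
    return Entries, Older

def FetchAllPages(URL):
    # Walk the ?max_id= cursor until no 'load-more' link remains.
    Pages = []
    while URL:
        Entries, URL = FetchPage(URL)
        Pages.append(Entries)
    return Pages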

.gitignore vendored

@@ -1,2 +1,3 @@
 *.pyc
 *.db
+Config.py

MastodonFeedHTML.py

@@ -1,6 +1,5 @@
 #!/usr/bin/env python3
 import base64
-import feedparser
 import os
 import time
 import email, smtplib, ssl
@@ -22,10 +21,30 @@ def MakePathStr(Str):
     Str = Str.replace(' ', '_')
     return Str
 
+def HandleFeedsList(List):
+    for Feed in List:
+        print(f"[I] Handling Feed ->\n: {Feed}")
+        HandleFeed(
+            URLs=SureList(Feed['URLs']),
+            IncludeRetoots=Feed['IncludeRetoots'] if 'IncludeRetoots' in Feed else True,
+            IncludeReplies=Feed['IncludeReplies'] if 'IncludeReplies' in Feed else True,
+            LocalSave=Feed['LocalSave'] if 'LocalSave' in Feed else True,
+            SendMail=Feed['SendMail'] if 'SendMail' in Feed else True if 'To' in Feed and Feed['To'] else False,
+            MailTo=SureList(Feed['MailTo']) if 'MailTo' in Feed and Feed['MailTo'] else [])
+
 def HandleFeed(URLs, IncludeRetoots, IncludeReplies, LocalSave, SendMail, MailTo):
     for URL in URLs:
+        if not (LocalSave or SendMail):
+            print("[I] Partial dry-run for this URL (LocalSave and SendMail are disabled).")
         URL = URL.removesuffix('/').removesuffix('/with_replies') + '/with_replies'
         Usertag = f"{URL.split('/')[-2]}@{URL.split('/')[-3]}"
+        LastEntryIsNew, PageOlder = HandleURL(URL, Usertag, IncludeRetoots, IncludeReplies, LocalSave, SendMail, MailTo)
+        while LastEntryIsNew and PageOlder: # TODO: Fix this, make a single run for all items / start from oldest, otherwise order is messed up
+            LastEntryIsNew, PageOlder = HandleURL(PageOlder, Usertag, IncludeRetoots, IncludeReplies, LocalSave, SendMail, MailTo)
+
+def HandleURL(URL, Usertag, IncludeRetoots, IncludeReplies, LocalSave, SendMail, MailTo):
+    LastEntryIsNew = False
+    PageOlder = ''
     try:
         Response = urlopen(Request(URL, headers={'User-Agent':UserAgent}))
         Data = Response.read()
@@ -33,12 +52,18 @@ def HandleFeed(URLs, IncludeRetoots, IncludeReplies, LocalSave, SendMail, MailTo):
         Feed = Soup.find_all('div', class_='entry')
         Feed.reverse() # Order from oldest to newest
+        Index = 0
         for Entry in Feed:
             Attached = ''
-            GlobalId = Entry.find('a', class_='u-url')
-            if GlobalId:
-                GlobalId = GlobalId['href'].removeprefix('https://').removeprefix('http://')
+            Anchor = Entry.find('a', class_='u-url')
+            if Anchor:
+                GlobalId = Anchor['href'].removeprefix('https://').removeprefix('http://')
+                Index += 1
             else:
+                Anchor = Entry.find('a', class_='load-more')
+                if Anchor:
+                    if '?max_id=' in Anchor['href']:
+                        PageOlder = Anchor['href']
                 continue
             if os.path.isfile(f'{AppName}.db'):
@@ -46,6 +71,8 @@ def HandleFeed(URLs, IncludeRetoots, IncludeReplies, LocalSave, SendMail, MailTo):
                     if f'{Usertag} {GlobalId}' in Db.read().splitlines():
                         continue
+            if Index == 1:
+                LastEntryIsNew = True
             LocalId = GlobalId.split('/')[-1]
             Username = Entry.find('a', class_='status__display-name').get_text().strip()
             Content = Entry.find('div', class_='e-content')
@@ -85,7 +112,7 @@ def HandleFeed(URLs, IncludeRetoots, IncludeReplies, LocalSave, SendMail, MailTo):
                 Message.attach(MIMEText(HTML.replace('{ Replace:Attached }', ''), 'html'))
             Attachments = Entry.find('ul', class_='attachment-list__list')
-            if Attachments:
+            if Attachments and (LocalSave or SendMail):
                 for Attachment in Attachments:
                     Href, Alt = '', ''
                     Attachment = str(Attachment).strip().replace("'",'"').split('"')
@@ -129,24 +156,15 @@ def HandleFeed(URLs, IncludeRetoots, IncludeReplies, LocalSave, SendMail, MailTo):
             with open(f'{AppName}.db', 'a') as Db:
                 Db.write(f'{Usertag} {GlobalId}' + '\n')
+        return LastEntryIsNew, PageOlder
     except Exception:
         raise
 
-def HandleFeedList(List):
-    for Feed in List:
-        print(f"[I] Handling Feed ->\n: {Feed}")
-        HandleFeed(
-            URLs=SureList(Feed['URLs']),
-            IncludeRetoots=Feed['IncludeRetoots'] if 'IncludeRetoots' in Feed else True,
-            IncludeReplies=Feed['IncludeReplies'] if 'IncludeReplies' in Feed else True,
-            LocalSave=Feed['LocalSave'] if 'LocalSave' in Feed else True,
-            SendMail=Feed['SendMail'] if 'SendMail' in Feed else True if 'To' in Feed and Feed['To'] else False,
-            MailTo=SureList(Feed['MailTo']) if 'MailTo' in Feed and Feed['MailTo'] else [])
 
 if __name__ == '__main__':
     while True:
         print("[I] Scraping...")
-        HandleFeedList(Feeds)
+        HandleFeedsList(Feeds)
         if LoopTime <= 0:
             exit()
         print(f"[I] Sleeping for {LoopTime}s...")


@@ -0,0 +1,3 @@
+#!/bin/sh
+cd "$( dirname "$( realpath "$0" )" )"
+./MastodonFeedHTML.py
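
About the TODO in the commit title and in HandleFeed: each page is processed as soon as it is fetched, so a first run handles the newest ~20 items before any older page, even though entries within a single page are reversed. A hedged sketch of the single-pass fix the TODO describes, reusing the hypothetical FetchPage from the sketch above (HandleURLOrdered and HandleEntry are illustrative names, not part of this commit): collect every page first, then handle all entries oldest to newest.

def HandleURLOrdered(URL, HandleEntry):
    # Gather every page before processing anything, so items can be
    # handled strictly oldest-to-newest across page boundaries.
    AllEntries = []
    while URL:
        Entries, URL = FetchPage(URL)  # newest page first, then older ones
        AllEntries.extend(Entries)
    AllEntries.reverse()  # pages and entries arrive newest-first; flip once
    for Entry in AllEntries:
        HandleEntry(Entry)  # e.g. save locally and/or send mail, in order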