mirror of
https://gitlab.com/octospacc/bottocto
synced 2025-02-09 00:18:41 +01:00
MastodonFeedHTML: Ordered recurse into previous pages
This commit is contained in:
parent
491b99f339
commit
a12c38924f
@ -19,8 +19,13 @@ MailPort = 465
|
|||||||
# How often to refresh the feeds (in seconds). Set to 0 for a single run, instead of having the program sleep.
|
# How often to refresh the feeds (in seconds). Set to 0 for a single run, instead of having the program sleep.
|
||||||
LoopTime = 300
|
LoopTime = 300
|
||||||
|
|
||||||
# Additional time (in seconds) to sleep between every mail sent, to prevent spam.
|
# Additional time (in seconds) to sleep in code sections, to prevent ratelimiting.
|
||||||
MailSleep = 10
|
PageSleep = 2 # Between every scraped page
|
||||||
|
ItemSleep = 1 # Between every scraped item
|
||||||
|
MailSleep = 9 # Between every sent mail
|
||||||
|
|
||||||
|
# Stop recursive navigation across posts pages if limit is reached. Set 0 for no limit (use with caution on new profiles with many posts).
|
||||||
|
MaxPagesRecursion = 10
|
||||||
|
|
||||||
# Whether or not to allow spaces in file names.
|
# Whether or not to allow spaces in file names.
|
||||||
SpacesInFiles = True
|
SpacesInFiles = True
|
||||||
|
@ -21,6 +21,10 @@ def MakePathStr(Str):
|
|||||||
Str = Str.replace(' ', '_')
|
Str = Str.replace(' ', '_')
|
||||||
return Str
|
return Str
|
||||||
|
|
||||||
|
def SleepPrint(s):
|
||||||
|
print(f"[I] Sleeping for {s}s...")
|
||||||
|
time.sleep(s)
|
||||||
|
|
||||||
def HandleFeedsList(List):
|
def HandleFeedsList(List):
|
||||||
for Feed in List:
|
for Feed in List:
|
||||||
print(f"[I] Handling Feed ->\n: {Feed}")
|
print(f"[I] Handling Feed ->\n: {Feed}")
|
||||||
@ -38,14 +42,23 @@ def HandleFeed(URLs, IncludeRetoots, IncludeReplies, LocalSave, SendMail, MailTo
|
|||||||
print("[I] Partial dry-run for this URL (LocalSave and SendMail are disabled).")
|
print("[I] Partial dry-run for this URL (LocalSave and SendMail are disabled).")
|
||||||
URL = URL.removesuffix('/').removesuffix('/with_replies') + '/with_replies'
|
URL = URL.removesuffix('/').removesuffix('/with_replies') + '/with_replies'
|
||||||
Usertag = f"{URL.split('/')[-2]}@{URL.split('/')[-3]}"
|
Usertag = f"{URL.split('/')[-2]}@{URL.split('/')[-3]}"
|
||||||
LastEntryIsNew, PageOlder = HandleURL(URL, Usertag, IncludeRetoots, IncludeReplies, LocalSave, SendMail, MailTo)
|
Pages = []
|
||||||
while LastEntryIsNew and PageOlder: # TODO: Fix this, make a single run for all items / start from oldest, otherwise order is messed up
|
LastEntryIsNew, PageOlder = HandleURL(True, URL, Usertag, IncludeRetoots, IncludeReplies, LocalSave, SendMail, MailTo)
|
||||||
LastEntryIsNew, PageOlder = HandleURL(PageOlder, Usertag, IncludeRetoots, IncludeReplies, LocalSave, SendMail, MailTo)
|
if LastEntryIsNew and PageOlder:
|
||||||
|
Pages += [PageOlder]
|
||||||
|
while LastEntryIsNew and PageOlder and len(Pages) < MaxPagesRecursion:
|
||||||
|
LastEntryIsNew, PageOlder = HandleURL(True, PageOlder, Usertag, IncludeRetoots, IncludeReplies, LocalSave, SendMail, MailTo)
|
||||||
|
if LastEntryIsNew and PageOlder and MaxPagesRecursion:
|
||||||
|
Pages += [PageOlder]
|
||||||
|
Pages.reverse()
|
||||||
|
for Page in Pages:
|
||||||
|
HandleURL(False, Page, Usertag, IncludeRetoots, IncludeReplies, LocalSave, SendMail, MailTo)
|
||||||
|
|
||||||
def HandleURL(URL, Usertag, IncludeRetoots, IncludeReplies, LocalSave, SendMail, MailTo):
|
def HandleURL(IsFirstRun, URL, Usertag, IncludeRetoots, IncludeReplies, LocalSave, SendMail, MailTo):
|
||||||
LastEntryIsNew = False
|
LastEntryIsNew = False
|
||||||
PageOlder = ''
|
PageOlder = ''
|
||||||
try:
|
try:
|
||||||
|
print(f"-> Page: {URL}")
|
||||||
Response = urlopen(Request(URL, headers={'User-Agent':UserAgent}))
|
Response = urlopen(Request(URL, headers={'User-Agent':UserAgent}))
|
||||||
Data = Response.read()
|
Data = Response.read()
|
||||||
Soup = BeautifulSoup(Data, 'html.parser')
|
Soup = BeautifulSoup(Data, 'html.parser')
|
||||||
@ -73,6 +86,9 @@ def HandleURL(URL, Usertag, IncludeRetoots, IncludeReplies, LocalSave, SendMail,
|
|||||||
|
|
||||||
if Index == 1:
|
if Index == 1:
|
||||||
LastEntryIsNew = True
|
LastEntryIsNew = True
|
||||||
|
if IsFirstRun:
|
||||||
|
break
|
||||||
|
|
||||||
LocalId = GlobalId.split('/')[-1]
|
LocalId = GlobalId.split('/')[-1]
|
||||||
Username = Entry.find('a', class_='status__display-name').get_text().strip()
|
Username = Entry.find('a', class_='status__display-name').get_text().strip()
|
||||||
Content = Entry.find('div', class_='e-content')
|
Content = Entry.find('div', class_='e-content')
|
||||||
@ -90,7 +106,7 @@ def HandleURL(URL, Usertag, IncludeRetoots, IncludeReplies, LocalSave, SendMail,
|
|||||||
Entry.find('img', class_='u-photo account__avatar')['style'] = 'max-height:4em;' # Profile pics
|
Entry.find('img', class_='u-photo account__avatar')['style'] = 'max-height:4em;' # Profile pics
|
||||||
Entry.find('div', class_='status__action-bar').replace_with('')
|
Entry.find('div', class_='status__action-bar').replace_with('')
|
||||||
|
|
||||||
print(f"-> {LocalId} - {Title}")
|
print(f"-> Item: {LocalId} - {Title}")
|
||||||
HTML = f"""\
|
HTML = f"""\
|
||||||
<h1>{Title}</h1>
|
<h1>{Title}</h1>
|
||||||
|
|
||||||
@ -143,8 +159,7 @@ def HandleURL(URL, Usertag, IncludeRetoots, IncludeReplies, LocalSave, SendMail,
|
|||||||
with smtplib.SMTP_SSL(MailServer, MailPort, context=ssl.create_default_context()) as Client:
|
with smtplib.SMTP_SSL(MailServer, MailPort, context=ssl.create_default_context()) as Client:
|
||||||
Client.login(MailUsername, MailPassword)
|
Client.login(MailUsername, MailPassword)
|
||||||
Client.sendmail(MailUsername, MailTo, Message.as_string())
|
Client.sendmail(MailUsername, MailTo, Message.as_string())
|
||||||
time.sleep(MailSleep)
|
SleepPrint(MailSleep)
|
||||||
|
|
||||||
if LocalSave:
|
if LocalSave:
|
||||||
LocalBackupDir = MakePathStr(Usertag)
|
LocalBackupDir = MakePathStr(Usertag)
|
||||||
if not os.path.isdir(LocalBackupDir):
|
if not os.path.isdir(LocalBackupDir):
|
||||||
@ -156,6 +171,9 @@ def HandleURL(URL, Usertag, IncludeRetoots, IncludeReplies, LocalSave, SendMail,
|
|||||||
with open(f'{AppName}.db', 'a') as Db:
|
with open(f'{AppName}.db', 'a') as Db:
|
||||||
Db.write(f'{Usertag} {GlobalId}' + '\n')
|
Db.write(f'{Usertag} {GlobalId}' + '\n')
|
||||||
|
|
||||||
|
SleepPrint(ItemSleep)
|
||||||
|
SleepPrint(PageSleep)
|
||||||
|
|
||||||
return LastEntryIsNew, PageOlder
|
return LastEntryIsNew, PageOlder
|
||||||
|
|
||||||
except Exception:
|
except Exception:
|
||||||
@ -167,5 +185,4 @@ if __name__ == '__main__':
|
|||||||
HandleFeedsList(Feeds)
|
HandleFeedsList(Feeds)
|
||||||
if LoopTime <= 0:
|
if LoopTime <= 0:
|
||||||
exit()
|
exit()
|
||||||
print(f"[I] Sleeping for {LoopTime}s...")
|
SleepPrint(LoopTime)
|
||||||
time.sleep(LoopTime)
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user