From a12c38924fab0e5bab568fdde3c92199be1add33 Mon Sep 17 00:00:00 2001 From: "octo@swiss" Date: Mon, 22 Aug 2022 23:19:33 +0200 Subject: [PATCH] MastodonFeedHTML: Ordered recurse into previous pages --- MastodonFeedHTML/Example.Config.py | 9 +++++-- MastodonFeedHTML/MastodonFeedHTML.py | 35 +++++++++++++++++++++------- 2 files changed, 33 insertions(+), 11 deletions(-) diff --git a/MastodonFeedHTML/Example.Config.py b/MastodonFeedHTML/Example.Config.py index 6dfab9d..722d9ab 100644 --- a/MastodonFeedHTML/Example.Config.py +++ b/MastodonFeedHTML/Example.Config.py @@ -19,8 +19,13 @@ MailPort = 465 # How often to refresh the feeds (in seconds). Set to 0 for a single run, instead of having the program sleep. LoopTime = 300 -# Additional time (in seconds) to sleep between every mail sent, to prevent spam. -MailSleep = 10 +# Additional time (in seconds) to sleep in code sections, to prevent ratelimiting. +PageSleep = 2 # Between every scraped page +ItemSleep = 1 # Between every scraped item +MailSleep = 9 # Between every sent mail + +# Stop recursive navigation across posts pages if limit is reached. Set to 0 for no limit (use with caution on new profiles with many posts). +MaxPagesRecursion = 10 # Whether or not to allow spaces in file names. 
SpacesInFiles = True diff --git a/MastodonFeedHTML/MastodonFeedHTML.py b/MastodonFeedHTML/MastodonFeedHTML.py index 44ec853..afe8c2e 100755 --- a/MastodonFeedHTML/MastodonFeedHTML.py +++ b/MastodonFeedHTML/MastodonFeedHTML.py @@ -21,6 +21,10 @@ def MakePathStr(Str): Str = Str.replace(' ', '_') return Str +def SleepPrint(s): + print(f"[I] Sleeping for {s}s...") + time.sleep(s) + def HandleFeedsList(List): for Feed in List: print(f"[I] Handling Feed ->\n: {Feed}") @@ -38,14 +42,23 @@ def HandleFeed(URLs, IncludeRetoots, IncludeReplies, LocalSave, SendMail, MailTo print("[I] Partial dry-run for this URL (LocalSave and SendMail are disabled).") URL = URL.removesuffix('/').removesuffix('/with_replies') + '/with_replies' Usertag = f"{URL.split('/')[-2]}@{URL.split('/')[-3]}" - LastEntryIsNew, PageOlder = HandleURL(URL, Usertag, IncludeRetoots, IncludeReplies, LocalSave, SendMail, MailTo) - while LastEntryIsNew and PageOlder: # TODO: Fix this, make a single run for all items / start from oldest, otherwise order is messed up - LastEntryIsNew, PageOlder = HandleURL(PageOlder, Usertag, IncludeRetoots, IncludeReplies, LocalSave, SendMail, MailTo) + Pages = [] + LastEntryIsNew, PageOlder = HandleURL(True, URL, Usertag, IncludeRetoots, IncludeReplies, LocalSave, SendMail, MailTo) + if LastEntryIsNew and PageOlder: + Pages += [PageOlder] + while LastEntryIsNew and PageOlder and len(Pages) < MaxPagesRecursion: + LastEntryIsNew, PageOlder = HandleURL(True, PageOlder, Usertag, IncludeRetoots, IncludeReplies, LocalSave, SendMail, MailTo) + if LastEntryIsNew and PageOlder and MaxPagesRecursion: + Pages += [PageOlder] + Pages.reverse() + for Page in Pages: + HandleURL(False, Page, Usertag, IncludeRetoots, IncludeReplies, LocalSave, SendMail, MailTo) -def HandleURL(URL, Usertag, IncludeRetoots, IncludeReplies, LocalSave, SendMail, MailTo): +def HandleURL(IsFirstRun, URL, Usertag, IncludeRetoots, IncludeReplies, LocalSave, SendMail, MailTo): LastEntryIsNew = False PageOlder = '' try: + 
print(f"-> Page: {URL}") Response = urlopen(Request(URL, headers={'User-Agent':UserAgent})) Data = Response.read() Soup = BeautifulSoup(Data, 'html.parser') @@ -73,6 +86,9 @@ def HandleURL(URL, Usertag, IncludeRetoots, IncludeReplies, LocalSave, SendMail, if Index == 1: LastEntryIsNew = True + if IsFirstRun: + break + LocalId = GlobalId.split('/')[-1] Username = Entry.find('a', class_='status__display-name').get_text().strip() Content = Entry.find('div', class_='e-content') @@ -90,7 +106,7 @@ def HandleURL(URL, Usertag, IncludeRetoots, IncludeReplies, LocalSave, SendMail, Entry.find('img', class_='u-photo account__avatar')['style'] = 'max-height:4em;' # Profile pics Entry.find('div', class_='status__action-bar').replace_with('') - print(f"-> {LocalId} - {Title}") + print(f"-> Item: {LocalId} - {Title}") HTML = f"""\

{Title}

@@ -143,8 +159,7 @@ def HandleURL(URL, Usertag, IncludeRetoots, IncludeReplies, LocalSave, SendMail, with smtplib.SMTP_SSL(MailServer, MailPort, context=ssl.create_default_context()) as Client: Client.login(MailUsername, MailPassword) Client.sendmail(MailUsername, MailTo, Message.as_string()) - time.sleep(MailSleep) - + SleepPrint(MailSleep) if LocalSave: LocalBackupDir = MakePathStr(Usertag) if not os.path.isdir(LocalBackupDir): @@ -156,6 +171,9 @@ def HandleURL(URL, Usertag, IncludeRetoots, IncludeReplies, LocalSave, SendMail, with open(f'{AppName}.db', 'a') as Db: Db.write(f'{Usertag} {GlobalId}' + '\n') + SleepPrint(ItemSleep) + SleepPrint(PageSleep) + return LastEntryIsNew, PageOlder except Exception: @@ -167,5 +185,4 @@ if __name__ == '__main__': HandleFeedsList(Feeds) if LoopTime <= 0: exit() - print(f"[I] Sleeping for {LoopTime}s...") - time.sleep(LoopTime) + SleepPrint(LoopTime)