Patch v2: bracket Markdown footnote references (line structure restored, added code revised).

Notes for whoever applies this:
* Line breaks, blank lines and indentation were rebuilt from a flattened copy of
  the patch (4 spaces per level). Hunk line counts were re-checked, but the
  whitespace must be matched to the target files before applying.
* The post-image blob ids on the "index" lines of HTML.py, Site.py and Utils.py
  are stale, because the added lines below differ from the original patch.

Changes from the original patch:
* HTML.py, AddToTagStartEnd: rewritten on str.find. The old loop skipped AddEnd
  when the start match was at index 0 and AddStart was empty (a falsy StartPos),
  and it iterated over the pre-insertion string, so an end match close to the
  end of the text could be missed.
* HTML.py, SquareFnrefs: a footnote <sup> without an inner <a> is skipped
  instead of raising AttributeError on None.
* Site.py, PatchHTML: the AddToTagStartEnd call had four arguments for five
  parameters; the two match strings were restored (verify against upstream).
* Utils.py, FirstRealItem: returns None instead of leaking StopIteration when
  no item is truthy.

diff --git a/Source/Build.py b/Source/Build.py
index 59c283a..03208f8 100755
--- a/Source/Build.py
+++ b/Source/Build.py
@@ -81,7 +81,7 @@ def Main(Args, FeedEntries):
     Locale = LoadLocale(SiteLang)
     MastodonURL = Args.MastodonURL if Args.MastodonURL else ''
     MastodonToken = Args.MastodonToken if Args.MastodonToken else ''
-    MarkdownExts = literal_eval(Args.MarkdownExts) if Args.MarkdownExts else EvalOpt(ReadConf(SiteConf, 'Site', 'MarkdownExts')) if ReadConf(SiteConf, 'Site', 'MarkdownExts') else ('attr_list', 'def_list', 'markdown_del_ins', 'md_in_html', 'mdx_subscript', 'mdx_superscript', 'tables')
+    MarkdownExts = literal_eval(Args.MarkdownExts) if Args.MarkdownExts else EvalOpt(ReadConf(SiteConf, 'Site', 'MarkdownExts')) if ReadConf(SiteConf, 'Site', 'MarkdownExts') else MarkdownExtsDefault
     ActivityPubTypeFilter = Args.ActivityPubTypeFilter if Args.ActivityPubTypeFilter else 'Post'
     FeedCategoryFilter = Args.FeedCategoryFilter if Args.FeedCategoryFilter else 'Blog'
     Minify = StringBoolChoose(False, Args.Minify, ReadConf(SiteConf, 'Site', 'Minify'))
diff --git a/Source/Libs/markdown/extensions/footnotes.py b/Source/Libs/markdown/extensions/footnotes.py
index 1cc7118..208a575 100644
--- a/Source/Libs/markdown/extensions/footnotes.py
+++ b/Source/Libs/markdown/extensions/footnotes.py
@@ -166,8 +166,9 @@ class FootnoteExtension(Extension):
 
         div = etree.Element("div")
         div.set('class', 'footnote')
-        etree.SubElement(div, "hr")
+        # etree.SubElement(div, "hr")
         ol = etree.SubElement(div, "ol")
+        # ol = etree.Element("ol")
         surrogate_parent = etree.Element("div")
 
         for index, id in enumerate(self.footnotes.keys(), start=1):
@@ -198,6 +199,7 @@ class FootnoteExtension(Extension):
                     p = etree.SubElement(li, "p")
                     p.append(backlink)
         return div
+        # return ol
 
 
 class FootnoteBlockProcessor(BlockProcessor):
diff --git a/Source/Modules/Gemini.py b/Source/Modules/Gemini.py
index abc5509..16cc74f 100644
--- a/Source/Modules/Gemini.py
+++ b/Source/Modules/Gemini.py
@@ -10,27 +10,9 @@
 # TODO: Write the Python HTML2Gemtext converter
 
 from Libs.bs4 import BeautifulSoup
+from Modules.HTML import *
 from Modules.Utils import *
 
-"""
-ClosedTags = (
-    'h1', 'h2', 'h3', 'h4', 'h5', 'h6',
-    'p', 'span', 'pre', 'code',
-    'a', 'b', 'i', 'del', 'strong',
-    'div', 'details', 'summary',
-    'ol', 'ul', 'li', 'dl', 'dt', 'dd')
-OpenTags = (
-    'img')
-"""
-
-def StripAttrs(HTML):
-    Soup = BeautifulSoup(HTML, 'html.parser')
-    Tags = Soup.find_all()
-    for t in Tags:
-        if 'href' not in t.attrs and 'src' not in t.attrs:
-            t.attrs = {}
-    return str(Soup)
-
 def FixGemlogDateLine(Line):
     if len(Line) >= 2 and Line[0] == '[' and Line[1].isdigit():
         Line = Line[1:]
diff --git a/Source/Modules/HTML.py b/Source/Modules/HTML.py
index 9b84ad8..ca0dd02 100644
--- a/Source/Modules/HTML.py
+++ b/Source/Modules/HTML.py
@@ -10,10 +10,63 @@
 from Libs.bs4 import BeautifulSoup
 from Modules.Utils import *
 
+"""
+ClosedTags = (
+    'h1', 'h2', 'h3', 'h4', 'h5', 'h6',
+    'p', 'span', 'pre', 'code',
+    'a', 'b', 'i', 'del', 'strong',
+    'div', 'details', 'summary',
+    'ol', 'ul', 'li', 'dl', 'dt', 'dd')
+OpenTags = (
+    'img')
+"""
+
+def MkSoup(HTML):
+    return BeautifulSoup(HTML, 'html.parser')
+
+def StripAttrs(HTML):
+    Soup = MkSoup(HTML)
+    Tags = Soup.find_all()
+    for t in Tags:
+        if 'href' not in t.attrs and 'src' not in t.attrs:
+            t.attrs = {}
+    return str(Soup)
+
 def StripTags(HTML, ToStrip):
-    Soup = BeautifulSoup(HTML, 'html.parser')
+    Soup = MkSoup(HTML)
     Tags = Soup.find_all()
     for t in Tags:
         if t.name in ToStrip:
             t.replace_with('')
     return str(Soup)
+
+def AddToTagStartEnd(HTML, MatchStart, MatchEnd, AddStart, AddEnd): # This doesn't handle nested tags
+    """Insert AddStart before the first MatchStart and AddEnd after the next MatchEnd.
+
+    Only the first pair is processed. HTML is returned unchanged when MatchStart
+    is not found or when both AddStart and AddEnd are empty.
+    """
+    if not AddStart and not AddEnd:
+        return HTML
+    StartPos = HTML.find(MatchStart)
+    if StartPos < 0:
+        return HTML
+    if AddStart:
+        HTML = HTML[:StartPos] + AddStart + HTML[StartPos:]
+        StartPos += len(AddStart) # Keep pointing at MatchStart after the insertion
+    if AddEnd:
+        EndPos = HTML.find(MatchEnd, StartPos + 1)
+        if EndPos >= 0:
+            EndPos += len(MatchEnd)
+            HTML = HTML[:EndPos] + AddEnd + HTML[EndPos:]
+    return HTML
+
+def SquareFnrefs(HTML): # Different combinations of formatting for Soup .prettify, .encode, .decode break different page elements, don't use this for now
+    Soup = MkSoup(HTML)
+    Tags = Soup.find_all('sup')
+    for t in Tags:
+        if 'id' in t.attrs and t.attrs['id'].startswith('fnref:'):
+            s = t.find('a')
+            if s is not None: # A footnote <sup> without an inner <a> has nothing to replace
+                s.replace_with(f'[{t}]')
+    return str(Soup.prettify(formatter=None))
diff --git a/Source/Modules/Markdown.py b/Source/Modules/Markdown.py
index 8d0388f..a2a96ca 100644
--- a/Source/Modules/Markdown.py
+++ b/Source/Modules/Markdown.py
@@ -13,6 +13,8 @@ try:
 except ModuleNotFoundError:
     from Libs.markdown import markdown
 
+MarkdownExtsDefault = ('attr_list', 'def_list', 'footnotes', 'markdown_del_ins', 'md_in_html', 'mdx_subscript', 'mdx_superscript', 'tables')
+
 def MarkdownHTMLEscape(Str, Extensions=()): # WIP
     Text = ''
     for i,e in enumerate(Str):
diff --git a/Source/Modules/Site.py b/Source/Modules/Site.py
index f9b8c34..c0e2faf 100644
--- a/Source/Modules/Site.py
+++ b/Source/Modules/Site.py
@@ -250,11 +250,15 @@ def CanIndex(Index, For):
 def PatchHTML(File, HTML, PartsText, ContextParts, ContextPartsText, HTMLPagesList, PagePath, Content, Titles, Meta, SiteRoot, SiteName, BlogName, FolderRoots, Categories, SiteLang, Locale):
     HTMLTitles = FormatTitles(Titles)
     BodyDescription, BodyImage = '', ''
-    Parse = BeautifulSoup(Content, 'html.parser')
-    if not BodyDescription and Parse.p:
-        BodyDescription = Parse.p.get_text()[:150].replace('\n', ' ').replace('"', "'") + '...'
-    if not BodyImage and Parse.img and Parse.img['src']:
-        BodyImage = Parse.img['src']
+    Soup = BeautifulSoup(Content, 'html.parser')
+
+    if not BodyDescription and Soup.p:
+        BodyDescription = Soup.p.get_text()[:150].replace('\n', ' ').replace('"', "'") + '...'
+    if not BodyImage and Soup.img and Soup.img['src']:
+        BodyImage = Soup.img['src']
+
+    #Content = SquareFnrefs(Content)
+    Content = AddToTagStartEnd(Content, '<sup id="fnref:', '</sup>', '[', ']')
 
     Title = GetTitle(Meta, Titles, 'MetaTitle', BlogName)
     Description = GetDescription(Meta, BodyDescription, 'MetaDescription')
@@ -301,14 +305,14 @@ def PatchHTML(File, HTML, PartsText, ContextParts, ContextPartsText, HTMLPagesLi
 
     # TODO: Clean this doubling?
     ContentHTML = Content
-    ContentHTML = ContentHTML.replace('[HTML:Site:AbsoluteRoot]', SiteRoot)
-    ContentHTML = ContentHTML.replace('[HTML:Site:RelativeRoot]', GetPathLevels(PagePath))
+    ContentHTML = ReplWithEsc(ContentHTML, '[HTML:Site:AbsoluteRoot]', SiteRoot)
+    ContentHTML = ReplWithEsc(ContentHTML, '[HTML:Site:RelativeRoot]', GetPathLevels(PagePath))
     for e in Meta['Macros']:
-        ContentHTML = ContentHTML.replace(f"[:{e}:]", Meta['Macros'][e])
+        ContentHTML = ReplWithEsc(ContentHTML, f"[:{e}:]", Meta['Macros'][e])
     for e in FolderRoots:
-        ContentHTML = ContentHTML.replace(f"[HTML:Folder:{e}:AbsoluteRoot]", FolderRoots[e])
+        ContentHTML = ReplWithEsc(ContentHTML, f"[HTML:Folder:{e}:AbsoluteRoot]", FolderRoots[e])
     for e in Categories:
-        ContentHTML = ContentHTML.replace(f"[HTML:Category:{e}]", Categories[e])
+        ContentHTML = ReplWithEsc(ContentHTML, f"[HTML:Category:{e}]", Categories[e])
     SlimHTML = HTMLPagesList + ContentHTML
     return HTML, ContentHTML, SlimHTML, Description, Image
 
diff --git a/Source/Modules/Utils.py b/Source/Modules/Utils.py
index 1ac29bb..0c5df66 100644
--- a/Source/Modules/Utils.py
+++ b/Source/Modules/Utils.py
@@ -99,6 +99,9 @@ def RevSort(List):
     List.reverse()
     return List
 
+def FirstRealItem(List):
+    return next((e for e in List if e), None) # None when no item is truthy
+
 def GetFullDate(Date):
     if not Date:
         return None
diff --git a/TODO b/TODO
index f03d655..21ab971 100644
--- a/TODO
+++ b/TODO
@@ -1,5 +1,6 @@
 - Fix arguments - some are only callable from CLI and not Site.ini
 - Fix Python-Markdown is installed problem (to load our modules)
+- Postprocessing markdown footnotes to add brackets
 - Hot-recompile
 - Differential recompile
 - Feed generation without native libraries