Some markdown patches

2025-06-05 22:09:23 +02:00 · 2022-07-29 18:12:13 +02:00
parent f62a26eb33
commit fb02893d21
8 changed files with 76 additions and 32 deletions
--- a/Source/Modules/Gemini.py
+++ b/Source/Modules/Gemini.py
@@ -10,27 +10,9 @@
 # TODO: Write the Python HTML2Gemtext converter

 from Libs.bs4 import BeautifulSoup
+from Modules.HTML import *
 from Modules.Utils import *

-"""
-ClosedTags = (
-	'h1', 'h2', 'h3', 'h4', 'h5', 'h6',
-	'p', 'span', 'pre', 'code',
-	'a', 'b', 'i', 'del', 'strong',
-	'div', 'details', 'summary',
-	'ol', 'ul', 'li', 'dl', 'dt', 'dd')
-OpenTags = (
-	'img')
-"""
-
-def StripAttrs(HTML):
-	Soup = BeautifulSoup(HTML, 'html.parser')
-	Tags = Soup.find_all()
-	for t in Tags:
-		if 'href' not in t.attrs and 'src' not in t.attrs:
-			t.attrs = {}
-	return str(Soup)
-
 def FixGemlogDateLine(Line):
 	if len(Line) >= 2 and Line[0] == '[' and Line[1].isdigit():
 		Line = Line[1:]
--- a/Source/Modules/HTML.py
+++ b/Source/Modules/HTML.py
@@ -10,10 +10,60 @@
 from Libs.bs4 import BeautifulSoup
 from Modules.Utils import *

+"""
+ClosedTags = (
+	'h1', 'h2', 'h3', 'h4', 'h5', 'h6',
+	'p', 'span', 'pre', 'code',
+	'a', 'b', 'i', 'del', 'strong',
+	'div', 'details', 'summary',
+	'ol', 'ul', 'li', 'dl', 'dt', 'dd')
+OpenTags = (
+	'img')
+"""
+
+def MkSoup(HTML): # Parses the HTML string with BeautifulSoup's 'html.parser' backend and returns the resulting soup
+	return BeautifulSoup(HTML, 'html.parser')
+
+def StripAttrs(HTML): # Clears all attributes on tags carrying neither 'href' nor 'src'; tags with either one keep every attribute
+	Soup = MkSoup(HTML)
+	for Tag in Soup.find_all():
+		if 'href' in Tag.attrs or 'src' in Tag.attrs:
+			continue
+		Tag.attrs = {}
+	return str(Soup)
+
 def StripTags(HTML, ToStrip):
-	Soup = BeautifulSoup(HTML, 'html.parser')
+	Soup = MkSoup(HTML)
 	Tags = Soup.find_all()
 	for t in Tags:
 		if t.name in ToStrip:
 			t.replace_with('')
 	return str(Soup)
+
+def AddToTagStartEnd(HTML, MatchStart, MatchEnd, AddStart, AddEnd): # Puts AddStart before and AddEnd after every MatchStart...MatchEnd span, returns the new string; this doesn't handle nested tags
+	if not MatchStart or not MatchEnd or not (AddStart or AddEnd):
+		return HTML # Nothing to look for, or nothing to add
+	AddStart, AddEnd = AddStart or '', AddEnd or ''
+	Parts, Pos = [], 0
+	while True:
+		StartPos = HTML.find(MatchStart, Pos)
+		if StartPos < 0:
+			break
+		# Search for the closing match only past the opening one, so the two can never overlap
+		EndPos = HTML.find(MatchEnd, StartPos + len(MatchStart))
+		if EndPos < 0:
+			break # An opening match without a closing one is left untouched
+		EndPos += len(MatchEnd)
+		# Keep the text before the span as-is, then emit the span wrapped in the additions
+		Parts += [HTML[Pos:StartPos], AddStart, HTML[StartPos:EndPos], AddEnd]
+		Pos = EndPos
+	return ''.join(Parts) + HTML[Pos:]
+
+def SquareFnrefs(HTML): # Wraps the link inside each <sup id="fnref:..."> in square brackets. Different combinations of formatting for Soup .prettify, .encode, .decode break different page elements, don't use this for now
+	Soup = MkSoup(HTML)
+	for t in Soup.find_all('sup'):
+		s = t.find('a')
+		# Skip refs without a link; the text is built from the link (not the whole <sup>) so the <sup> doesn't end up nested inside itself
+		if s and t.attrs.get('id', '').startswith('fnref:'):
+			s.replace_with(f'[{s}]')
+	return str(Soup.prettify(formatter=None))
--- a/Source/Modules/Markdown.py
+++ b/Source/Modules/Markdown.py
@@ -13,6 +13,8 @@ try:
 except ModuleNotFoundError:
 	from Libs.markdown import markdown

+MarkdownExtsDefault = ('attr_list', 'def_list', 'footnotes', 'markdown_del_ins', 'md_in_html', 'mdx_subscript', 'mdx_superscript', 'tables')
+
 def MarkdownHTMLEscape(Str, Extensions=()): # WIP
 	Text = ''
 	for i,e in enumerate(Str):
--- a/Source/Modules/Site.py
+++ b/Source/Modules/Site.py
@@ -250,11 +250,15 @@ def CanIndex(Index, For):
 def PatchHTML(File, HTML, PartsText, ContextParts, ContextPartsText, HTMLPagesList, PagePath, Content, Titles, Meta, SiteRoot, SiteName, BlogName, FolderRoots, Categories, SiteLang, Locale):
 	HTMLTitles = FormatTitles(Titles)
 	BodyDescription, BodyImage = '', ''
-	Parse = BeautifulSoup(Content, 'html.parser')
-	if not BodyDescription and Parse.p:
-		BodyDescription = Parse.p.get_text()[:150].replace('\n', ' ').replace('"', "'") + '...'
-	if not BodyImage and Parse.img and Parse.img['src']:
-		BodyImage = Parse.img['src']
+	Soup = BeautifulSoup(Content, 'html.parser')
+	
+	if not BodyDescription and Soup.p:
+		BodyDescription = Soup.p.get_text()[:150].replace('\n', ' ').replace('"', "'") + '...'
+	if not BodyImage and Soup.img and Soup.img['src']:
+		BodyImage = Soup.img['src']
+
+	#Content = SquareFnrefs(Content)
+	Content = AddToTagStartEnd(Content, '<a class="footnote-ref"', '</a>', '[', ']')

 	Title = GetTitle(Meta, Titles, 'MetaTitle', BlogName)
 	Description = GetDescription(Meta, BodyDescription, 'MetaDescription')
@@ -301,14 +305,14 @@ def PatchHTML(File, HTML, PartsText, ContextParts, ContextPartsText, HTMLPagesLi

 	# TODO: Clean this doubling?
 	ContentHTML = Content
-	ContentHTML = ContentHTML.replace('[HTML:Site:AbsoluteRoot]', SiteRoot)
-	ContentHTML = ContentHTML.replace('[HTML:Site:RelativeRoot]', GetPathLevels(PagePath))
+	ContentHTML = ReplWithEsc(ContentHTML, '[HTML:Site:AbsoluteRoot]', SiteRoot)
+	ContentHTML = ReplWithEsc(ContentHTML, '[HTML:Site:RelativeRoot]', GetPathLevels(PagePath))
 	for e in Meta['Macros']:
-		ContentHTML = ContentHTML.replace(f"[:{e}:]", Meta['Macros'][e])
+		ContentHTML = ReplWithEsc(ContentHTML, f"[:{e}:]", Meta['Macros'][e])
 	for e in FolderRoots:
-		ContentHTML = ContentHTML.replace(f"[HTML:Folder:{e}:AbsoluteRoot]", FolderRoots[e])
+		ContentHTML = ReplWithEsc(ContentHTML, f"[HTML:Folder:{e}:AbsoluteRoot]", FolderRoots[e])
 	for e in Categories:
-		ContentHTML = ContentHTML.replace(f"<span>[HTML:Category:{e}]</span>", Categories[e])
+		ContentHTML = ReplWithEsc(ContentHTML, f"<span>[HTML:Category:{e}]</span>", Categories[e])
 	SlimHTML = HTMLPagesList + ContentHTML

 	return HTML, ContentHTML, SlimHTML, Description, Image
--- a/Source/Modules/Utils.py
+++ b/Source/Modules/Utils.py
@@ -99,6 +99,9 @@ def RevSort(List):
 	List.reverse()
 	return List

+def FirstRealItem(List): # Returns the first truthy item of List, or None when there is none (a bare next() would leak StopIteration to the caller)
+	return next((e for e in List if e), None)
+
 def GetFullDate(Date):
 	if not Date:
 		return None