Some markdown patches

This commit is contained in:
octospacc 2022-07-29 18:12:13 +02:00
parent f62a26eb33
commit fb02893d21
8 changed files with 76 additions and 32 deletions

View File

@ -81,7 +81,7 @@ def Main(Args, FeedEntries):
Locale = LoadLocale(SiteLang)
MastodonURL = Args.MastodonURL if Args.MastodonURL else ''
MastodonToken = Args.MastodonToken if Args.MastodonToken else ''
MarkdownExts = literal_eval(Args.MarkdownExts) if Args.MarkdownExts else EvalOpt(ReadConf(SiteConf, 'Site', 'MarkdownExts')) if ReadConf(SiteConf, 'Site', 'MarkdownExts') else ('attr_list', 'def_list', 'markdown_del_ins', 'md_in_html', 'mdx_subscript', 'mdx_superscript', 'tables')
MarkdownExts = literal_eval(Args.MarkdownExts) if Args.MarkdownExts else EvalOpt(ReadConf(SiteConf, 'Site', 'MarkdownExts')) if ReadConf(SiteConf, 'Site', 'MarkdownExts') else MarkdownExtsDefault
ActivityPubTypeFilter = Args.ActivityPubTypeFilter if Args.ActivityPubTypeFilter else 'Post'
FeedCategoryFilter = Args.FeedCategoryFilter if Args.FeedCategoryFilter else 'Blog'
Minify = StringBoolChoose(False, Args.Minify, ReadConf(SiteConf, 'Site', 'Minify'))

View File

@ -166,8 +166,9 @@ class FootnoteExtension(Extension):
div = etree.Element("div")
div.set('class', 'footnote')
etree.SubElement(div, "hr")
# etree.SubElement(div, "hr")
ol = etree.SubElement(div, "ol")
# ol = etree.Element("ol")
surrogate_parent = etree.Element("div")
for index, id in enumerate(self.footnotes.keys(), start=1):
@ -198,6 +199,7 @@ class FootnoteExtension(Extension):
p = etree.SubElement(li, "p")
return div
# return ol
class FootnoteBlockProcessor(BlockProcessor):

View File

@ -10,27 +10,9 @@
# TODO: Write the Python HTML2Gemtext converter
from Libs.bs4 import BeautifulSoup
from Modules.HTML import *
from Modules.Utils import *
# HTML tag names, one group per line: headings, text blocks/spans, inline
# formatting and links, generic containers, and list elements.
# NOTE(review): presumably the tags that take an explicit closing tag — confirm against usage.
ClosedTags = (
'h1', 'h2', 'h3', 'h4', 'h5', 'h6',
'p', 'span', 'pre', 'code',
'a', 'b', 'i', 'del', 'strong',
'div', 'details', 'summary',
'ol', 'ul', 'li', 'dl', 'dt', 'dd')
OpenTags = (
def StripAttrs(HTML):
    """Return HTML with the attributes removed from every tag that has neither 'href' nor 'src'.

    Tags that carry an 'href' or 'src' attribute are left untouched, so they
    keep all of their attributes. The markup is parsed with 'html.parser'
    and the result is the re-serialized tree.
    """
    Document = BeautifulSoup(HTML, 'html.parser')
    for Tag in Document.find_all():
        # Links and embedded resources must keep pointing at their targets.
        if 'href' in Tag.attrs or 'src' in Tag.attrs:
            continue
        Tag.attrs = {}
    return str(Document)
def FixGemlogDateLine(Line):
if len(Line) >= 2 and Line[0] == '[' and Line[1].isdigit():
Line = Line[1:]

View File

@ -10,10 +10,60 @@
from Libs.bs4 import BeautifulSoup
from Modules.Utils import *
# HTML tag names, one group per line: headings, text blocks/spans, inline
# formatting and links, generic containers, and list elements.
# NOTE(review): presumably the tags that take an explicit closing tag — confirm against usage.
ClosedTags = (
'h1', 'h2', 'h3', 'h4', 'h5', 'h6',
'p', 'span', 'pre', 'code',
'a', 'b', 'i', 'del', 'strong',
'div', 'details', 'summary',
'ol', 'ul', 'li', 'dl', 'dt', 'dd')
OpenTags = (
def MkSoup(HTML):
    """Parse an HTML string into a BeautifulSoup tree using the 'html.parser' backend."""
    Parser = 'html.parser'
    return BeautifulSoup(HTML, features=Parser)
def StripAttrs(HTML):
    """Return HTML with the attributes removed from every tag that has neither 'href' nor 'src'.

    Tags that carry an 'href' or 'src' attribute are left untouched, so they
    keep all of their attributes. The markup is parsed through MkSoup and
    the result is the re-serialized tree.
    """
    Document = MkSoup(HTML)
    for Tag in Document.find_all():
        # Links and embedded resources must keep pointing at their targets.
        if 'href' in Tag.attrs or 'src' in Tag.attrs:
            continue
        Tag.attrs = {}
    return str(Document)
def StripTags(HTML, ToStrip):
Soup = BeautifulSoup(HTML, 'html.parser')
Soup = MkSoup(HTML)
Tags = Soup.find_all()
for t in Tags:
if in ToStrip:
return str(Soup)
def AddToTagStartEnd(HTML, MatchStart, MatchEnd, AddStart, AddEnd):
    """Wrap the first MatchStart ... MatchEnd span of HTML with AddStart and AddEnd.

    AddStart is inserted immediately before the first occurrence of
    MatchStart, and AddEnd immediately after the first occurrence of
    MatchEnd that begins after that position. Only the first span is
    handled, and nested tags are not understood: the first MatchEnd found
    is taken as the closing one.

    Parameters:
        HTML: the markup string to patch.
        MatchStart: literal text marking the beginning of the span.
        MatchEnd: literal text marking the end of the span.
        AddStart: text to insert before the span; falsy means insert nothing.
        AddEnd: text to insert after the span; falsy means insert nothing.

    Returns:
        The patched string. HTML is returned unchanged when MatchStart does
        not occur; only AddStart is inserted when no MatchEnd follows it.
    """
    StartPos = HTML.find(MatchStart)
    if StartPos < 0:
        return HTML
    Prefix = AddStart or ''
    Suffix = AddEnd or ''
    # Compare positions explicitly instead of testing StartPos for truthiness:
    # a span that opens at index 0 is a valid match too.
    EndPos = HTML.find(MatchEnd, StartPos + 1)
    if EndPos < 0:
        return HTML[:StartPos] + Prefix + HTML[StartPos:]
    # Both positions are taken from the unmodified string, so inserting
    # Prefix cannot push the closing match out of the scanned range.
    CutPos = EndPos + len(MatchEnd)
    return HTML[:StartPos] + Prefix + HTML[StartPos:CutPos] + Suffix + HTML[CutPos:]
def SquareFnrefs(HTML): # Different combinations of formatting for Soup .prettify, .encode, .decode break different page elements, don't use this for now
Soup = MkSoup(HTML)
Tags = Soup.find_all('sup')
for t in Tags:
if 'id' in t.attrs and t.attrs['id'].startswith('fnref:'):
s = t.find('a')
return str(Soup.prettify(formatter=None))

View File

@ -13,6 +13,8 @@ try:
except ModuleNotFoundError:
from Libs.markdown import markdown
MarkdownExtsDefault = ('attr_list', 'def_list', 'footnotes', 'markdown_del_ins', 'md_in_html', 'mdx_subscript', 'mdx_superscript', 'tables')
def MarkdownHTMLEscape(Str, Extensions=()): # WIP
Text = ''
for i,e in enumerate(Str):

View File

@ -250,11 +250,15 @@ def CanIndex(Index, For):
def PatchHTML(File, HTML, PartsText, ContextParts, ContextPartsText, HTMLPagesList, PagePath, Content, Titles, Meta, SiteRoot, SiteName, BlogName, FolderRoots, Categories, SiteLang, Locale):
HTMLTitles = FormatTitles(Titles)
BodyDescription, BodyImage = '', ''
Parse = BeautifulSoup(Content, 'html.parser')
if not BodyDescription and Parse.p:
BodyDescription = Parse.p.get_text()[:150].replace('\n', ' ').replace('"', "'") + '...'
if not BodyImage and Parse.img and Parse.img['src']:
BodyImage = Parse.img['src']
Soup = BeautifulSoup(Content, 'html.parser')
if not BodyDescription and Soup.p:
BodyDescription = Soup.p.get_text()[:150].replace('\n', ' ').replace('"', "'") + '...'
if not BodyImage and Soup.img and Soup.img['src']:
BodyImage = Soup.img['src']
#Content = SquareFnrefs(Content)
Content = AddToTagStartEnd(Content, '<a class="footnote-ref"', '</a>', '[', ']')
Title = GetTitle(Meta, Titles, 'MetaTitle', BlogName)
Description = GetDescription(Meta, BodyDescription, 'MetaDescription')
@ -301,14 +305,14 @@ def PatchHTML(File, HTML, PartsText, ContextParts, ContextPartsText, HTMLPagesLi
# TODO: Clean this doubling?
ContentHTML = Content
ContentHTML = ContentHTML.replace('[HTML:Site:AbsoluteRoot]', SiteRoot)
ContentHTML = ContentHTML.replace('[HTML:Site:RelativeRoot]', GetPathLevels(PagePath))
ContentHTML = ReplWithEsc(ContentHTML, '[HTML:Site:AbsoluteRoot]', SiteRoot)
ContentHTML = ReplWithEsc(ContentHTML, '[HTML:Site:RelativeRoot]', GetPathLevels(PagePath))
for e in Meta['Macros']:
ContentHTML = ContentHTML.replace(f"[:{e}:]", Meta['Macros'][e])
ContentHTML = ReplWithEsc(ContentHTML, f"[:{e}:]", Meta['Macros'][e])
for e in FolderRoots:
ContentHTML = ContentHTML.replace(f"[HTML:Folder:{e}:AbsoluteRoot]", FolderRoots[e])
ContentHTML = ReplWithEsc(ContentHTML, f"[HTML:Folder:{e}:AbsoluteRoot]", FolderRoots[e])
for e in Categories:
ContentHTML = ContentHTML.replace(f"<span>[HTML:Category:{e}]</span>", Categories[e])
ContentHTML = ReplWithEsc(ContentHTML, f"<span>[HTML:Category:{e}]</span>", Categories[e])
SlimHTML = HTMLPagesList + ContentHTML
return HTML, ContentHTML, SlimHTML, Description, Image

View File

@ -99,6 +99,9 @@ def RevSort(List):
return List
def FirstRealItem(List):
    """Return the first truthy element of List.

    Raises StopIteration when List is empty or holds only falsy elements.
    """
    # filter(None, ...) yields only the truthy elements, in order.
    return next(filter(None, List))
def GetFullDate(Date):
if not Date:
return None

View File

@ -1,5 +1,6 @@
- Fix arguments - some can only be set from the CLI and not from Site.ini
- Fix the problem that occurs when Python-Markdown is installed (so that our modules are loaded)
- Postprocessing markdown footnotes to add brackets
- Hot-recompile
- Differential recompile
- Feed generation without native libraries