Working, but not yet polished, Gemtext generation

2025-03-13 09:40:17 +01:00 · 2022-06-29 20:09:13 +02:00 · 2022-06-29 20:09:13 +02:00 · 1962a808bc
commit 1962a808bc
parent fc3c0b8be6
5 changed files with 126 additions and 26 deletions
--- a/README.md
+++ b/README.md
@@ -17,8 +17,11 @@ Feel free to experiment with all of this stuff!
 - (Included) [htmlmin == 0.1.12](https://pypi.org/project/htmlmin)
 - [node == 12.22.5](https://nodejs.org) - [npm == 7.5.2](https://www.npmjs.com)
 - (Included) [pug-cli == 1.0.0-alpha6](https://npmjs.com/package/pug-cli)
+- [Go](https://go.dev)
+- [html2gmi](https://github.com/LukeEmmet/html2gmi)

 ## Features roadmap
+- [ ] Polished Gemtext generation
 - [x] Autodetection of pages and posts
 - [x] Info for posts shown on their page
 - [x] HTML minification
--- a/Source/Build.py
+++ b/Source/Build.py
@@ -13,13 +13,19 @@ import os
 import shutil
 from ast import literal_eval
 from datetime import datetime
+from pathlib import Path
+
+# Our local Markdown patches conflict if the module is installed on the system, so first try to import from system
+try:
+	from markdown import markdown
+except ModuleNotFoundError:
+	from Libs.markdown import markdown
+
 from Libs import htmlmin
 from Libs.bs4 import BeautifulSoup
-from Libs.markdown import Markdown
-from Libs.markdown import markdown
-from pathlib import Path
 from Modules.Feed import *
 from Modules.Gemini import *
+from Modules.Pug import *
 from Modules.Utils import *

 Extensions = {
@@ -101,7 +107,11 @@ def FormatTitles(Titles):
 		DashyTitles += [DashyTitle]
 		Title = '[{}](#{})'.format(Title, DashyTitle)
 		MDTitles += Heading + Title + '\n'
-	return Markdown().convert(MDTitles)
+	return markdown(MDTitles)
+
+# https://stackoverflow.com/a/15664273
+def IgnoreFiles(Dir, Files):
+    return [f for f in Files if os.path.isfile(os.path.join(Dir, f))]

 def LoadFromDir(Dir, Rglob):
 	Contents = {}
@@ -169,16 +179,6 @@ def PreProcessor(Path, SiteRoot):
 					Content += l + '\n'
 	return Content, Titles, Meta

-def PugCompileList(Pages):
-	# Pug-cli seems to shit itself with folder paths as input, so we pass ALL the files as arguments
-	Paths = ''
-	for File, Content, Titles, Meta in Pages:
-		if File.endswith('.pug'):
-			Path = 'public/{}'.format(File)
-			WriteFile(Path, Content)
-			Paths += '"{}" '.format(Path)
-	os.system('pug -P {} > /dev/null'.format(Paths))
-
 def MakeContentHeader(Meta, Locale, Categories=''):
 	Header = ''
 	if Meta['Type'] == 'Post':
@@ -187,7 +187,7 @@ def MakeContentHeader(Meta, Locale, Categories=''):
 				Header += '{} {}  \n'.format(Locale[i], Meta[i])
 		if Categories:
 			Header += '{}: {}  \n'.format(Locale['Categories'], Categories)
-	return Markdown().convert(Header)
+	return markdown(Header)

 def MakeCategoryLine(Meta, Reserved):
 	Categories = ''
@@ -243,10 +243,13 @@ def PatchHTML(Base, PartsText, ContextParts, ContextPartsText, HTMLPagesList, Pa
 	for i in Categories:
 		Base = Base.replace('<span>[HTML:Category:{}]</span>'.format(i), Categories[i])

+	# TODO: Clean this doubling?
 	Content = Content.replace('[HTML:Site:AbsoluteRoot]', SiteRoot)
 	Content = Content.replace('[HTML:Site:RelativeRoot]', GetLevels(PagePath))
 	for i in FolderRoots:
 		Content = Content.replace('[HTML:Folder:{}:AbsoluteRoot]'.format(i), FolderRoots[i])
+	for i in Categories:
+		Content = Content.replace('<span>[HTML:Category:{}]</span>'.format(i), Categories[i])

 	return Base, Content, Description, Image

@@ -311,12 +314,14 @@ def GetHTMLPagesList(Pages, SiteRoot, PathPrefix, Type='Page', Category=None, Fo
 				Levels = '- ' * n
 				Title = MakeListTitle(File, Meta, Titles, 'HTMLTitle', SiteRoot, PathPrefix)
 				List += Levels + Title + '\n'
-	return Markdown().convert(List)
+	return markdown(List)

 def DelTmp():
 	for Ext in Extensions['Pages']:
 		for File in Path('public').rglob('*.{}'.format(Ext)):
 			os.remove(File)
+	for File in Path('public').rglob('*.tmp'):
+		os.remove(File)

 def RevSort(List):
 	List.sort()
@@ -394,7 +399,7 @@ def MakeSite(TemplatesText, PartsText, ContextParts, ContextPartsText, SiteName,
 			For='Menu')
 		PagePath = 'public/{}.html'.format(StripExt(File))
 		if File.endswith('.md'):
-			Content = markdown(Content, extensions=['attr_list'])
+			Content = markdown(Content, extensions=['attr_list']) # TODO: Configurable extensions?
 		elif File.endswith('.pug'):
 			Content = ReadFile(PagePath)
 		HTML, HTMLContent, Description, Image = PatchHTML(
@@ -447,8 +452,12 @@ def Main(Args):
 	ResetPublic()
 	if os.path.isdir('Pages'):
 		shutil.copytree('Pages', 'public')
+		if Args.GemtextOut:
+			shutil.copytree('Pages', 'public.gmi', ignore=IgnoreFiles)
 	if os.path.isdir('Posts'):
 		shutil.copytree('Posts', 'public/Posts')
+		if Args.GemtextOut:
+			shutil.copytree('Posts', 'public.gmi/Posts', ignore=IgnoreFiles)

 	Pages = MakeSite(
 		TemplatesText=LoadFromDir('Templates', '*.html'),
@@ -475,10 +484,12 @@ def Main(Args):
 			Lang=SiteLang,
 			Minify=True if Args.Minify and Args.Minify not in ('False', 'None') else False)

-	#HTML2Gemtext(
-	#	Pages=Pages,
-	#	SiteName=SiteName,
-	#	SiteTagline=SiteTagline)
+	if Args.GemtextOut:
+		GemtextCompileList(Pages)
+		#HTML2Gemtext(
+		#	Pages=Pages,
+		#	SiteName=SiteName,
+		#	SiteTagline=SiteTagline)

 	DelTmp()
 	os.system("cp -R Assets/* public/")
@@ -493,6 +504,7 @@ if __name__ == '__main__':
 	Parser.add_argument('--SiteDomain', type=str)
 	Parser.add_argument('--SiteTagline', type=str)
 	Parser.add_argument('--FeedEntries', type=int)
+	Parser.add_argument('--GemtextOut', type=bool)
 	Parser.add_argument('--FolderRoots', type=str)
 	Parser.add_argument('--ContextParts', type=str)
 	Parser.add_argument('--ReservedPaths', type=str)
--- a/Source/Modules/Feed.py
+++ b/Source/Modules/Feed.py
@@ -7,6 +7,8 @@
 |   Copyright (C) 2022, OctoSpacc     |
 | ================================= """

+# TODO: Either switch feed generation lib, or rewrite the 'lxml' module, so that no modules have to be compiled and the program is 100% portable
+
 from Libs.feedgen.feed import FeedGenerator
 from Modules.Utils import *

--- a/Source/Modules/Gemini.py
+++ b/Source/Modules/Gemini.py
@@ -7,15 +7,75 @@
 |   Copyright (C) 2022, OctoSpacc     |
 | ================================= """

+# TODO: Write the Python HTML2Gemtext converter
+
 from Libs.bs4 import BeautifulSoup
 from Modules.Utils import *

-def HTML2Gemtext(Pages, SiteName, SiteTagline):
-	os.mkdir('public.gmi')
+ClosedTags = (
+	'h1', 'h2', 'h3', 'h4', 'h5', 'h6',
+	'p', 'span', 'pre', 'code',
+	'a', 'b', 'i', 'del', 'strong',
+	'div', 'details', 'summary',
+	'ol', 'ul', 'li', 'dl', 'dt', 'dd')
+OpenTags = (
+	'img')
+
+def GemtextCompileList(Pages):
 	for File, Content, Titles, Meta, HTMLContent, Description, Image in Pages:
-		Parse = BeautifulSoup(HTMLContent, 'html.parser')
-		# We should first get the most basic HTML elements, convert them to Gemtext, then replace the Gemtext in the original full HTML, and then removing <p> tags?
-		#print(File, Parse.find_all('p'), Parse.find_all('li'))
+		Src = 'public/{}.html.tmp'.format(StripExt(File))
+		WriteFile(Src, HTMLContent)
+		Dst = 'public.gmi/{}.gmi'.format(StripExt(File))
+		os.system('cat {} | html2gmi > {}'.format(Src, Dst))
+
+def FindEarliest(Str, Items):
+	Pos, Item = 0, ''
+	for Item in Items:
+		Str.find(Item)
+	return Pos, Item
+
+def ParseTag(Content):
+	print(Content)
+	Parse = BeautifulSoup(str(Content), 'html.parser')
+	Tag = Parse.find()
+
+def HTML2Gemtext(Pages, SiteName, SiteTagline):
+	#os.mkdir('public.gmi')
+	for File, Content, Titles, Meta, HTMLContent, Description, Image in Pages:
+		Gemtext = ''
+		Content = HTMLContent
+		print(File)
+		while len(Content) != 0:
+			BlockStart = Content.find('<')
+			TagEnd = Content.find('>')
+			Parse = BeautifulSoup(Content, 'html.parser')
+			Tag = Parse.find()
+			#if Tag.name in ('a'):
+			#	if 'href' in Tag.attrs:
+			#		pass
+			for i in Tag.contents:
+				ParseTag(i)
+			if Tag.name in ('h1', 'h2', 'h3'):
+				Gemtext += '#' * int(Tag.name[1]) + ' '
+			elif Tag.name in ('h4', 'h5', 'h6'):
+				Gemtext += '### '
+			elif Tag.name in ('li'):
+				Gemtext += '* '
+			Gemtext += str(Tag.get_text()) + '\n\n'
+			#print(File, Tag.name, len(Tag.contents))
+			if Tag.name in ClosedTags:
+				Str = '</{}>'.format(Tag.name)
+			elif Tag.name in OpenTags:
+				Str = '>'
+			BlockEnd = Content.find(Str) + len(Str)
+			Content = Content.replace(Content[BlockStart:TagEnd], '').replace(Content[BlockEnd-len(Str):BlockEnd], '')
+			#print(BlockStart, TagEnd, BlockEnd, Tag.contents)
+			#print(Content[BlockStart:BlockEnd])
+			#Gemtext += Content[BlockStart:BlockEnd]
+			Content = Content[BlockEnd:]
+		PagePath = 'public.gmi/{}.gmi'.format(StripExt(File))
+		WriteFile(PagePath, Gemtext)
+		#exit()

 """ Gemtext:
 # h1
--- a/Source/Modules/Pug.py
+++ b/Source/Modules/Pug.py
@@ -0,0 +1,23 @@
+""" ================================= |
+| This file is part of                |
+|   staticoso                         |
+| Just a simple Static Site Generator |
+|                                     |
+| Licensed under the AGPLv3 license   |
+|   Copyright (C) 2022, OctoSpacc     |
+| ================================= """
+
+# TODO: Write a native Pug parser; There is one already available for Python but seems broken / out-of-date
+
+import os
+from Modules.Utils import *
+
+def PugCompileList(Pages):
+	# Pug-cli seems to shit itself with folder paths as input, so we pass ALL the files as arguments
+	Paths = ''
+	for File, Content, Titles, Meta in Pages:
+		if File.endswith('.pug'):
+			Path = 'public/{}'.format(File)
+			WriteFile(Path, Content)
+			Paths += '"{}" '.format(Path)
+	os.system('pug -P {} > /dev/null'.format(Paths))