Working, but not really polished, Gemtext generation

This commit is contained in:
octospacc 2022-06-29 20:09:13 +02:00
parent fc3c0b8be6
commit 1962a808bc
5 changed files with 126 additions and 26 deletions

View File

@ -17,8 +17,11 @@ Feel free to experiment with all of this stuff!
- (Included) [htmlmin == 0.1.12](https://pypi.org/project/htmlmin) - (Included) [htmlmin == 0.1.12](https://pypi.org/project/htmlmin)
- [node == 12.22.5](https://nodejs.org) - [npm == 7.5.2](https://www.npmjs.com) - [node == 12.22.5](https://nodejs.org) - [npm == 7.5.2](https://www.npmjs.com)
- (Included) [pug-cli == 1.0.0-alpha6](https://npmjs.com/package/pug-cli) - (Included) [pug-cli == 1.0.0-alpha6](https://npmjs.com/package/pug-cli)
- [Go](https://go.dev)
- [html2gmi](https://github.com/LukeEmmet/html2gmi)
## Features roadmap ## Features roadmap
- [ ] Polished Gemtext generation
- [x] Autodetection of pages and posts - [x] Autodetection of pages and posts
- [x] Info for posts shown on their page - [x] Info for posts shown on their page
- [x] HTML minification - [x] HTML minification

View File

@ -13,13 +13,19 @@ import os
import shutil import shutil
from ast import literal_eval from ast import literal_eval
from datetime import datetime from datetime import datetime
from pathlib import Path
# Our local Markdown patches conflict if the module is installed on the system, so first try to import from system
try:
from markdown import markdown
except ModuleNotFoundError:
from Libs.markdown import markdown
from Libs import htmlmin from Libs import htmlmin
from Libs.bs4 import BeautifulSoup from Libs.bs4 import BeautifulSoup
from Libs.markdown import Markdown
from Libs.markdown import markdown
from pathlib import Path
from Modules.Feed import * from Modules.Feed import *
from Modules.Gemini import * from Modules.Gemini import *
from Modules.Pug import *
from Modules.Utils import * from Modules.Utils import *
Extensions = { Extensions = {
@ -101,7 +107,11 @@ def FormatTitles(Titles):
DashyTitles += [DashyTitle] DashyTitles += [DashyTitle]
Title = '[{}](#{})'.format(Title, DashyTitle) Title = '[{}](#{})'.format(Title, DashyTitle)
MDTitles += Heading + Title + '\n' MDTitles += Heading + Title + '\n'
return Markdown().convert(MDTitles) return markdown(MDTitles)
# https://stackoverflow.com/a/15664273
def IgnoreFiles(Dir, Files):
    """Return the names in Files that are regular files inside Dir.

    Meant to be handed to shutil.copytree as its `ignore` callback: every
    regular file is reported as "to ignore", so only the directory
    structure gets replicated.

    Dir:   directory currently being visited.
    Files: names of the entries found in Dir.
    """
    Ignored = []
    for Name in Files:
        if os.path.isfile(os.path.join(Dir, Name)):
            Ignored.append(Name)
    return Ignored
def LoadFromDir(Dir, Rglob): def LoadFromDir(Dir, Rglob):
Contents = {} Contents = {}
@ -169,16 +179,6 @@ def PreProcessor(Path, SiteRoot):
Content += l + '\n' Content += l + '\n'
return Content, Titles, Meta return Content, Titles, Meta
def PugCompileList(Pages):
    """Write every '.pug' page under 'public/' and compile them in one pug-cli run.

    Pages: iterable of (File, Content, Titles, Meta) tuples. Only entries
           whose File ends with '.pug' are written out and compiled.
    Returns None.
    """
    import subprocess
    import sys
    # pug-cli seems to choke on folder paths as input, so every file is passed as an explicit argument
    PugPaths = []
    for File, Content, Titles, Meta in Pages:
        if File.endswith('.pug'):
            PugPath = 'public/{}'.format(File)
            WriteFile(PugPath, Content)
            PugPaths.append(PugPath)
    if not PugPaths:
        # Nothing to compile. With no file arguments pug-cli appears to fall
        # back to reading stdin, which would stall the build.
        return
    try:
        # An argument list (no shell) keeps spaces, quotes and '$' in file
        # names from being interpreted by a shell.
        subprocess.run(['pug', '-P'] + PugPaths, stdout=subprocess.DEVNULL)
    except FileNotFoundError:
        # Stay best-effort like the old os.system call: report and carry on
        print('pug: command not found', file=sys.stderr)
def MakeContentHeader(Meta, Locale, Categories=''): def MakeContentHeader(Meta, Locale, Categories=''):
Header = '' Header = ''
if Meta['Type'] == 'Post': if Meta['Type'] == 'Post':
@ -187,7 +187,7 @@ def MakeContentHeader(Meta, Locale, Categories=''):
Header += '{} {} \n'.format(Locale[i], Meta[i]) Header += '{} {} \n'.format(Locale[i], Meta[i])
if Categories: if Categories:
Header += '{}: {} \n'.format(Locale['Categories'], Categories) Header += '{}: {} \n'.format(Locale['Categories'], Categories)
return Markdown().convert(Header) return markdown(Header)
def MakeCategoryLine(Meta, Reserved): def MakeCategoryLine(Meta, Reserved):
Categories = '' Categories = ''
@ -243,10 +243,13 @@ def PatchHTML(Base, PartsText, ContextParts, ContextPartsText, HTMLPagesList, Pa
for i in Categories: for i in Categories:
Base = Base.replace('<span>[HTML:Category:{}]</span>'.format(i), Categories[i]) Base = Base.replace('<span>[HTML:Category:{}]</span>'.format(i), Categories[i])
# TODO: Clean this doubling?
Content = Content.replace('[HTML:Site:AbsoluteRoot]', SiteRoot) Content = Content.replace('[HTML:Site:AbsoluteRoot]', SiteRoot)
Content = Content.replace('[HTML:Site:RelativeRoot]', GetLevels(PagePath)) Content = Content.replace('[HTML:Site:RelativeRoot]', GetLevels(PagePath))
for i in FolderRoots: for i in FolderRoots:
Content = Content.replace('[HTML:Folder:{}:AbsoluteRoot]'.format(i), FolderRoots[i]) Content = Content.replace('[HTML:Folder:{}:AbsoluteRoot]'.format(i), FolderRoots[i])
for i in Categories:
Content = Content.replace('<span>[HTML:Category:{}]</span>'.format(i), Categories[i])
return Base, Content, Description, Image return Base, Content, Description, Image
@ -311,12 +314,14 @@ def GetHTMLPagesList(Pages, SiteRoot, PathPrefix, Type='Page', Category=None, Fo
Levels = '- ' * n Levels = '- ' * n
Title = MakeListTitle(File, Meta, Titles, 'HTMLTitle', SiteRoot, PathPrefix) Title = MakeListTitle(File, Meta, Titles, 'HTMLTitle', SiteRoot, PathPrefix)
List += Levels + Title + '\n' List += Levels + Title + '\n'
return Markdown().convert(List) return markdown(List)
def DelTmp(): def DelTmp():
for Ext in Extensions['Pages']: for Ext in Extensions['Pages']:
for File in Path('public').rglob('*.{}'.format(Ext)): for File in Path('public').rglob('*.{}'.format(Ext)):
os.remove(File) os.remove(File)
for File in Path('public').rglob('*.tmp'):
os.remove(File)
def RevSort(List): def RevSort(List):
List.sort() List.sort()
@ -394,7 +399,7 @@ def MakeSite(TemplatesText, PartsText, ContextParts, ContextPartsText, SiteName,
For='Menu') For='Menu')
PagePath = 'public/{}.html'.format(StripExt(File)) PagePath = 'public/{}.html'.format(StripExt(File))
if File.endswith('.md'): if File.endswith('.md'):
Content = markdown(Content, extensions=['attr_list']) Content = markdown(Content, extensions=['attr_list']) # TODO: Configurable extensions?
elif File.endswith('.pug'): elif File.endswith('.pug'):
Content = ReadFile(PagePath) Content = ReadFile(PagePath)
HTML, HTMLContent, Description, Image = PatchHTML( HTML, HTMLContent, Description, Image = PatchHTML(
@ -447,8 +452,12 @@ def Main(Args):
ResetPublic() ResetPublic()
if os.path.isdir('Pages'): if os.path.isdir('Pages'):
shutil.copytree('Pages', 'public') shutil.copytree('Pages', 'public')
if Args.GemtextOut:
shutil.copytree('Pages', 'public.gmi', ignore=IgnoreFiles)
if os.path.isdir('Posts'): if os.path.isdir('Posts'):
shutil.copytree('Posts', 'public/Posts') shutil.copytree('Posts', 'public/Posts')
if Args.GemtextOut:
shutil.copytree('Posts', 'public.gmi/Posts', ignore=IgnoreFiles)
Pages = MakeSite( Pages = MakeSite(
TemplatesText=LoadFromDir('Templates', '*.html'), TemplatesText=LoadFromDir('Templates', '*.html'),
@ -475,10 +484,12 @@ def Main(Args):
Lang=SiteLang, Lang=SiteLang,
Minify=True if Args.Minify and Args.Minify not in ('False', 'None') else False) Minify=True if Args.Minify and Args.Minify not in ('False', 'None') else False)
#HTML2Gemtext( if Args.GemtextOut:
# Pages=Pages, GemtextCompileList(Pages)
# SiteName=SiteName, #HTML2Gemtext(
# SiteTagline=SiteTagline) # Pages=Pages,
# SiteName=SiteName,
# SiteTagline=SiteTagline)
DelTmp() DelTmp()
os.system("cp -R Assets/* public/") os.system("cp -R Assets/* public/")
@ -493,6 +504,7 @@ if __name__ == '__main__':
Parser.add_argument('--SiteDomain', type=str) Parser.add_argument('--SiteDomain', type=str)
Parser.add_argument('--SiteTagline', type=str) Parser.add_argument('--SiteTagline', type=str)
Parser.add_argument('--FeedEntries', type=int) Parser.add_argument('--FeedEntries', type=int)
Parser.add_argument('--GemtextOut', type=bool)
Parser.add_argument('--FolderRoots', type=str) Parser.add_argument('--FolderRoots', type=str)
Parser.add_argument('--ContextParts', type=str) Parser.add_argument('--ContextParts', type=str)
Parser.add_argument('--ReservedPaths', type=str) Parser.add_argument('--ReservedPaths', type=str)

View File

@ -7,6 +7,8 @@
| Copyright (C) 2022, OctoSpacc | | Copyright (C) 2022, OctoSpacc |
| ================================= """ | ================================= """
# TODO: Either switch feed generation lib, or rewrite the 'lxml' module, so that no modules have to be compiled and the program is 100% portable
from Libs.feedgen.feed import FeedGenerator from Libs.feedgen.feed import FeedGenerator
from Modules.Utils import * from Modules.Utils import *

View File

@ -7,15 +7,75 @@
| Copyright (C) 2022, OctoSpacc | | Copyright (C) 2022, OctoSpacc |
| ================================= """ | ================================= """
# TODO: Write the Python HTML2Gemtext converter
from Libs.bs4 import BeautifulSoup from Libs.bs4 import BeautifulSoup
from Modules.Utils import * from Modules.Utils import *
def HTML2Gemtext(Pages, SiteName, SiteTagline): ClosedTags = (
os.mkdir('public.gmi') 'h1', 'h2', 'h3', 'h4', 'h5', 'h6',
'p', 'span', 'pre', 'code',
'a', 'b', 'i', 'del', 'strong',
'div', 'details', 'summary',
'ol', 'ul', 'li', 'dl', 'dt', 'dd')
# Tags that have no closing counterpart.
# The trailing comma matters: without it this is the plain string 'img',
# and `name in OpenTags` would be a substring test ('i', 'm', 'g' match).
OpenTags = (
    'img',)
def GemtextCompileList(Pages):
for File, Content, Titles, Meta, HTMLContent, Description, Image in Pages: for File, Content, Titles, Meta, HTMLContent, Description, Image in Pages:
Parse = BeautifulSoup(HTMLContent, 'html.parser') Src = 'public/{}.html.tmp'.format(StripExt(File))
# We should first get the most basic HTML elements, convert them to Gemtext, then replace the Gemtext in the original full HTML, and then removing <p> tags? WriteFile(Src, HTMLContent)
#print(File, Parse.find_all('p'), Parse.find_all('li')) Dst = 'public.gmi/{}.gmi'.format(StripExt(File))
os.system('cat {} | html2gmi > {}'.format(Src, Dst))
def FindEarliest(Str, Items):
    """Locate whichever of Items occurs first in Str.

    Str:   string to search.
    Items: iterable of substrings to look for.

    Returns a (Pos, Item) tuple: the index of the earliest match and the
    item that matched there. When several items match at the same index,
    the one listed first in Items wins. If nothing matches (or Items is
    empty) the result is (-1, ''), mirroring str.find.
    """
    Pos, Item = -1, ''
    for Candidate in Items:
        Found = Str.find(Candidate)
        # Keep the candidate only if it is present and strictly earlier
        if Found != -1 and (Pos == -1 or Found < Pos):
            Pos, Item = Found, Candidate
    return Pos, Item
def ParseTag(Content):
    """Debug helper: echo Content, then parse its string form as HTML.

    Content is printed as-is; str(Content) is fed to BeautifulSoup using
    'html.parser' and the first element is looked up. Nothing is returned.
    """
    print(Content)
    Soup = BeautifulSoup(str(Content), 'html.parser')
    # First element of the parsed fragment; looked up but not used yet
    FirstTag = Soup.find()
def HTML2Gemtext(Pages, SiteName, SiteTagline):
    """Experimental HTML -> Gemtext converter; writes one '.gmi' file per page.

    Pages:       iterable of (File, Content, Titles, Meta, HTMLContent,
                 Description, Image) tuples; only File and HTMLContent are read.
    SiteName:    not used by the current body.
    SiteTagline: not used by the current body.

    For each page the HTML is consumed from the front: the first element is
    parsed, a Gemtext prefix is picked from its tag name, its text is
    appended, and the consumed part is sliced off. The result goes to
    'public.gmi/<File without extension>.gmi' through WriteFile.

    NOTE(review): the nesting below was reconstructed from a flattened view
    of the code -- confirm it against the real file.
    """
    #os.mkdir('public.gmi')
    for File, Content, Titles, Meta, HTMLContent, Description, Image in Pages:
        Gemtext = ''
        # The tuple's Content is discarded; the loop works on the rendered HTML
        Content = HTMLContent
        print(File)
        while len(Content) != 0:
            # Offsets of the first '<' and the first '>' in what is left
            BlockStart = Content.find('<')
            TagEnd = Content.find('>')
            Parse = BeautifulSoup(Content, 'html.parser')
            Tag = Parse.find()
            # NOTE(review): presumably find() gives None once only text or
            # whitespace is left, which would make Tag.contents raise -- confirm
            #if Tag.name in ('a'):
            #	if 'href' in Tag.attrs:
            #		pass
            for i in Tag.contents:
                ParseTag(i)
            # h1-h3 map to one to three '#', taken from the digit in the tag name
            if Tag.name in ('h1', 'h2', 'h3'):
                Gemtext += '#' * int(Tag.name[1]) + ' '
            elif Tag.name in ('h4', 'h5', 'h6'):
                Gemtext += '### '
            # NOTE(review): ('li') is a plain string, so this is a substring
            # test and also matches tag names 'l' and 'i'
            elif Tag.name in ('li'):
                Gemtext += '* '
            Gemtext += str(Tag.get_text()) + '\n\n'
            #print(File, Tag.name, len(Tag.contents))
            # Pick the text that marks the end of this element
            if Tag.name in ClosedTags:
                Str = '</{}>'.format(Tag.name)
            elif Tag.name in OpenTags:
                Str = '>'
            # NOTE(review): for a tag name in neither collection, Str is
            # unbound on the first pass or stale from an earlier one
            BlockEnd = Content.find(Str) + len(Str)
            # NOTE(review): replace() removes every occurrence of both slices,
            # and BlockEnd was computed before Content got shorter -- verify
            # that the slice below still lands where intended
            Content = Content.replace(Content[BlockStart:TagEnd], '').replace(Content[BlockEnd-len(Str):BlockEnd], '')
            #print(BlockStart, TagEnd, BlockEnd, Tag.contents)
            #print(Content[BlockStart:BlockEnd])
            #Gemtext += Content[BlockStart:BlockEnd]
            Content = Content[BlockEnd:]
        PagePath = 'public.gmi/{}.gmi'.format(StripExt(File))
        WriteFile(PagePath, Gemtext)
        #exit()
""" Gemtext: """ Gemtext:
# h1 # h1

23
Source/Modules/Pug.py Normal file
View File

@ -0,0 +1,23 @@
""" ================================= |
| This file is part of |
| staticoso |
| Just a simple Static Site Generator |
| |
| Licensed under the AGPLv3 license |
| Copyright (C) 2022, OctoSpacc |
| ================================= """
# TODO: Write a native Pug parser; There is one already available for Python but seems broken / out-of-date
import os
from Modules.Utils import *
def PugCompileList(Pages):
    """Write every '.pug' page under 'public/' and compile them in one pug-cli run.

    Pages: iterable of (File, Content, Titles, Meta) tuples. Only entries
           whose File ends with '.pug' are written out and compiled.
    Returns None.
    """
    import subprocess
    import sys
    # pug-cli seems to choke on folder paths as input, so every file is passed as an explicit argument
    PugPaths = []
    for File, Content, Titles, Meta in Pages:
        if File.endswith('.pug'):
            PugPath = 'public/{}'.format(File)
            WriteFile(PugPath, Content)
            PugPaths.append(PugPath)
    if not PugPaths:
        # Nothing to compile. With no file arguments pug-cli appears to fall
        # back to reading stdin, which would stall the build.
        return
    try:
        # An argument list (no shell) keeps spaces, quotes and '$' in file
        # names from being interpreted by a shell.
        subprocess.run(['pug', '-P'] + PugPaths, stdout=subprocess.DEVNULL)
    except FileNotFoundError:
        # Stay best-effort like the old os.system call: report and carry on
        print('pug: command not found', file=sys.stderr)