Browse Source

Cleaned code a bit, added HTML minification

main
octospacc 6 months ago
parent
commit
2cc2bfc62a
  1. 1
      .gitignore
  2. 4
      Locale/en.json
  3. 4
      Locale/it.json
  4. 5
      README.md
  5. 172
      Source/Build.py
  6. 30
      Source/Libs/htmlmin/__init__.py
  7. 175
      Source/Libs/htmlmin/command.py
  8. 64
      Source/Libs/htmlmin/decorator.py
  9. 204
      Source/Libs/htmlmin/escape.py
  10. 193
      Source/Libs/htmlmin/main.py
  11. 92
      Source/Libs/htmlmin/middleware.py
  12. 408
      Source/Libs/htmlmin/parser.py
  13. 139
      Source/Libs/htmlmin/python3html/__init__.py
  14. 481
      Source/Libs/htmlmin/python3html/parser.py

1
.gitignore

@ -0,0 +1 @@
*.pyc

4
Locale/en.json

@ -0,0 +1,4 @@
{
"CreatedOn": "Created on",
"EditedOn": "Edited on"
}

4
Locale/it.json

@ -0,0 +1,4 @@
{
"CreatedOn": "Creato in data",
"EditedOn": "Modificato in data"
}

5
README.md

@ -12,9 +12,11 @@ Feel free to experiment with all of this stuff!
## Dependencies
- [Python >= 3.10.4](https://python.org)
- [Python Markdown >= 3.3.7](https://pypi.org/project/Markdown)
- (Included) [htmlmin >= 0.1.12](https://pypi.org/project/htmlmin)
- [pug-cli >= 1.0.0-alpha6](https://npmjs.com/package/pug-cli)
## Features roadmap
- [x] HTML minification
- [ ] Open Graph support
- [x] Custom categories for posts
- [x] Custom static page parts programmable by context
@ -32,6 +34,5 @@ Feel free to experiment with all of this stuff!
- [x] Generation of titles in right sidebar with clickable links
- [x] Detection of titles in a page
- [x] Custom static page parts by template
- [x] Pug support for pages
- [x] Markdown support for pages
- [x] Markdown + Pug support for pages
- [x] First working version

172
Source/Build.py

@ -8,12 +8,17 @@
| ================================= """
import argparse
import json
from Libs import htmlmin
import os
import shutil
from ast import literal_eval
from markdown import Markdown
from pathlib import Path
# File extensions (without the leading dot) of the source files that are
# globbed as site pages when building and when cleaning up 'public'.
Extensions = {
'Pages': ('md', 'pug')}
def ReadFile(p):
try:
with open(p, 'r') as f:
@ -31,6 +36,15 @@ def WriteFile(p, c):
print("Error writing file {}".format(p))
return False
def LoadLocale(Lang):
    """Load the translation table for Lang from the Locale folder.

    The folder is resolved relative to this source file ('../Locale/').
    When '<Lang>.json' yields no content, 'en.json' is loaded instead.

    Returns the parsed JSON document.
    """
    LocaleDir = os.path.dirname(os.path.abspath(__file__)) + '/../Locale/'
    Text = ReadFile(LocaleDir + Lang + '.json')
    if not Text:
        # Requested language missing or empty: fall back to English.
        Text = ReadFile(LocaleDir + 'en.json')
    return json.loads(Text)
def StripExt(Path):
    """Return Path without its final file extension.

    Only the extension of the last path component is removed, so a dot
    inside a directory name is left alone, and a path that has no
    extension is returned unchanged instead of being reduced to ''.
    """
    Root, _Ext = os.path.splitext(Path)
    return Root
@ -44,20 +58,6 @@ def GetLevels(Path, Sub=0, AsNum=False):
n = Path.count('/')
return n if AsNum else '../' * n
def GetDeepest(Paths):
    """Return the largest directory depth found among Paths (0 if empty).

    The depth of each path is obtained from GetLevels in numeric mode.
    """
    # AsNum must be passed by keyword: the second positional parameter of
    # GetLevels is Sub, so GetLevels(p, True) returned a '../' string that
    # cannot be compared with the integer running maximum.
    Deepest = max((GetLevels(p, AsNum=True) for p in Paths), default=0)
    print(Deepest)
    return Deepest
def GetRelative(Path, Levels):
    """Return '../' repeated Levels times.

    Path is only echoed to stdout together with Levels; it does not
    influence the result.
    """
    print(Path, Levels)
    Prefix = ''.join('../' for _ in range(Levels))
    return Prefix
def DashifyStr(s, Limit=32):
Str, lc = '', Limit
for c in s[:Limit].replace(' ','-').replace(' ','-'):
@ -97,14 +97,13 @@ def GetTitleIdLine(Line, Title, Type):
NewLine += Line[Index+2:]
return NewLine
def MakeListTitle(File, Meta, Titles, Prefer, SiteRoot, CurLevels, PathPrefix=''):
print(PathPrefix)
def MakeListTitle(File, Meta, Titles, Prefer, SiteRoot, PathPrefix=''):
Title = GetTitle(Meta, Titles, Prefer)
Link = False if Meta['Index'] == 'Unlinked' else True
if Link:
Title = '[{}]({})'.format(
Title,
'{}{}.html'.format(PathPrefix, StripExt(File))) #(GetRelative(File, CurLevels), StripExt(File)))
'{}{}.html'.format(PathPrefix, StripExt(File)))
if Meta['Type'] == 'Post' and Meta['CreatedOn']:
Title = '[{}] {}'.format(
Meta['CreatedOn'],
@ -193,20 +192,16 @@ def PugCompileList(Pages):
Paths += '"{}" '.format(Path)
os.system('pug -P {} > /dev/null'.format(Paths))
def MakeContentHeader(Meta, Locale):
    """Build the rendered header holding a post's creation and edit dates.

    Meta: page metadata; the 'Type', 'CreatedOn' and 'EditedOn' entries
        are read.
    Locale: mapping that provides the 'CreatedOn' and 'EditedOn' labels.

    Returns the header converted from Markdown. Only pages whose Type is
    'Post' get date lines, one per date that is set.
    """
    Header = ''
    if Meta['Type'] == 'Post':
        # The metadata keys double as locale keys, so one loop covers both
        # dates and no language is hard-coded here.
        for Key in ('CreatedOn', 'EditedOn'):
            if Meta[Key]:
                # NOTE(review): Markdown needs two trailing spaces for a hard
                # line break; confirm the single space before '\n' is intended.
                Header += "{} {} \n".format(Locale[Key], Meta[Key])
    return Markdown().convert(Header)
def PatchHTML(Template, PartsText, ContextParts, ContextPartsText, HTMLPagesList, PagePath, Content, Titles, Meta, SiteRoot, FolderRoots, Categories):
print(PagePath)
def PatchHTML(Template, PartsText, ContextParts, ContextPartsText, HTMLPagesList, PagePath, Content, Titles, Meta, SiteRoot, FolderRoots, Categories, Locale):
HTMLTitles = FormatTitles(Titles)
for Line in Template.splitlines():
Line = Line.lstrip().rstrip()
@ -233,7 +228,7 @@ def PatchHTML(Template, PartsText, ContextParts, ContextPartsText, HTMLPagesList
Template = Template.replace('[HTML:Page:Path]', PagePath)
Template = Template.replace('[HTML:Page:Style]', Meta['Style'])
Template = Template.replace('[HTML:Page:Content]', Content)
Template = Template.replace('[HTML:Page:ContentHeader]', MakeContentHeader(Meta))
Template = Template.replace('[HTML:Page:ContentHeader]', MakeContentHeader(Meta, Locale))
Template = Template.replace('[HTML:Site:AbsoluteRoot]', SiteRoot)
Template = Template.replace('[HTML:Site:RelativeRoot]', GetLevels(PagePath))
for i in FolderRoots:
@ -260,10 +255,8 @@ def OrderPages(Old):
New.remove([])
return New
def GetHTMLPagesList(Pages, SiteRoot, CurLevels, PathPrefix, Type='Page', Category=None):
List = ''
ToPop = []
LastParent = []
def GetHTMLPagesList(Pages, SiteRoot, PathPrefix, Type='Page', Category=None):
List, ToPop, LastParent = '', [], []
IndexPages = Pages.copy()
for e in IndexPages:
if e[3]['Index'] == 'False' or e[3]['Index'] == 'None':
@ -271,8 +264,7 @@ def GetHTMLPagesList(Pages, SiteRoot, CurLevels, PathPrefix, Type='Page', Catego
for i,e in enumerate(IndexPages):
if e[3]['Type'] != Type:
ToPop += [i]
ToPop.sort()
ToPop.reverse()
ToPop = RevSort(ToPop)
for i in ToPop:
IndexPages.pop(i)
if Type == 'Page':
@ -287,83 +279,107 @@ def GetHTMLPagesList(Pages, SiteRoot, CurLevels, PathPrefix, Type='Page', Catego
LastParent = CurParent
Levels = '- ' * (n-1+i)
if File[:-3].endswith('index.'):
Title = MakeListTitle(File, Meta, Titles, 'HTMLTitle', SiteRoot, CurLevels, PathPrefix)
Title = MakeListTitle(File, Meta, Titles, 'HTMLTitle', SiteRoot, PathPrefix)
else:
Title = CurParent[n-2+i]
List += Levels + Title + '\n'
if not (n > 1 and File[:-3].endswith('index.')):
Levels = '- ' * n
Title = MakeListTitle(File, Meta, Titles, 'HTMLTitle', SiteRoot, CurLevels, PathPrefix)
Title = MakeListTitle(File, Meta, Titles, 'HTMLTitle', SiteRoot, PathPrefix)
List += Levels + Title + '\n'
return Markdown().convert(List)
def DelTmp():
    """Delete every page source file found under 'public'.

    The extensions to remove come from Extensions['Pages'], so supporting
    a new page format does not require another hand-written loop here.
    """
    for Ext in Extensions['Pages']:
        for File in Path('public').rglob('*.{}'.format(Ext)):
            os.remove(File)
def MakeSite(TemplatesText, PartsText, ContextParts, ContextPartsText, SiteRoot, FolderRoots):
Files = []
Pages = []
Categories = {}
for File in Path('Pages').rglob('*.pug'):
Files += [FileToStr(File, 'Pages/')]
for File in Path('Pages').rglob('*.md'):
Files += [FileToStr(File, 'Pages/')]
Files.sort()
Files.reverse()
def RevSort(List):
    """Sort List in place, reverse it in place, and return the same list."""
    List.sort()
    List[:] = List[::-1]
    return List
def DoMinify(HTML):
    """Return HTML minified by htmlmin with this project's fixed options."""
    Options = {
        'remove_comments': True,
        'remove_empty_space': True,
        'remove_all_empty_space': False,
        'reduce_empty_attributes': True,
        'reduce_boolean_attributes': True,
        'remove_optional_attribute_quotes': True,
        'convert_charrefs': True,
        'keep_pre': True,
    }
    return htmlmin.minify(input=HTML, **Options)
def MakeSite(TemplatesText, PartsText, ContextParts, ContextPartsText, SiteRoot, FolderRoots, Locale, Minify):
    """Render every page found under 'Pages' into an HTML file under 'public'.

    TemplatesText, PartsText, ContextParts, ContextPartsText, FolderRoots:
        passed through unchanged to PatchHTML; TemplatesText is indexed by
        each page's Meta['Template'].
    SiteRoot: passed to PreProcessor, GetHTMLPagesList and PatchHTML.
    Locale: translation table forwarded to PatchHTML.
    Minify: string flag; output is minified unless it is 'False' or 'None'.

    Page sources left in 'public' are removed at the end through DelTmp.
    """
    Files, Pages, Categories = [], [], {}

    # Collect every page source, whatever its supported extension.
    for Ext in Extensions['Pages']:
        for File in Path('Pages').rglob('*.{}'.format(Ext)):
            Files += [FileToStr(File, 'Pages/')]
    Files = RevSort(Files)

    # Pre-process each page and register the categories it declares.
    for File in Files:
        Content, Titles, Meta = PreProcessor('Pages/{}'.format(File), SiteRoot)
        Pages += [[File, Content, Titles, Meta]]
        for Category in Meta['Categories']:
            Categories.update({Category:''})

    PugCompileList(Pages)
    print(Files)

    for Category in Categories:
        Categories[Category] = GetHTMLPagesList(
            Pages=Pages,
            SiteRoot=SiteRoot,
            PathPrefix='../../', # This hardcodes paths, TODO make it somehow guess the path for every page containing the [HTML:Category] macro
            Type='Post',
            Category=Category)

    for File, Content, Titles, Meta in Pages:
        HTMLPagesList = GetHTMLPagesList(
            Pages=Pages,
            SiteRoot=SiteRoot,
            PathPrefix=GetLevels(File),
            Type='Page')
        PagePath = 'public/{}.html'.format(StripExt(File))
        if File.endswith('.md'):
            Content = Markdown().convert(Content)
        elif File.endswith('.pug'):
            # Pug pages are read back from the output path after
            # PugCompileList has run.
            Content = ReadFile(PagePath)
        # The site-root placeholders are substituted inside PatchHTML, so
        # the template is handed over untouched.
        HTML = PatchHTML(
            Template=TemplatesText[Meta['Template']],
            PartsText=PartsText,
            ContextParts=ContextParts,
            ContextPartsText=ContextPartsText,
            HTMLPagesList=HTMLPagesList,
            PagePath=PagePath[len('public/'):],
            Content=Content,
            Titles=Titles,
            Meta=Meta,
            SiteRoot=SiteRoot,
            FolderRoots=FolderRoots,
            Categories=Categories,
            Locale=Locale)
        if Minify not in ('False', 'None'):
            HTML = DoMinify(HTML)
        WriteFile(PagePath, HTML)

    DelTmp()
def Main(Args):
    """Build the whole site from parsed command-line arguments.

    Args: namespace with the optional string attributes ContextParts,
        SiteRoot, FolderRoots, SiteLang and Minify. ContextParts and
        FolderRoots are Python literals parsed with literal_eval.

    Resets 'public', copies 'Pages' into it, renders the site, then copies
    the contents of 'Assets' into 'public'.
    """
    ResetPublic()
    shutil.copytree('Pages', 'public')
    # A single keyword-argument call: every option is named, so adding a
    # parameter to MakeSite cannot silently shift the others.
    MakeSite(
        TemplatesText=LoadFromDir('Templates', '*.html'),
        PartsText=LoadFromDir('Parts', '*.html'),
        ContextParts=literal_eval(Args.ContextParts) if Args.ContextParts else {},
        ContextPartsText=LoadFromDir('ContextParts', '*.html'),
        SiteRoot=Args.SiteRoot if Args.SiteRoot else '/',
        FolderRoots=literal_eval(Args.FolderRoots) if Args.FolderRoots else {},
        Locale=LoadLocale(Args.SiteLang if Args.SiteLang else 'en'),
        Minify=Args.Minify if Args.Minify else 'None')
    os.system("cp -R Assets/* public/")
if __name__ == '__main__':
    # Every option must be registered before parse_args() runs, otherwise
    # the resulting namespace lacks the attribute that Main reads.
    Parser = argparse.ArgumentParser()
    Parser.add_argument('--SiteLang', type=str)
    Parser.add_argument('--SiteRoot', type=str)
    Parser.add_argument('--FolderRoots', type=str)
    Parser.add_argument('--ContextParts', type=str)
    Parser.add_argument('--Minify', type=str)
    # Build exactly once, with the fully populated namespace.
    Main(
        Args=Parser.parse_args())

30
Source/Libs/htmlmin/__init__.py

@ -0,0 +1,30 @@
"""
Copyright (c) 2013, Dave Mankoff
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of Dave Mankoff nor the
names of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL DAVE MANKOFF BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
"""
from .main import minify, Minifier
__version__ = '0.1.12'

175
Source/Libs/htmlmin/command.py

@ -0,0 +1,175 @@
#!/usr/bin/env python
"""
Copyright (c) 2013, Dave Mankoff
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of Dave Mankoff nor the
names of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL DAVE MANKOFF BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
"""
import argparse
import codecs
import locale
import io
import sys
#import htmlmin
from . import Minifier
# Command-line interface of the minifier. main() calls parser.parse_args()
# and maps the resulting attributes onto Minifier options.
# RawTextHelpFormatter keeps the line breaks written in the help strings.
parser = argparse.ArgumentParser(
description='Minify HTML',
formatter_class=argparse.RawTextHelpFormatter
)
# Both positionals are optional (nargs='?'); main() falls back to stdin and
# stdout when they are omitted.
parser.add_argument('input_file',
nargs='?',
metavar='INPUT',
help='File path to html file to minify. Defaults to stdin.',
)
parser.add_argument('output_file',
nargs='?',
metavar='OUTPUT',
help="File path to output to. Defaults to stdout.",
)
parser.add_argument('-c', '--remove-comments',
help=(
'''When set, comments will be removed. They can be kept on an individual basis
by starting them with a '!': <!--! comment -->. The '!' will be removed from
the final output. If you want a '!' as the leading character of your comment,
put two of them: <!--!! comment -->.
'''),
action='store_true')
parser.add_argument('-s', '--remove-empty-space',
help=(
'''When set, this removes empty space betwen tags in certain cases.
Specifically, it will remove empty space if and only if there a newline
character occurs within the space. Thus, code like
'<span>x</span> <span>y</span>' will be left alone, but code such as
' ...
</head>
<body>
...'
will become '...</head><body>...'. Note that this CAN break your
html if you spread two inline tags over two lines. Use with caution.
'''),
action='store_true')
parser.add_argument('--remove-all-empty-space',
help=(
'''When set, this removes ALL empty space betwen tags. WARNING: this can and
likely will cause unintended consequences. For instance, '<i>X</i> <i>Y</i>'
will become '<i>X</i><i>Y</i>'. Putting whitespace along with other text will
avoid this problem. Only use if you are confident in the result. Whitespace is
not removed from inside of tags, thus '<span> </span>' will be left alone.
'''),
action='store_true')
# This flag is inverted by main(): it is passed on as
# remove_optional_attribute_quotes=not args.keep_optional_attribute_quotes.
parser.add_argument('--keep-optional-attribute-quotes',
help=(
'''When set, this keeps all attribute quotes, even if they are optional.
'''),
action='store_true')
# NOTE(review): main() in this file never reads args.in_head - confirm
# whether this option is meant to reach the parser.
parser.add_argument('-H', '--in-head',
help=(
'''If you are parsing only a fragment of HTML, and the fragment occurs in the
head of the document, setting this will remove some extra whitespace.
'''),
action='store_true')
parser.add_argument('-k', '--keep-pre-attr',
help=(
'''HTMLMin supports the propietary attribute 'pre' that can be added to elements
to prevent minification. This attribute is removed by default. Set this flag to
keep the 'pre' attributes in place.
'''),
action='store_true')
parser.add_argument('-a', '--pre-attr',
help=(
'''The attribute htmlmin looks for to find blocks of HTML that it should not
minify. This attribute will be removed from the HTML unless '-k' is
specified. Defaults to 'pre'.
'''),
default='pre')
# nargs='*' gathers zero or more tag names into a list.
parser.add_argument('-p', '--pre-tags',
metavar='TAG',
help=(
'''By default, the contents of 'pre', and 'textarea' tags are left unminified.
You can specify different tags using the --pre-tags option. 'script' and 'style'
tags are always left unmininfied.
'''),
nargs='*',
default=['pre', 'textarea'])
# default=None lets main() tell "not given" apart from an explicit encoding
# before it applies its own fallbacks.
parser.add_argument('-e', '--encoding',
help=("Encoding to read and write with. Default 'utf-8'."
" When reading from stdin, attempts to use the system's"
" encoding before defaulting to utf-8.\n\n"),
default=None,
)
def main():
    """Command-line entry point: minify INPUT (or stdin) into OUTPUT (or stdout).

    Options are read from the module-level ``parser``. Files are read and
    written with ``--encoding`` (default 'utf-8'); for stdin/stdout the
    stream's own encoding and then the locale's preferred encoding are
    tried before falling back to that default.
    """
    args = parser.parse_args()
    # NOTE(review): args.in_head is parsed but has no matching Minifier
    # option in this call - confirm whether it should be forwarded.
    minifier = Minifier(
        remove_comments=args.remove_comments,
        remove_empty_space=args.remove_empty_space,
        # Previously parsed but never forwarded, so the flag had no effect.
        remove_all_empty_space=args.remove_all_empty_space,
        remove_optional_attribute_quotes=not args.keep_optional_attribute_quotes,
        pre_tags=args.pre_tags,
        keep_pre=args.keep_pre_attr,
        pre_attr=args.pre_attr,
    )
    default_encoding = args.encoding or 'utf-8'

    if args.input_file:
        inp = codecs.open(args.input_file, encoding=default_encoding)
    else:
        encoding = (args.encoding or sys.stdin.encoding
                    or locale.getpreferredencoding() or default_encoding)
        # closefd=False: leaving the 'with' block must not close real stdin.
        inp = io.open(sys.stdin.fileno(), encoding=encoding, closefd=False)
    with inp:
        for line in inp:
            minifier.input(line)

    if args.output_file:
        with codecs.open(
                args.output_file, 'w', encoding=default_encoding) as out:
            out.write(minifier.output)
    else:
        encoding = (args.encoding or sys.stdout.encoding
                    or locale.getpreferredencoding() or default_encoding)
        # The wrapper is flushed on exit while the stdout descriptor stays open.
        with io.open(sys.stdout.fileno(), 'w', encoding=encoding,
                     closefd=False) as out:
            out.write(minifier.output)


if __name__ == '__main__':
    main()

64
Source/Libs/htmlmin/decorator.py

@ -0,0 +1,64 @@
"""
Copyright (c) 2013, Dave Mankoff
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of Dave Mankoff nor the
names of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL DAVE MANKOFF BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
"""
from .main import Minifier
def htmlmin(*args, **kwargs):
    """Minify the HTML returned by the decorated function.

    Usable bare or with keyword options, which are passed to ``Minifier``::

        @htmlmin
        def foobar():
            return '   minify me!   '

        @htmlmin(remove_comments=True)
        def foobar():
            return '   minify me!  <!-- and remove me! -->'

    :raises RuntimeError: if positional arguments other than the single
      decorated callable are supplied, or if a callable is combined with
      keyword options in the same call.
    """
    from functools import wraps

    def _decorator(fn):
        minify = Minifier(**kwargs).minify

        # wraps() keeps the decorated function's name and docstring.
        @wraps(fn)
        def wrapper(*a, **kw):
            return minify(fn(*a, **kw))
        return wrapper

    if not args:
        # Called as @htmlmin(...): return the real decorator.
        return _decorator
    if len(args) == 1 and callable(args[0]) and not kwargs:
        # Called as bare @htmlmin: args[0] is the function to wrap.
        return _decorator(args[0])
    raise RuntimeError(
        'htmlmin decorator does not accept positional arguments')

204
Source/Libs/htmlmin/escape.py

@ -0,0 +1,204 @@
"""
Copyright (c) 2015, Dave Mankoff
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of Dave Mankoff nor the
names of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL DAVE MANKOFF BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
"""
import re
try:
from html import escape
except ImportError:
from cgi import escape
import re
# Quoting styles that escape_attr_value returns next to the escaped value.
NO_QUOTES = 0
SINGLE_QUOTE = 1
DOUBLE_QUOTE = 2
# Code points bounding the ASCII ranges A-Z / A-F, a-z / a-f and 0-9.
UPPER_A = ord('A')
UPPER_F = ord('F')
UPPER_Z = ord('Z')
LOWER_A = ord('a')
LOWER_F = ord('f')
LOWER_Z = ord('z')
ZERO = ord('0')
NINE = ord('9')
# https://www.w3.org/TR/html5/syntax.html#attributes-0
# Matches any character that forces an attribute value to be quoted:
# space, tab, LF, FF, CR, '=', '>', '<' and '`'.
CHARS_TO_QUOTE_RE = re.compile(u'[\x20\x09\x0a\x0c\x0d=><`]')
def escape_tag(val):
    """Return the tag name ``val`` passed through ``escape``."""
    escaped = escape(val)
    return escaped
def escape_attr_name(val):
    """Return the attribute name ``val`` passed through ``escape``."""
    escaped = escape(val)
    return escaped
def escape_attr_value(val, double_quote=False):
    """Escape an attribute value and choose how it should be quoted.

    :param val: The raw attribute value.
    :param double_quote: When true, always use double quotes.
    :return: A ``(escaped_value, quote_style)`` tuple, where ``quote_style``
      is one of ``NO_QUOTES``, ``SINGLE_QUOTE`` or ``DOUBLE_QUOTE``.

    Ambiguous ampersands are escaped first. Without ``double_quote`` the
    quote character that occurs less often in the value is used as the
    delimiter, so fewer characters need escaping. A value with no quotes
    is double-quoted only when it is empty or contains a character matched
    by ``CHARS_TO_QUOTE_RE``.
    """
    val = escape_ambiguous_ampersand(val)
    if double_quote:
        return (val.replace('"', '&#34;'), DOUBLE_QUOTE)

    double_quote_count = val.count('"')
    single_quote_count = val.count("'")
    if double_quote_count > single_quote_count:
        return (val.replace("'", '&#39;'), SINGLE_QUOTE)
    elif single_quote_count:
        return (val.replace('"', '&#34;'), DOUBLE_QUOTE)

    if not val or CHARS_TO_QUOTE_RE.search(val):
        return (val, DOUBLE_QUOTE)
    return (val, NO_QUOTES)
def escape_ambiguous_ampersand(val):
    """Escape the ampersands in ``val`` that could be misread as references.

    Terminated references (``&name;``, ``&#123;``, ``&#x1F;``) are copied
    through unchanged. An ampersand followed by alphanumerics, or by a
    numeric prefix with at least one digit, that is not closed by ``;`` is
    rewritten with ``&amp;``. Other lone ampersands are left untouched.

    ``#`` and ``x`` only start a numeric or hexadecimal reference when they
    directly follow ``&`` and ``&#`` respectively; anywhere else they are
    ordinary characters, so input such as ``&#12x;`` is no longer reordered.

    :param val: The text to scan.
    :return: The text with ambiguous ampersands escaped.
    """
    # TODO: this function could probably be made a lot faster.
    if '&' not in val:  # short circuit for speed
        return val

    state = 0
    result = []
    amp_buff = []
    for c in val:
        if state == 0:  # beginning
            if c == '&':
                state = 1
            else:
                result.append(c)
        elif state == 1:  # ampersand
            if 'A' <= c <= 'Z' or 'a' <= c <= 'z' or '0' <= c <= '9':
                amp_buff.append(c)  # TODO: use "name character references" section
                # https://html.spec.whatwg.org/multipage/syntax.html#named-character-references
            elif c == '#' and not amp_buff:
                # Only "&#" opens a numeric reference; "&ab#" does not.
                state = 2
            elif c == ';':
                if amp_buff:
                    result.append('&')
                    result.extend(amp_buff)
                    result.append(';')
                else:
                    result.append('&;')
                state = 0
                amp_buff = []
            elif c == '&':
                if amp_buff:
                    result.append('&amp;')
                    result.extend(amp_buff)
                else:
                    result.append('&')
                amp_buff = []
            else:
                result.append('&')
                result.extend(amp_buff)
                result.append(c)
                state = 0
                amp_buff = []
        elif state == 2:  # numeric character reference
            if (c == 'x' or c == 'X') and not amp_buff:
                # Only "&#x" opens a hex reference; "&#12x" does not.
                state = 3
            elif '0' <= c <= '9':
                amp_buff.append(c)
            elif c == ';':
                if amp_buff:
                    result.append('&#')
                    result.extend(amp_buff)
                    result.append(';')
                else:
                    result.append('&#;')
                state = 0
                amp_buff = []
            elif c == '&':
                if amp_buff:
                    result.append('&amp;#')
                    result.extend(amp_buff)
                else:
                    result.append('&#')
                state = 1
                amp_buff = []
            else:
                if amp_buff:
                    result.append('&amp;#')
                    result.extend(amp_buff)
                    result.append(c)
                else:
                    result.append('&#')
                    result.append(c)
                state = 0
                amp_buff = []
        elif state == 3:  # hex character reference
            if 'A' <= c <= 'F' or 'a' <= c <= 'f' or '0' <= c <= '9':
                amp_buff.append(c)
            elif c == ';':
                if amp_buff:
                    result.append('&#x')
                    result.extend(amp_buff)
                    result.append(';')
                else:
                    result.append('&#x;')
                state = 0
                amp_buff = []
            elif c == '&':
                if amp_buff:
                    result.append('&amp;#x')
                    result.extend(amp_buff)
                else:
                    result.append('&#x')
                state = 1
                amp_buff = []
            else:
                if amp_buff:
                    result.append('&amp;#x')
                    result.extend(amp_buff)
                    result.append(c)
                else:
                    result.append('&#x')
                    result.append(c)
                state = 0
                amp_buff = []

    # Input ended in the middle of a candidate reference: escape it.
    if state == 1:
        result.append('&amp;')
        result.extend(amp_buff)
    elif state == 2:
        result.append('&amp;#')
        result.extend(amp_buff)
    elif state == 3:
        result.append('&amp;#x')
        result.extend(amp_buff)

    return ''.join(result)

193
Source/Libs/htmlmin/main.py

@ -0,0 +1,193 @@
"""
Copyright (c) 2013, Dave Mankoff
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of Dave Mankoff nor the
names of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL DAVE MANKOFF BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
"""
import cgi
from . import parser
def minify(input,
           remove_comments=False,
           remove_empty_space=False,
           remove_all_empty_space=False,
           reduce_empty_attributes=True,
           reduce_boolean_attributes=False,
           remove_optional_attribute_quotes=True,
           convert_charrefs=True,
           keep_pre=False,
           pre_tags=parser.PRE_TAGS,
           pre_attr='pre',
           cls=parser.HTMLMinParser):
    """Minify an HTML string in a single call.

    :param input: The HTML to minify.
    :param remove_comments: Drop HTML comments. A comment whose first
      character is ``!`` is kept, with that ``!`` removed.
    :param remove_empty_space: Drop whitespace between tags when it
      contains a newline or carriage return; whitespace made only of
      spaces and tabs becomes a single space.
    :param remove_all_empty_space: Drop all whitespace found between tags.
      Much more aggressive than ``remove_empty_space``.
    :param reduce_empty_attributes: Forwarded unchanged to the parser.
    :param reduce_boolean_attributes: Where HTML5 allows it, strip the
      value of boolean attributes (``disabled="true"`` -> ``disabled``).
    :param remove_optional_attribute_quotes: Remove attribute quotes that
      are optional; when False every quote is kept.
    :param convert_charrefs: Decode character references to plain
      characters where safe (attributes only).
    :param keep_pre: Keep the ``pre`` marker attribute in the output
      instead of stripping it.
    :param pre_tags: Tag names whose content is never minified.
    :param pre_attr: Name of the attribute that marks content as not to
      be minified; ``{pre_attr}-`` prefixes protect a single attribute.
    :param cls: The parser class doing the work.
    :return: The minified HTML.

    To process several documents with the same settings, prefer
    :class:`.Minifier`.
    """
    options = dict(
        remove_comments=remove_comments,
        remove_empty_space=remove_empty_space,
        remove_all_empty_space=remove_all_empty_space,
        reduce_empty_attributes=reduce_empty_attributes,
        reduce_boolean_attributes=reduce_boolean_attributes,
        remove_optional_attribute_quotes=remove_optional_attribute_quotes,
        convert_charrefs=convert_charrefs,
        keep_pre=keep_pre,
        pre_tags=pre_tags,
        pre_attr=pre_attr,
    )
    html_parser = cls(**options)
    html_parser.feed(input)
    html_parser.close()
    return html_parser.result
class Minifier(object):
    """A reusable HTML minifier.

    The options given at construction time are kept for every later use of
    the instance, which makes it the better choice over
    :class:`htmlmin.minify` when several documents share the same settings.

    See :class:`htmlmin.minify` for the meaning of each option.
    """

    def __init__(self,
                 remove_comments=False,
                 remove_empty_space=False,
                 remove_all_empty_space=False,
                 reduce_empty_attributes=True,
                 reduce_boolean_attributes=False,
                 remove_optional_attribute_quotes=True,
                 convert_charrefs=True,
                 keep_pre=False,
                 pre_tags=parser.PRE_TAGS,
                 pre_attr='pre',
                 cls=parser.HTMLMinParser):
        """Create the underlying parser with the given options.

        See :class:`htmlmin.minify` for the meaning of each option.
        """
        options = dict(
            remove_comments=remove_comments,
            remove_empty_space=remove_empty_space,
            remove_all_empty_space=remove_all_empty_space,
            reduce_empty_attributes=reduce_empty_attributes,
            reduce_boolean_attributes=reduce_boolean_attributes,
            remove_optional_attribute_quotes=remove_optional_attribute_quotes,
            convert_charrefs=convert_charrefs,
            keep_pre=keep_pre,
            pre_tags=pre_tags,
            pre_attr=pre_attr,
        )
        self._parser = cls(**options)

    def minify(self, *input):
        """Minify the given HTML in one pass and return the result.

        :param input: One or more chunks of HTML, processed as if they
          were concatenated.
        :returns: The minified HTML.

        The parser is reset first, so any HTML still pending from earlier
        ``input`` calls is discarded.
        """
        self._parser.reset()
        self.input(*input)
        return self.finalize()

    def input(self, *input):
        """Feed more HTML into the parser.

        :param input: One or more chunks of HTML, fed in order. Calling
          this method repeatedly has the same effect.
        """
        for chunk in input:
            self._parser.feed(chunk)

    @property
    def output(self):
        """The minified output produced so far."""
        return self._parser.result

    def finalize(self):
        """Close the current document and return its minified form.

        The parser is closed, its result captured, and the parser is then
        reset so the instance can be reused for another document.
        """
        html_parser = self._parser
        html_parser.close()
        minified = html_parser.result
        html_parser.reset()
        return minified

92
Source/Libs/htmlmin/middleware.py

@ -0,0 +1,92 @@
"""
Copyright (c) 2013, Dave Mankoff
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of Dave Mankoff nor the
names of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL DAVE MANKOFF BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
"""
from .main import Minifier
class HTMLMinMiddleware(object):
    """WSGI middleware that minifies HTML on the way out.

    :param by_default: Specifies if minification should be turned on or off by
      default. Defaults to ``True``.
    :param keep_header: The middleware recognizes one custom HTTP header that
      can be used to turn minification on or off on a per-request basis:
      ``X-HTML-Min-Enable``. Setting the header to ``true`` turns minification
      on; anything else turns it off. If ``by_default`` is ``False``, this
      header is how you turn minification back on. By default the middleware
      removes the header from the output. Setting this to ``True`` leaves the
      header intact.
    :param debug: A quick setting to turn all minification off. The middleware
      is effectively bypassed.

    Any additional keyword arguments are passed on to the internal minifier.
    The documentation for those options can be found under
    :class:`htmlmin.minify`.

    Header names are matched case-insensitively and a ``Content-Type`` with
    parameters (``text/html; charset=utf-8``) is recognized as HTML. When a
    response is minified its ``Content-Length`` header is dropped, because the
    length announced by the application no longer matches the body.
    """

    def __init__(self, app, by_default=True, keep_header=False,
                 debug=False, **kwargs):
        self.app = app
        self.by_default = by_default
        self.debug = debug
        self.keep_header = keep_header
        self.minifier = Minifier(**kwargs)

    def __call__(self, environ, start_response):
        """Run the wrapped application and minify its HTML response."""
        if self.debug:
            return self.app(environ, start_response)

        # Filled in by the start_response wrapper below; a mutable object is
        # used so that the nested function can update it.
        state = {'minify': False, 'charset': 'utf-8'}

        def minified_start_response(status, headers, exc_info=None):
            state['minify'] = self.should_minify(headers)
            state['charset'] = self._charset(headers)
            if not self.keep_header:
                headers = [(header, value) for header, value in headers
                           if header.lower() != 'x-html-min-enable']
            if state['minify']:
                # The body is about to change size.
                headers = [(header, value) for header, value in headers
                           if header.lower() != 'content-length']
            # WSGI requires start_response to return the write() callable.
            return start_response(status, headers, exc_info)

        body = self.app(environ, minified_start_response)
        try:
            chunks = list(body)
        finally:
            # WSGI: the iterable's close(), if any, must always be called.
            close = getattr(body, 'close', None)
            if close is not None:
                close()

        if not state['minify'] or not chunks:
            return chunks

        if isinstance(chunks[0], bytes):
            # The minifier works on text. Join before decoding so that a
            # multi-byte character split across chunks is decoded correctly.
            charset = state['charset']
            try:
                text = b''.join(chunks).decode(charset)
            except (LookupError, UnicodeDecodeError):
                # Unknown charset or undecodable body: pass it through as is.
                return chunks
            minified = self.minifier.minify(text)
            # Characters the charset cannot represent become numeric
            # character references, which is always valid in HTML.
            return [minified.encode(charset, 'xmlcharrefreplace')]

        # Text chunks: keep the original str-in, str-out behaviour.
        return [self.minifier.minify(*chunks)]

    def should_minify(self, headers):
        """Decide from the response headers whether the body is minified.

        :param headers: Iterable of ``(name, value)`` header pairs.
        :returns: ``True`` when the response is ``text/html`` and minification
          is enabled, either by ``by_default`` or by an ``X-HTML-Min-Enable``
          header, which takes precedence; ``False`` otherwise.
        """
        is_html = False
        flag_header = None
        for header, value in headers:
            name = header.lower()
            if not is_html and name == 'content-type':
                # Ignore parameters such as "; charset=utf-8".
                media_type = value.split(';', 1)[0].strip().lower()
                is_html = media_type == 'text/html'
            elif flag_header is None and name == 'x-html-min-enable':
                flag_header = (value.lower() == 'true')
            if is_html and flag_header is not None:
                break
        if not is_html:
            return False
        if flag_header is None:
            return bool(self.by_default)
        return flag_header

    @staticmethod
    def _charset(headers, default='utf-8'):
        """Return the ``charset`` parameter of ``Content-Type``, or *default*."""
        for header, value in headers:
            if header.lower() != 'content-type':
                continue
            for param in value.split(';')[1:]:
                key, _, charset = param.partition('=')
                charset = charset.strip().strip('"\'')
                if key.strip().lower() == 'charset' and charset:
                    return charset
        return default

408
Source/Libs/htmlmin/parser.py

@ -0,0 +1,408 @@
"""
Copyright (c) 2013, Dave Mankoff
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of Dave Mankoff nor the
names of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL DAVE MANKOFF BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
"""
from __future__ import unicode_literals
import logging
import sys
import re
from .python3html.parser import HTMLParser
from . import escape
# Regular expressions over the HTML "space characters" (space, tab, line
# feed, form feed, carriage return), see:
# https://www.w3.org/TR/html5/single-page.html#space-character
# One or more consecutive space characters.
HTML_SPACE_RE = re.compile('[\x20\x09\x0a\x0c\x0d]+')
# A non-empty string that consists of space characters only.
HTML_ALL_SPACE_RE = re.compile('^[\x20\x09\x0a\x0c\x0d]+$')
# A run of space characters at the very start of a string.
HTML_LEADING_SPACE_RE = re.compile(
'^[\x20\x09\x0a\x0c\x0d]+')
# A run of space characters at the very end of a string.
HTML_TRAILING_SPACE_RE = re.compile(
'[\x20\x09\x0a\x0c\x0d]+$')
# A run of space characters at either end of a string (group 1: leading,
# group 2: trailing).
HTML_LEADING_TRAILING_SPACE_RE = re.compile(
'(^[\x20\x09\x0a\x0c\x0d]+)|([\x20\x09\x0a\x0c\x0d]+$)')
# Default value of the parser's ``pre_tags`` option: tags that put the parser
# into "pre" mode when opened.
PRE_TAGS = ('pre', 'textarea') # styles and scripts are never minified
# Tags for which no closing tag is written to the output (and no trailing
# slash when they are self-closed).
# http://www.w3.org/TR/html51/syntax.html#elements-0
NO_CLOSE_TAGS = ('area', 'base', 'br', 'col', 'command', 'embed', 'hr', 'img',
'input', 'keygen', 'link', 'meta', 'param', 'source', 'track',
'wbr')
# Boolean attributes by tag name. The '*' entry is the fallback that is used
# for tags without an entry of their own.
# http://www.w3.org/TR/html51/index.html#attributes-1
BOOLEAN_ATTRIBUTES = {
'audio': ('autoplay', 'controls', 'hidden', 'loop', 'muted',),
'button': ('autofocus', 'disabled', 'formnovalidate', 'hidden',),
'command': ('checked', 'disabled', 'hidden'),
'dialog': ('hidden', 'open',),
'fieldset': ('disabled', 'hidden',),
'form': ('hidden', 'novalidate',),
'iframe': ('hidden', 'seamless',),
'img': ('hidden', 'ismap',),
'input': ('autofocus', 'checked', 'disabled', 'formnovalidate', 'hidden',
'multiple', 'readonly', 'required',),
'keygen': ('autofocus', 'disabled', 'hidden',),
'object': ('hidden', 'typesmustmatch',),
'ol': ('hidden', 'reversed',),
'optgroup': ('disabled', 'hidden',),
'option': ('disabled', 'hidden', 'selected',),
'script': ('async', 'defer', 'hidden',),
'select': ('autofocus', 'disabled', 'hidden', 'multiple', 'required',),
'style': ('hidden', 'scoped',),
'textarea': ('autofocus', 'disabled', 'hidden', 'readonly', 'required',),
'track': ('default', 'hidden', ),
'video': ('autoplay', 'controls', 'hidden', 'loop', 'muted',),
'*': ('hidden',),
}
# a list of tags and tags that they are closed by: the key is an open tag, the
# value lists the start tags that implicitly close it. The string '*' means
# that any start tag closes it.
TAG_SETS = {
'li': ('li',),
'dd': ('dd', 'dt'),
'rp': ('rp', 'rt'),
'p': ('address', 'article', 'aside', 'blockquote', 'dir', 'div', 'dl',
'fieldset', 'footer', 'form', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6',
'header', 'hgroup', 'hr', 'menu', 'nav', 'ol', 'p', 'pre', 'section',
'table', 'ul'),
'optgroup': ('optgroup',),
'option': ('option', 'optgroup'),
'colgroup': '*',
'tbody': ('tbody', 'tfoot'),
'tfoot': ('tbody',),
'tr': ('tr',),
'td': ('td', 'th'),
}
# Tags that share the closing set of a sibling tag (same tuple object).
TAG_SETS['dt'] = TAG_SETS['dd']
TAG_SETS['rt'] = TAG_SETS['rp']
TAG_SETS['thead'] = TAG_SETS['tbody']
TAG_SETS['th'] = TAG_SETS['td']
# Tag omission rules:
# http://www.w3.org/TR/html51/syntax.html#optional-tags
class HTMLMinError(Exception):
    """Root of the exception hierarchy defined in this module."""


class ParseError(HTMLMinError):
    """An error that occurred while parsing HTML."""


class OpenTagNotFoundError(ParseError):
    """A closing tag was met for which no open tag may be closed."""
class HTMLMinParser(HTMLParser):
def __init__(self,
             remove_comments=False,
             remove_empty_space=False,
             remove_all_empty_space=False,
             reduce_empty_attributes=True,
             reduce_boolean_attributes=False,
             remove_optional_attribute_quotes=True,
             convert_charrefs=True,
             keep_pre=False,
             pre_tags=PRE_TAGS,
             pre_attr='pre'):
    """Initialize the base parser and store the minification options.

    Every argument is stored on the instance under its own name. See
    :class:`htmlmin.minify` for an explanation of options.
    """
    # Compare the version as a tuple: testing major and minor separately
    # would wrongly pick the legacy branch for e.g. a 4.0 interpreter.
    if sys.version_info >= (3, 4):
        # convert_charrefs is True by default in Python 3.5.0 and newer. It
        # was introduced in 3.4.
        HTMLParser.__init__(self, convert_charrefs=False)
    else:
        HTMLParser.__init__(self)
    self.keep_pre = keep_pre
    self.pre_tags = pre_tags
    self.remove_comments = remove_comments
    self.remove_empty_space = remove_empty_space
    self.remove_all_empty_space = remove_all_empty_space
    self.reduce_empty_attributes = reduce_empty_attributes
    self.reduce_boolean_attributes = reduce_boolean_attributes
    self.remove_optional_attribute_quotes = remove_optional_attribute_quotes
    # NOTE(review): presumably this replaces the attribute of the same name
    # that HTMLParser.__init__ has just set -- confirm this is intended.
    self.convert_charrefs = convert_charrefs
    self.pre_attr = pre_attr
    self.reset()
def _tag_lang(self):
return self._tag_stack[0][2] if self._tag_stack else None
def build_tag(self, tag, attrs, close_tag):
    """Build the minified markup of a start tag.

    :param tag: The tag name.
    :param attrs: Iterable of ``(name, value)`` attribute pairs; a value of
      ``None`` denotes an attribute that was written without a value.
    :param close_tag: Whether a ``/`` is written before the closing ``>``.
    :returns: A tuple ``(has_pre, markup, lang)``: whether the tag carries
      the ``pre_attr`` attribute, the tag's markup, and the language that
      applies to the tag (its own ``lang`` attribute if present, otherwise
      the language of the enclosing tag).
    """
    has_pre = False
    if self.reduce_boolean_attributes:
        bool_attrs = BOOLEAN_ATTRIBUTES.get(tag, BOOLEAN_ATTRIBUTES['*'])
    else:
        bool_attrs = False
    lang = self._tag_lang()
    attrs = list(attrs) # We're modifying it in place
    # ``i`` is the index of the last attribute that was kept. Kept attributes
    # are written back into ``attrs`` as strings; ``i`` never runs ahead of
    # the loop position, so only pairs that were already read are replaced.
    last_quoted = last_no_slash = i = -1
    for k, v in attrs:
        # An attribute named "<pre_attr>-<name>" is emitted as "<name>",
        # with neither its name nor its value escaped or unescaped.
        pre_prefix = k.startswith("{}-".format(self.pre_attr))
        if pre_prefix:
            k = k[len(self.pre_attr)+1:]
        if k == self.pre_attr:
            has_pre = True
            # The marker attribute itself is dropped unless keep_pre is set.
            if not self.keep_pre and not pre_prefix:
                continue
        if v and self.convert_charrefs and not pre_prefix:
            v = HTMLParser.unescape(self, v)
        if k == 'lang':
            lang = v
            # A lang equal to the enclosing tag's language is dropped.
            if v == self._tag_lang():
                continue
        i += 1
        if not pre_prefix:
            k = escape.escape_attr_name(k)
        if (v is None or (not v and self.reduce_empty_attributes) or
                (bool_attrs and k in bool_attrs)):
            # For our use case, we treat boolean attributes as quoted because they
            # don't require space between them and "/>" in closing tags.
            attrs[i] = k
            last_quoted = i
        else:
            if pre_prefix:
                # Pick a quote style that does not clash with the raw value.
                has_double_quotes = '"' in v
                has_single_quotes = "'" in v
                if not has_double_quotes:
                    if not has_single_quotes and self.remove_optional_attribute_quotes:
                        q = escape.NO_QUOTES
                    else:
                        q = escape.DOUBLE_QUOTE
                elif not has_single_quotes:
                    # NOTE(review): the sibling constant is spelled
                    # DOUBLE_QUOTE (singular) -- confirm that the escape
                    # module really defines SINGLE_QUOTES.
                    q = escape.SINGLE_QUOTES
                else:
                    logging.error('Unsafe content found in pre-attribute. Escaping.')
                    (v, q) = escape.escape_attr_value(
                        v, double_quote=not self.remove_optional_attribute_quotes)
            else:
                (v, q) = escape.escape_attr_value(
                    v, double_quote=not self.remove_optional_attribute_quotes)
            if q == escape.NO_QUOTES:
                attrs[i] = '%s=%s' % (k, v)
                # NOTE(review): v[-1] raises IndexError for an empty value;
                # presumably an empty value is never left unquoted -- verify
                # for the pre-prefixed branch above.
                if v[-1] != '/':
                    last_no_slash = i
            else:
                q = '"' if q == escape.DOUBLE_QUOTE else "'"
                attrs[i] = '%s=%s%s%s' % (k, q, v, q)
                last_quoted = i
    # Cut off the tail that still holds unprocessed (name, value) pairs.
    i += 1
    if i != len(attrs):
        del attrs[i:]
    # 1. If there are no attributes, no additional space is necessary.
    # 2. If last attribute is quoted, no additional space is necessary.
    # 3. Two things are happening here:
    # a) according to the standard, <foo bar=baz/> should be treated as <foo
    # bar="baz/"> so space is necessary if this is self-closing tag,
    # however
    # b) reportedly (https://github.com/mankyd/htmlmin/pull/12), older
    # versions of WebKit interpret <foo bar=baz/> as self-closing tag so
    # we need the space if the last argument ends with a slash.
    space_maybe = ''
    if attrs:
        # Only the last character of its argument is inspected, so it can be
        # given a whole attribute string or just that string's last character.
        needs_space = lambda last_attr: (last_attr[-1] not in '"\'' and
                                         (close_tag or last_attr[-1] == '/'))
        if needs_space(attrs[-1][-1]):
            # If moving attributes around can help, do it. Otherwise bite the
            # bullet and put the space in.
            i = last_no_slash if last_quoted == -1 else last_quoted
            if i == -1 or needs_space(attrs[i]):
                space_maybe = ' '
            else:
                # Move an attribute that needs no trailing space to the end.
                attrs.append(attrs[i])
                del attrs[i]
    return has_pre, '<%s%s%s%s%s>' % (escape.escape_tag(tag),
                                      ' ' if attrs else '',
                                      ' '.join(attrs),
                                      space_maybe,
                                      '/' if close_tag else ''), lang
def handle_decl(self, decl):
    """Write a declaration such as ``<!DOCTYPE html>`` to the output.

    If the only thing buffered so far is a chunk of pure whitespace, that
    chunk is discarded first, so that the output starts with the declaration.
    The whole chunk is tested rather than just its first character, so text
    that merely starts with whitespace is kept and an empty chunk cannot
    raise an IndexError.

    :param decl: The declaration text without the surrounding ``<!`` and
      ``>``.
    """
    if (len(self._data_buffer) == 1 and
            HTML_ALL_SPACE_RE.match(self._data_buffer[0])):
        self._data_buffer = []
    self._data_buffer.append('<!' + decl + '>')
    self._after_doctype = True
def _close_tags_up_to(self, tag):
num_pres = 0
i = 0
for i, t in enumerate(self._tag_stack):
if t[1]:
num_pres += 1
if t[0] == tag:
break
# Only the html tag can close out everything. Put on the brakes if
# we encounter a closing tag that we didn't recognize.
if tag != 'html' and t[0] in ('body', 'html', 'head'):
raise OpenTagNotFoundError()
self._tag_stack = self._tag_stack[i+1:]
return num_pres
def handle_starttag(self, tag, attrs):
    """Process a start tag: update parser state and buffer its markup.

    :param tag: The tag name.
    :param attrs: The tag's ``(name, value)`` attribute pairs.
    """
    self._after_doctype = False
    if tag == 'head':
        self._in_head = True
    elif self._in_head and tag == 'title':
        self._in_title = self._title_newly_opened = True

    # The first open tag that is implicitly closed by this start tag (see
    # TAG_SETS) is closed, together with everything opened after it.
    for open_tag in self._tag_stack:
        closers = TAG_SETS.get(open_tag[0])
        if not closers:
            continue
        if closers == '*' or tag in closers:
            self._in_pre_tag -= self._close_tags_up_to(open_tag[0])
            break

    has_pre, markup, lang = self.build_tag(tag, attrs, False)

    # "Pre" mode is entered for tags that ask for it and for every tag that
    # is opened while the mode is already active.
    start_pre = False
    enters_pre = (has_pre or self._in_pre_tag > 0 or
                  tag == 'script' or tag == 'style' or tag in self.pre_tags)
    if enters_pre:
        start_pre = True
        self._in_pre_tag += 1

    self._tag_stack.insert(0, (tag, start_pre, lang))
    self._data_buffer.append(markup)
def handle_endtag(self, tag):
    """Process an end tag: update the tag stack and buffer its markup.

    :param tag: Name of the tag being closed.

    The closing tag itself is written to the output unless the tag is one of
    ``NO_CLOSE_TAGS``, even when no matching open tag is found.
    """
    close_normally = True
    if tag == 'a':
        # According to the spec, <p> tags don't get closed when a parent a
        # tag closes them. Here's some logic that addresses this.
        a_index = None
        contains_p = False
        for i, t in enumerate(self._tag_stack):
            if t[0] == 'p':
                contains_p = True
            elif t[0] == 'a':
                a_index = i
                break
        if a_index is None:
            # No <a> is open: leave the stack alone instead of popping an
            # unrelated entry.
            close_normally = False
        elif contains_p:
            # The p tag, and all its children should be left open: remove
            # only the <a> entry itself.
            a_tag = self._tag_stack.pop(a_index)
            if a_tag[1]:
                self._in_pre_tag -= 1
            close_normally = False
        # Otherwise the <a> is closed like any other tag below, so that its
        # lang/pre state does not stay on the stack after it was closed.
    if close_normally:
        if tag == 'head':
            # TODO: Did we know that we were in an head tag?! If not, we need
            # to reminify everything to remove extra spaces.
            self._in_head = False
        elif tag == 'title':
            self._in_title = False
            self._title_newly_opened = False
        try:
            self._in_pre_tag -= self._close_tags_up_to(tag)
        except OpenTagNotFoundError:
            # Some tags don't require a start tag. Most do. Either way, we
            # leave closing tags alone since they affect output. For instance,
            # a '</p>' results in a '<p></p>' in Chrome.
            pass
    if tag not in NO_CLOSE_TAGS:
        self._data_buffer.extend(['</', escape.escape_tag(tag), '>'])
def handle_startendtag(self, tag, attrs):
    """Process a self-closed tag such as ``<br/>`` and buffer its markup.

    The trailing slash is kept only for tags that are not listed in
    ``NO_CLOSE_TAGS``. The tag is not pushed onto the tag stack.

    :param tag: The tag name.
    :param attrs: The tag's ``(name, value)`` attribute pairs.
    """
    self._after_doctype = False
    keep_slash = tag not in NO_CLOSE_TAGS
    built = self.build_tag(tag, attrs, keep_slash)
    self._data_buffer.append(built[1])
def handle_comment(self, data):
    """Buffer a comment unless comments are being removed.

    With ``remove_comments`` set, only comments whose text starts with ``!``
    or with ``[if`` followed by whitespace are kept. A leading ``!`` is
    stripped from the text that is written out.

    :param data: The comment text without the ``<!--`` and ``-->`` markers.
    """
    if self.remove_comments and not re.match(r'^(?:!|\[if\s)', data):
        return
    text = data[1:] if data.startswith('!') else data
    self._data_buffer.append('<!--{}-->'.format(text))
def handle_data(self, data):
if self._in_pre_tag > 0:
self._data_buffer.append(data)
else:
# remove_all_empty_space matches everything. remove_empty_space only
# matches if there's a newline involved.
if self.remove_all_empty_space or self._in_head or self._after_doctype:
if HTML_ALL_SPACE_RE.match(data):
return
elif (self.remove_empty_space and HTML_ALL_SPACE_RE.match(data) and