Cleaned code a bit, added HTML minification

This commit is contained in:
octospacc 2022-06-20 16:16:41 +02:00
parent 8dee196213
commit 2cc2bfc62a
14 changed files with 1892 additions and 80 deletions

1
.gitignore vendored Normal file
View File

@ -0,0 +1 @@
*.pyc

4
Locale/en.json Normal file
View File

@ -0,0 +1,4 @@
{
"CreatedOn": "Created on",
"EditedOn": "Edited on"
}

4
Locale/it.json Normal file
View File

@ -0,0 +1,4 @@
{
"CreatedOn": "Creato in data",
"EditedOn": "Modificato in data"
}

View File

@ -12,9 +12,11 @@ Feel free to experiment with all of this stuff!
## Dependencies
- [Python >= 3.10.4](https://python.org)
- [Python Markdown >= 3.3.7](https://pypi.org/project/Markdown)
- (Included) [htmlmin >= 0.1.12](https://pypi.org/project/htmlmin)
- [pug-cli >= 1.0.0-alpha6](https://npmjs.com/package/pug-cli)
## Features roadmap
- [x] HTML minification
- [ ] Open Graph support
- [x] Custom categories for posts
- [x] Custom static page parts programmable by context
@ -32,6 +34,5 @@ Feel free to experiment with all of this stuff!
- [x] Generation of titles in right sidebar with clickable links
- [x] Detection of titles in a page
- [x] Custom static page parts by template
- [x] Pug support for pages
- [x] Markdown support for pages
- [x] Markdown + Pug support for pages
- [x] First working version

View File

@ -8,12 +8,17 @@
| ================================= """
import argparse
import json
from Libs import htmlmin
import os
import shutil
from ast import literal_eval
from markdown import Markdown
from pathlib import Path
# Source-file extensions the generator recognizes, grouped by role.
# 'Pages' lists the formats accepted for page content (Markdown and Pug).
Extensions = {
    'Pages': ('md', 'pug')}
def ReadFile(p):
try:
with open(p, 'r') as f:
@ -31,6 +36,15 @@ def WriteFile(p, c):
print("Error writing file {}".format(p))
return False
def LoadLocale(Lang):
    """Load the UI-string table for language code Lang (e.g. 'en', 'it').

    Reads Locale/<Lang>.json relative to this script; falls back to the
    English table when the requested one is missing or unreadable.
    Returns the parsed dict of locale strings.
    """
    # Idiom fix: build paths with os.path.join instead of string concatenation.
    Folder = os.path.join(
        os.path.dirname(os.path.abspath(__file__)), '..', 'Locale')
    File = ReadFile(os.path.join(Folder, Lang + '.json'))
    if File:
        return json.loads(File)
    # Fallback: English locale is assumed to always ship with the program.
    return json.loads(ReadFile(os.path.join(Folder, 'en.json')))
def StripExt(Path):
    """Return Path with its final extension removed.

    'a/b.tar.gz' -> 'a/b.tar'; a name with no dot yields ''.
    """
    # rpartition splits on the last '.'; its first element is everything
    # before it, which is '' when no dot exists (same as the join/split form).
    return Path.rpartition('.')[0]
@ -44,20 +58,6 @@ def GetLevels(Path, Sub=0, AsNum=False):
n = Path.count('/')
return n if AsNum else '../' * n
def GetDeepest(Paths):
    """Return the greatest directory depth (number of '/') among Paths.

    Returns 0 for an empty iterable.
    """
    # Bug fix: the original called GetLevels(p, True), which bound True to the
    # unused Sub parameter instead of AsNum, so GetLevels returned a '../'
    # string and the `>` comparison against an int raised TypeError.
    # Also removed the leftover debug print.
    return max((GetLevels(p, AsNum=True) for p in Paths), default=0)
def GetRelative(Path, Levels):
    """Return a relative-path prefix climbing up Levels directories.

    Path is currently unused; it is kept for interface compatibility with
    existing callers. Removed a leftover debug print and commented-out code.
    """
    return '../' * Levels
def DashifyStr(s, Limit=32):
Str, lc = '', Limit
for c in s[:Limit].replace(' ','-').replace(' ','-'):
@ -97,14 +97,13 @@ def GetTitleIdLine(Line, Title, Type):
NewLine += Line[Index+2:]
return NewLine
def MakeListTitle(File, Meta, Titles, Prefer, SiteRoot, CurLevels, PathPrefix=''):
print(PathPrefix)
def MakeListTitle(File, Meta, Titles, Prefer, SiteRoot, PathPrefix=''):
Title = GetTitle(Meta, Titles, Prefer)
Link = False if Meta['Index'] == 'Unlinked' else True
if Link:
Title = '[{}]({})'.format(
Title,
'{}{}.html'.format(PathPrefix, StripExt(File))) #(GetRelative(File, CurLevels), StripExt(File)))
'{}{}.html'.format(PathPrefix, StripExt(File)))
if Meta['Type'] == 'Post' and Meta['CreatedOn']:
Title = '[{}] {}'.format(
Meta['CreatedOn'],
@ -193,20 +192,16 @@ def PugCompileList(Pages):
Paths += '"{}" '.format(Path)
os.system('pug -P {} > /dev/null'.format(Paths))
def MakeContentHeader(Meta, Locale):
    """Build the created/edited date header of a post as HTML.

    Uses the Locale dict ('CreatedOn'/'EditedOn' keys) instead of hardcoded
    Italian strings; each present date becomes its own Markdown line.
    Returns the header converted to HTML (empty string for non-posts).
    """
    Header = ''
    if Meta['Type'] == 'Post':
        # Each date is emitted independently, so any combination of
        # CreatedOn/EditedOn works without special-casing.
        if Meta['CreatedOn']:
            Header += "{} {} \n".format(Locale['CreatedOn'], Meta['CreatedOn'])
        if Meta['EditedOn']:
            Header += "{} {} \n".format(Locale['EditedOn'], Meta['EditedOn'])
    return Markdown().convert(Header)
def PatchHTML(Template, PartsText, ContextParts, ContextPartsText, HTMLPagesList, PagePath, Content, Titles, Meta, SiteRoot, FolderRoots, Categories):
print(PagePath)
def PatchHTML(Template, PartsText, ContextParts, ContextPartsText, HTMLPagesList, PagePath, Content, Titles, Meta, SiteRoot, FolderRoots, Categories, Locale):
HTMLTitles = FormatTitles(Titles)
for Line in Template.splitlines():
Line = Line.lstrip().rstrip()
@ -233,7 +228,7 @@ def PatchHTML(Template, PartsText, ContextParts, ContextPartsText, HTMLPagesList
Template = Template.replace('[HTML:Page:Path]', PagePath)
Template = Template.replace('[HTML:Page:Style]', Meta['Style'])
Template = Template.replace('[HTML:Page:Content]', Content)
Template = Template.replace('[HTML:Page:ContentHeader]', MakeContentHeader(Meta))
Template = Template.replace('[HTML:Page:ContentHeader]', MakeContentHeader(Meta, Locale))
Template = Template.replace('[HTML:Site:AbsoluteRoot]', SiteRoot)
Template = Template.replace('[HTML:Site:RelativeRoot]', GetLevels(PagePath))
for i in FolderRoots:
@ -260,10 +255,8 @@ def OrderPages(Old):
New.remove([])
return New
def GetHTMLPagesList(Pages, SiteRoot, CurLevels, PathPrefix, Type='Page', Category=None):
List = ''
ToPop = []
LastParent = []
def GetHTMLPagesList(Pages, SiteRoot, PathPrefix, Type='Page', Category=None):
List, ToPop, LastParent = '', [], []
IndexPages = Pages.copy()
for e in IndexPages:
if e[3]['Index'] == 'False' or e[3]['Index'] == 'None':
@ -271,8 +264,7 @@ def GetHTMLPagesList(Pages, SiteRoot, CurLevels, PathPrefix, Type='Page', Catego
for i,e in enumerate(IndexPages):
if e[3]['Type'] != Type:
ToPop += [i]
ToPop.sort()
ToPop.reverse()
ToPop = RevSort(ToPop)
for i in ToPop:
IndexPages.pop(i)
if Type == 'Page':
@ -287,83 +279,107 @@ def GetHTMLPagesList(Pages, SiteRoot, CurLevels, PathPrefix, Type='Page', Catego
LastParent = CurParent
Levels = '- ' * (n-1+i)
if File[:-3].endswith('index.'):
Title = MakeListTitle(File, Meta, Titles, 'HTMLTitle', SiteRoot, CurLevels, PathPrefix)
Title = MakeListTitle(File, Meta, Titles, 'HTMLTitle', SiteRoot, PathPrefix)
else:
Title = CurParent[n-2+i]
List += Levels + Title + '\n'
if not (n > 1 and File[:-3].endswith('index.')):
Levels = '- ' * n
Title = MakeListTitle(File, Meta, Titles, 'HTMLTitle', SiteRoot, CurLevels, PathPrefix)
Title = MakeListTitle(File, Meta, Titles, 'HTMLTitle', SiteRoot, PathPrefix)
List += Levels + Title + '\n'
return Markdown().convert(List)
def DelTmp():
    """Delete leftover page-source files (.md, .pug) from the public folder.

    Iterates the extensions registered in Extensions['Pages'] instead of
    hardcoding one loop per format, so new formats are cleaned automatically.
    """
    for Ext in Extensions['Pages']:
        for File in Path('public').rglob('*.{}'.format(Ext)):
            os.remove(File)
def MakeSite(TemplatesText, PartsText, ContextParts, ContextPartsText, SiteRoot, FolderRoots):
Files = []
Pages = []
Categories = {}
for File in Path('Pages').rglob('*.pug'):
Files += [FileToStr(File, 'Pages/')]
for File in Path('Pages').rglob('*.md'):
Files += [FileToStr(File, 'Pages/')]
Files.sort()
Files.reverse()
def RevSort(List):
    """Sort List in place in descending order and return it."""
    # Single descending sort replaces the sort-then-reverse pair.
    List.sort(reverse=True)
    return List
def DoMinify(HTML):
    """Minify an HTML document with the project's fixed settings.

    Comments and newline-spanning inter-tag whitespace are dropped, empty and
    boolean attributes are reduced, optional attribute quotes are removed,
    character references are converted, and <pre> content is preserved.
    """
    Options = {
        'remove_comments': True,
        'remove_empty_space': True,
        'remove_all_empty_space': False,
        'reduce_empty_attributes': True,
        'reduce_boolean_attributes': True,
        'remove_optional_attribute_quotes': True,
        'convert_charrefs': True,
        'keep_pre': True}
    return htmlmin.minify(HTML, **Options)
def MakeSite(TemplatesText, PartsText, ContextParts, ContextPartsText, SiteRoot, FolderRoots, Locale, Minify):
Files, Pages, Categories = [], [], {}
for Ext in Extensions['Pages']:
for File in Path('Pages').rglob('*.{}'.format(Ext)):
Files += [FileToStr(File, 'Pages/')]
Files = RevSort(Files)
for File in Files:
Content, Titles, Meta = PreProcessor('Pages/{}'.format(File), SiteRoot)
Pages += [[File, Content, Titles, Meta]]
for Category in Meta['Categories']:
Categories.update({Category:''})
PugCompileList(Pages)
print(Files)
for Category in Categories:
Categories[Category] = GetHTMLPagesList(Pages, SiteRoot, 0, '../../', 'Post', Category)
Categories[Category] = GetHTMLPagesList(
Pages=Pages,
SiteRoot=SiteRoot,
PathPrefix='../../', # This hardcodes paths, TODO make it somehow guess the path for every page containing the [HTML:Category] macro
Type='Post',
Category=Category)
for File, Content, Titles, Meta in Pages:
CurLevels = GetLevels(File, 0, True)
PathPrefix = GetLevels(File)
print(PathPrefix)
print(File, CurLevels)
HTMLPagesList = GetHTMLPagesList(Pages, SiteRoot, CurLevels, PathPrefix, 'Page')
HTMLPagesList = GetHTMLPagesList(
Pages=Pages,
SiteRoot=SiteRoot,
PathPrefix=GetLevels(File),
Type='Page')
PagePath = 'public/{}.html'.format(StripExt(File))
if File.endswith('.md'):
Content = Markdown().convert(Content)
elif File.endswith('.pug'):
Content = ReadFile(PagePath)
Template = TemplatesText[Meta['Template']]
Template = Template.replace(
'[HTML:Site:AbsoluteRoot]',
SiteRoot)
Template = Template.replace(
'[HTML:Site:RelativeRoot]',
GetLevels(File))
WriteFile(
PagePath,
PatchHTML(
Template, PartsText, ContextParts, ContextPartsText, HTMLPagesList,
PagePath[len('public/'):], Content, Titles, Meta, SiteRoot, FolderRoots, Categories))
HTML = PatchHTML(
Template=TemplatesText[Meta['Template']],
PartsText=PartsText,
ContextParts=ContextParts,
ContextPartsText=ContextPartsText,
HTMLPagesList=HTMLPagesList,
PagePath=PagePath[len('public/'):],
Content=Content,
Titles=Titles,
Meta=Meta,
SiteRoot=SiteRoot,
FolderRoots=FolderRoots,
Categories=Categories,
Locale=Locale)
if Minify != 'False' and Minify != 'None':
HTML = DoMinify(HTML)
WriteFile(PagePath, HTML)
DelTmp()
def Main(Args):
ResetPublic()
shutil.copytree('Pages', 'public')
MakeSite(
LoadFromDir('Templates', '*.html'),
LoadFromDir('Parts', '*.html'),
literal_eval(Args.ContextParts) if Args.ContextParts else {},
LoadFromDir('ContextParts', '*.html'),
Args.SiteRoot if Args.SiteRoot else '/',
literal_eval(Args.FolderRoots) if Args.FolderRoots else {})
TemplatesText=LoadFromDir('Templates', '*.html'),
PartsText=LoadFromDir('Parts', '*.html'),
ContextParts=literal_eval(Args.ContextParts) if Args.ContextParts else {},
ContextPartsText=LoadFromDir('ContextParts', '*.html'),
SiteRoot=Args.SiteRoot if Args.SiteRoot else '/',
FolderRoots=literal_eval(Args.FolderRoots) if Args.FolderRoots else {},
Locale=LoadLocale(Args.SiteLang if Args.SiteLang else 'en'),
Minify=Args.Minify if Args.Minify else 'None')
os.system("cp -R Assets/* public/")
if __name__ == '__main__':
Parser = argparse.ArgumentParser()
Parser.add_argument('--SiteLang', type=str)
Parser.add_argument('--SiteRoot', type=str)
Parser.add_argument('--FolderRoots', type=str)
Parser.add_argument('--ContextParts', type=str)
Args = Parser.parse_args()
Main(Args)
Parser.add_argument('--Minify', type=str)
Main(
Args=Parser.parse_args())

View File

@ -0,0 +1,30 @@
"""
Copyright (c) 2013, Dave Mankoff
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of Dave Mankoff nor the
names of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL DAVE MANKOFF BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
"""
from .main import minify, Minifier
__version__ = '0.1.12'

175
Source/Libs/htmlmin/command.py Executable file
View File

@ -0,0 +1,175 @@
#!/usr/bin/env python
"""
Copyright (c) 2013, Dave Mankoff
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of Dave Mankoff nor the
names of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL DAVE MANKOFF BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
"""
import argparse
import codecs
import locale
import io
import sys
#import htmlmin
from . import Minifier
# Command-line interface definition for the htmlmin tool.
# Fixed user-visible typos in the help strings ("betwen" -> "between",
# "propietary" -> "proprietary", "unmininfied" -> "unminified", and the
# dropped word in "if and only if a newline character occurs").
parser = argparse.ArgumentParser(
  description='Minify HTML',
  formatter_class=argparse.RawTextHelpFormatter
)

parser.add_argument('input_file',
  nargs='?',
  metavar='INPUT',
  help='File path to html file to minify. Defaults to stdin.',
)
parser.add_argument('output_file',
  nargs='?',
  metavar='OUTPUT',
  help="File path to output to. Defaults to stdout.",
)
parser.add_argument('-c', '--remove-comments',
  help=(
'''When set, comments will be removed. They can be kept on an individual basis
by starting them with a '!': <!--! comment -->. The '!' will be removed from
the final output. If you want a '!' as the leading character of your comment,
put two of them: <!--!! comment -->.

'''),
  action='store_true')
parser.add_argument('-s', '--remove-empty-space',
  help=(
'''When set, this removes empty space between tags in certain cases.
Specifically, it will remove empty space if and only if a newline
character occurs within the space. Thus, code like
'<span>x</span> <span>y</span>' will be left alone, but code such as
'   ...
  </head>
  <body>
  ...'
will become '...</head><body>...'. Note that this CAN break your
html if you spread two inline tags over two lines. Use with caution.

'''),
  action='store_true')
parser.add_argument('--remove-all-empty-space',
  help=(
'''When set, this removes ALL empty space between tags. WARNING: this can and
likely will cause unintended consequences. For instance, '<i>X</i> <i>Y</i>'
will become '<i>X</i><i>Y</i>'. Putting whitespace along with other text will
avoid this problem. Only use if you are confident in the result. Whitespace is
not removed from inside of tags, thus '<span> </span>' will be left alone.

'''),
  action='store_true')
parser.add_argument('--keep-optional-attribute-quotes',
  help=(
'''When set, this keeps all attribute quotes, even if they are optional.

'''),
  action='store_true')
parser.add_argument('-H', '--in-head',
  help=(
'''If you are parsing only a fragment of HTML, and the fragment occurs in the
head of the document, setting this will remove some extra whitespace.

'''),
  action='store_true')
parser.add_argument('-k', '--keep-pre-attr',
  help=(
'''HTMLMin supports the proprietary attribute 'pre' that can be added to elements
to prevent minification. This attribute is removed by default. Set this flag to
keep the 'pre' attributes in place.

'''),
  action='store_true')
parser.add_argument('-a', '--pre-attr',
  help=(
'''The attribute htmlmin looks for to find blocks of HTML that it should not
minify. This attribute will be removed from the HTML unless '-k' is
specified. Defaults to 'pre'.

'''),
  default='pre')
parser.add_argument('-p', '--pre-tags',
  metavar='TAG',
  help=(
'''By default, the contents of 'pre', and 'textarea' tags are left unminified.
You can specify different tags using the --pre-tags option. 'script' and 'style'
tags are always left unminified.

'''),
  nargs='*',
  default=['pre', 'textarea'])
parser.add_argument('-e', '--encoding',
  help=("Encoding to read and write with. Default 'utf-8'."
        " When reading from stdin, attempts to use the system's"
        " encoding before defaulting to utf-8.\n\n"),
  default=None,
)
def main():
  """Entry point: read HTML from a file or stdin, minify, write to a file or stdout.

  Options come from the module-level ``parser``; the minifier is configured
  from them and fed line by line.
  """
  args = parser.parse_args()
  minifier = Minifier(
    remove_comments=args.remove_comments,
    remove_empty_space=args.remove_empty_space,
    remove_optional_attribute_quotes=not args.keep_optional_attribute_quotes,
    pre_tags=args.pre_tags,
    keep_pre=args.keep_pre_attr,
    pre_attr=args.pre_attr,
  )
  default_encoding = args.encoding or 'utf-8'

  if args.input_file:
    inp = codecs.open(args.input_file, encoding=default_encoding)
  else:
    encoding = args.encoding or sys.stdin.encoding \
        or locale.getpreferredencoding() or default_encoding
    # closefd=False: wrap stdin without taking ownership of its descriptor.
    inp = io.open(sys.stdin.fileno(), encoding=encoding, closefd=False)
  # Bug fix: the input handle was never closed; also stream the file instead
  # of materializing all lines with readlines().
  with inp:
    for line in inp:
      minifier.input(line)

  if args.output_file:
    # Bug fix: the output handle was never closed, so buffered output could
    # be lost if the interpreter exited without flushing.
    with codecs.open(
        args.output_file, 'w', encoding=default_encoding) as out:
      out.write(minifier.output)
  else:
    encoding = args.encoding or sys.stdout.encoding \
        or locale.getpreferredencoding() or default_encoding
    # closefd=False keeps fd 1 open; the with-block still flushes the wrapper.
    with io.open(sys.stdout.fileno(), 'w', encoding=encoding,
                 closefd=False) as out:
      out.write(minifier.output)

if __name__ == '__main__':
  main()

View File

@ -0,0 +1,64 @@
"""
Copyright (c) 2013, Dave Mankoff
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of Dave Mankoff nor the
names of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL DAVE MANKOFF BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
"""
from .main import Minifier
def htmlmin(*args, **kwargs):
  """Minifies HTML that is returned by a function.

  A simple decorator that minifies the HTML output of any function that it
  decorates. It supports all the same options that :class:`htmlmin.minify`
  has. With no options, it uses ``minify``'s default settings::

    @htmlmin
    def foobar():
       return '   minify me!   '

  or::

    @htmlmin(remove_comments=True)
    def foobar():
       return '   minify me!  <!-- and remove me! -->'
  """
  def _decorator(fn):
    # One Minifier per decorated function; its bound minify is reused per call.
    minify = Minifier(**kwargs).minify
    def wrapper(*a, **kw):
      return minify(fn(*a, **kw))
    return wrapper

  if len(args) == 1:
    if callable(args[0]) and not kwargs:
      # Bare @htmlmin usage: the single argument is the decorated function.
      return _decorator(args[0])
    else:
      # Bug fix: the message previously read "does accept", inverting its meaning.
      raise RuntimeError(
          'htmlmin decorator does not accept positional arguments')
  elif len(args) > 1:
    raise RuntimeError(
        'htmlmin decorator does not accept positional arguments')
  else:
    # @htmlmin(...) usage: return the real decorator.
    return _decorator

View File

@ -0,0 +1,204 @@
"""
Copyright (c) 2015, Dave Mankoff
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of Dave Mankoff nor the
names of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL DAVE MANKOFF BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
"""
import re
try:
from html import escape
except ImportError:
from cgi import escape
import re
# Quote-style codes returned by escape_attr_value.
NO_QUOTES = 0
SINGLE_QUOTE = 1
DOUBLE_QUOTE = 2

# Character-code bounds used when scanning character references.
UPPER_A = ord('A')
UPPER_F = ord('F')
UPPER_Z = ord('Z')
LOWER_A = ord('a')
LOWER_F = ord('f')
LOWER_Z = ord('z')
ZERO = ord('0')
NINE = ord('9')

# https://www.w3.org/TR/html5/syntax.html#attributes-0
# Characters whose presence forces an attribute value to be quoted.
CHARS_TO_QUOTE_RE = re.compile(u'[\x20\x09\x0a\x0c\x0d=><`]')
def escape_tag(val):
  """HTML-escape text appearing in tag context (delegates to html.escape)."""
  return escape(val)
def escape_attr_name(val):
  """HTML-escape an attribute name (delegates to html.escape)."""
  return escape(val)
def escape_attr_value(val, double_quote=False):
  """Escape an attribute value and choose how it should be quoted.

  Returns a tuple ``(escaped_value, quote_code)`` where ``quote_code`` is one
  of NO_QUOTES, SINGLE_QUOTE or DOUBLE_QUOTE. With ``double_quote=True`` the
  value is always double-quoted; otherwise the style that needs the fewest
  escapes wins, and quotes are dropped entirely only when the value is
  non-empty and contains no character from CHARS_TO_QUOTE_RE.
  """
  val = escape_ambiguous_ampersand(val)
  # Removed dead local `has_html_tag`: it was computed but never used.
  if double_quote:
    return (val.replace('"', '&#34;'), DOUBLE_QUOTE)

  # Count both quote kinds to pick the cheaper quoting style.
  double_quote_count = 0
  single_quote_count = 0
  for ch in val:
    if ch == '"':
      double_quote_count += 1
    elif ch == "'":
      single_quote_count += 1
  if double_quote_count > single_quote_count:
    return (val.replace("'", '&#39;'), SINGLE_QUOTE)
  elif single_quote_count:
    return (val.replace('"', '&#34;'), DOUBLE_QUOTE)

  if not val or CHARS_TO_QUOTE_RE.search(val):
    return (val, DOUBLE_QUOTE)
  return (val, NO_QUOTES)
def escape_ambiguous_ampersand(val):
  """Escape '&' characters that would otherwise parse as character references.

  Scans val with a four-state machine (plain text / after '&' / after '&#' /
  after '&#x'). A '&' whose following text forms a complete named, decimal,
  or hex reference body terminated by ';' is escaped to '&amp;'; ampersands
  that cannot be confused with a reference pass through unchanged.
  """
  # TODO: this function could probably be made a lot faster.
  if not '&' in val:  # short circuit for speed
    return val

  state = 0      # 0: text, 1: after '&', 2: after '&#', 3: after '&#x'
  result = []    # output characters
  amp_buff = []  # candidate reference body collected since the last '&'
  for c in val:
    if state == 0:  # beginning
      if c == '&':
        state = 1
      else:
        result.append(c)
    elif state == 1:  # ampersand
      ord_c = ord(c)
      if (UPPER_A <= ord_c <= UPPER_Z or
          LOWER_A <= ord_c <= LOWER_Z or
          ZERO <= ord_c <= NINE):
        amp_buff.append(c)  # TODO: use "name character references" section
        # https://html.spec.whatwg.org/multipage/syntax.html#named-character-references
      elif c == '#':
        state = 2
      elif c == ';':
        # ';' closes the reference: escape only if a body was collected.
        if amp_buff:
          result.append('&')
          result.extend(amp_buff)
          result.append(';')
        else:
          result.append('&;')
        state = 0
        amp_buff = []
      elif c == '&':
        # A new '&' restarts collection; flush what was pending first.
        if amp_buff:
          result.append('&amp;')
          result.extend(amp_buff)
        else:
          result.append('&')
        amp_buff = []
      else:
        result.append('&')
        result.extend(amp_buff)
        result.append(c)
        state = 0
        amp_buff = []
    elif state == 2:  # numeric character reference
      ord_c = ord(c)
      if c == 'x' or c == 'X':
        state = 3
      elif ZERO <= ord_c <= NINE:
        amp_buff.append(c)
      elif c == ';':
        if amp_buff:
          result.append('&#')
          result.extend(amp_buff)
          result.append(';')
        else:
          result.append('&#;')
        state = 0
        amp_buff = []
      elif c == '&':
        if amp_buff:
          result.append('&amp;#')
          result.extend(amp_buff)
        else:
          result.append('&#')
        state = 1
        amp_buff = []
      else:
        if amp_buff:
          result.append('&amp;#')
          result.extend(amp_buff)
          result.append(c)
        else:
          result.append('&#')
          result.append(c)
        state = 0
        amp_buff = []
    elif state == 3:  # hex character reference
      ord_c = ord(c)
      if (UPPER_A <= ord_c <= UPPER_F or
          LOWER_A <= ord_c <= LOWER_F or
          ZERO <= ord_c <= NINE):
        amp_buff.append(c)
      elif c == ';':
        if amp_buff:
          result.append('&#x')
          result.extend(amp_buff)
          result.append(';')
        else:
          result.append('&#x;')
        state = 0
        amp_buff = []
      elif c == '&':
        if amp_buff:
          result.append('&amp;#x')
          result.extend(amp_buff)
        else:
          result.append('&#x')
        state = 1
        amp_buff = []
      else:
        if amp_buff:
          result.append('&amp;#x')
          result.extend(amp_buff)
          result.append(c)
        else:
          result.append('&#x')
          result.append(c)
        state = 0
        amp_buff = []

  # Input ended mid-reference: flush whatever prefix was pending, escaped.
  if state == 1:
    result.append('&amp;')
    result.extend(amp_buff)
  elif state == 2:
    result.append('&amp;#')
    result.extend(amp_buff)
  elif state == 3:
    result.append('&amp;#x')
    result.extend(amp_buff)

  return ''.join(result)

193
Source/Libs/htmlmin/main.py Normal file
View File

@ -0,0 +1,193 @@
"""
Copyright (c) 2013, Dave Mankoff
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of Dave Mankoff nor the
names of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL DAVE MANKOFF BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
"""
import cgi
from . import parser
def minify(input,
           remove_comments=False,
           remove_empty_space=False,
           remove_all_empty_space=False,
           reduce_empty_attributes=True,
           reduce_boolean_attributes=False,
           remove_optional_attribute_quotes=True,
           convert_charrefs=True,
           keep_pre=False,
           pre_tags=parser.PRE_TAGS,
           pre_attr='pre',
           cls=parser.HTMLMinParser):
  """Minifies HTML in one shot.

  :param input: A string containing the HTML to be minified.
  :param remove_comments: Remove comments found in HTML. Individual comments can
    be maintained by putting a ``!`` as the first character inside the comment.
    Thus::

       <!-- FOO --> <!--! BAR -->

    Will become simply::

       <!-- BAR -->

    The added exclamation is removed.
  :param remove_empty_space: Remove empty space found in HTML between an opening
    and a closing tag and when it contains a newline or carriage return. If
    whitespace is found that is only spaces and/or tabs, it will be turned into
    a single space. Be careful, this can have unintended consequences.
  :param remove_all_empty_space: A more extreme version of
    ``remove_empty_space``, this removes all empty whitespace found between
    tags. This is almost guaranteed to break your HTML unless you are very
    careful.
  :param reduce_boolean_attributes: Where allowed by the HTML5 specification,
    attributes such as 'disabled' and 'readonly' will have their value removed,
    so 'disabled="true"' will simply become 'disabled'. This is generally a
    good option to turn on except when JavaScript relies on the values.
  :param remove_optional_attribute_quotes: When True, optional quotes around
    attributes are removed. When False, all attribute quotes are left intact.
    Defaults to True.
  :param convert_charrefs: Decode character references such as &amp; and &#46;
    to their single character values where safe. This currently only applies to
    attributes. Data content between tags will be left encoded.
  :param keep_pre: By default, htmlmin uses the special attribute ``pre`` to
    allow you to demarcate areas of HTML that should not be minified. It removes
    this attribute as it finds it. Setting this value to ``True`` tells htmlmin
    to leave the attribute in the output.
  :param pre_tags: A list of tag names that should never be minified. You are
    free to change this list as you see fit, but you will probably want to
    include ``pre`` and ``textarea`` if you make any changes to the list. Note
    that ``<script>`` and ``<style>`` tags are never minimized.
  :param pre_attr: Specifies the attribute that, when found in an HTML tag,
    indicates that the content of the tag should not be minified. Defaults to
    ``pre``. You can also prefix individual tag attributes with
    ``{pre_attr}-`` to prevent the contents of the individual attribute from
    being changed.
  :return: A string containing the minified HTML.

  If you are going to be minifying multiple HTML documents, each with the same
  settings, consider using :class:`.Minifier`.
  """
  # One-shot use: build a parser, feed the whole document, return its result.
  minifier = cls(
    remove_comments=remove_comments,
    remove_empty_space=remove_empty_space,
    remove_all_empty_space=remove_all_empty_space,
    reduce_empty_attributes=reduce_empty_attributes,
    reduce_boolean_attributes=reduce_boolean_attributes,
    remove_optional_attribute_quotes=remove_optional_attribute_quotes,
    convert_charrefs=convert_charrefs,
    keep_pre=keep_pre,
    pre_tags=pre_tags,
    pre_attr=pre_attr)
  minifier.feed(input)
  minifier.close()
  return minifier.result
class Minifier(object):
"""An object that supports HTML Minification.
Options are passed into this class at initialization time and are then
persisted across each use of the instance. If you are going to be minifying
multiple peices of HTML, this will be more efficient than using
:class:`htmlmin.minify`.
See :class:`htmlmin.minify` for an explanation of options.
"""
def __init__(self,
remove_comments=False,
remove_empty_space=False,
remove_all_empty_space=False,
reduce_empty_attributes=True,
reduce_boolean_attributes=False,
remove_optional_attribute_quotes=True,
convert_charrefs=True,
keep_pre=False,
pre_tags=parser.PRE_TAGS,
pre_attr='pre',
cls=parser.HTMLMinParser):
"""Initialize the Minifier.
See :class:`htmlmin.minify` for an explanation of options.
"""
self._parser = cls(
remove_comments=remove_comments,
remove_empty_space=remove_empty_space,
remove_all_empty_space=remove_all_empty_space,
reduce_empty_attributes=reduce_empty_attributes,
reduce_boolean_attributes=reduce_boolean_attributes,
remove_optional_attribute_quotes=remove_optional_attribute_quotes,
convert_charrefs=convert_charrefs,
keep_pre=keep_pre,
pre_tags=pre_tags,
pre_attr=pre_attr)
def minify(self, *input):
"""Runs HTML through the minifier in one pass.
:param input: HTML to be fed into the minimizer. Multiple chunks of HTML
can be provided, and they are fed in sequentially as if they were
concatenated.
:returns: A string containing the minified HTML.
This is the simplest way to use an existing ``Minifier`` instance. This
method takes in HTML and minfies it, returning the result. Note that this
method resets the internal state of the parser before it does any work. If
there is pending HTML in the buffers, it will be lost.
"""
self._parser.reset()
self.input(*input)
return self.finalize()
def input(self, *input):
"""Feed more HTML into the input stream
:param input: HTML to be fed into the minimizer. Multiple chunks of HTML
can be provided, and they are fed in sequentially as if they were
concatenated. You can also call this method multiple times to achieve
the same effect.
"""
for i in input:
self._parser.feed(i)
  @property
  def output(self):
    """Retrieve the minified output generated thus far.

    Reads the parser's accumulated result buffer as-is; unlike
    :meth:`finalize`, this neither flushes pending input nor resets state.
    """
    return self._parser.result
def finalize(self):
"""Finishes current input HTML and returns mininified result.
This method flushes any remaining input HTML and returns the minified
result. It resets the state of the internal parser in the process so that
new HTML can be minified. Be sure to call this method before you reuse
the ``Minifier`` instance on a new HTML document.
"""
self._parser.close()
result = self._parser.result
self._parser.reset()
return result

View File

@ -0,0 +1,92 @@
"""
Copyright (c) 2013, Dave Mankoff
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of Dave Mankoff nor the
names of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL DAVE MANKOFF BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
"""
from .main import Minifier
class HTMLMinMiddleware(object):
  """WSGI Middleware that minifies html on the way out.

  :param by_default: Specifies if minification should be turned on or off by
    default. Defaults to ``True``.
  :param keep_header: The middleware recognizes one custom HTTP header that
    can be used to turn minification on or off on a per-request basis:
    ``X-HTML-Min-Enable``. Setting the header to ``true`` will turn
    minification on; anything else will turn minification off. If
    ``by_default`` is set to ``False``, this header is how you would turn
    minification back on. The middleware, by default, removes the header from
    the output. Setting this to ``True`` leaves the header intact.
  :param debug: A quick setting to turn all minification off. The middleware
    is effectively bypassed.

  This simple middleware minifies any HTML content that passes through it. Any
  additional keyword arguments beyond the three settings the middleware has are
  passed on to the internal minifier. The documentation for the options can
  be found under :class:`htmlmin.minify`.
  """
  def __init__(self, app, by_default=True, keep_header=False,
               debug=False, **kwargs):
    self.app = app
    self.by_default = by_default
    self.debug = debug
    self.keep_header = keep_header
    self.minifier = Minifier(**kwargs)

  def __call__(self, environ, start_response):
    # In debug mode the middleware is a transparent pass-through.
    if self.debug:
      return self.app(environ, start_response)

    should_minify = []  # need to use a mutable object so we can change it
                        # in a different scope.

    def minified_start_response(status, headers, exc_info=None):
      # Decide from the response headers whether to minify, and strip the
      # control header from the outgoing response unless asked to keep it.
      should_minify.append(self.should_minify(headers))
      if not self.keep_header:
        headers = [(header, value) for header, value in
                   headers if header != 'X-HTML-Min-Enable']
      start_response(status, headers, exc_info)

    # Materialize the body first: start_response must have run (filling
    # should_minify) before we can inspect should_minify[0].
    html = [i for i in self.app(environ, minified_start_response)]
    if should_minify[0]:
      return [self.minifier.minify(*html)]
    return html

  def should_minify(self, headers):
    """Return True when the response is HTML that should be minified.

    Honors the ``X-HTML-Min-Enable`` override header and the ``by_default``
    setting; scans the header list once with early exit.
    """
    is_html = False
    flag_header = None
    for header, value in headers:
      # BUGFIX: compare only the media type, so responses such as
      # "text/html; charset=utf-8" are recognized as HTML too. The old
      # exact-equality test silently skipped any Content-Type with parameters.
      if (not is_html and header == 'Content-Type' and
          value.split(';', 1)[0].strip() == 'text/html'):
        is_html = True
        if flag_header is not None:
          break

      if flag_header is None and header == 'X-HTML-Min-Enable':
        flag_header = (value.lower() == 'true')
        if is_html:
          break

    return is_html and (
      (self.by_default and flag_header != False) or
      (not self.by_default and flag_header))

View File

@ -0,0 +1,408 @@
"""
Copyright (c) 2013, Dave Mankoff
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of Dave Mankoff nor the
names of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL DAVE MANKOFF BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
"""
from __future__ import unicode_literals
import logging
import sys
import re
from .python3html.parser import HTMLParser
from . import escape
# Whitespace handling: the five HTML "space characters".
# https://www.w3.org/TR/html5/single-page.html#space-character
HTML_SPACE_RE = re.compile('[\x20\x09\x0a\x0c\x0d]+')
HTML_ALL_SPACE_RE = re.compile('^[\x20\x09\x0a\x0c\x0d]+$')
HTML_LEADING_SPACE_RE = re.compile(
  '^[\x20\x09\x0a\x0c\x0d]+')
HTML_TRAILING_SPACE_RE = re.compile(
  '[\x20\x09\x0a\x0c\x0d]+$')
HTML_LEADING_TRAILING_SPACE_RE = re.compile(
  '(^[\x20\x09\x0a\x0c\x0d]+)|([\x20\x09\x0a\x0c\x0d]+$)')

# Tags whose textual content must never be collapsed.
PRE_TAGS = ('pre', 'textarea')  # styles and scripts are never minified

# Void elements that take no closing tag.
# http://www.w3.org/TR/html51/syntax.html#elements-0
NO_CLOSE_TAGS = ('area', 'base', 'br', 'col', 'command', 'embed', 'hr', 'img',
                 'input', 'keygen', 'link', 'meta', 'param', 'source', 'track',
                 'wbr')

# Per-tag boolean attributes that can be emitted without a value when
# reduce_boolean_attributes is enabled; '*' applies to every tag.
# http://www.w3.org/TR/html51/index.html#attributes-1
BOOLEAN_ATTRIBUTES = {
  'audio': ('autoplay', 'controls', 'hidden', 'loop', 'muted',),
  'button': ('autofocus', 'disabled', 'formnovalidate', 'hidden',),
  'command': ('checked', 'disabled', 'hidden'),
  'dialog': ('hidden', 'open',),
  'fieldset': ('disabled', 'hidden',),
  'form': ('hidden', 'novalidate',),
  'iframe': ('hidden', 'seamless',),
  'img': ('hidden', 'ismap',),
  'input': ('autofocus', 'checked', 'disabled', 'formnovalidate', 'hidden',
            'multiple', 'readonly', 'required',),
  'keygen': ('autofocus', 'disabled', 'hidden',),
  'object': ('hidden', 'typesmustmatch',),
  'ol': ('hidden', 'reversed',),
  'optgroup': ('disabled', 'hidden',),
  'option': ('disabled', 'hidden', 'selected',),
  'script': ('async', 'defer', 'hidden',),
  'select': ('autofocus', 'disabled', 'hidden', 'multiple', 'required',),
  'style': ('hidden', 'scoped',),
  'textarea': ('autofocus', 'disabled', 'hidden', 'readonly', 'required',),
  'track': ('default', 'hidden', ),
  'video': ('autoplay', 'controls', 'hidden', 'loop', 'muted',),
  '*': ('hidden',),
}

# a list of tags and tags that they are closed by
TAG_SETS = {
  'li': ('li',),
  'dd': ('dd', 'dt'),
  'rp': ('rp', 'rt'),
  'p': ('address', 'article', 'aside', 'blockquote', 'dir', 'div', 'dl',
        'fieldset', 'footer', 'form', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6',
        'header', 'hgroup', 'hr', 'menu', 'nav', 'ol', 'p', 'pre', 'section',
        'table', 'ul'),
  'optgroup': ('optgroup',),
  'option': ('option', 'optgroup'),
  'colgroup': '*',
  'tbody': ('tbody', 'tfoot'),
  'tfoot': ('tbody',),
  'tr': ('tr',),
  'td': ('td', 'th'),
}
# Tags that share the same implicit-close rules as an entry above.
TAG_SETS['dt'] = TAG_SETS['dd']
TAG_SETS['rt'] = TAG_SETS['rp']
TAG_SETS['thead'] = TAG_SETS['tbody']
TAG_SETS['th'] = TAG_SETS['td']

# Tag omission rules:
# http://www.w3.org/TR/html51/syntax.html#optional-tags
# Exception hierarchy: HTMLMinError is the package root; ParseError covers
# malformed-input failures; OpenTagNotFoundError is raised when a closing tag
# has no matching open tag on the stack (see _close_tags_up_to).
class HTMLMinError(Exception): pass
class ParseError(HTMLMinError): pass
class OpenTagNotFoundError(ParseError): pass
class HTMLMinParser(HTMLParser):
  """HTML parser whose handler callbacks build a minified copy of the input.

  Feed HTML with :meth:`feed`; read the minified markup from the ``result``
  property. ``_tag_stack`` holds open tags as ``(tag, start_pre, lang)``
  tuples, newest first; ``_in_pre_tag`` counts how many enclosing contexts
  must be preserved verbatim.
  """

  def __init__(self,
               remove_comments=False,
               remove_empty_space=False,
               remove_all_empty_space=False,
               reduce_empty_attributes=True,
               reduce_boolean_attributes=False,
               remove_optional_attribute_quotes=True,
               convert_charrefs=True,
               keep_pre=False,
               pre_tags=PRE_TAGS,
               pre_attr='pre'):
    # NOTE(review): this version test is False for a hypothetical Python 4.x;
    # works for all real 2.7/3.x releases. Charref conversion is disabled in
    # the base parser so this class can control it itself (see unescape()).
    if sys.version_info[0] >= 3 and sys.version_info[1] >= 4:
      # convert_charrefs is True by default in Python 3.5.0 and newer. It was
      # introduced in 3.4.
      HTMLParser.__init__(self, convert_charrefs=False)
    else:
      HTMLParser.__init__(self)
    self.keep_pre = keep_pre
    self.pre_tags = pre_tags
    self.remove_comments = remove_comments
    self.remove_empty_space = remove_empty_space
    self.remove_all_empty_space = remove_all_empty_space
    self.reduce_empty_attributes = reduce_empty_attributes
    self.reduce_boolean_attributes = reduce_boolean_attributes
    self.remove_optional_attribute_quotes = remove_optional_attribute_quotes
    self.convert_charrefs = convert_charrefs
    self.pre_attr = pre_attr
    self.reset()

  def _tag_lang(self):
    # lang attribute of the innermost open tag, or None when the stack is empty.
    return self._tag_stack[0][2] if self._tag_stack else None

  def build_tag(self, tag, attrs, close_tag):
    """Render a start (or self-closing) tag with minified attributes.

    Returns ``(has_pre, markup, lang)`` where ``has_pre`` signals the
    ``pre`` marker attribute was present and ``lang`` is the effective
    language for descendants.
    """
    has_pre = False
    if self.reduce_boolean_attributes:
      bool_attrs = BOOLEAN_ATTRIBUTES.get(tag, BOOLEAN_ATTRIBUTES['*'])
    else:
      bool_attrs = False
    lang = self._tag_lang()

    attrs = list(attrs)  # We're modifying it in place
    last_quoted = last_no_slash = i = -1
    for k, v in attrs:
      # Attributes prefixed with "<pre_attr>-" are emitted verbatim
      # (no escaping, no charref conversion) with the prefix stripped.
      pre_prefix = k.startswith("{}-".format(self.pre_attr))
      if pre_prefix:
        k = k[len(self.pre_attr)+1:]
      if k == self.pre_attr:
        has_pre = True
        if not self.keep_pre and not pre_prefix:
          continue

      if v and self.convert_charrefs and not pre_prefix:
        v = HTMLParser.unescape(self, v)

      # Drop a lang attribute that merely repeats the inherited language.
      if k == 'lang':
        lang = v
        if v == self._tag_lang():
          continue

      i += 1
      if not pre_prefix:
        k = escape.escape_attr_name(k)
      if (v is None or (not v and self.reduce_empty_attributes) or
          (bool_attrs and k in bool_attrs)):
        # For our use case, we treat boolean attributes as quoted because they
        # don't require space between them and "/>" in closing tags.
        attrs[i] = k
        last_quoted = i
      else:
        if pre_prefix:
          # Choose quoting for a verbatim value without altering it; only if
          # it contains both quote styles do we fall back to escaping.
          has_double_quotes = '"' in v
          has_single_quotes = "'" in v
          if not has_double_quotes:
            if not has_single_quotes and self.remove_optional_attribute_quotes:
              q = escape.NO_QUOTES
            else:
              q = escape.DOUBLE_QUOTE
          elif not has_single_quotes:
            q = escape.SINGLE_QUOTES
          else:
            logging.error('Unsafe content found in pre-attribute. Escaping.')
            (v, q) = escape.escape_attr_value(
              v, double_quote=not self.remove_optional_attribute_quotes)
        else:
          (v, q) = escape.escape_attr_value(
            v, double_quote=not self.remove_optional_attribute_quotes)
        if q == escape.NO_QUOTES:
          attrs[i] = '%s=%s' % (k, v)
          if v[-1] != '/':
            last_no_slash = i
        else:
          q = '"' if q == escape.DOUBLE_QUOTE else "'"
          attrs[i] = '%s=%s%s%s' % (k, q, v, q)
          last_quoted = i

    # Trim any attributes skipped by the `continue`s above.
    i += 1
    if i != len(attrs):
      del attrs[i:]

    # 1. If there are no attributes, no additional space is necessary.
    # 2. If last attribute is quoted, no additional space is necessary.
    # 3. Two things are happening here:
    #    a) according to the standard, <foo bar=baz/> should be treated as <foo
    #       bar="baz/"> so space is necessary if this is self-closing tag,
    #       however
    #    b) reportedly (https://github.com/mankyd/htmlmin/pull/12), older
    #       versions of WebKit interpret <foo bar=baz/> as self-closing tag so
    #       we need the space if the last argument ends with a slash.
    space_maybe = ''
    if attrs:
      needs_space = lambda last_attr: (last_attr[-1] not in '"\'' and
                                       (close_tag or last_attr[-1] == '/'))
      if needs_space(attrs[-1][-1]):
        # If moving attributes around can help, do it. Otherwise bite the
        # bullet and put the space in.
        i = last_no_slash if last_quoted == -1 else last_quoted
        if i == -1 or needs_space(attrs[i]):
          space_maybe = ' '
        else:
          attrs.append(attrs[i])
          del attrs[i]

    return has_pre, '<%s%s%s%s%s>' % (escape.escape_tag(tag),
                                      ' ' if attrs else '',
                                      ' '.join(attrs),
                                      space_maybe,
                                      '/' if close_tag else ''), lang

  def handle_decl(self, decl):
    # Drop a leading whitespace-only data chunk so the doctype starts clean.
    if (len(self._data_buffer) == 1 and
        HTML_SPACE_RE.match(self._data_buffer[0][0])):
      self._data_buffer = []
    self._data_buffer.append('<!' + decl + '>')
    self._after_doctype = True

  def _close_tags_up_to(self, tag):
    # Pop stack entries until `tag` is found; returns how many "pre" contexts
    # were closed so the caller can adjust self._in_pre_tag.
    num_pres = 0
    i = 0
    for i, t in enumerate(self._tag_stack):
      if t[1]:
        num_pres += 1
      if t[0] == tag:
        break

      # Only the html tag can close out everything. Put on the brakes if
      # we encounter a closing tag that we didn't recognize.
      if tag != 'html' and t[0] in ('body', 'html', 'head'):
        raise OpenTagNotFoundError()

    self._tag_stack = self._tag_stack[i+1:]

    return num_pres

  def handle_starttag(self, tag, attrs):
    self._after_doctype = False
    if tag == 'head':
      self._in_head = True
    elif self._in_head and tag == 'title':
      self._in_title = True
      self._title_newly_opened = True

    # Implicitly close open tags that this start tag terminates per TAG_SETS
    # (e.g. a new <li> closes a previous <li>).
    for t in self._tag_stack:
      closed_by_tags = TAG_SETS.get(t[0])
      if closed_by_tags and (closed_by_tags == '*' or tag in closed_by_tags):
        self._in_pre_tag -= self._close_tags_up_to(t[0])
        break

    has_pre, data, lang = self.build_tag(tag, attrs, False)
    start_pre = False
    if (has_pre or self._in_pre_tag > 0 or
        tag == 'script' or tag == 'style' or tag in self.pre_tags):
      self._in_pre_tag += 1
      start_pre = True

    self._tag_stack.insert(0, (tag, start_pre, lang))
    self._data_buffer.append(data)

  def handle_endtag(self, tag):
    # According to the spec, <p> tags don't get closed when a parent a
    # tag closes them. Here's some logic that addresses this.
    if tag == 'a':
      contains_p = False
      for i, t in enumerate(self._tag_stack):
        if t[0] == 'p':
          contains_p = True
        elif t[0] == 'a':
          break

      if contains_p:  # the p tag, and all its children should be left open
        a_tag = self._tag_stack.pop(i)
        if a_tag[1]:
          self._in_pre_tag -= 1
    else:
      if tag == 'head':
        # TODO: Did we know that we were in an head tag?! If not, we need to
        # reminify everything to remove extra spaces.
        self._in_head = False
      elif tag == 'title':
        self._in_title = False
        self._title_newly_opened = False
      try:
        self._in_pre_tag -= self._close_tags_up_to(tag)
      except OpenTagNotFoundError:
        # Some tags don't require a start tag. Most do. Either way, we leave
        # closing tags along since they affect output. For instance, a '</p>'
        # results in a '<p></p>' in Chrome.
        pass
    if tag not in NO_CLOSE_TAGS:
      self._data_buffer.extend(['</', escape.escape_tag(tag), '>'])

  def handle_startendtag(self, tag, attrs):
    self._after_doctype = False
    # Void elements get no trailing '/'; everything else keeps XHTML style.
    data = self.build_tag(tag, attrs, tag not in NO_CLOSE_TAGS)[1]
    self._data_buffer.append(data)

  def handle_comment(self, data):
    # Comments beginning with '!' or IE conditional comments ("[if ...") are
    # kept even when remove_comments is enabled; a leading '!' is stripped.
    if not self.remove_comments or re.match(r'^(?:!|\[if\s)', data):
      self._data_buffer.append('<!--{}-->'.format(
        data[1:] if len(data) and data[0] == '!' else data))

  def handle_data(self, data):
    if self._in_pre_tag > 0:
      # Inside a preserved context, text passes through untouched.
      self._data_buffer.append(data)
    else:
      # remove_all_empty_space matches everything. remove_empty_space only
      # matches if there's a newline involved.
      if self.remove_all_empty_space or self._in_head or self._after_doctype:
        if HTML_ALL_SPACE_RE.match(data):
          return
      elif (self.remove_empty_space and HTML_ALL_SPACE_RE.match(data) and
            ('\n' in data or '\r' in data)):
        return

      # if we're in the title, remove leading and trailing whitespace.
      # note that the title may be parsed in chunks if entityref's or charrefs
      # are encountered.
      if self._in_title:
        if self.__title_trailing_whitespace:
          self._data_buffer.append(' ')
        self.__title_trailing_whitespace = (
          HTML_ALL_SPACE_RE.match(data[-1]) is not None)
        if self._title_newly_opened:
          self._title_newly_opened = False
          data = HTML_LEADING_TRAILING_SPACE_RE.sub('', data)
        else:
          data = HTML_TRAILING_SPACE_RE.sub(
            '', HTML_LEADING_TRAILING_SPACE_RE.sub(' ', data))

      # Collapse every run of space characters to a single space.
      data = HTML_SPACE_RE.sub(' ', data)
      if not data:
        return

      if self._in_pre_tag == 0 and self._data_buffer:
        # If we're not in a pre block, its possible that we append two spaces
        # together, which we want to avoid. For instance, if we remove a comment
        # from between two blocks of text: a <!-- B --> c => a c.
        if data[0] == ' ' and self._data_buffer[-1][-1] == ' ':
          data = data[1:]
          if not data:
            return
      self._data_buffer.append(data)

  def handle_entityref(self, data):
    # Named references inside <title> need the deferred-space bookkeeping.
    if self._in_title:
      if not self._title_newly_opened and self.__title_trailing_whitespace:
        self._data_buffer.append(' ')
        self.__title_trailing_whitespace = False
      self._title_newly_opened = False
    self._data_buffer.append('&{};'.format(data))

  def handle_charref(self, data):
    # Numeric references inside <title> need the deferred-space bookkeeping.
    if self._in_title:
      if not self._title_newly_opened and self.__title_trailing_whitespace:
        self._data_buffer.append(' ')
        self.__title_trailing_whitespace = False
      self._title_newly_opened = False
    self._data_buffer.append('&#{};'.format(data))

  def handle_pi(self, data):
    # Processing instructions are passed through unchanged.
    self._data_buffer.append('<?' + data + '>')

  def unknown_decl(self, data):
    # Marked sections (e.g. CDATA) are passed through unchanged.
    self._data_buffer.append('<![' + data + ']>')

  def reset(self):
    # Clear all minifier state; also invoked from __init__.
    self._data_buffer = []
    self._in_pre_tag = 0
    self._in_head = False
    self._in_title = False
    self._after_doctype = False
    self._tag_stack = []
    self._title_newly_opened = False
    self.__title_trailing_whitespace = False
    HTMLParser.reset(self)

  def unescape(self, val):
    """Override this method so that we can handle char ref conversion ourself.
    """
    return val

  @property
  def result(self):
    # The minified document accumulated so far.
    return ''.join(self._data_buffer)

View File

@ -0,0 +1,139 @@
"""
General functions for HTML manipulation.
"""
import re as _re
# Load the HTML5 named-entity table. On Python 3 it ships in html.entities;
# on Python 2 we rebuild an equivalent table from htmlentitydefs.
try:
    from html.entities import html5 as _html5
    unichr = chr
except ImportError:
    import htmlentitydefs
    _html5 = {'apos;':u"'"}  # apos; is missing from the Python 2 table
    for k, v in htmlentitydefs.name2codepoint.iteritems():
        _html5[k + ';'] = unichr(v)

__all__ = ['escape', 'unescape']
def escape(s, quote=True):
    """
    Replace the special characters "&", "<" and ">" with HTML-safe sequences.

    If the optional flag *quote* is true (the default), the quotation mark
    characters, both double quote (") and single quote (') characters, are
    also translated.
    """
    # '&' must be translated first so later substitutions don't double-escape
    # the ampersands they introduce.
    substitutions = [('&', '&amp;'), ('<', '&lt;'), ('>', '&gt;')]
    if quote:
        substitutions.append(('"', '&quot;'))
        substitutions.append(("'", '&#x27;'))
    for char, entity in substitutions:
        s = s.replace(char, entity)
    return s
# Numeric character references that the HTML5 spec maps to replacement
# characters (mostly the Windows-1252 range 0x80-0x9f).
# see http://www.w3.org/TR/html5/syntax.html#tokenizing-character-references
_invalid_charrefs = {
    0x00: '\ufffd',  # REPLACEMENT CHARACTER
    0x0d: '\r',      # CARRIAGE RETURN
    0x80: '\u20ac',  # EURO SIGN
    0x81: '\x81',    # <control>
    0x82: '\u201a',  # SINGLE LOW-9 QUOTATION MARK
    0x83: '\u0192',  # LATIN SMALL LETTER F WITH HOOK
    0x84: '\u201e',  # DOUBLE LOW-9 QUOTATION MARK
    0x85: '\u2026',  # HORIZONTAL ELLIPSIS
    0x86: '\u2020',  # DAGGER
    0x87: '\u2021',  # DOUBLE DAGGER
    0x88: '\u02c6',  # MODIFIER LETTER CIRCUMFLEX ACCENT
    0x89: '\u2030',  # PER MILLE SIGN
    0x8a: '\u0160',  # LATIN CAPITAL LETTER S WITH CARON
    0x8b: '\u2039',  # SINGLE LEFT-POINTING ANGLE QUOTATION MARK
    0x8c: '\u0152',  # LATIN CAPITAL LIGATURE OE
    0x8d: '\x8d',    # <control>
    0x8e: '\u017d',  # LATIN CAPITAL LETTER Z WITH CARON
    0x8f: '\x8f',    # <control>
    0x90: '\x90',    # <control>
    0x91: '\u2018',  # LEFT SINGLE QUOTATION MARK
    0x92: '\u2019',  # RIGHT SINGLE QUOTATION MARK
    0x93: '\u201c',  # LEFT DOUBLE QUOTATION MARK
    0x94: '\u201d',  # RIGHT DOUBLE QUOTATION MARK
    0x95: '\u2022',  # BULLET
    0x96: '\u2013',  # EN DASH
    0x97: '\u2014',  # EM DASH
    0x98: '\u02dc',  # SMALL TILDE
    0x99: '\u2122',  # TRADE MARK SIGN
    0x9a: '\u0161',  # LATIN SMALL LETTER S WITH CARON
    0x9b: '\u203a',  # SINGLE RIGHT-POINTING ANGLE QUOTATION MARK
    0x9c: '\u0153',  # LATIN SMALL LIGATURE OE
    0x9d: '\x9d',    # <control>
    0x9e: '\u017e',  # LATIN SMALL LETTER Z WITH CARON
    0x9f: '\u0178',  # LATIN CAPITAL LETTER Y WITH DIAERESIS
}
# Code points that HTML5 says must be dropped entirely when produced by a
# numeric character reference (controls and permanent noncharacters).
_invalid_codepoints = {
    # 0x0001 to 0x0008
    0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8,
    # 0x000E to 0x001F
    0xe, 0xf, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19,
    0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f,
    # 0x007F to 0x009F
    0x7f, 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89, 0x8a,
    0x8b, 0x8c, 0x8d, 0x8e, 0x8f, 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96,
    0x97, 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f,
    # 0xFDD0 to 0xFDEF
    0xfdd0, 0xfdd1, 0xfdd2, 0xfdd3, 0xfdd4, 0xfdd5, 0xfdd6, 0xfdd7, 0xfdd8,
    0xfdd9, 0xfdda, 0xfddb, 0xfddc, 0xfddd, 0xfdde, 0xfddf, 0xfde0, 0xfde1,
    0xfde2, 0xfde3, 0xfde4, 0xfde5, 0xfde6, 0xfde7, 0xfde8, 0xfde9, 0xfdea,
    0xfdeb, 0xfdec, 0xfded, 0xfdee, 0xfdef,
    # others
    0xb, 0xfffe, 0xffff, 0x1fffe, 0x1ffff, 0x2fffe, 0x2ffff, 0x3fffe, 0x3ffff,
    0x4fffe, 0x4ffff, 0x5fffe, 0x5ffff, 0x6fffe, 0x6ffff, 0x7fffe, 0x7ffff,
    0x8fffe, 0x8ffff, 0x9fffe, 0x9ffff, 0xafffe, 0xaffff, 0xbfffe, 0xbffff,
    0xcfffe, 0xcffff, 0xdfffe, 0xdffff, 0xefffe, 0xeffff, 0xffffe, 0xfffff,
    0x10fffe, 0x10ffff
}
def _replace_charref(s):
    # Substitution callback for _charref: `s` is the regex match and group(1)
    # is the reference text without the leading '&'.
    s = s.group(1)
    if s[0] == '#':
        # numeric charref
        if s[1] in 'xX':
            num = int(s[2:].rstrip(';'), 16)
        else:
            num = int(s[1:].rstrip(';'))
        if num in _invalid_charrefs:
            # HTML5 maps these (mostly 0x80-0x9f) to replacement characters.
            return _invalid_charrefs[num]
        if 0xD800 <= num <= 0xDFFF or num > 0x10FFFF:
            # Surrogates and out-of-range code points become U+FFFD.
            return '\uFFFD'
        if num in _invalid_codepoints:
            return ''
        return unichr(num)
    else:
        # named charref
        if s in _html5:
            return _html5[s]
        # find the longest matching name (as defined by the standard)
        for x in range(len(s)-1, 1, -1):
            if s[:x] in _html5:
                return _html5[s[:x]] + s[x:]
        else:
            # No entity name matched; keep the text verbatim.
            return '&' + s
# Matches decimal, hexadecimal, and named character references; the trailing
# ';' is optional per the HTML5 tokenizer's error-recovery rules.
_charref = _re.compile(r'&(#[0-9]+;?'
                       r'|#[xX][0-9a-fA-F]+;?'
                       r'|[^\t\n\f <&#;]{1,32};?)')
def unescape(s):
    """
    Convert all named and numeric character references (e.g. &gt;, &#62;,
    &#x3e;) in the string s to the corresponding unicode characters.

    This function uses the rules defined by the HTML 5 standard
    for both valid and invalid character references, and the list of
    HTML 5 named character references defined in html.entities.html5.
    """
    # Fast path: strings without an ampersand contain no references at all.
    if '&' in s:
        s = _charref.sub(_replace_charref, s)
    return s

View File

@ -0,0 +1,481 @@
"""A parser for HTML and XHTML."""
########
# This is copied from Python3 and the slightly modified to support needed
# features. The original file can be found at:
# https://github.com/python/cpython/blob/44b548dda872c0d4f30afd6b44fd74b053a55ad8/Lib/html/parser.py
#
# The largest difference is the reinstatement of the unescape method in
# HTMLParser, which is needed for features in htmlmin. Changes are also
# made to ensure Python 2.7 compatibility.
########
# This file is based on sgmllib.py, but the API is slightly different.
# XXX There should be a way to distinguish between PCDATA (parsed
# character data -- the normal case), RCDATA (replaceable character
# data -- only char and entity references and end tags are special)
# and CDATA (character data -- only end tags are special).
import re
import warnings
try:
import _markupbase as markupbase
except ImportError:
import markupbase
from . import unescape
__all__ = ['HTMLParser']
# Regular expressions used for parsing
interesting_normal = re.compile('[&<]')
incomplete = re.compile('&[a-zA-Z#]')
entityref = re.compile('&([a-zA-Z][-.a-zA-Z0-9]*)[^a-zA-Z0-9]')
charref = re.compile('&#(?:[0-9]+|[xX][0-9a-fA-F]+)[^0-9a-fA-F]')
starttagopen = re.compile('<[a-zA-Z]')
piclose = re.compile('>')
commentclose = re.compile(r'--\s*>')
# Note:
# 1) if you change tagfind/attrfind remember to update locatestarttagend too;
# 2) if you change tagfind/attrfind and/or locatestarttagend the parser will
# explode, so don't do it.
# see http://www.w3.org/TR/html5/tokenization.html#tag-open-state
# and http://www.w3.org/TR/html5/tokenization.html#tag-name-state
tagfind_tolerant = re.compile(r'([a-zA-Z][^\t\n\r\f />\x00]*)(?:\s|/(?!>))*')
attrfind_tolerant = re.compile(
r'((?<=[\'"\s/])[^\s/>][^\s/=>]*)(\s*=+\s*'
r'(\'[^\']*\'|"[^"]*"|(?![\'"])[^>\s]*))?(?:\s|/(?!>))*')
locatestarttagend_tolerant = re.compile(r"""
<[a-zA-Z][^\t\n\r\f />\x00]* # tag name
(?:[\s/]* # optional whitespace before attribute name
(?:(?<=['"\s/])[^\s/>][^\s/=>]* # attribute name
(?:\s*=+\s* # value indicator
(?:'[^']*' # LITA-enclosed value
|"[^"]*" # LIT-enclosed value
|(?!['"])[^>\s]* # bare value
)
(?:\s*,)* # possibly followed by a comma
)?(?:\s|/(?!>))*
)*
)?
\s* # trailing whitespace
""", re.VERBOSE)
endendtag = re.compile('>')
# the HTML 5 spec, section 8.1.2.2, doesn't allow spaces between
# </ and the tag name, so maybe this should be fixed
endtagfind = re.compile(r'</\s*([a-zA-Z][-.a-zA-Z0-9:_]*)\s*>')
class HTMLParser(markupbase.ParserBase):
    """Find tags and other markup and call handler functions.

    Usage:
        p = HTMLParser()
        p.feed(data)
        ...
        p.close()

    Start tags are handled by calling self.handle_starttag() or
    self.handle_startendtag(); end tags by self.handle_endtag().  The
    data between tags is passed from the parser to the derived class
    by calling self.handle_data() with the data as argument (the data
    may be split up in arbitrary chunks).  If convert_charrefs is
    True the character references are converted automatically to the
    corresponding Unicode character (and self.handle_data() is no
    longer split in chunks), otherwise they are passed by calling
    self.handle_entityref() or self.handle_charref() with the string
    containing respectively the named or numeric reference as the
    argument.
    """

    # Elements whose content is raw text terminated only by a matching end tag.
    CDATA_CONTENT_ELEMENTS = ("script", "style")

    def __init__(self, convert_charrefs=True):
        """Initialize and reset this instance.

        If convert_charrefs is True (the default), all character references
        are automatically converted to the corresponding Unicode characters.
        """
        self.convert_charrefs = convert_charrefs
        self.reset()
    def reset(self):
        """Reset this instance.  Loses all unprocessed data."""
        self.rawdata = ''          # unconsumed input buffer
        self.lasttag = '???'       # most recent start tag seen
        self.interesting = interesting_normal
        self.cdata_elem = None     # name of the open raw-text element, if any
        markupbase.ParserBase.reset(self)
    def feed(self, data):
        r"""Feed data to the parser.

        Call this as often as you want, with as little or as much text
        as you want (may include '\n').  Unconsumed input is buffered
        across calls.
        """
        self.rawdata = self.rawdata + data
        self.goahead(0)
    def close(self):
        """Handle any buffered data as if followed by end-of-file."""
        self.goahead(1)
    # Raw text of the most recently parsed start tag (set by parse_starttag).
    __starttag_text = None

    def get_starttag_text(self):
        """Return full source of start tag: '<...>'."""
        return self.__starttag_text
    def set_cdata_mode(self, elem):
        # Switch to raw-text mode: only the matching end tag is "interesting".
        self.cdata_elem = elem.lower()
        self.interesting = re.compile(r'</\s*%s\s*>' % self.cdata_elem, re.I)
    def clear_cdata_mode(self):
        # Leave raw-text mode and resume normal tokenization.
        self.interesting = interesting_normal
        self.cdata_elem = None
    # Internal -- handle data as far as reasonable.  May leave state
    # and data to be processed by a subsequent call.  If 'end' is
    # true, force handling all data as if followed by EOF marker.
    # This is the tokenizer's main loop: it walks rawdata dispatching to
    # parse_* helpers for markup and handle_data/charref/entityref for text.
    def goahead(self, end):
        rawdata = self.rawdata
        i = 0
        n = len(rawdata)
        while i < n:
            if self.convert_charrefs and not self.cdata_elem:
                j = rawdata.find('<', i)
                if j < 0:
                    # if we can't find the next <, either we are at the end
                    # or there's more text incoming.  If the latter is True,
                    # we can't pass the text to handle_data in case we have
                    # a charref cut in half at end.  Try to determine if
                    # this is the case before proceeding by looking for an
                    # & near the end and see if it's followed by a space or ;.
                    amppos = rawdata.rfind('&', max(i, n-34))
                    if (amppos >= 0 and
                        not re.compile(r'[\s;]').search(rawdata, amppos)):
                        break  # wait till we get all the text
                    j = n
            else:
                match = self.interesting.search(rawdata, i)  # < or &
                if match:
                    j = match.start()
                else:
                    if self.cdata_elem:
                        break
                    j = n
            if i < j:
                # Emit the plain-text run before the next markup character.
                if self.convert_charrefs and not self.cdata_elem:
                    self.handle_data(self.unescape(rawdata[i:j]))
                else:
                    self.handle_data(rawdata[i:j])
            i = self.updatepos(i, j)
            if i == n: break
            startswith = rawdata.startswith
            if startswith('<', i):
                if starttagopen.match(rawdata, i):  # < + letter
                    k = self.parse_starttag(i)
                elif startswith("</", i):
                    k = self.parse_endtag(i)
                elif startswith("<!--", i):
                    k = self.parse_comment(i)
                elif startswith("<?", i):
                    k = self.parse_pi(i)
                elif startswith("<!", i):
                    k = self.parse_html_declaration(i)
                elif (i + 1) < n:
                    self.handle_data("<")
                    k = i + 1
                else:
                    break
                if k < 0:
                    # Construct was incomplete; at EOF, salvage what we can
                    # as plain data up to the next '>' or '<'.
                    if not end:
                        break
                    k = rawdata.find('>', i + 1)
                    if k < 0:
                        k = rawdata.find('<', i + 1)
                        if k < 0:
                            k = i + 1
                    else:
                        k += 1
                    if self.convert_charrefs and not self.cdata_elem:
                        self.handle_data(self.unescape(rawdata[i:k]))
                    else:
                        self.handle_data(rawdata[i:k])
                i = self.updatepos(i, k)
            elif startswith("&#", i):
                match = charref.match(rawdata, i)
                if match:
                    name = match.group()[2:-1]
                    self.handle_charref(name)
                    k = match.end()
                    if not startswith(';', k-1):
                        k = k - 1
                    i = self.updatepos(i, k)
                    continue
                else:
                    if ";" in rawdata[i:]:  # bail by consuming &#
                        self.handle_data(rawdata[i:i+2])
                        i = self.updatepos(i, i+2)
                    break
            elif startswith('&', i):
                match = entityref.match(rawdata, i)
                if match:
                    name = match.group(1)
                    self.handle_entityref(name)
                    k = match.end()
                    if not startswith(';', k-1):
                        k = k - 1
                    i = self.updatepos(i, k)
                    continue
                match = incomplete.match(rawdata, i)
                if match:
                    # match.group() will contain at least 2 chars
                    if end and match.group() == rawdata[i:]:
                        k = match.end()
                        if k <= i:
                            k = n
                        i = self.updatepos(i, i + 1)
                    # incomplete
                    break
                elif (i + 1) < n:
                    # not the end of the buffer, and can't be confused
                    # with some other construct
                    self.handle_data("&")
                    i = self.updatepos(i, i + 1)
                else:
                    break
            else:
                assert 0, "interesting.search() lied"
        # end while
        if end and i < n and not self.cdata_elem:
            # Flush whatever text remains at end-of-input.
            if self.convert_charrefs and not self.cdata_elem:
                self.handle_data(self.unescape(rawdata[i:n]))
            else:
                self.handle_data(rawdata[i:n])
            i = self.updatepos(i, n)
        self.rawdata = rawdata[i:]
    # Internal -- parse html declarations, return length or -1 if not terminated
    # See w3.org/TR/html5/tokenization.html#markup-declaration-open-state
    # See also parse_declaration in _markupbase
    def parse_html_declaration(self, i):
        rawdata = self.rawdata
        assert rawdata[i:i+2] == '<!', ('unexpected call to '
                                        'parse_html_declaration()')
        if rawdata[i:i+4] == '<!--':
            # this case is actually already handled in goahead()
            return self.parse_comment(i)
        elif rawdata[i:i+3] == '<![':
            # marked section, e.g. <![CDATA[...]]>
            return self.parse_marked_section(i)
        elif rawdata[i:i+9].lower() == '<!doctype':
            # find the closing >
            gtpos = rawdata.find('>', i+9)
            if gtpos == -1:
                return -1
            self.handle_decl(rawdata[i+2:gtpos])
            return gtpos+1
        else:
            return self.parse_bogus_comment(i)
    # Internal -- parse bogus comment, return length or -1 if not terminated
    # see http://www.w3.org/TR/html5/tokenization.html#bogus-comment-state
    def parse_bogus_comment(self, i, report=1):
        rawdata = self.rawdata
        assert rawdata[i:i+2] in ('<!', '</'), ('unexpected call to '
                                                'parse_comment()')
        pos = rawdata.find('>', i+2)
        if pos == -1:
            return -1
        if report:
            # Malformed declarations/end tags surface as comments per HTML5.
            self.handle_comment(rawdata[i+2:pos])
        return pos + 1
    # Internal -- parse processing instr, return end or -1 if not terminated
    def parse_pi(self, i):
        rawdata = self.rawdata
        assert rawdata[i:i+2] == '<?', 'unexpected call to parse_pi()'
        match = piclose.search(rawdata, i+2)  # >
        if not match:
            return -1
        j = match.start()
        # Everything between '<?' and the closing '>' is the PI payload.
        self.handle_pi(rawdata[i+2: j])
        j = match.end()
        return j
    # Internal -- handle starttag, return end or -1 if not terminated
    def parse_starttag(self, i):
        self.__starttag_text = None
        endpos = self.check_for_whole_start_tag(i)
        if endpos < 0:
            return endpos
        rawdata = self.rawdata
        self.__starttag_text = rawdata[i:endpos]

        # Now parse the data between i+1 and j into a tag and attrs
        attrs = []
        match = tagfind_tolerant.match(rawdata, i+1)
        assert match, 'unexpected call to parse_starttag()'
        k = match.end()
        self.lasttag = tag = match.group(1).lower()
        while k < endpos:
            m = attrfind_tolerant.match(rawdata, k)
            if not m:
                break
            attrname, rest, attrvalue = m.group(1, 2, 3)
            if not rest:
                # Attribute with no '=' at all: value is None.
                attrvalue = None
            elif attrvalue[:1] == '\'' == attrvalue[-1:] or \
                 attrvalue[:1] == '"' == attrvalue[-1:]:
                # Strip matching surrounding quotes from the value.
                attrvalue = attrvalue[1:-1]
            if attrvalue:
                attrvalue = self.unescape(attrvalue)
            attrs.append((attrname.lower(), attrvalue))
            k = m.end()

        end = rawdata[k:endpos].strip()
        if end not in (">", "/>"):
            # Malformed tail: compute the position (kept for parity with the
            # original warning code) and emit the raw text as data instead.
            lineno, offset = self.getpos()
            if "\n" in self.__starttag_text:
                lineno = lineno + self.__starttag_text.count("\n")
                offset = len(self.__starttag_text) \
                         - self.__starttag_text.rfind("\n")
            else:
                offset = offset + len(self.__starttag_text)
            self.handle_data(rawdata[i:endpos])
            return endpos
        if end.endswith('/>'):
            # XHTML-style empty tag: <span attr="value" />
            self.handle_startendtag(tag, attrs)
        else:
            self.handle_starttag(tag, attrs)
            if tag in self.CDATA_CONTENT_ELEMENTS:
                self.set_cdata_mode(tag)
        return endpos
    # Internal -- check to see if we have a complete starttag; return end
    # or -1 if incomplete.
    def check_for_whole_start_tag(self, i):
        rawdata = self.rawdata
        m = locatestarttagend_tolerant.match(rawdata, i)
        if m:
            j = m.end()
            next = rawdata[j:j+1]
            if next == ">":
                return j + 1
            if next == "/":
                if rawdata.startswith("/>", j):
                    return j + 2
                if rawdata.startswith("/", j):
                    # buffer boundary
                    return -1
                # else bogus input
                if j > i:
                    return j
                else:
                    return i + 1
            if next == "":
                # end of input
                return -1
            if next in ("abcdefghijklmnopqrstuvwxyz=/"
                        "ABCDEFGHIJKLMNOPQRSTUVWXYZ"):
                # end of input in or before attribute value, or we have the
                # '/' from a '/>' ending
                return -1
            if j > i:
                return j
            else:
                return i + 1
        raise AssertionError("we should not get here!")
    # Internal -- parse endtag, return end or -1 if incomplete
    def parse_endtag(self, i):
        """Parse an end tag starting at index *i*.

        Returns the index just past the tag, or -1 when the buffer ends
        before the tag does.  Inside a CDATA element (<script>/<style>),
        only the matching end tag terminates raw-text mode; anything else
        is reported as character data.
        """
        rawdata = self.rawdata
        assert rawdata[i:i+2] == "</", "unexpected call to parse_endtag"
        match = endendtag.search(rawdata, i+1) # >
        if not match:
            return -1
        gtpos = match.end()
        match = endtagfind.match(rawdata, i) # </ + tag + >
        if not match:
            if self.cdata_elem is not None:
                # In raw-text mode a malformed end tag is just data.
                self.handle_data(rawdata[i:gtpos])
                return gtpos
            # find the name: w3.org/TR/html5/tokenization.html#tag-name-state
            namematch = tagfind_tolerant.match(rawdata, i+2)
            if not namematch:
                # w3.org/TR/html5/tokenization.html#end-tag-open-state
                if rawdata[i:i+3] == '</>':
                    return i+3
                else:
                    return self.parse_bogus_comment(i)
            tagname = namematch.group(1).lower()
            # consume and ignore other stuff between the name and the >
            # Note: this is not 100% correct, since we might have things like
            # </tag attr=">">, but looking for > after the name should cover
            # most of the cases and is much simpler
            gtpos = rawdata.find('>', namematch.end())
            self.handle_endtag(tagname)
            return gtpos+1
        elem = match.group(1).lower() # script or style
        if self.cdata_elem is not None:
            if elem != self.cdata_elem:
                # A different end tag does not leave raw-text mode.
                self.handle_data(rawdata[i:gtpos])
                return gtpos
        self.handle_endtag(elem.lower())
        self.clear_cdata_mode()
        return gtpos
    # Overridable -- finish processing of start+end tag: <tag.../>
    def handle_startendtag(self, tag, attrs):
        """Handle an XHTML-style empty element (e.g. <br/>).

        Default implementation treats it as a start tag immediately
        followed by the matching end tag.
        """
        self.handle_starttag(tag, attrs)
        self.handle_endtag(tag)
    # Overridable -- handle start tag
    def handle_starttag(self, tag, attrs):
        """Called for each start tag; *tag* is lowercased and *attrs* is a
        list of (name, value) pairs.  Default implementation does nothing."""
        pass
    # Overridable -- handle end tag
    def handle_endtag(self, tag):
        """Called for each end tag; *tag* is lowercased.  Default
        implementation does nothing."""
        pass
    # Overridable -- handle character reference
    def handle_charref(self, name):
        """Called for numeric character references (&#...;); *name* is the
        digits between '&#' and ';'.  Default implementation does nothing."""
        pass
    # Overridable -- handle entity reference
    def handle_entityref(self, name):
        """Called for named entity references (&name;); *name* excludes the
        '&' and ';'.  Default implementation does nothing."""
        pass
    # Overridable -- handle data
    def handle_data(self, data):
        """Called for runs of character data between tags.  Default
        implementation does nothing."""
        pass
    # Overridable -- handle comment
    def handle_comment(self, data):
        """Called for comments; *data* is the text between '<!--' and
        '-->'.  Default implementation does nothing."""
        pass
    # Overridable -- handle declaration
    def handle_decl(self, decl):
        """Called for declarations such as <!DOCTYPE html>; *decl* is the
        text between '<!' and '>'.  Default implementation does nothing."""
        pass
    # Overridable -- handle processing instruction
    def handle_pi(self, data):
        """Called for processing instructions; *data* is the text between
        '<?' and '>'.  Default implementation does nothing."""
        pass
    # Overridable -- handle unrecognized declaration
    def unknown_decl(self, data):
        """Called for declarations the parser does not recognize.  Default
        implementation does nothing."""
        pass
    # Internal -- helper to remove special character quoting
    def unescape(self, s):
        """Return *s* with HTML character references replaced; delegates
        to the module-level unescape() function."""
        return unescape(s)