Transition to monorepo on a new Dev branch

2025-06-05 22:09:23 +02:00 · 2023-02-22 10:29:46 +01:00
parent 8436f03ec7
commit 3a483dc0ef
3712 changed files with 751 additions and 11 deletions
--- a/App/Source/Libs/htmlmin/parser.py
+++ b/App/Source/Libs/htmlmin/parser.py
@ -0,0 +1,408 @@
+"""
+Copyright (c) 2013, Dave Mankoff
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    * Neither the name of Dave Mankoff nor the
+      names of its contributors may be used to endorse or promote products
+      derived from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL DAVE MANKOFF BE LIABLE FOR ANY
+DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+"""
+
+from __future__ import unicode_literals
+import logging
+import sys
+
+import re
+from .python3html.parser import HTMLParser
+
+from . import escape
+
+# https://www.w3.org/TR/html5/single-page.html#space-character
+HTML_SPACE_RE = re.compile('[\x20\x09\x0a\x0c\x0d]+')
+HTML_ALL_SPACE_RE = re.compile('^[\x20\x09\x0a\x0c\x0d]+$')
+HTML_LEADING_SPACE_RE = re.compile(
+  '^[\x20\x09\x0a\x0c\x0d]+')
+HTML_TRAILING_SPACE_RE = re.compile(
+  '[\x20\x09\x0a\x0c\x0d]+$')
+HTML_LEADING_TRAILING_SPACE_RE = re.compile(
+  '(^[\x20\x09\x0a\x0c\x0d]+)|([\x20\x09\x0a\x0c\x0d]+$)')
+
+PRE_TAGS = ('pre', 'textarea')  # styles and scripts are never minified
+# http://www.w3.org/TR/html51/syntax.html#elements-0
+NO_CLOSE_TAGS = ('area', 'base', 'br', 'col', 'command', 'embed', 'hr', 'img',
+                 'input', 'keygen', 'link', 'meta', 'param', 'source', 'track',
+                 'wbr')
+# http://www.w3.org/TR/html51/index.html#attributes-1
+BOOLEAN_ATTRIBUTES = {
+  'audio': ('autoplay', 'controls', 'hidden', 'loop', 'muted',),
+  'button': ('autofocus', 'disabled', 'formnovalidate', 'hidden',),
+  'command': ('checked', 'disabled', 'hidden'),
+  'dialog': ('hidden', 'open',),
+  'fieldset': ('disabled', 'hidden',),
+  'form': ('hidden', 'novalidate',),
+  'iframe': ('hidden', 'seamless',),
+  'img': ('hidden', 'ismap',),
+  'input': ('autofocus', 'checked', 'disabled', 'formnovalidate', 'hidden',
+            'multiple', 'readonly', 'required',),
+  'keygen': ('autofocus', 'disabled', 'hidden',),
+  'object': ('hidden', 'typesmustmatch',),
+  'ol': ('hidden', 'reversed',),
+  'optgroup': ('disabled', 'hidden',),
+  'option': ('disabled', 'hidden', 'selected',),
+  'script': ('async', 'defer', 'hidden',),
+  'select': ('autofocus', 'disabled', 'hidden', 'multiple', 'required',),
+  'style': ('hidden', 'scoped',),
+  'textarea': ('autofocus', 'disabled', 'hidden', 'readonly', 'required',),
+  'track': ('default', 'hidden', ),
+  'video': ('autoplay', 'controls', 'hidden', 'loop', 'muted',),
+  '*': ('hidden',),
+}
+
+# a list of tags and tags that they are closed by
+TAG_SETS = {
+  'li': ('li',),
+  'dd': ('dd', 'dt'),
+  'rp': ('rp', 'rt'),
+  'p': ('address', 'article', 'aside', 'blockquote', 'dir', 'div', 'dl',
+        'fieldset', 'footer', 'form', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6',
+        'header', 'hgroup', 'hr', 'menu', 'nav', 'ol', 'p', 'pre', 'section',
+        'table', 'ul'),
+  'optgroup': ('optgroup',),
+  'option': ('option', 'optgroup'),
+  'colgroup': '*',
+  'tbody': ('tbody', 'tfoot'),
+  'tfoot': ('tbody',),
+  'tr': ('tr',),
+  'td': ('td', 'th'),
+}
+TAG_SETS['dt'] = TAG_SETS['dd']
+TAG_SETS['rt'] = TAG_SETS['rp']
+TAG_SETS['thead'] = TAG_SETS['tbody']
+TAG_SETS['th'] = TAG_SETS['td']
+
+# Tag omission rules:
+# http://www.w3.org/TR/html51/syntax.html#optional-tags
+
+class HTMLMinError(Exception): pass
+class ParseError(HTMLMinError): pass
+class OpenTagNotFoundError(ParseError): pass
+
+class HTMLMinParser(HTMLParser):
+  def __init__(self,
+               remove_comments=False,
+               remove_empty_space=False,
+               remove_all_empty_space=False,
+               reduce_empty_attributes=True,
+               reduce_boolean_attributes=False,
+               remove_optional_attribute_quotes=True,
+               convert_charrefs=True,
+               keep_pre=False,
+               pre_tags=PRE_TAGS,
+               pre_attr='pre'):
+    if sys.version_info[0] >= 3 and sys.version_info[1] >= 4:
+      # convert_charrefs is True by default in Python 3.5.0 and newer. It was
+      # introduced in 3.4.
+      HTMLParser.__init__(self, convert_charrefs=False)
+    else:
+      HTMLParser.__init__(self)
+    self.keep_pre = keep_pre
+    self.pre_tags = pre_tags
+    self.remove_comments = remove_comments
+    self.remove_empty_space = remove_empty_space
+    self.remove_all_empty_space = remove_all_empty_space
+    self.reduce_empty_attributes = reduce_empty_attributes
+    self.reduce_boolean_attributes = reduce_boolean_attributes
+    self.remove_optional_attribute_quotes = remove_optional_attribute_quotes
+    self.convert_charrefs = convert_charrefs
+    self.pre_attr = pre_attr
+    self.reset()
+
+  def _tag_lang(self):
+    return self._tag_stack[0][2] if self._tag_stack else None
+
+  def build_tag(self, tag, attrs, close_tag):
+    has_pre = False
+
+    if self.reduce_boolean_attributes:
+      bool_attrs = BOOLEAN_ATTRIBUTES.get(tag, BOOLEAN_ATTRIBUTES['*'])
+    else:
+      bool_attrs = False
+
+    lang = self._tag_lang()
+    attrs = list(attrs)  # We're modifying it in place
+    last_quoted = last_no_slash = i = -1
+    for k, v in attrs:
+      pre_prefix = k.startswith("{}-".format(self.pre_attr))
+      if pre_prefix:
+        k = k[len(self.pre_attr)+1:]
+      if k == self.pre_attr:
+        has_pre = True
+        if not self.keep_pre and not pre_prefix:
+          continue
+      if v and self.convert_charrefs and not pre_prefix:
+        v = HTMLParser.unescape(self, v)
+      if k == 'lang':
+        lang = v
+        if v == self._tag_lang():
+          continue
+
+      i += 1
+      if not pre_prefix:
+        k = escape.escape_attr_name(k)
+      if (v is None or (not v and self.reduce_empty_attributes) or
+          (bool_attrs and k in bool_attrs)):
+        # For our use case, we treat boolean attributes as quoted because they
+        # don't require space between them and "/>" in closing tags.
+        attrs[i] = k
+        last_quoted = i
+      else:
+        if pre_prefix:
+          has_double_quotes = '"' in v
+          has_single_quotes = "'" in v
+          if not has_double_quotes:
+            if not has_single_quotes and self.remove_optional_attribute_quotes:
+              q = escape.NO_QUOTES
+            else:
+              q = escape.DOUBLE_QUOTE
+          elif not has_single_quotes:
+            q = escape.SINGLE_QUOTES
+          else:
+            logging.error('Unsafe content found in pre-attribute. Escaping.')
+            (v, q) = escape.escape_attr_value(
+              v, double_quote=not self.remove_optional_attribute_quotes)
+        else:
+          (v, q) = escape.escape_attr_value(
+            v, double_quote=not self.remove_optional_attribute_quotes)
+        if q == escape.NO_QUOTES:
+          attrs[i] = '%s=%s' % (k, v)
+          if v[-1] != '/':
+            last_no_slash = i
+        else:
+          q = '"' if q == escape.DOUBLE_QUOTE else "'"
+          attrs[i] = '%s=%s%s%s' % (k, q, v, q)
+          last_quoted = i
+
+    i += 1
+    if i != len(attrs):
+      del attrs[i:]
+
+    # 1. If there are no attributes, no additional space is necessary.
+    # 2. If last attribute is quoted, no additional space is necessary.
+    # 3. Two things are happening here:
+    #    a) according to the standard, <foo bar=baz/> should be treated as <foo
+    #       bar="baz/"> so space is necessary if this is self-closing tag,
+    #       however
+    #    b) reportedly (https://github.com/mankyd/htmlmin/pull/12), older
+    #       versions of WebKit interpret <foo bar=baz/> as self-closing tag so
+    #       we need the space if the last argument ends with a slash.
+    space_maybe = ''
+    if attrs:
+      needs_space = lambda last_attr: (last_attr[-1] not in '"\'' and
+                                       (close_tag or last_attr[-1] == '/'))
+      if needs_space(attrs[-1][-1]):
+        # If moving attributes around can help, do it.  Otherwise bite the
+        # bullet and put the space in.
+        i = last_no_slash if last_quoted == -1 else last_quoted
+        if i == -1 or needs_space(attrs[i]):
+          space_maybe = ' '
+        else:
+          attrs.append(attrs[i])
+          del attrs[i]
+
+    return has_pre, '<%s%s%s%s%s>' % (escape.escape_tag(tag),
+                                      ' ' if attrs else '',
+                                      ' '.join(attrs),
+                                      space_maybe,
+                                      '/' if close_tag else ''), lang
+
+  def handle_decl(self, decl):
+    if (len(self._data_buffer) == 1 and 
+        HTML_SPACE_RE.match(self._data_buffer[0][0])):
+      self._data_buffer = []
+    self._data_buffer.append('<!' + decl + '>')
+    self._after_doctype = True
+
+  def _close_tags_up_to(self, tag):
+    num_pres = 0
+    i = 0
+    for i, t in enumerate(self._tag_stack):
+      if t[1]:
+        num_pres += 1
+      if t[0] == tag:
+        break
+
+      # Only the html tag can close out everything. Put on the brakes if
+      # we encounter a closing tag that we didn't recognize.
+      if tag != 'html' and t[0] in ('body', 'html', 'head'):
+        raise OpenTagNotFoundError()
+
+    self._tag_stack = self._tag_stack[i+1:]
+
+    return num_pres
+
+  def handle_starttag(self, tag, attrs):
+    self._after_doctype = False
+    if tag == 'head':
+      self._in_head = True
+    elif self._in_head and tag == 'title':
+      self._in_title = True
+      self._title_newly_opened = True
+
+    for t in self._tag_stack:
+      closed_by_tags = TAG_SETS.get(t[0])
+      if closed_by_tags and (closed_by_tags == '*' or tag in closed_by_tags):
+        self._in_pre_tag -= self._close_tags_up_to(t[0])
+        break
+
+    has_pre, data, lang = self.build_tag(tag, attrs, False)
+    start_pre = False
+    if (has_pre or self._in_pre_tag > 0 or
+        tag == 'script' or tag == 'style' or tag in self.pre_tags):
+      self._in_pre_tag += 1
+      start_pre = True
+
+    self._tag_stack.insert(0, (tag, start_pre, lang))
+    self._data_buffer.append(data)
+
+  def handle_endtag(self, tag):
+    # According to the spec, <p> tags don't get closed when a parent a
+    # tag closes them. Here's some logic that addresses this.
+    if tag == 'a':
+      contains_p = False
+      for i, t in enumerate(self._tag_stack):
+        if t[0] == 'p':
+          contains_p = True
+        elif t[0] == 'a':
+          break
+      if contains_p: # the p tag, and all its children should be left open
+        a_tag = self._tag_stack.pop(i)
+        if a_tag[1]:
+          self._in_pre_tag -= 1
+    else:
+      if tag == 'head':
+        # TODO: Did we know that we were in an head tag?! If not, we need to
+        # reminify everything to remove extra spaces.
+        self._in_head = False
+      elif tag == 'title':
+        self._in_title = False
+        self._title_newly_opened = False
+      try:
+        self._in_pre_tag -= self._close_tags_up_to(tag)
+      except OpenTagNotFoundError:
+        # Some tags don't require a start tag. Most do. Either way, we leave
+        # closing tags along since they affect output. For instance, a '</p>'
+        # results in a '<p></p>' in Chrome.
+        pass
+    if tag not in NO_CLOSE_TAGS:
+      self._data_buffer.extend(['</', escape.escape_tag(tag), '>'])
+
+  def handle_startendtag(self, tag, attrs):
+    self._after_doctype = False
+    data = self.build_tag(tag, attrs, tag not in NO_CLOSE_TAGS)[1]
+    self._data_buffer.append(data)
+
+  def handle_comment(self, data):
+    if not self.remove_comments or re.match(r'^(?:!|\[if\s)', data):
+      self._data_buffer.append('<!--{}-->'.format(
+          data[1:] if len(data) and data[0] == '!' else data))
+
+  def handle_data(self, data):
+    if self._in_pre_tag > 0:
+      self._data_buffer.append(data)
+    else:
+      # remove_all_empty_space matches everything. remove_empty_space only
+      # matches if there's a newline involved.
+      if self.remove_all_empty_space or self._in_head or self._after_doctype:
+        if HTML_ALL_SPACE_RE.match(data):
+          return
+      elif (self.remove_empty_space and HTML_ALL_SPACE_RE.match(data) and
+            ('\n' in data or '\r' in data)):
+        return
+
+      # if we're in the title, remove leading and trailing whitespace.
+      # note that the title may be parsed in chunks if entityref's or charrefs
+      # are encountered.
+      if self._in_title:
+        if self.__title_trailing_whitespace:
+          self._data_buffer.append(' ')
+        self.__title_trailing_whitespace = (
+          HTML_ALL_SPACE_RE.match(data[-1]) is not None)
+        if self._title_newly_opened:
+          self._title_newly_opened = False
+          data = HTML_LEADING_TRAILING_SPACE_RE.sub('', data)
+        else:
+          data = HTML_TRAILING_SPACE_RE.sub(
+            '', HTML_LEADING_TRAILING_SPACE_RE.sub(' ', data))
+
+      data = HTML_SPACE_RE.sub(' ', data)
+      if not data:
+        return
+
+      if self._in_pre_tag == 0 and self._data_buffer:
+        # If we're not in a pre block, its possible that we append two spaces
+        # together, which we want to avoid. For instance, if we remove a comment
+        # from between two blocks of text: a <!-- B --> c => a  c.
+        if data[0] == ' ' and self._data_buffer[-1][-1] == ' ':
+          data = data[1:]
+          if not data:
+            return
+      self._data_buffer.append(data)
+
+  def handle_entityref(self, data):
+    if self._in_title:
+      if not self._title_newly_opened and self.__title_trailing_whitespace:
+        self._data_buffer.append(' ')
+        self.__title_trailing_whitespace = False
+      self._title_newly_opened = False
+    self._data_buffer.append('&{};'.format(data))
+
+  def handle_charref(self, data):
+    if self._in_title:
+      if not self._title_newly_opened and self.__title_trailing_whitespace:
+        self._data_buffer.append(' ')
+        self.__title_trailing_whitespace = False
+      self._title_newly_opened = False
+    self._data_buffer.append('&#{};'.format(data))
+
+  def handle_pi(self, data):
+    self._data_buffer.append('<?' + data + '>')
+
+  def unknown_decl(self, data):
+    self._data_buffer.append('<![' + data + ']>')
+
+  def reset(self):
+    self._data_buffer = []
+    self._in_pre_tag = 0
+    self._in_head = False
+    self._in_title = False
+    self._after_doctype = False
+    self._tag_stack = []
+    self._title_newly_opened = False
+    self.__title_trailing_whitespace = False
+    HTMLParser.reset(self)
+
+  def unescape(self, val):
+    """Override this method so that we can handle char ref conversion ourself.
+    """
+    return val
+
+  @property
+  def result(self):
+    return ''.join(self._data_buffer)