From 74f44e6cb4af8739f68034734ee00361933bda07 Mon Sep 17 00:00:00 2001 From: Michal Nazarewicz Date: Thu, 4 Mar 2021 23:35:39 +0100 Subject: [PATCH 1/5] Minor unescape optimisation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit First of all, avoid unnecessary dictionary lookup by using get() method once rather than ‘in’ operator followed by lookup. Second of all, optimise _charref by observing that there are no named character references which start with a digit or contain non-alphanumeric characters and the longest named reference consists of 31 letters (not 32). --- htmlmin/python3html/__init__.py | 28 +++++++++++++++------------- 1 file changed, 15 insertions(+), 13 deletions(-) diff --git a/htmlmin/python3html/__init__.py b/htmlmin/python3html/__init__.py index b836eef..1c639ca 100644 --- a/htmlmin/python3html/__init__.py +++ b/htmlmin/python3html/__init__.py @@ -103,28 +103,30 @@ def _replace_charref(s): num = int(s[2:].rstrip(';'), 16) else: num = int(s[1:].rstrip(';')) - if num in _invalid_charrefs: - return _invalid_charrefs[num] + v = _invalid_charrefs.get(num) + if v is not None: + return v if 0xD800 <= num <= 0xDFFF or num > 0x10FFFF: return '\uFFFD' if num in _invalid_codepoints: return '' return unichr(num) - else: - # named charref - if s in _html5: - return _html5[s] - # find the longest matching name (as defined by the standard) - for x in range(len(s)-1, 1, -1): - if s[:x] in _html5: - return _html5[s[:x]] + s[x:] - else: - return '&' + s + + # named charref + v = _html5.get(s) + if v is not None: + return v + # find the longest matching name (as defined by the standard) + for x in range(len(s)-1, 1, -1): + v = _html5.get(s[:x]) + if v is not None: + return v + s[x:] + return '&' + s _charref = _re.compile(r'&(#[0-9]+;?' r'|#[xX][0-9a-fA-F]+;?' 
- r'|[^\t\n\f <&#;]{1,32};?)') + r'|[a-zA-Z][0-9a-zA-Z]{,30};?)') def unescape(s): """ From 3633e0b44525f94f510d40fff4fcbb504c45228a Mon Sep 17 00:00:00 2001 From: Michal Nazarewicz Date: Tue, 2 Mar 2021 17:34:48 +0100 Subject: [PATCH 2/5] Add several new tests Add a handful of new cases into FEATURES_TEXTS; create a new class for the convert_charrefs feature and add more cases testing it, including new tests checking its behaviour in text; add a new test for quality of minification when all options are turned off; and (when verifying quality) check reduction in bytes in addition to reduction in character count. Some of the new tests demonstrate bugs. Add appropriate comments. --- htmlmin/tests/tests.py | 140 +++++++++++++++++++++++++++++++++-------- 1 file changed, 114 insertions(+), 26 deletions(-) diff --git a/htmlmin/tests/tests.py b/htmlmin/tests/tests.py index c54d656..f70e653 100644 --- a/htmlmin/tests/tests.py +++ b/htmlmin/tests/tests.py @@ -199,6 +199,16 @@ ' ☃X Y & Z ', '☃X Y & Z', ), + 'pre_respected_on_title': ( + ' Foo bar ', + ' Foo bar ', + ), + # TODO: This is invalid HTML but regardless we should handle it sensibly + # rather than removing trailing whitespace everywhere. + 'missing_title_end': ( + ' Test </head><p>Foo <i> bar </i> and baz. </p>', + '<head><title>Test</head><p> Foo<i> bar</i> and baz.</p>', + ), 'dont_minify_scripts_or_styles': ( '<body> <script> X </script> <style> X</style> </body>', '<body> <script> X </script> <style> X</style> </body>', @@ -221,18 +231,74 @@ ('<html><body lang=en><p>This is an example.' 
'<p lang=pl>I po polsku <span lang=el>and more English</span>.'), ), - 'convert_charrefs': ( - '<input value=""'''<.π> "">', - u'<input value=""\'\'\'<.\u03c0> "">', - ), - 'convert_charrefs_false': ( - '<input value=""'''<.π> "">', - '<input value=""'''<.π> "">', - ), 'dont_convert_pre_attr': ( '<input pre-value=""'''<.π> "">', '<input value="'''<.π> ">', ), + 'remove_entity_space': ( + '<p>Foo bar baz</p>', + '<p>Foo bar baz</p>', + ), + # TODO: Fix, this should generate &amp; + 'escape_after_close_tag_removal': ( + '<p><br>Foo &</br>amp; bar, <br>baz &am</br>p; qux</p>', + '<p><br>Foo & bar, <br>baz & qux</p>', + ), + # Note: the ‘]’ being eaten is Python bug in _markupbase.py, see + # https://github.com/python/cpython/pull/24720 + 'leave_cdata_alone': ( + '<p>Leave <![CDATA[ & & & < < ]]> alone.', + '<p>Leave <![CDATA[ & & & < < ]> alone.', + ), +} + +# key: (input, out_attribute_on, out_attribute_off, out_text_on, out_text_off) +CONVERT_CHARREFS_TEXTS = { + 'entities': ( + '"'''<.π> "', + u'"\'\'\'<.\u03C0> "', + '"'''<.π> "', + '"'''<.π> "', + '"'''<.π> "', + ), + 'not_escaped': ( + 'Tiffany & Co. H&M 1&2 1&2;', + 'Tiffany & Co. H&M 1&2 1&2;', + 'Tiffany & Co. H&M 1&2 1&2;', + 'Tiffany & Co. H&M 1&2 1&2;', + # TODO: Fix. There is no named character reference ‘M’ and as such ‘&M’ is + # perfectly valid way to write ‘&M’ according to HTML5. Changing it to + # ‘&M;’ changes the text. This is probably Python bug. + 'Tiffany & Co. H&M; 1&2 1&2;', + ), + 'at_end': ( + ' 1&2', + ' 1&2', + ' 1&2', + ' 1&2', + ' 1&2', + ), + 'no_semicolon': ( + '/?sect=2¶=5&par=8', + # TODO: Fix. Inside of an attribute value, if named character reference + # does not end with a semicolon and is proceeded by an equal sign, it must + # be left intact. + u'/?sect=2\u00B6=5&par=8', + '/?sect=2¶=5&par=8', + '/?sect=2¶=5&par=8', + # TODO: Fix. 
There is no named character reference ‘par’ (even though + # there’s ‘par;’) and as such ‘&par’ is perfectly valid way to write ‘&par’ + # according to HTML5. Changing it to ‘∥’ changes the text. This is + # probably Python bug. + '/?sect=2¶=5∥=8', + ), + 'followed_by_eq': ( + '/?sect=2&para=5', + '/?sect=2¶=5', + '/?sect=2&para=5', + '/?sect=2&para=5', + '/?sect=2&para=5', + ) } SELF_CLOSE_TEXTS = { @@ -334,16 +400,14 @@ ), } +def _make_test(inp, out, **kw): + return lambda self: self.assertEqual(self.minify(inp, **kw), out) + class HTMLMinTestMeta(type): def __new__(cls, name, bases, dct): - def make_test(text): - def inner_test(self): - self.assertEqual(self.minify(text[0]), text[1]) - return inner_test - for k, v in dct.get('__reference_texts__',{}).items(): if 'test_'+k not in dct: - dct['test_'+k] = make_test(v) + dct['test_'+k] = _make_test(*v) return type.__new__(cls, str(name), bases, dct) class HTMLMinTestCase( @@ -354,19 +418,28 @@ def setUp(self): class TestMinifyFunction(HTMLMinTestCase): __reference_texts__ = MINIFY_FUNCTION_TEXTS - def test_basic_minification_quality(self): + def _test_minification_quality(self, want_chars, want_bytes, *args, **kw): import codecs with codecs.open('htmlmin/tests/large_test.html', encoding='utf-8') as inpf: inp = inpf.read() - out = self.minify(inp) - self.assertEqual(len(inp) - len(out), 9408) + out = self.minify(inp, *args, **kw) + got_chars = len(inp) - len(out) + got_bytes = len(inp.encode('utf-8')) - len(out.encode('utf-8')) + self.assertEqual((got_chars, got_bytes), (want_chars, want_bytes)) + + def test_poor_minification_quality(self): + self._test_minification_quality(754, 754, + reduce_empty_attributes=False, + remove_optional_attribute_quotes=False, + convert_charrefs=False) + + def test_basic_minification_quality(self): + self._test_minification_quality(9408, 9398) def test_high_minification_quality(self): - import codecs - with codecs.open('htmlmin/tests/large_test.html', encoding='utf-8') as inpf: - inp = 
inpf.read() - out = self.minify(inp, remove_all_empty_space=True, remove_comments=True) - self.assertEqual(len(inp) - len(out), 12518) + self._test_minification_quality(12518, 12508, + remove_all_empty_space=True, + remove_comments=True) class TestMinifierObject(HTMLMinTestCase): __reference_texts__ = MINIFY_FUNCTION_TEXTS @@ -393,7 +466,7 @@ def test_buffered_input(self): self.minifier.input(text[0][len(text[0]) // 2:]) self.assertEqual(self.minifier.finalize(), text[1]) - + class TestMinifyFeatures(HTMLMinTestCase): __reference_texts__ = FEATURES_TEXTS @@ -479,10 +552,25 @@ def test_dont_minify_scripts_or_styles(self): text = self.__reference_texts__['dont_minify_scripts_or_styles'] self.assertEqual(htmlmin.minify(text[0], pre_tags=[]), text[1]) - def test_convert_charrefs_false(self): - text = self.__reference_texts__['convert_charrefs_false'] - self.assertEqual(htmlmin.minify(text[0], convert_charrefs=False), text[1]) +def _make_test_convert_charrefs(tests): + def setUp(self): self.minify = htmlmin.minify + d = {'setUp': setUp} + + def add_test(key, fmt, inp, out, convert_charrefs): + key = 'test_{}_{}'.format(key, ('off', 'on')[int(convert_charrefs)]) + d[key] = _make_test(fmt.format(inp), fmt.format(out), + convert_charrefs=convert_charrefs) + + for key, test in tests.items(): + inp = test[0] + add_test(key + '_in_attr_value', '<input value="{}">', inp, test[1], True) + add_test(key + '_in_attr_value', '<input value="{}">', inp, test[2], False) + add_test(key + '_in_text', '<p>{}', inp, test[3], True) + add_test(key + '_in_text', '<p>{}', inp, test[4], False) + + return type('TestConvertCharrefs', (unittest.TestCase,), d) +TestConvertCharrefs = _make_test_convert_charrefs(CONVERT_CHARREFS_TEXTS) class TestSelfClosingTags(HTMLMinTestCase): __reference_texts__ = SELF_CLOSE_TEXTS From e34ab8ac5baccd0711699d9bbb74c3626166b9c4 Mon Sep 17 00:00:00 2001 From: Michal Nazarewicz <mina86@mina86.com> Date: Thu, 4 Mar 2021 22:31:59 +0100 Subject: [PATCH 3/5] Fix 
unescaping character reference lacking semicolon in attribute values For historical reasons, inside of an attribute value, a named character reference which is not terminated by a semicolon must be interpreted verbatim. --- htmlmin/parser.py | 5 +++-- htmlmin/python3html/__init__.py | 23 ++++++++++++++++------- htmlmin/tests/tests.py | 5 +---- 3 files changed, 20 insertions(+), 13 deletions(-) diff --git a/htmlmin/parser.py b/htmlmin/parser.py index 5263cca..495a6ee 100644 --- a/htmlmin/parser.py +++ b/htmlmin/parser.py @@ -31,6 +31,7 @@ import re from .python3html.parser import HTMLParser +from .python3html import unescape as py_unescape from . import escape @@ -157,7 +158,7 @@ def build_tag(self, tag, attrs, close_tag): if not self.keep_pre and not pre_prefix: continue if v and self.convert_charrefs and not pre_prefix: - v = HTMLParser.unescape(self, v) + v = py_unescape(v, in_attr=True) if k == 'lang': lang = v if v == self._tag_lang(): @@ -233,7 +234,7 @@ def build_tag(self, tag, attrs, close_tag): '/' if close_tag else ''), lang def handle_decl(self, decl): - if (len(self._data_buffer) == 1 and + if (len(self._data_buffer) == 1 and HTML_SPACE_RE.match(self._data_buffer[0][0])): self._data_buffer = [] self._data_buffer.append('<!' 
+ decl + '>') diff --git a/htmlmin/python3html/__init__.py b/htmlmin/python3html/__init__.py index 1c639ca..a399fdd 100644 --- a/htmlmin/python3html/__init__.py +++ b/htmlmin/python3html/__init__.py @@ -95,7 +95,7 @@ def escape(s, quote=True): } -def _replace_charref(s): +def _replace_charref(s, in_attr): s = s.group(1) if s[0] == '#': # numeric charref @@ -117,10 +117,11 @@ def _replace_charref(s): if v is not None: return v # find the longest matching name (as defined by the standard) - for x in range(len(s)-1, 1, -1): - v = _html5.get(s[:x]) - if v is not None: - return v + s[x:] + if not in_attr: + for x in range(len(s)-1, 1, -1): + v = _html5.get(s[:x]) + if v is not None: + return v + s[x:] return '&' + s @@ -128,7 +129,14 @@ def _replace_charref(s): r'|#[xX][0-9a-fA-F]+;?' r'|[a-zA-Z][0-9a-zA-Z]{,30};?)') -def unescape(s): +# Like _charref but requires ; after named reference, see +# https://html.spec.whatwg.org/multipage/parsing.html#named-character-reference-state +_charref_in_attr = _re.compile(r'&(#[0-9]+;?' + r'|#[xX][0-9a-fA-F]+;?' + r'|[a-zA-Z][0-9a-zA-Z]{,30};)') + + +def unescape(s, in_attr=False): """ Convert all named and numeric character references (e.g. >, >, &x3e;) in the string s to the corresponding unicode characters. @@ -138,4 +146,5 @@ def unescape(s): """ if '&' not in s: return s - return _charref.sub(_replace_charref, s) + return (_charref_in_attr if in_attr else _charref).sub( + lambda m: _replace_charref(m, in_attr), s) diff --git a/htmlmin/tests/tests.py b/htmlmin/tests/tests.py index f70e653..dfb602f 100644 --- a/htmlmin/tests/tests.py +++ b/htmlmin/tests/tests.py @@ -280,10 +280,7 @@ ), 'no_semicolon': ( '/?sect=2¶=5&par=8', - # TODO: Fix. Inside of an attribute value, if named character reference - # does not end with a semicolon and is proceeded by an equal sign, it must - # be left intact. - u'/?sect=2\u00B6=5&par=8', + '/?sect=2¶=5&par=8', '/?sect=2¶=5&par=8', '/?sect=2¶=5&par=8', # TODO: Fix. 
There is no named character reference ‘par’ (even though From a8c63f5462bfe7a5311be712b23a50cfb590c943 Mon Sep 17 00:00:00 2001 From: Michal Nazarewicz <mina86@mina86.com> Date: Tue, 2 Mar 2021 20:58:59 +0100 Subject: [PATCH 4/5] Rework data handling --- htmlmin/parser.py | 108 +++++++++++++++++++---------------------- htmlmin/tests/tests.py | 4 +- 2 files changed, 50 insertions(+), 62 deletions(-) diff --git a/htmlmin/parser.py b/htmlmin/parser.py index 495a6ee..3dd483e 100644 --- a/htmlmin/parser.py +++ b/htmlmin/parser.py @@ -36,14 +36,9 @@ from . import escape # https://www.w3.org/TR/html5/single-page.html#space-character -HTML_SPACE_RE = re.compile('[\x20\x09\x0a\x0c\x0d]+') -HTML_ALL_SPACE_RE = re.compile('^[\x20\x09\x0a\x0c\x0d]+$') -HTML_LEADING_SPACE_RE = re.compile( - '^[\x20\x09\x0a\x0c\x0d]+') -HTML_TRAILING_SPACE_RE = re.compile( - '[\x20\x09\x0a\x0c\x0d]+$') -HTML_LEADING_TRAILING_SPACE_RE = re.compile( - '(^[\x20\x09\x0a\x0c\x0d]+)|([\x20\x09\x0a\x0c\x0d]+$)') +HTML_SPACE_CHARS = '\x20\x09\x0a\x0c\x0d' +HTML_SPACE_RE = re.compile('[{}]+'.format(HTML_SPACE_CHARS)) +HTML_ALL_SPACE_RE = re.compile('^[{}]+$'.format(HTML_SPACE_CHARS)) PRE_TAGS = ('pre', 'textarea') # styles and scripts are never minified # http://www.w3.org/TR/html51/syntax.html#elements-0 @@ -138,6 +133,9 @@ def __init__(self, def _tag_lang(self): return self._tag_stack[0][2] if self._tag_stack else None + def _ends_with_one_of(self, one_of): + return self._data_buffer and self._data_buffer[-1][-1] in one_of + def build_tag(self, tag, attrs, close_tag): has_pre = False @@ -235,7 +233,7 @@ def build_tag(self, tag, attrs, close_tag): def handle_decl(self, decl): if (len(self._data_buffer) == 1 and - HTML_SPACE_RE.match(self._data_buffer[0][0])): + self._data_buffer[0][0] in HTML_SPACE_CHARS): self._data_buffer = [] self._data_buffer.append('<!' 
+ decl + '>') self._after_doctype = True @@ -246,6 +244,10 @@ def _close_tags_up_to(self, tag): for i, t in enumerate(self._tag_stack): if t[1]: num_pres += 1 + if t[0] == 'title': + self._in_title = False + self._title_newly_opened = False + self._title_trailing_whitespace = False if t[0] == tag: break @@ -265,6 +267,7 @@ def handle_starttag(self, tag, attrs): elif self._in_head and tag == 'title': self._in_title = True self._title_newly_opened = True + self._title_trailing_whitespace = False for t in self._tag_stack: closed_by_tags = TAG_SETS.get(t[0]) @@ -301,9 +304,6 @@ def handle_endtag(self, tag): # TODO: Did we know that we were in an head tag?! If not, we need to # reminify everything to remove extra spaces. self._in_head = False - elif tag == 'title': - self._in_title = False - self._title_newly_opened = False try: self._in_pre_tag -= self._close_tags_up_to(tag) except OpenTagNotFoundError: @@ -327,60 +327,50 @@ def handle_comment(self, data): def handle_data(self, data): if self._in_pre_tag > 0: self._data_buffer.append(data) - else: - # remove_all_empty_space matches everything. remove_empty_space only - # matches if there's a newline involved. - if self.remove_all_empty_space or self._in_head or self._after_doctype: - if HTML_ALL_SPACE_RE.match(data): - return - elif (self.remove_empty_space and HTML_ALL_SPACE_RE.match(data) and - ('\n' in data or '\r' in data)): - return - - # if we're in the title, remove leading and trailing whitespace. - # note that the title may be parsed in chunks if entityref's or charrefs - # are encountered. 
- if self._in_title: - if self.__title_trailing_whitespace: - self._data_buffer.append(' ') - self.__title_trailing_whitespace = ( - HTML_ALL_SPACE_RE.match(data[-1]) is not None) - if self._title_newly_opened: - self._title_newly_opened = False - data = HTML_LEADING_TRAILING_SPACE_RE.sub('', data) - else: - data = HTML_TRAILING_SPACE_RE.sub( - '', HTML_LEADING_TRAILING_SPACE_RE.sub(' ', data)) + return - data = HTML_SPACE_RE.sub(' ', data) - if not data: + # remove_all_empty_space matches everything. remove_empty_space only + # matches if there's a newline involved. + if self.remove_all_empty_space or self._in_head or self._after_doctype: + if HTML_ALL_SPACE_RE.match(data): return + elif (self.remove_empty_space and HTML_ALL_SPACE_RE.match(data) and + ('\n' in data or '\r' in data)): + return + + data = HTML_SPACE_RE.sub(' ', data) + if self._title_trailing_whitespace: + data = ' ' + data + self._title_trailing_whitespace = False + elif not data: + return + + if data[0] == ' ': + # This checks for two conditions: + # * If we're in the title, remove leading whitespace. + # * If we're not in a pre block, its possible that we append two spaces + # together, which we want to avoid. For instance, if we remove + # a comment from between two blocks of text: a <!-- B --> c => a c. + if (self._title_newly_opened or self._ends_with_one_of(HTML_SPACE_CHARS)): + data = data[1:] + if not data: + return + + # If we’re in title, delay insertion of trailing white space. We don’t want + # to insert it if we’re going to close the tag. + self._title_newly_opened = False + if self._in_title and data[-1] == ' ': + self._title_trailing_whitespace = True + data = data[:-1] - if self._in_pre_tag == 0 and self._data_buffer: - # If we're not in a pre block, its possible that we append two spaces - # together, which we want to avoid. For instance, if we remove a comment - # from between two blocks of text: a <!-- B --> c => a c. 
- if data[0] == ' ' and self._data_buffer[-1][-1] == ' ': - data = data[1:] - if not data: - return + if data: self._data_buffer.append(data) def handle_entityref(self, data): - if self._in_title: - if not self._title_newly_opened and self.__title_trailing_whitespace: - self._data_buffer.append(' ') - self.__title_trailing_whitespace = False - self._title_newly_opened = False - self._data_buffer.append('&{};'.format(data)) + self.handle_data('&{};'.format(data)) def handle_charref(self, data): - if self._in_title: - if not self._title_newly_opened and self.__title_trailing_whitespace: - self._data_buffer.append(' ') - self.__title_trailing_whitespace = False - self._title_newly_opened = False - self._data_buffer.append('&#{};'.format(data)) + self.handle_data('&#{};'.format(data)) def handle_pi(self, data): self._data_buffer.append('<?' + data + '>') @@ -396,7 +386,7 @@ def reset(self): self._after_doctype = False self._tag_stack = [] self._title_newly_opened = False - self.__title_trailing_whitespace = False + self._title_trailing_whitespace = False HTMLParser.reset(self) def unescape(self, val): diff --git a/htmlmin/tests/tests.py b/htmlmin/tests/tests.py index dfb602f..e185aa5 100644 --- a/htmlmin/tests/tests.py +++ b/htmlmin/tests/tests.py @@ -203,11 +203,9 @@ '<head><title pre> Foo bar ', ' Foo bar ', ), - # TODO: This is invalid HTML but regardless we should handle it sensibly - # rather than removing trailing whitespace everywhere. 'missing_title_end': ( ' Test </head><p>Foo <i> bar </i> and baz. </p>', - '<head><title>Test</head><p> Foo<i> bar</i> and baz.</p>', + '<head><title>Test</head><p>Foo <i> bar </i> and baz. 
</p>', ), 'dont_minify_scripts_or_styles': ( '<body> <script> X </script> <style> X</style> </body>', '<body> <script> X </script> <style> X</style> </body>', From ef79591320e5927e34b4cf0395b2ff159e4d54eb Mon Sep 17 00:00:00 2001 From: Michal Nazarewicz <mina86@mina86.com> Date: Tue, 2 Mar 2021 21:15:41 +0100 Subject: [PATCH 5/5] Convert character references in data MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Re-escape characters in data to minimise code further. In data sections only ampersand and less-than sign need to be escaped. Since characters are always shorter than their entities, not escaping what doesn’t need escaping saves space. Furthermore, don’t escape ampersand in situations in which HTML5 dictates it doesn’t need to be escaped. --- htmlmin/main.py | 8 ++++---- htmlmin/parser.py | 42 ++++++++++++++++++++++++++++++++++++++++++ htmlmin/tests/tests.py | 19 +++++++++---------- 3 files changed, 55 insertions(+), 14 deletions(-) diff --git a/htmlmin/main.py b/htmlmin/main.py index 6d21c06..ee0dcb6 100644 --- a/htmlmin/main.py +++ b/htmlmin/main.py @@ -70,9 +70,9 @@ def minify(input, :param remove_optional_attribute_quotes: When True, optional quotes around attributes are removed. When False, all attribute quotes are left intact. Defaults to True. - :param conver_charrefs: Decode character references such as & and . - to their single charater values where safe. This currently only applies to - attributes. Data content between tags will be left encoded. + :param convert_charrefs: Decode character references such as & and . + to their single charater values where safe. This applies to attributes as + well as data. :param keep_pre: By default, htmlmin uses the special attribute ``pre`` to allow you to demarcate areas of HTML that should not be minified. It removes this attribute as it finds it. Setting this value to ``True`` tells htmlmin @@ -83,7 +83,7 @@ def minify(input, that ``<script>`` and ``<style>`` tags are never minimized. 
:param pre_attr: Specifies the attribute that, when found in an HTML tag, indicates that the content of the tag should not be minified. Defaults to - ``pre``. You can also prefix individual tag attributes with + ``pre``. You can also prefix individual tag attributes with ``{pre_attr}-`` to prevent the contents of the individual attribute from being changed. :return: A string containing the minified HTML. diff --git a/htmlmin/parser.py b/htmlmin/parser.py index 3dd483e..7da241e 100644 --- a/htmlmin/parser.py +++ b/htmlmin/parser.py @@ -136,6 +136,15 @@ def _tag_lang(self): def _ends_with_one_of(self, one_of): return self._data_buffer and self._data_buffer[-1][-1] in one_of + def _pop_if_one_of(self, one_of): + if not self._ends_with_one_of(one_of): + return False + if len(self._data_buffer[-1]) == 1: + self._data_buffer.pop() + else: + self._data_buffer[-1] = self._data_buffer[-1][:-1] + return True + def build_tag(self, tag, attrs, close_tag): has_pre = False @@ -329,6 +338,39 @@ def handle_data(self, data): self._data_buffer.append(data) return + if self.convert_charrefs: + data = HTMLParser.unescape(self, data) + + # If trailing character in buffer is an ampersand (which can happen if + # we’ve removed a close tag) we need to make sure it’s escaped if it needs + # to be. + if self._data_buffer and self._pop_if_one_of('&'): + data = '&' + data + + # Within text only < and & need to be escaped. Furthermore, the latter + # doesn’t need to be escaped under certain conditions. It needs to be + # escaped if: + # + # 1) it’s followed by a hash sign, + # 2) it’s followed by a defined named character reference or + # 3) forms an ambiguous ampersand, i.e. it’s followed by a sequence of + # alphanumeric characters finished with a semicolon (whether or not it + # matches a defined named character reference). + # + # Because there are defined named character references which do not end in + # a semicolon, some ‘&<alnum>+’ strings need escaping and some don’t. 
+ # (Yeah, HTML5 is weird). + # + # For now we’re covering the first case, the second case by checking if + # ampersand is followed by a letter while the third case is partially + # covered by the second and than we also explicitly check for ambiguous + # ampersand with digits as first character. + # + # This way we unnecessarily escape ‘&blah’ but at least we don’t escape + # ‘&123’ while escaping ‘&123;’. + data = re.sub('&(?=[a-zA-Z#]|[0-9][a-zA-Z0-9]*;)', + '&', data).replace('<', '<') + # remove_all_empty_space matches everything. remove_empty_space only # matches if there's a newline involved. if self.remove_all_empty_space or self._in_head or self._after_doctype: diff --git a/htmlmin/tests/tests.py b/htmlmin/tests/tests.py index e185aa5..227793d 100644 --- a/htmlmin/tests/tests.py +++ b/htmlmin/tests/tests.py @@ -197,7 +197,7 @@ ), 'remove_head_spaces': ( '<head> <title> ☃X Y & Z ', - '☃X Y & Z', + '☃X Y & Z', ), 'pre_respected_on_title': ( ' Foo bar ', @@ -235,12 +235,11 @@ ), 'remove_entity_space': ( '

Foo bar baz

', - '

Foo bar baz

', + '

Foo bar baz

', ), - # TODO: Fix, this should generate &amp; 'escape_after_close_tag_removal': ( '


Foo &
amp; bar,
baz &am
p; qux

', - '


Foo & bar,
baz & qux

', + '


Foo &amp; bar,
baz &amp; qux

', ), # Note: the ‘]’ being eaten is Python bug in _markupbase.py, see # https://github.com/python/cpython/pull/24720 @@ -256,14 +255,14 @@ '"'''<.π> "', u'"\'\'\'<.\u03C0> "', '"'''<.π> "', - '"'''<.π> "', + u'"\'\'\'<.\u03C0> "', '"'''<.π> "', ), 'not_escaped': ( 'Tiffany & Co. H&M 1&2 1&2;', 'Tiffany & Co. H&M 1&2 1&2;', 'Tiffany & Co. H&M 1&2 1&2;', - 'Tiffany & Co. H&M 1&2 1&2;', + 'Tiffany & Co. H&M 1&2 1&2;', # TODO: Fix. There is no named character reference ‘M’ and as such ‘&M’ is # perfectly valid way to write ‘&M’ according to HTML5. Changing it to # ‘&M;’ changes the text. This is probably Python bug. @@ -273,14 +272,14 @@ ' 1&2', ' 1&2', ' 1&2', - ' 1&2', + ' 1&2', ' 1&2', ), 'no_semicolon': ( '/?sect=2¶=5&par=8', '/?sect=2¶=5&par=8', '/?sect=2¶=5&par=8', - '/?sect=2¶=5&par=8', + '/?sect=2\u00B6=5&par=8', # TODO: Fix. There is no named character reference ‘par’ (even though # there’s ‘par;’) and as such ‘&par’ is perfectly valid way to write ‘&par’ # according to HTML5. Changing it to ‘∥’ changes the text. This is @@ -429,10 +428,10 @@ def test_poor_minification_quality(self): convert_charrefs=False) def test_basic_minification_quality(self): - self._test_minification_quality(9408, 9398) + self._test_minification_quality(9595, 9582) def test_high_minification_quality(self): - self._test_minification_quality(12518, 12508, + self._test_minification_quality(12705, 12692, remove_all_empty_space=True, remove_comments=True)