diff --git a/htmlmin/main.py b/htmlmin/main.py index 6d21c06..ee0dcb6 100644 --- a/htmlmin/main.py +++ b/htmlmin/main.py @@ -70,9 +70,9 @@ def minify(input, :param remove_optional_attribute_quotes: When True, optional quotes around attributes are removed. When False, all attribute quotes are left intact. Defaults to True. - :param conver_charrefs: Decode character references such as & and . - to their single charater values where safe. This currently only applies to - attributes. Data content between tags will be left encoded. + :param convert_charrefs: Decode character references such as & and . + to their single character values where safe. This applies to attributes as + well as data. :param keep_pre: By default, htmlmin uses the special attribute ``pre`` to allow you to demarcate areas of HTML that should not be minified. It removes this attribute as it finds it. Setting this value to ``True`` tells htmlmin @@ -83,7 +83,7 @@ def minify(input, that `` ', @@ -221,18 +229,70 @@ ('

This is an example.' '

I po polsku and more English.'), ), - 'convert_charrefs': ( - '', - u'', - ), - 'convert_charrefs_false': ( - '', - '', - ), 'dont_convert_pre_attr': ( '', '', ), + 'remove_entity_space': ( + '

Foo bar baz

', + '

Foo bar baz

', + ), + 'escape_after_close_tag_removal': ( + '


Foo &
amp; bar,
baz &am
p; qux

', + '


Foo & bar,
baz & qux

', + ), + # Note: the ‘]’ being eaten is a Python bug in _markupbase.py, see + # https://github.com/python/cpython/pull/24720 + 'leave_cdata_alone': ( + '

Leave alone.', + '

Leave alone.', + ), +} + +# key: (input, out_attribute_on, out_attribute_off, out_text_on, out_text_off) +CONVERT_CHARREFS_TEXTS = { + 'entities': ( + '"'''<.π> "', + u'"\'\'\'<.\u03C0> "', + '"'''<.π> "', + u'"\'\'\'<.\u03C0> "', + '"'''<.π> "', + ), + 'not_escaped': ( + 'Tiffany & Co. H&M 1&2 1&2;', + 'Tiffany & Co. H&M 1&2 1&2;', + 'Tiffany & Co. H&M 1&2 1&2;', + 'Tiffany & Co. H&M 1&2 1&2;', + # TODO: Fix. There is no named character reference ‘M’ and as such ‘&M’ is + # a perfectly valid way to write ‘&amp;M’ according to HTML5. Changing it to + # ‘&M;’ changes the text. This is probably a Python bug. + 'Tiffany & Co. H&M; 1&2 1&2;', + ), + 'at_end': ( + ' 1&2', + ' 1&2', + ' 1&2', + ' 1&2', + ' 1&2', + ), + 'no_semicolon': ( + '/?sect=2¶=5&par=8', + '/?sect=2¶=5&par=8', + '/?sect=2¶=5&par=8', + '/?sect=2\u00B6=5&par=8', + # TODO: Fix. There is no named character reference ‘par’ (even though + # there’s ‘par;’) and as such ‘&par’ is a perfectly valid way to write ‘&amp;par’ + # according to HTML5. Changing it to ‘&par;’ (i.e. ‘∥’) changes the text. This is + # probably a Python bug. 
+ '/?sect=2¶=5∥=8', + ), + 'followed_by_eq': ( + '/?sect=2&para=5', + '/?sect=2¶=5', + '/?sect=2&para=5', + '/?sect=2&para=5', + '/?sect=2&para=5', + ) } SELF_CLOSE_TEXTS = { @@ -334,16 +394,14 @@ ), } +def _make_test(inp, out, **kw): + return lambda self: self.assertEqual(self.minify(inp, **kw), out) + class HTMLMinTestMeta(type): def __new__(cls, name, bases, dct): - def make_test(text): - def inner_test(self): - self.assertEqual(self.minify(text[0]), text[1]) - return inner_test - for k, v in dct.get('__reference_texts__',{}).items(): if 'test_'+k not in dct: - dct['test_'+k] = make_test(v) + dct['test_'+k] = _make_test(*v) return type.__new__(cls, str(name), bases, dct) class HTMLMinTestCase( @@ -354,19 +412,28 @@ def setUp(self): class TestMinifyFunction(HTMLMinTestCase): __reference_texts__ = MINIFY_FUNCTION_TEXTS - def test_basic_minification_quality(self): + def _test_minification_quality(self, want_chars, want_bytes, *args, **kw): import codecs with codecs.open('htmlmin/tests/large_test.html', encoding='utf-8') as inpf: inp = inpf.read() - out = self.minify(inp) - self.assertEqual(len(inp) - len(out), 9408) + out = self.minify(inp, *args, **kw) + got_chars = len(inp) - len(out) + got_bytes = len(inp.encode('utf-8')) - len(out.encode('utf-8')) + self.assertEqual((got_chars, got_bytes), (want_chars, want_bytes)) + + def test_poor_minification_quality(self): + self._test_minification_quality(754, 754, + reduce_empty_attributes=False, + remove_optional_attribute_quotes=False, + convert_charrefs=False) + + def test_basic_minification_quality(self): + self._test_minification_quality(9595, 9582) def test_high_minification_quality(self): - import codecs - with codecs.open('htmlmin/tests/large_test.html', encoding='utf-8') as inpf: - inp = inpf.read() - out = self.minify(inp, remove_all_empty_space=True, remove_comments=True) - self.assertEqual(len(inp) - len(out), 12518) + self._test_minification_quality(12705, 12692, + remove_all_empty_space=True, + 
remove_comments=True) class TestMinifierObject(HTMLMinTestCase): __reference_texts__ = MINIFY_FUNCTION_TEXTS @@ -393,7 +460,7 @@ def test_buffered_input(self): self.minifier.input(text[0][len(text[0]) // 2:]) self.assertEqual(self.minifier.finalize(), text[1]) - + class TestMinifyFeatures(HTMLMinTestCase): __reference_texts__ = FEATURES_TEXTS @@ -479,10 +546,25 @@ def test_dont_minify_scripts_or_styles(self): text = self.__reference_texts__['dont_minify_scripts_or_styles'] self.assertEqual(htmlmin.minify(text[0], pre_tags=[]), text[1]) - def test_convert_charrefs_false(self): - text = self.__reference_texts__['convert_charrefs_false'] - self.assertEqual(htmlmin.minify(text[0], convert_charrefs=False), text[1]) +def _make_test_convert_charrefs(tests): + def setUp(self): self.minify = htmlmin.minify + d = {'setUp': setUp} + + def add_test(key, fmt, inp, out, convert_charrefs): + key = 'test_{}_{}'.format(key, ('off', 'on')[int(convert_charrefs)]) + d[key] = _make_test(fmt.format(inp), fmt.format(out), + convert_charrefs=convert_charrefs) + + for key, test in tests.items(): + inp = test[0] + add_test(key + '_in_attr_value', '', inp, test[1], True) + add_test(key + '_in_attr_value', '', inp, test[2], False) + add_test(key + '_in_text', '

{}', inp, test[3], True) + add_test(key + '_in_text', '

{}', inp, test[4], False) + + return type('TestConvertCharrefs', (unittest.TestCase,), d) +TestConvertCharrefs = _make_test_convert_charrefs(CONVERT_CHARREFS_TEXTS) class TestSelfClosingTags(HTMLMinTestCase): __reference_texts__ = SELF_CLOSE_TEXTS