From ea9438dd50eb142ad1044deeab50c495c7c4a254 Mon Sep 17 00:00:00 2001 From: Kieran Brahney Date: Sat, 28 Dec 2019 12:49:07 +0000 Subject: [PATCH 01/23] Resolves #828 --- composer.json | 1 + src/HtmlProcessor/AbstractHtmlProcessor.php | 103 +++++++++- src/HtmlProcessor/HtmlParser.php | 188 ++++++++++++++++++ .../CssToAttributeConverterTest.php | 4 +- 4 files changed, 284 insertions(+), 12 deletions(-) create mode 100644 src/HtmlProcessor/HtmlParser.php diff --git a/composer.json b/composer.json index 30fcca0a..bffbe577 100644 --- a/composer.json +++ b/composer.json @@ -40,6 +40,7 @@ "php": "~7.1.0 || ~7.2.0 || ~7.3.0 || ~7.4.0 || ~8.0.0", "ext-dom": "*", "ext-libxml": "*", + "masterminds/html5": "^2.7", "symfony/css-selector": "^3.4.32 || ^4.4 || ^5.1" }, "require-dev": { diff --git a/src/HtmlProcessor/AbstractHtmlProcessor.php b/src/HtmlProcessor/AbstractHtmlProcessor.php index 479e6e34..4dd53a34 100644 --- a/src/HtmlProcessor/AbstractHtmlProcessor.php +++ b/src/HtmlProcessor/AbstractHtmlProcessor.php @@ -4,6 +4,9 @@ namespace Pelago\Emogrifier\HtmlProcessor; +use DOMNode; +use Masterminds\HTML5; + /** * Base class for HTML processor that e.g., can remove, add or modify nodes or attributes. * @@ -37,6 +40,11 @@ abstract class AbstractHtmlProcessor */ protected $domDocument = null; + /** + * @var HTML5|null + */ + protected $html5 = null; + /** * @var \DOMXPath */ @@ -120,6 +128,27 @@ public function getDomDocument(): \DOMDocument return $this->domDocument; } + /** + * Provides access to the internal HTML5 instance. + * + * @return HTML5|null + */ + public function getHtml5(): HTML5 + { + if ($this->domDocument === null) { + throw new \UnexpectedValueException( + ( + self::class . + '::fromHtml() has not yet been called on ' . + static::class + ), + 1570472239 + ); + } + + return $this->html5; + } + /** * @param \DOMDocument $domDocument */ @@ -136,7 +165,7 @@ private function setDomDocument(\DOMDocument $domDocument): void */ public function render(): string { - $htmlWithPossibleErroneousClosingTags = $this->getDomDocument()->saveHTML(); + $htmlWithPossibleErroneousClosingTags = $this->saveHTML(); return $this->removeSelfClosingTagsClosingTags($htmlWithPossibleErroneousClosingTags); } @@ -148,7 +177,7 @@ public function render(): string */ public function renderBodyContent(): string { - $htmlWithPossibleErroneousClosingTags = $this->getDomDocument()->saveHTML($this->getBodyElement()); + $htmlWithPossibleErroneousClosingTags = $this->saveHTML($this->getBodyElement()); $bodyNodeHtml = $this->removeSelfClosingTagsClosingTags($htmlWithPossibleErroneousClosingTags); return \preg_replace('%]*+)?+>%', '', $bodyNodeHtml); @@ -198,13 +227,19 @@ private function createUnifiedDomDocument(string $html): void */ private function createRawDomDocument(string $html): void { - $domDocument = new \DOMDocument(); - $domDocument->strictErrorChecking = false; - $domDocument->formatOutput = true; - $libXmlState = \libxml_use_internal_errors(true); - $domDocument->loadHTML($this->prepareHtmlForDomConversion($html)); - \libxml_clear_errors(); - \libxml_use_internal_errors($libXmlState); + $html = $this->prepareHtmlForDomConversion($html); + if ($this->isHtml5($html)) { + $this->html5 = new HTML5(['disable_html_ns' => true]); + $domDocument = $this->html5->parse($html); + } else { + $domDocument = new \DOMDocument(); + $domDocument->strictErrorChecking = false; + $domDocument->formatOutput = true; + $libXmlState = \libxml_use_internal_errors(true); + $domDocument->loadHTML($html); + \libxml_clear_errors(); + \libxml_use_internal_errors($libXmlState); + } $this->setDomDocument($domDocument); } @@ -220,7 +255,8 @@ private function createRawDomDocument(string $html): void private function prepareHtmlForDomConversion(string $html): string { $htmlWithSelfClosingSlashes = $this->ensurePhpUnrecognizedSelfClosingTagsAreXml($html); - $htmlWithDocumentType = $this->ensureDocumentType($htmlWithSelfClosingSlashes); + $htmlWithRootElms = $this->addMissingRootElements($htmlWithSelfClosingSlashes); + $htmlWithDocumentType = $this->ensureDocumentType($htmlWithRootElms); return $this->addContentTypeMetaTag($htmlWithDocumentType); } @@ -334,4 +370,51 @@ private function ensureExistenceOfBodyElement(): void } $htmlElement->appendChild($this->getDomDocument()->createElement('body')); } + + /** + * masterminds/html5-php has some quirks where it doesn't handle the same as DOMDocument. This fixes those instances: + * - content before html/body + * - missing or elements + * + * @param string $html + * @return string + */ + private function addMissingRootElements(string $html) + { + $parser = new HtmlParser; + $parser->loadHtml($html); + + return $parser->saveHtml(); + } + + /** + * Check if the document contains a HTML5 DOCTYPE. + * + * @param string $html + * @return bool + */ + private function isHtml5(string $html) + { + return strspn($html, " \t\r\n") === stripos($html, ''); + } + + /** + * Dumps the internal document into a string using HTML formatting. + * + * @param DOMNode $dom [optional] parameter to output a subset of the document. + * @return string the HTML, or false if an error occurred. + */ + private function saveHTML($dom = null) + { + if (isset($this->html5)) { + if ($dom === null) { + $dom = $this->domDocument; + } + + return $this->html5->saveHTML($dom); + } + + // Fall back to DOMDocument. + return $this->getDomDocument()->saveHTML($dom); + } } diff --git a/src/HtmlProcessor/HtmlParser.php b/src/HtmlProcessor/HtmlParser.php new file mode 100644 index 00000000..fa5072d4 --- /dev/null +++ b/src/HtmlProcessor/HtmlParser.php @@ -0,0 +1,188 @@ + + */ +class HtmlParser +{ + /** + * Structure of a basic HTML document. + * + * @var array + */ + protected $tree = [ + 'doctype' => '', + 'html' => [ + 'start' => "", + 'end' => "", // can't have attributes on closing html tags + 'content' => [], + ], + 'head' => [ + 'start' => "", // can't have attributes on head tag + 'end' => "", // can't have attributes on closing head tag + 'content' => [] + ], + 'body' => [ + 'start' => "", + 'end' => "", // can't have attributes on closing body tag + 'content' => [] + ], + ]; + + /** + * What root element did we last add to. + * + * @var string|null + */ + protected $previousKey = null; + + /** + * Parse a HTML document. + * + * @param string $html + */ + public function loadHtml($html) + { + $i = 0; + while ($i < strlen($html)) { + if ($html[$i] == "<") { + // Found a tag, get chars until the end of the tag. + $tag = ""; + while ($i < strlen($html) && $html[$i] != ">") { + $tag .= $html[$i++]; + } + + if ($i < strlen($html) && $html[$i] == ">") { + $tag .= $html[$i++]; + + // Copy any whitespace following the tag. + // Anything added here needs to be added to the rtrim in the nodeName function. + while ($i < strlen($html) && preg_match('/\s/', $html[$i])) { + $tag .= $html[$i++]; + } + } else { + // Missing closing tag? + $tag .= ">"; + } + + $this->addToTree($tag); + } else { + $this->addToTree($html[$i++]); + } + } + } + + /** + * Format the document in a structured way (ensures root elements exists and moves scripts/css into ). + * + * @return string + */ + public function saveHtml() + { + // Initialise buffer. + $buffer = ''; + + // Add - this is optional. + $buffer .= $this->tree['doctype']; + + // Add + $buffer .= $this->tree['html']['start']; + + // Add head + $buffer .= $this->tree['head']['start']; + foreach ($this->tree['head']['content'] as $node) { + $buffer .= $node; + } + $buffer .= $this->tree['head']['end']; + + // Add body + $buffer .= $this->tree['body']['start']; + foreach ($this->tree['body']['content'] as $node) { + $buffer .= $node; + } + $buffer .= $this->tree['body']['end']; + + // Close tag + return $buffer . $this->tree['html']['end']; + } + + /** + * Add a node into the tree for the correct parent. + * + * @param string $node + * @return bool + */ + protected function addToTree($node) + { + if ($node[0] == "<") { + switch (strtolower($this->nodeName($node))) { + case "!doctype": + if (empty($this->tree['doctype'])) { + return $this->tree['doctype'] = $node; + } + + // Don't overwrite if we've already got a doctype defintion. + return true; + + case "html": + return $this->addTo('html', $node, false); + + case "head": + return $this->addTo('head', $node); + + default: + return $this->addTo($this->previousKey ?? 'body', $node); + } + } + + // text node + return $this->addTo($this->previousKey ?? 'body', $node); + } + + /** + * Add a node to the the tree. + * + * @param string $key + * @param string $node + * @param bool $setPrevious + * @return bool + */ + protected function addTo($key, $node, $setPrevious = true) + { + $previousKey = $key; + + if (stripos($node, "<$key") !== false) { + $this->tree[$key]['start'] = $node; + } elseif (stristr($node, "/$key>")) { + $this->tree[$key]['end'] = $node; + $previousKey = null; + } else { + $this->tree[$key]['content'][] = $node; + } + + if ($setPrevious) { + $this->previousKey = $previousKey; + } + + return true; + } + + /** + * Get the name of a node without + * + * @param string $node + * @return string + */ + protected function nodeName($node) + { + $name = preg_replace('/>\s*/', '', ltrim($node, " ['Hi'], 'float: none' => ['
'], 'p.border-spacing' => ['

Hi

'], - 'height: auto' => [''], - 'width: auto' => [''], + 'height: auto' => [''], + 'width: auto' => [''], ]; } From 9eec51f91f1accfa3c8aa475fa9171c8225bcd90 Mon Sep 17 00:00:00 2001 From: Kieran Brahney Date: Tue, 14 Jan 2020 12:21:50 +0000 Subject: [PATCH 02/23] phpcs fixes --- src/HtmlProcessor/AbstractHtmlProcessor.php | 10 ++-- src/HtmlProcessor/HtmlParser.php | 56 +++++++++++---------- 2 files changed, 35 insertions(+), 31 deletions(-) diff --git a/src/HtmlProcessor/AbstractHtmlProcessor.php b/src/HtmlProcessor/AbstractHtmlProcessor.php index 4dd53a34..40835033 100644 --- a/src/HtmlProcessor/AbstractHtmlProcessor.php +++ b/src/HtmlProcessor/AbstractHtmlProcessor.php @@ -132,6 +132,7 @@ public function getDomDocument(): \DOMDocument * Provides access to the internal HTML5 instance. * * @return HTML5|null + * @throws \UnexpectedValueException */ public function getHtml5(): HTML5 { @@ -372,7 +373,8 @@ private function ensureExistenceOfBodyElement(): void } /** - * masterminds/html5-php has some quirks where it doesn't handle the same as DOMDocument. This fixes those instances: + * masterminds/html5-php has some quirks where it doesn't handle the same as DOMDocument. + * This fixes those instances: * - content before html/body * - missing or elements * @@ -381,7 +383,7 @@ private function ensureExistenceOfBodyElement(): void */ private function addMissingRootElements(string $html) { - $parser = new HtmlParser; + $parser = new HtmlParser(); $parser->loadHtml($html); return $parser->saveHtml(); @@ -395,7 +397,7 @@ private function addMissingRootElements(string $html) */ private function isHtml5(string $html) { - return strspn($html, " \t\r\n") === stripos($html, ''); + return \strspn($html, " \t\r\n") === \stripos($html, ''); } /** @@ -404,7 +406,7 @@ private function isHtml5(string $html) * @param DOMNode $dom [optional] parameter to output a subset of the document. * @return string the HTML, or false if an error occurred. */ - private function saveHTML($dom = null) + private function saveHTML(DOMNode $dom = null) { if (isset($this->html5)) { if ($dom === null) { diff --git a/src/HtmlProcessor/HtmlParser.php b/src/HtmlProcessor/HtmlParser.php index fa5072d4..7f454fdc 100644 --- a/src/HtmlProcessor/HtmlParser.php +++ b/src/HtmlProcessor/HtmlParser.php @@ -19,18 +19,18 @@ class HtmlParser protected $tree = [ 'doctype' => '', 'html' => [ - 'start' => "", - 'end' => "", // can't have attributes on closing html tags + 'start' => '', + 'end' => '', 'content' => [], ], 'head' => [ - 'start' => "", // can't have attributes on head tag - 'end' => "", // can't have attributes on closing head tag + 'start' => '', + 'end' => '', 'content' => [] ], 'body' => [ - 'start' => "", - 'end' => "", // can't have attributes on closing body tag + 'start' => '', + 'end' => '', 'content' => [] ], ]; @@ -47,28 +47,28 @@ class HtmlParser * * @param string $html */ - public function loadHtml($html) + public function loadHtml(string $html) { $i = 0; - while ($i < strlen($html)) { - if ($html[$i] == "<") { + while ($i < \strlen($html)) { + if ($html[$i] == '<') { // Found a tag, get chars until the end of the tag. - $tag = ""; - while ($i < strlen($html) && $html[$i] != ">") { + $tag = ''; + while ($i < \strlen($html) && $html[$i] != '>') { $tag .= $html[$i++]; } - if ($i < strlen($html) && $html[$i] == ">") { + if ($i < \strlen($html) && $html[$i] == '>') { $tag .= $html[$i++]; // Copy any whitespace following the tag. // Anything added here needs to be added to the rtrim in the nodeName function. - while ($i < strlen($html) && preg_match('/\s/', $html[$i])) { + while ($i < \strlen($html) && \preg_match('/\s/', $html[$i])) { $tag .= $html[$i++]; } } else { // Missing closing tag? - $tag .= ">"; + $tag .= '>'; } $this->addToTree($tag); @@ -118,22 +118,24 @@ public function saveHtml() * @param string $node * @return bool */ - protected function addToTree($node) + protected function addToTree(string $node) { - if ($node[0] == "<") { - switch (strtolower($this->nodeName($node))) { - case "!doctype": + if ($node[0] == '<') { + switch (\strtolower($this->nodeName($node))) { + case '!doctype': if (empty($this->tree['doctype'])) { - return $this->tree['doctype'] = $node; + $this->tree['doctype'] = $node; + + return $this->tree['doctype']; } // Don't overwrite if we've already got a doctype defintion. return true; - case "html": + case 'html': return $this->addTo('html', $node, false); - case "head": + case 'head': return $this->addTo('head', $node); default: @@ -153,13 +155,13 @@ protected function addToTree($node) * @param bool $setPrevious * @return bool */ - protected function addTo($key, $node, $setPrevious = true) + protected function addTo(string $key, string $node, bool $setPrevious = true) { $previousKey = $key; - if (stripos($node, "<$key") !== false) { + if (\stripos($node, '<' . $key) !== false) { $this->tree[$key]['start'] = $node; - } elseif (stristr($node, "/$key>")) { + } elseif (\stristr($node, '/' . $key . '>')) { $this->tree[$key]['end'] = $node; $previousKey = null; } else { @@ -179,10 +181,10 @@ protected function addTo($key, $node, $setPrevious = true) * @param string $node * @return string */ - protected function nodeName($node) + protected function nodeName(string $node) { - $name = preg_replace('/>\s*/', '', ltrim($node, "\s*/', '', \ltrim($node, ' Date: Tue, 14 Jan 2020 12:33:48 +0000 Subject: [PATCH 03/23] phpcs fixes --- src/HtmlProcessor/AbstractHtmlProcessor.php | 4 ++++ src/HtmlProcessor/HtmlParser.php | 18 +++++++++++------- 2 files changed, 15 insertions(+), 7 deletions(-) diff --git a/src/HtmlProcessor/AbstractHtmlProcessor.php b/src/HtmlProcessor/AbstractHtmlProcessor.php index 40835033..a5a264c0 100644 --- a/src/HtmlProcessor/AbstractHtmlProcessor.php +++ b/src/HtmlProcessor/AbstractHtmlProcessor.php @@ -132,6 +132,7 @@ public function getDomDocument(): \DOMDocument * Provides access to the internal HTML5 instance. * * @return HTML5|null + * * @throws \UnexpectedValueException */ public function getHtml5(): HTML5 @@ -379,6 +380,7 @@ private function ensureExistenceOfBodyElement(): void * - missing or elements * * @param string $html + * * @return string */ private function addMissingRootElements(string $html) @@ -393,6 +395,7 @@ private function addMissingRootElements(string $html) * Check if the document contains a HTML5 DOCTYPE. * * @param string $html + * * @return bool */ private function isHtml5(string $html) @@ -404,6 +407,7 @@ private function isHtml5(string $html) * Dumps the internal document into a string using HTML formatting. * * @param DOMNode $dom [optional] parameter to output a subset of the document. + * * @return string the HTML, or false if an error occurred. */ private function saveHTML(DOMNode $dom = null) diff --git a/src/HtmlProcessor/HtmlParser.php b/src/HtmlProcessor/HtmlParser.php index 7f454fdc..53e9fb9a 100644 --- a/src/HtmlProcessor/HtmlParser.php +++ b/src/HtmlProcessor/HtmlParser.php @@ -26,12 +26,12 @@ class HtmlParser 'head' => [ 'start' => '', 'end' => '', - 'content' => [] + 'content' => [], ], 'body' => [ 'start' => '', 'end' => '', - 'content' => [] + 'content' => [], ], ]; @@ -50,20 +50,21 @@ class HtmlParser public function loadHtml(string $html) { $i = 0; - while ($i < \strlen($html)) { + $len = \strlen($html); + while ($i < $len) { if ($html[$i] == '<') { // Found a tag, get chars until the end of the tag. $tag = ''; - while ($i < \strlen($html) && $html[$i] != '>') { + while ($i < $len && $html[$i] != '>') { $tag .= $html[$i++]; } - if ($i < \strlen($html) && $html[$i] == '>') { + if ($i < $len && $html[$i] == '>') { $tag .= $html[$i++]; // Copy any whitespace following the tag. // Anything added here needs to be added to the rtrim in the nodeName function. - while ($i < \strlen($html) && \preg_match('/\s/', $html[$i])) { + while ($i < $len && \preg_match('/\s/', $html[$i])) { $tag .= $html[$i++]; } } else { @@ -116,6 +117,7 @@ public function saveHtml() * Add a node into the tree for the correct parent. * * @param string $node + * * @return bool */ protected function addToTree(string $node) @@ -152,7 +154,8 @@ protected function addToTree(string $node) * * @param string $key * @param string $node - * @param bool $setPrevious + * @param bool $setPrevious + * * @return bool */ protected function addTo(string $key, string $node, bool $setPrevious = true) @@ -179,6 +182,7 @@ protected function addTo(string $key, string $node, bool $setPrevious = true) * Get the name of a node without * * @param string $node + * * @return string */ protected function nodeName(string $node) From 4e57439bab914a04ac4dde2d20f0c2c18c87c935 Mon Sep 17 00:00:00 2001 From: Kieran Brahney Date: Tue, 14 Jan 2020 12:47:03 +0000 Subject: [PATCH 04/23] phpmd fix --- src/HtmlProcessor/HtmlParser.php | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/HtmlProcessor/HtmlParser.php b/src/HtmlProcessor/HtmlParser.php index 53e9fb9a..d948b403 100644 --- a/src/HtmlProcessor/HtmlParser.php +++ b/src/HtmlProcessor/HtmlParser.php @@ -138,15 +138,15 @@ protected function addToTree(string $node) return $this->addTo('html', $node, false); case 'head': - return $this->addTo('head', $node); + return $this->addTo('head', $node, true); default: - return $this->addTo($this->previousKey ?? 'body', $node); + return $this->addTo($this->previousKey ?? 'body', $node, true); } } // text node - return $this->addTo($this->previousKey ?? 'body', $node); + return $this->addTo($this->previousKey ?? 'body', $node, true); } /** @@ -158,7 +158,7 @@ protected function addToTree(string $node) * * @return bool */ - protected function addTo(string $key, string $node, bool $setPrevious = true) + protected function addTo(string $key, string $node, bool $setPrevious) { $previousKey = $key; From 7561579e4ef1197473d150d9b38fb552bc1eaf2b Mon Sep 17 00:00:00 2001 From: Kieran Brahney Date: Tue, 14 Jan 2020 12:47:15 +0000 Subject: [PATCH 05/23] fix typo --- src/HtmlProcessor/HtmlParser.php | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/HtmlProcessor/HtmlParser.php b/src/HtmlProcessor/HtmlParser.php index d948b403..e6f0deeb 100644 --- a/src/HtmlProcessor/HtmlParser.php +++ b/src/HtmlProcessor/HtmlParser.php @@ -131,7 +131,7 @@ protected function addToTree(string $node) return $this->tree['doctype']; } - // Don't overwrite if we've already got a doctype defintion. + // Don't overwrite if we've already got a doctype definition. return true; case 'html': From b5a2a0e794bb8f7b2d903ca0c886ec9368fd79b0 Mon Sep 17 00:00:00 2001 From: Kieran Brahney Date: Tue, 14 Jan 2020 12:56:36 +0000 Subject: [PATCH 06/23] fix psalm? --- src/HtmlProcessor/AbstractHtmlProcessor.php | 2 +- src/HtmlProcessor/HtmlParser.php | 52 +++++++++++---------- 2 files changed, 28 insertions(+), 26 deletions(-) diff --git a/src/HtmlProcessor/AbstractHtmlProcessor.php b/src/HtmlProcessor/AbstractHtmlProcessor.php index a5a264c0..ffa483c5 100644 --- a/src/HtmlProcessor/AbstractHtmlProcessor.php +++ b/src/HtmlProcessor/AbstractHtmlProcessor.php @@ -131,7 +131,7 @@ public function getDomDocument(): \DOMDocument /** * Provides access to the internal HTML5 instance. * - * @return HTML5|null + * @return HTML5 * * @throws \UnexpectedValueException */ diff --git a/src/HtmlProcessor/HtmlParser.php b/src/HtmlProcessor/HtmlParser.php index e6f0deeb..4aa3af3b 100644 --- a/src/HtmlProcessor/HtmlParser.php +++ b/src/HtmlProcessor/HtmlParser.php @@ -52,19 +52,19 @@ public function loadHtml(string $html) $i = 0; $len = \strlen($html); while ($i < $len) { - if ($html[$i] == '<') { + if ((string)$html[$i] == '<') { // Found a tag, get chars until the end of the tag. $tag = ''; - while ($i < $len && $html[$i] != '>') { + while ($i < $len && (string)$html[$i] != '>') { $tag .= $html[$i++]; } - if ($i < $len && $html[$i] == '>') { + if ($i < $len && (string)$html[$i] == '>') { $tag .= $html[$i++]; // Copy any whitespace following the tag. // Anything added here needs to be added to the rtrim in the nodeName function. - while ($i < $len && \preg_match('/\s/', $html[$i])) { + while ($i < $len && \preg_match('/\s/', (string)$html[$i])) { $tag .= $html[$i++]; } } else { @@ -90,35 +90,35 @@ public function saveHtml() $buffer = ''; // Add - this is optional. - $buffer .= $this->tree['doctype']; + $buffer .= (string)$this->tree['doctype']; // Add - $buffer .= $this->tree['html']['start']; + $buffer .= (string)$this->tree['html']['start']; // Add head - $buffer .= $this->tree['head']['start']; - foreach ($this->tree['head']['content'] as $node) { - $buffer .= $node; + $buffer .= (string)$this->tree['head']['start']; + foreach ((array)$this->tree['head']['content'] as $node) { + $buffer .= (string)$node; } - $buffer .= $this->tree['head']['end']; + $buffer .= (string)$this->tree['head']['end']; // Add body $buffer .= $this->tree['body']['start']; - foreach ($this->tree['body']['content'] as $node) { - $buffer .= $node; + foreach ((array)$this->tree['body']['content'] as $node) { + $buffer .= (string)$node; } - $buffer .= $this->tree['body']['end']; + $buffer .= (string)$this->tree['body']['end']; // Close tag - return $buffer . $this->tree['html']['end']; + return $buffer . (string)$this->tree['html']['end']; } /** * Add a node into the tree for the correct parent. * - * @param string $node + * @param string $node * - * @return bool + * @return void */ protected function addToTree(string $node) { @@ -128,25 +128,29 @@ protected function addToTree(string $node) if (empty($this->tree['doctype'])) { $this->tree['doctype'] = $node; - return $this->tree['doctype']; + return; } // Don't overwrite if we've already got a doctype definition. - return true; + return; case 'html': - return $this->addTo('html', $node, false); + $this->addTo('html', $node, false); + return; case 'head': - return $this->addTo('head', $node, true); + $this->addTo('head', $node, true); + return; default: - return $this->addTo($this->previousKey ?? 'body', $node, true); + $this->addTo($this->previousKey ?? 'body', $node, true); + return; } } // text node - return $this->addTo($this->previousKey ?? 'body', $node, true); + $this->addTo($this->previousKey ?? 'body', $node, true); + return; } /** @@ -156,7 +160,7 @@ protected function addToTree(string $node) * @param string $node * @param bool $setPrevious * - * @return bool + * @return void */ protected function addTo(string $key, string $node, bool $setPrevious) { @@ -174,8 +178,6 @@ protected function addTo(string $key, string $node, bool $setPrevious) if ($setPrevious) { $this->previousKey = $previousKey; } - - return true; } /** From dcea31f1e3d99176cf72475941728bc15d7e0aa6 Mon Sep 17 00:00:00 2001 From: Kieran Brahney Date: Tue, 14 Jan 2020 12:57:55 +0000 Subject: [PATCH 07/23] removed unnecessary return statement --- src/HtmlProcessor/HtmlParser.php | 1 - 1 file changed, 1 deletion(-) diff --git a/src/HtmlProcessor/HtmlParser.php b/src/HtmlProcessor/HtmlParser.php index 4aa3af3b..ee521e3f 100644 --- a/src/HtmlProcessor/HtmlParser.php +++ b/src/HtmlProcessor/HtmlParser.php @@ -150,7 +150,6 @@ protected function addToTree(string $node) // text node $this->addTo($this->previousKey ?? 'body', $node, true); - return; } /** From 948a72a594739bdf823aa80c06624c951548a9af Mon Sep 17 00:00:00 2001 From: Kieran Brahney Date: Tue, 14 Jan 2020 13:20:06 +0000 Subject: [PATCH 08/23] suppress psalm --- src/HtmlProcessor/HtmlParser.php | 41 +++++++++++++++++++++----------- 1 file changed, 27 insertions(+), 14 deletions(-) diff --git a/src/HtmlProcessor/HtmlParser.php b/src/HtmlProcessor/HtmlParser.php index ee521e3f..db10a7f6 100644 --- a/src/HtmlProcessor/HtmlParser.php +++ b/src/HtmlProcessor/HtmlParser.php @@ -46,25 +46,29 @@ class HtmlParser * Parse a HTML document. * * @param string $html + * + * @return void + * + * @psalm-suppress MixedOperand */ public function loadHtml(string $html) { $i = 0; $len = \strlen($html); while ($i < $len) { - if ((string)$html[$i] == '<') { + if ($html[$i] == '<') { // Found a tag, get chars until the end of the tag. $tag = ''; - while ($i < $len && (string)$html[$i] != '>') { + while ($i < $len && $html[$i] != '>') { $tag .= $html[$i++]; } - if ($i < $len && (string)$html[$i] == '>') { + if ($i < $len && $html[$i] == '>') { $tag .= $html[$i++]; // Copy any whitespace following the tag. // Anything added here needs to be added to the rtrim in the nodeName function. - while ($i < $len && \preg_match('/\s/', (string)$html[$i])) { + while ($i < $len && \preg_match('/\s/', $html[$i])) { $tag .= $html[$i++]; } } else { @@ -83,6 +87,10 @@ public function loadHtml(string $html) * Format the document in a structured way (ensures root elements exists and moves scripts/css into ). * * @return string + * + * @psalm-suppress MixedArrayAccess + * @psalm-suppress MixedAssignment + * @psalm-suppress MixedOperand */ public function saveHtml() { @@ -90,27 +98,27 @@ public function saveHtml() $buffer = ''; // Add - this is optional. - $buffer .= (string)$this->tree['doctype']; + $buffer .= $this->tree['doctype']; // Add - $buffer .= (string)$this->tree['html']['start']; + $buffer .= $this->tree['html']['start']; // Add head - $buffer .= (string)$this->tree['head']['start']; - foreach ((array)$this->tree['head']['content'] as $node) { - $buffer .= (string)$node; + $buffer .= $this->tree['head']['start']; + foreach ($this->tree['head']['content'] as $node) { + $buffer .= $node; } - $buffer .= (string)$this->tree['head']['end']; + $buffer .= $this->tree['head']['end']; // Add body $buffer .= $this->tree['body']['start']; - foreach ((array)$this->tree['body']['content'] as $node) { - $buffer .= (string)$node; + foreach ($this->tree['body']['content'] as $node) { + $buffer .= $node; } - $buffer .= (string)$this->tree['body']['end']; + $buffer .= $this->tree['body']['end']; // Close tag - return $buffer . (string)$this->tree['html']['end']; + return $buffer . $this->tree['html']['end']; } /** @@ -136,14 +144,17 @@ protected function addToTree(string $node) case 'html': $this->addTo('html', $node, false); + return; case 'head': $this->addTo('head', $node, true); + return; default: $this->addTo($this->previousKey ?? 'body', $node, true); + return; } } @@ -160,6 +171,8 @@ protected function addToTree(string $node) * @param bool $setPrevious * * @return void + * + * @psalm-suppress MixedArrayAssignment */ protected function addTo(string $key, string $node, bool $setPrevious) { From 1f4b4b81686ce3a60f92e59d9355b96fbd276b76 Mon Sep 17 00:00:00 2001 From: Kieran Brahney Date: Tue, 14 Jan 2020 13:24:28 +0000 Subject: [PATCH 09/23] fix psalm --- src/HtmlProcessor/AbstractHtmlProcessor.php | 2 +- src/HtmlProcessor/HtmlParser.php | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/HtmlProcessor/AbstractHtmlProcessor.php b/src/HtmlProcessor/AbstractHtmlProcessor.php index ffa483c5..a5a264c0 100644 --- a/src/HtmlProcessor/AbstractHtmlProcessor.php +++ b/src/HtmlProcessor/AbstractHtmlProcessor.php @@ -131,7 +131,7 @@ public function getDomDocument(): \DOMDocument /** * Provides access to the internal HTML5 instance. * - * @return HTML5 + * @return HTML5|null * * @throws \UnexpectedValueException */ diff --git a/src/HtmlProcessor/HtmlParser.php b/src/HtmlProcessor/HtmlParser.php index db10a7f6..848cd7b6 100644 --- a/src/HtmlProcessor/HtmlParser.php +++ b/src/HtmlProcessor/HtmlParser.php @@ -63,12 +63,12 @@ public function loadHtml(string $html) $tag .= $html[$i++]; } - if ($i < $len && $html[$i] == '>') { + if ($i < $len && (string)$html[$i] == '>') { $tag .= $html[$i++]; // Copy any whitespace following the tag. // Anything added here needs to be added to the rtrim in the nodeName function. - while ($i < $len && \preg_match('/\s/', $html[$i])) { + while ($i < $len && \preg_match('/\s/', (string)$html[$i])) { $tag .= $html[$i++]; } } else { From 73dd75bce8e7fbe447015cd2b6764eed40d8ded1 Mon Sep 17 00:00:00 2001 From: Kieran Brahney Date: Tue, 14 Jan 2020 13:27:50 +0000 Subject: [PATCH 10/23] fix psalm --- src/HtmlProcessor/AbstractHtmlProcessor.php | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/HtmlProcessor/AbstractHtmlProcessor.php b/src/HtmlProcessor/AbstractHtmlProcessor.php index a5a264c0..f6ecde76 100644 --- a/src/HtmlProcessor/AbstractHtmlProcessor.php +++ b/src/HtmlProcessor/AbstractHtmlProcessor.php @@ -41,7 +41,7 @@ abstract class AbstractHtmlProcessor protected $domDocument = null; /** - * @var HTML5|null + * @var HTML5 */ protected $html5 = null; @@ -131,7 +131,7 @@ public function getDomDocument(): \DOMDocument /** * Provides access to the internal HTML5 instance. * - * @return HTML5|null + * @return HTML5 * * @throws \UnexpectedValueException */ From ad25ad3c5d1372b9b711fa49dad14274a1759625 Mon Sep 17 00:00:00 2001 From: Kieran Brahney Date: Tue, 14 Jan 2020 13:34:10 +0000 Subject: [PATCH 11/23] fix psalm --- src/HtmlProcessor/AbstractHtmlProcessor.php | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/HtmlProcessor/AbstractHtmlProcessor.php b/src/HtmlProcessor/AbstractHtmlProcessor.php index f6ecde76..5c5bc3f0 100644 --- a/src/HtmlProcessor/AbstractHtmlProcessor.php +++ b/src/HtmlProcessor/AbstractHtmlProcessor.php @@ -41,7 +41,7 @@ abstract class AbstractHtmlProcessor protected $domDocument = null; /** - * @var HTML5 + * @var HTML5|null */ protected $html5 = null; @@ -137,7 +137,7 @@ public function getDomDocument(): \DOMDocument */ public function getHtml5(): HTML5 { - if ($this->domDocument === null) { + if ($this->domDocument === null || $this->html5 === null) { throw new \UnexpectedValueException( ( self::class . From 593362e13681827ff0534c79dbb4c488a77100a8 Mon Sep 17 00:00:00 2001 From: Kieran Brahney Date: Thu, 21 May 2020 13:36:52 +0100 Subject: [PATCH 12/23] Reverted changes to tests --- tests/Unit/HtmlProcessor/CssToAttributeConverterTest.php | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/tests/Unit/HtmlProcessor/CssToAttributeConverterTest.php b/tests/Unit/HtmlProcessor/CssToAttributeConverterTest.php index 1127cc7a..78182bc2 100644 --- a/tests/Unit/HtmlProcessor/CssToAttributeConverterTest.php +++ b/tests/Unit/HtmlProcessor/CssToAttributeConverterTest.php @@ -103,10 +103,8 @@ public function matchingCssToHtmlMappingDataProvider(): array 'background => bgcolor' => ['

Bonjour

', 'bgcolor="red"'], 'width with px' => ['

Hi

', 'width="100"'], 'width with %' => ['

Hi

', 'width="50%"'], - 'width with decimal %' => ['

Hi

', 'width="50.5%"'], 'height with px' => ['

Hi

', 'height="100"'], 'height with %' => ['

Hi

', 'height="50%"'], - 'height with decimal %' => ['

Hi

', 'height="50.5%"'], 'img.margin: 0 auto (horizontal centering) => align=center' => [ '', 'align="center"', @@ -173,8 +171,8 @@ public function notMatchingCssToHtmlMappingDataProvider(): array 'span.float' => ['Hi'], 'float: none' => ['
'], 'p.border-spacing' => ['

Hi

'], - 'height: auto' => [''], - 'width: auto' => [''], + 'height: auto' => [''], + 'width: auto' => [''], ]; } From 1dd812ed352fb8490ddb00ab7ffca996949b518a Mon Sep 17 00:00:00 2001 From: Kieran Brahney Date: Thu, 21 May 2020 17:23:59 +0100 Subject: [PATCH 13/23] Removed HtmlParser.php --- src/HtmlProcessor/AbstractHtmlProcessor.php | 25 +-- src/HtmlProcessor/HtmlParser.php | 208 -------------------- 2 files changed, 3 insertions(+), 230 deletions(-) delete mode 100644 src/HtmlProcessor/HtmlParser.php diff --git a/src/HtmlProcessor/AbstractHtmlProcessor.php b/src/HtmlProcessor/AbstractHtmlProcessor.php index 5c5bc3f0..7fe8215b 100644 --- a/src/HtmlProcessor/AbstractHtmlProcessor.php +++ b/src/HtmlProcessor/AbstractHtmlProcessor.php @@ -257,8 +257,7 @@ private function createRawDomDocument(string $html): void private function prepareHtmlForDomConversion(string $html): string { $htmlWithSelfClosingSlashes = $this->ensurePhpUnrecognizedSelfClosingTagsAreXml($html); - $htmlWithRootElms = $this->addMissingRootElements($htmlWithSelfClosingSlashes); - $htmlWithDocumentType = $this->ensureDocumentType($htmlWithRootElms); + $htmlWithDocumentType = $this->ensureDocumentType($htmlWithSelfClosingSlashes); return $this->addContentTypeMetaTag($htmlWithDocumentType); } @@ -373,24 +372,6 @@ private function ensureExistenceOfBodyElement(): void $htmlElement->appendChild($this->getDomDocument()->createElement('body')); } - /** - * masterminds/html5-php has some quirks where it doesn't handle the same as DOMDocument. - * This fixes those instances: - * - content before html/body - * - missing or elements - * - * @param string $html - * - * @return string - */ - private function addMissingRootElements(string $html) - { - $parser = new HtmlParser(); - $parser->loadHtml($html); - - return $parser->saveHtml(); - } - /** * Check if the document contains a HTML5 DOCTYPE. * @@ -398,7 +379,7 @@ private function addMissingRootElements(string $html) * * @return bool */ - private function isHtml5(string $html) + private function isHtml5(string $html): bool { return \strspn($html, " \t\r\n") === \stripos($html, ''); } @@ -410,7 +391,7 @@ private function isHtml5(string $html) * * @return string the HTML, or false if an error occurred. */ - private function saveHTML(DOMNode $dom = null) + private function saveHTML(DOMNode $dom = null): string { if (isset($this->html5)) { if ($dom === null) { diff --git a/src/HtmlProcessor/HtmlParser.php b/src/HtmlProcessor/HtmlParser.php deleted file mode 100644 index 848cd7b6..00000000 --- a/src/HtmlProcessor/HtmlParser.php +++ /dev/null @@ -1,208 +0,0 @@ - - */ -class HtmlParser -{ - /** - * Structure of a basic HTML document. - * - * @var array - */ - protected $tree = [ - 'doctype' => '', - 'html' => [ - 'start' => '', - 'end' => '', - 'content' => [], - ], - 'head' => [ - 'start' => '', - 'end' => '', - 'content' => [], - ], - 'body' => [ - 'start' => '', - 'end' => '', - 'content' => [], - ], - ]; - - /** - * What root element did we last add to. - * - * @var string|null - */ - protected $previousKey = null; - - /** - * Parse a HTML document. - * - * @param string $html - * - * @return void - * - * @psalm-suppress MixedOperand - */ - public function loadHtml(string $html) - { - $i = 0; - $len = \strlen($html); - while ($i < $len) { - if ($html[$i] == '<') { - // Found a tag, get chars until the end of the tag. - $tag = ''; - while ($i < $len && $html[$i] != '>') { - $tag .= $html[$i++]; - } - - if ($i < $len && (string)$html[$i] == '>') { - $tag .= $html[$i++]; - - // Copy any whitespace following the tag. - // Anything added here needs to be added to the rtrim in the nodeName function. - while ($i < $len && \preg_match('/\s/', (string)$html[$i])) { - $tag .= $html[$i++]; - } - } else { - // Missing closing tag? - $tag .= '>'; - } - - $this->addToTree($tag); - } else { - $this->addToTree($html[$i++]); - } - } - } - - /** - * Format the document in a structured way (ensures root elements exists and moves scripts/css into ). - * - * @return string - * - * @psalm-suppress MixedArrayAccess - * @psalm-suppress MixedAssignment - * @psalm-suppress MixedOperand - */ - public function saveHtml() - { - // Initialise buffer. - $buffer = ''; - - // Add - this is optional. - $buffer .= $this->tree['doctype']; - - // Add - $buffer .= $this->tree['html']['start']; - - // Add head - $buffer .= $this->tree['head']['start']; - foreach ($this->tree['head']['content'] as $node) { - $buffer .= $node; - } - $buffer .= $this->tree['head']['end']; - - // Add body - $buffer .= $this->tree['body']['start']; - foreach ($this->tree['body']['content'] as $node) { - $buffer .= $node; - } - $buffer .= $this->tree['body']['end']; - - // Close tag - return $buffer . $this->tree['html']['end']; - } - - /** - * Add a node into the tree for the correct parent. - * - * @param string $node - * - * @return void - */ - protected function addToTree(string $node) - { - if ($node[0] == '<') { - switch (\strtolower($this->nodeName($node))) { - case '!doctype': - if (empty($this->tree['doctype'])) { - $this->tree['doctype'] = $node; - - return; - } - - // Don't overwrite if we've already got a doctype definition. - return; - - case 'html': - $this->addTo('html', $node, false); - - return; - - case 'head': - $this->addTo('head', $node, true); - - return; - - default: - $this->addTo($this->previousKey ?? 'body', $node, true); - - return; - } - } - - // text node - $this->addTo($this->previousKey ?? 'body', $node, true); - } - - /** - * Add a node to the the tree. - * - * @param string $key - * @param string $node - * @param bool $setPrevious - * - * @return void - * - * @psalm-suppress MixedArrayAssignment - */ - protected function addTo(string $key, string $node, bool $setPrevious) - { - $previousKey = $key; - - if (\stripos($node, '<' . $key) !== false) { - $this->tree[$key]['start'] = $node; - } elseif (\stristr($node, '/' . $key . '>')) { - $this->tree[$key]['end'] = $node; - $previousKey = null; - } else { - $this->tree[$key]['content'][] = $node; - } - - if ($setPrevious) { - $this->previousKey = $previousKey; - } - } - - /** - * Get the name of a node without - * - * @param string $node - * - * @return string - */ - protected function nodeName(string $node) - { - $name = \preg_replace('/>\s*/', '', \ltrim($node, ' Date: Thu, 21 May 2020 17:24:11 +0100 Subject: [PATCH 14/23] Removed getHtml5 --- src/HtmlProcessor/AbstractHtmlProcessor.php | 23 --------------------- 1 file changed, 23 deletions(-) diff --git a/src/HtmlProcessor/AbstractHtmlProcessor.php b/src/HtmlProcessor/AbstractHtmlProcessor.php index 7fe8215b..9275e3a2 100644 --- a/src/HtmlProcessor/AbstractHtmlProcessor.php +++ b/src/HtmlProcessor/AbstractHtmlProcessor.php @@ -128,29 +128,6 @@ public function getDomDocument(): \DOMDocument return $this->domDocument; } - /** - * Provides access to the internal HTML5 instance. - * - * @return HTML5 - * - * @throws \UnexpectedValueException - */ - public function getHtml5(): HTML5 - { - if ($this->domDocument === null || $this->html5 === null) { - throw new \UnexpectedValueException( - ( - self::class . - '::fromHtml() has not yet been called on ' . - static::class - ), - 1570472239 - ); - } - - return $this->html5; - } - /** * @param \DOMDocument $domDocument */ From 1254c9af1dc20bee89f711ebbabddd31f69253a0 Mon Sep 17 00:00:00 2001 From: Kieran Brahney Date: Fri, 22 May 2020 13:39:57 +0100 Subject: [PATCH 15/23] Reverted changes to tests --- tests/Unit/HtmlProcessor/CssToAttributeConverterTest.php | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/Unit/HtmlProcessor/CssToAttributeConverterTest.php b/tests/Unit/HtmlProcessor/CssToAttributeConverterTest.php index 78182bc2..544763d5 100644 --- a/tests/Unit/HtmlProcessor/CssToAttributeConverterTest.php +++ b/tests/Unit/HtmlProcessor/CssToAttributeConverterTest.php @@ -103,8 +103,10 @@ public function matchingCssToHtmlMappingDataProvider(): array 'background => bgcolor' => ['

Bonjour

', 'bgcolor="red"'], 'width with px' => ['

Hi

', 'width="100"'], 'width with %' => ['

Hi

', 'width="50%"'], + 'width with decimal %' => ['

Hi

', 'width="50.5%"'], 'height with px' => ['

Hi

', 'height="100"'], 'height with %' => ['

Hi

', 'height="50%"'], + 'height with decimal %' => ['

Hi

', 'height="50.5%"'], 'img.margin: 0 auto (horizontal centering) => align=center' => [ '', 'align="center"', From 4a5d56957690b4f8561bb3048067dda2cba79177 Mon Sep 17 00:00:00 2001 From: Kieran Brahney Date: Mon, 6 Jul 2020 18:43:35 +0100 Subject: [PATCH 16/23] Only use if masterminds/html5 is installed --- composer.json | 4 +++- src/HtmlProcessor/AbstractHtmlProcessor.php | 3 ++- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/composer.json b/composer.json index bffbe577..1d63e7e9 100644 --- a/composer.json +++ b/composer.json @@ -40,7 +40,6 @@ "php": "~7.1.0 || ~7.2.0 || ~7.3.0 || ~7.4.0 || ~8.0.0", "ext-dom": "*", "ext-libxml": "*", - "masterminds/html5": "^2.7", "symfony/css-selector": "^3.4.32 || ^4.4 || ^5.1" }, "require-dev": { @@ -48,6 +47,9 @@ "slevomat/coding-standard": "^6.4.1", "squizlabs/php_codesniffer": "^3.5.8" }, + "suggest": { + "masterminds/html5": "Use instead of PHP's built-in DOMDocument for HTML5 support." + }, "autoload": { "psr-4": { "Pelago\\Emogrifier\\": "src/" diff --git a/src/HtmlProcessor/AbstractHtmlProcessor.php b/src/HtmlProcessor/AbstractHtmlProcessor.php index 9275e3a2..e8323412 100644 --- a/src/HtmlProcessor/AbstractHtmlProcessor.php +++ b/src/HtmlProcessor/AbstractHtmlProcessor.php @@ -358,7 +358,8 @@ private function ensureExistenceOfBodyElement(): void */ private function isHtml5(string $html): bool { - return \strspn($html, " \t\r\n") === \stripos($html, ''); + return \strspn($html, " \t\r\n") === \stripos($html, '') + && class_exists(HTML5::class); } /** From 22c3ee75df7e0454d7b9265548474cfc8b57c28f Mon Sep 17 00:00:00 2001 From: Kieran Brahney Date: Sun, 13 Sep 2020 13:53:23 +0100 Subject: [PATCH 17/23] Added masterminds/html5 to require-dev --- composer.json | 1 + 1 file changed, 1 insertion(+) diff --git a/composer.json b/composer.json index 1d63e7e9..91036bd0 100644 --- a/composer.json +++ b/composer.json @@ -43,6 +43,7 @@ "symfony/css-selector": "^3.4.32 || ^4.4 || ^5.1" }, "require-dev": { + "masterminds/html5": "^2.7", "php-parallel-lint/php-parallel-lint": "^1.2.0", "slevomat/coding-standard": "^6.4.1", "squizlabs/php_codesniffer": "^3.5.8" From 3070946713f88f18f5672445c616de18201ae926 Mon Sep 17 00:00:00 2001 From: Kieran Brahney Date: Sun, 13 Sep 2020 13:53:57 +0100 Subject: [PATCH 18/23] added html5 flag to fromHtml --- src/HtmlProcessor/AbstractHtmlProcessor.php | 70 +++++++++++---------- 1 file changed, 38 insertions(+), 32 deletions(-) diff --git a/src/HtmlProcessor/AbstractHtmlProcessor.php b/src/HtmlProcessor/AbstractHtmlProcessor.php index e8323412..385e3e0e 100644 --- a/src/HtmlProcessor/AbstractHtmlProcessor.php +++ b/src/HtmlProcessor/AbstractHtmlProcessor.php @@ -63,19 +63,20 @@ private function __construct() * Builds a new instance from the given HTML. * * @param string $unprocessedHtml raw HTML, must be UTF-encoded, must not be empty + * @param bool $html5 use masterminds/html5 parser instead of DOMDocument. * * @return static * * @throws \InvalidArgumentException if $unprocessedHtml is anything other than a non-empty string */ - public static function fromHtml(string $unprocessedHtml): self + public static function fromHtml(string $unprocessedHtml, bool $html5 = false): self { if ($unprocessedHtml === '') { throw new \InvalidArgumentException('The provided HTML must not be empty.', 1515763647); } $instance = new static(); - $instance->setHtml($unprocessedHtml); + $instance->setHtml($unprocessedHtml, $html5); return $instance; } @@ -99,10 +100,11 @@ public static function fromDomDocument(\DOMDocument $document): self * Sets the HTML to process. * * @param string $html the HTML to process, must be UTF-8-encoded + * @param bool $html5 use masterminds/html5 parser instead of DOMDocument. */ - private function setHtml(string $html): void + private function setHtml(string $html, bool $html5): void { - $this->createUnifiedDomDocument($html); + $this->createUnifiedDomDocument($html, $html5); } /** @@ -192,13 +194,36 @@ private function getBodyElement(): \DOMElement * The DOM document will always have a BODY element and a document type. * * @param string $html + * @param bool $html5 */ - private function createUnifiedDomDocument(string $html): void + private function createUnifiedDomDocument(string $html, bool $html5): void { - $this->createRawDomDocument($html); + $html = $this->prepareHtmlForDomConversion($html); + + $html5 ? $this->createHtml5Document($html) : $this->createRawDomDocument($html); + $this->ensureExistenceOfBodyElement(); } + /** + * Creates a HTML5 document parser instance from the given HTML. + * + * @param string $html + * + * @throws \RuntimeException + */ + private function createHtml5Document(string $html): void + { + if (! class_exists(HTML5::class)) { + throw new \RuntimeException("Class " . HTML5::class . "not found. Install the masterminds/html5 library."); + } + + $this->html5 = new HTML5(['disable_html_ns' => true]); + $domDocument = $this->html5->parse($html); + + $this->setDomDocument($domDocument); + } + /** * Creates a DOMDocument instance from the given HTML and stores it in $this->domDocument. * @@ -206,19 +231,13 @@ private function createUnifiedDomDocument(string $html): void */ private function createRawDomDocument(string $html): void { - $html = $this->prepareHtmlForDomConversion($html); - if ($this->isHtml5($html)) { - $this->html5 = new HTML5(['disable_html_ns' => true]); - $domDocument = $this->html5->parse($html); - } else { - $domDocument = new \DOMDocument(); - $domDocument->strictErrorChecking = false; - $domDocument->formatOutput = true; - $libXmlState = \libxml_use_internal_errors(true); - $domDocument->loadHTML($html); - \libxml_clear_errors(); - \libxml_use_internal_errors($libXmlState); - } + $domDocument = new \DOMDocument(); + $domDocument->strictErrorChecking = false; + $domDocument->formatOutput = true; + $libXmlState = \libxml_use_internal_errors(true); + $domDocument->loadHTML($html); + \libxml_clear_errors(); + \libxml_use_internal_errors($libXmlState); $this->setDomDocument($domDocument); } @@ -349,19 +368,6 @@ private function ensureExistenceOfBodyElement(): void $htmlElement->appendChild($this->getDomDocument()->createElement('body')); } - /** - * Check if the document contains a HTML5 DOCTYPE. - * - * @param string $html - * - * @return bool - */ - private function isHtml5(string $html): bool - { - return \strspn($html, " \t\r\n") === \stripos($html, '') - && class_exists(HTML5::class); - } - /** * Dumps the internal document into a string using HTML formatting. * From 5a8adf06d0819da1dbe82128e0657fa717be71e6 Mon Sep 17 00:00:00 2001 From: Kieran Brahney Date: Sun, 13 Sep 2020 14:00:21 +0100 Subject: [PATCH 19/23] added an env flag --- src/HtmlProcessor/AbstractHtmlProcessor.php | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/src/HtmlProcessor/AbstractHtmlProcessor.php b/src/HtmlProcessor/AbstractHtmlProcessor.php index 385e3e0e..e9d7ce47 100644 --- a/src/HtmlProcessor/AbstractHtmlProcessor.php +++ b/src/HtmlProcessor/AbstractHtmlProcessor.php @@ -69,7 +69,7 @@ private function __construct() * * @throws \InvalidArgumentException if $unprocessedHtml is anything other than a non-empty string */ - public static function fromHtml(string $unprocessedHtml, bool $html5 = false): self + public static function fromHtml(string $unprocessedHtml, ?bool $html5 = null): self { if ($unprocessedHtml === '') { throw new \InvalidArgumentException('The provided HTML must not be empty.', 1515763647); @@ -102,8 +102,11 @@ public static function fromDomDocument(\DOMDocument $document): self * @param string $html the HTML to process, must be UTF-8-encoded * @param bool $html5 use masterminds/html5 parser instead of DOMDocument. */ - private function setHtml(string $html, bool $html5): void + private function setHtml(string $html, ?bool $html5): void { + // If html5 is NULL, fallback to the environment flag. + $html5 = $html5 ?? $this->isHtml5Env(); + $this->createUnifiedDomDocument($html, $html5); } @@ -388,4 +391,14 @@ private function saveHTML(DOMNode $dom = null): string // Fall back to DOMDocument. return $this->getDomDocument()->saveHTML($dom); } + + /** + * Check whether HTML5 environment is enabled. + * + * @return bool + */ + private function isHtml5Env(): bool + { + return (bool) (getenv('EMOGRIFIER_HTML5') ?? false); + } } From 87438f7230d7a51955160a3cff7560af51360f64 Mon Sep 17 00:00:00 2001 From: Kieran Brahney Date: Sun, 13 Sep 2020 14:15:08 +0100 Subject: [PATCH 20/23] updated ci:tests --- .github/CONTRIBUTING.md | 2 +- .github/workflows/ci.yml | 2 +- composer.json | 5 ++++- 3 files changed, 6 insertions(+), 3 deletions(-) diff --git a/.github/CONTRIBUTING.md b/.github/CONTRIBUTING.md index 6287d654..26325d58 100644 --- a/.github/CONTRIBUTING.md +++ b/.github/CONTRIBUTING.md @@ -81,7 +81,7 @@ code coverage of the fixed bugs and the new features. To run the existing PHPUnit tests, run this command: ```shell -composer ci:tests:unit +composer ci:tests ``` diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 440ac5aa..e3d3e43f 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -155,4 +155,4 @@ jobs: fi - name: Run Tests - run: composer ci:tests:unit + run: composer ci:tests diff --git a/composer.json b/composer.json index 91036bd0..0c29d0e6 100644 --- a/composer.json +++ b/composer.json @@ -77,9 +77,12 @@ "ci:php:md": "\"./tools/phpmd.phar\" src text config/phpmd.xml", "ci:php:psalm": "\"./tools/psalm.phar\" --show-info=false", "ci:tests:unit": "\"./tools/phpunit.phar\"", + "ci:tests:html5:unit": "EMOGRIFIER_HTML5=true \"./tools/phpunit.phar\"", "ci:tests:sof": "\"./tools/phpunit.phar\" --stop-on-failure", + "ci:tests:html5:sof": "EMOGRIFIER_HTML5=true \"./tools/phpunit.phar\" --stop-on-failure", "ci:tests": [ - "@ci:tests:unit" + "@ci:tests:unit", + "@ci:tests:html5:unit" ], "ci:dynamic": [ "@ci:tests" From dda45f8fcc04fffa8ec4bcf4693d0e26ab330735 Mon Sep 17 00:00:00 2001 From: Kieran Brahney Date: Sun, 13 Sep 2020 14:17:25 +0100 Subject: [PATCH 21/23] fix psalm --- src/HtmlProcessor/AbstractHtmlProcessor.php | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/HtmlProcessor/AbstractHtmlProcessor.php b/src/HtmlProcessor/AbstractHtmlProcessor.php index e9d7ce47..5d6724b9 100644 --- a/src/HtmlProcessor/AbstractHtmlProcessor.php +++ b/src/HtmlProcessor/AbstractHtmlProcessor.php @@ -217,7 +217,7 @@ private function createUnifiedDomDocument(string $html, bool $html5): void */ private function createHtml5Document(string $html): void { - if (! class_exists(HTML5::class)) { + if (!\class_exists(HTML5::class)) { throw new \RuntimeException("Class " . HTML5::class . "not found. Install the masterminds/html5 library."); } @@ -399,6 +399,6 @@ private function saveHTML(DOMNode $dom = null): string */ private function isHtml5Env(): bool { - return (bool) (getenv('EMOGRIFIER_HTML5') ?? false); + return (bool)(\getenv('EMOGRIFIER_HTML5') ?? false); } } From e159cf087f5b049b63f2af7f518259b0b268ceef Mon Sep 17 00:00:00 2001 From: Kieran Brahney Date: Sun, 13 Sep 2020 14:18:58 +0100 Subject: [PATCH 22/23] fix psalm --- src/HtmlProcessor/AbstractHtmlProcessor.php | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/HtmlProcessor/AbstractHtmlProcessor.php b/src/HtmlProcessor/AbstractHtmlProcessor.php index 5d6724b9..c2eccf5e 100644 --- a/src/HtmlProcessor/AbstractHtmlProcessor.php +++ b/src/HtmlProcessor/AbstractHtmlProcessor.php @@ -218,7 +218,7 @@ private function createUnifiedDomDocument(string $html, bool $html5): void private function createHtml5Document(string $html): void { if (!\class_exists(HTML5::class)) { - throw new \RuntimeException("Class " . HTML5::class . "not found. Install the masterminds/html5 library."); + throw new \RuntimeException('Class ' . HTML5::class . 'not found. Install the masterminds/html5 library.'); } $this->html5 = new HTML5(['disable_html_ns' => true]); From 11eb83e0d39b82f3382ebb9499c193218c251571 Mon Sep 17 00:00:00 2001 From: Kieran Brahney Date: Sun, 13 Sep 2020 14:29:12 +0100 Subject: [PATCH 23/23] fix psalm --- src/HtmlProcessor/AbstractHtmlProcessor.php | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/src/HtmlProcessor/AbstractHtmlProcessor.php b/src/HtmlProcessor/AbstractHtmlProcessor.php index c2eccf5e..0c73695e 100644 --- a/src/HtmlProcessor/AbstractHtmlProcessor.php +++ b/src/HtmlProcessor/AbstractHtmlProcessor.php @@ -399,6 +399,11 @@ private function saveHTML(DOMNode $dom = null): string */ private function isHtml5Env(): bool { - return (bool)(\getenv('EMOGRIFIER_HTML5') ?? false); + $env = \getenv('EMOGRIFIER_HTML5'); + if (is_bool($env)) { + return $env; + } + + return false; } }