From 03b12dc3117737b8960bc4144102ef5cca346547 Mon Sep 17 00:00:00 2001
From: Dennis Snell <dmsnell@git.wordpress.org>
Date: Fri, 20 Sep 2024 22:30:04 +0000
Subject: [PATCH] HTML API: Add `normalize()` to give us the HTML we always
 wanted.

HTML often appears in ways that are unexpected. It may be missing implicit tags, may have unquoted, single-quoted, or double-quoted attributes, may contain duplicate attributes, may contain unescaped text content, or any number of other possible invalid constructions. The HTML API understands all of these inputs, but downstream parsers may not, and HTML snippets which are safe on their own may introduce problems when joined with other HTML snippets.

This patch introduces the `serialize()` method on the HTML Processor, which prints a fully-normative HTML output, eliminating invalid markup along the way. It produces a string which contains every missing tag, double-quoted attributes, and no duplicates. A `normalize()` static method on the HTML Processor provides a convenient wrapper for constructing a fragment parser and immediately serializing.

Subclasses relying on the `serialize_token()` method may perform structural HTML modifications with as much security as the upcoming `\Dom\HTMLDocument()` parser will, though these are not
able to provide the full safety that will eventually appear with `set_inner_html()`.

Further work may explore serializing to XML (which involves a number of other important transformations) and adding constraints to serialization (such as only allowing inline/flow/formatting elements and text).

Developed in https://github.com/wordpress/wordpress-develop/pull/7331
Discussed in https://core.trac.wordpress.org/ticket/62036

Props dmsnell, jonsurrell, westonruter.
Fixes #62036.


git-svn-id: https://develop.svn.wordpress.org/trunk@59076 602fd350-edb4-49c9-b593-d223f7449a82
---
 .../html-api/class-wp-html-processor.php      | 210 +++++++++++++
 .../html-api/class-wp-html-tag-processor.php  |   3 +
 .../html-api/wpHtmlProcessor-serialize.php    | 287 ++++++++++++++++++
 3 files changed, 500 insertions(+)
 create mode 100644 tests/phpunit/tests/html-api/wpHtmlProcessor-serialize.php
diff --git a/src/wp-includes/html-api/class-wp-html-processor.php b/src/wp-includes/html-api/class-wp-html-processor.php
index caa6de1d0905a..ba8be1e7d4004 100644
--- a/src/wp-includes/html-api/class-wp-html-processor.php
+++ b/src/wp-includes/html-api/class-wp-html-processor.php
@@ -1029,6 +1029,216 @@ public function get_current_depth(): int {
 		return count( $this->breadcrumbs );
 	}
 
+	/**
+	 * Normalizes an HTML fragment by serializing it.
+	 *
+	 * This method assumes that the given HTML snippet is found in BODY context.
+	 * For normalizing full documents or fragments found in other contexts, create
+	 * a new processor using {@see WP_HTML_Processor::create_fragment} or
+	 * {@see WP_HTML_Processor::create_full_parser} and call {@see WP_HTML_Processor::serialize}
+	 * on the created instances.
+	 *
+	 * Many aspects of an input HTML fragment may be changed during normalization.
+	 *
+	 *  - Attribute values will be double-quoted.
+	 *  - Duplicate attributes will be removed.
+	 *  - Omitted tags will be added.
+	 *  - Tag and attribute name casing will be lower-cased,
+	 *    except for specific SVG and MathML tags or attributes.
+	 *  - Text will be re-encoded, null bytes handled,
+	 *    and invalid UTF-8 replaced with U+FFFD.
+	 *  - Any incomplete syntax trailing at the end will be omitted,
+	 *    for example, an unclosed comment opener will be removed.
+	 *
+	 * Example:
+	 *
+	 *     echo WP_HTML_Processor::normalize( '<a href=#anchor v=5 href="/" enabled>One</a another v=5><!--' );
+	 *     // <a href="#anchor" v="5" enabled>One</a>
+	 *
+	 *     echo WP_HTML_Processor::normalize( '<div></p>fun<table><td>cell</div>' );
+	 *     // <div><p></p>fun<table><tbody><tr><td>cell</td></tr></tbody></table></div>
+	 *
+	 *     echo WP_HTML_Processor::normalize( '<![CDATA[invalid comment]]> syntax < <> "oddities"' );
+	 *     // <!--[CDATA[invalid comment]]--> syntax &lt; &lt;&gt; &quot;oddities&quot;
+	 *
+	 * @since 6.7.0
+	 *
+	 * @param string $html Input HTML to normalize.
+	 * @return string|null Normalized output, or `null` if no processor was created or serialization failed.
+	 */
+	public static function normalize( string $html ): ?string {
+		$processor = static::create_fragment( $html );
+		return null === $processor ? null : $processor->serialize();
+	}
+
+	/**
+	 * Returns normalized HTML for a fragment by serializing it.
+	 *
+	 * This differs from {@see WP_HTML_Processor::normalize} in that it starts with
+	 * a specific HTML Processor, which _must_ still be in the initial ready state.
+	 * Every token is consumed while serializing. A warning is triggered and `null`
+	 * is returned if scanning had already started or if parsing hits an error.
+	 *
+	 * Many aspects of an input HTML fragment may be changed during normalization.
+	 *
+	 *  - Attribute values will be double-quoted.
+	 *  - Duplicate attributes will be removed.
+	 *  - Omitted tags will be added.
+	 *  - Tag and attribute name casing will be lower-cased,
+	 *    except for specific SVG and MathML tags or attributes.
+	 *  - Text will be re-encoded, null bytes handled,
+	 *    and invalid UTF-8 replaced with U+FFFD.
+	 *  - Any incomplete syntax trailing at the end will be omitted,
+	 *    for example, an unclosed comment opener will be removed.
+	 *
+	 * Example:
+	 *
+	 *     $processor = WP_HTML_Processor::create_fragment( '<a href=#anchor v=5 href="/" enabled>One</a another v=5><!--' );
+	 *     echo $processor->serialize();
+	 *     // <a href="#anchor" v="5" enabled>One</a>
+	 *
+	 *     $processor = WP_HTML_Processor::create_fragment( '<div></p>fun<table><td>cell</div>' );
+	 *     echo $processor->serialize();
+	 *     // <div><p></p>fun<table><tbody><tr><td>cell</td></tr></tbody></table></div>
+	 *
+	 *     $processor = WP_HTML_Processor::create_fragment( '<![CDATA[invalid comment]]> syntax < <> "oddities"' );
+	 *     echo $processor->serialize();
+	 *     // <!--[CDATA[invalid comment]]--> syntax &lt; &lt;&gt; &quot;oddities&quot;
+	 *
+	 * @since 6.7.0
+	 *
+	 * @return string|null Normalized HTML markup represented by processor,
+	 *                     or `null` if unable to generate serialization.
+	 */
+	public function serialize(): ?string {
+		if ( WP_HTML_Tag_Processor::STATE_READY !== $this->parser_state ) {
+			wp_trigger_error(
+				__METHOD__,
+				'An HTML Processor which has already started processing cannot serialize its contents. Serialize immediately after creating the instance.',
+				E_USER_WARNING
+			);
+			return null;
+		}
+
+		$html = '';
+		while ( $this->next_token() ) {
+			$html .= $this->serialize_token();
+		}
+
+		if ( null !== $this->get_last_error() ) {
+			wp_trigger_error(
+				__METHOD__,
+				"Cannot serialize HTML Processor with parsing error: {$this->get_last_error()}.",
+				E_USER_WARNING
+			);
+			return null;
+		}
+
+		return $html;
+	}
+
+	/**
+	 * Serializes the currently-matched token.
+	 *
+	 * This method produces a fully-normative HTML string for the currently-matched token,
+	 * if able. If not matched at any token or if the token doesn't correspond to any HTML
+	 * it will return an empty string (for example, presumptuous end tags are ignored).
+	 *
+	 * @see static::serialize()
+	 *
+	 * @since 6.7.0
+	 *
+	 * @return string Serialization of token, or empty string if no serialization exists.
+	 */
+	protected function serialize_token(): string {
+		$html       = '';
+		$token_type = $this->get_token_type();
+
+		switch ( $token_type ) {
+			case '#text':
+				$html .= htmlspecialchars( $this->get_modifiable_text(), ENT_QUOTES | ENT_SUBSTITUTE | ENT_HTML5, 'UTF-8' );
+				break;
+
+			// Unlike the `<>` which is interpreted as plaintext, this is ignored entirely.
+			case '#presumptuous-tag':
+				break;
+
+			case '#funky-comment':
+			case '#comment':
+				$html .= "<!--{$this->get_full_comment_text()}-->";
+				break;
+
+			case '#cdata-section':
+				$html .= "<![CDATA[{$this->get_modifiable_text()}]]>";
+				break;
+
+			// Token types are `#`-prefixed; every DOCTYPE is normalized to the HTML5 form.
+			case '#doctype':
+				$html .= '<!DOCTYPE html>';
+				break;
+		}
+
+		if ( '#tag' !== $token_type ) {
+			return $html;
+		}
+
+		$tag_name       = str_replace( "\x00", "\u{FFFD}", $this->get_tag() );
+		$in_html        = 'html' === $this->get_namespace();
+		$qualified_name = $in_html ? strtolower( $tag_name ) : str_replace( "\x00", "\u{FFFD}", $this->get_qualified_tag_name() );
+
+		// Closers are emitted bare; anything else found inside them is not serialized.
+		if ( $this->is_tag_closer() ) {
+			return "</{$qualified_name}>";
+		}
+
+		$attribute_names = $this->get_attribute_names_with_prefix( '' );
+		if ( ! isset( $attribute_names ) ) {
+			return "<{$qualified_name}>";
+		}
+
+		$html .= "<{$qualified_name}";
+		foreach ( $attribute_names as $attribute_name ) {
+			$html .= " {$this->get_qualified_attribute_name( $attribute_name )}";
+			$value = $this->get_attribute( $attribute_name );
+
+			if ( is_string( $value ) ) {
+				$html .= '="' . htmlspecialchars( $value, ENT_QUOTES | ENT_SUBSTITUTE | ENT_HTML5, 'UTF-8' ) . '"';
+			}
+		}
+		// Replace NULL bytes once for the whole opening tag instead of once per attribute.
+		$html = str_replace( "\x00", "\u{FFFD}", $html );
+
+		if ( ! $in_html && $this->has_self_closing_flag() ) {
+			$html .= ' /';
+		}
+
+		$html .= '>';
+
+		// Flush out self-contained elements.
+		if ( $in_html && in_array( $tag_name, array( 'IFRAME', 'NOEMBED', 'NOFRAMES', 'SCRIPT', 'STYLE', 'TEXTAREA', 'TITLE', 'XMP' ), true ) ) {
+			$text = $this->get_modifiable_text();
+
+			switch ( $tag_name ) {
+				case 'IFRAME':
+				case 'NOEMBED':
+				case 'NOFRAMES':
+					$text = '';
+					break;
+
+				case 'SCRIPT':
+				case 'STYLE':
+					break;
+
+				default:
+					$text = htmlspecialchars( $text, ENT_QUOTES | ENT_SUBSTITUTE | ENT_HTML5, 'UTF-8' );
+			}
+
+			$html .= "{$text}</{$qualified_name}>";
+		}
+
+		return $html;
+	}
+
 	/**
 	 * Parses next element in the 'initial' insertion mode.
 	 *
diff --git a/src/wp-includes/html-api/class-wp-html-tag-processor.php b/src/wp-includes/html-api/class-wp-html-tag-processor.php
index 2b115dd156014..5e29fc7d1590c 100644
--- a/src/wp-includes/html-api/class-wp-html-tag-processor.php
+++ b/src/wp-includes/html-api/class-wp-html-tag-processor.php
@@ -1984,6 +1984,9 @@ private function parse_next_tag(): bool {
 				 *                     [#x10000-#xEFFFF]
 				 * > NameChar      ::= NameStartChar | "-" | "." | [0-9] | #xB7 | [#x0300-#x036F] | [#x203F-#x2040]
 				 *
+				 * @todo Processing instruction nodes in SGML may contain any kind of markup. XML defines a
+				 *       special case with `<?xml ... ?>` syntax, but the `?` is part of the bogus comment.
+				 *
 				 * @see https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-PITarget
 				 */
 				if ( $this->token_length >= 5 && '?' === $html[ $closer_at - 1 ] ) {
diff --git a/tests/phpunit/tests/html-api/wpHtmlProcessor-serialize.php b/tests/phpunit/tests/html-api/wpHtmlProcessor-serialize.php
new file mode 100644
index 0000000000000..e05ca28473e04
--- /dev/null
+++ b/tests/phpunit/tests/html-api/wpHtmlProcessor-serialize.php
@@ -0,0 +1,287 @@
+<?php
+/**
+ * Unit tests covering WP_HTML_Processor serialization functionality.
+ *
+ * @package WordPress
+ * @subpackage HTML-API
+ *
+ * @since 6.7.0
+ */
+
+/**
+ * @group html-api
+ *
+ * @coversDefaultClass WP_HTML_Processor
+ */
+class Tests_HtmlApi_WpHtmlProcessor_Serialize extends WP_UnitTestCase {
+	/**
+	 * Ensures that basic text is properly encoded when serialized.
+	 *
+	 * @ticket 62036
+	 */
+	public function test_properly_encodes_text() {
+		$this->assertSame(
+			'apples &gt; oranges',
+			WP_HTML_Processor::normalize( "apples > or\x00anges" ),
+			'Should have returned an HTML string with applicable characters properly encoded.'
+		);
+	}
+
+	/**
+	 * Ensures that unclosed elements are explicitly closed to ensure proper HTML isolation.
+	 *
+	 * When thinking about embedding HTML fragments into others, it's important that unclosed
+	 * elements aren't left dangling, otherwise a snippet of HTML may "swallow" parts of the
+	 * document that follow it.
+	 *
+	 * @ticket 62036
+	 */
+	public function test_closes_unclosed_elements_at_end() {
+		$this->assertSame(
+			'<div></div>',
+			WP_HTML_Processor::normalize( '<div>' ),
+			'Should have provided the explicit closer to the un-closed DIV element.'
+		);
+	}
+
+	/**
+	 * Ensures that boolean attributes remain boolean and do not gain values.
+	 *
+	 * @ticket 62036
+	 */
+	public function test_boolean_attributes_remain_boolean() {
+		$this->assertSame(
+			'<input disabled>',
+			WP_HTML_Processor::normalize( '<input disabled>' ),
+			'Should have preserved the boolean attribute upon serialization.'
+		);
+	}
+
+	/**
+	 * Ensures that attributes with values result in double-quoted attribute values.
+	 *
+	 * @ticket 62036
+	 */
+	public function test_attributes_are_double_quoted() {
+		$this->assertSame(
+			'<p id="3"></p>',
+			WP_HTML_Processor::normalize( '<p id=3></p>' ),
+			'Should double-quote all attribute values.'
+		);
+	}
+
+	/**
+	 * Ensures that self-closing flags on HTML void elements are not serialized, to
+	 * prevent risk of conflating the flag with unquoted attribute values.
+	 *
+	 * Example:
+	 *
+	 *     BR element with "class" attribute having value "clear"
+	 *     <br class="clear"/>
+	 *
+	 *     BR element with "class" attribute having value "clear"
+	 *     <br class=clear />
+	 *
+	 *     BR element with "class" attribute having value "clear/"
+	 *     <br class=clear/>
+	 *
+	 * @ticket 62036
+	 */
+	public function test_void_elements_get_no_dangerous_self_closing_flag() {
+		$this->assertSame(
+			'<br class="clear">',
+			WP_HTML_Processor::normalize( '<br class="clear"/>' ),
+			'Should have removed dangerous self-closing flag on HTML void element.'
+		);
+	}
+
+	/**
+	 * Ensures that duplicate attributes are removed upon serialization.
+	 *
+	 * @ticket 62036
+	 */
+	public function test_duplicate_attributes_are_removed() {
+		$this->assertSame(
+			'<div one="1"></div>',
+			WP_HTML_Processor::normalize( '<div one=1 one="one" one=\'won\' one>' ),
+			'Should have removed all but the first copy of an attribute when duplicates exist.'
+		);
+	}
+
+	/**
+	 * Ensures that SCRIPT contents are not escaped, as they are not parsed like text nodes are.
+	 *
+	 * @ticket 62036
+	 */
+	public function test_script_contents_are_not_escaped() {
+		$this->assertSame(
+			"<script>apples > or\u{FFFD}anges</script>",
+			WP_HTML_Processor::normalize( "<script>apples > or\x00anges</script>" ),
+			'Should have preserved text inside a SCRIPT element, except for replacing NULL bytes.'
+		);
+	}
+
+	/**
+	 * Ensures that STYLE contents are not escaped, as they are not parsed like text nodes are.
+	 *
+	 * @ticket 62036
+	 */
+	public function test_style_contents_are_not_escaped() {
+		$this->assertSame(
+			"<style>apples > or\u{FFFD}anges</style>",
+			WP_HTML_Processor::normalize( "<style>apples > or\x00anges</style>" ),
+			'Should have preserved text inside a STYLE element, except for replacing NULL bytes.'
+		);
+	}
+
+	public function test_unexpected_closing_tags_are_removed() {
+		$this->assertSame(
+			'onetwothree',
+			WP_HTML_Processor::normalize( 'one</div>two</span>three' ),
+			'Should have removed unexpected closing tags.'
+		);
+	}
+
+	/**
+	 * Ensures that self-closing elements in foreign content retain their self-closing flag.
+	 *
+	 * @ticket 62036
+	 */
+	public function test_self_closing_foreign_elements_retain_their_self_closing_flag() {
+		$this->assertSame(
+			'<svg><g><g /></g></svg>',
+			WP_HTML_Processor::normalize( '<svg><g><g /></svg>' ),
+			'Should have closed unclosed G element, but preserved the self-closing nature of the other G element.'
+		);
+	}
+
+	/**
+	 * Ensures that incomplete syntax elements at the end of an HTML string are removed from
+	 * the serialization, since these are often vectors of exploits for the successive HTML.
+	 *
+	 * @ticket 62036
+	 *
+	 * @dataProvider data_incomplete_syntax_tokens
+	 *
+	 * @param string $incomplete_token An incomplete HTML syntax token.
+	 */
+	public function test_should_remove_incomplete_input_from_end( string $incomplete_token ) {
+		$this->assertSame(
+			'content',
+			WP_HTML_Processor::normalize( "content{$incomplete_token}" ),
+			'Should have removed the incomplete token from the end of the input.'
+		);
+	}
+
+	/**
+	 * Data provider.
+	 *
+	 * @return array[]
+	 */
+	public static function data_incomplete_syntax_tokens() {
+		return array(
+			'Comment opener'       => array( '<!--' ),
+			'Bogus comment opener' => array( '<![sneaky[' ),
+			'Incomplete tag'       => array( '<my-custom status="pending"' ),
+			'SCRIPT opening tag'   => array( '<script>' ),
+		);
+	}
+
+	/**
+	 * Ensures that presumptuous tag openers are treated as plaintext.
+	 *
+	 * @ticket 62036
+	 */
+	public function test_encodes_presumptuous_opening_tags() {
+		$this->assertSame(
+			'&lt;&gt;',
+			WP_HTML_Processor::normalize( '<>' ),
+			'Should have encoded the invalid presumptuous opening tag as plaintext.'
+		);
+	}
+
+	/**
+	 * Ensures that presumptuous tag closers are skipped in serialization.
+	 *
+	 * @ticket 62036
+	 */
+	public function test_skips_presumptuous_closing_tags() {
+		$this->assertSame(
+			'',
+			WP_HTML_Processor::normalize( '</>' ),
+			'Should have completely ignored the presumptuous tag closer.'
+		);
+	}
+
+	/**
+	 * Ensures that invalid or "bogus" comments in HTML are normalized to their proper normative form.
+	 *
+	 * @ticket 62036
+	 *
+	 * @dataProvider data_bogus_comments
+	 *
+	 * @param string $opening      Start of bogus comment, e.g. "<!".
+	 * @param string $comment_text Comment content, as reported in a browser.
+	 * @param string $closing      End of bogus comment, e.g. ">".
+	 */
+	public function test_normalizes_bogus_comment_forms( string $opening, string $comment_text, string $closing ) {
+		$this->assertSame(
+			"<!--{$comment_text}-->",
+			WP_HTML_Processor::normalize( "{$opening}{$comment_text}{$closing}" ),
+			'Should have replaced the invalid comment syntax with normative syntax.'
+		);
+	}
+
+	/**
+	 * Data provider.
+	 *
+	 * @return array[]
+	 */
+	public static function data_bogus_comments() {
+		return array(
+			'False DOCTYPE'                         => array( '<!', 'html', '>' ),
+			'CDATA look-alike'                      => array( '<!', '[CDATA[inside]]', '>' ),
+			'Immediately-closed markup instruction' => array( '<!', '?', '>' ),
+			'Warning Symbol'                        => array( '<!', '', '>' ),
+			'PHP block look-alike'                  => array( '<', '?php foo(); ?', '>' ),
+			'Funky comment'                         => array( '</', '%display-name', '>' ),
+			'XML Processing Instruction look-alike' => array( '<', '?xml foo ', '>' ),
+		);
+	}
+
+	/**
+	 * Ensures that NULL bytes are properly handled.
+	 *
+	 * @ticket 62036
+	 *
+	 * @dataProvider data_tokens_with_null_bytes
+	 *
+	 * @param string $html_with_nulls HTML token containing NULL bytes in various places.
+	 * @param string $expected_output Expected parse of HTML after handling NULL bytes.
+	 */
+	public function test_replaces_null_bytes_appropriately( string $html_with_nulls, string $expected_output ) {
+		$this->assertSame(
+			$expected_output,
+			WP_HTML_Processor::normalize( $html_with_nulls ),
+			'Should have properly replaced or removed NULL bytes.'
+		);
+	}
+
+	/**
+	 * Data provider.
+	 *
+	 * @return array[]
+	 */
+	public static function data_tokens_with_null_bytes() {
+		return array(
+			'Tag name'             => array( "<img\x00id=5>", "<img\u{FFFD}id=5></img\u{FFFD}id=5>" ),
+			'Attribute name'       => array( "<img/\x00id=5>", "<img \u{FFFD}id=\"5\">" ),
+			'Attribute value'      => array( "<img id='5\x00'>", "<img id=\"5\u{FFFD}\">" ),
+			'Body text'            => array( "one\x00two", 'onetwo' ),
+			'Foreign content text' => array( "<svg>one\x00two</svg>", "<svg>one\u{FFFD}two</svg>" ),
+			'SCRIPT content'       => array( "<script>alert(\x00)</script>", "<script>alert(\u{FFFD})</script>" ),
+			'STYLE content'        => array( "<style>\x00 {}</style>", "<style>\u{FFFD} {}</style>" ),
+			'Comment text'         => array( "<!-- \x00 -->", "<!-- \u{FFFD} -->" ),
+		);
+	}
+}