diff --git a/src/wp-includes/html-api/class-wp-html-processor.php b/src/wp-includes/html-api/class-wp-html-processor.php
index caa6de1d0905a..ba8be1e7d4004 100644
--- a/src/wp-includes/html-api/class-wp-html-processor.php
+++ b/src/wp-includes/html-api/class-wp-html-processor.php
@@ -1029,6 +1029,216 @@ public function get_current_depth(): int {
 		return count( $this->breadcrumbs );
 	}
 
+	/**
+	 * Normalizes an HTML fragment by serializing it.
+	 *
+	 * This method assumes that the given HTML snippet is found in BODY context.
+	 * For normalizing full documents or fragments found in other contexts, create
+	 * a new processor using {@see WP_HTML_Processor::create_fragment} or
+	 * {@see WP_HTML_Processor::create_full_parser} and call {@see WP_HTML_Processor::serialize}
+	 * on the created instances.
+	 *
+	 * Many aspects of an input HTML fragment may be changed during normalization.
+	 *
+	 *  - Attribute values will be double-quoted.
+	 *  - Duplicate attributes will be removed.
+	 *  - Omitted tags will be added.
+	 *  - Tag and attribute name casing will be lower-cased,
+	 *    except for specific SVG and MathML tags or attributes.
+	 *  - Text will be re-encoded, null bytes handled,
+	 *    and invalid UTF-8 replaced with U+FFFD.
+	 *  - Any incomplete syntax trailing at the end will be omitted,
+	 *    for example, an unclosed comment opener will be removed.
+	 *
+	 * Example:
+	 *
+	 *     echo WP_HTML_Processor::normalize( '<div class=clear>One' );
+	 *     // <div class="clear">One</div>
+	 *
+	 *     echo WP_HTML_Processor::normalize( 'syntax < <> "oddities"' );
+	 *     // syntax &lt; &lt;&gt; &quot;oddities&quot;
+	 *
+	 * @since 6.7.0
+	 *
+	 * @param string $html Input HTML to normalize.
+	 *
+	 * @return string|null Normalized output, or `null` if unable to normalize.
+	 */
+	public static function normalize( string $html ): ?string {
+		$processor = static::create_fragment( $html );
+
+		// Guard against a missing processor instead of calling a method on `null`.
+		if ( null === $processor ) {
+			return null;
+		}
+
+		return $processor->serialize();
+	}
+
+	/**
+	 * Returns normalized HTML for a fragment by serializing it.
+	 *
+	 * This differs from {@see WP_HTML_Processor::normalize} in that it starts with
+	 * a specific HTML Processor, which _must_ not have already started scanning;
+	 * it must be in the initial ready state and will be in the completed state once
+	 * serialization is complete.
+	 *
+	 * Many aspects of an input HTML fragment may be changed during normalization.
+	 *
+	 *  - Attribute values will be double-quoted.
+	 *  - Duplicate attributes will be removed.
+	 *  - Omitted tags will be added.
+	 *  - Tag and attribute name casing will be lower-cased,
+	 *    except for specific SVG and MathML tags or attributes.
+	 *  - Text will be re-encoded, null bytes handled,
+	 *    and invalid UTF-8 replaced with U+FFFD.
+	 *  - Any incomplete syntax trailing at the end will be omitted,
+	 *    for example, an unclosed comment opener will be removed.
+	 *
+	 * Example:
+	 *
+	 *     $processor = WP_HTML_Processor::create_fragment( '<div class=clear>One' );
+	 *     echo $processor->serialize();
+	 *     // <div class="clear">One</div>
+	 *
+	 *     $processor = WP_HTML_Processor::create_fragment( 'syntax < <> "oddities"' );
+	 *     echo $processor->serialize();
+	 *     // syntax &lt; &lt;&gt; &quot;oddities&quot;
+	 *
+	 * @since 6.7.0
+	 *
+	 * @return string|null Normalized HTML markup represented by processor,
+	 *                     or `null` if unable to generate serialization.
+	 */
+	public function serialize(): ?string {
+		// Serialization walks every token, so it has to begin before any scanning has happened.
+		if ( WP_HTML_Tag_Processor::STATE_READY !== $this->parser_state ) {
+			wp_trigger_error(
+				__METHOD__,
+				'An HTML Processor which has already started processing cannot serialize its contents. Serialize immediately after creating the instance.',
+				E_USER_WARNING
+			);
+			return null;
+		}
+
+		$html = '';
+		while ( $this->next_token() ) {
+			$html .= $this->serialize_token();
+		}
+
+		// A recorded parsing error means the collected output cannot be trusted; discard it.
+		$last_error = $this->get_last_error();
+		if ( null !== $last_error ) {
+			wp_trigger_error(
+				__METHOD__,
+				"Cannot serialize HTML Processor with parsing error: {$last_error}.",
+				E_USER_WARNING
+			);
+			return null;
+		}
+
+		return $html;
+	}
+
+	/**
+	 * Serializes the currently-matched token.
+	 *
+	 * This method produces a fully-normative HTML string for the currently-matched token,
+	 * if able. If not matched at any token or if the token doesn't correspond to any HTML
+	 * it will return an empty string (for example, presumptuous end tags are ignored).
+	 *
+	 * @see static::serialize()
+	 *
+	 * @since 6.7.0
+	 *
+	 * @return string Serialization of token, or empty string if no serialization exists.
+	 */
+	protected function serialize_token(): string {
+		$html       = '';
+		$token_type = $this->get_token_type();
+
+		/*
+		 * NOTE(review): the comment and DOCTYPE literals below were unreadable in the
+		 * reviewed copy of this patch and have been reconstructed from context; confirm
+		 * them against the upstream change before merging.
+		 */
+		switch ( $token_type ) {
+			case '#text':
+				$html .= htmlspecialchars( $this->get_modifiable_text(), ENT_QUOTES | ENT_SUBSTITUTE | ENT_HTML5, 'UTF-8' );
+				break;
+
+			// Unlike the `<>` which is interpreted as plaintext, this is ignored entirely.
+			case '#presumptuous-tag':
+				break;
+
+			case '#funky-comment':
+			case '#comment':
+				$html .= "<!--{$this->get_modifiable_text()}-->";
+				break;
+
+			case '#cdata-section':
+				$html .= "<![CDATA[{$this->get_modifiable_text()}]]>";
+				break;
+
+			case 'html':
+				$html .= '<!DOCTYPE html>';
+				break;
+		}
+
+		if ( '#tag' !== $token_type ) {
+			return $html;
+		}
+
+		$tag_name       = str_replace( "\x00", "\u{FFFD}", $this->get_tag() );
+		$in_html        = 'html' === $this->get_namespace();
+		$qualified_name = $in_html ? strtolower( $tag_name ) : $this->get_qualified_tag_name();
+
+		if ( $this->is_tag_closer() ) {
+			$html .= "</{$qualified_name}>";
+			return $html;
+		}
+
+		$attribute_names = $this->get_attribute_names_with_prefix( '' );
+		if ( null === $attribute_names ) {
+			$html .= "<{$qualified_name}>";
+			return $html;
+		}
+
+		$html .= "<{$qualified_name}";
+		foreach ( $attribute_names as $attribute_name ) {
+			$html .= " {$this->get_qualified_attribute_name( $attribute_name )}";
+			$value = $this->get_attribute( $attribute_name );
+
+			// Only string values are written out; any other value leaves the bare attribute name.
+			if ( is_string( $value ) ) {
+				$html .= '="' . htmlspecialchars( $value, ENT_QUOTES | ENT_SUBSTITUTE | ENT_HTML5, 'UTF-8' ) . '"';
+			}
+		}
+
+		// Replace NULL bytes once for the whole opening tag rather than re-scanning it per attribute.
+		$html = str_replace( "\x00", "\u{FFFD}", $html );
+
+		if ( ! $in_html && $this->has_self_closing_flag() ) {
+			$html .= ' /';
+		}
+
+		$html .= '>';
+
+		// Flush out self-contained elements.
+		if ( $in_html && in_array( $tag_name, array( 'IFRAME', 'NOEMBED', 'NOFRAMES', 'SCRIPT', 'STYLE', 'TEXTAREA', 'TITLE', 'XMP' ), true ) ) {
+			$text = $this->get_modifiable_text();
+
+			switch ( $tag_name ) {
+				// The contents of these elements are dropped from the output.
+				case 'IFRAME':
+				case 'NOEMBED':
+				case 'NOFRAMES':
+					$text = '';
+					break;
+
+				// SCRIPT and STYLE contents are emitted as-is, without escaping.
+				case 'SCRIPT':
+				case 'STYLE':
+					break;
+
+				// TEXTAREA, TITLE, and XMP contents are escaped like text.
+				default:
+					$text = htmlspecialchars( $text, ENT_QUOTES | ENT_SUBSTITUTE | ENT_HTML5, 'UTF-8' );
+			}
+
+			$html .= "{$text}</{$qualified_name}>";
+		}
+
+		return $html;
+	}
+
 	/**
 	 * Parses next element in the 'initial' insertion mode.
 	 *
diff --git a/src/wp-includes/html-api/class-wp-html-tag-processor.php b/src/wp-includes/html-api/class-wp-html-tag-processor.php
index 2b115dd156014..5e29fc7d1590c 100644
--- a/src/wp-includes/html-api/class-wp-html-tag-processor.php
+++ b/src/wp-includes/html-api/class-wp-html-tag-processor.php
@@ -1984,6 +1984,9 @@ private function parse_next_tag(): bool {
 			 *                       [#x10000-#xEFFFF]
 			 * > NameChar      ::= NameStartChar | "-" | "." | [0-9] | #xB7 | [#x0300-#x036F] | [#x203F-#x2040]
 			 *
+			 * @todo Processing instruction nodes in SGML may contain any kind of markup. XML defines a
+			 *       special case with `<?xml ... ?>` syntax, but the `?` is part of the bogus comment.
+			 *
 			 * @see https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-PITarget
 			 */
 			if ( $this->token_length >= 5 && '?' === $html[ $closer_at - 1 ] ) {
diff --git a/tests/phpunit/tests/html-api/wpHtmlProcessor-serialize.php b/tests/phpunit/tests/html-api/wpHtmlProcessor-serialize.php
new file mode 100644
index 0000000000000..e05ca28473e04
--- /dev/null
+++ b/tests/phpunit/tests/html-api/wpHtmlProcessor-serialize.php
@@ -0,0 +1,287 @@
+assertSame(
+			WP_HTML_Processor::normalize( "apples > or\x00anges" ),
+			'apples > oranges',
+			'Should have returned an HTML string with applicable characters properly encoded.'
+		);
+	}
+
+	/**
+	 * Ensures that unclosed elements are explicitly closed to ensure proper HTML isolation.
+ * + * When thinking about embedding HTML fragments into others, it's important that unclosed + * elements aren't left dangling, otherwise a snippet of HTML may "swallow" parts of the + * document that follow it. + * + * @ticket 62036 + */ + public function test_closes_unclosed_elements_at_end() { + $this->assertSame( + WP_HTML_Processor::normalize( '
' ), + '
', + 'Should have provided the explicit closer to the un-closed DIV element.' + ); + } + + /** + * Ensures that boolean attributes remain boolean and do not gain values. + * + * @ticket 62036 + */ + public function test_boolean_attributes_remain_boolean() { + $this->assertSame( + WP_HTML_Processor::normalize( '' ), + '', + 'Should have preserved the boolean attribute upon serialization.' + ); + } + + /** + * Ensures that attributes with values result in double-quoted attribute values. + * + * @ticket 62036 + */ + public function test_attributes_are_double_quoted() { + $this->assertSame( + WP_HTML_Processor::normalize( '

' ), + '

', + 'Should double-quote all attribute values.' + ); + } + + /** + * Ensures that self-closing flags on HTML void elements are not serialized, to + * prevent risk of conflating the flag with unquoted attribute values. + * + * Example: + * + * BR element with "class" attribute having value "clear" + *
+ * + * BR element with "class" attribute having value "clear" + *
+ * + * BR element with "class" attribute having value "clear/" + *
+ * + * @ticket 62036 + */ + public function test_void_elements_get_no_dangerous_self_closing_flag() { + $this->assertSame( + WP_HTML_Processor::normalize( '
' ), + '
', + 'Should have removed dangerous self-closing flag on HTML void element.' + ); + } + + /** + * Ensures that duplicate attributes are removed upon serialization. + * + * @ticket 62036 + */ + public function test_duplicate_attributes_are_removed() { + $this->assertSame( + WP_HTML_Processor::normalize( '
' ), + '
',
+			'Should have removed all but the first copy of an attribute when duplicates exist.'
+		);
+	}
+
+	/**
+	 * Ensures that SCRIPT contents are not escaped, as they are not parsed like text nodes are.
+	 *
+	 * @ticket 62036
+	 */
+	public function test_script_contents_are_not_escaped() {
+		$this->assertSame(
+			WP_HTML_Processor::normalize( "" ),
+			"",
+			'Should have preserved text inside a SCRIPT element, except for replacing NULL bytes.'
+		);
+	}
+
+	/**
+	 * Ensures that STYLE contents are not escaped, as they are not parsed like text nodes are.
+	 *
+	 * @ticket 62036
+	 */
+	public function test_style_contents_are_not_escaped() {
+		$this->assertSame(
+			WP_HTML_Processor::normalize( "" ),
+			"",
+			'Should have preserved text inside a STYLE element, except for replacing NULL bytes.'
+		);
+	}
+
+	/**
+	 * Ensures that unexpected closing tags are removed from the serialization.
+	 *
+	 * @ticket 62036
+	 */
+	public function test_unexpected_closing_tags_are_removed() {
+		$this->assertSame(
+			WP_HTML_Processor::normalize( 'one
twothree' ),
+			'onetwothree',
+			'Should have removed unexpected closing tags.'
+		);
+	}
+
+	/**
+	 * Ensures that self-closing elements in foreign content retain their self-closing flag.
+	 *
+	 * @ticket 62036
+	 */
+	public function test_self_closing_foreign_elements_retain_their_self_closing_flag() {
+		$this->assertSame(
+			WP_HTML_Processor::normalize( '' ),
+			'',
+			'Should have closed unclosed G element, but preserved the self-closing nature of the other G element.'
+		);
+	}
+
+	/**
+	 * Ensures that incomplete syntax elements at the end of an HTML string are removed from
+	 * the serialization, since these are often vectors of exploits for the successive HTML.
+	 *
+	 * @ticket 62036
+	 *
+	 * @dataProvider data_incomplete_syntax_tokens
+	 *
+	 * @param string $incomplete_token An incomplete HTML syntax token.
+	 */
+	public function test_should_remove_incomplete_input_from_end( string $incomplete_token ) {
+		$this->assertSame(
+			WP_HTML_Processor::normalize( "content{$incomplete_token}" ),
+			'content',
+			'Should have removed the incomplete token from the end of the input.'
+		);
+	}
+
+	/**
+	 * Data provider.
+	 *
+	 * @return array[]
+	 */
+	public static function data_incomplete_syntax_tokens() {
+		return array(
+			'Comment opener' => array( '",
+			'Should have replaced the invalid comment syntax with normative syntax.'
+		);
+	}
+
+	/**
+	 * Data provider.
+	 *
+	 * Declared static to match the other data providers in this class.
+	 *
+	 * @return array[]
+	 */
+	public static function data_bogus_comments() {
+		return array(
+			'False DOCTYPE' => array( '' ),
+			'CDATA look-alike' => array( '' ),
+			'Immediately-closed markup instruction' => array( '' ),
+			'Warning Symbol' => array( '' ),
+			'PHP block look-alike' => array( '<', '?php foo(); ?', '>' ),
+			'Funky comment' => array( '' ),
+			'XML Processing Instruction look-alike' => array( '<', '?xml foo ', '>' ),
+		);
+	}
+
+	/**
+	 * Ensures that NULL bytes are properly handled.
+ * + * @ticket 62036 + * + * @dataProvider data_tokens_with_null_bytes + * + * @param string $html_with_nulls HTML token containing NULL bytes in various places. + * @param string $expected_output Expected parse of HTML after handling NULL bytes. + */ + public function test_replaces_null_bytes_appropriately( string $html_with_nulls, string $expected_output ) { + $this->assertSame( + WP_HTML_Processor::normalize( $html_with_nulls ), + $expected_output, + 'Should have properly replaced or removed NULL bytes.' + ); + } + + /** + * Data provider. + * + * @return array[] + */ + public static function data_tokens_with_null_bytes() { + return array( + 'Tag name' => array( "", "" ), + 'Attribute name' => array( "", "" ), + 'Attribute value' => array( "", "" ), + 'Body text' => array( "one\x00two", 'onetwo' ), + 'Foreign content text' => array( "one\x00two", "one\u{FFFD}two" ), + 'SCRIPT content' => array( "", "" ), + 'STYLE content' => array( "", "" ), + 'Comment text' => array( "", "" ), + ); + } +}