From d372c97e107ea473d932f7ea20ab761cacc5219a Mon Sep 17 00:00:00 2001 From: Dennis Snell Date: Wed, 11 Sep 2024 09:07:37 -0700 Subject: [PATCH 01/21] HTML API: Add normalization functions. The HTML Processor understands HTML regardless of how it's written, but many other functions are unable to do so. There are all sorts of syntax peculiarities and semantics that would be helpful to eliminate using the knowledge contained in the HTML Processor. This patch introduces `WP_HTML_Processor::normalize( $html )` as a method which takes a fragment of HTML as input and then returns a serialized version of the input, "cleaning it up" by balancing all tags, providing all missing optional tags, re-encoding all text, removing all duplicate attributes, and double-quote-escaping all attribute values. Core-62036 --- .../html-api/class-wp-html-processor.php | 182 ++++++++++++++++++ 1 file changed, 182 insertions(+) diff --git a/src/wp-includes/html-api/class-wp-html-processor.php b/src/wp-includes/html-api/class-wp-html-processor.php index 0ca7a52a2a329..8e33b9d332289 100644 --- a/src/wp-includes/html-api/class-wp-html-processor.php +++ b/src/wp-includes/html-api/class-wp-html-processor.php @@ -1029,6 +1029,188 @@ public function get_current_depth(): int { return count( $this->breadcrumbs ); } + /** + * Normalize an HTML fragment by serializing it. + * + * This method assumes that the given HTML snippet is found in BODY context. + * For normalizing full documents or fragments found in other contexts, create + * a new processor using {@see WP_HTML_Processor::create_fragment} or + * {@see WP_HTML_Processor::create_full_parser} and call {@see WP_HTML_Processor::serialize} + * on the created instances. + * + * Many aspects of an input HTML fragment may be changed during normalization. + * + * - Attribute values will be double-quoted. + * - Duplicate attributes will be removed. + * - Omitted tags will be added. 
+ * - Tag and attribute name casing will be lower-cased, + * except for specific SVG and MathML tags or attributes. + * - Text will be re-encoded, null bytes handled, + * and invalid UTF-8 replaced with U+FFFD. + * - Any incomplete syntax trailing at the end will be omitted, + * for example, an unclosed comment opener will be removed. + * + * Example: + * + * echo WP_HTML_Processor::normalize( 'One syntax < <> "oddities" + * + * @since 6.7.0 + * + * @param string $html Input HTML to normalize. + * + * @return string|null Normalized output, or `null` if unable to normalize. + */ + public static function normalize( string $html ): ?string { + return static::create_fragment( $html )->serialize(); + } + + /** + * Return normalized HTML for a fragment by serializing it. + * + * This differs from {@see WP_HTML_Processor::normalize} in that it starts with + * a specific HTML Processor, which _must_ not have already started scanning; + * it must be in the initial ready state and will be in the completed state once + * serialization is complete. + * + * Many aspects of an input HTML fragment may be changed during normalization. + * + * - Attribute values will be double-quoted. + * - Duplicate attributes will be removed. + * - Omitted tags will be added. + * - Tag and attribute name casing will be lower-cased, + * except for specific SVG and MathML tags or attributes. + * - Text will be re-encoded, null bytes handled, + * and invalid UTF-8 replaced with U+FFFD. + * - Any incomplete syntax trailing at the end will be omitted, + * for example, an unclosed comment opener will be removed. + * + * Example: + * + * $processor = WP_HTML_Processor::create_fragment( 'One syntax < <> "oddities" + * + * @since 6.7.0 + * + * @return string|null Normalized HTML markup represented by processor, + * or `null` if unable to generate serialization. 
+ */ + public function serialize(): ?string { + if ( WP_HTML_Tag_Processor::STATE_READY !== $this->parser_state ) { + return null; + } + + $html = ''; + while ( $this->next_token() ) { + $token_type = $this->get_token_type(); + + switch ( $token_type ) { + case '#text': + $html .= htmlspecialchars( $this->get_modifiable_text(), ENT_QUOTES | ENT_SUBSTITUTE | ENT_HTML5, 'UTF-8' ); + break; + + case '#funky-comment': + case '#comment': + $html .= ""; + break; + + case '#cdata-section': + $html .= "get_modifiable_text()}]]>"; + break; + + case 'html': + $html .= ''; + break; + } + + if ( '#tag' !== $token_type ) { + continue; + } + + if ( $this->is_tag_closer() ) { + $html .= "get_qualified_tag_name()}>"; + continue; + } + + $tag_name = $this->get_tag(); + $qualitified_name = $this->get_qualified_tag_name(); + $in_html = 'html' === $this->get_namespace(); + + $attribute_names = $this->get_attribute_names_with_prefix( '' ); + if ( ! isset( $attribute_names ) ) { + $html .= "<{$qualitified_name}>"; + continue; + } + + $html .= "<{$qualitified_name}"; + foreach ( $attribute_names as $attribute_name ) { + $html .= " {$this->get_qualified_attribute_name( $attribute_name )}"; + $value = $this->get_attribute( $attribute_name ); + + if ( is_string( $value ) ) { + $html .= '="' . htmlspecialchars( $value, ENT_QUOTES | ENT_SUBSTITUTE | ENT_HTML5 ) . '"'; + } + } + + if ( ! $in_html && $this->has_self_closing_flag() ) { + $html .= '/'; + } + + $html .= '>'; + + // Flush out self-contained elements. 
+ if ( $in_html && in_array( $tag_name, array( 'IFRAME', 'NOEMBED', 'NOFRAMES', 'SCRIPT', 'STYLE', 'TEXTAREA', 'TITLE', 'XMP' ), true ) ) { + $text = $this->get_modifiable_text(); + + switch ( $tag_name ) { + case 'STYLE': + $text = preg_replace_callback( + '~style)~i', + static function ( $tag_match ) { + return "\\3c\\2f{$tag_match['TAG_NAME']}"; + }, + $text + ); + break; + + case 'TEXTAREA': + case 'TITLE': + $text = preg_replace_callback( + "~{$tag_name})~i", + static function ( $tag_match ) { + return "</{$tag_match['TAG_NAME']}"; + }, + $text + ); + break; + } + + $html .= "{$text}"; + } + } + + if ( null !== $this->get_last_error() ) { + return null; + } + + return $html; + } + /** * Parses next element in the 'initial' insertion mode. * From 07e89bbb0d87d816b73637e0ceb4334e95371797 Mon Sep 17 00:00:00 2001 From: Dennis Snell Date: Wed, 11 Sep 2024 09:37:17 -0700 Subject: [PATCH 02/21] Lower-case names when calling for qualified tag name. --- src/wp-includes/html-api/class-wp-html-tag-processor.php | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/wp-includes/html-api/class-wp-html-tag-processor.php b/src/wp-includes/html-api/class-wp-html-tag-processor.php index 233d47eb8da95..355a7bb001923 100644 --- a/src/wp-includes/html-api/class-wp-html-tag-processor.php +++ b/src/wp-includes/html-api/class-wp-html-tag-processor.php @@ -2840,7 +2840,7 @@ public function get_qualified_tag_name(): ?string { } if ( 'html' === $this->get_namespace() ) { - return $tag_name; + return strtolower( $tag_name ); } $lower_tag_name = strtolower( $tag_name ); From 754118ef6487d76d39d557469d4857beb8bf91e0 Mon Sep 17 00:00:00 2001 From: Dennis Snell Date: Wed, 11 Sep 2024 09:47:34 -0700 Subject: [PATCH 03/21] Adjust some text encoding. 
--- .../html-api/class-wp-html-processor.php | 20 +++---------------- 1 file changed, 3 insertions(+), 17 deletions(-) diff --git a/src/wp-includes/html-api/class-wp-html-processor.php b/src/wp-includes/html-api/class-wp-html-processor.php index 8e33b9d332289..a06f697d55ca9 100644 --- a/src/wp-includes/html-api/class-wp-html-processor.php +++ b/src/wp-includes/html-api/class-wp-html-processor.php @@ -1178,26 +1178,12 @@ public function serialize(): ?string { $text = $this->get_modifiable_text(); switch ( $tag_name ) { + case 'SCRIPT': case 'STYLE': - $text = preg_replace_callback( - '~style)~i', - static function ( $tag_match ) { - return "\\3c\\2f{$tag_match['TAG_NAME']}"; - }, - $text - ); break; - case 'TEXTAREA': - case 'TITLE': - $text = preg_replace_callback( - "~{$tag_name})~i", - static function ( $tag_match ) { - return "</{$tag_match['TAG_NAME']}"; - }, - $text - ); - break; + default: + $text = htmlspecialchars( $text, ENT_QUOTES | ENT_SUBSTITUTE | ENT_HTML5, 'UTF-8' ); } $html .= "{$text}"; From 9c6e34cc0831b13add6b5596fb0d49cdd5d794c8 Mon Sep 17 00:00:00 2001 From: Dennis Snell Date: Wed, 11 Sep 2024 09:50:21 -0700 Subject: [PATCH 04/21] Remove inner text from elements which cannot contain it. 
--- src/wp-includes/html-api/class-wp-html-processor.php | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/src/wp-includes/html-api/class-wp-html-processor.php b/src/wp-includes/html-api/class-wp-html-processor.php index a06f697d55ca9..1483d7f3bf9cd 100644 --- a/src/wp-includes/html-api/class-wp-html-processor.php +++ b/src/wp-includes/html-api/class-wp-html-processor.php @@ -1178,6 +1178,12 @@ public function serialize(): ?string { $text = $this->get_modifiable_text(); switch ( $tag_name ) { + case 'IFRAME': + case 'NOEMBED': + case 'NOFRAMES': + $text = ''; + break; + case 'SCRIPT': case 'STYLE': break; From 7f5783eb6b98a8f20801d1feae3949e261c3526b Mon Sep 17 00:00:00 2001 From: Dennis Snell Date: Wed, 11 Sep 2024 11:11:36 -0700 Subject: [PATCH 05/21] Lower-case inside of the serialization function. --- src/wp-includes/html-api/class-wp-html-processor.php | 12 ++++++------ .../html-api/class-wp-html-tag-processor.php | 2 +- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/src/wp-includes/html-api/class-wp-html-processor.php b/src/wp-includes/html-api/class-wp-html-processor.php index 1483d7f3bf9cd..929123188aee9 100644 --- a/src/wp-includes/html-api/class-wp-html-processor.php +++ b/src/wp-includes/html-api/class-wp-html-processor.php @@ -1147,17 +1147,17 @@ public function serialize(): ?string { continue; } - $tag_name = $this->get_tag(); - $qualitified_name = $this->get_qualified_tag_name(); - $in_html = 'html' === $this->get_namespace(); + $tag_name = $this->get_tag(); + $in_html = 'html' === $this->get_namespace(); + $qualified_name = $in_html ? strtolower( $tag_name ) : $this->get_qualified_tag_name(); $attribute_names = $this->get_attribute_names_with_prefix( '' ); if ( ! 
isset( $attribute_names ) ) { - $html .= "<{$qualitified_name}>"; + $html .= "<{$qualified_name}>"; continue; } - $html .= "<{$qualitified_name}"; + $html .= "<{$qualified_name}"; foreach ( $attribute_names as $attribute_name ) { $html .= " {$this->get_qualified_attribute_name( $attribute_name )}"; $value = $this->get_attribute( $attribute_name ); @@ -1192,7 +1192,7 @@ public function serialize(): ?string { $text = htmlspecialchars( $text, ENT_QUOTES | ENT_SUBSTITUTE | ENT_HTML5, 'UTF-8' ); } - $html .= "{$text}"; + $html .= "{$text}"; } } diff --git a/src/wp-includes/html-api/class-wp-html-tag-processor.php b/src/wp-includes/html-api/class-wp-html-tag-processor.php index 355a7bb001923..233d47eb8da95 100644 --- a/src/wp-includes/html-api/class-wp-html-tag-processor.php +++ b/src/wp-includes/html-api/class-wp-html-tag-processor.php @@ -2840,7 +2840,7 @@ public function get_qualified_tag_name(): ?string { } if ( 'html' === $this->get_namespace() ) { - return strtolower( $tag_name ); + return $tag_name; } $lower_tag_name = strtolower( $tag_name ); From 2aa89fa8baa563ce4fb17afddc8da40dfc46bf6b Mon Sep 17 00:00:00 2001 From: Dennis Snell Date: Wed, 11 Sep 2024 11:13:17 -0700 Subject: [PATCH 06/21] Progressive tense for function summaries. Co-authored-by: Weston Ruter --- src/wp-includes/html-api/class-wp-html-processor.php | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/wp-includes/html-api/class-wp-html-processor.php b/src/wp-includes/html-api/class-wp-html-processor.php index 929123188aee9..144834b4a0a32 100644 --- a/src/wp-includes/html-api/class-wp-html-processor.php +++ b/src/wp-includes/html-api/class-wp-html-processor.php @@ -1030,7 +1030,7 @@ public function get_current_depth(): int { } /** - * Normalize an HTML fragment by serializing it. + * Normalizes an HTML fragment by serializing it. * * This method assumes that the given HTML snippet is found in BODY context. 
* For normalizing full documents or fragments found in other contexts, create @@ -1072,7 +1072,7 @@ public static function normalize( string $html ): ?string { } /** - * Return normalized HTML for a fragment by serializing it. + * Returns normalized HTML for a fragment by serializing it. * * This differs from {@see WP_HTML_Processor::normalize} in that it starts with * a specific HTML Processor, which _must_ not have already started scanning; From 7029b2575d44b1edd08c27898437e61d6d22bc98 Mon Sep 17 00:00:00 2001 From: Dennis Snell Date: Wed, 11 Sep 2024 11:16:03 -0700 Subject: [PATCH 07/21] Leave space before self-closing flag to avoid later problems. If code later in the processing pipeline adds unquoted attributes and doesn't add the requisite space following that, then another parser might find that the solidus is part of the attribute value instead of serving as a self-closing flag. Co-authored-by: Weston Ruter --- src/wp-includes/html-api/class-wp-html-processor.php | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/wp-includes/html-api/class-wp-html-processor.php b/src/wp-includes/html-api/class-wp-html-processor.php index 144834b4a0a32..061e9f2bcb37d 100644 --- a/src/wp-includes/html-api/class-wp-html-processor.php +++ b/src/wp-includes/html-api/class-wp-html-processor.php @@ -1168,7 +1168,7 @@ public function serialize(): ?string { } if ( ! $in_html && $this->has_self_closing_flag() ) { - $html .= '/'; + $html .= ' /'; } $html .= '>'; From 00a577355bb5f85113fc9b5ab0a589713411e40a Mon Sep 17 00:00:00 2001 From: Dennis Snell Date: Wed, 11 Sep 2024 11:28:44 -0700 Subject: [PATCH 08/21] Expand support for bogus comments. 
Co-authored-by: Weston Ruter --- .../html-api/class-wp-html-processor.php | 31 +++++++++++++++---- .../html-api/class-wp-html-tag-processor.php | 3 ++ 2 files changed, 28 insertions(+), 6 deletions(-) diff --git a/src/wp-includes/html-api/class-wp-html-processor.php b/src/wp-includes/html-api/class-wp-html-processor.php index 061e9f2bcb37d..f021882bd3704 100644 --- a/src/wp-includes/html-api/class-wp-html-processor.php +++ b/src/wp-includes/html-api/class-wp-html-processor.php @@ -1124,11 +1124,30 @@ public function serialize(): ?string { $html .= htmlspecialchars( $this->get_modifiable_text(), ENT_QUOTES | ENT_SUBSTITUTE | ENT_HTML5, 'UTF-8' ); break; + // Unlike the `<>` which is interpreted as plaintext, this is ignored entirely. + case '#presumptuous-tag': + break; + case '#funky-comment': - case '#comment': $html .= ""; break; + case '#comment': + switch ( $this->get_comment_type() ) { + case WP_HTML_Tag_Processor::COMMENT_AS_CDATA_LOOKALIKE: + $html .= ""; + break; + + case WP_HTML_Tag_Processor::COMMENT_AS_PI_NODE_LOOKALIKE: + $html .= ""; + break; + + default: + $html .= ""; + + } + break; + case '#cdata-section': $html .= "get_modifiable_text()}]]>"; break; @@ -1142,15 +1161,15 @@ public function serialize(): ?string { continue; } - if ( $this->is_tag_closer() ) { - $html .= "get_qualified_tag_name()}>"; - continue; - } - $tag_name = $this->get_tag(); $in_html = 'html' === $this->get_namespace(); $qualified_name = $in_html ? strtolower( $tag_name ) : $this->get_qualified_tag_name(); + if ( $this->is_tag_closer() ) { + $html .= ""; + continue; + } + $attribute_names = $this->get_attribute_names_with_prefix( '' ); if ( ! 
isset( $attribute_names ) ) { $html .= "<{$qualified_name}>"; diff --git a/src/wp-includes/html-api/class-wp-html-tag-processor.php b/src/wp-includes/html-api/class-wp-html-tag-processor.php index 233d47eb8da95..42c5a7851b9bc 100644 --- a/src/wp-includes/html-api/class-wp-html-tag-processor.php +++ b/src/wp-includes/html-api/class-wp-html-tag-processor.php @@ -1984,6 +1984,9 @@ private function parse_next_tag(): bool { * [#x10000-#xEFFFF] * > NameChar ::= NameStartChar | "-" | "." | [0-9] | #xB7 | [#x0300-#x036F] | [#x203F-#x2040] * + * @todo Processing instruction nodes in SGML may contain any kind of markup. XML defines a + * special case with `<?xml ... ?>` syntax, but the `?` is part of the bogus comment. + * * @see https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-PITarget */ if ( $this->token_length >= 5 && '?' === $html[ $closer_at - 1 ] ) { From 99799b7cf0da8372cced0a299d84dfb28838796d Mon Sep 17 00:00:00 2001 From: Dennis Snell Date: Wed, 11 Sep 2024 11:30:17 -0700 Subject: [PATCH 09/21] Whitespace --- src/wp-includes/html-api/class-wp-html-processor.php | 1 - 1 file changed, 1 deletion(-) diff --git a/src/wp-includes/html-api/class-wp-html-processor.php b/src/wp-includes/html-api/class-wp-html-processor.php index f021882bd3704..9194d548b9009 100644 --- a/src/wp-includes/html-api/class-wp-html-processor.php +++ b/src/wp-includes/html-api/class-wp-html-processor.php @@ -1144,7 +1144,6 @@ public function serialize(): ?string { default: $html .= ""; - } break; From db24c11f0745fbe0d04bacee9d9b74057f233178 Mon Sep 17 00:00:00 2001 From: Dennis Snell Date: Wed, 11 Sep 2024 11:34:19 -0700 Subject: [PATCH 10/21] Update docs examples. 
--- src/wp-includes/html-api/class-wp-html-processor.php | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/wp-includes/html-api/class-wp-html-processor.php b/src/wp-includes/html-api/class-wp-html-processor.php index 9194d548b9009..de6f3ef03857c 100644 --- a/src/wp-includes/html-api/class-wp-html-processor.php +++ b/src/wp-includes/html-api/class-wp-html-processor.php @@ -1059,7 +1059,7 @@ public function get_current_depth(): int { * //

fun
cell
* * echo WP_HTML_Processor::normalize( ' syntax < <> "oddities"' ); - * // syntax < <> "oddities" + * // syntax < <> "oddities" * * @since 6.7.0 * @@ -1103,7 +1103,7 @@ public static function normalize( string $html ): ?string { * * $processor = WP_HTML_Processor::create_fragment( ' syntax < <> "oddities"' ); * echo $processor->serialize(); - * // syntax < <> "oddities" + * // syntax < <> "oddities" * * @since 6.7.0 * From 7b8aa53eb01ca92ae9acb47b9fc24e689ce42bce Mon Sep 17 00:00:00 2001 From: Dennis Snell Date: Wed, 11 Sep 2024 11:42:31 -0700 Subject: [PATCH 11/21] Raise WP error when failing. Co-authored-by: Weston Ruter --- src/wp-includes/html-api/class-wp-html-processor.php | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/src/wp-includes/html-api/class-wp-html-processor.php b/src/wp-includes/html-api/class-wp-html-processor.php index de6f3ef03857c..acef03f6a320a 100644 --- a/src/wp-includes/html-api/class-wp-html-processor.php +++ b/src/wp-includes/html-api/class-wp-html-processor.php @@ -1112,6 +1112,11 @@ public static function normalize( string $html ): ?string { */ public function serialize(): ?string { if ( WP_HTML_Tag_Processor::STATE_READY !== $this->parser_state ) { + wp_trigger_error( + __METHOD__, + "An HTML Processor which has already started processing cannot serialize it's contents. Serialize immediately after creating the instance.", + E_USER_ERROR + ); return null; } @@ -1215,6 +1220,11 @@ public function serialize(): ?string { } if ( null !== $this->get_last_error() ) { + wp_trigger_error( + __METHOD__, + "Cannot serialize HTML Processor with parsing error: {$this->get_last_error()}.", + E_USER_ERROR + ); return null; } From 18b5005d4aa2e2ec3618b200ca9039e2415711e3 Mon Sep 17 00:00:00 2001 From: Dennis Snell Date: Wed, 11 Sep 2024 15:26:13 -0700 Subject: [PATCH 12/21] Change error level to warning. 
Co-authored-by: Weston Ruter --- src/wp-includes/html-api/class-wp-html-processor.php | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/wp-includes/html-api/class-wp-html-processor.php b/src/wp-includes/html-api/class-wp-html-processor.php index acef03f6a320a..e8c53cd1ce629 100644 --- a/src/wp-includes/html-api/class-wp-html-processor.php +++ b/src/wp-includes/html-api/class-wp-html-processor.php @@ -1115,7 +1115,7 @@ public function serialize(): ?string { wp_trigger_error( __METHOD__, "An HTML Processor which has already started processing cannot serialize it's contents. Serialize immediately after creating the instance.", - E_USER_ERROR + E_USER_WARNING ); return null; } @@ -1223,7 +1223,7 @@ public function serialize(): ?string { wp_trigger_error( __METHOD__, "Cannot serialize HTML Processor with parsing error: {$this->get_last_error()}.", - E_USER_ERROR + E_USER_WARNING ); return null; } From 47f7f084d23b792bf52e9bdfd18b77a1eba31532 Mon Sep 17 00:00:00 2001 From: Dennis Snell Date: Wed, 18 Sep 2024 14:47:25 -0700 Subject: [PATCH 13/21] WIP: Move actual token serialization into a single-unit method. 
--- .../html-api/class-wp-html-processor.php | 172 ++++++++++-------- 1 file changed, 92 insertions(+), 80 deletions(-) diff --git a/src/wp-includes/html-api/class-wp-html-processor.php b/src/wp-includes/html-api/class-wp-html-processor.php index e8c53cd1ce629..e922aa75af27d 100644 --- a/src/wp-includes/html-api/class-wp-html-processor.php +++ b/src/wp-includes/html-api/class-wp-html-processor.php @@ -1122,110 +1122,122 @@ public function serialize(): ?string { $html = ''; while ( $this->next_token() ) { - $token_type = $this->get_token_type(); + $html .= $this->serialize_token(); + } - switch ( $token_type ) { - case '#text': - $html .= htmlspecialchars( $this->get_modifiable_text(), ENT_QUOTES | ENT_SUBSTITUTE | ENT_HTML5, 'UTF-8' ); - break; + if ( null !== $this->get_last_error() ) { + wp_trigger_error( + __METHOD__, + "Cannot serialize HTML Processor with parsing error: {$this->get_last_error()}.", + E_USER_WARNING + ); + return null; + } - // Unlike the `<>` which is interpreted as plaintext, this is ignored entirely. - case '#presumptuous-tag': - break; + return $html; + } - case '#funky-comment': - $html .= ""; - break; + /** + * Serializes a token. + * + * @return string Serialization of token, or empty string if no serialization exists. + */ + protected function serialize_token(): string { + $html = ''; + $token_type = $this->get_token_type(); - case '#comment': - switch ( $this->get_comment_type() ) { - case WP_HTML_Tag_Processor::COMMENT_AS_CDATA_LOOKALIKE: - $html .= ""; - break; + switch ( $token_type ) { + case '#text': + $html .= htmlspecialchars( $this->get_modifiable_text(), ENT_QUOTES | ENT_SUBSTITUTE | ENT_HTML5, 'UTF-8' ); + break; - case WP_HTML_Tag_Processor::COMMENT_AS_PI_NODE_LOOKALIKE: - $html .= ""; - break; + // Unlike the `<>` which is interpreted as plaintext, this is ignored entirely. 
+ case '#presumptuous-tag': + break; - default: - $html .= ""; - } - break; + case '#funky-comment': + $html .= ""; + break; - case '#cdata-section': - $html .= "get_modifiable_text()}]]>"; - break; + case '#comment': + switch ( $this->get_comment_type() ) { + case WP_HTML_Tag_Processor::COMMENT_AS_CDATA_LOOKALIKE: + $html .= ""; + break; - case 'html': - $html .= ''; - break; - } + case WP_HTML_Tag_Processor::COMMENT_AS_PI_NODE_LOOKALIKE: + $html .= ""; + break; - if ( '#tag' !== $token_type ) { - continue; - } + default: + $html .= ""; + } + break; - $tag_name = $this->get_tag(); - $in_html = 'html' === $this->get_namespace(); - $qualified_name = $in_html ? strtolower( $tag_name ) : $this->get_qualified_tag_name(); + case '#cdata-section': + $html .= "get_modifiable_text()}]]>"; + break; - if ( $this->is_tag_closer() ) { - $html .= ""; - continue; - } + case 'html': + $html .= ''; + break; + } - $attribute_names = $this->get_attribute_names_with_prefix( '' ); - if ( ! isset( $attribute_names ) ) { - $html .= "<{$qualified_name}>"; - continue; - } + if ( '#tag' !== $token_type ) { + return $html; + } - $html .= "<{$qualified_name}"; - foreach ( $attribute_names as $attribute_name ) { - $html .= " {$this->get_qualified_attribute_name( $attribute_name )}"; - $value = $this->get_attribute( $attribute_name ); + $tag_name = $this->get_tag(); + $in_html = 'html' === $this->get_namespace(); + $qualified_name = $in_html ? strtolower( $tag_name ) : $this->get_qualified_tag_name(); - if ( is_string( $value ) ) { - $html .= '="' . htmlspecialchars( $value, ENT_QUOTES | ENT_SUBSTITUTE | ENT_HTML5 ) . '"'; - } - } + if ( $this->is_tag_closer() ) { + $html .= ""; + return $html; + } + + $attribute_names = $this->get_attribute_names_with_prefix( '' ); + if ( ! 
isset( $attribute_names ) ) { + $html .= "<{$qualified_name}>"; + return $html; + } + + $html .= "<{$qualified_name}"; + foreach ( $attribute_names as $attribute_name ) { + $html .= " {$this->get_qualified_attribute_name( $attribute_name )}"; + $value = $this->get_attribute( $attribute_name ); - if ( ! $in_html && $this->has_self_closing_flag() ) { - $html .= ' /'; + if ( is_string( $value ) ) { + $html .= '="' . htmlspecialchars( $value, ENT_QUOTES | ENT_SUBSTITUTE | ENT_HTML5 ) . '"'; } + } - $html .= '>'; + if ( ! $in_html && $this->has_self_closing_flag() ) { + $html .= ' /'; + } - // Flush out self-contained elements. - if ( $in_html && in_array( $tag_name, array( 'IFRAME', 'NOEMBED', 'NOFRAMES', 'SCRIPT', 'STYLE', 'TEXTAREA', 'TITLE', 'XMP' ), true ) ) { - $text = $this->get_modifiable_text(); + $html .= '>'; - switch ( $tag_name ) { - case 'IFRAME': - case 'NOEMBED': - case 'NOFRAMES': - $text = ''; - break; + // Flush out self-contained elements. + if ( $in_html && in_array( $tag_name, array( 'IFRAME', 'NOEMBED', 'NOFRAMES', 'SCRIPT', 'STYLE', 'TEXTAREA', 'TITLE', 'XMP' ), true ) ) { + $text = $this->get_modifiable_text(); - case 'SCRIPT': - case 'STYLE': - break; + switch ( $tag_name ) { + case 'IFRAME': + case 'NOEMBED': + case 'NOFRAMES': + $text = ''; + break; - default: - $text = htmlspecialchars( $text, ENT_QUOTES | ENT_SUBSTITUTE | ENT_HTML5, 'UTF-8' ); - } + case 'SCRIPT': + case 'STYLE': + break; - $html .= "{$text}"; + default: + $text = htmlspecialchars( $text, ENT_QUOTES | ENT_SUBSTITUTE | ENT_HTML5, 'UTF-8' ); } - } - if ( null !== $this->get_last_error() ) { - wp_trigger_error( - __METHOD__, - "Cannot serialize HTML Processor with parsing error: {$this->get_last_error()}.", - E_USER_WARNING - ); - return null; + $html .= "{$text}"; } return $html; From 92558b9f733607cca674547156fedda99a654cfe Mon Sep 17 00:00:00 2001 From: Dennis Snell Date: Fri, 20 Sep 2024 11:54:18 -0700 Subject: [PATCH 14/21] Add basic unit test suite. 
--- .../html-api/wpHtmlProcessor-serialize.php | 252 ++++++++++++++++++ 1 file changed, 252 insertions(+) create mode 100644 tests/phpunit/tests/html-api/wpHtmlProcessor-serialize.php diff --git a/tests/phpunit/tests/html-api/wpHtmlProcessor-serialize.php b/tests/phpunit/tests/html-api/wpHtmlProcessor-serialize.php new file mode 100644 index 0000000000000..69c14a4c5f603 --- /dev/null +++ b/tests/phpunit/tests/html-api/wpHtmlProcessor-serialize.php @@ -0,0 +1,252 @@ +assertSame( + WP_HTML_Processor::normalize( "apples > or\x00anges" ), + 'apples > oranges', + 'Should have returned an HTML string with applicable characters properly encoded.' + ); + } + + /** + * Ensures that unclosed elements are explicitly closed to ensure proper HTML isolation. + * + * When thinking about embedding HTML fragments into others, it's important that unclosed + * elements aren't left dangling, otherwise a snippet of HTML may "swallow" parts of the + * document that follow it. + * + * @ticket 62036 + */ + public function test_closes_unclosed_elements_at_end() { + $this->assertSame( + WP_HTML_Processor::normalize( '
' ), + '
', + 'Should have provided the explicit closer to the un-closed DIV element.' + ); + } + + /** + * Ensures that boolean attributes remain boolean and do not gain values. + * + * @ticket 62036 + */ + public function test_boolean_attributes_remain_boolean() { + $this->assertSame( + WP_HTML_Processor::normalize( '' ), + '', + 'Should have preserved the boolean attribute upon serialization.' + ); + } + + /** + * Ensures that attributes with values result in double-quoted attribute values. + * + * @ticket 62036 + */ + public function test_attributes_are_double_quoted() { + $this->assertSame( + WP_HTML_Processor::normalize( '

' ), + '

', + 'Should double-quote all attribute values.' + ); + } + + /** + * Ensures that self-closing flags on HTML void elements are not serialized, to + * prevent risk of conflating the flag with unquoted attribute values. + * + * Example: + * + * BR element with "class" attribute having value "clear" + *
+ * + * BR element with "class" attribute having value "clear" + *
+ * + * BR element with "class" attribute having value "clear/" + *
+ * + * @ticket 62036 + */ + public function test_void_elements_get_no_dangerous_self_closing_flag() { + $this->assertSame( + WP_HTML_Processor::normalize( '
' ), + '
', + 'Should have removed dangerous self-closing flag on HTML void element.' + ); + } + + /** + * Ensures that duplicate attributes are removed upon serialization. + * + * @ticket 62036 + */ + public function test_duplicate_attributes_are_removed() { + $this->assertSame( + WP_HTML_Processor::normalize( '
'), + '
', + 'Should have removed all but the first copy of an attribute when duplicates exist.' + ); + } + + /** + * Ensures that SCRIPT contents are not escaped, as they are not parsed like text nodes are. + * + * @ticket 62036 + */ + public function test_script_contents_are_not_escaped() { + $this->assertSame( + WP_HTML_Processor::normalize( "" ), + "", + 'Should have preserved text inside a SCRIPT element, except for replacing NULL bytes.' + ); + } + + /** + * Ensures that STYLE contents are not escaped, as they are not parsed like text nodes are. + * + * @ticket 62036 + */ + public function test_style_contents_are_not_escaped() { + $this->assertSame( + WP_HTML_Processor::normalize( "" ), + "", + 'Should have preserved text inside a STYLE element, except for replacing NULL bytes.' + ); + } + + public function test_unexpected_closing_tags_are_removed() { + $this->assertSame( + WP_HTML_Processor::normalize( 'one
twothree' ), + 'onetwothree', + 'Should have removed unpected closing tags.' + ); + } + + /** + * Ensures that self-closing elements in foreign content retain their self-closing flag. + * + * @ticket 62036 + */ + public function test_self_closing_foreign_elements_retain_their_self_closing_flag() { + $this->assertSame( + WP_HTML_Processor::normalize( '' ), + '', + 'Should have closed unclosed G element, but preserved the self-closing nature of the other G element.' + ); + } + + /** + * Ensures that incomplete syntax elements at the end of an HTML string are removed from + * the serialization, since these are often vectors of exploits for the successive HTML. + * + * @ticket 62036 + * + * @dataProvider data_incomplete_syntax_tokens + * + * @param string $incomplete_token An incomplete HTML syntax token. + */ + public function test_should_remove_incomplete_input_from_end( string $incomplete_token ) { + $this->assertSame( + WP_HTML_Processor::normalize( "content{$incomplete_token}" ), + 'content', + 'Should have removed the incomplete token from the end of the input.' + ); + } + + /** + * Data provider. + * + * @return array[] + */ + public static function data_incomplete_syntax_tokens() { + return array( + 'Comment opener' => array( '", + 'Should have replaced the invalid comment syntax with normative syntax.' + ); + } + + /** + * Data provider. 
+ * + * @return array[] + */ + public function data_bogus_comments() { + return array( + 'False DOCTYPE' => array( '' ), + 'CDATA look-alike' => array( '' ), + 'Immediately-closed markup instruction' => array( '' ), + 'Warning Symbol' => array( '' ), + 'PHP block look-alike' => array( '' ), + 'Funky comment' => array( '' ), + 'XML Processing Instruction look-alike' => array( '<', '?xml foo ', '>' ), + ); + } +} + From 95756e5ce2c10c3cb0c821f1b56cf9b69210c019 Mon Sep 17 00:00:00 2001 From: Dennis Snell Date: Fri, 20 Sep 2024 13:40:15 -0700 Subject: [PATCH 15/21] Call the new `get_full_comment_text()` --- .../html-api/class-wp-html-processor.php | 16 +--------------- 1 file changed, 1 insertion(+), 15 deletions(-) diff --git a/src/wp-includes/html-api/class-wp-html-processor.php b/src/wp-includes/html-api/class-wp-html-processor.php index 532724783c112..9294ac2915b00 100644 --- a/src/wp-includes/html-api/class-wp-html-processor.php +++ b/src/wp-includes/html-api/class-wp-html-processor.php @@ -1156,22 +1156,8 @@ protected function serialize_token(): string { break; case '#funky-comment': - $html .= ""; - break; - case '#comment': - switch ( $this->get_comment_type() ) { - case WP_HTML_Tag_Processor::COMMENT_AS_CDATA_LOOKALIKE: - $html .= ""; - break; - - case WP_HTML_Tag_Processor::COMMENT_AS_PI_NODE_LOOKALIKE: - $html .= ""; - break; - - default: - $html .= ""; - } + $html .= ""; break; case '#cdata-section': From fca648158826db0eb4cc4993c77ac3fb9b97ad56 Mon Sep 17 00:00:00 2001 From: Dennis Snell Date: Fri, 20 Sep 2024 14:03:11 -0700 Subject: [PATCH 16/21] Make the kwality 110% butter --- tests/phpunit/tests/html-api/wpHtmlProcessor-serialize.php | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/tests/phpunit/tests/html-api/wpHtmlProcessor-serialize.php b/tests/phpunit/tests/html-api/wpHtmlProcessor-serialize.php index 69c14a4c5f603..f2e76dbaeae29 100644 --- a/tests/phpunit/tests/html-api/wpHtmlProcessor-serialize.php +++ 
b/tests/phpunit/tests/html-api/wpHtmlProcessor-serialize.php @@ -102,7 +102,7 @@ public function test_void_elements_get_no_dangerous_self_closing_flag() { */ public function test_duplicate_attributes_are_removed() { $this->assertSame( - WP_HTML_Processor::normalize( '
'), + WP_HTML_Processor::normalize( '
' ), '
', 'Should have removed all but the first copy of an attribute when duplicates exist.' ); @@ -249,4 +249,3 @@ public function data_bogus_comments() { ); } } - From b37b3127719888e6b8b4afa2d05d39bca4c41eb5 Mon Sep 17 00:00:00 2001 From: Dennis Snell Date: Fri, 20 Sep 2024 14:39:23 -0700 Subject: [PATCH 17/21] Fix wrong test setup. --- tests/phpunit/tests/html-api/wpHtmlProcessor-serialize.php | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/phpunit/tests/html-api/wpHtmlProcessor-serialize.php b/tests/phpunit/tests/html-api/wpHtmlProcessor-serialize.php index f2e76dbaeae29..4fa46d3cabc2f 100644 --- a/tests/phpunit/tests/html-api/wpHtmlProcessor-serialize.php +++ b/tests/phpunit/tests/html-api/wpHtmlProcessor-serialize.php @@ -243,7 +243,7 @@ public function data_bogus_comments() { 'CDATA look-alike' => array( '' ), 'Immediately-closed markup instruction' => array( '' ), 'Warning Symbol' => array( '' ), - 'PHP block look-alike' => array( '' ), + 'PHP block look-alike' => array( '<', '?php foo(); ?', '>' ), 'Funky comment' => array( '' ), 'XML Processing Instruction look-alike' => array( '<', '?xml foo ', '>' ), ); From dd4ff16c5b507243c644d9b437150ba7534fd9ed Mon Sep 17 00:00:00 2001 From: Dennis Snell Date: Fri, 20 Sep 2024 14:53:07 -0700 Subject: [PATCH 18/21] More tests and NULL byte transformation. 
--- .../html-api/class-wp-html-processor.php | 4 ++- .../html-api/wpHtmlProcessor-serialize.php | 36 +++++++++++++++++++ 2 files changed, 39 insertions(+), 1 deletion(-) diff --git a/src/wp-includes/html-api/class-wp-html-processor.php b/src/wp-includes/html-api/class-wp-html-processor.php index 9294ac2915b00..f3c4f2681f239 100644 --- a/src/wp-includes/html-api/class-wp-html-processor.php +++ b/src/wp-includes/html-api/class-wp-html-processor.php @@ -1173,7 +1173,7 @@ protected function serialize_token(): string { return $html; } - $tag_name = $this->get_tag(); + $tag_name = str_replace( "\x00", "\u{FFFD}", $this->get_tag() ); $in_html = 'html' === $this->get_namespace(); $qualified_name = $in_html ? strtolower( $tag_name ) : $this->get_qualified_tag_name(); @@ -1196,6 +1196,8 @@ protected function serialize_token(): string { if ( is_string( $value ) ) { $html .= '="' . htmlspecialchars( $value, ENT_QUOTES | ENT_SUBSTITUTE | ENT_HTML5 ) . '"'; } + + $html = str_replace( "\x00", "\u{FFFD}", $html ); } if ( ! $in_html && $this->has_self_closing_flag() ) { diff --git a/tests/phpunit/tests/html-api/wpHtmlProcessor-serialize.php b/tests/phpunit/tests/html-api/wpHtmlProcessor-serialize.php index 4fa46d3cabc2f..e05ca28473e04 100644 --- a/tests/phpunit/tests/html-api/wpHtmlProcessor-serialize.php +++ b/tests/phpunit/tests/html-api/wpHtmlProcessor-serialize.php @@ -248,4 +248,40 @@ public function data_bogus_comments() { 'XML Processing Instruction look-alike' => array( '<', '?xml foo ', '>' ), ); } + + /** + * Ensures that NULL bytes are properly handled. + * + * @ticket 62036 + * + * @dataProvider data_tokens_with_null_bytes + * + * @param string $html_with_nulls HTML token containing NULL bytes in various places. + * @param string $expected_output Expected parse of HTML after handling NULL bytes. 
+ */ + public function test_replaces_null_bytes_appropriately( string $html_with_nulls, string $expected_output ) { + $this->assertSame( + WP_HTML_Processor::normalize( $html_with_nulls ), + $expected_output, + 'Should have properly replaced or removed NULL bytes.' + ); + } + + /** + * Data provider. + * + * @return array[] + */ + public static function data_tokens_with_null_bytes() { + return array( + 'Tag name' => array( "", "" ), + 'Attribute name' => array( "", "" ), + 'Attribute value' => array( "", "" ), + 'Body text' => array( "one\x00two", 'onetwo' ), + 'Foreign content text' => array( "one\x00two", "one\u{FFFD}two" ), + 'SCRIPT content' => array( "", "" ), + 'STYLE content' => array( "", "" ), + 'Comment text' => array( "", "" ), + ); + } } From 588020f14eb51213945086dd3bc3cf5cb71104aa Mon Sep 17 00:00:00 2001 From: Dennis Snell Date: Fri, 20 Sep 2024 15:06:28 -0700 Subject: [PATCH 19/21] Fix text alignment issue in comment. --- src/wp-includes/html-api/class-wp-html-processor.php | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/wp-includes/html-api/class-wp-html-processor.php b/src/wp-includes/html-api/class-wp-html-processor.php index f3c4f2681f239..a95cad022f4c1 100644 --- a/src/wp-includes/html-api/class-wp-html-processor.php +++ b/src/wp-includes/html-api/class-wp-html-processor.php @@ -1087,7 +1087,7 @@ public static function normalize( string $html ): ?string { * - Tag and attribute name casing will be lower-cased, * except for specific SVG and MathML tags or attributes. * - Text will be re-encoded, null bytes handled, - * and invalid UTF-8 replaced with U+FFFD. + * and invalid UTF-8 replaced with U+FFFD. * - Any incomplete syntax trailing at the end will be omitted, * for example, an unclosed comment opener will be removed. * From 0abf3677ef3d12572807b77775ec7a557054cff6 Mon Sep 17 00:00:00 2001 From: Dennis Snell Date: Fri, 20 Sep 2024 15:07:37 -0700 Subject: [PATCH 20/21] Fix text alignment in docs. 
--- src/wp-includes/html-api/class-wp-html-processor.php | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/wp-includes/html-api/class-wp-html-processor.php b/src/wp-includes/html-api/class-wp-html-processor.php index a95cad022f4c1..ccb67c9f9fafe 100644 --- a/src/wp-includes/html-api/class-wp-html-processor.php +++ b/src/wp-includes/html-api/class-wp-html-processor.php @@ -1046,7 +1046,7 @@ public function get_current_depth(): int { * - Tag and attribute name casing will be lower-cased, * except for specific SVG and MathML tags or attributes. * - Text will be re-encoded, null bytes handled, - * and invalid UTF-8 replaced with U+FFFD. + * and invalid UTF-8 replaced with U+FFFD. * - Any incomplete syntax trailing at the end will be omitted, * for example, an unclosed comment opener will be removed. * From 37b7fe898ab94a22878fe1f9957707abc63988d5 Mon Sep 17 00:00:00 2001 From: Dennis Snell Date: Fri, 20 Sep 2024 15:12:16 -0700 Subject: [PATCH 21/21] Expand docblock for serialize_token --- src/wp-includes/html-api/class-wp-html-processor.php | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/src/wp-includes/html-api/class-wp-html-processor.php b/src/wp-includes/html-api/class-wp-html-processor.php index ccb67c9f9fafe..ba8be1e7d4004 100644 --- a/src/wp-includes/html-api/class-wp-html-processor.php +++ b/src/wp-includes/html-api/class-wp-html-processor.php @@ -1138,7 +1138,15 @@ public function serialize(): ?string { } /** - * Serializes a token. + * Serializes the currently-matched token. + * + * This method produces a fully-normative HTML string for the currently-matched token, + * if able. If not matched at any token or if the token doesn't correspond to any HTML + * it will return an empty string (for example, presumptuous end tags are ignored). + * + * @see static::serialize() + * + * @since 6.7.0 * * @return string Serialization of token, or empty string if no serialization exists. */