/**
 * Normalizes an HTML fragment by serializing it.
 *
 * This method assumes that the given HTML snippet is found in BODY context.
 * For normalizing full documents or fragments found in other contexts, create
 * a new processor using {@see WP_HTML_Processor::create_fragment} or
 * {@see WP_HTML_Processor::create_full_parser} and call {@see WP_HTML_Processor::serialize}
 * on the created instances.
 *
 * Many aspects of an input HTML fragment may be changed during normalization.
 *
 *  - Attribute values will be double-quoted.
 *  - Duplicate attributes will be removed.
 *  - Omitted tags will be added.
 *  - Tag and attribute name casing will be lower-cased,
 *    except for specific SVG and MathML tags or attributes.
 *  - Text will be re-encoded, null bytes handled,
 *    and invalid UTF-8 replaced with U+FFFD.
 *  - Any incomplete syntax trailing at the end will be omitted,
 *    for example, an unclosed comment opener will be removed.
 *
 * Example:
 *
 *     echo WP_HTML_Processor::normalize( '<a href=#anchor v=5 href="/" enabled>One</a another v=5><!--' );
 *     // <a href="#anchor" v="5" enabled>One</a>
 *
 *     echo WP_HTML_Processor::normalize( '<div></p>fun<table><td>cell</div>' );
 *     // <div><p></p>fun<table><tbody><tr><td>cell</td></tr></tbody></table></div>
 *
 *     echo WP_HTML_Processor::normalize( '<![CDATA[invalid comment]]> syntax < <> "oddities"' );
 *     // <!--[CDATA[invalid comment]]--> syntax &lt; &lt;&gt; &quot;oddities&quot;
 *
 * @since 6.7.0
 *
 * @param string $html Input HTML to normalize.
 *
 * @return string|null Normalized output, or `null` if unable to normalize.
 */
public static function normalize( string $html ): ?string {
	$processor = static::create_fragment( $html );

	/*
	 * NOTE(review): create_fragment() is defined outside this view. Its
	 * upstream documentation declares a `static|null` return, and a subclass
	 * may override it, so guard against a missing processor instead of
	 * calling a method on `null`. This keeps the documented contract of
	 * returning `null` when normalization is not possible.
	 */
	if ( null === $processor ) {
		return null;
	}

	return $processor->serialize();
}
/**
 * Returns normalized HTML for a fragment by serializing it.
 *
 * This differs from {@see WP_HTML_Processor::normalize} in that it starts with
 * a specific HTML Processor, which _must_ not have already started scanning;
 * it must be in the initial ready state and will be in the completed state once
 * serialization is complete.
 *
 * Many aspects of an input HTML fragment may be changed during normalization.
 *
 *  - Attribute values will be double-quoted.
 *  - Duplicate attributes will be removed.
 *  - Omitted tags will be added.
 *  - Tag and attribute name casing will be lower-cased,
 *    except for specific SVG and MathML tags or attributes.
 *  - Text will be re-encoded, null bytes handled,
 *    and invalid UTF-8 replaced with U+FFFD.
 *  - Any incomplete syntax trailing at the end will be omitted,
 *    for example, an unclosed comment opener will be removed.
 *
 * Example:
 *
 *     $processor = WP_HTML_Processor::create_fragment( '<a href=#anchor v=5 href="/" enabled>One</a another v=5><!--' );
 *     echo $processor->serialize();
 *     // <a href="#anchor" v="5" enabled>One</a>
 *
 *     $processor = WP_HTML_Processor::create_fragment( '<div></p>fun<table><td>cell</div>' );
 *     echo $processor->serialize();
 *     // <div><p></p>fun<table><tbody><tr><td>cell</td></tr></tbody></table></div>
 *
 *     $processor = WP_HTML_Processor::create_fragment( '<![CDATA[invalid comment]]> syntax < <> "oddities"' );
 *     echo $processor->serialize();
 *     // <!--[CDATA[invalid comment]]--> syntax &lt; &lt;&gt; &quot;oddities&quot;
 *
 * @since 6.7.0
 *
 * @return string|null Normalized HTML markup represented by processor,
 *                     or `null` if unable to generate serialization.
 */
public function serialize(): ?string {
	// Serialization walks every token, so it only works from the initial ready state.
	if ( WP_HTML_Tag_Processor::STATE_READY !== $this->parser_state ) {
		wp_trigger_error(
			__METHOD__,
			'An HTML Processor which has already started processing cannot serialize its contents. Serialize immediately after creating the instance.',
			E_USER_WARNING
		);
		return null;
	}

	// Concatenate the normalized form of each token in document order.
	$html = '';
	while ( $this->next_token() ) {
		$html .= $this->serialize_token();
	}

	// The token loop also ends when parsing fails; do not return partial output in that case.
	$last_error = $this->get_last_error();
	if ( null !== $last_error ) {
		wp_trigger_error(
			__METHOD__,
			"Cannot serialize HTML Processor with parsing error: {$last_error}.",
			E_USER_WARNING
		);
		return null;
	}

	return $html;
}
/**
 * Serializes the currently-matched token.
 *
 * This method produces a fully-normative HTML string for the currently-matched token,
 * if able. If not matched at any token or if the token doesn't correspond to any HTML
 * it will return an empty string (for example, presumptuous end tags are ignored).
 *
 * @see static::serialize()
 *
 * @since 6.7.0
 *
 * @return string Serialization of token, or empty string if no serialization exists.
 */
protected function serialize_token(): string {
	$html       = '';
	$token_type = $this->get_token_type();

	switch ( $token_type ) {
		case '#text':
			$html .= htmlspecialchars( $this->get_modifiable_text(), ENT_QUOTES | ENT_SUBSTITUTE | ENT_HTML5, 'UTF-8' );
			break;

		// Unlike the `<>` which is interpreted as plaintext, this is ignored entirely.
		case '#presumptuous-tag':
			break;

		// Every comment-like token is emitted as a regular HTML comment.
		case '#funky-comment':
		case '#comment':
			$html .= "<!--{$this->get_modifiable_text()}-->";
			break;

		case '#cdata-section':
			$html .= "<![CDATA[{$this->get_modifiable_text()}]]>";
			break;

		/*
		 * NOTE(review): the original only matched 'html' here. Upstream,
		 * 'html' is the token *name* of a DOCTYPE while get_token_type()
		 * reports '#doctype' — confirm against get_token_type(). Both are
		 * accepted so the DOCTYPE is emitted either way.
		 */
		case '#doctype':
		case 'html':
			$html .= '<!DOCTYPE html>';
			break;
	}

	// Everything that is not a tag has been fully handled above.
	if ( '#tag' !== $token_type ) {
		return $html;
	}

	$tag_name       = str_replace( "\x00", "\u{FFFD}", $this->get_tag() );
	$in_html        = 'html' === $this->get_namespace();
	$qualified_name = $in_html ? strtolower( $tag_name ) : $this->get_qualified_tag_name();

	if ( $this->is_tag_closer() ) {
		$html .= "</{$qualified_name}>";
		return $html;
	}

	$attribute_names = $this->get_attribute_names_with_prefix( '' );
	if ( ! isset( $attribute_names ) ) {
		$html .= "<{$qualified_name}>";
		return $html;
	}

	$html .= "<{$qualified_name}";
	foreach ( $attribute_names as $attribute_name ) {
		$html .= " {$this->get_qualified_attribute_name( $attribute_name )}";
		$value = $this->get_attribute( $attribute_name );

		// Only string values get a `="…"` part; other values serialize as a bare attribute name.
		if ( is_string( $value ) ) {
			$html .= '="' . htmlspecialchars( $value, ENT_QUOTES | ENT_SUBSTITUTE | ENT_HTML5, 'UTF-8' ) . '"';
		}
	}

	/*
	 * Replace NUL bytes once for the whole opening tag. The original
	 * re-scanned the entire buffer after every attribute, which is
	 * equivalent but quadratic in the number of attributes.
	 */
	$html = str_replace( "\x00", "\u{FFFD}", $html );

	// The self-closing flag is only reproduced outside of the HTML namespace.
	if ( ! $in_html && $this->has_self_closing_flag() ) {
		$html .= ' /';
	}

	$html .= '>';

	// Flush out self-contained elements.
	if ( $in_html && in_array( $tag_name, array( 'IFRAME', 'NOEMBED', 'NOFRAMES', 'SCRIPT', 'STYLE', 'TEXTAREA', 'TITLE', 'XMP' ), true ) ) {
		$text = $this->get_modifiable_text();

		switch ( $tag_name ) {
			// The inner text of these elements is dropped.
			case 'IFRAME':
			case 'NOEMBED':
			case 'NOFRAMES':
				$text = '';
				break;

			// The inner text of these elements is emitted verbatim, without escaping.
			case 'SCRIPT':
			case 'STYLE':
				break;

			// TEXTAREA, TITLE, and XMP: the inner text is escaped like ordinary text.
			default:
				$text = htmlspecialchars( $text, ENT_QUOTES | ENT_SUBSTITUTE | ENT_HTML5, 'UTF-8' );
		}

		// These elements are emitted whole here: contents plus their own closing tag.
		$html .= "{$text}</{$qualified_name}>";
	}

	return $html;
}