Skip to content

Commit

Permalink
HTML API: Add support for list elements.
Browse files Browse the repository at this point in the history
Adds support for the following HTML elements to the HTML Processor:

 - LI, OL, UL.
 - DD, DL, DT.

Previously, these elements were not supported and the HTML Processor would bail when encountering them.
With this patch, it continues parsing an HTML document when it encounters those tags, as long as no other condition (such as complicated format reconstruction) causes it to bail.

Props audrasjb, jonsurrell, bernhard-reiter.
Fixes #60215.



git-svn-id: https://develop.svn.wordpress.org/trunk@57264 602fd350-edb4-49c9-b593-d223f7449a82
  • Loading branch information
dmsnell committed Jan 10, 2024
1 parent eff1a3d commit 7c1c48c
Show file tree
Hide file tree
Showing 9 changed files with 704 additions and 44 deletions.
9 changes: 9 additions & 0 deletions phpcs.xml.dist
Original file line number Diff line number Diff line change
Expand Up @@ -250,6 +250,15 @@
<exclude-pattern>/wp-tests-config-sample\.php</exclude-pattern>
</rule>

<!-- Allow goto in the HTML Processor. Its algorithms mirror ones that the HTML specification
writes as jumps between labeled steps, and they are complex and highly imperative.
Avoiding goto there would make it harder to maintain the code's relationship to the
standard, risk introducing subtle differences in parsing, and distance the code from
the specification it implements. -->
<rule ref="Generic.PHP.DiscourageGoto.Found">
<exclude-pattern>/wp-includes/html-api/class-wp-html-processor\.php</exclude-pattern>
</rule>

<!-- Exclude sample config from modernization to prevent breaking CI workflows based on WP-CLI scaffold.
See: https://core.trac.wordpress.org/ticket/48082#comment:16 -->
<rule ref="Modernize.FunctionCalls.Dirname.FileConstant">
Expand Down
32 changes: 24 additions & 8 deletions src/wp-includes/html-api/class-wp-html-open-elements.php
Original file line number Diff line number Diff line change
Expand Up @@ -129,7 +129,7 @@ public function has_element_in_specific_scope( $tag_name, $termination_list ) {
}

if ( in_array( $node->node_name, $termination_list, true ) ) {
return true;
return false;
}
}

Expand Down Expand Up @@ -166,18 +166,22 @@ public function has_element_in_scope( $tag_name ) {
* Returns whether a particular element is in list item scope.
*
* @since 6.4.0
* @since 6.5.0 Implemented: no longer throws on every invocation.
*
* @see https://html.spec.whatwg.org/#has-an-element-in-list-item-scope
*
* @throws WP_HTML_Unsupported_Exception Always until this function is implemented.
*
* @param string $tag_name Name of tag to check.
* @return bool Whether given element is in scope.
*/
public function has_element_in_list_item_scope( $tag_name ) {
throw new WP_HTML_Unsupported_Exception( 'Cannot process elements depending on list item scope.' );

return false; // The linter requires this unreachable code until the function is implemented and can return.
return $this->has_element_in_specific_scope(
$tag_name,
array(
// There are more elements that belong here which aren't currently supported.
'OL',
'UL',
)
);
}

/**
Expand Down Expand Up @@ -375,10 +379,22 @@ public function walk_down() {
* see WP_HTML_Open_Elements::walk_down().
*
* @since 6.4.0
* @since 6.5.0 Accepts $above_this_node to start traversal above a given node, if it exists.
*
* @param ?WP_HTML_Token $above_this_node Start traversing above this node, if provided and if the node exists.
*/
public function walk_up() {
public function walk_up( $above_this_node = null ) {
$has_found_node = null === $above_this_node;

for ( $i = count( $this->stack ) - 1; $i >= 0; $i-- ) {
yield $this->stack[ $i ];
$node = $this->stack[ $i ];

if ( ! $has_found_node ) {
$has_found_node = $node === $above_this_node;
continue;
}

yield $node;
}
}

Expand Down
115 changes: 114 additions & 1 deletion src/wp-includes/html-api/class-wp-html-processor.php
Original file line number Diff line number Diff line change
Expand Up @@ -105,7 +105,7 @@
* - Formatting elements: B, BIG, CODE, EM, FONT, I, SMALL, STRIKE, STRONG, TT, U.
* - Heading elements: H1, H2, H3, H4, H5, H6, HGROUP.
* - Links: A.
* - Lists: DL.
* - Lists: DD, DL, DT, LI, OL, UL.
* - Media elements: AUDIO, CANVAS, FIGCAPTION, FIGURE, IMG, MAP, PICTURE, VIDEO.
* - Paragraph: P.
* - Phrasing elements: ABBR, BDI, BDO, CITE, DATA, DEL, DFN, INS, MARK, OUTPUT, Q, SAMP, SUB, SUP, TIME, VAR.
Expand Down Expand Up @@ -648,10 +648,12 @@ private function step_in_body() {
case '+MAIN':
case '+MENU':
case '+NAV':
case '+OL':
case '+P':
case '+SEARCH':
case '+SECTION':
case '+SUMMARY':
case '+UL':
if ( $this->state->stack_of_open_elements->has_p_in_button_scope() ) {
$this->close_a_p_element();
}
Expand Down Expand Up @@ -685,9 +687,11 @@ private function step_in_body() {
case '-MAIN':
case '-MENU':
case '-NAV':
case '-OL':
case '-SEARCH':
case '-SECTION':
case '-SUMMARY':
case '-UL':
if ( ! $this->state->stack_of_open_elements->has_element_in_scope( $tag_name ) ) {
// @todo Report parse error.
// Ignore the token.
Expand Down Expand Up @@ -755,6 +759,109 @@ private function step_in_body() {
$this->state->stack_of_open_elements->pop_until( '(internal: H1 through H6 - do not use)' );
return true;

/*
* > A start tag whose tag name is "li"
* > A start tag whose tag name is one of: "dd", "dt"
*/
case '+DD':
case '+DT':
case '+LI':
$this->state->frameset_ok = false;
$node = $this->state->stack_of_open_elements->current_node();
$is_li = 'LI' === $tag_name;

in_body_list_loop:
/*
* The logic for LI and DT/DD is the same except for one point: LI elements _only_
* close other LI elements, but a DT or DD element closes _any_ open DT or DD element.
*/
if ( $is_li ? 'LI' === $node->node_name : ( 'DD' === $node->node_name || 'DT' === $node->node_name ) ) {
$node_name = $is_li ? 'LI' : $node->node_name;
$this->generate_implied_end_tags( $node_name );
if ( $node_name !== $this->state->stack_of_open_elements->current_node()->node_name ) {
// @todo Indicate a parse error once it's possible. This error does not impact the logic here.
}

$this->state->stack_of_open_elements->pop_until( $node_name );
goto in_body_list_done;
}

if (
'ADDRESS' !== $node->node_name &&
'DIV' !== $node->node_name &&
'P' !== $node->node_name &&
$this->is_special( $node->node_name )
) {
/*
* > If node is in the special category, but is not an address, div,
* > or p element, then jump to the step labeled done below.
*/
goto in_body_list_done;
} else {
/*
* > Otherwise, set node to the previous entry in the stack of open elements
* > and return to the step labeled loop.
*/
foreach ( $this->state->stack_of_open_elements->walk_up( $node ) as $item ) {
$node = $item;
break;
}
goto in_body_list_loop;
}

in_body_list_done:
if ( $this->state->stack_of_open_elements->has_p_in_button_scope() ) {
$this->close_a_p_element();
}

$this->insert_html_element( $this->state->current_token );
return true;

/*
* > An end tag whose tag name is "li"
* > An end tag whose tag name is one of: "dd", "dt"
*/
case '-DD':
case '-DT':
case '-LI':
if (
/*
* An end tag whose tag name is "li":
* If the stack of open elements does not have an li element in list item scope,
* then this is a parse error; ignore the token.
*/
(
'LI' === $tag_name &&
! $this->state->stack_of_open_elements->has_element_in_list_item_scope( 'LI' )
) ||
/*
* An end tag whose tag name is one of: "dd", "dt":
* If the stack of open elements does not have an element in scope that is an
* HTML element with the same tag name as that of the token, then this is a
* parse error; ignore the token.
*/
(
'LI' !== $tag_name &&
! $this->state->stack_of_open_elements->has_element_in_scope( $tag_name )
)
) {
/*
* This is a parse error, ignore the token.
*
* @todo Indicate a parse error once it's possible.
*/
return $this->step();
}

$this->generate_implied_end_tags( $tag_name );

if ( $tag_name !== $this->state->stack_of_open_elements->current_node()->node_name ) {
// @todo Indicate a parse error once it's possible. This error does not impact the logic here.
}

$this->state->stack_of_open_elements->pop_until( $tag_name );
return true;

/*
* > An end tag whose tag name is "p"
*/
Expand Down Expand Up @@ -1223,6 +1330,9 @@ private function close_a_p_element() {
*/
private function generate_implied_end_tags( $except_for_this_element = null ) {
$elements_with_implied_end_tags = array(
'DD',
'DT',
'LI',
'P',
);

Expand All @@ -1248,6 +1358,9 @@ private function generate_implied_end_tags( $except_for_this_element = null ) {
*/
private function generate_implied_end_tags_thoroughly() {
$elements_with_implied_end_tags = array(
'DD',
'DT',
'LI',
'P',
);

Expand Down
5 changes: 0 additions & 5 deletions tests/phpunit/tests/html-api/wpHtmlProcessor.php
Original file line number Diff line number Diff line change
Expand Up @@ -168,8 +168,6 @@ public function data_unsupported_special_in_body_tags() {
'CAPTION' => array( 'CAPTION' ),
'COL' => array( 'COL' ),
'COLGROUP' => array( 'COLGROUP' ),
'DD' => array( 'DD' ),
'DT' => array( 'DT' ),
'EMBED' => array( 'EMBED' ),
'FORM' => array( 'FORM' ),
'FRAME' => array( 'FRAME' ),
Expand All @@ -180,7 +178,6 @@ public function data_unsupported_special_in_body_tags() {
'IFRAME' => array( 'IFRAME' ),
'INPUT' => array( 'INPUT' ),
'KEYGEN' => array( 'KEYGEN' ),
'LI' => array( 'LI' ),
'LINK' => array( 'LINK' ),
'LISTING' => array( 'LISTING' ),
'MARQUEE' => array( 'MARQUEE' ),
Expand All @@ -191,7 +188,6 @@ public function data_unsupported_special_in_body_tags() {
'NOFRAMES' => array( 'NOFRAMES' ),
'NOSCRIPT' => array( 'NOSCRIPT' ),
'OBJECT' => array( 'OBJECT' ),
'OL' => array( 'OL' ),
'OPTGROUP' => array( 'OPTGROUP' ),
'OPTION' => array( 'OPTION' ),
'PARAM' => array( 'PARAM' ),
Expand All @@ -218,7 +214,6 @@ public function data_unsupported_special_in_body_tags() {
'TITLE' => array( 'TITLE' ),
'TR' => array( 'TR' ),
'TRACK' => array( 'TRACK' ),
'UL' => array( 'UL' ),
'WBR' => array( 'WBR' ),
'XMP' => array( 'XMP' ),
);
Expand Down
Loading

0 comments on commit 7c1c48c

Please sign in to comment.