Skip to content

Commit

Permalink
Merge pull request #1411 from Automattic/fix/url-attribute-list-handling
Browse files Browse the repository at this point in the history
Fix URL protocol validation and parsing attribute values with multiple URLs
  • Loading branch information
westonruter committed Sep 7, 2018
2 parents 4015913 + 93d77d0 commit 60fe637
Show file tree
Hide file tree
Showing 4 changed files with 85 additions and 46 deletions.
102 changes: 66 additions & 36 deletions includes/sanitizers/class-amp-tag-and-attribute-sanitizer.php
Original file line number Diff line number Diff line change
Expand Up @@ -1264,18 +1264,21 @@ private function check_attr_spec_rule_value_regex_casei( $node, $attr_name, $att
private function check_attr_spec_rule_valid_url( $node, $attr_name, $attr_spec_rule ) {
if ( isset( $attr_spec_rule[ AMP_Rule_Spec::VALUE_URL ] ) ) {
if ( $node->hasAttribute( $attr_name ) ) {
$urls_to_test = preg_split( '/\s*,\s*/', $node->getAttribute( $attr_name ) );
foreach ( $urls_to_test as $url ) {
foreach ( $this->extract_attribute_urls( $node->getAttributeNode( $attr_name ) ) as $url ) {
$url = urldecode( $url );
// Check if the host contains invalid chars.
$url_host = wp_parse_url( $url, PHP_URL_HOST );
if ( $url_host && preg_match( '/[!"#$%&\'()*+,\/:;<=>?@[\]^`{|}~\s]/i', $url_host ) ) {
return AMP_Rule_Spec::FAIL;

// Check if the protocol contains invalid chars (protocolCharIsValid: https://github.com/ampproject/amphtml/blob/af1e3a550feeafd732226202b8d1f26dcefefa18/validator/engine/parse-url.js#L31-L39).
$protocol = $this->parse_protocol( $url );
if ( isset( $protocol ) ) {
if ( ! preg_match( '/^[a-zA-Z0-9\+-]+/i', $protocol ) ) {
return AMP_Rule_Spec::FAIL;
}
$url = substr( $url, strlen( $protocol ) + 1 );
}

// Check if the protocol contains invalid chars.
$dots_pos = strpos( $url, ':' );
if ( false !== $dots_pos && preg_match( '/[!"#$%&\'()*+,\/:;<=>?@[\]^`{|}~\s]/i', substr( $url, 0, $dots_pos ) ) ) {
// Check if the host contains invalid chars (hostCharIsValid: https://github.com/ampproject/amphtml/blob/af1e3a550feeafd732226202b8d1f26dcefefa18/validator/engine/parse-url.js#L62-L103).
$host = wp_parse_url( $url, PHP_URL_HOST );
if ( $host && preg_match( '/[!"#$%&\'()*+,\/:;<=>?@[\]^`{|}~\s]/i', $host ) ) {
return AMP_Rule_Spec::FAIL;
}
}
Expand All @@ -1287,6 +1290,22 @@ private function check_attr_spec_rule_valid_url( $node, $attr_name, $attr_spec_r
return AMP_Rule_Spec::NOT_APPLICABLE;
}

/**
* Parse protocol from URL.
*
* This may not be a valid protocol (scheme), but it will be where the protocol should be in the URL.
*
* @link https://github.com/ampproject/amphtml/blob/af1e3a550feeafd732226202b8d1f26dcefefa18/validator/engine/parse-url.js#L235-L282
* @param string $url URL.
* @return string|null Protocol without colon if matched. Otherwise null.
*/
private function parse_protocol( $url ) {
if ( preg_match( '#^[^/]+(?=:)#', $url, $matches ) ) {
return $matches[0];
}
return null;
}

/**
* Check if attribute has a protocol value rule determine if it matches.
*
Expand All @@ -1303,34 +1322,20 @@ private function check_attr_spec_rule_valid_url( $node, $attr_name, $attr_spec_r
private function check_attr_spec_rule_allowed_protocol( $node, $attr_name, $attr_spec_rule ) {
if ( isset( $attr_spec_rule[ AMP_Rule_Spec::VALUE_URL ][ AMP_Rule_Spec::ALLOWED_PROTOCOL ] ) ) {
if ( $node->hasAttribute( $attr_name ) ) {
$urls_to_test = preg_split( '/\s*,\s*/', $node->getAttribute( $attr_name ) );
foreach ( $urls_to_test as $url ) {
/*
* This seems to be an acceptable check since the AMP validator
* will allow a URL with no protocol to pass validation.
*/
$url_scheme = wp_parse_url( $url, PHP_URL_SCHEME );
if ( $url_scheme ) {
if ( ! in_array( strtolower( $url_scheme ), $attr_spec_rule[ AMP_Rule_Spec::VALUE_URL ][ AMP_Rule_Spec::ALLOWED_PROTOCOL ], true ) ) {
return AMP_Rule_Spec::FAIL;
}
foreach ( $this->extract_attribute_urls( $node->getAttributeNode( $attr_name ) ) as $url ) {
$url_scheme = $this->parse_protocol( $url );
if ( isset( $url_scheme ) && ! in_array( strtolower( $url_scheme ), $attr_spec_rule[ AMP_Rule_Spec::VALUE_URL ][ AMP_Rule_Spec::ALLOWED_PROTOCOL ], true ) ) {
return AMP_Rule_Spec::FAIL;
}
}
return AMP_Rule_Spec::PASS;
} elseif ( isset( $attr_spec_rule[ AMP_Rule_Spec::ALTERNATIVE_NAMES ] ) ) {
foreach ( $attr_spec_rule[ AMP_Rule_Spec::ALTERNATIVE_NAMES ] as $alternative_name ) {
if ( $node->hasAttribute( $alternative_name ) ) {
$urls_to_test = preg_split( '/\s*,\s*/', $node->getAttribute( $alternative_name ) );
foreach ( $urls_to_test as $url ) {
/*
* This seems to be an acceptable check since the AMP validator
* will allow a URL with no protocol to pass validation.
*/
$url_scheme = wp_parse_url( $url, PHP_URL_SCHEME );
if ( $url_scheme ) {
if ( ! in_array( strtolower( $url_scheme ), $attr_spec_rule[ AMP_Rule_Spec::VALUE_URL ][ AMP_Rule_Spec::ALLOWED_PROTOCOL ], true ) ) {
return AMP_Rule_Spec::FAIL;
}
foreach ( $this->extract_attribute_urls( $node->getAttributeNode( $alternative_name ), $attr_name ) as $url ) {
$url_scheme = $this->parse_protocol( $url );
if ( isset( $url_scheme ) && ! in_array( strtolower( $url_scheme ), $attr_spec_rule[ AMP_Rule_Spec::VALUE_URL ][ AMP_Rule_Spec::ALLOWED_PROTOCOL ], true ) ) {
return AMP_Rule_Spec::FAIL;
}
}
return AMP_Rule_Spec::PASS;
Expand All @@ -1342,7 +1347,34 @@ private function check_attr_spec_rule_allowed_protocol( $node, $attr_name, $attr
}

/**
* Check if attribute has disallowed relative value rule determine if disallowed relative value matches.
* Extract URLs from attribute.
*
* @param DOMAttr $attribute_node Attribute node.
* @param string|null $spec_attr_name Non-alternative attribute name for the spec.
* @return string[] List of URLs.
*/
private function extract_attribute_urls( $attribute_node, $spec_attr_name = null ) {
/*
* Handle the srcset special case where the attribute value can contain multiple parts, each in the format `URL [WIDTH] [PIXEL_DENSITY]`.
* So we split the srcset attribute value by commas and then return the first token of each item, omitting width descriptor and pixel density descriptor.
* This splitting cannot be done for other URLs because it a comma can appear in a URL itself generally, but the syntax can break in srcset,
* unless the commas are URL-encoded.
*/
if ( 'srcset' === $attribute_node->nodeName || 'srcset' === $spec_attr_name ) {
return array_filter( array_map(
function ( $srcset_part ) {
// Remove descriptors for width and pixel density.
return preg_replace( '/\s.*$/', '', trim( $srcset_part ) );
},
preg_split( '/\s*,\s*/', $attribute_node->nodeValue )
) );
} else {
return array( $attribute_node->nodeValue );
}
}

/**
* Check if attribute has disallowed relative URL value according to rule spec.
*
* @param DOMElement $node Node.
* @param string $attr_name Attribute name.
Expand All @@ -1357,8 +1389,7 @@ private function check_attr_spec_rule_allowed_protocol( $node, $attr_name, $attr
private function check_attr_spec_rule_disallowed_relative( $node, $attr_name, $attr_spec_rule ) {
if ( isset( $attr_spec_rule[ AMP_Rule_Spec::VALUE_URL ][ AMP_Rule_Spec::ALLOW_RELATIVE ] ) && ! ( $attr_spec_rule[ AMP_Rule_Spec::VALUE_URL ][ AMP_Rule_Spec::ALLOW_RELATIVE ] ) ) {
if ( $node->hasAttribute( $attr_name ) ) {
$urls_to_test = preg_split( '/\s*,\s*/', $node->getAttribute( $attr_name ) );
foreach ( $urls_to_test as $url ) {
foreach ( $this->extract_attribute_urls( $node->getAttributeNode( $attr_name ) ) as $url ) {
$parsed_url = wp_parse_url( $url );

/*
Expand All @@ -1376,8 +1407,7 @@ private function check_attr_spec_rule_disallowed_relative( $node, $attr_name, $a
} elseif ( isset( $attr_spec_rule[ AMP_Rule_Spec::ALTERNATIVE_NAMES ] ) ) {
foreach ( $attr_spec_rule[ AMP_Rule_Spec::ALTERNATIVE_NAMES ] as $alternative_name ) {
if ( $node->hasAttribute( $alternative_name ) ) {
$urls_to_test = preg_split( '/\s*,\s*/', $node->getAttribute( $alternative_name ) );
foreach ( $urls_to_test as $url ) {
foreach ( $this->extract_attribute_urls( $node->getAttributeNode( $alternative_name ), $attr_name ) as $url ) {
$parsed_url = wp_parse_url( $url );
if ( empty( $parsed_url['scheme'] ) ) {
return AMP_Rule_Spec::FAIL;
Expand Down
18 changes: 9 additions & 9 deletions tests/amp-tag-and-attribute-sanitizer-private-methods-tests.php
Original file line number Diff line number Diff line change
Expand Up @@ -1491,9 +1491,9 @@ public function get_check_attr_spec_rule_allowed_protocol() {
),
'protocol_multiple_fail' => array(
array(
'source' => '<div attribute1="http://example.com, data://domain.com"></div>',
'node_tag_name' => 'div',
'attr_name' => 'attribute1',
'source' => '<img srcset="http://example.com, data://domain.com">',
'node_tag_name' => 'img',
'attr_name' => 'srcset',
'attr_spec_rule' => array(
'value_url' => array(
'protocol' => array(
Expand Down Expand Up @@ -1586,9 +1586,9 @@ public function get_check_attr_spec_rule_disallowed_relative() {
),
'disallowed_relative_multiple_pass' => array(
array(
'source' => '<div attribute1="http://example.com, http://domain.com/path/to/resource"></div>',
'node_tag_name' => 'div',
'attr_name' => 'attribute1',
'source' => '<img srcset="http://example.com, http://domain.com/path/to/resource">',
'node_tag_name' => 'img',
'attr_name' => 'srcset',
'attr_spec_rule' => array(
'value_url' => array(
'allow_relative' => false,
Expand Down Expand Up @@ -1673,15 +1673,15 @@ public function get_check_attr_spec_rule_disallowed_relative() {
),
'disallowed_relative_alternative_multiple_fail' => array(
array(
'source' => '<div attribute1_alternative1="http://domain.com, /relative/path/to/resource"></div>',
'source' => '<div source_set="http://domain.com, /relative/path/to/resource"></div>',
'node_tag_name' => 'div',
'attr_name' => 'attribute1',
'attr_name' => 'srcset',
'attr_spec_rule' => array(
'value_url' => array(
'allow_relative' => false,
),
'alternative_names' => array(
'attribute1_alternative1'
'source_set',
),
),
),
Expand Down
4 changes: 4 additions & 0 deletions tests/test-amp-iframe-sanitizer.php
Original file line number Diff line number Diff line change
Expand Up @@ -97,6 +97,10 @@ public function get_data() {
'<p>contents<iframe src="https://example.com/video/132886713" width="500" height="281"></iframe><iframe src="https://example.com/video/132886714" width="500" height="281"></iframe></p>',
'<p>contents</p><amp-iframe src="https://example.com/video/132886713" width="500" height="281" sandbox="allow-scripts allow-same-origin" layout="intrinsic" class="amp-wp-enforced-sizes"></amp-iframe><amp-iframe src="https://example.com/video/132886714" width="500" height="281" sandbox="allow-scripts allow-same-origin" layout="intrinsic" class="amp-wp-enforced-sizes"></amp-iframe>',
),
'iframe_src_with_commas_and_colons' => array(
'<iframe src="https://www.geoportail.gouv.fr/embed/visu.html?c=3.9735668054865076,43.90558192721261&z=15&l0=GEOLOGY.GEOLOGY::EXTERNAL:OGC:EXTERNALWMS(1)&permalink=yes" width="100" height="200" sandbox="allow-forms allow-scripts allow-same-origin" allowfullscreen=""></iframe>',
'<amp-iframe src="https://www.geoportail.gouv.fr/embed/visu.html?c=3.9735668054865076,43.90558192721261&amp;z=15&amp;l0=GEOLOGY.GEOLOGY::EXTERNAL:OGC:EXTERNALWMS(1)&amp;permalink=yes" width="100" height="200" sandbox="allow-forms allow-scripts allow-same-origin" allowfullscreen="" layout="intrinsic" class="amp-wp-enforced-sizes"></amp-iframe>',
),
);
}

Expand Down
7 changes: 6 additions & 1 deletion tests/test-tag-and-attribute-sanitizer.php
Original file line number Diff line number Diff line change
Expand Up @@ -466,6 +466,7 @@ public function get_body_data() {
'<a href="fb-messenger://example.com/path/to/content">Click me.</a>',
'<a href="webcal:foo">Click me.</a>',
'<a href="whatsapp:foo">Click me.</a>',
'<a href="web+mastodon:follow/@handle@instance">Click me.</a>',
) ),
),

Expand Down Expand Up @@ -630,7 +631,11 @@ public function get_body_data() {
),

'amp-img_with_good_protocols' => array(
'<amp-img src="https://example.com/resource1" srcset="https://example.com/resource1, https://example.com/resource2"></amp-img>',
'<amp-img src="https://example.com/resource1" srcset="https://example.com/resource1 320w, https://example.com/resource2 480w"></amp-img>',
),

'amp-img_with_relative_urls_containing_colons' => array(
'<amp-img src="/winning:yes.jpg" width="100" height="200"></amp-img>',
),

'allowed_tag_only' => array(
Expand Down

0 comments on commit 60fe637

Please sign in to comment.