Skip to content

Commit

Permalink
Merge pull request #771 from Mingun/escape-fixes
Browse files Browse the repository at this point in the history
Fixes in unescape routine
  • Loading branch information
Mingun committed Jun 29, 2024
2 parents 80f0e7c + 0315ed0 commit 9a72c7b
Show file tree
Hide file tree
Showing 8 changed files with 348 additions and 196 deletions.
15 changes: 15 additions & 0 deletions Changelog.md
Original file line number Diff line number Diff line change
Expand Up @@ -22,9 +22,24 @@
- [#773]: Fixed reporting incorrect end position in `Reader::read_to_end` family
of methods and trimming of the trailing spaces in `Reader::read_text` when
`trim_text_start` is set and the last event is not a `Text` event.
- [#771]: Character references now allow any number of leading zeroes as it should.
As a result, the following variants of `quick_xml::escape::EscapeError` are removed:
- `TooLongDecimal`
- `TooLongHexadecimal`
- [#771]: Fixed `Attribute::unescape_value` which does not unescape predefined values since 0.32.0.

### Misc Changes

- [#771]: `EscapeError::UnrecognizedSymbol` renamed to `EscapeError::UnrecognizedEntity`.
- [#771]: Implemented `PartialEq` for `EscapeError`.
- [#771]: Replace the following variants of `EscapeError` by `InvalidCharRef` variant
with a new `ParseCharRefError` inside:
- `EntityWithNull`
- `InvalidDecimal`
- `InvalidHexadecimal`
- `InvalidCodepoint`

[#771]: https://github.com/tafia/quick-xml/pull/771
[#772]: https://github.com/tafia/quick-xml/pull/772
[#773]: https://github.com/tafia/quick-xml/pull/773

Expand Down
4 changes: 2 additions & 2 deletions src/de/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2135,9 +2135,9 @@ struct XmlReader<'i, R: XmlRead<'i>, E: EntityResolver = PredefinedEntityResolve
lookahead: Result<PayloadEvent<'i>, DeError>,

/// Used to resolve unknown entities that would otherwise cause the parser
/// to return an [`EscapeError::UnrecognizedSymbol`] error.
/// to return an [`EscapeError::UnrecognizedEntity`] error.
///
/// [`EscapeError::UnrecognizedSymbol`]: crate::escape::EscapeError::UnrecognizedSymbol
/// [`EscapeError::UnrecognizedEntity`]: crate::escape::EscapeError::UnrecognizedEntity
entity_resolver: E,
}

Expand Down
4 changes: 2 additions & 2 deletions src/de/resolver.rs
Original file line number Diff line number Diff line change
Expand Up @@ -81,10 +81,10 @@ pub trait EntityResolver {
/// Called when an entity needs to be resolved.
///
/// `None` is returned if a suitable value can not be found.
/// In that case an [`EscapeError::UnrecognizedSymbol`] will be returned by
/// In that case an [`EscapeError::UnrecognizedEntity`] will be returned by
/// a deserializer.
///
/// [`EscapeError::UnrecognizedSymbol`]: crate::escape::EscapeError::UnrecognizedSymbol
/// [`EscapeError::UnrecognizedEntity`]: crate::escape::EscapeError::UnrecognizedEntity
fn resolve(&self, entity: &str) -> Option<&str>;
}

Expand Down
9 changes: 9 additions & 0 deletions src/encoding.rs
Original file line number Diff line number Diff line change
Expand Up @@ -99,6 +99,15 @@ impl Decoder {

Ok(())
}

/// Decodes the `Cow` buffer, preserves the lifetime
pub(crate) fn decode_cow<'b>(&self, bytes: &Cow<'b, [u8]>) -> Result<Cow<'b, str>> {
match bytes {
Cow::Borrowed(bytes) => self.decode(bytes),
// Convert to owned, because otherwise Cow will be bound with wrong lifetime
Cow::Owned(bytes) => Ok(self.decode(bytes)?.into_owned().into()),
}
}
}

/// Decodes the provided bytes using the specified encoding.
Expand Down
234 changes: 70 additions & 164 deletions src/escape.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,62 +2,82 @@

use memchr::memchr2_iter;
use std::borrow::Cow;
use std::num::ParseIntError;
use std::ops::Range;

#[cfg(test)]
use pretty_assertions::assert_eq;
/// Error of parsing character reference (`&#<dec-number>;` or `&#x<hex-number>;`).
#[derive(Clone, Debug, PartialEq)]
pub enum ParseCharRefError {
/// Number contains sign character (`+` or `-`) which is not allowed.
UnexpectedSign,
/// Number cannot be parsed due to non-number characters or a numeric overflow.
InvalidNumber(ParseIntError),
/// Character reference represents not a valid unicode codepoint.
InvalidCodepoint(u32),
/// Character reference expanded to a not permitted character for an XML.
///
/// Currently, only `0x0` character produces this error.
IllegalCharacter(u32),
}

impl std::fmt::Display for ParseCharRefError {
fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
match self {
Self::UnexpectedSign => f.write_str("unexpected number sign"),
Self::InvalidNumber(e) => e.fmt(f),
Self::InvalidCodepoint(n) => write!(f, "`{}` is not a valid codepoint", n),
Self::IllegalCharacter(n) => write!(f, "0x{:x} character is not permitted in XML", n),
}
}
}

impl std::error::Error for ParseCharRefError {
fn source(&self) -> Option<&(dyn std::error::Error + 'static)> {
match self {
Self::InvalidNumber(e) => Some(e),
_ => None,
}
}
}

/// Error for XML escape / unescape.
#[derive(Clone, Debug)]
#[derive(Clone, Debug, PartialEq)]
pub enum EscapeError {
/// Entity with Null character
EntityWithNull(Range<usize>),
/// Unrecognized escape symbol
UnrecognizedSymbol(Range<usize>, String),
/// Referenced entity in unknown to the parser.
UnrecognizedEntity(Range<usize>, String),
/// Cannot find `;` after `&`
UnterminatedEntity(Range<usize>),
/// Cannot convert Hexa to utf8
TooLongHexadecimal,
/// Character is not a valid hexadecimal value
InvalidHexadecimal(char),
/// Cannot convert decimal to hexa
TooLongDecimal,
/// Character is not a valid decimal value
InvalidDecimal(char),
/// Not a valid unicode codepoint
InvalidCodepoint(u32),
/// Attempt to parse character reference (`&#<dec-number>;` or `&#x<hex-number>;`)
/// was unsuccessful, not all characters are decimal or hexadecimal numbers.
InvalidCharRef(ParseCharRefError),
}

impl std::fmt::Display for EscapeError {
fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
match self {
EscapeError::EntityWithNull(e) => write!(
f,
"Error while escaping character at range {:?}: Null character entity not allowed",
e
),
EscapeError::UnrecognizedSymbol(rge, res) => write!(
f,
"Error while escaping character at range {:?}: Unrecognized escape symbol: {:?}",
rge, res
),
EscapeError::UnrecognizedEntity(rge, res) => {
write!(f, "at {:?}: unrecognized entity `{}`", rge, res)
}
EscapeError::UnterminatedEntity(e) => write!(
f,
"Error while escaping character at range {:?}: Cannot find ';' after '&'",
e
),
EscapeError::TooLongHexadecimal => write!(f, "Cannot convert hexadecimal to utf8"),
EscapeError::InvalidHexadecimal(e) => {
write!(f, "'{}' is not a valid hexadecimal character", e)
EscapeError::InvalidCharRef(e) => {
write!(f, "invalid character reference: {}", e)
}
EscapeError::TooLongDecimal => write!(f, "Cannot convert decimal to utf8"),
EscapeError::InvalidDecimal(e) => write!(f, "'{}' is not a valid decimal character", e),
EscapeError::InvalidCodepoint(n) => write!(f, "'{}' is not a valid codepoint", n),
}
}
}

impl std::error::Error for EscapeError {}
impl std::error::Error for EscapeError {
fn source(&self) -> Option<&(dyn std::error::Error + 'static)> {
match self {
Self::InvalidCharRef(e) => Some(e),
_ => None,
}
}
}

/// Escapes an `&str` and replaces all xml special characters (`<`, `>`, `&`, `'`, `"`)
/// with their corresponding xml escaped value.
Expand Down Expand Up @@ -251,12 +271,12 @@ where
// search for character correctness
let pat = &raw[start + 1..end];
if let Some(entity) = pat.strip_prefix('#') {
let codepoint = parse_number(entity, start..end)?;
let codepoint = parse_number(entity).map_err(EscapeError::InvalidCharRef)?;
unescaped.push_str(codepoint.encode_utf8(&mut [0u8; 4]));
} else if let Some(value) = resolve_entity(pat) {
unescaped.push_str(value);
} else {
return Err(EscapeError::UnrecognizedSymbol(
return Err(EscapeError::UnrecognizedEntity(
start + 1..end,
pat.to_string(),
));
Expand Down Expand Up @@ -1796,141 +1816,27 @@ pub const fn resolve_html5_entity(entity: &str) -> Option<&'static str> {
Some(s)
}

fn parse_number(bytes: &str, range: Range<usize>) -> Result<char, EscapeError> {
let code = if let Some(hex_digits) = bytes.strip_prefix('x') {
parse_hexadecimal(hex_digits)
fn parse_number(num: &str) -> Result<char, ParseCharRefError> {
let code = if let Some(hex) = num.strip_prefix('x') {
from_str_radix(hex, 16)?
} else {
parse_decimal(bytes)
}?;
from_str_radix(num, 10)?
};
if code == 0 {
return Err(EscapeError::EntityWithNull(range));
return Err(ParseCharRefError::IllegalCharacter(code));
}
match std::char::from_u32(code) {
Some(c) => Ok(c),
None => Err(EscapeError::InvalidCodepoint(code)),
}
}

fn parse_hexadecimal(bytes: &str) -> Result<u32, EscapeError> {
// maximum code is 0x10FFFF => 6 characters
if bytes.len() > 6 {
return Err(EscapeError::TooLongHexadecimal);
}
let mut code = 0;
for b in bytes.bytes() {
code <<= 4;
code += match b {
b'0'..=b'9' => b - b'0',
b'a'..=b'f' => b - b'a' + 10,
b'A'..=b'F' => b - b'A' + 10,
b => return Err(EscapeError::InvalidHexadecimal(b as char)),
} as u32;
None => Err(ParseCharRefError::InvalidCodepoint(code)),
}
Ok(code)
}

fn parse_decimal(bytes: &str) -> Result<u32, EscapeError> {
// maximum code is 0x10FFFF = 1114111 => 7 characters
if bytes.len() > 7 {
return Err(EscapeError::TooLongDecimal);
}
let mut code = 0;
for b in bytes.bytes() {
code *= 10;
code += match b {
b'0'..=b'9' => b - b'0',
b => return Err(EscapeError::InvalidDecimal(b as char)),
} as u32;
#[inline]
fn from_str_radix(src: &str, radix: u32) -> Result<u32, ParseCharRefError> {
match src.as_bytes().first().copied() {
// We should not allow sign numbers, but u32::from_str_radix will accept `+`.
// We also handle `-` to be consistent in returned errors
Some(b'+') | Some(b'-') => Err(ParseCharRefError::UnexpectedSign),
_ => u32::from_str_radix(src, radix).map_err(ParseCharRefError::InvalidNumber),
}
Ok(code)
}

#[test]
fn test_unescape() {
let unchanged = unescape("test").unwrap();
// assert_eq does not check that Cow is borrowed, but we explicitly use Cow
// because it influences diff
// TODO: use assert_matches! when stabilized and other features will bump MSRV
assert_eq!(unchanged, Cow::Borrowed("test"));
assert!(matches!(unchanged, Cow::Borrowed(_)));

assert_eq!(
unescape("&lt;&amp;test&apos;&quot;&gt;").unwrap(),
"<&test'\">"
);
assert_eq!(unescape("&#x30;").unwrap(), "0");
assert_eq!(unescape("&#48;").unwrap(), "0");
assert!(unescape("&foo;").is_err());
}

#[test]
fn test_unescape_with() {
let custom_entities = |ent: &str| match ent {
"foo" => Some("BAR"),
_ => None,
};

let unchanged = unescape_with("test", custom_entities).unwrap();
// assert_eq does not check that Cow is borrowed, but we explicitly use Cow
// because it influences diff
// TODO: use assert_matches! when stabilized and other features will bump MSRV
assert_eq!(unchanged, Cow::Borrowed("test"));
assert!(matches!(unchanged, Cow::Borrowed(_)));

assert!(unescape_with("&lt;", custom_entities).is_err());
assert_eq!(unescape_with("&#x30;", custom_entities).unwrap(), "0");
assert_eq!(unescape_with("&#48;", custom_entities).unwrap(), "0");
assert_eq!(unescape_with("&foo;", custom_entities).unwrap(), "BAR");
assert!(unescape_with("&fop;", custom_entities).is_err());
}

#[test]
fn test_escape() {
let unchanged = escape("test");
// assert_eq does not check that Cow is borrowed, but we explicitly use Cow
// because it influences diff
// TODO: use assert_matches! when stabilized and other features will bump MSRV
assert_eq!(unchanged, Cow::Borrowed("test"));
assert!(matches!(unchanged, Cow::Borrowed(_)));

assert_eq!(escape("<&\"'>"), "&lt;&amp;&quot;&apos;&gt;");
assert_eq!(escape("<test>"), "&lt;test&gt;");
assert_eq!(escape("\"a\"bc"), "&quot;a&quot;bc");
assert_eq!(escape("\"a\"b&c"), "&quot;a&quot;b&amp;c");
assert_eq!(
escape("prefix_\"a\"b&<>c"),
"prefix_&quot;a&quot;b&amp;&lt;&gt;c"
);
}

#[test]
fn test_partial_escape() {
let unchanged = partial_escape("test");
// assert_eq does not check that Cow is borrowed, but we explicitly use Cow
// because it influences diff
// TODO: use assert_matches! when stabilized and other features will bump MSRV
assert_eq!(unchanged, Cow::Borrowed("test"));
assert!(matches!(unchanged, Cow::Borrowed(_)));

assert_eq!(partial_escape("<&\"'>"), "&lt;&amp;\"'&gt;");
assert_eq!(partial_escape("<test>"), "&lt;test&gt;");
assert_eq!(partial_escape("\"a\"bc"), "\"a\"bc");
assert_eq!(partial_escape("\"a\"b&c"), "\"a\"b&amp;c");
assert_eq!(
partial_escape("prefix_\"a\"b&<>c"),
"prefix_\"a\"b&amp;&lt;&gt;c"
);
}

#[test]
fn test_minimal_escape() {
assert_eq!(minimal_escape("test"), Cow::Borrowed("test"));
assert_eq!(minimal_escape("<&\"'>"), "&lt;&amp;\"'>");
assert_eq!(minimal_escape("<test>"), "&lt;test>");
assert_eq!(minimal_escape("\"a\"bc"), "\"a\"bc");
assert_eq!(minimal_escape("\"a\"b&c"), "\"a\"b&amp;c");
assert_eq!(
minimal_escape("prefix_\"a\"b&<>c"),
"prefix_\"a\"b&amp;&lt;>c"
);
}
Loading

0 comments on commit 9a72c7b

Please sign in to comment.