diff --git a/src/errors.rs b/src/errors.rs index c1af08af..5f41b68e 100644 --- a/src/errors.rs +++ b/src/errors.rs @@ -72,6 +72,7 @@ impl From for Error { } impl From for Error { + /// Creates a new `Error::InvalidAttr` from the given error #[inline] fn from(error: AttrError) -> Self { Error::InvalidAttr(error) diff --git a/src/escapei.rs b/src/escapei.rs index 64749c27..dd4453a6 100644 --- a/src/escapei.rs +++ b/src/escapei.rs @@ -9,7 +9,7 @@ use std::ops::Range; use pretty_assertions::assert_eq; /// Error for XML escape/unescape. -#[derive(Debug)] +#[derive(Debug, PartialEq)] pub enum EscapeError { /// Entity with Null character EntityWithNull(::std::ops::Range), @@ -134,7 +134,7 @@ pub fn unescape(raw: &[u8]) -> Result, EscapeError> { } /// Unescape a `&[u8]` and replaces all xml escaped characters ('&...;') into their corresponding -/// value, using a dictionnary of custom entities. +/// value, using a dictionary of custom entities. /// /// # Pre-condition /// diff --git a/src/events/attributes.rs b/src/events/attributes.rs index d6331bc7..7939edac 100644 --- a/src/events/attributes.rs +++ b/src/events/attributes.rs @@ -4,6 +4,7 @@ use crate::errors::{Error, Result as XmlResult}; use crate::escape::{do_unescape, escape}; +use crate::escapei::EscapeError; use crate::name::QName; use crate::reader::{is_whitespace, Reader}; use crate::utils::{write_byte_string, write_cow_string, Bytes}; @@ -32,6 +33,13 @@ pub struct Attribute<'a> { } impl<'a> Attribute<'a> { + /// Returns the attribute value after whitespace normalization (`normalize_attribute_value`) followed by unescaping (`do_unescape` with no custom entities); the result is always handed back as `Cow::Owned`, and an `EscapeError` from unescaping is propagated. + pub fn normalized_value(&'a self) -> Result, EscapeError> { + let normalized = normalize_attribute_value(self.value.as_ref()); + let escaped = do_unescape(&*normalized, None)?; + Ok(Cow::Owned(escaped.into_owned())) + } + /// Returns the unescaped value. /// /// This is normally the value you are interested in. 
Escape sequences such as `>` are @@ -182,6 +190,90 @@ impl<'a> From> for Attribute<'a> { } } +/// Normalize the attribute value according to xml specification section 3.3.3 +/// +/// https://www.w3.org/TR/xml/#AVNormalize +/// +/// * Whitespace-like characters (\r, \n, \t, ' ') are trimmed from the ends of the value +/// * Sequences of whitespace-like characters are replaced with a single whitespace character +/// * Character and entity references are substituted as defined by the spec +fn normalize_attribute_value(attr: &[u8]) -> Cow<[u8]> { + // TODO: character references, entity references, error handling associated with those + + #[derive(PartialEq)] + enum ParseState { + Space, + CDATA, + } + + // Trim characters from the beginning and end of the attribute value - this can't fail. + fn trim_value(attr: &[u8]) -> &[u8] { + let first_non_space_char = attr.iter().position(|c| !is_whitespace(*c)); + + if first_non_space_char.is_none() { + // The entire value was whitespace-like characters + return b""; + } + + let last_non_space_char = attr.iter().rposition(|c| !is_whitespace(*c)); + + // Trim all whitespace-like characters away from the beginning and end of the attribute value. + let begin = first_non_space_char.unwrap(); + let end = last_non_space_char.unwrap_or(attr.len()); + &attr[begin..=end] + } + + let trimmed_attr = trim_value(attr); + + // A new buffer is only created when we encounter a situation that requires it. + let mut normalized: Option> = None; + // We start on character data because all whitespace-like characters are already trimmed away. + let mut current_state = ParseState::CDATA; + + // Perform a single pass over the trimmed attribute value. If we encounter a character / entity reference + // or whitespace-like characters that need to be substituted, copy everything processed thus far to a new + // buffer and continue using this buffer. 
+ for (idx, ch) in trimmed_attr.iter().enumerate() { + match ch { + b'\n' | b'\r' | b'\t' | b' ' => match current_state { + ParseState::Space => match normalized { + Some(_) => continue, + None => normalized = Some(Vec::from(&trimmed_attr[..idx])), + }, + ParseState::CDATA => { + current_state = ParseState::Space; + match normalized.as_mut() { + Some(buf) => buf.push(b' '), + None => { + let mut buf = Vec::from(&trimmed_attr[..idx]); + buf.push(b' '); + normalized = Some(buf); + } + } + } + }, + c @ _ => match current_state { + ParseState::Space => { + current_state = ParseState::CDATA; + if let Some(normalized) = normalized.as_mut() { + normalized.push(*c); + } + } + ParseState::CDATA => { + if let Some(normalized) = normalized.as_mut() { + normalized.push(*c); + } + } + }, + } + } + + match normalized { + Some(normalized) => Cow::Owned(normalized), + None => Cow::Borrowed(trimmed_attr), + } +} + //////////////////////////////////////////////////////////////////////////////////////////////////// /// Iterator over XML attributes. 
@@ -798,6 +890,43 @@ mod xml { use super::*; use pretty_assertions::assert_eq; + #[test] + fn attribute_value_normalization() { + // empty value stays empty + assert_eq!(normalize_attribute_value(b""), Cow::Borrowed(b"")); + // return, tab, and newline characters (0xD, 0x9, 0xA) must be replaced with a space character; the ones at the ends are trimmed + assert_eq!( + normalize_attribute_value(b"\rfoo\rbar\tbaz\ndelta\n"), + Cow::Owned::<[u8]>(b"foo bar baz delta".to_vec()) + ); + // leading and trailing spaces must be stripped + assert_eq!(normalize_attribute_value(b" foo "), Cow::Borrowed(b"foo")); + // leading space only + assert_eq!(normalize_attribute_value(b" bar"), Cow::Borrowed(b"bar")); + // trailing space only + assert_eq!(normalize_attribute_value(b"baz "), Cow::Borrowed(b"baz")); + // sequences of spaces must be replaced with a single space + assert_eq!( + normalize_attribute_value(b" foo bar baz "), + Cow::Owned::<[u8]>(b"foo bar baz".to_vec()) + ); + // sequence replacement mixed with characters treated as whitespace (\t \r \n) + assert_eq!( + normalize_attribute_value(b" \tfoo\tbar \rbaz \n\ndelta\n\t\r echo foxtrot\r"), + Cow::Owned::<[u8]>(b"foo bar baz delta echo foxtrot".to_vec()) + ); + // character references for whitespace-like characters are not combined after substitution (NOTE(review): depends on character-reference support, which normalize_attribute_value still lists as a TODO) + assert_eq!( + normalize_attribute_value(b" Р"), + Cow::Owned::<[u8]>(b" \r\t\n".to_vec()) + ); + // literal whitespace is collapsed while whitespace that presumably comes from character references is kept as-is (NOTE(review): same TODO caveat as above; confirm the input literal) + assert_eq!( + normalize_attribute_value(b" foo\tbar baz  delta\n\r echo foxtrotÐ"), + Cow::Owned::<[u8]>(b" foo bar baz \ndelta \t echo foxtrot\r".to_vec()) + ); + } + /// Checked attribute is the single attribute mod single { use super::*; diff --git a/src/reader.rs b/src/reader.rs index c3d0bbf3..029c00e6 100644 --- a/src/reader.rs +++ b/src/reader.rs @@ -1436,10 +1436,7 @@ impl ReadElementState { /// A function to check whether the byte is a whitespace (blank, new line, carriage return or tab) #[inline] pub(crate) fn is_whitespace(b: u8) -> bool { 
- match b { - b' ' | b'\r' | b'\n' | b'\t' => true, - _ => false, - } + matches!(b, b' ' | b'\r' | b'\n' | b'\t') } ////////////////////////////////////////////////////////////////////////////////////////////////////