Properly normalize attribute values

closes tafia#371
dralley · Jun 22, 2022 · 21687c7 · 21687c7
1 parent 46b4d1d
commit 21687c7
Show file tree

Hide file tree

Showing 4 changed files with 133 additions and 6 deletions.
diff --git a/src/errors.rs b/src/errors.rs
@@ -72,6 +72,7 @@ impl From<EscapeError> for Error {
 }
 
 impl From<AttrError> for Error {
+    /// Creates a new `Error::InvalidAttr` from the given error
     #[inline]
     fn from(error: AttrError) -> Self {
         Error::InvalidAttr(error)

diff --git a/src/escapei.rs b/src/escapei.rs
@@ -9,7 +9,7 @@ use std::ops::Range;
 use pretty_assertions::assert_eq;
 
 /// Error for XML escape/unescqpe.
-#[derive(Debug)]
+#[derive(Debug, PartialEq)]
 pub enum EscapeError {
     /// Entity with Null character
     EntityWithNull(::std::ops::Range<usize>),
@@ -134,7 +134,7 @@ pub fn unescape(raw: &[u8]) -> Result<Cow<[u8]>, EscapeError> {
 }
 
 /// Unescape a `&[u8]` and replaces all xml escaped characters ('&...;') into their corresponding
-/// value, using a dictionnary of custom entities.
+/// value, using a dictionary of custom entities.
 ///
 /// # Pre-condition
 ///

diff --git a/src/events/attributes.rs b/src/events/attributes.rs
@@ -4,6 +4,7 @@
 
 use crate::errors::{Error, Result as XmlResult};
 use crate::escape::{do_unescape, escape};
+use crate::escapei::EscapeError;
 use crate::name::QName;
 use crate::reader::{is_whitespace, Reader};
 use crate::utils::{write_byte_string, write_cow_string, Bytes};
@@ -32,6 +33,13 @@ pub struct Attribute<'a> {
 }
 
 impl<'a> Attribute<'a> {
+    /// Returns the value normalized by `normalize_attribute_value` and then unescaped by `do_unescape`, always as an owned copy; unescape errors are propagated.
+    pub fn normalized_value(&'a self) -> Result<Cow<'a, [u8]>, EscapeError> {
+        let normalized = normalize_attribute_value(self.value.as_ref());
+        let escaped = do_unescape(&*normalized, None)?;
+        Ok(Cow::Owned(escaped.into_owned()))
+    }
+
     /// Returns the unescaped value.
     ///
     /// This is normally the value you are interested in. Escape sequences such as `&gt;` are
@@ -182,6 +190,90 @@ impl<'a> From<Attr<&'a [u8]>> for Attribute<'a> {
     }
 }
 
+/// Normalize the attribute value following section 3.3.3 of the XML specification.
+///
+/// <https://www.w3.org/TR/xml/#AVNormalize>
+///
+/// * Whitespace-like characters (\r, \n, \t, ' ') are trimmed from both ends of the value
+/// * Each remaining run of whitespace-like characters is replaced with a single space (`b' '`)
+/// * Character and entity references are not substituted yet (see the TODO below) and are copied through unchanged
+fn normalize_attribute_value(attr: &[u8]) -> Cow<[u8]> {
+    // TODO: character references, entity references, error handling associated with those
+
+    #[derive(PartialEq)]
+    enum ParseState {
+        Space,
+        CDATA,
+    }
+
+    // Trim whitespace-like characters from both ends of the attribute value - this can't fail.
+    fn trim_value(attr: &[u8]) -> &[u8] {
+        let first_non_space_char = attr.iter().position(|c| !is_whitespace(*c));
+
+        if first_non_space_char.is_none() {
+            // The entire value was whitespace-like characters (or the value was empty)
+            return b"";
+        }
+
+        let last_non_space_char = attr.iter().rposition(|c| !is_whitespace(*c));
+
+        // A non-space byte exists (checked above), so the fallback is never used; both bounds are inclusive, hence `..=`.
+        let begin = first_non_space_char.unwrap();
+        let end = last_non_space_char.unwrap_or(attr.len());
+        &attr[begin..=end]
+    }
+
+    let trimmed_attr = trim_value(attr);
+
+    // A new buffer is only created when we encounter a situation that requires it.
+    let mut normalized: Option<Vec<u8>> = None;
+    // We start on character data because all whitespace-like characters are already trimmed away.
+    let mut current_state = ParseState::CDATA;
+
+    // Perform a single pass over the trimmed attribute value. The first time we encounter a whitespace-like
+    // character, copy everything processed thus far to a new buffer and continue using this buffer; until
+    // then nothing is copied. Character / entity references are not handled yet (see the TODO above).
+    for (idx, ch) in trimmed_attr.iter().enumerate() {
+        match ch {
+            b'\n' | b'\r' | b'\t' | b' ' => match current_state {
+                ParseState::Space => match normalized {
+                    Some(_) => continue,
+                    None => normalized = Some(Vec::from(&trimmed_attr[..idx])),
+                },
+                ParseState::CDATA => {
+                    current_state = ParseState::Space;
+                    match normalized.as_mut() {
+                        Some(buf) => buf.push(b' '),
+                        None => {
+                            let mut buf = Vec::from(&trimmed_attr[..idx]);
+                            buf.push(b' ');
+                            normalized = Some(buf);
+                        }
+                    }
+                }
+            },
+            c @ _ => match current_state {
+                ParseState::Space => {
+                    current_state = ParseState::CDATA;
+                    if let Some(normalized) = normalized.as_mut() {
+                        normalized.push(*c);
+                    }
+                }
+                ParseState::CDATA => {
+                    if let Some(normalized) = normalized.as_mut() {
+                        normalized.push(*c);
+                    }
+                }
+            },
+        }
+    }
+
+    match normalized {
+        Some(normalized) => Cow::Owned(normalized),
+        None => Cow::Borrowed(trimmed_attr),
+    }
+}
+
 ////////////////////////////////////////////////////////////////////////////////////////////////////
 
 /// Iterator over XML attributes.
@@ -798,6 +890,43 @@ mod xml {
     use super::*;
     use pretty_assertions::assert_eq;
 
+    #[test]
+    fn attribute_value_normalization() {
+        // empty value
+        assert_eq!(normalize_attribute_value(b""), Cow::Borrowed(b""));
+        // return, tab, and newline characters (0xD, 0x9, 0xA) must be replaced with a space character
+        assert_eq!(
+            normalize_attribute_value(b"\rfoo\rbar\tbaz\ndelta\n"),
+            Cow::Owned::<[u8]>(b"foo bar baz delta".to_vec())
+        );
+        // leading and trailing spaces must be stripped
+        assert_eq!(normalize_attribute_value(b"  foo "), Cow::Borrowed(b"foo"));
+        // leading space
+        assert_eq!(normalize_attribute_value(b" bar"), Cow::Borrowed(b"bar"));
+        // trailing space
+        assert_eq!(normalize_attribute_value(b"baz "), Cow::Borrowed(b"baz"));
+        // sequences of spaces must be replaced with a single space
+        assert_eq!(
+            normalize_attribute_value(b"   foo bar   baz "),
+            Cow::Owned::<[u8]>(b"foo bar baz".to_vec())
+        );
+        // sequence replacement mixed with characters treated as whitespace (\t \r \n)
+        assert_eq!(
+            normalize_attribute_value(b" \tfoo\tbar \rbaz \n\ndelta\n\t\r echo foxtrot\r"),
+            Cow::Owned::<[u8]>(b"foo bar baz delta echo foxtrot".to_vec())
+        );
+        // character references for whitespace-like characters are not combined after substitution (NOTE(review): substitution is still a TODO in the function - confirm this can pass)
+        assert_eq!(
+            normalize_attribute_value(b"&#x20;&#xD0;&#xA0;&#x90;"),
+            Cow::Owned::<[u8]>(b" \r\t\n".to_vec())
+        );
+        // sequence replacement mixed with character references (NOTE(review): `&#xD0;`, `&#xA0;`, `&#x90;` look like typos for `&#xD;`, `&#xA;`, `&#x9;` - confirm)
+        assert_eq!(
+            normalize_attribute_value(b" &#x20;foo\tbar baz &#xA0;delta\n&#x90;\r echo foxtrot&#xD0;"),
+            Cow::Owned::<[u8]>(b" foo bar baz \ndelta \t echo foxtrot\r".to_vec())
+        );
+    }
+
     /// Checked attribute is the single attribute
     mod single {
         use super::*;

diff --git a/src/reader.rs b/src/reader.rs
@@ -1436,10 +1436,7 @@ impl ReadElementState {
 /// A function to check whether the byte is a whitespace (blank, new line, carriage return or tab)
 #[inline]
 pub(crate) fn is_whitespace(b: u8) -> bool {
-    match b {
-        b' ' | b'\r' | b'\n' | b'\t' => true,
-        _ => false,
-    }
+    matches!(b, b' ' | b'\r' | b'\n' | b'\t')
 }
 
 ////////////////////////////////////////////////////////////////////////////////////////////////////