Properly normalize attribute values

closes tafia#371
dralley · Apr 3, 2022 · 1cdea62 · 1cdea62
1 parent 8a74258
commit 1cdea62
Showing 1 changed file with 127 additions and 1 deletion.
diff --git a/src/events/attributes.rs b/src/events/attributes.rs
@@ -331,6 +331,95 @@ impl<'a> From<(&'a str, &'a str)> for Attribute<'a> {
     }
 }
 
+/// Normalizes the white space of a raw attribute value.
+///
+/// Every white space byte (`#x20`, `#x9`, `#xA`, `#xD`) is treated as a space, leading and
+/// trailing spaces are discarded, and every run of spaces is collapsed into a single `#x20`.
+///
+/// If `attr` is already in that form it is handed back unchanged, so a borrowed input stays
+/// borrowed and nothing is allocated. Otherwise a new owned buffer is returned.
+///
+/// For reference, the XML specification describes attribute-value normalization as follows:
+///
+/// 1) All line breaks MUST have been normalized on input to #xA as described in 2.11
+///    End-of-Line Handling, so the rest of this algorithm operates on text normalized in
+///    this way.
+/// 2) Begin with a normalized value consisting of the empty string.
+/// 3) For each character, entity reference, or character reference in the unnormalized
+///    attribute value, beginning with the first and continuing to the last, do the following:
+///    * For a character reference, append the referenced character to the normalized value.
+///    * For an entity reference, recursively apply step 3 of this algorithm to the
+///      replacement text of the entity.
+///    * For a white space character (#x20, #xD, #xA, #x9), append a space character (#x20)
+///      to the normalized value.
+///    * For another character, append the character to the normalized value.
+///
+/// If the attribute type is not CDATA, then the XML processor MUST further process the
+/// normalized attribute value by discarding any leading and trailing space (#x20) characters,
+/// and by replacing sequences of space (#x20) characters by a single space (#x20) character.
+///
+/// Note that if the unnormalized attribute value contains a character reference to a white
+/// space character other than space (#x20), the normalized value contains the referenced
+/// character itself (#xD, #xA or #x9). This contrasts with the case where the unnormalized
+/// value contains a white space character (not a reference), which is replaced with a space
+/// character (#x20) in the normalized value and also contrasts with the case where the
+/// unnormalized value contains an entity reference whose replacement text contains a white
+/// space character; being recursively processed, the white space character is replaced with
+/// a space character (#x20) in the normalized value.
+///
+/// NOTE: this function does not look at the attribute type; the trimming and collapsing that
+/// the specification describes for non-CDATA attributes is applied to every value.
+fn normalize_attribute_value(attr: Cow<[u8]>) -> Cow<[u8]> {
+    // TODO: character references, entity references, error handling associated with those
+
+    // Reports whether `bytes` would come out of the normalization loop below unchanged:
+    // no tab / line feed / carriage return, no leading or trailing space, no doubled space.
+    fn is_normalized(bytes: &[u8]) -> bool {
+        // Starting as if a space had just been seen makes a leading space count as a
+        // "doubled" one, so it is rejected by the same check.
+        let mut after_space = true;
+        for &byte in bytes {
+            match byte {
+                b'\t' | b'\n' | b'\r' => return false,
+                b' ' => {
+                    if after_space {
+                        return false;
+                    }
+                    after_space = true;
+                }
+                _ => after_space = false,
+            }
+        }
+        // Ending right after a space means there is a trailing space to trim. The empty
+        // value never leaves the initial state but is trivially normalized.
+        bytes.is_empty() || !after_space
+    }
+
+    // Fast path: nothing to rewrite, so keep the caller's (possibly borrowed) buffer.
+    if is_normalized(&attr) {
+        return attr;
+    }
+
+    // The normalized value is never longer than the input.
+    let mut value: Vec<u8> = Vec::with_capacity(attr.len());
+    // A separator is only written once the next non-space byte shows up. That way leading
+    // white space (nothing emitted yet) and trailing white space (no following byte) are
+    // dropped and runs collapse to one space, all in a single pass.
+    let mut pending_space = false;
+
+    for &byte in attr.iter() {
+        match byte {
+            b' ' | b'\t' | b'\n' | b'\r' => pending_space = !value.is_empty(),
+            _ => {
+                if pending_space {
+                    value.push(b' ');
+                    pending_space = false;
+                }
+                value.push(byte);
+            }
+        }
+    }
+
+    Cow::Owned(value)
+}
+
 impl<'a> Iterator for Attributes<'a> {
     type Item = Result<Attribute<'a>>;
     fn next(&mut self) -> Option<Self::Item> {
@@ -355,7 +444,7 @@ impl<'a> Iterator for Attributes<'a> {
             ($key:expr, $val:expr) => {
                 Some(Ok(Attribute {
                     key: &self.bytes[$key],
-                    value: Cow::Borrowed(&self.bytes[$val]),
+                    value: normalize_attribute_value(Cow::Borrowed(&self.bytes[$val])),
                 }))
             };
         }
@@ -513,4 +602,41 @@ mod tests {
         assert_eq!(&*a.value, b"ee");
         assert!(attributes.next().is_none());
     }
+
+    #[test]
+    fn attribute_value_normalization() {
+        // Asserts that normalizing `raw` yields exactly `expected`.
+        let check = |raw: &[u8], expected: &[u8]| {
+            assert_eq!(
+                normalize_attribute_value(Cow::Borrowed(raw)).as_ref(),
+                expected
+            );
+        };
+
+        // Carriage return, tab and line feed (0xD, 0x9, 0xA) each become a space.
+        check(b"\rfoo\rbar\tbaz\ndelta\n", b"foo bar baz delta");
+        // Spaces on both ends are stripped ...
+        check(b"  foo ", b"foo");
+        // ... as are spaces on the leading end only ...
+        check(b"  bar", b"bar");
+        // ... and on the trailing end only.
+        check(b"baz ", b"baz");
+        // A run of spaces collapses into a single space.
+        check(b"   foo bar   baz ", b"foo bar baz");
+        // Runs that mix spaces with \t, \r and \n collapse the same way.
+        check(
+            b" \tfoo\tbar \rbaz \n\ndelta\n\t\r echo foxtrot\r",
+            b"foo bar baz delta echo foxtrot",
+        );
+    }
 }