Merge pull request #781 from Mingun/only-upper-cdata

Start CDATA section only after uppercase `<![CDATA[`
tafia · Jul 6, 2024 · 0960333 · 0960333
2 parents 22b3e45 + b71cf7c
commit 0960333
Show file tree

Hide file tree

Showing 6 changed files with 48 additions and 28 deletions.
diff --git a/Changelog.md b/Changelog.md
@@ -17,8 +17,13 @@
 
 ### Bug Fixes
 
+- [#781]: Fix the condition for starting a CDATA section: only uppercase `<![CDATA[` starts it.
+  Previously, any letter case was accepted.
+
 ### Misc Changes
 
+[#781]: https://github.com/tafia/quick-xml/pull/781
+
 
 ## 0.35.0 -- 2024-06-29
 

diff --git a/src/reader/mod.rs b/src/reader/mod.rs
@@ -974,32 +974,35 @@ impl BangType {
     /// - `chunk`: data read on current iteration and not yet consumed from reader
     #[inline(always)]
     fn parse<'b>(&self, buf: &[u8], chunk: &'b [u8]) -> Option<(&'b [u8], usize)> {
-        for i in memchr::memchr_iter(b'>', chunk) {
-            match self {
-                // Need to read at least 6 symbols (`!---->`) for properly finished comment
-                // <!----> - XML comment
-                //  012345 - i
-                Self::Comment if buf.len() + i > 4 => {
-                    if chunk[..i].ends_with(b"--") {
-                        // We cannot strip last `--` from the buffer because we need it in case of
-                        // check_comments enabled option. XML standard requires that comment
-                        // will not end with `--->` sequence because this is a special case of
-                        // `--` in the comment (https://www.w3.org/TR/xml11/#sec-comments)
-                        return Some((&chunk[..i], i + 1)); // +1 for `>`
-                    }
-                    // End sequence `-|->` was splitted at |
-                    //        buf --/   \-- chunk
-                    if i == 1 && buf.ends_with(b"-") && chunk[0] == b'-' {
-                        return Some((&chunk[..i], i + 1)); // +1 for `>`
-                    }
-                    // End sequence `--|>` was splitted at |
-                    //         buf --/   \-- chunk
-                    if i == 0 && buf.ends_with(b"--") {
-                        return Some((&[], i + 1)); // +1 for `>`
+        match self {
+            Self::Comment => {
+                for i in memchr::memchr_iter(b'>', chunk) {
+                    // Need to read at least 6 symbols (`!---->`) for properly finished comment
+                    // <!----> - XML comment
+                    //  012345 - i
+                    if buf.len() + i > 4 {
+                        if chunk[..i].ends_with(b"--") {
+                            // We cannot strip last `--` from the buffer because we need it in case of
+                            // check_comments enabled option. XML standard requires that comment
+                            // will not end with `--->` sequence because this is a special case of
+                            // `--` in the comment (https://www.w3.org/TR/xml11/#sec-comments)
+                            return Some((&chunk[..i], i + 1)); // +1 for `>`
+                        }
+                        // End sequence `-|->` was split at |
+                        //        buf --/   \-- chunk
+                        if i == 1 && buf.ends_with(b"-") && chunk[0] == b'-' {
+                            return Some((&chunk[..i], i + 1)); // +1 for `>`
+                        }
+                        // End sequence `--|>` was split at |
+                        //         buf --/   \-- chunk
+                        if i == 0 && buf.ends_with(b"--") {
+                            return Some((&[], i + 1)); // +1 for `>`
+                        }
                     }
                 }
-                Self::Comment => {}
-                Self::CData => {
+            }
+            Self::CData => {
+                for i in memchr::memchr_iter(b'>', chunk) {
                     if chunk[..i].ends_with(b"]]") {
                         return Some((&chunk[..i], i + 1)); // +1 for `>`
                     }
@@ -1014,7 +1017,9 @@ impl BangType {
                         return Some((&[], i + 1)); // +1 for `>`
                     }
                 }
-                Self::DocType => {
+            }
+            Self::DocType => {
+                for i in memchr::memchr_iter(b'>', chunk) {
                     let content = &chunk[..i];
                     let balance = memchr::memchr2_iter(b'<', b'>', content)
                         .map(|p| if content[p] == b'<' { 1i32 } else { -1 })

diff --git a/src/reader/state.rs b/src/reader/state.rs
@@ -128,14 +128,22 @@ impl ReaderState {
                     self.decoder(),
                 )))
             }
-            BangType::CData if uncased_starts_with(buf, b"![CDATA[") => {
+            // XML requires uppercase only:
+            // https://www.w3.org/TR/xml11/#sec-cdata-sect
+            // Even HTML5 requires uppercase only:
+            // https://html.spec.whatwg.org/multipage/parsing.html#markup-declaration-open-state
+            BangType::CData if buf.starts_with(b"![CDATA[") => {
                 debug_assert!(buf.ends_with(b"]]"));
                 Ok(Event::CData(BytesCData::wrap(
                     // Cut of `![CDATA[` and `]]` from start and end
                     &buf[8..len - 2],
                     self.decoder(),
                 )))
             }
+            // XML requires uppercase only, but we will check that at the validation stage:
+            // https://www.w3.org/TR/xml11/#sec-prolog-dtd
+            // HTML5 allows mixed case for doctype declarations:
+            // https://html.spec.whatwg.org/multipage/parsing.html#markup-declaration-open-state
             BangType::DocType if uncased_starts_with(buf, b"!DOCTYPE") => {
                 match buf[8..].iter().position(|&b| !is_whitespace(b)) {
                     Some(start) => Ok(Event::DocType(BytesText::wrap(

diff --git a/tests/fuzzing.rs b/tests/fuzzing.rs
@@ -51,7 +51,7 @@ fn fuzz_101() {
 
 #[test]
 fn fuzz_empty_doctype() {
-    let data: &[u8] = b"<!doctype  \n    >";
+    let data: &[u8] = b"<!DOCTYPE  \n    >";
     let mut reader = Reader::from_reader(data);
     let mut buf = Vec::new();
     assert!(matches!(

diff --git a/tests/reader-config.rs b/tests/reader-config.rs
@@ -471,7 +471,7 @@ mod trim_markup_names_in_closing_tags {
 }
 
 const XML: &str = " \t\r\n\
-<!doctype root \t\r\n> \t\r\n\
+<!DOCTYPE root \t\r\n> \t\r\n\
 <root \t\r\n> \t\r\n\
     <empty \t\r\n/> \t\r\n\
     text \t\r\n\

diff --git a/tests/reader-errors.rs b/tests/reader-errors.rs
@@ -343,6 +343,8 @@ mod syntax {
         err!(unclosed24("<![CDATA[]h") => SyntaxError::UnclosedCData);
         err!(unclosed25("<![CDATA[]>") => SyntaxError::UnclosedCData);
 
+        err!(lowercase("<![cdata[]]>") => SyntaxError::UnclosedCData);
+
         ok!(normal1("<![CDATA[]]>")     => 12: Event::CData(BytesCData::new("")));
         ok!(normal2("<![CDATA[]]>rest") => 12: Event::CData(BytesCData::new("")));
     }