Skip to content

Commit

Permalink
Merge pull request #781 from Mingun/only-upper-cdata
Browse files Browse the repository at this point in the history
Start CDATA section only after uppercase `<![CDATA[`
  • Loading branch information
Mingun committed Jul 6, 2024
2 parents 22b3e45 + b71cf7c commit 0960333
Show file tree
Hide file tree
Showing 6 changed files with 48 additions and 28 deletions.
5 changes: 5 additions & 0 deletions Changelog.md
Original file line number Diff line number Diff line change
Expand Up @@ -17,8 +17,13 @@

### Bug Fixes

- [#781]: Fix conditions to start CDATA section. Only uppercase `<![CDATA[` can start it.
Previously any case was allowed.

### Misc Changes

[#781]: https://github.com/tafia/quick-xml/pull/781


## 0.35.0 -- 2024-06-29

Expand Down
55 changes: 30 additions & 25 deletions src/reader/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -974,32 +974,35 @@ impl BangType {
/// - `chunk`: data read on current iteration and not yet consumed from reader
#[inline(always)]
fn parse<'b>(&self, buf: &[u8], chunk: &'b [u8]) -> Option<(&'b [u8], usize)> {
for i in memchr::memchr_iter(b'>', chunk) {
match self {
// Need to read at least 6 symbols (`!---->`) for properly finished comment
// <!----> - XML comment
// 012345 - i
Self::Comment if buf.len() + i > 4 => {
if chunk[..i].ends_with(b"--") {
// We cannot strip last `--` from the buffer because we need it in case of
// check_comments enabled option. XML standard requires that comment
// will not end with `--->` sequence because this is a special case of
// `--` in the comment (https://www.w3.org/TR/xml11/#sec-comments)
return Some((&chunk[..i], i + 1)); // +1 for `>`
}
// End sequence `-|->` was splitted at |
// buf --/ \-- chunk
if i == 1 && buf.ends_with(b"-") && chunk[0] == b'-' {
return Some((&chunk[..i], i + 1)); // +1 for `>`
}
// End sequence `--|>` was splitted at |
// buf --/ \-- chunk
if i == 0 && buf.ends_with(b"--") {
return Some((&[], i + 1)); // +1 for `>`
match self {
Self::Comment => {
for i in memchr::memchr_iter(b'>', chunk) {
// Need to read at least 6 symbols (`!---->`) for properly finished comment
// <!----> - XML comment
// 012345 - i
if buf.len() + i > 4 {
if chunk[..i].ends_with(b"--") {
// We cannot strip last `--` from the buffer because we need it in case of
// check_comments enabled option. XML standard requires that comment
// will not end with `--->` sequence because this is a special case of
// `--` in the comment (https://www.w3.org/TR/xml11/#sec-comments)
return Some((&chunk[..i], i + 1)); // +1 for `>`
}
// End sequence `-|->` was splitted at |
// buf --/ \-- chunk
if i == 1 && buf.ends_with(b"-") && chunk[0] == b'-' {
return Some((&chunk[..i], i + 1)); // +1 for `>`
}
// End sequence `--|>` was splitted at |
// buf --/ \-- chunk
if i == 0 && buf.ends_with(b"--") {
return Some((&[], i + 1)); // +1 for `>`
}
}
}
Self::Comment => {}
Self::CData => {
}
Self::CData => {
for i in memchr::memchr_iter(b'>', chunk) {
if chunk[..i].ends_with(b"]]") {
return Some((&chunk[..i], i + 1)); // +1 for `>`
}
Expand All @@ -1014,7 +1017,9 @@ impl BangType {
return Some((&[], i + 1)); // +1 for `>`
}
}
Self::DocType => {
}
Self::DocType => {
for i in memchr::memchr_iter(b'>', chunk) {
let content = &chunk[..i];
let balance = memchr::memchr2_iter(b'<', b'>', content)
.map(|p| if content[p] == b'<' { 1i32 } else { -1 })
Expand Down
10 changes: 9 additions & 1 deletion src/reader/state.rs
Original file line number Diff line number Diff line change
Expand Up @@ -128,14 +128,22 @@ impl ReaderState {
self.decoder(),
)))
}
BangType::CData if uncased_starts_with(buf, b"![CDATA[") => {
// XML requires uppercase only:
// https://www.w3.org/TR/xml11/#sec-cdata-sect
// Even HTML5 required uppercase only:
// https://html.spec.whatwg.org/multipage/parsing.html#markup-declaration-open-state
BangType::CData if buf.starts_with(b"![CDATA[") => {
debug_assert!(buf.ends_with(b"]]"));
Ok(Event::CData(BytesCData::wrap(
// Cut of `![CDATA[` and `]]` from start and end
&buf[8..len - 2],
self.decoder(),
)))
}
// XML requires uppercase only, but we will check that on validation stage:
// https://www.w3.org/TR/xml11/#sec-prolog-dtd
// HTML5 allows mixed case for doctype declarations:
// https://html.spec.whatwg.org/multipage/parsing.html#markup-declaration-open-state
BangType::DocType if uncased_starts_with(buf, b"!DOCTYPE") => {
match buf[8..].iter().position(|&b| !is_whitespace(b)) {
Some(start) => Ok(Event::DocType(BytesText::wrap(
Expand Down
2 changes: 1 addition & 1 deletion tests/fuzzing.rs
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,7 @@ fn fuzz_101() {

#[test]
fn fuzz_empty_doctype() {
let data: &[u8] = b"<!doctype \n >";
let data: &[u8] = b"<!DOCTYPE \n >";
let mut reader = Reader::from_reader(data);
let mut buf = Vec::new();
assert!(matches!(
Expand Down
2 changes: 1 addition & 1 deletion tests/reader-config.rs
Original file line number Diff line number Diff line change
Expand Up @@ -471,7 +471,7 @@ mod trim_markup_names_in_closing_tags {
}

const XML: &str = " \t\r\n\
<!doctype root \t\r\n> \t\r\n\
<!DOCTYPE root \t\r\n> \t\r\n\
<root \t\r\n> \t\r\n\
<empty \t\r\n/> \t\r\n\
text \t\r\n\
Expand Down
2 changes: 2 additions & 0 deletions tests/reader-errors.rs
Original file line number Diff line number Diff line change
Expand Up @@ -343,6 +343,8 @@ mod syntax {
err!(unclosed24("<![CDATA[]h") => SyntaxError::UnclosedCData);
err!(unclosed25("<![CDATA[]>") => SyntaxError::UnclosedCData);

err!(lowercase("<![cdata[]]>") => SyntaxError::UnclosedCData);

ok!(normal1("<![CDATA[]]>") => 12: Event::CData(BytesCData::new("")));
ok!(normal2("<![CDATA[]]>rest") => 12: Event::CData(BytesCData::new("")));
}
Expand Down

0 comments on commit 0960333

Please sign in to comment.