From c195f5d0b3dab46ef093f71e27d9a6c3898b2686 Mon Sep 17 00:00:00 2001 From: Mingun Date: Tue, 2 Jul 2024 20:58:57 +0500 Subject: [PATCH 01/10] Sort options alphabetically --- src/reader/mod.rs | 40 ++++++++++++++++++++-------------------- 1 file changed, 20 insertions(+), 20 deletions(-) diff --git a/src/reader/mod.rs b/src/reader/mod.rs index 759ba6aa..7f2ea60d 100644 --- a/src/reader/mod.rs +++ b/src/reader/mod.rs @@ -23,6 +23,25 @@ use crate::reader::state::ReaderState; #[cfg_attr(feature = "serde-types", derive(serde::Deserialize, serde::Serialize))] #[non_exhaustive] pub struct Config { + /// Whether unmatched closing tag names should be allowed. Unless enabled, + /// in case of a dangling end tag, the [`Error::IllFormed(UnmatchedEndTag)`] + /// is returned from read methods. + /// + /// When set to `true`, it won't check if a closing tag has a corresponding + /// opening tag at all. For example, `` will be permitted. + /// + /// Note that the emitted [`End`] event will not be modified if this is enabled, + /// ie. it will contain the data of the unmatched end tag. + /// + /// Note, that setting this to `true` will lead to additional allocates that + /// needed to store tag name for an [`End`] event. + /// + /// Default: `false` + /// + /// [`Error::IllFormed(UnmatchedEndTag)`]: crate::errors::IllFormedError::UnmatchedEndTag + /// [`End`]: crate::events::Event::End + pub allow_unmatched_ends: bool, + /// Whether comments should be validated. If enabled, in case of invalid comment /// [`Error::IllFormed(DoubleHyphenInComment)`] is returned from read methods. /// @@ -76,25 +95,6 @@ pub struct Config { /// [`expand_empty_elements`]: Self::expand_empty_elements pub check_end_names: bool, - /// Whether unmatched closing tag names should be allowed. Unless enabled, - /// in case of a dangling end tag, the [`Error::IllFormed(UnmatchedEndTag)`] - /// is returned from read methods. - /// - /// When set to `true`, it won't check if a closing tag has a corresponding - /// opening tag at all. For example, `` will be permitted. - /// - /// Note that the emitted [`End`] event will not be modified if this is enabled, - /// ie. it will contain the data of the unmatched end tag. - /// - /// Note, that setting this to `true` will lead to additional allocates that - /// needed to store tag name for an [`End`] event. - /// - /// Default: `false` - /// - /// [`Error::IllFormed(UnmatchedEndTag)`]: crate::errors::IllFormedError::UnmatchedEndTag - /// [`End`]: crate::events::Event::End - pub allow_unmatched_ends: bool, - /// Whether empty elements should be split into an `Open` and a `Close` event. /// /// When set to `true`, all [`Empty`] events produced by a self-closing tag @@ -209,9 +209,9 @@ impl Config { impl Default for Config { fn default() -> Self { Self { + allow_unmatched_ends: false, check_comments: false, check_end_names: true, - allow_unmatched_ends: false, expand_empty_elements: false, trim_markup_names_in_closing_tags: true, trim_text_start: false, From a4c71fb6dc101328f51b552286a78e0e3b5b08b5 Mon Sep 17 00:00:00 2001 From: Mingun Date: Tue, 2 Jul 2024 21:26:56 +0500 Subject: [PATCH 02/10] Move tests for allow_unmatched_ends to its own sub-module --- tests/reader-config.rs | 69 +++++++++++++++++++++++++++++------------- 1 file changed, 48 insertions(+), 21 deletions(-) diff --git a/tests/reader-config.rs b/tests/reader-config.rs index 781bca8c..8796075e 100644 --- a/tests/reader-config.rs +++ b/tests/reader-config.rs @@ -9,6 +9,54 @@ use quick_xml::errors::{Error, IllFormedError}; use quick_xml::events::{BytesCData, BytesEnd, BytesPI, BytesStart, BytesText, Event}; use quick_xml::reader::Reader; +mod allow_unmatched_ends { + use super::*; + use pretty_assertions::assert_eq; + + #[test] + fn false_() { + let mut reader = Reader::from_str(""); + reader.config_mut().allow_unmatched_ends = false; + + assert_eq!( + reader.read_event().unwrap(), + Event::Start(BytesStart::new("tag")) + ); + assert_eq!( + reader.read_event().unwrap(), + Event::End(BytesEnd::new("tag")) + ); + match reader.read_event() { + Err(Error::IllFormed(cause)) => { + assert_eq!(cause, IllFormedError::UnmatchedEndTag("unmatched".into())); + } + x => panic!("Expected `Err(IllFormed(_))`, but got `{:?}`", x), + } + assert_eq!(reader.read_event().unwrap(), Event::Eof); + } + + #[test] + fn true_() { + let mut reader = Reader::from_str(""); + reader.config_mut().allow_unmatched_ends = true; + + assert_eq!( + reader.read_event().unwrap(), + Event::Start(BytesStart::new("tag")) + ); + assert_eq!( + reader.read_event().unwrap(), + Event::End(BytesEnd::new("tag")) + ); + // #770: We want to allow this + assert_eq!( + reader.read_event().unwrap(), + Event::End(BytesEnd::new("unmatched")) + ); + assert_eq!(reader.read_event().unwrap(), Event::Eof); + } +} + mod check_comments { use super::*; @@ -344,27 +392,6 @@ mod check_end_names { ); assert_eq!(reader.read_event().unwrap(), Event::Eof); } - - #[test] - fn unmatched_end_tags() { - let mut reader = Reader::from_str(""); - reader.config_mut().allow_unmatched_ends = true; - - assert_eq!( - reader.read_event().unwrap(), - Event::Start(BytesStart::new("tag")) - ); - assert_eq!( - reader.read_event().unwrap(), - Event::End(BytesEnd::new("tag")) - ); - // #770: We want to allow this - assert_eq!( - reader.read_event().unwrap(), - Event::End(BytesEnd::new("unmatched")) - ); - assert_eq!(reader.read_event().unwrap(), Event::Eof); - } } } From 28c031d87ecd194fc7e5257d78a73a49ab683d03 Mon Sep 17 00:00:00 2001 From: Mingun Date: Tue, 2 Jul 2024 22:35:43 +0500 Subject: [PATCH 03/10] Remove incorrect example --- src/reader/mod.rs | 17 ----------------- 1 file changed, 17 deletions(-) diff --git a/src/reader/mod.rs b/src/reader/mod.rs index 7f2ea60d..8cbda948 100644 --- a/src/reader/mod.rs +++ b/src/reader/mod.rs @@ -835,23 +835,6 @@ trait XmlSource<'r, B> { /// Read input until start of markup (the `<`) is found or end of input is reached. /// - /// Returns a slice of data read up to `<` (exclusive), and a flag noting whether - /// `<` was found in the input or not. - /// - /// # Example - /// - /// ```ignore - /// let mut position = 0; - /// let mut input = b"abc Date: Wed, 3 Jul 2024 20:45:10 +0500 Subject: [PATCH 04/10] Do not use `Default` implementation to construct parsers This is not generic code and it is more clear when we construct them with explicit state --- src/reader/mod.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/reader/mod.rs b/src/reader/mod.rs index 8cbda948..b700adcb 100644 --- a/src/reader/mod.rs +++ b/src/reader/mod.rs @@ -360,7 +360,7 @@ macro_rules! read_until_close { }, // ` match $reader - .read_with(PiParser::default(), $buf, &mut $self.state.offset) + .read_with(PiParser(false), $buf, &mut $self.state.offset) $(.$await)? { Ok(bytes) => $self.state.emit_question_mark(bytes), @@ -373,7 +373,7 @@ macro_rules! read_until_close { }, // `<...` - opening or self-closed tag Ok(Some(_)) => match $reader - .read_with(ElementParser::default(), $buf, &mut $self.state.offset) + .read_with(ElementParser::Outside, $buf, &mut $self.state.offset) $(.$await)? { Ok(bytes) => Ok($self.state.emit_start(bytes)), From 0d99a16df15b6bd459621c5af0cc407e94cee1d4 Mon Sep 17 00:00:00 2001 From: Mingun Date: Wed, 3 Jul 2024 20:52:48 +0500 Subject: [PATCH 05/10] Assert syntax errors via pretty_assertions If test will fail due to another syntax error we get a nice diff --- src/reader/mod.rs | 56 +++++++++++++++++++++++------------------------ 1 file changed, 28 insertions(+), 28 deletions(-) diff --git a/src/reader/mod.rs b/src/reader/mod.rs index b700adcb..329a734f 100644 --- a/src/reader/mod.rs +++ b/src/reader/mod.rs @@ -1172,9 +1172,9 @@ mod test { // ^= 1 match $source(&mut input).read_bang_element(buf, &mut position) $(.$await)? { - Err(Error::Syntax(SyntaxError::UnclosedCData)) => {} + Err(Error::Syntax(cause)) => assert_eq!(cause, SyntaxError::UnclosedCData), x => panic!( - "Expected `Err(Syntax(UnclosedCData))`, but got `{:?}`", + "Expected `Err(Syntax(_))`, but got `{:?}`", x ), } @@ -1191,9 +1191,9 @@ mod test { // ^= 1 ^= 22 match $source(&mut input).read_bang_element(buf, &mut position) $(.$await)? { - Err(Error::Syntax(SyntaxError::UnclosedCData)) => {} + Err(Error::Syntax(cause)) => assert_eq!(cause, SyntaxError::UnclosedCData), x => panic!( - "Expected `Err(Syntax(UnclosedCData))`, but got `{:?}`", + "Expected `Err(Syntax(_))`, but got `{:?}`", x ), } @@ -1270,9 +1270,9 @@ mod test { // ^= 1 match $source(&mut input).read_bang_element(buf, &mut position) $(.$await)? { - Err(Error::Syntax(SyntaxError::UnclosedComment)) => {} + Err(Error::Syntax(cause)) => assert_eq!(cause, SyntaxError::UnclosedComment), x => panic!( - "Expected `Err(Syntax(UnclosedComment))`, but got `{:?}`", + "Expected `Err(Syntax(_))`, but got `{:?}`", x ), } @@ -1287,9 +1287,9 @@ mod test { // ^= 1 ^= 17 match $source(&mut input).read_bang_element(buf, &mut position) $(.$await)? { - Err(Error::Syntax(SyntaxError::UnclosedComment)) => {} + Err(Error::Syntax(cause)) => assert_eq!(cause, SyntaxError::UnclosedComment), x => panic!( - "Expected `Err(Syntax(UnclosedComment))`, but got `{:?}`", + "Expected `Err(Syntax(_))`, but got `{:?}`", x ), } @@ -1304,9 +1304,9 @@ mod test { // ^= 1 ^= 17 match $source(&mut input).read_bang_element(buf, &mut position) $(.$await)? { - Err(Error::Syntax(SyntaxError::UnclosedComment)) => {} + Err(Error::Syntax(cause)) => assert_eq!(cause, SyntaxError::UnclosedComment), x => panic!( - "Expected `Err(Syntax(UnclosedComment))`, but got `{:?}`", + "Expected `Err(Syntax(_))`, but got `{:?}`", x ), } @@ -1321,9 +1321,9 @@ mod test { // ^= 1 ^= 18 match $source(&mut input).read_bang_element(buf, &mut position) $(.$await)? { - Err(Error::Syntax(SyntaxError::UnclosedComment)) => {} + Err(Error::Syntax(cause)) => assert_eq!(cause, SyntaxError::UnclosedComment), x => panic!( - "Expected `Err(Syntax(UnclosedComment))`, but got `{:?}`", + "Expected `Err(Syntax(_))`, but got `{:?}`", x ), } @@ -1338,9 +1338,9 @@ mod test { // ^= 1 ^= 19 match $source(&mut input).read_bang_element(buf, &mut position) $(.$await)? { - Err(Error::Syntax(SyntaxError::UnclosedComment)) => {} + Err(Error::Syntax(cause)) => assert_eq!(cause, SyntaxError::UnclosedComment), x => panic!( - "Expected `Err(Syntax(UnclosedComment))`, but got `{:?}`", + "Expected `Err(Syntax(_))`, but got `{:?}`", x ), } @@ -1400,9 +1400,9 @@ mod test { // ^= 1 ^= 17 match $source(&mut input).read_bang_element(buf, &mut position) $(.$await)? { - Err(Error::Syntax(SyntaxError::UnclosedDoctype)) => {} + Err(Error::Syntax(cause)) => assert_eq!(cause, SyntaxError::UnclosedDoctype), x => panic!( - "Expected `Err(Syntax(UnclosedDoctype))`, but got `{:?}`", + "Expected `Err(Syntax(_))`, but got `{:?}`", x ), } @@ -1417,9 +1417,9 @@ mod test { // ^= 1 ^= 22 match $source(&mut input).read_bang_element(buf, &mut position) $(.$await)? { - Err(Error::Syntax(SyntaxError::UnclosedDoctype)) => {} + Err(Error::Syntax(cause)) => assert_eq!(cause, SyntaxError::UnclosedDoctype), x => panic!( - "Expected `Err(Syntax(UnclosedDoctype))`, but got `{:?}`", + "Expected `Err(Syntax(_))`, but got `{:?}`", x ), } @@ -1452,9 +1452,9 @@ mod test { // ^= 1 ^23 match $source(&mut input).read_bang_element(buf, &mut position) $(.$await)? { - Err(Error::Syntax(SyntaxError::UnclosedDoctype)) => {} + Err(Error::Syntax(cause)) => assert_eq!(cause, SyntaxError::UnclosedDoctype), x => panic!( - "Expected `Err(Syntax(UnclosedDoctype))`, but got `{:?}`", + "Expected `Err(Syntax(_))`, but got `{:?}`", x ), } @@ -1474,9 +1474,9 @@ mod test { // ^= 1 ^= 17 match $source(&mut input).read_bang_element(buf, &mut position) $(.$await)? { - Err(Error::Syntax(SyntaxError::UnclosedDoctype)) => {} + Err(Error::Syntax(cause)) => assert_eq!(cause, SyntaxError::UnclosedDoctype), x => panic!( - "Expected `Err(Syntax(UnclosedDoctype))`, but got `{:?}`", + "Expected `Err(Syntax(_))`, but got `{:?}`", x ), } @@ -1491,9 +1491,9 @@ mod test { // ^= 1 ^= 22 match $source(&mut input).read_bang_element(buf, &mut position) $(.$await)? { - Err(Error::Syntax(SyntaxError::UnclosedDoctype)) => {} + Err(Error::Syntax(cause)) => assert_eq!(cause, SyntaxError::UnclosedDoctype), x => panic!( - "Expected `Err(Syntax(UnclosedDoctype))`, but got `{:?}`", + "Expected `Err(Syntax(_))`, but got `{:?}`", x ), } @@ -1526,9 +1526,9 @@ mod test { // ^= 1 ^= 23 match $source(&mut input).read_bang_element(buf, &mut position) $(.$await)? { - Err(Error::Syntax(SyntaxError::UnclosedDoctype)) => {} + Err(Error::Syntax(cause)) => assert_eq!(cause, SyntaxError::UnclosedDoctype), x => panic!( - "Expected `Err(Syntax(UnclosedDoctype))`, but got `{:?}`", + "Expected `Err(Syntax(_))`, but got `{:?}`", x ), } @@ -1554,9 +1554,9 @@ mod test { // ^= 1 match $source(&mut input).read_with(ElementParser::default(), buf, &mut position) $(.$await)? { - Err(Error::Syntax(SyntaxError::UnclosedTag)) => {} + Err(Error::Syntax(cause)) => assert_eq!(cause, SyntaxError::UnclosedTag), x => panic!( - "Expected `Err(Syntax(UnclosedTag))`, but got `{:?}`", + "Expected `Err(Syntax(_))`, but got `{:?}`", x ), } From d387ed7416af516034b1952375ac01ee7a0a513a Mon Sep 17 00:00:00 2001 From: Mingun Date: Sun, 23 Jun 2024 12:01:21 +0500 Subject: [PATCH 06/10] Move `Parser`, `ElementParser` and `PiParser` to the new module `parser`. --- Changelog.md | 3 +++ src/lib.rs | 1 + src/{reader => parser}/element.rs | 4 ++-- src/parser/mod.rs | 29 +++++++++++++++++++++++++++++ src/{reader => parser}/pi.rs | 4 ++-- src/reader/async_tokio.rs | 5 ++--- src/reader/buffered_reader.rs | 3 ++- src/reader/mod.rs | 27 ++------------------------- src/reader/slice_reader.rs | 3 ++- 9 files changed, 45 insertions(+), 34 deletions(-) rename src/{reader => parser}/element.rs (98%) create mode 100644 src/parser/mod.rs rename src/{reader => parser}/pi.rs (98%) diff --git a/Changelog.md b/Changelog.md index 1c364f60..c15a19fd 100644 --- a/Changelog.md +++ b/Changelog.md @@ -22,6 +22,9 @@ ### Misc Changes +- [#780]: `reader::Parser`, `reader::ElementParser` and `reader::PiParser` moved to the new module `parser`. + +[#780]: https://github.com/tafia/quick-xml/pull/780 [#781]: https://github.com/tafia/quick-xml/pull/781 diff --git a/src/lib.rs b/src/lib.rs index f51f6834..b0f37f50 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -59,6 +59,7 @@ pub mod errors; pub mod escape; pub mod events; pub mod name; +pub mod parser; pub mod reader; #[cfg(feature = "serialize")] pub mod se; diff --git a/src/reader/element.rs b/src/parser/element.rs similarity index 98% rename from src/reader/element.rs rename to src/parser/element.rs index 323a8311..e257c5ac 100644 --- a/src/reader/element.rs +++ b/src/parser/element.rs @@ -1,7 +1,7 @@ //! Contains a parser for an XML element. use crate::errors::SyntaxError; -use crate::reader::Parser; +use crate::parser::Parser; /// A parser that search a `>` symbol in the slice outside of quoted regions. /// @@ -25,7 +25,7 @@ use crate::reader::Parser; /// /// ``` /// # use pretty_assertions::assert_eq; -/// use quick_xml::reader::{ElementParser, Parser}; +/// use quick_xml::parser::{ElementParser, Parser}; /// /// let mut parser = ElementParser::default(); /// diff --git a/src/parser/mod.rs b/src/parser/mod.rs new file mode 100644 index 00000000..90f97c9f --- /dev/null +++ b/src/parser/mod.rs @@ -0,0 +1,29 @@ +//! Contains low-level parsers of different XML pieces. + +use crate::errors::SyntaxError; + +mod element; +mod pi; + +pub use element::ElementParser; +pub use pi::PiParser; + +/// Used to decouple reading of data from data source and parsing XML structure from it. +/// This is a state preserved between getting chunks of bytes from the reader. +/// +/// This trait is implemented for every parser that processes piece of XML grammar. +pub trait Parser { + /// Process new data and try to determine end of the parsed thing. + /// + /// Returns position of the end of thing in `bytes` in case of successful search + /// and `None` otherwise. + /// + /// # Parameters + /// - `bytes`: a slice to find the end of a thing. + /// Should contain text in ASCII-compatible encoding + fn feed(&mut self, bytes: &[u8]) -> Option; + + /// Returns parse error produced by this parser in case of reaching end of + /// input without finding the end of a parsed thing. + fn eof_error() -> SyntaxError; +} diff --git a/src/reader/pi.rs b/src/parser/pi.rs similarity index 98% rename from src/reader/pi.rs rename to src/parser/pi.rs index 24e33d79..ee553579 100644 --- a/src/reader/pi.rs +++ b/src/parser/pi.rs @@ -1,7 +1,7 @@ //! Contains a parser for an XML processing instruction. use crate::errors::SyntaxError; -use crate::reader::Parser; +use crate::parser::Parser; /// A parser that search a `?>` sequence in the slice. /// @@ -19,7 +19,7 @@ use crate::reader::Parser; /// /// ``` /// # use pretty_assertions::assert_eq; -/// use quick_xml::reader::{Parser, PiParser}; +/// use quick_xml::parser::{Parser, PiParser}; /// /// let mut parser = PiParser::default(); /// diff --git a/src/reader/async_tokio.rs b/src/reader/async_tokio.rs index e1f28629..7b8fa688 100644 --- a/src/reader/async_tokio.rs +++ b/src/reader/async_tokio.rs @@ -7,10 +7,9 @@ use tokio::io::{self, AsyncBufRead, AsyncBufReadExt}; use crate::errors::{Error, Result, SyntaxError}; use crate::events::Event; use crate::name::{QName, ResolveResult}; +use crate::parser::{ElementParser, Parser, PiParser}; use crate::reader::buffered_reader::impl_buffered_source; -use crate::reader::{ - BangType, ElementParser, NsReader, ParseState, Parser, PiParser, ReadTextResult, Reader, Span, -}; +use crate::reader::{BangType, NsReader, ParseState, ReadTextResult, Reader, Span}; use crate::utils::is_whitespace; /// A struct for read XML asynchronously from an [`AsyncBufRead`]. diff --git a/src/reader/buffered_reader.rs b/src/reader/buffered_reader.rs index cd1e5f41..548eca92 100644 --- a/src/reader/buffered_reader.rs +++ b/src/reader/buffered_reader.rs @@ -8,7 +8,8 @@ use std::path::Path; use crate::errors::{Error, Result}; use crate::events::Event; use crate::name::QName; -use crate::reader::{BangType, Parser, ReadTextResult, Reader, Span, XmlSource}; +use crate::parser::Parser; +use crate::reader::{BangType, ReadTextResult, Reader, Span, XmlSource}; use crate::utils::is_whitespace; macro_rules! impl_buffered_source { diff --git a/src/reader/mod.rs b/src/reader/mod.rs index 329a734f..104f4102 100644 --- a/src/reader/mod.rs +++ b/src/reader/mod.rs @@ -8,6 +8,7 @@ use std::ops::Range; use crate::encoding::Decoder; use crate::errors::{Error, Result, SyntaxError}; use crate::events::Event; +use crate::parser::{ElementParser, Parser, PiParser}; use crate::reader::state::ReaderState; /// A struct that holds a parser configuration. @@ -449,15 +450,11 @@ macro_rules! read_to_end { #[cfg(feature = "async-tokio")] mod async_tokio; mod buffered_reader; -mod element; mod ns_reader; -mod pi; mod slice_reader; mod state; -pub use element::ElementParser; pub use ns_reader::NsReader; -pub use pi::PiParser; /// Range of input in bytes, that corresponds to some piece of XML pub type Span = Range; @@ -790,26 +787,6 @@ enum ReadTextResult<'r, B> { Err(io::Error), } -/// Used to decouple reading of data from data source and parsing XML structure from it. -/// This is a state preserved between getting chunks of bytes from the reader. -/// -/// This trait is implemented for every parser that processes piece of XML grammar. -pub trait Parser { - /// Process new data and try to determine end of the parsed thing. - /// - /// Returns position of the end of thing in `bytes` in case of successful search - /// and `None` otherwise. - /// - /// # Parameters - /// - `bytes`: a slice to find the end of a thing. - /// Should contain text in ASCII-compatible encoding - fn feed(&mut self, bytes: &[u8]) -> Option; - - /// Returns parse error produced by this parser in case of reaching end of - /// input without finding the end of a parsed thing. - fn eof_error() -> SyntaxError; -} - /// Represents an input for a reader that can return borrowed data. /// /// There are two implementors of this trait: generic one that read data from @@ -1541,7 +1518,7 @@ mod test { mod read_element { use super::*; use crate::errors::{Error, SyntaxError}; - use crate::reader::ElementParser; + use crate::parser::ElementParser; use crate::utils::Bytes; use pretty_assertions::assert_eq; diff --git a/src/reader/slice_reader.rs b/src/reader/slice_reader.rs index 88b9e2f7..1afc3e36 100644 --- a/src/reader/slice_reader.rs +++ b/src/reader/slice_reader.rs @@ -13,7 +13,8 @@ use encoding_rs::{Encoding, UTF_8}; use crate::errors::{Error, Result}; use crate::events::Event; use crate::name::QName; -use crate::reader::{BangType, Parser, ReadTextResult, Reader, Span, XmlSource}; +use crate::parser::Parser; +use crate::reader::{BangType, ReadTextResult, Reader, Span, XmlSource}; use crate::utils::is_whitespace; /// This is an implementation for reading from a `&[u8]` as underlying byte stream. From ecc7ef37629b77db48260738ddb73def97ad8344 Mon Sep 17 00:00:00 2001 From: Mingun Date: Thu, 4 Jul 2024 20:33:21 +0500 Subject: [PATCH 07/10] Actually check that .error_position() is correct Check position for zero actually does not check anything because this is start position before parse. We move position by 1 by using Text event failures (36): syntax::tag::unclosed3::ns_reader::async_tokio syntax::tag::unclosed3::ns_reader::borrowed syntax::tag::unclosed3::ns_reader::buffered syntax::tag::unclosed3::reader::async_tokio syntax::tag::unclosed3::reader::borrowed syntax::tag::unclosed3::reader::buffered syntax::tag::unclosed4::ns_reader::async_tokio syntax::tag::unclosed4::ns_reader::borrowed syntax::tag::unclosed4::ns_reader::buffered syntax::tag::unclosed4::reader::async_tokio syntax::tag::unclosed4::reader::borrowed syntax::tag::unclosed4::reader::buffered syntax::tag::unclosed5::ns_reader::async_tokio syntax::tag::unclosed5::ns_reader::borrowed syntax::tag::unclosed5::ns_reader::buffered syntax::tag::unclosed5::reader::async_tokio syntax::tag::unclosed5::reader::borrowed syntax::tag::unclosed5::reader::buffered syntax::tag::unclosed6::ns_reader::async_tokio syntax::tag::unclosed6::ns_reader::borrowed syntax::tag::unclosed6::ns_reader::buffered syntax::tag::unclosed6::reader::async_tokio syntax::tag::unclosed6::reader::borrowed syntax::tag::unclosed6::reader::buffered syntax::tag::unclosed7::ns_reader::async_tokio syntax::tag::unclosed7::ns_reader::borrowed syntax::tag::unclosed7::ns_reader::buffered syntax::tag::unclosed7::reader::async_tokio syntax::tag::unclosed7::reader::borrowed syntax::tag::unclosed7::reader::buffered syntax::tag::unclosed8::ns_reader::async_tokio syntax::tag::unclosed8::ns_reader::borrowed syntax::tag::unclosed8::ns_reader::buffered syntax::tag::unclosed8::reader::async_tokio syntax::tag::unclosed8::reader::borrowed syntax::tag::unclosed8::reader::buffered --- tests/reader-errors.rs | 229 ++++++++++++++++++++++++----------------- 1 file changed, 135 insertions(+), 94 deletions(-) diff --git a/tests/reader-errors.rs b/tests/reader-errors.rs index 34880c15..8f9c578e 100644 --- a/tests/reader-errors.rs +++ b/tests/reader-errors.rs @@ -101,10 +101,16 @@ mod syntax { #[test] fn borrowed() { let mut reader = Reader::from_str($xml); + assert_eq!( + reader + .read_event() + .expect("parser should return `Event::Text`"), + Event::Text(BytesText::new(".")) + ); match reader.read_event() { Err(Error::Syntax(cause)) => assert_eq!( (cause, reader.error_position(), reader.buffer_position()), - ($cause, 0, $pos), + ($cause, 1, $pos), ), x => panic!("Expected `Err(Syntax(_))`, but got {:?}", x), } @@ -120,10 +126,16 @@ mod syntax { fn buffered() { let mut buf = Vec::new(); let mut reader = Reader::from_str($xml); + assert_eq!( + reader + .read_event_into(&mut buf) + .expect("parser should return `Event::Text`"), + Event::Text(BytesText::new(".")) + ); match reader.read_event_into(&mut buf) { Err(Error::Syntax(cause)) => assert_eq!( (cause, reader.error_position(), reader.buffer_position()), - ($cause, 0, $pos), + ($cause, 1, $pos), ), x => panic!("Expected `Err(Syntax(_))`, but got {:?}", x), } @@ -140,10 +152,17 @@ mod syntax { async fn async_tokio() { let mut buf = Vec::new(); let mut reader = Reader::from_str($xml); + assert_eq!( + reader + .read_event_into_async(&mut buf) + .await + .expect("parser should return `Event::Text`"), + Event::Text(BytesText::new(".")) + ); match reader.read_event_into_async(&mut buf).await { Err(Error::Syntax(cause)) => assert_eq!( (cause, reader.error_position(), reader.buffer_position()), - ($cause, 0, $pos), + ($cause, 1, $pos), ), x => panic!("Expected `Err(Syntax(_))`, but got {:?}", x), } @@ -164,10 +183,17 @@ mod syntax { #[test] fn borrowed() { let mut reader = NsReader::from_str($xml); + assert_eq!( + reader + .read_resolved_event() + .expect("parser should return `Event::Text`") + .1, + Event::Text(BytesText::new(".")) + ); match reader.read_resolved_event() { Err(Error::Syntax(cause)) => assert_eq!( (cause, reader.error_position(), reader.buffer_position()), - ($cause, 0, $pos), + ($cause, 1, $pos), ), x => panic!("Expected `Err(Syntax(_))`, but got {:?}", x), } @@ -184,10 +210,17 @@ mod syntax { fn buffered() { let mut buf = Vec::new(); let mut reader = NsReader::from_str($xml); + assert_eq!( + reader + .read_resolved_event_into(&mut buf) + .expect("parser should return `Event::Text`") + .1, + Event::Text(BytesText::new(".")) + ); match reader.read_resolved_event_into(&mut buf) { Err(Error::Syntax(cause)) => assert_eq!( (cause, reader.error_position(), reader.buffer_position()), - ($cause, 0, $pos), + ($cause, 1, $pos), ), x => panic!("Expected `Err(Syntax(_))`, but got {:?}", x), } @@ -205,10 +238,18 @@ mod syntax { async fn async_tokio() { let mut buf = Vec::new(); let mut reader = NsReader::from_str($xml); + assert_eq!( + reader + .read_resolved_event_into_async(&mut buf) + .await + .expect("parser should return `Event::Text`") + .1, + Event::Text(BytesText::new(".")) + ); match reader.read_resolved_event_into_async(&mut buf).await { Err(Error::Syntax(cause)) => assert_eq!( (cause, reader.error_position(), reader.buffer_position()), - ($cause, 0, $pos), + ($cause, 1, $pos), ), x => panic!("Expected `Err(Syntax(_))`, but got {:?}", x), } @@ -232,14 +273,14 @@ mod syntax { mod tag { use super::*; - err!(unclosed1("<") => SyntaxError::UnclosedTag); - err!(unclosed2(" SyntaxError::UnclosedTag); - err!(unclosed3(" SyntaxError::UnclosedTag); - err!(unclosed4("< ") => SyntaxError::UnclosedTag); - err!(unclosed5("<\t") => SyntaxError::UnclosedTag); - err!(unclosed6("<\r") => SyntaxError::UnclosedTag); - err!(unclosed7("<\n") => SyntaxError::UnclosedTag); - err!(unclosed8("< \t\r\nx") => SyntaxError::UnclosedTag); + err!(unclosed1(".<") => SyntaxError::UnclosedTag); + err!(unclosed2(". SyntaxError::UnclosedTag); + err!(unclosed3(". SyntaxError::UnclosedTag); + err!(unclosed4(".< ") => SyntaxError::UnclosedTag); + err!(unclosed5(".<\t") => SyntaxError::UnclosedTag); + err!(unclosed6(".<\r") => SyntaxError::UnclosedTag); + err!(unclosed7(".<\n") => SyntaxError::UnclosedTag); + err!(unclosed8(".< \t\r\nx") => SyntaxError::UnclosedTag); /// Closed tags can be tested only in pair with open tags, because otherwise /// `IllFormedError::UnmatchedEndTag` will be raised @@ -289,25 +330,25 @@ mod syntax { } // Incorrect after-bang symbol is detected early, so buffer_position() stay at `!` - err!(unclosed_bang1(" 1, SyntaxError::InvalidBangMarkup); - err!(unclosed_bang2("") => 1, SyntaxError::InvalidBangMarkup); - err!(unclosed_bang3(" 1, SyntaxError::InvalidBangMarkup); - err!(unclosed_bang4("") => 1, SyntaxError::InvalidBangMarkup); + err!(unclosed_bang1(". 2, SyntaxError::InvalidBangMarkup); + err!(unclosed_bang2(".") => 2, SyntaxError::InvalidBangMarkup); + err!(unclosed_bang3(". 2, SyntaxError::InvalidBangMarkup); + err!(unclosed_bang4(".") => 2, SyntaxError::InvalidBangMarkup); /// https://www.w3.org/TR/xml11/#NT-Comment mod comment { use super::*; - err!(unclosed01(" SyntaxError::UnclosedComment); - err!(unclosed02("") => SyntaxError::UnclosedComment); - err!(unclosed07("") => SyntaxError::UnclosedComment); - err!(unclosed10("") => SyntaxError::UnclosedComment); + err!(unclosed07(".") => SyntaxError::UnclosedComment); + err!(unclosed10(".") => 7: Event::Comment(BytesText::new(""))); ok!(normal2("rest") => 7: Event::Comment(BytesText::new(""))); @@ -317,33 +358,33 @@ mod syntax { mod cdata { use super::*; - err!(unclosed01(" SyntaxError::UnclosedCData); - err!(unclosed02(" SyntaxError::UnclosedCData); - err!(unclosed03(" SyntaxError::UnclosedCData); - err!(unclosed04("") => SyntaxError::UnclosedCData); - err!(unclosed05(" SyntaxError::UnclosedCData); - err!(unclosed06(" SyntaxError::UnclosedCData); - err!(unclosed07("") => SyntaxError::UnclosedCData); - err!(unclosed08(" SyntaxError::UnclosedCData); - err!(unclosed09(" SyntaxError::UnclosedCData); - err!(unclosed10("") => SyntaxError::UnclosedCData); - err!(unclosed11(" SyntaxError::UnclosedCData); - err!(unclosed12(" SyntaxError::UnclosedCData); - err!(unclosed13("") => SyntaxError::UnclosedCData); - err!(unclosed14(" SyntaxError::UnclosedCData); - err!(unclosed15(" SyntaxError::UnclosedCData); - err!(unclosed16("") => SyntaxError::UnclosedCData); - err!(unclosed17(" SyntaxError::UnclosedCData); - err!(unclosed18(" SyntaxError::UnclosedCData); - err!(unclosed19("") => SyntaxError::UnclosedCData); - err!(unclosed20(" SyntaxError::UnclosedCData); - err!(unclosed21(" SyntaxError::UnclosedCData); - err!(unclosed22("") => SyntaxError::UnclosedCData); - err!(unclosed23(" SyntaxError::UnclosedCData); - err!(unclosed24(" SyntaxError::UnclosedCData); - err!(unclosed25("") => SyntaxError::UnclosedCData); - - err!(lowercase("") => SyntaxError::UnclosedCData); + err!(unclosed01(". SyntaxError::UnclosedCData); + err!(unclosed02(". SyntaxError::UnclosedCData); + err!(unclosed03(". SyntaxError::UnclosedCData); + err!(unclosed04(".") => SyntaxError::UnclosedCData); + err!(unclosed05(". SyntaxError::UnclosedCData); + err!(unclosed06(". SyntaxError::UnclosedCData); + err!(unclosed07(".") => SyntaxError::UnclosedCData); + err!(unclosed08(". SyntaxError::UnclosedCData); + err!(unclosed09(". SyntaxError::UnclosedCData); + err!(unclosed10(".") => SyntaxError::UnclosedCData); + err!(unclosed11(". SyntaxError::UnclosedCData); + err!(unclosed12(". SyntaxError::UnclosedCData); + err!(unclosed13(".") => SyntaxError::UnclosedCData); + err!(unclosed14(". SyntaxError::UnclosedCData); + err!(unclosed15(". SyntaxError::UnclosedCData); + err!(unclosed16(".") => SyntaxError::UnclosedCData); + err!(unclosed17(". SyntaxError::UnclosedCData); + err!(unclosed18(". SyntaxError::UnclosedCData); + err!(unclosed19(".") => SyntaxError::UnclosedCData); + err!(unclosed20(". SyntaxError::UnclosedCData); + err!(unclosed21(". SyntaxError::UnclosedCData); + err!(unclosed22(".") => SyntaxError::UnclosedCData); + err!(unclosed23(". SyntaxError::UnclosedCData); + err!(unclosed24(". SyntaxError::UnclosedCData); + err!(unclosed25(".") => SyntaxError::UnclosedCData); + + err!(lowercase(".") => SyntaxError::UnclosedCData); ok!(normal1("") => 12: Event::CData(BytesCData::new(""))); ok!(normal2("rest") => 12: Event::CData(BytesCData::new(""))); @@ -355,29 +396,29 @@ mod syntax { mod doctype { use super::*; - err!(unclosed01(" SyntaxError::UnclosedDoctype); - err!(unclosed02(" SyntaxError::UnclosedDoctype); - err!(unclosed03(" SyntaxError::UnclosedDoctype); - err!(unclosed04("") => SyntaxError::UnclosedDoctype); - err!(unclosed05(" SyntaxError::UnclosedDoctype); - err!(unclosed06(" SyntaxError::UnclosedDoctype); - err!(unclosed07("") => SyntaxError::UnclosedDoctype); - err!(unclosed08(" SyntaxError::UnclosedDoctype); - err!(unclosed09(" SyntaxError::UnclosedDoctype); - err!(unclosed10("") => SyntaxError::UnclosedDoctype); - err!(unclosed11(" SyntaxError::UnclosedDoctype); - err!(unclosed12(" SyntaxError::UnclosedDoctype); - err!(unclosed13("") => SyntaxError::UnclosedDoctype); - err!(unclosed14(" SyntaxError::UnclosedDoctype); - err!(unclosed15(" SyntaxError::UnclosedDoctype); - err!(unclosed16("") => SyntaxError::UnclosedDoctype); - err!(unclosed17(" SyntaxError::UnclosedDoctype); - err!(unclosed18(" SyntaxError::UnclosedDoctype); - err!(unclosed19("") => SyntaxError::UnclosedDoctype); - err!(unclosed20(" SyntaxError::UnclosedDoctype); - err!(unclosed21(" SyntaxError::UnclosedDoctype); + err!(unclosed01(". SyntaxError::UnclosedDoctype); + err!(unclosed02(". SyntaxError::UnclosedDoctype); + err!(unclosed03(". SyntaxError::UnclosedDoctype); + err!(unclosed04(".") => SyntaxError::UnclosedDoctype); + err!(unclosed05(". SyntaxError::UnclosedDoctype); + err!(unclosed06(". SyntaxError::UnclosedDoctype); + err!(unclosed07(".") => SyntaxError::UnclosedDoctype); + err!(unclosed08(". SyntaxError::UnclosedDoctype); + err!(unclosed09(". SyntaxError::UnclosedDoctype); + err!(unclosed10(".") => SyntaxError::UnclosedDoctype); + err!(unclosed11(". SyntaxError::UnclosedDoctype); + err!(unclosed12(". SyntaxError::UnclosedDoctype); + err!(unclosed13(".") => SyntaxError::UnclosedDoctype); + err!(unclosed14(". SyntaxError::UnclosedDoctype); + err!(unclosed15(". SyntaxError::UnclosedDoctype); + err!(unclosed16(".") => SyntaxError::UnclosedDoctype); + err!(unclosed17(". SyntaxError::UnclosedDoctype); + err!(unclosed18(". SyntaxError::UnclosedDoctype); + err!(unclosed19(".") => SyntaxError::UnclosedDoctype); + err!(unclosed20(". SyntaxError::UnclosedDoctype); + err!(unclosed21(". SyntaxError::UnclosedDoctype); // results in IllFormed(MissingDoctypeName), checked below - err!(unclosed22(" SyntaxError::UnclosedDoctype); + err!(unclosed22(". SyntaxError::UnclosedDoctype); // According to the grammar, XML declaration MUST contain at least one space // and an element name, but we do not consider this as a _syntax_ error. @@ -389,16 +430,16 @@ mod syntax { mod pi { use super::*; - err!(unclosed1(" SyntaxError::UnclosedPIOrXmlDecl); - err!(unclosed2(" SyntaxError::UnclosedPIOrXmlDecl); - err!(unclosed3("") => SyntaxError::UnclosedPIOrXmlDecl); - err!(unclosed4(" SyntaxError::UnclosedPIOrXmlDecl); - err!(unclosed5(" SyntaxError::UnclosedPIOrXmlDecl); - err!(unclosed6(" SyntaxError::UnclosedPIOrXmlDecl); - err!(unclosed7(" SyntaxError::UnclosedPIOrXmlDecl); - err!(unclosed8(" SyntaxError::UnclosedPIOrXmlDecl); - err!(unclosed9(" SyntaxError::UnclosedPIOrXmlDecl); - err!(unclosed10(" SyntaxError::UnclosedPIOrXmlDecl); + err!(unclosed01(". SyntaxError::UnclosedPIOrXmlDecl); + err!(unclosed02(". SyntaxError::UnclosedPIOrXmlDecl); + err!(unclosed03(".") => SyntaxError::UnclosedPIOrXmlDecl); + err!(unclosed04(". SyntaxError::UnclosedPIOrXmlDecl); + err!(unclosed05(". SyntaxError::UnclosedPIOrXmlDecl); + err!(unclosed06(". SyntaxError::UnclosedPIOrXmlDecl); + err!(unclosed07(". SyntaxError::UnclosedPIOrXmlDecl); + err!(unclosed08(". SyntaxError::UnclosedPIOrXmlDecl); + err!(unclosed09(". SyntaxError::UnclosedPIOrXmlDecl); + err!(unclosed10(". SyntaxError::UnclosedPIOrXmlDecl); // According to the grammar, processing instruction MUST contain a non-empty // target name, but we do not consider this as a _syntax_ error. @@ -412,10 +453,10 @@ mod syntax { mod decl { use super::*; - err!(unclosed1(" SyntaxError::UnclosedPIOrXmlDecl); - err!(unclosed2(" SyntaxError::UnclosedPIOrXmlDecl); - err!(unclosed3(" SyntaxError::UnclosedPIOrXmlDecl); - err!(unclosed4(" SyntaxError::UnclosedPIOrXmlDecl); + err!(unclosed1(". SyntaxError::UnclosedPIOrXmlDecl); + err!(unclosed2(". SyntaxError::UnclosedPIOrXmlDecl); + err!(unclosed3(". SyntaxError::UnclosedPIOrXmlDecl); + err!(unclosed4(". SyntaxError::UnclosedPIOrXmlDecl); // According to the grammar, XML declaration MUST contain at least one space // and `version` attribute, but we do not consider this as a _syntax_ error. @@ -827,9 +868,9 @@ mod ill_formed { // ^= 13 ok!(missing_doctype_name3("") => 15: Event::DocType(BytesText::new("x"))); - err!(unmatched_end_tag1("") => 0: IllFormedError::UnmatchedEndTag("".to_string())); - err!(unmatched_end_tag2("") => 0: IllFormedError::UnmatchedEndTag("end".to_string())); - err!(unmatched_end_tag3("") => 0: IllFormedError::UnmatchedEndTag("end".to_string())); + err2!(unmatched_end_tag1(".") => 1: IllFormedError::UnmatchedEndTag("".to_string())); + err2!(unmatched_end_tag2(".") => 1: IllFormedError::UnmatchedEndTag("end".to_string())); + err2!(unmatched_end_tag3(".") => 1: IllFormedError::UnmatchedEndTag("end".to_string())); ok!(mismatched_end_tag1("") => 7: Event::Start(BytesStart::new("start"))); err2!(mismatched_end_tag2("") => 7: IllFormedError::MismatchedEndTag { From df65be07e401d759ed9e8faf211bc20f7e8fbf53 Mon Sep 17 00:00:00 2001 From: Mingun Date: Thu, 4 Jul 2024 20:55:12 +0500 Subject: [PATCH 08/10] Fix incorrect `.error_position()` when encountering syntax error for open or self-closed tag --- Changelog.md | 1 + src/reader/mod.rs | 12 ++++++++---- 2 files changed, 9 insertions(+), 4 deletions(-) diff --git a/Changelog.md b/Changelog.md index c15a19fd..35b51fc1 100644 --- a/Changelog.md +++ b/Changelog.md @@ -19,6 +19,7 @@ - [#781]: Fix conditions to start CDATA section. Only uppercase ` $self.state.emit_bang(bang_type, bytes), Err(e) => { - // Ok($self.state.emit_start(bytes)), - Err(e) => Err(e), + Err(e) => { + // We want to report error at `<`, but offset was increased, + // so return it back (-1 for `<`) + $self.state.last_error_offset = start - 1; + Err(e) + } }, // `<` - syntax error, tag not closed Ok(None) => { From 45e8be4eab0fb91a532d59cb017a039a74ab6262 Mon Sep 17 00:00:00 2001 From: Mingun Date: Sat, 6 Jul 2024 20:57:03 +0500 Subject: [PATCH 09/10] Use `.error_position()` instead of `.buffer_position()` in examples where error position is reported `.error_position()` is intended to report position where error start occurring, `.buffer_position()` is intended to show to what position reader read data tests/issues.rs leave untouched because this is the code from real problems in GH issues, it should remain as close to issue as possible --- README.md | 4 ++-- examples/custom_entities.rs | 2 +- examples/read_buffered.rs | 2 +- examples/read_texts.rs | 2 +- src/reader/async_tokio.rs | 2 +- src/reader/buffered_reader.rs | 2 +- src/reader/mod.rs | 2 +- src/writer.rs | 2 +- tests/reader.rs | 4 ++-- 9 files changed, 11 insertions(+), 11 deletions(-) diff --git a/README.md b/README.md index 058ab8cc..92804a06 100644 --- a/README.md +++ b/README.md @@ -40,7 +40,7 @@ loop { // when the input is a &str or a &[u8], we don't actually need to use another // buffer, we could directly call `reader.read_event()` match reader.read_event_into(&mut buf) { - Err(e) => panic!("Error at position {}: {:?}", reader.buffer_position(), e), + Err(e) => panic!("Error at position {}: {:?}", reader.error_position(), e), // exits the loop when reaching end of file Ok(Event::Eof) => break, @@ -98,7 +98,7 @@ loop { Ok(Event::Eof) => break, // we can either move or borrow the event to write, depending on your use-case Ok(e) => assert!(writer.write_event(e).is_ok()), - Err(e) => panic!("Error at position {}: {:?}", reader.buffer_position(), e), + Err(e) => panic!("Error at position {}: {:?}", reader.error_position(), e), } } diff --git a/examples/custom_entities.rs b/examples/custom_entities.rs index e68a6824..37d172ac 100644 --- a/examples/custom_entities.rs +++ b/examples/custom_entities.rs @@ -68,7 +68,7 @@ fn main() -> Result<(), Box> { ); } Ok(Event::Eof) => break, - Err(e) => panic!("Error at position {}: {:?}", reader.buffer_position(), e), + Err(e) => panic!("Error at position {}: {:?}", reader.error_position(), e), _ => (), } } diff --git a/examples/read_buffered.rs b/examples/read_buffered.rs index 4f0a20ec..d5e4cb4a 100644 --- a/examples/read_buffered.rs +++ b/examples/read_buffered.rs @@ -23,7 +23,7 @@ fn main() -> Result<(), quick_xml::Error> { count += 1; } Ok(Event::Eof) => break, // exits the loop when reaching end of file - Err(e) => panic!("Error at position {}: {:?}", reader.buffer_position(), e), + Err(e) => panic!("Error at position {}: {:?}", reader.error_position(), e), _ => (), // There are several other `Event`s we do not consider here } } diff --git a/examples/read_texts.rs b/examples/read_texts.rs index 666d1bdb..95ef0978 100644 --- a/examples/read_texts.rs +++ b/examples/read_texts.rs @@ -18,7 +18,7 @@ fn main() { println!("{:?}", txt); } Ok(Event::Eof) => break, // exits the loop when reaching end of file - Err(e) => panic!("Error at position {}: {:?}", reader.buffer_position(), e), + Err(e) => panic!("Error at position {}: {:?}", reader.error_position(), e), _ => (), // There are several other `Event`s we do not consider here } } diff --git a/src/reader/async_tokio.rs b/src/reader/async_tokio.rs index 7b8fa688..2e75b43f 100644 --- a/src/reader/async_tokio.rs +++ b/src/reader/async_tokio.rs @@ -58,7 +58,7 @@ impl Reader { /// match reader.read_event_into_async(&mut buf).await { /// Ok(Event::Start(_)) => count += 1, /// Ok(Event::Text(e)) => txt.push(e.unescape().unwrap().into_owned()), - /// Err(e) => panic!("Error at position {}: {:?}", reader.buffer_position(), e), + /// Err(e) => panic!("Error at position {}: {:?}", reader.error_position(), e), /// Ok(Event::Eof) => break, /// _ => (), /// } diff --git a/src/reader/buffered_reader.rs b/src/reader/buffered_reader.rs index 548eca92..c8af425e 100644 --- a/src/reader/buffered_reader.rs +++ b/src/reader/buffered_reader.rs @@ -328,7 +328,7 @@ impl Reader { /// match reader.read_event_into(&mut buf) { /// Ok(Event::Start(_)) => count += 1, /// Ok(Event::Text(e)) => txt.push(e.unescape().unwrap().into_owned()), - /// Err(e) => panic!("Error at position {}: {:?}", reader.buffer_position(), e), + /// Err(e) => panic!("Error at position {}: {:?}", reader.error_position(), e), /// Ok(Event::Eof) => break, /// _ => (), /// } diff --git a/src/reader/mod.rs b/src/reader/mod.rs index 1cdf9030..46a30e86 100644 --- a/src/reader/mod.rs +++ b/src/reader/mod.rs @@ -589,7 +589,7 @@ impl EncodingRef { /// // when the input is a &str or a &[u8], we don't actually need to use another /// // buffer, we could directly call `reader.read_event()` /// match reader.read_event_into(&mut buf) { -/// Err(e) => panic!("Error at position {}: {:?}", reader.buffer_position(), e), +/// Err(e) => panic!("Error at position {}: {:?}", reader.error_position(), e), /// // exits the loop when reaching end of file /// Ok(Event::Eof) => break, /// diff --git a/src/writer.rs b/src/writer.rs index 81b7df34..5e16590f 100644 --- a/src/writer.rs +++ b/src/writer.rs @@ -52,7 +52,7 @@ use {crate::de::DeError, serde::Serialize}; /// Ok(Event::Eof) => break, /// // we can either move or borrow the event to write, depending on your use-case /// Ok(e) => assert!(writer.write_event(e.borrow()).is_ok()), -/// Err(e) => panic!("Error at position {}: {:?}", reader.buffer_position(), e), +/// Err(e) => panic!("Error at position {}: {:?}", reader.error_position(), e), /// } /// } /// diff --git a/tests/reader.rs b/tests/reader.rs index 4921cd87..2bc27e57 100644 --- a/tests/reader.rs +++ b/tests/reader.rs @@ -175,7 +175,7 @@ fn test_escaped_content() { Ok(c) => assert_eq!(c, ""), Err(e) => panic!( "cannot escape content at position {}: {:?}", - r.buffer_position(), + r.error_position(), e ), } @@ -183,7 +183,7 @@ fn test_escaped_content() { Ok(e) => panic!("Expecting text event, got {:?}", e), Err(e) => panic!( "Cannot get next event at position {}: {:?}", - r.buffer_position(), + r.error_position(), e ), } From 6a48a28f4cc182112d9005f66f8ab4336ace3293 Mon Sep 17 00:00:00 2001 From: Mingun Date: Wed, 3 Jul 2024 20:03:05 +0500 Subject: [PATCH 10/10] Allow to have attributes in closing tags (compatibility with the Adobe Flash parser) --- Changelog.md | 2 + src/reader/buffered_reader.rs | 48 ------- src/reader/mod.rs | 235 +++++++++++++--------------------- src/reader/slice_reader.rs | 23 ---- tests/issues.rs | 27 ++++ 5 files changed, 117 insertions(+), 218 deletions(-) diff --git a/Changelog.md b/Changelog.md index 35b51fc1..c36ddad8 100644 --- a/Changelog.md +++ b/Changelog.md @@ -24,7 +24,9 @@ ### Misc Changes - [#780]: `reader::Parser`, `reader::ElementParser` and `reader::PiParser` moved to the new module `parser`. +- [#776]: Allow to have attributes in the end tag for compatibility reasons with Adobe Flash XML parser. +[#776]: https://github.com/tafia/quick-xml/issues/776 [#780]: https://github.com/tafia/quick-xml/pull/780 [#781]: https://github.com/tafia/quick-xml/pull/781 diff --git a/src/reader/buffered_reader.rs b/src/reader/buffered_reader.rs index c8af425e..95ac1dc7 100644 --- a/src/reader/buffered_reader.rs +++ b/src/reader/buffered_reader.rs @@ -101,54 +101,6 @@ macro_rules! impl_buffered_source { ReadTextResult::UpToEof(&buf[start..]) } - #[inline] - $($async)? fn read_bytes_until $(<$lf>)? ( - &mut self, - byte: u8, - buf: &'b mut Vec, - position: &mut u64, - ) -> io::Result<(&'b [u8], bool)> { - // search byte must be within the ascii range - debug_assert!(byte.is_ascii()); - - let mut read = 0; - let start = buf.len(); - loop { - let available = match self $(.$reader)? .fill_buf() $(.$await)? { - Ok(n) if n.is_empty() => break, - Ok(n) => n, - Err(ref e) if e.kind() == io::ErrorKind::Interrupted => continue, - Err(e) => { - *position += read; - return Err(e); - } - }; - - match memchr::memchr(byte, available) { - Some(i) => { - buf.extend_from_slice(&available[..i]); - - let used = i + 1; - self $(.$reader)? .consume(used); - read += used as u64; - - *position += read; - return Ok((&buf[start..], true)); - } - None => { - buf.extend_from_slice(available); - - let used = available.len(); - self $(.$reader)? .consume(used); - read += used as u64; - } - } - } - - *position += read; - Ok((&buf[start..], false)) - } - #[inline] $($async)? fn read_with<$($lf,)? P: Parser>( &mut self, diff --git a/src/reader/mod.rs b/src/reader/mod.rs index 46a30e86..6e030b73 100644 --- a/src/reader/mod.rs +++ b/src/reader/mod.rs @@ -345,18 +345,26 @@ macro_rules! read_until_close { } }, // `` we will parse `` as end tag + // `` which probably no one existing parser + // does. This is malformed XML, however it is tolerated by some parsers + // (e.g. the one used by Adobe Flash) and such documents do exist in the wild. Ok(Some(b'/')) => match $reader - .read_bytes_until(b'>', $buf, &mut $self.state.offset) + .read_with(ElementParser::Outside, $buf, &mut $self.state.offset) $(.$await)? { - Ok((bytes, true)) => $self.state.emit_end(bytes), - Ok((_, false)) => { + Ok(bytes) => $self.state.emit_end(bytes), + Err(e) => { // We want to report error at `<`, but offset was increased, // so return it back (-1 for `<`) $self.state.last_error_offset = start - 1; - Err(Error::Syntax(SyntaxError::UnclosedTag)) + Err(e) } - Err(e) => Err(Error::Io(e.into())), }, // ` match $reader @@ -824,39 +832,6 @@ trait XmlSource<'r, B> { /// [events]: crate::events::Event fn read_text(&mut self, buf: B, position: &mut u64) -> ReadTextResult<'r, B>; - /// Read input until `byte` is found or end of input is reached. - /// - /// Returns a slice of data read up to `byte` (exclusive), - /// and a flag noting whether `byte` was found in the input or not. - /// - /// # Example - /// - /// ```ignore - /// let mut position = 0; - /// let mut input = b"abc*def".as_ref(); - /// // ^= 4 - /// - /// assert_eq!( - /// input.read_bytes_until(b'*', (), &mut position).unwrap(), - /// (b"abc".as_ref(), true) - /// ); - /// assert_eq!(position, 4); // position after the symbol matched - /// ``` - /// - /// # Parameters - /// - `byte`: Byte for search - /// - `buf`: Buffer that could be filled from an input (`Self`) and - /// from which [events] could borrow their data - /// - `position`: Will be increased by amount of bytes consumed - /// - /// [events]: crate::events::Event - fn read_bytes_until( - &mut self, - byte: u8, - buf: B, - position: &mut u64, - ) -> io::Result<(&'r [u8], bool)>; - /// Read input until processing instruction is finished. /// /// This method expect that start sequence of a parser already was read. @@ -1022,115 +997,6 @@ mod test { $buf:expr $(, $async:ident, $await:ident)? ) => { - mod read_bytes_until { - use super::*; - // Use Bytes for printing bytes as strings for ASCII range - use crate::utils::Bytes; - use pretty_assertions::assert_eq; - - /// Checks that search in the empty buffer returns `None` - #[$test] - $($async)? fn empty() { - let buf = $buf; - let mut position = 0; - let mut input = b"".as_ref(); - // ^= 0 - - let (bytes, found) = $source(&mut input) - .read_bytes_until(b'*', buf, &mut position) - $(.$await)? - .unwrap(); - assert_eq!( - (Bytes(bytes), found), - (Bytes(b""), false) - ); - assert_eq!(position, 0); - } - - /// Checks that search in the buffer non-existent value returns entire buffer - /// as a result and set `position` to `len()` - #[$test] - $($async)? fn non_existent() { - let buf = $buf; - let mut position = 0; - let mut input = b"abcdef".as_ref(); - // ^= 6 - - let (bytes, found) = $source(&mut input) - .read_bytes_until(b'*', buf, &mut position) - $(.$await)? - .unwrap(); - assert_eq!( - (Bytes(bytes), found), - (Bytes(b"abcdef"), false) - ); - assert_eq!(position, 6); - } - - /// Checks that search in the buffer an element that is located in the front of - /// buffer returns empty slice as a result and set `position` to one symbol - /// after match (`1`) - #[$test] - $($async)? fn at_the_start() { - let buf = $buf; - let mut position = 0; - let mut input = b"*abcdef".as_ref(); - // ^= 1 - - let (bytes, found) = $source(&mut input) - .read_bytes_until(b'*', buf, &mut position) - $(.$await)? - .unwrap(); - assert_eq!( - (Bytes(bytes), found), - (Bytes(b""), true) - ); - assert_eq!(position, 1); // position after the symbol matched - } - - /// Checks that search in the buffer an element that is located in the middle of - /// buffer returns slice before that symbol as a result and set `position` to one - /// symbol after match - #[$test] - $($async)? fn inside() { - let buf = $buf; - let mut position = 0; - let mut input = b"abc*def".as_ref(); - // ^= 4 - - let (bytes, found) = $source(&mut input) - .read_bytes_until(b'*', buf, &mut position) - $(.$await)? - .unwrap(); - assert_eq!( - (Bytes(bytes), found), - (Bytes(b"abc"), true) - ); - assert_eq!(position, 4); // position after the symbol matched - } - - /// Checks that search in the buffer an element that is located in the end of - /// buffer returns slice before that symbol as a result and set `position` to one - /// symbol after match (`len()`) - #[$test] - $($async)? fn in_the_end() { - let buf = $buf; - let mut position = 0; - let mut input = b"abcdef*".as_ref(); - // ^= 7 - - let (bytes, found) = $source(&mut input) - .read_bytes_until(b'*', buf, &mut position) - $(.$await)? - .unwrap(); - assert_eq!( - (Bytes(bytes), found), - (Bytes(b"abcdef"), true) - ); - assert_eq!(position, 7); // position after the symbol matched - } - } - mod read_bang_element { use super::*; use crate::errors::{Error, SyntaxError}; @@ -1693,6 +1559,81 @@ mod test { assert_eq!(position, 42); } } + + mod close { + use super::*; + use pretty_assertions::assert_eq; + + #[$test] + $($async)? fn empty_tag() { + let buf = $buf; + let mut position = 1; + let mut input = b"/ >".as_ref(); + // ^= 4 + + assert_eq!( + Bytes($source(&mut input).read_with(ElementParser::default(), buf, &mut position) $(.$await)? .unwrap()), + Bytes(b"/ ") + ); + assert_eq!(position, 4); + } + + #[$test] + $($async)? fn normal() { + let buf = $buf; + let mut position = 1; + let mut input = b"/tag>".as_ref(); + // ^= 6 + + assert_eq!( + Bytes($source(&mut input).read_with(ElementParser::default(), buf, &mut position) $(.$await)? .unwrap()), + Bytes(b"/tag") + ); + assert_eq!(position, 6); + } + + #[$test] + $($async)? fn empty_ns_empty_tag() { + let buf = $buf; + let mut position = 1; + let mut input = b"/:>".as_ref(); + // ^= 4 + + assert_eq!( + Bytes($source(&mut input).read_with(ElementParser::default(), buf, &mut position) $(.$await)? .unwrap()), + Bytes(b"/:") + ); + assert_eq!(position, 4); + } + + #[$test] + $($async)? fn empty_ns() { + let buf = $buf; + let mut position = 1; + let mut input = b"/:tag>".as_ref(); + // ^= 7 + + assert_eq!( + Bytes($source(&mut input).read_with(ElementParser::default(), buf, &mut position) $(.$await)? .unwrap()), + Bytes(b"/:tag") + ); + assert_eq!(position, 7); + } + + #[$test] + $($async)? fn with_attributes() { + let buf = $buf; + let mut position = 1; + let mut input = br#"/tag attr-1=">" attr2 = '>' 3attr>"#.as_ref(); + // ^= 40 + + assert_eq!( + Bytes($source(&mut input).read_with(ElementParser::default(), buf, &mut position) $(.$await)? .unwrap()), + Bytes(br#"/tag attr-1=">" attr2 = '>' 3attr"#) + ); + assert_eq!(position, 40); + } + } } /// Ensures, that no empty `Text` events are generated diff --git a/src/reader/slice_reader.rs b/src/reader/slice_reader.rs index 1afc3e36..6b4c2804 100644 --- a/src/reader/slice_reader.rs +++ b/src/reader/slice_reader.rs @@ -284,29 +284,6 @@ impl<'a> XmlSource<'a, ()> for &'a [u8] { } } - #[inline] - fn read_bytes_until( - &mut self, - byte: u8, - _buf: (), - position: &mut u64, - ) -> io::Result<(&'a [u8], bool)> { - // search byte must be within the ascii range - debug_assert!(byte.is_ascii()); - - if let Some(i) = memchr::memchr(byte, self) { - *position += i as u64 + 1; - let bytes = &self[..i]; - *self = &self[i + 1..]; - Ok((bytes, true)) - } else { - *position += self.len() as u64; - let bytes = &self[..]; - *self = &[]; - Ok((bytes, false)) - } - } - #[inline] fn read_with

(&mut self, mut parser: P, _buf: (), position: &mut u64) -> Result<&'a [u8]> where diff --git a/tests/issues.rs b/tests/issues.rs index e4a75696..c14f09c6 100644 --- a/tests/issues.rs +++ b/tests/issues.rs @@ -364,3 +364,30 @@ fn issue774() { Event::End(BytesEnd::new("tag")) ); } + +/// Regression test for https://github.com/tafia/quick-xml/issues/776 +#[test] +fn issue776() { + let mut reader = Reader::from_str(r#""#); + // We still think that the name of the end tag is everything between `` + // and if we do not disable this check we get error + reader.config_mut().check_end_names = false; + + assert_eq!( + reader.read_event().unwrap(), + Event::Start(BytesStart::new("tag")) + ); + assert_eq!( + reader.read_event().unwrap(), + Event::End(BytesEnd::new("tag/")) + ); + + assert_eq!( + reader.read_event().unwrap(), + Event::Start(BytesStart::new("tag")) + ); + assert_eq!( + reader.read_event().unwrap(), + Event::End(BytesEnd::new(r#"tag attr=">""#)) + ); +}