From 1b2b4be1da1294b4703ab5166c6ce60fe11b7886 Mon Sep 17 00:00:00 2001 From: Mike Dalessio Date: Thu, 9 Mar 2023 13:42:37 -0500 Subject: [PATCH 1/4] fix: Schema.from_document defensively copies the document when there are blank text node objects. previously we raised an exception. --- ext/nokogiri/nokogiri.h | 1 + ext/nokogiri/xml_document.c | 27 ++++++++++++++++++++++ ext/nokogiri/xml_schema.c | 45 +++++++++++-------------------------- test/xml/test_schema.rb | 25 +++++++++++---------- 4 files changed, 54 insertions(+), 44 deletions(-) diff --git a/ext/nokogiri/nokogiri.h b/ext/nokogiri/nokogiri.h index c380b8f3a3..e6dc5721fc 100644 --- a/ext/nokogiri/nokogiri.h +++ b/ext/nokogiri/nokogiri.h @@ -168,6 +168,7 @@ typedef struct _nokogiriXsltStylesheetTuple { void noko_xml_document_pin_node(xmlNodePtr); void noko_xml_document_pin_namespace(xmlNsPtr, xmlDocPtr); +int noko_xml_document_has_wrapped_blank_nodes_p(xmlDocPtr c_document); int noko_io_read(void *ctx, char *buffer, int len); int noko_io_write(void *ctx, char *buffer, int len); diff --git a/ext/nokogiri/xml_document.c b/ext/nokogiri/xml_document.c index 93bdc45183..4fa2473994 100644 --- a/ext/nokogiri/xml_document.c +++ b/ext/nokogiri/xml_document.c @@ -685,6 +685,33 @@ noko_xml_document_unwrap(VALUE rb_document) return c_document; } +/* Schema creation will remove and deallocate "blank" nodes. + * If those blank nodes have been exposed to Ruby, they could get freed + * out from under the VALUE pointer. This function checks to see if any of + * those nodes have been exposed to Ruby, and if so we should raise an exception. + */ +int +noko_xml_document_has_wrapped_blank_nodes_p(xmlDocPtr c_document) +{ + VALUE cache = DOC_NODE_CACHE(c_document); + + if (NIL_P(cache)) { + return 0; + } + + for (long jnode = 0; jnode < RARRAY_LEN(cache); jnode++) { + xmlNodePtr node; + VALUE element = rb_ary_entry(cache, jnode); + + Noko_Node_Get_Struct(element, xmlNode, node); + if (xmlIsBlankNode(node)) { + return 1; + } + } + + return 0; +} + void noko_xml_document_pin_node(xmlNodePtr node) { diff --git a/ext/nokogiri/xml_schema.c b/ext/nokogiri/xml_schema.c index 9e5b5e8cf1..5aeecb1e6f 100644 --- a/ext/nokogiri/xml_schema.c +++ b/ext/nokogiri/xml_schema.c @@ -191,32 +191,6 @@ read_memory(int argc, VALUE *argv, VALUE klass) return xml_schema_parse_schema(klass, c_parser_context, rb_parse_options); } -/* Schema creation will remove and deallocate "blank" nodes. - * If those blank nodes have been exposed to Ruby, they could get freed - * out from under the VALUE pointer. This function checks to see if any of - * those nodes have been exposed to Ruby, and if so we should raise an exception. - */ -static int -has_blank_nodes_p(VALUE cache) -{ - long i; - - if (NIL_P(cache)) { - return 0; - } - - for (i = 0; i < RARRAY_LEN(cache); i++) { - xmlNodePtr node; - VALUE element = rb_ary_entry(cache, i); - Noko_Node_Get_Struct(element, xmlNode, node); - if (xmlIsBlankNode(node)) { - return 1; - } - } - - return 0; -} - /* * call-seq: * from_document(document) → Nokogiri::XML::Schema @@ -233,8 +207,10 @@ from_document(int argc, VALUE *argv, VALUE klass) { VALUE rb_document; VALUE rb_parse_options; + VALUE rb_schema; xmlDocPtr c_document; xmlSchemaParserCtxtPtr c_parser_context; + int defensive_copy_p = 0; rb_scan_args(argc, argv, "11", &rb_document, &rb_parse_options); @@ -248,16 +224,21 @@ from_document(int argc, VALUE *argv, VALUE klass) c_document = deprecated_node_type_arg->doc; } - if (has_blank_nodes_p(DOC_NODE_CACHE(c_document))) { - rb_raise( - rb_eArgError, - "Creating a schema from a document that has blank nodes exposed to Ruby is dangerous" - ); + if (noko_xml_document_has_wrapped_blank_nodes_p(c_document)) { + // see https://github.com/sparklemotion/nokogiri/pull/2001 + c_document = xmlCopyDoc(c_document, 1); + defensive_copy_p = 1; } c_parser_context = xmlSchemaNewDocParserCtxt(c_document); + rb_schema = xml_schema_parse_schema(klass, c_parser_context, rb_parse_options); - return xml_schema_parse_schema(klass, c_parser_context, rb_parse_options); + if (defensive_copy_p) { + xmlFreeDoc(c_document); + c_document = NULL; + } + + return rb_schema; } void diff --git a/test/xml/test_schema.rb b/test/xml/test_schema.rb index f77ee97af6..3b31d134d7 100644 --- a/test/xml/test_schema.rb +++ b/test/xml/test_schema.rb @@ -10,15 +10,18 @@ def setup assert(@xsd = Nokogiri::XML::Schema(File.read(PO_SCHEMA_FILE))) end - def test_issue_1985_segv_on_schema_parse + def test_issue_1985_schema_parse_modifying_underlying_document skip_unless_libxml2("Pure Java version doesn't have this bug") - # This is a test for a workaround for a bug in LibXML2. The upstream - # bug is here: https://gitlab.gnome.org/GNOME/libxml2/issues/148 - # Schema creation can result in dangling pointers. If no nodes have - # been exposed, then it should be fine to create a schema. If nodes - # have been exposed to Ruby, then we need to make sure they won't be - # freed out from under us. + # This is a test for a workaround for a bug in LibXML2: + # + # https://gitlab.gnome.org/GNOME/libxml2/issues/148 + # + # Schema creation can modify the original document -- removal of blank text nodes -- which + # results in dangling pointers. + # + # If no nodes have been exposed, then it should be fine to create a schema. If nodes have + # been exposed to Ruby, then we need to make sure they won't be freed out from under us. doc = <<~EOF @@ -32,12 +35,10 @@ def test_issue_1985_segv_on_schema_parse # This is not OK, nodes have been exposed to Ruby xsd_doc = Nokogiri::XML(doc) - xsd_doc.root.children.find(&:blank?) # Finds a node + child = xsd_doc.root.children.find(&:blank?) # Find a blank node that would be freed without the fix - ex = assert_raises(ArgumentError) do - Nokogiri::XML::Schema.from_document(xsd_doc) - end - assert_match(/blank nodes/, ex.message) + Nokogiri::XML::Schema.from_document(xsd_doc) + assert(child.to_s) # This will raise a valgrind error if the node was freed end def test_schema_read_memory From e0bd4a5614e80a4c033268a930b5f2cd597392c5 Mon Sep 17 00:00:00 2001 From: Mike Dalessio Date: Thu, 9 Mar 2023 13:53:40 -0500 Subject: [PATCH 2/4] style: variable naming as a prefactor --- ext/nokogiri/nokogiri.h | 1 + ext/nokogiri/xslt_stylesheet.c | 41 +++++++++++++++++----------------- 2 files changed, 21 insertions(+), 21 deletions(-) diff --git a/ext/nokogiri/nokogiri.h b/ext/nokogiri/nokogiri.h index e6dc5721fc..3313961ca9 100644 --- a/ext/nokogiri/nokogiri.h +++ b/ext/nokogiri/nokogiri.h @@ -51,6 +51,7 @@ #include #include #include +#include #include #include diff --git a/ext/nokogiri/xslt_stylesheet.c b/ext/nokogiri/xslt_stylesheet.c index 176cd4360b..62eb065b98 100644 --- a/ext/nokogiri/xslt_stylesheet.c +++ b/ext/nokogiri/xslt_stylesheet.c @@ -244,58 +244,57 @@ rb_xslt_stylesheet_serialize(VALUE self, VALUE xmlobj) static VALUE rb_xslt_stylesheet_transform(int argc, VALUE *argv, VALUE self) { - VALUE xmldoc, paramobj, errstr, exception ; - xmlDocPtr xml ; - xmlDocPtr result ; + VALUE rb_document, rb_param, rb_error_str; + xmlDocPtr c_document ; + xmlDocPtr c_result_document ; nokogiriXsltStylesheetTuple *wrapper; const char **params ; long param_len, j ; int parse_error_occurred ; - rb_scan_args(argc, argv, "11", &xmldoc, ¶mobj); - if (NIL_P(paramobj)) { paramobj = rb_ary_new2(0L) ; } - if (!rb_obj_is_kind_of(xmldoc, cNokogiriXmlDocument)) { + rb_scan_args(argc, argv, "11", &rb_document, &rb_param); + if (NIL_P(rb_param)) { rb_param = rb_ary_new2(0L) ; } + if (!rb_obj_is_kind_of(rb_document, cNokogiriXmlDocument)) { rb_raise(rb_eArgError, "argument must be a Nokogiri::XML::Document"); } /* handle hashes as arguments. */ - if (T_HASH == TYPE(paramobj)) { - paramobj = rb_funcall(paramobj, rb_intern("to_a"), 0); - paramobj = rb_funcall(paramobj, rb_intern("flatten"), 0); + if (T_HASH == TYPE(rb_param)) { + rb_param = rb_funcall(rb_param, rb_intern("to_a"), 0); + rb_param = rb_funcall(rb_param, rb_intern("flatten"), 0); } - Check_Type(paramobj, T_ARRAY); + Check_Type(rb_param, T_ARRAY); - xml = noko_xml_document_unwrap(xmldoc); + c_document = noko_xml_document_unwrap(rb_document); TypedData_Get_Struct(self, nokogiriXsltStylesheetTuple, &xslt_stylesheet_type, wrapper); - param_len = RARRAY_LEN(paramobj); + param_len = RARRAY_LEN(rb_param); params = ruby_xcalloc((size_t)param_len + 1, sizeof(char *)); for (j = 0 ; j < param_len ; j++) { - VALUE entry = rb_ary_entry(paramobj, j); + VALUE entry = rb_ary_entry(rb_param, j); const char *ptr = StringValueCStr(entry); params[j] = ptr; } params[param_len] = 0 ; - errstr = rb_str_new(0, 0); - xsltSetGenericErrorFunc((void *)errstr, xslt_generic_error_handler); - xmlSetGenericErrorFunc((void *)errstr, xslt_generic_error_handler); + rb_error_str = rb_str_new(0, 0); + xsltSetGenericErrorFunc((void *)rb_error_str, xslt_generic_error_handler); + xmlSetGenericErrorFunc((void *)rb_error_str, xslt_generic_error_handler); - result = xsltApplyStylesheet(wrapper->ss, xml, params); + c_result_document = xsltApplyStylesheet(wrapper->ss, c_document, params); ruby_xfree(params); xsltSetGenericErrorFunc(NULL, NULL); xmlSetGenericErrorFunc(NULL, NULL); - parse_error_occurred = (Qfalse == rb_funcall(errstr, rb_intern("empty?"), 0)); + parse_error_occurred = (Qfalse == rb_funcall(rb_error_str, rb_intern("empty?"), 0)); if (parse_error_occurred) { - exception = rb_exc_new3(rb_eRuntimeError, errstr); - rb_exc_raise(exception); + rb_exc_raise(rb_exc_new3(rb_eRuntimeError, rb_error_str)); } - return noko_xml_document_wrap((VALUE)0, result) ; + return noko_xml_document_wrap((VALUE)0, c_result_document) ; } static void From 805405910b0115b5d48ed68130d31e4f60fb4578 Mon Sep 17 00:00:00 2001 From: Mike Dalessio Date: Thu, 9 Mar 2023 14:21:38 -0500 Subject: [PATCH 3/4] fix: ensure XSLT.transform doesn't modify the original doc which it does be default when xsl:strip-space is used this approach makes a defensive copy of the doc if there's a chance the original may be modified in an unsafe way: - if any spaces will be stripped - and there are blank node objects that might be removed Fixes #2800 --- ext/nokogiri/xslt_stylesheet.c | 15 ++++++++++ test/test_xslt_transforms.rb | 54 ++++++++++++++++++++++++++++++++++ 2 files changed, 69 insertions(+) diff --git a/ext/nokogiri/xslt_stylesheet.c b/ext/nokogiri/xslt_stylesheet.c index 62eb065b98..097b8bf704 100644 --- a/ext/nokogiri/xslt_stylesheet.c +++ b/ext/nokogiri/xslt_stylesheet.c @@ -251,6 +251,7 @@ rb_xslt_stylesheet_transform(int argc, VALUE *argv, VALUE self) const char **params ; long param_len, j ; int parse_error_occurred ; + int defensive_copy_p = 0; rb_scan_args(argc, argv, "11", &rb_document, &rb_param); if (NIL_P(rb_param)) { rb_param = rb_ary_new2(0L) ; } @@ -278,12 +279,26 @@ rb_xslt_stylesheet_transform(int argc, VALUE *argv, VALUE self) } params[param_len] = 0 ; + xsltTransformContextPtr c_transform_context = xsltNewTransformContext(wrapper->ss, c_document); + if (xsltNeedElemSpaceHandling(c_transform_context) && + noko_xml_document_has_wrapped_blank_nodes_p(c_document)) { + // see https://github.com/sparklemotion/nokogiri/issues/2800 + c_document = xmlCopyDoc(c_document, 1); + defensive_copy_p = 1; + } + xsltFreeTransformContext(c_transform_context); + rb_error_str = rb_str_new(0, 0); xsltSetGenericErrorFunc((void *)rb_error_str, xslt_generic_error_handler); xmlSetGenericErrorFunc((void *)rb_error_str, xslt_generic_error_handler); c_result_document = xsltApplyStylesheet(wrapper->ss, c_document, params); + ruby_xfree(params); + if (defensive_copy_p) { + xmlFreeDoc(c_document); + c_document = NULL; + } xsltSetGenericErrorFunc(NULL, NULL); xmlSetGenericErrorFunc(NULL, NULL); diff --git a/test/test_xslt_transforms.rb b/test/test_xslt_transforms.rb index 43431e5d96..ca834ced9c 100644 --- a/test/test_xslt_transforms.rb +++ b/test/test_xslt_transforms.rb @@ -366,6 +366,60 @@ def test_non_html_xslt_transform end end + describe "https://github.com/sparklemotion/nokogiri/issues/2800" do + let(:doc) do + Nokogiri::XML::Document.parse(<<~XML) + + abc + + xyz + + XML + end + + let(:stylesheet) do + Nokogiri::XSLT.parse(<<~XSL) + + + + + [] + + + XSL + end + + it "should modify the original doc if no wrapped blank text nodes would be removed" do + # this is the default libxslt behavior + skip_unless_libxml2("libxslt bug is not present in JRuby") + + result = stylesheet.transform(doc) + + assert_includes(result.to_s, "[abc][][xyz]", "xsl:strip-space should work") + doc.at_css("entry:nth-child(2)").tap do |entry| + assert_equal(1, entry.children.length) + assert_equal("", entry.children.first.content, "original doc should be modified") + end + end + + it "should not modify the original doc if wrapped blank text nodes would be removed" do + skip_unless_libxml2("libxslt bug is not present in JRuby") + + # wrap the blank text node + assert(child = doc.css("title").children.find(&:blank?)) + + result = stylesheet.transform(doc) + + assert(child.to_s) # raise a valgrind error if the fix isn't working + + assert_includes(result.to_s, "[abc][][xyz]", "xsl:strip-space should work") + doc.at_css("entry:nth-child(2)").tap do |entry| + assert_equal(1, entry.children.length) + assert_equal(" ", entry.children.first.content, "original doc is unmodified") + end + end + end + describe "DEFAULT_XSLT parse options" do it "is the union of DEFAULT_XML and libxslt's XSLT_PARSE_OPTIONS" do xslt_parse_options = Nokogiri::XML::ParseOptions.new.noent.dtdload.dtdattr.nocdata From cfbb42f3edd6cb33bd7d26f630816b0cbde20622 Mon Sep 17 00:00:00 2001 From: Mike Dalessio Date: Thu, 9 Mar 2023 15:47:09 -0500 Subject: [PATCH 4/4] doc: update CHANGELOG for 2800 --- CHANGELOG.md | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 05ff8eac51..215b3587ee 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -15,10 +15,12 @@ Nokogiri follows [Semantic Versioning](https://semver.org/), please see the [REA ### Changed +* [CRuby] `Schema.from_document` now makes a defensive copy of the document if it has blank text nodes with Ruby objects instantiated for them. This prevents unsafe behavior in libxml2 from causing a segfault. There is a small performance cost, but we think this has the virtue of being "what the user meant" since modifying the original is surprising behavior for most users. Previously this was addressed in v1.10.9 by raising an exception. + + ### Fixed -* [JRuby] Serializing an HTML4 document with `#write_to` and specifying no save options will properly emit an HTML document anyway, like libxml2 does. Previously JRuby emitted XML in this situation. -* [JRuby] Serializing with `#write_to` will fall back to the document encoding when no encoding is specified, like libxml2 does. Previously JRuby emitted UTF-8 in this situation. +* [CRuby] `XSLT.transform` now makes a defensive copy of the document if it has blank text nodes with Ruby objects instantiated for them _and_ the template uses `xsl:strip-spaces`. This prevents unsafe behavior in libxslt from causing a segfault. There is a small performance cost, but we think this has the virtue of being "what the user meant" since modifying the original is surprising behavior for most users. Previously this would allow unsafe memory access and potentially segfault. [[#2800](https://github.com/sparklemotion/nokogiri/issues/2800)] ### Improved