diff --git a/CHANGELOG.md b/CHANGELOG.md index 05ff8eac51..215b3587ee 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -15,10 +15,12 @@ Nokogiri follows [Semantic Versioning](https://semver.org/), please see the [REA ### Changed +* [CRuby] `Schema.from_document` now makes a defensive copy of the document if it has blank text nodes with Ruby objects instantiated for them. This prevents unsafe behavior in libxml2 from causing a segfault. There is a small performance cost, but we think this has the virtue of being "what the user meant" since modifying the original is surprising behavior for most users. Previously this was addressed in v1.10.9 by raising an exception. + + ### Fixed -* [JRuby] Serializing an HTML4 document with `#write_to` and specifying no save options will properly emit an HTML document anyway, like libxml2 does. Previously JRuby emitted XML in this situation. -* [JRuby] Serializing with `#write_to` will fall back to the document encoding when no encoding is specified, like libxml2 does. Previously JRuby emitted UTF-8 in this situation. +* [CRuby] `XSLT.transform` now makes a defensive copy of the document if it has blank text nodes with Ruby objects instantiated for them _and_ the template uses `xsl:strip-spaces`. This prevents unsafe behavior in libxslt from causing a segfault. There is a small performance cost, but we think this has the virtue of being "what the user meant" since modifying the original is surprising behavior for most users. Previously this would allow unsafe memory access and potentially segfault. [[#2800](https://github.com/sparklemotion/nokogiri/issues/2800)] ### Improved diff --git a/ext/nokogiri/nokogiri.h b/ext/nokogiri/nokogiri.h index c380b8f3a3..3313961ca9 100644 --- a/ext/nokogiri/nokogiri.h +++ b/ext/nokogiri/nokogiri.h @@ -51,6 +51,7 @@ #include #include #include +#include #include #include @@ -168,6 +169,7 @@ typedef struct _nokogiriXsltStylesheetTuple { void noko_xml_document_pin_node(xmlNodePtr); void noko_xml_document_pin_namespace(xmlNsPtr, xmlDocPtr); +int noko_xml_document_has_wrapped_blank_nodes_p(xmlDocPtr c_document); int noko_io_read(void *ctx, char *buffer, int len); int noko_io_write(void *ctx, char *buffer, int len); diff --git a/ext/nokogiri/xml_document.c b/ext/nokogiri/xml_document.c index 93bdc45183..4fa2473994 100644 --- a/ext/nokogiri/xml_document.c +++ b/ext/nokogiri/xml_document.c @@ -685,6 +685,33 @@ noko_xml_document_unwrap(VALUE rb_document) return c_document; } +/* Schema creation will remove and deallocate "blank" nodes. + * If those blank nodes have been exposed to Ruby, they could get freed + * out from under the VALUE pointer. This function checks to see if any of + * those nodes have been exposed to Ruby, and if so we should raise an exception. + */ +int +noko_xml_document_has_wrapped_blank_nodes_p(xmlDocPtr c_document) +{ + VALUE cache = DOC_NODE_CACHE(c_document); + + if (NIL_P(cache)) { + return 0; + } + + for (long jnode = 0; jnode < RARRAY_LEN(cache); jnode++) { + xmlNodePtr node; + VALUE element = rb_ary_entry(cache, jnode); + + Noko_Node_Get_Struct(element, xmlNode, node); + if (xmlIsBlankNode(node)) { + return 1; + } + } + + return 0; +} + void noko_xml_document_pin_node(xmlNodePtr node) { diff --git a/ext/nokogiri/xml_schema.c b/ext/nokogiri/xml_schema.c index 9e5b5e8cf1..5aeecb1e6f 100644 --- a/ext/nokogiri/xml_schema.c +++ b/ext/nokogiri/xml_schema.c @@ -191,32 +191,6 @@ read_memory(int argc, VALUE *argv, VALUE klass) return xml_schema_parse_schema(klass, c_parser_context, rb_parse_options); } -/* Schema creation will remove and deallocate "blank" nodes. - * If those blank nodes have been exposed to Ruby, they could get freed - * out from under the VALUE pointer. This function checks to see if any of - * those nodes have been exposed to Ruby, and if so we should raise an exception. - */ -static int -has_blank_nodes_p(VALUE cache) -{ - long i; - - if (NIL_P(cache)) { - return 0; - } - - for (i = 0; i < RARRAY_LEN(cache); i++) { - xmlNodePtr node; - VALUE element = rb_ary_entry(cache, i); - Noko_Node_Get_Struct(element, xmlNode, node); - if (xmlIsBlankNode(node)) { - return 1; - } - } - - return 0; -} - /* * call-seq: * from_document(document) → Nokogiri::XML::Schema @@ -233,8 +207,10 @@ from_document(int argc, VALUE *argv, VALUE klass) { VALUE rb_document; VALUE rb_parse_options; + VALUE rb_schema; xmlDocPtr c_document; xmlSchemaParserCtxtPtr c_parser_context; + int defensive_copy_p = 0; rb_scan_args(argc, argv, "11", &rb_document, &rb_parse_options); @@ -248,16 +224,21 @@ from_document(int argc, VALUE *argv, VALUE klass) c_document = deprecated_node_type_arg->doc; } - if (has_blank_nodes_p(DOC_NODE_CACHE(c_document))) { - rb_raise( - rb_eArgError, - "Creating a schema from a document that has blank nodes exposed to Ruby is dangerous" - ); + if (noko_xml_document_has_wrapped_blank_nodes_p(c_document)) { + // see https://github.com/sparklemotion/nokogiri/pull/2001 + c_document = xmlCopyDoc(c_document, 1); + defensive_copy_p = 1; } c_parser_context = xmlSchemaNewDocParserCtxt(c_document); + rb_schema = xml_schema_parse_schema(klass, c_parser_context, rb_parse_options); - return xml_schema_parse_schema(klass, c_parser_context, rb_parse_options); + if (defensive_copy_p) { + xmlFreeDoc(c_document); + c_document = NULL; + } + + return rb_schema; } void diff --git a/ext/nokogiri/xslt_stylesheet.c b/ext/nokogiri/xslt_stylesheet.c index 176cd4360b..097b8bf704 100644 --- a/ext/nokogiri/xslt_stylesheet.c +++ b/ext/nokogiri/xslt_stylesheet.c @@ -244,58 +244,72 @@ rb_xslt_stylesheet_serialize(VALUE self, VALUE xmlobj) static VALUE rb_xslt_stylesheet_transform(int argc, VALUE *argv, VALUE self) { - VALUE xmldoc, paramobj, errstr, exception ; - xmlDocPtr xml ; - xmlDocPtr result ; + VALUE rb_document, rb_param, rb_error_str; + xmlDocPtr c_document ; + xmlDocPtr c_result_document ; nokogiriXsltStylesheetTuple *wrapper; const char **params ; long param_len, j ; int parse_error_occurred ; + int defensive_copy_p = 0; - rb_scan_args(argc, argv, "11", &xmldoc, ¶mobj); - if (NIL_P(paramobj)) { paramobj = rb_ary_new2(0L) ; } - if (!rb_obj_is_kind_of(xmldoc, cNokogiriXmlDocument)) { + rb_scan_args(argc, argv, "11", &rb_document, &rb_param); + if (NIL_P(rb_param)) { rb_param = rb_ary_new2(0L) ; } + if (!rb_obj_is_kind_of(rb_document, cNokogiriXmlDocument)) { rb_raise(rb_eArgError, "argument must be a Nokogiri::XML::Document"); } /* handle hashes as arguments. */ - if (T_HASH == TYPE(paramobj)) { - paramobj = rb_funcall(paramobj, rb_intern("to_a"), 0); - paramobj = rb_funcall(paramobj, rb_intern("flatten"), 0); + if (T_HASH == TYPE(rb_param)) { + rb_param = rb_funcall(rb_param, rb_intern("to_a"), 0); + rb_param = rb_funcall(rb_param, rb_intern("flatten"), 0); } - Check_Type(paramobj, T_ARRAY); + Check_Type(rb_param, T_ARRAY); - xml = noko_xml_document_unwrap(xmldoc); + c_document = noko_xml_document_unwrap(rb_document); TypedData_Get_Struct(self, nokogiriXsltStylesheetTuple, &xslt_stylesheet_type, wrapper); - param_len = RARRAY_LEN(paramobj); + param_len = RARRAY_LEN(rb_param); params = ruby_xcalloc((size_t)param_len + 1, sizeof(char *)); for (j = 0 ; j < param_len ; j++) { - VALUE entry = rb_ary_entry(paramobj, j); + VALUE entry = rb_ary_entry(rb_param, j); const char *ptr = StringValueCStr(entry); params[j] = ptr; } params[param_len] = 0 ; - errstr = rb_str_new(0, 0); - xsltSetGenericErrorFunc((void *)errstr, xslt_generic_error_handler); - xmlSetGenericErrorFunc((void *)errstr, xslt_generic_error_handler); + xsltTransformContextPtr c_transform_context = xsltNewTransformContext(wrapper->ss, c_document); + if (xsltNeedElemSpaceHandling(c_transform_context) && + noko_xml_document_has_wrapped_blank_nodes_p(c_document)) { + // see https://github.com/sparklemotion/nokogiri/issues/2800 + c_document = xmlCopyDoc(c_document, 1); + defensive_copy_p = 1; + } + xsltFreeTransformContext(c_transform_context); + + rb_error_str = rb_str_new(0, 0); + xsltSetGenericErrorFunc((void *)rb_error_str, xslt_generic_error_handler); + xmlSetGenericErrorFunc((void *)rb_error_str, xslt_generic_error_handler); + + c_result_document = xsltApplyStylesheet(wrapper->ss, c_document, params); - result = xsltApplyStylesheet(wrapper->ss, xml, params); ruby_xfree(params); + if (defensive_copy_p) { + xmlFreeDoc(c_document); + c_document = NULL; + } xsltSetGenericErrorFunc(NULL, NULL); xmlSetGenericErrorFunc(NULL, NULL); - parse_error_occurred = (Qfalse == rb_funcall(errstr, rb_intern("empty?"), 0)); + parse_error_occurred = (Qfalse == rb_funcall(rb_error_str, rb_intern("empty?"), 0)); if (parse_error_occurred) { - exception = rb_exc_new3(rb_eRuntimeError, errstr); - rb_exc_raise(exception); + rb_exc_raise(rb_exc_new3(rb_eRuntimeError, rb_error_str)); } - return noko_xml_document_wrap((VALUE)0, result) ; + return noko_xml_document_wrap((VALUE)0, c_result_document) ; } static void diff --git a/test/test_xslt_transforms.rb b/test/test_xslt_transforms.rb index 43431e5d96..ca834ced9c 100644 --- a/test/test_xslt_transforms.rb +++ b/test/test_xslt_transforms.rb @@ -366,6 +366,60 @@ def test_non_html_xslt_transform end end + describe "https://github.com/sparklemotion/nokogiri/issues/2800" do + let(:doc) do + Nokogiri::XML::Document.parse(<<~XML) + + abc + + xyz + + XML + end + + let(:stylesheet) do + Nokogiri::XSLT.parse(<<~XSL) + + + + + [] + + + XSL + end + + it "should modify the original doc if no wrapped blank text nodes would be removed" do + # this is the default libxslt behavior + skip_unless_libxml2("libxslt bug is not present in JRuby") + + result = stylesheet.transform(doc) + + assert_includes(result.to_s, "[abc][][xyz]", "xsl:strip-space should work") + doc.at_css("entry:nth-child(2)").tap do |entry| + assert_equal(1, entry.children.length) + assert_equal("", entry.children.first.content, "original doc should be modified") + end + end + + it "should not modify the original doc if wrapped blank text nodes would be removed" do + skip_unless_libxml2("libxslt bug is not present in JRuby") + + # wrap the blank text node + assert(child = doc.css("title").children.find(&:blank?)) + + result = stylesheet.transform(doc) + + assert(child.to_s) # raise a valgrind error if the fix isn't working + + assert_includes(result.to_s, "[abc][][xyz]", "xsl:strip-space should work") + doc.at_css("entry:nth-child(2)").tap do |entry| + assert_equal(1, entry.children.length) + assert_equal(" ", entry.children.first.content, "original doc is unmodified") + end + end + end + describe "DEFAULT_XSLT parse options" do it "is the union of DEFAULT_XML and libxslt's XSLT_PARSE_OPTIONS" do xslt_parse_options = Nokogiri::XML::ParseOptions.new.noent.dtdload.dtdattr.nocdata diff --git a/test/xml/test_schema.rb b/test/xml/test_schema.rb index f77ee97af6..3b31d134d7 100644 --- a/test/xml/test_schema.rb +++ b/test/xml/test_schema.rb @@ -10,15 +10,18 @@ def setup assert(@xsd = Nokogiri::XML::Schema(File.read(PO_SCHEMA_FILE))) end - def test_issue_1985_segv_on_schema_parse + def test_issue_1985_schema_parse_modifying_underlying_document skip_unless_libxml2("Pure Java version doesn't have this bug") - # This is a test for a workaround for a bug in LibXML2. The upstream - # bug is here: https://gitlab.gnome.org/GNOME/libxml2/issues/148 - # Schema creation can result in dangling pointers. If no nodes have - # been exposed, then it should be fine to create a schema. If nodes - # have been exposed to Ruby, then we need to make sure they won't be - # freed out from under us. + # This is a test for a workaround for a bug in LibXML2: + # + # https://gitlab.gnome.org/GNOME/libxml2/issues/148 + # + # Schema creation can modify the original document -- removal of blank text nodes -- which + # results in dangling pointers. + # + # If no nodes have been exposed, then it should be fine to create a schema. If nodes have + # been exposed to Ruby, then we need to make sure they won't be freed out from under us. doc = <<~EOF @@ -32,12 +35,10 @@ def test_issue_1985_segv_on_schema_parse # This is not OK, nodes have been exposed to Ruby xsd_doc = Nokogiri::XML(doc) - xsd_doc.root.children.find(&:blank?) # Finds a node + child = xsd_doc.root.children.find(&:blank?) # Find a blank node that would be freed without the fix - ex = assert_raises(ArgumentError) do - Nokogiri::XML::Schema.from_document(xsd_doc) - end - assert_match(/blank nodes/, ex.message) + Nokogiri::XML::Schema.from_document(xsd_doc) + assert(child.to_s) # This will raise a valgrind error if the node was freed end def test_schema_read_memory