diff --git a/CHANGELOG.md b/CHANGELOG.md index 025306b9a0f..7be0b055201 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -48,6 +48,7 @@ We've resolved many long-standing bugs in the various schema classes, validation * Introduce support for a new SAX callback `XML::SAX::Document#reference`, which is called to report some parsed XML entities when `XML::SAX::ParserContext#replace_entities` is set to the default value `false`. This is necessary functionality for some applications that were previously relying on incorrect entity error reporting which has been fixed (see below). For more information, read the docs for `Nokogiri::XML::SAX::Document`. [#1926] @flavorjones * `XML::SAX::Parser#parse_memory` and `#parse_file` now accept an optional `encoding` argument. When not provided, the parser will fall back to the encoding passed to the initializer, and then fall back to autodetection. [#3288] @flavorjones * `XML::SAX::ParserContext.memory` now accepts an optional `encoding` argument. When not provided, the encoding will be autodetected. [#3288] @flavorjones +* `XML::DocumentFragment#parse_options` and `HTML4::DocumentFragment#parse_options` return the options used to parse the document fragment. @flavorjones * [CRuby] `Nokogiri::HTML5::Builder` is similar to `HTML4::Builder` but returns an `HTML5::Document`. [#3119] @flavorjones * [CRuby] Attributes in an HTML5 document can be serialized individually, something that has always been supported by the HTML4 serializer. [#3125, #3127] @flavorjones * [CRuby] Introduce a compile-time option, `--disable-xml2-legacy`, to remove from libxml2 its dependencies on `zlib` and `liblzma` and disable implicit `HTTP` network requests. These all remain enabled by default, and are present in the precompiled native gems. This option is a precursor for removing these libraries in a future major release, but may be interesting for the security-minded who do not need features like automatic decompression and would like to remove these dependencies. You can read more and give feedback on these plans in #3168. [#3247] @flavorjones diff --git a/lib/nokogiri/html4/document_fragment.rb b/lib/nokogiri/html4/document_fragment.rb index 1681822acbb..eae79bcb14a 100644 --- a/lib/nokogiri/html4/document_fragment.rb +++ b/lib/nokogiri/html4/document_fragment.rb @@ -91,6 +91,7 @@ def initialize(document, tags = nil, ctx = nil, options = XML::ParseOptions::DEF return self unless tags options = Nokogiri::XML::ParseOptions.new(options) if Integer === options + @parse_options = options yield options if block_given? if ctx diff --git a/lib/nokogiri/xml/document_fragment.rb b/lib/nokogiri/xml/document_fragment.rb index 40cc8f4fa11..dbdc46b4243 100644 --- a/lib/nokogiri/xml/document_fragment.rb +++ b/lib/nokogiri/xml/document_fragment.rb @@ -4,6 +4,11 @@ module Nokogiri module XML class DocumentFragment < Nokogiri::XML::Node + # The options used to parse the document fragment. Returns the value of any options that were + # passed into the constructor as a parameter or set in a config block, else the default + # options for the specific subclass. + attr_reader :parse_options + #### # Create a Nokogiri::XML::DocumentFragment from +tags+ def self.parse(tags, options = ParseOptions::DEFAULT_XML, &block) @@ -20,6 +25,7 @@ def initialize(document, tags = nil, ctx = nil, options = ParseOptions::DEFAULT_ return self unless tags options = Nokogiri::XML::ParseOptions.new(options) if Integer === options + @parse_options = options yield options if block_given? children = if ctx diff --git a/test/html4/sax/test_document_error.rb b/test/html4/sax/test_document_error.rb index 513164148fd..fdd15ef694a 100644 --- a/test/html4/sax/test_document_error.rb +++ b/test/html4/sax/test_document_error.rb @@ -20,15 +20,10 @@ def start_document end def test_warning_document_encounters_error_but_terminates_normally - # Probably I'm doing something wrong, but I can't make nekohtml report errors, - # despite setting http://cyberneko.org/html/features/report-errors. - # See https://nekohtml.sourceforge.net/settings.html for more info. - # I'd love some help here if someone finds this comment and cares enough to dig in. - skip_unless_libxml2("nekohtml sax parser does not seem to report errors?") - warning_parser = Nokogiri::HTML4::SAX::Parser.new(Nokogiri::SAX::TestCase::Doc.new) warning_parser.parse("<
= 2.14.0") + it "behaves as if the comment is closed immediately before the end of the input stream" do # COMPLIANT + assert_pattern do + subject => { + name: "div", + attributes: [{ name: "id", value: "under-test" }], + children: [ + { name: "comment", content: "start of unterminated comment" } + ] + } + end + end + elsif Nokogiri.uses_libxml? it "behaves as if the comment is unterminated and doesn't exist" do # NON-COMPLIANT assert_equal 0, subject.children.length assert_equal 1, doc.errors.length @@ -132,8 +144,12 @@ class TestComment < Nokogiri::TestCase assert_equal inner_div, subject.children[1] assert_predicate subject.children[2], :comment? assert_equal "bar", subject.children[2].content - assert_equal 1, doc.errors.length - assert_match(/Comment incorrectly closed/, doc.errors.first.to_s) + if Nokogiri.uses_libxml?(">= 2.14.0") + assert_empty doc.errors + else + assert_equal 1, doc.errors.length + assert_match(/Comment incorrectly closed/, doc.errors.first.to_s) + end end else # jruby, or libxml2 system lib less than 2.9.11 it "behaves as if the comment encompasses the inner div" do # NON-COMPLIANT @@ -161,7 +177,22 @@ class TestComment < Nokogiri::TestCase let(:body) { doc.at_css("body") } let(:subject) { doc.at_css("div#under-test") } - if Nokogiri.uses_libxml?("= 2.9.14") + if Nokogiri.uses_libxml?(">= 2.14.0") + it "parses as comments" do # COMPLIANT + assert_pattern do + body.children => [ + { + name: "div", + children: [ + { name: "comment", content: " comment
hello" }, + ] + end + end + elsif Nokogiri.uses_libxml?("= 2.9.14") it "parses as PCDATA" do # NON-COMPLIANT assert_equal 1, body.children.length assert_equal subject, body.children.first @@ -212,7 +243,21 @@ class TestComment < Nokogiri::TestCase let(:body) { doc.at_css("body") } let(:subject) { doc.at_css("div#under-test") } - if Nokogiri.uses_libxml?("= 2.9.14") + if Nokogiri.uses_libxml?(">= 2.14.0") + it "parses the [ + { + name: "div", children: [ + { name: "comment", content: "[if foo]" }, + { name: "div", attributes: [{name: "id", value: "do-i-exist"}] }, + { name: "comment", content: "[endif]" }, + ] + } + ] + end + end + elsif Nokogiri.uses_libxml?("= 2.9.14") it "parses the -
one") - doc2 = Nokogiri::HTML4("two") + doc1 = Nokogiri::HTML4("
one") + doc2 = Nokogiri::HTML4("
two") node1 = doc1.at_css("#unique") node2 = doc2.at_css("#unique") original_errors1 = doc1.errors.dup original_errors2 = doc2.errors.dup - assert(original_errors1.any? { |e| e.to_s.include?("Tag diva invalid") }, "it should complain about the tag name") - assert(original_errors2.any? { |e| e.to_s.include?("Tag dive invalid") }, "it should complain about the tag name") + + refute_empty(original_errors1) + refute_empty(original_errors2) node1.add_child(node2) @@ -734,6 +733,8 @@ def test_silencing_nonparse_errors_during_attribute_insertion_1262 doc = Nokogiri::HTML4::Document.parse(html) expected = if Nokogiri.jruby? [Nokogiri::XML::Node::COMMENT_NODE, Nokogiri::XML::Node::PI_NODE] + elsif Nokogiri.uses_libxml?(">= 2.14.0") + [Nokogiri::XML::Node::COMMENT_NODE, Nokogiri::XML::Node::COMMENT_NODE] elsif Nokogiri.uses_libxml?(">= 2.10.0") [Nokogiri::XML::Node::COMMENT_NODE] else @@ -802,7 +803,7 @@ def test_silencing_nonparse_errors_during_attribute_insertion_1262 end describe "read memory" do - let(:input) { "
" } describe "strict parsing" do let(:parse_options) { html_strict } @@ -824,7 +825,7 @@ def test_silencing_nonparse_errors_during_attribute_insertion_1262 end describe "read io" do - let(:input) { StringIO.new("
") } describe "strict parsing" do let(:parse_options) { html_strict } diff --git a/test/html4/test_document_encoding.rb b/test/html4/test_document_encoding.rb index 0abc7057818..1cadf151589 100644 --- a/test/html4/test_document_encoding.rb +++ b/test/html4/test_document_encoding.rb @@ -148,7 +148,7 @@ def binopen(file) end describe "error handling" do - RAW = " RAW, "read_io" => StringIO.new(RAW) }.each do |flavor, input| it "#{flavor} should handle errors" do diff --git a/test/html4/test_document_fragment.rb b/test/html4/test_document_fragment.rb index a157ff025e5..6ef3faac074 100644 --- a/test/html4/test_document_fragment.rb +++ b/test/html4/test_document_fragment.rb @@ -188,7 +188,29 @@ def test_element_children_counts def test_malformed_fragment_is_corrected fragment = Nokogiri::HTML4::DocumentFragment.parse("
") - assert_equal("
", fragment.to_s) + + if Nokogiri.uses_libxml?(">= 2.14.0") + assert_pattern do + fragment => [ + { name: "div", attributes: [ + { name: "<", value: ""}, + { name: "div", value: ""}, + ]} + ] + end + else + assert_equal("
", fragment.to_s) + end + end + + def test_malformed_html5_fragment_serializes_like_gumbo + skip_unless_libxml2(">= 2.14.0") + + fragment = Nokogiri::HTML4::DocumentFragment.parse("
") + + pending "libxml2 does not serialize HTML5 like gumbo (yet)" do + assert_equal('
', fragment.to_s) + end end def test_unclosed_script_tag @@ -198,37 +220,29 @@ def test_unclosed_script_tag end def test_error_propagation_on_fragment_parse - frag = Nokogiri::HTML4::DocumentFragment.parse("oh, hello there.") - assert(frag.errors.any? { |err| err.to_s.include?("Tag hello invalid") }, "errors should be copied to the fragment") + frag = Nokogiri::HTML4::DocumentFragment.parse("oh, hello there") + refute_empty(frag.errors) end def test_error_propagation_on_fragment_parse_in_node_context doc = Nokogiri::HTML4::Document.parse("
") context_node = doc.at_css("div") - frag = Nokogiri::HTML4::DocumentFragment.new(doc, "oh, hello there.", context_node) - assert( - frag.errors.any? do |err| - err.to_s.include?("Tag hello invalid") - end, - "errors should be on the context node's document", - ) + frag = Nokogiri::HTML4::DocumentFragment.new(doc, "oh, hello there", context_node) + refute_empty(frag.errors) end def test_error_propagation_on_fragment_parse_in_node_context_should_not_include_preexisting_errors - doc = Nokogiri::HTML4::Document.parse("
") - assert(doc.errors.any? { |err| err.to_s.include?("jimmy") }, "assert on setup") + doc = Nokogiri::HTML4::Document.parse("
") + refute_empty(doc.errors) + doc_errors = doc.errors.map(&:to_s) context_node = doc.at_css("div") - frag = Nokogiri::HTML4::DocumentFragment.new(doc, "oh, hello there.", context_node) - assert( - frag.errors.any? do |err| - err.to_s.include?("Tag hello invalid") - end, - "errors should be on the context node's document", - ) + frag = Nokogiri::HTML4::DocumentFragment.new(doc, "oh, hello there.", context_node) + refute_empty(frag.errors) + assert( frag.errors.none? do |err| - err.to_s.include?("jimmy") + doc_errors.include?(err.to_s) end, "errors should not include pre-existing document errors", ) @@ -245,14 +259,15 @@ def test_capturing_nonparse_errors_during_fragment_clone def test_capturing_nonparse_errors_during_node_copy_between_fragments # Errors should be emitted while parsing only, and should not change when moving nodes. - frag1 = Nokogiri::HTML4.fragment("one") - frag2 = Nokogiri::HTML4.fragment("two") + frag1 = Nokogiri::HTML4.fragment("
one") + frag2 = Nokogiri::HTML4.fragment("
two") node1 = frag1.at_css("#unique") node2 = frag2.at_css("#unique") original_errors1 = frag1.errors.dup original_errors2 = frag2.errors.dup - assert(original_errors1.any? { |e| e.to_s.include?("Tag diva invalid") }, "it should complain about the tag name") - assert(original_errors2.any? { |e| e.to_s.include?("Tag dive invalid") }, "it should complain about the tag name") + + refute_empty(original_errors1) + refute_empty(original_errors2) node1.add_child(node2) @@ -341,106 +356,108 @@ def test_parse_with_io Nokogiri::XML::ParseOptions.new(Nokogiri::XML::ParseOptions::DEFAULT_HTML).norecover end - let(:input) { "
foofoo
" } it "sets the test up correctly" do + refute_predicate(html4_default, :strict?) + refute_predicate(html4_default, :huge?) assert_predicate(html4_strict, :strict?) + assert_predicate(html4_huge, :huge?) end describe "HTML4.fragment" do - it "has sane defaults" do + it "has reasonable defaults" do frag = Nokogiri::HTML4.fragment(input) + assert_equal("
foo
", frag.to_html) - refute_empty(frag.errors) + assert_equal(html4_default, frag.parse_options) end it "accepts options" do - frag = Nokogiri::HTML4.fragment(input, nil, html4_default) - assert_equal("
foo
", frag.to_html) - refute_empty(frag.errors) + frag = Nokogiri::HTML4.fragment(input, nil, html4_huge) - assert_raises(Nokogiri::SyntaxError) do - Nokogiri::HTML4.fragment(input, nil, html4_strict) - end + assert_equal("
foo
", frag.to_html) + assert_equal(html4_huge, frag.parse_options) end it "takes a config block" do default_config = nil - Nokogiri::HTML4.fragment(input) do |config| - default_config = config + frag = Nokogiri::HTML4.fragment(input) do |config| + default_config = config.dup + config.huge end - refute_predicate(default_config, :strict?) - assert_raises(Nokogiri::SyntaxError) do - Nokogiri::HTML4.fragment(input, &:norecover) - end + assert_equal(html4_default, default_config) + refute_predicate(default_config, :huge?) + assert_predicate(frag.parse_options, :huge?) end end describe "HTML4::DocumentFragment.parse" do - it "has sane defaults" do + it "has reasonable defaults" do frag = Nokogiri::HTML4::DocumentFragment.parse(input) + assert_equal("
foo
", frag.to_html) - refute_empty(frag.errors) + assert_equal(html4_default, frag.parse_options) end it "accepts options" do - frag = Nokogiri::HTML4::DocumentFragment.parse(input, nil, html4_default) - assert_equal("
foo
", frag.to_html) - refute_empty(frag.errors) + frag = Nokogiri::HTML4::DocumentFragment.parse(input, nil, html4_huge) - assert_raises(Nokogiri::SyntaxError) do - Nokogiri::HTML4::DocumentFragment.parse(input, nil, html4_strict) - end + assert_equal("
foo
", frag.to_html) + assert_equal(html4_huge, frag.parse_options) end it "takes a config block" do default_config = nil - Nokogiri::HTML4::DocumentFragment.parse(input) do |config| - default_config = config + frag = Nokogiri::HTML4::DocumentFragment.parse(input) do |config| + default_config = config.dup + config.huge end - refute_predicate(default_config, :strict?) - assert_raises(Nokogiri::SyntaxError) do - Nokogiri::HTML4::DocumentFragment.parse(input, &:norecover) - end + assert_equal(html4_default, default_config) + refute_predicate(default_config, :huge?) + assert_predicate(frag.parse_options, :huge?) end end describe "HTML4::DocumentFragment.new" do describe "without a context node" do - it "has sane defaults" do + it "has reasonable defaults" do frag = Nokogiri::HTML4::DocumentFragment.new(Nokogiri::HTML4::Document.new, input) + assert_equal("
foo
", frag.to_html) - refute_empty(frag.errors) + assert_equal(html4_default, frag.parse_options) end it "accepts options" do - frag = Nokogiri::HTML4::DocumentFragment.new(Nokogiri::HTML4::Document.new, input, nil, html4_default) - assert_equal("
foo
", frag.to_html) - refute_empty(frag.errors) + frag = Nokogiri::HTML4::DocumentFragment.new(Nokogiri::HTML4::Document.new, input, nil, html4_huge) - assert_raises(Nokogiri::SyntaxError) do - Nokogiri::HTML4::DocumentFragment.new(Nokogiri::HTML4::Document.new, input, nil, html4_strict) - end + assert_equal("
foo
", frag.to_html) + assert_equal(html4_huge, frag.parse_options) end it "takes a config block" do default_config = nil - Nokogiri::HTML4::DocumentFragment.new(Nokogiri::HTML4::Document.new, input) do |config| - default_config = config + frag = Nokogiri::HTML4::DocumentFragment.new(Nokogiri::HTML4::Document.new, input) do |config| + default_config = config.dup + config.huge end - refute_predicate(default_config, :strict?) - assert_raises(Nokogiri::SyntaxError) do - Nokogiri::HTML4::DocumentFragment.new(Nokogiri::HTML4::Document.new, input, &:norecover) - end + assert_equal(html4_default, default_config) + refute_predicate(default_config, :huge?) + assert_predicate(frag.parse_options, :huge?) end end describe "with a context node" do let(:document) { Nokogiri::HTML4::Document.parse("") } let(:context_node) { document.at_css("context") } + let(:input) { "
foo= 2.14.0") + assert_equal(0, sub_elements.length) + elsif Nokogiri.uses_libxml? assert_equal(65, sub_elements.length) else assert_equal(105, sub_elements.length) @@ -66,7 +68,12 @@ def test_subelements end def test_default_sub_element - assert_equal("div", ElementDescription["body"].default_sub_element) + sub_element = ElementDescription["body"].default_sub_element + if Nokogiri.uses_libxml?(">= 2.14.0") + assert_nil(sub_element) + else + assert_equal("div", sub_element) + end end def test_null_default_sub_element @@ -86,7 +93,11 @@ def test_optional_attributes def test_deprecated_attributes attrs = ElementDescription["table"].deprecated_attributes assert(attrs) - assert_equal(2, attrs.length) + if Nokogiri.uses_libxml?(">= 2.14.0") + assert_equal(0, attrs.length) + else + assert_equal(2, attrs.length) + end end def test_required_attributes diff --git a/test/html4/test_node.rb b/test/html4/test_node.rb index 60759b9ceb3..45ee8feb193 100644 --- a/test/html4/test_node.rb +++ b/test/html4/test_node.rb @@ -168,13 +168,10 @@ def test_fragment_serialization end def test_to_html_does_not_contain_entities - # as generated by a tool like NKF html = "\r\n

test paragraph\r\nfoo bar

\r\n\r\n" nokogiri = Nokogiri::HTML4.parse(html) - if RUBY_PLATFORM.include?("java") - # NKF linebreak modes are not supported as of jruby 1.2 - # see http://jira.codehaus.org/browse/JRUBY-3602 for status + if Nokogiri.jruby? || Nokogiri.uses_libxml?(">= 2.14.0") assert_equal( "

testparagraph\nfoobar

", nokogiri.at("p").to_html.delete(" "), diff --git a/test/xml/test_node.rb b/test/xml/test_node.rb index 4a8e1fb40c6..abb11161a34 100644 --- a/test/xml/test_node.rb +++ b/test/xml/test_node.rb @@ -105,9 +105,19 @@ def test_node_context_parsing_of_malformed_html_fragment context_node = doc.at_css("div") nodeset = context_node.parse("
") - assert_equal(1, doc.errors.length) - assert_equal(1, nodeset.length) - assert_equal("
", nodeset.to_s) + if Nokogiri.uses_libxml?(">= 2.14.0") + assert_empty(doc.errors) + assert_pattern do + nodeset => [ + { name: "div", attributes: [{name: "<", value: ""}, { name: "div", value: ""}] }, + ] + end + else + assert_equal(1, doc.errors.length) + assert_equal(1, nodeset.length) + assert_equal("
", nodeset.to_s) + end + assert_instance_of(Nokogiri::HTML4::Document, nodeset.document) assert_instance_of(Nokogiri::HTML4::Document, nodeset.first.document) end @@ -117,14 +127,25 @@ def test_node_context_parsing_of_malformed_html_fragment_with_recover_is_correct context_node = doc.at_css("div") nodeset = context_node.parse("
", &:recover) - assert_equal(1, doc.errors.length) - assert_equal(1, nodeset.length) - assert_equal("
", nodeset.to_s) + if Nokogiri.uses_libxml?(">= 2.14.0") + assert_empty(doc.errors) + assert_pattern do + nodeset => [ + { name: "div", attributes: [{name: "<", value: ""}, { name: "div", value: ""}] }, + ] + end + else + assert_equal(1, doc.errors.length) + assert_equal(1, nodeset.length) + assert_equal("
", nodeset.to_s) + end assert_instance_of(Nokogiri::HTML4::Document, nodeset.document) assert_instance_of(Nokogiri::HTML4::Document, nodeset.first.document) end def test_node_context_parsing_of_malformed_html_fragment_without_recover_is_not_corrected + skip("libxml2 2.14.0 no longer raises this error") if Nokogiri.uses_libxml?(">= 2.14.0") + doc = HTML4.parse("
") context_node = doc.at_css("div") assert_raises(Nokogiri::XML::SyntaxError) do