Skip to content

Commit

Permalink
feat: DocumentFragment::{XML,HTML4}#parse_options
Browse files Browse the repository at this point in the history
and fix some libxml2 HTML5-related changes, specifically around errors
that are no longer generated.
  • Loading branch information
flavorjones committed Oct 5, 2024
1 parent c238f14 commit 9e57ff0
Show file tree
Hide file tree
Showing 4 changed files with 55 additions and 53 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,7 @@ We've resolved many long-standing bugs in the various schema classes, validation
* Introduce support for a new SAX callback `XML::SAX::Document#reference`, which is called to report some parsed XML entities when `XML::SAX::ParserContext#replace_entities` is set to the default value `false`. This is necessary functionality for some applications that were previously relying on incorrect entity error reporting which has been fixed (see below). For more information, read the docs for `Nokogiri::XML::SAX::Document`. [#1926] @flavorjones
* `XML::SAX::Parser#parse_memory` and `#parse_file` now accept an optional `encoding` argument. When not provided, the parser will fall back to the encoding passed to the initializer, and then fall back to autodetection. [#3288] @flavorjones
* `XML::SAX::ParserContext.memory` now accepts an optional `encoding` argument. When not provided, the encoding will be autodetected. [#3288] @flavorjones
* `XML::DocumentFragment#parse_options` and `HTML4::DocumentFragment#parse_options` return the options used to parse the document fragment. @flavorjones
* [CRuby] `Nokogiri::HTML5::Builder` is similar to `HTML4::Builder` but returns an `HTML5::Document`. [#3119] @flavorjones
* [CRuby] Attributes in an HTML5 document can be serialized individually, something that has always been supported by the HTML4 serializer. [#3125, #3127] @flavorjones
* [CRuby] Introduce a compile-time option, `--disable-xml2-legacy`, to remove from libxml2 its dependencies on `zlib` and `liblzma` and disable implicit `HTTP` network requests. These all remain enabled by default, and are present in the precompiled native gems. This option is a precursor for removing these libraries in a future major release, but may be interesting for the security-minded who do not need features like automatic decompression and would like to remove these dependencies. You can read more and give feedback on these plans in #3168. [#3247] @flavorjones
Expand Down
1 change: 1 addition & 0 deletions lib/nokogiri/html4/document_fragment.rb
Original file line number Diff line number Diff line change
Expand Up @@ -91,6 +91,7 @@ def initialize(document, tags = nil, ctx = nil, options = XML::ParseOptions::DEF
return self unless tags

options = Nokogiri::XML::ParseOptions.new(options) if Integer === options
@parse_options = options
yield options if block_given?

if ctx
Expand Down
6 changes: 6 additions & 0 deletions lib/nokogiri/xml/document_fragment.rb
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,11 @@
module Nokogiri
module XML
class DocumentFragment < Nokogiri::XML::Node
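# The options used to parse the document fragment. Returns the value of any options that were
# passed into the constructor as a parameter or set in a config block, else the default
# options for the specific subclass.
#
# NOTE(review): the constructor appears to return early (before this value is assigned) when
# no +tags+ are given, in which case this would be +nil+ rather than the defaults -- confirm.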
attr_reader :parse_options

####
# Create a Nokogiri::XML::DocumentFragment from +tags+
def self.parse(tags, options = ParseOptions::DEFAULT_XML, &block)
Expand All @@ -20,6 +25,7 @@ def initialize(document, tags = nil, ctx = nil, options = ParseOptions::DEFAULT_
return self unless tags

options = Nokogiri::XML::ParseOptions.new(options) if Integer === options
@parse_options = options
yield options if block_given?

children = if ctx
Expand Down
100 changes: 47 additions & 53 deletions test/html4/test_document_fragment.rb
Original file line number Diff line number Diff line change
Expand Up @@ -220,31 +220,31 @@ def test_unclosed_script_tag
end

def test_error_propagation_on_fragment_parse
frag = Nokogiri::HTML4::DocumentFragment.parse("<hello>oh, hello there.</hello>")
assert(frag.errors.any? { |err| err.to_s.include?("Tag hello invalid") }, "errors should be copied to the fragment")
frag = Nokogiri::HTML4::DocumentFragment.parse("<hello>oh, hello there</goodbye>")
assert(frag.errors.any? { |err| err.to_s.include?("Unexpected end tag") }, "errors should be copied to the fragment")
end

def test_error_propagation_on_fragment_parse_in_node_context
doc = Nokogiri::HTML4::Document.parse("<html><body><div></div></body></html>")
context_node = doc.at_css("div")
frag = Nokogiri::HTML4::DocumentFragment.new(doc, "<hello>oh, hello there.</hello>", context_node)
frag = Nokogiri::HTML4::DocumentFragment.new(doc, "<hello>oh, hello there</goodbye>", context_node)
assert(
frag.errors.any? do |err|
err.to_s.include?("Tag hello invalid")
err.to_s.include?("Unexpected end tag")
end,
"errors should be on the context node's document",
)
end

def test_error_propagation_on_fragment_parse_in_node_context_should_not_include_preexisting_errors
doc = Nokogiri::HTML4::Document.parse("<html><body><div></div><jimmy></jimmy></body></html>")
doc = Nokogiri::HTML4::Document.parse("<html><body><div></div></jimmy></body></html>")
assert(doc.errors.any? { |err| err.to_s.include?("jimmy") }, "assert on setup")

context_node = doc.at_css("div")
frag = Nokogiri::HTML4::DocumentFragment.new(doc, "<hello>oh, hello there.</hello>", context_node)
frag = Nokogiri::HTML4::DocumentFragment.new(doc, "<hello>oh, hello there.</goodbye>", context_node)
assert(
frag.errors.any? do |err|
err.to_s.include?("Tag hello invalid")
err.to_s.include?("goodbye")
end,
"errors should be on the context node's document",
)
Expand All @@ -267,14 +267,14 @@ def test_capturing_nonparse_errors_during_fragment_clone

def test_capturing_nonparse_errors_during_node_copy_between_fragments
# Errors should be emitted while parsing only, and should not change when moving nodes.
frag1 = Nokogiri::HTML4.fragment("<diva id='unique'>one</diva>")
frag2 = Nokogiri::HTML4.fragment("<dive id='unique'>two</dive>")
frag1 = Nokogiri::HTML4.fragment("<div id='unique'>one</foo1>")
frag2 = Nokogiri::HTML4.fragment("<div id='unique'>two</foo2>")
node1 = frag1.at_css("#unique")
node2 = frag2.at_css("#unique")
original_errors1 = frag1.errors.dup
original_errors2 = frag2.errors.dup
assert(original_errors1.any? { |e| e.to_s.include?("Tag diva invalid") }, "it should complain about the tag name")
assert(original_errors2.any? { |e| e.to_s.include?("Tag dive invalid") }, "it should complain about the tag name")
assert(original_errors1.any? { |e| e.to_s.include?("Unexpected end tag") })
assert(original_errors2.any? { |e| e.to_s.include?("Unexpected end tag") })

node1.add_child(node2)

Expand Down Expand Up @@ -370,93 +370,87 @@ def test_parse_with_io
end

describe "HTML4.fragment" do
it "has sane defaults" do
it "has reasonable defaults" do
frag = Nokogiri::HTML4.fragment(input)

assert_equal("<div>foo</div>", frag.to_html)
refute_empty(frag.errors)
assert_equal(html4_default, frag.parse_options)
end

it "accepts options" do
frag = Nokogiri::HTML4.fragment(input, nil, html4_default)
assert_equal("<div>foo</div>", frag.to_html)
refute_empty(frag.errors)
frag = Nokogiri::HTML4.fragment(input, nil, html4_strict)

assert_raises(Nokogiri::SyntaxError) do
Nokogiri::HTML4.fragment(input, nil, html4_strict)
end
assert_equal("<div>foo</div>", frag.to_html)
assert_equal(html4_strict, frag.parse_options)
end

it "takes a config block" do
default_config = nil
Nokogiri::HTML4.fragment(input) do |config|
default_config = config
frag = Nokogiri::HTML4.fragment(input) do |config|
default_config = config.dup
config.strict
end
refute_predicate(default_config, :strict?)

assert_raises(Nokogiri::SyntaxError) do
Nokogiri::HTML4.fragment(input, &:norecover)
end
assert_equal(html4_default, default_config)
refute_predicate(default_config, :strict?)
assert_predicate(frag.parse_options, :strict?)
end
end

describe "HTML4::DocumentFragment.parse" do
it "has sane defaults" do
it "has reasonable defaults" do
frag = Nokogiri::HTML4::DocumentFragment.parse(input)

assert_equal("<div>foo</div>", frag.to_html)
refute_empty(frag.errors)
assert_equal(html4_default, frag.parse_options)
end

it "accepts options" do
frag = Nokogiri::HTML4::DocumentFragment.parse(input, nil, html4_default)
assert_equal("<div>foo</div>", frag.to_html)
refute_empty(frag.errors)
frag = Nokogiri::HTML4::DocumentFragment.parse(input, nil, html4_strict)

assert_raises(Nokogiri::SyntaxError) do
Nokogiri::HTML4::DocumentFragment.parse(input, nil, html4_strict)
end
assert_equal("<div>foo</div>", frag.to_html)
assert_equal(html4_strict, frag.parse_options)
end

it "takes a config block" do
default_config = nil
Nokogiri::HTML4::DocumentFragment.parse(input) do |config|
default_config = config
frag = Nokogiri::HTML4::DocumentFragment.parse(input) do |config|
default_config = config.dup
config.strict
end
refute_predicate(default_config, :strict?)

assert_raises(Nokogiri::SyntaxError) do
Nokogiri::HTML4::DocumentFragment.parse(input, &:norecover)
end
assert_equal(html4_default, default_config)
refute_predicate(default_config, :strict?)
assert_predicate(frag.parse_options, :strict?)
end
end

describe "HTML4::DocumentFragment.new" do
describe "without a context node" do
it "has sane defaults" do
it "has reasonable defaults" do
frag = Nokogiri::HTML4::DocumentFragment.new(Nokogiri::HTML4::Document.new, input)

assert_equal("<div>foo</div>", frag.to_html)
refute_empty(frag.errors)
assert_equal(html4_default, frag.parse_options)
end

it "accepts options" do
frag = Nokogiri::HTML4::DocumentFragment.new(Nokogiri::HTML4::Document.new, input, nil, html4_default)
assert_equal("<div>foo</div>", frag.to_html)
refute_empty(frag.errors)
frag = Nokogiri::HTML4::DocumentFragment.new(Nokogiri::HTML4::Document.new, input, nil, html4_strict)

assert_raises(Nokogiri::SyntaxError) do
Nokogiri::HTML4::DocumentFragment.new(Nokogiri::HTML4::Document.new, input, nil, html4_strict)
end
assert_equal("<div>foo</div>", frag.to_html)
assert_equal(html4_strict, frag.parse_options)
end

it "takes a config block" do
default_config = nil
Nokogiri::HTML4::DocumentFragment.new(Nokogiri::HTML4::Document.new, input) do |config|
default_config = config
frag = Nokogiri::HTML4::DocumentFragment.new(Nokogiri::HTML4::Document.new, input) do |config|
default_config = config.dup
config.strict
end
refute_predicate(default_config, :strict?)

assert_raises(Nokogiri::SyntaxError) do
Nokogiri::HTML4::DocumentFragment.new(Nokogiri::HTML4::Document.new, input, &:norecover)
end
assert_equal(html4_default, default_config)
refute_predicate(default_config, :strict?)
assert_predicate(frag.parse_options, :strict?)
end
end

Expand Down

0 comments on commit 9e57ff0

Please sign in to comment.