Skip to content

Commit

Permalink
HTML4::DocumentFragment.parse and #initialize take kwargs
Browse files Browse the repository at this point in the history
Related to #3323

This commit was merged and expanded from #3336, thank you @MattJones!

Co-authored-by: Matt Jones <matthew.hartley.jones@gmail.com>
  • Loading branch information
flavorjones and MattJones committed Dec 8, 2024
1 parent ce05c0d commit 01d2a56
Show file tree
Hide file tree
Showing 2 changed files with 105 additions and 39 deletions.
118 changes: 83 additions & 35 deletions lib/nokogiri/html4/document_fragment.rb
Original file line number Diff line number Diff line change
Expand Up @@ -5,51 +5,60 @@ module HTML4
class DocumentFragment < Nokogiri::XML::DocumentFragment
#
# :call-seq:
# parse(tags) => DocumentFragment
# parse(tags, encoding) => DocumentFragment
# parse(tags, encoding, options) => DocumentFragment
# parse(tags, encoding) { |options| ... } => DocumentFragment
# parse(input) { |options| ... } → HTML4::DocumentFragment
# parse(input, encoding:, options:) { |options| ... } → HTML4::DocumentFragment
#
# Parse an HTML4 fragment.
# Parse \HTML4 fragment input from a String or IO, and return a new HTML4::DocumentFragment.
# This method creates a new, empty HTML4::Document to contain the fragment.
#
# [Parameters]
# - +tags+ (optional String, or any object that responds to +#read+ such as an IO, or
# StringIO)
# - +encoding+ (optional String) the name of the encoding that should be used when processing
# the document. (default +nil+ for auto-detection)
# - +options+ (optional) configuration object that sets options during parsing, such as
# Nokogiri::XML::ParseOptions::RECOVER. See Nokogiri::XML::ParseOptions for more
# information.
# [Required Parameters]
# - +input+ (String | IO) The content to be parsed.
#
# [Yields] If present, the block will be passed a Nokogiri::XML::ParseOptions object to modify
# before the fragment is parsed. See Nokogiri::XML::ParseOptions for more information.
# [Optional Keyword Arguments]
# - +encoding:+ (String) The name of the encoding that should be used when processing the
# document. When not provided, the encoding will be determined based on the document
# content.
#
# [Returns] DocumentFragment
# - +options:+ (Nokogiri::XML::ParseOptions) Configuration object that determines some
# behaviors during parsing. See ParseOptions for more information. The default value is
# +ParseOptions::DEFAULT_HTML+.
#
# [Yields]
# If a block is given, a Nokogiri::XML::ParseOptions object is yielded to the block which
# can be configured before parsing. See ParseOptions for more information.
#
# [Returns] HTML4::DocumentFragment
#
# *Example:* Parsing a string
#
# fragment = DocumentFragment.parse("<div>Hello World</div>")
# fragment = HTML4::DocumentFragment.parse("<div>Hello World</div>")
#
# *Example:* Parsing an IO
#
# fragment = File.open("fragment.html") do |file|
# DocumentFragment.parse(file)
# HTML4::DocumentFragment.parse(file)
# end
#
# *Example:* Specifying encoding
#
# fragment = DocumentFragment.parse(input, "EUC-JP")
# fragment = HTML4::DocumentFragment.parse(input, encoding: "EUC-JP")
#
# *Example:* Setting parse options dynamically
#
# DocumentFragment.parse("<div>Hello World") do |options|
# HTML4::DocumentFragment.parse("<div>Hello World") do |options|
# options.huge.pedantic
# end
#
def self.parse(tags, encoding = nil, options = XML::ParseOptions::DEFAULT_HTML, &block)
def self.parse(
input,
encoding_ = nil, options_ = XML::ParseOptions::DEFAULT_HTML,
encoding: encoding_, options: options_,
&block
)
# TODO: this method should take a context node.
doc = HTML4::Document.new

if tags.respond_to?(:read)
if input.respond_to?(:read)
# Handle IO-like objects (IO, File, StringIO, etc.)
# The _read_ method of these objects doesn't accept an +encoding+ parameter.
# Encoding is usually set when the IO object is created or opened,
Expand All @@ -65,12 +74,12 @@ def self.parse(tags, encoding = nil, options = XML::ParseOptions::DEFAULT_HTML,
#
# For StringIO specifically, _set_encoding_ affects only the internal string,
# not how the data is read out.
tags.set_encoding(encoding) if encoding && tags.respond_to?(:set_encoding)
tags = tags.read
input.set_encoding(encoding) if encoding && input.respond_to?(:set_encoding)
input = input.read
end

encoding ||= if tags.respond_to?(:encoding)
encoding = tags.encoding
encoding ||= if input.respond_to?(:encoding)
encoding = input.encoding
if encoding == ::Encoding::ASCII_8BIT
"UTF-8"
else
Expand All @@ -82,32 +91,71 @@ def self.parse(tags, encoding = nil, options = XML::ParseOptions::DEFAULT_HTML,

doc.encoding = encoding

new(doc, tags, nil, options, &block)
new(doc, input, options: options, &block)
end

# It's recommended to use either DocumentFragment.parse or XML::Node#parse rather than call this
# method directly.
def initialize(document, tags = nil, ctx = nil, options = XML::ParseOptions::DEFAULT_HTML) # rubocop:disable Lint/MissingSuper
return self unless tags
#
# :call-seq:
# new(document) { |options| ... } → HTML4::DocumentFragment
# new(document, input) { |options| ... } → HTML4::DocumentFragment
# new(document, input, context:, options:) { |options| ... } → HTML4::DocumentFragment
#
# Parse \HTML4 fragment input from a String, and return a new HTML4::DocumentFragment.
#
# 💡 It's recommended to use either HTML4::DocumentFragment.parse or XML::Node#parse rather
# than call this method directly.
#
# [Required Parameters]
# - +document+ (HTML4::Document) The parent document to associate the returned fragment with.
#
# [Optional Parameters]
# - +input+ (String) The content to be parsed.
#
# [Optional Keyword Arguments]
# - +context:+ (Nokogiri::XML::Node) The <b>context node</b> for the subtree created. See
# below for more information.
#
# - +options:+ (Nokogiri::XML::ParseOptions) Configuration object that determines some
# behaviors during parsing. See ParseOptions for more information. The default value is
# +ParseOptions::DEFAULT_HTML+.
#
# [Yields]
# If a block is given, a Nokogiri::XML::ParseOptions object is yielded to the block which
# can be configured before parsing. See ParseOptions for more information.
#
# [Returns] HTML4::DocumentFragment
#
# === Context \Node
#
# If a context node is specified using +context:+, then the fragment will be created by
# calling XML::Node#parse on that node, so the parser will behave as if that Node is the
# parent of the fragment subtree.
#
def initialize(
document, input = nil,
context_ = nil, options_ = XML::ParseOptions::DEFAULT_HTML,
context: context_, options: options_
) # rubocop:disable Lint/MissingSuper
return self unless input

options = Nokogiri::XML::ParseOptions.new(options) if Integer === options
@parse_options = options
yield options if block_given?

if ctx
if context
preexisting_errors = document.errors.dup
node_set = ctx.parse("<div>#{tags}</div>", options)
node_set = context.parse("<div>#{input}</div>", options)
node_set.first.children.each { |child| child.parent = self } unless node_set.empty?
self.errors = document.errors - preexisting_errors
else
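# This is a hack: the input is parsed inside a temporary "<html><body>" document, and the
# XPath chosen here decides which nodes are moved into this fragment. Input that starts with
# a <body> tag keeps the body element itself; any other input keeps only the body's children.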
path = if /^\s*?<body/i.match?(tags)
path = if /^\s*?<body/i.match?(input)
"/html/body"
else
"/html/body/node()"
end

temp_doc = HTML4::Document.parse("<html><body>#{tags}", nil, document.encoding, options)
temp_doc = HTML4::Document.parse("<html><body>#{input}", nil, document.encoding, options)
temp_doc.xpath(path).each { |child| child.parent = self }
self.errors = temp_doc.errors
end
Expand Down
26 changes: 22 additions & 4 deletions test/html4/test_document_fragment.rb
Original file line number Diff line number Diff line change
Expand Up @@ -326,6 +326,14 @@ def test_parse_with_io
assert_equal("hello world", fragment.content)
end

# An encoding supplied through the +encoding:+ keyword is applied to the fragment's document,
# and the fragment content is still parsed as expected.
it "returns a string matching an encoding passed with kwargs" do
  markup = "<div>hello world</div>"
  parsed = Nokogiri::HTML4::DocumentFragment.parse(markup, encoding: "ISO-8859-1")

  assert_equal("ISO-8859-1", parsed.document.encoding)
  assert_equal("hello world", parsed.content)
end

it "respects encoding for empty strings" do
fragment = Nokogiri::HTML::DocumentFragment.parse("", "UTF-8")
assert_equal "UTF-8", fragment.to_html.encoding.to_s
Expand Down Expand Up @@ -384,6 +392,13 @@ def test_parse_with_io
assert_equal(html4_huge, frag.parse_options)
end

# Parse options supplied through the +options:+ keyword are used for parsing and are
# exposed afterwards via #parse_options.
it "accepts options as kwargs" do
  fragment = Nokogiri::HTML4::DocumentFragment.parse(input, options: html4_huge)

  assert_equal("<div>foo</div>", fragment.to_html)
  assert_equal(html4_huge, fragment.parse_options)
end

it "takes a config block" do
default_config = nil
frag = Nokogiri::HTML4.fragment(input) do |config|
Expand Down Expand Up @@ -495,9 +510,9 @@ def test_parse_with_io
Class.new(Nokogiri::HTML4::DocumentFragment) do
attr_accessor :initialized_with, :initialized_count

def initialize(*args)
def initialize(*args, **kwargs)
super
@initialized_with = args
@initialized_with = [args, kwargs]
@initialized_count ||= 0
@initialized_count += 1
end
Expand All @@ -516,8 +531,11 @@ def initialize(*args)
end

it "passes args to #initialize" do
fragment = klass.new(html, "<div>a</div>")
assert_equal([html, "<div>a</div>"], fragment.initialized_with)
fragment = klass.new(html, "<div>a</div>", options: 1)
assert_equal(
[[html, "<div>a</div>"], { options: 1 }],
fragment.initialized_with,
)
end
end

Expand Down

0 comments on commit 01d2a56

Please sign in to comment.