From 01d2a5648d037fe2fb4f53f25cd06e8b0c08a4c2 Mon Sep 17 00:00:00 2001 From: Mike Dalessio Date: Sun, 8 Dec 2024 09:36:19 -0500 Subject: [PATCH] HTML4::DocumentFragment.parse and #initialize take kwargs Related to #3323 This commit was merged and expanded from #3336, thank you @MattJones! Co-authored-by: Matt Jones --- lib/nokogiri/html4/document_fragment.rb | 118 +++++++++++++++++------- test/html4/test_document_fragment.rb | 26 +++++- 2 files changed, 105 insertions(+), 39 deletions(-) diff --git a/lib/nokogiri/html4/document_fragment.rb b/lib/nokogiri/html4/document_fragment.rb index eae79bcb14a..2a70cb1a571 100644 --- a/lib/nokogiri/html4/document_fragment.rb +++ b/lib/nokogiri/html4/document_fragment.rb @@ -5,51 +5,60 @@ module HTML4 class DocumentFragment < Nokogiri::XML::DocumentFragment # # :call-seq: - # parse(tags) => DocumentFragment - # parse(tags, encoding) => DocumentFragment - # parse(tags, encoding, options) => DocumentFragment - # parse(tags, encoding) { |options| ... } => DocumentFragment + # parse(input) { |options| ... } → HTML4::DocumentFragment + # parse(input, encoding:, options:) { |options| ... } → HTML4::DocumentFragment # - # Parse an HTML4 fragment. + # Parse \HTML4 fragment input from a String, and return a new HTML4::DocumentFragment. This + # method creates a new, empty HTML4::Document to contain the fragment. # - # [Parameters] - # - +tags+ (optional String, or any object that responds to +#read+ such as an IO, or - # StringIO) - # - +encoding+ (optional String) the name of the encoding that should be used when processing - # the document. (default +nil+ for auto-detection) - # - +options+ (optional) configuration object that sets options during parsing, such as - # Nokogiri::XML::ParseOptions::RECOVER. See Nokogiri::XML::ParseOptions for more - # information. + # [Required Parameters] + # - +input+ (String | IO) The content to be parsed. # - # [Yields] If present, the block will be passed a Nokogiri::XML::ParseOptions object to modify - # before the fragment is parsed. See Nokogiri::XML::ParseOptions for more information. + # [Optional Keyword Arguments] + # - +encoding:+ (String) The name of the encoding that should be used when processing the + # document. When not provided, the encoding will be determined based on the document + # content. # - # [Returns] DocumentFragment + # - +options:+ (Nokogiri::XML::ParseOptions) Configuration object that determines some + # behaviors during parsing. See ParseOptions for more information. The default value is + # +ParseOptions::DEFAULT_HTML+. + # + # [Yields] + # If a block is given, a Nokogiri::XML::ParseOptions object is yielded to the block which + # can be configured before parsing. See ParseOptions for more information. + # + # [Returns] HTML4::DocumentFragment # # *Example:* Parsing a string # - # fragment = DocumentFragment.parse("
Hello World
") + # fragment = HTML4::DocumentFragment.parse("
Hello World
") # # *Example:* Parsing an IO # # fragment = File.open("fragment.html") do |file| - # DocumentFragment.parse(file) + # HTML4::DocumentFragment.parse(file) # end # # *Example:* Specifying encoding # - # fragment = DocumentFragment.parse(input, "EUC-JP") + # fragment = HTML4::DocumentFragment.parse(input, encoding: "EUC-JP") # # *Example:* Setting parse options dynamically # - # DocumentFragment.parse("
Hello World") do |options| + # HTML4::DocumentFragment.parse("
Hello World") do |options| # options.huge.pedantic # end # - def self.parse(tags, encoding = nil, options = XML::ParseOptions::DEFAULT_HTML, &block) + def self.parse( + input, + encoding_ = nil, options_ = XML::ParseOptions::DEFAULT_HTML, + encoding: encoding_, options: options_, + &block + ) + # TODO: this method should take a context node. doc = HTML4::Document.new - if tags.respond_to?(:read) + if input.respond_to?(:read) # Handle IO-like objects (IO, File, StringIO, etc.) # The _read_ method of these objects doesn't accept an +encoding+ parameter. # Encoding is usually set when the IO object is created or opened, @@ -65,12 +74,12 @@ def self.parse(tags, encoding = nil, options = XML::ParseOptions::DEFAULT_HTML, # # For StringIO specifically, _set_encoding_ affects only the internal string, # not how the data is read out. - tags.set_encoding(encoding) if encoding && tags.respond_to?(:set_encoding) - tags = tags.read + input.set_encoding(encoding) if encoding && input.respond_to?(:set_encoding) + input = input.read end - encoding ||= if tags.respond_to?(:encoding) - encoding = tags.encoding + encoding ||= if input.respond_to?(:encoding) + encoding = input.encoding if encoding == ::Encoding::ASCII_8BIT "UTF-8" else @@ -82,32 +91,71 @@ def self.parse(tags, encoding = nil, options = XML::ParseOptions::DEFAULT_HTML, doc.encoding = encoding - new(doc, tags, nil, options, &block) + new(doc, input, options: options, &block) end - # It's recommended to use either DocumentFragment.parse or XML::Node#parse rather than call this - # method directly. - def initialize(document, tags = nil, ctx = nil, options = XML::ParseOptions::DEFAULT_HTML) # rubocop:disable Lint/MissingSuper - return self unless tags + # + # :call-seq: + # new(document) { |options| ... } → HTML4::DocumentFragment + # new(document, input) { |options| ... } → HTML4::DocumentFragment + # new(document, input, context:, options:) { |options| ... } → HTML4::DocumentFragment + # + # Parse \HTML4 fragment input from a String, and return a new HTML4::DocumentFragment. + # + # 💡 It's recommended to use either HTML4::DocumentFragment.parse or XML::Node#parse rather + # than call this method directly. + # + # [Required Parameters] + # - +document+ (HTML4::Document) The parent document to associate the returned fragment with. + # + # [Optional Parameters] + # - +input+ (String) The content to be parsed. + # + # [Optional Keyword Arguments] + # - +context:+ (Nokogiri::XML::Node) The context node for the subtree created. See + # below for more information. + # + # - +options:+ (Nokogiri::XML::ParseOptions) Configuration object that determines some + # behaviors during parsing. See ParseOptions for more information. The default value is + # +ParseOptions::DEFAULT_HTML+. + # + # [Yields] + # If a block is given, a Nokogiri::XML::ParseOptions object is yielded to the block which + # can be configured before parsing. See ParseOptions for more information. + # + # [Returns] HTML4::DocumentFragment + # + # === Context \Node + # + # If a context node is specified using +context:+, then the fragment will be created by + # calling XML::Node#parse on that node, so the parser will behave as if that Node is the + # parent of the fragment subtree. + # + def initialize( + document, input = nil, + context_ = nil, options_ = XML::ParseOptions::DEFAULT_HTML, + context: context_, options: options_ + ) # rubocop:disable Lint/MissingSuper + return self unless input options = Nokogiri::XML::ParseOptions.new(options) if Integer === options @parse_options = options yield options if block_given? - if ctx + if context preexisting_errors = document.errors.dup - node_set = ctx.parse("
#{tags}
", options) + node_set = context.parse("
#{input}
", options) node_set.first.children.each { |child| child.parent = self } unless node_set.empty? self.errors = document.errors - preexisting_errors else # This is a horrible hack, but I don't care - path = if /^\s*?#{tags}", nil, document.encoding, options) + temp_doc = HTML4::Document.parse("#{input}", nil, document.encoding, options) temp_doc.xpath(path).each { |child| child.parent = self } self.errors = temp_doc.errors end diff --git a/test/html4/test_document_fragment.rb b/test/html4/test_document_fragment.rb index 6ef3faac074..57ad1193b3f 100644 --- a/test/html4/test_document_fragment.rb +++ b/test/html4/test_document_fragment.rb @@ -326,6 +326,14 @@ def test_parse_with_io assert_equal("hello world", fragment.content) end + it "returns a string matching an encoding passed with kwargs" do + input = "
hello world
" + + fragment = Nokogiri::HTML4::DocumentFragment.parse(input, encoding: "ISO-8859-1") + assert_equal("ISO-8859-1", fragment.document.encoding) + assert_equal("hello world", fragment.content) + end + it "respects encoding for empty strings" do fragment = Nokogiri::HTML::DocumentFragment.parse("", "UTF-8") assert_equal "UTF-8", fragment.to_html.encoding.to_s @@ -384,6 +392,13 @@ def test_parse_with_io assert_equal(html4_huge, frag.parse_options) end + it "accepts options as kwargs" do + frag = Nokogiri::HTML4::DocumentFragment.parse(input, options: html4_huge) + + assert_equal("
foo
", frag.to_html) + assert_equal(html4_huge, frag.parse_options) + end + it "takes a config block" do default_config = nil frag = Nokogiri::HTML4.fragment(input) do |config| @@ -495,9 +510,9 @@ def test_parse_with_io Class.new(Nokogiri::HTML4::DocumentFragment) do attr_accessor :initialized_with, :initialized_count - def initialize(*args) + def initialize(*args, **kwargs) super - @initialized_with = args + @initialized_with = [args, kwargs] @initialized_count ||= 0 @initialized_count += 1 end @@ -516,8 +531,11 @@ def initialize(*args) end it "passes args to #initialize" do - fragment = klass.new(html, "
a
") - assert_equal([html, "
a
"], fragment.initialized_with) + fragment = klass.new(html, "
a
", options: 1) + assert_equal( + [[html, "
a
"], { options: 1 }], + fragment.initialized_with, + ) end end