Skip to content

Commit

Permalink
HTML4::Document.parse accepts kwargs
Browse files Browse the repository at this point in the history
Part of #3323
  • Loading branch information
flavorjones committed Dec 8, 2024
1 parent 4349d90 commit 2da0c10
Show file tree
Hide file tree
Showing 2 changed files with 74 additions and 26 deletions.
67 changes: 44 additions & 23 deletions lib/nokogiri/html4/document.rb
Original file line number Diff line number Diff line change
Expand Up @@ -161,52 +161,73 @@ def xpath_doctype
end

class << self
# :call-seq:
#   parse(input) { |options| ... } => Nokogiri::HTML4::Document
#   parse(input, url:, encoding:, options:) => Nokogiri::HTML4::Document
#
# Parse \HTML4 input from a String or IO object, and return a new HTML4::Document.
#
# [Required Parameters]
# - +input+ (String | IO) The content to be parsed. Any object responding to +read+ is
#   treated as an IO; a Pathname is expanded and opened before reading.
#
# [Optional Keyword Arguments]
# - +url:+ (String) The base URI for this document. When not provided and +input+ responds
#   to +path+, that path is used.
#
# - +encoding:+ (String) The name of the encoding that should be used when processing the
#   document. When not provided, the encoding will be determined based on the document
#   content.
#
# - +options:+ (Nokogiri::XML::ParseOptions) Configuration object that determines some
#   behaviors during parsing. See ParseOptions for more information. The default value is
#   +ParseOptions::DEFAULT_HTML+. An Integer is also accepted and is wrapped in a
#   ParseOptions object.
#
# The positional form <tt>parse(input, url, encoding, options)</tt> is still accepted. The
# positional values are only used as the defaults of the keyword arguments, so a keyword
# argument wins when both forms are supplied.
#
# [Yields]
#   If a block is given, a Nokogiri::XML::ParseOptions object is yielded to the block which
#   can be configured before parsing. See Nokogiri::XML::ParseOptions for more information.
#
# [Returns] Nokogiri::HTML4::Document
def parse(
  input,
  url_ = nil, encoding_ = nil, options_ = XML::ParseOptions::DEFAULT_HTML,
  url: url_, encoding: encoding_, options: options_
)
  # Normalize integer bit flags so the block always receives a ParseOptions object.
  options = Nokogiri::XML::ParseOptions.new(options) if Integer === options
  yield options if block_given?

  url ||= input.respond_to?(:path) ? input.path : nil

  # Use the input's own encoding unless it is binary (ASCII-8BIT) or one was given explicitly.
  if input.respond_to?(:encoding)
    unless input.encoding == Encoding::ASCII_8BIT
      encoding ||= input.encoding.name
    end
  end

  if input.respond_to?(:read)
    if input.is_a?(Pathname)
      # resolve the Pathname to the file and open it as an IO object, see #2110
      # NOTE(review): the File opened here is not closed within this method -- confirm
      # whether read_io or the garbage collector is expected to release it.
      input = input.expand_path.open
      url ||= input.path
    end

    unless encoding
      # First attempt: read through an EncodingReader. If it raises EncodingFound, fall
      # through and read again from the same wrapper using the encoding it reported.
      input = EncodingReader.new(input)
      begin
        return read_io(input, url, encoding, options.to_i)
      rescue EncodingReader::EncodingFound => e
        encoding = e.found_encoding
      end
    end
    return read_io(input, url, encoding, options.to_i)
  end

  # read_memory pukes on empty docs
  if input.nil? || input.empty?
    return encoding ? new.tap { |i| i.encoding = encoding } : new
  end

  encoding ||= EncodingReader.detect_encoding(input)

  read_memory(input, url, encoding, options.to_i)
end
end
end
Expand Down
33 changes: 30 additions & 3 deletions test/html4/test_document.rb
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,12 @@ def test_document_parse_method_with_url
assert_equal("http://foobar.example.com/", doc.url)
end

def test_document_parse_method_with_url_kwarg
doc = Nokogiri::HTML4("<html></html>", url: "http://foobar.example.com/", encoding: "UTF-8")
refute_empty(doc.to_s, "Document should not be empty")
assert_equal("http://foobar.example.com/", doc.url)
end

###
# Nokogiri::HTML4 returns an empty Document when given a blank string GH#11
def test_empty_string_returns_empty_doc
Expand Down Expand Up @@ -231,7 +237,7 @@ def test_title=

def test_meta_encoding_without_head
encoding = "EUC-JP"
html = Nokogiri::HTML4("<html><body>foo</body></html>", nil, encoding)
html = Nokogiri::HTML4("<html><body>foo</body></html>", encoding: encoding)

assert_nil(html.meta_encoding)

Expand All @@ -246,7 +252,7 @@ def test_meta_encoding_without_head

def test_html5_meta_encoding_without_head
encoding = "EUC-JP"
html = Nokogiri::HTML4("<!DOCTYPE html><html><body>foo</body></html>", nil, encoding)
html = Nokogiri::HTML4("<!DOCTYPE html><html><body>foo</body></html>", encoding: encoding)

assert_nil(html.meta_encoding)

Expand Down Expand Up @@ -722,7 +728,7 @@ def test_silencing_nonparse_errors_during_attribute_insertion_1262
html_fragment = <<~HTML
<img width="16" height="16" src="images/icon.gif" border="0" alt="Inactive hide details for &quot;User&quot; ---19/05/2015 12:55:29---Provvediamo subito nell&#8217;integrare">
HTML
doc = Nokogiri::HTML4(html_fragment, nil, "ISO-8859-1")
doc = Nokogiri::HTML4(html_fragment, encoding: "ISO-8859-1")
html = doc.to_html
assert html.index("src=\"images/icon.gif\"")
assert_equal "ISO-8859-1", html.encoding.name
Expand Down Expand Up @@ -815,6 +821,14 @@ def test_silencing_nonparse_errors_during_attribute_insertion_1262
assert_match(/Parser without recover option encountered error or warning/, exception.to_s)
assert_nil(exception.path)
end

it "raises exception on parse error using kwarg" do
  # Parse options are supplied through the +options:+ keyword here.
  # (+input+ and +parse_options+ come from the enclosing spec context.)
  error = assert_raises(Nokogiri::SyntaxError) { Nokogiri::HTML4.parse(input, options: parse_options) }

  assert_match(/Parser without recover option encountered error or warning/, error.to_s)
  assert_nil(error.path)
end
end

describe "default options" do
Expand All @@ -838,13 +852,26 @@ def test_silencing_nonparse_errors_during_attribute_insertion_1262
assert_match(/Parser without recover option encountered error or warning/, exception.to_s)
assert_nil(exception.path)
end

it "raises exception on parse error using kwargs" do
  # Both +encoding:+ and +options:+ are supplied as keywords here.
  # (+input+ and +parse_options+ come from the enclosing spec context.)
  error = assert_raises(Nokogiri::SyntaxError) do
    Nokogiri::HTML4.parse(
      input,
      encoding: "UTF-8",
      options: parse_options,
    )
  end

  assert_match(/Parser without recover option encountered error or warning/, error.to_s)
  assert_nil(error.path)
end
end

describe "default options" do
  # Positional form: (input, url, encoding), with no explicit parse options.
  it "does not raise exception on parse error" do
    parsed = Nokogiri::HTML4.parse(input, nil, "UTF-8")

    assert_operator(parsed.errors.length, :>, 0)
  end

  # Keyword form of the same call: the encoding is passed as +encoding:+.
  it "does not raise exception on parse error using kwarg" do
    parsed = Nokogiri::HTML4.parse(input, encoding: "UTF-8")

    assert_operator(parsed.errors.length, :>, 0)
  end
end
end
end
Expand Down

0 comments on commit 2da0c10

Please sign in to comment.