diff --git a/ext/nokogiri/html4_document.c b/ext/nokogiri/html4_document.c index 2a308a109f9..e3e0ee0847f 100644 --- a/ext/nokogiri/html4_document.c +++ b/ext/nokogiri/html4_document.c @@ -7,9 +7,9 @@ static ID id_to_s; /* * call-seq: - * new + * new(uri=nil, external_id=nil) → HTML4::Document * - * Create a new document + * Create a new empty document with base URI +uri+ and external ID +external_id+. */ static VALUE rb_html_document_s_new(int argc, VALUE *argv, VALUE klass) diff --git a/ext/nokogiri/nokogiri.c b/ext/nokogiri/nokogiri.c index 66c40bda263..a43813b9ceb 100644 --- a/ext/nokogiri/nokogiri.c +++ b/ext/nokogiri/nokogiri.c @@ -185,8 +185,8 @@ Init_nokogiri(void) { mNokogiri = rb_define_module("Nokogiri"); mNokogiriGumbo = rb_define_module_under(mNokogiri, "Gumbo"); - mNokogiriHtml4 = rb_define_module_under(mNokogiri, "HTML4"); - mNokogiriHtml4Sax = rb_define_module_under(mNokogiriHtml4, "SAX"); + mNokogiriHtml4 = rb_define_module_under(mNokogiri, "HTML4"); + mNokogiriHtml4Sax = rb_define_module_under(mNokogiriHtml4, "SAX"); mNokogiriHtml5 = rb_define_module_under(mNokogiri, "HTML5"); mNokogiriXml = rb_define_module_under(mNokogiri, "XML"); mNokogiriXmlSax = rb_define_module_under(mNokogiriXml, "SAX"); diff --git a/ext/nokogiri/xml_document.c b/ext/nokogiri/xml_document.c index e1022f67f5f..740819300b5 100644 --- a/ext/nokogiri/xml_document.c +++ b/ext/nokogiri/xml_document.c @@ -370,6 +370,8 @@ noko_xml_document_s_read_io(VALUE rb_class, VALUE rb_encoding, VALUE rb_options) { + /* TODO: deprecate this method; parse should be the preferred entry point. Then we can make this + private. */ libxmlStructuredErrorHandlerState handler_state; VALUE rb_errors = rb_ary_new(); @@ -417,6 +419,8 @@ noko_xml_document_s_read_memory(VALUE rb_class, VALUE rb_encoding, VALUE rb_options) { + /* TODO: deprecate this method; parse should be the preferred entry point. Then we can make this + private. 
*/ VALUE rb_errors = rb_ary_new(); xmlSetStructuredErrorFunc((void *)rb_errors, noko__error_array_pusher); @@ -444,9 +448,9 @@ noko_xml_document_s_read_memory(VALUE rb_class, /* * call-seq: - * new(version = default) + * new(version = "1.0") * - * Create a new document with +version+ (defaults to "1.0") + * Create a new empty document declaring XML version +version+. */ static VALUE new (int argc, VALUE *argv, VALUE klass) @@ -756,9 +760,7 @@ void noko_init_xml_document(void) { assert(cNokogiriXmlNode); - /* - * Nokogiri::XML::Document wraps an xml document. - */ + cNokogiriXmlDocument = rb_define_class_under(mNokogiriXml, "Document", cNokogiriXmlNode); rb_define_alloc_func(cNokogiriXmlDocument, _xml_document_alloc); diff --git a/lib/nokogiri/html5.rb b/lib/nokogiri/html5.rb index 9ca26db494a..5566e058ce0 100644 --- a/lib/nokogiri/html5.rb +++ b/lib/nokogiri/html5.rb @@ -46,11 +46,11 @@ def self.HTML5(...) # The document and fragment parsing methods support options that are different from # Nokogiri::HTML4::Document or Nokogiri::XML::Document. 
# - Nokogiri.HTML5(html, url:, encoding:, **parse_options) - # - Nokogiri::HTML5.parse(html, url:, encoding:, **parse_options) - # - Nokogiri::HTML5::Document.parse(html, url:, encoding:, **parse_options) - # - Nokogiri::HTML5.fragment(html, encoding = nil, **parse_options) - # - Nokogiri::HTML5::DocumentFragment.parse(html, encoding = nil, **parse_options) + # - Nokogiri.HTML5(input, url:, encoding:, **parse_options) + # - Nokogiri::HTML5.parse(input, url:, encoding:, **parse_options) + # - Nokogiri::HTML5::Document.parse(input, url:, encoding:, **parse_options) + # - Nokogiri::HTML5.fragment(input, encoding:, **parse_options) + # - Nokogiri::HTML5::DocumentFragment.parse(input, encoding:, **parse_options) # # The four currently supported parse options are # diff --git a/lib/nokogiri/html5/document.rb b/lib/nokogiri/html5/document.rb index 51b77c23b36..8bb4f10d28c 100644 --- a/lib/nokogiri/html5/document.rb +++ b/lib/nokogiri/html5/document.rb @@ -50,8 +50,9 @@ class Document < Nokogiri::HTML4::Document class << self # :call-seq: - # parse(input) { |parse_options| ... } - # parse(input, url:, encoding:, **parse_options) + # parse(input) { |options| ... } → HTML5::Document + # parse(input, url:, encoding:) { |options| ... } → HTML5::Document + # parse(input, **options) → HTML5::Document # # Parse \HTML input with a parser compliant with the HTML5 spec. This method uses the # encoding of +input+ if it can be determined, or else falls back to the +encoding:+ @@ -62,11 +63,25 @@ class << self # # [Optional Parameters] # - +url:+ (String) the base URI of the document. - # - +encoding+ (Encoding) The encoding that should be used when processing the - # document. This option is only used as a fallback when the encoding of +input+ cannot be - # determined. - # - +parse_options+ (Hash) represents keywords arguments that control the behavior of the - # parser. See rdoc-ref:HTML5@Parsing+options for a list of available options. 
+ # + # [Optional Keyword Arguments] + # - +encoding:+ (Encoding) The name of the encoding that should be used when processing the + # document. When not provided, the encoding will be determined based on the document + # content. + # + # - +max_errors:+ (Integer) The maximum number of parse errors to record. (default + # +Nokogiri::Gumbo::DEFAULT_MAX_ERRORS+ which is currently 0) + # + # - +max_tree_depth:+ (Integer) The maximum depth of the parse tree. (default + # +Nokogiri::Gumbo::DEFAULT_MAX_TREE_DEPTH+) + # + # - +max_attributes:+ (Integer) The maximum number of attributes allowed on an + # element. (default +Nokogiri::Gumbo::DEFAULT_MAX_ATTRIBUTES+) + # + # - +parse_noscript_content_as_text:+ (Boolean) Whether to parse the content of +noscript+ + # elements as text. (default +false+) + # + # See rdoc-ref:HTML5@Parsing+options for a complete description of these parsing options. # # [Yields] # If present, the block will be passed a Hash object to modify with parse options before the diff --git a/lib/nokogiri/xml/document.rb b/lib/nokogiri/xml/document.rb index 2ef98c9dd47..6c9d4949a12 100644 --- a/lib/nokogiri/xml/document.rb +++ b/lib/nokogiri/xml/document.rb @@ -5,12 +5,12 @@ module Nokogiri module XML - # Nokogiri::XML::Document is the main entry point for dealing with XML documents. The Document - # is created by parsing an XML document. See Nokogiri::XML::Document.parse for more information - # on parsing. + # Nokogiri::XML::Document is the main entry point for dealing with \XML documents. The Document + # is created by parsing \XML content from a String or an IO object. See + # Nokogiri::XML::Document.parse for more information on parsing. # - # For searching a Document, see Nokogiri::XML::Searchable#css and - # Nokogiri::XML::Searchable#xpath + # Document inherits a great deal of functionality from its superclass Nokogiri::XML::Node, so + # please read that class's documentation as well. 
class Document < Nokogiri::XML::Node # See http://www.w3.org/TR/REC-xml-names/#ns-decl for more details. Note that we're not # attempting to handle unicode characters partly because libxml2 doesn't handle unicode @@ -25,34 +25,34 @@ class Document < Nokogiri::XML::Node class << self # call-seq: - # parse(input, url: nil, encoding: nil, options: DEFAULT_XML) { |options| } => Nokogiri::XML::Document + # parse(input) { |options| ... } => Nokogiri::XML::Document + # parse(input, url:, encoding:, options:) => Nokogiri::XML::Document # - # Parse XML input from a String or IO object, and return a new Document object. + # Parse \XML input from a String or IO object, and return a new XML::Document. # - # By default, Nokogiri treats documents as untrusted, and so does not attempt to load DTDs + # 🛡 By default, Nokogiri treats documents as untrusted, and so does not attempt to load DTDs # or access the network. See Nokogiri::XML::ParseOptions for a complete list of options; and # that module's DEFAULT_XML constant for what's set (and not set) by default. # - # See also: Nokogiri.XML() which is a convenience method which will call this method. + # [Required Parameters] + # - +input+ (String | IO) The content to be parsed. # - # [Parameters] - # - +input+ (String, IO) The content to be parsed. - # - # [Keyword arguments] - # - +url:+ (String) The URI where this document is located. + # [Optional Keyword Arguments] + # - +url:+ (String) The base URI for this document. # # - +encoding:+ (String) The name of the encoding that should be used when processing the - # document. (default +nil+ means that the encoding will be determined based on the - # document content) + # document. When not provided, the encoding will be determined based on the document + # content. # - # - +options+ (Nokogiri::XML::ParseOptions) Configuration object that determines some - # behaviors during parsing, such as Nokogiri::XML::ParseOptions::RECOVER. 
See the - # Nokogiri::XML::ParseOptions for more information. + # - +options:+ (Nokogiri::XML::ParseOptions) Configuration object that determines some + # behaviors during parsing. See ParseOptions for more information. The default value is + # +ParseOptions::DEFAULT_XML+. # # [Yields] # If a block is given, a Nokogiri::XML::ParseOptions object is yielded to the block which - # can be configured before parsing. See Nokogiri::XML::ParseOptions for more information. + # can be configured before parsing. See Nokogiri::XML::ParseOptions for more information. # + # [Returns] Nokogiri::XML::Document def parse( string_or_io, url_ = nil, encoding_ = nil, options_ = XML::ParseOptions::DEFAULT_XML, @@ -72,6 +72,7 @@ def parse( end doc = if string_or_io.respond_to?(:read) + # TODO: should we instead check for respond_to?(:to_path) ? if string_or_io.is_a?(Pathname) # resolve the Pathname to the file and open it as an IO object, see #2110 string_or_io = string_or_io.expand_path.open