Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix(libxml): default Reader node encoding to UTF-8 #3084

Merged
merged 1 commit into from
Dec 29, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,13 @@ Nokogiri follows [Semantic Versioning](https://semver.org/), please see the [REA

---

## v1.16.next / unreleased

### Fixed

* [CRuby] `XML::Reader` defaults the encoding to UTF-8 if it's not specified in either the document or as a method parameter. Previously non-ASCII characters were serialized as NCRs in this case. [#2891] (@flavorjones)


## v1.16.0 / 2023-12-27

### Notable Changes
Expand Down
28 changes: 24 additions & 4 deletions ext/nokogiri/xml_reader.c
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,14 @@ VALUE cNokogiriXmlReader;
static void
xml_reader_deallocate(void *data)
{
// free the document separately because we _may_ have triggered preservation by calling
// xmlTextReaderCurrentDoc during a read_more.
xmlTextReaderPtr reader = data;
xmlDocPtr doc = xmlTextReaderCurrentDoc(reader);
xmlFreeTextReader(reader);
if (doc) {
xmlFreeDoc(doc);
}
}

static const rb_data_type_t xml_reader_type = {
Expand Down Expand Up @@ -515,6 +521,7 @@ read_more(VALUE self)
xmlErrorConstPtr error;
VALUE error_list;
int ret;
xmlDocPtr c_document;

TypedData_Get_Struct(self, xmlTextReader, &xml_reader_type, reader);

Expand All @@ -524,6 +531,16 @@ read_more(VALUE self)
ret = xmlTextReaderRead(reader);
xmlSetStructuredErrorFunc(NULL, NULL);

c_document = xmlTextReaderCurrentDoc(reader);
if (c_document && c_document->encoding == NULL) {
VALUE constructor_encoding = rb_iv_get(self, "@encoding");
if (RTEST(constructor_encoding)) {
c_document->encoding = xmlStrdup(BAD_CAST StringValueCStr(constructor_encoding));
} else {
c_document->encoding = xmlStrdup(BAD_CAST "UTF-8");
}
}

if (ret == 1) { return self; }
if (ret == 0) { return Qnil; }

Expand Down Expand Up @@ -707,15 +724,18 @@ rb_xml_reader_encoding(VALUE rb_reader)
const char *parser_encoding;
VALUE constructor_encoding;

TypedData_Get_Struct(rb_reader, xmlTextReader, &xml_reader_type, c_reader);
parser_encoding = (const char *)xmlTextReaderConstEncoding(c_reader);
if (parser_encoding) {
return NOKOGIRI_STR_NEW2(parser_encoding);
}

constructor_encoding = rb_iv_get(rb_reader, "@encoding");
if (RTEST(constructor_encoding)) {
return constructor_encoding;
}

TypedData_Get_Struct(rb_reader, xmlTextReader, &xml_reader_type, c_reader);
parser_encoding = (const char *)xmlTextReaderConstEncoding(c_reader);
if (parser_encoding == NULL) { return Qnil; }
return NOKOGIRI_STR_NEW2(parser_encoding);
return Qnil;
}

void
Expand Down
115 changes: 109 additions & 6 deletions test/xml/test_reader_encoding.rb
Original file line number Diff line number Diff line change
Expand Up @@ -15,32 +15,135 @@ def setup
)
end

def test_libxml2_detects_internal_encoding_correctly
skip_unless_libxml2("This feature wasn't implemented for JRuby")
def test_detects_internal_encoding_correctly
skip_unless_libxml2("Internal encoding detection isn't implemented yet for JRuby")

reader = Nokogiri::XML::Reader(<<~XML)
<?xml version="1.0" encoding="ISO-8859-1"?>
<root attr="foo"><employee /></root>
<anotaci\xF3n>inspiraci\xF3n</anotaci\xF3n>
XML

assert_nil(reader.encoding)

reader.each do
assert_equal("ISO-8859-1", reader.encoding)
end
end

def test_libxml2_overrides_internal_encoding_when_specified
def test_reader_defaults_internal_encoding_to_utf8
skip_unless_libxml2("Internal encoding detection isn't implemented yet for JRuby")

reader = Nokogiri::XML::Reader(<<~XML)
<?xml version="1.0"?>
<root attr="foo"><employee /></root>
XML

assert_nil(reader.encoding)

reader.each do
assert_equal("UTF-8", reader.encoding)
end
end

def test_override_internal_encoding_when_specified
if Nokogiri.uses_libxml? && !Nokogiri::VersionInfo.instance.libxml2_has_iconv?
skip("iconv is not compiled into libxml2")
end

#
# note that libxml2 behavior around document encoding changed at least twice between 2.9
# and 2.12, so the testing here is superficial -- asserting on the reported encoding, but
# not asserting on the bytes in the document or the serialized nodes.
#
reader = Nokogiri::XML::Reader(<<~XML, nil, "UTF-8")
<?xml version="1.0" encoding="ISO-8859-1"?>
<root attr="foo"><employee /></root>
<foo>asdf</foo>
XML

assert_equal("UTF-8", reader.encoding)
reader.each do

reader.read

if Nokogiri.jruby? || Nokogiri.uses_libxml?(">= 2.12.0")
assert_equal("UTF-8", reader.encoding)
else
assert_equal("ISO-8859-1", reader.encoding)
end

reader = Nokogiri::XML::Reader(<<~XML, nil, "ISO-8859-1")
<?xml version="1.0" encoding="UTF-8"?>
<foo>asdf</foo>
XML

assert_equal("ISO-8859-1", reader.encoding)

reader.read

if Nokogiri.jruby? || Nokogiri.uses_libxml?(">= 2.12.0")
assert_equal("ISO-8859-1", reader.encoding)
else
assert_equal("UTF-8", reader.encoding)
end
end

def test_attribute_encoding_issue_2891_no_encoding_specified
if Nokogiri.uses_libxml? && !Nokogiri::VersionInfo.instance.libxml2_has_iconv?
skip("iconv is not compiled into libxml2")
end

# https://github.com/sparklemotion/nokogiri/issues/2891
reader = Nokogiri::XML::Reader(<<~XML)
<?xml version="1.0"?>
<anotación tipo="inspiración">INSPIRACIÓN</anotación>
XML

assert_nil(reader.encoding)

reader.read

assert_equal("UTF-8", reader.encoding) unless Nokogiri.jruby? # JRuby doesn't support encoding detection
assert_equal(
"<anotación tipo=\"inspiración\">INSPIRACIÓN</anotación>",
reader.outer_xml,
)
end

def test_attribute_encoding_issue_2891_correct_encoding_specified
if Nokogiri.uses_libxml? && !Nokogiri::VersionInfo.instance.libxml2_has_iconv?
skip("iconv is not compiled into libxml2")
end

# https://github.com/sparklemotion/nokogiri/issues/2891
reader = Nokogiri::XML::Reader(<<~XML, nil, "UTF-8")
<?xml version="1.0"?>
<anotación tipo="inspiración">INSPIRACIÓN</anotación>
XML

assert_equal("UTF-8", reader.encoding)

reader.read

assert_equal("UTF-8", reader.encoding)
assert_equal(
"<anotación tipo=\"inspiración\">INSPIRACIÓN</anotación>",
reader.outer_xml,
)
end

def test_attribute_encoding_issue_2891_correct_encoding_specified_non_utf8
xml = <<~XML
<?xml version="1.0"?>
<test>\u{82B1}\u{82F1}</test>
XML
reader = Nokogiri::XML::Reader(xml, nil, "Shift_JIS")

assert_equal("Shift_JIS", reader.encoding)

reader.read

assert_equal("Shift_JIS", reader.encoding)
end

def test_attribute_at
@reader.each do |node|
next unless (attribute = node.attribute_at(0))
Expand Down