From 75c415b9960e4df8fa3c4285201747284848ca90 Mon Sep 17 00:00:00 2001 From: Joao Paulo Magalhaes Date: Wed, 22 Jan 2025 01:25:12 +0000 Subject: [PATCH] re #476: handle byte order marks --- README.md | 5 + changelog/current.md | 1 + src/c4/yml/common.hpp | 14 +++ src/c4/yml/parse_engine.def.hpp | 100 ++++++++++++++- src/c4/yml/parse_engine.hpp | 8 ++ test/test_doc.cpp | 209 ++++++++++++++++++++++++++++++++ test/test_lib/test_case.cpp | 17 +-- 7 files changed, 344 insertions(+), 10 deletions(-) diff --git a/README.md b/README.md index b5a66daf..b9eb88c9 100644 --- a/README.md +++ b/README.md @@ -762,6 +762,11 @@ following situations: reflects the usual practice of having at most 1 or 2 tag directives; also, be aware that this feature is under consideration for removal in YAML 1.3. +* Byte Order Marks: while ryml correctly handles BOMs at the beginning + of the stream or documents (as per the standard), BOMs inside + scalars are ignored. The [standard mandates that they should be + quoted](https://yaml.org/spec/1.2.2/#52-character-encodings) when + emitted, this is not done. * ryml tends to be on the permissive side in several cases where the YAML standard dictates that there should be an error; in many of these cases, ryml will tolerate the input. This may be good or bad, but in diff --git a/changelog/current.md b/changelog/current.md index 8d1ea45f..4a87e4d3 100644 --- a/changelog/current.md +++ b/changelog/current.md @@ -10,6 +10,7 @@ - add workarounds for problems with codegen of gcc 11,12,13. - improve CI coverage of gcc and clang optimization levels. - [BREAKING] Fix [#477](https://github.com/biojppm/rapidyaml/issues/477) ([PR#479](https://github.com/biojppm/rapidyaml/pull/479)): changed `read()` to overwrite existing entries. The provided implementations had an inconsistency between `std::map` (which wasn't overwriting) and `std::vector` (which *was* overwriting). +- Fix [#476](https://github.com/biojppm/rapidyaml/issues/476) [PR#493](https://github.com/biojppm/rapidyaml/pull/493): add handling of Byte Order Marks. - [PR#492](https://github.com/biojppm/rapidyaml/pull/492): fix emit of explicit keys when indented: ```yaml fixed: diff --git a/src/c4/yml/common.hpp b/src/c4/yml/common.hpp index a01be17f..7561d48f 100644 --- a/src/c4/yml/common.hpp +++ b/src/c4/yml/common.hpp @@ -419,6 +419,20 @@ struct RYML_EXPORT Callbacks /** @} */ +//----------------------------------------------------------------------------- +//----------------------------------------------------------------------------- +//----------------------------------------------------------------------------- + +typedef enum { + NOBOM, + UTF8, + UTF16LE, + UTF16BE, + UTF32LE, + UTF32BE, +} Encoding_e; + + //----------------------------------------------------------------------------- //----------------------------------------------------------------------------- //----------------------------------------------------------------------------- diff --git a/src/c4/yml/parse_engine.def.hpp b/src/c4/yml/parse_engine.def.hpp index 598b0953..a6d5b9f7 100644 --- a/src/c4/yml/parse_engine.def.hpp +++ b/src/c4/yml/parse_engine.def.hpp @@ -5,13 +5,13 @@ #include "c4/error.hpp" #include "c4/charconv.hpp" #include "c4/utf.hpp" -#include #include #include "c4/yml/detail/parser_dbg.hpp" #include "c4/yml/filter_processor.hpp" #ifdef RYML_DBG +#include #include "c4/yml/detail/print.hpp" #endif @@ -261,6 +261,9 @@ ParseEngine::ParseEngine(EventHandler *evt_handler, ParserOptions , m_evt_handler(evt_handler) , m_pending_anchors() , m_pending_tags() + , m_was_inside_qmrk(false) + , m_doc_empty(false) + , m_encoding(NOBOM) , m_newline_offsets() , m_newline_offsets_size(0) , m_newline_offsets_capacity(0) @@ -277,6 +280,9 @@ ParseEngine::ParseEngine(ParseEngine &&that) noexcept , m_evt_handler(that.m_evt_handler) , m_pending_anchors(that.m_pending_anchors) , m_pending_tags(that.m_pending_tags) + , m_was_inside_qmrk(false) + , m_doc_empty(false) + , m_encoding(NOBOM) , m_newline_offsets(that.m_newline_offsets) , m_newline_offsets_size(that.m_newline_offsets_size) , m_newline_offsets_capacity(that.m_newline_offsets_capacity) @@ -293,6 +299,9 @@ ParseEngine::ParseEngine(ParseEngine const& that) , m_evt_handler(that.m_evt_handler) , m_pending_anchors(that.m_pending_anchors) , m_pending_tags(that.m_pending_tags) + , m_was_inside_qmrk(false) + , m_doc_empty(false) + , m_encoding(NOBOM) , m_newline_offsets() , m_newline_offsets_size() , m_newline_offsets_capacity() @@ -317,6 +326,9 @@ ParseEngine& ParseEngine::operator=(ParseEngine &&th m_evt_handler = that.m_evt_handler; m_pending_anchors = that.m_pending_anchors; m_pending_tags = that.m_pending_tags; + m_was_inside_qmrk = that.m_was_inside_qmrk; + m_doc_empty = that.m_doc_empty; + m_encoding = that.m_encoding; m_newline_offsets = (that.m_newline_offsets); m_newline_offsets_size = (that.m_newline_offsets_size); m_newline_offsets_capacity = (that.m_newline_offsets_capacity); @@ -337,6 +349,9 @@ ParseEngine& ParseEngine::operator=(ParseEngine cons m_evt_handler = that.m_evt_handler; m_pending_anchors = that.m_pending_anchors; m_pending_tags = that.m_pending_tags; + m_was_inside_qmrk = that.m_was_inside_qmrk; + m_doc_empty = that.m_doc_empty; + m_encoding = that.m_encoding; if(that.m_newline_offsets_capacity > m_newline_offsets_capacity) _resize_locations(that.m_newline_offsets_capacity); _RYML_CB_CHECK(m_evt_handler->m_stack.m_callbacks, m_newline_offsets_capacity >= that.m_newline_offsets_capacity); @@ -357,6 +372,9 @@ void ParseEngine::_clr() m_evt_handler = {}; m_pending_anchors = {}; m_pending_tags = {}; + m_was_inside_qmrk = false; + m_doc_empty = true; + m_encoding = NOBOM; m_newline_offsets = {}; m_newline_offsets_size = {}; m_newline_offsets_capacity = {}; @@ -385,11 +403,12 @@ void ParseEngine::_reset() m_pending_anchors = {}; m_pending_tags = {}; m_doc_empty = true; + m_was_inside_qmrk = false; + m_encoding = NOBOM; if(m_options.locations()) { _prepare_locations(); } - m_was_inside_qmrk = false; } @@ -4351,6 +4370,72 @@ void ParseEngine::_handle_directive(csubstr rem) } } +template +bool ParseEngine::_handle_bom() +{ + const csubstr rem = m_evt_handler->m_curr->line_contents.rem; + if(rem.len) + { + const csubstr rest = rem.sub(1); + // https://yaml.org/spec/1.2.2/#52-character-encodings + #define _rymlisascii(c) ((c) <= '\x7f') // is the character ASCII? + if(rem.begins_with("\x00\x00\xfe\xff") || (rem.begins_with("\x00\x00\x00") && rem.len >= 4u && _rymlisascii(rem.str[3]))) + { + _c4dbgp("byte order mark: UTF32BE"); + _handle_bom(UTF32BE); + _line_progressed(4); + return true; + } + else if(rem.begins_with("\xff\xfe\x00\x00") || (rest.begins_with("\x00\x00\x00") && rem.len >= 4u && _rymlisascii(rem.str[0]))) + { + _c4dbgp("byte order mark: UTF32LE"); + _handle_bom(UTF32LE); + _line_progressed(4); + return true; + } + else if(rem.begins_with("\xfe\xff") || (rem.begins_with('\x00') && rem.len >= 2u && _rymlisascii(rem.str[1]))) + { + _c4dbgp("byte order mark: UTF16BE"); + _handle_bom(UTF16BE); + _line_progressed(2); + return true; + } + else if(rem.begins_with("\xff\xfe") || (rest.begins_with('\x00') && rem.len >= 2u && _rymlisascii(rem.str[0]))) + { + _c4dbgp("byte order mark: UTF16LE"); + _handle_bom(UTF16LE); + _line_progressed(2); + return true; + } + else if(rem.begins_with("\xef\xbb\xbf")) + { + _c4dbgp("byte order mark: UTF8"); + _handle_bom(UTF8); + _line_progressed(3); + return true; + } + #undef _rymlisascii + } + return false; +} + +template +void ParseEngine::_handle_bom(Encoding_e enc) +{ + if(m_encoding == NOBOM) + { + const bool is_beginning_of_file = m_evt_handler->m_curr->line_contents.rem.str == m_buf.str; + if(enc == UTF8 || is_beginning_of_file) + m_encoding = enc; + else + _c4err("non-UTF8 byte order mark can appear only at the beginning of the file"); + } + else if(enc != m_encoding) + { + _c4err("byte order mark can only be set once"); + } +} + //----------------------------------------------------------------------------- @@ -7202,6 +7287,10 @@ void ParseEngine::_handle_unk_json() _set_indentation(m_evt_handler->m_curr->line_contents.current_col(rem)); _line_progressed(1); } + else if(_handle_bom()) + { + _c4dbgp("byte order mark"); + } else { _RYML_CB_ASSERT(m_evt_handler->m_stack.m_callbacks, ! has_any(SSCL)); @@ -7288,8 +7377,13 @@ void ParseEngine::_handle_unk() if(m_evt_handler->m_curr->line_contents.indentation == 0u && _at_line_begin()) { - const char first = rem.str[0]; _c4dbgp("rtop: zero indent + at line begin"); + if(_handle_bom()) + { + _c4dbgp("byte order mark!"); + rem = m_evt_handler->m_curr->line_contents.rem; + } + const char first = rem.str[0]; if(first == '-') { _c4dbgp("rtop: suspecting doc"); diff --git a/src/c4/yml/parse_engine.hpp b/src/c4/yml/parse_engine.hpp index 77a41241..9bd76850 100644 --- a/src/c4/yml/parse_engine.hpp +++ b/src/c4/yml/parse_engine.hpp @@ -366,6 +366,10 @@ class ParseEngine /** Get the latest YAML buffer parsed by this object. */ csubstr source() const { return m_buf; } + /** Get the encoding of the latest YAML buffer parsed by this object. + * If no encoding was specified, UTF8 is assumed as per the YAML standard. */ + Encoding_e encoding() const { return m_encoding != NOBOM ? m_encoding : UTF8; } + id_type stack_capacity() const { RYML_ASSERT(m_evt_handler); return m_evt_handler->m_stack.capacity(); } size_t locations_capacity() const { return m_newline_offsets_capacity; } @@ -714,6 +718,8 @@ class ParseEngine void _handle_annotations_and_indentation_after_start_mapblck(size_t key_indentation, size_t key_line); size_t _select_indentation_from_annotations(size_t val_indentation, size_t val_line); void _handle_directive(csubstr rem); + bool _handle_bom(); + void _handle_bom(Encoding_e enc); void _check_tag(csubstr tag); @@ -738,6 +744,8 @@ class ParseEngine bool m_was_inside_qmrk; bool m_doc_empty = true; + Encoding_e m_encoding = UTF8; + private: size_t *m_newline_offsets; diff --git a/test/test_doc.cpp b/test/test_doc.cpp index 8de933a8..0ae17ace 100644 --- a/test/test_doc.cpp +++ b/test/test_doc.cpp @@ -47,6 +47,215 @@ scalar } +//----------------------------------------------------------------------------- +//----------------------------------------------------------------------------- +//----------------------------------------------------------------------------- + +struct bomspec { csubstr bom; const char *name; Encoding_e encoding; }; +const bomspec boms[] = { + {"", "NOBOM", UTF8}, + {"\xef\xbb\xbf", "UTF8", UTF8}, + {"\xfe\xff", "UTF16BE", UTF16BE}, + {"\xff\xfe", "UTF16LE", UTF16LE}, + {"\x00\x00\xfe\xff", "UTF32BE", UTF32BE}, + {"\xff\xfe\x00\x00", "UTF32LE", UTF32LE}, +}; +template +void test_boms(CreateFn &&createfn, TestFn &&testfn) +{ + Parser::handler_type handler; + Parser parser(&handler); + for(const bomspec bom : boms) + { + std::string buf = std::forward(createfn)(bom); + SCOPED_TRACE(bom.name); + SCOPED_TRACE(buf); + Tree tree = parse_in_arena(&parser, to_csubstr(buf)); + std::forward(testfn)(parser, tree, bom); + } +} +template +void test_boms_json(CreateFn &&createfn, TestFn &&testfn) +{ + Parser::handler_type handler; + Parser parser(&handler); + for(const bomspec bom : boms) + { + std::string buf = std::forward(createfn)(bom); + SCOPED_TRACE(bom.name); + SCOPED_TRACE(buf); + Tree tree = parse_json_in_arena(&parser, to_csubstr(buf)); + std::forward(testfn)(parser, tree, bom); + } +} +template +void test_boms2(CreateFn &&createfn, TestFn &&testfn) +{ + for(const bomspec bom1 : boms) + { + SCOPED_TRACE(bom1.name); + for(const bomspec bom2 : boms) + { + SCOPED_TRACE(bom2.name); + std::string buf = std::forward(createfn)(bom1, bom2); + SCOPED_TRACE(buf); + if(bom1.encoding == bom2.encoding || bom2.bom.empty()) + { + Parser::handler_type handler; + Parser parser(&handler); + Tree tree = parse_in_arena(&parser, to_csubstr(buf)); + std::forward(testfn)(parser, tree, bom1); + } + else + { + pfn_error orig = get_callbacks().m_error; + ExpectError::check_error([&]{ + Tree tree; + Parser::handler_type handler; + Parser parser(&handler); + ASSERT_EQ((pfn_error)tree.callbacks().m_error, (pfn_error)parser.callbacks().m_error); + ASSERT_NE((pfn_error)tree.callbacks().m_error, orig); + parse_in_arena(&parser, to_csubstr(buf), &tree); + }); + } + } + } +} + +TEST(byte_order_mark, only_bom) +{ + test_boms( + [](bomspec bom){ + return std::string(bom.bom.str, bom.bom.len); + }, + [](Parser const& parser, Tree const&, bomspec bom){ + EXPECT_EQ(parser.encoding(), bom.encoding); + }); +} + +TEST(byte_order_mark, bom_and_scalar) +{ + test_boms( + [](bomspec bom){ + std::string yaml(bom.bom.str, bom.bom.len); + yaml.append("this is a scalar"); + return yaml; + }, + [](Parser const& parser, Tree const& tree, bomspec bom){ + EXPECT_EQ(parser.encoding(), bom.encoding); + EXPECT_EQ(tree.rootref().val(), "this is a scalar"); + }); +} + +TEST(byte_order_mark, scalar_and_bom) +{ + auto mkyaml = [](bomspec bom){ + std::string yaml("this is a scalar"); + yaml.append(bom.bom.str, bom.bom.len); + return yaml; + }; + test_boms(mkyaml, + [&](Parser const& parser, Tree const& tree, bomspec bom){ + EXPECT_EQ(parser.encoding(), UTF8); + EXPECT_EQ(tree.rootref().val(), mkyaml(bom)); + }); +} + +TEST(byte_order_mark, scalar_bom_scalar) +{ + auto mkyaml = [](bomspec bom){ + std::string yaml("this is a scalar"); + yaml.append(bom.bom.str, bom.bom.len); + yaml.append("and it continues"); + return yaml; + }; + test_boms(mkyaml, + [&](Parser const& parser, Tree const& tree, bomspec bom){ + EXPECT_EQ(parser.encoding(), UTF8); + EXPECT_EQ(tree.rootref().val(), mkyaml(bom)); + }); +} + +TEST(byte_order_mark, bom_and_seq) +{ + auto mkyaml = [](bomspec bom){ + std::string yaml(bom.bom.str, bom.bom.len); + yaml.append("[]"); + return yaml; + }; + auto test = [&](Parser const& parser, Tree const&, bomspec bom){ + EXPECT_EQ(parser.encoding(), bom.encoding); + }; + test_boms(mkyaml, test); + test_boms_json(mkyaml, test); +} + +TEST(byte_order_mark, bom_and_map) +{ + auto mkyaml = [](bomspec bom){ + std::string yaml(bom.bom.str, bom.bom.len); + yaml.append("{}"); + return yaml; + }; + auto test = [&](Parser const& parser, Tree const&, bomspec bom){ + EXPECT_EQ(parser.encoding(), bom.encoding); + }; + test_boms(mkyaml, test); + test_boms_json(mkyaml, test); +} + +TEST(byte_order_mark, bom_and_doc) +{ + auto mkyaml = [](bomspec bom){ + std::string yaml(bom.bom.str, bom.bom.len); + yaml.append("---\nabc"); + return yaml; + }; + auto test = [&](Parser const& parser, Tree const& tree, bomspec bom){ + EXPECT_EQ(parser.encoding(), bom.encoding); + EXPECT_EQ(tree.docref(0).val(), "abc"); + }; + test_boms(mkyaml, test); +} + +TEST(byte_order_mark, bom_doc_bom) +{ + auto mkyaml = [](bomspec bom1, bomspec bom2){ + std::string yaml(bom1.bom.str, bom1.bom.len); + yaml.append("---\n"); + yaml.append(bom2.bom.str, bom2.bom.len); + yaml.append("abc"); + std::cout << bom1.name << " vs " << bom2.name << "\n" << yaml << "\n"; + return yaml; + }; + auto test = [&](Parser const& parser, Tree const& tree, bomspec bom){ + EXPECT_EQ(parser.encoding(), bom.encoding); + EXPECT_EQ(tree.docref(0).val(), "abc"); + }; + test_boms2(mkyaml, test); +} + +TEST(byte_order_mark, bom_scalar_doc_bom_scalar) +{ + auto mkyaml = [](bomspec bom1, bomspec bom2){ + std::string yaml(bom1.bom.str, bom1.bom.len); + yaml.append("abc\n"); + yaml.append("---\n"); + yaml.append(bom2.bom.str, bom2.bom.len); + yaml.append("def\n"); + std::cout << bom1.name << " vs " << bom2.name << "\n" << yaml << "\n"; + return yaml; + }; + auto test = [&](Parser const& parser, Tree const& tree, bomspec bom){ + EXPECT_EQ(parser.encoding(), bom.encoding); + ASSERT_EQ(tree.rootref().num_children(), 2); + EXPECT_EQ(tree.docref(0).val(), "abc"); + EXPECT_EQ(tree.docref(1).val(), "def"); + }; + test_boms2(mkyaml, test); +} + + //----------------------------------------------------------------------------- //----------------------------------------------------------------------------- //----------------------------------------------------------------------------- diff --git a/test/test_lib/test_case.cpp b/test/test_lib/test_case.cpp index fc53a63c..78b714c2 100644 --- a/test/test_lib/test_case.cpp +++ b/test/test_lib/test_case.cpp @@ -216,7 +216,7 @@ ExpectError::ExpectError(Tree *tree, Location loc) : m_got_an_error(false) , m_tree(tree) , m_glob_prev(get_callbacks()) - , m_tree_prev(tree ? tree->callbacks() : Callbacks{}) + , m_tree_prev(tree ? tree->callbacks() : m_glob_prev) , expected_location(loc) { auto err = [](const char* msg, size_t len, Location errloc, void *this_) { @@ -230,20 +230,23 @@ ExpectError::ExpectError(Tree *tree, Location loc) ); C4_UNREACHABLE_AFTER_ERR(); }; + pfn_error perr = err; #ifdef RYML_NO_DEFAULT_CALLBACKS - c4::yml::Callbacks tcb((void*)this, nullptr, nullptr, err); - c4::yml::Callbacks gcb((void*)this, nullptr, nullptr, err); + c4::yml::Callbacks tcb((void*)this, nullptr, nullptr, perr); + c4::yml::Callbacks gcb((void*)this, nullptr, nullptr, perr); #else - c4::yml::Callbacks tcb((void*)this, tree ? m_tree_prev.m_allocate : nullptr, tree ? m_tree_prev.m_free : nullptr, err); - c4::yml::Callbacks gcb((void*)this, m_glob_prev.m_allocate, m_glob_prev.m_free, err); + c4::yml::Callbacks tcb((void*)this, tree ? m_tree_prev.m_allocate : nullptr, tree ? m_tree_prev.m_free : nullptr, perr); + c4::yml::Callbacks gcb((void*)this, m_glob_prev.m_allocate, m_glob_prev.m_free, perr); #endif if(tree) { - _c4dbgp("setting error callback: tree"); + _c4dbgpf("setting error callback: tree err={}", c4::fmt::hex(perr)); tree->callbacks(tcb); + EXPECT_EQ(tree->callbacks().m_error, perr); } - _c4dbgp("setting error callback: global"); + _c4dbgpf("setting error callback: global err={}", c4::fmt::hex(perr)); set_callbacks(gcb); + EXPECT_EQ(get_callbacks().m_error, perr); } ExpectError::~ExpectError()