diff --git a/.gitignore b/.gitignore index 641b684ab..496148970 100644 --- a/.gitignore +++ b/.gitignore @@ -33,6 +33,7 @@ build/ install/ .python-version compile_commands.json +launch.json # test files /Testing/ diff --git a/CMakeLists.txt b/CMakeLists.txt index 11c52e064..8abfb16dc 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -36,9 +36,11 @@ c4_add_library(ryml c4/yml/common.cpp c4/yml/emit.def.hpp c4/yml/emit.hpp + c4/yml/filter_processor.hpp c4/yml/export.hpp c4/yml/node.hpp c4/yml/node.cpp + c4/yml/node_type.hpp c4/yml/parse.hpp c4/yml/parse.cpp c4/yml/preprocess.hpp diff --git a/ext/c4core b/ext/c4core index 8f060a469..cb4d74334 160000 --- a/ext/c4core +++ b/ext/c4core @@ -1 +1 @@ -Subproject commit 8f060a4693eef378e92ac0dff35875bc81b910db +Subproject commit cb4d74334c4609d08064fcce0a1ff06ff088cf4e diff --git a/samples/quickstart.cpp b/samples/quickstart.cpp index 1363dad1a..dc7923f8d 100644 --- a/samples/quickstart.cpp +++ b/samples/quickstart.cpp @@ -138,26 +138,32 @@ namespace sample { bool report_check(int line, const char *predicate, bool result); #ifdef __GNUC__ #if __GNUC__ == 4 && __GNUC_MINOR__ >= 8 -struct CheckPredicate { +#define CHECK CheckPredicate{__FILE__, __LINE__} +struct CheckPredicate +{ const char *file; const int line; - void operator() (bool predicate) const { if (!report_check(line, nullptr, predicate)) { +#ifdef RYML_DBG RYML_DEBUG_BREAK(); +#endif } } }; -#define CHECK CheckPredicate{__FILE__, __LINE__} #endif #endif -#if !defined(CHECK) +#ifndef CHECK +#ifndef RYML_DBG /// a quick'n'dirty assertion to verify a predicate +#define CHECK(predicate) report_check(__LINE__, #predicate, (predicate)) +#else #define CHECK(predicate) do { if(!report_check(__LINE__, #predicate, (predicate))) { RYML_DEBUG_BREAK(); } } while(0) #endif +#endif //----------------------------------------------------------------------------- @@ -3877,7 +3883,7 @@ void sample_error_handler() struct GlobalAllocatorExample { std::vector memory_pool = 
std::vector(10u * 1024u); // 10KB - size_t num_allocs = 0, alloc_size = 0; + size_t num_allocs = 0, alloc_size = 0, corr_size = 0; size_t num_deallocs = 0, dealloc_size = 0; void *allocate(size_t len) @@ -3885,11 +3891,20 @@ struct GlobalAllocatorExample void *ptr = &memory_pool[alloc_size]; alloc_size += len; ++num_allocs; - if(C4_UNLIKELY(alloc_size > memory_pool.size())) + // ensure the ptr is aligned + uintptr_t uptr = (uintptr_t)ptr; + const uintptr_t align = alignof(max_align_t); + if (uptr % align) { - std::cerr << "out of memory! requested=" << alloc_size << " vs " << memory_pool.size() << " available" << std::endl; - std::abort(); + uintptr_t prev = uptr - (uptr % align); + uintptr_t next = prev + align; + uintptr_t corr = next - uptr; + ptr = (void*)(((char*)ptr) + corr); + corr_size += corr; } + C4_CHECK_MSG(alloc_size + corr_size <= memory_pool.size(), + "out of memory! requested=%zu+%zu available=%zu\n", + alloc_size, corr_size, memory_pool.size()); return ptr; } diff --git a/src/c4/yml/common.cpp b/src/c4/yml/common.cpp index 1a86e86ae..40811d27f 100644 --- a/src/c4/yml/common.cpp +++ b/src/c4/yml/common.cpp @@ -89,9 +89,9 @@ Callbacks::Callbacks(void *user_data, pfn_allocate alloc_, pfn_free free_, pfn_e m_error(error_) #endif { - C4_CHECK(m_allocate); - C4_CHECK(m_free); - C4_CHECK(m_error); + RYML_CHECK(m_allocate); + RYML_CHECK(m_free); + RYML_CHECK(m_error); } diff --git a/src/c4/yml/common.hpp b/src/c4/yml/common.hpp index f74de9dd8..f6cd6b68b 100644 --- a/src/c4/yml/common.hpp +++ b/src/c4/yml/common.hpp @@ -6,6 +6,17 @@ #include + + +#include + + +#ifndef RYML_ERRMSG_SIZE +/// size for the error message buffer + #define RYML_ERRMSG_SIZE 1024 +#endif + + #ifndef RYML_USE_ASSERT # define RYML_USE_ASSERT C4_USE_ASSERT #endif @@ -20,16 +31,18 @@ #endif -#if defined(NDEBUG) || defined(C4_NO_DEBUG_BREAK) +#ifndef RYML_DBG +# define RYML_DEBUG_BREAK() +#elif (defined(NDEBUG) || defined(C4_NO_DEBUG_BREAK)) # define RYML_DEBUG_BREAK() #else # define 
RYML_DEBUG_BREAK() \ - { \ + do { \ if(c4::get_error_flags() & c4::ON_ERROR_DEBUGBREAK) \ { \ C4_DEBUG_BREAK(); \ } \ - } + } while(0) #endif @@ -37,7 +50,7 @@ do { \ if(!(cond)) \ { \ - RYML_DEBUG_BREAK() \ + RYML_DEBUG_BREAK(); \ c4::yml::error("check failed: " #cond, c4::yml::Location(__FILE__, __LINE__, 0)); \ } \ } while(0) @@ -47,7 +60,7 @@ { \ if(!(cond)) \ { \ - RYML_DEBUG_BREAK() \ + RYML_DEBUG_BREAK(); \ c4::yml::error(msg ": check failed: " #cond, c4::yml::Location(__FILE__, __LINE__, 0)); \ } \ } while(0) @@ -197,7 +210,7 @@ RYML_EXPORT void reset_callbacks(); do \ { \ const char msg[] = msg_literal; \ - RYML_DEBUG_BREAK() \ + RYML_DEBUG_BREAK(); \ (cb).m_error(msg, sizeof(msg), c4::yml::Location(__FILE__, 0, __LINE__, 0), (cb).m_user_data); \ } while(0) #define _RYML_CB_CHECK(cb, cond) \ @@ -206,7 +219,7 @@ do \ if(!(cond)) \ { \ const char msg[] = "check failed: " #cond; \ - RYML_DEBUG_BREAK() \ + RYML_DEBUG_BREAK(); \ (cb).m_error(msg, sizeof(msg), c4::yml::Location(__FILE__, 0, __LINE__, 0), (cb).m_user_data); \ } \ } while(0) @@ -236,27 +249,61 @@ struct _charconstant_t } // namespace detail +typedef enum { + BLOCK_LITERAL, //!< keep newlines (|) + BLOCK_FOLD //!< replace newline with single space (>) +} BlockStyle_e; + +typedef enum { + CHOMP_CLIP, //!< single newline at end (default) + CHOMP_STRIP, //!< no newline at end (-) + CHOMP_KEEP //!< all newlines from end (+) +} BlockChomp_e; + + +/** Abstracts the fact that a filter result may not fit in the intended memory. */ +struct FilterResult +{ + C4_ALWAYS_INLINE bool valid() const noexcept { return str.str != nullptr; } + C4_ALWAYS_INLINE size_t required_len() const noexcept { return str.len; } + C4_ALWAYS_INLINE csubstr get() { RYML_ASSERT(valid()); return str; } + csubstr str; +}; +/** Abstracts the fact that a filter result may not fit in the intended memory. 
*/ +struct FilterResultExtending +{ + C4_ALWAYS_INLINE bool valid() const noexcept { return str.str != nullptr; } + C4_ALWAYS_INLINE size_t required_len() const noexcept { return reqlen; } + C4_ALWAYS_INLINE csubstr get() { RYML_ASSERT(valid()); return str; } + csubstr str; + size_t reqlen; +}; + + namespace detail { struct _SubstrWriter { substr buf; size_t pos; - _SubstrWriter(substr buf_, size_t pos_=0) : buf(buf_), pos(pos_) {} + _SubstrWriter(substr buf_, size_t pos_=0) : buf(buf_), pos(pos_) { C4_ASSERT(buf.str); } void append(csubstr s) { C4_ASSERT(!s.overlaps(buf)); - if(pos + s.len <= buf.len) + C4_ASSERT(s.str || !s.len); + if(s.len && pos + s.len <= buf.len) memcpy(buf.str + pos, s.str, s.len); pos += s.len; } void append(char c) { + C4_ASSERT(buf.str); if(pos < buf.len) buf.str[pos] = c; ++pos; } void append_n(char c, size_t numtimes) { + C4_ASSERT(buf.str); if(pos + numtimes < buf.len) memset(buf.str + pos, c, numtimes); pos += numtimes; @@ -274,6 +321,43 @@ struct _SubstrWriter /// @endcond + +//----------------------------------------------------------------------------- +//----------------------------------------------------------------------------- +//----------------------------------------------------------------------------- + +template +C4_NO_INLINE void _parse_dump(DumpFn dumpfn, csubstr fmt, Args&& ...args) +{ + char writebuf[256]; + auto results = format_dump_resume(dumpfn, writebuf, fmt, std::forward(args)...); + // resume writing if the results failed to fit the buffer + if(C4_UNLIKELY(results.bufsize > sizeof(writebuf))) // bufsize will be that of the largest element serialized. Eg int(1), will require 1 byte. 
+ { + results = format_dump_resume(dumpfn, results, writebuf, fmt, std::forward(args)...); + if(C4_UNLIKELY(results.bufsize > sizeof(writebuf))) + { + results = format_dump_resume(dumpfn, results, writebuf, fmt, std::forward(args)...); + C4_CHECK(results.bufsize <= sizeof(writebuf)); + } + } +} +template +void _report_err(Callbacks const& C4_RESTRICT callbacks, Location const& C4_RESTRICT loc, csubstr fmt, Args const& C4_RESTRICT ...args) +{ + char errmsg[RYML_ERRMSG_SIZE]; + detail::_SubstrWriter writer(errmsg); + auto dumpfn = [&writer](csubstr s){ writer.append(s); }; + _parse_dump(dumpfn, fmt, args...); + writer.append('\n'); + if(loc.name.len) + _parse_dump(dumpfn, "{}:", loc.name); + _parse_dump(dumpfn, "{}:{}: ", loc.line, loc.col); + size_t len = writer.pos < RYML_ERRMSG_SIZE ? writer.pos : RYML_ERRMSG_SIZE; + printf("AQUI 0\n"); + callbacks.m_error(errmsg, len, loc, callbacks.m_user_data); +} + C4_SUPPRESS_WARNING_GCC_CLANG_POP } // namespace yml diff --git a/src/c4/yml/detail/parser_dbg.hpp b/src/c4/yml/detail/parser_dbg.hpp index 457f1700d..57c17d023 100644 --- a/src/c4/yml/detail/parser_dbg.hpp +++ b/src/c4/yml/detail/parser_dbg.hpp @@ -24,10 +24,14 @@ #pragma clang diagnostic ignored "-Wgnu-zero-variadic-macro-arguments" // some debugging scaffolds -#ifdef RYML_DBG #include +#ifdef RYML_DBG namespace c4 { -inline void _dbg_dumper(csubstr s) { fwrite(s.str, 1, s.len, stdout); }; +inline void _dbg_dumper(csubstr s) +{ + if(s.str) + fwrite(s.str, 1, s.len, stdout); +} template void _dbg_printf(c4::csubstr fmt, Args&& ...args) { @@ -50,21 +54,24 @@ void _dbg_printf(c4::csubstr fmt, Args&& ...args) # define _c4dbgp(msg) _dbg_printf("{}:{}: " msg "\n", __FILE__, __LINE__ ) # define _c4dbgq(msg) _dbg_printf(msg "\n") # define _c4err(fmt, ...) 
\ - do { if(c4::is_debugger_attached()) { C4_DEBUG_BREAK(); } \ - this->_err("ERROR:\n" "{}:{}: " fmt, __FILE__, __LINE__, ## __VA_ARGS__); } while(0) + do { RYML_DEBUG_BREAK(); this->_err("ERROR:\n" "{}:{}: " fmt, __FILE__, __LINE__, ## __VA_ARGS__); } while(0) #else # define _c4dbgt(fmt, ...) # define _c4dbgpf(fmt, ...) # define _c4dbgp(msg) # define _c4dbgq(msg) # define _c4err(fmt, ...) \ - do { if(c4::is_debugger_attached()) { C4_DEBUG_BREAK(); } \ - this->_err("ERROR: " fmt, ## __VA_ARGS__); } while(0) + this->_err("ERROR: " fmt, ## __VA_ARGS__) #endif + +//----------------------------------------------------------------------------- +//----------------------------------------------------------------------------- +//----------------------------------------------------------------------------- + #define _c4prsp(sp) sp #define _c4presc(s) __c4presc(s.str, s.len) -inline c4::csubstr _c4prc(const char &C4_RESTRICT c) +inline c4::csubstr _c4prc(const char &C4_RESTRICT c) // pass by reference! 
{ switch(c) { @@ -81,6 +88,7 @@ inline c4::csubstr _c4prc(const char &C4_RESTRICT c) } inline void __c4presc(const char *s, size_t len) { + RYML_ASSERT(s || !len); size_t prev = 0; for(size_t i = 0; i < len; ++i) { @@ -123,7 +131,8 @@ inline void __c4presc(const char *s, size_t len) } } } - fwrite(s + prev, 1, len - prev, stdout); + if(len > prev) + fwrite(s + prev, 1, len - prev, stdout); } #pragma clang diagnostic pop diff --git a/src/c4/yml/detail/stack.hpp b/src/c4/yml/detail/stack.hpp index a3f060d70..26ed8e515 100644 --- a/src/c4/yml/detail/stack.hpp +++ b/src/c4/yml/detail/stack.hpp @@ -177,6 +177,7 @@ void stack::reserve(size_t sz) return; } T *buf = (T*) m_callbacks.m_allocate(sz * sizeof(T), m_stack, m_callbacks.m_user_data); + RYML_ASSERT(((uintptr_t)buf % alignof(T)) == 0u); memcpy(buf, m_stack, m_size * sizeof(T)); if(m_stack != m_buf) { diff --git a/src/c4/yml/emit.hpp b/src/c4/yml/emit.hpp index c7cdd2a1a..1baf2ef5e 100644 --- a/src/c4/yml/emit.hpp +++ b/src/c4/yml/emit.hpp @@ -16,11 +16,11 @@ #define RYML_DEPRECATE_EMIT \ RYML_DEPRECATED("use emit_yaml() instead. See https://github.com/biojppm/rapidyaml/issues/120") +#define RYML_DEPRECATE_EMITRS \ + RYML_DEPRECATED("use emitrs_yaml() instead. See https://github.com/biojppm/rapidyaml/issues/120") #ifdef emit #error "emit is defined, likely from a Qt include. This will cause a compilation error. See https://github.com/biojppm/rapidyaml/issues/120" #endif -#define RYML_DEPRECATE_EMITRS \ - RYML_DEPRECATED("use emitrs_yaml() instead. 
See https://github.com/biojppm/rapidyaml/issues/120") //----------------------------------------------------------------------------- diff --git a/src/c4/yml/filter_processor.hpp b/src/c4/yml/filter_processor.hpp new file mode 100644 index 000000000..4e4ba4955 --- /dev/null +++ b/src/c4/yml/filter_processor.hpp @@ -0,0 +1,500 @@ +#ifndef _C4_YML_FILTER_PROCESSOR_HPP_ +#define _C4_YML_FILTER_PROCESSOR_HPP_ + +#include "c4/yml/common.hpp" + +#ifdef RYML_DBG +#include "c4/charconv.hpp" +#include "c4/yml/detail/parser_dbg.hpp" +#endif + +namespace c4 { +namespace yml { + +/** @name filter_processors filter processors */ +/** @{ */ + +//----------------------------------------------------------------------------- + +/** Filters an input string into a different output string */ +struct FilterProcessorSrcDst +{ + csubstr src; + substr dst; + size_t rpos; ///< read position + size_t wpos; ///< write position + + C4_ALWAYS_INLINE FilterProcessorSrcDst(csubstr src_, substr dst_) noexcept + : src(src_) + , dst(dst_) + , rpos(0) + , wpos(0) + { + RYML_ASSERT(!dst.overlaps(src)); + } + + C4_ALWAYS_INLINE void setwpos(size_t wpos_) noexcept { wpos = wpos_; } + C4_ALWAYS_INLINE void setpos(size_t rpos_, size_t wpos_) noexcept { rpos = rpos_; wpos = wpos_; } + C4_ALWAYS_INLINE void set_at_end() noexcept { skip(src.len - rpos); } + + C4_ALWAYS_INLINE bool has_more_chars() const noexcept { return rpos < src.len; } + C4_ALWAYS_INLINE bool has_more_chars(size_t maxpos) const noexcept { RYML_ASSERT(maxpos <= src.len); return rpos < maxpos; } + + C4_ALWAYS_INLINE csubstr rem() const noexcept { return src.sub(rpos); } + C4_ALWAYS_INLINE csubstr sofar() const noexcept { return csubstr(dst.str, wpos <= dst.len ? wpos : dst.len); } + C4_ALWAYS_INLINE FilterResult result() const noexcept + { + FilterResult ret; + ret.str.str = wpos <= dst.len ? 
dst.str : nullptr; + ret.str.len = wpos; + return ret; + } + + C4_ALWAYS_INLINE char curr() const noexcept { RYML_ASSERT(rpos < src.len); return src[rpos]; } + C4_ALWAYS_INLINE char next() const noexcept { return rpos+1 < src.len ? src[rpos+1] : '\0'; } + C4_ALWAYS_INLINE bool skipped_chars() const noexcept { return wpos != rpos; } + + C4_ALWAYS_INLINE void skip() noexcept { ++rpos; } + C4_ALWAYS_INLINE void skip(size_t num) noexcept { rpos += num; } + + C4_ALWAYS_INLINE void set_at(size_t pos, char c) noexcept + { + RYML_ASSERT(pos < wpos); + dst.str[pos] = c; + } + C4_ALWAYS_INLINE void set(char c) noexcept + { + if(wpos < dst.len) + dst.str[wpos] = c; + ++wpos; + } + C4_ALWAYS_INLINE void set(char c, size_t num) noexcept + { + RYML_ASSERT(num > 0); + if(wpos + num <= dst.len) + memset(dst.str + wpos, c, num); + wpos += num; + } + + C4_ALWAYS_INLINE void copy() noexcept + { + RYML_ASSERT(rpos < src.len); + if(wpos < dst.len) + dst.str[wpos] = src.str[rpos]; + ++wpos; + ++rpos; + } + C4_ALWAYS_INLINE void copy(size_t num) noexcept + { + RYML_ASSERT(num); + RYML_ASSERT(rpos+num <= src.len); + if(wpos + num <= dst.len) + memcpy(dst.str + wpos, src.str + rpos, num); + wpos += num; + rpos += num; + } + + C4_ALWAYS_INLINE void translate_esc(char c) noexcept + { + if(wpos < dst.len) + dst.str[wpos] = c; + ++wpos; + rpos += 2; + } + C4_ALWAYS_INLINE void translate_esc_bulk(const char *C4_RESTRICT s, size_t nw, size_t nr) noexcept + { + RYML_ASSERT(nw > 0); + RYML_ASSERT(nr > 0); + RYML_ASSERT(rpos+nr <= src.len); + if(wpos+nw <= dst.len) + memcpy(dst.str + wpos, s, nw); + wpos += nw; + rpos += 1 + nr; + } + C4_ALWAYS_INLINE void translate_esc_extending(const char *C4_RESTRICT s, size_t nw, size_t nr) noexcept + { + translate_esc_bulk(s, nw, nr); + } +}; + + +//----------------------------------------------------------------------------- +// filter in place + +// debugging scaffold +#if defined(RYML_DBG) && 0 +#define _c4dbgip(...) 
_c4dbgpf(__VA_ARGS__) +#else +#define _c4dbgip(...) +#endif + +/** Filters in place. While the result may be larger than the source, + * any extending happens only at the end of the string. Consequently, + * it's impossible for characters to be left unfiltered. + * + * @see FilterProcessorInplaceMidExtending */ +struct FilterProcessorInplaceEndExtending +{ + substr src; ///< the subject string + size_t wcap; ///< write capacity - the capacity of the subject string's buffer + size_t rpos; ///< read position + size_t wpos; ///< write position + + C4_ALWAYS_INLINE FilterProcessorInplaceEndExtending(substr src_, size_t wcap_) noexcept + : src(src_) + , wcap(wcap_) + , rpos(0) + , wpos(0) + { + RYML_ASSERT(wcap >= src.len); + } + + C4_ALWAYS_INLINE void setwpos(size_t wpos_) noexcept { wpos = wpos_; } + C4_ALWAYS_INLINE void setpos(size_t rpos_, size_t wpos_) noexcept { rpos = rpos_; wpos = wpos_; } + C4_ALWAYS_INLINE void set_at_end() noexcept { skip(src.len - rpos); } + + C4_ALWAYS_INLINE bool has_more_chars() const noexcept { return rpos < src.len; } + C4_ALWAYS_INLINE bool has_more_chars(size_t maxpos) const noexcept { RYML_ASSERT(maxpos <= src.len); return rpos < maxpos; } + + C4_ALWAYS_INLINE FilterResult result() const noexcept + { + _c4dbgip("inplace: wpos={} wcap={} small={}", wpos, wcap, wpos > rpos); + FilterResult ret; + ret.str.str = (wpos <= wcap) ? src.str : nullptr; + ret.str.len = wpos; + return ret; + } + C4_ALWAYS_INLINE csubstr sofar() const noexcept { return csubstr(src.str, wpos <= wcap ? wpos : wcap); } + C4_ALWAYS_INLINE csubstr rem() const noexcept { return src.sub(rpos); } + + C4_ALWAYS_INLINE char curr() const noexcept { RYML_ASSERT(rpos < src.len); return src[rpos]; } + C4_ALWAYS_INLINE char next() const noexcept { return rpos+1 < src.len ? 
src[rpos+1] : '\0'; } + + C4_ALWAYS_INLINE void skip() noexcept { ++rpos; } + C4_ALWAYS_INLINE void skip(size_t num) noexcept { rpos += num; } + + void set_at(size_t pos, char c) noexcept + { + RYML_ASSERT(pos < wpos); + const size_t save = wpos; + wpos = pos; + set(c); + wpos = save; + } + void set(char c) noexcept + { + if(wpos < wcap) // respect write-capacity + src.str[wpos] = c; + ++wpos; + } + void set(char c, size_t num) noexcept + { + RYML_ASSERT(num); + if(wpos + num <= wcap) // respect write-capacity + memset(src.str + wpos, c, num); + wpos += num; + } + + void copy() noexcept + { + RYML_ASSERT(wpos <= rpos); + RYML_ASSERT(rpos < src.len); + if(wpos < wcap) // respect write-capacity + src.str[wpos] = src.str[rpos]; + ++rpos; + ++wpos; + } + void copy(size_t num) noexcept + { + RYML_ASSERT(num); + RYML_ASSERT(rpos+num <= src.len); + RYML_ASSERT(wpos <= rpos); + if(wpos + num <= wcap) // respect write-capacity + { + if(wpos + num <= rpos) // there is no overlap + memcpy(src.str + wpos, src.str + rpos, num); + else // there is overlap + memmove(src.str + wpos, src.str + rpos, num); + } + rpos += num; + wpos += num; + } + + void translate_esc(char c) noexcept + { + RYML_ASSERT(rpos + 2 <= src.len); + RYML_ASSERT(wpos <= rpos); + if(wpos < wcap) // respect write-capacity + src.str[wpos] = c; + rpos += 2; // add 1u to account for the escape character + ++wpos; + } + + void translate_esc_bulk(const char *C4_RESTRICT s, size_t nw, size_t nr) noexcept + { + RYML_ASSERT(nw > 0); + RYML_ASSERT(nr > 0); + RYML_ASSERT(nw <= nr + 1u); + RYML_ASSERT(rpos+nr <= src.len); + RYML_ASSERT(wpos <= rpos); + const size_t wpos_next = wpos + nw; + const size_t rpos_next = rpos + nr + 1u; // add 1u to account for the escape character + RYML_ASSERT(wpos_next <= rpos_next); + if(wpos_next <= wcap) + memcpy(src.str + wpos, s, nw); + rpos = rpos_next; + wpos = wpos_next; + } + + C4_ALWAYS_INLINE void translate_esc_extending(const char *C4_RESTRICT s, size_t nw, size_t nr) noexcept + { 
+ translate_esc_bulk(s, nw, nr); + } +}; + + +/** Filters in place. The result may be larger than the source, and + * extending may happen anywhere. As a result some characters may be + * left unfiltered when there is no slack in the buffer and the + * write-position would overlap the read-position. Consequently, it's + * possible for characters to be left unfiltered. In YAML, this + * happens only with double-quoted strings, and only with a small + * number of escape sequences such as \L which is substituted by three + * bytes. These escape sequences cause a call to translate_esc_extending() + * which is the only entry point to this unfiltered situation. + * + * @see FilterProcessorInplaceMidExtending */ +struct FilterProcessorInplaceMidExtending +{ + substr src; ///< the subject string + size_t wcap; ///< write capacity - the capacity of the subject string's buffer + size_t rpos; ///< read position + size_t wpos; ///< write position + size_t maxcap; ///< the max capacity needed for filtering the string. This may be larger than the final string size. 
+ bool unfiltered_chars; ///< number of characters that were not added to wpos from lack of capacity + + C4_ALWAYS_INLINE FilterProcessorInplaceMidExtending(substr src_, size_t wcap_) noexcept + : src(src_) + , wcap(wcap_) + , rpos(0) + , wpos(0) + , maxcap(src.len) + , unfiltered_chars(false) + { + RYML_ASSERT(wcap >= src.len); + } + + C4_ALWAYS_INLINE void setwpos(size_t wpos_) noexcept { wpos = wpos_; } + C4_ALWAYS_INLINE void setpos(size_t rpos_, size_t wpos_) noexcept { rpos = rpos_; wpos = wpos_; } + C4_ALWAYS_INLINE void set_at_end() noexcept { skip(src.len - rpos); } + + C4_ALWAYS_INLINE bool has_more_chars() const noexcept { return rpos < src.len; } + C4_ALWAYS_INLINE bool has_more_chars(size_t maxpos) const noexcept { RYML_ASSERT(maxpos <= src.len); return rpos < maxpos; } + + C4_ALWAYS_INLINE FilterResultExtending result() const noexcept + { + _c4dbgip("inplace: wpos={} wcap={} unfiltered={} maxcap={}", this->wpos, this->wcap, this->unfiltered_chars, this->maxcap); + FilterResultExtending ret; + ret.str.str = (wpos <= wcap && !unfiltered_chars) ? src.str : nullptr; + ret.str.len = wpos; + ret.reqlen = maxcap; + return ret; + } + C4_ALWAYS_INLINE csubstr sofar() const noexcept { return csubstr(src.str, wpos <= wcap ? wpos : wcap); } + C4_ALWAYS_INLINE csubstr rem() const noexcept { return src.sub(rpos); } + + C4_ALWAYS_INLINE char curr() const noexcept { RYML_ASSERT(rpos < src.len); return src[rpos]; } + C4_ALWAYS_INLINE char next() const noexcept { return rpos+1 < src.len ? 
src[rpos+1] : '\0'; } + + C4_ALWAYS_INLINE void skip() noexcept { ++rpos; } + C4_ALWAYS_INLINE void skip(size_t num) noexcept { rpos += num; } + + void set_at(size_t pos, char c) noexcept + { + RYML_ASSERT(pos < wpos); + const size_t save = wpos; + wpos = pos; + set(c); + wpos = save; + } + void set(char c) noexcept + { + if(wpos < wcap) // respect write-capacity + { + if((wpos <= rpos) && !unfiltered_chars) + src.str[wpos] = c; + } + else + { + _c4dbgip("inplace: add unwritten {}->{} maxcap={}->{}!", unfiltered_chars, true, maxcap, (wpos+1u > maxcap ? wpos+1u : maxcap)); + unfiltered_chars = true; + } + ++wpos; + maxcap = wpos > maxcap ? wpos : maxcap; + } + void set(char c, size_t num) noexcept + { + RYML_ASSERT(num); + if(wpos + num <= wcap) // respect write-capacity + { + if((wpos <= rpos) && !unfiltered_chars) + memset(src.str + wpos, c, num); + } + else + { + _c4dbgip("inplace: add unwritten {}->{} maxcap={}->{}!", unfiltered_chars, true, maxcap, (wpos+num > maxcap ? wpos+num : maxcap)); + unfiltered_chars = true; + } + wpos += num; + maxcap = wpos > maxcap ? wpos : maxcap; + } + + void copy() noexcept + { + RYML_ASSERT(rpos < src.len); + if(wpos < wcap) // respect write-capacity + { + if((wpos < rpos) && !unfiltered_chars) // write only if wpos is behind rpos + src.str[wpos] = src.str[rpos]; + } + else + { + _c4dbgip("inplace: add unwritten {}->{} (wpos={}!=rpos={})={} (wpos={}{}!", unfiltered_chars, true, wpos, rpos, wpos!=rpos, wpos, wcap, wpos maxcap ? wpos+1u : maxcap)); + unfiltered_chars = true; + } + ++rpos; + ++wpos; + maxcap = wpos > maxcap ? 
wpos : maxcap; + } + void copy(size_t num) noexcept + { + RYML_ASSERT(num); + RYML_ASSERT(rpos+num <= src.len); + if(wpos + num <= wcap) // respect write-capacity + { + if((wpos < rpos) && !unfiltered_chars) // write only if wpos is behind rpos + { + if(wpos + num <= rpos) // there is no overlap + memcpy(src.str + wpos, src.str + rpos, num); + else // there is overlap + memmove(src.str + wpos, src.str + rpos, num); + } + } + else + { + _c4dbgip("inplace: add unwritten {}->{} (wpos={}!=rpos={})={} (wpos={}{}!", unfiltered_chars, true, wpos, rpos, wpos!=rpos, wpos, wcap, wpos maxcap ? wpos : maxcap; + } + + void translate_esc(char c) noexcept + { + RYML_ASSERT(rpos + 2 <= src.len); + if(wpos < wcap) // respect write-capacity + { + if((wpos <= rpos) && !unfiltered_chars) + src.str[wpos] = c; + } + else + { + _c4dbgip("inplace: add unfiltered {}->{} maxcap={}->{}!", unfiltered_chars, true, maxcap, (wpos+1u > maxcap ? wpos+1u : maxcap)); + unfiltered_chars = true; + } + rpos += 2; + ++wpos; + maxcap = wpos > maxcap ? wpos : maxcap; + } + + C4_NO_INLINE void translate_esc_bulk(const char *C4_RESTRICT s, size_t nw, size_t nr) noexcept + { + RYML_ASSERT(nw > 0); + RYML_ASSERT(nr > 0); + RYML_ASSERT(nr+1u >= nw); + const size_t wpos_next = wpos + nw; + const size_t rpos_next = rpos + nr + 1u; // add 1u to account for the escape character + if(wpos_next <= wcap) // respect write-capacity + { + if((wpos <= rpos) && !unfiltered_chars) // write only if wpos is behind rpos + memcpy(src.str + wpos, s, nw); + } + else + { + _c4dbgip("inplace: add unwritten {}->{} (wpos={}!=rpos={})={} (wpos={}{}!", unfiltered_chars, true, wpos, rpos, wpos!=rpos, wpos, wcap, wpos maxcap ? 
wpos : maxcap; + } + + C4_NO_INLINE void translate_esc_extending(const char *C4_RESTRICT s, size_t nw, size_t nr) noexcept + { + RYML_ASSERT(nw > 0); + RYML_ASSERT(nr > 0); + RYML_ASSERT(rpos+nr <= src.len); + const size_t wpos_next = wpos + nw; + const size_t rpos_next = rpos + nr + 1u; // add 1u to account for the escape character + if(wpos_next <= rpos_next) // read and write do not overlap. just do a vanilla copy. + { + if((wpos_next <= wcap) && !unfiltered_chars) + memcpy(src.str + wpos, s, nw); + rpos = rpos_next; + wpos = wpos_next; + maxcap = wpos > maxcap ? wpos : maxcap; + } + else // there is overlap. move the (to-be-read) string to the right. + { + const size_t excess = wpos_next - rpos_next; + RYML_ASSERT(wpos_next > rpos_next); + if(src.len + excess <= wcap) // ensure we do not go past the end + { + RYML_ASSERT(rpos+nr+excess <= src.len); + if(wpos_next <= wcap) + { + if(!unfiltered_chars) + { + memmove(src.str + wpos_next, src.str + rpos_next, src.len - rpos_next); + memcpy(src.str + wpos, s, nw); + } + rpos = wpos_next; // wpos, not rpos + } + else + { + rpos = rpos_next; + //const size_t unw = nw > (nr + 1u) ? nw - (nr + 1u) : 0; + _c4dbgip("inplace: add unfiltered {}->{} maxcap={}->{}!", unfiltered_chars, true); + unfiltered_chars = true; + } + wpos = wpos_next; + // extend the string up to capacity + src.len += excess; + maxcap = wpos > maxcap ? wpos : maxcap; + } + else + { + //const size_t unw = nw > (nr + 1u) ? nw - (nr + 1u) : 0; + RYML_ASSERT(rpos_next <= src.len); + const size_t required_size = wpos_next + (src.len - rpos_next); + _c4dbgip("inplace: add unfiltered {}->{} maxcap={}->{}!", unfiltered_chars, true, maxcap, required_size > maxcap ? required_size : maxcap); + RYML_ASSERT(required_size > wcap); + unfiltered_chars = true; + maxcap = required_size > maxcap ? 
required_size : maxcap; + wpos = wpos_next; + rpos = rpos_next; + } + } + } +}; + +#undef _c4dbgip + + +/** @} */ + +} // namespace yml +} // namespace c4 + +#endif /* _C4_YML_FILTER_PROCESSOR_HPP_ */ diff --git a/src/c4/yml/node_type.hpp b/src/c4/yml/node_type.hpp new file mode 100644 index 000000000..09140a316 --- /dev/null +++ b/src/c4/yml/node_type.hpp @@ -0,0 +1,192 @@ +#ifndef C4_YML_NODE_TYPE_HPP_ +#define C4_YML_NODE_TYPE_HPP_ + +#ifndef _C4_YML_COMMON_HPP_ +#include "c4/yml/common.hpp" +#endif + +C4_SUPPRESS_WARNING_MSVC_PUSH +C4_SUPPRESS_WARNING_GCC_CLANG_PUSH +C4_SUPPRESS_WARNING_GCC_CLANG("-Wold-style-cast") + +namespace c4 { +namespace yml { + +//----------------------------------------------------------------------------- +//----------------------------------------------------------------------------- +//----------------------------------------------------------------------------- + + +/** the integral type necessary to cover all the bits marking node types */ +using type_bits = uint64_t; + + +/** a bit mask for marking node types */ +typedef enum : type_bits { + // a convenience define, undefined below + #define c4bit(v) (type_bits(1) << v) + NOTYPE = 0, ///< no node type is set + VAL = c4bit(0), ///< a leaf node, has a (possibly empty) value + KEY = c4bit(1), ///< is member of a map, must have non-empty key + MAP = c4bit(2), ///< a map: a parent of keyvals + SEQ = c4bit(3), ///< a seq: a parent of vals + DOC = c4bit(4), ///< a document + STREAM = c4bit(5)|SEQ, ///< a stream: a seq of docs + KEYREF = c4bit(6), ///< a *reference: the key references an &anchor + VALREF = c4bit(7), ///< a *reference: the val references an &anchor + KEYANCH = c4bit(8), ///< the key has an &anchor + VALANCH = c4bit(9), ///< the val has an &anchor + KEYTAG = c4bit(10), ///< the key has an explicit tag/type + VALTAG = c4bit(11), ///< the val has an explicit tag/type + _TYMASK = c4bit(12)-1, // all the bits up to here + VALQUO = c4bit(12), ///< the val is quoted by '', "", > 
or | + KEYQUO = c4bit(13), ///< the key is quoted by '', "", > or | + KEYVAL = KEY|VAL, + KEYSEQ = KEY|SEQ, + KEYMAP = KEY|MAP, + DOCMAP = DOC|MAP, + DOCSEQ = DOC|SEQ, + DOCVAL = DOC|VAL, + _KEYMASK = KEY | KEYQUO | KEYANCH | KEYREF | KEYTAG, + _VALMASK = VAL | VALQUO | VALANCH | VALREF | VALTAG, + _WIP_KEY_UNFILT = c4bit(14), ///< the key scalar is yet to be filtered. Eg, when the parser is set not to filter. + _WIP_VAL_UNFILT = c4bit(15), ///< the val scalar is yet to be filtered. Eg, when the parser is set not to filter. + // these flags are from a work in progress and should be used with care + _WIP_STYLE_FLOW_SL = c4bit(16), ///< mark container with single-line flow format (seqs as '[val1,val2], maps as '{key: val, key2: val2}') + _WIP_STYLE_FLOW_ML = c4bit(17), ///< mark container with multi-line flow format (seqs as '[val1,\nval2], maps as '{key: val,\nkey2: val2}') + _WIP_STYLE_BLOCK = c4bit(18), ///< mark container with block format (seqs as '- val\n', maps as 'key: val') + _WIP_KEY_LITERAL = c4bit(19), ///< mark key scalar as multiline, block literal | + _WIP_VAL_LITERAL = c4bit(20), ///< mark val scalar as multiline, block literal | + _WIP_KEY_FOLDED = c4bit(21), ///< mark key scalar as multiline, block folded > + _WIP_VAL_FOLDED = c4bit(22), ///< mark val scalar as multiline, block folded > + _WIP_KEY_SQUO = c4bit(23), ///< mark key scalar as single quoted + _WIP_VAL_SQUO = c4bit(24), ///< mark val scalar as single quoted + _WIP_KEY_DQUO = c4bit(25), ///< mark key scalar as double quoted + _WIP_VAL_DQUO = c4bit(26), ///< mark val scalar as double quoted + _WIP_KEY_PLAIN = c4bit(27), ///< mark key scalar as plain scalar (unquoted, even when multiline) + _WIP_VAL_PLAIN = c4bit(28), ///< mark val scalar as plain scalar (unquoted, even when multiline) + _WIP_KEY_STYLE = _WIP_KEY_LITERAL|_WIP_KEY_FOLDED|_WIP_KEY_SQUO|_WIP_KEY_DQUO|_WIP_KEY_PLAIN, + _WIP_VAL_STYLE = _WIP_VAL_LITERAL|_WIP_VAL_FOLDED|_WIP_VAL_SQUO|_WIP_VAL_DQUO|_WIP_VAL_PLAIN, + _WIP_KEY_FT_NL 
= c4bit(29), ///< features: mark key scalar as having \n in its contents + _WIP_VAL_FT_NL = c4bit(30), ///< features: mark val scalar as having \n in its contents + _WIP_KEY_FT_SQ = c4bit(31), ///< features: mark key scalar as having single quotes in its contents + _WIP_VAL_FT_SQ = c4bit(32), ///< features: mark val scalar as having single quotes in its contents + _WIP_KEY_FT_DQ = c4bit(33), ///< features: mark key scalar as having double quotes in its contents + _WIP_VAL_FT_DQ = c4bit(34), ///< features: mark val scalar as having double quotes in its contents + #undef c4bit +} NodeType_e; + + +//----------------------------------------------------------------------------- +//----------------------------------------------------------------------------- +//----------------------------------------------------------------------------- + +/** wraps a NodeType_e element with some syntactic sugar and predicates */ +struct NodeType +{ +public: + + NodeType_e type; + +public: + + C4_ALWAYS_INLINE NodeType() noexcept : type(NOTYPE) {} + C4_ALWAYS_INLINE NodeType(NodeType_e t) noexcept : type(t) {} + C4_ALWAYS_INLINE NodeType(type_bits t) noexcept : type((NodeType_e)t) {} + + C4_ALWAYS_INLINE const char *type_str() const noexcept { return type_str(type); } + static const char* type_str(NodeType_e t) noexcept; + + C4_ALWAYS_INLINE void set(NodeType_e t) noexcept { type = t; } + C4_ALWAYS_INLINE void set(type_bits t) noexcept { type = (NodeType_e)t; } + + C4_ALWAYS_INLINE void add(NodeType_e t) noexcept { type = (NodeType_e)(type|t); } + C4_ALWAYS_INLINE void add(type_bits t) noexcept { type = (NodeType_e)(type|t); } + + C4_ALWAYS_INLINE void rem(NodeType_e t) noexcept { type = (NodeType_e)(type & ~t); } + C4_ALWAYS_INLINE void rem(type_bits t) noexcept { type = (NodeType_e)(type & ~t); } + + C4_ALWAYS_INLINE void clear() noexcept { type = NOTYPE; } + +public: + + C4_ALWAYS_INLINE operator NodeType_e & C4_RESTRICT () noexcept { return type; } + C4_ALWAYS_INLINE operator 
NodeType_e const& C4_RESTRICT () const noexcept { return type; } + + C4_ALWAYS_INLINE bool operator== (NodeType_e t) const noexcept { return type == t; } + C4_ALWAYS_INLINE bool operator!= (NodeType_e t) const noexcept { return type != t; } + +public: + + #if defined(__clang__) + # pragma clang diagnostic push + # pragma clang diagnostic ignored "-Wnull-dereference" + #elif defined(__GNUC__) + # pragma GCC diagnostic push + # if __GNUC__ >= 6 + # pragma GCC diagnostic ignored "-Wnull-dereference" + # endif + #endif + + C4_ALWAYS_INLINE bool is_notype() const noexcept { return type == NOTYPE; } + C4_ALWAYS_INLINE bool is_stream() const noexcept { return ((type & STREAM) == STREAM) != 0; } + C4_ALWAYS_INLINE bool is_doc() const noexcept { return (type & DOC) != 0; } + C4_ALWAYS_INLINE bool is_container() const noexcept { return (type & (MAP|SEQ|STREAM)) != 0; } + C4_ALWAYS_INLINE bool is_map() const noexcept { return (type & MAP) != 0; } + C4_ALWAYS_INLINE bool is_seq() const noexcept { return (type & SEQ) != 0; } + C4_ALWAYS_INLINE bool has_key() const noexcept { return (type & KEY) != 0; } + C4_ALWAYS_INLINE bool has_val() const noexcept { return (type & VAL) != 0; } + C4_ALWAYS_INLINE bool is_val() const noexcept { return (type & KEYVAL) == VAL; } + C4_ALWAYS_INLINE bool is_keyval() const noexcept { return (type & KEYVAL) == KEYVAL; } + C4_ALWAYS_INLINE bool has_key_tag() const noexcept { return (type & (KEY|KEYTAG)) == (KEY|KEYTAG); } + C4_ALWAYS_INLINE bool has_val_tag() const noexcept { return ((type & VALTAG) && (type & (VAL|MAP|SEQ))); } + C4_ALWAYS_INLINE bool has_key_anchor() const noexcept { return (type & (KEY|KEYANCH)) == (KEY|KEYANCH); } + C4_ALWAYS_INLINE bool is_key_anchor() const noexcept { return (type & (KEY|KEYANCH)) == (KEY|KEYANCH); } + C4_ALWAYS_INLINE bool has_val_anchor() const noexcept { return (type & VALANCH) != 0 && (type & (VAL|SEQ|MAP)) != 0; } + C4_ALWAYS_INLINE bool is_val_anchor() const noexcept { return (type & VALANCH) != 0 && 
(type & (VAL|SEQ|MAP)) != 0; } + C4_ALWAYS_INLINE bool has_anchor() const noexcept { return (type & (KEYANCH|VALANCH)) != 0; } + C4_ALWAYS_INLINE bool is_anchor() const noexcept { return (type & (KEYANCH|VALANCH)) != 0; } + C4_ALWAYS_INLINE bool is_key_ref() const noexcept { return (type & KEYREF) != 0; } + C4_ALWAYS_INLINE bool is_val_ref() const noexcept { return (type & VALREF) != 0; } + C4_ALWAYS_INLINE bool is_ref() const noexcept { return (type & (KEYREF|VALREF)) != 0; } + C4_ALWAYS_INLINE bool is_anchor_or_ref() const noexcept { return (type & (KEYANCH|VALANCH|KEYREF|VALREF)) != 0; } + C4_ALWAYS_INLINE bool is_key_quoted() const noexcept { return (type & (KEY|KEYQUO)) == (KEY|KEYQUO); } + C4_ALWAYS_INLINE bool is_val_quoted() const noexcept { return (type & (VAL|VALQUO)) == (VAL|VALQUO); } + C4_ALWAYS_INLINE bool is_quoted() const noexcept { return (type & (KEY|KEYQUO)) == (KEY|KEYQUO) || (type & (VAL|VALQUO)) == (VAL|VALQUO); } + + // these predicates are a work in progress and subject to change. Don't use yet. 
+ C4_ALWAYS_INLINE bool default_block() const noexcept { return (type & (_WIP_STYLE_BLOCK|_WIP_STYLE_FLOW_ML|_WIP_STYLE_FLOW_SL)) == 0; } + C4_ALWAYS_INLINE bool marked_block() const noexcept { return (type & (_WIP_STYLE_BLOCK)) != 0; } + C4_ALWAYS_INLINE bool marked_flow_sl() const noexcept { return (type & (_WIP_STYLE_FLOW_SL)) != 0; } + C4_ALWAYS_INLINE bool marked_flow_ml() const noexcept { return (type & (_WIP_STYLE_FLOW_ML)) != 0; } + C4_ALWAYS_INLINE bool marked_flow() const noexcept { return (type & (_WIP_STYLE_FLOW_ML|_WIP_STYLE_FLOW_SL)) != 0; } + C4_ALWAYS_INLINE bool key_marked_literal() const noexcept { return (type & (_WIP_KEY_LITERAL)) != 0; } + C4_ALWAYS_INLINE bool val_marked_literal() const noexcept { return (type & (_WIP_VAL_LITERAL)) != 0; } + C4_ALWAYS_INLINE bool key_marked_folded() const noexcept { return (type & (_WIP_KEY_FOLDED)) != 0; } + C4_ALWAYS_INLINE bool val_marked_folded() const noexcept { return (type & (_WIP_VAL_FOLDED)) != 0; } + C4_ALWAYS_INLINE bool key_marked_squo() const noexcept { return (type & (_WIP_KEY_SQUO)) != 0; } + C4_ALWAYS_INLINE bool val_marked_squo() const noexcept { return (type & (_WIP_VAL_SQUO)) != 0; } + C4_ALWAYS_INLINE bool key_marked_dquo() const noexcept { return (type & (_WIP_KEY_DQUO)) != 0; } + C4_ALWAYS_INLINE bool val_marked_dquo() const noexcept { return (type & (_WIP_VAL_DQUO)) != 0; } + C4_ALWAYS_INLINE bool key_marked_plain() const noexcept { return (type & (_WIP_KEY_PLAIN)) != 0; } + C4_ALWAYS_INLINE bool val_marked_plain() const noexcept { return (type & (_WIP_VAL_PLAIN)) != 0; } + + C4_ALWAYS_INLINE bool _wip_key_unfiltered() const noexcept { return (type & (_WIP_KEY_UNFILT)) != 0; } + C4_ALWAYS_INLINE bool _wip_val_unfiltered() const noexcept { return (type & (_WIP_VAL_UNFILT)) != 0; } + + #if defined(__clang__) + # pragma clang diagnostic pop + #elif defined(__GNUC__) + # pragma GCC diagnostic pop + #endif + +}; + + +} // namespace yml +} // namespace c4 + +C4_SUPPRESS_WARNING_MSVC_POP 
+C4_SUPPRESS_WARNING_GCC_CLANG_POP + +#endif /* C4_YML_NODE_TYPE_HPP_ */ diff --git a/src/c4/yml/parse.cpp b/src/c4/yml/parse.cpp index a0f0dadee..790e49288 100644 --- a/src/c4/yml/parse.cpp +++ b/src/c4/yml/parse.cpp @@ -1,5 +1,6 @@ #include "c4/yml/parse.hpp" #include "c4/error.hpp" +#include "c4/charconv.hpp" #include "c4/utf.hpp" #include @@ -8,14 +9,11 @@ #include #include "c4/yml/detail/parser_dbg.hpp" +#include "c4/yml/filter_processor.hpp" #ifdef RYML_DBG #include "c4/yml/detail/print.hpp" #endif -#ifndef RYML_ERRMSG_SIZE - #define RYML_ERRMSG_SIZE 1024 -#endif - //#define RYML_WITH_TAB_TOKENS #ifdef RYML_WITH_TAB_TOKENS #define _RYML_WITH_TAB_TOKENS(...) __VA_ARGS__ @@ -49,22 +47,6 @@ namespace yml { namespace { -template -void _parse_dump(DumpFn dumpfn, c4::csubstr fmt, Args&& ...args) -{ - char writebuf[256]; - auto results = c4::format_dump_resume(dumpfn, writebuf, fmt, std::forward(args)...); - // resume writing if the results failed to fit the buffer - if(C4_UNLIKELY(results.bufsize > sizeof(writebuf))) // bufsize will be that of the largest element serialized. Eg int(1), will require 1 byte. - { - results = format_dump_resume(dumpfn, results, writebuf, fmt, std::forward(args)...); - if(C4_UNLIKELY(results.bufsize > sizeof(writebuf))) - { - results = format_dump_resume(dumpfn, results, writebuf, fmt, std::forward(args)...); - } - } -} - bool _is_scalar_next__runk(csubstr s) { return !(s.begins_with(": ") || s.begins_with_any("#,{}[]%&") || s.begins_with("? 
") || s == "-" || s.begins_with("- ") || s.begins_with(":\"") || s.begins_with(":'")); @@ -97,9 +79,7 @@ bool _is_doc_sep(csubstr s) return false; } -/** @p i is set to the first non whitespace character after the line - * @return the number of empty lines after the initial position */ -size_t count_following_newlines(csubstr r, size_t *C4_RESTRICT i, size_t indentation) +inline size_t _count_following_newlines(csubstr r, size_t *C4_RESTRICT i) { RYML_ASSERT(r[*i] == '\n'); size_t numnl_following = 0; @@ -107,10 +87,47 @@ size_t count_following_newlines(csubstr r, size_t *C4_RESTRICT i, size_t indenta for( ; *i < r.len; ++(*i)) { if(r.str[*i] == '\n') - { ++numnl_following; - if(indentation) // skip the indentation after the newline + // skip leading whitespace + else if(r.str[*i] == ' ' || r.str[*i] == '\t' || r.str[*i] == '\r') + ; + else + break; + } + return numnl_following; +} + + +//----------------------------------------------------------------------------- + +/** @p i is set to the first non whitespace character after the line + * @return the number of empty lines after the initial position */ +inline size_t _count_following_newlines(csubstr r, size_t *C4_RESTRICT i, size_t indentation) +{ + RYML_ASSERT(r[*i] == '\n'); + size_t numnl_following = 0; + ++(*i); + if(indentation == 0) + { + for( ; *i < r.len; ++(*i)) + { + if(r.str[*i] == '\n') + ++numnl_following; + // skip leading whitespace + else if(r.str[*i] == ' ' || r.str[*i] == '\t' || r.str[*i] == '\r') + ; + else + break; + } + } + else + { + for( ; *i < r.len; ++(*i)) + { + if(r.str[*i] == '\n') { + ++numnl_following; + // skip the indentation after the newline size_t stop = *i + indentation; for( ; *i < r.len; ++(*i)) { @@ -120,11 +137,12 @@ size_t count_following_newlines(csubstr r, size_t *C4_RESTRICT i, size_t indenta } C4_UNUSED(stop); } + // skip leading whitespace + else if(r.str[*i] == ' ' || r.str[*i] == '\t' || r.str[*i] == '\r') + ; + else + break; } - else if(r.str[*i] == ' ' || 
r.str[*i] == '\t' || r.str[*i] == '\r') // skip leading whitespace - ; - else - break; } return numnl_following; } @@ -132,6 +150,8 @@ size_t count_following_newlines(csubstr r, size_t *C4_RESTRICT i, size_t indenta } // anon namespace +//----------------------------------------------------------------------------- +//----------------------------------------------------------------------------- //----------------------------------------------------------------------------- Parser::~Parser() @@ -159,7 +179,6 @@ Parser::Parser(Callbacks const& cb, ParserOptions opts) , m_key_anchor() , m_val_anchor_indentation(0) , m_val_anchor() - , m_filter_arena() , m_newline_offsets() , m_newline_offsets_size(0) , m_newline_offsets_capacity(0) @@ -188,7 +207,6 @@ Parser::Parser(Parser &&that) , m_key_anchor(that.m_key_anchor) , m_val_anchor_indentation(that.m_val_anchor_indentation) , m_val_anchor(that.m_val_anchor) - , m_filter_arena(that.m_filter_arena) , m_newline_offsets(that.m_newline_offsets) , m_newline_offsets_size(that.m_newline_offsets_size) , m_newline_offsets_capacity(that.m_newline_offsets_capacity) @@ -216,7 +234,6 @@ Parser::Parser(Parser const& that) , m_key_anchor(that.m_key_anchor) , m_val_anchor_indentation(that.m_val_anchor_indentation) , m_val_anchor(that.m_val_anchor) - , m_filter_arena() , m_newline_offsets() , m_newline_offsets_size() , m_newline_offsets_capacity() @@ -229,10 +246,6 @@ Parser::Parser(Parser const& that) memcpy(m_newline_offsets, that.m_newline_offsets, that.m_newline_offsets_size * sizeof(size_t)); m_newline_offsets_size = that.m_newline_offsets_size; } - if(that.m_filter_arena.len) - { - _resize_filter_arena(that.m_filter_arena.len); - } } Parser& Parser::operator=(Parser &&that) @@ -256,7 +269,6 @@ Parser& Parser::operator=(Parser &&that) m_key_anchor = (that.m_key_anchor); m_val_anchor_indentation = (that.m_val_anchor_indentation); m_val_anchor = (that.m_val_anchor); - m_filter_arena = that.m_filter_arena; m_newline_offsets = 
(that.m_newline_offsets); m_newline_offsets_size = (that.m_newline_offsets_size); m_newline_offsets_capacity = (that.m_newline_offsets_capacity); @@ -286,8 +298,6 @@ Parser& Parser::operator=(Parser const& that) m_key_anchor = (that.m_key_anchor); m_val_anchor_indentation = (that.m_val_anchor_indentation); m_val_anchor = (that.m_val_anchor); - if(that.m_filter_arena.len > 0) - _resize_filter_arena(that.m_filter_arena.len); if(that.m_newline_offsets_capacity > m_newline_offsets_capacity) _resize_locations(that.m_newline_offsets_capacity); _RYML_CB_CHECK(m_stack.m_callbacks, m_newline_offsets_capacity >= that.m_newline_offsets_capacity); @@ -318,7 +328,6 @@ void Parser::_clr() m_key_anchor = {}; m_val_anchor_indentation = {}; m_val_anchor = {}; - m_filter_arena = {}; m_newline_offsets = {}; m_newline_offsets_size = {}; m_newline_offsets_capacity = {}; @@ -335,11 +344,6 @@ void Parser::_free() m_newline_offsets_capacity = 0u; m_newline_offsets_buf = 0u; } - if(m_filter_arena.len) - { - _RYML_CB_FREE(m_stack.m_callbacks, m_filter_arena.str, char, m_filter_arena.len); - m_filter_arena = {}; - } m_stack._free(); } @@ -416,6 +420,7 @@ void Parser::_fmt_msg(DumpFn &&dumpfn) const //----------------------------------------------------------------------------- + template void Parser::_err(csubstr fmt, Args const& C4_RESTRICT ...args) const { @@ -429,12 +434,13 @@ void Parser::_err(csubstr fmt, Args const& C4_RESTRICT ...args) const m_tree->m_callbacks.m_error(errmsg, len, m_state->pos, m_tree->m_callbacks.m_user_data); } + //----------------------------------------------------------------------------- #ifdef RYML_DBG template void Parser::_dbg(csubstr fmt, Args const& C4_RESTRICT ...args) const { - auto dumpfn = [](csubstr s){ fwrite(s.str, 1, s.len, stdout); }; + auto dumpfn = [](csubstr s){ if(s.str) fwrite(s.str, 1, s.len, stdout); }; _parse_dump(dumpfn, fmt, args...); dumpfn("\n"); _fmt_msg(dumpfn); @@ -2847,7 +2853,7 @@ csubstr Parser::_extend_scanned_scalar(csubstr s) 
{ substr full = _scan_complex_key(s, n).trimr(" \t\r\n"); if(full != s) - s = _filter_plain_scalar(full, scalar_indentation); + s = _filter_scalar_plain(full, scalar_indentation); } } // deal with plain (unquoted) scalars that continue to the next line @@ -2866,7 +2872,7 @@ csubstr Parser::_extend_scanned_scalar(csubstr s) _RYML_CB_ASSERT(m_stack.m_callbacks, m_state->line_contents.full.is_super(n)); substr full = _scan_plain_scalar_blck(s, n, scalar_indentation); if(full.len >= s.len) - s = _filter_plain_scalar(full, scalar_indentation); + s = _filter_scalar_plain(full, scalar_indentation); } } else @@ -2877,7 +2883,7 @@ csubstr Parser::_extend_scanned_scalar(csubstr s) { _c4dbgp("rscalar[FLOW]"); substr full = _scan_plain_scalar_flow(s, n); - s = _filter_plain_scalar(full, /*indentation*/0); + s = _filter_scalar_plain(full, /*indentation*/0); } } } @@ -4192,16 +4198,13 @@ csubstr Parser::_scan_squot_scalar() s = s.sub(0, pos-1); } + _c4dbgpf("scanned scalar: \"{}\"", s); + if(needs_filter) { - csubstr ret = _filter_squot_scalar(s); - _RYML_CB_ASSERT(m_stack.m_callbacks, ret.len <= s.len || s.empty() || s.trim(' ').empty()); - _c4dbgpf("final scalar: \"{}\"", ret); - return ret; + return _filter_scalar_squot(s); } - _c4dbgpf("final scalar: \"{}\"", s); - return s; } @@ -4293,19 +4296,17 @@ csubstr Parser::_scan_dquot_scalar() s = s.sub(0, pos-1); } + _c4dbgpf("scanned scalar: \"{}\"", s); + if(needs_filter) { - csubstr ret = _filter_dquot_scalar(s); - _c4dbgpf("final scalar: [{}]\"{}\"", ret.len, ret); - _RYML_CB_ASSERT(m_stack.m_callbacks, ret.len <= s.len || s.empty() || s.trim(' ').empty()); - return ret; + return _filter_scalar_dquot(s); } - _c4dbgpf("final scalar: \"{}\"", s); - return s; } + //----------------------------------------------------------------------------- csubstr Parser::_scan_block() { @@ -4324,7 +4325,7 @@ csubstr Parser::_scan_block() _c4dbgpf("scanning block: specs=\"{}\"", s); // parse the spec - BlockStyle_e newline = s.begins_with('>') 
? BLOCK_FOLD : BLOCK_LITERAL; + BlockStyle_e block_style = s.begins_with('>') ? BLOCK_FOLD : BLOCK_LITERAL; BlockChomp_e chomp = CHOMP_CLIP; // default to clip unless + or - are used size_t indentation = npos; // have to find out if no spec is given csubstr digits; @@ -4363,7 +4364,7 @@ csubstr Parser::_scan_block() _line_ended(); _scan_line(); - _c4dbgpf("scanning block: style={} chomp={} indentation={}", newline==BLOCK_FOLD ? "fold" : "literal", chomp==CHOMP_CLIP ? "clip" : (chomp==CHOMP_STRIP ? "strip" : "keep"), indentation); + _c4dbgpf("scanning block: style={} chomp={} indentation={}", block_style==BLOCK_FOLD ? "fold" : "literal", chomp==CHOMP_CLIP ? "clip" : (chomp==CHOMP_STRIP ? "strip" : "keep"), indentation); // start with a zero-length block, already pointing at the right place substr raw_block(m_buf.data() + m_state->pos.offset, size_t(0));// m_state->line_contents.full.sub(0, 0); @@ -4507,7 +4508,15 @@ csubstr Parser::_scan_block() _c4dbgpf("scanning block: raw=~~~{}~~~", raw_block); // ok! now we strip the newlines and spaces according to the specs - s = _filter_block_scalar(raw_block, newline, chomp, indentation); + switch(block_style) + { + case BLOCK_FOLD: + s = _filter_scalar_block_folded(raw_block, chomp, indentation); + break; + case BLOCK_LITERAL: + s = _filter_scalar_block_literal(raw_block, chomp, indentation); + break; + } _c4dbgpf("scanning block: final=~~~{}~~~", s); @@ -4515,825 +4524,1221 @@ csubstr Parser::_scan_block() } +//----------------------------------------------------------------------------- +//----------------------------------------------------------------------------- //----------------------------------------------------------------------------- -template -bool Parser::_filter_nl(substr r, size_t *C4_RESTRICT i, size_t *C4_RESTRICT pos, size_t indentation) -{ - // a debugging scaffold: - #if 0 - #define _c4dbgfnl(fmt, ...) _c4dbgpf("filter_nl[{}]: " fmt, *i, __VA_ARGS__) - #else - #define _c4dbgfnl(...) 
- #endif - - const char curr = r[*i]; - bool replaced = false; +// a debugging scaffold: +#if 0 +#define _c4dbgfws(fmt, ...) _c4dbgpf("filt_ws[{}->{}]: " fmt, proc.rpos, proc.wpos, __VA_ARGS__) +#else +#define _c4dbgfws(...) +#endif - _RYML_CB_ASSERT(m_stack.m_callbacks, indentation != npos); - _RYML_CB_ASSERT(m_stack.m_callbacks, curr == '\n'); +template +bool Parser::_filter_ws_handle_to_first_non_space(FilterProcessor &proc) noexcept +{ + _c4dbgfws("found whitespace '{}'", _c4prc(proc.curr())); + _RYML_CB_ASSERT(this->callbacks(), proc.curr() == ' ' || proc.curr() == '\t'); - _c4dbgfnl("found newline. sofar=[{}]~~~{}~~~", *pos, m_filter_arena.first(*pos)); - size_t ii = *i; - size_t numnl_following = count_following_newlines(r, &ii, indentation); - if(numnl_following) - { - _c4dbgfnl("{} consecutive (empty) lines {} in the middle. totalws={}", 1+numnl_following, ii < r.len ? "in the middle" : "at the end", ii - *i); - for(size_t j = 0; j < numnl_following; ++j) - m_filter_arena.str[(*pos)++] = '\n'; - } - else + const size_t first_pos = proc.rpos > 0 ? proc.src.first_not_of(" \t", proc.rpos) : proc.src.first_not_of(' ', proc.rpos); + if(first_pos != npos) { - if(r.first_not_of(" \t", *i+1) != npos) + const char first_char = proc.src[first_pos]; + _c4dbgfws("firstnonws='{}'@{}", _c4prc(first_char), first_pos); + if(first_char == '\n' || first_char == '\r') // skip trailing whitespace { - m_filter_arena.str[(*pos)++] = ' '; - _c4dbgfnl("single newline. convert to space. ii={}/{}. sofar=[{}]~~~{}~~~", ii, r.len, *pos, m_filter_arena.first(*pos)); - replaced = true; - } - else - { - if C4_IF_CONSTEXPR (keep_trailing_whitespace) - { - m_filter_arena.str[(*pos)++] = ' '; - _c4dbgfnl("single newline. convert to space. ii={}/{}. sofar=[{}]~~~{}~~~", ii, r.len, *pos, m_filter_arena.first(*pos)); - replaced = true; - } - else - { - _c4dbgfnl("last newline, everything else is whitespace. 
ii={}/{}", ii, r.len); - *i = r.len; - } + _c4dbgfws("whitespace is trailing on line", ""); + proc.skip(first_pos - proc.rpos); } - if C4_IF_CONSTEXPR (backslash_is_escape) + else // a legit whitespace { - if(ii < r.len && r.str[ii] == '\\') - { - const char next = ii+1 < r.len ? r.str[ii+1] : '\0'; - if(next == ' ' || next == '\t') - { - _c4dbgfnl("extend skip to backslash{}", ""); - ++ii; - } - } + proc.copy(); + _c4dbgfws("legit whitespace. sofar=[{}]~~~{}~~~", proc.wpos, proc.sofar()); } + return true; + } + else + { + return false; } - *i = ii - 1; // correct for the loop increment +} - #undef _c4dbgfnl +template +void Parser::_filter_ws_copy_trailing(FilterProcessor &proc) noexcept +{ + if(!_filter_ws_handle_to_first_non_space(proc)) + { + _c4dbgfws("... everything else is trailing whitespace - copy {} chars", proc.src.len - proc.rpos); + proc.copy(proc.src.len - proc.rpos); + } +} - return replaced; +template +void Parser::_filter_ws_skip_trailing(FilterProcessor &proc) noexcept +{ + _RYML_CB_ASSERT(this->callbacks(), _filter_ws_handle_to_first_non_space(proc)); } +#undef _c4dbgfws + +//----------------------------------------------------------------------------- //----------------------------------------------------------------------------- +//----------------------------------------------------------------------------- +/* plain scalars */ -template -void Parser::_filter_ws(substr r, size_t *C4_RESTRICT i, size_t *C4_RESTRICT pos) -{ - // a debugging scaffold: - #if 0 - #define _c4dbgfws(fmt, ...) _c4dbgpf("filt_nl[{}]: " fmt, *i, __VA_ARGS__) - #else - #define _c4dbgfws(...) - #endif +// a debugging scaffold: +#if 0 +#define _c4dbgfps(fmt, ...) _c4dbgpf("filt_plain[{}->{}]: " fmt, proc.rpos, proc.wpos, __VA_ARGS__) +#else +#define _c4dbgfps(fmt, ...) 
+#endif - const char curr = r[*i]; - _c4dbgfws("found whitespace '{}'", _c4prc(curr)); - _RYML_CB_ASSERT(m_stack.m_callbacks, curr == ' ' || curr == '\t'); +template +void Parser::_filter_nl_plain(FilterProcessor &C4_RESTRICT proc, size_t indentation) noexcept +{ + _RYML_CB_ASSERT(this->callbacks(), proc.curr() == '\n'); - size_t first = *i > 0 ? r.first_not_of(" \t", *i) : r.first_not_of(' ', *i); - if(first != npos) + _c4dbgfps("found newline. sofar=[{}]~~~{}~~~", proc.wpos, proc.sofar()); + size_t ii = proc.rpos; + const size_t numnl_following = _count_following_newlines(proc.src, &ii, indentation); + if(numnl_following) { - if(r[first] == '\n' || r[first] == '\r') // skip trailing whitespace + proc.set('\n', numnl_following); + _c4dbgfps("{} consecutive (empty) lines {}. totalws={}", 1+numnl_following, ii < proc.src.len ? "in the middle" : "at the end", proc.rpos-ii); + } + else + { + const size_t ret = proc.src.first_not_of(" \t", proc.rpos+1); + if(ret != npos) { - _c4dbgfws("whitespace is trailing on line. firstnonws='{}'@{}", _c4prc(r[first]), first); - *i = first - 1; // correct for the loop increment + proc.set(' '); + _c4dbgfps("single newline. convert to space. ret={}/{}. sofar=[{}]~~~{}~~~", ii, proc.src.len, proc.wpos, proc.sofar()); } - else // a legit whitespace + else { - m_filter_arena.str[(*pos)++] = curr; - _c4dbgfws("legit whitespace. sofar=[{}]~~~{}~~~", *pos, m_filter_arena.first(*pos)); + _c4dbgfps("last newline, everything else is whitespace. ii={}/{}", ii, proc.src.len); + ii = proc.src.len; } } - else + proc.rpos = ii; +} + +template +auto Parser::_filter_plain(FilterProcessor &C4_RESTRICT proc, size_t indentation) noexcept -> decltype(proc.result()) +{ + _RYML_CB_ASSERT(this->callbacks(), indentation != npos); + _c4dbgfps("before=[{}]~~~{}~~~", proc.src.len, proc.src); + + while(proc.has_more_chars()) { - _c4dbgfws("... 
everything else is trailing whitespace{}", ""); - if C4_IF_CONSTEXPR (keep_trailing_whitespace) - for(size_t j = *i; j < r.len; ++j) - m_filter_arena.str[(*pos)++] = r[j]; - *i = r.len; + const char curr = proc.curr(); + _c4dbgfps("'{}', sofar=[{}]~~~{}~~~", _c4prc(curr), proc.wpos, proc.sofar()); + switch(curr) + { + case ' ': + case '\t': + _c4dbgfps("whitespace", curr); + _filter_ws_skip_trailing(proc); + break; + case '\n': + _c4dbgfps("newline", curr); + _filter_nl_plain(proc, /*indentation*/indentation); + break; + case '\r': // skip \r --- https://stackoverflow.com/questions/1885900 + _c4dbgfps("carriage return, ignore", curr); + proc.skip(); + break; + default: + proc.copy(); + break; + } } - #undef _c4dbgfws + _c4dbgfps("after[{}]=~~~{}~~~", proc.wpos, proc.sofar()); + + return proc.result(); } +#undef _c4dbgfps -//----------------------------------------------------------------------------- -csubstr Parser::_filter_plain_scalar(substr s, size_t indentation) + +FilterResult Parser::filter_scalar_plain(csubstr scalar, substr dst, size_t indentation) noexcept { - // a debugging scaffold: - #if 0 - #define _c4dbgfps(...) _c4dbgpf("filt_plain_scalar" __VA_ARGS__) - #else - #define _c4dbgfps(...) - #endif + FilterProcessorSrcDst proc(scalar, dst); + return _filter_plain(proc, indentation); +} + +FilterResult Parser::filter_scalar_plain_in_place(substr dst, size_t cap, size_t indentation) noexcept +{ + FilterProcessorInplaceEndExtending proc(dst, cap); + return _filter_plain(proc, indentation); +} + + +//----------------------------------------------------------------------------- +//----------------------------------------------------------------------------- +//----------------------------------------------------------------------------- +/* single quoted */ - _c4dbgfps("before=~~~{}~~~", s); +// a debugging scaffold: +#if 0 +#define _c4dbgfsq(fmt, ...) _c4dbgpf("filt_squo[{}->{}]: " fmt, proc.rpos, proc.wpos, __VA_ARGS__) +#else +#define _c4dbgfsq(fmt, ...) 
+#endif + +template +void Parser::_filter_nl_squoted(FilterProcessor &C4_RESTRICT proc) noexcept +{ + _RYML_CB_ASSERT(this->callbacks(), proc.curr() == '\n'); - substr r = s.triml(" \t"); - _grow_filter_arena(r.len); - size_t pos = 0; // the filtered size - bool filtered_chars = false; - for(size_t i = 0; i < r.len; ++i) + _c4dbgfsq("found newline. sofar=[{}]~~~{}~~~", proc.wpos, proc.sofar()); + size_t ii = proc.rpos; + const size_t numnl_following = _count_following_newlines(proc.src, &ii); + if(numnl_following) { - const char curr = r.str[i]; - _c4dbgfps("[{}]: '{}'", i, _c4prc(curr)); - if(curr == ' ' || curr == '\t') - { - _filter_ws(r, &i, &pos); - } - else if(curr == '\n') - { - filtered_chars = _filter_nl(r, &i, &pos, indentation); - } - else if(curr == '\r') // skip \r --- https://stackoverflow.com/questions/1885900 + proc.set('\n', numnl_following); + _c4dbgfsq("{} consecutive (empty) lines {}. totalws={}", 1+numnl_following, ii < proc.src.len ? "in the middle" : "at the end", proc.rpos-ii); + } + else + { + const size_t ret = proc.src.first_not_of(" \t", proc.rpos+1); + if(ret != npos) { - ; + proc.set(' '); + _c4dbgfsq("single newline. convert to space. ret={}/{}. sofar=[{}]~~~{}~~~", ii, proc.src.len, proc.wpos, proc.sofar()); } else { - m_filter_arena.str[pos++] = r[i]; + proc.set(' '); + _c4dbgfsq("single newline. convert to space. ii={}/{}. 
sofar=[{}]~~~{}~~~", ii, proc.src.len, proc.wpos, proc.sofar()); } } - - _RYML_CB_ASSERT(m_stack.m_callbacks, pos <= m_filter_arena.len); - if(pos < r.len || filtered_chars) - { - r = _finish_filter_arena(r, pos); - } - - _RYML_CB_ASSERT(m_stack.m_callbacks, s.len >= r.len); - _c4dbgfps("#filteredchars={} after=~~~{}~~~", s.len - r.len, r); - - #undef _c4dbgfps - return r; + proc.rpos = ii; } - -//----------------------------------------------------------------------------- -csubstr Parser::_filter_squot_scalar(substr s) +template +auto Parser::_filter_squoted(FilterProcessor &C4_RESTRICT proc) noexcept -> decltype(proc.result()) { - // a debugging scaffold: - #if 0 - #define _c4dbgfsq(...) _c4dbgpf("filt_squo_scalar") - #else - #define _c4dbgfsq(...) - #endif - // from the YAML spec for double-quoted scalars: // https://yaml.org/spec/1.2-old/spec.html#style/flow/single-quoted - _c4dbgfsq(": before=~~~{}~~~", s); + _c4dbgfsq("before=[{}]~~~{}~~~", proc.src.len, proc.src); - _grow_filter_arena(s.len); - substr r = s; - size_t pos = 0; // the filtered size - bool filtered_chars = false; - for(size_t i = 0; i < r.len; ++i) + while(proc.has_more_chars()) { - const char curr = r[i]; - _c4dbgfsq("[{}]: '{}'", i, _c4prc(curr)); - if(curr == ' ' || curr == '\t') - { - _filter_ws(r, &i, &pos); - } - else if(curr == '\n') - { - filtered_chars = _filter_nl(r, &i, &pos, /*indentation*/0); - } - else if(curr == '\r') // skip \r --- https://stackoverflow.com/questions/1885900 - { - ; - } - else if(curr == '\'') + const char curr = proc.curr(); + _c4dbgfsq("'{}', sofar=[{}]~~~{}~~~", _c4prc(curr), proc.wpos, proc.sofar()); + switch(curr) { - char next = i+1 < r.len ? 
r[i+1] : '\0'; - if(next == '\'') + case ' ': + case '\t': + _c4dbgfsq("whitespace", curr); + _filter_ws_copy_trailing(proc); + break; + case '\n': + _c4dbgfsq("newline", curr); + _filter_nl_squoted(proc); + break; + case '\r': // skip \r --- https://stackoverflow.com/questions/1885900 + _c4dbgfsq("skip cr", curr); + proc.skip(); + break; + case '\'': + _c4dbgfsq("squote", curr); + if(proc.next() == '\'') { - _c4dbgfsq("[{}]: two consecutive quotes", i); - filtered_chars = true; - m_filter_arena.str[pos++] = '\''; - ++i; + _c4dbgfsq("two consecutive squotes", curr); + proc.skip(); + proc.copy(); } - } - else - { - m_filter_arena.str[pos++] = curr; + break; + default: + proc.copy(); + break; } } - _RYML_CB_ASSERT(m_stack.m_callbacks, pos <= m_filter_arena.len); - if(pos < r.len || filtered_chars) - { - r = _finish_filter_arena(r, pos); - } + _c4dbgfsq(": #filteredchars={} after=~~~[{}]{}~~~", proc.src.len-proc.sofar().len, proc.sofar().len, proc.sofar()); + + return proc.result(); +} - _RYML_CB_ASSERT(m_stack.m_callbacks, s.len >= r.len); - _c4dbgpf(": #filteredchars={} after=~~~{}~~~", s.len - r.len, r); +#undef _c4dbgfsq - #undef _c4dbgfsq - return r; +FilterResult Parser::filter_scalar_squoted(csubstr scalar, substr dst) noexcept +{ + FilterProcessorSrcDst proc(scalar, dst); + return _filter_squoted(proc); +} + +FilterResult Parser::filter_scalar_squoted_in_place(substr dst, size_t cap) noexcept +{ + FilterProcessorInplaceEndExtending proc(dst, cap); + return _filter_squoted(proc); } //----------------------------------------------------------------------------- -csubstr Parser::_filter_dquot_scalar(substr s) -{ - // a debugging scaffold: - #if 0 - #define _c4dbgfdq(...) _c4dbgpf("filt_dquo_scalar" __VA_ARGS__) - #else - #define _c4dbgfdq(...) 
- #endif +//----------------------------------------------------------------------------- +//----------------------------------------------------------------------------- +/* double quoted */ - _c4dbgfdq(": before=~~~{}~~~", s); +// a debugging scaffold: +#if 0 +#define _c4dbgfdq(fmt, ...) _c4dbgpf("filt_dquo[{}->{}]: " fmt, proc.rpos, proc.wpos, __VA_ARGS__) +#else +#define _c4dbgfdq(...) +#endif - // from the YAML spec for double-quoted scalars: - // https://yaml.org/spec/1.2-old/spec.html#style/flow/double-quoted - // - // All leading and trailing white space characters are excluded - // from the content. Each continuation line must therefore contain - // at least one non-space character. Empty lines, if any, are - // consumed as part of the line folding. +template +void Parser::_filter_nl_dquoted(FilterProcessor &C4_RESTRICT proc) noexcept +{ + _RYML_CB_ASSERT(this->callbacks(), proc.curr() == '\n'); - _grow_filter_arena(s.len + 2u * s.count('\\')); - substr r = s; - size_t pos = 0; // the filtered size - bool filtered_chars = false; - for(size_t i = 0; i < r.len; ++i) + _c4dbgfdq("found newline. sofar=[{}]~~~{}~~~", proc.wpos, proc.sofar()); + size_t ii = proc.rpos; + const size_t numnl_following = _count_following_newlines(proc.src, &ii); + if(numnl_following) { - const char curr = r[i]; - _c4dbgfdq("[{}]: '{}'", i, _c4prc(curr)); - if(curr == ' ' || curr == '\t') - { - _filter_ws(r, &i, &pos); - } - else if(curr == '\n') + proc.set('\n', numnl_following); + _c4dbgfdq("{} consecutive (empty) lines {}. totalws={}", 1+numnl_following, ii < proc.src.len ? "in the middle" : "at the end", proc.rpos-ii); + } + else + { + const size_t ret = proc.src.first_not_of(" \t", proc.rpos+1); + if(ret != npos) { - filtered_chars = _filter_nl(r, &i, &pos, /*indentation*/0); + proc.set(' '); + _c4dbgfdq("single newline. convert to space. ret={}/{}. 
sofar=[{}]~~~{}~~~", ii, proc.src.len, proc.wpos, proc.sofar()); } - else if(curr == '\r') // skip \r --- https://stackoverflow.com/questions/1885900 + else { - ; + proc.set(' '); + _c4dbgfdq("single newline. convert to space. ii={}/{}. sofar=[{}]~~~{}~~~", ii, proc.src.len, proc.wpos, proc.sofar()); } - else if(curr == '\\') + if(ii < proc.src.len && proc.src.str[ii] == '\\') { - char next = i+1 < r.len ? r[i+1] : '\0'; - _c4dbgfdq("[{}]: backslash, next='{}'", i, _c4prc(next)); - filtered_chars = true; - if(next == '\r') - { - if(i+2 < r.len && r[i+2] == '\n') - { - ++i; // newline escaped with \ -- skip both (add only one as i is loop-incremented) - next = '\n'; - _c4dbgfdq("[{}]: was \\r\\n, now next='\\n'", i); - } - } - // remember the loop will also increment i - if(next == '\n') - { - size_t ii = i + 2; - for( ; ii < r.len; ++ii) - { - if(r.str[ii] == ' ' || r.str[ii] == '\t') // skip leading whitespace - ; - else - break; - } - i += ii - i - 1; - } - else if(next == '"' || next == '/' || next == ' ' || next == '\t') // escapes for json compatibility - { - m_filter_arena.str[pos++] = next; - ++i; - } - else if(next == '\r') - { - //++i; - } - else if(next == 'n') - { - m_filter_arena.str[pos++] = '\n'; - ++i; - } - else if(next == 'r') - { - m_filter_arena.str[pos++] = '\r'; - ++i; // skip - } - else if(next == 't') - { - m_filter_arena.str[pos++] = '\t'; - ++i; - } - else if(next == '\\') - { - m_filter_arena.str[pos++] = '\\'; - ++i; - } - else if(next == 'x') // UTF8 - { - if(i + 1u + 2u >= r.len) - _c4err("\\x requires 2 hex digits"); - uint8_t byteval = {}; - if(!read_hex(r.sub(i + 2u, 2u), &byteval)) - _c4err("failed to read \\x codepoint"); - m_filter_arena.str[pos++] = *(char*)&byteval; - i += 1u + 2u; - } - else if(next == 'u') // UTF16 - { - if(i + 1u + 4u >= r.len) - _c4err("\\u requires 4 hex digits"); - char readbuf[8]; - csubstr codepoint = r.sub(i + 2u, 4u); - uint32_t codepoint_val = {}; - if(!read_hex(codepoint, &codepoint_val)) - 
_c4err("failed to parse \\u codepoint"); - size_t numbytes = decode_code_point((uint8_t*)readbuf, sizeof(readbuf), codepoint_val); - C4_ASSERT(numbytes <= 4); - memcpy(m_filter_arena.str + pos, readbuf, numbytes); - pos += numbytes; - i += 1u + 4u; - } - else if(next == 'U') // UTF32 - { - if(i + 1u + 8u >= r.len) - _c4err("\\U requires 8 hex digits"); - char readbuf[8]; - csubstr codepoint = r.sub(i + 2u, 8u); - uint32_t codepoint_val = {}; - if(!read_hex(codepoint, &codepoint_val)) - _c4err("failed to parse \\U codepoint"); - size_t numbytes = decode_code_point((uint8_t*)readbuf, sizeof(readbuf), codepoint_val); - C4_ASSERT(numbytes <= 4); - memcpy(m_filter_arena.str + pos, readbuf, numbytes); - pos += numbytes; - i += 1u + 8u; - } - // https://yaml.org/spec/1.2.2/#rule-c-ns-esc-char - else if(next == '0') - { - m_filter_arena.str[pos++] = '\0'; - ++i; - } - else if(next == 'b') // backspace - { - m_filter_arena.str[pos++] = '\b'; - ++i; - } - else if(next == 'f') // form feed - { - m_filter_arena.str[pos++] = '\f'; - ++i; - } - else if(next == 'a') // bell character - { - m_filter_arena.str[pos++] = '\a'; - ++i; - } - else if(next == 'v') // vertical tab - { - m_filter_arena.str[pos++] = '\v'; - ++i; - } - else if(next == 'e') // escape character - { - m_filter_arena.str[pos++] = '\x1b'; - ++i; - } - else if(next == '_') // unicode non breaking space \u00a0 - { - // https://www.compart.com/en/unicode/U+00a0 - m_filter_arena.str[pos++] = _RYML_CHCONST(-0x3e, 0xc2); - m_filter_arena.str[pos++] = _RYML_CHCONST(-0x60, 0xa0); - ++i; - } - else if(next == 'N') // unicode next line \u0085 + _c4dbgfdq("backslash at [{}]", ii); + const char next = ii+1 < proc.src.len ? 
proc.src.str[ii+1] : '\0'; + if(next == ' ' || next == '\t') { - // https://www.compart.com/en/unicode/U+0085 - m_filter_arena.str[pos++] = _RYML_CHCONST(-0x3e, 0xc2); - m_filter_arena.str[pos++] = _RYML_CHCONST(-0x7b, 0x85); - ++i; + _c4dbgfdq("extend skip to backslash", ""); + ++ii; } - else if(next == 'L') // unicode line separator \u2028 - { - // https://www.utf8-chartable.de/unicode-utf8-table.pl?start=8192&number=1024&names=-&utf8=0x&unicodeinhtml=hex - m_filter_arena.str[pos++] = _RYML_CHCONST(-0x1e, 0xe2); - m_filter_arena.str[pos++] = _RYML_CHCONST(-0x80, 0x80); - m_filter_arena.str[pos++] = _RYML_CHCONST(-0x58, 0xa8); - ++i; - } - else if(next == 'P') // unicode paragraph separator \u2029 - { - // https://www.utf8-chartable.de/unicode-utf8-table.pl?start=8192&number=1024&names=-&utf8=0x&unicodeinhtml=hex - m_filter_arena.str[pos++] = _RYML_CHCONST(-0x1e, 0xe2); - m_filter_arena.str[pos++] = _RYML_CHCONST(-0x80, 0x80); - m_filter_arena.str[pos++] = _RYML_CHCONST(-0x57, 0xa9); - ++i; - } - _c4dbgfdq("[{}]: backslash...sofar=[{}]~~~{}~~~", i, pos, m_filter_arena.first(pos)); } - else + } + proc.rpos = ii; +} + +template +void Parser::_filter_dquoted_backslash(FilterProcessor &C4_RESTRICT proc) +{ + char next = proc.next(); + _c4dbgfdq("backslash, next='{}'", _c4prc(next)); + if(next == '\r') + { + if(proc.rpos+2 < proc.src.len && proc.src.str[proc.rpos+2] == '\n') { - m_filter_arena.str[pos++] = curr; + proc.skip(); // newline escaped with \ -- skip both (add only one as i is loop-incremented) + next = '\n'; + _c4dbgfdq("[{}]: was \\r\\n, now next='\\n'", proc.rpos); } } - _RYML_CB_ASSERT(m_stack.m_callbacks, pos <= m_filter_arena.len); - if(pos < r.len || filtered_chars) + if(next == '\n') { - r = _finish_filter_arena(r, pos); + size_t ii = proc.rpos + 2; + for( ; ii < proc.src.len; ++ii) + { + // skip leading whitespace + if(proc.src.str[ii] == ' ' || proc.src.str[ii] == '\t') + ; + else + break; + } + proc.skip(ii - proc.rpos); } - - 
_RYML_CB_ASSERT(m_stack.m_callbacks, s.len >= r.len); - _c4dbgpf(": #filteredchars={} after=~~~{}~~~", s.len - r.len, r); - - #undef _c4dbgfdq - - return r; + else if(next == '"' || next == '/' || next == ' ' || next == '\t') + { + // escapes for json compatibility + proc.translate_esc(next); + _c4dbgfdq("here, used '{}'", _c4prc(next)); + } + else if(next == '\r') + { + //proc.skip(); + } + else if(next == 'n') + { + proc.translate_esc('\n'); + } + else if(next == 'r') + { + proc.translate_esc('\r'); + } + else if(next == 't') + { + proc.translate_esc('\t'); + } + else if(next == '\\') + { + proc.translate_esc('\\'); + } + else if(next == 'x') // UTF8 + { + if(C4_UNLIKELY(proc.rpos + 1u + 2u >= proc.src.len)) + _c4err("\\x requires 2 hex digits. scalar pos={}", proc.rpos); + csubstr codepoint = proc.src.sub(proc.rpos + 2u, 2u); + _c4dbgfdq("utf8 ~~~{}~~~ rpos={} rem=~~~{}~~~", codepoint, proc.rpos, proc.src.sub(proc.rpos)); + uint8_t byteval = {}; + if(C4_UNLIKELY(!read_hex(codepoint, &byteval))) + _c4err("failed to read \\x codepoint. scalar pos={}", proc.rpos); + proc.translate_esc_bulk((const char*)&byteval, 1u, /*nread*/3u); + _c4dbgfdq("utf8 after rpos={} rem=~~~{}~~~", proc.rpos, proc.src.sub(proc.rpos)); + } + else if(next == 'u') // UTF16 + { + if(C4_UNLIKELY(proc.rpos + 1u + 4u >= proc.src.len)) + _c4err("\\u requires 4 hex digits. scalar pos={}", proc.rpos); + char readbuf[8]; + csubstr codepoint = proc.src.sub(proc.rpos + 2u, 4u); + uint32_t codepoint_val = {}; + if(C4_UNLIKELY(!read_hex(codepoint, &codepoint_val))) + _c4err("failed to parse \\u codepoint. scalar pos={}", proc.rpos); + size_t numbytes = decode_code_point((uint8_t*)readbuf, sizeof(readbuf), codepoint_val); + C4_ASSERT(numbytes <= 4); + proc.translate_esc_bulk(readbuf, numbytes, /*nread*/5u); + } + else if(next == 'U') // UTF32 + { + if(C4_UNLIKELY(proc.rpos + 1u + 8u >= proc.src.len)) + _c4err("\\U requires 8 hex digits. 
scalar pos={}", proc.rpos); + char readbuf[8]; + csubstr codepoint = proc.src.sub(proc.rpos + 2u, 8u); + uint32_t codepoint_val = {}; + if(C4_UNLIKELY(!read_hex(codepoint, &codepoint_val))) + _c4err("failed to parse \\U codepoint. scalar pos={}", proc.rpos); + size_t numbytes = decode_code_point((uint8_t*)readbuf, sizeof(readbuf), codepoint_val); + C4_ASSERT(numbytes <= 4); + proc.translate_esc_bulk(readbuf, numbytes, /*nread*/9u); + } + // https://yaml.org/spec/1.2.2/#rule-c-ns-esc-char + else if(next == '0') + { + proc.translate_esc('\0'); + } + else if(next == 'b') // backspace + { + proc.translate_esc('\b'); + } + else if(next == 'f') // form feed + { + proc.translate_esc('\f'); + } + else if(next == 'a') // bell character + { + proc.translate_esc('\a'); + } + else if(next == 'v') // vertical tab + { + proc.translate_esc('\v'); + } + else if(next == 'e') // escape character + { + proc.translate_esc('\x1b'); + } + else if(next == '_') // unicode non breaking space \u00a0 + { + // https://www.compart.com/en/unicode/U+00a0 + const char payload[] = { + _RYML_CHCONST(-0x3e, 0xc2), + _RYML_CHCONST(-0x60, 0xa0), + }; + proc.translate_esc_bulk(payload, /*nwrite*/2, /*nread*/1); + } + else if(next == 'N') // unicode next line \u0085 + { + // https://www.compart.com/en/unicode/U+0085 + const char payload[] = { + _RYML_CHCONST(-0x3e, 0xc2), + _RYML_CHCONST(-0x7b, 0x85), + }; + proc.translate_esc_bulk(payload, /*nwrite*/2, /*nread*/1); + } + else if(next == 'L') // unicode line separator \u2028 + { + // https://www.utf8-chartable.de/unicode-utf8-table.pl?start=8192&number=1024&names=-&utf8=0x&unicodeinhtml=hex + const char payload[] = { + _RYML_CHCONST(-0x1e, 0xe2), + _RYML_CHCONST(-0x80, 0x80), + _RYML_CHCONST(-0x58, 0xa8), + }; + proc.translate_esc_extending(payload, /*nwrite*/3, /*nread*/1); + } + else if(next == 'P') // unicode paragraph separator \u2029 + { + // 
https://www.utf8-chartable.de/unicode-utf8-table.pl?start=8192&number=1024&names=-&utf8=0x&unicodeinhtml=hex + const char payload[] = { + _RYML_CHCONST(-0x1e, 0xe2), + _RYML_CHCONST(-0x80, 0x80), + _RYML_CHCONST(-0x57, 0xa9), + }; + proc.translate_esc_extending(payload, /*nwrite*/3, /*nread*/1); + } + else if(next == '\0') + { + proc.skip(); + } + else + { + _c4err("unknown character '{}' after '\\' pos={}", _c4prc(next), proc.rpos); + } + _c4dbgfdq("backslash...sofar=[{}]~~~{}~~~", proc.wpos, proc.sofar()); } -//----------------------------------------------------------------------------- -bool Parser::_apply_chomp(substr buf, size_t *C4_RESTRICT pos, BlockChomp_e chomp) +template +auto Parser::_filter_dquoted(FilterProcessor &C4_RESTRICT proc) -> decltype(proc.result()) { - substr trimmed = buf.first(*pos).trimr('\n'); - bool added_newline = false; - switch(chomp) + _c4dbgfdq("before=[{}]~~~{}~~~", proc.src.len, proc.src); + // from the YAML spec for double-quoted scalars: + // https://yaml.org/spec/1.2-old/spec.html#style/flow/double-quoted + while(proc.has_more_chars()) { - case CHOMP_KEEP: - if(trimmed.len == *pos) + const char curr = proc.curr(); + _c4dbgfdq("'{}' sofar=[{}]~~~{}~~~", _c4prc(curr), proc.wpos, proc.sofar()); + switch(curr) { - _c4dbgpf("chomp=KEEP: add missing newline @{}", *pos); - //m_filter_arena.str[(*pos)++] = '\n'; - added_newline = true; + case ' ': + case '\t': + { + _c4dbgfdq("whitespace", curr); + _filter_ws_copy_trailing(proc); + break; } - break; - case CHOMP_CLIP: - if(trimmed.len == *pos) + case '\n': { - _c4dbgpf("chomp=CLIP: add missing newline @{}", *pos); - m_filter_arena.str[(*pos)++] = '\n'; - added_newline = true; + _c4dbgfdq("newline", curr); + _filter_nl_dquoted(proc); + break; } - else + case '\r': // skip \r --- https://stackoverflow.com/questions/1885900 { - _c4dbgpf("chomp=CLIP: include single trailing newline @{}", trimmed.len+1); - *pos = trimmed.len + 1; + _c4dbgfdq("carriage return, ignore", curr); + proc.skip(); 
+ break; + } + case '\\': + { + _filter_dquoted_backslash(proc); + break; + } + default: + { + proc.copy(); + break; + } } - break; - case CHOMP_STRIP: - _c4dbgpf("chomp=STRIP: strip {}-{}-{} newlines", *pos, trimmed.len, *pos-trimmed.len); - *pos = trimmed.len; - break; - default: - _c4err("unknown chomp style"); } - return added_newline; + _c4dbgfdq("after[{}]=~~~{}~~~", proc.wpos, proc.sofar()); + return proc.result(); } +#undef _c4dbgfdq + +FilterResult Parser::filter_scalar_dquoted(csubstr scalar, substr dst) +{ + FilterProcessorSrcDst proc(scalar, dst); + return _filter_dquoted(proc); +} + +FilterResultExtending Parser::filter_scalar_dquoted_in_place(substr dst, size_t cap) +{ + FilterProcessorInplaceMidExtending proc(dst, cap); + return _filter_dquoted(proc); +} + + +//----------------------------------------------------------------------------- //----------------------------------------------------------------------------- -csubstr Parser::_filter_block_scalar(substr s, BlockStyle_e style, BlockChomp_e chomp, size_t indentation) +//----------------------------------------------------------------------------- +// block filtering helpers + +RYML_EXPORT size_t detail::_find_last_newline_and_larger_indentation(csubstr s, size_t indentation) noexcept +{ + if(indentation + 1 > s.len) + return npos; + for(size_t i = s.len-indentation-1; i != size_t(-1); --i) + { + if(s.str[i] == '\n') + { + csubstr rem = s.sub(i + 1); + size_t first = rem.first_not_of(' '); + first = (first != npos) ? first : rem.len; + if(first > indentation) + return i; + } + } + return npos; +} + +template +void Parser::_filter_chomp(FilterProcessor &C4_RESTRICT proc, BlockChomp_e chomp, size_t indentation) noexcept { + _RYML_CB_ASSERT(this->callbacks(), chomp == CHOMP_CLIP || chomp == CHOMP_KEEP || chomp == CHOMP_STRIP); + _RYML_CB_ASSERT(this->callbacks(), proc.rem().first_not_of(" \n\r") == npos); + // a debugging scaffold: #if 0 - #define _c4dbgfbl(fmt, ...) 
_c4dbgpf("filt_block" fmt, __VA_ARGS__) + #define _c4dbgchomp(fmt, ...) _c4dbgpf("chomp[{}->{}]: " fmt, proc.rpos, proc.wpos, __VA_ARGS__) #else - #define _c4dbgfbl(...) + #define _c4dbgchomp(...) #endif - _c4dbgfbl(": indentation={} before=[{}]~~~{}~~~", indentation, s.len, s); + // advance to the last line having spaces beyond the indentation + { + size_t last = detail::_find_last_newline_and_larger_indentation(proc.rem(), indentation); + if(last != npos) + { + _c4dbgchomp("found newline and larger indentation. last={}", last); + last = proc.rpos + last + size_t(1) + indentation; // last started at to-be-read. + _RYML_CB_ASSERT(this->callbacks(), last <= proc.src.len); + // remove indentation spaces, copy the rest + while((proc.rpos < last) && proc.has_more_chars()) + { + const char curr = proc.curr(); + _c4dbgchomp("curr='{}'", _c4prc(curr)); + _RYML_CB_ASSERT(this->callbacks(), curr == '\n' || curr == '\r'); + switch(curr) + { + case '\n': + { + _c4dbgchomp("newline! remlen={}", proc.rem().len); + proc.copy(); + // are there spaces after the newline? + csubstr at_next_line = proc.rem(); + if(at_next_line.begins_with(' ')) + { + _c4dbgchomp("next line begins with spaces. indentation={}", indentation); + // there are spaces. + size_t first_non_space = at_next_line.first_not_of(' '); + _c4dbgchomp("first_non_space={}", first_non_space); + if(first_non_space == npos) + { + _c4dbgchomp("{} spaces, to the end", at_next_line.len); + first_non_space = at_next_line.len; + } + if(first_non_space <= indentation) + { + _c4dbgchomp("skip spaces={}<=indentation={}", first_non_space, indentation); + proc.skip(first_non_space); + } + else + { + _c4dbgchomp("skip indentation={}{}]: " fmt, proc.rpos, proc.wpos, __VA_ARGS__) +#else +#define _c4dbgfb(...) 
+#endif + +template +void Parser::_filter_block_indentation(FilterProcessor &C4_RESTRICT proc, size_t indentation) noexcept +{ + csubstr rem = proc.rem(); // remaining + if(rem.len) { - case BLOCK_LITERAL: + size_t first = rem.first_not_of(' '); + if(first != npos) + { + _c4dbgfb("{} spaces follow before next nonws character", first); + if(first < indentation) + { + _c4dbgfb("skip {}<{} spaces from indentation", first, indentation); + proc.skip(first); + } + else + { + _c4dbgfb("skip {} spaces from indentation", indentation); + proc.skip(indentation); + } + } + else { - _c4dbgp("filt_block: style=literal"); - // trim leading whitespace up to indentation + C4_ERROR("crl"); + // UNCOVERED + _c4dbgfb("all spaces to the end: {} spaces", first); + first = rem.len; + if(first) { - size_t numws = r.first_not_of(' '); - if(numws != npos) + if(first < indentation) { - if(numws > indentation) - r = r.sub(indentation); - else - r = r.sub(numws); - _c4dbgfbl(": after triml=[{}]~~~{}~~~", r.len, r); + _c4dbgfb("skip everything", first); + proc.skip(proc.src.len - proc.rpos); } else { - if(chomp != CHOMP_KEEP || r.len == 0) - { - _c4dbgfbl(": all spaces {}, return empty", r.len); - return r.first(0); - } - else - { - r[0] = '\n'; - return r.first(1); - } + _c4dbgfb("skip {} spaces from indentation", indentation); + proc.skip(indentation); } } - _grow_filter_arena(s.len + 2u); // use s.len! 
because we may need to add a newline at the end, so the leading indentation will allow space for that newline - size_t pos = 0; // the filtered size - for(size_t i = 0; i < r.len; ++i) + } + } +} + +template +size_t Parser::_handle_all_whitespace(FilterProcessor &C4_RESTRICT proc, BlockChomp_e chomp) noexcept +{ + csubstr contents = proc.src.trimr(" \n\r"); + _c4dbgfb("ws: contents_len={} wslen={}", contents.len, proc.src.len-contents.len); + if(!contents.len) + { + _c4dbgfb("ws: all whitespace: len={}", proc.src.len); + if(chomp == CHOMP_KEEP && proc.src.len) + { + _c4dbgfb("ws: chomp=KEEP all {} newlines", proc.src.count('\n')); + while(proc.has_more_chars()) { - const char curr = r.str[i]; - _c4dbgfbl("[{}]='{}' pos={}", i, _c4prc(curr), pos); - if(curr == '\r') - continue; - m_filter_arena.str[pos++] = curr; + const char curr = proc.curr(); if(curr == '\n') - { - _c4dbgfbl("[{}]: found newline", i); - // skip indentation on the next line - csubstr rem = r.sub(i+1); - size_t first = rem.first_not_of(' '); - if(first != npos) - { - _RYML_CB_ASSERT(m_stack.m_callbacks, first < rem.len); - _RYML_CB_ASSERT(m_stack.m_callbacks, i+1+first < r.len); - _c4dbgfbl("[{}]: {} spaces follow before next nonws character @ [{}]='{}'", i, first, i+1+first, rem.str[first]); - if(first < indentation) - { - _c4dbgfbl("[{}]: skip {}<{} spaces from indentation", i, first, indentation); - i += first; - } - else - { - _c4dbgfbl("[{}]: skip {} spaces from indentation", i, indentation); - i += indentation; - } - } - else - { - _RYML_CB_ASSERT(m_stack.m_callbacks, i+1 <= r.len); - first = rem.len; - _c4dbgfbl("[{}]: {} spaces to the end", i, first); - if(first) - { - if(first < indentation) - { - _c4dbgfbl("[{}]: skip everything", i); - --pos; - break; - } - else - { - _c4dbgfbl("[{}]: skip {} spaces from indentation", i, indentation); - i += indentation; - } - } - else if(i+1 == r.len) - { - if(chomp == CHOMP_STRIP) - --pos; - break; - } - } - } + proc.copy(); + else + proc.skip(); } - 
_RYML_CB_ASSERT(m_stack.m_callbacks, s.len >= pos); - _c4dbgfbl(": #filteredchars={} after=~~~{}~~~", s.len - r.len, r); - bool changed = _apply_chomp(m_filter_arena, &pos, chomp); - _RYML_CB_ASSERT(m_stack.m_callbacks, pos <= m_filter_arena.len); - _RYML_CB_ASSERT(m_stack.m_callbacks, pos <= s.len); - if(pos < r.len || changed) + if(!proc.wpos) { - r = _finish_filter_arena(s, pos); // write into s + proc.set('\n'); } + } + } + return contents.len; +} + +template +size_t Parser::_extend_to_chomp(FilterProcessor &C4_RESTRICT proc, size_t contents_len) noexcept +{ + _c4dbgfb("contents_len={}", contents_len); + + _RYML_CB_ASSERT(this->callbacks(), contents_len > 0u); + + // extend contents to just before the first newline at the end, + // in case it is preceded by spaces + size_t firstnewl = proc.src.first_of('\n', contents_len); + if(firstnewl != npos) + { + contents_len = firstnewl; + _c4dbgfb("contents_len={} <--- firstnewl={}", contents_len, firstnewl); + } + else + { + contents_len = proc.src.len; + _c4dbgfb("contents_len={} <--- src.len={}", contents_len, proc.src.len); + } + + return contents_len; +} + +#undef _c4dbgfb + + +//----------------------------------------------------------------------------- +//----------------------------------------------------------------------------- +//----------------------------------------------------------------------------- + +// a debugging scaffold: +#if 0 +#define _c4dbgfbl(fmt, ...) _c4dbgpf("filt_block_lit[{}->{}]: " fmt, proc.rpos, proc.wpos, __VA_ARGS__) +#else +#define _c4dbgfbl(...) 
+#endif + +template +auto Parser::_filter_block_literal(FilterProcessor &C4_RESTRICT proc, size_t indentation, BlockChomp_e chomp) noexcept -> decltype(proc.result()) +{ + _c4dbgfbl("indentation={} before=[{}]~~~{}~~~", indentation, proc.src.len, proc.src); + + size_t contents_len = _handle_all_whitespace(proc, chomp); + if(!contents_len) + return proc.result(); + + contents_len = _extend_to_chomp(proc, contents_len); + + _c4dbgfbl("to filter=[{}]~~~{}~~~", contents_len, proc.src.first(contents_len)); + + _filter_block_indentation(proc, indentation); + + // now filter the bulk + while(proc.has_more_chars(/*maxpos*/contents_len)) + { + const char curr = proc.curr(); + _c4dbgfbl("'{}' sofar=[{}]~~~{}~~~", _c4prc(curr), proc.wpos, proc.sofar()); + switch(curr) + { + case '\n': + { + _c4dbgfbl("found newline. skip indentation on the next line", curr); + proc.copy(); // copy the newline + _filter_block_indentation(proc, indentation); break; } - case BLOCK_FOLD: + case '\r': + proc.skip(); + break; + default: + proc.copy(); + break; + } + } + + _c4dbgfbl("before chomp: #tochomp={} sofar=[{}]~~~{}~~~", proc.rem().len, proc.sofar().len, proc.sofar()); + + _filter_chomp(proc, chomp, indentation); + + _c4dbgfbl("final=[{}]~~~{}~~~", proc.sofar().len, proc.sofar()); + + return proc.result(); +} + +#undef _c4dbgfbl + +FilterResult Parser::filter_scalar_block_literal(csubstr scalar, substr dst, size_t indentation, BlockChomp_e chomp) noexcept +{ + FilterProcessorSrcDst proc(scalar, dst); + return _filter_block_literal(proc, indentation, chomp); +} + +FilterResult Parser::filter_scalar_block_literal_in_place(substr scalar, size_t cap, size_t indentation, BlockChomp_e chomp) noexcept +{ + FilterProcessorInplaceEndExtending proc(scalar, cap); + return _filter_block_literal(proc, indentation, chomp); +} + + +//----------------------------------------------------------------------------- +//----------------------------------------------------------------------------- 
+//----------------------------------------------------------------------------- + +// a debugging scaffold: +#if 0 +#define _c4dbgfbf(fmt, ...) _c4dbgpf("filt_block_folded[{}->{}]: " fmt, proc.rpos, proc.wpos, __VA_ARGS__) +#else +#define _c4dbgfbf(...) +#endif + + +template +void Parser::_filter_block_folded_newlines_leading(FilterProcessor &C4_RESTRICT proc, size_t indentation, size_t len) noexcept +{ + _filter_block_indentation(proc, indentation); + while(proc.has_more_chars(len)) + { + const char curr = proc.curr(); + _c4dbgfbf("'{}' sofar=[{}]~~~{}~~~", _c4prc(curr), proc.wpos, proc.sofar()); + switch(curr) { - _c4dbgp("filt_block: style=fold"); - _grow_filter_arena(r.len + 2); - size_t pos = 0; // the filtered size - bool filtered_chars = false; - bool started = false; - bool is_indented = false; - size_t i = r.first_not_of(' '); - _c4dbgfbl(": first non space at {}", i); - if(i > indentation) + case '\n': + _c4dbgfbf("newline.", curr); + proc.copy(); + _filter_block_indentation(proc, indentation); + break; + case '\r': + proc.skip(); + break; + case ' ': + case '\t': + { + size_t first = proc.rem().first_not_of(" \t"); + _c4dbgfbf("space. first={}", first); + if(first == npos) + first = proc.rem().len; + _c4dbgfbf("... indentation increased to {}", first); + _filter_block_folded_indented_block(proc, indentation, len, first); + break; + } + default: + _c4dbgfbf("newl leading: not space, not newline. stop.", 0); + return; + } + } +} + +template +void Parser::_filter_block_folded_newlines(FilterProcessor &C4_RESTRICT proc, size_t indentation, size_t len) noexcept +{ + _RYML_CB_ASSERT(this->callbacks(), proc.curr() == '\n'); + size_t num_newl = 0; + size_t wpos_at_first_newl = npos; + while(proc.has_more_chars(len)) + { + const char curr = proc.curr(); + _c4dbgfbf("'{}' sofar=[{}]~~~{}~~~", _c4prc(curr), proc.wpos, proc.sofar()); + switch(curr) + { + case '\n': + _c4dbgfbf("newline. 
sofar={}", num_newl); + switch(++num_newl) { - is_indented = true; - i = indentation; + case 1u: + _c4dbgfbf("... this is the first newline. turn into space. wpos={}", proc.wpos); + wpos_at_first_newl = proc.wpos; + proc.skip(); + proc.set(' '); + break; + case 2u: + _c4dbgfbf("... this is the second newline. prev space (at wpos={}) must be newline", wpos_at_first_newl); + _RYML_CB_ASSERT(this->callbacks(), wpos_at_first_newl != npos); + _RYML_CB_ASSERT(this->callbacks(), proc.sofar()[wpos_at_first_newl] == ' '); + _RYML_CB_ASSERT(this->callbacks(), wpos_at_first_newl + 1u == proc.wpos); + proc.skip(); + proc.set_at(wpos_at_first_newl, '\n'); + _RYML_CB_ASSERT(this->callbacks(), proc.sofar()[wpos_at_first_newl] == '\n'); + break; + default: + _c4dbgfbf("... subsequent newline (num_newl={}). copy", num_newl); + proc.copy(); + break; } - _c4dbgfbl(": start folding at {}, is_indented={}", i, (int)is_indented); - auto on_change_indentation = [&](size_t numnl_following, size_t last_newl, size_t first_non_whitespace){ - _c4dbgfbl("[{}]: add 1+{} newlines", i, numnl_following); - for(size_t j = 0; j < 1 + numnl_following; ++j) - m_filter_arena.str[pos++] = '\n'; - for(i = last_newl + 1 + indentation; i < first_non_whitespace; ++i) + _filter_block_indentation(proc, indentation); + break; + case ' ': + case '\t': + { + size_t first = proc.rem().first_not_of(" \t"); + _c4dbgfbf("space. first={}", first); + if(first == npos) + first = proc.rem().len; + _c4dbgfbf("... indentation increased to {}", first); + if(num_newl) { - if(r.str[i] == '\r') - continue; - _c4dbgfbl("[{}]: add '{}'", i, _c4prc(r.str[i])); - m_filter_arena.str[pos++] = r.str[i]; + _c4dbgfbf("... prev space (at wpos={}) must be newline", wpos_at_first_newl); + proc.set_at(wpos_at_first_newl, '\n'); + } + if(num_newl > 1u) { + _c4dbgfbf("... 
add missing newline", wpos_at_first_newl); + proc.set('\n'); } - --i; - }; - for( ; i < r.len; ++i) + _filter_block_folded_indented_block(proc, indentation, len, first); + num_newl = 0; + wpos_at_first_newl = npos; + break; + } + case '\r': + proc.skip(); + break; + default: + _c4dbgfbf("not space, not newline. stop.", 0); + return; + } + } +} + + +template +void Parser::_filter_block_folded_indented_block(FilterProcessor &C4_RESTRICT proc, size_t indentation, size_t len, size_t curr_indentation) noexcept +{ + _RYML_CB_ASSERT(this->callbacks(), (proc.rem().first_not_of(" \t") == curr_indentation) || (proc.rem().first_not_of(" \t") == npos)); + proc.copy(curr_indentation); + while(proc.has_more_chars(len)) + { + const char curr = proc.curr(); + _c4dbgfbf("'{}' sofar=[{}]~~~{}~~~", _c4prc(curr), proc.wpos, proc.sofar()); + switch(curr) + { + case '\n': { - const char curr = r.str[i]; - _c4dbgfbl("[{}]='{}'", i, _c4prc(curr)); - if(curr == '\n') + proc.copy(); + _filter_block_indentation(proc, indentation); + csubstr rem = proc.rem(); + const size_t first = rem.first_not_of(' '); + _c4dbgfbf("newline. firstns={}", first); + if(first == 0) { - filtered_chars = true; - // skip indentation on the next line, and advance over the next non-indented blank lines as well - size_t first_non_whitespace; - size_t numnl_following = (size_t)-1; - while(r[i] == '\n') + const char c = rem[first]; + _c4dbgfbf("firstns={}='{}'", first, _c4prc(c)); + if(c == '\n' || c == '\r') { - ++numnl_following; - csubstr rem = r.sub(i+1); - size_t first = rem.first_not_of(' '); - _c4dbgfbl("[{}]: found newline. 
first={} rem.len={}", i, first, rem.len); - if(first != npos) - { - first_non_whitespace = first + i+1; - while(first_non_whitespace < r.len && r[first_non_whitespace] == '\r') - ++first_non_whitespace; - _RYML_CB_ASSERT(m_stack.m_callbacks, first < rem.len); - _RYML_CB_ASSERT(m_stack.m_callbacks, i+1+first < r.len); - _c4dbgfbl("[{}]: {} spaces follow before next nonws character @ [{}]='{}'", i, first, i+1+first, _c4prc(rem.str[first])); - if(first < indentation) - { - _c4dbgfbl("[{}]: skip {}<{} spaces from indentation", i, first, indentation); - i += first; - } - else - { - _c4dbgfbl("[{}]: skip {} spaces from indentation", i, indentation); - i += indentation; - if(first > indentation) - { - _c4dbgfbl("[{}]: {} further indented than {}, stop newlining", i, first, indentation); - goto finished_counting_newlines; - } - } - // prepare the next while loop iteration - // by setting i at the next newline after - // an empty line - if(r[first_non_whitespace] == '\n') - i = first_non_whitespace; - else - goto finished_counting_newlines; - } - else - { - _RYML_CB_ASSERT(m_stack.m_callbacks, i+1 <= r.len); - first = rem.len; - first_non_whitespace = first + i+1; - if(first) - { - _c4dbgfbl("[{}]: {} spaces to the end", i, first); - if(first < indentation) - { - _c4dbgfbl("[{}]: skip everything", i); - i += first; - } - else - { - _c4dbgfbl("[{}]: skip {} spaces from indentation", i, indentation); - i += indentation; - if(first > indentation) - { - _c4dbgfbl("[{}]: {} spaces missing. not done yet", i, indentation - first); - goto finished_counting_newlines; - } - } - } - else // if(i+1 == r.len) - { - _c4dbgfbl("[{}]: it's the final newline", i); - _RYML_CB_ASSERT(m_stack.m_callbacks, i+1 == r.len); - _RYML_CB_ASSERT(m_stack.m_callbacks, rem.len == 0); - } - goto end_of_scalar; - } - } - end_of_scalar: - // Write all the trailing newlines. Since we're - // at the end no folding is needed, so write every - // newline (add 1). 
- _c4dbgfbl("[{}]: add {} trailing newlines", i, 1+numnl_following); - for(size_t j = 0; j < 1 + numnl_following; ++j) - m_filter_arena.str[pos++] = '\n'; - break; - finished_counting_newlines: - _c4dbgfbl("[{}]: #newlines={} firstnonws={}", i, numnl_following, first_non_whitespace); - while(first_non_whitespace < r.len && r[first_non_whitespace] == '\t') - ++first_non_whitespace; - _c4dbgfbl("[{}]: #newlines={} firstnonws={}", i, numnl_following, first_non_whitespace); - _RYML_CB_ASSERT(m_stack.m_callbacks, first_non_whitespace <= r.len); - size_t last_newl = r.last_of('\n', first_non_whitespace); - size_t this_indentation = first_non_whitespace - last_newl - 1; - _c4dbgfbl("[{}]: #newlines={} firstnonws={} lastnewl={} this_indentation={} vs indentation={}", i, numnl_following, first_non_whitespace, last_newl, this_indentation, indentation); - _RYML_CB_ASSERT(m_stack.m_callbacks, first_non_whitespace >= last_newl + 1); - _RYML_CB_ASSERT(m_stack.m_callbacks, this_indentation >= indentation); - if(!started) - { - _c4dbgfbl("[{}]: #newlines={}. 
write all leading newlines", i, numnl_following); - for(size_t j = 0; j < 1 + numnl_following; ++j) - m_filter_arena.str[pos++] = '\n'; - if(this_indentation > indentation) - { - is_indented = true; - _c4dbgfbl("[{}]: advance ->{}", i, last_newl + indentation); - i = last_newl + indentation; - } - else - { - i = first_non_whitespace - 1; - _c4dbgfbl("[{}]: advance ->{}", i, first_non_whitespace); - } - } - else if(this_indentation == indentation) - { - _c4dbgfbl("[{}]: same indentation", i); - if(!is_indented) - { - if(numnl_following == 0) - { - _c4dbgfbl("[{}]: fold!", i); - m_filter_arena.str[pos++] = ' '; - } - else - { - _c4dbgfbl("[{}]: add {} newlines", i, 1 + numnl_following); - for(size_t j = 0; j < numnl_following; ++j) - m_filter_arena.str[pos++] = '\n'; - } - i = first_non_whitespace - 1; - _c4dbgfbl("[{}]: advance {}->{}", i, i, first_non_whitespace); - } - else - { - _c4dbgfbl("[{}]: back to ref indentation", i); - is_indented = false; - on_change_indentation(numnl_following, last_newl, first_non_whitespace); - _c4dbgfbl("[{}]: advance {}->{}", i, i, first_non_whitespace); - } + ; } else { - _c4dbgfbl("[{}]: increased indentation.", i); - is_indented = true; - _RYML_CB_ASSERT(m_stack.m_callbacks, this_indentation > indentation); - on_change_indentation(numnl_following, last_newl, first_non_whitespace); - _c4dbgfbl("[{}]: advance {}->{}", i, i, first_non_whitespace); + _c4dbgfbf("done with indented block", first); + goto endloop; } } - else if(curr != '\r') + else if(first != npos) { - if(curr != '\t') - started = true; - m_filter_arena.str[pos++] = curr; + proc.copy(first); + _c4dbgfbf("copy all {} spaces", first); } + break; } - _RYML_CB_ASSERT(m_stack.m_callbacks, pos <= m_filter_arena.len); - _c4dbgfbl(": #filteredchars={} after=[{}]~~~{}~~~", (int)s.len - (int)pos, pos, m_filter_arena.first(pos)); - bool changed = _apply_chomp(m_filter_arena, &pos, chomp); - if(pos < r.len || filtered_chars || changed) - { - r = _finish_filter_arena(s, pos); // 
write into s - } + break; + case '\r': + proc.skip(); + break; + default: + proc.copy(); + break; } - break; - default: - _c4err("unknown block style"); } + endloop: + return; +} + + +template +auto Parser::_filter_block_folded(FilterProcessor &C4_RESTRICT proc, size_t indentation, BlockChomp_e chomp) noexcept -> decltype(proc.result()) +{ + _c4dbgfbf("indentation={} before=[{}]~~~{}~~~", indentation, proc.src.len, proc.src); + + size_t contents_len = _handle_all_whitespace(proc, chomp); + if(!contents_len) + return proc.result(); + + contents_len = _extend_to_chomp(proc, contents_len); + + _c4dbgfbf("to filter=[{}]~~~{}~~~", contents_len, proc.src.first(contents_len)); + + _filter_block_folded_newlines_leading(proc, indentation, contents_len); + + // now filter the bulk + while(proc.has_more_chars(/*maxpos*/contents_len)) + { + const char curr = proc.curr(); + _c4dbgfbf("'{}' sofar=[{}]~~~{}~~~", _c4prc(curr), proc.wpos, proc.sofar()); + switch(curr) + { + case '\n': + { + _c4dbgfbf("found newline", curr); + _filter_block_folded_newlines(proc, indentation, contents_len); + break; + } + case '\r': + proc.skip(); + break; + default: + proc.copy(); + break; + } + } + + _c4dbgfbf("before chomp: #tochomp={} sofar=[{}]~~~{}~~~", proc.rem().len, proc.sofar().len, proc.sofar()); + + _filter_chomp(proc, chomp, indentation); + + _c4dbgfbf("final=[{}]~~~{}~~~", proc.sofar().len, proc.sofar()); + + return proc.result(); +} + +#undef _c4dbgfbf + +FilterResult Parser::filter_scalar_block_folded(csubstr scalar, substr dst, size_t indentation, BlockChomp_e chomp) noexcept +{ + FilterProcessorSrcDst proc(scalar, dst); + return _filter_block_folded(proc, indentation, chomp); +} + +FilterResult Parser::filter_scalar_block_folded_in_place(substr scalar, size_t cap, size_t indentation, BlockChomp_e chomp) noexcept +{ + FilterProcessorInplaceEndExtending proc(scalar, cap); + return _filter_block_folded(proc, indentation, chomp); +} + + 
+//----------------------------------------------------------------------------- +//----------------------------------------------------------------------------- +//----------------------------------------------------------------------------- + +csubstr Parser::_filter_scalar_plain(substr s, size_t indentation) +{ + _c4dbgpf("filtering plain scalar: s=[{}]~~~{}~~~", s.len, s); + FilterResult r = this->filter_scalar_plain_in_place(s, s.len, indentation); + _RYML_CB_ASSERT(m_stack.m_callbacks, r.valid()); + _c4dbgpf("filtering plain scalar: success! s=[{}]~~~{}~~~", r.get().len, r.get()); + return r.get(); +} + + +//----------------------------------------------------------------------------- +csubstr Parser::_filter_scalar_squot(substr s) +{ + _c4dbgpf("filtering squo scalar: s=[{}]~~~{}~~~", s.len, s); + FilterResult r = this->filter_scalar_squoted_in_place(s, s.len); + _RYML_CB_ASSERT(this->callbacks(), r.valid()); + _c4dbgpf("filtering squo scalar: success! s=[{}]~~~{}~~~", r.get().len, r.get()); + return r.get(); +} + + +//----------------------------------------------------------------------------- +csubstr Parser::_filter_scalar_dquot(substr s) +{ + _c4dbgpf("filtering dquo scalar: s=[{}]~~~{}~~~", s.len, s); + FilterResultExtending r = this->filter_scalar_dquoted_in_place(s, s.len); + if(C4_LIKELY(r.valid())) + { + _c4dbgpf("filtering dquo scalar: success! s=[{}]~~~{}~~~", r.get().len, r.get()); + return r.get(); + } + else + { + const size_t len = r.required_len(); + _c4dbgpf("filtering dquo scalar: not enough space: needs {}, have {}", len, s.len); + _RYML_CB_ASSERT(this->callbacks(), m_tree); + substr dst = m_tree->alloc_arena(len); + _c4dbgpf("filtering dquo scalar: dst.len={}", dst.len); + _RYML_CB_ASSERT(this->callbacks(), dst.len == len); + FilterResult rsd = this->filter_scalar_dquoted(s, dst); + _c4dbgpf("filtering dquo scalar: ... 
result now needs {} was {}", rsd.required_len(), len); + _RYML_CB_ASSERT(this->callbacks(), rsd.required_len() <= len); // may be smaller! + _RYML_CB_CHECK(m_stack.m_callbacks, rsd.valid()); + _c4dbgpf("filtering dquo scalar: success! s=[{}]~~~{}~~~", rsd.get().len, rsd.get()); + return rsd.get(); + } +} - _c4dbgfbl(": final=[{}]~~~{}~~~", r.len, r); - #undef _c4dbgfbl +//----------------------------------------------------------------------------- +csubstr Parser::_filter_scalar_block_literal(substr s, BlockChomp_e chomp, size_t indentation) +{ + _c4dbgpf("filtering block literal scalar: s=[{}]~~~{}~~~", s.len, s); + FilterResult r = this->filter_scalar_block_literal_in_place(s, s.len, indentation, chomp); + if(C4_LIKELY(r.valid())) + { + _c4dbgpf("filtering block literal scalar: success! s=[{}]~~~{}~~~", r.get().len, r.get()); + return r.get(); + } + else + { + _c4dbgpf("filtering block literal scalar: not enough space: needs {}, have {}", r.required_len(), s.len); + _RYML_CB_ASSERT(this->callbacks(), m_tree); + substr dst = m_tree->alloc_arena(r.required_len()); + FilterResult rsd = this->filter_scalar_block_literal(s, dst, indentation, chomp); + _RYML_CB_CHECK(m_stack.m_callbacks, rsd.valid()); + _c4dbgpf("filtering block literal scalar: success! s=[{}]~~~{}~~~", rsd.get().len, rsd.get()); + return rsd.get(); + } +} + - return r; +//----------------------------------------------------------------------------- +csubstr Parser::_filter_scalar_block_folded(substr s, BlockChomp_e chomp, size_t indentation) +{ + _c4dbgpf("filtering block folded scalar: s=[{}]~~~{}~~~", s.len, s); + FilterResult r = this->filter_scalar_block_folded_in_place(s, s.len, indentation, chomp); + if(C4_LIKELY(r.valid())) + { + _c4dbgpf("filtering block folded scalar: success! 
s=[{}]~~~{}~~~", r.get().len, r.get()); + return r.get(); + } + else + { + _c4dbgpf("filtering block folded scalar: not enough space: needs {}, have {}", r.required_len(), s.len); + _RYML_CB_ASSERT(this->callbacks(), m_tree); + substr dst = m_tree->alloc_arena(r.required_len()); + FilterResult rsd = this->filter_scalar_block_folded(s, dst, indentation, chomp); + _RYML_CB_CHECK(m_stack.m_callbacks, rsd.valid()); + _c4dbgpf("filtering block folded scalar: success! s=[{}]~~~{}~~~", rsd.get().len, rsd.get()); + return rsd.get(); + } } + //----------------------------------------------------------------------------- +//----------------------------------------------------------------------------- +//----------------------------------------------------------------------------- + size_t Parser::_count_nlines(csubstr src) { return 1 + src.count('\n'); @@ -5472,50 +5877,6 @@ csubstr Parser::_prfl(substr buf, flag_t flags) } -//----------------------------------------------------------------------------- -//----------------------------------------------------------------------------- -//----------------------------------------------------------------------------- - -void Parser::_grow_filter_arena(size_t num_characters_needed) -{ - _c4dbgpf("grow: arena={} numchars={}", m_filter_arena.len, num_characters_needed); - if(num_characters_needed <= m_filter_arena.len) - return; - size_t sz = m_filter_arena.len << 1; - _c4dbgpf("grow: sz={}", sz); - sz = num_characters_needed > sz ? num_characters_needed : sz; - _c4dbgpf("grow: sz={}", sz); - sz = sz < 128u ? 
128u : sz; - _c4dbgpf("grow: sz={}", sz); - _RYML_CB_ASSERT(m_stack.m_callbacks, sz >= num_characters_needed); - _resize_filter_arena(sz); -} - -void Parser::_resize_filter_arena(size_t num_characters) -{ - if(num_characters > m_filter_arena.len) - { - _c4dbgpf("resize: sz={}", num_characters); - char *prev = m_filter_arena.str; - if(m_filter_arena.str) - { - _RYML_CB_ASSERT(m_stack.m_callbacks, m_filter_arena.len > 0); - _RYML_CB_FREE(m_stack.m_callbacks, m_filter_arena.str, char, m_filter_arena.len); - } - m_filter_arena.str = _RYML_CB_ALLOC_HINT(m_stack.m_callbacks, char, num_characters, prev); - m_filter_arena.len = num_characters; - } -} - -substr Parser::_finish_filter_arena(substr dst, size_t pos) -{ - _RYML_CB_ASSERT(m_stack.m_callbacks, pos <= m_filter_arena.len); - _RYML_CB_ASSERT(m_stack.m_callbacks, pos <= dst.len); - memcpy(dst.str, m_filter_arena.str, pos); - return dst.first(pos); -} - - //----------------------------------------------------------------------------- //----------------------------------------------------------------------------- //----------------------------------------------------------------------------- diff --git a/src/c4/yml/parse.hpp b/src/c4/yml/parse.hpp index 659edf7e0..498a37c17 100644 --- a/src/c4/yml/parse.hpp +++ b/src/c4/yml/parse.hpp @@ -28,19 +28,23 @@ struct RYML_EXPORT ParserOptions private: typedef enum : uint32_t { - LOCATIONS = (1 << 0), - DEFAULTS = 0, + LOCATIONS = (1u << 1), + DEFAULTS = 0u, } Flags_e; uint32_t flags = DEFAULTS; + public: + ParserOptions() = default; +public: + /** @name source location tracking */ /** @{ */ /** enable/disable source location tracking */ - ParserOptions& locations(bool enabled) + ParserOptions& locations(bool enabled) noexcept { if(enabled) flags |= LOCATIONS; @@ -48,7 +52,8 @@ struct RYML_EXPORT ParserOptions flags &= ~LOCATIONS; return *this; } - bool locations() const { return (flags & LOCATIONS) != 0u; } + /** query source location tracking status */ + C4_ALWAYS_INLINE 
bool locations() const noexcept { return (flags & LOCATIONS); } /** @} */ }; @@ -108,12 +113,8 @@ class RYML_EXPORT Parser _resize_locations(num_source_lines); } - /** Reserve a certain capacity for the character arena used to - * filter scalars. */ - void reserve_filter_arena(size_t num_characters) - { - _resize_filter_arena(num_characters); - } + RYML_DEPRECATED("filter arena no longer needed") + void reserve_filter_arena(size_t) {} /** @} */ @@ -123,7 +124,7 @@ class RYML_EXPORT Parser /** @{ */ /** Get the current callbacks in the parser. */ - Callbacks callbacks() const { return m_stack.m_callbacks; } + Callbacks const& callbacks() const { return m_stack.m_callbacks; } /** Get the name of the latest file parsed by this object. */ csubstr filename() const { return m_file; } @@ -133,7 +134,8 @@ class RYML_EXPORT Parser size_t stack_capacity() const { return m_stack.capacity(); } size_t locations_capacity() const { return m_newline_offsets_capacity; } - size_t filter_arena_capacity() const { return m_filter_arena.len; } + RYML_DEPRECATED("filter arena no longer needed") + size_t filter_arena_capacity() const { return 0u; } ParserOptions const& options() const { return m_options; } @@ -279,18 +281,39 @@ class RYML_EXPORT Parser /** @} */ -private: +public: - typedef enum { - BLOCK_LITERAL, //!< keep newlines (|) - BLOCK_FOLD //!< replace newline with single space (>) - } BlockStyle_e; + using LocCRef = Location const& C4_RESTRICT; - typedef enum { - CHOMP_CLIP, //!< single newline at end (default) - CHOMP_STRIP, //!< no newline at end (-) - CHOMP_KEEP //!< all newlines from end (+) - } BlockChomp_e; + /** @name scalar filtering */ + /** @{*/ + + /** filter a plain scalar */ + FilterResult filter_scalar_plain(csubstr scalar, substr dst, size_t indentation) noexcept; + /** filter a plain scalar in place */ + FilterResult filter_scalar_plain_in_place(substr scalar, size_t cap, size_t indentation) noexcept; + + /** filter a single-quoted scalar */ + FilterResult 
filter_scalar_squoted(csubstr scalar, substr dst) noexcept; + /** filter a single-quoted scalar in place */ + FilterResult filter_scalar_squoted_in_place(substr scalar, size_t cap) noexcept; + + /** filter a double-quoted scalar */ + FilterResult filter_scalar_dquoted(csubstr scalar, substr dst); + /** filter a double-quoted scalar in place */ + FilterResultExtending filter_scalar_dquoted_in_place(substr scalar, size_t cap); + + /** filter a block-literal scalar */ + FilterResult filter_scalar_block_literal(csubstr scalar, substr dst, size_t indentation, BlockChomp_e chomp) noexcept; + /** filter a block-literal scalar in place */ + FilterResult filter_scalar_block_literal_in_place(substr scalar, size_t cap, size_t indentation, BlockChomp_e chomp) noexcept; + + /** filter a block-folded scalar */ + FilterResult filter_scalar_block_folded(csubstr scalar, substr dst, size_t indentation, BlockChomp_e chomp) noexcept; + /** filter a block-folded scalar in place */ + FilterResult filter_scalar_block_folded_in_place(substr scalar, size_t cap, size_t indentation, BlockChomp_e chomp) noexcept; + + /** @} */ private: @@ -330,15 +353,15 @@ class RYML_EXPORT Parser csubstr _scan_to_next_nonempty_line(size_t indentation); csubstr _extend_scanned_scalar(csubstr currscalar); - csubstr _filter_squot_scalar(const substr s); - csubstr _filter_dquot_scalar(substr s); - csubstr _filter_plain_scalar(substr s, size_t indentation); - csubstr _filter_block_scalar(substr s, BlockStyle_e style, BlockChomp_e chomp, size_t indentation); - template - bool _filter_nl(substr scalar, size_t *C4_RESTRICT pos, size_t *C4_RESTRICT filter_arena_pos, size_t indentation); - template - void _filter_ws(substr scalar, size_t *C4_RESTRICT pos, size_t *C4_RESTRICT filter_arena_pos); - bool _apply_chomp(substr buf, size_t *C4_RESTRICT pos, BlockChomp_e chomp); +public: // exposed for testing. to be cleared. 
+ + csubstr _filter_scalar_squot(substr s); + csubstr _filter_scalar_dquot(substr s); + csubstr _filter_scalar_plain(substr s, size_t indentation); + csubstr _filter_scalar_block_literal(substr s, BlockChomp_e chomp, size_t indentation); + csubstr _filter_scalar_block_folded(substr s, BlockChomp_e chomp, size_t indentation); + +private: void _handle_finished_file(); void _handle_line(); @@ -409,6 +432,34 @@ class RYML_EXPORT Parser static size_t _count_nlines(csubstr src); +public: + + template auto _filter_plain(FilterProcessor &C4_RESTRICT proc, size_t indentation) noexcept -> decltype(proc.result()); + template auto _filter_squoted(FilterProcessor &C4_RESTRICT proc) noexcept -> decltype(proc.result()); + template auto _filter_dquoted(FilterProcessor &C4_RESTRICT proc) -> decltype(proc.result()); + template auto _filter_block_literal(FilterProcessor &C4_RESTRICT proc, size_t indentation, BlockChomp_e chomp) noexcept -> decltype(proc.result()); + template auto _filter_block_folded(FilterProcessor &C4_RESTRICT proc, size_t indentation, BlockChomp_e chomp) noexcept -> decltype(proc.result()); + +public: + + template void _filter_nl_plain(FilterProcessor &C4_RESTRICT proc, size_t indentation) noexcept; + template void _filter_nl_squoted(FilterProcessor &C4_RESTRICT proc) noexcept; + template void _filter_nl_dquoted(FilterProcessor &C4_RESTRICT proc) noexcept; + + template bool _filter_ws_handle_to_first_non_space(FilterProcessor &C4_RESTRICT proc) noexcept; + template void _filter_ws_copy_trailing(FilterProcessor &C4_RESTRICT proc) noexcept; + template void _filter_ws_skip_trailing(FilterProcessor &C4_RESTRICT proc) noexcept; + + template void _filter_dquoted_backslash(FilterProcessor &C4_RESTRICT proc); + + template void _filter_chomp(FilterProcessor &C4_RESTRICT proc, BlockChomp_e chomp, size_t indentation) noexcept; + template size_t _handle_all_whitespace(FilterProcessor &C4_RESTRICT proc, BlockChomp_e chomp) noexcept; + template size_t 
_extend_to_chomp(FilterProcessor &C4_RESTRICT proc, size_t contents_len) noexcept; + template void _filter_block_indentation(FilterProcessor &C4_RESTRICT proc, size_t indentation) noexcept; + template void _filter_block_folded_newlines(FilterProcessor &C4_RESTRICT proc, size_t indentation, size_t len) noexcept; + template void _filter_block_folded_newlines_leading(FilterProcessor &C4_RESTRICT proc, size_t indentation, size_t len) noexcept; + template void _filter_block_folded_indented_block(FilterProcessor &C4_RESTRICT proc, size_t indentation, size_t len, size_t curr_indentation) noexcept; + private: typedef enum : flag_t { @@ -544,10 +595,6 @@ class RYML_EXPORT Parser void addrem_flags(flag_t on, flag_t off, State * s); void rem_flags(flag_t off, State * s); - void _resize_filter_arena(size_t num_characters); - void _grow_filter_arena(size_t num_characters); - substr _finish_filter_arena(substr dst, size_t pos); - void _prepare_locations(); void _resize_locations(size_t sz); bool _locations_dirty() const; @@ -566,6 +613,8 @@ class RYML_EXPORT Parser template void _dbg(csubstr fmt, Args const& C4_RESTRICT ...args) const; #endif template void _err(csubstr fmt, Args const& C4_RESTRICT ...args) const; + template void _errloc(csubstr fmt, LocCRef loc, Args const& C4_RESTRICT ...args) const; + template void _fmt_msg(DumpFn &&dumpfn) const; static csubstr _prfl(substr buf, flag_t v); @@ -595,8 +644,6 @@ class RYML_EXPORT Parser size_t m_val_anchor_indentation; csubstr m_val_anchor; - substr m_filter_arena; - size_t *m_newline_offsets; size_t m_newline_offsets_size; size_t m_newline_offsets_capacity; @@ -696,6 +743,10 @@ RYML_DEPRECATED("use parse_in_arena() instead") inline void parse(csubstr filena /** @} */ +namespace detail { +RYML_EXPORT size_t _find_last_newline_and_larger_indentation(csubstr s, size_t indentation) noexcept; +} + } // namespace yml } // namespace c4 diff --git a/src/c4/yml/tree.cpp b/src/c4/yml/tree.cpp index b3819868e..a8e809c62 100644 --- 
a/src/c4/yml/tree.cpp +++ b/src/c4/yml/tree.cpp @@ -175,7 +175,7 @@ csubstr from_tag(YamlTag_e tag) //----------------------------------------------------------------------------- //----------------------------------------------------------------------------- -const char* NodeType::type_str(NodeType_e ty) +const char* NodeType::type_str(NodeType_e ty) noexcept { switch(ty & _TYMASK) { diff --git a/src/c4/yml/tree.hpp b/src/c4/yml/tree.hpp index f95f59ead..ab21d6b96 100644 --- a/src/c4/yml/tree.hpp +++ b/src/c4/yml/tree.hpp @@ -7,6 +7,9 @@ #ifndef _C4_YML_COMMON_HPP_ #include "c4/yml/common.hpp" #endif +#ifndef C4_YML_NODE_TYPE_HPP_ +#include "c4/yml/node_type.hpp" +#endif #include #include @@ -132,173 +135,6 @@ struct TagDirective #endif - -//----------------------------------------------------------------------------- -//----------------------------------------------------------------------------- -//----------------------------------------------------------------------------- - - -/** the integral type necessary to cover all the bits marking node types */ -using type_bits = uint64_t; - - -/** a bit mask for marking node types */ -typedef enum : type_bits { - // a convenience define, undefined below - #define c4bit(v) (type_bits(1) << v) - NOTYPE = 0, ///< no node type is set - VAL = c4bit(0), ///< a leaf node, has a (possibly empty) value - KEY = c4bit(1), ///< is member of a map, must have non-empty key - MAP = c4bit(2), ///< a map: a parent of keyvals - SEQ = c4bit(3), ///< a seq: a parent of vals - DOC = c4bit(4), ///< a document - STREAM = c4bit(5)|SEQ, ///< a stream: a seq of docs - KEYREF = c4bit(6), ///< a *reference: the key references an &anchor - VALREF = c4bit(7), ///< a *reference: the val references an &anchor - KEYANCH = c4bit(8), ///< the key has an &anchor - VALANCH = c4bit(9), ///< the val has an &anchor - KEYTAG = c4bit(10), ///< the key has an explicit tag/type - VALTAG = c4bit(11), ///< the val has an explicit tag/type - _TYMASK = c4bit(12)-1, 
// all the bits up to here - VALQUO = c4bit(12), ///< the val is quoted by '', "", > or | - KEYQUO = c4bit(13), ///< the key is quoted by '', "", > or | - KEYVAL = KEY|VAL, - KEYSEQ = KEY|SEQ, - KEYMAP = KEY|MAP, - DOCMAP = DOC|MAP, - DOCSEQ = DOC|SEQ, - DOCVAL = DOC|VAL, - _KEYMASK = KEY | KEYQUO | KEYANCH | KEYREF | KEYTAG, - _VALMASK = VAL | VALQUO | VALANCH | VALREF | VALTAG, - // these flags are from a work in progress and should be used with care - _WIP_STYLE_FLOW_SL = c4bit(14), ///< mark container with single-line flow format (seqs as '[val1,val2], maps as '{key: val, key2: val2}') - _WIP_STYLE_FLOW_ML = c4bit(15), ///< mark container with multi-line flow format (seqs as '[val1,\nval2], maps as '{key: val,\nkey2: val2}') - _WIP_STYLE_BLOCK = c4bit(16), ///< mark container with block format (seqs as '- val\n', maps as 'key: val') - _WIP_KEY_LITERAL = c4bit(17), ///< mark key scalar as multiline, block literal | - _WIP_VAL_LITERAL = c4bit(18), ///< mark val scalar as multiline, block literal | - _WIP_KEY_FOLDED = c4bit(19), ///< mark key scalar as multiline, block folded > - _WIP_VAL_FOLDED = c4bit(20), ///< mark val scalar as multiline, block folded > - _WIP_KEY_SQUO = c4bit(21), ///< mark key scalar as single quoted - _WIP_VAL_SQUO = c4bit(22), ///< mark val scalar as single quoted - _WIP_KEY_DQUO = c4bit(23), ///< mark key scalar as double quoted - _WIP_VAL_DQUO = c4bit(24), ///< mark val scalar as double quoted - _WIP_KEY_PLAIN = c4bit(25), ///< mark key scalar as plain scalar (unquoted, even when multiline) - _WIP_VAL_PLAIN = c4bit(26), ///< mark val scalar as plain scalar (unquoted, even when multiline) - _WIP_KEY_STYLE = _WIP_KEY_LITERAL|_WIP_KEY_FOLDED|_WIP_KEY_SQUO|_WIP_KEY_DQUO|_WIP_KEY_PLAIN, - _WIP_VAL_STYLE = _WIP_VAL_LITERAL|_WIP_VAL_FOLDED|_WIP_VAL_SQUO|_WIP_VAL_DQUO|_WIP_VAL_PLAIN, - _WIP_KEY_FT_NL = c4bit(27), ///< features: mark key scalar as having \n in its contents - _WIP_VAL_FT_NL = c4bit(28), ///< features: mark val scalar as having \n 
in its contents - _WIP_KEY_FT_SQ = c4bit(29), ///< features: mark key scalar as having single quotes in its contents - _WIP_VAL_FT_SQ = c4bit(30), ///< features: mark val scalar as having single quotes in its contents - _WIP_KEY_FT_DQ = c4bit(31), ///< features: mark key scalar as having double quotes in its contents - _WIP_VAL_FT_DQ = c4bit(32), ///< features: mark val scalar as having double quotes in its contents - #undef c4bit -} NodeType_e; - - -//----------------------------------------------------------------------------- -//----------------------------------------------------------------------------- -//----------------------------------------------------------------------------- - -/** wraps a NodeType_e element with some syntactic sugar and predicates */ -struct NodeType -{ -public: - - NodeType_e type; - -public: - - C4_ALWAYS_INLINE NodeType() : type(NOTYPE) {} - C4_ALWAYS_INLINE NodeType(NodeType_e t) : type(t) {} - C4_ALWAYS_INLINE NodeType(type_bits t) : type((NodeType_e)t) {} - - C4_ALWAYS_INLINE const char *type_str() const { return type_str(type); } - static const char* type_str(NodeType_e t); - - C4_ALWAYS_INLINE void set(NodeType_e t) { type = t; } - C4_ALWAYS_INLINE void set(type_bits t) { type = (NodeType_e)t; } - - C4_ALWAYS_INLINE void add(NodeType_e t) { type = (NodeType_e)(type|t); } - C4_ALWAYS_INLINE void add(type_bits t) { type = (NodeType_e)(type|t); } - - C4_ALWAYS_INLINE void rem(NodeType_e t) { type = (NodeType_e)(type & ~t); } - C4_ALWAYS_INLINE void rem(type_bits t) { type = (NodeType_e)(type & ~t); } - - C4_ALWAYS_INLINE void clear() { type = NOTYPE; } - -public: - - C4_ALWAYS_INLINE operator NodeType_e & C4_RESTRICT () { return type; } - C4_ALWAYS_INLINE operator NodeType_e const& C4_RESTRICT () const { return type; } - - C4_ALWAYS_INLINE bool operator== (NodeType_e t) const { return type == t; } - C4_ALWAYS_INLINE bool operator!= (NodeType_e t) const { return type != t; } - -public: - - #if defined(__clang__) - # pragma clang 
diagnostic push - # pragma clang diagnostic ignored "-Wnull-dereference" - #elif defined(__GNUC__) - # pragma GCC diagnostic push - # if __GNUC__ >= 6 - # pragma GCC diagnostic ignored "-Wnull-dereference" - # endif - #endif - - C4_ALWAYS_INLINE bool is_notype() const { return type == NOTYPE; } - C4_ALWAYS_INLINE bool is_stream() const { return ((type & STREAM) == STREAM) != 0; } - C4_ALWAYS_INLINE bool is_doc() const { return (type & DOC) != 0; } - C4_ALWAYS_INLINE bool is_container() const { return (type & (MAP|SEQ|STREAM)) != 0; } - C4_ALWAYS_INLINE bool is_map() const { return (type & MAP) != 0; } - C4_ALWAYS_INLINE bool is_seq() const { return (type & SEQ) != 0; } - C4_ALWAYS_INLINE bool has_key() const { return (type & KEY) != 0; } - C4_ALWAYS_INLINE bool has_val() const { return (type & VAL) != 0; } - C4_ALWAYS_INLINE bool is_val() const { return (type & KEYVAL) == VAL; } - C4_ALWAYS_INLINE bool is_keyval() const { return (type & KEYVAL) == KEYVAL; } - C4_ALWAYS_INLINE bool has_key_tag() const { return (type & (KEY|KEYTAG)) == (KEY|KEYTAG); } - C4_ALWAYS_INLINE bool has_val_tag() const { return ((type & VALTAG) && (type & (VAL|MAP|SEQ))); } - C4_ALWAYS_INLINE bool has_key_anchor() const { return (type & (KEY|KEYANCH)) == (KEY|KEYANCH); } - C4_ALWAYS_INLINE bool is_key_anchor() const { return (type & (KEY|KEYANCH)) == (KEY|KEYANCH); } - C4_ALWAYS_INLINE bool has_val_anchor() const { return (type & VALANCH) != 0 && (type & (VAL|SEQ|MAP)) != 0; } - C4_ALWAYS_INLINE bool is_val_anchor() const { return (type & VALANCH) != 0 && (type & (VAL|SEQ|MAP)) != 0; } - C4_ALWAYS_INLINE bool has_anchor() const { return (type & (KEYANCH|VALANCH)) != 0; } - C4_ALWAYS_INLINE bool is_anchor() const { return (type & (KEYANCH|VALANCH)) != 0; } - C4_ALWAYS_INLINE bool is_key_ref() const { return (type & KEYREF) != 0; } - C4_ALWAYS_INLINE bool is_val_ref() const { return (type & VALREF) != 0; } - C4_ALWAYS_INLINE bool is_ref() const { return (type & (KEYREF|VALREF)) != 0; } - 
C4_ALWAYS_INLINE bool is_anchor_or_ref() const { return (type & (KEYANCH|VALANCH|KEYREF|VALREF)) != 0; } - C4_ALWAYS_INLINE bool is_key_quoted() const { return (type & (KEY|KEYQUO)) == (KEY|KEYQUO); } - C4_ALWAYS_INLINE bool is_val_quoted() const { return (type & (VAL|VALQUO)) == (VAL|VALQUO); } - C4_ALWAYS_INLINE bool is_quoted() const { return (type & (KEY|KEYQUO)) == (KEY|KEYQUO) || (type & (VAL|VALQUO)) == (VAL|VALQUO); } - - // these predicates are a work in progress and subject to change. Don't use yet. - C4_ALWAYS_INLINE bool default_block() const { return (type & (_WIP_STYLE_BLOCK|_WIP_STYLE_FLOW_ML|_WIP_STYLE_FLOW_SL)) == 0; } - C4_ALWAYS_INLINE bool marked_block() const { return (type & (_WIP_STYLE_BLOCK)) != 0; } - C4_ALWAYS_INLINE bool marked_flow_sl() const { return (type & (_WIP_STYLE_FLOW_SL)) != 0; } - C4_ALWAYS_INLINE bool marked_flow_ml() const { return (type & (_WIP_STYLE_FLOW_ML)) != 0; } - C4_ALWAYS_INLINE bool marked_flow() const { return (type & (_WIP_STYLE_FLOW_ML|_WIP_STYLE_FLOW_SL)) != 0; } - C4_ALWAYS_INLINE bool key_marked_literal() const { return (type & (_WIP_KEY_LITERAL)) != 0; } - C4_ALWAYS_INLINE bool val_marked_literal() const { return (type & (_WIP_VAL_LITERAL)) != 0; } - C4_ALWAYS_INLINE bool key_marked_folded() const { return (type & (_WIP_KEY_FOLDED)) != 0; } - C4_ALWAYS_INLINE bool val_marked_folded() const { return (type & (_WIP_VAL_FOLDED)) != 0; } - C4_ALWAYS_INLINE bool key_marked_squo() const { return (type & (_WIP_KEY_SQUO)) != 0; } - C4_ALWAYS_INLINE bool val_marked_squo() const { return (type & (_WIP_VAL_SQUO)) != 0; } - C4_ALWAYS_INLINE bool key_marked_dquo() const { return (type & (_WIP_KEY_DQUO)) != 0; } - C4_ALWAYS_INLINE bool val_marked_dquo() const { return (type & (_WIP_VAL_DQUO)) != 0; } - C4_ALWAYS_INLINE bool key_marked_plain() const { return (type & (_WIP_KEY_PLAIN)) != 0; } - C4_ALWAYS_INLINE bool val_marked_plain() const { return (type & (_WIP_VAL_PLAIN)) != 0; } - - #if defined(__clang__) - # pragma clang 
diagnostic pop - #elif defined(__GNUC__) - # pragma GCC diagnostic pop - #endif - -}; - - //----------------------------------------------------------------------------- //----------------------------------------------------------------------------- //----------------------------------------------------------------------------- diff --git a/src/ryml.natvis b/src/ryml.natvis index 5e43b1a5c..fb04101b4 100644 --- a/src/ryml.natvis +++ b/src/ryml.natvis @@ -191,4 +191,60 @@ See also: + + src={src.str,[rpos]} dst={dst.str,[wpos]} + + src + dst + rpos + wpos + + src.str,[rpos] + + + rpos + src.str + + + + + + + + src={src.str,[rpos]} dst={src.str,[wpos]} + + rpos + wpos + wcap + + src.str,[wcap] + + + wcap + src.str + + + + src + + src.str+rpos,[src.len-rpos] + + src.len-rpossrc.str+rpos + + + + src.str,[rpos] + + rpossrc.str + + + + src.str,[wpos] + + wpossrc.str + + + + + diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index a7811a7dd..1d30133c4 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -61,6 +61,7 @@ endfunction() ryml_add_test(callbacks) ryml_add_test(stack) +ryml_add_test(filter) ryml_add_test(parser) ryml_add_test(tree) ryml_add_test(noderef) diff --git a/test/test_block_folded.cpp b/test/test_block_folded.cpp index 9d579c5a1..043134f45 100644 --- a/test/test_block_folded.cpp +++ b/test/test_block_folded.cpp @@ -3,6 +3,544 @@ namespace c4 { namespace yml { +struct blockfolded_case +{ + size_t indentation; + BlockChomp_e chomp; + csubstr input, expected; +}; + +void test_filter_src_dst(blockfolded_case const& blcase) +{ + RYML_TRACE_FMT("\nstr=[{}]~~~{}~~~\nexp=[{}]~~~{}~~~", blcase.input.len, blcase.input, blcase.expected.len, blcase.expected); + std::string subject_; + subject_.resize(2 * blcase.input.size()); + c4::substr dst = to_substr(subject_); + Parser proc = {}; + FilterResult result = proc.filter_scalar_block_folded(blcase.input, dst, blcase.indentation, blcase.chomp); + ASSERT_TRUE(result.valid()); + const csubstr out = 
result.get(); + if(blcase.chomp != CHOMP_CLIP) + { + EXPECT_EQ(out.len, blcase.expected.len); + } + ASSERT_TRUE(out.is_sub(dst)); + RYML_TRACE_FMT("\nout=[{}]~~~{}~~~", out.len, out); + EXPECT_EQ(out, blcase.expected); +} + +void test_filter_inplace(blockfolded_case const& blcase) +{ + RYML_TRACE_FMT("\nstr=[{}]~~~{}~~~\nexp=[{}]~~~{}~~~", blcase.input.len, blcase.input, blcase.expected.len, blcase.expected); + if(blcase.input.len >= blcase.expected.len) + { + std::string subject_(blcase.input.str, blcase.input.len); + std::string subject_2 = subject_; + c4::substr dst = to_substr(subject_); + Parser parser1 = {}; + FilterResult result = parser1.filter_scalar_block_folded_in_place(dst, subject_.size(), blcase.indentation, blcase.chomp); + ASSERT_TRUE(result.valid()); + Parser parser2 = {}; + Tree tree = parser2.parse_in_arena("file", "# set the tree in the parser"); + csubstr sresult = parser2._filter_scalar_block_folded(to_substr(subject_2), blcase.chomp, blcase.indentation); + EXPECT_GE(result.required_len(), blcase.expected.len); + EXPECT_EQ(sresult.len, result.str.len); + const csubstr out = result.get(); + if(blcase.chomp != CHOMP_CLIP) + { + EXPECT_EQ(out.len, blcase.expected.len); + } + ASSERT_TRUE(out.str); + EXPECT_TRUE(out.is_sub(dst)); + RYML_TRACE_FMT("\nout=[{}]~~~{}~~~", out.len, out); + EXPECT_EQ(out, blcase.expected); + } + else + { + { + SCOPED_TRACE("spare size"); + std::string subject_(blcase.input.str, blcase.input.len); + std::string subject_2 = subject_; + subject_.resize(blcase.expected.len + 30); + c4::substr dst = to_substr(subject_).first(blcase.input.len); + c4::substr rem = to_substr(subject_).sub(blcase.expected.len); + rem.fill('^'); + Parser parser1 = {}; + FilterResult result = parser1.filter_scalar_block_folded_in_place(dst, subject_.size(), blcase.indentation, blcase.chomp); + ASSERT_TRUE(result.valid()); + Parser parser2 = {}; + Tree tree = parser2.parse_in_arena("file", "# set the tree in the parser"); + csubstr sresult = 
parser2._filter_scalar_block_folded(to_substr(subject_2), blcase.chomp, blcase.indentation); + EXPECT_GE(result.required_len(), blcase.expected.len); + EXPECT_EQ(sresult.len, result.str.len); + const csubstr out = result.get(); + if(blcase.chomp != CHOMP_CLIP) + { + EXPECT_EQ(out.len, blcase.expected.len); + } + ASSERT_TRUE(out.str); + EXPECT_TRUE(out.is_super(dst)); + RYML_TRACE_FMT("\nout=[{}]~~~{}~~~", out.len, out); + EXPECT_EQ(out, blcase.expected); + EXPECT_EQ(rem.first_not_of('^'), npos); + } + { + SCOPED_TRACE("trimmed size"); + std::string subject_(blcase.input.str, blcase.input.len); + std::string subject_2 = subject_; + subject_.resize(blcase.expected.len); + c4::substr dst = to_substr(subject_).first(blcase.input.len); + Parser proc = {}; + FilterResult result = proc.filter_scalar_block_folded_in_place(dst, subject_.size(), blcase.indentation, blcase.chomp); + ASSERT_TRUE(result.valid()); + Parser parser2 = {}; + Tree tree = parser2.parse_in_arena("file", "# set the tree in the parser"); + csubstr sresult = parser2._filter_scalar_block_folded(to_substr(subject_2), blcase.chomp, blcase.indentation); + EXPECT_GE(result.required_len(), blcase.expected.len); + EXPECT_EQ(sresult.len, result.str.len); + const csubstr out = result.get(); + if(blcase.chomp != CHOMP_CLIP) + { + EXPECT_EQ(out.len, blcase.expected.len); + } + ASSERT_TRUE(out.str); + EXPECT_TRUE(out.is_super(dst)); + RYML_TRACE_FMT("\nout=[{}]~~~{}~~~", out.len, out); + EXPECT_EQ(out, blcase.expected); + } + { + SCOPED_TRACE("insufficient size"); + std::string subject_(blcase.input.str, blcase.input.len); + std::string subject_2 = subject_; + c4::substr dst = to_substr(subject_); + Parser proc = {}; + FilterResult result = proc.filter_scalar_block_folded_in_place(dst, subject_.size(), blcase.indentation, blcase.chomp); + Parser parser2 = {}; + Tree tree = parser2.parse_in_arena("file", "# set the tree in the parser"); + csubstr sresult = parser2._filter_scalar_block_folded(to_substr(subject_2), 
blcase.chomp, blcase.indentation); + EXPECT_GE(result.required_len(), blcase.expected.len); + EXPECT_EQ(sresult.len, result.str.len); + if(blcase.chomp != CHOMP_CLIP) + { + EXPECT_EQ(result.required_len(), blcase.expected.len); + } + ASSERT_FALSE(result.valid()); + } + } +} + +struct BlockFoldedFilterTest : public ::testing::TestWithParam +{ +}; + +std::string add_carriage_returns(csubstr input) +{ + std::string result; + result.reserve(input.len + input.count('\n')); + for(const char c : input) + { + if(c == '\n') + result += '\r'; + result += c; + } + return result; +} + +TEST_P(BlockFoldedFilterTest, filter_src_dst) +{ + test_filter_src_dst(GetParam()); +} +TEST_P(BlockFoldedFilterTest, filter_src_dst_carriage_return) +{ + ParamType p = GetParam(); + std::string subject = add_carriage_returns(p.input); + p.input = to_csubstr(subject); + test_filter_src_dst(p); +} +TEST_P(BlockFoldedFilterTest, filter_inplace) +{ + test_filter_inplace(GetParam()); +} +TEST_P(BlockFoldedFilterTest, filter_inplace_carriage_return) +{ + ParamType p = GetParam(); + std::string subject = add_carriage_returns(p.input); + p.input = to_csubstr(subject); + test_filter_inplace(p); +} + + +blockfolded_case test_cases_filter[] = { +#define bfc(indentation, chomp, input, output) blockfolded_case{indentation, chomp, csubstr(input), csubstr(output)} + // 0 + bfc(2, CHOMP_STRIP, + "Several lines of text,\n with some \"quotes\" of various 'types',\n and also a blank line:\n\n plus another line at the end.\n\n", + "Several lines of text, with some \"quotes\" of various 'types', and also a blank line:\nplus another line at the end."), + bfc(2, CHOMP_CLIP, + "Several lines of text,\n with some \"quotes\" of various 'types',\n and also a blank line:\n\n plus another line at the end.\n\n", + "Several lines of text, with some \"quotes\" of various 'types', and also a blank line:\nplus another line at the end.\n"), + bfc(2, CHOMP_KEEP, + "Several lines of text,\n with some \"quotes\" of various 
'types',\n and also a blank line:\n\n plus another line at the end.\n\n", + "Several lines of text, with some \"quotes\" of various 'types', and also a blank line:\nplus another line at the end.\n\n"), + // 3 + bfc(2, CHOMP_STRIP, + "Several lines of text,\n with some \"quotes\" of various 'types',\n and also a blank line:\n\n plus another line at the end.\n\n", + "Several lines of text, with some \"quotes\" of various 'types', and also a blank line:\nplus another line at the end."), + bfc(2, CHOMP_CLIP, + "Several lines of text,\n with some \"quotes\" of various 'types',\n and also a blank line:\n\n plus another line at the end.\n\n", + "Several lines of text, with some \"quotes\" of various 'types', and also a blank line:\nplus another line at the end.\n"), + bfc(2, CHOMP_KEEP, + "Several lines of text,\n with some \"quotes\" of various 'types',\n and also a blank line:\n\n plus another line at the end.\n\n", + "Several lines of text, with some \"quotes\" of various 'types', and also a blank line:\nplus another line at the end.\n\n"), + // 6 + bfc(1, CHOMP_STRIP, "", ""), + bfc(1, CHOMP_CLIP, "", ""), + bfc(1, CHOMP_KEEP, "", ""), + // 9 + bfc(1, CHOMP_STRIP, "\n", ""), + bfc(1, CHOMP_CLIP, "\n", ""), + bfc(1, CHOMP_KEEP, "\n", "\n"), + // 12 + bfc(1, CHOMP_STRIP, "\n\n", ""), + bfc(1, CHOMP_CLIP, "\n\n", ""), + bfc(1, CHOMP_KEEP, "\n\n", "\n\n"), + // 15 + bfc(1, CHOMP_STRIP, "\n\n", ""), + bfc(1, CHOMP_CLIP, "\n\n", ""), + bfc(1, CHOMP_KEEP, "\n\n", "\n\n"), + // 18 + bfc(1, CHOMP_STRIP, "\n\n\n", ""), + bfc(1, CHOMP_CLIP, "\n\n\n", ""), + bfc(1, CHOMP_KEEP, "\n\n\n", "\n\n\n"), + // 21 + bfc(1, CHOMP_STRIP, "\n\n\n\n", ""), + bfc(1, CHOMP_CLIP, "\n\n\n\n", ""), + bfc(1, CHOMP_KEEP, "\n\n\n\n", "\n\n\n\n"), + // 24 + bfc(1, CHOMP_STRIP, "a", "a"), + bfc(1, CHOMP_CLIP, "a", "a\n"), + bfc(1, CHOMP_KEEP, "a", "a"), + // 27 + bfc(1, CHOMP_STRIP, "a\n", "a"), + bfc(1, CHOMP_CLIP, "a\n", "a\n"), + bfc(1, CHOMP_KEEP, "a\n", "a\n"), + // 30 + bfc(1, CHOMP_STRIP, 
"a\n\n", "a"), + bfc(1, CHOMP_CLIP, "a\n\n", "a\n"), + bfc(1, CHOMP_KEEP, "a\n\n", "a\n\n"), + // 33 + bfc(0, CHOMP_STRIP, "a\n\n", "a"), + bfc(0, CHOMP_CLIP, "a\n\n", "a\n"), + bfc(0, CHOMP_KEEP, "a\n\n", "a\n\n"), + // 36 + bfc(1, CHOMP_STRIP, "a\n\n\n", "a"), + bfc(1, CHOMP_CLIP, "a\n\n\n", "a\n"), + bfc(1, CHOMP_KEEP, "a\n\n\n", "a\n\n\n"), + // 39 + bfc(1, CHOMP_STRIP, "a\n\n\n\n", "a"), + bfc(1, CHOMP_CLIP, "a\n\n\n\n", "a\n"), + bfc(1, CHOMP_KEEP, "a\n\n\n\n", "a\n\n\n\n"), + // 42 + bfc(1, CHOMP_STRIP, " ab\n \n \n", "ab"), + bfc(1, CHOMP_CLIP, " ab\n \n \n", "ab\n"), + bfc(1, CHOMP_KEEP, " ab\n \n \n", "ab\n\n\n"), + // 45 + bfc(1, CHOMP_STRIP, " ab\n \n \n", "ab\n\n "), + bfc(1, CHOMP_CLIP, " ab\n \n \n", "ab\n\n \n"), + bfc(1, CHOMP_KEEP, " ab\n \n \n", "ab\n\n \n"), + // 48 + bfc(0, CHOMP_STRIP, "ab\n\n \n", "ab\n\n "), + bfc(0, CHOMP_CLIP, "ab\n\n \n", "ab\n\n \n"), + bfc(0, CHOMP_KEEP, "ab\n\n \n", "ab\n\n \n"), + // 51 + bfc(1, CHOMP_STRIP, "hello\nthere\n", "hello there"), + bfc(1, CHOMP_CLIP, "hello\nthere\n", "hello there\n"), + bfc(1, CHOMP_KEEP, "hello\nthere\n", "hello there\n"), + // 54 + bfc(0, CHOMP_STRIP, "hello\nthere\n", "hello there"), + bfc(0, CHOMP_CLIP, "hello\nthere\n", "hello there\n"), + bfc(0, CHOMP_KEEP, "hello\nthere\n", "hello there\n"), + // 57 + bfc(3, CHOMP_CLIP, + " There once was a short man from Ealing\n" + " Who got on a bus to Darjeeling\n" + " It said on the door\n" + " \"Please don't spit on the floor\"\n" + " So he carefully spat on the ceiling.\n", + "There once was a short man from Ealing " + "Who got on a bus to Darjeeling\n" + " It said on the door\n" + " \"Please don't spit on the floor\"\n" + "So he carefully spat on the ceiling.\n"), + bfc(3, CHOMP_CLIP, + " There once was a short man from Ealing\n" + " Who got on a bus to Darjeeling\n" + " It said on the door\n" + " extra 0\n" + " extra 2\n" + " extra 2\n" + " extra 4\n" + " extra 4\n" + " extra 4\n" + " extra 2\n" + " extra 0\n" + " \"Please don't spit on 
the floor\"\n" + " So he carefully spat on the ceiling.\n", + "There once was a short man from Ealing " + "Who got on a bus to Darjeeling\n" + " It said on the door\n" + " extra 0\n" + " extra 2\n" + " extra 2\n" + " extra 4\n" + " extra 4\n" + " extra 4\n" + " extra 2\n" + " extra 0\n" + " \"Please don't spit on the floor\"\n" + "So he carefully spat on the ceiling.\n"), + bfc(1, CHOMP_STRIP, " \n \n \n", ""), + // 60 + bfc(1, CHOMP_STRIP, " \n \n \n", ""), + bfc(1, CHOMP_CLIP, " \n \n \n", ""), + bfc(1, CHOMP_KEEP, " \n \n \n", "\n\n\n"), + // 63 + bfc(1, CHOMP_STRIP, " \n \n \n ", ""), + bfc(1, CHOMP_CLIP, " \n \n \n ", ""), + bfc(1, CHOMP_KEEP, " \n \n \n ", "\n\n\n"), + // 66 + bfc(1, CHOMP_STRIP, " \n \n \n \n \n \n \n \n \n", ""), + bfc(1, CHOMP_CLIP, " \n \n \n \n \n \n \n \n \n", ""), + bfc(1, CHOMP_KEEP, " \n \n \n \n \n \n \n \n \n", "\n\n\n\n\n\n\n\n\n"), + // 69 + bfc(1, CHOMP_STRIP, " \n \n \n\n \n \n\n \n \n", ""), + bfc(1, CHOMP_CLIP, " \n \n \n\n \n \n\n \n \n", ""), + bfc(1, CHOMP_KEEP, " \n \n \n\n \n \n\n \n \n", "\n\n\n\n\n\n\n\n\n"), + // 72 + bfc(7, CHOMP_STRIP, + " asd\n" + " \n" + " \n" + " \n" + " \n" + " \n" + " ", + "asd"), + bfc(7, CHOMP_CLIP, + " asd\n" + " \n" + " \n" + " \n" + " \n" + " \n" + " ", + "asd\n"), + bfc(7, CHOMP_KEEP, + " asd\n" + " \n" + " \n" + " \n" + " \n" + " \n" + " ", + "asd\n\n\n\n\n\n"), + // 75 + bfc(5, CHOMP_STRIP, " asd\n \t ", "asd\n\t "), + bfc(5, CHOMP_CLIP, " asd\n \t ", "asd\n\t \n"), + bfc(5, CHOMP_KEEP, " asd\n \t ", "asd\n\t "), + // 78 + bfc(5, CHOMP_STRIP, " asd\n \t \n", "asd\n\t "), + bfc(5, CHOMP_CLIP, " asd\n \t \n", "asd\n\t \n"), + bfc(5, CHOMP_KEEP, " asd\n \t \n", "asd\n\t \n"), + // 81 + bfc(5, CHOMP_STRIP, " asd\n \t ", "asd\n \t "), + bfc(5, CHOMP_CLIP, " asd\n \t ", "asd\n \t \n"), + bfc(5, CHOMP_KEEP, " asd\n \t ", "asd\n \t "), + // 84 + bfc(5, CHOMP_STRIP, " asd\n \t \n", "asd\n \t "), + bfc(5, CHOMP_CLIP, " asd\n \t \n", "asd\n \t \n"), + bfc(5, CHOMP_KEEP, " asd\n \t \n", "asd\n \t 
\n"), + // 87 + bfc(2, CHOMP_CLIP, "\n foo\n bar\n", "\nfoo bar\n"), + bfc(2, CHOMP_CLIP, "\n\n foo\n bar\n", "\n\nfoo bar\n"), + bfc(2, CHOMP_CLIP, "\n\n\n foo\n bar\n", "\n\n\nfoo bar\n"), + // 90 + bfc(1, CHOMP_CLIP, + " folded\n" + " line\n" + "\n" + " next\n" + " line\n" + " * bullet\n" + "\n" + " * list\n" + " * lines\n" + "\n" + " last\n" + " line\n", + "folded line\n" + "next line\n" + " * bullet\n" + "\n" + " * list\n" + " * lines\n" + "\n" + "last line\n" + ""), + bfc(1, CHOMP_CLIP, + " \n" + " \n" + " literal\n" + " \n" + " \n" + " text\n" + "", + "\n" + " \n" + " literal\n" + " \n" + " \n" + " text\n"), + bfc(2, CHOMP_CLIP, + " \n" + " \n" + " literal\n" + " \n" + " \n" + " text\n" + "", + "\n" + "\n" + "literal\n" + " \n" + "\n" + "text\n"), + // 93 + bfc(5, CHOMP_CLIP, " asd\n ", "asd\n \n"), + bfc(5, CHOMP_CLIP, " asd\n ", "asd\n \n"), + bfc(5, CHOMP_CLIP, " asd\n \t ", "asd\n\t \n"), + // 96 + bfc(5, CHOMP_CLIP, " asd\n \t \n", "asd\n\t \n"), + bfc(5, CHOMP_CLIP, " asd\n \t", "asd\n \t\n"), + bfc(5, CHOMP_CLIP, " asd\n \t\n", "asd\n \t\n"), + // 99 + bfc(1, CHOMP_CLIP, + " Sammy Sosa completed another\n" + " fine season with great stats.\n" + "\n" + " 63 Home Runs\n" + " 0.288 Batting Average\n" + "\n" + " What a year!\n" + , + "Sammy Sosa completed another fine season with great stats.\n" + "63 Home Runs 0.288 Batting Average\n" + "What a year!\n" + ), + bfc(1, CHOMP_CLIP, + " Sammy Sosa completed another\n" + " fine season with great stats.\n" + " 63 Home Runs\n" + " 0.288 Batting Average\n" + " What a year!\n" + , + "Sammy Sosa completed another fine season with great stats.\n" + " 63 Home Runs\n" + " 0.288 Batting Average\n" + "What a year!\n" + ), + bfc(1, CHOMP_CLIP, + " Sammy Sosa completed another\n" + " fine season with great stats.\n" + "\n" + " 63 Home Runs\n" + " 0.288 Batting Average\n" + "\n" + " What a year!\n" + , + "Sammy Sosa completed another fine season with great stats.\n" + "\n" + " 63 Home Runs\n" + " 0.288 Batting Average\n" 
+ "\n" + "What a year!\n" + ), + // 102 + bfc(1, CHOMP_CLIP, + " Sammy Sosa completed another\n" + " fine season with great stats.\n" + "\n" + "\n" + " 63 Home Runs\n" + " 0.288 Batting Average\n" + "\n" + "\n" + " What a year!\n" + , + "Sammy Sosa completed another fine season with great stats.\n" + "\n" + "\n" + " 63 Home Runs\n" + " 0.288 Batting Average\n" + "\n" + "\n" + "What a year!\n" + ), + bfc(1, CHOMP_CLIP, + " Sammy Sosa completed another\n" + " fine season with great stats.\n" + "\n" + "\n" + "\n" + " 63 Home Runs\n" + " 0.288 Batting Average\n" + "\n" + "\n" + "\n" + " What a year!\n" + , + "Sammy Sosa completed another fine season with great stats.\n" + "\n" + "\n" + "\n" + " 63 Home Runs\n" + " 0.288 Batting Average\n" + "\n" + "\n" + "\n" + "What a year!\n" + ), + bfc(2, CHOMP_CLIP, + " more indented\n" + " regular\n" + , + " more indented\n" + "regular\n" + ), + // 105 + bfc(2, CHOMP_CLIP, + "\n" + "\n" + " more indented\n" + " regular\n" + , + "\n" + "\n" + " more indented\n" + "regular\n" + ), + // the helper macro of this table is named bfc: undefine it here so it does not stay defined for the tests below + #undef bfc +}; + +INSTANTIATE_TEST_SUITE_P(block_folded_filter, + BlockFoldedFilterTest, + testing::ValuesIn(test_cases_filter)); + + +//----------------------------------------------------------------------------- +//----------------------------------------------------------------------------- +//----------------------------------------------------------------------------- + TEST(block_folded, basic) { { @@ -1488,7 +2026,7 @@ R"(>+ ADD_CASE_TO_GROUP("block folded, keep, empty docval trailing 1.1", R"(>+ )", - N(DOCVAL|VALQUO, "") + N(DOCVAL|VALQUO, "\n") ); ADD_CASE_TO_GROUP("block folded, keep, empty docval trailing 1.2", @@ -1558,7 +2096,7 @@ R"(- >+ - >+ )", -N(L{N(QV, "\n"), N(QV, ""),})); +N(L{N(QV, "\n"), N(QV, "\n"),})); ADD_CASE_TO_GROUP("block folded, empty block vals in seq 1", R"(- >+ diff --git a/test/test_block_literal.cpp b/test/test_block_literal.cpp index fe8e352be..89e66aeb7 100644 --- a/test/test_block_literal.cpp +++ 
b/test/test_block_literal.cpp @@ -3,6 +3,409 @@ namespace c4 { namespace yml { + +struct blocklit_case +{ + size_t indentation; + BlockChomp_e chomp; + csubstr input, expected; +}; + +void test_filter_src_dst(blocklit_case const& blcase) +{ + RYML_TRACE_FMT("\nstr=[{}]~~~{}~~~\nexp=[{}]~~~{}~~~", blcase.input.len, blcase.input, blcase.expected.len, blcase.expected); + std::string subject_; + subject_.resize(2 * blcase.input.size()); + std::string subject_2 = subject_; + c4::substr dst = to_substr(subject_); + Parser proc = {}; + FilterResult result = proc.filter_scalar_block_literal(blcase.input, dst, blcase.indentation, blcase.chomp); + ASSERT_TRUE(result.valid()); + const csubstr out = result.get(); + if(blcase.chomp != CHOMP_CLIP) + { + EXPECT_EQ(out.len, blcase.expected.len); + } + ASSERT_TRUE(out.is_sub(dst)); + RYML_TRACE_FMT("\nout=[{}]~~~{}~~~", out.len, out); + EXPECT_EQ(out, blcase.expected); +} + +void test_filter_inplace(blocklit_case const& blcase) +{ + RYML_TRACE_FMT("\nstr=[{}]~~~{}~~~\nexp=[{}]~~~{}~~~", blcase.input.len, blcase.input, blcase.expected.len, blcase.expected); + if(blcase.input.len >= blcase.expected.len) + { + std::string subject_(blcase.input.str, blcase.input.len); + std::string subject_2 = subject_; + c4::substr dst = to_substr(subject_); + Parser proc = {}; + FilterResult result = proc.filter_scalar_block_literal_in_place(dst, subject_.size(), blcase.indentation, blcase.chomp); + ASSERT_TRUE(result.valid()); + Parser parser2 = {}; + Tree tree = parser2.parse_in_arena("file", "# set the tree in the parser"); + csubstr sresult = parser2._filter_scalar_block_literal(to_substr(subject_2), blcase.chomp, blcase.indentation); + EXPECT_GE(result.required_len(), blcase.expected.len); + EXPECT_EQ(sresult.len, result.str.len); + const csubstr out = result.get(); + if(blcase.chomp != CHOMP_CLIP) + { + EXPECT_EQ(out.len, blcase.expected.len); + } + ASSERT_TRUE(out.str); + EXPECT_TRUE(out.is_sub(dst)); + RYML_TRACE_FMT("\nout=[{}]~~~{}~~~", 
out.len, out); + EXPECT_EQ(out, blcase.expected); + } + else + { + { + SCOPED_TRACE("spare size"); + std::string subject_(blcase.input.str, blcase.input.len); + std::string subject_2 = subject_; + subject_.resize(blcase.expected.len + 30); + c4::substr dst = to_substr(subject_).first(blcase.input.len); + c4::substr rem = to_substr(subject_).sub(blcase.expected.len); + rem.fill('^'); + Parser proc = {}; + FilterResult result = proc.filter_scalar_block_literal_in_place(dst, subject_.size(), blcase.indentation, blcase.chomp); + ASSERT_TRUE(result.valid()); + Parser parser2 = {}; + Tree tree = parser2.parse_in_arena("file", "# set the tree in the parser"); + csubstr sresult = parser2._filter_scalar_block_literal(to_substr(subject_2), blcase.chomp, blcase.indentation); + EXPECT_GE(result.required_len(), blcase.expected.len); + EXPECT_EQ(sresult.len, result.str.len); + const csubstr out = result.get(); + if(blcase.chomp != CHOMP_CLIP) + { + EXPECT_EQ(out.len, blcase.expected.len); + } + ASSERT_TRUE(out.str); + EXPECT_TRUE(out.is_super(dst)); + RYML_TRACE_FMT("\nout=[{}]~~~{}~~~", out.len, out); + EXPECT_EQ(out, blcase.expected); + EXPECT_EQ(rem.first_not_of('^'), npos); + } + { + SCOPED_TRACE("trimmed size"); + std::string subject_(blcase.input.str, blcase.input.len); + std::string subject_2 = subject_; + subject_.resize(blcase.expected.len); + c4::substr dst = to_substr(subject_).first(blcase.input.len); + Parser proc = {}; + FilterResult result = proc.filter_scalar_block_literal_in_place(dst, subject_.size(), blcase.indentation, blcase.chomp); + ASSERT_TRUE(result.valid()); + Parser parser2 = {}; + Tree tree = parser2.parse_in_arena("file", "# set the tree in the parser"); + csubstr sresult = parser2._filter_scalar_block_literal(to_substr(subject_2), blcase.chomp, blcase.indentation); + EXPECT_GE(result.required_len(), blcase.expected.len); + EXPECT_EQ(sresult.len, result.str.len); + const csubstr out = result.get(); + if(blcase.chomp != CHOMP_CLIP) + { + 
EXPECT_EQ(out.len, blcase.expected.len); + } + ASSERT_TRUE(out.str); + EXPECT_TRUE(out.is_super(dst)); + RYML_TRACE_FMT("\nout=[{}]~~~{}~~~", out.len, out); + EXPECT_EQ(out, blcase.expected); + } + { + SCOPED_TRACE("insufficient size"); + std::string subject_(blcase.input.str, blcase.input.len); + std::string subject_2 = subject_; + c4::substr dst = to_substr(subject_); + Parser proc = {}; + FilterResult result = proc.filter_scalar_block_literal_in_place(dst, subject_.size(), blcase.indentation, blcase.chomp); + ASSERT_FALSE(result.valid()); + Parser parser2 = {}; + Tree tree = parser2.parse_in_arena("file", "# set the tree in the parser"); + csubstr sresult = parser2._filter_scalar_block_literal(to_substr(subject_2), blcase.chomp, blcase.indentation); + EXPECT_GE(result.required_len(), blcase.expected.len); + EXPECT_EQ(sresult.len, result.str.len); + if(blcase.chomp != CHOMP_CLIP) + { + EXPECT_EQ(result.required_len(), blcase.expected.len); + } + } + } +} + +struct BlockLitFilterTest : public ::testing::TestWithParam +{ +}; + +std::string add_carriage_returns(csubstr input) +{ + std::string result; + result.reserve(input.len + input.count('\n')); + for(const char c : input) + { + if(c == '\n') + result += '\r'; + result += c; + } + return result; +} + +TEST_P(BlockLitFilterTest, filter_src_dst) +{ + test_filter_src_dst(GetParam()); +} +TEST_P(BlockLitFilterTest, filter_src_dst_carriage_return) +{ + ParamType p = GetParam(); + std::string subject = add_carriage_returns(p.input); + p.input = to_csubstr(subject); + test_filter_src_dst(p); +} +TEST_P(BlockLitFilterTest, filter_inplace) +{ + test_filter_inplace(GetParam()); +} +TEST_P(BlockLitFilterTest, filter_inplace_carriage_return) +{ + ParamType p = GetParam(); + std::string subject = add_carriage_returns(p.input); + p.input = to_csubstr(subject); + test_filter_inplace(p); +} + + +blocklit_case test_cases_filter[] = { +#define blc(indentation, chomp, input, output) blocklit_case{indentation, chomp, csubstr(input), 
csubstr(output)} + // 0 + blc(2, CHOMP_STRIP, + "Several lines of text,\n with some \"quotes\" of various 'types',\n and also a blank line:\n\n plus another line at the end.\n\n", + "Several lines of text,\nwith some \"quotes\" of various 'types',\nand also a blank line:\n\nplus another line at the end."), + blc(2, CHOMP_CLIP, + "Several lines of text,\n with some \"quotes\" of various 'types',\n and also a blank line:\n\n plus another line at the end.\n\n", + "Several lines of text,\nwith some \"quotes\" of various 'types',\nand also a blank line:\n\nplus another line at the end.\n"), + blc(2, CHOMP_KEEP, + "Several lines of text,\n with some \"quotes\" of various 'types',\n and also a blank line:\n\n plus another line at the end.\n\n", + "Several lines of text,\nwith some \"quotes\" of various 'types',\nand also a blank line:\n\nplus another line at the end.\n\n"), + // 3 + blc(2, CHOMP_STRIP, + " Several lines of text,\n with some \"quotes\" of various 'types',\n and also a blank line:\n\n plus another line at the end.", + "Several lines of text,\nwith some \"quotes\" of various 'types',\nand also a blank line:\n\nplus another line at the end."), + blc(2, CHOMP_CLIP, + " Several lines of text,\n with some \"quotes\" of various 'types',\n and also a blank line:\n\n plus another line at the end.\n", + "Several lines of text,\nwith some \"quotes\" of various 'types',\nand also a blank line:\n\nplus another line at the end.\n"), + blc(2, CHOMP_KEEP, + " Several lines of text,\n with some \"quotes\" of various 'types',\n and also a blank line:\n\n plus another line at the end.\n \n", + "Several lines of text,\nwith some \"quotes\" of various 'types',\nand also a blank line:\n\nplus another line at the end.\n\n"), + // 6 + blc(1, CHOMP_STRIP, "", ""), + blc(1, CHOMP_CLIP, "", ""), + blc(1, CHOMP_KEEP, "", ""), + // 9 + blc(1, CHOMP_STRIP, "\n", ""), + blc(1, CHOMP_CLIP, "\n", ""), + blc(1, CHOMP_KEEP, "\n", "\n"), + // 12 + blc(1, CHOMP_STRIP, "\n\n", ""), + blc(1, 
CHOMP_CLIP, "\n\n", ""), + blc(1, CHOMP_KEEP, "\n\n", "\n\n"), + // 15 + blc(1, CHOMP_STRIP, "\n\n", ""), + blc(1, CHOMP_CLIP, "\n\n", ""), + blc(1, CHOMP_KEEP, "\n\n", "\n\n"), + // 18 + blc(1, CHOMP_STRIP, "\n\n\n", ""), + blc(1, CHOMP_CLIP, "\n\n\n", ""), + blc(1, CHOMP_KEEP, "\n\n\n", "\n\n\n"), + // 21 + blc(1, CHOMP_STRIP, "\n\n\n\n", ""), + blc(1, CHOMP_CLIP, "\n\n\n\n", ""), + blc(1, CHOMP_KEEP, "\n\n\n\n", "\n\n\n\n"), + // 24 + blc(1, CHOMP_STRIP, "a", "a"), + blc(1, CHOMP_CLIP, "a", "a\n"), + blc(1, CHOMP_KEEP, "a", "a"), + // 27 + blc(1, CHOMP_STRIP, "a\n", "a"), + blc(1, CHOMP_CLIP, "a\n", "a\n"), + blc(1, CHOMP_KEEP, "a\n", "a\n"), + // 30 + blc(1, CHOMP_STRIP, "a\n\n", "a"), + blc(1, CHOMP_CLIP, "a\n\n", "a\n"), + blc(1, CHOMP_KEEP, "a\n\n", "a\n\n"), + // 33 + blc(0, CHOMP_STRIP, "a\n\n", "a"), + blc(0, CHOMP_CLIP, "a\n\n", "a\n"), + blc(0, CHOMP_KEEP, "a\n\n", "a\n\n"), + // 36 + blc(1, CHOMP_STRIP, "a\n\n\n", "a"), + blc(1, CHOMP_CLIP, "a\n\n\n", "a\n"), + blc(1, CHOMP_KEEP, "a\n\n\n", "a\n\n\n"), + // 39 + blc(1, CHOMP_STRIP, "a\n\n\n\n", "a"), + blc(1, CHOMP_CLIP, "a\n\n\n\n", "a\n"), + blc(1, CHOMP_KEEP, "a\n\n\n\n", "a\n\n\n\n"), + // 42 + blc(1, CHOMP_STRIP, " ab\n \n \n", "ab"), + blc(1, CHOMP_CLIP, " ab\n \n \n", "ab\n"), + blc(1, CHOMP_KEEP, " ab\n \n \n", "ab\n\n\n"), + // 45 + blc(1, CHOMP_STRIP, " ab\n \n \n", "ab\n\n "), + blc(1, CHOMP_CLIP, " ab\n \n \n", "ab\n\n \n"), + blc(1, CHOMP_KEEP, " ab\n \n \n", "ab\n\n \n"), + // 48 + blc(0, CHOMP_STRIP, "ab\n\n \n", "ab\n\n "), + blc(0, CHOMP_CLIP, "ab\n\n \n", "ab\n\n \n"), + blc(0, CHOMP_KEEP, "ab\n\n \n", "ab\n\n \n"), + // 51 + blc(1, CHOMP_STRIP, "hello\nthere\n", "hello\nthere"), + blc(1, CHOMP_CLIP, "hello\nthere\n", "hello\nthere\n"), + blc(1, CHOMP_KEEP, "hello\nthere\n", "hello\nthere\n"), + // 54 + blc(0, CHOMP_STRIP, "hello\nthere\n", "hello\nthere"), + blc(0, CHOMP_CLIP, "hello\nthere\n", "hello\nthere\n"), + blc(0, CHOMP_KEEP, "hello\nthere\n", "hello\nthere\n"), + // 57 + 
blc(3, CHOMP_CLIP, + " There once was a short man from Ealing\n" + " Who got on a bus to Darjeeling\n" + " It said on the door\n" + " \"Please don't spit on the floor\"\n" + " So he carefully spat on the ceiling.\n", + "There once was a short man from Ealing\n" + "Who got on a bus to Darjeeling\n" + " It said on the door\n" + " \"Please don't spit on the floor\"\n" + "So he carefully spat on the ceiling.\n"), + blc(8, CHOMP_CLIP, + "
\n" + "

\"Three is always greater than two,\n" + " even for large values of two\"

\n" + "

--Author Unknown

\n" + "
", + "
\n" + "

\"Three is always greater than two,\n" + " even for large values of two\"

\n" + "

--Author Unknown

\n" + "
\n"), + blc(2, CHOMP_CLIP, + " Several lines of text,\n" + " with some \"quotes\" of various 'types',\n" + " and also a blank line:\n" + " \n" + " plus another line at the end.\n", + "Several lines of text,\n" + "with some \"quotes\" of various 'types',\n" + "and also a blank line:\n" + "\n" + "plus another line at the end.\n"), + // 60 + blc(2, CHOMP_CLIP, + " Several lines of text,\n" + " with some \"quotes\" of various 'types',\n" + " and also a blank line:\n" + " \n" + " plus another line at the end.", + "Several lines of text,\n" + "with some \"quotes\" of various 'types',\n" + "and also a blank line:\n" + " \n" + "plus another line at the end.\n"), + blc(2, CHOMP_CLIP, + " Several lines of text,\n" + " with some \"quotes\" of various 'types',\n" + " and also a blank line:\n" + " \n" + " plus another line at the end.", + "Several lines of text,\n" + "with some \"quotes\" of various 'types',\n" + "and also a blank line:\n" + " \n" + "plus another line at the end.\n"), + blc(4, CHOMP_CLIP, + " #include \"{{hdr.filename}}\"\n \n {{src.gencode}}", + "#include \"{{hdr.filename}}\"\n\n{{src.gencode}}\n"), + // 63 + blc(1, CHOMP_STRIP, " \n \n \n", ""), + blc(1, CHOMP_CLIP, " \n \n \n", ""), + blc(1, CHOMP_KEEP, " \n \n \n", "\n\n\n"), + // 66 + blc(1, CHOMP_STRIP, " \n \n \n ", ""), + blc(1, CHOMP_CLIP, " \n \n \n ", ""), + blc(1, CHOMP_KEEP, " \n \n \n ", "\n\n\n"), + // 69 + blc(1, CHOMP_STRIP, " \n \n \n \n \n \n \n \n \n", ""), + blc(1, CHOMP_CLIP, " \n \n \n \n \n \n \n \n \n", ""), + blc(1, CHOMP_KEEP, " \n \n \n \n \n \n \n \n \n", "\n\n\n\n\n\n\n\n\n"), + // 72 + blc(1, CHOMP_STRIP, " \n \n \n\n \n \n\n \n \n", ""), + blc(1, CHOMP_CLIP, " \n \n \n\n \n \n\n \n \n", ""), + blc(1, CHOMP_KEEP, " \n \n \n\n \n \n\n \n \n", "\n\n\n\n\n\n\n\n\n"), + // 75 + blc(7, CHOMP_STRIP, + " asd\n" + " \n" + " \n" + " \n" + " \n" + " \n" + " ", + "asd"), + blc(7, CHOMP_CLIP, + " asd\n" + " \n" + " \n" + " \n" + " \n" + " \n" + " ", + "asd\n"), + blc(7, CHOMP_KEEP, + " asd\n" 
+ " \n" + " \n" + " \n" + " \n" + " \n" + " ", + "asd\n\n\n\n\n\n"), + // 78 + blc(5, CHOMP_STRIP, " asd\n \t ", "asd\n\t "), + blc(5, CHOMP_CLIP, " asd\n \t ", "asd\n\t \n"), + blc(5, CHOMP_KEEP, " asd\n \t ", "asd\n\t "), + // 81 + blc(5, CHOMP_STRIP, " asd\n \t \n", "asd\n\t "), + blc(5, CHOMP_CLIP, " asd\n \t \n", "asd\n\t \n"), + blc(5, CHOMP_KEEP, " asd\n \t \n", "asd\n\t \n"), + // 84 + blc(5, CHOMP_STRIP, " asd\n \t ", "asd\n \t "), + blc(5, CHOMP_CLIP, " asd\n \t ", "asd\n \t \n"), + blc(5, CHOMP_KEEP, " asd\n \t ", "asd\n \t "), + // 87 + blc(5, CHOMP_STRIP, " asd\n \t \n", "asd\n \t "), + blc(5, CHOMP_CLIP, " asd\n \t \n", "asd\n \t \n"), + blc(5, CHOMP_KEEP, " asd\n \t \n", "asd\n \t \n"), + // 90 + blc(5, CHOMP_CLIP, " asd\n ", "asd\n \n"), + blc(5, CHOMP_CLIP, " asd\n ", "asd\n \n"), + blc(5, CHOMP_CLIP, " asd\n \t ", "asd\n\t \n"), + // 93 + blc(5, CHOMP_CLIP, " asd\n \t", "asd\n \t\n"), + blc(2, CHOMP_CLIP, " ", ""), + blc(2, CHOMP_KEEP, " ", "\n"), + // 96 + blc(2, CHOMP_CLIP, " ", ""), + blc(2, CHOMP_STRIP, " ", ""), + + #undef blc +}; + +INSTANTIATE_TEST_SUITE_P(block_literal_filter, + BlockLitFilterTest, + testing::ValuesIn(test_cases_filter)); + + +//----------------------------------------------------------------------------- +//----------------------------------------------------------------------------- +//----------------------------------------------------------------------------- + TEST(block_literal, empty_block) { { diff --git a/test/test_callbacks.cpp b/test/test_callbacks.cpp index 6f4bbf0ed..45d0583ae 100644 --- a/test/test_callbacks.cpp +++ b/test/test_callbacks.cpp @@ -222,6 +222,7 @@ TEST(Callbacks, ne) TEST(Callbacks, cmp_user_data) { Callbacks before = get_callbacks(); + before.m_user_data = (void*)1u; Callbacks cp = before; EXPECT_EQ(cp, before); cp.m_user_data = (void*)(((char*)before.m_user_data) + 100u); diff --git a/test/test_case.cpp b/test/test_case.cpp index 538bfd334..f380afdf9 100644 --- a/test/test_case.cpp +++ 
b/test/test_case.cpp @@ -9,6 +9,7 @@ #endif #include +#include #if defined(_MSC_VER) # pragma warning(push) @@ -172,10 +173,10 @@ std::string format_error(const char* msg, size_t len, Location loc) return out; } -struct ExpectedError : public std::runtime_error +struct ExpectedError__ : public std::runtime_error { Location error_location; - ExpectedError(const char* msg, size_t len, Location loc) + ExpectedError__(const char* msg, size_t len, Location loc) : std::runtime_error(format_error(msg, len, loc)) , error_location(loc) { @@ -193,8 +194,9 @@ ExpectError::ExpectError(Tree *tree, Location loc) , expected_location(loc) { auto err = [](const char* msg, size_t len, Location errloc, void *this_) { - ((ExpectError*)this_)->m_got_an_error = true; - throw ExpectedError(msg, len, errloc); + _c4dbgp("called error callback!"); + ((ExpectError*)this_)->m_got_an_error = true; // assign in here to ensure the exception was thrown here + throw ExpectedError__(msg, len, errloc); }; #ifdef RYML_NO_DEFAULT_CALLBACKS c4::yml::Callbacks tcb((void*)this, nullptr, nullptr, err); @@ -220,15 +222,16 @@ void ExpectError::do_check(Tree *tree, std::function fn, Location expect auto context = ExpectError(tree, expected_location); try { + _c4dbgp("check expected error"); fn(); + _c4dbgp("check expected error: failed!"); } - catch(ExpectedError const& e) + catch(c4::yml::ExpectedError__ const& e) { - #if defined(RYML_DBG) - std::cout << "---------------\n"; - std::cout << "got an expected error:\n" << e.what() << "\n"; - std::cout << "---------------\n"; - #endif + _c4dbgpf("\n---------------\n" + "got an expected error:\n" + "{}\n" + "---------------\n", e.what()); if(context.expected_location) { EXPECT_EQ(static_cast(context.expected_location), @@ -240,7 +243,13 @@ void ExpectError::do_check(Tree *tree, std::function fn, Location expect EXPECT_EQ(e.error_location.offset, context.expected_location.offset); } } - }; + } + catch(...) 
+ { + _c4dbgp("---------------\n" + "got an unexpected exception!\n" + "---------------\n"); + } EXPECT_TRUE(context.m_got_an_error); } diff --git a/test/test_case.hpp b/test/test_case.hpp index a5c08917b..b0ea87224 100644 --- a/test/test_case.hpp +++ b/test/test_case.hpp @@ -42,6 +42,7 @@ # include #endif + namespace c4 { inline void PrintTo(substr s, ::std::ostream* os) { os->write(s.str, (std::streamsize)s.len); } @@ -49,6 +50,8 @@ inline void PrintTo(csubstr s, ::std::ostream* os) { os->write(s.str, (std::stre namespace yml { +#define RYML_TRACE_FMT(fmt, ...) SCOPED_TRACE([&]{ return formatrs(fmt, __VA_ARGS__); }()) + inline void PrintTo(NodeType ty, ::std::ostream* os) { *os << ty.type_str(); diff --git a/test/test_double_quoted.cpp b/test/test_double_quoted.cpp index 6c3915873..f3a926bb5 100644 --- a/test/test_double_quoted.cpp +++ b/test/test_double_quoted.cpp @@ -1,48 +1,413 @@ +#include "./test_case.hpp" #include "./test_group.hpp" +C4_SUPPRESS_WARNING_GCC_CLANG_PUSH +C4_SUPPRESS_WARNING_GCC("-Wuseless-cast") + namespace c4 { namespace yml { +struct dquoted_case +{ + csubstr input, output; +}; + + +// double quoted filtering can result in an output larger than the input. +// so we ensure adequate test coverage by using different sizes. +// also test cases where the destination string is not large +// enough to accommodate the filtered string. + +/** when filtering from src to dst, specifying the dst sz is enough to + * cover the different cases */ +void test_filter_src_dst(csubstr input, csubstr expected, size_t dst_sz) +{ + RYML_TRACE_FMT("\nstr=[{}]~~~{}~~~\nexp=[{}]~~~{}~~~\nsz={}", input.len, input, expected.len, expected, dst_sz); + // fill the dst buffer with a ref char to ensure there is no + // write overflow. + const size_t actual_sz = size_t(30) + (dst_sz > expected.len ? 
dst_sz : expected.len); + std::string subject_; + subject_.resize(actual_sz); + const substr full = to_substr(subject_); + // fill the canary region + const char refchar = '`'; + full.sub(dst_sz).fill(refchar); + // filter now + const substr dst = full.first(dst_sz); + Parser proc = {}; + FilterResult result = proc.filter_scalar_dquoted(input, dst); + // check the result + EXPECT_EQ(result.required_len(), expected.len); + if(result.valid()) + { + const csubstr out = result.get(); + RYML_TRACE_FMT("\nout=[{}]~~~{}~~~", out.len, out); + RYML_TRACE_FMT("\nout.str=[{}]{}\ndst.str=[{}]{}", out.len,(void const*)out.str, dst.len,(void const*)dst.str); + EXPECT_TRUE(out.is_sub(dst)); + EXPECT_EQ(out, expected); + // check the fill character in the canary region + EXPECT_GT(full.sub(dst_sz).len, 0u); + } + EXPECT_EQ(full.sub(dst_sz).first_not_of(refchar), csubstr::npos); +} + + +void test_filter_inplace(csubstr input, csubstr expected, csubstr leading_input, csubstr leading_expected) +{ + // fill the dst buffer with a ref char to ensure there is no + // write overflow. + const size_t input_sz = leading_input.len + input.len; + const size_t expected_sz = leading_expected.len + expected.len; + const size_t max_sz = (input_sz > expected_sz ? 
input_sz : expected_sz); + const size_t full_sz = max_sz + size_t(30); + std::string expected_(leading_expected.str, leading_expected.len); + expected_ += std::string(expected.str, expected.len); + RYML_TRACE_FMT("\ninp=[{}]~~~{}~~~\nexp=[{}]~~~{}~~~\nlead=[{}]~~~{}~~~\nlead_exp=[{}]~~~{}~~~\nmax_sz={}", input.len, input, expected.len, expected, leading_input.len, leading_input, leading_expected.len, leading_expected, max_sz); + auto run = [&](size_t cap){ + // create the string + std::string subject_(leading_input.str, leading_input.len); + subject_.append(input.str, input.len); + std::string subject_2 = subject_; + subject_.resize(full_sz); + // fill the canary region + const char refchar = '`'; + const substr full = to_substr(subject_); + full.sub(max_sz).fill(refchar); + substr dst = full.first(input_sz); + // filter now + Parser parser1 = {}; + FilterResultExtending result = parser1.filter_scalar_dquoted_in_place(dst, cap); + Parser parser2 = {}; + Tree tree = parser2.parse_in_arena("file", "# set the tree in the parser"); + csubstr sresult = parser2._filter_scalar_dquot(to_substr(subject_2)); + EXPECT_GE(result.required_len(), expected_sz); + EXPECT_EQ(sresult.len, result.str.len); + if(result.valid()) + { + const csubstr out = result.get(); + EXPECT_EQ(out, expected_); + EXPECT_EQ(sresult, expected_); + EXPECT_EQ(sresult, out); + // check the fill character in the canary region. + EXPECT_GT(full.sub(max_sz).len, 0u); + EXPECT_EQ(full.first_not_of(refchar, max_sz), csubstr::npos); + } + }; + if(input_sz >= expected_sz) + { + RYML_TRACE_FMT("all good: input_sz={} >= expected_sz={}", input_sz, expected_sz); + run(input_sz); + } + else // input_sz < expected_sz + { + RYML_TRACE_FMT("expanding: input_sz={} < expected_sz={}", input_sz, expected_sz); + { + RYML_TRACE_FMT("expanding.1: up to larger expected_sz={}", expected_sz); + run(expected_sz); + } + // there is no room to filter if we pass input_sz as the capacity. 
+ { + RYML_TRACE_FMT("expanding.2: up to smaller input_sz={}", input_sz); + run(input_sz); + } + } +} + + +//----------------------------------------------------------------------------- + +// some strings cannot be portably declared in double quotes in C++, +// so we use this helper macro, which creates a char array and +// an associated csubstr. +#define DECLARE_CSUBSTR_FROM_CHAR_ARR(name, ...) \ + const char name##_[] = { __VA_ARGS__ }; \ + csubstr name = {name##_, C4_COUNTOF(name##_)} + +C4_SUPPRESS_WARNING_MSVC_WITH_PUSH(4566) // 4566: character represented by universal-character-name '\u263A' cannot be represented in the current code page (1252) + +DECLARE_CSUBSTR_FROM_CHAR_ARR(dqescparsed, + '\\', + '"', + '\n', + '\r', + '\t', + '\t', + '/', + ' ', + '\0', + '\b', + '\f', + '\a', + '\v', + INT8_C(0x1b), + // \_ + _RYML_CHCONST(-0x3e, 0xc2), _RYML_CHCONST(-0x60, 0xa0), + // \N + _RYML_CHCONST(-0x3e, 0xc2), _RYML_CHCONST(-0x7b, 0x85), + // \L + _RYML_CHCONST(-0x1e, 0xe2), _RYML_CHCONST(-0x80, 0x80), _RYML_CHCONST(-0x58, 0xa8), + // \P + _RYML_CHCONST(-0x1e, 0xe2), _RYML_CHCONST(-0x80, 0x80), _RYML_CHCONST(-0x57, 0xa9), + ); +DECLARE_CSUBSTR_FROM_CHAR_ARR(dqesc_underscore, + _RYML_CHCONST(-0x3e, 0xc2), _RYML_CHCONST(-0x60, 0xa0), + ); +DECLARE_CSUBSTR_FROM_CHAR_ARR(dqesc_underscore2, + _RYML_CHCONST(-0x3e, 0xc2), _RYML_CHCONST(-0x60, 0xa0), + _RYML_CHCONST(-0x3e, 0xc2), _RYML_CHCONST(-0x60, 0xa0), + ); +DECLARE_CSUBSTR_FROM_CHAR_ARR(dqesc_underscore3, + _RYML_CHCONST(-0x3e, 0xc2), _RYML_CHCONST(-0x60, 0xa0), + _RYML_CHCONST(-0x3e, 0xc2), _RYML_CHCONST(-0x60, 0xa0), + _RYML_CHCONST(-0x3e, 0xc2), _RYML_CHCONST(-0x60, 0xa0), + ); +DECLARE_CSUBSTR_FROM_CHAR_ARR(dqesc_underscore4, + _RYML_CHCONST(-0x3e, 0xc2), _RYML_CHCONST(-0x60, 0xa0), + _RYML_CHCONST(-0x3e, 0xc2), _RYML_CHCONST(-0x60, 0xa0), + _RYML_CHCONST(-0x3e, 0xc2), _RYML_CHCONST(-0x60, 0xa0), + _RYML_CHCONST(-0x3e, 0xc2), _RYML_CHCONST(-0x60, 0xa0), + ); +DECLARE_CSUBSTR_FROM_CHAR_ARR(dqesc_N, + 
_RYML_CHCONST(-0x3e, 0xc2), _RYML_CHCONST(-0x7b, 0x85), + ); +DECLARE_CSUBSTR_FROM_CHAR_ARR(dqesc_N2, + _RYML_CHCONST(-0x3e, 0xc2), _RYML_CHCONST(-0x7b, 0x85), + _RYML_CHCONST(-0x3e, 0xc2), _RYML_CHCONST(-0x7b, 0x85), + ); +DECLARE_CSUBSTR_FROM_CHAR_ARR(dqesc_N3, + _RYML_CHCONST(-0x3e, 0xc2), _RYML_CHCONST(-0x7b, 0x85), + _RYML_CHCONST(-0x3e, 0xc2), _RYML_CHCONST(-0x7b, 0x85), + _RYML_CHCONST(-0x3e, 0xc2), _RYML_CHCONST(-0x7b, 0x85), + ); +DECLARE_CSUBSTR_FROM_CHAR_ARR(dqesc_N4, + _RYML_CHCONST(-0x3e, 0xc2), _RYML_CHCONST(-0x7b, 0x85), + _RYML_CHCONST(-0x3e, 0xc2), _RYML_CHCONST(-0x7b, 0x85), + _RYML_CHCONST(-0x3e, 0xc2), _RYML_CHCONST(-0x7b, 0x85), + _RYML_CHCONST(-0x3e, 0xc2), _RYML_CHCONST(-0x7b, 0x85), + ); +DECLARE_CSUBSTR_FROM_CHAR_ARR(dqesc_L, + _RYML_CHCONST(-0x1e, 0xe2), _RYML_CHCONST(-0x80, 0x80), _RYML_CHCONST(-0x58, 0xa8), + ); +DECLARE_CSUBSTR_FROM_CHAR_ARR(dqesc_L2, + _RYML_CHCONST(-0x1e, 0xe2), _RYML_CHCONST(-0x80, 0x80), _RYML_CHCONST(-0x58, 0xa8), + _RYML_CHCONST(-0x1e, 0xe2), _RYML_CHCONST(-0x80, 0x80), _RYML_CHCONST(-0x58, 0xa8), + ); +DECLARE_CSUBSTR_FROM_CHAR_ARR(dqesc_L3, + _RYML_CHCONST(-0x1e, 0xe2), _RYML_CHCONST(-0x80, 0x80), _RYML_CHCONST(-0x58, 0xa8), + _RYML_CHCONST(-0x1e, 0xe2), _RYML_CHCONST(-0x80, 0x80), _RYML_CHCONST(-0x58, 0xa8), + _RYML_CHCONST(-0x1e, 0xe2), _RYML_CHCONST(-0x80, 0x80), _RYML_CHCONST(-0x58, 0xa8), + ); +DECLARE_CSUBSTR_FROM_CHAR_ARR(dqesc_L4, + _RYML_CHCONST(-0x1e, 0xe2), _RYML_CHCONST(-0x80, 0x80), _RYML_CHCONST(-0x58, 0xa8), + _RYML_CHCONST(-0x1e, 0xe2), _RYML_CHCONST(-0x80, 0x80), _RYML_CHCONST(-0x58, 0xa8), + _RYML_CHCONST(-0x1e, 0xe2), _RYML_CHCONST(-0x80, 0x80), _RYML_CHCONST(-0x58, 0xa8), + _RYML_CHCONST(-0x1e, 0xe2), _RYML_CHCONST(-0x80, 0x80), _RYML_CHCONST(-0x58, 0xa8), + ); +DECLARE_CSUBSTR_FROM_CHAR_ARR(dqesc_P, + _RYML_CHCONST(-0x1e, 0xe2), _RYML_CHCONST(-0x80, 0x80), _RYML_CHCONST(-0x57, 0xa9), + ); +DECLARE_CSUBSTR_FROM_CHAR_ARR(dqesc_P2, + _RYML_CHCONST(-0x1e, 0xe2), _RYML_CHCONST(-0x80, 0x80), 
_RYML_CHCONST(-0x57, 0xa9), + _RYML_CHCONST(-0x1e, 0xe2), _RYML_CHCONST(-0x80, 0x80), _RYML_CHCONST(-0x57, 0xa9), + ); +DECLARE_CSUBSTR_FROM_CHAR_ARR(dqesc_P3, + _RYML_CHCONST(-0x1e, 0xe2), _RYML_CHCONST(-0x80, 0x80), _RYML_CHCONST(-0x57, 0xa9), + _RYML_CHCONST(-0x1e, 0xe2), _RYML_CHCONST(-0x80, 0x80), _RYML_CHCONST(-0x57, 0xa9), + _RYML_CHCONST(-0x1e, 0xe2), _RYML_CHCONST(-0x80, 0x80), _RYML_CHCONST(-0x57, 0xa9), + ); +DECLARE_CSUBSTR_FROM_CHAR_ARR(dqesc_P4, + _RYML_CHCONST(-0x1e, 0xe2), _RYML_CHCONST(-0x80, 0x80), _RYML_CHCONST(-0x57, 0xa9), + _RYML_CHCONST(-0x1e, 0xe2), _RYML_CHCONST(-0x80, 0x80), _RYML_CHCONST(-0x57, 0xa9), + _RYML_CHCONST(-0x1e, 0xe2), _RYML_CHCONST(-0x80, 0x80), _RYML_CHCONST(-0x57, 0xa9), + _RYML_CHCONST(-0x1e, 0xe2), _RYML_CHCONST(-0x80, 0x80), _RYML_CHCONST(-0x57, 0xa9), + ); + +// declare double quoted test cases +dquoted_case test_cases_filter[] = { + #define dqc(input, ...) dquoted_case{csubstr(input), csubstr(__VA_ARGS__)} + // 0 + dqc("", ""), + dqc(" ", " "), + dqc(" ", " "), + dqc(" ", " "), + dqc(" ", " "), + // 5 + dqc("foo", "foo"), + dqc("foo bar", "foo bar"), + dqc("1 leading\n \\ttab", "1 leading \ttab"), + dqc("2 leading\n \\ tab", "2 leading \ttab"), + dqc("3 leading\n tab", "3 leading tab"), + // 10 + dqc("4 leading\n \\t tab", "4 leading \t tab"), + dqc("5 leading\n \\ tab", "5 leading \t tab"), + dqc("6 leading\n tab", "6 leading tab"), + dqc("Empty line\n\n as a line feed", "Empty line\nas a line feed"), + dqc(R"(foo\nbar:baz\tx \\$%^&*()x)", "foo\nbar:baz\tx \\$%^&*()x"), + // 15 + dqc(R"(\)", ""), + dqc(R"(\\)", "\\"), + dqc(R"(\\\)", "\\"), + dqc(R"(\\\\)", "\\\\"), + dqc(R"(\\\\\)", "\\\\"), + // 20 + dqc(R"(\ )", "\t"), + dqc(R"(\t)", "\t"), + dqc(R"(\ )", " "), + dqc(R"(\\ )", "\\ "), + dqc(R"(\")", "\""), + // 25 + dqc(R"(\"\")", "\"\""), + dqc(R"(\n)", "\n"), + dqc(R"(\r)", "\r"), + dqc(R"(\t)", "\t"), + dqc(R"(\0)", "\0"), + // 30 + dqc(R"(\b)", "\b"), + dqc(R"(\f)", "\f"), + dqc(R"(\a)", "\a"), + dqc(R"(\v)", 
"\v"), + dqc(R"(\e)", "\x1b"), + // 35 + dqc(R"(\_)", dqesc_underscore), + dqc(R"(\_\_)", dqesc_underscore2), + dqc(R"(\_\_\_)", dqesc_underscore3), + dqc(R"(\_\_\_\_)", dqesc_underscore4), + dqc(R"(\N)", dqesc_N), + // 40 + dqc(R"(\N\N)", dqesc_N2), + dqc(R"(\N\N\N)", dqesc_N3), + dqc(R"(\N\N\N\N)", dqesc_N4), + dqc(R"(\L)", dqesc_L), + dqc(R"(\L\L)", dqesc_L2), + // 45 + dqc(R"(\L\L\L)", dqesc_L3), + dqc(R"(\L\L\L\L)", dqesc_L4), + dqc(R"(\P)", dqesc_P), + dqc(R"(\P\P)", dqesc_P2), + dqc(R"(\P\P\P)", dqesc_P3), + // 50 + dqc(R"(\P\P\P\P)", dqesc_P4), + dqc(R"(\\\"\n\r\t\ \/\ \0\b\f\a\v\e\_\N\L\P)", dqescparsed), + dqc(R"(\u263A)", R"(☺)"), + dqc(R"(\u263a)", R"(☺)"), + dqc(R"(\u2705)", R"(✅)"), + // 55 + dqc(R"(\u2705\u2705)", R"(✅✅)"), + dqc(R"(\u2705\u2705\u2705)", R"(✅✅✅)"), + dqc(R"(\u2705\u2705\u2705\u2705)", R"(✅✅✅✅)"), + dqc(R"(\U0001D11E)", R"(𝄞)"), + dqc(R"(\U0001d11e)", R"(𝄞)"), + // 60 + dqc(R"(\U0001d11e\U0001D11E)", R"(𝄞𝄞)"), + dqc(R"(\U0001d11e\U0001D11E\U0001D11E)", R"(𝄞𝄞𝄞)"), + dqc(R"(\U0001d11e\U0001D11E\U0001D11E\U0001D11E)", R"(𝄞𝄞𝄞𝄞)"), + dqc(R"(\u263A\u2705\U0001D11E)", R"(☺✅𝄞)"), + dqc(R"(\b1998\t1999\t2000\n)", "\b1998\t1999\t2000\n"), + // 65 + dqc(R"(\x0d\x0a is \r\n)", "\r\n is \r\n"), + dqc("\n foo\n\n bar\n\n baz\n", " foo\nbar\nbaz "), + dqc(" 1st non-empty\n\n 2nd non-empty \n 3rd non-empty ", " 1st non-empty\n2nd non-empty 3rd non-empty "), + dqc(" 1st non-empty\n\n 2nd non-empty \n 3rd non-empty ", " 1st non-empty\n2nd non-empty 3rd non-empty "), + dqc(" 1st non-empty\n\n 2nd non-empty \n 3rd non-empty ", " 1st non-empty\n2nd non-empty 3rd non-empty "), + // 70 + dqc(" 1st non-empty\n\n 2nd non-empty \n 3rd non-empty ", " 1st non-empty\n2nd non-empty 3rd non-empty "), + dqc("\n ", " "), + dqc(" \n ", " "), + dqc("\n\n ", "\n"), + dqc("\n\n\n ", "\n\n"), + // 75 + dqc("folded \nto a space, \n \nto a line feed, or \\\n \\ non-content", "folded to a space,\nto a line feed, or \t \tnon-content"), + dqc("folded \nto a space,\n \nto a 
line feed, or \\\n \\ non-content", "folded to a space,\nto a line feed, or \t \tnon-content"), + //dqc(" \n\ndetected\n\n", "\t\ndetected\n"), // this case cannot be prefixed with anything. + dqc(R"(This is a key\nthat has multiple lines\n)", "This is a key\nthat has multiple lines\n"), + dqc("This is a key\n\nthat has multiple lines\n\n", "This is a key\nthat has multiple lines\n"), + #undef dqc +}; +C4_SUPPRESS_WARNING_MSVC_POP + + +//----------------------------------------------------------------------------- + +TEST(double_quoted_filter, leading_tab) +{ +} + + +//----------------------------------------------------------------------------- + +struct DQuotedFilterSrcDstTest : public ::testing::TestWithParam +{ +}; + + +TEST_P(DQuotedFilterSrcDstTest, dst_is_same_size) +{ + dquoted_case dqc = GetParam(); + test_filter_src_dst(dqc.input, dqc.output, /*dst_sz*/dqc.output.len); +} + +TEST_P(DQuotedFilterSrcDstTest, dst_is_larger_size) +{ + dquoted_case dqc = GetParam(); + test_filter_src_dst(dqc.input, dqc.output, /*sz*/dqc.output.len + 2u); + test_filter_src_dst(dqc.input, dqc.output, /*sz*/dqc.output.len + 100u); +} + +TEST_P(DQuotedFilterSrcDstTest, dst_is_smaller_size) +{ + dquoted_case dqc = GetParam(); + test_filter_src_dst(dqc.input, dqc.output, /*sz*/dqc.output.len / 2u); +} + +TEST_P(DQuotedFilterSrcDstTest, dst_is_zero_size) +{ + dquoted_case dqc = GetParam(); + test_filter_src_dst(dqc.input, dqc.output, /*sz*/0u); +} + + + +struct DQuotedFilterInplaceTest : public ::testing::TestWithParam +{ +}; + + +TEST_P(DQuotedFilterInplaceTest, dst_is_same_size) +{ + dquoted_case dqc = GetParam(); + test_filter_inplace(dqc.input, dqc.output, /*leading*/"", /*leading_expected*/""); +} + +TEST_P(DQuotedFilterInplaceTest, dst_is_smaller_size) +{ + // test also with an expanding leading string ("\\L" expands from + // two to three bytes). This ensures coverage of cases where + // expected.len > capacity. 
+ dquoted_case dqc = GetParam(); + test_filter_inplace(dqc.input, dqc.output, /*leading*/"\\L\\L\\L\\L", /*leading_expected*/dqesc_L4); +} + + +INSTANTIATE_TEST_SUITE_P(double_quoted_filter, + DQuotedFilterSrcDstTest, + testing::ValuesIn(test_cases_filter)); + +INSTANTIATE_TEST_SUITE_P(double_quoted_filter, + DQuotedFilterInplaceTest, + testing::ValuesIn(test_cases_filter)); + + +//----------------------------------------------------------------------------- + TEST(double_quoted, escaped_chars) { csubstr yaml = R"("\\\"\n\r\t\ \/\ \0\b\f\a\v\e\_\N\L\P")"; - // build the string like this because some of the characters are - // filtered out under the double quotes - std::string expected; - expected += '\\'; - expected += '"'; - expected += '\n'; - expected += '\r'; - expected += '\t'; - expected += '\t'; - expected += '/'; - expected += ' '; - expected += '\0'; - expected += '\b'; - expected += '\f'; - expected += '\a'; - expected += '\v'; - expected += INT8_C(0x1b); // \e - // - // wrap explicitly to avoid overflow - expected += _RYML_CHCONST(-0x3e, 0xc2); // \_ (1) - expected += _RYML_CHCONST(-0x60, 0xa0); // \_ (2) - // - expected += _RYML_CHCONST(-0x3e, 0xc2); // \N (1) - expected += _RYML_CHCONST(-0x7b, 0x85); // \N (2) - // - expected += _RYML_CHCONST(-0x1e, 0xe2); // \L (1) - expected += _RYML_CHCONST(-0x80, 0x80); // \L (2) - expected += _RYML_CHCONST(-0x58, 0xa8); // \L (3) - // - expected += _RYML_CHCONST(-0x1e, 0xe2); // \P (1) - expected += _RYML_CHCONST(-0x80, 0x80); // \P (2) - expected += _RYML_CHCONST(-0x57, 0xa9); // \P (3) - // Tree t = parse_in_arena(yaml); csubstr v = t.rootref().val(); std::string actual = {v.str, v.len}; - EXPECT_EQ(actual, expected); + // build the string like this because some of the characters are + // filtered out under the double quotes + EXPECT_EQ(actual, std::string(dqescparsed.str, dqescparsed.len)); } TEST(double_quoted, test_suite_3RLN) @@ -608,3 +973,5 @@ that has multiple lines } // namespace yml } // namespace c4 + 
+C4_SUPPRESS_WARNING_GCC_CLANG_POP diff --git a/test/test_filter.cpp b/test/test_filter.cpp new file mode 100644 index 000000000..72052b4f3 --- /dev/null +++ b/test/test_filter.cpp @@ -0,0 +1,1477 @@ +#ifdef RYML_SINGLE_HEADER +#include "ryml_all.hpp" +#else +#include "c4/yml/filter_processor.hpp" +#include "c4/yml/std/string.hpp" +#include "c4/yml/parse.hpp" +#endif +#include + + +namespace c4 { +namespace yml { + +template +struct TesterInplace_ +{ + std::string buf; + substr subject; + Processor proc; + C4_NO_COPY_OR_MOVE(TesterInplace_); + TesterInplace_(const char *str) + : buf(str) + , subject(to_substr(buf)) + , proc(subject, buf.capacity()) + { + } + void set_to_end(size_t slack) + { + ASSERT_GT(proc.wcap, 0); + ASSERT_LE(slack, proc.wcap); + proc.wpos = proc.wcap - slack; + } + void trim_capacity() { proc.wcap = buf.size(); } + void set_capacity(size_t cap) + { + buf.reserve(cap); + subject = to_substr(buf); + proc = FilterProcessorInplaceMidExtending(subject, cap); + } +}; +using TesterInplaceMid = TesterInplace_; +using TesterInplaceEnd = TesterInplace_; + + +struct TesterSrcDst +{ + std::string src_; + std::string dst_; + csubstr src; + substr dst; + FilterProcessorSrcDst proc; + C4_NO_COPY_OR_MOVE(TesterSrcDst); + TesterSrcDst(const char *str) + : src_(str) + , dst_(str) + , src(to_csubstr(src_)) + , dst(to_substr(dst_)) + , proc(src, dst) + { + } + void set_to_end(size_t slack) + { + ASSERT_GT(dst.len, 0); + ASSERT_LE(slack, dst.len); + proc.wpos = dst.len - slack; + } + void set_dst_size(size_t cap) + { + dst_.resize(cap); + dst = to_substr(dst_); + proc = FilterProcessorSrcDst(src, dst); + } +}; + + +//----------------------------------------------------------------------------- +//----------------------------------------------------------------------------- +//----------------------------------------------------------------------------- + +TEST(FilterProcessorInplaceMid, set) +{ + TesterInplaceMid t("subject"); + EXPECT_EQ(t.proc.rpos, 0); + 
EXPECT_EQ(t.proc.wpos, 0); + EXPECT_FALSE(t.proc.unfiltered_chars); + EXPECT_EQ(t.proc.sofar(), ""); + t.proc.skip(); + EXPECT_EQ(t.proc.rpos, 1); + EXPECT_EQ(t.proc.wpos, 0); + EXPECT_EQ(t.proc.sofar(), ""); + EXPECT_EQ(t.proc.result().get(), ""); + t.proc.set('.'); + EXPECT_EQ(t.proc.rpos, 1); + EXPECT_EQ(t.proc.wpos, 1); + EXPECT_EQ(t.proc.sofar(), "."); + EXPECT_EQ(t.proc.result().get(), "."); + EXPECT_EQ(t.subject, ".ubject"); + EXPECT_FALSE(t.proc.unfiltered_chars); + t.proc.skip(2); + EXPECT_EQ(t.proc.rpos, 3); + EXPECT_EQ(t.proc.wpos, 1); + t.proc.set('.', 2); + EXPECT_EQ(t.proc.rpos, 3); + EXPECT_EQ(t.proc.wpos, 3); + EXPECT_EQ(t.subject, "...ject"); + EXPECT_EQ(t.proc.sofar(), "..."); + ASSERT_TRUE(t.proc.result().valid()); + EXPECT_EQ(t.proc.result().get(), "..."); + EXPECT_FALSE(t.proc.unfiltered_chars); + t.proc.skip(3); + t.proc.set('x', 3); + EXPECT_EQ(t.proc.wpos, 6); + EXPECT_EQ(t.proc.rpos, 6); + EXPECT_EQ(t.subject, "...xxxt"); + EXPECT_EQ(t.proc.sofar(), "...xxx"); + ASSERT_TRUE(t.proc.result().valid()); + EXPECT_EQ(t.proc.result().get(), "...xxx"); + EXPECT_FALSE(t.proc.unfiltered_chars); + t.proc.skip(1); + t.proc.set('.'); + EXPECT_EQ(t.proc.wpos, 7); + EXPECT_EQ(t.proc.rpos, 7); + EXPECT_EQ(t.subject, "...xxx."); + EXPECT_EQ(t.proc.sofar(), "...xxx."); + ASSERT_TRUE(t.proc.result().valid()); + EXPECT_EQ(t.proc.result().get(), "...xxx."); + EXPECT_FALSE(t.proc.unfiltered_chars); +} + +TEST(FilterProcessorInplaceEnd, set) +{ + TesterInplaceEnd t("subject"); + EXPECT_EQ(t.proc.rpos, 0); + EXPECT_EQ(t.proc.wpos, 0); + EXPECT_EQ(t.proc.sofar(), ""); + t.proc.skip(); + EXPECT_EQ(t.proc.rpos, 1); + EXPECT_EQ(t.proc.wpos, 0); + EXPECT_EQ(t.proc.sofar(), ""); + EXPECT_EQ(t.proc.result().get(), ""); + EXPECT_EQ(t.subject, "subject"); + t.proc.set('.'); + EXPECT_EQ(t.proc.rpos, 1); + EXPECT_EQ(t.proc.wpos, 1); + EXPECT_EQ(t.proc.sofar(), "."); + EXPECT_EQ(t.proc.result().get(), "."); + EXPECT_EQ(t.subject, ".ubject"); + t.proc.skip(2); + 
EXPECT_EQ(t.proc.rpos, 3); + EXPECT_EQ(t.proc.wpos, 1); + t.proc.set('.', 2); + EXPECT_EQ(t.proc.rpos, 3); + EXPECT_EQ(t.proc.wpos, 3); + EXPECT_EQ(t.subject, "...ject"); + EXPECT_EQ(t.proc.sofar(), "..."); + ASSERT_TRUE(t.proc.result().valid()); + EXPECT_EQ(t.proc.result().get(), "..."); + t.proc.skip(3); + t.proc.set('x', 3); + EXPECT_EQ(t.proc.wpos, 6); + EXPECT_EQ(t.proc.rpos, 6); + EXPECT_EQ(t.subject, "...xxxt"); + EXPECT_EQ(t.proc.sofar(), "...xxx"); + ASSERT_TRUE(t.proc.result().valid()); + EXPECT_EQ(t.proc.result().get(), "...xxx"); + t.proc.skip(1); + t.proc.set('.'); + EXPECT_EQ(t.proc.wpos, 7); + EXPECT_EQ(t.proc.rpos, 7); + EXPECT_EQ(t.subject, "...xxx."); + EXPECT_EQ(t.proc.sofar(), "...xxx."); + ASSERT_TRUE(t.proc.result().valid()); + EXPECT_EQ(t.proc.result().get(), "...xxx."); +} + +TEST(FilterProcessorSrcDst, set) +{ + TesterSrcDst t("subject"); + EXPECT_EQ(t.proc.wpos, 0); + EXPECT_EQ(t.proc.rpos, 0); + EXPECT_EQ(t.proc.sofar(), ""); + EXPECT_EQ(t.proc.result().get(), ""); + t.proc.skip(); + EXPECT_EQ(t.proc.wpos, 0); + EXPECT_EQ(t.proc.rpos, 1); + t.proc.set('.'); + EXPECT_EQ(t.proc.wpos, 1); + EXPECT_EQ(t.proc.rpos, 1); + EXPECT_EQ(t.dst, ".ubject"); + EXPECT_EQ(t.proc.sofar(), "."); + EXPECT_EQ(t.proc.result().get(), "."); + t.proc.skip(2); + EXPECT_EQ(t.proc.rpos, 3); + EXPECT_EQ(t.proc.wpos, 1); + t.proc.set('.', 2); + EXPECT_EQ(t.proc.wpos, 3); + EXPECT_EQ(t.proc.rpos, 3); + EXPECT_EQ(t.dst, "...ject"); + EXPECT_EQ(t.proc.sofar(), "..."); + EXPECT_EQ(t.proc.result().get(), "..."); + t.proc.skip(3); + t.proc.set('x', 3); + EXPECT_EQ(t.proc.wpos, 6); + EXPECT_EQ(t.proc.rpos, 6); + EXPECT_EQ(t.dst, "...xxxt"); + EXPECT_EQ(t.proc.sofar(), "...xxx"); + EXPECT_EQ(t.proc.result().get(), "...xxx"); +} + +TEST(FilterProcessorInplaceMid, set_single_does_not_unfilter) +{ + // skip -> set + { + TesterInplaceMid t("0"); + t.trim_capacity(); + EXPECT_EQ(t.proc.wcap, 1); + EXPECT_EQ(t.proc.rpos, 0); + EXPECT_EQ(t.proc.wpos, 0); + 
EXPECT_FALSE(t.proc.unfiltered_chars); + EXPECT_EQ(t.proc.sofar(), ""); + t.proc.skip(); + EXPECT_EQ(t.proc.rpos, 1); + EXPECT_EQ(t.proc.wpos, 0); + t.proc.set('a'); + EXPECT_EQ(t.proc.rpos, 1); + EXPECT_EQ(t.proc.wpos, 1); + EXPECT_EQ(t.proc.sofar(), "a"); + EXPECT_EQ(t.proc.result().get(), "a"); + EXPECT_FALSE(t.proc.unfiltered_chars); + } + // set -> skip + { + TesterInplaceMid t("0"); + t.trim_capacity(); + EXPECT_EQ(t.proc.wcap, 1); + EXPECT_EQ(t.proc.rpos, 0); + EXPECT_EQ(t.proc.wpos, 0); + EXPECT_FALSE(t.proc.unfiltered_chars); + EXPECT_EQ(t.proc.sofar(), ""); + t.proc.set('a'); + EXPECT_EQ(t.proc.rpos, 0); + EXPECT_EQ(t.proc.wpos, 1); + t.proc.skip(); + EXPECT_EQ(t.proc.rpos, 1); + EXPECT_EQ(t.proc.wpos, 1); + EXPECT_EQ(t.proc.sofar(), "a"); + EXPECT_EQ(t.proc.result().get(), "a"); + EXPECT_FALSE(t.proc.unfiltered_chars); + } +} + +TEST(FilterProcessorInplaceEnd, set_single_does_not_unfilter) +{ + // skip -> set + { + TesterInplaceEnd t("0"); + t.trim_capacity(); + EXPECT_EQ(t.proc.wcap, 1); + EXPECT_EQ(t.proc.rpos, 0); + EXPECT_EQ(t.proc.wpos, 0); + EXPECT_EQ(t.proc.sofar(), ""); + t.proc.skip(); + EXPECT_EQ(t.proc.rpos, 1); + EXPECT_EQ(t.proc.wpos, 0); + t.proc.set('a'); + EXPECT_EQ(t.proc.rpos, 1); + EXPECT_EQ(t.proc.wpos, 1); + EXPECT_EQ(t.proc.sofar(), "a"); + EXPECT_EQ(t.proc.result().get(), "a"); + } + // set -> skip + { + TesterInplaceEnd t("0"); + t.trim_capacity(); + EXPECT_EQ(t.proc.wcap, 1); + EXPECT_EQ(t.proc.rpos, 0); + EXPECT_EQ(t.proc.wpos, 0); + EXPECT_EQ(t.proc.sofar(), ""); + t.proc.set('a'); + EXPECT_EQ(t.proc.rpos, 0); + EXPECT_EQ(t.proc.wpos, 1); + t.proc.skip(); + EXPECT_EQ(t.proc.rpos, 1); + EXPECT_EQ(t.proc.wpos, 1); + EXPECT_EQ(t.proc.sofar(), "a"); + EXPECT_EQ(t.proc.result().get(), "a"); + } +} + +TEST(FilterProcessorInplaceMid, set_bulk_does_not_unfilter) +{ + // skip -> set + { + TesterInplaceMid t("0123"); + t.trim_capacity(); + EXPECT_EQ(t.proc.wcap, 4); + EXPECT_EQ(t.proc.rpos, 0); + EXPECT_EQ(t.proc.wpos, 0); + 
EXPECT_FALSE(t.proc.unfiltered_chars); + EXPECT_EQ(t.proc.sofar(), ""); + t.proc.skip(4); + t.proc.set('a', 4); + EXPECT_EQ(t.proc.rpos, 4); + EXPECT_EQ(t.proc.wpos, 4); + EXPECT_EQ(t.proc.sofar(), "aaaa"); + EXPECT_EQ(t.proc.result().get(), "aaaa"); + EXPECT_FALSE(t.proc.unfiltered_chars); + } + // set -> skip + { + TesterInplaceMid t("0123"); + t.trim_capacity(); + EXPECT_EQ(t.proc.wcap, 4); + EXPECT_EQ(t.proc.rpos, 0); + EXPECT_EQ(t.proc.wpos, 0); + EXPECT_FALSE(t.proc.unfiltered_chars); + EXPECT_EQ(t.proc.sofar(), ""); + t.proc.set('a', 4); + t.proc.skip(4); + EXPECT_EQ(t.proc.rpos, 4); + EXPECT_EQ(t.proc.wpos, 4); + EXPECT_EQ(t.proc.sofar(), "aaaa"); + EXPECT_EQ(t.proc.result().get(), "aaaa"); + EXPECT_FALSE(t.proc.unfiltered_chars); + } +} + +TEST(FilterProcessorInplaceEnd, set_bulk_does_not_unfilter) +{ + // skip -> set + { + TesterInplaceEnd t("0123"); + t.trim_capacity(); + EXPECT_EQ(t.proc.wcap, 4); + EXPECT_EQ(t.proc.rpos, 0); + EXPECT_EQ(t.proc.wpos, 0); + EXPECT_EQ(t.proc.sofar(), ""); + t.proc.skip(4); + t.proc.set('a', 4); + EXPECT_EQ(t.proc.rpos, 4); + EXPECT_EQ(t.proc.wpos, 4); + EXPECT_EQ(t.proc.sofar(), "aaaa"); + EXPECT_EQ(t.proc.result().get(), "aaaa"); + } + // set -> skip + { + TesterInplaceEnd t("0123"); + t.trim_capacity(); + EXPECT_EQ(t.proc.wcap, 4); + EXPECT_EQ(t.proc.rpos, 0); + EXPECT_EQ(t.proc.wpos, 0); + EXPECT_EQ(t.proc.sofar(), ""); + t.proc.set('a', 4); + t.proc.skip(4); + EXPECT_EQ(t.proc.rpos, 4); + EXPECT_EQ(t.proc.wpos, 4); + EXPECT_EQ(t.proc.sofar(), "aaaa"); + EXPECT_EQ(t.proc.result().get(), "aaaa"); + } +} + + +//----------------------------------------------------------------------------- + +TEST(FilterProcessorInplaceMid, copy) +{ + { + TesterInplaceMid t("subject"); + EXPECT_EQ(t.proc.rpos, 0); + EXPECT_EQ(t.proc.wpos, 0); + EXPECT_FALSE(t.proc.unfiltered_chars); + EXPECT_EQ(t.proc.sofar(), ""); + EXPECT_EQ(t.proc.result().get(), ""); + t.proc.copy(); + EXPECT_EQ(t.proc.rpos, 1); + EXPECT_EQ(t.proc.wpos, 1); + 
EXPECT_EQ(t.proc.sofar(), "s"); + EXPECT_EQ(t.proc.result().get(), "s"); + EXPECT_EQ(t.subject, "subject"); + EXPECT_FALSE(t.proc.unfiltered_chars); + t.proc.skip(); + EXPECT_EQ(t.proc.rpos, 2); + EXPECT_EQ(t.proc.wpos, 1); + EXPECT_EQ(t.proc.sofar(), "s"); + EXPECT_EQ(t.proc.result().get(), "s"); + EXPECT_EQ(t.subject, "subject"); + EXPECT_FALSE(t.proc.unfiltered_chars); + t.proc.copy(); + EXPECT_EQ(t.proc.rpos, 3); + EXPECT_EQ(t.proc.wpos, 2); + EXPECT_EQ(t.proc.sofar(), "sb"); + EXPECT_EQ(t.proc.result().get(), "sb"); + EXPECT_EQ(t.subject, "sbbject"); + EXPECT_FALSE(t.proc.unfiltered_chars); + t.proc.copy(4); + EXPECT_EQ(t.proc.rpos, 7); + EXPECT_EQ(t.proc.wpos, 6); + EXPECT_EQ(t.proc.sofar(), "sbject"); + EXPECT_EQ(t.proc.result().get(), "sbject"); + EXPECT_EQ(t.subject, "sbjectt"); + EXPECT_FALSE(t.proc.unfiltered_chars); + } + { + TesterInplaceMid t("s"); + t.trim_capacity(); + EXPECT_EQ(t.proc.wcap, 1); + t.proc.copy(); + EXPECT_EQ(t.proc.rpos, 1); + EXPECT_EQ(t.proc.wpos, 1); + EXPECT_EQ(t.proc.sofar(), "s"); + EXPECT_EQ(t.proc.result().get(), "s"); + EXPECT_EQ(t.subject, "s"); + EXPECT_FALSE(t.proc.unfiltered_chars); + } + { + TesterInplaceMid t("0"); + t.trim_capacity(); + EXPECT_EQ(t.proc.wcap, 1); + EXPECT_EQ(t.proc.rpos, 0); + EXPECT_EQ(t.proc.wpos, 0); + EXPECT_EQ(t.proc.sofar(), ""); + t.proc.copy(); + EXPECT_EQ(t.proc.rpos, 1); + EXPECT_EQ(t.proc.wpos, 1); + EXPECT_EQ(t.proc.sofar(), "0"); + ASSERT_FALSE(t.proc.unfiltered_chars); + EXPECT_EQ(t.proc.result().get(), "0"); + EXPECT_EQ(t.subject, "0"); + } + { + TesterInplaceMid t("012345"); + t.trim_capacity(); + EXPECT_EQ(t.proc.wcap, 6); + EXPECT_EQ(t.proc.rpos, 0); + EXPECT_EQ(t.proc.wpos, 0); + EXPECT_EQ(t.proc.sofar(), ""); + ASSERT_FALSE(t.proc.unfiltered_chars); + EXPECT_EQ(t.proc.result().get(), ""); + EXPECT_EQ(t.subject, "012345"); + t.proc.copy(6); + EXPECT_EQ(t.proc.rpos, 6); + EXPECT_EQ(t.proc.wpos, 6); + EXPECT_EQ(t.proc.sofar(), "012345"); + ASSERT_FALSE(t.proc.unfiltered_chars); + 
EXPECT_EQ(t.proc.result().get(), "012345"); + EXPECT_EQ(t.subject, "012345"); + } +} + +TEST(FilterProcessorInplaceEnd, copy) +{ + { + TesterInplaceEnd t("subject"); + EXPECT_EQ(t.proc.rpos, 0); + EXPECT_EQ(t.proc.wpos, 0); + EXPECT_EQ(t.proc.sofar(), ""); + EXPECT_EQ(t.proc.result().get(), ""); + t.proc.copy(); + EXPECT_EQ(t.proc.rpos, 1); + EXPECT_EQ(t.proc.wpos, 1); + EXPECT_EQ(t.proc.sofar(), "s"); + EXPECT_EQ(t.proc.result().get(), "s"); + EXPECT_EQ(t.subject, "subject"); + t.proc.skip(); + EXPECT_EQ(t.proc.rpos, 2); + EXPECT_EQ(t.proc.wpos, 1); + EXPECT_EQ(t.proc.sofar(), "s"); + EXPECT_EQ(t.proc.result().get(), "s"); + EXPECT_EQ(t.subject, "subject"); + t.proc.copy(); + EXPECT_EQ(t.proc.rpos, 3); + EXPECT_EQ(t.proc.wpos, 2); + EXPECT_EQ(t.proc.sofar(), "sb"); + EXPECT_EQ(t.proc.result().get(), "sb"); + EXPECT_EQ(t.subject, "sbbject"); + t.proc.copy(4); + EXPECT_EQ(t.proc.rpos, 7); + EXPECT_EQ(t.proc.wpos, 6); + EXPECT_EQ(t.proc.sofar(), "sbject"); + EXPECT_EQ(t.proc.result().get(), "sbject"); + EXPECT_EQ(t.subject, "sbjectt"); + } + { + TesterInplaceEnd t("s"); + t.trim_capacity(); + EXPECT_EQ(t.proc.wcap, 1); + t.proc.copy(); + EXPECT_EQ(t.proc.rpos, 1); + EXPECT_EQ(t.proc.wpos, 1); + EXPECT_EQ(t.proc.sofar(), "s"); + EXPECT_EQ(t.proc.result().get(), "s"); + EXPECT_EQ(t.subject, "s"); + } + { + TesterInplaceEnd t("0"); + t.trim_capacity(); + EXPECT_EQ(t.proc.wcap, 1); + EXPECT_EQ(t.proc.rpos, 0); + EXPECT_EQ(t.proc.wpos, 0); + EXPECT_EQ(t.proc.sofar(), ""); + t.proc.copy(); + EXPECT_EQ(t.proc.rpos, 1); + EXPECT_EQ(t.proc.wpos, 1); + EXPECT_EQ(t.proc.sofar(), "0"); + EXPECT_EQ(t.proc.result().get(), "0"); + EXPECT_EQ(t.subject, "0"); + } + { + TesterInplaceEnd t("012345"); + t.trim_capacity(); + EXPECT_EQ(t.proc.wcap, 6); + EXPECT_EQ(t.proc.rpos, 0); + EXPECT_EQ(t.proc.wpos, 0); + EXPECT_EQ(t.proc.sofar(), ""); + EXPECT_EQ(t.proc.result().get(), ""); + EXPECT_EQ(t.subject, "012345"); + t.proc.copy(6); + EXPECT_EQ(t.proc.rpos, 6); + EXPECT_EQ(t.proc.wpos, 6); 
+ EXPECT_EQ(t.proc.sofar(), "012345"); + EXPECT_EQ(t.proc.result().get(), "012345"); + EXPECT_EQ(t.subject, "012345"); + } +} + +TEST(FilterProcessorSrcDst, copy) +{ + TesterSrcDst t("subject"); + EXPECT_EQ(t.proc.rpos, 0); + EXPECT_EQ(t.proc.wpos, 0); + EXPECT_EQ(t.proc.sofar(), ""); + EXPECT_EQ(t.proc.result().get(), ""); + t.proc.copy(); + EXPECT_EQ(t.proc.rpos, 1); + EXPECT_EQ(t.proc.wpos, 1); + EXPECT_EQ(t.proc.sofar(), "s"); + EXPECT_EQ(t.proc.result().get(), "s"); + EXPECT_EQ(t.dst, "subject"); + t.proc.skip(); + EXPECT_EQ(t.proc.rpos, 2); + EXPECT_EQ(t.proc.wpos, 1); + EXPECT_EQ(t.proc.sofar(), "s"); + EXPECT_EQ(t.proc.result().get(), "s"); + EXPECT_EQ(t.dst, "subject"); + t.proc.copy(); + EXPECT_EQ(t.proc.rpos, 3); + EXPECT_EQ(t.proc.wpos, 2); + EXPECT_EQ(t.proc.sofar(), "sb"); + EXPECT_EQ(t.proc.result().get(), "sb"); + EXPECT_EQ(t.dst, "sbbject"); + t.proc.copy(4); + EXPECT_EQ(t.proc.rpos, 7); + EXPECT_EQ(t.proc.wpos, 6); + EXPECT_EQ(t.proc.sofar(), "sbject"); + EXPECT_EQ(t.proc.result().get(), "sbject"); + EXPECT_EQ(t.dst, "sbjectt"); +} + +TEST(FilterProcessorInplaceMid, copy_single_does_not_unfilter) +{ + TesterInplaceMid t("0"); + t.trim_capacity(); + EXPECT_EQ(t.proc.wcap, 1); + EXPECT_EQ(t.proc.rpos, 0); + EXPECT_EQ(t.proc.wpos, 0); + EXPECT_FALSE(t.proc.unfiltered_chars); + EXPECT_EQ(t.proc.sofar(), ""); + t.proc.copy(); + EXPECT_EQ(t.proc.rpos, 1); + EXPECT_EQ(t.proc.wpos, 1); + EXPECT_EQ(t.proc.sofar(), "0"); + EXPECT_EQ(t.proc.result().get(), "0"); + EXPECT_FALSE(t.proc.unfiltered_chars); +} + +TEST(FilterProcessorInplaceEnd, copy_single_does_not_unfilter) +{ + TesterInplaceEnd t("0"); + t.trim_capacity(); + EXPECT_EQ(t.proc.wcap, 1); + EXPECT_EQ(t.proc.rpos, 0); + EXPECT_EQ(t.proc.wpos, 0); + EXPECT_EQ(t.proc.sofar(), ""); + t.proc.copy(); + EXPECT_EQ(t.proc.rpos, 1); + EXPECT_EQ(t.proc.wpos, 1); + EXPECT_EQ(t.proc.sofar(), "0"); + EXPECT_EQ(t.proc.result().get(), "0"); +} + +TEST(FilterProcessorInplaceMid, copy_bulk_does_not_unfilter) +{ + 
TesterInplaceMid t("0123"); + t.trim_capacity(); + EXPECT_EQ(t.proc.wcap, 4); + EXPECT_EQ(t.proc.rpos, 0); + EXPECT_EQ(t.proc.wpos, 0); + EXPECT_FALSE(t.proc.unfiltered_chars); + EXPECT_EQ(t.proc.sofar(), ""); + t.proc.copy(4); + EXPECT_EQ(t.proc.rpos, 4); + EXPECT_EQ(t.proc.wpos, 4); + EXPECT_EQ(t.proc.sofar(), "0123"); + EXPECT_EQ(t.proc.result().get(), "0123"); + EXPECT_FALSE(t.proc.unfiltered_chars); +} + +TEST(FilterProcessorInplaceEnd, copy_bulk_does_not_unfilter) +{ + TesterInplaceEnd t("0123"); + t.trim_capacity(); + EXPECT_EQ(t.proc.wcap, 4); + EXPECT_EQ(t.proc.rpos, 0); + EXPECT_EQ(t.proc.wpos, 0); + EXPECT_EQ(t.proc.sofar(), ""); + t.proc.copy(4); + EXPECT_EQ(t.proc.rpos, 4); + EXPECT_EQ(t.proc.wpos, 4); + EXPECT_EQ(t.proc.sofar(), "0123"); + EXPECT_EQ(t.proc.result().get(), "0123"); +} + + +//----------------------------------------------------------------------------- + +TEST(FilterProcessorInplaceMid, translate_esc_single) +{ + TesterInplaceMid t("\\t\\b\\n\\r\\t"); + t.trim_capacity(); + EXPECT_EQ(t.proc.wcap, 10); + EXPECT_EQ(t.proc.rpos, 0); + EXPECT_EQ(t.proc.wpos, 0); + EXPECT_FALSE(t.proc.unfiltered_chars); + EXPECT_EQ(t.proc.sofar(), ""); + t.proc.translate_esc('\t'); + EXPECT_EQ(t.proc.rpos, 2); + EXPECT_EQ(t.proc.wpos, 1); + EXPECT_EQ(t.proc.sofar(), "\t"); + EXPECT_EQ(t.proc.result().get(), "\t"); + EXPECT_EQ(t.subject, "\tt\\b\\n\\r\\t"); + EXPECT_FALSE(t.proc.unfiltered_chars); + t.proc.translate_esc('\b'); + EXPECT_EQ(t.proc.rpos, 4); + EXPECT_EQ(t.proc.wpos, 2); + EXPECT_EQ(t.proc.sofar(), "\t\b"); + EXPECT_EQ(t.proc.result().get(), "\t\b"); + EXPECT_EQ(t.subject, "\t\b\\b\\n\\r\\t"); + EXPECT_FALSE(t.proc.unfiltered_chars); + t.proc.translate_esc('\n'); + EXPECT_EQ(t.proc.rpos, 6); + EXPECT_EQ(t.proc.wpos, 3); + EXPECT_EQ(t.proc.sofar(), "\t\b\n"); + EXPECT_EQ(t.proc.result().get(), "\t\b\n"); + EXPECT_EQ(t.subject, "\t\b\nb\\n\\r\\t"); + EXPECT_FALSE(t.proc.unfiltered_chars); + t.proc.translate_esc('\r'); + EXPECT_EQ(t.proc.rpos, 8); + 
EXPECT_EQ(t.proc.wpos, 4); + EXPECT_EQ(t.proc.sofar(), "\t\b\n\r"); + EXPECT_EQ(t.proc.result().get(), "\t\b\n\r"); + EXPECT_EQ(t.subject, "\t\b\n\r\\n\\r\\t"); + EXPECT_FALSE(t.proc.unfiltered_chars); + t.proc.translate_esc('\t'); + EXPECT_EQ(t.proc.rpos, 10); + EXPECT_EQ(t.proc.wpos, 5); + EXPECT_EQ(t.proc.sofar(), "\t\b\n\r\t"); + EXPECT_EQ(t.proc.result().get(), "\t\b\n\r\t"); + EXPECT_EQ(t.subject, "\t\b\n\r\tn\\r\\t"); + EXPECT_FALSE(t.proc.unfiltered_chars); + t.set_to_end(/*slack*/1); + EXPECT_EQ(t.proc.rpos, 10); + EXPECT_EQ(t.proc.wpos, 9); + EXPECT_EQ(t.subject, "\t\b\n\r\tn\\r\\t"); + EXPECT_FALSE(t.proc.unfiltered_chars); + // can write this one + t.proc.set('+'); + EXPECT_EQ(t.proc.rpos, 10); // this is fine, no read is done + EXPECT_EQ(t.proc.wpos, 10); + EXPECT_EQ(t.subject, "\t\b\n\r\tn\\r\\+"); + EXPECT_FALSE(t.proc.unfiltered_chars); + // but this one will set to unfiltered + t.proc.set('x'); + EXPECT_EQ(t.proc.rpos, 10); // this is fine, no read is done + EXPECT_EQ(t.proc.wpos, 11); + EXPECT_EQ(t.subject, "\t\b\n\r\tn\\r\\+"); + EXPECT_TRUE(t.proc.unfiltered_chars); +} + +TEST(FilterProcessorInplaceEnd, translate_esc_single) +{ + TesterInplaceEnd t("\\t\\b\\n\\r\\t"); + t.trim_capacity(); + EXPECT_EQ(t.proc.wcap, 10); + EXPECT_EQ(t.proc.rpos, 0); + EXPECT_EQ(t.proc.wpos, 0); + EXPECT_EQ(t.proc.sofar(), ""); + t.proc.translate_esc('\t'); + EXPECT_EQ(t.proc.rpos, 2); + EXPECT_EQ(t.proc.wpos, 1); + EXPECT_EQ(t.proc.sofar(), "\t"); + EXPECT_EQ(t.proc.result().get(), "\t"); + EXPECT_EQ(t.subject, "\tt\\b\\n\\r\\t"); + t.proc.translate_esc('\b'); + EXPECT_EQ(t.proc.rpos, 4); + EXPECT_EQ(t.proc.wpos, 2); + EXPECT_EQ(t.proc.sofar(), "\t\b"); + EXPECT_EQ(t.proc.result().get(), "\t\b"); + EXPECT_EQ(t.subject, "\t\b\\b\\n\\r\\t"); + t.proc.translate_esc('\n'); + EXPECT_EQ(t.proc.rpos, 6); + EXPECT_EQ(t.proc.wpos, 3); + EXPECT_EQ(t.proc.sofar(), "\t\b\n"); + EXPECT_EQ(t.proc.result().get(), "\t\b\n"); + EXPECT_EQ(t.subject, "\t\b\nb\\n\\r\\t"); + 
t.proc.translate_esc('\r'); + EXPECT_EQ(t.proc.rpos, 8); + EXPECT_EQ(t.proc.wpos, 4); + EXPECT_EQ(t.proc.sofar(), "\t\b\n\r"); + EXPECT_EQ(t.proc.result().get(), "\t\b\n\r"); + EXPECT_EQ(t.subject, "\t\b\n\r\\n\\r\\t"); + t.proc.translate_esc('\t'); + EXPECT_EQ(t.proc.rpos, 10); + EXPECT_EQ(t.proc.wpos, 5); + EXPECT_EQ(t.proc.sofar(), "\t\b\n\r\t"); + EXPECT_EQ(t.proc.result().get(), "\t\b\n\r\t"); + EXPECT_EQ(t.subject, "\t\b\n\r\tn\\r\\t"); + t.set_to_end(/*slack*/1); + EXPECT_EQ(t.proc.rpos, 10); + EXPECT_EQ(t.proc.wpos, 9); + EXPECT_EQ(t.subject, "\t\b\n\r\tn\\r\\t"); + // can write this one + t.proc.set('+'); + EXPECT_EQ(t.proc.rpos, 10); // this is fine, no read is done + EXPECT_EQ(t.proc.wpos, 10); + EXPECT_EQ(t.subject, "\t\b\n\r\tn\\r\\+"); + // but this one will set to unfiltered + t.proc.set('x'); + EXPECT_EQ(t.proc.rpos, 10); // this is fine, no read is done + EXPECT_EQ(t.proc.wpos, 11); + EXPECT_EQ(t.subject, "\t\b\n\r\tn\\r\\+"); +} + +TEST(FilterProcessorSrcDst, translate_esc_single) +{ + TesterSrcDst t("\\t\\b\\n\\r\\t"); + EXPECT_EQ(t.proc.rpos, 0); + EXPECT_EQ(t.proc.wpos, 0); + EXPECT_EQ(t.proc.sofar(), ""); + t.proc.translate_esc('\t'); + EXPECT_EQ(t.proc.rpos, 2); + EXPECT_EQ(t.proc.wpos, 1); + EXPECT_EQ(t.proc.sofar(), "\t"); + EXPECT_EQ(t.proc.result().get(), "\t"); + t.proc.translate_esc('\b'); + EXPECT_EQ(t.proc.rpos, 4); + EXPECT_EQ(t.proc.wpos, 2); + EXPECT_EQ(t.proc.sofar(), "\t\b"); + EXPECT_EQ(t.proc.result().get(), "\t\b"); + t.proc.translate_esc('\n'); + EXPECT_EQ(t.proc.rpos, 6); + EXPECT_EQ(t.proc.wpos, 3); + EXPECT_EQ(t.proc.sofar(), "\t\b\n"); + EXPECT_EQ(t.proc.result().get(), "\t\b\n"); + t.proc.translate_esc('\r'); + EXPECT_EQ(t.proc.rpos, 8); + EXPECT_EQ(t.proc.wpos, 4); + EXPECT_EQ(t.proc.sofar(), "\t\b\n\r"); + EXPECT_EQ(t.proc.result().get(), "\t\b\n\r"); + t.proc.translate_esc('\t'); + EXPECT_EQ(t.proc.rpos, 10); + EXPECT_EQ(t.proc.wpos, 5); + EXPECT_EQ(t.proc.sofar(), "\t\b\n\r\t"); + EXPECT_EQ(t.proc.result().get(), 
"\t\b\n\r\t"); + t.set_to_end(/*slack*/1); + EXPECT_EQ(t.proc.rpos, 10); + EXPECT_EQ(t.proc.wpos, 9); + // can write this one + t.proc.translate_esc('+'); + EXPECT_EQ(t.proc.rpos, 12); // this is fine, no read is done + EXPECT_EQ(t.proc.wpos, 10); + // but this one will set to unfiltered + t.proc.translate_esc('x'); + EXPECT_EQ(t.proc.rpos, 14); // this is fine, no read is done + EXPECT_EQ(t.proc.wpos, 11); +} + + +//----------------------------------------------------------------------------- + +TEST(FilterProcessorInplaceMid, translate_esc_bulk) +{ + TesterInplaceMid t("0011223344"); + t.trim_capacity(); + EXPECT_EQ(t.proc.wcap, 10); + EXPECT_EQ(t.proc.rpos, 0); + EXPECT_EQ(t.proc.wpos, 0); + EXPECT_FALSE(t.proc.unfiltered_chars); + EXPECT_EQ(t.proc.sofar(), ""); + t.proc.translate_esc_bulk("aa", /*nw*/2, /*nr*/1); + EXPECT_EQ(t.proc.rpos, 2); + EXPECT_EQ(t.proc.wpos, 2); + EXPECT_EQ(t.proc.sofar(), "aa"); + EXPECT_EQ(t.proc.result().get(), "aa"); + EXPECT_EQ(t.subject, "aa11223344"); + EXPECT_FALSE(t.proc.unfiltered_chars); + t.proc.translate_esc_bulk("bb", /*nw*/2, /*nr*/1); + EXPECT_EQ(t.proc.rpos, 4); + EXPECT_EQ(t.proc.wpos, 4); + EXPECT_EQ(t.proc.sofar(), "aabb"); + EXPECT_EQ(t.proc.result().get(), "aabb"); + EXPECT_EQ(t.subject, "aabb223344"); + EXPECT_FALSE(t.proc.unfiltered_chars); + t.proc.translate_esc_bulk("cc", /*nw*/2, /*nr*/1); + EXPECT_EQ(t.proc.rpos, 6); + EXPECT_EQ(t.proc.wpos, 6); + EXPECT_EQ(t.proc.sofar(), "aabbcc"); + EXPECT_EQ(t.proc.result().get(), "aabbcc"); + EXPECT_EQ(t.subject, "aabbcc3344"); + EXPECT_FALSE(t.proc.unfiltered_chars); + t.proc.translate_esc_bulk("dd", /*nw*/2, /*nr*/1); + EXPECT_EQ(t.proc.rpos, 8); + EXPECT_EQ(t.proc.wpos, 8); + EXPECT_EQ(t.proc.sofar(), "aabbccdd"); + EXPECT_EQ(t.proc.result().get(), "aabbccdd"); + EXPECT_EQ(t.subject, "aabbccdd44"); + EXPECT_FALSE(t.proc.unfiltered_chars); + t.proc.translate_esc_bulk("ee", /*nw*/2, /*nr*/1); + EXPECT_EQ(t.proc.rpos, 10); + EXPECT_EQ(t.proc.wpos, 10); + 
EXPECT_EQ(t.proc.sofar(), "aabbccddee"); + EXPECT_EQ(t.proc.result().get(), "aabbccddee"); + EXPECT_EQ(t.subject, "aabbccddee"); + EXPECT_FALSE(t.proc.unfiltered_chars); +} + +TEST(FilterProcessorInplaceEnd, translate_esc_bulk) +{ + TesterInplaceEnd t("0011223344"); + t.trim_capacity(); + EXPECT_EQ(t.proc.wcap, 10); + EXPECT_EQ(t.proc.rpos, 0); + EXPECT_EQ(t.proc.wpos, 0); + EXPECT_EQ(t.proc.sofar(), ""); + t.proc.translate_esc_bulk("aa", /*nw*/2, /*nr*/1); + EXPECT_EQ(t.proc.rpos, 2); + EXPECT_EQ(t.proc.wpos, 2); + EXPECT_EQ(t.proc.sofar(), "aa"); + EXPECT_EQ(t.proc.result().get(), "aa"); + EXPECT_EQ(t.subject, "aa11223344"); + t.proc.translate_esc_bulk("bb", /*nw*/2, /*nr*/1); + EXPECT_EQ(t.proc.rpos, 4); + EXPECT_EQ(t.proc.wpos, 4); + EXPECT_EQ(t.proc.sofar(), "aabb"); + EXPECT_EQ(t.proc.result().get(), "aabb"); + EXPECT_EQ(t.subject, "aabb223344"); + t.proc.translate_esc_bulk("cc", /*nw*/2, /*nr*/1); + EXPECT_EQ(t.proc.rpos, 6); + EXPECT_EQ(t.proc.wpos, 6); + EXPECT_EQ(t.proc.sofar(), "aabbcc"); + EXPECT_EQ(t.proc.result().get(), "aabbcc"); + EXPECT_EQ(t.subject, "aabbcc3344"); + t.proc.translate_esc_bulk("dd", /*nw*/2, /*nr*/1); + EXPECT_EQ(t.proc.rpos, 8); + EXPECT_EQ(t.proc.wpos, 8); + EXPECT_EQ(t.proc.sofar(), "aabbccdd"); + EXPECT_EQ(t.proc.result().get(), "aabbccdd"); + EXPECT_EQ(t.subject, "aabbccdd44"); + t.proc.translate_esc_bulk("ee", /*nw*/2, /*nr*/1); + EXPECT_EQ(t.proc.rpos, 10); + EXPECT_EQ(t.proc.wpos, 10); + EXPECT_EQ(t.proc.sofar(), "aabbccddee"); + EXPECT_EQ(t.proc.result().get(), "aabbccddee"); + EXPECT_EQ(t.subject, "aabbccddee"); +} + +TEST(FilterProcessorSrcDst, translate_esc_bulk) +{ + TesterSrcDst t("0011223344"); + EXPECT_EQ(t.proc.rpos, 0); + EXPECT_EQ(t.proc.wpos, 0); + EXPECT_EQ(t.proc.sofar(), ""); + t.proc.translate_esc_bulk("aa", /*nw*/2, /*nr*/1); + EXPECT_EQ(t.proc.rpos, 2); + EXPECT_EQ(t.proc.wpos, 2); + EXPECT_EQ(t.proc.sofar(), "aa"); + EXPECT_EQ(t.proc.result().get(), "aa"); + t.proc.translate_esc_bulk("bb", /*nw*/2, 
/*nr*/1); + EXPECT_EQ(t.proc.rpos, 4); + EXPECT_EQ(t.proc.wpos, 4); + EXPECT_EQ(t.proc.sofar(), "aabb"); + EXPECT_EQ(t.proc.result().get(), "aabb"); + t.proc.translate_esc_bulk("cc", /*nw*/2, /*nr*/1); + EXPECT_EQ(t.proc.rpos, 6); + EXPECT_EQ(t.proc.wpos, 6); + EXPECT_EQ(t.proc.sofar(), "aabbcc"); + EXPECT_EQ(t.proc.result().get(), "aabbcc"); + t.proc.translate_esc_bulk("dd", /*nw*/2, /*nr*/1); + EXPECT_EQ(t.proc.rpos, 8); + EXPECT_EQ(t.proc.wpos, 8); + EXPECT_EQ(t.proc.sofar(), "aabbccdd"); + EXPECT_EQ(t.proc.result().get(), "aabbccdd"); + t.proc.translate_esc_bulk("ee", /*nw*/2, /*nr*/1); + EXPECT_EQ(t.proc.rpos, 10); + EXPECT_EQ(t.proc.wpos, 10); + EXPECT_EQ(t.proc.sofar(), "aabbccddee"); + EXPECT_EQ(t.proc.result().get(), "aabbccddee"); +} + + +//----------------------------------------------------------------------------- + +TEST(FilterProcessorInplaceMid, translate_esc_extending_bulk_excess__trimmed_capacity) +{ + TesterInplaceMid t("0011223344"); + t.trim_capacity(); + EXPECT_EQ(t.proc.wcap, 10); + EXPECT_EQ(t.proc.rpos, 0); + EXPECT_EQ(t.proc.wpos, 0); + EXPECT_EQ(t.proc.src.len, 10); + EXPECT_FALSE(t.proc.unfiltered_chars); + EXPECT_EQ(t.proc.sofar(), ""); + t.proc.translate_esc_extending("aaa", /*nw*/3, /*nr*/1); + EXPECT_EQ(t.proc.rpos, 2); + EXPECT_EQ(t.proc.wpos, 3); + EXPECT_EQ(t.proc.src.len, 10); + EXPECT_EQ(t.proc.sofar(), "001"); + EXPECT_EQ(t.proc.result().str, nullptr); + EXPECT_EQ(t.subject, "0011223344"); + EXPECT_TRUE(t.proc.unfiltered_chars); + t.proc.translate_esc_extending("bbb", /*nw*/3, /*nr*/1); + EXPECT_EQ(t.proc.rpos, 4); + EXPECT_EQ(t.proc.wpos, 6); + EXPECT_EQ(t.proc.src.len, 10); + EXPECT_EQ(t.proc.sofar(), "001122"); + EXPECT_EQ(t.proc.result().str, nullptr); + EXPECT_EQ(t.subject, "0011223344"); + EXPECT_TRUE(t.proc.unfiltered_chars); + t.proc.translate_esc_extending("ccc", /*nw*/3, /*nr*/1); + EXPECT_EQ(t.proc.rpos, 6); + EXPECT_EQ(t.proc.wpos, 9); + EXPECT_EQ(t.proc.src.len, 10); + EXPECT_EQ(t.proc.sofar(), "001122334"); + 
EXPECT_EQ(t.proc.result().str, nullptr); + EXPECT_EQ(t.subject, "0011223344"); + EXPECT_TRUE(t.proc.unfiltered_chars); + t.proc.translate_esc_extending("ddd", /*nw*/3, /*nr*/1); + EXPECT_EQ(t.proc.rpos, 8); + EXPECT_EQ(t.proc.wpos, 12); + EXPECT_EQ(t.proc.src.len, 10); + EXPECT_EQ(t.proc.sofar(), "0011223344"); + EXPECT_EQ(t.proc.result().str, nullptr); + EXPECT_EQ(t.subject, "0011223344"); + EXPECT_TRUE(t.proc.unfiltered_chars); + // write 4 characters + t.proc.translate_esc_extending("cccc", /*nw*/4, /*nr*/1); + EXPECT_EQ(t.proc.rpos, 10); + EXPECT_EQ(t.proc.wpos, 16); + EXPECT_EQ(t.proc.src.len, 10); + EXPECT_EQ(t.proc.sofar(), "0011223344"); + EXPECT_EQ(t.proc.result().str, nullptr); + EXPECT_EQ(t.subject, "0011223344"); + EXPECT_TRUE(t.proc.unfiltered_chars); +} + +TEST(FilterProcessorInplaceMid, translate_esc_extending_bulk_excess__spare_capacity) +{ + TesterInplaceMid t("0011223344"); + const size_t needed_capacity = 16u; + const size_t smaller_capacity = 14u; + ASSERT_LT(smaller_capacity, needed_capacity); + t.set_capacity(smaller_capacity); + ASSERT_GE(t.buf.capacity(), smaller_capacity); + const substr full_subject = {&t.buf[0], smaller_capacity}; + full_subject.sub(t.buf.size()).fill('^'); + EXPECT_EQ(t.proc.wcap, smaller_capacity); + EXPECT_EQ(t.proc.rpos, 0); + EXPECT_EQ(t.proc.wpos, 0); + EXPECT_EQ(t.proc.src.len, 10); + EXPECT_EQ(full_subject, "0011223344^^^^"); + EXPECT_FALSE(t.proc.unfiltered_chars); + EXPECT_EQ(t.proc.sofar(), ""); + t.proc.translate_esc_extending("aaa", /*nw*/3, /*nr*/1); + EXPECT_EQ(t.proc.rpos, 3); + EXPECT_EQ(t.proc.wpos, 3); + EXPECT_EQ(t.proc.src.len, 11); + EXPECT_EQ(t.proc.sofar(), "aaa"); + EXPECT_EQ(t.proc.result().get(), "aaa"); + EXPECT_EQ(full_subject, "aaa11223344^^^"); + EXPECT_FALSE(t.proc.unfiltered_chars); + t.proc.translate_esc_extending("bbb", /*nw*/3, /*nr*/1); + EXPECT_EQ(t.proc.rpos, 6); + EXPECT_EQ(t.proc.wpos, 6); + EXPECT_EQ(t.proc.src.len, 12); + EXPECT_EQ(t.proc.sofar(), "aaabbb"); + 
EXPECT_EQ(t.proc.result().get(), "aaabbb"); + EXPECT_EQ(full_subject, "aaabbb223344^^"); + EXPECT_FALSE(t.proc.unfiltered_chars); + t.proc.translate_esc_extending("ccc", /*nw*/3, /*nr*/1); + EXPECT_EQ(t.proc.rpos, 9); + EXPECT_EQ(t.proc.wpos, 9); + EXPECT_EQ(t.proc.src.len, 13); + EXPECT_EQ(t.proc.sofar(), "aaabbbccc"); + EXPECT_EQ(t.proc.result().get(), "aaabbbccc"); + EXPECT_EQ(full_subject, "aaabbbccc3344^"); + EXPECT_FALSE(t.proc.unfiltered_chars); + t.proc.translate_esc_extending("ddd", /*nw*/3, /*nr*/1); + EXPECT_EQ(t.proc.rpos, 12); + EXPECT_EQ(t.proc.wpos, 12); + EXPECT_EQ(t.proc.src.len, 14); + EXPECT_EQ(t.proc.sofar(), "aaabbbcccddd"); + EXPECT_EQ(t.proc.result().get(), "aaabbbcccddd"); + EXPECT_EQ(full_subject, "aaabbbcccddd44"); + EXPECT_FALSE(t.proc.unfiltered_chars); + t.proc.translate_esc_extending("eeee", /*nw*/4, /*nr*/1); + EXPECT_EQ(t.proc.rpos, 14); + EXPECT_EQ(t.proc.wpos, 16); + EXPECT_EQ(t.proc.src.len, smaller_capacity); // not 16! limited + EXPECT_EQ(t.proc.sofar(), "aaabbbcccddd44"); + EXPECT_EQ(t.proc.result().str, nullptr); + EXPECT_EQ(full_subject, "aaabbbcccddd44"); + EXPECT_TRUE(t.proc.unfiltered_chars); +} + +TEST(FilterProcessorInplaceMid, copy_after_translate_esc_extending_bulk_excess__trimmed_capacity) +{ + { + TesterInplaceMid t("0011223344"); + t.trim_capacity(); + EXPECT_EQ(t.proc.wcap, 10); + EXPECT_EQ(t.proc.rpos, 0); + EXPECT_EQ(t.proc.wpos, 0); + EXPECT_EQ(t.proc.src.len, 10); + EXPECT_FALSE(t.proc.unfiltered_chars); + EXPECT_EQ(t.proc.sofar(), ""); + t.proc.translate_esc_extending("aaaa", /*nw*/4, /*nr*/1); + EXPECT_EQ(t.proc.rpos, 2); + EXPECT_EQ(t.proc.wpos, 4); + EXPECT_EQ(t.proc.src.len, 10); + EXPECT_EQ(t.proc.sofar(), "0011"); + EXPECT_EQ(t.proc.result().str, nullptr); + EXPECT_EQ(t.subject, "0011223344"); + EXPECT_TRUE(t.proc.unfiltered_chars); + t.proc.copy(); // do not write! 
+ EXPECT_EQ(t.proc.rpos, 3); + EXPECT_EQ(t.proc.wpos, 5); + EXPECT_EQ(t.proc.src.len, 10); + EXPECT_EQ(t.proc.sofar(), "00112"); // must not copy 1. + EXPECT_EQ(t.proc.result().str, nullptr); + EXPECT_EQ(t.subject, "0011223344"); + EXPECT_TRUE(t.proc.unfiltered_chars); + } + { + TesterInplaceMid t("0011223344"); + t.trim_capacity(); + EXPECT_EQ(t.proc.wcap, 10); + EXPECT_EQ(t.proc.rpos, 0); + EXPECT_EQ(t.proc.wpos, 0); + EXPECT_EQ(t.proc.src.len, 10); + EXPECT_FALSE(t.proc.unfiltered_chars); + EXPECT_EQ(t.proc.sofar(), ""); + t.proc.translate_esc_extending("aaaa", /*nw*/4, /*nr*/1); + EXPECT_EQ(t.proc.rpos, 2); + EXPECT_EQ(t.proc.wpos, 4); + EXPECT_EQ(t.proc.src.len, 10); + EXPECT_EQ(t.proc.sofar(), "0011"); + EXPECT_EQ(t.proc.result().str, nullptr); + EXPECT_EQ(t.subject, "0011223344"); + EXPECT_TRUE(t.proc.unfiltered_chars); + t.proc.copy(2); + EXPECT_EQ(t.proc.rpos, 4); + EXPECT_EQ(t.proc.wpos, 6); + EXPECT_EQ(t.proc.src.len, 10); + EXPECT_EQ(t.proc.sofar(), "001122"); // must not copy 11. + EXPECT_EQ(t.proc.result().str, nullptr); + EXPECT_EQ(t.subject, "0011223344"); + EXPECT_TRUE(t.proc.unfiltered_chars); + } +} + +TEST(FilterProcessorInplaceMid, set_after_translate_esc_extending_bulk_excess__trimmed_capacity) +{ + { + TesterInplaceMid t("0011223344"); + t.trim_capacity(); + EXPECT_EQ(t.proc.wcap, 10); + EXPECT_EQ(t.proc.rpos, 0); + EXPECT_EQ(t.proc.wpos, 0); + EXPECT_EQ(t.proc.src.len, 10); + EXPECT_FALSE(t.proc.unfiltered_chars); + EXPECT_EQ(t.proc.sofar(), ""); + t.proc.translate_esc_extending("aaaa", /*nw*/4, /*nr*/1); + EXPECT_EQ(t.proc.rpos, 2); + EXPECT_EQ(t.proc.wpos, 4); + EXPECT_EQ(t.proc.src.len, 10); + EXPECT_EQ(t.proc.sofar(), "0011"); + EXPECT_EQ(t.proc.result().str, nullptr); + EXPECT_EQ(t.subject, "0011223344"); + EXPECT_TRUE(t.proc.unfiltered_chars); + t.proc.set('!'); // do not write! + EXPECT_EQ(t.proc.rpos, 2); + EXPECT_EQ(t.proc.wpos, 5); + EXPECT_EQ(t.proc.src.len, 10); + EXPECT_EQ(t.proc.sofar(), "00112"); // must not set '!' 
+ EXPECT_EQ(t.proc.result().str, nullptr); + EXPECT_EQ(t.subject, "0011223344"); + EXPECT_TRUE(t.proc.unfiltered_chars); + } + { + TesterInplaceMid t("0011223344"); + t.trim_capacity(); + EXPECT_EQ(t.proc.wcap, 10); + EXPECT_EQ(t.proc.rpos, 0); + EXPECT_EQ(t.proc.wpos, 0); + EXPECT_EQ(t.proc.src.len, 10); + EXPECT_FALSE(t.proc.unfiltered_chars); + EXPECT_EQ(t.proc.sofar(), ""); + t.proc.translate_esc_extending("aaaa", /*nw*/4, /*nr*/1); + EXPECT_EQ(t.proc.rpos, 2); + EXPECT_EQ(t.proc.wpos, 4); + EXPECT_EQ(t.proc.src.len, 10); + EXPECT_EQ(t.proc.sofar(), "0011"); + EXPECT_EQ(t.proc.result().str, nullptr); + EXPECT_EQ(t.subject, "0011223344"); + EXPECT_TRUE(t.proc.unfiltered_chars); + t.proc.set('!', 2); + EXPECT_EQ(t.proc.rpos, 2); + EXPECT_EQ(t.proc.wpos, 6); + EXPECT_EQ(t.proc.src.len, 10); + EXPECT_EQ(t.proc.sofar(), "001122"); // must not set '!' + EXPECT_EQ(t.proc.result().str, nullptr); + EXPECT_EQ(t.subject, "0011223344"); + EXPECT_TRUE(t.proc.unfiltered_chars); + } +} + +TEST(FilterProcessorInplaceMid, translate_esc_extending_with_temporary_excess_requirement__trimmed_capacity) +{ + TesterInplaceMid t("00112233445566"); + t.trim_capacity(); + EXPECT_EQ(t.proc.wcap, 14); + EXPECT_EQ(t.proc.rpos, 0); + EXPECT_EQ(t.proc.wpos, 0); + EXPECT_EQ(t.proc.src.len, 14); + EXPECT_FALSE(t.proc.unfiltered_chars); + EXPECT_EQ(t.proc.sofar(), ""); + EXPECT_EQ(t.proc.maxcap, 14); + // 00112233445566 + // ^ (rpos) + // ^ (wpos) + t.proc.translate_esc_extending("aaaa", /*nw*/4, /*nr*/1); + EXPECT_EQ(t.proc.rpos, 2); + EXPECT_EQ(t.proc.wpos, 4); + EXPECT_EQ(t.proc.src.len, 14); + EXPECT_EQ(t.proc.sofar(), "0011"); + EXPECT_EQ(t.proc.result().str.str, nullptr); + EXPECT_EQ(t.proc.result().str.len, 4); + EXPECT_EQ(t.proc.result().required_len(), 16); + EXPECT_EQ(t.subject, "00112233445566"); + EXPECT_EQ(t.proc.maxcap, 16); // increased! + EXPECT_TRUE(t.proc.unfiltered_chars); + // 00112233445566 + // ^ (rpos) + // ^ (wpos) + t.proc.translate_esc('b'); // do not write! 
+ EXPECT_EQ(t.proc.rpos, 4); + EXPECT_EQ(t.proc.wpos, 5); + EXPECT_EQ(t.proc.src.len, 14); + EXPECT_EQ(t.proc.sofar(), "00112"); // must not set 'b' + EXPECT_EQ(t.proc.result().str.str, nullptr); + EXPECT_EQ(t.proc.result().str.len, 5); + EXPECT_EQ(t.proc.result().required_len(), 16); + EXPECT_EQ(t.subject, "00112233445566"); + EXPECT_EQ(t.proc.maxcap, 16); + EXPECT_TRUE(t.proc.unfiltered_chars); + // 00112233445566 + // ^ (rpos) + // ^ (wpos) + t.proc.translate_esc('c'); // do not write! + EXPECT_EQ(t.proc.rpos, 6); + EXPECT_EQ(t.proc.wpos, 6); + EXPECT_EQ(t.proc.src.len, 14); + EXPECT_EQ(t.proc.sofar(), "001122"); // must not set 'c' + EXPECT_EQ(t.proc.result().str.str, nullptr); + EXPECT_EQ(t.proc.result().str.len, 6); + EXPECT_EQ(t.proc.result().required_len(), 16); + EXPECT_EQ(t.subject, "00112233445566"); + EXPECT_EQ(t.proc.maxcap, 16); + EXPECT_TRUE(t.proc.unfiltered_chars); + // 00112233445566 + // ^ (rpos) + // ^ (wpos) + t.proc.translate_esc('d'); // can write + EXPECT_EQ(t.proc.rpos, 8); + EXPECT_EQ(t.proc.wpos, 7); + EXPECT_EQ(t.proc.src.len, 14); + EXPECT_EQ(t.proc.sofar(), "0011223"); + EXPECT_EQ(t.proc.result().str.str, nullptr); + EXPECT_EQ(t.proc.result().str.len, 7); + EXPECT_EQ(t.proc.result().required_len(), 16); + EXPECT_EQ(t.subject, "00112233445566"); + EXPECT_EQ(t.proc.maxcap, 16); + EXPECT_TRUE(t.proc.unfiltered_chars); + // 00112233445566 + // ^ (rpos) + // ^ (wpos) + t.proc.translate_esc('e'); // can write + EXPECT_EQ(t.proc.rpos, 10); + EXPECT_EQ(t.proc.wpos, 8); + EXPECT_EQ(t.proc.src.len, 14); + EXPECT_EQ(t.proc.sofar(), "00112233"); // can set because now wpos < rpos + EXPECT_EQ(t.proc.result().str.str, nullptr); + EXPECT_EQ(t.proc.result().str.len, 8); + EXPECT_EQ(t.proc.result().required_len(), 16); + EXPECT_EQ(t.subject, "00112233445566"); + EXPECT_EQ(t.proc.maxcap, 16); + EXPECT_TRUE(t.proc.unfiltered_chars); + // 00112233445566 + // ^ (rpos) + // ^ (wpos) + t.proc.translate_esc('f'); // can write + EXPECT_EQ(t.proc.rpos, 12); + 
EXPECT_EQ(t.proc.wpos, 9); + EXPECT_EQ(t.proc.src.len, 14); + EXPECT_EQ(t.proc.sofar(), "001122334"); // can set because now wpos < rpos + EXPECT_EQ(t.proc.result().str.str, nullptr); + EXPECT_EQ(t.proc.result().str.len, 9); + EXPECT_EQ(t.proc.result().required_len(), 16); + EXPECT_EQ(t.subject, "00112233445566"); + EXPECT_EQ(t.proc.maxcap, 16); + EXPECT_TRUE(t.proc.unfiltered_chars); + // 00112233445566 + // ^ (rpos) + // ^ (wpos) + t.proc.translate_esc('g'); // can write + EXPECT_EQ(t.proc.rpos, 14); + EXPECT_EQ(t.proc.wpos, 10); + EXPECT_EQ(t.proc.src.len, 14); + EXPECT_EQ(t.proc.sofar(), "0011223344"); // can set because now wpos < rpos + EXPECT_EQ(t.proc.result().str.str, nullptr); + EXPECT_EQ(t.proc.result().str.len, 10); + EXPECT_EQ(t.proc.result().required_len(), 16); + EXPECT_EQ(t.subject, "00112233445566"); + EXPECT_EQ(t.proc.maxcap, 16); + EXPECT_TRUE(t.proc.unfiltered_chars); + // 00112233445566 + // ^ (rpos) + // ^ (wpos) + t.proc.set('h'); // can write + EXPECT_EQ(t.proc.rpos, 14); + EXPECT_EQ(t.proc.wpos, 11); + EXPECT_EQ(t.proc.src.len, 14); + EXPECT_EQ(t.proc.sofar(), "00112233445"); + EXPECT_EQ(t.proc.result().str.str, nullptr); + EXPECT_EQ(t.proc.result().str.len, 11); + EXPECT_EQ(t.proc.result().required_len(), 16); + EXPECT_EQ(t.subject, "00112233445566"); + EXPECT_EQ(t.proc.maxcap, 16); + EXPECT_TRUE(t.proc.unfiltered_chars); + // 00112233445566 + // ^ (rpos) + // ^ (wpos) + t.proc.set('i'); // can write + EXPECT_EQ(t.proc.rpos, 14); + EXPECT_EQ(t.proc.wpos, 12); + EXPECT_EQ(t.proc.src.len, 14); + EXPECT_EQ(t.proc.sofar(), "001122334455"); + EXPECT_EQ(t.proc.result().str.str, nullptr); + EXPECT_EQ(t.proc.result().str.len, 12); + EXPECT_EQ(t.proc.result().required_len(), 16); + EXPECT_EQ(t.subject, "00112233445566"); + EXPECT_EQ(t.proc.maxcap, 16); + EXPECT_TRUE(t.proc.unfiltered_chars); + // 00112233445566 + // ^ (rpos) + // ^ (wpos) + t.proc.set('j'); // can write + EXPECT_EQ(t.proc.rpos, 14); + EXPECT_EQ(t.proc.wpos, 13); + 
EXPECT_EQ(t.proc.src.len, 14); + EXPECT_EQ(t.proc.sofar(), "0011223344556"); + EXPECT_EQ(t.proc.result().str.str, nullptr); + EXPECT_EQ(t.proc.result().str.len, 13); + EXPECT_EQ(t.proc.result().required_len(), 16); + EXPECT_EQ(t.subject, "00112233445566"); + EXPECT_EQ(t.proc.maxcap, 16); + EXPECT_TRUE(t.proc.unfiltered_chars); + // 00112233445566 + // ^ (rpos) + // ^ (wpos) + t.proc.set('k'); // can write + EXPECT_EQ(t.proc.rpos, 14); + EXPECT_EQ(t.proc.wpos, 14); + EXPECT_EQ(t.proc.src.len, 14); + EXPECT_EQ(t.proc.sofar(), "00112233445566"); + EXPECT_EQ(t.proc.result().str.str, nullptr); + EXPECT_EQ(t.proc.result().str.len, 14); + EXPECT_EQ(t.proc.result().required_len(), 16); + EXPECT_EQ(t.subject, "00112233445566"); + EXPECT_EQ(t.proc.maxcap, 16); + EXPECT_TRUE(t.proc.unfiltered_chars); + // 00112233445566 + // ^ (rpos) + // ^ (wpos) + t.proc.set('!'); // cannot write + EXPECT_EQ(t.proc.rpos, 14); + EXPECT_EQ(t.proc.wpos, 15); + EXPECT_EQ(t.proc.src.len, 14); + EXPECT_EQ(t.proc.sofar(), "00112233445566"); + EXPECT_EQ(t.proc.result().str.str, nullptr); + EXPECT_EQ(t.proc.result().str.len, 15); + EXPECT_EQ(t.proc.result().required_len(), 16); + EXPECT_EQ(t.subject, "00112233445566"); + EXPECT_EQ(t.proc.maxcap, 16); + EXPECT_TRUE(t.proc.unfiltered_chars); + // 00112233445566 + // ^ (rpos) + // ^ (wpos) + t.proc.set('!'); // cannot write + EXPECT_EQ(t.proc.rpos, 14); + EXPECT_EQ(t.proc.wpos, 16); + EXPECT_EQ(t.proc.src.len, 14); + EXPECT_EQ(t.proc.sofar(), "00112233445566"); + EXPECT_EQ(t.proc.result().str.str, nullptr); + EXPECT_EQ(t.proc.result().str.len, 16); + EXPECT_EQ(t.proc.result().required_len(), 16); + EXPECT_EQ(t.subject, "00112233445566"); + EXPECT_EQ(t.proc.maxcap, 16); + EXPECT_TRUE(t.proc.unfiltered_chars); + // 00112233445566 + // ^ (rpos) + // ^ (wpos) + t.proc.set('!'); // cannot write + EXPECT_EQ(t.proc.rpos, 14); + EXPECT_EQ(t.proc.wpos, 17); + EXPECT_EQ(t.proc.src.len, 14); + EXPECT_EQ(t.proc.sofar(), "00112233445566"); + 
EXPECT_EQ(t.proc.result().str.str, nullptr); + EXPECT_EQ(t.proc.result().str.len, 17); + EXPECT_EQ(t.proc.result().required_len(), 17); // increased! + EXPECT_EQ(t.subject, "00112233445566"); + EXPECT_EQ(t.proc.maxcap, 17); + EXPECT_TRUE(t.proc.unfiltered_chars); +} + +TEST(FilterProcessorInplaceMid, translate_esc_extending_after_translate_esc_extending_bulk_excess__trimmed_capacity) +{ + { + TesterInplaceMid t("0011223344"); + t.trim_capacity(); + EXPECT_EQ(t.proc.wcap, 10); + EXPECT_EQ(t.proc.rpos, 0); + EXPECT_EQ(t.proc.wpos, 0); + EXPECT_EQ(t.proc.src.len, 10); + EXPECT_FALSE(t.proc.unfiltered_chars); + EXPECT_EQ(t.proc.sofar(), ""); + t.proc.translate_esc_extending("aaaa", /*nw*/4, /*nr*/1); + EXPECT_EQ(t.proc.rpos, 2); + EXPECT_EQ(t.proc.wpos, 4); + EXPECT_EQ(t.proc.src.len, 10); + EXPECT_EQ(t.proc.sofar(), "0011"); + EXPECT_EQ(t.proc.result().str, nullptr); + EXPECT_EQ(t.subject, "0011223344"); + EXPECT_TRUE(t.proc.unfiltered_chars); + t.proc.translate_esc('!'); // do not write! + EXPECT_EQ(t.proc.rpos, 4); + EXPECT_EQ(t.proc.wpos, 5); + EXPECT_EQ(t.proc.src.len, 10); + EXPECT_EQ(t.proc.sofar(), "00112"); // must not set '!' 
+ EXPECT_EQ(t.proc.result().str, nullptr); + EXPECT_EQ(t.subject, "0011223344"); + EXPECT_TRUE(t.proc.unfiltered_chars); + } + { + TesterInplaceMid t("0011223344"); + t.trim_capacity(); + EXPECT_EQ(t.proc.wcap, 10); + EXPECT_EQ(t.proc.rpos, 0); + EXPECT_EQ(t.proc.wpos, 0); + EXPECT_EQ(t.proc.src.len, 10); + EXPECT_FALSE(t.proc.unfiltered_chars); + EXPECT_EQ(t.proc.sofar(), ""); + t.proc.translate_esc_extending("aaaa", /*nw*/4, /*nr*/1); + EXPECT_EQ(t.proc.rpos, 2); + EXPECT_EQ(t.proc.wpos, 4); + EXPECT_EQ(t.proc.src.len, 10); + EXPECT_EQ(t.proc.sofar(), "0011"); + EXPECT_EQ(t.proc.result().str, nullptr); + EXPECT_EQ(t.subject, "0011223344"); + EXPECT_TRUE(t.proc.unfiltered_chars); + t.proc.translate_esc_bulk("!!", /*nw*/2, /*nr*/1); + EXPECT_EQ(t.proc.rpos, 4); + EXPECT_EQ(t.proc.wpos, 6); + EXPECT_EQ(t.proc.src.len, 10); + EXPECT_EQ(t.proc.sofar(), "001122"); // must not set '!' + EXPECT_EQ(t.proc.result().str, nullptr); + EXPECT_EQ(t.subject, "0011223344"); + EXPECT_TRUE(t.proc.unfiltered_chars); + } +} + + +//----------------------------------------------------------------------------- + +TEST(FilterProcessorSrcDst, translate_esc_extending_bulk_excess) +{ + TesterSrcDst t("0011223344"); + const size_t needed_size = 16u; + const size_t smaller_size = 14u; + ASSERT_LT(smaller_size, needed_size); + t.set_dst_size(smaller_size); + ASSERT_EQ(t.dst.len, smaller_size); + to_substr(t.dst).fill('^'); + EXPECT_EQ(t.proc.rpos, 0); + EXPECT_EQ(t.proc.wpos, 0); + EXPECT_EQ(t.proc.sofar(), ""); + EXPECT_EQ(t.dst, csubstr("^^^^^^^^^^^^^^")); + t.proc.translate_esc_extending("aaa", /*nw*/3, /*nr*/1); + EXPECT_EQ(t.proc.rpos, 2); + EXPECT_EQ(t.proc.wpos, 3); + EXPECT_EQ(t.proc.sofar(), "aaa"); + EXPECT_EQ(t.proc.result().get(), "aaa"); + EXPECT_EQ(t.dst, "aaa^^^^^^^^^^^"); + t.proc.translate_esc_extending("bbb", /*nw*/3, /*nr*/1); + EXPECT_EQ(t.proc.rpos, 4); + EXPECT_EQ(t.proc.wpos, 6); + EXPECT_EQ(t.proc.sofar(), "aaabbb"); + EXPECT_EQ(t.proc.result().get(), "aaabbb"); + 
EXPECT_EQ(t.dst, "aaabbb^^^^^^^^"); + t.proc.translate_esc_extending("ccc", /*nw*/3, /*nr*/1); + EXPECT_EQ(t.proc.rpos, 6); + EXPECT_EQ(t.proc.wpos, 9); + EXPECT_EQ(t.proc.sofar(), "aaabbbccc"); + EXPECT_EQ(t.proc.result().get(), "aaabbbccc"); + EXPECT_EQ(t.dst, "aaabbbccc^^^^^"); + t.proc.translate_esc_extending("ddd", /*nw*/3, /*nr*/1); + EXPECT_EQ(t.proc.rpos, 8); + EXPECT_EQ(t.proc.wpos, 12); + EXPECT_EQ(t.proc.sofar(), "aaabbbcccddd"); + EXPECT_EQ(t.proc.result().get(), "aaabbbcccddd"); + EXPECT_EQ(t.dst, "aaabbbcccddd^^"); + t.proc.translate_esc_extending("eeee", /*nw*/4, /*nr*/1); + EXPECT_EQ(t.proc.rpos, 10); + EXPECT_EQ(t.proc.wpos, 16); + EXPECT_EQ(t.proc.sofar(), "aaabbbcccddd^^"); + EXPECT_EQ(t.proc.result().str, nullptr); + EXPECT_EQ(t.dst, "aaabbbcccddd^^"); +} + +TEST(Filter, _find_last_newline_and_larger_indentation) +{ + EXPECT_EQ(detail::_find_last_newline_and_larger_indentation("ab\n\n\n", 0), npos); + EXPECT_EQ(detail::_find_last_newline_and_larger_indentation("ab\n \n\n", 0), 2u); + EXPECT_EQ(detail::_find_last_newline_and_larger_indentation("ab\n\n \n", 0), 3u); + EXPECT_EQ(detail::_find_last_newline_and_larger_indentation("ab\n \n \n", 0), 4u); + EXPECT_EQ(detail::_find_last_newline_and_larger_indentation("ab\n \n \n", 1), 4u); + EXPECT_EQ(detail::_find_last_newline_and_larger_indentation("ab\n \n \n", 1), 2u); +} + +} // namespace yml +} // namespace c4 + + +//----------------------------------------------------------------------------- +//----------------------------------------------------------------------------- +//----------------------------------------------------------------------------- + +// this is needed to use the test case library + +namespace c4 { +namespace yml { +struct Case; +Case const* get_case(csubstr /*name*/) +{ + return nullptr; +} +} // namespace yml +} // namespace c4 diff --git a/test/test_group.hpp b/test/test_group.hpp index f661ec9b5..14d7a5ec2 100644 --- a/test/test_group.hpp +++ b/test/test_group.hpp @@ -126,7 
+126,8 @@ constexpr const NodeType_e QKV = (NodeType_e)(VAL | KEYQUO | VALQUO); #ifdef __GNUC__ #if __GNUC__ == 4 && __GNUC_MINOR__ >= 8 -struct CaseAdder { +struct CaseAdder +{ std::vector *group_cases; const csubstr file; const int line; @@ -149,9 +150,11 @@ struct CaseAdder { #define CASE_GROUP(group_name) \ \ + \ /* fwd declaration to fill the container with cases */ \ void add_cases_##group_name(std::vector *group_cases); \ \ + \ /* container with the cases */ \ std::vector const& get_cases_##group_name() \ { \ @@ -161,6 +164,7 @@ std::vector const& get_cases_##group_name() \ return cases_##group_name; \ } \ \ + \ /* container with the case names */ \ std::vector const& get_case_names_##group_name() \ { \ @@ -179,9 +183,12 @@ std::vector const& get_case_names_##group_name() \ return case_names_##group_name; \ } \ \ + \ INSTANTIATE_TEST_SUITE_P(group_name, YmlTestCase, ::testing::ValuesIn(get_case_names_##group_name())); \ + \ GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(YmlTestCase); \ \ + \ /* used by the fixture to obtain a case by name */ \ Case const* get_case(csubstr name) \ { \ @@ -192,6 +199,7 @@ Case const* get_case(csubstr name) \ return nullptr; \ } \ \ + \ /* finally, define the cases by calling ADD_CASE_TO_GROUP() */ \ void add_cases_##group_name(std::vector *group_cases__) diff --git a/test/test_number.cpp b/test/test_number.cpp index 41c388ae6..a9344becf 100644 --- a/test/test_number.cpp +++ b/test/test_number.cpp @@ -1,4 +1,9 @@ #include "./test_group.hpp" +#ifdef RYML_SINGLE_HEADER +#include +#else +#include +#endif namespace c4 { namespace yml { @@ -14,6 +19,7 @@ auto mkvals() -> typename std::enable_if::value, std::vector< return std::vector({0, 1, 5, 10, std::numeric_limits::max(),}); } template +C4_NO_UBSAN_IOVRFLW void test_ints() { C4_SUPPRESS_WARNING_GCC_WITH_PUSH("-Wuseless-cast") diff --git a/test/test_parser.cpp b/test/test_parser.cpp index a1368599e..bb69c45af 100644 --- a/test/test_parser.cpp +++ b/test/test_parser.cpp @@ -24,7 
+24,6 @@ void mklarge(Parser *p, Callbacks const& cb) new ((void*)p) Parser(cb); p->reserve_stack(20); // cause an allocation p->reserve_locations(128); // cause an allocation - p->reserve_filter_arena(128); // cause an allocation } @@ -91,28 +90,6 @@ TEST(Parser, reserve_locations) EXPECT_EQ(ts.dealloc_size, 128u * sizeof(size_t)); } -TEST(Parser, reserve_filter_arena) -{ - size_t cap = 256u; - CallbacksTester ts; - { - Parser parser(ts.callbacks()); - EXPECT_EQ(parser.filter_arena_capacity(), 0u); - EXPECT_EQ(parser.callbacks(), ts.callbacks()); - EXPECT_EQ(ts.num_allocs, 0u); - EXPECT_EQ(ts.num_deallocs, 0u); - parser.reserve_filter_arena(cap); - EXPECT_EQ(ts.num_allocs, 1u); - EXPECT_EQ(ts.num_deallocs, 0u); - EXPECT_EQ(ts.alloc_size, cap); - EXPECT_EQ(ts.dealloc_size, 0u); - } - EXPECT_EQ(ts.num_allocs, 1u); - EXPECT_EQ(ts.num_deallocs, 1u); - EXPECT_EQ(ts.alloc_size, cap); - EXPECT_EQ(ts.dealloc_size, cap); -} - TEST(Parser, copy_ctor) { { diff --git a/test/test_plain_scalar.cpp b/test/test_plain_scalar.cpp index be9d434c6..a5baadf09 100644 --- a/test/test_plain_scalar.cpp +++ b/test/test_plain_scalar.cpp @@ -3,6 +3,268 @@ namespace c4 { namespace yml { +struct plain_scalar_case +{ + size_t indentation; + csubstr input, expected; +}; + + +// double quoted filtering can result in an output larger than the input. +// so we ensure adequate test covering by using different sizes. +// test also cases where the destination string is not large +// enough to accomodate the filtered string. + +/** when filtering from src to dst, specifying the dst sz is enough to + * cover the different cases */ +void test_filter_src_dst(csubstr input, csubstr expected, size_t indentation, size_t dst_sz) +{ + RYML_TRACE_FMT("\nstr=[{}]~~~{}~~~\nexp=[{}]~~~{}~~~\nsz={}", input.len, input, expected.len, expected, dst_sz); + // fill the dst buffer with a ref char to ensure there is no + // write overflow. + const size_t actual_sz = size_t(30) + (dst_sz > expected.len ? 
dst_sz : expected.len); + std::string subject_; + subject_.resize(actual_sz); + const substr full = to_substr(subject_); + // fill the canary region + const char refchar = '`'; + full.sub(dst_sz).fill(refchar); + // filter now + const substr dst = full.first(dst_sz); + Parser proc = {}; + FilterResult result = proc.filter_scalar_plain(input, dst, indentation); + // check the result + EXPECT_EQ(result.required_len(), expected.len); + if(result.valid()) + { + const csubstr out = result.get(); + RYML_TRACE_FMT("\nout=[{}]~~~{}~~~", out.len, out); + RYML_TRACE_FMT("\nout.str={}\ndst.str={}", (void const*)out.str, (void const*)dst.str); + EXPECT_TRUE(out.is_sub(dst)); + EXPECT_EQ(out, expected); + // check the fill character in the canary region + EXPECT_GT(full.sub(dst_sz).len, 0u); + EXPECT_EQ(full.sub(dst_sz).first_not_of(refchar), csubstr::npos); + } +} + + +void test_filter_inplace(csubstr input, csubstr expected, size_t indentation) +{ + EXPECT_LE(expected.len, input.len); + // fill the dst buffer with a ref char to ensure there is no + // write overflow. + const size_t max_sz = (input.len > expected.len ? 
input.len : expected.len); + const size_t full_sz = max_sz + size_t(30); + RYML_TRACE_FMT("\ninp=[{}]~~~{}~~~\nexp=[{}]~~~{}~~~\nmax_sz={}", input.len, input, expected.len, expected, max_sz); + auto run = [&](size_t cap){ + // create the string + std::string subject_(input.str, input.len); + std::string subject_2 = subject_; + subject_.resize(full_sz); + // fill the canary region + const char refchar = '`'; + const substr full = to_substr(subject_); + full.sub(max_sz).fill(refchar); + substr dst = full.first(input.len); + // filter now + Parser parser1 = {}; + FilterResult result = parser1.filter_scalar_plain_in_place(dst, cap, indentation); + EXPECT_EQ(result.get().len, expected.len); + EXPECT_EQ(result.required_len(), expected.len); + Parser parser2 = {}; + Tree tree = parser2.parse_in_arena("file", "# set the tree in the parser"); + csubstr sresult = parser2._filter_scalar_plain(to_substr(subject_2), indentation); + EXPECT_GE(result.required_len(), expected.len); + EXPECT_EQ(sresult.len, result.str.len); + if(result.valid()) + { + const csubstr out = result.get(); + EXPECT_EQ(out, expected); + // check the fill character in the canary region. + EXPECT_GT(full.sub(max_sz).len, 0u); + EXPECT_EQ(full.first_not_of(refchar, max_sz), csubstr::npos); + } + }; + if(input.len >= expected.len) + { + RYML_TRACE_FMT("all good: input.len={} >= expected.len={}", input.len, expected.len); + run(input.len); + } + else // input.len < expected.len + { + RYML_TRACE_FMT("expanding: input.len={} < expected.len={}", input.len, expected.len); + { + RYML_TRACE_FMT("expanding.1: up to larger expected.len={}", expected.len); + run(expected.len); + } + // there is no room to filter if we pass input.len as the capacity. 
+ { + RYML_TRACE_FMT("expanding.2: up to smaller input.len={}", input.len); + run(input.len); + } + } +} + + +// declare test cases +plain_scalar_case test_cases_filter[] = { + #define psc(indentation, input, expected) plain_scalar_case{indentation, csubstr(input), csubstr(expected)} + // 0 + psc(0, "A", "A"), + psc(0, "A B", "A B"), + psc(0, "A\nB", "A B"), + psc(1, "A\nB", "A B"), + psc(2, "A\nB", "A B"), + // 5 + psc(0, "A\n\nB", "A\nB"), + psc(1, "A\n\nB", "A\nB"), + psc(2, "A\n\nB", "A\nB"), + psc(0, "A\n\n\nB", "A\n\nB"), + psc(1, "A\n\n\nB", "A\n\nB"), + // 10 + psc(2, "A\n\n\nB", "A\n\nB"), + psc(0, "A\n\n\n\nB", "A\n\n\nB"), + psc(1, "A\n\n\n\nB", "A\n\n\nB"), + psc(2, "A\n\n\n\nB", "A\n\n\nB"), + psc(0, "A\n\n\n\n\nB", "A\n\n\n\nB"), + // 15 + psc(1, "A\n\n\n\n\nB", "A\n\n\n\nB"), + psc(2, "A\n\n\n\n\nB", "A\n\n\n\nB"), + psc(0, "a\nb \n c\nd\n\ne", "a b c d\ne"), + psc(1, "a\nb \n c\nd\n\ne", "a b c d\ne"), + psc(2, "a\nb \n c\nd\n\ne", "a b c d\ne"), + //psc(0, "A\n \n", "A"), + //psc(1, "A\n \n", "A"), + //psc(2, "A\n \n", "A"), + // 20 + psc(0, "1st non-empty\n\n 2nd non-empty \n 3rd non-empty\n", "1st non-empty\n2nd non-empty 3rd non-empty"), + psc(1, "1st non-empty\n\n 2nd non-empty \n 3rd non-empty\n", "1st non-empty\n2nd non-empty 3rd non-empty"), + psc(2, "1st non-empty\n\n 2nd non-empty \n 3rd non-empty\n", "1st non-empty\n2nd non-empty 3rd non-empty"), + psc(0, "---word1\nword2\n", "---word1 word2"), + psc(0, "---word1\nword2", "---word1 word2"), + // 25 + psc(0, "---word1\n\nword2\n", "---word1\nword2"), + psc(0, "---word1\n\nword2", "---word1\nword2"), + psc(0, "---word1\n\n\nword2", "---word1\n\nword2"), + psc(0, "---word1\n\n\n\nword2", "---word1\n\n\nword2"), + psc(0, "---word1\n\n\n\n\nword2", "---word1\n\n\n\nword2"), + // 30 + psc(0, R"(value +with + +tabs +tabs + + foo + + bar + baz + +)", "value with\ntabs tabs\nfoo\nbar baz\n"), // !!! 
not sure the final \n is right + psc(2, R"(value +with + +tabs +tabs + + foo + + bar + baz + +)", "value with\ntabs tabs\nfoo\nbar baz\n"), // !!! not sure the final \n is right + psc(2, R"(value + with + + tabs + tabs + + foo + + bar + baz + +)", "value with\ntabs tabs\nfoo\nbar baz\n"), // !!! not sure the final \n is right + psc(2, R"(value + with + + tabs + tabs + + foo + + bar + baz + +)", "value with\ntabs tabs\nfoo\nbar baz\n"), // !!! not sure the final \n is right + // 35 + // 40 + // 45 + // 50 + // 55 + // 60 + // 65 + // 70 + // 75 + #undef psc +}; + + +struct PlainScalarFilterSrcDstTest : public ::testing::TestWithParam +{ +}; + +TEST_P(PlainScalarFilterSrcDstTest, dst_is_same_size) +{ + plain_scalar_case dqc = GetParam(); + test_filter_src_dst(dqc.input, dqc.expected, dqc.indentation, /*dst_sz*/dqc.expected.len); +} + +TEST_P(PlainScalarFilterSrcDstTest, dst_is_larger_size) +{ + plain_scalar_case dqc = GetParam(); + test_filter_src_dst(dqc.input, dqc.expected, dqc.indentation, /*sz*/dqc.expected.len + 2u); + test_filter_src_dst(dqc.input, dqc.expected, dqc.indentation, /*sz*/dqc.expected.len + 100u); +} + +TEST_P(PlainScalarFilterSrcDstTest, dst_is_smaller_size) +{ + plain_scalar_case dqc = GetParam(); + test_filter_src_dst(dqc.input, dqc.expected, dqc.indentation, /*sz*/dqc.expected.len / 2u); +} + +TEST_P(PlainScalarFilterSrcDstTest, dst_is_zero_size) +{ + plain_scalar_case dqc = GetParam(); + test_filter_src_dst(dqc.input, dqc.expected, dqc.indentation, /*sz*/0u); +} + +struct PlainScalarFilterInplaceTest : public ::testing::TestWithParam +{ +}; + +TEST_P(PlainScalarFilterInplaceTest, dst_is_same_size) +{ + plain_scalar_case dqc = GetParam(); + test_filter_inplace(dqc.input, dqc.expected, dqc.indentation); +} + + + +INSTANTIATE_TEST_SUITE_P(plain_scalar_filter, + PlainScalarFilterSrcDstTest, + testing::ValuesIn(test_cases_filter)); + +INSTANTIATE_TEST_SUITE_P(plain_scalar_filter, + PlainScalarFilterInplaceTest, + 
testing::ValuesIn(test_cases_filter)); + + +//----------------------------------------------------------------------------- +//----------------------------------------------------------------------------- +//----------------------------------------------------------------------------- + TEST(plain_scalar, issue153_seq) { Tree t = parse_in_arena("- A\n \n"); @@ -242,24 +504,43 @@ TEST(plain_scalar, test_suite_NB6Z_seq) TEST(plain_scalar, test_suite_NB6Z_docval) { - csubstr yaml = R"( + auto check = [](Tree const &t){ + ASSERT_TRUE(t.rootref().is_doc()); + ASSERT_TRUE(t.rootref().is_val()); + EXPECT_EQ(t.rootref().val(), csubstr("value with\ntabs tabs\nfoo\nbar baz")); + }; + { + SCOPED_TRACE("case 0"); + test_check_emit_check(R"( value with - + tabs tabs - + foo - + bar baz - -)"; - test_check_emit_check(yaml, [](Tree const &t){ - ASSERT_TRUE(t.rootref().is_doc()); - ASSERT_TRUE(t.rootref().is_val()); - EXPECT_EQ(t.rootref().val(), csubstr("value with\ntabs tabs\nfoo\nbar baz")); - }); + +)", check); + } + { + SCOPED_TRACE("case 1"); + test_check_emit_check(R"( +value +with + +tabs +tabs + + foo + + bar + baz + +)", check); + } } diff --git a/test/test_single_quoted.cpp b/test/test_single_quoted.cpp index d27fdb6e0..7dfc1c967 100644 --- a/test/test_single_quoted.cpp +++ b/test/test_single_quoted.cpp @@ -1,8 +1,137 @@ +#include "./test_case.hpp" #include "./test_group.hpp" namespace c4 { namespace yml { +struct squoted_case +{ + csubstr input, output; +}; + +void test_filter(csubstr input, csubstr expected) +{ + RYML_TRACE_FMT("\nstr=[{}]~~~{}~~~\nexp=[{}]~~~{}~~~", input.len, input, expected.len, expected); + ASSERT_LE(expected.len, input.len); + std::string subject_; + subject_.resize(2 * input.size()); + c4::substr dst = to_substr(subject_); + Parser proc = {}; + FilterResult result = proc.filter_scalar_squoted(input, dst); + ASSERT_TRUE(result.valid()); + csubstr out = result.get(); + if(input != expected) + { + EXPECT_TRUE(out.is_sub(dst));// << "\ninput=" << 
input << "\nexpected=" << expected; + } + EXPECT_EQ(out, expected); + std::cout << "OK! ~~~" << input << "~~~ ---> ~~~" << out << "~~~\n"; +} + +void test_filter_inplace(csubstr input, csubstr expected) +{ + RYML_TRACE_FMT("\nstr=[{}]~~~{}~~~\nexp=[{}]~~~{}~~~", input.len, input, expected.len, expected); + ASSERT_LE(expected.len, input.len); + std::string subject_(input.str, input.len); + std::string subject_2 = subject_; + c4::substr dst = to_substr(subject_); + Parser parser1 = {}; + FilterResult result = parser1.filter_scalar_squoted_in_place(dst, subject_.size()); + Parser parser2 = {}; + Tree tree = parser2.parse_in_arena("file", "# set the tree in the parser"); + csubstr sresult = parser2._filter_scalar_squot(to_substr(subject_2)); + EXPECT_GE(result.required_len(), expected.len); + EXPECT_EQ(sresult.len, result.str.len); + ASSERT_TRUE(result.valid()); + csubstr out = result.get(); + ASSERT_TRUE(out.str); + EXPECT_TRUE(out.is_sub(dst));// << "\ninput=" << input << "\nexpected=" << expected; + EXPECT_EQ(out, expected); + std::cout << "OK! 
~~~" << input << "~~~ ---> ~~~" << out << "~~~\n"; +} + +struct SQuotedFilterTest : public ::testing::TestWithParam +{ +}; + +TEST_P(SQuotedFilterTest, filter) +{ + squoted_case sqc = GetParam(); + test_filter(sqc.input, sqc.output); +} +TEST_P(SQuotedFilterTest, filter_inplace) +{ + squoted_case sqc = GetParam(); + test_filter_inplace(sqc.input, sqc.output); +} + +squoted_case test_cases_filter[] = { + #define sqc(input, output) squoted_case{csubstr(input), csubstr(output)} + // 0 + sqc("", ""), + sqc(" ", " "), + sqc(" ", " "), + sqc(" ", " "), + sqc(" ", " "), + // 5 + sqc("foo", "foo"), + sqc("quoted\nstring", "quoted string"), + sqc("quoted\n\nstring", "quoted\nstring"), + sqc("quoted\n\n\nstring", "quoted\n\nstring"), + sqc("quoted\n\n\n\nstring", "quoted\n\n\nstring"), + // 10 + sqc("quoted\n string", "quoted string"), + sqc("\"Howdy!\" he cried.", "\"Howdy!\" he cried."), + sqc(" # Not a ''comment''.", " # Not a 'comment'."), + sqc("|\\-*-/|", "|\\-*-/|"), + sqc("\t\n\ndetected\n\n", "\t\ndetected\n"), + // 15 + sqc(" 1st non-empty\n\n 2nd non-empty \n 3rd non-empty ", " 1st non-empty\n2nd non-empty 3rd non-empty "), + sqc(" 1st non-empty\n\n 2nd non-empty \t\n \t3rd non-empty ", " 1st non-empty\n2nd non-empty 3rd non-empty "), + sqc(" 1st non-empty\n\n 2nd non-empty\t \n\t 3rd non-empty ", " 1st non-empty\n2nd non-empty 3rd non-empty "), + sqc("Several lines of text,\ncontaining ''single quotes'' and \"double quotes\". Escapes (like \\n) don''t do anything.\n\nNewlines can be added by leaving a blank line.\n Leading whitespace on lines is ignored.", + "Several lines of text, containing 'single quotes' and \"double quotes\". Escapes (like \\n) don't do anything.\nNewlines can be added by leaving a blank line. 
Leading whitespace on lines is ignored."), + sqc(R"(Some text ''with single quotes'' "and double quotes".)", "Some text 'with single quotes' \"and double quotes\"."), + // 20 + sqc(R"(Some text with escapes \n \r \t)", "Some text with escapes \\n \\r \\t"), + sqc("''", "'"), + sqc("''''", "''"), + sqc("''''''", "'''"), + sqc("''''''''", "''''"), + // 25 + sqc("''''''''''", "'''''"), + sqc("''''''''''''", "''''''"), + sqc(R"(a aaaa )", "a aaaa "), + sqc(R"(a aaaa )", "a aaaa "), + sqc(R"(a aaaa )", "a aaaa "), + // 30 + sqc(R"(a aaaa )", "a aaaa "), + sqc(R"(a aaaa )", "a aaaa "), + sqc(R"( a aaaa)", " a aaaa"), + sqc(R"( a aaaa)", " a aaaa"), + sqc(R"( a aaaa)", " a aaaa"), + // 35 + sqc(R"( a aaaa)", " a aaaa"), + sqc(R"( a aaaa)", " a aaaa"), + sqc(R"( a aaaa )", " a aaaa "), + sqc(R"( a aaaa )", " a aaaa "), + sqc(R"( a aaaa )", " a aaaa "), + // 40 + sqc(R"( a aaaa )", " a aaaa "), + sqc(R"( a aaaa )", " a aaaa "), + sqc(R"(x\ny:z\tx $%^&*()x)", "x\\ny:z\\tx $%^&*()x"), + #undef sqc +}; + +INSTANTIATE_TEST_SUITE_P(single_quoted_filter, + SQuotedFilterTest, + testing::ValuesIn(test_cases_filter)); + + +//----------------------------------------------------------------------------- +//----------------------------------------------------------------------------- +//----------------------------------------------------------------------------- + TEST(single_quoted, test_suite_KSS4) { csubstr yaml = R"( diff --git a/test/test_suite/test_suite_events_emitter.cpp b/test/test_suite/test_suite_events_emitter.cpp index 776bb9e27..c1abd0147 100644 --- a/test/test_suite/test_suite_events_emitter.cpp +++ b/test/test_suite/test_suite_events_emitter.cpp @@ -25,13 +25,13 @@ struct EventsEmitter template C4_ALWAYS_INLINE void pr(const char (&s)[N]) { - if(pos + N-1 <= buf.len) + if(N > 1 && pos + N-1 <= buf.len) memcpy(buf.str + pos, s, N-1); pos += N-1; } C4_ALWAYS_INLINE void pr(csubstr s) { - if(pos + s.len <= buf.len) + if(s.len && pos + s.len <= buf.len) memcpy(buf.str + 
pos, s.str, s.len); pos += s.len; } diff --git a/tools/amalgamate.py b/tools/amalgamate.py index 221397f03..d17e2208f 100644 --- a/tools/amalgamate.py +++ b/tools/amalgamate.py @@ -50,9 +50,7 @@ def amalgamate_ryml(filename: str, with_fastfloat=with_fastfloat, with_stl=with_stl) repo = "https://github.com/biojppm/rapidyaml" - defmacro = ryml_defmacro - srcfiles = [ - am.cmttext(f""" + ryml_preamble = f""" Rapid YAML - a library to parse and emit YAML, and do it fast. {repo} @@ -63,19 +61,22 @@ def amalgamate_ryml(filename: str, INSTRUCTIONS: - Include at will in any header of your project - In one (and only one) of your project source files, - #define {defmacro} and then include this header. + #define {ryml_defmacro} and then include this header. This will enable the function and class definitions in the header file. - To compile into a shared library, just define the preprocessor symbol RYML_SHARED . This will take care of symbol export/import. -"""), +""" + srcfiles = [ + am.cmttext(ryml_preamble), am.cmtfile("LICENSE.txt"), am.injcode(exports_def_code), am.onlyif(with_c4core, am.injcode(c4core_def_code)), am.onlyif(with_c4core, c4core_amalgamated), "src/c4/yml/export.hpp", "src/c4/yml/common.hpp", + "src/c4/yml/node_type.hpp", "src/c4/yml/tree.hpp", "src/c4/yml/node.hpp", "src/c4/yml/writer.hpp", @@ -84,6 +85,7 @@ def amalgamate_ryml(filename: str, "src/c4/yml/emit.hpp", "src/c4/yml/emit.def.hpp", "src/c4/yml/detail/stack.hpp", + "src/c4/yml/filter_processor.hpp", "src/c4/yml/parse.hpp", am.onlyif(with_stl, "src/c4/yml/std/map.hpp"), am.onlyif(with_stl, "src/c4/yml/std/string.hpp"), @@ -109,7 +111,7 @@ def amalgamate_ryml(filename: str, re.compile(r'^\s*#\s*include "(c4/.*)".*$'), re.compile(r'^\s*#\s*include <(c4/.*)>.*$'), ], - definition_macro=defmacro, + definition_macro=ryml_defmacro, repo=repo, result_incguard="_RYML_SINGLE_HEADER_AMALGAMATED_HPP_") result_with_only_first_includes = am.include_only_first(result)