From 1c9a97daf054c43cca3823c9ae243ba9a9496c40 Mon Sep 17 00:00:00 2001 From: Koki Fushimi Date: Sat, 15 Jun 2024 19:40:34 +0900 Subject: [PATCH 1/5] Refactoring of `scan_line_break`. * Performance improvement of `scan_line_break`. * Rename `scan_line_break` to `yaml_1_1_scan_line_break`. * Add `yaml_1_2_scan_line_break`. * Add better comments. * Add a TODO comment about possible bugs. --- src/scanner.jl | 108 ++++++++++++++++++++++++++++++++++++------------- 1 file changed, 80 insertions(+), 28 deletions(-) diff --git a/src/scanner.jl b/src/scanner.jl index 9f59888..60b032b 100644 --- a/src/scanner.jl +++ b/src/scanner.jl @@ -757,31 +757,83 @@ end # If the stream is at a line break, advance past it. # -# Returns: -# '\r\n' : '\n' -# '\r' : '\n' -# '\n' : '\n' -# '\x85' : '\n' -# '\u2028' : '\u2028' -# '\u2029 : '\u2029' -# default : '' +# YAML 1.1 # -function scan_line_break(stream::TokenStream) - if in(peek(stream.input), "\r\n\u0085") - if prefix(stream.input, 2) == "\r\n" +# [22] b-line-feed ::= #xA /*LF*/ +# [23] b-carriage-return ::= #xD /*CR*/ +# [24] b-next-line ::= #x85 /*NEL*/ +# [25] b-line-separator ::= #x2028 /*LS*/ +# [26] b-paragraph-separator ::= #x2029 /*PS*/ +# [28] b-specific ::= b-line-separator | b-paragraph-separator +# [29] b-generic ::= ( b-carriage-return b-line-feed) | b-carriage-return | b-line-feed | b-next-line +# [30] b-as-line-feed ::= b-generic +# [31] b-normalized ::= b-as-line-feed | b-specific +# +# U+000D U+000A → U+000A +# U+000D → U+000A +# U+000A → U+000A +# U+0085 → U+000A +# U+2028 → U+2028 +# U+2029 → U+2029 +# otherwise → (empty) +# +function yaml_1_1_scan_line_break(stream::TokenStream)::String + c = peek(stream.input) + if c == '\u000d' + # TODO: + # This seems better for performance but gives errors and I don't know why. + # Perhaps, `prefx(stream.input, 2)` modifies `stream` and eventually escapes from an error. 
+ # if peek(stream.input, 1) == '\u000a' + # forwardchars!(stream, 2) + # else + # forwardchars!(stream) + # end + if prefix(stream.input, 2) == "\u000d\u000a" forwardchars!(stream, 2) else forwardchars!(stream) end - return "\n" - elseif in(peek(stream.input), "\u2028\u2029") - ch = peek(stream.input) + "\u000a" + elseif c == '\u000a' || c == '\u0085' forwardchars!(stream) - return ch + "\u000a" + elseif c == '\u2028' || c == '\u2029' + forwardchars!(stream) + string(c) + else + "" + end +end +# +# YAML 1.2 +# +# [24] b-line-feed ::= x0A +# [25] b-carriage-return ::= x0D +# [26] b-char ::= b-line-feed | b-carriage-return +# [27] nb-char ::= c-printable - b-char - c-byte-order-mark +# [28] b-break ::= ( b-carriage-return b-line-feed ) | b-carriage-return | b-line-feed +# +# U+000D U+000A → U+000A +# U+000D → U+000A +# U+000A → U+000A +# otherwise → (empty) +# +function yaml_1_2_scan_line_break(stream::TokenStream)::String + c = peek(stream.input) + if c == '\u000d' + if peek(stream.input, 1) == '\u000a' + forwardchars!(stream, 2) + else + forwardchars!(stream) + end + "\u000a" + elseif c == '\u000a' + forwardchars!(stream) + "\u000a" + else + "" end - return "" end - # Scan past whitespace to the next token. 
function scan_to_next_token(stream::TokenStream) @@ -798,7 +850,7 @@ function scan_to_next_token(stream::TokenStream) end end - if scan_line_break(stream) != "" + if yaml_1_1_scan_line_break(stream) != "" if stream.flow_level == 0 stream.allow_simple_key = true end @@ -948,7 +1000,7 @@ function scan_directive_ignored_line(stream::TokenStream, start_mark::Mark) "expected a comment or a line break, but found '$(peek(stream.input))'", get_mark(stream))) end - scan_line_break(stream) + yaml_1_1_scan_line_break(stream) end @@ -1067,7 +1119,7 @@ function scan_block_scalar(stream::TokenStream, style::Char) end push!(chunks, prefix(stream.input, length)) forwardchars!(stream, length) - line_break = scan_line_break(stream) + line_break = yaml_1_1_scan_line_break(stream) breaks, end_mark = scan_block_scalar_breaks(stream, indent) if stream.column == indent && peek(stream.input) != '\0' if folded && line_break == "\n" && @@ -1113,7 +1165,7 @@ function scan_block_scalar_ignored_line(stream::TokenStream, start_mark::Mark) get_mark(stream))) end - scan_line_break(stream) + yaml_1_1_scan_line_break(stream) end @@ -1166,7 +1218,7 @@ function scan_block_scalar_indentation(stream::TokenStream) end_mark = get_mark(stream) while in(peek(stream.input), " \r\n\u0085\u2028\u2029") if peek(stream.input) != ' ' - push!(chunks, scan_line_break(stream)) + push!(chunks, yaml_1_1_scan_line_break(stream)) end_mark = get_mark(stream) else forwardchars!(stream) @@ -1188,7 +1240,7 @@ function scan_block_scalar_breaks(stream::TokenStream, indent) end while in(peek(stream.input), "\r\n\u0085\u2028\u2029") - push!(chunks, scan_line_break(stream)) + push!(chunks, yaml_1_1_scan_line_break(stream)) end_mark = get_mark(stream) while stream.column < indent && peek(stream.input) == ' ' forwardchars!(stream) @@ -1288,7 +1340,7 @@ function scan_flow_scalar_non_spaces(stream::TokenStream, double::Bool, push!(chunks, Char(parse(Int, prefix(stream.input, length), base = 16))) forwardchars!(stream, length) 
elseif in(c, "\r\n\u0085\u2028\u2029") - scan_line_break(stream) + yaml_1_1_scan_line_break(stream) append!(chunks, scan_flow_scalar_breaks(stream, double, start_mark)) else throw(ScannerError("while scanning a double-quoted scalar", @@ -1318,7 +1370,7 @@ function scan_flow_scalar_spaces(stream::TokenStream, double::Bool, throw(ScannerError("while scanning a quoted scalar", start_mark, "found unexpected end of stream", get_mark(stream))) elseif in(c, "\r\n\u0085\u2028\u2029") - line_break = scan_line_break(stream) + line_break = yaml_1_1_scan_line_break(stream) breaks = scan_flow_scalar_breaks(stream, double, start_mark) if line_break != '\n' push!(chunks, line_break) @@ -1351,7 +1403,7 @@ function scan_flow_scalar_breaks(stream::TokenStream, double::Bool, end if in(peek(stream.input), "\r\n\u0085\u2028\u2029") - push!(chunks, scan_line_break(stream)) + push!(chunks, yaml_1_1_scan_line_break(stream)) else return chunks end @@ -1435,7 +1487,7 @@ function scan_plain_spaces(stream::TokenStream, indent::Integer, forwardchars!(stream, length) c = peek(stream.input) if in(c, "\r\n\u0085\u2028\u2029") - line_break = scan_line_break(stream) + line_break = yaml_1_1_scan_line_break(stream) stream.allow_simple_key = true if peek(stream.input) == '\uFEFF' return Any[] @@ -1451,7 +1503,7 @@ function scan_plain_spaces(stream::TokenStream, indent::Integer, if peek(stream.input) == ' ' forwardchars!(stream) else - push!(breaks, scan_line_break(stream)) + push!(breaks, yaml_1_1_scan_line_break(stream)) if peek(stream.input) == '\uFEFF' return Any[] end From aa04daecaa261e51ca78b993ee6e9363cd51183f Mon Sep 17 00:00:00 2001 From: Koki Fushimi Date: Mon, 17 Jun 2024 17:12:53 +0900 Subject: [PATCH 2/5] Change `_fill` and `__fill` to better implementation and rename to `buffer!`. 
--- src/buffered_input.jl | 19 ++++++++----------- 1 file changed, 8 insertions(+), 11 deletions(-) diff --git a/src/buffered_input.jl b/src/buffered_input.jl index dfa7b5a..af1ec0d 100644 --- a/src/buffered_input.jl +++ b/src/buffered_input.jl @@ -15,29 +15,26 @@ mutable struct BufferedInput end end - -# Read and buffer n more characters -function __fill(bi::BufferedInput, bi_input::IO, n::Integer) - for _ in 1:n - c = eof(bi_input) ? '\0' : read(bi_input, Char) - i = bi.offset + bi.avail + 1 +# Read and buffer `n` more characters +function buffer!(bi::BufferedInput, n::Integer)::Nothing + for i in bi.offset + bi.avail .+ (1:n) + c = eof(bi.input) ? '\0' : read(bi.input, Char) if i ≤ length(bi.buffer) bi.buffer[i] = c else push!(bi.buffer, c) end - bi.avail += 1 end + bi.avail += n + nothing end -_fill(bi::BufferedInput, n::Integer) = __fill(bi, bi.input, n) - # Peek the character in the i-th position relative to the current position. # (0-based) function peek(bi::BufferedInput, i::Integer=0) i1 = i + 1 if bi.avail < i1 - _fill(bi, i1 - bi.avail) + buffer!(bi, i1 - bi.avail) end bi.buffer[bi.offset + i1] end @@ -48,7 +45,7 @@ end function prefix(bi::BufferedInput, n::Integer=1) n1 = n + 1 if bi.avail < n1 - _fill(bi, n1 - bi.avail) + buffer!(bi, n1 - bi.avail) end String(bi.buffer[bi.offset .+ (1:n)]) end From d2014f8454ad29e10e491752514ac4e808c2a7cf Mon Sep 17 00:00:00 2001 From: Koki Fushimi Date: Mon, 17 Jun 2024 17:20:14 +0900 Subject: [PATCH 3/5] Refactoring of `peek` functions. --- src/buffered_input.jl | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/src/buffered_input.jl b/src/buffered_input.jl index af1ec0d..109993d 100644 --- a/src/buffered_input.jl +++ b/src/buffered_input.jl @@ -29,16 +29,14 @@ function buffer!(bi::BufferedInput, n::Integer)::Nothing nothing end -# Peek the character in the i-th position relative to the current position. 
-# (0-based) -function peek(bi::BufferedInput, i::Integer=0) - i1 = i + 1 - if bi.avail < i1 - buffer!(bi, i1 - bi.avail) - end - bi.buffer[bi.offset + i1] +# Peek the character in the `i`-th position relative to the current position. +function peek1(bi::BufferedInput, i::Integer=1)::Char + bi.avail < i && buffer!(bi, i - bi.avail) + bi.buffer[bi.offset + i] end +# peek function for 0-based indices +peek(bi::BufferedInput, i::Integer=0) = peek1(bi, i + 1) # Return the string formed from the first n characters from the current position # of the stream. From c4ef02d181ffa3cd68d1662b7e8ea6bd60cc7531 Mon Sep 17 00:00:00 2001 From: Koki Fushimi Date: Mon, 17 Jun 2024 17:24:43 +0900 Subject: [PATCH 4/5] Bug fix of `prefix(::BufferedInput, ::Integer)`. Change to not overbuffer. This bug fix breaks the test `windows_newlines` but I think the test is incorrect. --- src/buffered_input.jl | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/src/buffered_input.jl b/src/buffered_input.jl index af1ec0d..d48c2cb 100644 --- a/src/buffered_input.jl +++ b/src/buffered_input.jl @@ -42,11 +42,8 @@ end # Return the string formed from the first n characters from the current position # of the stream. -function prefix(bi::BufferedInput, n::Integer=1) - n1 = n + 1 - if bi.avail < n1 - buffer!(bi, n1 - bi.avail) - end +function prefix(bi::BufferedInput, n::Integer=1)::String + bi.avail < n && buffer!(bi, n - bi.avail) String(bi.buffer[bi.offset .+ (1:n)]) end From 110f4154fc9692aa1235a6b7b24ff68689427669 Mon Sep 17 00:00:00 2001 From: Koki Fushimi Date: Mon, 17 Jun 2024 19:33:10 +0900 Subject: [PATCH 5/5] Fix the test for Windows and use right implementation for `yaml_1_1_scan_line_break`. 
--- src/scanner.jl | 10 +--------- test/windows_newlines.data | Bin 9 -> 9 bytes 2 files changed, 1 insertion(+), 9 deletions(-) diff --git a/src/scanner.jl b/src/scanner.jl index d1c74a3..14311f9 100644 --- a/src/scanner.jl +++ b/src/scanner.jl @@ -782,15 +782,7 @@ end function yaml_1_1_scan_line_break(stream::TokenStream)::String c = peek(stream.input) if c == '\u000d' - # TODO: - # This seems better for performance but gives errors and I don't know why. - # Perhaps, `prefx(stream.input, 2)` modifies `stream` and eventually escapes from an error. - # if peek(stream.input, 1) == '\u000a' - # forwardchars!(stream, 2) - # else - # forwardchars!(stream) - # end - if prefix(stream.input, 2) == "\u000d\u000a" + if peek(stream.input, 1) == '\u000a' forwardchars!(stream, 2) else forwardchars!(stream) diff --git a/test/windows_newlines.data b/test/windows_newlines.data index 9db751f1952ff772a030de7f6dd1bb1f62b4dc13..4e3cf3d3b4384b3d836ba0f043381caee702817b 100644 GIT binary patch literal 9 Qcmc~u&B@8P;^pE601vPNa{vGU literal 9 Qcmc~u&B@8P;$`3k01uo3X#fBK