diff --git a/src/buffered_input.jl b/src/buffered_input.jl index dfa7b5a..dc46af4 100644 --- a/src/buffered_input.jl +++ b/src/buffered_input.jl @@ -15,41 +15,33 @@ mutable struct BufferedInput end end - -# Read and buffer n more characters -function __fill(bi::BufferedInput, bi_input::IO, n::Integer) - for _ in 1:n - c = eof(bi_input) ? '\0' : read(bi_input, Char) - i = bi.offset + bi.avail + 1 +# Read and buffer `n` more characters +function buffer!(bi::BufferedInput, n::Integer)::Nothing + for i in bi.offset + bi.avail .+ (1:n) + c = eof(bi.input) ? '\0' : read(bi.input, Char) if i ≤ length(bi.buffer) bi.buffer[i] = c else push!(bi.buffer, c) end - bi.avail += 1 end + bi.avail += n + nothing end -_fill(bi::BufferedInput, n::Integer) = __fill(bi, bi.input, n) - -# Peek the character in the i-th position relative to the current position. -# (0-based) -function peek(bi::BufferedInput, i::Integer=0) - i1 = i + 1 - if bi.avail < i1 - _fill(bi, i1 - bi.avail) - end - bi.buffer[bi.offset + i1] +# Peek the character in the `i`-th position relative to the current position. +function peek1(bi::BufferedInput, i::Integer=1)::Char + bi.avail < i && buffer!(bi, i - bi.avail) + bi.buffer[bi.offset + i] end +# peek function for 0-based indices +peek(bi::BufferedInput, i::Integer=0) = peek1(bi, i + 1) # Return the string formed from the first n characters from the current position # of the stream. -function prefix(bi::BufferedInput, n::Integer=1) - n1 = n + 1 - if bi.avail < n1 - _fill(bi, n1 - bi.avail) - end +function prefix(bi::BufferedInput, n::Integer=1)::String + bi.avail < n && buffer!(bi, n - bi.avail) String(bi.buffer[bi.offset .+ (1:n)]) end diff --git a/src/scanner.jl b/src/scanner.jl index 084a57e..14311f9 100644 --- a/src/scanner.jl +++ b/src/scanner.jl @@ -759,31 +759,75 @@ end # If the stream is at a line break, advance past it. # -# Returns: -# '\r\n' : '\n' -# '\r' : '\n' -# '\n' : '\n' -# '\x85' : '\n' -# '\u2028' : '\u2028' -# '\u2029 : '\u2029' -# default : '' +# YAML 1.1 # -function scan_line_break(stream::TokenStream) - if in(peek(stream.input), "\r\n\u0085") - if prefix(stream.input, 2) == "\r\n" +# [22] b-line-feed ::= #xA /*LF*/ +# [23] b-carriage-return ::= #xD /*CR*/ +# [24] b-next-line ::= #x85 /*NEL*/ +# [25] b-line-separator ::= #x2028 /*LS*/ +# [26] b-paragraph-separator ::= #x2029 /*PS*/ +# [28] b-specific ::= b-line-separator | b-paragraph-separator +# [29] b-generic ::= ( b-carriage-return b-line-feed) | b-carriage-return | b-line-feed | b-next-line +# [30] b-as-line-feed ::= b-generic +# [31] b-normalized ::= b-as-line-feed | b-specific +# +# U+000D U+000A → U+000A +# U+000D → U+000A +# U+000A → U+000A +# U+0085 → U+000A +# U+2028 → U+2028 +# U+2029 → U+2029 +# otherwise → (empty) +# +function yaml_1_1_scan_line_break(stream::TokenStream)::String + c = peek(stream.input) + if c == '\u000d' + if peek(stream.input, 1) == '\u000a' forwardchars!(stream, 2) else forwardchars!(stream) end - return "\n" - elseif in(peek(stream.input), "\u2028\u2029") - ch = peek(stream.input) + "\u000a" + elseif c == '\u000a' || c == '\u0085' forwardchars!(stream) - return ch + "\u000a" + elseif c == '\u2028' || c == '\u2029' + forwardchars!(stream) + string(c) + else + "" + end +end +# +# YAML 1.2 +# +# [24] b-line-feed ::= x0A +# [25] b-carriage-return ::= x0D +# [26] b-char ::= b-line-feed | b-carriage-return +# [27] nb-char ::= c-printable - b-char - c-byte-order-mark +# [28] b-break ::= ( b-carriage-return b-line-feed ) | b-carriage-return | b-line-feed +# +# U+000D U+000A → U+000A +# U+000D → U+000A +# U+000A → U+000A +# otherwise → (empty) +# +function yaml_1_2_scan_line_break(stream::TokenStream)::String + c = peek(stream.input) + if c == '\u000d' + if peek(stream.input, 1) == '\u000a' + forwardchars!(stream, 2) + else + forwardchars!(stream) + end + "\u000a" + elseif c == '\u000a' + forwardchars!(stream) + "\u000a" + else + "" end - return "" end - # Scan past whitespace to the next token. function scan_to_next_token(stream::TokenStream) @@ -800,7 +844,7 @@ function scan_to_next_token(stream::TokenStream) end end # line break - if scan_line_break(stream) != "" + if yaml_1_1_scan_line_break(stream) != "" if stream.flow_level == 0 stream.allow_simple_key = true end @@ -975,7 +1019,7 @@ function scan_directive_ignored_line(stream::TokenStream, start_mark::Mark) "expected a comment or a line break, but found '$(peek(stream.input))'", get_mark(stream))) end - scan_line_break(stream) + yaml_1_1_scan_line_break(stream) end @@ -1094,7 +1138,7 @@ function scan_block_scalar(stream::TokenStream, style::Char) end push!(chunks, prefix(stream.input, length)) forwardchars!(stream, length) - line_break = scan_line_break(stream) + line_break = yaml_1_1_scan_line_break(stream) breaks, end_mark = scan_block_scalar_breaks(stream, indent) if stream.column == indent && peek(stream.input) != '\0' if folded && line_break == "\n" && @@ -1140,7 +1184,7 @@ function scan_block_scalar_ignored_line(stream::TokenStream, start_mark::Mark) get_mark(stream))) end - scan_line_break(stream) + yaml_1_1_scan_line_break(stream) end @@ -1193,7 +1237,7 @@ function scan_block_scalar_indentation(stream::TokenStream) end_mark = get_mark(stream) while in(peek(stream.input), " \r\n\u0085\u2028\u2029") if peek(stream.input) != ' ' - push!(chunks, scan_line_break(stream)) + push!(chunks, yaml_1_1_scan_line_break(stream)) end_mark = get_mark(stream) else forwardchars!(stream) @@ -1215,7 +1259,7 @@ function scan_block_scalar_breaks(stream::TokenStream, indent) end while in(peek(stream.input), "\r\n\u0085\u2028\u2029") - push!(chunks, scan_line_break(stream)) + push!(chunks, yaml_1_1_scan_line_break(stream)) end_mark = get_mark(stream) while stream.column < indent && peek(stream.input) == ' ' forwardchars!(stream) @@ -1315,7 +1359,7 @@ function scan_flow_scalar_non_spaces(stream::TokenStream, double::Bool, push!(chunks, Char(parse(Int, prefix(stream.input, length), base = 16))) forwardchars!(stream, length) elseif in(c, "\r\n\u0085\u2028\u2029") - scan_line_break(stream) + yaml_1_1_scan_line_break(stream) append!(chunks, scan_flow_scalar_breaks(stream, double, start_mark)) else throw(ScannerError("while scanning a double-quoted scalar", @@ -1345,7 +1389,7 @@ function scan_flow_scalar_spaces(stream::TokenStream, double::Bool, throw(ScannerError("while scanning a quoted scalar", start_mark, "found unexpected end of stream", get_mark(stream))) elseif in(c, "\r\n\u0085\u2028\u2029") - line_break = scan_line_break(stream) + line_break = yaml_1_1_scan_line_break(stream) breaks = scan_flow_scalar_breaks(stream, double, start_mark) if line_break != '\n' push!(chunks, line_break) @@ -1378,7 +1422,7 @@ function scan_flow_scalar_breaks(stream::TokenStream, double::Bool, end if in(peek(stream.input), "\r\n\u0085\u2028\u2029") - push!(chunks, scan_line_break(stream)) + push!(chunks, yaml_1_1_scan_line_break(stream)) else return chunks end @@ -1462,7 +1506,7 @@ function scan_plain_spaces(stream::TokenStream, indent::Integer, forwardchars!(stream, length) c = peek(stream.input) if in(c, "\r\n\u0085\u2028\u2029") - line_break = scan_line_break(stream) + line_break = yaml_1_1_scan_line_break(stream) stream.allow_simple_key = true if peek(stream.input) == '\uFEFF' return Any[] @@ -1478,7 +1522,7 @@ function scan_plain_spaces(stream::TokenStream, indent::Integer, if peek(stream.input) == ' ' forwardchars!(stream) else - push!(breaks, scan_line_break(stream)) + push!(breaks, yaml_1_1_scan_line_break(stream)) if peek(stream.input) == '\uFEFF' return Any[] end diff --git a/test/windows_newlines.data b/test/windows_newlines.data index 9db751f..4e3cf3d 100644 Binary files a/test/windows_newlines.data and b/test/windows_newlines.data differ