-
Notifications
You must be signed in to change notification settings - Fork 601
/
pos_scanner.go
155 lines (135 loc) · 4.71 KB
/
pos_scanner.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
// Copyright (c) HashiCorp, Inc.
// SPDX-License-Identifier: MPL-2.0
package hcl
import (
"bufio"
"bytes"
"github.com/apparentlymart/go-textseg/v15/textseg"
)
// RangeScanner is a helper that will scan over a buffer using a bufio.SplitFunc
// and visit a source range for each token matched.
//
// For example, this can be used with bufio.ScanLines to find the source range
// for each line in the file, skipping over the actual newline characters, which
// may be useful when printing source code snippets as part of diagnostic
// messages.
//
// The line and column information in the returned ranges is produced by
// counting newline characters and grapheme clusters respectively, which
// mimics the behavior we expect from a parser when producing ranges.
//
// Typical use is to call Scan in a loop; after each call that returns true,
// Range and Bytes describe the token just found. Once Scan returns false,
// Err reports whether scanning stopped because of an error.
type RangeScanner struct {
// filename is copied into the Filename field of every Range produced.
filename string
// b is the source buffer being scanned.
b []byte
// cb is the split function that decides where each token ends.
cb bufio.SplitFunc
pos Pos // source position of the next byte to process
cur Range // range of the latest token, or an empty range at the failure point after an error
tok []byte // token bytes most recently returned by cb; nil after an error
err error // error from last scan, if any
}
// NewRangeScanner creates a new RangeScanner for the given buffer, producing
// ranges for the given filename.
//
// b is expected to hold the entire content of the file, cb is the split
// function used to find token boundaries, and the ranges produced begin at
// InitialPos. Use NewRangeScannerFragment instead when b is only part of a
// file.
//
// Since ranges have grapheme-cluster granularity rather than byte granularity,
// the scanner will produce incorrect results if the given SplitFunc creates
// tokens between grapheme cluster boundaries. In particular, it is incorrect
// to use RangeScanner with bufio.ScanRunes because it will produce tokens
// around individual UTF-8 sequences, which will split any multi-sequence
// grapheme clusters.
func NewRangeScanner(b []byte, filename string, cb bufio.SplitFunc) *RangeScanner {
// Scanning a whole file is the same as scanning a fragment that starts at
// InitialPos.
return NewRangeScannerFragment(b, filename, InitialPos, cb)
}
// NewRangeScannerFragment builds a RangeScanner in the same way as
// NewRangeScanner, except that scanning begins at the given start position
// rather than at the beginning of a file.
//
// This is the constructor to use when b is a sub-slice of a larger file:
// start is recorded as the position of the first byte to process, so the
// ranges produced are offset accordingly. The same restrictions on the
// SplitFunc apply as for NewRangeScanner.
func NewRangeScannerFragment(b []byte, filename string, start Pos, cb bufio.SplitFunc) *RangeScanner {
	sc := &RangeScanner{}
	sc.filename = filename
	sc.b = b
	sc.cb = cb
	// The remaining fields (latest range, token and error) keep their zero
	// values until the first call to Scan.
	sc.pos = start
	return sc
}
// Scan advances the scanner to the next token, as delimited by the
// bufio.SplitFunc given at construction, and reports whether one was found.
//
// After Scan returns true, Range and Bytes describe the token just scanned.
// Scan returns false when the buffer is exhausted or when the SplitFunc
// fails; call Err to tell those outcomes apart. After a failure every later
// call also returns false.
//
// The reported range starts at the scanner's current position and covers
// len(token) bytes from there, so the SplitFunc is expected to return tokens
// that begin at the start of the data it is given. Any further bytes the
// SplitFunc asks to skip (such as a line terminator) move the position
// forward without being included in the range.
//
// A SplitFunc that returns a negative advance, or one larger than the data it
// was given, is reported through Err as bufio.ErrNegativeAdvance or
// bufio.ErrAdvanceTooFar respectively.
func (sc *RangeScanner) Scan() bool {
	if len(sc.b) == 0 || sc.err != nil {
		// All done
		return false
	}

	// sc.b always holds exactly the bytes that have not been consumed yet, so
	// its first byte is the one located at sc.pos. We deliberately do not
	// index sc.b with sc.pos.Byte: for a scanner created by
	// NewRangeScannerFragment, sc.pos.Byte is an offset into the enclosing
	// file rather than into sc.b.
	//
	// Since we're operating on an in-memory buffer, we always pass the whole
	// remainder of the buffer to our SplitFunc and set atEOF to let it know
	// that it has the whole thing.
	advance, token, err := sc.cb(sc.b, true)

	// Since we are setting atEOF to true this should never happen, but
	// if it does we will just abort and assume the SplitFunc is misbehaving.
	if advance == 0 && token == nil && err == nil {
		return false
	}

	// Validate advance the same way bufio.Scanner does, so that a faulty
	// SplitFunc surfaces as an error instead of a slice-bounds panic below.
	if err == nil {
		switch {
		case advance < 0:
			err = bufio.ErrNegativeAdvance
		case advance > len(sc.b):
			err = bufio.ErrAdvanceTooFar
		}
	}

	if err != nil {
		sc.err = err
		// Leave behind an empty range marking where the failing scan began.
		sc.cur = Range{
			Filename: sc.filename,
			Start:    sc.pos,
			End:      sc.pos,
		}
		sc.tok = nil
		return false
	}

	sc.tok = token
	start := sc.pos
	end := sc.pos
	next := sc.pos

	// consumed covers the token along with any subsequent bytes the SplitFunc
	// asked us to skip over by way of advance.
	consumed := sc.b[:advance]

	// We now need to scan over the consumed bytes to count the grapheme
	// clusters so we can correctly advance Column, and count the newlines so
	// we can correctly advance Line.
	gsc := bufio.NewScanner(bytes.NewReader(consumed))
	gsc.Split(textseg.ScanGraphemeClusters)
	advanced := 0
	for gsc.Scan() {
		gr := gsc.Bytes()
		next.Byte += len(gr)
		next.Column++

		// We rely here on the fact that \r\n is considered a grapheme cluster
		// and so we don't need to worry about miscounting additional lines
		// on files with Windows-style line endings.
		if len(gr) != 0 && (gr[0] == '\r' || gr[0] == '\n') {
			next.Column = 1
			next.Line++
		}

		if advanced < len(token) {
			// If we've not yet found the end of our token then we'll
			// also push our "end" marker along.
			// (if advance > len(token) then we'll stop moving "end" early
			// so that the caller only sees the range covered by token.)
			end = next
		}
		advanced += len(gr)
	}

	sc.cur = Range{
		Filename: sc.filename,
		Start:    start,
		End:      end,
	}
	sc.pos = next
	// Drop exactly the bytes that were accounted for in next, so that sc.b
	// and sc.pos stay in step with each other.
	sc.b = sc.b[advanced:]
	return true
}
// Range returns the source range of the token found by the most recent call
// to Scan that returned true.
//
// If the most recent token-producing attempt instead failed with an error,
// the result is an empty range (Start equal to End) at the position where
// that scan began. Before the first call to Scan the result is the zero
// Range.
func (sc *RangeScanner) Range() Range {
return sc.cur
}
// Bytes returns the token found by the most recent call to Scan that returned
// true, which is the text covered by the range that would be returned by
// Range.
//
// The slice is the one handed back by the scanner's SplitFunc and is not
// copied; it normally aliases the input buffer, so callers should treat it as
// read-only. The result is nil before the first call to Scan and after a scan
// that failed with an error.
func (sc *RangeScanner) Bytes() []byte {
return sc.tok
}
// Err can be called after Scan returns false to determine if the latest read
// resulted in an error, and obtain that error if so.
//
// It returns nil when no scan has failed, which includes the case where Scan
// returned false only because the input was exhausted.
func (sc *RangeScanner) Err() error {
return sc.err
}