diff --git a/csvparser/columns.go b/csvparser/columns.go new file mode 100644 index 0000000..c6c8af2 --- /dev/null +++ b/csvparser/columns.go @@ -0,0 +1,76 @@ +package csvparser + +type ( + Col[T any] interface { + Parse(data []byte, item *T) (int, error) + //Compile(x T, writer io.Writer) error + } + + opts struct { + sep byte + } + + ColFactory[T any] func(opts) Col[T] + + StringColumn[T any] struct { + inner StringType + setter func(x *T, v string) + getter func(x T) string + } + + IntColumn[T any] struct { + inner IntegerType + setter func(x *T, v int) + getter func(x T) int + } + + BoolColumn[T any] struct { + inner StringType + setter func(x T, v bool) + getter func(x T) bool + } +) + +func (s StringColumn[T]) Parse(data []byte, item *T) (int, error) { + val, n, err := s.inner.Parse(data) + if err != nil { + return n, err + } + s.setter(item, val) + return n, nil +} + +func (c IntColumn[T]) Parse(data []byte, item *T) (int, error) { + val, n, err := c.inner.Parse(data) + if err != nil { + return n, err + } + c.setter(item, val) + return n, nil +} + +func StringCol[T any]( + quote byte, + getter func(T) string, + setter func(*T, string), +) ColFactory[T] { + return func(opts opts) Col[T] { + return StringColumn[T]{ + inner: StrType(quote, opts.sep), + getter: getter, setter: setter, + } + } +} + +func IntCol[T any]( + quote byte, + getter func(T) int, + setter func(*T, int), +) ColFactory[T] { + return func(opts opts) Col[T] { + return IntColumn[T]{ + inner: IntType(quote, opts.sep), + getter: getter, setter: setter, + } + } +} diff --git a/csvparser/csvparser.go b/csvparser/csvparser.go deleted file mode 100644 index 86328ab..0000000 --- a/csvparser/csvparser.go +++ /dev/null @@ -1,197 +0,0 @@ -package csvparser - -import ( - "io" - "strconv" - - "github.com/pkg/errors" - "github.com/sonirico/stadio/slices" -) - -var ( - quote = []byte{byte('"')} - - SeparatorComma byte = ',' - SeparatorSemicolon byte = ';' - SeparatorTab byte = '\t' -) - -type ( - // Header 
represents a set of columns definitions - Header struct { - } - - StringColumn[T any] struct { - inner StringType - setter func(x *T, v string) - getter func(x T) string - } - - IntColumn[T any] struct { - inner IntegerType - setter func(x *T, v int) - getter func(x T) int - } - - BoolColumn[T any] struct { - inner StringType - setter func(x T, v bool) - getter func(x T) bool - } - - Type[T any] interface { - Parse(data []byte) (T, error) - //Compile(x T, writer io.Writer) error - } - - Col[T any] interface { - Parse(data []byte, item *T) error - //Compile(x T, writer io.Writer) error - } - - StringType struct { - Quoted bool - } - - IntegerType struct { - inner StringType - } - - Parser[T any] struct { - separator byte - columns []Col[T] - } -) - -func (p Parser[T]) Parse(data []byte, item *T) (err error) { - counter := 0 - for _, col := range p.columns { - pos := slices.IndexOf[byte]( - data, - func(x byte) bool { - return x == p.separator - }, - ) - - lastCol := counter == len(p.columns)-1 - - if pos == -1 && !lastCol { - // Only if no more separators have been found, and current column is not the last one, yield error - err = errors.Wrapf( - ErrColumnMismatch, - "want %d, have %d", - len(p.columns), - counter, - ) - } - - payload := data - if !lastCol { - payload = data[:pos] - } - - if err = col.Parse(payload, item); err != nil { - return err - } - - counter++ - - if !lastCol { - data = data[pos+1:] - } - } - return nil -} - -func NewParser[T any](sep byte, cols ...Col[T]) Parser[T] { - return Parser[T]{separator: sep, columns: cols} -} - -func (s StringColumn[T]) Parse(data []byte, item *T) error { - val, err := s.inner.Parse(data) - if err != nil { - return err - } - s.setter(item, val) - return nil -} - -func (c IntColumn[T]) Parse(data []byte, item *T) error { - val, err := c.inner.Parse(data) - if err != nil { - return err - } - c.setter(item, val) - return nil -} - -func (s StringType) Parse(data []byte) (string, error) { - if s.Quoted { - // ,"", - if 
len(data) > 2 { - return string(data[1 : len(data)-1]), nil // todo: nalloc - } - - return "", nil - // todo: assert data[0] == quote. Keep until other quote+separator are found - } - - return string(data), nil -} - -func (s StringType) Compile(data []byte, w io.Writer) error { - if s.Quoted { - - n, err := w.Write(quote) - if err != nil || n < 1 { - // todo: handle - } - } - - n, err := w.Write(data) - - if err != nil || n < 1 { - // todo: handle - } - - if s.Quoted { - n, err := w.Write(quote) - if err != nil || n < 1 { - // todo: handle - } - } - return nil -} - -func (i IntegerType) Parse(data []byte) (int, error) { - val, err := i.inner.Parse(data) - if err != nil { - return 0, err - } - - var res int64 - res, err = strconv.ParseInt(val, 10, 64) - return int(res), err -} - -func StringCol[T any]( - quoted bool, - getter func(T) string, - setter func(*T, string), -) Col[T] { - return StringColumn[T]{ - inner: StringType{Quoted: quoted}, - getter: getter, setter: setter, - } -} - -func IntCol[T any]( - quoted bool, - getter func(T) int, - setter func(*T, int), -) Col[T] { - return IntColumn[T]{ - inner: IntegerType{inner: StringType{Quoted: quoted}}, - getter: getter, setter: setter, - } -} diff --git a/csvparser/errors.go b/csvparser/errors.go index c8dd5ae..f3566e7 100644 --- a/csvparser/errors.go +++ b/csvparser/errors.go @@ -4,4 +4,5 @@ import "github.com/pkg/errors" var ( ErrColumnMismatch = errors.New("column mismatch") + ErrQuoteExpected = errors.New("quote was expected") ) diff --git a/csvparser/parser.go b/csvparser/parser.go new file mode 100644 index 0000000..9d767a8 --- /dev/null +++ b/csvparser/parser.go @@ -0,0 +1,64 @@ +package csvparser + +import ( + "bytes" +) + +var ( + quote = []byte{byte('"')} + + QuoteDouble byte = '"' + QuoteSimple byte = '\'' + QuoteNone byte = 0 + + SeparatorComma byte = ',' + SeparatorSemicolon byte = ';' + SeparatorTab byte = '\t' +) + +type ( + Parser[T any] struct { + separator byte + columns []Col[T] + } +) + +func 
(p Parser[T]) Parse(data []byte, item *T) (err error) { + data = bytes.TrimSpace(data) // cleanup phase + sepLen := 1 // len(p.separator) + + for i, col := range p.columns { + var read int + read, err = col.Parse(data, item) + if err != nil { + return + } + + // TODO: handle read == 0 + _ = i + + if read > len(data) { + break + } + + // use a cursor for readability: each column type only parses its own data, + // leaving the parser responsible for advancing the cursor past the separator. + cursor := read + if read+sepLen <= len(data) { + cursor += sepLen + } + + data = data[cursor:] + } + return nil +} + +func New[T any](sep byte, cols ...ColFactory[T]) Parser[T] { + columns := make([]Col[T], len(cols)) + opt := opts{sep: sep} + + for i, c := range cols { + columns[i] = c(opt) + } + return Parser[T]{separator: sep, columns: columns} +} diff --git a/csvparser/parser_test.go b/csvparser/parser_test.go new file mode 100644 index 0000000..59260be --- /dev/null +++ b/csvparser/parser_test.go @@ -0,0 +1,134 @@ +package csvparser + +import ( + "reflect" + "testing" + + "github.com/pkg/errors" +) + +func TestParser_RawColumn(t *testing.T) { + type ( + args struct { + payload []byte + sep byte + } + + duck struct { + Name string + Siblings int + } + + want struct { + expected duck + err error + } + + testCase struct { + name string + args args + want want + } + ) + + var ( + duckNameSetter = func(d *duck, name string) { + d.Name = name + } + duckSiblingsSetter = func(d *duck, siblings int) { + d.Siblings = siblings + } + ) + + tests := []testCase{ + { + name: "simple csv string line should parse", + args: args{ + payload: []byte("a duck knight in shinny armor,2"), + sep: SeparatorComma, + }, + want: want{ + expected: duck{ + Name: "a duck knight in shinny armor", + Siblings: 2, + }, + }, + }, + { + name: "simple csv string line with trailing separator should parse", + args: args{ + payload: []byte("a duck knight in shinny armor,2,"), + sep: SeparatorComma, 
+ }, + want: want{ + expected: duck{ + Name: "a duck knight in shinny armor", + Siblings: 2, + }, + }, + }, + { + name: "simple csv string line with trailing separator and spaces should parse", + args: args{ + payload: []byte("a duck knight in shinny armor,2, "), + sep: SeparatorComma, + }, + want: want{ + expected: duck{ + Name: "a duck knight in shinny armor", + Siblings: 2, + }, + }, + }, + { + name: "simple csv string line with trailing spaces at the start and spaces should parse", + args: args{ + payload: []byte(" a duck knight in shinny armor,2, "), + sep: SeparatorComma, + }, + want: want{ + expected: duck{ + Name: "a duck knight in shinny armor", + Siblings: 2, + }, + }, + }, + { + name: "blank column should render emptiness", + args: args{ + payload: []byte(",2"), + sep: SeparatorComma, + }, + want: want{ + expected: duck{ + Name: "", + Siblings: 2, + }, + }, + }, + } + + for _, test := range tests { + t.Run(test.name, func(t *testing.T) { + + parser := New[duck]( + test.args.sep, + StringCol[duck](QuoteNone, nil, duckNameSetter), + IntCol[duck](QuoteNone, nil, duckSiblingsSetter), + ) + + rubberDuck := duck{} + + if err := parser.Parse(test.args.payload, &rubberDuck); !errors.Is(test.want.err, err) { + t.Errorf("unexpected error, want %v, have %v", + test.want.err, err) + } + + if !reflect.DeepEqual(test.want.expected, rubberDuck) { + t.Errorf("unexpected duck\nwant %v\nhave %v", + test.want.expected, rubberDuck) + } + + }) + } +} diff --git a/csvparser/types.go b/csvparser/types.go new file mode 100644 index 0000000..f837506 --- /dev/null +++ b/csvparser/types.go @@ -0,0 +1,106 @@ +package csvparser + +import ( + "io" + "strconv" + + "github.com/pkg/errors" + "github.com/sonirico/stadio/slices" +) + +type ( + parseOpts struct { + sep byte + } + + Type[T any] interface { + Parse(data []byte) (T, int, error) + //Compile(x T, writer io.Writer) error + } + + StringType struct { + sep byte + quote byte + } + + IntegerType struct { + inner StringType + } +) + 
+// Parse parses `data`, which is expected to be non-nil and non-empty +func (s StringType) Parse(data []byte) (string, int, error) { + if s.quote != QuoteNone { + if data[0] != s.quote { + return "", 0, errors.Wrapf(ErrQuoteExpected, "<%s>", string(s.quote)) + } + + i := 3 + // Find the closing quote: a non-escaped quote immediately followed by the separator + for i < len(data) { + prev := data[i-2] + middle := data[i-1] + next := data[i] + if middle == s.quote && prev != '\\' && next == s.sep { + break + } + i++ + } + + payload := data[1 : i-1] + return string(payload), len(payload), nil + } + + payload := data + + idx := slices.IndexOf(data, func(x byte) bool { return x == s.sep }) + if idx > -1 { + // a separator was found: the field ends right before it. Otherwise the field runs to the end of data. + payload = data[:idx] + } + + return string(payload), len(payload), nil +} + +func (s StringType) Compile(data []byte, w io.Writer) error { + if s.quote != QuoteNone { + + n, err := w.Write(quote) + if err != nil || n < 1 { + // todo: handle + } + } + + n, err := w.Write(data) + + if err != nil || n < 1 { + // todo: handle + } + + if s.quote != QuoteNone { + n, err := w.Write(quote) + if err != nil || n < 1 { + // todo: handle + } + } + return nil +} + +func (i IntegerType) Parse(data []byte) (int, int, error) { + val, n, err := i.inner.Parse(data) + if err != nil { + return 0, n, err + } + + var res int64 + res, err = strconv.ParseInt(val, 10, 64) + return int(res), n, err +} + +func StrType(quote, sep byte) StringType { + return StringType{quote: quote, sep: sep} +} + +func IntType(quote, sep byte) IntegerType { + return IntegerType{inner: StrType(quote, sep)} +} diff --git a/csvparser/types_test.go b/csvparser/types_test.go new file mode 100644 index 0000000..a95ba87 --- /dev/null +++ b/csvparser/types_test.go @@ -0,0 +1,158 @@ +package csvparser + +import ( + "strings" + "testing" + + "github.com/pkg/errors" +) + +func TestStringType_Parse(t *testing.T) { + type ( + args struct { + payload []byte + quote byte + sep byte + } + + want struct { 
expected []byte + read int + err error + } + + testCase struct { + name string + args args + want want + } + ) + + tests := []testCase{ + { + name: "unquoted simple string", + args: args{ + quote: QuoteNone, + sep: SeparatorComma, + payload: []byte("fmartingr,danirod_,3"), + }, + want: want{ + expected: []byte("fmartingr"), + read: 9, + err: nil, + }, + }, + { + name: "double quote simple string", + args: args{ + quote: QuoteDouble, + sep: SeparatorComma, + payload: []byte("\"fmartingr\",danirod_,3"), + }, + want: want{ + expected: []byte("fmartingr"), + read: 9, + err: nil, + }, + }, + { + name: "simple quote simple string", + args: args{ + quote: QuoteSimple, + sep: SeparatorComma, + payload: []byte("'fmartingr',danirod_,3"), + }, + want: want{ + expected: []byte("fmartingr"), + read: 9, + err: nil, + }, + }, + { + name: "non quote non-ascii string", + args: args{ + quote: QuoteNone, + sep: SeparatorComma, + payload: []byte("你好吗,danirod_,3"), + }, + want: want{ + expected: []byte("你好吗"), + read: 9, + err: nil, + }, + }, + { + name: "double quote non-ascii string", + args: args{ + quote: QuoteDouble, + sep: SeparatorComma, + payload: []byte("\"你好吗\",danirod_,3"), + }, + want: want{ + expected: []byte("你好吗"), + read: 9, + err: nil, + }, + }, + { + name: "double quote non-ascii string with escaped char same as quote", + args: args{ + quote: QuoteDouble, + sep: SeparatorComma, + payload: []byte("\"你\\\"好吗\",danirod_,3"), + }, + want: want{ + expected: []byte("你\\\"好吗"), + read: 11, + err: nil, + }, + }, + { + name: "double quote non-ascii string with escaped char same as quote and other char same as separator", + args: args{ + quote: QuoteDouble, + sep: SeparatorComma, + payload: []byte("\"你\\\"好,吗\",danirod_,3"), + }, + want: want{ + expected: []byte("你\\\"好,吗"), + read: 12, + err: nil, + }, + }, + { + name: "simple quoted json", + args: args{ + quote: QuoteSimple, + sep: SeparatorComma, + payload: []byte(`'{"name":"Pato","age":3}',danirod_,3`), + }, + want: want{ + 
expected: []byte(`{"name":"Pato","age":3}`), + read: 23, + err: nil, + }, + }, + } + + for _, test := range tests { + t.Run(test.name, func(t *testing.T) { + str := StrType(test.args.quote, test.args.sep) + actual, read, actualErr := str.Parse(test.args.payload) + if !errors.Is(test.want.err, actualErr) { + t.Fatalf("unexpected error. want %v, have %v", + test.want.err, actualErr) + } + + if test.want.read != read { + t.Fatalf("unexpected bytes read, want %d have %d", + test.want.read, read) + } + + if strings.Compare(string(test.want.expected), actual) != 0 { + t.Fatalf("unexpected result. want %v, have %v", + string(test.want.expected), actual) + } + }) + } +} diff --git a/examples/response_stream/main.go b/examples/response_stream/main.go index c438e1d..a4614a2 100644 --- a/examples/response_stream/main.go +++ b/examples/response_stream/main.go @@ -36,20 +36,20 @@ func parseCSVResponse() { ignoreLines := 1 // in order to ignore header - parser := csvparser.NewParser[Repo]( + parser := csvparser.New[Repo]( csvparser.SeparatorComma, csvparser.IntCol[Repo]( - false, + csvparser.QuoteNone, nil, func(x *Repo, rank int) { x.Rank = rank }, ), csvparser.StringCol[Repo]( - false, + csvparser.QuoteNone, nil, func(x *Repo, name string) { x.Name = name }, ), csvparser.IntCol[Repo]( - false, + csvparser.QuoteNone, nil, func(x *Repo, stars int) { x.Stars = stars }, ),