Skip to content

Commit

Permalink
fix: robuster csv parser
Browse files Browse the repository at this point in the history
  • Loading branch information
sonirico committed Oct 6, 2022
1 parent 8634f13 commit fdff1b3
Show file tree
Hide file tree
Showing 7 changed files with 405 additions and 75 deletions.
48 changes: 29 additions & 19 deletions csvparser/columns.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,10 +2,16 @@ package csvparser

type (
Col[T any] interface {
Parse(data []byte, item *T) error
Parse(data []byte, item *T) (int, error)
//Compile(x T, writer io.Writer) error
}

opts struct {
sep byte
}

ColFactory[T any] func(opts) Col[T]

StringColumn[T any] struct {
inner StringType
setter func(x *T, v string)
Expand All @@ -25,42 +31,46 @@ type (
}
)

func (s StringColumn[T]) Parse(data []byte, item *T) error {
val, err := s.inner.Parse(data)
func (s StringColumn[T]) Parse(data []byte, item *T) (int, error) {
val, n, err := s.inner.Parse(data)
if err != nil {
return err
return n, err
}
s.setter(item, val)
return nil
return n, nil
}

func (c IntColumn[T]) Parse(data []byte, item *T) error {
val, err := c.inner.Parse(data)
func (c IntColumn[T]) Parse(data []byte, item *T) (int, error) {
val, n, err := c.inner.Parse(data)
if err != nil {
return err
return n, err
}
c.setter(item, val)
return nil
return n, nil
}

func StringCol[T any](
quoted bool,
quote byte,
getter func(T) string,
setter func(*T, string),
) Col[T] {
return StringColumn[T]{
inner: StrType(quoted),
getter: getter, setter: setter,
) ColFactory[T] {
return func(opts opts) Col[T] {
return StringColumn[T]{
inner: StrType(quote, opts.sep),
getter: getter, setter: setter,
}
}
}

func IntCol[T any](
quoted bool,
quote byte,
getter func(T) int,
setter func(*T, int),
) Col[T] {
return IntColumn[T]{
inner: IntType(quoted),
getter: getter, setter: setter,
) ColFactory[T] {
return func(opts opts) Col[T] {
return IntColumn[T]{
inner: IntType(quote, opts.sep),
getter: getter, setter: setter,
}
}
}
1 change: 1 addition & 0 deletions csvparser/errors.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,4 +4,5 @@ import "github.com/pkg/errors"

var (
// ErrColumnMismatch is the sentinel error with the message "column mismatch".
// NOTE(review): presumably signals that a line does not provide the number of
// columns the parser was configured with — confirm against current callers.
ErrColumnMismatch = errors.New("column mismatch")
// ErrQuoteExpected is the sentinel error with the message "quote was expected".
// NOTE(review): presumably returned by quoted column types when the configured
// quote byte is missing — confirm against the type parsers.
ErrQuoteExpected = errors.New("quote was expected")
)
63 changes: 31 additions & 32 deletions csvparser/parser.go
Original file line number Diff line number Diff line change
@@ -1,13 +1,16 @@
package csvparser

import (
"github.com/pkg/errors"
"github.com/sonirico/stadio/slices"
"bytes"
)

var (
quote = []byte{byte('"')}

QuoteDouble byte = '"'
QuoteSimple byte = '\''
QuoteNone byte = 0

SeparatorComma byte = ','
SeparatorSemicolon byte = ';'
SeparatorTab byte = '\t'
Expand All @@ -21,45 +24,41 @@ type (
)

func (p Parser[T]) Parse(data []byte, item *T) (err error) {
counter := 0
for _, col := range p.columns {
pos := slices.IndexOf[byte](
data,
func(x byte) bool {
return x == p.separator
},
)

lastCol := counter == len(p.columns)-1
data = bytes.TrimSpace(data) // cleanup phase
sepLen := 1 // len(p.separator)

if pos == -1 && !lastCol {
// Only if no more separators have been found, and current column is not the last one, yield error
err = errors.Wrapf(
ErrColumnMismatch,
"want %d, have %d",
len(p.columns),
counter,
)
for i, col := range p.columns {
var read int
read, err = col.Parse(data, item)
if err != nil {
return
}

payload := data
if !lastCol {
payload = data[:pos]
}
// TODO: handle read =0
_ = i

if err = col.Parse(payload, item); err != nil {
return err
if read > len(data) {
break
}

counter++

if !lastCol {
data = data[pos+1:]
// Keep a separate cursor for readability: column types only consume their own data,
// so the parser is responsible for advancing the cursor past the separator.
cursor := read
if read+sepLen <= len(data) {
cursor += sepLen
}

data = data[cursor:]
}
return nil
}

func NewParser[T any](sep byte, cols ...Col[T]) Parser[T] {
return Parser[T]{separator: sep, columns: cols}
// New builds a Parser that splits on sep. Every column factory is invoked once
// with the parser-wide options (currently only the separator), and the
// resulting columns are stored in the order the factories were given.
func New[T any](sep byte, cols ...ColFactory[T]) Parser[T] {
	shared := opts{sep: sep}

	built := make([]Col[T], 0, len(cols))
	for _, factory := range cols {
		built = append(built, factory(shared))
	}

	return Parser[T]{separator: sep, columns: built}
}
134 changes: 134 additions & 0 deletions csvparser/parser_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,134 @@
package csvparser

import (
"reflect"
"testing"

"github.com/pkg/errors"
)

// TestParser_RawColumn checks that unquoted (QuoteNone) string and int columns
// are parsed from comma-separated lines. The table covers a plain line, a
// trailing separator, surrounding whitespace and an empty first column.
func TestParser_RawColumn(t *testing.T) {
	type (
		// args holds the raw line and the separator handed to the parser.
		args struct {
			payload []byte
			sep     byte
		}

		// duck is the target record populated by the column setters.
		duck struct {
			Name     string
			Siblings int
		}

		// want holds the expected record and the expected error (nil for success).
		want struct {
			expected duck
			err      error
		}

		testCase struct {
			name string
			args args
			want want
		}
	)

	var (
		duckNameSetter = func(d *duck, name string) {
			d.Name = name
		}
		duckSiblingsSetter = func(d *duck, siblings int) {
			d.Siblings = siblings
		}
	)

	tests := []testCase{
		{
			name: "simple csv string line should parse",
			args: args{
				payload: []byte("a duck knight in shinny armor,2"),
				sep:     SeparatorComma,
			},
			want: want{
				expected: duck{
					Name:     "a duck knight in shinny armor",
					Siblings: 2,
				},
			},
		},
		{
			name: "simple csv string line with trailing separator should parse",
			args: args{
				payload: []byte("a duck knight in shinny armor,2,"),
				sep:     SeparatorComma,
			},
			want: want{
				expected: duck{
					Name:     "a duck knight in shinny armor",
					Siblings: 2,
				},
			},
		},
		{
			name: "simple csv string line with trailing separator and spaces should parse",
			args: args{
				payload: []byte("a duck knight in shinny armor,2, "),
				sep:     SeparatorComma,
			},
			want: want{
				expected: duck{
					Name:     "a duck knight in shinny armor",
					Siblings: 2,
				},
			},
		},
		{
			name: "simple csv string line with trailing spaces at the start and spaces should parse",
			args: args{
				payload: []byte(" a duck knight in shinny armor,2, "),
				sep:     SeparatorComma,
			},
			want: want{
				expected: duck{
					Name:     "a duck knight in shinny armor",
					Siblings: 2,
				},
			},
		},
		{
			name: "blank column should render emptiness",
			args: args{
				payload: []byte(",2"),
				sep:     SeparatorComma,
			},
			want: want{
				expected: duck{
					Name:     "",
					Siblings: 2,
				},
			},
		},
	}

	for _, test := range tests {
		t.Run(test.name, func(t *testing.T) {
			parser := New[duck](
				test.args.sep,
				StringCol[duck](QuoteNone, nil, duckNameSetter),
				IntCol[duck](QuoteNone, nil, duckSiblingsSetter),
			)

			rubberDuck := duck{}

			// errors.Is takes the error under test first and the target second.
			// With the arguments swapped, an expected sentinel would never match
			// a wrapped error returned by the parser.
			if err := parser.Parse(test.args.payload, &rubberDuck); !errors.Is(err, test.want.err) {
				t.Errorf("unexpected error, want %v, have %v",
					test.want.err, err)
			}

			if !reflect.DeepEqual(test.want.expected, rubberDuck) {
				t.Errorf("unexpected duck\nwant %v\nhave %v",
					test.want.expected, rubberDuck)
			}
		})
	}
}
Loading

0 comments on commit fdff1b3

Please sign in to comment.