diff --git a/cmd/parquetgen/dremel/testcases/doc/generated.go b/cmd/parquetgen/dremel/testcases/doc/generated.go index 5493d09..3b860a6 100644 --- a/cmd/parquetgen/dremel/testcases/doc/generated.go +++ b/cmd/parquetgen/dremel/testcases/doc/generated.go @@ -3,7 +3,6 @@ package doc // Code generated by github.com/parsyl/parquet. DO NOT EDIT. import ( - "bytes" "encoding/binary" "fmt" "io" @@ -11,9 +10,9 @@ import ( "github.com/parsyl/parquet" sch "github.com/parsyl/parquet/schema" + "github.com/valyala/bytebufferpool" "math" - "sort" ) type compression int @@ -25,6 +24,8 @@ const ( compressionUnknown compression = -1 ) +var buffpool = bytebufferpool.Pool{} + // ParquetWriter reprents a row group type ParquetWriter struct { fields []Field @@ -698,9 +699,13 @@ func (f *Int64Field) Read(r io.ReadSeeker, pg parquet.Page) error { } func (f *Int64Field) Write(w io.Writer, meta *parquet.Metadata) error { - var buf bytes.Buffer + buf := buffpool.Get() + defer buffpool.Put(buf) + + bs := make([]byte, 8) for _, v := range f.vals { - if err := binary.Write(&buf, binary.LittleEndian, v); err != nil { + binary.LittleEndian.PutUint64(bs, uint64(v)) + if _, err := buf.Write(bs); err != nil { return err } } @@ -748,9 +753,13 @@ func (f *Int64OptionalField) Schema() parquet.Field { } func (f *Int64OptionalField) Write(w io.Writer, meta *parquet.Metadata) error { - var buf bytes.Buffer + buf := buffpool.Get() + defer buffpool.Put(buf) + + bs := make([]byte, 8) for _, v := range f.vals { - if err := binary.Write(&buf, binary.LittleEndian, v); err != nil { + binary.LittleEndian.PutUint64(bs, uint64(v)) + if _, err := buf.Write(bs); err != nil { return err } } @@ -837,13 +846,16 @@ func (f *StringOptionalField) Scan(r *Document) { } func (f *StringOptionalField) Write(w io.Writer, meta *parquet.Metadata) error { - buf := bytes.Buffer{} + buf := buffpool.Get() + defer buffpool.Put(buf) + bs := make([]byte, 4) for _, s := range f.vals { - if err := binary.Write(&buf, binary.LittleEndian, 
int32(len(s))); err != nil { + binary.LittleEndian.PutUint32(bs, uint32(len(s))) + if _, err := buf.Write(bs); err != nil { return err } - buf.Write([]byte(s)) + buf.WriteString(s) } return f.DoWrite(w, meta, buf.Bytes(), len(f.Defs), f.stats) @@ -894,10 +906,10 @@ func (i *int64stats) add(val int64) { } } -func (f *int64stats) bytes(val int64) []byte { - var buf bytes.Buffer - binary.Write(&buf, binary.LittleEndian, val) - return buf.Bytes() +func (f *int64stats) bytes(v int64) []byte { + bs := make([]byte, 8) + binary.LittleEndian.PutUint64(bs, uint64(v)) + return bs } func (f *int64stats) NullCount() *int64 { @@ -951,10 +963,10 @@ func (f *int64optionalStats) add(vals []int64, defs []uint8) { } } -func (f *int64optionalStats) bytes(val int64) []byte { - var buf bytes.Buffer - binary.Write(&buf, binary.LittleEndian, val) - return buf.Bytes() +func (f *int64optionalStats) bytes(v int64) []byte { + bs := make([]byte, 8) + binary.LittleEndian.PutUint64(bs, uint64(v)) + return bs } func (f *int64optionalStats) NullCount() *int64 { @@ -979,16 +991,21 @@ func (f *int64optionalStats) Max() []byte { return f.bytes(f.max) } +const nilOptString = "__#NIL#__" + type stringOptionalStats struct { - vals []string - min []byte - max []byte + min string + max string nils int64 maxDef uint8 } func newStringOptionalStats(d uint8) *stringOptionalStats { - return &stringOptionalStats{maxDef: d} + return &stringOptionalStats{ + min: nilOptString, + max: nilOptString, + maxDef: d, + } } func (s *stringOptionalStats) add(vals []string, defs []uint8) { @@ -997,7 +1014,21 @@ func (s *stringOptionalStats) add(vals []string, defs []uint8) { if def < s.maxDef { s.nils++ } else { - s.vals = append(s.vals, vals[i]) + val := vals[i] + if s.min == nilOptString { + s.min = val + } else { + if val < s.min { + s.min = val + } + } + if s.max == nilOptString { + s.max = val + } else { + if val > s.max { + s.max = val + } + } i++ } } @@ -1012,29 +1043,17 @@ func (s *stringOptionalStats) 
DistinctCount() *int64 { } func (s *stringOptionalStats) Min() []byte { - if s.min == nil { - s.minMax() + if s.min == nilOptString { + return nil } - return s.min + return []byte(s.min) } func (s *stringOptionalStats) Max() []byte { - if s.max == nil { - s.minMax() - } - return s.max -} - -func (s *stringOptionalStats) minMax() { - if len(s.vals) == 0 { - return + if s.max == nilOptString { + return nil } - - tmp := make([]string, len(s.vals)) - copy(tmp, s.vals) - sort.Strings(tmp) - s.min = []byte(tmp[0]) - s.max = []byte(tmp[len(tmp)-1]) + return []byte(s.max) } func pint32(i int32) *int32 { return &i } diff --git a/cmd/parquetgen/dremel/testcases/person/generated.go b/cmd/parquetgen/dremel/testcases/person/generated.go index 76cd891..01a7e84 100644 --- a/cmd/parquetgen/dremel/testcases/person/generated.go +++ b/cmd/parquetgen/dremel/testcases/person/generated.go @@ -3,7 +3,6 @@ package person // Code generated by github.com/parsyl/parquet. DO NOT EDIT. import ( - "bytes" "encoding/binary" "fmt" "io" @@ -11,9 +10,9 @@ import ( "github.com/parsyl/parquet" sch "github.com/parsyl/parquet/schema" + "github.com/valyala/bytebufferpool" "math" - "sort" ) type compression int @@ -25,6 +24,8 @@ const ( compressionUnknown compression = -1 ) +var buffpool = bytebufferpool.Pool{} + // ParquetWriter reprents a row group type ParquetWriter struct { fields []Field @@ -545,13 +546,16 @@ func (f *StringField) Schema() parquet.Field { } func (f *StringField) Write(w io.Writer, meta *parquet.Metadata) error { - buf := bytes.Buffer{} + buf := buffpool.Get() + defer buffpool.Put(buf) + bs := make([]byte, 4) for _, s := range f.vals { - if err := binary.Write(&buf, binary.LittleEndian, int32(len(s))); err != nil { + binary.LittleEndian.PutUint32(bs, uint32(len(s))) + if _, err := buf.Write(bs); err != nil { return err } - buf.Write([]byte(s)) + buf.WriteString(s) } return f.DoWrite(w, meta, buf.Bytes(), len(f.vals), f.stats) @@ -640,13 +644,16 @@ func (f *StringOptionalField) Scan(r 
*Person) { } func (f *StringOptionalField) Write(w io.Writer, meta *parquet.Metadata) error { - buf := bytes.Buffer{} + buf := buffpool.Get() + defer buffpool.Put(buf) + bs := make([]byte, 4) for _, s := range f.vals { - if err := binary.Write(&buf, binary.LittleEndian, int32(len(s))); err != nil { + binary.LittleEndian.PutUint32(bs, uint32(len(s))) + if _, err := buf.Write(bs); err != nil { return err } - buf.Write([]byte(s)) + buf.WriteString(s) } return f.DoWrite(w, meta, buf.Bytes(), len(f.Defs), f.stats) @@ -699,9 +706,13 @@ func (f *Int32OptionalField) Schema() parquet.Field { } func (f *Int32OptionalField) Write(w io.Writer, meta *parquet.Metadata) error { - var buf bytes.Buffer + buf := buffpool.Get() + defer buffpool.Put(buf) + + bs := make([]byte, 4) for _, v := range f.vals { - if err := binary.Write(&buf, binary.LittleEndian, v); err != nil { + binary.LittleEndian.PutUint32(bs, uint32(v)) + if _, err := buf.Write(bs); err != nil { return err } } @@ -745,18 +756,35 @@ func (f *Int32OptionalField) Levels() ([]uint8, []uint8) { return f.Defs, f.Reps } +const nilString = "__#NIL#__" + type stringStats struct { - vals []string - min []byte - max []byte + min string + max string } func newStringStats() *stringStats { - return &stringStats{} + return &stringStats{ + min: nilString, + max: nilString, + } } func (s *stringStats) add(val string) { - s.vals = append(s.vals, val) + if s.min == nilString { + s.min = val + } else { + if val < s.min { + s.min = val + } + } + if s.max == nilString { + s.max = val + } else { + if val > s.max { + s.max = val + } + } } func (s *stringStats) NullCount() *int64 { @@ -768,41 +796,34 @@ func (s *stringStats) DistinctCount() *int64 { } func (s *stringStats) Min() []byte { - if s.min == nil { - s.minMax() + if s.min == nilString { + return nil } - return s.min + return []byte(s.min) } func (s *stringStats) Max() []byte { - if s.max == nil { - s.minMax() + if s.max == nilString { + return nil } - return s.max + return 
[]byte(s.max) } -func (s *stringStats) minMax() { - if len(s.vals) == 0 { - return - } - - tmp := make([]string, len(s.vals)) - copy(tmp, s.vals) - sort.Strings(tmp) - s.min = []byte(tmp[0]) - s.max = []byte(tmp[len(tmp)-1]) -} +const nilOptString = "__#NIL#__" type stringOptionalStats struct { - vals []string - min []byte - max []byte + min string + max string nils int64 maxDef uint8 } func newStringOptionalStats(d uint8) *stringOptionalStats { - return &stringOptionalStats{maxDef: d} + return &stringOptionalStats{ + min: nilOptString, + max: nilOptString, + maxDef: d, + } } func (s *stringOptionalStats) add(vals []string, defs []uint8) { @@ -811,7 +832,21 @@ func (s *stringOptionalStats) add(vals []string, defs []uint8) { if def < s.maxDef { s.nils++ } else { - s.vals = append(s.vals, vals[i]) + val := vals[i] + if s.min == nilOptString { + s.min = val + } else { + if val < s.min { + s.min = val + } + } + if s.max == nilOptString { + s.max = val + } else { + if val > s.max { + s.max = val + } + } i++ } } @@ -826,29 +861,17 @@ func (s *stringOptionalStats) DistinctCount() *int64 { } func (s *stringOptionalStats) Min() []byte { - if s.min == nil { - s.minMax() + if s.min == nilOptString { + return nil } - return s.min + return []byte(s.min) } func (s *stringOptionalStats) Max() []byte { - if s.max == nil { - s.minMax() - } - return s.max -} - -func (s *stringOptionalStats) minMax() { - if len(s.vals) == 0 { - return + if s.max == nilOptString { + return nil } - - tmp := make([]string, len(s.vals)) - copy(tmp, s.vals) - sort.Strings(tmp) - s.min = []byte(tmp[0]) - s.max = []byte(tmp[len(tmp)-1]) + return []byte(s.max) } type int32optionalStats struct { @@ -886,10 +909,10 @@ func (f *int32optionalStats) add(vals []int32, defs []uint8) { } } -func (f *int32optionalStats) bytes(val int32) []byte { - var buf bytes.Buffer - binary.Write(&buf, binary.LittleEndian, val) - return buf.Bytes() +func (f *int32optionalStats) bytes(v int32) []byte { + bs := make([]byte, 4) + 
binary.LittleEndian.PutUint32(bs, uint32(v)) + return bs } func (f *int32optionalStats) NullCount() *int64 { diff --git a/cmd/parquetgen/dremel/testcases/repetition/generated.go b/cmd/parquetgen/dremel/testcases/repetition/generated.go index 9f06de4..bcb84f8 100644 --- a/cmd/parquetgen/dremel/testcases/repetition/generated.go +++ b/cmd/parquetgen/dremel/testcases/repetition/generated.go @@ -3,7 +3,6 @@ package repetition // Code generated by github.com/parsyl/parquet. DO NOT EDIT. import ( - "bytes" "encoding/binary" "fmt" "io" @@ -11,8 +10,7 @@ import ( "github.com/parsyl/parquet" sch "github.com/parsyl/parquet/schema" - - "sort" + "github.com/valyala/bytebufferpool" ) type compression int @@ -24,6 +22,8 @@ const ( compressionUnknown compression = -1 ) +var buffpool = bytebufferpool.Pool{} + // ParquetWriter reprents a row group type ParquetWriter struct { fields []Field @@ -823,13 +823,16 @@ func (f *StringOptionalField) Scan(r *Document) { } func (f *StringOptionalField) Write(w io.Writer, meta *parquet.Metadata) error { - buf := bytes.Buffer{} + buf := buffpool.Get() + defer buffpool.Put(buf) + bs := make([]byte, 4) for _, s := range f.vals { - if err := binary.Write(&buf, binary.LittleEndian, int32(len(s))); err != nil { + binary.LittleEndian.PutUint32(bs, uint32(len(s))) + if _, err := buf.Write(bs); err != nil { return err } - buf.Write([]byte(s)) + buf.WriteString(s) } return f.DoWrite(w, meta, buf.Bytes(), len(f.Defs), f.stats) @@ -860,16 +863,21 @@ func (f *StringOptionalField) Levels() ([]uint8, []uint8) { return f.Defs, f.Reps } +const nilOptString = "__#NIL#__" + type stringOptionalStats struct { - vals []string - min []byte - max []byte + min string + max string nils int64 maxDef uint8 } func newStringOptionalStats(d uint8) *stringOptionalStats { - return &stringOptionalStats{maxDef: d} + return &stringOptionalStats{ + min: nilOptString, + max: nilOptString, + maxDef: d, + } } func (s *stringOptionalStats) add(vals []string, defs []uint8) { @@ -878,7 
+886,21 @@ func (s *stringOptionalStats) add(vals []string, defs []uint8) { if def < s.maxDef { s.nils++ } else { - s.vals = append(s.vals, vals[i]) + val := vals[i] + if s.min == nilOptString { + s.min = val + } else { + if val < s.min { + s.min = val + } + } + if s.max == nilOptString { + s.max = val + } else { + if val > s.max { + s.max = val + } + } i++ } } @@ -893,29 +915,17 @@ func (s *stringOptionalStats) DistinctCount() *int64 { } func (s *stringOptionalStats) Min() []byte { - if s.min == nil { - s.minMax() + if s.min == nilOptString { + return nil } - return s.min + return []byte(s.min) } func (s *stringOptionalStats) Max() []byte { - if s.max == nil { - s.minMax() - } - return s.max -} - -func (s *stringOptionalStats) minMax() { - if len(s.vals) == 0 { - return + if s.max == nilOptString { + return nil } - - tmp := make([]string, len(s.vals)) - copy(tmp, s.vals) - sort.Strings(tmp) - s.min = []byte(tmp[0]) - s.max = []byte(tmp[len(tmp)-1]) + return []byte(s.max) } func pint32(i int32) *int32 { return &i } diff --git a/cmd/parquetgen/gen/funcs.go b/cmd/parquetgen/gen/funcs.go index 698ed77..8812561 100644 --- a/cmd/parquetgen/gen/funcs.go +++ b/cmd/parquetgen/gen/funcs.go @@ -43,16 +43,12 @@ var ( }, "imports": func(fields []fields.Field) []string { var out []string - var intFound, stringFound bool + var intFound bool for _, f := range fields { if !intFound && strings.Contains(f.Type, "int") { intFound = true out = append(out, `"math"`) } - if !stringFound && strings.Contains(f.Type, "string") { - stringFound = true - out = append(out, `"sort"`) - } } return out }, @@ -85,5 +81,57 @@ var ( } return "parquet.RequiredField" }, + "byteSize": func(f fields.Field) string { + var out string + switch f.Type { + case "int32", "*int32", "uint32", "*uint32", "float32", "*float32": + out = "4" + case "int64", "*int64", "uint64", "*uint64", "float64", "*float64": + out = "8" + } + return out + }, + // based on binary.Write + "putFunc": func(f fields.Field) string { + 
var out string + switch f.Type { + case "int32", "*int32", "uint32", "*uint32", "float32", "*float32": + out = "PutUint32" + case "int64", "*int64", "uint64", "*uint64", "float64", "*float64": + out = "PutUint64" + } + return out + }, + // based on binary.Write + "uintFunc": func(f fields.Field) string { + var out string + switch f.Type { + case "int32": + out = "uint32(v)" + case "*int32": + out = "uint32(*v)" + case "uint32": + out = "v" + case "*uint32": + out = "*v" + case "float32": + out = "math.Float32bits(v)" + case "*float32": + out = "math.Float32bits(*v)" + case "int64": + out = "uint64(v)" + case "*int64": + out = "uint64(*v)" + case "uint64": + out = "v" + case "*uint64": + out = "*v" + case "float64": + out = "math.Float64bits(v)" + case "*float64": + out = "math.Float64bits(*v)" + } + return out + }, } ) diff --git a/cmd/parquetgen/gen/gen.go b/cmd/parquetgen/gen/gen.go index 94def56..55be363 100644 --- a/cmd/parquetgen/gen/gen.go +++ b/cmd/parquetgen/gen/gen.go @@ -193,7 +193,7 @@ func getImport(i string) string { if i == "" { return "" } - return fmt.Sprintf(`"%s"`, i) + return fmt.Sprintf(`. 
"%s"`, i) } type newStruct struct { diff --git a/cmd/parquetgen/gen/template.go b/cmd/parquetgen/gen/template.go index 0f7bc67..672d2ac 100644 --- a/cmd/parquetgen/gen/template.go +++ b/cmd/parquetgen/gen/template.go @@ -9,10 +9,10 @@ var tpl = `package {{.Package}} import ( "fmt" "io" - "bytes" "strings" "encoding/binary" + "github.com/valyala/bytebufferpool" "github.com/parsyl/parquet" sch "github.com/parsyl/parquet/schema" {{.Import}} @@ -29,6 +29,8 @@ const ( compressionUnknown compression = -1 ) +var buffpool = bytebufferpool.Pool{} + // ParquetWriter reprents a row group type ParquetWriter struct { fields []Field diff --git a/cmd/parquetgen/gen/template_optional.go b/cmd/parquetgen/gen/template_optional.go index 691948a..1a6a1e0 100644 --- a/cmd/parquetgen/gen/template_optional.go +++ b/cmd/parquetgen/gen/template_optional.go @@ -26,9 +26,13 @@ func (f *{{.FieldType}}) Schema() parquet.Field { } func (f *{{.FieldType}}) Write(w io.Writer, meta *parquet.Metadata) error { - var buf bytes.Buffer + buf := buffpool.Get() + defer buffpool.Put(buf) + + bs := make([]byte, {{byteSize .}}) for _, v := range f.vals { - if err := binary.Write(&buf, binary.LittleEndian, v); err != nil { + binary.LittleEndian.{{ putFunc . }}(bs, {{ uintFunc . }}) + if _, err := buf.Write(bs); err != nil { return err } } @@ -109,10 +113,10 @@ func (f *{{removeStar .TypeName}}optionalStats) add(vals []{{removeStar .TypeNam } } -func (f *{{removeStar .TypeName}}optionalStats) bytes(val {{removeStar .TypeName}}) []byte { - var buf bytes.Buffer - binary.Write(&buf, binary.LittleEndian, val) - return buf.Bytes() +func (f *{{removeStar .TypeName}}optionalStats) bytes(v {{removeStar .TypeName}}) []byte { + bs := make([]byte, {{byteSize .}}) + binary.LittleEndian.{{ putFunc . }}(bs, {{ uintFunc . 
}}) + return bs } func (f *{{removeStar .TypeName}}optionalStats) NullCount() *int64 { diff --git a/cmd/parquetgen/gen/template_required.go b/cmd/parquetgen/gen/template_required.go index d8ca5ee..4610863 100644 --- a/cmd/parquetgen/gen/template_required.go +++ b/cmd/parquetgen/gen/template_required.go @@ -35,9 +35,13 @@ func (f *{{.FieldType}}) Read(r io.ReadSeeker, pg parquet.Page) error { } func (f *{{.FieldType}}) Write(w io.Writer, meta *parquet.Metadata) error { - var buf bytes.Buffer + buf := buffpool.Get() + defer buffpool.Put(buf) + + bs := make([]byte, {{byteSize .}}) for _, v := range f.vals { - if err := binary.Write(&buf, binary.LittleEndian, v); err != nil { + binary.LittleEndian.{{ putFunc . }}(bs, {{ uintFunc . }}) + if _, err := buf.Write(bs); err != nil { return err } } @@ -85,10 +89,10 @@ func (i *{{.TypeName}}stats) add(val {{.TypeName}}) { } } -func (f *{{.TypeName}}stats) bytes(val {{.TypeName}}) []byte { - var buf bytes.Buffer - binary.Write(&buf, binary.LittleEndian, val) - return buf.Bytes() +func (f *{{.TypeName}}stats) bytes(v {{.TypeName}}) []byte { + bs := make([]byte, {{byteSize .}}) + binary.LittleEndian.{{ putFunc . }}(bs, {{ uintFunc . 
}}) + return bs } func (f *{{.TypeName}}stats) NullCount() *int64 { diff --git a/cmd/parquetgen/gen/template_string.go b/cmd/parquetgen/gen/template_string.go index edd0532..0b54377 100644 --- a/cmd/parquetgen/gen/template_string.go +++ b/cmd/parquetgen/gen/template_string.go @@ -23,13 +23,16 @@ func (f *StringField) Schema() parquet.Field { } func (f *StringField) Write(w io.Writer, meta *parquet.Metadata) error { - buf := bytes.Buffer{} + buf := buffpool.Get() + defer buffpool.Put(buf) + bs := make([]byte, 4) for _, s := range f.vals { - if err := binary.Write(&buf, binary.LittleEndian, int32(len(s))); err != nil { + binary.LittleEndian.PutUint32(bs, uint32(len(s))) + if _, err := buf.Write(bs); err != nil { return err } - buf.Write([]byte(s)) + buf.WriteString(s) } return f.DoWrite(w, meta, buf.Bytes(), len(f.vals), f.stats) @@ -77,18 +80,36 @@ func (f *StringField) Levels() ([]uint8, []uint8) { {{end}}` var stringStatsTpl = `{{define "stringStats"}} + +const nilString = "__#NIL#__" + type stringStats struct { - vals []string - min []byte - max []byte + min string + max string } func newStringStats() *stringStats { - return &stringStats{} + return &stringStats{ + min: nilString, + max: nilString, + } } func (s *stringStats) add(val string) { - s.vals = append(s.vals, val) + if s.min == nilString { + s.min = val + } else { + if val < s.min { + s.min = val + } + } + if s.max == nilString { + s.max = val + } else { + if val > s.max { + s.max = val + } + } } func (s *stringStats) NullCount() *int64 { @@ -100,28 +121,16 @@ func (s *stringStats) DistinctCount() *int64 { } func (s *stringStats) Min() []byte { - if s.min == nil { - s.minMax() + if s.min == nilString { + return nil } - return s.min + return []byte(s.min) } func (s *stringStats) Max() []byte { - if s.max == nil { - s.minMax() - } - return s.max -} - -func (s *stringStats) minMax() { - if len(s.vals) == 0 { - return + if s.max == nilString { + return nil } - - tmp := make([]string, len(s.vals)) - copy(tmp, 
s.vals) - sort.Strings(tmp) - s.min = []byte(tmp[0]) - s.max = []byte(tmp[len(tmp)-1]) + return []byte(s.max) } {{end}}` diff --git a/cmd/parquetgen/gen/template_string_optional.go b/cmd/parquetgen/gen/template_string_optional.go index 9b0a14d..47e3e23 100644 --- a/cmd/parquetgen/gen/template_string_optional.go +++ b/cmd/parquetgen/gen/template_string_optional.go @@ -44,13 +44,16 @@ func (f *StringOptionalField) Scan(r *{{.StructType}}) { } func (f *StringOptionalField) Write(w io.Writer, meta *parquet.Metadata) error { - buf := bytes.Buffer{} + buf := buffpool.Get() + defer buffpool.Put(buf) + bs := make([]byte, 4) for _, s := range f.vals { - if err := binary.Write(&buf, binary.LittleEndian, int32(len(s))); err != nil { + binary.LittleEndian.PutUint32(bs, uint32(len(s))) + if _, err := buf.Write(bs); err != nil { return err } - buf.Write([]byte(s)) + buf.WriteString(s) } return f.DoWrite(w, meta, buf.Bytes(), len(f.Defs), f.stats) @@ -83,16 +86,22 @@ func (f *StringOptionalField) Levels() ([]uint8, []uint8) { {{end}}` var stringOptionalStatsTpl = `{{define "stringOptionalStats"}} + +const nilOptString = "__#NIL#__" + type stringOptionalStats struct { - vals []string - min []byte - max []byte + min string + max string nils int64 maxDef uint8 } func newStringOptionalStats(d uint8) *stringOptionalStats { - return &stringOptionalStats{maxDef: d} + return &stringOptionalStats{ + min: nilOptString, + max: nilOptString, + maxDef: d, + } } func (s *stringOptionalStats) add(vals []string, defs []uint8) { @@ -101,7 +110,21 @@ func (s *stringOptionalStats) add(vals []string, defs []uint8) { if def < s.maxDef { s.nils++ } else { - s.vals = append(s.vals, vals[i]) + val := vals[i] + if s.min == nilOptString { + s.min = val + } else { + if val < s.min { + s.min = val + } + } + if s.max == nilOptString { + s.max = val + } else { + if val > s.max { + s.max = val + } + } i++ } } @@ -116,28 +139,16 @@ func (s *stringOptionalStats) DistinctCount() *int64 { } func (s 
*stringOptionalStats) Min() []byte { - if s.min == nil { - s.minMax() + if s.min == nilOptString { + return nil } - return s.min + return []byte(s.min) } func (s *stringOptionalStats) Max() []byte { - if s.max == nil { - s.minMax() + if s.max == nilOptString { + return nil } - return s.max -} - -func (s *stringOptionalStats) minMax() { - if len(s.vals) == 0 { - return - } - - tmp := make([]string, len(s.vals)) - copy(tmp, s.vals) - sort.Strings(tmp) - s.min = []byte(tmp[0]) - s.max = []byte(tmp[len(tmp)-1]) + return []byte(s.max) } {{end}}` diff --git a/fields.go b/fields.go index dfcb498..9959961 100644 --- a/fields.go +++ b/fields.go @@ -3,6 +3,7 @@ package parquet import ( "bytes" "compress/gzip" + "github.com/valyala/bytebufferpool" "math/bits" "strings" @@ -26,6 +27,11 @@ const ( Repeated RepetitionType = 2 ) +var ( + buffpool = bytebufferpool.Pool{} + compresspool = bytebufferpool.Pool{} +) + type RepetitionTypes []RepetitionType // MaxDef returns the largest definition level @@ -88,7 +94,10 @@ func RequiredFieldUncompressed(r *RequiredField) { // DoWrite writes the actual raw data. 
func (f *RequiredField) DoWrite(w io.Writer, meta *Metadata, vals []byte, count int, stats Stats) error { - l, cl, vals, err := compress(f.compression, vals) + buff := compresspool.Get() + defer compresspool.Put(buff) + + l, cl, vals, err := compress(f.compression, buff, vals) if err != nil { return err } @@ -218,11 +227,13 @@ func (f *OptionalField) valsFromDefs(defs []uint8, max uint8) int { return out } + // DoWrite is called by all optional field types to write the definition levels // and raw data to the io.Writer func (f *OptionalField) DoWrite(w io.Writer, meta *Metadata, vals []byte, count int, stats Stats) error { - buf := bytes.Buffer{} - wc := &writeCounter{w: &buf} + buf := buffpool.Get() + defer buffpool.Put(buf) + wc := &writeCounter{w: buf} var repLen int64 @@ -241,8 +252,14 @@ func (f *OptionalField) DoWrite(w io.Writer, meta *Metadata, vals []byte, count defLen := wc.n - repLen - wc.Write(vals) - l, cl, vals, err := compress(f.compression, buf.Bytes()) + if _, err = wc.Write(vals); err != nil { + return err + } + + compresBuf := compresspool.Get() + defer compresspool.Put(compresBuf) + + l, cl, vals, err := compress(f.compression, compresBuf, buf.Bytes()) if err != nil { return err } @@ -385,15 +402,20 @@ func pageData(r io.Reader, ph *sch.PageHeader, pg Page) ([]byte, error) { return data, nil } -func compress(codec sch.CompressionCodec, vals []byte) (int, int, []byte, error) { +func compress(codec sch.CompressionCodec, buf *bytebufferpool.ByteBuffer, vals []byte) (int, int, []byte, error) { var err error l := len(vals) switch codec { case sch.CompressionCodec_SNAPPY: - vals = snappy.Encode(nil, vals) + if v := snappy.MaxEncodedLen(len(vals)); v > cap(buf.B) { + buf.B = make([]byte, v) + } else { + buf.B = buf.B[:v] + } + + vals = snappy.Encode(buf.B, vals) case sch.CompressionCodec_GZIP: - var buf bytes.Buffer - zw, err := gzip.NewWriterLevel(&buf, gzip.BestSpeed) + zw, err := gzip.NewWriterLevel(buf, gzip.BestSpeed) if err != nil { return l, 0, 
vals, err } diff --git a/go.mod b/go.mod index eb21143..8511a82 100644 --- a/go.mod +++ b/go.mod @@ -4,6 +4,8 @@ go 1.13 require ( github.com/apache/thrift v0.13.0 + github.com/bxcodec/faker/v3 v3.6.0 github.com/golang/snappy v0.0.2 github.com/stretchr/testify v1.7.0 + github.com/valyala/bytebufferpool v1.0.0 // indirect ) diff --git a/go.sum b/go.sum index ad4153a..2d7e91f 100644 --- a/go.sum +++ b/go.sum @@ -1,5 +1,7 @@ github.com/apache/thrift v0.13.0 h1:5hryIiq9gtn+MiLVn0wP37kb/uTeRZgN08WoCsAhIhI= github.com/apache/thrift v0.13.0/go.mod h1:cp2SuWMxlEZw2r+iP2GNCdIi4C1qmUzdZFSVb+bacwQ= +github.com/bxcodec/faker/v3 v3.6.0 h1:Meuh+M6pQJsQJwxVALq6H5wpDzkZ4pStV9pmH7gbKKs= +github.com/bxcodec/faker/v3 v3.6.0/go.mod h1:gF31YgnMSMKgkvl+fyEo1xuSMbEuieyqfeslGYFjneM= github.com/davecgh/go-spew v1.1.0 h1:ZDRjVQ15GmhC3fiQ8ni8+OwkZQO4DARzQgrnXU1Liz8= github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= github.com/golang/snappy v0.0.2 h1:aeE13tS0IiQgFjYdoL8qN3K1N2bXXtI6Vi51/y7BpMw= @@ -9,6 +11,8 @@ github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZN github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= github.com/stretchr/testify v1.7.0 h1:nwc3DEeHmmLAfoZucVR881uASk0Mfjw8xYJ99tb5CcY= github.com/stretchr/testify v1.7.0/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= +github.com/valyala/bytebufferpool v1.0.0 h1:GqA5TC/0021Y/b9FG4Oi9Mr3q7XYx6KllzawFIhcdPw= +github.com/valyala/bytebufferpool v1.0.0/go.mod h1:6bBcMArwyJ5K/AmCkWv1jt77kVWyCJ6HpOuEn7z0Csc= gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405 h1:yhCVgyC4o1eVCa2tZl7eS0r+SDo693bJlVdllGtEeKM= gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c h1:dUUwHk2QECo/6vqA44rthZ8ie2QXMNeKRTHCNY2nXvo= diff --git a/parquet_generated_test.go b/parquet_generated_test.go index 42dd044..0051b06 100644 --- 
a/parquet_generated_test.go +++ b/parquet_generated_test.go @@ -3,7 +3,6 @@ package parquet_test // Code generated by github.com/parsyl/parquet. DO NOT EDIT. import ( - "bytes" "encoding/binary" "fmt" "io" @@ -11,9 +10,9 @@ import ( "github.com/parsyl/parquet" sch "github.com/parsyl/parquet/schema" + "github.com/valyala/bytebufferpool" "math" - "sort" ) type compression int @@ -25,6 +24,8 @@ const ( compressionUnknown compression = -1 ) +var buffpool = bytebufferpool.Pool{} + // ParquetWriter reprents a row group type ParquetWriter struct { fields []Field @@ -901,9 +902,13 @@ func (f *Int32Field) Read(r io.ReadSeeker, pg parquet.Page) error { } func (f *Int32Field) Write(w io.Writer, meta *parquet.Metadata) error { - var buf bytes.Buffer + buf := buffpool.Get() + defer buffpool.Put(buf) + + bs := make([]byte, 4) for _, v := range f.vals { - if err := binary.Write(&buf, binary.LittleEndian, v); err != nil { + binary.LittleEndian.PutUint32(bs, uint32(v)) + if _, err := buf.Write(bs); err != nil { return err } } @@ -951,13 +956,16 @@ func (f *StringField) Schema() parquet.Field { } func (f *StringField) Write(w io.Writer, meta *parquet.Metadata) error { - buf := bytes.Buffer{} + buf := buffpool.Get() + defer buffpool.Put(buf) + bs := make([]byte, 4) for _, s := range f.vals { - if err := binary.Write(&buf, binary.LittleEndian, int32(len(s))); err != nil { + binary.LittleEndian.PutUint32(bs, uint32(len(s))) + if _, err := buf.Write(bs); err != nil { return err } - buf.Write([]byte(s)) + buf.WriteString(s) } return f.DoWrite(w, meta, buf.Bytes(), len(f.vals), f.stats) @@ -1025,9 +1033,13 @@ func (f *Int32OptionalField) Schema() parquet.Field { } func (f *Int32OptionalField) Write(w io.Writer, meta *parquet.Metadata) error { - var buf bytes.Buffer + buf := buffpool.Get() + defer buffpool.Put(buf) + + bs := make([]byte, 4) for _, v := range f.vals { - if err := binary.Write(&buf, binary.LittleEndian, v); err != nil { + binary.LittleEndian.PutUint32(bs, uint32(v)) + if _, 
err := buf.Write(bs); err != nil { return err } } @@ -1105,9 +1117,13 @@ func (f *Int64Field) Read(r io.ReadSeeker, pg parquet.Page) error { } func (f *Int64Field) Write(w io.Writer, meta *parquet.Metadata) error { - var buf bytes.Buffer + buf := buffpool.Get() + defer buffpool.Put(buf) + + bs := make([]byte, 8) for _, v := range f.vals { - if err := binary.Write(&buf, binary.LittleEndian, v); err != nil { + binary.LittleEndian.PutUint64(bs, uint64(v)) + if _, err := buf.Write(bs); err != nil { return err } } @@ -1155,9 +1171,13 @@ func (f *Int64OptionalField) Schema() parquet.Field { } func (f *Int64OptionalField) Write(w io.Writer, meta *parquet.Metadata) error { - var buf bytes.Buffer + buf := buffpool.Get() + defer buffpool.Put(buf) + + bs := make([]byte, 8) for _, v := range f.vals { - if err := binary.Write(&buf, binary.LittleEndian, v); err != nil { + binary.LittleEndian.PutUint64(bs, uint64(v)) + if _, err := buf.Write(bs); err != nil { return err } } @@ -1244,13 +1264,16 @@ func (f *StringOptionalField) Scan(r *Person) { } func (f *StringOptionalField) Write(w io.Writer, meta *parquet.Metadata) error { - buf := bytes.Buffer{} + buf := buffpool.Get() + defer buffpool.Put(buf) + bs := make([]byte, 4) for _, s := range f.vals { - if err := binary.Write(&buf, binary.LittleEndian, int32(len(s))); err != nil { + binary.LittleEndian.PutUint32(bs, uint32(len(s))) + if _, err := buf.Write(bs); err != nil { return err } - buf.Write([]byte(s)) + buf.WriteString(s) } return f.DoWrite(w, meta, buf.Bytes(), len(f.Defs), f.stats) @@ -1315,9 +1338,13 @@ func (f *Float32Field) Read(r io.ReadSeeker, pg parquet.Page) error { } func (f *Float32Field) Write(w io.Writer, meta *parquet.Metadata) error { - var buf bytes.Buffer + buf := buffpool.Get() + defer buffpool.Put(buf) + + bs := make([]byte, 4) for _, v := range f.vals { - if err := binary.Write(&buf, binary.LittleEndian, v); err != nil { + binary.LittleEndian.PutUint32(bs, math.Float32bits(v)) + if _, err := 
buf.Write(bs); err != nil { return err } } @@ -1377,9 +1404,13 @@ func (f *Float64Field) Read(r io.ReadSeeker, pg parquet.Page) error { } func (f *Float64Field) Write(w io.Writer, meta *parquet.Metadata) error { - var buf bytes.Buffer + buf := buffpool.Get() + defer buffpool.Put(buf) + + bs := make([]byte, 8) for _, v := range f.vals { - if err := binary.Write(&buf, binary.LittleEndian, v); err != nil { + binary.LittleEndian.PutUint64(bs, math.Float64bits(v)) + if _, err := buf.Write(bs); err != nil { return err } } @@ -1427,9 +1458,13 @@ func (f *Float32OptionalField) Schema() parquet.Field { } func (f *Float32OptionalField) Write(w io.Writer, meta *parquet.Metadata) error { - var buf bytes.Buffer + buf := buffpool.Get() + defer buffpool.Put(buf) + + bs := make([]byte, 4) for _, v := range f.vals { - if err := binary.Write(&buf, binary.LittleEndian, v); err != nil { + binary.LittleEndian.PutUint32(bs, math.Float32bits(v)) + if _, err := buf.Write(bs); err != nil { return err } } @@ -1578,9 +1613,13 @@ func (f *Uint32Field) Read(r io.ReadSeeker, pg parquet.Page) error { } func (f *Uint32Field) Write(w io.Writer, meta *parquet.Metadata) error { - var buf bytes.Buffer + buf := buffpool.Get() + defer buffpool.Put(buf) + + bs := make([]byte, 4) for _, v := range f.vals { - if err := binary.Write(&buf, binary.LittleEndian, v); err != nil { + binary.LittleEndian.PutUint32(bs, v) + if _, err := buf.Write(bs); err != nil { return err } } @@ -1628,9 +1667,13 @@ func (f *Uint64OptionalField) Schema() parquet.Field { } func (f *Uint64OptionalField) Write(w io.Writer, meta *parquet.Metadata) error { - var buf bytes.Buffer + buf := buffpool.Get() + defer buffpool.Put(buf) + + bs := make([]byte, 8) for _, v := range f.vals { - if err := binary.Write(&buf, binary.LittleEndian, v); err != nil { + binary.LittleEndian.PutUint64(bs, v) + if _, err := buf.Write(bs); err != nil { return err } } @@ -1756,10 +1799,10 @@ func (i *int32stats) add(val int32) { } } -func (f *int32stats) 
bytes(val int32) []byte { - var buf bytes.Buffer - binary.Write(&buf, binary.LittleEndian, val) - return buf.Bytes() +func (f *int32stats) bytes(v int32) []byte { + bs := make([]byte, 4) + binary.LittleEndian.PutUint32(bs, uint32(v)) + return bs } func (f *int32stats) NullCount() *int64 { @@ -1778,18 +1821,35 @@ func (f *int32stats) Max() []byte { return f.bytes(f.max) } +const nilString = "__#NIL#__" + type stringStats struct { - vals []string - min []byte - max []byte + min string + max string } func newStringStats() *stringStats { - return &stringStats{} + return &stringStats{ + min: nilString, + max: nilString, + } } func (s *stringStats) add(val string) { - s.vals = append(s.vals, val) + if s.min == nilString { + s.min = val + } else { + if val < s.min { + s.min = val + } + } + if s.max == nilString { + s.max = val + } else { + if val > s.max { + s.max = val + } + } } func (s *stringStats) NullCount() *int64 { @@ -1801,29 +1861,17 @@ func (s *stringStats) DistinctCount() *int64 { } func (s *stringStats) Min() []byte { - if s.min == nil { - s.minMax() + if s.min == nilString { + return nil } - return s.min + return []byte(s.min) } func (s *stringStats) Max() []byte { - if s.max == nil { - s.minMax() - } - return s.max -} - -func (s *stringStats) minMax() { - if len(s.vals) == 0 { - return + if s.max == nilString { + return nil } - - tmp := make([]string, len(s.vals)) - copy(tmp, s.vals) - sort.Strings(tmp) - s.min = []byte(tmp[0]) - s.max = []byte(tmp[len(tmp)-1]) + return []byte(s.max) } type int32optionalStats struct { @@ -1861,10 +1909,10 @@ func (f *int32optionalStats) add(vals []int32, defs []uint8) { } } -func (f *int32optionalStats) bytes(val int32) []byte { - var buf bytes.Buffer - binary.Write(&buf, binary.LittleEndian, val) - return buf.Bytes() +func (f *int32optionalStats) bytes(v int32) []byte { + bs := make([]byte, 4) + binary.LittleEndian.PutUint32(bs, uint32(v)) + return bs } func (f *int32optionalStats) NullCount() *int64 { @@ -1909,10 +1957,10 
@@ func (i *int64stats) add(val int64) { } } -func (f *int64stats) bytes(val int64) []byte { - var buf bytes.Buffer - binary.Write(&buf, binary.LittleEndian, val) - return buf.Bytes() +func (f *int64stats) bytes(v int64) []byte { + bs := make([]byte, 8) + binary.LittleEndian.PutUint64(bs, uint64(v)) + return bs } func (f *int64stats) NullCount() *int64 { @@ -1966,10 +2014,10 @@ func (f *int64optionalStats) add(vals []int64, defs []uint8) { } } -func (f *int64optionalStats) bytes(val int64) []byte { - var buf bytes.Buffer - binary.Write(&buf, binary.LittleEndian, val) - return buf.Bytes() +func (f *int64optionalStats) bytes(v int64) []byte { + bs := make([]byte, 8) + binary.LittleEndian.PutUint64(bs, uint64(v)) + return bs } func (f *int64optionalStats) NullCount() *int64 { @@ -1994,16 +2042,21 @@ func (f *int64optionalStats) Max() []byte { return f.bytes(f.max) } +const nilOptString = "__#NIL#__" + type stringOptionalStats struct { - vals []string - min []byte - max []byte + min string + max string nils int64 maxDef uint8 } func newStringOptionalStats(d uint8) *stringOptionalStats { - return &stringOptionalStats{maxDef: d} + return &stringOptionalStats{ + min: nilOptString, + max: nilOptString, + maxDef: d, + } } func (s *stringOptionalStats) add(vals []string, defs []uint8) { @@ -2012,7 +2065,21 @@ func (s *stringOptionalStats) add(vals []string, defs []uint8) { if def < s.maxDef { s.nils++ } else { - s.vals = append(s.vals, vals[i]) + val := vals[i] + if s.min == nilString { + s.min = val + } else { + if val < s.min { + s.min = val + } + } + if s.max == nilString { + s.max = val + } else { + if val > s.max { + s.max = val + } + } i++ } } @@ -2027,29 +2094,17 @@ func (s *stringOptionalStats) DistinctCount() *int64 { } func (s *stringOptionalStats) Min() []byte { - if s.min == nil { - s.minMax() + if s.min == nilOptString { + return nil } - return s.min + return []byte(s.min) } func (s *stringOptionalStats) Max() []byte { - if s.max == nil { - s.minMax() - } - 
return s.max -} - -func (s *stringOptionalStats) minMax() { - if len(s.vals) == 0 { - return + if s.max == nilOptString { + return nil } - - tmp := make([]string, len(s.vals)) - copy(tmp, s.vals) - sort.Strings(tmp) - s.min = []byte(tmp[0]) - s.max = []byte(tmp[len(tmp)-1]) + return []byte(s.max) } type float32stats struct { @@ -2072,10 +2127,10 @@ func (i *float32stats) add(val float32) { } } -func (f *float32stats) bytes(val float32) []byte { - var buf bytes.Buffer - binary.Write(&buf, binary.LittleEndian, val) - return buf.Bytes() +func (f *float32stats) bytes(v float32) []byte { + bs := make([]byte, 4) + binary.LittleEndian.PutUint32(bs, math.Float32bits(v)) + return bs } func (f *float32stats) NullCount() *int64 { @@ -2114,10 +2169,10 @@ func (i *float64stats) add(val float64) { } } -func (f *float64stats) bytes(val float64) []byte { - var buf bytes.Buffer - binary.Write(&buf, binary.LittleEndian, val) - return buf.Bytes() +func (f *float64stats) bytes(v float64) []byte { + bs := make([]byte, 8) + binary.LittleEndian.PutUint64(bs, math.Float64bits(v)) + return bs } func (f *float64stats) NullCount() *int64 { @@ -2171,10 +2226,10 @@ func (f *float32optionalStats) add(vals []float32, defs []uint8) { } } -func (f *float32optionalStats) bytes(val float32) []byte { - var buf bytes.Buffer - binary.Write(&buf, binary.LittleEndian, val) - return buf.Bytes() +func (f *float32optionalStats) bytes(v float32) []byte { + bs := make([]byte, 4) + binary.LittleEndian.PutUint32(bs, math.Float32bits(v)) + return bs } func (f *float32optionalStats) NullCount() *int64 { @@ -2252,10 +2307,10 @@ func (i *uint32stats) add(val uint32) { } } -func (f *uint32stats) bytes(val uint32) []byte { - var buf bytes.Buffer - binary.Write(&buf, binary.LittleEndian, val) - return buf.Bytes() +func (f *uint32stats) bytes(v uint32) []byte { + bs := make([]byte, 4) + binary.LittleEndian.PutUint32(bs, v) + return bs } func (f *uint32stats) NullCount() *int64 { @@ -2309,10 +2364,10 @@ func (f 
*uint64optionalStats) add(vals []uint64, defs []uint8) { } } -func (f *uint64optionalStats) bytes(val uint64) []byte { - var buf bytes.Buffer - binary.Write(&buf, binary.LittleEndian, val) - return buf.Bytes() +func (f *uint64optionalStats) bytes(v uint64) []byte { + bs := make([]byte, 8) + binary.LittleEndian.PutUint64(bs, v) + return bs } func (f *uint64optionalStats) NullCount() *int64 { diff --git a/performance/base/parquet.go b/performance/base/parquet.go new file mode 100644 index 0000000..b2c6fe8 --- /dev/null +++ b/performance/base/parquet.go @@ -0,0 +1,2442 @@ +package base + +// Code generated by github.com/parsyl/parquet. DO NOT EDIT. + +import ( + "bytes" + "encoding/binary" + "fmt" + "io" + "strings" + + "github.com/parsyl/parquet" + . "github.com/parsyl/parquet/performance/message" + sch "github.com/parsyl/parquet/schema" + "math" + "sort" +) + +type compression int + +const ( + compressionUncompressed compression = 0 + compressionSnappy compression = 1 + compressionGzip compression = 2 + compressionUnknown compression = -1 +) + +// ParquetWriter reprents a row group +type ParquetWriter struct { + fields []Field + + len int + + // child points to the next page + child *ParquetWriter + + // max is the number of Record items that can get written before + // a new set of column chunks is written + max int + + meta *parquet.Metadata + w io.Writer + compression compression +} + +func Fields(compression compression) []Field { + return []Field{ + NewStringOptionalField(readColStr0, writeColStr0, []string{"col_str_0"}, []int{1}, optionalFieldCompression(compression)), + NewStringField(readColStr1, writeColStr1, []string{"col_str_1"}, fieldCompression(compression)), + NewStringOptionalField(readColStr2, writeColStr2, []string{"col_str_2"}, []int{1}, optionalFieldCompression(compression)), + NewStringField(readColStr3, writeColStr3, []string{"col_str_3"}, fieldCompression(compression)), + NewStringOptionalField(readColStr4, writeColStr4, []string{"col_str_4"}, 
[]int{1}, optionalFieldCompression(compression)), + NewStringField(readColStr5, writeColStr5, []string{"col_str_5"}, fieldCompression(compression)), + NewStringOptionalField(readColStr6, writeColStr6, []string{"col_str_6"}, []int{1}, optionalFieldCompression(compression)), + NewStringField(readColStr7, writeColStr7, []string{"col_str_7"}, fieldCompression(compression)), + NewStringOptionalField(readColStr8, writeColStr8, []string{"col_str_8"}, []int{1}, optionalFieldCompression(compression)), + NewStringField(readColStr9, writeColStr9, []string{"col_str_9"}, fieldCompression(compression)), + NewInt64OptionalField(readColInt0, writeColInt0, []string{"col_int_0"}, []int{1}, optionalFieldCompression(compression)), + NewInt64Field(readColInt1, writeColInt1, []string{"col_int_1"}, fieldCompression(compression)), + NewInt64OptionalField(readColInt2, writeColInt2, []string{"col_int_2"}, []int{1}, optionalFieldCompression(compression)), + NewInt64Field(readColInt3, writeColInt3, []string{"col_int_3"}, fieldCompression(compression)), + NewInt64OptionalField(readColInt4, writeColInt4, []string{"col_int_4"}, []int{1}, optionalFieldCompression(compression)), + NewInt32OptionalField(readColInt32_0, writeColInt32_0, []string{"col_int_32_0"}, []int{1}, optionalFieldCompression(compression)), + NewInt32Field(readColInt32_1, writeColInt32_1, []string{"col_int_32_1"}, fieldCompression(compression)), + NewInt32OptionalField(readColInt32_2, writeColInt32_2, []string{"col_int_32_2"}, []int{1}, optionalFieldCompression(compression)), + NewInt32Field(readColInt32_3, writeColInt32_3, []string{"col_int_32_3"}, fieldCompression(compression)), + NewInt32OptionalField(readColInt32_4, writeColInt32_4, []string{"col_int_32_4"}, []int{1}, optionalFieldCompression(compression)), + NewFloat64OptionalField(readColFloat0, writeColFloat0, []string{"col_float_0"}, []int{1}, optionalFieldCompression(compression)), + NewFloat64Field(readColFloat1, writeColFloat1, []string{"col_float_1"}, 
fieldCompression(compression)), + NewFloat64OptionalField(readColFloat2, writeColFloat2, []string{"col_float_2"}, []int{1}, optionalFieldCompression(compression)), + NewFloat64Field(readColFloat3, writeColFloat3, []string{"col_float_3"}, fieldCompression(compression)), + NewFloat64OptionalField(readColFloat4, writeColFloat4, []string{"col_float_4"}, []int{1}, optionalFieldCompression(compression)), + NewFloat32OptionalField(readColFloat32_0, writeColFloat32_0, []string{"col_float_32_0"}, []int{1}, optionalFieldCompression(compression)), + NewFloat32Field(readColFloat32_1, writeColFloat32_1, []string{"col_float_32_1"}, fieldCompression(compression)), + NewFloat32OptionalField(readColFloat32_2, writeColFloat32_2, []string{"col_float_32_2"}, []int{1}, optionalFieldCompression(compression)), + NewFloat32Field(readColFloat32_3, writeColFloat32_3, []string{"col_float_32_3"}, fieldCompression(compression)), + NewFloat32OptionalField(readColFloat32_4, writeColFloat32_4, []string{"col_float_32_4"}, []int{1}, optionalFieldCompression(compression)), + NewBoolOptionalField(readColBool0, writeColBool0, []string{"col_bool_0"}, []int{1}, optionalFieldCompression(compression)), + NewBoolField(readColBool1, writeColBool1, []string{"col_bool_1"}, fieldCompression(compression)), + NewBoolOptionalField(readColBool2, writeColBool2, []string{"col_bool_2"}, []int{1}, optionalFieldCompression(compression)), + NewBoolField(readColBool3, writeColBool3, []string{"col_bool_3"}, fieldCompression(compression)), + NewBoolOptionalField(readColBool4, writeColBool4, []string{"col_bool_4"}, []int{1}, optionalFieldCompression(compression)), + NewBoolField(readColBool5, writeColBool5, []string{"col_bool_5"}, fieldCompression(compression)), + NewBoolOptionalField(readColBool6, writeColBool6, []string{"col_bool_6"}, []int{1}, optionalFieldCompression(compression)), + NewBoolField(readColBool7, writeColBool7, []string{"col_bool_7"}, fieldCompression(compression)), + NewBoolOptionalField(readColBool8, 
writeColBool8, []string{"col_bool_8"}, []int{1}, optionalFieldCompression(compression)), + NewBoolField(readColBool9, writeColBool9, []string{"col_bool_9"}, fieldCompression(compression)), + } +} + +func readColStr0(x Message) ([]string, []uint8, []uint8) { + switch { + case x.ColStr0 == nil: + return nil, []uint8{0}, nil + default: + return []string{*x.ColStr0}, []uint8{1}, nil + } +} + +func writeColStr0(x *Message, vals []string, defs, reps []uint8) (int, int) { + def := defs[0] + switch def { + case 1: + x.ColStr0 = pstring(vals[0]) + return 1, 1 + } + + return 0, 1 +} + +func readColStr1(x Message) string { + return x.ColStr1 +} + +func writeColStr1(x *Message, vals []string) { + x.ColStr1 = vals[0] +} + +func readColStr2(x Message) ([]string, []uint8, []uint8) { + switch { + case x.ColStr2 == nil: + return nil, []uint8{0}, nil + default: + return []string{*x.ColStr2}, []uint8{1}, nil + } +} + +func writeColStr2(x *Message, vals []string, defs, reps []uint8) (int, int) { + def := defs[0] + switch def { + case 1: + x.ColStr2 = pstring(vals[0]) + return 1, 1 + } + + return 0, 1 +} + +func readColStr3(x Message) string { + return x.ColStr3 +} + +func writeColStr3(x *Message, vals []string) { + x.ColStr3 = vals[0] +} + +func readColStr4(x Message) ([]string, []uint8, []uint8) { + switch { + case x.ColStr4 == nil: + return nil, []uint8{0}, nil + default: + return []string{*x.ColStr4}, []uint8{1}, nil + } +} + +func writeColStr4(x *Message, vals []string, defs, reps []uint8) (int, int) { + def := defs[0] + switch def { + case 1: + x.ColStr4 = pstring(vals[0]) + return 1, 1 + } + + return 0, 1 +} + +func readColStr5(x Message) string { + return x.ColStr5 +} + +func writeColStr5(x *Message, vals []string) { + x.ColStr5 = vals[0] +} + +func readColStr6(x Message) ([]string, []uint8, []uint8) { + switch { + case x.ColStr6 == nil: + return nil, []uint8{0}, nil + default: + return []string{*x.ColStr6}, []uint8{1}, nil + } +} + +func writeColStr6(x *Message, vals []string, 
defs, reps []uint8) (int, int) { + def := defs[0] + switch def { + case 1: + x.ColStr6 = pstring(vals[0]) + return 1, 1 + } + + return 0, 1 +} + +func readColStr7(x Message) string { + return x.ColStr7 +} + +func writeColStr7(x *Message, vals []string) { + x.ColStr7 = vals[0] +} + +func readColStr8(x Message) ([]string, []uint8, []uint8) { + switch { + case x.ColStr8 == nil: + return nil, []uint8{0}, nil + default: + return []string{*x.ColStr8}, []uint8{1}, nil + } +} + +func writeColStr8(x *Message, vals []string, defs, reps []uint8) (int, int) { + def := defs[0] + switch def { + case 1: + x.ColStr8 = pstring(vals[0]) + return 1, 1 + } + + return 0, 1 +} + +func readColStr9(x Message) string { + return x.ColStr9 +} + +func writeColStr9(x *Message, vals []string) { + x.ColStr9 = vals[0] +} + +func readColInt0(x Message) ([]int64, []uint8, []uint8) { + switch { + case x.ColInt0 == nil: + return nil, []uint8{0}, nil + default: + return []int64{*x.ColInt0}, []uint8{1}, nil + } +} + +func writeColInt0(x *Message, vals []int64, defs, reps []uint8) (int, int) { + def := defs[0] + switch def { + case 1: + x.ColInt0 = pint64(vals[0]) + return 1, 1 + } + + return 0, 1 +} + +func readColInt1(x Message) int64 { + return x.ColInt1 +} + +func writeColInt1(x *Message, vals []int64) { + x.ColInt1 = vals[0] +} + +func readColInt2(x Message) ([]int64, []uint8, []uint8) { + switch { + case x.ColInt2 == nil: + return nil, []uint8{0}, nil + default: + return []int64{*x.ColInt2}, []uint8{1}, nil + } +} + +func writeColInt2(x *Message, vals []int64, defs, reps []uint8) (int, int) { + def := defs[0] + switch def { + case 1: + x.ColInt2 = pint64(vals[0]) + return 1, 1 + } + + return 0, 1 +} + +func readColInt3(x Message) int64 { + return x.ColInt3 +} + +func writeColInt3(x *Message, vals []int64) { + x.ColInt3 = vals[0] +} + +func readColInt4(x Message) ([]int64, []uint8, []uint8) { + switch { + case x.ColInt4 == nil: + return nil, []uint8{0}, nil + default: + return []int64{*x.ColInt4}, 
[]uint8{1}, nil + } +} + +func writeColInt4(x *Message, vals []int64, defs, reps []uint8) (int, int) { + def := defs[0] + switch def { + case 1: + x.ColInt4 = pint64(vals[0]) + return 1, 1 + } + + return 0, 1 +} + +func readColInt32_0(x Message) ([]int32, []uint8, []uint8) { + switch { + case x.ColInt32_0 == nil: + return nil, []uint8{0}, nil + default: + return []int32{*x.ColInt32_0}, []uint8{1}, nil + } +} + +func writeColInt32_0(x *Message, vals []int32, defs, reps []uint8) (int, int) { + def := defs[0] + switch def { + case 1: + x.ColInt32_0 = pint32(vals[0]) + return 1, 1 + } + + return 0, 1 +} + +func readColInt32_1(x Message) int32 { + return x.ColInt32_1 +} + +func writeColInt32_1(x *Message, vals []int32) { + x.ColInt32_1 = vals[0] +} + +func readColInt32_2(x Message) ([]int32, []uint8, []uint8) { + switch { + case x.ColInt32_2 == nil: + return nil, []uint8{0}, nil + default: + return []int32{*x.ColInt32_2}, []uint8{1}, nil + } +} + +func writeColInt32_2(x *Message, vals []int32, defs, reps []uint8) (int, int) { + def := defs[0] + switch def { + case 1: + x.ColInt32_2 = pint32(vals[0]) + return 1, 1 + } + + return 0, 1 +} + +func readColInt32_3(x Message) int32 { + return x.ColInt32_3 +} + +func writeColInt32_3(x *Message, vals []int32) { + x.ColInt32_3 = vals[0] +} + +func readColInt32_4(x Message) ([]int32, []uint8, []uint8) { + switch { + case x.ColInt32_4 == nil: + return nil, []uint8{0}, nil + default: + return []int32{*x.ColInt32_4}, []uint8{1}, nil + } +} + +func writeColInt32_4(x *Message, vals []int32, defs, reps []uint8) (int, int) { + def := defs[0] + switch def { + case 1: + x.ColInt32_4 = pint32(vals[0]) + return 1, 1 + } + + return 0, 1 +} + +func readColFloat0(x Message) ([]float64, []uint8, []uint8) { + switch { + case x.ColFloat0 == nil: + return nil, []uint8{0}, nil + default: + return []float64{*x.ColFloat0}, []uint8{1}, nil + } +} + +func writeColFloat0(x *Message, vals []float64, defs, reps []uint8) (int, int) { + def := defs[0] + 
switch def { + case 1: + x.ColFloat0 = pfloat64(vals[0]) + return 1, 1 + } + + return 0, 1 +} + +func readColFloat1(x Message) float64 { + return x.ColFloat1 +} + +func writeColFloat1(x *Message, vals []float64) { + x.ColFloat1 = vals[0] +} + +func readColFloat2(x Message) ([]float64, []uint8, []uint8) { + switch { + case x.ColFloat2 == nil: + return nil, []uint8{0}, nil + default: + return []float64{*x.ColFloat2}, []uint8{1}, nil + } +} + +func writeColFloat2(x *Message, vals []float64, defs, reps []uint8) (int, int) { + def := defs[0] + switch def { + case 1: + x.ColFloat2 = pfloat64(vals[0]) + return 1, 1 + } + + return 0, 1 +} + +func readColFloat3(x Message) float64 { + return x.ColFloat3 +} + +func writeColFloat3(x *Message, vals []float64) { + x.ColFloat3 = vals[0] +} + +func readColFloat4(x Message) ([]float64, []uint8, []uint8) { + switch { + case x.ColFloat4 == nil: + return nil, []uint8{0}, nil + default: + return []float64{*x.ColFloat4}, []uint8{1}, nil + } +} + +func writeColFloat4(x *Message, vals []float64, defs, reps []uint8) (int, int) { + def := defs[0] + switch def { + case 1: + x.ColFloat4 = pfloat64(vals[0]) + return 1, 1 + } + + return 0, 1 +} + +func readColFloat32_0(x Message) ([]float32, []uint8, []uint8) { + switch { + case x.ColFloat32_0 == nil: + return nil, []uint8{0}, nil + default: + return []float32{*x.ColFloat32_0}, []uint8{1}, nil + } +} + +func writeColFloat32_0(x *Message, vals []float32, defs, reps []uint8) (int, int) { + def := defs[0] + switch def { + case 1: + x.ColFloat32_0 = pfloat32(vals[0]) + return 1, 1 + } + + return 0, 1 +} + +func readColFloat32_1(x Message) float32 { + return x.ColFloat32_1 +} + +func writeColFloat32_1(x *Message, vals []float32) { + x.ColFloat32_1 = vals[0] +} + +func readColFloat32_2(x Message) ([]float32, []uint8, []uint8) { + switch { + case x.ColFloat32_2 == nil: + return nil, []uint8{0}, nil + default: + return []float32{*x.ColFloat32_2}, []uint8{1}, nil + } +} + +func writeColFloat32_2(x 
*Message, vals []float32, defs, reps []uint8) (int, int) { + def := defs[0] + switch def { + case 1: + x.ColFloat32_2 = pfloat32(vals[0]) + return 1, 1 + } + + return 0, 1 +} + +func readColFloat32_3(x Message) float32 { + return x.ColFloat32_3 +} + +func writeColFloat32_3(x *Message, vals []float32) { + x.ColFloat32_3 = vals[0] +} + +func readColFloat32_4(x Message) ([]float32, []uint8, []uint8) { + switch { + case x.ColFloat32_4 == nil: + return nil, []uint8{0}, nil + default: + return []float32{*x.ColFloat32_4}, []uint8{1}, nil + } +} + +func writeColFloat32_4(x *Message, vals []float32, defs, reps []uint8) (int, int) { + def := defs[0] + switch def { + case 1: + x.ColFloat32_4 = pfloat32(vals[0]) + return 1, 1 + } + + return 0, 1 +} + +func readColBool0(x Message) ([]bool, []uint8, []uint8) { + switch { + case x.ColBool0 == nil: + return nil, []uint8{0}, nil + default: + return []bool{*x.ColBool0}, []uint8{1}, nil + } +} + +func writeColBool0(x *Message, vals []bool, defs, reps []uint8) (int, int) { + def := defs[0] + switch def { + case 1: + x.ColBool0 = pbool(vals[0]) + return 1, 1 + } + + return 0, 1 +} + +func readColBool1(x Message) bool { + return x.ColBool1 +} + +func writeColBool1(x *Message, vals []bool) { + x.ColBool1 = vals[0] +} + +func readColBool2(x Message) ([]bool, []uint8, []uint8) { + switch { + case x.ColBool2 == nil: + return nil, []uint8{0}, nil + default: + return []bool{*x.ColBool2}, []uint8{1}, nil + } +} + +func writeColBool2(x *Message, vals []bool, defs, reps []uint8) (int, int) { + def := defs[0] + switch def { + case 1: + x.ColBool2 = pbool(vals[0]) + return 1, 1 + } + + return 0, 1 +} + +func readColBool3(x Message) bool { + return x.ColBool3 +} + +func writeColBool3(x *Message, vals []bool) { + x.ColBool3 = vals[0] +} + +func readColBool4(x Message) ([]bool, []uint8, []uint8) { + switch { + case x.ColBool4 == nil: + return nil, []uint8{0}, nil + default: + return []bool{*x.ColBool4}, []uint8{1}, nil + } +} + +func writeColBool4(x 
*Message, vals []bool, defs, reps []uint8) (int, int) { + def := defs[0] + switch def { + case 1: + x.ColBool4 = pbool(vals[0]) + return 1, 1 + } + + return 0, 1 +} + +func readColBool5(x Message) bool { + return x.ColBool5 +} + +func writeColBool5(x *Message, vals []bool) { + x.ColBool5 = vals[0] +} + +func readColBool6(x Message) ([]bool, []uint8, []uint8) { + switch { + case x.ColBool6 == nil: + return nil, []uint8{0}, nil + default: + return []bool{*x.ColBool6}, []uint8{1}, nil + } +} + +func writeColBool6(x *Message, vals []bool, defs, reps []uint8) (int, int) { + def := defs[0] + switch def { + case 1: + x.ColBool6 = pbool(vals[0]) + return 1, 1 + } + + return 0, 1 +} + +func readColBool7(x Message) bool { + return x.ColBool7 +} + +func writeColBool7(x *Message, vals []bool) { + x.ColBool7 = vals[0] +} + +func readColBool8(x Message) ([]bool, []uint8, []uint8) { + switch { + case x.ColBool8 == nil: + return nil, []uint8{0}, nil + default: + return []bool{*x.ColBool8}, []uint8{1}, nil + } +} + +func writeColBool8(x *Message, vals []bool, defs, reps []uint8) (int, int) { + def := defs[0] + switch def { + case 1: + x.ColBool8 = pbool(vals[0]) + return 1, 1 + } + + return 0, 1 +} + +func readColBool9(x Message) bool { + return x.ColBool9 +} + +func writeColBool9(x *Message, vals []bool) { + x.ColBool9 = vals[0] +} + +func fieldCompression(c compression) func(*parquet.RequiredField) { + switch c { + case compressionUncompressed: + return parquet.RequiredFieldUncompressed + case compressionSnappy: + return parquet.RequiredFieldSnappy + case compressionGzip: + return parquet.RequiredFieldGzip + default: + return parquet.RequiredFieldUncompressed + } +} + +func optionalFieldCompression(c compression) func(*parquet.OptionalField) { + switch c { + case compressionUncompressed: + return parquet.OptionalFieldUncompressed + case compressionSnappy: + return parquet.OptionalFieldSnappy + case compressionGzip: + return parquet.OptionalFieldGzip + default: + return 
parquet.OptionalFieldUncompressed + } +} + +func NewParquetWriter(w io.Writer, opts ...func(*ParquetWriter) error) (*ParquetWriter, error) { + return newParquetWriter(w, append(opts, begin)...) +} + +func newParquetWriter(w io.Writer, opts ...func(*ParquetWriter) error) (*ParquetWriter, error) { + p := &ParquetWriter{ + max: 1000, + w: w, + compression: compressionSnappy, + } + + for _, opt := range opts { + if err := opt(p); err != nil { + return nil, err + } + } + + p.fields = Fields(p.compression) + if p.meta == nil { + ff := Fields(p.compression) + schema := make([]parquet.Field, len(ff)) + for i, f := range ff { + schema[i] = f.Schema() + } + p.meta = parquet.New(schema...) + } + + return p, nil +} + +// MaxPageSize is the maximum number of rows in each row groups' page. +func MaxPageSize(m int) func(*ParquetWriter) error { + return func(p *ParquetWriter) error { + p.max = m + return nil + } +} + +func begin(p *ParquetWriter) error { + _, err := p.w.Write([]byte("PAR1")) + return err +} + +func withMeta(m *parquet.Metadata) func(*ParquetWriter) error { + return func(p *ParquetWriter) error { + p.meta = m + return nil + } +} + +func Uncompressed(p *ParquetWriter) error { + p.compression = compressionUncompressed + return nil +} + +func Snappy(p *ParquetWriter) error { + p.compression = compressionSnappy + return nil +} + +func Gzip(p *ParquetWriter) error { + p.compression = compressionGzip + return nil +} + +func withCompression(c compression) func(*ParquetWriter) error { + return func(p *ParquetWriter) error { + p.compression = c + return nil + } +} + +func (p *ParquetWriter) Write() error { + for i, f := range p.fields { + if err := f.Write(p.w, p.meta); err != nil { + return err + } + + for child := p.child; child != nil; child = child.child { + if err := child.fields[i].Write(p.w, p.meta); err != nil { + return err + } + } + } + + p.fields = Fields(p.compression) + p.child = nil + p.len = 0 + + schema := make([]parquet.Field, len(p.fields)) + for i, f := 
range p.fields { + schema[i] = f.Schema() + } + p.meta.StartRowGroup(schema...) + return nil +} + +func (p *ParquetWriter) Close() error { + if err := p.meta.Footer(p.w); err != nil { + return err + } + + _, err := p.w.Write([]byte("PAR1")) + return err +} + +func (p *ParquetWriter) Add(rec Message) { + if p.len == p.max { + if p.child == nil { + // an error can't happen here + p.child, _ = newParquetWriter(p.w, MaxPageSize(p.max), withMeta(p.meta), withCompression(p.compression)) + } + + p.child.Add(rec) + return + } + + p.meta.NextDoc() + for _, f := range p.fields { + f.Add(rec) + } + + p.len++ +} + +type Field interface { + Add(r Message) + Write(w io.Writer, meta *parquet.Metadata) error + Schema() parquet.Field + Scan(r *Message) + Read(r io.ReadSeeker, pg parquet.Page) error + Name() string + Levels() ([]uint8, []uint8) +} + +func getFields(ff []Field) map[string]Field { + m := make(map[string]Field, len(ff)) + for _, f := range ff { + m[f.Name()] = f + } + return m +} + +func NewParquetReader(r io.ReadSeeker, opts ...func(*ParquetReader)) (*ParquetReader, error) { + ff := Fields(compressionUnknown) + pr := &ParquetReader{ + r: r, + } + + for _, opt := range opts { + opt(pr) + } + + schema := make([]parquet.Field, len(ff)) + for i, f := range ff { + pr.fieldNames = append(pr.fieldNames, f.Name()) + schema[i] = f.Schema() + } + + meta := parquet.New(schema...) + if err := meta.ReadFooter(r); err != nil { + return nil, err + } + pr.rows = meta.Rows() + var err error + pr.pages, err = meta.Pages() + if err != nil { + return nil, err + } + + pr.rowGroups = meta.RowGroups() + _, err = r.Seek(4, io.SeekStart) + if err != nil { + return nil, err + } + pr.meta = meta + + return pr, pr.readRowGroup() +} + +func readerIndex(i int) func(*ParquetReader) { + return func(p *ParquetReader) { + p.index = i + } +} + +// ParquetReader reads one page from a row group. 
+type ParquetReader struct { + fields map[string]Field + fieldNames []string + index int + cursor int64 + rows int64 + rowGroupCursor int64 + rowGroupCount int64 + pages map[string][]parquet.Page + meta *parquet.Metadata + err error + + r io.ReadSeeker + rowGroups []parquet.RowGroup +} + +type Levels struct { + Name string + Defs []uint8 + Reps []uint8 +} + +func (p *ParquetReader) Levels() []Levels { + var out []Levels + //for { + for _, name := range p.fieldNames { + f := p.fields[name] + d, r := f.Levels() + out = append(out, Levels{Name: f.Name(), Defs: d, Reps: r}) + } + // if err := p.readRowGroup(); err != nil { + // break + // } + //} + return out +} + +func (p *ParquetReader) Error() error { + return p.err +} + +func (p *ParquetReader) readRowGroup() error { + p.rowGroupCursor = 0 + + if len(p.rowGroups) == 0 { + p.rowGroupCount = 0 + return nil + } + + rg := p.rowGroups[0] + p.fields = getFields(Fields(compressionUnknown)) + p.rowGroupCount = rg.Rows + p.rowGroupCursor = 0 + for _, col := range rg.Columns() { + name := strings.Join(col.MetaData.PathInSchema, ".") + f, ok := p.fields[name] + if !ok { + return fmt.Errorf("unknown field: %s", name) + } + pages := p.pages[name] + if len(pages) <= p.index { + break + } + + pg := pages[0] + if err := f.Read(p.r, pg); err != nil { + return fmt.Errorf("unable to read field %s, err: %s", f.Name(), err) + } + p.pages[name] = p.pages[name][1:] + } + p.rowGroups = p.rowGroups[1:] + return nil +} + +func (p *ParquetReader) Rows() int64 { + return p.rows +} + +func (p *ParquetReader) Next() bool { + if p.err == nil && p.cursor >= p.rows { + return false + } + if p.rowGroupCursor >= p.rowGroupCount { + p.err = p.readRowGroup() + if p.err != nil { + return false + } + } + + p.cursor++ + p.rowGroupCursor++ + return true +} + +func (p *ParquetReader) Scan(x *Message) { + if p.err != nil { + return + } + + for _, name := range p.fieldNames { + f := p.fields[name] + f.Scan(x) + } +} + +type StringOptionalField struct { + 
parquet.OptionalField + vals []string + read func(r Message) ([]string, []uint8, []uint8) + write func(r *Message, vals []string, def, rep []uint8) (int, int) + stats *stringOptionalStats +} + +func NewStringOptionalField(read func(r Message) ([]string, []uint8, []uint8), write func(r *Message, vals []string, defs, reps []uint8) (int, int), path []string, types []int, opts ...func(*parquet.OptionalField)) *StringOptionalField { + return &StringOptionalField{ + read: read, + write: write, + OptionalField: parquet.NewOptionalField(path, types, opts...), + stats: newStringOptionalStats(maxDef(types)), + } +} + +func (f *StringOptionalField) Schema() parquet.Field { + return parquet.Field{Name: f.Name(), Path: f.Path(), Type: StringType, RepetitionType: f.RepetitionType, Types: f.Types} +} + +func (f *StringOptionalField) Add(r Message) { + vals, defs, reps := f.read(r) + f.stats.add(vals, defs) + f.vals = append(f.vals, vals...) + f.Defs = append(f.Defs, defs...) + f.Reps = append(f.Reps, reps...) 
+} + +func (f *StringOptionalField) Scan(r *Message) { + if len(f.Defs) == 0 { + return + } + + v, l := f.write(r, f.vals, f.Defs, f.Reps) + f.vals = f.vals[v:] + f.Defs = f.Defs[l:] + if len(f.Reps) > 0 { + f.Reps = f.Reps[l:] + } +} + +func (f *StringOptionalField) Write(w io.Writer, meta *parquet.Metadata) error { + buf := bytes.Buffer{} + + for _, s := range f.vals { + if err := binary.Write(&buf, binary.LittleEndian, int32(len(s))); err != nil { + return err + } + buf.Write([]byte(s)) + } + + return f.DoWrite(w, meta, buf.Bytes(), len(f.Defs), f.stats) +} + +func (f *StringOptionalField) Read(r io.ReadSeeker, pg parquet.Page) error { + rr, _, err := f.DoRead(r, pg) + if err != nil { + return err + } + + for j := 0; j < f.Values(); j++ { + var x int32 + if err := binary.Read(rr, binary.LittleEndian, &x); err != nil { + return err + } + s := make([]byte, x) + if _, err := rr.Read(s); err != nil { + return err + } + + f.vals = append(f.vals, string(s)) + } + return nil +} + +func (f *StringOptionalField) Levels() ([]uint8, []uint8) { + return f.Defs, f.Reps +} + +type StringField struct { + parquet.RequiredField + vals []string + read func(r Message) string + write func(r *Message, vals []string) + stats *stringStats +} + +func NewStringField(read func(r Message) string, write func(r *Message, vals []string), path []string, opts ...func(*parquet.RequiredField)) *StringField { + return &StringField{ + read: read, + write: write, + RequiredField: parquet.NewRequiredField(path, opts...), + stats: newStringStats(), + } +} + +func (f *StringField) Schema() parquet.Field { + return parquet.Field{Name: f.Name(), Path: f.Path(), Type: StringType, RepetitionType: parquet.RepetitionRequired, Types: []int{0}} +} + +func (f *StringField) Write(w io.Writer, meta *parquet.Metadata) error { + buf := bytes.Buffer{} + + for _, s := range f.vals { + if err := binary.Write(&buf, binary.LittleEndian, int32(len(s))); err != nil { + return err + } + buf.Write([]byte(s)) + } + + return 
f.DoWrite(w, meta, buf.Bytes(), len(f.vals), f.stats) +} + +func (f *StringField) Read(r io.ReadSeeker, pg parquet.Page) error { + rr, _, err := f.DoRead(r, pg) + if err != nil { + return err + } + + for j := 0; j < pg.N; j++ { + var x int32 + if err := binary.Read(rr, binary.LittleEndian, &x); err != nil { + return err + } + s := make([]byte, x) + if _, err := rr.Read(s); err != nil { + return err + } + + f.vals = append(f.vals, string(s)) + } + return nil +} + +func (f *StringField) Scan(r *Message) { + if len(f.vals) == 0 { + return + } + + f.write(r, f.vals) + f.vals = f.vals[1:] +} + +func (f *StringField) Add(r Message) { + v := f.read(r) + f.stats.add(v) + f.vals = append(f.vals, v) +} + +func (f *StringField) Levels() ([]uint8, []uint8) { + return nil, nil +} + +type Int64OptionalField struct { + parquet.OptionalField + vals []int64 + read func(r Message) ([]int64, []uint8, []uint8) + write func(r *Message, vals []int64, def, rep []uint8) (int, int) + stats *int64optionalStats +} + +func NewInt64OptionalField(read func(r Message) ([]int64, []uint8, []uint8), write func(r *Message, vals []int64, defs, reps []uint8) (int, int), path []string, types []int, opts ...func(*parquet.OptionalField)) *Int64OptionalField { + return &Int64OptionalField{ + read: read, + write: write, + OptionalField: parquet.NewOptionalField(path, types, opts...), + stats: newint64optionalStats(maxDef(types)), + } +} + +func (f *Int64OptionalField) Schema() parquet.Field { + return parquet.Field{Name: f.Name(), Path: f.Path(), Type: Int64Type, RepetitionType: f.RepetitionType, Types: f.Types} +} + +func (f *Int64OptionalField) Write(w io.Writer, meta *parquet.Metadata) error { + var buf bytes.Buffer + for _, v := range f.vals { + if err := binary.Write(&buf, binary.LittleEndian, v); err != nil { + return err + } + } + return f.DoWrite(w, meta, buf.Bytes(), len(f.Defs), f.stats) +} + +func (f *Int64OptionalField) Read(r io.ReadSeeker, pg parquet.Page) error { + rr, _, err := f.DoRead(r, 
pg) + if err != nil { + return err + } + + v := make([]int64, f.Values()-len(f.vals)) + err = binary.Read(rr, binary.LittleEndian, &v) + f.vals = append(f.vals, v...) + return err +} + +func (f *Int64OptionalField) Add(r Message) { + vals, defs, reps := f.read(r) + f.stats.add(vals, defs) + f.vals = append(f.vals, vals...) + f.Defs = append(f.Defs, defs...) + f.Reps = append(f.Reps, reps...) +} + +func (f *Int64OptionalField) Scan(r *Message) { + if len(f.Defs) == 0 { + return + } + + v, l := f.write(r, f.vals, f.Defs, f.Reps) + f.vals = f.vals[v:] + f.Defs = f.Defs[l:] + if len(f.Reps) > 0 { + f.Reps = f.Reps[l:] + } +} + +func (f *Int64OptionalField) Levels() ([]uint8, []uint8) { + return f.Defs, f.Reps +} + +type Int64Field struct { + vals []int64 + parquet.RequiredField + read func(r Message) int64 + write func(r *Message, vals []int64) + stats *int64stats +} + +func NewInt64Field(read func(r Message) int64, write func(r *Message, vals []int64), path []string, opts ...func(*parquet.RequiredField)) *Int64Field { + return &Int64Field{ + read: read, + write: write, + RequiredField: parquet.NewRequiredField(path, opts...), + stats: newInt64stats(), + } +} + +func (f *Int64Field) Schema() parquet.Field { + return parquet.Field{Name: f.Name(), Path: f.Path(), Type: Int64Type, RepetitionType: parquet.RepetitionRequired, Types: []int{0}} +} + +func (f *Int64Field) Read(r io.ReadSeeker, pg parquet.Page) error { + rr, _, err := f.DoRead(r, pg) + if err != nil { + return err + } + + v := make([]int64, int(pg.N)) + err = binary.Read(rr, binary.LittleEndian, &v) + f.vals = append(f.vals, v...) 
+ return err +} + +func (f *Int64Field) Write(w io.Writer, meta *parquet.Metadata) error { + var buf bytes.Buffer + for _, v := range f.vals { + if err := binary.Write(&buf, binary.LittleEndian, v); err != nil { + return err + } + } + return f.DoWrite(w, meta, buf.Bytes(), len(f.vals), f.stats) +} + +func (f *Int64Field) Scan(r *Message) { + if len(f.vals) == 0 { + return + } + + f.write(r, f.vals) + f.vals = f.vals[1:] +} + +func (f *Int64Field) Add(r Message) { + v := f.read(r) + f.stats.add(v) + f.vals = append(f.vals, v) +} + +func (f *Int64Field) Levels() ([]uint8, []uint8) { + return nil, nil +} + +type Int32OptionalField struct { + parquet.OptionalField + vals []int32 + read func(r Message) ([]int32, []uint8, []uint8) + write func(r *Message, vals []int32, def, rep []uint8) (int, int) + stats *int32optionalStats +} + +func NewInt32OptionalField(read func(r Message) ([]int32, []uint8, []uint8), write func(r *Message, vals []int32, defs, reps []uint8) (int, int), path []string, types []int, opts ...func(*parquet.OptionalField)) *Int32OptionalField { + return &Int32OptionalField{ + read: read, + write: write, + OptionalField: parquet.NewOptionalField(path, types, opts...), + stats: newint32optionalStats(maxDef(types)), + } +} + +func (f *Int32OptionalField) Schema() parquet.Field { + return parquet.Field{Name: f.Name(), Path: f.Path(), Type: Int32Type, RepetitionType: f.RepetitionType, Types: f.Types} +} + +func (f *Int32OptionalField) Write(w io.Writer, meta *parquet.Metadata) error { + var buf bytes.Buffer + for _, v := range f.vals { + if err := binary.Write(&buf, binary.LittleEndian, v); err != nil { + return err + } + } + return f.DoWrite(w, meta, buf.Bytes(), len(f.Defs), f.stats) +} + +func (f *Int32OptionalField) Read(r io.ReadSeeker, pg parquet.Page) error { + rr, _, err := f.DoRead(r, pg) + if err != nil { + return err + } + + v := make([]int32, f.Values()-len(f.vals)) + err = binary.Read(rr, binary.LittleEndian, &v) + f.vals = append(f.vals, v...) 
+ return err +} + +func (f *Int32OptionalField) Add(r Message) { + vals, defs, reps := f.read(r) + f.stats.add(vals, defs) + f.vals = append(f.vals, vals...) + f.Defs = append(f.Defs, defs...) + f.Reps = append(f.Reps, reps...) +} + +func (f *Int32OptionalField) Scan(r *Message) { + if len(f.Defs) == 0 { + return + } + + v, l := f.write(r, f.vals, f.Defs, f.Reps) + f.vals = f.vals[v:] + f.Defs = f.Defs[l:] + if len(f.Reps) > 0 { + f.Reps = f.Reps[l:] + } +} + +func (f *Int32OptionalField) Levels() ([]uint8, []uint8) { + return f.Defs, f.Reps +} + +type Int32Field struct { + vals []int32 + parquet.RequiredField + read func(r Message) int32 + write func(r *Message, vals []int32) + stats *int32stats +} + +func NewInt32Field(read func(r Message) int32, write func(r *Message, vals []int32), path []string, opts ...func(*parquet.RequiredField)) *Int32Field { + return &Int32Field{ + read: read, + write: write, + RequiredField: parquet.NewRequiredField(path, opts...), + stats: newInt32stats(), + } +} + +func (f *Int32Field) Schema() parquet.Field { + return parquet.Field{Name: f.Name(), Path: f.Path(), Type: Int32Type, RepetitionType: parquet.RepetitionRequired, Types: []int{0}} +} + +func (f *Int32Field) Read(r io.ReadSeeker, pg parquet.Page) error { + rr, _, err := f.DoRead(r, pg) + if err != nil { + return err + } + + v := make([]int32, int(pg.N)) + err = binary.Read(rr, binary.LittleEndian, &v) + f.vals = append(f.vals, v...) 
+ return err +} + +func (f *Int32Field) Write(w io.Writer, meta *parquet.Metadata) error { + var buf bytes.Buffer + for _, v := range f.vals { + if err := binary.Write(&buf, binary.LittleEndian, v); err != nil { + return err + } + } + return f.DoWrite(w, meta, buf.Bytes(), len(f.vals), f.stats) +} + +func (f *Int32Field) Scan(r *Message) { + if len(f.vals) == 0 { + return + } + + f.write(r, f.vals) + f.vals = f.vals[1:] +} + +func (f *Int32Field) Add(r Message) { + v := f.read(r) + f.stats.add(v) + f.vals = append(f.vals, v) +} + +func (f *Int32Field) Levels() ([]uint8, []uint8) { + return nil, nil +} + +type Float64OptionalField struct { + parquet.OptionalField + vals []float64 + read func(r Message) ([]float64, []uint8, []uint8) + write func(r *Message, vals []float64, def, rep []uint8) (int, int) + stats *float64optionalStats +} + +func NewFloat64OptionalField(read func(r Message) ([]float64, []uint8, []uint8), write func(r *Message, vals []float64, defs, reps []uint8) (int, int), path []string, types []int, opts ...func(*parquet.OptionalField)) *Float64OptionalField { + return &Float64OptionalField{ + read: read, + write: write, + OptionalField: parquet.NewOptionalField(path, types, opts...), + stats: newfloat64optionalStats(maxDef(types)), + } +} + +func (f *Float64OptionalField) Schema() parquet.Field { + return parquet.Field{Name: f.Name(), Path: f.Path(), Type: Float64Type, RepetitionType: f.RepetitionType, Types: f.Types} +} + +func (f *Float64OptionalField) Write(w io.Writer, meta *parquet.Metadata) error { + var buf bytes.Buffer + for _, v := range f.vals { + if err := binary.Write(&buf, binary.LittleEndian, v); err != nil { + return err + } + } + return f.DoWrite(w, meta, buf.Bytes(), len(f.Defs), f.stats) +} + +func (f *Float64OptionalField) Read(r io.ReadSeeker, pg parquet.Page) error { + rr, _, err := f.DoRead(r, pg) + if err != nil { + return err + } + + v := make([]float64, f.Values()-len(f.vals)) + err = binary.Read(rr, binary.LittleEndian, &v) + 
f.vals = append(f.vals, v...) + return err +} + +func (f *Float64OptionalField) Add(r Message) { + vals, defs, reps := f.read(r) + f.stats.add(vals, defs) + f.vals = append(f.vals, vals...) + f.Defs = append(f.Defs, defs...) + f.Reps = append(f.Reps, reps...) +} + +func (f *Float64OptionalField) Scan(r *Message) { + if len(f.Defs) == 0 { + return + } + + v, l := f.write(r, f.vals, f.Defs, f.Reps) + f.vals = f.vals[v:] + f.Defs = f.Defs[l:] + if len(f.Reps) > 0 { + f.Reps = f.Reps[l:] + } +} + +func (f *Float64OptionalField) Levels() ([]uint8, []uint8) { + return f.Defs, f.Reps +} + +type Float64Field struct { + vals []float64 + parquet.RequiredField + read func(r Message) float64 + write func(r *Message, vals []float64) + stats *float64stats +} + +func NewFloat64Field(read func(r Message) float64, write func(r *Message, vals []float64), path []string, opts ...func(*parquet.RequiredField)) *Float64Field { + return &Float64Field{ + read: read, + write: write, + RequiredField: parquet.NewRequiredField(path, opts...), + stats: newFloat64stats(), + } +} + +func (f *Float64Field) Schema() parquet.Field { + return parquet.Field{Name: f.Name(), Path: f.Path(), Type: Float64Type, RepetitionType: parquet.RepetitionRequired, Types: []int{0}} +} + +func (f *Float64Field) Read(r io.ReadSeeker, pg parquet.Page) error { + rr, _, err := f.DoRead(r, pg) + if err != nil { + return err + } + + v := make([]float64, int(pg.N)) + err = binary.Read(rr, binary.LittleEndian, &v) + f.vals = append(f.vals, v...) 
+ return err +} + +func (f *Float64Field) Write(w io.Writer, meta *parquet.Metadata) error { + var buf bytes.Buffer + for _, v := range f.vals { + if err := binary.Write(&buf, binary.LittleEndian, v); err != nil { + return err + } + } + return f.DoWrite(w, meta, buf.Bytes(), len(f.vals), f.stats) +} + +func (f *Float64Field) Scan(r *Message) { + if len(f.vals) == 0 { + return + } + + f.write(r, f.vals) + f.vals = f.vals[1:] +} + +func (f *Float64Field) Add(r Message) { + v := f.read(r) + f.stats.add(v) + f.vals = append(f.vals, v) +} + +func (f *Float64Field) Levels() ([]uint8, []uint8) { + return nil, nil +} + +type Float32OptionalField struct { + parquet.OptionalField + vals []float32 + read func(r Message) ([]float32, []uint8, []uint8) + write func(r *Message, vals []float32, def, rep []uint8) (int, int) + stats *float32optionalStats +} + +func NewFloat32OptionalField(read func(r Message) ([]float32, []uint8, []uint8), write func(r *Message, vals []float32, defs, reps []uint8) (int, int), path []string, types []int, opts ...func(*parquet.OptionalField)) *Float32OptionalField { + return &Float32OptionalField{ + read: read, + write: write, + OptionalField: parquet.NewOptionalField(path, types, opts...), + stats: newfloat32optionalStats(maxDef(types)), + } +} + +func (f *Float32OptionalField) Schema() parquet.Field { + return parquet.Field{Name: f.Name(), Path: f.Path(), Type: Float32Type, RepetitionType: f.RepetitionType, Types: f.Types} +} + +func (f *Float32OptionalField) Write(w io.Writer, meta *parquet.Metadata) error { + var buf bytes.Buffer + for _, v := range f.vals { + if err := binary.Write(&buf, binary.LittleEndian, v); err != nil { + return err + } + } + return f.DoWrite(w, meta, buf.Bytes(), len(f.Defs), f.stats) +} + +func (f *Float32OptionalField) Read(r io.ReadSeeker, pg parquet.Page) error { + rr, _, err := f.DoRead(r, pg) + if err != nil { + return err + } + + v := make([]float32, f.Values()-len(f.vals)) + err = binary.Read(rr, 
binary.LittleEndian, &v) + f.vals = append(f.vals, v...) + return err +} + +func (f *Float32OptionalField) Add(r Message) { + vals, defs, reps := f.read(r) + f.stats.add(vals, defs) + f.vals = append(f.vals, vals...) + f.Defs = append(f.Defs, defs...) + f.Reps = append(f.Reps, reps...) +} + +func (f *Float32OptionalField) Scan(r *Message) { + if len(f.Defs) == 0 { + return + } + + v, l := f.write(r, f.vals, f.Defs, f.Reps) + f.vals = f.vals[v:] + f.Defs = f.Defs[l:] + if len(f.Reps) > 0 { + f.Reps = f.Reps[l:] + } +} + +func (f *Float32OptionalField) Levels() ([]uint8, []uint8) { + return f.Defs, f.Reps +} + +type Float32Field struct { + vals []float32 + parquet.RequiredField + read func(r Message) float32 + write func(r *Message, vals []float32) + stats *float32stats +} + +func NewFloat32Field(read func(r Message) float32, write func(r *Message, vals []float32), path []string, opts ...func(*parquet.RequiredField)) *Float32Field { + return &Float32Field{ + read: read, + write: write, + RequiredField: parquet.NewRequiredField(path, opts...), + stats: newFloat32stats(), + } +} + +func (f *Float32Field) Schema() parquet.Field { + return parquet.Field{Name: f.Name(), Path: f.Path(), Type: Float32Type, RepetitionType: parquet.RepetitionRequired, Types: []int{0}} +} + +func (f *Float32Field) Read(r io.ReadSeeker, pg parquet.Page) error { + rr, _, err := f.DoRead(r, pg) + if err != nil { + return err + } + + v := make([]float32, int(pg.N)) + err = binary.Read(rr, binary.LittleEndian, &v) + f.vals = append(f.vals, v...) 
+ return err +} + +func (f *Float32Field) Write(w io.Writer, meta *parquet.Metadata) error { + var buf bytes.Buffer + for _, v := range f.vals { + if err := binary.Write(&buf, binary.LittleEndian, v); err != nil { + return err + } + } + return f.DoWrite(w, meta, buf.Bytes(), len(f.vals), f.stats) +} + +func (f *Float32Field) Scan(r *Message) { + if len(f.vals) == 0 { + return + } + + f.write(r, f.vals) + f.vals = f.vals[1:] +} + +func (f *Float32Field) Add(r Message) { + v := f.read(r) + f.stats.add(v) + f.vals = append(f.vals, v) +} + +func (f *Float32Field) Levels() ([]uint8, []uint8) { + return nil, nil +} + +type BoolOptionalField struct { + parquet.OptionalField + vals []bool + read func(r Message) ([]bool, []uint8, []uint8) + write func(r *Message, vals []bool, defs, reps []uint8) (int, int) + stats *boolOptionalStats +} + +func NewBoolOptionalField(read func(r Message) ([]bool, []uint8, []uint8), write func(r *Message, vals []bool, defs, reps []uint8) (int, int), path []string, types []int, opts ...func(*parquet.OptionalField)) *BoolOptionalField { + return &BoolOptionalField{ + read: read, + write: write, + OptionalField: parquet.NewOptionalField(path, types, opts...), + stats: newBoolOptionalStats(maxDef(types)), + } +} + +func (f *BoolOptionalField) Schema() parquet.Field { + return parquet.Field{Name: f.Name(), Path: f.Path(), Type: BoolType, RepetitionType: f.RepetitionType, Types: f.Types} +} + +func (f *BoolOptionalField) Read(r io.ReadSeeker, pg parquet.Page) error { + rr, sizes, err := f.DoRead(r, pg) + if err != nil { + return err + } + + v, err := parquet.GetBools(rr, f.Values()-len(f.vals), sizes) + f.vals = append(f.vals, v...) 
+ return err +} + +func (f *BoolOptionalField) Scan(r *Message) { + if len(f.Defs) == 0 { + return + } + + v, l := f.write(r, f.vals, f.Defs, f.Reps) + f.vals = f.vals[v:] + f.Defs = f.Defs[l:] + if len(f.Reps) > 0 { + f.Reps = f.Reps[l:] + } +} + +func (f *BoolOptionalField) Add(r Message) { + vals, defs, reps := f.read(r) + f.stats.add(vals, defs) + f.vals = append(f.vals, vals...) + f.Defs = append(f.Defs, defs...) + f.Reps = append(f.Reps, reps...) +} + +func (f *BoolOptionalField) Write(w io.Writer, meta *parquet.Metadata) error { + ln := len(f.vals) + byteNum := (ln + 7) / 8 + rawBuf := make([]byte, byteNum) + + for i := 0; i < ln; i++ { + if f.vals[i] { + rawBuf[i/8] = rawBuf[i/8] | (1 << uint32(i%8)) + } + } + + return f.DoWrite(w, meta, rawBuf, len(f.Defs), f.stats) +} + +func (f *BoolOptionalField) Levels() ([]uint8, []uint8) { + return f.Defs, f.Reps +} + +type BoolField struct { + parquet.RequiredField + vals []bool + read func(r Message) bool + write func(r *Message, vals []bool) + stats *boolStats +} + +func NewBoolField(read func(r Message) bool, write func(r *Message, vals []bool), path []string, opts ...func(*parquet.RequiredField)) *BoolField { + return &BoolField{ + read: read, + write: write, + RequiredField: parquet.NewRequiredField(path, opts...), + } +} + +func (f *BoolField) Schema() parquet.Field { + return parquet.Field{Name: f.Name(), Path: f.Path(), Type: BoolType, RepetitionType: parquet.RepetitionRequired, Types: []int{0}} +} + +func (f *BoolField) Write(w io.Writer, meta *parquet.Metadata) error { + ln := len(f.vals) + n := (ln + 7) / 8 + rawBuf := make([]byte, n) + + for i := 0; i < ln; i++ { + if f.vals[i] { + rawBuf[i/8] = rawBuf[i/8] | (1 << uint32(i%8)) + } + } + + return f.DoWrite(w, meta, rawBuf, len(f.vals), newBoolStats()) +} + +func (f *BoolField) Read(r io.ReadSeeker, pg parquet.Page) error { + rr, sizes, err := f.DoRead(r, pg) + if err != nil { + return err + } + + f.vals, err = parquet.GetBools(rr, int(pg.N), sizes) + 
return err +} + +func (f *BoolField) Scan(r *Message) { + if len(f.vals) == 0 { + return + } + + f.write(r, f.vals) + f.vals = f.vals[1:] +} + +func (f *BoolField) Add(r Message) { + v := f.read(r) + f.vals = append(f.vals, v) +} + +func (f *BoolField) Levels() ([]uint8, []uint8) { + return nil, nil +} + +type stringOptionalStats struct { + vals []string + min []byte + max []byte + nils int64 + maxDef uint8 +} + +func newStringOptionalStats(d uint8) *stringOptionalStats { + return &stringOptionalStats{maxDef: d} +} + +func (s *stringOptionalStats) add(vals []string, defs []uint8) { + var i int + for _, def := range defs { + if def < s.maxDef { + s.nils++ + } else { + s.vals = append(s.vals, vals[i]) + i++ + } + } +} + +func (s *stringOptionalStats) NullCount() *int64 { + return &s.nils +} + +func (s *stringOptionalStats) DistinctCount() *int64 { + return nil +} + +func (s *stringOptionalStats) Min() []byte { + if s.min == nil { + s.minMax() + } + return s.min +} + +func (s *stringOptionalStats) Max() []byte { + if s.max == nil { + s.minMax() + } + return s.max +} + +func (s *stringOptionalStats) minMax() { + if len(s.vals) == 0 { + return + } + + tmp := make([]string, len(s.vals)) + copy(tmp, s.vals) + sort.Strings(tmp) + s.min = []byte(tmp[0]) + s.max = []byte(tmp[len(tmp)-1]) +} + +type stringStats struct { + vals []string + min []byte + max []byte +} + +func newStringStats() *stringStats { + return &stringStats{} +} + +func (s *stringStats) add(val string) { + s.vals = append(s.vals, val) +} + +func (s *stringStats) NullCount() *int64 { + return nil +} + +func (s *stringStats) DistinctCount() *int64 { + return nil +} + +func (s *stringStats) Min() []byte { + if s.min == nil { + s.minMax() + } + return s.min +} + +func (s *stringStats) Max() []byte { + if s.max == nil { + s.minMax() + } + return s.max +} + +func (s *stringStats) minMax() { + if len(s.vals) == 0 { + return + } + + tmp := make([]string, len(s.vals)) + copy(tmp, s.vals) + sort.Strings(tmp) + s.min = 
[]byte(tmp[0]) + s.max = []byte(tmp[len(tmp)-1]) +} + +type int64optionalStats struct { + min int64 + max int64 + nils int64 + nonNils int64 + maxDef uint8 +} + +func newint64optionalStats(d uint8) *int64optionalStats { + return &int64optionalStats{ + min: int64(math.MaxInt64), + maxDef: d, + } +} + +func (f *int64optionalStats) add(vals []int64, defs []uint8) { + var i int + for _, def := range defs { + if def < f.maxDef { + f.nils++ + } else { + val := vals[i] + i++ + + f.nonNils++ + if val < f.min { + f.min = val + } + if val > f.max { + f.max = val + } + } + } +} + +func (f *int64optionalStats) bytes(val int64) []byte { + var buf bytes.Buffer + binary.Write(&buf, binary.LittleEndian, val) + return buf.Bytes() +} + +func (f *int64optionalStats) NullCount() *int64 { + return &f.nils +} + +func (f *int64optionalStats) DistinctCount() *int64 { + return nil +} + +func (f *int64optionalStats) Min() []byte { + if f.nonNils == 0 { + return nil + } + return f.bytes(f.min) +} + +func (f *int64optionalStats) Max() []byte { + if f.nonNils == 0 { + return nil + } + return f.bytes(f.max) +} + +type int64stats struct { + min int64 + max int64 +} + +func newInt64stats() *int64stats { + return &int64stats{ + min: int64(math.MaxInt64), + } +} + +func (i *int64stats) add(val int64) { + if val < i.min { + i.min = val + } + if val > i.max { + i.max = val + } +} + +func (f *int64stats) bytes(val int64) []byte { + var buf bytes.Buffer + binary.Write(&buf, binary.LittleEndian, val) + return buf.Bytes() +} + +func (f *int64stats) NullCount() *int64 { + return nil +} + +func (f *int64stats) DistinctCount() *int64 { + return nil +} + +func (f *int64stats) Min() []byte { + return f.bytes(f.min) +} + +func (f *int64stats) Max() []byte { + return f.bytes(f.max) +} + +type int32optionalStats struct { + min int32 + max int32 + nils int64 + nonNils int64 + maxDef uint8 +} + +func newint32optionalStats(d uint8) *int32optionalStats { + return &int32optionalStats{ + min: int32(math.MaxInt32), + 
maxDef: d, + } +} + +func (f *int32optionalStats) add(vals []int32, defs []uint8) { + var i int + for _, def := range defs { + if def < f.maxDef { + f.nils++ + } else { + val := vals[i] + i++ + + f.nonNils++ + if val < f.min { + f.min = val + } + if val > f.max { + f.max = val + } + } + } +} + +func (f *int32optionalStats) bytes(val int32) []byte { + var buf bytes.Buffer + binary.Write(&buf, binary.LittleEndian, val) + return buf.Bytes() +} + +func (f *int32optionalStats) NullCount() *int64 { + return &f.nils +} + +func (f *int32optionalStats) DistinctCount() *int64 { + return nil +} + +func (f *int32optionalStats) Min() []byte { + if f.nonNils == 0 { + return nil + } + return f.bytes(f.min) +} + +func (f *int32optionalStats) Max() []byte { + if f.nonNils == 0 { + return nil + } + return f.bytes(f.max) +} + +type int32stats struct { + min int32 + max int32 +} + +func newInt32stats() *int32stats { + return &int32stats{ + min: int32(math.MaxInt32), + } +} + +func (i *int32stats) add(val int32) { + if val < i.min { + i.min = val + } + if val > i.max { + i.max = val + } +} + +func (f *int32stats) bytes(val int32) []byte { + var buf bytes.Buffer + binary.Write(&buf, binary.LittleEndian, val) + return buf.Bytes() +} + +func (f *int32stats) NullCount() *int64 { + return nil +} + +func (f *int32stats) DistinctCount() *int64 { + return nil +} + +func (f *int32stats) Min() []byte { + return f.bytes(f.min) +} + +func (f *int32stats) Max() []byte { + return f.bytes(f.max) +} + +type float64optionalStats struct { + min float64 + max float64 + nils int64 + nonNils int64 + maxDef uint8 +} + +func newfloat64optionalStats(d uint8) *float64optionalStats { + return &float64optionalStats{ + min: float64(math.MaxFloat64), + maxDef: d, + } +} + +func (f *float64optionalStats) add(vals []float64, defs []uint8) { + var i int + for _, def := range defs { + if def < f.maxDef { + f.nils++ + } else { + val := vals[i] + i++ + + f.nonNils++ + if val < f.min { + f.min = val + } + if val > f.max 
{ + f.max = val + } + } + } +} + +func (f *float64optionalStats) bytes(val float64) []byte { + var buf bytes.Buffer + binary.Write(&buf, binary.LittleEndian, val) + return buf.Bytes() +} + +func (f *float64optionalStats) NullCount() *int64 { + return &f.nils +} + +func (f *float64optionalStats) DistinctCount() *int64 { + return nil +} + +func (f *float64optionalStats) Min() []byte { + if f.nonNils == 0 { + return nil + } + return f.bytes(f.min) +} + +func (f *float64optionalStats) Max() []byte { + if f.nonNils == 0 { + return nil + } + return f.bytes(f.max) +} + +type float64stats struct { + min float64 + max float64 +} + +func newFloat64stats() *float64stats { + return &float64stats{ + min: float64(math.MaxFloat64), + } +} + +func (i *float64stats) add(val float64) { + if val < i.min { + i.min = val + } + if val > i.max { + i.max = val + } +} + +func (f *float64stats) bytes(val float64) []byte { + var buf bytes.Buffer + binary.Write(&buf, binary.LittleEndian, val) + return buf.Bytes() +} + +func (f *float64stats) NullCount() *int64 { + return nil +} + +func (f *float64stats) DistinctCount() *int64 { + return nil +} + +func (f *float64stats) Min() []byte { + return f.bytes(f.min) +} + +func (f *float64stats) Max() []byte { + return f.bytes(f.max) +} + +type float32optionalStats struct { + min float32 + max float32 + nils int64 + nonNils int64 + maxDef uint8 +} + +func newfloat32optionalStats(d uint8) *float32optionalStats { + return &float32optionalStats{ + min: float32(math.MaxFloat32), + maxDef: d, + } +} + +func (f *float32optionalStats) add(vals []float32, defs []uint8) { + var i int + for _, def := range defs { + if def < f.maxDef { + f.nils++ + } else { + val := vals[i] + i++ + + f.nonNils++ + if val < f.min { + f.min = val + } + if val > f.max { + f.max = val + } + } + } +} + +func (f *float32optionalStats) bytes(val float32) []byte { + var buf bytes.Buffer + binary.Write(&buf, binary.LittleEndian, val) + return buf.Bytes() +} + +func (f 
*float32optionalStats) NullCount() *int64 { + return &f.nils +} + +func (f *float32optionalStats) DistinctCount() *int64 { + return nil +} + +func (f *float32optionalStats) Min() []byte { + if f.nonNils == 0 { + return nil + } + return f.bytes(f.min) +} + +func (f *float32optionalStats) Max() []byte { + if f.nonNils == 0 { + return nil + } + return f.bytes(f.max) +} + +type float32stats struct { + min float32 + max float32 +} + +func newFloat32stats() *float32stats { + return &float32stats{ + min: float32(math.MaxFloat32), + } +} + +func (i *float32stats) add(val float32) { + if val < i.min { + i.min = val + } + if val > i.max { + i.max = val + } +} + +func (f *float32stats) bytes(val float32) []byte { + var buf bytes.Buffer + binary.Write(&buf, binary.LittleEndian, val) + return buf.Bytes() +} + +func (f *float32stats) NullCount() *int64 { + return nil +} + +func (f *float32stats) DistinctCount() *int64 { + return nil +} + +func (f *float32stats) Min() []byte { + return f.bytes(f.min) +} + +func (f *float32stats) Max() []byte { + return f.bytes(f.max) +} + +type boolOptionalStats struct { + maxDef uint8 + nils int64 +} + +func newBoolOptionalStats(d uint8) *boolOptionalStats { + return &boolOptionalStats{maxDef: d} +} + +func (b *boolOptionalStats) add(vals []bool, defs []uint8) { + for _, def := range defs { + if def < b.maxDef { + b.nils++ + } + } +} + +func (b *boolOptionalStats) NullCount() *int64 { + return &b.nils +} + +func (b *boolOptionalStats) DistinctCount() *int64 { + return nil +} + +func (b *boolOptionalStats) Min() []byte { + return nil +} + +func (b *boolOptionalStats) Max() []byte { + return nil +} + +type boolStats struct{} + +func newBoolStats() *boolStats { return &boolStats{} } +func (b *boolStats) NullCount() *int64 { return nil } +func (b *boolStats) DistinctCount() *int64 { return nil } +func (b *boolStats) Min() []byte { return nil } +func (b *boolStats) Max() []byte { return nil } + +func pint32(i int32) *int32 { return &i } +func 
puint32(i uint32) *uint32 { return &i } +func pint64(i int64) *int64 { return &i } +func puint64(i uint64) *uint64 { return &i } +func pbool(b bool) *bool { return &b } +func pstring(s string) *string { return &s } +func pfloat32(f float32) *float32 { return &f } +func pfloat64(f float64) *float64 { return &f } + +// keeps track of the indices of repeated fields +// that have already been handled by a previous field +type indices []int + +func (i indices) rep(rep uint8) { + if rep > 0 { + r := int(rep) - 1 + i[r] = i[r] + 1 + for j := int(rep); j < len(i); j++ { + i[j] = 0 + } + } +} + +func maxDef(types []int) uint8 { + var out uint8 + for _, typ := range types { + if typ > 0 { + out++ + } + } + return out +} + +func Int32Type(se *sch.SchemaElement) { + t := sch.Type_INT32 + se.Type = &t +} + +func Uint32Type(se *sch.SchemaElement) { + t := sch.Type_INT32 + se.Type = &t + ct := sch.ConvertedType_UINT_32 + se.ConvertedType = &ct +} + +func Int64Type(se *sch.SchemaElement) { + t := sch.Type_INT64 + se.Type = &t +} + +func Uint64Type(se *sch.SchemaElement) { + t := sch.Type_INT64 + se.Type = &t + ct := sch.ConvertedType_UINT_64 + se.ConvertedType = &ct +} + +func Float32Type(se *sch.SchemaElement) { + t := sch.Type_FLOAT + se.Type = &t +} + +func Float64Type(se *sch.SchemaElement) { + t := sch.Type_DOUBLE + se.Type = &t +} + +func BoolType(se *sch.SchemaElement) { + t := sch.Type_BOOLEAN + se.Type = &t +} + +func StringType(se *sch.SchemaElement) { + t := sch.Type_BYTE_ARRAY + se.Type = &t +} diff --git a/performance/message/message.go b/performance/message/message.go new file mode 100644 index 0000000..917023b --- /dev/null +++ b/performance/message/message.go @@ -0,0 +1,53 @@ +package message + +// base +// go run cmd/parquetgen/main.go -input performance/message/message.go -type Message -package base -output performance/base/parquet.go -import github.com/parsyl/parquet/performance/message +// optimized +// go run cmd/parquetgen/main.go -input 
performance/message/message.go -type Message -package performance -output performance/parquet.go -import github.com/parsyl/parquet/performance/message +type Message struct { + ColStr0 *string `parquet:"col_str_0" json:"col_str_0" faker:"word"` + ColStr1 string `parquet:"col_str_1" json:"col_str_1" faker:"oneof: aaaaa, "` // optionally empty + ColStr2 *string `parquet:"col_str_2" json:"col_str_2" faker:"paragraph"` + ColStr3 string `parquet:"col_str_3" json:"col_str_3" faker:"paragraph"` + ColStr4 *string `parquet:"col_str_4" json:"col_str_4" faker:"sentence"` + ColStr5 string `parquet:"col_str_5" json:"col_str_5" faker:"sentence"` + ColStr6 *string `parquet:"col_str_6" json:"col_str_6" faker:"sentence"` + ColStr7 string `parquet:"col_str_7" json:"col_str_7" faker:"word"` + ColStr8 *string `parquet:"col_str_8" json:"col_str_8" faker:"word"` + ColStr9 string `parquet:"col_str_9" json:"col_str_9" faker:"word"` + + ColInt0 *int64 `parquet:"col_int_0" json:"col_int_0" faker:"unix_time"` + ColInt1 int64 `parquet:"col_int_1" json:"col_int_1" faker:"oneof: 0, 1"` + ColInt2 *int64 `parquet:"col_int_2" json:"col_int_2" faker:"unix_time"` + ColInt3 int64 `parquet:"col_int_3" json:"col_int_3" faker:"unix_time"` + ColInt4 *int64 `parquet:"col_int_4" json:"col_int_4" faker:"unix_time"` + + ColInt32_0 *int32 `parquet:"col_int_32_0" json:"col_int_32_0"` + ColInt32_1 int32 `parquet:"col_int_32_1" json:"col_int_32_1" faker:"oneof: 0, 1"` + ColInt32_2 *int32 `parquet:"col_int_32_2" json:"col_int_32_2"` + ColInt32_3 int32 `parquet:"col_int_32_3" json:"col_int_32_3"` + ColInt32_4 *int32 `parquet:"col_int_32_4" json:"col_int_32_4"` + + ColFloat0 *float64 `parquet:"col_float_0" json:"col_float_0"` + ColFloat1 float64 `parquet:"col_float_1" json:"col_float_1"` + ColFloat2 *float64 `parquet:"col_float_2" json:"col_float_2"` + ColFloat3 float64 `parquet:"col_float_3" json:"col_float_3"` + ColFloat4 *float64 `parquet:"col_float_4" json:"col_float_4"` + + ColFloat32_0 *float32 
`parquet:"col_float_32_0" json:"col_float_32_0"` + ColFloat32_1 float32 `parquet:"col_float_32_1" json:"col_float_32_1" faker:"oneof: 0.0, 1.1"` + ColFloat32_2 *float32 `parquet:"col_float_32_2" json:"col_float_32_2"` + ColFloat32_3 float32 `parquet:"col_float_32_3" json:"col_float_32_3"` + ColFloat32_4 *float32 `parquet:"col_float_32_4" json:"col_float_32_4"` + + ColBool0 *bool `parquet:"col_bool_0" json:"col_bool_0"` + ColBool1 bool `parquet:"col_bool_1" json:"col_bool_1"` + ColBool2 *bool `parquet:"col_bool_2" json:"col_bool_2"` + ColBool3 bool `parquet:"col_bool_3" json:"col_bool_3"` + ColBool4 *bool `parquet:"col_bool_4" json:"col_bool_4"` + ColBool5 bool `parquet:"col_bool_5" json:"col_bool_5"` + ColBool6 *bool `parquet:"col_bool_6" json:"col_bool_6"` + ColBool7 bool `parquet:"col_bool_7" json:"col_bool_7"` + ColBool8 *bool `parquet:"col_bool_8" json:"col_bool_8"` + ColBool9 bool `parquet:"col_bool_9" json:"col_bool_9"` +} diff --git a/performance/parquet.go b/performance/parquet.go new file mode 100644 index 0000000..f4c4a48 --- /dev/null +++ b/performance/parquet.go @@ -0,0 +1,2493 @@ +package performance + +// Code generated by github.com/parsyl/parquet. DO NOT EDIT. + +import ( + "encoding/binary" + "fmt" + "io" + "strings" + + "github.com/parsyl/parquet" + . 
"github.com/parsyl/parquet/performance/message" + sch "github.com/parsyl/parquet/schema" + "github.com/valyala/bytebufferpool" + "math" +) + +type compression int + +const ( + compressionUncompressed compression = 0 + compressionSnappy compression = 1 + compressionGzip compression = 2 + compressionUnknown compression = -1 +) + +var buffpool = bytebufferpool.Pool{} + +// ParquetWriter reprents a row group +type ParquetWriter struct { + fields []Field + + len int + + // child points to the next page + child *ParquetWriter + + // max is the number of Record items that can get written before + // a new set of column chunks is written + max int + + meta *parquet.Metadata + w io.Writer + compression compression +} + +func Fields(compression compression) []Field { + return []Field{ + NewStringOptionalField(readColStr0, writeColStr0, []string{"col_str_0"}, []int{1}, optionalFieldCompression(compression)), + NewStringField(readColStr1, writeColStr1, []string{"col_str_1"}, fieldCompression(compression)), + NewStringOptionalField(readColStr2, writeColStr2, []string{"col_str_2"}, []int{1}, optionalFieldCompression(compression)), + NewStringField(readColStr3, writeColStr3, []string{"col_str_3"}, fieldCompression(compression)), + NewStringOptionalField(readColStr4, writeColStr4, []string{"col_str_4"}, []int{1}, optionalFieldCompression(compression)), + NewStringField(readColStr5, writeColStr5, []string{"col_str_5"}, fieldCompression(compression)), + NewStringOptionalField(readColStr6, writeColStr6, []string{"col_str_6"}, []int{1}, optionalFieldCompression(compression)), + NewStringField(readColStr7, writeColStr7, []string{"col_str_7"}, fieldCompression(compression)), + NewStringOptionalField(readColStr8, writeColStr8, []string{"col_str_8"}, []int{1}, optionalFieldCompression(compression)), + NewStringField(readColStr9, writeColStr9, []string{"col_str_9"}, fieldCompression(compression)), + NewInt64OptionalField(readColInt0, writeColInt0, []string{"col_int_0"}, []int{1}, 
optionalFieldCompression(compression)), + NewInt64Field(readColInt1, writeColInt1, []string{"col_int_1"}, fieldCompression(compression)), + NewInt64OptionalField(readColInt2, writeColInt2, []string{"col_int_2"}, []int{1}, optionalFieldCompression(compression)), + NewInt64Field(readColInt3, writeColInt3, []string{"col_int_3"}, fieldCompression(compression)), + NewInt64OptionalField(readColInt4, writeColInt4, []string{"col_int_4"}, []int{1}, optionalFieldCompression(compression)), + NewInt32OptionalField(readColInt32_0, writeColInt32_0, []string{"col_int_32_0"}, []int{1}, optionalFieldCompression(compression)), + NewInt32Field(readColInt32_1, writeColInt32_1, []string{"col_int_32_1"}, fieldCompression(compression)), + NewInt32OptionalField(readColInt32_2, writeColInt32_2, []string{"col_int_32_2"}, []int{1}, optionalFieldCompression(compression)), + NewInt32Field(readColInt32_3, writeColInt32_3, []string{"col_int_32_3"}, fieldCompression(compression)), + NewInt32OptionalField(readColInt32_4, writeColInt32_4, []string{"col_int_32_4"}, []int{1}, optionalFieldCompression(compression)), + NewFloat64OptionalField(readColFloat0, writeColFloat0, []string{"col_float_0"}, []int{1}, optionalFieldCompression(compression)), + NewFloat64Field(readColFloat1, writeColFloat1, []string{"col_float_1"}, fieldCompression(compression)), + NewFloat64OptionalField(readColFloat2, writeColFloat2, []string{"col_float_2"}, []int{1}, optionalFieldCompression(compression)), + NewFloat64Field(readColFloat3, writeColFloat3, []string{"col_float_3"}, fieldCompression(compression)), + NewFloat64OptionalField(readColFloat4, writeColFloat4, []string{"col_float_4"}, []int{1}, optionalFieldCompression(compression)), + NewFloat32OptionalField(readColFloat32_0, writeColFloat32_0, []string{"col_float_32_0"}, []int{1}, optionalFieldCompression(compression)), + NewFloat32Field(readColFloat32_1, writeColFloat32_1, []string{"col_float_32_1"}, fieldCompression(compression)), + 
NewFloat32OptionalField(readColFloat32_2, writeColFloat32_2, []string{"col_float_32_2"}, []int{1}, optionalFieldCompression(compression)), + NewFloat32Field(readColFloat32_3, writeColFloat32_3, []string{"col_float_32_3"}, fieldCompression(compression)), + NewFloat32OptionalField(readColFloat32_4, writeColFloat32_4, []string{"col_float_32_4"}, []int{1}, optionalFieldCompression(compression)), + NewBoolOptionalField(readColBool0, writeColBool0, []string{"col_bool_0"}, []int{1}, optionalFieldCompression(compression)), + NewBoolField(readColBool1, writeColBool1, []string{"col_bool_1"}, fieldCompression(compression)), + NewBoolOptionalField(readColBool2, writeColBool2, []string{"col_bool_2"}, []int{1}, optionalFieldCompression(compression)), + NewBoolField(readColBool3, writeColBool3, []string{"col_bool_3"}, fieldCompression(compression)), + NewBoolOptionalField(readColBool4, writeColBool4, []string{"col_bool_4"}, []int{1}, optionalFieldCompression(compression)), + NewBoolField(readColBool5, writeColBool5, []string{"col_bool_5"}, fieldCompression(compression)), + NewBoolOptionalField(readColBool6, writeColBool6, []string{"col_bool_6"}, []int{1}, optionalFieldCompression(compression)), + NewBoolField(readColBool7, writeColBool7, []string{"col_bool_7"}, fieldCompression(compression)), + NewBoolOptionalField(readColBool8, writeColBool8, []string{"col_bool_8"}, []int{1}, optionalFieldCompression(compression)), + NewBoolField(readColBool9, writeColBool9, []string{"col_bool_9"}, fieldCompression(compression)), + } +} + +func readColStr0(x Message) ([]string, []uint8, []uint8) { + switch { + case x.ColStr0 == nil: + return nil, []uint8{0}, nil + default: + return []string{*x.ColStr0}, []uint8{1}, nil + } +} + +func writeColStr0(x *Message, vals []string, defs, reps []uint8) (int, int) { + def := defs[0] + switch def { + case 1: + x.ColStr0 = pstring(vals[0]) + return 1, 1 + } + + return 0, 1 +} + +func readColStr1(x Message) string { + return x.ColStr1 +} + +func 
writeColStr1(x *Message, vals []string) { + x.ColStr1 = vals[0] +} + +func readColStr2(x Message) ([]string, []uint8, []uint8) { + switch { + case x.ColStr2 == nil: + return nil, []uint8{0}, nil + default: + return []string{*x.ColStr2}, []uint8{1}, nil + } +} + +func writeColStr2(x *Message, vals []string, defs, reps []uint8) (int, int) { + def := defs[0] + switch def { + case 1: + x.ColStr2 = pstring(vals[0]) + return 1, 1 + } + + return 0, 1 +} + +func readColStr3(x Message) string { + return x.ColStr3 +} + +func writeColStr3(x *Message, vals []string) { + x.ColStr3 = vals[0] +} + +func readColStr4(x Message) ([]string, []uint8, []uint8) { + switch { + case x.ColStr4 == nil: + return nil, []uint8{0}, nil + default: + return []string{*x.ColStr4}, []uint8{1}, nil + } +} + +func writeColStr4(x *Message, vals []string, defs, reps []uint8) (int, int) { + def := defs[0] + switch def { + case 1: + x.ColStr4 = pstring(vals[0]) + return 1, 1 + } + + return 0, 1 +} + +func readColStr5(x Message) string { + return x.ColStr5 +} + +func writeColStr5(x *Message, vals []string) { + x.ColStr5 = vals[0] +} + +func readColStr6(x Message) ([]string, []uint8, []uint8) { + switch { + case x.ColStr6 == nil: + return nil, []uint8{0}, nil + default: + return []string{*x.ColStr6}, []uint8{1}, nil + } +} + +func writeColStr6(x *Message, vals []string, defs, reps []uint8) (int, int) { + def := defs[0] + switch def { + case 1: + x.ColStr6 = pstring(vals[0]) + return 1, 1 + } + + return 0, 1 +} + +func readColStr7(x Message) string { + return x.ColStr7 +} + +func writeColStr7(x *Message, vals []string) { + x.ColStr7 = vals[0] +} + +func readColStr8(x Message) ([]string, []uint8, []uint8) { + switch { + case x.ColStr8 == nil: + return nil, []uint8{0}, nil + default: + return []string{*x.ColStr8}, []uint8{1}, nil + } +} + +func writeColStr8(x *Message, vals []string, defs, reps []uint8) (int, int) { + def := defs[0] + switch def { + case 1: + x.ColStr8 = pstring(vals[0]) + return 1, 1 + } + + 
return 0, 1 +} + +func readColStr9(x Message) string { + return x.ColStr9 +} + +func writeColStr9(x *Message, vals []string) { + x.ColStr9 = vals[0] +} + +func readColInt0(x Message) ([]int64, []uint8, []uint8) { + switch { + case x.ColInt0 == nil: + return nil, []uint8{0}, nil + default: + return []int64{*x.ColInt0}, []uint8{1}, nil + } +} + +func writeColInt0(x *Message, vals []int64, defs, reps []uint8) (int, int) { + def := defs[0] + switch def { + case 1: + x.ColInt0 = pint64(vals[0]) + return 1, 1 + } + + return 0, 1 +} + +func readColInt1(x Message) int64 { + return x.ColInt1 +} + +func writeColInt1(x *Message, vals []int64) { + x.ColInt1 = vals[0] +} + +func readColInt2(x Message) ([]int64, []uint8, []uint8) { + switch { + case x.ColInt2 == nil: + return nil, []uint8{0}, nil + default: + return []int64{*x.ColInt2}, []uint8{1}, nil + } +} + +func writeColInt2(x *Message, vals []int64, defs, reps []uint8) (int, int) { + def := defs[0] + switch def { + case 1: + x.ColInt2 = pint64(vals[0]) + return 1, 1 + } + + return 0, 1 +} + +func readColInt3(x Message) int64 { + return x.ColInt3 +} + +func writeColInt3(x *Message, vals []int64) { + x.ColInt3 = vals[0] +} + +func readColInt4(x Message) ([]int64, []uint8, []uint8) { + switch { + case x.ColInt4 == nil: + return nil, []uint8{0}, nil + default: + return []int64{*x.ColInt4}, []uint8{1}, nil + } +} + +func writeColInt4(x *Message, vals []int64, defs, reps []uint8) (int, int) { + def := defs[0] + switch def { + case 1: + x.ColInt4 = pint64(vals[0]) + return 1, 1 + } + + return 0, 1 +} + +func readColInt32_0(x Message) ([]int32, []uint8, []uint8) { + switch { + case x.ColInt32_0 == nil: + return nil, []uint8{0}, nil + default: + return []int32{*x.ColInt32_0}, []uint8{1}, nil + } +} + +func writeColInt32_0(x *Message, vals []int32, defs, reps []uint8) (int, int) { + def := defs[0] + switch def { + case 1: + x.ColInt32_0 = pint32(vals[0]) + return 1, 1 + } + + return 0, 1 +} + +func readColInt32_1(x Message) int32 { 
+ return x.ColInt32_1 +} + +func writeColInt32_1(x *Message, vals []int32) { + x.ColInt32_1 = vals[0] +} + +func readColInt32_2(x Message) ([]int32, []uint8, []uint8) { + switch { + case x.ColInt32_2 == nil: + return nil, []uint8{0}, nil + default: + return []int32{*x.ColInt32_2}, []uint8{1}, nil + } +} + +func writeColInt32_2(x *Message, vals []int32, defs, reps []uint8) (int, int) { + def := defs[0] + switch def { + case 1: + x.ColInt32_2 = pint32(vals[0]) + return 1, 1 + } + + return 0, 1 +} + +func readColInt32_3(x Message) int32 { + return x.ColInt32_3 +} + +func writeColInt32_3(x *Message, vals []int32) { + x.ColInt32_3 = vals[0] +} + +func readColInt32_4(x Message) ([]int32, []uint8, []uint8) { + switch { + case x.ColInt32_4 == nil: + return nil, []uint8{0}, nil + default: + return []int32{*x.ColInt32_4}, []uint8{1}, nil + } +} + +func writeColInt32_4(x *Message, vals []int32, defs, reps []uint8) (int, int) { + def := defs[0] + switch def { + case 1: + x.ColInt32_4 = pint32(vals[0]) + return 1, 1 + } + + return 0, 1 +} + +func readColFloat0(x Message) ([]float64, []uint8, []uint8) { + switch { + case x.ColFloat0 == nil: + return nil, []uint8{0}, nil + default: + return []float64{*x.ColFloat0}, []uint8{1}, nil + } +} + +func writeColFloat0(x *Message, vals []float64, defs, reps []uint8) (int, int) { + def := defs[0] + switch def { + case 1: + x.ColFloat0 = pfloat64(vals[0]) + return 1, 1 + } + + return 0, 1 +} + +func readColFloat1(x Message) float64 { + return x.ColFloat1 +} + +func writeColFloat1(x *Message, vals []float64) { + x.ColFloat1 = vals[0] +} + +func readColFloat2(x Message) ([]float64, []uint8, []uint8) { + switch { + case x.ColFloat2 == nil: + return nil, []uint8{0}, nil + default: + return []float64{*x.ColFloat2}, []uint8{1}, nil + } +} + +func writeColFloat2(x *Message, vals []float64, defs, reps []uint8) (int, int) { + def := defs[0] + switch def { + case 1: + x.ColFloat2 = pfloat64(vals[0]) + return 1, 1 + } + + return 0, 1 +} + +func 
readColFloat3(x Message) float64 { + return x.ColFloat3 +} + +func writeColFloat3(x *Message, vals []float64) { + x.ColFloat3 = vals[0] +} + +func readColFloat4(x Message) ([]float64, []uint8, []uint8) { + switch { + case x.ColFloat4 == nil: + return nil, []uint8{0}, nil + default: + return []float64{*x.ColFloat4}, []uint8{1}, nil + } +} + +func writeColFloat4(x *Message, vals []float64, defs, reps []uint8) (int, int) { + def := defs[0] + switch def { + case 1: + x.ColFloat4 = pfloat64(vals[0]) + return 1, 1 + } + + return 0, 1 +} + +func readColFloat32_0(x Message) ([]float32, []uint8, []uint8) { + switch { + case x.ColFloat32_0 == nil: + return nil, []uint8{0}, nil + default: + return []float32{*x.ColFloat32_0}, []uint8{1}, nil + } +} + +func writeColFloat32_0(x *Message, vals []float32, defs, reps []uint8) (int, int) { + def := defs[0] + switch def { + case 1: + x.ColFloat32_0 = pfloat32(vals[0]) + return 1, 1 + } + + return 0, 1 +} + +func readColFloat32_1(x Message) float32 { + return x.ColFloat32_1 +} + +func writeColFloat32_1(x *Message, vals []float32) { + x.ColFloat32_1 = vals[0] +} + +func readColFloat32_2(x Message) ([]float32, []uint8, []uint8) { + switch { + case x.ColFloat32_2 == nil: + return nil, []uint8{0}, nil + default: + return []float32{*x.ColFloat32_2}, []uint8{1}, nil + } +} + +func writeColFloat32_2(x *Message, vals []float32, defs, reps []uint8) (int, int) { + def := defs[0] + switch def { + case 1: + x.ColFloat32_2 = pfloat32(vals[0]) + return 1, 1 + } + + return 0, 1 +} + +func readColFloat32_3(x Message) float32 { + return x.ColFloat32_3 +} + +func writeColFloat32_3(x *Message, vals []float32) { + x.ColFloat32_3 = vals[0] +} + +func readColFloat32_4(x Message) ([]float32, []uint8, []uint8) { + switch { + case x.ColFloat32_4 == nil: + return nil, []uint8{0}, nil + default: + return []float32{*x.ColFloat32_4}, []uint8{1}, nil + } +} + +func writeColFloat32_4(x *Message, vals []float32, defs, reps []uint8) (int, int) { + def := defs[0] + 
switch def { + case 1: + x.ColFloat32_4 = pfloat32(vals[0]) + return 1, 1 + } + + return 0, 1 +} + +func readColBool0(x Message) ([]bool, []uint8, []uint8) { + switch { + case x.ColBool0 == nil: + return nil, []uint8{0}, nil + default: + return []bool{*x.ColBool0}, []uint8{1}, nil + } +} + +func writeColBool0(x *Message, vals []bool, defs, reps []uint8) (int, int) { + def := defs[0] + switch def { + case 1: + x.ColBool0 = pbool(vals[0]) + return 1, 1 + } + + return 0, 1 +} + +func readColBool1(x Message) bool { + return x.ColBool1 +} + +func writeColBool1(x *Message, vals []bool) { + x.ColBool1 = vals[0] +} + +func readColBool2(x Message) ([]bool, []uint8, []uint8) { + switch { + case x.ColBool2 == nil: + return nil, []uint8{0}, nil + default: + return []bool{*x.ColBool2}, []uint8{1}, nil + } +} + +func writeColBool2(x *Message, vals []bool, defs, reps []uint8) (int, int) { + def := defs[0] + switch def { + case 1: + x.ColBool2 = pbool(vals[0]) + return 1, 1 + } + + return 0, 1 +} + +func readColBool3(x Message) bool { + return x.ColBool3 +} + +func writeColBool3(x *Message, vals []bool) { + x.ColBool3 = vals[0] +} + +func readColBool4(x Message) ([]bool, []uint8, []uint8) { + switch { + case x.ColBool4 == nil: + return nil, []uint8{0}, nil + default: + return []bool{*x.ColBool4}, []uint8{1}, nil + } +} + +func writeColBool4(x *Message, vals []bool, defs, reps []uint8) (int, int) { + def := defs[0] + switch def { + case 1: + x.ColBool4 = pbool(vals[0]) + return 1, 1 + } + + return 0, 1 +} + +func readColBool5(x Message) bool { + return x.ColBool5 +} + +func writeColBool5(x *Message, vals []bool) { + x.ColBool5 = vals[0] +} + +func readColBool6(x Message) ([]bool, []uint8, []uint8) { + switch { + case x.ColBool6 == nil: + return nil, []uint8{0}, nil + default: + return []bool{*x.ColBool6}, []uint8{1}, nil + } +} + +func writeColBool6(x *Message, vals []bool, defs, reps []uint8) (int, int) { + def := defs[0] + switch def { + case 1: + x.ColBool6 = pbool(vals[0]) + 
return 1, 1 + } + + return 0, 1 +} + +func readColBool7(x Message) bool { + return x.ColBool7 +} + +func writeColBool7(x *Message, vals []bool) { + x.ColBool7 = vals[0] +} + +func readColBool8(x Message) ([]bool, []uint8, []uint8) { + switch { + case x.ColBool8 == nil: + return nil, []uint8{0}, nil + default: + return []bool{*x.ColBool8}, []uint8{1}, nil + } +} + +func writeColBool8(x *Message, vals []bool, defs, reps []uint8) (int, int) { + def := defs[0] + switch def { + case 1: + x.ColBool8 = pbool(vals[0]) + return 1, 1 + } + + return 0, 1 +} + +func readColBool9(x Message) bool { + return x.ColBool9 +} + +func writeColBool9(x *Message, vals []bool) { + x.ColBool9 = vals[0] +} + +func fieldCompression(c compression) func(*parquet.RequiredField) { + switch c { + case compressionUncompressed: + return parquet.RequiredFieldUncompressed + case compressionSnappy: + return parquet.RequiredFieldSnappy + case compressionGzip: + return parquet.RequiredFieldGzip + default: + return parquet.RequiredFieldUncompressed + } +} + +func optionalFieldCompression(c compression) func(*parquet.OptionalField) { + switch c { + case compressionUncompressed: + return parquet.OptionalFieldUncompressed + case compressionSnappy: + return parquet.OptionalFieldSnappy + case compressionGzip: + return parquet.OptionalFieldGzip + default: + return parquet.OptionalFieldUncompressed + } +} + +func NewParquetWriter(w io.Writer, opts ...func(*ParquetWriter) error) (*ParquetWriter, error) { + return newParquetWriter(w, append(opts, begin)...) 
+} + +func newParquetWriter(w io.Writer, opts ...func(*ParquetWriter) error) (*ParquetWriter, error) { + p := &ParquetWriter{ + max: 1000, + w: w, + compression: compressionSnappy, + } + + for _, opt := range opts { + if err := opt(p); err != nil { + return nil, err + } + } + + p.fields = Fields(p.compression) + if p.meta == nil { + ff := Fields(p.compression) + schema := make([]parquet.Field, len(ff)) + for i, f := range ff { + schema[i] = f.Schema() + } + p.meta = parquet.New(schema...) + } + + return p, nil +} + +// MaxPageSize is the maximum number of rows in each row groups' page. +func MaxPageSize(m int) func(*ParquetWriter) error { + return func(p *ParquetWriter) error { + p.max = m + return nil + } +} + +func begin(p *ParquetWriter) error { + _, err := p.w.Write([]byte("PAR1")) + return err +} + +func withMeta(m *parquet.Metadata) func(*ParquetWriter) error { + return func(p *ParquetWriter) error { + p.meta = m + return nil + } +} + +func Uncompressed(p *ParquetWriter) error { + p.compression = compressionUncompressed + return nil +} + +func Snappy(p *ParquetWriter) error { + p.compression = compressionSnappy + return nil +} + +func Gzip(p *ParquetWriter) error { + p.compression = compressionGzip + return nil +} + +func withCompression(c compression) func(*ParquetWriter) error { + return func(p *ParquetWriter) error { + p.compression = c + return nil + } +} + +func (p *ParquetWriter) Write() error { + for i, f := range p.fields { + if err := f.Write(p.w, p.meta); err != nil { + return err + } + + for child := p.child; child != nil; child = child.child { + if err := child.fields[i].Write(p.w, p.meta); err != nil { + return err + } + } + } + + p.fields = Fields(p.compression) + p.child = nil + p.len = 0 + + schema := make([]parquet.Field, len(p.fields)) + for i, f := range p.fields { + schema[i] = f.Schema() + } + p.meta.StartRowGroup(schema...) 
+ return nil +} + +func (p *ParquetWriter) Close() error { + if err := p.meta.Footer(p.w); err != nil { + return err + } + + _, err := p.w.Write([]byte("PAR1")) + return err +} + +func (p *ParquetWriter) Add(rec Message) { + if p.len == p.max { + if p.child == nil { + // an error can't happen here + p.child, _ = newParquetWriter(p.w, MaxPageSize(p.max), withMeta(p.meta), withCompression(p.compression)) + } + + p.child.Add(rec) + return + } + + p.meta.NextDoc() + for _, f := range p.fields { + f.Add(rec) + } + + p.len++ +} + +type Field interface { + Add(r Message) + Write(w io.Writer, meta *parquet.Metadata) error + Schema() parquet.Field + Scan(r *Message) + Read(r io.ReadSeeker, pg parquet.Page) error + Name() string + Levels() ([]uint8, []uint8) +} + +func getFields(ff []Field) map[string]Field { + m := make(map[string]Field, len(ff)) + for _, f := range ff { + m[f.Name()] = f + } + return m +} + +func NewParquetReader(r io.ReadSeeker, opts ...func(*ParquetReader)) (*ParquetReader, error) { + ff := Fields(compressionUnknown) + pr := &ParquetReader{ + r: r, + } + + for _, opt := range opts { + opt(pr) + } + + schema := make([]parquet.Field, len(ff)) + for i, f := range ff { + pr.fieldNames = append(pr.fieldNames, f.Name()) + schema[i] = f.Schema() + } + + meta := parquet.New(schema...) + if err := meta.ReadFooter(r); err != nil { + return nil, err + } + pr.rows = meta.Rows() + var err error + pr.pages, err = meta.Pages() + if err != nil { + return nil, err + } + + pr.rowGroups = meta.RowGroups() + _, err = r.Seek(4, io.SeekStart) + if err != nil { + return nil, err + } + pr.meta = meta + + return pr, pr.readRowGroup() +} + +func readerIndex(i int) func(*ParquetReader) { + return func(p *ParquetReader) { + p.index = i + } +} + +// ParquetReader reads one page from a row group. 
+type ParquetReader struct { + fields map[string]Field + fieldNames []string + index int + cursor int64 + rows int64 + rowGroupCursor int64 + rowGroupCount int64 + pages map[string][]parquet.Page + meta *parquet.Metadata + err error + + r io.ReadSeeker + rowGroups []parquet.RowGroup +} + +type Levels struct { + Name string + Defs []uint8 + Reps []uint8 +} + +func (p *ParquetReader) Levels() []Levels { + var out []Levels + //for { + for _, name := range p.fieldNames { + f := p.fields[name] + d, r := f.Levels() + out = append(out, Levels{Name: f.Name(), Defs: d, Reps: r}) + } + // if err := p.readRowGroup(); err != nil { + // break + // } + //} + return out +} + +func (p *ParquetReader) Error() error { + return p.err +} + +func (p *ParquetReader) readRowGroup() error { + p.rowGroupCursor = 0 + + if len(p.rowGroups) == 0 { + p.rowGroupCount = 0 + return nil + } + + rg := p.rowGroups[0] + p.fields = getFields(Fields(compressionUnknown)) + p.rowGroupCount = rg.Rows + p.rowGroupCursor = 0 + for _, col := range rg.Columns() { + name := strings.Join(col.MetaData.PathInSchema, ".") + f, ok := p.fields[name] + if !ok { + return fmt.Errorf("unknown field: %s", name) + } + pages := p.pages[name] + if len(pages) <= p.index { + break + } + + pg := pages[0] + if err := f.Read(p.r, pg); err != nil { + return fmt.Errorf("unable to read field %s, err: %s", f.Name(), err) + } + p.pages[name] = p.pages[name][1:] + } + p.rowGroups = p.rowGroups[1:] + return nil +} + +func (p *ParquetReader) Rows() int64 { + return p.rows +} + +func (p *ParquetReader) Next() bool { + if p.err == nil && p.cursor >= p.rows { + return false + } + if p.rowGroupCursor >= p.rowGroupCount { + p.err = p.readRowGroup() + if p.err != nil { + return false + } + } + + p.cursor++ + p.rowGroupCursor++ + return true +} + +func (p *ParquetReader) Scan(x *Message) { + if p.err != nil { + return + } + + for _, name := range p.fieldNames { + f := p.fields[name] + f.Scan(x) + } +} + +type StringOptionalField struct { + 
parquet.OptionalField + vals []string + read func(r Message) ([]string, []uint8, []uint8) + write func(r *Message, vals []string, def, rep []uint8) (int, int) + stats *stringOptionalStats +} + +func NewStringOptionalField(read func(r Message) ([]string, []uint8, []uint8), write func(r *Message, vals []string, defs, reps []uint8) (int, int), path []string, types []int, opts ...func(*parquet.OptionalField)) *StringOptionalField { + return &StringOptionalField{ + read: read, + write: write, + OptionalField: parquet.NewOptionalField(path, types, opts...), + stats: newStringOptionalStats(maxDef(types)), + } +} + +func (f *StringOptionalField) Schema() parquet.Field { + return parquet.Field{Name: f.Name(), Path: f.Path(), Type: StringType, RepetitionType: f.RepetitionType, Types: f.Types} +} + +func (f *StringOptionalField) Add(r Message) { + vals, defs, reps := f.read(r) + f.stats.add(vals, defs) + f.vals = append(f.vals, vals...) + f.Defs = append(f.Defs, defs...) + f.Reps = append(f.Reps, reps...) 
+} + +func (f *StringOptionalField) Scan(r *Message) { + if len(f.Defs) == 0 { + return + } + + v, l := f.write(r, f.vals, f.Defs, f.Reps) + f.vals = f.vals[v:] + f.Defs = f.Defs[l:] + if len(f.Reps) > 0 { + f.Reps = f.Reps[l:] + } +} + +func (f *StringOptionalField) Write(w io.Writer, meta *parquet.Metadata) error { + buf := buffpool.Get() + defer buffpool.Put(buf) + + bs := make([]byte, 4) + for _, s := range f.vals { + binary.LittleEndian.PutUint32(bs, uint32(len(s))) + if _, err := buf.Write(bs); err != nil { + return err + } + buf.WriteString(s) + } + + return f.DoWrite(w, meta, buf.Bytes(), len(f.Defs), f.stats) +} + +func (f *StringOptionalField) Read(r io.ReadSeeker, pg parquet.Page) error { + rr, _, err := f.DoRead(r, pg) + if err != nil { + return err + } + + for j := 0; j < f.Values(); j++ { + var x int32 + if err := binary.Read(rr, binary.LittleEndian, &x); err != nil { + return err + } + s := make([]byte, x) + if _, err := rr.Read(s); err != nil { + return err + } + + f.vals = append(f.vals, string(s)) + } + return nil +} + +func (f *StringOptionalField) Levels() ([]uint8, []uint8) { + return f.Defs, f.Reps +} + +type StringField struct { + parquet.RequiredField + vals []string + read func(r Message) string + write func(r *Message, vals []string) + stats *stringStats +} + +func NewStringField(read func(r Message) string, write func(r *Message, vals []string), path []string, opts ...func(*parquet.RequiredField)) *StringField { + return &StringField{ + read: read, + write: write, + RequiredField: parquet.NewRequiredField(path, opts...), + stats: newStringStats(), + } +} + +func (f *StringField) Schema() parquet.Field { + return parquet.Field{Name: f.Name(), Path: f.Path(), Type: StringType, RepetitionType: parquet.RepetitionRequired, Types: []int{0}} +} + +func (f *StringField) Write(w io.Writer, meta *parquet.Metadata) error { + buf := buffpool.Get() + defer buffpool.Put(buf) + + bs := make([]byte, 4) + for _, s := range f.vals { + 
binary.LittleEndian.PutUint32(bs, uint32(len(s))) + if _, err := buf.Write(bs); err != nil { + return err + } + buf.WriteString(s) + } + + return f.DoWrite(w, meta, buf.Bytes(), len(f.vals), f.stats) +} + +func (f *StringField) Read(r io.ReadSeeker, pg parquet.Page) error { + rr, _, err := f.DoRead(r, pg) + if err != nil { + return err + } + + for j := 0; j < pg.N; j++ { + var x int32 + if err := binary.Read(rr, binary.LittleEndian, &x); err != nil { + return err + } + s := make([]byte, x) + if _, err := rr.Read(s); err != nil { + return err + } + + f.vals = append(f.vals, string(s)) + } + return nil +} + +func (f *StringField) Scan(r *Message) { + if len(f.vals) == 0 { + return + } + + f.write(r, f.vals) + f.vals = f.vals[1:] +} + +func (f *StringField) Add(r Message) { + v := f.read(r) + f.stats.add(v) + f.vals = append(f.vals, v) +} + +func (f *StringField) Levels() ([]uint8, []uint8) { + return nil, nil +} + +type Int64OptionalField struct { + parquet.OptionalField + vals []int64 + read func(r Message) ([]int64, []uint8, []uint8) + write func(r *Message, vals []int64, def, rep []uint8) (int, int) + stats *int64optionalStats +} + +func NewInt64OptionalField(read func(r Message) ([]int64, []uint8, []uint8), write func(r *Message, vals []int64, defs, reps []uint8) (int, int), path []string, types []int, opts ...func(*parquet.OptionalField)) *Int64OptionalField { + return &Int64OptionalField{ + read: read, + write: write, + OptionalField: parquet.NewOptionalField(path, types, opts...), + stats: newint64optionalStats(maxDef(types)), + } +} + +func (f *Int64OptionalField) Schema() parquet.Field { + return parquet.Field{Name: f.Name(), Path: f.Path(), Type: Int64Type, RepetitionType: f.RepetitionType, Types: f.Types} +} + +func (f *Int64OptionalField) Write(w io.Writer, meta *parquet.Metadata) error { + buf := buffpool.Get() + defer buffpool.Put(buf) + + bs := make([]byte, 8) + for _, v := range f.vals { + binary.LittleEndian.PutUint64(bs, uint64(v)) + if _, err := 
buf.Write(bs); err != nil { + return err + } + } + return f.DoWrite(w, meta, buf.Bytes(), len(f.Defs), f.stats) +} + +func (f *Int64OptionalField) Read(r io.ReadSeeker, pg parquet.Page) error { + rr, _, err := f.DoRead(r, pg) + if err != nil { + return err + } + + v := make([]int64, f.Values()-len(f.vals)) + err = binary.Read(rr, binary.LittleEndian, &v) + f.vals = append(f.vals, v...) + return err +} + +func (f *Int64OptionalField) Add(r Message) { + vals, defs, reps := f.read(r) + f.stats.add(vals, defs) + f.vals = append(f.vals, vals...) + f.Defs = append(f.Defs, defs...) + f.Reps = append(f.Reps, reps...) +} + +func (f *Int64OptionalField) Scan(r *Message) { + if len(f.Defs) == 0 { + return + } + + v, l := f.write(r, f.vals, f.Defs, f.Reps) + f.vals = f.vals[v:] + f.Defs = f.Defs[l:] + if len(f.Reps) > 0 { + f.Reps = f.Reps[l:] + } +} + +func (f *Int64OptionalField) Levels() ([]uint8, []uint8) { + return f.Defs, f.Reps +} + +type Int64Field struct { + vals []int64 + parquet.RequiredField + read func(r Message) int64 + write func(r *Message, vals []int64) + stats *int64stats +} + +func NewInt64Field(read func(r Message) int64, write func(r *Message, vals []int64), path []string, opts ...func(*parquet.RequiredField)) *Int64Field { + return &Int64Field{ + read: read, + write: write, + RequiredField: parquet.NewRequiredField(path, opts...), + stats: newInt64stats(), + } +} + +func (f *Int64Field) Schema() parquet.Field { + return parquet.Field{Name: f.Name(), Path: f.Path(), Type: Int64Type, RepetitionType: parquet.RepetitionRequired, Types: []int{0}} +} + +func (f *Int64Field) Read(r io.ReadSeeker, pg parquet.Page) error { + rr, _, err := f.DoRead(r, pg) + if err != nil { + return err + } + + v := make([]int64, int(pg.N)) + err = binary.Read(rr, binary.LittleEndian, &v) + f.vals = append(f.vals, v...) 
+ return err +} + +func (f *Int64Field) Write(w io.Writer, meta *parquet.Metadata) error { + buf := buffpool.Get() + defer buffpool.Put(buf) + + bs := make([]byte, 8) + for _, v := range f.vals { + binary.LittleEndian.PutUint64(bs, uint64(v)) + if _, err := buf.Write(bs); err != nil { + return err + } + } + return f.DoWrite(w, meta, buf.Bytes(), len(f.vals), f.stats) +} + +func (f *Int64Field) Scan(r *Message) { + if len(f.vals) == 0 { + return + } + + f.write(r, f.vals) + f.vals = f.vals[1:] +} + +func (f *Int64Field) Add(r Message) { + v := f.read(r) + f.stats.add(v) + f.vals = append(f.vals, v) +} + +func (f *Int64Field) Levels() ([]uint8, []uint8) { + return nil, nil +} + +type Int32OptionalField struct { + parquet.OptionalField + vals []int32 + read func(r Message) ([]int32, []uint8, []uint8) + write func(r *Message, vals []int32, def, rep []uint8) (int, int) + stats *int32optionalStats +} + +func NewInt32OptionalField(read func(r Message) ([]int32, []uint8, []uint8), write func(r *Message, vals []int32, defs, reps []uint8) (int, int), path []string, types []int, opts ...func(*parquet.OptionalField)) *Int32OptionalField { + return &Int32OptionalField{ + read: read, + write: write, + OptionalField: parquet.NewOptionalField(path, types, opts...), + stats: newint32optionalStats(maxDef(types)), + } +} + +func (f *Int32OptionalField) Schema() parquet.Field { + return parquet.Field{Name: f.Name(), Path: f.Path(), Type: Int32Type, RepetitionType: f.RepetitionType, Types: f.Types} +} + +func (f *Int32OptionalField) Write(w io.Writer, meta *parquet.Metadata) error { + buf := buffpool.Get() + defer buffpool.Put(buf) + + bs := make([]byte, 4) + for _, v := range f.vals { + binary.LittleEndian.PutUint32(bs, uint32(v)) + if _, err := buf.Write(bs); err != nil { + return err + } + } + return f.DoWrite(w, meta, buf.Bytes(), len(f.Defs), f.stats) +} + +func (f *Int32OptionalField) Read(r io.ReadSeeker, pg parquet.Page) error { + rr, _, err := f.DoRead(r, pg) + if err != nil { 
+ return err + } + + v := make([]int32, f.Values()-len(f.vals)) + err = binary.Read(rr, binary.LittleEndian, &v) + f.vals = append(f.vals, v...) + return err +} + +func (f *Int32OptionalField) Add(r Message) { + vals, defs, reps := f.read(r) + f.stats.add(vals, defs) + f.vals = append(f.vals, vals...) + f.Defs = append(f.Defs, defs...) + f.Reps = append(f.Reps, reps...) +} + +func (f *Int32OptionalField) Scan(r *Message) { + if len(f.Defs) == 0 { + return + } + + v, l := f.write(r, f.vals, f.Defs, f.Reps) + f.vals = f.vals[v:] + f.Defs = f.Defs[l:] + if len(f.Reps) > 0 { + f.Reps = f.Reps[l:] + } +} + +func (f *Int32OptionalField) Levels() ([]uint8, []uint8) { + return f.Defs, f.Reps +} + +type Int32Field struct { + vals []int32 + parquet.RequiredField + read func(r Message) int32 + write func(r *Message, vals []int32) + stats *int32stats +} + +func NewInt32Field(read func(r Message) int32, write func(r *Message, vals []int32), path []string, opts ...func(*parquet.RequiredField)) *Int32Field { + return &Int32Field{ + read: read, + write: write, + RequiredField: parquet.NewRequiredField(path, opts...), + stats: newInt32stats(), + } +} + +func (f *Int32Field) Schema() parquet.Field { + return parquet.Field{Name: f.Name(), Path: f.Path(), Type: Int32Type, RepetitionType: parquet.RepetitionRequired, Types: []int{0}} +} + +func (f *Int32Field) Read(r io.ReadSeeker, pg parquet.Page) error { + rr, _, err := f.DoRead(r, pg) + if err != nil { + return err + } + + v := make([]int32, int(pg.N)) + err = binary.Read(rr, binary.LittleEndian, &v) + f.vals = append(f.vals, v...) 
+ return err +} + +func (f *Int32Field) Write(w io.Writer, meta *parquet.Metadata) error { + buf := buffpool.Get() + defer buffpool.Put(buf) + + bs := make([]byte, 4) + for _, v := range f.vals { + binary.LittleEndian.PutUint32(bs, uint32(v)) + if _, err := buf.Write(bs); err != nil { + return err + } + } + return f.DoWrite(w, meta, buf.Bytes(), len(f.vals), f.stats) +} + +func (f *Int32Field) Scan(r *Message) { + if len(f.vals) == 0 { + return + } + + f.write(r, f.vals) + f.vals = f.vals[1:] +} + +func (f *Int32Field) Add(r Message) { + v := f.read(r) + f.stats.add(v) + f.vals = append(f.vals, v) +} + +func (f *Int32Field) Levels() ([]uint8, []uint8) { + return nil, nil +} + +type Float64OptionalField struct { + parquet.OptionalField + vals []float64 + read func(r Message) ([]float64, []uint8, []uint8) + write func(r *Message, vals []float64, def, rep []uint8) (int, int) + stats *float64optionalStats +} + +func NewFloat64OptionalField(read func(r Message) ([]float64, []uint8, []uint8), write func(r *Message, vals []float64, defs, reps []uint8) (int, int), path []string, types []int, opts ...func(*parquet.OptionalField)) *Float64OptionalField { + return &Float64OptionalField{ + read: read, + write: write, + OptionalField: parquet.NewOptionalField(path, types, opts...), + stats: newfloat64optionalStats(maxDef(types)), + } +} + +func (f *Float64OptionalField) Schema() parquet.Field { + return parquet.Field{Name: f.Name(), Path: f.Path(), Type: Float64Type, RepetitionType: f.RepetitionType, Types: f.Types} +} + +func (f *Float64OptionalField) Write(w io.Writer, meta *parquet.Metadata) error { + buf := buffpool.Get() + defer buffpool.Put(buf) + + bs := make([]byte, 8) + for _, v := range f.vals { + binary.LittleEndian.PutUint64(bs, math.Float64bits(v)) + if _, err := buf.Write(bs); err != nil { + return err + } + } + return f.DoWrite(w, meta, buf.Bytes(), len(f.Defs), f.stats) +} + +func (f *Float64OptionalField) Read(r io.ReadSeeker, pg parquet.Page) error { + rr, _, 
err := f.DoRead(r, pg) + if err != nil { + return err + } + + v := make([]float64, f.Values()-len(f.vals)) + err = binary.Read(rr, binary.LittleEndian, &v) + f.vals = append(f.vals, v...) + return err +} + +func (f *Float64OptionalField) Add(r Message) { + vals, defs, reps := f.read(r) + f.stats.add(vals, defs) + f.vals = append(f.vals, vals...) + f.Defs = append(f.Defs, defs...) + f.Reps = append(f.Reps, reps...) +} + +func (f *Float64OptionalField) Scan(r *Message) { + if len(f.Defs) == 0 { + return + } + + v, l := f.write(r, f.vals, f.Defs, f.Reps) + f.vals = f.vals[v:] + f.Defs = f.Defs[l:] + if len(f.Reps) > 0 { + f.Reps = f.Reps[l:] + } +} + +func (f *Float64OptionalField) Levels() ([]uint8, []uint8) { + return f.Defs, f.Reps +} + +type Float64Field struct { + vals []float64 + parquet.RequiredField + read func(r Message) float64 + write func(r *Message, vals []float64) + stats *float64stats +} + +func NewFloat64Field(read func(r Message) float64, write func(r *Message, vals []float64), path []string, opts ...func(*parquet.RequiredField)) *Float64Field { + return &Float64Field{ + read: read, + write: write, + RequiredField: parquet.NewRequiredField(path, opts...), + stats: newFloat64stats(), + } +} + +func (f *Float64Field) Schema() parquet.Field { + return parquet.Field{Name: f.Name(), Path: f.Path(), Type: Float64Type, RepetitionType: parquet.RepetitionRequired, Types: []int{0}} +} + +func (f *Float64Field) Read(r io.ReadSeeker, pg parquet.Page) error { + rr, _, err := f.DoRead(r, pg) + if err != nil { + return err + } + + v := make([]float64, int(pg.N)) + err = binary.Read(rr, binary.LittleEndian, &v) + f.vals = append(f.vals, v...) 
+ return err +} + +func (f *Float64Field) Write(w io.Writer, meta *parquet.Metadata) error { + buf := buffpool.Get() + defer buffpool.Put(buf) + + bs := make([]byte, 8) + for _, v := range f.vals { + binary.LittleEndian.PutUint64(bs, math.Float64bits(v)) + if _, err := buf.Write(bs); err != nil { + return err + } + } + return f.DoWrite(w, meta, buf.Bytes(), len(f.vals), f.stats) +} + +func (f *Float64Field) Scan(r *Message) { + if len(f.vals) == 0 { + return + } + + f.write(r, f.vals) + f.vals = f.vals[1:] +} + +func (f *Float64Field) Add(r Message) { + v := f.read(r) + f.stats.add(v) + f.vals = append(f.vals, v) +} + +func (f *Float64Field) Levels() ([]uint8, []uint8) { + return nil, nil +} + +type Float32OptionalField struct { + parquet.OptionalField + vals []float32 + read func(r Message) ([]float32, []uint8, []uint8) + write func(r *Message, vals []float32, def, rep []uint8) (int, int) + stats *float32optionalStats +} + +func NewFloat32OptionalField(read func(r Message) ([]float32, []uint8, []uint8), write func(r *Message, vals []float32, defs, reps []uint8) (int, int), path []string, types []int, opts ...func(*parquet.OptionalField)) *Float32OptionalField { + return &Float32OptionalField{ + read: read, + write: write, + OptionalField: parquet.NewOptionalField(path, types, opts...), + stats: newfloat32optionalStats(maxDef(types)), + } +} + +func (f *Float32OptionalField) Schema() parquet.Field { + return parquet.Field{Name: f.Name(), Path: f.Path(), Type: Float32Type, RepetitionType: f.RepetitionType, Types: f.Types} +} + +func (f *Float32OptionalField) Write(w io.Writer, meta *parquet.Metadata) error { + buf := buffpool.Get() + defer buffpool.Put(buf) + + bs := make([]byte, 4) + for _, v := range f.vals { + binary.LittleEndian.PutUint32(bs, math.Float32bits(v)) + if _, err := buf.Write(bs); err != nil { + return err + } + } + return f.DoWrite(w, meta, buf.Bytes(), len(f.Defs), f.stats) +} + +func (f *Float32OptionalField) Read(r io.ReadSeeker, pg parquet.Page) 
error { + rr, _, err := f.DoRead(r, pg) + if err != nil { + return err + } + + v := make([]float32, f.Values()-len(f.vals)) + err = binary.Read(rr, binary.LittleEndian, &v) + f.vals = append(f.vals, v...) + return err +} + +func (f *Float32OptionalField) Add(r Message) { + vals, defs, reps := f.read(r) + f.stats.add(vals, defs) + f.vals = append(f.vals, vals...) + f.Defs = append(f.Defs, defs...) + f.Reps = append(f.Reps, reps...) +} + +func (f *Float32OptionalField) Scan(r *Message) { + if len(f.Defs) == 0 { + return + } + + v, l := f.write(r, f.vals, f.Defs, f.Reps) + f.vals = f.vals[v:] + f.Defs = f.Defs[l:] + if len(f.Reps) > 0 { + f.Reps = f.Reps[l:] + } +} + +func (f *Float32OptionalField) Levels() ([]uint8, []uint8) { + return f.Defs, f.Reps +} + +type Float32Field struct { + vals []float32 + parquet.RequiredField + read func(r Message) float32 + write func(r *Message, vals []float32) + stats *float32stats +} + +func NewFloat32Field(read func(r Message) float32, write func(r *Message, vals []float32), path []string, opts ...func(*parquet.RequiredField)) *Float32Field { + return &Float32Field{ + read: read, + write: write, + RequiredField: parquet.NewRequiredField(path, opts...), + stats: newFloat32stats(), + } +} + +func (f *Float32Field) Schema() parquet.Field { + return parquet.Field{Name: f.Name(), Path: f.Path(), Type: Float32Type, RepetitionType: parquet.RepetitionRequired, Types: []int{0}} +} + +func (f *Float32Field) Read(r io.ReadSeeker, pg parquet.Page) error { + rr, _, err := f.DoRead(r, pg) + if err != nil { + return err + } + + v := make([]float32, int(pg.N)) + err = binary.Read(rr, binary.LittleEndian, &v) + f.vals = append(f.vals, v...) 
+ return err +} + +func (f *Float32Field) Write(w io.Writer, meta *parquet.Metadata) error { + buf := buffpool.Get() + defer buffpool.Put(buf) + + bs := make([]byte, 4) + for _, v := range f.vals { + binary.LittleEndian.PutUint32(bs, math.Float32bits(v)) + if _, err := buf.Write(bs); err != nil { + return err + } + } + return f.DoWrite(w, meta, buf.Bytes(), len(f.vals), f.stats) +} + +func (f *Float32Field) Scan(r *Message) { + if len(f.vals) == 0 { + return + } + + f.write(r, f.vals) + f.vals = f.vals[1:] +} + +func (f *Float32Field) Add(r Message) { + v := f.read(r) + f.stats.add(v) + f.vals = append(f.vals, v) +} + +func (f *Float32Field) Levels() ([]uint8, []uint8) { + return nil, nil +} + +type BoolOptionalField struct { + parquet.OptionalField + vals []bool + read func(r Message) ([]bool, []uint8, []uint8) + write func(r *Message, vals []bool, defs, reps []uint8) (int, int) + stats *boolOptionalStats +} + +func NewBoolOptionalField(read func(r Message) ([]bool, []uint8, []uint8), write func(r *Message, vals []bool, defs, reps []uint8) (int, int), path []string, types []int, opts ...func(*parquet.OptionalField)) *BoolOptionalField { + return &BoolOptionalField{ + read: read, + write: write, + OptionalField: parquet.NewOptionalField(path, types, opts...), + stats: newBoolOptionalStats(maxDef(types)), + } +} + +func (f *BoolOptionalField) Schema() parquet.Field { + return parquet.Field{Name: f.Name(), Path: f.Path(), Type: BoolType, RepetitionType: f.RepetitionType, Types: f.Types} +} + +func (f *BoolOptionalField) Read(r io.ReadSeeker, pg parquet.Page) error { + rr, sizes, err := f.DoRead(r, pg) + if err != nil { + return err + } + + v, err := parquet.GetBools(rr, f.Values()-len(f.vals), sizes) + f.vals = append(f.vals, v...) 
+ return err +} + +func (f *BoolOptionalField) Scan(r *Message) { + if len(f.Defs) == 0 { + return + } + + v, l := f.write(r, f.vals, f.Defs, f.Reps) + f.vals = f.vals[v:] + f.Defs = f.Defs[l:] + if len(f.Reps) > 0 { + f.Reps = f.Reps[l:] + } +} + +func (f *BoolOptionalField) Add(r Message) { + vals, defs, reps := f.read(r) + f.stats.add(vals, defs) + f.vals = append(f.vals, vals...) + f.Defs = append(f.Defs, defs...) + f.Reps = append(f.Reps, reps...) +} + +func (f *BoolOptionalField) Write(w io.Writer, meta *parquet.Metadata) error { + ln := len(f.vals) + byteNum := (ln + 7) / 8 + rawBuf := make([]byte, byteNum) + + for i := 0; i < ln; i++ { + if f.vals[i] { + rawBuf[i/8] = rawBuf[i/8] | (1 << uint32(i%8)) + } + } + + return f.DoWrite(w, meta, rawBuf, len(f.Defs), f.stats) +} + +func (f *BoolOptionalField) Levels() ([]uint8, []uint8) { + return f.Defs, f.Reps +} + +type BoolField struct { + parquet.RequiredField + vals []bool + read func(r Message) bool + write func(r *Message, vals []bool) + stats *boolStats +} + +func NewBoolField(read func(r Message) bool, write func(r *Message, vals []bool), path []string, opts ...func(*parquet.RequiredField)) *BoolField { + return &BoolField{ + read: read, + write: write, + RequiredField: parquet.NewRequiredField(path, opts...), + } +} + +func (f *BoolField) Schema() parquet.Field { + return parquet.Field{Name: f.Name(), Path: f.Path(), Type: BoolType, RepetitionType: parquet.RepetitionRequired, Types: []int{0}} +} + +func (f *BoolField) Write(w io.Writer, meta *parquet.Metadata) error { + ln := len(f.vals) + n := (ln + 7) / 8 + rawBuf := make([]byte, n) + + for i := 0; i < ln; i++ { + if f.vals[i] { + rawBuf[i/8] = rawBuf[i/8] | (1 << uint32(i%8)) + } + } + + return f.DoWrite(w, meta, rawBuf, len(f.vals), newBoolStats()) +} + +func (f *BoolField) Read(r io.ReadSeeker, pg parquet.Page) error { + rr, sizes, err := f.DoRead(r, pg) + if err != nil { + return err + } + + f.vals, err = parquet.GetBools(rr, int(pg.N), sizes) + 
return err +} + +func (f *BoolField) Scan(r *Message) { + if len(f.vals) == 0 { + return + } + + f.write(r, f.vals) + f.vals = f.vals[1:] +} + +func (f *BoolField) Add(r Message) { + v := f.read(r) + f.vals = append(f.vals, v) +} + +func (f *BoolField) Levels() ([]uint8, []uint8) { + return nil, nil +} + +const nilOptString = "__#NIL#__" + +type stringOptionalStats struct { + min string + max string + nils int64 + maxDef uint8 +} + +func newStringOptionalStats(d uint8) *stringOptionalStats { + return &stringOptionalStats{ + min: nilOptString, + max: nilOptString, + maxDef: d, + } +} + +func (s *stringOptionalStats) add(vals []string, defs []uint8) { + var i int + for _, def := range defs { + if def < s.maxDef { + s.nils++ + } else { + val := vals[i] + if s.min == nilOptString { + s.min = val + } else { + if val < s.min { + s.min = val + } + } + if s.max == nilOptString { + s.max = val + } else { + if val > s.max { + s.max = val + } + } + i++ + } + } +} + +func (s *stringOptionalStats) NullCount() *int64 { + return &s.nils +} + +func (s *stringOptionalStats) DistinctCount() *int64 { + return nil +} + +func (s *stringOptionalStats) Min() []byte { + if s.min == nilOptString { + return nil + } + return []byte(s.min) +} + +func (s *stringOptionalStats) Max() []byte { + if s.max == nilOptString { + return nil + } + return []byte(s.max) +} + +const nilString = "__#NIL#__" + +type stringStats struct { + min string + max string +} + +func newStringStats() *stringStats { + return &stringStats{ + min: nilString, + max: nilString, + } +} + +func (s *stringStats) add(val string) { + if s.min == nilString { + s.min = val + } else { + if val < s.min { + s.min = val + } + } + if s.max == nilString { + s.max = val + } else { + if val > s.max { + s.max = val + } + } +} + +func (s *stringStats) NullCount() *int64 { + return nil +} + +func (s *stringStats) DistinctCount() *int64 { + return nil +} + +func (s *stringStats) Min() []byte { + if s.min == nilString { + return nil + } + return 
[]byte(s.min) +} + +func (s *stringStats) Max() []byte { + if s.max == nilString { + return nil + } + return []byte(s.max) +} + +type int64optionalStats struct { + min int64 + max int64 + nils int64 + nonNils int64 + maxDef uint8 +} + +func newint64optionalStats(d uint8) *int64optionalStats { + return &int64optionalStats{ + min: int64(math.MaxInt64), + maxDef: d, + } +} + +func (f *int64optionalStats) add(vals []int64, defs []uint8) { + var i int + for _, def := range defs { + if def < f.maxDef { + f.nils++ + } else { + val := vals[i] + i++ + + f.nonNils++ + if val < f.min { + f.min = val + } + if val > f.max { + f.max = val + } + } + } +} + +func (f *int64optionalStats) bytes(v int64) []byte { + bs := make([]byte, 8) + binary.LittleEndian.PutUint64(bs, uint64(v)) + return bs +} + +func (f *int64optionalStats) NullCount() *int64 { + return &f.nils +} + +func (f *int64optionalStats) DistinctCount() *int64 { + return nil +} + +func (f *int64optionalStats) Min() []byte { + if f.nonNils == 0 { + return nil + } + return f.bytes(f.min) +} + +func (f *int64optionalStats) Max() []byte { + if f.nonNils == 0 { + return nil + } + return f.bytes(f.max) +} + +type int64stats struct { + min int64 + max int64 +} + +func newInt64stats() *int64stats { + return &int64stats{ + min: int64(math.MaxInt64), + } +} + +func (i *int64stats) add(val int64) { + if val < i.min { + i.min = val + } + if val > i.max { + i.max = val + } +} + +func (f *int64stats) bytes(v int64) []byte { + bs := make([]byte, 8) + binary.LittleEndian.PutUint64(bs, uint64(v)) + return bs +} + +func (f *int64stats) NullCount() *int64 { + return nil +} + +func (f *int64stats) DistinctCount() *int64 { + return nil +} + +func (f *int64stats) Min() []byte { + return f.bytes(f.min) +} + +func (f *int64stats) Max() []byte { + return f.bytes(f.max) +} + +type int32optionalStats struct { + min int32 + max int32 + nils int64 + nonNils int64 + maxDef uint8 +} + +func newint32optionalStats(d uint8) *int32optionalStats { + return 
&int32optionalStats{ + min: int32(math.MaxInt32), + maxDef: d, + } +} + +func (f *int32optionalStats) add(vals []int32, defs []uint8) { + var i int + for _, def := range defs { + if def < f.maxDef { + f.nils++ + } else { + val := vals[i] + i++ + + f.nonNils++ + if val < f.min { + f.min = val + } + if val > f.max { + f.max = val + } + } + } +} + +func (f *int32optionalStats) bytes(v int32) []byte { + bs := make([]byte, 4) + binary.LittleEndian.PutUint32(bs, uint32(v)) + return bs +} + +func (f *int32optionalStats) NullCount() *int64 { + return &f.nils +} + +func (f *int32optionalStats) DistinctCount() *int64 { + return nil +} + +func (f *int32optionalStats) Min() []byte { + if f.nonNils == 0 { + return nil + } + return f.bytes(f.min) +} + +func (f *int32optionalStats) Max() []byte { + if f.nonNils == 0 { + return nil + } + return f.bytes(f.max) +} + +type int32stats struct { + min int32 + max int32 +} + +func newInt32stats() *int32stats { + return &int32stats{ + min: int32(math.MaxInt32), + } +} + +func (i *int32stats) add(val int32) { + if val < i.min { + i.min = val + } + if val > i.max { + i.max = val + } +} + +func (f *int32stats) bytes(v int32) []byte { + bs := make([]byte, 4) + binary.LittleEndian.PutUint32(bs, uint32(v)) + return bs +} + +func (f *int32stats) NullCount() *int64 { + return nil +} + +func (f *int32stats) DistinctCount() *int64 { + return nil +} + +func (f *int32stats) Min() []byte { + return f.bytes(f.min) +} + +func (f *int32stats) Max() []byte { + return f.bytes(f.max) +} + +type float64optionalStats struct { + min float64 + max float64 + nils int64 + nonNils int64 + maxDef uint8 +} + +func newfloat64optionalStats(d uint8) *float64optionalStats { + return &float64optionalStats{ + min: float64(math.MaxFloat64), + maxDef: d, + } +} + +func (f *float64optionalStats) add(vals []float64, defs []uint8) { + var i int + for _, def := range defs { + if def < f.maxDef { + f.nils++ + } else { + val := vals[i] + i++ + + f.nonNils++ + if val < f.min { + 
f.min = val + } + if val > f.max { + f.max = val + } + } + } +} + +func (f *float64optionalStats) bytes(v float64) []byte { + bs := make([]byte, 8) + binary.LittleEndian.PutUint64(bs, math.Float64bits(v)) + return bs +} + +func (f *float64optionalStats) NullCount() *int64 { + return &f.nils +} + +func (f *float64optionalStats) DistinctCount() *int64 { + return nil +} + +func (f *float64optionalStats) Min() []byte { + if f.nonNils == 0 { + return nil + } + return f.bytes(f.min) +} + +func (f *float64optionalStats) Max() []byte { + if f.nonNils == 0 { + return nil + } + return f.bytes(f.max) +} + +type float64stats struct { + min float64 + max float64 +} + +func newFloat64stats() *float64stats { + return &float64stats{ + min: float64(math.MaxFloat64), + } +} + +func (i *float64stats) add(val float64) { + if val < i.min { + i.min = val + } + if val > i.max { + i.max = val + } +} + +func (f *float64stats) bytes(v float64) []byte { + bs := make([]byte, 8) + binary.LittleEndian.PutUint64(bs, math.Float64bits(v)) + return bs +} + +func (f *float64stats) NullCount() *int64 { + return nil +} + +func (f *float64stats) DistinctCount() *int64 { + return nil +} + +func (f *float64stats) Min() []byte { + return f.bytes(f.min) +} + +func (f *float64stats) Max() []byte { + return f.bytes(f.max) +} + +type float32optionalStats struct { + min float32 + max float32 + nils int64 + nonNils int64 + maxDef uint8 +} + +func newfloat32optionalStats(d uint8) *float32optionalStats { + return &float32optionalStats{ + min: float32(math.MaxFloat32), + maxDef: d, + } +} + +func (f *float32optionalStats) add(vals []float32, defs []uint8) { + var i int + for _, def := range defs { + if def < f.maxDef { + f.nils++ + } else { + val := vals[i] + i++ + + f.nonNils++ + if val < f.min { + f.min = val + } + if val > f.max { + f.max = val + } + } + } +} + +func (f *float32optionalStats) bytes(v float32) []byte { + bs := make([]byte, 4) + binary.LittleEndian.PutUint32(bs, math.Float32bits(v)) + return bs 
+} + +func (f *float32optionalStats) NullCount() *int64 { + return &f.nils +} + +func (f *float32optionalStats) DistinctCount() *int64 { + return nil +} + +func (f *float32optionalStats) Min() []byte { + if f.nonNils == 0 { + return nil + } + return f.bytes(f.min) +} + +func (f *float32optionalStats) Max() []byte { + if f.nonNils == 0 { + return nil + } + return f.bytes(f.max) +} + +type float32stats struct { + min float32 + max float32 +} + +func newFloat32stats() *float32stats { + return &float32stats{ + min: float32(math.MaxFloat32), + } +} + +func (i *float32stats) add(val float32) { + if val < i.min { + i.min = val + } + if val > i.max { + i.max = val + } +} + +func (f *float32stats) bytes(v float32) []byte { + bs := make([]byte, 4) + binary.LittleEndian.PutUint32(bs, math.Float32bits(v)) + return bs +} + +func (f *float32stats) NullCount() *int64 { + return nil +} + +func (f *float32stats) DistinctCount() *int64 { + return nil +} + +func (f *float32stats) Min() []byte { + return f.bytes(f.min) +} + +func (f *float32stats) Max() []byte { + return f.bytes(f.max) +} + +type boolOptionalStats struct { + maxDef uint8 + nils int64 +} + +func newBoolOptionalStats(d uint8) *boolOptionalStats { + return &boolOptionalStats{maxDef: d} +} + +func (b *boolOptionalStats) add(vals []bool, defs []uint8) { + for _, def := range defs { + if def < b.maxDef { + b.nils++ + } + } +} + +func (b *boolOptionalStats) NullCount() *int64 { + return &b.nils +} + +func (b *boolOptionalStats) DistinctCount() *int64 { + return nil +} + +func (b *boolOptionalStats) Min() []byte { + return nil +} + +func (b *boolOptionalStats) Max() []byte { + return nil +} + +type boolStats struct{} + +func newBoolStats() *boolStats { return &boolStats{} } +func (b *boolStats) NullCount() *int64 { return nil } +func (b *boolStats) DistinctCount() *int64 { return nil } +func (b *boolStats) Min() []byte { return nil } +func (b *boolStats) Max() []byte { return nil } + +func pint32(i int32) *int32 { return &i } 
+func puint32(i uint32) *uint32 { return &i } +func pint64(i int64) *int64 { return &i } +func puint64(i uint64) *uint64 { return &i } +func pbool(b bool) *bool { return &b } +func pstring(s string) *string { return &s } +func pfloat32(f float32) *float32 { return &f } +func pfloat64(f float64) *float64 { return &f } + +// keeps track of the indices of repeated fields +// that have already been handled by a previous field +type indices []int + +func (i indices) rep(rep uint8) { + if rep > 0 { + r := int(rep) - 1 + i[r] = i[r] + 1 + for j := int(rep); j < len(i); j++ { + i[j] = 0 + } + } +} + +func maxDef(types []int) uint8 { + var out uint8 + for _, typ := range types { + if typ > 0 { + out++ + } + } + return out +} + +func Int32Type(se *sch.SchemaElement) { + t := sch.Type_INT32 + se.Type = &t +} + +func Uint32Type(se *sch.SchemaElement) { + t := sch.Type_INT32 + se.Type = &t + ct := sch.ConvertedType_UINT_32 + se.ConvertedType = &ct +} + +func Int64Type(se *sch.SchemaElement) { + t := sch.Type_INT64 + se.Type = &t +} + +func Uint64Type(se *sch.SchemaElement) { + t := sch.Type_INT64 + se.Type = &t + ct := sch.ConvertedType_UINT_64 + se.ConvertedType = &ct +} + +func Float32Type(se *sch.SchemaElement) { + t := sch.Type_FLOAT + se.Type = &t +} + +func Float64Type(se *sch.SchemaElement) { + t := sch.Type_DOUBLE + se.Type = &t +} + +func BoolType(se *sch.SchemaElement) { + t := sch.Type_BOOLEAN + se.Type = &t +} + +func StringType(se *sch.SchemaElement) { + t := sch.Type_BYTE_ARRAY + se.Type = &t +} diff --git a/performance/parquet_performance_test.go b/performance/parquet_performance_test.go new file mode 100644 index 0000000..c5b46bc --- /dev/null +++ b/performance/parquet_performance_test.go @@ -0,0 +1,115 @@ +package performance + +import ( + "bytes" + "github.com/bxcodec/faker/v3" + "github.com/parsyl/parquet/performance/base" + "github.com/parsyl/parquet/performance/message" + "math/rand" + "testing" +) + +const ( + writeBatch = 5_000 + inputSize = 100_000 +) + 
+type parquetWriter interface { + Add(rec message.Message) + Write() error + Close() error +} + +func generateTestData(count int) []message.Message { + res := make([]message.Message, count) + for i := 0; i < count; i++ { + err := faker.FakeData(&res[i]) + // faker doesn't set nil, so we set them ourselves sometimes + if rand.Intn(2) == 0 { + res[i].ColBool0 = nil + res[i].ColFloat0 = nil + res[i].ColFloat32_0 = nil + res[i].ColInt0 = nil + res[i].ColInt32_0 = nil + res[i].ColStr0 = nil + } + if err != nil { + panic(err) + } + } + return res +} + +func benchmarkParquet(b *testing.B, data []message.Message, buf *bytes.Buffer, getWriter func(*bytes.Buffer) parquetWriter) { + writeOnce := func() { + writer := getWriter(buf) + for i := range data { + writer.Add(data[i]) + if i%writeBatch == 0 { + err := writer.Write() + if err != nil { + b.Fatal(err) + } + } + } + err := writer.Write() + if err != nil { + b.Fatal(err) + } + err = writer.Close() + if err != nil { + b.Fatal(err) + } + } + + writeOnce() // the first time will allocate the buffer to the correct size + b.ReportAllocs() + b.ResetTimer() + for i := 0; i < b.N; i++ { + buf.Reset() + writeOnce() + } +} + +func BenchmarkWrite(b *testing.B) { + data := generateTestData(inputSize) + + var baseBuff bytes.Buffer + b.Run("base", func(b *testing.B) { + getWriter := func(buf *bytes.Buffer) parquetWriter { + writer, err := base.NewParquetWriter(buf) + if err != nil { + b.Fatal(err) + } + return writer + } + + benchmarkParquet(b, data, &baseBuff, getWriter) + }) + + var optBuff bytes.Buffer + b.Run("opt", func(b *testing.B) { + getWriter := func(buf *bytes.Buffer) parquetWriter { + writer, err := NewParquetWriter(buf) + if err != nil { + b.Fatal(err) + } + return writer + } + benchmarkParquet(b, data, &optBuff, getWriter) + }) + + baseBytes := baseBuff.Bytes() + optBytes := optBuff.Bytes() + + // to make sure we didn't break anything + if len(baseBytes) != len(optBytes) || 
len(baseBytes) == 0 { + b.Fatal("length", baseBuff.Len(), optBuff.Len()) + } + + for i := 0; i < len(baseBytes); i++ { + if baseBytes[i] != optBytes[i] { + b.Fatal("bytes incorrect at ", i) + } + } +}