From d6df5be49f9758c1806d6a91690ad1be43e9c868 Mon Sep 17 00:00:00 2001 From: marloploemen Date: Fri, 15 Mar 2024 17:13:30 +0100 Subject: [PATCH 1/5] support unambiguous textual json (from/to) --- codec.go | 20 +++++ union.go | 127 ++++++++++++++++++++++++++---- union_test.go | 208 ++++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 341 insertions(+), 14 deletions(-) diff --git a/codec.go b/codec.go index ee5bda1..c1a1d02 100644 --- a/codec.go +++ b/codec.go @@ -198,6 +198,26 @@ func NewCodecForStandardJSONFull(schemaSpecification string) (*Codec, error) { }) } +// NewCodecForUnambiguousJSON provides full serialization/deserialization +// for json that meets the expectations of regular internet json, viewed as +// something distinct from avro-json which has special handling for union +// types. For details see the above comments. +// +// With this `codec` you can expect to see a json string like this: +// +// "Follow your bliss." +// +// to deserialize into the same json structure +// +// "Follow your bliss." 
+func NewCodecForUnambiguousJSON(schemaSpecification string) (*Codec, error) { + return NewCodecFrom(schemaSpecification, &codecBuilder{ + buildCodecForTypeDescribedByMap, + buildCodecForTypeDescribedByString, + buildCodecForTypeDescribedBySliceUnambiguousJSON, + }) +} + func NewCodecFrom(schemaSpecification string, cb *codecBuilder) (*Codec, error) { var schema interface{} diff --git a/union.go b/union.go index 031e84f..5954db3 100644 --- a/union.go +++ b/union.go @@ -20,10 +20,27 @@ import ( // codecInfo is a set of quick lookups it holds all the lookup info for the // all the schemas we need to handle the list of types for this union type codecInfo struct { - allowedTypes []string - codecFromIndex []*Codec - codecFromName map[string]*Codec - indexFromName map[string]int + allowedTypes []string + codecFromIndex []*Codec + codecFromName map[string]*Codec + indexFromName map[string]int + unambiguousMode bool +} + +// isNullable returns if the "null" type is one of the registered types +func (cr codecInfo) isNullable() bool { + _, nullable := cr.indexFromName["null"] + return nullable +} + +// numConcreteTypes returns the number of concrete types (not "null") specified to the codec +func (cr codecInfo) numConcreteTypes() int { + _, nullable := cr.indexFromName["null"] + numConcreteTypes := len(cr.allowedTypes) + if nullable { + numConcreteTypes -= 1 + } + return numConcreteTypes } // Union wraps a datum value in a map for encoding as a Union, as required by @@ -163,8 +180,7 @@ func unionTextualFromNative(cr *codecInfo) func(buf []byte, datum interface{}) ( return func(buf []byte, datum interface{}) ([]byte, error) { switch v := datum.(type) { case nil: - _, ok := cr.indexFromName["null"] - if !ok { + if !cr.isNullable() { return nil, fmt.Errorf("cannot encode textual union: no member schema types support datum: allowed types: %v; received: %T", cr.allowedTypes, datum) } return append(buf, "null"...), nil @@ -178,19 +194,24 @@ func unionTextualFromNative(cr 
*codecInfo) func(buf []byte, datum interface{}) ( if !ok { return nil, fmt.Errorf("cannot encode textual union: no member schema types support datum: allowed types: %v; received: %T", cr.allowedTypes, datum) } - buf = append(buf, '{') var err error - buf, err = stringTextualFromNative(buf, key) - if err != nil { - return nil, fmt.Errorf("cannot encode textual union: %s", err) + if !cr.unambiguousMode || cr.numConcreteTypes() > 1 { + buf = append(buf, '{') + buf, err = stringTextualFromNative(buf, key) + if err != nil { + return nil, fmt.Errorf("cannot encode textual union: %s", err) + } + buf = append(buf, ':') } - buf = append(buf, ':') c := cr.codecFromIndex[index] buf, err = c.textualFromNative(buf, value) if err != nil { return nil, fmt.Errorf("cannot encode textual union: %s", err) } - return append(buf, '}'), nil + if !cr.unambiguousMode || cr.numConcreteTypes() > 1 { + buf = append(buf, '}') + } + return buf, nil } } return nil, fmt.Errorf("cannot encode textual union: non-nil values ought to be specified with Go map[string]interface{}, with single key equal to type name, and value equal to datum value: %v; received: %T", cr.allowedTypes, datum) @@ -200,8 +221,7 @@ func textualJSONFromNativeAvro(cr *codecInfo) func(buf []byte, datum interface{} return func(buf []byte, datum interface{}) ([]byte, error) { switch v := datum.(type) { case nil: - _, ok := cr.indexFromName["null"] - if !ok { + if !cr.isNullable() { return nil, fmt.Errorf("cannot encode textual union: no member schema types support datum: allowed types: %v; received: %T", cr.allowedTypes, datum) } return append(buf, "null"...), nil @@ -301,6 +321,32 @@ func buildCodecForTypeDescribedBySliceOneWayJSON(st map[string]*Codec, enclosing } return rv, nil } +func buildCodecForTypeDescribedBySliceUnambiguousJSON(st map[string]*Codec, enclosingNamespace string, schemaArray []interface{}, cb *codecBuilder) (*Codec, error) { + if len(schemaArray) == 0 { + return nil, errors.New("Union ought to have one or 
more members") + } + + cr, err := makeCodecInfo(st, enclosingNamespace, schemaArray, cb) + cr.unambiguousMode = true + if err != nil { + return nil, err + } + + rv := &Codec{ + // NOTE: To support record field default values, union schema set to the + // type name of first member + // TODO: add/change to schemaCanonical below + schemaOriginal: cr.codecFromIndex[0].typeName.fullName, + + typeName: &name{"union", nullNamespace}, + nativeFromBinary: unionNativeFromBinary(&cr), + binaryFromNative: unionBinaryFromNative(&cr), + nativeFromTextual: nativeAvroFromTextualJSON(&cr), + textualFromNative: unionTextualFromNative(&cr), + } + return rv, nil +} + func buildCodecForTypeDescribedBySliceTwoWayJSON(st map[string]*Codec, enclosingNamespace string, schemaArray []interface{}, cb *codecBuilder) (*Codec, error) { if len(schemaArray) == 0 { return nil, errors.New("Union ought to have one or more members") @@ -340,6 +386,11 @@ func checkAll(allowedTypes []string, cr *codecInfo, buf []byte) (interface{}, [] if err != nil { continue } + + // in unambiguous mode, don't return the type if only a single concrete type is registered + if cr.unambiguousMode && cr.numConcreteTypes() == 1 { + return rv, rb, nil + } return map[string]interface{}{name: rv}, rb, nil } return nil, buf, fmt.Errorf("could not decode any json data in input %v", string(buf)) @@ -405,11 +456,59 @@ func nativeAvroFromTextualJSON(cr *codecInfo) func(buf []byte) (interface{}, []b sort.Strings(cr.allowedTypes) case map[string]interface{}: + if cr.unambiguousMode && cr.numConcreteTypes() > 1 { + asmap, ok := m.(map[string]interface{}) // we know this cast cannot fail + if !ok || len(asmap) != 1 { + return nil, buf, fmt.Errorf("expected map with a single key, got: %v", string(buf)) + } + + var name string + var value []byte + for _name, _value := range asmap { + name = _name + var err error + value, err = json.Marshal(_value) + if err != nil { + return nil, buf, fmt.Errorf("could not read value of type as []byte: 
%v", _value) + } + } + + index, ok := cr.indexFromName[name] + if !ok { + return nil, buf, fmt.Errorf("invalid type: %v", name) + } + + c := cr.codecFromIndex[index] + rv, rb, err := c.NativeFromTextual(value) + if err != nil { + return nil, buf, fmt.Errorf("could not decode json data in input: %v: %v", string(buf), err) + } + return map[string]interface{}{name: rv}, rb, nil + } // try to decode it as a map // because a map should fail faster than a record // if that fails assume record and return it sort.Strings(cr.allowedTypes) + case interface{}: + // if running in unambiguous mode, allow a nullable (NULL, T) type to be checked + if cr.unambiguousMode && cr.numConcreteTypes() == 2 { + // get T + var index int + for _key, _index := range cr.indexFromName { + if _key != "null" { + index = _index + break + } + } + + c := cr.codecFromIndex[index] + rv, rb, err := c.NativeFromTextual(buf) + if err != nil { + return nil, buf, fmt.Errorf("could not decode json data in input: %v: %v", string(buf), err) + } + return rv, rb, nil + } } return checkAll(allowedTypes, cr, buf) diff --git a/union_test.go b/union_test.go index b66884f..0903036 100644 --- a/union_test.go +++ b/union_test.go @@ -263,6 +263,214 @@ func ExampleCodec_TextualFromNative_json() { // Output: {"string":"some string"} } +// Use the unambiguous JSON codec instead for nullable types +func ExampleCodec_TextualFromNative_unambiguous_primitive() { + codec, err := NewCodecFrom(`["null","string"]`, &codecBuilder{ + buildCodecForTypeDescribedByMap, + buildCodecForTypeDescribedByString, + buildCodecForTypeDescribedBySliceUnambiguousJSON, + }) + if err != nil { + fmt.Println(err) + } + buf, err := codec.TextualFromNative(nil, Union("string", "some string")) + if err != nil { + fmt.Println(err) + } + fmt.Println(string(buf)) + // Output: "some string" +} + +// Use the unambiguous JSON codec instead for nullable types +func ExampleCodec_NativeFromTextual_unambiguous_primitive() { + codec, err := 
NewCodecFrom(`["null","string"]`, &codecBuilder{ + buildCodecForTypeDescribedByMap, + buildCodecForTypeDescribedByString, + buildCodecForTypeDescribedBySliceUnambiguousJSON, + }) + if err != nil { + fmt.Println(err) + } + // send in a legit json string + t, _, err := codec.NativeFromTextual([]byte("\"some string\"")) + if err != nil { + fmt.Println(err) + } + // see it parse directly into string + o, ok := t.(string) + if !ok { + fmt.Printf("its a %T not a string", t) + } + // pull out the string to show its all good + fmt.Println(o) + // Output: some string +} + +// Use the unambiguous JSON codec instead for nullable types +func ExampleCodec_TextualFromNative_unambiguous_record() { + codec, err := NewCodecFrom(`["null",{"type": "record", "name": "Person", "fields": [{"name": "name", "type": "string"}]}]`, &codecBuilder{ + buildCodecForTypeDescribedByMap, + buildCodecForTypeDescribedByString, + buildCodecForTypeDescribedBySliceUnambiguousJSON, + }) + if err != nil { + fmt.Println(err) + } + buf, err := codec.TextualFromNative(nil, Union("Person", map[string]interface{}{"name": "John Doe"})) + if err != nil { + fmt.Println(err) + } + fmt.Println(string(buf)) + // Output: {"name":"John Doe"} +} + +// Use the unambiguous JSON codec instead for nullable types +func ExampleCodec_NativeFromTextual_unambiguous_record() { + codec, err := NewCodecFrom(`["null",{"type": "record", "name": "Person", "fields": [{"name": "name", "type": "string"}]}]`, &codecBuilder{ + buildCodecForTypeDescribedByMap, + buildCodecForTypeDescribedByString, + buildCodecForTypeDescribedBySliceUnambiguousJSON, + }) + if err != nil { + fmt.Println(err) + } + // send in a legit json string + t, _, err := codec.NativeFromTextual([]byte("{\"name\": \"John Doe\"}")) + if err != nil { + fmt.Println(err) + } + // see it parse directly into string + o, ok := t.(map[string]interface{}) + if !ok { + fmt.Printf("its a %T not a string", t) + } + // pull out the string to show its all good + fmt.Println(o) + // 
Output: map[name:John Doe] +} + +// Use the unambiguous JSON codec instead for nullable types +func ExampleCodec_TextualFromNative_unambiguous_nil() { + codec, err := NewCodecFrom(`["null","string"]`, &codecBuilder{ + buildCodecForTypeDescribedByMap, + buildCodecForTypeDescribedByString, + buildCodecForTypeDescribedBySliceUnambiguousJSON, + }) + if err != nil { + fmt.Println(err) + } + buf, err := codec.TextualFromNative(nil, Union("null", nil)) + if err != nil { + fmt.Println(err) + } + fmt.Println(string(buf)) + // Output: null +} + +// Use the unambiguous JSON codec instead for nullable types +func ExampleCodec_NativeFromTextual_unambiguous_nil() { + codec, err := NewCodecFrom(`["null","string"]`, &codecBuilder{ + buildCodecForTypeDescribedByMap, + buildCodecForTypeDescribedByString, + buildCodecForTypeDescribedBySliceUnambiguousJSON, + }) + if err != nil { + fmt.Println(err) + } + // send in a legit json string + t, _, err := codec.NativeFromTextual([]byte("null")) + if err != nil { + fmt.Println(err) + } + // pull out the string to show its all good + fmt.Println(t) + // Output: +} + +// Use the unambiguous JSON codec instead for nullable types +func ExampleCodec_TextualFromNative_ambiguous_primitive() { + codec, err := NewCodecFrom(`["int","string"]`, &codecBuilder{ + buildCodecForTypeDescribedByMap, + buildCodecForTypeDescribedByString, + buildCodecForTypeDescribedBySliceUnambiguousJSON, + }) + if err != nil { + fmt.Println(err) + } + buf, err := codec.TextualFromNative(nil, Union("string", "some string")) + if err != nil { + fmt.Println(err) + } + fmt.Println(string(buf)) + // Output: {"string":"some string"} +} + +// Use the unambiguous JSON codec instead for nullable types +func ExampleCodec_NativeFromTextual_ambiguous_primitive() { + codec, err := NewCodecFrom(`["int","string"]`, &codecBuilder{ + buildCodecForTypeDescribedByMap, + buildCodecForTypeDescribedByString, + buildCodecForTypeDescribedBySliceUnambiguousJSON, + }) + if err != nil { + 
fmt.Println(err) + } + // send in a legit json string + t, _, err := codec.NativeFromTextual([]byte("{\"string\": \"some string\"}")) + // see it parse into a map like the avro encoder does + o, ok := t.(map[string]interface{}) + if !ok { + fmt.Printf("its a %T not a map[string]interface{}", t) + } + // pull out the string to show its all good + v := o["string"] + fmt.Println(v) + // Output: some string +} + +func ExampleCodec_TextualFromNative_ambiguous_record() { + codec, err := NewCodecFrom(`["int",{"type": "record", "name": "Person", "fields": [{"name": "name", "type": "string"}]}]`, &codecBuilder{ + buildCodecForTypeDescribedByMap, + buildCodecForTypeDescribedByString, + buildCodecForTypeDescribedBySliceUnambiguousJSON, + }) + if err != nil { + fmt.Println(err) + } + buf, err := codec.TextualFromNative(nil, Union("Person", map[string]interface{}{"name": "John Doe"})) + if err != nil { + fmt.Println(err) + } + fmt.Println(string(buf)) + // Output: {"Person":{"name":"John Doe"}} +} + +// Use the unambiguous JSON codec instead for nullable types +func ExampleCodec_NativeFromTextual_ambiguous_record() { + codec, err := NewCodecFrom(`["int",{"type": "record", "name": "Person", "fields": [{"name": "name", "type": "string"}]}]`, &codecBuilder{ + buildCodecForTypeDescribedByMap, + buildCodecForTypeDescribedByString, + buildCodecForTypeDescribedBySliceUnambiguousJSON, + }) + if err != nil { + fmt.Println(err) + } + // send in a legit json string + t, _, err := codec.NativeFromTextual([]byte("{\"Person\": {\"name\": \"John Doe\"}}")) + if err != nil { + fmt.Println(err) + } + // see it parse into a map like the avro encoder does + o, ok := t.(map[string]interface{}) + if !ok { + fmt.Printf("its a %T not a map[string]interface{}", t) + } + // pull out the Person to show its all good + v := o["Person"] + fmt.Println(v) + // Output: map[name:John Doe] +} + func ExampleCodec_NativeFromTextual_json() { codec, err := NewCodecFrom(`["null","string","int"]`, &codecBuilder{ 
buildCodecForTypeDescribedByMap, From 2b01a7d3bfb0e711f93b9b0359d9a782f7dcb308 Mon Sep 17 00:00:00 2001 From: marloploemen Date: Fri, 15 Mar 2024 17:27:27 +0100 Subject: [PATCH 2/5] update doc --- codec.go | 31 ++++++++++++++++++++++++------- 1 file changed, 24 insertions(+), 7 deletions(-) diff --git a/codec.go b/codec.go index c1a1d02..2f2153a 100644 --- a/codec.go +++ b/codec.go @@ -199,17 +199,34 @@ func NewCodecForStandardJSONFull(schemaSpecification string) (*Codec, error) { } // NewCodecForUnambiguousJSON provides full serialization/deserialization -// for json that meets the expectations of regular internet json, viewed as -// something distinct from avro-json which has special handling for union -// types. For details see the above comments. +// for json that is unambiguous in terms of what the field will contain. +// This means that avro Union types containing only a single concrete type +// e.g. ["null", "string"] no longer have to specify their type. Unlike +// NewCodecForStandardJSONFull, ambiguous types ["int", "string"] do still +// need to specify their type as map. See the following examples: // -// With this `codec` you can expect to see a json string like this: +// ["null", "string"] => "some string" || null +// ["int", "string"] => {"int": 1} || {"string": "some string"} +// ["null", "int", "string"] => null || {"int": 1} || {"string": "some string"} // -// "Follow your bliss." +// this is especially useful when using json.Marshal with structs containing +// optional types: // -// to deserialize into the same json structure +// type Person struct { +// Name *string `json:"name,omitempty"` +// } // -// "Follow your bliss." 
+// or using json.Marshal with structs containing a union: +// +// type Message struct { +// Direction DirectionUnion `json:"DirectionUnion" +// } +// +// type DirectionUnion struct { // only one of the fields can be non-nil +// +// Request *string `json:"request,omitempty"` +// Response *string `json:"response,omitempty"` +// } func NewCodecForUnambiguousJSON(schemaSpecification string) (*Codec, error) { return NewCodecFrom(schemaSpecification, &codecBuilder{ buildCodecForTypeDescribedByMap, From 04f5ad4884c90abdff2052cedfa0fba184e26c07 Mon Sep 17 00:00:00 2001 From: marloploemen Date: Fri, 15 Mar 2024 17:29:11 +0100 Subject: [PATCH 3/5] remove whitespace --- codec.go | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/codec.go b/codec.go index 2f2153a..1e4e9e1 100644 --- a/codec.go +++ b/codec.go @@ -213,19 +213,19 @@ func NewCodecForStandardJSONFull(schemaSpecification string) (*Codec, error) { // optional types: // // type Person struct { -// Name *string `json:"name,omitempty"` +// Name *string `json:"name,omitempty"` // } // // or using json.Marshal with structs containing a union: // // type Message struct { -// Direction DirectionUnion `json:"DirectionUnion" +// Direction DirectionUnion `json:"DirectionUnion" // } // // type DirectionUnion struct { // only one of the fields can be non-nil // -// Request *string `json:"request,omitempty"` -// Response *string `json:"response,omitempty"` +// Request *string `json:"request,omitempty"` +// Response *string `json:"response,omitempty"` // } func NewCodecForUnambiguousJSON(schemaSpecification string) (*Codec, error) { return NewCodecFrom(schemaSpecification, &codecBuilder{ From 6e0862afb2ac0fd54a193ffae0111bf1fe2a14ce Mon Sep 17 00:00:00 2001 From: marloploemen Date: Mon, 25 Mar 2024 16:34:11 +0100 Subject: [PATCH 4/5] propagate buffer --- union.go | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/union.go b/union.go index 5954db3..3ddb064 100644 --- a/union.go +++ 
b/union.go @@ -479,11 +479,11 @@ func nativeAvroFromTextualJSON(cr *codecInfo) func(buf []byte) (interface{}, []b } c := cr.codecFromIndex[index] - rv, rb, err := c.NativeFromTextual(value) + rv, _, err := c.NativeFromTextual(value) if err != nil { return nil, buf, fmt.Errorf("could not decode json data in input: %v: %v", string(buf), err) } - return map[string]interface{}{name: rv}, rb, nil + return map[string]interface{}{name: rv}, buf[dec.InputOffset():], nil } // try to decode it as a map @@ -503,11 +503,11 @@ func nativeAvroFromTextualJSON(cr *codecInfo) func(buf []byte) (interface{}, []b } c := cr.codecFromIndex[index] - rv, rb, err := c.NativeFromTextual(buf) + rv, _, err := c.NativeFromTextual(buf) if err != nil { return nil, buf, fmt.Errorf("could not decode json data in input: %v: %v", string(buf), err) } - return rv, rb, nil + return rv, buf[dec.InputOffset():], nil } } From afcf12bb06343be352361d78f1d301e4e5d672ce Mon Sep 17 00:00:00 2001 From: marloploemen Date: Tue, 26 Mar 2024 09:55:40 +0100 Subject: [PATCH 5/5] support binary from native --- union.go | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/union.go b/union.go index 3ddb064..481b4db 100644 --- a/union.go +++ b/union.go @@ -43,6 +43,17 @@ func (cr codecInfo) numConcreteTypes() int { return numConcreteTypes } +// firstConcreteTypeCodec returns the first non-null codec +func (cr codecInfo) firstConcreteTypeCodec() *Codec { + for k, v := range cr.codecFromName { + if k == "null" { + continue + } + return v + } + return nil +} + // Union wraps a datum value in a map for encoding as a Union, as required by // Union encoder. 
// @@ -141,6 +152,13 @@ func unionBinaryFromNative(cr *codecInfo) func(buf []byte, datum interface{}) ([ } return longBinaryFromNative(buf, index) case map[string]interface{}: + if cr.unambiguousMode && cr.isNullable() && cr.numConcreteTypes() == 1 { + c := cr.firstConcreteTypeCodec() + index := cr.indexFromName[c.typeName.fullName] + buf, _ = longBinaryFromNative(buf, index) + return c.binaryFromNative(buf, datum) + } + if len(v) != 1 { return nil, fmt.Errorf("cannot encode binary union: non-nil Union values ought to be specified with Go map[string]interface{}, with single key equal to type name, and value equal to datum value: %v; received: %T", cr.allowedTypes, datum) } @@ -155,6 +173,14 @@ func unionBinaryFromNative(cr *codecInfo) func(buf []byte, datum interface{}) ([ return c.binaryFromNative(buf, value) } } + + if cr.unambiguousMode && cr.isNullable() && cr.numConcreteTypes() == 1 { + c := cr.firstConcreteTypeCodec() + index := cr.indexFromName[c.typeName.fullName] + buf, _ = longBinaryFromNative(buf, index) + return c.binaryFromNative(buf, datum) + } + return nil, fmt.Errorf("cannot encode binary union: non-nil Union values ought to be specified with Go map[string]interface{}, with single key equal to type name, and value equal to datum value: %v; received: %T", cr.allowedTypes, datum) } }