diff --git a/schema.go b/schema.go index 542031cef..0669e1754 100644 --- a/schema.go +++ b/schema.go @@ -278,6 +278,10 @@ func (s *Schema) UnmarshalJSON(b []byte) error { return err } + if err := checkDuplicateFieldIDs(nil, aux.Fields); err != nil { + return err + } + s.init() s.fields = aux.Fields diff --git a/schema_test.go b/schema_test.go index 846ea9b8e..a4dd4a29f 100644 --- a/schema_test.go +++ b/schema_test.go @@ -442,6 +442,306 @@ func TestUnmarshalSchema(t *testing.T) { assert.True(t, tableSchemaSimple.Equals(&schema)) } +func TestUnmarshalSchemaRejectsDuplicateFieldIDs(t *testing.T) { + tests := []struct { + name string + schema string + contains string + }{ + { + name: "top level", + schema: `{ + "type": "struct", + "fields": [ + {"id": 1, "name": "foo", "type": "string", "required": false}, + {"id": 1, "name": "bar", "type": "int", "required": true} + ], + "schema-id": 1, + "identifier-field-ids": [] + }`, + contains: "multiple fields for id 1: foo and bar", + }, + { + name: "nested struct", + schema: `{ + "type": "struct", + "fields": [ + {"id": 1, "name": "id", "type": "long", "required": true}, + { + "id": 2, + "name": "payload", + "type": { + "type": "struct", + "fields": [ + {"id": 3, "name": "inner", "type": "string", "required": false}, + {"id": 3, "name": "inner_dup", "type": "int", "required": false} + ] + }, + "required": false + } + ], + "schema-id": 1, + "identifier-field-ids": [] + }`, + contains: "multiple fields for id 3: inner and inner_dup", + }, + { + name: "list element", + schema: `{ + "type": "struct", + "fields": [ + {"id": 1, "name": "id", "type": "long", "required": true}, + { + "id": 2, + "name": "items", + "type": { + "type": "list", + "element-id": 1, + "element": "int", + "element-required": false + }, + "required": false + } + ], + "schema-id": 1, + "identifier-field-ids": [] + }`, + contains: "multiple fields for id 1: id and element", + }, + { + name: "map key", + schema: `{ + "type": "struct", + "fields": [ + {"id": 1, "name": "id", "type": "long", "required": true}, + { + "id": 2, + "name": "props", + "type": { + "type": "map", + "key-id": 1, + "key": "string", + "value-id": 3, + "value": "int", + "value-required": false + }, + "required": false + } + ], + "schema-id": 1, + "identifier-field-ids": [] + }`, + contains: "multiple fields for id 1: id and key", + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + var schema iceberg.Schema + err := json.Unmarshal([]byte(tt.schema), &schema) + require.ErrorIs(t, err, iceberg.ErrInvalidSchema) + assert.ErrorContains(t, err, tt.contains) + }) + } +} + +func TestUnmarshalSchemaRejectsMissingCollectionFieldIDs(t *testing.T) { + tests := []struct { + name string + schema string + contains string + }{ + { + name: "missing list element id", + schema: `{ + "type": "struct", + "fields": [ + { + "id": 1, + "name": "items", + "type": { + "type": "list", + "element": "int", + "element-required": false + }, + "required": false + } + ], + "schema-id": 1, + "identifier-field-ids": [] + }`, + contains: "field is missing required 'element-id' key in JSON", + }, + { + name: "missing map key id", + schema: `{ + "type": "struct", + "fields": [ + { + "id": 1, + "name": "props", + "type": { + "type": "map", + "key": "string", + "value-id": 2, + "value": "int", + "value-required": false + }, + "required": false + } + ], + "schema-id": 1, + "identifier-field-ids": [] + }`, + contains: "field is missing required 'key-id' key in JSON", + }, + { + name: "missing map value id", + schema: `{ + "type": "struct", + "fields": [ + { + "id": 1, + "name": "props", + "type": { + "type": "map", + "key-id": 2, + "key": "string", + "value": "int", + "value-required": false + }, + "required": false + } + ], + "schema-id": 1, + "identifier-field-ids": [] + }`, + contains: "field is missing required 'value-id' key in JSON", + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + var schema iceberg.Schema + err := json.Unmarshal([]byte(tt.schema), &schema) + require.ErrorIs(t, err, iceberg.ErrInvalidSchema) + assert.ErrorContains(t, err, tt.contains) + }) + } +} + +func TestUnmarshalSchemaAllowsZeroCollectionFieldIDs(t *testing.T) { + tests := []struct { + name string + schema string + check func(*testing.T, iceberg.NestedField) + }{ + { + name: "zero list element id", + schema: `{ + "type": "struct", + "fields": [ + { + "id": 1, + "name": "items", + "type": { + "type": "list", + "element-id": 0, + "element": "int", + "element-required": false + }, + "required": false + } + ], + "schema-id": 1, + "identifier-field-ids": [] + }`, + check: func(t *testing.T, field iceberg.NestedField) { + t.Helper() + listType, ok := field.Type.(*iceberg.ListType) + require.True(t, ok) + assert.Equal(t, 0, listType.ElementID) + }, + }, + { + name: "zero map key id", + schema: `{ + "type": "struct", + "fields": [ + { + "id": 1, + "name": "props", + "type": { + "type": "map", + "key-id": 0, + "key": "string", + "value-id": 2, + "value": "int", + "value-required": false + }, + "required": false + } + ], + "schema-id": 1, + "identifier-field-ids": [] + }`, + check: func(t *testing.T, field iceberg.NestedField) { + t.Helper() + mapType, ok := field.Type.(*iceberg.MapType) + require.True(t, ok) + assert.Equal(t, 0, mapType.KeyID) + }, + }, + { + name: "zero map value id", + schema: `{ + "type": "struct", + "fields": [ + { + "id": 1, + "name": "props", + "type": { + "type": "map", + "key-id": 2, + "key": "string", + "value-id": 0, + "value": "int", + "value-required": false + }, + "required": false + } + ], + "schema-id": 1, + "identifier-field-ids": [] + }`, + check: func(t *testing.T, field iceberg.NestedField) { + t.Helper() + mapType, ok := field.Type.(*iceberg.MapType) + require.True(t, ok) + assert.Equal(t, 0, mapType.ValueID) + }, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + var schema iceberg.Schema + require.NoError(t, json.Unmarshal([]byte(tt.schema), &schema)) + require.Equal(t, 1, schema.NumFields()) + tt.check(t, schema.Field(0)) + }) + } +} + +func TestUnmarshalSchemaRoundTripsCollectionFieldIDs(t *testing.T) { + data, err := json.Marshal(tableSchemaNested) + require.NoError(t, err) + + var schema iceberg.Schema + require.NoError(t, json.Unmarshal(data, &schema)) + assert.True(t, tableSchemaNested.Equals(&schema)) +} + func TestPruneColumnsString(t *testing.T) { sc, err := iceberg.PruneColumns(tableSchemaNested, map[int]iceberg.Void{1: {}}, false) require.NoError(t, err) diff --git a/types.go b/types.go index 282c7d2bc..351f0808c 100644 --- a/types.go +++ b/types.go @@ -444,7 +444,7 @@ func (l *ListType) String() string { return fmt.Sprintf("list<%s>", l.Element) } func (l *ListType) UnmarshalJSON(b []byte) error { aux := struct { - ID int `json:"element-id"` + ID *int `json:"element-id"` Elem typeIFace `json:"element"` Req bool `json:"element-required"` }{} @@ -452,7 +452,11 @@ func (l *ListType) UnmarshalJSON(b []byte) error { return err } - l.ElementID = aux.ID + if aux.ID == nil { + return fmt.Errorf("%w: field is missing required 'element-id' key in JSON", ErrInvalidSchema) + } + + l.ElementID = *aux.ID l.Element = aux.Elem.Type l.ElementRequired = aux.Req @@ -524,9 +528,9 @@ func (m *MapType) String() string { func (m *MapType) UnmarshalJSON(b []byte) error { aux := struct { - KeyID int `json:"key-id"` + KeyID *int `json:"key-id"` Key typeIFace `json:"key"` - ValueID int `json:"value-id"` + ValueID *int `json:"value-id"` Value typeIFace `json:"value"` ValueReq *bool `json:"value-required"` }{} @@ -534,8 +538,16 @@ func (m *MapType) UnmarshalJSON(b []byte) error { return err } - m.KeyID, m.KeyType = aux.KeyID, aux.Key.Type - m.ValueID, m.ValueType = aux.ValueID, aux.Value.Type + if aux.KeyID == nil { + return fmt.Errorf("%w: field is missing required 'key-id' key in JSON", ErrInvalidSchema) + } + + if aux.ValueID == nil { + return fmt.Errorf("%w: field is missing required 'value-id' key in JSON", ErrInvalidSchema) + } + + m.KeyID, m.KeyType = *aux.KeyID, aux.Key.Type + m.ValueID, m.ValueType = *aux.ValueID, aux.Value.Type if aux.ValueReq == nil { m.ValueRequired = true } else {