-
Notifications
You must be signed in to change notification settings - Fork 214
fix: validate schema field IDs during unmarshal #1315
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -442,6 +442,306 @@ func TestUnmarshalSchema(t *testing.T) { | |
| assert.True(t, tableSchemaSimple.Equals(&schema)) | ||
| } | ||
|
|
||
| func TestUnmarshalSchemaRejectsDuplicateFieldIDs(t *testing.T) { | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Nice — the top-level and nested-struct cases are the important ones. Since checkDuplicateFieldIDs also walks list element IDs and map key/value IDs (schema.go:118-130), would you consider adding a list or map dup-ID case too? Those branches are only exercised indirectly right now, and they're the easiest to silently break later. |
||
| tests := []struct { | ||
| name string | ||
| schema string | ||
| contains string | ||
| }{ | ||
| { | ||
| name: "top level", | ||
| schema: `{ | ||
| "type": "struct", | ||
| "fields": [ | ||
| {"id": 1, "name": "foo", "type": "string", "required": false}, | ||
| {"id": 1, "name": "bar", "type": "int", "required": true} | ||
| ], | ||
| "schema-id": 1, | ||
| "identifier-field-ids": [] | ||
| }`, | ||
| contains: "multiple fields for id 1: foo and bar", | ||
| }, | ||
| { | ||
| name: "nested struct", | ||
| schema: `{ | ||
| "type": "struct", | ||
| "fields": [ | ||
| {"id": 1, "name": "id", "type": "long", "required": true}, | ||
| { | ||
| "id": 2, | ||
| "name": "payload", | ||
| "type": { | ||
| "type": "struct", | ||
| "fields": [ | ||
| {"id": 3, "name": "inner", "type": "string", "required": false}, | ||
| {"id": 3, "name": "inner_dup", "type": "int", "required": false} | ||
| ] | ||
| }, | ||
| "required": false | ||
| } | ||
| ], | ||
| "schema-id": 1, | ||
| "identifier-field-ids": [] | ||
| }`, | ||
| contains: "multiple fields for id 3: inner and inner_dup", | ||
| }, | ||
| { | ||
| name: "list element", | ||
| schema: `{ | ||
| "type": "struct", | ||
| "fields": [ | ||
| {"id": 1, "name": "id", "type": "long", "required": true}, | ||
| { | ||
| "id": 2, | ||
| "name": "items", | ||
| "type": { | ||
| "type": "list", | ||
| "element-id": 1, | ||
| "element": "int", | ||
| "element-required": false | ||
| }, | ||
| "required": false | ||
| } | ||
| ], | ||
| "schema-id": 1, | ||
| "identifier-field-ids": [] | ||
| }`, | ||
| contains: "multiple fields for id 1: id and element", | ||
| }, | ||
| { | ||
| name: "map key", | ||
| schema: `{ | ||
| "type": "struct", | ||
| "fields": [ | ||
| {"id": 1, "name": "id", "type": "long", "required": true}, | ||
| { | ||
| "id": 2, | ||
| "name": "props", | ||
| "type": { | ||
| "type": "map", | ||
| "key-id": 1, | ||
| "key": "string", | ||
| "value-id": 3, | ||
| "value": "int", | ||
| "value-required": false | ||
| }, | ||
| "required": false | ||
| } | ||
| ], | ||
| "schema-id": 1, | ||
| "identifier-field-ids": [] | ||
| }`, | ||
| contains: "multiple fields for id 1: id and key", | ||
| }, | ||
| } | ||
|
|
||
| for _, tt := range tests { | ||
| t.Run(tt.name, func(t *testing.T) { | ||
| var schema iceberg.Schema | ||
| err := json.Unmarshal([]byte(tt.schema), &schema) | ||
| require.ErrorIs(t, err, iceberg.ErrInvalidSchema) | ||
| assert.ErrorContains(t, err, tt.contains) | ||
| }) | ||
| } | ||
| } | ||
|
|
||
| func TestUnmarshalSchemaRejectsMissingCollectionFieldIDs(t *testing.T) { | ||
| tests := []struct { | ||
| name string | ||
| schema string | ||
| contains string | ||
| }{ | ||
| { | ||
| name: "missing list element id", | ||
| schema: `{ | ||
| "type": "struct", | ||
| "fields": [ | ||
| { | ||
| "id": 1, | ||
| "name": "items", | ||
| "type": { | ||
| "type": "list", | ||
| "element": "int", | ||
| "element-required": false | ||
| }, | ||
| "required": false | ||
| } | ||
| ], | ||
| "schema-id": 1, | ||
| "identifier-field-ids": [] | ||
| }`, | ||
| contains: "field is missing required 'element-id' key in JSON", | ||
| }, | ||
| { | ||
| name: "missing map key id", | ||
| schema: `{ | ||
| "type": "struct", | ||
| "fields": [ | ||
| { | ||
| "id": 1, | ||
| "name": "props", | ||
| "type": { | ||
| "type": "map", | ||
| "key": "string", | ||
| "value-id": 2, | ||
| "value": "int", | ||
| "value-required": false | ||
| }, | ||
| "required": false | ||
| } | ||
| ], | ||
| "schema-id": 1, | ||
| "identifier-field-ids": [] | ||
| }`, | ||
| contains: "field is missing required 'key-id' key in JSON", | ||
| }, | ||
| { | ||
| name: "missing map value id", | ||
| schema: `{ | ||
| "type": "struct", | ||
| "fields": [ | ||
| { | ||
| "id": 1, | ||
| "name": "props", | ||
| "type": { | ||
| "type": "map", | ||
| "key-id": 2, | ||
| "key": "string", | ||
| "value": "int", | ||
| "value-required": false | ||
| }, | ||
| "required": false | ||
| } | ||
| ], | ||
| "schema-id": 1, | ||
| "identifier-field-ids": [] | ||
| }`, | ||
| contains: "field is missing required 'value-id' key in JSON", | ||
| }, | ||
| } | ||
|
|
||
| for _, tt := range tests { | ||
| t.Run(tt.name, func(t *testing.T) { | ||
| var schema iceberg.Schema | ||
| err := json.Unmarshal([]byte(tt.schema), &schema) | ||
| require.ErrorIs(t, err, iceberg.ErrInvalidSchema) | ||
| assert.ErrorContains(t, err, tt.contains) | ||
| }) | ||
| } | ||
| } | ||
|
|
||
| func TestUnmarshalSchemaAllowsZeroCollectionFieldIDs(t *testing.T) { | ||
| tests := []struct { | ||
| name string | ||
| schema string | ||
| check func(*testing.T, iceberg.NestedField) | ||
| }{ | ||
| { | ||
| name: "zero list element id", | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Once the |
||
| schema: `{ | ||
| "type": "struct", | ||
| "fields": [ | ||
| { | ||
| "id": 1, | ||
| "name": "items", | ||
| "type": { | ||
| "type": "list", | ||
| "element-id": 0, | ||
| "element": "int", | ||
| "element-required": false | ||
| }, | ||
| "required": false | ||
| } | ||
| ], | ||
| "schema-id": 1, | ||
| "identifier-field-ids": [] | ||
| }`, | ||
| check: func(t *testing.T, field iceberg.NestedField) { | ||
| t.Helper() | ||
| listType, ok := field.Type.(*iceberg.ListType) | ||
| require.True(t, ok) | ||
| assert.Equal(t, 0, listType.ElementID) | ||
| }, | ||
| }, | ||
| { | ||
| name: "zero map key id", | ||
| schema: `{ | ||
| "type": "struct", | ||
| "fields": [ | ||
| { | ||
| "id": 1, | ||
| "name": "props", | ||
| "type": { | ||
| "type": "map", | ||
| "key-id": 0, | ||
| "key": "string", | ||
| "value-id": 2, | ||
| "value": "int", | ||
| "value-required": false | ||
| }, | ||
| "required": false | ||
| } | ||
| ], | ||
| "schema-id": 1, | ||
| "identifier-field-ids": [] | ||
| }`, | ||
| check: func(t *testing.T, field iceberg.NestedField) { | ||
| t.Helper() | ||
| mapType, ok := field.Type.(*iceberg.MapType) | ||
| require.True(t, ok) | ||
| assert.Equal(t, 0, mapType.KeyID) | ||
| }, | ||
| }, | ||
| { | ||
| name: "zero map value id", | ||
| schema: `{ | ||
| "type": "struct", | ||
| "fields": [ | ||
| { | ||
| "id": 1, | ||
| "name": "props", | ||
| "type": { | ||
| "type": "map", | ||
| "key-id": 2, | ||
| "key": "string", | ||
| "value-id": 0, | ||
| "value": "int", | ||
| "value-required": false | ||
| }, | ||
| "required": false | ||
| } | ||
| ], | ||
| "schema-id": 1, | ||
| "identifier-field-ids": [] | ||
| }`, | ||
| check: func(t *testing.T, field iceberg.NestedField) { | ||
| t.Helper() | ||
| mapType, ok := field.Type.(*iceberg.MapType) | ||
| require.True(t, ok) | ||
| assert.Equal(t, 0, mapType.ValueID) | ||
| }, | ||
| }, | ||
| } | ||
|
|
||
| for _, tt := range tests { | ||
| t.Run(tt.name, func(t *testing.T) { | ||
| var schema iceberg.Schema | ||
| require.NoError(t, json.Unmarshal([]byte(tt.schema), &schema)) | ||
| require.Equal(t, 1, schema.NumFields()) | ||
| tt.check(t, schema.Field(0)) | ||
| }) | ||
| } | ||
| } | ||
|
|
||
| func TestUnmarshalSchemaRoundTripsCollectionFieldIDs(t *testing.T) { | ||
| data, err := json.Marshal(tableSchemaNested) | ||
| require.NoError(t, err) | ||
|
|
||
| var schema iceberg.Schema | ||
| require.NoError(t, json.Unmarshal(data, &schema)) | ||
| assert.True(t, tableSchemaNested.Equals(&schema)) | ||
| } | ||
|
|
||
| func TestPruneColumnsString(t *testing.T) { | ||
| sc, err := iceberg.PruneColumns(tableSchemaNested, map[int]iceberg.Void{1: {}}, false) | ||
| require.NoError(t, err) | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This catches duplicate positive IDs, but missing list/map IDs still slip through.
checkDuplicateFieldIDsonly recordsf.ID > 0(schema.go:101), and an omitted listelement-idor mapkey-id/value-iddecodes to0- so a schema that omits one of those still unmarshals cleanly (a lone0isn't a duplicate). If the goal is to validate field IDs on unmarshal, it'd be worth rejecting missing/zero element/key/value IDs too (e.g. decode them as*intand error when absent), while still allowing the negative placeholders used during construction. Is the broader validation in scope here, or is this intentionally limited to duplicates?