Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions schema.go
Original file line number Diff line number Diff line change
Expand Up @@ -278,6 +278,10 @@ func (s *Schema) UnmarshalJSON(b []byte) error {
return err
}

if err := checkDuplicateFieldIDs(nil, aux.Fields); err != nil {

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This catches duplicate positive IDs, but missing list/map IDs still slip through. checkDuplicateFieldIDs only records f.ID > 0 (schema.go:101), and an omitted list element-id or map key-id/value-id decodes to 0 - so a schema that omits one of those still unmarshals cleanly (a lone 0 isn't a duplicate). If the goal is to validate field IDs on unmarshal, it'd be worth rejecting missing/zero element/key/value IDs too (e.g. decode them as *int and error when absent), while still allowing the negative placeholders used during construction. Is the broader validation in scope here, or is this intentionally limited to duplicates?

return err
}

s.init()

s.fields = aux.Fields
Expand Down
300 changes: 300 additions & 0 deletions schema_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -442,6 +442,306 @@ func TestUnmarshalSchema(t *testing.T) {
assert.True(t, tableSchemaSimple.Equals(&schema))
}

func TestUnmarshalSchemaRejectsDuplicateFieldIDs(t *testing.T) {

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Nice — the top-level and nested-struct cases are the important ones. Since checkDuplicateFieldIDs also walks list element IDs and map key/value IDs (schema.go:118-130), would you consider adding a list or map dup-ID case too? Those branches are only exercised indirectly right now, and they're the easiest to silently break later.

tests := []struct {
name string
schema string
contains string
}{
{
name: "top level",
schema: `{
"type": "struct",
"fields": [
{"id": 1, "name": "foo", "type": "string", "required": false},
{"id": 1, "name": "bar", "type": "int", "required": true}
],
"schema-id": 1,
"identifier-field-ids": []
}`,
contains: "multiple fields for id 1: foo and bar",
},
{
name: "nested struct",
schema: `{
"type": "struct",
"fields": [
{"id": 1, "name": "id", "type": "long", "required": true},
{
"id": 2,
"name": "payload",
"type": {
"type": "struct",
"fields": [
{"id": 3, "name": "inner", "type": "string", "required": false},
{"id": 3, "name": "inner_dup", "type": "int", "required": false}
]
},
"required": false
}
],
"schema-id": 1,
"identifier-field-ids": []
}`,
contains: "multiple fields for id 3: inner and inner_dup",
},
{
name: "list element",
schema: `{
"type": "struct",
"fields": [
{"id": 1, "name": "id", "type": "long", "required": true},
{
"id": 2,
"name": "items",
"type": {
"type": "list",
"element-id": 1,
"element": "int",
"element-required": false
},
"required": false
}
],
"schema-id": 1,
"identifier-field-ids": []
}`,
contains: "multiple fields for id 1: id and element",
},
{
name: "map key",
schema: `{
"type": "struct",
"fields": [
{"id": 1, "name": "id", "type": "long", "required": true},
{
"id": 2,
"name": "props",
"type": {
"type": "map",
"key-id": 1,
"key": "string",
"value-id": 3,
"value": "int",
"value-required": false
},
"required": false
}
],
"schema-id": 1,
"identifier-field-ids": []
}`,
contains: "multiple fields for id 1: id and key",
},
}

for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
var schema iceberg.Schema
err := json.Unmarshal([]byte(tt.schema), &schema)
require.ErrorIs(t, err, iceberg.ErrInvalidSchema)
assert.ErrorContains(t, err, tt.contains)
})
}
}

func TestUnmarshalSchemaRejectsMissingCollectionFieldIDs(t *testing.T) {
tests := []struct {
name string
schema string
contains string
}{
{
name: "missing list element id",
schema: `{
"type": "struct",
"fields": [
{
"id": 1,
"name": "items",
"type": {
"type": "list",
"element": "int",
"element-required": false
},
"required": false
}
],
"schema-id": 1,
"identifier-field-ids": []
}`,
contains: "field is missing required 'element-id' key in JSON",
},
{
name: "missing map key id",
schema: `{
"type": "struct",
"fields": [
{
"id": 1,
"name": "props",
"type": {
"type": "map",
"key": "string",
"value-id": 2,
"value": "int",
"value-required": false
},
"required": false
}
],
"schema-id": 1,
"identifier-field-ids": []
}`,
contains: "field is missing required 'key-id' key in JSON",
},
{
name: "missing map value id",
schema: `{
"type": "struct",
"fields": [
{
"id": 1,
"name": "props",
"type": {
"type": "map",
"key-id": 2,
"key": "string",
"value": "int",
"value-required": false
},
"required": false
}
],
"schema-id": 1,
"identifier-field-ids": []
}`,
contains: "field is missing required 'value-id' key in JSON",
},
}

for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
var schema iceberg.Schema
err := json.Unmarshal([]byte(tt.schema), &schema)
require.ErrorIs(t, err, iceberg.ErrInvalidSchema)
assert.ErrorContains(t, err, tt.contains)
})
}
}

func TestUnmarshalSchemaAllowsZeroCollectionFieldIDs(t *testing.T) {
tests := []struct {
name string
schema string
check func(*testing.T, iceberg.NestedField)
}{
{
name: "zero list element id",

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Once the == 0 checks come out, this row and "zero map value id" assert the wrong outcome — they'd need to flip to require.NoError. While here: the table never exercises zero map key-id or missing map value-id (both are real branches), and there's no positive case proving a valid non-zero list/map schema still unmarshals after the *int change — tableSchemaNested goes through the constructor, not JSON decode, so a regression there wouldn't be caught. I'd add a round-trip case asserting require.NoError.

schema: `{
"type": "struct",
"fields": [
{
"id": 1,
"name": "items",
"type": {
"type": "list",
"element-id": 0,
"element": "int",
"element-required": false
},
"required": false
}
],
"schema-id": 1,
"identifier-field-ids": []
}`,
check: func(t *testing.T, field iceberg.NestedField) {
t.Helper()
listType, ok := field.Type.(*iceberg.ListType)
require.True(t, ok)
assert.Equal(t, 0, listType.ElementID)
},
},
{
name: "zero map key id",
schema: `{
"type": "struct",
"fields": [
{
"id": 1,
"name": "props",
"type": {
"type": "map",
"key-id": 0,
"key": "string",
"value-id": 2,
"value": "int",
"value-required": false
},
"required": false
}
],
"schema-id": 1,
"identifier-field-ids": []
}`,
check: func(t *testing.T, field iceberg.NestedField) {
t.Helper()
mapType, ok := field.Type.(*iceberg.MapType)
require.True(t, ok)
assert.Equal(t, 0, mapType.KeyID)
},
},
{
name: "zero map value id",
schema: `{
"type": "struct",
"fields": [
{
"id": 1,
"name": "props",
"type": {
"type": "map",
"key-id": 2,
"key": "string",
"value-id": 0,
"value": "int",
"value-required": false
},
"required": false
}
],
"schema-id": 1,
"identifier-field-ids": []
}`,
check: func(t *testing.T, field iceberg.NestedField) {
t.Helper()
mapType, ok := field.Type.(*iceberg.MapType)
require.True(t, ok)
assert.Equal(t, 0, mapType.ValueID)
},
},
}

for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
var schema iceberg.Schema
require.NoError(t, json.Unmarshal([]byte(tt.schema), &schema))
require.Equal(t, 1, schema.NumFields())
tt.check(t, schema.Field(0))
})
}
}

func TestUnmarshalSchemaRoundTripsCollectionFieldIDs(t *testing.T) {
data, err := json.Marshal(tableSchemaNested)
require.NoError(t, err)

var schema iceberg.Schema
require.NoError(t, json.Unmarshal(data, &schema))
assert.True(t, tableSchemaNested.Equals(&schema))
}

func TestPruneColumnsString(t *testing.T) {
sc, err := iceberg.PruneColumns(tableSchemaNested, map[int]iceberg.Void{1: {}}, false)
require.NoError(t, err)
Expand Down
24 changes: 18 additions & 6 deletions types.go
Original file line number Diff line number Diff line change
Expand Up @@ -444,15 +444,19 @@ func (l *ListType) String() string { return fmt.Sprintf("list<%s>", l.Element) }

func (l *ListType) UnmarshalJSON(b []byte) error {
aux := struct {
ID int `json:"element-id"`
ID *int `json:"element-id"`
Elem typeIFace `json:"element"`
Req bool `json:"element-required"`
}{}
if err := json.Unmarshal(b, &aux); err != nil {
return err
}

l.ElementID = aux.ID
if aux.ID == nil {
return fmt.Errorf("%w: field is missing required 'element-id' key in JSON", ErrInvalidSchema)
}

l.ElementID = *aux.ID
l.Element = aux.Elem.Type
l.ElementRequired = aux.Req

Expand Down Expand Up @@ -524,18 +528,26 @@ func (m *MapType) String() string {

func (m *MapType) UnmarshalJSON(b []byte) error {
aux := struct {
KeyID int `json:"key-id"`
KeyID *int `json:"key-id"`
Key typeIFace `json:"key"`
ValueID int `json:"value-id"`
ValueID *int `json:"value-id"`
Value typeIFace `json:"value"`
ValueReq *bool `json:"value-required"`
}{}
if err := json.Unmarshal(b, &aux); err != nil {
return err
}

m.KeyID, m.KeyType = aux.KeyID, aux.Key.Type
m.ValueID, m.ValueType = aux.ValueID, aux.Value.Type
if aux.KeyID == nil {
return fmt.Errorf("%w: field is missing required 'key-id' key in JSON", ErrInvalidSchema)
}

if aux.ValueID == nil {
return fmt.Errorf("%w: field is missing required 'value-id' key in JSON", ErrInvalidSchema)
}

m.KeyID, m.KeyType = *aux.KeyID, aux.Key.Type
m.ValueID, m.ValueType = *aux.ValueID, aux.Value.Type
if aux.ValueReq == nil {
m.ValueRequired = true
} else {
Expand Down
Loading