Skip to content

Commit ca30ee3

Browse files
committed
fix(expr): correct CanContainNulls/CanContainNaNs when field missing from stats map
Use Schema::FindFieldById (recursive lookup) instead of StructType::GetFieldById (top-level only). Return Result<bool> to propagate errors. Restore comment explaining null_value_counts semantics. Add regression tests in a standalone test suite.
1 parent 8601f16 commit ca30ee3

2 files changed

Lines changed: 109 additions & 12 deletions

File tree

src/iceberg/expression/strict_metrics_evaluator.cc

Lines changed: 46 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -142,7 +142,9 @@ class StrictMetricsVisitor : public BoundVisitor<bool> {
142142
return kRowsMightNotMatch;
143143
}
144144

145-
if (CanContainNulls(id) || CanContainNaNs(id)) {
145+
ICEBERG_ASSIGN_OR_RAISE(auto has_nulls, CanContainNulls(id));
146+
ICEBERG_ASSIGN_OR_RAISE(auto has_nans, CanContainNaNs(id));
147+
if (has_nulls || has_nans) {
146148
return kRowsMightNotMatch;
147149
}
148150

@@ -168,7 +170,9 @@ class StrictMetricsVisitor : public BoundVisitor<bool> {
168170
return kRowsMightNotMatch;
169171
}
170172

171-
if (CanContainNulls(id) || CanContainNaNs(id)) {
173+
ICEBERG_ASSIGN_OR_RAISE(auto has_nulls, CanContainNulls(id));
174+
ICEBERG_ASSIGN_OR_RAISE(auto has_nans, CanContainNaNs(id));
175+
if (has_nulls || has_nans) {
172176
return kRowsMightNotMatch;
173177
}
174178

@@ -194,7 +198,9 @@ class StrictMetricsVisitor : public BoundVisitor<bool> {
194198
return kRowsMightNotMatch;
195199
}
196200

197-
if (CanContainNulls(id) || CanContainNaNs(id)) {
201+
ICEBERG_ASSIGN_OR_RAISE(auto has_nulls, CanContainNulls(id));
202+
ICEBERG_ASSIGN_OR_RAISE(auto has_nans, CanContainNaNs(id));
203+
if (has_nulls || has_nans) {
198204
return kRowsMightNotMatch;
199205
}
200206

@@ -226,7 +232,9 @@ class StrictMetricsVisitor : public BoundVisitor<bool> {
226232
return kRowsMightNotMatch;
227233
}
228234

229-
if (CanContainNulls(id) || CanContainNaNs(id)) {
235+
ICEBERG_ASSIGN_OR_RAISE(auto has_nulls, CanContainNulls(id));
236+
ICEBERG_ASSIGN_OR_RAISE(auto has_nans, CanContainNaNs(id));
237+
if (has_nulls || has_nans) {
230238
return kRowsMightNotMatch;
231239
}
232240

@@ -258,7 +266,9 @@ class StrictMetricsVisitor : public BoundVisitor<bool> {
258266
return kRowsMightNotMatch;
259267
}
260268

261-
if (CanContainNulls(id) || CanContainNaNs(id)) {
269+
ICEBERG_ASSIGN_OR_RAISE(auto has_nulls, CanContainNulls(id));
270+
ICEBERG_ASSIGN_OR_RAISE(auto has_nans, CanContainNaNs(id));
271+
if (has_nulls || has_nans) {
262272
return kRowsMightNotMatch;
263273
}
264274
auto lower_it = data_file_.lower_bounds.find(id);
@@ -330,7 +340,9 @@ class StrictMetricsVisitor : public BoundVisitor<bool> {
330340
return kRowsMightNotMatch;
331341
}
332342

333-
if (CanContainNulls(id) || CanContainNaNs(id)) {
343+
ICEBERG_ASSIGN_OR_RAISE(auto has_nulls, CanContainNulls(id));
344+
ICEBERG_ASSIGN_OR_RAISE(auto has_nans, CanContainNaNs(id));
345+
if (has_nulls || has_nans) {
334346
return kRowsMightNotMatch;
335347
}
336348
auto lower_it = data_file_.lower_bounds.find(id);
@@ -435,19 +447,41 @@ class StrictMetricsVisitor : public BoundVisitor<bool> {
435447
return Literal::Deserialize(stats, primitive_type);
436448
}
437449

438-
bool CanContainNulls(int32_t id) {
450+
Result<bool> CanContainNulls(int32_t id) {
451+
ICEBERG_ASSIGN_OR_RAISE(auto field_opt, schema_.FindFieldById(id));
452+
if (field_opt.has_value() && !field_opt->get().optional()) {
453+
return false;
454+
}
455+
456+
// When null_value_counts is not empty, the evaluator expects all fields to be
457+
// present. A missing entry indicates that the field's null count is unknown.
439458
if (data_file_.null_value_counts.empty()) {
440459
return true;
441460
}
442461
auto it = data_file_.null_value_counts.find(id);
443-
return it != data_file_.null_value_counts.cend() && it->second > 0;
462+
if (it == data_file_.null_value_counts.cend()) {
463+
return true;
464+
}
465+
return it->second > 0;
444466
}
445467

446-
bool CanContainNaNs(int32_t id) {
447-
// nan counts might be null for early version writers when nan counters are not
448-
// populated.
468+
Result<bool> CanContainNaNs(int32_t id) {
469+
ICEBERG_ASSIGN_OR_RAISE(auto field_opt, schema_.FindFieldById(id));
470+
if (field_opt.has_value()) {
471+
auto type_id = field_opt->get().type()->type_id();
472+
if (type_id != TypeId::kFloat && type_id != TypeId::kDouble) {
473+
return false;
474+
}
475+
}
476+
477+
if (data_file_.nan_value_counts.empty()) {
478+
return true;
479+
}
449480
auto it = data_file_.nan_value_counts.find(id);
450-
return it != data_file_.nan_value_counts.cend() && it->second > 0;
481+
if (it == data_file_.nan_value_counts.cend()) {
482+
return true;
483+
}
484+
return it->second > 0;
451485
}
452486

453487
bool ContainsNullsOnly(int32_t id) {

src/iceberg/test/strict_metrics_evaluator_test.cc

Lines changed: 63 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -846,4 +846,67 @@ TEST_F(StrictMetricsEvaluatorMigratedTest, EvaluateOnNestedColumnWithStats) {
846846
ExpectShouldRead(Expressions::NotNull("struct.nested_col_with_stats"), false);
847847
}
848848

849+
TEST(StrictMetricsEvaluatorRegressionTest, MissingNullCountForField) {
850+
// Field 14 (no_nan_stats, float64, optional) has bounds and value_counts but is
851+
// missing from null_value_counts. The evaluator must conservatively assume nulls
852+
// may exist and return kRowsMightNotMatch for comparison operators.
853+
auto schema = std::make_shared<Schema>(
854+
std::vector<SchemaField>{
855+
SchemaField::MakeOptional(14, "no_nan_stats", float64()),
856+
},
857+
/*schema_id=*/0);
858+
859+
auto data_file = std::make_shared<DataFile>();
860+
data_file->file_path = "null_test.parquet";
861+
data_file->file_format = FileFormatType::kParquet;
862+
data_file->record_count = 50;
863+
data_file->value_counts = {{14, 50L}};
864+
data_file->null_value_counts = {{4, 0L}, {5, 0L}};
865+
data_file->nan_value_counts = {{14, 0L}};
866+
data_file->lower_bounds = {{14, Literal::Double(1.0).Serialize().value()}};
867+
data_file->upper_bounds = {{14, Literal::Double(100.0).Serialize().value()}};
868+
869+
auto evaluate = [&](const std::shared_ptr<Expression>& expr) {
870+
ICEBERG_UNWRAP_OR_FAIL(auto eval, StrictMetricsEvaluator::Make(expr, schema, true));
871+
ICEBERG_UNWRAP_OR_FAIL(auto result, eval->Evaluate(*data_file));
872+
EXPECT_EQ(result, kRowsMightNotMatch) << expr->ToString();
873+
};
874+
875+
evaluate(Expressions::LessThan("no_nan_stats", Literal::Double(200.0)));
876+
evaluate(Expressions::LessThanOrEqual("no_nan_stats", Literal::Double(200.0)));
877+
evaluate(Expressions::GreaterThan("no_nan_stats", Literal::Double(-1.0)));
878+
evaluate(Expressions::GreaterThanOrEqual("no_nan_stats", Literal::Double(-1.0)));
879+
evaluate(Expressions::Equal("no_nan_stats", Literal::Double(50.0)));
880+
}
881+
882+
TEST(StrictMetricsEvaluatorRegressionTest, MissingNanCountForField) {
883+
// Field 14 (no_nan_stats, float64, optional) is missing from nan_value_counts.
884+
// For a floating-point field, the evaluator must conservatively assume NaNs may
885+
// exist and return kRowsMightNotMatch for comparison operators.
886+
auto schema = std::make_shared<Schema>(
887+
std::vector<SchemaField>{
888+
SchemaField::MakeOptional(14, "no_nan_stats", float64()),
889+
},
890+
/*schema_id=*/0);
891+
892+
auto data_file = std::make_shared<DataFile>();
893+
data_file->file_path = "nan_test.parquet";
894+
data_file->file_format = FileFormatType::kParquet;
895+
data_file->record_count = 50;
896+
data_file->value_counts = {{14, 50L}};
897+
data_file->null_value_counts = {{14, 0L}};
898+
data_file->nan_value_counts = {{8, 0L}};
899+
data_file->lower_bounds = {{14, Literal::Double(1.0).Serialize().value()}};
900+
data_file->upper_bounds = {{14, Literal::Double(100.0).Serialize().value()}};
901+
902+
auto evaluate = [&](const std::shared_ptr<Expression>& expr) {
903+
ICEBERG_UNWRAP_OR_FAIL(auto eval, StrictMetricsEvaluator::Make(expr, schema, true));
904+
ICEBERG_UNWRAP_OR_FAIL(auto result, eval->Evaluate(*data_file));
905+
EXPECT_EQ(result, kRowsMightNotMatch) << expr->ToString();
906+
};
907+
908+
evaluate(Expressions::LessThan("no_nan_stats", Literal::Double(200.0)));
909+
evaluate(Expressions::GreaterThan("no_nan_stats", Literal::Double(-1.0)));
910+
}
911+
849912
} // namespace iceberg

0 commit comments

Comments
 (0)