Skip to content

Commit a8c650d

Browse files
committed
fix truncate invalid max value && add comments for collect NaN
1 parent 01f825d commit a8c650d

5 files changed

Lines changed: 48 additions & 35 deletions

File tree

src/iceberg/parquet/parquet_metrics.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,7 @@ class ICEBERG_BUNDLE_EXPORT ParquetMetrics {
4343
///
4444
/// This function extracts metrics including row count, column sizes, value counts,
4545
/// null value counts, and lower/upper bounds from Parquet file metadata.
46+
/// NaN value counts are not currently collected from Parquet metadata.
4647
/// The metrics are computed according to the provided MetricsConfig, which determines
4748
/// which columns to collect metrics for and at what granularity (counts only, truncated
4849
/// bounds, or full bounds).

src/iceberg/parquet/parquet_writer.cc

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -164,6 +164,7 @@ class ParquetWriter::Impl {
164164
if (!metadata_) {
165165
return Metrics();
166166
}
167+
// TODO(WZhuo): collect write-side FieldMetrics to support NaN value counts.
167168
return ParquetMetrics::GetMetrics(*schema_, *parquet_schema_, *metrics_config_,
168169
*metadata_, {});
169170
}

src/iceberg/test/truncate_util_test.cc

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -79,8 +79,9 @@ TEST(TruncateUtilTest, TruncateBinaryMax) {
7979
EXPECT_EQ(result3, Literal::Binary(test3));
8080

8181
// Test3b: cannot truncate when first bytes are all 0xFF
82-
EXPECT_THAT(TruncateUtils::TruncateLiteralMax(Literal::Binary(test3), 2),
83-
IsError(ErrorKind::kInvalidArgument));
82+
ICEBERG_UNWRAP_OR_FAIL(auto result3b,
83+
TruncateUtils::TruncateLiteralMax(Literal::Binary(test3), 2));
84+
EXPECT_EQ(result3b, std::nullopt);
8485

8586
// Test4: truncate {1, 1, 0} to 2 bytes -> {1, 2}
8687
ICEBERG_UNWRAP_OR_FAIL(auto result4,
@@ -143,8 +144,9 @@ TEST(TruncateUtilTest, TruncateStringMax) {
143144

144145
// Test5: Max 4-byte UTF-8 characters "\uDBFF\uDFFF\uDBFF\uDFFF"
145146
std::string test5 = "\xF4\x8F\xBF\xBF\xF4\x8F\xBF\xBF"; // U+10FFFF U+10FFFF
146-
EXPECT_THAT(TruncateUtils::TruncateLiteralMax(Literal::String(test5), 1),
147-
IsError(ErrorKind::kInvalidArgument));
147+
ICEBERG_UNWRAP_OR_FAIL(auto result5_1,
148+
TruncateUtils::TruncateLiteralMax(Literal::String(test5), 1));
149+
EXPECT_EQ(result5_1, std::nullopt);
148150

149151
// Test6: 4-byte UTF-8 character "\uD800\uDFFF\uD800\uDFFF"
150152
std::string test6 = "\xF0\x90\x8F\xBF\xF0\x90\x8F\xBF"; // U+103FF U+103FF

src/iceberg/util/truncate_util.cc

Lines changed: 25 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -168,42 +168,46 @@ Literal TruncateLiteralImpl<TypeId::kBinary>(const Literal& literal, int32_t wid
168168
}
169169

170170
template <TypeId type_id>
171-
Result<Literal> TruncateLiteralMaxImpl(const Literal& literal, int32_t width) = delete;
171+
Result<std::optional<Literal>> TruncateLiteralMaxImpl(const Literal& literal,
172+
int32_t width) = delete;
172173

173174
template <>
174-
Result<Literal> TruncateLiteralMaxImpl<TypeId::kString>(const Literal& literal,
175-
int32_t width) {
175+
Result<std::optional<Literal>> TruncateLiteralMaxImpl<TypeId::kString>(
176+
const Literal& literal, int32_t width) {
176177
const auto& str = std::get<std::string>(literal.value());
177-
ICEBERG_ASSIGN_OR_RAISE(std::string truncated,
178-
TruncateUtils::TruncateUTF8Max(str, width));
179-
return Literal::String(std::move(truncated));
178+
ICEBERG_ASSIGN_OR_RAISE(auto truncated, TruncateUtils::TruncateUTF8Max(str, width));
179+
if (!truncated.has_value()) {
180+
return std::nullopt;
181+
}
182+
return std::optional<Literal>(Literal::String(std::move(truncated.value())));
180183
}
181184

182185
template <>
183-
Result<Literal> TruncateLiteralMaxImpl<TypeId::kBinary>(const Literal& literal,
184-
int32_t width) {
186+
Result<std::optional<Literal>> TruncateLiteralMaxImpl<TypeId::kBinary>(
187+
const Literal& literal, int32_t width) {
185188
const auto& data = std::get<std::vector<uint8_t>>(literal.value());
186189
if (static_cast<int32_t>(data.size()) <= width) {
187-
return literal;
190+
return std::optional<Literal>(literal);
188191
}
189192

190193
std::vector<uint8_t> truncated(data.begin(), data.begin() + width);
191194
for (auto it = truncated.rbegin(); it != truncated.rend(); ++it) {
192195
if (*it < 0xFF) {
193196
++(*it);
194197
truncated.resize(truncated.size() - std::distance(truncated.rbegin(), it));
195-
return Literal::Binary(std::move(truncated));
198+
return std::optional<Literal>(Literal::Binary(std::move(truncated)));
196199
}
197200
}
198-
return InvalidArgument("Cannot truncate upper bound for binary: all bytes are 0xFF");
201+
return std::nullopt;
199202
}
200203

201204
} // namespace
202205

203-
Result<std::string> TruncateUtils::TruncateUTF8Max(const std::string& source, size_t L) {
206+
Result<std::optional<std::string>> TruncateUtils::TruncateUTF8Max(
207+
const std::string& source, size_t L) {
204208
std::string truncated = TruncateUTF8(source, L);
205209
if (truncated == source) {
206-
return truncated;
210+
return std::optional<std::string>(std::move(truncated));
207211
}
208212

209213
// Try incrementing code points from the end
@@ -232,13 +236,12 @@ Result<std::string> TruncateUtils::TruncateUTF8Max(const std::string& source, si
232236
if (next_code_point <= kUtf8MaxCodePoint) {
233237
truncated.resize(cp_start);
234238
AppendUtf8CodePoint(next_code_point, truncated);
235-
return truncated;
239+
return std::optional<std::string>(std::move(truncated));
236240
}
237241
}
238242
last_cp_start = cp_start;
239243
}
240-
return InvalidArgument(
241-
"Cannot truncate upper bound for string: all code points are 0x10FFFF");
244+
return std::nullopt;
242245
}
243246

244247
Decimal TruncateUtils::TruncateDecimal(const Decimal& decimal, int32_t width) {
@@ -275,10 +278,11 @@ Result<Literal> TruncateUtils::TruncateLiteral(const Literal& literal, int32_t w
275278
case TYPE_ID: \
276279
return TruncateLiteralMaxImpl<TYPE_ID>(literal, width);
277280

278-
Result<Literal> TruncateUtils::TruncateLiteralMax(const Literal& literal, int32_t width) {
281+
Result<std::optional<Literal>> TruncateUtils::TruncateLiteralMax(const Literal& literal,
282+
int32_t width) {
279283
if (literal.IsNull()) [[unlikely]] {
280284
// Return null as is
281-
return literal;
285+
return std::optional<Literal>(literal);
282286
}
283287

284288
if (literal.IsAboveMax() || literal.IsBelowMin()) [[unlikely]] {
@@ -305,14 +309,14 @@ Result<Literal> TruncateUtils::TruncateLowerBound(const PrimitiveType& type,
305309
}
306310
}
307311

308-
Result<Literal> TruncateUtils::TruncateUpperBound(const PrimitiveType& type,
309-
const Literal& value, int32_t length) {
312+
Result<std::optional<Literal>> TruncateUtils::TruncateUpperBound(
313+
const PrimitiveType& type, const Literal& value, int32_t length) {
310314
switch (type.type_id()) {
311315
case TypeId::kString:
312316
case TypeId::kBinary:
313317
return TruncateLiteralMax(value, length);
314318
default:
315-
return value;
319+
return std::optional<Literal>(value);
316320
}
317321
}
318322

src/iceberg/util/truncate_util.h

Lines changed: 15 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@
2020
#pragma once
2121

2222
#include <cstdint>
23+
#include <optional>
2324
#include <string>
2425
#include <utility>
2526

@@ -71,9 +72,10 @@ class ICEBERG_EXPORT TruncateUtils {
7172
/// \param source The input string to truncate.
7273
/// \param L The maximum number of code points allowed in the output string.
7374
/// \return A Result containing the original string (if no truncation is
74-
/// needed), or the smallest string greater than the truncated prefix, or an
75-
/// error if no such value exists or the input is invalid UTF-8.
76-
static Result<std::string> TruncateUTF8Max(const std::string& source, size_t L);
75+
/// needed), the smallest string greater than the truncated prefix, or nullopt if no
76+
/// safe upper bound can be represented. Invalid UTF-8 is returned as an error.
77+
static Result<std::optional<std::string>> TruncateUTF8Max(const std::string& source,
78+
size_t L);
7779

7880
/// \brief Truncate an integer v, either int32_t or int64_t, to v - (v % W).
7981
///
@@ -109,10 +111,11 @@ class ICEBERG_EXPORT TruncateUtils {
109111
///
110112
/// \param value The input Literal maximum value to truncate.
111113
/// \param width The width to truncate to.
112-
/// \return A Result containing either the original Literal (if no truncation is needed)
113-
/// or the smallest Literal greater than the truncated prefix, or an error if no such
114-
/// value exists or cannot be represented.
115-
static Result<Literal> TruncateLiteralMax(const Literal& value, int32_t width);
114+
/// \return A Result containing either the original Literal (if no truncation is
115+
/// needed), the smallest Literal greater than the truncated prefix, or nullopt if no
116+
/// safe upper bound can be represented.
117+
static Result<std::optional<Literal>> TruncateLiteralMax(const Literal& value,
118+
int32_t width);
116119

117120
/// \brief Truncate the lower bound of a string or binary value.
118121
///
@@ -134,9 +137,11 @@ class ICEBERG_EXPORT TruncateUtils {
134137
/// \param type The Iceberg primitive type.
135138
/// \param value The upper bound literal value.
136139
/// \param width The width to truncate to.
137-
/// \return The truncated upper bound literal.
138-
static Result<Literal> TruncateUpperBound(const PrimitiveType& type,
139-
const Literal& value, int32_t width);
140+
/// \return The truncated upper bound literal, or nullopt if no safe upper bound can be
141+
/// represented.
142+
static Result<std::optional<Literal>> TruncateUpperBound(const PrimitiveType& type,
143+
const Literal& value,
144+
int32_t width);
140145
};
141146

142147
} // namespace iceberg

0 commit comments

Comments
 (0)