From f031ffccf39fc32c3129ac8acc367e71086df3f6 Mon Sep 17 00:00:00 2001 From: zhanglistar Date: Fri, 23 Jan 2026 17:47:51 +0800 Subject: [PATCH 1/6] Support Thai/Khmer digit dates in CH Scan UTF-8 strings for local digits before conversion and add regression queries for Thai and Khmer numeral date parsing in the function suite. --- .../GlutenFunctionValidateSuite.scala | 25 +++++++++- .../LocalDigitsToAsciiDigitForDate.cpp | 47 ++++++++++++++----- 2 files changed, 58 insertions(+), 14 deletions(-) diff --git a/backends-clickhouse/src/test/scala/org/apache/gluten/execution/GlutenFunctionValidateSuite.scala b/backends-clickhouse/src/test/scala/org/apache/gluten/execution/GlutenFunctionValidateSuite.scala index e446af8f299c..9b86287fd8cd 100644 --- a/backends-clickhouse/src/test/scala/org/apache/gluten/execution/GlutenFunctionValidateSuite.scala +++ b/backends-clickhouse/src/test/scala/org/apache/gluten/execution/GlutenFunctionValidateSuite.scala @@ -1408,7 +1408,8 @@ class GlutenFunctionValidateSuite extends GlutenClickHouseWholeStageTransformerS |(8, '4Z+i4Z+g4Z+i4Z+lLeGfoeGfoS3hn6Hhn6M='), |(9, null), |(10, '4Keo4Kem4Keo4KerLeCnp+Cnpy3gp6fgp6k='), - |(11, 'MjAyNS0xMS0xMg==') + |(11, 'MjAyNS0xMS0xMg=='), + |(12, '4LmS4LmQ4LmS4LmVLeC5keC5kS3guZHguZM=') |""".stripMargin) var query_sql = """ |select @@ -1433,6 +1434,28 @@ class GlutenFunctionValidateSuite extends GlutenClickHouseWholeStageTransformerS |""".stripMargin compareResultsAgainstVanillaSpark(query_sql, true, { _ => }) + query_sql = """ + |select from_unixtime( + | unix_timestamp( + | regexp_replace( + | cast(unbase64('4LmS4LmQ4LmS4LmVLeC5keC5kS3guZHguZM=') as string), + | '-0', '-'), + | 'yyyy-MM-dd'), + | 'yyyy-MM-dd') + |""".stripMargin + compareResultsAgainstVanillaSpark(query_sql, true, { _ => }) + + query_sql = """ + |select from_unixtime( + | unix_timestamp( + | regexp_replace( + | cast(unbase64('4Z+i4Z+g4Z+i4Z+lLeGfoeGfoS3hn6Hhn6M=') as string), + | '-0', '-'), + | 'yyyy-MM-dd'), + | 'yyyy-MM-dd') + |""".stripMargin + compareResultsAgainstVanillaSpark(query_sql, true, { _ => }) + sql("drop table tb_local_date") } } diff --git a/cpp-ch/local-engine/Functions/LocalDigitsToAsciiDigitForDate.cpp b/cpp-ch/local-engine/Functions/LocalDigitsToAsciiDigitForDate.cpp index 7f19975c0a96..0b4283ef1053 100644 --- a/cpp-ch/local-engine/Functions/LocalDigitsToAsciiDigitForDate.cpp +++ b/cpp-ch/local-engine/Functions/LocalDigitsToAsciiDigitForDate.cpp @@ -163,19 +163,6 @@ class LocalDigitsToAsciiDigitForDateFunction : public DB::IFunction } private: - bool hasLocalDigit(StringRef str) const - { - if (!str.size) - return false; - // In most cases, the first byte is a digit. - char c = reinterpret_cast(str.data[0]); - if ('0' <= c && c <= '9') - { - return false; - } - return true; - } - char toAsciiDigit(char32_t c) const { // In Thai and Persian, dates typically do not use the Gregorian calendar. // This may cause failures in unix_timestamp parsing. @@ -195,6 +182,40 @@ class LocalDigitsToAsciiDigitForDateFunction : public DB::IFunction return 0; } + bool hasLocalDigit(StringRef str) const + { + if (!str.size) + return false; + for (size_t i = 0; i < str.size;) + { + unsigned char c = str.data[i]; + char32_t cp = 0; + if ((c & 0x80) == 0) // 1-byte + { + cp = c; + i += 1; + } + else if ((c & 0xE0) == 0xC0) // 2-byte + { + cp = ((c & 0x1F) << 6) | (str.data[i + 1] & 0x3F); + i += 2; + } + else if ((c & 0xF0) == 0xE0) // 3-byte + { + cp = ((c & 0x0F) << 12) | ((str.data[i + 1] & 0x3F) << 6) | (str.data[i + 2] & 0x3F); + i += 3; + } + else if ((c & 0xF8) == 0xF0) // 4-byte + { + cp = ((c & 0x07) << 18) | ((str.data[i + 1] & 0x3F) << 12) | ((str.data[i + 2] & 0x3F) << 6) | (str.data[i + 3] & 0x3F); + i += 4; + } + if (toAsciiDigit(cp)) + return true; + } + return false; + } + String convertLocalDigit(const StringRef & str) const { std::string result; From 04647e7a6455bb96d3f7fb396a6dc1b0f6ed0e82 Mon Sep 17 00:00:00 2001 From: zhanglistar Date: Mon, 26 Jan 2026 17:09:30 +0800 Subject: [PATCH 2/6] [CH] Document local digit date fixtures Add comments describing the base64-encoded local digit date fixtures used in local digit date tests. --- .../apache/gluten/execution/GlutenFunctionValidateSuite.scala | 2 ++ 1 file changed, 2 insertions(+) diff --git a/backends-clickhouse/src/test/scala/org/apache/gluten/execution/GlutenFunctionValidateSuite.scala b/backends-clickhouse/src/test/scala/org/apache/gluten/execution/GlutenFunctionValidateSuite.scala index 9b86287fd8cd..88abe3ae2e05 100644 --- a/backends-clickhouse/src/test/scala/org/apache/gluten/execution/GlutenFunctionValidateSuite.scala +++ b/backends-clickhouse/src/test/scala/org/apache/gluten/execution/GlutenFunctionValidateSuite.scala @@ -1411,6 +1411,8 @@ class GlutenFunctionValidateSuite extends GlutenClickHouseWholeStageTransformerS |(11, 'MjAyNS0xMS0xMg=='), |(12, '4LmS4LmQ4LmS4LmVLeC5keC5kS3guZHguZM=') |""".stripMargin) + // base64 inputs decode to local digit dates: + // 1-3 Arabic-Indic, 5 Persian, 7 Devanagari, 8 Khmer, 10 Bengali, 11 ASCII, 12 Thai var query_sql = """ |select |from_unixtime(unix_timestamp(cast(unbase64(d) as string), 'yyyy-MM-dd')), From 9d15a688ac03e81aab3ceee0a2e6fe4a610b9675 Mon Sep 17 00:00:00 2001 From: zhanglistar Date: Mon, 26 Jan 2026 17:50:23 +0800 Subject: [PATCH 3/6] [CH] Speed up local digit conversion Use SIMD-based ASCII detection, fast-path common UTF-8 digit ranges, and avoid double scans when converting local digits. --- .../LocalDigitsToAsciiDigitForDate.cpp | 186 ++++++++++++------ 1 file changed, 124 insertions(+), 62 deletions(-) diff --git a/cpp-ch/local-engine/Functions/LocalDigitsToAsciiDigitForDate.cpp b/cpp-ch/local-engine/Functions/LocalDigitsToAsciiDigitForDate.cpp index 0b4283ef1053..31495199d424 100644 --- a/cpp-ch/local-engine/Functions/LocalDigitsToAsciiDigitForDate.cpp +++ b/cpp-ch/local-engine/Functions/LocalDigitsToAsciiDigitForDate.cpp @@ -25,6 +25,31 @@ #include #include #include +#include +#if SIMDJSON_IMPLEMENTATION_ICELAKE && defined(__AVX512F__) && defined(__AVX512BW__) +#include +namespace simdjson_impl = simdjson::icelake::simd; +#elif SIMDJSON_IMPLEMENTATION_HASWELL && defined(__AVX2__) +#include +namespace simdjson_impl = simdjson::haswell::simd; +#elif SIMDJSON_IMPLEMENTATION_WESTMERE && defined(__SSE4_2__) +#include +namespace simdjson_impl = simdjson::westmere::simd; +#elif SIMDJSON_IMPLEMENTATION_ARM64 +#include +namespace simdjson_impl = simdjson::arm64::simd; +#elif SIMDJSON_IMPLEMENTATION_PPC64 +#include +namespace simdjson_impl = simdjson::ppc64::simd; +#elif SIMDJSON_IMPLEMENTATION_LSX +#include +namespace simdjson_impl = simdjson::lsx::simd; +#elif SIMDJSON_IMPLEMENTATION_LASX +#include +namespace simdjson_impl = simdjson::lasx::simd; +#else +#define SIMDJSON_NO_SIMD 1 +#endif #include #include #include @@ -97,7 +122,9 @@ class LocalDigitsToAsciiDigitForDateFunction : public DB::IFunction getName(), data_col->getName()); auto date_str = col_str->getDataAt(0); - auto new_str = convertLocalDigit(date_str); + std::string new_str; + if (!convertLocalDigitIfNeeded(date_str, new_str)) + return arguments[0].column; auto new_data_col = data_col->cloneEmpty(); new_data_col->insertData(new_str.c_str(), new_str.size()); return DB::ColumnConst::create(std::move(new_data_col), input_rows_count); @@ -120,45 +147,39 @@ class LocalDigitsToAsciiDigitForDateFunction : public DB::IFunction getName(), data_col->getName()); - auto nested_data_col = DB::removeNullable(arguments[0].column); - bool has_local_digit = false; - size_t row_index = 0; - for (row_index = 0; row_index < input_rows_count; ++row_index) + std::string converted; + DB::MutableColumnPtr res_col; + for (size_t row_index = 0; row_index < input_rows_count; ++row_index) { if (null_map && (*null_map)[row_index]) { + if (res_col) + res_col->insertDefault(); continue; } auto str = col_str->getDataAt(row_index); - if (hasLocalDigit(str)) + if (convertLocalDigitIfNeeded(str, converted)) { - has_local_digit = true; - break; + if (!res_col) + { + res_col = data_col->cloneEmpty(); + if (row_index) + res_col->insertManyFrom(*data_col, 0, row_index); + } + LOG_ERROR( + getLogger("LocalDigitsToAsciiDigitForDateFunction"), + "Converted local digit string {} to ascii digit string: {}", + col_str->getDataAt(row_index).toString(), + converted); + res_col->insertData(converted.c_str(), converted.size()); } - } - - if (!has_local_digit) - { - // No local language digits found, return the original column - return arguments[0].column; - } - - auto res_col = data_col->cloneEmpty(); - if (row_index) - { - res_col->insertManyFrom(*data_col, 0, row_index); - } - for (; row_index < input_rows_count; ++row_index) - { - if (null_map && (*null_map)[row_index]) + else if (res_col) { - res_col->insertDefault(); - continue; + res_col->insertFrom(*data_col, row_index); } - auto str = convertLocalDigit(col_str->getDataAt(row_index)); - LOG_ERROR(getLogger("LocalDigitsToAsciiDigitForDateFunction"), "Converted local digit string {} to ascii digit string: {}", col_str->getDataAt(row_index).toString(), str); - res_col->insertData(str.c_str(), str.size()); } + if (!res_col) + return arguments[0].column; return res_col; } @@ -182,61 +203,97 @@ class LocalDigitsToAsciiDigitForDateFunction : public DB::IFunction return 0; } - bool hasLocalDigit(StringRef str) const + bool hasNonAsciiSimd(const char * data, size_t size) const { - if (!str.size) - return false; - for (size_t i = 0; i < str.size;) +#if SIMDJSON_NO_SIMD + const unsigned char * bytes = reinterpret_cast(data); + for (size_t i = 0; i < size; ++i) { - unsigned char c = str.data[i]; - char32_t cp = 0; - if ((c & 0x80) == 0) // 1-byte - { - cp = c; - i += 1; - } - else if ((c & 0xE0) == 0xC0) // 2-byte - { - cp = ((c & 0x1F) << 6) | (str.data[i + 1] & 0x3F); - i += 2; - } - else if ((c & 0xF0) == 0xE0) // 3-byte - { - cp = ((c & 0x0F) << 12) | ((str.data[i + 1] & 0x3F) << 6) | (str.data[i + 2] & 0x3F); - i += 3; - } - else if ((c & 0xF8) == 0xF0) // 4-byte - { - cp = ((c & 0x07) << 18) | ((str.data[i + 1] & 0x3F) << 12) | ((str.data[i + 2] & 0x3F) << 6) | (str.data[i + 3] & 0x3F); - i += 4; - } - if (toAsciiDigit(cp)) + if (bytes[i] & 0x80) + return true; + } + return false; +#else + using simd8_u8 = simdjson_impl::simd8; + constexpr size_t kBlockSize = simd8_u8::SIZE; + size_t i = 0; + for (; i + kBlockSize <= size; i += kBlockSize) + { + if (!simd8_u8::load(reinterpret_cast(data + i)).is_ascii()) + return true; + } + for (; i < size; ++i) + { + if (static_cast(data[i]) & 0x80) return true; } return false; +#endif } - String convertLocalDigit(const StringRef & str) const + bool convertLocalDigitIfNeeded(StringRef str, std::string & result) const { - std::string result; + if (!str.size) + return false; + if (!hasNonAsciiSimd(str.data, str.size)) + return false; + result.clear(); result.reserve(str.size); + bool has_local_digit = false; for (size_t i = 0; i < str.size;) { unsigned char c = str.data[i]; char32_t cp = 0; if ((c & 0x80) == 0) // 1-byte { - cp = c; + result.push_back(c); i += 1; + continue; } else if ((c & 0xE0) == 0xC0) // 2-byte { - cp = ((c & 0x1F) << 6) | (str.data[i + 1] & 0x3F); + unsigned char b1 = str.data[i + 1]; + if (c == 0xD9 && b1 >= 0xA0 && b1 <= 0xA9) // Arabic-Indic + { + result.push_back(static_cast('0' + (b1 - 0xA0))); + has_local_digit = true; + i += 2; + continue; + } + if (c == 0xDB && b1 >= 0xB0 && b1 <= 0xB9) // Eastern Arabic-Indic (Persian) + { + result.push_back(static_cast('0' + (b1 - 0xB0))); + has_local_digit = true; + i += 2; + continue; + } + cp = ((c & 0x1F) << 6) | (b1 & 0x3F); i += 2; } else if ((c & 0xF0) == 0xE0) // 3-byte { - cp = ((c & 0x0F) << 12) | ((str.data[i + 1] & 0x3F) << 6) | (str.data[i + 2] & 0x3F); + unsigned char b1 = str.data[i + 1]; + unsigned char b2 = str.data[i + 2]; + if (c == 0xE0) + { + if ((b1 == 0xA5 && b2 >= 0xA6 && b2 <= 0xAF) || // Devanagari + (b1 == 0xA7 && b2 >= 0xA6 && b2 <= 0xAF) || // Bengali + (b1 == 0xB9 && b2 >= 0x90 && b2 <= 0x99)) // Thai + { + result.push_back(static_cast('0' + (b2 & 0x0F))); + has_local_digit = true; + i += 3; + continue; + } + } + else if (c == 0xE1 && b1 == 0x9F && b2 >= 0xA0 && b2 <= 0xA9) // Khmer + { + result.push_back(static_cast('0' + (b2 - 0xA0))); + has_local_digit = true; + i += 3; + continue; + } + cp = ((c & 0x0F) << 12) | ((b1 & 0x3F) << 6) | (b2 & 0x3F); i += 3; } else if ((c & 0xF8) == 0xF0) // 4-byte @@ -246,11 +303,16 @@ class LocalDigitsToAsciiDigitForDateFunction : public DB::IFunction } auto local_digit = toAsciiDigit(cp); if (local_digit) + { result.push_back(local_digit); + has_local_digit = true; + } else + { result.push_back(cp); + } } - return result; + return has_local_digit; } }; From 52e68319322d118f8a81b5cdf97a785285fe8b68 Mon Sep 17 00:00:00 2001 From: zhanglistar Date: Thu, 29 Jan 2026 21:30:24 +0800 Subject: [PATCH 4/6] [CH] Fix UTF-8 fallback in local digit conversion Preserve original UTF-8 bytes when no local digit is detected in multi-byte sequences, and downgrade logging to debug. --- .../LocalDigitsToAsciiDigitForDate.cpp | 56 +++++++++++++++---- 1 file changed, 44 insertions(+), 12 deletions(-) diff --git a/cpp-ch/local-engine/Functions/LocalDigitsToAsciiDigitForDate.cpp b/cpp-ch/local-engine/Functions/LocalDigitsToAsciiDigitForDate.cpp index 31495199d424..40e193f3f40f 100644 --- a/cpp-ch/local-engine/Functions/LocalDigitsToAsciiDigitForDate.cpp +++ b/cpp-ch/local-engine/Functions/LocalDigitsToAsciiDigitForDate.cpp @@ -166,7 +166,7 @@ class LocalDigitsToAsciiDigitForDateFunction : public DB::IFunction if (row_index) res_col->insertManyFrom(*data_col, 0, row_index); } - LOG_ERROR( + LOG_DEBUG( getLogger("LocalDigitsToAsciiDigitForDateFunction"), "Converted local digit string {} to ascii digit string: {}", col_str->getDataAt(row_index).toString(), @@ -268,7 +268,19 @@ class LocalDigitsToAsciiDigitForDateFunction : public DB::IFunction continue; } cp = ((c & 0x1F) << 6) | (b1 & 0x3F); + auto local_digit = toAsciiDigit(cp); + if (local_digit) + { + result.push_back(local_digit); + has_local_digit = true; + } + else + { + result.push_back(static_cast(c)); + result.push_back(static_cast(b1)); + } i += 2; + continue; } else if ((c & 0xF0) == 0xE0) // 3-byte { @@ -294,22 +306,42 @@ class LocalDigitsToAsciiDigitForDateFunction : public DB::IFunction continue; } cp = ((c & 0x0F) << 12) | ((b1 & 0x3F) << 6) | (b2 & 0x3F); + auto local_digit = toAsciiDigit(cp); + if (local_digit) + { + result.push_back(local_digit); + has_local_digit = true; + } + else + { + result.push_back(static_cast(c)); + result.push_back(static_cast(b1)); + result.push_back(static_cast(b2)); + } i += 3; + continue; } else if ((c & 0xF8) == 0xF0) // 4-byte { - cp = ((c & 0x07) << 18) | ((str.data[i + 1] & 0x3F) << 12) | ((str.data[i + 2] & 0x3F) << 6) | (str.data[i + 3] & 0x3F); + unsigned char b1 = str.data[i + 1]; + unsigned char b2 = str.data[i + 2]; + unsigned char b3 = str.data[i + 3]; + cp = ((c & 0x07) << 18) | ((b1 & 0x3F) << 12) | ((b2 & 0x3F) << 6) | (b3 & 0x3F); + auto local_digit = toAsciiDigit(cp); + if (local_digit) + { + result.push_back(local_digit); + has_local_digit = true; + } + else + { + result.push_back(static_cast(c)); + result.push_back(static_cast(b1)); + result.push_back(static_cast(b2)); + result.push_back(static_cast(b3)); + } i += 4; - } - auto local_digit = toAsciiDigit(cp); - if (local_digit) - { - result.push_back(local_digit); - has_local_digit = true; - } - else - { - result.push_back(cp); + continue; } } return has_local_digit; From fc9ec4c187eb83395b9c0932a05cb865b4963023 Mon Sep 17 00:00:00 2001 From: zhanglistar Date: Fri, 30 Jan 2026 12:10:55 +0800 Subject: [PATCH 5/6] [CH] Fix Devanagari/Bengali digit mapping Map UTF-8 byte ranges to correct digit values for Devanagari and Bengali local digits. --- .../Functions/LocalDigitsToAsciiDigitForDate.cpp | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/cpp-ch/local-engine/Functions/LocalDigitsToAsciiDigitForDate.cpp b/cpp-ch/local-engine/Functions/LocalDigitsToAsciiDigitForDate.cpp index 40e193f3f40f..1e50e02bff53 100644 --- a/cpp-ch/local-engine/Functions/LocalDigitsToAsciiDigitForDate.cpp +++ b/cpp-ch/local-engine/Functions/LocalDigitsToAsciiDigitForDate.cpp @@ -289,10 +289,16 @@ class LocalDigitsToAsciiDigitForDateFunction : public DB::IFunction if (c == 0xE0) { if ((b1 == 0xA5 && b2 >= 0xA6 && b2 <= 0xAF) || // Devanagari - (b1 == 0xA7 && b2 >= 0xA6 && b2 <= 0xAF) || // Bengali - (b1 == 0xB9 && b2 >= 0x90 && b2 <= 0x99)) // Thai + (b1 == 0xA7 && b2 >= 0xA6 && b2 <= 0xAF)) // Bengali { - result.push_back(static_cast('0' + (b2 & 0x0F))); + result.push_back(static_cast('0' + (b2 - 0xA6))); + has_local_digit = true; + i += 3; + continue; + } + if (b1 == 0xB9 && b2 >= 0x90 && b2 <= 0x99) // Thai + { + result.push_back(static_cast('0' + (b2 - 0x90))); has_local_digit = true; i += 3; continue; From 7071d863c29fce0bd39123bb5af3719eafe41c58 Mon Sep 17 00:00:00 2001 From: zhanglistar Date: Mon, 2 Feb 2026 15:19:26 +0800 Subject: [PATCH 6/6] Fix scala code format in CH CI. --- .../main/scala/org/apache/spark/sql/delta/DeltaAdapter.scala | 1 + .../apache/spark/sql/execution/datasources/DeltaV1Writes.scala | 1 + 2 files changed, 2 insertions(+) diff --git a/backends-clickhouse/src-delta23/main/scala/org/apache/spark/sql/delta/DeltaAdapter.scala b/backends-clickhouse/src-delta23/main/scala/org/apache/spark/sql/delta/DeltaAdapter.scala index 58d59aa9dec5..f414ab8f2856 100644 --- a/backends-clickhouse/src-delta23/main/scala/org/apache/spark/sql/delta/DeltaAdapter.scala +++ b/backends-clickhouse/src-delta23/main/scala/org/apache/spark/sql/delta/DeltaAdapter.scala @@ -15,6 +15,7 @@ * limitations under the License. */ package org.apache.spark.sql.delta + import org.apache.spark.sql.catalyst.expressions.{Attribute, Expression} import org.apache.spark.sql.delta.stats.DeltaScan diff --git a/backends-clickhouse/src-delta33/main/scala/org/apache/spark/sql/execution/datasources/DeltaV1Writes.scala b/backends-clickhouse/src-delta33/main/scala/org/apache/spark/sql/execution/datasources/DeltaV1Writes.scala index 8ae99cc0d59f..de9b760c093b 100644 --- a/backends-clickhouse/src-delta33/main/scala/org/apache/spark/sql/execution/datasources/DeltaV1Writes.scala +++ b/backends-clickhouse/src-delta33/main/scala/org/apache/spark/sql/execution/datasources/DeltaV1Writes.scala @@ -15,6 +15,7 @@ * limitations under the License. */ package org.apache.spark.sql.execution.datasources + import org.apache.gluten.backendsapi.BackendsApiManager import org.apache.spark.sql.SparkSession