From 83cde3bf7caa8f40ea036e3bba70c532b2729410 Mon Sep 17 00:00:00 2001 From: B Vadlamani Date: Sat, 10 Jan 2026 09:34:53 -0800 Subject: [PATCH 1/9] int_to_binary --- .../source/user-guide/latest/compatibility.md | 128 +++++++++++++++++- .../spark-expr/src/conversion_funcs/cast.rs | 41 +++++- .../apache/comet/expressions/CometCast.scala | 9 +- .../org/apache/comet/CometCastSuite.scala | 2 +- 4 files changed, 165 insertions(+), 15 deletions(-) diff --git a/docs/source/user-guide/latest/compatibility.md b/docs/source/user-guide/latest/compatibility.md index c09f6a61e6..8b6684ac6c 100644 --- a/docs/source/user-guide/latest/compatibility.md +++ b/docs/source/user-guide/latest/compatibility.md @@ -99,16 +99,19 @@ they will be identical to Spark. Unsorted results may have different row orderin Cast operations in Comet fall into three levels of support: -- **C (Compatible)**: The results match Apache Spark -- **I (Incompatible)**: The results may match Apache Spark for some inputs, but there are known issues where some inputs +- **Compatible**: The results match Apache Spark +- **Incompatible**: The results may match Apache Spark for some inputs, but there are known issues where some inputs will result in incorrect results or exceptions. The query stage will fall back to Spark by default. Setting `spark.comet.expression.Cast.allowIncompatible=true` will allow all incompatible casts to run natively in Comet, but this is not recommended for production use. -- **U (Unsupported)**: Comet does not provide a native version of this cast expression and the query stage will fall back to +- **Unsupported**: Comet does not provide a native version of this cast expression and the query stage will fall back to Spark. -- **N/A**: Spark does not support this cast. -### Legacy Mode +### Compatible Casts + +The following cast operations are generally compatible with Spark except for the differences noted here. + +<<<<<<< HEAD @@ -123,4 +126,117 @@ Cast operations in Comet fall into three levels of support: -See the [tracking issue](https://github.com/apache/datafusion-comet/issues/286) for more details. +# See the [tracking issue](https://github.com/apache/datafusion-comet/issues/286) for more details. + + + + + +| From Type | To Type | Notes | +|-|-|-| +| boolean | byte | | +| boolean | short | | +| boolean | integer | | +| boolean | long | | +| boolean | float | | +| boolean | double | | +| boolean | string | | +| byte | boolean | | +| byte | short | | +| byte | integer | | +| byte | long | | +| byte | float | | +| byte | double | | +| byte | decimal | | +| byte | string | | +| byte | binary | | +| short | boolean | | +| short | byte | | +| short | integer | | +| short | long | | +| short | float | | +| short | double | | +| short | decimal | | +| short | string | | +| short | binary | | +| integer | boolean | | +| integer | byte | | +| integer | short | | +| integer | long | | +| integer | float | | +| integer | double | | +| integer | decimal | | +| integer | string | | +| integer | binary | | +| long | boolean | | +| long | byte | | +| long | short | | +| long | integer | | +| long | float | | +| long | double | | +| long | decimal | | +| long | string | | +| long | binary | | +| float | boolean | | +| float | byte | | +| float | short | | +| float | integer | | +| float | long | | +| float | double | | +| float | string | There can be differences in precision. For example, the input "1.4E-45" will produce 1.0E-45 instead of 1.4E-45 | +| double | boolean | | +| double | byte | | +| double | short | | +| double | integer | | +| double | long | | +| double | float | | +| double | string | There can be differences in precision. For example, the input "1.4E-45" will produce 1.0E-45 instead of 1.4E-45 | +| decimal | boolean | | +| decimal | byte | | +| decimal | short | | +| decimal | integer | | +| decimal | long | | +| decimal | float | | +| decimal | double | | +| decimal | decimal | | +| decimal | string | There can be formatting differences in some case due to Spark using scientific notation where Comet does not | +| string | boolean | | +| string | byte | | +| string | short | | +| string | integer | | +| string | long | | +| string | float | | +| string | double | | +| string | date | Only supports years between 262143 BC and 262142 AD | +| binary | string | | +| date | string | | +| timestamp | long | | +| timestamp | string | | +| timestamp | date | | + + + +### Incompatible Casts + +The following cast operations are not compatible with Spark for all inputs and are disabled by default. + + + + + +| From Type | To Type | Notes | +|-|-|-| +| float | decimal | There can be rounding differences | +| double | decimal | There can be rounding differences | +| string | decimal | Does not support fullwidth unicode digits (e.g \\uFF10) +or strings containing null bytes (e.g \\u0000) | +| string | timestamp | Not all valid formats are supported | + + + +### Unsupported Casts + +Any cast not listed in the previous tables is currently unsupported. We are working on adding more. See the +[tracking issue](https://github.com/apache/datafusion-comet/issues/286) for more details. + +> > > > > > > 6f8e1c629 (int_to_binary) diff --git a/native/spark-expr/src/conversion_funcs/cast.rs b/native/spark-expr/src/conversion_funcs/cast.rs index 5c65336183..c41f517120 100644 --- a/native/spark-expr/src/conversion_funcs/cast.rs +++ b/native/spark-expr/src/conversion_funcs/cast.rs @@ -20,13 +20,13 @@ use crate::{timezone, BinaryOutputStyle}; use crate::{EvalMode, SparkError, SparkResult}; use arrow::array::builder::StringBuilder; use arrow::array::{ - BooleanBuilder, Decimal128Builder, DictionaryArray, GenericByteArray, ListArray, + BinaryBuilder, BooleanBuilder, Decimal128Builder, DictionaryArray, GenericByteArray, ListArray, PrimitiveBuilder, StringArray, StructArray, TimestampMicrosecondBuilder, }; use arrow::compute::can_cast_types; use arrow::datatypes::{ - i256, ArrowDictionaryKeyType, ArrowNativeType, DataType, Decimal256Type, GenericBinaryType, - Schema, + i256, ArrowDictionaryKeyType, ArrowNativeType, DataType, Decimal256Type, + GenericBinaryType, Schema, }; use arrow::{ array::{ @@ -311,7 +311,7 @@ fn can_cast_from_byte(to_type: &DataType, _: &SparkCastOptions) -> bool { use DataType::*; matches!( to_type, - Boolean | Int8 | Int16 | Int32 | Int64 | Float32 | Float64 | Decimal128(_, _) + Boolean | Int8 | Int16 | Int32 | Int64 | Float32 | Float64 | Decimal128(_, _) | Binary ) } @@ -319,14 +319,14 @@ fn can_cast_from_short(to_type: &DataType, _: &SparkCastOptions) -> bool { use DataType::*; matches!( to_type, - Boolean | Int8 | Int16 | Int32 | Int64 | Float32 | Float64 | Decimal128(_, _) + Boolean | Int8 | Int16 | Int32 | Int64 | Float32 | Float64 | Decimal128(_, _) | Binary ) } fn can_cast_from_int(to_type: &DataType, options: &SparkCastOptions) -> bool { use DataType::*; match to_type { - Boolean | Int8 | Int16 | Int32 | Int64 | Float32 | Float64 | Utf8 => true, + Boolean | Int8 | Int16 | Int32 | Int64 | Float32 | Float64 | Utf8 | Binary => true, Decimal128(_, _) => { // incompatible: no overflow check options.allow_incompat @@ -338,7 +338,7 @@ fn can_cast_from_int(to_type: &DataType, options: &SparkCastOptions) -> bool { fn can_cast_from_long(to_type: &DataType, options: &SparkCastOptions) -> bool { use DataType::*; match to_type { - Boolean | Int8 | Int16 | Int32 | Int64 | Float32 | Float64 => true, + Boolean | Int8 | Int16 | Int32 | Int64 | Float32 | Float64 | Binary => true, Decimal128(_, _) => { // incompatible: no overflow check options.allow_incompat @@ -501,6 +501,29 @@ macro_rules! cast_float_to_string { }}; } +// eval mode is not needed since all ints can be implemented in binary format +macro_rules! cast_whole_num_to_binary { + ($array:expr, $primitive_type:ty, $byte_size:expr) => {{ + let input_arr = $array + .as_any() + .downcast_ref::<$primitive_type>() + .ok_or_else(|| SparkError::Internal("Expected numeric array".to_string()))?; + + let len = input_arr.len(); + let mut builder = BinaryBuilder::with_capacity(len, len * $byte_size); + + for i in 0..input_arr.len() { + if input_arr.is_null(i) { + builder.append_null(); + } else { + builder.append_value(input_arr.value(i).to_be_bytes()); + } + } + + Ok(Arc::new(builder.finish()) as ArrayRef) + }}; +} + macro_rules! cast_int_to_int_macro { ( $array: expr, @@ -1101,6 +1124,10 @@ fn cast_array( } (Binary, Utf8) => Ok(cast_binary_to_string::(&array, cast_options)?), (Date32, Timestamp(_, tz)) => Ok(cast_date_to_timestamp(&array, cast_options, tz)?), + (Int8, Binary) => cast_whole_num_to_binary!(&array, Int8Array, 1), + (Int16, Binary) => cast_whole_num_to_binary!(&array, Int16Array, 2), + (Int32, Binary) => cast_whole_num_to_binary!(&array, Int32Array, 4), + (Int64, Binary) => cast_whole_num_to_binary!(&array, Int64Array, 8), _ if cast_options.is_adapting_schema || is_datafusion_spark_compatible(from_type, to_type) => { diff --git a/spark/src/main/scala/org/apache/comet/expressions/CometCast.scala b/spark/src/main/scala/org/apache/comet/expressions/CometCast.scala index f42a5d8d8e..30a839c2e8 100644 --- a/spark/src/main/scala/org/apache/comet/expressions/CometCast.scala +++ b/spark/src/main/scala/org/apache/comet/expressions/CometCast.scala @@ -21,7 +21,7 @@ package org.apache.comet.expressions import org.apache.spark.sql.catalyst.expressions.{Attribute, Cast, Expression, Literal} import org.apache.spark.sql.internal.SQLConf -import org.apache.spark.sql.types.{ArrayType, DataType, DataTypes, DecimalType, NullType, StructType} +import org.apache.spark.sql.types.{ArrayType, BinaryType, DataType, DataTypes, DecimalType, NullType, StructType} import org.apache.comet.CometConf import org.apache.comet.CometSparkSessionExtensions.withInfo @@ -126,6 +126,7 @@ object CometCast extends CometExpressionSerde[Cast] with CometExprShim { isSupported(dt.elementType, DataTypes.StringType, timeZoneId, evalMode) case (dt: ArrayType, dt1: ArrayType) => isSupported(dt.elementType, dt1.elementType, timeZoneId, evalMode) + case (from: DataType, _: BinaryType) => canCastToBinary(from) case (dt: DataType, _) if dt.typeName == "timestamp_ntz" => // https://github.com/apache/datafusion-comet/issues/378 toType match { @@ -351,6 +352,12 @@ object CometCast extends CometExpressionSerde[Cast] with CometExprShim { case _ => Unsupported(Some(s"Cast from DateType to $toType is not supported")) } + private def canCastToBinary(fromType: DataType): SupportLevel = fromType match { + case DataTypes.ByteType | DataTypes.ShortType | DataTypes.IntegerType | DataTypes.LongType => + Compatible() + case _ => Unsupported(Some(s"Cast from $fromType to BinaryType is not supported")) + } + private def unsupported(fromType: DataType, toType: DataType): Unsupported = { Unsupported(Some(s"Cast from $fromType to $toType is not supported")) } diff --git a/spark/src/test/scala/org/apache/comet/CometCastSuite.scala b/spark/src/test/scala/org/apache/comet/CometCastSuite.scala index bea701d490..df86085dde 100644 --- a/spark/src/test/scala/org/apache/comet/CometCastSuite.scala +++ b/spark/src/test/scala/org/apache/comet/CometCastSuite.scala @@ -280,7 +280,7 @@ class CometCastSuite extends CometTestBase with AdaptiveSparkPlanHelper { hasIncompatibleType = usingParquetExecWithIncompatTypes) } - ignore("cast ShortType to BinaryType") { + test("cast ShortType to BinaryType") { castTest( generateShorts(), DataTypes.BinaryType, From d05b628cf9741ee204483d897dd7744c3d7066d4 Mon Sep 17 00:00:00 2001 From: B Vadlamani Date: Tue, 13 Jan 2026 15:37:07 -0800 Subject: [PATCH 2/9] int_to_binary --- .../source/user-guide/latest/compatibility.md | 92 ------------- .../spark-expr/src/conversion_funcs/cast.rs | 9 +- .../apache/comet/expressions/CometCast.scala | 129 +++++++++++------- .../org/apache/comet/CometCastSuite.scala | 55 ++++---- 4 files changed, 116 insertions(+), 169 deletions(-) diff --git a/docs/source/user-guide/latest/compatibility.md b/docs/source/user-guide/latest/compatibility.md index 8b6684ac6c..fa09c885e0 100644 --- a/docs/source/user-guide/latest/compatibility.md +++ b/docs/source/user-guide/latest/compatibility.md @@ -131,89 +131,6 @@ The following cast operations are generally compatible with Spark except for the - -| From Type | To Type | Notes | -|-|-|-| -| boolean | byte | | -| boolean | short | | -| boolean | integer | | -| boolean | long | | -| boolean | float | | -| boolean | double | | -| boolean | string | | -| byte | boolean | | -| byte | short | | -| byte | integer | | -| byte | long | | -| byte | float | | -| byte | double | | -| byte | decimal | | -| byte | string | | -| byte | binary | | -| short | boolean | | -| short | byte | | -| short | integer | | -| short | long | | -| short | float | | -| short | double | | -| short | decimal | | -| short | string | | -| short | binary | | -| integer | boolean | | -| integer | byte | | -| integer | short | | -| integer | long | | -| integer | float | | -| integer | double | | -| integer | decimal | | -| integer | string | | -| integer | binary | | -| long | boolean | | -| long | byte | | -| long | short | | -| long | integer | | -| long | float | | -| long | double | | -| long | decimal | | -| long | string | | -| long | binary | | -| float | boolean | | -| float | byte | | -| float | short | | -| float | integer | | -| float | long | | -| float | double | | -| float | string | There can be differences in precision. For example, the input "1.4E-45" will produce 1.0E-45 instead of 1.4E-45 | -| double | boolean | | -| double | byte | | -| double | short | | -| double | integer | | -| double | long | | -| double | float | | -| double | string | There can be differences in precision. For example, the input "1.4E-45" will produce 1.0E-45 instead of 1.4E-45 | -| decimal | boolean | | -| decimal | byte | | -| decimal | short | | -| decimal | integer | | -| decimal | long | | -| decimal | float | | -| decimal | double | | -| decimal | decimal | | -| decimal | string | There can be formatting differences in some case due to Spark using scientific notation where Comet does not | -| string | boolean | | -| string | byte | | -| string | short | | -| string | integer | | -| string | long | | -| string | float | | -| string | double | | -| string | date | Only supports years between 262143 BC and 262142 AD | -| binary | string | | -| date | string | | -| timestamp | long | | -| timestamp | string | | -| timestamp | date | | - ### Incompatible Casts @@ -223,15 +140,6 @@ The following cast operations are not compatible with Spark for all inputs and a - -| From Type | To Type | Notes | -|-|-|-| -| float | decimal | There can be rounding differences | -| double | decimal | There can be rounding differences | -| string | decimal | Does not support fullwidth unicode digits (e.g \\uFF10) -or strings containing null bytes (e.g \\u0000) | -| string | timestamp | Not all valid formats are supported | - ### Unsupported Casts diff --git a/native/spark-expr/src/conversion_funcs/cast.rs b/native/spark-expr/src/conversion_funcs/cast.rs index c41f517120..5cc1d4aa36 100644 --- a/native/spark-expr/src/conversion_funcs/cast.rs +++ b/native/spark-expr/src/conversion_funcs/cast.rs @@ -66,6 +66,7 @@ use std::{ num::Wrapping, sync::Arc, }; +use crate::EvalMode::Legacy; static TIMESTAMP_FORMAT: Option<&str> = Some("%Y-%m-%d %H:%M:%S%.f"); @@ -1124,10 +1125,10 @@ fn cast_array( } (Binary, Utf8) => Ok(cast_binary_to_string::(&array, cast_options)?), (Date32, Timestamp(_, tz)) => Ok(cast_date_to_timestamp(&array, cast_options, tz)?), - (Int8, Binary) => cast_whole_num_to_binary!(&array, Int8Array, 1), - (Int16, Binary) => cast_whole_num_to_binary!(&array, Int16Array, 2), - (Int32, Binary) => cast_whole_num_to_binary!(&array, Int32Array, 4), - (Int64, Binary) => cast_whole_num_to_binary!(&array, Int64Array, 8), + (Int8, Binary) if (eval_mode == Legacy) => cast_whole_num_to_binary!(&array, Int8Array, 1), + (Int16, Binary) if (eval_mode == Legacy) => cast_whole_num_to_binary!(&array, Int16Array, 2), + (Int32, Binary) if (eval_mode == Legacy) => cast_whole_num_to_binary!(&array, Int32Array, 4), + (Int64, Binary) if (eval_mode == Legacy) => cast_whole_num_to_binary!(&array, Int64Array, 8), _ if cast_options.is_adapting_schema || is_datafusion_spark_compatible(from_type, to_type) => { diff --git a/spark/src/main/scala/org/apache/comet/expressions/CometCast.scala b/spark/src/main/scala/org/apache/comet/expressions/CometCast.scala index 30a839c2e8..2fc37187d6 100644 --- a/spark/src/main/scala/org/apache/comet/expressions/CometCast.scala +++ b/spark/src/main/scala/org/apache/comet/expressions/CometCast.scala @@ -126,7 +126,6 @@ object CometCast extends CometExpressionSerde[Cast] with CometExprShim { isSupported(dt.elementType, DataTypes.StringType, timeZoneId, evalMode) case (dt: ArrayType, dt1: ArrayType) => isSupported(dt.elementType, dt1.elementType, timeZoneId, evalMode) - case (from: DataType, _: BinaryType) => canCastToBinary(from) case (dt: DataType, _) if dt.typeName == "timestamp_ntz" => // https://github.com/apache/datafusion-comet/issues/378 toType match { @@ -148,13 +147,13 @@ object CometCast extends CometExpressionSerde[Cast] with CometExprShim { case (DataTypes.BooleanType, _) => canCastFromBoolean(toType) case (DataTypes.ByteType, _) => - canCastFromByte(toType) + canCastFromByte(toType, evalMode) case (DataTypes.ShortType, _) => - canCastFromShort(toType) + canCastFromShort(toType, evalMode) case (DataTypes.IntegerType, _) => - canCastFromInt(toType) + canCastFromInt(toType, evalMode) case (DataTypes.LongType, _) => - canCastFromLong(toType) + canCastFromLong(toType, evalMode) case (DataTypes.FloatType, _) => canCastFromFloat(toType) case (DataTypes.DoubleType, _) => @@ -270,53 +269,85 @@ object CometCast extends CometExpressionSerde[Cast] with CometExprShim { case _ => unsupported(DataTypes.BooleanType, toType) } - private def canCastFromByte(toType: DataType): SupportLevel = toType match { - case DataTypes.BooleanType => - Compatible() - case DataTypes.ShortType | DataTypes.IntegerType | DataTypes.LongType => - Compatible() - case DataTypes.FloatType | DataTypes.DoubleType | _: DecimalType => - Compatible() - case _ => - unsupported(DataTypes.ByteType, toType) - } + private def canCastFromByte(toType: DataType, evalMode: CometEvalMode.Value): SupportLevel = + toType match { + case DataTypes.BooleanType => + Compatible() + case DataTypes.ShortType | DataTypes.IntegerType | DataTypes.LongType => + Compatible() + case DataTypes.FloatType | DataTypes.DoubleType | _: DecimalType => + Compatible() + case DataTypes.BinaryType => + if (evalMode == CometEvalMode.LEGACY) { + Compatible() + } else { + Unsupported( + Some(s"Spark does not support byte to binary conversion in ${evalMode} eval mode")) + } + case _ => + unsupported(DataTypes.ByteType, toType) + } - private def canCastFromShort(toType: DataType): SupportLevel = toType match { - case DataTypes.BooleanType => - Compatible() - case DataTypes.ByteType | DataTypes.IntegerType | DataTypes.LongType => - Compatible() - case DataTypes.FloatType | DataTypes.DoubleType | _: DecimalType => - Compatible() - case _ => - unsupported(DataTypes.ShortType, toType) - } + private def canCastFromShort(toType: DataType, evalMode: CometEvalMode.Value): SupportLevel = + toType match { + case DataTypes.BooleanType => + Compatible() + case DataTypes.ByteType | DataTypes.IntegerType | DataTypes.LongType => + Compatible() + case DataTypes.FloatType | DataTypes.DoubleType | _: DecimalType => + Compatible() + case DataTypes.BinaryType => + if (evalMode == CometEvalMode.LEGACY) { + Compatible() + } else { + Unsupported( + Some(s"Spark does not support short to binary conversion in ${evalMode} eval mode")) + } + case _ => + unsupported(DataTypes.ShortType, toType) + } - private def canCastFromInt(toType: DataType): SupportLevel = toType match { - case DataTypes.BooleanType => - Compatible() - case DataTypes.ByteType | DataTypes.ShortType | DataTypes.LongType => - Compatible() - case DataTypes.FloatType | DataTypes.DoubleType => - Compatible() - case _: DecimalType => - Compatible() - case _ => - unsupported(DataTypes.IntegerType, toType) - } + private def canCastFromInt(toType: DataType, evalMode: CometEvalMode.Value): SupportLevel = + toType match { + case DataTypes.BooleanType => + Compatible() + case DataTypes.ByteType | DataTypes.ShortType | DataTypes.LongType => + Compatible() + case DataTypes.FloatType | DataTypes.DoubleType => + Compatible() + case _: DecimalType => + Compatible() + case DataTypes.BinaryType => + if (evalMode == CometEvalMode.LEGACY) { + Compatible() + } else { + Unsupported( + Some(s"Spark does not support int to binary conversion in ${evalMode} eval mode")) + } + case _ => + unsupported(DataTypes.IntegerType, toType) + } - private def canCastFromLong(toType: DataType): SupportLevel = toType match { - case DataTypes.BooleanType => - Compatible() - case DataTypes.ByteType | DataTypes.ShortType | DataTypes.IntegerType => - Compatible() - case DataTypes.FloatType | DataTypes.DoubleType => - Compatible() - case _: DecimalType => - Compatible() - case _ => - unsupported(DataTypes.LongType, toType) - } + private def canCastFromLong(toType: DataType, evalMode: CometEvalMode.Value): SupportLevel = + toType match { + case DataTypes.BooleanType => + Compatible() + case DataTypes.ByteType | DataTypes.ShortType | DataTypes.IntegerType => + Compatible() + case DataTypes.FloatType | DataTypes.DoubleType => + Compatible() + case _: DecimalType => + Compatible() + case DataTypes.BinaryType => + if (evalMode == CometEvalMode.LEGACY) { + Compatible() + } else { + Unsupported( + Some(s"Spark does not support long to binary conversion in ${evalMode} eval mode")) + } + case _ => + unsupported(DataTypes.LongType, toType) + } private def canCastFromFloat(toType: DataType): SupportLevel = toType match { case DataTypes.BooleanType | DataTypes.DoubleType | DataTypes.ByteType | DataTypes.ShortType | diff --git a/spark/src/test/scala/org/apache/comet/CometCastSuite.scala b/spark/src/test/scala/org/apache/comet/CometCastSuite.scala index df86085dde..72018cf4f4 100644 --- a/spark/src/test/scala/org/apache/comet/CometCastSuite.scala +++ b/spark/src/test/scala/org/apache/comet/CometCastSuite.scala @@ -206,11 +206,12 @@ class CometCastSuite extends CometTestBase with AdaptiveSparkPlanHelper { hasIncompatibleType = usingParquetExecWithIncompatTypes) } - ignore("cast ByteType to BinaryType") { + test("cast ByteType to BinaryType") { + // Spark does not support ANSI or Try mode castTest( generateBytes(), DataTypes.BinaryType, - hasIncompatibleType = usingParquetExecWithIncompatTypes) + hasIncompatibleType = usingParquetExecWithIncompatTypes, testAnsi = false, testTry = false) } ignore("cast ByteType to TimestampType") { @@ -281,10 +282,11 @@ class CometCastSuite extends CometTestBase with AdaptiveSparkPlanHelper { } test("cast ShortType to BinaryType") { +// Spark does not support ANSI or Try mode castTest( generateShorts(), DataTypes.BinaryType, - hasIncompatibleType = usingParquetExecWithIncompatTypes) + hasIncompatibleType = usingParquetExecWithIncompatTypes, testAnsi = false, testTry = false) } ignore("cast ShortType to TimestampType") { @@ -345,8 +347,9 @@ class CometCastSuite extends CometTestBase with AdaptiveSparkPlanHelper { castTest(generateInts(), DataTypes.StringType) } - ignore("cast IntegerType to BinaryType") { - castTest(generateInts(), DataTypes.BinaryType) + test("cast IntegerType to BinaryType") { + // Spark does not support ANSI or Try mode + castTest(generateInts(), DataTypes.BinaryType, testAnsi = false, testTry = false) } ignore("cast IntegerType to TimestampType") { @@ -391,8 +394,9 @@ class CometCastSuite extends CometTestBase with AdaptiveSparkPlanHelper { castTest(generateLongs(), DataTypes.StringType) } - ignore("cast LongType to BinaryType") { - castTest(generateLongs(), DataTypes.BinaryType) + test("cast LongType to BinaryType") { + // Spark does not support ANSI or Try mode + castTest(generateLongs(), DataTypes.BinaryType , testAnsi = false, testTry = false) } ignore("cast LongType to TimestampType") { @@ -1416,28 +1420,30 @@ class CometCastSuite extends CometTestBase with AdaptiveSparkPlanHelper { input: DataFrame, toType: DataType, hasIncompatibleType: Boolean = false, - testAnsi: Boolean = true): Unit = { + testAnsi: Boolean = true, + testTry: Boolean = true): Unit = { withTempPath { dir => val data = roundtripParquet(input, dir).coalesce(1) - data.createOrReplaceTempView("t") withSQLConf((SQLConf.ANSI_ENABLED.key, "false")) { // cast() should return null for invalid inputs when ansi mode is disabled - val df = spark.sql(s"select a, cast(a as ${toType.sql}) from t order by a") + val df = data.select(col("a"), col("a").cast(toType)).orderBy(col("a")) if (hasIncompatibleType) { checkSparkAnswer(df) } else { checkSparkAnswerAndOperator(df) } - // try_cast() should always return null for invalid inputs - val df2 = - spark.sql(s"select a, try_cast(a as ${toType.sql}) from t order by a") - if (hasIncompatibleType) { - checkSparkAnswer(df2) - } else { - checkSparkAnswerAndOperator(df2) + if (testTry){ + // try_cast() should always return null for invalid inputs + val df2 = + data.select(col("a"), col("a").try_cast(toType)).orderBy(col("a")) + if (hasIncompatibleType) { + checkSparkAnswer(df2) + } else { + checkSparkAnswerAndOperator(df2) + } } } @@ -1495,14 +1501,15 @@ class CometCastSuite extends CometTestBase with AdaptiveSparkPlanHelper { } // try_cast() should always return null for invalid inputs - val df2 = - spark.sql(s"select a, try_cast(a as ${toType.sql}) from t order by a") - if (hasIncompatibleType) { - checkSparkAnswer(df2) - } else { - checkSparkAnswerAndOperator(df2) + if (testTry){ + val df2 = + data.select(col("a"), col("a").cast(toType)).orderBy(col("a")) + if (hasIncompatibleType) { + checkSparkAnswer(df2) + } else { + checkSparkAnswerAndOperator(df2) + } } - } } } From b4e637e0ea8a7a488193742562310f2efe066d5d Mon Sep 17 00:00:00 2001 From: B Vadlamani Date: Tue, 13 Jan 2026 15:42:28 -0800 Subject: [PATCH 3/9] int_to_binary --- .../source/user-guide/latest/compatibility.md | 123 +++++++++++++++--- 1 file changed, 107 insertions(+), 16 deletions(-) diff --git a/docs/source/user-guide/latest/compatibility.md b/docs/source/user-guide/latest/compatibility.md index fa09c885e0..a5837a1e59 100644 --- a/docs/source/user-guide/latest/compatibility.md +++ b/docs/source/user-guide/latest/compatibility.md @@ -99,17 +99,16 @@ they will be identical to Spark. Unsorted results may have different row orderin Cast operations in Comet fall into three levels of support: -- **Compatible**: The results match Apache Spark -- **Incompatible**: The results may match Apache Spark for some inputs, but there are known issues where some inputs +- **C (Compatible)**: The results match Apache Spark +- **I (Incompatible)**: The results may match Apache Spark for some inputs, but there are known issues where some inputs will result in incorrect results or exceptions. The query stage will fall back to Spark by default. Setting `spark.comet.expression.Cast.allowIncompatible=true` will allow all incompatible casts to run natively in Comet, but this is not recommended for production use. -- **Unsupported**: Comet does not provide a native version of this cast expression and the query stage will fall back to +- **U (Unsupported)**: Comet does not provide a native version of this cast expression and the query stage will fall back to Spark. +- **N/A**: Spark does not support this cast. -### Compatible Casts - -The following cast operations are generally compatible with Spark except for the differences noted here. +### Legacy Mode <<<<<<< HEAD @@ -130,21 +129,113 @@ The following cast operations are generally compatible with Spark except for the - - - -### Incompatible Casts + + +| | binary | boolean | byte | date | decimal | double | float | integer | long | short | string | timestamp | +|---|---|---|---|---|---|---|---|---|---|---|---|---| +| binary | - | N/A | N/A | N/A | N/A | N/A | N/A | N/A | N/A | N/A | C | N/A | +| boolean | N/A | - | C | N/A | U | C | C | C | C | C | C | U | +| byte | U | C | - | N/A | C | C | C | C | C | C | C | U | +| date | N/A | U | U | - | U | U | U | U | U | U | C | U | +| decimal | N/A | C | C | N/A | - | C | C | C | C | C | C | U | +| double | N/A | C | C | N/A | I | - | C | C | C | C | C | U | +| float | N/A | C | C | N/A | I | C | - | C | C | C | C | U | +| integer | U | C | C | N/A | C | C | C | - | C | C | C | U | +| long | U | C | C | N/A | C | C | C | C | - | C | C | U | +| short | U | C | C | N/A | C | C | C | C | C | - | C | U | +| string | C | C | C | C | I | C | C | C | C | C | - | I | +| timestamp | N/A | U | U | C | U | U | U | U | C | U | C | - | + + +**Notes:** + +- **decimal -> string**: There can be formatting differences in some case due to Spark using scientific notation where Comet does not +- **double -> decimal**: There can be rounding differences +- **double -> string**: There can be differences in precision. For example, the input "1.4E-45" will produce 1.0E-45 instead of 1.4E-45 +- **float -> decimal**: There can be rounding differences +- **float -> string**: There can be differences in precision. For example, the input "1.4E-45" will produce 1.0E-45 instead of 1.4E-45 +- **string -> date**: Only supports years between 262143 BC and 262142 AD +- **string -> decimal**: Does not support fullwidth unicode digits (e.g \\uFF10) + or strings containing null bytes (e.g \\u0000) +- **string -> timestamp**: Not all valid formats are supported + -The following cast operations are not compatible with Spark for all inputs and are disabled by default. +### Try Mode - - - -### Unsupported Casts + + +| | binary | boolean | byte | date | decimal | double | float | integer | long | short | string | timestamp | +|---|---|---|---|---|---|---|---|---|---|---|---|---| +| binary | - | N/A | N/A | N/A | N/A | N/A | N/A | N/A | N/A | N/A | C | N/A | +| boolean | N/A | - | C | N/A | U | C | C | C | C | C | C | U | +| byte | U | C | - | N/A | C | C | C | C | C | C | C | U | +| date | N/A | U | U | - | U | U | U | U | U | U | C | U | +| decimal | N/A | C | C | N/A | - | C | C | C | C | C | C | U | +| double | N/A | C | C | N/A | I | - | C | C | C | C | C | U | +| float | N/A | C | C | N/A | I | C | - | C | C | C | C | U | +| integer | U | C | C | N/A | C | C | C | - | C | C | C | U | +| long | U | C | C | N/A | C | C | C | C | - | C | C | U | +| short | U | C | C | N/A | C | C | C | C | C | - | C | U | +| string | C | C | C | C | I | C | C | C | C | C | - | I | +| timestamp | N/A | U | U | C | U | U | U | U | C | U | C | - | + + +**Notes:** +<<<<<<< HEAD Any cast not listed in the previous tables is currently unsupported. We are working on adding more. See the [tracking issue](https://github.com/apache/datafusion-comet/issues/286) for more details. -> > > > > > > 6f8e1c629 (int_to_binary) +> > > > > > > # 6f8e1c629 (int_to_binary) + +- **decimal -> string**: There can be formatting differences in some case due to Spark using scientific notation where Comet does not +- **double -> decimal**: There can be rounding differences +- **double -> string**: There can be differences in precision. For example, the input "1.4E-45" will produce 1.0E-45 instead of 1.4E-45 +- **float -> decimal**: There can be rounding differences +- **float -> string**: There can be differences in precision. For example, the input "1.4E-45" will produce 1.0E-45 instead of 1.4E-45 +- **string -> date**: Only supports years between 262143 BC and 262142 AD +- **string -> decimal**: Does not support fullwidth unicode digits (e.g \\uFF10) + or strings containing null bytes (e.g \\u0000) +- **string -> timestamp**: Not all valid formats are supported + + +### ANSI Mode + + + + + +| | binary | boolean | byte | date | decimal | double | float | integer | long | short | string | timestamp | +|---|---|---|---|---|---|---|---|---|---|---|---|---| +| binary | - | N/A | N/A | N/A | N/A | N/A | N/A | N/A | N/A | N/A | C | N/A | +| boolean | N/A | - | C | N/A | U | C | C | C | C | C | C | U | +| byte | U | C | - | N/A | C | C | C | C | C | C | C | U | +| date | N/A | U | U | - | U | U | U | U | U | U | C | U | +| decimal | N/A | C | C | N/A | - | C | C | C | C | C | C | U | +| double | N/A | C | C | N/A | I | - | C | C | C | C | C | U | +| float | N/A | C | C | N/A | I | C | - | C | C | C | C | U | +| integer | U | C | C | N/A | C | C | C | - | C | C | C | U | +| long | U | C | C | N/A | C | C | C | C | - | C | C | U | +| short | U | C | C | N/A | C | C | C | C | C | - | C | U | +| string | C | C | C | C | I | C | C | C | C | C | - | I | +| timestamp | N/A | U | U | C | U | U | U | U | C | U | C | - | + + +**Notes:** + +- **decimal -> string**: There can be formatting differences in some case due to Spark using scientific notation where Comet does not +- **double -> decimal**: There can be rounding differences +- **double -> string**: There can be differences in precision. For example, the input "1.4E-45" will produce 1.0E-45 instead of 1.4E-45 +- **float -> decimal**: There can be rounding differences +- **float -> string**: There can be differences in precision. For example, the input "1.4E-45" will produce 1.0E-45 instead of 1.4E-45 +- **string -> date**: Only supports years between 262143 BC and 262142 AD +- **string -> decimal**: Does not support fullwidth unicode digits (e.g \\uFF10) + or strings containing null bytes (e.g \\u0000) +- **string -> timestamp**: ANSI mode not supported + + +See the [tracking issue](https://github.com/apache/datafusion-comet/issues/286) for more details. + +> > > > > > > acee9701f (int_to_binary) From ab4039263ec31d244633ee4c24a45d1a9a18833c Mon Sep 17 00:00:00 2001 From: B Vadlamani Date: Tue, 13 Jan 2026 16:01:47 -0800 Subject: [PATCH 4/9] int_to_binary --- .../apache/comet/expressions/CometCast.scala | 34 ++++--------------- .../org/apache/comet/CometCastSuite.scala | 16 +++++---- 2 files changed, 16 insertions(+), 34 deletions(-) diff --git a/spark/src/main/scala/org/apache/comet/expressions/CometCast.scala b/spark/src/main/scala/org/apache/comet/expressions/CometCast.scala index 2fc37187d6..68d75019ba 100644 --- a/spark/src/main/scala/org/apache/comet/expressions/CometCast.scala +++ b/spark/src/main/scala/org/apache/comet/expressions/CometCast.scala @@ -277,13 +277,8 @@ object CometCast extends CometExpressionSerde[Cast] with CometExprShim { Compatible() case DataTypes.FloatType | DataTypes.DoubleType | _: DecimalType => Compatible() - case DataTypes.BinaryType => - if (evalMode == CometEvalMode.LEGACY) { - Compatible() - } else { - Unsupported( - Some(s"Spark does not support byte to binary conversion in ${evalMode} eval mode")) - } + case DataTypes.BinaryType if (evalMode == CometEvalMode.LEGACY) => + Compatible() case _ => unsupported(DataTypes.ByteType, toType) } @@ -296,13 +291,8 @@ object CometCast extends CometExpressionSerde[Cast] with CometExprShim { Compatible() case DataTypes.FloatType | DataTypes.DoubleType | _: DecimalType => Compatible() - case DataTypes.BinaryType => - if (evalMode == CometEvalMode.LEGACY) { - Compatible() - } else { - Unsupported( - Some(s"Spark does not support short to binary conversion in ${evalMode} eval mode")) - } + case DataTypes.BinaryType if (evalMode == CometEvalMode.LEGACY) => + Compatible() case _ => unsupported(DataTypes.ShortType, toType) } @@ -317,13 +307,7 @@ object CometCast extends CometExpressionSerde[Cast] with CometExprShim { Compatible() case _: DecimalType => Compatible() - case DataTypes.BinaryType => - if (evalMode == CometEvalMode.LEGACY) { - Compatible() - } else { - Unsupported( - Some(s"Spark does not support int to binary conversion in ${evalMode} eval mode")) - } + case DataTypes.BinaryType if (evalMode == CometEvalMode.LEGACY) => Compatible() case _ => unsupported(DataTypes.IntegerType, toType) } @@ -338,13 +322,7 @@ object CometCast extends CometExpressionSerde[Cast] with CometExprShim { Compatible() case _: DecimalType => Compatible() - case DataTypes.BinaryType => - if (evalMode == CometEvalMode.LEGACY) { - Compatible() - } else { - Unsupported( - Some(s"Spark does not support long to binary conversion in ${evalMode} eval mode")) - } + case DataTypes.BinaryType if (evalMode == CometEvalMode.LEGACY) => Compatible() case _ => unsupported(DataTypes.LongType, toType) } diff --git a/spark/src/test/scala/org/apache/comet/CometCastSuite.scala b/spark/src/test/scala/org/apache/comet/CometCastSuite.scala index 72018cf4f4..08671eb0e8 100644 --- a/spark/src/test/scala/org/apache/comet/CometCastSuite.scala +++ b/spark/src/test/scala/org/apache/comet/CometCastSuite.scala @@ -211,7 +211,9 @@ class CometCastSuite extends CometTestBase with AdaptiveSparkPlanHelper { castTest( generateBytes(), DataTypes.BinaryType, - hasIncompatibleType = usingParquetExecWithIncompatTypes, testAnsi = false, testTry = false) + hasIncompatibleType = usingParquetExecWithIncompatTypes, + testAnsi = false, + testTry = false) } ignore("cast ByteType to TimestampType") { @@ -286,7 +288,9 @@ class CometCastSuite extends CometTestBase with AdaptiveSparkPlanHelper { castTest( generateShorts(), DataTypes.BinaryType, - hasIncompatibleType = usingParquetExecWithIncompatTypes, testAnsi = false, testTry = false) + hasIncompatibleType = usingParquetExecWithIncompatTypes, + testAnsi = false, + testTry = false) } ignore("cast ShortType to TimestampType") { @@ -396,7 +400,7 @@ class CometCastSuite extends CometTestBase with AdaptiveSparkPlanHelper { test("cast LongType to BinaryType") { // Spark does not support ANSI or Try mode - castTest(generateLongs(), DataTypes.BinaryType , testAnsi = false, testTry = false) + castTest(generateLongs(), DataTypes.BinaryType, testAnsi = false, testTry = false) } ignore("cast LongType to TimestampType") { @@ -1435,7 +1439,7 @@ class CometCastSuite extends CometTestBase with AdaptiveSparkPlanHelper { checkSparkAnswerAndOperator(df) } - if (testTry){ + if (testTry) { // try_cast() should always return null for invalid inputs val df2 = data.select(col("a"), col("a").try_cast(toType)).orderBy(col("a")) @@ -1501,9 +1505,9 @@ class CometCastSuite extends CometTestBase with AdaptiveSparkPlanHelper { } // try_cast() should always return null for invalid inputs - if (testTry){ + if (testTry) { val df2 = - data.select(col("a"), col("a").cast(toType)).orderBy(col("a")) + data.select(col("a"), col("a").try_cast(toType)).orderBy(col("a")) if (hasIncompatibleType) { checkSparkAnswer(df2) } else { From 092c00e5a8423344e45de85efabd3f5e050a8e74 Mon Sep 17 00:00:00 2001 From: B Vadlamani Date: Tue, 13 Jan 2026 18:51:41 -0800 Subject: [PATCH 5/9] int_to_binary_boolean_to_decimal --- .../spark-expr/src/conversion_funcs/cast.rs | 36 +++++++++++++++---- .../apache/comet/expressions/CometCast.scala | 4 +-- .../org/apache/comet/CometCastSuite.scala | 18 +++++++--- 3 files changed, 45 insertions(+), 13 deletions(-) diff --git a/native/spark-expr/src/conversion_funcs/cast.rs b/native/spark-expr/src/conversion_funcs/cast.rs index 5cc1d4aa36..d9717e085b 100644 --- a/native/spark-expr/src/conversion_funcs/cast.rs +++ b/native/spark-expr/src/conversion_funcs/cast.rs @@ -16,6 +16,7 @@ // under the License. use crate::utils::array_with_timezone; +use crate::EvalMode::Legacy; use crate::{timezone, BinaryOutputStyle}; use crate::{EvalMode, SparkError, SparkResult}; use arrow::array::builder::StringBuilder; @@ -25,8 +26,8 @@ use arrow::array::{ }; use arrow::compute::can_cast_types; use arrow::datatypes::{ - i256, ArrowDictionaryKeyType, ArrowNativeType, DataType, Decimal256Type, - GenericBinaryType, Schema, + i256, ArrowDictionaryKeyType, ArrowNativeType, DataType, Decimal256Type, GenericBinaryType, + Schema, }; use arrow::{ array::{ @@ -66,7 +67,6 @@ use std::{ num::Wrapping, sync::Arc, }; -use crate::EvalMode::Legacy; static TIMESTAMP_FORMAT: Option<&str> = Some("%Y-%m-%d %H:%M:%S%.f"); @@ -305,7 +305,10 @@ fn can_cast_from_timestamp(to_type: &DataType, _options: &SparkCastOptions) -> b fn can_cast_from_boolean(to_type: &DataType, _: &SparkCastOptions) -> bool { use DataType::*; - matches!(to_type, Int8 | Int16 | Int32 | Int64 | Float32 | Float64) + matches!( + to_type, + Int8 | Int16 | Int32 | Int64 | Float32 | Float64 | Decimal128(_, _) + ) } fn can_cast_from_byte(to_type: &DataType, _: &SparkCastOptions) -> bool { @@ -1126,9 +1129,18 @@ fn cast_array( (Binary, Utf8) => Ok(cast_binary_to_string::(&array, cast_options)?), (Date32, Timestamp(_, tz)) => Ok(cast_date_to_timestamp(&array, cast_options, tz)?), (Int8, Binary) if (eval_mode == Legacy) => cast_whole_num_to_binary!(&array, Int8Array, 1), - (Int16, Binary) if (eval_mode == Legacy) => cast_whole_num_to_binary!(&array, Int16Array, 2), - (Int32, Binary) if (eval_mode == Legacy) => cast_whole_num_to_binary!(&array, Int32Array, 4), - (Int64, Binary) if (eval_mode == Legacy) => cast_whole_num_to_binary!(&array, Int64Array, 8), + (Int16, Binary) if (eval_mode == Legacy) => { + cast_whole_num_to_binary!(&array, Int16Array, 2) + } + (Int32, Binary) if (eval_mode == Legacy) => { + cast_whole_num_to_binary!(&array, Int32Array, 4) + } + (Int64, Binary) if (eval_mode == Legacy) => { + cast_whole_num_to_binary!(&array, Int64Array, 8) + } + (Boolean, Decimal128(precision, scale)) => { + cast_boolean_to_decimal(&array, *precision, *scale) + } _ if cast_options.is_adapting_schema || is_datafusion_spark_compatible(from_type, to_type) => { @@ -1191,6 +1203,16 @@ fn cast_date_to_timestamp( )) } +fn cast_boolean_to_decimal(array: &ArrayRef, precision: u8, scale: i8) -> SparkResult { + let bool_array = array.as_boolean(); + let scale_factor = 10_i128.pow(scale as u32); + let result: Decimal128Array = bool_array + .iter() + .map(|v| v.map(|b| if b { scale_factor } else { 0 })) + .collect(); + Ok(Arc::new(result.with_precision_and_scale(precision, scale)?)) +} + fn cast_string_to_float( array: &ArrayRef, to_type: &DataType, diff --git a/spark/src/main/scala/org/apache/comet/expressions/CometCast.scala b/spark/src/main/scala/org/apache/comet/expressions/CometCast.scala index 68d75019ba..d419568063 100644 --- a/spark/src/main/scala/org/apache/comet/expressions/CometCast.scala +++ b/spark/src/main/scala/org/apache/comet/expressions/CometCast.scala @@ -21,7 +21,7 @@ package org.apache.comet.expressions import org.apache.spark.sql.catalyst.expressions.{Attribute, Cast, Expression, Literal} import org.apache.spark.sql.internal.SQLConf -import org.apache.spark.sql.types.{ArrayType, BinaryType, DataType, DataTypes, DecimalType, NullType, StructType} +import org.apache.spark.sql.types.{ArrayType, DataType, DataTypes, DecimalType, NullType, StructType} import org.apache.comet.CometConf import org.apache.comet.CometSparkSessionExtensions.withInfo @@ -264,7 +264,7 @@ object CometCast extends CometExpressionSerde[Cast] with CometExprShim { private def canCastFromBoolean(toType: DataType): SupportLevel = toType match { case DataTypes.ByteType | DataTypes.ShortType | DataTypes.IntegerType | DataTypes.LongType | - DataTypes.FloatType | DataTypes.DoubleType => + DataTypes.FloatType | DataTypes.DoubleType | _: DecimalType => Compatible() case _ => unsupported(DataTypes.BooleanType, toType) } diff --git a/spark/src/test/scala/org/apache/comet/CometCastSuite.scala b/spark/src/test/scala/org/apache/comet/CometCastSuite.scala index 08671eb0e8..f6a6defef7 100644 --- a/spark/src/test/scala/org/apache/comet/CometCastSuite.scala +++ b/spark/src/test/scala/org/apache/comet/CometCastSuite.scala @@ -134,11 +134,18 @@ class CometCastSuite extends CometTestBase with AdaptiveSparkPlanHelper { castTest(generateBools(), DataTypes.DoubleType) } - ignore("cast BooleanType to DecimalType(10,2)") { - // Arrow error: Cast error: Casting from Boolean to Decimal128(10, 2) not supported + test("cast BooleanType to DecimalType(10,2)") { castTest(generateBools(), DataTypes.createDecimalType(10, 2)) } + test("cast BooleanType to DecimalType(14,4)") { + castTest(generateBools(), DataTypes.createDecimalType(14, 4)) + } + + test("cast BooleanType to DecimalType(30,0)") { + castTest(generateBools(), DataTypes.createDecimalType(30, 0)) + } + test("cast BooleanType to StringType") { castTest(generateBools(), DataTypes.StringType) } @@ -1440,9 +1447,11 @@ class CometCastSuite extends CometTestBase with AdaptiveSparkPlanHelper { } if (testTry) { + data.createOrReplaceTempView("t") // try_cast() should always return null for invalid inputs +// not using spark DSL since it `try_cast` is only available from Spark 4x val df2 = - data.select(col("a"), col("a").try_cast(toType)).orderBy(col("a")) + spark.sql(s"select a, try_cast(a as ${toType.sql}) from t order by a") if (hasIncompatibleType) { checkSparkAnswer(df2) } else { @@ -1506,8 +1515,9 @@ class CometCastSuite extends CometTestBase with AdaptiveSparkPlanHelper { // try_cast() should always return null for invalid inputs if (testTry) { + data.createOrReplaceTempView("t") val df2 = - data.select(col("a"), col("a").try_cast(toType)).orderBy(col("a")) + spark.sql(s"select a, try_cast(a as ${toType.sql}) from t order by a") if (hasIncompatibleType) { checkSparkAnswer(df2) } else { From 55f7def08165597e6d777832054c2a64e8957f37 Mon Sep 17 00:00:00 2001 From: B Vadlamani Date: Tue, 13 Jan 2026 22:19:32 -0800 Subject: [PATCH 6/9] int_to_binary_boolean_to_decimal --- docs/source/user-guide/latest/compatibility.md | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/docs/source/user-guide/latest/compatibility.md b/docs/source/user-guide/latest/compatibility.md index a5837a1e59..ac7676c57b 100644 --- a/docs/source/user-guide/latest/compatibility.md +++ b/docs/source/user-guide/latest/compatibility.md @@ -134,15 +134,15 @@ Cast operations in Comet fall into three levels of support: | | binary | boolean | byte | date | decimal | double | float | integer | long | short | string | timestamp | |---|---|---|---|---|---|---|---|---|---|---|---|---| | binary | - | N/A | N/A | N/A | N/A | N/A | N/A | N/A | N/A | N/A | C | N/A | -| boolean | N/A | - | C | N/A | U | C | C | C | C | C | C | U | -| byte | U | C | - | N/A | C | C | C | C | C | C | C | U | +| boolean | N/A | - | C | N/A | C | C | C | C | C | C | C | U | +| byte | C | C | - | N/A | C | C | C | C | C | C | C | U | | date | N/A | U | U | - | U | U | U | U | U | U | C | U | | decimal | N/A | C | C | N/A | - | C | C | C | C | C | C | U | | double | N/A | C | C | N/A | I | - | C | C | C | C | C | U | | float | N/A | C | C | N/A | I | C | - | C | C | C | C | U | -| integer | U | C | C | N/A | C | C | C | - | C | C | C | U | -| long | U | C | C | N/A | C | C | C | C | - | C | C | U | -| short | U | C | C | N/A | C | C | C | C | C | - | C | U | +| integer | C | C | C | N/A | C | C | C | - | C | C | C | U | +| long | C | C | C | N/A | C | C | C | C | - | C | C | U | +| short | C | C | C | N/A | C | C | C | C | C | - | C | U | | string | C | C | C | C | I | C | C | C | C | C | - | I | | timestamp | N/A | U | U | C | U | U | U | U | C | U | C | - | @@ -169,7 +169,7 @@ Cast operations in Comet fall into three levels of support: | | binary | boolean | byte | date | decimal | double | float | integer | long | short | string | timestamp | |---|---|---|---|---|---|---|---|---|---|---|---|---| | binary | - | N/A | N/A | N/A | N/A | N/A | N/A | N/A | N/A | N/A | C | N/A | -| boolean | N/A | - | C | N/A | U | C | C | C | C | C | C | U | +| boolean | N/A | - | C | N/A | C | C | C | C | C | C | C | U | | byte | U | C | - | N/A | C | C | C | C | C | C | C | U | | date | N/A | U | U | - | U | U | U | U | U | U | C | U | | decimal | N/A | C | C | N/A | - | C | C | C | C | C | C | U | @@ -210,7 +210,7 @@ Any cast not listed in the previous tables is currently unsupported. We are work | | binary | boolean | byte | date | decimal | double | float | integer | long | short | string | timestamp | |---|---|---|---|---|---|---|---|---|---|---|---|---| | binary | - | N/A | N/A | N/A | N/A | N/A | N/A | N/A | N/A | N/A | C | N/A | -| boolean | N/A | - | C | N/A | U | C | C | C | C | C | C | U | +| boolean | N/A | - | C | N/A | C | C | C | C | C | C | C | U | | byte | U | C | - | N/A | C | C | C | C | C | C | C | U | | date | N/A | U | U | - | U | U | U | U | U | U | C | U | | decimal | N/A | C | C | N/A | - | C | C | C | C | C | C | U | From 7623992b99dbd3b1bea717f2b7ad0eb619091a55 Mon Sep 17 00:00:00 2001 From: B Vadlamani Date: Sun, 1 Feb 2026 13:00:18 -0800 Subject: [PATCH 7/9] rebase_main --- .../main/scala/org/apache/comet/expressions/CometCast.scala | 6 ------ 1 file changed, 6 deletions(-) diff --git a/spark/src/main/scala/org/apache/comet/expressions/CometCast.scala b/spark/src/main/scala/org/apache/comet/expressions/CometCast.scala index d419568063..000cc5fd4f 100644 --- a/spark/src/main/scala/org/apache/comet/expressions/CometCast.scala +++ b/spark/src/main/scala/org/apache/comet/expressions/CometCast.scala @@ -361,12 +361,6 @@ object CometCast extends CometExpressionSerde[Cast] with CometExprShim { case _ => Unsupported(Some(s"Cast from DateType to $toType is not supported")) } - private def canCastToBinary(fromType: DataType): SupportLevel = fromType match { - case DataTypes.ByteType | DataTypes.ShortType | DataTypes.IntegerType | DataTypes.LongType => - Compatible() - case _ => Unsupported(Some(s"Cast from $fromType to BinaryType is not supported")) - } - private def unsupported(fromType: DataType, toType: DataType): Unsupported = { Unsupported(Some(s"Cast from $fromType to $toType is not supported")) } From af88fd9acbe7a47948193fb12091ed919ae3f5cb Mon Sep 17 00:00:00 2001 From: B Vadlamani Date: Sun, 1 Feb 2026 13:08:46 -0800 Subject: [PATCH 8/9] rebase_main --- .../source/user-guide/latest/compatibility.md | 115 ------------------ 1 file changed, 115 deletions(-) diff --git a/docs/source/user-guide/latest/compatibility.md b/docs/source/user-guide/latest/compatibility.md index ac7676c57b..c09f6a61e6 100644 --- a/docs/source/user-guide/latest/compatibility.md +++ b/docs/source/user-guide/latest/compatibility.md @@ -110,8 +110,6 @@ Cast operations in Comet fall into three levels of support: ### Legacy Mode -<<<<<<< HEAD - @@ -125,117 +123,4 @@ Cast operations in Comet fall into three levels of support: -# See the [tracking issue](https://github.com/apache/datafusion-comet/issues/286) for more details. - - - - - -| | binary | boolean | byte | date | decimal | double | float | integer | long | short | string | timestamp | -|---|---|---|---|---|---|---|---|---|---|---|---|---| -| binary | - | N/A | N/A | N/A | N/A | N/A | N/A | N/A | N/A | N/A | C | N/A | -| boolean | N/A | - | C | N/A | C | C | C | C | C | C | C | U | -| byte | C | C | - | N/A | C | C | C | C | C | C | C | U | -| date | N/A | U | U | - | U | U | U | U | U | U | C | U | -| decimal | N/A | C | C | N/A | - | C | C | C | C | C | C | U | -| double | N/A | C | C | N/A | I | - | C | C | C | C | C | U | -| float | N/A | C | C | N/A | I | C | - | C | C | C | C | U | -| integer | C | C | C | N/A | C | C | C | - | C | C | C | U | -| long | C | C | C | N/A | C | C | C | C | - | C | C | U | -| short | C | C | C | N/A | C | C | C | C | C | - | C | U | -| string | C | C | C | C | I | C | C | C | C | C | - | I | -| timestamp | N/A | U | U | C | U | U | U | U | C | U | C | - | - - -**Notes:** - -- **decimal -> string**: There can be formatting differences in some case due to Spark using scientific notation where Comet does not -- **double -> decimal**: There can be rounding differences -- **double -> string**: There can be differences in precision. For example, the input "1.4E-45" will produce 1.0E-45 instead of 1.4E-45 -- **float -> decimal**: There can be rounding differences -- **float -> string**: There can be differences in precision. For example, the input "1.4E-45" will produce 1.0E-45 instead of 1.4E-45 -- **string -> date**: Only supports years between 262143 BC and 262142 AD -- **string -> decimal**: Does not support fullwidth unicode digits (e.g \\uFF10) - or strings containing null bytes (e.g \\u0000) -- **string -> timestamp**: Not all valid formats are supported - - -### Try Mode - - - - - -| | binary | boolean | byte | date | decimal | double | float | integer | long | short | string | timestamp | -|---|---|---|---|---|---|---|---|---|---|---|---|---| -| binary | - | N/A | N/A | N/A | N/A | N/A | N/A | N/A | N/A | N/A | C | N/A | -| boolean | N/A | - | C | N/A | C | C | C | C | C | C | C | U | -| byte | U | C | - | N/A | C | C | C | C | C | C | C | U | -| date | N/A | U | U | - | U | U | U | U | U | U | C | U | -| decimal | N/A | C | C | N/A | - | C | C | C | C | C | C | U | -| double | N/A | C | C | N/A | I | - | C | C | C | C | C | U | -| float | N/A | C | C | N/A | I | C | - | C | C | C | C | U | -| integer | U | C | C | N/A | C | C | C | - | C | C | C | U | -| long | U | C | C | N/A | C | C | C | C | - | C | C | U | -| short | U | C | C | N/A | C | C | C | C | C | - | C | U | -| string | C | C | C | C | I | C | C | C | C | C | - | I | -| timestamp | N/A | U | U | C | U | U | U | U | C | U | C | - | - - -**Notes:** - -<<<<<<< HEAD -Any cast not listed in the previous tables is currently unsupported. We are working on adding more. See the -[tracking issue](https://github.com/apache/datafusion-comet/issues/286) for more details. - -> > > > > > > # 6f8e1c629 (int_to_binary) - -- **decimal -> string**: There can be formatting differences in some case due to Spark using scientific notation where Comet does not -- **double -> decimal**: There can be rounding differences -- **double -> string**: There can be differences in precision. For example, the input "1.4E-45" will produce 1.0E-45 instead of 1.4E-45 -- **float -> decimal**: There can be rounding differences -- **float -> string**: There can be differences in precision. For example, the input "1.4E-45" will produce 1.0E-45 instead of 1.4E-45 -- **string -> date**: Only supports years between 262143 BC and 262142 AD -- **string -> decimal**: Does not support fullwidth unicode digits (e.g \\uFF10) - or strings containing null bytes (e.g \\u0000) -- **string -> timestamp**: Not all valid formats are supported - - -### ANSI Mode - - - - - -| | binary | boolean | byte | date | decimal | double | float | integer | long | short | string | timestamp | -|---|---|---|---|---|---|---|---|---|---|---|---|---| -| binary | - | N/A | N/A | N/A | N/A | N/A | N/A | N/A | N/A | N/A | C | N/A | -| boolean | N/A | - | C | N/A | C | C | C | C | C | C | C | U | -| byte | U | C | - | N/A | C | C | C | C | C | C | C | U | -| date | N/A | U | U | - | U | U | U | U | U | U | C | U | -| decimal | N/A | C | C | N/A | - | C | C | C | C | C | C | U | -| double | N/A | C | C | N/A | I | - | C | C | C | C | C | U | -| float | N/A | C | C | N/A | I | C | - | C | C | C | C | U | -| integer | U | C | C | N/A | C | C | C | - | C | C | C | U | -| long | U | C | C | N/A | C | C | C | C | - | C | C | U | -| short | U | C | C | N/A | C | C | C | C | C | - | C | U | -| string | C | C | C | C | I | C | C | C | C | C | - | I | -| timestamp | N/A | U | U | C | U | U | U | U | C | U | C | - | - - -**Notes:** - -- **decimal -> string**: There can be formatting differences in some case due to Spark using scientific notation where Comet does not -- **double -> decimal**: There can be rounding differences -- **double -> string**: There can be differences in precision. For example, the input "1.4E-45" will produce 1.0E-45 instead of 1.4E-45 -- **float -> decimal**: There can be rounding differences -- **float -> string**: There can be differences in precision. For example, the input "1.4E-45" will produce 1.0E-45 instead of 1.4E-45 -- **string -> date**: Only supports years between 262143 BC and 262142 AD -- **string -> decimal**: Does not support fullwidth unicode digits (e.g \\uFF10) - or strings containing null bytes (e.g \\u0000) -- **string -> timestamp**: ANSI mode not supported - - See the [tracking issue](https://github.com/apache/datafusion-comet/issues/286) for more details. - -> > > > > > > acee9701f (int_to_binary) From b49b178fbc55f4a10f81b4cc2a7e4e31aa0e1281 Mon Sep 17 00:00:00 2001 From: B Vadlamani Date: Thu, 5 Feb 2026 08:53:42 -0800 Subject: [PATCH 9/9] support_cast_int_to_binary --- native/spark-expr/src/conversion_funcs/cast.rs | 4 ++-- spark/src/test/scala/org/apache/comet/CometCastSuite.scala | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/native/spark-expr/src/conversion_funcs/cast.rs b/native/spark-expr/src/conversion_funcs/cast.rs index d9717e085b..be52574774 100644 --- a/native/spark-expr/src/conversion_funcs/cast.rs +++ b/native/spark-expr/src/conversion_funcs/cast.rs @@ -1205,10 +1205,10 @@ fn cast_date_to_timestamp( fn cast_boolean_to_decimal(array: &ArrayRef, precision: u8, scale: i8) -> SparkResult { let bool_array = array.as_boolean(); - let scale_factor = 10_i128.pow(scale as u32); + let scaled_val = 10_i128.pow(scale as u32); let result: Decimal128Array = bool_array .iter() - .map(|v| v.map(|b| if b { scale_factor } else { 0 })) + .map(|v| v.map(|b| if b { scaled_val } else { 0 })) .collect(); Ok(Arc::new(result.with_precision_and_scale(precision, scale)?)) } diff --git a/spark/src/test/scala/org/apache/comet/CometCastSuite.scala b/spark/src/test/scala/org/apache/comet/CometCastSuite.scala index f6a6defef7..9fc9a16579 100644 --- a/spark/src/test/scala/org/apache/comet/CometCastSuite.scala +++ b/spark/src/test/scala/org/apache/comet/CometCastSuite.scala @@ -1448,7 +1448,7 @@ class CometCastSuite extends CometTestBase with AdaptiveSparkPlanHelper { if (testTry) { data.createOrReplaceTempView("t") - // try_cast() should always return null for invalid inputs +// try_cast() should always return null for invalid inputs // not using spark DSL since it `try_cast` is only available from Spark 4x val df2 = spark.sql(s"select a, try_cast(a as ${toType.sql}) from t order by a")