diff --git a/docs/source/user-guide/latest/expressions.md b/docs/source/user-guide/latest/expressions.md
index 0339cd2a3e..126c570dbf 100644
--- a/docs/source/user-guide/latest/expressions.md
+++ b/docs/source/user-guide/latest/expressions.md
@@ -245,7 +245,7 @@ Comet supports using the following aggregate functions within window contexts wi
 | ArrayRemove | Yes | |
 | ArrayRepeat | No | |
 | ArrayUnion | No | Behaves differently than spark. Comet sorts the input arrays before performing the union, while Spark preserves the order of the first array and appends unique elements from the second. |
-| ArraysOverlap | No | |
+| ArraysOverlap | No | Null handling differs: Comet returns `false` when no common elements exist and null elements are present, while Spark returns `null` (three-valued logic). Example: `arrays_overlap(array(1, null, 3), array(4, 5))` returns `null` in Spark but `false` in Comet. |
 | CreateArray | Yes | |
 | ElementAt | Yes | Input must be an array. Map inputs are not supported. |
 | Flatten | Yes | |
diff --git a/spark/src/main/scala/org/apache/comet/serde/arrays.scala b/spark/src/main/scala/org/apache/comet/serde/arrays.scala
index b552a071d6..342f3da244 100644
--- a/spark/src/main/scala/org/apache/comet/serde/arrays.scala
+++ b/spark/src/main/scala/org/apache/comet/serde/arrays.scala
@@ -198,7 +198,11 @@ object CometArrayMin extends CometExpressionSerde[ArrayMin] {
 
 object CometArraysOverlap extends CometExpressionSerde[ArraysOverlap] {
 
-  override def getSupportLevel(expr: ArraysOverlap): SupportLevel = Incompatible(None)
+  override def getSupportLevel(expr: ArraysOverlap): SupportLevel = Incompatible(Some(
+    "Null handling differs from Spark: DataFusion's array_has_any returns false when no " +
+      "common elements are found, even if null elements exist. Spark returns null in such " +
+      "cases following three-valued logic (SQL standard). Example: " +
+      "arrays_overlap(array(1, null, 3), array(4, 5)) returns null in Spark but false in Comet."))
 
   override def convert(
       expr: ArraysOverlap,
diff --git a/spark/src/test/scala/org/apache/comet/CometArrayExpressionSuite.scala b/spark/src/test/scala/org/apache/comet/CometArrayExpressionSuite.scala
index cf49117364..3ce1a3fb41 100644
--- a/spark/src/test/scala/org/apache/comet/CometArrayExpressionSuite.scala
+++ b/spark/src/test/scala/org/apache/comet/CometArrayExpressionSuite.scala
@@ -521,6 +521,43 @@ class CometArrayExpressionSuite extends CometTestBase with AdaptiveSparkPlanHelp
     }
   }
 
+  test("arrays_overlap - null handling behavior verification") {
+    withSQLConf(CometConf.getExprAllowIncompatConfigKey(classOf[ArraysOverlap]) -> "true") {
+      withTempDir { dir =>
+        withTempView("t1") {
+          val path = new Path(dir.toURI.toString, "test.parquet")
+          makeParquetFileAllPrimitiveTypes(path, dictionaryEnabled = false, 100)
+          spark.read.parquet(path.toString).createOrReplaceTempView("t1")
+
+          // Test case 1: Common element exists - should return true
+          checkSparkAnswerAndOperator(sql(
+            "SELECT arrays_overlap(array(1, 2, 3), array(3, 4, 5)) from t1 limit 1"))
+
+          // Test case 2: No common elements, no nulls - should return false
+          checkSparkAnswerAndOperator(sql(
+            "SELECT arrays_overlap(array(1, 2), array(3, 4)) from t1 limit 1"))
+
+          // Test case 3: No common elements, but null exists - Spark returns null (three-valued logic)
+          // This is the key incompatibility case documented in issue #3175
+          checkSparkAnswerAndOperator(sql(
+            "SELECT arrays_overlap(array(1, null, 3), array(4, 5)) from t1 limit 1"))
+
+          // Test case 4: Common element exists even with null - should return true
+          checkSparkAnswerAndOperator(sql(
+            "SELECT arrays_overlap(array(1, null, 3), array(1, 4)) from t1 limit 1"))
+
+          // Test case 5: Both arrays have null but no common non-null elements
+          checkSparkAnswerAndOperator(sql(
+            "SELECT arrays_overlap(array(1, null), array(2, null)) from t1 limit 1"))
+
+          // Test case 6: Empty arrays
+          checkSparkAnswerAndOperator(sql(
+            "SELECT arrays_overlap(array(), array(1, 2)) from t1 limit 1"))
+        }
+      }
+    }
+  }
+
   test("array_compact") {
     // TODO fix for Spark 4.0.0
     assume(!isSpark40Plus)