-
Notifications
You must be signed in to change notification settings - Fork 283
Feat: add support for elt expression
#3269
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
768b3e9
c68c342
d887555
231aa90
9500bbb
9577481
3791557
7c2f082
609a605
a151b2c
ad3e7f5
ea92e4b
8dfeca3
559741e
ebda14e
408152e
d7857b2
63fb715
4b07cc4
0f518f6
1604dc6
fc22ee7
aef41be
70c400b
8e3f482
2e66014
ce7a5c5
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -21,8 +21,8 @@ package org.apache.comet.serde | |
|
|
||
| import java.util.Locale | ||
|
|
||
| import org.apache.spark.sql.catalyst.expressions.{Attribute, Cast, Concat, Expression, InitCap, Left, Length, Like, Literal, Lower, RegExpReplace, RLike, StringLPad, StringRepeat, StringRPad, Substring, Upper} | ||
| import org.apache.spark.sql.types.{BinaryType, DataTypes, LongType, StringType} | ||
| import org.apache.spark.sql.catalyst.expressions.{Attribute, Cast, Concat, Elt, Expression, InitCap, Left, Length, Like, Literal, Lower, RegExpReplace, RLike, StringLPad, StringRepeat, StringRPad, Substring, Upper} | ||
| import org.apache.spark.sql.types._ | ||
|
|
||
| import org.apache.comet.CometConf | ||
| import org.apache.comet.CometSparkSessionExtensions.withInfo | ||
|
|
@@ -289,6 +289,16 @@ object CometRegExpReplace extends CometExpressionSerde[RegExpReplace] { | |
| } | ||
| } | ||
|
|
||
| object CometElt extends CometScalarFunction[Elt]("elt") { | ||
|
|
||
| override def getSupportLevel(expr: Elt): SupportLevel = { | ||
| if (expr.failOnError) { | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Thank you for the PR @kazantsev-maksim .Minor nitpick : We generally add
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Thanks, fixed |
||
| return Unsupported(Some("ANSI mode not supported")) | ||
| } | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. We also might want to check if the inputs are a number and an array to make sure we terminate early in case the user provides malformed input
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Relevant examples : CometLength , CometRPad , CometLPad
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The first argument can be a numeric string - '2', it seems we can't cover all cases here.
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think we can do a selective support for now (say Int, String) inputs for now and file an issue for further enhancements down the line. WDYT @kazantsev-maksim ?
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I've added it. @coderfender Can you please see?
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. thank you |
||
| Compatible(None) | ||
| } | ||
| } | ||
|
|
||
| trait CommonStringExprs { | ||
|
|
||
| def stringDecode( | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -23,8 +23,9 @@ import scala.util.Random | |
|
|
||
| import org.apache.parquet.hadoop.ParquetOutputFormat | ||
| import org.apache.spark.sql.{CometTestBase, DataFrame} | ||
| import org.apache.spark.sql.functions.lit | ||
| import org.apache.spark.sql.internal.SQLConf | ||
| import org.apache.spark.sql.types.{DataTypes, StructField, StructType} | ||
| import org.apache.spark.sql.types.{DataTypes, StringType, StructField, StructType} | ||
|
|
||
| import org.apache.comet.testing.{DataGenOptions, FuzzDataGenerator} | ||
|
|
||
|
|
@@ -391,4 +392,43 @@ class CometStringExpressionSuite extends CometTestBase { | |
| } | ||
| } | ||
|
|
||
| test("elt") { | ||
| val wrongNumArgsWithoutSuggestionExceptionMsg = | ||
| "[WRONG_NUM_ARGS.WITHOUT_SUGGESTION] The `elt` requires > 1 parameters but the actual number is 1." | ||
| withSQLConf( | ||
| SQLConf.ANSI_ENABLED.key -> "false", | ||
| SQLConf.OPTIMIZER_EXCLUDED_RULES.key -> | ||
| "org.apache.spark.sql.catalyst.optimizer.ConstantFolding") { | ||
| val r = new Random(42) | ||
| val fieldsCount = 10 | ||
| val indexes = Seq.range(1, fieldsCount) | ||
| val edgeCasesIndexes = Seq(-1, 0, -100, fieldsCount + 100) | ||
| val schema = indexes | ||
| .foldLeft(new StructType())((schema, idx) => | ||
| schema.add(s"c$idx", StringType, nullable = true)) | ||
| val df = FuzzDataGenerator.generateDataFrame( | ||
| r, | ||
| spark, | ||
| schema, | ||
| 100, | ||
| DataGenOptions(maxStringLength = 6)) | ||
| df.withColumn( | ||
| "idx", | ||
| lit(Random.shuffle(indexes ++ edgeCasesIndexes).headOption.getOrElse(-1))) | ||
| .createOrReplaceTempView("t1") | ||
| checkSparkAnswerAndOperator( | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. It would also be worth adding a literal test with constant folding disabled, something like:
withSQLConf(SQLConf.OPTIMIZER_EXCLUDED_RULES.key ->
"org.apache.spark.sql.catalyst.optimizer.ConstantFolding") {
checkSparkAnswerAndOperator("SELECT elt(2, 'a', 'b', 'c')") It is totally fine to fall back to Spark if all inputs are literal. |
||
| sql(s"SELECT elt(idx, ${schema.fieldNames.mkString(",")}) FROM t1")) | ||
| checkSparkAnswerAndOperator( | ||
| sql(s"SELECT elt(cast(null as int), ${schema.fieldNames.mkString(",")}) FROM t1")) | ||
| checkSparkAnswerMaybeThrows(sql("SELECT elt(1) FROM t1")) match { | ||
| case (Some(spark), Some(comet)) => | ||
| assert(spark.getMessage.contains(wrongNumArgsWithoutSuggestionExceptionMsg)) | ||
| assert(comet.getMessage.contains(wrongNumArgsWithoutSuggestionExceptionMsg)) | ||
| case (spark, comet) => | ||
| fail( | ||
| s"Expected Spark and Comet to throw exception, but got\nSpark: $spark\nComet: $comet") | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. it would be great if you can add more tests to cover other code paths
|
||
| } | ||
| checkSparkAnswerAndOperator("SELECT elt(2, 'a', 'b', 'c')") | ||
| } | ||
| } | ||
| } | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -76,7 +76,8 @@ object CometStringExpressionBenchmark extends CometBenchmarkBase { | |
| StringExprConfig("substring", "select substring(c1, 1, 100) from parquetV1Table"), | ||
| StringExprConfig("translate", "select translate(c1, '123456', 'aBcDeF') from parquetV1Table"), | ||
| StringExprConfig("trim", "select trim(c1) from parquetV1Table"), | ||
| StringExprConfig("upper", "select upper(c1) from parquetV1Table")) | ||
| StringExprConfig("upper", "select upper(c1) from parquetV1Table"), | ||
| StringExprConfig("elt", "select elt(2, c1, c1) from parquetV1Table")) | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. thank you for adding this in the benchmark setup
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. +1 |
||
|
|
||
| override def runCometBenchmark(mainArgs: Array[String]): Unit = { | ||
| runBenchmarkWithTable("String expressions", 1024) { v => | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
TODO: https://github.com/apache/datafusion/blob/main/datafusion/spark/src/function/string/elt.rs#L128