Skip to content

Commit 67325e9

Browse files
committed
add new spark.sql.sources.v2.bucketing.partitionKeyOrdering.enabled config
1 parent cf295d8 commit 67325e9

2 files changed

Lines changed: 21 additions & 4 deletions

File tree

sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2165,6 +2165,18 @@ object SQLConf {
21652165
.booleanConf
21662166
.createWithDefault(false)
21672167

2168+
// Controls whether a V2 scan that reports a KeyedPartitioning (but no explicit
// ordering) may advertise its partition key expressions as an output ordering.
// Within one partition every row shares the same key value, so the rows are
// trivially sorted by those expressions.
// NOTE(review): defaults to true, so the derived ordering is on unless the user
// opts out; only effective together with the V2 bucketing feature (see doc).
val V2_BUCKETING_PARTITION_KEY_ORDERING_ENABLED =
  buildConf("spark.sql.sources.v2.bucketing.partitionKeyOrdering.enabled")
    .doc("When enabled, Spark derives output ordering from the partition key expressions of " +
      "a V2 data source that reports a KeyedPartitioning but does not report explicit ordering " +
      "via SupportsReportOrdering. Within a single partition all rows share the same key " +
      // Only this fragment interpolates a value, so only it carries the `s` prefix;
      // the plain fragments above need no interpolator.
      "value, so the data is trivially sorted by those expressions. Requires " +
      s"${V2_BUCKETING_ENABLED.key} to be enabled.")
    .version("4.2.0")
    .withBindingPolicy(ConfigBindingPolicy.SESSION)
    .booleanConf
    .createWithDefault(true)
2179+
21682180
val BUCKETING_MAX_BUCKETS = buildConf("spark.sql.sources.bucketing.maxBuckets")
21692181
.doc("The maximum number of buckets allowed.")
21702182
.version("2.4.0")
@@ -7731,6 +7743,9 @@ class SQLConf extends Serializable with Logging with SqlApiConf {
77317743
def v2BucketingAllowSorting: Boolean =
77327744
getConf(SQLConf.V2_BUCKETING_SORTING_ENABLED)
77337745

7746+
/**
 * Whether output ordering may be derived from the partition key expressions of a
 * V2 scan reporting a KeyedPartitioning (see
 * [[SQLConf.V2_BUCKETING_PARTITION_KEY_ORDERING_ENABLED]]).
 */
def v2BucketingPartitionKeyOrderingEnabled: Boolean =
  getConf(SQLConf.V2_BUCKETING_PARTITION_KEY_ORDERING_ENABLED)
7748+
77347749
def dataFrameSelfJoinAutoResolveAmbiguity: Boolean =
77357750
getConf(DATAFRAME_SELF_JOIN_AUTO_RESOLVE_AMBIGUITY)
77367751

sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DataSourceV2ScanExecBase.scala

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -106,14 +106,16 @@ trait DataSourceV2ScanExecBase extends LeafExecNode {
106106
/**
 * Output ordering reported by this scan.
 *
 * If the source reports an explicit ordering via `SupportsReportOrdering`, that ordering
 * wins and is returned unchanged. Failing that, when the output partitioning is a
 * `KeyedPartitioning` and `spark.sql.sources.v2.bucketing.partitionKeyOrdering.enabled`
 * is on, the partition key expressions themselves form a valid ordering: every row in a
 * given partition evaluates the key expressions to one constant value, so the partition
 * is trivially sorted by them. Otherwise no ordering is reported.
 */
override def outputOrdering: Seq[SortOrder] = {
  ordering match {
    // Explicitly reported ordering takes precedence over anything derived.
    case Some(reported) => reported
    case None =>
      outputPartitioning match {
        case keyed: KeyedPartitioning if conf.v2BucketingPartitionKeyOrderingEnabled =>
          // Constant-per-partition keys: any direction is valid; Ascending is the convention.
          keyed.expressions.map(SortOrder(_, Ascending))
        case _ => Seq.empty
      }
  }
}

0 commit comments

Comments
 (0)