[Spark] Refactor out Delta read path (#4041)

#### Which Delta project/connector is this regarding?

- [x] Spark
- [ ] Standalone
- [ ] Flink
- [ ] Kernel
- [ ] Other (fill in here)

## Description


Refactored the following read path functionality out of `DeltaAnalysis` into standalone extractor objects:
- V2 -> V1 relation conversion (`FallbackToV1DeltaRelation`).
- Resolution of tables with partition filters (`ResolveDeltaTableWithPartitionFilters`).

This is a refactor-only change to support the single-pass Analyzer
project in Spark: https://issues.apache.org/jira/browse/SPARK-49834.

The extracted objects will be used in single-pass resolver extensions:
https://github.com/apache/spark/blob/master/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/resolver/ResolverExtension.scala.
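
For illustration, here is a minimal sketch of how the two extracted objects can be consumed by a resolution rule; it mirrors the pattern matches in `DeltaAnalysis` shown in the diff below, and the wrapper object name is hypothetical rather than part of this PR.

```scala
import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan
import org.apache.spark.sql.delta.{FallbackToV1DeltaRelation, ResolveDeltaTableWithPartitionFilters}

// Hypothetical wrapper; only the two extractors matched below come from this PR.
object DeltaReadPathResolutionSketch {
  def resolve(plan: LogicalPlan): Option[LogicalPlan] = plan match {
    // Lift the partition filters carried by the TahoeLogFileIndex into an explicit Filter node.
    case ResolveDeltaTableWithPartitionFilters(withFilter) => Some(withFilter)
    // Replace a DataSourceV2Relation over a DeltaTableV2 with the equivalent V1 LogicalRelation.
    case FallbackToV1DeltaRelation(v1Relation) => Some(v1Relation)
    case _ => None
  }
}
```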

## How was this patch tested?

Existing tests.

## Does this PR introduce _any_ user-facing changes?

No.
vladimirg-db authored Jan 14, 2025
1 parent b1139d8 commit c979a89
Showing 3 changed files with 71 additions and 10 deletions.
@@ -265,12 +265,7 @@ class DeltaAnalysis(session: SparkSession)
       }
       DeltaDynamicPartitionOverwriteCommand(r, d, adjustedQuery, o.writeOptions, o.isByName)

-    // Pull out the partition filter that may be part of the FileIndex. This can happen when someone
-    // queries a Delta table such as spark.read.format("delta").load("/some/table/partition=2")
-    case l @ DeltaTable(index: TahoeLogFileIndex) if index.partitionFilters.nonEmpty =>
-      Filter(
-        index.partitionFilters.reduce(And),
-        DeltaTableUtils.replaceFileIndex(l, index.copy(partitionFilters = Nil)))
+    case ResolveDeltaTableWithPartitionFilters(plan) => plan

     // SQL CDC table value functions "table_changes" and "table_changes_by_path"
     case stmt: CDCStatementBase if stmt.functionArgs.forall(_.resolved) =>
@@ -442,10 +437,7 @@ class DeltaAnalysis(session: SparkSession)

     case d: DescribeDeltaHistory if d.childrenResolved => d.toCommand

-    // This rule falls back to V1 nodes, since we don't have a V2 reader for Delta right now
-    case dsv2 @ DataSourceV2Relation(d: DeltaTableV2, _, _, _, options)
-        if dsv2.getTagValue(DeltaRelation.KEEP_AS_V2_RELATION_TAG).isEmpty =>
-      DeltaRelation.fromV2Relation(d, dsv2, options)
+    case FallbackToV1DeltaRelation(v1Relation) => v1Relation

     case ResolvedTable(_, _, d: DeltaTableV2, _) if d.catalogTable.isEmpty && !d.tableExists =>
       // This is DDL on a path based table that doesn't exist. CREATE will not hit this path, most
@@ -0,0 +1,32 @@
/*
 * Copyright (2021) The Delta Lake Project Authors.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.spark.sql.delta

import org.apache.spark.sql.delta.catalog.DeltaTableV2
import org.apache.spark.sql.execution.datasources.LogicalRelation
import org.apache.spark.sql.execution.datasources.v2.DataSourceV2Relation

/**
 * Fall back to V1 nodes, since we don't have a V2 reader for Delta right now
 */
object FallbackToV1DeltaRelation {
  def unapply(dsv2: DataSourceV2Relation): Option[LogicalRelation] = dsv2.table match {
    case d: DeltaTableV2 if dsv2.getTagValue(DeltaRelation.KEEP_AS_V2_RELATION_TAG).isEmpty =>
      Some(DeltaRelation.fromV2Relation(d, dsv2, dsv2.options))
    case _ => None
  }
}
@@ -0,0 +1,37 @@
/*
 * Copyright (2021) The Delta Lake Project Authors.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.spark.sql.delta

import org.apache.spark.sql.catalyst.expressions.And
import org.apache.spark.sql.catalyst.plans.logical.{LogicalPlan, Filter}
import org.apache.spark.sql.delta.files.TahoeLogFileIndex

/**
 * Pull out the partition filter that may be part of the FileIndex. This can happen when someone
 * queries a Delta table such as spark.read.format("delta").load("/some/table/partition=2")
 */
object ResolveDeltaTableWithPartitionFilters {
  def unapply(plan: LogicalPlan): Option[LogicalPlan] = plan match {
    case relation @ DeltaTable(index: TahoeLogFileIndex) if index.partitionFilters.nonEmpty =>
      val result = Filter(
        index.partitionFilters.reduce(And),
        DeltaTableUtils.replaceFileIndex(relation, index.copy(partitionFilters = Nil))
      )
      Some(result)
    case _ => None
  }
}
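
As a usage illustration (not part of the diff), the query shape described in the comment above is a path-based read that targets a single partition directory. The following spark-shell style snippet is a sketch with a hypothetical path and application name; the Delta session configuration shown is the standard one for Delta Lake on Spark.

```scala
import org.apache.spark.sql.SparkSession

// Hypothetical example: reading a partition directory directly. The partition predicate
// carried by the file index is surfaced as an explicit Filter node during analysis,
// as done by the extractor above.
val spark = SparkSession.builder()
  .appName("delta-partition-path-read")
  .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
  .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
  .getOrCreate()

val df = spark.read.format("delta").load("/some/table/partition=2")
df.explain(extended = true) // the analyzed plan should show the partition predicate as a Filter node
```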
