
Commit 17800b5

committed: dump

1 parent 0606ffd

12 files changed: +450 -245 lines changed
python/pyspark/pipelines/add_pipeline_analysis_context.py

Lines changed: 43 additions & 0 deletions
@@ -0,0 +1,43 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+from contextlib import contextmanager
+from typing import Generator
+from pyspark.sql import SparkSession
+
+@contextmanager
+def add_pipeline_analysis_context(spark: SparkSession, dataflow_graph_id: str, flow_name: str) -> Generator[None, None, None]:
+    """
+    Context manager that adds a PipelineAnalysisContext extension to the user context,
+    used for pipeline-specific analysis.
+    """
+    extension_id = None
+    try:
+        import pyspark.sql.connect.proto as pb2
+        from google.protobuf import any_pb2
+
+        analysis_context = pb2.PipelineAnalysisContext(
+            dataflow_graph_id=dataflow_graph_id,
+            flow_name=flow_name
+        )
+
+        extension = any_pb2.Any()
+        extension.Pack(analysis_context)
+
+        extension_id = spark.addThreadlocalUserContextExtension(extension)
+        yield
+    finally:
+        spark.removeUserContextExtension(extension_id)
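
For orientation, a minimal usage sketch of the new helper; the Connect endpoint and the flow function are hypothetical placeholders, not part of this commit:

    from pyspark.sql import SparkSession
    from pyspark.pipelines.add_pipeline_analysis_context import add_pipeline_analysis_context

    # Hypothetical Spark Connect session; any reachable Connect endpoint works.
    spark = SparkSession.builder.remote("sc://localhost").getOrCreate()

    def example_flow_func():
        # Stand-in for a user-defined flow function.
        return spark.range(10)

    # While the context manager is active, analysis requests issued on this thread
    # carry a PipelineAnalysisContext extension holding the graph id and flow name,
    # which the server can use for pipeline-specific analysis.
    with add_pipeline_analysis_context(
        spark=spark, dataflow_graph_id="example-graph-id", flow_name="example_flow"
    ):
        df = example_flow_func()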

python/pyspark/pipelines/spark_connect_graph_element_registry.py

Lines changed: 5 additions & 2 deletions
@@ -35,6 +35,7 @@
from pyspark.sql.types import StructType
from typing import Any, cast
import pyspark.sql.connect.proto as pb2
+from pyspark.pipelines.add_pipeline_analysis_context import add_pipeline_analysis_context


class SparkConnectGraphElementRegistry(GraphElementRegistry):
@@ -43,6 +44,7 @@ class SparkConnectGraphElementRegistry(GraphElementRegistry):
    def __init__(self, spark: SparkSession, dataflow_graph_id: str) -> None:
        # Cast because mypy seems to think `spark`` is a function, not an object. Likely related to
        # SPARK-47544.
+        self._spark = spark
        self._client = cast(Any, spark).client
        self._dataflow_graph_id = dataflow_graph_id

@@ -110,8 +112,9 @@ def register_output(self, output: Output) -> None:
        self._client.execute_command(command)

    def register_flow(self, flow: Flow) -> None:
-        with block_spark_connect_execution_and_analysis():
-            df = flow.func()
+        with add_pipeline_analysis_context(spark=self._spark, dataflow_graph_id=self._dataflow_graph_id, flow_name=flow.name):
+            with block_spark_connect_execution_and_analysis():
+                df = flow.func()
        relation = cast(ConnectDataFrame, df)._plan.plan(self._client)

        relation_flow_details = pb2.PipelineCommand.DefineFlow.WriteRelationFlowDetails(

python/pyspark/sql/connect/proto/pipelines_pb2.py

Lines changed: 4 additions & 4 deletions
Large diffs are not rendered by default.

python/pyspark/sql/connect/proto/pipelines_pb2.pyi

Lines changed: 16 additions & 0 deletions
@@ -1499,11 +1499,14 @@ class PipelineAnalysisContext(google.protobuf.message.Message):

    DATAFLOW_GRAPH_ID_FIELD_NUMBER: builtins.int
    DEFINITION_PATH_FIELD_NUMBER: builtins.int
+    FLOW_NAME_FIELD_NUMBER: builtins.int
    EXTENSION_FIELD_NUMBER: builtins.int
    dataflow_graph_id: builtins.str
    """Unique identifier of the dataflow graph associated with this pipeline."""
    definition_path: builtins.str
    """The path of the top-level pipeline file determined at runtime during pipeline initialization."""
+    flow_name: builtins.str
+    """The name of the Flow involved in this analysis"""
    @property
    def extension(
        self,
@@ -1516,6 +1519,7 @@ class PipelineAnalysisContext(google.protobuf.message.Message):
        *,
        dataflow_graph_id: builtins.str | None = ...,
        definition_path: builtins.str | None = ...,
+        flow_name: builtins.str | None = ...,
        extension: collections.abc.Iterable[google.protobuf.any_pb2.Any] | None = ...,
    ) -> None: ...
    def HasField(
@@ -1525,10 +1529,14 @@ class PipelineAnalysisContext(google.protobuf.message.Message):
            b"_dataflow_graph_id",
            "_definition_path",
            b"_definition_path",
+            "_flow_name",
+            b"_flow_name",
            "dataflow_graph_id",
            b"dataflow_graph_id",
            "definition_path",
            b"definition_path",
+            "flow_name",
+            b"flow_name",
        ],
    ) -> builtins.bool: ...
    def ClearField(
@@ -1538,12 +1546,16 @@ class PipelineAnalysisContext(google.protobuf.message.Message):
            b"_dataflow_graph_id",
            "_definition_path",
            b"_definition_path",
+            "_flow_name",
+            b"_flow_name",
            "dataflow_graph_id",
            b"dataflow_graph_id",
            "definition_path",
            b"definition_path",
            "extension",
            b"extension",
+            "flow_name",
+            b"flow_name",
        ],
    ) -> None: ...
    @typing.overload
@@ -1554,5 +1566,9 @@ class PipelineAnalysisContext(google.protobuf.message.Message):
    def WhichOneof(
        self, oneof_group: typing_extensions.Literal["_definition_path", b"_definition_path"]
    ) -> typing_extensions.Literal["definition_path"] | None: ...
+    @typing.overload
+    def WhichOneof(
+        self, oneof_group: typing_extensions.Literal["_flow_name", b"_flow_name"]
+    ) -> typing_extensions.Literal["flow_name"] | None: ...

global___PipelineAnalysisContext = PipelineAnalysisContext
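
The regenerated stubs model flow_name as a proto3 optional field with a synthetic _flow_name oneof, so presence can be checked explicitly. A small illustrative sketch, assuming the regenerated pyspark.sql.connect.proto package from this commit is on the path:

    import pyspark.sql.connect.proto as pb2

    ctx = pb2.PipelineAnalysisContext(dataflow_graph_id="example-graph-id")
    assert not ctx.HasField("flow_name")                # optional field starts unset
    ctx.flow_name = "example_flow"
    assert ctx.WhichOneof("_flow_name") == "flow_name"  # synthetic oneof reports presence
    ctx.ClearField("flow_name")
    assert not ctx.HasField("flow_name")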

sql/connect/common/src/main/protobuf/spark/connect/pipelines.proto

Lines changed: 2 additions & 0 deletions
@@ -299,6 +299,8 @@ message PipelineAnalysisContext {
  optional string dataflow_graph_id = 1;
  // The path of the top-level pipeline file determined at runtime during pipeline initialization.
  optional string definition_path = 2;
+  // The name of the Flow involved in this analysis
+  optional string flow_name = 3;

  // Reserved field for protocol extensions.
  repeated google.protobuf.Any extension = 999;
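
For illustration, a hedged sketch of how a client can populate the new field and pack the context into the google.protobuf.Any carried as a user-context extension, mirroring the Python helper added in this commit; the identifiers are placeholders:

    from google.protobuf import any_pb2
    import pyspark.sql.connect.proto as pb2

    analysis_context = pb2.PipelineAnalysisContext(
        dataflow_graph_id="example-graph-id",
        flow_name="example_flow",
    )

    # Pack into an Any, as add_pipeline_analysis_context does before registering
    # the thread-local user context extension.
    extension = any_pb2.Any()
    extension.Pack(analysis_context)

    # A receiver can check the payload type and unpack it back into the message.
    assert extension.Is(pb2.PipelineAnalysisContext.DESCRIPTOR)
    unpacked = pb2.PipelineAnalysisContext()
    extension.Unpack(unpacked)
    assert unpacked.flow_name == "example_flow"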

sql/connect/server/src/main/scala/org/apache/spark/sql/connect/pipelines/PipelinesHandler.scala

Lines changed: 29 additions & 15 deletions
@@ -17,25 +17,25 @@

package org.apache.spark.sql.connect.pipelines

-import scala.jdk.CollectionConverters._
-import scala.util.Using
-
import io.grpc.stub.StreamObserver
-
import org.apache.spark.connect.proto
-import org.apache.spark.connect.proto.{ExecutePlanResponse, PipelineCommandResult, Relation, ResolvedIdentifier}
+import org.apache.spark.connect.proto._
import org.apache.spark.internal.Logging
import org.apache.spark.sql.AnalysisException
import org.apache.spark.sql.catalyst.TableIdentifier
import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan
+import org.apache.spark.sql.classic.DataFrame
import org.apache.spark.sql.connect.common.DataTypeProtoConverter
import org.apache.spark.sql.connect.service.SessionHolder
import org.apache.spark.sql.pipelines.Language.Python
import org.apache.spark.sql.pipelines.common.RunState.{CANCELED, FAILED}
-import org.apache.spark.sql.pipelines.graph.{AllTables, FlowAnalysis, GraphIdentifierManager, GraphRegistrationContext, IdentifierHelper, NoTables, PipelineUpdateContextImpl, QueryContext, QueryOrigin, QueryOriginType, Sink, SinkImpl, SomeTables, SqlGraphRegistrationContext, Table, TableFilter, TemporaryView, UnresolvedFlow}
+import org.apache.spark.sql.pipelines.graph._
import org.apache.spark.sql.pipelines.logging.{PipelineEvent, RunProgress}
import org.apache.spark.sql.types.StructType

+import scala.jdk.CollectionConverters._
+import scala.util.Using
+
/** Handler for SparkConnect PipelineCommands */
private[connect] object PipelinesHandler extends Logging {

@@ -47,8 +47,6 @@ private[connect] object PipelinesHandler extends Logging {
   *   Command to be handled
   * @param responseObserver
   *   The response observer where the response will be sent
-   * @param sparkSession
-   *   The spark session
   * @param transformRelationFunc
   *   Function used to convert a relation to a LogicalPlan. This is used when determining the
   *   LogicalPlan that a flow returns.
@@ -108,7 +106,6 @@ private[connect] object PipelinesHandler extends Logging {
          identifierBuilder.addNamespace(ns)
        }
        identifierBuilder.setTableName(resolvedFlow.identifier)
-        val identifier = identifierBuilder.build()
        PipelineCommandResult
          .newBuilder()
          .setDefineFlowResult(
@@ -129,6 +126,24 @@ private[connect] object PipelinesHandler extends Logging {
    }
  }

+  def executeSQL(
+      sessionHolder: SessionHolder,
+      plan: LogicalPlan,
+      pipelineAnalysisContext: PipelineAnalysisContext
+  ): DataFrame = {
+    val graphRegistrationContext = {
+      sessionHolder.dataflowGraphRegistry.getDataflowGraphOrThrow(
+        pipelineAnalysisContext.getDataflowGraphId)
+    }
+    val pipelineSqlProcessor = new PipelineSqlProcessor(graphRegistrationContext)
+    val context = ExternalQueryAnalysisContext(
+      queryContext = QueryContext(
+        currentCatalog = Option(graphRegistrationContext.defaultCatalog),
+        currentDatabase = Option(graphRegistrationContext.defaultDatabase)),
+      spark = sessionHolder.session)
+    pipelineSqlProcessor.processSparkSqlQuery(queryPlan = plan, context = context)
+  }
+
  private def createDataflowGraph(
      cmd: proto.PipelineCommand.CreateDataflowGraph,
      sessionHolder: SessionHolder): String = {
@@ -161,7 +176,7 @@ private[connect] object PipelinesHandler extends Logging {

    val graphElementRegistry =
      sessionHolder.dataflowGraphRegistry.getDataflowGraphOrThrow(dataflowGraphId)
-    val sqlGraphElementRegistrationContext = new SqlGraphRegistrationContext(graphElementRegistry)
+    val sqlGraphElementRegistrationContext = new PipelineSqlProcessor(graphElementRegistry)
    sqlGraphElementRegistrationContext.processSqlFile(
      cmd.getSqlText,
      cmd.getSqlFilePath,
@@ -293,8 +308,7 @@ private[connect] object PipelinesHandler extends Logging {
    val rawDestinationIdentifier = GraphIdentifierManager
      .parseTableIdentifier(name = flow.getTargetDatasetName, spark = sessionHolder.session)
    val flowWritesToView =
-      graphElementRegistry
-        .getViews()
+      graphElementRegistry.getViews
        .filter(_.isInstanceOf[TemporaryView])
        .exists(_.identifier == rawDestinationIdentifier)
    val flowWritesToSink =
@@ -304,7 +318,7 @@ private[connect] object PipelinesHandler extends Logging {
    // If the flow is created implicitly as part of defining a view or that it writes to a sink,
    // then we do not qualify the flow identifier and the flow destination. This is because
    // views and sinks are not permitted to have multipart
-    val isImplicitFlowForTempView = (isImplicitFlow && flowWritesToView)
+    val isImplicitFlowForTempView = isImplicitFlow && flowWritesToView
    val Seq(flowIdentifier, destinationIdentifier) =
      Seq(rawFlowIdentifier, rawDestinationIdentifier).map { rawIdentifier =>
        if (isImplicitFlowForTempView || flowWritesToSink) {
@@ -330,8 +344,8 @@ private[connect] object PipelinesHandler extends Logging {
        once = false,
        queryContext = QueryContext(Option(defaultCatalog), Option(defaultDatabase)),
        origin = QueryOrigin(
-          filePath = Option.when(flow.getSourceCodeLocation.hasFileName)(
-            flow.getSourceCodeLocation.getFileName),
+          filePath = Option
+            .when(flow.getSourceCodeLocation.hasFileName)(flow.getSourceCodeLocation.getFileName),
          line = Option.when(flow.getSourceCodeLocation.hasLineNumber)(
            flow.getSourceCodeLocation.getLineNumber),
          objectType = Option(QueryOriginType.Flow.toString),
