Commit e515b85

Commit message: "done"
1 parent ecaec3d commit e515b85

13 files changed: +564 −29 lines changed

common/utils/src/main/resources/error/error-conditions.json

Lines changed: 6 additions & 0 deletions
@@ -6261,6 +6261,12 @@
     },
     "sqlState" : "0A000"
   },
+  "UNSUPPORTED_PIPELINE_SPARK_SQL_COMMAND": {
+    "message" : [
+      "'<command>' is not supported in spark.sql(\"...\") API in Spark Declarative Pipeline."
+    ],
+    "sqlState" : "0A000"
+  },
   "UNSUPPORTED_CHAR_OR_VARCHAR_AS_STRING" : {
     "message" : [
       "The char/varchar type can't be used in the table schema.",

python/pyspark/pipelines/add_pipeline_analysis_context.py

Lines changed: 48 additions & 0 deletions
@@ -0,0 +1,48 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+from contextlib import contextmanager
+from typing import Generator, Optional
+from pyspark.sql import SparkSession
+
+from typing import Any, cast
+
+
+@contextmanager
+def add_pipeline_analysis_context(
+    spark: SparkSession, dataflow_graph_id: str, flow_name_opt: Optional[str]
+) -> Generator[None, None, None]:
+    """
+    Context manager that adds a PipelineAnalysisContext extension to the user context,
+    used for pipeline-specific analysis.
+    """
+    _extension_id = None
+    _client = cast(Any, spark).client
+    try:
+        import pyspark.sql.connect.proto as pb2
+        from google.protobuf import any_pb2
+
+        _analysis_context = pb2.PipelineAnalysisContext(dataflow_graph_id=dataflow_graph_id)
+        if flow_name_opt is not None:
+            _analysis_context.flow_name = flow_name_opt
+
+        _extension = any_pb2.Any()
+        _extension.Pack(_analysis_context)
+
+        _extension_id = _client.add_threadlocal_user_context_extension(_extension)
+        yield
+    finally:
+        _client.remove_user_context_extension(_extension_id)
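
Elsewhere in this commit (see cli.py and spark_connect_graph_element_registry.py below), this context manager is wrapped around code that issues Spark Connect requests. A minimal usage sketch, assuming an existing Spark Connect session `spark` and a placeholder graph id:

```python
# Sketch: every request issued inside the block carries a PipelineAnalysisContext
# extension in its user context; the extension is removed when the block exits.
from pyspark.pipelines.add_pipeline_analysis_context import add_pipeline_analysis_context


def analyze_for_pipeline(spark, graph_id, flow_name=None):
    with add_pipeline_analysis_context(
        spark=spark, dataflow_graph_id=graph_id, flow_name_opt=flow_name
    ):
        # Requests sent here are attributable to the given graph/flow on the server.
        return spark.sql("SELECT 1 AS id").schema
```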

python/pyspark/pipelines/block_connect_access.py

Lines changed: 27 additions & 1 deletion
@@ -24,6 +24,22 @@
 BLOCKED_RPC_NAMES = ["AnalyzePlan", "ExecutePlan"]
 
 
+def _is_sql_command_request(request: object) -> bool:
+    """Check if the request is a spark.sql() command (an ExecutePlanRequest with a sql_command)."""
+    try:
+        if not hasattr(request, "plan"):
+            return False
+
+        plan = request.plan
+
+        if not plan.HasField("command"):
+            return False
+
+        return plan.command.HasField("sql_command")
+    except Exception:
+        return False
+
+
 @contextmanager
 def block_spark_connect_execution_and_analysis() -> Generator[None, None, None]:
     """
@@ -41,7 +57,17 @@ def blocked_getattr(self: SparkConnectServiceStub, name: str) -> Callable:
         if name not in BLOCKED_RPC_NAMES:
             return original_getattr(self, name)
 
-        def blocked_method(*args: object, **kwargs: object) -> NoReturn:
+        # Get the original method first
+        original_method = original_getattr(self, name)
+
+        def blocked_method(*args: object, **kwargs: object):
+            # allowlist spark.sql() command (ExecutePlan with sql_command)
+            if name == "ExecutePlan" and len(args) > 0:
+                request = args[0]
+                if _is_sql_command_request(request):
+                    return original_method(*args, **kwargs)
+
+            # Block all other ExecutePlan and AnalyzePlan calls
             raise PySparkException(
                 errorClass="ATTEMPT_ANALYSIS_IN_PIPELINE_QUERY_FUNCTION",
                 messageParameters={},
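
The net effect of the allowlist, as a rough sketch: inside `block_spark_connect_execution_and_analysis()`, an `ExecutePlan` request whose plan is a `sql_command` (what `spark.sql(...)` sends) is forwarded to the real stub method, while other `ExecutePlan` and all `AnalyzePlan` calls still raise `ATTEMPT_ANALYSIS_IN_PIPELINE_QUERY_FUNCTION`. A hypothetical illustration, assuming a Spark Connect session `spark`:

```python
# Hypothetical illustration of the behaviour after this change.
from pyspark.errors import PySparkException
from pyspark.pipelines.block_connect_access import block_spark_connect_execution_and_analysis


def demo(spark):
    with block_spark_connect_execution_and_analysis():
        df = spark.sql("SELECT 1 AS id")  # ExecutePlan carrying a sql_command: allowed
        try:
            df.collect()  # plain ExecutePlan of a relation: still blocked
        except PySparkException as e:
            print(e)  # ATTEMPT_ANALYSIS_IN_PIPELINE_QUERY_FUNCTION
```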

python/pyspark/pipelines/cli.py

Lines changed: 13 additions & 4 deletions
@@ -49,6 +49,8 @@
     handle_pipeline_events,
 )
 
+from pyspark.pipelines.add_pipeline_analysis_context import add_pipeline_analysis_context
+
 PIPELINE_SPEC_FILE_NAMES = ["pipeline.yaml", "pipeline.yml"]
 
 
@@ -216,7 +218,11 @@ def validate_str_dict(d: Mapping[str, str], field_name: str) -> Mapping[str, str
 
 
 def register_definitions(
-    spec_path: Path, registry: GraphElementRegistry, spec: PipelineSpec
+    spec_path: Path,
+    registry: GraphElementRegistry,
+    spec: PipelineSpec,
+    spark: SparkSession,
+    dataflow_graph_id: str,
 ) -> None:
     """Register the graph element definitions in the pipeline spec with the given registry.
     - Looks for Python files matching the glob patterns in the spec and imports them.
@@ -245,8 +251,11 @@ def register_definitions(
                 assert (
                     module_spec.loader is not None
                 ), f"Module spec has no loader for {file}"
-                with block_session_mutations():
-                    module_spec.loader.exec_module(module)
+                with add_pipeline_analysis_context(
+                    spark=spark, dataflow_graph_id=dataflow_graph_id, flow_name_opt=None
+                ):
+                    with block_session_mutations():
+                        module_spec.loader.exec_module(module)
             elif file.suffix == ".sql":
                 log_with_curr_timestamp(f"Registering SQL file {file}...")
                 with file.open("r") as f:
@@ -324,7 +333,7 @@ def run(
 
     log_with_curr_timestamp("Registering graph elements...")
     registry = SparkConnectGraphElementRegistry(spark, dataflow_graph_id)
-    register_definitions(spec_path, registry, spec)
+    register_definitions(spec_path, registry, spec, spark, dataflow_graph_id)
 
     log_with_curr_timestamp("Starting run...")
     result_iter = start_run(
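
For reference, a sketch of invoking `register_definitions` with its widened signature outside of `run()` (for example from a test harness); `spec_path`, `spec`, `spark`, and `dataflow_graph_id` are assumed to have been built the same way `run()` builds them:

```python
# Sketch: register_definitions now needs the session and graph id so it can wrap
# Python definition files in add_pipeline_analysis_context during import.
registry = SparkConnectGraphElementRegistry(spark, dataflow_graph_id)
register_definitions(
    spec_path=spec_path,
    registry=registry,
    spec=spec,
    spark=spark,
    dataflow_graph_id=dataflow_graph_id,
)
```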

python/pyspark/pipelines/spark_connect_graph_element_registry.py

Lines changed: 7 additions & 2 deletions
@@ -35,6 +35,7 @@
 from pyspark.sql.types import StructType
 from typing import Any, cast
 import pyspark.sql.connect.proto as pb2
+from pyspark.pipelines.add_pipeline_analysis_context import add_pipeline_analysis_context
 
 
 class SparkConnectGraphElementRegistry(GraphElementRegistry):
@@ -43,6 +44,7 @@ class SparkConnectGraphElementRegistry(GraphElementRegistry):
     def __init__(self, spark: SparkSession, dataflow_graph_id: str) -> None:
         # Cast because mypy seems to think `spark` is a function, not an object. Likely related to
         # SPARK-47544.
+        self._spark = spark
         self._client = cast(Any, spark).client
         self._dataflow_graph_id = dataflow_graph_id
 
@@ -110,8 +112,11 @@ def register_output(self, output: Output) -> None:
         self._client.execute_command(command)
 
     def register_flow(self, flow: Flow) -> None:
-        with block_spark_connect_execution_and_analysis():
-            df = flow.func()
+        with add_pipeline_analysis_context(
+            spark=self._spark, dataflow_graph_id=self._dataflow_graph_id, flow_name_opt=flow.name
+        ):
+            with block_spark_connect_execution_and_analysis():
+                df = flow.func()
         relation = cast(ConnectDataFrame, df)._plan.plan(self._client)
 
         relation_flow_details = pb2.PipelineCommand.DefineFlow.WriteRelationFlowDetails(
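
In practice this means a flow function can call `spark.sql(...)` while it is being registered: the SQL command is the one RPC allowed through `block_spark_connect_execution_and_analysis()`, and it now carries the dataflow graph id and flow name via the analysis context. A hedged sketch of such a pipeline definition; the `@dp.materialized_view` decorator and the `spark` session available to definitions files are assumptions about the surrounding pipelines API, not part of this diff:

```python
# Sketch of a pipeline definitions file; decorator and session availability are
# assumed, not shown in this commit.
from pyspark import pipelines as dp


@dp.materialized_view
def my_view():
    # Sent as an ExecutePlan sql_command tagged with the graph id and flow name.
    return spark.sql("SELECT 1 AS id")
```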

python/pyspark/sql/connect/client/core.py

Lines changed: 58 additions & 1 deletion
@@ -727,6 +727,9 @@ def __init__(
         # cleanup ml cache if possible
         atexit.register(self._cleanup_ml_cache)
 
+        self.global_user_context_extensions = []
+        self.global_user_context_extensions_lock = threading.Lock()
+
     @property
     def _stub(self) -> grpc_lib.SparkConnectServiceStub:
         if self.is_closed:
@@ -1277,6 +1280,24 @@ def token(self) -> Optional[str]:
         """
         return self._builder.token
 
+    def _update_request_with_user_context_extensions(
+        self,
+        req: Union[
+            pb2.AnalyzePlanRequest,
+            pb2.ConfigRequest,
+            pb2.ExecutePlanRequest,
+            pb2.FetchErrorDetailsRequest,
+            pb2.InterruptRequest,
+        ],
+    ) -> None:
+        with self.global_user_context_extensions_lock:
+            for _, extension in self.global_user_context_extensions:
+                req.user_context.extensions.append(extension)
+        if not hasattr(self.thread_local, "user_context_extensions"):
+            return
+        for _, extension in self.thread_local.user_context_extensions:
+            req.user_context.extensions.append(extension)
+
     def _execute_plan_request_with_metadata(
         self, operation_id: Optional[str] = None
     ) -> pb2.ExecutePlanRequest:
@@ -1307,6 +1328,7 @@ def _execute_plan_request_with_metadata(
                     messageParameters={"arg_name": "operation_id", "origin": str(ve)},
                 )
             req.operation_id = operation_id
+        self._update_request_with_user_context_extensions(req)
         return req
 
     def _analyze_plan_request_with_metadata(self) -> pb2.AnalyzePlanRequest:
@@ -1317,6 +1339,7 @@ def _analyze_plan_request_with_metadata(self) -> pb2.AnalyzePlanRequest:
         req.client_type = self._builder.userAgent
         if self._user_id:
             req.user_context.user_id = self._user_id
+        self._update_request_with_user_context_extensions(req)
         return req
 
     def _analyze(self, method: str, **kwargs: Any) -> AnalyzeResult:
@@ -1731,6 +1754,7 @@ def _config_request_with_metadata(self) -> pb2.ConfigRequest:
         req.client_type = self._builder.userAgent
         if self._user_id:
             req.user_context.user_id = self._user_id
+        self._update_request_with_user_context_extensions(req)
         return req
 
     def get_configs(self, *keys: str) -> Tuple[Optional[str], ...]:
@@ -1807,6 +1831,7 @@ def _interrupt_request(
             )
         if self._user_id:
             req.user_context.user_id = self._user_id
+        self._update_request_with_user_context_extensions(req)
         return req
 
     def interrupt_all(self) -> Optional[List[str]]:
@@ -1905,6 +1930,38 @@ def _throw_if_invalid_tag(self, tag: str) -> None:
                 messageParameters={"arg_name": "Spark Connect tag", "arg_value": tag},
             )
 
+    def add_threadlocal_user_context_extension(self, extension: any_pb2.Any) -> str:
+        if not hasattr(self.thread_local, "user_context_extensions"):
+            self.thread_local.user_context_extensions = list()
+        extension_id = "threadlocal_" + str(uuid.uuid4())
+        self.thread_local.user_context_extensions.append((extension_id, extension))
+        return extension_id
+
+    def add_global_user_context_extension(self, extension: any_pb2.Any) -> str:
+        extension_id = "global_" + str(uuid.uuid4())
+        with self.global_user_context_extensions_lock:
+            self.global_user_context_extensions.append((extension_id, extension))
+        return extension_id
+
+    def remove_user_context_extension(self, extension_id: str) -> None:
+        if extension_id.find("threadlocal_") == 0:
+            if not hasattr(self.thread_local, "user_context_extensions"):
+                return
+            self.thread_local.user_context_extensions = list(
+                filter(lambda ex: ex[0] != extension_id, self.thread_local.user_context_extensions)
+            )
+        elif extension_id.find("global_") == 0:
+            with self.global_user_context_extensions_lock:
+                self.global_user_context_extensions = list(
+                    filter(lambda ex: ex[0] != extension_id, self.global_user_context_extensions)
+                )
+
+    def clear_user_context_extensions(self) -> None:
+        if hasattr(self.thread_local, "user_context_extensions"):
+            self.thread_local.user_context_extensions = list()
+        with self.global_user_context_extensions_lock:
+            self.global_user_context_extensions = list()
+
     def _handle_error(self, error: Exception) -> NoReturn:
         """
         Handle errors that occur during RPC calls.
@@ -1945,7 +2002,7 @@ def _fetch_enriched_error(self, info: "ErrorInfo") -> Optional[pb2.FetchErrorDet
         req.client_observed_server_side_session_id = self._server_session_id
         if self._user_id:
             req.user_context.user_id = self._user_id
-
+        self._update_request_with_user_context_extensions(req)
         try:
             return self._stub.FetchErrorDetails(req, metadata=self._builder.metadata())
         except grpc.RpcError:
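
Taken together, these client additions let a caller attach arbitrary protobuf `Any` payloads to the user context of outgoing requests, either for the current thread or globally, and detach them again by id. A minimal usage sketch; the connect endpoint and graph id string below are placeholders:

```python
# Sketch of the new user-context-extension API on the Spark Connect client.
from typing import Any, cast

import pyspark.sql.connect.proto as pb2
from google.protobuf import any_pb2
from pyspark.sql import SparkSession

spark = SparkSession.builder.remote("sc://localhost").getOrCreate()  # placeholder endpoint
client = cast(Any, spark).client

payload = any_pb2.Any()
payload.Pack(pb2.PipelineAnalysisContext(dataflow_graph_id="graph-123"))  # placeholder id

# Attach to every request issued from this thread until removed.
ext_id = client.add_threadlocal_user_context_extension(payload)
spark.sql("SELECT 1").collect()  # this request's user_context carries the extension
client.remove_user_context_extension(ext_id)
```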
