[SPARK-53592][PYTHON] Make @udf support vectorized UDF

zhengruifeng · zhengruifeng · commit 686d84453610 · 2025-09-20T20:19:31.000+08:00
### What changes were proposed in this pull request?
Make `@udf` support vectorized UDFs.

### Why are the changes needed?
To promote vectorized UDFs.

### Does this PR introduce _any_ user-facing change?
Yes. `udf` will try to infer the eval type based on the type hints.

For example,
```python
@udf(returnType=LongType())
def pd_add1(ser: pd.Series) -> pd.Series:
    assert isinstance(ser, pd.Series)
    return ser + 1
```
The inferred type is `PythonEvalType.SQL_SCALAR_PANDAS_UDF`.

### How was this patch tested?
Added unit tests.

### Was this patch authored or co-authored using generative AI tooling?
No.

Closes #52323 from zhengruifeng/unify_udf.

Authored-by: Ruifeng Zheng <ruifengz@apache.org>
Signed-off-by: Ruifeng Zheng <ruifengz@apache.org>
diff --git a/dev/sparktestsupport/modules.py b/dev/sparktestsupport/modules.py
@@ -587,6 +587,7 @@ def __hash__(self):
         "pyspark.sql.tests.test_udf",
         "pyspark.sql.tests.test_udf_combinations",
         "pyspark.sql.tests.test_udf_profiler",
+        "pyspark.sql.tests.test_unified_udf",
         "pyspark.sql.tests.test_udtf",
         "pyspark.sql.tests.test_tvf",
         "pyspark.sql.tests.test_utils",
@@ -1107,6 +1108,7 @@ def __hash__(self):
         "pyspark.sql.tests.connect.test_parity_udf",
         "pyspark.sql.tests.connect.test_parity_udf_combinations",
         "pyspark.sql.tests.connect.test_parity_udf_profiler",
+        "pyspark.sql.tests.connect.test_parity_unified_udf",
         "pyspark.sql.tests.connect.test_parity_memory_profiler",
         "pyspark.sql.tests.connect.test_parity_udtf",
         "pyspark.sql.tests.connect.test_parity_tvf",
diff --git a/python/pyspark/sql/connect/udf.py b/python/pyspark/sql/connect/udf.py
@@ -77,10 +77,7 @@ def _create_py_udf(
     else:
         is_arrow_enabled = useArrow
 
-    eval_type: int = PythonEvalType.SQL_BATCHED_UDF
-
     if is_arrow_enabled:
-        eval_type = PythonEvalType.SQL_ARROW_BATCHED_UDF
         try:
             require_minimum_pandas_version()
             require_minimum_pyarrow_version()
@@ -92,6 +89,25 @@ def _create_py_udf(
                 RuntimeWarning,
             )
 
+    eval_type: Optional[int] = None
+    if useArrow is None:
+        # If the user doesn't explicitly set useArrow
+        from pyspark.sql.pandas.typehints import infer_eval_type_from_func
+
+        try:
+            # Try to infer the eval type from type hints
+            eval_type = infer_eval_type_from_func(f)
+        except Exception:
+            warnings.warn("Cannot infer the eval type from type hints. ", UserWarning)
+
+    if eval_type is None:
+        if is_arrow_enabled:
+            # Arrow optimized Python UDF
+            eval_type = PythonEvalType.SQL_ARROW_BATCHED_UDF
+        else:
+            # Fallback to Regular Python UDF
+            eval_type = PythonEvalType.SQL_BATCHED_UDF
+
     return _create_udf(f, returnType, eval_type)
 
 
diff --git a/python/pyspark/sql/pandas/typehints.py b/python/pyspark/sql/pandas/typehints.py
@@ -15,7 +15,8 @@
 # limitations under the License.
 #
 from inspect import Signature
-from typing import Any, Callable, Dict, Optional, Union, TYPE_CHECKING
+from typing import Any, Callable, Dict, Optional, Union, TYPE_CHECKING, get_type_hints
+from inspect import getfullargspec, signature
 
 from pyspark.sql.pandas.utils import require_minimum_pandas_version, require_minimum_pyarrow_version
 from pyspark.errors import PySparkNotImplementedError, PySparkValueError
@@ -277,6 +278,29 @@ def infer_eval_type(
     return eval_type
 
 
+def infer_eval_type_from_func(  # type: ignore[no-untyped-def]
+    f,
+) -> Optional[
+    Union[
+        "PandasScalarUDFType",
+        "PandasScalarIterUDFType",
+        "PandasGroupedAggUDFType",
+        "ArrowScalarUDFType",
+        "ArrowScalarIterUDFType",
+        "ArrowGroupedAggUDFType",
+    ]
+]:
+    argspec = getfullargspec(f)
+    if len(argspec.annotations) > 0:
+        try:
+            type_hints = get_type_hints(f)
+        except NameError:
+            type_hints = {}
+        return infer_eval_type(signature(f), type_hints)
+    else:
+        return None
+
+
 def check_tuple_annotation(
     annotation: Any, parameter_check_func: Optional[Callable[[Any], bool]] = None
 ) -> bool:
diff --git a/python/pyspark/sql/tests/connect/test_parity_unified_udf.py b/python/pyspark/sql/tests/connect/test_parity_unified_udf.py
@@ -0,0 +1,40 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+import unittest
+
+from pyspark.sql.tests.test_unified_udf import UnifiedUDFTestsMixin
+from pyspark.testing.connectutils import ReusedConnectTestCase
+
+
+class UnifiedUDFParityTests(UnifiedUDFTestsMixin, ReusedConnectTestCase):
+    @classmethod
+    def setUpClass(cls):
+        ReusedConnectTestCase.setUpClass()
+        cls.spark.conf.set("spark.sql.execution.pythonUDF.arrow.enabled", "false")
+
+
+if __name__ == "__main__":
+    from pyspark.sql.tests.connect.test_parity_unified_udf import *  # noqa: F401
+
+    try:
+        import xmlrunner  # type: ignore[import]
+
+        testRunner = xmlrunner.XMLTestRunner(output="target/test-reports", verbosity=2)
+    except ImportError:
+        testRunner = None
+    unittest.main(testRunner=testRunner, verbosity=2)
diff --git a/python/pyspark/sql/tests/test_unified_udf.py b/python/pyspark/sql/tests/test_unified_udf.py
diff --git a/python/pyspark/sql/udf.py b/python/pyspark/sql/udf.py