Commit e5fa93c

temp
1 parent 0cd2f90 commit e5fa93c

7 files changed: +90 -5 lines changed

python/pyspark/core/context.py

Lines changed: 12 additions & 1 deletion

@@ -2649,9 +2649,20 @@ def _to_ddl(self, struct: "StructType") -> str:
 
     def _parse_ddl(self, ddl: str) -> "DataType":
         from pyspark.sql.types import _parse_datatype_json_string
+        from pyspark.sql.utils import ParseException
 
         assert self._jvm is not None
-        return _parse_datatype_json_string(self._jvm.PythonSQLUtils.ddlToJson(ddl))
+        try:
+            # This hack exists because of collated strings. E.g., if the return type is
+            # just `STRING COLLATE FR`, `ddlToJson()` would return just `STRING`, losing
+            # the information about collation. Collation metadata is stored on the
+            # nearest ancestor `StructField`, which is why we wrap the return type in a
+            # `StructType`.
+            wrapped_jvm_returnType = self._jvm.PythonSQLUtils.parseDataType(f"struct<ddl: {ddl}>")
+            wrapped_returnType = _parse_datatype_json_string(wrapped_jvm_returnType.json())
+            return wrapped_returnType["ddl"].dataType
+        except ParseException:
+            return _parse_datatype_json_string(self._jvm.PythonSQLUtils.ddlToJson(ddl))
 
 
 def _test() -> None:
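
The wrapping trick above can be observed directly on the PySpark type objects. A minimal sketch (assuming a Spark build with collation support; _parse_datatype_json_string is the same private helper the patch itself uses):

    from pyspark.sql.types import StringType, StructField, StructType, _parse_datatype_json_string

    # A bare collated string serializes to plain "string" -- the collation is lost.
    StringType("UNICODE").json()  # '"string"'

    # Wrapped in a StructType, the collation is recorded in the StructField's
    # metadata, so a JSON round-trip preserves it.
    wrapped = StructType([StructField("ddl", StringType("UNICODE"))])
    restored = _parse_datatype_json_string(wrapped.json())
    assert restored["ddl"].dataType == StringType("UNICODE")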

python/pyspark/sql/tests/test_udf.py

Lines changed: 23 additions & 0 deletions

@@ -1379,6 +1379,29 @@ def add1(x):
         result = empty_df.select(add1("id"))
         self.assertEqual(result.collect(), [])
 
+    def test_udf_with_collated_string_types(self):
+        @udf("string collate fr")
+        def my_udf(input_val):
+            return "%s - %s" % (type(input_val), input_val)
+
+        string_types = [
+            StringType(),
+            StringType("UTF8_BINARY"),
+            StringType("UTF8_LCASE"),
+            StringType("UNICODE"),
+        ]
+        data = [("hello",)]
+        expected = "<class 'str'> - hello"
+
+        for string_type in string_types:
+            schema = StructType([StructField("input_col", string_type, True)])
+            df = self.spark.createDataFrame(data, schema=schema)
+            df_result = df.select(my_udf(df.input_col).alias("result"))
+            row = df_result.collect()[0][0]
+            self.assertEqual(row, expected)
+            result_type = df_result.schema["result"].dataType
+            self.assertEqual(result_type, StringType("fr"))
+
 
 class UDFTests(BaseUDFTestsMixin, ReusedSQLTestCase):
     @classmethod

python/pyspark/sql/tests/test_udtf.py

Lines changed: 35 additions & 0 deletions

@@ -3490,6 +3490,41 @@ def eval(self):
         udtf(TestUDTF, returnType=ret_type)().collect()
 
 
+    def test_udtf_with_collated_string_types(self):
+        @udtf(
+            "out1 string, out2 string collate UTF8_BINARY, out3 string collate UTF8_LCASE,"
+            " out4 string collate UNICODE"
+        )
+        class MyUDTF:
+            def eval(self, v1, v2, v3, v4):
+                yield (v1 + "1", v2 + "2", v3 + "3", v4 + "4")
+
+        schema = StructType(
+            [
+                StructField("col1", StringType(), True),
+                StructField("col2", StringType("UTF8_BINARY"), True),
+                StructField("col3", StringType("UTF8_LCASE"), True),
+                StructField("col4", StringType("UNICODE"), True),
+            ]
+        )
+        df = self.spark.createDataFrame([("hello",) * 4], schema=schema)
+
+        df_out = df.select(MyUDTF(df.col1, df.col2, df.col3, df.col4).alias("out"))
+        result_df = df_out.select("out.*")
+
+        expected_row = ("hello1", "hello2", "hello3", "hello4")
+        self.assertEqual(result_df.collect()[0], expected_row)
+
+        expected_output_types = [
+            StringType(),
+            StringType("UTF8_BINARY"),
+            StringType("UTF8_LCASE"),
+            StringType("UNICODE"),
+        ]
+        for idx, field in enumerate(result_df.schema.fields):
+            self.assertEqual(field.dataType, expected_output_types[idx])
+
+
 class UDTFArrowTests(UDTFArrowTestsMixin, ReusedSQLTestCase):
     @classmethod
     def setUpClass(cls):

python/pyspark/sql/udf.py

Lines changed: 9 additions & 1 deletion

@@ -382,12 +382,20 @@ def _judf(self) -> "JavaObject":
 
     def _create_judf(self, func: Callable[..., Any]) -> "JavaObject":
        from pyspark.sql import SparkSession
+        from pyspark.sql.types import StructField
 
         spark = SparkSession._getActiveSessionOrCreate()
         sc = spark.sparkContext
 
         wrapped_func = _wrap_function(sc, func, self.returnType)
-        jdt = spark._jsparkSession.parseDataType(self.returnType.json())
+        # This hack exists because of collated strings. E.g., if the return type is
+        # just `StringType("FR")`, `json()` would return just `STRING`, losing the
+        # information about collation. Collation metadata is stored on the nearest
+        # ancestor `StructField`, which is why we wrap the return type in a
+        # `StructType`.
+        wrapped_returnType = StructType([StructField("returnType", self.returnType)])
+        wrapped_jvm_returnType = spark._jsparkSession.parseDataType(wrapped_returnType.json())
+        jdt = wrapped_jvm_returnType.fields()[0].dataType()
         assert sc._jvm is not None
         judf = getattr(sc._jvm, "org.apache.spark.sql.execution.python.UserDefinedPythonFunction")(
             self._name, wrapped_func, jdt, self.evalType, self.deterministic
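
The same wrap/unwrap round-trip through the JVM parser, as a hedged sketch (requires an active SparkSession; spark._jsparkSession.parseDataType and the fields()/dataType() accessors are the internal calls the patch itself relies on, and the printed output is an assumption):

    from pyspark.sql import SparkSession
    from pyspark.sql.types import StringType, StructField, StructType

    spark = SparkSession.builder.getOrCreate()

    return_type = StringType("UNICODE")
    wrapped = StructType([StructField("returnType", return_type)])
    jvm_struct = spark._jsparkSession.parseDataType(wrapped.json())
    # The field's metadata carries the collation across the Python/JVM boundary.
    jdt = jvm_struct.fields()[0].dataType()
    print(jdt.sql())  # expected to print something like: STRING COLLATE UNICODE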

sql/core/src/main/scala/org/apache/spark/sql/execution/python/ArrowEvalPythonExec.scala

Lines changed: 5 additions & 1 deletion

@@ -27,7 +27,7 @@ import org.apache.spark.sql.errors.QueryExecutionErrors
 import org.apache.spark.sql.execution.SparkPlan
 import org.apache.spark.sql.execution.metric.SQLMetric
 import org.apache.spark.sql.execution.python.EvalPythonExec.ArgumentMetadata
-import org.apache.spark.sql.types.{StructType, UserDefinedType}
+import org.apache.spark.sql.types.{StringType, StructType, UserDefinedType}
 
 /**
  * Grouped a iterator into batches.
@@ -109,6 +109,10 @@ class ArrowEvalPythonEvaluatorFactory(
 
    val outputTypes = output.drop(childOutput.length).map(_.dataType.transformRecursively {
      case udt: UserDefinedType[_] => udt.sqlType
+     // Replace every StringType with the default StringType (the companion object) to
+     // ignore collations. Python doesn't know about collations and will always return
+     // non-collated strings.
+     case _: StringType => StringType
    })
 
    val batchIter = Iterator(iter)
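
For context, the reason the executor must erase collations here: Arrow, which carries data between the JVM and the Python worker in these eval paths, has no collation concept at all. A small illustration (not part of the patch; assumes pyarrow is installed):

    import pyarrow as pa

    # Whatever the Spark-side collation, the worker only ever sees and produces
    # plain Arrow strings, so output type verification has to expect the
    # default StringType.
    arr = pa.array(["hello", "HELLO"])
    print(arr.type)  # string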

sql/core/src/main/scala/org/apache/spark/sql/execution/python/ArrowEvalPythonUDTFExec.scala

Lines changed: 5 additions & 1 deletion

@@ -25,7 +25,7 @@ import org.apache.spark.sql.catalyst.expressions._
 import org.apache.spark.sql.errors.QueryExecutionErrors
 import org.apache.spark.sql.execution.SparkPlan
 import org.apache.spark.sql.execution.python.EvalPythonExec.ArgumentMetadata
-import org.apache.spark.sql.types.{StructType, UserDefinedType}
+import org.apache.spark.sql.types.{StringType, StructType, UserDefinedType}
 import org.apache.spark.sql.vectorized.{ArrowColumnVector, ColumnarBatch}
 
 /**
@@ -63,6 +63,10 @@ case class ArrowEvalPythonUDTFExec(
 
    val outputTypes = resultAttrs.map(_.dataType.transformRecursively {
      case udt: UserDefinedType[_] => udt.sqlType
+     // Replace every StringType with the default StringType (the companion object) to
+     // ignore collations. Python doesn't know about collations and will always return
+     // non-collated strings.
+     case _: StringType => StringType
    })
 
    val columnarBatchIter = new ArrowPythonUDTFRunner(

sql/core/src/main/scala/org/apache/spark/sql/execution/python/EvaluatePython.scala

Lines changed: 1 addition & 1 deletion

@@ -79,7 +79,7 @@ object EvaluatePython {
 
     case (d: Decimal, _) => d.toJavaBigDecimal
 
-    case (s: UTF8String, StringType) => s.toString
+    case (s: UTF8String, _: StringType) => s.toString
 
     case (other, _) => other
   }
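
Note the subtlety this one-line change fixes: the Scala pattern `StringType` matches the companion object, i.e. only the default collation, while the type pattern `_: StringType` matches a string of any collation. The PySpark analogue of that distinction, as a sketch:

    from pyspark.sql.types import StringType

    s = StringType("UTF8_LCASE")
    print(s == StringType())          # False: equality is collation-sensitive
    print(isinstance(s, StringType))  # True: any collation is still a StringType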
