temp

ilicmarkodb · ilicmarkodb · commit a1f4c930f3d4 · 2025-07-29T14:06:54.000+02:00
diff --git a/python/pyspark/sql/tests/test_udf.py b/python/pyspark/sql/tests/test_udf.py
@@ -1379,6 +1379,34 @@ def add1(x):
         result = empty_df.select(add1("id"))
         self.assertEqual(result.collect(), [])
 
+    def test_udf_with_collated_string_types(self):
+        """Check that a ``string collate fr`` UDF handles each tested input collation.
+
+        For every input string type the UDF must receive a plain Python ``str`` and the
+        result column must be typed ``StringType("fr")``.
+        """
+
+        @udf("string collate fr")
+        def my_udf(input_val):
+            return f"{type(input_val)} - {input_val}"
+
+        input_types = [StringType()] + [
+            StringType(collation) for collation in ("UTF8_BINARY", "UTF8_LCASE", "UNICODE")
+        ]
+        rows = [("hello",)]
+        expected_value = "<class 'str'> - hello"
+        expected_type = StringType("fr")
+
+        for input_type in input_types:
+            input_schema = StructType([StructField("input_col", input_type, True)])
+            source_df = self.spark.createDataFrame(rows, schema=input_schema)
+            projected = source_df.select(my_udf(source_df.input_col).alias("result"))
+
+            # First column of the first row is the string produced by the UDF.
+            self.assertEqual(projected.collect()[0][0], expected_value)
+            # The result column carries the collation declared on the UDF.
+            self.assertEqual(projected.schema["result"].dataType, expected_type)
+
 
 class UDFTests(BaseUDFTestsMixin, ReusedSQLTestCase):
     @classmethod
diff --git a/python/pyspark/sql/tests/test_udtf.py b/python/pyspark/sql/tests/test_udtf.py
@@ -3437,6 +3437,49 @@ def eval(self):
                     udtf(TestUDTF, returnType=ret_type)().collect()
 
+    def test_udtf_with_collated_string_types(self):
+        """Check that a UDTF accepts and returns string columns with explicit collations.
+
+        The UDTF is applied to each input row through a lateral join: calling a
+        UDTF produces a DataFrame, which cannot be passed to ``select`` as a
+        column expression.
+        """
+        from pyspark.sql.functions import col
+
+        @udtf(
+            "out1 string, out2 string collate UTF8_BINARY, "
+            "out3 string collate UTF8_LCASE, out4 string collate UNICODE"
+        )
+        class MyUDTF:
+            def eval(self, v1, v2, v3, v4):
+                yield (v1 + "1", v2 + "2", v3 + "3", v4 + "4")
+
+        schema = StructType(
+            [
+                StructField("col1", StringType(), True),
+                StructField("col2", StringType("UTF8_BINARY"), True),
+                StructField("col3", StringType("UTF8_LCASE"), True),
+                StructField("col4", StringType("UNICODE"), True),
+            ]
+        )
+        df = self.spark.createDataFrame([("hello",) * 4], schema=schema)
+
+        # outer() marks each argument as a reference to the left-hand side row.
+        udtf_args = [col(name).outer() for name in df.columns]
+        result_df = df.lateralJoin(MyUDTF(*udtf_args)).select("out1", "out2", "out3", "out4")
+
+        expected_row = ("hello1", "hello2", "hello3", "hello4")
+        self.assertEqual(result_df.collect()[0], expected_row)
+
+        expected_output_types = [
+            StringType(),
+            StringType("UTF8_BINARY"),
+            StringType("UTF8_LCASE"),
+            StringType("UNICODE"),
+        ]
+        # Comparing whole lists also catches a missing or extra output column.
+        self.assertEqual([f.dataType for f in result_df.schema.fields], expected_output_types)
+
 
 class UDTFArrowTests(UDTFArrowTestsMixin, ReusedSQLTestCase):
     @classmethod
     def setUpClass(cls):
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/python/EvaluatePython.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/python/EvaluatePython.scala
@@ -79,7 +79,7 @@ object EvaluatePython {
 
     case (d: Decimal, _) => d.toJavaBigDecimal
 
-    case (s: UTF8String, StringType) => s.toString
+    case (s: UTF8String, _: StringType) => s.toString
 
     case (other, _) => other
   }

Original file line number	Diff line number	Diff line change
`@@ -79,7 +79,7 @@ object EvaluatePython {`
`79`	`79`
`80`	`80`	`case (d: Decimal, _) => d.toJavaBigDecimal`
`81`	`81`
`82`		`- case (s: UTF8String, StringType) => s.toString`
	`82`	`+ case (s: UTF8String, _: StringType) => s.toString`
`83`	`83`
`84`	`84`	`case (other, _) => other`
`85`	`85`	`}`