temp

ilicmarkodb · ilicmarkodb · commit 2584dab7b705 · 2025-07-28T16:02:41.000+02:00
diff --git a/python/pyspark/sql/tests/test_udf.py b/python/pyspark/sql/tests/test_udf.py
@@ -1379,6 +1379,22 @@ def add1(x):
         result = empty_df.select(add1("id"))
         self.assertEqual(result.collect(), [])
 
+    def test_udf_with_collated_string_types(self):
+        # A string column must reach the Python UDF as a plain `str` for every collation.
+        @udf("string")
+        def my_udf(input_val):
+            return "%s - %s" % (type(input_val), input_val)
+
+        collations = ("UTF8_BINARY", "UTF8_LCASE", "UNICODE")
+        string_types = [StringType()] + [StringType(c) for c in collations]
+        expected = "<class 'str'> - hello"
+
+        for string_type in string_types:
+            schema = StructType([StructField("input_col", string_type, True)])
+            df = self.spark.createDataFrame([("hello",)], schema=schema)
+            row = df.select(my_udf(df.input_col)).collect()[0][0]
+            self.assertEqual(row, expected)
+
 
 class UDFTests(BaseUDFTestsMixin, ReusedSQLTestCase):
     @classmethod
diff --git a/python/pyspark/sql/tests/test_udtf.py b/python/pyspark/sql/tests/test_udtf.py
@@ -3436,6 +3436,37 @@ def eval(self):
                 with self.assertRaisesRegex(PythonException, "UDTF_ARROW_TYPE_CONVERSION_ERROR"):
                     udtf(TestUDTF, returnType=ret_type)().collect()
 
+    def test_udtf_with_collated_string_types(self):
+        # Every collated string argument must reach the UDTF as a plain Python `str`.
+        class MyUDTF:
+            def eval(self, val1, val2, val3, val4):
+                yield (val1 + "1", val2 + "2", val3 + "3", val4 + "4")
+
+        my_udtf = udtf(
+            MyUDTF,
+            returnType=StructType([
+                StructField("out1", StringType()),
+                StructField("out2", StringType()),
+                StructField("out3", StringType()),
+                StructField("out4", StringType()),
+            ])
+        )
+
+        schema = StructType([
+            StructField("col1", StringType(), True),
+            StructField("col2", StringType("UTF8_BINARY"), True),
+            StructField("col3", StringType("UTF8_LCASE"), True),
+            StructField("col4", StringType("UNICODE"), True)
+        ])
+
+        data = [("hello", "hello", "hello", "hello")]
+        df = self.spark.createDataFrame(data, schema=schema)
+
+        # NOTE(review): a UDTF call returns a DataFrame -- confirm it is valid inside select().
+        result_row = df.select(my_udtf(df.col1, df.col2, df.col3, df.col4)).collect()[0]
+        expected = ("hello1", "hello2", "hello3", "hello4")
+        self.assertEqual(result_row, expected)
+
 
 class UDTFArrowTests(UDTFArrowTestsMixin, ReusedSQLTestCase):
     @classmethod
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/python/EvaluatePython.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/python/EvaluatePython.scala
@@ -79,7 +79,7 @@ object EvaluatePython {
 
     case (d: Decimal, _) => d.toJavaBigDecimal
 
-    case (s: UTF8String, StringType) => s.toString
+    case (s: UTF8String, _: StringType) => s.toString
 
     case (other, _) => other
   }

Original file line number	Diff line number	Diff line change
`@@ -79,7 +79,7 @@ object EvaluatePython {`
`79`	`79`
`80`	`80`	`case (d: Decimal, _) => d.toJavaBigDecimal`
`81`	`81`
`82`		`- case (s: UTF8String, StringType) => s.toString`
	`82`	`+ case (s: UTF8String, _: StringType) => s.toString`
`83`	`83`
`84`	`84`	`case (other, _) => other`
`85`	`85`	`}`