
Commit 9042198

[SPARK-54315][PYTHON][TESTS] Optimize test ApplyInArrowTests.test_arrow_batch_slicing
### What changes were proposed in this pull request?

Optimize the test `ApplyInArrowTests.test_arrow_batch_slicing`.

### Why are the changes needed?

Before:

```
Starting test(python3.11): pyspark.sql.tests.connect.arrow.test_parity_arrow_grouped_map (temp output: /__w/spark/spark/python/target/84531e82-addd-4a47-bb65-9006099020a0/python3.11__pyspark.sql.tests.connect.arrow.test_parity_arrow_grouped_map__l8scrw6b.log)
Finished test(python3.11): pyspark.sql.tests.connect.arrow.test_parity_arrow_grouped_map (100s)
```

After:

```
Starting test(python3.11): pyspark.sql.tests.connect.arrow.test_parity_arrow_grouped_map (temp output: /__w/spark/spark/python/target/f8e27057-d295-433e-99a2-a429a48278d5/python3.11__pyspark.sql.tests.connect.arrow.test_parity_arrow_grouped_map__ss4itrd9.log)
Finished test(python3.11): pyspark.sql.tests.connect.arrow.test_parity_arrow_grouped_map (39s)
```

### Does this PR introduce _any_ user-facing change?

No, test-only.

### How was this patch tested?

CI.

### Was this patch authored or co-authored using generative AI tooling?

No.

Closes #53011 from zhengruifeng/opt_test_grouped.

Authored-by: Ruifeng Zheng <[email protected]>
Signed-off-by: Ruifeng Zheng <[email protected]>
(cherry picked from commit 65b69a0)
Signed-off-by: Ruifeng Zheng <[email protected]>
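For readers skimming the diffs below: the test drives `applyInArrow` over a grouped DataFrame while Arrow batch-size confs force the input to be sliced into multiple record batches; shrinking the input from 10,000,000 rows to 100,000 and tightening the byte limit to 4096 keeps the slicing exercised at a fraction of the runtime. A minimal standalone sketch of the pattern being tested (the session setup, schema string, and printed result are my additions, not taken from the patch):

```python
# Minimal sketch of the applyInArrow pattern exercised by the test.
# Assumes a local Spark session new enough to have GroupedData.applyInArrow.
import pyarrow as pa
import pyarrow.compute as pc
import pyspark.sql.functions as sf
from pyspark.sql import SparkSession

spark = SparkSession.builder.master("local[2]").getOrCreate()

n = 100000  # small enough to run quickly, large enough to span several Arrow batches
df = spark.range(n).select((sf.col("id") % 2).alias("key"), sf.col("id").alias("v"))

def min_max_v(table: pa.Table) -> pa.Table:
    # Each group arrives as one pa.Table, reassembled from the sliced batches.
    assert len(table) == n / 2, len(table)
    return pa.Table.from_pydict(
        {
            "key": [table.column("key")[0].as_py()],
            "min": [pc.min(table.column("v")).as_py()],
            "max": [pc.max(table.column("v")).as_py()],
        }
    )

result = (
    df.groupBy("key")
    .applyInArrow(min_max_v, schema="key long, min long, max long")
    .sort("key")
    .collect()
)
print(result)  # [Row(key=0, min=0, max=99998), Row(key=1, min=1, max=99999)]
```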
1 parent d65234b

File tree

2 files changed: +10 −10 lines changed


python/pyspark/sql/tests/arrow/test_arrow_grouped_map.py

Lines changed: 5 additions & 5 deletions

```diff
@@ -356,14 +356,14 @@ def arrow_func(key, table):
         self.assertEqual(df2.join(df2).count(), 1)
 
     def test_arrow_batch_slicing(self):
-        df = self.spark.range(10000000).select(
-            (sf.col("id") % 2).alias("key"), sf.col("id").alias("v")
-        )
+        n = 100000
+
+        df = self.spark.range(n).select((sf.col("id") % 2).alias("key"), sf.col("id").alias("v"))
         cols = {f"col_{i}": sf.col("v") + i for i in range(20)}
         df = df.withColumns(cols)
 
         def min_max_v(table):
-            assert len(table) == 10000000 / 2, len(table)
+            assert len(table) == n / 2, len(table)
             return pa.Table.from_pydict(
                 {
                     "key": [table.column("key")[0].as_py()],
@@ -376,7 +376,7 @@ def min_max_v(table):
             df.groupby("key").agg(sf.min("v").alias("min"), sf.max("v").alias("max")).sort("key")
         ).collect()
 
-        for maxRecords, maxBytes in [(1000, 2**31 - 1), (0, 1048576), (1000, 1048576)]:
+        for maxRecords, maxBytes in [(1000, 2**31 - 1), (0, 4096), (1000, 4096)]:
             with self.subTest(maxRecords=maxRecords, maxBytes=maxBytes):
                 with self.sql_conf(
                     {
```
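The hunk is truncated before the `self.sql_conf({...})` dict, so the exact conf keys are not visible here. As a sketch only, the three `(maxRecords, maxBytes)` pairs plausibly map onto the two Arrow batch-slicing limits like this (the `maxBytesPerBatch` key name is an assumption on my part; `maxRecordsPerBatch` is the long-standing PySpark conf, where zero means no record limit):

```python
# Sketch only: the diff cuts off before the conf dict, so these keys are
# assumed rather than read from the patch.
for maxRecords, maxBytes in [
    (1000, 2**31 - 1),  # slice by record count only (byte limit effectively off)
    (0, 4096),          # slice by byte size only (0 disables the record cap)
    (1000, 4096),       # both limits active at once
]:
    spark.conf.set("spark.sql.execution.arrow.maxRecordsPerBatch", maxRecords)
    spark.conf.set("spark.sql.execution.arrow.maxBytesPerBatch", maxBytes)  # assumed key
```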

python/pyspark/sql/tests/pandas/test_pandas_grouped_map.py

Lines changed: 5 additions & 5 deletions

```diff
@@ -938,14 +938,14 @@ def test(pdf):
         self.assertEqual(row[1], 123)
 
     def test_arrow_batch_slicing(self):
-        df = self.spark.range(100000).select(
-            (sf.col("id") % 2).alias("key"), sf.col("id").alias("v")
-        )
+        n = 100000
+
+        df = self.spark.range(n).select((sf.col("id") % 2).alias("key"), sf.col("id").alias("v"))
         cols = {f"col_{i}": sf.col("v") + i for i in range(20)}
         df = df.withColumns(cols)
 
         def min_max_v(pdf):
-            assert len(pdf) == 100000 / 2, len(pdf)
+            assert len(pdf) == n / 2, len(pdf)
             return pd.DataFrame(
                 {
                     "key": [pdf.key.iloc[0]],
@@ -958,7 +958,7 @@ def min_max_v(pdf):
             df.groupby("key").agg(sf.min("v").alias("min"), sf.max("v").alias("max")).sort("key")
         ).collect()
 
-        for maxRecords, maxBytes in [(1000, 4096), (0, 4096), (1000, 4096)]:
+        for maxRecords, maxBytes in [(1000, 2**31 - 1), (0, 4096), (1000, 4096)]:
             with self.subTest(maxRecords=maxRecords, maxBytes=maxBytes):
                 with self.sql_conf(
                     {
```
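The pandas-side test follows the same shape via `applyInPandas`, which hands each group to the function as a single `pd.DataFrame` regardless of how the underlying Arrow batches were sliced. A minimal standalone sketch (again, session setup and schema string are my additions):

```python
# Pandas counterpart of the applyInArrow sketch above.
import pandas as pd
import pyspark.sql.functions as sf
from pyspark.sql import SparkSession

spark = SparkSession.builder.master("local[2]").getOrCreate()

n = 100000
df = spark.range(n).select((sf.col("id") % 2).alias("key"), sf.col("id").alias("v"))

def min_max_v(pdf: pd.DataFrame) -> pd.DataFrame:
    # Each group arrives as one pandas DataFrame of n / 2 rows.
    assert len(pdf) == n / 2, len(pdf)
    return pd.DataFrame(
        {
            "key": [pdf.key.iloc[0]],
            "min": [pdf.v.min()],
            "max": [pdf.v.max()],
        }
    )

result = (
    df.groupBy("key")
    .applyInPandas(min_max_v, schema="key long, min long, max long")
    .sort("key")
    .collect()
)
print(result)  # [Row(key=0, min=0, max=99998), Row(key=1, min=1, max=99999)]
```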
