
Commit 9042198

[SPARK-54315][PYTHON][TESTS] Optimize test ApplyInArrowTests.test_arrow_batch_slicing
### What changes were proposed in this pull request?

Optimize the test `ApplyInArrowTests.test_arrow_batch_slicing`.

### Why are the changes needed?

Before:

```
Starting test(python3.11): pyspark.sql.tests.connect.arrow.test_parity_arrow_grouped_map (temp output: /__w/spark/spark/python/target/84531e82-addd-4a47-bb65-9006099020a0/python3.11__pyspark.sql.tests.connect.arrow.test_parity_arrow_grouped_map__l8scrw6b.log)
Finished test(python3.11): pyspark.sql.tests.connect.arrow.test_parity_arrow_grouped_map (100s)
```

After:

```
Starting test(python3.11): pyspark.sql.tests.connect.arrow.test_parity_arrow_grouped_map (temp output: /__w/spark/spark/python/target/f8e27057-d295-433e-99a2-a429a48278d5/python3.11__pyspark.sql.tests.connect.arrow.test_parity_arrow_grouped_map__ss4itrd9.log)
Finished test(python3.11): pyspark.sql.tests.connect.arrow.test_parity_arrow_grouped_map (39s)
```

### Does this PR introduce _any_ user-facing change?

No, test-only.

### How was this patch tested?

CI.

### Was this patch authored or co-authored using generative AI tooling?

No.

Closes #53011 from zhengruifeng/opt_test_grouped.

Authored-by: Ruifeng Zheng <[email protected]>
Signed-off-by: Ruifeng Zheng <[email protected]>
(cherry picked from commit 65b69a0)
Signed-off-by: Ruifeng Zheng <[email protected]>
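For readers skimming the diffs below: the test drives `applyInArrow` over a grouped DataFrame while Arrow batch-size confs force the input to be sliced into multiple record batches; shrinking the input from 10,000,000 rows to 100,000 and tightening the byte limit to 4096 keeps the slicing exercised at a fraction of the runtime. A minimal standalone sketch of the pattern being tested (the session setup, schema string, and printed result are my additions, not taken from the patch):

```python
# Minimal sketch of the applyInArrow pattern exercised by the test.
# Assumes a local Spark session new enough to have GroupedData.applyInArrow.
import pyarrow as pa
import pyarrow.compute as pc
import pyspark.sql.functions as sf
from pyspark.sql import SparkSession

spark = SparkSession.builder.master("local[2]").getOrCreate()

n = 100000  # small enough to run quickly, large enough to span several Arrow batches
df = spark.range(n).select((sf.col("id") % 2).alias("key"), sf.col("id").alias("v"))

def min_max_v(table: pa.Table) -> pa.Table:
    # Each group arrives as one pa.Table, reassembled from the sliced batches.
    assert len(table) == n / 2, len(table)
    return pa.Table.from_pydict(
        {
            "key": [table.column("key")[0].as_py()],
            "min": [pc.min(table.column("v")).as_py()],
            "max": [pc.max(table.column("v")).as_py()],
        }
    )

result = (
    df.groupBy("key")
    .applyInArrow(min_max_v, schema="key long, min long, max long")
    .sort("key")
    .collect()
)
print(result)  # [Row(key=0, min=0, max=99998), Row(key=1, min=1, max=99999)]
```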
1 parent d65234b

File tree

2 files changed: +10 −10 lines changed


python/pyspark/sql/tests/arrow/test_arrow_grouped_map.py

Lines changed: 5 additions & 5 deletions

```diff
@@ -356,14 +356,14 @@ def arrow_func(key, table):
         self.assertEqual(df2.join(df2).count(), 1)
 
     def test_arrow_batch_slicing(self):
-        df = self.spark.range(10000000).select(
-            (sf.col("id") % 2).alias("key"), sf.col("id").alias("v")
-        )
+        n = 100000
+
+        df = self.spark.range(n).select((sf.col("id") % 2).alias("key"), sf.col("id").alias("v"))
         cols = {f"col_{i}": sf.col("v") + i for i in range(20)}
         df = df.withColumns(cols)
 
         def min_max_v(table):
-            assert len(table) == 10000000 / 2, len(table)
+            assert len(table) == n / 2, len(table)
             return pa.Table.from_pydict(
                 {
                     "key": [table.column("key")[0].as_py()],
@@ -376,7 +376,7 @@ def min_max_v(table):
             df.groupby("key").agg(sf.min("v").alias("min"), sf.max("v").alias("max")).sort("key")
         ).collect()
 
-        for maxRecords, maxBytes in [(1000, 2**31 - 1), (0, 1048576), (1000, 1048576)]:
+        for maxRecords, maxBytes in [(1000, 2**31 - 1), (0, 4096), (1000, 4096)]:
             with self.subTest(maxRecords=maxRecords, maxBytes=maxBytes):
                 with self.sql_conf(
                     {
```
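The hunk is truncated before the `self.sql_conf({...})` dict, so the exact conf keys are not visible here. As a sketch only, the three `(maxRecords, maxBytes)` pairs plausibly map onto the two Arrow batch-slicing limits like this (the `maxBytesPerBatch` key name is an assumption on my part; `maxRecordsPerBatch` is the long-standing PySpark conf, where zero means no record limit):

```python
# Sketch only: the diff cuts off before the conf dict, so these keys are
# assumed rather than read from the patch.
for maxRecords, maxBytes in [
    (1000, 2**31 - 1),  # slice by record count only (byte limit effectively off)
    (0, 4096),          # slice by byte size only (0 disables the record cap)
    (1000, 4096),       # both limits active at once
]:
    spark.conf.set("spark.sql.execution.arrow.maxRecordsPerBatch", maxRecords)
    spark.conf.set("spark.sql.execution.arrow.maxBytesPerBatch", maxBytes)  # assumed key
```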

python/pyspark/sql/tests/pandas/test_pandas_grouped_map.py

Lines changed: 5 additions & 5 deletions

```diff
@@ -938,14 +938,14 @@ def test(pdf):
         self.assertEqual(row[1], 123)
 
     def test_arrow_batch_slicing(self):
-        df = self.spark.range(100000).select(
-            (sf.col("id") % 2).alias("key"), sf.col("id").alias("v")
-        )
+        n = 100000
+
+        df = self.spark.range(n).select((sf.col("id") % 2).alias("key"), sf.col("id").alias("v"))
         cols = {f"col_{i}": sf.col("v") + i for i in range(20)}
         df = df.withColumns(cols)
 
         def min_max_v(pdf):
-            assert len(pdf) == 100000 / 2, len(pdf)
+            assert len(pdf) == n / 2, len(pdf)
             return pd.DataFrame(
                 {
                     "key": [pdf.key.iloc[0]],
@@ -958,7 +958,7 @@ def min_max_v(pdf):
             df.groupby("key").agg(sf.min("v").alias("min"), sf.max("v").alias("max")).sort("key")
         ).collect()
 
-        for maxRecords, maxBytes in [(1000, 4096), (0, 4096), (1000, 4096)]:
+        for maxRecords, maxBytes in [(1000, 2**31 - 1), (0, 4096), (1000, 4096)]:
             with self.subTest(maxRecords=maxRecords, maxBytes=maxBytes):
                 with self.sql_conf(
                     {
```
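The pandas-side test follows the same shape via `applyInPandas`, which hands each group to the function as a single `pd.DataFrame` regardless of how the underlying Arrow batches were sliced. A minimal standalone sketch (again, session setup and schema string are my additions):

```python
# Pandas counterpart of the applyInArrow sketch above.
import pandas as pd
import pyspark.sql.functions as sf
from pyspark.sql import SparkSession

spark = SparkSession.builder.master("local[2]").getOrCreate()

n = 100000
df = spark.range(n).select((sf.col("id") % 2).alias("key"), sf.col("id").alias("v"))

def min_max_v(pdf: pd.DataFrame) -> pd.DataFrame:
    # Each group arrives as one pandas DataFrame of n / 2 rows.
    assert len(pdf) == n / 2, len(pdf)
    return pd.DataFrame(
        {
            "key": [pdf.key.iloc[0]],
            "min": [pdf.v.min()],
            "max": [pdf.v.max()],
        }
    )

result = (
    df.groupBy("key")
    .applyInPandas(min_max_v, schema="key long, min long, max long")
    .sort("key")
    .collect()
)
print(result)  # [Row(key=0, min=0, max=99998), Row(key=1, min=1, max=99999)]
```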
