Skip to content

Commit 6ab297b

Browse files
committed
[SPARK-52519][PS] Enable divide-by-zero for numeric floordiv with ANSI enabled
### What changes were proposed in this pull request?

Enable divide-by-zero for numeric floordiv with ANSI enabled

### Why are the changes needed?

Ensure pandas on Spark works well with ANSI mode on.
Part of https://issues.apache.org/jira/browse/SPARK-52169.

### Does this PR introduce _any_ user-facing change?

Yes.

```py
>>> spark.conf.get("spark.sql.ansi.enabled")
'true'
>>> ps.set_option("compute.fail_on_ansi_mode", False)
>>> ps.set_option("compute.ansi_mode_support", True)
>>> ps.Series([1, 2]) // 0
0    inf
1    inf
dtype: float64
>>> ps.Series([1, 2]) // ps.Series([0, 0])
0    inf
1    inf
dtype: float64
```

### How was this patch tested?

Unit tests.

```
(dev3.10) spark (num_floordiv) % SPARK_ANSI_SQL_MODE=true ./python/run-tests --python-executables=python3.10 --testnames "pyspark.pandas.tests.computation.test_binary_ops FrameBinaryOpsTests.test_binary_operator_floordiv"
...
Tests passed in 6 seconds

(dev3.10) spark (num_floordiv) % SPARK_ANSI_SQL_MODE=false ./python/run-tests --python-executables=python3.10 --testnames "pyspark.pandas.tests.computation.test_binary_ops FrameBinaryOpsTests.test_binary_operator_floordiv"
...
Tests passed in 4 seconds
```

### Was this patch authored or co-authored using generative AI tooling?

No.

Closes #51209 from xinrong-meng/num_floordiv.

Lead-authored-by: Xinrong Meng <[email protected]>
Co-authored-by: Xinrong Meng <[email protected]>
Signed-off-by: Xinrong Meng <[email protected]>
1 parent 4575429 commit 6ab297b

File tree

2 files changed

+26
-4
lines changed

2 files changed

+26
-4
lines changed

python/pyspark/pandas/data_type_ops/num_ops.py

Lines changed: 21 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@
1616
#
1717

1818
import numbers
19-
from typing import Any, Union
19+
from typing import Any, Union, Callable
2020

2121
import numpy as np
2222
import pandas as pd
@@ -271,13 +271,22 @@ def floordiv(self, left: IndexOpsLike, right: Any) -> SeriesOrIndex:
271271
_sanitize_list_like(right)
272272
if not is_valid_operand_for_numeric_arithmetic(right):
273273
raise TypeError("Floor division can not be applied to given types.")
274+
spark_session = left._internal.spark_frame.sparkSession
275+
use_try_divide = is_ansi_mode_enabled(spark_session)
276+
277+
def fallback_div(x: PySparkColumn, y: PySparkColumn) -> PySparkColumn:
278+
return x.__div__(y)
279+
280+
safe_div: Callable[[PySparkColumn, PySparkColumn], PySparkColumn] = (
281+
F.try_divide if use_try_divide else fallback_div
282+
)
274283

275284
def floordiv(left: PySparkColumn, right: Any) -> PySparkColumn:
276285
return F.when(F.lit(right is np.nan), np.nan).otherwise(
277286
F.when(
278287
F.lit(right != 0) | F.lit(right).isNull(),
279288
F.floor(left.__div__(right)),
280-
).otherwise(F.lit(np.inf).__div__(left))
289+
).otherwise(safe_div(F.lit(np.inf), left))
281290
)
282291

283292
right = transform_boolean_operand_to_numeric(right, spark_type=left.spark.data_type)
@@ -369,6 +378,15 @@ def floordiv(self, left: IndexOpsLike, right: Any) -> SeriesOrIndex:
369378
_sanitize_list_like(right)
370379
if not is_valid_operand_for_numeric_arithmetic(right):
371380
raise TypeError("Floor division can not be applied to given types.")
381+
spark_session = left._internal.spark_frame.sparkSession
382+
use_try_divide = is_ansi_mode_enabled(spark_session)
383+
384+
def fallback_div(x: PySparkColumn, y: PySparkColumn) -> PySparkColumn:
385+
return x.__div__(y)
386+
387+
safe_div: Callable[[PySparkColumn, PySparkColumn], PySparkColumn] = (
388+
F.try_divide if use_try_divide else fallback_div
389+
)
372390

373391
def floordiv(left: PySparkColumn, right: Any) -> PySparkColumn:
374392
return F.when(F.lit(right is np.nan), np.nan).otherwise(
@@ -377,7 +395,7 @@ def floordiv(left: PySparkColumn, right: Any) -> PySparkColumn:
377395
F.floor(left.__div__(right)),
378396
).otherwise(
379397
F.when(F.lit(left == np.inf) | F.lit(left == -np.inf), left).otherwise(
380-
F.lit(np.inf).__div__(left)
398+
safe_div(F.lit(np.inf), left)
381399
)
382400
)
383401
)

python/pyspark/pandas/tests/computation/test_binary_ops.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -208,7 +208,11 @@ def test_binary_operator_truediv(self):
208208
self.assertRaisesRegex(TypeError, ks_err_msg, lambda: 1 / psdf["a"])
209209

210210
def test_binary_operator_floordiv(self):
211-
psdf = ps.DataFrame({"a": ["x"], "b": [1]})
211+
pdf = pd.DataFrame({"a": ["x"], "b": [1], "c": [1.0], "d": [0]})
212+
psdf = ps.from_pandas(pdf)
213+
self.assert_eq(pdf["b"] // 0, psdf["b"] // 0)
214+
self.assert_eq(pdf["c"] // 0, psdf["c"] // 0)
215+
self.assert_eq(pdf["d"] // 0, psdf["d"] // 0)
212216

213217
ks_err_msg = "Floor division can not be applied to strings"
214218
self.assertRaisesRegex(TypeError, ks_err_msg, lambda: psdf["a"] // psdf["b"])

0 commit comments

Comments
 (0)