Remove cudf.Scalar from date_range/to_datetime (#17860)
Towards #17843

Authors:
  - Matthew Roeschke (https://github.com/mroeschke)

Approvers:
  - Vyas Ramasubramani (https://github.com/vyasr)

URL: #17860
mroeschke authored Feb 5, 2025
1 parent a5dddb0 commit dfcf8e5
105 changes: 45 additions & 60 deletions python/cudf/cudf/core/tools/datetimes.py
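The gist of the change: host-side scalar values that previously went through the cudf.Scalar wrapper (slated for removal in #17843) are kept as plain NumPy or pyarrow scalars and converted to a pylibcudf scalar only at the libcudf call boundary. A minimal sketch of the pattern, using the internal helper this diff imports (cudf-internal API, not public; behavior assumed from the diff):

import numpy as np
import pyarrow as pa
from cudf.core.scalar import pa_scalar_to_plc_scalar

# Pure host values stay in NumPy; no wrapper object is built.
months = np.int32(3)
hours = np.timedelta64(5, "h")

# Only when calling into libcudf is the value lowered to a pylibcudf scalar.
plc_months = pa_scalar_to_plc_scalar(pa.scalar(months))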
@@ -9,6 +9,7 @@
 import numpy as np
 import pandas as pd
 import pandas.tseries.offsets as pd_offset
+import pyarrow as pa
 from typing_extensions import Self
 
 import pylibcudf as plc
@@ -19,6 +20,7 @@
 from cudf.core import column
 from cudf.core.buffer import acquire_spill_lock
 from cudf.core.index import ensure_index
+from cudf.core.scalar import pa_scalar_to_plc_scalar
 
 if TYPE_CHECKING:
     from collections.abc import Sequence
@@ -240,6 +242,11 @@ def to_datetime(
             )
 
             times_column = None
+            factor_denominator = (
+                column.datetime._unit_to_nanoseconds_conversion["s"]
+                if np.datetime_data(col.dtype)[0] == "s"
+                else 1
+            )
             for u in ["h", "m", "s", "ms", "us", "ns"]:
                 value = unit_rev.get(u)
                 if value is not None and value in arg:
@@ -252,15 +259,9 @@
                     except ValueError:
                         current_col = current_col.astype(dtype="float64")
 
-                    factor = cudf.Scalar(
+                    factor = (
                         column.datetime._unit_to_nanoseconds_conversion[u]
-                        / (
-                            column.datetime._unit_to_nanoseconds_conversion[
-                                "s"
-                            ]
-                            if np.datetime_data(col.dtype)[0] == "s"
-                            else 1
-                        )
+                        / factor_denominator
                     )
 
                     if times_column is None:
@@ -324,10 +325,7 @@ def _process_col(
 ):
     if col.dtype.kind == "f":
         if unit not in (None, "ns"):
-            factor = cudf.Scalar(
-                column.datetime._unit_to_nanoseconds_conversion[unit]
-            )
-            col = col * factor
+            col = col * column.datetime._unit_to_nanoseconds_conversion[unit]
 
         if format is not None:
             # Converting to int because,
@@ -337,7 +335,7 @@
             # Instead we directly cast to int and perform
             # parsing against `format`.
             col = (
-                col.astype("int")
+                col.astype(np.dtype(np.int64))
                 .astype("str")
                 .strptime(
                     dtype=np.dtype("datetime64[us]")
@@ -351,7 +349,7 @@
 
     elif col.dtype.kind in "iu":
         if unit in ("D", "h", "m"):
-            factor = cudf.Scalar(
+            factor = (
                 column.datetime._unit_to_nanoseconds_conversion[unit]
                 / column.datetime._unit_to_nanoseconds_conversion["s"]
             )
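For example, with unit="D" this factor evaluates to 86_400_000_000_000 / 1_000_000_000 = 86_400 (nanoseconds per day over nanoseconds per second), scaling the integer input from days to seconds; the division yields a plain Python float, so no cudf.Scalar wrapper is needed for the subsequent scaling.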
@@ -589,14 +587,13 @@ def __init__(self, n=1, normalize=False, **kwds):
         scalars = {}
         for k, v in kwds.items():
             if k in all_possible_units:
-                # Months must be int16
+                # Months must be int16 or int32
                 if k == "months":
-                    # TODO: throw for out-of-bounds int16 values
-                    dtype = np.dtype(np.int16)
+                    # TODO: throw for out-of-bounds int32 values
+                    scalars[k] = np.int32(v)
                 else:
                     unit = self._UNITS_TO_CODES[k]
-                    dtype = np.dtype(f"timedelta64[{unit}]")
-                scalars[k] = cudf.Scalar(v, dtype=dtype)
+                    scalars[k] = np.timedelta64(v, unit)
 
         self._scalars = scalars
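As a concrete illustration (values hypothetical): DateOffset(months=3, hours=5) would now populate self._scalars with np.int32(3) for "months" and np.timedelta64(5, "h") for "hours" (assuming _UNITS_TO_CODES maps "hours" to "h"), where each entry was previously a cudf.Scalar.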

@@ -647,33 +644,23 @@ def _datetime_binop(
                 f" and {type(datetime_col).__name__}"
             )
         if not self._is_no_op:
-            if "months" in self._scalars:
-                rhs = self._generate_months_column(len(datetime_col), op)
-                with acquire_spill_lock():
-                    datetime_col = type(datetime_col).from_pylibcudf(
-                        plc.datetime.add_calendrical_months(
-                            datetime_col.to_pylibcudf(mode="read"),
-                            rhs.to_pylibcudf(mode="read"),
-                        )
-                    )
-
             for unit, value in self._scalars.items():
-                if unit != "months":
-                    value = -value if op == "__sub__" else value
-                    datetime_col += cudf.core.column.as_column(
+                value = -value if op == "__sub__" else value
+                if unit == "months":
+                    with acquire_spill_lock():
+                        datetime_col = type(datetime_col).from_pylibcudf(
+                            plc.datetime.add_calendrical_months(
+                                datetime_col.to_pylibcudf(mode="read"),
+                                pa_scalar_to_plc_scalar(pa.scalar(value)),
+                            )
+                        )
+                else:
+                    datetime_col += column.as_column(
                         value, length=len(datetime_col)
                     )
 
         return datetime_col
 
-    def _generate_months_column(self, size, op):
-        months = self._scalars["months"]
-        months = -months if op == "__sub__" else months
-        # TODO: pass a scalar instead of constructing a column
-        # https://github.com/rapidsai/cudf/issues/6990
-        col = cudf.core.column.as_column(months, length=size)
-        return col
-
     @property
     def _is_no_op(self) -> bool:
         # some logic could be implemented here for more complex cases
@@ -899,14 +886,15 @@ def date_range(
     )
 
     dtype = np.dtype("datetime64[ns]")
+    unit, _ = np.datetime_data(dtype)
 
     if freq is None:
         # `start`, `end`, `periods` is specified, we treat the timestamps as
         # integers and divide the number range evenly with `periods` elements.
-        start = cudf.Scalar(start, dtype=dtype).value.astype("int64")
-        end = cudf.Scalar(end, dtype=dtype).value.astype("int64")
-        arr = np.linspace(start=start, stop=end, num=periods)
-        result = cudf.core.column.as_column(arr).astype("datetime64[ns]")
+        start = dtype.type(start, unit).astype(np.dtype(np.int64))
+        end = dtype.type(end, unit).astype(np.dtype(np.int64))
+        arr = np.linspace(start=start, stop=end, num=periods).astype(dtype)
+        result = column.as_column(arr)
         return cudf.DatetimeIndex._from_column(result, name=name).tz_localize(
             tz
         )
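The freq=None branch is plain NumPy end to end now. A standalone sketch of the technique it uses (a float64 linspace over int64 nanoseconds, then an unsafe cast back to datetime64[ns]):

import numpy as np

dtype = np.dtype("datetime64[ns]")
start = np.datetime64("2025-01-01", "ns").astype(np.int64)
end = np.datetime64("2025-01-05", "ns").astype(np.int64)
# Five evenly spaced points, endpoints inclusive -> daily timestamps
# 2025-01-01 through 2025-01-05.
arr = np.linspace(start=start, stop=end, num=5).astype(dtype)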
@@ -941,23 +929,22 @@ def date_range(
     _periods_not_specified = False
 
     if start is None:
-        end = cudf.Scalar(end, dtype=dtype)
-        start = cudf.Scalar(
-            pd.Timestamp(end.value)
-            - (periods - 1) * offset._maybe_as_fast_pandas_offset(),
-            dtype=dtype,
-        )
+        end = dtype.type(end, unit)
+        start = (
+            pd.Timestamp(end)
+            - (periods - 1) * offset._maybe_as_fast_pandas_offset()
+        ).to_numpy()
     elif end is None:
-        start = cudf.Scalar(start, dtype=dtype)
+        start = dtype.type(start, unit)
     elif periods is None:
         # When `periods` is unspecified, its upper bound estimated by
         # dividing the number of nanoseconds between two timestamps with
         # the lower bound of `freq` in nanoseconds. While the final result
         # may contain extra elements that exceeds `end`, they are trimmed
         # as a post processing step. [1]
         _periods_not_specified = True
-        start = cudf.Scalar(start, dtype=dtype)
-        end = cudf.Scalar(end, dtype=dtype)
+        start = dtype.type(start, unit)
+        end = dtype.type(end, unit)
         _is_increment_sequence = end >= start
 
     periods = math.floor(
Expand Down Expand Up @@ -985,7 +972,7 @@ def date_range(
# are dropped in conversion during the binops
warnings.simplefilter("ignore", UserWarning)
end_estim = (
pd.Timestamp(start.value)
pd.Timestamp(start)
+ periods * offset._maybe_as_fast_pandas_offset()
).to_datetime64()

@@ -998,7 +985,7 @@
            res = libcudf.column.Column.from_pylibcudf(
                plc.filling.calendrical_month_sequence(
                    periods,
-                    start.device_value,
+                    pa_scalar_to_plc_scalar(pa.scalar(start)),
                    months,
                )
            )
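Here `start` is now a np.datetime64 rather than a cudf.Scalar, so instead of reading a cached .device_value the host value is lowered per call. A sketch of just that hand-off (cudf-internal helper; behavior assumed from this diff):

import numpy as np
import pyarrow as pa
from cudf.core.scalar import pa_scalar_to_plc_scalar

start = np.datetime64("2025-01-31", "ns")
# pyarrow infers a timestamp[ns] scalar, which is then lowered to pylibcudf.
plc_start = pa_scalar_to_plc_scalar(pa.scalar(start))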
@@ -1012,13 +999,11 @@
     else:
         # If `offset` is fixed frequency, we generate a range of
         # treating `start`, `stop` and `step` as ints:
-        stop = end_estim.astype("int64")
-        start = start.value.astype("int64")
+        stop = end_estim.astype(np.dtype(np.int64))
+        start = start.astype(np.dtype(np.int64))
         step = _offset_to_nanoseconds_lower_bound(offset)
         arr = range(int(start), int(stop), step)
-        res = cudf.core.column.as_column(arr, dtype="int64").astype(
-            "datetime64[ns]"
-        )
+        res = column.as_column(arr).astype(dtype)
 
     return cudf.DatetimeIndex._from_column(
         res, name=name, freq=freq
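A short usage sketch exercising each rewritten date_range path (requires a CUDA-capable GPU and this cudf build; results described, not captured from a run):

import cudf

# freq=None: the span is divided evenly via np.linspace.
cudf.date_range(start="2025-01-01", end="2025-01-05", periods=5)

# Fixed frequency: start/stop handled as int64 nanoseconds plus a step.
cudf.date_range(start="2025-01-01", periods=4, freq=cudf.DateOffset(hours=6))

# Calendrical frequency: months go through calendrical_month_sequence.
cudf.date_range(start="2025-01-31", periods=3, freq=cudf.DateOffset(months=1))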