Skip to content

Commit 188b2da

Browse files
MarcoGorelliDr-Irv
andauthored
ENH: Introduce pandas.col (#62103)
Co-authored-by: Irv Lustig <[email protected]>
1 parent f84ba18 commit 188b2da

File tree

9 files changed

+423
-3
lines changed

9 files changed

+423
-3
lines changed

doc/source/reference/general_functions.rst

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -71,6 +71,7 @@ Top-level evaluation
7171
.. autosummary::
7272
:toctree: api/
7373

74+
col
7475
eval
7576

7677
Datetime formats

doc/source/user_guide/dsintro.rst

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -553,6 +553,12 @@ a function of one argument to be evaluated on the DataFrame being assigned to.
553553
554554
iris.assign(sepal_ratio=lambda x: (x["SepalWidth"] / x["SepalLength"])).head()
555555
556+
or, using :func:`pandas.col`:
557+
558+
.. ipython:: python
559+
560+
iris.assign(sepal_ratio=pd.col("SepalWidth") / pd.col("SepalLength")).head()
561+
556562
:meth:`~pandas.DataFrame.assign` **always** returns a copy of the data, leaving the original
557563
DataFrame untouched.
558564

doc/source/whatsnew/v3.0.0.rst

Lines changed: 21 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -117,10 +117,28 @@ process in more detail.
117117

118118
`PDEP-7: Consistent copy/view semantics in pandas with Copy-on-Write <https://pandas.pydata.org/pdeps/0007-copy-on-write.html>`__
119119

120-
.. _whatsnew_300.enhancements.enhancement2:
120+
.. _whatsnew_300.enhancements.col:
121121

122-
Enhancement2
123-
^^^^^^^^^^^^
122+
``pd.col`` syntax can now be used in :meth:`DataFrame.assign` and :attr:`DataFrame.loc`
123+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
124+
125+
You can now use ``pd.col`` to create callables for use in DataFrame methods that accept them. For example, if you have a DataFrame
126+
127+
.. ipython:: python
128+
129+
df = pd.DataFrame({'a': [1, 1, 2], 'b': [4, 5, 6]})
130+
131+
and you want to create a new column ``'c'`` by summing ``'a'`` and ``'b'``, then instead of
132+
133+
.. ipython:: python
134+
135+
df.assign(c = lambda df: df['a'] + df['b'])
136+
137+
you can now write:
138+
139+
.. ipython:: python
140+
141+
df.assign(c = pd.col('a') + pd.col('b'))
124142
125143
New Deprecation Policy
126144
^^^^^^^^^^^^^^^^^^^^^^

pandas/__init__.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -105,6 +105,7 @@
105105
Series,
106106
DataFrame,
107107
)
108+
from pandas.core.col import col
108109

109110
from pandas.core.dtypes.dtypes import SparseDtype
110111

@@ -281,6 +282,7 @@
281282
"array",
282283
"arrays",
283284
"bdate_range",
285+
"col",
284286
"concat",
285287
"crosstab",
286288
"cut",

pandas/api/typing/__init__.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
from pandas._libs.lib import NoDefault
77
from pandas._libs.missing import NAType
88

9+
from pandas.core.col import Expression
910
from pandas.core.groupby import (
1011
DataFrameGroupBy,
1112
SeriesGroupBy,
@@ -41,6 +42,7 @@
4142
"ExpandingGroupby",
4243
"ExponentialMovingWindow",
4344
"ExponentialMovingWindowGroupby",
45+
"Expression",
4446
"FrozenList",
4547
"JsonReader",
4648
"NAType",

pandas/core/col.py

Lines changed: 283 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,283 @@
1+
from __future__ import annotations
2+
3+
from collections.abc import (
4+
Callable,
5+
Hashable,
6+
)
7+
from typing import (
8+
TYPE_CHECKING,
9+
Any,
10+
)
11+
12+
from pandas.core.series import Series
13+
14+
if TYPE_CHECKING:
15+
from pandas import DataFrame
16+
17+
18+
# Used only for generating the str repr of expressions.
19+
_OP_SYMBOLS = {
20+
"__add__": "+",
21+
"__radd__": "+",
22+
"__sub__": "-",
23+
"__rsub__": "-",
24+
"__mul__": "*",
25+
"__rmul__": "*",
26+
"__truediv__": "/",
27+
"__rtruediv__": "/",
28+
"__floordiv__": "//",
29+
"__rfloordiv__": "//",
30+
"__mod__": "%",
31+
"__rmod__": "%",
32+
"__ge__": ">=",
33+
"__gt__": ">",
34+
"__le__": "<=",
35+
"__lt__": "<",
36+
"__eq__": "==",
37+
"__ne__": "!=",
38+
}
39+
40+
41+
def _parse_args(df: DataFrame, *args: Any) -> tuple[Series]:
42+
# Parse `args`, evaluating any expressions we encounter.
43+
return tuple([x(df) if isinstance(x, Expression) else x for x in args])
44+
45+
46+
def _parse_kwargs(df: DataFrame, **kwargs: Any) -> dict[str, Any]:
47+
# Parse `kwargs`, evaluating any expressions we encounter.
48+
return {
49+
key: val(df) if isinstance(val, Expression) else val
50+
for key, val in kwargs.items()
51+
}
52+
53+
54+
def _pretty_print_args_kwargs(*args: Any, **kwargs: Any) -> str:
55+
inputs_repr = ", ".join(
56+
arg._repr_str if isinstance(arg, Expression) else repr(arg) for arg in args
57+
)
58+
kwargs_repr = ", ".join(
59+
f"{k}={v._repr_str if isinstance(v, Expression) else v!r}"
60+
for k, v in kwargs.items()
61+
)
62+
63+
all_args = []
64+
if inputs_repr:
65+
all_args.append(inputs_repr)
66+
if kwargs_repr:
67+
all_args.append(kwargs_repr)
68+
69+
return ", ".join(all_args)
70+
71+
72+
class Expression:
    """
    Class representing a deferred column.

    This is not meant to be instantiated directly. Instead, use :meth:`pandas.col`.

    An expression pairs a callable that takes a DataFrame with a string
    describing it. Operators and NumPy ufuncs applied to an expression build
    a new expression; nothing is computed until the expression is called
    with a DataFrame.
    """

    def __init__(self, func: Callable[[DataFrame], Any], repr_str: str) -> None:
        # func: evaluated lazily by __call__.
        # repr_str: text returned by __repr__ and embedded in the repr of
        # expressions derived from this one.
        self._func = func
        self._repr_str = repr_str

    def __call__(self, df: DataFrame) -> Any:
        """Evaluate the expression against ``df``."""
        return self._func(df)

    def _with_binary_op(self, op: str, other: Any) -> Expression:
        """
        Build the expression for ``self <op> other``.

        Parameters
        ----------
        op : str
            Name of the dunder method (e.g. ``"__add__"``) to look up on the
            evaluated value of ``self``.
        other : Any
            Right-hand operand. If it is an ``Expression`` it is evaluated
            against the same DataFrame; otherwise it is used as is.

        Returns
        -------
        Expression
            Deferred result whose repr is the parenthesised infix form.
        """
        op_symbol = _OP_SYMBOLS.get(op, op)

        if isinstance(other, Expression):
            other_repr = other._repr_str

            def func(df: DataFrame) -> Any:
                return getattr(self(df), op)(other(df))

        else:
            other_repr = repr(other)

            def func(df: DataFrame) -> Any:
                return getattr(self(df), op)(other)

        # Reflected operators ("__radd__", ...) mean `other` was the left
        # operand in the user's code, so show it on the left.
        if op.startswith("__r"):
            repr_str = f"({other_repr} {op_symbol} {self._repr_str})"
        else:
            repr_str = f"({self._repr_str} {op_symbol} {other_repr})"
        return Expression(func, repr_str)

    # Binary ops
    def __add__(self, other: Any) -> Expression:
        return self._with_binary_op("__add__", other)

    def __radd__(self, other: Any) -> Expression:
        return self._with_binary_op("__radd__", other)

    def __sub__(self, other: Any) -> Expression:
        return self._with_binary_op("__sub__", other)

    def __rsub__(self, other: Any) -> Expression:
        return self._with_binary_op("__rsub__", other)

    def __mul__(self, other: Any) -> Expression:
        return self._with_binary_op("__mul__", other)

    def __rmul__(self, other: Any) -> Expression:
        return self._with_binary_op("__rmul__", other)

    def __truediv__(self, other: Any) -> Expression:
        return self._with_binary_op("__truediv__", other)

    def __rtruediv__(self, other: Any) -> Expression:
        return self._with_binary_op("__rtruediv__", other)

    def __floordiv__(self, other: Any) -> Expression:
        return self._with_binary_op("__floordiv__", other)

    def __rfloordiv__(self, other: Any) -> Expression:
        return self._with_binary_op("__rfloordiv__", other)

    def __ge__(self, other: Any) -> Expression:
        return self._with_binary_op("__ge__", other)

    def __gt__(self, other: Any) -> Expression:
        return self._with_binary_op("__gt__", other)

    def __le__(self, other: Any) -> Expression:
        return self._with_binary_op("__le__", other)

    def __lt__(self, other: Any) -> Expression:
        return self._with_binary_op("__lt__", other)

    # __eq__/__ne__ return a deferred comparison rather than a bool.
    def __eq__(self, other: object) -> Expression:  # type: ignore[override]
        return self._with_binary_op("__eq__", other)

    def __ne__(self, other: object) -> Expression:  # type: ignore[override]
        return self._with_binary_op("__ne__", other)

    def __mod__(self, other: Any) -> Expression:
        return self._with_binary_op("__mod__", other)

    def __rmod__(self, other: Any) -> Expression:
        return self._with_binary_op("__rmod__", other)

    def __array_ufunc__(
        self, ufunc: Callable[..., Any], method: str, *inputs: Any, **kwargs: Any
    ) -> Expression:
        """
        Defer a NumPy ufunc applied to one or more expressions.

        Any ``Expression`` among ``inputs`` or ``kwargs`` is evaluated against
        the DataFrame first, then ``ufunc`` is called on the results.

        NOTE(review): ``method`` is not consulted; the ufunc is always invoked
        through its plain call form.
        """

        def func(df: DataFrame) -> Any:
            parsed_inputs = _parse_args(df, *inputs)
            # Must be unpacked with `**`: `*kwargs` would pass the keyword
            # names positionally, which _parse_kwargs rejects, and would lose
            # the values.
            parsed_kwargs = _parse_kwargs(df, **kwargs)
            return ufunc(*parsed_inputs, **parsed_kwargs)

        args_str = _pretty_print_args_kwargs(*inputs, **kwargs)
        repr_str = f"{ufunc.__name__}({args_str})"

        return Expression(func, repr_str)

    # Everything else
    def __getattr__(self, attr: str, /) -> Any:
        """
        Defer attribute access on the evaluated column.

        Names listed in ``Series._accessors`` return a ``NamespaceExpression``.
        Any other name returns a callable which, given the method's arguments,
        produces an ``Expression`` that calls that method on the evaluated
        value.
        """
        if attr in Series._accessors:
            return NamespaceExpression(self, attr)

        def func(df: DataFrame, *args: Any, **kwargs: Any) -> Any:
            parsed_args = _parse_args(df, *args)
            parsed_kwargs = _parse_kwargs(df, **kwargs)
            return getattr(self(df), attr)(*parsed_args, **parsed_kwargs)

        def wrapper(*args: Any, **kwargs: Any) -> Expression:
            args_str = _pretty_print_args_kwargs(*args, **kwargs)
            repr_str = f"{self._repr_str}.{attr}({args_str})"

            return Expression(lambda df: func(df, *args, **kwargs), repr_str)

        return wrapper

    def __repr__(self) -> str:
        # Fall back to a placeholder when no description was supplied.
        return self._repr_str or "Expr(...)"
190+
191+
192+
class NamespaceExpression:
    """
    Deferred access to an accessor namespace of a deferred column.

    Holds the underlying expression and the namespace name. Attribute lookup
    on an instance is resolved against ``getattr(Series, namespace)``:
    properties become an ``Expression`` directly, anything else becomes a
    callable that builds an ``Expression`` once given its arguments.
    """

    def __init__(self, func: Expression, namespace: str) -> None:
        self._func = func
        self._namespace = namespace

    def __call__(self, df: DataFrame) -> Any:
        # Evaluates the wrapped expression itself, not the namespace.
        return self._func(df)

    def __getattr__(self, attr: str) -> Any:
        def _qualified_name() -> str:
            return f"{self._func._repr_str}.{self._namespace}.{attr}"

        def _resolve(df: DataFrame) -> Any:
            # Look the attribute up on the accessor of the evaluated column.
            accessor = getattr(self(df), self._namespace)
            return getattr(accessor, attr)

        accessor_cls = getattr(Series, self._namespace)
        if isinstance(getattr(accessor_cls, attr), property):
            # Properties take no arguments, so the expression is complete.
            return Expression(_resolve, _qualified_name())

        def _invoke(df: DataFrame, *args: Any, **kwargs: Any) -> Any:
            call_args = _parse_args(df, *args)
            call_kwargs = _parse_kwargs(df, **kwargs)
            return _resolve(df)(*call_args, **call_kwargs)

        def wrapper(*args: Any, **kwargs: Any) -> Expression:
            rendered = _pretty_print_args_kwargs(*args, **kwargs)
            return Expression(
                lambda df: _invoke(df, *args, **kwargs),
                f"{_qualified_name()}({rendered})",
            )

        return wrapper
221+
222+
223+
def col(col_name: Hashable) -> Expression:
    """
    Generate deferred object representing a column of a DataFrame.

    Any place which accepts ``lambda df: df[col_name]``, such as
    :meth:`DataFrame.assign` or :meth:`DataFrame.loc`, can also accept
    ``pd.col(col_name)``.

    Parameters
    ----------
    col_name : Hashable
        Column name.

    Returns
    -------
    `pandas.api.typing.Expression`
        A deferred object representing a column of a DataFrame.

    Raises
    ------
    TypeError
        If ``col_name`` is not hashable.

    See Also
    --------
    DataFrame.query : Query columns of a dataframe using string expressions.

    Notes
    -----
    When the returned expression is evaluated on a DataFrame that has no
    column named ``col_name``, a ``ValueError`` listing the available
    columns is raised.

    Examples
    --------

    You can use `col` in `assign`.

    >>> df = pd.DataFrame({"name": ["beluga", "narwhal"], "speed": [100, 110]})
    >>> df.assign(name_titlecase=pd.col("name").str.title())
          name  speed name_titlecase
    0   beluga    100         Beluga
    1  narwhal    110        Narwhal

    You can also use it for filtering.

    >>> df.loc[pd.col("speed") > 105]
          name  speed
    1  narwhal    110
    """
    if not isinstance(col_name, Hashable):
        raise TypeError(f"Expected Hashable, got: {type(col_name)}")

    def _select(df: DataFrame) -> Series:
        if col_name in df.columns:
            return df[col_name]

        # Keep the hint readable for wide frames by truncating the listing.
        limit = 90
        available = str(df.columns.tolist())
        if len(available) > limit:
            available = available[:limit] + "...]"

        raise ValueError(
            f"Column '{col_name}' not found in given DataFrame.\n\n"
            f"Hint: did you mean one of {available} instead?"
        )

    return Expression(_select, f"col({col_name!r})")
281+
282+
283+
__all__ = ["Expression", "col"]

pandas/core/frame.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5304,6 +5304,13 @@ def assign(self, **kwargs) -> DataFrame:
53045304
Portland 17.0 62.6
53055305
Berkeley 25.0 77.0
53065306
5307+
or by using :meth:`pandas.col`:
5308+
5309+
>>> df.assign(temp_f=pd.col("temp_c") * 9 / 5 + 32)
5310+
temp_c temp_f
5311+
Portland 17.0 62.6
5312+
Berkeley 25.0 77.0
5313+
53075314
You can create multiple columns within the same assign where one
53085315
of the columns depends on another one defined within the same assign:
53095316

pandas/tests/api/test_api.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -107,6 +107,7 @@ class TestPDApi(Base):
107107
funcs = [
108108
"array",
109109
"bdate_range",
110+
"col",
110111
"concat",
111112
"crosstab",
112113
"cut",
@@ -260,6 +261,7 @@ class TestApi(Base):
260261
"ExpandingGroupby",
261262
"ExponentialMovingWindow",
262263
"ExponentialMovingWindowGroupby",
264+
"Expression",
263265
"FrozenList",
264266
"JsonReader",
265267
"NaTType",

0 commit comments

Comments
 (0)