-
-
Notifications
You must be signed in to change notification settings - Fork 18.9k
Description
EDIT: Updated Example Code
Is your feature request related to a problem?
I was trying to perform a method chained logical mask without an intermediate assignment and found the method of passing the logical mask or callable to .loc
to be verbose. I am trying to reduce that verbosity.
Examples
(df_xtra_metadata.loc[(df_xtra_metadata['column'] == val) & (df_xtra_metadata['column2'] >= val2)]
.reset_index()
)
# or with a callable, shorter but still long
(df_xtra_metadata.loc[lambda x: (x['column'] == val) & (x['column2'] >= val2)]
.reset_index()
)
# not as long but requires assigning and deleting the mask in the scope
mask = (df_xtra_metadata['column'] == val) & (df_xtra_metadata['column2'] >= val2)
df.loc[mask].reset_index()
Describe the solution you'd like
I would like an object similar to pd.IndexSlice
but instead is initialized with some metadata and then when called (which takes in the dataframe) computes the logical mask.
API breaking implications
This feature would add to the existing API but in a non obtrusive way and would not conflict with any of the current API.
Describe alternatives you've considered
I haven't considered any alternatives, I wrote this for myself and thought maybe it would be useful to others.
Additional context
So below is the code I wrote toying around with this idea.
from __future__ import annotations
from operator import and_, eq, ge, getitem, gt, invert, le, lt, ne, or_, xor
from typing import Any, Callable, Literal, Union
import pandas as pd
class LogicalSlice:
"""Create objects to more easily calculate logical masks inplace for slicing DataFrames."""
ops = {
">": gt,
"<": lt,
">=": ge,
"<=": le,
"==": eq,
"!=": ne,
"in": pd.Series.isin,
}
__slots__ = ("_repr", "_closure", "_body")
def __init__(
self, column: Any, op: Literal[">", "<", ">=", "<=", "==", "!=", "in"], val: Any
) -> None:
operation: Union[gt, lt, ge, le, eq, ne, pd.Series.isin] = self.ops[op]
def closure(df: pd.DataFrame) -> pd.Series:
return operation(getitem(df, column), val)
self._closure: Callable[pd.DataFrame, pd.Series] = closure
if isinstance(column, str):
col: str = f"'{column}'"
else:
col: str = str(column)
if op == "in":
op_rhs: str = f".isin({val})"
else:
op_rhs: str = f" {op} {val}"
body: str = f"df[{col}]{op_rhs}"
self._set_body(body)
def __call__(self, df: pd.DataFrame) -> pd.Series:
return self._closure(df)
def __repr__(self) -> str:
return self._repr
def __invert__(self) -> LogicalSlice:
inverted: LogicalSlice = self._compose(invert, self)
body: str = f"~({self._body})"
inverted._set_body(body)
return inverted
def __and__(self, other: LogicalSlice) -> LogicalSlice:
conjunction: LogicalSlice = self._compose(and_, self, other)
body: str = f"({self._body}) & ({other._body})"
conjunction._set_body(body)
return conjunction
def __or__(self, other: LogicalSlice) -> LogicalSlice:
disjunction: LogicalSlice = self._compose(or_, self, other)
body: str = f"({self._body}) | ({other._body})"
disjunction._set_body(body)
return disjunction
def __xor__(self, other: LogicalSlice):
symetric_difference: LogicalSlice = self._compose(xor, self, other)
body: str = f"({self._body}) ^ ({other._body})"
symetric_difference._set_body(body)
return symetric_difference
def _set_body(self, body: str) -> None:
self._body = body
self._repr = f"lambda df: {body}"
@classmethod
def _compose(
cls, operation: Union[invert, xor, and_, or_], *operands: LogicalSlice
) -> LogicalSlice:
new_logical_slice = cls.__new__(cls)
def closure(df: pd.DataFrame) -> pd.Series:
return operation(*(operand(df) for operand in operands))
new_logical_slice._closure = closure
return new_logical_slice
Here is some example code of the API. I suppose it should
In [7]: import pandas as pd
In [8]: import numpy as np
In [9]: mask = pd.LogicalSlice
In [10]: # create dummy dataframe for example
In [11]: df = pd.DataFrame(np.random.randn(8, 5), index=list(range(8)), columns=[1,2,3,4,5])
In [12]: df
Out[12]:
1 2 3 4 5
0 2.146687 1.111394 0.782805 -0.664520 -0.775198
1 0.266829 1.021803 0.112757 1.912192 -0.629621
2 0.796544 -0.584199 -0.958174 0.817412 -0.148881
3 0.436404 0.528461 0.911095 0.854496 2.047070
4 -0.144312 -2.274132 0.129994 -2.413109 1.248950
5 -0.233113 -2.028095 -0.164058 0.912749 2.717956
6 -1.537385 1.881949 -1.567367 0.489918 -1.142897
7 -0.411876 -1.044270 -0.869536 0.878700 0.077144
In [13]: # get the rows where column 1 is greater than 0 using pd.LogicalSlice
In [16]: df.loc[mask(1, ">", 0)]
Out[16]:
1 2 3 4 5
0 2.146687 1.111394 0.782805 -0.664520 -0.775198
1 0.266829 1.021803 0.112757 1.912192 -0.629621
2 0.796544 -0.584199 -0.958174 0.817412 -0.148881
3 0.436404 0.528461 0.911095 0.854496 2.047070
In [17]: # bind the mask to a variable for later user
In [18]: non_neg = mask(1, ">", 0)
In [19]: # repr displays a lambda but its a closure under the hood
In [20]: non_neg
Out[20]: lambda df: df[1] > 0
In [21]: df.loc[non_neg]
Out[21]:
1 2 3 4 5
0 2.146687 1.111394 0.782805 -0.664520 -0.775198
1 0.266829 1.021803 0.112757 1.912192 -0.629621
2 0.796544 -0.584199 -0.958174 0.817412 -0.148881
3 0.436404 0.528461 0.911095 0.854496 2.047070
In [22]: # multiple LogicalSlices can be combined using bitwise operators
In [23]: non_neg & mask(2, ">", 0)
Out[23]: lambda df: (df[1] > 0) & (df[2] > 0)
In [24]: df.loc[non_neg & mask(2, ">", 0)]
Out[24]:
1 2 3 4 5
0 2.146687 1.111394 0.782805 -0.664520 -0.775198
1 0.266829 1.021803 0.112757 1.912192 -0.629621
3 0.436404 0.528461 0.911095 0.854496 2.047070
In [25]: # the traditional method
In [26]: df.loc[(df[1] > 0) & (df[2] > 0)]
Out[26]:
1 2 3 4 5
0 2.146687 1.111394 0.782805 -0.664520 -0.775198
1 0.266829 1.021803 0.112757 1.912192 -0.629621
3 0.436404 0.528461 0.911095 0.854496 2.047070
In [27]: # combining multiple masks
In [28]: ~(non_neg & mask(2, ">", 0)) | mask(4, "<", 0)
Out[28]: lambda df: (~((df[1] > 0) & (df[2] > 0))) | (df[4] < 0)
In [29]: big_mask = ~(non_neg & mask(2, ">", 0)) | mask(4, "<", 0)
In [30]: big_mask
Out[30]: lambda df: (~((df[1] > 0) & (df[2] > 0))) | (df[4] < 0)
In [31]: df.loc[big_mask]
Out[31]:
1 2 3 4 5
0 2.146687 1.111394 0.782805 -0.664520 -0.775198
2 0.796544 -0.584199 -0.958174 0.817412 -0.148881
4 -0.144312 -2.274132 0.129994 -2.413109 1.248950
5 -0.233113 -2.028095 -0.164058 0.912749 2.717956
6 -1.537385 1.881949 -1.567367 0.489918 -1.142897
7 -0.411876 -1.044270 -0.869536 0.878700 0.077144