Skip to content

ENH: LogicalSlice object #40740

@Delengowski

Description

@Delengowski

EDIT: Updated Example Code

Is your feature request related to a problem?

I was trying to perform a method chained logical mask without an intermediate assignment and found the method of passing the logical mask or callable to .loc to be verbose. I am trying to reduce that verbosity.

Examples

(df_xtra_metadata.loc[(df_xtra_metadata['column'] == val) & (df_xtra_metadata['column2'] >= val2)]
    .reset_index()
)

# or with a callable, shorter but still long
(df_xtra_metadata.loc[lambda x: (x['column'] == val) & (x['column2'] >= val2)]
   .reset_index()
)

# not as long but requires assigning and deleting the mask in the scope
mask = (df_xtra_metadata['column'] == val) & (df_xtra_metadata['column2'] >= val2)
df.loc[mask].reset_index()

Describe the solution you'd like

I would like an object similar to pd.IndexSlice but instead is initialized with some metadata and then when called (which takes in the dataframe) computes the logical mask.

API breaking implications

This feature would add to the existing API but in a non obtrusive way and would not conflict with any of the current API.

Describe alternatives you've considered

I haven't considered any alternatives, I wrote this for myself and thought maybe it would be useful to others.

Additional context

So below is the code I wrote toying around with this idea.

from __future__ import annotations
from operator import and_, eq, ge, getitem, gt, invert, le, lt, ne, or_, xor
from typing import Any, Callable, Literal, Union

import pandas as pd


class LogicalSlice:
    """Create objects to more easily calculate logical masks inplace for slicing DataFrames."""
    ops = {
        ">": gt,
        "<": lt,
        ">=": ge,
        "<=": le,
        "==": eq,
        "!=": ne,
        "in": pd.Series.isin,
    }

    __slots__ = ("_repr", "_closure", "_body")

    def __init__(
        self, column: Any, op: Literal[">", "<", ">=", "<=", "==", "!=", "in"], val: Any
    ) -> None:
        operation: Union[gt, lt, ge, le, eq, ne, pd.Series.isin] = self.ops[op]

        def closure(df: pd.DataFrame) -> pd.Series:
            return operation(getitem(df, column), val)

        self._closure: Callable[pd.DataFrame, pd.Series] = closure
        if isinstance(column, str):
            col: str = f"'{column}'"
        else:
            col: str = str(column)
        if op == "in":
            op_rhs: str = f".isin({val})"
        else:
            op_rhs: str = f" {op} {val}"
        body: str = f"df[{col}]{op_rhs}"
        self._set_body(body)

    def __call__(self, df: pd.DataFrame) -> pd.Series:
        return self._closure(df)

    def __repr__(self) -> str:
        return self._repr

    def __invert__(self) -> LogicalSlice:
        inverted: LogicalSlice = self._compose(invert, self)
        body: str = f"~({self._body})"
        inverted._set_body(body)
        return inverted

    def __and__(self, other: LogicalSlice) -> LogicalSlice:
        conjunction: LogicalSlice = self._compose(and_, self, other)
        body: str = f"({self._body}) & ({other._body})"
        conjunction._set_body(body)
        return conjunction

    def __or__(self, other: LogicalSlice) -> LogicalSlice:
        disjunction: LogicalSlice = self._compose(or_, self, other)
        body: str = f"({self._body}) | ({other._body})"
        disjunction._set_body(body)
        return disjunction

    def __xor__(self, other: LogicalSlice):
        symetric_difference: LogicalSlice = self._compose(xor, self, other)
        body: str = f"({self._body}) ^ ({other._body})"
        symetric_difference._set_body(body)
        return symetric_difference

    def _set_body(self, body: str) -> None:
        self._body = body
        self._repr = f"lambda df: {body}"

    @classmethod
    def _compose(
        cls, operation: Union[invert, xor, and_, or_], *operands: LogicalSlice
    ) -> LogicalSlice:
        new_logical_slice = cls.__new__(cls)

        def closure(df: pd.DataFrame) -> pd.Series:
            return operation(*(operand(df) for operand in operands))

        new_logical_slice._closure = closure
        return new_logical_slice

Here is some example code of the API. I suppose it should

In [7]: import pandas as pd

In [8]: import numpy as np

In [9]: mask = pd.LogicalSlice

In [10]: # create dummy dataframe for example

In [11]: df = pd.DataFrame(np.random.randn(8, 5), index=list(range(8)), columns=[1,2,3,4,5])

In [12]: df
Out[12]:
          1         2         3         4         5
0  2.146687  1.111394  0.782805 -0.664520 -0.775198
1  0.266829  1.021803  0.112757  1.912192 -0.629621
2  0.796544 -0.584199 -0.958174  0.817412 -0.148881
3  0.436404  0.528461  0.911095  0.854496  2.047070
4 -0.144312 -2.274132  0.129994 -2.413109  1.248950
5 -0.233113 -2.028095 -0.164058  0.912749  2.717956
6 -1.537385  1.881949 -1.567367  0.489918 -1.142897
7 -0.411876 -1.044270 -0.869536  0.878700  0.077144

In [13]: # get the rows where column 1 is greater than 0 using pd.LogicalSlice

In [16]: df.loc[mask(1, ">", 0)]
Out[16]:
          1         2         3         4         5
0  2.146687  1.111394  0.782805 -0.664520 -0.775198
1  0.266829  1.021803  0.112757  1.912192 -0.629621
2  0.796544 -0.584199 -0.958174  0.817412 -0.148881
3  0.436404  0.528461  0.911095  0.854496  2.047070

In [17]: # bind the mask to a variable for later user

In [18]: non_neg = mask(1, ">", 0)

In [19]: # repr displays a lambda but its a closure under the hood

In [20]: non_neg
Out[20]: lambda df: df[1] > 0

In [21]: df.loc[non_neg]
Out[21]:
          1         2         3         4         5
0  2.146687  1.111394  0.782805 -0.664520 -0.775198
1  0.266829  1.021803  0.112757  1.912192 -0.629621
2  0.796544 -0.584199 -0.958174  0.817412 -0.148881
3  0.436404  0.528461  0.911095  0.854496  2.047070

In [22]: # multiple LogicalSlices can be combined using bitwise operators

In [23]: non_neg & mask(2, ">", 0)
Out[23]: lambda df: (df[1] > 0) & (df[2] > 0)

In [24]: df.loc[non_neg & mask(2, ">", 0)]
Out[24]:
          1         2         3         4         5
0  2.146687  1.111394  0.782805 -0.664520 -0.775198
1  0.266829  1.021803  0.112757  1.912192 -0.629621
3  0.436404  0.528461  0.911095  0.854496  2.047070

In [25]: # the traditional method

In [26]: df.loc[(df[1] > 0) & (df[2] > 0)]
Out[26]:
          1         2         3         4         5
0  2.146687  1.111394  0.782805 -0.664520 -0.775198
1  0.266829  1.021803  0.112757  1.912192 -0.629621
3  0.436404  0.528461  0.911095  0.854496  2.047070

In [27]: # combining multiple masks

In [28]: ~(non_neg & mask(2, ">", 0)) | mask(4, "<", 0)
Out[28]: lambda df: (~((df[1] > 0) & (df[2] > 0))) | (df[4] < 0)

In [29]: big_mask = ~(non_neg & mask(2, ">", 0)) | mask(4, "<", 0)

In [30]: big_mask
Out[30]: lambda df: (~((df[1] > 0) & (df[2] > 0))) | (df[4] < 0)

In [31]: df.loc[big_mask]
Out[31]:
          1         2         3         4         5
0  2.146687  1.111394  0.782805 -0.664520 -0.775198
2  0.796544 -0.584199 -0.958174  0.817412 -0.148881
4 -0.144312 -2.274132  0.129994 -2.413109  1.248950
5 -0.233113 -2.028095 -0.164058  0.912749  2.717956
6 -1.537385  1.881949 -1.567367  0.489918 -1.142897
7 -0.411876 -1.044270 -0.869536  0.878700  0.077144

Metadata

Metadata

Assignees

No one assigned

    Labels

    EnhancementNeeds TriageIssue that has not been reviewed by a pandas team member

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions