[ENH] Fix summarise for MultiIndex (#1460)

samukweku · samuel.oranyeli · ericmjl · web-flow · commit 09df7f6e591a · 2025-04-11T09:31:26.000+10:00
* updates to jn.summarise

* cleanup

* cleanup

* singledispatch for tuple

* minor fix for mutate

* remove default parameter in max

* remove default parameter in max

* remove default parameter in max

---------

Co-authored-by: samuel.oranyeli <samuel.oranyeli@grow.inc>
Co-authored-by: Eric Ma <ericmjl@users.noreply.github.com>
diff --git a/janitor/functions/mutate.py b/janitor/functions/mutate.py
@@ -177,6 +177,8 @@ def mutate(
         A pandas DataFrame or Series with aggregated columns.
     """  # noqa: E501
     check("copy", copy, [bool])
+    if copy:
+        df = df.copy(deep=None)
     if by is not None:
         if isinstance(by, DataFrameGroupBy):
             # it is assumed that by is created from df
@@ -188,8 +190,7 @@ def mutate(
             if is_scalar(by):
                 by = [by]
             by = df.groupby(by, sort=False, observed=True)
-    if copy:
-        df = df.copy(deep=None)
+
     for arg in args:
         df = _mutator(arg, df=df, by=by)
     return df
@@ -226,11 +227,9 @@ def _(arg, df, by):
     for column_name, mutator in arg.items():
         if isinstance(mutator, tuple):
             column, func = mutator
-            column = _process_within_dict(mutator=func, obj=val[column])
+            column = _apply_func_to_obj(mutator=func, obj=val[column])
         else:
-            column = _process_within_dict(
-                mutator=mutator, obj=val[column_name]
-            )
+            column = _apply_func_to_obj(mutator=mutator, obj=val[column_name])
         df[column_name] = column
     return df
 
@@ -262,7 +261,7 @@ def _process_maybe_string(func: str, obj):
     return obj.transform(func)
 
 
-def _process_within_dict(mutator, obj):
+def _apply_func_to_obj(mutator, obj):
     """Handle str/callables within a dictionary"""
     if isinstance(mutator, str):
         return _process_maybe_string(func=mutator, obj=obj)
diff --git a/janitor/functions/summarise.py b/janitor/functions/summarise.py
@@ -19,7 +19,7 @@ def summarise(
     df: pd.DataFrame,
     *args: tuple[dict | tuple],
     by: Any = None,
-) -> pd.DataFrame | pd.Series:
+) -> pd.DataFrame:
     """
 
     !!! info "New in version 0.31.0"
@@ -42,13 +42,15 @@ def summarise(
     - **dictionary argument**:
     If the argument is a dictionary,
     the value in the `{key:value}` pairing
-    should be either a string, a callable or a tuple.
+    should be either a string, a callable, or a tuple.
 
         - If the value in the dictionary
         is a string or a callable,
         the key of the dictionary
         should be an existing column name.
 
+        The function is applied on the `df[column_name]` series.
+
         !!!note
 
             - If the value is a string,
@@ -57,28 +59,24 @@ def summarise(
 
         - If the value of the dictionary is a tuple,
         it should be of length 2, and of the form
-        `(column_name, mutation_func)`,
+        `(column_name, aggfunc)`,
         where `column_name` should exist in the DataFrame,
-        and `mutation_func` should be either a string or a callable.
-
-        !!!note
+        and `aggfunc` should be either a string or a callable.
 
-            - If `mutation_func` is a string,
-            the string should be a pandas string function,
-            e.g "sum", "mean", etc.
+        This option allows for custom renaming of the aggregation output,
+        where the key in the dictionary can be a new column name.
 
-        The key in the dictionary can be a new column name.
 
     - **tuple argument**:
     If the argument is a tuple, it should be of length 2,
     and of the form
-    `(column_name, mutation_func)`,
+    `(column_name, aggfunc)`,
     where column_name should exist in the DataFrame,
-    and `mutation_func` should be either a string or a callable.
+    and `aggfunc` should be either a string or a callable.
 
         !!!note
 
-            - if `mutation_func` is a string,
+            - if `aggfunc` is a string,
             the string should be a pandas string function,
             e.g "sum", "mean", etc.
 
@@ -89,6 +87,7 @@ def summarise(
             as such multiple columns can be processed here -
             they will be processed individually.
 
+
     - **callable argument**:
     If the argument is a callable, the callable is applied
     on the DataFrame or GroupBy object.
@@ -169,7 +168,7 @@ def summarise(
         ValueError: If a tuple is passed and the length is not 2.
 
     Returns:
-        A pandas DataFrame or Series with aggregated columns.
+        A pandas DataFrame with aggregated columns.
 
     """  # noqa: E501
 
@@ -184,18 +183,52 @@ def summarise(
             if is_scalar(by):
                 by = [by]
             by = df.groupby(by, sort=False, observed=True)
-    dictionary = {}
+    contents = []
     for arg in args:
-        aggregate = _mutator(arg, df=df, by=by)
-        dictionary.update(aggregate)
-    values = map(is_scalar, dictionary.values())
-    if all(values):
-        return pd.Series(dictionary)
-    return pd.concat(dictionary, axis="columns", sort=False, copy=False)
+        aggregate = _aggfunc(arg, df=df, by=by)
+        contents.extend(aggregate)
+    counts = 0
+    for entry in contents:
+        if isinstance(entry, pd.DataFrame):
+            length = entry.columns.nlevels
+        elif isinstance(entry.name, tuple):
+            length = len(entry.name)
+        else:
+            length = 1
+        counts = max(counts, length)
+    contents_ = []
+    for entry in contents:
+        if isinstance(entry, pd.DataFrame):
+            length_ = entry.columns.nlevels
+            length = counts - length_
+            if length:
+                patch = [""] * length
+                columns = [
+                    entry.columns.get_level_values(n) for n in range(length_)
+                ]
+                columns.append(patch)
+                names = [*entry.columns.names]
+                names.extend([None] * length)
+                columns = pd.MultiIndex.from_arrays(columns, names=names)
+                entry.columns = columns
+        elif is_scalar(entry.name):
+            length = counts - 1
+            if length:
+                patch = [""] * length
+                name = (entry.name, *patch)
+                entry.name = name
+        elif isinstance(entry.name, tuple):
+            length = counts - len(entry.name)
+            if length:
+                patch = [""] * length
+                name = (*entry.name, *patch)
+                entry.name = name
+        contents_.append(entry)
+    return pd.concat(contents_, axis=1, copy=False, sort=False)
 
 
 @singledispatch
-def _mutator(arg, df, by):
+def _aggfunc(arg, df, by):
     if by is None:
         val = df
     else:
@@ -204,41 +237,68 @@ def _mutator(arg, df, by):
     if isinstance(outcome, pd.Series):
         if not outcome.name:
             raise ValueError("Ensure the pandas Series object has a name")
-        return {outcome.name: outcome}
-    # assumption: a mapping - DataFrame/dictionary/...
-    return {**outcome}
+        return [outcome]
+    if isinstance(outcome, pd.DataFrame):
+        return [outcome]
+    raise TypeError(
+        "The output from the aggregation should be a named Series or a DataFrame"
+    )
 
 
-@_mutator.register(dict)
+@_aggfunc.register(tuple)
+def _(arg, df, by):
+    """Dispatch function for tuple"""
+    if len(arg) != 2:
+        raise ValueError("the tuple has to be a length of 2")
+    column_name, aggfunc = arg
+    column_names = get_index_labels(arg=[column_name], df=df, axis="columns")
+    mapping = {column_name: aggfunc for column_name in column_names}
+    return _aggfunc(mapping, df=df, by=by)
+
+
+@_aggfunc.register(dict)
 def _(arg, df, by):
     """Dispatch function for dictionary"""
     if by is None:
         val = df
     else:
         val = by
 
-    dictionary = {}
-    for column_name, mutator in arg.items():
-        if isinstance(mutator, tuple):
-            column, func = mutator
-            column = _process_within_dict(mutator=func, obj=val[column])
+    contents = []
+    for column_name, aggfunc in arg.items():
+        if isinstance(aggfunc, tuple):
+            if len(aggfunc) != 2:
+                raise ValueError("the tuple has to be a length of 2")
+            column, func = aggfunc
+            column_ = _handle_tuple_groupby_selection(by=by, column=column)
+            column = _apply_func_to_obj(aggfunc=func, obj=val[column_])
+            if isinstance(column, pd.DataFrame) and column.shape[-1] == 1:
+                column = column.squeeze()
+            column = _convert_obj_to_named_series(
+                obj=column,
+                column_name=column_name,
+                function=func,
+            )
+            if not isinstance(column, pd.Series):
+                raise TypeError(
+                    "Expected a pandas Series object; "
+                    f"instead got {type(column)}"
+                )
         else:
-            column = _process_within_dict(
-                mutator=mutator, obj=val[column_name]
+            column_ = _handle_tuple_groupby_selection(
+                by=by, column=column_name
             )
-        dictionary[column_name] = column
-    return dictionary
-
-
-@_mutator.register(tuple)
-def _(arg, df, by):
-    """Dispatch function for tuple"""
-    if len(arg) != 2:
-        raise ValueError("the tuple has to be a length of 2")
-    column_names, mutator = arg
-    column_names = get_index_labels(arg=[column_names], df=df, axis="columns")
-    mapping = {column_name: mutator for column_name in column_names}
-    return _mutator(mapping, df=df, by=by)
+            column = _apply_func_to_obj(aggfunc=aggfunc, obj=val[column_])
+            column = _convert_obj_to_named_series(
+                obj=column,
+                column_name=column_name,
+                function=aggfunc,
+            )
+        column = _rename_column_in_by(
+            column=column, column_name=column_name, by=by
+        )
+        contents.append(column)
+    return contents
 
 
 def _process_maybe_callable(func: callable, obj):
@@ -257,8 +317,39 @@ def _process_maybe_string(func: str, obj):
     return obj.agg(func)
 
 
-def _process_within_dict(mutator, obj):
+def _apply_func_to_obj(aggfunc, obj):
     """Handle str/callables within a dictionary"""
-    if isinstance(mutator, str):
-        return _process_maybe_string(func=mutator, obj=obj)
-    return _process_maybe_callable(func=mutator, obj=obj)
+    if isinstance(aggfunc, str):
+        return _process_maybe_string(func=aggfunc, obj=obj)
+    return _process_maybe_callable(func=aggfunc, obj=obj)
+
+
+def _handle_tuple_groupby_selection(by: Any, column: Any):
+    """
+    Properly handle a tuple column selection in the presence of a groupby
+    """
+    if (by is not None) and isinstance(column, tuple):
+        return [column]
+    return column
+
+
+def _convert_obj_to_named_series(obj, function: Any, column_name: Any):
+    if isinstance(obj, pd.Series):
+        obj.name = column_name
+        return obj
+    if not is_scalar(obj):
+        return obj
+    if isinstance(function, str):
+        function_name = function
+    else:
+        function_name = function.__name__
+    return pd.Series(data=obj, index=[function_name], name=column_name)
+
+
+def _rename_column_in_by(column, column_name, by):
+    if by is None:
+        return column
+    elif isinstance(column, pd.DataFrame) and is_scalar(column_name):
+        columns = pd.MultiIndex.from_product([[column_name], column.columns])
+        column.columns = columns
+    return column
diff --git a/tests/functions/test_summarise.py b/tests/functions/test_summarise.py