Merge pull request #61 from igerber/claude/todo-items-2.0.1-h6tgS

igerber · web-flow · commit d851fc3ddad9 · 2026-01-13T07:28:59.000-05:00
Address TODO items for version 2.0.1
diff --git a/TODO.md b/TODO.md
@@ -25,7 +25,7 @@ Consolidation opportunities for cleaner maintenance:
 
 | Duplicate Code | Locations | Notes |
 |---------------|-----------|-------|
-| Within-transformation logic | `estimators.py:217-232`, `estimators.py:787-833`, `bacon.py:567-642` | Extract to utils.py |
+| ~~Within-transformation logic~~ | ~~Multiple files~~ | ✅ Extracted to `utils.py` as `demean_by_group()` and `within_transform()` (v2.0.1) |
 | Linear regression helper | `staggered.py:205-240`, `estimators.py:366-408` | Consider consolidation |
 
 ### Large Module Files
diff --git a/diff_diff/__init__.py b/diff_diff/__init__.py
@@ -113,7 +113,7 @@
     plot_sensitivity,
 )
 
-__version__ = "2.0.0"
+__version__ = "2.0.1"
 __all__ = [
     # Estimators
     "DifferenceInDifferences",
diff --git a/diff_diff/bacon.py b/diff_diff/bacon.py
@@ -17,6 +17,8 @@
 import numpy as np
 import pandas as pd
 
+from diff_diff.utils import within_transform as _within_transform_util
+
 
 @dataclass
 class Comparison2x2:
@@ -573,66 +575,16 @@ def _compute_twfe(
         treat_col: str = '__bacon_treated_internal__',
     ) -> float:
         """Compute TWFE estimate using within-transformation."""
-        # Demean by unit and time
-        y = df[outcome].values
-        d = df[treat_col].astype(float).values
-
-        # Create unit and time dummies for demeaning
-        units = df[unit].values
-        times = df[time].values
-
-        # Unit means
-        unit_map = {u: i for i, u in enumerate(df[unit].unique())}
-        unit_idx = np.array([unit_map[u] for u in units])
-        n_units = len(unit_map)
-
-        # Time means
-        time_map = {t: i for i, t in enumerate(df[time].unique())}
-        time_idx = np.array([time_map[t] for t in times])
-        n_times = len(time_map)
-
-        # Compute means
-        y_unit_mean = np.zeros(n_units)
-        d_unit_mean = np.zeros(n_units)
-        unit_counts = np.zeros(n_units)
-
-        for i in range(len(y)):
-            u = unit_idx[i]
-            y_unit_mean[u] += y[i]
-            d_unit_mean[u] += d[i]
-            unit_counts[u] += 1
-
-        y_unit_mean /= np.maximum(unit_counts, 1)
-        d_unit_mean /= np.maximum(unit_counts, 1)
-
-        y_time_mean = np.zeros(n_times)
-        d_time_mean = np.zeros(n_times)
-        time_counts = np.zeros(n_times)
-
-        for i in range(len(y)):
-            t = time_idx[i]
-            y_time_mean[t] += y[i]
-            d_time_mean[t] += d[i]
-            time_counts[t] += 1
-
-        y_time_mean /= np.maximum(time_counts, 1)
-        d_time_mean /= np.maximum(time_counts, 1)
-
-        # Overall mean
-        y_mean = np.mean(y)
-        d_mean = np.mean(d)
-
-        # Within transformation: y_it - y_i - y_t + y
-        y_within = np.zeros(len(y))
-        d_within = np.zeros(len(d))
-
-        for i in range(len(y)):
-            u = unit_idx[i]
-            t = time_idx[i]
-            y_within[i] = y[i] - y_unit_mean[u] - y_time_mean[t] + y_mean
-            d_within[i] = d[i] - d_unit_mean[u] - d_time_mean[t] + d_mean
-
-        # OLS on demeaned data
+        # Apply two-way within transformation
+        df_dm = _within_transform_util(
+            df, [outcome, treat_col], unit, time, suffix="_within"
+        )
+
+        # Extract within-transformed values
+        y_within = df_dm[f"{outcome}_within"].values
+        d_within = df_dm[f"{treat_col}_within"].values
+
+        # OLS on demeaned data: beta = sum(d * y) / sum(d^2)
         d_var = np.sum(d_within ** 2)
         if d_var > 0:
             beta = np.sum(d_within * y_within) / d_var
diff --git a/diff_diff/estimators.py b/diff_diff/estimators.py
@@ -23,6 +23,7 @@
     WildBootstrapResults,
     compute_confidence_interval,
     compute_p_value,
+    demean_by_group,
     validate_binary,
     wild_bootstrap_se,
 )
@@ -227,10 +228,10 @@ def fit(
             # unit-invariant, so demeaning them would create multicollinearity
             vars_to_demean = [outcome] + (covariates or [])
             for ab_var in absorb:
-                n_absorbed_effects += working_data[ab_var].nunique() - 1
-                for var in vars_to_demean:
-                    group_means = working_data.groupby(ab_var)[var].transform("mean")
-                    working_data[var] = working_data[var] - group_means
+                working_data, n_fe = demean_by_group(
+                    working_data, vars_to_demean, ab_var, inplace=True
+                )
+                n_absorbed_effects += n_fe
                 absorbed_vars.append(ab_var)
 
         # Extract variables (may be demeaned if absorb was used)
@@ -828,10 +829,10 @@ def fit(  # type: ignore[override]
         if absorb:
             vars_to_demean = [outcome] + (covariates or [])
             for ab_var in absorb:
-                n_absorbed_effects += working_data[ab_var].nunique() - 1
-                for var in vars_to_demean:
-                    group_means = working_data.groupby(ab_var)[var].transform("mean")
-                    working_data[var] = working_data[var] - group_means
+                working_data, n_fe = demean_by_group(
+                    working_data, vars_to_demean, ab_var, inplace=True
+                )
+                n_absorbed_effects += n_fe
 
         # Extract outcome and treatment
         y = working_data[outcome].values.astype(float)
diff --git a/diff_diff/sun_abraham.py b/diff_diff/sun_abraham.py
@@ -21,6 +21,7 @@
 from diff_diff.utils import (
     compute_confidence_interval,
     compute_p_value,
+    within_transform as _within_transform_util,
 )
 
 
@@ -789,28 +790,7 @@ def _within_transform(
 
         y_it - y_i. - y_.t + y_..
         """
-        df = df.copy()
-
-        # Build all demeaned columns at once to avoid fragmentation
-        demeaned_data = {}
-        for var in variables:
-            # Unit means
-            unit_means = df.groupby(unit)[var].transform("mean")
-            # Time means
-            time_means = df.groupby(time)[var].transform("mean")
-            # Grand mean
-            grand_mean = df[var].mean()
-
-            # Within transformation
-            demeaned_data[f"{var}_dm"] = (
-                df[var] - unit_means - time_means + grand_mean
-            ).values
-
-        # Add all demeaned columns at once
-        demeaned_df = pd.DataFrame(demeaned_data, index=df.index)
-        df = pd.concat([df, demeaned_df], axis=1)
-
-        return df
+        return _within_transform_util(df, variables, unit, time, suffix="_dm")
 
     def _compute_iw_effects(
         self,
diff --git a/diff_diff/twfe.py b/diff_diff/twfe.py
@@ -17,6 +17,7 @@
 from diff_diff.utils import (
     compute_confidence_interval,
     compute_p_value,
+    within_transform as _within_transform_util,
 )
 
 
@@ -211,25 +212,8 @@ def _within_transform(
         pd.DataFrame
             Data with demeaned variables.
         """
-        data = data.copy()
         variables = [outcome] + (covariates or [])
-
-        # Cache groupby objects for efficiency (avoids re-computing group indexes)
-        unit_grouper = data.groupby(unit, sort=False)
-        time_grouper = data.groupby(time, sort=False)
-
-        for var in variables:
-            # Unit means (using cached grouper)
-            unit_means = unit_grouper[var].transform("mean")
-            # Time means (using cached grouper)
-            time_means = time_grouper[var].transform("mean")
-            # Grand mean
-            grand_mean = data[var].mean()
-
-            # Within transformation
-            data[f"{var}_demeaned"] = data[var] - unit_means - time_means + grand_mean
-
-        return data
+        return _within_transform_util(data, variables, unit, time, suffix="_demeaned")
 
     def _check_staggered_treatment(
         self,
diff --git a/diff_diff/utils.py b/diff_diff/utils.py
@@ -1342,3 +1342,140 @@ def compute_placebo_effects(
         placebo_effects.append(placebo_tau)
 
     return np.asarray(placebo_effects)
+
+
+def demean_by_group(
+    data: pd.DataFrame,
+    variables: List[str],
+    group_var: str,
+    inplace: bool = False,
+    suffix: str = "",
+) -> Tuple[pd.DataFrame, int]:
+    """
+    Demean variables by a grouping variable (one-way within transformation).
+
+    For each variable, computes: x_ig - mean(x_g) where g is the group.
+
+    Parameters
+    ----------
+    data : pd.DataFrame
+        DataFrame containing the variables to demean.
+    variables : list of str
+        Column names to demean.
+    group_var : str
+        Column name for the grouping variable.
+    inplace : bool, default False
+        If True, modifies the original columns. If False, leaves original
+        columns unchanged (demeaning is still applied to return value).
+    suffix : str, default ""
+        Suffix to add to demeaned column names (only used when inplace=False
+        and you want to keep both original and demeaned columns).
+
+    Returns
+    -------
+    data : pd.DataFrame
+        DataFrame with demeaned variables.
+    n_effects : int
+        Number of absorbed fixed effects (nunique - 1).
+
+    Examples
+    --------
+    >>> df, n_fe = demean_by_group(df, ['y', 'x1', 'x2'], 'unit')
+    >>> # df['y'], df['x1'], df['x2'] are now demeaned by unit
+    """
+    if not inplace:
+        data = data.copy()
+
+    # Count fixed effects (categories - 1 for identification)
+    n_effects = data[group_var].nunique() - 1
+
+    # Cache the groupby object for efficiency
+    grouper = data.groupby(group_var, sort=False)
+
+    for var in variables:
+        col_name = var if not suffix else f"{var}{suffix}"
+        group_means = grouper[var].transform("mean")
+        data[col_name] = data[var] - group_means
+
+    return data, n_effects
+
+
+def within_transform(
+    data: pd.DataFrame,
+    variables: List[str],
+    unit: str,
+    time: str,
+    inplace: bool = False,
+    suffix: str = "_demeaned",
+) -> pd.DataFrame:
+    """
+    Apply two-way within transformation to remove unit and time fixed effects.
+
+    Computes: y_it - y_i. - y_.t + y_.. for each variable.
+
+    This is the standard fixed effects transformation for panel data that
+    removes both unit-specific and time-specific effects.
+
+    Parameters
+    ----------
+    data : pd.DataFrame
+        Panel data containing the variables to transform.
+    variables : list of str
+        Column names to transform.
+    unit : str
+        Column name for unit identifier.
+    time : str
+        Column name for time period identifier.
+    inplace : bool, default False
+        If True, modifies the original columns. If False, creates new columns
+        with the specified suffix.
+    suffix : str, default "_demeaned"
+        Suffix for new column names when inplace=False.
+
+    Returns
+    -------
+    pd.DataFrame
+        DataFrame with within-transformed variables.
+
+    Notes
+    -----
+    The within transformation removes variation that is constant within units
+    (unit fixed effects) and constant within time periods (time fixed effects).
+    The resulting estimates are equivalent to including unit and time dummies
+    but is computationally more efficient for large panels.
+
+    Examples
+    --------
+    >>> df = within_transform(df, ['y', 'x'], 'unit_id', 'year')
+    >>> # df now has 'y_demeaned' and 'x_demeaned' columns
+    """
+    if not inplace:
+        data = data.copy()
+
+    # Cache groupby objects for efficiency
+    unit_grouper = data.groupby(unit, sort=False)
+    time_grouper = data.groupby(time, sort=False)
+
+    if inplace:
+        # Modify columns in place
+        for var in variables:
+            unit_means = unit_grouper[var].transform("mean")
+            time_means = time_grouper[var].transform("mean")
+            grand_mean = data[var].mean()
+            data[var] = data[var] - unit_means - time_means + grand_mean
+    else:
+        # Build all demeaned columns at once to avoid DataFrame fragmentation
+        demeaned_data = {}
+        for var in variables:
+            unit_means = unit_grouper[var].transform("mean")
+            time_means = time_grouper[var].transform("mean")
+            grand_mean = data[var].mean()
+            demeaned_data[f"{var}{suffix}"] = (
+                data[var] - unit_means - time_means + grand_mean
+            ).values
+
+        # Add all columns at once
+        demeaned_df = pd.DataFrame(demeaned_data, index=data.index)
+        data = pd.concat([data, demeaned_df], axis=1)
+
+    return data
diff --git a/pyproject.toml b/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "maturin"
 
 [project]
 name = "diff-diff"
-version = "2.0.0"
+version = "2.0.1"
 description = "A library for Difference-in-Differences causal inference analysis"
 readme = "README.md"
 license = "MIT"

Original file line number	Diff line number	Diff line change
`@@ -113,7 +113,7 @@`
`113`	`113`	`plot_sensitivity,`
`114`	`114`	`)`
`115`	`115`
`116`		`-__version__ = "2.0.0"`
	`116`	`+__version__ = "2.0.1"`
`117`	`117`	`__all__ = [`
`118`	`118`	`# Estimators`
`119`	`119`	`"DifferenceInDifferences",`