Skip to content

Commit d851fc3

Browse files
authored
Merge pull request #61 from igerber/claude/todo-items-2.0.1-h6tgS
Address TODO items for version 2.0.1
2 parents 2083f13 + 9e576c0 commit d851fc3

8 files changed

Lines changed: 165 additions & 111 deletions

File tree

TODO.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,7 @@ Consolidation opportunities for cleaner maintenance:
2525

2626
| Duplicate Code | Locations | Notes |
2727
|---------------|-----------|-------|
28-
| Within-transformation logic | `estimators.py:217-232`, `estimators.py:787-833`, `bacon.py:567-642` | Extract to utils.py |
28+
| ~~Within-transformation logic~~ | ~~Multiple files~~ | ✅ Extracted to `utils.py` as `demean_by_group()` and `within_transform()` (v2.0.1) |
2929
| Linear regression helper | `staggered.py:205-240`, `estimators.py:366-408` | Consider consolidation |
3030

3131
### Large Module Files

diff_diff/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -113,7 +113,7 @@
113113
plot_sensitivity,
114114
)
115115

116-
__version__ = "2.0.0"
116+
__version__ = "2.0.1"
117117
__all__ = [
118118
# Estimators
119119
"DifferenceInDifferences",

diff_diff/bacon.py

Lines changed: 12 additions & 60 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,8 @@
1717
import numpy as np
1818
import pandas as pd
1919

20+
from diff_diff.utils import within_transform as _within_transform_util
21+
2022

2123
@dataclass
2224
class Comparison2x2:
@@ -573,66 +575,16 @@ def _compute_twfe(
573575
treat_col: str = '__bacon_treated_internal__',
574576
) -> float:
575577
"""Compute TWFE estimate using within-transformation."""
576-
# Demean by unit and time
577-
y = df[outcome].values
578-
d = df[treat_col].astype(float).values
579-
580-
# Create unit and time dummies for demeaning
581-
units = df[unit].values
582-
times = df[time].values
583-
584-
# Unit means
585-
unit_map = {u: i for i, u in enumerate(df[unit].unique())}
586-
unit_idx = np.array([unit_map[u] for u in units])
587-
n_units = len(unit_map)
588-
589-
# Time means
590-
time_map = {t: i for i, t in enumerate(df[time].unique())}
591-
time_idx = np.array([time_map[t] for t in times])
592-
n_times = len(time_map)
593-
594-
# Compute means
595-
y_unit_mean = np.zeros(n_units)
596-
d_unit_mean = np.zeros(n_units)
597-
unit_counts = np.zeros(n_units)
598-
599-
for i in range(len(y)):
600-
u = unit_idx[i]
601-
y_unit_mean[u] += y[i]
602-
d_unit_mean[u] += d[i]
603-
unit_counts[u] += 1
604-
605-
y_unit_mean /= np.maximum(unit_counts, 1)
606-
d_unit_mean /= np.maximum(unit_counts, 1)
607-
608-
y_time_mean = np.zeros(n_times)
609-
d_time_mean = np.zeros(n_times)
610-
time_counts = np.zeros(n_times)
611-
612-
for i in range(len(y)):
613-
t = time_idx[i]
614-
y_time_mean[t] += y[i]
615-
d_time_mean[t] += d[i]
616-
time_counts[t] += 1
617-
618-
y_time_mean /= np.maximum(time_counts, 1)
619-
d_time_mean /= np.maximum(time_counts, 1)
620-
621-
# Overall mean
622-
y_mean = np.mean(y)
623-
d_mean = np.mean(d)
624-
625-
# Within transformation: y_it - y_i - y_t + y
626-
y_within = np.zeros(len(y))
627-
d_within = np.zeros(len(d))
628-
629-
for i in range(len(y)):
630-
u = unit_idx[i]
631-
t = time_idx[i]
632-
y_within[i] = y[i] - y_unit_mean[u] - y_time_mean[t] + y_mean
633-
d_within[i] = d[i] - d_unit_mean[u] - d_time_mean[t] + d_mean
634-
635-
# OLS on demeaned data
578+
# Apply two-way within transformation
579+
df_dm = _within_transform_util(
580+
df, [outcome, treat_col], unit, time, suffix="_within"
581+
)
582+
583+
# Extract within-transformed values
584+
y_within = df_dm[f"{outcome}_within"].values
585+
d_within = df_dm[f"{treat_col}_within"].values
586+
587+
# OLS on demeaned data: beta = sum(d * y) / sum(d^2)
636588
d_var = np.sum(d_within ** 2)
637589
if d_var > 0:
638590
beta = np.sum(d_within * y_within) / d_var

diff_diff/estimators.py

Lines changed: 9 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@
2323
WildBootstrapResults,
2424
compute_confidence_interval,
2525
compute_p_value,
26+
demean_by_group,
2627
validate_binary,
2728
wild_bootstrap_se,
2829
)
@@ -227,10 +228,10 @@ def fit(
227228
# unit-invariant, so demeaning them would create multicollinearity
228229
vars_to_demean = [outcome] + (covariates or [])
229230
for ab_var in absorb:
230-
n_absorbed_effects += working_data[ab_var].nunique() - 1
231-
for var in vars_to_demean:
232-
group_means = working_data.groupby(ab_var)[var].transform("mean")
233-
working_data[var] = working_data[var] - group_means
231+
working_data, n_fe = demean_by_group(
232+
working_data, vars_to_demean, ab_var, inplace=True
233+
)
234+
n_absorbed_effects += n_fe
234235
absorbed_vars.append(ab_var)
235236

236237
# Extract variables (may be demeaned if absorb was used)
@@ -828,10 +829,10 @@ def fit( # type: ignore[override]
828829
if absorb:
829830
vars_to_demean = [outcome] + (covariates or [])
830831
for ab_var in absorb:
831-
n_absorbed_effects += working_data[ab_var].nunique() - 1
832-
for var in vars_to_demean:
833-
group_means = working_data.groupby(ab_var)[var].transform("mean")
834-
working_data[var] = working_data[var] - group_means
832+
working_data, n_fe = demean_by_group(
833+
working_data, vars_to_demean, ab_var, inplace=True
834+
)
835+
n_absorbed_effects += n_fe
835836

836837
# Extract outcome and treatment
837838
y = working_data[outcome].values.astype(float)

diff_diff/sun_abraham.py

Lines changed: 2 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@
2121
from diff_diff.utils import (
2222
compute_confidence_interval,
2323
compute_p_value,
24+
within_transform as _within_transform_util,
2425
)
2526

2627

@@ -789,28 +790,7 @@ def _within_transform(
789790
790791
y_it - y_i. - y_.t + y_..
791792
"""
792-
df = df.copy()
793-
794-
# Build all demeaned columns at once to avoid fragmentation
795-
demeaned_data = {}
796-
for var in variables:
797-
# Unit means
798-
unit_means = df.groupby(unit)[var].transform("mean")
799-
# Time means
800-
time_means = df.groupby(time)[var].transform("mean")
801-
# Grand mean
802-
grand_mean = df[var].mean()
803-
804-
# Within transformation
805-
demeaned_data[f"{var}_dm"] = (
806-
df[var] - unit_means - time_means + grand_mean
807-
).values
808-
809-
# Add all demeaned columns at once
810-
demeaned_df = pd.DataFrame(demeaned_data, index=df.index)
811-
df = pd.concat([df, demeaned_df], axis=1)
812-
813-
return df
793+
return _within_transform_util(df, variables, unit, time, suffix="_dm")
814794

815795
def _compute_iw_effects(
816796
self,

diff_diff/twfe.py

Lines changed: 2 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@
1717
from diff_diff.utils import (
1818
compute_confidence_interval,
1919
compute_p_value,
20+
within_transform as _within_transform_util,
2021
)
2122

2223

@@ -211,25 +212,8 @@ def _within_transform(
211212
pd.DataFrame
212213
Data with demeaned variables.
213214
"""
214-
data = data.copy()
215215
variables = [outcome] + (covariates or [])
216-
217-
# Cache groupby objects for efficiency (avoids re-computing group indexes)
218-
unit_grouper = data.groupby(unit, sort=False)
219-
time_grouper = data.groupby(time, sort=False)
220-
221-
for var in variables:
222-
# Unit means (using cached grouper)
223-
unit_means = unit_grouper[var].transform("mean")
224-
# Time means (using cached grouper)
225-
time_means = time_grouper[var].transform("mean")
226-
# Grand mean
227-
grand_mean = data[var].mean()
228-
229-
# Within transformation
230-
data[f"{var}_demeaned"] = data[var] - unit_means - time_means + grand_mean
231-
232-
return data
216+
return _within_transform_util(data, variables, unit, time, suffix="_demeaned")
233217

234218
def _check_staggered_treatment(
235219
self,

diff_diff/utils.py

Lines changed: 137 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1342,3 +1342,140 @@ def compute_placebo_effects(
13421342
placebo_effects.append(placebo_tau)
13431343

13441344
return np.asarray(placebo_effects)
1345+
1346+
1347+
def demean_by_group(
    data: pd.DataFrame,
    variables: List[str],
    group_var: str,
    inplace: bool = False,
    suffix: str = "",
) -> Tuple[pd.DataFrame, int]:
    """
    Demean variables by a grouping variable (one-way within transformation).

    For each variable, computes: x_ig - mean(x_g) where g is the group.

    Parameters
    ----------
    data : pd.DataFrame
        DataFrame containing the variables to demean.
    variables : list of str
        Column names to demean.
    group_var : str
        Column name for the grouping variable.
    inplace : bool, default False
        If True, operate directly on ``data``: the caller's DataFrame is
        mutated and the same object is returned. If False, a copy is made
        first and the caller's DataFrame is left untouched.
    suffix : str, default ""
        Suffix appended to each variable name to form the output column.
        With the default empty suffix the demeaned values overwrite the
        source columns (of the copy when ``inplace=False``). A non-empty
        suffix writes to ``f"{var}{suffix}"`` and keeps the source columns;
        this applies regardless of ``inplace``.

    Returns
    -------
    data : pd.DataFrame
        DataFrame with demeaned variables.
    n_effects : int
        Number of absorbed fixed effects: number of distinct non-missing
        groups minus one, never less than zero.

    Examples
    --------
    >>> df, n_fe = demean_by_group(df, ['y', 'x1', 'x2'], 'unit')
    >>> # df['y'], df['x1'], df['x2'] are now demeaned by unit
    """
    if not inplace:
        data = data.copy()

    # Categories - 1 for identification. Clamp at zero so an empty frame (or
    # a group column with no non-missing values) never reports a negative
    # number of absorbed effects.
    n_effects = max(int(data[group_var].nunique()) - 1, 0)

    # Build the groupby object once and reuse it for every variable.
    grouper = data.groupby(group_var, sort=False)

    for var in variables:
        col_name = f"{var}{suffix}" if suffix else var
        data[col_name] = data[var] - grouper[var].transform("mean")

    return data, n_effects
1401+
1402+
1403+
def within_transform(
    data: pd.DataFrame,
    variables: List[str],
    unit: str,
    time: str,
    inplace: bool = False,
    suffix: str = "_demeaned",
) -> pd.DataFrame:
    """
    Apply two-way within transformation to remove unit and time fixed effects.

    Computes: y_it - y_i. - y_.t + y_.. for each variable.

    This is the standard fixed effects transformation for panel data that
    removes both unit-specific and time-specific effects.

    Parameters
    ----------
    data : pd.DataFrame
        Panel data containing the variables to transform.
    variables : list of str
        Column names to transform.
    unit : str
        Column name for unit identifier.
    time : str
        Column name for time period identifier.
    inplace : bool, default False
        If True, the transformed values overwrite the source columns of
        ``data`` itself (``suffix`` is ignored) and the same object is
        returned. If False, a copy is made and the transformed values are
        stored under ``f"{var}{suffix}"``.
    suffix : str, default "_demeaned"
        Suffix for output column names when inplace=False. If the resulting
        name already exists in ``data`` (for example with ``suffix=""`` or
        when the function is applied twice), that column is overwritten in
        the returned copy rather than duplicated.

    Returns
    -------
    pd.DataFrame
        DataFrame with within-transformed variables.

    Notes
    -----
    The within transformation removes variation that is constant within units
    (unit fixed effects) and constant within time periods (time fixed effects).
    The resulting estimates are equivalent to including unit and time dummies,
    but the transformation is computationally more efficient for large panels.

    NOTE(review): the means are subtracted in a single pass; that equivalence
    is the textbook result for balanced panels - confirm callers do not rely
    on it for unbalanced data.

    Examples
    --------
    >>> df = within_transform(df, ['y', 'x'], 'unit_id', 'year')
    >>> # df now has 'y_demeaned' and 'x_demeaned' columns
    """
    if not inplace:
        data = data.copy()

    # Build the groupby objects once and reuse them for every variable.
    unit_grouper = data.groupby(unit, sort=False)
    time_grouper = data.groupby(time, sort=False)

    # Compute every transformed column before touching ``data`` so that each
    # variable is demeaned from its original values.
    transformed = {}
    for var in variables:
        target = var if inplace else f"{var}{suffix}"
        column = data[var]
        transformed[target] = (
            column
            - unit_grouper[var].transform("mean")
            - time_grouper[var].transform("mean")
            + column.mean()
        ).values

    # Targets that already exist are overwritten; appending them with concat
    # would leave two columns with the same label.
    existing = {name for name in transformed if name in data.columns}
    for name in transformed:
        if name in existing:
            data[name] = transformed[name]

    # Genuinely new columns are added in one concat to avoid DataFrame
    # fragmentation from repeated single-column inserts.
    fresh = {
        name: values for name, values in transformed.items() if name not in existing
    }
    if fresh:
        data = pd.concat([data, pd.DataFrame(fresh, index=data.index)], axis=1)

    return data

pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@ build-backend = "maturin"
44

55
[project]
66
name = "diff-diff"
7-
version = "2.0.0"
7+
version = "2.0.1"
88
description = "A library for Difference-in-Differences causal inference analysis"
99
readme = "README.md"
1010
license = "MIT"

0 commit comments

Comments
 (0)