Skip to content

Commit fdd580b

Browse files
igerberclaude
andcommitted
Fix MultiPeriodDiD rank-deficient vcov/df computation (P1)
- Route MultiPeriodDiD through solve_ols with return_vcov=True and cluster_ids - Calculate degrees of freedom using effective rank (non-NaN coefficients) - Handle homoskedastic vcov case for rank-deficient matrices - Add test_rank_deficient_design_warns_and_sets_nan test This ensures MultiPeriodDiD properly handles rank-deficient design matrices by warning users, setting NaN for dropped coefficients, and computing valid SEs for identified coefficients only. Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
1 parent 8c18cac commit fdd580b

2 files changed

Lines changed: 73 additions & 17 deletions

File tree

diff_diff/estimators.py

Lines changed: 30 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -873,29 +873,42 @@ def fit( # type: ignore[override]
873873
var_names.append(col)
874874

875875
# Fit OLS using unified backend
876-
coefficients, residuals, fitted, _ = solve_ols(
877-
X, y, return_fitted=True, return_vcov=False
876+
# Pass cluster_ids to solve_ols for proper vcov computation
877+
# This handles rank-deficient matrices by returning NaN for dropped columns
878+
cluster_ids = data[self.cluster].values if self.cluster is not None else None
879+
880+
# Note: Wild bootstrap for multi-period effects is complex (multiple coefficients)
881+
# For now, we use analytical inference even if inference="wild_bootstrap"
882+
coefficients, residuals, fitted, vcov = solve_ols(
883+
X, y,
884+
return_fitted=True,
885+
return_vcov=True,
886+
cluster_ids=cluster_ids,
887+
column_names=var_names,
878888
)
879889
r_squared = compute_r_squared(y, residuals)
880890

881-
# Degrees of freedom
882-
df = len(y) - X.shape[1] - n_absorbed_effects
891+
# Degrees of freedom using effective rank (non-NaN coefficients)
892+
k_effective = int(np.sum(~np.isnan(coefficients)))
893+
df = len(y) - k_effective - n_absorbed_effects
883894

884-
# Compute standard errors
885-
# Note: Wild bootstrap for multi-period effects is complex (multiple coefficients)
886-
# For now, we use analytical inference even if inference="wild_bootstrap"
887-
if self.cluster is not None:
888-
cluster_ids = data[self.cluster].values
889-
vcov = compute_robust_vcov(X, residuals, cluster_ids)
890-
elif self.robust:
891-
vcov = compute_robust_vcov(X, residuals)
892-
else:
895+
# For non-robust, non-clustered case, we need homoskedastic vcov
896+
# solve_ols returns HC1 by default, so compute homoskedastic if needed
897+
if not self.robust and self.cluster is None:
893898
n = len(y)
894-
k = X.shape[1]
895-
mse = np.sum(residuals**2) / (n - k)
899+
mse = np.sum(residuals**2) / (n - k_effective)
896900
# Use solve() instead of inv() for numerical stability
897-
# solve(A, B) computes X where AX=B, so this yields (X'X)^{-1} * mse
898-
vcov = np.linalg.solve(X.T @ X, mse * np.eye(k))
901+
# Only compute for identified columns (non-NaN coefficients)
902+
identified_mask = ~np.isnan(coefficients)
903+
if np.all(identified_mask):
904+
vcov = np.linalg.solve(X.T @ X, mse * np.eye(X.shape[1]))
905+
else:
906+
# For rank-deficient case, compute vcov on reduced matrix then expand
907+
X_reduced = X[:, identified_mask]
908+
vcov_reduced = np.linalg.solve(X_reduced.T @ X_reduced, mse * np.eye(X_reduced.shape[1]))
909+
# Expand to full size with NaN for dropped columns
910+
vcov = np.full((X.shape[1], X.shape[1]), np.nan)
911+
vcov[np.ix_(identified_mask, identified_mask)] = vcov_reduced
899912

900913
# Extract period-specific treatment effects
901914
period_effects = {}

tests/test_estimators.py

Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1624,6 +1624,49 @@ def test_coefficients_dict(self, multi_period_data):
16241624
# Treatment interactions
16251625
assert any("treated:period_" in k for k in results.coefficients)
16261626

1627+
def test_rank_deficient_design_warns_and_sets_nan(self, multi_period_data):
1628+
"""Test that rank-deficient design matrix warns and sets NaN for dropped columns."""
1629+
import warnings
1630+
1631+
# Add a covariate that is perfectly collinear with an existing column
1632+
# Use exact duplicate to ensure perfect collinearity is detected
1633+
multi_period_data = multi_period_data.copy()
1634+
multi_period_data["collinear_cov"] = multi_period_data["treated"].copy()
1635+
1636+
did = MultiPeriodDiD()
1637+
1638+
with warnings.catch_warnings(record=True) as w:
1639+
warnings.simplefilter("always")
1640+
results = did.fit(
1641+
multi_period_data,
1642+
outcome="outcome",
1643+
treatment="treated",
1644+
time="period",
1645+
post_periods=[3, 4, 5],
1646+
covariates=["collinear_cov"]
1647+
)
1648+
1649+
# Should have warning about rank deficiency
1650+
rank_warnings = [x for x in w if "Rank-deficient" in str(x.message)
1651+
or "collinear" in str(x.message).lower()]
1652+
assert len(rank_warnings) > 0, "Expected warning about rank deficiency"
1653+
1654+
# The collinear covariate should have NaN coefficient
1655+
assert "collinear_cov" in results.coefficients
1656+
assert np.isnan(results.coefficients["collinear_cov"]), \
1657+
"Collinear covariate coefficient should be NaN"
1658+
1659+
# Treatment effects should still be identified (not NaN)
1660+
for period in [3, 4, 5]:
1661+
pe = results.period_effects[period]
1662+
assert not np.isnan(pe.effect), f"Period {period} effect should be identified"
1663+
assert not np.isnan(pe.se), f"Period {period} SE should be valid"
1664+
assert pe.se > 0, f"Period {period} SE should be positive"
1665+
1666+
# Vcov should have NaN for the collinear column
1667+
assert results.vcov is not None
1668+
assert np.any(np.isnan(results.vcov)), "Vcov should have NaN for dropped column"
1669+
16271670

16281671
class TestSyntheticDiD:
16291672
"""Tests for SyntheticDiD estimator."""

0 commit comments

Comments
 (0)