Fix MultiPeriodDiD rank-deficient vcov/df computation (P1)

igerber · claude · igerber · commit fdd580b6ef24 · 2026-01-20T12:32:56.000-05:00
- Route MultiPeriodDiD through solve_ols with return_vcov=True and cluster_ids
- Calculate degrees of freedom using effective rank (non-NaN coefficients)
- Handle homoskedastic vcov case for rank-deficient matrices
- Add test_rank_deficient_design_warns_and_sets_nan test

This ensures MultiPeriodDiD properly handles rank-deficient design matrices
by warning users, setting NaN for dropped coefficients, and computing valid
SEs for identified coefficients only.

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
diff --git a/diff_diff/estimators.py b/diff_diff/estimators.py
@@ -873,29 +873,42 @@ def fit(  # type: ignore[override]
                     var_names.append(col)
 
         # Fit OLS using unified backend
-        coefficients, residuals, fitted, _ = solve_ols(
-            X, y, return_fitted=True, return_vcov=False
+        # Pass cluster_ids to solve_ols for proper vcov computation
+        # This handles rank-deficient matrices by returning NaN for dropped columns
+        cluster_ids = data[self.cluster].values if self.cluster is not None else None
+
+        # Note: Wild bootstrap for multi-period effects is complex (multiple coefficients)
+        # For now, we use analytical inference even if inference="wild_bootstrap"
+        coefficients, residuals, fitted, vcov = solve_ols(
+            X, y,
+            return_fitted=True,
+            return_vcov=True,
+            cluster_ids=cluster_ids,
+            column_names=var_names,
         )
         r_squared = compute_r_squared(y, residuals)
 
-        # Degrees of freedom
-        df = len(y) - X.shape[1] - n_absorbed_effects
+        # Degrees of freedom using effective rank (non-NaN coefficients)
+        k_effective = int(np.sum(~np.isnan(coefficients)))
+        df = len(y) - k_effective - n_absorbed_effects
 
-        # Compute standard errors
-        # Note: Wild bootstrap for multi-period effects is complex (multiple coefficients)
-        # For now, we use analytical inference even if inference="wild_bootstrap"
-        if self.cluster is not None:
-            cluster_ids = data[self.cluster].values
-            vcov = compute_robust_vcov(X, residuals, cluster_ids)
-        elif self.robust:
-            vcov = compute_robust_vcov(X, residuals)
-        else:
+        # For non-robust, non-clustered case, we need homoskedastic vcov
+        # solve_ols returns HC1 by default, so compute homoskedastic if needed
+        if not self.robust and self.cluster is None:
             n = len(y)
-            k = X.shape[1]
-            mse = np.sum(residuals**2) / (n - k)
+            mse = np.sum(residuals**2) / (n - k_effective)
             # Use solve() instead of inv() for numerical stability
-            # solve(A, B) computes X where AX=B, so this yields (X'X)^{-1} * mse
-            vcov = np.linalg.solve(X.T @ X, mse * np.eye(k))
+            # Only compute for identified columns (non-NaN coefficients)
+            identified_mask = ~np.isnan(coefficients)
+            if np.all(identified_mask):
+                vcov = np.linalg.solve(X.T @ X, mse * np.eye(X.shape[1]))
+            else:
+                # For rank-deficient case, compute vcov on reduced matrix then expand
+                X_reduced = X[:, identified_mask]
+                vcov_reduced = np.linalg.solve(X_reduced.T @ X_reduced, mse * np.eye(X_reduced.shape[1]))
+                # Expand to full size with NaN for dropped columns
+                vcov = np.full((X.shape[1], X.shape[1]), np.nan)
+                vcov[np.ix_(identified_mask, identified_mask)] = vcov_reduced
 
         # Extract period-specific treatment effects
         period_effects = {}
diff --git a/tests/test_estimators.py b/tests/test_estimators.py
@@ -1624,6 +1624,49 @@ def test_coefficients_dict(self, multi_period_data):
         # Treatment interactions
         assert any("treated:period_" in k for k in results.coefficients)
 
+    def test_rank_deficient_design_warns_and_sets_nan(self, multi_period_data):
+        """Test that rank-deficient design matrix warns and sets NaN for dropped columns."""
+        import warnings
+
+        # Add a covariate that is perfectly collinear with an existing column
+        # Use exact duplicate to ensure perfect collinearity is detected
+        multi_period_data = multi_period_data.copy()
+        multi_period_data["collinear_cov"] = multi_period_data["treated"].copy()
+
+        did = MultiPeriodDiD()
+
+        with warnings.catch_warnings(record=True) as w:
+            warnings.simplefilter("always")
+            results = did.fit(
+                multi_period_data,
+                outcome="outcome",
+                treatment="treated",
+                time="period",
+                post_periods=[3, 4, 5],
+                covariates=["collinear_cov"]
+            )
+
+        # Should have warning about rank deficiency
+        rank_warnings = [x for x in w if "Rank-deficient" in str(x.message)
+                        or "collinear" in str(x.message).lower()]
+        assert len(rank_warnings) > 0, "Expected warning about rank deficiency"
+
+        # The collinear covariate should have NaN coefficient
+        assert "collinear_cov" in results.coefficients
+        assert np.isnan(results.coefficients["collinear_cov"]), \
+            "Collinear covariate coefficient should be NaN"
+
+        # Treatment effects should still be identified (not NaN)
+        for period in [3, 4, 5]:
+            pe = results.period_effects[period]
+            assert not np.isnan(pe.effect), f"Period {period} effect should be identified"
+            assert not np.isnan(pe.se), f"Period {period} SE should be valid"
+            assert pe.se > 0, f"Period {period} SE should be positive"
+
+        # Vcov should have NaN for the collinear column
+        assert results.vcov is not None
+        assert np.any(np.isnan(results.vcov)), "Vcov should have NaN for dropped column"
+
 
 class TestSyntheticDiD:
     """Tests for SyntheticDiD estimator."""