Skip to content

Commit afcc21f

Browse files
igerber and claude committed
Address fourth round of code review feedback
P1 Fixes: - Fix degrees of freedom calculation to use effective rank (n_params_effective_) instead of total columns when design matrix is rank-deficient. This ensures correct t-statistics, p-values, and confidence intervals for identified coefficients. - Improve Rust backend error message for rank-deficient X'X to suggest using solve_ols without skip_rank_check for R-style handling. - Improve TWFE collinearity error message to surface actual dropped column names and distinguish between treatment collinearity (error) vs covariate collinearity (warning). P2 Fixes: - Add inference validation tests that verify degrees of freedom, p-values, and confidence intervals are computed correctly when columns are dropped due to rank deficiency. Tests: - test_rank_deficient_degrees_of_freedom: Verifies n_params_effective_ and df_ - test_rank_deficient_inference_uses_correct_df: Verifies p-value and CI use correct df (n - rank) - test_rank_deficient_inference_nan_for_dropped_coef: Verifies NaN inference for dropped coefficients Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
1 parent c94e8b2 commit afcc21f

4 files changed

Lines changed: 150 additions & 14 deletions

File tree

diff_diff/linalg.py

Lines changed: 11 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -958,8 +958,11 @@ class LinearRegression:
958958
Number of observations (available after fit).
959959
n_params_ : int
960960
Number of parameters including intercept (available after fit).
961+
n_params_effective_ : int
962+
Effective number of parameters after dropping linearly dependent columns.
963+
Equals n_params_ for full-rank matrices (available after fit).
961964
df_ : int
962-
Degrees of freedom (n - k) (available after fit).
965+
Degrees of freedom (n - n_params_effective) (available after fit).
963966
964967
Examples
965968
--------
@@ -1009,6 +1012,7 @@ def __init__(
10091012
self._X: Optional[np.ndarray] = None
10101013
self.n_obs_: Optional[int] = None
10111014
self.n_params_: Optional[int] = None
1015+
self.n_params_effective_: Optional[int] = None
10121016
self.df_: Optional[int] = None
10131017

10141018
def fit(
@@ -1107,7 +1111,12 @@ def fit(
11071111
self._X = X
11081112
self.n_obs_ = X.shape[0]
11091113
self.n_params_ = X.shape[1]
1110-
self.df_ = self.n_obs_ - self.n_params_ - df_adjustment
1114+
1115+
# Compute effective number of parameters (excluding dropped columns)
1116+
# This is needed for correct degrees of freedom in inference
1117+
nan_mask = np.isnan(coefficients)
1118+
self.n_params_effective_ = int(self.n_params_ - np.sum(nan_mask))
1119+
self.df_ = self.n_obs_ - self.n_params_effective_ - df_adjustment
11111120

11121121
return self
11131122

diff_diff/twfe.py

Lines changed: 32 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -140,17 +140,38 @@ def fit( # type: ignore[override]
140140
r_squared = reg.r_squared()
141141
att = coefficients[att_idx]
142142

143-
# Check if treatment coefficient is identifiable
144-
# If NaN, treatment is perfectly collinear with fixed effects
145-
if np.isnan(att):
146-
raise ValueError(
147-
"Treatment is perfectly collinear with unit/time fixed effects. "
148-
"This means the treatment effect cannot be identified from the data. "
149-
"This can happen when: (1) all treated units are treated in all periods, "
150-
"(2) treatment timing is constant within units, or (3) the panel structure "
151-
"doesn't allow separating treatment from fixed effects. "
152-
"Check your data structure and ensure treatment varies within units over time."
153-
)
143+
# Check for unidentified coefficients (collinearity)
144+
# Build column names for informative error messages
145+
column_names = ["intercept", "treatment×post"]
146+
if covariates:
147+
column_names.extend(covariates)
148+
149+
nan_mask = np.isnan(coefficients)
150+
if np.any(nan_mask):
151+
dropped_indices = np.where(nan_mask)[0]
152+
dropped_names = [column_names[i] if i < len(column_names)
153+
else f"column {i}" for i in dropped_indices]
154+
155+
# Determine the source of collinearity for better error message
156+
if att_idx in dropped_indices:
157+
# Treatment coefficient is unidentified
158+
raise ValueError(
159+
f"Treatment effect cannot be identified due to collinearity. "
160+
f"Dropped columns: {', '.join(dropped_names)}. "
161+
"This can happen when: (1) treatment is perfectly collinear with "
162+
"unit/time fixed effects, (2) all treated units are treated in all "
163+
"periods, or (3) a covariate is collinear with the treatment indicator. "
164+
"Check your data structure and model specification."
165+
)
166+
else:
167+
# Only covariates are dropped - this is a warning, not an error
168+
# The ATT can still be estimated
169+
warnings.warn(
170+
f"Some covariates are collinear and were dropped: "
171+
f"{', '.join(dropped_names)}. The treatment effect is still identified.",
172+
UserWarning,
173+
stacklevel=2,
174+
)
154175

155176
# Get inference - either from bootstrap or analytical
156177
if self.inference == "wild_bootstrap":

rust/src/linalg.rs

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -265,7 +265,9 @@ fn invert_symmetric(a: &Array2<f64>) -> PyResult<Array2<f64>> {
265265

266266
let col = a.solve(&e_i).map_err(|e| {
267267
PyErr::new::<pyo3::exceptions::PyValueError, _>(format!(
268-
"Matrix inversion failed: {}",
268+
"Matrix inversion failed (likely rank-deficient X'X): {}. \
269+
If the design matrix is rank-deficient, use solve_ols without \
270+
skip_rank_check=True to enable R-style handling.",
269271
e
270272
))
271273
})?;

tests/test_linalg.py

Lines changed: 104 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -983,6 +983,110 @@ def test_matches_solve_ols(self, simple_data):
983983
np.testing.assert_allclose(reg.fitted_values_, fitted, rtol=1e-10)
984984
np.testing.assert_allclose(reg.vcov_, vcov, rtol=1e-10)
985985

986+
def test_rank_deficient_degrees_of_freedom(self):
987+
"""Test that degrees of freedom are computed correctly when columns are dropped.
988+
989+
When a design matrix is rank-deficient, the effective number of parameters
990+
is the rank, not the number of columns. The df should be n - rank.
991+
"""
992+
import warnings
993+
994+
np.random.seed(42)
995+
n = 100
996+
# Create rank-deficient matrix: 4 columns but rank 3
997+
X = np.random.randn(n, 3)
998+
X = np.column_stack([X, X[:, 0] + X[:, 1]]) # Column 3 = Column 0 + Column 1
999+
1000+
y = np.random.randn(n)
1001+
1002+
with warnings.catch_warnings(record=True):
1003+
warnings.simplefilter("always")
1004+
reg = LinearRegression(include_intercept=False).fit(X, y)
1005+
1006+
# n_params_ should be total columns (4)
1007+
assert reg.n_params_ == 4
1008+
1009+
# n_params_effective_ should be the rank (3)
1010+
assert reg.n_params_effective_ == 3
1011+
1012+
# df_ should be n - effective_params = 100 - 3 = 97
1013+
assert reg.df_ == n - 3
1014+
1015+
# Verify one coefficient is NaN (the dropped one)
1016+
assert np.sum(np.isnan(reg.coefficients_)) == 1
1017+
1018+
def test_rank_deficient_inference_uses_correct_df(self):
1019+
"""Test that p-values and CIs use the correct df for rank-deficient matrices."""
1020+
import warnings
1021+
from scipy import stats
1022+
1023+
np.random.seed(42)
1024+
n = 100
1025+
# Create rank-deficient matrix
1026+
X = np.random.randn(n, 3)
1027+
X = np.column_stack([X, X[:, 0] + X[:, 1]]) # Perfect collinearity
1028+
1029+
# True coefficients for the first 3 columns only
1030+
y = 2 * X[:, 0] + 3 * X[:, 1] + np.random.randn(n) * 0.5
1031+
1032+
with warnings.catch_warnings(record=True):
1033+
warnings.simplefilter("always")
1034+
reg = LinearRegression(include_intercept=False).fit(X, y)
1035+
1036+
# Get inference for an identified coefficient
1037+
nan_mask = np.isnan(reg.coefficients_)
1038+
kept_idx = np.where(~nan_mask)[0][0] # First non-NaN coefficient
1039+
result = reg.get_inference(kept_idx)
1040+
1041+
# Check that df is correct (should be n - rank = 97)
1042+
assert result.df == n - 3, f"Expected df={n-3}, got {result.df}"
1043+
1044+
# Manually compute expected values using correct df
1045+
coef = result.coefficient
1046+
se = result.se
1047+
t_stat_expected = coef / se
1048+
p_value_expected = 2 * (1 - stats.t.cdf(abs(t_stat_expected), df=n - 3))
1049+
1050+
# Verify t-stat
1051+
np.testing.assert_allclose(result.t_stat, t_stat_expected, rtol=1e-10)
1052+
1053+
# Verify p-value uses correct df (use atol for very small p-values)
1054+
np.testing.assert_allclose(result.p_value, p_value_expected, atol=1e-10)
1055+
1056+
# Verify CI uses correct df
1057+
t_crit = stats.t.ppf(1 - 0.05 / 2, df=n - 3)
1058+
ci_expected = (coef - t_crit * se, coef + t_crit * se)
1059+
np.testing.assert_allclose(result.conf_int, ci_expected, rtol=1e-6)
1060+
1061+
def test_rank_deficient_inference_nan_for_dropped_coef(self):
1062+
"""Test that inference for dropped coefficients returns NaN values."""
1063+
import warnings
1064+
1065+
np.random.seed(42)
1066+
n = 100
1067+
X = np.random.randn(n, 3)
1068+
X = np.column_stack([X, X[:, 0] + X[:, 1]]) # Column 3 is dropped
1069+
y = np.random.randn(n)
1070+
1071+
with warnings.catch_warnings(record=True):
1072+
warnings.simplefilter("always")
1073+
reg = LinearRegression(include_intercept=False).fit(X, y)
1074+
1075+
# Find the dropped coefficient index
1076+
nan_mask = np.isnan(reg.coefficients_)
1077+
dropped_idx = np.where(nan_mask)[0][0]
1078+
1079+
# Get inference for dropped coefficient
1080+
result = reg.get_inference(dropped_idx)
1081+
1082+
# All inference values should be NaN
1083+
assert np.isnan(result.coefficient)
1084+
assert np.isnan(result.se)
1085+
assert np.isnan(result.t_stat)
1086+
assert np.isnan(result.p_value)
1087+
assert np.isnan(result.conf_int[0])
1088+
assert np.isnan(result.conf_int[1])
1089+
9861090

9871091
class TestNumericalStability:
9881092
"""Tests for numerical stability with ill-conditioned matrices."""

0 commit comments

Comments
 (0)