Skip to content

Commit afcc21f

Browse files
igerber and claude committed
Address fourth round of code review feedback
P1 Fixes: - Fix degrees of freedom calculation to use effective rank (n_params_effective_) instead of total columns when design matrix is rank-deficient. This ensures correct t-statistics, p-values, and confidence intervals for identified coefficients. - Improve Rust backend error message for rank-deficient X'X to suggest using solve_ols without skip_rank_check for R-style handling. - Improve TWFE collinearity error message to surface actual dropped column names and distinguish between treatment collinearity (error) vs covariate collinearity (warning). P2 Fixes: - Add inference validation tests that verify degrees of freedom, p-values, and confidence intervals are computed correctly when columns are dropped due to rank deficiency. Tests: - test_rank_deficient_degrees_of_freedom: Verifies n_params_effective_ and df_ - test_rank_deficient_inference_uses_correct_df: Verifies p-value and CI use correct df (n - rank) - test_rank_deficient_inference_nan_for_dropped_coef: Verifies NaN inference for dropped coefficients Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
1 parent c94e8b2 commit afcc21f

4 files changed

Lines changed: 150 additions & 14 deletions

File tree

diff_diff/linalg.py

Lines changed: 11 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -958,8 +958,11 @@ class LinearRegression:
958958
Number of observations (available after fit).
959959
n_params_ : int
960960
Number of parameters including intercept (available after fit).
961+
n_params_effective_ : int
962+
Effective number of parameters after dropping linearly dependent columns.
963+
Equals n_params_ for full-rank matrices (available after fit).
961964
df_ : int
962-
Degrees of freedom (n - k) (available after fit).
965+
Degrees of freedom (n - n_params_effective) (available after fit).
963966
964967
Examples
965968
--------
@@ -1009,6 +1012,7 @@ def __init__(
10091012
self._X: Optional[np.ndarray] = None
10101013
self.n_obs_: Optional[int] = None
10111014
self.n_params_: Optional[int] = None
1015+
self.n_params_effective_: Optional[int] = None
10121016
self.df_: Optional[int] = None
10131017

10141018
def fit(
@@ -1107,7 +1111,12 @@ def fit(
11071111
self._X = X
11081112
self.n_obs_ = X.shape[0]
11091113
self.n_params_ = X.shape[1]
1110-
self.df_ = self.n_obs_ - self.n_params_ - df_adjustment
1114+
1115+
# Compute effective number of parameters (excluding dropped columns)
1116+
# This is needed for correct degrees of freedom in inference
1117+
nan_mask = np.isnan(coefficients)
1118+
self.n_params_effective_ = int(self.n_params_ - np.sum(nan_mask))
1119+
self.df_ = self.n_obs_ - self.n_params_effective_ - df_adjustment
11111120

11121121
return self
11131122

diff_diff/twfe.py

Lines changed: 32 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -140,17 +140,38 @@ def fit( # type: ignore[override]
140140
r_squared = reg.r_squared()
141141
att = coefficients[att_idx]
142142

143-
# Check if treatment coefficient is identifiable
144-
# If NaN, treatment is perfectly collinear with fixed effects
145-
if np.isnan(att):
146-
raise ValueError(
147-
"Treatment is perfectly collinear with unit/time fixed effects. "
148-
"This means the treatment effect cannot be identified from the data. "
149-
"This can happen when: (1) all treated units are treated in all periods, "
150-
"(2) treatment timing is constant within units, or (3) the panel structure "
151-
"doesn't allow separating treatment from fixed effects. "
152-
"Check your data structure and ensure treatment varies within units over time."
153-
)
143+
# Check for unidentified coefficients (collinearity)
144+
# Build column names for informative error messages
145+
column_names = ["intercept", "treatment×post"]
146+
if covariates:
147+
column_names.extend(covariates)
148+
149+
nan_mask = np.isnan(coefficients)
150+
if np.any(nan_mask):
151+
dropped_indices = np.where(nan_mask)[0]
152+
dropped_names = [column_names[i] if i < len(column_names)
153+
else f"column {i}" for i in dropped_indices]
154+
155+
# Determine the source of collinearity for better error message
156+
if att_idx in dropped_indices:
157+
# Treatment coefficient is unidentified
158+
raise ValueError(
159+
f"Treatment effect cannot be identified due to collinearity. "
160+
f"Dropped columns: {', '.join(dropped_names)}. "
161+
"This can happen when: (1) treatment is perfectly collinear with "
162+
"unit/time fixed effects, (2) all treated units are treated in all "
163+
"periods, or (3) a covariate is collinear with the treatment indicator. "
164+
"Check your data structure and model specification."
165+
)
166+
else:
167+
# Only covariates are dropped - this is a warning, not an error
168+
# The ATT can still be estimated
169+
warnings.warn(
170+
f"Some covariates are collinear and were dropped: "
171+
f"{', '.join(dropped_names)}. The treatment effect is still identified.",
172+
UserWarning,
173+
stacklevel=2,
174+
)
154175

155176
# Get inference - either from bootstrap or analytical
156177
if self.inference == "wild_bootstrap":

rust/src/linalg.rs

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -265,7 +265,9 @@ fn invert_symmetric(a: &Array2<f64>) -> PyResult<Array2<f64>> {
265265

266266
let col = a.solve(&e_i).map_err(|e| {
267267
PyErr::new::<pyo3::exceptions::PyValueError, _>(format!(
268-
"Matrix inversion failed: {}",
268+
"Matrix inversion failed (likely rank-deficient X'X): {}. \
269+
If the design matrix is rank-deficient, use solve_ols without \
270+
skip_rank_check=True to enable R-style handling.",
269271
e
270272
))
271273
})?;

tests/test_linalg.py

Lines changed: 104 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -983,6 +983,110 @@ def test_matches_solve_ols(self, simple_data):
983983
np.testing.assert_allclose(reg.fitted_values_, fitted, rtol=1e-10)
984984
np.testing.assert_allclose(reg.vcov_, vcov, rtol=1e-10)
985985

986+
def test_rank_deficient_degrees_of_freedom(self):
987+
"""Test that degrees of freedom are computed correctly when columns are dropped.
988+
989+
When a design matrix is rank-deficient, the effective number of parameters
990+
is the rank, not the number of columns. The df should be n - rank.
991+
"""
992+
import warnings
993+
994+
np.random.seed(42)
995+
n = 100
996+
# Create rank-deficient matrix: 4 columns but rank 3
997+
X = np.random.randn(n, 3)
998+
X = np.column_stack([X, X[:, 0] + X[:, 1]]) # Column 3 = Column 0 + Column 1
999+
1000+
y = np.random.randn(n)
1001+
1002+
with warnings.catch_warnings(record=True):
1003+
warnings.simplefilter("always")
1004+
reg = LinearRegression(include_intercept=False).fit(X, y)
1005+
1006+
# n_params_ should be total columns (4)
1007+
assert reg.n_params_ == 4
1008+
1009+
# n_params_effective_ should be the rank (3)
1010+
assert reg.n_params_effective_ == 3
1011+
1012+
# df_ should be n - effective_params = 100 - 3 = 97
1013+
assert reg.df_ == n - 3
1014+
1015+
# Verify one coefficient is NaN (the dropped one)
1016+
assert np.sum(np.isnan(reg.coefficients_)) == 1
1017+
1018+
def test_rank_deficient_inference_uses_correct_df(self):
1019+
"""Test that p-values and CIs use the correct df for rank-deficient matrices."""
1020+
import warnings
1021+
from scipy import stats
1022+
1023+
np.random.seed(42)
1024+
n = 100
1025+
# Create rank-deficient matrix
1026+
X = np.random.randn(n, 3)
1027+
X = np.column_stack([X, X[:, 0] + X[:, 1]]) # Perfect collinearity
1028+
1029+
# True coefficients for the first 3 columns only
1030+
y = 2 * X[:, 0] + 3 * X[:, 1] + np.random.randn(n) * 0.5
1031+
1032+
with warnings.catch_warnings(record=True):
1033+
warnings.simplefilter("always")
1034+
reg = LinearRegression(include_intercept=False).fit(X, y)
1035+
1036+
# Get inference for an identified coefficient
1037+
nan_mask = np.isnan(reg.coefficients_)
1038+
kept_idx = np.where(~nan_mask)[0][0] # First non-NaN coefficient
1039+
result = reg.get_inference(kept_idx)
1040+
1041+
# Check that df is correct (should be n - rank = 97)
1042+
assert result.df == n - 3, f"Expected df={n-3}, got {result.df}"
1043+
1044+
# Manually compute expected values using correct df
1045+
coef = result.coefficient
1046+
se = result.se
1047+
t_stat_expected = coef / se
1048+
p_value_expected = 2 * (1 - stats.t.cdf(abs(t_stat_expected), df=n - 3))
1049+
1050+
# Verify t-stat
1051+
np.testing.assert_allclose(result.t_stat, t_stat_expected, rtol=1e-10)
1052+
1053+
# Verify p-value uses correct df (use atol for very small p-values)
1054+
np.testing.assert_allclose(result.p_value, p_value_expected, atol=1e-10)
1055+
1056+
# Verify CI uses correct df
1057+
t_crit = stats.t.ppf(1 - 0.05 / 2, df=n - 3)
1058+
ci_expected = (coef - t_crit * se, coef + t_crit * se)
1059+
np.testing.assert_allclose(result.conf_int, ci_expected, rtol=1e-6)
1060+
1061+
def test_rank_deficient_inference_nan_for_dropped_coef(self):
1062+
"""Test that inference for dropped coefficients returns NaN values."""
1063+
import warnings
1064+
1065+
np.random.seed(42)
1066+
n = 100
1067+
X = np.random.randn(n, 3)
1068+
X = np.column_stack([X, X[:, 0] + X[:, 1]]) # Column 3 is dropped
1069+
y = np.random.randn(n)
1070+
1071+
with warnings.catch_warnings(record=True):
1072+
warnings.simplefilter("always")
1073+
reg = LinearRegression(include_intercept=False).fit(X, y)
1074+
1075+
# Find the dropped coefficient index
1076+
nan_mask = np.isnan(reg.coefficients_)
1077+
dropped_idx = np.where(nan_mask)[0][0]
1078+
1079+
# Get inference for dropped coefficient
1080+
result = reg.get_inference(dropped_idx)
1081+
1082+
# All inference values should be NaN
1083+
assert np.isnan(result.coefficient)
1084+
assert np.isnan(result.se)
1085+
assert np.isnan(result.t_stat)
1086+
assert np.isnan(result.p_value)
1087+
assert np.isnan(result.conf_int[0])
1088+
assert np.isnan(result.conf_int[1])
1089+
9861090

9871091
class TestNumericalStability:
9881092
"""Tests for numerical stability with ill-conditioned matrices."""

0 commit comments

Comments
 (0)