Skip to content

Commit 6446bac

Browse files
igerberclaude
andcommitted
Address CI review: fix never_treated, add identification checks, Poisson rank handling
Finding 1 (P1): Rewrite never_treated implementation. Instead of deleting pre-treatment observations (wrong), keep all data in-sample and control the comparison pool via the interaction matrix. For never_treated, _build_interaction_matrix creates ALL (g,t) pairs including pre-treatment cells as placebo indicators, so only never-treated units remain in the baseline. Pre-treatment coefficients serve as placebo/pre-trend tests and aggregate("event") now produces negative event times. Finding 2 (P1): Add identification checks in fit() — raise ValueError for no treated cohorts, no never-treated units with never_treated control, and empty interaction matrix. Finding 3 (P1): Add rank_deficient_action to solve_poisson matching solve_logit/solve_ols pattern. Uses QR-based rank detection, drops collinear columns, expands beta with NaN. Warns on singular Hessian instead of silent break. Finding 4 (P1): Reject bootstrap params for nonlinear methods — n_bootstrap > 0 with method != "ols" raises ValueError. Finding 5 (P2): Fix tutorial to accurately describe OLS FE structure (unit+time via within-transformation, not additive cohort+time). Also: skip NaN coefficients in OLS/logit/Poisson ATT extraction, fix vcov submatrix to only include identified cells. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent cfe0e5c commit 6446bac

4 files changed

Lines changed: 214 additions & 67 deletions

File tree

diff_diff/linalg.py

Lines changed: 86 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -1204,9 +1204,7 @@ def solve_logit(
12041204
if np.any(weights < 0):
12051205
raise ValueError("weights must be non-negative")
12061206
if np.sum(weights) <= 0:
1207-
raise ValueError(
1208-
"weights sum to zero — no observations have positive weight"
1209-
)
1207+
raise ValueError("weights sum to zero — no observations have positive weight")
12101208

12111209
# Validate rank_deficient_action
12121210
valid_actions = {"warn", "error", "silent"}
@@ -1882,7 +1880,9 @@ def fit(
18821880
kept_cols = np.where(~nan_mask)[0]
18831881
if len(kept_cols) > 0:
18841882
vcov_reduced, _n_valid_rep = compute_replicate_vcov(
1885-
X[:, kept_cols], y, coefficients[kept_cols],
1883+
X[:, kept_cols],
1884+
y,
1885+
coefficients[kept_cols],
18861886
_effective_survey_design,
18871887
weight_type=self.weight_type,
18881888
)
@@ -1892,7 +1892,10 @@ def fit(
18921892
_n_valid_rep = 0
18931893
else:
18941894
vcov, _n_valid_rep = compute_replicate_vcov(
1895-
X, y, coefficients, _effective_survey_design,
1895+
X,
1896+
y,
1897+
coefficients,
1898+
_effective_survey_design,
18961899
weight_type=self.weight_type,
18971900
)
18981901
# Store effective replicate df only when replicates were dropped
@@ -1948,7 +1951,7 @@ def fit(
19481951
if isinstance(_effective_survey_design, ResolvedSurveyDesign):
19491952
self.survey_df_ = _effective_survey_design.df_survey
19501953
# Override with effective replicate df if available
1951-
if hasattr(self, '_replicate_df') and self._replicate_df is not None:
1954+
if hasattr(self, "_replicate_df") and self._replicate_df is not None:
19521955
self.survey_df_ = self._replicate_df
19531956

19541957
return self
@@ -1964,10 +1967,9 @@ def compute_deff(self, coefficient_names=None):
19641967
DEFFDiagnostics
19651968
"""
19661969
self._check_fitted()
1967-
if not (hasattr(self, 'survey_design') and self.survey_design is not None):
1970+
if not (hasattr(self, "survey_design") and self.survey_design is not None):
19681971
raise ValueError(
1969-
"compute_deff() requires a survey design. "
1970-
"Fit with survey_design= first."
1972+
"compute_deff() requires a survey design. " "Fit with survey_design= first."
19711973
)
19721974
from diff_diff.survey import compute_deff_diagnostics
19731975

@@ -1980,17 +1982,23 @@ def compute_deff(self, coefficient_names=None):
19801982
k = len(self.coefficients_)
19811983
nan_arr = np.full(k, np.nan)
19821984
from diff_diff.survey import DEFFDiagnostics
1985+
19831986
return DEFFDiagnostics(
1984-
deff=nan_arr, effective_n=nan_arr.copy(),
1985-
srs_se=nan_arr.copy(), survey_se=nan_arr.copy(),
1987+
deff=nan_arr,
1988+
effective_n=nan_arr.copy(),
1989+
srs_se=nan_arr.copy(),
1990+
survey_se=nan_arr.copy(),
19861991
coefficient_names=coefficient_names,
19871992
)
19881993
# Compute on kept columns only
19891994
X_kept = self._X[:, kept]
19901995
vcov_kept = self.vcov_[np.ix_(kept, kept)]
19911996
deff_kept = compute_deff_diagnostics(
1992-
X_kept, self.residuals_, vcov_kept,
1993-
self.weights, weight_type=self.weight_type,
1997+
X_kept,
1998+
self.residuals_,
1999+
vcov_kept,
2000+
self.weights,
2001+
weight_type=self.weight_type,
19942002
)
19952003
# Expand back to full size with NaN for dropped
19962004
k = len(self.coefficients_)
@@ -2003,15 +2011,21 @@ def compute_deff(self, coefficient_names=None):
20032011
full_srs_se[kept] = deff_kept.srs_se
20042012
full_survey_se[kept] = deff_kept.survey_se
20052013
from diff_diff.survey import DEFFDiagnostics
2014+
20062015
return DEFFDiagnostics(
2007-
deff=full_deff, effective_n=full_eff_n,
2008-
srs_se=full_srs_se, survey_se=full_survey_se,
2016+
deff=full_deff,
2017+
effective_n=full_eff_n,
2018+
srs_se=full_srs_se,
2019+
survey_se=full_survey_se,
20092020
coefficient_names=coefficient_names,
20102021
)
20112022

20122023
return compute_deff_diagnostics(
2013-
self._X, self.residuals_, self.vcov_,
2014-
self.weights, weight_type=self.weight_type,
2024+
self._X,
2025+
self.residuals_,
2026+
self.vcov_,
2027+
self.weights,
2028+
weight_type=self.weight_type,
20152029
coefficient_names=coefficient_names,
20162030
)
20172031

@@ -2108,24 +2122,31 @@ def get_inference(
21082122
effective_df = df
21092123
elif self.survey_df_ is not None:
21102124
effective_df = self.survey_df_
2111-
elif (hasattr(self, 'survey_design') and self.survey_design is not None
2112-
and hasattr(self.survey_design, 'uses_replicate_variance')
2113-
and self.survey_design.uses_replicate_variance):
2125+
elif (
2126+
hasattr(self, "survey_design")
2127+
and self.survey_design is not None
2128+
and hasattr(self.survey_design, "uses_replicate_variance")
2129+
and self.survey_design.uses_replicate_variance
2130+
):
21142131
# Replicate design with undefined df (rank <= 1) — NaN inference
21152132
warnings.warn(
21162133
"Replicate design has undefined survey d.f. (rank <= 1). "
21172134
"Inference fields will be NaN.",
2118-
UserWarning, stacklevel=2,
2135+
UserWarning,
2136+
stacklevel=2,
21192137
)
21202138
effective_df = 0 # Forces NaN from t-distribution
21212139
else:
21222140
effective_df = self.df_
21232141

21242142
# Warn if df is non-positive and fall back to normal distribution
21252143
# (skip for replicate designs — df=0 is intentional for NaN inference)
2126-
_is_replicate = (hasattr(self, 'survey_design') and self.survey_design is not None
2127-
and hasattr(self.survey_design, 'uses_replicate_variance')
2128-
and self.survey_design.uses_replicate_variance)
2144+
_is_replicate = (
2145+
hasattr(self, "survey_design")
2146+
and self.survey_design is not None
2147+
and hasattr(self.survey_design, "uses_replicate_variance")
2148+
and self.survey_design.uses_replicate_variance
2149+
)
21292150
if effective_df is not None and effective_df <= 0 and not _is_replicate:
21302151
import warnings
21312152

@@ -2350,6 +2371,7 @@ def solve_poisson(
23502371
max_iter: int = 200,
23512372
tol: float = 1e-8,
23522373
init_beta: Optional[np.ndarray] = None,
2374+
rank_deficient_action: str = "warn",
23532375
) -> Tuple[np.ndarray, np.ndarray]:
23542376
"""Poisson IRLS (Newton-Raphson with log link).
23552377
@@ -2365,15 +2387,38 @@ def solve_poisson(
23652387
init_beta : optional starting coefficient vector; if None, zeros are used
23662388
with the first column treated as the intercept and initialized to
23672389
log(mean(y)) to improve convergence for large-scale outcomes.
2390+
rank_deficient_action : {"warn", "error", "silent"}
2391+
How to handle rank-deficient design matrices. Mirrors solve_ols/solve_logit.
23682392
23692393
Returns
23702394
-------
2371-
beta : (k,) coefficient vector
2395+
beta : (k,) coefficient vector (NaN for dropped columns if rank-deficient)
23722396
W : (n,) final fitted means mu_hat (weights for sandwich vcov)
23732397
"""
2398+
n, k_orig = X.shape
2399+
2400+
# Rank-deficiency detection (same pattern as solve_logit/solve_ols)
2401+
kept_cols = np.arange(k_orig)
2402+
rank, dropped_cols, _pivot = _detect_rank_deficiency(X)
2403+
if len(dropped_cols) > 0:
2404+
if rank_deficient_action == "error":
2405+
raise ValueError(
2406+
f"Rank-deficient design matrix: {len(dropped_cols)} collinear columns detected."
2407+
)
2408+
if rank_deficient_action == "warn":
2409+
warnings.warn(
2410+
f"Rank-deficient design matrix: dropping {len(dropped_cols)} of {k_orig} columns. "
2411+
f"Coefficients for these columns are set to NA.",
2412+
UserWarning,
2413+
stacklevel=2,
2414+
)
2415+
dropped_set = set(int(d) for d in dropped_cols)
2416+
kept_cols = np.array([i for i in range(k_orig) if i not in dropped_set])
2417+
X = X[:, kept_cols]
2418+
23742419
n, k = X.shape
23752420
if init_beta is not None:
2376-
beta = init_beta.copy()
2421+
beta = init_beta[kept_cols].copy() if len(dropped_cols) > 0 else init_beta.copy()
23772422
else:
23782423
beta = np.zeros(k)
23792424
# Initialise the intercept to log(mean(y)) so the first IRLS step
@@ -2385,11 +2430,17 @@ def solve_poisson(
23852430
for _ in range(max_iter):
23862431
eta = np.clip(X @ beta, -500, 500)
23872432
mu = np.exp(eta)
2388-
score = X.T @ (y - mu) # gradient of log-likelihood
2389-
hess = X.T @ (mu[:, None] * X) # -Hessian = X'WX, W=diag(mu)
2433+
score = X.T @ (y - mu) # gradient of log-likelihood
2434+
hess = X.T @ (mu[:, None] * X) # -Hessian = X'WX, W=diag(mu)
23902435
try:
23912436
delta = np.linalg.solve(hess + 1e-12 * np.eye(k), score)
23922437
except np.linalg.LinAlgError:
2438+
warnings.warn(
2439+
"solve_poisson: Hessian is singular at iteration. "
2440+
"Design matrix may be rank-deficient.",
2441+
RuntimeWarning,
2442+
stacklevel=2,
2443+
)
23932444
break
23942445
# Damped step: cap the maximum coefficient change to avoid overshooting
23952446
max_step = np.max(np.abs(delta))
@@ -2407,4 +2458,11 @@ def solve_poisson(
24072458
stacklevel=2,
24082459
)
24092460
mu_final = np.exp(np.clip(X @ beta, -500, 500))
2461+
2462+
# Expand back to full size if columns were dropped
2463+
if len(dropped_cols) > 0:
2464+
beta_full = np.full(k_orig, np.nan)
2465+
beta_full[kept_cols] = beta
2466+
beta = beta_full
2467+
24102468
return beta, mu_final

diff_diff/wooldridge.py

Lines changed: 69 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -77,26 +77,21 @@ def _filter_sample(
7777
) -> pd.DataFrame:
7878
"""Return the analysis sample following jwdid selection rules.
7979
80-
For "not_yet_treated": keep all observations from treated units (pre- and
81-
post-treatment) plus all never-treated and not-yet-treated observations.
82-
For "never_treated": keep only post-treatment observations from treated
83-
units (t >= g - anticipation) plus all never-treated observations.
84-
Pre-treatment observations from treated units are excluded so they do not
85-
serve as implicit controls in the regression baseline.
80+
All treated units keep ALL observations (pre- and post-treatment) for
81+
proper FE estimation. The control_group setting affects which additional
82+
control observations are included, AND the interaction matrix structure
83+
(see _build_interaction_matrix).
8684
"""
8785
df = data.copy()
8886
# Normalise never-treated: fill NaN cohort with 0
8987
df[cohort] = df[cohort].fillna(0)
9088

89+
treated_mask = df[cohort] > 0
90+
9191
if control_group == "never_treated":
92-
# Post-treatment obs from treated units + all never-treated obs.
93-
# Pre-treatment obs from treated units are excluded so the
94-
# counterfactual is identified solely from never-treated units.
95-
treated_mask = (df[cohort] > 0) & (df[time] >= df[cohort] - anticipation)
9692
control_mask = df[cohort] == 0
9793
else: # not_yet_treated
98-
# All treated-unit obs + never-treated + not-yet-treated obs
99-
treated_mask = df[cohort] > 0
94+
# Keep untreated-at-t observations for not-yet-treated units
10095
control_mask = (df[cohort] == 0) | (df[cohort] > df[time])
10196

10297
return df[treated_mask | control_mask].copy()
@@ -107,9 +102,20 @@ def _build_interaction_matrix(
107102
cohort: str,
108103
time: str,
109104
anticipation: int,
105+
control_group: str = "not_yet_treated",
110106
) -> Tuple[np.ndarray, List[str], List[Tuple[Any, Any]]]:
111107
"""Build the saturated cohort×time interaction design matrix.
112108
109+
For ``not_yet_treated``: only post-treatment cells (t >= g - anticipation).
110+
Pre-treatment obs from treated units sit in the regression baseline alongside
111+
not-yet-treated controls.
112+
113+
For ``never_treated``: ALL (g, t) pairs for each treated cohort. This
114+
"absorbs" pre-treatment obs from treated units into their own indicators so
115+
they do not serve as implicit controls in the baseline. Only never-treated
116+
observations remain in the omitted category. Pre-treatment coefficients
117+
(t < g) serve as placebo/pre-trend tests.
118+
113119
Returns
114120
-------
115121
X_int : (n, n_cells) binary indicator matrix
@@ -127,7 +133,7 @@ def _build_interaction_matrix(
127133

128134
for g in groups:
129135
for t in times:
130-
if t >= g - anticipation:
136+
if control_group == "never_treated" or t >= g - anticipation:
131137
indicator = ((cohort_vals == g) & (time_vals == t)).astype(float)
132138
cols.append(indicator)
133139
col_names.append(f"g{g}_t{t}")
@@ -315,16 +321,46 @@ def fit(
315321
df = data.copy()
316322
df[cohort] = df[cohort].fillna(0)
317323

324+
# 0. Reject bootstrap for nonlinear methods (not implemented)
325+
if self.n_bootstrap > 0 and self.method != "ols":
326+
raise ValueError(
327+
f"Bootstrap inference is only supported for method='ols'. "
328+
f"Got method={self.method!r} with n_bootstrap={self.n_bootstrap}. "
329+
f"Set n_bootstrap=0 for analytic SEs."
330+
)
331+
318332
# 1. Filter to analysis sample
319333
sample = _filter_sample(df, unit, time, cohort, self.control_group, self.anticipation)
320334

335+
# 1b. Identification checks
336+
groups = sorted(g for g in sample[cohort].unique() if g > 0)
337+
if len(groups) == 0:
338+
raise ValueError(
339+
"No treated cohorts found in data. Ensure the cohort column "
340+
"contains values > 0 for treated units."
341+
)
342+
if self.control_group == "never_treated" and not (sample[cohort] == 0).any():
343+
raise ValueError(
344+
"control_group='never_treated' but no never-treated units "
345+
"(cohort == 0) found. Use 'not_yet_treated' or add "
346+
"never-treated units."
347+
)
348+
321349
# 2. Build interaction matrix
322350
X_int, int_col_names, gt_keys = _build_interaction_matrix(
323-
sample, cohort=cohort, time=time, anticipation=self.anticipation
351+
sample,
352+
cohort=cohort,
353+
time=time,
354+
anticipation=self.anticipation,
355+
control_group=self.control_group,
324356
)
357+
if X_int.shape[1] == 0:
358+
raise ValueError(
359+
"No valid treatment cells found. Check that treated units "
360+
"have post-treatment observations in the data."
361+
)
325362

326363
# 3. Covariates
327-
groups = sorted(g for g in sample[cohort].unique() if g > 0)
328364
X_cov = _prepare_covariates(
329365
sample,
330366
exovar=exovar,
@@ -444,6 +480,9 @@ def _fit_ols(
444480
for idx, (g, t) in enumerate(gt_keys):
445481
if idx >= len(coefs):
446482
break
483+
# Skip cells whose coefficient was dropped (rank deficiency)
484+
if np.isnan(coefs[idx]):
485+
continue
447486
att = float(coefs[idx])
448487
se = float(np.sqrt(max(vcov[idx, idx], 0.0))) if vcov is not None else float("nan")
449488
t_stat, p_value, conf_int = safe_inference(att, se, alpha=self.alpha)
@@ -456,10 +495,14 @@ def _fit_ols(
456495
}
457496
gt_weights[(g, t)] = int(((sample[cohort] == g) & (sample[time] == t)).sum())
458497

459-
# Extract vcov submatrix for beta_{g,t} only
460-
n_gt = len(gt_keys)
461-
gt_vcov = vcov[:n_gt, :n_gt] if vcov is not None else None
462-
gt_keys_ordered = list(gt_keys)
498+
# Extract vcov submatrix for identified β_{g,t} only (skip NaN/dropped)
499+
gt_keys_ordered = list(gt_effects.keys())
500+
if vcov is not None and gt_keys_ordered:
501+
# Map from gt_keys_ordered to original indices in the coef vector
502+
orig_indices = [i for i, k in enumerate(gt_keys) if k in gt_effects]
503+
gt_vcov = vcov[np.ix_(orig_indices, orig_indices)]
504+
else:
505+
gt_vcov = None
463506

464507
# 8. Simple aggregation (always computed)
465508
overall = _compute_weighted_agg(
@@ -721,7 +764,13 @@ def _fit_poisson(
721764
cluster_col = self.cluster if self.cluster else unit
722765
cluster_ids = sample[cluster_col].values
723766

724-
beta, mu_hat = solve_poisson(X_full, y)
767+
beta, mu_hat = solve_poisson(X_full, y, rank_deficient_action=self.rank_deficient_action)
768+
769+
# Handle rank-deficient designs: zero out NaN entries so downstream
770+
# matrix ops don't propagate NaN (dropped columns contribute nothing)
771+
nan_mask = np.isnan(beta)
772+
if np.any(nan_mask):
773+
beta = np.where(nan_mask, 0.0, beta)
725774

726775
# QMLE sandwich vcov via shared linalg backend
727776
resids = y - mu_hat

0 commit comments

Comments
 (0)