Fix DoseResponseCurve survey df and strengthen P3 tests from PR #226 review (round 10)

igerber · claude · igerber · commit f3d74276471d · 2026-03-22T06:30:32.000-04:00
- DoseResponseCurve: add df_survey field; to_dataframe() now passes the
  survey degrees of freedom to safe_inference() so exported p-values match
  fit-time inference instead of using the normal approximation
- Strengthen Bacon exact-weight test to assert exact vs approximate
  weights differ (not just finiteness)
- Strengthen ContinuousDiD dose-response test to assert exported
  p-values match safe_inference(..., df=survey_df)

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
diff --git a/diff_diff/continuous_did.py b/diff_diff/continuous_did.py
@@ -397,6 +397,8 @@ def fit(
                 unit_cohorts=precomp["unit_cohorts"],
             )
 
+        _survey_df = None  # Set by analytical branch when survey is active
+
         if len(post_gt) == 0:
             warnings.warn(
                 "No post-treatment (g,t) cells available for aggregation. "
@@ -665,6 +667,7 @@ def fit(
             target="att",
             p_value=att_d_p,
             n_bootstrap=self.n_bootstrap,
+            df_survey=_survey_df,
         )
         dose_response_acrt = DoseResponseCurve(
             dose_grid=dvals,
@@ -675,6 +678,7 @@ def fit(
             target="acrt",
             p_value=acrt_d_p,
             n_bootstrap=self.n_bootstrap,
+            df_survey=_survey_df,
         )
 
         # Strip bootstrap internals from gt_results
diff --git a/diff_diff/continuous_did_results.py b/diff_diff/continuous_did_results.py
@@ -45,6 +45,7 @@ class DoseResponseCurve:
     target: str
     p_value: Optional[np.ndarray] = None
     n_bootstrap: int = 0
+    df_survey: Optional[int] = None
 
     def to_dataframe(self) -> pd.DataFrame:
         """Convert to DataFrame with dose, effect, se, CI, t_stat, p_value."""
@@ -60,7 +61,7 @@ def to_dataframe(self) -> pd.DataFrame:
             t_stat = np.full(n, np.nan)
             p_value = np.full(n, np.nan)
             for i in range(n):
-                t_i, p_i, _ = safe_inference(self.effects[i], self.se[i])
+                t_i, p_i, _ = safe_inference(self.effects[i], self.se[i], df=self.df_survey)
                 t_stat[i] = t_i
                 p_value[i] = p_i
         return pd.DataFrame(
diff --git a/tests/test_survey_phase3.py b/tests/test_survey_phase3.py
@@ -405,6 +405,25 @@ def test_exact_weights_survey_weighted(self, staggered_survey_data):
         assert len(r.comparisons) > 0
         for comp in r.comparisons:
             assert np.isfinite(comp.weight)
+        # With non-uniform weights, exact weights should differ from
+        # approximate weights (approximate uses n_k*(1-n_k)*Var(D))
+        r_approx = BaconDecomposition(weights="approximate").fit(
+            staggered_survey_data,
+            "outcome",
+            "unit",
+            "time",
+            "first_treat",
+            survey_design=sd,
+        )
+        # At least one comparison weight should differ
+        exact_weights = {(c.treated_group, c.control_group): c.weight for c in r.comparisons}
+        approx_weights = {
+            (c.treated_group, c.control_group): c.weight for c in r_approx.comparisons
+        }
+        common_keys = set(exact_weights) & set(approx_weights)
+        assert len(common_keys) > 0
+        diffs = [abs(exact_weights[k] - approx_weights[k]) for k in common_keys]
+        assert max(diffs) > 1e-10, "Exact and approximate weights should differ"
 
 
 # =============================================================================
@@ -886,6 +905,7 @@ def test_sun_abraham_survey_df_regression(self, staggered_survey_data):
     def test_continuous_did_dose_response_survey_pvalue(self, continuous_survey_data):
         """DoseResponseCurve.to_dataframe() p-values should use survey df."""
         from diff_diff import ContinuousDiD
+        from diff_diff.utils import safe_inference
 
         sd = SurveyDesign(weights="weight", strata="stratum")
         result = ContinuousDiD(n_bootstrap=0).fit(
@@ -899,8 +919,13 @@ def test_continuous_did_dose_response_survey_pvalue(self, continuous_survey_data
         )
         sm = result.survey_metadata
         assert sm is not None
-        # Check that dose-response curve p-values are finite
+        assert sm.df_survey is not None
+        # Check that dose-response curve carries survey df
+        assert result.dose_response_att.df_survey == sm.df_survey
+        # Check exported p-values use survey df, not normal approx
         att_df = result.dose_response_att.to_dataframe()
-        assert "p_value" in att_df.columns
-        finite_p = att_df["p_value"].dropna()
-        assert len(finite_p) > 0
+        for i in range(min(3, len(att_df))):
+            row = att_df.iloc[i]
+            if np.isfinite(row["effect"]) and np.isfinite(row["se"]) and row["se"] > 0:
+                _, expected_p, _ = safe_inference(row["effect"], row["se"], df=sm.df_survey)
+                assert row["p_value"] == pytest.approx(expected_p, rel=1e-10)