Fix doc snippet bugs: wrong params, data-shape mismatches, and harden test

igerber · claude · igerber · commit 264af1259364 · 2026-03-16T19:09:04.000-04:00
- Fix Bacon wording: "negative weights" → accurate Goodman-Bacon (2021) language
- Fix first_treat column name in Bacon examples (2 files)
- Fix 15 doc snippet bugs: wrong reference_period, column names, return types,
  and estimator-data mismatches (SyntheticDiD, wild bootstrap, diagnostics)
- Harden test_doc_snippets: fail on all exceptions except NameError
- Add mock dataset loaders so dataset page snippets execute without network
- Add dose/exposure columns to test namespace for troubleshooting snippets

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
diff --git a/docs/api/diagnostics.rst b/docs/api/diagnostics.rst
@@ -76,14 +76,15 @@ Example
 
 .. code-block:: python
 
-   from diff_diff import permutation_test
+   from diff_diff import permutation_test, generate_did_data
 
+   panel = generate_did_data(n_units=100, n_periods=10, treatment_effect=2.0)
    result = permutation_test(
-       data,
-       outcome='y',
+       panel,
+       outcome='outcome',
        treatment='treated',
-       time='period',
-       unit='unit_id',
+       time='post',
+       unit='unit',
        n_permutations=1000
    )
 
@@ -101,18 +102,20 @@ Example
 
 .. code-block:: python
 
-   from diff_diff import leave_one_out_test
+   from diff_diff import leave_one_out_test, generate_did_data
 
+   panel = generate_did_data(n_units=100, n_periods=10, treatment_effect=2.0)
    result = leave_one_out_test(
-       data,
-       outcome='y',
+       panel,
+       outcome='outcome',
        treatment='treated',
-       time='period',
-       unit='unit_id'
+       time='post',
+       unit='unit'
    )
 
    # Check if results are driven by single units
-   print(f"Effect range: [{result.min_effect:.3f}, {result.max_effect:.3f}]")
+   loo = result.leave_one_out_effects
+   print(f"Effect range: [{min(loo.values()):.3f}, {max(loo.values()):.3f}]")
 
 run_all_placebo_tests
 ---------------------
diff --git a/docs/api/prep.rst b/docs/api/prep.rst
@@ -137,7 +137,7 @@ Example
 
    from diff_diff import make_post_indicator
 
-   data['post'] = make_post_indicator(
+   data = make_post_indicator(
        data,
        time_column='period',
        treatment_start=5
@@ -267,17 +267,19 @@ Example
 
    from diff_diff import validate_did_data
 
-   is_valid, issues = validate_did_data(
+   result = validate_did_data(
        data,
        outcome='outcome',
        treatment='treated',
        time='period',
        unit='unit_id'
    )
 
-   if not is_valid:
-       for issue in issues:
-           print(f"Issue: {issue}")
+   if not result['valid']:
+       for error in result['errors']:
+           print(f"Error: {error}")
+       for warning in result['warnings']:
+           print(f"Warning: {warning}")
 
 summarize_did_data
 ~~~~~~~~~~~~~~~~~~
@@ -301,9 +303,7 @@ Example
        unit='unit_id'
    )
 
-   print(f"N units: {summary['n_units']}")
-   print(f"N periods: {summary['n_periods']}")
-   print(f"Treatment fraction: {summary['treatment_fraction']:.1%}")
+   print(summary)
 
 Control Unit Selection
 ----------------------
@@ -320,16 +320,17 @@ Example
 
 .. code-block:: python
 
-   from diff_diff import rank_control_units
+   from diff_diff import rank_control_units, generate_did_data
 
+   panel = generate_did_data(n_units=100, n_periods=10, treatment_effect=2.0)
    ranked = rank_control_units(
-       data,
-       unit_column='unit_id',
+       panel,
+       unit_column='unit',
        time_column='period',
        outcome_column='outcome',
        treatment_column='treated',
-       pre_periods=[0, 1, 2, 3]
+       pre_periods=[0, 1, 2, 3, 4]
    )
 
    # Select top 10 control units
-   best_controls = ranked.head(10)['unit_id'].tolist()
+   best_controls = ranked.head(10)['unit'].tolist()
diff --git a/docs/api/utils.rst b/docs/api/utils.rst
@@ -31,7 +31,7 @@ Example
        pre_periods=[0, 1, 2, 3]
    )
 
-   print(f"F-statistic: {result['f_stat']:.3f}")
+   print(f"t-statistic: {result['t_statistic']:.3f}")
    print(f"p-value: {result['p_value']:.3f}")
 
    if result['p_value'] > 0.05:
@@ -87,12 +87,14 @@ Example
 
 .. code-block:: python
 
-   from diff_diff import DifferenceInDifferences
+   from diff_diff import DifferenceInDifferences, generate_did_data
+
+   panel = generate_did_data(n_units=200, n_periods=10, treatment_effect=2.0)
 
    # Use wild bootstrap via the estimator's inference parameter (recommended)
    did = DifferenceInDifferences(inference='wild_bootstrap', n_bootstrap=999,
-                                  cluster='unit_id')
-   results = did.fit(data, outcome='y', treatment='treated',
+                                  cluster='unit')
+   results = did.fit(panel, outcome='outcome', treatment='treated',
                      time='post')
 
    print(f"Bootstrap SE: {results.se:.3f}")
diff --git a/docs/api/visualization.rst b/docs/api/visualization.rst
@@ -22,7 +22,7 @@ Example
    # Fit event study model
    model = MultiPeriodDiD()
    results = model.fit(data, outcome='y', treatment='treated',
-                       time='period', unit='unit_id', reference_period=-1)
+                       time='period', unit='unit_id', reference_period=2)
 
    # Create plot
    fig = plot_event_study(results)
diff --git a/docs/choosing_estimator.rst b/docs/choosing_estimator.rst
@@ -157,7 +157,7 @@ Use :class:`~diff_diff.MultiPeriodDiD` when:
 
    event = MultiPeriodDiD()
    results = event.fit(data, outcome='y', treatment='treated',
-                       time='period', unit='unit_id', reference_period=-1)
+                       time='period', unit='unit_id', reference_period=2)
 
    # Visualize
    plot_event_study(results)
@@ -205,10 +205,12 @@ Use :class:`~diff_diff.SyntheticDiD` when:
 
 .. code-block:: python
 
-   from diff_diff import SyntheticDiD
+   from diff_diff import SyntheticDiD, generate_did_data
 
+   # SyntheticDiD requires block treatment (constant within units)
+   block_data = generate_did_data(n_units=40, n_periods=10, treatment_effect=2.0)
    sdid = SyntheticDiD()
-   results = sdid.fit(data, outcome='y', unit='unit_id',
+   results = sdid.fit(block_data, outcome='outcome', unit='unit',
                       time='period', treatment='treated')
 
    # View the unit weights
@@ -412,7 +414,7 @@ Use :class:`~diff_diff.BaconDecomposition` when:
 
 - You want to **diagnose** whether TWFE is biased in your staggered setting
 - You need to see which 2x2 comparisons drive the TWFE estimate
-- You want to check for negative weights from forbidden comparisons
+- You want to check whether later-vs-earlier or already-treated-as-control comparisons carry substantial weight
 
 Goodman-Bacon (2021) decomposes the TWFE estimate into a weighted average of
 all 2x2 DiD comparisons and their weights.
@@ -423,7 +425,7 @@ all 2x2 DiD comparisons and their weights.
 
    bacon = BaconDecomposition()
    results = bacon.fit(data, outcome='y', unit='unit_id',
-                       time='period', first_treat='treated')
+                       time='period', first_treat='first_treat')
    results.print_summary()
 
    # Visualize the decomposition
@@ -535,14 +537,18 @@ For panel data, always cluster at the unit level unless you have a strong reason
 
 .. code-block:: python
 
+   from diff_diff import generate_did_data
+
+   panel = generate_did_data(n_units=200, n_periods=10, treatment_effect=2.0)
+
    # Good: Cluster at unit level for panel data
-   did = DifferenceInDifferences(cluster='unit_id')
-   results = did.fit(data, outcome='y', treatment='treated',
+   did = DifferenceInDifferences(cluster='unit')
+   results = did.fit(panel, outcome='outcome', treatment='treated',
                      time='post')
 
    # Better for few clusters: Wild bootstrap
-   did = DifferenceInDifferences(inference='wild_bootstrap', cluster='state')
-   results = did.fit(data, outcome='y', treatment='treated',
+   did = DifferenceInDifferences(inference='wild_bootstrap', cluster='unit')
+   results = did.fit(panel, outcome='outcome', treatment='treated',
                      time='post')
 
 When in Doubt
diff --git a/docs/quickstart.rst b/docs/quickstart.rst
@@ -131,7 +131,7 @@ Examine treatment effects over time:
        outcome='outcome',
        treatment='treated',
        time='period',
-       post_periods=[5, 6, 7, 8, 9],
+       post_periods=[5, 6, 7],
        reference_period=4
    )
 
@@ -154,7 +154,7 @@ When treatment is adopted at different times across units:
        outcome='outcome',
        unit='unit_id',
        time='period',
-       first_treat='first_treatment_period'
+       first_treat='first_treat'
    )
 
    # View aggregated treatment effect
diff --git a/docs/troubleshooting.rst b/docs/troubleshooting.rst
@@ -363,11 +363,11 @@ during leave-one-out cross-validation (LOOCV).
    pre_periods = data.loc[data['post'] == 0, 'period'].nunique()
    print(f"Pre-treatment periods: {pre_periods}")  # Must be >= 2; stability improves with >= 4
 
-   # If TROP cannot find valid parameters, try SyntheticDiD as a fallback
-   from diff_diff import SyntheticDiD
-   sdid = SyntheticDiD()
-   results = sdid.fit(data, outcome='y', treatment='treatment',
-                      unit='unit_id', time='period')
+   # If TROP cannot find valid parameters, try CallawaySantAnna as a fallback
+   from diff_diff import CallawaySantAnna
+   cs = CallawaySantAnna()
+   results = cs.fit(data, outcome='y', unit='unit_id',
+                    time='period', first_treat='first_treat')
 
 "LOOCV fits failed / numerical instability"
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@@ -572,7 +572,7 @@ unbalanced. Bacon decomposition requires a balanced panel.
    # Then run decomposition
    bacon = BaconDecomposition()
    results = bacon.fit(balanced, outcome='y', unit='unit_id',
-                       time='period', first_treat='treatment')
+                       time='period', first_treat='first_treat')
 
 Deprecation Warnings
 --------------------
@@ -614,7 +614,7 @@ If you encounter issues not covered here:
 
    data = generate_did_data(n_units=100, n_periods=10, treatment_effect=2.0)
    did = DifferenceInDifferences()
-   results = did.fit(data, outcome='y', treatment='treated', time='post')
+   results = did.fit(data, outcome='outcome', treatment='treated', time='post')
    print(f"True effect: 2.0, Estimated: {results.att:.3f}")
 
 For bugs or feature requests, please open an issue on
diff --git a/tests/test_doc_snippets.py b/tests/test_doc_snippets.py