Skip to content

Commit 030d5f5

Browse files
igerber and claude committed
Pin dose-response cohort to period 3 (P1); harden narrative to close-race shifts
CI re-review P1: bench_dose_response.py inherited the CDiD generator's default cohort [2], not the documented period 3. The fallback that would have set first_treat=3 never ran (generator already populates first_treat), so the committed baselines measured a different cohort onset than the scenario doc. The binarized DiD phase also hardcoded post >= 3, which further desynced it from the actual CDiD treatment start under the default DGP. Fix: - Pin the generator to cohort_periods=[3] so the DGP matches the docs. - Assert exactly one positive first_treat after generation; future DGP changes that break the single-cohort contract will fail loudly instead of drifting silently. - Binarized phase now derives its post cutoff from the actual first_treat in the data, not a hardcoded period number. No opportunity to desync from the CDiD fits above. - Regenerated dose-response baselines for both backends. Structural narrative hardening: Prior CI rounds have repeatedly re-flagged the same drift pattern: the staggered campaign and reversible dCDH narratives make phase-order claims at close-race cells (staggered Rust medium, dCDH at this shape) that can flip on rerun because the two contenders are within a few percentage points of each other. The underlying ranking is not the right level of abstraction for narrative; the phase-share table is. This commit rewrites both narratives to describe the aggregate share pattern and defer per-cell ordering to the generator-produced table. Scaling finding #2 and hotspot table row #2 get the same treatment. Net effect: narrative claims are now robust to rerun noise at close-race cells. Still measurement only. No changes under diff_diff/ or rust/. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
1 parent a0aafc5 commit 030d5f5

29 files changed

Lines changed: 358 additions & 343 deletions

benchmarks/speed_review/baselines/brand_awareness_survey_large_python.json

Lines changed: 11 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -2,47 +2,47 @@
22
"scenario": "brand_awareness_survey_large",
33
"backend": "python",
44
"has_rust_backend": false,
5-
"total_seconds": 0.963601333,
5+
"total_seconds": 1.0910496250000001,
66
"memory": {
77
"available": true,
8-
"start_mb": 197.14,
9-
"peak_mb": 347.12,
10-
"growth_mb": 149.98,
8+
"start_mb": 188.45,
9+
"peak_mb": 327.44,
10+
"growth_mb": 138.98,
1111
"sampler_interval_s": 0.01
1212
},
1313
"phases": {
1414
"1_naive_fit_no_survey_design": {
15-
"seconds": 0.0072292080000000425,
15+
"seconds": 0.009826500000000182,
1616
"ok": true,
1717
"error": null
1818
},
1919
"2_tsl_strata_psu_fpc": {
20-
"seconds": 0.017787999999999915,
20+
"seconds": 0.030280333999999964,
2121
"ok": true,
2222
"error": null
2323
},
2424
"3_replicate_weights_jk1": {
25-
"seconds": 0.5239118750000002,
25+
"seconds": 0.6243122919999999,
2626
"ok": true,
2727
"error": null
2828
},
2929
"4_multi_outcome_loop_3_metrics": {
30-
"seconds": 0.2471308750000003,
30+
"seconds": 0.24174716599999968,
3131
"ok": true,
3232
"error": null
3333
},
3434
"5_check_parallel_trends": {
35-
"seconds": 0.023947041000000002,
35+
"seconds": 0.025623749999999834,
3636
"ok": true,
3737
"error": null
3838
},
3939
"6_placebo_refit_pre_period": {
40-
"seconds": 0.010973958000000117,
40+
"seconds": 0.01191299999999984,
4141
"ok": true,
4242
"error": null
4343
},
4444
"7_event_study_plus_honest_did": {
45-
"seconds": 0.13260737500000008,
45+
"seconds": 0.147335875,
4646
"ok": true,
4747
"error": null
4848
}

benchmarks/speed_review/baselines/brand_awareness_survey_large_rust.json

Lines changed: 11 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -2,47 +2,47 @@
22
"scenario": "brand_awareness_survey_large",
33
"backend": "rust",
44
"has_rust_backend": true,
5-
"total_seconds": 0.880690958,
5+
"total_seconds": 1.0000031249999999,
66
"memory": {
77
"available": true,
8-
"start_mb": 189.05,
9-
"peak_mb": 341.62,
10-
"growth_mb": 152.58,
8+
"start_mb": 194.03,
9+
"peak_mb": 336.08,
10+
"growth_mb": 142.05,
1111
"sampler_interval_s": 0.01
1212
},
1313
"phases": {
1414
"1_naive_fit_no_survey_design": {
15-
"seconds": 0.013126208999999944,
15+
"seconds": 0.013511041000000112,
1616
"ok": true,
1717
"error": null
1818
},
1919
"2_tsl_strata_psu_fpc": {
20-
"seconds": 0.03180279199999991,
20+
"seconds": 0.03037650000000003,
2121
"ok": true,
2222
"error": null
2323
},
2424
"3_replicate_weights_jk1": {
25-
"seconds": 0.4201593749999999,
25+
"seconds": 0.5431151669999998,
2626
"ok": true,
2727
"error": null
2828
},
2929
"4_multi_outcome_loop_3_metrics": {
30-
"seconds": 0.21300062499999983,
30+
"seconds": 0.21752962499999962,
3131
"ok": true,
3232
"error": null
3333
},
3434
"5_check_parallel_trends": {
35-
"seconds": 0.036875957999999986,
35+
"seconds": 0.04399687500000038,
3636
"ok": true,
3737
"error": null
3838
},
3939
"6_placebo_refit_pre_period": {
40-
"seconds": 0.025331083999999837,
40+
"seconds": 0.016433082999999904,
4141
"ok": true,
4242
"error": null
4343
},
4444
"7_event_study_plus_honest_did": {
45-
"seconds": 0.14037483400000017,
45+
"seconds": 0.13501837500000002,
4646
"ok": true,
4747
"error": null
4848
}

benchmarks/speed_review/baselines/brand_awareness_survey_medium_python.json

Lines changed: 11 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -2,47 +2,47 @@
22
"scenario": "brand_awareness_survey_medium",
33
"backend": "python",
44
"has_rust_backend": false,
5-
"total_seconds": 0.802721625,
5+
"total_seconds": 0.563283334,
66
"memory": {
77
"available": true,
8-
"start_mb": 132.53,
9-
"peak_mb": 188.75,
10-
"growth_mb": 56.22,
8+
"start_mb": 133.69,
9+
"peak_mb": 187.7,
10+
"growth_mb": 54.02,
1111
"sampler_interval_s": 0.01
1212
},
1313
"phases": {
1414
"1_naive_fit_no_survey_design": {
15-
"seconds": 0.011162792000000032,
15+
"seconds": 0.010921792000000097,
1616
"ok": true,
1717
"error": null
1818
},
1919
"2_tsl_strata_psu_fpc": {
20-
"seconds": 0.033565499999999915,
20+
"seconds": 0.03732066599999995,
2121
"ok": true,
2222
"error": null
2323
},
2424
"3_replicate_weights_jk1": {
25-
"seconds": 0.4388773749999999,
25+
"seconds": 0.20805304199999997,
2626
"ok": true,
2727
"error": null
2828
},
2929
"4_multi_outcome_loop_3_metrics": {
30-
"seconds": 0.17477937499999996,
30+
"seconds": 0.12622899999999992,
3131
"ok": true,
3232
"error": null
3333
},
3434
"5_check_parallel_trends": {
35-
"seconds": 0.02887754099999995,
35+
"seconds": 0.01834783299999998,
3636
"ok": true,
3737
"error": null
3838
},
3939
"6_placebo_refit_pre_period": {
40-
"seconds": 0.05516908300000001,
40+
"seconds": 0.054030583000000076,
4141
"ok": true,
4242
"error": null
4343
},
4444
"7_event_study_plus_honest_did": {
45-
"seconds": 0.060266375000000094,
45+
"seconds": 0.10836029199999997,
4646
"ok": true,
4747
"error": null
4848
}

benchmarks/speed_review/baselines/brand_awareness_survey_medium_rust.json

Lines changed: 11 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -2,47 +2,47 @@
22
"scenario": "brand_awareness_survey_medium",
33
"backend": "rust",
44
"has_rust_backend": true,
5-
"total_seconds": 0.512456792,
5+
"total_seconds": 0.5500554579999999,
66
"memory": {
77
"available": true,
8-
"start_mb": 133.5,
9-
"peak_mb": 183.52,
10-
"growth_mb": 50.02,
8+
"start_mb": 135.36,
9+
"peak_mb": 184.86,
10+
"growth_mb": 49.5,
1111
"sampler_interval_s": 0.01
1212
},
1313
"phases": {
1414
"1_naive_fit_no_survey_design": {
15-
"seconds": 0.012339082999999973,
15+
"seconds": 0.011186999999999947,
1616
"ok": true,
1717
"error": null
1818
},
1919
"2_tsl_strata_psu_fpc": {
20-
"seconds": 0.0368013330000001,
20+
"seconds": 0.03363270800000007,
2121
"ok": true,
2222
"error": null
2323
},
2424
"3_replicate_weights_jk1": {
25-
"seconds": 0.14184362500000003,
25+
"seconds": 0.18678066699999996,
2626
"ok": true,
2727
"error": null
2828
},
2929
"4_multi_outcome_loop_3_metrics": {
30-
"seconds": 0.1607655830000001,
30+
"seconds": 0.16038787500000007,
3131
"ok": true,
3232
"error": null
3333
},
3434
"5_check_parallel_trends": {
35-
"seconds": 0.025544416000000014,
35+
"seconds": 0.022171542000000155,
3636
"ok": true,
3737
"error": null
3838
},
3939
"6_placebo_refit_pre_period": {
40-
"seconds": 0.060123250000000183,
40+
"seconds": 0.0532650830000001,
4141
"ok": true,
4242
"error": null
4343
},
4444
"7_event_study_plus_honest_did": {
45-
"seconds": 0.07500912500000001,
45+
"seconds": 0.08262075000000002,
4646
"ok": true,
4747
"error": null
4848
}

benchmarks/speed_review/baselines/brand_awareness_survey_small_python.json

Lines changed: 10 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -2,47 +2,47 @@
22
"scenario": "brand_awareness_survey_small",
33
"backend": "python",
44
"has_rust_backend": false,
5-
"total_seconds": 0.202955542,
5+
"total_seconds": 0.19338629200000002,
66
"memory": {
77
"available": true,
88
"start_mb": 115.48,
9-
"peak_mb": 126.53,
10-
"growth_mb": 11.05,
9+
"peak_mb": 127.31,
10+
"growth_mb": 11.83,
1111
"sampler_interval_s": 0.01
1212
},
1313
"phases": {
1414
"1_naive_fit_no_survey_design": {
15-
"seconds": 0.0013837089999999552,
15+
"seconds": 0.0014470410000000378,
1616
"ok": true,
1717
"error": null
1818
},
1919
"2_tsl_strata_psu_fpc": {
20-
"seconds": 0.006605916000000045,
20+
"seconds": 0.0072707499999999925,
2121
"ok": true,
2222
"error": null
2323
},
2424
"3_replicate_weights_jk1": {
25-
"seconds": 0.020932875000000073,
25+
"seconds": 0.023173292000000068,
2626
"ok": true,
2727
"error": null
2828
},
2929
"4_multi_outcome_loop_3_metrics": {
30-
"seconds": 0.05001366600000001,
30+
"seconds": 0.03375529200000005,
3131
"ok": true,
3232
"error": null
3333
},
3434
"5_check_parallel_trends": {
35-
"seconds": 0.00929579199999997,
35+
"seconds": 0.01041325000000004,
3636
"ok": true,
3737
"error": null
3838
},
3939
"6_placebo_refit_pre_period": {
40-
"seconds": 0.028791582999999954,
40+
"seconds": 0.027520249999999913,
4141
"ok": true,
4242
"error": null
4343
},
4444
"7_event_study_plus_honest_did": {
45-
"seconds": 0.08592387499999998,
45+
"seconds": 0.08979433299999995,
4646
"ok": true,
4747
"error": null
4848
}

benchmarks/speed_review/baselines/brand_awareness_survey_small_rust.json

Lines changed: 11 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -2,47 +2,47 @@
22
"scenario": "brand_awareness_survey_small",
33
"backend": "rust",
44
"has_rust_backend": true,
5-
"total_seconds": 0.209049209,
5+
"total_seconds": 0.19669587500000008,
66
"memory": {
77
"available": true,
8-
"start_mb": 115.23,
9-
"peak_mb": 128.47,
10-
"growth_mb": 13.23,
8+
"start_mb": 114.78,
9+
"peak_mb": 127.91,
10+
"growth_mb": 13.12,
1111
"sampler_interval_s": 0.01
1212
},
1313
"phases": {
1414
"1_naive_fit_no_survey_design": {
15-
"seconds": 0.0016902500000000042,
15+
"seconds": 0.0016678749999999853,
1616
"ok": true,
1717
"error": null
1818
},
1919
"2_tsl_strata_psu_fpc": {
20-
"seconds": 0.005705791000000016,
20+
"seconds": 0.005756874999999995,
2121
"ok": true,
2222
"error": null
2323
},
2424
"3_replicate_weights_jk1": {
25-
"seconds": 0.01469479100000004,
25+
"seconds": 0.012066042000000055,
2626
"ok": true,
2727
"error": null
2828
},
2929
"4_multi_outcome_loop_3_metrics": {
30-
"seconds": 0.05941337499999999,
30+
"seconds": 0.05887395800000006,
3131
"ok": true,
3232
"error": null
3333
},
3434
"5_check_parallel_trends": {
35-
"seconds": 0.009663624999999954,
35+
"seconds": 0.008938375000000054,
3636
"ok": true,
3737
"error": null
3838
},
3939
"6_placebo_refit_pre_period": {
40-
"seconds": 0.02708766600000001,
40+
"seconds": 0.0274049999999999,
4141
"ok": true,
4242
"error": null
4343
},
4444
"7_event_study_plus_honest_did": {
45-
"seconds": 0.090782625,
45+
"seconds": 0.08197737500000002,
4646
"ok": true,
4747
"error": null
4848
}

0 commit comments

Comments
 (0)