Skip to content

Commit 98a1f3a

Browse files
igerberclaude
andcommitted
Close remaining CI review items; automate table generation
Addresses the second-round CI review findings: - P1 false-pass (remaining): removed five phase-local try/except blocks that swallowed sub-step exceptions (HonestDiD M-grids in brand-awareness and BRFSS, dCDH HonestDiD and heterogeneity refit, dose-response dataframe extraction). Exceptions now escape, the phase is marked ok=false, and run_scenario's atexit handler exits nonzero. The fix caught a real API-usage bug on its first rerun: dose_response extract phase tried to pull event_study level on a result fit with aggregate="dose"; the event_study fit lives in a dedicated phase, so that level is removed from the extraction loop. - P2 scenario-spec drift: BRFSS scenario text now says pweight TSL stage-2 (matching the aggregate_survey-returned design), not "Full replicate-weight path"; dCDH reversible scenario text now says heterogeneity="group" (matching the script), not "cohort". - P3 path leakage: tracemalloc output now scrubs $HOME, repo root, and site-packages before writing the committed txt. Drift-prevention layer: - gen_findings_tables.py reads every JSON baseline and rewrites the numerical tables in performance-plan.md between <!-- TABLE:start <id> --> / <!-- TABLE:end <id> --> markers. Tables now re-derive from data on every rerun, eliminating the hand-edit drift the prior review flagged. Narrative prose stays hand-written by design, forcing a human re-read of findings when numbers shift. Findings refresh (the numbers moved slightly; three narrative claims needed updating): - "Rust marginally slower than Python on JK1 at large scale" -> removed; fresh data has Rust and Python within noise on brand awareness at large (JK1 phase 0.577s Py / 0.562s Rust, totals 1.03 / 1.04). - "ImputationDiD consistently dominant phase at all scales" -> narrowed to "dominant under Python; tied with SunAbraham under Rust at large". - "Nine-figures of MB" in memory finding #3 was a phrasing error (literally 100+ TB); corrected to "mid-100s of MB". Priority of optimization opportunities refreshed against new data: - #1 aggregate_survey precompute stratum scaffolding: High (unchanged, now strongly supported - 24.75s Python / 25.41s Rust at 1M rows, 100% of chain runtime, growth only +31 MB). - #2 Staggered CS working-memory audit: Low with explicit bump-trigger (Rust large crosses 512 MB Lambda line). - #5 Rust-port JK1 replicate fit loop: demoted from Medium to Low - the "Rust regression to fix" leg of the rationale is gone because Rust is no longer slower. Net: one clear priority (aggregate_survey fix), four optional follow-ups. Still measurement only. No changes under diff_diff/ or rust/. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
1 parent 8d5eae8 commit 98a1f3a

36 files changed

Lines changed: 745 additions & 486 deletions

benchmarks/speed_review/baselines/brand_awareness_survey_large_python.json

Lines changed: 11 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -2,47 +2,47 @@
22
"scenario": "brand_awareness_survey_large",
33
"backend": "python",
44
"has_rust_backend": false,
5-
"total_seconds": 0.9061127080000002,
5+
"total_seconds": 1.026685333,
66
"memory": {
77
"available": true,
8-
"start_mb": 189.33,
9-
"peak_mb": 335.59,
10-
"growth_mb": 146.27,
8+
"start_mb": 195.45,
9+
"peak_mb": 335.25,
10+
"growth_mb": 139.8,
1111
"sampler_interval_s": 0.01
1212
},
1313
"phases": {
1414
"1_naive_fit_no_survey_design": {
15-
"seconds": 0.013970540999999947,
15+
"seconds": 0.01289908400000006,
1616
"ok": true,
1717
"error": null
1818
},
1919
"2_tsl_strata_psu_fpc": {
20-
"seconds": 0.03233287500000004,
20+
"seconds": 0.030157082999999973,
2121
"ok": true,
2222
"error": null
2323
},
2424
"3_replicate_weights_jk1": {
25-
"seconds": 0.44611704099999994,
25+
"seconds": 0.5706585830000002,
2626
"ok": true,
2727
"error": null
2828
},
2929
"4_multi_outcome_loop_3_metrics": {
30-
"seconds": 0.21938504199999986,
30+
"seconds": 0.21575479099999972,
3131
"ok": true,
3232
"error": null
3333
},
3434
"5_check_parallel_trends": {
35-
"seconds": 0.04051395800000002,
35+
"seconds": 0.039158959000000326,
3636
"ok": true,
3737
"error": null
3838
},
3939
"6_placebo_refit_pre_period": {
40-
"seconds": 0.016386375000000175,
40+
"seconds": 0.02081145899999992,
4141
"ok": true,
4242
"error": null
4343
},
4444
"7_event_study_plus_honest_did": {
45-
"seconds": 0.13737600000000016,
45+
"seconds": 0.13721408299999993,
4646
"ok": true,
4747
"error": null
4848
}

benchmarks/speed_review/baselines/brand_awareness_survey_large_rust.json

Lines changed: 11 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -2,47 +2,47 @@
22
"scenario": "brand_awareness_survey_large",
33
"backend": "rust",
44
"has_rust_backend": true,
5-
"total_seconds": 0.8791467499999999,
5+
"total_seconds": 1.03563075,
66
"memory": {
77
"available": true,
8-
"start_mb": 187.97,
9-
"peak_mb": 315.2,
10-
"growth_mb": 127.23,
8+
"start_mb": 194.5,
9+
"peak_mb": 338.94,
10+
"growth_mb": 144.44,
1111
"sampler_interval_s": 0.01
1212
},
1313
"phases": {
1414
"1_naive_fit_no_survey_design": {
15-
"seconds": 0.013859375000000007,
15+
"seconds": 0.01294266700000013,
1616
"ok": true,
1717
"error": null
1818
},
1919
"2_tsl_strata_psu_fpc": {
20-
"seconds": 0.02124400000000004,
20+
"seconds": 0.031624374999999816,
2121
"ok": true,
2222
"error": null
2323
},
2424
"3_replicate_weights_jk1": {
25-
"seconds": 0.42970375000000005,
25+
"seconds": 0.5619407919999999,
2626
"ok": true,
2727
"error": null
2828
},
2929
"4_multi_outcome_loop_3_metrics": {
30-
"seconds": 0.2112943330000001,
30+
"seconds": 0.263706,
3131
"ok": true,
3232
"error": null
3333
},
3434
"5_check_parallel_trends": {
35-
"seconds": 0.038379208000000276,
35+
"seconds": 0.009824249999999868,
3636
"ok": true,
3737
"error": null
3838
},
3939
"6_placebo_refit_pre_period": {
40-
"seconds": 0.025571082999999994,
40+
"seconds": 0.012082083000000132,
4141
"ok": true,
4242
"error": null
4343
},
4444
"7_event_study_plus_honest_did": {
45-
"seconds": 0.1390828329999998,
45+
"seconds": 0.1435007500000003,
4646
"ok": true,
4747
"error": null
4848
}

benchmarks/speed_review/baselines/brand_awareness_survey_medium_python.json

Lines changed: 11 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -2,47 +2,47 @@
22
"scenario": "brand_awareness_survey_medium",
33
"backend": "python",
44
"has_rust_backend": false,
5-
"total_seconds": 0.560188,
5+
"total_seconds": 0.763684958,
66
"memory": {
77
"available": true,
8-
"start_mb": 131.69,
9-
"peak_mb": 187.8,
10-
"growth_mb": 56.11,
8+
"start_mb": 135.03,
9+
"peak_mb": 193.52,
10+
"growth_mb": 58.48,
1111
"sampler_interval_s": 0.01
1212
},
1313
"phases": {
1414
"1_naive_fit_no_survey_design": {
15-
"seconds": 0.011834042000000045,
15+
"seconds": 0.01204445799999998,
1616
"ok": true,
1717
"error": null
1818
},
1919
"2_tsl_strata_psu_fpc": {
20-
"seconds": 0.03354012500000003,
20+
"seconds": 0.03723474999999998,
2121
"ok": true,
2222
"error": null
2323
},
2424
"3_replicate_weights_jk1": {
25-
"seconds": 0.21381758399999995,
25+
"seconds": 0.377466875,
2626
"ok": true,
2727
"error": null
2828
},
2929
"4_multi_outcome_loop_3_metrics": {
30-
"seconds": 0.13717983300000003,
30+
"seconds": 0.14207241700000006,
3131
"ok": true,
3232
"error": null
3333
},
3434
"5_check_parallel_trends": {
35-
"seconds": 0.018324165999999975,
35+
"seconds": 0.014263458000000062,
3636
"ok": true,
3737
"error": null
3838
},
3939
"6_placebo_refit_pre_period": {
40-
"seconds": 0.058137000000000105,
40+
"seconds": 0.06729445800000011,
4141
"ok": true,
4242
"error": null
4343
},
4444
"7_event_study_plus_honest_did": {
45-
"seconds": 0.08734375000000005,
45+
"seconds": 0.11328508300000006,
4646
"ok": true,
4747
"error": null
4848
}

benchmarks/speed_review/baselines/brand_awareness_survey_medium_rust.json

Lines changed: 11 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -2,47 +2,47 @@
22
"scenario": "brand_awareness_survey_medium",
33
"backend": "rust",
44
"has_rust_backend": true,
5-
"total_seconds": 0.5398647089999999,
5+
"total_seconds": 0.5770147080000001,
66
"memory": {
77
"available": true,
8-
"start_mb": 133.16,
9-
"peak_mb": 185.38,
10-
"growth_mb": 52.22,
8+
"start_mb": 135.83,
9+
"peak_mb": 189.91,
10+
"growth_mb": 54.08,
1111
"sampler_interval_s": 0.01
1212
},
1313
"phases": {
1414
"1_naive_fit_no_survey_design": {
15-
"seconds": 0.011500667000000075,
15+
"seconds": 0.014149625000000055,
1616
"ok": true,
1717
"error": null
1818
},
1919
"2_tsl_strata_psu_fpc": {
20-
"seconds": 0.03384820799999999,
20+
"seconds": 0.03726633299999993,
2121
"ok": true,
2222
"error": null
2323
},
2424
"3_replicate_weights_jk1": {
25-
"seconds": 0.191542875,
25+
"seconds": 0.20208641699999985,
2626
"ok": true,
2727
"error": null
2828
},
2929
"4_multi_outcome_loop_3_metrics": {
30-
"seconds": 0.105974083,
30+
"seconds": 0.14449316599999995,
3131
"ok": true,
3232
"error": null
3333
},
3434
"5_check_parallel_trends": {
35-
"seconds": 0.02876208299999994,
35+
"seconds": 0.022887250000000137,
3636
"ok": true,
3737
"error": null
3838
},
3939
"6_placebo_refit_pre_period": {
40-
"seconds": 0.06280441700000017,
40+
"seconds": 0.07035533400000005,
4141
"ok": true,
4242
"error": null
4343
},
4444
"7_event_study_plus_honest_did": {
45-
"seconds": 0.10540583399999992,
45+
"seconds": 0.085751833,
4646
"ok": true,
4747
"error": null
4848
}

benchmarks/speed_review/baselines/brand_awareness_survey_small_python.json

Lines changed: 11 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -2,47 +2,47 @@
22
"scenario": "brand_awareness_survey_small",
33
"backend": "python",
44
"has_rust_backend": false,
5-
"total_seconds": 0.15974079200000002,
5+
"total_seconds": 0.21190179099999995,
66
"memory": {
77
"available": true,
8-
"start_mb": 115.44,
9-
"peak_mb": 125.66,
10-
"growth_mb": 10.22,
8+
"start_mb": 115.28,
9+
"peak_mb": 131.0,
10+
"growth_mb": 15.72,
1111
"sampler_interval_s": 0.01
1212
},
1313
"phases": {
1414
"1_naive_fit_no_survey_design": {
15-
"seconds": 0.0016714159999999811,
15+
"seconds": 0.0019152919999999574,
1616
"ok": true,
1717
"error": null
1818
},
1919
"2_tsl_strata_psu_fpc": {
20-
"seconds": 0.0061952499999999855,
20+
"seconds": 0.007577749999999939,
2121
"ok": true,
2222
"error": null
2323
},
2424
"3_replicate_weights_jk1": {
25-
"seconds": 0.018200666000000032,
25+
"seconds": 0.02303670899999999,
2626
"ok": true,
2727
"error": null
2828
},
2929
"4_multi_outcome_loop_3_metrics": {
30-
"seconds": 0.02470079199999997,
30+
"seconds": 0.052659917000000056,
3131
"ok": true,
3232
"error": null
3333
},
3434
"5_check_parallel_trends": {
35-
"seconds": 0.008862999999999954,
35+
"seconds": 0.009975750000000061,
3636
"ok": true,
3737
"error": null
3838
},
3939
"6_placebo_refit_pre_period": {
40-
"seconds": 0.024017708000000026,
40+
"seconds": 0.027377666999999994,
4141
"ok": true,
4242
"error": null
4343
},
4444
"7_event_study_plus_honest_did": {
45-
"seconds": 0.07607645800000007,
45+
"seconds": 0.08933650000000004,
4646
"ok": true,
4747
"error": null
4848
}

benchmarks/speed_review/baselines/brand_awareness_survey_small_rust.json

Lines changed: 11 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -2,47 +2,47 @@
22
"scenario": "brand_awareness_survey_small",
33
"backend": "rust",
44
"has_rust_backend": true,
5-
"total_seconds": 0.19896133300000007,
5+
"total_seconds": 0.19699320799999998,
66
"memory": {
77
"available": true,
8-
"start_mb": 116.34,
9-
"peak_mb": 129.73,
10-
"growth_mb": 13.39,
8+
"start_mb": 115.36,
9+
"peak_mb": 130.86,
10+
"growth_mb": 15.5,
1111
"sampler_interval_s": 0.01
1212
},
1313
"phases": {
1414
"1_naive_fit_no_survey_design": {
15-
"seconds": 0.0019397500000000178,
15+
"seconds": 0.0021563339999999265,
1616
"ok": true,
1717
"error": null
1818
},
1919
"2_tsl_strata_psu_fpc": {
20-
"seconds": 0.005711999999999939,
20+
"seconds": 0.006268332999999959,
2121
"ok": true,
2222
"error": null
2323
},
2424
"3_replicate_weights_jk1": {
25-
"seconds": 0.011531958999999925,
25+
"seconds": 0.020846708999999963,
2626
"ok": true,
2727
"error": null
2828
},
2929
"4_multi_outcome_loop_3_metrics": {
30-
"seconds": 0.06204845800000003,
30+
"seconds": 0.027288292000000047,
3131
"ok": true,
3232
"error": null
3333
},
3434
"5_check_parallel_trends": {
35-
"seconds": 0.00982324999999995,
35+
"seconds": 0.01103129199999997,
3636
"ok": true,
3737
"error": null
3838
},
3939
"6_placebo_refit_pre_period": {
40-
"seconds": 0.024675957999999998,
40+
"seconds": 0.02803533300000005,
4141
"ok": true,
4242
"error": null
4343
},
4444
"7_event_study_plus_honest_did": {
45-
"seconds": 0.08321629100000005,
45+
"seconds": 0.10135274999999999,
4646
"ok": true,
4747
"error": null
4848
}

0 commit comments

Comments
 (0)