Skip to content

Commit 4bf991c

Browse files
igerber and claude committed
Drop cluster="psu" from naive phase; tighten two narrative claims (P3)
CI re-review P3:

- bench_brand_awareness_survey.py "naive" phase was using cluster="psu", which is already a partial sampling-design correction - the SE-inflation comparison is more faithful to Tutorial 17 when the first phase is genuinely untreated-for-design. Removed the cluster argument.
- performance-plan.md narrative overreaches corrected:
  - Staggered campaign: at Rust medium SunAbraham is now the clearly leading phase (~1.7x ImputationDiD there), not "slightly edges out". Reworded to say ImputationDiD / SunAbraham are the top two at every scale but their order is not stable across backend and scale.
  - Reversible dCDH: split is not "~evenly under both backends" - Python is closer to 58/41 with the main fit leading, Rust is 51/49 with the heterogeneity refit leading. Reworded to reflect the split per backend.

Regenerated the affected brand-awareness and campaign-staggered baselines (the naive-fit change slightly reduces brand-awareness chain totals and shifts phase-percentage shares). Tables in performance-plan.md re-derived via gen_findings_tables.py.

Still measurement only. No changes under diff_diff/ or rust/.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
1 parent 539c7d7 commit 4bf991c

29 files changed

Lines changed: 346 additions & 331 deletions

benchmarks/speed_review/baselines/brand_awareness_survey_large_python.json

Lines changed: 11 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -2,47 +2,47 @@
22
"scenario": "brand_awareness_survey_large",
33
"backend": "python",
44
"has_rust_backend": false,
5-
"total_seconds": 0.9466241249999998,
5+
"total_seconds": 0.963601333,
66
"memory": {
77
"available": true,
8-
"start_mb": 187.52,
9-
"peak_mb": 336.53,
10-
"growth_mb": 149.02,
8+
"start_mb": 197.14,
9+
"peak_mb": 347.12,
10+
"growth_mb": 149.98,
1111
"sampler_interval_s": 0.01
1212
},
1313
"phases": {
1414
"1_naive_fit_no_survey_design": {
15-
"seconds": 0.014438584000000088,
15+
"seconds": 0.0072292080000000425,
1616
"ok": true,
1717
"error": null
1818
},
1919
"2_tsl_strata_psu_fpc": {
20-
"seconds": 0.02661404200000006,
20+
"seconds": 0.017787999999999915,
2121
"ok": true,
2222
"error": null
2323
},
2424
"3_replicate_weights_jk1": {
25-
"seconds": 0.4855631249999999,
25+
"seconds": 0.5239118750000002,
2626
"ok": true,
2727
"error": null
2828
},
2929
"4_multi_outcome_loop_3_metrics": {
30-
"seconds": 0.24716379199999983,
30+
"seconds": 0.2471308750000003,
3131
"ok": true,
3232
"error": null
3333
},
3434
"5_check_parallel_trends": {
35-
"seconds": 0.02261091699999973,
35+
"seconds": 0.023947041000000002,
3636
"ok": true,
3737
"error": null
3838
},
3939
"6_placebo_refit_pre_period": {
40-
"seconds": 0.01228116700000026,
40+
"seconds": 0.010973958000000117,
4141
"ok": true,
4242
"error": null
4343
},
4444
"7_event_study_plus_honest_did": {
45-
"seconds": 0.13794237500000017,
45+
"seconds": 0.13260737500000008,
4646
"ok": true,
4747
"error": null
4848
}

benchmarks/speed_review/baselines/brand_awareness_survey_large_rust.json

Lines changed: 11 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -2,47 +2,47 @@
22
"scenario": "brand_awareness_survey_large",
33
"backend": "rust",
44
"has_rust_backend": true,
5-
"total_seconds": 0.9458203330000001,
5+
"total_seconds": 0.880690958,
66
"memory": {
77
"available": true,
8-
"start_mb": 193.34,
9-
"peak_mb": 343.95,
10-
"growth_mb": 150.61,
8+
"start_mb": 189.05,
9+
"peak_mb": 341.62,
10+
"growth_mb": 152.58,
1111
"sampler_interval_s": 0.01
1212
},
1313
"phases": {
1414
"1_naive_fit_no_survey_design": {
15-
"seconds": 0.012478959000000067,
15+
"seconds": 0.013126208999999944,
1616
"ok": true,
1717
"error": null
1818
},
1919
"2_tsl_strata_psu_fpc": {
20-
"seconds": 0.029093875000000047,
20+
"seconds": 0.03180279199999991,
2121
"ok": true,
2222
"error": null
2323
},
2424
"3_replicate_weights_jk1": {
25-
"seconds": 0.4777013749999999,
25+
"seconds": 0.4201593749999999,
2626
"ok": true,
2727
"error": null
2828
},
2929
"4_multi_outcome_loop_3_metrics": {
30-
"seconds": 0.24978320799999998,
30+
"seconds": 0.21300062499999983,
3131
"ok": true,
3232
"error": null
3333
},
3434
"5_check_parallel_trends": {
35-
"seconds": 0.020603166999999978,
35+
"seconds": 0.036875957999999986,
3636
"ok": true,
3737
"error": null
3838
},
3939
"6_placebo_refit_pre_period": {
40-
"seconds": 0.0116797919999998,
40+
"seconds": 0.025331083999999837,
4141
"ok": true,
4242
"error": null
4343
},
4444
"7_event_study_plus_honest_did": {
45-
"seconds": 0.14445204200000017,
45+
"seconds": 0.14037483400000017,
4646
"ok": true,
4747
"error": null
4848
}

benchmarks/speed_review/baselines/brand_awareness_survey_medium_python.json

Lines changed: 11 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -2,47 +2,47 @@
22
"scenario": "brand_awareness_survey_medium",
33
"backend": "python",
44
"has_rust_backend": false,
5-
"total_seconds": 0.790833584,
5+
"total_seconds": 0.802721625,
66
"memory": {
77
"available": true,
8-
"start_mb": 133.86,
9-
"peak_mb": 186.22,
10-
"growth_mb": 52.36,
8+
"start_mb": 132.53,
9+
"peak_mb": 188.75,
10+
"growth_mb": 56.22,
1111
"sampler_interval_s": 0.01
1212
},
1313
"phases": {
1414
"1_naive_fit_no_survey_design": {
15-
"seconds": 0.014936249999999984,
15+
"seconds": 0.011162792000000032,
1616
"ok": true,
1717
"error": null
1818
},
1919
"2_tsl_strata_psu_fpc": {
20-
"seconds": 0.03534024999999996,
20+
"seconds": 0.033565499999999915,
2121
"ok": true,
2222
"error": null
2323
},
2424
"3_replicate_weights_jk1": {
25-
"seconds": 0.40949133400000004,
25+
"seconds": 0.4388773749999999,
2626
"ok": true,
2727
"error": null
2828
},
2929
"4_multi_outcome_loop_3_metrics": {
30-
"seconds": 0.18262604100000002,
30+
"seconds": 0.17477937499999996,
3131
"ok": true,
3232
"error": null
3333
},
3434
"5_check_parallel_trends": {
35-
"seconds": 0.03471770800000007,
35+
"seconds": 0.02887754099999995,
3636
"ok": true,
3737
"error": null
3838
},
3939
"6_placebo_refit_pre_period": {
40-
"seconds": 0.0515672920000001,
40+
"seconds": 0.05516908300000001,
4141
"ok": true,
4242
"error": null
4343
},
4444
"7_event_study_plus_honest_did": {
45-
"seconds": 0.06213379099999994,
45+
"seconds": 0.060266375000000094,
4646
"ok": true,
4747
"error": null
4848
}

benchmarks/speed_review/baselines/brand_awareness_survey_medium_rust.json

Lines changed: 11 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -2,47 +2,47 @@
22
"scenario": "brand_awareness_survey_medium",
33
"backend": "rust",
44
"has_rust_backend": true,
5-
"total_seconds": 0.49272766700000004,
5+
"total_seconds": 0.512456792,
66
"memory": {
77
"available": true,
8-
"start_mb": 136.12,
9-
"peak_mb": 187.19,
10-
"growth_mb": 51.06,
8+
"start_mb": 133.5,
9+
"peak_mb": 183.52,
10+
"growth_mb": 50.02,
1111
"sampler_interval_s": 0.01
1212
},
1313
"phases": {
1414
"1_naive_fit_no_survey_design": {
15-
"seconds": 0.01159325,
15+
"seconds": 0.012339082999999973,
1616
"ok": true,
1717
"error": null
1818
},
1919
"2_tsl_strata_psu_fpc": {
20-
"seconds": 0.03190720899999999,
20+
"seconds": 0.0368013330000001,
2121
"ok": true,
2222
"error": null
2323
},
2424
"3_replicate_weights_jk1": {
25-
"seconds": 0.13057945900000012,
25+
"seconds": 0.14184362500000003,
2626
"ok": true,
2727
"error": null
2828
},
2929
"4_multi_outcome_loop_3_metrics": {
30-
"seconds": 0.13259537499999996,
30+
"seconds": 0.1607655830000001,
3131
"ok": true,
3232
"error": null
3333
},
3434
"5_check_parallel_trends": {
35-
"seconds": 0.034756790999999954,
35+
"seconds": 0.025544416000000014,
3636
"ok": true,
3737
"error": null
3838
},
3939
"6_placebo_refit_pre_period": {
40-
"seconds": 0.07622437500000001,
40+
"seconds": 0.060123250000000183,
4141
"ok": true,
4242
"error": null
4343
},
4444
"7_event_study_plus_honest_did": {
45-
"seconds": 0.07503525,
45+
"seconds": 0.07500912500000001,
4646
"ok": true,
4747
"error": null
4848
}

benchmarks/speed_review/baselines/brand_awareness_survey_small_python.json

Lines changed: 11 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -2,47 +2,47 @@
22
"scenario": "brand_awareness_survey_small",
33
"backend": "python",
44
"has_rust_backend": false,
5-
"total_seconds": 0.21868924999999995,
5+
"total_seconds": 0.202955542,
66
"memory": {
77
"available": true,
8-
"start_mb": 115.31,
9-
"peak_mb": 127.31,
10-
"growth_mb": 12.0,
8+
"start_mb": 115.48,
9+
"peak_mb": 126.53,
10+
"growth_mb": 11.05,
1111
"sampler_interval_s": 0.01
1212
},
1313
"phases": {
1414
"1_naive_fit_no_survey_design": {
15-
"seconds": 0.002108082999999983,
15+
"seconds": 0.0013837089999999552,
1616
"ok": true,
1717
"error": null
1818
},
1919
"2_tsl_strata_psu_fpc": {
20-
"seconds": 0.00682429200000001,
20+
"seconds": 0.006605916000000045,
2121
"ok": true,
2222
"error": null
2323
},
2424
"3_replicate_weights_jk1": {
25-
"seconds": 0.024697250000000004,
25+
"seconds": 0.020932875000000073,
2626
"ok": true,
2727
"error": null
2828
},
2929
"4_multi_outcome_loop_3_metrics": {
30-
"seconds": 0.05683933299999999,
30+
"seconds": 0.05001366600000001,
3131
"ok": true,
3232
"error": null
3333
},
3434
"5_check_parallel_trends": {
35-
"seconds": 0.009950499999999973,
35+
"seconds": 0.00929579199999997,
3636
"ok": true,
3737
"error": null
3838
},
3939
"6_placebo_refit_pre_period": {
40-
"seconds": 0.028082541999999933,
40+
"seconds": 0.028791582999999954,
4141
"ok": true,
4242
"error": null
4343
},
4444
"7_event_study_plus_honest_did": {
45-
"seconds": 0.09016795900000008,
45+
"seconds": 0.08592387499999998,
4646
"ok": true,
4747
"error": null
4848
}

benchmarks/speed_review/baselines/brand_awareness_survey_small_rust.json

Lines changed: 11 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -2,47 +2,47 @@
22
"scenario": "brand_awareness_survey_small",
33
"backend": "rust",
44
"has_rust_backend": true,
5-
"total_seconds": 0.22938095800000002,
5+
"total_seconds": 0.209049209,
66
"memory": {
77
"available": true,
8-
"start_mb": 115.72,
9-
"peak_mb": 129.34,
10-
"growth_mb": 13.62,
8+
"start_mb": 115.23,
9+
"peak_mb": 128.47,
10+
"growth_mb": 13.23,
1111
"sampler_interval_s": 0.01
1212
},
1313
"phases": {
1414
"1_naive_fit_no_survey_design": {
15-
"seconds": 0.002027083000000096,
15+
"seconds": 0.0016902500000000042,
1616
"ok": true,
1717
"error": null
1818
},
1919
"2_tsl_strata_psu_fpc": {
20-
"seconds": 0.006830167000000054,
20+
"seconds": 0.005705791000000016,
2121
"ok": true,
2222
"error": null
2323
},
2424
"3_replicate_weights_jk1": {
25-
"seconds": 0.028238708000000057,
25+
"seconds": 0.01469479100000004,
2626
"ok": true,
2727
"error": null
2828
},
2929
"4_multi_outcome_loop_3_metrics": {
30-
"seconds": 0.06599037499999993,
30+
"seconds": 0.05941337499999999,
3131
"ok": true,
3232
"error": null
3333
},
3434
"5_check_parallel_trends": {
35-
"seconds": 0.010059291999999997,
35+
"seconds": 0.009663624999999954,
3636
"ok": true,
3737
"error": null
3838
},
3939
"6_placebo_refit_pre_period": {
40-
"seconds": 0.028069124999999917,
40+
"seconds": 0.02708766600000001,
4141
"ok": true,
4242
"error": null
4343
},
4444
"7_event_study_plus_honest_did": {
45-
"seconds": 0.08814829099999999,
45+
"seconds": 0.090782625,
4646
"ok": true,
4747
"error": null
4848
}

0 commit comments

Comments
 (0)