Skip to content

Commit 33d55fb

Browse files
igerber and claude committed
Extend scenarios with scale sweep, confirm aggregate_survey bottleneck
Four scenarios (campaign_staggered, brand_awareness_survey, brfss_panel, geo_few_markets) now run at small/medium/large data scales rather than a single tutorial-scale point. The large scales reflect practitioner realism: 1M-row BRFSS pooled panels, 1,500-unit county-level staggered studies, 1,000-unit multi-region brand surveys, 500-unit zip-level geo-experiments. Key finding from the sweep: aggregate_survey at 1M microdata rows takes ~24 seconds (100% of BRFSS chain runtime), with 97% of that in _compute_stratified_psu_meat self-time. The tutorial-scale pass had flagged this as a 1.5s finding; at practitioner scale it is 15-20x larger and becomes the single highest-value optimization target identified. The other four findings hold across scales: CS chain scales well to 1,500 units, brand-survey chain scales sub-linearly, SDiD Rust gap is stable, ImputationDiD remains the top phase of the staggered chain at all scales. Measurement only. docs/performance-plan.md and docs/performance-scenarios.md updated with scale-sweep tables and scaling-finding narrative. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
1 parent de6ce63 commit 33d55fb

34 files changed

Lines changed: 1285 additions & 334 deletions

benchmarks/speed_review/README.md

Lines changed: 10 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
# Speed Review Practitioner Workflow Benchmarks
1+
# Speed Review - Practitioner Workflow Benchmarks
22

33
Scenario-driven performance measurement for end-to-end practitioner chains,
44
as distinct from `benchmarks/run_benchmarks.py` which measures R-parity on
@@ -47,19 +47,25 @@ to regenerate the full flame when needed.
4747
# One-time install
4848
pip install pyinstrument
4949

50-
# All scenarios, both backends
50+
# All scenarios, both backends, all scales
5151
python benchmarks/speed_review/run_all.py
5252

53-
# One scenario, one backend
53+
# One scenario, one backend (the script runs its full scale sweep internally)
5454
DIFF_DIFF_BACKEND=rust python benchmarks/speed_review/bench_campaign_staggered.py
5555

5656
# Subset
5757
python benchmarks/speed_review/run_all.py --scenarios brfss_panel geo_few_markets
5858
```
5959

60+
Multi-scale scenarios write per-scale outputs
61+
(e.g. `campaign_staggered_small_rust.json`, `..._medium_rust.json`,
62+
`..._large_rust.json`). Single-scale scenarios write the scale-free form
63+
(e.g. `dose_response_rust.json`). Full runtime for all scales × both
64+
backends is ~90 seconds on Apple Silicon M4.
65+
6066
## Where to look for findings
6167

62-
[`docs/performance-plan.md`](../../docs/performance-plan.md) "Practitioner
68+
[`docs/performance-plan.md`](../../docs/performance-plan.md) - "Practitioner
6369
Workflow Baseline (v3.1.3)" section holds per-scenario hot-phase rankings
6470
and action recommendations. The scenarios here are the measurement surface;
6571
the findings doc is the decision output.
Lines changed: 59 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,59 @@
1+
{
2+
"scenario": "brand_awareness_survey_large",
3+
"backend": "python",
4+
"has_rust_backend": false,
5+
"total_seconds": 0.7940070000000001,
6+
"phases": {
7+
"1_naive_fit_no_survey_design": {
8+
"seconds": 0.013499665999999966,
9+
"ok": true,
10+
"error": null
11+
},
12+
"2_tsl_strata_psu_fpc": {
13+
"seconds": 0.03187458300000001,
14+
"ok": true,
15+
"error": null
16+
},
17+
"3_replicate_weights_brr": {
18+
"seconds": 0.3442796670000001,
19+
"ok": true,
20+
"error": null
21+
},
22+
"4_multi_outcome_loop_3_metrics": {
23+
"seconds": 0.19682533299999982,
24+
"ok": true,
25+
"error": null
26+
},
27+
"5_check_parallel_trends": {
28+
"seconds": 0.030179500000000026,
29+
"ok": true,
30+
"error": null
31+
},
32+
"6_placebo_refit_pre_period": {
33+
"seconds": 0.043751333999999975,
34+
"ok": true,
35+
"error": null
36+
},
37+
"7_event_study_plus_honest_did": {
38+
"seconds": 0.13358487500000016,
39+
"ok": true,
40+
"error": null
41+
}
42+
},
43+
"metadata": {
44+
"scale": "large",
45+
"n_units": 1000,
46+
"n_periods": 12,
47+
"n_obs": 12000,
48+
"n_strata": 20,
49+
"n_psu_per_stratum": 8,
50+
"n_replicate_weights": 160,
51+
"outcomes": [
52+
"outcome",
53+
"consideration",
54+
"purchase_intent"
55+
]
56+
},
57+
"diff_diff_version": "3.1.3",
58+
"numpy_version": "2.0.2"
59+
}
Lines changed: 59 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,59 @@
1+
{
2+
"scenario": "brand_awareness_survey_large",
3+
"backend": "rust",
4+
"has_rust_backend": true,
5+
"total_seconds": 0.828119375,
6+
"phases": {
7+
"1_naive_fit_no_survey_design": {
8+
"seconds": 0.014049749999999861,
9+
"ok": true,
10+
"error": null
11+
},
12+
"2_tsl_strata_psu_fpc": {
13+
"seconds": 0.029422499999999907,
14+
"ok": true,
15+
"error": null
16+
},
17+
"3_replicate_weights_brr": {
18+
"seconds": 0.36754912500000003,
19+
"ok": true,
20+
"error": null
21+
},
22+
"4_multi_outcome_loop_3_metrics": {
23+
"seconds": 0.16490987499999998,
24+
"ok": true,
25+
"error": null
26+
},
27+
"5_check_parallel_trends": {
28+
"seconds": 0.03375229199999996,
29+
"ok": true,
30+
"error": null
31+
},
32+
"6_placebo_refit_pre_period": {
33+
"seconds": 0.06475750000000025,
34+
"ok": true,
35+
"error": null
36+
},
37+
"7_event_study_plus_honest_did": {
38+
"seconds": 0.15367104200000004,
39+
"ok": true,
40+
"error": null
41+
}
42+
},
43+
"metadata": {
44+
"scale": "large",
45+
"n_units": 1000,
46+
"n_periods": 12,
47+
"n_obs": 12000,
48+
"n_strata": 20,
49+
"n_psu_per_stratum": 8,
50+
"n_replicate_weights": 160,
51+
"outcomes": [
52+
"outcome",
53+
"consideration",
54+
"purchase_intent"
55+
]
56+
},
57+
"diff_diff_version": "3.1.3",
58+
"numpy_version": "2.0.2"
59+
}
Lines changed: 59 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,59 @@
1+
{
2+
"scenario": "brand_awareness_survey_medium",
3+
"backend": "python",
4+
"has_rust_backend": false,
5+
"total_seconds": 0.48956791599999994,
6+
"phases": {
7+
"1_naive_fit_no_survey_design": {
8+
"seconds": 0.01289191699999992,
9+
"ok": true,
10+
"error": null
11+
},
12+
"2_tsl_strata_psu_fpc": {
13+
"seconds": 0.035409875000000035,
14+
"ok": true,
15+
"error": null
16+
},
17+
"3_replicate_weights_brr": {
18+
"seconds": 0.12633833299999997,
19+
"ok": true,
20+
"error": null
21+
},
22+
"4_multi_outcome_loop_3_metrics": {
23+
"seconds": 0.17774295900000003,
24+
"ok": true,
25+
"error": null
26+
},
27+
"5_check_parallel_trends": {
28+
"seconds": 0.018629792000000034,
29+
"ok": true,
30+
"error": null
31+
},
32+
"6_placebo_refit_pre_period": {
33+
"seconds": 0.0519646250000001,
34+
"ok": true,
35+
"error": null
36+
},
37+
"7_event_study_plus_honest_did": {
38+
"seconds": 0.06657341699999986,
39+
"ok": true,
40+
"error": null
41+
}
42+
},
43+
"metadata": {
44+
"scale": "medium",
45+
"n_units": 500,
46+
"n_periods": 12,
47+
"n_obs": 6000,
48+
"n_strata": 15,
49+
"n_psu_per_stratum": 6,
50+
"n_replicate_weights": 90,
51+
"outcomes": [
52+
"outcome",
53+
"consideration",
54+
"purchase_intent"
55+
]
56+
},
57+
"diff_diff_version": "3.1.3",
58+
"numpy_version": "2.0.2"
59+
}
Lines changed: 59 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,59 @@
1+
{
2+
"scenario": "brand_awareness_survey_medium",
3+
"backend": "rust",
4+
"has_rust_backend": true,
5+
"total_seconds": 0.535454792,
6+
"phases": {
7+
"1_naive_fit_no_survey_design": {
8+
"seconds": 0.011897708999999979,
9+
"ok": true,
10+
"error": null
11+
},
12+
"2_tsl_strata_psu_fpc": {
13+
"seconds": 0.03526237499999996,
14+
"ok": true,
15+
"error": null
16+
},
17+
"3_replicate_weights_brr": {
18+
"seconds": 0.185435083,
19+
"ok": true,
20+
"error": null
21+
},
22+
"4_multi_outcome_loop_3_metrics": {
23+
"seconds": 0.14044966699999994,
24+
"ok": true,
25+
"error": null
26+
},
27+
"5_check_parallel_trends": {
28+
"seconds": 0.019051875000000162,
29+
"ok": true,
30+
"error": null
31+
},
32+
"6_placebo_refit_pre_period": {
33+
"seconds": 0.05337804200000007,
34+
"ok": true,
35+
"error": null
36+
},
37+
"7_event_study_plus_honest_did": {
38+
"seconds": 0.08997387500000009,
39+
"ok": true,
40+
"error": null
41+
}
42+
},
43+
"metadata": {
44+
"scale": "medium",
45+
"n_units": 500,
46+
"n_periods": 12,
47+
"n_obs": 6000,
48+
"n_strata": 15,
49+
"n_psu_per_stratum": 6,
50+
"n_replicate_weights": 90,
51+
"outcomes": [
52+
"outcome",
53+
"consideration",
54+
"purchase_intent"
55+
]
56+
},
57+
"diff_diff_version": "3.1.3",
58+
"numpy_version": "2.0.2"
59+
}

benchmarks/speed_review/baselines/brand_awareness_survey_python.json renamed to benchmarks/speed_review/baselines/brand_awareness_survey_small_python.json

Lines changed: 10 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1,46 +1,47 @@
11
{
2-
"scenario": "brand_awareness_survey",
2+
"scenario": "brand_awareness_survey_small",
33
"backend": "python",
44
"has_rust_backend": false,
5-
"total_seconds": 0.18850491600000008,
5+
"total_seconds": 0.15087129199999993,
66
"phases": {
77
"1_naive_fit_no_survey_design": {
8-
"seconds": 0.0016701670000000002,
8+
"seconds": 0.0017902499999999932,
99
"ok": true,
1010
"error": null
1111
},
1212
"2_tsl_strata_psu_fpc": {
13-
"seconds": 0.006741541999999989,
13+
"seconds": 0.00610949999999999,
1414
"ok": true,
1515
"error": null
1616
},
1717
"3_replicate_weights_brr": {
18-
"seconds": 0.014424250000000027,
18+
"seconds": 0.02120725000000001,
1919
"ok": true,
2020
"error": null
2121
},
2222
"4_multi_outcome_loop_3_metrics": {
23-
"seconds": 0.043619666,
23+
"seconds": 0.011621500000000062,
2424
"ok": true,
2525
"error": null
2626
},
2727
"5_check_parallel_trends": {
28-
"seconds": 0.00915220799999994,
28+
"seconds": 0.001833375000000026,
2929
"ok": true,
3030
"error": null
3131
},
3232
"6_placebo_refit_pre_period": {
33-
"seconds": 0.029268290999999946,
33+
"seconds": 0.027076792000000016,
3434
"ok": true,
3535
"error": null
3636
},
3737
"7_event_study_plus_honest_did": {
38-
"seconds": 0.08362433400000002,
38+
"seconds": 0.081212583,
3939
"ok": true,
4040
"error": null
4141
}
4242
},
4343
"metadata": {
44+
"scale": "small",
4445
"n_units": 200,
4546
"n_periods": 12,
4647
"n_obs": 2400,

benchmarks/speed_review/baselines/brand_awareness_survey_rust.json renamed to benchmarks/speed_review/baselines/brand_awareness_survey_small_rust.json

Lines changed: 10 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1,46 +1,47 @@
11
{
2-
"scenario": "brand_awareness_survey",
2+
"scenario": "brand_awareness_survey_small",
33
"backend": "rust",
44
"has_rust_backend": true,
5-
"total_seconds": 0.16800324999999994,
5+
"total_seconds": 0.200881125,
66
"phases": {
77
"1_naive_fit_no_survey_design": {
8-
"seconds": 0.0018907079999999077,
8+
"seconds": 0.0018462080000000158,
99
"ok": true,
1010
"error": null
1111
},
1212
"2_tsl_strata_psu_fpc": {
13-
"seconds": 0.006109541999999912,
13+
"seconds": 0.005704333000000061,
1414
"ok": true,
1515
"error": null
1616
},
1717
"3_replicate_weights_brr": {
18-
"seconds": 0.01849195799999992,
18+
"seconds": 0.015561500000000006,
1919
"ok": true,
2020
"error": null
2121
},
2222
"4_multi_outcome_loop_3_metrics": {
23-
"seconds": 0.02723191700000005,
23+
"seconds": 0.05937758399999993,
2424
"ok": true,
2525
"error": null
2626
},
2727
"5_check_parallel_trends": {
28-
"seconds": 0.009134625000000063,
28+
"seconds": 0.00939004099999996,
2929
"ok": true,
3030
"error": null
3131
},
3232
"6_placebo_refit_pre_period": {
33-
"seconds": 0.024182666999999936,
33+
"seconds": 0.025794415999999987,
3434
"ok": true,
3535
"error": null
3636
},
3737
"7_event_study_plus_honest_did": {
38-
"seconds": 0.08095333299999996,
38+
"seconds": 0.08319054199999998,
3939
"ok": true,
4040
"error": null
4141
}
4242
},
4343
"metadata": {
44+
"scale": "small",
4445
"n_units": 200,
4546
"n_periods": 12,
4647
"n_obs": 2400,

0 commit comments

Comments
 (0)