igerber
diff --git a/‎benchmarks/speed_review/README.md‎
Lines changed: 76 additions & 0 deletions b/‎benchmarks/speed_review/README.md‎
Lines changed: 76 additions & 0 deletions
diff --git a/‎benchmarks/speed_review/baselines/.gitignore‎
Lines changed: 1 addition & 0 deletions b/‎benchmarks/speed_review/baselines/.gitignore‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎benchmarks/speed_review/baselines/brand_awareness_survey_python.json‎
Lines changed: 58 additions & 0 deletions b/‎benchmarks/speed_review/baselines/brand_awareness_survey_python.json‎
Lines changed: 58 additions & 0 deletions
diff --git a/‎benchmarks/speed_review/baselines/brand_awareness_survey_rust.json‎
Lines changed: 58 additions & 0 deletions b/‎benchmarks/speed_review/baselines/brand_awareness_survey_rust.json‎
Lines changed: 58 additions & 0 deletions
diff --git a/‎benchmarks/speed_review/baselines/brfss_panel_python.json‎
Lines changed: 48 additions & 0 deletions b/‎benchmarks/speed_review/baselines/brfss_panel_python.json‎
Lines changed: 48 additions & 0 deletions
diff --git a/‎benchmarks/speed_review/baselines/brfss_panel_rust.json‎
Lines changed: 48 additions & 0 deletions b/‎benchmarks/speed_review/baselines/brfss_panel_rust.json‎
Lines changed: 48 additions & 0 deletions
diff --git a/‎benchmarks/speed_review/baselines/campaign_staggered_python.json‎
Lines changed: 62 additions & 0 deletions b/‎benchmarks/speed_review/baselines/campaign_staggered_python.json‎
Lines changed: 62 additions & 0 deletions
@@ -0,0 +1,76 @@
+# Speed Review — Practitioner Workflow Benchmarks
+
+Scenario-driven performance measurement for end-to-end practitioner chains,
+as distinct from `benchmarks/run_benchmarks.py`, which measures R-parity on
+isolated `fit()` calls.
+
+## Why these exist
+
+See [`docs/performance-scenarios.md`](../../docs/performance-scenarios.md) for
+the full methodology. Short version: the existing benchmarks measure
+`fit()` in isolation on 200 x 8 synthetic panels, which does not reflect what
+a practitioner running the 8-step Baker et al. (2025) workflow on a real
+BRFSS or geo-experiment panel actually sees. These scripts measure the full
+chain (Bacon -> fit -> HonestDiD -> cross-estimator robustness -> reporting)
+at data shapes anchored to applied-econ conventions.
+
+## Layout
+
+```
+benchmarks/speed_review/
+├── README.md                           # this file
+├── bench_shared.py                     # timing + pyinstrument harness
+├── run_all.py                          # orchestrator (both backends)
+├── bench_campaign_staggered.py         # Scenario 1: CS + 8-step chain
+├── bench_brand_awareness_survey.py     # Scenario 2: DiD + SurveyDesign
+├── bench_brfss_panel.py                # Scenario 3: aggregate_survey -> CS
+├── bench_geo_few_markets.py            # Scenario 4: SDiD + jackknife
+├── bench_reversible_dcdh.py            # Scenario 5: dCDH L_max + TSL
+├── bench_dose_response.py              # Scenario 6: ContinuousDiD splines
+├── bench_callaway.py                   # pre-existing CS scaling sweep
+├── baseline_results.json               # pre-existing CS baseline
+└── baselines/                          # this effort's output
+    ├── <scenario>_<backend>.json       # phase-level wall-clock (committed)
+    └── profiles/                       # flame HTMLs (gitignored)
+        └── <scenario>_<backend>.html   # pyinstrument flame output
+```
+
+**Note on profile HTMLs.** pyinstrument flames are ~500 KB to 1.2 MB each and
+are regenerated on every run; they live under `baselines/profiles/`, which is
+gitignored. The key hotspots identified from them are already captured in
+the findings doc, [`docs/performance-plan.md`](../../docs/performance-plan.md)
+(top-5 hot phases per scenario); run a scenario locally to regenerate the
+full flame when needed.
+
+## Running
+
+```bash
+# One-time install
+pip install pyinstrument
+
+# All scenarios, both backends
+python benchmarks/speed_review/run_all.py
+
+# One scenario, one backend
+DIFF_DIFF_BACKEND=rust python benchmarks/speed_review/bench_campaign_staggered.py
+
+# Subset
+python benchmarks/speed_review/run_all.py --scenarios brfss_panel geo_few_markets
+```
+
+## Where to look for findings
+
+[`docs/performance-plan.md`](../../docs/performance-plan.md) — "Practitioner
+Workflow Baseline (v3.1.3)" section holds per-scenario hot-phase rankings
+and action recommendations. The scenarios here are the measurement surface;
+the findings doc is the decision output.
+
+## Adding a scenario
+
+1. Add the scenario definition to `docs/performance-scenarios.md`
+   (persona, data shape, operation chain, source anchor).
+2. Add `bench_<name>.py` following the existing scripts: build data, define
+   `phases` as a list of `(label, callable)` tuples, call `run_scenario`.
+3. Register it in `run_all.py`'s `SCRIPTS` dict.
+4. Run under both backends and commit the refreshed `baselines/*.json`. Do
+   not commit `baselines/profiles/*.html`: that directory is gitignored and
+   the flames are regenerated on every run.
+5. Add a per-scenario finding paragraph to `docs/performance-plan.md`.
@@ -0,0 +1 @@
+profiles/
@@ -0,0 +1,58 @@
+{
+  "scenario": "brand_awareness_survey",
+  "backend": "python",
+  "has_rust_backend": false,
+  "total_seconds": 0.18850491600000008,
+  "phases": {
+    "1_naive_fit_no_survey_design": {
+      "seconds": 0.0016701670000000002,
+      "ok": true,
+      "error": null
+    },
+    "2_tsl_strata_psu_fpc": {
+      "seconds": 0.006741541999999989,
+      "ok": true,
+      "error": null
+    },
+    "3_replicate_weights_brr": {
+      "seconds": 0.014424250000000027,
+      "ok": true,
+      "error": null
+    },
+    "4_multi_outcome_loop_3_metrics": {
+      "seconds": 0.043619666,
+      "ok": true,
+      "error": null
+    },
+    "5_check_parallel_trends": {
+      "seconds": 0.00915220799999994,
+      "ok": true,
+      "error": null
+    },
+    "6_placebo_refit_pre_period": {
+      "seconds": 0.029268290999999946,
+      "ok": true,
+      "error": null
+    },
+    "7_event_study_plus_honest_did": {
+      "seconds": 0.08362433400000002,
+      "ok": true,
+      "error": null
+    }
+  },
+  "metadata": {
+    "n_units": 200,
+    "n_periods": 12,
+    "n_obs": 2400,
+    "n_strata": 10,
+    "n_psu_per_stratum": 4,
+    "n_replicate_weights": 40,
+    "outcomes": [
+      "outcome",
+      "consideration",
+      "purchase_intent"
+    ]
+  },
+  "diff_diff_version": "3.1.3",
+  "numpy_version": "2.0.2"
+}
@@ -0,0 +1,58 @@
+{
+  "scenario": "brand_awareness_survey",
+  "backend": "rust",
+  "has_rust_backend": true,
+  "total_seconds": 0.16800324999999994,
+  "phases": {
+    "1_naive_fit_no_survey_design": {
+      "seconds": 0.0018907079999999077,
+      "ok": true,
+      "error": null
+    },
+    "2_tsl_strata_psu_fpc": {
+      "seconds": 0.006109541999999912,
+      "ok": true,
+      "error": null
+    },
+    "3_replicate_weights_brr": {
+      "seconds": 0.01849195799999992,
+      "ok": true,
+      "error": null
+    },
+    "4_multi_outcome_loop_3_metrics": {
+      "seconds": 0.02723191700000005,
+      "ok": true,
+      "error": null
+    },
+    "5_check_parallel_trends": {
+      "seconds": 0.009134625000000063,
+      "ok": true,
+      "error": null
+    },
+    "6_placebo_refit_pre_period": {
+      "seconds": 0.024182666999999936,
+      "ok": true,
+      "error": null
+    },
+    "7_event_study_plus_honest_did": {
+      "seconds": 0.08095333299999996,
+      "ok": true,
+      "error": null
+    }
+  },
+  "metadata": {
+    "n_units": 200,
+    "n_periods": 12,
+    "n_obs": 2400,
+    "n_strata": 10,
+    "n_psu_per_stratum": 4,
+    "n_replicate_weights": 40,
+    "outcomes": [
+      "outcome",
+      "consideration",
+      "purchase_intent"
+    ]
+  },
+  "diff_diff_version": "3.1.3",
+  "numpy_version": "2.0.2"
+}
@@ -0,0 +1,48 @@
+{
+  "scenario": "brfss_panel",
+  "backend": "python",
+  "has_rust_backend": false,
+  "total_seconds": 1.599043583,
+  "phases": {
+    "1_aggregate_survey_microdata_to_panel": {
+      "seconds": 1.530210625,
+      "ok": true,
+      "error": null
+    },
+    "2_cs_fit_with_stage2_survey_design": {
+      "seconds": 0.014581666999999854,
+      "ok": true,
+      "error": null
+    },
+    "3_inspect_pretrends": {
+      "seconds": 1.8749999997069722e-06,
+      "ok": true,
+      "error": null
+    },
+    "4_honest_did_grid": {
+      "seconds": 0.003660958000000214,
+      "ok": true,
+      "error": null
+    },
+    "5_sun_abraham_robustness": {
+      "seconds": 0.05053487499999987,
+      "ok": true,
+      "error": null
+    },
+    "6_practitioner_next_steps": {
+      "seconds": 4.9042000000110164e-05,
+      "ok": true,
+      "error": null
+    }
+  },
+  "metadata": {
+    "n_microdata_rows": 50000,
+    "n_states": 50,
+    "n_years": 10,
+    "n_strata": 10,
+    "n_psu": 200,
+    "n_bootstrap": 199
+  },
+  "diff_diff_version": "3.1.3",
+  "numpy_version": "2.0.2"
+}
@@ -0,0 +1,48 @@
+{
+  "scenario": "brfss_panel",
+  "backend": "rust",
+  "has_rust_backend": true,
+  "total_seconds": 1.5960411249999997,
+  "phases": {
+    "1_aggregate_survey_microdata_to_panel": {
+      "seconds": 1.5271849580000003,
+      "ok": true,
+      "error": null
+    },
+    "2_cs_fit_with_stage2_survey_design": {
+      "seconds": 0.014870542000000153,
+      "ok": true,
+      "error": null
+    },
+    "3_inspect_pretrends": {
+      "seconds": 2.208000000170074e-06,
+      "ok": true,
+      "error": null
+    },
+    "4_honest_did_grid": {
+      "seconds": 0.003847707999999894,
+      "ok": true,
+      "error": null
+    },
+    "5_sun_abraham_robustness": {
+      "seconds": 0.05008866700000025,
+      "ok": true,
+      "error": null
+    },
+    "6_practitioner_next_steps": {
+      "seconds": 4.3584000000151946e-05,
+      "ok": true,
+      "error": null
+    }
+  },
+  "metadata": {
+    "n_microdata_rows": 50000,
+    "n_states": 50,
+    "n_years": 10,
+    "n_strata": 10,
+    "n_psu": 200,
+    "n_bootstrap": 199
+  },
+  "diff_diff_version": "3.1.3",
+  "numpy_version": "2.0.2"
+}
@@ -0,0 +1,62 @@
+{
+  "scenario": "campaign_staggered",
+  "backend": "python",
+  "has_rust_backend": false,
+  "total_seconds": 0.493763792,
+  "phases": {
+    "1_bacon_decomposition": {
+      "seconds": 0.00662462499999994,
+      "ok": true,
+      "error": null
+    },
+    "2_cs_fit_with_covariates_bootstrap999": {
+      "seconds": 0.06328537499999998,
+      "ok": true,
+      "error": null
+    },
+    "3_inspect_pretrends": {
+      "seconds": 3.3750000000276614e-06,
+      "ok": true,
+      "error": null
+    },
+    "4_honest_did_M_grid": {
+      "seconds": 0.0047993339999999884,
+      "ok": true,
+      "error": null
+    },
+    "5_sun_abraham_robustness": {
+      "seconds": 0.09586058399999997,
+      "ok": true,
+      "error": null
+    },
+    "6_imputation_did_robustness": {
+      "seconds": 0.29060341599999995,
+      "ok": true,
+      "error": null
+    },
+    "7_cs_without_covariates": {
+      "seconds": 0.03254304100000005,
+      "ok": true,
+      "error": null
+    },
+    "8_practitioner_next_steps": {
+      "seconds": 3.7708000000025166e-05,
+      "ok": true,
+      "error": null
+    }
+  },
+  "metadata": {
+    "n_units": 150,
+    "n_periods": 26,
+    "n_cohorts": 2,
+    "covariates": [
+      "log_pop",
+      "baseline_spend"
+    ],
+    "n_bootstrap": 999,
+    "aggregate": "all",
+    "estimation_method": "dr"
+  },
+  "diff_diff_version": "3.1.3",
+  "numpy_version": "2.0.2"
+}