diff --git a/CMakeLists.txt b/CMakeLists.txt
index 22c0d7027b..08dbe5aedc 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1359,6 +1359,7 @@ SET(DAS_SIMULATE_INCLUDES
     include/daScript/simulate/bind_enum.h
     include/daScript/simulate/bin_serializer.h
     include/daScript/simulate/cast.h
+    include/daScript/simulate/das_qsort_r.h
     include/daScript/simulate/data_walker.h
     include/daScript/simulate/debug_info.h
     include/daScript/simulate/debug_print.h
diff --git a/benchmarks/sql/LINQ.md b/benchmarks/sql/LINQ.md
index 77bc2972c4..7a84d4b201 100644
--- a/benchmarks/sql/LINQ.md
+++ b/benchmarks/sql/LINQ.md
@@ -4,17 +4,17 @@ Project notes and progress for the `daslib/linq_fold` macro family, modeled afte
 
 ## What this is
 
-The current `_fold(...)` macro in linq_boost wraps LINQ pipelines into intermediate `array<T>` per stage and pattern-matches a small set of common shapes (`where+count`, `where+select`, `select+where`, `order+distinct`, `where`, `select`) for ad-hoc fusion. Everything else falls through to a default emitter that builds nested `var pass_N <- pass_(N-1) |> next(...)` — one fresh array per stage, every predicate called via lambda dispatch.
+`_fold(chain)` is a three-tier cascade — always-safe, never breaks semantics, only ever faster:
 
-The goal is a planner-driven dispatch macro that emits one fused for-loop with predicates inlined (splice mode), materializing only when an operator genuinely needs random access. Three output modes:
-
-1. **Splice** (default): producer body inlined into consumer's loop. Zero allocation, zero per-element dispatch.
-2. **Array intermediate**: when a downstream op needs random access / multi-pass / length (`sort`, `reverse`, `distinct`, `groupby`). Once we go array, we stay array (iterating an array is faster than iterating an iterator).
-3. **Helper-call fallback**: when splice can't apply at all (escape into `let`, opaque source). Emits calls to named helper functions in `linq_fold`.
+1. **Splice** (tier 1): a fused for-loop with predicates and projections inlined directly. Zero per-element lambda dispatch, zero intermediate iterators. Two planners feed this tier:
+   - **`plan_order_family`** — handles chains containing any of `order` / `order_by` / `order_descending` / `order_by_descending`, optionally with `take(K)` and/or a `where_*` prefilter. Dispatches to `top_n*` helpers or emits a fused prefilter loop + `order*_inplace` on a buffer.
+   - **`plan_loop_or_count`** — handles `[where_*][select*][skip?][take?]` followed by a recognized terminator (count / sum / min / max / average / long_count / first / first_or_default / any / all / contains / to-array). Includes the where-after-select arm via `replaceVariablePeeling` for the peel-aware substitution required on typed AST.
+2. **`fold_linq_default`** (tier 2): an array-shape pipeline (`call → array → call → array`) with `_inplace` variants reusing the same buffer and explicit `delete` of the previous stage. Used when no splice arm matches but the chain has linq operators.
+3. **Raw clone** (tier 3): passthrough when `flatten_linq` finds no recognized linq operators in the chain at all.
 
 Lambda inlining is best-effort: literal `@(x) => expr` at the call site → splice the body; otherwise → call.
 
-See `~/.claude/plans/keen-hopping-balloon.md` for the long-form plan.
+See `~/.claude/plans/keen-hopping-balloon.md` and `~/.claude/plans/enumerated-baking-elephant.md` for the long-form plans.
 
 ## Phase status
 
@@ -27,43 +27,44 @@ See `~/.claude/plans/keen-hopping-balloon.md` for the long-form plan.
 | 2B Ring 2 | Early-exit lane: `first`, `first_or_default`, `any`, `all`, `contains` via `invoke($block { ... return val })`. Predicate-free `any` gets a `length(src) > 0` shortcut. | ✅ done |
 | 2C Ring 3 | `take(N)` / `skip(N)` in counter/array/accumulator/early-exit lanes. Canonical chain order `[where_*][select*][skip?][take?] |> terminator`. Trailing take/skip (no explicit aggregator) → ARRAY lane with implicit `to_array`. Range-form `take(start..end)` falls through (slice operator, different semantics). Buffer-required ops (`order_by`, `distinct`, `reverse`, `group_by`, `zip`, `join`, `left_join`, `group_join`) recognized by name and emit silent fallback with future-mode markers (BufferTopN / BufferDistinct / BufferReverse / BufferGroupBy / MultiSourceZip / BufferedJoin). | ✅ done |
 | 2C Ring 4 | Non-workhorse chained selects via `:=`-clone. | ✅ done |
-| 2D | Fail-loudly contract — see "Planned" section below | ⏳ |
 | 3 Phase 0 | `<algorithm>` sort-family bindings — `partial_sort`, `nth_element`, `make_heap`/`push_heap`/`pop_heap`. Both typed (19 workhorse types) and any-cblock paths (user structs via `das_qsort_r.h` introselect + binary-heap templates). `daslib/sort_boost.das` user-facing wrappers + `q*` dispatcher macros. `daslib/linq.das` `top_n` / `top_n_by` family (array + iterator sources). C++ tests, daslang tests (53/53), 6 benchmarks, doc grouping, `33_algorithm` tutorial expansion. Unblocks BufferTopN. | ✅ done |
-| 3+ | Buffer-required emit modes: BufferTopN (sort/order_by/take), BufferDistinct, BufferGroupBy, BufferReverse, MultiSourceZip, BufferedJoin. Once we go array, we stay array | ⏳ |
-| 4 | Final coverage pass + docs; full 4-way comparison table refresh; parity-test sweep | ⏳ |
+| 1 cascade | Retire `_old_fold`, drop the 7 `g_foldSeq` patterns, restructure `_fold` into a three-tier cascade (splice → `fold_linq_default` → raw clone). Cascade is always-safe — `_fold(chain)` is observationally equivalent to `chain`. `_select|_where` etc. cases that used to fall to raw clone now cascade to tier 2 array-shape with `_inplace` reuse. Phase 2D "fail-loudly contract" scrapped — always-safe cascade obviates it. | ✅ done |
+| 3a/b/c | BufferTopN splice arm via new `plan_order_family` planner: `[where_*]* + order/order_by/order_descending/order_by_descending [+ take(K)]`. No `take` → direct call to `order*` helper (or fused prefilter loop + `order*_inplace` when where is present). With `take(K)` → dispatch to `top_n[_by][_descending]` (or fused prefilter loop + `top_n*` on the buffer when where is present). New `top_n_by_descending` + `top_n_descending` library helpers added to `daslib/linq.das` (4 functions, mirrors of `top_n_by` / `top_n` with flipped comparator). | ✅ done |
+| 3d | `select + where + terminator` splice arm via `replaceVariablePeeling` (new helper in `daslib/templates_boost.das`). Substitutes the projection into the where predicate, peeling the typer-inserted ExprRef2Value wrappers that would otherwise be left orphaned around a non-reference value. Bails to tier 2 when the projection has side effects (would double-evaluate). Lands all four terminator lanes (ARRAY / COUNTER / ACCUMULATOR / EARLY_EXIT). | ✅ done |
+| 3+ | Remaining buffer-required emit modes (deferred): BufferDistinct, BufferGroupBy, BufferReverse, MultiSourceZip, BufferedJoin. Currently cascade to tier 2 array-shape. | ⏳ |
+| 4 | Final coverage pass + docs; full 3-way comparison table refresh; parity-test sweep | ⏳ |
 
 ## Baselines (100K rows, INTERP mode)
 
-Captured 2026-05-16 on commit `e691abe1b` + foundation PR. ns/op is **per element** (chunk_size = n = 100K), so 30 ns/op means ~3ms for the full operation. Smaller is better. m3f and m3f_old are intentionally identical in this PR — they diverge once Phase 2 lands.
+Foundation captured 2026-05-16 on commit `e691abe1b`. Phase 1 cascade + Phase 3 splice arms (PR retiring `_old_fold`) refresh the m3f column with new splice numbers — see "Headline benchmarks" below for the refreshed delta. ns/op is **per element** (chunk_size = n = 100K), so 30 ns/op means ~3ms for the full operation. Smaller is better.
 
 Notation: `—` means the variant is not applicable for this benchmark (operator has no clean form in that mode).
 
-| Benchmark | Shape | m1 (sql) | m3 (linq) | m3f_old | m3f |
-|---|---|---:|---:|---:|---:|
-| count_aggregate | `where → count` | 29 | 29 | 5 | 5 |
-| sum_aggregate | `select → sum` | 29 | 30 | 8 | 8 |
-| sum_where | `where → select → sum` | 33 | 43 | 12 | 12 |
-| min_aggregate | `select → min` | 30 | 38 | 25 | 25 |
-| max_aggregate | `select → max` | 31 | 36 | 23 | 23 |
-| average_aggregate | `select → average` | 30 | 34 | 20 | 20 |
-| first_match | `where → first` | 0\* | 28 | 15 | 15 |
-| any_match | `where → first_opt`/`any` | 0\* | 0\* | 0\* | 0\* |
-| all_match | `count(where ¬p)==0` / `all` | 27 | 20 | 24 | 25 |
-| to_array_filter | `where → select → to_array` | 70 | 43 | 11 | 11 |
-| take_count | `take → to_array` | 3 | 0\* | 0\* | 0\* |
-| skip_take | `skip → take → to_array` | 0\* | 16 | 23 | 23 |
-| distinct_count | `select → distinct → to_array` | 41 | 43 | 33 | 33 |
-| sort_first | `order_by → first` | 37 | 2170 | 2206 | 2238 |
-| sort_take | `order_by → take` | 38 | 2188 | 2247 | 2269 |
-| groupby_count | `group_by → select(_, length)` | 140 | 70 | 76 | 76 |
-| groupby_sum | `group_by → select(_, sum)` | 172 | 101 | 107 | 107 |
-| chained_where | `where → where → count` | 36 | 45 | 17 | 17 |
-| zip_dot_product | `zip → select → sum` | — | 53 | 37 | 37 |
-| join_count | `join → count` | —\*\* | 116 | 121 | 122 |
-| count_aggregate (existing) | `where → count` | 29 | 29 | 5 | 5 |
-| select_where (existing) | `where → to_array` | 7 | 50 | 12 | 12 |
-| select_where_order_take (existing) | `where → order_by → take` | 36 | 1024 | 1007 | 1014 |
-| indexed_lookup (existing) | `where id==k → count` | 1460\*\*\* | 2003299 | 336129 | 328207 |
+| Benchmark | Shape | m1 (sql) | m3 (linq) | m3f (foundation) |
+|---|---|---:|---:|---:|
+| count_aggregate | `where → count` | 29 | 29 | 5 |
+| sum_aggregate | `select → sum` | 29 | 30 | 8 |
+| sum_where | `where → select → sum` | 33 | 43 | 12 |
+| min_aggregate | `select → min` | 30 | 38 | 25 |
+| max_aggregate | `select → max` | 31 | 36 | 23 |
+| average_aggregate | `select → average` | 30 | 34 | 20 |
+| first_match | `where → first` | 0\* | 28 | 15 |
+| any_match | `where → first_opt`/`any` | 0\* | 0\* | 0\* |
+| all_match | `count(where ¬p)==0` / `all` | 27 | 20 | 25 |
+| to_array_filter | `where → select → to_array` | 70 | 43 | 11 |
+| take_count | `take → to_array` | 3 | 0\* | 0\* |
+| skip_take | `skip → take → to_array` | 0\* | 16 | 23 |
+| distinct_count | `select → distinct → to_array` | 41 | 43 | 33 |
+| sort_first | `order_by → first` | 37 | 2170 | 2238 |
+| sort_take | `order_by → take` | 38 | 2188 | 2269 |
+| groupby_count | `group_by → select(_, length)` | 140 | 70 | 76 |
+| groupby_sum | `group_by → select(_, sum)` | 172 | 101 | 107 |
+| chained_where | `where → where → count` | 36 | 45 | 17 |
+| zip_dot_product | `zip → select → sum` | — | 53 | 37 |
+| join_count | `join → count` | —\*\* | 116 | 122 |
+| select_where (existing) | `where → to_array` | 7 | 50 | 12 |
+| select_where_order_take (existing) | `where → order_by → take` | 36 | 1024 | 1014 |
+| indexed_lookup (existing) | `where id==k → count` | 1460\*\*\* | 2003299 | 328207 |
 
 \* Sub-nanosecond per element — early-exit operation hits answer in O(1) regardless of N; per-element timing collapses to 0/near-0 noise.
 
@@ -74,8 +75,7 @@ Notation: `—` means the variant is not applicable for this benchmark (operator
 ### Reading the table
 
 - **m1 vs m3** shows the SQLite-vs-in-memory-LINQ cost gap. SQL wins on `indexed_lookup` (b-tree) and on sorted-take patterns (engine partial-sort + LIMIT). Arrays win on raw aggregates where the SQL overhead exceeds the in-memory work.
-- **m3 vs m3f_old** shows what the *current* `_fold` macro already achieves. Big wins on the patterns it explicitly recognizes (`where+count` 6×, `where+select+to_array` ~4×, `chained_where+count` 2.6×). Negligible difference where it falls through to the default emitter.
-- **m3f vs m3f_old** is the target of Phase 2+. Each PR in the splice series adds a path for one operator family and updates this table with the new ratio.
+- **m3 vs m3f** shows what `_fold`'s splice cascade gains over plain LINQ. The headline shapes that now splice (after the Phase 1+3 cascade PR): order-family + take → `top_n*` dispatch, where + order [+ take] → fused prefilter + sort, select + where → peel-substituted fused loop, plus all the existing `[where_*][select*][skip?][take?]` patterns from Phase 2.
 
 ## Phase 2A — Loop planner (2026-05-16)
 
@@ -234,26 +234,51 @@ No further workhorse branches in the splice path.
 
 Ring 4 is a correctness gate (chained non-workhorse selects now splice instead of falling through), not a per-benchmark improvement on the existing 100K suite. Coverage tracked via test `test_chained_non_workhorse_select` in `tests/linq/test_linq_fold.das` (3 subtests: int → ComplexType → int → sum / where + ComplexType chain + sum / workhorse → ComplexType → workhorse → max).
 
-## Planned: fail-loudly contract
+## Phase 1 — three-tier cascade + `_old_fold` retirement (this PR)
+
+`_fold(chain)` is now a three-tier cascade. Tier 1 splice handles the hot patterns; tier 2 (`fold_linq_default`, the body that used to power `_old_fold`) emits an array-shape pipeline with `_inplace` reuse and explicit `delete`; tier 3 is a raw `clone_expression` passthrough. All three tiers preserve semantics — `_fold(chain)` is observationally equivalent to `chain`, just faster when patterns match. **Always safe to apply.**
 
-The current contract: when `_fold` can't splice a chain (out-of-scope terminator, buffer-required op, multiple take/skip, range-form take/skip, etc.), it falls through to plain linq — same as today's master. This is **temporary**. The planned contract (Boris design directive 2026-05-17): `_fold` will emit `macro_error("_fold: cannot splice — <reason>")` for any unsupported shape, mirroring the sqlite_linq `_sql(...)` "splice or error" contract.
+This obviates the previously-planned "fail-loudly contract" (Phase 2D) — the cascade always produces a valid output, so there's no need for explicit error emission on unsupported shapes.
 
-When the switch lands, every `m3f` variant currently relying on silent fallback breaks. Approximate accounting from the current benchmark suite (8 affected `m3f` variants), grouped by future emit mode that would resolve each:
+The 7 specific `FoldSequence` patterns (`fold_where_count`, `fold_where_select`, `fold_select_where`, `fold_where`, `fold_select`, `fold_order_distinct` × 2) and their `g_foldSeq` dispatch table are deleted — splice arms (existing + new in Phase 3 below) cover every shape they recognized. The `_old_fold` macro itself is deleted; the m3f_old benchmark column is dropped from all 29 files.
 
-| Benchmark | Future mode |
+## Phase 3 — order-family splice + select+where peel (this PR)
+
+**New planner: `plan_order_family`.** Called before `plan_loop_or_count` in the cascade. Recognizes chains containing any of `order` / `order_by` / `order_descending` / `order_by_descending`, optionally with one `take(K)`, optionally with `where_*` prefilters:
+
+| Chain shape | Emission |
 |---|---|
-| `distinct_count` | BufferDistinct (hash set) |
-| `sort_first` | BufferTopN (order_by + early-exit) |
-| `sort_take` | BufferTopN (order_by + take/skip) |
-| `select_where_order_take` | BufferTopN with predicate prefix |
-| `groupby_count` | BufferGroupBy (hash multi-bucket) |
-| `groupby_sum` | BufferGroupBy + nested fold inside select |
-| `zip_dot_product` | MultiSourceZip (2 cursors advanced lockstep) |
-| `join_count` | BufferedJoin (hash-build + probe) |
+| `arr \|> order[_descending]?` (bare) | Direct call: `order[_descending](arr)` |
+| `arr \|> order_by[_descending]?(key)` (bare) | Direct call: `order_by[_descending](arr, key)` |
+| `src \|> order[_descending]? \|> take(K)` | `top_n[_descending](src, K)` (existing helpers from PR #2707) |
+| `src \|> order_by[_descending]?(key) \|> take(K)` | `top_n_by[_descending](src, K, key)` (new descending helpers in this PR) |
+| `src \|> where_*(p)+ \|> order[_by]?[_descending]?[(key)]` | Fused: prefilter into pre-allocated buffer, then `order*_inplace` |
+| `src \|> where_*(p)+ \|> order[_by]?[_descending]?[(key)] \|> take(K)` | Fused: prefilter into buffer, then `top_n*` on the buffer |
+
+New `top_n_by_descending` (array + iterator) + `top_n_descending` (array + iterator) added to `daslib/linq.das` — mirror `top_n_by` / `top_n` with flipped comparator (partial_sort + reversed less for array; bounded min-heap for iterator).
+
+**New splice arm: `select + where + terminator`.** Previously rejected by `plan_loop_or_count` (where-after-select hit the ExprRef2Value substitution blocker). Unblocked via the new `replaceVariablePeeling` helper in `daslib/templates_boost.das` — substitutes the projection into the where predicate, peeling the typer-inserted `ExprRef2Value` wrapper as part of the substitution (mirrors `ast_match`'s `qm_peel_ref2value` pattern). Bails to tier 2 cascade when the projection has side effects (would double-evaluate, once in the substituted where and again at the terminator emission). Lands all four lanes: ARRAY (to_array / bare), COUNTER (count), ACCUMULATOR (sum / min / max / avg / long_count), EARLY_EXIT (first / first_or_default / any / all / contains).
+
+**Concurrent runtime fix:** [`src/builtin/module_builtin_runtime_sort.cpp:84`](../../src/builtin/module_builtin_runtime_sort.cpp#L84) — `builtin_sort_string` switched from unqualified `sort()` (= `std::sort` via `using namespace std`) to `das_sort` (the in-tree block-partition pdqsort from PR #2707). This is the runtime path `order_by<string>` takes; on Linux/libstdc++ users see the same ~1.5× sort speedup PR #2707 delivered for typed sorts.
+
+### Headline benchmarks (100K rows, INTERP, this PR)
+
+Refreshed m3f column after Phase 1 cascade + Phase 3 splice arms land. m3 is plain LINQ baseline; m3f is `_fold(...)` over the same chain.
+
+| Benchmark | Shape | m1 (sql) | m3 (linq) | m3f (this PR) | Win |
+|---|---|---:|---:|---:|---:|
+| order_take_desc | `order_by_desc → take(K)` | 38 | 698 | **56** | **12.5×** |
+| sort_take | `order_by → take(K)` | 38 | 713 | **56** | **12.7×** |
+| select_where_order_take | `where → order_by → take(K)` | 36 | 354 | **39** | **9.1×** |
+| select_where_count | `select → where → count` (NEW: peel) | 32 | 57 | **5** | **11.4×** |
+| select_where | `where → select → to_array` | 191 | 28 | **11** | **2.5×** |
+| bare_order_where | `where → order_by` (no take) | 273 | 357 | **340** | **1.05×** |
+| chained_where | `where → where → count` | 36 | 45 | **6** | **7.5×** |
+| sum_where | `where → select → sum` | 32 | 44 | **4** | **11×** |
 
-The fail-loudly PR will either (a) comment out `m3f` in the affected benchmarks until the corresponding emit mode lands, or (b) deliver one or more emit modes alongside the switch. Decision deferred to that PR.
+The order+take rows (`order_take_desc`, `sort_take`, `select_where_order_take`) come within ~1.5× of SQLite's index-aware plans by dispatching `_fold` directly to the `top_n_by[_descending]` partial-sort helpers from PR #2707 (asc dispatches to existing; desc is new in this PR). The `bare_order_where` win is small because full sort dominates — the splice saves only one intermediate allocation, not the sort cost.
 
-Tracking issue: the planner's `is_buffer_required_op` recognition + the named-arm `// TODO Phase 2X: <FutureMode>` markers are the in-code TODOs.
+`select_where_count` is the first **select+where** splice landing: previously rejected by the planner (the typer-inserted `ExprRef2Value` wrapper around `it` orphaned during substitution → `30921: can only dereference a reference`). The new `replaceVariablePeeling` helper in `templates_boost.das` peels the wrapper as part of the substitution, mirroring `ast_match`'s `qm_peel_ref2value`. All four terminator lanes (array / counter / accumulator / early-exit) covered.
 
 ## Operator-coverage checklist (parity tests)
 
@@ -292,12 +317,12 @@ dastest reports `ns/op` in INTERP mode by default. To bump dataset size as the s
 
 ## Design decisions
 
-**`_old_fold` lives alongside `_fold` in `linq_fold`, not in `linq_boost`.** Both macros share the entire dispatch infrastructure (`linqCalls`, `g_foldSeq`, `fold_*`, `flatten_linq`, `fold_linq_default`). Keeping them in one module avoids duplication; the only difference today is the macro-name string passed into `fold_linq_default`'s recursive sub-fold call.
+**Three-tier cascade, always safe.** `_fold(chain)` is observationally equivalent to `chain` — never breaks semantics, only ever faster. Splice arms cover the hot paths; `fold_linq_default` (tier 2) emits an array-shape pipeline with `_inplace` reuse for anything splice can't handle; raw clone (tier 3) handles the empty-chain edge. This obviates the fail-loudly contract that earlier plans considered.
 
-**Recursive macro-name is parameterized.** `fold_linq_default(expr, recursiveMacroName)` — `_fold` passes `"_fold"`, `_old_fold` passes `"_old_fold"`. This keeps the frozen baseline truly frozen once `_fold` diverges in Phase 2+: when `_fold` starts emitting splice loops, `_old_fold` keeps emitting the historical comprehension/invoke shape because its recursive sub-folds still target `_old_fold`.
+**`fold_linq_default` is load-bearing.** Its array-shape emission (`var pass_0 = call0(src); var pass_1 = call1_inplace(pass_0); delete pass_0; ...`) with explicit `delete` of intermediates and `_inplace` variant routing is genuinely better than plain LINQ (one buffer at a time, reused; no iterator wrappers). Kept as the cascade's tier 2.
 
 **100K rows.** daslang is interpreter-first; 100K gives sub-second-per-variant benchmark turnaround and clearly shows the asymmetries we care about. Bump later if AOT/JIT numbers warrant.
 
-**`PERF009` suppression in `fold_linq_default`.** The macro's `var pass_N = call` + later `return <- pass_N` pattern triggers PERF009 on single-pass chains (e.g. `take_count`). Rewriting to direct `return <- call` would change `_old_fold`'s baseline; we suppress inline at the qmacro_expr emission site and document why.
+**`PERF009` suppression in `fold_linq_default`.** The macro's `var pass_N = call` + later `return <- pass_N` pattern triggers PERF009 on single-pass chains. The shape is load-bearing for the array-pipeline semantics (every stage binds so the next can reuse the buffer in-place), so we suppress inline at the qmacro_expr emission site and document why.
 
 **Benchmark variants where SQL has no clean form.** `zip` (not a relational op), `_all(pred)` (no direct `_all` chain terminal in sqlite_linq), `join` with inner-select-from (wiring not exposed), `distinct |> count` (no `COUNT(DISTINCT col)` yet), `take/skip` before aggregate (LIMIT/OFFSET semantics conflict with aggregate-collapse). We either reformulate to a SQL-friendly shape (`count(where ¬p)` for all_match), omit the m1 column (zip, join), or terminate the chain in `to_array` instead of an aggregate (take/skip/distinct).
diff --git a/benchmarks/sql/all_match.das b/benchmarks/sql/all_match.das
index 18d5fc5dd4..de705ce1df 100644
--- a/benchmarks/sql/all_match.das
+++ b/benchmarks/sql/all_match.das
@@ -30,17 +30,6 @@ def run_m3(b : B?; n : int) {
         }
     }
 }
-
-def run_m3f_old(b : B?; n : int) {
-    let arr <- fixture_array(n)
-    b |> run("m3f_old_array_fold/{n}", n) {
-        let yes = _old_fold(each(arr)._all(_.price < 9999))
-        if (!yes) {
-            b->failNow()
-        }
-    }
-}
-
 def run_m3f(b : B?; n : int) {
     let arr <- fixture_array(n)
     b |> run("m3f_array_fold/{n}", n) {
@@ -61,11 +50,6 @@ def all_match_m3(b : B?) {
     run_m3(b, 100000)
 }
 
-[benchmark]
-def all_match_m3f_old(b : B?) {
-    run_m3f_old(b, 100000)
-}
-
 [benchmark]
 def all_match_m3f(b : B?) {
     run_m3f(b, 100000)
diff --git a/benchmarks/sql/any_match.das b/benchmarks/sql/any_match.das
index 74c2c151c0..70f694e9f6 100644
--- a/benchmarks/sql/any_match.das
+++ b/benchmarks/sql/any_match.das
@@ -30,17 +30,6 @@ def run_m3(b : B?; n : int) {
         }
     }
 }
-
-def run_m3f_old(b : B?; n : int) {
-    let arr <- fixture_array(n)
-    b |> run("m3f_old_array_fold/{n}", n) {
-        let yes = _old_fold(each(arr)._any(_.price > THRESHOLD))
-        if (!yes) {
-            b->failNow()
-        }
-    }
-}
-
 def run_m3f(b : B?; n : int) {
     let arr <- fixture_array(n)
     b |> run("m3f_array_fold/{n}", n) {
@@ -61,11 +50,6 @@ def any_match_m3(b : B?) {
     run_m3(b, 100000)
 }
 
-[benchmark]
-def any_match_m3f_old(b : B?) {
-    run_m3f_old(b, 100000)
-}
-
 [benchmark]
 def any_match_m3f(b : B?) {
     run_m3f(b, 100000)
diff --git a/benchmarks/sql/average_aggregate.das b/benchmarks/sql/average_aggregate.das
index 669654cfa5..af9b2adbd4 100644
--- a/benchmarks/sql/average_aggregate.das
+++ b/benchmarks/sql/average_aggregate.das
@@ -4,7 +4,7 @@ options persistent_heap
 require _common public
 
 // SQL: SELECT AVG(price) FROM Cars.
-// m3/m3f_old/m3f sum and divide on a single pass.
+// m3/m3f sum and divide on a single pass.
 
 def run_m1(b : B?; n : int) {
     with_sqlite(":memory:") $(db) {
@@ -27,17 +27,6 @@ def run_m3(b : B?; n : int) {
         }
     }
 }
-
-def run_m3f_old(b : B?; n : int) {
-    let arr <- fixture_array(n)
-    b |> run("m3f_old_array_fold/{n}", n) {
-        let a = _old_fold(each(arr)._select(double(_.price)).average())
-        if (a == 0.0lf) {
-            b->failNow()
-        }
-    }
-}
-
 def run_m3f(b : B?; n : int) {
     let arr <- fixture_array(n)
     b |> run("m3f_array_fold/{n}", n) {
@@ -58,11 +47,6 @@ def average_aggregate_m3(b : B?) {
     run_m3(b, 100000)
 }
 
-[benchmark]
-def average_aggregate_m3f_old(b : B?) {
-    run_m3f_old(b, 100000)
-}
-
 [benchmark]
 def average_aggregate_m3f(b : B?) {
     run_m3f(b, 100000)
diff --git a/benchmarks/sql/bare_order_where.das b/benchmarks/sql/bare_order_where.das
new file mode 100644
index 0000000000..744b6805e7
--- /dev/null
+++ b/benchmarks/sql/bare_order_where.das
@@ -0,0 +1,64 @@
+options gen2
+options persistent_heap
+
+require _common public
+
+let THRESHOLD = 500
+
+// _where(_.price > T) |> _order_by(_.price) — fused prefilter + sort, NO take.
+// SQL: WHERE price > T ORDER BY price.
+// m3 (plain LINQ) materializes a filter array, then sorts a clone (two allocations).
+// m3f (spliced via plan_order_family Phase 3c) emits a single fused loop that collects
+// matching elements into one pre-allocated buffer, then sort_inplace's the buffer.
+
+def run_m1(b : B?; n : int) {
+    with_sqlite(":memory:") $(db) {
+        fixture_db(db, n)
+        b |> run("m1_sql/{n}", n) {
+            let rows <- _sql(db |> select_from(type<Car>)
+                                |> _where(_.price > THRESHOLD)
+                                |> _order_by(_.price))
+            if (empty(rows)) {
+                b->failNow()
+            }
+        }
+    }
+}
+
+def run_m3(b : B?; n : int) {
+    let arr <- fixture_array(n)
+    b |> run("m3_array/{n}", n) {
+        let rows <- (arr |> _where(_.price > THRESHOLD)
+                         |> _order_by(_.price))
+        if (empty(rows)) {
+            b->failNow()
+        }
+    }
+}
+
+def run_m3f(b : B?; n : int) {
+    let arr <- fixture_array(n)
+    b |> run("m3f_array_fold/{n}", n) {
+        let rows <- _fold(each(arr)._where(_.price > THRESHOLD)
+                                   ._order_by(_.price)
+                                   .to_array())
+        if (empty(rows)) {
+            b->failNow()
+        }
+    }
+}
+
+[benchmark]
+def bare_order_where_m1(b : B?) {
+    run_m1(b, 100000)
+}
+
+[benchmark]
+def bare_order_where_m3(b : B?) {
+    run_m3(b, 100000)
+}
+
+[benchmark]
+def bare_order_where_m3f(b : B?) {
+    run_m3f(b, 100000)
+}
diff --git a/benchmarks/sql/chained_where.das b/benchmarks/sql/chained_where.das
index 5ece3df272..b433d1ca11 100644
--- a/benchmarks/sql/chained_where.das
+++ b/benchmarks/sql/chained_where.das
@@ -37,19 +37,6 @@ def run_m3(b : B?; n : int) {
         }
     }
 }
-
-def run_m3f_old(b : B?; n : int) {
-    let arr <- fixture_array(n)
-    b |> run("m3f_old_array_fold/{n}", n) {
-        let c = _old_fold(each(arr)._where(_.price > THRESHOLD)
-                                   ._where(_.year >= YEAR_FLOOR)
-                                   .count())
-        if (c == 0) {
-            b->failNow()
-        }
-    }
-}
-
 def run_m3f(b : B?; n : int) {
     let arr <- fixture_array(n)
     b |> run("m3f_array_fold/{n}", n) {
@@ -72,11 +59,6 @@ def chained_where_m3(b : B?) {
     run_m3(b, 100000)
 }
 
-[benchmark]
-def chained_where_m3f_old(b : B?) {
-    run_m3f_old(b, 100000)
-}
-
 [benchmark]
 def chained_where_m3f(b : B?) {
     run_m3f(b, 100000)
diff --git a/benchmarks/sql/contains_match.das b/benchmarks/sql/contains_match.das
index 0326357e5e..8eb6e2e976 100644
--- a/benchmarks/sql/contains_match.das
+++ b/benchmarks/sql/contains_match.das
@@ -33,17 +33,6 @@ def run_m3(b : B?; n : int) {
         }
     }
 }
-
-def run_m3f_old(b : B?; n : int) {
-    let arr <- fixture_array(n)
-    b |> run("m3f_old_array_fold/{n}", n) {
-        let yes = _old_fold(each(arr)._select(_.id).contains(TARGET_ID))
-        if (!yes) {
-            b->failNow()
-        }
-    }
-}
-
 def run_m3f(b : B?; n : int) {
     let arr <- fixture_array(n)
     b |> run("m3f_array_fold/{n}", n) {
@@ -64,11 +53,6 @@ def contains_match_m3(b : B?) {
     run_m3(b, 100000)
 }
 
-[benchmark]
-def contains_match_m3f_old(b : B?) {
-    run_m3f_old(b, 100000)
-}
-
 [benchmark]
 def contains_match_m3f(b : B?) {
     run_m3f(b, 100000)
diff --git a/benchmarks/sql/count_aggregate.das b/benchmarks/sql/count_aggregate.das
index 37112e4674..f6d92dfca1 100644
--- a/benchmarks/sql/count_aggregate.das
+++ b/benchmarks/sql/count_aggregate.das
@@ -7,7 +7,6 @@ let THRESHOLD = 500
 
 // SQL pushes COUNT(*) to the engine returning one row.
 // m3 must materialize the full filtered array, then walk to count it.
-// m3f_old is the pre-rewrite baseline fold (frozen — diverges as splice mode lands).
 // m3f folds where+count into a single fused pass (no intermediate array).
 
 // --- m1: _sql over :memory: ---
@@ -34,17 +33,6 @@ def run_m3(b : B?; n : int) {
     }
 }
 
-// --- m3f_old: pre-rewrite baseline fold ---
-def run_m3f_old(b : B?; n : int) {
-    let arr <- fixture_array(n)
-    b |> run("m3f_old_array_fold/{n}", n) {
-        let c = _old_fold(each(arr)._where(_.price > THRESHOLD).count())
-        if (c == 0) {
-            b->failNow()
-        }
-    }
-}
-
 // --- m3f: array LINQ folded into a single fused pass ---
 def run_m3f(b : B?; n : int) {
     let arr <- fixture_array(n)
@@ -66,11 +54,6 @@ def count_aggregate_m3(b : B?) {
     run_m3(b, 100000)
 }
 
-[benchmark]
-def count_aggregate_m3f_old(b : B?) {
-    run_m3f_old(b, 100000)
-}
-
 [benchmark]
 def count_aggregate_m3f(b : B?) {
     run_m3f(b, 100000)
diff --git a/benchmarks/sql/distinct_count.das b/benchmarks/sql/distinct_count.das
index 4241af632b..4ff234e8ee 100644
--- a/benchmarks/sql/distinct_count.das
+++ b/benchmarks/sql/distinct_count.das
@@ -31,17 +31,6 @@ def run_m3(b : B?; n : int) {
         }
     }
 }
-
-def run_m3f_old(b : B?; n : int) {
-    let arr <- fixture_array(n)
-    b |> run("m3f_old_array_fold/{n}", n) {
-        let rows <- _old_fold(each(arr)._select(_.brand).distinct().to_array())
-        if (empty(rows)) {
-            b->failNow()
-        }
-    }
-}
-
 def run_m3f(b : B?; n : int) {
     let arr <- fixture_array(n)
     b |> run("m3f_array_fold/{n}", n) {
@@ -62,11 +51,6 @@ def distinct_count_m3(b : B?) {
     run_m3(b, 100000)
 }
 
-[benchmark]
-def distinct_count_m3f_old(b : B?) {
-    run_m3f_old(b, 100000)
-}
-
 [benchmark]
 def distinct_count_m3f(b : B?) {
     run_m3f(b, 100000)
diff --git a/benchmarks/sql/first_match.das b/benchmarks/sql/first_match.das
index 0fbe52320f..dee7604523 100644
--- a/benchmarks/sql/first_match.das
+++ b/benchmarks/sql/first_match.das
@@ -7,8 +7,8 @@ let THRESHOLD = 500
 
 // SQL: SELECT ... LIMIT 1 — engine stops at first hit.
 // m3 materializes the full filtered array then indexes [0].
-// m3f_old / m3f should ideally early-exit at the first matching row;
-// splice mode is the main target here.
+// m3f should ideally early-exit at the first matching row; splice mode
+// is the main target here.
 
 def run_m1(b : B?; n : int) {
     with_sqlite(":memory:") $(db) {
@@ -31,17 +31,6 @@ def run_m3(b : B?; n : int) {
         }
     }
 }
-
-def run_m3f_old(b : B?; n : int) {
-    let arr <- fixture_array(n)
-    b |> run("m3f_old_array_fold/{n}", n) {
-        let row = _old_fold(each(arr)._where(_.price > THRESHOLD).first())
-        if (row.price == 0) {
-            b->failNow()
-        }
-    }
-}
-
 def run_m3f(b : B?; n : int) {
     let arr <- fixture_array(n)
     b |> run("m3f_array_fold/{n}", n) {
@@ -62,11 +51,6 @@ def first_match_m3(b : B?) {
     run_m3(b, 100000)
 }
 
-[benchmark]
-def first_match_m3f_old(b : B?) {
-    run_m3f_old(b, 100000)
-}
-
 [benchmark]
 def first_match_m3f(b : B?) {
     run_m3f(b, 100000)
diff --git a/benchmarks/sql/first_or_default_match.das b/benchmarks/sql/first_or_default_match.das
index afb2dfe2ef..c67b3da74b 100644
--- a/benchmarks/sql/first_or_default_match.das
+++ b/benchmarks/sql/first_or_default_match.das
@@ -32,18 +32,6 @@ def run_m3(b : B?; n : int) {
         }
     }
 }
-
-def run_m3f_old(b : B?; n : int) {
-    let arr <- fixture_array(n)
-    let sentinel = Car(id = SENTINEL_ID, name = "none", price = 0, brand = 0, year = 0, dealer_id = 0)
-    b |> run("m3f_old_array_fold/{n}", n) {
-        let row = _old_fold(each(arr)._where(_.price > THRESHOLD).first_or_default(sentinel))
-        if (row.id == SENTINEL_ID) {
-            b->failNow()
-        }
-    }
-}
-
 def run_m3f(b : B?; n : int) {
     let arr <- fixture_array(n)
     let sentinel = Car(id = SENTINEL_ID, name = "none", price = 0, brand = 0, year = 0, dealer_id = 0)
@@ -65,11 +53,6 @@ def first_or_default_match_m3(b : B?) {
     run_m3(b, 100000)
 }
 
-[benchmark]
-def first_or_default_match_m3f_old(b : B?) {
-    run_m3f_old(b, 100000)
-}
-
 [benchmark]
 def first_or_default_match_m3f(b : B?) {
     run_m3f(b, 100000)
diff --git a/benchmarks/sql/groupby_count.das b/benchmarks/sql/groupby_count.das
index a7094c6ae5..35179e08ff 100644
--- a/benchmarks/sql/groupby_count.das
+++ b/benchmarks/sql/groupby_count.das
@@ -32,20 +32,6 @@ def run_m3(b : B?; n : int) {
         }
     }
 }
-
-def run_m3f_old(b : B?; n : int) {
-    let arr <- fixture_array(n)
-    b |> run("m3f_old_array_fold/{n}", n) {
-        let groups <- _old_fold(each(arr)
-                                ._group_by(_.brand)
-                                ._select((Brand = _._0, N = _._1 |> length))
-                                .to_array())
-        if (empty(groups)) {
-            b->failNow()
-        }
-    }
-}
-
 def run_m3f(b : B?; n : int) {
     let arr <- fixture_array(n)
     b |> run("m3f_array_fold/{n}", n) {
@@ -69,11 +55,6 @@ def groupby_count_m3(b : B?) {
     run_m3(b, 100000)
 }
 
-[benchmark]
-def groupby_count_m3f_old(b : B?) {
-    run_m3f_old(b, 100000)
-}
-
 [benchmark]
 def groupby_count_m3f(b : B?) {
     run_m3f(b, 100000)
diff --git a/benchmarks/sql/groupby_sum.das b/benchmarks/sql/groupby_sum.das
index 90cc7525a2..52a256af29 100644
--- a/benchmarks/sql/groupby_sum.das
+++ b/benchmarks/sql/groupby_sum.das
@@ -33,21 +33,6 @@ def run_m3(b : B?; n : int) {
         }
     }
 }
-
-def run_m3f_old(b : B?; n : int) {
-    let arr <- fixture_array(n)
-    b |> run("m3f_old_array_fold/{n}", n) {
-        let groups <- _old_fold(each(arr)
-                                ._group_by(_.brand)
-                                ._select((Brand = _._0,
-                                          TotalPrice = _._1 |> select($(c : Car) => c.price) |> sum()))
-                                .to_array())
-        if (empty(groups)) {
-            b->failNow()
-        }
-    }
-}
-
 def run_m3f(b : B?; n : int) {
     let arr <- fixture_array(n)
     b |> run("m3f_array_fold/{n}", n) {
@@ -72,11 +57,6 @@ def groupby_sum_m3(b : B?) {
     run_m3(b, 100000)
 }
 
-[benchmark]
-def groupby_sum_m3f_old(b : B?) {
-    run_m3f_old(b, 100000)
-}
-
 [benchmark]
 def groupby_sum_m3f(b : B?) {
     run_m3f(b, 100000)
diff --git a/benchmarks/sql/indexed_lookup.das b/benchmarks/sql/indexed_lookup.das
index 41fef08956..88aad127bf 100644
--- a/benchmarks/sql/indexed_lookup.das
+++ b/benchmarks/sql/indexed_lookup.das
@@ -34,18 +34,6 @@ def run_m3(b : B?; n : int) {
     }
 }
 
-// --- m3f_old: pre-rewrite baseline fold ---
-def run_m3f_old(b : B?; n : int) {
-    let arr <- fixture_array(n)
-    let key = n / 2
-    b |> run("m3f_old_array_fold/{n}") {
-        let c = _old_fold(each(arr)._where(_.id == key).count())
-        if (c == 0) {
-            b->failNow()
-        }
-    }
-}
-
 // --- m3f: array LINQ folded into a single fused pass ---
 def run_m3f(b : B?; n : int) {
     let arr <- fixture_array(n)
@@ -68,11 +56,6 @@ def indexed_lookup_m3(b : B?) {
     run_m3(b, 100000)
 }
 
-[benchmark]
-def indexed_lookup_m3f_old(b : B?) {
-    run_m3f_old(b, 100000)
-}
-
 [benchmark]
 def indexed_lookup_m3f(b : B?) {
     run_m3f(b, 100000)
diff --git a/benchmarks/sql/join_count.das b/benchmarks/sql/join_count.das
index 8384319639..aa4b275ac5 100644
--- a/benchmarks/sql/join_count.das
+++ b/benchmarks/sql/join_count.das
@@ -7,7 +7,7 @@ require _common public
 // SQL form (`_sql(... |> _join(select_from(type<Dealer>), ...))`) requires the
 // inner select_from to bind the db handle inside the _sql analyzer; that wiring
 // is not exposed for direct authoring here, so m1 is omitted for this benchmark.
-// 3-way comparison (m3 / m3f_old / m3f) focuses on array-side join cost.
+// 2-way comparison (m3 / m3f) focuses on array-side join cost.
 
 def run_m3(b : B?; n : int) {
     let cars <- fixture_array(n)
@@ -22,21 +22,6 @@ def run_m3(b : B?; n : int) {
         }
     }
 }
-
-def run_m3f_old(b : B?; n : int) {
-    let cars <- fixture_array(n)
-    let dealers <- fixture_dealers_array()
-    b |> run("m3f_old_array_fold/{n}", n) {
-        let c = _old_fold(cars |> _join(dealers,
-                                        $(c : Car, d : Dealer) => c.dealer_id == d.id,
-                                        $(c : Car, d : Dealer) => (c.name, d.name))
-                               |> count())
-        if (c == 0) {
-            b->failNow()
-        }
-    }
-}
-
 def run_m3f(b : B?; n : int) {
     let cars <- fixture_array(n)
     let dealers <- fixture_dealers_array()
@@ -56,11 +41,6 @@ def join_count_m3(b : B?) {
     run_m3(b, 100000)
 }
 
-[benchmark]
-def join_count_m3f_old(b : B?) {
-    run_m3f_old(b, 100000)
-}
-
 [benchmark]
 def join_count_m3f(b : B?) {
     run_m3f(b, 100000)
diff --git a/benchmarks/sql/long_count_aggregate.das b/benchmarks/sql/long_count_aggregate.das
index dec6d4a63c..3515e73835 100644
--- a/benchmarks/sql/long_count_aggregate.das
+++ b/benchmarks/sql/long_count_aggregate.das
@@ -30,17 +30,6 @@ def run_m3(b : B?; n : int) {
         }
     }
 }
-
-def run_m3f_old(b : B?; n : int) {
-    let arr <- fixture_array(n)
-    b |> run("m3f_old_array_fold/{n}", n) {
-        let c = _old_fold(each(arr)._where(_.price > THRESHOLD).long_count())
-        if (c == 0l) {
-            b->failNow()
-        }
-    }
-}
-
 def run_m3f(b : B?; n : int) {
     let arr <- fixture_array(n)
     b |> run("m3f_array_fold/{n}", n) {
@@ -61,11 +50,6 @@ def long_count_aggregate_m3(b : B?) {
     run_m3(b, 100000)
 }
 
-[benchmark]
-def long_count_aggregate_m3f_old(b : B?) {
-    run_m3f_old(b, 100000)
-}
-
 [benchmark]
 def long_count_aggregate_m3f(b : B?) {
     run_m3f(b, 100000)
diff --git a/benchmarks/sql/max_aggregate.das b/benchmarks/sql/max_aggregate.das
index 0c353b7ef1..ff424e0f4c 100644
--- a/benchmarks/sql/max_aggregate.das
+++ b/benchmarks/sql/max_aggregate.das
@@ -4,7 +4,7 @@ options persistent_heap
 require _common public
 
 // SQL: SELECT MAX(price) FROM Cars.
-// m3/m3f_old/m3f scan the array tracking the maximum seen.
+// m3/m3f scan the array tracking the maximum seen.
 
 def run_m1(b : B?; n : int) {
     with_sqlite(":memory:") $(db) {
@@ -27,17 +27,6 @@ def run_m3(b : B?; n : int) {
         }
     }
 }
-
-def run_m3f_old(b : B?; n : int) {
-    let arr <- fixture_array(n)
-    b |> run("m3f_old_array_fold/{n}", n) {
-        let m = _old_fold(each(arr)._select(_.price).max())
-        if (m == 0) {
-            b->failNow()
-        }
-    }
-}
-
 def run_m3f(b : B?; n : int) {
     let arr <- fixture_array(n)
     b |> run("m3f_array_fold/{n}", n) {
@@ -58,11 +47,6 @@ def max_aggregate_m3(b : B?) {
     run_m3(b, 100000)
 }
 
-[benchmark]
-def max_aggregate_m3f_old(b : B?) {
-    run_m3f_old(b, 100000)
-}
-
 [benchmark]
 def max_aggregate_m3f(b : B?) {
     run_m3f(b, 100000)
diff --git a/benchmarks/sql/min_aggregate.das b/benchmarks/sql/min_aggregate.das
index 318613acee..f0bcbd37e9 100644
--- a/benchmarks/sql/min_aggregate.das
+++ b/benchmarks/sql/min_aggregate.das
@@ -4,7 +4,7 @@ options persistent_heap
 require _common public
 
 // SQL: SELECT MIN(price) FROM Cars.
-// m3/m3f_old/m3f scan the array tracking the minimum seen.
+// m3/m3f scan the array tracking the minimum seen.
 
 def run_m1(b : B?; n : int) {
     with_sqlite(":memory:") $(db) {
@@ -27,17 +27,6 @@ def run_m3(b : B?; n : int) {
         }
     }
 }
-
-def run_m3f_old(b : B?; n : int) {
-    let arr <- fixture_array(n)
-    b |> run("m3f_old_array_fold/{n}", n) {
-        let m = _old_fold(each(arr)._select(_.price).min())
-        if (m > 999) {
-            b->failNow()
-        }
-    }
-}
-
 def run_m3f(b : B?; n : int) {
     let arr <- fixture_array(n)
     b |> run("m3f_array_fold/{n}", n) {
@@ -58,11 +47,6 @@ def min_aggregate_m3(b : B?) {
     run_m3(b, 100000)
 }
 
-[benchmark]
-def min_aggregate_m3f_old(b : B?) {
-    run_m3f_old(b, 100000)
-}
-
 [benchmark]
 def min_aggregate_m3f(b : B?) {
     run_m3f(b, 100000)
diff --git a/benchmarks/sql/order_take_desc.das b/benchmarks/sql/order_take_desc.das
new file mode 100644
index 0000000000..c4b61c510f
--- /dev/null
+++ b/benchmarks/sql/order_take_desc.das
@@ -0,0 +1,59 @@
+options gen2
+options persistent_heap
+
+require _common public
+
+let TAKE_N = 10
+
+// _order_by_descending(_.price) |> take(N) — top-N largest pattern.
+// SQL: ORDER BY price DESC LIMIT N — engine emits partial-sort optimization when possible.
+// m3 (plain LINQ) does a full descending sort then slices.
+// m3f (spliced via plan_order_family Phase 3b) dispatches directly to ``top_n_by_descending``
+// which uses partial_sort with a flipped comparator (array source) — O(M log N).
+
+def run_m1(b : B?; n : int) {
+    with_sqlite(":memory:") $(db) {
+        fixture_db(db, n)
+        b |> run("m1_sql/{n}", n) {
+            let rows <- _sql(db |> select_from(type<Car>) |> _order_by_descending(_.price) |> take(TAKE_N))
+            if (empty(rows)) {
+                b->failNow()
+            }
+        }
+    }
+}
+
+def run_m3(b : B?; n : int) {
+    let arr <- fixture_array(n)
+    b |> run("m3_array/{n}", n) {
+        let rows <- (arr |> _order_by_descending(_.price) |> take(TAKE_N))
+        if (empty(rows)) {
+            b->failNow()
+        }
+    }
+}
+
+def run_m3f(b : B?; n : int) {
+    let arr <- fixture_array(n)
+    b |> run("m3f_array_fold/{n}", n) {
+        let rows <- _fold(each(arr)._order_by_descending(_.price).take(TAKE_N).to_array())
+        if (empty(rows)) {
+            b->failNow()
+        }
+    }
+}
+
+[benchmark]
+def order_take_desc_m1(b : B?) {
+    run_m1(b, 100000)
+}
+
+[benchmark]
+def order_take_desc_m3(b : B?) {
+    run_m3(b, 100000)
+}
+
+[benchmark]
+def order_take_desc_m3f(b : B?) {
+    run_m3f(b, 100000)
+}
diff --git a/benchmarks/sql/select_count.das b/benchmarks/sql/select_count.das
index 84e2253422..2291504e4a 100644
--- a/benchmarks/sql/select_count.das
+++ b/benchmarks/sql/select_count.das
@@ -8,9 +8,7 @@ require _common public
 // so user-visible side effects fire. Phase-2A `_fold` matches that: the counter lane binds
 // the final projection to a discardable local per matched element (side effects preserved)
 // and skips array materialization. The optimizer DCEs the binding for pure projections
-// like `_.price * 2`, leaving a bare-loop counter for the common case. `_old_fold` lacks a
-// [select, count] pattern in g_foldSeq so it falls to the default nested-pass form
-// (pass_0 = select(...); count(pass_0)) — materializing the same way m3 does.
+// like `_.price * 2`, leaving a bare-loop counter for the common case.
 
 def run_m1(b : B?; n : int) {
     with_sqlite(":memory:") $(db) {
@@ -33,17 +31,6 @@ def run_m3(b : B?; n : int) {
         }
     }
 }
-
-def run_m3f_old(b : B?; n : int) {
-    let arr <- fixture_array(n)
-    b |> run("m3f_old_array_fold/{n}", n) {
-        let c = _old_fold(each(arr)._select(_.price * 2).count())
-        if (c == 0) {
-            b->failNow()
-        }
-    }
-}
-
 def run_m3f(b : B?; n : int) {
     let arr <- fixture_array(n)
     b |> run("m3f_array_fold/{n}", n) {
@@ -64,11 +51,6 @@ def select_count_m3(b : B?) {
     run_m3(b, 100000)
 }
 
-[benchmark]
-def select_count_m3f_old(b : B?) {
-    run_m3f_old(b, 100000)
-}
-
 [benchmark]
 def select_count_m3f(b : B?) {
     run_m3f(b, 100000)
diff --git a/benchmarks/sql/select_where.das b/benchmarks/sql/select_where.das
index acb827aa13..1aefe7dddc 100644
--- a/benchmarks/sql/select_where.das
+++ b/benchmarks/sql/select_where.das
@@ -29,17 +29,6 @@ def run_m3(b : B?; n : int) {
     }
 }
 
-// --- m3f_old: pre-rewrite baseline fold ---
-def run_m3f_old(b : B?; n : int) {
-    let arr <- fixture_array(n)
-    b |> run("m3f_old_array_fold/{n}", n) {
-        let rows <- _old_fold(each(arr)._where(_.price > THRESHOLD).to_array())
-        if (empty(rows)) {
-            b->failNow()
-        }
-    }
-}
-
 // --- m3f: array LINQ folded into a single fused pass ---
 def run_m3f(b : B?; n : int) {
     let arr <- fixture_array(n)
@@ -61,11 +50,6 @@ def select_where_m3(b : B?) {
     run_m3(b, 100000)
 }
 
-[benchmark]
-def select_where_m3f_old(b : B?) {
-    run_m3f_old(b, 100000)
-}
-
 [benchmark]
 def select_where_m3f(b : B?) {
     run_m3f(b, 100000)
diff --git a/benchmarks/sql/select_where_count.das b/benchmarks/sql/select_where_count.das
new file mode 100644
index 0000000000..23587db9f6
--- /dev/null
+++ b/benchmarks/sql/select_where_count.das
@@ -0,0 +1,62 @@
+options gen2
+options persistent_heap
+
+require _common public
+
+let THRESHOLD = 1000
+
+// _select(_.price * 2) |> _where(_ > T) |> count — where-after-select pattern.
+// SQL: SELECT COUNT(*) FROM Cars WHERE price * 2 > T.
+// m3 (plain LINQ) materializes a projection iterator, then a filter array, then counts.
+// m3f (spliced via Phase 3d via replaceVariablePeeling) substitutes the projection into
+// the where predicate (peeling the typer-inserted ExprRef2Value wrapper) and emits a single
+// fused counter loop — no intermediate arrays.
+
+def run_m1(b : B?; n : int) {
+    with_sqlite(":memory:") $(db) {
+        fixture_db(db, n)
+        b |> run("m1_sql/{n}", n) {
+            // SQL form folds projection into the WHERE filter — the engine evaluates
+            // ``price * 2 > T`` per row and counts matches.
+            let c = _sql(db |> select_from(type<Car>) |> _where(_.price * 2 > THRESHOLD) |> count())
+            if (c == 0) {
+                b->failNow()
+            }
+        }
+    }
+}
+
+def run_m3(b : B?; n : int) {
+    let arr <- fixture_array(n)
+    b |> run("m3_array/{n}", n) {
+        let c = arr |> _select(_.price * 2) |> _where(_ > THRESHOLD) |> count
+        if (c == 0) {
+            b->failNow()
+        }
+    }
+}
+
+def run_m3f(b : B?; n : int) {
+    let arr <- fixture_array(n)
+    b |> run("m3f_array_fold/{n}", n) {
+        let c = _fold(each(arr)._select(_.price * 2)._where(_ > THRESHOLD).count())
+        if (c == 0) {
+            b->failNow()
+        }
+    }
+}
+
+[benchmark]
+def select_where_count_m1(b : B?) {
+    run_m1(b, 100000)
+}
+
+[benchmark]
+def select_where_count_m3(b : B?) {
+    run_m3(b, 100000)
+}
+
+[benchmark]
+def select_where_count_m3f(b : B?) {
+    run_m3f(b, 100000)
+}
diff --git a/benchmarks/sql/select_where_order_take.das b/benchmarks/sql/select_where_order_take.das
index e718907b92..ccf57c0454 100644
--- a/benchmarks/sql/select_where_order_take.das
+++ b/benchmarks/sql/select_where_order_take.das
@@ -35,20 +35,6 @@ def run_m3(b : B?; n : int) {
     }
 }
 
-// --- m3f_old: pre-rewrite baseline fold ---
-def run_m3f_old(b : B?; n : int) {
-    let arr <- fixture_array(n)
-    b |> run("m3f_old_array_fold/{n}", n) {
-        let rows <- _old_fold(each(arr)._where(_.price > THRESHOLD)
-                                       ._order_by(_.price)
-                                       .take(TAKE_N)
-                                       .to_array())
-        if (empty(rows)) {
-            b->failNow()
-        }
-    }
-}
-
 // --- m3f: array LINQ folded into a single fused pass ---
 def run_m3f(b : B?; n : int) {
     let arr <- fixture_array(n)
@@ -73,11 +59,6 @@ def select_where_order_take_m3(b : B?) {
     run_m3(b, 100000)
 }
 
-[benchmark]
-def select_where_order_take_m3f_old(b : B?) {
-    run_m3f_old(b, 100000)
-}
-
 [benchmark]
 def select_where_order_take_m3f(b : B?) {
     run_m3f(b, 100000)
diff --git a/benchmarks/sql/skip_take.das b/benchmarks/sql/skip_take.das
index d860a310f3..52e000837c 100644
--- a/benchmarks/sql/skip_take.das
+++ b/benchmarks/sql/skip_take.das
@@ -32,17 +32,6 @@ def run_m3(b : B?; n : int) {
         }
     }
 }
-
-def run_m3f_old(b : B?; n : int) {
-    let arr <- fixture_array(n)
-    b |> run("m3f_old_array_fold/{n}", n) {
-        let rows <- _old_fold(each(arr).skip(SKIP_N).take(TAKE_N).to_array())
-        if (empty(rows)) {
-            b->failNow()
-        }
-    }
-}
-
 def run_m3f(b : B?; n : int) {
     let arr <- fixture_array(n)
     b |> run("m3f_array_fold/{n}", n) {
@@ -63,11 +52,6 @@ def skip_take_m3(b : B?) {
     run_m3(b, 100000)
 }
 
-[benchmark]
-def skip_take_m3f_old(b : B?) {
-    run_m3f_old(b, 100000)
-}
-
 [benchmark]
 def skip_take_m3f(b : B?) {
     run_m3f(b, 100000)
diff --git a/benchmarks/sql/sort_first.das b/benchmarks/sql/sort_first.das
index 51d47ef56a..290be82624 100644
--- a/benchmarks/sql/sort_first.das
+++ b/benchmarks/sql/sort_first.das
@@ -28,17 +28,6 @@ def run_m3(b : B?; n : int) {
         }
     }
 }
-
-def run_m3f_old(b : B?; n : int) {
-    let arr <- fixture_array(n)
-    b |> run("m3f_old_array_fold/{n}", n) {
-        let row = _old_fold(each(arr)._order_by(_.price).first())
-        if (row.id == 0) {
-            b->failNow()
-        }
-    }
-}
-
 def run_m3f(b : B?; n : int) {
     let arr <- fixture_array(n)
     b |> run("m3f_array_fold/{n}", n) {
@@ -59,11 +48,6 @@ def sort_first_m3(b : B?) {
     run_m3(b, 100000)
 }
 
-[benchmark]
-def sort_first_m3f_old(b : B?) {
-    run_m3f_old(b, 100000)
-}
-
 [benchmark]
 def sort_first_m3f(b : B?) {
     run_m3f(b, 100000)
diff --git a/benchmarks/sql/sort_take.das b/benchmarks/sql/sort_take.das
index 9bc6a55c00..5787d519c8 100644
--- a/benchmarks/sql/sort_take.das
+++ b/benchmarks/sql/sort_take.das
@@ -15,7 +15,7 @@ let TAKE_N = 10
 // uses ``partial_sort`` under the hood (O(M log N)), iterator source uses a
 // bounded heap of size N during the scan (max N elements resident).
 //
-// m3f / m3f_old are intentionally identical for top-N here — Phase 0 only adds
+// m3f are intentionally identical for top-N here — Phase 0 only adds
 // the library function; PR B (BufferTopN emit mode in linq_fold) is what makes
 // the m3f column move.
 
@@ -40,19 +40,6 @@ def run_m3(b : B?; n : int) {
         }
     }
 }
-
-def run_m3f_old(b : B?; n : int) {
-    let arr <- fixture_array(n)
-    b |> run("m3f_old_array_fold/{n}", n) {
-        unsafe {
-            let rows <- _old_fold(each(arr)._order_by(_.price).take(TAKE_N).to_array())
-            if (empty(rows)) {
-                b->failNow()
-            }
-        }
-    }
-}
-
 def run_m3f(b : B?; n : int) {
     let arr <- fixture_array(n)
     b |> run("m3f_array_fold/{n}", n) {
@@ -95,11 +82,6 @@ def sort_take_m3(b : B?) {
     run_m3(b, 100000)
 }
 
-[benchmark]
-def sort_take_m3f_old(b : B?) {
-    run_m3f_old(b, 100000)
-}
-
 [benchmark]
 def sort_take_m3f(b : B?) {
     run_m3f(b, 100000)
diff --git a/benchmarks/sql/sum_aggregate.das b/benchmarks/sql/sum_aggregate.das
index f8a059ba3a..a8999e6474 100644
--- a/benchmarks/sql/sum_aggregate.das
+++ b/benchmarks/sql/sum_aggregate.das
@@ -4,7 +4,7 @@ options persistent_heap
 require _common public
 
 // SQL: SELECT SUM(price) FROM Cars — single-row scalar reduction pushed to the engine.
-// m3/m3f_old/m3f traverse the array projecting and accumulating.
+// m3/m3f traverse the array projecting and accumulating.
 
 def run_m1(b : B?; n : int) {
     with_sqlite(":memory:") $(db) {
@@ -27,17 +27,6 @@ def run_m3(b : B?; n : int) {
         }
     }
 }
-
-def run_m3f_old(b : B?; n : int) {
-    let arr <- fixture_array(n)
-    b |> run("m3f_old_array_fold/{n}", n) {
-        let s = _old_fold(each(arr)._select(_.price).sum())
-        if (s == 0) {
-            b->failNow()
-        }
-    }
-}
-
 def run_m3f(b : B?; n : int) {
     let arr <- fixture_array(n)
     b |> run("m3f_array_fold/{n}", n) {
@@ -58,11 +47,6 @@ def sum_aggregate_m3(b : B?) {
     run_m3(b, 100000)
 }
 
-[benchmark]
-def sum_aggregate_m3f_old(b : B?) {
-    run_m3f_old(b, 100000)
-}
-
 [benchmark]
 def sum_aggregate_m3f(b : B?) {
     run_m3f(b, 100000)
diff --git a/benchmarks/sql/sum_where.das b/benchmarks/sql/sum_where.das
index a59f8077ee..536fa47c02 100644
--- a/benchmarks/sql/sum_where.das
+++ b/benchmarks/sql/sum_where.das
@@ -31,17 +31,6 @@ def run_m3(b : B?; n : int) {
         }
     }
 }
-
-def run_m3f_old(b : B?; n : int) {
-    let arr <- fixture_array(n)
-    b |> run("m3f_old_array_fold/{n}", n) {
-        let s = _old_fold(each(arr)._where(_.price > THRESHOLD)._select(_.price).sum())
-        if (s == 0) {
-            b->failNow()
-        }
-    }
-}
-
 def run_m3f(b : B?; n : int) {
     let arr <- fixture_array(n)
     b |> run("m3f_array_fold/{n}", n) {
@@ -62,11 +51,6 @@ def sum_where_m3(b : B?) {
     run_m3(b, 100000)
 }
 
-[benchmark]
-def sum_where_m3f_old(b : B?) {
-    run_m3f_old(b, 100000)
-}
-
 [benchmark]
 def sum_where_m3f(b : B?) {
     run_m3f(b, 100000)
diff --git a/benchmarks/sql/take_count.das b/benchmarks/sql/take_count.das
index 9aa8954c16..fdf10f352e 100644
--- a/benchmarks/sql/take_count.das
+++ b/benchmarks/sql/take_count.das
@@ -30,17 +30,6 @@ def run_m3(b : B?; n : int) {
         }
     }
 }
-
-def run_m3f_old(b : B?; n : int) {
-    let arr <- fixture_array(n)
-    b |> run("m3f_old_array_fold/{n}", n) {
-        let rows <- _old_fold(each(arr).take(TAKE_N).to_array())
-        if (empty(rows)) {
-            b->failNow()
-        }
-    }
-}
-
 def run_m3f(b : B?; n : int) {
     let arr <- fixture_array(n)
     b |> run("m3f_array_fold/{n}", n) {
@@ -61,11 +50,6 @@ def take_count_m3(b : B?) {
     run_m3(b, 100000)
 }
 
-[benchmark]
-def take_count_m3f_old(b : B?) {
-    run_m3f_old(b, 100000)
-}
-
 [benchmark]
 def take_count_m3f(b : B?) {
     run_m3f(b, 100000)
diff --git a/benchmarks/sql/take_count_filtered.das b/benchmarks/sql/take_count_filtered.das
index 6245c10ece..dce9967ace 100644
--- a/benchmarks/sql/take_count_filtered.das
+++ b/benchmarks/sql/take_count_filtered.das
@@ -7,7 +7,7 @@ let TAKE_N = 1000
 let THRESHOLD = 500
 
 // `_sql` rejects `take(n) |> count()` (LIMIT-before-aggregate collapses to one row regardless
-// in SQLite), so the m1 variant is omitted. m3/m3f_old/m3f filter + bound + count over the
+// in SQLite), so the m1 variant is omitted. m3/m3f filter + bound + count over the
 // array. Exercises counter-lane take splice with an upstream `where` predicate.
 
 def run_m3(b : B?; n : int) {
@@ -19,17 +19,6 @@ def run_m3(b : B?; n : int) {
         }
     }
 }
-
-def run_m3f_old(b : B?; n : int) {
-    let arr <- fixture_array(n)
-    b |> run("m3f_old_array_fold/{n}", n) {
-        let c = _old_fold(each(arr)._where(_.price > THRESHOLD).take(TAKE_N).count())
-        if (c == 0) {
-            b->failNow()
-        }
-    }
-}
-
 def run_m3f(b : B?; n : int) {
     let arr <- fixture_array(n)
     b |> run("m3f_array_fold/{n}", n) {
@@ -45,11 +34,6 @@ def take_count_filtered_m3(b : B?) {
     run_m3(b, 100000)
 }
 
-[benchmark]
-def take_count_filtered_m3f_old(b : B?) {
-    run_m3f_old(b, 100000)
-}
-
 [benchmark]
 def take_count_filtered_m3f(b : B?) {
     run_m3f(b, 100000)
diff --git a/benchmarks/sql/take_sum_aggregate.das b/benchmarks/sql/take_sum_aggregate.das
index 518d763305..738ee4b2c8 100644
--- a/benchmarks/sql/take_sum_aggregate.das
+++ b/benchmarks/sql/take_sum_aggregate.das
@@ -6,7 +6,7 @@ require _common public
 let TAKE_N = 1000
 
 // `_sql` rejects `take(n) |> sum()` (LIMIT-before-aggregate has no effect in SQLite — aggregate
-// collapses to one row regardless), so the m1 variant is omitted. m3/m3f_old/m3f bound the
+// collapses to one row regardless), so the m1 variant is omitted. m3/m3f bound the
 // projection-sum loop to the first TAKE_N matched elements (no upstream where here, so
 // "matched" == "every source element"). Exercises accumulator-lane take splice in _fold.
 
@@ -19,17 +19,6 @@ def run_m3(b : B?; n : int) {
         }
     }
 }
-
-def run_m3f_old(b : B?; n : int) {
-    let arr <- fixture_array(n)
-    b |> run("m3f_old_array_fold/{n}", n) {
-        let s = _old_fold(each(arr)._select(_.price).take(TAKE_N).sum())
-        if (s == 0) {
-            b->failNow()
-        }
-    }
-}
-
 def run_m3f(b : B?; n : int) {
     let arr <- fixture_array(n)
     b |> run("m3f_array_fold/{n}", n) {
@@ -45,11 +34,6 @@ def take_sum_aggregate_m3(b : B?) {
     run_m3(b, 100000)
 }
 
-[benchmark]
-def take_sum_aggregate_m3f_old(b : B?) {
-    run_m3f_old(b, 100000)
-}
-
 [benchmark]
 def take_sum_aggregate_m3f(b : B?) {
     run_m3f(b, 100000)
diff --git a/benchmarks/sql/to_array_filter.das b/benchmarks/sql/to_array_filter.das
index f06a208611..d3b6a762a4 100644
--- a/benchmarks/sql/to_array_filter.das
+++ b/benchmarks/sql/to_array_filter.das
@@ -29,17 +29,6 @@ def run_m3(b : B?; n : int) {
         }
     }
 }
-
-def run_m3f_old(b : B?; n : int) {
-    let arr <- fixture_array(n)
-    b |> run("m3f_old_array_fold/{n}", n) {
-        let prices <- _old_fold(each(arr)._where(_.price > THRESHOLD)._select(_.price).to_array())
-        if (empty(prices)) {
-            b->failNow()
-        }
-    }
-}
-
 def run_m3f(b : B?; n : int) {
     let arr <- fixture_array(n)
     b |> run("m3f_array_fold/{n}", n) {
@@ -60,11 +49,6 @@ def to_array_filter_m3(b : B?) {
     run_m3(b, 100000)
 }
 
-[benchmark]
-def to_array_filter_m3f_old(b : B?) {
-    run_m3f_old(b, 100000)
-}
-
 [benchmark]
 def to_array_filter_m3f(b : B?) {
     run_m3f(b, 100000)
diff --git a/benchmarks/sql/zip_dot_product.das b/benchmarks/sql/zip_dot_product.das
index ad2edcbf06..b23aed7804 100644
--- a/benchmarks/sql/zip_dot_product.das
+++ b/benchmarks/sql/zip_dot_product.das
@@ -26,18 +26,6 @@ def run_m3(b : B?; n : int) {
         }
     }
 }
-
-def run_m3f_old(b : B?; n : int) {
-    let xs <- make_ints(n)
-    let ys <- make_ints(n)
-    b |> run("m3f_old_array_fold/{n}", n) {
-        let s = _old_fold(zip(xs, ys)._select(_._0 * _._1).sum())
-        if (s == 0) {
-            b->failNow()
-        }
-    }
-}
-
 def run_m3f(b : B?; n : int) {
     let xs <- make_ints(n)
     let ys <- make_ints(n)
@@ -54,11 +42,6 @@ def zip_dot_product_m3(b : B?) {
     run_m3(b, 100000)
 }
 
-[benchmark]
-def zip_dot_product_m3f_old(b : B?) {
-    run_m3f_old(b, 100000)
-}
-
 [benchmark]
 def zip_dot_product_m3f(b : B?) {
     run_m3f(b, 100000)
diff --git a/daslib/linq.das b/daslib/linq.das
index 23dcb56394..65cb38b0be 100644
--- a/daslib/linq.das
+++ b/daslib/linq.das
@@ -509,6 +509,62 @@ def top_n(var a : iterator<auto(TT)>; n : int) : array<TT -const -&> {
     return <- top_n_by(a, n, $(v : TT -&) => v)
 }
 
+// ============================================================================
+// top_n_descending / top_n_by_descending — return the N largest elements in
+// descending order. Mirror of top_n / top_n_by with flipped comparator.
+//
+// Array source: partial_sort on a clone with reversed less, then resize to N.
+// Iterator source: bounded min-heap of size N tracks the N largest seen so far;
+// when an element greater than the heap min arrives, evict and insert.
+// ============================================================================
+
+def top_n_by_descending(arr : array<auto(TT)>; n : int; key) : array<TT -const -&> {
+    //! Returns the ``n`` largest elements of ``arr`` by ``key(element)``,
+    //! sorted descending. ``n <= 0`` or empty ``arr`` yields an empty result.
+    var buf : array<TT -const -&>
+    if (n <= 0 || empty(arr)) return <- buf
+    let take_count = min(n, length(arr))
+    buf |> reserve(length(arr))
+    for (it in arr) {
+        buf |> push_clone(it)
+    }
+    sort_boost::partial_sort(buf, take_count, $(v1, v2) => _::less(key(v2), key(v1)))
+    buf |> resize(take_count)
+    return <- buf
+}
+
+def top_n_by_descending(var a : iterator<auto(TT)>; n : int; key) : array<TT -const -&> {
+    //! Returns the ``n`` largest elements of an iterator by ``key(element)``,
+    //! sorted descending. Uses a bounded min-heap of size ``n`` during the scan;
+    //! at most ``n`` elements resident.
+    var buf : array<TT -const -&>
+    if (n <= 0) return <- buf
+    // Heap is ordered by reversed less — top is the MIN element seen so far.
+    // When an element greater than the heap min arrives, evict and insert.
+    for (it in a) {
+        if (length(buf) < n) {
+            buf |> push_clone(it)
+            sort_boost::push_heap(buf, $(v1, v2) => _::less(key(v2), key(v1)))
+        } elif (_::less(key(buf[0]), key(it))) {
+            sort_boost::pop_heap(buf, $(v1, v2) => _::less(key(v2), key(v1)))
+            buf[length(buf) - 1] := it
+            sort_boost::push_heap(buf, $(v1, v2) => _::less(key(v2), key(v1)))
+        }
+    }
+    sort(buf, $(v1, v2) => _::less(key(v2), key(v1)))
+    return <- buf
+}
+
+def top_n_descending(arr : array<auto(TT)>; n : int) : array<TT -const -&> {
+    //! Returns the ``n`` largest elements of ``arr`` (by ``<``), sorted descending.
+    return <- top_n_by_descending(arr, n, $(v : TT -&) => v)
+}
+
+def top_n_descending(var a : iterator<auto(TT)>; n : int) : array<TT -const -&> {
+    //! Returns the ``n`` largest elements of an iterator (by ``<``), descending.
+    return <- top_n_by_descending(a, n, $(v : TT -&) => v)
+}
+
 def unique_key(a) {
     //! generates unique key of workhorse type for the value
     static_if (typeinfo is_workhorse(a)) {
diff --git a/daslib/linq_fold.das b/daslib/linq_fold.das
index b6c27149bc..ffaa806a83 100644
--- a/daslib/linq_fold.das
+++ b/daslib/linq_fold.das
@@ -5,16 +5,23 @@ options no_unused_function_arguments = false
 
 module linq_fold shared public
 
-//! LINQ ``_fold`` and ``_old_fold`` macro pair — fuse-time-aware pipeline rewriting.
+//! LINQ ``_fold`` macro — fuse-time-aware pipeline rewriting.
 //!
-//! ``_fold(expr)`` is the active fusion macro. ``_old_fold(expr)`` is a frozen
-//! baseline that preserves the pre-rewrite behavior; benchmarks compare the two
-//! to track performance work as ``_fold`` evolves toward splice-mode fusion.
+//! ``_fold(expr)`` is a three-tier cascade:
 //!
-//! Both macros share the same dispatch infrastructure (``linqCalls`` dict,
-//! ``flatten_linq``, ``g_foldSeq``, ``fold_*`` helpers, ``fold_linq_default``);
-//! the only difference today is the macro-name string they pass into the
-//! recursive sub-fold call inside ``fold_linq_default``.
+//! 1. **Splice** — when the chain matches a recognized pattern, emit a single
+//!    fused for-loop with predicates and projections inlined (best perf,
+//!    no intermediate iterators or buffers).
+//! 2. **``fold_linq_default``** — when splice can't fire, emit an
+//!    array-shape pipeline (``call → array → call → array``) with
+//!    ``_inplace`` variants reusing the same buffer and explicit ``delete``
+//!    on the previous stage. One buffer at a time, no iterator overhead.
+//! 3. **``clone_expression``** — raw passthrough when the chain has no
+//!    recognized linq operators (``flatten_linq`` returns empty).
+//!
+//! All three tiers preserve semantics — ``_fold(chain)`` is observationally
+//! equivalent to ``chain``, just faster when patterns match. The macro is
+//! always safe to apply.
 //!
 //! Requires the ``linq`` module for the backing operator functions.
 
@@ -23,6 +30,7 @@ require daslib/linq public
 require daslib/ast_boost
 require daslib/templates_boost
 require daslib/macro_boost
+require strings
 
 def private is_call_or_generic(var expr : Expression?, name, moduleName : string) : ExprCall? {
     if (expr is ExprCall) {
@@ -61,12 +69,36 @@ def private fold_linq_cond(var expr : Expression?; argName : string) : Expressio
     return qmacro(invoke($e(expr), $i(argName)))
 }
 
+def private fold_linq_cond_peel(var expr : Expression?; var replacement : Expression?) : Expression? {
+    // Variant of fold_linq_cond that substitutes the lambda's bound variable with an
+    // arbitrary expression (not just a rename to another identifier). Uses peel-aware
+    // substitution to strip the ExprRef2Value wrappers the typer inserts around `var`
+    // reads on already-typed AST. Used by the select-then-where splice arm — see
+    // `templates_boost::replaceVariablePeeling`.
+    if (expr is ExprMakeBlock) {
+        var mblk = expr as ExprMakeBlock
+        var blk = mblk._block as ExprBlock
+        if (blk.arguments |> length == 1 && blk.list |> length == 1 && blk.list[0] is ExprReturn) {
+            var ret = blk.list[0] as ExprReturn
+            if (ret.subexpr != null) {
+                var res = clone_expression(ret.subexpr)
+                var rules : Template
+                rules |> replaceVariablePeeling(string(blk.arguments[0].name), replacement)
+                apply_template(rules, res.at, res)
+                return res
+            }
+        }
+    }
+    return qmacro(invoke($e(expr), $e(replacement)))
+}
+
 struct private LinqCall {
     //! Internal struct representing a chained LINQ method call.
     name : string
     moduleName : string = "linq"
     skip : bool = false             // if specified, expression is skipped in folding
     inplace : bool = false          // if specified, expression is inplace and does not create new variable
+    noToArrayVariant : bool = false // if specified, no `_to_array` variant exists (function already returns array regardless of input shape)
     recursive : array<int>          // indices of arguments to apply fold_linq_default on
 }
 
@@ -92,6 +124,15 @@ var private linqCalls = {
     "order_by_to_array" => LinqCall(name = "order_by", inplace = true),
     "order_by_descending" => LinqCall(name = "order_by_descending", inplace = true),
     "order_by_descending_to_array" => LinqCall(name = "order_by_descending", inplace = true),
+// top-N terminal forms — always return array regardless of input shape, so no
+// `_to_array` / `_inplace` variants exist (would alias to the same function).
+// `noToArrayVariant=true` so first-position iterator-source rewrite is skipped.
+// Not marked inplace: rename to `<name>_inplace` at non-first positions would
+// fail at lookup time. Registered so flatten_linq recognizes them in chains.
+    "top_n" => LinqCall(name = "top_n", noToArrayVariant = true),
+    "top_n_by" => LinqCall(name = "top_n_by", noToArrayVariant = true),
+    "top_n_descending" => LinqCall(name = "top_n_descending", noToArrayVariant = true),
+    "top_n_by_descending" => LinqCall(name = "top_n_by_descending", noToArrayVariant = true),
 // aggregate
     "count" => LinqCall(name = "count"),
     "long_count" => LinqCall(name = "long_count"),
@@ -226,207 +267,27 @@ def private flatten_linq(var expr : Expression?)  {
     return <- (top, calls)
 }
 
-struct private FoldSequence {
-    //! Internal struct representing a fold operation sequence.
-    calls : array<string>
-    folder : function<(
-        argIndex : int;
-        var topValue : Expression?;
-        var blk : ExprBlock?;
-        var calls : array<tuple<ExprCall?; LinqCall?>>
-    ) : Expression?>
-}
-
-var private g_foldSeq = [               // those are applied in order
-// order and distinct (for both orders)
-    FoldSequence(
-        calls = ["order", "distinct" ],
-        folder = @@fold_order_distinct
-    ),
-    FoldSequence(
-        calls = ["distinct", "order" ],
-        folder = @@fold_order_distinct
-    ),
-// where + count (single-pass count, no intermediate filter array)
-    FoldSequence(
-        calls = ["where_", "count"],
-        folder = @@fold_where_count
-    ),
-// select and where
-    FoldSequence(
-        calls = ["where_", "select" ],
-        folder = @@fold_where_select
-    ),
-    FoldSequence(
-        calls = ["select", "where_"],
-        folder = @@fold_select_where
-    ),
-    FoldSequence(
-        calls = ["where_"],
-        folder = @@fold_where
-    ),
-    FoldSequence(
-        calls = ["select"],
-        folder = @@fold_select
-    ),
-]
-
-[macro_function]
-def private append_comprehension(argIndex : int; var topValue : Expression?; var comprehension : Expression?; var blk : ExprBlock?; at : LineInfo) : Expression? {
-    comprehension.force_at(at)
-    comprehension.force_generated(true)
-    let newArgName = "pass_{argIndex}"
-    blk.list |> emplace_new <| qmacro_expr() {
-        var $i(newArgName) <- $e(comprehension)
-    }
-    (blk.list.back() as ExprLet).variables[0].flags.can_shadow = true
-    if (argIndex != 0) {
-        blk.list |> emplace_new <| qmacro_expr() {
-            delete $e(topValue)
-        }
-    }
-    return qmacro($i(newArgName))
-}
-
-[macro_function]
-def private fold_order_distinct(argIndex : int; var topValue : Expression?; var blk : ExprBlock?; var calls : array<tuple<ExprCall?; LinqCall?>>) : Expression? {
-    //! replaces order + distinct into a single order + unique
-    if (argIndex == 0) {
-        var comprehension = qmacro(order_unique_folded($e(topValue)))
-        return append_comprehension(argIndex, topValue, comprehension, blk, calls[0]._0.at)
-    } else {
-        blk.list |> emplace_new <| qmacro(order_unique_folded_inplace($e(topValue)))
-        return clone_expression(topValue)
-    }
-}
-
-[macro_function]
-def private fold_where_select(argIndex : int; var topValue : Expression?; var blk : ExprBlock?; var calls : array<tuple<ExprCall?; LinqCall?>>) : Expression? {
-    //! folds where + select into a single comprehension
-    var eWhere = calls[0]._0
-    var eSelect = calls[1]._0
-    var iterType = clone_type(eWhere.arguments[0]._type.firstType)
-    var whereCond = fold_linq_cond(eWhere.arguments[1], "it")
-    var selectExpr : Expression?
-    if (eSelect != null) {
-        selectExpr = fold_linq_cond(eSelect.arguments[1], "it")
-    } else {
-        selectExpr = new ExprVar(name := "it", at = topValue.at, _type := iterType)
-    }
-    var comprehension = qmacro([for (it in $e(topValue)); $e(selectExpr); where $e(whereCond)])
-    return append_comprehension(argIndex, topValue, comprehension, blk, calls[0]._0.at)
-}
-
-[macro_function]
-def private fold_where(argIndex : int; var topValue : Expression?; var blk : ExprBlock?; var calls : array<tuple<ExprCall?; LinqCall?>>) : Expression? {
-    //! folds where + select into a single comprehension
-    var eWhere = calls[0]._0
-    var whereCond = fold_linq_cond(eWhere.arguments[1], "it")
-    var comprehension = qmacro([for (it in $e(topValue)); it; where $e(whereCond)])
-    return append_comprehension(argIndex, topValue, comprehension, blk, calls[0]._0.at)
-}
-
 [macro_function]
-def private fold_where_count(argIndex : int; var topValue : Expression?; var blk : ExprBlock?; var calls : array<tuple<ExprCall?; LinqCall?>>) : Expression? {
-    //! folds `_where(p) |> count()` into a single-pass loop with the predicate inlined — no intermediate filter array, no block-call overhead
-    var eWhere = calls[0]._0
-    let srcName = "`source`{argIndex}`{eWhere.at.line}`{eWhere.at.column}"
-    let itName  = "`it`{argIndex}`{eWhere.at.line}`{eWhere.at.column}"
-    let nName   = "`n`{argIndex}`{eWhere.at.line}`{eWhere.at.column}"
-    var whereCond = fold_linq_cond(eWhere.arguments[1], itName)
-    var fusedCall : Expression? = qmacro(invoke($($i(srcName) : typedecl($e(topValue)) - const) {
-        var $i(nName) = 0
-        for ($i(itName) in $i(srcName)) {
-            if ($e(whereCond)) {
-                $i(nName) ++
-            }
-        }
-        return $i(nName)
-    }, $e(topValue)))
-    fusedCall.force_at(calls[0]._0.at)
-    fusedCall.force_generated(true)
-    let newArgName = "pass_{argIndex}"
-    blk.list |> emplace_new <| qmacro_expr() {
-        var $i(newArgName) = $e(fusedCall)
-    }
-    (blk.list.back() as ExprLet).variables[0].flags.can_shadow = true
-    if (argIndex != 0) {
-        blk.list |> emplace_new <| qmacro_expr() {
-            delete $e(topValue)
-        }
-    }
-    return qmacro($i(newArgName))
-}
-
-[macro_function]
-def private fold_select(argIndex : int; var topValue : Expression?; var blk : ExprBlock?; var calls : array<tuple<ExprCall?; LinqCall?>>) : Expression? {
-    //! folds select into a single comprehension
-    var eSelect = calls[0]._0
-    var selectExpr = fold_linq_cond(eSelect.arguments[1], "it")
-    var comprehension = qmacro([for (it in $e(topValue)); $e(selectExpr)])
-    return append_comprehension(argIndex, topValue, comprehension, blk, calls[0]._0.at)
-}
-
-[macro_function]
-def private fold_select_where(argIndex : int; var topValue : Expression?; var blk : ExprBlock?; var calls : array<tuple<ExprCall?; LinqCall?>>) : Expression? {
-    //! folds where + select into a single comprehension-like expression
-    //! note, order of select and where is reversed
-    var eWhere = calls[1]._0
-    var eSelect = calls[0]._0
-    var iterType = clone_type(eWhere.arguments[0]._type.firstType)
-    var whereCond : Expression?
-    let srcName = "`source`{argIndex}`{eSelect.at.line}`{eSelect.at.column}"
-    let valName = "`value`{argIndex}`{eSelect.at.line}`{eSelect.at.column}"
-    let itName = "`it`{argIndex}`{eSelect.at.line}`{eSelect.at.column}"
-    let arrName = "`arr`{argIndex}`{eSelect.at.line}`{eSelect.at.column}"
-    whereCond = fold_linq_cond(eWhere.arguments[1], valName)
-    var selectExpr : Expression?
-    if (eSelect != null) {
-        selectExpr = fold_linq_cond(eSelect.arguments[1], itName)
-    } else {
-        selectExpr = new ExprVar(name := itName, at = topValue.at, _type := iterType)
-    }
-    var comprehension = qmacro(invoke($($i(srcName) : typedecl($e(topValue)) - const) {
-            var $i(arrName) : array<$t(iterType)>
-            for ($i(itName) in $i(srcName)) {
-                static_if (typeinfo is_workhorse($e(selectExpr))) {
-                    var $i(valName) = $e(selectExpr)
-                    if ($e(whereCond)) {
-                        $i(arrName).emplace($i(valName))
-                    }
-                } else {
-                    var $i(valName) <- $e(selectExpr)
-                    if ($e(whereCond)) {
-                        $i(arrName).emplace($i(valName))
-                    }
-                }
-            }
-            return <- $i(arrName)
-        }, $e(topValue)))
-    return append_comprehension(argIndex, topValue, comprehension, blk, calls[0]._0.at)
-}
-
-[macro_function]
-def private fold_linq_default(var expr : Expression?; recursiveMacroName : string) : Expression? {
-    //! fold sequence into
-    //!   invoke ( top, $ ( it ) : auto {
-    //!      var pass_0 = call0(it,...) // or call0_to_array(it,...)
-    //!      var pass_1 = call1(pass_0,...)
+def private fold_linq_default(var expr : Expression?) : Expression? {
+    //! Tier-2 fallback emitter for ``_fold``. Emits the chain in array shape:
+    //!
+    //!   invoke ( top, $ ( source ) : auto {
+    //!      var pass_0 = call0(source,...) // or call0_to_array(...) if source is iterator
+    //!      var pass_1 = call1(pass_0,...) // or call1_inplace(pass_0,...) when callable inplace
+    //!      delete pass_0                  // free previous intermediate
     //!      ...
-    //!      return <- pass_N // or pass_N.to_sequence() if expr is iterator
+    //!      return <- pass_N               // or pass_N.to_sequence_move() if expr is iterator
     //!   } )
-    //!   skip var if call is inplace
-    //!   skip delete if call is not first
-    //!   if call is first and source is iterator, then call is call_to_array
-    //!   if call is last and expr is iterator, then return is pass_N.to_sequence()
-    //!   if expr is not linq call, return null
     //!
-    //! ``recursiveMacroName`` is the macro name used to wrap sub-pipeline
-    //! arguments (e.g. ``zip``'s second arg, ``concat``'s tail) so that each
-    //! top-level fold macro keeps recursing into itself: ``_fold`` recurses
-    //! into ``_fold``, ``_old_fold`` into ``_old_fold``. Threading the name
-    //! through this single call site keeps the frozen baseline truly frozen
-    //! once ``_fold`` diverges in later PRs.
+    //! Rules:
+    //! - skip ``var pass_N`` binding if call is inplace
+    //! - skip ``delete pass_{N-1}`` if N == 0 (no previous intermediate to free)
+    //! - if first call and source is iterator, rename to ``call_to_array`` (materialize once)
+    //! - if last call and expr is iterator, return ``pass_N.to_sequence_move()``
+    //! - if expr has no linq calls (``flatten_linq`` returns empty), return ``null`` so the
+    //!   ``_fold`` cascade can fall through to tier 3 (raw clone)
+    //! - recursive linq sub-args (e.g. ``zip``'s second arg) are wrapped in a fresh
+    //!   ``_fold`` call so the cascade applies to them too
     var blk = new ExprBlock(at = expr.at)
     var (top, calls) = flatten_linq(expr)
     if (empty(calls)) return null
@@ -441,69 +302,51 @@ def private fold_linq_default(var expr : Expression?; recursiveMacroName : strin
             let newArgName = "pass_{argIndex}"
             var callName = cll._1.name
             var inplace = false
-            // lets find folding sequences
-            var found : FoldSequence?
-            for (fs in g_foldSeq) {
-                if (fs.calls |> length + argIndex - 1 < argMax) {
-                    var match = true
-                    for (i in 0 .. fs.calls |> length) {
-                        if (fs.calls[i] != unsafe(calls[argIndex + i]._1.name)) {
-                            match = false
-                            break
-                        }
-                    }
-                    if (match) {
-                        found = unsafe(addr(fs))
-                        break
-                    }
+            if (cll._0._type.isIterator || cll._0._type.isGoodArrayType) {
+                // Canonical names that end in `_` (e.g. `where_`, to avoid the daslang
+                // `where` keyword) need the suffix joined without a connecting `_`,
+                // since the actual function names are `where_to_array` / `where_inplace`
+                // (single underscore between the base and the suffix).
+                let sep : string = ends_with(callName, "_") ? "" : "_"
+                if (argIndex == 0 && top._type.isIterator && !cll._1.noToArrayVariant) {
+                    callName = "{callName}{sep}to_array"
+                } elif (argIndex != 0 && cll._1.inplace) {
+                    callName = "{callName}{sep}inplace"
+                    inplace = true
                 }
             }
-            if (found != null) {
-                var sub = subarray(calls, argIndex .. argIndex + found.calls |> length)
-                topValue = found.folder(argIndex, topValue, blk, sub)
-                argIndex += found.calls |> length
-            } else {
-                if (cll._0._type.isIterator || cll._0._type.isGoodArrayType) {
-                    if (argIndex == 0 && top._type.isIterator) {
-                        callName = "{callName}_to_array"
-                    } elif (argIndex != 0 && cll._1.inplace) {
-                        callName = "{callName}_inplace"
-                        inplace = true
-                    }
+            for (i in cll._1.recursive) {   // recurse into sub-pipeline args (e.g. zip's second arg)
+                if (i >= 1 && i < cll._0.arguments |> length && is_linq_call(cll._0.arguments[i])) {
+                    var argExpr = make_call(cll._0.at, "_fold")
+                    (argExpr as ExprCallMacro).arguments |> emplace <| cll._0.arguments[i]
+                    cll._0.arguments[i] = argExpr
                 }
-                for (i in cll._1.recursive) {   // this is where we make recursion
-                    if (i >= 1 && i < cll._0.arguments |> length && is_linq_call(cll._0.arguments[i])) {
-                        var argExpr = make_call(cll._0.at, recursiveMacroName)
-                        (argExpr as ExprCallMacro).arguments |> emplace <| cll._0.arguments[i]
-                        cll._0.arguments[i] = argExpr
-                    }
-                }
-                var newCall = qmacro($c(callName)($e(topValue)))
-                let numArgs = cll._0.arguments |> length
-                for (i in 1..numArgs) {
-                    (newCall as ExprCall).arguments |> emplace_new <| clone_expression(cll._0.arguments[i])
+            }
+            var newCall = qmacro($c(callName)($e(topValue)))
+            let numArgs = cll._0.arguments |> length
+            for (i in 1..numArgs) {
+                (newCall as ExprCall).arguments |> emplace_new <| clone_expression(cll._0.arguments[i])
+            }
+            if (inplace) {
+                blk.list |> emplace(newCall)
+            } else {
+                // Macro emits `var pass_N = call` and a later `return <- pass_N`;
+                // for single-pass chains this triggers PERF009 (move-into-then-return)
+                // at the user call site. The shape is load-bearing for the array-pipeline
+                // semantics (every stage binds so the next can reuse the buffer in-place),
+                // so suppress the rule inline below.
+                blk.list |> emplace_new <| qmacro_expr() {
+                    var $i(newArgName) = $e(newCall)  // nolint:PERF009
                 }
-                if (inplace) {
-                    blk.list |> emplace(newCall)
-                } else {
-                    // Macro emits `var pass_N = call` and a later `return <- pass_N`;
-                    // for single-pass chains this triggers PERF009 (move-into-then-return)
-                    // at the user call site. Rewriting to direct `return <- call` would
-                    // change the historical `_old_fold` baseline that benchmarks compare
-                    // against, so we keep the shape and suppress the rule inline below.
+                (blk.list.back() as ExprLet).variables[0].flags.can_shadow = true
+                if (argIndex != 0) {
                     blk.list |> emplace_new <| qmacro_expr() {
-                        var $i(newArgName) = $e(newCall)  // nolint:PERF009
+                        delete $e(topValue)
                     }
-                    (blk.list.back() as ExprLet).variables[0].flags.can_shadow = true
-                    if (argIndex != 0) {
-                        blk.list |> emplace_new <| qmacro_expr() {
-                            delete $e(topValue)
-                        }
-                    }
-                    topValue = qmacro($i(newArgName))
                 }
-                argIndex ++
+                topValue = qmacro($i(newArgName))
             }
+            argIndex ++
         }
     }
     if (expr._type.isIterator) {
@@ -1139,6 +982,174 @@ def private emit_early_exit_lane(
     return finalize_invoke(res, at)
 }
 
+[macro_function]
+def private order_top_n_call_name(orderName : string) : string {
+    //! Maps an order-family operator name to the corresponding ``top_n*`` helper from linq.das.
+    if (orderName == "order") return "top_n"
+    if (orderName == "order_descending") return "top_n_descending"
+    if (orderName == "order_by") return "top_n_by"
+    if (orderName == "order_by_descending") return "top_n_by_descending"
+    return ""
+}
+
+[macro_function]
+def private plan_order_family(var expr : Expression?) : Expression? {
+    //! Phase 3 splice planner for chains containing an order-family operator.
+    //! Recognized chain shapes (terminator can be the order/take op itself, or implicit
+    //! ``to_array`` which is ``skip=true`` so flatten_linq elides it):
+    //!
+    //! - ``src |> order[_descending]?``                                    → direct call
+    //! - ``src |> order_by[_descending]?(key)``                            → direct call
+    //! - ``src |> order[_by]?[_descending]?[(key)] |> take(K)``            → ``top_n*`` helper
+    //! - ``src |> where_*(p)+ |> order[_by]?[_descending]?[(key)]``        → fused prefilter + ``order*_inplace``
+    //! - ``src |> where_*(p)+ |> order[_by]?[_descending]?[(key)] |> take(K)`` → fused prefilter + ``top_n*``
+    //!
+    //! Combinations with ``select`` / ``skip`` / non-ARRAY terminators (sum/count/etc. after
+    //! the order op) are out of scope and fall through to cascade tier 2.
+    var (top, calls) = flatten_linq(expr)
+    if (empty(calls)) return null
+    top = peel_each(top)
+    var whereCond : Expression?
+    var orderName : string
+    var orderKey : Expression?
+    var orderElemType : TypeDeclPtr
+    var takeExpr : Expression?
+    var hasOrder = false
+    let at = calls[0]._0.at
+    let itName = "`it`{at.line}`{at.column}"
+    for (i in 0 .. length(calls)) {
+        var cll & = unsafe(calls[i])
+        let name = cll._1.name
+        if (name == "where_") {
+            if (hasOrder) return null   // where-after-order not in scope
+            var pred = fold_linq_cond(cll._0.arguments[1], itName)
+            if (whereCond == null) {
+                whereCond = pred
+            } else {
+                whereCond = qmacro($e(whereCond) && $e(pred))
+            }
+        } elif (name == "order" || name == "order_descending"
+                || name == "order_by" || name == "order_by_descending") {
+            if (hasOrder) return null
+            hasOrder = true
+            orderName = name
+            if ((cll._0.arguments |> length) >= 2) {
+                orderKey = clone_expression(cll._0.arguments[1])
+            }
+            orderElemType = clone_type(cll._0._type.firstType)
+        } elif (name == "take") {
+            if (!hasOrder || takeExpr != null) return null
+            var arg = cll._0.arguments[1]
+            if (arg == null || arg._type == null || arg._type.baseType != Type.tInt) return null
+            takeExpr = clone_expression(arg)
+        } else {
+            return null
+        }
+    }
+    if (!hasOrder) return null
+    let hasKey = orderName == "order_by" || orderName == "order_by_descending"
+    let needIterWrap = expr._type.isIterator
+    let topNName = order_top_n_call_name(orderName)
+    let inplaceName = "{orderName}_inplace"
+    if (whereCond == null) {
+        // No prefilter — direct call to daslib helper.
+        var topExpr = clone_expression(top)
+        topExpr.genFlags.alwaysSafe = true
+        var emission : Expression?
+        if (takeExpr == null) {
+            // Bare order family — emit the direct call. Same shape as plain LINQ, but via
+            // splice so `_fold` doesn't fall through to tier 2.
+            if (hasKey) {
+                emission = qmacro($c(orderName)($e(topExpr), $e(orderKey)))
+            } else {
+                emission = qmacro($c(orderName)($e(topExpr)))
+            }
+        } else {
+            // order + take → top_n* dispatch.
+            if (hasKey) {
+                emission = qmacro($c(topNName)($e(topExpr), $e(takeExpr), $e(orderKey)))
+            } else {
+                emission = qmacro($c(topNName)($e(topExpr), $e(takeExpr)))
+            }
+        }
+        // Wrap with to_sequence_move only when emission is array-shaped: take dispatches to
+        // top_n* (always returns array) or the source was peeled to an array. For an
+        // iterator source on the bare-order path, daslib's order_*(iter,…) overload already
+        // returns iterator — wrapping would compile-fail (to_sequence_move is array-only).
+        let emissionIsArray = takeExpr != null || top._type.isGoodArrayType
+        if (needIterWrap && emissionIsArray) {
+            emission = qmacro($e(emission).to_sequence_move())
+        }
+        emission.force_generated(true)
+        return emission
+    }
+    // where_* + order_*[+take] — emit a single fused loop that prefilters into a fresh
+    // buffer, then sorts in place (no take) or extracts top-N (take). This collapses what
+    // plain LINQ would do as two array allocations + iterator dispatch into one allocation
+    // and one sort pass.
+    let srcName = "`source`{at.line}`{at.column}"
+    let bufName = "`buf`{at.line}`{at.column}"
+    var srcParamType = invoke_src_param_type(top)
+    var topExpr = clone_expression(top)
+    topExpr.genFlags.alwaysSafe = true
+    var bufElemType = clone_type(orderElemType)
+    var loopBody = qmacro_expr() {
+        if ($e(whereCond)) {
+            $i(bufName) |> push_clone($i(itName))
+        }
+    }
+    var stmts : array<Expression?>
+    stmts |> push <| qmacro_expr() {
+        var $i(bufName) : array<$t(bufElemType)>
+    }
+    if (type_has_length(top._type)) {
+        stmts |> push <| qmacro_expr() {
+            $i(bufName) |> reserve(length($i(srcName)))
+        }
+    }
+    stmts |> push <| qmacro_expr() {
+        for ($i(itName) in $i(srcName)) {
+            $e(loopBody)
+        }
+    }
+    if (takeExpr == null) {
+        // Sort the prefilter buffer in place and return it. order*_inplace is void
+        // (mutates the buffer in place), so we move the buffer out for the final result.
+        var sortCall : Expression?
+        if (hasKey) {
+            sortCall = qmacro($c(inplaceName)($i(bufName), $e(orderKey)))
+        } else {
+            sortCall = qmacro($c(inplaceName)($i(bufName)))
+        }
+        stmts |> push(sortCall)
+        stmts |> push <| qmacro_expr() {
+            return <- $i(bufName)
+        }
+    } else {
+        // top_n* on the prefilter buffer.
+        var topNCall : Expression?
+        if (hasKey) {
+            topNCall = qmacro($c(topNName)($i(bufName), $e(takeExpr), $e(orderKey)))
+        } else {
+            topNCall = qmacro($c(topNName)($i(bufName), $e(takeExpr)))
+        }
+        stmts |> push <| qmacro_expr() {
+            return <- $e(topNCall)
+        }
+    }
+    var bodyBlock = new ExprBlock(at = at)
+    for (s in stmts) {
+        bodyBlock.list |> emplace(s)
+    }
+    var emission : Expression? = qmacro(invoke($($i(srcName) : $t(srcParamType)) {
+        $e(bodyBlock);
+    }, $e(topExpr)))
+    if (needIterWrap) {
+        emission = qmacro($e(emission).to_sequence_move())
+    }
+    return finalize_invoke(emission, at)
+}
+
 [macro_function]
 def private plan_loop_or_count(var expr : Expression?) : Expression? {
     // Phase-2C loop planner. Recognizes chains of shape `[where_*][select*][skip?][take?]`
@@ -1185,10 +1196,29 @@ def private plan_loop_or_count(var expr : Expression?) : Expression? {
         var cll & = unsafe(calls[i])
         let opName = cll._1.name
         if (opName == "where_") {
-            // where-after-select / -after-skip / -after-take is rejected — canonical chain
-            // order is [where_*][select*][skip?][take?].
-            if (seenSelect || seenSkip || seenTake) return null
-            var predicate = fold_linq_cond(cll._0.arguments[1], itName)
+            // skip/take-after-where is rejected — canonical chain order is
+            // [where_*][select*][skip?][take?].
+            if (seenSkip || seenTake) return null
+            var predicate : Expression?
+            if (seenSelect) {
+                // Phase 3d: where-after-select. Substitute the predicate's bound variable
+                // with the current projection via peel-aware substitution. The substitution
+                // inlines the projection into the predicate, which would re-evaluate any
+                // side effects (since the terminator also references projection) — bail to
+                // tier 2 cascade on side-effecty projections.
+                //
+                // KNOWN PERF GAP (deferred to splice-with-cmp follow-up PR): pure projections
+                // currently re-evaluate per element for ARRAY/ACCUMULATOR/EARLY_EXIT lanes —
+                // once in the inlined predicate and once in valueExpr. COUNTER lane is unaffected
+                // (no body use). Fix shape: emit a pre-condition `var v := projection` bind in
+                // the loop body (outside the if-wrap) and rewrite both predicate and valueExpr
+                // to reference `v`. Bundled with the `_with_cmp` inline-key follow-up since
+                // both share the "single-eval splice" theme.
+                if (has_sideeffects(projection)) return null
+                predicate = fold_linq_cond_peel(cll._0.arguments[1], projection)
+            } else {
+                predicate = fold_linq_cond(cll._0.arguments[1], itName)
+            }
             if (whereCond == null) {
                 whereCond = predicate
             } else {
@@ -1337,40 +1367,40 @@ def private plan_loop_or_count(var expr : Expression?) : Expression? {
 
 [call_macro(name="_fold")]
 class private LinqFold : AstCallMacro {
-    //! implements _fold(expression) that folds LINQ expressions into optimized sequnences
-    //! for example::
+    //! ``_fold(expr)`` — three-tier cascade over a LINQ chain.
+    //!
+    //! 1. **Splice** (``plan_loop_or_count``) — fused for-loop with predicates
+    //!    and projections inlined, when the chain matches a recognized pattern.
+    //! 2. **Array-shape pipeline** (``fold_linq_default``) — ``call → array →
+    //!    call → array`` with ``_inplace`` reuse and explicit ``delete`` of
+    //!    previous stages, when the chain has linq operators but splice can't fire.
+    //! 3. **Raw clone** — passthrough when ``flatten_linq`` finds no recognized
+    //!    linq operators in the chain.
+    //!
+    //! All tiers preserve semantics. ``_fold(chain)`` is observationally equivalent
+    //! to ``chain``, just faster when patterns match. Always safe to apply.
+    //!
+    //! Example::
     //!
-    //!     _fold(each(foo)._where(_ > 5)._select(_ * 2))
+    //!     _fold(each(foo)._where(_ > 5)._select(_ * 2).sum())
     //!
-    //! expands into a single comprehension that does all operations in one pass
+    //! Expands tier-1 into a single fused loop accumulating the sum directly.
     def override visit(prog : ProgramPtr; mod : Module?; var call : ExprCallMacro?) : Expression? {
-        //! Visits the _fold macro call and folds LINQ expressions into optimized sequences.
+        //! Visits the _fold macro call and runs the three-tier cascade.
         macro_verify(call.arguments |> length == 1, prog, call.at, "expecting _fold(expression)")
         macro_verify(call.arguments[0]._type != null, prog, call.at, "expecting linq expression")
-        var res : Expression? = plan_loop_or_count(call.arguments[0])
+        // Tier 1 splice — try the dedicated order-family planner first (handles
+        // chains containing order / order_by / order_descending / order_by_descending,
+        // optionally followed by take(K)), then the general loop planner for the
+        // [where_*][select*][skip?][take?] + terminator shapes.
+        var res : Expression? = plan_order_family(call.arguments[0])
         if (res != null) return res
+        res = plan_loop_or_count(call.arguments[0])
+        if (res != null) return res
+        // Tier 2 — array-shape pipeline with `_inplace` reuse + explicit `delete`.
+        res = fold_linq_default(call.arguments[0])
+        if (res != null) return res
+        // Tier 3 — raw passthrough.
         return clone_expression(call.arguments[0])
     }
 }
-
-[call_macro(name="_old_fold")]
-class private LinqOldFold : AstCallMacro {
-    //! Frozen pre-rewrite baseline of ``_fold``. Same expansion as ``_fold``
-    //! today; the two diverge only when later PRs add splice-mode fusion to
-    //! ``_fold``. Benchmarks compare ``_fold`` vs ``_old_fold`` to track
-    //! performance work — ``_old_fold`` must keep producing the historical
-    //! shape so the comparison stays meaningful.
-    //! for example::
-    //!
-    //!     _old_fold(each(foo)._where(_ > 5)._select(_ * 2))
-    def override visit(prog : ProgramPtr; mod : Module?; var call : ExprCallMacro?) : Expression? {
-        macro_verify(call.arguments |> length == 1, prog, call.at, "expecting _old_fold(expression)")
-        macro_verify(call.arguments[0]._type != null, prog, call.at, "expecting linq expression")
-        var res : Expression? = fold_linq_default(call.arguments[0], "_old_fold")
-        if (res == null) {
-            prog |> macro_error(call.at, "cannot fold LINQ expression\n{describe(call.arguments[0])}")
-            return res
-        }
-        return res
-    }
-}
diff --git a/daslib/quote.das b/daslib/quote.das
index 9e18d5dc1f..95d3fc570e 100644
--- a/daslib/quote.das
+++ b/daslib/quote.das
@@ -25,10 +25,10 @@ def find_unique_function_ptr(mod : Module?; name : string; canfail : bool = fals
 /*
  * We need this wrappers for initialization of Handle types
  */
-struct CaptureEntryInitData {
+struct public CaptureEntryInitData {
     //! Initialization data for a captured variable entry.
-    name : string
-    mode : CaptureMode
+    name : string       //! Variable name being captured.
+    mode : CaptureMode  //! Capture mode (copy, reference, move, or clone).
 }
 
 def public clone(var a : dasvector`CaptureEntry; b : array<CaptureEntryInitData>) {
@@ -79,10 +79,10 @@ def public clone(var args : rtti::AnnotationList; var nargs : array<rtti::Annota
 }
 
 
-struct EnumEntryInitData {
+struct public EnumEntryInitData {
     //! Initialization data for a quoted enum entry.
-    name : string
-    cppName : string
+    name : string     //! Enum entry name.
+    cppName : string  //! C++-side name for the entry (used when generating bindings).
     // at : LineInfo
     // value : ExpressionPtr
 }
@@ -97,14 +97,14 @@ def public clone(var a : dasvector`EnumEntry; var b : array<EnumEntryInitData>)
     }
 }
 
-struct AnnotationArgumentInitData {
+struct public AnnotationArgumentInitData {
     //! Initialization data for a quoted annotation argument.
-    basicType : Type
-    name : string
-    sValue : string
-    bValue : bool
-    iValue : int
-    fValue : float
+    basicType : Type  //! Argument type tag (string / bool / int / float / etc.).
+    name : string     //! Argument name as written in the annotation.
+    sValue : string   //! String value (used when basicType is tString).
+    bValue : bool     //! Bool value (used when basicType is tBool).
+    iValue : int      //! Int value (used when basicType is tInt).
+    fValue : float    //! Float value (used when basicType is tFloat).
     // @safe_when_uninitialized at: LineInfo // todo: add if needed
 }
 
diff --git a/daslib/rst.das b/daslib/rst.das
index 78edfa77ca..24e2669428 100644
--- a/daslib/rst.das
+++ b/daslib/rst.das
@@ -1879,7 +1879,7 @@ def document_functions(doc_file : file; mods : array<Module?>; var groups : arra
                 if (grp.hidden) {
                     document_warning(doc_file, "This group of functions is hidden. It will not be in the final documentation.")
                 }
-                grp.func |> sort($(a, b) => function_name(a.fn) < function_name(b.fn))
+                grp.func |> sort($(a, b) => rst_describe_function_short(a.fn) < rst_describe_function_short(b.fn))
                 for (func in grp.func) {
                     if (!key_exists(tab, func.fn)) {
                         let descr = rst_describe_function_short(func.fn)
diff --git a/daslib/templates_boost.das b/daslib/templates_boost.das
index e54b94973f..527a6f5a16 100644
--- a/daslib/templates_boost.das
+++ b/daslib/templates_boost.das
@@ -27,6 +27,7 @@ struct Template {
     field2name : table<string; string>                                          //! field name replacement rules
     var2name : table<string; string>                                            //! variable name replacement rules
     var2expr : table<string; Expression?>                                        //! variable expression replacement rules
+    var2exprPeeling : table<string; Expression?>                                 //! variable expression replacement rules that also peel ExprRef2Value wrappers (use on typed AST)
     var2exprList : table<string; array<Expression?>>                             //! variable expression list replacement rules
     type2type : table<string; string>                                           //! type name replacement rules
     type2etype : table<string; TypeDeclPtr>                                     //! type to type declaration replacement rules
@@ -48,6 +49,20 @@ def replaceVariable(var self : Template; name : string; var expr : ast::Expressi
     self.var2expr |> emplace(name, expr)
 }
 
+def replaceVariablePeeling(var self : Template; name : string; var expr : ast::Expression?) {
+    //! Variant of ``replaceVariable`` for substituting into typed AST. Replaces
+    //! ``ExprRef2Value(ExprVar(name))`` with the replacement directly — stripping
+    //! the wrapper that the typer inserts around reference-typed variable reads.
+    //! Use this when the destination AST has already been through type inference
+    //! (e.g. inside a ``call_macro`` body); plain ``replaceVariable`` leaves the
+    //! ``ExprRef2Value`` orphaned around a non-reference value and fails with
+    //! ``can only dereference a reference``.
+    //! Do not register the same name with both ``replaceVariable`` and
+    //! ``replaceVariablePeeling`` — bottom-up traversal makes the bare-var rule win
+    //! and re-introduces the orphan-wrapper bug.
+    self.var2exprPeeling |> emplace(name, expr)
+}
+
 def replaceVarTag(var self : Template; name : string; var expr : ast::Expression?) {
     //! Adds a rule to the template to replace a variable tag with an expression.
     self.tag2expr |> emplace(name, expr)
@@ -239,6 +254,22 @@ class private TemplateVisitor : AstVisitor {
         }
         return expr
     }
+    def override visitExprRef2Value(var expr : ExprRef2Value?) : Expression? {
+        //! Peeling substitution for typed AST. When the inner ExprVar matches a
+        //! ``var2exprPeeling`` rule, returns the replacement directly — stripping
+        //! the typer-inserted ExprRef2Value wrapper that would otherwise be left
+        //! orphaned around a non-reference value.
+        if (expr.subexpr is ExprVar) {
+            let vvar = expr.subexpr as ExprVar
+            let vn = string(vvar.name)
+            if (key_exists(rules.var2exprPeeling, vn)) {
+                var rexpr = clone_expression(unsafe(rules.var2exprPeeling[vn]))
+                rexpr.at = expr.at
+                return rexpr
+            }
+        }
+        return expr
+    }
     def replaceAlias(var typ : TypeDeclPtr&) {
         //! Replaces type aliases in the given type declaration according to the template rules.
         if (typ.baseType == Type.alias) {
diff --git a/doc/source/_templates/layout.html b/doc/source/_templates/layout.html
new file mode 100644
index 0000000000..81d71cd348
--- /dev/null
+++ b/doc/source/_templates/layout.html
@@ -0,0 +1,32 @@
+{%- extends "!layout.html" %}
+
+{# Override sphinx_rtd_theme's sidebar brand so the logo links to the
+   daslang.io home page instead of pathto(_root_doc). Mirrors the upstream
+   block at sphinx_rtd_theme/layout.html sidebartitle, with the <a href>
+   swapped. Keep this file in sync with upstream when bumping the theme. #}
+
+{%- block sidebartitle %}
+
+  {%- set _logo_url = logo_url|default(pathto('_static/' + (logo or ""), 1)) %}
+  <a href="https://daslang.io"{% if not theme_logo_only %} class="icon icon-home"{% endif %}>
+    {% if not theme_logo_only %}{{ project }}{% endif %}
+    {%- if logo or logo_url %}
+      <img src="{{ _logo_url }}" class="logo" alt="{{ _('Logo') }}"/>
+    {%- endif %}
+  </a>
+
+  {%- if theme_display_version %}
+    {%- set nav_version = version %}
+    {%- if READTHEDOCS and current_version %}
+      {%- set nav_version = current_version %}
+    {%- endif %}
+    {%- if nav_version %}
+      <div class="version">
+        {{ nav_version }}
+      </div>
+    {%- endif %}
+  {%- endif %}
+
+  {%- include "searchbox.html" %}
+
+{%- endblock %}
diff --git a/doc/source/stdlib/handmade/function-ast-add_block_annotation-0xe2c30f4c26f7256d.rst b/doc/source/stdlib/handmade/function-ast-add_block_annotation-0xe2c30f4c26f7256d.rst
new file mode 100644
index 0000000000..59d9bea1a2
--- /dev/null
+++ b/doc/source/stdlib/handmade/function-ast-add_block_annotation-0xe2c30f4c26f7256d.rst
@@ -0,0 +1 @@
+Attaches an annotation to an expression block and calls the annotation's ``apply`` method.
diff --git a/doc/source/stdlib/handmade/function-ast-add_function_annotation-0x3168617e90c60bf4.rst b/doc/source/stdlib/handmade/function-ast-add_function_annotation-0x3168617e90c60bf4.rst
new file mode 100644
index 0000000000..17e3da5cbf
--- /dev/null
+++ b/doc/source/stdlib/handmade/function-ast-add_function_annotation-0x3168617e90c60bf4.rst
@@ -0,0 +1 @@
+Attaches a function annotation to a function and calls the annotation's ``apply`` method.
diff --git a/doc/source/stdlib/handmade/function-ast-add_function_annotation-0xc3e601b841da4875.rst b/doc/source/stdlib/handmade/function-ast-add_function_annotation-0xc3e601b841da4875.rst
new file mode 100644
index 0000000000..17e3da5cbf
--- /dev/null
+++ b/doc/source/stdlib/handmade/function-ast-add_function_annotation-0xc3e601b841da4875.rst
@@ -0,0 +1 @@
+Attaches a function annotation to a function and calls the annotation's ``apply`` method.
diff --git a/doc/source/stdlib/handmade/function-ast-add_structure_annotation-0x3d3e18643eb661d4.rst b/doc/source/stdlib/handmade/function-ast-add_structure_annotation-0x3d3e18643eb661d4.rst
new file mode 100644
index 0000000000..582cf180f4
--- /dev/null
+++ b/doc/source/stdlib/handmade/function-ast-add_structure_annotation-0x3d3e18643eb661d4.rst
@@ -0,0 +1 @@
+Attaches a structure annotation to the specified structure. The annotation is moved from the provided smart pointer.
\ No newline at end of file
diff --git a/doc/source/stdlib/handmade/function-ast-add_structure_annotation-0xcb281ad07b7d69b7.rst b/doc/source/stdlib/handmade/function-ast-add_structure_annotation-0xcb281ad07b7d69b7.rst
new file mode 100644
index 0000000000..e424f2f1ae
--- /dev/null
+++ b/doc/source/stdlib/handmade/function-ast-add_structure_annotation-0xcb281ad07b7d69b7.rst
@@ -0,0 +1 @@
+Attaches a structure annotation to the specified module. The annotation is moved from the provided smart pointer.
\ No newline at end of file
diff --git a/doc/source/stdlib/handmade/function-ast-for_each_generic-0x65b9a789781d266.rst b/doc/source/stdlib/handmade/function-ast-for_each_generic-0x65b9a789781d266.rst
new file mode 100644
index 0000000000..1946078d4b
--- /dev/null
+++ b/doc/source/stdlib/handmade/function-ast-for_each_generic-0x65b9a789781d266.rst
@@ -0,0 +1 @@
+Iterates through each generic function in the given module.
diff --git a/doc/source/stdlib/handmade/function-ast-force_at-0x3d0b3ed9973e1ca1.rst b/doc/source/stdlib/handmade/function-ast-force_at-0x3d0b3ed9973e1ca1.rst
new file mode 100644
index 0000000000..0104bb9fd8
--- /dev/null
+++ b/doc/source/stdlib/handmade/function-ast-force_at-0x3d0b3ed9973e1ca1.rst
@@ -0,0 +1 @@
+Replaces line info in an expression, its subexpressions, and their types.
diff --git a/doc/source/stdlib/handmade/function-ast-force_generated-0xf168208a0bc75a54.rst b/doc/source/stdlib/handmade/function-ast-force_generated-0xf168208a0bc75a54.rst
new file mode 100644
index 0000000000..0f6b2e3abf
--- /dev/null
+++ b/doc/source/stdlib/handmade/function-ast-force_generated-0xf168208a0bc75a54.rst
@@ -0,0 +1 @@
+Sets the generated flag on an expression and its subexpressions.
diff --git a/doc/source/stdlib/handmade/function-ast-make_visitor-0x1063140cba10265f.rst b/doc/source/stdlib/handmade/function-ast-make_visitor-0x1063140cba10265f.rst
new file mode 100644
index 0000000000..45657858cd
--- /dev/null
+++ b/doc/source/stdlib/handmade/function-ast-make_visitor-0x1063140cba10265f.rst
@@ -0,0 +1 @@
+Creates an adapter for the AstVisitor interface.
diff --git a/doc/source/stdlib/handmade/function-ast-visit-0x292f09ea508219f2.rst b/doc/source/stdlib/handmade/function-ast-visit-0x292f09ea508219f2.rst
new file mode 100644
index 0000000000..80bea2caa2
--- /dev/null
+++ b/doc/source/stdlib/handmade/function-ast-visit-0x292f09ea508219f2.rst
@@ -0,0 +1 @@
+Visit the program with the given visitor adapter. When sortStructures is true, struct declarations are visited in topological (dependency) order, ensuring that structs used by value in fields are visited before the structs that contain them.
diff --git a/doc/source/stdlib/handmade/function-ast-visit-0x9717f9b46ea50aef.rst b/doc/source/stdlib/handmade/function-ast-visit-0x9717f9b46ea50aef.rst
new file mode 100644
index 0000000000..21542c832e
--- /dev/null
+++ b/doc/source/stdlib/handmade/function-ast-visit-0x9717f9b46ea50aef.rst
@@ -0,0 +1 @@
+Invokes an AST visitor on the given object.
diff --git a/doc/source/stdlib/handmade/function-ast_boost-add_annotation_argument-0x51c8a49842980915.rst b/doc/source/stdlib/handmade/function-ast_boost-add_annotation_argument-0x51c8a49842980915.rst
new file mode 100644
index 0000000000..9dbeabf21a
--- /dev/null
+++ b/doc/source/stdlib/handmade/function-ast_boost-add_annotation_argument-0x51c8a49842980915.rst
@@ -0,0 +1 @@
+Adds a typed annotation argument (``bool``, ``int``, ``float``, ``string``, or ``AnnotationArgument``) to an ``AnnotationArgumentList`` and returns the new argument index.
diff --git a/doc/source/stdlib/handmade/function-ast_boost-add_annotation_argument-0xb811a1eea4b2377f.rst b/doc/source/stdlib/handmade/function-ast_boost-add_annotation_argument-0xb811a1eea4b2377f.rst
new file mode 100644
index 0000000000..9dbeabf21a
--- /dev/null
+++ b/doc/source/stdlib/handmade/function-ast_boost-add_annotation_argument-0xb811a1eea4b2377f.rst
@@ -0,0 +1 @@
+Adds a typed annotation argument (``bool``, ``int``, ``float``, ``string``, or ``AnnotationArgument``) to an ``AnnotationArgumentList`` and returns the new argument index.
diff --git a/doc/source/stdlib/handmade/function-ast_boost-append_annotation-0x76aeebadb8a3f4cc.rst b/doc/source/stdlib/handmade/function-ast_boost-append_annotation-0x76aeebadb8a3f4cc.rst
new file mode 100644
index 0000000000..221d0cde55
--- /dev/null
+++ b/doc/source/stdlib/handmade/function-ast_boost-append_annotation-0x76aeebadb8a3f4cc.rst
@@ -0,0 +1 @@
+Creates an ``AnnotationDeclaration`` for the named annotation (with optional typed arguments) and attaches it to a ``Function``, ``ExprBlock``, or ``Structure``.
diff --git a/doc/source/stdlib/handmade/function-ast_boost-append_annotation-0xf9e82b1cfd06dde8.rst b/doc/source/stdlib/handmade/function-ast_boost-append_annotation-0xf9e82b1cfd06dde8.rst
new file mode 100644
index 0000000000..221d0cde55
--- /dev/null
+++ b/doc/source/stdlib/handmade/function-ast_boost-append_annotation-0xf9e82b1cfd06dde8.rst
@@ -0,0 +1 @@
+Creates an ``AnnotationDeclaration`` for the named annotation (with optional typed arguments) and attaches it to a ``Function``, ``ExprBlock``, or ``Structure``.
diff --git a/doc/source/stdlib/handmade/function-ast_boost-convert_to_expression-0x440771f18187d2d6.rst b/doc/source/stdlib/handmade/function-ast_boost-convert_to_expression-0x440771f18187d2d6.rst
new file mode 100644
index 0000000000..8b8c840e11
--- /dev/null
+++ b/doc/source/stdlib/handmade/function-ast_boost-convert_to_expression-0x440771f18187d2d6.rst
@@ -0,0 +1 @@
+Converts a runtime value of any supported type to an equivalent AST ``ExpressionPtr`` that would produce that value when compiled, using ``typeinfo`` for reflection.
diff --git a/doc/source/stdlib/handmade/function-ast_boost-convert_to_expression-0xa246e48166a245b5.rst b/doc/source/stdlib/handmade/function-ast_boost-convert_to_expression-0xa246e48166a245b5.rst
new file mode 100644
index 0000000000..8b8c840e11
--- /dev/null
+++ b/doc/source/stdlib/handmade/function-ast_boost-convert_to_expression-0xa246e48166a245b5.rst
@@ -0,0 +1 @@
+Converts a runtime value of any supported type to an equivalent AST ``ExpressionPtr`` that would produce that value when compiled, using ``typeinfo`` for reflection.
diff --git a/doc/source/stdlib/handmade/function-ast_boost-describe-0xd7d2b188d1cbcad2.rst b/doc/source/stdlib/handmade/function-ast_boost-describe-0xd7d2b188d1cbcad2.rst
new file mode 100644
index 0000000000..ac6b4699cf
--- /dev/null
+++ b/doc/source/stdlib/handmade/function-ast_boost-describe-0xd7d2b188d1cbcad2.rst
@@ -0,0 +1 @@
+Returns a human-readable textual representation of an AST object (``AnnotationArgumentList``, ``AnnotationDeclaration``, ``AnnotationList``, ``Variable``, or ``Expression``).
diff --git a/doc/source/stdlib/handmade/function-ast_boost-emplace_new-0xc5547f245f0a7950.rst b/doc/source/stdlib/handmade/function-ast_boost-emplace_new-0xc5547f245f0a7950.rst
new file mode 100644
index 0000000000..7054963eb6
--- /dev/null
+++ b/doc/source/stdlib/handmade/function-ast_boost-emplace_new-0xc5547f245f0a7950.rst
@@ -0,0 +1 @@
+Moves a newly created pointer (Expression, TypeDecl, Variable, or MakeFieldDecl) into a vector container.
diff --git a/doc/source/stdlib/handmade/function-ast_boost-emplace_new-0xfecbf5e79f9ef529.rst b/doc/source/stdlib/handmade/function-ast_boost-emplace_new-0xfecbf5e79f9ef529.rst
new file mode 100644
index 0000000000..7054963eb6
--- /dev/null
+++ b/doc/source/stdlib/handmade/function-ast_boost-emplace_new-0xfecbf5e79f9ef529.rst
@@ -0,0 +1 @@
+Moves a newly created pointer (Expression, TypeDecl, Variable, or MakeFieldDecl) into a vector container.
diff --git a/doc/source/stdlib/handmade/function-ast_boost-find_arg-0xb42e148158db8035.rst b/doc/source/stdlib/handmade/function-ast_boost-find_arg-0xb42e148158db8035.rst
new file mode 100644
index 0000000000..00dc550dad
--- /dev/null
+++ b/doc/source/stdlib/handmade/function-ast_boost-find_arg-0xb42e148158db8035.rst
@@ -0,0 +1 @@
+Searches an ``AnnotationArgumentList`` for an argument by name and returns its ``RttiValue``; returns ``nothing`` if the argument is not present.
diff --git a/doc/source/stdlib/handmade/function-ast_boost-get_for_source_index-0x9d18bc04187fa6df.rst b/doc/source/stdlib/handmade/function-ast_boost-get_for_source_index-0x9d18bc04187fa6df.rst
new file mode 100644
index 0000000000..6345592636
--- /dev/null
+++ b/doc/source/stdlib/handmade/function-ast_boost-get_for_source_index-0x9d18bc04187fa6df.rst
@@ -0,0 +1 @@
+Returns the zero-based index of a given iterator variable or source expression within a ``for`` loop's source list, or ``-1`` if not found.
diff --git a/doc/source/stdlib/handmade/function-ast_boost-setup_call_list-0x11b87ba9b2952138.rst b/doc/source/stdlib/handmade/function-ast_boost-setup_call_list-0x11b87ba9b2952138.rst
new file mode 100644
index 0000000000..a96767d35b
--- /dev/null
+++ b/doc/source/stdlib/handmade/function-ast_boost-setup_call_list-0x11b87ba9b2952138.rst
@@ -0,0 +1 @@
+Creates or locates a compilation-phase setup function (``__setup_macros``) and returns its body ``ExprBlock`` so callers can append registration calls to it.
diff --git a/doc/source/stdlib/handmade/function-builtin-capacity-0xd4b03bedd843ee55.rst b/doc/source/stdlib/handmade/function-builtin-capacity-0xd4b03bedd843ee55.rst
new file mode 100644
index 0000000000..a388ea2975
--- /dev/null
+++ b/doc/source/stdlib/handmade/function-builtin-capacity-0xd4b03bedd843ee55.rst
@@ -0,0 +1 @@
+Returns the current capacity of the `table` — the number of key-value pairs it can hold before triggering a reallocation.
diff --git a/doc/source/stdlib/handmade/function-builtin-clz-0x8d9e0ebec5d1d935.rst b/doc/source/stdlib/handmade/function-builtin-clz-0x8d9e0ebec5d1d935.rst
new file mode 100644
index 0000000000..97f27db4fd
--- /dev/null
+++ b/doc/source/stdlib/handmade/function-builtin-clz-0x8d9e0ebec5d1d935.rst
@@ -0,0 +1 @@
+Counts the number of leading zero bits in the 64-bit unsigned integer `bits`, returning 64 if the value is zero.
diff --git a/doc/source/stdlib/handmade/function-builtin-ctz-0x69fe50d3a2139135.rst b/doc/source/stdlib/handmade/function-builtin-ctz-0x69fe50d3a2139135.rst
new file mode 100644
index 0000000000..5c81c0b6fa
--- /dev/null
+++ b/doc/source/stdlib/handmade/function-builtin-ctz-0x69fe50d3a2139135.rst
@@ -0,0 +1 @@
+Counts the number of trailing zero bits in the 64-bit unsigned integer `bits`, returning 64 if the value is zero.
diff --git a/doc/source/stdlib/handmade/function-builtin-emplace-0xb095b9def9d6f7ff.rst b/doc/source/stdlib/handmade/function-builtin-emplace-0xb095b9def9d6f7ff.rst
new file mode 100644
index 0000000000..a0e123aa50
--- /dev/null
+++ b/doc/source/stdlib/handmade/function-builtin-emplace-0xb095b9def9d6f7ff.rst
@@ -0,0 +1 @@
+Moves `val` into the table `Tab` under `at` using move semantics. If the key already exists, its value is replaced.
diff --git a/doc/source/stdlib/handmade/function-builtin-emplace-0xbe9c028b6703e2eb.rst b/doc/source/stdlib/handmade/function-builtin-emplace-0xbe9c028b6703e2eb.rst
new file mode 100644
index 0000000000..93d9b3841b
--- /dev/null
+++ b/doc/source/stdlib/handmade/function-builtin-emplace-0xbe9c028b6703e2eb.rst
@@ -0,0 +1 @@
+Moves `value` into the dynamic array `Arr` using move semantics, appending it to the end.
diff --git a/doc/source/stdlib/handmade/function-builtin-emplace_new-0x8231c1b6abc987ed.rst b/doc/source/stdlib/handmade/function-builtin-emplace_new-0x8231c1b6abc987ed.rst
new file mode 100644
index 0000000000..c5d699f40a
--- /dev/null
+++ b/doc/source/stdlib/handmade/function-builtin-emplace_new-0x8231c1b6abc987ed.rst
@@ -0,0 +1 @@
+Moves a smart pointer `value` into the end of the array `Arr`, constructing the entry in-place and returning a reference to it.
diff --git a/doc/source/stdlib/handmade/function-builtin-empty-0x81cd7e89966250aa.rst b/doc/source/stdlib/handmade/function-builtin-empty-0x81cd7e89966250aa.rst
new file mode 100644
index 0000000000..514644844f
--- /dev/null
+++ b/doc/source/stdlib/handmade/function-builtin-empty-0x81cd7e89966250aa.rst
@@ -0,0 +1 @@
+Checks whether the array `a` has no elements and returns `true` if so.
diff --git a/doc/source/stdlib/handmade/function-builtin-empty-0xbe253756aa873b39.rst b/doc/source/stdlib/handmade/function-builtin-empty-0xbe253756aa873b39.rst
new file mode 100644
index 0000000000..2e67dd8cb4
--- /dev/null
+++ b/doc/source/stdlib/handmade/function-builtin-empty-0xbe253756aa873b39.rst
@@ -0,0 +1 @@
+Checks whether the `das_string` is empty (has zero length) and returns `true` if so.
diff --git a/doc/source/stdlib/handmade/function-builtin-erase-0x7c2022bcacdb8bfb.rst b/doc/source/stdlib/handmade/function-builtin-erase-0x7c2022bcacdb8bfb.rst
new file mode 100644
index 0000000000..bf410615eb
--- /dev/null
+++ b/doc/source/stdlib/handmade/function-builtin-erase-0x7c2022bcacdb8bfb.rst
@@ -0,0 +1 @@
+Removes the entry with key `at` from the table `Tab`, returning `true` if the key was found and erased.
diff --git a/doc/source/stdlib/handmade/function-builtin-find_index-0xaa7a03e480811a14.rst b/doc/source/stdlib/handmade/function-builtin-find_index-0xaa7a03e480811a14.rst
new file mode 100644
index 0000000000..8db9cf5302
--- /dev/null
+++ b/doc/source/stdlib/handmade/function-builtin-find_index-0xaa7a03e480811a14.rst
@@ -0,0 +1 @@
+Searches the dynamic array `arr` for the first occurrence of `key` and returns its index, or -1 if not found.
diff --git a/doc/source/stdlib/handmade/function-builtin-find_index_if-0x2a7d64edf20ed530.rst b/doc/source/stdlib/handmade/function-builtin-find_index_if-0x2a7d64edf20ed530.rst
new file mode 100644
index 0000000000..479c7d3bce
--- /dev/null
+++ b/doc/source/stdlib/handmade/function-builtin-find_index_if-0x2a7d64edf20ed530.rst
@@ -0,0 +1 @@
+Returns the index of the first element in the fixed array `arr` for which `blk` returns `true`, or -1 if no element matches.
diff --git a/doc/source/stdlib/handmade/function-builtin-fmt-0x30d182cc44314d47.rst b/doc/source/stdlib/handmade/function-builtin-fmt-0x30d182cc44314d47.rst
new file mode 100644
index 0000000000..1c77d79808
--- /dev/null
+++ b/doc/source/stdlib/handmade/function-builtin-fmt-0x30d182cc44314d47.rst
@@ -0,0 +1 @@
+Formats a `uint8` value as a string using the given `format` specifier (following libfmt / C++20 `std::format` syntax).
diff --git a/doc/source/stdlib/handmade/function-builtin-fmt-0xa58dca1a4b31543f.rst b/doc/source/stdlib/handmade/function-builtin-fmt-0xa58dca1a4b31543f.rst
new file mode 100644
index 0000000000..d370d525f5
--- /dev/null
+++ b/doc/source/stdlib/handmade/function-builtin-fmt-0xa58dca1a4b31543f.rst
@@ -0,0 +1 @@
+Formats a `double` value as a string using the given `format` specifier (following libfmt / C++20 `std::format` syntax).
diff --git a/doc/source/stdlib/handmade/function-builtin-get-0x88996a27083688e7.rst b/doc/source/stdlib/handmade/function-builtin-get-0x88996a27083688e7.rst
new file mode 100644
index 0000000000..b0c6e4aabd
--- /dev/null
+++ b/doc/source/stdlib/handmade/function-builtin-get-0x88996a27083688e7.rst
@@ -0,0 +1 @@
+Looks up `at` in the table `Tab` and, if found, invokes `blk` with a reference to the value; returns `true` if the key existed, `false` otherwise.
diff --git a/doc/source/stdlib/handmade/function-builtin-get_key-0x66f049658341b2f.rst b/doc/source/stdlib/handmade/function-builtin-get_key-0x66f049658341b2f.rst
new file mode 100644
index 0000000000..81a7ca3b03
--- /dev/null
+++ b/doc/source/stdlib/handmade/function-builtin-get_key-0x66f049658341b2f.rst
@@ -0,0 +1,4 @@
+Returns the key associated with a value reference obtained during table iteration.
+The value must be a reference from a ``values()`` iterator on the same table.
+Computes the key via O(1) pointer arithmetic on the parallel key/value arrays.
+Throws an error if the value pointer is not inside the table or points to a deleted slot.
diff --git a/doc/source/stdlib/handmade/function-builtin-get_ptr-0xd9b22a5efe75b76a.rst b/doc/source/stdlib/handmade/function-builtin-get_ptr-0xd9b22a5efe75b76a.rst
new file mode 100644
index 0000000000..6006fef481
--- /dev/null
+++ b/doc/source/stdlib/handmade/function-builtin-get_ptr-0xd9b22a5efe75b76a.rst
@@ -0,0 +1 @@
+Extracts a mutable raw pointer of type `TT?` from the given mutable `smart_ptr<TT>`, without affecting reference counting.
diff --git a/doc/source/stdlib/handmade/function-builtin-get_value-0xe0019f871848c7c6.rst b/doc/source/stdlib/handmade/function-builtin-get_value-0xe0019f871848c7c6.rst
new file mode 100644
index 0000000000..a4f1e406f4
--- /dev/null
+++ b/doc/source/stdlib/handmade/function-builtin-get_value-0xe0019f871848c7c6.rst
@@ -0,0 +1 @@
+Retrieves the fixed-size array value associated with key `at` from the mutable table `Tab`.
diff --git a/doc/source/stdlib/handmade/function-builtin-get_value-0xebee648e7bc16375.rst b/doc/source/stdlib/handmade/function-builtin-get_value-0xebee648e7bc16375.rst
new file mode 100644
index 0000000000..c8a6da43e0
--- /dev/null
+++ b/doc/source/stdlib/handmade/function-builtin-get_value-0xebee648e7bc16375.rst
@@ -0,0 +1 @@
+Retrieves the value associated with key `at` from the table `Tab`.
diff --git a/doc/source/stdlib/handmade/function-builtin-has_value-0xa2eb240161258fd9.rst b/doc/source/stdlib/handmade/function-builtin-has_value-0xa2eb240161258fd9.rst
new file mode 100644
index 0000000000..c26a870837
--- /dev/null
+++ b/doc/source/stdlib/handmade/function-builtin-has_value-0xa2eb240161258fd9.rst
@@ -0,0 +1 @@
+Consumes elements from the iterator `a` and returns `true` if any element equals `key`.
diff --git a/doc/source/stdlib/handmade/function-builtin-hash-0x6b65d6b207719a2a.rst b/doc/source/stdlib/handmade/function-builtin-hash-0x6b65d6b207719a2a.rst
new file mode 100644
index 0000000000..73c15e7e3d
--- /dev/null
+++ b/doc/source/stdlib/handmade/function-builtin-hash-0x6b65d6b207719a2a.rst
@@ -0,0 +1 @@
+Computes a 64-bit FNV-1a hash of the given `int8` value and returns it as `uint64`.
diff --git a/doc/source/stdlib/handmade/function-builtin-insert-0x185abb76a0513e5d.rst b/doc/source/stdlib/handmade/function-builtin-insert-0x185abb76a0513e5d.rst
new file mode 100644
index 0000000000..6fa5db626e
--- /dev/null
+++ b/doc/source/stdlib/handmade/function-builtin-insert-0x185abb76a0513e5d.rst
@@ -0,0 +1 @@
+Inserts the key `at` into the set-style table `Tab` (a table with `void` values), effectively adding to a set.
diff --git a/doc/source/stdlib/handmade/function-builtin-insert-0x78b40fd27b8e0342.rst b/doc/source/stdlib/handmade/function-builtin-insert-0x78b40fd27b8e0342.rst
new file mode 100644
index 0000000000..affa2705d0
--- /dev/null
+++ b/doc/source/stdlib/handmade/function-builtin-insert-0x78b40fd27b8e0342.rst
@@ -0,0 +1 @@
+Inserts the value `val` into the table `Tab` under key `at`. If the key already exists, its value is replaced.
diff --git a/doc/source/stdlib/handmade/function-builtin-insert_clone-0x7081d5276cf33f3b.rst b/doc/source/stdlib/handmade/function-builtin-insert_clone-0x7081d5276cf33f3b.rst
new file mode 100644
index 0000000000..bb6e2bcb0d
--- /dev/null
+++ b/doc/source/stdlib/handmade/function-builtin-insert_clone-0x7081d5276cf33f3b.rst
@@ -0,0 +1 @@
+Inserts or updates an entry in the table `Tab` at key `at` by cloning the mutable value `val` into the table.
diff --git a/doc/source/stdlib/handmade/function-builtin-insert_default-0x975230cf25f8422f.rst b/doc/source/stdlib/handmade/function-builtin-insert_default-0x975230cf25f8422f.rst
new file mode 100644
index 0000000000..1683df9b6a
--- /dev/null
+++ b/doc/source/stdlib/handmade/function-builtin-insert_default-0x975230cf25f8422f.rst
@@ -0,0 +1 @@
+Inserts key `key` with the given const `value` into table `tab` only if the key does not already exist; existing entries are left unchanged.
diff --git a/doc/source/stdlib/handmade/function-builtin-interval-0x2a2dee6bf976b8ba.rst b/doc/source/stdlib/handmade/function-builtin-interval-0x2a2dee6bf976b8ba.rst
new file mode 100644
index 0000000000..3789fde5d2
--- /dev/null
+++ b/doc/source/stdlib/handmade/function-builtin-interval-0x2a2dee6bf976b8ba.rst
@@ -0,0 +1 @@
+Constructs a `urange` value from the two `uint` endpoints `arg0` (inclusive) and `arg1` (exclusive).
diff --git a/doc/source/stdlib/handmade/function-builtin-interval-0xa989f7317c978450.rst b/doc/source/stdlib/handmade/function-builtin-interval-0xa989f7317c978450.rst
new file mode 100644
index 0000000000..db2ed9cb36
--- /dev/null
+++ b/doc/source/stdlib/handmade/function-builtin-interval-0xa989f7317c978450.rst
@@ -0,0 +1 @@
+Constructs a `range64` value from the two `int64` endpoints `arg0` (inclusive) and `arg1` (exclusive).
diff --git a/doc/source/stdlib/handmade/function-builtin-intptr-0x5f5068da6629520e.rst b/doc/source/stdlib/handmade/function-builtin-intptr-0x5f5068da6629520e.rst
new file mode 100644
index 0000000000..73263d9ce2
--- /dev/null
+++ b/doc/source/stdlib/handmade/function-builtin-intptr-0x5f5068da6629520e.rst
@@ -0,0 +1 @@
+Converts a `smart_ptr` `p` to its `uint64` integer representation, useful for pointer arithmetic or serialization.
diff --git a/doc/source/stdlib/handmade/function-builtin-keys-0xa12ea9661997fb4b.rst b/doc/source/stdlib/handmade/function-builtin-keys-0xa12ea9661997fb4b.rst
new file mode 100644
index 0000000000..0e503d1318
--- /dev/null
+++ b/doc/source/stdlib/handmade/function-builtin-keys-0xa12ea9661997fb4b.rst
@@ -0,0 +1 @@
+Creates an iterator over all keys of the mutable table `a`, allowing enumeration of the table's key set.
diff --git a/doc/source/stdlib/handmade/function-builtin-length-0x12843bb9d4e8cff0.rst b/doc/source/stdlib/handmade/function-builtin-length-0x12843bb9d4e8cff0.rst
new file mode 100644
index 0000000000..c22c01bf39
--- /dev/null
+++ b/doc/source/stdlib/handmade/function-builtin-length-0x12843bb9d4e8cff0.rst
@@ -0,0 +1 @@
+Returns the number of elements currently stored in a table or dynamic array `a`.
diff --git a/doc/source/stdlib/handmade/function-builtin-lock_data-0x25ac18ae0d7c1a07.rst b/doc/source/stdlib/handmade/function-builtin-lock_data-0x25ac18ae0d7c1a07.rst
new file mode 100644
index 0000000000..c5cc1af9ef
--- /dev/null
+++ b/doc/source/stdlib/handmade/function-builtin-lock_data-0x25ac18ae0d7c1a07.rst
@@ -0,0 +1 @@
+Locks a constant array and invokes `blk` with a read-only pointer `p` to the array's contiguous data and its size `s`, allowing direct memory-level read access.
diff --git a/doc/source/stdlib/handmade/function-builtin-move-0xeb90a6e787315d09.rst b/doc/source/stdlib/handmade/function-builtin-move-0xeb90a6e787315d09.rst
new file mode 100644
index 0000000000..f5084a9c24
--- /dev/null
+++ b/doc/source/stdlib/handmade/function-builtin-move-0xeb90a6e787315d09.rst
@@ -0,0 +1 @@
+Moves the smart pointer `src` into the smart pointer `dest`, nullifying the previous contents of `dest` and transferring ownership from `src`.
diff --git a/doc/source/stdlib/handmade/function-builtin-popcnt-0xa3d5a34ac4ca4dde.rst b/doc/source/stdlib/handmade/function-builtin-popcnt-0xa3d5a34ac4ca4dde.rst
new file mode 100644
index 0000000000..928b8345dc
--- /dev/null
+++ b/doc/source/stdlib/handmade/function-builtin-popcnt-0xa3d5a34ac4ca4dde.rst
@@ -0,0 +1 @@
+Counts and returns the number of set (1) bits in the 64-bit unsigned integer `bits`.
diff --git a/doc/source/stdlib/handmade/function-builtin-push-0x1d7f5913856b5286.rst b/doc/source/stdlib/handmade/function-builtin-push-0x1d7f5913856b5286.rst
new file mode 100644
index 0000000000..c1bb2a2759
--- /dev/null
+++ b/doc/source/stdlib/handmade/function-builtin-push-0x1d7f5913856b5286.rst
@@ -0,0 +1 @@
+Appends a constant fixed-size array element `varr` to the end of `Arr`, an array of fixed-size arrays.
diff --git a/doc/source/stdlib/handmade/function-builtin-push-0x58b9f3c4ef6a2595.rst b/doc/source/stdlib/handmade/function-builtin-push-0x58b9f3c4ef6a2595.rst
new file mode 100644
index 0000000000..1620b8bb68
--- /dev/null
+++ b/doc/source/stdlib/handmade/function-builtin-push-0x58b9f3c4ef6a2595.rst
@@ -0,0 +1 @@
+Appends a constant `value` to the end of dynamic array `Arr`, copying it into place.
diff --git a/doc/source/stdlib/handmade/function-builtin-reserve-0x590234832f498366.rst b/doc/source/stdlib/handmade/function-builtin-reserve-0x590234832f498366.rst
new file mode 100644
index 0000000000..9dcc27f34b
--- /dev/null
+++ b/doc/source/stdlib/handmade/function-builtin-reserve-0x590234832f498366.rst
@@ -0,0 +1 @@
+Pre-allocates memory in `Tab` to hold at least `newSize` entries without rehashing, improving performance of subsequent insertions.
diff --git a/doc/source/stdlib/handmade/function-builtin-resize_and_init-0x7417623711ee1885.rst b/doc/source/stdlib/handmade/function-builtin-resize_and_init-0x7417623711ee1885.rst
new file mode 100644
index 0000000000..cb8f649b48
--- /dev/null
+++ b/doc/source/stdlib/handmade/function-builtin-resize_and_init-0x7417623711ee1885.rst
@@ -0,0 +1 @@
+Resizes dynamic array `Arr` to `newSize` elements, default-initializing any newly added elements.
diff --git a/doc/source/stdlib/handmade/function-builtin-smart_ptr_clone-0xeb12b0981444f24f.rst b/doc/source/stdlib/handmade/function-builtin-smart_ptr_clone-0xeb12b0981444f24f.rst
new file mode 100644
index 0000000000..151f8dc6a1
--- /dev/null
+++ b/doc/source/stdlib/handmade/function-builtin-smart_ptr_clone-0xeb12b0981444f24f.rst
@@ -0,0 +1 @@
+Clones the smart pointer `src` into smart pointer `dest`, incrementing the internal reference count to share ownership.
diff --git a/doc/source/stdlib/handmade/function-builtin-sort-0x23e0d148defd0ddc.rst b/doc/source/stdlib/handmade/function-builtin-sort-0x23e0d148defd0ddc.rst
new file mode 100644
index 0000000000..770f36da4f
--- /dev/null
+++ b/doc/source/stdlib/handmade/function-builtin-sort-0x23e0d148defd0ddc.rst
@@ -0,0 +1 @@
+Sorts a dynamic array in place in ascending order using the default comparison for its element type.
diff --git a/doc/source/stdlib/handmade/function-builtin-subarray-0x7af7560ee30bf5bc.rst b/doc/source/stdlib/handmade/function-builtin-subarray-0x7af7560ee30bf5bc.rst
new file mode 100644
index 0000000000..2eac2a649b
--- /dev/null
+++ b/doc/source/stdlib/handmade/function-builtin-subarray-0x7af7560ee30bf5bc.rst
@@ -0,0 +1 @@
+Creates and returns a new dynamic array containing a copy of elements from fixed-size array `a` within the unsigned range `r`.
diff --git a/doc/source/stdlib/handmade/function-builtin-subarray-0x8f48cc5efedc9022.rst b/doc/source/stdlib/handmade/function-builtin-subarray-0x8f48cc5efedc9022.rst
new file mode 100644
index 0000000000..83de7d6d74
--- /dev/null
+++ b/doc/source/stdlib/handmade/function-builtin-subarray-0x8f48cc5efedc9022.rst
@@ -0,0 +1 @@
+Returns a temporary sub-range of the dynamic array `a` defined by the signed range `r`, providing access to elements from `r.x` up to but not including `r.y`.
diff --git a/doc/source/stdlib/handmade/function-builtin-to_array_move-0x3356b325b05e43ff.rst b/doc/source/stdlib/handmade/function-builtin-to_array_move-0x3356b325b05e43ff.rst
new file mode 100644
index 0000000000..3588f70e19
--- /dev/null
+++ b/doc/source/stdlib/handmade/function-builtin-to_array_move-0x3356b325b05e43ff.rst
@@ -0,0 +1 @@
+Converts a mutable container `a` into a new dynamic array, moving elements when possible instead of cloning.
diff --git a/doc/source/stdlib/handmade/function-builtin-to_table-0x134c6d3cb7d1db2a.rst b/doc/source/stdlib/handmade/function-builtin-to_table-0x134c6d3cb7d1db2a.rst
new file mode 100644
index 0000000000..a0cbbb1215
--- /dev/null
+++ b/doc/source/stdlib/handmade/function-builtin-to_table-0x134c6d3cb7d1db2a.rst
@@ -0,0 +1 @@
+Converts a fixed-size array of key-value tuples `a` into a `table<keyT, valT>` by cloning each key and value.
diff --git a/doc/source/stdlib/handmade/function-builtin-to_table_move-0xb7fd1c47b782fe2c.rst b/doc/source/stdlib/handmade/function-builtin-to_table_move-0xb7fd1c47b782fe2c.rst
new file mode 100644
index 0000000000..fab51b1b30
--- /dev/null
+++ b/doc/source/stdlib/handmade/function-builtin-to_table_move-0xb7fd1c47b782fe2c.rst
@@ -0,0 +1 @@
+Converts a mutable dynamic array of keys `a` into a set-style `table<keyT, void>`, moving elements when possible.
diff --git a/doc/source/stdlib/handmade/function-builtin-to_table_move-0xf4aaed1ab56441e1.rst b/doc/source/stdlib/handmade/function-builtin-to_table_move-0xf4aaed1ab56441e1.rst
new file mode 100644
index 0000000000..53c436d075
--- /dev/null
+++ b/doc/source/stdlib/handmade/function-builtin-to_table_move-0xf4aaed1ab56441e1.rst
@@ -0,0 +1 @@
+Converts a mutable fixed-size array of key-value tuples `a` into a `table<keyT, valT>`, moving elements when possible.
diff --git a/doc/source/stdlib/handmade/function-builtin-values-0x937e576d1e8d6fbc.rst b/doc/source/stdlib/handmade/function-builtin-values-0x937e576d1e8d6fbc.rst
new file mode 100644
index 0000000000..9865646d49
--- /dev/null
+++ b/doc/source/stdlib/handmade/function-builtin-values-0x937e576d1e8d6fbc.rst
@@ -0,0 +1 @@
+Returns a read-only iterator over all values in a `table<keyT, valT>`, yielding each value by const reference.
diff --git a/doc/source/stdlib/handmade/function-dashv-DELETE-0x369ff3f3c49e570b.rst b/doc/source/stdlib/handmade/function-dashv-DELETE-0x369ff3f3c49e570b.rst
new file mode 100644
index 0000000000..550cf97679
--- /dev/null
+++ b/doc/source/stdlib/handmade/function-dashv-DELETE-0x369ff3f3c49e570b.rst
@@ -0,0 +1 @@
+Registers a DELETE route handler, or performs an HTTP DELETE client request.
diff --git a/doc/source/stdlib/handmade/function-dashv-GET-0xf970046426fb0d0a.rst b/doc/source/stdlib/handmade/function-dashv-GET-0xf970046426fb0d0a.rst
new file mode 100644
index 0000000000..6c301d4bd7
--- /dev/null
+++ b/doc/source/stdlib/handmade/function-dashv-GET-0xf970046426fb0d0a.rst
@@ -0,0 +1 @@
+Registers a GET route handler, or performs an HTTP GET client request.
diff --git a/doc/source/stdlib/handmade/function-dashv-POST-0xed9663189b54a7d8.rst b/doc/source/stdlib/handmade/function-dashv-POST-0xed9663189b54a7d8.rst
new file mode 100644
index 0000000000..abd41f80e4
--- /dev/null
+++ b/doc/source/stdlib/handmade/function-dashv-POST-0xed9663189b54a7d8.rst
@@ -0,0 +1 @@
+Registers a POST route handler, or performs an HTTP POST client request.
diff --git a/doc/source/stdlib/handmade/function-dashv-PUT-0x92a57de557956a8f.rst b/doc/source/stdlib/handmade/function-dashv-PUT-0x92a57de557956a8f.rst
new file mode 100644
index 0000000000..3d5f9fd642
--- /dev/null
+++ b/doc/source/stdlib/handmade/function-dashv-PUT-0x92a57de557956a8f.rst
@@ -0,0 +1 @@
+Registers a PUT route handler, or performs an HTTP PUT client request.
diff --git a/doc/source/stdlib/handmade/function-dashv-add_cookie-0x2f5bbf6511f19b67.rst b/doc/source/stdlib/handmade/function-dashv-add_cookie-0x2f5bbf6511f19b67.rst
new file mode 100644
index 0000000000..7589049b29
--- /dev/null
+++ b/doc/source/stdlib/handmade/function-dashv-add_cookie-0x2f5bbf6511f19b67.rst
@@ -0,0 +1 @@
+Adds a cookie to a request or response, optionally with domain, path, max-age, secure, and httponly flags.
diff --git a/doc/source/stdlib/handmade/function-dashv-each_cookie-0xc1140ed44788680e.rst b/doc/source/stdlib/handmade/function-dashv-each_cookie-0xc1140ed44788680e.rst
new file mode 100644
index 0000000000..c29fdd45e4
--- /dev/null
+++ b/doc/source/stdlib/handmade/function-dashv-each_cookie-0xc1140ed44788680e.rst
@@ -0,0 +1 @@
+Iterates over all cookies, invoking the block with each name-value pair.
diff --git a/doc/source/stdlib/handmade/function-dashv-send-0x33200e5ed86d5a3a.rst b/doc/source/stdlib/handmade/function-dashv-send-0x33200e5ed86d5a3a.rst
new file mode 100644
index 0000000000..b53c87098f
--- /dev/null
+++ b/doc/source/stdlib/handmade/function-dashv-send-0x33200e5ed86d5a3a.rst
@@ -0,0 +1 @@
+Sends a message frame on the WebSocket channel with the given opcode and fin flag. Returns the number of bytes sent, or a negative error code.
diff --git a/doc/source/stdlib/handmade/function-dashv-set_content_type-0xfa5159e454dba2c3.rst b/doc/source/stdlib/handmade/function-dashv-set_content_type-0xfa5159e454dba2c3.rst
new file mode 100644
index 0000000000..e309cafe95
--- /dev/null
+++ b/doc/source/stdlib/handmade/function-dashv-set_content_type-0xfa5159e454dba2c3.rst
@@ -0,0 +1 @@
+Sets the Content-Type header on a response or request.
diff --git a/doc/source/stdlib/handmade/function-dashv-set_header-0x5efcaa3c662f240a.rst b/doc/source/stdlib/handmade/function-dashv-set_header-0x5efcaa3c662f240a.rst
new file mode 100644
index 0000000000..68f27c9b6b
--- /dev/null
+++ b/doc/source/stdlib/handmade/function-dashv-set_header-0x5efcaa3c662f240a.rst
@@ -0,0 +1 @@
+Sets a response or request header.
diff --git a/doc/source/stdlib/handmade/function-dashv-tick-0xce505d5dae5b9d36.rst b/doc/source/stdlib/handmade/function-dashv-tick-0xce505d5dae5b9d36.rst
new file mode 100644
index 0000000000..c2bf09480d
--- /dev/null
+++ b/doc/source/stdlib/handmade/function-dashv-tick-0xce505d5dae5b9d36.rst
@@ -0,0 +1 @@
+Drains one batch of queued HTTP/WebSocket events and invokes the user callbacks. Call periodically from the owning thread.
diff --git a/doc/source/stdlib/handmade/function-debugapi-get_context_global_variable-0xe75140196f0a8fa8.rst b/doc/source/stdlib/handmade/function-debugapi-get_context_global_variable-0xe75140196f0a8fa8.rst
new file mode 100644
index 0000000000..4292795cee
--- /dev/null
+++ b/doc/source/stdlib/handmade/function-debugapi-get_context_global_variable-0xe75140196f0a8fa8.rst
@@ -0,0 +1 @@
+Returns a pointer to a global variable in the given context, looked up by name (string variant) or by index (int variant).  Returns `null` if not found.
diff --git a/doc/source/stdlib/handmade/function-debugapi-instrument_all_functions_thread_local-0x598b06c95139fb83.rst b/doc/source/stdlib/handmade/function-debugapi-instrument_all_functions_thread_local-0x598b06c95139fb83.rst
new file mode 100644
index 0000000000..fd79a193d3
--- /dev/null
+++ b/doc/source/stdlib/handmade/function-debugapi-instrument_all_functions_thread_local-0x598b06c95139fb83.rst
@@ -0,0 +1 @@
+Thread-local variant of `instrument_all_functions`.  Instruments functions only on the current thread.
diff --git a/doc/source/stdlib/handmade/function-debugapi-invoke_debug_agent_function-0x205e5bd497cb5790.rst b/doc/source/stdlib/handmade/function-debugapi-invoke_debug_agent_function-0x205e5bd497cb5790.rst
new file mode 100644
index 0000000000..ef15e6e390
--- /dev/null
+++ b/doc/source/stdlib/handmade/function-debugapi-invoke_debug_agent_function-0x205e5bd497cb5790.rst
@@ -0,0 +1,3 @@
+Calls an ``[export, pinvoke]`` function in the named agent's context.  Similar to ``invoke_in_context`` but resolves the agent context automatically from the category name.
+
+When *category* is an empty string (``""``), the call targets the **thread-local** debug agent's context instead of a globally named one.  There can be only one thread-local agent per thread, so no name is needed.  The thread-local path is faster because it skips the global agent map lookup.
diff --git a/doc/source/stdlib/handmade/function-debugapi-invoke_debug_agent_function-0x7ddaa06b8a2d381a.rst b/doc/source/stdlib/handmade/function-debugapi-invoke_debug_agent_function-0x7ddaa06b8a2d381a.rst
new file mode 100644
index 0000000000..ef15e6e390
--- /dev/null
+++ b/doc/source/stdlib/handmade/function-debugapi-invoke_debug_agent_function-0x7ddaa06b8a2d381a.rst
@@ -0,0 +1,3 @@
+Calls an ``[export, pinvoke]`` function in the named agent's context.  Similar to ``invoke_in_context`` but resolves the agent context automatically from the category name.
+
+When *category* is an empty string (``""``), the call targets the **thread-local** debug agent's context instead of a globally named one.  There can be only one thread-local agent per thread, so no name is needed.  The thread-local path is faster because it skips the global agent map lookup.
diff --git a/doc/source/stdlib/handmade/function-debugapi-invoke_debug_agent_method-0x71b1972561138d2e.rst b/doc/source/stdlib/handmade/function-debugapi-invoke_debug_agent_method-0x71b1972561138d2e.rst
new file mode 100644
index 0000000000..0dda66ea7c
--- /dev/null
+++ b/doc/source/stdlib/handmade/function-debugapi-invoke_debug_agent_method-0x71b1972561138d2e.rst
@@ -0,0 +1,3 @@
+Calls a method on the debug agent's class instance by name.  The first argument is the agent category, the second is the method name, followed by up to 10 user arguments.  The agent's ``self`` is passed automatically.
+
+When *category* is an empty string (``""``), the call targets the **thread-local** debug agent instead of a globally named one.  There can be only one thread-local agent per thread — that is why it needs no name.  The thread-local path is faster than a named agent lookup because it skips the global map search.
diff --git a/doc/source/stdlib/handmade/function-debugapi-invoke_debug_agent_method-0x7228354cad9ff20.rst b/doc/source/stdlib/handmade/function-debugapi-invoke_debug_agent_method-0x7228354cad9ff20.rst
new file mode 100644
index 0000000000..0dda66ea7c
--- /dev/null
+++ b/doc/source/stdlib/handmade/function-debugapi-invoke_debug_agent_method-0x7228354cad9ff20.rst
@@ -0,0 +1,3 @@
+Calls a method on the debug agent's class instance by name.  The first argument is the agent category, the second is the method name, followed by up to 10 user arguments.  The agent's ``self`` is passed automatically.
+
+When *category* is an empty string (``""``), the call targets the **thread-local** debug agent instead of a globally named one.  There can be only one thread-local agent per thread — that is why it needs no name.  The thread-local path is faster than a named agent lookup because it skips the global map search.
diff --git a/doc/source/stdlib/handmade/function-debugapi-invoke_in_context-0x19a0ddf922c9ad95.rst b/doc/source/stdlib/handmade/function-debugapi-invoke_in_context-0x19a0ddf922c9ad95.rst
new file mode 100644
index 0000000000..2bd510103e
--- /dev/null
+++ b/doc/source/stdlib/handmade/function-debugapi-invoke_in_context-0x19a0ddf922c9ad95.rst
@@ -0,0 +1 @@
+Calls a function in another context.  Accepts a `Context` reference and either a function name (string), a `function` pointer, or a `lambda`, plus up to 10 extra arguments.  Target functions must be marked `[export, pinvoke]`.
diff --git a/doc/source/stdlib/handmade/function-debugapi-invoke_in_context-0xdc8aad5ea68674bf.rst b/doc/source/stdlib/handmade/function-debugapi-invoke_in_context-0xdc8aad5ea68674bf.rst
new file mode 100644
index 0000000000..2bd510103e
--- /dev/null
+++ b/doc/source/stdlib/handmade/function-debugapi-invoke_in_context-0xdc8aad5ea68674bf.rst
@@ -0,0 +1 @@
+Calls a function in another context.  Accepts a `Context` reference and either a function name (string), a `function` pointer, or a `lambda`, plus up to 10 extra arguments.  Target functions must be marked `[export, pinvoke]`.
diff --git a/doc/source/stdlib/handmade/function-debugapi-walk_data-0x132a2d5d02a743aa.rst b/doc/source/stdlib/handmade/function-debugapi-walk_data-0x132a2d5d02a743aa.rst
new file mode 100644
index 0000000000..d85d760ca6
--- /dev/null
+++ b/doc/source/stdlib/handmade/function-debugapi-walk_data-0x132a2d5d02a743aa.rst
@@ -0,0 +1 @@
+Walks a daslang data structure using the provided `DataWalker`.  The walker receives typed callbacks for each value encountered.  Overloaded for raw data+`StructInfo`, `float4`+`TypeInfo`, and `void?`+`TypeInfo`.
diff --git a/doc/source/stdlib/handmade/function-debugapi-walk_data-0x53f25f6076d8be37.rst b/doc/source/stdlib/handmade/function-debugapi-walk_data-0x53f25f6076d8be37.rst
new file mode 100644
index 0000000000..d85d760ca6
--- /dev/null
+++ b/doc/source/stdlib/handmade/function-debugapi-walk_data-0x53f25f6076d8be37.rst
@@ -0,0 +1 @@
+Walks a daslang data structure using the provided `DataWalker`.  The walker receives typed callbacks for each value encountered.  Overloaded for raw data+`StructInfo`, `float4`+`TypeInfo`, and `void?`+`TypeInfo`.
diff --git a/doc/source/stdlib/handmade/function-fio-register_dynamic_module-0x92b5f754f5226f49.rst b/doc/source/stdlib/handmade/function-fio-register_dynamic_module-0x92b5f754f5226f49.rst
new file mode 100644
index 0000000000..d0c4209223
--- /dev/null
+++ b/doc/source/stdlib/handmade/function-fio-register_dynamic_module-0x92b5f754f5226f49.rst
@@ -0,0 +1 @@
+Loads a shared library from the given path and registers it as a daslang module under the specified name, making it available for require.
diff --git a/doc/source/stdlib/handmade/function-math-_st_-0x1a3b25fb86b08da8.rst b/doc/source/stdlib/handmade/function-math-_st_-0x1a3b25fb86b08da8.rst
new file mode 100644
index 0000000000..cbee635d20
--- /dev/null
+++ b/doc/source/stdlib/handmade/function-math-_st_-0x1a3b25fb86b08da8.rst
@@ -0,0 +1 @@
+Transforms a float3 vector by a 3x3 matrix.
diff --git a/doc/source/stdlib/handmade/function-math-_st_-0x2790c7fbc31c1203.rst b/doc/source/stdlib/handmade/function-math-_st_-0x2790c7fbc31c1203.rst
new file mode 100644
index 0000000000..57c866de4c
--- /dev/null
+++ b/doc/source/stdlib/handmade/function-math-_st_-0x2790c7fbc31c1203.rst
@@ -0,0 +1 @@
+Transforms a float3 vector by a 3x4 matrix.
diff --git a/doc/source/stdlib/handmade/function-math-_st_-0x58d66dbb15e749dd.rst b/doc/source/stdlib/handmade/function-math-_st_-0x58d66dbb15e749dd.rst
new file mode 100644
index 0000000000..c158fe37e1
--- /dev/null
+++ b/doc/source/stdlib/handmade/function-math-_st_-0x58d66dbb15e749dd.rst
@@ -0,0 +1 @@
+Multiplies two 4x4 matrices and returns the resulting float4x4.
diff --git a/doc/source/stdlib/handmade/function-math-abs-0x5506bc2ebabe028b.rst b/doc/source/stdlib/handmade/function-math-abs-0x5506bc2ebabe028b.rst
new file mode 100644
index 0000000000..465289b20a
--- /dev/null
+++ b/doc/source/stdlib/handmade/function-math-abs-0x5506bc2ebabe028b.rst
@@ -0,0 +1 @@
+Returns the absolute value of x, component-wise for vectors and across numeric scalar overloads.
diff --git a/doc/source/stdlib/handmade/function-math-abs-0xa2bb5b0630167088.rst b/doc/source/stdlib/handmade/function-math-abs-0xa2bb5b0630167088.rst
new file mode 100644
index 0000000000..465289b20a
--- /dev/null
+++ b/doc/source/stdlib/handmade/function-math-abs-0xa2bb5b0630167088.rst
@@ -0,0 +1 @@
+Returns the absolute value of x, component-wise for vectors and across numeric scalar overloads.
diff --git a/doc/source/stdlib/handmade/function-math-acos-0x173fd103d9468516.rst b/doc/source/stdlib/handmade/function-math-acos-0x173fd103d9468516.rst
new file mode 100644
index 0000000000..3409850be0
--- /dev/null
+++ b/doc/source/stdlib/handmade/function-math-acos-0x173fd103d9468516.rst
@@ -0,0 +1 @@
+Returns the arccosine of x in radians, component-wise for float2/float3/float4 vectors, and for float/double scalars.
diff --git a/doc/source/stdlib/handmade/function-math-acosh-0x5a1bb3e797e98954.rst b/doc/source/stdlib/handmade/function-math-acosh-0x5a1bb3e797e98954.rst
new file mode 100644
index 0000000000..bf4261894b
--- /dev/null
+++ b/doc/source/stdlib/handmade/function-math-acosh-0x5a1bb3e797e98954.rst
@@ -0,0 +1 @@
+Returns the inverse hyperbolic cosine of x, component-wise for float2/float3/float4 vectors, and for float/double scalars.
diff --git a/doc/source/stdlib/handmade/function-math-asin-0x80414d941f625c38.rst b/doc/source/stdlib/handmade/function-math-asin-0x80414d941f625c38.rst
new file mode 100644
index 0000000000..8751648a1e
--- /dev/null
+++ b/doc/source/stdlib/handmade/function-math-asin-0x80414d941f625c38.rst
@@ -0,0 +1 @@
+Returns the arcsine of x in radians, component-wise for float2/float3/float4 vectors, and for float/double scalars.
diff --git a/doc/source/stdlib/handmade/function-math-atan-0x678487d973eda8f0.rst b/doc/source/stdlib/handmade/function-math-atan-0x678487d973eda8f0.rst
new file mode 100644
index 0000000000..044ad48144
--- /dev/null
+++ b/doc/source/stdlib/handmade/function-math-atan-0x678487d973eda8f0.rst
@@ -0,0 +1 @@
+Returns the arctangent of x in radians, component-wise for float2/float3/float4 vectors, and for float/double scalars.
diff --git a/doc/source/stdlib/handmade/function-math-atan-0x9994f1e77b3f4f88.rst b/doc/source/stdlib/handmade/function-math-atan-0x9994f1e77b3f4f88.rst
new file mode 100644
index 0000000000..168bc42884
--- /dev/null
+++ b/doc/source/stdlib/handmade/function-math-atan-0x9994f1e77b3f4f88.rst
@@ -0,0 +1 @@
+Returns the arctangent of x in radians, with the result in the range [-pi/2, pi/2]; works with float and double.
diff --git a/doc/source/stdlib/handmade/function-math-atan2_est-0xa2596e78aba1b820.rst b/doc/source/stdlib/handmade/function-math-atan2_est-0xa2596e78aba1b820.rst
new file mode 100644
index 0000000000..8e137e8a86
--- /dev/null
+++ b/doc/source/stdlib/handmade/function-math-atan2_est-0xa2596e78aba1b820.rst
@@ -0,0 +1 @@
+Returns a fast estimated arctangent of y/x in radians, using the signs of both arguments to determine the correct quadrant; trades some precision for speed.
diff --git a/doc/source/stdlib/handmade/function-math-atan_est-0xc9f99055bb5cb78e.rst b/doc/source/stdlib/handmade/function-math-atan_est-0xc9f99055bb5cb78e.rst
new file mode 100644
index 0000000000..c71f8ff9cf
--- /dev/null
+++ b/doc/source/stdlib/handmade/function-math-atan_est-0xc9f99055bb5cb78e.rst
@@ -0,0 +1 @@
+Returns a fast estimated arctangent of x in radians, trading some precision for speed; the result approximates the range [-pi/2, pi/2].
diff --git a/doc/source/stdlib/handmade/function-math-atanh-0xa1d8463c4261a6ec.rst b/doc/source/stdlib/handmade/function-math-atanh-0xa1d8463c4261a6ec.rst
new file mode 100644
index 0000000000..c7f14dd388
--- /dev/null
+++ b/doc/source/stdlib/handmade/function-math-atanh-0xa1d8463c4261a6ec.rst
@@ -0,0 +1 @@
+Returns the inverse hyperbolic tangent of x, component-wise for float2/float3/float4 vectors, and for float/double scalars.
diff --git a/doc/source/stdlib/handmade/function-math-cbrt-0x61914e8e75ed3f09.rst b/doc/source/stdlib/handmade/function-math-cbrt-0x61914e8e75ed3f09.rst
new file mode 100644
index 0000000000..e409b183d2
--- /dev/null
+++ b/doc/source/stdlib/handmade/function-math-cbrt-0x61914e8e75ed3f09.rst
@@ -0,0 +1 @@
+Returns the cube root of x, component-wise for float2/float3/float4 vectors, and for float/double scalars.
diff --git a/doc/source/stdlib/handmade/function-math-cbrt-0x7593149c6d2e5615.rst b/doc/source/stdlib/handmade/function-math-cbrt-0x7593149c6d2e5615.rst
new file mode 100644
index 0000000000..e409b183d2
--- /dev/null
+++ b/doc/source/stdlib/handmade/function-math-cbrt-0x7593149c6d2e5615.rst
@@ -0,0 +1 @@
+Returns the cube root of x, component-wise for float2/float3/float4 vectors, and for float/double scalars.
diff --git a/doc/source/stdlib/handmade/function-math-ceili-0x600f679ae0f35c90.rst b/doc/source/stdlib/handmade/function-math-ceili-0x600f679ae0f35c90.rst
new file mode 100644
index 0000000000..1acd94b3e4
--- /dev/null
+++ b/doc/source/stdlib/handmade/function-math-ceili-0x600f679ae0f35c90.rst
@@ -0,0 +1 @@
+Returns the smallest integer not less than x (rounds toward positive infinity), converting the double argument to an int result.
diff --git a/doc/source/stdlib/handmade/function-math-clamp-0x565ea9b35ad32b5f.rst b/doc/source/stdlib/handmade/function-math-clamp-0x565ea9b35ad32b5f.rst
new file mode 100644
index 0000000000..be13b801f9
--- /dev/null
+++ b/doc/source/stdlib/handmade/function-math-clamp-0x565ea9b35ad32b5f.rst
@@ -0,0 +1 @@
+Returns the value t clamped to the inclusive range [a, b], equivalent to min(max(t, a), b); works with float, double, float2, float3, float4, int, int64, uint, and uint64 types.
diff --git a/doc/source/stdlib/handmade/function-math-clamp-0xaba45b70a54c7f1d.rst b/doc/source/stdlib/handmade/function-math-clamp-0xaba45b70a54c7f1d.rst
new file mode 100644
index 0000000000..be13b801f9
--- /dev/null
+++ b/doc/source/stdlib/handmade/function-math-clamp-0xaba45b70a54c7f1d.rst
@@ -0,0 +1 @@
+Returns the value t clamped to the inclusive range [a, b], equivalent to min(max(t, a), b); works with float, double, float2, float3, float4, int, int64, uint, and uint64 types.
diff --git a/doc/source/stdlib/handmade/function-math-cos-0xd4d91a45a49f8cd5.rst b/doc/source/stdlib/handmade/function-math-cos-0xd4d91a45a49f8cd5.rst
new file mode 100644
index 0000000000..611bb1c363
--- /dev/null
+++ b/doc/source/stdlib/handmade/function-math-cos-0xd4d91a45a49f8cd5.rst
@@ -0,0 +1 @@
+Returns the cosine of x in radians, component-wise for float2/float3/float4 vectors, and for float/double scalars.
diff --git a/doc/source/stdlib/handmade/function-math-cosh-0x725ce66adc5ce81c.rst b/doc/source/stdlib/handmade/function-math-cosh-0x725ce66adc5ce81c.rst
new file mode 100644
index 0000000000..69161c9c66
--- /dev/null
+++ b/doc/source/stdlib/handmade/function-math-cosh-0x725ce66adc5ce81c.rst
@@ -0,0 +1 @@
+Returns the hyperbolic cosine of x, component-wise for float2/float3/float4 vectors, and for float/double scalars.
diff --git a/doc/source/stdlib/handmade/function-math-determinant-0x2f2e400c66afff50.rst b/doc/source/stdlib/handmade/function-math-determinant-0x2f2e400c66afff50.rst
new file mode 100644
index 0000000000..bdf5d42ba7
--- /dev/null
+++ b/doc/source/stdlib/handmade/function-math-determinant-0x2f2e400c66afff50.rst
@@ -0,0 +1 @@
+Returns the determinant of a float3x3 matrix as a float scalar; a zero determinant indicates the matrix is singular and non-invertible.
diff --git a/doc/source/stdlib/handmade/function-math-determinant-0x2f3f450c66be7b02.rst b/doc/source/stdlib/handmade/function-math-determinant-0x2f3f450c66be7b02.rst
new file mode 100644
index 0000000000..57072011b2
--- /dev/null
+++ b/doc/source/stdlib/handmade/function-math-determinant-0x2f3f450c66be7b02.rst
@@ -0,0 +1 @@
+Returns the determinant of a float3x4 matrix as a float scalar; a zero determinant indicates the matrix is singular and non-invertible.
diff --git a/doc/source/stdlib/handmade/function-math-distance_sq-0x933eb227434b179a.rst b/doc/source/stdlib/handmade/function-math-distance_sq-0x933eb227434b179a.rst
new file mode 100644
index 0000000000..9533da50ec
--- /dev/null
+++ b/doc/source/stdlib/handmade/function-math-distance_sq-0x933eb227434b179a.rst
@@ -0,0 +1 @@
+Returns the squared Euclidean distance between two vectors as a float scalar, avoiding the square root for faster distance comparisons; works with float2, float3, and float4 vector types.
diff --git a/doc/source/stdlib/handmade/function-math-dot-0xf0a384b713be2ccb.rst b/doc/source/stdlib/handmade/function-math-dot-0xf0a384b713be2ccb.rst
new file mode 100644
index 0000000000..a4678778ca
--- /dev/null
+++ b/doc/source/stdlib/handmade/function-math-dot-0xf0a384b713be2ccb.rst
@@ -0,0 +1 @@
+Returns the dot product (scalar product) of two vectors as a float; works with float2, float3, and float4 vector types.
diff --git a/doc/source/stdlib/handmade/function-math-exp-0xca54d5dff1072987.rst b/doc/source/stdlib/handmade/function-math-exp-0xca54d5dff1072987.rst
new file mode 100644
index 0000000000..2c02d6a268
--- /dev/null
+++ b/doc/source/stdlib/handmade/function-math-exp-0xca54d5dff1072987.rst
@@ -0,0 +1 @@
+Returns e raised to x, component-wise for float2/float3/float4 vectors, and for float/double scalars.
diff --git a/doc/source/stdlib/handmade/function-math-exp2-0xbaac219b448c3159.rst b/doc/source/stdlib/handmade/function-math-exp2-0xbaac219b448c3159.rst
new file mode 100644
index 0000000000..1503740e9d
--- /dev/null
+++ b/doc/source/stdlib/handmade/function-math-exp2-0xbaac219b448c3159.rst
@@ -0,0 +1 @@
+Returns 2 raised to x, component-wise for float2/float3/float4 vectors, and for float/double scalars.
diff --git a/doc/source/stdlib/handmade/function-math-expm1-0x36c1eeeff18f7125.rst b/doc/source/stdlib/handmade/function-math-expm1-0x36c1eeeff18f7125.rst
new file mode 100644
index 0000000000..57174e5d59
--- /dev/null
+++ b/doc/source/stdlib/handmade/function-math-expm1-0x36c1eeeff18f7125.rst
@@ -0,0 +1 @@
+Returns exp(x)-1 with improved accuracy near zero, component-wise for float2/float3/float4 vectors, and for float/double scalars.
diff --git a/doc/source/stdlib/handmade/function-math-float3x3-0x25466deba351a30e.rst b/doc/source/stdlib/handmade/function-math-float3x3-0x25466deba351a30e.rst
new file mode 100644
index 0000000000..bfe906c032
--- /dev/null
+++ b/doc/source/stdlib/handmade/function-math-float3x3-0x25466deba351a30e.rst
@@ -0,0 +1 @@
+Extracts the upper-left 3x3 rotation part from a float3x4 transformation matrix, returning it as a float3x3.
diff --git a/doc/source/stdlib/handmade/function-math-float3x4-0x25436deba34c8a0e.rst b/doc/source/stdlib/handmade/function-math-float3x4-0x25436deba34c8a0e.rst
new file mode 100644
index 0000000000..d42f0f851f
--- /dev/null
+++ b/doc/source/stdlib/handmade/function-math-float3x4-0x25436deba34c8a0e.rst
@@ -0,0 +1 @@
+Constructs a float3x4 transformation matrix from a float3x3 rotation matrix, with the translation component set to zero.
diff --git a/doc/source/stdlib/handmade/function-math-float4x4-0x36416debb1bc570e.rst b/doc/source/stdlib/handmade/function-math-float4x4-0x36416debb1bc570e.rst
new file mode 100644
index 0000000000..db59baf4bf
--- /dev/null
+++ b/doc/source/stdlib/handmade/function-math-float4x4-0x36416debb1bc570e.rst
@@ -0,0 +1 @@
+Converts a float3x4 transformation matrix to a float4x4 matrix, filling the fourth row with [0, 0, 0, 1].
diff --git a/doc/source/stdlib/handmade/function-math-floor-0xaba8e1a373b2daa8.rst b/doc/source/stdlib/handmade/function-math-floor-0xaba8e1a373b2daa8.rst
new file mode 100644
index 0000000000..ecd73cf8eb
--- /dev/null
+++ b/doc/source/stdlib/handmade/function-math-floor-0xaba8e1a373b2daa8.rst
@@ -0,0 +1 @@
+Returns the largest integer value not greater than x, component-wise for vectors, and for float/double scalars.
diff --git a/doc/source/stdlib/handmade/function-math-floori-0x39d5fba43a42bbd9.rst b/doc/source/stdlib/handmade/function-math-floori-0x39d5fba43a42bbd9.rst
new file mode 100644
index 0000000000..d9582dd067
--- /dev/null
+++ b/doc/source/stdlib/handmade/function-math-floori-0x39d5fba43a42bbd9.rst
@@ -0,0 +1 @@
+Returns the largest integer not greater than x (rounds toward negative infinity), converting the float argument to an int result.
diff --git a/doc/source/stdlib/handmade/function-math-fmod-0x8ce0dea42e95e830.rst b/doc/source/stdlib/handmade/function-math-fmod-0x8ce0dea42e95e830.rst
new file mode 100644
index 0000000000..0ab0adcb24
--- /dev/null
+++ b/doc/source/stdlib/handmade/function-math-fmod-0x8ce0dea42e95e830.rst
@@ -0,0 +1 @@
+Returns the floating-point remainder of x/y using truncation-style quotient (same sign behavior as %), component-wise for vectors and for float/double scalars.
diff --git a/doc/source/stdlib/handmade/function-math-fmod-0xeb05d432f9e2311c.rst b/doc/source/stdlib/handmade/function-math-fmod-0xeb05d432f9e2311c.rst
new file mode 100644
index 0000000000..0ab0adcb24
--- /dev/null
+++ b/doc/source/stdlib/handmade/function-math-fmod-0xeb05d432f9e2311c.rst
@@ -0,0 +1 @@
+Returns the floating-point remainder of x/y using truncation-style quotient (same sign behavior as %), component-wise for vectors and for float/double scalars.
diff --git a/doc/source/stdlib/handmade/function-math-fract-0x772190bc58e2ca8b.rst b/doc/source/stdlib/handmade/function-math-fract-0x772190bc58e2ca8b.rst
new file mode 100644
index 0000000000..4a244c312d
--- /dev/null
+++ b/doc/source/stdlib/handmade/function-math-fract-0x772190bc58e2ca8b.rst
@@ -0,0 +1 @@
+Returns the fractional part of x (equivalent to x - floor(x)), computed component-wise for float2, float3, and float4 vector types; works with float and double scalars.
diff --git a/doc/source/stdlib/handmade/function-math-fract-0x93c7637c9fbea514.rst b/doc/source/stdlib/handmade/function-math-fract-0x93c7637c9fbea514.rst
new file mode 100644
index 0000000000..4a244c312d
--- /dev/null
+++ b/doc/source/stdlib/handmade/function-math-fract-0x93c7637c9fbea514.rst
@@ -0,0 +1 @@
+Returns the fractional part of x (equivalent to x - floor(x)), computed component-wise for float2, float3, and float4 vector types; works with float and double scalars.
diff --git a/doc/source/stdlib/handmade/function-math-hypot-0x3c18597c24ef04b3.rst b/doc/source/stdlib/handmade/function-math-hypot-0x3c18597c24ef04b3.rst
new file mode 100644
index 0000000000..03dbcfd948
--- /dev/null
+++ b/doc/source/stdlib/handmade/function-math-hypot-0x3c18597c24ef04b3.rst
@@ -0,0 +1 @@
+Returns sqrt(x*x + y*y), component-wise for float2/float3/float4 vectors, and for float/double scalar pairs.
diff --git a/doc/source/stdlib/handmade/function-math-identity-0x16b5cfe28d94fc9c.rst b/doc/source/stdlib/handmade/function-math-identity-0x16b5cfe28d94fc9c.rst
new file mode 100644
index 0000000000..1d24ae9786
--- /dev/null
+++ b/doc/source/stdlib/handmade/function-math-identity-0x16b5cfe28d94fc9c.rst
@@ -0,0 +1 @@
+Sets the given float3x4 matrix to the identity transformation (rotation part is the identity matrix, translation is zero) and returns it.
diff --git a/doc/source/stdlib/handmade/function-math-identity-0x27b8cfe29c0d489c.rst b/doc/source/stdlib/handmade/function-math-identity-0x27b8cfe29c0d489c.rst
new file mode 100644
index 0000000000..af1440070d
--- /dev/null
+++ b/doc/source/stdlib/handmade/function-math-identity-0x27b8cfe29c0d489c.rst
@@ -0,0 +1 @@
+Sets the given float3x3 matrix to the identity transformation and returns it.
diff --git a/doc/source/stdlib/handmade/function-math-inv_distance-0x767f5e1e897c408d.rst b/doc/source/stdlib/handmade/function-math-inv_distance-0x767f5e1e897c408d.rst
new file mode 100644
index 0000000000..dce29e05d6
--- /dev/null
+++ b/doc/source/stdlib/handmade/function-math-inv_distance-0x767f5e1e897c408d.rst
@@ -0,0 +1 @@
+Returns the reciprocal of the Euclidean distance between two vectors (1 / distance(x, y)) as a float; works with float2, float3, and float4 vector types.
diff --git a/doc/source/stdlib/handmade/function-math-inv_distance_sq-0xdccb06e35982188d.rst b/doc/source/stdlib/handmade/function-math-inv_distance_sq-0xdccb06e35982188d.rst
new file mode 100644
index 0000000000..70ebc368b7
--- /dev/null
+++ b/doc/source/stdlib/handmade/function-math-inv_distance_sq-0xdccb06e35982188d.rst
@@ -0,0 +1 @@
+Returns the reciprocal of the squared Euclidean distance between two vectors (1 / distance_sq(x, y)) as a float; works with float2, float3, and float4 vector types.
diff --git a/doc/source/stdlib/handmade/function-math-inv_length-0xa1704a3643a392a5.rst b/doc/source/stdlib/handmade/function-math-inv_length-0xa1704a3643a392a5.rst
new file mode 100644
index 0000000000..03c7f36495
--- /dev/null
+++ b/doc/source/stdlib/handmade/function-math-inv_length-0xa1704a3643a392a5.rst
@@ -0,0 +1 @@
+Returns the reciprocal of the length of the vector (1 / length(x)) as a float; works with float2, float3, and float4 vector types.
diff --git a/doc/source/stdlib/handmade/function-math-inv_length_sq-0x7253887c1ebf602f.rst b/doc/source/stdlib/handmade/function-math-inv_length_sq-0x7253887c1ebf602f.rst
new file mode 100644
index 0000000000..aeeb4e9700
--- /dev/null
+++ b/doc/source/stdlib/handmade/function-math-inv_length_sq-0x7253887c1ebf602f.rst
@@ -0,0 +1 @@
+Returns the reciprocal of the squared length of the vector (1 / length_sq(x)) as a float; works with float2, float3, and float4 vector types.
diff --git a/doc/source/stdlib/handmade/function-math-inverse-0x943f3f5747d2ddf4.rst b/doc/source/stdlib/handmade/function-math-inverse-0x943f3f5747d2ddf4.rst
new file mode 100644
index 0000000000..f1ed131307
--- /dev/null
+++ b/doc/source/stdlib/handmade/function-math-inverse-0x943f3f5747d2ddf4.rst
@@ -0,0 +1 @@
+Returns the inverse of a matrix, such that multiplying the original by its inverse yields the identity; works with float3x4 and float4x4 matrix types.
diff --git a/doc/source/stdlib/handmade/function-math-is_nan-0x877d2378b5542335.rst b/doc/source/stdlib/handmade/function-math-is_nan-0x877d2378b5542335.rst
new file mode 100644
index 0000000000..d5b5812f62
--- /dev/null
+++ b/doc/source/stdlib/handmade/function-math-is_nan-0x877d2378b5542335.rst
@@ -0,0 +1 @@
+Returns true if x is NaN (Not a Number), checked component-wise for float2, float3, and float4 vector types; works with float and double scalars.
diff --git a/doc/source/stdlib/handmade/function-math-length-0xf89ec6c452198bee.rst b/doc/source/stdlib/handmade/function-math-length-0xf89ec6c452198bee.rst
new file mode 100644
index 0000000000..52746147af
--- /dev/null
+++ b/doc/source/stdlib/handmade/function-math-length-0xf89ec6c452198bee.rst
@@ -0,0 +1 @@
+Returns the Euclidean length (magnitude) of the vector as a float; works with float2, float3, and float4 vector types.
diff --git a/doc/source/stdlib/handmade/function-math-length_sq-0x1f6efec869828f70.rst b/doc/source/stdlib/handmade/function-math-length_sq-0x1f6efec869828f70.rst
new file mode 100644
index 0000000000..c69f008528
--- /dev/null
+++ b/doc/source/stdlib/handmade/function-math-length_sq-0x1f6efec869828f70.rst
@@ -0,0 +1 @@
+Returns the squared Euclidean length of the vector as a float, equivalent to dot(x, x) and avoiding the square root for faster magnitude comparisons; works with float2, float3, and float4 vector types.
diff --git a/doc/source/stdlib/handmade/function-math-lerp-0x1f877349aa636aca.rst b/doc/source/stdlib/handmade/function-math-lerp-0x1f877349aa636aca.rst
new file mode 100644
index 0000000000..46a412d1b0
--- /dev/null
+++ b/doc/source/stdlib/handmade/function-math-lerp-0x1f877349aa636aca.rst
@@ -0,0 +1 @@
+Performs linear interpolation between a and b using the factor t, returning a + (b - a) * t; when t is 0 the result is a, when t is 1 the result is b; works component-wise with float, double, float2, float3, and float4 types.
diff --git a/doc/source/stdlib/handmade/function-math-log-0x2a7616eeac94c1a4.rst b/doc/source/stdlib/handmade/function-math-log-0x2a7616eeac94c1a4.rst
new file mode 100644
index 0000000000..17ec22873e
--- /dev/null
+++ b/doc/source/stdlib/handmade/function-math-log-0x2a7616eeac94c1a4.rst
@@ -0,0 +1 @@
+Returns the natural (base-e) logarithm of x; the input must be positive; computed component-wise for float2, float3, and float4 vector types; works with float and double scalars.
diff --git a/doc/source/stdlib/handmade/function-math-log-0x8e957c6804a3ec4b.rst b/doc/source/stdlib/handmade/function-math-log-0x8e957c6804a3ec4b.rst
new file mode 100644
index 0000000000..17ec22873e
--- /dev/null
+++ b/doc/source/stdlib/handmade/function-math-log-0x8e957c6804a3ec4b.rst
@@ -0,0 +1 @@
+Returns the natural (base-e) logarithm of x; the input must be positive; computed component-wise for float2, float3, and float4 vector types; works with float and double scalars.
diff --git a/doc/source/stdlib/handmade/function-math-log1p-0x2a3ece2ba11eac3.rst b/doc/source/stdlib/handmade/function-math-log1p-0x2a3ece2ba11eac3.rst
new file mode 100644
index 0000000000..b3eeddb84d
--- /dev/null
+++ b/doc/source/stdlib/handmade/function-math-log1p-0x2a3ece2ba11eac3.rst
@@ -0,0 +1 @@
+Returns ln(1+x) with improved accuracy near zero, component-wise for float2/float3/float4 vectors, and for float/double scalars.
diff --git a/doc/source/stdlib/handmade/function-math-mad-0x4617460ed0b8e334.rst b/doc/source/stdlib/handmade/function-math-mad-0x4617460ed0b8e334.rst
new file mode 100644
index 0000000000..dc2300d9d0
--- /dev/null
+++ b/doc/source/stdlib/handmade/function-math-mad-0x4617460ed0b8e334.rst
@@ -0,0 +1 @@
+Computes the multiply-add operation `a * b + c`.
diff --git a/doc/source/stdlib/handmade/function-math-mad-0x8f309dc91b094503.rst b/doc/source/stdlib/handmade/function-math-mad-0x8f309dc91b094503.rst
index aae701e1b9..fcdd7c603d 100644
--- a/doc/source/stdlib/handmade/function-math-mad-0x8f309dc91b094503.rst
+++ b/doc/source/stdlib/handmade/function-math-mad-0x8f309dc91b094503.rst
@@ -1,2 +1,2 @@
-Computes the fused multiply-add operation `a * b + c`.
+Computes the multiply-add operation `a * b + c`. May be fused into a single rounding (FMA) on platforms with native FMA support; otherwise computed as separate multiply and add.
 
diff --git a/doc/source/stdlib/handmade/function-math-mad-0xa8b8859c4483210b.rst b/doc/source/stdlib/handmade/function-math-mad-0xa8b8859c4483210b.rst
new file mode 100644
index 0000000000..fcdd7c603d
--- /dev/null
+++ b/doc/source/stdlib/handmade/function-math-mad-0xa8b8859c4483210b.rst
@@ -0,0 +1,2 @@
+Computes the multiply-add operation `a * b + c`. May be fused into a single rounding (FMA) on platforms with native FMA support; otherwise computed as separate multiply and add.
+
diff --git a/doc/source/stdlib/handmade/function-math-max-0x1f58b63cd00ad9d0.rst b/doc/source/stdlib/handmade/function-math-max-0x1f58b63cd00ad9d0.rst
new file mode 100644
index 0000000000..178e2a6fc1
--- /dev/null
+++ b/doc/source/stdlib/handmade/function-math-max-0x1f58b63cd00ad9d0.rst
@@ -0,0 +1 @@
+Returns the component-wise maximum of two values, supporting scalar double, float, int, int64, uint, uint64 and vector float2, float3, float4 types.
diff --git a/doc/source/stdlib/handmade/function-math-max-0xd01093880b341938.rst b/doc/source/stdlib/handmade/function-math-max-0xd01093880b341938.rst
new file mode 100644
index 0000000000..178e2a6fc1
--- /dev/null
+++ b/doc/source/stdlib/handmade/function-math-max-0xd01093880b341938.rst
@@ -0,0 +1 @@
+Returns the component-wise maximum of two values, supporting scalar double, float, int, int64, uint, uint64 and vector float2, float3, float4 types.
diff --git a/doc/source/stdlib/handmade/function-math-min-0x84e39cd61a2fd519.rst b/doc/source/stdlib/handmade/function-math-min-0x84e39cd61a2fd519.rst
new file mode 100644
index 0000000000..b0116ba02a
--- /dev/null
+++ b/doc/source/stdlib/handmade/function-math-min-0x84e39cd61a2fd519.rst
@@ -0,0 +1 @@
+Returns the component-wise minimum of two values, supporting scalar double, float, int, int64, uint, uint64 and vector float2, float3, float4 types.
diff --git a/doc/source/stdlib/handmade/function-math-min-0x8599dae683caac62.rst b/doc/source/stdlib/handmade/function-math-min-0x8599dae683caac62.rst
new file mode 100644
index 0000000000..b0116ba02a
--- /dev/null
+++ b/doc/source/stdlib/handmade/function-math-min-0x8599dae683caac62.rst
@@ -0,0 +1 @@
+Returns the component-wise minimum of two values, supporting scalar double, float, int, int64, uint, uint64 and vector float2, float3, float4 types.
diff --git a/doc/source/stdlib/handmade/function-math-pow-0x58ac7376b2cd7210.rst b/doc/source/stdlib/handmade/function-math-pow-0x58ac7376b2cd7210.rst
new file mode 100644
index 0000000000..c6e56a93c4
--- /dev/null
+++ b/doc/source/stdlib/handmade/function-math-pow-0x58ac7376b2cd7210.rst
@@ -0,0 +1 @@
+Returns x raised to the power of y for scalar double, float, or vector float2, float3, float4 types; domain requires x >= 0 for non-integer y values.
diff --git a/doc/source/stdlib/handmade/function-math-quat-0x716561d6bb34368.rst b/doc/source/stdlib/handmade/function-math-quat-0x716561d6bb34368.rst
new file mode 100644
index 0000000000..218304200a
--- /dev/null
+++ b/doc/source/stdlib/handmade/function-math-quat-0x716561d6bb34368.rst
@@ -0,0 +1 @@
+Extracts the rotation part of a float3x4 matrix and returns it as a float4 quaternion in (x, y, z, w) format.
diff --git a/doc/source/stdlib/handmade/function-math-quat_from_euler-0x66163c5b3168cde2.rst b/doc/source/stdlib/handmade/function-math-quat_from_euler-0x66163c5b3168cde2.rst
new file mode 100644
index 0000000000..4bd482f1b7
--- /dev/null
+++ b/doc/source/stdlib/handmade/function-math-quat_from_euler-0x66163c5b3168cde2.rst
@@ -0,0 +1 @@
+Creates a float4 quaternion from a float3 of Euler angles (pitch, yaw, roll) given in radians.
diff --git a/doc/source/stdlib/handmade/function-math-rcp-0xc1228b4eb6e8809e.rst b/doc/source/stdlib/handmade/function-math-rcp-0xc1228b4eb6e8809e.rst
new file mode 100644
index 0000000000..1430903129
--- /dev/null
+++ b/doc/source/stdlib/handmade/function-math-rcp-0xc1228b4eb6e8809e.rst
@@ -0,0 +1 @@
+Returns the reciprocal (1/x) of a scalar float or each component of a float2, float3, or float4 vector.
diff --git a/doc/source/stdlib/handmade/function-math-rcp_est-0x1e22c587fb16fb45.rst b/doc/source/stdlib/handmade/function-math-rcp_est-0x1e22c587fb16fb45.rst
new file mode 100644
index 0000000000..b745137900
--- /dev/null
+++ b/doc/source/stdlib/handmade/function-math-rcp_est-0x1e22c587fb16fb45.rst
@@ -0,0 +1 @@
+Returns a fast hardware estimate of the reciprocal (1/x) of a scalar float or each component of a float2, float3, or float4 vector, trading precision for speed.
diff --git a/doc/source/stdlib/handmade/function-math-rcp_est-0x4c111d0fa80d4b35.rst b/doc/source/stdlib/handmade/function-math-rcp_est-0x4c111d0fa80d4b35.rst
new file mode 100644
index 0000000000..b745137900
--- /dev/null
+++ b/doc/source/stdlib/handmade/function-math-rcp_est-0x4c111d0fa80d4b35.rst
@@ -0,0 +1 @@
+Returns a fast hardware estimate of the reciprocal (1/x) of a scalar float or each component of a float2, float3, or float4 vector, trading precision for speed.
diff --git a/doc/source/stdlib/handmade/function-math-reflect-0x755a9e9909bee6a6.rst b/doc/source/stdlib/handmade/function-math-reflect-0x755a9e9909bee6a6.rst
new file mode 100644
index 0000000000..5a7ee3f89f
--- /dev/null
+++ b/doc/source/stdlib/handmade/function-math-reflect-0x755a9e9909bee6a6.rst
@@ -0,0 +1 @@
+Computes the reflection of float2 or float3 vector v off a surface with unit normal n, returning the reflected vector as v - 2*dot(v,n)*n.
diff --git a/doc/source/stdlib/handmade/function-math-refract-0x4d57f7974e4e6ea3.rst b/doc/source/stdlib/handmade/function-math-refract-0x4d57f7974e4e6ea3.rst
new file mode 100644
index 0000000000..a4eab543df
--- /dev/null
+++ b/doc/source/stdlib/handmade/function-math-refract-0x4d57f7974e4e6ea3.rst
@@ -0,0 +1 @@
+Computes the refraction direction of vector v through a surface with unit normal n using Snell's law with index of refraction ratio nint. Returns a zero vector if total internal reflection occurs.
diff --git a/doc/source/stdlib/handmade/function-math-remainder-0x14cd516d6e6b2d15.rst b/doc/source/stdlib/handmade/function-math-remainder-0x14cd516d6e6b2d15.rst
new file mode 100644
index 0000000000..472fe72425
--- /dev/null
+++ b/doc/source/stdlib/handmade/function-math-remainder-0x14cd516d6e6b2d15.rst
@@ -0,0 +1 @@
+Returns the IEEE 754 remainder of x/y (round-to-nearest quotient), component-wise for vectors and for float/double scalars.
diff --git a/doc/source/stdlib/handmade/function-math-round-0x1807de5c1dd28ab3.rst b/doc/source/stdlib/handmade/function-math-round-0x1807de5c1dd28ab3.rst
new file mode 100644
index 0000000000..b8e600c601
--- /dev/null
+++ b/doc/source/stdlib/handmade/function-math-round-0x1807de5c1dd28ab3.rst
@@ -0,0 +1 @@
+Rounds each component of the scalar double, float, or vector float2, float3, float4 value x to the nearest integer; halfway cases round away from zero (roundf-style) on the vector path, and follow `::round` for scalar double.
diff --git a/doc/source/stdlib/handmade/function-math-round-0xa7e95d86acc16365.rst b/doc/source/stdlib/handmade/function-math-round-0xa7e95d86acc16365.rst
index 7e09b545cb..b8e600c601 100644
--- a/doc/source/stdlib/handmade/function-math-round-0xa7e95d86acc16365.rst
+++ b/doc/source/stdlib/handmade/function-math-round-0xa7e95d86acc16365.rst
@@ -1 +1 @@
-Rounds each component of the scalar double, float, or vector float2, float3, float4 value x to the nearest integer, with halfway cases rounded to the nearest even value.
+Rounds each component of the scalar double, float, or vector float2, float3, float4 value x to the nearest integer; halfway cases round away from zero (roundf-style) on the vector path, and follow `::round` for scalar double.
diff --git a/doc/source/stdlib/handmade/function-math-round-0xfddb15b26124db4c.rst b/doc/source/stdlib/handmade/function-math-round-0xfddb15b26124db4c.rst
new file mode 100644
index 0000000000..836d81cf4c
--- /dev/null
+++ b/doc/source/stdlib/handmade/function-math-round-0xfddb15b26124db4c.rst
@@ -0,0 +1 @@
+Returns the rounded value of x to the nearest integer; works with float and double scalars.
diff --git a/doc/source/stdlib/handmade/function-math-roundi-0x893aeb0e2a97b575.rst b/doc/source/stdlib/handmade/function-math-roundi-0x893aeb0e2a97b575.rst
new file mode 100644
index 0000000000..ef5da30a63
--- /dev/null
+++ b/doc/source/stdlib/handmade/function-math-roundi-0x893aeb0e2a97b575.rst
@@ -0,0 +1 @@
+Rounds the float x to the nearest integer value and returns the result as an int.
diff --git a/doc/source/stdlib/handmade/function-math-rsqrt-0x875d3c94b60ae197.rst b/doc/source/stdlib/handmade/function-math-rsqrt-0x875d3c94b60ae197.rst
new file mode 100644
index 0000000000..7ef5336161
--- /dev/null
+++ b/doc/source/stdlib/handmade/function-math-rsqrt-0x875d3c94b60ae197.rst
@@ -0,0 +1 @@
+Returns the reciprocal square root (1/sqrt(x)) of a scalar float or each component of a float2, float3, or float4 vector.
diff --git a/doc/source/stdlib/handmade/function-math-rsqrt_est-0xfd9bda43f9984c6a.rst b/doc/source/stdlib/handmade/function-math-rsqrt_est-0xfd9bda43f9984c6a.rst
new file mode 100644
index 0000000000..0c330cdcea
--- /dev/null
+++ b/doc/source/stdlib/handmade/function-math-rsqrt_est-0xfd9bda43f9984c6a.rst
@@ -0,0 +1 @@
+Returns a fast hardware estimate of the reciprocal square root (1/sqrt(x)) of a scalar float or each component of a float2, float3, or float4 vector, trading precision for speed.
diff --git a/doc/source/stdlib/handmade/function-math-safe_asin-0x14a3433604fb80f1.rst b/doc/source/stdlib/handmade/function-math-safe_asin-0x14a3433604fb80f1.rst
new file mode 100644
index 0000000000..dc81b3ac6f
--- /dev/null
+++ b/doc/source/stdlib/handmade/function-math-safe_asin-0x14a3433604fb80f1.rst
@@ -0,0 +1 @@
+Returns the arcsine of x in radians, clamping the input to the valid domain [-1, 1] to prevent NaN results from out-of-range values.
diff --git a/doc/source/stdlib/handmade/function-math-saturate-0xbfc698f93638b481.rst b/doc/source/stdlib/handmade/function-math-saturate-0xbfc698f93638b481.rst
new file mode 100644
index 0000000000..7f93db9608
--- /dev/null
+++ b/doc/source/stdlib/handmade/function-math-saturate-0xbfc698f93638b481.rst
@@ -0,0 +1 @@
+Clamps the scalar double, float, or each component of a float2, float3, float4 vector to the [0, 1] range, returning 0 for values below 0 and 1 for values above 1.
diff --git a/doc/source/stdlib/handmade/function-math-sign-0x400bb24ff4d036d8.rst b/doc/source/stdlib/handmade/function-math-sign-0x400bb24ff4d036d8.rst
new file mode 100644
index 0000000000..826d06ced0
--- /dev/null
+++ b/doc/source/stdlib/handmade/function-math-sign-0x400bb24ff4d036d8.rst
@@ -0,0 +1 @@
+Returns the sign of x component-wise: -1 for negative, 0 for zero, and 1 for positive.
diff --git a/doc/source/stdlib/handmade/function-math-sin-0xe69fa43ee54985eb.rst b/doc/source/stdlib/handmade/function-math-sin-0xe69fa43ee54985eb.rst
new file mode 100644
index 0000000000..dad8e4c53b
--- /dev/null
+++ b/doc/source/stdlib/handmade/function-math-sin-0xe69fa43ee54985eb.rst
@@ -0,0 +1 @@
+Returns the sine of the angle x given in radians for double or float, with output in the range [-1, 1].
diff --git a/doc/source/stdlib/handmade/function-math-sincos-0x48b7fded0f1787cd.rst b/doc/source/stdlib/handmade/function-math-sincos-0x48b7fded0f1787cd.rst
new file mode 100644
index 0000000000..b8f6e8bef9
--- /dev/null
+++ b/doc/source/stdlib/handmade/function-math-sincos-0x48b7fded0f1787cd.rst
@@ -0,0 +1 @@
+Computes both the sine and cosine of the angle x in radians simultaneously, writing the results to output parameters s and c, for float or double types.
diff --git a/doc/source/stdlib/handmade/function-math-sinh-0x67b31be9c4d427b9.rst b/doc/source/stdlib/handmade/function-math-sinh-0x67b31be9c4d427b9.rst
new file mode 100644
index 0000000000..a6e41f1f60
--- /dev/null
+++ b/doc/source/stdlib/handmade/function-math-sinh-0x67b31be9c4d427b9.rst
@@ -0,0 +1 @@
+Returns the hyperbolic sine of x, component-wise for float2/float3/float4 vectors, and for float/double scalars.
diff --git a/doc/source/stdlib/handmade/function-math-sinh-0xad1e85e5a2f4cdcd.rst b/doc/source/stdlib/handmade/function-math-sinh-0xad1e85e5a2f4cdcd.rst
new file mode 100644
index 0000000000..a6e41f1f60
--- /dev/null
+++ b/doc/source/stdlib/handmade/function-math-sinh-0xad1e85e5a2f4cdcd.rst
@@ -0,0 +1 @@
+Returns the hyperbolic sine of x, component-wise for float2/float3/float4 vectors, and for float/double scalars.
diff --git a/doc/source/stdlib/handmade/function-math-tan-0x85aeefde3b6ace60.rst b/doc/source/stdlib/handmade/function-math-tan-0x85aeefde3b6ace60.rst
new file mode 100644
index 0000000000..0d933331d6
--- /dev/null
+++ b/doc/source/stdlib/handmade/function-math-tan-0x85aeefde3b6ace60.rst
@@ -0,0 +1 @@
+Returns the tangent of the angle x given in radians for double or float; undefined at odd multiples of pi/2.
diff --git a/doc/source/stdlib/handmade/function-math-tanh-0x7fa203d8b8030c10.rst b/doc/source/stdlib/handmade/function-math-tanh-0x7fa203d8b8030c10.rst
new file mode 100644
index 0000000000..2262ed66f5
--- /dev/null
+++ b/doc/source/stdlib/handmade/function-math-tanh-0x7fa203d8b8030c10.rst
@@ -0,0 +1 @@
+Returns the hyperbolic tangent of x, component-wise for float2/float3/float4 vectors, and for float/double scalars.
diff --git a/doc/source/stdlib/handmade/function-math-tanh-0x8bf46ddc3c3172a8.rst b/doc/source/stdlib/handmade/function-math-tanh-0x8bf46ddc3c3172a8.rst
new file mode 100644
index 0000000000..2262ed66f5
--- /dev/null
+++ b/doc/source/stdlib/handmade/function-math-tanh-0x8bf46ddc3c3172a8.rst
@@ -0,0 +1 @@
+Returns the hyperbolic tangent of x, component-wise for float2/float3/float4 vectors, and for float/double scalars.
diff --git a/doc/source/stdlib/handmade/function-math-trunc-0x666600df9d093e85.rst b/doc/source/stdlib/handmade/function-math-trunc-0x666600df9d093e85.rst
new file mode 100644
index 0000000000..17b20893e7
--- /dev/null
+++ b/doc/source/stdlib/handmade/function-math-trunc-0x666600df9d093e85.rst
@@ -0,0 +1 @@
+Rounds x toward zero, component-wise for float2/float3/float4 vectors, and for float/double scalars.
diff --git a/doc/source/stdlib/handmade/function-math-trunc-0x8aa4540ecb056930.rst b/doc/source/stdlib/handmade/function-math-trunc-0x8aa4540ecb056930.rst
new file mode 100644
index 0000000000..17b20893e7
--- /dev/null
+++ b/doc/source/stdlib/handmade/function-math-trunc-0x8aa4540ecb056930.rst
@@ -0,0 +1 @@
+Rounds x toward zero, component-wise for float2/float3/float4 vectors, and for float/double scalars.
diff --git a/doc/source/stdlib/handmade/function-pugixml-append_copy-0xfaa4a4fb72c385df.rst b/doc/source/stdlib/handmade/function-pugixml-append_copy-0xfaa4a4fb72c385df.rst
new file mode 100644
index 0000000000..f9dadf7853
--- /dev/null
+++ b/doc/source/stdlib/handmade/function-pugixml-append_copy-0xfaa4a4fb72c385df.rst
@@ -0,0 +1 @@
+Appends a deep copy of the given attribute or node as the last child.
diff --git a/doc/source/stdlib/handmade/function-pugixml-as_float-0xa6a319ce9863126b.rst b/doc/source/stdlib/handmade/function-pugixml-as_float-0xa6a319ce9863126b.rst
new file mode 100644
index 0000000000..4147c005ba
--- /dev/null
+++ b/doc/source/stdlib/handmade/function-pugixml-as_float-0xa6a319ce9863126b.rst
@@ -0,0 +1 @@
+Returns the attribute or text value as a float, or *default_value* if conversion fails.
diff --git a/doc/source/stdlib/handmade/function-pugixml-as_string-0x1feda375c1759c98.rst b/doc/source/stdlib/handmade/function-pugixml-as_string-0x1feda375c1759c98.rst
new file mode 100644
index 0000000000..13139be847
--- /dev/null
+++ b/doc/source/stdlib/handmade/function-pugixml-as_string-0x1feda375c1759c98.rst
@@ -0,0 +1 @@
+Returns the attribute or text value as a string, or *default_value* if empty.
diff --git a/doc/source/stdlib/handmade/function-pugixml-find_child_by_attribute-0xdd586a0c75566b58.rst b/doc/source/stdlib/handmade/function-pugixml-find_child_by_attribute-0xdd586a0c75566b58.rst
new file mode 100644
index 0000000000..cbbd6b3e9a
--- /dev/null
+++ b/doc/source/stdlib/handmade/function-pugixml-find_child_by_attribute-0xdd586a0c75566b58.rst
@@ -0,0 +1 @@
+Finds the first child element that has an attribute matching the given name and value.
diff --git a/doc/source/stdlib/handmade/function-pugixml-prepend_copy-0xc0a01afddb415a7c.rst b/doc/source/stdlib/handmade/function-pugixml-prepend_copy-0xc0a01afddb415a7c.rst
new file mode 100644
index 0000000000..065bfd7441
--- /dev/null
+++ b/doc/source/stdlib/handmade/function-pugixml-prepend_copy-0xc0a01afddb415a7c.rst
@@ -0,0 +1 @@
+Prepends a deep copy of the given attribute or node as the first child.
diff --git a/doc/source/stdlib/handmade/function-pugixml-select_node-0x57836d963ffd5974.rst b/doc/source/stdlib/handmade/function-pugixml-select_node-0x57836d963ffd5974.rst
new file mode 100644
index 0000000000..5df29e9d31
--- /dev/null
+++ b/doc/source/stdlib/handmade/function-pugixml-select_node-0x57836d963ffd5974.rst
@@ -0,0 +1 @@
+Selects the first node matching the XPath query string or compiled query.
diff --git a/doc/source/stdlib/handmade/function-pugixml-select_nodes-0x7fdb17f47af4f797.rst b/doc/source/stdlib/handmade/function-pugixml-select_nodes-0x7fdb17f47af4f797.rst
new file mode 100644
index 0000000000..6a68d18b08
--- /dev/null
+++ b/doc/source/stdlib/handmade/function-pugixml-select_nodes-0x7fdb17f47af4f797.rst
@@ -0,0 +1 @@
+Selects all nodes matching the XPath query and returns them as an xpath_node_set.
diff --git a/doc/source/stdlib/handmade/function-pugixml-set-0x46f4c93c0346dfd5.rst b/doc/source/stdlib/handmade/function-pugixml-set-0x46f4c93c0346dfd5.rst
new file mode 100644
index 0000000000..0a1be772b0
--- /dev/null
+++ b/doc/source/stdlib/handmade/function-pugixml-set-0x46f4c93c0346dfd5.rst
@@ -0,0 +1 @@
+Sets the text content or XPath variable value. Multiple overloads accept string, int, uint, float, double, bool, int64, uint64.
diff --git a/doc/source/stdlib/handmade/function-pugixml-set-0x9e72baf4bbac288c.rst b/doc/source/stdlib/handmade/function-pugixml-set-0x9e72baf4bbac288c.rst
new file mode 100644
index 0000000000..0a1be772b0
--- /dev/null
+++ b/doc/source/stdlib/handmade/function-pugixml-set-0x9e72baf4bbac288c.rst
@@ -0,0 +1 @@
+Sets the text content or XPath variable value. Multiple overloads accept string, int, uint, float, double, bool, int64, uint64.
diff --git a/doc/source/stdlib/handmade/function-pugixml-set_name-0xf7648c2c8868fd9.rst b/doc/source/stdlib/handmade/function-pugixml-set_name-0xf7648c2c8868fd9.rst
new file mode 100644
index 0000000000..36e01e5f87
--- /dev/null
+++ b/doc/source/stdlib/handmade/function-pugixml-set_name-0xf7648c2c8868fd9.rst
@@ -0,0 +1 @@
+Changes the name (tag) of the node or attribute.
diff --git a/doc/source/stdlib/handmade/function-pugixml-set_value-0x1668b158d0b70dc.rst b/doc/source/stdlib/handmade/function-pugixml-set_value-0x1668b158d0b70dc.rst
new file mode 100644
index 0000000000..86d9ab61ba
--- /dev/null
+++ b/doc/source/stdlib/handmade/function-pugixml-set_value-0x1668b158d0b70dc.rst
@@ -0,0 +1 @@
+Sets the value of the node or attribute. Accepts string, int, uint, float, double, or bool.
diff --git a/doc/source/stdlib/handmade/function-pugixml-set_value-0xbff6087b79253a01.rst b/doc/source/stdlib/handmade/function-pugixml-set_value-0xbff6087b79253a01.rst
new file mode 100644
index 0000000000..86d9ab61ba
--- /dev/null
+++ b/doc/source/stdlib/handmade/function-pugixml-set_value-0xbff6087b79253a01.rst
@@ -0,0 +1 @@
+Sets the value of the node or attribute. Accepts string, int, uint, float, double, or bool.
diff --git a/doc/source/stdlib/handmade/function-pugixml-xpath_compile-0x58df4c8d901aeaf8.rst b/doc/source/stdlib/handmade/function-pugixml-xpath_compile-0x58df4c8d901aeaf8.rst
new file mode 100644
index 0000000000..a05a303925
--- /dev/null
+++ b/doc/source/stdlib/handmade/function-pugixml-xpath_compile-0x58df4c8d901aeaf8.rst
@@ -0,0 +1 @@
+Compiles an XPath expression string into an xpath_query. Optionally accepts an xpath_variable_set for parameterized queries.
diff --git a/doc/source/stdlib/handmade/function-raster-gather-0x272c1425b94a8eba.rst b/doc/source/stdlib/handmade/function-raster-gather-0x272c1425b94a8eba.rst
new file mode 100644
index 0000000000..a15779fe98
--- /dev/null
+++ b/doc/source/stdlib/handmade/function-raster-gather-0x272c1425b94a8eba.rst
@@ -0,0 +1 @@
+Gather pixels from source using an index array into destination.
diff --git a/doc/source/stdlib/handmade/function-raster-gather-0x3c774f64c1dee720.rst b/doc/source/stdlib/handmade/function-raster-gather-0x3c774f64c1dee720.rst
new file mode 100644
index 0000000000..a15779fe98
--- /dev/null
+++ b/doc/source/stdlib/handmade/function-raster-gather-0x3c774f64c1dee720.rst
@@ -0,0 +1 @@
+Gather pixels from source using an index array into destination.
diff --git a/doc/source/stdlib/handmade/function-raster-gather_scatter-0x288a620074ebe8af.rst b/doc/source/stdlib/handmade/function-raster-gather_scatter-0x288a620074ebe8af.rst
new file mode 100644
index 0000000000..2372a6d70d
--- /dev/null
+++ b/doc/source/stdlib/handmade/function-raster-gather_scatter-0x288a620074ebe8af.rst
@@ -0,0 +1 @@
+Gather pixels from source and scatter to destination using index arrays.
diff --git a/doc/source/stdlib/handmade/function-raster-gather_scatter-0x727540d0f3c456ad.rst b/doc/source/stdlib/handmade/function-raster-gather_scatter-0x727540d0f3c456ad.rst
new file mode 100644
index 0000000000..2372a6d70d
--- /dev/null
+++ b/doc/source/stdlib/handmade/function-raster-gather_scatter-0x727540d0f3c456ad.rst
@@ -0,0 +1 @@
+Gather pixels from source and scatter to destination using index arrays.
diff --git a/doc/source/stdlib/handmade/function-raster-gather_scatter_neq_mask-0x73063598d63d601c.rst b/doc/source/stdlib/handmade/function-raster-gather_scatter_neq_mask-0x73063598d63d601c.rst
new file mode 100644
index 0000000000..b44ddd5055
--- /dev/null
+++ b/doc/source/stdlib/handmade/function-raster-gather_scatter_neq_mask-0x73063598d63d601c.rst
@@ -0,0 +1 @@
+Gather-scatter pixels where mask value differs from gathered pixel.
diff --git a/doc/source/stdlib/handmade/function-raster-gather_scatter_neq_mask-0xcbd53fcdd0c7309c.rst b/doc/source/stdlib/handmade/function-raster-gather_scatter_neq_mask-0xcbd53fcdd0c7309c.rst
new file mode 100644
index 0000000000..b44ddd5055
--- /dev/null
+++ b/doc/source/stdlib/handmade/function-raster-gather_scatter_neq_mask-0xcbd53fcdd0c7309c.rst
@@ -0,0 +1 @@
+Gather-scatter pixels where mask value differs from gathered pixel.
diff --git a/doc/source/stdlib/handmade/function-raster-gather_store_neq_mask-0x258a07c78bbb973c.rst b/doc/source/stdlib/handmade/function-raster-gather_store_neq_mask-0x258a07c78bbb973c.rst
new file mode 100644
index 0000000000..bd2a7ef50a
--- /dev/null
+++ b/doc/source/stdlib/handmade/function-raster-gather_store_neq_mask-0x258a07c78bbb973c.rst
@@ -0,0 +1 @@
+Gather from source and store where mask differs, using a single value.
diff --git a/doc/source/stdlib/handmade/function-raster-gather_store_neq_mask-0xfd5b3b3444f74f10.rst b/doc/source/stdlib/handmade/function-raster-gather_store_neq_mask-0xfd5b3b3444f74f10.rst
new file mode 100644
index 0000000000..bd2a7ef50a
--- /dev/null
+++ b/doc/source/stdlib/handmade/function-raster-gather_store_neq_mask-0xfd5b3b3444f74f10.rst
@@ -0,0 +1 @@
+Gather from source and store where mask differs, using a single value.
diff --git a/doc/source/stdlib/handmade/function-raster-gather_store_stride-0xa2dbcc09e11b460a.rst b/doc/source/stdlib/handmade/function-raster-gather_store_stride-0xa2dbcc09e11b460a.rst
new file mode 100644
index 0000000000..0e0994cc51
--- /dev/null
+++ b/doc/source/stdlib/handmade/function-raster-gather_store_stride-0xa2dbcc09e11b460a.rst
@@ -0,0 +1 @@
+Gather pixels with a stride offset between source elements.
diff --git a/doc/source/stdlib/handmade/function-raster-gather_store_stride-0xf07af2a65ade3184.rst b/doc/source/stdlib/handmade/function-raster-gather_store_stride-0xf07af2a65ade3184.rst
new file mode 100644
index 0000000000..0e0994cc51
--- /dev/null
+++ b/doc/source/stdlib/handmade/function-raster-gather_store_stride-0xf07af2a65ade3184.rst
@@ -0,0 +1 @@
+Gather pixels with a stride offset between source elements.
diff --git a/doc/source/stdlib/handmade/function-raster-scatter-0xac10e7903c2c2139.rst b/doc/source/stdlib/handmade/function-raster-scatter-0xac10e7903c2c2139.rst
new file mode 100644
index 0000000000..cabbfbf4ec
--- /dev/null
+++ b/doc/source/stdlib/handmade/function-raster-scatter-0xac10e7903c2c2139.rst
@@ -0,0 +1 @@
+Scatter pixels from source to destination using an index array.
diff --git a/doc/source/stdlib/handmade/function-raster-scatter-0xf250afe3ee1db364.rst b/doc/source/stdlib/handmade/function-raster-scatter-0xf250afe3ee1db364.rst
new file mode 100644
index 0000000000..cabbfbf4ec
--- /dev/null
+++ b/doc/source/stdlib/handmade/function-raster-scatter-0xf250afe3ee1db364.rst
@@ -0,0 +1 @@
+Scatter pixels from source to destination using an index array.
diff --git a/doc/source/stdlib/handmade/function-raster-scatter_neq_mask-0x5ea36fd5e035a.rst b/doc/source/stdlib/handmade/function-raster-scatter_neq_mask-0x5ea36fd5e035a.rst
new file mode 100644
index 0000000000..34c3ac0b91
--- /dev/null
+++ b/doc/source/stdlib/handmade/function-raster-scatter_neq_mask-0x5ea36fd5e035a.rst
@@ -0,0 +1 @@
+Scatter pixels to destination where mask value differs from pixel value.
diff --git a/doc/source/stdlib/handmade/function-raster-scatter_neq_mask-0xf7d6d6a526045f4.rst b/doc/source/stdlib/handmade/function-raster-scatter_neq_mask-0xf7d6d6a526045f4.rst
new file mode 100644
index 0000000000..34c3ac0b91
--- /dev/null
+++ b/doc/source/stdlib/handmade/function-raster-scatter_neq_mask-0xf7d6d6a526045f4.rst
@@ -0,0 +1 @@
+Scatter pixels to destination where mask value differs from pixel value.
diff --git a/doc/source/stdlib/handmade/function-raster-store_neq_mask-0x70653f7e8b8c8e67.rst b/doc/source/stdlib/handmade/function-raster-store_neq_mask-0x70653f7e8b8c8e67.rst
new file mode 100644
index 0000000000..216cf476f9
--- /dev/null
+++ b/doc/source/stdlib/handmade/function-raster-store_neq_mask-0x70653f7e8b8c8e67.rst
@@ -0,0 +1 @@
+Store a single pixel at indexed positions where mask differs.
diff --git a/doc/source/stdlib/handmade/function-raster-u8x4_gather_store-0x71c7f2516a21647c.rst b/doc/source/stdlib/handmade/function-raster-u8x4_gather_store-0x71c7f2516a21647c.rst
new file mode 100644
index 0000000000..90de024975
--- /dev/null
+++ b/doc/source/stdlib/handmade/function-raster-u8x4_gather_store-0x71c7f2516a21647c.rst
@@ -0,0 +1 @@
+Gather 4-byte pixels and store to sequential destination.
diff --git a/doc/source/stdlib/handmade/function-regex-regex_compile-0x1dec62bdc2bc6440.rst b/doc/source/stdlib/handmade/function-regex-regex_compile-0x1dec62bdc2bc6440.rst
new file mode 100644
index 0000000000..ff5dcfda38
--- /dev/null
+++ b/doc/source/stdlib/handmade/function-regex-regex_compile-0x1dec62bdc2bc6440.rst
@@ -0,0 +1 @@
+Compiles a regular expression pattern string into a ``Regex`` object. Panics if the pattern is invalid. An overload taking a ``var re : Regex`` out-parameter returns ``bool`` instead of panicking. Optional flags: ``case_insensitive=true`` for ASCII case-insensitive matching, ``dot_all=true`` for ``.`` to also match newline characters.
diff --git a/doc/source/stdlib/handmade/function-rst-function_label_file-0xaadfdda567bbfd3e.rst b/doc/source/stdlib/handmade/function-rst-function_label_file-0xaadfdda567bbfd3e.rst
new file mode 100644
index 0000000000..39a0add024
--- /dev/null
+++ b/doc/source/stdlib/handmade/function-rst-function_label_file-0xaadfdda567bbfd3e.rst
@@ -0,0 +1 @@
+Creates a unique, file-name-safe label string for a function.
diff --git a/doc/source/stdlib/handmade/function-rtti-compile-0x97cb4ad1839fc0a.rst b/doc/source/stdlib/handmade/function-rtti-compile-0x97cb4ad1839fc0a.rst
new file mode 100644
index 0000000000..c1cd43d1d9
--- /dev/null
+++ b/doc/source/stdlib/handmade/function-rtti-compile-0x97cb4ad1839fc0a.rst
@@ -0,0 +1 @@
+Compiles a daslang program from a source code string using the provided ``FileAccess`` and ``ModuleGroup``, returning a ``ProgramPtr`` (null on failure).
diff --git a/doc/source/stdlib/handmade/function-rtti-describe-0xd6fdcb998ae44f46.rst b/doc/source/stdlib/handmade/function-rtti-describe-0xd6fdcb998ae44f46.rst
new file mode 100644
index 0000000000..9a09b56f10
--- /dev/null
+++ b/doc/source/stdlib/handmade/function-rtti-describe-0xd6fdcb998ae44f46.rst
@@ -0,0 +1 @@
+Returns a human-readable ``string`` description of an RTTI object (``TypeInfo``, ``VarInfo``, ``FuncInfo``, etc.), useful for logging and debug output.
diff --git a/doc/source/stdlib/handmade/function-rtti-each-0x1b45f606c41c591c.rst b/doc/source/stdlib/handmade/function-rtti-each-0x1b45f606c41c591c.rst
new file mode 100644
index 0000000000..2fb72a0bc0
--- /dev/null
+++ b/doc/source/stdlib/handmade/function-rtti-each-0x1b45f606c41c591c.rst
@@ -0,0 +1 @@
+Iterates through each element of an RTTI container (e.g., ``AnnotationArguments``, ``AnnotationArgumentList``, ``AnnotationList``), yielding individual entries.
diff --git a/doc/source/stdlib/handmade/function-rtti-each-0x531450ac21d9e0dc.rst b/doc/source/stdlib/handmade/function-rtti-each-0x531450ac21d9e0dc.rst
new file mode 100644
index 0000000000..2fb72a0bc0
--- /dev/null
+++ b/doc/source/stdlib/handmade/function-rtti-each-0x531450ac21d9e0dc.rst
@@ -0,0 +1 @@
+Iterates through each element of an RTTI container (e.g., ``AnnotationArguments``, ``AnnotationArgumentList``, ``AnnotationList``), yielding individual entries.
diff --git a/doc/source/stdlib/handmade/function-rtti-get_dim-0xfb47ce5d74f4d55.rst b/doc/source/stdlib/handmade/function-rtti-get_dim-0xfb47ce5d74f4d55.rst
new file mode 100644
index 0000000000..c95b3dfd39
--- /dev/null
+++ b/doc/source/stdlib/handmade/function-rtti-get_dim-0xfb47ce5d74f4d55.rst
@@ -0,0 +1 @@
+Returns the dimension size (``int``) at the specified index for a fixed-size array type described by ``TypeInfo``.
diff --git a/doc/source/stdlib/handmade/function-rtti-get_function_by_mangled_name_hash-0xc38120dfbbd65398.rst b/doc/source/stdlib/handmade/function-rtti-get_function_by_mangled_name_hash-0xc38120dfbbd65398.rst
new file mode 100644
index 0000000000..c4a0fabd74
--- /dev/null
+++ b/doc/source/stdlib/handmade/function-rtti-get_function_by_mangled_name_hash-0xc38120dfbbd65398.rst
@@ -0,0 +1 @@
+Returns a ``function<>`` lambda value looked up by its mangled name hash in the given ``Context``.
diff --git a/doc/source/stdlib/handmade/function-rtti-get_function_info-0x3e1cd5690dcaf413.rst b/doc/source/stdlib/handmade/function-rtti-get_function_info-0x3e1cd5690dcaf413.rst
new file mode 100644
index 0000000000..fd36907429
--- /dev/null
+++ b/doc/source/stdlib/handmade/function-rtti-get_function_info-0x3e1cd5690dcaf413.rst
@@ -0,0 +1 @@
+Returns the ``FuncInfo`` pointer for a function at the given index in the ``Context``, providing access to its name, arguments, and return type.
diff --git a/doc/source/stdlib/handmade/function-rtti-sprint_data-0xf7eae29ca5b5c5e.rst b/doc/source/stdlib/handmade/function-rtti-sprint_data-0xf7eae29ca5b5c5e.rst
new file mode 100644
index 0000000000..c7c77fec90
--- /dev/null
+++ b/doc/source/stdlib/handmade/function-rtti-sprint_data-0xf7eae29ca5b5c5e.rst
@@ -0,0 +1 @@
+Returns a ``string`` representation of a value given its data pointer and ``TypeInfo``, similar to ``debug`` or ``print`` but capturing output as a string.
diff --git a/doc/source/stdlib/handmade/function-strings-double-0xe25d1174cddadb94.rst b/doc/source/stdlib/handmade/function-strings-double-0xe25d1174cddadb94.rst
new file mode 100644
index 0000000000..b526733e58
--- /dev/null
+++ b/doc/source/stdlib/handmade/function-strings-double-0xe25d1174cddadb94.rst
@@ -0,0 +1 @@
+Converts a string to a double value, panicking on failure; an overload accepts `result` and `offset` output parameters to report the ConversionResult status and parsed position instead of panicking.
diff --git a/doc/source/stdlib/handmade/function-strings-ends_with-0xb1dc8d3e408b4102.rst b/doc/source/stdlib/handmade/function-strings-ends_with-0xb1dc8d3e408b4102.rst
new file mode 100644
index 0000000000..d4dfbb80d0
--- /dev/null
+++ b/doc/source/stdlib/handmade/function-strings-ends_with-0xb1dc8d3e408b4102.rst
@@ -0,0 +1 @@
+Returns true if the string `str` ends with the substring `cmp`, false otherwise.
diff --git a/doc/source/stdlib/handmade/function-strings-find-0x9b44bbfe385ca191.rst b/doc/source/stdlib/handmade/function-strings-find-0x9b44bbfe385ca191.rst
new file mode 100644
index 0000000000..3bbb7f9cac
--- /dev/null
+++ b/doc/source/stdlib/handmade/function-strings-find-0x9b44bbfe385ca191.rst
@@ -0,0 +1 @@
+Returns the first index at which `substr` (string or character code) occurs in `str`, optionally searching from `start`, or -1 if not found.
diff --git a/doc/source/stdlib/handmade/function-strings-find-0xa37f23299b9f6f54.rst b/doc/source/stdlib/handmade/function-strings-find-0xa37f23299b9f6f54.rst
new file mode 100644
index 0000000000..3bbb7f9cac
--- /dev/null
+++ b/doc/source/stdlib/handmade/function-strings-find-0xa37f23299b9f6f54.rst
@@ -0,0 +1 @@
+Returns the first index at which `substr` (string or character code) occurs in `str`, optionally searching from `start`, or -1 if not found.
diff --git a/doc/source/stdlib/handmade/function-strings-first_character-0x45eb857df9e013a6.rst b/doc/source/stdlib/handmade/function-strings-first_character-0x45eb857df9e013a6.rst
new file mode 100644
index 0000000000..40f4844151
--- /dev/null
+++ b/doc/source/stdlib/handmade/function-strings-first_character-0x45eb857df9e013a6.rst
@@ -0,0 +1 @@
+Returns the first character of the string as an integer. Throws an error if the string is empty. O(1) — no strlen call.
diff --git a/doc/source/stdlib/handmade/function-strings-float-0xf959e0efbd533373.rst b/doc/source/stdlib/handmade/function-strings-float-0xf959e0efbd533373.rst
new file mode 100644
index 0000000000..a6e36b39f7
--- /dev/null
+++ b/doc/source/stdlib/handmade/function-strings-float-0xf959e0efbd533373.rst
@@ -0,0 +1 @@
+Converts a string to a float value, panicking on failure; an overload accepts `result` and `offset` output parameters to report the ConversionResult status and parsed position instead of panicking.
diff --git a/doc/source/stdlib/handmade/function-strings-fmt-0x9257119750352.rst b/doc/source/stdlib/handmade/function-strings-fmt-0x9257119750352.rst
new file mode 100644
index 0000000000..82bbf75137
--- /dev/null
+++ b/doc/source/stdlib/handmade/function-strings-fmt-0x9257119750352.rst
@@ -0,0 +1 @@
+Formats a numeric value of type T into the StringBuilderWriter using a libfmt/C++20 std::format format string and returns a reference to the writer.
diff --git a/doc/source/stdlib/handmade/function-strings-fmt-0xb82eee156bb35ebb.rst b/doc/source/stdlib/handmade/function-strings-fmt-0xb82eee156bb35ebb.rst
new file mode 100644
index 0000000000..82bbf75137
--- /dev/null
+++ b/doc/source/stdlib/handmade/function-strings-fmt-0xb82eee156bb35ebb.rst
@@ -0,0 +1 @@
+Formats a numeric value of type T into the StringBuilderWriter using a libfmt/C++20 std::format format string and returns a reference to the writer.
diff --git a/doc/source/stdlib/handmade/function-strings-format-0x5c4462a56f63c80.rst b/doc/source/stdlib/handmade/function-strings-format-0x5c4462a56f63c80.rst
new file mode 100644
index 0000000000..c4168f568c
--- /dev/null
+++ b/doc/source/stdlib/handmade/function-strings-format-0x5c4462a56f63c80.rst
@@ -0,0 +1 @@
+Formats a numeric value of type T using a C printf-style format string, either appending to a StringBuilderWriter and returning a reference to it, or returning the formatted result as a new string.
diff --git a/doc/source/stdlib/handmade/function-strings-format-0xd32bbd9480a20a41.rst b/doc/source/stdlib/handmade/function-strings-format-0xd32bbd9480a20a41.rst
new file mode 100644
index 0000000000..c4168f568c
--- /dev/null
+++ b/doc/source/stdlib/handmade/function-strings-format-0xd32bbd9480a20a41.rst
@@ -0,0 +1 @@
+Formats a numeric value of type T using a C printf-style format string, either appending to a StringBuilderWriter and returning a reference to it, or returning the formatted result as a new string.
diff --git a/doc/source/stdlib/handmade/function-strings-int16-0xec87fbcca08749b0.rst b/doc/source/stdlib/handmade/function-strings-int16-0xec87fbcca08749b0.rst
new file mode 100644
index 0000000000..740da022f0
--- /dev/null
+++ b/doc/source/stdlib/handmade/function-strings-int16-0xec87fbcca08749b0.rst
@@ -0,0 +1 @@
+Converts a string to an int16, panicking on failure; an overload accepts `result`, `offset`, and optional `hex` flag to report the ConversionResult status and parsed position instead of panicking.
diff --git a/doc/source/stdlib/handmade/function-strings-int64-0xafbd82ae816c4186.rst b/doc/source/stdlib/handmade/function-strings-int64-0xafbd82ae816c4186.rst
new file mode 100644
index 0000000000..3e4edeb360
--- /dev/null
+++ b/doc/source/stdlib/handmade/function-strings-int64-0xafbd82ae816c4186.rst
@@ -0,0 +1 @@
+Converts a string to an int64, panicking on failure; an overload accepts `result`, `offset`, and optional `hex` flag to report the ConversionResult status and parsed position instead of panicking.
diff --git a/doc/source/stdlib/handmade/function-strings-length-0xd94a56e7054949c1.rst b/doc/source/stdlib/handmade/function-strings-length-0xd94a56e7054949c1.rst
new file mode 100644
index 0000000000..7f31c4f24b
--- /dev/null
+++ b/doc/source/stdlib/handmade/function-strings-length-0xd94a56e7054949c1.rst
@@ -0,0 +1 @@
+Returns the length of the string or das_string in characters as an int.
diff --git a/doc/source/stdlib/handmade/function-strings-rtrim-0xd58c228a964a0a26.rst b/doc/source/stdlib/handmade/function-strings-rtrim-0xd58c228a964a0a26.rst
new file mode 100644
index 0000000000..f9104d68d2
--- /dev/null
+++ b/doc/source/stdlib/handmade/function-strings-rtrim-0xd58c228a964a0a26.rst
@@ -0,0 +1 @@
+Returns a new string with trailing whitespace removed from `str`, or with trailing characters from the specified `chars` set removed.
diff --git a/doc/source/stdlib/handmade/function-strings-slice-0xf6743c06a6fbc6f7.rst b/doc/source/stdlib/handmade/function-strings-slice-0xf6743c06a6fbc6f7.rst
new file mode 100644
index 0000000000..db2bd408f5
--- /dev/null
+++ b/doc/source/stdlib/handmade/function-strings-slice-0xf6743c06a6fbc6f7.rst
@@ -0,0 +1 @@
+Returns a substring of `str` from index `start` to optional `end` (exclusive), where negative indices count from the end of the string.
diff --git a/doc/source/stdlib/handmade/function-strings-starts_with-0x238de2f7d3703acf.rst b/doc/source/stdlib/handmade/function-strings-starts_with-0x238de2f7d3703acf.rst
new file mode 100644
index 0000000000..3f883d6a3e
--- /dev/null
+++ b/doc/source/stdlib/handmade/function-strings-starts_with-0x238de2f7d3703acf.rst
@@ -0,0 +1 @@
+Returns true if the beginning of string `str` matches the string `cmp`, with optional `offset` and `cmpLen` parameters to control the comparison start position and length.
diff --git a/doc/source/stdlib/handmade/function-strings-uint16-0xce3398a37e6e9632.rst b/doc/source/stdlib/handmade/function-strings-uint16-0xce3398a37e6e9632.rst
new file mode 100644
index 0000000000..a2c95f084f
--- /dev/null
+++ b/doc/source/stdlib/handmade/function-strings-uint16-0xce3398a37e6e9632.rst
@@ -0,0 +1 @@
+Converts a string to a uint16, panicking on failure; an overload accepts `result`, `offset`, and optional `hex` flag to report the ConversionResult status and parsed position instead of panicking.
diff --git a/doc/source/stdlib/handmade/function-strings-uint8-0x5311e124c6a0137c.rst b/doc/source/stdlib/handmade/function-strings-uint8-0x5311e124c6a0137c.rst
new file mode 100644
index 0000000000..8d3286670b
--- /dev/null
+++ b/doc/source/stdlib/handmade/function-strings-uint8-0x5311e124c6a0137c.rst
@@ -0,0 +1 @@
+Converts a string to a uint8, panicking on failure; an overload accepts `result`, `offset`, and optional `hex` flag to report the ConversionResult status and parsed position instead of panicking.
diff --git a/doc/source/stdlib/handmade/function-strings_boost-eq-0x5182c011f809ce55.rst b/doc/source/stdlib/handmade/function-strings_boost-eq-0x5182c011f809ce55.rst
new file mode 100644
index 0000000000..6a1db2f9c7
--- /dev/null
+++ b/doc/source/stdlib/handmade/function-strings_boost-eq-0x5182c011f809ce55.rst
@@ -0,0 +1 @@
+Compares a ``string`` with a ``das_string`` for equality, returning ``true`` if they match.
diff --git a/doc/source/stdlib/handmade/function-strings_boost-jaccard-0xc75af48aec1e56e9.rst b/doc/source/stdlib/handmade/function-strings_boost-jaccard-0xc75af48aec1e56e9.rst
new file mode 100644
index 0000000000..00555454f1
--- /dev/null
+++ b/doc/source/stdlib/handmade/function-strings_boost-jaccard-0xc75af48aec1e56e9.rst
@@ -0,0 +1 @@
+Jaccard similarity over two string-sets, returning ``|intersection| / |union|`` in 0..1. Empty either side returns 0. Pass two ``table<string>`` (the set form) for O(1) intersect lookup, or two ``array<string>`` and the array overload will build the sets internally.
diff --git a/doc/source/stdlib/handmade/function-strings_boost-last_index_of-0x407138f638406519.rst b/doc/source/stdlib/handmade/function-strings_boost-last_index_of-0x407138f638406519.rst
new file mode 100644
index 0000000000..05fbba8f0b
--- /dev/null
+++ b/doc/source/stdlib/handmade/function-strings_boost-last_index_of-0x407138f638406519.rst
@@ -0,0 +1 @@
+Returns the index of the last occurrence of `sub` in `str` searching only up to position `start` (exclusive), or -1 if not found.
diff --git a/doc/source/stdlib/handmade/function-uriparser-Uri-0xa58553083a9533db.rst b/doc/source/stdlib/handmade/function-uriparser-Uri-0xa58553083a9533db.rst
new file mode 100644
index 0000000000..f9c1c67d01
--- /dev/null
+++ b/doc/source/stdlib/handmade/function-uriparser-Uri-0xa58553083a9533db.rst
@@ -0,0 +1 @@
+Constructs a new ``Uri`` object by parsing the given URI string.
diff --git a/doc/source/stdlib/handmade/function-uriparser-string-0x7eae5205b0554c8e.rst b/doc/source/stdlib/handmade/function-uriparser-string-0x7eae5205b0554c8e.rst
new file mode 100644
index 0000000000..ca88fce4b9
--- /dev/null
+++ b/doc/source/stdlib/handmade/function-uriparser-string-0x7eae5205b0554c8e.rst
@@ -0,0 +1 @@
+Converts a ``Uri`` object to its string representation.
diff --git a/doc/source/stdlib/handmade/function-uriparser-using-0xa36621ff5574db11.rst b/doc/source/stdlib/handmade/function-uriparser-using-0xa36621ff5574db11.rst
new file mode 100644
index 0000000000..dfef2fd5a7
--- /dev/null
+++ b/doc/source/stdlib/handmade/function-uriparser-using-0xa36621ff5574db11.rst
@@ -0,0 +1 @@
+Creates a scoped ``Uri`` variable that is automatically finalized at end of block.
diff --git a/examples/sort/CMakeLists.txt b/examples/sort/CMakeLists.txt
new file mode 100644
index 0000000000..37510952f6
--- /dev/null
+++ b/examples/sort/CMakeLists.txt
@@ -0,0 +1,52 @@
+cmake_minimum_required(VERSION 3.16)
+project(example_sort_bench CXX)
+
+# Standalone — does not link against the daslang runtime, only includes
+# src/builtin/das_qsort_r.h (header-only, no external deps). Build it on
+# its own: `cmake -S examples/sort -B build/example_sort_bench -DCMAKE_BUILD_TYPE=Release`
+# then `cmake --build build/example_sort_bench -j`.
+
+set(CMAKE_CXX_STANDARD 17)
+set(CMAKE_CXX_STANDARD_REQUIRED ON)
+set(CMAKE_CXX_EXTENSIONS OFF)
+
+if(NOT CMAKE_BUILD_TYPE)
+    set(CMAKE_BUILD_TYPE Release CACHE STRING "" FORCE)
+endif()
+
+add_executable(example_sort_bench bench_sort_family.cpp)
+target_include_directories(example_sort_bench PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/../../include)
+
+add_executable(example_byte_swap_bench bench_byte_swap.cpp)
+
+if(MSVC)
+    target_compile_options(example_sort_bench PRIVATE /O2 /W4)
+    target_compile_options(example_byte_swap_bench PRIVATE /O2 /W4)
+else()
+    target_compile_options(example_sort_bench PRIVATE -O3 -Wall -Wextra)
+    target_compile_options(example_byte_swap_bench PRIVATE -O3 -Wall -Wextra)
+endif()
+
+# Optional libstdc++ build of the sort bench, gated on availability of a
+# system GCC. On macOS the system clang uses libc++; libstdc++ ships only
+# with Homebrew GCC. This produces a second binary so we can see how our
+# das_qsort_r.h compares against libstdc++'s Musser introsort in addition
+# to libc++'s pdqsort+block-quicksort.
+find_program(EXAMPLE_SORT_BENCH_GXX
+    NAMES g++-15 g++-14 g++-13 g++-12 g++-11
+    DOC "GCC C++ compiler for the libstdc++ variant of the sort bench")
+if(EXAMPLE_SORT_BENCH_GXX)
+    set(_libstdcxx_out ${CMAKE_CURRENT_BINARY_DIR}/example_sort_bench_libstdcxx)
+    add_custom_command(
+        OUTPUT ${_libstdcxx_out}
+        COMMAND ${EXAMPLE_SORT_BENCH_GXX}
+                -std=c++17 -O3 -Wall -Wextra
+                -I ${CMAKE_CURRENT_SOURCE_DIR}/../../include
+                ${CMAKE_CURRENT_SOURCE_DIR}/bench_sort_family.cpp
+                -o ${_libstdcxx_out}
+        DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/bench_sort_family.cpp
+                ${CMAKE_CURRENT_SOURCE_DIR}/../../include/daScript/simulate/das_qsort_r.h
+        COMMENT "Building example_sort_bench_libstdcxx with ${EXAMPLE_SORT_BENCH_GXX}"
+        VERBATIM)
+    add_custom_target(example_sort_bench_libstdcxx ALL DEPENDS ${_libstdcxx_out})
+endif()
diff --git a/examples/sort/bench_byte_swap.cpp b/examples/sort/bench_byte_swap.cpp
new file mode 100644
index 0000000000..dd5edc1402
--- /dev/null
+++ b/examples/sort/bench_byte_swap.cpp
@@ -0,0 +1,181 @@
+// Bake-off: byte_swap variants at widths used by the sort family.
+// Candidate that wins across {4..256} replaces das::byte_swap in das_qsort_r.h.
+//
+// Build:  cmake -S examples/sort -B build/example_sort_bench -DCMAKE_BUILD_TYPE=Release
+//         cmake --build build/example_sort_bench -j
+// Run:    ./build/example_sort_bench/example_byte_swap_bench
+
+#include <chrono>
+#include <cstddef>
+#include <cstdint>
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include <random>
+#include <vector>
+
+// ===== Candidate implementations =====
+
+// chunked-256 — current production impl in src/builtin/das_qsort_r.h.
+static inline void bs_chunked256(void *pa, void *pb, size_t width)
+{
+    unsigned char tmp[256];
+    unsigned char *a = (unsigned char *)pa;
+    unsigned char *b = (unsigned char *)pb;
+    while (width) {
+        size_t chunk = sizeof(tmp) < width ? sizeof(tmp) : width;
+        memcpy(tmp, a, chunk);
+        memcpy(a, b, chunk);
+        memcpy(b, tmp, chunk);
+        a += chunk; b += chunk; width -= chunk;
+    }
+}
+
+// chunked-64 — same algorithm, smaller stack buffer. Control to isolate
+// whether the 256B stack write is the cost driver.
+static inline void bs_chunked64(void *pa, void *pb, size_t width)
+{
+    unsigned char tmp[64];
+    unsigned char *a = (unsigned char *)pa;
+    unsigned char *b = (unsigned char *)pb;
+    while (width) {
+        size_t chunk = sizeof(tmp) < width ? sizeof(tmp) : width;
+        memcpy(tmp, a, chunk);
+        memcpy(a, b, chunk);
+        memcpy(b, tmp, chunk);
+        a += chunk; b += chunk; width -= chunk;
+    }
+}
+
+// words64 — Linux-kernel-style word swap, no buffer. Caller must ensure
+// width is a multiple of 8 and pointers are 8-aligned.
+static inline void bs_words64(void *pa, void *pb, size_t width)
+{
+    unsigned char *a = (unsigned char *)pa;
+    unsigned char *b = (unsigned char *)pb;
+    do {
+        width -= 8;
+        uint64_t t;
+        memcpy(&t, a + width, 8);
+        memcpy(a + width, b + width, 8);
+        memcpy(b + width, &t, 8);
+    } while (width);
+}
+
+// sized — switch on width, sized memcpy that the compiler lowers
+// to a single SIMD load/store pair. Falls back to chunked-256 for unknown widths.
+static inline void bs_sized(void *pa, void *pb, size_t width)
+{
+    unsigned char *a = (unsigned char *)pa;
+    unsigned char *b = (unsigned char *)pb;
+    switch (width) {
+        case 4:  { uint32_t t; memcpy(&t, a, 4); memcpy(a, b, 4); memcpy(b, &t, 4); return; }
+        case 8:  { uint64_t t; memcpy(&t, a, 8); memcpy(a, b, 8); memcpy(b, &t, 8); return; }
+        case 16: { unsigned char t[16]; memcpy(t, a, 16); memcpy(a, b, 16); memcpy(b, t, 16); return; }
+        case 32: { unsigned char t[32]; memcpy(t, a, 32); memcpy(a, b, 32); memcpy(b, t, 32); return; }
+        case 64: { unsigned char t[64]; memcpy(t, a, 64); memcpy(a, b, 64); memcpy(b, t, 64); return; }
+    }
+    bs_chunked256(pa, pb, width);
+}
+
+// hybrid — best-of: sized inline for w ∈ {4,8,16,32,64}, word-swap loop for
+// aligned-multiple-of-8, chunked-256 as ultimate fallback.
+static inline void bs_hybrid(void *pa, void *pb, size_t width)
+{
+    unsigned char *a = (unsigned char *)pa;
+    unsigned char *b = (unsigned char *)pb;
+    switch (width) {
+        case 4:  { uint32_t t; memcpy(&t, a, 4); memcpy(a, b, 4); memcpy(b, &t, 4); return; }
+        case 8:  { uint64_t t; memcpy(&t, a, 8); memcpy(a, b, 8); memcpy(b, &t, 8); return; }
+        case 16: { unsigned char t[16]; memcpy(t, a, 16); memcpy(a, b, 16); memcpy(b, t, 16); return; }
+        case 32: { unsigned char t[32]; memcpy(t, a, 32); memcpy(a, b, 32); memcpy(b, t, 32); return; }
+        case 64: { unsigned char t[64]; memcpy(t, a, 64); memcpy(a, b, 64); memcpy(b, t, 64); return; }
+    }
+    if ((((uintptr_t)a | (uintptr_t)b | width) & 7u) == 0) {
+        bs_words64(pa, pb, width);
+        return;
+    }
+    bs_chunked256(pa, pb, width);
+}
+
+// ===== Harness =====
+
+namespace {
+
+// Correctness check: swap two distinct buffers, check that contents exchanged.
+template <typename Fn>
+void verify(const char * name, Fn fn, size_t width) {
+    std::vector<unsigned char> a(width), b(width), a0(width), b0(width);
+    std::mt19937 rng(0xBADBEEFu);
+    for (size_t i = 0; i < width; i++) { a[i] = a0[i] = (unsigned char)(rng() & 0xFF); b[i] = b0[i] = (unsigned char)(rng() & 0xFF); }
+    fn(a.data(), b.data(), width);
+    if (memcmp(a.data(), b0.data(), width) != 0 || memcmp(b.data(), a0.data(), width) != 0) {
+        std::fprintf(stderr, "FAIL: '%s' produced wrong swap at width=%zu\n", name, width);
+        std::abort();
+    }
+}
+
+template <typename Fn>
+double time_op(Fn fn, size_t width, size_t pairs, int iters)
+{
+    // Big arena of `pairs` × 2 elements; swap each pair in turn. Iterate
+    // many times to amortize timing overhead.
+    std::vector<unsigned char> data(pairs * 2 * width);
+    std::mt19937 rng(0xC0FFEE);
+    for (size_t i = 0; i < data.size(); i++) data[i] = (unsigned char)(rng() & 0xFF);
+    using clk = std::chrono::steady_clock;
+    auto t0 = clk::now();
+    for (int it = 0; it < iters; it++) {
+        for (size_t p = 0; p < pairs; p++) {
+            fn(data.data() + (2*p) * width, data.data() + (2*p + 1) * width, width);
+        }
+    }
+    auto t1 = clk::now();
+    // Anti-DCE: force a volatile observation of post-loop data so the optimizer
+    // can't prove the swap loop is dead. (The prior `data.back() == 0xDEAD`
+    // guard was always false — `data.back()` is unsigned char (0–255), can
+    // never equal 0xDEAD — so the optimizer could fold and eliminate the loop.)
+    volatile unsigned char observed = data.back();
+    (void)observed;
+    double ns = double(std::chrono::duration_cast<std::chrono::nanoseconds>(t1 - t0).count());
+    return ns / double(iters) / double(pairs);
+}
+
+struct Row { const char * name; double ns; };
+
+void run_width(size_t width)
+{
+    const size_t pairs = 1024;   // L1-friendly working set: pairs * 2 * width
+    const int iters = 5000;
+
+    verify("chunked256", bs_chunked256, width);
+    verify("chunked64",  bs_chunked64,  width);
+    if (width >= 8 && (width % 8) == 0) verify("words64", bs_words64, width);
+    verify("sized",      bs_sized,      width);
+    verify("hybrid",     bs_hybrid,     width);
+
+    double r_chunked256 = time_op(bs_chunked256, width, pairs, iters);
+    double r_chunked64  = time_op(bs_chunked64,  width, pairs, iters);
+    double r_words64    = (width >= 8 && (width % 8) == 0) ? time_op(bs_words64, width, pairs, iters) : -1.0;
+    double r_sized      = time_op(bs_sized,      width, pairs, iters);
+    double r_hybrid     = time_op(bs_hybrid,     width, pairs, iters);
+
+    std::printf("| %4zu | %10.2f | %10.2f | ", width, r_chunked256, r_chunked64);
+    if (r_words64 < 0.0) std::printf("        n/a | ");
+    else                 std::printf("%10.2f | ", r_words64);
+    std::printf("%10.2f | %10.2f |\n", r_sized, r_hybrid);
+}
+
+} // anonymous
+
+int main()
+{
+    std::printf("# byte_swap candidate bake-off (ns per swap, lower=better)\n\n");
+    std::printf("Working set: 1024 pairs × 2 × W bytes. 5000 iterations. Per-pair time printed.\n\n");
+    std::printf("| W    | chunked256 |  chunked64 |    words64 |      sized |     hybrid |\n");
+    std::printf("|------|-----------:|-----------:|-----------:|-----------:|-----------:|\n");
+    for (size_t w : { (size_t)4, (size_t)8, (size_t)16, (size_t)32, (size_t)64, (size_t)128, (size_t)256, (size_t)512 }) {
+        run_width(w);
+    }
+    return 0;
+}
diff --git a/examples/sort/bench_sort_family.cpp b/examples/sort/bench_sort_family.cpp
new file mode 100644
index 0000000000..cbc247a3f3
--- /dev/null
+++ b/examples/sort/bench_sort_family.cpp
@@ -0,0 +1,284 @@
+// Performance comparison: std::* vs the winning das_*_r implementations
+// in src/builtin/das_qsort_r.h. After Phase 0.1 algorithm bake-off (replaced
+// smoothsort with pdq, Lomuto introselect with Hoare introselect, swap-based
+// heap with hole-sliding, and nth-then-sort partial_sort with heap-of-N).
+//
+// Pure C++ — no daslang runtime. Just include the header.
+// Build:  cmake -S examples/sort -B build/example_sort_bench -DCMAKE_BUILD_TYPE=Release
+//         cmake --build build/example_sort_bench -j
+// Run:    ./build/example_sort_bench/example_sort_bench
+
+#include "daScript/simulate/das_qsort_r.h"
+
+#include <algorithm>
+#include <cassert>
+#include <chrono>
+#include <cstdint>
+#include <cstdio>
+#include <cstdlib>
+#include <random>
+#include <vector>
+
+using namespace das;
+
+namespace {
+
+template <int W> struct Payload {
+    int32_t key;
+    char pad[W - sizeof(int32_t)];
+    bool operator<(const Payload & o) const { return key < o.key; }
+};
+using P32 = Payload<32>;
+using P128 = Payload<128>;
+
+template <typename T> T make_value(int32_t k) { T v{}; v.key = k; return v; }
+template <> int32_t make_value<int32_t>(int32_t k) { return k; }
+template <> int64_t make_value<int64_t>(int32_t k) { return int64_t(k); }
+
+template <typename T> int32_t key_of(const T & v) { return v.key; }
+template <> int32_t key_of(const int32_t & v) { return v; }
+template <> int32_t key_of(const int64_t & v) { return int32_t(v); }
+
+template <typename T> std::vector<T> make_input(size_t n, uint32_t seed) {
+    std::mt19937 rng(seed);
+    std::vector<T> a; a.reserve(n);
+    for (size_t i = 0; i < n; i++) a.push_back(make_value<T>(int32_t(rng() & 0xFFFFFF)));
+    return a;
+}
+
+template <typename T> auto das_cmp() {
+    return [](const void * a, const void * b) {
+        return key_of(*static_cast<const T *>(a)) < key_of(*static_cast<const T *>(b));
+    };
+}
+
+// C-qsort-style comparator (returns int): used to bench the libc qsort entry
+// point. Captureless so it converts to a function pointer.
+template <typename T>
+static int c_qsort_cmp(const void * a, const void * b)
+{
+    int32_t ka = key_of(*static_cast<const T *>(a));
+    int32_t kb = key_of(*static_cast<const T *>(b));
+    return (ka > kb) - (ka < kb);
+}
+
+// Typed bool comparator — peer to std::less<T>{} for das_sort<T> and std::sort.
+template <typename T>
+static bool typed_less(const T & a, const T & b) { return key_of(a) < key_of(b); }
+
+// Verification predicates — every operation's output is checked once before
+// timing; bench aborts loudly on incorrect output.
+
+template <typename T> bool is_sorted_range(const std::vector<T> & a) {
+    for (size_t i = 1; i < a.size(); i++) if (key_of(a[i]) < key_of(a[i-1])) return false;
+    return true;
+}
+
+template <typename T> bool is_partial_sorted(const std::vector<T> & a, size_t n) {
+    if (n > a.size()) n = a.size();
+    for (size_t i = 1; i < n; i++) if (key_of(a[i]) < key_of(a[i-1])) return false;
+    if (n == 0 || n == a.size()) return true;
+    int32_t kth = key_of(a[n-1]);
+    for (size_t i = n; i < a.size(); i++) if (key_of(a[i]) < kth) return false;
+    return true;
+}
+
+template <typename T> bool is_nth_correct(const std::vector<T> & a, size_t k) {
+    if (k >= a.size()) return true;
+    int32_t kth = key_of(a[k]);
+    for (size_t i = 0; i < k; i++) if (key_of(a[i]) > kth) return false;
+    for (size_t i = k+1; i < a.size(); i++) if (key_of(a[i]) < kth) return false;
+    return true;
+}
+
+template <typename T> bool is_max_heap(const std::vector<T> & a) {
+    size_t n = a.size();
+    for (size_t i = 0; i < n; i++) {
+        size_t l = 2*i + 1, r = 2*i + 2;
+        if (l < n && key_of(a[i]) < key_of(a[l])) return false;
+        if (r < n && key_of(a[i]) < key_of(a[r])) return false;
+    }
+    return true;
+}
+
+template <typename T, typename Op>
+double time_op(const std::vector<T> & seed_data, int iters, Op op) {
+    using clk = std::chrono::steady_clock;
+    auto t0 = clk::now();
+    for (int i = 0; i < iters; i++) {
+        std::vector<T> a = seed_data;
+        op(a);
+    }
+    auto t1 = clk::now();
+    double ns = double(std::chrono::duration_cast<std::chrono::nanoseconds>(t1 - t0).count());
+    return ns / double(iters);
+}
+
+template <typename T, typename Op, typename Verify>
+double bench_one(const char * name, const std::vector<T> & seed_data, int iters, Op op, Verify verify) {
+    std::vector<T> check = seed_data;
+    op(check);
+    if (!verify(check)) {
+        std::fprintf(stderr, "FAIL: '%s' produced wrong output (N=%zu, sizeof=%zu)\n",
+                     name, seed_data.size(), sizeof(T));
+        std::abort();
+    }
+    return time_op(seed_data, iters, op);
+}
+
+int pick_iters(size_t n) {
+    if (n <= 1000)    return 10000;
+    if (n <= 10000)   return 1000;
+    return 200;  // n <= 100000 — higher iter count for stable ratios
+}
+
+template <typename T>
+void run_for_type(const char * type_name) {
+    const size_t sizes[] = { 1000, 10000, 100000 };
+    const size_t topn = 10;
+    std::printf("\n## %s (sizeof=%zu)\n\n", type_name, sizeof(T));
+    std::printf("| N      | op                  |   std (ns) |   das (ns) | ratio das/std |\n");
+    std::printf("|--------|---------------------|-----------:|-----------:|--------------:|\n");
+    for (size_t n : sizes) {
+        auto data = make_input<T>(n, 0xC0FFEEu);
+        int iters = pick_iters(n);
+
+        double s_sort = bench_one("std::sort", data, iters,
+            [](std::vector<T> & a) { std::sort(a.begin(), a.end()); },
+            is_sorted_range<T>);
+        double d_sort = bench_one("das_qsort_r", data, iters,
+            [](std::vector<T> & a) { das_qsort_r(a.data(), a.size(), sizeof(T), das_cmp<T>()); },
+            is_sorted_range<T>);
+
+        auto verify_partial = [](const std::vector<T> & a) { return is_partial_sorted(a, topn); };
+        double s_partial = bench_one("std::partial_sort", data, iters,
+            [](std::vector<T> & a) { std::partial_sort(a.begin(), a.begin() + topn, a.end()); },
+            verify_partial);
+        double d_partial = bench_one("das_partial_sort_r", data, iters,
+            [](std::vector<T> & a) { das_partial_sort_r(a.data(), a.size(), topn, sizeof(T), das_cmp<T>()); },
+            verify_partial);
+
+        size_t k = n / 2;
+        auto verify_nth = [k](const std::vector<T> & a) { return is_nth_correct(a, k); };
+        double s_nth = bench_one("std::nth_element", data, iters,
+            [k](std::vector<T> & a) { std::nth_element(a.begin(), a.begin() + k, a.end()); },
+            verify_nth);
+        double d_nth = bench_one("das_nth_element_r", data, iters,
+            [k](std::vector<T> & a) { das_nth_element_r(a.data(), a.size(), k, sizeof(T), das_cmp<T>()); },
+            verify_nth);
+
+        double s_heap = bench_one("std::make_heap", data, iters,
+            [](std::vector<T> & a) { std::make_heap(a.begin(), a.end()); },
+            is_max_heap<T>);
+        double d_heap = bench_one("das_make_heap_r", data, iters,
+            [](std::vector<T> & a) { das_make_heap_r(a.data(), a.size(), sizeof(T), das_cmp<T>()); },
+            is_max_heap<T>);
+
+        double s_hsort = bench_one("std heap_sort", data, iters,
+            [](std::vector<T> & a) {
+                std::make_heap(a.begin(), a.end());
+                for (auto it = a.end(); it != a.begin(); --it) std::pop_heap(a.begin(), it);
+            },
+            is_sorted_range<T>);
+        double d_hsort = bench_one("das heap_sort", data, iters,
+            [](std::vector<T> & a) {
+                das_make_heap_r(a.data(), a.size(), sizeof(T), das_cmp<T>());
+                for (size_t len = a.size(); len > 0; --len) {
+                    das_pop_heap_r(a.data(), len, sizeof(T), das_cmp<T>());
+                }
+            },
+            is_sorted_range<T>);
+
+        auto row = [&](const char * op, double s, double d) {
+            std::printf("| %-6zu | %-19s | %10.0f | %10.0f | %13.2f |\n", n, op, s, d, d / s);
+        };
+        row("sort",         s_sort,    d_sort);
+        row("partial_sort",  s_partial, d_partial);
+        row("nth_element",   s_nth,     d_nth);
+        row("make_heap",     s_heap,    d_heap);
+        row("heap_sort",     s_hsort,   d_hsort);
+    }
+}
+
+// Sort-only deep dive: four entry points on the same input.
+//   std::sort       — typed iterator, typed cmp, fully inlined (apples-to-apples ceiling)
+//   C qsort         — byte-pointer + runtime width + int-cmp via function pointer (libc baseline)
+//   das_qsort_r     — byte-pointer + runtime width + bool-cmp (our daslang binding-path impl)
+//   das_sort<T>     — typed iterator + typed cmp (our typed pdqsort-lite, apples-to-apples vs std::sort)
+//
+// The four-way table isolates two cost dimensions:
+//   - typed vs byte-pointer  (das_sort<T> vs das_qsort_r — same algorithm, different access)
+//   - our algorithm vs libstdc++'s (das_sort<T> vs std::sort — same access, different algorithm)
+template <typename T>
+void run_sort_deep_dive(const char * type_name)
+{
+    const size_t sizes[] = { 1000, 10000, 100000 };
+    std::printf("\n## sort deep-dive — %s (sizeof=%zu)\n\n", type_name, sizeof(T));
+    std::printf("Columns: std::sort=stdlib, C=libc qsort, dr=das_qsort_r (hybrid byte), db=das_qsort_block_r\n");
+    std::printf("(pure block byte), dt=das_sort<T> (hybrid typed), dtb=das_sort_block<T> (pure block typed)\n\n");
+    std::printf("| N      |  std::sort |   C qsort  |    dr_byte |  db_byte_b |   dt_typed | dtb_typed_b| C/std  | dr/std | db/std | dt/std | dtb/std|\n");
+    std::printf("|--------|-----------:|-----------:|-----------:|-----------:|-----------:|-----------:|-------:|-------:|-------:|-------:|-------:|\n");
+    for (size_t n : sizes) {
+        auto data = make_input<T>(n, 0xC0FFEEu);
+        int iters = pick_iters(n);
+
+        double s_sort = bench_one("std::sort", data, iters,
+            [](std::vector<T> & a) { std::sort(a.begin(), a.end()); },
+            is_sorted_range<T>);
+        double c_sort = bench_one("C qsort", data, iters,
+            [](std::vector<T> & a) { std::qsort(a.data(), a.size(), sizeof(T), c_qsort_cmp<T>); },
+            is_sorted_range<T>);
+        double dr_sort = bench_one("das_qsort_r", data, iters,
+            [](std::vector<T> & a) { das_qsort_r(a.data(), a.size(), sizeof(T), das_cmp<T>()); },
+            is_sorted_range<T>);
+        double db_sort = bench_one("das_qsort_block_r", data, iters,
+            [](std::vector<T> & a) { das_qsort_block_r(a.data(), a.size(), sizeof(T), das_cmp<T>()); },
+            is_sorted_range<T>);
+        double dt_sort = bench_one("das_sort<T>", data, iters,
+            [](std::vector<T> & a) { das_sort(a.data(), a.data() + a.size(), typed_less<T>); },
+            is_sorted_range<T>);
+        double dtb_sort = bench_one("das_sort_block<T>", data, iters,
+            [](std::vector<T> & a) { das_sort_block(a.data(), a.data() + a.size(), typed_less<T>); },
+            is_sorted_range<T>);
+
+        std::printf("| %-6zu | %10.0f | %10.0f | %10.0f | %10.0f | %10.0f | %10.0f | %6.2f | %6.2f | %6.2f | %6.2f | %6.2f |\n",
+            n, s_sort, c_sort, dr_sort, db_sort, dt_sort, dtb_sort,
+            c_sort / s_sort, dr_sort / s_sort, db_sort / s_sort, dt_sort / s_sort, dtb_sort / s_sort);
+    }
+}
+
+} // anonymous
+
+int main() {
+    std::printf("# das_qsort_r.h vs std::* — final perf comparison (Phase 0.1)\n\n");
+#if defined(_LIBCPP_VERSION)
+    std::printf("C++ stdlib: **libc++ %d** (pdqsort + block-quicksort partition)\n", _LIBCPP_VERSION);
+#elif defined(__GLIBCXX__)
+    std::printf("C++ stdlib: **libstdc++ %d** (Musser introsort)\n", __GLIBCXX__);
+#else
+    std::printf("C++ stdlib: unknown\n");
+#endif
+    std::printf("Compiler: ");
+#if defined(__clang__)
+    std::printf("clang %d.%d.%d\n", __clang_major__, __clang_minor__, __clang_patchlevel__);
+#elif defined(__GNUC__)
+    std::printf("gcc %d.%d.%d\n", __GNUC__, __GNUC_MINOR__, __GNUC_PATCHLEVEL__);
+#else
+    std::printf("unknown\n");
+#endif
+    std::printf("\nEach implementation's output is verified for correctness before timing —\n");
+    std::printf("the harness aborts loudly on a bad result.\n");
+    std::printf("partial_sort uses top-N=10; nth_element uses k=N/2; heap_sort builds then drains via pop_heap.\n");
+    run_for_type<int32_t>("int32_t");
+    run_for_type<int64_t>("int64_t");
+    run_for_type<P32>("Payload<32>");
+    run_for_type<P128>("Payload<128>");
+
+    std::printf("\n# Sort deep-dive — four entry points on the same input\n");
+    std::printf("\nIsolates byte-pointer-vs-typed cost and our-algorithm-vs-libstdc++'s.\n");
+    run_sort_deep_dive<int32_t>("int32_t");
+    run_sort_deep_dive<int64_t>("int64_t");
+    run_sort_deep_dive<P32>("Payload<32>");
+    run_sort_deep_dive<P128>("Payload<128>");
+    return 0;
+}
diff --git a/include/daScript/simulate/aot.h b/include/daScript/simulate/aot.h
index f5f05752b7..09b8bb8f5e 100644
--- a/include/daScript/simulate/aot.h
+++ b/include/daScript/simulate/aot.h
@@ -9,6 +9,7 @@
 #include "daScript/simulate/aot_builtin.h"
 #include "daScript/simulate/aot_builtin_matrix.h"
 #include "daScript/simulate/aot_builtin_time.h"
+#include "daScript/simulate/das_qsort_r.h"
 #include "daScript/simulate/runtime_iterator.h"
 #include "daScript/simulate/runtime_table.h"
 #include "daScript/simulate/interop.h"
@@ -3241,7 +3242,7 @@ namespace das {
     struct scblk {
         template <int dimSize>
         static __forceinline void srt ( TDim<TT,dimSize> & arr, int32_t, int32_t, CompareFn && cmp, Context *, LineInfoArg * ) {
-            sort(arr.data, arr.data + dimSize, cmp);
+            das_sort(arr.data, arr.data + dimSize, cmp);
         }
         template <int dimSize>
         static __forceinline void srtr ( TDim<TT,dimSize> & arr, int32_t elemSize, int32_t length, CompareFn && cmp, Context * context, LineInfoArg * lineinfo ) {
@@ -3256,7 +3257,7 @@ namespace das {
             vec4f bargs[2];
             auto data = (TT *) arr.data;
             context->invokeEx(cmp, bargs, nullptr, [&](SimNode * code) {
-                sort ( data, data+length, [&](TT x, TT y) -> bool {
+                das_sort ( data, data+length, [&](TT x, TT y) -> bool {
                     bargs[0] = cast<TT>::from(x);
                     bargs[1] = cast<TT>::from(y);
                     return code->evalBool(*context);
@@ -3268,7 +3269,7 @@ namespace das {
             vec4f bargs[2];
             auto data = (TT *) arr.data;
             context->invokeEx(cmp, bargs, nullptr, [&](SimNode * code) {
-                sort ( data, data+length, [&](const TT & x, const TT & y) -> bool {
+                das_sort ( data, data+length, [&](const TT & x, const TT & y) -> bool {
                     bargs[0] = cast<const TT &>::from(x);
                     bargs[1] = cast<const TT &>::from(y);
                     return code->evalBool(*context);
@@ -3292,13 +3293,13 @@ namespace das {
         auto data = cast<TT *>::to(arr);
         if ( cmp.jitFunction ) {
             using CmpFn = CallJitFn<bool, TT, TT, const Block &, Context*>;
-            sort ( data, data+length, [&](TT x, TT y) -> bool {
+            das_sort ( data, data+length, [&](TT x, TT y) -> bool {
                 return CmpFn::static_call(cmp.jitFunction,x,y,cmp,context);
             });
         } else {
             vec4f bargs[2];
             context->invokeEx(cmp, bargs, nullptr, [&](SimNode * code) {
-                sort ( data, data+length, [&](TT x, TT y) -> bool {
+                das_sort ( data, data+length, [&](TT x, TT y) -> bool {
                     bargs[0] = cast<TT>::from(x);
                     bargs[1] = cast<TT>::from(y);
                     return code->evalBool(*context);
@@ -3316,7 +3317,7 @@ namespace das {
             if ( arr.size<=1 ) return;
             array_lock(*context, arr, at);
             auto sdata = (TT *) arr.data;
-            das::sort(sdata, sdata + arr.size, cmp);
+            das_sort (sdata, sdata + arr.size, cmp);
             array_unlock(*context, arr, at);
         }
         static __forceinline void srtr ( Array & arr, int32_t elemSize, int32_t length, CompareFn && cmp, Context * context, LineInfoArg * lineinfo ) {
@@ -3332,13 +3333,13 @@ namespace das {
             array_lock(*context, arr, at);
             if ( cmp.jitFunction ) {
                 using CmpFn = CallJitFn<bool,TT, TT,const Block &,Context *>;
-                das::sort ( data, data+arr.size, [&](TT x, TT y) -> bool {
+                das_sort ( data, data+arr.size, [&](TT x, TT y) -> bool {
                     return CmpFn::static_call(cmp.jitFunction, x,y,cmp,context);
                 });
             } else {
                 vec4f bargs[2];
                 context->invokeEx(cmp, bargs, nullptr, [&](SimNode * code) {
-                    das::sort ( data, data+arr.size, [&](TT x, TT y) -> bool {
+                    das_sort ( data, data+arr.size, [&](TT x, TT y) -> bool {
                         bargs[0] = cast<TT>::from(x);
                         bargs[1] = cast<TT>::from(y);
                         return code->evalBool(*context);
@@ -3353,13 +3354,13 @@ namespace das {
             array_lock(*context, arr, at);
             if ( cmp.jitFunction ) {
                 using CmpFn = CallJitFn<bool,const TT &,const TT &,const Block &,Context *>;
-                das::sort ( data, data+arr.size, [&](const TT & x,const TT & y) -> bool {
+                das_sort ( data, data+arr.size, [&](const TT & x,const TT & y) -> bool {
                     return CmpFn::static_call(cmp.jitFunction,x,y,cmp,context);
                 });
             } else {
                 vec4f bargs[2];
                 context->invokeEx(cmp, bargs, nullptr, [&](SimNode * code) {
-                    das::sort ( data, data+arr.size, [&](const TT & x, const TT & y) -> bool {
+                    das_sort ( data, data+arr.size, [&](const TT & x, const TT & y) -> bool {
                         bargs[0] = cast<const TT &>::from(x);
                         bargs[1] = cast<const TT &>::from(y);
                         return code->evalBool(*context);
diff --git a/include/daScript/simulate/das_qsort_r.h b/include/daScript/simulate/das_qsort_r.h
new file mode 100644
index 0000000000..ac1fa2f54c
--- /dev/null
+++ b/include/daScript/simulate/das_qsort_r.h
@@ -0,0 +1,790 @@
+#pragma once
+
+#include "daScript/misc/platform.h"
+
+#include <algorithm>
+#include <cstddef>
+#include <cstdint>
+#include <cstring>
+#include <utility>
+
+namespace das
+{
+
+// Header-only byte-pointer algorithms with inline comparator. std::sort,
+// std::nth_element, std::partial_sort, std::make_heap etc. template on the
+// iterator type and cannot operate on opaque void* data with runtime-known
+// element width. The daslang any-cblock binding path (user-defined struct
+// types going through __builtin_*_any_cblock in module_builtin_runtime_sort.cpp)
+// needs algorithms that take (void*, nel, [n,] width, Compare) — these
+// implementations fill that gap.
+//
+// Each algorithm was picked from a bake-off (examples/sort/bench_sort_family.cpp)
+// against std::* and against alternative implementations. Notes per op:
+//
+//   - das_qsort_r            — pdqsort-lite (Hoare partition, median-of-3,
+//                              insertion-sort tail at 24, heapsort fallback on
+//                              2*log2(N) depth blowup). Won over smoothsort
+//                              (Anton Yudintsev's musl port, prior implementation)
+//                              by 51-66% across element sizes, and over plain
+//                              introsort with Lomuto partition by 7-10%.
+//   - das_nth_element_r      — introselect with Hoare partition + median-of-3.
+//                              Beats std::nth_element by ~30% across all sizes
+//                              and types (vs prior Lomuto introselect which
+//                              merely matched std).
+//   - das_partial_sort_r     — heap-of-topn scan: build max-heap of first N,
+//                              replace+sift while scanning the rest, drain.
+//                              Matches std::partial_sort; 4-7% FASTER than std
+//                              on large struct elements; 17× faster than the
+//                              prior nth_element+sort strategy when topn << N.
+//   - das_sift_down_r /      — hole-sliding sift-down with memcpy. Saves parent
+//     das_make_heap_r /        value to a stack buffer, slides larger children
+//     das_pop_heap_r           up into the hole (1 memcpy per level vs 3 memcpys
+//                              per byte_swap), places value at final hole. Floyd
+//                              bottom-up for make_heap. 35% faster than the
+//                              prior swap-based variants; matches std::make_heap;
+//                              7% faster than std for >32-byte elements.
+//   - das_push_heap_r        — classic sift-up with byte_swap. No bake-off
+//                              candidate measurably faster.
+
+// byte_swap — sized dispatch for common element widths plus a chunked-memcpy
+// fallback. Constant-size memcpy at w ∈ {4,8,16,32,64,128,256} is inlined by
+// every supported compiler to a single SIMD load/store pair per direction;
+// 30-140× faster than the generic loop for workhorse-type widths (measured
+// at examples/sort/bench_byte_swap.cpp). Wider elements fall through to the
+// chunked path.
+static inline void byte_swap(void *pa, void *pb, size_t width)
+{
+    unsigned char *a = (unsigned char *)pa;
+    unsigned char *b = (unsigned char *)pb;
+    switch (width) {
+        case 4:   { uint32_t t; memcpy(&t, a, 4); memcpy(a, b, 4); memcpy(b, &t, 4); return; }
+        case 8:   { uint64_t t; memcpy(&t, a, 8); memcpy(a, b, 8); memcpy(b, &t, 8); return; }
+        case 16:  { unsigned char t[16];  memcpy(t, a, 16);  memcpy(a, b, 16);  memcpy(b, t, 16);  return; }
+        case 32:  { unsigned char t[32];  memcpy(t, a, 32);  memcpy(a, b, 32);  memcpy(b, t, 32);  return; }
+        case 64:  { unsigned char t[64];  memcpy(t, a, 64);  memcpy(a, b, 64);  memcpy(b, t, 64);  return; }
+        case 128: { unsigned char t[128]; memcpy(t, a, 128); memcpy(a, b, 128); memcpy(b, t, 128); return; }
+        case 256: { unsigned char t[256]; memcpy(t, a, 256); memcpy(a, b, 256); memcpy(b, t, 256); return; }
+    }
+    unsigned char tmp[256];
+    while (width) {
+        size_t chunk = sizeof(tmp) < width ? sizeof(tmp) : width;
+        memcpy(tmp, a, chunk);
+        memcpy(a, b, chunk);
+        memcpy(b, tmp, chunk);
+        a += chunk; b += chunk; width -= chunk;
+    }
+}
+
+// sized_memcpy — same sized dispatch idea for plain dst<-src copies. Used
+// by das_sift_down_r's hole-sliding inner loop so the per-level memcpy at
+// known struct widths becomes a single inlined SIMD load/store pair instead
+// of a runtime-width libc memcpy call. Same widths as byte_swap.
+static inline void sized_memcpy(void *dst, const void *src, size_t width)
+{
+    switch (width) {
+        case 4:   memcpy(dst, src, 4);   return;
+        case 8:   memcpy(dst, src, 8);   return;
+        case 16:  memcpy(dst, src, 16);  return;
+        case 32:  memcpy(dst, src, 32);  return;
+        case 64:  memcpy(dst, src, 64);  return;
+        case 128: memcpy(dst, src, 128); return;
+        case 256: memcpy(dst, src, 256); return;
+    }
+    memcpy(dst, src, width);
+}
+
+// Hole-sliding sift-down. Saves parent value to stack buffer, slides larger
+// children up into the hole one memcpy at a time, places parent value at the
+// final hole position. For elements wider than the stack buffer, falls back
+// to a classic swap-based sift (uncommon — most workhorse + user-struct types
+// fit). Used by make_heap (Floyd bottom-up), pop_heap, and the partial_sort
+// heap drain.
+template <typename Compare>
+static inline void das_sift_down_r(unsigned char *data, size_t parent, size_t nel, size_t width, Compare cmp)
+{
+    if (2 * parent + 1 >= nel) return;
+    unsigned char tmp[256];
+    if (width > sizeof(tmp)) {
+        // Swap-based fallback for very wide elements.
+        while (true) {
+            size_t left = 2 * parent + 1;
+            if (left >= nel) return;
+            size_t right = left + 1;
+            size_t largest = parent;
+            if (cmp(data + parent * width, data + left * width)) largest = left;
+            if (right < nel && cmp(data + largest * width, data + right * width)) largest = right;
+            if (largest == parent) return;
+            byte_swap(data + parent * width, data + largest * width, width);
+            parent = largest;
+        }
+    }
+    sized_memcpy(tmp, data + parent * width, width);
+    size_t hole = parent;
+    while (true) {
+        size_t left = 2 * hole + 1;
+        if (left >= nel) break;
+        size_t larger = left;
+        if (left + 1 < nel && cmp(data + left * width, data + (left + 1) * width)) {
+            larger = left + 1;
+        }
+        if (!cmp(tmp, data + larger * width)) break;
+        sized_memcpy(data + hole * width, data + larger * width, width);
+        hole = larger;
+    }
+    sized_memcpy(data + hole * width, tmp, width);
+}
+
+template <typename Compare>
+inline void das_make_heap_r(void *base, size_t nel, size_t width, Compare cmp)
+{
+    if (nel <= 1) return;
+    unsigned char *data = (unsigned char *)base;
+    // Floyd bottom-up: sift_down from the last non-leaf node.
+    for (size_t i = nel / 2; i-- > 0;) {
+        das_sift_down_r(data, i, nel, width, cmp);
+    }
+}
+
+template <typename Compare>
+inline void das_push_heap_r(void *base, size_t nel, size_t width, Compare cmp)
+{
+    // Assumes caller has just appended the new element at index (nel-1) and
+    // now wants the heap property restored. Classic sift-up with byte_swap.
+    if (nel <= 1) return;
+    unsigned char *data = (unsigned char *)base;
+    size_t child = nel - 1;
+    while (child > 0) {
+        size_t parent = (child - 1) / 2;
+        if (!cmp(data + parent * width, data + child * width)) return;
+        byte_swap(data + parent * width, data + child * width, width);
+        child = parent;
+    }
+}
+
+template <typename Compare>
+inline void das_pop_heap_r(void *base, size_t nel, size_t width, Compare cmp)
+{
+    // Swap root with last, then sift_down over the reduced range [0..nel-2].
+    // Caller is expected to pop / drop the last slot after.
+    if (nel <= 1) return;
+    unsigned char *data = (unsigned char *)base;
+    byte_swap(data, data + (nel - 1) * width, width);
+    das_sift_down_r(data, 0, nel - 1, width, cmp);
+}
+
+// Used by qsort/nth_element as the depth-blowup fallback.
+template <typename Compare>
+static inline void das_heapsort_helper_r(unsigned char *data, size_t nel, size_t width, Compare cmp)
+{
+    if (nel <= 1) return;
+    das_make_heap_r(data, nel, width, cmp);
+    for (size_t len = nel; len > 1; len--) {
+        byte_swap(data, data + (len - 1) * width, width);
+        das_sift_down_r(data, 0, len - 1, width, cmp);
+    }
+}
+
+// Forward declaration of das_block_partition_r — defined below, referenced
+// by das_qsort_r and das_sort<T> which dispatch to it for large sub-ranges.
+template <typename Compare>
+static inline size_t das_block_partition_r(
+    unsigned char *data, size_t lo, size_t hi, size_t width,
+    Compare cmp, bool *already_partitioned);
+
+// Hoare partition with pivot at data[lo] (median-of-3 placed it there).
+// Returns pivot's final position. Used by das_qsort_r for sub-ranges below
+// the block-partition threshold and as the width-too-wide fallback inside
+// das_block_partition_r.
+template <typename Compare>
+static inline size_t das_hoare_partition_r(
+    unsigned char *data, size_t lo, size_t hi, size_t width, Compare cmp)
+{
+    size_t i = lo + 1, j = hi;
+    unsigned char *piv = data + lo * width;
+    while (true) {
+        while (i <= j && cmp(data + i * width, piv)) i++;
+        while (i <= j && cmp(piv, data + j * width)) j--;
+        if (i >= j) break;
+        byte_swap(data + i * width, data + j * width, width);
+        i++; j--;
+    }
+    byte_swap(piv, data + j * width, width);
+    return j;
+}
+
+// das_qsort_r — introsort with libc++-style block-bitset partition for
+// sub-ranges ≥ 128 elements; Hoare partition for smaller (bitset setup
+// overhead doesn't amortize below ~128 elements, especially on big structs).
+// Median-of-3 pivot placed at data[lo] for both partition styles.
+// Insertion-sort tail at 24; heapsort fallback at 2*log2(N) recursion depth.
+// Iterative explicit-stack form; stack depth ≤ 128 frames covers nel ~ 2^64.
+//
+// Block-partition strategy from Edelkamp & Weiß "BlockQuicksort" (ESA 2016),
+// landed in libc++ via D93923 (Kutenin, 2021). Closes ~half the gap to
+// libc++ std::sort on workhorse types and beats both libstdc++ Musser
+// introsort and libc++ pdqsort+block on struct types.
+template <typename Compare>
+inline void das_qsort_r(void *base, size_t nel, size_t width, Compare cmp)
+{
+    if (nel <= 1) return;
+    unsigned char *data = (unsigned char *)base;
+    int max_depth = 0;
+    for (size_t x = nel; x > 0; x >>= 1) max_depth += 2;
+    struct Frame { size_t lo, hi; int depth; };
+    Frame stack[128];
+    int sp = 0;
+    stack[sp++] = { size_t(0), nel - 1, max_depth };
+    while (sp > 0) {
+        Frame f = stack[--sp];
+        size_t lo = f.lo, hi = f.hi;
+        int depth = f.depth;
+        while (lo < hi) {
+            if (hi - lo < 24) {
+                for (size_t i = lo + 1; i <= hi; i++) {
+                    for (size_t j = i; j > lo && cmp(data + j * width, data + (j - 1) * width); j--) {
+                        byte_swap(data + j * width, data + (j - 1) * width, width);
+                    }
+                }
+                break;
+            }
+            if (depth-- <= 0) {
+                das_heapsort_helper_r(data + lo * width, hi - lo + 1, width, cmp);
+                break;
+            }
+            // Median-of-3 pivot at mid, then move median to lo.
+            size_t mid = lo + (hi - lo) / 2;
+            unsigned char *plo = data + lo * width;
+            unsigned char *pmid = data + mid * width;
+            unsigned char *phi = data + hi * width;
+            if (cmp(pmid, plo)) byte_swap(plo, pmid, width);
+            if (cmp(phi,  plo)) byte_swap(plo, phi,  width);
+            if (cmp(phi,  pmid)) byte_swap(pmid, phi, width);
+            byte_swap(plo, pmid, width);
+
+            size_t p;
+            if (hi - lo >= 128) {
+                bool already_partitioned;
+                p = das_block_partition_r(data, lo, hi, width, cmp, &already_partitioned);
+            } else {
+                p = das_hoare_partition_r(data, lo, hi, width, cmp);
+            }
+            // Iterative tail-call on the larger side.
+            size_t lsz = p > lo ? p - lo : 0;
+            size_t rsz = hi > p ? hi - p : 0;
+            if (lsz < rsz) {
+                if (p + 1 < hi) stack[sp++] = { p + 1, hi, depth };
+                if (p > lo) hi = p - 1; else break;
+            } else {
+                if (p > lo) stack[sp++] = { lo, p - 1, depth };
+                if (p + 1 < hi) lo = p + 1; else break;
+            }
+        }
+    }
+}
+
+// ============================================================================
+// Block-partition bake-off candidate (Phase 0.2)
+// ============================================================================
+//
+// Port of libc++'s __bitset_partition + __introsort to byte-pointer form,
+// runtime width, runtime cmp. The win vs Hoare partition: branchless inner
+// populate loop fills a uint64_t bitset of "which 64 elements need to move",
+// then the swap pass is driven by countr_zero — branches collapse from one
+// per cmp to one per 64 elements. Reference: Edelkamp & Weiß "BlockQuicksort"
+// (ESA 2016), Peters pdqsort, libc++ <__algorithm/sort.h>:495.
+
+static constexpr int DAS_BLOCK_SIZE = 64;
+
+// Block-bitset partition. Pre: data[lo..hi] inclusive, lo < hi, pivot value
+// is at data[lo]. Post: pivot moved to its final position p; data[lo..p-1] <
+// pivot; data[p+1..hi] >= pivot. Returns p. Sets *already_partitioned to
+// true if no element needed to move (no swaps in the initial guarded find).
+template <typename Compare>
+static inline size_t das_block_partition_r(
+    unsigned char *data, size_t lo, size_t hi, size_t width,
+    Compare cmp, bool *already_partitioned)
+{
+    unsigned char pivot[256];
+    if (width > sizeof(pivot)) {
+        // Width too wide for stack buffer — fall back to Hoare partition.
+        // Reuses das_qsort_r body via a separate code path would be cleanest;
+        // for now, embed a minimal Hoare partition here.
+        size_t i = lo + 1, j = hi;
+        unsigned char *piv = data + lo * width;
+        while (true) {
+            while (i <= j && cmp(data + i * width, piv)) i++;
+            while (i <= j && cmp(piv, data + j * width)) j--;
+            if (i >= j) break;
+            byte_swap(data + i * width, data + j * width, width);
+            i++; j--;
+        }
+        byte_swap(piv, data + j * width, width);
+        *already_partitioned = false;
+        return j;
+    }
+    memcpy(pivot, data + lo * width, width);
+
+    // Initial guarded find: skip elements already on the correct side.
+    // After median-of-3 the caller placed median at lo; data[hi] is >= median
+    // and at least one element exists with cmp(pivot, *) == true at or before hi.
+    size_t first = lo + 1;
+    while (first <= hi && !cmp(pivot, data + first * width)) first++;
+
+    size_t last = hi;
+    if (first < last) {
+        while (cmp(pivot, data + last * width)) {
+            if (last == lo + 1) break;
+            last--;
+        }
+    }
+
+    *already_partitioned = (first >= last);
+    if (!*already_partitioned) {
+        byte_swap(data + first * width, data + last * width, width);
+        first++;
+    }
+    // From here on, last is inclusive on the right.
+
+    uint64_t left_bitset = 0;
+    uint64_t right_bitset = 0;
+    constexpr int B = DAS_BLOCK_SIZE;
+
+    // Main block loop: process full 64-element blocks from each side.
+    while (first + 2 * size_t(B) - 1 <= last) {
+        if (left_bitset == 0) {
+            // Populate: bit j set if data[first+j] >= pivot (needs to move right)
+            for (int j = 0; j < B; j++) {
+                bool gte = !cmp(data + (first + j) * width, pivot);
+                left_bitset |= uint64_t(gte) << j;
+            }
+        }
+        if (right_bitset == 0) {
+            // Populate: bit j set if data[last-j] <= pivot (needs to move left)
+            for (int j = 0; j < B; j++) {
+                bool lte = !cmp(pivot, data + (last - j) * width);
+                right_bitset |= uint64_t(lte) << j;
+            }
+        }
+        // Pairwise swap driven by countr_zero on both bitsets.
+        while (left_bitset != 0 && right_bitset != 0) {
+            int li = int(das_ctz64(left_bitset));
+            int ri = int(das_ctz64(right_bitset));
+            byte_swap(data + (first + li) * width, data + (last - ri) * width, width);
+            left_bitset  &= left_bitset  - 1;
+            right_bitset &= right_bitset - 1;
+        }
+        if (left_bitset  == 0) first += B;
+        if (right_bitset == 0) last  -= B;
+    }
+
+    // Tail: < 2*B remaining elements; one side may still have a residual bitset.
+    size_t remaining = last - first + 1;
+    size_t l_size, r_size;
+    if (left_bitset == 0 && right_bitset == 0) {
+        l_size = remaining / 2;
+        r_size = remaining - l_size;
+    } else if (left_bitset == 0) {
+        l_size = remaining - B;
+        r_size = B;
+    } else {
+        l_size = B;
+        r_size = remaining - B;
+    }
+    if (left_bitset == 0) {
+        for (size_t j = 0; j < l_size; j++) {
+            bool gte = !cmp(data + (first + j) * width, pivot);
+            left_bitset |= uint64_t(gte) << j;
+        }
+    }
+    if (right_bitset == 0) {
+        for (size_t j = 0; j < r_size; j++) {
+            bool lte = !cmp(pivot, data + (last - j) * width);
+            right_bitset |= uint64_t(lte) << j;
+        }
+    }
+    while (left_bitset != 0 && right_bitset != 0) {
+        int li = int(das_ctz64(left_bitset));
+        int ri = int(das_ctz64(right_bitset));
+        byte_swap(data + (first + li) * width, data + (last - ri) * width, width);
+        left_bitset  &= left_bitset  - 1;
+        right_bitset &= right_bitset - 1;
+    }
+    if (left_bitset  == 0) first += l_size;
+    if (right_bitset == 0) last  -= r_size;
+
+    // Residual: at most one side still has set bits. Swap them into place by
+    // walking from the highest set bit downward (libc++ __swap_bitmap_pos_within).
+    if (left_bitset != 0) {
+        while (left_bitset != 0) {
+            int tz = int(63 - das_clz64(left_bitset));
+            left_bitset &= (uint64_t(1) << tz) - 1;
+            size_t pos = first + size_t(tz);
+            if (pos != last) byte_swap(data + pos * width, data + last * width, width);
+            last--;
+        }
+        first = last + 1;
+    } else if (right_bitset != 0) {
+        while (right_bitset != 0) {
+            int tz = int(63 - das_clz64(right_bitset));
+            right_bitset &= (uint64_t(1) << tz) - 1;
+            size_t pos = last - size_t(tz);
+            if (pos != first) byte_swap(data + pos * width, data + first * width, width);
+            first++;
+        }
+    }
+
+    // Place pivot at first-1 (its final position).
+    size_t pivot_pos = first - 1;
+    if (pivot_pos != lo) {
+        memcpy(data + lo * width, data + pivot_pos * width, width);
+    }
+    memcpy(data + pivot_pos * width, pivot, width);
+    return pivot_pos;
+}
+
+// das_qsort_block_r — introsort with libc++-style block-bitset partition.
+// Same outer skeleton as das_qsort_r (depth limit, insertion-sort tail at 24,
+// heapsort fallback at 2*log2(N)). Only the partition algorithm differs.
+template <typename Compare>
+inline void das_qsort_block_r(void *base, size_t nel, size_t width, Compare cmp)
+{
+    if (nel <= 1) return;
+    unsigned char *data = (unsigned char *)base;
+    int max_depth = 0;
+    for (size_t x = nel; x > 0; x >>= 1) max_depth += 2;
+    struct Frame { size_t lo, hi; int depth; };
+    Frame stack[128];
+    int sp = 0;
+    stack[sp++] = { size_t(0), nel - 1, max_depth };
+    while (sp > 0) {
+        Frame f = stack[--sp];
+        size_t lo = f.lo, hi = f.hi;
+        int depth = f.depth;
+        while (lo < hi) {
+            if (hi - lo < 24) {
+                for (size_t i = lo + 1; i <= hi; i++) {
+                    for (size_t j = i; j > lo && cmp(data + j * width, data + (j - 1) * width); j--) {
+                        byte_swap(data + j * width, data + (j - 1) * width, width);
+                    }
+                }
+                break;
+            }
+            if (depth-- <= 0) {
+                das_heapsort_helper_r(data + lo * width, hi - lo + 1, width, cmp);
+                break;
+            }
+            // Median-of-3 pivot at mid, then move median to lo so partition
+            // can use the (pivot-at-first) convention.
+            size_t mid = lo + (hi - lo) / 2;
+            unsigned char *plo = data + lo * width;
+            unsigned char *pmid = data + mid * width;
+            unsigned char *phi = data + hi * width;
+            if (cmp(pmid, plo)) byte_swap(plo, pmid, width);
+            if (cmp(phi,  plo)) byte_swap(plo, phi,  width);
+            if (cmp(phi,  pmid)) byte_swap(pmid, phi, width);
+            byte_swap(plo, pmid, width);
+
+            bool already_partitioned = false;
+            size_t p = das_block_partition_r(data, lo, hi, width, cmp, &already_partitioned);
+
+            // Iterative tail-call on the larger side.
+            size_t lsz = p > lo ? p - lo : 0;
+            size_t rsz = hi > p ? hi - p : 0;
+            if (lsz < rsz) {
+                if (p + 1 < hi) stack[sp++] = { p + 1, hi, depth };
+                if (p > lo) hi = p - 1; else break;
+            } else {
+                if (p > lo) stack[sp++] = { lo, p - 1, depth };
+                if (p + 1 < hi) lo = p + 1; else break;
+            }
+        }
+    }
+}
+
+// das_nth_element_r — introselect with Hoare partition + median-of-3 pivot +
+// insertion-sort cutoff at 16 + heapsort fallback at 2*log2(N) recursion depth.
+// Beats std::nth_element by ~30% on the bake-off (large N, random data).
+template <typename Compare>
+inline void das_nth_element_r(void *base, size_t nel, size_t n, size_t width, Compare cmp)
+{
+    if (nel <= 1 || n >= nel) return;
+    unsigned char *data = (unsigned char *)base;
+    size_t lo = 0;
+    size_t hi = nel - 1;
+    int depth_limit = 0;
+    for (size_t x = nel; x > 0; x >>= 1) depth_limit += 2;
+    while (lo < hi) {
+        if (depth_limit-- <= 0) {
+            das_heapsort_helper_r(data + lo * width, hi - lo + 1, width, cmp);
+            return;
+        }
+        if (hi - lo < 16) {
+            for (size_t i = lo + 1; i <= hi; i++) {
+                for (size_t j = i; j > lo && cmp(data + j * width, data + (j - 1) * width); j--) {
+                    byte_swap(data + j * width, data + (j - 1) * width, width);
+                }
+            }
+            return;
+        }
+        size_t mid = lo + (hi - lo) / 2;
+        if (cmp(data + mid * width, data + lo * width)) byte_swap(data + lo * width, data + mid * width, width);
+        if (cmp(data + hi  * width, data + lo * width)) byte_swap(data + lo * width, data + hi  * width, width);
+        if (cmp(data + hi  * width, data + mid * width)) byte_swap(data + mid * width, data + hi * width, width);
+        byte_swap(data + mid * width, data + (lo + 1) * width, width);
+        unsigned char *pivot = data + (lo + 1) * width;
+        size_t i = lo + 2;
+        size_t j = hi;
+        while (true) {
+            while (i <= j && cmp(data + i * width, pivot)) i++;
+            while (i <= j && cmp(pivot, data + j * width)) j--;
+            if (i >= j) break;
+            byte_swap(data + i * width, data + j * width, width);
+            i++; j--;
+        }
+        byte_swap(pivot, data + j * width, width);
+        if (j == n) return;
+        if (j < n) lo = j + 1;
+        else       hi = j > 0 ? j - 1 : 0;
+    }
+}
+
+// Typed Hoare partition with pivot at data[lo]. Mirror of das_hoare_partition_r
+// for the typed das_sort<T> path. Returns pivot's final position.
+template <typename T, typename Compare>
+static inline size_t das_hoare_partition_t(T *data, size_t lo, size_t hi, Compare cmp)
+{
+    using std::swap;
+    size_t i = lo + 1, j = hi;
+    while (true) {
+        while (i <= j && cmp(data[i], data[lo])) i++;
+        while (i <= j && cmp(data[lo], data[j])) j--;
+        if (i >= j) break;
+        swap(data[i], data[j]);
+        i++; j--;
+    }
+    swap(data[lo], data[j]);
+    return j;
+}
+
+// Typed block-bitset partition with pivot at data[lo]. Mirror of
+// das_block_partition_r for the typed das_sort<T> path. Uses std::swap on T&
+// instead of byte_swap, otherwise structurally identical. Returns pivot's
+// final position; sets *already_partitioned if the guarded find skipped to
+// the end without swaps.
+template <typename T, typename Compare>
+static inline size_t das_block_partition_t(
+    T *data, size_t lo, size_t hi, Compare cmp, bool *already_partitioned)
+{
+    using std::swap;
+    T pivot = data[lo];   // value copy
+    size_t first = lo + 1;
+    while (first <= hi && !cmp(pivot, data[first])) first++;
+    size_t last = hi;
+    if (first < last) {
+        while (cmp(pivot, data[last])) {
+            if (last == lo + 1) break;
+            last--;
+        }
+    }
+    *already_partitioned = (first >= last);
+    if (!*already_partitioned) {
+        swap(data[first], data[last]);
+        first++;
+    }
+
+    uint64_t left_bitset = 0;
+    uint64_t right_bitset = 0;
+    constexpr int B = DAS_BLOCK_SIZE;
+
+    while (first + 2 * size_t(B) - 1 <= last) {
+        if (left_bitset == 0) {
+            for (int j = 0; j < B; j++) {
+                bool gte = !cmp(data[first + j], pivot);
+                left_bitset |= uint64_t(gte) << j;
+            }
+        }
+        if (right_bitset == 0) {
+            for (int j = 0; j < B; j++) {
+                bool lte = !cmp(pivot, data[last - j]);
+                right_bitset |= uint64_t(lte) << j;
+            }
+        }
+        while (left_bitset != 0 && right_bitset != 0) {
+            int li = int(das_ctz64(left_bitset));
+            int ri = int(das_ctz64(right_bitset));
+            swap(data[first + li], data[last - ri]);
+            left_bitset  &= left_bitset  - 1;
+            right_bitset &= right_bitset - 1;
+        }
+        if (left_bitset  == 0) first += B;
+        if (right_bitset == 0) last  -= B;
+    }
+
+    size_t remaining = last - first + 1;
+    size_t l_size, r_size;
+    if (left_bitset == 0 && right_bitset == 0) {
+        l_size = remaining / 2;
+        r_size = remaining - l_size;
+    } else if (left_bitset == 0) {
+        l_size = remaining - B;
+        r_size = B;
+    } else {
+        l_size = B;
+        r_size = remaining - B;
+    }
+    if (left_bitset == 0) {
+        for (size_t j = 0; j < l_size; j++) {
+            bool gte = !cmp(data[first + j], pivot);
+            left_bitset |= uint64_t(gte) << j;
+        }
+    }
+    if (right_bitset == 0) {
+        for (size_t j = 0; j < r_size; j++) {
+            bool lte = !cmp(pivot, data[last - j]);
+            right_bitset |= uint64_t(lte) << j;
+        }
+    }
+    while (left_bitset != 0 && right_bitset != 0) {
+        int li = int(das_ctz64(left_bitset));
+        int ri = int(das_ctz64(right_bitset));
+        swap(data[first + li], data[last - ri]);
+        left_bitset  &= left_bitset  - 1;
+        right_bitset &= right_bitset - 1;
+    }
+    if (left_bitset  == 0) first += l_size;
+    if (right_bitset == 0) last  -= r_size;
+
+    if (left_bitset != 0) {
+        while (left_bitset != 0) {
+            int tz = int(63 - das_clz64(left_bitset));
+            left_bitset &= (uint64_t(1) << tz) - 1;
+            size_t pos = first + size_t(tz);
+            if (pos != last) swap(data[pos], data[last]);
+            last--;
+        }
+        first = last + 1;
+    } else if (right_bitset != 0) {
+        while (right_bitset != 0) {
+            int tz = int(63 - das_clz64(right_bitset));
+            right_bitset &= (uint64_t(1) << tz) - 1;
+            size_t pos = last - size_t(tz);
+            if (pos != first) swap(data[pos], data[first]);
+            first++;
+        }
+    }
+
+    size_t pivot_pos = first - 1;
+    if (pivot_pos != lo) data[lo] = std::move(data[pivot_pos]);
+    data[pivot_pos] = std::move(pivot);
+    return pivot_pos;
+}
+
+// Shared introsort skeleton for typed das_sort<T> and das_sort_block<T>.
+// The two differ only in which partition routine fires for ≥ 128 elements;
+// below 128 both use typed Hoare partition.
+template <typename T, typename Compare, bool ForceBlock>
+static inline void das_sort_skeleton_t(T *first, T *last, Compare cmp)
+{
+    if (last - first <= 1) return;
+    size_t nel = size_t(last - first);
+    T *data = first;
+    int max_depth = 0;
+    for (size_t x = nel; x > 0; x >>= 1) max_depth += 2;
+    struct Frame { size_t lo, hi; int depth; };
+    Frame stack[128];
+    int sp = 0;
+    stack[sp++] = { size_t(0), nel - 1, max_depth };
+    while (sp > 0) {
+        Frame f = stack[--sp];
+        size_t lo = f.lo, hi = f.hi;
+        int depth = f.depth;
+        while (lo < hi) {
+            if (hi - lo < 24) {
+                using std::swap;
+                for (size_t i = lo + 1; i <= hi; i++) {
+                    for (size_t j = i; j > lo && cmp(data[j], data[j - 1]); j--) {
+                        swap(data[j], data[j - 1]);
+                    }
+                }
+                break;
+            }
+            if (depth-- <= 0) {
+                std::make_heap(data + lo, data + hi + 1, cmp);
+                std::sort_heap(data + lo, data + hi + 1, cmp);
+                break;
+            }
+            // Median-of-3, place median at lo.
+            size_t mid = lo + (hi - lo) / 2;
+            using std::swap;
+            if (cmp(data[mid], data[lo])) swap(data[lo], data[mid]);
+            if (cmp(data[hi],  data[lo])) swap(data[lo], data[hi]);
+            if (cmp(data[hi],  data[mid])) swap(data[mid], data[hi]);
+            swap(data[lo], data[mid]);
+
+            size_t p;
+            if (ForceBlock || hi - lo >= 128) {
+                bool ap;
+                p = das_block_partition_t(data, lo, hi, cmp, &ap);
+            } else {
+                p = das_hoare_partition_t(data, lo, hi, cmp);
+            }
+            size_t lsz = p > lo ? p - lo : 0;
+            size_t rsz = hi > p ? hi - p : 0;
+            if (lsz < rsz) {
+                if (p + 1 < hi) stack[sp++] = { p + 1, hi, depth };
+                if (p > lo) hi = p - 1; else break;
+            } else {
+                if (p > lo) stack[sp++] = { lo, p - 1, depth };
+                if (p + 1 < hi) lo = p + 1; else break;
+            }
+        }
+    }
+}
+
+// das_sort — typed introsort with block-bitset partition for sub-ranges
+// ≥ 128 elements, typed Hoare for smaller. Mirror of das_qsort_r but
+// operating on T* with std::swap. Provides an apples-to-apples peer for
+// std::sort and a candidate for the workhorse-typed binding path in
+// module_builtin_runtime_sort.cpp.
+template <typename T, typename Compare>
+inline void das_sort(T *first, T *last, Compare cmp)
+{
+    das_sort_skeleton_t<T, Compare, false>(first, last, cmp);
+}
+
+// das_sort_block — typed pure-block-partition introsort. No Hoare fallback
+// at any sub-range size. Bench-only ceiling test for the block-partition
+// algorithm's typed-access upper bound.
+template <typename T, typename Compare>
+inline void das_sort_block(T *first, T *last, Compare cmp)
+{
+    das_sort_skeleton_t<T, Compare, true>(first, last, cmp);
+}
+
+// das_partial_sort_r — heap-of-topn scan. Builds a max-heap from the first N
+// elements; for each remaining element, if it's smaller than the heap top,
+// replaces top + sifts to restore heap. Final drain via repeated pop produces
+// sorted output. O(M log N). Matches std::partial_sort.
+template <typename Compare>
+inline void das_partial_sort_r(void *base, size_t nel, size_t n, size_t width, Compare cmp)
+{
+    if (nel <= 1 || n == 0) return;
+    if (n > nel) n = nel;
+    unsigned char *data = (unsigned char *)base;
+    das_make_heap_r(data, n, width, cmp);
+    for (size_t i = n; i < nel; i++) {
+        unsigned char *xi = data + i * width;
+        if (cmp(xi, data)) {           // a[i] < heap top (current Nth-smallest)
+            byte_swap(data, xi, width);// displace top to position i (outside heap; never revisited)
+            das_sift_down_r(data, 0, n, width, cmp);
+        }
+    }
+    // Drain: pop max repeatedly into positions n-1, n-2, ...
+    for (size_t len = n; len > 1; len--) {
+        byte_swap(data, data + (len - 1) * width, width);
+        das_sift_down_r(data, 0, len - 1, width, cmp);
+    }
+}
+
+}
diff --git a/mouse-data/docs/byte-swap-micro-win-invisible-under-cblock-dominance.md b/mouse-data/docs/byte-swap-micro-win-invisible-under-cblock-dominance.md
new file mode 100644
index 0000000000..8f67491e5f
--- /dev/null
+++ b/mouse-data/docs/byte-swap-micro-win-invisible-under-cblock-dominance.md
@@ -0,0 +1,61 @@
+---
+slug: byte-swap-micro-win-invisible-under-cblock-dominance
+title: why does the byte_swap sized-dispatch micro-improvement (140× at W=4 in isolation) not show up in any real daslang sort benchmark?
+created: 2026-05-18
+last_verified: 2026-05-18
+links: []
+---
+
+A byte_swap variant that demolishes chunked-256 in isolation (`examples/sort/bench_byte_swap.cpp` — 140× faster at W=4, 30× at W=8, never slower) was invisible in every real-world sort benchmark when dropped into `src/builtin/das_qsort_r.h` (2026-05-17 bake-off, Phase 0.2 of PR #2706 follow-up).
+
+**Measured deltas (chunked-256 → sized-dispatch):**
+
+| benchmark | metric | before | after |
+|---|---|---:|---:|
+| `bench_sort_family.cpp` int32/100K sort | das/std ratio | 2.57× | 2.56× |
+| `bench_sort_family.cpp` P32/100K sort | das/std ratio | 1.07× | 1.07× |
+| `benchmarks/sort/sort.das` struct+cblock 100K | ns/op | 279 | 281 |
+| `benchmarks/sort/nth_element.das` k=50000/100K | ns/op | 19 | 19 |
+| `benchmarks/sort/heap_ops.das` bounded_topn/100K | ns/op | 27 | 27 |
+
+Two distinct reasons, depending on the path:
+
+### 1. C++ bench (bench_sort_family.cpp): constant propagation
+
+The bench calls `das_qsort_r(a.data(), a.size(), sizeof(T), das_cmp<T>())`. `sizeof(T)` is a compile-time constant. With `-O3` and the `static inline` byte_swap, the compiler propagates `sizeof(T)` all the way down into the byte_swap body. The OLD chunked loop:
+
+```cpp
+while (width) {
+    size_t chunk = sizeof(tmp) < width ? sizeof(tmp) : width;
+    memcpy(tmp, a, chunk); memcpy(a, b, chunk); memcpy(b, tmp, chunk);
+    a += chunk; b += chunk; width -= chunk;
+}
+```
+
+at width=4 becomes: 1 iteration, chunk=4, 3× constant-size memcpy → identical machine code to the new switch arm. The micro-bench shows the cost because it calls byte_swap through a `Fn` function-pointer template parameter, defeating constant propagation. **The micro-bench measures dispatch cost; the real bench measures inlined-and-folded cost.**
+
+### 2. daslang runtime path (any_cblock binding): callback dominance
+
+`module_builtin_runtime_sort.cpp` calls `das_qsort_r(anyData, length, elementSize, lambda)` where elementSize IS runtime. Here the sized switch DOES win (5-10ns saved per swap vs the runtime-width loop). For 100K elements with ~1.7M swaps, that's ~10ms of theoretical savings.
+
+But the lambda does `context->invokeEx(cmp, bargs, ...)` per comparison — script-block invocation with bargs marshaling. Measured cost per call dominates the swap by ~100×. The 27.9ms total for `sort_struct_by_key/100K` is overwhelmingly comparator-callback time; shaving 10ms off byte_swap would be visible but wasn't reproduced in repeated runs (281 vs 279 within noise).
+
+### Implication for future perf work
+
+- Micro-benchmark a primitive in isolation → not predictive of integrated impact when (a) callers know the parameter at compile time, or (b) some other operation in the same path dominates.
+- If byte_swap is your suspected bottleneck, write a daslang benchmark that uses a CHEAP comparator (e.g. ints with `<`, going through the workhorse path, or a typed C++ harness that defeats constant propagation). The any_cblock path will never reveal byte_swap wins.
+- The actual sort-vs-std gap on workhorse types (2.5× slower) is NOT byte_swap. Likely candidates: partition algorithm choice (libstdc++ may use block-quicksort partition), cmp callback inlining quality. Future bake-offs targeting that gap should NOT spend cycles on byte_swap variants.
+
+### Decision
+
+Boris (2026-05-17): keep the sized dispatch anyway. Net-neutral but never worse; cleaner code at small widths; header comment documents the bake-off so future maintainers don't redo the experiment. The 30-140× micro-win remains real for any future caller that defeats constant-prop (e.g. a direct C++ binding with runtime width).
+
+### Source measurements
+
+- Header: `src/builtin/das_qsort_r.h` byte_swap
+- Micro-bench: `examples/sort/bench_byte_swap.cpp` (chunked256/chunked64/words64/sized/hybrid candidates at W∈{4,8,16,32,64,128,256,512})
+- Integration: `examples/sort/bench_sort_family.cpp` + `benchmarks/sort/*.das`
+- Related card: [[qsort-byte-swap-implementations-survey]] — production-impl survey from the bake-off research
+
+## Questions
+- why does the byte_swap sized-dispatch micro-improvement (140× at W=4 in isolation) not show up in any real daslang sort benchmark?
diff --git a/mouse-data/docs/das-qsort-r-vs-std-perf-comparison.md b/mouse-data/docs/das-qsort-r-vs-std-perf-comparison.md
new file mode 100644
index 0000000000..7f5888cc42
--- /dev/null
+++ b/mouse-data/docs/das-qsort-r-vs-std-perf-comparison.md
@@ -0,0 +1,55 @@
+---
+slug: das-qsort-r-vs-std-perf-comparison
+title: how do das_partial_sort_r / das_nth_element_r / das_make_heap_r perform compared to std::* equivalents, and what's the real-world cost of the custom impl?
+created: 2026-05-17
+last_verified: 2026-05-17
+links: []
+---
+
+# das_*_r templates vs std::* — measured perf comparison
+
+Source: `examples/sort/bench_sort_family.cpp` (standalone, runs in ~16s on M-series Mac). N ∈ {1K, 10K, 100K}, element sizes {int32, int64, 32B struct, 128B struct}.
+
+## Headline numbers (das ÷ std ratio, worst case across sizes)
+
+| op | das vs std | reading |
+|---|---|---|
+| `das_nth_element_r` | 0.95–1.85× | **Match** — beats std at N=100K |
+| `das_make_heap_r` + drain | 1.6–2.8× slower | Acceptable overhead |
+| `das_qsort_r` (smoothsort) | 2.3–5.7× slower | Pre-existing (Anton Yudintsev's musl port). Smoothsort vs std's introsort. Known cost. |
+| **`das_partial_sort_r`** | **2–18× slower** | **Real implementation bug** (see below) |
+
+## Why partial_sort is so slow
+
+`das_partial_sort_r` is implemented as `das_nth_element_r(N) + das_qsort_r(first N)` — O(N) + O(topn·log·topn). Looks correct on paper, BUT:
+
+`std::partial_sort` uses a **heap-of-size-topn** approach: scan all N elements, maintain a top-N max-heap, output sorted. O(N · log topn). When topn ≪ N (e.g. topn=10, N=100K) this **destroys** nth-element-then-sort because nth_element still pays O(N) with a big constant.
+
+Concrete: at N=100K, topn=10 — std::partial_sort 39μs vs das_partial_sort_r 730μs = **18× gap**.
+
+The same asymmetry shows in daslib LINQ benchmarks:
+- `m3_topn_array` (calls das_partial_sort_r): 442 ns/op at M=100K, top-N=10
+- `m3_topn_iter` (does heap-of-N manually via push_heap loop): 83 ns/op same params
+- 5× perf difference, same algorithmic cause
+
+## Why custom impl exists at all
+
+`std::sort` / `std::partial_sort` / `std::nth_element` template on **iterator type**. They cannot operate on opaque `void*` data with runtime-known `width`. The daslang any-cblock path (user-defined struct types going through `addExtern<...any_cblock>` bindings) needs byte-pointer algorithms — that's exactly what `das_*_r` templates provide.
+
+So the custom impl IS justified for the any-cblock path. The typed paths (workhorse types) in `module_builtin_runtime_sort.cpp` already specialize to `std::*` directly via the `STAMP_NUMERIC_OPS` / `STAMP_VECTOR_OPS` macros.
+
+## Fix candidates (out of scope for Phase 0, planned follow-up)
+
+1. **Rewrite `das_partial_sort_r` to heap-of-topn** — closes the 18× gap. ~20 LOC change, in the Phase-0-added section of `das_qsort_r.h` (NOT Anton's smoothsort).
+2. **`das_make_heap_r` Floyd construction** — should match std but ~2× slower; revisit if profiling points here.
+3. **`das_qsort_r` smoothsort** — pre-existing; out of scope to replace.
+
+## When this matters
+
+- `top_n_by(arr, N, key)` in `daslib/linq` — currently uses `das_partial_sort_r` under the hood for the array overload. The iterator overload uses heap-of-N manually and is much faster.
+- BufferTopN emit mode in `linq_fold` (planned PR B) — wants the fast heap-of-N path for `[where][select]* |> order_by |> take(N)` fusion.
+
+Run the bench yourself: `cmake -S examples/sort -B build/example_sort_bench && cmake --build build/example_sort_bench -j && ./build/example_sort_bench/example_sort_bench`.
+
+## Questions
+- how do das_partial_sort_r / das_nth_element_r / das_make_heap_r perform compared to std::* equivalents, and what's the real-world cost of the custom impl?
diff --git a/mouse-data/docs/libcxx-stdsort-block-partition-pdqsort.md b/mouse-data/docs/libcxx-stdsort-block-partition-pdqsort.md
new file mode 100644
index 0000000000..878a5f0fcb
--- /dev/null
+++ b/mouse-data/docs/libcxx-stdsort-block-partition-pdqsort.md
@@ -0,0 +1,100 @@
+---
+slug: libcxx-stdsort-block-partition-pdqsort
+title: why is libc++ std::sort 2.5× faster than our das_qsort_r pdqsort-lite on workhorse types at N=100K?
+created: 2026-05-18
+last_verified: 2026-05-18
+links: []
+---
+
+Investigated 2026-05-17 in the PR #2706 follow-up: on macOS, the std::sort beating das_qsort_r by 2.5× at N=100K (int32/int64) is **libc++**, NOT libstdc++. libstdc++ ships plain Musser introsort and is roughly the same algorithm as ours. libc++ ships **block-quicksort pdqsort** since 2021 ([D93923](https://reviews.llvm.org/D93923), Kutenin) — same algorithm as Orson Peters' pdqsort + the Edelkamp/Weiß BlockQuicksort partition.
+
+## Three specific techniques in libc++
+
+Source: `/Library/Developer/CommandLineTools/SDKs/MacOSX26.4.sdk/usr/include/c++/v1/__algorithm/sort.h`.
+
+### 1. `__bitset_partition` (sort.h:495) — branchless partition
+
+Populate a `uint64_t` bitmask of comparison outcomes for 64 elements (branchless inner loop), THEN do swaps in a separate pass driven by `countr_zero` (tzcnt):
+
+```cpp
+// populate (no swaps, no branches)
+for (int __j = 0; __j < 64;) {
+  bool __comp_result = !__comp(*__iter, __pivot);
+  __left_bitset |= (uint64_t(__comp_result) << __j);
+  __j++; ++__iter;
+}
+// swap (predictable loop driven by tzcnt)
+while (__left_bitset != 0 && __right_bitset != 0) {
+  difference_type __tz_left  = __countr_zero(__left_bitset);
+  __left_bitset              = __libcpp_blsr(__left_bitset);
+  ...
+  _Ops::iter_swap(__first + __tz_left, __last - __tz_right);
+}
+```
+
+Cuts branch mispredictions from ~32/partition (50% rate over Hoare's `while (cmp) ++i; while (cmp) --j;`) to **one per 64 elements** (when the bitset empties). On random int32 at N=100K this is the dominant win.
+
+### 2. Branchless small-sort kernels (sort.h:99–198)
+
+Sub-ranges of 2/3/4/5 elements use sorting networks built from `__cond_swap`:
+
+```cpp
+// sort.h:67
+bool __r = __c(*__x, *__y);
+value_type __tmp = __r ? *__x : *__y;
+*__y = __r ? *__y : *__x;
+*__x = __tmp;
+```
+
+Clang lowers `bool ? T : T` on arithmetic types to `csel`/`cmov` — zero branches. Gated by `__use_branchless_sort` (sort.h:54): `contiguous_iterator && is_arithmetic && (std::less || std::greater)`. **Fundamentally typed.**
+
+### 3. Tukey ninther + already-partitioned fast-path (sort.h:775, 809)
+
+For `__len > 128`, pivot is median-of-three-medians-of-three (5× `__sort3` calls). After partition, returns a `bool already_partitioned`; caller then tries an **incomplete** insertion sort capped at 8 inversions before recursing. Pre-sorted/nearly-sorted runs collapse to O(N).
+
+## What is and isn't portable
+
+**Portable to byte-pointer (`(void*, width, cmp)` binding path):**
+- Block-partition bitset loop — `__builtin_ctzll` is universal. Swaps go through byte_swap.
+- Ninther pivot + already-partitioned bool — pure logic, works at any width.
+
+**Requires typed access:**
+- `__cond_swap` → cmov (compiler needs a real value type to emit conditional move)
+- SIMD-vectorization of the 64-element populate-bitset loop (compiler needs constant width + inlined cmp to vectorize)
+
+## Bake-off candidates (priority order)
+
+| # | Candidate | Portable to byte-ptr? | Est. lift | Effort |
+|---|---|---|---|---|
+| D | Ninther + already-partitioned bolt-on to das_qsort_r | yes | small | half day |
+| C | Block-partition pdqsort, byte-pointer | yes | **big — likely most of the 2.5×** | 2-3 days |
+| B | Block-partition pdqsort, typed das_sort\<T\> | n/a (typed only) | biggest | 2-3 days on top of C |
+| A | Dispatch typed das_sort\<T\> from binding for workhorse types | n/a (typed only) | medium | 1 day |
+| E | Branchless `__cond_swap` small-sort kernels | n/a (typed only) | small decoration | half day |
+
+Critical question the bake-off settles: if (C) ≈ (B), the gap is algorithmic (block partition wins regardless of typedness). If (B) ≫ (C), the gap is SIMD-vectorize.
+
+## Verified comparison data (2026-05-17, M-series Mac, Apple clang, libc++)
+
+`examples/sort/bench_sort_family.cpp` deep-dive at N=100K:
+
+| type | std::sort | C qsort | das_qsort_r | das_sort\<T\> |
+|---|---:|---:|---:|---:|
+| int32_t | 1.86M | 7.49M (**4.03×**) | 4.77M (2.57×) | 4.86M (2.62×) |
+| int64_t | 1.92M | 7.49M (**3.89×**) | 4.77M (2.48×) | 4.84M (2.52×) |
+| P32 | 4.97M | 8.44M (1.70×) | 5.48M (1.10×) | 5.64M (1.13×) |
+| P128 | 7.44M | 11.91M (1.60×) | 8.80M (1.18×) | 9.63M (1.30×) |
+
+das_sort\<T\> (typed pdqsort-lite, same algorithm as das_qsort_r) is ≈ identical to das_qsort_r byte-pointer at every (N, type). The 2.5× gap is algorithm, not API. C qsort (the proper byte-pointer + runtime-cmp peer) is **2× SLOWER than das_qsort_r** — we beat libc qsort by a clean margin.
+
+## Source pointers
+
+- libc++ sort: `/Library/Developer/CommandLineTools/SDKs/MacOSX26.4.sdk/usr/include/c++/v1/__algorithm/sort.h` lines 54/67/99/274/302/495/717
+- libstdc++ sort: `/opt/homebrew/include/c++/13/bits/stl_algo.h` lines 85/1792/1854/1893/1918/1942 (Musser introsort, unchanged since SGI STL era)
+- pdqsort reference: github.com/orlp/pdqsort
+- Block partition paper: Edelkamp & Weiß, "BlockQuicksort", ESA 2016 (arXiv:1604.06697)
+- libc++ landing commit: [D93923](https://reviews.llvm.org/D93923) — design rationale lives in review comments
+- Our impl for comparison: `src/builtin/das_qsort_r.h:159-220` (das_qsort_r) + `src/builtin/das_qsort_r.h:` (das_sort\<T\>, added 2026-05-17)
+
+## Questions
+- why is libc++ std::sort 2.5× faster than our das_qsort_r pdqsort-lite on workhorse types at N=100K?
diff --git a/mouse-data/docs/qsort-byte-swap-implementations-survey.md b/mouse-data/docs/qsort-byte-swap-implementations-survey.md
new file mode 100644
index 0000000000..823dc1bd40
--- /dev/null
+++ b/mouse-data/docs/qsort-byte-swap-implementations-survey.md
@@ -0,0 +1,34 @@
+---
+slug: qsort-byte-swap-implementations-survey
+title: What byte-swap strategies do production qsorts use for arbitrary-width element exchange?
+created: 2026-05-18
+last_verified: 2026-05-18
+links: []
+---
+
+Four canonical families across libc/kernel implementations:
+
+**1. Word-at-a-time temp swap (Linux kernel, glibc).** Linux `lib/sort.c` and glibc `stdlib/qsort.c` both classify (size, alignment) once at qsort entry — `is_aligned(base, size, 8)` → `SWAP_WORDS_64`, `is_aligned(base, size, 4)` → `SWAP_WORDS_32`, else `SWAP_BYTES`. Dispatcher is a 3-way `if`, NOT an indirect call (glibc comment: "should help the branch predictor"). Each variant is a tiny do-while: `do { T t = *(T*)(a+(n-=k)); *(T*)(a+n) = *(T*)(b+n); *(T*)(b+n) = t; } while (n);` — no buffer, in-place, compiler vectorizes the loop body.
+
+**2. Bentley-McIlroy "Engineering a Sort Function" (1993).** Same idea but older: `SWAPINIT` computes `swaptype = (a-(char*)0 | es) % sizeof(long) ? 2 : es > sizeof(long)`; `swaptype==0` means single-long inline exch (no call), `==1` means word-loop call, `==2` means byte-loop call. The single-long fast path inlined at the call site is the key trick — most sorts are 8-byte keys.
+
+**3. memcpy 3-way with stack buffer (musl smoothsort `cycle()`).** `char tmp[256]; while (width) { l = min(256, width); memcpy(ar[n], ar[0], l); for (i..n) { memcpy(ar[i], ar[i+1], l); ar[i] += l; } width -= l; }`. Chunked at 256 bytes — small enough to fit L1, large enough to amortize memcpy overhead. Generalizes to n-cycle rotations (smoothsort's trinkle/sift needs this).
+
+**4. Byte-by-byte only (FreeBSD).** `do { t = *a; *a++ = *b; *b++ = t; } while (--es > 0);` — no dispatch, no optimization. Simpler, but slow for anything >4 bytes. FreeBSD dropped Bentley-McIlroy's swaptype dispatch.
+
+**Specialized fast paths.** All four production implementations (except FreeBSD) special-case the common width=8/16 case. Linux kernel via SWAP_WORDS_64 + do-while-unrollable, glibc same, B-M via swaptype==0 inline single-long exchange.
+
+**pdqsort is C++-templated** — `std::iter_swap` on typed iterators, no byte primitive. Not applicable to byte-pointer ports.
+
+**Bake-off candidates worth measuring:**
+- Linux kernel SWAP_WORDS_64 (predicted winner for ≤64B aligned)
+- Bentley-McIlroy single-long fast-path + word-loop fallback
+- Current memcpy chunked-256 baseline
+- memcpy 3-way unchunked with sized stack buffer (single memcpy per direction, no loop)
+- Inline `__builtin_memcpy(a,b,W)` with width as compile-time constant via switch on common widths (4/8/16/32) — compiler emits direct SIMD load/store
+
+Reference impls quoted at: torvalds/linux lib/sort.c, bminor/glibc stdlib/qsort.c, musl src/stdlib/qsort.c, cs.dartmouth.edu/~doug/qsort.c (B-M), freebsd-src lib/libc/stdlib/qsort.c.</body>
+</invoke>
+
+## Questions
+- What byte-swap strategies do production qsorts use for arbitrary-width element exchange?
diff --git a/mouse-data/docs/sphinx-w-fails-on-my-pr-branch-with-undefined-label-struct-module-x-but-master-ci-is-green-what-causes-this-hash-order-flip-and.md b/mouse-data/docs/sphinx-w-fails-on-my-pr-branch-with-undefined-label-struct-module-x-but-master-ci-is-green-what-causes-this-hash-order-flip-and.md
new file mode 100644
index 0000000000..71b7570ed5
--- /dev/null
+++ b/mouse-data/docs/sphinx-w-fails-on-my-pr-branch-with-undefined-label-struct-module-x-but-master-ci-is-green-what-causes-this-hash-order-flip-and.md
@@ -0,0 +1,38 @@
+---
+slug: sphinx-w-fails-on-my-pr-branch-with-undefined-label-struct-module-x-but-master-ci-is-green-what-causes-this-hash-order-flip-and
+title: Sphinx -W fails on my PR branch with "undefined label: 'struct-MODULE-X'" but master CI is green — what causes this hash-order flip and how do I fix it?
+created: 2026-05-18
+last_verified: 2026-05-18
+links: []
+---
+
+**Symptom:** CI doc workflow's `sphinx-build -W` fails on your branch only. Error: `doc/source/stdlib/generated/MODULE.rst:N: WARNING: undefined label: 'struct-MODULE-StructName'`. Master CI is green for the exact same `daslib/*.das` source.
+
+**Diagnosis:** `doc/reflections/das2rst.das` emits the **full** RST detail (description + `:Arguments:` block with `:ref:` to param types) only for the **first** function of a same-named overload group; the rest get a stub `.. das:function::` label + signature only. The iteration order over functions is **hash-bucket-dependent** — binary layout changes (new templates, added install paths, anything that shifts pointer/address-based hashing) can flip which overload wins the "first detailed" slot.
+
+If the winning overload references a **private struct** as a parameter type, das2rst still emits `:ref:`StructName <struct-MODULE-StructName>``, but the struct label is never defined (das2rst only emits Structures-section entries for public structs). Sphinx -W then fails with the undefined-label warning.
+
+**Verification pattern:**
+1. Build master's daslang in a worktree (`git worktree add /tmp/repo-master master`)
+2. Run `./bin/daslang doc/reflections/das2rst.das` on each
+3. `diff` the affected `.rst` — if the broken `:ref:` appears in a *different* overload's `:Arguments:` block on master, you've confirmed the order-flip pattern.
+
+**Durable fix:** Mark the struct `public` in the `.das` source. Since the function exposing it is `public`, the struct already needs to be reachable from caller code anyway — the private qualifier was the latent bug. Example from `daslib/quote.das` (PR #2707):
+```das
+struct public CaptureEntryInitData {
+    //! Initialization data for a captured variable entry.
+    name : string       //! Variable name being captured.
+    mode : CaptureMode  //! Capture mode (copy, reference, move, or clone).
+}
+```
+
+**Caveat — public structs need per-field `//!` docstrings.** Otherwise `daslang doc/reflections/das2rst.das` PANICs with `"... has less documentation than values. Expected at least N lines, got 1"`. The struct's main `//!` + one `//!` per field is the convention (see `LineInfoInitData` in `daslib/quote.das`).
+
+**Don't:** Add a handmade RST stub under `doc/source/stdlib/handmade/` for the struct — handmade files are only consulted for C++ builtin modules. For daslang modules das2rst regenerates from the source.
+
+**Reference:** PR #2707, commit `dd9b250ca`; my added template instantiations in `include/daScript/simulate/das_qsort_r.h` shifted hash buckets so `CaptureEntryInitData` won the detailed slot over `LineInfoInitData`.</body>
+<slug>sphinx-w-undefined-label-private-struct-public-fn-hash-order-flip</slug>
+</invoke>
+
+## Questions
+- Sphinx -W fails on my PR branch with "undefined label: 'struct-MODULE-X'" but master CI is green — what causes this hash-order flip and how do I fix it?
diff --git a/mouse-data/docs/standalone-example-no-daslang-link.md b/mouse-data/docs/standalone-example-no-daslang-link.md
new file mode 100644
index 0000000000..942647c203
--- /dev/null
+++ b/mouse-data/docs/standalone-example-no-daslang-link.md
@@ -0,0 +1,73 @@
+---
+slug: standalone-example-no-daslang-link
+title: how do I add a fully standalone benchmark or example under /examples that does not link the daslang runtime (header-only consumer)?
+created: 2026-05-17
+last_verified: 2026-05-17
+links: []
+---
+
+# Standalone example under /examples (no daslang runtime link)
+
+The conventional pattern in /examples (crash, pathTracer, etc.) is to register the example in the **root** `CMakeLists.txt` via `include(examples/<name>/CMakeLists.txt)` (around line 1132). That style **requires** linking `libDaScriptDyn`, runs as part of the main DAS.sln build, and gets rebuilt with everything else.
+
+If your example only consumes a header (e.g. `src/builtin/das_qsort_r.h` — the smoothsort + introselect + heap-on-byte-buffer templates), **don't register it in the root CMakeLists**. Make it fully standalone instead. Build it on its own without rebuilding the daslang world.
+
+## Layout
+
+```
+examples/<name>/
+  CMakeLists.txt           # standalone — no parent include
+  bench_<thing>.cpp        # includes "<header_name>.h" only
+```
+
+## CMakeLists.txt template
+
+```cmake
+cmake_minimum_required(VERSION 3.16)
+project(example_<name> CXX)
+
+set(CMAKE_CXX_STANDARD 17)
+set(CMAKE_CXX_STANDARD_REQUIRED ON)
+set(CMAKE_CXX_EXTENSIONS OFF)
+
+if(NOT CMAKE_BUILD_TYPE)
+    set(CMAKE_BUILD_TYPE Release CACHE STRING "" FORCE)
+endif()
+
+add_executable(example_<name> bench_<thing>.cpp)
+target_include_directories(example_<name> PRIVATE
+    ${CMAKE_CURRENT_SOURCE_DIR}/../../src/builtin)
+
+if(MSVC)
+    target_compile_options(example_<name> PRIVATE /O2 /W4)
+else()
+    target_compile_options(example_<name> PRIVATE -O3 -Wall -Wextra)
+endif()
+```
+
+## Build + run
+
+```sh
+cmake -S examples/<name> -B build/example_<name> -DCMAKE_BUILD_TYPE=Release
+cmake --build build/example_<name> -j 64
+./build/example_<name>/example_<name>
+```
+
+Build time: a few seconds. No daslang dependency.
+
+## When to use
+
+- Header-only consumers — your example needs `das_qsort_r.h` or another self-contained header but no `Context`, `Program`, `Module`, etc.
+- One-off perf comparisons / regression baselines that should NOT slow down the main daslang build by being part of it.
+- Demos of standalone components that should remain compilable without the full daslang toolchain.
+
+## When NOT to use
+
+If your example needs to run a daslang script, create a `Module`, or call into the simulate/runtime — use the **registered** pattern instead (root `CMakeLists.txt` include + `TARGET_LINK_LIBRARIES libDaScriptDyn`). See `examples/crash/CMakeLists.txt` for the canonical registered example.
+
+## Concrete reference
+
+`examples/sort/` (added 2026-05-17 in PR #2706 follow-up branch) — standalone perf benchmark comparing std::sort/std::partial_sort/std::nth_element/std::make_heap+pop_heap against the das_*_r templates in das_qsort_r.h, no daslang link, ~16s end-to-end on M-series Mac.
+
+## Questions
+- how do I add a fully standalone benchmark or example under /examples that does not link the daslang runtime (header-only consumer)?
diff --git a/mouse-data/docs/what-ci-checks-must-pass-when-i-regenerate-doc-source-stdlib-via-das2rst-das.md b/mouse-data/docs/what-ci-checks-must-pass-when-i-regenerate-doc-source-stdlib-via-das2rst-das.md
new file mode 100644
index 0000000000..a0d4c4dbb3
--- /dev/null
+++ b/mouse-data/docs/what-ci-checks-must-pass-when-i-regenerate-doc-source-stdlib-via-das2rst-das.md
@@ -0,0 +1,35 @@
+---
+slug: what-ci-checks-must-pass-when-i-regenerate-doc-source-stdlib-via-das2rst-das
+title: What CI checks must pass when I regenerate doc/source/stdlib/ via das2rst.das?
+created: 2026-05-18
+last_verified: 2026-05-18
+links: []
+---
+
+CI workflow `.github/workflows/doc.yml` runs three sequential gates after `./bin/daslang doc/reflections/das2rst.das`:
+
+1. **`// stub` check** — `grep -rl '// stub' doc/source/stdlib/handmade/` must return nothing. das2rst writes `// stub` placeholders for any public function/struct/enum without a handmade description.
+
+2. **Untracked-files check** — `git ls-files --others --exclude-standard doc/source/stdlib/` must be empty. Newly-generated RST files must be committed.
+
+3. **`sphinx-build -W --keep-going -b latex`** — warnings-as-errors LaTeX build. Catches dangling `:ref:`s, malformed tables, duplicate labels.
+
+**Local workflow before pushing:**
+```bash
+./bin/daslang doc/reflections/das2rst.das
+grep -rl "// stub" doc/source/stdlib/handmade/   # must be empty
+git add doc/source/stdlib/handmade/              # if new files appeared
+~/Library/Python/3.11/bin/sphinx-build -W -b latex -d doc/sphinx-build doc/source /tmp/site_doc   # must exit 0
+```
+
+**Gotchas:**
+- For **daslang modules** (`daslib/*.das`, `modules/*/*.das`): fix stubs by adding `//!` comments in the `.das` source — handmade files for daslang modules are ignored. See `skills/documentation_rst.md`.
+- For **C++ builtin modules** (math, strings, audio, etc.): fix stubs by editing `doc/source/stdlib/handmade/*.rst` directly.
+- When bulk-filling many stubs by copying from siblings (e.g. when a binary layout change shifts function hash buckets and regenerates 158 new RST files), **hand-check math overloads** — vector-specific descriptions may not match the scalar overload, and some library descriptions are technically wrong (e.g. `mad` is *not* always fused FMA; `round` halfway behavior is *not* nearest-even on the vector path).
+
+Reference: PR #2707 round 3 — `mad` and `round` doc fixes after Copilot caught the incorrect inherited descriptions.</body>
+<slug>ci-doc-workflow-gates-stubs-untracked-sphinx-w</slug>
+</invoke>
+
+## Questions
+- What CI checks must pass when I regenerate doc/source/stdlib/ via das2rst.das?
diff --git a/mouse-data/docs/what-daslib-operations-exist-for-partial-sort-nth-element-heap-ops-and-top-n-selection.md b/mouse-data/docs/what-daslib-operations-exist-for-partial-sort-nth-element-heap-ops-and-top-n-selection.md
new file mode 100644
index 0000000000..131f6be99a
--- /dev/null
+++ b/mouse-data/docs/what-daslib-operations-exist-for-partial-sort-nth-element-heap-ops-and-top-n-selection.md
@@ -0,0 +1,39 @@
+---
+slug: what-daslib-operations-exist-for-partial-sort-nth-element-heap-ops-and-top-n-selection
+title: What daslib operations exist for partial-sort, nth-element, heap ops, and top-N selection?
+created: 2026-05-17
+last_verified: 2026-05-17
+links: []
+---
+
+Phase 0 of the linq_fold project (PR landing 2026-05-17) added the `<algorithm>` sort-family bindings. Operations now available:
+
+**Sort-family (`require daslib/sort_boost`)** — array form, with and without custom comparator block:
+- `partial_sort(arr, n[, cmp])` — sorts only the first N elements ascending. `O(M log N)` via `std::partial_sort` (typed) or `das_partial_sort_r` (any-cblock path for user structs).
+- `nth_element(arr, n[, cmp])` — places the kth-smallest at position k; left ≤ kth ≤ right. Sub-O(M) overall.
+- `make_heap(arr[, cmp])` / `push_heap(arr[, cmp])` / `pop_heap(arr[, cmp])` — binary max-heap on contiguous storage. `push_heap` assumes the new element was just appended; `pop_heap` moves max to last slot for caller to drop.
+
+**Top-N selection (`require daslib/linq`)** — array + iterator sources:
+- `top_n(src, n)` — N smallest by `<`.
+- `top_n_by(src, n, key)` — N smallest by `key(element)`.
+- Array source uses `partial_sort` under the hood (O(M log N), single allocation).
+- Iterator source uses a bounded heap of size N during the scan (O(M log N), max N+1 elements resident).
+
+**Dispatcher macros (`require daslib/sort_boost`)** — mirror `qsort` for the full surface:
+- `qpartial_sort(value, n, block)`, `qnth_element(value, n, block)`, `qmake_heap(value, block)`, `qpush_heap(value, block)`, `qpop_heap(value, block)`. Each dispatches to the named user-facing function for `array<T>` / dim, or wraps in `temp_array(...)` for handled vector types.
+
+**C++ binding shape** — all 5 ops follow the existing `__builtin_sort` template registered across 19 workhorse types (numeric + vector). Generic user-struct path goes through `das_qsort_r.h` byte-pointer templates (introselect for partial_sort/nth_element; standard sift-up/sift-down for heap ops).
+
+**Out of Phase 0:** dim (`[]`) overloads are not provided for the daslib wrappers — use array form or call `__builtin_*` directly. Buffer-required emit modes in linq_fold (BufferTopN, BufferDistinct, etc.) that consume these primitives land in subsequent PRs (Phase 3+).
+
+**Critical files:**
+- `daslib/sort_boost.das:60-` — user-facing wrappers + q-prefix macros
+- `daslib/linq.das:460-510` — `top_n` / `top_n_by` family
+- `src/builtin/das_qsort_r.h:233-` — algorithm templates
+- `include/daScript/simulate/aot.h:3390-` — cblock template surface
+- `src/builtin/module_builtin_runtime_sort.cpp` — bindings + any-cblock wrappers
+
+Project plan: `~/.claude/plans/merry-dazzling-curry.md`. Project context: `benchmarks/sql/LINQ.md`.
+
+## Questions
+- What daslib operations exist for partial-sort, nth-element, heap ops, and top-N selection?
diff --git a/mouse-data/docs/what-s-the-right-anti-dce-pattern-for-a-c-microbenchmark-inner-loop-so-the-optimizer-can-t-elide-it.md b/mouse-data/docs/what-s-the-right-anti-dce-pattern-for-a-c-microbenchmark-inner-loop-so-the-optimizer-can-t-elide-it.md
new file mode 100644
index 0000000000..a5d554b94d
--- /dev/null
+++ b/mouse-data/docs/what-s-the-right-anti-dce-pattern-for-a-c-microbenchmark-inner-loop-so-the-optimizer-can-t-elide-it.md
@@ -0,0 +1,41 @@
+---
+slug: what-s-the-right-anti-dce-pattern-for-a-c-microbenchmark-inner-loop-so-the-optimizer-can-t-elide-it
+title: What's the right anti-DCE pattern for a C++ microbenchmark inner loop so the optimizer can't elide it?
+created: 2026-05-18
+last_verified: 2026-05-18
+links: []
+---
+
+**Pattern:** after the timer stops, take a `volatile` observation of post-loop state. The volatile load forces the compiler to materialize the value, which transitively forces the timed loop body to have an observed effect.
+
+```cpp
+auto t0 = clk::now();
+for (int it = 0; it < iters; it++) {
+    for (size_t p = 0; p < pairs; p++) {
+        fn(data.data() + (2*p) * width, data.data() + (2*p + 1) * width, width);
+    }
+}
+auto t1 = clk::now();
+// Anti-DCE: force a volatile observation of post-loop data so the optimizer
+// can't prove the loop is dead.
+volatile unsigned char observed = data.back();
+(void)observed;
+```
+
+**Anti-pattern (broken):** comparing against a value the type can never hold.
+```cpp
+if (data.back() == 0xDEAD) std::fprintf(stderr, "%d", int(data[0]));
+```
+`data.back()` is `unsigned char` (0–255) and can never equal `0xDEAD` (57005). The optimizer constant-folds the condition to `false`, eliminates the if-branch, then has no post-loop observation of `data`, so the entire timed loop can be DCE'd. Benchmark reports unrealistically low times.
+
+**Other valid patterns:**
+- `asm volatile("" : : "r"(data.data()) : "memory")` — compiler fence + memory clobber (Google benchmark's `DoNotOptimize`/`ClobberMemory` use this)
+- `std::cerr << uint64_t(data[0]) << "\n";` — actual I/O, but adds latency
+- The `volatile unsigned char` load is the cheapest portable form
+
+Reference: PR #2707 round 3 — Copilot caught the `0xDEAD` bug in `examples/sort/bench_byte_swap.cpp`. Commit `a11a3d79e`.</body>
+<slug>anti-dce-bench-pattern-volatile-post-loop-observation</slug>
+</invoke>
+
+## Questions
+- What's the right anti-DCE pattern for a C++ microbenchmark inner loop so the optimizer can't elide it?
diff --git a/mouse-data/docs/where-are-the-cross-compiler-bit-scan-and-popcount-helpers-in-daslang-s-c-headers.md b/mouse-data/docs/where-are-the-cross-compiler-bit-scan-and-popcount-helpers-in-daslang-s-c-headers.md
new file mode 100644
index 0000000000..6e33c74e57
--- /dev/null
+++ b/mouse-data/docs/where-are-the-cross-compiler-bit-scan-and-popcount-helpers-in-daslang-s-c-headers.md
@@ -0,0 +1,25 @@
+---
+slug: where-are-the-cross-compiler-bit-scan-and-popcount-helpers-in-daslang-s-c-headers
+title: Where are the cross-compiler bit-scan and popcount helpers in daslang's C++ headers?
+created: 2026-05-18
+last_verified: 2026-05-18
+links: []
+---
+
+**`include/daScript/misc/platform.h:149-211`** — daslang's cross-compiler primitives. On MSVC they wrap `_BitScanForward64` / `_BitScanReverse64` / `__popcnt64` (with explicit branches for x86-32 and ARM64); on GCC/Clang they are `#define`s to the `__builtin_*` intrinsics.
+
+Helpers:
+- `das_clz(uint32_t)` / `das_clz64(uint64_t)` — leading-zero count (same return as `__builtin_clzll`)
+- `das_ctz(uint32_t)` / `das_ctz64(uint64_t)` — trailing-zero count (same return as `__builtin_ctzll`)
+- `das_popcount(uint32_t)` / `das_popcount64(uint64_t)` — population count
+
+**Rule:** before writing any `#ifdef _MSC_VER` shim for `__builtin_ctzll` / `__builtin_clzll` / `__popcnt64`, **search `include/daScript/misc/platform.h` first**. This was the cause of PR #2707 CI failure round 2 — I wrote `__builtin_ctzll` directly in a new public header and MSVC choked. Fix was `#include "daScript/misc/platform.h"` + replace `__builtin_ctzll(x)` → `int(das_ctz64(x))` and `63 - __builtin_clzll(x)` → `int(63 - das_clz64(x))`.
+
+**Same energy for SIMD/vector intrinsics:** check `include/vecmath/dag_vecMath_*.h` for hand-tuned wrappers before rolling your own.
+
+Reference: PR #2707, commit `e69210c69`. Boris's standing rule (`feedback_cross_compiler_helpers_first.md`): always search before writing `#ifdef _MSC_VER` shims.</body>
+<slug>cross-compiler-bitscan-popcount-helpers-daScript-misc-platform-h</slug>
+</invoke>
+
+## Questions
+- Where are the cross-compiler bit-scan and popcount helpers in daslang's C++ headers?
diff --git a/mouse-data/docs/why-does-a-new-top-level-html-page-e-g-daspkg-html-added-under-site-404-on-daslang-io-after-merging-to-master.md b/mouse-data/docs/why-does-a-new-top-level-html-page-e-g-daspkg-html-added-under-site-404-on-daslang-io-after-merging-to-master.md
new file mode 100644
index 0000000000..a070dc16ba
--- /dev/null
+++ b/mouse-data/docs/why-does-a-new-top-level-html-page-e-g-daspkg-html-added-under-site-404-on-daslang-io-after-merging-to-master.md
@@ -0,0 +1,31 @@
+---
+slug: why-does-a-new-top-level-html-page-e-g-daspkg-html-added-under-site-404-on-daslang-io-after-merging-to-master
+title: Why does a new top-level HTML page (e.g. /daspkg.html) added under site/ 404 on daslang.io after merging to master?
+created: 2026-05-17
+last_verified: 2026-05-17
+links: []
+---
+
+The Pages deploy workflow (`.github/workflows/pages.yml`, "Stage site for deployment" step) stages root-level HTML via an **explicit allowlist**, not a glob:
+
+```yaml
+cp site/index.html _site/
+cp site/downloads.html _site/
+cp site/daspkg.html _site/      # add a new line per new top-level page
+cp site/robots.txt _site/
+cp -r site/files _site/files
+```
+
+Only `index.html`, `downloads.html`, `daspkg.html`, `robots.txt`, and `files/` are explicitly copied. Subdirectories with their own staging (`build/site` → `/doc`, `web/output/*` → `/playground`, `_news/` + `blog/_posts/` → blog via `build_blog.py`) are independent of this allowlist.
+
+**Failure mode**: add `site/foo.html` to a commit, merge to master, watch /foo.html return 404 indefinitely because no `cp site/foo.html _site/` line was added. The page sits in the repo but never reaches `_site/`, and since `actions/deploy-pages` publishes `_site` as a complete snapshot ([[github-pages-deploy-pages-publishes-snapshot-not-overlay]]) there's no carryover from a prior deploy either.
+
+**Fix**: edit pages.yml, add the `cp site/<name>.html _site/` line, commit. Re-running the workflow without the edit will not help — the cp list is the source of truth.
+
+**Diagnose with**: `grep "cp site/" .github/workflows/pages.yml` shows the current allowlist.
+
+The 2026-05-17 daspkg page (PR #2703, commit 281ac2e28) shipped without this line and 404'd in production until the workflow was patched.</body>
+<parameter name="slug">pages-yml-explicit-cp-allowlist-new-html-needs-line
+
+## Questions
+- Why does a new top-level HTML page (e.g. /daspkg.html) added under site/ 404 on daslang.io after merging to master?
diff --git a/site/blog/_posts/do-u-even-sort.md b/site/blog/_posts/do-u-even-sort.md
new file mode 100644
index 0000000000..7caa986f28
--- /dev/null
+++ b/site/blog/_posts/do-u-even-sort.md
@@ -0,0 +1,77 @@
+---
+title: Do you even sort?
+date: 2026-05-18 20:35:17
+tags:
+    - daScript
+    - C++
+---
+
+The Art Of Computer Programming Volume 3. Kind of.
+
+<!-- more -->
+
+Sort and rescue? Also hashtag hash - but that is das_hash_map, and its a story for another day. It's an old story too, but I'll get to it.
+
+Today is top_n_by day
+
+    let rows <- _sql(db |> select_from(type<Car>) |> _order_by(_.price) |> take(TAKE_N))
+    let rows <- _fold(arr |> _order_by(_.price) |> take(TAKE_N))
+    let rows <- top_n_by(arr, TAKE_N, @@(c : Car -&) => c.price)
+
+Its like take all the cars, sort by price, pick top N. Naive solution is to first sort, then take. Don't be naive.
+
+But what's in box number 2?
+
+    for (it in a) {
+        if (length(buf) < n) {
+            buf |> push_clone(it)
+            sort_boost::push_heap(buf, $(v1, v2) => _::less(key(v1), key(v2)))
+        } elif (_::less(key(it), key(buf[0]))) {
+            sort_boost::pop_heap(buf, $(v1, v2) => _::less(key(v1), key(v2)))
+            buf[length(buf) - 1] := it
+            sort_boost::push_heap(buf, $(v1, v2) => _::less(key(v1), key(v2)))
+        }
+    }
+    sort(buf, $(v1, v2) => _::less(key(v1), key(v2)))
+
+There is also nth_element in there somewhere, but I will get to that one eventually.
+
+It wasn't here this morning. I was working on making LINQ awesome. SQL version is excellent, but containers are lacking. Not for long.
+
+Problem is - das has sort, sort, and only sort. No heap, no nth_element, no nada. Here is what I do when I see nada. I turn nothing into something. An hour later I had all this jazz there in the PR. Easy. Right? Right??
+
+It never is. I can't just merge. I have to measure. Take the ruler, take it out, and see where it lands. This is how das became das. So a matrix was born. The matrix of the benchmarks, that is.
+
+std::sort vs another std::sort (because Apple Clang is more awesome more often), vs qsort (which was later dropped for being dog slow), vs das_sort (and later vs das_sort<T>). repeat for nth_element, make_heap, heap_sort. Different sizes too.
+
+Guess what? It instantly showed that what I had was not IT. A few hours later, what I had became IT enough. Did I mention the Apple Clang runtime? Its good.
+
+It's bold when das wins.
+
+### The Matrix vs libc++ std:: (thats Apple Clang)
+
+| op | int32 | int64 | P32 | P128 |
+|---|---:|---:|---:|---:|
+| sort | 1.37× | 1.38× | **0.61×** | **0.91×** |
+| partial_sort | 1.03× | 1.01× | 1.05× | **0.95×** |
+| nth_element | **0.64×** | **0.65×** | **0.70×** | **0.74×** |
+| make_heap | **0.95×** | 1.02× | 1.09× | 1.06× |
+| heap_sort | **0.92×** | **0.94×** | 1.12× | 1.12× |
+
+das is better 10/20. we win on larger structs, which I find more important.
+
+### The Matrix vs libstdc++ std:: (thats regular Clang)
+
+| op | int32 | int64 | P32 | P128 |
+|---|---:|---:|---:|---:|
+| sort | **0.66×** | **0.66×** | **0.68×** | **0.88×** |
+| partial_sort | **0.99×** | **0.93×** | 1.02× | 1.01× |
+| nth_element | **0.68×** | **0.71×** | **0.66×** | **0.74×** |
+| make_heap | **0.67×** | **0.66×** | **0.56×** | **0.76×** |
+| heap_sort | **0.89×** | **0.91×** | **0.93×** | 1.01× |
+
+das is better 17/20.
+
+That cars example on top - das is now on par with SQLITE, when operating on arrays. SQLITE has had years, its solid. Thats all it does.
+
+All I do is make das awesome, one LINQ at a time.
diff --git a/src/builtin/das_qsort_r.h b/src/builtin/das_qsort_r.h
deleted file mode 100644
index 8027e0c74e..0000000000
--- a/src/builtin/das_qsort_r.h
+++ /dev/null
@@ -1,380 +0,0 @@
-#pragma once
-
-#include <cstdint>
-#include <cstring>
-
-namespace das
-{
-
-//this is qsort_r/qsort_s like implementation, but with (potentially) inline comparator
-//qsort_r/qsort_s is not portable, and all comparator calls are out-of-line
-//from https://github.com/esmil/musl/blob/master/src/stdlib/qsort.c
-
-/* Copyright (C) 2011 by Valentin Ochs
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
- */
-
-/* Minor changes by Rich Felker for integration in musl, 2011-04-27. */
-//modified by Anton Yudintsev, to make inline comparator
-
-static inline int ntz(unsigned long x)
-{
-    static const char debruijn64[64] = {
-        0, 1, 2, 53, 3, 7, 54, 27, 4, 38, 41, 8, 34, 55, 48, 28,
-        62, 5, 39, 46, 44, 42, 22, 9, 24, 35, 59, 56, 49, 18, 29, 11,
-        63, 52, 6, 26, 37, 40, 33, 47, 61, 45, 43, 21, 23, 58, 17, 10,
-        51, 25, 36, 32, 60, 20, 57, 16, 50, 31, 19, 15, 30, 14, 13, 12
-    };
-    return debruijn64[(x&-x)*0x022fdd63cc95386dull >> 58];
-}
-
-static inline int pntz(uint32_t p[2]) {
-    int r = ntz(p[0] - 1);
-    if(r != 0 || (r = 8*sizeof(uint32_t) + ntz(p[1])) != 8*sizeof(uint32_t)) {
-        return r;
-    }
-    return 0;
-}
-
-static void cycle(uint32_t width, unsigned char* ar[], int n)
-{
-    unsigned char tmp[256];
-    uint32_t l;
-    int i;
-
-    if(n < 2) {
-        return;
-    }
-
-    ar[n] = tmp;
-    while(width) {
-        l = sizeof(tmp) < width ? sizeof(tmp) : width;
-        memcpy(ar[n], ar[0], l);
-        for(i = 0; i < n; i++) {
-            memcpy(ar[i], ar[i + 1], l);
-            ar[i] += l;
-        }
-        width -= l;
-    }
-}
-
-/* shl() and shr() need n > 0 */
-static inline void shl(uint32_t p[2], int n)
-{
-    if(n >= int(8 * sizeof(uint32_t))) {
-        n -= 8 * sizeof(uint32_t);
-        p[1] = p[0];
-        p[0] = 0;
-    }
-    p[1] <<= n;
-    p[1] |= p[0] >> (sizeof(uint32_t) * 8 - n);
-    p[0] <<= n;
-}
-
-static inline void shr(uint32_t p[2], int n)
-{
-    if(n >= int(8 * sizeof(uint32_t))) {
-        n -= 8 * sizeof(uint32_t);
-        p[0] = p[1];
-        p[1] = 0;
-    }
-    p[0] >>= n;
-    p[0] |= p[1] << (sizeof(uint32_t) * 8 - n);
-    p[1] >>= n;
-}
-
-template <typename Compare>
-static void sift(unsigned char *head, uint32_t width, Compare cmp, int pshift, uint32_t lp[])
-{
-    unsigned char *rt, *lf;
-    unsigned char *ar[14 * sizeof(uint32_t) + 1];
-    int i = 1;
-
-    ar[0] = head;
-    while(pshift > 1) {
-        rt = head - width;
-        lf = head - width - lp[pshift - 2];
-
-        if(!cmp(ar[0], lf) && !cmp(ar[0], rt)) {
-            break;
-        }
-        if(!cmp(lf, rt)) {
-            ar[i++] = lf;
-            head = lf;
-            pshift -= 1;
-        } else {
-            ar[i++] = rt;
-            head = rt;
-            pshift -= 2;
-        }
-    }
-    cycle(width, ar, i);
-}
-
-template <typename Compare>
-static void trinkle(unsigned char *head, uint32_t width, Compare cmp, uint32_t pp[2], int pshift, int trusty, uint32_t lp[])
-{
-    unsigned char *stepson,
-                  *rt, *lf;
-    uint32_t p[2];
-    unsigned char *ar[14 * sizeof(uint32_t) + 1];
-    int i = 1;
-    int trail;
-
-    p[0] = pp[0];
-    p[1] = pp[1];
-
-    ar[0] = head;
-    while(p[0] != 1 || p[1] != 0) {
-        stepson = head - lp[pshift];
-        if(cmp(stepson, ar[0])) {
-            break;
-        }
-        if(!trusty && pshift > 1) {
-            rt = head - width;
-            lf = head - width - lp[pshift - 2];
-            if(!cmp(rt, stepson) || !cmp(lf, stepson)) {
-                break;
-            }
-        }
-
-        ar[i++] = stepson;
-        head = stepson;
-        trail = pntz(p);
-        shr(p, trail);
-        pshift += trail;
-        trusty = 0;
-    }
-    if(!trusty) {
-        cycle(width, ar, i);
-        sift(head, width, cmp, pshift, lp);
-    }
-}
-
-template <typename Compare>
-inline void das_qsort_r(void *base, uint32_t nel, uint32_t width, Compare cmp)
-{
-    if (nel <= 1)
-      return;
-    uint32_t lp[12*sizeof(uint32_t)];
-    uint32_t i, size = width * nel;
-    unsigned char *head, *high;
-    uint32_t p[2] = {1, 0};
-    int pshift = 1;
-    int trail;
-
-    //if (!size) return;//we assume width isn't 0!
-    head = (unsigned char *)base;
-    high = head + size - width;
-
-    /* Precompute Leonardo numbers, scaled by element width */
-    for(lp[0]=lp[1]=width, i=2; (lp[i]=lp[i-2]+lp[i-1]+width) < size; i++);
-
-    while(head < high) {
-        if((p[0] & 3) == 3) {
-            sift(head, width, cmp, pshift, lp);
-            shr(p, 2);
-            pshift += 2;
-        } else {
-            if(lp[pshift - 1] >= uint32_t(high - head)) {
-                trinkle(head, width, cmp, p, pshift, 0, lp);
-            } else {
-                sift(head, width, cmp, pshift, lp);
-            }
-
-            if(pshift == 1) {
-                shl(p, 1);
-                pshift = 0;
-            } else {
-                shl(p, pshift - 1);
-                pshift = 1;
-            }
-        }
-
-        p[0] |= 1;
-        head += width;
-    }
-
-    trinkle(head, width, cmp, p, pshift, 0, lp);
-
-    while(pshift != 1 || p[0] != 1 || p[1] != 0) {
-        if(pshift <= 1) {
-            trail = pntz(p);
-            shr(p, trail);
-            pshift += trail;
-        } else {
-            shl(p, 2);
-            pshift -= 2;
-            p[0] ^= 7;
-            shr(p, 1);
-            trinkle(head - lp[pshift] - width, width, cmp, p, pshift + 1, 1, lp);
-            shl(p, 1);
-            p[0] |= 1;
-            trinkle(head - width, width, cmp, p, pshift, 1, lp);
-        }
-        head -= width;
-    }
-}
-
-// Expanded 2026-05-17 by Boris Batkin / Claude (Opus 4.7):
-//   das_nth_element_r / das_partial_sort_r — introselect (median-of-3
-//     quickselect with smoothsort fallback on depth blowup) + smoothsort tail
-//   das_make_heap_r / das_push_heap_r / das_pop_heap_r — binary max-heap on a
-//     contiguous byte buffer with explicit width
-// Same inline-comparator template style as the existing das_qsort_r above.
-// Motivated by the linq_fold splice project's BufferTopN emit mode — see
-// benchmarks/sql/LINQ.md for context. Without these the partial_sort /
-// nth_element / heap-op bindings would only cover workhorse types
-// (std::partial_sort etc. template on iterator type, not void*+width).
-// User-defined struct types route through the any-cblock path which lands here.
-
-static inline void byte_swap(void *pa, void *pb, uint32_t width)
-{
-    unsigned char tmp[256];
-    unsigned char *a = (unsigned char *)pa;
-    unsigned char *b = (unsigned char *)pb;
-    while (width) {
-        uint32_t chunk = uint32_t(sizeof(tmp)) < width ? uint32_t(sizeof(tmp)) : width;
-        memcpy(tmp, a, chunk);
-        memcpy(a, b, chunk);
-        memcpy(b, tmp, chunk);
-        a += chunk;
-        b += chunk;
-        width -= chunk;
-    }
-}
-
-template <typename Compare>
-static inline void das_sift_down_r(unsigned char *data, uint32_t parent, uint32_t nel, uint32_t width, Compare cmp)
-{
-    while (true) {
-        uint32_t left = 2u * parent + 1u;
-        if (left >= nel) return;
-        uint32_t right = left + 1u;
-        uint32_t largest = parent;
-        if (cmp(data + parent * width, data + left * width)) largest = left;
-        if (right < nel && cmp(data + largest * width, data + right * width)) largest = right;
-        if (largest == parent) return;
-        byte_swap(data + parent * width, data + largest * width, width);
-        parent = largest;
-    }
-}
-
-template <typename Compare>
-inline void das_make_heap_r(void *base, uint32_t nel, uint32_t width, Compare cmp)
-{
-    if (nel <= 1) return;
-    unsigned char *data = (unsigned char *)base;
-    // Floyd bottom-up: sift_down from the last non-leaf node.
-    for (uint32_t i = nel / 2u; i-- > 0u;) {
-        das_sift_down_r(data, i, nel, width, cmp);
-    }
-}
-
-template <typename Compare>
-inline void das_push_heap_r(void *base, uint32_t nel, uint32_t width, Compare cmp)
-{
-    // Assumes the caller has just appended the new element at index (nel-1)
-    // and now wants the heap property restored.
-    if (nel <= 1) return;
-    unsigned char *data = (unsigned char *)base;
-    uint32_t child = nel - 1u;
-    while (child > 0u) {
-        uint32_t parent = (child - 1u) / 2u;
-        if (!cmp(data + parent * width, data + child * width)) return;
-        byte_swap(data + parent * width, data + child * width, width);
-        child = parent;
-    }
-}
-
-template <typename Compare>
-inline void das_pop_heap_r(void *base, uint32_t nel, uint32_t width, Compare cmp)
-{
-    // Swap root with last, then sift_down over the reduced range [0..nel-2].
-    // Caller is expected to pop / drop the last slot after.
-    if (nel <= 1) return;
-    unsigned char *data = (unsigned char *)base;
-    byte_swap(data, data + (nel - 1u) * width, width);
-    das_sift_down_r(data, 0u, nel - 1u, width, cmp);
-}
-
-template <typename Compare>
-inline void das_nth_element_r(void *base, uint32_t nel, uint32_t n, uint32_t width, Compare cmp)
-{
-    if (nel <= 1u || n >= nel) return;
-    unsigned char *data = (unsigned char *)base;
-    uint32_t lo = 0u;
-    uint32_t hi = nel - 1u;
-    // Introselect depth bound: 2 * floor(log2(nel)). On bound exhaustion,
-    // fall back to smoothsort over the remaining range — O(n log n) but
-    // immune to adversarial quickselect inputs.
-    int depth_limit = 0;
-    for (uint32_t x = nel; x > 0u; x >>= 1) depth_limit += 2;
-    while (lo < hi) {
-        if (depth_limit-- <= 0) {
-            das_qsort_r(data + lo * width, hi - lo + 1u, width, cmp);
-            return;
-        }
-        // Small range: insertion sort and we're done. 16 is a common cutoff.
-        if (hi - lo < 16u) {
-            for (uint32_t i = lo + 1u; i <= hi; i++) {
-                for (uint32_t j = i; j > lo && cmp(data + j * width, data + (j - 1u) * width); j--) {
-                    byte_swap(data + j * width, data + (j - 1u) * width, width);
-                }
-            }
-            return;
-        }
-        // Median-of-3 pivot across lo, mid, hi; leaves median at `mid`.
-        uint32_t mid = lo + (hi - lo) / 2u;
-        unsigned char *plo  = data + lo  * width;
-        unsigned char *pmid = data + mid * width;
-        unsigned char *phi  = data + hi  * width;
-        if (cmp(pmid, plo)) byte_swap(plo, pmid, width);
-        if (cmp(phi,  plo)) byte_swap(plo, phi,  width);
-        if (cmp(phi,  pmid)) byte_swap(pmid, phi, width);
-        // Move pivot to `hi` and partition (Lomuto).
-        byte_swap(pmid, phi, width);
-        unsigned char *pivot = phi;
-        uint32_t i = lo;
-        for (uint32_t j = lo; j < hi; j++) {
-            if (cmp(data + j * width, pivot)) {
-                if (i != j) byte_swap(data + i * width, data + j * width, width);
-                i++;
-            }
-        }
-        byte_swap(data + i * width, pivot, width);
-        if (i == n) return;
-        if (i < n) lo = i + 1u;
-        else       hi = i - 1u;  // i > n implies i >= 1
-    }
-}
-
-template <typename Compare>
-inline void das_partial_sort_r(void *base, uint32_t nel, uint32_t n, uint32_t width, Compare cmp)
-{
-    if (nel <= 1u || n == 0u) return;
-    if (n > nel) n = nel;
-    if (n < nel) {
-        das_nth_element_r(base, nel, n, width, cmp);
-    }
-    das_qsort_r(base, n, width, cmp);
-}
-
-}
diff --git a/src/builtin/module_builtin_runtime_sort.cpp b/src/builtin/module_builtin_runtime_sort.cpp
index 3771fb1f1d..fad5e93f09 100644
--- a/src/builtin/module_builtin_runtime_sort.cpp
+++ b/src/builtin/module_builtin_runtime_sort.cpp
@@ -5,7 +5,7 @@
 #include "daScript/ast/ast_interop.h"
 #include "daScript/simulate/aot_builtin.h"
 #include "daScript/simulate/sim_policy.h"
-#include "das_qsort_r.h"
+#include "daScript/simulate/das_qsort_r.h"
 
 namespace das
 {
@@ -84,7 +84,7 @@ namespace das
     void builtin_sort_string ( void * data, int32_t length ) {
         if ( length<=1 ) return;
         const char ** pdata = (const char **) data;
-        sort ( pdata, pdata + length, [&](const char * a, const char * b){
+        das_sort ( pdata, pdata + length, [&](const char * a, const char * b){
             return strcmp(to_rts(a), to_rts(b))<0;
         });
     }
diff --git a/tests-cpp/small/test_sort_family.cpp b/tests-cpp/small/test_sort_family.cpp
index 0773a24275..8bf12cced5 100644
--- a/tests-cpp/small/test_sort_family.cpp
+++ b/tests-cpp/small/test_sort_family.cpp
@@ -1,4 +1,4 @@
-// Template-level tests for the sort family in src/builtin/das_qsort_r.h.
+// Template-level tests for the sort family in daScript/simulate/das_qsort_r.h.
 //
 // Covers:
 //   - das_qsort_r        (baseline — existed pre-Phase-0 but had no cpp test)
@@ -13,11 +13,12 @@
 
 #include <doctest/doctest.h>
 
-#include "../../src/builtin/das_qsort_r.h"
+#include "daScript/simulate/das_qsort_r.h"
 
 #include <algorithm>
 #include <cstdint>
 #include <cstring>
+#include <functional>
 #include <vector>
 
 using namespace das;
@@ -103,6 +104,124 @@ TEST_CASE("das_qsort_r baseline") {
         das_qsort_r(a.data(), a.size(), sizeof(int32_t), desc);
         CHECK(a[0] == 5); CHECK(a[1] == 4); CHECK(a[2] == 3); CHECK(a[3] == 2); CHECK(a[4] == 1);
     }
+    // The hybrid switches to block-partition when hi-lo >= 128. Small arrays
+    // above never enter that branch, so cover it explicitly.
+    SUBCASE("large random — exercises block-partition branch") {
+        std::vector<int32_t> a;
+        a.reserve(1024);
+        uint32_t s = 0xC0FFEEu;
+        for (size_t i = 0; i < 1024; i++) {
+            s = s * 1664525u + 1013904223u;
+            a.push_back(int32_t(s));
+        }
+        das_qsort_r(a.data(), a.size(), sizeof(int32_t), less_cmp<int32_t>());
+        CHECK(is_sorted_range(a.data(), a.size()));
+    }
+    SUBCASE("large duplicate-heavy — block-partition with few unique keys") {
+        std::vector<int32_t> a;
+        a.reserve(512);
+        uint32_t s = 0xDEADBEEFu;
+        for (size_t i = 0; i < 512; i++) {
+            s = s * 1664525u + 1013904223u;
+            a.push_back(int32_t(s % 8u));
+        }
+        das_qsort_r(a.data(), a.size(), sizeof(int32_t), less_cmp<int32_t>());
+        CHECK(is_sorted_range(a.data(), a.size()));
+    }
+    SUBCASE("large already-sorted — block-partition fast path") {
+        std::vector<int32_t> a;
+        a.reserve(512);
+        for (int32_t i = 0; i < 512; i++) a.push_back(i);
+        das_qsort_r(a.data(), a.size(), sizeof(int32_t), less_cmp<int32_t>());
+        CHECK(is_sorted_range(a.data(), a.size()));
+    }
+    SUBCASE("large reverse-sorted — block-partition") {
+        std::vector<int32_t> a;
+        a.reserve(512);
+        for (int32_t i = 511; i >= 0; i--) a.push_back(i);
+        das_qsort_r(a.data(), a.size(), sizeof(int32_t), less_cmp<int32_t>());
+        CHECK(is_sorted_range(a.data(), a.size()));
+    }
+    SUBCASE("large 16-byte struct — block-partition on Quad payload") {
+        std::vector<Quad> a;
+        a.reserve(512);
+        uint32_t s = 0xBADC0DEu;
+        for (size_t i = 0; i < 512; i++) {
+            s = s * 1664525u + 1013904223u;
+            a.push_back({int32_t(s), int32_t(s + 1), int32_t(s + 2), int32_t(s + 3)});
+        }
+        das_qsort_r(a.data(), a.size(), sizeof(Quad), less_quad());
+        // Sorted by key, and payload follows each key (key+1 == payload_a).
+        for (size_t i = 1; i < a.size(); i++) CHECK(a[i].key >= a[i-1].key);
+        for (size_t i = 0; i < a.size(); i++) CHECK(a[i].payload_a == a[i].key + 1);
+    }
+}
+
+// ============================================================================
+// das_sort<T> typed — production path used by aot.h typed-sort bindings.
+// Mirrors the byte-pointer block-partition coverage above on the typed path.
+// ============================================================================
+
+TEST_CASE("das_sort<T> typed") {
+    SUBCASE("int random") {
+        std::vector<int32_t> a = { 5, 2, 8, 1, 9, 3, 7, 4, 6, 0 };
+        das_sort(a.data(), a.data() + a.size(), std::less<int32_t>());
+        CHECK(is_sorted_range(a.data(), a.size()));
+    }
+    SUBCASE("large random — exercises typed block-partition branch") {
+        std::vector<int32_t> a;
+        a.reserve(1024);
+        uint32_t s = 0xC0FFEEu;
+        for (size_t i = 0; i < 1024; i++) {
+            s = s * 1664525u + 1013904223u;
+            a.push_back(int32_t(s));
+        }
+        das_sort(a.data(), a.data() + a.size(), std::less<int32_t>());
+        CHECK(is_sorted_range(a.data(), a.size()));
+    }
+    SUBCASE("large duplicate-heavy — typed block-partition with few unique keys") {
+        std::vector<int32_t> a;
+        a.reserve(512);
+        uint32_t s = 0xDEADBEEFu;
+        for (size_t i = 0; i < 512; i++) {
+            s = s * 1664525u + 1013904223u;
+            a.push_back(int32_t(s % 8u));
+        }
+        das_sort(a.data(), a.data() + a.size(), std::less<int32_t>());
+        CHECK(is_sorted_range(a.data(), a.size()));
+    }
+    SUBCASE("large already-sorted") {
+        std::vector<int32_t> a;
+        a.reserve(512);
+        for (int32_t i = 0; i < 512; i++) a.push_back(i);
+        das_sort(a.data(), a.data() + a.size(), std::less<int32_t>());
+        CHECK(is_sorted_range(a.data(), a.size()));
+    }
+    SUBCASE("large reverse-sorted") {
+        std::vector<int32_t> a;
+        a.reserve(512);
+        for (int32_t i = 511; i >= 0; i--) a.push_back(i);
+        das_sort(a.data(), a.data() + a.size(), std::less<int32_t>());
+        CHECK(is_sorted_range(a.data(), a.size()));
+    }
+    SUBCASE("large 16-byte struct by key field") {
+        std::vector<Quad> a;
+        a.reserve(512);
+        uint32_t s = 0xBADC0DEu;
+        auto less_by_key = [](const Quad & x, const Quad & y) { return x.key < y.key; };
+        for (size_t i = 0; i < 512; i++) {
+            s = s * 1664525u + 1013904223u;
+            a.push_back({int32_t(s), int32_t(s + 1), int32_t(s + 2), int32_t(s + 3)});
+        }
+        das_sort(a.data(), a.data() + a.size(), less_by_key);
+        for (size_t i = 1; i < a.size(); i++) CHECK(a[i].key >= a[i-1].key);
+        for (size_t i = 0; i < a.size(); i++) CHECK(a[i].payload_a == a[i].key + 1);
+    }
+    SUBCASE("descending comparator") {
+        std::vector<int32_t> a = { 1, 2, 3, 4, 5 };
+        das_sort(a.data(), a.data() + a.size(), std::greater<int32_t>());
+        CHECK(a[0] == 5); CHECK(a[1] == 4); CHECK(a[2] == 3); CHECK(a[3] == 2); CHECK(a[4] == 1);
+    }
 }
 
 // ============================================================================
diff --git a/tests/linq/test_linq_fold.das b/tests/linq/test_linq_fold.das
index e9b2cc41b5..f003a7b61c 100644
--- a/tests/linq/test_linq_fold.das
+++ b/tests/linq/test_linq_fold.das
@@ -800,9 +800,6 @@ def test_sum_accumulator(t : T?) {
         let arr <- [1, 2, 3, 4, 5]
         let s = _fold(each(arr).sum())
         t |> equal(15, s)
-        // parity vs _old_fold
-        let s_old = _old_fold(each(arr).sum())
-        t |> equal(s_old, s)
     }
     t |> run("sum: where filter") @(t : T?) {
         let arr <- [1, 2, 3, 4, 5]
@@ -838,8 +835,6 @@ def test_min_accumulator(t : T?) {
         let arr <- [5, 3, 8, 1, 4]
         let m = _fold(each(arr).min())
         t |> equal(1, m)
-        let m_old = _old_fold(each(arr).min())
-        t |> equal(m_old, m)
     }
     t |> run("min: where filter") @(t : T?) {
         let arr <- [5, 3, 8, 1, 4]
@@ -869,8 +864,6 @@ def test_max_accumulator(t : T?) {
         let arr <- [5, 3, 8, 1, 4]
         let m = _fold(each(arr).max())
         t |> equal(8, m)
-        let m_old = _old_fold(each(arr).max())
-        t |> equal(m_old, m)
     }
     t |> run("max: where filter") @(t : T?) {
         let arr <- [5, 3, 8, 1, 4]
@@ -900,8 +893,6 @@ def test_average_accumulator(t : T?) {
         let arr <- [2, 4, 6, 8]
         let a = _fold(each(arr).average())
         t |> equal(5.0lf, a)
-        let a_old = _old_fold(each(arr).average())
-        t |> equal(a_old, a)
     }
     t |> run("average: where filter") @(t : T?) {
         let arr <- [1, 2, 3, 4, 5]
@@ -927,8 +918,6 @@ def test_long_count_accumulator(t : T?) {
         let arr <- [1, 2, 3, 4, 5]
         let c = _fold(each(arr).long_count())
         t |> equal(5l, c)
-        let c_old = _old_fold(each(arr).long_count())
-        t |> equal(c_old, c)
     }
     t |> run("long_count: where filter") @(t : T?) {
         let arr <- [1, 2, 3, 4, 5]
@@ -954,8 +943,6 @@ def test_first_early_exit(t : T?) {
         let arr <- [7, 8, 9]
         let f = _fold(each(arr).first())
         t |> equal(7, f)
-        let f_old = _old_fold(each(arr).first())
-        t |> equal(f_old, f)
     }
     t |> run("first: where matches returns first match") @(t : T?) {
         let arr <- [1, 2, 3, 4, 5]
@@ -980,8 +967,6 @@ def test_first_or_default_early_exit(t : T?) {
         let arr <- [7, 8, 9]
         let f = _fold(each(arr).first_or_default(99))
         t |> equal(7, f)
-        let f_old = _old_fold(each(arr).first_or_default(99))
-        t |> equal(f_old, f)
     }
     t |> run("first_or_default: where matches returns first match") @(t : T?) {
         let arr <- [1, 2, 3, 4, 5]
@@ -1308,3 +1293,37 @@ def test_chained_non_workhorse_select(t : T?) {
         t |> equal(11, r)
     }
 }
+
+[test]
+def test_iterator_source_bare_order(t : T?) {
+    // Regression — splice path for bare order family on an iterator source. peel_each only
+    // unwraps `each(<array>)`; `each(<iterator-yielding source>)` leaves an iterator on top.
+    // Old planner unconditionally wrapped the emission with `to_sequence_move()` whenever
+    // `expr._type.isIterator`, which compile-errored because daslib's `order_*(iter,…)`
+    // overloads already return iterator and `to_sequence_move` is array-only.
+    t |> run("each(range) |> order_by — splice emits iter-returning overload") @(tt : T?) {
+        var query = _fold(each(range(0, 5))._order_by(-_))
+        // range(0,5) = [0,1,2,3,4]; ordered by -v = [4,3,2,1,0]
+        let expected = [4, 3, 2, 1, 0]
+        for (v, i in query, 0 .. 5) {
+            tt |> equal(expected[i], v)
+        }
+    }
+}
+
+[test]
+def test_top_n_mid_chain_iterator_source(t : T?) {
+    // Regression — `top_n_*` registered in linqCalls but always returns array regardless of
+    // input shape, so no `_to_array` variant exists. `fold_linq_default`'s first-position
+    // iterator-source rewrite must skip the `_to_array` suffix for these. Hits when the user
+    // hand-pipes `top_n_*(...)` mid-chain (it isn't a comprehension form, so this is the
+    // only way to land here, but the path is real and the bug was a compile error.)
+    t |> run("each(range) |> top_n_by |> _select — top_n_by stays unrewritten") @(tt : T?) {
+        var query = _fold(each(range(0, 10)) |> top_n_by(3, @@(v : int) => -v) |> _select(_ + 100))
+        // top_n_by smallest 3 by -v ⇒ largest 3 by v = [9, 8, 7]; + 100 ⇒ [109, 108, 107]
+        let expected = [109, 108, 107]
+        for (v, i in query, 0 .. 3) {
+            tt |> equal(expected[i], v)
+        }
+    }
+}
diff --git a/tests/linq/test_linq_fold_ast.das b/tests/linq/test_linq_fold_ast.das
index 7454d3a77b..1533d5e395 100644
--- a/tests/linq/test_linq_fold_ast.das
+++ b/tests/linq/test_linq_fold_ast.das
@@ -29,11 +29,6 @@ def target_where_select_fold() : array<int> {
     return <- [1, 2, 3, 4, 5]._where(_ > 3)._select(_ * 2)._fold()
 }
 
-[export, marker(no_coverage)]
-def target_select_where_fold() : array<int> {
-    return <- [1, 2, 3, 4, 5]._select(_ * 2)._where(_ > 6)._fold()
-}
-
 [export, marker(no_coverage)]
 def target_reverse_where_fold() : array<int> {
     return <- [1, 2, 3, 4, 5].to_sequence().reverse()._where(_ > 3).to_array()._fold()
@@ -54,318 +49,6 @@ def target_zip3_predicate_fold() : array<int> {
     return <- [1, 2, 3]._select(_ * 2).zip([10, 20, 30]._select(_ + 1), [100, 200, 300]._select(_ / 10), $(a, b, c : int) => a + b + c)._fold()
 }
 
-// `_old_fold` targets — used by retargeted AST tests that document the frozen comprehension contract.
-[export, marker(no_coverage)]
-def target_where_old_fold() : array<int> {
-    return <- [1, 2, 3, 4, 5]._where(_ > 3)._old_fold()
-}
-
-[export, marker(no_coverage)]
-def target_select_old_fold() : array<int> {
-    return <- [1, 2, 3, 4, 5]._select(_ * 2)._old_fold()
-}
-
-[export, marker(no_coverage)]
-def target_where_select_old_fold() : array<int> {
-    return <- [1, 2, 3, 4, 5]._where(_ > 3)._select(_ * 2)._old_fold()
-}
-
-[export, marker(no_coverage)]
-def target_select_where_old_fold() : array<int> {
-    return <- [1, 2, 3, 4, 5]._select(_ * 2)._where(_ > 6)._old_fold()
-}
-
-[export, marker(no_coverage)]
-def target_reverse_where_old_fold() : array<int> {
-    return <- [1, 2, 3, 4, 5].to_sequence().reverse()._where(_ > 3).to_array()._old_fold()
-}
-
-[export, marker(no_coverage)]
-def target_zip_old_fold() : array<tuple<int; int>> {
-    return <- [1, 2, 3]._select(_ * 2).zip([10, 20, 30]._select(_ + 1))._old_fold()
-}
-
-[export, marker(no_coverage)]
-def target_zip3_old_fold() : array<tuple<int; int; int>> {
-    return <- [1, 2, 3]._select(_ * 2).zip([10, 20, 30]._select(_ + 1), [100, 200, 300]._select(_ / 10))._old_fold()
-}
-
-[export, marker(no_coverage)]
-def target_zip3_predicate_old_fold() : array<int> {
-    return <- [1, 2, 3]._select(_ * 2).zip([10, 20, 30]._select(_ + 1), [100, 200, 300]._select(_ / 10), $(a, b, c : int) => a + b + c)._old_fold()
-}
-
-// ── Tests: _old_fold contract — comprehension emission (frozen baseline) ──
-// These tests retain the pre-rewrite AST shape that `_fold` used to emit.
-// `_fold` itself has diverged (Phase 2A loop planner); see test_*_fold_emits_loop
-// below for the current `_fold` shape contract. The pair documents the
-// comprehension-vs-loop split between the two macros.
-
-[test]
-def test_where_old_fold_produces_comprehension(t : T?) {
-    ast_gc_guard() {
-        var func = find_module_function_via_rtti(compiling_module(), @@target_where_old_fold)
-        t |> success(func != null, "should find target_where_old_fold")
-        if (func == null) return
-        // fold_where output: invoke($(var source) .. var pass_0 <- COMP; return <- pass_0 .., src)
-        var comp_expr : ExpressionPtr
-        var source_expr : ExpressionPtr
-        let r = qmatch_function(func) $() { // nolint:STYLE016
-            return <- invoke($(var source : array<int>) {
-                var pass_0 : array<int> <- $e(comp_expr)
-                return <- pass_0
-            }, $e(source_expr))
-        }
-        t |> success(r.matched, "should match fold invoke structure, error={int(r.error)}")
-        if (!r.matched) return
-        // Verify the captured expression is a comprehension with where
-        var resolved <- qm_resolve_comprehension(comp_expr)
-        t |> success(resolved != null, "inner expression should be a comprehension")
-        if (resolved == null) return
-        let ac = resolved as ExprArrayComprehension
-        t |> success(ac.exprWhere != null, "comprehension should have where clause")
-    }
-}
-
-[test]
-def test_where_old_fold_comprehension_pattern(t : T?) {
-    ast_gc_guard() {
-        var func = find_module_function_via_rtti(compiling_module(), @@target_where_old_fold)
-        if (func == null) return
-        // Match the full structure including comprehension pattern
-        var where_cond : ExpressionPtr
-        var source_expr : ExpressionPtr
-        let r = qmatch_function(func) $() { // nolint:STYLE016
-            return <- invoke($(var source : array<int>) {
-                var pass_0 : array<int> <- [for (it in source); it; where $e(where_cond)]
-                return <- pass_0
-            }, $e(source_expr))
-        }
-        t |> success(r.matched, "should match comprehension with where, error={int(r.error)}")
-    }
-}
-
-[test]
-def test_select_old_fold_produces_comprehension(t : T?) {
-    ast_gc_guard() {
-        var func = find_module_function_via_rtti(compiling_module(), @@target_select_old_fold)
-        t |> success(func != null, "should find target_select_old_fold")
-        if (func == null) return
-        var comp_expr : ExpressionPtr
-        var source_expr : ExpressionPtr
-        let r = qmatch_function(func) $() { // nolint:STYLE016
-            return <- invoke($(var source : array<int>) {
-                var pass_0 : array<int> <- $e(comp_expr)
-                return <- pass_0
-            }, $e(source_expr))
-        }
-        t |> success(r.matched, "should match fold structure, error={int(r.error)}")
-        if (!r.matched) return
-        var resolved <- qm_resolve_comprehension(comp_expr)
-        t |> success(resolved != null, "inner should be a comprehension")
-        if (resolved == null) return
-        let ac = resolved as ExprArrayComprehension
-        t |> success(ac.exprWhere == null, "select-only comprehension should have no where clause")
-    }
-}
-
-[test]
-def test_select_old_fold_comprehension_pattern(t : T?) {
-    ast_gc_guard() {
-        var func = find_module_function_via_rtti(compiling_module(), @@target_select_old_fold)
-        if (func == null) return
-        var select_expr : ExpressionPtr
-        var source_expr : ExpressionPtr
-        let r = qmatch_function(func) $() { // nolint:STYLE016
-            return <- invoke($(var source : array<int>) {
-                var pass_0 : array<int> <- [for (it in source); $e(select_expr)]
-                return <- pass_0
-            }, $e(source_expr))
-        }
-        t |> success(r.matched, "should match comprehension without where, error={int(r.error)}")
-        if (!r.matched) return
-        // Verify the select expression is a multiplication: it * 2
-        let r2 = qmatch(select_expr, it * 2)
-        t |> success(r2.matched, "select expression should be it * 2")
-    }
-}
-
-[test]
-def test_where_select_old_fold_comprehension(t : T?) {
-    ast_gc_guard() {
-        var func = find_module_function_via_rtti(compiling_module(), @@target_where_select_old_fold)
-        t |> success(func != null, "should find target_where_select_old_fold")
-        if (func == null) return
-        var select_expr : ExpressionPtr
-        var where_cond : ExpressionPtr
-        var source_expr : ExpressionPtr
-        let r = qmatch_function(func) $() { // nolint:STYLE016
-            return <- invoke($(var source : array<int>) {
-                var pass_0 : array<int> <- [for (it in source); $e(select_expr); where $e(where_cond)]
-                return <- pass_0
-            }, $e(source_expr))
-        }
-        t |> success(r.matched, "should match comprehension with where+select, error={int(r.error)}")
-        if (!r.matched) return
-        // Verify select is multiplication and where is comparison
-        let r_sel = qmatch(select_expr, it * 2)
-        t |> success(r_sel.matched, "select should be it * 2")
-        let r_where = qmatch(where_cond, it > 3)
-        t |> success(r_where.matched, "where should be it > 3")
-    }
-}
-
-[test]
-def test_select_where_old_fold_structure(t : T?) {
-    ast_gc_guard() {
-        var func = find_module_function_via_rtti(compiling_module(), @@target_select_where_old_fold)
-        t |> success(func != null, "should find target_select_where_old_fold")
-        if (func == null) return
-        // select_where fold produces an invoke with a lambda that has a for loop + if
-        // It is NOT a simple comprehension - verify the fold still happened
-        var inner_expr : ExpressionPtr
-        var source_expr : ExpressionPtr
-        let r = qmatch_function(func) $() { // nolint:STYLE016
-            return <- invoke($(var source : array<int>) {
-                var pass_0 : array<int> <- $e(inner_expr)
-                return <- pass_0
-            }, $e(source_expr))
-        }
-        t |> success(r.matched, "should match fold invoke structure, error={int(r.error)}")
-        if (!r.matched) return
-        // The inner expression should NOT be a comprehension (select_where uses a different strategy)
-        var resolved <- qm_resolve_comprehension(inner_expr)
-        t |> success(resolved == null, "select_where should not produce a simple comprehension")
-    }
-}
-
-[test]
-def test_reverse_where_old_fold_structure(t : T?) {
-    ast_gc_guard() {
-        var func = find_module_function_via_rtti(compiling_module(), @@target_reverse_where_old_fold)
-        t |> success(func != null, "should find target_reverse_where_old_fold")
-        if (func == null) return
-        // Multi-step fold: reverse_to_array + where comprehension
-        var body_expr : ExpressionPtr
-        let r = qmatch_function(func) $() {
-            return <- $e(body_expr)
-        }
-        t |> success(r.matched, "should have a return expression")
-        if (!r.matched) return
-        t |> success(body_expr is ExprInvoke, "fold should produce invoke wrapper")
-    }
-}
-
-[test]
-def test_zip_old_fold_structure(t : T?) {
-    ast_gc_guard() {
-        var func = find_module_function_via_rtti(compiling_module(), @@target_zip_old_fold)
-        t |> success(func != null, "should find target_zip_old_fold")
-        if (func == null) return
-        // zip fold recursively folds the second argument
-        var body_expr : ExpressionPtr
-        let r = qmatch_function(func) $() {
-            return <- $e(body_expr)
-        }
-        t |> success(r.matched, "should match return expression")
-        if (!r.matched) return
-        t |> success(body_expr is ExprInvoke, "fold should produce invoke wrapper")
-    }
-}
-
-// ── Behavioral verification (fold produces correct results) ────────────
-
-[test]
-def test_where_fold_result(t : T?) {
-    t |> run("where fold produces correct values") @(t : T?) {
-        let result <- target_where_fold()
-        t |> equal(length(result), 2)
-        t |> equal(result[0], 4)
-        t |> equal(result[1], 5)
-    }
-}
-
-[test]
-def test_select_fold_result(t : T?) {
-    t |> run("select fold produces correct values") @(t : T?) {
-        let result <- target_select_fold()
-        t |> equal(length(result), 5)
-        let expected = [2, 4, 6, 8, 10]
-        for (i, v in 0..5, result) {
-            t |> equal(expected[i], v)
-        }
-    }
-}
-
-[test]
-def test_where_select_fold_result(t : T?) {
-    t |> run("where_select fold produces correct values") @(t : T?) {
-        let result <- target_where_select_fold()
-        t |> equal(length(result), 2)
-        t |> equal(result[0], 8)
-        t |> equal(result[1], 10)
-    }
-}
-
-[test]
-def test_select_where_fold_result(t : T?) {
-    t |> run("select_where fold produces correct values") @(t : T?) {
-        let result <- target_select_where_fold()
-        t |> equal(length(result), 2)
-        t |> equal(result[0], 8)
-        t |> equal(result[1], 10)
-    }
-}
-
-[test]
-def test_zip_fold_result(t : T?) {
-    t |> run("zip fold produces correct values") @(t : T?) {
-        let result <- target_zip_fold()
-        t |> equal(length(result), 3)
-        t |> equal(result[0]._0, 2)
-        t |> equal(result[0]._1, 11)
-        t |> equal(result[1]._0, 4)
-        t |> equal(result[1]._1, 21)
-        t |> equal(result[2]._0, 6)
-        t |> equal(result[2]._1, 31)
-    }
-}
-
-// ── Tests: zip3 fold — all 3 subexpressions fold ──────────────────────
-
-[test]
-def test_zip3_old_fold_structure(t : T?) {
-    ast_gc_guard() {
-        var func = find_module_function_via_rtti(compiling_module(), @@target_zip3_old_fold)
-        t |> success(func != null, "should find target_zip3_old_fold")
-        if (func == null) return
-        // zip3 fold: all three sources should be folded into invoke wrappers
-        var body_expr : ExpressionPtr
-        let r = qmatch_function(func) $() {
-            return <- $e(body_expr)
-        }
-        t |> success(r.matched, "should match return expression")
-        if (!r.matched) return
-        t |> success(body_expr is ExprInvoke, "zip3 fold should produce invoke wrapper")
-    }
-}
-
-[test]
-def test_zip3_predicate_old_fold_structure(t : T?) {
-    ast_gc_guard() {
-        var func = find_module_function_via_rtti(compiling_module(), @@target_zip3_predicate_old_fold)
-        t |> success(func != null, "should find target_zip3_predicate_old_fold")
-        if (func == null) return
-        var body_expr : ExpressionPtr
-        let r = qmatch_function(func) $() {
-            return <- $e(body_expr)
-        }
-        t |> success(r.matched, "should match return expression")
-        if (!r.matched) return
-        t |> success(body_expr is ExprInvoke, "zip3 predicate fold should produce invoke wrapper")
-    }
-}
-
 [test]
 def test_zip3_fold_result(t : T?) {
     t |> run("zip3 fold produces correct tuples") @(t : T?) {
@@ -491,15 +174,6 @@ def target_long_count_shortcut_fold() : int64 {
     return _fold(each([1, 2, 3, 4, 5]).long_count())
 }
 
-[export, marker(no_coverage)]
-def target_select_where_sum_fall_through() : int {
-    // `_select |> _where |> sum` is select-then-where which Phase 2A/2B planner rejects
-    // (where-after-select is blocked on ExprRef2Value substitution). Should fall through
-    // unfolded — body is the raw call chain, not an invoke wrapper. Use array-source form
-    // (no each) so the unfolded chain stays in safe land.
-    return [1, 2, 3, 4, 5]._select(_ * 2)._where(_ > 4).sum()._fold()
-}
-
 // ── Targets for Phase-2B Ring 2 early-exit lane ────────────────────────
 
 [export, marker(no_coverage)]
@@ -533,13 +207,6 @@ def target_contains_fold() : bool {
     return _fold(each([1, 2, 3, 4, 5]).contains(3))
 }
 
-[export, marker(no_coverage)]
-def target_early_exit_select_where_fall_through() : bool {
-    // select-then-where-then-any is rejected by the planner (where-after-select).
-    // Falls through unfolded; body is raw chain. Array-source form for safety.
-    return [1, 2, 3, 4, 5]._select(_ * 2)._where(_ > 4).any()._fold()
-}
-
 // ── Tests: `_fold` Phase-2A loop emission ──────────────────────────────
 // Phase-2A `_fold` emits explicit for-loops inside an `invoke($block, $src)` wrapper
 // (no `ExprArrayComprehension` nodes). Each test asserts the invoke wrapper exists
@@ -656,22 +323,8 @@ def test_count_fold_emits_counter(t : T?) {
     }
 }
 
-[test]
-def test_select_where_fold_falls_through(t : T?) {
-    ast_gc_guard() {
-        var func = find_module_function_via_rtti(compiling_module(), @@target_select_where_fold)
-        if (func == null) return
-        // _select |> _where is out of Phase 2A scope (where-after-select) — chain falls
-        // through unfolded. The function body is the raw `where_(select(...), ...)` call,
-        // NOT a generated invoke wrapper.
-        var body_expr : ExpressionPtr
-        let r = qmatch_function(func) $() {
-            return <- $e(body_expr)
-        }
-        t |> success(r.matched, "should have return expression")
-        t |> success(!(body_expr is ExprInvoke), "select_where should fall through unfolded (no invoke wrapper)")
-    }
-}
+// `select |> where` array-lane coverage is in the Phase 3d section below
+// (target_select_where_to_array_splices_fold / test_select_where_*_emits_fused_loop).
 
 // ── Behavioral parity: results of new shapes ───────────────────────────
 
@@ -1218,22 +871,8 @@ def test_long_count_length_shortcut(t : T?) {
     }
 }
 
-[test]
-def test_accumulator_falls_through_on_select_where(t : T?) {
-    ast_gc_guard() {
-        // `select |> where_ |> sum` is rejected by the planner (select-then-where blocker).
-        // Should fall through unfolded — body is the raw call chain, not an invoke wrapper.
-        var func = find_module_function_via_rtti(compiling_module(), @@target_select_where_sum_fall_through)
-        if (func == null) return
-        var body_expr : ExpressionPtr
-        let r = qmatch_function(func) $() {
-            return $e(body_expr)
-        }
-        t |> success(r.matched, "should have return expression")
-        t |> success(!(body_expr is ExprInvoke),
-            "select|where|sum should fall through unfolded (no invoke wrapper)")
-    }
-}
+// `select |> where |> sum` accumulator-lane coverage is in the Phase 3d section below
+// (target_select_where_sum_splices_fold / test_select_where_count_emits_fused_loop pattern).
 
 // ── Phase 2B Ring 2 — early-exit lane shape assertions ─────────────────
 
@@ -1333,20 +972,8 @@ def test_contains_loop_shape(t : T?) {
     }
 }
 
-[test]
-def test_early_exit_falls_through_on_select_where(t : T?) {
-    ast_gc_guard() {
-        var func = find_module_function_via_rtti(compiling_module(), @@target_early_exit_select_where_fall_through)
-        if (func == null) return
-        var body_expr : ExpressionPtr
-        let r = qmatch_function(func) $() {
-            return $e(body_expr)
-        }
-        t |> success(r.matched, "should have return expression")
-        t |> success(!(body_expr is ExprInvoke),
-            "select|where|any should fall through unfolded (no invoke wrapper)")
-    }
-}
+// `select |> where |> any` early-exit-lane coverage is in the Phase 3d section below
+// (target_select_where_any_splices_fold).
 
 // ── Phase 2C take/skip splice ──────────────────────────────────────────
 
@@ -1470,9 +1097,10 @@ def test_take_sum_splices_in_accumulator(t : T?) {
 }
 
 [test]
-def test_order_by_take_falls_through(t : T?) {
-    // order_by + take is BufferTopN territory — recognized as buffer-required, planner returns
-    // null, falls through to plain linq. Future PR replaces with a dedicated emit path.
+def test_order_by_take_cascades_to_tier2(t : T?) {
+    // `order_by |> take` — Phase 3 wires a BufferTopN splice arm via top_n_by.
+    // Until then, the splice planner returns null and the cascade falls to tier 2
+    // (fold_linq_default), which emits the array-shape invoke wrapper.
     ast_gc_guard() {
         var func = find_module_function_via_rtti(compiling_module(), @@target_order_by_take_falls_through)
         if (func == null) return
@@ -1481,14 +1109,16 @@ def test_order_by_take_falls_through(t : T?) {
             return $e(body_expr)
         }
         t |> success(r.matched, "should have return expression")
-        t |> success(!(body_expr is ExprInvoke),
-            "order_by chain should fall through unfolded — splice planner doesn't handle buffer mode yet")
+        t |> success(body_expr is ExprInvoke,
+            "order_by |> take should cascade to fold_linq_default (invoke wrapper)")
     }
 }
 
 [test]
-def test_distinct_falls_through(t : T?) {
-    // distinct is buffer-required (hash set) — fall-through marker.
+def test_distinct_cascades_to_tier2(t : T?) {
+    // `distinct` is buffer-required (hash set) — splice planner returns null.
+    // Cascade falls to tier 2 (fold_linq_default), which emits the array-shape
+    // invoke wrapper. BufferDistinct splice mode is deferred to a later PR.
     ast_gc_guard() {
         var func = find_module_function_via_rtti(compiling_module(), @@target_distinct_falls_through)
         if (func == null) return
@@ -1497,8 +1127,242 @@ def test_distinct_falls_through(t : T?) {
             return $e(body_expr)
         }
         t |> success(r.matched, "should have return expression")
-        t |> success(!(body_expr is ExprInvoke),
-            "distinct chain should fall through unfolded — BufferDistinct mode not yet implemented")
+        t |> success(body_expr is ExprInvoke,
+            "distinct chain should cascade to fold_linq_default (invoke wrapper)")
+    }
+}
+
+// ── Phase 3 — order-family splice arms (plan_order_family) ─────────────
+
+[export, marker(no_coverage)]
+def target_order_by_take_splices_fold() : array<int> {
+    return <- _fold(each([10, 20, 5, 8, 30, 15, 2, 25])._order_by(_).take(3).to_array())
+}
+
+[export, marker(no_coverage)]
+def target_order_by_descending_take_splices_fold() : array<int> {
+    return <- _fold(each([10, 20, 5, 8, 30, 15, 2, 25])._order_by_descending(_).take(3).to_array())
+}
+
+[export, marker(no_coverage)]
+def target_bare_order_by_splices_fold() : array<int> {
+    return <- _fold(each([10, 20, 5, 8, 30])._order_by(_).to_array())
+}
+
+[export, marker(no_coverage)]
+def target_where_order_by_splices_fold() : array<int> {
+    return <- _fold(each([10, 20, 5, 8, 30, 15, 2, 25])._where(_ > 5)._order_by(_).to_array())
+}
+
+[export, marker(no_coverage)]
+def target_where_order_by_take_splices_fold() : array<int> {
+    return <- _fold(each([10, 20, 5, 8, 30, 15, 2, 25])._where(_ > 5)._order_by(_).take(2).to_array())
+}
+
+[test]
+def test_order_by_take_emits_top_n_by(t : T?) {
+    // `order_by |> take(K)` splices via plan_order_family to a direct top_n_by(src, K, key) call.
+    // No invoke wrapper, no order_by call in the emission.
+    ast_gc_guard() {
+        var func = find_module_function_via_rtti(compiling_module(), @@target_order_by_take_splices_fold)
+        if (func == null) return
+        var body_expr : ExpressionPtr
+        let r = qmatch_function(func) $() {
+            return <- $e(body_expr)
+        }
+        t |> success(r.matched, "should have return expression")
+        t |> success(count_call(body_expr, "top_n_by") >= 1, "should emit a top_n_by call")
+        t |> equal(0, count_call(body_expr, "order_by"), "should not emit order_by")
+        t |> equal(0, count_call(body_expr, "take"), "should not emit take")
+    }
+}
+
+[test]
+def test_order_by_descending_take_emits_top_n_by_descending(t : T?) {
+    ast_gc_guard() {
+        var func = find_module_function_via_rtti(compiling_module(),
+            @@target_order_by_descending_take_splices_fold)
+        if (func == null) return
+        var body_expr : ExpressionPtr
+        let r = qmatch_function(func) $() {
+            return <- $e(body_expr)
+        }
+        t |> success(r.matched, "should have return expression")
+        t |> success(count_call(body_expr, "top_n_by_descending") >= 1,
+            "should emit a top_n_by_descending call")
+        t |> equal(0, count_call(body_expr, "order_by_descending"),
+            "should not emit order_by_descending")
+    }
+}
+
+[test]
+def test_bare_order_by_emits_direct_call(t : T?) {
+    // Bare `order_by(key)` splices to a direct order_by call (no invoke wrapper, no top_n).
+    ast_gc_guard() {
+        var func = find_module_function_via_rtti(compiling_module(), @@target_bare_order_by_splices_fold)
+        if (func == null) return
+        var body_expr : ExpressionPtr
+        let r = qmatch_function(func) $() {
+            return <- $e(body_expr)
+        }
+        t |> success(r.matched, "should have return expression")
+        t |> success(count_call(body_expr, "order_by") >= 1, "should emit an order_by call")
+        t |> equal(0, count_call(body_expr, "top_n_by"), "should not emit top_n_by")
+    }
+}
+
+[test]
+def test_where_order_by_emits_fused_loop(t : T?) {
+    // `where |> order_by` splices into a fused prefilter loop + order_by_inplace on the buffer.
+    ast_gc_guard() {
+        var func = find_module_function_via_rtti(compiling_module(), @@target_where_order_by_splices_fold)
+        if (func == null) return
+        var body_expr : ExpressionPtr
+        let r = qmatch_function(func) $() {
+            return <- $e(body_expr)
+        }
+        t |> success(r.matched && body_expr is ExprInvoke, "expected invoke wrapper for fused emission")
+        t |> equal(1, count_inner_for_loops(body_expr), "single fused prefilter loop")
+        t |> success(count_call(body_expr, "order_by_inplace") >= 1, "should call order_by_inplace on the buffer")
+        t |> success(count_call(body_expr, "push_clone") >= 1, "should push_clone into the buffer")
+    }
+}
+
+[test]
+def test_where_order_by_take_emits_fused_top_n(t : T?) {
+    // `where |> order_by |> take(K)` splices into a fused prefilter loop + top_n_by on the buffer.
+    ast_gc_guard() {
+        var func = find_module_function_via_rtti(compiling_module(),
+            @@target_where_order_by_take_splices_fold)
+        if (func == null) return
+        var body_expr : ExpressionPtr
+        let r = qmatch_function(func) $() {
+            return <- $e(body_expr)
+        }
+        t |> success(r.matched && body_expr is ExprInvoke, "expected invoke wrapper for fused emission")
+        t |> equal(1, count_inner_for_loops(body_expr), "single fused prefilter loop")
+        t |> success(count_call(body_expr, "top_n_by") >= 1, "should call top_n_by on the buffer")
+        t |> success(count_call(body_expr, "push_clone") >= 1, "should push_clone into the buffer")
+    }
+}
+
+// ── Phase 3 — runtime parity for order-family splice arms ──────────────
+
+[test]
+def test_order_by_take_correct_result(t : T?) {
+    t |> run("order_by + take splice produces N smallest ascending") @(t : T?) {
+        let got <- target_order_by_take_splices_fold()
+        let expected = [2, 5, 8]
+        t |> equal(length(got), length(expected))
+        for (v, e in got, expected) {
+            t |> equal(v, e)
+        }
+    }
+}
+
+[test]
+def test_order_by_descending_take_correct_result(t : T?) {
+    t |> run("order_by_descending + take splice produces N largest descending") @(t : T?) {
+        let got <- target_order_by_descending_take_splices_fold()
+        let expected = [30, 25, 20]
+        t |> equal(length(got), length(expected))
+        for (v, e in got, expected) {
+            t |> equal(v, e)
+        }
+    }
+}
+
+[test]
+def test_where_order_by_take_correct_result(t : T?) {
+    t |> run("where + order_by + take splice prefilters then takes top N") @(t : T?) {
+        let got <- target_where_order_by_take_splices_fold()
+        // src = [10, 20, 5, 8, 30, 15, 2, 25], where _>5 → [10, 20, 8, 30, 15, 25]
+        // order_by(_).take(2) → [8, 10]
+        let expected = [8, 10]
+        t |> equal(length(got), length(expected))
+        for (v, e in got, expected) {
+            t |> equal(v, e)
+        }
+    }
+}
+
+// ── Phase 3d — select + where splice via replaceVariablePeeling ────────
+
+[export, marker(no_coverage)]
+def target_select_where_count_splices_fold() : int {
+    return _fold(each([1, 2, 3, 4, 5, 6, 7, 8])._select(_ * 2)._where(_ > 5).count())
+}
+
+[export, marker(no_coverage)]
+def target_select_where_sum_splices_fold() : int {
+    return _fold(each([1, 2, 3, 4, 5])._select(_ * 10)._where(_ > 15).sum())
+}
+
+[export, marker(no_coverage)]
+def target_select_where_any_splices_fold() : bool {
+    return _fold(each([1, 2, 3])._select(_ * 100)._where(_ > 50).any())
+}
+
+[export, marker(no_coverage)]
+def target_select_where_to_array_splices_fold() : array<int> {
+    return <- _fold(each([1, 2, 3, 4, 5])._select(_ * 2)._where(_ > 4).to_array())
+}
+
+[test]
+def test_select_where_count_emits_fused_loop(t : T?) {
+    // select |> where |> count: peel-substitution inlines projection into pred; loop emits
+    // counter increments. NOT tier 2 (no pass_0/pass_1 binding shape).
+    ast_gc_guard() {
+        var func = find_module_function_via_rtti(compiling_module(),
+            @@target_select_where_count_splices_fold)
+        if (func == null) return
+        var body_expr : ExpressionPtr
+        let r = qmatch_function(func) $() {
+            return $e(body_expr)
+        }
+        t |> success(r.matched && body_expr is ExprInvoke, "expected invoke wrapper for splice")
+        t |> equal(1, count_inner_for_loops(body_expr), "single fused for-loop (not pass_0/pass_1 chain)")
+        // tier 2 fold_linq_default would emit `var pass_0 = ...; var pass_1 = ...`.
+        // tier 1 splice emits just the loop body + accumulator.
+        t |> equal(0, count_call(body_expr, "where_"), "where_ should be inlined, not called")
+        t |> equal(0, count_call(body_expr, "select"), "select should be inlined, not called")
+    }
+}
+
+[test]
+def test_select_where_count_correct_result(t : T?) {
+    t |> run("select(x*2) |> where(>5) |> count") @(t : T?) {
+        // src = [1..8], projected = [2,4,6,8,10,12,14,16], filtered >5 = [6,8,10,12,14,16], count = 6
+        t |> equal(6, target_select_where_count_splices_fold())
+    }
+}
+
+[test]
+def test_select_where_sum_correct_result(t : T?) {
+    t |> run("select(x*10) |> where(>15) |> sum") @(t : T?) {
+        // src = [1..5], projected = [10,20,30,40,50], filtered >15 = [20,30,40,50], sum = 140
+        t |> equal(140, target_select_where_sum_splices_fold())
+    }
+}
+
+[test]
+def test_select_where_any_correct_result(t : T?) {
+    t |> run("select(x*100) |> where(>50) |> any") @(t : T?) {
+        // src = [1,2,3], projected = [100,200,300], filtered >50 = [100,200,300], any = true
+        t |> equal(true, target_select_where_any_splices_fold())
+    }
+}
+
+[test]
+def test_select_where_to_array_correct_result(t : T?) {
+    t |> run("select(x*2) |> where(>4) |> to_array") @(t : T?) {
+        // src = [1..5], projected = [2,4,6,8,10], filtered >4 = [6,8,10]
+        let got <- target_select_where_to_array_splices_fold()
+        let expected = [6, 8, 10]
+        t |> equal(length(got), length(expected))
+        for (v, e in got, expected) {
+            t |> equal(v, e)
+        }
     }
 }
 
diff --git a/tests/linq/test_linq_sorting.das b/tests/linq/test_linq_sorting.das
index 9c43862f39..288f59f2b1 100644
--- a/tests/linq/test_linq_sorting.das
+++ b/tests/linq/test_linq_sorting.das
@@ -463,6 +463,118 @@ def test_top_n_by(t : T?) {
     }
 }
 
+[test]
+def test_top_n_by_descending(t : T?) {
+    t |> run("top_n_by_descending on array source — N largest descending") @(t : T?) {
+        let src <- [10, 20, 5, 8, 30, 15, 2, 25]
+        let got <- top_n_by_descending(src, 3, @@(v : int -&) => v)
+        let expected = [30, 25, 20]
+        t |> equal(length(got), length(expected))
+        for (v, e in got, expected) {
+            t |> equal(v, e)
+        }
+    }
+    t |> run("top_n_by_descending on iterator source — bounded min-heap") @(t : T?) {
+        let src <- [10, 20, 5, 8, 30, 15, 2, 25]
+        let got <- top_n_by_descending(src.to_sequence(), 3, @@(v : int -&) => v)
+        let expected = [30, 25, 20]
+        t |> equal(length(got), length(expected))
+        for (v, e in got, expected) {
+            t |> equal(v, e)
+        }
+    }
+    t |> run("top_n_by_descending N=0 yields empty") @(t : T?) {
+        let src <- [10, 20, 5]
+        let got <- top_n_by_descending(src, 0, @@(v : int -&) => v)
+        t |> equal(length(got), 0)
+    }
+    t |> run("top_n_by_descending N<0 yields empty") @(t : T?) {
+        let src <- [10, 20, 5]
+        let got <- top_n_by_descending(src, -1, @@(v : int -&) => v)
+        t |> equal(length(got), 0)
+    }
+    t |> run("top_n_by_descending N > length yields all elements sorted descending") @(t : T?) {
+        let src <- [10, 20, 5]
+        let got <- top_n_by_descending(src, 100, @@(v : int -&) => v)
+        let expected = [20, 10, 5]
+        t |> equal(length(got), length(expected))
+        for (v, e in got, expected) {
+            t |> equal(v, e)
+        }
+    }
+    t |> run("top_n_by_descending N=1 picks single maximum") @(t : T?) {
+        let src <- [7, 3, 1, 9, 5]
+        let got <- top_n_by_descending(src, 1, @@(v : int -&) => v)
+        t |> equal(length(got), 1)
+        t |> equal(got[0], 9)
+    }
+    t |> run("top_n_by_descending on iterator source — empty") @(t : T?) {
+        let src : array<int>
+        let got <- top_n_by_descending(src.to_sequence(), 3, @@(v : int -&) => v)
+        t |> equal(length(got), 0)
+    }
+    t |> run("top_n_by_descending on user struct by field key") @(t : T?) {
+        let src <- [
+            ComplexType(a = [10, 0]),
+            ComplexType(a = [5, 0]),
+            ComplexType(a = [25, 0]),
+            ComplexType(a = [2, 0]),
+            ComplexType(a = [15, 0])
+        ]
+        let got <- top_n_by_descending(src, 3, @@(c : ComplexType) => c.a[0])
+        t |> equal(length(got), 3)
+        t |> equal(got[0].a[0], 25)
+        t |> equal(got[1].a[0], 15)
+        t |> equal(got[2].a[0], 10)
+    }
+    t |> run("top_n_by_descending parity vs sort+take direct reference") @(t : T?) {
+        let src <- [42, 7, 13, 88, 21, 4, 99, 56, 33, 71]
+        let got <- top_n_by_descending(src, 4, @@(v : int -&) => v)
+        // hand-rolled reference: clone, sort descending, take first 4
+        var refSorted : array<int>
+        refSorted := src
+        sort(refSorted, $(a, b) => a > b)
+        t |> equal(length(got), 4)
+        for (i in 0..4) {
+            t |> equal(got[i], refSorted[i])
+        }
+    }
+}
+
+[test]
+def test_top_n_descending(t : T?) {
+    t |> run("top_n_descending on array source — N largest descending") @(t : T?) {
+        let src <- [10, 20, 5, 8, 30, 15, 2, 25]
+        let got <- top_n_descending(src, 3)
+        let expected = [30, 25, 20]
+        t |> equal(length(got), length(expected))
+        for (v, e in got, expected) {
+            t |> equal(v, e)
+        }
+    }
+    t |> run("top_n_descending on iterator source") @(t : T?) {
+        let src <- [10, 20, 5, 8, 30, 15, 2, 25]
+        let got <- top_n_descending(src.to_sequence(), 3)
+        let expected = [30, 25, 20]
+        t |> equal(length(got), length(expected))
+        for (v, e in got, expected) {
+            t |> equal(v, e)
+        }
+    }
+    t |> run("top_n_descending N=0 yields empty") @(t : T?) {
+        let src <- [10, 20, 5]
+        let got <- top_n_descending(src, 0)
+        t |> equal(length(got), 0)
+    }
+    t |> run("top_n_descending on floats") @(t : T?) {
+        let src <- [3.14, 1.41, 2.71, 0.57, 1.61]
+        let got <- top_n_descending(src, 2)
+        t |> equal(length(got), 2)
+        t |> equal(got[0], 3.14)
+        t |> equal(got[1], 2.71)
+    }
+}
+
 [test]
 def test_heap_ops(t : T?) {
     t |> run("make_heap → max at root") @(t : T?) {
diff --git a/utils/daspkg/commands.das b/utils/daspkg/commands.das
index d6c26b4367..ef0a4f2fc0 100644
--- a/utils/daspkg/commands.das
+++ b/utils/daspkg/commands.das
@@ -985,7 +985,7 @@ def build_package(pkg_dir : string) : bool {
     }
 
     // Build
-    exit_code = run_cmd("cmake --build \"{build_dir}\" --config Release", output)
+    exit_code = run_cmd("cmake --build \"{build_dir}\" --config Release --parallel", output)
     if (exit_code != 0) {
         log("  CMake build FAILED:\n{output}\n")
         return false