diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
index 8e0d75996..4d591c0e6 100644
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -202,6 +202,17 @@ jobs:
     - name: "Install CMake and Ninja"
       uses: lukka/get-cmake@latest
 
+    - if: runner.os == 'Windows'
+      uses: ilammy/setup-nasm@v1 # need nasm for openssl
+
+    - name: "Install openssl windows"
+      if: runner.os == 'Windows'
+      run: |
+        git clone https://github.com/microsoft/vcpkg && ./vcpkg/bootstrap-vcpkg.sh
+        ./vcpkg/vcpkg install openssl:${{ matrix.architecture == 32 && 'x86' || 'x64' }}-windows --binarycaching
+        echo "VCPKG_ROOT=$(pwd)/vcpkg" >> $GITHUB_ENV
+        echo "CMAKE_TOOLCHAIN_FILE=$(pwd)/vcpkg/scripts/buildsystems/vcpkg.cmake" >> $GITHUB_ENV
+
     - name: "Install: Required Dev Packages"
       run: |
         set -eux
@@ -259,11 +270,13 @@ jobs:
                 ninja
                 ;;
               windows32)
-                cmake --no-warn-unused-cli -B./build -G "${{ matrix.cmake_generator }}" -T host=x64 -A ${{ matrix.architecture_string }}
+                export PATH="/c/Strawberry/perl/bin:$PATH" # prepend Strawberry perl to path, so openssl will use it.
+                cmake --no-warn-unused-cli -B./build -G "${{ matrix.cmake_generator }}" -T host=x64 -A ${{ matrix.architecture_string }} -DCMAKE_TOOLCHAIN_FILE="$VCPKG_ROOT/scripts/buildsystems/vcpkg.cmake"
                 cmake --build ./build --config ${{ matrix.cmake_preset }} --parallel
                 ;;
               windows64)
-                cmake --no-warn-unused-cli -B./build -G "${{ matrix.cmake_generator }}" -T host=x64 -A ${{ matrix.architecture_string }} -DDAS_LLVM_DISABLED=${{ env.das_llvm_disabled }}
+                export PATH="/c/Strawberry/perl/bin:$PATH" # prepend Strawberry perl to path, so openssl will use it.
+                cmake --no-warn-unused-cli -B./build -G "${{ matrix.cmake_generator }}" -T host=x64 -A ${{ matrix.architecture_string }} -DDAS_LLVM_DISABLED=${{ env.das_llvm_disabled }} -DCMAKE_TOOLCHAIN_FILE="$VCPKG_ROOT/scripts/buildsystems/vcpkg.cmake"
                 cmake --build ./build --config ${{ matrix.cmake_preset }} --parallel
                 ;;
               linux_arm*)
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 08dbe5aed..07e219872 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -16,7 +16,7 @@ set(CMAKE_CXX_STANDARD_REQUIRED ON)
 option(DAS_FLEX_BISON_DISABLED "Disable FLEX/BISON stage" OFF)
 option(DAS_CLANG_BIND_DISABLED "Disable dasClangBind (libclang bindings, C/C++ parsing)" ON)
 option(DAS_LLVM_DISABLED "Disable dasLLVM (llvm bindings)" ON)
-option(DAS_HV_DISABLED "Disable dasHV (websokets,http server and client)" ON)
+option(DAS_HV_DISABLED "Disable dasHV (websokets,http server and client)" OFF)
 option(DAS_GLFW_DISABLED "Disable dasGLFW (GLFW window for graphics apps)" OFF)
 option(DAS_AUDIO_DISABLED "Disable dasAudio (Miniaudio sound library)" OFF)
 option(DAS_STDDLG_DISABLED "Disable dasStdDlg (File new,open,save etc dialogs)" OFF)
diff --git a/benchmarks/sql/LINQ.md b/benchmarks/sql/LINQ.md
index 7a84d4b20..cfc9ba295 100644
--- a/benchmarks/sql/LINQ.md
+++ b/benchmarks/sql/LINQ.md
@@ -280,6 +280,26 @@ The order+take rows (`order_take_desc`, `sort_take`, `select_where_order_take`)
 
 `select_where_count` is the first **select+where** splice landing: previously rejected by the planner (the typer-inserted `ExprRef2Value` wrapper around `it` orphaned during substitution → `30921: can only dereference a reference`). The new `replaceVariablePeeling` helper in `templates_boost.das` peels the wrapper as part of the substitution, mirroring `ast_match`'s `qm_peel_ref2value`. All four terminator lanes (array / counter / accumulator / early-exit) covered.
 
+### Headline benchmarks (100K rows, INTERP, single-eval splice PR)
+
+Follow-up to the order-family + select+where landing. Closes two single-eval gaps the previous PR documented as `KNOWN PERF GAP`:
+
+**Gap 1 — comparator key double-call inside `partial_sort`.** Before this PR, `top_n_by(arr, K, key)` ran `_::less(key(v1), key(v2))` per comparison → 2 indirect lambda dispatches per `cmp`. For pure single-expression keys (the common case, e.g. `$(_) => _.price`), the planner now inlines the key body twice into a comparator block and dispatches to the new `top_n_by_with_cmp(arr, K) <| <cmp_block>` library entry — zero per-comparison lambda dispatch. Descending direction is encoded by flipping the comparator arg order (`less(body[v2], body[v1])`), eliminating the secondary wrapper-lambda the `_descending` family used. Falls back to keyed `top_n_by` when the key has side effects or isn't a single-expression lambda.
+
+**Gap 2 — projection double-eval in `select + where + terminator`.** Phase 3d inlined `projection` into `predicate` via peel-substitution; lane emitters then *also* cloned `projection` into `valueExpr` → projection evaluated twice per element on ARRAY / ACCUMULATOR / EARLY_EXIT lanes (COUNTER unaffected — no body use). Fix: the where-after-select arm now binds `projection` to a fresh local via a new `preConditionStmts` slot (evaluated per-element, OUTSIDE the if-wrap), then rewrites `projection` to reference that bind. Both predicate (via peel) and valueExpr (via clone) share the single eval. Side-effecty projections still bail to tier 2 (moving them outside the if would visibly fire side effects on filter-rejected elements). COUNTER lane is explicitly excluded — the dedup has no benefit there and the bind decl would regress the single-stmt fast path.
+
+| Benchmark | Shape | m1 (sql) | m3 (linq) | m3f (prev PR) | m3f (this PR) | Win |
+|---|---|---:|---:|---:|---:|---:|
+| sort_take | `order_by → take(K)` | 38 | 710 | 56 | **27** | **2.1× over prev / 26× over m3 / faster than m1 SQL** |
+| order_take_desc | `order_by_desc → take(K)` | 38 | 704 | 56 | **27** | **2.1× over prev / 26× over m3 / faster than m1 SQL** |
+| select_where_order_take | `where → order_by → take(K)` | 36 | 356 | 39 | **24** | **1.6× over prev / 15× over m3 / faster than m1 SQL** |
+| select_where_sum (NEW Gap 2) | `select → where → sum` | 37 | 59 | — | **7** | **8.4× over m3 / 5.3× over m1 SQL** |
+| select_where_count (regression check) | `select → where → count` (COUNTER, dedup off) | 32 | 58 | 5 | **5** | unchanged (correctly excluded from dedup) |
+
+The sort/order rows now BEAT `m1` SQLite by ~30%. PR #2707 closed the comparator-throughput gap vs SQL; this PR's inline-key splice closes the per-comparison lambda-dispatch gap.
+
+**Parser bonus:** the multi-arg `$($i(a) : T, $i(b) : T) { ... }` qmacro form failed to parse with `30701: block argument is already declared MACRO``TAG` because the parser stamped every `$i(...)` in block-arg position with the literal placeholder name and dup-checked them before macro tag resolution. Fixed in `src/parser/parser_impl.cpp:885` by skipping the dup check when `name_at.tag != nullptr` (genuine post-resolution dups surface as ordinary local-lookup conflicts during type inference). General-purpose fix — usable by any macro that needs to emit a typed block with N tagged-name args.
+
 ## Operator-coverage checklist (parity tests)
 
 The 24 benchmarks above cover the most common shapes. The end-game target is one benchmark per `_fold`-applicable scenario in the broader `tests/linq/` operator suite. Tracking the long-tail coverage below; PRs that add splice support for new operators should add a benchmark here if not already present.
diff --git a/benchmarks/sql/select_where_sum.das b/benchmarks/sql/select_where_sum.das
new file mode 100644
index 000000000..0f693c9b2
--- /dev/null
+++ b/benchmarks/sql/select_where_sum.das
@@ -0,0 +1,69 @@
+options gen2
+options persistent_heap
+
+require _common public
+
+let THRESHOLD = 1000
+
+// _select(_.price * 2) |> _where(_ > T) |> sum — where-after-select pattern with the
+// sum terminator. SQL: SELECT SUM(price * 2) FROM Cars WHERE price * 2 > T.
+//
+// m3 (plain LINQ) materializes a projection iterator, then a filter array, then sums.
+// m3f (spliced via Phase 3d single-eval) binds the projection to a local once per
+// element OUTSIDE the if-wrap, then the rewritten predicate reads the bind AND the
+// sum's valueExpr reads the bind — projection (`_.price * 2`) evaluates exactly once
+// per element across the splice. Without the dedup it evaluates twice (once in the
+// inlined predicate, once in valueExpr) for ARRAY/ACCUMULATOR/EARLY_EXIT lanes.
+//
+// m1 uses ``query_scalar`` with raw SQL because ``_sql``'s ``_select`` clause only
+// accepts a single column or named-tuple shape — arbitrary scalar expressions like
+// ``_.price * 2`` aren't representable in the typed DSL. The engine still folds the
+// projection into both the WHERE filter and the SUM, so per-row work is identical to
+// what the typed form would emit.
+
+def run_m1(b : B?; n : int) { // m1 variant: raw SQL via query_scalar on an in-memory SQLite db
+    with_sqlite(":memory:") $(db) {
+        fixture_db(db, n) // populated outside the `run` block (presumably n rows — confirm in _common)
+        b |> run("m1_sql/{n}", n) {
+            let s = db |> query_scalar("SELECT SUM(price * 2) FROM Cars WHERE price * 2 > {THRESHOLD}", type<int>)
+            if (s == 0) { // a zero sum fails the benchmark
+                b->failNow()
+            }
+        }
+    }
+}
+
+def run_m3(b : B?; n : int) { // m3 variant: plain piped LINQ chain, no `_fold`
+    let arr <- fixture_array(n) // built outside the `run` block
+    b |> run("m3_array/{n}", n) {
+        let s = arr |> _select(_.price * 2) |> _where(_ > THRESHOLD) |> sum
+        if (s == 0) { // a zero sum fails the benchmark
+            b->failNow()
+        }
+    }
+}
+
+def run_m3f(b : B?; n : int) { // m3f variant: same select/where/sum chain, wrapped in `_fold(...)`
+    let arr <- fixture_array(n) // built outside the `run` block
+    b |> run("m3f_array_fold/{n}", n) {
+        let s = _fold(each(arr)._select(_.price * 2)._where(_ > THRESHOLD).sum())
+        if (s == 0) { // a zero sum fails the benchmark
+            b->failNow()
+        }
+    }
+}
+
+[benchmark]
+def select_where_sum_m1(b : B?) { // benchmark entry point: m1 variant with n = 100000
+    run_m1(b, 100000)
+}
+
+[benchmark]
+def select_where_sum_m3(b : B?) { // benchmark entry point: m3 variant with n = 100000
+    run_m3(b, 100000)
+}
+
+[benchmark]
+def select_where_sum_m3f(b : B?) { // benchmark entry point: m3f variant with n = 100000
+    run_m3f(b, 100000)
+}
diff --git a/daslib/linq.das b/daslib/linq.das
index 65cb38b0b..4b27fd72c 100644
--- a/daslib/linq.das
+++ b/daslib/linq.das
@@ -565,6 +565,51 @@ def top_n_descending(var a : iterator<auto(TT)>; n : int) : array<TT -const -&>
     return <- top_n_by_descending(a, n, $(v : TT -&) => v)
 }
 
+// ============================================================================
+// top_n_by_with_cmp — take a comparator block directly instead of a key
+// lambda. The splice planner uses this when an order-by key body is pure and
+// inlineable: the body is spliced into the comparator twice (once per side),
+// eliminating both the per-comparison lambda dispatch (`cmp(v1, v2)`) AND the
+// per-side `key(v)` dispatch nested inside the standard comparator.
+//
+// Direction is encoded in the comparator: asc emits `less(body[v1], body[v2])`,
+// desc emits `less(body[v2], body[v1])`. The output is sorted ascending by the
+// comparator, so a reversed comparator yields a descending natural-key order.
+// ============================================================================
+
+def top_n_by_with_cmp(arr : array<auto(TT)>; n : int; cmp : block<(v1 : TT -&, v2 : TT -&) : bool>) : array<TT -const -&> {
+    //! Array variant: returns up to ``n`` elements of ``arr`` that order first under ``cmp``; ``arr`` is only read. Empty result when ``n <= 0`` or ``arr`` is empty.
+    //! Pass a reversed comparator to extract the ``n`` largest instead.
+    var buf : array<TT -const -&>
+    if (n <= 0 || empty(arr)) return <- buf
+    let take_count = min(n, length(arr)) // never more than the source holds
+    buf |> reserve(length(arr))
+    for (it in arr) { // work on a cloned copy of every element
+        buf |> push_clone(it)
+    }
+    sort_boost::partial_sort(buf, take_count, cmp) // presumably orders the first take_count slots — confirm in sort_boost
+    buf |> resize(take_count) // keep only the first take_count slots
+    return <- buf
+}
+
+def top_n_by_with_cmp(var a : iterator<auto(TT)>; n : int; cmp : block<(v1 : TT -&, v2 : TT -&) : bool>) : array<TT -const -&> {
+    //! Iterator variant — keeps at most ``n`` elements in a heap while scanning ``a``, then returns them sorted by ``cmp``. Empty result when ``n <= 0``.
+    var buf : array<TT -const -&>
+    if (n <= 0) return <- buf
+    for (it in a) {
+        if (length(buf) < n) { // still filling: every element is kept
+            buf |> push_clone(it)
+            sort_boost::push_heap(buf, cmp)
+        } elif (cmp(it, buf[0])) { // `it` orders before buf[0] (presumably the heap root = cmp-largest kept element — confirm in sort_boost)
+            sort_boost::pop_heap(buf, cmp)
+            buf[length(buf) - 1] := it // overwrite the last slot with the new candidate
+            sort_boost::push_heap(buf, cmp)
+        }
+    }
+    sort(buf, cmp) // final ordering of the kept elements
+    return <- buf
+}
+
 def unique_key(a) {
     //! generates unique key of workhorse type for the value
     static_if (typeinfo is_workhorse(a)) {
diff --git a/daslib/linq_fold.das b/daslib/linq_fold.das
index ffaa806a8..9b6b4dada 100644
--- a/daslib/linq_fold.das
+++ b/daslib/linq_fold.das
@@ -133,6 +133,7 @@ var private linqCalls = {
     "top_n_by" => LinqCall(name = "top_n_by", noToArrayVariant = true),
     "top_n_descending" => LinqCall(name = "top_n_descending", noToArrayVariant = true),
     "top_n_by_descending" => LinqCall(name = "top_n_by_descending", noToArrayVariant = true),
+    "top_n_by_with_cmp" => LinqCall(name = "top_n_by_with_cmp", noToArrayVariant = true),
 // aggregate
     "count" => LinqCall(name = "count"),
     "long_count" => LinqCall(name = "long_count"),
@@ -524,6 +525,23 @@ def private wrap_with_condition(var body : Expression?; var cond : Expression?)
     }
 }
 
+[macro_function]
+def private prepend_precond(var body : Expression?; var preCondStmts : array<Expression?>) : Expression? {
+    // Splice per-element unconditional binds BEFORE the body. Used by the select+where
+    // splice arm to bind a pure projection once per element so the rewritten predicate
+    // (peel-substituted to reference the bind var) and the terminator's valueExpr (cloned
+    // from `projection`) share that single eval. intermediateBinds, by contrast, go INSIDE
+    // the if-wrap — correct for chained-select chains with no where.
+    if (empty(preCondStmts)) return body // nothing to prepend: body is returned as-is
+    var stmts : array<Expression?>
+    stmts |> reserve(length(preCondStmts) + 1) // all pre-condition stmts plus the body
+    for (s in preCondStmts) {
+        stmts |> push(s)
+    }
+    stmts |> push(body) // body goes last, after every pre-condition stmt
+    return stmts_to_expr(stmts)
+}
+
 [macro_function]
 def private append_skip_take_prelude(var preludeStmts : array<Expression?>; var skipExpr : Expression?; var takeExpr : Expression?;
                                      skipName, takeCountName : string) {
@@ -697,6 +715,7 @@ def private emit_accumulator_lane(
                                   var projection : Expression?;
                                   var whereCond : Expression?;
                                   var intermediateBinds : array<Expression?>;
+                                  var preCondStmts : array<Expression?>;
                                   var elementType : TypeDeclPtr;
                                   srcName, accName, itName, skipName, takeCountName : string;
                                   var skipExpr : Expression?; var takeExpr : Expression?;
@@ -797,7 +816,7 @@ def private emit_accumulator_lane(
     }
     prepend_binds(perMatchStmts, intermediateBinds)
     wrap_with_skip_take(perMatchStmts, skipExpr, takeExpr, skipName, takeCountName)
-    var loopBody = wrap_with_condition(stmts_to_expr(perMatchStmts), whereCond)
+    var loopBody = prepend_precond(wrap_with_condition(stmts_to_expr(perMatchStmts), whereCond), preCondStmts)
     // Collect all body statements into one list so they share scope when spliced via $b.
     // Splitting decls / for / return into separate splice tags would put each in its own
     // sub-block, hiding the accumulator from later statements (caught by AST dump under
@@ -847,6 +866,7 @@ def private emit_early_exit_lane(
                                  var projection : Expression?;
                                  var whereCond : Expression?;
                                  var intermediateBinds : array<Expression?>;
+                                 var preCondStmts : array<Expression?>;
                                  var elementType : TypeDeclPtr;
                                  terminatorCall : ExprCall?;
                                  srcName, itName, skipName, takeCountName : string;
@@ -956,7 +976,7 @@ def private emit_early_exit_lane(
     }
     prepend_binds(perMatchStmts, intermediateBinds)
     wrap_with_skip_take(perMatchStmts, skipExpr, takeExpr, skipName, takeCountName)
-    var loopBody = wrap_with_condition(stmts_to_expr(perMatchStmts), whereCond)
+    var loopBody = prepend_precond(wrap_with_condition(stmts_to_expr(perMatchStmts), whereCond), preCondStmts)
     // Single-$b body so all stmts (skip/take counters + prelude + for + tail) share scope
     // under one wrapping block.
     var bodyStmts : array<Expression?>
@@ -992,6 +1012,66 @@ def private order_top_n_call_name(orderName : string) : string {
     return ""
 }
 
+[macro_function]
+def private try_make_inline_cmp(orderKey : Expression?; orderName : string;
+                                elemType : TypeDeclPtr; at : LineInfo) : Expression? { // NOTE(review): elemType and at are not read in this body
+    //! When ``orderKey`` is a single-arg, single-return-statement lambda whose body has
+    //! no side effects, return a comparator block that inlines the body twice with the
+    //! lambda argument renamed to ``v1`` and ``v2``. Direction is encoded by emit order:
+    //! asc → ``_::less(body[v1], body[v2])``; desc → ``_::less(body[v2], body[v1])``.
+    //!
+    //! Returns null when the key is multi-statement, has side effects, or isn't a
+    //! recognizable lambda shape — caller falls through to the lambda-passing emission.
+    //!
+    //! What this saves: indirect-dispatch overhead. The original comparator does two
+    //! `key(v)` indirect lambda calls per comparison (one per side); the inlined version
+    //! evaluates the body twice as direct expressions (one per side). Net: 2 fewer lambda
+    //! dispatches per comparison. For trivial keys (`$(_) => _.price` — a field load)
+    //! that dispatch IS the dominant cost. For expensive keys both forms still evaluate
+    //! the body twice per comparison, so the relative win shrinks.
+    //!
+    //! `has_sideeffects` is a **semantic** gate, not a perf gate: side-effecting bodies
+    //! can't be safely re-substituted into a fresh syntactic position because the typer-
+    //! inserted ExprRef2Value wrappers and ordering guarantees aren't preserved across
+    //! `replaceVariable`. (Schwartzian-style precompute-once is an orthogonal optimization
+    //! that would help expensive keys; not in scope here.)
+    if (orderKey == null || !(orderKey is ExprMakeBlock)) return null
+    var mblk = orderKey as ExprMakeBlock
+    var blk = mblk._block as ExprBlock
+    if (blk.arguments |> length != 1 || blk.list |> length != 1
+            || !(blk.list[0] is ExprReturn)) return null
+    var ret = blk.list[0] as ExprReturn
+    if (ret.subexpr == null || has_sideeffects(ret.subexpr)) return null
+    let argName = string(blk.arguments[0].name)
+    var b1 = clone_expression(ret.subexpr) // one independent copy of the key body per comparator side
+    var b2 = clone_expression(ret.subexpr)
+    var r1 : Template
+    r1 |> renameVariable(argName, "v1") // NOTE(review): fixed names — a key body that also reads an outer `v1`/`v2` would bind to the block args; confirm keys cannot capture such names
+    var r2 : Template
+    r2 |> renameVariable(argName, "v2")
+    apply_template(r1, b1.at, b1)
+    apply_template(r2, b2.at, b2)
+    var cmpExpr : Expression?
+    if (orderName == "order_by_descending") { // every other orderName takes the ascending form below
+        cmpExpr = qmacro(_::less($e(b2), $e(b1)))
+    } else {
+        cmpExpr = qmacro(_::less($e(b1), $e(b2)))
+    }
+    // Emit untyped block args (`$(v1, v2) { ... }`). The typer infers v1/v2 types from
+    // the dispatch's block-arg signature (`block<(v1 : TT -&, v2 : TT -&) : bool>` in
+    // `order(arr, block)` / `top_n_by_with_cmp(arr, n, block)`), and renders them as the
+    // const-ref shape (`TT const -&`) at the call site — which is what the candidate
+    // signature matches. Typed-explicitly emission via `$($i(a) : $t(T) -&, ...)` is
+    // possible after the parser_impl.cpp tagged-block-arg fix (error 30701), but the
+    // type-flag bookkeeping to make the emitted args match the candidate `TT const -&`
+    // exactly is fragile (must propagate clone_type, removeConstant, and ref flags
+    // through a parser path that flattens them differently per modifier order). Untyped
+    // sidesteps that entirely.
+    return qmacro($(v1, v2) {
+        return $e(cmpExpr)
+    })
+}
+
 [macro_function]
 def private plan_order_family(var expr : Expression?) : Expression? {
     //! Phase 3 splice planner for chains containing an order-family operator.
@@ -1051,6 +1131,15 @@ def private plan_order_family(var expr : Expression?) : Expression? {
     let needIterWrap = expr._type.isIterator
     let topNName = order_top_n_call_name(orderName)
     let inplaceName = "{orderName}_inplace"
+    // Inline-key path: when the key is a pure single-expression lambda, splice the body
+    // twice into a comparator block (asc: `_::less(body[v1], body[v2])`; desc: flip
+    // arg order). Eliminates the per-comparison `key(v)` indirect dispatch — dominant
+    // cost on trivial keys like `$(_) => _.field`. Null when key is multi-statement /
+    // side-effecting / not a recognizable lambda — caller falls through.
+    var inlineCmp : Expression?
+    if (hasKey) {
+        inlineCmp = try_make_inline_cmp(orderKey, orderName, orderElemType, at)
+    }
     if (whereCond == null) {
         // No prefilter — direct call to daslib helper.
         var topExpr = clone_expression(top)
@@ -1059,14 +1148,20 @@ def private plan_order_family(var expr : Expression?) : Expression? {
         if (takeExpr == null) {
             // Bare order family — emit the direct call. Same shape as plain LINQ, but via
             // splice so `_fold` doesn't fall through to tier 2.
-            if (hasKey) {
+            if (inlineCmp != null) {
+                // Inlined comparator dispatches to the asc `order(src, block)` overload —
+                // direction encoded in the comparator (flip embedded for descending).
+                emission = qmacro(_::order($e(topExpr), $e(inlineCmp)))
+            } elif (hasKey) {
                 emission = qmacro($c(orderName)($e(topExpr), $e(orderKey)))
             } else {
                 emission = qmacro($c(orderName)($e(topExpr)))
             }
         } else {
             // order + take → top_n* dispatch.
-            if (hasKey) {
+            if (inlineCmp != null) {
+                emission = qmacro(_::top_n_by_with_cmp($e(topExpr), $e(takeExpr), $e(inlineCmp)))
+            } elif (hasKey) {
                 emission = qmacro($c(topNName)($e(topExpr), $e(takeExpr), $e(orderKey)))
             } else {
                 emission = qmacro($c(topNName)($e(topExpr), $e(takeExpr)))
@@ -1116,7 +1211,10 @@ def private plan_order_family(var expr : Expression?) : Expression? {
         // Sort the prefilter buffer in place and return it. order*_inplace is void
         // (mutates the buffer in place), so we move the buffer out for the final result.
         var sortCall : Expression?
-        if (hasKey) {
+        if (inlineCmp != null) {
+            // Asc `order_inplace(buf, block)` overload + embedded-flip cmp for descending.
+            sortCall = qmacro(_::order_inplace($i(bufName), $e(inlineCmp)))
+        } elif (hasKey) {
             sortCall = qmacro($c(inplaceName)($i(bufName), $e(orderKey)))
         } else {
             sortCall = qmacro($c(inplaceName)($i(bufName)))
@@ -1128,7 +1226,9 @@ def private plan_order_family(var expr : Expression?) : Expression? {
     } else {
         // top_n* on the prefilter buffer.
         var topNCall : Expression?
-        if (hasKey) {
+        if (inlineCmp != null) {
+            topNCall = qmacro(_::top_n_by_with_cmp($i(bufName), $e(takeExpr), $e(inlineCmp)))
+        } elif (hasKey) {
             topNCall = qmacro($c(topNName)($i(bufName), $e(takeExpr), $e(orderKey)))
         } else {
             topNCall = qmacro($c(topNName)($i(bufName), $e(takeExpr)))
@@ -1184,6 +1284,13 @@ def private plan_loop_or_count(var expr : Expression?) : Expression? {
     var whereCond : Expression?
     var projection : Expression?
     var intermediateBinds : array<Expression?>
+    // preConditionStmts evaluate UNCONDITIONALLY per element, BEFORE the where filter —
+    // used by the select+where splice arm to bind a pure projection once per element so
+    // both the predicate (which is rewritten to read the bind var via peel-substitution)
+    // AND the terminator's valueExpr (which clones the projection) share that single
+    // eval. intermediateBinds, by contrast, prepend INSIDE the if(whereCond) wrap, so
+    // they evaluate only on matching elements — correct for chained-select bind chains.
+    var preCondStmts : array<Expression?>
     var skipExpr : Expression?
     var takeExpr : Expression?
     var seenSelect = false
@@ -1201,20 +1308,36 @@ def private plan_loop_or_count(var expr : Expression?) : Expression? {
             if (seenSkip || seenTake) return null
             var predicate : Expression?
             if (seenSelect) {
-                // Phase 3d: where-after-select. Substitute the predicate's bound variable
-                // with the current projection via peel-aware substitution. The substitution
-                // inlines the projection into the predicate, which would re-evaluate any
-                // side effects (since the terminator also references projection) — bail to
-                // tier 2 cascade on side-effecty projections.
+                // Phase 3d / single-eval: where-after-select. Bind the current projection
+                // to a fresh local in `preCondStmts` (evaluated per-element, OUTSIDE the
+                // if-wrap), then rewrite `projection` to reference that local — peel-
+                // substituted into the predicate AND cloned into the terminator's
+                // valueExpr. Both reference the bind ⇒ projection evaluates exactly once
+                // per element. Side-effecty projections still bail to tier 2: moving them
+                // outside the if would visibly fire side effects on filter-rejected
+                // elements.
                 //
-                // KNOWN PERF GAP (deferred to splice-with-cmp follow-up PR): pure projections
-                // currently re-evaluate per element for ARRAY/ACCUMULATOR/EARLY_EXIT lanes —
-                // once in the inlined predicate and once in valueExpr. COUNTER lane is unaffected
-                // (no body use). Fix shape: emit a pre-condition `var v := projection` bind in
-                // the loop body (outside the if-wrap) and rewrite both predicate and valueExpr
-                // to reference `v`. Bundled with the `_with_cmp` inline-key follow-up since
-                // both share the "single-eval splice" theme.
+                // Single-eval is skipped in two cases. COUNTER lane (count) never reads
+                // valueExpr, so the bind would only add a per-element stmt to its fast path.
+                // An earlier `where` (whereCond != null) may be guarding a projection that can
+                // panic (e.g. `100 / _.d`); hoisting it above the if-wrap would defeat the guard.
                 if (has_sideeffects(projection)) return null
+                if (lane != LinqLane.COUNTER && whereCond == null) {
+                    let wbName = "`vw`{at.line}`{at.column}`{length(preCondStmts)}"
+                    var projType = clone_type(elementType)
+                    preCondStmts |> push <| qmacro_expr() {
+                        var $i(wbName) : $t(projType) := $e(projection)
+                    }
+                    // Replace projection with a typed ExprVar so downstream typer passes
+                    // can resolve the reference without re-walking the loop body's local
+                    // decls. Untyped ExprVars here propagate `auto` into push_clone /
+                    // accumulator paths and surface as "cannot infer push_clone return
+                    // type" at the typer.
+                    var pvar = new ExprVar(at = at, name := wbName)
+                    pvar._type = clone_type(elementType)
+                    pvar._type.flags.ref = true
+                    projection = pvar
+                }
                 predicate = fold_linq_cond_peel(cll._0.arguments[1], projection)
             } else {
                 predicate = fold_linq_cond(cll._0.arguments[1], itName)
@@ -1290,7 +1413,7 @@ def private plan_loop_or_count(var expr : Expression?) : Expression? {
     // COUNTER/ARRAY loopBody construction.
     if (lane == LinqLane.ACCUMULATOR)
         return emit_accumulator_lane(lastName, top, projection, whereCond,
-            intermediateBinds, elementType, srcName, accName, itName, skipName, takeCountName,
+            intermediateBinds, preCondStmts, elementType, srcName, accName, itName, skipName, takeCountName,
             skipExpr, takeExpr, at)
     // Ring 2: early-exit lane — `any` no-pred + no upstream work + no limits + length-bearing
     // source gets the empty-shortcut; everything else dispatches to the loop emitter.
@@ -1301,7 +1424,7 @@ def private plan_loop_or_count(var expr : Expression?) : Expression? {
                 && type_has_length(top._type))
             return emit_any_empty_shortcut(top, srcName, at)
         return emit_early_exit_lane(lastName, top, projection, whereCond,
-            intermediateBinds, elementType, terminatorCall, srcName, itName, skipName,
+            intermediateBinds, preCondStmts, elementType, terminatorCall, srcName, itName, skipName,
             takeCountName, skipExpr, takeExpr, at)
     }
     // Build the per-element loop body for COUNTER / ARRAY. Both lanes follow the same shape:
@@ -1328,7 +1451,7 @@ def private plan_loop_or_count(var expr : Expression?) : Expression? {
         }
         prepend_binds(stmts, intermediateBinds)
         wrap_with_skip_take(stmts, skipExpr, takeExpr, skipName, takeCountName)
-        loopBody = wrap_with_condition(stmts_to_expr(stmts), whereCond)
+        loopBody = prepend_precond(wrap_with_condition(stmts_to_expr(stmts), whereCond), preCondStmts)
     } else {
         // Array lane. `push_clone` is the safe append everywhere: for workhorse types it's a
         // byte copy (same cost as `push`); for non-workhorse it deep-clones, avoiding the
@@ -1354,7 +1477,7 @@ def private plan_loop_or_count(var expr : Expression?) : Expression? {
         }
         prepend_binds(stmts, intermediateBinds)
         wrap_with_skip_take(stmts, skipExpr, takeExpr, skipName, takeCountName)
-        loopBody = wrap_with_condition(stmts_to_expr(stmts), whereCond)
+        loopBody = prepend_precond(wrap_with_condition(stmts_to_expr(stmts), whereCond), preCondStmts)
     }
     if (counterLane) {
         return emit_counter_lane(top, srcName, accName, itName, skipName, takeCountName,
diff --git a/modules/dasHV/CMakeLists.txt b/modules/dasHV/CMakeLists.txt
index b7dab946a..cbb93744c 100644
--- a/modules/dasHV/CMakeLists.txt
+++ b/modules/dasHV/CMakeLists.txt
@@ -42,7 +42,9 @@ IF ((NOT DAS_HV_INCLUDED) AND ((NOT ${DAS_HV_DISABLED}) OR (NOT DEFINED DAS_HV_D
 		SET(HV_LIBRARIES ${DAS_HV_DIR}/hv/$<CONFIG>/lib/hv_static.lib)
 	ELSE()
 		find_package(OpenSSL REQUIRED)
-		SET(HV_LIBRARIES ${DAS_HV_DIR}/hv/$<CONFIG>/lib/libhv_static.a)
+		# libhv after v1.3.4 (current master) sets hv_static's OUTPUT_NAME to "hv" on
+		# POSIX, so the installed archive is libhv.a (it was libhv_static.a in v1.3.4).
+		SET(HV_LIBRARIES ${DAS_HV_DIR}/hv/$<CONFIG>/lib/libhv.a)
 		SET(OPENSSL_LIBRARIES_FILES OpenSSL::Crypto OpenSSL::SSL)
 	ENDIF()
 
@@ -61,15 +63,22 @@ IF ((NOT DAS_HV_INCLUDED) AND ((NOT ${DAS_HV_DISABLED}) OR (NOT DEFINED DAS_HV_D
 		-DOPENSSL_LIBRARIES="${OPENSSL_CRYPTO_LIBRARY};${OPENSSL_SSL_LIBRARY}"
 	)
 	IF(DAS_USE_SANITIZER STREQUAL "address" OR DAS_USE_SANITIZER STREQUAL "asan")
+		IF(MSVC)
+			SET(_HV_ASAN_FLAG "/fsanitize=address")
+		ELSE()
+			SET(_HV_ASAN_FLAG "-fsanitize=address")
+		ENDIF()
 		LIST(APPEND HV_CMAKE_FLAGS
-			"-DCMAKE_C_FLAGS=/fsanitize=address"
-			"-DCMAKE_CXX_FLAGS=/fsanitize=address"
+			"-DCMAKE_C_FLAGS=${_HV_ASAN_FLAG}"
+			"-DCMAKE_CXX_FLAGS=${_HV_ASAN_FLAG}"
 		)
 	ENDIF()
+	# TODO: switch back to ithewei/libhv upstream tag once
+	# https://github.com/ithewei/libhv/pull/835 is merged.
 	ExternalProject_Add(
 		LIBHV
-		URL https://github.com/ithewei/libhv/archive/refs/tags/v1.3.4.tar.gz
-		URL_HASH SHA256=f0a9a197f90da55cc3ff104f9c7a27cc927f117b6c18613c3292726068588e10
+		URL https://github.com/aleksisch/libhv/archive/343437b72fbd0f348abb168d61fe7f1c6c4f4d20.tar.gz
+		URL_HASH SHA256=1b809d55dc1a637aafecb25e717c4ab302a6733390f43b32244976d6583cd866
 		DOWNLOAD_EXTRACT_TIMESTAMP TRUE
 		PREFIX ${CMAKE_CURRENT_BINARY_DIR}/libhv
 		CMAKE_ARGS ${HV_CMAKE_FLAGS}
diff --git a/mouse-data/docs/how-do-i-write-an-ast-shape-test-that-distinguishes-tier-1-splice-from-tier-2-cascade-in-fold-exprinvoke-matches-both.md b/mouse-data/docs/how-do-i-write-an-ast-shape-test-that-distinguishes-tier-1-splice-from-tier-2-cascade-in-fold-exprinvoke-matches-both.md
new file mode 100644
index 000000000..7d7e82eec
--- /dev/null
+++ b/mouse-data/docs/how-do-i-write-an-ast-shape-test-that-distinguishes-tier-1-splice-from-tier-2-cascade-in-fold-exprinvoke-matches-both.md
@@ -0,0 +1,40 @@
+---
+slug: how-do-i-write-an-ast-shape-test-that-distinguishes-tier-1-splice-from-tier-2-cascade-in-fold-exprinvoke-matches-both
+title: How do I write an AST-shape test that distinguishes tier 1 splice from tier 2 cascade in `_fold`? `ExprInvoke` matches both.
+created: 2026-05-18
+last_verified: 2026-05-18
+links: []
+---
+
+**`body_expr is ExprInvoke` is the WEAKEST possible assertion** — both tier-1 splice emissions (in `plan_loop_or_count` / `plan_order_family` / `emit_*_lane`) AND tier-2 `fold_linq_default` emissions wrap their result in `invoke($block, $src)` form. Asserting just `ExprInvoke` passes for both tiers and tells you nothing.
+
+**Strong-form distinguishers** (use 2-3 of these together):
+
+```
+t |> equal(1, count_inner_for_loops(body_expr), "single fused for-loop")
+t |> equal(0, count_call(body_expr, "where_"),  "where_ should be inlined, not called")
+t |> equal(0, count_call(body_expr, "select"),  "select should be inlined, not called")
+```
+
+Tier 1 splice emits a single `for` loop with the predicate/projection inlined into the body — zero residual `where_(...)` / `select(...)` calls. Tier 2 `fold_linq_default` emits a multi-statement block:
+
+```
+var pass_0 = where_to_array(src, pred)
+var pass_1 = select_inplace(pass_0, proj)
+delete pass_0
+return <- pass_1
+```
+
+Multiple `var` decls, `_to_array` / `_inplace` suffixed calls, explicit `delete` between stages. The pattern is distinguishable by counting calls that survive vs. inline.
+
+**Canonical template:** [tests/linq/test_linq_fold_ast.das:1377 (`test_select_where_count_emits_fused_loop`)](tests/linq/test_linq_fold_ast.das#L1377). Use as the model when adding new tier-1 AST-shape assertions.
+
+**Why this matters:** Phase 3d (PR #2712) added the select+where splice arm. The pre-Phase-3d tests asserted `body_expr is ExprInvoke` expecting tier-2 fallback, but those chains now splice tier 1. The tests kept passing (both tiers wrap in invoke) but were stale-named and meaningless. Copilot review caught it; the fix was deleting the redundant tests and replacing with strong-form assertions.
+
+## Questions
+- How do I write an AST-shape test that distinguishes tier 1 splice from tier 2 cascade in `_fold`? `ExprInvoke` matches both.
+- Why does my `body_expr is ExprInvoke` assertion pass for both tier 1 splice and tier 2 cascade in `_fold` AST tests?
+- What's the canonical splice-emission shape I should assert in `test_linq_fold_ast.das` for a new splice arm?
+
+## Questions
+- How do I write an AST-shape test that distinguishes tier 1 splice from tier 2 cascade in `_fold`? `ExprInvoke` matches both.
diff --git a/mouse-data/docs/my-macro-substitutes-it-for-a-projection-expression-via-template-replacevariable-it-proj-apply-template-but-the-result-fails-to.md b/mouse-data/docs/my-macro-substitutes-it-for-a-projection-expression-via-template-replacevariable-it-proj-apply-template-but-the-result-fails-to.md
index 9652ba343..f55e42132 100644
--- a/mouse-data/docs/my-macro-substitutes-it-for-a-projection-expression-via-template-replacevariable-it-proj-apply-template-but-the-result-fails-to.md
+++ b/mouse-data/docs/my-macro-substitutes-it-for-a-projection-expression-via-template-replacevariable-it-proj-apply-template-but-the-result-fails-to.md
@@ -2,7 +2,7 @@
 slug: my-macro-substitutes-it-for-a-projection-expression-via-template-replacevariable-it-proj-apply-template-but-the-result-fails-to
 title: My macro substitutes `it` for a projection expression via `Template.replaceVariable("it", proj) + apply_template`, but the result fails to compile with "can only dereference a reference". What's going wrong?
 created: 2026-05-16
-last_verified: 2026-05-16
+last_verified: 2026-05-18
 links: []
 ---
 
@@ -18,7 +18,14 @@ Two fixes for substitution:
 
 Concrete repro: daslang `linq_fold`'s Phase 2A planner tried to fuse chained `_select|_select` via `substitute_it_for(proj2, "it", proj1)`. proj1 was `it * 2` (where `it` is the typed-and-wrapped loop var), proj2 was `it + 1`. Substituting via Template replaced the inner ExprVar in proj2 but left `ExprRef2Value(it * 2) + 1` — type error. The fix was deferred (chained-select falls through unfolded in Phase 2A) but Phase 2B needs option 2.
 
-See `skills/das_macros.md` "Peel ExprRef2Value before qmatch" for the matcher-side analog. The substitution side has no in-tree helper yet.
+See `skills/das_macros.md` "Peel ExprRef2Value before qmatch" for the matcher-side analog.
+
+**Update 2026-05-18 (PR #2712):** The substitution-side helper has landed as `replaceVariablePeeling` in `daslib/templates_boost.das`. Same signature as `replaceVariable` — populates a new `var2exprPeeling : table<string; Expression?>` field on `Template`. The `TemplateVisitor` gets a `visitExprRef2Value` override that detects `ExprRef2Value(ExprVar(name))` for any peeling-registered name and returns `clone_expression(replacement)` directly (Option 2 above). First user is `fold_linq_cond_peel` in `daslib/linq_fold.das` for the `_select |> _where |> terminator` splice arm; bails to tier-2 cascade when `has_sideeffects(projection)` to avoid double-evaluation.
+
+When to choose `replaceVariablePeeling` over `replaceVariable`: any time you substitute into already-typed AST. The typer's `ExprRef2Value` wrappers are invisible in the IDE outline but real in the AST.
 
 ## Questions
 - My macro substitutes `it` for a projection expression via `Template.replaceVariable("it", proj) + apply_template`, but the result fails to compile with "can only dereference a reference". What's going wrong?
+- ExprRef2Value blocker — `_select|_where` (where-after-select) in `_fold` splice — macro substitutes `it` for projection via `Template::replaceVariable`, compile error "can only dereference a reference"
+- How do I peel `ExprRef2Value` during typed-AST substitution? When should I use `replaceVariablePeeling` over `replaceVariable`?
+- Why does my `apply_template` substitution leave a `30921` error around the substituted expression?
diff --git a/mouse-data/docs/when-does-daslib-s-order-by-return-iterator-vs-array-and-why-is-to-sequence-move-dangerous-to-wrap-blindly-around-the-result.md b/mouse-data/docs/when-does-daslib-s-order-by-return-iterator-vs-array-and-why-is-to-sequence-move-dangerous-to-wrap-blindly-around-the-result.md
new file mode 100644
index 000000000..35b587ea1
--- /dev/null
+++ b/mouse-data/docs/when-does-daslib-s-order-by-return-iterator-vs-array-and-why-is-to-sequence-move-dangerous-to-wrap-blindly-around-the-result.md
@@ -0,0 +1,40 @@
+---
+slug: when-does-daslib-s-order-by-return-iterator-vs-array-and-why-is-to-sequence-move-dangerous-to-wrap-blindly-around-the-result
+title: When does daslib's `order_by` return iterator vs array? And why is `to_sequence_move()` dangerous to wrap blindly around the result?
+created: 2026-05-18
+last_verified: 2026-05-18
+links: []
+---
+
+**`order_by` has 2 overloads in daslib/linq.das (and same for `order` / `order_descending` / `order_by_descending`):**
+- `def order_by(var a : iterator<auto(TT)>; key) : iterator<TT -const -&>` ([daslib/linq.das:405](daslib/linq.das#L405)) — iterator in → **iterator out**
+- `def order_by(a : array<auto(TT)>; key) : array<TT -const -&>` ([daslib/linq.das:412](daslib/linq.das#L412)) — array in → **array out**
+
+Return-type-mirrors-source-type. Caller picks the overload by what they pass in.
+
+Contrast with **`top_n_by` / `top_n*` (PR #2707): both overloads return `array<T>` regardless of input shape** — iterator source still returns array. No `top_n_by_iterator` form.
+
+**Practical implication for splice planners:** `to_sequence_move()` is array-only — applying it to an iterator is a compile error. So if you have a planner that emits a chain call and then wraps with `to_sequence_move()` when the outer expression was iterator-typed, gate the wrap on the *emission's* output type, not the outer-chain type:
+
+```
+let emissionIsArray = takeExpr != null /* top_n* always array */ || top._type.isGoodArrayType
+if (needIterWrap && emissionIsArray) {
+    emission = qmacro($e(emission).to_sequence_move())
+}
+```
+
+Without the gate, a bare `order_by(iter, key)` emission gets `.to_sequence_move()` glued on its iterator result → compile error.
+
+This was a latent bug in `plan_order_family` (PR #2712 round 1) caught by Copilot review; my tests all used `each(arr)` sources where `peel_each` succeeded and yielded array, masking the iterator-source path.
+
+## Questions
+- When does daslib's `order_by` return iterator vs array? And why is `to_sequence_move()` dangerous to wrap blindly around the result?
+- What's the difference between `order_by` and `top_n_by` return types for iterator vs array source?
+- Why does my splice planner's `.to_sequence_move()` wrap compile-fail on iterator sources?
+- What's the array-only constraint on `to_sequence_move` in linq?
+
+## See also
+- `daslib-order-family-and-top-n-overload-shapes` — companion table
+
+## Questions
+- When does daslib's `order_by` return iterator vs array? And why is `to_sequence_move()` dangerous to wrap blindly around the result?
diff --git a/mouse-data/docs/when-does-peel-each-in-daslib-linq-fold-das-unwrap-each-x-to-x-and-what-s-the-design-rationale.md b/mouse-data/docs/when-does-peel-each-in-daslib-linq-fold-das-unwrap-each-x-to-x-and-what-s-the-design-rationale.md
new file mode 100644
index 000000000..f6453cf5a
--- /dev/null
+++ b/mouse-data/docs/when-does-peel-each-in-daslib-linq-fold-das-unwrap-each-x-to-x-and-what-s-the-design-rationale.md
@@ -0,0 +1,35 @@
+---
+slug: when-does-peel-each-in-daslib-linq-fold-das-unwrap-each-x-to-x-and-what-s-the-design-rationale
+title: When does `peel_each` in daslib/linq_fold.das unwrap `each(<x>)` to `<x>`, and what's the design rationale?
+created: 2026-05-18
+last_verified: 2026-05-18
+links: []
+---
+
+**`peel_each` only unwraps when `x` is a true array (`isGoodArrayType` or `isArray`).** For iterator-typed `x` (e.g., `each(range(N))`, `each(generator())`), it returns `top` unchanged — the `each(...)` wrapper stays.
+
+Definition: [daslib/linq_fold.das:431-445](daslib/linq_fold.das#L431-L445).
+
+```
+def private peel_each(var top : Expression?) : Expression? {
+    if (!(top is ExprCall)) return top
+    var topCall = top as ExprCall
+    if (!is_each_call(topCall) || topCall.arguments |> length != 1) return top
+    let argExpr = topCall.arguments[0]
+    if ((argExpr == null || argExpr._type == null)
+            || (!argExpr._type.isGoodArrayType && !argExpr._type.isArray)) return top
+    return clone_expression(argExpr)
+}
+```
+
+**Why gate on array-ness?** Downstream emitters call `length(src)` for buffer-reserve hints and rely on indexable / random-access semantics (e.g., chunked iteration, slice). An iterator behind the `each` wrapper has neither. Peeling would put the bare iterator in the emission and break those assumptions silently.
+
+**Implication for splice planner authors:** when you call `top = peel_each(top)`, `top._type` may still be iterator-typed afterward. That changes which library overload your emission dispatches to (`order_by(iter)` vs `order_by(arr)`, etc. — see [[when-does-daslib-s-order-by-return-iterator-vs-array-and-why-is-to-sequence-move-dangerous-to-wrap-blindly-around-the-result]]). Don't assume `peel_each` always gives you an array.
+
+## Questions
+- When does `peel_each` in daslib/linq_fold.das unwrap `each(<x>)` to `<x>`, and what's the design rationale?
+- Why doesn't `peel_each` unwrap `each(range(N))` or `each(some_generator())`?
+- After `peel_each`, can `top._type.isIterator` still be true?
+
+## Questions
+- When does `peel_each` in daslib/linq_fold.das unwrap `each(<x>)` to `<x>`, and what's the design rationale?
diff --git a/mouse-data/docs/why-does-order-by-v-v-compile-fail-with-function-function-while-order-by-works.md b/mouse-data/docs/why-does-order-by-v-v-compile-fail-with-function-function-while-order-by-works.md
new file mode 100644
index 000000000..397679fca
--- /dev/null
+++ b/mouse-data/docs/why-does-order-by-v-v-compile-fail-with-function-function-while-order-by-works.md
@@ -0,0 +1,27 @@
+---
+slug: why-does-order-by-v-v-compile-fail-with-function-function-while-order-by-works
+title: Why does `_order_by(@@(v) => -v)` compile-fail with "_::<(function, function)" while `_order_by(-_)` works?
+created: 2026-05-18
+last_verified: 2026-05-18
+links: []
+---
+
+**`_order_by(EXPR)` is a daslib comprehension macro — `EXPR` is a key-extraction expression using the `_` placeholder for the element, NOT a lambda or function pointer.**
+
+The macro expands `_order_by(EXPR)` to roughly `order_by(iter, $($) => EXPR)` — it wraps EXPR into a block that produces the key per element. So:
+
+- ✅ `_order_by(-_)`         → emits `order_by(iter, $($) => -$)` → key = `-element`
+- ✅ `_order_by(_.price)`    → emits `order_by(iter, $($) => $.price)` → key = field access
+- ✅ `_order_by(_ * 2)`      → emits `order_by(iter, $($) => $ * 2)`
+- ❌ `_order_by(@@(v) => -v)` → emits `order_by(iter, $($) => @@(v) => -v)` — the inner block returns a FUNCTION POINTER, not a value. Then `_::less(key(v1), key(v2))` tries to compare two function pointers → error 30341.
+
+Same rule for `_order_by_descending`, `_select`, `_where`, and every other comprehension form in `daslib/linq.das` that takes a `_`-placeholder expression.
+
+The `top_n_by(arr, K, key)` direct call is different — `key` IS a function/lambda parameter. There you pass `@@(v : int) => -v` or `$(v : int) => -v`. The two forms only collide when you write `_order_by(@@...)` (comprehension form taking a key expression but getting a function literal).
+
+**How to spot the diagnostic:** error `30341: no matching functions or generics: _::<(function<...>, function<...>)` with the call stack pointing into `linq.das` `order_by` instantiation. The "comparing two functions" is the tell.
+
+## Questions
+- Why does `_order_by(@@(v) => -v)` compile-fail with "_::<(function, function)" while `_order_by(-_)` works?
+- What's the difference between `_order_by(EXPR)` (comprehension form) and `order_by(arr, key)` (direct call) for the key argument?
+- What does the `_` placeholder mean in daslib comprehension forms like `_select` / `_where` / `_order_by`?
diff --git a/src/parser/parser_impl.cpp b/src/parser/parser_impl.cpp
index 9ff906718..dd872223a 100644
--- a/src/parser/parser_impl.cpp
+++ b/src/parser/parser_impl.cpp
@@ -882,7 +882,14 @@ namespace das {
             for ( auto pDecl : *list ) {
                 if ( pDecl->pTypeDecl ) {
                     for ( const auto & name_at : *pDecl->pNameList ) {
-                        if ( !closure->findArgument(name_at.name) ) {
+                        // Macro-tagged names (`$i(expr)` in block-arg position) all parse to the
+                        // literal placeholder "``MACRO``TAG``"; the actual name is resolved later
+                        // when the macro processor substitutes the tag expression. Skip the dup
+                        // check for tagged names so multi-arg lists like
+                        // `$($i(a) : T, $i(b) : T) { ... }` aren't falsely rejected as duplicates at parse time.
+                        // After resolution, duplicate names surface as ordinary local-lookup
+                        // conflicts during type inference.
+                        if ( name_at.tag || !closure->findArgument(name_at.name) ) {
                             VariablePtr pVar = new Variable();
                             pVar->name = name_at.name;
                             pVar->aka = name_at.aka;
diff --git a/tests/linq/test_linq_fold_ast.das b/tests/linq/test_linq_fold_ast.das
index 1533d5e39..20fe25858 100644
--- a/tests/linq/test_linq_fold_ast.das
+++ b/tests/linq/test_linq_fold_ast.das
@@ -734,6 +734,69 @@ def count_op1(expr : Expression?; op : string) : int {
     return n
 }
 
+// Count ExprConstInt(value) occurrences reachable through the node kinds handled below (block, for,
+// if/else, op1/op2, call/invoke arguments, make-block, return, let initializers); children of other
+// node kinds are not visited. Used by the select+where dedup tests to prove a pure projection's constant
+// (`_ * 7919`) appears once in the emission (projection-bind statement), not twice (predicate + valueExpr).
+def count_const_int(expr : Expression?; value : int) : int {
+    if (expr == null) return 0
+    var n = 0
+    if (expr is ExprConstInt && (expr as ExprConstInt).value == value) {
+        n ++
+    }
+    if (expr is ExprBlock) {
+        let b = expr as ExprBlock
+        for (s in b.list) {
+            n += count_const_int(s, value)
+        }
+        for (s in b.finalList) {
+            n += count_const_int(s, value)
+        }
+    } elif (expr is ExprFor) {
+        let f = expr as ExprFor
+        for (s in f.sources) {
+            n += count_const_int(s, value)
+        }
+        n += count_const_int(f.body, value)
+    } elif (expr is ExprIfThenElse) {
+        let i = expr as ExprIfThenElse
+        n += count_const_int(i.cond, value)
+        n += count_const_int(i.if_true, value)
+        n += count_const_int(i.if_false, value)
+    } elif (expr is ExprOp2) {
+        let o = expr as ExprOp2
+        n += count_const_int(o.left, value)
+        n += count_const_int(o.right, value)
+    } elif (expr is ExprOp1) {
+        let o = expr as ExprOp1
+        n += count_const_int(o.subexpr, value)
+    } elif (expr is ExprCall) {
+        let c = expr as ExprCall
+        for (a in c.arguments) {
+            n += count_const_int(a, value)
+        }
+    } elif (expr is ExprMakeBlock) {
+        let mb = expr as ExprMakeBlock
+        n += count_const_int(mb._block, value)
+    } elif (expr is ExprInvoke) {
+        let inv = expr as ExprInvoke
+        for (a in inv.arguments) {
+            n += count_const_int(a, value)
+        }
+    } elif (expr is ExprReturn) {
+        let r = expr as ExprReturn
+        n += count_const_int(r.subexpr, value)
+    } elif (expr is ExprLet) {
+        let l = expr as ExprLet
+        for (v in l.variables) {
+            if (v != null && v.init != null) {
+                n += count_const_int(v.init, value)
+            }
+        }
+    }
+    return n
+}
+
 // Counts top-level `var` declarations in the outer block of an invoke wrapper.
 // sum/long_count emit one accumulator; min/max emit two (first flag + best); average
 // emits two (sum acc + count).
@@ -1160,9 +1223,10 @@ def target_where_order_by_take_splices_fold() : array<int> {
 }
 
 [test]
-def test_order_by_take_emits_top_n_by(t : T?) {
-    // `order_by |> take(K)` splices via plan_order_family to a direct top_n_by(src, K, key) call.
-    // No invoke wrapper, no order_by call in the emission.
+def test_order_by_take_emits_top_n_by_with_cmp(t : T?) {
+    // `order_by(key) |> take(K)` with an inlineable key body now splices to
+    // top_n_by_with_cmp(src, K, $(v1, v2) => _::less(body[v1], body[v2])) — the comparator
+    // block embeds the key body twice, eliminating the per-comparison key() dispatch.
     ast_gc_guard() {
         var func = find_module_function_via_rtti(compiling_module(), @@target_order_by_take_splices_fold)
         if (func == null) return
@@ -1171,14 +1235,18 @@ def test_order_by_take_emits_top_n_by(t : T?) {
             return <- $e(body_expr)
         }
         t |> success(r.matched, "should have return expression")
-        t |> success(count_call(body_expr, "top_n_by") >= 1, "should emit a top_n_by call")
+        t |> success(count_call(body_expr, "top_n_by_with_cmp") >= 1, "should emit a top_n_by_with_cmp call")
+        t |> equal(0, count_call(body_expr, "top_n_by"), "should not emit non-cmp top_n_by")
         t |> equal(0, count_call(body_expr, "order_by"), "should not emit order_by")
         t |> equal(0, count_call(body_expr, "take"), "should not emit take")
     }
 }
 
 [test]
-def test_order_by_descending_take_emits_top_n_by_descending(t : T?) {
+def test_order_by_descending_take_emits_top_n_by_with_cmp(t : T?) {
+    // `order_by_descending(key) |> take(K)` with an inlineable key dispatches to the same
+    // top_n_by_with_cmp entry as ascending — direction is encoded by flipping the
+    // comparator argument order (`_::less(body[v2], body[v1])`), no `_descending` helper.
     ast_gc_guard() {
         var func = find_module_function_via_rtti(compiling_module(),
             @@target_order_by_descending_take_splices_fold)
@@ -1188,8 +1256,10 @@ def test_order_by_descending_take_emits_top_n_by_descending(t : T?) {
             return <- $e(body_expr)
         }
         t |> success(r.matched, "should have return expression")
-        t |> success(count_call(body_expr, "top_n_by_descending") >= 1,
-            "should emit a top_n_by_descending call")
+        t |> success(count_call(body_expr, "top_n_by_with_cmp") >= 1,
+            "should emit a top_n_by_with_cmp call (flipped comparator embeds direction)")
+        t |> equal(0, count_call(body_expr, "top_n_by_descending"),
+            "should not emit the key-taking top_n_by_descending")
         t |> equal(0, count_call(body_expr, "order_by_descending"),
             "should not emit order_by_descending")
     }
@@ -1197,7 +1267,8 @@ def test_order_by_descending_take_emits_top_n_by_descending(t : T?) {
 
 [test]
 def test_bare_order_by_emits_direct_call(t : T?) {
-    // Bare `order_by(key)` splices to a direct order_by call (no invoke wrapper, no top_n).
+    // Bare `order_by(key)` with an inlineable key dispatches to the asc `order(src, cmp)`
+    // block-taking overload — the key body is spliced into the comparator twice.
     ast_gc_guard() {
         var func = find_module_function_via_rtti(compiling_module(), @@target_bare_order_by_splices_fold)
         if (func == null) return
@@ -1206,14 +1277,16 @@ def test_bare_order_by_emits_direct_call(t : T?) {
             return <- $e(body_expr)
         }
         t |> success(r.matched, "should have return expression")
-        t |> success(count_call(body_expr, "order_by") >= 1, "should emit an order_by call")
+        t |> success(count_call(body_expr, "order") >= 1, "should emit an order(src, cmp) call")
+        t |> equal(0, count_call(body_expr, "order_by"), "should not emit the key-taking order_by")
         t |> equal(0, count_call(body_expr, "top_n_by"), "should not emit top_n_by")
     }
 }
 
 [test]
-def test_where_order_by_emits_fused_loop(t : T?) {
-    // `where |> order_by` splices into a fused prefilter loop + order_by_inplace on the buffer.
+def test_where_order_by_emits_fused_loop_with_inline_cmp(t : T?) {
+    // `where |> order_by(key)` splices into a fused prefilter loop + order_inplace(buf, cmp)
+    // when the key is inlineable — the comparator embeds the key body twice.
     ast_gc_guard() {
         var func = find_module_function_via_rtti(compiling_module(), @@target_where_order_by_splices_fold)
         if (func == null) return
@@ -1223,14 +1296,16 @@ def test_where_order_by_emits_fused_loop(t : T?) {
         }
         t |> success(r.matched && body_expr is ExprInvoke, "expected invoke wrapper for fused emission")
         t |> equal(1, count_inner_for_loops(body_expr), "single fused prefilter loop")
-        t |> success(count_call(body_expr, "order_by_inplace") >= 1, "should call order_by_inplace on the buffer")
+        t |> success(count_call(body_expr, "order_inplace") >= 1, "should call order_inplace(buf, cmp)")
+        t |> equal(0, count_call(body_expr, "order_by_inplace"), "should not emit the key-taking order_by_inplace")
         t |> success(count_call(body_expr, "push_clone") >= 1, "should push_clone into the buffer")
     }
 }
 
 [test]
-def test_where_order_by_take_emits_fused_top_n(t : T?) {
-    // `where |> order_by |> take(K)` splices into a fused prefilter loop + top_n_by on the buffer.
+def test_where_order_by_take_emits_fused_top_n_with_cmp(t : T?) {
+    // `where |> order_by(key) |> take(K)` splices into a fused prefilter loop +
+    // top_n_by_with_cmp(buf, K, cmp) when the key is inlineable.
     ast_gc_guard() {
         var func = find_module_function_via_rtti(compiling_module(),
             @@target_where_order_by_take_splices_fold)
@@ -1241,7 +1316,8 @@ def test_where_order_by_take_emits_fused_top_n(t : T?) {
         }
         t |> success(r.matched && body_expr is ExprInvoke, "expected invoke wrapper for fused emission")
         t |> equal(1, count_inner_for_loops(body_expr), "single fused prefilter loop")
-        t |> success(count_call(body_expr, "top_n_by") >= 1, "should call top_n_by on the buffer")
+        t |> success(count_call(body_expr, "top_n_by_with_cmp") >= 1, "should call top_n_by_with_cmp on the buffer")
+        t |> equal(0, count_call(body_expr, "top_n_by"), "should not emit non-cmp top_n_by")
         t |> success(count_call(body_expr, "push_clone") >= 1, "should push_clone into the buffer")
     }
 }
@@ -1366,3 +1442,113 @@ def test_select_where_to_array_correct_result(t : T?) {
     }
 }
 
+// ── Gap 2 — select + where projection dedup (single-eval splice) ───────
+
+[export, marker(no_coverage)]
+def target_select_where_sum_dedup() : int {
+    // Pure projection with a distinctive constant (7919, prime). After the splice the
+    // constant appears EXACTLY ONCE in the body — in the projection-bind statement.
+    // Without the dedup the constant would appear twice: once inlined into the predicate
+    // via peel-substitution, and once cloned into the sum's valueExpr.
+    return _fold(each([1, 2, 3, 4, 5])._select(_ * 7919)._where(_ > 0).sum())
+}
+
+[test]
+def test_select_where_projection_dedup(t : T?) {
+    // Single-eval splice: the projection constant 7919 must appear exactly once in the
+    // body — proves the projection is bound to a local once per element rather than
+    // inlined twice (once in predicate, once in valueExpr).
+    ast_gc_guard() {
+        var func = find_module_function_via_rtti(compiling_module(), @@target_select_where_sum_dedup)
+        if (func == null) return
+        var body_expr : ExpressionPtr
+        let r = qmatch_function(func) $() {
+            return $e(body_expr)
+        }
+        t |> success(r.matched, "should have return expression")
+        t |> equal(1, count_const_int(body_expr, 7919),
+            "projection constant should appear exactly once (single-eval dedup)")
+        t |> equal(1, count_inner_for_loops(body_expr), "single fused for-loop")
+        t |> equal(0, count_call(body_expr, "select"), "select should be inlined")
+        t |> equal(0, count_call(body_expr, "where_"), "where_ should be inlined")
+    }
+}
+
+[test]
+def test_select_where_dedup_correct_result(t : T?) {
+    t |> run("select+where dedup preserves arithmetic") @(t : T?) {
+        // [1,2,3,4,5] * 7919 = [7919, 15838, 23757, 31676, 39595], all > 0,
+        // sum = 7919 * (1+2+3+4+5) = 7919 * 15 = 118785
+        t |> equal(118785, target_select_where_sum_dedup())
+    }
+}
+
+[export, marker(no_coverage)]
+def target_select_where_impure_falls_to_tier2() : int {
+    // Impure projection (side-effecting function call): plain LINQ semantics still run it once
+    // per source element, rejected or not, so it must never be evaluated more than once per
+    // element. The splice arm bails to tier 2 cascade in this case.
+    return _fold(each([1, 2, 3, 4, 5])._select(side_effect_select_proj(_))._where(_ > 5).count())
+}
+
+[test]
+def test_select_where_impure_correctness(t : T?) {
+    t |> run("impure projection in select+where preserves per-element call count") @(t : T?) {
+        // Plain LINQ semantics: select runs the projection for EVERY element before the
+        // where filter sees the result. 5 input elements → 5 projection invocations,
+        // regardless of how many survive the filter.
+        g_select_count_proj_hits = 0
+        let n = target_select_where_impure_falls_to_tier2()
+        // projected: [2,4,6,8,10]; filtered >5: [6,8,10]; count = 3
+        t |> equal(3, n)
+        t |> equal(5, g_select_count_proj_hits, "projection must fire once per source element")
+    }
+}
+
+// ── Gap 1 — inlineable key fallback when key is side-effecting ───────────
+
+var g_order_key_hits = 0
+
+def side_effect_order_key(x : int) : int {
+    g_order_key_hits ++
+    return -x
+}
+
+[export, marker(no_coverage)]
+def target_order_by_take_side_effecting_key() : array<int> {
+    // Side-effecting key — try_make_inline_cmp's has_sideeffects gate must bail; the
+    // splice falls back to the keyed top_n_by entry point (not _with_cmp).
+    return <- _fold(each([3, 1, 4, 1, 5, 9, 2, 6])._order_by(side_effect_order_key(_)).take(3).to_array())
+}
+
+[test]
+def test_order_by_take_side_effecting_key_falls_back(t : T?) {
+    ast_gc_guard() {
+        var func = find_module_function_via_rtti(compiling_module(),
+            @@target_order_by_take_side_effecting_key)
+        if (func == null) return
+        var body_expr : ExpressionPtr
+        let r = qmatch_function(func) $() {
+            return <- $e(body_expr)
+        }
+        t |> success(r.matched, "should have return expression")
+        t |> success(count_call(body_expr, "top_n_by") >= 1,
+            "side-effecting key should fall back to keyed top_n_by")
+        t |> equal(0, count_call(body_expr, "top_n_by_with_cmp"),
+            "side-effecting key must NOT use the inline-cmp variant")
+    }
+}
+
+[test]
+def test_order_by_take_side_effecting_key_correct_result(t : T?) {
+    t |> run("side-effecting order_by key produces correct top-N by -key") @(t : T?) {
+        let got <- target_order_by_take_side_effecting_key()
+        // -key sorts ascending by -x ⇒ descending by x; take 3 largest. `equal` gets expected first, as elsewhere in this file.
+        let expected = [9, 6, 5]
+        t |> equal(length(expected), length(got))
+        for (v, e in got, expected) {
+            t |> equal(e, v)
+        }
+    }
+}
+