forksnd · pull · May 18, 2026 · May 15, 2026 · May 18, 2026 · May 18, 2026
diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
@@ -202,6 +202,17 @@ jobs:
     - name: "Install CMake and Ninja"
       uses: lukka/get-cmake@latest
 
+    - if: runner.os == 'Windows'
+      uses: ilammy/setup-nasm@v1 # need nasm for openssl
+
+    - name: "Install openssl windows"
+      if: runner.os == 'Windows'
+      run: |
+        git clone https://github.com/microsoft/vcpkg && ./vcpkg/bootstrap-vcpkg.sh
+        ./vcpkg/vcpkg install openssl:${{ matrix.architecture == 32 && 'x86' || 'x64' }}-windows --binarycaching
+        echo "VCPKG_ROOT=$(pwd)/vcpkg" >> $GITHUB_ENV
+        echo "CMAKE_TOOLCHAIN_FILE=$(pwd)/vcpkg/scripts/buildsystems/vcpkg.cmake" >> $GITHUB_ENV
+
     - name: "Install: Required Dev Packages"
       run: |
         set -eux
@@ -259,11 +270,13 @@ jobs:
                 ninja
                 ;;
               windows32)
-                cmake --no-warn-unused-cli -B./build -G "${{ matrix.cmake_generator }}" -T host=x64 -A ${{ matrix.architecture_string }}
+                export PATH="/c/Strawberry/perl/bin:$PATH" # prepend Strawberry perl to path, so openssl will use it.
+                cmake --no-warn-unused-cli -B./build -G "${{ matrix.cmake_generator }}" -T host=x64 -A ${{ matrix.architecture_string }} -DCMAKE_TOOLCHAIN_FILE="$VCPKG_ROOT/scripts/buildsystems/vcpkg.cmake"
                 cmake --build ./build --config ${{ matrix.cmake_preset }} --parallel
                 ;;
               windows64)
-                cmake --no-warn-unused-cli -B./build -G "${{ matrix.cmake_generator }}" -T host=x64 -A ${{ matrix.architecture_string }} -DDAS_LLVM_DISABLED=${{ env.das_llvm_disabled }}
+                export PATH="/c/Strawberry/perl/bin:$PATH" # prepend Strawberry perl to path, so openssl will use it.
+                cmake --no-warn-unused-cli -B./build -G "${{ matrix.cmake_generator }}" -T host=x64 -A ${{ matrix.architecture_string }} -DDAS_LLVM_DISABLED=${{ env.das_llvm_disabled }} -DCMAKE_TOOLCHAIN_FILE="$VCPKG_ROOT/scripts/buildsystems/vcpkg.cmake"
                 cmake --build ./build --config ${{ matrix.cmake_preset }} --parallel
                 ;;
               linux_arm*)

diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -16,7 +16,7 @@ set(CMAKE_CXX_STANDARD_REQUIRED ON)
 option(DAS_FLEX_BISON_DISABLED "Disable FLEX/BISON stage" OFF)
 option(DAS_CLANG_BIND_DISABLED "Disable dasClangBind (libclang bindings, C/C++ parsing)" ON)
 option(DAS_LLVM_DISABLED "Disable dasLLVM (llvm bindings)" ON)
-option(DAS_HV_DISABLED "Disable dasHV (websokets,http server and client)" ON)
+option(DAS_HV_DISABLED "Disable dasHV (websockets, http server and client)" OFF)
 option(DAS_GLFW_DISABLED "Disable dasGLFW (GLFW window for graphics apps)" OFF)
 option(DAS_AUDIO_DISABLED "Disable dasAudio (Miniaudio sound library)" OFF)
 option(DAS_STDDLG_DISABLED "Disable dasStdDlg (File new,open,save etc dialogs)" OFF)

diff --git a/benchmarks/sql/LINQ.md b/benchmarks/sql/LINQ.md
@@ -280,6 +280,26 @@ The order+take rows (`order_take_desc`, `sort_take`, `select_where_order_take`)
 
 `select_where_count` is the first **select+where** splice landing: previously rejected by the planner (the typer-inserted `ExprRef2Value` wrapper around `it` orphaned during substitution → `30921: can only dereference a reference`). The new `replaceVariablePeeling` helper in `templates_boost.das` peels the wrapper as part of the substitution, mirroring `ast_match`'s `qm_peel_ref2value`. All four terminator lanes (array / counter / accumulator / early-exit) covered.
 
+### Headline benchmarks (100K rows, INTERP, single-eval splice PR)
+
+Follow-up to the order-family + select+where landing. Closes two single-eval gaps the previous PR documented as `KNOWN PERF GAP`:
+
+**Gap 1 — comparator key double-call inside `partial_sort`.** Before this PR, `top_n_by(arr, K, key)` ran `_::less(key(v1), key(v2))` per comparison → 2 indirect lambda dispatches per `cmp`. For pure single-expression keys (the common case, e.g. `$(_) => _.price`), the planner now inlines the key body twice into a comparator block and dispatches to the new `top_n_by_with_cmp(arr, K) <| <cmp_block>` library entry — zero per-comparison lambda dispatch. Descending direction is encoded by flipping the comparator arg order (`less(body[v2], body[v1])`), eliminating the secondary wrapper-lambda the `_descending` family used. Falls back to keyed `top_n_by` when the key has side effects or isn't a single-expression lambda.
+
+**Gap 2 — projection double-eval in `select + where + terminator`.** Phase 3d inlined `projection` into `predicate` via peel-substitution; lane emitters then *also* cloned `projection` into `valueExpr` → projection evaluated twice per element on ARRAY / ACCUMULATOR / EARLY_EXIT lanes (COUNTER unaffected — no body use). Fix: the where-after-select arm now binds `projection` to a fresh local via a new `preConditionStmts` slot (evaluated per-element, OUTSIDE the if-wrap), then rewrites `projection` to reference that bind. Both predicate (via peel) and valueExpr (via clone) share the single eval. Side-effecty projections still bail to tier 2 (moving them outside the if would visibly fire side effects on filter-rejected elements). COUNTER lane is explicitly excluded — the dedup has no benefit there and the bind decl would regress the single-stmt fast path.
+
+| Benchmark | Shape | m1 (sql) | m3 (linq) | m3f (prev PR) | m3f (this PR) | Win |
+|---|---|---:|---:|---:|---:|---:|
+| sort_take | `order_by → take(K)` | 38 | 710 | 56 | **27** | **2.1× over prev / 26× over m3 / faster than m1 SQL** |
+| order_take_desc | `order_by_desc → take(K)` | 38 | 704 | 56 | **27** | **2.1× over prev / 26× over m3 / faster than m1 SQL** |
+| select_where_order_take | `where → order_by → take(K)` | 36 | 356 | 39 | **24** | **1.6× over prev / 15× over m3 / faster than m1 SQL** |
+| select_where_sum (NEW Gap 2) | `select → where → sum` | 37 | 59 | — | **7** | **8.4× over m3 / 5.3× over m1 SQL** |
+| select_where_count (regression check) | `select → where → count` (COUNTER, dedup off) | 32 | 58 | 5 | **5** | unchanged (correctly excluded from dedup) |
+
+The sort/order rows now BEAT `m1` SQLite by ~30%. PR #2707 closed the comparator-throughput gap vs SQL; this PR's inline-key splice closes the per-iteration lambda dispatch gap.
+
+**Parser bonus:** the multi-arg `$($i(a) : T, $i(b) : T) { ... }` qmacro form failed parse with `30701: block argument is already declared MACRO``TAG` because the parser stamped every `$i(...)` in block-arg position with the literal placeholder name and dup-checked them before macro tag resolution. Fixed in `src/parser/parser_impl.cpp:885` by skipping the dup check when `name_at.tag != nullptr` (genuine post-resolution dups surface as ordinary local-lookup conflicts during type inference). General-purpose fix — usable by any macro that needs to emit a typed block with N tagged-name args.
+
 ## Operator-coverage checklist (parity tests)
 
 The 24 benchmarks above cover the most common shapes. The end-game target is one benchmark per `_fold`-applicable scenario in the broader `tests/linq/` operator suite. Tracking the long-tail coverage below; PRs that add splice support for new operators should add a benchmark here if not already present.

diff --git a/benchmarks/sql/select_where_sum.das b/benchmarks/sql/select_where_sum.das
@@ -0,0 +1,69 @@
+options gen2
+options persistent_heap
+
+require _common public
+
+let THRESHOLD = 1000
+
+// _select(_.price * 2) |> _where(_ > T) |> sum — where-after-select pattern with the
+// sum terminator. SQL: SELECT SUM(price * 2) FROM Cars WHERE price * 2 > T.
+//
+// m3 (plain LINQ) materializes a projection iterator, then a filter array, then sums.
+// m3f (spliced via Phase 3d single-eval) binds the projection to a local once per
+// element OUTSIDE the if-wrap, then the rewritten predicate reads the bind AND the
+// sum's valueExpr reads the bind — projection (`_.price * 2`) evaluates exactly once
+// per element across the splice. Without the dedup it evaluates twice (once in the
+// inlined predicate, once in valueExpr) for ARRAY/ACCUMULATOR/EARLY_EXIT lanes.
+//
+// m1 uses ``query_scalar`` with raw SQL because ``_sql``'s ``_select`` clause only
+// accepts a single column or named-tuple shape — arbitrary scalar expressions like
+// ``_.price * 2`` aren't representable in the typed DSL. The engine still folds the
+// projection into both the WHERE filter and the SUM, so per-row work is identical to
+// what the typed form would emit.
+
+def run_m1(b : B?; n : int) { // m1: raw-SQL baseline against an in-memory SQLite database
+    with_sqlite(":memory:") $(db) {
+        db |> fixture_db(n)
+        b |> run("m1_sql/{n}", n) {
+            let total = query_scalar(db, "SELECT SUM(price * 2) FROM Cars WHERE price * 2 > {THRESHOLD}", type<int>)
+            if (total == 0) { // a zero sum triggers b->failNow()
+                b->failNow()
+            }
+        }
+    }
+}
+
+def run_m3(b : B?; n : int) { // m3: plain LINQ pipeline (no _fold) over an n-element fixture array
+    let arr <- fixture_array(n) // fixture is built before the block handed to run()
+    b |> run("m3_array/{n}", n) {
+        let s = arr |> _select(_.price * 2) |> _where(_ > THRESHOLD) |> sum
+        if (s == 0) { // a zero sum triggers b->failNow()
+            b->failNow()
+        }
+    }
+}
+
+def run_m3f(b : B?; n : int) { // m3f: same select/where/sum query as run_m3, wrapped in _fold
+    let arr <- fixture_array(n) // fixture is built before the block handed to run()
+    b |> run("m3f_array_fold/{n}", n) {
+        let s = _fold(each(arr)._select(_.price * 2)._where(_ > THRESHOLD).sum())
+        if (s == 0) { // a zero sum triggers b->failNow()
+            b->failNow()
+        }
+    }
+}
+
+[benchmark]
+def select_where_sum_m1(b : B?) { // raw-SQL variant at 100000 rows
+    b |> run_m1(100000)
+}
+
+[benchmark]
+def select_where_sum_m3(b : B?) { // plain LINQ variant at 100000 rows
+    b |> run_m3(100000)
+}
+
+[benchmark]
+def select_where_sum_m3f(b : B?) { // _fold variant at 100000 rows
+    b |> run_m3f(100000)
+}
diff --git a/daslib/linq.das b/daslib/linq.das
@@ -565,6 +565,51 @@ def top_n_descending(var a : iterator<auto(TT)>; n : int) : array<TT -const -&>
     return <- top_n_by_descending(a, n, $(v : TT -&) => v)
 }
 
+// ============================================================================
+// top_n_by_with_cmp — take a comparator block directly instead of a key
+// lambda. The splice planner uses this when an order-by key body is pure and
+// inlineable: the body is spliced into the comparator twice (once per side),
+// eliminating both the per-comparison lambda dispatch (`cmp(v1, v2)`) AND the
+// per-side `key(v)` dispatch nested inside the standard comparator.
+//
+// Direction is encoded in the comparator: asc emits `less(body[v1], body[v2])`,
+// desc emits `less(body[v2], body[v1])`. The output is sorted ascending by the
+// comparator, so a reversed comparator yields a descending natural-key order.
+// ============================================================================
+
+def top_n_by_with_cmp(arr : array<auto(TT)>; n : int; cmp : block<(v1 : TT -&, v2 : TT -&) : bool>) : array<TT -const -&> {
+    //! Clones ``arr`` and returns the ``n`` elements that come first under ``cmp``.
+    //! A reversed comparator therefore selects the ``n`` largest instead.
+    var picked : array<TT -const -&>
+    if (n <= 0 || empty(arr)) return <- picked
+    reserve(picked, length(arr))
+    for (elem in arr) {
+        push_clone(picked, elem)
+    }
+    let keep = min(n, length(arr))
+    sort_boost::partial_sort(picked, keep, cmp)
+    resize(picked, keep)
+    return <- picked
+}
+
+def top_n_by_with_cmp(var a : iterator<auto(TT)>; n : int; cmp : block<(v1 : TT -&, v2 : TT -&) : bool>) : array<TT -const -&> {
+    //! Iterator overload: one pass over ``a`` keeping at most ``n`` elements in a heap, then sorted by ``cmp``.
+    var kept : array<TT -const -&>
+    if (n <= 0) return <- kept
+    for (elem in a) {
+        if (length(kept) >= n && cmp(elem, kept[0])) {
+            sort_boost::pop_heap(kept, cmp)
+            kept[length(kept) - 1] := elem
+            sort_boost::push_heap(kept, cmp)
+        } elif (length(kept) < n) {
+            push_clone(kept, elem)
+            sort_boost::push_heap(kept, cmp)
+        }
+    }
+    kept |> sort(cmp)
+    return <- kept
+}
+
 def unique_key(a) {
     //! generates unique key of workhorse type for the value
     static_if (typeinfo is_workhorse(a)) {