diff --git a/README.md b/README.md index 52098cb..f2e7865 100644 --- a/README.md +++ b/README.md @@ -239,6 +239,16 @@ To run the SQL tests: make test ``` +### Running Benchmarks + +Performance benchmarks detect regressions. Run the full suite: + +```bash +make release && uv run python bench/bench.py +``` + +See [bench/README.md](bench/README.md) and [bench/PROFILING.md](bench/PROFILING.md) for filtering, profiling, and baseline management. + ## Contributing Contributions are welcome! Please: diff --git a/bench/PROFILING.md b/bench/PROFILING.md new file mode 100644 index 0000000..fdef053 --- /dev/null +++ b/bench/PROFILING.md @@ -0,0 +1,201 @@ +# Profiling Notes (Samply + DuckDB) + +This doc captures what we learned while profiling `json_extract_columns` so we can repeat it without surprises. + +## Build (symbols) + +Use a symbol-rich build for useful stacks: + +```bash +make reldebug +``` + +`make release` works, but stacks may show raw addresses if symbols are missing. + +## CPU sampling with Samply (no local server) + +To avoid launching Samply's local UI server, use `--save-only`. + +```bash +mkdir -p bench/results/samply +samply record --save-only --output bench/results/samply/.json.gz -- \ + uv run python bench/run_benchmarks.py --filter +``` + +Example: + +```bash +samply record --save-only \ + --output bench/results/samply/json_extract_columns-100k-many_patterns.json.gz -- \ + uv run python bench/run_benchmarks.py --filter json_extract_columns/100k/many_patterns +``` + +Notes: +- `--save-only` prevents starting the local web server. +- `--no-open` only avoids opening the UI; it can still start the server. 
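A profile saved with `--save-only` is just gzip-compressed JSON in the Firefox Profiler format, so it can be sanity-checked without starting any server. A minimal sketch (the `threads`/`samples` layout is the same one `bench/analyze_profile.py` reads; the helper names here are our own):

```python
import gzip
import json

def load_profile(path):
    # --save-only output is gzip-compressed Firefox Profiler JSON.
    with gzip.open(path, "rt") as f:
        return json.load(f)

def thread_sample_counts(profile):
    # One (thread name, sample count) pair per thread.
    return [
        (t.get("name", "?"), len(t.get("samples", {}).get("stack", [])))
        for t in profile.get("threads", [])
    ]
```

A near-empty thread list or all-zero sample counts usually means the benchmarked command exited before samply could collect samples.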
+ +## Offline symbolization (optional) + +If you want symbols available later (even without the original binaries), add: + +```bash +samply record --save-only --unstable-presymbolicate \ + --output bench/results/samply/.json.gz -- \ + uv run python bench/run_benchmarks.py --filter +``` + +This emits a sidecar file next to the profile: + +``` +bench/results/samply/.json.syms.json +``` + +`--unstable-presymbolicate` is marked unstable by Samply, but it is useful when +you need symbols after moving the profile. + +## Viewing a saved profile + +Start the server without auto-opening a browser: + +```bash +samply load --no-open bench/results/samply/.json.gz +``` + +Then open `http://127.0.0.1:3000` manually (or the Firefox Profiler URL printed +by samply). + +If a `.syms.json` sidecar exists in the same directory, Samply uses it for +symbolization. + +## Analyzing profiles programmatically + +Use `bench/analyze_profile.py` to extract function timings from profiles. +Requires `--unstable-presymbolicate` when recording to generate the `.syms.json` sidecar. 
+ +### Basic usage + +```bash +python3 bench/analyze_profile.py bench/results/samply/.json.gz +``` + +### Options + +| Option | Description | +|--------|-------------| +| `--top N` | Show top N functions (default: 30) | +| `--filter STRING` | Filter functions containing STRING (case-insensitive) | +| `--thread NAME` | Analyze specific thread (default: thread with most samples) | + +### Examples + +```bash +# Basic analysis - shows all threads, then top functions by self/inclusive time +python3 bench/analyze_profile.py bench/results/samply/json_group_merge.json.gz + +# Filter for json-related functions only +python3 bench/analyze_profile.py --filter json --top 20 + +# Analyze a specific thread (useful when multiple workers) +python3 bench/analyze_profile.py --thread python3 + +# Show more results +python3 bench/analyze_profile.py --top 50 +``` + +### Output format + +The script outputs two sections: + +**Self time**: Time spent directly in each function (excluding callees). +Useful for finding CPU-intensive functions. + +``` +=== Self time (top 30) === + 30.3% 2335 duckdb::JsonGroupMergeApplyPatchInternal + 26.2% 2015 duckdb::yyjson_mut_obj_iter_next + 13.6% 1046 _platform_memcmp +``` + +**Inclusive time**: Time spent in each function including all callees. +Useful for finding hot call paths. 
+ +``` +=== Inclusive time (top 30) === + 79.0% 6078 duckdb::AggregateFunction::UnaryScatterUpdate + 40.8% 3143 duckdb::JsonGroupMergeApplyPatchInternal +``` + +### Symbol file structure + +The `.syms.json` sidecar (generated by `--unstable-presymbolicate`): + +```json +{ + "string_table": ["symbol1", "symbol2", ...], + "data": [ + { + "debug_name": "duckdb", + "symbol_table": [ + {"rva": 8960, "size": 624, "symbol": 2} + ] + } + ] +} +``` + +- `string_table`: function names indexed by symbol_table entries +- `data[].debug_name`: library name (e.g., "duckdb", "libc") +- `data[].symbol_table`: maps RVA ranges to symbol indices +- Profile's `frameTable.address` contains RVAs to look up + +### Troubleshooting + +**"Error: syms file not found"** +Re-record with `--unstable-presymbolicate`: +```bash +samply record --save-only --unstable-presymbolicate --output .json.gz -- +``` + +**Functions showing as `<unknown>` or `fun_XXXXXX`** +Symbols not found. Possible causes: +- Build without debug symbols (use `make reldebug`) +- System libraries without debug packages +- Binary stripped after recording + +## Attaching to an existing process + +On Linux, you can attach by PID: + +```bash +samply record -p +``` + +On macOS, attaching to a running process requires: + +```bash +samply setup +``` + +(This codesigns the binary so it can attach.)
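As a cross-check of the sidecar layout shown under "Symbol file structure", a few lines of Python can resolve a single address by hand. This is a sketch: `resolve_rva` is a hypothetical helper of our own, and the field names follow the example JSON above:

```python
import json

def resolve_rva(syms_path, debug_name, rva):
    # Walk the sidecar: find the library, then the symbol whose
    # [rva, rva + size) range contains the address.
    with open(syms_path) as f:
        syms = json.load(f)
    strings = syms["string_table"]
    for lib in syms["data"]:
        if lib["debug_name"] != debug_name:
            continue
        for entry in lib["symbol_table"]:
            start = entry["rva"]
            if start <= rva < start + entry.get("size", 0):
                return strings[entry["symbol"]]
    return None
```

This mirrors what `bench/analyze_profile.py` does in bulk; a `None` result for an address you expected to resolve points at the same causes as the troubleshooting list above.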
+ +## DuckDB query profiles (not CPU sampling) + +To collect DuckDB's JSON query profile: + +```bash +uv run python bench/run_benchmarks.py --profile --filter +``` + +This writes: + +``` +bench/results/profiles//query_profile.json +``` + +## Benchmark outputs + +`run_benchmarks.py` always writes timing results to: + +``` +bench/results/latest.json +``` diff --git a/bench/README.md b/bench/README.md index b541fb0..cd31c4c 100644 --- a/bench/README.md +++ b/bench/README.md @@ -15,6 +15,31 @@ uv run python bench/bench.py uv run python bench/compare_results.py --save-baseline ``` +## Architecture + +### Script Relationships + +``` +bench.py (orchestrator) +├─ ensure_data_exists() → generate_data.py +├─ run_sanity_checks() → sanity_checks.py +├─ run_benchmarks() → run_benchmarks.py +└─ run_comparison() → compare_results.py +``` + +| Script | Purpose | +|--------|---------| +| `bench.py` | One-command pipeline: generates data, validates, benchmarks, compares | +| `run_benchmarks.py` | Runs benchmarks with filtering/profiling options | +| `compare_results.py` | Compares latest vs baseline, detects regressions | +| `generate_data.py` | Creates deterministic synthetic datasets | +| `sanity_checks.py` | Validates data row counts and schema | +| `config.py` | Centralized configuration (sizes, scenarios, thresholds) | + +**When to use which:** +- `bench.py` — Full pipeline, no options. Use for CI and general validation. +- `run_benchmarks.py` — Targeted runs with `--filter` and `--profile`. Use for investigation. + ## Filtering Benchmarks Use `--filter` with substring matching to run specific benchmarks: @@ -46,6 +71,11 @@ All artifacts are in `bench/results/`: | `diff.json` | Comparison between latest and baseline | | `profiles//` | DuckDB query profiles (when collected) | +## Profiling + +DuckDB query profiles are collected via `--profile`. For CPU sampling with Samply +(and `--save-only` to avoid a local server), see `bench/PROFILING.md`. 
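All of these artifacts are plain JSON, so quick ad-hoc inspection needs no extra tooling. A sketch, assuming the `results` entries carry the `id` and `median_ms` fields used by `baseline.json` (the `slowest` helper is our own, not part of the bench scripts):

```python
import json

def slowest(results_path, n=5):
    # Top-n cases by median time from a results file such as
    # bench/results/latest.json.
    with open(results_path) as f:
        data = json.load(f)
    cases = [(r["id"], r["median_ms"]) for r in data["results"]]
    return sorted(cases, key=lambda c: -c[1])[:n]
```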
+ ## Interpreting Results ### Statuses @@ -63,7 +93,11 @@ All artifacts are in `bench/results/`: - **tolerance_pct** (default: 5%): Minimum percentage change to be considered significant - **min_effect_ms** (default: 5ms): Minimum absolute change to be considered significant -A change is `UNCHANGED` if it's within tolerance% OR below min_effect_ms. +A change is classified as `UNCHANGED` if **either**: +- Absolute change < min_effect_ms, OR +- Percentage change ≤ tolerance_pct + +Both conditions protect against noise: small absolute changes in fast queries and small percentage changes in slow queries. ## Baseline Rules @@ -100,6 +134,19 @@ uv run python bench/run_benchmarks.py --profile Profiles are saved to `bench/results/profiles//query_profile.json`. +## Sanity Checks + +Before running benchmarks, `bench.py` validates data integrity: + +1. **Row count** — Each file has exactly the expected rows (1k, 10k, 100k) +2. **Schema** — Required columns exist: `json_nested`, `json_flat`, `g1e1`, `g1e3`, `g1e4` + +If checks fail, regenerate data: + +```bash +uv run python bench/generate_data.py +``` + ## Data Generation Data is auto-generated on first run. To regenerate manually: @@ -108,6 +155,45 @@ Data is auto-generated on first run. To regenerate manually: uv run python bench/generate_data.py ``` +### Dataset Structure + +Each parquet file contains: + +| Column | Description | +|--------|-------------| +| `json_nested` | Hierarchical JSON with 1-5 levels of nesting | +| `json_flat` | Flattened dot-notation version | +| `g1e1` | Group key with ~10 unique values | +| `g1e3` | Group key with ~1,000 unique values | +| `g1e4` | Group key with ~10,000 unique values | + +Data is deterministic (seed=42) and reproducible across runs. + Dataset sizes are defined in `bench/config.py`: +- `1k`: 1,000 rows - `10k`: 10,000 rows - `100k`: 100,000 rows + +## Adding New Benchmarks + +1. **Define scenario in `config.py`:** + ```python + SCENARIOS = [ + # ...existing scenarios... 
+ {"function": "json_new_fn", "scenario": "basic"}, + ] + ``` + +2. **Add query builder in `run_benchmarks.py`:** + ```python + case "json_new_fn": + return f"SELECT sum(length(CAST(json_new_fn(json_nested) AS VARCHAR))) FROM {table}" + ``` + +3. **Run and save baseline:** + ```bash + uv run python bench/run_benchmarks.py --filter json_new_fn + uv run python bench/compare_results.py --save-baseline + ``` + +Cases are auto-discovered from `SIZES × SCENARIOS` (currently 27 cases). diff --git a/bench/analyze_profile.py b/bench/analyze_profile.py new file mode 100644 index 0000000..16c1005 --- /dev/null +++ b/bench/analyze_profile.py @@ -0,0 +1,177 @@ +#!/usr/bin/env python3 +"""Analyze samply profiles programmatically. + +Usage: + python bench/analyze_profile.py bench/results/samply/.json.gz [--thread ] + +Requires --unstable-presymbolicate when recording: + samply record --save-only --unstable-presymbolicate --output .json.gz -- +""" + +import argparse +import gzip +import json +from collections import defaultdict +from pathlib import Path + + +def load_profile(profile_path: Path): + with gzip.open(profile_path, "rt") as f: + return json.load(f) + + +def load_symbols(syms_path: Path): + with open(syms_path, "r") as f: + return json.load(f) + + +def build_symbol_lookup(syms: dict) -> dict: + """Build address -> symbol lookup per library.""" + string_table = syms.get("string_table", []) + lib_symbols = {} + for entry in syms.get("data", []): + debug_name = entry.get("debug_name") + symbols = [] + for s in entry.get("symbol_table", []): + rva = s["rva"] + size = s.get("size", 0) + sym_idx = s["symbol"] + sym_name = string_table[sym_idx] if sym_idx < len(string_table) else f"<{sym_idx}>" + symbols.append((rva, rva + size, sym_name)) + symbols.sort(key=lambda x: x[0]) + lib_symbols[debug_name] = symbols + return lib_symbols + + +def lookup_symbol(symbols: list, addr: int) -> str | None: + for rva_start, rva_end, name in symbols: + if rva_start <= addr < rva_end: + return 
name + return None + + +def analyze_thread(thread: dict, lib_symbols: dict, lib_debug_names: dict) -> tuple[dict, dict]: + """Analyze a thread and return (self_samples, inclusive_samples) dicts.""" + samples = thread.get("samples", {}) + stack_table = thread.get("stackTable", {}) + frame_table = thread.get("frameTable", {}) + func_table = thread.get("funcTable", {}) + resource_table = thread.get("resourceTable", {}) + + sample_stacks = samples.get("stack", []) + stack_frame = stack_table.get("frame", []) + stack_prefix = stack_table.get("prefix", []) + frame_func = frame_table.get("func", []) + frame_address = frame_table.get("address", []) + func_resource = func_table.get("resource", []) + res_lib = resource_table.get("lib", []) + + # Build frame_idx -> symbol_name lookup + frame_symbols = {} + for frame_idx in range(len(frame_func)): + func_idx = frame_func[frame_idx] + addr = frame_address[frame_idx] if frame_idx < len(frame_address) else None + if func_idx < len(func_resource): + res_idx = func_resource[func_idx] + if res_idx is not None and res_idx < len(res_lib): + lib_idx = res_lib[res_idx] + debug_name = lib_debug_names.get(lib_idx) + if debug_name and debug_name in lib_symbols and addr is not None: + sym = lookup_symbol(lib_symbols[debug_name], addr) + if sym: + frame_symbols[frame_idx] = sym + continue + frame_symbols[frame_idx] = f"<unknown>" + + # Count samples + self_samples = defaultdict(int) + inclusive_samples = defaultdict(int) + + for stack_idx in sample_stacks: + if stack_idx is None: + continue + seen = set() + current = stack_idx + is_leaf = True + while current is not None: + frame_idx = stack_frame[current] + name = frame_symbols.get(frame_idx, f"<unknown>") + if is_leaf: + self_samples[name] += 1 + is_leaf = False + if name not in seen: + inclusive_samples[name] += 1 + seen.add(name) + current = stack_prefix[current] + + return dict(self_samples), dict(inclusive_samples) + + +def main() -> None: + parser = argparse.ArgumentParser(description="Analyze samply
profiles") + parser.add_argument("profile", type=Path, help="Path to profile.json.gz") + parser.add_argument("--thread", default=None, help="Thread name to analyze (default: largest)") + parser.add_argument("--top", type=int, default=30, help="Number of top functions to show") + parser.add_argument("--filter", default=None, help="Filter functions containing this string") + args = parser.parse_args() + + syms_path = args.profile.with_suffix("").with_suffix(".json.syms.json") + if not syms_path.exists(): + print(f"Error: syms file not found: {syms_path}") + print("Re-record with --unstable-presymbolicate") + raise SystemExit(2) + + profile = load_profile(args.profile) + syms = load_symbols(syms_path) + lib_symbols = build_symbol_lookup(syms) + + libs = profile.get("libs", []) + lib_debug_names = {i: lib.get("debugName", lib.get("name", f"lib{i}")) for i, lib in enumerate(libs)} + + # Find thread + threads = profile.get("threads", []) + print("Threads:") + for t in threads: + sample_count = len(t.get("samples", {}).get("stack", [])) + print(f" {t.get('name', '')}: {sample_count} samples") + + # Select thread + if args.thread: + thread = next((t for t in threads if t.get("name") == args.thread), None) + if not thread: + print(f"Error: thread '{args.thread}' not found") + raise SystemExit(2) + else: + thread = max(threads, key=lambda t: len(t.get("samples", {}).get("stack", []))) + print(f"\nAnalyzing thread: {thread.get('name')}") + + self_samples, inclusive_samples = analyze_thread(thread, lib_symbols, lib_debug_names) + total = sum(self_samples.values()) + print(f"Total samples: {total}\n") + + def shorten(name: str, maxlen: int = 70) -> str: + short = name.split("(")[0] if "(" in name else name + return short[:maxlen - 3] + "..." 
if len(short) > maxlen else short + + def matches_filter(name: str) -> bool: + if not args.filter: + return True + return args.filter.lower() in name.lower() + + print(f"=== Self time (top {args.top}) ===") + for name, count in sorted(self_samples.items(), key=lambda x: -x[1])[:args.top]: + if not matches_filter(name): + continue + pct = 100 * count / total if total else 0 + print(f"{pct:5.1f}% {count:5d} {shorten(name)}") + + print(f"\n=== Inclusive time (top {args.top}) ===") + for name, count in sorted(inclusive_samples.items(), key=lambda x: -x[1])[:args.top]: + if not matches_filter(name): + continue + pct = 100 * count / total if total else 0 + print(f"{pct:5.1f}% {count:5d} {shorten(name)}") + + +if __name__ == "__main__": + main() diff --git a/bench/compare_results.py b/bench/compare_results.py index fd5c6f3..dd16457 100644 --- a/bench/compare_results.py +++ b/bench/compare_results.py @@ -75,12 +75,22 @@ def diff_environment(baseline_env: dict | None, latest_env: dict | None) -> dict return diffs +def format_multiplier(baseline_ms: float, current_ms: float) -> str: + """Format speed change as multiplier (e.g., '2.00× faster', '1.50× slower').""" + if current_ms > baseline_ms: + mult = current_ms / baseline_ms + return f"{mult:.2f}× slower" + else: + mult = baseline_ms / current_ms + return f"{mult:.2f}× faster" + + def get_top_cases(cases: list[dict], status: str, n: int = 3) -> list[dict]: - """Get top N cases by absolute diff_pct for given status.""" - filtered = [c for c in cases if c["status"] == status and "diff_pct" in c] - sorted_cases = sorted(filtered, key=lambda c: abs(c["diff_pct"]), reverse=True) + """Get top N cases by multiplier magnitude for given status.""" + filtered = [c for c in cases if c["status"] == status and "multiplier" in c] + sorted_cases = sorted(filtered, key=lambda c: c["multiplier"], reverse=True) return [ - {"id": c["id"], "diff_pct": c["diff_pct"], "diff_ms": c["diff_ms"]} + {"id": c["id"], "multiplier": c["multiplier"], 
"multiplier_str": c["multiplier_str"]} for c in sorted_cases[:n] ] @@ -123,9 +133,12 @@ def generate_diff( "current_ms": current_ms, } - if baseline_ms is not None and current_ms is not None: - case["diff_ms"] = round(current_ms - baseline_ms, 2) - case["diff_pct"] = round((current_ms - baseline_ms) / baseline_ms * 100, 2) if baseline_ms != 0 else 0 + if baseline_ms is not None and current_ms is not None and baseline_ms != 0: + if current_ms > baseline_ms: + case["multiplier"] = round(current_ms / baseline_ms, 2) + else: + case["multiplier"] = round(baseline_ms / current_ms, 2) + case["multiplier_str"] = format_multiplier(baseline_ms, current_ms) cases.append(case) @@ -134,10 +147,10 @@ def generate_diff( regressions = statuses.count("SLOWER") improvements = statuses.count("FASTER") - worst_regression = 0.0 + worst_regression_mult = 1.0 for c in cases: - if c["status"] == "SLOWER" and c.get("diff_pct", 0) > worst_regression: - worst_regression = c["diff_pct"] + if c["status"] == "SLOWER" and c.get("multiplier", 1.0) > worst_regression_mult: + worst_regression_mult = c["multiplier"] env_diff = diff_environment(baseline.get("environment"), latest.get("environment")) @@ -156,7 +169,7 @@ def generate_diff( "unchanged": statuses.count("UNCHANGED"), "missing_in_baseline": statuses.count("MISSING_IN_BASELINE"), "missing_in_latest": statuses.count("MISSING_IN_LATEST"), - "worst_regression_pct": worst_regression, + "worst_regression_multiplier": worst_regression_mult, "top_regressions": get_top_cases(cases, "SLOWER", 3), "top_improvements": get_top_cases(cases, "FASTER", 3), }, @@ -178,43 +191,37 @@ def print_diff(diff: dict, baseline: dict, latest: dict) -> None: print(f" - {key}: {val['baseline']} -> {val['current']}") print() - # Top regressions - top_reg = diff["summary"].get("top_regressions", []) - if top_reg: - print("Top regressions:") - for c in top_reg: + # Regressions (sorted by multiplier, most impactful first) + regressions = [c for c in diff["cases"] if 
c["status"] == "SLOWER"] + regressions.sort(key=lambda c: c.get("multiplier", 1.0), reverse=True) + if regressions: + print("Regressions:") + for c in regressions: profile_path = get_profile_path(c["id"]) profile_str = f" [profile: {profile_path}]" if profile_path else "" - print(f" SLOWER {c['id']:<45} +{c['diff_pct']:.1f}% (+{c['diff_ms']:.1f}ms){profile_str}") + print(f" {c['id']:<50} {c['multiplier_str']}{profile_str}") print() - # Top improvements - top_imp = diff["summary"].get("top_improvements", []) - if top_imp: - print("Top improvements:") - for c in top_imp: + # Improvements (sorted by multiplier, most impactful first) + improvements = [c for c in diff["cases"] if c["status"] == "FASTER"] + improvements.sort(key=lambda c: c.get("multiplier", 1.0), reverse=True) + if improvements: + print("Improvements:") + for c in improvements: profile_path = get_profile_path(c["id"]) profile_str = f" [profile: {profile_path}]" if profile_path else "" - print(f" FASTER {c['id']:<45} {c['diff_pct']:.1f}% ({c['diff_ms']:.1f}ms){profile_str}") + print(f" {c['id']:<50} {c['multiplier_str']}{profile_str}") print() - # Other notable changes (not in top-3) - top_ids = {c["id"] for c in top_reg + top_imp} - other_changes = [c for c in diff["cases"] if c["status"] in ("SLOWER", "FASTER", "MISSING_IN_BASELINE", "MISSING_IN_LATEST") and c["id"] not in top_ids] - - if other_changes: - print("Other changes:") - for case in other_changes: - status = case["status"] - case_id = case["id"] - if status == "SLOWER": - print(f" SLOWER {case_id:<45} +{case['diff_pct']:.1f}% (+{case['diff_ms']:.1f}ms)") - elif status == "FASTER": - print(f" FASTER {case_id:<45} {case['diff_pct']:.1f}% ({case['diff_ms']:.1f}ms)") - elif status == "MISSING_IN_BASELINE": - print(f" NEW {case_id:<45} (not in baseline)") - elif status == "MISSING_IN_LATEST": - print(f" REMOVED {case_id:<45} (not in latest)") + # New/removed cases + new_cases = [c for c in diff["cases"] if c["status"] == "MISSING_IN_BASELINE"] + 
removed_cases = [c for c in diff["cases"] if c["status"] == "MISSING_IN_LATEST"] + if new_cases or removed_cases: + print("New/removed:") + for c in new_cases: + print(f" {c['id']:<50} (new)") + for c in removed_cases: + print(f" {c['id']:<50} (removed)") print() s = diff["summary"] diff --git a/bench/config.py b/bench/config.py index ba2c68f..5a9a46a 100644 --- a/bench/config.py +++ b/bench/config.py @@ -8,11 +8,12 @@ EXTENSION_PATH = PROJECT_ROOT / "build" / "release" / "extension" / "json_tools" / "json_tools.duckdb_extension" SIZES = { + "1k": 1_000, "10k": 10_000, "100k": 100_000, } -DEFAULT_RUNS = 5 +DEFAULT_RUNS = 10 DEFAULT_PROFILE_RUNS = 1 DEFAULT_TOLERANCE_PCT = 5 SCHEMA_VERSION = 2 @@ -51,10 +52,45 @@ SCENARIOS = [ {"function": "json_flatten", "scenario": "basic"}, {"function": "json_add_prefix", "scenario": "basic"}, - {"function": "json_extract_columns", "scenario": "few_patterns"}, - {"function": "json_extract_columns", "scenario": "many_patterns"}, - {"function": "json_group_merge", "scenario": "few_groups"}, # g1e1, 10 groups - {"function": "json_group_merge", "scenario": "medium_groups"}, # g1e3, 1000 groups - {"function": "json_group_merge", "scenario": "many_groups"}, # g1e4, 10000 groups - {"function": "json_group_merge", "scenario": "ignore_nulls"}, # g1e3 with IGNORE NULLS + { + # 5 patterns: 2 exact, 2 prefix, 1 suffix + "function": "json_extract_columns", + "scenario": "few_patterns", + "patterns": { + "s1": "^s1$", # exact + "s2": "^s2$", # exact + "all_strings": "^s", # prefix + "all_numbers": "^n", # prefix + "counts": r"_count$", # suffix (fallback) + }, + }, + { + # 14 patterns: 5 exact, 4 prefix, 5 regex/suffix + "function": "json_extract_columns", + "scenario": "medium_patterns", + "patterns": { + **{f"exact_s{i}": f"^s{i}$" for i in range(1, 6)}, # 5 exact + **{f"prefix_{c}": f"^{c}" for c in ["n", "o", "g", "a"]}, # 4 prefix + "nested_s": r"o1\.s\d+", # regex + "nested_n": r"o1\.n\d+", # regex + "any_nested": r"\.\w+$", # regex + 
"ids": r"_id$", # suffix + "counts": r"_count$", # suffix + }, + }, + { + # 100 patterns: 40 exact, 30 prefix, 30 regex + "function": "json_extract_columns", + "scenario": "many_patterns", + "patterns": { + **{f"exact_{i:02d}": f"^s{i}$" for i in range(1, 41)}, # 40 exact + **{f"prefix_{i:02d}": f"^p{i:02d}" for i in range(1, 31)}, # 30 prefix + **{f"regex_{i:02d}": f"r{i:02d}.*" for i in range(1, 31)}, # 30 regex + }, + }, + {"function": "json_group_merge", "scenario": "few_groups", "group_col": "g1e1", "merge_opts": ""}, + {"function": "json_group_merge", "scenario": "medium_groups", "group_col": "g1e3", "merge_opts": ""}, + {"function": "json_group_merge", "scenario": "many_groups", "group_col": "g1e4", "merge_opts": ""}, + {"function": "json_group_merge", "scenario": "ignore_nulls", "group_col": "g1e3", "merge_opts": ", 'IGNORE NULLS'"}, + {"function": "json_group_merge", "scenario": "delete_nulls", "group_col": "g1e3", "merge_opts": ", 'DELETE NULLS'"}, ] diff --git a/bench/results/.gitignore b/bench/results/.gitignore index eb9084a..8cb4b40 100644 --- a/bench/results/.gitignore +++ b/bench/results/.gitignore @@ -1,3 +1,3 @@ -latest.json -diff.json +*.json +!baseline.json profiles/ diff --git a/bench/results/baseline.json b/bench/results/baseline.json index 03af884..814c6fd 100644 --- a/bench/results/baseline.json +++ b/bench/results/baseline.json @@ -1,6 +1,6 @@ { "schema_version": 2, - "generated_at": "2026-01-24T18:06:04+00:00", + "generated_at": "2026-01-25T04:40:13+00:00", "environment": { "build_type": "release", "os": "Darwin", @@ -8,14 +8,15 @@ "cpu_model": "Apple M2 Max", "cpu_cores": 12, "duckdb_version": "v1.4.3", - "git_commit": "9579af5f558641f719b7fcc514b293bc37040323", + "git_commit": "bd4c349885e77c9da00f3a846928314f0058cc91", "git_dirty": true, "duckdb_python_package_version": "1.4.3", "dataset": { "seed": 42, "sizes": [ "100k", - "10k" + "10k", + "1k" ], "complexity_weights": [ 0.2, @@ -27,169 +28,309 @@ } }, "config": { - "runs": 5, + 
"runs": 10, "warmup": true }, "results": [ + { + "id": "json_flatten/1k/basic", + "function": "json_flatten", + "scenario": "basic", + "size": "1k", + "min_ms": 1.51, + "median_ms": 1.6, + "max_ms": 1.79, + "runs": 10 + }, + { + "id": "json_add_prefix/1k/basic", + "function": "json_add_prefix", + "scenario": "basic", + "size": "1k", + "min_ms": 1.42, + "median_ms": 1.52, + "max_ms": 1.6, + "runs": 10 + }, + { + "id": "json_extract_columns/1k/few_patterns", + "function": "json_extract_columns", + "scenario": "few_patterns", + "size": "1k", + "min_ms": 1.31, + "median_ms": 1.37, + "max_ms": 1.44, + "runs": 10 + }, + { + "id": "json_extract_columns/1k/medium_patterns", + "function": "json_extract_columns", + "scenario": "medium_patterns", + "size": "1k", + "min_ms": 2.11, + "median_ms": 2.27, + "max_ms": 2.36, + "runs": 10 + }, + { + "id": "json_extract_columns/1k/many_patterns", + "function": "json_extract_columns", + "scenario": "many_patterns", + "size": "1k", + "min_ms": 3.58, + "median_ms": 3.84, + "max_ms": 4.02, + "runs": 10 + }, + { + "id": "json_group_merge/1k/few_groups", + "function": "json_group_merge", + "scenario": "few_groups", + "size": "1k", + "min_ms": 2.19, + "median_ms": 2.31, + "max_ms": 2.41, + "runs": 10 + }, + { + "id": "json_group_merge/1k/medium_groups", + "function": "json_group_merge", + "scenario": "medium_groups", + "size": "1k", + "min_ms": 3.6, + "median_ms": 3.74, + "max_ms": 3.84, + "runs": 10 + }, + { + "id": "json_group_merge/1k/many_groups", + "function": "json_group_merge", + "scenario": "many_groups", + "size": "1k", + "min_ms": 3.55, + "median_ms": 3.61, + "max_ms": 3.73, + "runs": 10 + }, + { + "id": "json_group_merge/1k/ignore_nulls", + "function": "json_group_merge", + "scenario": "ignore_nulls", + "size": "1k", + "min_ms": 2.81, + "median_ms": 2.91, + "max_ms": 3.05, + "runs": 10 + }, + { + "id": "json_group_merge/1k/delete_nulls", + "function": "json_group_merge", + "scenario": "delete_nulls", + "size": "1k", + "min_ms": 
3.47, + "median_ms": 3.62, + "max_ms": 3.73, + "runs": 10 + }, { "id": "json_flatten/10k/basic", "function": "json_flatten", "scenario": "basic", "size": "10k", - "min_ms": 14.19, - "median_ms": 14.37, - "max_ms": 14.68, - "runs": 5 + "min_ms": 13.84, + "median_ms": 14.41, + "max_ms": 14.56, + "runs": 10 }, { "id": "json_add_prefix/10k/basic", "function": "json_add_prefix", "scenario": "basic", "size": "10k", - "min_ms": 13.85, - "median_ms": 14.24, - "max_ms": 14.52, - "runs": 5 + "min_ms": 13.43, + "median_ms": 13.6, + "max_ms": 13.79, + "runs": 10 }, { "id": "json_extract_columns/10k/few_patterns", "function": "json_extract_columns", "scenario": "few_patterns", "size": "10k", - "min_ms": 37.16, - "median_ms": 37.37, - "max_ms": 37.88, - "runs": 5 + "min_ms": 10.95, + "median_ms": 11.13, + "max_ms": 11.36, + "runs": 10 + }, + { + "id": "json_extract_columns/10k/medium_patterns", + "function": "json_extract_columns", + "scenario": "medium_patterns", + "size": "10k", + "min_ms": 19.06, + "median_ms": 19.3, + "max_ms": 19.76, + "runs": 10 }, { "id": "json_extract_columns/10k/many_patterns", "function": "json_extract_columns", "scenario": "many_patterns", "size": "10k", - "min_ms": 86.51, - "median_ms": 86.92, - "max_ms": 87.61, - "runs": 5 + "min_ms": 32.95, + "median_ms": 33.19, + "max_ms": 34.12, + "runs": 10 }, { "id": "json_group_merge/10k/few_groups", "function": "json_group_merge", "scenario": "few_groups", "size": "10k", - "min_ms": 199.68, - "median_ms": 203.07, - "max_ms": 206.52, - "runs": 5 + "min_ms": 16.49, + "median_ms": 16.75, + "max_ms": 17.22, + "runs": 10 }, { "id": "json_group_merge/10k/medium_groups", "function": "json_group_merge", "scenario": "medium_groups", "size": "10k", - "min_ms": 82.96, - "median_ms": 83.15, - "max_ms": 86.98, - "runs": 5 + "min_ms": 25.82, + "median_ms": 26.19, + "max_ms": 27.85, + "runs": 10 }, { "id": "json_group_merge/10k/many_groups", "function": "json_group_merge", "scenario": "many_groups", "size": "10k", - 
"min_ms": 34.28, - "median_ms": 34.88, - "max_ms": 36.67, - "runs": 5 + "min_ms": 32.64, + "median_ms": 33.41, + "max_ms": 52.48, + "runs": 10 }, { "id": "json_group_merge/10k/ignore_nulls", "function": "json_group_merge", "scenario": "ignore_nulls", "size": "10k", - "min_ms": 33.79, - "median_ms": 34.57, - "max_ms": 35.34, - "runs": 5 + "min_ms": 16.98, + "median_ms": 17.33, + "max_ms": 18.1, + "runs": 10 + }, + { + "id": "json_group_merge/10k/delete_nulls", + "function": "json_group_merge", + "scenario": "delete_nulls", + "size": "10k", + "min_ms": 26.01, + "median_ms": 26.29, + "max_ms": 26.97, + "runs": 10 }, { "id": "json_flatten/100k/basic", "function": "json_flatten", "scenario": "basic", "size": "100k", - "min_ms": 144.09, - "median_ms": 144.72, - "max_ms": 145.21, - "runs": 5 + "min_ms": 140.13, + "median_ms": 140.7, + "max_ms": 142.96, + "runs": 10 }, { "id": "json_add_prefix/100k/basic", "function": "json_add_prefix", "scenario": "basic", "size": "100k", - "min_ms": 138.04, - "median_ms": 138.93, - "max_ms": 143.35, - "runs": 5 + "min_ms": 134.2, + "median_ms": 134.89, + "max_ms": 135.18, + "runs": 10 }, { "id": "json_extract_columns/100k/few_patterns", "function": "json_extract_columns", "scenario": "few_patterns", "size": "100k", - "min_ms": 368.33, - "median_ms": 369.37, - "max_ms": 379.84, - "runs": 5 + "min_ms": 109.2, + "median_ms": 109.49, + "max_ms": 110.7, + "runs": 10 + }, + { + "id": "json_extract_columns/100k/medium_patterns", + "function": "json_extract_columns", + "scenario": "medium_patterns", + "size": "100k", + "min_ms": 189.33, + "median_ms": 190.41, + "max_ms": 191.88, + "runs": 10 }, { "id": "json_extract_columns/100k/many_patterns", "function": "json_extract_columns", "scenario": "many_patterns", "size": "100k", - "min_ms": 863.14, - "median_ms": 866.99, - "max_ms": 868.49, - "runs": 5 + "min_ms": 330.16, + "median_ms": 331.04, + "max_ms": 337.67, + "runs": 10 }, { "id": "json_group_merge/100k/few_groups", "function": 
"json_group_merge", "scenario": "few_groups", "size": "100k", - "min_ms": 2288.68, - "median_ms": 2309.96, - "max_ms": 2330.86, - "runs": 5 + "min_ms": 161.28, + "median_ms": 161.84, + "max_ms": 162.45, + "runs": 10 }, { "id": "json_group_merge/100k/medium_groups", "function": "json_group_merge", "scenario": "medium_groups", "size": "100k", - "min_ms": 1765.69, - "median_ms": 1788.39, - "max_ms": 1895.04, - "runs": 5 + "min_ms": 199.69, + "median_ms": 200.52, + "max_ms": 202.14, + "runs": 10 }, { "id": "json_group_merge/100k/many_groups", "function": "json_group_merge", "scenario": "many_groups", "size": "100k", - "min_ms": 840.14, - "median_ms": 872.51, - "max_ms": 899.73, - "runs": 5 + "min_ms": 271.87, + "median_ms": 275.44, + "max_ms": 294.14, + "runs": 10 }, { "id": "json_group_merge/100k/ignore_nulls", "function": "json_group_merge", "scenario": "ignore_nulls", "size": "100k", - "min_ms": 801.72, - "median_ms": 814.84, - "max_ms": 835.87, - "runs": 5 + "min_ms": 130.63, + "median_ms": 132.31, + "max_ms": 135.88, + "runs": 10 + }, + { + "id": "json_group_merge/100k/delete_nulls", + "function": "json_group_merge", + "scenario": "delete_nulls", + "size": "100k", + "min_ms": 199.0, + "median_ms": 201.16, + "max_ms": 213.45, + "runs": 10 } ] } \ No newline at end of file diff --git a/bench/run_benchmarks.py b/bench/run_benchmarks.py index 79e44c0..be45bfd 100644 --- a/bench/run_benchmarks.py +++ b/bench/run_benchmarks.py @@ -31,7 +31,7 @@ def create_connection() -> duckdb.DuckDBPyConnection: return conn -def build_query(function: str, scenario: str, data_path: Path) -> str: +def build_query(scenario_config: dict, data_path: Path) -> str: """Build benchmark query with result consumption wrapper. All queries are wrapped with sum(length(CAST(... 
AS VARCHAR))) to: @@ -40,43 +40,38 @@ def build_query(function: str, scenario: str, data_path: Path) -> str: """ table = f"read_parquet('{data_path}')" - if function == "json_flatten": - return f"SELECT sum(length(CAST(json_flatten(json_nested, '.') AS VARCHAR))) FROM {table}" - - if function == "json_add_prefix": - return f"SELECT sum(length(CAST(json_add_prefix(json_flat, 'pfx_') AS VARCHAR))) FROM {table}" - - if function == "json_extract_columns": - # Build patterns based on typical flat keys - if scenario == "few_patterns": - patterns = {f"col_{i}": f"s{i}" for i in range(1, 6)} - elif scenario == "many_patterns": - patterns = {f"s{i}": f"s{i}" for i in range(1, 10)} - patterns.update({f"n{i}": f"n{i}" for i in range(1, 5)}) - patterns.update({"nested_s": r"o1\.s\d+"}) - else: # extreme_patterns - patterns = {f"p_{i:04d}": f"pattern_{i:04d}" for i in range(1000)} - - import json as json_mod - patterns_json = json_mod.dumps(patterns).replace("'", "''") - return f"SELECT sum(length(CAST(json_extract_columns(json_flat, '{patterns_json}', '\\n') AS VARCHAR))) FROM {table}" - - if function == "json_group_merge": - if scenario == "few_groups": - group_col = "g1e1" - elif scenario == "medium_groups": - group_col = "g1e3" - else: # ignore_nulls or many_groups - group_col = "g1e4" - - # Wrap aggregate result in sum(length(...)) via subquery - if scenario == "ignore_nulls": - inner = f"SELECT json_group_merge(json_flat, 'IGNORE NULLS') as result FROM {table} GROUP BY {group_col}" - else: - inner = f"SELECT json_group_merge(json_flat) as result FROM {table} GROUP BY {group_col}" - return f"SELECT sum(length(CAST(result AS VARCHAR))) FROM ({inner})" - - raise ValueError(f"Unknown function: {function}") + match scenario_config["function"]: + case "json_flatten": + return f""" + SELECT sum(length(CAST(json_flatten(json_nested, '.') AS VARCHAR))) + FROM {table} + """ + + case "json_add_prefix": + return f""" + SELECT sum(length(CAST(json_add_prefix(json_flat, 'pfx_') AS 
VARCHAR))) + FROM {table} + """ + + case "json_extract_columns": + patterns_json = json.dumps(scenario_config["patterns"]).replace("'", "''") + return f""" + SELECT sum(length(CAST(json_extract_columns(json_flat, '{patterns_json}', '\\n') AS VARCHAR))) + FROM {table} + """ + + case "json_group_merge": + return f""" + SELECT sum(length(CAST(result AS VARCHAR))) + FROM ( + SELECT json_group_merge(json_flat{scenario_config['merge_opts']}) as result + FROM {table} + GROUP BY {scenario_config['group_col']} + ) + """ + + case _: + raise ValueError(f"Unknown function: {scenario_config['function']}") def collect_duckdb_profile(conn: duckdb.DuckDBPyConnection, query: str, profile_path: Path) -> None: @@ -132,7 +127,7 @@ def run_benchmarks( print(f"Running {case_id}...", end=" ", flush=True) - query = build_query(function, scenario, data_path) + query = build_query(scenario_config, data_path) if profile: profile_dir = PROFILES_DIR / case_id.replace("/", "_") diff --git a/src/json_tools_extension.cpp b/src/json_tools_extension.cpp index 2e49e91..bcab88d 100644 --- a/src/json_tools_extension.cpp +++ b/src/json_tools_extension.cpp @@ -14,6 +14,7 @@ #include "duckdb/function/aggregate_function.hpp" #include "duckdb/function/function_set.hpp" #include "duckdb/function/scalar/regexp.hpp" +#include "re2/set.h" #include "../duckdb/extension/json/include/json_common.hpp" #include "yyjson.hpp" #include @@ -30,6 +31,7 @@ #include #include #include +#include #include namespace duckdb { @@ -146,13 +148,415 @@ static unique_ptr JsonAddPrefixInitLocalState(ExpressionStat return make_uniq(BufferAllocator::Get(context)); } +// Hash map based JSON value representation for O(1) key operations +struct JsonValue; +using JsonArray = std::vector; + +enum class JsonValueType : uint8_t { + UNINITIALIZED = 0, + JSON_NULL, + BOOL_VAL, + INT64_VAL, + UINT64_VAL, + DOUBLE_VAL, + STRING_VAL, + ARRAY_VAL, + OBJECT_VAL +}; + +// Forward declare JsonObject (defined after JsonValue) +class JsonObject; + 
+struct JsonValue { + JsonValueType type; + union { + bool bool_val; + int64_t int64_val; + uint64_t uint64_val; + double double_val; + } primitive; + std::string string_val; + unique_ptr array_ptr; + unique_ptr object_ptr; + + JsonValue(); + ~JsonValue(); + + // Copy constructor - deep copy + JsonValue(const JsonValue &other); + + // Move constructor + JsonValue(JsonValue &&other) noexcept; + + // Copy assignment - deep copy + JsonValue &operator=(const JsonValue &other); + + // Move assignment + JsonValue &operator=(JsonValue &&other) noexcept; + + static JsonValue MakeNull(); + static JsonValue MakeBool(bool b); + static JsonValue MakeInt64(int64_t n); + static JsonValue MakeUint64(uint64_t n); + static JsonValue MakeDouble(double d); + static JsonValue MakeString(std::string s); + static JsonValue MakeArray(JsonArray arr); + static JsonValue MakeObject(JsonObject obj); + + bool IsNull() const { + return type == JsonValueType::JSON_NULL; + } + bool IsObject() const { + return type == JsonValueType::OBJECT_VAL; + } + bool IsArray() const { + return type == JsonValueType::ARRAY_VAL; + } + bool IsString() const { + return type == JsonValueType::STRING_VAL; + } + bool IsBool() const { + return type == JsonValueType::BOOL_VAL; + } + bool IsInt64() const { + return type == JsonValueType::INT64_VAL; + } + bool IsUint64() const { + return type == JsonValueType::UINT64_VAL; + } + bool IsDouble() const { + return type == JsonValueType::DOUBLE_VAL; + } + bool IsUninitialized() const { + return type == JsonValueType::UNINITIALIZED; + } + + JsonObject &AsObject(); + const JsonObject &AsObject() const; + JsonArray &AsArray() { + return *array_ptr; + } + const JsonArray &AsArray() const { + return *array_ptr; + } + const std::string &AsString() const { + return string_val; + } + bool AsBool() const { + return primitive.bool_val; + } + int64_t AsInt64() const { + return primitive.int64_val; + } + uint64_t AsUint64() const { + return primitive.uint64_val; + } + double AsDouble() 
const { + return primitive.double_val; + } +}; + +// Insertion-order-preserving JSON object with O(1) key lookup +class JsonObject { +public: + struct Entry { + std::string key; + JsonValue value; + bool deleted; + + Entry(std::string k, JsonValue v) : key(std::move(k)), value(std::move(v)), deleted(false) { + } + }; + + JsonObject() = default; + ~JsonObject() = default; + + JsonObject(const JsonObject &other) : entries_(other.entries_) { + RebuildIndex(); + } + + JsonObject(JsonObject &&other) noexcept : entries_(std::move(other.entries_)), index_(std::move(other.index_)) { + } + + JsonObject &operator=(const JsonObject &other) { + if (this != &other) { + entries_ = other.entries_; + RebuildIndex(); + } + return *this; + } + + JsonObject &operator=(JsonObject &&other) noexcept { + if (this != &other) { + entries_ = std::move(other.entries_); + index_ = std::move(other.index_); + } + return *this; + } + + // O(1) amortized lookup + JsonValue *Find(const std::string &key) { + auto it = index_.find(key); + if (it == index_.end()) { + return nullptr; + } + return &entries_[it->second].value; + } + + const JsonValue *Find(const std::string &key) const { + auto it = index_.find(key); + if (it == index_.end()) { + return nullptr; + } + return &entries_[it->second].value; + } + + // O(1) amortized insert or update + JsonValue &operator[](const std::string &key) { + auto it = index_.find(key); + if (it != index_.end()) { + return entries_[it->second].value; + } + idx_t new_idx = entries_.size(); + entries_.emplace_back(key, JsonValue()); + index_[key] = new_idx; + return entries_.back().value; + } + + // O(1) erase (marks as deleted, doesn't shift) + void Erase(const std::string &key) { + auto it = index_.find(key); + if (it != index_.end()) { + idx_t idx = it->second; + entries_[idx].deleted = true; + index_.erase(it); + } + } + + void Clear() { + entries_.clear(); + index_.clear(); + } + + bool Empty() const { + return index_.empty(); + } + idx_t Size() const { + return 
index_.size(); + } + + // Iterate over non-deleted entries (preserves insertion order) + class Iterator { + public: + Iterator(std::vector *entries, idx_t pos) : entries_(entries), pos_(pos) { + SkipDeleted(); + } + + bool operator!=(const Iterator &other) const { + return pos_ != other.pos_; + } + + Iterator &operator++() { + ++pos_; + SkipDeleted(); + return *this; + } + + std::pair operator*() const { + return {(*entries_)[pos_].key, (*entries_)[pos_].value}; + } + + private: + void SkipDeleted() { + while (pos_ < entries_->size() && (*entries_)[pos_].deleted) { + ++pos_; + } + } + + std::vector *entries_; + idx_t pos_; + }; + + class ConstIterator { + public: + ConstIterator(const std::vector *entries, idx_t pos) : entries_(entries), pos_(pos) { + SkipDeleted(); + } + + bool operator!=(const ConstIterator &other) const { + return pos_ != other.pos_; + } + + ConstIterator &operator++() { + ++pos_; + SkipDeleted(); + return *this; + } + + std::pair operator*() const { + return {(*entries_)[pos_].key, (*entries_)[pos_].value}; + } + + private: + void SkipDeleted() { + while (pos_ < entries_->size() && (*entries_)[pos_].deleted) { + ++pos_; + } + } + + const std::vector *entries_; + idx_t pos_; + }; + + Iterator begin() { + return Iterator(&entries_, 0); + } + Iterator end() { + return Iterator(&entries_, entries_.size()); + } + ConstIterator begin() const { + return ConstIterator(&entries_, 0); + } + ConstIterator end() const { + return ConstIterator(&entries_, entries_.size()); + } + +private: + void RebuildIndex() { + index_.clear(); + for (idx_t i = 0; i < entries_.size(); ++i) { + if (!entries_[i].deleted) { + index_[entries_[i].key] = i; + } + } + } + + std::vector entries_; + std::unordered_map index_; +}; + +// JsonValue method implementations (after JsonObject is defined) +inline JsonValue::JsonValue() : type(JsonValueType::UNINITIALIZED) { + primitive.int64_val = 0; +} +inline JsonValue::~JsonValue() = default; + +inline JsonValue::JsonValue(const 
JsonValue &other) + : type(other.type), primitive(other.primitive), string_val(other.string_val) { + if (other.array_ptr) { + array_ptr = make_uniq(*other.array_ptr); + } + if (other.object_ptr) { + object_ptr = make_uniq(*other.object_ptr); + } +} + +inline JsonValue::JsonValue(JsonValue &&other) noexcept + : type(other.type), primitive(other.primitive), string_val(std::move(other.string_val)), + array_ptr(std::move(other.array_ptr)), object_ptr(std::move(other.object_ptr)) { + other.type = JsonValueType::UNINITIALIZED; +} + +inline JsonValue &JsonValue::operator=(const JsonValue &other) { + if (this != &other) { + type = other.type; + primitive = other.primitive; + string_val = other.string_val; + if (other.array_ptr) { + array_ptr = make_uniq(*other.array_ptr); + } else { + array_ptr.reset(); + } + if (other.object_ptr) { + object_ptr = make_uniq(*other.object_ptr); + } else { + object_ptr.reset(); + } + } + return *this; +} + +inline JsonValue &JsonValue::operator=(JsonValue &&other) noexcept { + if (this != &other) { + type = other.type; + primitive = other.primitive; + string_val = std::move(other.string_val); + array_ptr = std::move(other.array_ptr); + object_ptr = std::move(other.object_ptr); + other.type = JsonValueType::UNINITIALIZED; + } + return *this; +} + +inline JsonValue JsonValue::MakeNull() { + JsonValue v; + v.type = JsonValueType::JSON_NULL; + return v; +} + +inline JsonValue JsonValue::MakeBool(bool b) { + JsonValue v; + v.type = JsonValueType::BOOL_VAL; + v.primitive.bool_val = b; + return v; +} + +inline JsonValue JsonValue::MakeInt64(int64_t n) { + JsonValue v; + v.type = JsonValueType::INT64_VAL; + v.primitive.int64_val = n; + return v; +} + +inline JsonValue JsonValue::MakeUint64(uint64_t n) { + JsonValue v; + v.type = JsonValueType::UINT64_VAL; + v.primitive.uint64_val = n; + return v; +} + +inline JsonValue JsonValue::MakeDouble(double d) { + JsonValue v; + v.type = JsonValueType::DOUBLE_VAL; + v.primitive.double_val = d; + return v; +} 
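The `JsonObject` introduced earlier in this diff pairs an entry vector (preserving insertion order) with an `unordered_map` index for O(1) average lookup, and `Erase` marks a tombstone instead of shifting entries. A minimal Python sketch of the same design, with illustrative names and simplified value types:

```python
class OrderedTombstoneMap:
    """Insertion-order map with O(1) average find/insert/erase via tombstones."""

    def __init__(self):
        self._entries = []  # list of [key, value, deleted] records
        self._index = {}    # key -> position in _entries (live entries only)

    def find(self, key):
        pos = self._index.get(key)
        return None if pos is None else self._entries[pos][1]

    def __setitem__(self, key, value):
        pos = self._index.get(key)
        if pos is not None:
            self._entries[pos][1] = value  # update in place, order unchanged
        else:
            self._index[key] = len(self._entries)
            self._entries.append([key, value, False])

    def erase(self, key):
        pos = self._index.pop(key, None)
        if pos is not None:
            self._entries[pos][2] = True  # tombstone; no O(n) shifting

    def __iter__(self):
        # Insertion order, skipping tombstoned entries
        return ((k, v) for k, v, deleted in self._entries if not deleted)
```

Note one consequence shared with the C++ `operator[]`: re-inserting an erased key appends a fresh entry, so the key moves to the end of iteration order. Tombstones also mean deleted entries occupy memory until the object is copied (the C++ copy constructor's `RebuildIndex` drops them from the index but keeps the entry storage).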
+ +inline JsonValue JsonValue::MakeString(std::string s) { + JsonValue v; + v.type = JsonValueType::STRING_VAL; + v.string_val = std::move(s); + return v; +} + +inline JsonValue JsonValue::MakeArray(JsonArray arr) { + JsonValue v; + v.type = JsonValueType::ARRAY_VAL; + v.array_ptr = make_uniq(std::move(arr)); + return v; +} + +inline JsonValue JsonValue::MakeObject(JsonObject obj) { + JsonValue v; + v.type = JsonValueType::OBJECT_VAL; + v.object_ptr = make_uniq(std::move(obj)); + return v; +} + +inline JsonObject &JsonValue::AsObject() { + return *object_ptr; +} +inline const JsonObject &JsonValue::AsObject() const { + return *object_ptr; +} + struct JsonGroupMergeState { - yyjson_mut_doc *result_doc; - yyjson_mut_doc *patch_doc; + JsonObject *result_map; + JsonObject *patch_map; + JsonValue *scalar_replacement; // Set when a non-object patch replaces the entire result bool result_has_input; bool patch_has_input; - idx_t result_replacements_since_compact; - idx_t patch_replacements_since_compact; + bool patch_has_nulls; }; enum class JsonNullTreatment : uint8_t { DELETE_NULLS = 0, IGNORE_NULLS = 1 }; @@ -183,390 +587,319 @@ static JsonNullTreatment GetNullTreatment(optional_ptr bind_data) static unique_ptr JsonGroupMergeBind(ClientContext &context, AggregateFunction &function, vector> &arguments); +// Maximum nesting depth to prevent stack exhaustion from pathological inputs +constexpr idx_t MAX_JSON_NESTING_DEPTH = 1000; + static void JsonGroupMergeStateInit(JsonGroupMergeState &state) { - state.result_doc = yyjson_mut_doc_new(nullptr); - if (!state.result_doc) { - throw InternalException("json_group_merge: failed to allocate aggregate state"); - } - state.patch_doc = yyjson_mut_doc_new(nullptr); - if (!state.patch_doc) { - yyjson_mut_doc_free(state.result_doc); - state.result_doc = nullptr; - throw InternalException("json_group_merge: failed to allocate aggregate state"); - } - auto result_root = yyjson_mut_obj(state.result_doc); - if (!result_root) { - 
yyjson_mut_doc_free(state.patch_doc); - yyjson_mut_doc_free(state.result_doc); - state.patch_doc = nullptr; - state.result_doc = nullptr; - throw InternalException("json_group_merge: failed to allocate initial JSON object"); - } - auto patch_root = yyjson_mut_obj(state.patch_doc); - if (!patch_root) { - yyjson_mut_doc_free(state.patch_doc); - yyjson_mut_doc_free(state.result_doc); - state.patch_doc = nullptr; - state.result_doc = nullptr; - throw InternalException("json_group_merge: failed to allocate initial JSON object"); - } - yyjson_mut_doc_set_root(state.result_doc, result_root); - yyjson_mut_doc_set_root(state.patch_doc, patch_root); + state.result_map = new JsonObject(); + state.patch_map = nullptr; + state.scalar_replacement = nullptr; state.result_has_input = false; state.patch_has_input = false; - state.result_replacements_since_compact = 0; - state.patch_replacements_since_compact = 0; + state.patch_has_nulls = false; } static void JsonGroupMergeStateDestroy(JsonGroupMergeState &state) { - if (state.result_doc) { - yyjson_mut_doc_free(state.result_doc); - state.result_doc = nullptr; + if (state.result_map) { + delete state.result_map; + state.result_map = nullptr; + } + if (state.patch_map) { + delete state.patch_map; + state.patch_map = nullptr; } - if (state.patch_doc) { - yyjson_mut_doc_free(state.patch_doc); - state.patch_doc = nullptr; + if (state.scalar_replacement) { + delete state.scalar_replacement; + state.scalar_replacement = nullptr; } state.result_has_input = false; state.patch_has_input = false; - state.result_replacements_since_compact = 0; - state.patch_replacements_since_compact = 0; + state.patch_has_nulls = false; } -constexpr idx_t JSON_GROUP_MERGE_COMPACT_THRESHOLD = 1024; -// Maximum nesting depth to prevent stack exhaustion from pathological inputs -constexpr idx_t MAX_JSON_NESTING_DEPTH = 1000; - -static yyjson_mut_val *JsonGroupMergeApplyPatchInternal(yyjson_mut_doc *doc, yyjson_mut_val *base, yyjson_val *patch, - idx_t depth, 
idx_t &replacements_since_compact, - JsonNullTreatment null_treatment); +// Convert read-only yyjson_val to JsonValue +static JsonValue ParseYyjsonValue(yyjson_val *val, idx_t depth) { + if (!val) { + return JsonValue(); + } + if (depth > MAX_JSON_NESTING_DEPTH) { + throw InvalidInputException("json_group_merge: nesting depth exceeds maximum limit of " + + std::to_string(MAX_JSON_NESTING_DEPTH)); + } -static void JsonGroupMergeCompactDoc(yyjson_mut_doc *&doc) { - if (!doc || !doc->root) { - return; + auto tag = duckdb_yyjson::yyjson_get_tag(val); + switch (tag) { + case YYJSON_TYPE_NULL | YYJSON_SUBTYPE_NONE: + return JsonValue::MakeNull(); + case YYJSON_TYPE_BOOL | YYJSON_SUBTYPE_TRUE: + return JsonValue::MakeBool(true); + case YYJSON_TYPE_BOOL | YYJSON_SUBTYPE_FALSE: + return JsonValue::MakeBool(false); + case YYJSON_TYPE_NUM | YYJSON_SUBTYPE_UINT: + return JsonValue::MakeUint64(duckdb_yyjson::yyjson_get_uint(val)); + case YYJSON_TYPE_NUM | YYJSON_SUBTYPE_SINT: + return JsonValue::MakeInt64(duckdb_yyjson::yyjson_get_sint(val)); + case YYJSON_TYPE_NUM | YYJSON_SUBTYPE_REAL: + return JsonValue::MakeDouble(duckdb_yyjson::yyjson_get_real(val)); + case YYJSON_TYPE_STR | YYJSON_SUBTYPE_NONE: + case YYJSON_TYPE_STR | YYJSON_SUBTYPE_NOESC: { + auto str = duckdb_yyjson::yyjson_get_str(val); + auto len = duckdb_yyjson::yyjson_get_len(val); + return JsonValue::MakeString(std::string(str, len)); + } + case YYJSON_TYPE_ARR | YYJSON_SUBTYPE_NONE: { + JsonArray arr; + yyjson_val *elem; + duckdb_yyjson::yyjson_arr_iter iter = duckdb_yyjson::yyjson_arr_iter_with(val); + while ((elem = duckdb_yyjson::yyjson_arr_iter_next(&iter))) { + arr.push_back(ParseYyjsonValue(elem, depth + 1)); + } + return JsonValue::MakeArray(std::move(arr)); } - auto new_doc = yyjson_mut_doc_new(nullptr); - if (!new_doc) { - throw InternalException("json_group_merge: failed to compact aggregate state"); + case YYJSON_TYPE_OBJ | YYJSON_SUBTYPE_NONE: { + JsonObject obj; + yyjson_val *key; + 
duckdb_yyjson::yyjson_obj_iter iter = duckdb_yyjson::yyjson_obj_iter_with(val); + while ((key = duckdb_yyjson::yyjson_obj_iter_next(&iter))) { + auto key_str = duckdb_yyjson::yyjson_get_str(key); + auto key_len = duckdb_yyjson::yyjson_get_len(key); + auto child_val = duckdb_yyjson::yyjson_obj_iter_get_val(key); + obj[std::string(key_str, key_len)] = ParseYyjsonValue(child_val, depth + 1); + } + return JsonValue::MakeObject(std::move(obj)); } - auto root_copy = yyjson_mut_val_mut_copy(new_doc, doc->root); - if (!root_copy) { - yyjson_mut_doc_free(new_doc); - throw InternalException("json_group_merge: failed to copy aggregate state during compaction"); + default: + throw InternalException("json_group_merge: unknown yyjson type tag"); } - yyjson_mut_doc_set_root(new_doc, root_copy); - yyjson_mut_doc_free(doc); - doc = new_doc; } -static void JsonGroupMergeMaybeCompact(JsonGroupMergeState &state) { - if (state.result_replacements_since_compact >= JSON_GROUP_MERGE_COMPACT_THRESHOLD) { - JsonGroupMergeCompactDoc(state.result_doc); - state.result_replacements_since_compact = 0; +// Forward declaration for recursive call +static yyjson_mut_val *BuildYyjsonValue(yyjson_mut_doc *doc, const JsonValue &val); + +static yyjson_mut_val *BuildYyjsonValue(yyjson_mut_doc *doc, const JsonValue &val) { + if (val.IsUninitialized()) { + return nullptr; } - if (state.patch_replacements_since_compact >= JSON_GROUP_MERGE_COMPACT_THRESHOLD) { - JsonGroupMergeCompactDoc(state.patch_doc); - state.patch_replacements_since_compact = 0; + if (val.IsNull()) { + return duckdb_yyjson::yyjson_mut_null(doc); } -} - -static void JsonGroupMergeApplyResultPatch(JsonGroupMergeState &state, yyjson_val *patch_root, - JsonNullTreatment null_treatment) { - if (!patch_root) { - throw InvalidInputException("json_group_merge: invalid JSON payload"); + if (val.IsBool()) { + return duckdb_yyjson::yyjson_mut_bool(doc, val.AsBool()); } - auto base_root = state.result_has_input ? 
state.result_doc->root : nullptr; - auto merged_root = JsonGroupMergeApplyPatchInternal(state.result_doc, base_root, patch_root, 0, - state.result_replacements_since_compact, null_treatment); - if (!merged_root) { - throw InternalException("json_group_merge: failed to merge JSON documents"); + if (val.IsInt64()) { + return duckdb_yyjson::yyjson_mut_sint(doc, val.AsInt64()); } - if (!state.result_has_input || merged_root != state.result_doc->root) { - yyjson_mut_doc_set_root(state.result_doc, merged_root); + if (val.IsUint64()) { + return duckdb_yyjson::yyjson_mut_uint(doc, val.AsUint64()); } - state.result_has_input = true; - JsonGroupMergeMaybeCompact(state); -} - -static yyjson_mut_val *JsonGroupMergeComposePatchInternal(yyjson_mut_doc *doc, yyjson_mut_val *base, yyjson_val *patch, - idx_t depth, idx_t &replacements_since_compact, - JsonNullTreatment null_treatment); - -static void JsonGroupMergeComposePatch(JsonGroupMergeState &state, yyjson_val *patch_root, - JsonNullTreatment null_treatment) { - if (!patch_root) { - throw InvalidInputException("json_group_merge: invalid JSON payload"); + if (val.IsDouble()) { + return duckdb_yyjson::yyjson_mut_real(doc, val.AsDouble()); } - auto base_root = state.patch_has_input ? 
state.patch_doc->root : nullptr; - auto merged_root = JsonGroupMergeComposePatchInternal(state.patch_doc, base_root, patch_root, 0, - state.patch_replacements_since_compact, null_treatment); - if (!merged_root) { - throw InternalException("json_group_merge: failed to merge JSON documents"); + if (val.IsString()) { + const auto &s = val.AsString(); + return duckdb_yyjson::yyjson_mut_strncpy(doc, s.c_str(), s.size()); } - if (!state.patch_has_input || merged_root != state.patch_doc->root) { - yyjson_mut_doc_set_root(state.patch_doc, merged_root); + if (val.IsArray()) { + auto arr = duckdb_yyjson::yyjson_mut_arr(doc); + if (!arr) { + throw InternalException("json_group_merge: failed to allocate JSON array"); + } + for (const auto &elem : val.AsArray()) { + auto elem_val = BuildYyjsonValue(doc, elem); + if (elem_val && !duckdb_yyjson::yyjson_mut_arr_append(arr, elem_val)) { + throw InternalException("json_group_merge: failed to append array element"); + } + } + return arr; + } + if (val.IsObject()) { + auto obj = duckdb_yyjson::yyjson_mut_obj(doc); + if (!obj) { + throw InternalException("json_group_merge: failed to allocate JSON object"); + } + for (const auto &kv : val.AsObject()) { + auto key = duckdb_yyjson::yyjson_mut_strncpy(doc, kv.first.c_str(), kv.first.size()); + auto child = BuildYyjsonValue(doc, kv.second); + if (key && child && !duckdb_yyjson::yyjson_mut_obj_add(obj, key, child)) { + throw InternalException("json_group_merge: failed to add object member"); + } + } + return obj; } - state.patch_has_input = true; - JsonGroupMergeMaybeCompact(state); + throw InternalException("json_group_merge: unexpected JsonValue variant"); } -static yyjson_mut_val *JsonGroupMergeApplyPatchInternal(yyjson_mut_doc *doc, yyjson_mut_val *base, yyjson_val *patch, - idx_t depth, idx_t &replacements_since_compact, - JsonNullTreatment null_treatment) { +// Apply patch to target map (result_map) - implements JSON merge patch semantics +static void ApplyPatchToMap(JsonObject 
&target, yyjson_val *patch, idx_t depth, JsonNullTreatment null_treatment, + bool *saw_nulls) { if (!patch) { - return base; + return; } if (depth > MAX_JSON_NESTING_DEPTH) { throw InvalidInputException("json_group_merge: nesting depth exceeds maximum limit of " + std::to_string(MAX_JSON_NESTING_DEPTH)); } - if (!duckdb_yyjson::yyjson_is_obj(patch)) { - auto copy = yyjson_val_mut_copy(doc, patch); - if (!copy) { - throw InternalException("json_group_merge: failed to materialize JSON value"); - } - if (base) { - replacements_since_compact++; - } - return copy; - } - - yyjson_mut_val *result = nullptr; - bool base_is_object = base && duckdb_yyjson::yyjson_mut_is_obj(base); - if (base_is_object) { - result = base; - } - - auto EnsureResult = [&]() -> yyjson_mut_val * { - if (result) { - return result; - } - result = yyjson_mut_obj(doc); - if (!result) { - throw InternalException("json_group_merge: failed to allocate JSON object"); - } - if (base && !base_is_object) { - replacements_since_compact++; - } - return result; - }; - - bool applied_any = false; - yyjson_val *patch_key = nullptr; - yyjson_obj_iter patch_iter = yyjson_obj_iter_with(patch); - while ((patch_key = yyjson_obj_iter_next(&patch_iter))) { + yyjson_val *patch_key = nullptr; + duckdb_yyjson::yyjson_obj_iter patch_iter = duckdb_yyjson::yyjson_obj_iter_with(patch); + while ((patch_key = duckdb_yyjson::yyjson_obj_iter_next(&patch_iter))) { auto key_str = duckdb_yyjson::yyjson_get_str(patch_key); auto key_len = duckdb_yyjson::yyjson_get_len(patch_key); - auto patch_val = yyjson_obj_iter_get_val(patch_key); + auto patch_val = duckdb_yyjson::yyjson_obj_iter_get_val(patch_key); if (!key_str) { throw InvalidInputException("json_group_merge: encountered non-string object key"); } + std::string key(key_str, key_len); + if (duckdb_yyjson::yyjson_is_null(patch_val)) { + if (saw_nulls) { + *saw_nulls = true; + } if (null_treatment == JsonNullTreatment::DELETE_NULLS) { - if (result) { - auto removed = duckdb_yyjson::yyjson_mut_obj_remove_keyn(result, 
key_str, key_len); - if (removed) { - replacements_since_compact++; - applied_any = true; - } - } + target.Erase(key); // O(1) average } continue; } - auto existing_child = result ? duckdb_yyjson::yyjson_mut_obj_getn(result, key_str, key_len) : nullptr; if (duckdb_yyjson::yyjson_is_obj(patch_val)) { - auto merged_child = JsonGroupMergeApplyPatchInternal(doc, existing_child, patch_val, depth + 1, - replacements_since_compact, null_treatment); - // Skip if merged_child is null (can happen with IGNORE_NULLS when patch contains only nulls - // and there's no existing value) - if (!merged_child) { - continue; - } - if (!existing_child || merged_child != existing_child) { - auto target_obj = EnsureResult(); - if (existing_child) { - replacements_since_compact++; - duckdb_yyjson::yyjson_mut_obj_remove_keyn(target_obj, key_str, key_len); - } - auto key_copy = yyjson_mut_strncpy(doc, key_str, key_len); - if (!key_copy) { - throw InternalException("json_group_merge: failed to allocate key storage"); - } - if (!duckdb_yyjson::yyjson_mut_obj_add(target_obj, key_copy, merged_child)) { - throw InternalException("json_group_merge: failed to append merged object value"); + // Recursive merge for nested objects + auto existing = target.Find(key); // O(1) average + if (existing && existing->IsObject()) { + // Existing value is object - merge into it + ApplyPatchToMap(existing->AsObject(), patch_val, depth + 1, null_treatment, saw_nulls); + } else { + // Create new object and recursively apply patch (to filter nulls in IGNORE_NULLS mode) + JsonObject new_obj; + ApplyPatchToMap(new_obj, patch_val, depth + 1, null_treatment, saw_nulls); + // Only store if non-empty (all-null patches should not create empty objects) + if (!new_obj.Empty()) { + target[key] = JsonValue::MakeObject(std::move(new_obj)); } - applied_any = true; } continue; } - auto new_child = yyjson_val_mut_copy(doc, patch_val); - if (!new_child) { - throw InternalException("json_group_merge: failed to copy JSON value"); - 
} - if (existing_child) { - replacements_since_compact++; - duckdb_yyjson::yyjson_mut_obj_remove_keyn(result, key_str, key_len); - } - auto target_obj = EnsureResult(); - auto key_copy = yyjson_mut_strncpy(doc, key_str, key_len); - if (!key_copy) { - throw InternalException("json_group_merge: failed to allocate key storage"); - } - if (!duckdb_yyjson::yyjson_mut_obj_add(target_obj, key_copy, new_child)) { - throw InternalException("json_group_merge: failed to append merged value"); - } - applied_any = true; + // For all other types, replace directly + target[key] = ParseYyjsonValue(patch_val, depth + 1); // O(1) average } - - // If nothing was applied, return the base unchanged (unless we're at top level with nothing) - if (!applied_any) { - // Special case: at top level with no base and empty patch, return empty object - if (depth == 0 && !base) { - return EnsureResult(); - } - return base; - } - - // Something was applied, ensure we have a result object to return - return result ? result : EnsureResult(); } -static yyjson_mut_val *JsonGroupMergeComposePatchInternal(yyjson_mut_doc *doc, yyjson_mut_val *base, yyjson_val *patch, - idx_t depth, idx_t &replacements_since_compact, - JsonNullTreatment null_treatment) { +// Compose patch into patch_map (for DELETE_NULLS mode) - preserves null markers +static void ComposePatchToMap(JsonObject &target, yyjson_val *patch, idx_t depth, JsonNullTreatment null_treatment) { if (!patch) { - return base; + return; } if (depth > MAX_JSON_NESTING_DEPTH) { throw InvalidInputException("json_group_merge: nesting depth exceeds maximum limit of " + std::to_string(MAX_JSON_NESTING_DEPTH)); } - if (!duckdb_yyjson::yyjson_is_obj(patch)) { - auto copy = yyjson_val_mut_copy(doc, patch); - if (!copy) { - throw InternalException("json_group_merge: failed to materialize JSON value"); - } - if (base) { - replacements_since_compact++; - } - return copy; - } - - yyjson_mut_val *result = nullptr; - bool base_is_object = base && 
duckdb_yyjson::yyjson_mut_is_obj(base); - if (base_is_object) { - result = base; - } - - auto EnsureResult = [&]() -> yyjson_mut_val * { - if (result) { - return result; - } - result = yyjson_mut_obj(doc); - if (!result) { - throw InternalException("json_group_merge: failed to allocate JSON object"); - } - if (base && !base_is_object) { - replacements_since_compact++; - } - return result; - }; - - bool applied_any = false; yyjson_val *patch_key = nullptr; - yyjson_obj_iter patch_iter = yyjson_obj_iter_with(patch); - while ((patch_key = yyjson_obj_iter_next(&patch_iter))) { + duckdb_yyjson::yyjson_obj_iter patch_iter = duckdb_yyjson::yyjson_obj_iter_with(patch); + while ((patch_key = duckdb_yyjson::yyjson_obj_iter_next(&patch_iter))) { auto key_str = duckdb_yyjson::yyjson_get_str(patch_key); auto key_len = duckdb_yyjson::yyjson_get_len(patch_key); - auto patch_val = yyjson_obj_iter_get_val(patch_key); + auto patch_val = duckdb_yyjson::yyjson_obj_iter_get_val(patch_key); if (!key_str) { throw InvalidInputException("json_group_merge: encountered non-string object key"); } + std::string key(key_str, key_len); + if (duckdb_yyjson::yyjson_is_null(patch_val)) { if (null_treatment == JsonNullTreatment::IGNORE_NULLS) { continue; } - auto target_obj = EnsureResult(); - auto removed = duckdb_yyjson::yyjson_mut_obj_remove_keyn(target_obj, key_str, key_len); - if (removed) { - replacements_since_compact++; + // DELETE_NULLS: store the null marker for replay during Combine + target[key] = JsonValue::MakeNull(); + continue; + } + + if (duckdb_yyjson::yyjson_is_obj(patch_val)) { + auto existing = target.Find(key); + if (existing && existing->IsObject()) { + ComposePatchToMap(existing->AsObject(), patch_val, depth + 1, null_treatment); + } else { + target[key] = ParseYyjsonValue(patch_val, depth + 1); } - auto key_copy = yyjson_mut_strncpy(doc, key_str, key_len); - if (!key_copy) { - throw InternalException("json_group_merge: failed to allocate key storage"); + continue; + } + + 
target[key] = ParseYyjsonValue(patch_val, depth + 1); + } +} + +// Apply JsonObject patch to target map (for Combine) +static void ApplyMapToMap(JsonObject &target, const JsonObject &source, idx_t depth, JsonNullTreatment null_treatment, + bool *saw_nulls) { + if (depth > MAX_JSON_NESTING_DEPTH) { + throw InvalidInputException("json_group_merge: nesting depth exceeds maximum limit of " + + std::to_string(MAX_JSON_NESTING_DEPTH)); + } + + for (const auto &kv : source) { + const auto &key = kv.first; + const auto &val = kv.second; + + if (val.IsNull()) { + if (saw_nulls) { + *saw_nulls = true; } - auto null_value = yyjson_mut_null(doc); - if (!null_value) { - throw InternalException("json_group_merge: failed to allocate JSON null value"); + if (null_treatment == JsonNullTreatment::DELETE_NULLS) { + target.Erase(key); } - if (!duckdb_yyjson::yyjson_mut_obj_add(target_obj, key_copy, null_value)) { - throw InternalException("json_group_merge: failed to append merged value"); + continue; + } + + if (val.IsObject()) { + auto existing = target.Find(key); + if (existing && existing->IsObject()) { + ApplyMapToMap(existing->AsObject(), val.AsObject(), depth + 1, null_treatment, saw_nulls); + } else { + target[key] = val; } - applied_any = true; continue; } - auto existing_child = result ? 
duckdb_yyjson::yyjson_mut_obj_getn(result, key_str, key_len) : nullptr; - if (duckdb_yyjson::yyjson_is_obj(patch_val)) { - auto merged_child = JsonGroupMergeComposePatchInternal(doc, existing_child, patch_val, depth + 1, - replacements_since_compact, null_treatment); - if (!merged_child) { + target[key] = val; + } +} + +// Compose map into patch_map (for Combine in DELETE_NULLS mode) +static void ComposeMapToMap(JsonObject &target, const JsonObject &source, idx_t depth, + JsonNullTreatment null_treatment) { + if (depth > MAX_JSON_NESTING_DEPTH) { + throw InvalidInputException("json_group_merge: nesting depth exceeds maximum limit of " + + std::to_string(MAX_JSON_NESTING_DEPTH)); + } + + for (const auto &kv : source) { + const auto &key = kv.first; + const auto &val = kv.second; + + if (val.IsNull()) { + if (null_treatment == JsonNullTreatment::IGNORE_NULLS) { continue; } - if (!existing_child || merged_child != existing_child) { - auto target_obj = EnsureResult(); - if (existing_child) { - replacements_since_compact++; - duckdb_yyjson::yyjson_mut_obj_remove_keyn(target_obj, key_str, key_len); - } - auto key_copy = yyjson_mut_strncpy(doc, key_str, key_len); - if (!key_copy) { - throw InternalException("json_group_merge: failed to allocate key storage"); - } - if (!duckdb_yyjson::yyjson_mut_obj_add(target_obj, key_copy, merged_child)) { - throw InternalException("json_group_merge: failed to append merged object value"); - } - applied_any = true; - } + target[key] = JsonValue::MakeNull(); continue; } - auto new_child = yyjson_val_mut_copy(doc, patch_val); - if (!new_child) { - throw InternalException("json_group_merge: failed to copy JSON value"); - } - auto target_obj = EnsureResult(); - if (existing_child) { - replacements_since_compact++; - duckdb_yyjson::yyjson_mut_obj_remove_keyn(target_obj, key_str, key_len); - } - auto key_copy = yyjson_mut_strncpy(doc, key_str, key_len); - if (!key_copy) { - throw InternalException("json_group_merge: failed to allocate key 
storage"); - } - if (!duckdb_yyjson::yyjson_mut_obj_add(target_obj, key_copy, new_child)) { - throw InternalException("json_group_merge: failed to append merged value"); + if (val.IsObject()) { + auto existing = target.Find(key); + if (existing && existing->IsObject()) { + ComposeMapToMap(existing->AsObject(), val.AsObject(), depth + 1, null_treatment); + } else { + target[key] = val; + } + continue; } - applied_any = true; - } - if (!applied_any) { - if (depth == 0 && !base) { - return EnsureResult(); - } - return base; + target[key] = val; } - - return result ? result : EnsureResult(); } class JsonGroupMergeFunction { @@ -580,27 +913,70 @@ class JsonGroupMergeFunction { JsonGroupMergeStateDestroy(state); } - static inline string JsonParseError(const string_t &input, yyjson_read_err &err) { + static inline string JsonParseError(const string_t &input, duckdb_yyjson::yyjson_read_err &err) { return JSONCommon::FormatParseError(input.GetDataUnsafe(), input.GetSize(), err); } template <class INPUT_TYPE, class STATE, class OP> static void Operation(STATE &state, const INPUT_TYPE &input, AggregateUnaryInput &unary_input) { static_assert(std::is_same<INPUT_TYPE, string_t>::value, "json_group_merge expects string_t input"); - yyjson_read_err err; - auto doc = yyjson_read_opts(const_cast<char *>(input.GetDataUnsafe()), input.GetSize(), JSONCommon::READ_FLAG, - nullptr, &err); + duckdb_yyjson::yyjson_read_err err; + auto doc = duckdb_yyjson::yyjson_read_opts(const_cast<char *>(input.GetDataUnsafe()), input.GetSize(), + JSONCommon::READ_FLAG, nullptr, &err); if (!doc) { throw InvalidInputException("json_group_merge: %s", JsonParseError(input, err)); } yyjson_doc_ptr patch_doc(doc); - auto patch_root = yyjson_doc_get_root(patch_doc.get()); + auto patch_root = duckdb_yyjson::yyjson_doc_get_root(patch_doc.get()); if (!patch_root) { throw InvalidInputException("json_group_merge: invalid JSON payload"); } + auto null_treatment = GetNullTreatment(unary_input.input.bind_data); - JsonGroupMergeApplyResultPatch(state, patch_root, null_treatment); -
JsonGroupMergeComposePatch(state, patch_root, null_treatment); + + // Non-object patch replaces the entire result (JSON merge patch semantics) + if (!duckdb_yyjson::yyjson_is_obj(patch_root)) { + if (state.scalar_replacement) { + delete state.scalar_replacement; + } + state.scalar_replacement = new JsonValue(ParseYyjsonValue(patch_root, 0)); + // Clear the hash map since it's replaced + state.result_map->Clear(); + state.result_has_input = true; + // Clear patch tracking as well + if (state.patch_map) { + delete state.patch_map; + state.patch_map = nullptr; + } + state.patch_has_input = false; + state.patch_has_nulls = false; + return; + } + + // Object patch: if there was a scalar replacement, reset to object mode + if (state.scalar_replacement) { + delete state.scalar_replacement; + state.scalar_replacement = nullptr; + } + + bool saw_nulls = false; + + // Apply patch to result_map using O(1) hash map operations + ApplyPatchToMap(*state.result_map, patch_root, 0, null_treatment, &saw_nulls); + state.result_has_input = true; + + // For DELETE_NULLS mode, also compose into patch_map for Combine replay + if (null_treatment == JsonNullTreatment::DELETE_NULLS) { + if (state.patch_has_nulls || saw_nulls) { + if (!state.patch_has_nulls) { + // First time seeing nulls: copy result_map to patch_map + state.patch_map = new JsonObject(*state.result_map); + state.patch_has_nulls = true; + } + ComposePatchToMap(*state.patch_map, patch_root, 0, null_treatment); + state.patch_has_input = true; + } + } } template @@ -613,31 +989,113 @@ class JsonGroupMergeFunction { template <class STATE, class OP> static void Combine(const STATE &source, STATE &target, AggregateInputData &aggr_input_data) { - if (!source.patch_has_input || !source.patch_doc || !source.patch_doc->root) { + if (!source.result_has_input) { return; } - auto source_patch_doc_ptr = yyjson_doc_ptr(yyjson_mut_val_imut_copy(source.patch_doc->root, nullptr)); - if (!source_patch_doc_ptr) { - throw InternalException("json_group_merge: failed to
materialize patch state"); + + auto null_treatment = GetNullTreatment(aggr_input_data.bind_data); + + // If source has a scalar replacement, it replaces everything + if (source.scalar_replacement) { + if (target.scalar_replacement) { + delete target.scalar_replacement; + } + target.scalar_replacement = new JsonValue(*source.scalar_replacement); + target.result_map->Clear(); + target.result_has_input = true; + if (target.patch_map) { + delete target.patch_map; + target.patch_map = nullptr; + } + target.patch_has_input = false; + target.patch_has_nulls = false; + return; } - auto patch_root = yyjson_doc_get_root(source_patch_doc_ptr.get()); - if (!patch_root) { - throw InternalException("json_group_merge: failed to materialize patch state"); + + // Source is an object: if target has scalar replacement, clear it first + if (target.scalar_replacement) { + delete target.scalar_replacement; + target.scalar_replacement = nullptr; + } + + // For IGNORE_NULLS: use result_map directly + // For DELETE_NULLS: use patch_map which preserves explicit null markers for replay + const JsonObject *source_map = nullptr; + bool source_has_input = false; + + if (null_treatment == JsonNullTreatment::IGNORE_NULLS || !source.patch_has_nulls) { + source_map = source.result_map; + source_has_input = source.result_has_input; + } else { + source_map = source.patch_map; + source_has_input = source.patch_has_input; + } + + if (!source_has_input || !source_map || source_map->Empty()) { + return; + } + + bool saw_nulls = false; + ApplyMapToMap(*target.result_map, *source_map, 0, null_treatment, &saw_nulls); + target.result_has_input = true; + + // For DELETE_NULLS, update target's patch_map for further combines when needed + if (null_treatment == JsonNullTreatment::DELETE_NULLS) { + if (target.patch_has_nulls || saw_nulls) { + if (!target.patch_has_nulls) { + target.patch_map = new JsonObject(*target.result_map); + target.patch_has_nulls = true; + } + ComposeMapToMap(*target.patch_map, 
*source_map, 0, null_treatment); + target.patch_has_input = true; + } } - auto null_treatment = GetNullTreatment(aggr_input_data.bind_data); - JsonGroupMergeApplyResultPatch(target, patch_root, null_treatment); - JsonGroupMergeComposePatch(target, patch_root, null_treatment); } template <class RESULT_TYPE, class STATE> static void Finalize(STATE &state, RESULT_TYPE &target, AggregateFinalizeData &finalize_data) { - if (!state.result_doc || !state.result_doc->root) { + if (!state.result_map) { finalize_data.ReturnNull(); return; } + + // Build yyjson document + auto doc = duckdb_yyjson::yyjson_mut_doc_new(nullptr); + if (!doc) { + throw InternalException("json_group_merge: failed to allocate output document"); + } + std::unique_ptr<duckdb_yyjson::yyjson_mut_doc, decltype(&duckdb_yyjson::yyjson_mut_doc_free)> doc_ptr( + doc, duckdb_yyjson::yyjson_mut_doc_free); + + duckdb_yyjson::yyjson_mut_val *root; + + // If scalar replacement is set, use it instead of the hash map + if (state.scalar_replacement) { + root = BuildYyjsonValue(doc, *state.scalar_replacement); + if (!root) { + throw InternalException("json_group_merge: failed to build output value"); + } + } else { + // Build object from hash map + root = duckdb_yyjson::yyjson_mut_obj(doc); + if (!root) { + throw InternalException("json_group_merge: failed to allocate output object"); + } + + // Populate from hash map + for (const auto &kv : *state.result_map) { + auto key = duckdb_yyjson::yyjson_mut_strncpy(doc, kv.first.c_str(), kv.first.size()); + auto val = BuildYyjsonValue(doc, kv.second); + if (key && val && !duckdb_yyjson::yyjson_mut_obj_add(root, key, val)) { + throw InternalException("json_group_merge: failed to add object member"); + } + } + } + duckdb_yyjson::yyjson_mut_doc_set_root(doc, root); + size_t output_length = 0; auto output_cstr = - yyjson_mut_write_opts(state.result_doc, JSONCommon::WRITE_FLAG, nullptr, &output_length, nullptr); + duckdb_yyjson::yyjson_mut_write_opts(doc, JSONCommon::WRITE_FLAG, nullptr, &output_length, nullptr); if (!output_cstr) { throw InternalException("json_group_merge: failed to
serialize aggregate result"); } @@ -751,19 +1209,49 @@ using duckdb_yyjson::yyjson_read_opts; using duckdb_yyjson::yyjson_val; using duckdb_yyjson::yyjson_val_mut_copy; +// Hash function for JSON keys (FNV-1a) +static inline uint64_t HashKeyBytes(const char *data, idx_t len) { + uint64_t hash = 14695981039346656037ULL; + for (idx_t i = 0; i < len; i++) { + hash ^= static_cast<uint64_t>(static_cast<unsigned char>(data[i])); + hash *= 1099511628211ULL; + } + return hash; +} + +// Count trailing zeros for bitset iteration (x must be non-zero) +static inline idx_t CountTrailingZeros(uint64_t x) { + D_ASSERT(x != 0); +#if defined(__GNUC__) || defined(__clang__) + return static_cast<idx_t>(__builtin_ctzll(x)); +#elif defined(_MSC_VER) + unsigned long idx; + _BitScanForward64(&idx, x); + return static_cast<idx_t>(idx); +#else + idx_t count = 0; + while ((x & 1) == 0) { + x >>= 1; + count++; + } + return count; +#endif +} + struct JsonExtractColumnsBindData : public FunctionData { JsonExtractColumnsBindData(vector<string> column_names_p, vector<string> patterns_p, child_list_t<LogicalType> children_p, duckdb_re2::RE2::Options options_p) : column_names(std::move(column_names_p)), patterns(std::move(patterns_p)), children(std::move(children_p)), - options(std::move(options_p)) { - CompilePatterns(); + options(std::move(options_p)), column_count(this->patterns.size()) { + BuildPatternSet(); } vector<string> column_names; vector<string> patterns; child_list_t<LogicalType> children; duckdb_re2::RE2::Options options; - vector<unique_ptr<duckdb_re2::RE2>> compiled_patterns; + unique_ptr<duckdb_re2::RE2::Set> pattern_set; + idx_t column_count; unique_ptr<FunctionData> Copy() const override { return make_uniq<JsonExtractColumnsBindData>(column_names, patterns, children, options); } @@ -776,28 +1264,204 @@ struct JsonExtractColumnsBindData : public FunctionData { } private: - void CompilePatterns() { - compiled_patterns.clear(); - compiled_patterns.reserve(patterns.size()); - for (auto &pattern : patterns) { - auto re = make_uniq<duckdb_re2::RE2>(pattern, options); - if (!re->ok()) { - throw BinderException("json_extract_columns: %s", re->error()); + void BuildPatternSet() { 
pattern_set = make_uniq<duckdb_re2::RE2::Set>(options, duckdb_re2::RE2::UNANCHORED); + for (idx_t i = 0; i < patterns.size(); i++) { + std::string error; + int idx = pattern_set->Add(patterns[i], &error); + if (idx < 0) { + throw BinderException("json_extract_columns: %s", error); + } + D_ASSERT(static_cast<idx_t>(idx) == i); + } + if (!pattern_set->Compile()) { + throw BinderException("json_extract_columns: failed to compile pattern set"); + } + } +}; + +// Cache for key→matched columns mapping +static constexpr idx_t KEY_MATCH_CACHE_SIZE = 8192; +static constexpr idx_t KEY_MATCH_CACHE_WAYS = 2; +static constexpr idx_t KEY_MATCH_CACHE_SETS = KEY_MATCH_CACHE_SIZE / KEY_MATCH_CACHE_WAYS; +static constexpr idx_t KEY_MATCH_MAX_KEY_STORAGE = 256 * 1024; + +struct KeyMatchCacheEntry { + idx_t key_offset = 0; + idx_t key_len = 0; + uint64_t key_hash = 0; + uint64_t match_bitset = 0; + bool valid = false; +}; + +struct KeyMatchCache { + std::array<KeyMatchCacheEntry, KEY_MATCH_CACHE_SIZE> entries; + vector<char> key_storage; + idx_t key_storage_offset = 0; + + KeyMatchCache() { + key_storage.resize(64 * 1024); + } + + void Reset() { + for (auto &e : entries) { + e.valid = false; + } + key_storage_offset = 0; + } + + bool IsFull() const { + return key_storage_offset >= KEY_MATCH_MAX_KEY_STORAGE; + } + + const char *GetKeyData(const KeyMatchCacheEntry &entry) const { + return key_storage.data() + entry.key_offset; + } + + // Returns true if entry was inserted, false if storage is exhausted + bool TryInsert(idx_t set_idx, const char *key_str, idx_t key_len, uint64_t key_hash, uint64_t match_bitset) { + // Find target slot: prefer invalid, else evict way 0 + idx_t target_way = 0; + for (idx_t way = 0; way < KEY_MATCH_CACHE_WAYS; way++) { + if (!entries[set_idx + way].valid) { + target_way = way; + break; + } + } + + // Grow storage if needed and possible + if (key_storage_offset + key_len > key_storage.size()) { + if (key_storage.size() >= KEY_MATCH_MAX_KEY_STORAGE) { + return false; + } + key_storage.resize(key_storage.size() + 64 * 1024); + } + 
auto &entry = entries[set_idx + target_way]; + std::memcpy(key_storage.data() + key_storage_offset, key_str, key_len); + entry.key_offset = key_storage_offset; + key_storage_offset += key_len; + entry.key_len = key_len; + entry.key_hash = key_hash; + entry.match_bitset = match_bitset; + entry.valid = true; + return true; + } +}; + +// Chunked cache for >64 patterns - stores match results as array of uint64_t chunks +struct ChunkedKeyMatchCacheEntry { + idx_t key_offset = 0; + idx_t key_len = 0; + uint64_t key_hash = 0; + idx_t match_offset = 0; + bool valid = false; +}; + +struct ChunkedKeyMatchCache { + std::array<ChunkedKeyMatchCacheEntry, KEY_MATCH_CACHE_SIZE> entries; + vector<char> key_storage; + vector<uint64_t> match_storage; + idx_t key_storage_offset = 0; + idx_t match_storage_offset = 0; + idx_t chunks_per_entry = 0; + + void Init(idx_t column_count) { + chunks_per_entry = (column_count + 63) / 64; + key_storage.resize(64 * 1024); + match_storage.resize(KEY_MATCH_CACHE_SIZE * chunks_per_entry); + } + + void Reset() { + for (auto &e : entries) { + e.valid = false; + } + key_storage_offset = 0; + match_storage_offset = 0; + } + + bool IsFull() const { + return key_storage_offset >= KEY_MATCH_MAX_KEY_STORAGE || + match_storage_offset + chunks_per_entry > match_storage.size(); + } + + const char *GetKeyData(const ChunkedKeyMatchCacheEntry &entry) const { + return key_storage.data() + entry.key_offset; + } + + const uint64_t *GetMatchChunks(const ChunkedKeyMatchCacheEntry &entry) const { + return match_storage.data() + entry.match_offset; + } + + // Returns pointer to match chunks if inserted, nullptr if storage is exhausted + const uint64_t *TryInsert(idx_t set_idx, const char *key_str, idx_t key_len, uint64_t key_hash, + const vector<int> &match_results) { + if (match_storage_offset + chunks_per_entry > match_storage.size()) { + return nullptr; + } + + // Find target slot: prefer invalid, else evict way 0 + idx_t target_way = 0; + for (idx_t way = 0; way < KEY_MATCH_CACHE_WAYS; way++) { + if (!entries[set_idx + way].valid) { + 
target_way = way; + break; + } + } + + // Grow key storage if needed and possible + if (key_storage_offset + key_len > key_storage.size()) { + if (key_storage.size() >= KEY_MATCH_MAX_KEY_STORAGE) { + return nullptr; } - compiled_patterns.push_back(std::move(re)); + key_storage.resize(key_storage.size() + 64 * 1024); + } + + auto &entry = entries[set_idx + target_way]; + + // Store key + std::memcpy(key_storage.data() + key_storage_offset, key_str, key_len); + entry.key_offset = key_storage_offset; + key_storage_offset += key_len; + entry.key_len = key_len; + entry.key_hash = key_hash; + + // Store match chunks + idx_t match_offset = match_storage_offset; + match_storage_offset += chunks_per_entry; + uint64_t *chunks = match_storage.data() + match_offset; + std::memset(chunks, 0, chunks_per_entry * sizeof(uint64_t)); + for (int idx : match_results) { + chunks[idx / 64] |= (1ULL << (idx % 64)); } + entry.match_offset = match_offset; + entry.valid = true; + + return chunks; } }; struct JsonExtractColumnsLocalState : public FunctionLocalState { JsonExtractColumnsLocalState(Allocator &allocator, idx_t column_count) : json_allocator(std::make_shared<JSONAllocator>(allocator)), buffers(column_count), - has_match(column_count, false) { + has_match(column_count, false), use_bitset_cache(column_count <= 64), column_count(column_count) { + match_results.reserve(column_count); + if (!use_bitset_cache) { + chunked_cache.Init(column_count); + } } shared_ptr<JSONAllocator> json_allocator; vector<string> buffers; vector<bool> has_match; + + // Cache for key→matches (≤64 patterns) + KeyMatchCache cache; + // Chunked cache for key→matches (>64 patterns) + ChunkedKeyMatchCache chunked_cache; + vector<int> match_results; + bool use_bitset_cache; + idx_t column_count; }; static unique_ptr<FunctionLocalState> @@ -1007,6 +1671,22 @@ static void JsonExtractColumnsFunction(DataChunk &args, ExpressionState &state, } std::fill(local_state.has_match.begin(), local_state.has_match.end(), false); + // Reset cache if storage is full + auto &cache = 
local_state.cache; + auto &chunked_cache = local_state.chunked_cache; + if (local_state.use_bitset_cache) { + if (cache.IsFull()) { + cache.Reset(); + } + } else { + if (chunked_cache.IsFull()) { + chunked_cache.Reset(); + } + } + + auto &pattern_set = *bind_data.pattern_set; + const bool use_bitset = local_state.use_bitset_cache; + yyjson_val *key = nullptr; yyjson_obj_iter iter = yyjson_obj_iter_with(root); while ((key = yyjson_obj_iter_next(&iter))) { @@ -1015,19 +1695,113 @@ static void JsonExtractColumnsFunction(DataChunk &args, ExpressionState &state, if (!key_str) { throw InvalidInputException("json_extract_columns: encountered non-string object key"); } + + // Hash the key for cache lookup + uint64_t key_hash = HashKeyBytes(key_str, key_len); + idx_t set_idx = (key_hash & (KEY_MATCH_CACHE_SETS - 1)) * KEY_MATCH_CACHE_WAYS; + auto value = yyjson_obj_iter_get_val(key); - duckdb_re2::StringPiece key_piece(key_str, key_len); - for (idx_t col_idx = 0; col_idx < column_count; col_idx++) { - auto ®ex = *bind_data.compiled_patterns[col_idx]; - if (!duckdb_re2::RE2::PartialMatch(key_piece, regex)) { - continue; + + if (use_bitset) { + // Fast path for ≤64 columns: use cache with bitset + // Cache lookup (2-way associative) + bool cache_hit = false; + uint64_t match_bitset = 0; + for (idx_t way = 0; way < KEY_MATCH_CACHE_WAYS; way++) { + auto &entry = cache.entries[set_idx + way]; + if (entry.valid && entry.key_hash == key_hash && entry.key_len == key_len && + std::memcmp(cache.GetKeyData(entry), key_str, key_len) == 0) { + cache_hit = true; + match_bitset = entry.match_bitset; + break; + } + } + + // On cache miss, compute matches and store in cache + if (!cache_hit) { + duckdb_re2::StringPiece key_piece(key_str, key_len); + local_state.match_results.clear(); + pattern_set.Match(key_piece, &local_state.match_results); + + // Convert to bitset + match_bitset = 0; + for (int idx : local_state.match_results) { + match_bitset |= (1ULL << idx); + } + + 
cache.TryInsert(set_idx, key_str, key_len, key_hash, match_bitset); } - if (local_state.has_match[col_idx]) { - local_state.buffers[col_idx].append(separator_data_ptr, separator_len); - } else { - local_state.has_match[col_idx] = true; + + // Iterate set bits + uint64_t remaining = match_bitset; + while (remaining != 0) { + idx_t col_idx = CountTrailingZeros(remaining); + remaining &= (remaining - 1); + + if (local_state.has_match[col_idx]) { + local_state.buffers[col_idx].append(separator_data_ptr, separator_len); + } else { + local_state.has_match[col_idx] = true; + } + AppendJsonValue(local_state.buffers[col_idx], value, alc); + } + } else { + // Fallback for > 64 columns: use chunked cache + idx_t chunked_set_idx = (key_hash & (KEY_MATCH_CACHE_SETS - 1)) * KEY_MATCH_CACHE_WAYS; + + // Cache lookup (2-way associative) + bool cache_hit = false; + const uint64_t *match_chunks = nullptr; + for (idx_t way = 0; way < KEY_MATCH_CACHE_WAYS; way++) { + auto &entry = chunked_cache.entries[chunked_set_idx + way]; + if (entry.valid && entry.key_hash == key_hash && entry.key_len == key_len && + std::memcmp(chunked_cache.GetKeyData(entry), key_str, key_len) == 0) { + cache_hit = true; + match_chunks = chunked_cache.GetMatchChunks(entry); + break; + } + } + + // On cache miss, compute matches and store in cache + if (!cache_hit) { + duckdb_re2::StringPiece key_piece(key_str, key_len); + local_state.match_results.clear(); + pattern_set.Match(key_piece, &local_state.match_results); + + match_chunks = + chunked_cache.TryInsert(chunked_set_idx, key_str, key_len, key_hash, local_state.match_results); + + // If cache insert failed, process directly from match_results + if (!match_chunks) { + for (int col_idx : local_state.match_results) { + if (local_state.has_match[col_idx]) { + local_state.buffers[col_idx].append(separator_data_ptr, separator_len); + } else { + local_state.has_match[col_idx] = true; + } + AppendJsonValue(local_state.buffers[col_idx], value, alc); + } + continue; 
+ } + } + + // Iterate over chunks and their set bits + for (idx_t chunk_idx = 0; chunk_idx < chunked_cache.chunks_per_entry; chunk_idx++) { + uint64_t remaining = match_chunks[chunk_idx]; + idx_t base_col = chunk_idx * 64; + while (remaining != 0) { + idx_t bit_idx = CountTrailingZeros(remaining); + remaining &= (remaining - 1); + idx_t col_idx = base_col + bit_idx; + + if (local_state.has_match[col_idx]) { + local_state.buffers[col_idx].append(separator_data_ptr, separator_len); + } else { + local_state.has_match[col_idx] = true; + } + AppendJsonValue(local_state.buffers[col_idx], value, alc); + } } - AppendJsonValue(local_state.buffers[col_idx], value, alc); } } diff --git a/test/sql/json_extract_columns.test b/test/sql/json_extract_columns.test index 9a8245a..425fe26 100644 --- a/test/sql/json_extract_columns.test +++ b/test/sql/json_extract_columns.test @@ -108,3 +108,18 @@ statement error SELECT json_extract_columns('{"a":1}', '{"a":"["}', ','); ---- :.*json_extract_columns: missing ].* + +# Test with >64 columns (fallback path without bitset cache) +# Uses 70 columns to exceed the 64-column bitset limit +query I +SELECT (json_extract_columns( + '{"k0":0,"k1":1,"k2":2,"k3":3,"k4":4,"k5":5,"k6":6,"k7":7,"k8":8,"k9":9,"k10":10,"k11":11,"k12":12,"k13":13,"k14":14,"k15":15,"k16":16,"k17":17,"k18":18,"k19":19,"k20":20,"k21":21,"k22":22,"k23":23,"k24":24,"k25":25,"k26":26,"k27":27,"k28":28,"k29":29,"k30":30,"k31":31,"k32":32,"k33":33,"k34":34,"k35":35,"k36":36,"k37":37,"k38":38,"k39":39,"k40":40,"k41":41,"k42":42,"k43":43,"k44":44,"k45":45,"k46":46,"k47":47,"k48":48,"k49":49,"k50":50,"k51":51,"k52":52,"k53":53,"k54":54,"k55":55,"k56":56,"k57":57,"k58":58,"k59":59,"k60":60,"k61":61,"k62":62,"k63":63,"k64":64,"k65":65,"k66":66,"k67":67,"k68":68,"k69":69}', + 
'{"c0":"^k0$","c1":"^k1$","c2":"^k2$","c3":"^k3$","c4":"^k4$","c5":"^k5$","c6":"^k6$","c7":"^k7$","c8":"^k8$","c9":"^k9$","c10":"^k10$","c11":"^k11$","c12":"^k12$","c13":"^k13$","c14":"^k14$","c15":"^k15$","c16":"^k16$","c17":"^k17$","c18":"^k18$","c19":"^k19$","c20":"^k20$","c21":"^k21$","c22":"^k22$","c23":"^k23$","c24":"^k24$","c25":"^k25$","c26":"^k26$","c27":"^k27$","c28":"^k28$","c29":"^k29$","c30":"^k30$","c31":"^k31$","c32":"^k32$","c33":"^k33$","c34":"^k34$","c35":"^k35$","c36":"^k36$","c37":"^k37$","c38":"^k38$","c39":"^k39$","c40":"^k40$","c41":"^k41$","c42":"^k42$","c43":"^k43$","c44":"^k44$","c45":"^k45$","c46":"^k46$","c47":"^k47$","c48":"^k48$","c49":"^k49$","c50":"^k50$","c51":"^k51$","c52":"^k52$","c53":"^k53$","c54":"^k54$","c55":"^k55$","c56":"^k56$","c57":"^k57$","c58":"^k58$","c59":"^k59$","c60":"^k60$","c61":"^k61$","c62":"^k62$","c63":"^k63$","c64":"^k64$","c65":"^k65$","c66":"^k66$","c67":"^k67$","c68":"^k68$","c69":"^k69$"}', + ',' +)).c0 = '0' AND (json_extract_columns( + '{"k0":0,"k1":1,"k2":2,"k3":3,"k4":4,"k5":5,"k6":6,"k7":7,"k8":8,"k9":9,"k10":10,"k11":11,"k12":12,"k13":13,"k14":14,"k15":15,"k16":16,"k17":17,"k18":18,"k19":19,"k20":20,"k21":21,"k22":22,"k23":23,"k24":24,"k25":25,"k26":26,"k27":27,"k28":28,"k29":29,"k30":30,"k31":31,"k32":32,"k33":33,"k34":34,"k35":35,"k36":36,"k37":37,"k38":38,"k39":39,"k40":40,"k41":41,"k42":42,"k43":43,"k44":44,"k45":45,"k46":46,"k47":47,"k48":48,"k49":49,"k50":50,"k51":51,"k52":52,"k53":53,"k54":54,"k55":55,"k56":56,"k57":57,"k58":58,"k59":59,"k60":60,"k61":61,"k62":62,"k63":63,"k64":64,"k65":65,"k66":66,"k67":67,"k68":68,"k69":69}', + 
'{"c0":"^k0$","c1":"^k1$","c2":"^k2$","c3":"^k3$","c4":"^k4$","c5":"^k5$","c6":"^k6$","c7":"^k7$","c8":"^k8$","c9":"^k9$","c10":"^k10$","c11":"^k11$","c12":"^k12$","c13":"^k13$","c14":"^k14$","c15":"^k15$","c16":"^k16$","c17":"^k17$","c18":"^k18$","c19":"^k19$","c20":"^k20$","c21":"^k21$","c22":"^k22$","c23":"^k23$","c24":"^k24$","c25":"^k25$","c26":"^k26$","c27":"^k27$","c28":"^k28$","c29":"^k29$","c30":"^k30$","c31":"^k31$","c32":"^k32$","c33":"^k33$","c34":"^k34$","c35":"^k35$","c36":"^k36$","c37":"^k37$","c38":"^k38$","c39":"^k39$","c40":"^k40$","c41":"^k41$","c42":"^k42$","c43":"^k43$","c44":"^k44$","c45":"^k45$","c46":"^k46$","c47":"^k47$","c48":"^k48$","c49":"^k49$","c50":"^k50$","c51":"^k51$","c52":"^k52$","c53":"^k53$","c54":"^k54$","c55":"^k55$","c56":"^k56$","c57":"^k57$","c58":"^k58$","c59":"^k59$","c60":"^k60$","c61":"^k61$","c62":"^k62$","c63":"^k63$","c64":"^k64$","c65":"^k65$","c66":"^k66$","c67":"^k67$","c68":"^k68$","c69":"^k69$"}', + ',' +)).c69 = '69' AS ok; +---- +true diff --git a/test/sql/json_group_merge.test b/test/sql/json_group_merge.test index 20e4414..500779f 100644 --- a/test/sql/json_group_merge.test +++ b/test/sql/json_group_merge.test @@ -35,6 +35,38 @@ FROM (VALUES ('{"flag":true}'::json, 1), ('{"flag":false}'::json, 2)) AS t(patch ---- {"flag":false} +# ======================================== +# Empty Key Handling +# ======================================== + +# Empty string is a valid JSON key +query I +SELECT json_group_merge(patch ORDER BY ts) +FROM (VALUES ('{"":1}'::json, 1), ('{"a":2}'::json, 2)) AS t(patch, ts); +---- +{"":1,"a":2} + +# Empty key with nested object +query I +SELECT json_group_merge(patch ORDER BY ts) +FROM (VALUES ('{"outer":{"":42}}'::json, 1)) AS t(patch, ts); +---- +{"outer":{"":42}} + +# Empty key update +query I +SELECT json_group_merge(patch ORDER BY ts) +FROM (VALUES ('{"":1}'::json, 1), ('{"":2}'::json, 2)) AS t(patch, ts); +---- +{"":2} + +# Empty key deletion with DELETE NULLS +query I +SELECT 
json_group_merge(patch ORDER BY ts) +FROM (VALUES ('{"":1,"keep":2}'::json, 1), ('{"":null}'::json, 2)) AS t(patch, ts); +---- +{"keep":2} + # ======================================== # NULL Handling # ======================================== @@ -301,6 +333,36 @@ ORDER BY grp; 6 {} {} 7 {} {} +query III +WITH patches AS ( + SELECT + g AS grp, + ts, + CASE + WHEN ts = 0 THEN '{"a":{"b":1}}'::JSON + WHEN ts = 9999 THEN '{"a":{"b":null}}'::JSON + ELSE '{}'::JSON + END AS patch + FROM range(0, 8) g(g) + CROSS JOIN range(0, 10000) t(ts) +) +SELECT + grp, + json_group_merge(patch ORDER BY ts) AS agg, + list_reduce(list(patch ORDER BY ts), (acc, p) -> json_merge_patch(acc, p), '{}'::JSON) AS baseline +FROM patches +GROUP BY grp +ORDER BY grp; +---- +0 {"a":{}} {"a":{}} +1 {"a":{}} {"a":{}} +2 {"a":{}} {"a":{}} +3 {"a":{}} {"a":{}} +4 {"a":{}} {"a":{}} +5 {"a":{}} {"a":{}} +6 {"a":{}} {"a":{}} +7 {"a":{}} {"a":{}} + query II WITH patches AS ( SELECT