From 8946d3338e362b8a23d9f06f486a02151bb6e9c4 Mon Sep 17 00:00:00 2001
From: mdzurick <mitch_dz@hotmail.com>
Date: Wed, 10 Jun 2026 21:57:31 +0000
Subject: [PATCH 1/2] Fix heap corruption from list[bool] kernel arguments   On
 local simulators a list[bool] is marshaled as a bit-packed  
 std::vector<bool>, but argument synthesis read every vector arg as a  
 {begin, end, capacity} pointer triple. That layout is wrong for  
 std::vector<bool> (its members are _Bit_iterators, not raw pointers), so it  
 walked off the buffer and corrupted the heap, or silently read an empty  
 vector when the padding was zero. This showed up as the flaky  
 test_list_subscript segfault.

  Read i1 vector args through the std::vector<bool> API when the target is a
  local simulator (gated by CompileTarget::isLocalSimulator). Remote and
  emulated targets pack std::vector<char> and are unchanged. Covers sample,
  observe, run, get_state, translate/get_qir, and lifted closure args.

  Regression from #4265 (argsCreator packing).

  Tests:
  - test_argument_conversion.cpp: synthesize a real std::vector<bool> and check
    the resulting constant array.
  - test_kernel_builder.py::test_repeated_builder_launch_no_segfault: run the
    make_kernel/sample loop under MALLOC_PERTURB_ so the crash is deterministic.

Signed-off-by: mdzurick <mitch_dz@hotmail.com>
---
 cudaq/include/cudaq/Target/CompileTarget.h    |  4 +
 python/tests/builder/test_kernel_builder.py   | 37 ++++++++
 runtime/cudaq/platform/default/python/QPU.cpp |  1 +
 .../internal/compiler/ArgumentConversion.cpp  | 91 +++++++++++++------
 runtime/internal/compiler/Compiler.cpp        |  6 +-
 .../compiler/ArgumentConversion.h             | 10 +-
 runtime/test/test_argument_conversion.cpp     | 21 ++++-
 7 files changed, 136 insertions(+), 34 deletions(-)
diff --git a/cudaq/include/cudaq/Target/CompileTarget.h b/cudaq/include/cudaq/Target/CompileTarget.h
index ffd1371f394..be25cec470c 100644
--- a/cudaq/include/cudaq/Target/CompileTarget.h
+++ b/cudaq/include/cudaq/Target/CompileTarget.h
@@ -100,6 +100,10 @@ class CompileTarget {
   /// Whether to fully specialize the kernel.
   bool fullySpecialize = true;
 
+  /// Whether this target is a local simulator (not remote, not emulated). On
+  /// this path `i1` vector args are packed as bit-packed `std::vector<bool>`.
+  bool isLocalSimulator = false;
+
   /// Set the `changeSemantics` flag for the argument synthesis pass.
   bool argumentSynthChangeSemantics = true;
 
diff --git a/python/tests/builder/test_kernel_builder.py b/python/tests/builder/test_kernel_builder.py
index 6c09ca9c101..650b85db7f3 100644
--- a/python/tests/builder/test_kernel_builder.py
+++ b/python/tests/builder/test_kernel_builder.py
@@ -13,6 +13,8 @@
 import random
 import numpy as np
 import os
+import subprocess
+import sys
 from typing import List
 
 import cudaq
@@ -1535,6 +1537,41 @@ def test_call_invalid_attribute_on_a_kernel():
     assert "not supported on PyKernel" in str(e.value)
 
 
+def test_repeated_builder_launch_no_segfault():
+    """A ``list[bool]`` arg used to corrupt the heap during argument synthesis,
+    crashing a repeated ``make_kernel`` + ``sample`` loop at a random iteration.
+
+    The crash only surfaces when the bit-packed ``std::vector<bool>`` padding is
+    nonzero, so run under ``MALLOC_PERTURB_`` (set before the process starts) in
+    a subprocess. See ``runtime/test/test_argument_conversion.cpp`` for the
+    unit-level regression.
+    """
+    script = (
+        "import cudaq\n"
+        "from typing import List\n"
+        "cudaq.set_target('qpp-cpu')\n"
+        "for _ in range(512):\n"
+        "    kernel, *_ = cudaq.make_kernel(bool, list[bool], List[int], list[float])\n"
+        "    kernel.qalloc(1)\n"
+        "    cudaq.sample(kernel, False, [False], [3], [3.5])\n"
+        "    cudaq.sample(kernel, False, [], [], [])\n"
+        "print('OK')\n")
+    env = dict(os.environ)
+    # Dirty fresh allocations so the std::vector<bool> padding is never zero,
+    # otherwise the corruption stays latent and the test passes with the bug.
+    env["MALLOC_PERTURB_"] = "165"
+    proc = subprocess.run([sys.executable, "-c", script],
+                          env=env,
+                          capture_output=True,
+                          text=True,
+                          timeout=900)
+    assert proc.returncode == 0, ("repeated make_kernel/sample crashed "
+                                  f"(returncode={proc.returncode}).\n"
+                                  f"stdout:\n{proc.stdout}\n"
+                                  f"stderr (tail):\n{proc.stderr[-3000:]}")
+    assert "OK" in proc.stdout
+
+
 # leave for gdb debugging
 if __name__ == "__main__":
     loc = os.path.abspath(__file__)
diff --git a/runtime/cudaq/platform/default/python/QPU.cpp b/runtime/cudaq/platform/default/python/QPU.cpp
index 4801d8e53bc..57ee82d560b 100644
--- a/runtime/cudaq/platform/default/python/QPU.cpp
+++ b/runtime/cudaq/platform/default/python/QPU.cpp
@@ -62,6 +62,7 @@ getCompileTarget(cudaq::ExecutionContext *context) {
       !(cudaq::is_remote_platform() || cudaq::is_emulated_platform());
 
   ct->fullySpecialize = !isLocalSimulator;
+  ct->isLocalSimulator = isLocalSimulator;
   ct->supportDeviceCalls = true;
   ct->emitResourceCounts = context && context->name == "resource-count";
   ct->argumentSynthChangeSemantics = false;
diff --git a/runtime/internal/compiler/ArgumentConversion.cpp b/runtime/internal/compiler/ArgumentConversion.cpp
index dcb0f09efab..32ec4e72864 100644
--- a/runtime/internal/compiler/ArgumentConversion.cpp
+++ b/runtime/internal/compiler/ArgumentConversion.cpp
@@ -97,15 +97,15 @@ static Value genConstant(OpBuilder &builder, const std::string &v,
 
 // Forward declare aggregate type builder as they can be recursive.
 static Value genRecursiveSpan(OpBuilder &, cudaq::cc::StdvecType, void *,
-                              ModuleOp, llvm::DataLayout &);
+                              ModuleOp, llvm::DataLayout &, bool);
 static Value genConstant(OpBuilder &, cudaq::cc::StdvecType, void *, ModuleOp,
-                         llvm::DataLayout &);
+                         llvm::DataLayout &, bool);
 static Value genConstant(OpBuilder &, cudaq::cc::StructType, void *, ModuleOp,
-                         llvm::DataLayout &);
+                         llvm::DataLayout &, bool);
 static Value genConstant(OpBuilder &, cudaq::cc::ArrayType, void *, ModuleOp,
-                         llvm::DataLayout &);
+                         llvm::DataLayout &, bool);
 static Value genConstant(OpBuilder &, cudaq::cc::CallableType, void *, ModuleOp,
-                         llvm::DataLayout &);
+                         llvm::DataLayout &, bool);
 
 /// Create callee.init_N that initializes the state
 ///
@@ -524,7 +524,7 @@ static bool isSupportedRecursiveSpan(cudaq::cc::StdvecType ty) {
 
 // Recursive step processing of aggregates.
 Value dispatchSubtype(OpBuilder &builder, Type ty, void *p, ModuleOp substMod,
-                      llvm::DataLayout &layout) {
+                      llvm::DataLayout &layout, bool boolVecBitPacked = false) {
   auto *ctx = builder.getContext();
   return TypeSwitch<Type, Value>(ty)
       .Case([&](IntegerType intTy) -> Value {
@@ -567,16 +567,16 @@ Value dispatchSubtype(OpBuilder &builder, Type ty, void *p, ModuleOp substMod,
                            substMod);
       })
       .Case([&](cudaq::cc::StdvecType ty) {
-        return genConstant(builder, ty, p, substMod, layout);
+        return genConstant(builder, ty, p, substMod, layout, boolVecBitPacked);
       })
       .Case([&](cudaq::cc::StructType ty) {
-        return genConstant(builder, ty, p, substMod, layout);
+        return genConstant(builder, ty, p, substMod, layout, boolVecBitPacked);
       })
       .Case([&](cudaq::cc::ArrayType ty) {
-        return genConstant(builder, ty, p, substMod, layout);
+        return genConstant(builder, ty, p, substMod, layout, boolVecBitPacked);
       })
       .Case([&](cudaq::cc::CallableType ty) {
-        return genConstant(builder, ty, p, substMod, layout);
+        return genConstant(builder, ty, p, substMod, layout, boolVecBitPacked);
       })
       .Default({});
 }
@@ -597,21 +597,40 @@ static std::size_t getHostSideElementSize(Type eleTy,
 }
 
 /// Recursively builds an `ArrayAttr` containing the constants.
+///
+/// Set \p boolVecBitPacked when an `i1` vector arg is a host
+/// `std::vector<bool>` (bit-packed; not the `{begin, end, capacity}` triple).
 ArrayAttr genRecursiveConstantArray(OpBuilder &builder,
                                     cudaq::cc::StdvecType vecTy, void *p,
-                                    llvm::DataLayout &layout) {
+                                    llvm::DataLayout &layout,
+                                    bool boolVecBitPacked = false) {
+  auto eleTy = vecTy.getElementType();
+
+  // Bit-packed `std::vector<bool>`: read via the container API, not a triple.
+  if (boolVecBitPacked && eleTy.isInteger(1)) {
+    auto *boolVec = reinterpret_cast<const std::vector<bool> *>(p);
+    if (boolVec->empty())
+      return {};
+    auto intTy = cast<IntegerType>(eleTy);
+    SmallVector<Attribute> members;
+    members.reserve(boolVec->size());
+    for (bool bit : *boolVec)
+      members.push_back(IntegerAttr::get(intTy, bit ? 1 : 0));
+    return ArrayAttr::get(builder.getContext(), members);
+  }
+
   typedef const char *VectorType[3];
   VectorType *vecPtr = static_cast<VectorType *>(p);
   auto delta = (*vecPtr)[1] - (*vecPtr)[0];
   if (!delta)
     return {};
-  auto eleTy = vecTy.getElementType();
   unsigned stepBy = 0;
   std::function<Attribute(char *)> genAttr;
   if (auto innerTy = dyn_cast<cudaq::cc::StdvecType>(eleTy)) {
     stepBy = sizeof(VectorType);
     genAttr = [&, innerTy](char *p) -> Attribute {
-      return genRecursiveConstantArray(builder, innerTy, p, layout);
+      return genRecursiveConstantArray(builder, innerTy, p, layout,
+                                       boolVecBitPacked);
     };
   } else if (auto stringTy = dyn_cast<cudaq::cc::CharspanType>(eleTy)) {
     stepBy = sizeof(std::string);
@@ -688,8 +707,10 @@ static Type convertRecursiveSpanType(Type ty) {
 /// constant propagation through the recursive span structure. The reify
 /// operation will be lowered to more primitive ops on an as-needed basis.
 Value genRecursiveSpan(OpBuilder &builder, cudaq::cc::StdvecType ty, void *p,
-                       ModuleOp substMod, llvm::DataLayout &layout) {
-  ArrayAttr constants = genRecursiveConstantArray(builder, ty, p, layout);
+                       ModuleOp substMod, llvm::DataLayout &layout,
+                       bool boolVecBitPacked = false) {
+  ArrayAttr constants =
+      genRecursiveConstantArray(builder, ty, p, layout, boolVecBitPacked);
   auto loc = builder.getUnknownLoc();
   if (!constants) {
     // Empty vector. Not much to contemplate here.
@@ -705,9 +726,11 @@ Value genRecursiveSpan(OpBuilder &builder, cudaq::cc::StdvecType ty, void *p,
 }
 
 Value genConstant(OpBuilder &builder, cudaq::cc::StdvecType vecTy, void *p,
-                  ModuleOp substMod, llvm::DataLayout &layout) {
+                  ModuleOp substMod, llvm::DataLayout &layout,
+                  bool boolVecBitPacked = false) {
   if (isSupportedRecursiveSpan(vecTy))
-    return genRecursiveSpan(builder, vecTy, p, substMod, layout);
+    return genRecursiveSpan(builder, vecTy, p, substMod, layout,
+                            boolVecBitPacked);
   typedef const char *VectorType[3];
   VectorType *vecPtr = static_cast<VectorType *>(p);
   auto delta = (*vecPtr)[1] - (*vecPtr)[0];
@@ -727,7 +750,7 @@ Value genConstant(OpBuilder &builder, cudaq::cc::StdvecType vecTy, void *p,
   for (std::int32_t i = 0; i < vecSize; ++i) {
     if (Value val = dispatchSubtype(
             builder, eleTy, static_cast<void *>(const_cast<char *>(cursor)),
-            substMod, layout)) {
+            substMod, layout, boolVecBitPacked)) {
       auto atLoc = cudaq::cc::ComputePtrOp::create(
           builder, loc, elePtrTy, buffer,
           ArrayRef<cudaq::cc::ComputePtrArg>{i});
@@ -740,7 +763,8 @@ Value genConstant(OpBuilder &builder, cudaq::cc::StdvecType vecTy, void *p,
 }
 
 Value genConstant(OpBuilder &builder, cudaq::cc::StructType strTy, void *p,
-                  ModuleOp substMod, llvm::DataLayout &layout) {
+                  ModuleOp substMod, llvm::DataLayout &layout,
+                  bool boolVecBitPacked = false) {
   if (strTy.getMembers().empty())
     return {};
   const char *cursor = static_cast<const char *>(p);
@@ -752,7 +776,7 @@ Value genConstant(OpBuilder &builder, cudaq::cc::StructType strTy, void *p,
             builder, iter.value(),
             static_cast<void *>(const_cast<char *>(
                 cursor + cudaq::opt::getDataOffset(layout, strTy, i))),
-            substMod, layout))
+            substMod, layout, boolVecBitPacked))
       aggie =
           cudaq::cc::InsertValueOp::create(builder, loc, strTy, aggie, v, i);
   }
@@ -760,7 +784,8 @@ Value genConstant(OpBuilder &builder, cudaq::cc::StructType strTy, void *p,
 }
 
 Value genConstant(OpBuilder &builder, cudaq::cc::CallableType callTy, void *p,
-                  ModuleOp substMod, llvm::DataLayout &layout) {
+                  ModuleOp substMod, llvm::DataLayout &layout,
+                  bool boolVecBitPacked = false) {
   if (!p)
     return {};
   auto loc = builder.getUnknownLoc();
@@ -787,7 +812,7 @@ Value genConstant(OpBuilder &builder, cudaq::cc::CallableType callTy, void *p,
         if (hasLiftedArgs) {
           for (unsigned i = liftedPos, j = 0; i < liftedArity; ++i, ++j) {
             Value v = dispatchSubtype(builder, calleeInpTys[i], closureArgs[j],
-                                      substMod, layout);
+                                      substMod, layout, boolVecBitPacked);
             assert(v && "lifted argument must be handled");
             args.push_back(v);
           }
@@ -806,7 +831,8 @@ Value genConstant(OpBuilder &builder, cudaq::cc::CallableType callTy, void *p,
 }
 
 Value genConstant(OpBuilder &builder, cudaq::cc::ArrayType arrTy, void *p,
-                  ModuleOp substMod, llvm::DataLayout &layout) {
+                  ModuleOp substMod, llvm::DataLayout &layout,
+                  bool boolVecBitPacked = false) {
   if (arrTy.isUnknownSize())
     return {};
   auto eleTy = arrTy.getElementType();
@@ -818,7 +844,7 @@ Value genConstant(OpBuilder &builder, cudaq::cc::ArrayType arrTy, void *p,
   for (std::size_t i = 0; i < arrSize; ++i) {
     if (Value v = dispatchSubtype(
             builder, eleTy, static_cast<void *>(const_cast<char *>(cursor)),
-            substMod, layout))
+            substMod, layout, boolVecBitPacked))
       aggie =
           cudaq::cc::InsertValueOp::create(builder, loc, arrTy, aggie, v, i);
     cursor += eleSize;
@@ -854,8 +880,9 @@ Value genConstant(OpBuilder &builder, cudaq::cc::IndirectCallableType indCallTy,
 //===----------------------------------------------------------------------===//
 
 cudaq_internal::compiler::ArgumentConverter::ArgumentConverter(
-    StringRef kernelName, ModuleOp sourceModule)
-    : sourceModule(sourceModule), kernelName(kernelName) {}
+    StringRef kernelName, ModuleOp sourceModule, bool boolVecBitPacked)
+    : sourceModule(sourceModule), kernelName(kernelName),
+      boolVecBitPacked(boolVecBitPacked) {}
 
 void cudaq_internal::compiler::ArgumentConverter::gen(
     std::span<void *const> arguments) {
@@ -954,19 +981,23 @@ void cudaq_internal::compiler::ArgumentConverter::gen(
               return {};
             })
             .Case([&](cudaq::cc::StdvecType ty) {
-              return buildSubst(ty, argPtr, substModule, dataLayout);
+              return buildSubst(ty, argPtr, substModule, dataLayout,
+                                boolVecBitPacked);
             })
             .Case([&](cudaq::cc::StructType ty) {
-              return buildSubst(ty, argPtr, substModule, dataLayout);
+              return buildSubst(ty, argPtr, substModule, dataLayout,
+                                boolVecBitPacked);
             })
             .Case([&](cudaq::cc::ArrayType ty) {
-              return buildSubst(ty, argPtr, substModule, dataLayout);
+              return buildSubst(ty, argPtr, substModule, dataLayout,
+                                boolVecBitPacked);
             })
             .Case([&](cudaq::cc::IndirectCallableType ty) {
               return buildSubst(ty, argPtr, substModule, dataLayout);
             })
             .Case([&](cudaq::cc::CallableType ty) {
-              return buildSubst(ty, argPtr, substModule, dataLayout);
+              return buildSubst(ty, argPtr, substModule, dataLayout,
+                                boolVecBitPacked);
             })
             .Default({});
     if (subst)
diff --git a/runtime/internal/compiler/Compiler.cpp b/runtime/internal/compiler/Compiler.cpp
index ab6e6c16986..aaf01878c91 100644
--- a/runtime/internal/compiler/Compiler.cpp
+++ b/runtime/internal/compiler/Compiler.cpp
@@ -224,7 +224,11 @@ cudaq_internal::compiler::Compiler::prepareModule(const std::string &kernelName,
       // For quantum devices, we generate a collection of `init` and
       // `num_qubits` functions and their substitutions created
       // from a kernel and arguments that generated a state argument.
-      cudaq_internal::compiler::ArgumentConverter argCon(kernelName, moduleOp);
+      // Local simulators marshal `i1` vectors as bit-packed `std::vector<bool>`
+      // (argsCreator); remote/emulated targets use `std::vector<char>`.
+      const bool boolVecBitPacked = target->isLocalSimulator;
+      cudaq_internal::compiler::ArgumentConverter argCon(kernelName, moduleOp,
+                                                         boolVecBitPacked);
       // Must stay in scope as `eraseNonCallableArguments` may populate it
       std::vector<void *> closureArgs;
       if (cudaq::opt::marshal::isFullySynthesized(epFunc)) {
diff --git a/runtime/internal/compiler/include/cudaq_internal/compiler/ArgumentConversion.h b/runtime/internal/compiler/include/cudaq_internal/compiler/ArgumentConversion.h
index 8abf11f7096..d83a2c673db 100644
--- a/runtime/internal/compiler/include/cudaq_internal/compiler/ArgumentConversion.h
+++ b/runtime/internal/compiler/include/cudaq_internal/compiler/ArgumentConversion.h
@@ -51,7 +51,11 @@ class ArgumentConverter {
 public:
   /// Build an instance to create argument substitutions for a specified \p
   /// kernelName in \p sourceModule.
-  ArgumentConverter(mlir::StringRef kernelName, mlir::ModuleOp sourceModule);
+  ///
+  /// Set \p boolVecBitPacked when `i1` vector args are host `std::vector<bool>`
+  /// (local-simulator launch path), not `std::vector<char>`.
+  ArgumentConverter(mlir::StringRef kernelName, mlir::ModuleOp sourceModule,
+                    bool boolVecBitPacked = false);
 
   ~ArgumentConverter() {
     for (auto *kInfo : kernelSubstitutions) {
@@ -110,6 +114,10 @@ class ArgumentConverter {
 
   /// Kernel we are substituting the arguments for.
   mlir::StringRef kernelName;
+
+  /// Whether `i1` vector args are bit-packed `std::vector<bool>` (vs
+  /// `std::vector<char>`). See the constructor.
+  bool boolVecBitPacked;
 };
 
 /// Merge modules from any CallableClosureArgument arguments into \p intoModule.
diff --git a/runtime/test/test_argument_conversion.cpp b/runtime/test/test_argument_conversion.cpp
index 82f3e3f814a..863a0d710bc 100644
--- a/runtime/test/test_argument_conversion.cpp
+++ b/runtime/test/test_argument_conversion.cpp
@@ -152,7 +152,8 @@ void dumpSubstitutionModules(ArgumentConverter &con) {
 
 void doSimpleTest(mlir::MLIRContext *ctx, const std::string &typeName,
                   std::vector<void *> args,
-                  const std::string &additionalCode = "") {
+                  const std::string &additionalCode = "",
+                  bool boolVecBitPacked = false) {
   std::string code = additionalCode + R"#(
 func.func private @callee(%0: )#" +
                      typeName + R"#()
@@ -166,7 +167,7 @@ func.func @__nvqpp__mlirgen__testy(%0: )#" +
   // Create the Module
   auto mod = mlir::parseSourceString<mlir::ModuleOp>(code, ctx);
   llvm::outs() << "Source module:\n" << *mod << '\n';
-  ArgumentConverter ab{"testy", *mod};
+  ArgumentConverter ab{"testy", *mod, boolVecBitPacked};
   // Create the argument conversions
   ab.gen(args);
   // Dump all conversions
@@ -400,6 +401,22 @@ void test_vectors(mlir::MLIRContext *ctx) {
  // CHECK:         }
   // clang-format on
 
+  {
+    // Real bit-packed `std::vector<bool>`, as the local-simulator launch path
+    // passes it. Reading this as a `{begin, end, capacity}` triple corrupts the
+    // heap; `boolVecBitPacked` selects the correct reader.
+    std::vector<bool> x = {true, false, true, true};
+    std::vector<void *> v = {static_cast<void *>(&x)};
+    doSimpleTest(ctx, "!cc.stdvec<i1>", v, /*additionalCode=*/"",
+                 /*boolVecBitPacked=*/true);
+  }
+  // clang-format off
+// CHECK-LABEL:   cc.arg_subst[0] {
+// CHECK: %[[VAL_0:.*]] = cc.const_array [true, false, true, true] : !cc.array<i1 x ?>
+// CHECK: %[[VAL_1:.*]] = cc.reify_span %[[VAL_0]] : (!cc.array<i1 x ?>) -> !cc.stdvec<i1>
+ // CHECK:         }
+  // clang-format on
+
   {
     std::vector<std::vector<cudaq::pauli_word>> x = {
         {cudaq::pauli_word{"XX"}, cudaq::pauli_word{"XY"}},

From d9dae8a5d029243cbe9959478bec5ca14c3cd821 Mon Sep 17 00:00:00 2001
From: mdzurick <mitch_dz@hotmail.com>
Date: Mon, 15 Jun 2026 18:48:06 +0000
Subject: [PATCH 2/2] Fix list[bool] heap corruption by packing per execution
 mode

A `list[bool]` kernel argument was marshaled as a bit-packed
`std::vector<bool>` for every local-simulator launch. That object is the
libstdc++/libc++ specialization (its members are `_Bit_iterator`s, not a
`{begin, end, capacity}` raw-pointer triple), so when the argument-synthesis
path (`ArgumentConverter`) read it as the universal vector triple it walked off
the buffer and corrupted the heap. This showed up as the flaky
`test_list_subscript` segfault. Regression from #4265 (argsCreator packing).

The two launch modes want different host encodings, and the args were always
packed the same way:

  - Argument synthesis folds the arguments in as constants and reads every
    vector as the `{begin, end, capacity}` triple, so an `i1` vector must be a
    `std::vector<char>` (one byte per element).
  - Direct launch (argsCreator) supplies the arguments at runtime through the
    generated thunk, whose existing "vector<bool> unpack" helpers understand a
    host `std::vector<bool>`.

Pick the encoding by mode in `marshal_arguments_for_module_launch`: a kernel
whose formal arguments are all unused is synthesized (`isFullySynthesized`),
so pack the synthesis encoding (`std::vector<char>`); otherwise a local
simulator direct-launches it, so keep the argsCreator encoding
(`std::vector<bool>`). This leaves `argsCreator` and `ArgumentConverter`
untouched and drops the `boolVecBitPacked`/`isLocalSimulator` plumbing that had
taught argument synthesis to read `std::vector<bool>` directly.

Verified on qpp-cpu and nvidia: test_kernel_parameters (list[bool],
list[list[bool]]), test_list_subscript + test_repeated_builder_launch_no_segfault
under MALLOC_PERTURB_, and the full python/tests/kernel + builder suites
(1018 passed). C++ test_argument_conversion FileCheck passes.

Signed-off-by: mdzurick <mitch_dz@hotmail.com>
---
 cudaq/include/cudaq/Target/CompileTarget.h    |  5 -
 .../cudaq/platform/py_alt_launch_kernel.cpp   | 16 +++-
 runtime/cudaq/platform/quantum_platform.cpp   |  1 -
 .../internal/compiler/ArgumentConversion.cpp  | 91 ++++++-------------
 runtime/internal/compiler/Compiler.cpp        |  6 +-
 .../compiler/ArgumentConversion.h             | 11 +--
 runtime/test/test_argument_conversion.cpp     | 21 +----
 7 files changed, 49 insertions(+), 102 deletions(-)

diff --git a/cudaq/include/cudaq/Target/CompileTarget.h b/cudaq/include/cudaq/Target/CompileTarget.h
index 95c1a9d9fdc..ffd1371f394 100644
--- a/cudaq/include/cudaq/Target/CompileTarget.h
+++ b/cudaq/include/cudaq/Target/CompileTarget.h
@@ -100,11 +100,6 @@ class CompileTarget {
   /// Whether to fully specialize the kernel.
   bool fullySpecialize = true;
 
-  /// Whether this target is a local simulator (not remote, not emulated). On
-  /// this path `i1` vector arguments are packed as bit-packed
-  /// `std::vector<bool>`.
-  bool isLocalSimulator = false;
-
   /// Set the `changeSemantics` flag for the argument synthesis pass.
   bool argumentSynthChangeSemantics = true;
 
diff --git a/python/runtime/cudaq/platform/py_alt_launch_kernel.cpp b/python/runtime/cudaq/platform/py_alt_launch_kernel.cpp
index 0175111c607..cd7edd0877c 100644
--- a/python/runtime/cudaq/platform/py_alt_launch_kernel.cpp
+++ b/python/runtime/cudaq/platform/py_alt_launch_kernel.cpp
@@ -1010,7 +1010,21 @@ cudaq::OpaqueArguments cudaq::marshal_arguments_for_module_launch(
                      unsigned pos) {
     return linkResolvedCallable(mod, kernelFunc, pos, pyArg);
   };
-  if (isLocalSimulator)
+  // Two encodings, one per execution mode (see PackingStyle):
+  //   - Direct launch (argsCreator): the kernel keeps live argument uses that
+  //     are supplied at runtime through the generated `.argsCreator`/thunk,
+  //     whose "C++ side magic" understands a host `std::vector<bool>` for an
+  //     `i1` vector. Used only for local simulators with un-synthesized args.
+  //   - Argument synthesis (the default): the arguments are folded into the
+  //     kernel as constants by `ArgumentConverter`, which reads every vector as
+  //     the universal `{begin, end, capacity}` triple and therefore must be
+  //     given the triple-compatible `std::vector<char>` for an `i1` vector
+  //     (never the bit-packed `std::vector<bool>` specialization).
+  // A kernel whose formal arguments are all unused is synthesized
+  // (`isFullySynthesized`); otherwise a local simulator direct-launches it.
+  const bool directLaunch =
+      isLocalSimulator && !cudaq::opt::factory::isFullySynthesized(kernelFunc);
+  if (directLaunch)
     cudaq::packArgs<cudaq::PackingStyle::argsCreator>(args, runtimeArgs,
                                                       kernelFunc, handler);
   else
diff --git a/runtime/cudaq/platform/quantum_platform.cpp b/runtime/cudaq/platform/quantum_platform.cpp
index 45ceba25a31..bd04812a2bc 100644
--- a/runtime/cudaq/platform/quantum_platform.cpp
+++ b/runtime/cudaq/platform/quantum_platform.cpp
@@ -114,7 +114,6 @@ getDefaultPythonCompileTargetImpl() {
   bool isLocalSimulator = !(platform->is_remote() || platform->is_emulated());
 
   ct->fullySpecialize = !isLocalSimulator;
-  ct->isLocalSimulator = isLocalSimulator;
   ct->supportDeviceCalls = true;
   ct->argumentSynthChangeSemantics = false;
   ct->pipelineConfig.codegenTranslation = "qir:";
diff --git a/runtime/internal/compiler/ArgumentConversion.cpp b/runtime/internal/compiler/ArgumentConversion.cpp
index 32ec4e72864..dcb0f09efab 100644
--- a/runtime/internal/compiler/ArgumentConversion.cpp
+++ b/runtime/internal/compiler/ArgumentConversion.cpp
@@ -97,15 +97,15 @@ static Value genConstant(OpBuilder &builder, const std::string &v,
 
 // Forward declare aggregate type builder as they can be recursive.
 static Value genRecursiveSpan(OpBuilder &, cudaq::cc::StdvecType, void *,
-                              ModuleOp, llvm::DataLayout &, bool);
+                              ModuleOp, llvm::DataLayout &);
 static Value genConstant(OpBuilder &, cudaq::cc::StdvecType, void *, ModuleOp,
-                         llvm::DataLayout &, bool);
+                         llvm::DataLayout &);
 static Value genConstant(OpBuilder &, cudaq::cc::StructType, void *, ModuleOp,
-                         llvm::DataLayout &, bool);
+                         llvm::DataLayout &);
 static Value genConstant(OpBuilder &, cudaq::cc::ArrayType, void *, ModuleOp,
-                         llvm::DataLayout &, bool);
+                         llvm::DataLayout &);
 static Value genConstant(OpBuilder &, cudaq::cc::CallableType, void *, ModuleOp,
-                         llvm::DataLayout &, bool);
+                         llvm::DataLayout &);
 
 /// Create callee.init_N that initializes the state
 ///
@@ -524,7 +524,7 @@ static bool isSupportedRecursiveSpan(cudaq::cc::StdvecType ty) {
 
 // Recursive step processing of aggregates.
 Value dispatchSubtype(OpBuilder &builder, Type ty, void *p, ModuleOp substMod,
-                      llvm::DataLayout &layout, bool boolVecBitPacked = false) {
+                      llvm::DataLayout &layout) {
   auto *ctx = builder.getContext();
   return TypeSwitch<Type, Value>(ty)
       .Case([&](IntegerType intTy) -> Value {
@@ -567,16 +567,16 @@ Value dispatchSubtype(OpBuilder &builder, Type ty, void *p, ModuleOp substMod,
                            substMod);
       })
       .Case([&](cudaq::cc::StdvecType ty) {
-        return genConstant(builder, ty, p, substMod, layout, boolVecBitPacked);
+        return genConstant(builder, ty, p, substMod, layout);
       })
       .Case([&](cudaq::cc::StructType ty) {
-        return genConstant(builder, ty, p, substMod, layout, boolVecBitPacked);
+        return genConstant(builder, ty, p, substMod, layout);
       })
       .Case([&](cudaq::cc::ArrayType ty) {
-        return genConstant(builder, ty, p, substMod, layout, boolVecBitPacked);
+        return genConstant(builder, ty, p, substMod, layout);
       })
       .Case([&](cudaq::cc::CallableType ty) {
-        return genConstant(builder, ty, p, substMod, layout, boolVecBitPacked);
+        return genConstant(builder, ty, p, substMod, layout);
       })
       .Default({});
 }
@@ -597,40 +597,21 @@ static std::size_t getHostSideElementSize(Type eleTy,
 }
 
 /// Recursively builds an `ArrayAttr` containing the constants.
-///
-/// Set \p boolVecBitPacked when an `i1` vector arg is a host
-/// `std::vector<bool>` (bit-packed; not the `{begin, end, capacity}` triple).
 ArrayAttr genRecursiveConstantArray(OpBuilder &builder,
                                     cudaq::cc::StdvecType vecTy, void *p,
-                                    llvm::DataLayout &layout,
-                                    bool boolVecBitPacked = false) {
-  auto eleTy = vecTy.getElementType();
-
-  // Bit-packed `std::vector<bool>`: read via the container API, not a triple.
-  if (boolVecBitPacked && eleTy.isInteger(1)) {
-    auto *boolVec = reinterpret_cast<const std::vector<bool> *>(p);
-    if (boolVec->empty())
-      return {};
-    auto intTy = cast<IntegerType>(eleTy);
-    SmallVector<Attribute> members;
-    members.reserve(boolVec->size());
-    for (bool bit : *boolVec)
-      members.push_back(IntegerAttr::get(intTy, bit ? 1 : 0));
-    return ArrayAttr::get(builder.getContext(), members);
-  }
-
+                                    llvm::DataLayout &layout) {
   typedef const char *VectorType[3];
   VectorType *vecPtr = static_cast<VectorType *>(p);
   auto delta = (*vecPtr)[1] - (*vecPtr)[0];
   if (!delta)
     return {};
+  auto eleTy = vecTy.getElementType();
   unsigned stepBy = 0;
   std::function<Attribute(char *)> genAttr;
   if (auto innerTy = dyn_cast<cudaq::cc::StdvecType>(eleTy)) {
     stepBy = sizeof(VectorType);
     genAttr = [&, innerTy](char *p) -> Attribute {
-      return genRecursiveConstantArray(builder, innerTy, p, layout,
-                                       boolVecBitPacked);
+      return genRecursiveConstantArray(builder, innerTy, p, layout);
     };
   } else if (auto stringTy = dyn_cast<cudaq::cc::CharspanType>(eleTy)) {
     stepBy = sizeof(std::string);
@@ -707,10 +688,8 @@ static Type convertRecursiveSpanType(Type ty) {
 /// constant propagation through the recursive span structure. The reify
 /// operation will be lowered to more primitive ops on an as-needed basis.
 Value genRecursiveSpan(OpBuilder &builder, cudaq::cc::StdvecType ty, void *p,
-                       ModuleOp substMod, llvm::DataLayout &layout,
-                       bool boolVecBitPacked = false) {
-  ArrayAttr constants =
-      genRecursiveConstantArray(builder, ty, p, layout, boolVecBitPacked);
+                       ModuleOp substMod, llvm::DataLayout &layout) {
+  ArrayAttr constants = genRecursiveConstantArray(builder, ty, p, layout);
   auto loc = builder.getUnknownLoc();
   if (!constants) {
     // Empty vector. Not much to contemplate here.
@@ -726,11 +705,9 @@ Value genRecursiveSpan(OpBuilder &builder, cudaq::cc::StdvecType ty, void *p,
 }
 
 Value genConstant(OpBuilder &builder, cudaq::cc::StdvecType vecTy, void *p,
-                  ModuleOp substMod, llvm::DataLayout &layout,
-                  bool boolVecBitPacked = false) {
+                  ModuleOp substMod, llvm::DataLayout &layout) {
   if (isSupportedRecursiveSpan(vecTy))
-    return genRecursiveSpan(builder, vecTy, p, substMod, layout,
-                            boolVecBitPacked);
+    return genRecursiveSpan(builder, vecTy, p, substMod, layout);
   typedef const char *VectorType[3];
   VectorType *vecPtr = static_cast<VectorType *>(p);
   auto delta = (*vecPtr)[1] - (*vecPtr)[0];
@@ -750,7 +727,7 @@ Value genConstant(OpBuilder &builder, cudaq::cc::StdvecType vecTy, void *p,
   for (std::int32_t i = 0; i < vecSize; ++i) {
     if (Value val = dispatchSubtype(
             builder, eleTy, static_cast<void *>(const_cast<char *>(cursor)),
-            substMod, layout, boolVecBitPacked)) {
+            substMod, layout)) {
       auto atLoc = cudaq::cc::ComputePtrOp::create(
           builder, loc, elePtrTy, buffer,
           ArrayRef<cudaq::cc::ComputePtrArg>{i});
@@ -763,8 +740,7 @@ Value genConstant(OpBuilder &builder, cudaq::cc::StdvecType vecTy, void *p,
 }
 
 Value genConstant(OpBuilder &builder, cudaq::cc::StructType strTy, void *p,
-                  ModuleOp substMod, llvm::DataLayout &layout,
-                  bool boolVecBitPacked = false) {
+                  ModuleOp substMod, llvm::DataLayout &layout) {
   if (strTy.getMembers().empty())
     return {};
   const char *cursor = static_cast<const char *>(p);
@@ -776,7 +752,7 @@ Value genConstant(OpBuilder &builder, cudaq::cc::StructType strTy, void *p,
             builder, iter.value(),
             static_cast<void *>(const_cast<char *>(
                 cursor + cudaq::opt::getDataOffset(layout, strTy, i))),
-            substMod, layout, boolVecBitPacked))
+            substMod, layout))
       aggie =
           cudaq::cc::InsertValueOp::create(builder, loc, strTy, aggie, v, i);
   }
@@ -784,8 +760,7 @@ Value genConstant(OpBuilder &builder, cudaq::cc::StructType strTy, void *p,
 }
 
 Value genConstant(OpBuilder &builder, cudaq::cc::CallableType callTy, void *p,
-                  ModuleOp substMod, llvm::DataLayout &layout,
-                  bool boolVecBitPacked = false) {
+                  ModuleOp substMod, llvm::DataLayout &layout) {
   if (!p)
     return {};
   auto loc = builder.getUnknownLoc();
@@ -812,7 +787,7 @@ Value genConstant(OpBuilder &builder, cudaq::cc::CallableType callTy, void *p,
         if (hasLiftedArgs) {
           for (unsigned i = liftedPos, j = 0; i < liftedArity; ++i, ++j) {
             Value v = dispatchSubtype(builder, calleeInpTys[i], closureArgs[j],
-                                      substMod, layout, boolVecBitPacked);
+                                      substMod, layout);
             assert(v && "lifted argument must be handled");
             args.push_back(v);
           }
@@ -831,8 +806,7 @@ Value genConstant(OpBuilder &builder, cudaq::cc::CallableType callTy, void *p,
 }
 
 Value genConstant(OpBuilder &builder, cudaq::cc::ArrayType arrTy, void *p,
-                  ModuleOp substMod, llvm::DataLayout &layout,
-                  bool boolVecBitPacked = false) {
+                  ModuleOp substMod, llvm::DataLayout &layout) {
   if (arrTy.isUnknownSize())
     return {};
   auto eleTy = arrTy.getElementType();
@@ -844,7 +818,7 @@ Value genConstant(OpBuilder &builder, cudaq::cc::ArrayType arrTy, void *p,
   for (std::size_t i = 0; i < arrSize; ++i) {
     if (Value v = dispatchSubtype(
             builder, eleTy, static_cast<void *>(const_cast<char *>(cursor)),
-            substMod, layout, boolVecBitPacked))
+            substMod, layout))
       aggie =
           cudaq::cc::InsertValueOp::create(builder, loc, arrTy, aggie, v, i);
     cursor += eleSize;
@@ -880,9 +854,8 @@ Value genConstant(OpBuilder &builder, cudaq::cc::IndirectCallableType indCallTy,
 //===----------------------------------------------------------------------===//
 
 cudaq_internal::compiler::ArgumentConverter::ArgumentConverter(
-    StringRef kernelName, ModuleOp sourceModule, bool boolVecBitPacked)
-    : sourceModule(sourceModule), kernelName(kernelName),
-      boolVecBitPacked(boolVecBitPacked) {}
+    StringRef kernelName, ModuleOp sourceModule)
+    : sourceModule(sourceModule), kernelName(kernelName) {}
 
 void cudaq_internal::compiler::ArgumentConverter::gen(
     std::span<void *const> arguments) {
@@ -981,23 +954,19 @@ void cudaq_internal::compiler::ArgumentConverter::gen(
               return {};
             })
             .Case([&](cudaq::cc::StdvecType ty) {
-              return buildSubst(ty, argPtr, substModule, dataLayout,
-                                boolVecBitPacked);
+              return buildSubst(ty, argPtr, substModule, dataLayout);
             })
             .Case([&](cudaq::cc::StructType ty) {
-              return buildSubst(ty, argPtr, substModule, dataLayout,
-                                boolVecBitPacked);
+              return buildSubst(ty, argPtr, substModule, dataLayout);
             })
             .Case([&](cudaq::cc::ArrayType ty) {
-              return buildSubst(ty, argPtr, substModule, dataLayout,
-                                boolVecBitPacked);
+              return buildSubst(ty, argPtr, substModule, dataLayout);
             })
             .Case([&](cudaq::cc::IndirectCallableType ty) {
               return buildSubst(ty, argPtr, substModule, dataLayout);
             })
             .Case([&](cudaq::cc::CallableType ty) {
-              return buildSubst(ty, argPtr, substModule, dataLayout,
-                                boolVecBitPacked);
+              return buildSubst(ty, argPtr, substModule, dataLayout);
             })
             .Default({});
     if (subst)
diff --git a/runtime/internal/compiler/Compiler.cpp b/runtime/internal/compiler/Compiler.cpp
index 7bf586081ce..15959742790 100644
--- a/runtime/internal/compiler/Compiler.cpp
+++ b/runtime/internal/compiler/Compiler.cpp
@@ -224,11 +224,7 @@ cudaq_internal::compiler::Compiler::prepareModule(const std::string &kernelName,
       // For quantum devices, we generate a collection of `init` and
       // `num_qubits` functions and their substitutions created
       // from a kernel and arguments that generated a state argument.
-      // Local simulators marshal `i1` vectors as bit-packed `std::vector<bool>`
-      // (argsCreator); remote/emulated targets use `std::vector<char>`.
-      const bool boolVecBitPacked = target->isLocalSimulator;
-      cudaq_internal::compiler::ArgumentConverter argCon(kernelName, moduleOp,
-                                                         boolVecBitPacked);
+      cudaq_internal::compiler::ArgumentConverter argCon(kernelName, moduleOp);
       // Must stay in scope as `eraseNonCallableArguments` may populate it
       std::vector<void *> closureArgs;
       if (cudaq::opt::factory::isFullySynthesized(epFunc)) {
diff --git a/runtime/internal/compiler/include/cudaq_internal/compiler/ArgumentConversion.h b/runtime/internal/compiler/include/cudaq_internal/compiler/ArgumentConversion.h
index 20148e89fe9..8abf11f7096 100644
--- a/runtime/internal/compiler/include/cudaq_internal/compiler/ArgumentConversion.h
+++ b/runtime/internal/compiler/include/cudaq_internal/compiler/ArgumentConversion.h
@@ -51,12 +51,7 @@ class ArgumentConverter {
 public:
   /// Build an instance to create argument substitutions for a specified \p
   /// kernelName in \p sourceModule.
-  ///
-  /// Set \p boolVecBitPacked when `i1` vector arguments are host
-  /// `std::vector<bool>` (local-simulator launch path), not
-  /// `std::vector<char>`.
-  ArgumentConverter(mlir::StringRef kernelName, mlir::ModuleOp sourceModule,
-                    bool boolVecBitPacked = false);
+  ArgumentConverter(mlir::StringRef kernelName, mlir::ModuleOp sourceModule);
 
   ~ArgumentConverter() {
     for (auto *kInfo : kernelSubstitutions) {
@@ -115,10 +110,6 @@ class ArgumentConverter {
 
   /// Kernel we are substituting the arguments for.
   mlir::StringRef kernelName;
-
-  /// Whether `i1` vector arguments are bit-packed `std::vector<bool>` (vs
-  /// `std::vector<char>`). See the constructor.
-  bool boolVecBitPacked;
 };
 
 /// Merge modules from any CallableClosureArgument arguments into \p intoModule.
diff --git a/runtime/test/test_argument_conversion.cpp b/runtime/test/test_argument_conversion.cpp
index 863a0d710bc..82f3e3f814a 100644
--- a/runtime/test/test_argument_conversion.cpp
+++ b/runtime/test/test_argument_conversion.cpp
@@ -152,8 +152,7 @@ void dumpSubstitutionModules(ArgumentConverter &con) {
 
 void doSimpleTest(mlir::MLIRContext *ctx, const std::string &typeName,
                   std::vector<void *> args,
-                  const std::string &additionalCode = "",
-                  bool boolVecBitPacked = false) {
+                  const std::string &additionalCode = "") {
   std::string code = additionalCode + R"#(
 func.func private @callee(%0: )#" +
                      typeName + R"#()
@@ -167,7 +166,7 @@ func.func @__nvqpp__mlirgen__testy(%0: )#" +
   // Create the Module
   auto mod = mlir::parseSourceString<mlir::ModuleOp>(code, ctx);
   llvm::outs() << "Source module:\n" << *mod << '\n';
-  ArgumentConverter ab{"testy", *mod, boolVecBitPacked};
+  ArgumentConverter ab{"testy", *mod};
   // Create the argument conversions
   ab.gen(args);
   // Dump all conversions
@@ -401,22 +400,6 @@ void test_vectors(mlir::MLIRContext *ctx) {
  // CHECK:         }
   // clang-format on
 
-  {
-    // Real bit-packed `std::vector<bool>`, as the local-simulator launch path
-    // passes it. Reading this as a `{begin, end, capacity}` triple corrupts the
-    // heap; `boolVecBitPacked` selects the correct reader.
-    std::vector<bool> x = {true, false, true, true};
-    std::vector<void *> v = {static_cast<void *>(&x)};
-    doSimpleTest(ctx, "!cc.stdvec<i1>", v, /*additionalCode=*/"",
-                 /*boolVecBitPacked=*/true);
-  }
-  // clang-format off
-// CHECK-LABEL:   cc.arg_subst[0] {
-// CHECK: %[[VAL_0:.*]] = cc.const_array [true, false, true, true] : !cc.array<i1 x ?>
-// CHECK: %[[VAL_1:.*]] = cc.reify_span %[[VAL_0]] : (!cc.array<i1 x ?>) -> !cc.stdvec<i1>
- // CHECK:         }
-  // clang-format on
-
   {
     std::vector<std::vector<cudaq::pauli_word>> x = {
         {cudaq::pauli_word{"XX"}, cudaq::pauli_word{"XY"}},