New sample option: stacked measurements

Signed-off-by: Ben Howe <[email protected]>
NVIDIA · Jan 31, 2025 · 5cfc41f · 5cfc41f
1 parent d893046
commit 5cfc41f
Show file tree

Hide file tree

Showing 9 changed files with 145 additions and 34 deletions.
diff --git a/runtime/common/ExecutionContext.h b/runtime/common/ExecutionContext.h
@@ -113,6 +113,9 @@ class ExecutionContext {
   /// calculation on simulation backends that support trajectory simulation.
   std::optional<std::size_t> numberTrajectories;
 
+  /// @brief Whether or not to simply stack measurements in execution order.
+  bool stackMeasurements = false;
+
   /// @brief The Constructor, takes the name of the context
   /// @param n The name of the context
   ExecutionContext(const std::string n) : name(n) {}

diff --git a/runtime/common/MeasureCounts.cpp b/runtime/common/MeasureCounts.cpp
@@ -192,14 +192,31 @@ sample_result::sample_result(double preComputedExp,
     totalShots += count;
 }
 
-void sample_result::append(ExecutionResult &result) {
-  // If given a result corresponding to the same register name,
-  // replace the existing one if in the map.
+void sample_result::append(ExecutionResult &result, bool stack) {
+  // If given a result corresponding to the same register name, either a)
+  // replace the existing one if in the map if stack is false, or b) if stack is
+  // true, stitch the bitstrings from "result" into the existing one.
   auto iter = sampleResults.find(result.registerName);
-  if (iter != sampleResults.end())
-    iter->second = result;
-  else
+  if (iter != sampleResults.end()) {
+    auto &existingExecResult = iter->second;
+    if (stack) {
+      // Stitch the bitstrings together
+      if (this->totalShots == result.sequentialData.size()) {
+        existingExecResult.counts.clear();
+        for (std::size_t i = 0; i < this->totalShots; i++) {
+          std::string newStr =
+              existingExecResult.sequentialData[i] + result.sequentialData[i];
+          existingExecResult.counts[newStr]++;
+          existingExecResult.sequentialData[i] = std::move(newStr);
+        }
+      }
+    } else {
+      // Replace the existing one
+      existingExecResult = result;
+    }
+  } else {
     sampleResults.insert({result.registerName, result});
+  }
   if (!totalShots)
     for (auto &[bits, count] : result.counts)
       totalShots += count;

diff --git a/runtime/common/MeasureCounts.h b/runtime/common/MeasureCounts.h
@@ -142,8 +142,9 @@ class sample_result {
       const std::string_view registerName = GlobalRegisterName) const;
 
   /// @brief Add another `ExecutionResult` to this `sample_result`.
-  /// @param result
-  void append(ExecutionResult &result);
+  /// @param result Result to append
+  /// @param stack If prior results are found, this stacks the bitstrings.
+  void append(ExecutionResult &result, bool stack = false);
 
   /// @brief Return all register names. Can be used in tandem with
   /// sample_result::to_map(regName : string) to retrieve the counts

diff --git a/runtime/cudaq/algorithms/sample.h b/runtime/cudaq/algorithms/sample.h
@@ -39,15 +39,16 @@ namespace details {
 template <typename KernelFunctor>
 std::optional<sample_result>
 runSampling(KernelFunctor &&wrappedKernel, quantum_platform &platform,
-            const std::string &kernelName, int shots, std::size_t qpu_id = 0,
-            details::future *futureResult = nullptr,
+            const std::string &kernelName, int shots, bool stackMeasurements,
+            std::size_t qpu_id = 0, details::future *futureResult = nullptr,
             std::size_t batchIteration = 0, std::size_t totalBatchIters = 0) {
   // Create the execution context.
   auto ctx = std::make_unique<ExecutionContext>("sample", shots);
   ctx->kernelName = kernelName;
 
   ctx->batchIteration = batchIteration;
   ctx->totalIterations = totalBatchIters;
+  ctx->stackMeasurements = stackMeasurements;
 
   // Tell the context if this quantum kernel has
   // conditionals on measure results
@@ -160,15 +161,17 @@ auto runSamplingAsync(KernelFunctor &&wrappedKernel, quantum_platform &platform,
   if (platform.is_remote(qpu_id)) {
     details::future futureResult;
     details::runSampling(std::forward<KernelFunctor>(wrappedKernel), platform,
-                         kernelName, shots, qpu_id, &futureResult);
+                         kernelName, shots, /*stackMeasurements=*/false, qpu_id,
+                         &futureResult);
     return async_sample_result(std::move(futureResult));
   }
 
   // Otherwise we'll create our own future/promise and return it
   KernelExecutionTask task(
       [qpu_id, shots, kernelName, &platform,
        kernel = std::forward<KernelFunctor>(wrappedKernel)]() mutable {
-        return details::runSampling(kernel, platform, kernelName, shots, qpu_id)
+        return details::runSampling(kernel, platform, kernelName, shots,
+                                    /*stackMeasurements=*/false, qpu_id)
             .value();
       });
 
@@ -181,9 +184,12 @@ auto runSamplingAsync(KernelFunctor &&wrappedKernel, quantum_platform &platform,
 ///
 /// @param shots number of shots to run for the given kernel
 /// @param noise noise model to use for the sample operation
+/// @param stack_measurements Whether or not to simply stack measurements in
+/// execution order for the \p sample_result
 struct sample_options {
   std::size_t shots = 1000;
   cudaq::noise_model noise;
+  bool stack_measurements = false;
 };
 
 /// @overload
@@ -223,7 +229,7 @@ sample_result sample(QuantumKernel &&kernel, Args &&...args) {
                cudaq::invokeKernel(std::forward<QuantumKernel>(kernel),
                                    std::forward<Args>(args)...);
              },
-             platform, kernelName, shots)
+             platform, kernelName, shots, /*stackMeasurements=*/false)
       .value();
 }
 
@@ -264,7 +270,7 @@ auto sample(std::size_t shots, QuantumKernel &&kernel, Args &&...args) {
                cudaq::invokeKernel(std::forward<QuantumKernel>(kernel),
                                    std::forward<Args>(args)...);
              },
-             platform, kernelName, shots)
+             platform, kernelName, shots, /*stackMeasurements=*/false)
       .value();
 }
 
@@ -307,7 +313,7 @@ sample_result sample(const sample_options &options, QuantumKernel &&kernel,
                    cudaq::invokeKernel(std::forward<QuantumKernel>(kernel),
                                        std::forward<Args>(args)...);
                  },
-                 platform, kernelName, shots)
+                 platform, kernelName, shots, options.stack_measurements)
                  .value();
   platform.reset_noise();
   return ret;
@@ -487,7 +493,8 @@ std::vector<sample_result> sample(QuantumKernel &&kernel,
                    [&kernel, &singleIterParameters...]() mutable {
                      kernel(std::forward<Args>(singleIterParameters)...);
                    },
-                   platform, kernelName, shots, qpuId, nullptr, counter, N)
+                   platform, kernelName, shots, /*stackMeasurements=*/false,
+                   qpuId, nullptr, counter, N)
                    .value();
     return ret;
   };
@@ -529,7 +536,8 @@ std::vector<sample_result> sample(std::size_t shots, QuantumKernel &&kernel,
                    [&kernel, &singleIterParameters...]() mutable {
                      kernel(std::forward<Args>(singleIterParameters)...);
                    },
-                   platform, kernelName, shots, qpuId, nullptr, counter, N)
+                   platform, kernelName, shots, /*stackMeasurements=*/false,
+                   qpuId, nullptr, counter, N)
                    .value();
     return ret;
   };
@@ -568,15 +576,17 @@ std::vector<sample_result> sample(const sample_options &options,
   // Create the functor that will broadcast the sampling tasks across
   // all requested argument sets provided.
   details::BroadcastFunctorType<sample_result, Args...> functor =
-      [&](std::size_t qpuId, std::size_t counter, std::size_t N,
+      [&, stack = options.stack_measurements](
+          std::size_t qpuId, std::size_t counter, std::size_t N,
           Args &...singleIterParameters) -> sample_result {
     auto kernelName = cudaq::getKernelName(kernel);
-    auto ret = details::runSampling(
-                   [&kernel, &singleIterParameters...]() mutable {
-                     kernel(std::forward<Args>(singleIterParameters)...);
-                   },
-                   platform, kernelName, shots, qpuId, nullptr, counter, N)
-                   .value();
+    auto ret =
+        details::runSampling(
+            [&kernel, &singleIterParameters...]() mutable {
+              kernel(std::forward<Args>(singleIterParameters)...);
+            },
+            platform, kernelName, shots, stack, qpuId, nullptr, counter, N)
+            .value();
     return ret;
   };
 
@@ -620,7 +630,8 @@ sample_n(QuantumKernel &&kernel, ArgumentSet<Args...> &&params) {
                    [&kernel, &singleIterParameters...]() mutable {
                      kernel(std::forward<Args>(singleIterParameters)...);
                    },
-                   platform, kernelName, shots, qpuId, nullptr, counter, N)
+                   platform, kernelName, shots, /*stackMeasurements=*/false,
+                   qpuId, nullptr, counter, N)
                    .value();
     return ret;
   };
@@ -663,7 +674,8 @@ sample_n(std::size_t shots, QuantumKernel &&kernel,
                    [&kernel, &singleIterParameters...]() mutable {
                      kernel(std::forward<Args>(singleIterParameters)...);
                    },
-                   platform, kernelName, shots, qpuId, nullptr, counter, N)
+                   platform, kernelName, shots, /*stackMeasurements=*/false,
+                   qpuId, nullptr, counter, N)
                    .value();
     return ret;
   };

diff --git a/runtime/nvqir/CircuitSimulator.h b/runtime/nvqir/CircuitSimulator.h
@@ -496,6 +496,11 @@ class CircuitSimulatorBase : public CircuitSimulator {
       // Add the qubit to the sampling list
       sampleQubits.push_back(qubitIdx);
 
+      // If we're stacking measurements (an optimized sampling mode), then don't
+      // populate registerNameToMeasuredQubit.
+      if (executionContext->stackMeasurements)
+        return true;
+
       auto processForRegName = [&](const std::string &regStr) {
         // Insert the sample qubit into the register name map
         auto iter = registerNameToMeasuredQubit.find(regStr);
@@ -646,13 +651,23 @@ class CircuitSimulatorBase : public CircuitSimulator {
     if (sampleQubits.empty())
       return;
 
+    // FIXME - make this a base class attribute that subclasses can override?
+    bool requiresUniqueQubitsInSample = name() != "stim";
+    if (executionContext->stackMeasurements && !requiresUniqueQubitsInSample &&
+        !force)
+      return;
+
     if (executionContext->hasConditionalsOnMeasureResults && !force)
       return;
 
-    // Sort the qubit indices
-    std::sort(sampleQubits.begin(), sampleQubits.end());
-    auto last = std::unique(sampleQubits.begin(), sampleQubits.end());
-    sampleQubits.erase(last, sampleQubits.end());
+    // Sort the qubit indices (unless we're in the optimized sampling mode that
+    // simply stacks sequential measurements)
+    auto origSampleQubits = sampleQubits;
+    if (requiresUniqueQubitsInSample || !executionContext->stackMeasurements) {
+      std::sort(sampleQubits.begin(), sampleQubits.end());
+      auto last = std::unique(sampleQubits.begin(), sampleQubits.end());
+      sampleQubits.erase(last, sampleQubits.end());
+    }
 
     cudaq::info("Sampling the current state, with measure qubits = {}",
                 sampleQubits);
@@ -663,8 +678,26 @@ class CircuitSimulatorBase : public CircuitSimulator {
                                  ? 1
                                  : executionContext->shots);
 
+    if (requiresUniqueQubitsInSample && executionContext->stackMeasurements) {
+      // This is the case where there may be duplicates in origSampleQubits.
+      // We need to update the execResult by duplicating measurements. This is
+      // not particularly efficient, but it is only used in rare use cases.
+      cudaq::ExecutionResult newExecResult;
+      newExecResult.sequentialData.resize(executionContext->shots);
+      for (auto b : origSampleQubits) {
+        auto iter = std::find(sampleQubits.begin(), sampleQubits.end(), b);
+        auto idx = std::distance(sampleQubits.begin(), iter);
+        for (std::size_t s = 0; s < executionContext->shots; s++)
+          newExecResult.sequentialData[s] += execResult.sequentialData[s][idx];
+      }
+      for (auto &bitstring : newExecResult.sequentialData)
+        newExecResult.counts[bitstring]++;
+      execResult = std::move(newExecResult);
+    }
+
     if (registerNameToMeasuredQubit.empty()) {
-      executionContext->result.append(execResult);
+      executionContext->result.append(execResult,
+                                      executionContext->stackMeasurements);
     } else {
 
       for (auto &[regName, qubits] : registerNameToMeasuredQubit) {
@@ -1046,7 +1079,7 @@ class CircuitSimulatorBase : public CircuitSimulator {
     // If we are sampling...
     if (execContextName.find("sample") != std::string::npos) {
       // Sample the state over the specified number of shots
-      if (sampleQubits.empty()) {
+      if (sampleQubits.empty() && !executionContext->stackMeasurements) {
         if (isInBatchMode())
           sampleQubits.resize(batchModeCurrentNumQubits);
         else

diff --git a/runtime/nvqir/cutensornet/simulator_cutensornet.cpp b/runtime/nvqir/cutensornet/simulator_cutensornet.cpp
@@ -242,6 +242,7 @@ SimulatorTensorNetBase::sample(const std::vector<std::size_t> &measuredBits,
   const auto samples = m_state->sample(measuredBitIds, shots);
   cudaq::ExecutionResult counts(samples);
   double expVal = 0.0;
+  std::size_t sum_counts = 0;
   // Compute the expectation value from the counts
   for (auto &kv : counts.counts) {
     auto par = cudaq::sample_result::has_even_parity(kv.first);
@@ -250,9 +251,15 @@ SimulatorTensorNetBase::sample(const std::vector<std::size_t> &measuredBits,
       p = -p;
     }
     expVal += p;
+    sum_counts += kv.second;
   }
 
   counts.expectationValue = expVal;
+  counts.sequentialData.resize(sum_counts);
+  std::size_t s = 0;
+  for (auto &kv : counts.counts)
+    for (std::size_t c = 0; c < kv.second; c++)
+      counts.sequentialData[s++] = kv.first;
 
   return counts;
 }

diff --git a/runtime/nvqir/stim/StimCircuitSimulator.cpp b/runtime/nvqir/stim/StimCircuitSimulator.cpp
@@ -244,7 +244,9 @@ class StimCircuitSimulator : public nvqir::CircuitSimulatorBase<double> {
   /// @param index 0-based index of qubit to reset
   void resetQubit(const std::size_t index) override {
     flushGateQueue();
-    flushAnySamplingTasks();
+    if (getExecutionContext() && getExecutionContext()->name == "sample" &&
+        !getExecutionContext()->stackMeasurements)
+      flushAnySamplingTasks();
     applyOpToSims(
         "R", std::vector<std::uint32_t>{static_cast<std::uint32_t>(index)});
   }
@@ -284,6 +286,7 @@ class StimCircuitSimulator : public nvqir::CircuitSimulatorBase<double> {
     assert(bits_per_sample >= qubits.size());
     std::size_t first_bit_to_save = bits_per_sample - qubits.size();
     CountsDictionary counts;
+    sequentialData.reserve(shots);
     for (std::size_t shot = 0; shot < shots; shot++) {
       std::string aShot(qubits.size(), '0');
       for (std::size_t b = first_bit_to_save; b < bits_per_sample; b++)

diff --git a/unittests/Optimizer/QuakeSynthTester.cpp b/unittests/Optimizer/QuakeSynthTester.cpp
@@ -97,7 +97,7 @@ cudaq::sample_result sampleJitCode(ExecutionEngine *jit,
                                             kernelName);
                ASSERT_TRUE(!err);
              },
-             p, kernelName, 1000)
+             p, kernelName, 1000, /*stackMeasurements=*/false)
       .value();
 }
 

diff --git a/unittests/integration/builder_tester.cpp b/unittests/integration/builder_tester.cpp
@@ -1493,3 +1493,38 @@ CUDAQ_TEST(BuilderTester, checkMidCircuitMeasureWithReset) {
       match = false;
   EXPECT_EQ(match, false);
 }
+
+CUDAQ_TEST(BuilderTester, checkStackedMeasurements) {
+  int n_qubits = 4;
+  int n_rounds = 10;
+  auto stacked_kernel = cudaq::make_kernel();
+  auto q = stacked_kernel.qalloc(n_qubits);
+  for (int round = 0; round < n_rounds; round++)
+    for (int i = 0; i < n_qubits; i++)
+      stacked_kernel.mz(q[i]);
+
+  cudaq::sample_options options{.stack_measurements = true};
+  auto counts = cudaq::sample(options, stacked_kernel);
+  auto seq = counts.sequential_data();
+  EXPECT_EQ(seq.size(), 1000);
+  EXPECT_EQ(seq[0].size(), n_qubits * n_rounds);
+}
+
+CUDAQ_TEST(BuilderTester, checkStackedMeasurements2) {
+  int n_qubits = 4;
+  int n_rounds = 10;
+  auto stacked_kernel = cudaq::make_kernel();
+  auto q = stacked_kernel.qalloc(n_qubits);
+  for (int round = 0; round < n_rounds; round++) {
+    for (int i = 0; i < n_qubits; i++)
+      stacked_kernel.mz(q[i]);
+    for (int i = 0; i < n_qubits; i++)
+      stacked_kernel.reset(q[i]);
+  }
+
+  cudaq::sample_options options{.stack_measurements = true};
+  auto counts = cudaq::sample(options, stacked_kernel);
+  auto seq = counts.sequential_data();
+  EXPECT_EQ(seq.size(), 1000);
+  EXPECT_EQ(seq[0].size(), n_qubits * n_rounds);
+}