From ecea8d9cf25728752eb3da5e983fe5c87813e4d9 Mon Sep 17 00:00:00 2001 From: ttrenty <154608953+ttrenty@users.noreply.github.com> Date: Sat, 28 Jun 2025 00:17:04 -0600 Subject: [PATCH 1/7] feat: add tests and benchmarks + begin gpu support --- Makefile | 7 + README.md | 34 +- TODOs.md | 28 +- benchmarks/all_benchmarks.mojo | 7 + benchmarks/bench_qubit_wise_multiply.mojo | 178 ++ examples/gpu_examples.mojo | 288 +++ examples/main.mojo | 184 +- pixi.lock | 2503 +++++++++++++++++---- pixi.toml | 61 +- src/abstractions/simulator.mojo | 24 +- src/base/__init__.mojo | 3 +- src/base/gate.mojo | 11 + src/base/gpu/__init__.mojo | 3 + src/base/gpu/qubits_operations.mojo | 124 + src/base/qubits_operations.mojo | 153 +- src/base/state_and_matrix.mojo | 124 +- tests/base/test_qubit_operations.mojo | 182 +- tests/base/testing_matrix.mojo | 46 +- tests/base/testing_state_vector.mojo | 52 + 19 files changed, 3311 insertions(+), 701 deletions(-) create mode 100644 benchmarks/all_benchmarks.mojo create mode 100644 benchmarks/bench_qubit_wise_multiply.mojo create mode 100644 examples/gpu_examples.mojo create mode 100644 src/base/gpu/__init__.mojo create mode 100644 src/base/gpu/qubits_operations.mojo create mode 100644 tests/base/testing_state_vector.mojo diff --git a/Makefile b/Makefile index 32b6834..6c458ee 100644 --- a/Makefile +++ b/Makefile @@ -1,4 +1,11 @@ # Makefile that forward every target to `pixi run` + +.PHONY: all +all: + # @pixi run test + @pixi run main + @pixi run bench + .PHONY: % %: @pixi run $@ $(ARGS) \ No newline at end of file diff --git a/README.md b/README.md index 107fcf9..d43cefd 100644 --- a/README.md +++ b/README.md @@ -2,43 +2,49 @@ **A Quantum Circuit Composer & Simulator in Mojo** 🔥⚛️ + +## Education + This project reimplements and extends the ideas from the following tutorial paper: > **How to Write a Simulator for Quantum Circuits from Scratch: A Tutorial** > *Michael J. 
McGuffin, Jean-Marc Robert, and Kazuki Ikeda* > Published: 2025-06-09 on [arXiv:2506.08142v1](https://arxiv.org/abs/2506.08142v1) (last accessed: 2025-06-12) - -## 🎯 Project Objectives +### 🎯 Project Objectives * **Mojo Implementation:** Re-implement the approach from the paper in Mojo for more Pythonic synthax and better readability. * **Learning by Doing:** Gain hands-on experience with quantum circuit simulation to better understand the capabilities and limitations of classical simulation. * **Performance & Safety:** Leverage Mojo's strong static typing and compilation for blazing-fast and safe operations. * **Hardware Acceleration:** Utilize Mojo’s universal GPU programming support to accelerate simulations. +### 🔥 Current Implementation -## ⚙️ Environment Setup +The current implementation uses a State Vector approach, which is an efficient method for simulating small-scale quantum circuits (20–30 qubits) with high precision. This approach also enables relatively straightforward exact gradient computations. -Follow these steps to set up your environment and build the binary: +An alternative implementation for the futur could be using the Tensor Network approach. This method is more suitable for larger circuits but offers lower precision and would involves more computationally expensive gradient calculations. 
+## Usage + +### ⚙️ Environment Setup + +Follow these steps to set up your environment, build the library and run some examples: + +If you don't have Pixi installed yet: ```bash -# If you don't have Pixi installed yet: curl -sSf https://pixi.sh/install.sh | bash - -# Install all project dependencies: +``` +Install all project dependencies: +``` pixi install +``` -# Build and run examples of the simulator: +Build and run examples of the simulator: +```bash pixi run main ``` -## 🔥 Current Implementation - -The current implementation uses a State Vector approach, which is an efficient method for simulating small-scale quantum circuits (20–30 qubits) with high precision. This approach also enables relatively straightforward exact gradient computations. - -An alternative implementation for the futur could be using the Tensor Network approach. This method is more suitable for larger circuits but offers lower precision and would involves more computationally expensive gradient calculations. - ## 📄 License This project is open-source and licensed under Apache License 2.0. diff --git a/TODOs.md b/TODOs.md index af6d789..156e503 100644 --- a/TODOs.md +++ b/TODOs.md @@ -4,16 +4,8 @@ ### Implementations -- 4 / 3 : Implement measurement gates - -- 4 / 5 : Implement the computation of statistics (6.5 and 6.6) - -- 3 / 3 : Implement naive implementation of the functions to compare performances - - matrix multiplication (but starting from right or smart) - - partial trace - - 5 / 5 : Start adding support for GPU in the base classes if needed (not possible to use SIMD(complexfloat64) anymore, or keep them but seperate them when moving data to GPU) - - struct PureBasisState + - struct StateVector - struct ComplexMatrix - struct Gate @@ -21,7 +13,13 @@ - qubit_wise_multiply() - apply_swap() - partial_trace() + - StateVector.to_density_matrix() +- 2 / 4 : Efficient support for tracking a state statistic like entropy during the execution of the circuit by the simulator. 
+ +- 3 / 3 : Implement naive implementation of the functions to compare performances + - matrix multiplication (but starting from right or smart) + - partial trace ### Tests @@ -30,8 +28,7 @@ - 5 / 2 : Test for everything that will be implement in GPU - qubit_wise_multiply() - apply_swap() - - partial_trace() - - struct PureBasisState's methods + - struct StateVector's methods - struct ComplexMatrix's methods - struct Gate's Gate @@ -41,14 +38,23 @@ ## Droped for now +- 4 / 3 : Implement end of circuit measurement gates with some of those options: + - https://docs.pennylane.ai/en/stable/introduction/measurements.html + - 4 / 4 : Gradient computation with Finite Difference - 3 / 2 : Use a separate list for things that are not real gate to not slow down the main run logic +- 3 / 3 : Setup automatic Doc generation with pixi but also on github.io repository page + - 3 / 4 : Compile time circuit creation? - 3 / 4 : Gradient computation with Parameter-Shift +- 3 / 4 : Implement mid circuit measurement gates (Section 7 of paper) + - 3 / 100 : Gradient computation with Adjoint Method - 2 / 4 : qubit_wise_multiply_extended() but for gates applied to non-adjacent qubits + +- 2 / 3 : Implement concurence (2-qubits entanglement metric) computePairwiseQubitConcurrences() diff --git a/benchmarks/all_benchmarks.mojo b/benchmarks/all_benchmarks.mojo new file mode 100644 index 0000000..bb3fc0e --- /dev/null +++ b/benchmarks/all_benchmarks.mojo @@ -0,0 +1,7 @@ +from bench_qubit_wise_multiply import bench_main + + +def main(): + print("Running all benchmarks...") + bench_main() + print("All benchmarks completed.") diff --git a/benchmarks/bench_qubit_wise_multiply.mojo b/benchmarks/bench_qubit_wise_multiply.mojo new file mode 100644 index 0000000..56e9b7d --- /dev/null +++ b/benchmarks/bench_qubit_wise_multiply.mojo @@ -0,0 +1,178 @@ +from gpu.host import DeviceContext + +from benchmark import ( + Bench, + BenchConfig, + Bencher, + BenchId, +) + +from pathlib import Path + +import 
random + +from qlabs.base import ( + StateVector, + ComplexMatrix, + Gate, + Hadamard, + PauliX, + PauliY, + PauliZ, + NOT, + H, + X, + Y, + Z, + SWAP, + iSWAP, + qubit_wise_multiply, + qubit_wise_multiply_extended, + apply_swap, + partial_trace, +) + +from qlabs.abstractions import ( + GateCircuit, + StateVectorSimulator, + ShowAfterEachGate, + ShowAfterEachLayer, + ShowOnlyEnd, +) + + +fn simulate_random_circuit[num_qubits: Int, number_layers: Int]() -> None: + """Simulates a random quantum circuit with the specified number of qubits and layers. + + Parameters: + num_qubits: The number of qubits in the circuit. + number_layers: The number of layers in the circuit. + """ + + qc: GateCircuit = GateCircuit(num_qubits) + + gates_list: List[Gate] = [Hadamard, PauliX, PauliY, PauliZ] + + # index: UnsafePointer[Int8] = UnsafePointer[Int8].alloc(2*num_qubits) + # print("Creating random circuit...") + # random.seed() # Seed on current time + # for _ in range(400): + # random.randint(index, 2*num_qubits, 0, len(gates_list) - 1) + # for i in range(num_qubits): + # qc = qc.apply(gates_list[Int(index[i])], i) + # qc = qc.barrier() + # for i in range(num_qubits - 1): + # qc = qc.apply( + # gates_list[Int(index[num_qubits + i])], + # i, + # controls=[(i + 1) % num_qubits], + # is_anti_control=[False], + # ) + # qc = qc.barrier() + + index: UnsafePointer[Int8] = UnsafePointer[Int8].alloc( + number_layers * 2 * num_qubits + ) + random.seed() # Seed on current time + random.randint( + index, number_layers * 2 * num_qubits, 0, len(gates_list) - 1 + ) + + for iter in range(number_layers): + for i in range(num_qubits): + qc.apply(gates_list[Int(index[iter * num_qubits + i])](i)) + qc.barrier() + for i in range(num_qubits - 1): + qc.apply( + gates_list[Int(index[iter * num_qubits + num_qubits + i])]( + i, controls=[(i + 1) % num_qubits] + ), + ) + qc.barrier() + + initial_state_bitstring: String = ( + "0" * num_qubits + ) # Initial state |000...0⟩ + initial_state: StateVector = 
StateVector.from_bitstring( + initial_state_bitstring + ) + + qsimu = StateVectorSimulator( + qc, + initial_state=initial_state, + optimisation_level=0, # No optimisations for now + verbose=False, + # verbose_step_size=ShowAfterEachLayer, # ShowAfterEachGate, ShowOnlyEnd + verbose_step_size=ShowAfterEachGate, # ShowAfterEachGate, ShowOnlyEnd + # stop_at=ShowAfterEachGate, # ShowAfterEachGate, ShowOnlyEnd # TODO implement that instead of having access to manual methods + ) + + for _ in range(100): + _ = qsimu.run() + + +@parameter +@always_inline +fn benchmark_elementwise_parameterized[ + num_qubits: Int, number_layers: Int +](mut b: Bencher) raises: + @parameter + @always_inline + fn elementwise_workflow(ctx: DeviceContext) raises: + simulate_random_circuit[num_qubits, number_layers]() + + bench_ctx = DeviceContext() + b.iter_custom[elementwise_workflow](bench_ctx) + + +def bench_main[ + max_number_qubits: Int = 10, + max_number_layers: Int = 20, + fixed_number_qubits: Int = 5, + fixed_number_layers: Int = 10, +](): + print("Running qubit_wise_multiply() CPU Benchmarks...") + # print("SIMD width:", SIMD_WIDTH) + print("-" * 80) + bench_config = BenchConfig(max_iters=10, min_warmuptime_secs=0.2) + bench = Bench(bench_config) + + @parameter + for number_qubits in range(1, max_number_qubits + 1): + bench.bench_function[ + benchmark_elementwise_parameterized[ + number_qubits, fixed_number_layers + ] + ]( + BenchId( + "simulate_random_circuit_" + + String(number_qubits) + + "q_" + + String(fixed_number_layers) + + "l" + ) + ) + + @parameter + for number_layers in range(1, max_number_layers + 1): + bench.bench_function[ + benchmark_elementwise_parameterized[ + fixed_number_qubits, number_layers + ] + ]( + BenchId( + "simulate_random_circuit_" + + String(fixed_number_qubits) + + "q_" + + String(number_layers) + + "l" + ) + ) + + print(bench) + + bench.config.out_file = Path("out.csv") + bench.dump_report() + + print("qubit_wise_multiply() CPU Benchmarks completed!") + 
print("-" * 80) diff --git a/examples/gpu_examples.mojo b/examples/gpu_examples.mojo new file mode 100644 index 0000000..05609e4 --- /dev/null +++ b/examples/gpu_examples.mojo @@ -0,0 +1,288 @@ +# from qlabs.base.gpu import ( +# qubit_wise_multiply_gpu, +# ) + +from bit import count_trailing_zeros + +from sys import has_accelerator + +from gpu import thread_idx, block_dim, block_idx +from gpu.host import DeviceContext +from layout import Layout, LayoutTensor + +alias SIZE = 2 +alias BLOCKS_PER_GRID = 1 +alias THREADS_PER_BLOCK = (1, 1) +alias dtype = DType.float32 +alias layout = Layout.row_major(SIZE, SIZE) + +alias GATE_SIZE = 2 +alias STATE_VECTOR_SIZE = 8 +alias NUMBER_CONTROL_BITS = 1 + +alias gate_1qubit_layout = Layout.row_major(GATE_SIZE, GATE_SIZE) +alias state_vector_3qubits_layout = Layout.row_major(STATE_VECTOR_SIZE, 1) +alias control_bits_layout = Layout.row_major(NUMBER_CONTROL_BITS, 2) + + +fn qubit_wise_multiply_gpu( + gate_re: LayoutTensor[mut=True, dtype, gate_1qubit_layout], + gate_im: LayoutTensor[mut=True, dtype, gate_1qubit_layout], + gate_size: Int, + target_qubit: Int, + quantum_state_re: LayoutTensor[ + mut=True, dtype, state_vector_3qubits_layout + ], + quantum_state_im: LayoutTensor[ + mut=True, dtype, state_vector_3qubits_layout + ], + number_qubits: Int, + quantum_state_size: Int, + quantum_state_out_re: LayoutTensor[ + mut=True, dtype, state_vector_3qubits_layout + ], + quantum_state_out_im: LayoutTensor[ + mut=True, dtype, state_vector_3qubits_layout + ], + control_bits: LayoutTensor[mut=True, DType.int32, control_bits_layout], + # control_bits: LayoutTensor[mut=True, DType.int32, control_bits_layout], + number_control_bits: Int, +) -> None: + """Applies a quantum gate to specific qubits in the quantum state. + + It will apply the gate starting from the target qubit assuming that the other + qubits that the gate acts on are following the target qubit. + + Args: + gate_re: Real part of the gate matrix. 
+ gate_im: Imaginary part of the gate matrix. + gate_size: Size of the gate (2^n, where n is the number of qubits the gate acts on). + target_qubit: The index of the target qubit to apply the gate to. + quantum_state_re: Real part of the quantum state vector. + quantum_state_im: Imaginary part of the quantum state vector. + number_qubits: Total number of qubits in the quantum state. + quantum_state_size: Size of the quantum state vector (2^number_qubits). + quantum_state_out_re: Output real part of the quantum state vector after applying the gate. + quantum_state_out_im: Output imaginary part of the quantum state vector after applying the gate. + control_bits: List of control bits, where each control bit is a list containing + [wire_index, flag] (1 for control, 0 for anti-control). + """ + print("Inside qubit_wise_multiply_gpu") + target_qubits_count: Int = count_trailing_zeros(gate_size) + if (target_qubit < 0) or (target_qubit >= number_qubits): + print( + "Error: target_qubit index out of bounds. 
Must be between 0 and", + number_qubits - 1, + ) + print("Skipping gate application.") + return + + print("AAAAA") + inclusion_mask: Int = 0 + desired_value_mask: Int = 0 + for i in range(number_control_bits): + print("before") + wire_index, flag = control_bits[i, 0], control_bits[i, 1] + print("after") + bit: Int = 1 << Int( + wire_index + ) # efficient way of computing 2^wire_index + inclusion_mask |= bit # turn on the bit + if flag == 1: + desired_value_mask |= bit # turn on the bit + + print("BBBBB") + size_of_state_vector: Int = quantum_state_size + size_of_half_block: Int = 1 << target_qubit # 2^target_qubit + size_of_block: Int = size_of_half_block << target_qubits_count + + print("CCCC") + # copies all amplitudes from quantum_state to quantum_state_out + for i in range(size_of_state_vector): + quantum_state_out_re[i, 0] = quantum_state_re[i, 0] + quantum_state_out_im[i, 0] = quantum_state_im[i, 0] + + print("before loop") + for block_start in range(0, size_of_state_vector, size_of_block): + # print("block_start:", block_start) + for offset in range(size_of_half_block): + # print("offset:", offset) + i1: Int = ( + block_start | offset + ) # faster than, but equivalent to, block_start + offset + + if (i1 & inclusion_mask) != desired_value_mask: + continue # skip this iteration if the control bits do not match + + i2: Int = ( + i1 | size_of_half_block + ) # equivalent to i1 + size_of_half_block + + # new_state_vector[i1] = ( + # gate[0, 0] * quantum_state[i1] + gate[0, 1] * quantum_state[i2] + # ) + + print("i1:", i1, "i2:", i2) + + quantum_state_out_re[i1] = ( + (gate_re[0, 0] * quantum_state_re[i1, 0]) + - (gate_im[0, 0] * quantum_state_im[i1, 0]) + + (gate_re[0, 1] * quantum_state_re[i2, 0]) + - (gate_im[0, 1] * quantum_state_im[i2, 0]) + ) + + quantum_state_out_im[i1] = ( + (gate_re[0, 0] * quantum_state_im[i1, 0]) + - (gate_im[0, 0] * quantum_state_re[i1, 0]) + + (gate_re[0, 1] * quantum_state_im[i2, 0]) + - (gate_im[0, 1] * quantum_state_re[i2, 0]) + ) + 
+ quantum_state_out_re[i2] = ( + (gate_re[1, 0] * quantum_state_re[i1, 0]) + - (gate_im[1, 0] * quantum_state_im[i1, 0]) + + (gate_re[1, 1] * quantum_state_re[i2, 0]) + - (gate_im[1, 1] * quantum_state_im[i2, 0]) + ) + + quantum_state_out_im[i2] = ( + (gate_re[1, 0] * quantum_state_im[i1, 0]) + - (gate_im[1, 0] * quantum_state_re[i1, 0]) + + (gate_re[1, 1] * quantum_state_im[i2, 0]) + - (gate_im[1, 1] * quantum_state_re[i2, 0]) + ) + + +def gpu_debug_something(): + @parameter + if not has_accelerator(): + print("No compatible GPU found") + else: + ctx = DeviceContext() + print("Found GPU:", ctx.name()) + + gate_re = ctx.enqueue_create_buffer[dtype]( + GATE_SIZE * GATE_SIZE + ).enqueue_fill(0) + gate_im = ctx.enqueue_create_buffer[dtype]( + GATE_SIZE * GATE_SIZE + ).enqueue_fill(0) + quantum_state_re = ctx.enqueue_create_buffer[dtype]( + STATE_VECTOR_SIZE + ).enqueue_fill(0) + quantum_state_im = ctx.enqueue_create_buffer[dtype]( + STATE_VECTOR_SIZE + ).enqueue_fill(0) + quantum_state_out_re = ctx.enqueue_create_buffer[dtype]( + STATE_VECTOR_SIZE + ).enqueue_fill(0) + quantum_state_out_im = ctx.enqueue_create_buffer[dtype]( + STATE_VECTOR_SIZE + ).enqueue_fill(0) + + control_bits = ctx.enqueue_create_buffer[DType.int32]( + NUMBER_CONTROL_BITS * 2 + ).enqueue_fill(0) + + # gate_re = ctx.enqueue_create_host_buffer[dtype]( + # GATE_SIZE * GATE_SIZE + # ).enqueue_fill(0) + # gate_im = ctx.enqueue_create_host_buffer[dtype]( + # GATE_SIZE * GATE_SIZE + # ).enqueue_fill(0) + # quantum_state_re = ctx.enqueue_create_host_buffer[dtype]( + # STATE_VECTOR_SIZE + # ).enqueue_fill(0) + # quantum_state_im = ctx.enqueue_create_host_buffer[dtype]( + # STATE_VECTOR_SIZE + # ).enqueue_fill(0) + # quantum_state_out_re = ctx.enqueue_create_host_buffer[dtype]( + # STATE_VECTOR_SIZE + # ).enqueue_fill(0) + # quantum_state_out_im = ctx.enqueue_create_host_buffer[dtype]( + # STATE_VECTOR_SIZE + # ).enqueue_fill(0) + + gate_re_tensor = LayoutTensor[mut=True, dtype, gate_1qubit_layout]( + 
gate_re.unsafe_ptr() + ) + gate_im_tensor = LayoutTensor[mut=True, dtype, gate_1qubit_layout]( + gate_im.unsafe_ptr() + ) + quantum_state_re_tensor = LayoutTensor[ + mut=True, dtype, state_vector_3qubits_layout + ](quantum_state_re.unsafe_ptr()) + quantum_state_im_tensor = LayoutTensor[ + mut=True, dtype, state_vector_3qubits_layout + ](quantum_state_im.unsafe_ptr()) + quantum_state_out_re_tensor = LayoutTensor[ + mut=True, dtype, state_vector_3qubits_layout + ](quantum_state_out_re.unsafe_ptr()) + quantum_state_out_im_tensor = LayoutTensor[ + mut=True, dtype, state_vector_3qubits_layout + ](quantum_state_out_im.unsafe_ptr()) + control_bits_tensor = LayoutTensor[ + mut=True, DType.int32, control_bits_layout + ](control_bits.unsafe_ptr()) + + matrix = ctx.enqueue_create_buffer[dtype](SIZE * SIZE).enqueue_fill(0) + + # with matrix.map_to_host() as host_matrix: + # print(host_matrix) + + # cannot replace layout by runtime_layout here + matrix_tensor = LayoutTensor[mut=True, dtype, layout]( + matrix.unsafe_ptr() + ) + + ctx.synchronize() + + print("Before") + + # qubit_wise_multiply_gpu( + # gate_re_tensor, + # gate_im_tensor, + # GATE_SIZE, + # 0, # target_qubit + # quantum_state_re_tensor, + # quantum_state_im_tensor, + # 3, # number_qubits + # STATE_VECTOR_SIZE, # quantum_state_size + # quantum_state_out_re_tensor, + # quantum_state_out_im_tensor, + # control_bits, + # ) + + ctx.enqueue_function[qubit_wise_multiply_gpu]( + gate_re_tensor, + gate_im_tensor, + GATE_SIZE, + 0, # target_qubit + quantum_state_re_tensor, + quantum_state_im_tensor, + 3, # number_qubits + STATE_VECTOR_SIZE, # quantum_state_size + quantum_state_out_re_tensor, + quantum_state_out_im_tensor, + control_bits_tensor, + NUMBER_CONTROL_BITS, + grid_dim=BLOCKS_PER_GRID, + block_dim=THREADS_PER_BLOCK, + ) + + print("After") + + ctx.synchronize() + + with quantum_state_out_re.map_to_host() as host_re: + print("Output real part:", host_re) + with quantum_state_out_im.map_to_host() as host_im: + 
print("Output imaginary part:", host_im) + with gate_re.map_to_host() as host_gate_re: + print("Gate real part:", host_gate_re) + with gate_im.map_to_host() as host_gate_im: + print("Gate imaginary part:", host_gate_im) + with quantum_state_re.map_to_host() as host_quantum_state_re: + print("Quantum state real part:", host_quantum_state_re) + with quantum_state_im.map_to_host() as host_quantum_state_im: + print("Quantum state imaginary part:", host_quantum_state_im) diff --git a/examples/main.mojo b/examples/main.mojo index 139afbd..2c73897 100644 --- a/examples/main.mojo +++ b/examples/main.mojo @@ -4,14 +4,13 @@ from sys import argv import random -from collections.linked_list import LinkedList # from complex import ComplexFloat64 from qlabs.local_stdlib.complex import ComplexFloat64 from qlabs.local_stdlib import CustomList from qlabs.base import ( - PureBasisState, + StateVector, ComplexMatrix, Gate, Hadamard, @@ -39,6 +38,8 @@ from qlabs.abstractions import ( ShowOnlyEnd, ) +from gpu_examples import gpu_debug_something + # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # # MARK: Examples # # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # @@ -57,7 +58,7 @@ fn simulate_figure1_circuit() -> None: """ ) # Initialize the quantum circuit to the |000⟩ state - quantum_state: PureBasisState = PureBasisState.from_bitstring("000") + quantum_state: StateVector = StateVector.from_bitstring("000") print("Initial quantum state:\n", quantum_state) @@ -129,7 +130,7 @@ fn simulate_figure1_circuit_abstract() -> None: ) # Create the initial state |000⟩ - initial_state: PureBasisState = PureBasisState.from_bitstring("000") + initial_state: StateVector = StateVector.from_bitstring("000") qsimu = StateVectorSimulator( qc, @@ -208,7 +209,7 @@ fn simulate_random_circuit(num_qubits: Int, number_layers: Int) -> None: initial_state_bitstring: String = ( "0" * num_qubits ) # Initial state |000...0⟩ - initial_state: PureBasisState = PureBasisState.from_bitstring( + initial_state: StateVector = 
StateVector.from_bitstring( initial_state_bitstring ) @@ -245,7 +246,7 @@ fn simulate_figure4_circuit() -> None: num_qubits: Int = 3 # Initialize the quantum circuit to the |000⟩ state - quantum_state: PureBasisState = PureBasisState.from_bitstring("000") + quantum_state: StateVector = StateVector.from_bitstring("000") print("Intial quantum state:\n", quantum_state) @@ -337,7 +338,7 @@ fn simulate_figure4_circuit_abstract() -> None: qsimu = StateVectorSimulator( qc, - initial_state=PureBasisState.from_bitstring("000"), + initial_state=StateVector.from_bitstring("000"), optimisation_level=0, # No optimisations for now verbose=True, verbose_step_size=ShowAfterEachGate, # ShowAfterEachGate, ShowOnlyEnd @@ -371,7 +372,7 @@ fn presentation() -> None: qsimu = StateVectorSimulator( qc, - initial_state=PureBasisState.from_bitstring("000"), + initial_state=StateVector.from_bitstring("000"), optimisation_level=0, # No optimisations for now verbose=True, verbose_step_size=ShowAfterEachGate, # ShowAfterEachGate, ShowOnlyEnd @@ -399,7 +400,7 @@ fn test_density_matrix() -> None: qsimu = StateVectorSimulator( qc, - initial_state=PureBasisState.from_bitstring("00"), + initial_state=StateVector.from_bitstring("00"), optimisation_level=0, # No optimisations for now verbose=True, verbose_step_size=ShowAfterEachGate, # ShowAfterEachGate, ShowOnlyEnd @@ -421,44 +422,129 @@ fn test_density_matrix() -> None: print("Partial trace matrix qubit 1:\n", other_matrix_1) +fn try_get_purity() -> None: + """ + Returns the density matrix of the given quantum state. + If qubits is empty, returns the full density matrix. 
+ """ + num_qubits: Int = 2 + qc: GateCircuit = GateCircuit(num_qubits) + + qc.apply_gates( + Hadamard(0), + Hadamard(1, controls=[0]), + Z(0), + X(1), + ) + + print("Quantum circuit created:\n", qc) + + qsimu = StateVectorSimulator( + qc, + initial_state=StateVector.from_bitstring("00"), + optimisation_level=0, # No optimisations for now + verbose=True, + verbose_step_size=ShowAfterEachGate, # ShowAfterEachGate, ShowOnlyEnd + ) + final_state = qsimu.run() + print("Final quantum state:\n", final_state) + + purity = final_state.purity() + print("Purity of the quantum state:", purity) + + purity0 = final_state.purity([0, 1]) + print("Purity of the quantum state:", purity0) + + purity1 = final_state.purity([0]) + print("Purity of qubit 0:", purity1) + + # for QOL + # list_purity = final_state.purity(0, 1) + # print("Purity of qubit 0:", list_purity[0]) + # print("Purity of qubit 1:", list_purity[1]) + + normalised_purity = final_state.normalised_purity() + print("Normalised purity of the quantum state:", normalised_purity) + + +fn try_measument() -> None: + """ + Returns the density matrix of the given quantum state. + If qubits is empty, returns the full density matrix. 
+ """ + num_qubits: Int = 2 + qc: GateCircuit = GateCircuit(num_qubits) + + qc.apply_gates( + Hadamard(0), + Hadamard(1, controls=[0]), + Z(0), + X(1), + ) + + print("Quantum circuit created:\n", qc) + + qsimu = StateVectorSimulator( + qc, + initial_state=StateVector.from_bitstring("00"), + optimisation_level=0, # No optimisations for now + verbose=True, + verbose_step_size=ShowAfterEachGate, # ShowAfterEachGate, ShowOnlyEnd + ) + final_state = qsimu.run() + print("Final quantum state:\n", final_state) + + purity = final_state.purity() + print("Purity of the quantum state:", purity) + + purity0 = final_state.purity([0, 1]) + print("Purity of the quantum state:", purity0) + + purity1 = final_state.purity([0]) + print("Purity of qubit 0:", purity1) + + # for QOL + # list_purity = final_state.purity(0, 1) + # print("Purity of qubit 0:", list_purity[0]) + # print("Purity of qubit 1:", list_purity[1]) + + normalised_purity = final_state.normalised_purity() + print("Normalised purity of the quantum state:", normalised_purity) + + # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # -# MARK: Tests # +# MARK: Debug # # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # -# fn test_all() -> None: -# """ -# Runs all tests and examples. 
-# """ -# qc = qc.apply_layer([ -# Hadamard([1]), -# NOT([2], controls=[1], anti_controls=[]) -# ]) -# qc = qc.apply_layer([ -# NOT([0], controls=[1], anti_controls=[]) -# ]) -# qc = qc.apply_layer([ -# PauliZ([0]), -# NOT([2], controls=[1], anti_controls=[]) -# ]) - -# # Create the initial state |000⟩ -# initial_state: PureBasisState = PureBasisState.from_bitstring("000") - -# qsimu = StateVectorSimulator( -# qc, -# initial_state=initial_state, -# optimisation_level=0, # No optimisations for now -# verbose=True, -# # verbose_step_size=ShowAfterEachLayer, # ShowAfterEachGate, ShowOnlyEnd -# verbose_step_size=ShowAfterEachGate, # ShowAfterEachGate, ShowOnlyEnd -# ) - -# while (qsimu.circuit.num_gates() != 0): -# qsimu, state = qsimu.next_gate(state) -# print("New quantum state after gate:\n", state) -# # qsimu, state = qsimu.next_layer(state) -# # or -# # qsimu, state = qsimu.next_block(state) + +# fn qubit_wise_multiply_gpu( +# # gate: ComplexMatrix, +# # Use SIMD instead +# gate_re: LayoutTensor[mut=True, dtype, gate_1qubit_layout], +# gate_im: LayoutTensor[mut=True, dtype, gate_1qubit_layout], +# gate_size: Int, +# target_qubit: Int, +# # owned quantum_state: StateVector, +# quantum_state_re: LayoutTensor[ +# mut=True, dtype, state_vector_3qubits_layout +# ], +# quantum_state_im: LayoutTensor[ +# mut=True, dtype, state_vector_3qubits_layout +# ], +# number_qubits: Int, +# quantum_state_size: Int, +# quantum_state_out_re: LayoutTensor[ +# mut=True, dtype, state_vector_3qubits_layout +# ], +# quantum_state_out_im: LayoutTensor[ +# mut=True, dtype, state_vector_3qubits_layout +# ], +# control_bits: List[List[Int]] = [], +# ) -> None: + + +fn debug_something(): + return # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # @@ -484,9 +570,9 @@ def main(): # simulate_figure1_circuit() - simulate_figure1_circuit_abstract() + # simulate_figure1_circuit_abstract() - simulate_random_circuit(number_qubits, number_layers) + # simulate_random_circuit(number_qubits, number_layers) # 
simulate_figure4_circuit() @@ -495,3 +581,9 @@ def main(): # presentation() # test_density_matrix() + + # try_get_purity() + + # debug_something() + + gpu_debug_something() diff --git a/pixi.lock b/pixi.lock index 15fdf4f..3c27d8b 100644 --- a/pixi.lock +++ b/pixi.lock @@ -1,7 +1,388 @@ version: 6 environments: + cuda: + channels: + - url: https://conda.anaconda.org/nvidia/ + - url: https://conda.modular.com/max-nightly/ + - url: https://conda.anaconda.org/conda-forge/ + packages: + linux-64: + - conda: https://conda.anaconda.org/conda-forge/linux-64/_openmp_mutex-4.5-3_kmp_llvm.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/_python_abi3_support-1.0-hd8ed1ab_2.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/aiofiles-24.1.0-pyhd8ed1ab_1.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/aiohappyeyeballs-2.6.1-pyhd8ed1ab_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/aiohttp-3.12.13-py312h178313f_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/aiosignal-1.3.2-pyhd8ed1ab_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/annotated-types-0.7.0-pyhd8ed1ab_1.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/anyio-4.9.0-pyh29332c3_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/asgiref-3.8.1-pyhd8ed1ab_1.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/attrs-25.3.0-pyh71513ae_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/aws-c-auth-0.9.0-hbfa7f16_15.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/aws-c-cal-0.9.2-h5e3027f_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/aws-c-common-0.12.3-hb9d3cd8_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/aws-c-compression-0.3.1-hafb2847_5.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/aws-c-event-stream-0.5.4-h76f0014_12.conda + - conda: 
https://conda.anaconda.org/conda-forge/linux-64/aws-c-http-0.10.2-h015de20_2.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/aws-c-io-0.20.1-hdfce8c9_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/aws-c-mqtt-0.13.1-h1e5e6c0_3.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/aws-c-s3-0.8.3-h5e174a9_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/aws-c-sdkutils-0.2.4-hafb2847_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/aws-checksums-0.2.7-hafb2847_1.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/aws-crt-cpp-0.32.10-hff780f1_1.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/aws-sdk-cpp-1.11.510-h937e755_11.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/azure-core-cpp-1.14.0-h5cfcd09_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/azure-identity-cpp-1.10.0-h113e628_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/azure-storage-blobs-cpp-12.13.0-h3cf044e_1.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/azure-storage-common-cpp-12.8.0-h736e048_1.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/azure-storage-files-datalake-cpp-12.12.0-ha633028_1.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/backoff-2.2.1-pyhd8ed1ab_1.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/binutils_impl_linux-64-2.43-h4bf12b8_5.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/binutils_linux-64-2.43-h4852527_5.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/brotli-python-1.1.0-py312h2ec8cdc_3.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/bzip2-1.0.8-h4bc722e_7.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/c-ares-1.34.5-hb9d3cd8_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/ca-certificates-2025.6.15-hbd8a1cb_0.conda + - conda: 
https://conda.anaconda.org/conda-forge/noarch/certifi-2025.6.15-pyhd8ed1ab_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/cffi-1.17.1-py312h06ac9bb_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/charset-normalizer-3.4.2-pyhd8ed1ab_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/click-8.2.1-pyh707e725_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/colorama-0.4.6-pyhd8ed1ab_1.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/cpython-3.12.11-py312hd8ed1ab_0.conda + - conda: https://conda.anaconda.org/nvidia/linux-64/cuda-cccl_linux-64-12.9.27-0.conda + - conda: https://conda.anaconda.org/nvidia/linux-64/cuda-command-line-tools-12.9.0-0.conda + - conda: https://conda.anaconda.org/nvidia/linux-64/cuda-compiler-12.9.0-0.conda + - conda: https://conda.anaconda.org/nvidia/noarch/cuda-crt-dev_linux-64-12.9.41-0.conda + - conda: https://conda.anaconda.org/nvidia/linux-64/cuda-crt-tools-12.9.41-0.conda + - conda: https://conda.anaconda.org/nvidia/linux-64/cuda-cudart-12.9.37-0.conda + - conda: https://conda.anaconda.org/nvidia/linux-64/cuda-cudart-dev-12.9.37-0.conda + - conda: https://conda.anaconda.org/nvidia/noarch/cuda-cudart-dev_linux-64-12.9.37-0.conda + - conda: https://conda.anaconda.org/nvidia/linux-64/cuda-cudart-static-12.9.37-0.conda + - conda: https://conda.anaconda.org/nvidia/noarch/cuda-cudart-static_linux-64-12.9.37-0.conda + - conda: https://conda.anaconda.org/nvidia/noarch/cuda-cudart_linux-64-12.9.37-0.conda + - conda: https://conda.anaconda.org/nvidia/linux-64/cuda-cuobjdump-12.9.26-1.conda + - conda: https://conda.anaconda.org/nvidia/linux-64/cuda-cupti-12.9.19-0.conda + - conda: https://conda.anaconda.org/nvidia/linux-64/cuda-cupti-dev-12.9.19-0.conda + - conda: https://conda.anaconda.org/nvidia/linux-64/cuda-cuxxfilt-12.9.19-1.conda + - conda: https://conda.anaconda.org/nvidia/linux-64/cuda-driver-dev-12.9.37-0.conda + - conda: 
https://conda.anaconda.org/nvidia/noarch/cuda-driver-dev_linux-64-12.9.79-0.conda + - conda: https://conda.anaconda.org/nvidia/linux-64/cuda-gdb-12.9.19-1.conda + - conda: https://conda.anaconda.org/nvidia/linux-64/cuda-libraries-12.9.0-0.conda + - conda: https://conda.anaconda.org/nvidia/linux-64/cuda-libraries-dev-12.9.0-0.conda + - conda: https://conda.anaconda.org/nvidia/linux-64/cuda-nsight-12.9.19-0.conda + - conda: https://conda.anaconda.org/nvidia/linux-64/cuda-nvcc-12.9.41-0.conda + - conda: https://conda.anaconda.org/nvidia/noarch/cuda-nvcc-dev_linux-64-12.9.41-0.conda + - conda: https://conda.anaconda.org/nvidia/linux-64/cuda-nvcc-impl-12.9.41-0.conda + - conda: https://conda.anaconda.org/nvidia/linux-64/cuda-nvcc-tools-12.9.41-0.conda + - conda: https://conda.anaconda.org/nvidia/linux-64/cuda-nvcc_linux-64-12.9.41-0.conda + - conda: https://conda.anaconda.org/nvidia/linux-64/cuda-nvdisasm-12.9.19-1.conda + - conda: https://conda.anaconda.org/nvidia/linux-64/cuda-nvml-dev-12.9.40-1.conda + - conda: https://conda.anaconda.org/nvidia/linux-64/cuda-nvprof-12.9.19-0.conda + - conda: https://conda.anaconda.org/nvidia/linux-64/cuda-nvprune-12.9.19-1.conda + - conda: https://conda.anaconda.org/nvidia/linux-64/cuda-nvrtc-12.9.41-0.conda + - conda: https://conda.anaconda.org/nvidia/linux-64/cuda-nvrtc-dev-12.9.41-0.conda + - conda: https://conda.anaconda.org/nvidia/linux-64/cuda-nvtx-12.9.19-0.conda + - conda: https://conda.anaconda.org/nvidia/noarch/cuda-nvvm-dev_linux-64-12.9.41-0.conda + - conda: https://conda.anaconda.org/nvidia/linux-64/cuda-nvvm-impl-12.9.41-0.conda + - conda: https://conda.anaconda.org/nvidia/linux-64/cuda-nvvm-tools-12.9.41-0.conda + - conda: https://conda.anaconda.org/nvidia/linux-64/cuda-nvvp-12.9.19-1.conda + - conda: https://conda.anaconda.org/nvidia/linux-64/cuda-opencl-12.9.19-0.conda + - conda: https://conda.anaconda.org/nvidia/linux-64/cuda-opencl-dev-12.9.19-0.conda + - conda: 
https://conda.anaconda.org/nvidia/linux-64/cuda-profiler-api-12.9.19-0.conda + - conda: https://conda.anaconda.org/nvidia/linux-64/cuda-sanitizer-api-12.9.27-1.conda + - conda: https://conda.anaconda.org/nvidia/linux-64/cuda-toolkit-12.9.0-0.conda + - conda: https://conda.anaconda.org/nvidia/linux-64/cuda-tools-12.9.0-0.conda + - conda: https://conda.anaconda.org/nvidia/noarch/cuda-version-12.9-3.conda + - conda: https://conda.anaconda.org/nvidia/linux-64/cuda-visual-tools-12.9.0-0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/datasets-2.14.4-pyhd8ed1ab_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/dbus-1.16.2-h3c4dab8_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/deprecated-1.2.18-pyhd8ed1ab_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/dill-0.3.7-pyhd8ed1ab_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/dnspython-2.7.0-pyhff2d567_1.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/email-validator-2.2.0-pyhd8ed1ab_1.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/email_validator-2.2.0-hd8ed1ab_1.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/exceptiongroup-1.3.0-pyhd8ed1ab_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/expat-2.7.0-h5888daf_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/fastapi-0.115.14-pyhe01879c_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/fastapi-cli-0.0.7-pyhd8ed1ab_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/filelock-3.18.0-pyhd8ed1ab_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/fontconfig-2.15.0-h7e30c49_1.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/freetype-2.13.3-ha770c72_1.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/frozenlist-1.6.0-py312hb9e946c_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/fsspec-2025.5.1-pyhd8ed1ab_0.conda + - conda: 
https://conda.anaconda.org/conda-forge/linux-64/gcc_impl_linux-64-13.3.0-h1e990d8_2.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/gcc_linux-64-13.3.0-h6f18a23_11.conda + - conda: https://conda.anaconda.org/nvidia/linux-64/gds-tools-1.14.0.30-4.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/gettext-0.24.1-h5888daf_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/gettext-tools-0.24.1-h5888daf_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/gflags-2.2.2-h5888daf_1005.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/gguf-0.17.1-pyhc364b38_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/glog-0.7.1-hbabe93e_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/gmp-6.3.0-hac33072_2.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/gmpy2-2.2.1-py312h7201bc8_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/googleapis-common-protos-1.70.0-pyhd8ed1ab_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/grpcio-1.71.0-py312hdcb7bd4_1.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/grpcio-reflection-1.71.0-pyhd8ed1ab_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/grpcio-tools-1.71.0-py312h2a0d124_1.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/gxx_impl_linux-64-13.3.0-hae580e1_2.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/gxx_linux-64-13.3.0-hb14504d_11.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/h11-0.16.0-pyhd8ed1ab_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/h2-4.2.0-pyhd8ed1ab_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/hf-transfer-0.1.9-py312h5bc9d60_1.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/hf-xet-1.1.5-py39h260a9e5_3.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/hpack-4.1.0-pyhd8ed1ab_0.conda + - conda: 
https://conda.anaconda.org/conda-forge/noarch/httpcore-1.0.9-pyh29332c3_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/httptools-0.6.4-py312h66e93f0_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/httpx-0.28.1-pyhd8ed1ab_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/huggingface_hub-0.33.1-pyhd8ed1ab_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/hyperframe-6.1.0-pyhd8ed1ab_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/icu-75.1-he02047a_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/idna-3.10-pyhd8ed1ab_1.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/importlib-metadata-8.5.0-pyha770c72_1.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/jinja2-3.1.6-pyhd8ed1ab_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/jupyter_client-8.6.3-pyhd8ed1ab_1.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/jupyter_core-5.8.1-pyh31011fe_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/kernel-headers_linux-64-3.10.0-he073ed8_18.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/keyutils-1.6.1-h166bdaf_0.tar.bz2 + - conda: https://conda.anaconda.org/conda-forge/linux-64/krb5-1.21.3-h659f571_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/lame-3.100-h166bdaf_1003.tar.bz2 + - conda: https://conda.anaconda.org/conda-forge/linux-64/lcms2-2.17-h717163a_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/ld_impl_linux-64-2.43-h712a8e2_5.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/lerc-4.0.0-h0aef613_1.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libabseil-20250127.1-cxx17_hbbce691_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libarrow-20.0.0-h019e7cd_8_cuda.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libarrow-acero-20.0.0-hb826db4_8_cuda.conda + - conda: 
https://conda.anaconda.org/conda-forge/linux-64/libarrow-dataset-20.0.0-hb826db4_8_cuda.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libarrow-substrait-20.0.0-h69308b4_8_cuda.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libasprintf-0.24.1-h8e693c7_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libasprintf-devel-0.24.1-h8e693c7_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libblas-3.9.0-32_hfdb39a5_mkl.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libbrotlicommon-1.1.0-hb9d3cd8_3.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libbrotlidec-1.1.0-hb9d3cd8_3.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libbrotlienc-1.1.0-hb9d3cd8_3.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libcblas-3.9.0-32_h372d94f_mkl.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libcrc32c-1.1.2-h9c3ff4c_0.tar.bz2 + - conda: https://conda.anaconda.org/nvidia/linux-64/libcublas-12.9.0.13-0.conda + - conda: https://conda.anaconda.org/nvidia/linux-64/libcublas-dev-12.9.0.13-0.conda + - conda: https://conda.anaconda.org/nvidia/linux-64/libcufft-11.4.0.6-0.conda + - conda: https://conda.anaconda.org/nvidia/linux-64/libcufft-dev-11.4.0.6-0.conda + - conda: https://conda.anaconda.org/nvidia/linux-64/libcufile-1.14.0.30-4.conda + - conda: https://conda.anaconda.org/nvidia/linux-64/libcufile-dev-1.14.0.30-4.conda + - conda: https://conda.anaconda.org/nvidia/linux-64/libcurand-10.3.10.19-0.conda + - conda: https://conda.anaconda.org/nvidia/linux-64/libcurand-dev-10.3.10.19-0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libcurl-8.14.1-h332b0f4_0.conda + - conda: https://conda.anaconda.org/nvidia/linux-64/libcusolver-11.7.4.40-0.conda + - conda: https://conda.anaconda.org/nvidia/linux-64/libcusolver-dev-11.7.4.40-0.conda + - conda: https://conda.anaconda.org/nvidia/linux-64/libcusparse-12.5.9.5-0.conda + - conda: 
https://conda.anaconda.org/nvidia/linux-64/libcusparse-dev-12.5.9.5-0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libdeflate-1.24-h86f0d12_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libedit-3.1.20250104-pl5321h7949ede_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libev-4.33-hd590300_2.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libevent-2.1.12-hf998b51_1.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libexpat-2.7.0-h5888daf_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libffi-3.4.6-h2dba641_1.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libflac-1.4.3-h59595ed_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libfreetype-2.13.3-ha770c72_1.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libfreetype6-2.13.3-h48d6fc4_1.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libgcc-15.1.0-h767d61c_3.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/libgcc-devel_linux-64-13.3.0-hc03c837_102.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libgcc-ng-15.1.0-h69a702a_3.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libgettextpo-0.24.1-h5888daf_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libgettextpo-devel-0.24.1-h5888daf_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libgfortran-15.1.0-h69a702a_3.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libgfortran5-15.1.0-hcea5267_3.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libglib-2.84.2-h3618099_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libgomp-15.1.0-h767d61c_3.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libgoogle-cloud-2.36.0-hc4361e1_1.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libgoogle-cloud-storage-2.36.0-h0121fbd_1.conda + - conda: 
https://conda.anaconda.org/conda-forge/linux-64/libgrpc-1.71.0-h8e591d7_1.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libhwloc-2.11.2-default_h0d58e46_1001.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libiconv-1.18-h4ce23a2_1.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libjpeg-turbo-3.1.0-hb9d3cd8_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/liblapack-3.9.0-32_hc41d3b0_mkl.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/liblzma-5.8.1-hb9d3cd8_2.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libnghttp2-1.64.0-h161d5f1_0.conda + - conda: https://conda.anaconda.org/nvidia/linux-64/libnpp-12.4.0.27-0.conda + - conda: https://conda.anaconda.org/nvidia/linux-64/libnpp-dev-12.4.0.27-0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libnsl-2.0.1-hb9d3cd8_1.conda + - conda: https://conda.anaconda.org/nvidia/linux-64/libnvfatbin-12.9.19-0.conda + - conda: https://conda.anaconda.org/nvidia/linux-64/libnvfatbin-dev-12.9.19-0.conda + - conda: https://conda.anaconda.org/nvidia/linux-64/libnvjitlink-12.9.41-0.conda + - conda: https://conda.anaconda.org/nvidia/linux-64/libnvjitlink-dev-12.9.41-0.conda + - conda: https://conda.anaconda.org/nvidia/linux-64/libnvjpeg-12.4.0.16-0.conda + - conda: https://conda.anaconda.org/nvidia/linux-64/libnvjpeg-dev-12.4.0.16-0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libogg-1.3.5-hd0c01bc_1.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libopentelemetry-cpp-1.21.0-hd1b1c89_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libopentelemetry-cpp-headers-1.21.0-ha770c72_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libopus-1.5.2-hd0c01bc_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libparquet-20.0.0-h3f30f2e_8_cuda.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libpng-1.6.49-h943b412_0.conda + - 
conda: https://conda.anaconda.org/conda-forge/linux-64/libprotobuf-5.29.3-h501fc15_1.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libre2-11-2025.06.26-hba17884_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libsanitizer-13.3.0-he8ea267_2.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libsentencepiece-0.2.0-he636bdd_11.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libsndfile-1.2.2-hc60ed4a_1.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libsodium-1.0.20-h4ab18f5_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libsqlite-3.50.1-h6cd9bfd_7.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libssh2-1.11.1-hcf80075_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libstdcxx-15.1.0-h8f9b012_3.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/libstdcxx-devel_linux-64-13.3.0-hc03c837_102.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libstdcxx-ng-15.1.0-h4852527_3.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libthrift-0.21.0-h0e7cc3e_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libtiff-4.7.0-hf01ce69_5.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libtorch-2.7.1-cpu_mkl_hb1c5dc7_100.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libutf8proc-2.10.0-h202a827_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libuuid-2.38.1-h0b41bf4_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libuv-1.51.0-hb9d3cd8_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libvorbis-1.3.7-h9c3ff4c_0.tar.bz2 + - conda: https://conda.anaconda.org/conda-forge/linux-64/libwebp-base-1.5.0-h851e524_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libxcb-1.17.0-h8a09558_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libxcrypt-4.4.36-hd590300_1.conda + - conda: 
https://conda.anaconda.org/conda-forge/linux-64/libxkbcommon-1.10.0-h65c71a3_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libxml2-2.13.8-h4bc477f_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libzlib-1.3.1-hb9d3cd8_2.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/llvm-openmp-20.1.7-h024ca30_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/lz4-c-1.10.0-h5888daf_1.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/markdown-it-py-3.0.0-pyhd8ed1ab_1.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/markupsafe-3.0.2-py312h178313f_1.conda + - conda: https://conda.modular.com/max-nightly/noarch/max-25.5.0.dev2025062705-release.conda + - conda: https://conda.modular.com/max-nightly/linux-64/max-core-25.5.0.dev2025062705-release.conda + - conda: https://conda.modular.com/max-nightly/noarch/max-pipelines-25.5.0.dev2025062705-release.conda + - conda: https://conda.modular.com/max-nightly/linux-64/max-python-25.5.0.dev2025062705-release.conda + - conda: https://conda.modular.com/max-nightly/noarch/mblack-25.5.0.dev2025062705-release.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/mdurl-0.1.2-pyhd8ed1ab_1.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/mkl-2024.2.2-ha957f24_16.conda + - conda: https://conda.modular.com/max-nightly/noarch/modular-25.5.0.dev2025062705-release.conda + - conda: https://conda.modular.com/max-nightly/noarch/mojo-jupyter-25.5.0.dev2025062705-release.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/mpc-1.3.1-h24ddda3_1.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/mpfr-4.2.1-h90cbb55_3.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/mpg123-1.32.9-hc50e24c_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/mpmath-1.3.0-pyhd8ed1ab_1.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/msgspec-0.19.0-py312h66e93f0_1.conda + - conda: 
https://conda.anaconda.org/conda-forge/linux-64/multidict-6.5.1-py312h178313f_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/multiprocess-0.70.15-py312h98912ed_1.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/mypy_extensions-1.1.0-pyha770c72_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/ncurses-6.5-h2d0b736_3.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/networkx-3.5-pyhe01879c_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/ninja-1.13.0-h7aa8ee6_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/nlohmann_json-3.12.0-h3f2d84a_0.conda + - conda: https://conda.anaconda.org/nvidia/linux-64/nsight-compute-2025.2.0.11-0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/nspr-4.36-h5888daf_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/nss-3.113-h159eef7_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/numpy-2.3.0-py312h6cf2f7f_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/ocl-icd-2.3.3-hb9d3cd8_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/opencl-headers-2025.06.13-h5888daf_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/openjpeg-2.5.3-h5fbd93e_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/openssl-3.5.0-h7b32b05_1.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/opentelemetry-api-1.30.0-pyhd8ed1ab_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/opentelemetry-exporter-otlp-proto-common-1.30.0-pyhd8ed1ab_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/opentelemetry-exporter-otlp-proto-http-1.30.0-pyhd8ed1ab_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/opentelemetry-exporter-prometheus-0.51b0-pyh29332c3_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/opentelemetry-proto-1.30.0-pyhd8ed1ab_0.conda + - conda: 
https://conda.anaconda.org/conda-forge/noarch/opentelemetry-sdk-1.30.0-pyhd8ed1ab_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/opentelemetry-semantic-conventions-0.51b0-pyh3cfb1c2_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/optree-0.16.0-py312h68727a3_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/orc-2.1.2-h17f744e_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/packaging-25.0-pyh29332c3_1.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/pandas-2.3.0-py312hf9745cd_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/pathspec-0.12.1-pyhd8ed1ab_1.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/pcre2-10.45-hc749103_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/pillow-11.2.1-py312h80c1187_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/platformdirs-4.3.8-pyhe01879c_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/prometheus-async-25.1.0-pyh29332c3_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/prometheus-cpp-1.3.0-ha5d0236_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/prometheus_client-0.22.1-pyhd8ed1ab_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/propcache-0.3.1-py312h178313f_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/protobuf-5.29.3-py312h0f4f066_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/psutil-7.0.0-py312h66e93f0_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/pthread-stubs-0.4-hb9d3cd8_1002.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/pyarrow-20.0.0-py312h7900ff3_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/pyarrow-core-20.0.0-py312h09cf70e_0_cuda.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/pybind11-2.13.6-pyhc790b64_3.conda + - conda: 
https://conda.anaconda.org/conda-forge/noarch/pybind11-global-2.13.6-pyh217bc35_3.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/pycparser-2.22-pyh29332c3_1.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/pydantic-2.11.7-pyh3cfb1c2_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/pydantic-core-2.33.2-py312h680f630_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/pydantic-settings-2.10.1-pyh3cfb1c2_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/pygments-2.19.2-pyhd8ed1ab_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/pyinstrument-5.0.2-py312h66e93f0_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/pysocks-1.7.1-pyha55dd90_7.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/pysoundfile-0.13.1-pyhd8ed1ab_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/python-3.12.11-h9e4cc4f_0_cpython.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/python-dateutil-2.9.0.post0-pyhff2d567_1.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/python-dotenv-1.1.1-pyhe01879c_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/python-gil-3.12.11-hd8ed1ab_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/python-json-logger-2.0.7-pyhd8ed1ab_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/python-multipart-0.0.20-pyhff2d567_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/python-tzdata-2025.2-pyhd8ed1ab_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/python-xxhash-3.5.0-py312h66e93f0_2.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/python_abi-3.12-7_cp312.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/pytorch-2.7.1-cpu_mkl_py312_h6a7998d_100.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/pytz-2025.2-pyhd8ed1ab_0.conda + - conda: 
https://conda.anaconda.org/conda-forge/linux-64/pyyaml-6.0.2-py312h178313f_2.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/pyzmq-27.0.0-py312hbf22597_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/re2-2025.06.26-h9925aae_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/readline-8.2-h8c095d6_2.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/regex-2024.11.6-py312h66e93f0_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/requests-2.32.4-pyhd8ed1ab_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/rich-14.0.0-pyh29332c3_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/rich-toolkit-0.11.3-pyh29332c3_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/s2n-1.5.21-h7ab7c64_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/safetensors-0.5.3-py312h12e396e_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/scipy-1.15.2-py312ha707e6e_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/sentencepiece-0.2.0-hc8f76dd_11.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/sentencepiece-python-0.2.0-py312hb957f94_11.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/sentencepiece-spm-0.2.0-he636bdd_11.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/sentinel-1.0.0-pyh29332c3_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/setuptools-80.9.0-pyhff2d567_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/shellingham-1.5.4-pyhd8ed1ab_1.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/six-1.17.0-pyhd8ed1ab_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/sleef-3.8-h1b44611_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/snappy-1.2.1-h8bd8927_1.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/sniffio-1.3.1-pyhd8ed1ab_1.conda + - conda: 
https://conda.anaconda.org/conda-forge/noarch/sse-starlette-2.1.3-pyhd8ed1ab_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/starlette-0.41.2-pyha770c72_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/sympy-1.14.0-pyh2585a3b_105.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/sysroot_linux-64-2.17-h0157908_18.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/taskgroup-0.2.2-pyhd8ed1ab_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/tbb-2021.13.0-hceb3a55_1.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/tiktoken-0.9.0-py312h14ff09d_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/tk-8.6.13-noxft_hd72426e_102.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/tokenizers-0.21.2-py312h8360d73_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/tornado-6.5.1-py312h66e93f0_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/tqdm-4.67.1-pyhd8ed1ab_1.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/traitlets-5.14.3-pyhd8ed1ab_1.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/transformers-4.53.0-pyhd8ed1ab_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/triton-3.3.1-cuda126py312hebffaa9_1.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/typer-0.16.0-pyh167b9f4_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/typer-slim-0.16.0-pyhe01879c_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/typer-slim-standard-0.16.0-hf964461_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/typing-extensions-4.14.0-h32cad80_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/typing-inspection-0.4.1-pyhd8ed1ab_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/typing_extensions-4.14.0-pyhe01879c_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/tzdata-2025b-h78e105d_0.conda + - conda: 
https://conda.anaconda.org/conda-forge/noarch/urllib3-2.5.0-pyhd8ed1ab_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/uvicorn-0.34.3-pyh31011fe_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/uvicorn-standard-0.34.3-h31011fe_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/uvloop-0.21.0-py312h66e93f0_1.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/watchfiles-1.1.0-py312h12e396e_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/websockets-15.0.1-py312h66e93f0_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/wrapt-1.17.2-py312h66e93f0_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/xgrammar-0.1.19-py312he346f12_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/xkeyboard-config-2.45-hb9d3cd8_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/xorg-libx11-1.8.12-h4f16b4b_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/xorg-libxau-1.0.12-hb9d3cd8_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/xorg-libxdmcp-1.1.5-hb9d3cd8_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/xxhash-0.8.3-hb47aa4a_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/yaml-0.2.5-h7f98852_2.tar.bz2 + - conda: https://conda.anaconda.org/conda-forge/linux-64/yarl-1.20.1-py312h178313f_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/zeromq-4.3.5-h3b0a872_7.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/zipp-3.23.0-pyhd8ed1ab_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/zlib-1.3.1-hb9d3cd8_2.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/zstandard-0.23.0-py312h66e93f0_2.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/zstd-1.5.7-hb8e6e7a_2.conda default: channels: + - url: https://conda.anaconda.org/nvidia/ - url: https://conda.modular.com/max-nightly/ - url: 
https://conda.anaconda.org/conda-forge/ packages: @@ -16,25 +397,27 @@ environments: - conda: https://conda.anaconda.org/conda-forge/noarch/anyio-4.9.0-pyh29332c3_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/asgiref-3.8.1-pyhd8ed1ab_1.conda - conda: https://conda.anaconda.org/conda-forge/noarch/attrs-25.3.0-pyh71513ae_0.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/aws-c-auth-0.9.0-he099f37_14.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/aws-c-auth-0.9.0-hbfa7f16_15.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/aws-c-cal-0.9.2-h5e3027f_0.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/aws-c-common-0.12.3-hb9d3cd8_0.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/aws-c-compression-0.3.1-hafb2847_5.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/aws-c-event-stream-0.5.4-h814f7a8_11.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/aws-c-http-0.10.2-h02758d5_1.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/aws-c-io-0.20.0-hdfce8c9_0.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/aws-c-mqtt-0.13.1-hbebb1f4_2.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/aws-c-s3-0.8.1-h3ef4824_2.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/aws-c-event-stream-0.5.4-h76f0014_12.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/aws-c-http-0.10.2-h015de20_2.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/aws-c-io-0.20.1-hdfce8c9_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/aws-c-mqtt-0.13.1-h1e5e6c0_3.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/aws-c-s3-0.8.3-h5e174a9_0.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/aws-c-sdkutils-0.2.4-hafb2847_0.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/aws-checksums-0.2.7-hafb2847_1.conda - - conda: 
https://conda.anaconda.org/conda-forge/linux-64/aws-crt-cpp-0.32.8-hf309a9c_5.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/aws-sdk-cpp-1.11.510-h4607db7_10.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/aws-crt-cpp-0.32.10-hff780f1_1.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/aws-sdk-cpp-1.11.510-h937e755_11.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/azure-core-cpp-1.14.0-h5cfcd09_0.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/azure-identity-cpp-1.10.0-h113e628_0.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/azure-storage-blobs-cpp-12.13.0-h3cf044e_1.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/azure-storage-common-cpp-12.8.0-h736e048_1.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/azure-storage-files-datalake-cpp-12.12.0-ha633028_1.conda - conda: https://conda.anaconda.org/conda-forge/noarch/backoff-2.2.1-pyhd8ed1ab_1.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/binutils_impl_linux-64-2.43-h4bf12b8_5.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/binutils_linux-64-2.43-h4852527_5.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/brotli-python-1.1.0-py312h2ec8cdc_3.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/bzip2-1.0.8-h4bc722e_7.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/c-ares-1.34.5-hb9d3cd8_0.conda @@ -45,31 +428,74 @@ environments: - conda: https://conda.anaconda.org/conda-forge/noarch/click-8.2.1-pyh707e725_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/colorama-0.4.6-pyhd8ed1ab_1.conda - conda: https://conda.anaconda.org/conda-forge/noarch/cpython-3.12.11-py312hd8ed1ab_0.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/cuda-crt-tools-12.9.86-ha770c72_1.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/cuda-cudart-12.9.79-h5888daf_0.conda - - conda: 
https://conda.anaconda.org/conda-forge/noarch/cuda-cudart_linux-64-12.9.79-h3f2d84a_0.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/cuda-cuobjdump-12.9.82-hbd13f7d_0.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/cuda-cupti-12.9.79-h9ab20c4_0.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/cuda-nvcc-tools-12.9.86-he02047a_1.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/cuda-nvdisasm-12.9.88-hbd13f7d_0.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/cuda-nvvm-tools-12.9.86-he02047a_1.conda - - conda: https://conda.anaconda.org/conda-forge/noarch/cuda-version-12.9-h4f385c5_3.conda + - conda: https://conda.anaconda.org/nvidia/linux-64/cuda-cccl_linux-64-12.9.27-0.conda + - conda: https://conda.anaconda.org/nvidia/linux-64/cuda-command-line-tools-12.9.0-0.conda + - conda: https://conda.anaconda.org/nvidia/linux-64/cuda-compiler-12.9.0-0.conda + - conda: https://conda.anaconda.org/nvidia/noarch/cuda-crt-dev_linux-64-12.9.41-0.conda + - conda: https://conda.anaconda.org/nvidia/linux-64/cuda-crt-tools-12.9.41-0.conda + - conda: https://conda.anaconda.org/nvidia/linux-64/cuda-cudart-12.9.37-0.conda + - conda: https://conda.anaconda.org/nvidia/linux-64/cuda-cudart-dev-12.9.37-0.conda + - conda: https://conda.anaconda.org/nvidia/noarch/cuda-cudart-dev_linux-64-12.9.37-0.conda + - conda: https://conda.anaconda.org/nvidia/linux-64/cuda-cudart-static-12.9.37-0.conda + - conda: https://conda.anaconda.org/nvidia/noarch/cuda-cudart-static_linux-64-12.9.37-0.conda + - conda: https://conda.anaconda.org/nvidia/noarch/cuda-cudart_linux-64-12.9.37-0.conda + - conda: https://conda.anaconda.org/nvidia/linux-64/cuda-cuobjdump-12.9.26-1.conda + - conda: https://conda.anaconda.org/nvidia/linux-64/cuda-cupti-12.9.19-0.conda + - conda: https://conda.anaconda.org/nvidia/linux-64/cuda-cupti-dev-12.9.19-0.conda + - conda: https://conda.anaconda.org/nvidia/linux-64/cuda-cuxxfilt-12.9.19-1.conda + - conda: 
https://conda.anaconda.org/nvidia/linux-64/cuda-driver-dev-12.9.37-0.conda + - conda: https://conda.anaconda.org/nvidia/noarch/cuda-driver-dev_linux-64-12.9.79-0.conda + - conda: https://conda.anaconda.org/nvidia/linux-64/cuda-gdb-12.9.19-1.conda + - conda: https://conda.anaconda.org/nvidia/linux-64/cuda-libraries-12.9.0-0.conda + - conda: https://conda.anaconda.org/nvidia/linux-64/cuda-libraries-dev-12.9.0-0.conda + - conda: https://conda.anaconda.org/nvidia/linux-64/cuda-nsight-12.9.19-0.conda + - conda: https://conda.anaconda.org/nvidia/linux-64/cuda-nvcc-12.9.41-0.conda + - conda: https://conda.anaconda.org/nvidia/noarch/cuda-nvcc-dev_linux-64-12.9.41-0.conda + - conda: https://conda.anaconda.org/nvidia/linux-64/cuda-nvcc-impl-12.9.41-0.conda + - conda: https://conda.anaconda.org/nvidia/linux-64/cuda-nvcc-tools-12.9.41-0.conda + - conda: https://conda.anaconda.org/nvidia/linux-64/cuda-nvcc_linux-64-12.9.41-0.conda + - conda: https://conda.anaconda.org/nvidia/linux-64/cuda-nvdisasm-12.9.19-1.conda + - conda: https://conda.anaconda.org/nvidia/linux-64/cuda-nvml-dev-12.9.40-1.conda + - conda: https://conda.anaconda.org/nvidia/linux-64/cuda-nvprof-12.9.19-0.conda + - conda: https://conda.anaconda.org/nvidia/linux-64/cuda-nvprune-12.9.19-1.conda + - conda: https://conda.anaconda.org/nvidia/linux-64/cuda-nvrtc-12.9.41-0.conda + - conda: https://conda.anaconda.org/nvidia/linux-64/cuda-nvrtc-dev-12.9.41-0.conda + - conda: https://conda.anaconda.org/nvidia/linux-64/cuda-nvtx-12.9.19-0.conda + - conda: https://conda.anaconda.org/nvidia/noarch/cuda-nvvm-dev_linux-64-12.9.41-0.conda + - conda: https://conda.anaconda.org/nvidia/linux-64/cuda-nvvm-impl-12.9.41-0.conda + - conda: https://conda.anaconda.org/nvidia/linux-64/cuda-nvvm-tools-12.9.41-0.conda + - conda: https://conda.anaconda.org/nvidia/linux-64/cuda-nvvp-12.9.19-1.conda + - conda: https://conda.anaconda.org/nvidia/linux-64/cuda-opencl-12.9.19-0.conda + - conda: 
https://conda.anaconda.org/nvidia/linux-64/cuda-opencl-dev-12.9.19-0.conda + - conda: https://conda.anaconda.org/nvidia/linux-64/cuda-profiler-api-12.9.19-0.conda + - conda: https://conda.anaconda.org/nvidia/linux-64/cuda-sanitizer-api-12.9.27-1.conda + - conda: https://conda.anaconda.org/nvidia/linux-64/cuda-toolkit-12.9.0-0.conda + - conda: https://conda.anaconda.org/nvidia/linux-64/cuda-tools-12.9.0-0.conda + - conda: https://conda.anaconda.org/nvidia/noarch/cuda-version-12.9-3.conda + - conda: https://conda.anaconda.org/nvidia/linux-64/cuda-visual-tools-12.9.0-0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/datasets-2.14.4-pyhd8ed1ab_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/dbus-1.16.2-h3c4dab8_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/deprecated-1.2.18-pyhd8ed1ab_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/dill-0.3.7-pyhd8ed1ab_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/dnspython-2.7.0-pyhff2d567_1.conda - conda: https://conda.anaconda.org/conda-forge/noarch/email-validator-2.2.0-pyhd8ed1ab_1.conda - conda: https://conda.anaconda.org/conda-forge/noarch/email_validator-2.2.0-hd8ed1ab_1.conda - conda: https://conda.anaconda.org/conda-forge/noarch/exceptiongroup-1.3.0-pyhd8ed1ab_0.conda - - conda: https://conda.anaconda.org/conda-forge/noarch/fastapi-0.115.13-pyhe01879c_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/expat-2.7.0-h5888daf_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/fastapi-0.115.14-pyhe01879c_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/fastapi-cli-0.0.7-pyhd8ed1ab_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/filelock-3.18.0-pyhd8ed1ab_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/fontconfig-2.15.0-h7e30c49_1.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/freetype-2.13.3-ha770c72_1.conda - conda: 
https://conda.anaconda.org/conda-forge/linux-64/frozenlist-1.6.0-py312hb9e946c_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/fsspec-2025.5.1-pyhd8ed1ab_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/gcc_impl_linux-64-13.3.0-h1e990d8_2.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/gcc_linux-64-13.3.0-h6f18a23_11.conda + - conda: https://conda.anaconda.org/nvidia/linux-64/gds-tools-1.14.0.30-4.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/gettext-0.24.1-h5888daf_0.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/gettext-tools-0.24.1-h5888daf_0.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/gflags-2.2.2-h5888daf_1005.conda - - conda: https://conda.anaconda.org/conda-forge/noarch/gguf-0.17.0-pyhc364b38_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/gguf-0.17.1-pyhc364b38_0.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/glog-0.7.1-hbabe93e_0.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/gmp-6.3.0-hac33072_2.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/gmpy2-2.2.1-py312h7201bc8_0.conda @@ -77,40 +503,57 @@ environments: - conda: https://conda.anaconda.org/conda-forge/linux-64/grpcio-1.71.0-py312hdcb7bd4_1.conda - conda: https://conda.anaconda.org/conda-forge/noarch/grpcio-reflection-1.71.0-pyhd8ed1ab_0.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/grpcio-tools-1.71.0-py312h2a0d124_1.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/gxx_impl_linux-64-13.3.0-hae580e1_2.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/gxx_linux-64-13.3.0-hb14504d_11.conda - conda: https://conda.anaconda.org/conda-forge/noarch/h11-0.16.0-pyhd8ed1ab_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/h2-4.2.0-pyhd8ed1ab_0.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/hf-transfer-0.1.9-py312h5bc9d60_1.conda - - conda: 
https://conda.anaconda.org/conda-forge/linux-64/hf-xet-1.1.4-py39h057ba11_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/hf-xet-1.1.5-py39h260a9e5_3.conda - conda: https://conda.anaconda.org/conda-forge/noarch/hpack-4.1.0-pyhd8ed1ab_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/httpcore-1.0.9-pyh29332c3_0.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/httptools-0.6.4-py312h66e93f0_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/httpx-0.28.1-pyhd8ed1ab_0.conda - - conda: https://conda.anaconda.org/conda-forge/noarch/huggingface_hub-0.33.0-pyhd8ed1ab_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/huggingface_hub-0.33.1-pyhd8ed1ab_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/hyperframe-6.1.0-pyhd8ed1ab_0.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/icu-75.1-he02047a_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/idna-3.10-pyhd8ed1ab_1.conda - conda: https://conda.anaconda.org/conda-forge/noarch/importlib-metadata-8.5.0-pyha770c72_1.conda - conda: https://conda.anaconda.org/conda-forge/noarch/jinja2-3.1.6-pyhd8ed1ab_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/jupyter_client-8.6.3-pyhd8ed1ab_1.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/jupyter_core-5.8.1-pyh31011fe_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/kernel-headers_linux-64-3.10.0-he073ed8_18.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/keyutils-1.6.1-h166bdaf_0.tar.bz2 - conda: https://conda.anaconda.org/conda-forge/linux-64/krb5-1.21.3-h659f571_0.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/lame-3.100-h166bdaf_1003.tar.bz2 - conda: https://conda.anaconda.org/conda-forge/linux-64/lcms2-2.17-h717163a_0.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/ld_impl_linux-64-2.43-h1423503_5.conda + - conda: 
https://conda.anaconda.org/conda-forge/linux-64/ld_impl_linux-64-2.43-h712a8e2_5.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/lerc-4.0.0-h0aef613_1.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/libabseil-20250127.1-cxx17_hbbce691_0.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/libarrow-20.0.0-h314c690_7_cpu.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/libarrow-acero-20.0.0-hcb10f89_7_cpu.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/libarrow-dataset-20.0.0-hcb10f89_7_cpu.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/libarrow-substrait-20.0.0-h1bed206_7_cpu.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libarrow-20.0.0-h019e7cd_8_cuda.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libarrow-acero-20.0.0-hb826db4_8_cuda.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libarrow-dataset-20.0.0-hb826db4_8_cuda.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libarrow-substrait-20.0.0-h69308b4_8_cuda.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/libasprintf-0.24.1-h8e693c7_0.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/libasprintf-devel-0.24.1-h8e693c7_0.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/libblas-3.9.0-31_hfdb39a5_mkl.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libblas-3.9.0-32_hfdb39a5_mkl.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/libbrotlicommon-1.1.0-hb9d3cd8_3.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/libbrotlidec-1.1.0-hb9d3cd8_3.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/libbrotlienc-1.1.0-hb9d3cd8_3.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/libcblas-3.9.0-31_h372d94f_mkl.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libcblas-3.9.0-32_h372d94f_mkl.conda - conda: 
https://conda.anaconda.org/conda-forge/linux-64/libcrc32c-1.1.2-h9c3ff4c_0.tar.bz2 + - conda: https://conda.anaconda.org/nvidia/linux-64/libcublas-12.9.0.13-0.conda + - conda: https://conda.anaconda.org/nvidia/linux-64/libcublas-dev-12.9.0.13-0.conda + - conda: https://conda.anaconda.org/nvidia/linux-64/libcufft-11.4.0.6-0.conda + - conda: https://conda.anaconda.org/nvidia/linux-64/libcufft-dev-11.4.0.6-0.conda + - conda: https://conda.anaconda.org/nvidia/linux-64/libcufile-1.14.0.30-4.conda + - conda: https://conda.anaconda.org/nvidia/linux-64/libcufile-dev-1.14.0.30-4.conda + - conda: https://conda.anaconda.org/nvidia/linux-64/libcurand-10.3.10.19-0.conda + - conda: https://conda.anaconda.org/nvidia/linux-64/libcurand-dev-10.3.10.19-0.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/libcurl-8.14.1-h332b0f4_0.conda + - conda: https://conda.anaconda.org/nvidia/linux-64/libcusolver-11.7.4.40-0.conda + - conda: https://conda.anaconda.org/nvidia/linux-64/libcusolver-dev-11.7.4.40-0.conda + - conda: https://conda.anaconda.org/nvidia/linux-64/libcusparse-12.5.9.5-0.conda + - conda: https://conda.anaconda.org/nvidia/linux-64/libcusparse-dev-12.5.9.5-0.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/libdeflate-1.24-h86f0d12_0.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/libedit-3.1.20250104-pl5321h7949ede_0.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/libev-4.33-hd590300_2.conda @@ -120,40 +563,53 @@ environments: - conda: https://conda.anaconda.org/conda-forge/linux-64/libflac-1.4.3-h59595ed_0.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/libfreetype-2.13.3-ha770c72_1.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/libfreetype6-2.13.3-h48d6fc4_1.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/libgcc-15.1.0-h767d61c_2.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/libgcc-ng-15.1.0-h69a702a_2.conda + - conda: 
https://conda.anaconda.org/conda-forge/linux-64/libgcc-15.1.0-h767d61c_3.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/libgcc-devel_linux-64-13.3.0-hc03c837_102.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libgcc-ng-15.1.0-h69a702a_3.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/libgettextpo-0.24.1-h5888daf_0.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/libgettextpo-devel-0.24.1-h5888daf_0.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/libgfortran-15.1.0-h69a702a_2.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/libgfortran5-15.1.0-hcea5267_2.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libgfortran-15.1.0-h69a702a_3.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libgfortran5-15.1.0-hcea5267_3.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libglib-2.84.2-h3618099_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libgomp-15.1.0-h767d61c_3.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/libgoogle-cloud-2.36.0-hc4361e1_1.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/libgoogle-cloud-storage-2.36.0-h0121fbd_1.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/libgrpc-1.71.0-h8e591d7_1.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/libhwloc-2.11.2-default_h0d58e46_1001.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/libiconv-1.18-h4ce23a2_1.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/libjpeg-turbo-3.1.0-hb9d3cd8_0.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/liblapack-3.9.0-31_hc41d3b0_mkl.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/liblapack-3.9.0-32_hc41d3b0_mkl.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/liblzma-5.8.1-hb9d3cd8_2.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/libnghttp2-1.64.0-h161d5f1_0.conda - - conda: 
https://conda.anaconda.org/conda-forge/linux-64/libnsl-2.0.1-hd590300_0.conda + - conda: https://conda.anaconda.org/nvidia/linux-64/libnpp-12.4.0.27-0.conda + - conda: https://conda.anaconda.org/nvidia/linux-64/libnpp-dev-12.4.0.27-0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libnsl-2.0.1-hb9d3cd8_1.conda + - conda: https://conda.anaconda.org/nvidia/linux-64/libnvfatbin-12.9.19-0.conda + - conda: https://conda.anaconda.org/nvidia/linux-64/libnvfatbin-dev-12.9.19-0.conda + - conda: https://conda.anaconda.org/nvidia/linux-64/libnvjitlink-12.9.41-0.conda + - conda: https://conda.anaconda.org/nvidia/linux-64/libnvjitlink-dev-12.9.41-0.conda + - conda: https://conda.anaconda.org/nvidia/linux-64/libnvjpeg-12.4.0.16-0.conda + - conda: https://conda.anaconda.org/nvidia/linux-64/libnvjpeg-dev-12.4.0.16-0.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/libogg-1.3.5-hd0c01bc_1.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/libopentelemetry-cpp-1.21.0-hd1b1c89_0.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/libopentelemetry-cpp-headers-1.21.0-ha770c72_0.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/libopus-1.5.2-hd0c01bc_0.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/libparquet-20.0.0-h081d1f1_7_cpu.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libparquet-20.0.0-h3f30f2e_8_cuda.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/libpng-1.6.49-h943b412_0.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/libprotobuf-5.29.3-h501fc15_1.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/libre2-11-2024.07.02-hba17884_3.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libre2-11-2025.06.26-hba17884_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libsanitizer-13.3.0-he8ea267_2.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/libsentencepiece-0.2.0-he636bdd_11.conda - conda: 
https://conda.anaconda.org/conda-forge/linux-64/libsndfile-1.2.2-hc60ed4a_1.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/libsodium-1.0.20-h4ab18f5_0.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/libsqlite-3.50.1-hee588c1_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libsqlite-3.50.1-h6cd9bfd_7.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/libssh2-1.11.1-hcf80075_0.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/libstdcxx-15.1.0-h8f9b012_2.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/libstdcxx-ng-15.1.0-h4852527_2.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libstdcxx-15.1.0-h8f9b012_3.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/libstdcxx-devel_linux-64-13.3.0-hc03c837_102.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libstdcxx-ng-15.1.0-h4852527_3.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/libthrift-0.21.0-h0e7cc3e_0.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/libtiff-4.7.0-hf01ce69_5.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/libtorch-2.7.0-cpu_mkl_hf6ddc5a_100.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libtorch-2.7.1-cpu_mkl_hb1c5dc7_100.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/libutf8proc-2.10.0-h202a827_0.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/libuuid-2.38.1-h0b41bf4_0.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/libuv-1.51.0-hb9d3cd8_0.conda @@ -161,32 +617,40 @@ environments: - conda: https://conda.anaconda.org/conda-forge/linux-64/libwebp-base-1.5.0-h851e524_0.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/libxcb-1.17.0-h8a09558_0.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/libxcrypt-4.4.36-hd590300_1.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libxkbcommon-1.10.0-h65c71a3_0.conda - conda: 
https://conda.anaconda.org/conda-forge/linux-64/libxml2-2.13.8-h4bc477f_0.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/libzlib-1.3.1-hb9d3cd8_2.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/llvm-openmp-20.1.7-h024ca30_0.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/lz4-c-1.10.0-h5888daf_1.conda - conda: https://conda.anaconda.org/conda-forge/noarch/markdown-it-py-3.0.0-pyhd8ed1ab_1.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/markupsafe-3.0.2-py312h178313f_1.conda - - conda: https://conda.modular.com/max-nightly/linux-64/max-core-25.5.0.dev2025062405-release.conda - - conda: https://conda.modular.com/max-nightly/noarch/max-pipelines-25.5.0.dev2025062405-release.conda - - conda: https://conda.modular.com/max-nightly/linux-64/max-python-25.5.0.dev2025062405-release.conda - - conda: https://conda.modular.com/max-nightly/noarch/mblack-25.5.0.dev2025062405-release.conda + - conda: https://conda.modular.com/max-nightly/noarch/max-25.5.0.dev2025062705-release.conda + - conda: https://conda.modular.com/max-nightly/linux-64/max-core-25.5.0.dev2025062705-release.conda + - conda: https://conda.modular.com/max-nightly/noarch/max-pipelines-25.5.0.dev2025062705-release.conda + - conda: https://conda.modular.com/max-nightly/linux-64/max-python-25.5.0.dev2025062705-release.conda + - conda: https://conda.modular.com/max-nightly/noarch/mblack-25.5.0.dev2025062705-release.conda - conda: https://conda.anaconda.org/conda-forge/noarch/mdurl-0.1.2-pyhd8ed1ab_1.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/mkl-2024.2.2-ha957f24_16.conda - - conda: https://conda.modular.com/max-nightly/noarch/modular-25.5.0.dev2025062405-release.conda + - conda: https://conda.modular.com/max-nightly/noarch/modular-25.5.0.dev2025062705-release.conda + - conda: https://conda.modular.com/max-nightly/noarch/mojo-jupyter-25.5.0.dev2025062705-release.conda - conda: 
https://conda.anaconda.org/conda-forge/linux-64/mpc-1.3.1-h24ddda3_1.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/mpfr-4.2.1-h90cbb55_3.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/mpg123-1.32.9-hc50e24c_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/mpmath-1.3.0-pyhd8ed1ab_1.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/msgspec-0.19.0-py312h66e93f0_1.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/multidict-6.5.0-py312h178313f_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/multidict-6.5.1-py312h178313f_0.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/multiprocess-0.70.15-py312h98912ed_1.conda - conda: https://conda.anaconda.org/conda-forge/noarch/mypy_extensions-1.1.0-pyha770c72_0.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/ncurses-6.5-h2d0b736_3.conda - conda: https://conda.anaconda.org/conda-forge/noarch/networkx-3.5-pyhe01879c_0.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/ninja-1.12.1-hff21bea_1.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/ninja-1.13.0-h7aa8ee6_0.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/nlohmann_json-3.12.0-h3f2d84a_0.conda + - conda: https://conda.anaconda.org/nvidia/linux-64/nsight-compute-2025.2.0.11-0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/nspr-4.36-h5888daf_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/nss-3.113-h159eef7_0.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/numpy-2.3.0-py312h6cf2f7f_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/ocl-icd-2.3.3-hb9d3cd8_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/opencl-headers-2025.06.13-h5888daf_0.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/openjpeg-2.5.3-h5fbd93e_0.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/openssl-3.5.0-h7b32b05_1.conda - conda: 
https://conda.anaconda.org/conda-forge/noarch/opentelemetry-api-1.30.0-pyhd8ed1ab_0.conda @@ -201,6 +665,7 @@ environments: - conda: https://conda.anaconda.org/conda-forge/noarch/packaging-25.0-pyh29332c3_1.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/pandas-2.3.0-py312hf9745cd_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/pathspec-0.12.1-pyhd8ed1ab_1.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/pcre2-10.45-hc749103_0.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/pillow-11.2.1-py312h80c1187_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/platformdirs-4.3.8-pyhe01879c_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/prometheus-async-25.1.0-pyh29332c3_0.conda @@ -211,31 +676,31 @@ environments: - conda: https://conda.anaconda.org/conda-forge/linux-64/psutil-7.0.0-py312h66e93f0_0.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/pthread-stubs-0.4-hb9d3cd8_1002.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/pyarrow-20.0.0-py312h7900ff3_0.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/pyarrow-core-20.0.0-py312h01725c0_0_cpu.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/pyarrow-core-20.0.0-py312h09cf70e_0_cuda.conda - conda: https://conda.anaconda.org/conda-forge/noarch/pybind11-2.13.6-pyhc790b64_3.conda - conda: https://conda.anaconda.org/conda-forge/noarch/pybind11-global-2.13.6-pyh217bc35_3.conda - conda: https://conda.anaconda.org/conda-forge/noarch/pycparser-2.22-pyh29332c3_1.conda - conda: https://conda.anaconda.org/conda-forge/noarch/pydantic-2.11.7-pyh3cfb1c2_0.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/pydantic-core-2.33.2-py312h680f630_0.conda - - conda: https://conda.anaconda.org/conda-forge/noarch/pydantic-settings-2.9.1-pyh3cfb1c2_0.conda - - conda: https://conda.anaconda.org/conda-forge/noarch/pygments-2.19.1-pyhd8ed1ab_0.conda + - conda: 
https://conda.anaconda.org/conda-forge/noarch/pydantic-settings-2.10.1-pyh3cfb1c2_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/pygments-2.19.2-pyhd8ed1ab_0.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/pyinstrument-5.0.2-py312h66e93f0_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/pysocks-1.7.1-pyha55dd90_7.conda - conda: https://conda.anaconda.org/conda-forge/noarch/pysoundfile-0.13.1-pyhd8ed1ab_0.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/python-3.12.11-h9e4cc4f_0_cpython.conda - conda: https://conda.anaconda.org/conda-forge/noarch/python-dateutil-2.9.0.post0-pyhff2d567_1.conda - - conda: https://conda.anaconda.org/conda-forge/noarch/python-dotenv-1.1.0-pyh29332c3_1.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/python-dotenv-1.1.1-pyhe01879c_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/python-gil-3.12.11-hd8ed1ab_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/python-json-logger-2.0.7-pyhd8ed1ab_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/python-multipart-0.0.20-pyhff2d567_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/python-tzdata-2025.2-pyhd8ed1ab_0.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/python-xxhash-3.5.0-py312h66e93f0_2.conda - conda: https://conda.anaconda.org/conda-forge/noarch/python_abi-3.12-7_cp312.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/pytorch-2.7.0-cpu_mkl_py312_h6a7998d_100.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/pytorch-2.7.1-cpu_mkl_py312_h6a7998d_100.conda - conda: https://conda.anaconda.org/conda-forge/noarch/pytz-2025.2-pyhd8ed1ab_0.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/pyyaml-6.0.2-py312h178313f_2.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/pyzmq-27.0.0-py312hbf22597_0.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/re2-2024.07.02-h9925aae_3.conda + - 
conda: https://conda.anaconda.org/conda-forge/linux-64/re2-2025.06.26-h9925aae_0.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/readline-8.2-h8c095d6_2.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/regex-2024.11.6-py312h66e93f0_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/requests-2.32.4-pyhd8ed1ab_0.conda @@ -248,7 +713,7 @@ environments: - conda: https://conda.anaconda.org/conda-forge/linux-64/sentencepiece-python-0.2.0-py312hb957f94_11.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/sentencepiece-spm-0.2.0-he636bdd_11.conda - conda: https://conda.anaconda.org/conda-forge/noarch/sentinel-1.0.0-pyh29332c3_0.conda - - conda: https://conda.anaconda.org/conda-forge/noarch/setuptools-75.8.2-pyhff2d567_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/setuptools-80.9.0-pyhff2d567_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/shellingham-1.5.4-pyhd8ed1ab_1.conda - conda: https://conda.anaconda.org/conda-forge/noarch/six-1.17.0-pyhd8ed1ab_0.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/sleef-3.8-h1b44611_0.conda @@ -257,14 +722,17 @@ environments: - conda: https://conda.anaconda.org/conda-forge/noarch/sse-starlette-2.1.3-pyhd8ed1ab_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/starlette-0.41.2-pyha770c72_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/sympy-1.14.0-pyh2585a3b_105.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/sysroot_linux-64-2.17-h0157908_18.conda - conda: https://conda.anaconda.org/conda-forge/noarch/taskgroup-0.2.2-pyhd8ed1ab_0.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/tbb-2021.13.0-hceb3a55_1.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/tiktoken-0.9.0-py312h14ff09d_0.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/tk-8.6.13-noxft_hd72426e_102.conda - - conda: 
https://conda.anaconda.org/conda-forge/linux-64/tokenizers-0.21.1-py312h8360d73_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/tokenizers-0.21.2-py312h8360d73_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/tornado-6.5.1-py312h66e93f0_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/tqdm-4.67.1-pyhd8ed1ab_1.conda - - conda: https://conda.anaconda.org/conda-forge/noarch/transformers-4.52.4-pyhd8ed1ab_0.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/triton-3.3.1-cuda126py312hebffaa9_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/traitlets-5.14.3-pyhd8ed1ab_1.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/transformers-4.53.0-pyhd8ed1ab_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/triton-3.3.1-cuda126py312hebffaa9_1.conda - conda: https://conda.anaconda.org/conda-forge/noarch/typer-0.16.0-pyh167b9f4_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/typer-slim-0.16.0-pyhe01879c_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/typer-slim-standard-0.16.0-hf964461_0.conda @@ -272,7 +740,7 @@ environments: - conda: https://conda.anaconda.org/conda-forge/noarch/typing-inspection-0.4.1-pyhd8ed1ab_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/typing_extensions-4.14.0-pyhe01879c_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/tzdata-2025b-h78e105d_0.conda - - conda: https://conda.anaconda.org/conda-forge/noarch/urllib3-2.4.0-pyhd8ed1ab_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/urllib3-2.5.0-pyhd8ed1ab_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/uvicorn-0.34.3-pyh31011fe_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/uvicorn-standard-0.34.3-h31011fe_0.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/uvloop-0.21.0-py312h66e93f0_1.conda @@ -280,6 +748,8 @@ environments: - conda: 
https://conda.anaconda.org/conda-forge/linux-64/websockets-15.0.1-py312h66e93f0_0.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/wrapt-1.17.2-py312h66e93f0_0.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/xgrammar-0.1.19-py312he346f12_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/xkeyboard-config-2.45-hb9d3cd8_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/xorg-libx11-1.8.12-h4f16b4b_0.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/xorg-libxau-1.0.12-hb9d3cd8_0.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/xorg-libxdmcp-1.1.5-hb9d3cd8_0.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/xxhash-0.8.3-hb47aa4a_0.conda @@ -404,21 +874,21 @@ packages: license_family: MIT size: 57181 timestamp: 1741918625732 -- conda: https://conda.anaconda.org/conda-forge/linux-64/aws-c-auth-0.9.0-he099f37_14.conda - sha256: 83ed2008b2510c34a34daff71db0adb0d51195fd3f61749537043e6f19eea874 - md5: 92966a75254cef7f36aa48cbbbcd0d18 +- conda: https://conda.anaconda.org/conda-forge/linux-64/aws-c-auth-0.9.0-hbfa7f16_15.conda + sha256: 85086df9b358450196a13fc55bab1c552227df78cafddbe2d15caaea458b41a6 + md5: 16baa9bb7f70a1e457a82023898314a7 depends: - libgcc >=13 - __glibc >=2.17,<3.0.a0 - - aws-c-cal >=0.9.2,<0.9.3.0a0 - - aws-c-io >=0.20.0,<0.20.1.0a0 + - aws-c-io >=0.20.1,<0.20.2.0a0 + - aws-c-http >=0.10.2,<0.10.3.0a0 - aws-c-sdkutils >=0.2.4,<0.2.5.0a0 - aws-c-common >=0.12.3,<0.12.4.0a0 - - aws-c-http >=0.10.2,<0.10.3.0a0 + - aws-c-cal >=0.9.2,<0.9.3.0a0 license: Apache-2.0 license_family: APACHE - size: 122998 - timestamp: 1749824007282 + size: 122993 + timestamp: 1750291448852 - conda: https://conda.anaconda.org/conda-forge/linux-64/aws-c-cal-0.9.2-h5e3027f_0.conda sha256: d61cce967e6d97d03aa2828458f7344cdc93422fd2c1126976ab8f475a313363 md5: 0ead3ab65460d51efb27e5186f50f8e4 @@ -452,78 +922,76 @@ packages: license_family: APACHE size: 21817 timestamp: 1747144982788 -- 
conda: https://conda.anaconda.org/conda-forge/linux-64/aws-c-event-stream-0.5.4-h814f7a8_11.conda - sha256: d8e43c9b9a6edcab74b639b41bedcc40bf152248b12fb3454a98945269eabef6 - md5: 5d311430ba378adc1740de11d94e889f +- conda: https://conda.anaconda.org/conda-forge/linux-64/aws-c-event-stream-0.5.4-h76f0014_12.conda + sha256: 7b89ed99ac73c863bea4479f1f1af6ce250f9f1722d2804e07cf05d3630c7e08 + md5: f978f2a3032952350d0036c4c4a63bd6 depends: - - libgcc >=13 + - __glibc >=2.17,<3.0.a0 - libstdcxx >=13 - libgcc >=13 - - __glibc >=2.17,<3.0.a0 + - aws-c-io >=0.20.1,<0.20.2.0a0 - aws-checksums >=0.2.7,<0.2.8.0a0 - - aws-c-io >=0.20.0,<0.20.1.0a0 - aws-c-common >=0.12.3,<0.12.4.0a0 license: Apache-2.0 license_family: APACHE - size: 57228 - timestamp: 1749819127292 -- conda: https://conda.anaconda.org/conda-forge/linux-64/aws-c-http-0.10.2-h02758d5_1.conda - sha256: 926f00d230bd81acb4567f4c6481c73aa7ca3e738a55648bcbb7896e555f640d - md5: ff204e8da6461eacdca12d39786122c3 + size: 57252 + timestamp: 1750287878861 +- conda: https://conda.anaconda.org/conda-forge/linux-64/aws-c-http-0.10.2-h015de20_2.conda + sha256: ca0268cead19e985f9b153613f0f6cdb46e0ca32e1647466c506f256269bcdd9 + md5: ad05d594704926ba7c0c894a02ea98f1 depends: - __glibc >=2.17,<3.0.a0 - libgcc >=13 - - aws-c-io >=0.20.0,<0.20.1.0a0 - - aws-c-cal >=0.9.2,<0.9.3.0a0 + - aws-c-io >=0.20.1,<0.20.2.0a0 - aws-c-common >=0.12.3,<0.12.4.0a0 + - aws-c-cal >=0.9.2,<0.9.3.0a0 - aws-c-compression >=0.3.1,<0.3.2.0a0 license: Apache-2.0 license_family: APACHE - size: 223042 - timestamp: 1749819081641 -- conda: https://conda.anaconda.org/conda-forge/linux-64/aws-c-io-0.20.0-hdfce8c9_0.conda - sha256: 66c6edf2987e2bd53a04d7fb9e18a64401db769a3d9aee894e888698b498cd30 - md5: 9ec920201723beb7a186ab56710f4b72 + size: 223038 + timestamp: 1750289165728 +- conda: https://conda.anaconda.org/conda-forge/linux-64/aws-c-io-0.20.1-hdfce8c9_0.conda + sha256: c6bd4f067a7829795e1c44e4536b71d46f55f69569216aed34a7b375815fa046 + md5: 
dd2d3530296d75023a19bc9dfb0a1d59 depends: - - __glibc >=2.17,<3.0.a0 - libgcc >=13 - - aws-c-cal >=0.9.2,<0.9.3.0a0 - - aws-c-common >=0.12.3,<0.12.4.0a0 + - __glibc >=2.17,<3.0.a0 - s2n >=1.5.21,<1.5.22.0a0 + - aws-c-common >=0.12.3,<0.12.4.0a0 + - aws-c-cal >=0.9.2,<0.9.3.0a0 license: Apache-2.0 license_family: APACHE - size: 179237 - timestamp: 1749775931654 -- conda: https://conda.anaconda.org/conda-forge/linux-64/aws-c-mqtt-0.13.1-hbebb1f4_2.conda - sha256: 80e480d7f22fd3afb1609402d3811e14850dd0976994084a42c858f6174ecc57 - md5: a53fe33c3c59cbd3e63e17af18c999c8 + size: 179223 + timestamp: 1749844480175 +- conda: https://conda.anaconda.org/conda-forge/linux-64/aws-c-mqtt-0.13.1-h1e5e6c0_3.conda + sha256: f9e63492d5dd17f361878ce7efa1878de27225216b4e07990a6cb18c378014dc + md5: d55921ca3469224f689f974278107308 depends: - - __glibc >=2.17,<3.0.a0 - libgcc >=13 - - aws-c-common >=0.12.3,<0.12.4.0a0 + - __glibc >=2.17,<3.0.a0 - aws-c-http >=0.10.2,<0.10.3.0a0 - - aws-c-io >=0.20.0,<0.20.1.0a0 + - aws-c-io >=0.20.1,<0.20.2.0a0 + - aws-c-common >=0.12.3,<0.12.4.0a0 license: Apache-2.0 license_family: APACHE - size: 215828 - timestamp: 1749824561358 -- conda: https://conda.anaconda.org/conda-forge/linux-64/aws-c-s3-0.8.1-h3ef4824_2.conda - sha256: 6bcea3a79fce378ec29e34c19559187c86a8164f5f2577960d11777183bb8ec5 - md5: 0e6ed6b678271f3820eecc1cd414fde8 + size: 215867 + timestamp: 1750291920145 +- conda: https://conda.anaconda.org/conda-forge/linux-64/aws-c-s3-0.8.3-h5e174a9_0.conda + sha256: f4e7b200da5df7135cd087618fa30b2cd60cec0eebbd5570fb4c1e9a789dd9aa + md5: dea2540e57e8c1b949ca58ff4c7c0cbf depends: - __glibc >=2.17,<3.0.a0 - libgcc >=13 + - aws-c-io >=0.20.1,<0.20.2.0a0 + - openssl >=3.5.0,<4.0a0 + - aws-c-auth >=0.9.0,<0.9.1.0a0 + - aws-c-http >=0.10.2,<0.10.3.0a0 - aws-checksums >=0.2.7,<0.2.8.0a0 - aws-c-common >=0.12.3,<0.12.4.0a0 - - aws-c-auth >=0.9.0,<0.9.1.0a0 - - openssl >=3.5.0,<4.0a0 - - aws-c-io >=0.20.0,<0.20.1.0a0 - aws-c-cal >=0.9.2,<0.9.3.0a0 - - 
aws-c-http >=0.10.2,<0.10.3.0a0 license: Apache-2.0 - license_family: APACHE - size: 133911 - timestamp: 1749829860605 + size: 133960 + timestamp: 1750831815089 - conda: https://conda.anaconda.org/conda-forge/linux-64/aws-c-sdkutils-0.2.4-hafb2847_0.conda sha256: 18c588c386e21e2a926c6f3c1ba7aaf69059ce1459a134f7c8c1ebfc68cf67ec md5: 65853df44b7e4029d978c50be888ed89 @@ -546,44 +1014,41 @@ packages: license_family: APACHE size: 76627 timestamp: 1747141741534 -- conda: https://conda.anaconda.org/conda-forge/linux-64/aws-crt-cpp-0.32.8-hf309a9c_5.conda - sha256: 6321d86ec964f22be9009cd486f54888a8cf823f824b1ae45984fce8afab469f - md5: 608d8f531f2d78deb8ef735405535468 +- conda: https://conda.anaconda.org/conda-forge/linux-64/aws-crt-cpp-0.32.10-hff780f1_1.conda + sha256: 9602a5199dccf257709afdef326abfde6e84c63862b7cee59979803c4d636840 + md5: 843f52366658086c4f0b0654afbf3730 depends: - - libstdcxx >=13 - - libgcc >=13 - __glibc >=2.17,<3.0.a0 + - libstdcxx >=13 - libgcc >=13 - - aws-c-sdkutils >=0.2.4,<0.2.5.0a0 - - aws-c-http >=0.10.2,<0.10.3.0a0 - - aws-c-common >=0.12.3,<0.12.4.0a0 - - aws-c-event-stream >=0.5.4,<0.5.5.0a0 - - aws-c-io >=0.20.0,<0.20.1.0a0 - aws-c-mqtt >=0.13.1,<0.13.2.0a0 - - aws-c-cal >=0.9.2,<0.9.3.0a0 + - aws-c-event-stream >=0.5.4,<0.5.5.0a0 - aws-c-auth >=0.9.0,<0.9.1.0a0 - - aws-c-s3 >=0.8.1,<0.8.2.0a0 + - aws-c-s3 >=0.8.3,<0.8.4.0a0 + - aws-c-http >=0.10.2,<0.10.3.0a0 + - aws-c-sdkutils >=0.2.4,<0.2.5.0a0 + - aws-c-cal >=0.9.2,<0.9.3.0a0 + - aws-c-common >=0.12.3,<0.12.4.0a0 + - aws-c-io >=0.20.1,<0.20.2.0a0 license: Apache-2.0 - license_family: APACHE - size: 395097 - timestamp: 1749834528362 -- conda: https://conda.anaconda.org/conda-forge/linux-64/aws-sdk-cpp-1.11.510-h4607db7_10.conda - sha256: b340651ca85500d9adc2ac639bc484a500f1ba8eb8a1e8e224799e3f1b4cfca7 - md5: 96f240f245fe2e031ec59dbb3044bd6c + size: 399987 + timestamp: 1750855462459 +- conda: https://conda.anaconda.org/conda-forge/linux-64/aws-sdk-cpp-1.11.510-h937e755_11.conda + 
sha256: 6febc586060ed0458a1cfbb8d6ceae94f5dd29e18315c4f0bc239df2b07715df + md5: 11264cb7d5ad4c27d3eaffc909839698 depends: - - libstdcxx >=13 - - libgcc >=13 - __glibc >=2.17,<3.0.a0 - libgcc >=13 - - libcurl >=8.14.0,<9.0a0 + - libstdcxx >=13 + - libgcc >=13 - aws-c-event-stream >=0.5.4,<0.5.5.0a0 - - libzlib >=1.3.1,<2.0a0 - - aws-crt-cpp >=0.32.8,<0.32.9.0a0 + - libcurl >=8.14.1,<9.0a0 - aws-c-common >=0.12.3,<0.12.4.0a0 + - libzlib >=1.3.1,<2.0a0 + - aws-crt-cpp >=0.32.10,<0.32.11.0a0 license: Apache-2.0 - license_family: APACHE - size: 3401506 - timestamp: 1748938911866 + size: 3401544 + timestamp: 1750844008776 - conda: https://conda.anaconda.org/conda-forge/linux-64/azure-core-cpp-1.14.0-h5cfcd09_0.conda sha256: fe07debdb089a3db17f40a7f20d283d75284bb4fc269ef727b8ba6fc93f7cb5a md5: 0a8838771cc2e985cd295e01ae83baf1 @@ -660,6 +1125,25 @@ packages: license_family: MIT size: 18816 timestamp: 1733771192649 +- conda: https://conda.anaconda.org/conda-forge/linux-64/binutils_impl_linux-64-2.43-h4bf12b8_5.conda + sha256: 27ae158d415ff2942214b32ac7952e642f0f4c2a45ab683691e2a9a9159f868c + md5: 18852d82df8e5737e320a8731ace51b9 + depends: + - ld_impl_linux-64 2.43 h712a8e2_5 + - sysroot_linux-64 + license: GPL-3.0-only + license_family: GPL + size: 6376971 + timestamp: 1749852878015 +- conda: https://conda.anaconda.org/conda-forge/linux-64/binutils_linux-64-2.43-h4852527_5.conda + sha256: fccbb1974d5557cd5bd4dfccc13c0d15ca198c6a45c2124341dea8c952538512 + md5: 327ef163ac88b57833c1c1a20a9e7e0d + depends: + - binutils_impl_linux-64 2.43 h4bf12b8_5 + license: GPL-3.0-only + license_family: GPL + size: 36038 + timestamp: 1749852914153 - conda: https://conda.anaconda.org/conda-forge/linux-64/brotli-python-1.1.0-py312h2ec8cdc_3.conda sha256: dc27c58dc717b456eee2d57d8bc71df3f562ee49368a2351103bc8f1b67da251 md5: a32e0c069f6c3dcac635f7b0b0dac67e @@ -763,103 +1247,529 @@ packages: license: Python-2.0 size: 45852 timestamp: 1749047748072 -- conda: 
https://conda.anaconda.org/conda-forge/linux-64/cuda-crt-tools-12.9.86-ha770c72_1.conda - sha256: 4475409f91176c0a77ead29e961617366ef1fbe932c7315abdd5699ad134f0be - md5: ba98092d1090d5f5ddd2d7f827e7d3a5 +- conda: https://conda.anaconda.org/nvidia/linux-64/cuda-cccl_linux-64-12.9.27-0.conda + sha256: 5ea225926517655bd27b723870a3cfa2c6248d634fd6cabc87553306a34113b9 + md5: 2aa00c417be8ce0fa4024464d4eb162c + depends: + - cuda-version >=12.9,<12.10.0a0 + license: LicenseRef-NVIDIA-End-User-License-Agreement + size: 1145685 + timestamp: 1742008547224 +- conda: https://conda.anaconda.org/nvidia/linux-64/cuda-command-line-tools-12.9.0-0.conda + sha256: 403b7cba82ca471f66cd4f4bc751b9a11fa728f6ea6a044a8487521ab05d4ca3 + md5: 8856251a02efc622fc555d126a95fca6 + depends: + - cuda-cupti-dev 12.9.19.* + - cuda-gdb 12.9.19.* + - cuda-nvdisasm 12.9.19.* + - cuda-nvprof 12.9.19.* + - cuda-nvtx 12.9.19.* + - cuda-sanitizer-api 12.9.27.* + license: LicenseRef-NVIDIA-End-User-License-Agreement + size: 17002 + timestamp: 1745696170904 +- conda: https://conda.anaconda.org/nvidia/linux-64/cuda-compiler-12.9.0-0.conda + sha256: c71d18dbc656b815677f37581489efc3dd1656fc1a87d5238fcf24d74ad605b4 + md5: c08481bd6c994ae7e0ee6e24a2933728 + depends: + - __linux + - cuda-cuobjdump 12.9.26.* + - cuda-cuxxfilt 12.9.19.* + - cuda-nvcc 12.9.41.* + - cuda-nvprune 12.9.19.* + license: LicenseRef-NVIDIA-End-User-License-Agreement + size: 16990 + timestamp: 1745696180161 +- conda: https://conda.anaconda.org/nvidia/noarch/cuda-crt-dev_linux-64-12.9.41-0.conda + sha256: 10941d0a8530e0c7bf3cb08ba6ad9f6aafcccad79f0b58ab1949c74a754129fc + md5: 62905d6cd7884e080cfc2585ccf6855d + depends: + - cuda-version >=12.9,<12.10.0a0 + license: LicenseRef-NVIDIA-End-User-License-Agreement + size: 84845 + timestamp: 1744257165738 +- conda: https://conda.anaconda.org/nvidia/linux-64/cuda-crt-tools-12.9.41-0.conda + sha256: 14c627e330a421838553509b4f69192d7a3c4a6fdf92296dad0f80948a42ab57 + md5: 2f1f259ad44488e3f3678a88d155e1f5 
depends: - cuda-version >=12.9,<12.10.0a0 license: LicenseRef-NVIDIA-End-User-License-Agreement - size: 28928 - timestamp: 1749226545023 -- conda: https://conda.anaconda.org/conda-forge/linux-64/cuda-cudart-12.9.79-h5888daf_0.conda - sha256: 57d1294ecfaf9dc8cdb5fc4be3e63ebc7614538bddb5de53cfd9b1b7de43aed5 - md5: cb15315d19b58bd9cd424084e58ad081 + size: 19407 + timestamp: 1744257168866 +- conda: https://conda.anaconda.org/nvidia/linux-64/cuda-cudart-12.9.37-0.conda + sha256: 45a5d49fad313f1992d1fd502014a8bd003965fa8d09f496d5d21c94658952f4 + md5: 635ba485b837cb66010b5564b07eb1a7 depends: - __glibc >=2.17,<3.0.a0 - - cuda-cudart_linux-64 12.9.79 h3f2d84a_0 + - cuda-cudart_linux-64 12.9.37 0 - cuda-version >=12.9,<12.10.0a0 - - libgcc >=13 - - libstdcxx >=13 + - libgcc-ng >=11.2.0 + - libstdcxx-ng >=11.2.0 license: LicenseRef-NVIDIA-End-User-License-Agreement - size: 23242 - timestamp: 1749218416505 -- conda: https://conda.anaconda.org/conda-forge/noarch/cuda-cudart_linux-64-12.9.79-h3f2d84a_0.conda - sha256: 6cde0ace2b995b49d0db2eefb7bc30bf00ffc06bb98ef7113632dec8f8907475 - md5: 64508631775fbbf9eca83c84b1df0cae + size: 17647 + timestamp: 1743653143801 +- conda: https://conda.anaconda.org/nvidia/linux-64/cuda-cudart-dev-12.9.37-0.conda + sha256: c7fbd64f98cbb7a46a780c4b10464a4a8cb549701851b94f48382e6e7110df1e + md5: 7619e68477cb3e03985771c93f7c7581 depends: + - __glibc >=2.17,<3.0.a0 + - cuda-cudart 12.9.37 0 + - cuda-cudart-dev_linux-64 12.9.37 0 + - cuda-cudart-static 12.9.37 0 + - cuda-version >=12.9,<12.10.0a0 + - libgcc-ng >=11.2.0 + - libstdcxx-ng >=11.2.0 + license: LicenseRef-NVIDIA-End-User-License-Agreement + size: 17725 + timestamp: 1743653160315 +- conda: https://conda.anaconda.org/nvidia/noarch/cuda-cudart-dev_linux-64-12.9.37-0.conda + sha256: ad2b3a35de82fb890c60eb5f6d7687d60997c78560311648795b3cc64869b3d1 + md5: fda9dff9052604ef158b94f26b6d5730 + depends: + - cuda-cccl_linux-64 + - cuda-cudart-static_linux-64 + - cuda-cudart_linux-64 - cuda-version 
>=12.9,<12.10.0a0 license: LicenseRef-NVIDIA-End-User-License-Agreement - size: 197249 - timestamp: 1749218394213 -- conda: https://conda.anaconda.org/conda-forge/linux-64/cuda-cuobjdump-12.9.82-hbd13f7d_0.conda - sha256: a4f37cd8823d209639bdda1eea3ee0eb01040e44e2480c2f393e684c472c2f0c - md5: 667a138d80047e7869f5330087772fd7 + size: 382557 + timestamp: 1743653147617 +- conda: https://conda.anaconda.org/nvidia/linux-64/cuda-cudart-static-12.9.37-0.conda + sha256: 659ed53654964f44374ea2b5576f832de16dd6da03bce7c2bb5fc32f3b69205b + md5: 7e39087121bfbdec3e7c00c075b50945 + depends: + - __glibc >=2.17,<3.0.a0 + - cuda-cudart-static_linux-64 12.9.37 0 + - cuda-version >=12.9,<12.10.0a0 + - libgcc-ng >=11.2.0 + - libstdcxx-ng >=11.2.0 + license: LicenseRef-NVIDIA-End-User-License-Agreement + size: 17342 + timestamp: 1743653152780 +- conda: https://conda.anaconda.org/nvidia/noarch/cuda-cudart-static_linux-64-12.9.37-0.conda + sha256: cefbf233e8d1d22199f8a6aed92bf7dba03fb8df1d75b44a5b23950d69923a99 + md5: ac15e331561827baaa6516322d46f92e + depends: + - cuda-version >=12.9,<12.10.0a0 + license: LicenseRef-NVIDIA-End-User-License-Agreement + size: 1141480 + timestamp: 1743653128918 +- conda: https://conda.anaconda.org/nvidia/noarch/cuda-cudart_linux-64-12.9.37-0.conda + sha256: cad0e59ee2a89fd47e857aa21579fe559f86712c608832ebd2c2478799b46f57 + md5: b03b34b41c106ec17ce48858127ca3a2 + depends: + - cuda-version >=12.9,<12.10.0a0 + license: LicenseRef-NVIDIA-End-User-License-Agreement + size: 191712 + timestamp: 1743653136384 +- conda: https://conda.anaconda.org/nvidia/linux-64/cuda-cuobjdump-12.9.26-1.conda + sha256: 7c56e799048f7331c177dca92b5218db50881ae2139e6c65bc860908a7ca2f84 + md5: 0c583c4e36e536de475801fafd04b962 depends: - __glibc >=2.17,<3.0.a0 - cuda-nvdisasm - cuda-version >=12.9,<12.10.0a0 - - libgcc >=13 - - libstdcxx >=13 + - libgcc-ng >=11.2.0 + - libstdcxx-ng >=11.2.0 license: LicenseRef-NVIDIA-End-User-License-Agreement - size: 243219 - timestamp: 1749223489014 -- 
conda: https://conda.anaconda.org/conda-forge/linux-64/cuda-cupti-12.9.79-h9ab20c4_0.conda - sha256: 55922005d1b31ba090455ab39d2e5a9b771fe503713d4b7699752a76aedccb2b - md5: 229b3cc1f6b6b633923e1c9856ee0d80 + size: 239720 + timestamp: 1741948214047 +- conda: https://conda.anaconda.org/nvidia/linux-64/cuda-cupti-12.9.19-0.conda + sha256: 85ec556a95e7578aaba7ae186c0a2119ca49e45fdbf5de1444f6b55472ea4c3d + md5: f53df2e35fc1bdf6f85a262934d9cc7f depends: - - __glibc >=2.28,<3.0.a0 + - __glibc >=2.17,<3.0.a0 - cuda-version >=12.9,<12.10.0a0 - - libgcc >=13 - - libstdcxx >=13 + - libgcc-ng >=11.2.0 + - libstdcxx-ng >=11.2.0 license: LicenseRef-NVIDIA-End-User-License-Agreement - size: 1842820 - timestamp: 1749218443367 -- conda: https://conda.anaconda.org/conda-forge/linux-64/cuda-nvcc-tools-12.9.86-he02047a_1.conda - sha256: 7e5ab4ae67254c6d814007708a8183355684c81a917b383a7f042c25149737c3 - md5: a076f1ec812ce8fceacd538d6e672f37 + size: 1869276 + timestamp: 1741061884298 +- conda: https://conda.anaconda.org/nvidia/linux-64/cuda-cupti-dev-12.9.19-0.conda + sha256: 5404c8379a428c6cafb1473fe04a4d4ddf90db45371c37d132336394c0e6d1f1 + md5: 0197fa523113e10a0a63c1a76d7643aa depends: - __glibc >=2.17,<3.0.a0 - - cuda-crt-tools 12.9.86 ha770c72_1 - - cuda-nvvm-tools 12.9.86 he02047a_1 + - cuda-cupti 12.9.19 0 - cuda-version >=12.9,<12.10.0a0 - - libgcc >=12 - - libstdcxx >=12 + - libgcc-ng >=11.2.0 + - libstdcxx-ng >=11.2.0 constrains: - - gcc_impl_linux-64 >=6,<15.0a0 + - cuda-cupti-static >=12.9.19 license: LicenseRef-NVIDIA-End-User-License-Agreement - size: 27490340 - timestamp: 1749226666055 -- conda: https://conda.anaconda.org/conda-forge/linux-64/cuda-nvdisasm-12.9.88-hbd13f7d_0.conda - sha256: 6ef7c122897a9e27bc3aaed1745ea03bfecb5f553d420b0e4bf2ef6f568aab81 - md5: 7e9e4991e5890f32e8ef3c9a971171df + size: 4664106 + timestamp: 1741061932420 +- conda: https://conda.anaconda.org/nvidia/linux-64/cuda-cuxxfilt-12.9.19-1.conda + sha256: 
d16e055c0dce78156a8ff8fb5cee44a988873fbf89d4997a55064ef6561c2863 + md5: ed9824c59f95905e9f66255826bb238a depends: - __glibc >=2.17,<3.0.a0 - cuda-version >=12.9,<12.10.0a0 - - libgcc >=13 - - libstdcxx >=13 + - libgcc-ng >=11.2.0 + - libstdcxx-ng >=11.2.0 license: LicenseRef-NVIDIA-End-User-License-Agreement - size: 5517799 - timestamp: 1749221325784 -- conda: https://conda.anaconda.org/conda-forge/linux-64/cuda-nvvm-tools-12.9.86-he02047a_1.conda - sha256: 0958aee5a72f4be02c8f988539261cf549c9fcd6b61c6ce895bc6a13fe61f5d6 - md5: f716064b73c93d9aab74b5cc7f57985d + size: 213445 + timestamp: 1741063395108 +- conda: https://conda.anaconda.org/nvidia/linux-64/cuda-driver-dev-12.9.37-0.conda + sha256: a3bbd993d5a8fdb91e3cd2bc3864f8f504b9df1b5d9e00948202e8321fb9bfd6 + md5: 2cf54d45b785b4fbd9feb09521a30836 depends: - __glibc >=2.17,<3.0.a0 + - cuda-driver-dev_linux-64 - cuda-version >=12.9,<12.10.0a0 - - libgcc >=12 - - libstdcxx >=12 + - libgcc-ng >=11.2.0 + - libstdcxx-ng >=11.2.0 license: LicenseRef-NVIDIA-End-User-License-Agreement - size: 24248725 - timestamp: 1749226615764 -- conda: https://conda.anaconda.org/conda-forge/noarch/cuda-version-12.9-h4f385c5_3.conda - sha256: 5f5f428031933f117ff9f7fcc650e6ea1b3fef5936cf84aa24af79167513b656 - md5: b6d5d7f1c171cbd228ea06b556cfa859 + size: 17165 + timestamp: 1743653156541 +- conda: https://conda.anaconda.org/nvidia/noarch/cuda-driver-dev_linux-64-12.9.79-0.conda + sha256: 335ab74cb54b1e7230407a26535d7511ff9dc557a85a005c0144a5be002cbbfa + md5: 2e8dfd3518d90e296282dbb9e68631a1 + depends: + - cuda-version >=12.9,<12.10.0a0 + license: LicenseRef-NVIDIA-End-User-License-Agreement + size: 32177 + timestamp: 1747087792527 +- conda: https://conda.anaconda.org/nvidia/linux-64/cuda-gdb-12.9.19-1.conda + sha256: 71a78a3aa81be80c547fa78094971dfde33bb7e3f18fb2028fa490091aa2d98c + md5: a4fd98bb1ba698ccb98cc7d7e371ffb1 + depends: + - __glibc >=2.17,<3.0.a0 + - cuda-version >=12.9,<12.10.0a0 + - gmp >=6.3.0,<7.0a0 + - libgcc-ng >=11.2.0 + - 
libstdcxx-ng >=11.2.0 + license: LicenseRef-NVIDIA-End-User-License-Agreement + size: 382347 + timestamp: 1741063634158 +- conda: https://conda.anaconda.org/nvidia/linux-64/cuda-libraries-12.9.0-0.conda + sha256: e2d660ff077faa4c11cde7f9675f8dfee2e5182edddf997d2593f16bcd721a97 + md5: 4265e9a78a4dc5d4e4322324d6399d91 + depends: + - cuda-cudart 12.9.37.* + - cuda-nvrtc 12.9.41.* + - cuda-opencl 12.9.19.* + - libcublas 12.9.0.13.* + - libcufft 11.4.0.6.* + - libcufile 1.14.0.30.* + - libcurand 10.3.10.19.* + - libcusolver 11.7.4.40.* + - libcusparse 12.5.9.5.* + - libnpp 12.4.0.27.* + - libnvfatbin 12.9.19.* + - libnvjitlink 12.9.41.* + - libnvjpeg 12.4.0.16.* + license: LicenseRef-NVIDIA-End-User-License-Agreement + size: 17021 + timestamp: 1745696198849 +- conda: https://conda.anaconda.org/nvidia/linux-64/cuda-libraries-dev-12.9.0-0.conda + sha256: 8c461df38927f2051e5d923068ee400a4b33709da9f76a1ae1aa6510be9286dd + md5: 29001c9cec737f0ce1572687894e0f29 + depends: + - cuda-cccl_linux-64 12.9.27.* + - cuda-cudart-dev 12.9.37.* + - cuda-driver-dev 12.9.37.* + - cuda-nvrtc-dev 12.9.41.* + - cuda-opencl-dev 12.9.19.* + - cuda-profiler-api 12.9.19.* + - libcublas-dev 12.9.0.13.* + - libcufft-dev 11.4.0.6.* + - libcufile-dev 1.14.0.30.* + - libcurand-dev 10.3.10.19.* + - libcusolver-dev 11.7.4.40.* + - libcusparse-dev 12.5.9.5.* + - libnpp-dev 12.4.0.27.* + - libnvfatbin-dev 12.9.19.* + - libnvjitlink-dev 12.9.41.* + - libnvjpeg-dev 12.4.0.16.* + license: LicenseRef-NVIDIA-End-User-License-Agreement + size: 17077 + timestamp: 1745696208234 +- conda: https://conda.anaconda.org/nvidia/linux-64/cuda-nsight-12.9.19-0.conda + sha256: 1fdbcf0ccdc556d78bc842d54835c3ca118ff6dd9924e8843c82c7f46f7c56aa + md5: cc655ace54cdd3fdf6c39661e19003af + depends: + - cuda-version >=12.9,<12.10.0a0 + license: LicenseRef-NVIDIA-End-User-License-Agreement + size: 118702209 + timestamp: 1741061205454 +- conda: https://conda.anaconda.org/nvidia/linux-64/cuda-nvcc-12.9.41-0.conda + sha256: 
6b9cbc2f9dd9c6c3322fd28d757f0c001a40f7967f5426644d740212e7dadd31 + md5: 0bec1b9dd8d45cca7106f1a9b0647373 + depends: + - cuda-nvcc_linux-64 12.9.41.* + - gcc_linux-64 + - gxx_linux-64 + license: LicenseRef-NVIDIA-End-User-License-Agreement + size: 16929 + timestamp: 1744257369989 +- conda: https://conda.anaconda.org/nvidia/noarch/cuda-nvcc-dev_linux-64-12.9.41-0.conda + sha256: 934a8eb8bdc7f37d648842eaeb1d5f987bd567c2cddc1382598c6e391b7714f3 + md5: 796eec03642b18940cd4351c2e1ddda4 + depends: + - cuda-crt-dev_linux-64 12.9.41 0 + - cuda-nvvm-dev_linux-64 12.9.41 0 + - cuda-version >=12.9,<12.10.0a0 + - libgcc >=6 + constrains: + - gcc_impl_linux-64 >=6,<14.0a0 + license: LicenseRef-NVIDIA-End-User-License-Agreement + size: 14438005 + timestamp: 1744257305744 +- conda: https://conda.anaconda.org/nvidia/linux-64/cuda-nvcc-impl-12.9.41-0.conda + sha256: f3e2d35a0fd8d86cd5859b4271273dd7a50cf10339aa2a3470c45ba067a12abd + md5: f8a4d89d232d42dcd7bdda28a0bae614 + depends: + - cuda-cudart-dev + - cuda-nvcc-dev_linux-64 12.9.41 0 + - cuda-nvcc-tools 12.9.41 0 + - cuda-nvvm-impl 12.9.41 0 + - cuda-version >=12.9,<12.10.0a0 + constrains: + - gcc_impl_linux-64 >=6,<14.0a0 + license: LicenseRef-NVIDIA-End-User-License-Agreement + size: 18940 + timestamp: 1744257342691 +- conda: https://conda.anaconda.org/nvidia/linux-64/cuda-nvcc-tools-12.9.41-0.conda + sha256: d5d60464402ca2c8feca2c447c21b8550ece97fbc2638eee7cee0185e71d37c3 + md5: cd7d1d4103235449bc0f109a142b1a4f + depends: + - __glibc >=2.17,<3.0.a0 + - cuda-crt-tools 12.9.41 0 + - cuda-nvvm-tools 12.9.41 0 + - cuda-version >=12.9,<12.10.0a0 + - libgcc-ng >=11.2.0 + - libstdcxx-ng >=11.2.0 + constrains: + - gcc_impl_linux-64 >=6,<14.0a0 + license: LicenseRef-NVIDIA-End-User-License-Agreement + size: 27596051 + timestamp: 1744257260088 +- conda: https://conda.anaconda.org/nvidia/linux-64/cuda-nvcc_linux-64-12.9.41-0.conda + sha256: 4fd70c93bac1ccd2516c6d0ee1bc0507b0934529a4baf41bbbc497810bdd68ef + md5: 
0ea172a96ec49dcc5e02fa38db15d6ff + depends: + - __glibc >=2.17,<3.0.a0 + - cuda-cudart-dev_linux-64 12.9.* + - cuda-driver-dev_linux-64 12.9.* + - cuda-nvcc-dev_linux-64 12.9.41.* + - cuda-nvcc-impl 12.9.41.* + - cuda-nvcc-tools 12.9.41.* + - sysroot_linux-64 >=2.17,<3.0a0 + license: LicenseRef-NVIDIA-End-User-License-Agreement + size: 20075 + timestamp: 1744257369213 +- conda: https://conda.anaconda.org/nvidia/linux-64/cuda-nvdisasm-12.9.19-1.conda + sha256: 9e78ac249de5b06d0cc2e8f442697df821ab38d35f0d3c5a8e3e47bbec2c7e98 + md5: 93f5c953ac1de233ac54f3c9e34b4b14 + depends: + - __glibc >=2.17,<3.0.a0 + - cuda-version >=12.9,<12.10.0a0 + - libgcc-ng >=11.2.0 + - libstdcxx-ng >=11.2.0 + license: LicenseRef-NVIDIA-End-User-License-Agreement + size: 5514288 + timestamp: 1741063585300 +- conda: https://conda.anaconda.org/nvidia/linux-64/cuda-nvml-dev-12.9.40-1.conda + sha256: 0145fc35987bbc40bd1b2b8da3bb778826d1c2fca9c169ec3004bfd445009d5e + md5: 0e0e8e84fea017c52aa08fb8f1f46973 + depends: + - __glibc >=2.17,<3.0.a0 + - cuda-version >=12.9,<12.10.0a0 + - libgcc-ng >=11.2.0 + - libstdcxx-ng >=11.2.0 + license: LicenseRef-NVIDIA-End-User-License-Agreement + size: 138155 + timestamp: 1744082073263 +- conda: https://conda.anaconda.org/nvidia/linux-64/cuda-nvprof-12.9.19-0.conda + sha256: 522005ca37f8cca252ca64cee4282c128489472dca954f7afe4cf3cd5091bcd3 + md5: 4209eb3bc39ec5d74f6b664c0cec0869 + depends: + - __glibc >=2.17,<3.0.a0 + - cuda-cupti + - cuda-version >=12.9,<12.10.0a0 + - libgcc-ng >=11.2.0 + - libstdcxx-ng >=11.2.0 + license: LicenseRef-NVIDIA-End-User-License-Agreement + size: 2623420 + timestamp: 1741062036390 +- conda: https://conda.anaconda.org/nvidia/linux-64/cuda-nvprune-12.9.19-1.conda + sha256: 410dc8231a1f90596a27ea406e96f037428577ee668ea966af868a1523e95de3 + md5: ae1a0556a8165de439782b8e4d4b896c + depends: + - __glibc >=2.17,<3.0.a0 + - cuda-version >=12.9,<12.10.0a0 + - libgcc-ng >=11.2.0 + - libstdcxx-ng >=11.2.0 + license: 
LicenseRef-NVIDIA-End-User-License-Agreement + size: 67011 + timestamp: 1741063507094 +- conda: https://conda.anaconda.org/nvidia/linux-64/cuda-nvrtc-12.9.41-0.conda + sha256: ca42ea9eb92982b0110259a9e13742a0dbe4a6c80acde2d84de8a815b4f1a087 + md5: 7afa7c7fd452a205098a70b3b7676bc9 + depends: + - __glibc >=2.17,<3.0.a0 + - cuda-version >=12.9,<12.10.0a0 + - libgcc-ng >=11.2.0 + - libstdcxx-ng >=11.2.0 + license: LicenseRef-NVIDIA-End-User-License-Agreement + size: 67357022 + timestamp: 1744256457761 +- conda: https://conda.anaconda.org/nvidia/linux-64/cuda-nvrtc-dev-12.9.41-0.conda + sha256: 30aca46022d7a3a51e9b8ac1c94ffae3b69c0b4be41c39b6535e90b02727c6ee + md5: 492f4838ebf20c984cc723baaba7032a + depends: + - __glibc >=2.17,<3.0.a0 + - cuda-nvrtc 12.9.41 0 + - cuda-version >=12.9,<12.10.0a0 + - libgcc-ng >=11.2.0 + - libstdcxx-ng >=11.2.0 + constrains: + - cuda-nvrtc-static >=12.9.41 + license: LicenseRef-NVIDIA-End-User-License-Agreement + size: 30612 + timestamp: 1744256698276 +- conda: https://conda.anaconda.org/nvidia/linux-64/cuda-nvtx-12.9.19-0.conda + sha256: 3970c2af085180beb5d936c3e30b4601f383df1f431676fc3454915c0c1673fe + md5: 72462de491962dd3808fa3b9c26f3b30 + depends: + - __glibc >=2.17,<3.0.a0 + - cuda-version >=12.9,<12.10.0a0 + - libgcc-ng >=11.2.0 + - libstdcxx-ng >=11.2.0 + license: LicenseRef-NVIDIA-End-User-License-Agreement + size: 24491 + timestamp: 1741065484136 +- conda: https://conda.anaconda.org/nvidia/noarch/cuda-nvvm-dev_linux-64-12.9.41-0.conda + sha256: c86deebc8611caabeab6d3ed2edfc5b92fcfc57a03b9fd0c140d264cd62060ab + md5: a3ee4e064b1e9cfb43d190f8ac10c522 + depends: + - cuda-version >=12.9,<12.10.0a0 + license: LicenseRef-NVIDIA-End-User-License-Agreement + size: 17446 + timestamp: 1744257171593 +- conda: https://conda.anaconda.org/nvidia/linux-64/cuda-nvvm-impl-12.9.41-0.conda + sha256: 9cc40d07578e0293284186e0eb8fb9cec11428afbeb4a12bfb6606ccc449c2da + md5: c795bcefe64ef4f714e5982449e99ad7 + depends: + - __glibc >=2.17,<3.0.a0 + - 
cuda-version >=12.9,<12.10.0a0 + - libgcc-ng >=11.2.0 + - libstdcxx-ng >=11.2.0 + license: LicenseRef-NVIDIA-End-User-License-Agreement + size: 21426206 + timestamp: 1744257179503 +- conda: https://conda.anaconda.org/nvidia/linux-64/cuda-nvvm-tools-12.9.41-0.conda + sha256: 945384cfc9ff1545b01183992b94f8f482c0343825762a75d607a8e5d2d1f699 + md5: 368a3da068b6fbd6a6aa064e18dd11b6 + depends: + - __glibc >=2.17,<3.0.a0 + - cuda-version >=12.9,<12.10.0a0 + - libgcc-ng >=11.2.0 + - libstdcxx-ng >=11.2.0 + license: LicenseRef-NVIDIA-End-User-License-Agreement + size: 24283255 + timestamp: 1744257216974 +- conda: https://conda.anaconda.org/nvidia/linux-64/cuda-nvvp-12.9.19-1.conda + sha256: c4885f7955b1810b888175799f0aa279b90c512762aa27a59d824adb3088a4e9 + md5: f97f33f78bb5c1f9c489c153401d7aee + depends: + - __glibc >=2.17,<3.0.a0 + - cuda-nvdisasm + - cuda-nvprof + - cuda-version >=12.9,<12.10.0a0 + - libgcc-ng >=11.2.0 + - libstdcxx-ng >=11.2.0 + license: LicenseRef-NVIDIA-End-User-License-Agreement + size: 117833398 + timestamp: 1741063470595 +- conda: https://conda.anaconda.org/nvidia/linux-64/cuda-opencl-12.9.19-0.conda + sha256: f91739c5b97e2b31684a5c9eef99dce247c090796fc7293d5b0df3c2d9f54447 + md5: 3c20df1172e66379f7717bedf8ac16d4 + depends: + - __glibc >=2.17,<3.0.a0 + - cuda-version >=12.9,<12.10.0a0 + - libgcc-ng >=11.2.0 + - libstdcxx-ng >=11.2.0 + - ocl-icd >=2.3.2,<3.0a0 + license: LicenseRef-NVIDIA-End-User-License-Agreement + size: 25946 + timestamp: 1741064030231 +- conda: https://conda.anaconda.org/nvidia/linux-64/cuda-opencl-dev-12.9.19-0.conda + sha256: 9205a98b2c367adbfdf1a7ee14c40ce459c0687f6a33befdc40610c78536138f + md5: 657fc5d9fb70d1a68d635880be91e6b0 + depends: + - __glibc >=2.17,<3.0.a0 + - cuda-opencl 12.9.19 0 + - cuda-version >=12.9,<12.10.0a0 + - libgcc-ng >=11.2.0 + - libstdcxx-ng >=11.2.0 + license: LicenseRef-NVIDIA-End-User-License-Agreement + size: 92890 + timestamp: 1741064033745 +- conda: 
https://conda.anaconda.org/nvidia/linux-64/cuda-profiler-api-12.9.19-0.conda + sha256: 240aa9244a58446cf295de5f1b02639b2fb64cdbe2a8cec938f8ffd366f00b1c + md5: c6595f690818cc944b6c6345dd932c42 + depends: + - cuda-cudart-dev + - cuda-version >=12.9,<12.10.0a0 + license: LicenseRef-NVIDIA-End-User-License-Agreement + size: 19415 + timestamp: 1741065021093 +- conda: https://conda.anaconda.org/nvidia/linux-64/cuda-sanitizer-api-12.9.27-1.conda + sha256: 5a2649f9cebe89e80a1209c853e3b7741a275ffb525d2cda29959cb838572a1e + md5: 5f09e6e4c157d1936238b21ee242266d + depends: + - __glibc >=2.17,<3.0.a0 + - cuda-version >=12.9,<12.10.0a0 + - libgcc-ng >=11.2.0 + - libstdcxx-ng >=11.2.0 + license: LicenseRef-NVIDIA-End-User-License-Agreement + size: 9108684 + timestamp: 1742008676075 +- conda: https://conda.anaconda.org/nvidia/linux-64/cuda-toolkit-12.9.0-0.conda + sha256: b925076b1e44a2a5b413e5cf410cf76fee4b8eecc142992633b81b8ceeecfcff + md5: 94bd4c411787470a9c5d3ab4f03bd175 + depends: + - __linux + - cuda-compiler 12.9.0.* + - cuda-libraries 12.9.0.* + - cuda-libraries-dev 12.9.0.* + - cuda-nvml-dev 12.9.40.* + - cuda-tools 12.9.0.* + license: LicenseRef-NVIDIA-End-User-License-Agreement + size: 16933 + timestamp: 1745696266612 +- conda: https://conda.anaconda.org/nvidia/linux-64/cuda-tools-12.9.0-0.conda + sha256: 9794922404dec7215efd494174d1dc2f2b0c23393ab5cf4794b3e0076cbf7576 + md5: e4758fb1c982cb276b2a10471fb0ea88 + depends: + - cuda-command-line-tools 12.9.0.* + - cuda-visual-tools 12.9.0.* + - gds-tools 1.14.0.30.* + license: LicenseRef-NVIDIA-End-User-License-Agreement + size: 16921 + timestamp: 1745696257275 +- conda: https://conda.anaconda.org/nvidia/noarch/cuda-version-12.9-3.conda + sha256: d05c0d4e2d1fbf32275db798275b9a8b57e97c0748f4c073ba6c532fe839bb06 + md5: 40969c18662ba1d35292e70e8545ce90 constrains: - - cudatoolkit 12.9|12.9.* - __cuda >=12 + - cudatoolkit 12.9|12.9.* + license: LicenseRef-NVIDIA-End-User-License-Agreement + size: 17161 + timestamp: 
1748727273518 +- conda: https://conda.anaconda.org/nvidia/linux-64/cuda-visual-tools-12.9.0-0.conda + sha256: 6a64ef625be480c35470344b8f86041354ee8d1e3d55885920217aa05d29b63a + md5: 3a0dd025246efc6dba8c8012045c48e1 + depends: + - cuda-libraries-dev 12.9.0.* + - cuda-nsight 12.9.19.* + - cuda-nvml-dev 12.9.40.* + - cuda-nvvp 12.9.19.* + - nsight-compute 2025.2.0.11.* license: LicenseRef-NVIDIA-End-User-License-Agreement - size: 21578 - timestamp: 1746134436166 + size: 16987 + timestamp: 1745696228542 - conda: https://conda.anaconda.org/conda-forge/noarch/datasets-2.14.4-pyhd8ed1ab_0.conda sha256: 7e09bd083a609138b780fcc4535924cb96814d2c908a36d4c64a2ba9ee3efe7f md5: 3e087f072ce03c43a9b60522f5d0ca2f @@ -883,6 +1793,21 @@ packages: license_family: Apache size: 347303 timestamp: 1691593908658 +- conda: https://conda.anaconda.org/conda-forge/linux-64/dbus-1.16.2-h3c4dab8_0.conda + sha256: 3b988146a50e165f0fa4e839545c679af88e4782ec284cc7b6d07dd226d6a068 + md5: 679616eb5ad4e521c83da4650860aba7 + depends: + - libstdcxx >=13 + - libgcc >=13 + - __glibc >=2.17,<3.0.a0 + - libgcc >=13 + - libexpat >=2.7.0,<3.0a0 + - libzlib >=1.3.1,<2.0a0 + - libglib >=2.84.2,<3.0a0 + license: GPL-2.0-or-later + license_family: GPL + size: 437860 + timestamp: 1747855126005 - conda: https://conda.anaconda.org/conda-forge/noarch/deprecated-1.2.18-pyhd8ed1ab_0.conda sha256: d614bcff10696f1efc714df07651b50bf3808401fcc03814309ecec242cc8870 md5: 0cef44b1754ae4d6924ac0eef6b9fdbe @@ -948,9 +1873,20 @@ packages: license: MIT and PSF-2.0 size: 21284 timestamp: 1746947398083 -- conda: https://conda.anaconda.org/conda-forge/noarch/fastapi-0.115.13-pyhe01879c_0.conda - sha256: b4e57f05081f093d686b1286014f0b82159e0c7619a41cc3310eeadc1dc21dc7 - md5: 0e61e817af37098262b5ae1f4e04988b +- conda: https://conda.anaconda.org/conda-forge/linux-64/expat-2.7.0-h5888daf_0.conda + sha256: dd5530ddddca93b17318838b97a2c9d7694fa4d57fc676cf0d06da649085e57a + md5: d6845ae4dea52a2f90178bf1829a21f8 + depends: + - __glibc 
>=2.17,<3.0.a0 + - libexpat 2.7.0 h5888daf_0 + - libgcc >=13 + license: MIT + license_family: MIT + size: 140050 + timestamp: 1743431809745 +- conda: https://conda.anaconda.org/conda-forge/noarch/fastapi-0.115.14-pyhe01879c_0.conda + sha256: 4e1d1aabe3199033c9c5a47176b0b4e0cd40621156fc72f706047c2348dd72ff + md5: 8f4fcc62c241e372495c19fe6f8b1908 depends: - python >=3.9 - starlette >=0.40.0,<0.47.0 @@ -964,8 +1900,8 @@ packages: - uvicorn-standard >=0.12.0 - python license: MIT - size: 78221 - timestamp: 1750180794637 + size: 78363 + timestamp: 1750986285010 - conda: https://conda.anaconda.org/conda-forge/noarch/fastapi-cli-0.0.7-pyhd8ed1ab_0.conda sha256: 300683731013b7221922339cd40430bb3c2ddeeb658fd7e37f5099ffe64e4db0 md5: d960e0ea9e1c561aa928f6c4439f04c7 @@ -986,6 +1922,29 @@ packages: license: Unlicense size: 17887 timestamp: 1741969612334 +- conda: https://conda.anaconda.org/conda-forge/linux-64/fontconfig-2.15.0-h7e30c49_1.conda + sha256: 7093aa19d6df5ccb6ca50329ef8510c6acb6b0d8001191909397368b65b02113 + md5: 8f5b0b297b59e1ac160ad4beec99dbee + depends: + - __glibc >=2.17,<3.0.a0 + - freetype >=2.12.1,<3.0a0 + - libexpat >=2.6.3,<3.0a0 + - libgcc >=13 + - libuuid >=2.38.1,<3.0a0 + - libzlib >=1.3.1,<2.0a0 + license: MIT + license_family: MIT + size: 265599 + timestamp: 1730283881107 +- conda: https://conda.anaconda.org/conda-forge/linux-64/freetype-2.13.3-ha770c72_1.conda + sha256: 7ef7d477c43c12a5b4cddcf048a83277414512d1116aba62ebadfa7056a7d84f + md5: 9ccd736d31e0c6e41f54e704e5312811 + depends: + - libfreetype 2.13.3 ha770c72_1 + - libfreetype6 2.13.3 h48d6fc4_1 + license: GPL-2.0-only OR FTL + size: 172450 + timestamp: 1745369996765 - conda: https://conda.anaconda.org/conda-forge/linux-64/frozenlist-1.6.0-py312hb9e946c_0.conda sha256: 685ef959d9f3ceeb2bd0dbda36b4bdcfb6e3ae7d1a7cc2c364de543cc28c597f md5: 13290e5d9cb327b1b61c1bd8089ac920 @@ -1008,6 +1967,44 @@ packages: license_family: BSD size: 145521 timestamp: 1748101667956 +- conda: 
https://conda.anaconda.org/conda-forge/linux-64/gcc_impl_linux-64-13.3.0-h1e990d8_2.conda + sha256: c3e9f243ea8292eecad78bb200d8f5b590e0f82bf7e7452a3a7c8df4eea6f774 + md5: f46cf0acdcb6019397d37df1e407ab91 + depends: + - binutils_impl_linux-64 >=2.40 + - libgcc >=13.3.0 + - libgcc-devel_linux-64 13.3.0 hc03c837_102 + - libgomp >=13.3.0 + - libsanitizer 13.3.0 he8ea267_2 + - libstdcxx >=13.3.0 + - sysroot_linux-64 + license: GPL-3.0-only WITH GCC-exception-3.1 + license_family: GPL + size: 66770653 + timestamp: 1740240400031 +- conda: https://conda.anaconda.org/conda-forge/linux-64/gcc_linux-64-13.3.0-h6f18a23_11.conda + sha256: b2533388ec510ef0fc95774f15fdfb89582623049494506ea27622333f90bc09 + md5: 639ef869618e311eee4888fcb40747e2 + depends: + - binutils_linux-64 + - gcc_impl_linux-64 13.3.0.* + - sysroot_linux-64 + license: BSD-3-Clause + license_family: BSD + size: 32538 + timestamp: 1748905867619 +- conda: https://conda.anaconda.org/nvidia/linux-64/gds-tools-1.14.0.30-4.conda + sha256: 9beeb9efec4cff2411a7c40d2c5bac6ae6dca346fc9cc11181e7dbb8ef6f5def + md5: a299ca238f0b2088eb597a1752d41d86 + depends: + - __glibc >=2.17,<3.0.a0 + - cuda-version >=12.9,<12.10.0a0 + - libcufile >=1.14.0.30,<2.0a0 + - libgcc-ng >=11.2.0 + - libstdcxx-ng >=11.2.0 + license: LicenseRef-NVIDIA-End-User-License-Agreement + size: 39613442 + timestamp: 1744072718742 - conda: https://conda.anaconda.org/conda-forge/linux-64/gettext-0.24.1-h5888daf_0.conda sha256: 88db27c666e1f8515174bf622a3e2ad983c94d69e3a23925089e476b9b06ad00 md5: c63e7590d4d6f4c85721040ed8b12888 @@ -1044,9 +2041,9 @@ packages: license_family: BSD size: 119654 timestamp: 1726600001928 -- conda: https://conda.anaconda.org/conda-forge/noarch/gguf-0.17.0-pyhc364b38_0.conda - sha256: bc56f4a861df32e353c8fabe4041df92c4af788c04931ddfbe67fabe51a21ffb - md5: 8178b00203c262c93ed3c88b792481cd +- conda: https://conda.anaconda.org/conda-forge/noarch/gguf-0.17.1-pyhc364b38_0.conda + sha256: 
06aa364c6ce109e21858fc016a430c22f738fe6377c67944504df7fc0da3ec20 + md5: aaaa7074fd79c4e1e79b3e1af5a77efa depends: - python >=3.8 - numpy >=1.17 @@ -1056,8 +2053,8 @@ packages: - python license: MIT license_family: MIT - size: 91133 - timestamp: 1748532296134 + size: 92085 + timestamp: 1750400728782 - conda: https://conda.anaconda.org/conda-forge/linux-64/glog-0.7.1-hbabe93e_0.conda sha256: dc824dc1d0aa358e28da2ecbbb9f03d932d976c8dca11214aa1dcdfcbd054ba2 md5: ff862eebdfeb2fd048ae9dc92510baca @@ -1148,6 +2145,30 @@ packages: license_family: APACHE size: 234597 timestamp: 1745229889814 +- conda: https://conda.anaconda.org/conda-forge/linux-64/gxx_impl_linux-64-13.3.0-hae580e1_2.conda + sha256: 7cb36526a5c3e75ae07452aee5c9b6219f62fad9f85cc6d1dab5b21d1c4cc996 + md5: b55f02540605c322a47719029f8404cc + depends: + - gcc_impl_linux-64 13.3.0 h1e990d8_2 + - libstdcxx-devel_linux-64 13.3.0 hc03c837_102 + - sysroot_linux-64 + - tzdata + license: GPL-3.0-only WITH GCC-exception-3.1 + license_family: GPL + size: 13362974 + timestamp: 1740240672045 +- conda: https://conda.anaconda.org/conda-forge/linux-64/gxx_linux-64-13.3.0-hb14504d_11.conda + sha256: dda6a2765249c40168defea26aa67ff37d4d9fd214fb6e8d4fe0f434033bef87 + md5: 2ca7575e4f2da39c5ee260e022ab1a6f + depends: + - binutils_linux-64 + - gcc_linux-64 13.3.0 h6f18a23_11 + - gxx_impl_linux-64 13.3.0.* + - sysroot_linux-64 + license: BSD-3-Clause + license_family: BSD + size: 30844 + timestamp: 1748905886442 - conda: https://conda.anaconda.org/conda-forge/noarch/h11-0.16.0-pyhd8ed1ab_0.conda sha256: f64b68148c478c3bfc8f8d519541de7d2616bf59d44485a5271041d40c061887 md5: 4b69232755285701bc86a5afe4d9933a @@ -1183,23 +2204,23 @@ packages: license_family: APACHE size: 1339225 timestamp: 1739803760467 -- conda: https://conda.anaconda.org/conda-forge/linux-64/hf-xet-1.1.4-py39h057ba11_0.conda +- conda: https://conda.anaconda.org/conda-forge/linux-64/hf-xet-1.1.5-py39h260a9e5_3.conda noarch: python - sha256: 
782b217631ebe4ec90d4aab0dd98d5da84e461c89bfdcf46e17917cd3b4a4b7b - md5: 00f492749bd4646685fdf7d9cc84e406 + sha256: b28905ff975bd935cd113ee97b7eb5b5e3b0969a21302135c6ae096aa06a61f6 + md5: 7b6007f4ad18a970ca3a977148cf47de depends: + - python - __glibc >=2.17,<3.0.a0 - - _python_abi3_support 1.* - - cpython >=3.9 - libgcc >=13 - openssl >=3.5.0,<4.0a0 - - python + - _python_abi3_support 1.* + - cpython >=3.9 constrains: - __glibc >=2.17 license: Apache-2.0 license_family: APACHE - size: 2295287 - timestamp: 1750121714959 + size: 2537615 + timestamp: 1750541218448 - conda: https://conda.anaconda.org/conda-forge/noarch/hpack-4.1.0-pyhd8ed1ab_0.conda sha256: 6ad78a180576c706aabeb5b4c8ceb97c0cb25f1e112d76495bff23e3779948ba md5: 0a802cb9888dd14eeefc611f05c40b6e @@ -1249,9 +2270,9 @@ packages: license_family: BSD size: 63082 timestamp: 1733663449209 -- conda: https://conda.anaconda.org/conda-forge/noarch/huggingface_hub-0.33.0-pyhd8ed1ab_0.conda - sha256: 9fd6534b78206797e6fa2e7eb1f364566f07bc5183975472bfe1d78fd405673e - md5: d6c805ee758223b1e2915f8234bfaba8 +- conda: https://conda.anaconda.org/conda-forge/noarch/huggingface_hub-0.33.1-pyhd8ed1ab_0.conda + sha256: bdbfb0a2aa957fc2a79dc342022529def69162825d6420f03b2dcfaab92765a2 + md5: 4a634f9e9ad0e28ecd4da031a4616d03 depends: - filelock - fsspec >=2023.5.0 @@ -1264,9 +2285,8 @@ packages: - typing-extensions >=3.7.4.3 - typing_extensions >=3.7.4.3 license: Apache-2.0 - license_family: APACHE - size: 317206 - timestamp: 1749742816832 + size: 317782 + timestamp: 1750865913736 - conda: https://conda.anaconda.org/conda-forge/noarch/hyperframe-6.1.0-pyhd8ed1ab_0.conda sha256: 77af6f5fe8b62ca07d09ac60127a30d9069fdc3c68d6b256754d0ffb1f7779f8 md5: 8e6923fc12f1fe8f8c4e5c9f343256ac @@ -1316,6 +2336,42 @@ packages: license_family: BSD size: 112714 timestamp: 1741263433881 +- conda: https://conda.anaconda.org/conda-forge/noarch/jupyter_client-8.6.3-pyhd8ed1ab_1.conda + sha256: 
19d8bd5bb2fde910ec59e081eeb59529491995ce0d653a5209366611023a0b3a + md5: 4ebae00eae9705b0c3d6d1018a81d047 + depends: + - importlib-metadata >=4.8.3 + - jupyter_core >=4.12,!=5.0.* + - python >=3.9 + - python-dateutil >=2.8.2 + - pyzmq >=23.0 + - tornado >=6.2 + - traitlets >=5.3 + license: BSD-3-Clause + license_family: BSD + size: 106342 + timestamp: 1733441040958 +- conda: https://conda.anaconda.org/conda-forge/noarch/jupyter_core-5.8.1-pyh31011fe_0.conda + sha256: 56a7a7e907f15cca8c4f9b0c99488276d4cb10821d2d15df9245662184872e81 + md5: b7d89d860ebcda28a5303526cdee68ab + depends: + - __unix + - platformdirs >=2.5 + - python >=3.8 + - traitlets >=5.3 + license: BSD-3-Clause + license_family: BSD + size: 59562 + timestamp: 1748333186063 +- conda: https://conda.anaconda.org/conda-forge/noarch/kernel-headers_linux-64-3.10.0-he073ed8_18.conda + sha256: a922841ad80bd7b222502e65c07ecb67e4176c4fa5b03678a005f39fcc98be4b + md5: ad8527bf134a90e1c9ed35fa0b64318c + constrains: + - sysroot_linux-64 ==2.17 + license: LGPL-2.0-or-later AND LGPL-2.0-or-later WITH exceptions AND GPL-2.0-or-later AND MPL-2.0 + license_family: GPL + size: 943486 + timestamp: 1729794504440 - conda: https://conda.anaconda.org/conda-forge/linux-64/keyutils-1.6.1-h166bdaf_0.tar.bz2 sha256: 150c05a6e538610ca7c43beb3a40d65c90537497a4f6a5f4d15ec0451b6f5ebb md5: 30186d27e2c9fa62b45fb1476b7200e3 @@ -1359,17 +2415,17 @@ packages: license_family: MIT size: 248046 timestamp: 1739160907615 -- conda: https://conda.anaconda.org/conda-forge/linux-64/ld_impl_linux-64-2.43-h1423503_5.conda - sha256: dcd2b1a065bbf5c54004ddf6551c775a8eb6993c8298ca8a6b92041ed413f785 - md5: 6dc9e1305e7d3129af4ad0dabda30e56 +- conda: https://conda.anaconda.org/conda-forge/linux-64/ld_impl_linux-64-2.43-h712a8e2_5.conda + sha256: de097284f497b391fe9d000c75b684583c30aad172d9508ed05df23ce39d75cb + md5: acd9213a63cb62521290e581ef82de80 depends: - __glibc >=2.17,<3.0.a0 constrains: - binutils_impl_linux-64 2.43 license: GPL-3.0-only 
license_family: GPL - size: 670635 - timestamp: 1749858327854 + size: 670525 + timestamp: 1749852860076 - conda: https://conda.anaconda.org/conda-forge/linux-64/lerc-4.0.0-h0aef613_1.conda sha256: 412381a43d5ff9bbed82cd52a0bbca5b90623f62e41007c9c42d3870c60945ff md5: 9344155d33912347b37f0ae6c410a835 @@ -1395,13 +2451,13 @@ packages: license_family: Apache size: 1325007 timestamp: 1742369558286 -- conda: https://conda.anaconda.org/conda-forge/linux-64/libarrow-20.0.0-h314c690_7_cpu.conda - build_number: 7 - sha256: fcdec351aac8d5114171e01ec7bc21e8924c665fe52b7ce82612148b0a1c81e4 - md5: e31c941000c86b5a52b5d520cdff7e20 +- conda: https://conda.anaconda.org/conda-forge/linux-64/libarrow-20.0.0-h019e7cd_8_cuda.conda + build_number: 8 + sha256: 52eada3c2c4b8dba96ff41e6610f66f6c4fe437107623ebe52fdb696df3da4ce + md5: f2cfbe6e135eec1e658b01c875f41520 depends: - __glibc >=2.17,<3.0.a0 - - aws-crt-cpp >=0.32.8,<0.32.9.0a0 + - aws-crt-cpp >=0.32.10,<0.32.11.0a0 - aws-sdk-cpp >=1.11.510,<1.11.511.0a0 - azure-core-cpp >=1.14.0,<1.14.1.0a0 - azure-identity-cpp >=1.10.0,<1.10.1.0a0 @@ -1413,13 +2469,15 @@ packages: - libabseil >=20250127.1,<20250128.0a0 - libbrotlidec >=1.1.0,<1.2.0a0 - libbrotlienc >=1.1.0,<1.2.0a0 - - libgcc >=13 + - libgcc + - libgcc-ng >=12 - libgoogle-cloud >=2.36.0,<2.37.0a0 - libgoogle-cloud-storage >=2.36.0,<2.37.0a0 - libopentelemetry-cpp >=1.21.0,<1.22.0a0 - libprotobuf >=5.29.3,<5.29.4.0a0 - libre2-11 >=2024.7.2 - - libstdcxx >=13 + - libstdcxx + - libstdcxx-ng >=12 - libutf8proc >=2.10.0,<2.11.0a0 - libzlib >=1.3.1,<2.0a0 - lz4-c >=1.10.0,<1.11.0a0 @@ -1428,59 +2486,61 @@ packages: - snappy >=1.2.1,<1.3.0a0 - zstd >=1.5.7,<1.6.0a0 constrains: - - parquet-cpp <0.0a0 - - apache-arrow-proc =*=cpu + - apache-arrow-proc =*=cuda - arrow-cpp <0.0a0 + - parquet-cpp <0.0a0 license: Apache-2.0 - license_family: APACHE - size: 9194829 - timestamp: 1749948580961 -- conda: https://conda.anaconda.org/conda-forge/linux-64/libarrow-acero-20.0.0-hcb10f89_7_cpu.conda - 
build_number: 7 - sha256: 37e19d7db9c8b6031e6a5036b7519c9d613acd6024f8bf36c51ed66a6702041a - md5: 241bdde1a0401bc6db4019d5908fa673 + size: 8969353 + timestamp: 1750865838916 +- conda: https://conda.anaconda.org/conda-forge/linux-64/libarrow-acero-20.0.0-hb826db4_8_cuda.conda + build_number: 8 + sha256: 1738851640d3b63ccd45e5a77348a91f0b0de9939cb154bcbb4aec6d7d490df2 + md5: b7f0d9d6c6cbad5adcdfb6b00257901e depends: - __glibc >=2.17,<3.0.a0 - - libarrow 20.0.0 h314c690_7_cpu - - libgcc >=13 - - libstdcxx >=13 + - libarrow 20.0.0 h019e7cd_8_cuda + - libgcc + - libgcc-ng >=12 + - libstdcxx + - libstdcxx-ng >=12 license: Apache-2.0 - license_family: APACHE - size: 642249 - timestamp: 1749948657167 -- conda: https://conda.anaconda.org/conda-forge/linux-64/libarrow-dataset-20.0.0-hcb10f89_7_cpu.conda - build_number: 7 - sha256: 3ca668ae0257d65b212a7c11516d22b062438e49b1ad72a98d96e5211cd63451 - md5: ab55d9094b97f25746f26cb988abe15b + size: 627008 + timestamp: 1750865911748 +- conda: https://conda.anaconda.org/conda-forge/linux-64/libarrow-dataset-20.0.0-hb826db4_8_cuda.conda + build_number: 8 + sha256: 4a3cb4b8220f219d4bdc1ad9a270938d814df57b2e8fba925e0542a9304a27ce + md5: 9948bbe038b9409bd0ed077ea920c261 depends: - __glibc >=2.17,<3.0.a0 - - libarrow 20.0.0 h314c690_7_cpu - - libarrow-acero 20.0.0 hcb10f89_7_cpu - - libgcc >=13 - - libparquet 20.0.0 h081d1f1_7_cpu - - libstdcxx >=13 + - libarrow 20.0.0 h019e7cd_8_cuda + - libarrow-acero 20.0.0 hb826db4_8_cuda + - libgcc + - libgcc-ng >=12 + - libparquet 20.0.0 h3f30f2e_8_cuda + - libstdcxx + - libstdcxx-ng >=12 license: Apache-2.0 - license_family: APACHE - size: 607973 - timestamp: 1749948789812 -- conda: https://conda.anaconda.org/conda-forge/linux-64/libarrow-substrait-20.0.0-h1bed206_7_cpu.conda - build_number: 7 - sha256: c1e146098beb32de7060db899c4af2b57abe30cbaf9a2adf3e5e0f88511689db - md5: 9e6fb2001a6e86113231ebae5dd51dc9 + size: 603255 + timestamp: 1750865973612 +- conda: 
https://conda.anaconda.org/conda-forge/linux-64/libarrow-substrait-20.0.0-h69308b4_8_cuda.conda + build_number: 8 + sha256: 5a31a8c1d6be22911f975456ebd607ebbe4313f32b24b1ec7315cf6c2fc13f1c + md5: 1985d19932063a00ac4d9de2fc111ca4 depends: - __glibc >=2.17,<3.0.a0 - libabseil * cxx17* - libabseil >=20250127.1,<20250128.0a0 - - libarrow 20.0.0 h314c690_7_cpu - - libarrow-acero 20.0.0 hcb10f89_7_cpu - - libarrow-dataset 20.0.0 hcb10f89_7_cpu - - libgcc >=13 + - libarrow 20.0.0 h019e7cd_8_cuda + - libarrow-acero 20.0.0 hb826db4_8_cuda + - libarrow-dataset 20.0.0 hb826db4_8_cuda + - libgcc + - libgcc-ng >=12 - libprotobuf >=5.29.3,<5.29.4.0a0 - - libstdcxx >=13 + - libstdcxx + - libstdcxx-ng >=12 license: Apache-2.0 - license_family: APACHE - size: 525519 - timestamp: 1749948876372 + size: 503113 + timestamp: 1750866015240 - conda: https://conda.anaconda.org/conda-forge/linux-64/libasprintf-0.24.1-h8e693c7_0.conda sha256: e30733a729eb6efd9cb316db0202897c882d46f6c20a0e647b4de8ec921b7218 md5: 57566a81dd1e5aa3d98ac7582e8bfe03 @@ -1501,23 +2561,23 @@ packages: license: LGPL-2.1-or-later size: 34680 timestamp: 1746228884730 -- conda: https://conda.anaconda.org/conda-forge/linux-64/libblas-3.9.0-31_hfdb39a5_mkl.conda - build_number: 31 - sha256: 862289f2cfb84bb6001d0e3569e908b8c42d66b881bd5b03f730a3924628b978 - md5: bdf4a57254e8248222cb631db4393ff1 +- conda: https://conda.anaconda.org/conda-forge/linux-64/libblas-3.9.0-32_hfdb39a5_mkl.conda + build_number: 32 + sha256: 7a04219d42b3b0b85ed9d019f481e4227efa2baa12ff48547758e90e2e208adc + md5: eceb19ae9105bc4d0e8d5a321d66c426 depends: - mkl >=2024.2.2,<2025.0a0 constrains: - - liblapack =3.9.0=31*_mkl - - liblapacke =3.9.0=31*_mkl - - blas =2.131=mkl - - libcblas =3.9.0=31*_mkl + - liblapack 3.9.0 32*_mkl + - blas 2.132 mkl + - liblapacke 3.9.0 32*_mkl + - libcblas 3.9.0 32*_mkl track_features: - blas_mkl license: BSD-3-Clause license_family: BSD - size: 17259 - timestamp: 1740087718283 + size: 17657 + timestamp: 1750388671003 - 
conda: https://conda.anaconda.org/conda-forge/linux-64/libbrotlicommon-1.1.0-hb9d3cd8_3.conda sha256: 462a8ed6a7bb9c5af829ec4b90aab322f8bcd9d8987f793e6986ea873bbd05cf md5: cb98af5db26e3f482bebb80ce9d947d3 @@ -1550,22 +2610,22 @@ packages: license_family: MIT size: 282657 timestamp: 1749230124839 -- conda: https://conda.anaconda.org/conda-forge/linux-64/libcblas-3.9.0-31_h372d94f_mkl.conda - build_number: 31 - sha256: 2ee3ab2b6eeb59f2d3c6f933fa0db28f1b56f0bc543ed2c0f6ec04060e4b6ec0 - md5: 2a06a6c16b45bd3d10002927ca204b67 +- conda: https://conda.anaconda.org/conda-forge/linux-64/libcblas-3.9.0-32_h372d94f_mkl.conda + build_number: 32 + sha256: d0449cdfb6c6e993408375bcabbb4c9630a9b8750c406455ce3a4865ec7a321c + md5: 68b55daaf083682f58d9b7f5d52aeb37 depends: - - libblas 3.9.0 31_hfdb39a5_mkl + - libblas 3.9.0 32_hfdb39a5_mkl constrains: - - liblapack =3.9.0=31*_mkl - - liblapacke =3.9.0=31*_mkl - - blas =2.131=mkl + - liblapack 3.9.0 32*_mkl + - liblapacke 3.9.0 32*_mkl + - blas 2.132 mkl track_features: - blas_mkl license: BSD-3-Clause license_family: BSD - size: 16724 - timestamp: 1740087727554 + size: 17280 + timestamp: 1750388682101 - conda: https://conda.anaconda.org/conda-forge/linux-64/libcrc32c-1.1.2-h9c3ff4c_0.tar.bz2 sha256: fd1d153962764433fe6233f34a72cdeed5dcf8a883a85769e8295ce940b5b0c5 md5: c965a5aa0d5c1c37ffc62dff36e28400 @@ -1576,6 +2636,109 @@ packages: license_family: BSD size: 20440 timestamp: 1633683576494 +- conda: https://conda.anaconda.org/nvidia/linux-64/libcublas-12.9.0.13-0.conda + sha256: a9f3a5fd5469466df97db282606c5853c9fbe58ac6a2da9eeaa8224e616722e7 + md5: b879169d83047c25dfdb56ccf5856883 + depends: + - __glibc >=2.17,<3.0.a0 + - cuda-nvrtc + - cuda-version >=12.9,<12.10.0a0 + - libgcc-ng >=11.2.0 + - libstdcxx-ng >=11.2.0 + license: LicenseRef-NVIDIA-End-User-License-Agreement + size: 467645383 + timestamp: 1745639141703 +- conda: https://conda.anaconda.org/nvidia/linux-64/libcublas-dev-12.9.0.13-0.conda + sha256: 
303c3c2cd6551c5facbc1e71150e8e0cd5e2f56e4b7e610605f0ccb468158834 + md5: af5972de89d00a9fb2532cc8d709b38d + depends: + - __glibc >=2.17,<3.0.a0 + - cuda-crt-dev_linux-64 + - cuda-cudart-dev_linux-64 + - cuda-version >=12.9,<12.10.0a0 + - libcublas 12.9.0.13 0 + - libgcc-ng >=11.2.0 + - libstdcxx-ng >=11.2.0 + constrains: + - libcublas-static >=12.9.0.13 + license: LicenseRef-NVIDIA-End-User-License-Agreement + size: 87629 + timestamp: 1745639842019 +- conda: https://conda.anaconda.org/nvidia/linux-64/libcufft-11.4.0.6-0.conda + sha256: d96355eddcb7437ae766b961a235d1d8d33d1d618b9315b7006c477dfec13fdb + md5: b2dadfbdaa47ffc561e2b6d0d178a137 + depends: + - __glibc >=2.17,<3.0.a0 + - cuda-version >=12.9,<12.10.0a0 + - libgcc-ng >=11.2.0 + - libstdcxx-ng >=11.2.0 + license: LicenseRef-NVIDIA-End-User-License-Agreement + size: 161761850 + timestamp: 1741064461027 +- conda: https://conda.anaconda.org/nvidia/linux-64/libcufft-dev-11.4.0.6-0.conda + sha256: 6fbc9e4aa5bed2746798fd760562c12fcdf0496b333982af58c7d991139dd5a0 + md5: 72fc5751eb14172b30d41fcb285ec2ba + depends: + - __glibc >=2.17,<3.0.a0 + - cuda-version >=12.9,<12.10.0a0 + - libcufft 11.4.0.6 0 + - libgcc-ng >=11.2.0 + - libstdcxx-ng >=11.2.0 + constrains: + - libcufft-static >=11.4.0.6 + license: LicenseRef-NVIDIA-End-User-License-Agreement + size: 29037 + timestamp: 1741064836519 +- conda: https://conda.anaconda.org/nvidia/linux-64/libcufile-1.14.0.30-4.conda + sha256: 5d1a684ea052a400e88734907f8f3a29b164bd0ac4ff9dda49ba679a4c2c113a + md5: 4fb2098a0e2470c7dc87c86615848cfc + depends: + - __glibc >=2.17,<3.0.a0 + - cuda-version >=12.9,<12.10.0a0 + - libgcc-ng >=11.2.0 + - libstdcxx-ng >=11.2.0 + license: LicenseRef-NVIDIA-End-User-License-Agreement + size: 968637 + timestamp: 1744072688735 +- conda: https://conda.anaconda.org/nvidia/linux-64/libcufile-dev-1.14.0.30-4.conda + sha256: c70c4dda0e95360aa141e9e49d13b28a5156e860471a1b54841b1f33ece485bb + md5: 57930324035bac8d189a5a4781db4a3b + depends: + - __glibc 
>=2.17,<3.0.a0 + - cuda-version >=12.9,<12.10.0a0 + - libcufile 1.14.0.30 4 + - libgcc-ng >=11.2.0 + - libstdcxx-ng >=11.2.0 + constrains: + - libcufile-static >=1.14.0.30 + license: LicenseRef-NVIDIA-End-User-License-Agreement + size: 30710 + timestamp: 1744072713584 +- conda: https://conda.anaconda.org/nvidia/linux-64/libcurand-10.3.10.19-0.conda + sha256: 65d4f3e3286af1165679baee2cb81ba518a186d27362293321c8cfe1317f87b6 + md5: 21dcbdfb0482a369aab2b1e0abb6a399 + depends: + - __glibc >=2.17,<3.0.a0 + - cuda-version >=12.9,<12.10.0a0 + - libgcc-ng >=11.2.0 + - libstdcxx-ng >=11.2.0 + license: LicenseRef-NVIDIA-End-User-License-Agreement + size: 46145899 + timestamp: 1741064115039 +- conda: https://conda.anaconda.org/nvidia/linux-64/libcurand-dev-10.3.10.19-0.conda + sha256: b0f5197bccc12ec0449d252e0f0b07ca843f687c63aa06362934cd2f78a4328c + md5: 8a50519a4292616353b00d336af3e705 + depends: + - __glibc >=2.17,<3.0.a0 + - cuda-version >=12.9,<12.10.0a0 + - libcurand 10.3.10.19 0 + - libgcc-ng >=11.2.0 + - libstdcxx-ng >=11.2.0 + constrains: + - libcurand-static >=10.3.10.19 + license: LicenseRef-NVIDIA-End-User-License-Agreement + size: 243843 + timestamp: 1741064211694 - conda: https://conda.anaconda.org/conda-forge/linux-64/libcurl-8.14.1-h332b0f4_0.conda sha256: b6c5cf340a4f80d70d64b3a29a7d9885a5918d16a5cb952022820e6d3e79dc8b md5: 45f6713cb00f124af300342512219182 @@ -1592,6 +2755,61 @@ packages: license_family: MIT size: 449910 timestamp: 1749033146806 +- conda: https://conda.anaconda.org/nvidia/linux-64/libcusolver-11.7.4.40-0.conda + sha256: 4dbab3b2790e8d0abe4706a8d8487697ec9a59640fa392756f1c09bfd8f9e27a + md5: cd3f31f3aa59dfd6af9eda59a018dae3 + depends: + - __glibc >=2.17,<3.0.a0 + - cuda-version >=12.9,<12.10.0a0 + - libcublas + - libcusparse + - libgcc-ng >=11.2.0 + - libnvjitlink + - libstdcxx-ng >=11.2.0 + license: LicenseRef-NVIDIA-End-User-License-Agreement + size: 201810175 + timestamp: 1744088224795 +- conda: 
https://conda.anaconda.org/nvidia/linux-64/libcusolver-dev-11.7.4.40-0.conda + sha256: 1a1232b578669f2e5daa5054af61a255684622365895b5b291429f2e0f28e138 + md5: 2bb6b87c9a8e1e1b85013cbe55eef486 + depends: + - __glibc >=2.17,<3.0.a0 + - cuda-version >=12.9,<12.10.0a0 + - libcusolver 11.7.4.40 0 + - libgcc-ng >=11.2.0 + - libstdcxx-ng >=11.2.0 + constrains: + - libcusolver-static >=11.7.4.40 + license: LicenseRef-NVIDIA-End-User-License-Agreement + size: 56145 + timestamp: 1744088523706 +- conda: https://conda.anaconda.org/nvidia/linux-64/libcusparse-12.5.9.5-0.conda + sha256: b911fcdd09031afb6e6005c40965f50a0421519f0525b8273e3cde7c79737033 + md5: 4751807d75a44415b3a001b5a4b208e9 + depends: + - __glibc >=2.17,<3.0.a0 + - cuda-version >=12.9,<12.10.0a0 + - libgcc-ng >=11.2.0 + - libnvjitlink + - libstdcxx-ng >=11.2.0 + license: LicenseRef-NVIDIA-End-User-License-Agreement + size: 208950976 + timestamp: 1741066342896 +- conda: https://conda.anaconda.org/nvidia/linux-64/libcusparse-dev-12.5.9.5-0.conda + sha256: b5aed011ff42e2404d9c6213efc4e4d9b346163259af269f593e75355db0c8f9 + md5: 66433c327f13fabceca36c189d93fa88 + depends: + - __glibc >=2.17,<3.0.a0 + - cuda-version >=12.9,<12.10.0a0 + - libcusparse 12.5.9.5 0 + - libgcc-ng >=11.2.0 + - libnvjitlink + - libstdcxx-ng >=11.2.0 + constrains: + - libcusparse-static >=12.5.9.5 + license: LicenseRef-NVIDIA-End-User-License-Agreement + size: 47649 + timestamp: 1741066704525 - conda: https://conda.anaconda.org/conda-forge/linux-64/libdeflate-1.24-h86f0d12_0.conda sha256: 8420748ea1cc5f18ecc5068b4f24c7a023cc9b20971c99c824ba10641fb95ddf md5: 64f0c503da58ec25ebd359e4d990afa8 @@ -1689,28 +2907,35 @@ packages: license: GPL-2.0-only OR FTL size: 380134 timestamp: 1745369987697 -- conda: https://conda.anaconda.org/conda-forge/linux-64/libgcc-15.1.0-h767d61c_2.conda - sha256: 0024f9ab34c09629621aefd8603ef77bf9d708129b0dd79029e502c39ffc2195 - md5: ea8ac52380885ed41c1baa8f1d6d2b93 +- conda: 
https://conda.anaconda.org/conda-forge/linux-64/libgcc-15.1.0-h767d61c_3.conda + sha256: 59a87161212abe8acc57d318b0cc8636eb834cdfdfddcf1f588b5493644b39a3 + md5: 9e60c55e725c20d23125a5f0dd69af5d depends: - __glibc >=2.17,<3.0.a0 - _openmp_mutex >=4.5 constrains: - - libgcc-ng ==15.1.0=*_2 - - libgomp 15.1.0 h767d61c_2 + - libgcc-ng ==15.1.0=*_3 + - libgomp 15.1.0 h767d61c_3 license: GPL-3.0-only WITH GCC-exception-3.1 - license_family: GPL - size: 829108 - timestamp: 1746642191935 -- conda: https://conda.anaconda.org/conda-forge/linux-64/libgcc-ng-15.1.0-h69a702a_2.conda - sha256: 0ab5421a89f090f3aa33841036bb3af4ed85e1f91315b528a9d75fab9aad51ae - md5: ddca86c7040dd0e73b2b69bd7833d225 + size: 824921 + timestamp: 1750808216066 +- conda: https://conda.anaconda.org/conda-forge/noarch/libgcc-devel_linux-64-13.3.0-hc03c837_102.conda + sha256: 538544a2e0651bfeb0348ca6469b6b608606f6080a0b5a531af3a3852fec0215 + md5: 4c1d6961a6a54f602ae510d9bf31fa60 depends: - - libgcc 15.1.0 h767d61c_2 + - __unix license: GPL-3.0-only WITH GCC-exception-3.1 license_family: GPL - size: 34586 - timestamp: 1746642200749 + size: 2597400 + timestamp: 1740240211859 +- conda: https://conda.anaconda.org/conda-forge/linux-64/libgcc-ng-15.1.0-h69a702a_3.conda + sha256: b0b0a5ee6ce645a09578fc1cb70c180723346f8a45fdb6d23b3520591c6d6996 + md5: e66f2b8ad787e7beb0f846e4bd7e8493 + depends: + - libgcc 15.1.0 h767d61c_3 + license: GPL-3.0-only WITH GCC-exception-3.1 + size: 29033 + timestamp: 1750808224854 - conda: https://conda.anaconda.org/conda-forge/linux-64/libgettextpo-0.24.1-h5888daf_0.conda sha256: 104f2341546e295d1136ab3010e81391bd3fd5be0f095db59266e8eba2082d37 md5: 2ee6d71b72f75d50581f2f68e965efdb @@ -1732,29 +2957,50 @@ packages: license_family: GPL size: 37234 timestamp: 1746228897993 -- conda: https://conda.anaconda.org/conda-forge/linux-64/libgfortran-15.1.0-h69a702a_2.conda - sha256: 914daa4f632b786827ea71b5e07cd00d25fc6e67789db2f830dc481eec660342 - md5: f92e6e0a3c0c0c85561ef61aa59d555d +- 
conda: https://conda.anaconda.org/conda-forge/linux-64/libgfortran-15.1.0-h69a702a_3.conda + sha256: 77dd1f1efd327e6991e87f09c7c97c4ae1cfbe59d9485c41d339d6391ac9c183 + md5: bfbca721fd33188ef923dfe9ba172f29 depends: - - libgfortran5 15.1.0 hcea5267_2 + - libgfortran5 15.1.0 hcea5267_3 constrains: - - libgfortran-ng ==15.1.0=*_2 + - libgfortran-ng ==15.1.0=*_3 license: GPL-3.0-only WITH GCC-exception-3.1 - license_family: GPL - size: 34541 - timestamp: 1746642233221 -- conda: https://conda.anaconda.org/conda-forge/linux-64/libgfortran5-15.1.0-hcea5267_2.conda - sha256: be23750f3ca1a5cb3ada858c4f633effe777487d1ea35fddca04c0965c073350 - md5: 01de444988ed960031dbe84cf4f9b1fc + size: 29057 + timestamp: 1750808257258 +- conda: https://conda.anaconda.org/conda-forge/linux-64/libgfortran5-15.1.0-hcea5267_3.conda + sha256: eea6c3cf22ad739c279b4d665e6cf20f8081f483b26a96ddd67d4df3c88dfa0a + md5: 530566b68c3b8ce7eec4cd047eae19fe depends: - __glibc >=2.17,<3.0.a0 - libgcc >=15.1.0 constrains: - libgfortran 15.1.0 license: GPL-3.0-only WITH GCC-exception-3.1 - license_family: GPL - size: 1569986 - timestamp: 1746642212331 + size: 1565627 + timestamp: 1750808236464 +- conda: https://conda.anaconda.org/conda-forge/linux-64/libglib-2.84.2-h3618099_0.conda + sha256: a6b5cf4d443044bc9a0293dd12ca2015f0ebe5edfdc9c4abdde0b9947f9eb7bd + md5: 072ab14a02164b7c0c089055368ff776 + depends: + - __glibc >=2.17,<3.0.a0 + - libffi >=3.4.6,<3.5.0a0 + - libgcc >=13 + - libiconv >=1.18,<2.0a0 + - libzlib >=1.3.1,<2.0a0 + - pcre2 >=10.45,<10.46.0a0 + constrains: + - glib 2.84.2 *_0 + license: LGPL-2.1-or-later + size: 3955066 + timestamp: 1747836671118 +- conda: https://conda.anaconda.org/conda-forge/linux-64/libgomp-15.1.0-h767d61c_3.conda + sha256: 43710ab4de0cd7ff8467abff8d11e7bb0e36569df04ce1c099d48601818f11d1 + md5: 3cd1a7238a0dd3d0860fdefc496cc854 + depends: + - __glibc >=2.17,<3.0.a0 + license: GPL-3.0-only WITH GCC-exception-3.1 + size: 447068 + timestamp: 1750808138400 - conda: 
https://conda.anaconda.org/conda-forge/linux-64/libgoogle-cloud-2.36.0-hc4361e1_1.conda sha256: 3a56c653231d6233de5853dc01f07afad6a332799a39c3772c0948d2e68547e4 md5: ae36e6296a8dd8e8a9a8375965bf6398 @@ -1844,22 +3090,22 @@ packages: license: IJG AND BSD-3-Clause AND Zlib size: 628947 timestamp: 1745268527144 -- conda: https://conda.anaconda.org/conda-forge/linux-64/liblapack-3.9.0-31_hc41d3b0_mkl.conda - build_number: 31 - sha256: a2d20845d916ac8fba09376cd791136a9b4547afb2131bc315178adfc87bb4ca - md5: 10d012ddd7cc1c7ff9093d4974a34e53 +- conda: https://conda.anaconda.org/conda-forge/linux-64/liblapack-3.9.0-32_hc41d3b0_mkl.conda + build_number: 32 + sha256: dc1be931203a71f5c84887cde24659fdd6fda73eb8c6cf56e67b68e3c7916efd + md5: 6dc827963c12f90c79f5b2be4eaea072 depends: - - libblas 3.9.0 31_hfdb39a5_mkl + - libblas 3.9.0 32_hfdb39a5_mkl constrains: - - liblapacke =3.9.0=31*_mkl - - blas =2.131=mkl - - libcblas =3.9.0=31*_mkl + - liblapacke 3.9.0 32*_mkl + - blas 2.132 mkl + - libcblas 3.9.0 32*_mkl track_features: - blas_mkl license: BSD-3-Clause license_family: BSD - size: 16760 - timestamp: 1740087736615 + size: 17284 + timestamp: 1750388691797 - conda: https://conda.anaconda.org/conda-forge/linux-64/liblzma-5.8.1-hb9d3cd8_2.conda sha256: f2591c0069447bbe28d4d696b7fcb0c5bd0b4ac582769b89addbcf26fb3430d8 md5: 1a580f7796c7bf6393fddb8bbbde58dc @@ -1887,15 +3133,114 @@ packages: license_family: MIT size: 647599 timestamp: 1729571887612 -- conda: https://conda.anaconda.org/conda-forge/linux-64/libnsl-2.0.1-hd590300_0.conda - sha256: 26d77a3bb4dceeedc2a41bd688564fe71bf2d149fdcf117049970bc02ff1add6 - md5: 30fd6e37fe21f86f4bd26d6ee73eeec7 +- conda: https://conda.anaconda.org/nvidia/linux-64/libnpp-12.4.0.27-0.conda + sha256: f780220634bbf64586efbeecc01d89bf600a571b3386f4da8a43e19b22097737 + md5: 444500275c26f9ebf56bb885fa8d26f8 depends: - - libgcc-ng >=12 + - __glibc >=2.17,<3.0.a0 + - cuda-version >=12.9,<12.10.0a0 + - libgcc-ng >=11.2.0 + - libstdcxx-ng >=11.2.0 + 
license: LicenseRef-NVIDIA-End-User-License-Agreement + size: 175439936 + timestamp: 1741065633129 +- conda: https://conda.anaconda.org/nvidia/linux-64/libnpp-dev-12.4.0.27-0.conda + sha256: 86407dcdecbd7bf646e7e4b5fb0d374f78aa8b946113e9bae49987cdcd6be34b + md5: 4928ed7509dc016fde03ecf21e3cc9c5 + depends: + - __glibc >=2.17,<3.0.a0 + - cuda-version >=12.9,<12.10.0a0 + - libgcc-ng >=11.2.0 + - libnpp 12.4.0.27 0 + - libstdcxx-ng >=11.2.0 + constrains: + - libnpp-static >=12.4.0.27 + license: LicenseRef-NVIDIA-End-User-License-Agreement + size: 455178 + timestamp: 1741065933554 +- conda: https://conda.anaconda.org/conda-forge/linux-64/libnsl-2.0.1-hb9d3cd8_1.conda + sha256: 927fe72b054277cde6cb82597d0fcf6baf127dcbce2e0a9d8925a68f1265eef5 + md5: d864d34357c3b65a4b731f78c0801dc4 + depends: + - __glibc >=2.17,<3.0.a0 + - libgcc >=13 license: LGPL-2.1-only license_family: GPL - size: 33408 - timestamp: 1697359010159 + size: 33731 + timestamp: 1750274110928 +- conda: https://conda.anaconda.org/nvidia/linux-64/libnvfatbin-12.9.19-0.conda + sha256: 6518f895024b6c03f384369e4870b8b2068c04e39677f074b96a969d84c1c055 + md5: fb75d10b7d3ae4bdb7697dfcdfa8991c + depends: + - __glibc >=2.17,<3.0.a0 + - cuda-version >=12.9,<12.10.0a0 + - libgcc-ng >=11.2.0 + - libstdcxx-ng >=11.2.0 + license: LicenseRef-NVIDIA-End-User-License-Agreement + size: 817685 + timestamp: 1741062723933 +- conda: https://conda.anaconda.org/nvidia/linux-64/libnvfatbin-dev-12.9.19-0.conda + sha256: 801d6f2359ff16d07f50dfa861f67181d880aad7da0fae9a69ee0071751221df + md5: 1a3d4d404b09e9cec9472d2775b21878 + depends: + - __glibc >=2.17,<3.0.a0 + - cuda-version >=12.9,<12.10.0a0 + - libgcc-ng >=11.2.0 + - libnvfatbin 12.9.19 0 + - libstdcxx-ng >=11.2.0 + constrains: + - liblibnvfatbin-static >=12.9.19 + license: LicenseRef-NVIDIA-End-User-License-Agreement + size: 22145 + timestamp: 1741062735007 +- conda: https://conda.anaconda.org/nvidia/linux-64/libnvjitlink-12.9.41-0.conda + sha256: 
2e46b638a6a7d7ef2a1c244ec6c52b1692b9ed37e3f4f5057375ce375d9261ba + md5: c3a16bd9064e8d8b1bb815c60ab6153c + depends: + - __glibc >=2.17,<3.0.a0 + - cuda-version >=12,<12.10.0a0 + - libgcc-ng >=11.2.0 + - libstdcxx-ng >=11.2.0 + license: LicenseRef-NVIDIA-End-User-License-Agreement + size: 30616865 + timestamp: 1744256952566 +- conda: https://conda.anaconda.org/nvidia/linux-64/libnvjitlink-dev-12.9.41-0.conda + sha256: b9627cff583642e5b87f501c73a41df38fd73966ca23f01d8caf19fe6e529ffa + md5: bec5fb65d0813a1d6c5acdd1a5b106da + depends: + - __glibc >=2.17,<3.0.a0 + - cuda-version >=12.9,<12.10.0a0 + - libgcc-ng >=11.2.0 + - libnvjitlink 12.9.41 0 + - libstdcxx-ng >=11.2.0 + constrains: + - libnvjitlink-static >=12.9.41 + license: LicenseRef-NVIDIA-End-User-License-Agreement + size: 21384 + timestamp: 1744257075278 +- conda: https://conda.anaconda.org/nvidia/linux-64/libnvjpeg-12.4.0.16-0.conda + sha256: addbea5c4114bf37773e8391aac9b7225630574bfa8233beb3f186acb8282f15 + md5: 50a47954bccb0b0db2046014a65000d8 + depends: + - __glibc >=2.17,<3.0.a0 + - cuda-version >=12.9,<12.10.0a0 + - libgcc-ng >=11.2.0 + - libstdcxx-ng >=11.2.0 + license: LicenseRef-NVIDIA-End-User-License-Agreement + size: 3579313 + timestamp: 1741063701861 +- conda: https://conda.anaconda.org/nvidia/linux-64/libnvjpeg-dev-12.4.0.16-0.conda + sha256: d4259401852685a8245132a93c3ab14b88b96d065e6812e84209783bf05feaa5 + md5: b177d223cd5f2612b5cbda6231418d39 + depends: + - cuda-cudart-dev + - cuda-version >=12.9,<12.10.0a0 + - libnvjpeg 12.4.0.16 0 + constrains: + - libnvjpeg-static >=12.4.0.16 + license: LicenseRef-NVIDIA-End-User-License-Agreement + size: 27635 + timestamp: 1741063711649 - conda: https://conda.anaconda.org/conda-forge/linux-64/libogg-1.3.5-hd0c01bc_1.conda sha256: ffb066ddf2e76953f92e06677021c73c85536098f1c21fcd15360dbc859e22e4 md5: 68e52064ed3897463c0e958ab5c8f91b @@ -1942,21 +3287,22 @@ packages: license_family: BSD size: 312472 timestamp: 1744330953241 -- conda: 
https://conda.anaconda.org/conda-forge/linux-64/libparquet-20.0.0-h081d1f1_7_cpu.conda - build_number: 7 - sha256: 338aa913e5f68606baa86c5deebe4d4d1d615e0b3df40db200084837905201e2 - md5: f8714819f786deb7a10bd255d4e0740c +- conda: https://conda.anaconda.org/conda-forge/linux-64/libparquet-20.0.0-h3f30f2e_8_cuda.conda + build_number: 8 + sha256: e9b7b4416b83b86de7028878cd6117cd369f5640b66d476e301a60e4589d8a26 + md5: c3f4f661fa6a55e6763b59fd494e4ede depends: - __glibc >=2.17,<3.0.a0 - - libarrow 20.0.0 h314c690_7_cpu - - libgcc >=13 - - libstdcxx >=13 + - libarrow 20.0.0 h019e7cd_8_cuda + - libgcc + - libgcc-ng >=12 + - libstdcxx + - libstdcxx-ng >=12 - libthrift >=0.21.0,<0.21.1.0a0 - openssl >=3.5.0,<4.0a0 license: Apache-2.0 - license_family: APACHE - size: 1243202 - timestamp: 1749948757263 + size: 1214865 + timestamp: 1750865956895 - conda: https://conda.anaconda.org/conda-forge/linux-64/libpng-1.6.49-h943b412_0.conda sha256: c8f5dc929ba5fcee525a66777498e03bbcbfefc05a0773e5163bb08ac5122f1a md5: 37511c874cf3b8d0034c8d24e73c0884 @@ -1981,21 +3327,31 @@ packages: license_family: BSD size: 3358788 timestamp: 1745159546868 -- conda: https://conda.anaconda.org/conda-forge/linux-64/libre2-11-2024.07.02-hba17884_3.conda - sha256: 392ec1e49370eb03270ffd4cc8d727f8e03e1e3a92b12f10c53f396ae4554668 - md5: 545e93a513c10603327c76c15485e946 +- conda: https://conda.anaconda.org/conda-forge/linux-64/libre2-11-2025.06.26-hba17884_0.conda + sha256: 89535af669f63e0dc4ae75a5fc9abb69b724b35e0f2ca0304c3d9744a55c8310 + md5: f6881c04e6617ebba22d237c36f1b88e depends: - __glibc >=2.17,<3.0.a0 - libabseil * cxx17* - - libabseil >=20250127.0,<20250128.0a0 + - libabseil >=20250127.1,<20250128.0a0 - libgcc >=13 - libstdcxx >=13 constrains: - - re2 2024.07.02.* + - re2 2025.06.26.* license: BSD-3-Clause - license_family: BSD - size: 210073 - timestamp: 1741121121238 + size: 211720 + timestamp: 1751053073521 +- conda: 
https://conda.anaconda.org/conda-forge/linux-64/libsanitizer-13.3.0-he8ea267_2.conda + sha256: 27c4c8bf8e2dd60182d47274389be7c70446df6ed5344206266321ee749158b4 + md5: 2b6cdf7bb95d3d10ef4e38ce0bc95dba + depends: + - __glibc >=2.17,<3.0.a0 + - libgcc >=13.3.0 + - libstdcxx >=13.3.0 + license: GPL-3.0-only WITH GCC-exception-3.1 + license_family: GPL + size: 4155341 + timestamp: 1740240344242 - conda: https://conda.anaconda.org/conda-forge/linux-64/libsentencepiece-0.2.0-he636bdd_11.conda sha256: c5b98351daa23979a6728d297bf3b3eaae0324ae60487f5637b09a9ed7656d43 md5: aed2d089d7d343500921f9ad3f7ba9c8 @@ -2034,16 +3390,16 @@ packages: license: ISC size: 205978 timestamp: 1716828628198 -- conda: https://conda.anaconda.org/conda-forge/linux-64/libsqlite-3.50.1-hee588c1_0.conda - sha256: cd15ab1b9f0d53507e7ad7a01e52f6756ab3080bf623ab0e438973b6e4dba3c0 - md5: 96a7e36bff29f1d0ddf5b771e0da373a +- conda: https://conda.anaconda.org/conda-forge/linux-64/libsqlite-3.50.1-h6cd9bfd_7.conda + sha256: 9a9e5bf30178f821d4f8de25eac0ae848915bfde6a78a66ae8b77d9c33d9d0e5 + md5: c7c4888059a8324e52de475d1e7bdc53 depends: - __glibc >=2.17,<3.0.a0 - libgcc >=13 - libzlib >=1.3.1,<2.0a0 license: Unlicense - size: 919819 - timestamp: 1749232795476 + size: 919723 + timestamp: 1750925531920 - conda: https://conda.anaconda.org/conda-forge/linux-64/libssh2-1.11.1-hcf80075_0.conda sha256: fa39bfd69228a13e553bd24601332b7cfeb30ca11a3ca50bb028108fe90a7661 md5: eecce068c7e4eddeb169591baac20ac4 @@ -2056,25 +3412,32 @@ packages: license_family: BSD size: 304790 timestamp: 1745608545575 -- conda: https://conda.anaconda.org/conda-forge/linux-64/libstdcxx-15.1.0-h8f9b012_2.conda - sha256: 6ae3d153e78f6069d503d9309f2cac6de5b93d067fc6433160a4c05226a5dad4 - md5: 1cb1c67961f6dd257eae9e9691b341aa +- conda: https://conda.anaconda.org/conda-forge/linux-64/libstdcxx-15.1.0-h8f9b012_3.conda + sha256: 7650837344b7850b62fdba02155da0b159cf472b9ab59eb7b472f7bd01dff241 + md5: 6d11a5edae89fe413c0569f16d308f5a depends: - 
__glibc >=2.17,<3.0.a0 - - libgcc 15.1.0 h767d61c_2 + - libgcc 15.1.0 h767d61c_3 license: GPL-3.0-only WITH GCC-exception-3.1 - license_family: GPL - size: 3902355 - timestamp: 1746642227493 -- conda: https://conda.anaconda.org/conda-forge/linux-64/libstdcxx-ng-15.1.0-h4852527_2.conda - sha256: 11bea86e11de7d6bce87589197a383344df3fa0a3552dab7e931785ff1159a5b - md5: 9d2072af184b5caa29492bf2344597bb + size: 3896407 + timestamp: 1750808251302 +- conda: https://conda.anaconda.org/conda-forge/noarch/libstdcxx-devel_linux-64-13.3.0-hc03c837_102.conda + sha256: abc89056d4ca7debe938504b3b6d9ccc6d7a0f0b528fe3409230636a21e81002 + md5: aa38de2738c5f4a72a880e3d31ffe8b4 depends: - - libstdcxx 15.1.0 h8f9b012_2 + - __unix license: GPL-3.0-only WITH GCC-exception-3.1 license_family: GPL - size: 34647 - timestamp: 1746642266826 + size: 12873130 + timestamp: 1740240239655 +- conda: https://conda.anaconda.org/conda-forge/linux-64/libstdcxx-ng-15.1.0-h4852527_3.conda + sha256: bbaea1ecf973a7836f92b8ebecc94d3c758414f4de39d2cc6818a3d10cb3216b + md5: 57541755b5a51691955012b8e197c06c + depends: + - libstdcxx 15.1.0 h8f9b012_3 + license: GPL-3.0-only WITH GCC-exception-3.1 + size: 29093 + timestamp: 1750808292700 - conda: https://conda.anaconda.org/conda-forge/linux-64/libthrift-0.21.0-h0e7cc3e_0.conda sha256: ebb395232973c18745b86c9a399a4725b2c39293c9a91b8e59251be013db42f0 md5: dcb95c0a98ba9ff737f7ae482aef7833 @@ -2106,9 +3469,9 @@ packages: license: HPND size: 429575 timestamp: 1747067001268 -- conda: https://conda.anaconda.org/conda-forge/linux-64/libtorch-2.7.0-cpu_mkl_hf6ddc5a_100.conda - sha256: 7b6178464b02d65c4af92086c71b79e5c2b7fc1500c1547334a4755e6e92d8a9 - md5: 6bdda0b10852c6d03b030bab7ec251f0 +- conda: https://conda.anaconda.org/conda-forge/linux-64/libtorch-2.7.1-cpu_mkl_hb1c5dc7_100.conda + sha256: e221eaa1b3caf0e228cc7fa296d17708b5f0099122084f539e4b75844789f4e9 + md5: 80bf999d61d95328cb37391ccdb9f03d depends: - __glibc >=2.17,<3.0.a0 - _openmp_mutex * *_llvm @@ -2120,19 
+3483,19 @@ packages: - libgcc >=13 - libprotobuf >=5.29.3,<5.29.4.0a0 - libstdcxx >=13 - - libuv >=1.50.0,<2.0a0 + - libuv >=1.51.0,<2.0a0 - libzlib >=1.3.1,<2.0a0 - - llvm-openmp >=20.1.4 + - llvm-openmp >=20.1.7 - mkl >=2024.2.2,<2025.0a0 - sleef >=3.8,<4.0a0 constrains: + - pytorch 2.7.1 cpu_mkl_*_100 - pytorch-gpu <0.0a0 - - pytorch 2.7.0 cpu_mkl_*_100 - - pytorch-cpu 2.7.0 + - pytorch-cpu 2.7.1 license: BSD-3-Clause license_family: BSD - size: 55565925 - timestamp: 1746256872466 + size: 55596081 + timestamp: 1750205154609 - conda: https://conda.anaconda.org/conda-forge/linux-64/libutf8proc-2.10.0-h202a827_0.conda sha256: c4ca78341abb308134e605476d170d6f00deba1ec71b0b760326f36778972c0e md5: 0f98f3e95272d118f7931b6bef69bfe5 @@ -2206,6 +3569,21 @@ packages: license: LGPL-2.1-or-later size: 100393 timestamp: 1702724383534 +- conda: https://conda.anaconda.org/conda-forge/linux-64/libxkbcommon-1.10.0-h65c71a3_0.conda + sha256: a8043a46157511b3ceb6573a99952b5c0232313283f2d6a066cec7c8dcaed7d0 + md5: fedf6bfe5d21d21d2b1785ec00a8889a + depends: + - __glibc >=2.17,<3.0.a0 + - libgcc >=13 + - libstdcxx >=13 + - libxcb >=1.17.0,<2.0a0 + - libxml2 >=2.13.8,<2.14.0a0 + - xkeyboard-config + - xorg-libxau >=1.0.12,<2.0a0 + license: MIT/X11 Derivative + license_family: MIT + size: 707156 + timestamp: 1747911059945 - conda: https://conda.anaconda.org/conda-forge/linux-64/libxml2-2.13.8-h4bc477f_0.conda sha256: b0b3a96791fa8bb4ec030295e8c8bf2d3278f33c0f9ad540e73b5e538e6268e7 md5: 14dbe05b929e329dbaa6f2d0aa19466d @@ -2278,16 +3656,27 @@ packages: license_family: BSD size: 24604 timestamp: 1733219911494 -- conda: https://conda.modular.com/max-nightly/linux-64/max-core-25.5.0.dev2025062405-release.conda - sha256: 1ba31e176fd6d2c45763f0f0020ec3dc63d7146fd56fb3cf29fdf45b296631d9 +- conda: https://conda.modular.com/max-nightly/noarch/max-25.5.0.dev2025062705-release.conda + noarch: python + sha256: 311e01e00ce7302eb97c263c021700c7baa1a6d9be9477f60edcfd17fbf4b49d + depends: + - 
max-core ==25.5.0.dev2025062705 release + - max-python ==25.5.0.dev2025062705 release + - mojo-jupyter ==25.5.0.dev2025062705 release + - mblack ==25.5.0.dev2025062705 release + license: LicenseRef-Modular-Proprietary + size: 9411 + timestamp: 1751001417267 +- conda: https://conda.modular.com/max-nightly/linux-64/max-core-25.5.0.dev2025062705-release.conda + sha256: be1ef195c01fd7253ad1fb4b4ec647708bd53cbe31a2e97318943fed4062f92c depends: - - mblack ==25.5.0.dev2025062405 release + - mblack ==25.5.0.dev2025062705 release license: LicenseRef-Modular-Proprietary - size: 223598518 - timestamp: 1750742290069 -- conda: https://conda.modular.com/max-nightly/noarch/max-pipelines-25.5.0.dev2025062405-release.conda + size: 222775006 + timestamp: 1751001417267 +- conda: https://conda.modular.com/max-nightly/noarch/max-pipelines-25.5.0.dev2025062705-release.conda noarch: python - sha256: cad7e9e269eaf28df456a654baf97b8c250bae15f2873622c5cdf47a97dec871 + sha256: 9b5e67567dcf2ff892cd0abf436a98032c54f04e5253394a50d0b24369cbe586 depends: - aiohttp >=3.11.12 - click >=8.0.0 @@ -2299,7 +3688,7 @@ packages: - requests >=2.32.3 - safetensors >=0.5.2 - pysoundfile >=0.12.1 - - pytorch >=2.5.0,<=2.7.0 + - pytorch >=2.5.0 - tqdm >=4.67.1 - transformers >=4.52.4 - uvicorn >=0.34.0 @@ -2331,17 +3720,17 @@ packages: - starlette >=0.40.0,<0.41.3 - taskgroup >=0.2.2 - tokenizers >=0.19.0 - - max-python ==25.5.0.dev2025062405 release + - max-python ==25.5.0.dev2025062705 release license: LicenseRef-Modular-Proprietary - size: 10036 - timestamp: 1750742290069 -- conda: https://conda.modular.com/max-nightly/linux-64/max-python-25.5.0.dev2025062405-release.conda + size: 10025 + timestamp: 1751001417267 +- conda: https://conda.modular.com/max-nightly/linux-64/max-python-25.5.0.dev2025062705-release.conda noarch: python - sha256: 0b482fb7d3b26c7c449a6b6ecca5499ef5f42ac3b19d5a06f7020331e1f11d1a + sha256: 9204326c477affb8fd8f9c84de91591a21bd1cf24f0d4390716b3ca642cdb711 depends: - numpy >=1.18 - 
python-gil >=3.9,<3.14 - - max-core ==25.5.0.dev2025062405 release + - max-core ==25.5.0.dev2025062705 release constrains: - aiohttp >=3.11.12 - click >=8.0.0 @@ -2353,7 +3742,7 @@ packages: - requests >=2.32.3 - safetensors >=0.5.2 - pysoundfile >=0.12.1 - - pytorch >=2.5.0,<=2.7.0 + - pytorch >=2.5.0 - tqdm >=4.67.1 - transformers >=4.52.4 - uvicorn >=0.34.0 @@ -2386,11 +3775,11 @@ packages: - taskgroup >=0.2.2 - tokenizers >=0.19.0 license: LicenseRef-Modular-Proprietary - size: 30539997 - timestamp: 1750742290069 -- conda: https://conda.modular.com/max-nightly/noarch/mblack-25.5.0.dev2025062405-release.conda + size: 30541775 + timestamp: 1751001417267 +- conda: https://conda.modular.com/max-nightly/noarch/mblack-25.5.0.dev2025062705-release.conda noarch: python - sha256: 4bf94a96c20902afff1c680bc36a2a1cba19a68ebc2da14930683f4b3afae4a9 + sha256: 20b01c20917066a7f186385ae19a017c091e094dbddb4c59f1737bf2eed0a9fe depends: - python >=3.9,<3.14 - click >=8.0.0 @@ -2401,8 +3790,8 @@ packages: - typing_extensions >=v4.12.2 - python license: MIT - size: 131247 - timestamp: 1750742290068 + size: 131254 + timestamp: 1751001417267 - conda: https://conda.anaconda.org/conda-forge/noarch/mdurl-0.1.2-pyhd8ed1ab_1.conda sha256: 78c1bbe1723449c52b7a9df1af2ee5f005209f67e40b6e1d3c7619127c43b1c7 md5: 592132998493b3ff25fd7479396e8351 @@ -2424,14 +3813,25 @@ packages: license_family: Proprietary size: 124718448 timestamp: 1730231808335 -- conda: https://conda.modular.com/max-nightly/noarch/modular-25.5.0.dev2025062405-release.conda +- conda: https://conda.modular.com/max-nightly/noarch/modular-25.5.0.dev2025062705-release.conda + noarch: python + sha256: 5da2726bab828c2f039bb8e393987b0eb7787e174bb94cae7d0f07b56f3e00b3 + depends: + - max-pipelines ==25.5.0.dev2025062705 release + license: LicenseRef-Modular-Proprietary + size: 9394 + timestamp: 1751001417267 +- conda: https://conda.modular.com/max-nightly/noarch/mojo-jupyter-25.5.0.dev2025062705-release.conda noarch: python - sha256: 
60fbf35f20fa7cac4ef38d2329c96532111d0e0e306a614a2b8b1c10baf58ac7 + sha256: 27842df9aea46c60c8d6487e38c44b18c5aa533e7e84f3d98c5ee7fd978862a4 depends: - - max-pipelines ==25.5.0.dev2025062405 release + - max-core ==25.5.0.dev2025062705 release + - python >=3.9,<3.14 + - jupyter_client >=8.6.2,<8.7 + - python license: LicenseRef-Modular-Proprietary - size: 9396 - timestamp: 1750742290069 + size: 22485 + timestamp: 1751001417267 - conda: https://conda.anaconda.org/conda-forge/linux-64/mpc-1.3.1-h24ddda3_1.conda sha256: 1bf794ddf2c8b3a3e14ae182577c624fa92dea975537accff4bc7e5fea085212 md5: aa14b9a5196a6d8dd364164b7ce56acf @@ -2487,17 +3887,17 @@ packages: license_family: BSD size: 211752 timestamp: 1736368259541 -- conda: https://conda.anaconda.org/conda-forge/linux-64/multidict-6.5.0-py312h178313f_0.conda - sha256: c9f917c0cf08a1935383d44dcc26c94bef8f2ef75dafbf43788c3d7f320d3e44 - md5: a27583450558016071c02e6d866f8868 +- conda: https://conda.anaconda.org/conda-forge/linux-64/multidict-6.5.1-py312h178313f_0.conda + sha256: 6c6ed7a70092bbd97282e4c5cb6dc0cab1dbde4cb222e320c15f3cf9b0a9ffb1 + md5: 291622f79db2b176d2c142b40e4b6640 depends: - __glibc >=2.17,<3.0.a0 - libgcc >=13 - python >=3.12,<3.13.0a0 - python_abi 3.12.* *_cp312 license: Apache-2.0 - size: 94456 - timestamp: 1750240152329 + size: 95036 + timestamp: 1750832449984 - conda: https://conda.anaconda.org/conda-forge/linux-64/multiprocess-0.70.15-py312h98912ed_1.conda sha256: bb612a921fafda6375a2204ffebd8811db8dd3b8f25ac9886cc9bcbff7e3664e md5: 5a64b9f44790d9a187a85366dd0ffa8d @@ -2543,17 +3943,17 @@ packages: license_family: BSD size: 1564462 timestamp: 1749078300258 -- conda: https://conda.anaconda.org/conda-forge/linux-64/ninja-1.12.1-hff21bea_1.conda - sha256: 1f2f7e26084971e87bfbb733f17d824ff3323ee9618fb713ae9932386da76aed - md5: 2322531904f27501ee19847b87ba7c64 +- conda: https://conda.anaconda.org/conda-forge/linux-64/ninja-1.13.0-h7aa8ee6_0.conda + sha256: 
8cf09470430b5aba5165c7aefed070d2c8f998f69fede01197ef838bf17fa81a + md5: 2f67cb5c5ec172faeba94348ae8af444 depends: - __glibc >=2.17,<3.0.a0 - libstdcxx >=13 - libgcc >=13 license: Apache-2.0 license_family: APACHE - size: 161883 - timestamp: 1745526264371 + size: 180917 + timestamp: 1750273173789 - conda: https://conda.anaconda.org/conda-forge/linux-64/nlohmann_json-3.12.0-h3f2d84a_0.conda sha256: e2fc624d6f9b2f1b695b6be6b905844613e813aa180520e73365062683fe7b49 md5: d76872d096d063e226482c99337209dc @@ -2561,6 +3961,50 @@ packages: license_family: MIT size: 135906 timestamp: 1744445169928 +- conda: https://conda.anaconda.org/nvidia/linux-64/nsight-compute-2025.2.0.11-0.conda + sha256: 96b4d6bc34186bd54041ea8f1758a51e64268ad50c843656a1db6bbdc1a6b842 + md5: 38abf11e6184555d4255ba6431eb68b6 + depends: + - __glibc >=2.17,<3.0.a0 + - cuda-version >=12.9,<12.10.0a0 + - dbus >=1.13.18,<2.0a0 + - expat >=2.6.4,<3.0a0 + - fontconfig >=2.14.1,<3.0a0 + - freetype >=2.12.1,<3.0a0 + - libgcc-ng >=11.2.0 + - libglib >=2.78.4,<3.0a0 + - libstdcxx-ng >=11.2.0 + - libxkbcommon >=1.0.1,<2.0a0 + - nspr >=4.35,<5.0a0 + - nss >=3.89.1,<4.0a0 + license: LicenseRef-NVIDIA-End-User-License-Agreement + size: 336942919 + timestamp: 1740756913041 +- conda: https://conda.anaconda.org/conda-forge/linux-64/nspr-4.36-h5888daf_0.conda + sha256: a87471d9265a7c02a98c20debac8b13afd80963968ed7b1c1c2ac7b80955ce31 + md5: de9cd5bca9e4918527b9b72b6e2e1409 + depends: + - __glibc >=2.17,<3.0.a0 + - libgcc >=13 + - libstdcxx >=13 + license: MPL-2.0 + license_family: MOZILLA + size: 230204 + timestamp: 1729545773406 +- conda: https://conda.anaconda.org/conda-forge/linux-64/nss-3.113-h159eef7_0.conda + sha256: ceef2b561cc92f00bac1263562f8fd9c17ff11de710fc27370572adee1b34b1c + md5: 47fbbbda15a2a03bae2b3d2cd3735b30 + depends: + - __glibc >=2.17,<3.0.a0 + - libgcc >=13 + - libsqlite >=3.50.1,<4.0a0 + - libstdcxx >=13 + - libzlib >=1.3.1,<2.0a0 + - nspr >=4.36,<5.0a0 + license: MPL-2.0 + license_family: MOZILLA + 
size: 2009636 + timestamp: 1750369586316 - conda: https://conda.anaconda.org/conda-forge/linux-64/numpy-2.3.0-py312h6cf2f7f_0.conda sha256: 59da92a150737e830c75e8de56c149d6dc4e42c9d38ba30d2f0d4787a0c43342 md5: 8b4095ed29d1072f7e4badfbaf9e5851 @@ -2579,6 +4023,28 @@ packages: license_family: BSD size: 8417476 timestamp: 1749430957684 +- conda: https://conda.anaconda.org/conda-forge/linux-64/ocl-icd-2.3.3-hb9d3cd8_0.conda + sha256: 2254dae821b286fb57c61895f2b40e3571a070910fdab79a948ff703e1ea807b + md5: 56f8947aa9d5cf37b0b3d43b83f34192 + depends: + - __glibc >=2.17,<3.0.a0 + - libgcc >=13 + - opencl-headers >=2024.10.24 + license: BSD-2-Clause + license_family: BSD + size: 106742 + timestamp: 1743700382939 +- conda: https://conda.anaconda.org/conda-forge/linux-64/opencl-headers-2025.06.13-h5888daf_0.conda + sha256: 2b6ce54174ec19110e1b3c37455f7cd138d0e228a75727a9bba443427da30a36 + md5: 45c3d2c224002d6d0d7769142b29f986 + depends: + - __glibc >=2.17,<3.0.a0 + - libgcc >=13 + - libstdcxx >=13 + license: Apache-2.0 + license_family: APACHE + size: 55357 + timestamp: 1749853464518 - conda: https://conda.anaconda.org/conda-forge/linux-64/openjpeg-2.5.3-h5fbd93e_0.conda sha256: 5bee706ea5ba453ed7fd9da7da8380dd88b865c8d30b5aaec14d2b6dd32dbc39 md5: 9e5816bc95d285c115a3ebc2f8563564 @@ -2789,6 +4255,18 @@ packages: license_family: MOZILLA size: 41075 timestamp: 1733233471940 +- conda: https://conda.anaconda.org/conda-forge/linux-64/pcre2-10.45-hc749103_0.conda + sha256: 27c4014f616326240dcce17b5f3baca3953b6bc5f245ceb49c3fa1e6320571eb + md5: b90bece58b4c2bf25969b70f3be42d25 + depends: + - __glibc >=2.17,<3.0.a0 + - bzip2 >=1.0.8,<2.0a0 + - libgcc >=13 + - libzlib >=1.3.1,<2.0a0 + license: BSD-3-Clause + license_family: BSD + size: 1197308 + timestamp: 1745955064657 - conda: https://conda.anaconda.org/conda-forge/linux-64/pillow-11.2.1-py312h80c1187_0.conda sha256: 15f32ec89f3a7104fcb190546a2bc0fc279372d9073e5ec08a8d61a1c79af4c0 md5: ca438bf57e4f2423d261987fe423a0dd @@ -2923,24 
+4401,27 @@ packages: license_family: APACHE size: 25757 timestamp: 1746001175919 -- conda: https://conda.anaconda.org/conda-forge/linux-64/pyarrow-core-20.0.0-py312h01725c0_0_cpu.conda - sha256: afd636ecaea60e1ebb422b1a3e5a5b8f6f28da3311b7079cbd5caa4464a50a48 - md5: 9b1b453cdb91a2f24fb0257bbec798af +- conda: https://conda.anaconda.org/conda-forge/linux-64/pyarrow-core-20.0.0-py312h09cf70e_0_cuda.conda + sha256: ee4dcca659fde10e8521628f889caf67e1a9e12097d5278fbf7887815354d1e9 + md5: 1f86f00a1c29703e6287e9413488e224 depends: + - __cuda >=11.8 - __glibc >=2.17,<3.0.a0 - - libarrow 20.0.0.* *cpu - - libgcc >=13 - - libstdcxx >=13 + - libarrow 20.0.0.* *cuda + - libgcc + - libgcc-ng >=12 + - libstdcxx + - libstdcxx-ng >=12 - libzlib >=1.3.1,<2.0a0 - python >=3.12,<3.13.0a0 - python_abi 3.12.* *_cp312 constrains: - - apache-arrow-proc * cpu + - apache-arrow-proc * cuda - numpy >=1.21,<3 license: Apache-2.0 license_family: APACHE - size: 4658639 - timestamp: 1746000738593 + size: 4702349 + timestamp: 1746001105856 - conda: https://conda.anaconda.org/conda-forge/noarch/pybind11-2.13.6-pyhc790b64_3.conda sha256: d429f6f255fbe49f09b9ae1377aa8cbc4d9285b8b220c17ae2ad9c4894c91317 md5: 1594696beebf1ecb6d29a1136f859a74 @@ -3004,27 +4485,26 @@ packages: license_family: MIT size: 1890081 timestamp: 1746625309715 -- conda: https://conda.anaconda.org/conda-forge/noarch/pydantic-settings-2.9.1-pyh3cfb1c2_0.conda - sha256: ea2f1027218e83e484fd581933e0ce60b9194486c56c98053b4277b0fb291364 - md5: 29dd5c4ece2497b75b4050ec3c8d4044 +- conda: https://conda.anaconda.org/conda-forge/noarch/pydantic-settings-2.10.1-pyh3cfb1c2_0.conda + sha256: e56b9a0320e3cab58b88f62ccdcd4bf7cd89ec348c878e1843d4d22315bfced1 + md5: a5f9c3e867917c62d796c20dba792cbd depends: - pydantic >=2.7.0 - python >=3.9 - python-dotenv >=0.21.0 - typing-inspection >=0.4.0 license: MIT - license_family: MIT - size: 38135 - timestamp: 1745015303766 -- conda: 
https://conda.anaconda.org/conda-forge/noarch/pygments-2.19.1-pyhd8ed1ab_0.conda - sha256: 28a3e3161390a9d23bc02b4419448f8d27679d9e2c250e29849e37749c8de86b - md5: 232fb4577b6687b2d503ef8e254270c9 + size: 38816 + timestamp: 1750801673349 +- conda: https://conda.anaconda.org/conda-forge/noarch/pygments-2.19.2-pyhd8ed1ab_0.conda + sha256: 5577623b9f6685ece2697c6eb7511b4c9ac5fb607c9babc2646c811b428fd46a + md5: 6b6ece66ebcae2d5f326c77ef2c5a066 depends: - python >=3.9 license: BSD-2-Clause license_family: BSD - size: 888600 - timestamp: 1736243563082 + size: 889287 + timestamp: 1750615908735 - conda: https://conda.anaconda.org/conda-forge/linux-64/pyinstrument-5.0.2-py312h66e93f0_0.conda sha256: 5c3a01095cb3dbb2acc6f218ba8c16cf6cee821220e985d92fa5e5e9f84aa2d5 md5: ba356edae556ad919cfd14ea03fa4d4a @@ -3095,16 +4575,15 @@ packages: license_family: APACHE size: 222505 timestamp: 1733215763718 -- conda: https://conda.anaconda.org/conda-forge/noarch/python-dotenv-1.1.0-pyh29332c3_1.conda - sha256: 7d927317003544049c97e7108e8ca5f2be5ff0ea954f5c84c8bbeb243b663fc8 - md5: 27d816c6981a8d50090537b761de80f4 +- conda: https://conda.anaconda.org/conda-forge/noarch/python-dotenv-1.1.1-pyhe01879c_0.conda + sha256: 9a90570085bedf4c6514bcd575456652c47918ff3d7b383349e26192a4805cc8 + md5: a245b3c04afa11e2e52a0db91550da7c depends: - python >=3.9 - python license: BSD-3-Clause - license_family: BSD - size: 25557 - timestamp: 1742948348635 + size: 26031 + timestamp: 1750789290754 - conda: https://conda.anaconda.org/conda-forge/noarch/python-gil-3.12.11-hd8ed1ab_0.conda sha256: b8afeaefe409d61fa4b68513b25a66bb17f3ca430d67cfea51083c7bfbe098ef md5: 859c6bec94cd74119f12b961aba965a8 @@ -3164,9 +4643,9 @@ packages: license_family: BSD size: 6971 timestamp: 1745258861359 -- conda: https://conda.anaconda.org/conda-forge/linux-64/pytorch-2.7.0-cpu_mkl_py312_h6a7998d_100.conda - sha256: 5c4a340f7a729bcdc19c530b25ed71ed5239f5ad0e907c49f03d88efd5b3be75 - md5: c67501107a48c049f18e8cb7c7e800b2 +- conda: 
https://conda.anaconda.org/conda-forge/linux-64/pytorch-2.7.1-cpu_mkl_py312_h6a7998d_100.conda + sha256: 102df2421e1c6ee63b8f96dd763aa516760480135ec27b31adb2eb91c4c388f3 + md5: a3d2654966425bb60085bb59d9d9d27a depends: - __glibc >=2.17,<3.0.a0 - _openmp_mutex * *_llvm @@ -3181,10 +4660,10 @@ packages: - libgcc >=13 - libprotobuf >=5.29.3,<5.29.4.0a0 - libstdcxx >=13 - - libtorch 2.7.0 cpu_mkl_hf6ddc5a_100 - - libuv >=1.50.0,<2.0a0 + - libtorch 2.7.1 cpu_mkl_hb1c5dc7_100 + - libuv >=1.51.0,<2.0a0 - libzlib >=1.3.1,<2.0a0 - - llvm-openmp >=20.1.4 + - llvm-openmp >=20.1.7 - mkl >=2024.2.2,<2025.0a0 - networkx - numpy >=1.19,<3 @@ -3192,17 +4671,17 @@ packages: - pybind11 - python >=3.12,<3.13.0a0 - python_abi 3.12.* *_cp312 - - setuptools <76 + - setuptools - sleef >=3.8,<4.0a0 - sympy >=1.13.3 - typing_extensions >=4.10.0 constrains: - pytorch-gpu <0.0a0 - - pytorch-cpu 2.7.0 + - pytorch-cpu 2.7.1 license: BSD-3-Clause license_family: BSD - size: 28982129 - timestamp: 1746260259104 + size: 28938244 + timestamp: 1750208726017 - conda: https://conda.anaconda.org/conda-forge/noarch/pytz-2025.2-pyhd8ed1ab_0.conda sha256: 8d2a8bf110cc1fc3df6904091dead158ba3e614d8402a83e51ed3a8aa93cdeb0 md5: bc8e3267d44011051f2eb14d22fb0960 @@ -3240,15 +4719,14 @@ packages: license_family: BSD size: 378610 timestamp: 1749898590652 -- conda: https://conda.anaconda.org/conda-forge/linux-64/re2-2024.07.02-h9925aae_3.conda - sha256: 66d34e3b4881f856486d11914392c585713100ca547ccfc0947f3a4765c2c486 - md5: 6f445fb139c356f903746b2b91bbe786 +- conda: https://conda.anaconda.org/conda-forge/linux-64/re2-2025.06.26-h9925aae_0.conda + sha256: 7a0b82cb162229e905f500f18e32118ef581e1fd182036f3298510b8e8663134 + md5: 2b4249747a9091608dbff2bd22afde44 depends: - - libre2-11 2024.07.02 hba17884_3 + - libre2-11 2025.06.26 hba17884_0 license: BSD-3-Clause - license_family: BSD - size: 26811 - timestamp: 1741121137599 + size: 27330 + timestamp: 1751053087063 - conda: 
https://conda.anaconda.org/conda-forge/linux-64/readline-8.2-h8c095d6_2.conda sha256: 2d6d0c026902561ed77cd646b5021aef2d4db22e57a5b0178dfc669231e06d2c md5: 283b96675859b20a825f8fa30f311446 @@ -3412,15 +4890,15 @@ packages: license_family: MIT size: 13828 timestamp: 1740445106343 -- conda: https://conda.anaconda.org/conda-forge/noarch/setuptools-75.8.2-pyhff2d567_0.conda - sha256: 91d664ace7c22e787775069418daa9f232ee8bafdd0a6a080a5ed2395a6fa6b2 - md5: 9bddfdbf4e061821a1a443f93223be61 +- conda: https://conda.anaconda.org/conda-forge/noarch/setuptools-80.9.0-pyhff2d567_0.conda + sha256: 972560fcf9657058e3e1f97186cc94389144b46dbdf58c807ce62e83f977e863 + md5: 4de79c071274a53dcaf2a8c749d1499e depends: - python >=3.9 license: MIT license_family: MIT - size: 777736 - timestamp: 1740654030775 + size: 748788 + timestamp: 1748804951958 - conda: https://conda.anaconda.org/conda-forge/noarch/shellingham-1.5.4-pyhd8ed1ab_1.conda sha256: 0557c090913aa63cdbe821dbdfa038a321b488e22bc80196c4b3b1aace4914ef md5: 7c3c2a0f3ebdea2bbc35538d162b43bf @@ -3506,6 +4984,16 @@ packages: license_family: BSD size: 4616621 timestamp: 1745946173026 +- conda: https://conda.anaconda.org/conda-forge/noarch/sysroot_linux-64-2.17-h0157908_18.conda + sha256: 69ab5804bdd2e8e493d5709eebff382a72fab3e9af6adf93a237ccf8f7dbd624 + md5: 460eba7851277ec1fd80a1a24080787a + depends: + - kernel-headers_linux-64 3.10.0 he073ed8_18 + - tzdata + license: LGPL-2.0-or-later AND LGPL-2.0-or-later WITH exceptions AND GPL-2.0-or-later AND MPL-2.0 + license_family: GPL + size: 15166921 + timestamp: 1735290488259 - conda: https://conda.anaconda.org/conda-forge/noarch/taskgroup-0.2.2-pyhd8ed1ab_0.conda sha256: 6f8db6da8de445930de55b708e6a5d3ab5f076bc14a39578db0190b2a9b8e437 md5: 9fa69537fb68a095fbac139210575bad @@ -3557,23 +5045,35 @@ packages: license_family: BSD size: 3285204 timestamp: 1748387766691 -- conda: https://conda.anaconda.org/conda-forge/linux-64/tokenizers-0.21.1-py312h8360d73_0.conda - sha256: 
b6e519f831a2ae4a54dc7df450ef1f5a87e380e786b2231f9e4d6e69976c290c - md5: 503add80c04b5c982ab4c5bf66a6e1d8 +- conda: https://conda.anaconda.org/conda-forge/linux-64/tokenizers-0.21.2-py312h8360d73_0.conda + sha256: a54dcbed5910e0e94f7d14ec4dd0cf137a835a8c069846a9f3fc638d76a8fe52 + md5: f311d7f63df2ab7069a98f5a89f9d358 depends: - __glibc >=2.17,<3.0.a0 - huggingface_hub >=0.16.4,<1.0 - libgcc >=13 - libstdcxx >=13 - - openssl >=3.4.1,<4.0a0 + - openssl >=3.5.0,<4.0a0 - python >=3.12,<3.13.0a0 - python_abi 3.12.* *_cp312 constrains: - __glibc >=2.17 license: Apache-2.0 license_family: APACHE - size: 2314369 - timestamp: 1741890584674 + size: 2374175 + timestamp: 1750798318498 +- conda: https://conda.anaconda.org/conda-forge/linux-64/tornado-6.5.1-py312h66e93f0_0.conda + sha256: c96be4c8bca2431d7ad7379bad94ed6d4d25cd725ae345540a531d9e26e148c9 + md5: c532a6ee766bed75c4fa0c39e959d132 + depends: + - __glibc >=2.17,<3.0.a0 + - libgcc >=13 + - python >=3.12,<3.13.0a0 + - python_abi 3.12.* *_cp312 + license: Apache-2.0 + license_family: Apache + size: 850902 + timestamp: 1748003427956 - conda: https://conda.anaconda.org/conda-forge/noarch/tqdm-4.67.1-pyhd8ed1ab_1.conda sha256: 11e2c85468ae9902d24a27137b6b39b4a78099806e551d390e394a8c34b48e40 md5: 9efbfdc37242619130ea42b1cc4ed861 @@ -3583,9 +5083,18 @@ packages: license: MPL-2.0 or MIT size: 89498 timestamp: 1735661472632 -- conda: https://conda.anaconda.org/conda-forge/noarch/transformers-4.52.4-pyhd8ed1ab_0.conda - sha256: 1cbe19a01c01eae95c1ee55e5aa2dd5dc59fb0f5d34621c29c2c4a5e806156ac - md5: 9fa4ceae2fb9e7ec0732f5a756f89957 +- conda: https://conda.anaconda.org/conda-forge/noarch/traitlets-5.14.3-pyhd8ed1ab_1.conda + sha256: f39a5620c6e8e9e98357507262a7869de2ae8cc07da8b7f84e517c9fd6c2b959 + md5: 019a7385be9af33791c989871317e1ed + depends: + - python >=3.9 + license: BSD-3-Clause + license_family: BSD + size: 110051 + timestamp: 1733367480074 +- conda: 
https://conda.anaconda.org/conda-forge/noarch/transformers-4.53.0-pyhd8ed1ab_0.conda + sha256: d3526ea2617dc8650a2c7fd01d7568cda7a709472eb6881e08a4d8e4d68124db + md5: 42c5cc096057a22b882b8fa92c5e8883 depends: - datasets !=2.5.0 - filelock @@ -3601,11 +5110,11 @@ packages: - tqdm >=4.27 license: Apache-2.0 license_family: APACHE - size: 3744686 - timestamp: 1748608107918 -- conda: https://conda.anaconda.org/conda-forge/linux-64/triton-3.3.1-cuda126py312hebffaa9_0.conda - sha256: 33cce0f856d6ff2702354bdab32011006b7cadc380946071d1f822288df1b3ca - md5: 12d9c45edb9b870ef8222dc5b3544892 + size: 3916160 + timestamp: 1750964780590 +- conda: https://conda.anaconda.org/conda-forge/linux-64/triton-3.3.1-cuda126py312hebffaa9_1.conda + sha256: e87c11148ae599bf195b8f5d670ac63b7902b0de3b8c64b73450fb0f388bfc97 + md5: 8131fb1ca6f47e8b3639af406413dc4a depends: - python - setuptools @@ -3617,14 +5126,14 @@ packages: - libstdcxx >=13 - libgcc >=13 - cuda-version >=12.6,<13 - - python_abi 3.12.* *_cp312 - libzlib >=1.3.1,<2.0a0 - cuda-cupti >=12.6.80,<13.0a0 + - python_abi 3.12.* *_cp312 - zstd >=1.5.7,<1.6.0a0 license: MIT license_family: MIT - size: 163152560 - timestamp: 1748728316330 + size: 163152313 + timestamp: 1750544259463 - conda: https://conda.anaconda.org/conda-forge/noarch/typer-0.16.0-pyh167b9f4_0.conda sha256: 1ca70f0c0188598f9425a947afb74914a068bee4b7c4586eabb1c3b02fbf669f md5: 985cc086b73bda52b2f8d66dcda460a1 @@ -3698,9 +5207,9 @@ packages: license: LicenseRef-Public-Domain size: 122968 timestamp: 1742727099393 -- conda: https://conda.anaconda.org/conda-forge/noarch/urllib3-2.4.0-pyhd8ed1ab_0.conda - sha256: a25403b76f7f03ca1a906e1ef0f88521edded991b9897e7fed56a3e334b3db8c - md5: c1e349028e0052c4eea844e94f773065 +- conda: https://conda.anaconda.org/conda-forge/noarch/urllib3-2.5.0-pyhd8ed1ab_0.conda + sha256: 4fb9789154bd666ca74e428d973df81087a697dbb987775bc3198d2215f240f8 + md5: 436c165519e140cb08d246a4472a9d6a depends: - brotli-python >=1.0.9 - h2 >=4,<5 @@ -3709,8 
+5218,8 @@ packages: - zstandard >=0.18.0 license: MIT license_family: MIT - size: 100791 - timestamp: 1744323705540 + size: 101735 + timestamp: 1750271478254 - conda: https://conda.anaconda.org/conda-forge/noarch/uvicorn-0.34.3-pyh31011fe_0.conda sha256: 4edebc7b6b96ebf92db8b5488c4b39594982eab79db44b267d8a3502e12b051b md5: 1520c1396715d45d02f5aa045854a65c @@ -3811,6 +5320,28 @@ packages: license: Apache-2.0 AND BSD-4-Clause size: 6644883 timestamp: 1748972753695 +- conda: https://conda.anaconda.org/conda-forge/linux-64/xkeyboard-config-2.45-hb9d3cd8_0.conda + sha256: a5d4af601f71805ec67403406e147c48d6bad7aaeae92b0622b7e2396842d3fe + md5: 397a013c2dc5145a70737871aaa87e98 + depends: + - __glibc >=2.17,<3.0.a0 + - libgcc >=13 + - xorg-libx11 >=1.8.12,<2.0a0 + license: MIT + license_family: MIT + size: 392406 + timestamp: 1749375847832 +- conda: https://conda.anaconda.org/conda-forge/linux-64/xorg-libx11-1.8.12-h4f16b4b_0.conda + sha256: 51909270b1a6c5474ed3978628b341b4d4472cd22610e5f22b506855a5e20f67 + md5: db038ce880f100acc74dba10302b5630 + depends: + - __glibc >=2.17,<3.0.a0 + - libgcc >=13 + - libxcb >=1.17.0,<2.0a0 + license: MIT + license_family: MIT + size: 835896 + timestamp: 1741901112627 - conda: https://conda.anaconda.org/conda-forge/linux-64/xorg-libxau-1.0.12-hb9d3cd8_0.conda sha256: ed10c9283974d311855ae08a16dfd7e56241fac632aec3b92e3cfe73cff31038 md5: f6ebe2cb3f82ba6c057dde5d9debe4f7 diff --git a/pixi.toml b/pixi.toml index 8f16304..48e015d 100644 --- a/pixi.toml +++ b/pixi.toml @@ -4,6 +4,20 @@ name = "qlabs" platforms = ["linux-64"] version = "0.1.0" +[feature.cuda] +system-requirements = { cuda = "12" } +channels = ["nvidia"] + +[feature.cuda.dependencies] +cuda-toolkit = "12.*" # for compute-sanitizer etc. 
+ +[dependencies] +modular = ">=25.5.0.dev2025062705,<26" +max = ">=25.5.0.dev2025062405,<26" + +[environments] +cuda = { features = ["cuda"] } +default = { features = ["cuda"] } [tasks.clear] # Clear the terminal cmd = "clear" @@ -11,9 +25,21 @@ cmd = "clear" [tasks.lint] # Check code formatting using an external script cmd = ".github/scripts/check-format.sh" -[tasks.format] # Format the code -cmd = "pixi run mojo format ./src ./tests" -inputs = ["./examples/**/*.mojo", "./src/**/*.mojo", "./tests/**/*.mojo"] +[tasks.format_examples] # Format the examples code +cmd = "pixi run mojo format ./examples" +inputs = ["./examples/**/*.mojo"] + +[tasks.format_src] # Format the src code +cmd = "pixi run mojo format ./src" +inputs = ["./src/**/*.mojo"] + +[tasks.format_tests] # Format the tests code +cmd = "pixi run mojo format ./tests" +inputs = ["./tests/**/*.mojo"] + +[tasks.format_benchmarks] # Format the benchmarks code +cmd = "pixi run mojo format ./benchmarks" +inputs = ["./benchmarks/**/*.mojo"] [tasks.create_build_dir] cmd = "mkdir -p build/" @@ -22,8 +48,9 @@ cmd = "mkdir -p build/" args = [ { "arg" = "full_file_path", "default" = "examples/main.mojo" }, { "arg" = "executable_name", "default" = "main" }, + { "arg" = "additional_input", "default" = "pixi.toml" }, # placeholder to prevent freeze ] -inputs = ["{{ full_file_path }}", "./src/**/*.mojo"] +inputs = ["./src/**/*.mojo", "{{ full_file_path }}", "{{ additional_input }}"] outputs = ["build/{{ executable_name }}"] cmd = "pixi run mojo build {{ full_file_path }} -o {{ executable_name }} && cp {{ executable_name }} build/{{ executable_name }} && rm {{ executable_name }}" depends-on = ["create_build_dir"] @@ -32,13 +59,13 @@ depends-on = ["create_build_dir"] cmd = "mkdir -p build/ && pixi run mojo package src -o qlabs.mojopkg && cp qlabs.mojopkg build/ && rm qlabs.mojopkg" inputs = ["./src/**/*.mojo"] outputs = ["build/qlabs.mojopkg"] -depends-on = ["create_build_dir", "format"] +depends-on = ["create_build_dir", 
"format_src"] [tasks.clean] # Clean the package files and Build directory -cmd = "rm build/* && rmdir build/ && rm examples/qlabs.mojopkg && rm tests/qlabs.mojopkg" +cmd = "rm build/* && rmdir build/ && rm examples/qlabs.mojopkg && rm tests/qlabs.mojopkg && rm benchmarks/qlabs.mojopkg" [tasks.install] # Install the package in the necessary directories -cmd = "cp build/qlabs.mojopkg examples/qlabs.mojopkg && cp build/qlabs.mojopkg tests/qlabs.mojopkg" +cmd = "cp build/qlabs.mojopkg examples/qlabs.mojopkg && cp build/qlabs.mojopkg tests/qlabs.mojopkg && cp build/qlabs.mojopkg benchmarks/qlabs.mojopkg" inputs = ["build/qlabs.mojopkg"] outputs = ["examples/qlabs.mojopkg", "tests/qlabs.mojopkg"] depends-on = ["package"] @@ -47,15 +74,30 @@ depends-on = ["package"] cmd = "./build/main" depends-on = [ "install", + "format_examples", { "task" = "build", "args" = [ "examples/main.mojo", "main", + "./examples/**/*.mojo", ] }, ] [tasks.test] # Unitary Tests (uses the mojo testing tool) cmd = "pixi run mojo test tests" -depends-on = ["install"] +depends-on = ["install", "format_tests"] + +[tasks.bench] # Run all benchmarks +cmd = "./build/all_benchmarks" +depends-on = [ + "install", + "format_benchmarks", + { "task" = "build", "args" = [ + "benchmarks/all_benchmarks.mojo", + "all_benchmarks", + "./benchmarks/**/*.mojo", + ] }, +] + # # Benches # bench_decimal = "clear && pixi run package && cd benches/decimal && pixi run mojo -I ../ bench.mojo && cd ../.. 
&& pixi run clean" @@ -74,6 +116,3 @@ tests = [{ task = "test" }] p = [{ task = "clear" }, { task = "package" }] m = [{ task = "clear" }, { task = "main" }] # t = "clear && pixi run package && pixi run mojo test tests --filter" - -[dependencies] -modular = ">=25.5.0.dev2025062405,<26" diff --git a/src/abstractions/simulator.mojo b/src/abstractions/simulator.mojo index e59aed2..55d6405 100644 --- a/src/abstractions/simulator.mojo +++ b/src/abstractions/simulator.mojo @@ -9,7 +9,7 @@ from ..base.qubits_operations import ( ) from ..base.state_and_matrix import ( - PureBasisState, + StateVector, ) from ..base.gate import _START, _SEPARATOR, SWAP @@ -40,9 +40,9 @@ struct StateVectorSimulator(Copyable, Movable): """The quantum circuit containing the gates to be applied.""" var original_circuit: GateCircuit """The original circuit before any modifications, used for resetting the simulator.""" - var initial_state: PureBasisState + var initial_state: StateVector """The initial state of the quantum system before any gates are applied.""" - var original_initial_state: PureBasisState + var original_initial_state: StateVector """The original initial state before any modifications, used for resetting the simulator.""" var optimisation_level: Int """The level of optimisation to apply during simulation, affecting performance and accuracy.""" @@ -55,9 +55,9 @@ struct StateVectorSimulator(Copyable, Movable): fn __init__( out self, owned circuit: GateCircuit, - # initial_state: Optional[PureBasisState] = None, # TODO ask how to use that with return of next_layer() + # initial_state: Optional[StateVector] = None, # TODO ask how to use that with return of next_layer() # initial_state: __type_of(Self.initial_state), # doesn't work - initial_state: PureBasisState, + initial_state: StateVector, optimisation_level: Int = 0, verbose: Bool = False, verbose_step_size: String = "ShowOnlyEnd", @@ -71,7 +71,7 @@ struct StateVectorSimulator(Copyable, Movable): verbose: Whether to print verbose 
output during simulation steps. verbose_step_size: The verbosity level for simulation output. """ - # new_initial_state = initial_state.or_else(PureBasisState.from_bitstring("0" * circuit.num_qubits)) + # new_initial_state = initial_state.or_else(StateVector.from_bitstring("0" * circuit.num_qubits)) new_initial_state = initial_state self.circuit = circuit self.original_circuit = circuit @@ -84,7 +84,7 @@ struct StateVectorSimulator(Copyable, Movable): @always_inline fn next_gate( mut self, - owned quantum_state: PureBasisState, + owned quantum_state: StateVector, ) -> __type_of(quantum_state): """Applies the next gate in the circuit to the quantum state. @@ -132,7 +132,7 @@ struct StateVectorSimulator(Copyable, Movable): fn next_layer( self, - quantum_state: PureBasisState, + quantum_state: StateVector, ) -> (Self, __type_of(quantum_state)): """Applies the next layer of gates in the circuit to the quantum state. @@ -185,11 +185,11 @@ struct StateVectorSimulator(Copyable, Movable): fn next_block( self, - quantum_state: PureBasisState, + quantum_state: StateVector, ) -> (Self, __type_of(quantum_state)): return self.next_layer(quantum_state) # For now, treat blocks as layers - fn run(self) -> PureBasisState: + fn run(self) -> StateVector: """Runs the quantum circuit simulation. Applies all gates in sequence to the initial state and computes the @@ -197,7 +197,7 @@ struct StateVectorSimulator(Copyable, Movable): the specified verbosity level. Returns: - The final `PureBasisState` after all gates have been applied. + The final `StateVector` after all gates have been applied. 
""" if self.verbose: print( @@ -207,7 +207,7 @@ struct StateVectorSimulator(Copyable, Movable): print("Initial state:\n", self.initial_state) # Start with the initial state - quantum_state: PureBasisState = self.initial_state + quantum_state: StateVector = self.initial_state i: Int = 0 layer_index: Int = 0 for gate in self.circuit.gates: # Iterate over the gates in the circuit diff --git a/src/base/__init__.mojo b/src/base/__init__.mojo index 6ce2d7c..eecf74b 100644 --- a/src/base/__init__.mojo +++ b/src/base/__init__.mojo @@ -1,6 +1,7 @@ -from .state_and_matrix import PureBasisState, ComplexMatrix +from .state_and_matrix import StateVector, ComplexMatrix from .gate import ( + Identity, Gate, Hadamard, PauliX, diff --git a/src/base/gate.mojo b/src/base/gate.mojo index d02e6af..43dfe33 100644 --- a/src/base/gate.mojo +++ b/src/base/gate.mojo @@ -14,6 +14,17 @@ from .state_and_matrix import ( # MARK: Aliases # # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # +alias Identity = Gate( + size=2, + matrix=ComplexMatrix( + List[List[ComplexFloat64]]( + [ComplexFloat64(1, 0), ComplexFloat64(0, 0)], + [ComplexFloat64(0, 0), ComplexFloat64(1, 0)], + ) + ), + symbol="I", +) + alias Hadamard = Gate( size=2, matrix=ComplexMatrix( diff --git a/src/base/gpu/__init__.mojo b/src/base/gpu/__init__.mojo new file mode 100644 index 0000000..b3d16df --- /dev/null +++ b/src/base/gpu/__init__.mojo @@ -0,0 +1,3 @@ +from .qubits_operations import ( + qubit_wise_multiply_gpu, +) diff --git a/src/base/gpu/qubits_operations.mojo b/src/base/gpu/qubits_operations.mojo new file mode 100644 index 0000000..5cdfabf --- /dev/null +++ b/src/base/gpu/qubits_operations.mojo @@ -0,0 +1,124 @@ +from bit import count_trailing_zeros + +from gpu import thread_idx, block_dim, block_idx +from gpu.host import DeviceContext +from layout import Layout, LayoutTensor + +alias SIZE = 2 +alias BLOCKS_PER_GRID = 1 +alias THREADS_PER_BLOCK = (3, 3) +alias dtype = DType.float32 +alias layout = Layout.row_major(SIZE, SIZE) + +alias 
gate_1qubit_layout = Layout.row_major(2, 2) +alias state_vector_3qubits_layout = Layout.row_major(8, 1) + + +fn qubit_wise_multiply_gpu( + # Use SIMD instead + gate_re: LayoutTensor[mut=True, dtype, gate_1qubit_layout], + gate_im: LayoutTensor[mut=True, dtype, gate_1qubit_layout], + gate_size: Int, + target_qubit: Int, + quantum_state_re: LayoutTensor[ + mut=True, dtype, state_vector_3qubits_layout + ], + quantum_state_im: LayoutTensor[ + mut=True, dtype, state_vector_3qubits_layout + ], + number_qubits: Int, + quantum_state_size: Int, + quantum_state_out_re: LayoutTensor[ + mut=True, dtype, state_vector_3qubits_layout + ], + quantum_state_out_im: LayoutTensor[ + mut=True, dtype, state_vector_3qubits_layout + ], + control_bits: List[List[Int]] = [], +) -> None: + """Applies a quantum gate to specific qubits in the quantum state. + + It will apply the gate starting from the target qubit assuming that the other + qubits that the gate acts on are following the target qubit. + + Args: + gate_re: Real part of the gate matrix. + gate_im: Imaginary part of the gate matrix. + gate_size: Size of the gate (2^n, where n is the number of qubits the gate acts on). + target_qubit: The index of the target qubit to apply the gate to. + quantum_state_re: Real part of the quantum state vector. + quantum_state_im: Imaginary part of the quantum state vector. + number_qubits: Total number of qubits in the quantum state. + quantum_state_size: Size of the quantum state vector (2^number_qubits). + quantum_state_out_re: Output real part of the quantum state vector after applying the gate. + quantum_state_out_im: Output imaginary part of the quantum state vector after applying the gate. + control_bits: List of control bits, where each control bit is a list containing + [wire_index, flag] (1 for control, 0 for anti-control). 
+ """ + target_qubits_count: Int = count_trailing_zeros(gate_size) + if (target_qubit < 0) or (target_qubit >= number_qubits): + print( + "Error: target_qubit index out of bounds. Must be between 0 and", + number_qubits - 1, + ) + print("Skipping gate application.") + return + + inclusion_mask: Int = 0 + desired_value_mask: Int = 0 + for control in control_bits: + wire_index, flag = control[0], control[1] + bit: Int = 1 << wire_index # efficient way of computing 2^wire_index + inclusion_mask |= bit # turn on the bit + if flag == 1: + desired_value_mask |= bit # turn on the bit + + size_of_state_vector: Int = quantum_state_size + size_of_half_block: Int = 1 << target_qubit # 2^target_qubit + size_of_block: Int = size_of_half_block << target_qubits_count + + # copies all amplitudes from quantum_state to quantum_state_out + for i in range(size_of_state_vector): + quantum_state_out_re[i] = quantum_state_re[i] + quantum_state_out_im[i] = quantum_state_im[i] + + for block_start in range(0, size_of_state_vector, size_of_block): + for offset in range(size_of_half_block): + i1: Int = ( + block_start | offset + ) # faster than, but equivalent to, block_start + offset + + if (i1 & inclusion_mask) != desired_value_mask: + continue # skip this iteration if the control bits do not match + + i2: Int = ( + i1 | size_of_half_block + ) # equivalent to i1 + size_of_half_block + + quantum_state_out_re[i1] = ( + (gate_re[0, 0] * quantum_state_re[i1]) + - (gate_im[0, 0] * quantum_state_im[i1]) + + (gate_re[0, 1] * quantum_state_re[i2]) + - (gate_im[0, 1] * quantum_state_im[i2]) + ) + + quantum_state_out_im[i1] = ( + (gate_re[0, 0] * quantum_state_im[i1]) + - (gate_im[0, 0] * quantum_state_re[i1]) + + (gate_re[0, 1] * quantum_state_im[i2]) + - (gate_im[0, 1] * quantum_state_re[i2]) + ) + + quantum_state_out_re[i2] = ( + (gate_re[1, 0] * quantum_state_re[i1]) + - (gate_im[1, 0] * quantum_state_im[i1]) + + (gate_re[1, 1] * quantum_state_re[i2]) + - (gate_im[1, 1] * quantum_state_im[i2]) + 
) + + quantum_state_out_im[i2] = ( + (gate_re[1, 0] * quantum_state_im[i1]) + - (gate_im[1, 0] * quantum_state_re[i1]) + + (gate_re[1, 1] * quantum_state_im[i2]) + - (gate_im[1, 1] * quantum_state_re[i2]) + ) diff --git a/src/base/qubits_operations.mojo b/src/base/qubits_operations.mojo index e530b36..f39e221 100644 --- a/src/base/qubits_operations.mojo +++ b/src/base/qubits_operations.mojo @@ -10,22 +10,52 @@ from bit import count_trailing_zeros +from ..base.gate import ( + Identity, + Gate, +) + from .state_and_matrix import ( - PureBasisState, + StateVector, ComplexMatrix, ) + # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # # MARK: Functions # # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # +# fn educational_qubit_wise_multiply( +# gate: ComplexMatrix, +# target_qubit: Int, +# owned quantum_state: StateVector, +# control_bits: List[List[Int]] = [], +# ) -> __type_of(quantum_state): +# """Applies a quantum gate to specific qubits in the quantum state. + +# It will apply the gate starting from the target qubit assuming that the other +# qubits that the gate acts on are following the target qubit. + +# Args: +# gate: The 2x2 matrix representing the quantum gate. +# target_qubit: The index of the qubit on which the gate is applied. +# quantum_state: The current state of the quantum system. + +# Returns: +# A new StateVector with the gate applied. +# """ +# target_qubits_count: Int = count_trailing_zeros(gate_size) +# system_number_qubits: Int = quantum_state.number_qubits() + +# # Apply the tensor product with Identity gates until the + @always_inline fn qubit_wise_multiply_extended( target_qubits_count: Int, gate: ComplexMatrix, target_qubits: List[Int], - owned quantum_state: PureBasisState, + owned quantum_state: StateVector, control_bits: List[List[Int]] = [], ) -> __type_of(quantum_state): """Applies a quantum gate to multiple qubits in the quantum state. @@ -43,7 +73,7 @@ fn qubit_wise_multiply_extended( it is an anti-control bit. 
Returns: - A new PureBasisState with the gate applied. + A new StateVector with the gate applied. """ if target_qubits_count == 1: return qubit_wise_multiply( @@ -79,9 +109,9 @@ fn qubit_wise_multiply_extended( @always_inline fn _apply_to_multi_qubits( - mut new_state_vector: PureBasisState, + mut new_state_vector: StateVector, gate: ComplexMatrix, - quantum_state: PureBasisState, + quantum_state: StateVector, size_of_state_vector: Int, size_of_block: Int, size_of_half_block: Int, @@ -92,8 +122,8 @@ fn _apply_to_multi_qubits( indexes: List[Int] = List[Int](capacity=gate.size()) # For Method 2: - # temp_vector_1 = PureBasisState(size=gate_size) - # temp_vector_2 = PureBasisState(size=gate_size) + # temp_vector_1 = StateVector(size=gate_size) + # temp_vector_2 = StateVector(size=gate_size) for block_start in range(0, size_of_state_vector, size_of_block): for offset in range(size_of_half_block): @@ -139,9 +169,9 @@ fn _apply_to_multi_qubits( @always_inline fn _apply_to_2_qubit( - mut new_state_vector: PureBasisState, + mut new_state_vector: StateVector, gate: ComplexMatrix, - quantum_state: PureBasisState, + quantum_state: StateVector, size_of_state_vector: Int, size_of_block: Int, size_of_half_block: Int, @@ -177,9 +207,9 @@ fn _apply_to_2_qubit( @always_inline fn _apply_to_1_qubit( - mut new_state_vector: PureBasisState, + mut new_state_vector: StateVector, gate: ComplexMatrix, - quantum_state: PureBasisState, + quantum_state: StateVector, size_of_state_vector: Int, size_of_block: Int, size_of_half_block: Int, @@ -210,7 +240,7 @@ fn _apply_to_1_qubit( fn qubit_wise_multiply( gate: ComplexMatrix, target_qubit: Int, - owned quantum_state: PureBasisState, + owned quantum_state: StateVector, control_bits: List[List[Int]] = [], ) -> __type_of(quantum_state): """Applies a quantum gate to specific qubits in the quantum state. @@ -227,7 +257,7 @@ fn qubit_wise_multiply( it is an anti-control bit. Returns: - A new PureBasisState with the gate applied. 
+ A new StateVector with the gate applied. """ gate_size: Int = gate.size() target_qubits_count: Int = count_trailing_zeros(gate_size) @@ -326,7 +356,7 @@ fn educational_apply_swap( num_qubits: Int, i: Int, j: Int, - quantum_state: PureBasisState, + quantum_state: StateVector, control_bits: List[List[Int]] = [], ) -> __type_of(quantum_state): """Applies a SWAP gate to two specific qubits in the quantum state. @@ -341,7 +371,7 @@ fn educational_apply_swap( If flag is 1, it is a control bit; if 0, it is an anti-control bit. Returns: - A new PureBasisState with the SWAP gate applied. + A new StateVector with the SWAP gate applied. """ new_state_vector = quantum_state # copies all amplitudes from quantum_state to new_state_vector @@ -378,7 +408,7 @@ fn apply_swap( num_qubits: Int, i: Int, j: Int, - quantum_state: PureBasisState, + quantum_state: StateVector, control_bits: List[List[Int]] = [], ) -> __type_of(quantum_state): """Applies a SWAP gate to two specific qubits in the quantum state. @@ -393,7 +423,7 @@ fn apply_swap( If flag is 1, it is a control bit; if 0, it is an anti-control bit. Returns: - A new PureBasisState with the SWAP gate applied. + A new StateVector with the SWAP gate applied. 
""" new_state_vector = quantum_state # copies all amplitudes from quantum_state to new_state_vector if i == j: @@ -460,9 +490,7 @@ fn _rearrange_bits( return return_value -fn educational_partial_trace[ - use_lookup_table: Bool = True -]( +fn educational_partial_trace( n: Int, input_matrix: ComplexMatrix, qubits_to_trace_out: List[Int], @@ -487,25 +515,6 @@ fn educational_partial_trace[ if not is_traced_out[i]: qubits_to_keep.append(i) - # is_traced_out: List[Bool] = [False] * n - # qubits_to_keep: List[Int] = [] - - # current_index: Int = 0 - # current_traced_index: Int = 0 - # for _ in range(n): - # if qubits_to_trace_out[current_traced_index] == current_index: - # is_traced_out[current_index] = True - # current_traced_index += 1 - # if current_traced_index >= len(qubits_to_trace_out): - # break - # else: - # qubits_to_keep.append(i) - # current_index += 1 - - # # Ensure all qubits have been added to qubits_to_keep - # for i in range(current_index, n): - # qubits_to_keep.append(i) - num_qubits_to_trace_out: Int = len(qubits_to_trace_out) num_qubits_to_keep: Int = len(qubits_to_keep) if num_qubits_to_trace_out + num_qubits_to_keep != n: @@ -520,46 +529,24 @@ fn educational_partial_trace[ # This is 2^num_qubits_to_keep == the dimension of the resulting matrix result_dimension: Int = 1 << num_qubits_to_keep - lookup_table: Dict[Int, Int] = {} - - @parameter - if use_lookup_table: - for tmp in range(result_dimension): - lookup_table[tmp] = _rearrange_bits(tmp, qubits_to_keep) - output_matrix: ComplexMatrix = ComplexMatrix( result_dimension, result_dimension ) for shared_bits in range( traced_dimension ): # bits common to input_row and input_col - - @parameter - if use_lookup_table: - shared_bits_rearranged: Int = lookup_table.get(shared_bits, 0) - else: - shared_bits_rearranged: Int = _rearrange_bits( - shared_bits, qubits_to_trace_out - ) + shared_bits_rearranged: Int = _rearrange_bits( + shared_bits, qubits_to_trace_out + ) for output_row in range(result_dimension): 
- - @parameter - if use_lookup_table: - input_row: Int = lookup_table.get(output_row, 0) - else: - input_row: Int = shared_bits_rearranged | _rearrange_bits( - output_row, qubits_to_keep - ) + input_row: Int = shared_bits_rearranged | _rearrange_bits( + output_row, qubits_to_keep + ) for output_col in range(result_dimension): - - @parameter - if use_lookup_table: - input_col: Int = lookup_table.get(output_col, 0) - else: - input_col: Int = shared_bits_rearranged | _rearrange_bits( - output_col, qubits_to_keep - ) + input_col: Int = shared_bits_rearranged | _rearrange_bits( + output_col, qubits_to_keep + ) output_matrix[output_row, output_col] += input_matrix[ input_row, input_col @@ -570,14 +557,11 @@ fn educational_partial_trace[ fn partial_trace[ use_lookup_table: Bool = True -]( - quantum_state: PureBasisState, - qubits_to_trace_out: List[Int], -) -> ComplexMatrix: +](quantum_state: StateVector, qubits_to_trace_out: List[Int],) -> ComplexMatrix: """Performs a partial trace over specified qubits in a quantum state. Args: - quantum_state: A PureBasisState representing the quantum state. + quantum_state: A StateVector representing the quantum state. qubits_to_trace_out: An array of indices of qubits to trace out, in ascending order and without duplicates. 
@@ -650,20 +634,17 @@ fn partial_trace[ for shared_bits in range( traced_dimension ): # bits common to input_row and input_col - - @parameter - if use_lookup_table: - shared_bits_rearranged: Int = lookup_table.get(shared_bits, 0) - else: - shared_bits_rearranged: Int = _rearrange_bits( - shared_bits, qubits_to_trace_out - ) + shared_bits_rearranged: Int = _rearrange_bits( + shared_bits, qubits_to_trace_out + ) for output_row in range(result_dimension): @parameter if use_lookup_table: - input_row: Int = lookup_table.get(output_row, 0) + input_row: Int = shared_bits_rearranged | lookup_table.get( + output_row, 0 + ) else: input_row: Int = shared_bits_rearranged | _rearrange_bits( output_row, qubits_to_keep @@ -672,7 +653,9 @@ fn partial_trace[ @parameter if use_lookup_table: - input_col: Int = lookup_table.get(output_col, 0) + input_col: Int = shared_bits_rearranged | lookup_table.get( + output_col, 0 + ) else: input_col: Int = shared_bits_rearranged | _rearrange_bits( output_col, qubits_to_keep diff --git a/src/base/state_and_matrix.mojo b/src/base/state_and_matrix.mojo index ff4defc..6205d45 100644 --- a/src/base/state_and_matrix.mojo +++ b/src/base/state_and_matrix.mojo @@ -2,9 +2,13 @@ # MARK: Imports # # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # +from collections.linked_list import LinkedList + from ..local_stdlib import CustomList from ..local_stdlib.complex import ComplexFloat64 +from .qubits_operations import partial_trace + # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # # MARK: Structs # @@ -12,8 +16,8 @@ from ..local_stdlib.complex import ComplexFloat64 @fieldwise_init -# struct PureBasisState[check_bounds:Bool = False](Copyable, Movable, Stringable, Writable): -struct PureBasisState(Copyable, Movable, Stringable, Writable): +# struct StateVector[check_bounds:Bool = False](Copyable, Movable, Stringable, Writable): +struct StateVector(Copyable, Movable, Stringable, Writable): """Represents a pure quantum state as a basis state in the computational basis. 
Uses a vector of complex numbers to represent the amplitudes of the basis states. @@ -27,7 +31,7 @@ struct PureBasisState(Copyable, Movable, Stringable, Writable): """The state vector representing the amplitudes of the basis states.""" fn __init__(out self, size: Int): - """Initializes a PureBasisState with the given size. + """Initializes a StateVector with the given size. Args: size: The size of the vector, which is 2^n for n qubits. @@ -53,9 +57,9 @@ struct PureBasisState(Copyable, Movable, Stringable, Writable): self.state_vector[index] = value fn __str__(self) -> String: - """Returns a beautifully formatted string representation of the PureBasisState. + """Returns a beautifully formatted string representation of the StateVector. """ - string: String = "PureBasisState:\n" + string: String = "StateVector:\n" for i in range(self.size()): amplitude = self.state_vector[i] # amplitude_str: String = String(amplitude) @@ -88,7 +92,7 @@ struct PureBasisState(Copyable, Movable, Stringable, Writable): @staticmethod fn from_bitstring(bitstring: String) -> Self: - """Returns a PureBasisState corresponding to the given bitstring. + """Returns a StateVector corresponding to the given bitstring. Params: bitstring: A string of '0's and '1's representing the state, with @@ -96,11 +100,11 @@ struct PureBasisState(Copyable, Movable, Stringable, Writable): Examples: ```mojo - state = PureBasisState.from_bitstring("110") + state = StateVector.from_bitstring("110") ``` Returns: - A PureBasisState object with the appropriate data initialized. + A StateVector object with the appropriate data initialized. """ num_qubits: Int = len(bitstring) @@ -173,7 +177,7 @@ struct PureBasisState(Copyable, Movable, Stringable, Writable): in the state vector. Returns: - A new PureBasisState with the conjugated amplitudes. + A new StateVector with the conjugated amplitudes. 
""" conjugated_state: Self = Self(self.num_qubits, self.state_vector.copy()) @@ -181,6 +185,10 @@ struct PureBasisState(Copyable, Movable, Stringable, Writable): conjugated_state.state_vector[i] = self.state_vector[i].conjugate() return conjugated_state + # @always_inline + # fn tensor_product() + + @always_inline fn to_density_matrix(self) -> ComplexMatrix: """Returns the density matrix of the pure state. @@ -199,6 +207,102 @@ struct PureBasisState(Copyable, Movable, Stringable, Writable): ) return density_matrix + # fn partial_trace[ + # use_lookup_table: Bool = True + # ](quantum_state: StateVector, qubits_to_trace_out: List[Int],) -> ComplexMatrix: + + # qubits_to_trace_out: An array of indices of qubits to trace out, in ascending order + # and without duplicates. + + @always_inline + fn purity(self, arg_qubits_to_keep: LinkedList[Int] = []) -> Float64: + """Calculates the purity of the pure state. + + The purity is defined as the trace of the density matrix squared. + + Args: + arg_qubits_to_keep: A list of qubit indices to keep in the state in stricly ascending order. + If empty, all qubits are kept. + + Returns: + The purity of the pure state, which should be 1 for a valid pure state. + """ + qubits_to_keep = arg_qubits_to_keep.copy() + + # Sanity check for ascending order and in range + for i in range(len(qubits_to_keep)): + qubit: Int = qubits_to_keep[i] + if qubit < 0 or qubit >= self.num_qubits: + print( + "Error: Qubit index", + qubit, + "is out of range for the number of qubits", + self.num_qubits, + ) + return 0.0 + if i > 0 and qubits_to_keep[i - 1] >= qubit: + print( + ( + "Error: Qubit indices must be in stricly ascending" + " order. 
Found " + ), + qubits_to_keep[i - 1], + "and", + qubit, + ) + return 0.0 + + # If no qubits to keep, keep all + if len(qubits_to_keep) == 0: + for i in range(self.num_qubits): + qubits_to_keep.append(i) + + qubits_to_trace_out: List[Int] = [] + current_qubit: Int = 0 + for i in range(self.num_qubits): + if ( + current_qubit < len(qubits_to_keep) + and qubits_to_keep[current_qubit] == i + ): + current_qubit += 1 # This qubit is kept, so skip it + else: + qubits_to_trace_out.append(i) # This qubit is traced out + + density_matrix: ComplexMatrix = partial_trace(self, qubits_to_trace_out) + + # density_matrix = self.to_density_matrix() + trace_squared: Float64 = 0.0 + + for i in range(density_matrix.size()): + trace_squared += density_matrix[i, i].squared_norm() + + return trace_squared + + @always_inline + fn normalised_purity(self, qubits_to_keep: LinkedList[Int] = []) -> Float64: + """Calculates the normalised purity of the pure state. + + For a density matrix of size 2n×2n, purity ranges from 1/2^n to 1. + The normalised purity is defined as the purity divided by the number of qubits. + + Returns: + The normalised purity of the pure state. + """ + return (self.size() * self.purity(qubits_to_keep) - 1) / ( + self.size() - 1 + ) + + @always_inline + fn linear_entropy(self, qubits_to_keep: LinkedList[Int] = []) -> Float64: + """Calculates the linear entropy of the pure state. + + The linear entropy is defined as 1 - purity. + + Returns: + The linear entropy of the pure state. + """ + return 1.0 - self.purity(qubits_to_keep) + @fieldwise_init struct ComplexMatrix(Copyable, Movable, Stringable, Writable): @@ -264,7 +368,7 @@ struct ComplexMatrix(Copyable, Movable, Stringable, Writable): writer.write(String(self)) @always_inline - fn mult(self, other: PureBasisState, mut buffer: PureBasisState) -> None: + fn mult(self, other: StateVector, mut buffer: StateVector) -> None: """Multiplies the matrix by a complex vector and stores the result in a buffer. 
Args: diff --git a/tests/base/test_qubit_operations.mojo b/tests/base/test_qubit_operations.mojo index d6e6ef2..7301b7f 100644 --- a/tests/base/test_qubit_operations.mojo +++ b/tests/base/test_qubit_operations.mojo @@ -8,8 +8,12 @@ from testing import ( from testing_matrix import assert_matrix_almost_equal +from testing_state_vector import assert_state_vector_almost_equal + +from math import sqrt + from qlabs.base import ( - PureBasisState, + StateVector, ComplexMatrix, Gate, Hadamard, @@ -33,9 +37,89 @@ from qlabs.local_stdlib import CustomList from qlabs.local_stdlib.complex import ComplexFloat64 +def test_qubit_wise_multiply_0(): + """Simulate a small circuit""" + + quantum_state: StateVector = StateVector.from_bitstring("000") + + quantum_state = qubit_wise_multiply(Hadamard.matrix, 0, quantum_state) + quantum_state = qubit_wise_multiply( + PauliX.matrix, 1, quantum_state, [[0, 1]] + ) + quantum_state = qubit_wise_multiply(Hadamard.matrix, 2, quantum_state) + + assert_state_vector_almost_equal( + quantum_state, + StateVector( + 3, + CustomList[ComplexFloat64, hint_trivial_type=True]( + ComplexFloat64(0.5, 0), + ComplexFloat64(0, 0), + ComplexFloat64(0, 0), + ComplexFloat64(0.5, 0), + ComplexFloat64(0.5, 0), + ComplexFloat64(0, 0), + ComplexFloat64(0, 0), + ComplexFloat64(0.5, 0), + ), + ), + ) + + +def test_qubit_wise_multiply_figure1(): + """Simulates the circuit from Figure 1 in the paper. 
+ + |0> -------|X|--|Z|-- + | + |0> --|H|---*----*--- + | + |0> --|X|-------|X|-- + + """ + + # Initialize the quantum circuit to the |000⟩ state + quantum_state: StateVector = StateVector.from_bitstring("000") + + # Gate 0 + quantum_state = qubit_wise_multiply(Hadamard.matrix, 1, quantum_state) + + # Gate 1 + quantum_state = qubit_wise_multiply(PauliX.matrix, 2, quantum_state) + + # Gate 2 + quantum_state = qubit_wise_multiply( + PauliX.matrix, 0, quantum_state, [[1, 1]] + ) + + # Gate 3 + quantum_state = qubit_wise_multiply(PauliZ.matrix, 0, quantum_state) + + # Gate 4 + quantum_state = qubit_wise_multiply( + PauliX.matrix, 2, quantum_state, [[1, 1]] + ) + + assert_state_vector_almost_equal( + quantum_state, + StateVector( + 3, + CustomList[ComplexFloat64, hint_trivial_type=True]( + ComplexFloat64(0, 0), + ComplexFloat64(0, 0), + ComplexFloat64(0, 0), + ComplexFloat64(-1.0 / sqrt(2.0), 0), + ComplexFloat64(1.0 / sqrt(2.0), 0), + ComplexFloat64(0, 0), + ComplexFloat64(0, 0), + ComplexFloat64(0, 0), + ), + ), + ) + + def test_partial_trace_all(): """Test the partial trace operation on a 2-qubit state. Keep all qubits.""" - state: PureBasisState = PureBasisState( + state: StateVector = StateVector( 2, CustomList[ComplexFloat64, hint_trivial_type=True]( ComplexFloat64(0, 0), @@ -79,61 +163,111 @@ def test_partial_trace_all(): ) -def test_partial_trace_qubit0(): - """Test the partial trace operation on a 2-qubit state. Trace out qubit 0, keep qubit 1. +def test_partial_trace_sec67(): + """Test the partial trace operation on a 2-qubit state. Trace out qubit 1, keep qubit 0. 
""" - state: PureBasisState = PureBasisState( - 2, + state: StateVector = StateVector( + 3, CustomList[ComplexFloat64, hint_trivial_type=True]( + ComplexFloat64(0.5, 0), ComplexFloat64(0, 0), - ComplexFloat64(-0.5, 0), - ComplexFloat64(0.7071067811863477, 0), - ComplexFloat64(-0.5, 0), + ComplexFloat64(0, 0), + ComplexFloat64(0.5, 0), + ComplexFloat64(0.5, 0), + ComplexFloat64(0, 0), + ComplexFloat64(0, 0), + ComplexFloat64(0.5, 0), ), ) - matrix = partial_trace(state, [0]) + + matrix = partial_trace(state, [0, 1]) + assert_matrix_almost_equal( + matrix, + ComplexMatrix( + List[List[ComplexFloat64]]( + [ComplexFloat64(0.5, 0), ComplexFloat64(0.5, 0)], + [ComplexFloat64(0.5, 0), ComplexFloat64(0.5, 0)], + ) + ), + "partial trace qubit 0 and 1", + ) + + matrix = partial_trace(state, [2]) assert_matrix_almost_equal( matrix, ComplexMatrix( List[List[ComplexFloat64]]( [ + ComplexFloat64(0.5, 0), ComplexFloat64(0, 0), ComplexFloat64(0, 0), + ComplexFloat64(0.5, 0), ], [ ComplexFloat64(0, 0), - ComplexFloat64(1, 0), + ComplexFloat64(0, 0), + ComplexFloat64(0, 0), + ComplexFloat64(0, 0), + ], + [ + ComplexFloat64(0, 0), + ComplexFloat64(0, 0), + ComplexFloat64(0, 0), + ComplexFloat64(0, 0), + ], + [ + ComplexFloat64(0.5, 0), + ComplexFloat64(0, 0), + ComplexFloat64(0, 0), + ComplexFloat64(0.5, 0), ], ) ), + "partial trace qubits 2", ) - -def test_partial_trace_qubit1(): - """Test the partial trace operation on a 2-qubit state. Trace out qubit 1, keep qubit 0. 
- """ - state: PureBasisState = PureBasisState( - 2, - CustomList[ComplexFloat64, hint_trivial_type=True]( - ComplexFloat64(0, 0), - ComplexFloat64(-0.5, 0), - ComplexFloat64(0.7071067811863477, 0), - ComplexFloat64(-0.5, 0), + matrix = partial_trace(state, [1, 2]) + assert_matrix_almost_equal( + matrix, + ComplexMatrix( + List[List[ComplexFloat64]]( + [ComplexFloat64(0.5, 0), ComplexFloat64(0, 0)], + [ComplexFloat64(0, 0), ComplexFloat64(0.5, 0)], + ) ), + "partial trace qubits 1 and 2", ) - matrix = partial_trace(state, [1]) + + matrix = partial_trace(state, [0]) assert_matrix_almost_equal( matrix, ComplexMatrix( List[List[ComplexFloat64]]( [ + ComplexFloat64(0.25, 0), ComplexFloat64(0, 0), + ComplexFloat64(0.25, 0), ComplexFloat64(0, 0), ], [ ComplexFloat64(0, 0), - ComplexFloat64(0.5, 0), + ComplexFloat64(0.25, 0), + ComplexFloat64(0, 0), + ComplexFloat64(0.25, 0), + ], + [ + ComplexFloat64(0.25, 0), + ComplexFloat64(0, 0), + ComplexFloat64(0.25, 0), + ComplexFloat64(0, 0), + ], + [ + ComplexFloat64(0, 0), + ComplexFloat64(0.25, 0), + ComplexFloat64(0, 0), + ComplexFloat64(0.25, 0), ], ) ), + "partial trace qubit 1", ) diff --git a/tests/base/testing_matrix.mojo b/tests/base/testing_matrix.mojo index 01a77a6..cfe23ad 100644 --- a/tests/base/testing_matrix.mojo +++ b/tests/base/testing_matrix.mojo @@ -6,7 +6,50 @@ from testing import ( assert_almost_equal, ) -from qlabs.base import ComplexMatrix +from qlabs.base import StateVector, ComplexMatrix + + +def assert_state_vector_almost_equal( + reference_state: StateVector, state: StateVector, message: String = "" +) -> None: + """Asserts that two state vectors are almost equal. + + Args: + reference_state: The reference state to compare against. + state: The state to check for equality. + message: An optional message to include in the assertion error. 
+ """ + assert_equal( + reference_state.size(), + state.size(), + String("State vectors must have the same size.") + message, + ) + assert_equal( + reference_state.number_qubits(), + state.number_qubits(), + String("State vectors must have the same number of qubits.") + message, + ) + for i in range(reference_state.size()): + assert_almost_equal( + reference_state[i].re, + state[i].re, + String( + "Real parts of state vectors are not equal at index ", + i, + ". ", + ) + + message, + ) + assert_almost_equal( + reference_state[i].im, + state[i].im, + String( + "Imaginary parts of state vectors are not equal at index ", + i, + ". ", + ) + + message, + ) def assert_matrix_almost_equal( @@ -17,6 +60,7 @@ def assert_matrix_almost_equal( Args: reference_matrix: The reference matrix to compare against. matrix: The matrix to check for equality. + message: An optional message to include in the assertion error. """ assert_equal( reference_matrix.size(), diff --git a/tests/base/testing_state_vector.mojo b/tests/base/testing_state_vector.mojo new file mode 100644 index 0000000..7cabe7d --- /dev/null +++ b/tests/base/testing_state_vector.mojo @@ -0,0 +1,52 @@ +from testing import ( + assert_true, + assert_false, + assert_equal, + assert_not_equal, + assert_almost_equal, +) + +from qlabs.base import StateVector + + +def assert_state_vector_almost_equal( + reference_state: StateVector, state: StateVector, message: String = "" +) -> None: + """Asserts that two state vectors are almost equal. + + Args: + reference_state: The reference state to compare against. + state: The state to check for equality. + message: An optional message to include in the assertion error. 
+ """ + assert_equal( + reference_state.size(), + state.size(), + String("State vectors must have the same size.") + message, + ) + assert_equal( + reference_state.number_qubits(), + state.number_qubits(), + String("State vectors must have the same number of qubits.") + message, + ) + for i in range(reference_state.size()): + assert_almost_equal( + reference_state[i].re, + state[i].re, + String( + "Real parts of state vectors are not equal at index ", + i, + ". ", + ) + + message, + ) + assert_almost_equal( + reference_state[i].im, + state[i].im, + String( + "Imaginary parts of state vectors are not equal at index ", + i, + ". ", + ) + + message, + ) From 22df283e8cd907cdde496a359c4dc43c5823910f Mon Sep 17 00:00:00 2001 From: ttrenty <154608953+ttrenty@users.noreply.github.com> Date: Sat, 28 Jun 2025 12:27:00 -0600 Subject: [PATCH 2/7] feat: full circuit on gpu + show bug with alias List variable --- TODOs.md | 4 +- examples/gpu_examples.mojo | 568 ++++++++++++++++++-------- examples/main.mojo | 10 +- src/base/gate.mojo | 94 +++-- src/base/gpu/__init__.mojo | 1 + src/base/gpu/qubits_operations.mojo | 219 ++++++++-- src/base/state_and_matrix.mojo | 90 ++-- tests/base/test_qubit_operations.mojo | 180 ++++---- 8 files changed, 786 insertions(+), 380 deletions(-) diff --git a/TODOs.md b/TODOs.md index 156e503..6e450bb 100644 --- a/TODOs.md +++ b/TODOs.md @@ -4,7 +4,7 @@ ### Implementations -- 5 / 5 : Start adding support for GPU in the base classes if needed (not possible to use SIMD(complexfloat64) anymore, or keep them but seperate them when moving data to GPU) +- 5 / 5 : Start adding support for GPU in the base classes if needed (not possible to use SIMD(ComplexFloat32) anymore, or keep them but seperate them when moving data to GPU) - struct StateVector - struct ComplexMatrix - struct Gate @@ -15,6 +15,8 @@ - partial_trace() - StateVector.to_density_matrix() +- 4 / 3 : Export benchmark results as plots. 
+ - 2 / 4 : Efficient support for tracking a state statistic like entropy during the execution of the circuit by the simulator. - 3 / 3 : Implement naive implementation of the functions to compare performances diff --git a/examples/gpu_examples.mojo b/examples/gpu_examples.mojo index 05609e4..988874b 100644 --- a/examples/gpu_examples.mojo +++ b/examples/gpu_examples.mojo @@ -1,156 +1,58 @@ -# from qlabs.base.gpu import ( -# qubit_wise_multiply_gpu, -# ) - from bit import count_trailing_zeros - from sys import has_accelerator from gpu import thread_idx, block_dim, block_idx from gpu.host import DeviceContext -from layout import Layout, LayoutTensor +from layout import Layout, LayoutTensor, IntTuple, print_layout + +from qlabs.base import ( + StateVector, + Gate, + Hadamard, + PauliX, + PauliY, + PauliZ, + NOT, + H, + X, + Y, + Z, + SWAP, + iSWAP, +) + +from qlabs.base.gpu import qubit_wise_multiply_gpu, qubit_wise_multiply_gpu_2 -alias SIZE = 2 alias BLOCKS_PER_GRID = 1 alias THREADS_PER_BLOCK = (1, 1) alias dtype = DType.float32 -alias layout = Layout.row_major(SIZE, SIZE) alias GATE_SIZE = 2 alias STATE_VECTOR_SIZE = 8 alias NUMBER_CONTROL_BITS = 1 +# crashes the code on GitHub (tag Austin Doolittle) +# another crash due to IntTuple +# alias control_bits_list: List[List[List[Int]]] = [ +# [[1, 1]], # Control on qubit 1 and is control because flag=1 +# [[1, 1]], # Control on qubit 1 and is control because flag=1 +# ] +alias CIRCUIT_NUMBER_CONTROL_GATES = 2 + alias gate_1qubit_layout = Layout.row_major(GATE_SIZE, GATE_SIZE) alias state_vector_3qubits_layout = Layout.row_major(STATE_VECTOR_SIZE, 1) alias control_bits_layout = Layout.row_major(NUMBER_CONTROL_BITS, 2) - -fn qubit_wise_multiply_gpu( - gate_re: LayoutTensor[mut=True, dtype, gate_1qubit_layout], - gate_im: LayoutTensor[mut=True, dtype, gate_1qubit_layout], - gate_size: Int, - target_qubit: Int, - quantum_state_re: LayoutTensor[ - mut=True, dtype, state_vector_3qubits_layout - ], - quantum_state_im: 
LayoutTensor[ - mut=True, dtype, state_vector_3qubits_layout - ], - number_qubits: Int, - quantum_state_size: Int, - quantum_state_out_re: LayoutTensor[ - mut=True, dtype, state_vector_3qubits_layout - ], - quantum_state_out_im: LayoutTensor[ - mut=True, dtype, state_vector_3qubits_layout - ], - control_bits: LayoutTensor[mut=True, DType.int32, control_bits_layout], - # control_bits: LayoutTensor[mut=True, DType.int32, control_bits_layout], - number_control_bits: Int, -) -> None: - """Applies a quantum gate to specific qubits in the quantum state. - - It will apply the gate starting from the target qubit assuming that the other - qubits that the gate acts on are following the target qubit. - - Args: - gate_re: Real part of the gate matrix. - gate_im: Imaginary part of the gate matrix. - gate_size: Size of the gate (2^n, where n is the number of qubits the gate acts on). - target_qubit: The index of the target qubit to apply the gate to. - quantum_state_re: Real part of the quantum state vector. - quantum_state_im: Imaginary part of the quantum state vector. - number_qubits: Total number of qubits in the quantum state. - quantum_state_size: Size of the quantum state vector (2^number_qubits). - quantum_state_out_re: Output real part of the quantum state vector after applying the gate. - quantum_state_out_im: Output imaginary part of the quantum state vector after applying the gate. - control_bits: List of control bits, where each control bit is a list containing - [wire_index, flag] (1 for control, 0 for anti-control). - """ - print("Inside qubit_wise_multiply_gpu") - target_qubits_count: Int = count_trailing_zeros(gate_size) - if (target_qubit < 0) or (target_qubit >= number_qubits): - print( - "Error: target_qubit index out of bounds. 
Must be between 0 and", - number_qubits - 1, - ) - print("Skipping gate application.") - return - - print("AAAAA") - inclusion_mask: Int = 0 - desired_value_mask: Int = 0 - for i in range(number_control_bits): - print("before") - wire_index, flag = control_bits[i, 0], control_bits[i, 1] - print("after") - bit: Int = 1 << Int( - wire_index - ) # efficient way of computing 2^wire_index - inclusion_mask |= bit # turn on the bit - if flag == 1: - desired_value_mask |= bit # turn on the bit - - print("BBBBB") - size_of_state_vector: Int = quantum_state_size - size_of_half_block: Int = 1 << target_qubit # 2^target_qubit - size_of_block: Int = size_of_half_block << target_qubits_count - - print("CCCC") - # copies all amplitudes from quantum_state to quantum_state_out - for i in range(size_of_state_vector): - quantum_state_out_re[i, 0] = quantum_state_re[i, 0] - quantum_state_out_im[i, 0] = quantum_state_im[i, 0] - - print("before loop") - for block_start in range(0, size_of_state_vector, size_of_block): - # print("block_start:", block_start) - for offset in range(size_of_half_block): - # print("offset:", offset) - i1: Int = ( - block_start | offset - ) # faster than, but equivalent to, block_start + offset - - if (i1 & inclusion_mask) != desired_value_mask: - continue # skip this iteration if the control bits do not match - - i2: Int = ( - i1 | size_of_half_block - ) # equivalent to i1 + size_of_half_block - - # new_state_vector[i1] = ( - # gate[0, 0] * quantum_state[i1] + gate[0, 1] * quantum_state[i2] - # ) - - print("i1:", i1, "i2:", i2) - - quantum_state_out_re[i1] = ( - (gate_re[0, 0] * quantum_state_re[i1, 0]) - - (gate_im[0, 0] * quantum_state_im[i1, 0]) - + (gate_re[0, 1] * quantum_state_re[i2, 0]) - - (gate_im[0, 1] * quantum_state_im[i2, 0]) - ) - - quantum_state_out_im[i1] = ( - (gate_re[0, 0] * quantum_state_im[i1, 0]) - - (gate_im[0, 0] * quantum_state_re[i1, 0]) - + (gate_re[0, 1] * quantum_state_im[i2, 0]) - - (gate_im[0, 1] * quantum_state_re[i2, 0]) - ) - 
- quantum_state_out_re[i2] = ( - (gate_re[1, 0] * quantum_state_re[i1, 0]) - - (gate_im[1, 0] * quantum_state_im[i1, 0]) - + (gate_re[1, 1] * quantum_state_re[i2, 0]) - - (gate_im[1, 1] * quantum_state_im[i2, 0]) - ) - - quantum_state_out_im[i2] = ( - (gate_re[1, 0] * quantum_state_im[i1, 0]) - - (gate_im[1, 0] * quantum_state_re[i1, 0]) - + (gate_re[1, 1] * quantum_state_im[i2, 0]) - - (gate_im[1, 1] * quantum_state_re[i2, 0]) - ) +alias gate_set = [Hadamard, PauliX, PauliZ] +alias gate_set_dic: Dict[String, Int] = { + Hadamard.symbol: 0, + PauliX.symbol: 1, + PauliZ.symbol: 2, +} +alias GATE_SET_SIZE = 3 +alias gate_set_1qubit_layout = Layout.row_major( + GATE_SET_SIZE, GATE_SIZE, GATE_SIZE +) def gpu_debug_something(): @@ -184,25 +86,6 @@ def gpu_debug_something(): NUMBER_CONTROL_BITS * 2 ).enqueue_fill(0) - # gate_re = ctx.enqueue_create_host_buffer[dtype]( - # GATE_SIZE * GATE_SIZE - # ).enqueue_fill(0) - # gate_im = ctx.enqueue_create_host_buffer[dtype]( - # GATE_SIZE * GATE_SIZE - # ).enqueue_fill(0) - # quantum_state_re = ctx.enqueue_create_host_buffer[dtype]( - # STATE_VECTOR_SIZE - # ).enqueue_fill(0) - # quantum_state_im = ctx.enqueue_create_host_buffer[dtype]( - # STATE_VECTOR_SIZE - # ).enqueue_fill(0) - # quantum_state_out_re = ctx.enqueue_create_host_buffer[dtype]( - # STATE_VECTOR_SIZE - # ).enqueue_fill(0) - # quantum_state_out_im = ctx.enqueue_create_host_buffer[dtype]( - # STATE_VECTOR_SIZE - # ).enqueue_fill(0) - gate_re_tensor = LayoutTensor[mut=True, dtype, gate_1qubit_layout]( gate_re.unsafe_ptr() ) @@ -225,34 +108,8 @@ def gpu_debug_something(): mut=True, DType.int32, control_bits_layout ](control_bits.unsafe_ptr()) - matrix = ctx.enqueue_create_buffer[dtype](SIZE * SIZE).enqueue_fill(0) - - # with matrix.map_to_host() as host_matrix: - # print(host_matrix) - - # cannot replace layout by runtime_layout here - matrix_tensor = LayoutTensor[mut=True, dtype, layout]( - matrix.unsafe_ptr() - ) - - ctx.synchronize() - print("Before") - # 
qubit_wise_multiply_gpu( - # gate_re_tensor, - # gate_im_tensor, - # GATE_SIZE, - # 0, # target_qubit - # quantum_state_re_tensor, - # quantum_state_im_tensor, - # 3, # number_qubits - # STATE_VECTOR_SIZE, # quantum_state_size - # quantum_state_out_re_tensor, - # quantum_state_out_im_tensor, - # control_bits, - # ) - ctx.enqueue_function[qubit_wise_multiply_gpu]( gate_re_tensor, gate_im_tensor, @@ -286,3 +143,354 @@ def gpu_debug_something(): print("Quantum state real part:", host_quantum_state_re) with quantum_state_im.map_to_host() as host_quantum_state_im: print("Quantum state imaginary part:", host_quantum_state_im) + + +def run_gpu_not_abstract(): + """Simulates the circuit from Figure 1 in the paper.""" + + @parameter + if not has_accelerator(): + print("No compatible GPU found") + else: + print("Simulating Figure 1 circuit.\nCircuit design:") + print( + """ + |0> -------|X|--|Z|-- + | + |0> --|H|---*----*--- + | + |0> --|X|-------|X|-- + """ + ) + var control_bits_list: List[List[List[Int]]] = [ + [[1, 1]], # Control on qubit 1 and is control because flag=1 + [[1, 1]], # Control on qubit 1 and is control because flag=1 + ] + + ctx = DeviceContext() + print("Using GPU:", ctx.name()) + + # -- Create GPU variables -- # + ctx = DeviceContext() + + # Initialize the quantum circuit to the |000⟩ state + quantum_state: StateVector = StateVector.from_bitstring("000") + print("Initial quantum state:\n", quantum_state) + + quantum_state_re = ctx.enqueue_create_buffer[dtype]( + STATE_VECTOR_SIZE + ).enqueue_fill(0) + quantum_state_im = ctx.enqueue_create_buffer[dtype]( + STATE_VECTOR_SIZE + ).enqueue_fill(0) + + with quantum_state_re.map_to_host() as device_re, quantum_state_im.map_to_host() as device_im: + for i in range(STATE_VECTOR_SIZE): + device_re[i] = quantum_state[i].re + device_im[i] = quantum_state[i].im + + with quantum_state_re.map_to_host() as host_re: + print("Quantum state real part:", host_re) + with quantum_state_im.map_to_host() as host_im: + 
print("Quantum state imaginary part:", host_im) + + gate_set_re = ctx.enqueue_create_buffer[dtype]( + GATE_SET_SIZE * GATE_SIZE * GATE_SIZE + ).enqueue_fill(0) + gate_set_im = ctx.enqueue_create_buffer[dtype]( + GATE_SET_SIZE * GATE_SIZE * GATE_SIZE + ).enqueue_fill(0) + + with gate_set_re.map_to_host() as device_gate_re, gate_set_im.map_to_host() as device_gate_im: + for i in range(GATE_SET_SIZE): + gate = gate_set[i] + for j in range(GATE_SIZE): + for k in range(GATE_SIZE): + index = gate_set_1qubit_layout( + IntTuple(i, j, k) + ) # Get the index in the 1D buffer + device_gate_re[index] = gate[j, k].re + device_gate_im[index] = gate[j, k].im + + quantum_state_out_re = ctx.enqueue_create_buffer[dtype]( + STATE_VECTOR_SIZE + ).enqueue_fill(0) + quantum_state_out_im = ctx.enqueue_create_buffer[dtype]( + STATE_VECTOR_SIZE + ).enqueue_fill(0) + + # TODO have one big variable for all control bits + # and have NUMBER_CONTROL_BITS be a list defining each gates specific control bits count + # maybe need a variable to keep track of the current control gate index + control_bits_0 = ctx.enqueue_create_buffer[DType.int32]( + NUMBER_CONTROL_BITS * 2 + ).enqueue_fill(0) + + control_bits_1 = ctx.enqueue_create_buffer[DType.int32]( + NUMBER_CONTROL_BITS * 2 + ).enqueue_fill(0) + + with control_bits_0.map_to_host() as device_control_0, control_bits_1.map_to_host() as device_control_1: + # Set control bits for the first controlled gate + device_control_0[0] = 1 + device_control_0[1] = 1 + + # Set control bits for the second controlled gate + device_control_1[0] = 1 + device_control_1[1] = 1 + + control_bits_empty = ctx.enqueue_create_buffer[DType.int32]( + NUMBER_CONTROL_BITS * 2 + ).enqueue_fill(0) + + # Create control bits for the circuit + control_bits_circuit = ctx.enqueue_create_buffer[DType.int32]( + CIRCUIT_NUMBER_CONTROL_GATES * NUMBER_CONTROL_BITS * 2 + ).enqueue_fill(0) + current_control_gate_circuit = ctx.enqueue_create_buffer[DType.int32]( + 1 + ).enqueue_fill(0) + + 
print("FIRST LOOP") + for i in range(len(control_bits_list)): + for j in range(len(control_bits_list[i])): + for k in range(len(control_bits_list[i][j])): + print("i:", i, "j:", j, "k:", k) + print( + "control_bits_list[i][j][k]:", + control_bits_list[i][j][k], + ) + + print("before printing index") + + var coords_2 = IntTuple(0, 0, 0) + print("coords_2:", coords_2) + + alias circuit_control_bits_layout = Layout.row_major( + CIRCUIT_NUMBER_CONTROL_GATES, NUMBER_CONTROL_BITS, 2 + ) + var index = circuit_control_bits_layout(coords_2) + print( + "gate_set_1qubit_layout(coords_2):", + index, + ) + + # print( + # "circuit_control_bits_layout(coords_2):", + # circuit_control_bits_layout(IntTuple(0, 0, 0)), + # ) + + # _ = gate_set_1qubit_layout + # _ = circuit_control_bits_layout + + print("\nSECOND LOOP") + # with control_bits_circuit.map_to_host() as device_control_bits_circuit: + # for i in range(CIRCUIT_NUMBER_CONTROL_GATES): + # for j in range(NUMBER_CONTROL_BITS): + # for k in range(2): + # # print("i:", i, "j:", j, "k:", k, "index:", index) + # print("i:", i, "j:", j, "k:", k) + # print( + # "control_bits_list[i][j][k]:", + # control_bits_list[i][j][k], + # ) + # index = circuit_control_bits_layout(IntTuple(i, j, k)) + # print("index:", index) + # # device_control_bits_circuit[index] = control_bits_list[ + # # i + # # ][j][k] + + # print( + # "Control bits for the circuit:\n", + # device_control_bits_circuit, + # ) + + # -- Create layout tensors for GPU operations -- # + gate_set_re_tensor = LayoutTensor[ + mut=True, dtype, gate_set_1qubit_layout + ](gate_set_re.unsafe_ptr()) + gate_set_im_tensor = LayoutTensor[ + mut=True, dtype, gate_set_1qubit_layout + ](gate_set_im.unsafe_ptr()) + + quantum_state_re_tensor = LayoutTensor[ + mut=True, dtype, state_vector_3qubits_layout + ](quantum_state_re.unsafe_ptr()) + quantum_state_im_tensor = LayoutTensor[ + mut=True, dtype, state_vector_3qubits_layout + ](quantum_state_im.unsafe_ptr()) + + quantum_state_out_re_tensor = 
LayoutTensor[ + mut=True, dtype, state_vector_3qubits_layout + ](quantum_state_out_re.unsafe_ptr()) + quantum_state_out_im_tensor = LayoutTensor[ + mut=True, dtype, state_vector_3qubits_layout + ](quantum_state_out_im.unsafe_ptr()) + + control_bits_empty_tensor = LayoutTensor[ + mut=True, DType.int32, control_bits_layout + ](control_bits_empty.unsafe_ptr()) + control_bits_0_tensor = LayoutTensor[ + mut=True, DType.int32, control_bits_layout + ](control_bits_0.unsafe_ptr()) + control_bits_1_tensor = LayoutTensor[ + mut=True, DType.int32, control_bits_layout + ](control_bits_1.unsafe_ptr()) + + control_bits_circuit_tensor = LayoutTensor[ + mut=True, DType.int32, circuit_control_bits_layout + ](control_bits_circuit.unsafe_ptr()) + current_control_gate_circuit_tensor = LayoutTensor[ + mut=True, DType.int32, Layout.row_major(1) + ](current_control_gate_circuit.unsafe_ptr()) + + # Enqueue create_initial_state + + # Enqueue applying gates + + # Gate 0 + # quantum_state = qubit_wise_multiply_gpu( + # Hadamard.matrix, 1, quantum_state + # ) + ctx.enqueue_function[qubit_wise_multiply_gpu_2[number_control_bits=0]]( + gate_set_re_tensor, + gate_set_im_tensor, + gate_set_dic[Hadamard.symbol], + GATE_SIZE, + 1, # target_qubit + quantum_state_re_tensor, + quantum_state_im_tensor, + 3, # number_qubits + STATE_VECTOR_SIZE, # quantum_state_size + quantum_state_out_re_tensor, + quantum_state_out_im_tensor, + control_bits_empty, + grid_dim=BLOCKS_PER_GRID, + block_dim=THREADS_PER_BLOCK, + ) + + # # It works + # with quantum_state_out_re.map_to_host() as host_re, quantum_state_out_im.map_to_host() as host_im: + # print( + # "After Hadamard gate on qubit 1\nreal part:\n", + # host_re, + # "\nimaginary part:\n", + # host_im, + # ) + + # Gate 1 (reverse the states input <-> output) + # quantum_state = qubit_wise_multiply(PauliX.matrix, 2, quantum_state) + ctx.enqueue_function[qubit_wise_multiply_gpu_2[number_control_bits=0]]( + gate_set_re_tensor, + gate_set_im_tensor, + 
gate_set_dic[PauliX.symbol], + GATE_SIZE, + 2, # target_qubit + quantum_state_out_re_tensor, + quantum_state_out_im_tensor, + 3, # number_qubits + STATE_VECTOR_SIZE, # quantum_state_size + quantum_state_re_tensor, + quantum_state_im_tensor, + control_bits_empty, + grid_dim=BLOCKS_PER_GRID, + block_dim=THREADS_PER_BLOCK, + ) + + # with quantum_state_re.map_to_host() as host_re, quantum_state_im.map_to_host() as host_im: + # print( + # "After Pauli-X gate on qubit 2:", + # "\nreal part:\n", + # host_re, + # "\nimaginary part:\n", + # host_im, + # ) + + # # Gate 2 + # quantum_state = qubit_wise_multiply( + # PauliX.matrix, 0, quantum_state, [[1, 1]] + # ) + ctx.enqueue_function[qubit_wise_multiply_gpu_2[number_control_bits=1]]( + gate_set_re_tensor, + gate_set_im_tensor, + gate_set_dic[PauliX.symbol], + GATE_SIZE, + 0, # target_qubit + quantum_state_re_tensor, + quantum_state_im_tensor, + 3, # number_qubits + STATE_VECTOR_SIZE, # quantum_state_size + quantum_state_out_re_tensor, + quantum_state_out_im_tensor, + control_bits_0_tensor, + grid_dim=BLOCKS_PER_GRID, + block_dim=THREADS_PER_BLOCK, + ) + + # with quantum_state_out_re.map_to_host() as host_re, quantum_state_out_im.map_to_host() as host_im: + # print( + # "After Pauli-X gate on qubit 0 with control on qubit 1:", + # "\nreal part:\n", + # host_re, + # "\nimaginary part:\n", + # host_im, + # ) + + # Gate 3 + # quantum_state = qubit_wise_multiply(PauliZ.matrix, 0, quantum_state) + ctx.enqueue_function[qubit_wise_multiply_gpu_2[number_control_bits=0]]( + gate_set_re_tensor, + gate_set_im_tensor, + gate_set_dic[PauliZ.symbol], + GATE_SIZE, + 0, # target_qubit + quantum_state_out_re_tensor, + quantum_state_out_im_tensor, + 3, # number_qubits + STATE_VECTOR_SIZE, # quantum_state_size + quantum_state_re_tensor, + quantum_state_im_tensor, + control_bits_empty_tensor, + grid_dim=BLOCKS_PER_GRID, + block_dim=THREADS_PER_BLOCK, + ) + + # with quantum_state_re.map_to_host() as host_re, quantum_state_im.map_to_host() as 
host_im: + # print( + # "After Pauli-Z gate on qubit 0:\nreal part:\n", + # host_re, + # "\nimaginary part:\n", + # host_im, + # ) + + # Gate 4 + # quantum_state = qubit_wise_multiply( + # PauliX.matrix, 2, quantum_state, [[1, 1]] + # ) + ctx.enqueue_function[qubit_wise_multiply_gpu_2[number_control_bits=1]]( + gate_set_re_tensor, + gate_set_im_tensor, + gate_set_dic[PauliX.symbol], + GATE_SIZE, + 2, # target_qubit + quantum_state_re_tensor, + quantum_state_im_tensor, + 3, # number_qubits + STATE_VECTOR_SIZE, # quantum_state_size + quantum_state_out_re_tensor, + quantum_state_out_im_tensor, + control_bits_1_tensor, + grid_dim=BLOCKS_PER_GRID, + block_dim=THREADS_PER_BLOCK, + ) + + with quantum_state_out_re.map_to_host() as host_re, quantum_state_out_im.map_to_host() as host_im: + print( + ( + "After Pauli-X gate on qubit 2 with control on qubit 1" + " (Final State):\nreal part:\n" + ), + host_re, + "\nimaginary part:\n", + host_im, + ) diff --git a/examples/main.mojo b/examples/main.mojo index 2c73897..f0a24c1 100644 --- a/examples/main.mojo +++ b/examples/main.mojo @@ -5,8 +5,8 @@ from sys import argv import random -# from complex import ComplexFloat64 -from qlabs.local_stdlib.complex import ComplexFloat64 +# from complex import ComplexFloat32 +from qlabs.local_stdlib.complex import ComplexFloat32 from qlabs.local_stdlib import CustomList from qlabs.base import ( @@ -38,7 +38,7 @@ from qlabs.abstractions import ( ShowOnlyEnd, ) -from gpu_examples import gpu_debug_something +from gpu_examples import gpu_debug_something, run_gpu_not_abstract # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # # MARK: Examples # @@ -568,7 +568,7 @@ def main(): else: print("Usage: ./main [number_of_qubits] [number_of_layers]") - # simulate_figure1_circuit() + simulate_figure1_circuit() # simulate_figure1_circuit_abstract() @@ -587,3 +587,5 @@ def main(): # debug_something() gpu_debug_something() + + run_gpu_not_abstract() diff --git a/src/base/gate.mojo b/src/base/gate.mojo index 43dfe33..4299a36 
100644 --- a/src/base/gate.mojo +++ b/src/base/gate.mojo @@ -2,8 +2,10 @@ # MARK: Imports # # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # +from math import sqrt + from ..local_stdlib import CustomList -from ..local_stdlib.complex import ComplexFloat64 +from ..local_stdlib.complex import ComplexFloat32 from .state_and_matrix import ( ComplexMatrix, @@ -17,9 +19,9 @@ from .state_and_matrix import ( alias Identity = Gate( size=2, matrix=ComplexMatrix( - List[List[ComplexFloat64]]( - [ComplexFloat64(1, 0), ComplexFloat64(0, 0)], - [ComplexFloat64(0, 0), ComplexFloat64(1, 0)], + List[List[ComplexFloat32]]( + [ComplexFloat32(1, 0), ComplexFloat32(0, 0)], + [ComplexFloat32(0, 0), ComplexFloat32(1, 0)], ) ), symbol="I", @@ -28,9 +30,15 @@ alias Identity = Gate( alias Hadamard = Gate( size=2, matrix=ComplexMatrix( - List[List[ComplexFloat64]]( - [ComplexFloat64(1 / 2**0.5, 0), ComplexFloat64(1 / 2**0.5, 0)], - [ComplexFloat64(1 / 2**0.5, 0), ComplexFloat64(-1 / 2**0.5, 0)], + List[List[ComplexFloat32]]( + [ + ComplexFloat32(1.0 / Float32(sqrt(2.0)), 0), + ComplexFloat32(1 / Float32(sqrt(2.0)), 0), + ], + [ + ComplexFloat32(1 / Float32(sqrt(2.0)), 0), + ComplexFloat32(-1 / Float32(sqrt(2.0)), 0), + ], ) ), symbol="H", @@ -39,9 +47,9 @@ alias Hadamard = Gate( alias PauliX = Gate( size=2, matrix=ComplexMatrix( - List[List[ComplexFloat64]]( - [ComplexFloat64(0, 0), ComplexFloat64(1, 0)], - [ComplexFloat64(1, 0), ComplexFloat64(0, 0)], + List[List[ComplexFloat32]]( + [ComplexFloat32(0, 0), ComplexFloat32(1, 0)], + [ComplexFloat32(1, 0), ComplexFloat32(0, 0)], ) ), symbol="X", @@ -50,9 +58,9 @@ alias PauliX = Gate( alias PauliZ = Gate( size=2, matrix=ComplexMatrix( - List[List[ComplexFloat64]]( - [ComplexFloat64(1, 0), ComplexFloat64(0, 0)], - [ComplexFloat64(0, 0), ComplexFloat64(-1, 0)], + List[List[ComplexFloat32]]( + [ComplexFloat32(1, 0), ComplexFloat32(0, 0)], + [ComplexFloat32(0, 0), ComplexFloat32(-1, 0)], ) ), symbol="Z", @@ -61,9 +69,9 @@ alias PauliZ = Gate( alias PauliY = 
Gate( size=2, matrix=ComplexMatrix( - List[List[ComplexFloat64]]( - [ComplexFloat64(0, 0), ComplexFloat64(0, -1)], - [ComplexFloat64(0, 1), ComplexFloat64(0, 0)], + List[List[ComplexFloat32]]( + [ComplexFloat32(0, 0), ComplexFloat32(0, -1)], + [ComplexFloat32(0, 1), ComplexFloat32(0, 0)], ) ), symbol="Y", @@ -72,9 +80,9 @@ alias PauliY = Gate( alias _SEPARATOR = Gate( size=2, matrix=ComplexMatrix( - List[List[ComplexFloat64]]( - [ComplexFloat64(0, 0), ComplexFloat64(0, -1)], - [ComplexFloat64(0, 1), ComplexFloat64(0, 0)], + List[List[ComplexFloat32]]( + [ComplexFloat32(0, 0), ComplexFloat32(0, -1)], + [ComplexFloat32(0, 1), ComplexFloat32(0, 0)], ) ), symbol="_SEPARATOR", @@ -83,9 +91,9 @@ alias _SEPARATOR = Gate( alias _START = Gate( size=2, matrix=ComplexMatrix( - List[List[ComplexFloat64]]( - [ComplexFloat64(0, 0), ComplexFloat64(0, -1)], - [ComplexFloat64(0, 1), ComplexFloat64(0, 0)], + List[List[ComplexFloat32]]( + [ComplexFloat32(0, 0), ComplexFloat32(0, -1)], + [ComplexFloat32(0, 1), ComplexFloat32(0, 0)], ) ), symbol="_START", @@ -94,9 +102,9 @@ alias _START = Gate( alias SWAP = Gate( size=2, matrix=ComplexMatrix( - List[List[ComplexFloat64]]( - [ComplexFloat64(0, 0), ComplexFloat64(0, -1)], - [ComplexFloat64(0, 1), ComplexFloat64(0, 0)], + List[List[ComplexFloat32]]( + [ComplexFloat32(0, 0), ComplexFloat32(0, -1)], + [ComplexFloat32(0, 1), ComplexFloat32(0, 0)], ) ), symbol="SWAP", @@ -105,30 +113,30 @@ alias SWAP = Gate( alias iSWAP = Gate( size=4, matrix=ComplexMatrix( - List[List[ComplexFloat64]]( + List[List[ComplexFloat32]]( [ - ComplexFloat64(1, 0), - ComplexFloat64(0, 0), - ComplexFloat64(0, 0), - ComplexFloat64(0, 0), + ComplexFloat32(1, 0), + ComplexFloat32(0, 0), + ComplexFloat32(0, 0), + ComplexFloat32(0, 0), ], [ - ComplexFloat64(0, 0), - ComplexFloat64(0, 0), - ComplexFloat64(0, 1), - ComplexFloat64(0, 0), + ComplexFloat32(0, 0), + ComplexFloat32(0, 0), + ComplexFloat32(0, 1), + ComplexFloat32(0, 0), ], [ - ComplexFloat64(0, 0), - 
ComplexFloat64(0, 1), - ComplexFloat64(0, 0), - ComplexFloat64(0, 0), + ComplexFloat32(0, 0), + ComplexFloat32(0, 1), + ComplexFloat32(0, 0), + ComplexFloat32(0, 0), ], [ - ComplexFloat64(0, 0), - ComplexFloat64(0, 0), - ComplexFloat64(0, 0), - ComplexFloat64(1, 0), + ComplexFloat32(0, 0), + ComplexFloat32(0, 0), + ComplexFloat32(0, 0), + ComplexFloat32(1, 0), ], ) ), @@ -252,7 +260,7 @@ struct Gate(Copyable, Movable, Representable, Stringable, Writable): ) @always_inline - fn __getitem__(self, row: Int, col: Int) -> ComplexFloat64: + fn __getitem__(self, row: Int, col: Int) -> ComplexFloat32: return self.matrix[row, col] fn __str__(self) -> String: diff --git a/src/base/gpu/__init__.mojo b/src/base/gpu/__init__.mojo index b3d16df..bb72767 100644 --- a/src/base/gpu/__init__.mojo +++ b/src/base/gpu/__init__.mojo @@ -1,3 +1,4 @@ from .qubits_operations import ( qubit_wise_multiply_gpu, + qubit_wise_multiply_gpu_2, ) diff --git a/src/base/gpu/qubits_operations.mojo b/src/base/gpu/qubits_operations.mojo index 5cdfabf..4b64ff6 100644 --- a/src/base/gpu/qubits_operations.mojo +++ b/src/base/gpu/qubits_operations.mojo @@ -4,18 +4,23 @@ from gpu import thread_idx, block_dim, block_idx from gpu.host import DeviceContext from layout import Layout, LayoutTensor -alias SIZE = 2 -alias BLOCKS_PER_GRID = 1 -alias THREADS_PER_BLOCK = (3, 3) alias dtype = DType.float32 -alias layout = Layout.row_major(SIZE, SIZE) -alias gate_1qubit_layout = Layout.row_major(2, 2) -alias state_vector_3qubits_layout = Layout.row_major(8, 1) +alias GATE_SIZE = 2 +alias STATE_VECTOR_SIZE = 8 +alias NUMBER_CONTROL_BITS = 1 + +alias gate_1qubit_layout = Layout.row_major(GATE_SIZE, GATE_SIZE) +alias state_vector_3qubits_layout = Layout.row_major(STATE_VECTOR_SIZE, 1) +alias control_bits_layout = Layout.row_major(NUMBER_CONTROL_BITS, 2) + +alias GATE_SET_SIZE = 3 +alias gate_set_1qubit_layout = Layout.row_major( + GATE_SET_SIZE, GATE_SIZE, GATE_SIZE +) fn qubit_wise_multiply_gpu( - # Use SIMD instead 
gate_re: LayoutTensor[mut=True, dtype, gate_1qubit_layout], gate_im: LayoutTensor[mut=True, dtype, gate_1qubit_layout], gate_size: Int, @@ -34,7 +39,9 @@ fn qubit_wise_multiply_gpu( quantum_state_out_im: LayoutTensor[ mut=True, dtype, state_vector_3qubits_layout ], - control_bits: List[List[Int]] = [], + control_bits: LayoutTensor[mut=True, DType.int32, control_bits_layout], + # control_bits: LayoutTensor[mut=True, DType.int32, control_bits_layout], + number_control_bits: Int, ) -> None: """Applies a quantum gate to specific qubits in the quantum state. @@ -54,7 +61,10 @@ fn qubit_wise_multiply_gpu( quantum_state_out_im: Output imaginary part of the quantum state vector after applying the gate. control_bits: List of control bits, where each control bit is a list containing [wire_index, flag] (1 for control, 0 for anti-control). + number_control_bits: Number of control bits. + """ + print("Inside qubit_wise_multiply_gpu") target_qubits_count: Int = count_trailing_zeros(gate_size) if (target_qubit < 0) or (target_qubit >= number_qubits): print( @@ -64,26 +74,36 @@ fn qubit_wise_multiply_gpu( print("Skipping gate application.") return + print("AAAAA") inclusion_mask: Int = 0 desired_value_mask: Int = 0 - for control in control_bits: - wire_index, flag = control[0], control[1] - bit: Int = 1 << wire_index # efficient way of computing 2^wire_index + for i in range(number_control_bits): + print("before") + wire_index, flag = control_bits[i, 0], control_bits[i, 1] + print("after") + bit: Int = 1 << Int( + wire_index + ) # efficient way of computing 2^wire_index inclusion_mask |= bit # turn on the bit if flag == 1: desired_value_mask |= bit # turn on the bit + print("BBBBB") size_of_state_vector: Int = quantum_state_size size_of_half_block: Int = 1 << target_qubit # 2^target_qubit size_of_block: Int = size_of_half_block << target_qubits_count + print("CCCC") # copies all amplitudes from quantum_state to quantum_state_out for i in range(size_of_state_vector): - 
quantum_state_out_re[i] = quantum_state_re[i] - quantum_state_out_im[i] = quantum_state_im[i] + quantum_state_out_re[i, 0] = quantum_state_re[i, 0] + quantum_state_out_im[i, 0] = quantum_state_im[i, 0] + print("before loop") for block_start in range(0, size_of_state_vector, size_of_block): + # print("block_start:", block_start) for offset in range(size_of_half_block): + # print("offset:", offset) i1: Int = ( block_start | offset ) # faster than, but equivalent to, block_start + offset @@ -95,30 +115,163 @@ fn qubit_wise_multiply_gpu( i1 | size_of_half_block ) # equivalent to i1 + size_of_half_block - quantum_state_out_re[i1] = ( - (gate_re[0, 0] * quantum_state_re[i1]) - - (gate_im[0, 0] * quantum_state_im[i1]) - + (gate_re[0, 1] * quantum_state_re[i2]) - - (gate_im[0, 1] * quantum_state_im[i2]) + print("i1:", i1, "i2:", i2) + + quantum_state_out_re[i1, 0] = ( + (gate_re[0, 0] * quantum_state_re[i1, 0]) + - (gate_im[0, 0] * quantum_state_im[i1, 0]) + + (gate_re[0, 1] * quantum_state_re[i2, 0]) + - (gate_im[0, 1] * quantum_state_im[i2, 0]) + ) + + quantum_state_out_im[i1, 0] = ( + (gate_re[0, 0] * quantum_state_im[i1, 0]) + - (gate_im[0, 0] * quantum_state_re[i1, 0]) + + (gate_re[0, 1] * quantum_state_im[i2, 0]) + - (gate_im[0, 1] * quantum_state_re[i2, 0]) + ) + + quantum_state_out_re[i2, 0] = ( + (gate_re[1, 0] * quantum_state_re[i1, 0]) + - (gate_im[1, 0] * quantum_state_im[i1, 0]) + + (gate_re[1, 1] * quantum_state_re[i2, 0]) + - (gate_im[1, 1] * quantum_state_im[i2, 0]) + ) + + quantum_state_out_im[i2, 0] = ( + (gate_re[1, 0] * quantum_state_im[i1, 0]) + - (gate_im[1, 0] * quantum_state_re[i1, 0]) + + (gate_re[1, 1] * quantum_state_im[i2, 0]) + - (gate_im[1, 1] * quantum_state_re[i2, 0]) + ) + + +fn qubit_wise_multiply_gpu_2[ + number_control_bits: Int +]( + gate_set_re: LayoutTensor[mut=True, dtype, gate_set_1qubit_layout], + gate_set_im: LayoutTensor[mut=True, dtype, gate_set_1qubit_layout], + gate_index: Int, + gate_size: Int, + target_qubit: Int, + 
quantum_state_re: LayoutTensor[ + mut=True, dtype, state_vector_3qubits_layout + ], + quantum_state_im: LayoutTensor[ + mut=True, dtype, state_vector_3qubits_layout + ], + number_qubits: Int, + quantum_state_size: Int, + quantum_state_out_re: LayoutTensor[ + mut=True, dtype, state_vector_3qubits_layout + ], + quantum_state_out_im: LayoutTensor[ + mut=True, dtype, state_vector_3qubits_layout + ], + control_bits: LayoutTensor[mut=True, DType.int32, control_bits_layout], +) -> None: + """Applies a quantum gate to specific qubits in the quantum state. + + It will apply the gate starting from the target qubit assuming that the other + qubits that the gate acts on are following the target qubit. + + Parameters: + number_control_bits: Number of control bits. + + Args: + gate_set_re: All unique gates applied in the circuit, real part. + gate_set_im: All unique gates applied in the circuit, imaginary part. + gate_index: Index of the gate in the gate set to apply. + gate_size: Size of the gate (2^n, where n is the number of qubits the gate acts on). + target_qubit: The index of the target qubit to apply the gate to. + quantum_state_re: Real part of the quantum state vector. + quantum_state_im: Imaginary part of the quantum state vector. + number_qubits: Total number of qubits in the quantum state. + quantum_state_size: Size of the quantum state vector (2^number_qubits). + quantum_state_out_re: Output real part of the quantum state vector after applying the gate. + quantum_state_out_im: Output imaginary part of the quantum state vector after applying the gate. + control_bits: List of control bits, where each control bit is a list containing + [wire_index, flag] (1 for control, 0 for anti-control). + """ + print("Inside qubit_wise_multiply_gpu") + target_qubits_count: Int = count_trailing_zeros(gate_size) + if (target_qubit < 0) or (target_qubit >= number_qubits): + print( + "Error: target_qubit index out of bounds. 
Must be between 0 and", + number_qubits - 1, + ) + print("Skipping gate application.") + return + + print("AAAAA") + inclusion_mask: Int = 0 + desired_value_mask: Int = 0 + + @parameter + for i in range(number_control_bits): + print("before") + wire_index, flag = control_bits[i, 0], control_bits[i, 1] + print("after") + bit: Int = 1 << Int( + wire_index + ) # efficient way of computing 2^wire_index + inclusion_mask |= bit # turn on the bit + if flag == 1: + desired_value_mask |= bit # turn on the bit + + print("BBBBB") + size_of_state_vector: Int = quantum_state_size + size_of_half_block: Int = 1 << target_qubit # 2^target_qubit + size_of_block: Int = size_of_half_block << target_qubits_count + + print("CCCC") + # copies all amplitudes from quantum_state to quantum_state_out + for i in range(size_of_state_vector): + quantum_state_out_re[i, 0] = quantum_state_re[i, 0] + quantum_state_out_im[i, 0] = quantum_state_im[i, 0] + + print("before loop") + for block_start in range(0, size_of_state_vector, size_of_block): + # print("block_start:", block_start) + for offset in range(size_of_half_block): + # print("offset:", offset) + i1: Int = ( + block_start | offset + ) # faster than, but equivalent to, block_start + offset + + if (i1 & inclusion_mask) != desired_value_mask: + continue # skip this iteration if the control bits do not match + + i2: Int = ( + i1 | size_of_half_block + ) # equivalent to i1 + size_of_half_block + + print("i1:", i1, "i2:", i2) + + quantum_state_out_re[i1, 0] = ( + (gate_set_re[gate_index, 0, 0] * quantum_state_re[i1, 0]) + - (gate_set_im[gate_index, 0, 0] * quantum_state_im[i1, 0]) + + (gate_set_re[gate_index, 0, 1] * quantum_state_re[i2, 0]) + - (gate_set_im[gate_index, 0, 1] * quantum_state_im[i2, 0]) ) - quantum_state_out_im[i1] = ( - (gate_re[0, 0] * quantum_state_im[i1]) - - (gate_im[0, 0] * quantum_state_re[i1]) - + (gate_re[0, 1] * quantum_state_im[i2]) - - (gate_im[0, 1] * quantum_state_re[i2]) + quantum_state_out_im[i1, 0] = ( + 
(gate_set_re[gate_index, 0, 0] * quantum_state_im[i1, 0]) + - (gate_set_im[gate_index, 0, 0] * quantum_state_re[i1, 0]) + + (gate_set_re[gate_index, 0, 1] * quantum_state_im[i2, 0]) + - (gate_set_im[gate_index, 0, 1] * quantum_state_re[i2, 0]) ) - quantum_state_out_re[i2] = ( - (gate_re[1, 0] * quantum_state_re[i1]) - - (gate_im[1, 0] * quantum_state_im[i1]) - + (gate_re[1, 1] * quantum_state_re[i2]) - - (gate_im[1, 1] * quantum_state_im[i2]) + quantum_state_out_re[i2, 0] = ( + (gate_set_re[gate_index, 1, 0] * quantum_state_re[i1, 0]) + - (gate_set_im[gate_index, 1, 0] * quantum_state_im[i1, 0]) + + (gate_set_re[gate_index, 1, 1] * quantum_state_re[i2, 0]) + - (gate_set_im[gate_index, 1, 1] * quantum_state_im[i2, 0]) ) - quantum_state_out_im[i2] = ( - (gate_re[1, 0] * quantum_state_im[i1]) - - (gate_im[1, 0] * quantum_state_re[i1]) - + (gate_re[1, 1] * quantum_state_im[i2]) - - (gate_im[1, 1] * quantum_state_re[i2]) + quantum_state_out_im[i2, 0] = ( + (gate_set_re[gate_index, 1, 0] * quantum_state_im[i1, 0]) + - (gate_set_im[gate_index, 1, 0] * quantum_state_re[i1, 0]) + + (gate_set_re[gate_index, 1, 1] * quantum_state_im[i2, 0]) + - (gate_set_im[gate_index, 1, 1] * quantum_state_re[i2, 0]) ) diff --git a/src/base/state_and_matrix.mojo b/src/base/state_and_matrix.mojo index 6205d45..f295216 100644 --- a/src/base/state_and_matrix.mojo +++ b/src/base/state_and_matrix.mojo @@ -5,10 +5,18 @@ from collections.linked_list import LinkedList from ..local_stdlib import CustomList -from ..local_stdlib.complex import ComplexFloat64 +from ..local_stdlib.complex import ComplexFloat32 from .qubits_operations import partial_trace +# GPU imports + +from layout import Layout, LayoutTensor + +alias dtype = DType.float32 +alias STATE_VECTOR_SIZE = 8 +alias state_vector_3qubits_layout = Layout.row_major(STATE_VECTOR_SIZE, 1) + # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # # MARK: Structs # @@ -27,7 +35,7 @@ struct StateVector(Copyable, Movable, Stringable, Writable): var num_qubits: Int 
"""The number of qubits in the state, which determines the size of the state vector.""" - var state_vector: CustomList[ComplexFloat64, hint_trivial_type=True] + var state_vector: CustomList[ComplexFloat32, hint_trivial_type=True] """The state vector representing the amplitudes of the basis states.""" fn __init__(out self, size: Int): @@ -37,23 +45,23 @@ struct StateVector(Copyable, Movable, Stringable, Writable): size: The size of the vector, which is 2^n for n qubits. """ self.num_qubits = 0 - self.state_vector = CustomList[ComplexFloat64, hint_trivial_type=True]( - length=size, fill=ComplexFloat64(0.0, 0.0) + self.state_vector = CustomList[ComplexFloat32, hint_trivial_type=True]( + length=size, fill=ComplexFloat32(0.0, 0.0) ) # self.state_vector.memset_zero() @always_inline - fn __getitem__(self, index: Int) -> ComplexFloat64: + fn __getitem__(self, index: Int) -> ComplexFloat32: # @parameter # if check_bounds: # if index < 0 or index >= self.size(): # print("ERROR: Index", index, "is out of bounds for state vector of size", self.size()) - # return ComplexFloat64(0.0, 0.0) + # return ComplexFloat32(0.0, 0.0) # else: return self.state_vector[index] @always_inline - fn __setitem__(mut self, index: Int, value: ComplexFloat64) -> None: + fn __setitem__(mut self, index: Int, value: ComplexFloat32) -> None: self.state_vector[index] = value fn __str__(self) -> String: @@ -63,8 +71,8 @@ struct StateVector(Copyable, Movable, Stringable, Writable): for i in range(self.size()): amplitude = self.state_vector[i] # amplitude_str: String = String(amplitude) - amplitude_re: Float64 = amplitude.re - amplitude_im: Float64 = amplitude.im + amplitude_re: Float32 = amplitude.re + amplitude_im: Float32 = amplitude.im if amplitude_im == 0.0 and amplitude_re == 0.0: amplitude_str: String = String(Int(amplitude_re)) elif amplitude_im == 0.0: @@ -108,12 +116,8 @@ struct StateVector(Copyable, Movable, Stringable, Writable): """ num_qubits: Int = len(bitstring) - # state_vector: 
List[ComplexFloat64] = [ComplexFloat64(0.0, 0.0)] * ( - # 1 << num_qubits - # ) # 2^num_qubits - - state_vector = CustomList[ComplexFloat64, hint_trivial_type=True]( - length=1 << num_qubits, fill=ComplexFloat64(0.0, 0.0) + state_vector = CustomList[ComplexFloat32, hint_trivial_type=True]( + length=1 << num_qubits, fill=ComplexFloat32(0.0, 0.0) ) # 2^num_qubits state_vector.memset_zero() # Initialize the state vector with zeros @@ -125,11 +129,39 @@ struct StateVector(Copyable, Movable, Stringable, Writable): index |= 1 << i # Set the bit at position i i += 1 - state_vector[index] = ComplexFloat64( + state_vector[index] = ComplexFloat32( 1.0, 0.0 ) # Set the amplitude for the state to 1 return Self(num_qubits, state_vector) + # @staticmethod + # fn from_bitstring_gpu( + # bitstring: String, # GPU does not support String + # quantum_state_re: LayoutTensor[ + # mut=True, dtype, state_vector_3qubits_layout + # ], + # quantum_state_im: LayoutTensor[ + # mut=True, dtype, state_vector_3qubits_layout + # ], + # ) -> None: + # """Fill the quantum state vector from the state of the given bitstring. + + # Params: + # bitstring: A string of '0's and '1's representing the state, with + # the least significant qubit (top one) (LSB) at the start. + # gate_re: Real part of the gate matrix, initialized to zeros. + # gate_im: Imaginary part of the gate matrix, initialized to zeros. 
+ # """ + # # Put coefficent correspondin to the bitstring to 1 + # index: Int = 0 + # i: Int = 0 + # for bit in bitstring.codepoints(): + # if bit == Codepoint.ord("1"): + # index |= 1 << i # Set the bit at position i + # i += 1 + + # quantum_state_re[index, 0] = 1.0 + fn write_to[W: Writer](self, mut writer: W) -> None: writer.write(String(self)) @@ -147,7 +179,7 @@ struct StateVector(Copyable, Movable, Stringable, Writable): """Fills the state vector with zeros.""" self.state_vector.memset_zero() # Set all elements to zero # for i in range(self.size()): - # self.state_vector[i] = ComplexFloat64(0.0, 0.0) # Set each amplitude to zero + # self.state_vector[i] = ComplexFloat32(0.0, 0.0) # Set each amplitude to zero fn is_valid_state(self) -> Bool: """Checks if the state vector is a valid quantum state. @@ -162,7 +194,7 @@ struct StateVector(Copyable, Movable, Stringable, Writable): return False # No qubits means no valid state # Use the buil-in methods of the Complex type - squared_norm: Float64 = 0.0 + squared_norm: Float32 = 0.0 for amplitude in self.state_vector: squared_norm += amplitude.squared_norm() # Check if the squared norm is approximately equal to 1 @@ -215,7 +247,7 @@ struct StateVector(Copyable, Movable, Stringable, Writable): # and without duplicates. @always_inline - fn purity(self, arg_qubits_to_keep: LinkedList[Int] = []) -> Float64: + fn purity(self, arg_qubits_to_keep: LinkedList[Int] = []) -> Float32: """Calculates the purity of the pure state. The purity is defined as the trace of the density matrix squared. 
@@ -271,7 +303,7 @@ struct StateVector(Copyable, Movable, Stringable, Writable): density_matrix: ComplexMatrix = partial_trace(self, qubits_to_trace_out) # density_matrix = self.to_density_matrix() - trace_squared: Float64 = 0.0 + trace_squared: Float32 = 0.0 for i in range(density_matrix.size()): trace_squared += density_matrix[i, i].squared_norm() @@ -279,7 +311,7 @@ struct StateVector(Copyable, Movable, Stringable, Writable): return trace_squared @always_inline - fn normalised_purity(self, qubits_to_keep: LinkedList[Int] = []) -> Float64: + fn normalised_purity(self, qubits_to_keep: LinkedList[Int] = []) -> Float32: """Calculates the normalised purity of the pure state. For a density matrix of size 2n×2n, purity ranges from 1/2^n to 1. @@ -293,7 +325,7 @@ struct StateVector(Copyable, Movable, Stringable, Writable): ) @always_inline - fn linear_entropy(self, qubits_to_keep: LinkedList[Int] = []) -> Float64: + fn linear_entropy(self, qubits_to_keep: LinkedList[Int] = []) -> Float32: """Calculates the linear entropy of the pure state. The linear entropy is defined as 1 - purity. @@ -311,7 +343,7 @@ struct ComplexMatrix(Copyable, Movable, Stringable, Writable): This is used to represent quantum gates in the form of matrices. """ - var matrix: List[List[ComplexFloat64]] + var matrix: List[List[ComplexFloat32]] """The 2D matrix representation of the quantum gate.""" fn __init__(out self, rows: Int, cols: Int): @@ -321,23 +353,23 @@ struct ComplexMatrix(Copyable, Movable, Stringable, Writable): rows: The number of rows in the matrix. cols: The number of columns in the matrix. 
""" - self.matrix = List[List[ComplexFloat64]]( + self.matrix = List[List[ComplexFloat32]]( length=rows, - fill=List[ComplexFloat64]( - length=cols, fill=ComplexFloat64(0.0, 0.0) + fill=List[ComplexFloat32]( + length=cols, fill=ComplexFloat32(0.0, 0.0) ), ) # # Initialize the matrix with zeros # for i in range(rows): # for j in range(cols): - # self.matrix[i][j] = ComplexFloat64(0.0, 0.0) + # self.matrix[i][j] = ComplexFloat32(0.0, 0.0) @always_inline - fn __getitem__(self, row: Int, col: Int) -> ComplexFloat64: + fn __getitem__(self, row: Int, col: Int) -> ComplexFloat32: return self.matrix[row][col] @always_inline - fn __setitem__(mut self, row: Int, col: Int, value: ComplexFloat64) -> None: + fn __setitem__(mut self, row: Int, col: Int, value: ComplexFloat32) -> None: """Sets the value at the specified row and column in the matrix. Args: row: The row index of the matrix. diff --git a/tests/base/test_qubit_operations.mojo b/tests/base/test_qubit_operations.mojo index 7301b7f..89e6126 100644 --- a/tests/base/test_qubit_operations.mojo +++ b/tests/base/test_qubit_operations.mojo @@ -34,7 +34,7 @@ from qlabs.base import ( ) from qlabs.local_stdlib import CustomList -from qlabs.local_stdlib.complex import ComplexFloat64 +from qlabs.local_stdlib.complex import ComplexFloat32 def test_qubit_wise_multiply_0(): @@ -52,15 +52,15 @@ def test_qubit_wise_multiply_0(): quantum_state, StateVector( 3, - CustomList[ComplexFloat64, hint_trivial_type=True]( - ComplexFloat64(0.5, 0), - ComplexFloat64(0, 0), - ComplexFloat64(0, 0), - ComplexFloat64(0.5, 0), - ComplexFloat64(0.5, 0), - ComplexFloat64(0, 0), - ComplexFloat64(0, 0), - ComplexFloat64(0.5, 0), + CustomList[ComplexFloat32, hint_trivial_type=True]( + ComplexFloat32(0.5, 0), + ComplexFloat32(0, 0), + ComplexFloat32(0, 0), + ComplexFloat32(0.5, 0), + ComplexFloat32(0.5, 0), + ComplexFloat32(0, 0), + ComplexFloat32(0, 0), + ComplexFloat32(0.5, 0), ), ), ) @@ -103,15 +103,15 @@ def test_qubit_wise_multiply_figure1(): 
quantum_state, StateVector( 3, - CustomList[ComplexFloat64, hint_trivial_type=True]( - ComplexFloat64(0, 0), - ComplexFloat64(0, 0), - ComplexFloat64(0, 0), - ComplexFloat64(-1.0 / sqrt(2.0), 0), - ComplexFloat64(1.0 / sqrt(2.0), 0), - ComplexFloat64(0, 0), - ComplexFloat64(0, 0), - ComplexFloat64(0, 0), + CustomList[ComplexFloat32, hint_trivial_type=True]( + ComplexFloat32(0, 0), + ComplexFloat32(0, 0), + ComplexFloat32(0, 0), + ComplexFloat32(-1.0 / Float32(sqrt(2.0)), 0), + ComplexFloat32(1.0 / Float32(sqrt(2.0)), 0), + ComplexFloat32(0, 0), + ComplexFloat32(0, 0), + ComplexFloat32(0, 0), ), ), ) @@ -121,41 +121,41 @@ def test_partial_trace_all(): """Test the partial trace operation on a 2-qubit state. Keep all qubits.""" state: StateVector = StateVector( 2, - CustomList[ComplexFloat64, hint_trivial_type=True]( - ComplexFloat64(0, 0), - ComplexFloat64(-0.5, 0), - ComplexFloat64(0.7071067811863477, 0), - ComplexFloat64(-0.5, 0), + CustomList[ComplexFloat32, hint_trivial_type=True]( + ComplexFloat32(0, 0), + ComplexFloat32(-0.5, 0), + ComplexFloat32(0.7071067811863477, 0), + ComplexFloat32(-0.5, 0), ), ) matrix: ComplexMatrix = partial_trace(state, []) assert_matrix_almost_equal( matrix, ComplexMatrix( - List[List[ComplexFloat64]]( + List[List[ComplexFloat32]]( [ - ComplexFloat64(0, 0), - ComplexFloat64(0, 0), - ComplexFloat64(0, 0), - ComplexFloat64(0, 0), + ComplexFloat32(0, 0), + ComplexFloat32(0, 0), + ComplexFloat32(0, 0), + ComplexFloat32(0, 0), ], [ - ComplexFloat64(0, 0), - ComplexFloat64(0.25, 0), - ComplexFloat64(-0.3535533905929741, 0), - ComplexFloat64(0.25, 0), + ComplexFloat32(0, 0), + ComplexFloat32(0.25, 0), + ComplexFloat32(-0.3535533905929741, 0), + ComplexFloat32(0.25, 0), ], [ - ComplexFloat64(0, 0), - ComplexFloat64(-0.3535533905929741, 0), - ComplexFloat64(0.5, 0), - ComplexFloat64(-0.3535533905929741, 0), + ComplexFloat32(0, 0), + ComplexFloat32(-0.3535533905929741, 0), + ComplexFloat32(0.5, 0), + ComplexFloat32(-0.3535533905929741, 0), ], [ 
- ComplexFloat64(0, 0), - ComplexFloat64(0.25, 0), - ComplexFloat64(-0.3535533905929741, 0), - ComplexFloat64(0.25, 0), + ComplexFloat32(0, 0), + ComplexFloat32(0.25, 0), + ComplexFloat32(-0.3535533905929741, 0), + ComplexFloat32(0.25, 0), ], ) ), @@ -168,15 +168,15 @@ def test_partial_trace_sec67(): """ state: StateVector = StateVector( 3, - CustomList[ComplexFloat64, hint_trivial_type=True]( - ComplexFloat64(0.5, 0), - ComplexFloat64(0, 0), - ComplexFloat64(0, 0), - ComplexFloat64(0.5, 0), - ComplexFloat64(0.5, 0), - ComplexFloat64(0, 0), - ComplexFloat64(0, 0), - ComplexFloat64(0.5, 0), + CustomList[ComplexFloat32, hint_trivial_type=True]( + ComplexFloat32(0.5, 0), + ComplexFloat32(0, 0), + ComplexFloat32(0, 0), + ComplexFloat32(0.5, 0), + ComplexFloat32(0.5, 0), + ComplexFloat32(0, 0), + ComplexFloat32(0, 0), + ComplexFloat32(0.5, 0), ), ) @@ -184,9 +184,9 @@ def test_partial_trace_sec67(): assert_matrix_almost_equal( matrix, ComplexMatrix( - List[List[ComplexFloat64]]( - [ComplexFloat64(0.5, 0), ComplexFloat64(0.5, 0)], - [ComplexFloat64(0.5, 0), ComplexFloat64(0.5, 0)], + List[List[ComplexFloat32]]( + [ComplexFloat32(0.5, 0), ComplexFloat32(0.5, 0)], + [ComplexFloat32(0.5, 0), ComplexFloat32(0.5, 0)], ) ), "partial trace qubit 0 and 1", @@ -196,30 +196,30 @@ def test_partial_trace_sec67(): assert_matrix_almost_equal( matrix, ComplexMatrix( - List[List[ComplexFloat64]]( + List[List[ComplexFloat32]]( [ - ComplexFloat64(0.5, 0), - ComplexFloat64(0, 0), - ComplexFloat64(0, 0), - ComplexFloat64(0.5, 0), + ComplexFloat32(0.5, 0), + ComplexFloat32(0, 0), + ComplexFloat32(0, 0), + ComplexFloat32(0.5, 0), ], [ - ComplexFloat64(0, 0), - ComplexFloat64(0, 0), - ComplexFloat64(0, 0), - ComplexFloat64(0, 0), + ComplexFloat32(0, 0), + ComplexFloat32(0, 0), + ComplexFloat32(0, 0), + ComplexFloat32(0, 0), ], [ - ComplexFloat64(0, 0), - ComplexFloat64(0, 0), - ComplexFloat64(0, 0), - ComplexFloat64(0, 0), + ComplexFloat32(0, 0), + ComplexFloat32(0, 0), + ComplexFloat32(0, 0), 
+ ComplexFloat32(0, 0), ], [ - ComplexFloat64(0.5, 0), - ComplexFloat64(0, 0), - ComplexFloat64(0, 0), - ComplexFloat64(0.5, 0), + ComplexFloat32(0.5, 0), + ComplexFloat32(0, 0), + ComplexFloat32(0, 0), + ComplexFloat32(0.5, 0), ], ) ), @@ -230,9 +230,9 @@ def test_partial_trace_sec67(): assert_matrix_almost_equal( matrix, ComplexMatrix( - List[List[ComplexFloat64]]( - [ComplexFloat64(0.5, 0), ComplexFloat64(0, 0)], - [ComplexFloat64(0, 0), ComplexFloat64(0.5, 0)], + List[List[ComplexFloat32]]( + [ComplexFloat32(0.5, 0), ComplexFloat32(0, 0)], + [ComplexFloat32(0, 0), ComplexFloat32(0.5, 0)], ) ), "partial trace qubits 1 and 2", @@ -242,30 +242,30 @@ def test_partial_trace_sec67(): assert_matrix_almost_equal( matrix, ComplexMatrix( - List[List[ComplexFloat64]]( + List[List[ComplexFloat32]]( [ - ComplexFloat64(0.25, 0), - ComplexFloat64(0, 0), - ComplexFloat64(0.25, 0), - ComplexFloat64(0, 0), + ComplexFloat32(0.25, 0), + ComplexFloat32(0, 0), + ComplexFloat32(0.25, 0), + ComplexFloat32(0, 0), ], [ - ComplexFloat64(0, 0), - ComplexFloat64(0.25, 0), - ComplexFloat64(0, 0), - ComplexFloat64(0.25, 0), + ComplexFloat32(0, 0), + ComplexFloat32(0.25, 0), + ComplexFloat32(0, 0), + ComplexFloat32(0.25, 0), ], [ - ComplexFloat64(0.25, 0), - ComplexFloat64(0, 0), - ComplexFloat64(0.25, 0), - ComplexFloat64(0, 0), + ComplexFloat32(0.25, 0), + ComplexFloat32(0, 0), + ComplexFloat32(0.25, 0), + ComplexFloat32(0, 0), ], [ - ComplexFloat64(0, 0), - ComplexFloat64(0.25, 0), - ComplexFloat64(0, 0), - ComplexFloat64(0.25, 0), + ComplexFloat32(0, 0), + ComplexFloat32(0.25, 0), + ComplexFloat32(0, 0), + ComplexFloat32(0.25, 0), ], ) ), From 28801e7b3a976687dfe1fe3cc92d17c82415c0b6 Mon Sep 17 00:00:00 2001 From: ttrenty <154608953+ttrenty@users.noreply.github.com> Date: Sat, 28 Jun 2025 21:12:56 -0600 Subject: [PATCH 3/7] feat: qubit_wise_multiply improve gpu implementation + add benchmark --- benchmarks/all_benchmarks.mojo | 22 +- benchmarks/bench_qubit_wise_multiply.mojo | 836 
+++++++++++++-- benchmarks/bench_simulate_random_circuit.mojo | 178 ++++ examples/gpu_examples.mojo | 974 +++++++++++++----- examples/main.mojo | 76 +- pixi.toml | 2 +- src/base/__init__.mojo | 1 + src/base/gpu/__init__.mojo | 3 +- src/base/gpu/qubits_operations.mojo | 808 +++++++++++---- src/base/qubits_operations.mojo | 68 ++ 10 files changed, 2457 insertions(+), 511 deletions(-) create mode 100644 benchmarks/bench_simulate_random_circuit.mojo diff --git a/benchmarks/all_benchmarks.mojo b/benchmarks/all_benchmarks.mojo index bb3fc0e..6fefb3c 100644 --- a/benchmarks/all_benchmarks.mojo +++ b/benchmarks/all_benchmarks.mojo @@ -1,7 +1,25 @@ -from bench_qubit_wise_multiply import bench_main +from sys import has_accelerator + +from bench_simulate_random_circuit import bench_simulate_random_circuit +from bench_qubit_wise_multiply import ( + bench_qubit_wise_multiply, + bench_qubit_wise_multiply_inplace, + bench_qubit_wise_multiply_inplace_gpu, + bench_qubit_wise_multiply_extended, +) def main(): print("Running all benchmarks...") - bench_main() + # bench_qubit_wise_multiply() + # bench_qubit_wise_multiply_inplace() + + @parameter + if not has_accelerator(): + print("No compatible GPU found") + else: + bench_qubit_wise_multiply_inplace_gpu() + + # bench_qubit_wise_multiply_extended() + # bench_simulate_random_circuit() print("All benchmarks completed.") diff --git a/benchmarks/bench_qubit_wise_multiply.mojo b/benchmarks/bench_qubit_wise_multiply.mojo index 56e9b7d..bd723d8 100644 --- a/benchmarks/bench_qubit_wise_multiply.mojo +++ b/benchmarks/bench_qubit_wise_multiply.mojo @@ -1,11 +1,8 @@ from gpu.host import DeviceContext -from benchmark import ( - Bench, - BenchConfig, - Bencher, - BenchId, -) +from layout import Layout, LayoutTensor, IntTuple + +from benchmark import Bench, BenchConfig, Bencher, BenchId, keep from pathlib import Path @@ -27,11 +24,14 @@ from qlabs.base import ( SWAP, iSWAP, qubit_wise_multiply, + qubit_wise_multiply_inplace, 
qubit_wise_multiply_extended, apply_swap, partial_trace, ) +from qlabs.base.gpu import qubit_wise_multiply_inplace_gpu + from qlabs.abstractions import ( GateCircuit, StateVectorSimulator, @@ -41,98 +41,788 @@ from qlabs.abstractions import ( ) -fn simulate_random_circuit[num_qubits: Int, number_layers: Int]() -> None: - """Simulates a random quantum circuit with the specified number of qubits and layers. +# alias BLOCKS_PER_GRID = 1 +# alias THREADS_PER_BLOCK = (1, 1) +alias dtype = DType.float32 + +alias GATE_SIZE = 2 +alias NUMBER_CONTROL_BITS = 1 +# TODO have NUMBER_CONTROL_BITS be a list defining each gates specific control bits count +alias CIRCUIT_NUMBER_CONTROL_GATES = 1 +alias circuit_control_bits_layout = Layout.row_major( + CIRCUIT_NUMBER_CONTROL_GATES, NUMBER_CONTROL_BITS, 2 +) + +alias gate_1qubit_layout = Layout.row_major(GATE_SIZE, GATE_SIZE) +alias STATE_VECTOR_SIZE = 8 +alias state_vector_3qubits_layout = Layout.row_major(STATE_VECTOR_SIZE) +alias control_bits_layout = Layout.row_major(NUMBER_CONTROL_BITS, 2) - Parameters: - num_qubits: The number of qubits in the circuit. - number_layers: The number of layers in the circuit. 
- """ +alias gate_set_dic: Dict[String, Int] = { + Hadamard.symbol: 0, + PauliX.symbol: 1, + PauliY.symbol: 2, + PauliZ.symbol: 3, +} +alias GATE_SET_SIZE = 4 +alias gate_set_1qubit_layout = Layout.row_major( + GATE_SET_SIZE, GATE_SIZE, GATE_SIZE +) +alias gate_set_1qubit_vectorized_layout = Layout.row_major( + GATE_SET_SIZE, GATE_SIZE, GATE_SIZE, 2 +) - qc: GateCircuit = GateCircuit(num_qubits) +@parameter +@always_inline +fn benchmark_qubit_wise_multiply[ + num_qubits: Int, number_layers: Int +](mut b: Bencher) raises: gates_list: List[Gate] = [Hadamard, PauliX, PauliY, PauliZ] - # index: UnsafePointer[Int8] = UnsafePointer[Int8].alloc(2*num_qubits) - # print("Creating random circuit...") - # random.seed() # Seed on current time - # for _ in range(400): - # random.randint(index, 2*num_qubits, 0, len(gates_list) - 1) - # for i in range(num_qubits): - # qc = qc.apply(gates_list[Int(index[i])], i) - # qc = qc.barrier() - # for i in range(num_qubits - 1): - # qc = qc.apply( - # gates_list[Int(index[num_qubits + i])], - # i, - # controls=[(i + 1) % num_qubits], - # is_anti_control=[False], - # ) - # qc = qc.barrier() - - index: UnsafePointer[Int8] = UnsafePointer[Int8].alloc( + indexes: UnsafePointer[Int8] = UnsafePointer[Int8].alloc( number_layers * 2 * num_qubits ) random.seed() # Seed on current time random.randint( - index, number_layers * 2 * num_qubits, 0, len(gates_list) - 1 + indexes, number_layers * 2 * num_qubits, 0, len(gates_list) - 1 ) - for iter in range(number_layers): - for i in range(num_qubits): - qc.apply(gates_list[Int(index[iter * num_qubits + i])](i)) - qc.barrier() - for i in range(num_qubits - 1): - qc.apply( - gates_list[Int(index[iter * num_qubits + num_qubits + i])]( - i, controls=[(i + 1) % num_qubits] - ), - ) - qc.barrier() + @parameter + @always_inline + fn qubit_wise_multiply_workflow(ctx: DeviceContext) raises: + """Simulates a random quantum circuit with the specified number of qubits and layers. 
+ """ - initial_state_bitstring: String = ( - "0" * num_qubits - ) # Initial state |000...0⟩ - initial_state: StateVector = StateVector.from_bitstring( - initial_state_bitstring - ) + # Initialize the quantum circuit to the |0⟩ state + quantum_state: StateVector = StateVector.from_bitstring( + "0" * num_qubits + ) + + for layer in range(number_layers): + for i in range(num_qubits): + quantum_state = qubit_wise_multiply( + gates_list[Int(indexes[layer * num_qubits + i])].matrix, + i, + quantum_state, + ) + for i in range(num_qubits - 1): + quantum_state = qubit_wise_multiply( + gates_list[ + Int(indexes[layer * num_qubits + num_qubits + i]) + ].matrix, + i, + quantum_state, + [[(i + 1) % num_qubits, 1]], + ) - qsimu = StateVectorSimulator( - qc, - initial_state=initial_state, - optimisation_level=0, # No optimisations for now - verbose=False, - # verbose_step_size=ShowAfterEachLayer, # ShowAfterEachGate, ShowOnlyEnd - verbose_step_size=ShowAfterEachGate, # ShowAfterEachGate, ShowOnlyEnd - # stop_at=ShowAfterEachGate, # ShowAfterEachGate, ShowOnlyEnd # TODO implement that instead of having access to manual methods + bench_ctx = DeviceContext() + b.iter_custom[qubit_wise_multiply_workflow](bench_ctx) + + +@parameter +@always_inline +fn benchmark_qubit_wise_multiply_inplace[ + num_qubits: Int, number_layers: Int +](mut b: Bencher) raises: + gates_list: List[Gate] = [Hadamard, PauliX, PauliY, PauliZ] + + indexes: UnsafePointer[Int8] = UnsafePointer[Int8].alloc( + number_layers * 2 * num_qubits + ) + random.seed() # Seed on current time + random.randint( + indexes, number_layers * 2 * num_qubits, 0, len(gates_list) - 1 ) - for _ in range(100): - _ = qsimu.run() + @parameter + @always_inline + fn qubit_wise_multiply_inplace_workflow(ctx: DeviceContext) raises: + """Simulates a random quantum circuit with the specified number of qubits and layers. 
+ """ + + # TODO report github that this is inconvenient because it won't compile + # error: argument of 'qubit_wise_multiply_inplace' call allows writing a memory location previously writable through another aliased argument + # quantum_states = List[StateVector]( + # StateVector.from_bitstring("0" * num_qubits), + # StateVector.from_bitstring("0" * num_qubits), + # ) + + # Why would this work while the above doesn't? + # quantum_states: Dict[Int, StateVector] = { + # 0: StateVector.from_bitstring("0" * num_qubits), + # 1: StateVector.from_bitstring("0" * num_qubits), + # } + + quantum_state_0 = StateVector.from_bitstring("0" * num_qubits) + quantum_state_1 = StateVector.from_bitstring("0" * num_qubits) + + current_state = 0 + for layer in range(number_layers): + for i in range(num_qubits): + # NOTE Works but is slow with the dictionary + # qubit_wise_multiply_inplace( + # gates_list[Int(indexes[layer * num_qubits + i])].matrix, + # i, + # quantum_states[current_state], + # quantum_states[1 - current_state], + # ) + # NOTE: Fast buty doesn't actually use the next state for new operations + # qubit_wise_multiply_inplace( + # gates_list[Int(indexes[layer * num_qubits + i])].matrix, + # i, + # quantum_state_0, + # quantum_state_1, + # ) + if current_state == 0: + qubit_wise_multiply_inplace( + gates_list[Int(indexes[layer * num_qubits + i])].matrix, + i, + quantum_state_0, + quantum_state_1, + ) + current_state = 1 + else: + qubit_wise_multiply_inplace( + gates_list[Int(indexes[layer * num_qubits + i])].matrix, + i, + quantum_state_1, + quantum_state_0, + ) + current_state = 0 + for i in range(num_qubits - 1): + # qubit_wise_multiply_inplace( + # gates_list[ + # Int(indexes[layer * num_qubits + num_qubits + i]) + # ].matrix, + # i, + # quantum_states[current_state], + # quantum_states[1 - current_state], + # [[(i + 1) % num_qubits, 1]], + # ) + # current_state = 1 - current_state + # qubit_wise_multiply_inplace( + # gates_list[ + # Int(indexes[layer * num_qubits + 
num_qubits + i]) + # ].matrix, + # i, + # quantum_state_0, + # quantum_state_1, + # [[(i + 1) % num_qubits, 1]], + # ) + if current_state == 0: + qubit_wise_multiply_inplace( + gates_list[ + Int(indexes[layer * num_qubits + num_qubits + i]) + ].matrix, + i, + quantum_state_0, + quantum_state_1, + [[(i + 1) % num_qubits, 1]], + ) + current_state = 1 + else: + qubit_wise_multiply_inplace( + gates_list[ + Int(indexes[layer * num_qubits + num_qubits + i]) + ].matrix, + i, + quantum_state_1, + quantum_state_0, + [[(i + 1) % num_qubits, 1]], + ) + current_state = 0 + + bench_ctx = DeviceContext() + b.iter_custom[qubit_wise_multiply_inplace_workflow](bench_ctx) + + +@parameter +@always_inline +fn benchmark_qubit_wise_multiply_inplace_gpu[ + num_qubits: Int, number_layers: Int +](mut b: Bencher) raises: + # gates_list: List[Gate] = [Hadamard, PauliX, PauliY, PauliZ] + + # indexes: UnsafePointer[Int8] = UnsafePointer[Int8].alloc( + # number_layers * 1 * num_qubits + # ) + # random.seed() # Seed on current time + # random.randint( + # indexes, number_layers * 2 * num_qubits, 0, len(gates_list) - 1 + # ) + + bench_ctx = DeviceContext() + + alias state_vector_size = 1 << num_qubits + alias state_vector_layout = Layout.row_major(state_vector_size) + + alias total_threads = 2 * state_vector_size + + # alias max_threads_per_block = 1024 + alias sm_count = bench_ctx.device_info.sm_count + alias max_blocks_per_multiprocessor = bench_ctx.device_info.max_blocks_per_multiprocessor + + # alias max_number_blocks = sm_count * max_blocks_per_multiprocessor + alias max_number_blocks = 128 + alias max_threads_per_block = bench_ctx.device_info.max_thread_block_size + + print("state_vector_size:", state_vector_size) + + # try: + # print("BEFORE:") + # (free, total) = bench_ctx.get_memory_info() + # print("Free memory:", free / (1024 * 1024), "MB") + # print("Total memory:", total / (1024 * 1024), "MB") + # except: + # print("Failed to get memory information") + + @parameter + @always_inline + 
fn qubit_wise_multiply_inplace_gpu_workflow(ctx: DeviceContext) raises: + """Simulates on GPU a random quantum circuit with the specified number of qubits and layers. + """ + # gate_set: List[Gate] = [Hadamard, PauliX, PauliY, PauliZ] + + try: + print("1.BEFORE ALLOCATING:") + (free, total) = ctx.get_memory_info() + print("Free memory:", free / (1024 * 1024), "MB") + print("Total memory:", total / (1024 * 1024), "MB") + except: + print("Failed to get memory information") + + # blocks_per_grid = ( + # total_threads + max_threads_per_block - 1 + # ) // max_threads_per_block + + # if blocks_per_grid >= max_number_blocks: + # blocks_per_grid = max_number_blocks - 1 + + blocks_per_grid = 1 + + threads_per_block = ( + 1, + 1, + 1, + ) + + # @parameter + # if total_threads < max_threads_per_block: + # threads_per_block = ( + # total_threads, + # 1, + # 1, + # ) # 1D block of threads + + # print( + # "blocks_per_grid:", + # blocks_per_grid, + # "max_number_blocks:", + # max_number_blocks, + # ) + # print("threads_per_block[0]:", threads_per_block[0]) + + # var control_bits_list: List[List[List[Int]]] = [ + # [[1, 1]], + # # [[1, 1]], + # ] + + # control_bits_list: List[List[List[Int]]] = [] + + # -- Create GPU variables -- # + + host_quantum_state_re = ctx.enqueue_create_host_buffer[dtype]( + state_vector_size + ) + host_quantum_state_im = ctx.enqueue_create_host_buffer[dtype]( + state_vector_size + ) + + host_gate_set_re = ctx.enqueue_create_host_buffer[dtype]( + GATE_SET_SIZE * GATE_SIZE * GATE_SIZE + ) + host_gate_set_im = ctx.enqueue_create_host_buffer[dtype]( + GATE_SET_SIZE * GATE_SIZE * GATE_SIZE + ) + + # host_control_bits_circuit = ctx.enqueue_create_host_buffer[DType.int32]( + # CIRCUIT_NUMBER_CONTROL_GATES * NUMBER_CONTROL_BITS * 2 + # ) + + quantum_state: StateVector = StateVector.from_bitstring("000") + + # Wait for host buffers to be ready + ctx.synchronize() + + # -- Fill host buffers -- # + + for i in range(state_vector_size): + host_quantum_state_re[i] = 
quantum_state[i].re + host_quantum_state_im[i] = quantum_state[i].im + + # for i in range(GATE_SET_SIZE): + # gate = gate_set[i] + # for j in range(GATE_SIZE): + # for k in range(GATE_SIZE): + # index = gate_set_1qubit_layout( + # IntTuple(i, j, k) + # ) # Get the index in the 1D buffer + # host_gate_set_re[index] = gate[j, k].re + # host_gate_set_im[index] = gate[j, k].im + + # for i in range(CIRCUIT_NUMBER_CONTROL_GATES): + # for j in range(NUMBER_CONTROL_BITS): + # for k in range(2): + # index = circuit_control_bits_layout(IntTuple(i, j, k)) + # host_control_bits_circuit[index] = control_bits_list[i][j][ + # k + # ] + + # -- Copy host buffers to device buffers -- # + quantum_state_re = ctx.enqueue_create_buffer[dtype](state_vector_size) + quantum_state_im = ctx.enqueue_create_buffer[dtype](state_vector_size) + + gate_set_re = ctx.enqueue_create_buffer[dtype]( + GATE_SET_SIZE * GATE_SIZE * GATE_SIZE + ) + gate_set_im = ctx.enqueue_create_buffer[dtype]( + GATE_SET_SIZE * GATE_SIZE * GATE_SIZE + ) + + control_bits_circuit = ctx.enqueue_create_buffer[DType.int32]( + CIRCUIT_NUMBER_CONTROL_GATES * NUMBER_CONTROL_BITS * 2 + ) + current_control_gate_circuit = ctx.enqueue_create_buffer[DType.int32](1) + + # Create other buffers for functions + + quantum_state_out_re = ctx.enqueue_create_buffer[dtype]( + state_vector_size + ) + quantum_state_out_im = ctx.enqueue_create_buffer[dtype]( + state_vector_size + ) + + quantum_state_re.enqueue_copy_from(host_quantum_state_re) + quantum_state_im.enqueue_copy_from(host_quantum_state_im) + + # gate_set_re.enqueue_copy_from(host_gate_set_re) + # gate_set_im.enqueue_copy_from(host_gate_set_im) + ctx.enqueue_memset(gate_set_re, 0.0) + ctx.enqueue_memset(gate_set_im, 0.0) + + # control_bits_circuit.enqueue_copy_from(host_control_bits_circuit) + ctx.enqueue_memset(control_bits_circuit, 0) + + # TODO report that this create a runtime error only in this context not when + # running the same code in a standalone script + # 
ctx.enqueue_memset(current_control_gate_circuit, 0.0) + ctx.enqueue_memset(current_control_gate_circuit, 0) + ctx.enqueue_memset(quantum_state_out_re, 0.0) + ctx.enqueue_memset(quantum_state_out_im, 0.0) + + # -- Create layout tensors for GPU operations -- # + gate_set_re_tensor = LayoutTensor[ + mut=False, dtype, gate_set_1qubit_layout + ](gate_set_re.unsafe_ptr()) + gate_set_im_tensor = LayoutTensor[ + mut=False, dtype, gate_set_1qubit_layout + ](gate_set_im.unsafe_ptr()) + + quantum_state_re_tensor = LayoutTensor[ + mut=True, dtype, state_vector_layout + ](quantum_state_re.unsafe_ptr()) + quantum_state_im_tensor = LayoutTensor[ + mut=True, dtype, state_vector_layout + ](quantum_state_im.unsafe_ptr()) + + quantum_state_out_re_tensor = LayoutTensor[ + mut=True, dtype, state_vector_layout + ](quantum_state_out_re.unsafe_ptr()) + quantum_state_out_im_tensor = LayoutTensor[ + mut=True, dtype, state_vector_layout + ](quantum_state_out_im.unsafe_ptr()) + + control_bits_circuit_tensor = LayoutTensor[ + mut=False, DType.int32, circuit_control_bits_layout + ](control_bits_circuit.unsafe_ptr()) + current_control_gate_circuit_tensor = LayoutTensor[ + mut=True, DType.int32, Layout.row_major(1) + ](current_control_gate_circuit.unsafe_ptr()) + + # -- Apply circuit operations -- # + + ctx.synchronize() + try: + print("2.AFTER ALLOCATING:") + (free, total) = ctx.get_memory_info() + print("Free memory:", free / (1024 * 1024), "MB") + print("Total memory:", total / (1024 * 1024), "MB") + except: + print("Failed to get memory information") + + # print("HERE") + current_state = 0 + for layer in range(number_layers): + # print("Layer:", layer, "out of", number_layers) + for qubit in range(num_qubits): + # print("Applying gate on qubit:", i, "of", num_qubits) + # print( + # "gate symbol: ", + # gates_list[Int(indexes[layer * num_qubits + i])].symbol, + # ) + # print( + # "gate index: ", + # gate_set_dic[ + # gates_list[Int(indexes[layer * num_qubits + i])].symbol + # ], + # ) + if 
current_state == 0: + ctx.enqueue_function[ + qubit_wise_multiply_inplace_gpu[number_control_bits=0] + ]( + gate_set_re_tensor, + gate_set_im_tensor, + gate_set_dic[Hadamard.symbol], + # gate_set_dic[ + # gates_list[ + # Int(indexes[layer * num_qubits + qubit]) + # ].symbol + # ], + GATE_SIZE, + qubit, # target_qubit + quantum_state_re_tensor, + quantum_state_im_tensor, + num_qubits, # number_qubits + state_vector_size, # quantum_state_size + quantum_state_out_re_tensor, + quantum_state_out_im_tensor, + control_bits_circuit_tensor, + current_control_gate_circuit_tensor, + grid_dim=blocks_per_grid, + block_dim=threads_per_block, + ) + current_state = 1 + else: + ctx.enqueue_function[ + qubit_wise_multiply_inplace_gpu[number_control_bits=0] + ]( + gate_set_re_tensor, + gate_set_im_tensor, + gate_set_dic[Hadamard.symbol], + # gate_set_dic[ + # gates_list[ + # Int(indexes[layer * num_qubits + qubit]) + # ].symbol + # ], + GATE_SIZE, + qubit, # target_qubit + quantum_state_out_re_tensor, + quantum_state_out_im_tensor, + num_qubits, # number_qubits + state_vector_size, # quantum_state_size + quantum_state_re_tensor, + quantum_state_im_tensor, + control_bits_circuit_tensor, + current_control_gate_circuit_tensor, + grid_dim=blocks_per_grid, + block_dim=threads_per_block, + ) + current_state = 0 + + keep(quantum_state_re.unsafe_ptr()) + keep(quantum_state_im.unsafe_ptr()) + keep(quantum_state_out_re.unsafe_ptr()) + keep(quantum_state_out_im.unsafe_ptr()) + keep(gate_set_re.unsafe_ptr()) + keep(gate_set_im.unsafe_ptr()) + keep(control_bits_circuit.unsafe_ptr()) + keep(current_control_gate_circuit.unsafe_ptr()) + + ctx.synchronize() + try: + print("3. 
AFTER AUTOMATIC FREE:") + (free, total) = ctx.get_memory_info() + print("Free memory:", free / (1024 * 1024), "MB") + print("Total memory:", total / (1024 * 1024), "MB") + except: + print("Failed to get memory information") + + b.iter_custom[qubit_wise_multiply_inplace_gpu_workflow](bench_ctx) @parameter @always_inline -fn benchmark_elementwise_parameterized[ +fn benchmark_qubit_wise_multiply_extended[ num_qubits: Int, number_layers: Int ](mut b: Bencher) raises: + gates_list: List[Gate] = [Hadamard, PauliX, PauliY, PauliZ] + + indexes: UnsafePointer[Int8] = UnsafePointer[Int8].alloc( + number_layers * 2 * num_qubits + ) + random.seed() # Seed on current time + random.randint( + indexes, number_layers * 2 * num_qubits, 0, len(gates_list) - 1 + ) + @parameter @always_inline - fn elementwise_workflow(ctx: DeviceContext) raises: - simulate_random_circuit[num_qubits, number_layers]() + fn qubit_wise_multiply_extended_workflow(ctx: DeviceContext) raises: + """Simulates a random quantum circuit with the specified number of qubits and layers. + """ + + # Initialize the quantum circuit to the |0⟩ state + quantum_state: StateVector = StateVector.from_bitstring( + "0" * num_qubits + ) + + for layer in range(number_layers): + for i in range(num_qubits): + quantum_state = qubit_wise_multiply_extended( + 1, + gates_list[Int(indexes[layer * num_qubits + i])].matrix, + [i], + quantum_state, + ) + for i in range(num_qubits - 1): + quantum_state = qubit_wise_multiply_extended( + 1, + gates_list[ + Int(indexes[layer * num_qubits + num_qubits + i]) + ].matrix, + [i], + quantum_state, + [[(i + 1) % num_qubits, 1]], + ) bench_ctx = DeviceContext() - b.iter_custom[elementwise_workflow](bench_ctx) + b.iter_custom[qubit_wise_multiply_extended_workflow](bench_ctx) + + +# def run_benchmark[ +# max_number_qubits: Int = 10, +# max_number_layers: Int = 20, +# fixed_number_qubits: Int = 5, +# fixed_number_layers: Int = 10, +# # TODO how to do this without errors? 
+# benchmark_function: fn[Int, Int] ( +# mut b: Bencher +# ) raises capturing -> None = benchmark_qubit_wise_multiply_extended, +# ](): +# print("Running aaa() Benchmarks...") +# print("-" * 80) +# bench_config = BenchConfig(max_iters=10, min_warmuptime_secs=0.2) +# bench = Bench(bench_config) + +# @parameter +# for number_qubits in range(1, max_number_qubits + 1): +# bench.bench_function[ +# benchmark_function[number_qubits, fixed_number_layers] +# ]( +# BenchId( +# "aaa_" +# + String(number_qubits) +# + "q_" +# + String(fixed_number_layers) +# + "l" +# ) +# ) + +# @parameter +# for number_layers in range(1, max_number_layers + 1): +# bench.bench_function[ +# benchmark_function[fixed_number_qubits, number_layers] +# ]( +# BenchId( +# "aaa_" +# + String(fixed_number_qubits) +# + "q_" +# + String(number_layers) +# + "l" +# ) +# ) + +# print(bench) + +# # bench.config.out_file = Path("out.csv") +# # bench.dump_report() + +# print("aaa() Benchmarks completed!") +# print("-" * 80) + + +def bench_qubit_wise_multiply[ + max_number_qubits: Int = 10, + max_number_layers: Int = 20, + fixed_number_qubits: Int = 5, + fixed_number_layers: Int = 10, +](): + # run_benchmark[ + # max_number_qubits, + # max_number_layers, + # fixed_number_qubits, + # fixed_number_layers, + # benchmark_function=benchmark_qubit_wise_multiply, + # ]() + print("Running qubit_wise_multiply() Benchmarks...") + print("-" * 80) + bench_config = BenchConfig(max_iters=10, min_warmuptime_secs=0.2) + bench = Bench(bench_config) + + @parameter + for number_qubits in range(1, max_number_qubits + 1): + bench.bench_function[ + benchmark_qubit_wise_multiply[number_qubits, fixed_number_layers] + ]( + BenchId( + "qubit_wise_multiply_" + + String(number_qubits) + + "q_" + + String(fixed_number_layers) + + "l" + ) + ) + + @parameter + for number_layers in range(1, max_number_layers + 1): + bench.bench_function[ + benchmark_qubit_wise_multiply[fixed_number_qubits, number_layers] + ]( + BenchId( + "qubit_wise_multiply_" 
+ + String(fixed_number_qubits) + + "q_" + + String(number_layers) + + "l" + ) + ) + + print(bench) + + # bench.config.out_file = Path("out.csv") + # bench.dump_report() + + print("qubit_wise_multiply() Benchmarks completed!") + print("-" * 80) + + +def bench_qubit_wise_multiply_inplace[ + # max_number_qubits: Int = 10, + # max_number_layers: Int = 20, + # fixed_number_qubits: Int = 5, + # fixed_number_layers: Int = 10, + max_number_qubits: Int = 16, + max_number_layers: Int = 2000, + fixed_number_qubits: Int = 5, + fixed_number_layers: Int = 200, +](): + print("Running qubit_wise_multiply_inplace() Benchmarks...") + print("-" * 80) + bench_config = BenchConfig(max_iters=10, min_warmuptime_secs=0.2) + bench = Bench(bench_config) + + @parameter + for number_qubits in range(1, max_number_qubits + 1, 5): + bench.bench_function[ + benchmark_qubit_wise_multiply_inplace[ + number_qubits, fixed_number_layers + ] + ]( + BenchId( + "qubit_wise_multiply_inplace_" + + String(number_qubits) + + "q_" + + String(fixed_number_layers) + + "l" + ) + ) + + @parameter + for number_layers in range(1, max_number_layers + 1, 200): + bench.bench_function[ + benchmark_qubit_wise_multiply_inplace[ + fixed_number_qubits, number_layers + ] + ]( + BenchId( + "qubit_wise_multiply_inplace_" + + String(fixed_number_qubits) + + "q_" + + String(number_layers) + + "l" + ) + ) + + print(bench) + + # bench.config.out_file = Path("out.csv") + # bench.dump_report() + + print("qubit_wise_multiply_inplace() Benchmarks completed!") + print("-" * 80) + + +def bench_qubit_wise_multiply_inplace_gpu[ + max_number_qubits: Int = 25, + max_number_layers: Int = 2000, + fixed_number_qubits: Int = 5, + fixed_number_layers: Int = 2, +](): + print("Running qubit_wise_multiply_inplace() Benchmarks...") + print("-" * 80) + bench_config = BenchConfig(max_iters=10, min_warmuptime_secs=0.2) + bench = Bench(bench_config) + + @parameter + for number_qubits in range(15, max_number_qubits + 1, 1): + bench.bench_function[ + 
benchmark_qubit_wise_multiply_inplace_gpu[ + number_qubits, fixed_number_layers + ] + ]( + BenchId( + "qubit_wise_multiply_inplace_" + + String(number_qubits) + + "q_" + + String(fixed_number_layers) + + "l" + ) + ) + + @parameter + for number_layers in range(1, max_number_layers + 1, 200): + bench.bench_function[ + benchmark_qubit_wise_multiply_inplace_gpu[ + fixed_number_qubits, number_layers + ] + ]( + BenchId( + "qubit_wise_multiply_inplace_" + + String(fixed_number_qubits) + + "q_" + + String(number_layers) + + "l" + ) + ) + + print(bench) + + # bench.config.out_file = Path("out.csv") + # bench.dump_report() + + print("qubit_wise_multiply_inplace_gpu() Benchmarks completed!") + print("-" * 80) -def bench_main[ +def bench_qubit_wise_multiply_extended[ max_number_qubits: Int = 10, max_number_layers: Int = 20, fixed_number_qubits: Int = 5, fixed_number_layers: Int = 10, ](): - print("Running qubit_wise_multiply() CPU Benchmarks...") - # print("SIMD width:", SIMD_WIDTH) + print("Running qubit_wise_multiply() Benchmarks...") print("-" * 80) bench_config = BenchConfig(max_iters=10, min_warmuptime_secs=0.2) bench = Bench(bench_config) @@ -140,12 +830,12 @@ def bench_main[ @parameter for number_qubits in range(1, max_number_qubits + 1): bench.bench_function[ - benchmark_elementwise_parameterized[ + benchmark_qubit_wise_multiply_extended[ number_qubits, fixed_number_layers ] ]( BenchId( - "simulate_random_circuit_" + "qubit_wise_multiply_extended_" + String(number_qubits) + "q_" + String(fixed_number_layers) @@ -156,12 +846,12 @@ def bench_main[ @parameter for number_layers in range(1, max_number_layers + 1): bench.bench_function[ - benchmark_elementwise_parameterized[ + benchmark_qubit_wise_multiply_extended[ fixed_number_qubits, number_layers ] ]( BenchId( - "simulate_random_circuit_" + "qubit_wise_multiply_extended_" + String(fixed_number_qubits) + "q_" + String(number_layers) @@ -171,8 +861,8 @@ def bench_main[ print(bench) - bench.config.out_file = Path("out.csv") 
- bench.dump_report() + # bench.config.out_file = Path("out.csv") + # bench.dump_report() - print("qubit_wise_multiply() CPU Benchmarks completed!") + print("qubit_wise_multiply_extended() Benchmarks completed!") print("-" * 80) diff --git a/benchmarks/bench_simulate_random_circuit.mojo b/benchmarks/bench_simulate_random_circuit.mojo new file mode 100644 index 0000000..120f04c --- /dev/null +++ b/benchmarks/bench_simulate_random_circuit.mojo @@ -0,0 +1,178 @@ +from gpu.host import DeviceContext + +from benchmark import ( + Bench, + BenchConfig, + Bencher, + BenchId, +) + +from pathlib import Path + +import random + +from qlabs.base import ( + StateVector, + ComplexMatrix, + Gate, + Hadamard, + PauliX, + PauliY, + PauliZ, + NOT, + H, + X, + Y, + Z, + SWAP, + iSWAP, + qubit_wise_multiply, + qubit_wise_multiply_extended, + apply_swap, + partial_trace, +) + +from qlabs.abstractions import ( + GateCircuit, + StateVectorSimulator, + ShowAfterEachGate, + ShowAfterEachLayer, + ShowOnlyEnd, +) + + +fn simulate_random_circuit[num_qubits: Int, number_layers: Int]() -> None: + """Simulates a random quantum circuit with the specified number of qubits and layers. + + Parameters: + num_qubits: The number of qubits in the circuit. + number_layers: The number of layers in the circuit. 
+ """ + + qc: GateCircuit = GateCircuit(num_qubits) + + gates_list: List[Gate] = [Hadamard, PauliX, PauliY, PauliZ] + + # index: UnsafePointer[Int8] = UnsafePointer[Int8].alloc(2*num_qubits) + # print("Creating random circuit...") + # random.seed() # Seed on current time + # for _ in range(400): + # random.randint(index, 2*num_qubits, 0, len(gates_list) - 1) + # for i in range(num_qubits): + # qc = qc.apply(gates_list[Int(index[i])], i) + # qc = qc.barrier() + # for i in range(num_qubits - 1): + # qc = qc.apply( + # gates_list[Int(index[num_qubits + i])], + # i, + # controls=[(i + 1) % num_qubits], + # is_anti_control=[False], + # ) + # qc = qc.barrier() + + index: UnsafePointer[Int8] = UnsafePointer[Int8].alloc( + number_layers * 2 * num_qubits + ) + random.seed() # Seed on current time + random.randint( + index, number_layers * 2 * num_qubits, 0, len(gates_list) - 1 + ) + + for iter in range(number_layers): + for i in range(num_qubits): + qc.apply(gates_list[Int(index[iter * num_qubits + i])](i)) + qc.barrier() + for i in range(num_qubits - 1): + qc.apply( + gates_list[Int(index[iter * num_qubits + num_qubits + i])]( + i, controls=[(i + 1) % num_qubits] + ), + ) + qc.barrier() + + initial_state_bitstring: String = ( + "0" * num_qubits + ) # Initial state |000...0⟩ + initial_state: StateVector = StateVector.from_bitstring( + initial_state_bitstring + ) + + qsimu = StateVectorSimulator( + qc, + initial_state=initial_state, + optimisation_level=0, # No optimisations for now + verbose=False, + # verbose_step_size=ShowAfterEachLayer, # ShowAfterEachGate, ShowOnlyEnd + verbose_step_size=ShowAfterEachGate, # ShowAfterEachGate, ShowOnlyEnd + # stop_at=ShowAfterEachGate, # ShowAfterEachGate, ShowOnlyEnd # TODO implement that instead of having access to manual methods + ) + + for _ in range(100): + _ = qsimu.run() + + +@parameter +@always_inline +fn benchmark_simulate_random_circuit[ + num_qubits: Int, number_layers: Int +](mut b: Bencher) raises: + @parameter + 
@always_inline + fn simulate_random_circuit_workflow(ctx: DeviceContext) raises: + simulate_random_circuit[num_qubits, number_layers]() + + bench_ctx = DeviceContext() + b.iter_custom[simulate_random_circuit_workflow](bench_ctx) + + +def bench_simulate_random_circuit[ + max_number_qubits: Int = 10, + max_number_layers: Int = 20, + fixed_number_qubits: Int = 5, + fixed_number_layers: Int = 10, +](): + print("Running qubit_wise_multiply() CPU Benchmarks...") + # print("SIMD width:", SIMD_WIDTH) + print("-" * 80) + bench_config = BenchConfig(max_iters=10, min_warmuptime_secs=0.2) + bench = Bench(bench_config) + + @parameter + for number_qubits in range(1, max_number_qubits + 1): + bench.bench_function[ + benchmark_simulate_random_circuit[ + number_qubits, fixed_number_layers + ] + ]( + BenchId( + "simulate_random_circuit_" + + String(number_qubits) + + "q_" + + String(fixed_number_layers) + + "l" + ) + ) + + @parameter + for number_layers in range(1, max_number_layers + 1): + bench.bench_function[ + benchmark_simulate_random_circuit[ + fixed_number_qubits, number_layers + ] + ]( + BenchId( + "simulate_random_circuit_" + + String(fixed_number_qubits) + + "q_" + + String(number_layers) + + "l" + ) + ) + + print(bench) + + # bench.config.out_file = Path("out.csv") + # bench.dump_report() + + print("simulate_random_circuit() CPU Benchmarks completed!") + print("-" * 80) diff --git a/examples/gpu_examples.mojo b/examples/gpu_examples.mojo index 988874b..5a64153 100644 --- a/examples/gpu_examples.mojo +++ b/examples/gpu_examples.mojo @@ -21,7 +21,7 @@ from qlabs.base import ( iSWAP, ) -from qlabs.base.gpu import qubit_wise_multiply_gpu, qubit_wise_multiply_gpu_2 +from qlabs.base.gpu import qubit_wise_multiply_inplace_gpu alias BLOCKS_PER_GRID = 1 alias THREADS_PER_BLOCK = (1, 1) @@ -30,20 +30,17 @@ alias dtype = DType.float32 alias GATE_SIZE = 2 alias STATE_VECTOR_SIZE = 8 alias NUMBER_CONTROL_BITS = 1 -# crashes the code on GitHub (tag Austin Doolittle) -# another crash 
due to IntTuple -# alias control_bits_list: List[List[List[Int]]] = [ -# [[1, 1]], # Control on qubit 1 and is control because flag=1 -# [[1, 1]], # Control on qubit 1 and is control because flag=1 -# ] +# TODO have NUMBER_CONTROL_BITS be a list defining each gates specific control bits count alias CIRCUIT_NUMBER_CONTROL_GATES = 2 - +alias circuit_control_bits_layout = Layout.row_major( + CIRCUIT_NUMBER_CONTROL_GATES, NUMBER_CONTROL_BITS, 2 +) alias gate_1qubit_layout = Layout.row_major(GATE_SIZE, GATE_SIZE) -alias state_vector_3qubits_layout = Layout.row_major(STATE_VECTOR_SIZE, 1) +alias state_vector_3qubits_layout = Layout.row_major(STATE_VECTOR_SIZE) alias control_bits_layout = Layout.row_major(NUMBER_CONTROL_BITS, 2) -alias gate_set = [Hadamard, PauliX, PauliZ] +alias gate_set: List[Gate] = [Hadamard, PauliX, PauliZ] alias gate_set_dic: Dict[String, Int] = { Hadamard.symbol: 0, PauliX.symbol: 1, @@ -53,115 +50,641 @@ alias GATE_SET_SIZE = 3 alias gate_set_1qubit_layout = Layout.row_major( GATE_SET_SIZE, GATE_SIZE, GATE_SIZE ) +alias gate_set_1qubit_vectorized_layout = Layout.row_major( + GATE_SET_SIZE, GATE_SIZE, GATE_SIZE, 2 +) -def gpu_debug_something(): +def simulate_figure1_circuit_gpu(): + """Simulates the circuit from Figure 1 in the paper.""" + @parameter if not has_accelerator(): print("No compatible GPU found") else: + print("Simulating Figure 1 circuit.\nCircuit design:") + print( + """ + |0> -------|X|--|Z|-- + | + |0> --|H|---*----*--- + | + |0> --|X|-------|X|-- + """ + ) + var control_bits_list: List[List[List[Int]]] = [ + [[1, 1]], # Control on qubit 1 and is control because flag=1 + [[1, 1]], # Control on qubit 1 and is control because flag=1 + ] + ctx = DeviceContext() - print("Found GPU:", ctx.name()) - - gate_re = ctx.enqueue_create_buffer[dtype]( - GATE_SIZE * GATE_SIZE - ).enqueue_fill(0) - gate_im = ctx.enqueue_create_buffer[dtype]( - GATE_SIZE * GATE_SIZE - ).enqueue_fill(0) - quantum_state_re = ctx.enqueue_create_buffer[dtype]( + 
print("Using GPU:", ctx.name()) + + # -- Create GPU variables -- # + # These don't need to be initialized to zero, they will be filled later + + host_quantum_state_re = ctx.enqueue_create_host_buffer[dtype]( STATE_VECTOR_SIZE - ).enqueue_fill(0) - quantum_state_im = ctx.enqueue_create_buffer[dtype]( + ) + host_quantum_state_im = ctx.enqueue_create_host_buffer[dtype]( STATE_VECTOR_SIZE - ).enqueue_fill(0) + ) + + host_gate_set_re = ctx.enqueue_create_host_buffer[dtype]( + GATE_SET_SIZE * GATE_SIZE * GATE_SIZE + ) + host_gate_set_im = ctx.enqueue_create_host_buffer[dtype]( + GATE_SET_SIZE * GATE_SIZE * GATE_SIZE + ) + + host_control_bits_circuit = ctx.enqueue_create_host_buffer[DType.int32]( + CIRCUIT_NUMBER_CONTROL_GATES * NUMBER_CONTROL_BITS * 2 + ) + + # -- Initialize the quantum circuit to the |000⟩ state -- # + quantum_state: StateVector = StateVector.from_bitstring("000") + print("Initial quantum state:\n", quantum_state) + + # Wait for host buffers to be ready + ctx.synchronize() + + # -- Fill host buffers -- # + + for i in range(STATE_VECTOR_SIZE): + host_quantum_state_re[i] = quantum_state[i].re + host_quantum_state_im[i] = quantum_state[i].im + + print("Initial state real part:", host_quantum_state_re) + print("Initial state imaginary part:", host_quantum_state_im) + + for i in range(GATE_SET_SIZE): + gate = gate_set[i] + for j in range(GATE_SIZE): + for k in range(GATE_SIZE): + index = gate_set_1qubit_layout( + IntTuple(i, j, k) + ) # Get the index in the 1D buffer + host_gate_set_re[index] = gate[j, k].re + host_gate_set_im[index] = gate[j, k].im + + for i in range(CIRCUIT_NUMBER_CONTROL_GATES): + for j in range(NUMBER_CONTROL_BITS): + for k in range(2): + index = circuit_control_bits_layout(IntTuple(i, j, k)) + host_control_bits_circuit[index] = control_bits_list[i][j][ + k + ] + + # -- Copy host buffers to device buffers -- # + quantum_state_re = ctx.enqueue_create_buffer[dtype](STATE_VECTOR_SIZE) + quantum_state_im = 
ctx.enqueue_create_buffer[dtype](STATE_VECTOR_SIZE) + + gate_set_re = ctx.enqueue_create_buffer[dtype]( + GATE_SET_SIZE * GATE_SIZE * GATE_SIZE + ) + gate_set_im = ctx.enqueue_create_buffer[dtype]( + GATE_SET_SIZE * GATE_SIZE * GATE_SIZE + ) + + control_bits_circuit = ctx.enqueue_create_buffer[DType.int32]( + CIRCUIT_NUMBER_CONTROL_GATES * NUMBER_CONTROL_BITS * 2 + ) + current_control_gate_circuit = ctx.enqueue_create_buffer[DType.int32](1) + + # Create other buffers for functions + quantum_state_out_re = ctx.enqueue_create_buffer[dtype]( STATE_VECTOR_SIZE - ).enqueue_fill(0) + ) quantum_state_out_im = ctx.enqueue_create_buffer[dtype]( STATE_VECTOR_SIZE - ).enqueue_fill(0) + ) - control_bits = ctx.enqueue_create_buffer[DType.int32]( - NUMBER_CONTROL_BITS * 2 - ).enqueue_fill(0) + quantum_state_re.enqueue_copy_from(host_quantum_state_re) + quantum_state_im.enqueue_copy_from(host_quantum_state_im) + + gate_set_re.enqueue_copy_from(host_gate_set_re) + gate_set_im.enqueue_copy_from(host_gate_set_im) + + control_bits_circuit.enqueue_copy_from(host_control_bits_circuit) + + ctx.enqueue_memset(current_control_gate_circuit, 0) + ctx.enqueue_memset(quantum_state_out_re, 0.0) + ctx.enqueue_memset(quantum_state_out_im, 0.0) + + # -- Create layout tensors for GPU operations -- # + gate_set_re_tensor = LayoutTensor[ + mut=False, dtype, gate_set_1qubit_layout + ](gate_set_re.unsafe_ptr()) + gate_set_im_tensor = LayoutTensor[ + mut=False, dtype, gate_set_1qubit_layout + ](gate_set_im.unsafe_ptr()) - gate_re_tensor = LayoutTensor[mut=True, dtype, gate_1qubit_layout]( - gate_re.unsafe_ptr() - ) - gate_im_tensor = LayoutTensor[mut=True, dtype, gate_1qubit_layout]( - gate_im.unsafe_ptr() - ) quantum_state_re_tensor = LayoutTensor[ mut=True, dtype, state_vector_3qubits_layout ](quantum_state_re.unsafe_ptr()) quantum_state_im_tensor = LayoutTensor[ mut=True, dtype, state_vector_3qubits_layout ](quantum_state_im.unsafe_ptr()) + quantum_state_out_re_tensor = LayoutTensor[ mut=True, 
dtype, state_vector_3qubits_layout ](quantum_state_out_re.unsafe_ptr()) quantum_state_out_im_tensor = LayoutTensor[ mut=True, dtype, state_vector_3qubits_layout ](quantum_state_out_im.unsafe_ptr()) - control_bits_tensor = LayoutTensor[ - mut=True, DType.int32, control_bits_layout - ](control_bits.unsafe_ptr()) - print("Before") + control_bits_circuit_tensor = LayoutTensor[ + mut=False, DType.int32, circuit_control_bits_layout + ](control_bits_circuit.unsafe_ptr()) + current_control_gate_circuit_tensor = LayoutTensor[ + mut=True, DType.int32, Layout.row_major(1) + ](current_control_gate_circuit.unsafe_ptr()) + + # -- Apply circuit operations -- # + + # Gate 0 + # quantum_state = qubit_wise_multiply_gpu( + # Hadamard.matrix, 1, quantum_state + # ) + ctx.enqueue_function[ + qubit_wise_multiply_inplace_gpu[number_control_bits=0] + ]( + gate_set_re_tensor, + gate_set_im_tensor, + gate_set_dic[Hadamard.symbol], + GATE_SIZE, + 1, # target_qubit + quantum_state_re_tensor, + quantum_state_im_tensor, + 3, # number_qubits + STATE_VECTOR_SIZE, # quantum_state_size + quantum_state_out_re_tensor, + quantum_state_out_im_tensor, + control_bits_circuit_tensor, + current_control_gate_circuit_tensor, + grid_dim=BLOCKS_PER_GRID, + block_dim=THREADS_PER_BLOCK, + ) + + # # It works + # with quantum_state_out_re.map_to_host() as host_re, quantum_state_out_im.map_to_host() as host_im: + # print( + # "After Hadamard gate on qubit 1\nreal part:\n", + # host_re, + # "\nimaginary part:\n", + # host_im, + # ) + + # Gate 1 (reverse the states input <-> output) + # quantum_state = qubit_wise_multiply(PauliX.matrix, 2, quantum_state) + ctx.enqueue_function[ + qubit_wise_multiply_inplace_gpu[number_control_bits=0] + ]( + gate_set_re_tensor, + gate_set_im_tensor, + gate_set_dic[PauliX.symbol], + GATE_SIZE, + 2, # target_qubit + quantum_state_out_re_tensor, + quantum_state_out_im_tensor, + 3, # number_qubits + STATE_VECTOR_SIZE, # quantum_state_size + quantum_state_re_tensor, + 
quantum_state_im_tensor, + control_bits_circuit_tensor, + current_control_gate_circuit_tensor, + grid_dim=BLOCKS_PER_GRID, + block_dim=THREADS_PER_BLOCK, + ) + + # with quantum_state_re.map_to_host() as host_re, quantum_state_im.map_to_host() as host_im: + # print( + # "After Pauli-X gate on qubit 2:", + # "\nreal part:\n", + # host_re, + # "\nimaginary part:\n", + # host_im, + # ) + + # # Gate 2 + # quantum_state = qubit_wise_multiply( + # PauliX.matrix, 0, quantum_state, [[1, 1]] + # ) + ctx.enqueue_function[ + qubit_wise_multiply_inplace_gpu[number_control_bits=1] + ]( + gate_set_re_tensor, + gate_set_im_tensor, + gate_set_dic[PauliX.symbol], + GATE_SIZE, + 0, # target_qubit + quantum_state_re_tensor, + quantum_state_im_tensor, + 3, # number_qubits + STATE_VECTOR_SIZE, # quantum_state_size + quantum_state_out_re_tensor, + quantum_state_out_im_tensor, + control_bits_circuit_tensor, + current_control_gate_circuit_tensor, + grid_dim=BLOCKS_PER_GRID, + block_dim=THREADS_PER_BLOCK, + ) + + # with quantum_state_out_re.map_to_host() as host_re, quantum_state_out_im.map_to_host() as host_im: + # print( + # "After Pauli-X gate on qubit 0 with control on qubit 1:", + # "\nreal part:\n", + # host_re, + # "\nimaginary part:\n", + # host_im, + # ) - ctx.enqueue_function[qubit_wise_multiply_gpu]( - gate_re_tensor, - gate_im_tensor, + # Gate 3 + # quantum_state = qubit_wise_multiply(PauliZ.matrix, 0, quantum_state) + ctx.enqueue_function[ + qubit_wise_multiply_inplace_gpu[number_control_bits=0] + ]( + gate_set_re_tensor, + gate_set_im_tensor, + gate_set_dic[PauliZ.symbol], GATE_SIZE, 0, # target_qubit + quantum_state_out_re_tensor, + quantum_state_out_im_tensor, + 3, # number_qubits + STATE_VECTOR_SIZE, # quantum_state_size + quantum_state_re_tensor, + quantum_state_im_tensor, + control_bits_circuit_tensor, + current_control_gate_circuit_tensor, + grid_dim=BLOCKS_PER_GRID, + block_dim=THREADS_PER_BLOCK, + ) + + # with quantum_state_re.map_to_host() as host_re, 
quantum_state_im.map_to_host() as host_im: + # print( + # "After Pauli-Z gate on qubit 0:\nreal part:\n", + # host_re, + # "\nimaginary part:\n", + # host_im, + # ) + + # Gate 4 + # quantum_state = qubit_wise_multiply( + # PauliX.matrix, 2, quantum_state, [[1, 1]] + # ) + ctx.enqueue_function[ + qubit_wise_multiply_inplace_gpu[number_control_bits=1] + ]( + gate_set_re_tensor, + gate_set_im_tensor, + gate_set_dic[PauliX.symbol], + GATE_SIZE, + 2, # target_qubit quantum_state_re_tensor, quantum_state_im_tensor, 3, # number_qubits STATE_VECTOR_SIZE, # quantum_state_size quantum_state_out_re_tensor, quantum_state_out_im_tensor, - control_bits_tensor, - NUMBER_CONTROL_BITS, + control_bits_circuit_tensor, + current_control_gate_circuit_tensor, grid_dim=BLOCKS_PER_GRID, block_dim=THREADS_PER_BLOCK, ) - print("After") + with quantum_state_out_re.map_to_host() as host_re, quantum_state_out_im.map_to_host() as host_im: + print( + ( + "After Pauli-X gate on qubit 2 with control on qubit 1" + " (Final State):\nreal part:\n" + ), + host_re, + "\nimaginary part:\n", + host_im, + ) - ctx.synchronize() - with quantum_state_out_re.map_to_host() as host_re: - print("Output real part:", host_re) - with quantum_state_out_im.map_to_host() as host_im: - print("Output imaginary part:", host_im) - with gate_re.map_to_host() as host_gate_re: - print("Gate real part:", host_gate_re) - with gate_im.map_to_host() as host_gate_im: - print("Gate imaginary part:", host_gate_im) - with quantum_state_re.map_to_host() as host_quantum_state_re: - print("Quantum state real part:", host_quantum_state_re) - with quantum_state_im.map_to_host() as host_quantum_state_im: - print("Quantum state imaginary part:", host_quantum_state_im) - - -def run_gpu_not_abstract(): - """Simulates the circuit from Figure 1 in the paper.""" +# def run_gpu_not_abstract_3(): +# """Simulates the circuit from Figure 1 in the paper.""" + +# @parameter +# if not has_accelerator(): +# print("No compatible GPU found") +# else: +# 
print("Simulating Figure 1 circuit.\nCircuit design:") +# print( +# """ +# |0> -------|X|--|Z|-- +# | +# |0> --|H|---*----*--- +# | +# |0> --|X|-------|X|-- +# """ +# ) +# var control_bits_list: List[List[List[Int]]] = [ +# [[1, 1]], # Control on qubit 1 and is control because flag=1 +# [[1, 1]], # Control on qubit 1 and is control because flag=1 +# ] + +# ctx = DeviceContext() +# print("Using GPU:", ctx.name()) + +# # -- Create GPU variables -- # +# ctx = DeviceContext() + +# # -- Initialize the quantum circuit to the |000⟩ state -- # +# quantum_state: StateVector = StateVector.from_bitstring("000") +# print("Initial quantum state:\n", quantum_state) + +# host_quantum_state_re = ctx.enqueue_create_host_buffer[dtype]( +# STATE_VECTOR_SIZE +# ).enqueue_fill(0) +# host_quantum_state_im = ctx.enqueue_create_host_buffer[dtype]( +# STATE_VECTOR_SIZE +# ).enqueue_fill(0) + +# host_gate_set = ctx.enqueue_create_host_buffer[dtype]( +# GATE_SET_SIZE * GATE_SIZE * GATE_SIZE * 2 +# ).enqueue_fill(0) + +# host_control_bits_circuit = ctx.enqueue_create_host_buffer[DType.int32]( +# CIRCUIT_NUMBER_CONTROL_GATES * NUMBER_CONTROL_BITS * 2 +# ).enqueue_fill(0) + +# # Wait for host buffers to be ready +# ctx.synchronize() + +# # -- Fill host buffers -- # + +# for i in range(STATE_VECTOR_SIZE): +# host_quantum_state_re[i] = quantum_state[i].re +# host_quantum_state_im[i] = quantum_state[i].im + +# print("Initial state real part:", host_quantum_state_re) +# print("Initial state imaginary part:", host_quantum_state_im) + +# for i in range(GATE_SET_SIZE): +# gate = gate_set[i] +# for j in range(GATE_SIZE): +# for k in range(GATE_SIZE): +# index = gate_set_1qubit_layout( +# IntTuple(i, j, k) +# ) # Get the index in the 1D buffer +# host_gate_set[index][0] = gate[j, k].re +# host_gate_set[index][1] = gate[j, k].im + +# for i in range(CIRCUIT_NUMBER_CONTROL_GATES): +# for j in range(NUMBER_CONTROL_BITS): +# for k in range(2): +# index = circuit_control_bits_layout(IntTuple(i, j, k)) +# 
host_control_bits_circuit[index] = control_bits_list[i][j][ +# k +# ] + +# # -- Copy host buffers to device buffers -- # +# quantum_state_re = ctx.enqueue_create_buffer[dtype]( +# STATE_VECTOR_SIZE +# ).enqueue_fill(0) +# quantum_state_im = ctx.enqueue_create_buffer[dtype]( +# STATE_VECTOR_SIZE +# ).enqueue_fill(0) + +# gate_set = ctx.enqueue_create_buffer[dtype]( +# GATE_SET_SIZE * GATE_SIZE * GATE_SIZE * 2 +# ).enqueue_fill(0) + +# control_bits_circuit = ctx.enqueue_create_buffer[DType.int32]( +# CIRCUIT_NUMBER_CONTROL_GATES * NUMBER_CONTROL_BITS * 2 +# ).enqueue_fill(0) +# current_control_gate_circuit = ctx.enqueue_create_buffer[DType.int32]( +# 1 +# ).enqueue_fill(0) + +# # Create other buffers for functions + +# quantum_state_out_re = ctx.enqueue_create_buffer[dtype]( +# STATE_VECTOR_SIZE +# ).enqueue_fill(0) +# quantum_state_out_im = ctx.enqueue_create_buffer[dtype]( +# STATE_VECTOR_SIZE +# ).enqueue_fill(0) + +# quantum_state_re.enqueue_copy_from(host_quantum_state_re) +# quantum_state_im.enqueue_copy_from(host_quantum_state_im) + +# gate_set.enqueue_copy_from(host_gate_set) + +# control_bits_circuit.enqueue_copy_from(host_control_bits_circuit) + +# # -- Create layout tensors for GPU operations -- # +# gate_set_tensor = LayoutTensor[ +# mut=False, dtype, gate_set_1qubit_vectorized_layout +# ](gate_set.unsafe_ptr()) + +# quantum_state_re_tensor = LayoutTensor[ +# mut=True, dtype, state_vector_3qubits_layout +# ](quantum_state_re.unsafe_ptr()) +# quantum_state_im_tensor = LayoutTensor[ +# mut=True, dtype, state_vector_3qubits_layout +# ](quantum_state_im.unsafe_ptr()) + +# quantum_state_out_re_tensor = LayoutTensor[ +# mut=True, dtype, state_vector_3qubits_layout +# ](quantum_state_out_re.unsafe_ptr()) +# quantum_state_out_im_tensor = LayoutTensor[ +# mut=True, dtype, state_vector_3qubits_layout +# ](quantum_state_out_im.unsafe_ptr()) + +# control_bits_circuit_tensor = LayoutTensor[ +# mut=False, DType.int32, circuit_control_bits_layout +# 
](control_bits_circuit.unsafe_ptr()) +# current_control_gate_circuit_tensor = LayoutTensor[ +# mut=True, DType.int32, Layout.row_major(1) +# ](current_control_gate_circuit.unsafe_ptr()) + +# # -- Apply circuit operations -- # + +# # Gate 0 +# # quantum_state = qubit_wise_multiply_gpu( +# # Hadamard.matrix, 1, quantum_state +# # ) +# ctx.enqueue_function[qubit_wise_multiply_inplace_gpu[number_control_bits=0]]( +# gate_set_tensor, +# gate_set_dic[Hadamard.symbol], +# GATE_SIZE, +# 1, # target_qubit +# quantum_state_re_tensor, +# quantum_state_im_tensor, +# 3, # number_qubits +# STATE_VECTOR_SIZE, # quantum_state_size +# quantum_state_out_re_tensor, +# quantum_state_out_im_tensor, +# control_bits_circuit_tensor, +# current_control_gate_circuit_tensor, +# grid_dim=BLOCKS_PER_GRID, +# block_dim=THREADS_PER_BLOCK, +# ) + +# # # It works +# # with quantum_state_out_re.map_to_host() as host_re, quantum_state_out_im.map_to_host() as host_im: +# # print( +# # "After Hadamard gate on qubit 1\nreal part:\n", +# # host_re, +# # "\nimaginary part:\n", +# # host_im, +# # ) + +# # Gate 1 (reverse the states input <-> output) +# # quantum_state = qubit_wise_multiply(PauliX.matrix, 2, quantum_state) +# ctx.enqueue_function[qubit_wise_multiply_inplace_gpu[number_control_bits=0]]( +# gate_set_tensor, +# gate_set_dic[PauliX.symbol], +# GATE_SIZE, +# 2, # target_qubit +# quantum_state_out_re_tensor, +# quantum_state_out_im_tensor, +# 3, # number_qubits +# STATE_VECTOR_SIZE, # quantum_state_size +# quantum_state_re_tensor, +# quantum_state_im_tensor, +# control_bits_circuit_tensor, +# current_control_gate_circuit_tensor, +# grid_dim=BLOCKS_PER_GRID, +# block_dim=THREADS_PER_BLOCK, +# ) + +# # with quantum_state_re.map_to_host() as host_re, quantum_state_im.map_to_host() as host_im: +# # print( +# # "After Pauli-X gate on qubit 2:", +# # "\nreal part:\n", +# # host_re, +# # "\nimaginary part:\n", +# # host_im, +# # ) + +# # # Gate 2 +# # quantum_state = qubit_wise_multiply( +# # 
PauliX.matrix, 0, quantum_state, [[1, 1]] +# # ) +# ctx.enqueue_function[qubit_wise_multiply_inplace_gpu[number_control_bits=1]]( +# gate_set_tensor, +# gate_set_dic[PauliX.symbol], +# GATE_SIZE, +# 0, # target_qubit +# quantum_state_re_tensor, +# quantum_state_im_tensor, +# 3, # number_qubits +# STATE_VECTOR_SIZE, # quantum_state_size +# quantum_state_out_re_tensor, +# quantum_state_out_im_tensor, +# control_bits_circuit_tensor, +# current_control_gate_circuit_tensor, +# grid_dim=BLOCKS_PER_GRID, +# block_dim=THREADS_PER_BLOCK, +# ) + +# # with quantum_state_out_re.map_to_host() as host_re, quantum_state_out_im.map_to_host() as host_im: +# # print( +# # "After Pauli-X gate on qubit 0 with control on qubit 1:", +# # "\nreal part:\n", +# # host_re, +# # "\nimaginary part:\n", +# # host_im, +# # ) + +# # Gate 3 +# # quantum_state = qubit_wise_multiply(PauliZ.matrix, 0, quantum_state) +# ctx.enqueue_function[qubit_wise_multiply_inplace_gpu[number_control_bits=0]]( +# gate_set_tensor, +# gate_set_dic[PauliZ.symbol], +# GATE_SIZE, +# 0, # target_qubit +# quantum_state_out_re_tensor, +# quantum_state_out_im_tensor, +# 3, # number_qubits +# STATE_VECTOR_SIZE, # quantum_state_size +# quantum_state_re_tensor, +# quantum_state_im_tensor, +# control_bits_circuit_tensor, +# current_control_gate_circuit_tensor, +# grid_dim=BLOCKS_PER_GRID, +# block_dim=THREADS_PER_BLOCK, +# ) + +# # with quantum_state_re.map_to_host() as host_re, quantum_state_im.map_to_host() as host_im: +# # print( +# # "After Pauli-Z gate on qubit 0:\nreal part:\n", +# # host_re, +# # "\nimaginary part:\n", +# # host_im, +# # ) + +# # Gate 4 +# # quantum_state = qubit_wise_multiply( +# # PauliX.matrix, 2, quantum_state, [[1, 1]] +# # ) +# ctx.enqueue_function[qubit_wise_multiply_inplace_gpu[number_control_bits=1]]( +# gate_set_tensor, +# gate_set_dic[PauliX.symbol], +# GATE_SIZE, +# 2, # target_qubit +# quantum_state_re_tensor, +# quantum_state_im_tensor, +# 3, # number_qubits +# STATE_VECTOR_SIZE, # 
quantum_state_size +# quantum_state_out_re_tensor, +# quantum_state_out_im_tensor, +# control_bits_circuit_tensor, +# current_control_gate_circuit_tensor, +# grid_dim=BLOCKS_PER_GRID, +# block_dim=THREADS_PER_BLOCK, +# ) + +# with quantum_state_out_re.map_to_host() as host_re, quantum_state_out_im.map_to_host() as host_im: +# print( +# ( +# "After Pauli-X gate on qubit 2 with control on qubit 1" +# " (Final State):\nreal part:\n" +# ), +# host_re, +# "\nimaginary part:\n", +# host_im, +# ) + + +def simulate_any_size_circuit_gpu[num_qubits: Int](): + """Simulates a circuit of arbitrary number of qubits""" @parameter if not has_accelerator(): print("No compatible GPU found") else: - print("Simulating Figure 1 circuit.\nCircuit design:") - print( - """ - |0> -------|X|--|Z|-- - | - |0> --|H|---*----*--- - | - |0> --|X|-------|X|-- - """ + alias state_vector_size = 1 << num_qubits + alias state_vector_layout = Layout.row_major(state_vector_size) + + alias total_threads = 2 * state_vector_size + + alias max_threads_per_block = 1024 # Maximum threads per block in CUDA + alias blocks_per_grid = ( + total_threads + max_threads_per_block - 1 + ) // max_threads_per_block + + alias threads_per_block = ( + max_threads_per_block, + 1, + 1, ) + + @parameter + if total_threads < max_threads_per_block: + alias threads_per_block = ( + total_threads, + 1, + 1, + ) # 1D block of threads + var control_bits_list: List[List[List[Int]]] = [ [[1, 1]], # Control on qubit 1 and is control because flag=1 [[1, 1]], # Control on qubit 1 and is control because flag=1 @@ -169,190 +692,154 @@ def run_gpu_not_abstract(): ctx = DeviceContext() print("Using GPU:", ctx.name()) + print("ctx.device_info:", ctx.device_info) + print( + "ctx.device_info.max_thread_block_size:", + ctx.device_info.max_thread_block_size, + ) + print( + "ctx.device_info.max_blocks_per_multiprocessor:", + ctx.device_info.max_blocks_per_multiprocessor, + ) + try: + (free, total) = ctx.get_memory_info() + print("Free memory:", 
free / (1024 * 1024), "MB") + print("Total memory:", total / (1024 * 1024), "MB") + except: + print("Failed to get memory information") # -- Create GPU variables -- # - ctx = DeviceContext() - - # Initialize the quantum circuit to the |000⟩ state - quantum_state: StateVector = StateVector.from_bitstring("000") - print("Initial quantum state:\n", quantum_state) - - quantum_state_re = ctx.enqueue_create_buffer[dtype]( - STATE_VECTOR_SIZE - ).enqueue_fill(0) - quantum_state_im = ctx.enqueue_create_buffer[dtype]( - STATE_VECTOR_SIZE - ).enqueue_fill(0) - - with quantum_state_re.map_to_host() as device_re, quantum_state_im.map_to_host() as device_im: - for i in range(STATE_VECTOR_SIZE): - device_re[i] = quantum_state[i].re - device_im[i] = quantum_state[i].im + # These don't need to be initialized to zero, they will be filled later - with quantum_state_re.map_to_host() as host_re: - print("Quantum state real part:", host_re) - with quantum_state_im.map_to_host() as host_im: - print("Quantum state imaginary part:", host_im) + host_quantum_state_re = ctx.enqueue_create_host_buffer[dtype]( + state_vector_size + ) + host_quantum_state_im = ctx.enqueue_create_host_buffer[dtype]( + state_vector_size + ) - gate_set_re = ctx.enqueue_create_buffer[dtype]( + host_gate_set_re = ctx.enqueue_create_host_buffer[dtype]( GATE_SET_SIZE * GATE_SIZE * GATE_SIZE - ).enqueue_fill(0) - gate_set_im = ctx.enqueue_create_buffer[dtype]( + ) + host_gate_set_im = ctx.enqueue_create_host_buffer[dtype]( GATE_SET_SIZE * GATE_SIZE * GATE_SIZE - ).enqueue_fill(0) - - with gate_set_re.map_to_host() as device_gate_re, gate_set_im.map_to_host() as device_gate_im: - for i in range(GATE_SET_SIZE): - gate = gate_set[i] - for j in range(GATE_SIZE): - for k in range(GATE_SIZE): - index = gate_set_1qubit_layout( - IntTuple(i, j, k) - ) # Get the index in the 1D buffer - device_gate_re[index] = gate[j, k].re - device_gate_im[index] = gate[j, k].im - - quantum_state_out_re = ctx.enqueue_create_buffer[dtype]( - 
STATE_VECTOR_SIZE - ).enqueue_fill(0) - quantum_state_out_im = ctx.enqueue_create_buffer[dtype]( - STATE_VECTOR_SIZE - ).enqueue_fill(0) + ) - # TODO have one big variable for all control bits - # and have NUMBER_CONTROL_BITS be a list defining each gates specific control bits count - # maybe need a variable to keep track of the current control gate index - control_bits_0 = ctx.enqueue_create_buffer[DType.int32]( - NUMBER_CONTROL_BITS * 2 - ).enqueue_fill(0) + host_control_bits_circuit = ctx.enqueue_create_host_buffer[DType.int32]( + CIRCUIT_NUMBER_CONTROL_GATES * NUMBER_CONTROL_BITS * 2 + ) - control_bits_1 = ctx.enqueue_create_buffer[DType.int32]( - NUMBER_CONTROL_BITS * 2 - ).enqueue_fill(0) + # -- Initialize the quantum circuit to the |000⟩ state -- # + quantum_state: StateVector = StateVector.from_bitstring( + "0" * num_qubits + ) + print("Initial quantum state:\n", quantum_state) - with control_bits_0.map_to_host() as device_control_0, control_bits_1.map_to_host() as device_control_1: - # Set control bits for the first controlled gate - device_control_0[0] = 1 - device_control_0[1] = 1 + # Wait for host buffers to be ready + ctx.synchronize() - # Set control bits for the second controlled gate - device_control_1[0] = 1 - device_control_1[1] = 1 + # -- Fill host buffers -- # + + for i in range(state_vector_size): + host_quantum_state_re[i] = quantum_state[i].re + host_quantum_state_im[i] = quantum_state[i].im + + print("Initial state real part:", host_quantum_state_re) + print("Initial state imaginary part:", host_quantum_state_im) + + for i in range(GATE_SET_SIZE): + gate = gate_set[i] + for j in range(GATE_SIZE): + for k in range(GATE_SIZE): + index = gate_set_1qubit_layout( + IntTuple(i, j, k) + ) # Get the index in the 1D buffer + host_gate_set_re[index] = gate[j, k].re + host_gate_set_im[index] = gate[j, k].im + + for i in range(CIRCUIT_NUMBER_CONTROL_GATES): + for j in range(NUMBER_CONTROL_BITS): + for k in range(2): + index = 
circuit_control_bits_layout(IntTuple(i, j, k)) + host_control_bits_circuit[index] = control_bits_list[i][j][ + k + ] + + # -- Copy host buffers to device buffers -- # + quantum_state_re = ctx.enqueue_create_buffer[dtype](state_vector_size) + quantum_state_im = ctx.enqueue_create_buffer[dtype](state_vector_size) - control_bits_empty = ctx.enqueue_create_buffer[DType.int32]( - NUMBER_CONTROL_BITS * 2 - ).enqueue_fill(0) + gate_set_re = ctx.enqueue_create_buffer[dtype]( + GATE_SET_SIZE * GATE_SIZE * GATE_SIZE + ) + gate_set_im = ctx.enqueue_create_buffer[dtype]( + GATE_SET_SIZE * GATE_SIZE * GATE_SIZE + ) - # Create control bits for the circuit control_bits_circuit = ctx.enqueue_create_buffer[DType.int32]( CIRCUIT_NUMBER_CONTROL_GATES * NUMBER_CONTROL_BITS * 2 - ).enqueue_fill(0) - current_control_gate_circuit = ctx.enqueue_create_buffer[DType.int32]( - 1 - ).enqueue_fill(0) - - print("FIRST LOOP") - for i in range(len(control_bits_list)): - for j in range(len(control_bits_list[i])): - for k in range(len(control_bits_list[i][j])): - print("i:", i, "j:", j, "k:", k) - print( - "control_bits_list[i][j][k]:", - control_bits_list[i][j][k], - ) - - print("before printing index") - - var coords_2 = IntTuple(0, 0, 0) - print("coords_2:", coords_2) - - alias circuit_control_bits_layout = Layout.row_major( - CIRCUIT_NUMBER_CONTROL_GATES, NUMBER_CONTROL_BITS, 2 - ) - var index = circuit_control_bits_layout(coords_2) - print( - "gate_set_1qubit_layout(coords_2):", - index, ) + current_control_gate_circuit = ctx.enqueue_create_buffer[DType.int32](1) - # print( - # "circuit_control_bits_layout(coords_2):", - # circuit_control_bits_layout(IntTuple(0, 0, 0)), - # ) + # Create other buffers for functions - # _ = gate_set_1qubit_layout - # _ = circuit_control_bits_layout - - print("\nSECOND LOOP") - # with control_bits_circuit.map_to_host() as device_control_bits_circuit: - # for i in range(CIRCUIT_NUMBER_CONTROL_GATES): - # for j in range(NUMBER_CONTROL_BITS): - # for k in range(2): 
- # # print("i:", i, "j:", j, "k:", k, "index:", index) - # print("i:", i, "j:", j, "k:", k) - # print( - # "control_bits_list[i][j][k]:", - # control_bits_list[i][j][k], - # ) - # index = circuit_control_bits_layout(IntTuple(i, j, k)) - # print("index:", index) - # # device_control_bits_circuit[index] = control_bits_list[ - # # i - # # ][j][k] + quantum_state_out_re = ctx.enqueue_create_buffer[dtype]( + state_vector_size + ) + quantum_state_out_im = ctx.enqueue_create_buffer[dtype]( + state_vector_size + ) - # print( - # "Control bits for the circuit:\n", - # device_control_bits_circuit, - # ) + quantum_state_re.enqueue_copy_from(host_quantum_state_re) + quantum_state_im.enqueue_copy_from(host_quantum_state_im) + + gate_set_re.enqueue_copy_from(host_gate_set_re) + gate_set_im.enqueue_copy_from(host_gate_set_im) + + control_bits_circuit.enqueue_copy_from(host_control_bits_circuit) + + ctx.enqueue_memset(current_control_gate_circuit, 0) + ctx.enqueue_memset(quantum_state_out_re, 0.0) + ctx.enqueue_memset(quantum_state_out_im, 0.0) # -- Create layout tensors for GPU operations -- # gate_set_re_tensor = LayoutTensor[ - mut=True, dtype, gate_set_1qubit_layout + mut=False, dtype, gate_set_1qubit_layout ](gate_set_re.unsafe_ptr()) gate_set_im_tensor = LayoutTensor[ - mut=True, dtype, gate_set_1qubit_layout + mut=False, dtype, gate_set_1qubit_layout ](gate_set_im.unsafe_ptr()) quantum_state_re_tensor = LayoutTensor[ - mut=True, dtype, state_vector_3qubits_layout + mut=True, dtype, state_vector_layout ](quantum_state_re.unsafe_ptr()) quantum_state_im_tensor = LayoutTensor[ - mut=True, dtype, state_vector_3qubits_layout + mut=True, dtype, state_vector_layout ](quantum_state_im.unsafe_ptr()) quantum_state_out_re_tensor = LayoutTensor[ - mut=True, dtype, state_vector_3qubits_layout + mut=True, dtype, state_vector_layout ](quantum_state_out_re.unsafe_ptr()) quantum_state_out_im_tensor = LayoutTensor[ - mut=True, dtype, state_vector_3qubits_layout + mut=True, dtype, 
state_vector_layout ](quantum_state_out_im.unsafe_ptr()) - control_bits_empty_tensor = LayoutTensor[ - mut=True, DType.int32, control_bits_layout - ](control_bits_empty.unsafe_ptr()) - control_bits_0_tensor = LayoutTensor[ - mut=True, DType.int32, control_bits_layout - ](control_bits_0.unsafe_ptr()) - control_bits_1_tensor = LayoutTensor[ - mut=True, DType.int32, control_bits_layout - ](control_bits_1.unsafe_ptr()) - control_bits_circuit_tensor = LayoutTensor[ - mut=True, DType.int32, circuit_control_bits_layout + mut=False, DType.int32, circuit_control_bits_layout ](control_bits_circuit.unsafe_ptr()) current_control_gate_circuit_tensor = LayoutTensor[ mut=True, DType.int32, Layout.row_major(1) ](current_control_gate_circuit.unsafe_ptr()) - # Enqueue create_initial_state - - # Enqueue applying gates + # -- Apply circuit operations -- # # Gate 0 # quantum_state = qubit_wise_multiply_gpu( # Hadamard.matrix, 1, quantum_state # ) - ctx.enqueue_function[qubit_wise_multiply_gpu_2[number_control_bits=0]]( + ctx.enqueue_function[ + qubit_wise_multiply_inplace_gpu[number_control_bits=0] + ]( gate_set_re_tensor, gate_set_im_tensor, gate_set_dic[Hadamard.symbol], @@ -360,13 +847,14 @@ def run_gpu_not_abstract(): 1, # target_qubit quantum_state_re_tensor, quantum_state_im_tensor, - 3, # number_qubits - STATE_VECTOR_SIZE, # quantum_state_size + num_qubits, # number_qubits + state_vector_size, # quantum_state_size quantum_state_out_re_tensor, quantum_state_out_im_tensor, - control_bits_empty, - grid_dim=BLOCKS_PER_GRID, - block_dim=THREADS_PER_BLOCK, + control_bits_circuit_tensor, + current_control_gate_circuit_tensor, + grid_dim=blocks_per_grid, + block_dim=threads_per_block, ) # # It works @@ -380,7 +868,9 @@ def run_gpu_not_abstract(): # Gate 1 (reverse the states input <-> output) # quantum_state = qubit_wise_multiply(PauliX.matrix, 2, quantum_state) - ctx.enqueue_function[qubit_wise_multiply_gpu_2[number_control_bits=0]]( + ctx.enqueue_function[ + 
qubit_wise_multiply_inplace_gpu[number_control_bits=0] + ]( gate_set_re_tensor, gate_set_im_tensor, gate_set_dic[PauliX.symbol], @@ -388,13 +878,14 @@ def run_gpu_not_abstract(): 2, # target_qubit quantum_state_out_re_tensor, quantum_state_out_im_tensor, - 3, # number_qubits - STATE_VECTOR_SIZE, # quantum_state_size + num_qubits, # number_qubits + state_vector_size, # quantum_state_size quantum_state_re_tensor, quantum_state_im_tensor, - control_bits_empty, - grid_dim=BLOCKS_PER_GRID, - block_dim=THREADS_PER_BLOCK, + control_bits_circuit_tensor, + current_control_gate_circuit_tensor, + grid_dim=blocks_per_grid, + block_dim=threads_per_block, ) # with quantum_state_re.map_to_host() as host_re, quantum_state_im.map_to_host() as host_im: @@ -410,7 +901,9 @@ def run_gpu_not_abstract(): # quantum_state = qubit_wise_multiply( # PauliX.matrix, 0, quantum_state, [[1, 1]] # ) - ctx.enqueue_function[qubit_wise_multiply_gpu_2[number_control_bits=1]]( + ctx.enqueue_function[ + qubit_wise_multiply_inplace_gpu[number_control_bits=1] + ]( gate_set_re_tensor, gate_set_im_tensor, gate_set_dic[PauliX.symbol], @@ -418,13 +911,14 @@ def run_gpu_not_abstract(): 0, # target_qubit quantum_state_re_tensor, quantum_state_im_tensor, - 3, # number_qubits - STATE_VECTOR_SIZE, # quantum_state_size + num_qubits, # number_qubits + state_vector_size, # quantum_state_size quantum_state_out_re_tensor, quantum_state_out_im_tensor, - control_bits_0_tensor, - grid_dim=BLOCKS_PER_GRID, - block_dim=THREADS_PER_BLOCK, + control_bits_circuit_tensor, + current_control_gate_circuit_tensor, + grid_dim=blocks_per_grid, + block_dim=threads_per_block, ) # with quantum_state_out_re.map_to_host() as host_re, quantum_state_out_im.map_to_host() as host_im: @@ -438,7 +932,9 @@ def run_gpu_not_abstract(): # Gate 3 # quantum_state = qubit_wise_multiply(PauliZ.matrix, 0, quantum_state) - ctx.enqueue_function[qubit_wise_multiply_gpu_2[number_control_bits=0]]( + ctx.enqueue_function[ + 
qubit_wise_multiply_inplace_gpu[number_control_bits=0] + ]( gate_set_re_tensor, gate_set_im_tensor, gate_set_dic[PauliZ.symbol], @@ -446,13 +942,14 @@ def run_gpu_not_abstract(): 0, # target_qubit quantum_state_out_re_tensor, quantum_state_out_im_tensor, - 3, # number_qubits - STATE_VECTOR_SIZE, # quantum_state_size + num_qubits, # number_qubits + state_vector_size, # quantum_state_size quantum_state_re_tensor, quantum_state_im_tensor, - control_bits_empty_tensor, - grid_dim=BLOCKS_PER_GRID, - block_dim=THREADS_PER_BLOCK, + control_bits_circuit_tensor, + current_control_gate_circuit_tensor, + grid_dim=blocks_per_grid, + block_dim=threads_per_block, ) # with quantum_state_re.map_to_host() as host_re, quantum_state_im.map_to_host() as host_im: @@ -467,7 +964,9 @@ def run_gpu_not_abstract(): # quantum_state = qubit_wise_multiply( # PauliX.matrix, 2, quantum_state, [[1, 1]] # ) - ctx.enqueue_function[qubit_wise_multiply_gpu_2[number_control_bits=1]]( + ctx.enqueue_function[ + qubit_wise_multiply_inplace_gpu[number_control_bits=1] + ]( gate_set_re_tensor, gate_set_im_tensor, gate_set_dic[PauliX.symbol], @@ -475,13 +974,14 @@ def run_gpu_not_abstract(): 2, # target_qubit quantum_state_re_tensor, quantum_state_im_tensor, - 3, # number_qubits - STATE_VECTOR_SIZE, # quantum_state_size + num_qubits, # number_qubits + state_vector_size, # quantum_state_size quantum_state_out_re_tensor, quantum_state_out_im_tensor, - control_bits_1_tensor, - grid_dim=BLOCKS_PER_GRID, - block_dim=THREADS_PER_BLOCK, + control_bits_circuit_tensor, + current_control_gate_circuit_tensor, + grid_dim=blocks_per_grid, + block_dim=threads_per_block, ) with quantum_state_out_re.map_to_host() as host_re, quantum_state_out_im.map_to_host() as host_im: diff --git a/examples/main.mojo b/examples/main.mojo index f0a24c1..688f064 100644 --- a/examples/main.mojo +++ b/examples/main.mojo @@ -25,6 +25,7 @@ from qlabs.base import ( SWAP, iSWAP, qubit_wise_multiply, + qubit_wise_multiply_inplace, 
qubit_wise_multiply_extended, apply_swap, partial_trace, @@ -38,7 +39,10 @@ from qlabs.abstractions import ( ShowOnlyEnd, ) -from gpu_examples import gpu_debug_something, run_gpu_not_abstract +from gpu_examples import ( + simulate_figure1_circuit_gpu, + simulate_any_size_circuit_gpu, +) # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # # MARK: Examples # @@ -102,6 +106,70 @@ fn simulate_figure1_circuit() -> None: print("Final quantum state after tracing out qubits:\n", final_matrix) +fn simulate_figure1_circuit_inplace() -> None: + """Simulates the circuit from Figure 1 in the paper.""" + print("Simulating Figure 1 circuit.\nCircuit design:") + print( + """ +|0> -------|X|--|Z|-- + | +|0> --|H|---*----*--- + | +|0> --|X|-------|X|-- + """ + ) + # Initialize the quantum circuit to the |000⟩ state + quantum_state_0: StateVector = StateVector.from_bitstring("000") + quantum_state_1: StateVector = StateVector.from_bitstring("000") + + print("Initial quantum state:\n", quantum_state_0) + + # Gate 0 + qubit_wise_multiply_inplace( + Hadamard.matrix, 1, quantum_state_0, quantum_state_1 + ) + + print("After Hadamard gate on qubit 1:\n", quantum_state_1) + + # Gate 1 + qubit_wise_multiply_inplace( + PauliX.matrix, 2, quantum_state_1, quantum_state_0 + ) + + print("After Pauli-X gate on qubit 2:\n", quantum_state_0) + + # Gate 2 + qubit_wise_multiply_inplace( + PauliX.matrix, 0, quantum_state_0, quantum_state_1, [[1, 1]] + ) + + print( + "After Pauli-X gate on qubit 0 with control on qubit 1:\n", + quantum_state_1, + ) + + # Gate 3 + qubit_wise_multiply_inplace( + PauliZ.matrix, 0, quantum_state_1, quantum_state_0 + ) + + print("After Pauli-Z gate on qubit 0:\n", quantum_state_0) + + # Gate 4 + qubit_wise_multiply_inplace( + PauliX.matrix, 2, quantum_state_0, quantum_state_1, [[1, 1]] + ) + + print( + "After Pauli-X gate on qubit 2 with control on qubit 1:\n", + quantum_state_1, + ) + + final_matrix = partial_trace(quantum_state_1, []) # Trace out qubits + + print("Final quantum state 
after tracing out qubits:\n", final_matrix) + + fn simulate_figure1_circuit_abstract() -> None: """ Simulates the circuit from Figure 1 in the paper. @@ -570,6 +638,8 @@ def main(): simulate_figure1_circuit() + simulate_figure1_circuit_inplace() + # simulate_figure1_circuit_abstract() # simulate_random_circuit(number_qubits, number_layers) @@ -586,6 +656,6 @@ def main(): # debug_something() - gpu_debug_something() + # simulate_figure1_circuit_gpu() - run_gpu_not_abstract() + simulate_any_size_circuit_gpu[4]() diff --git a/pixi.toml b/pixi.toml index 48e015d..8d44315 100644 --- a/pixi.toml +++ b/pixi.toml @@ -62,7 +62,7 @@ outputs = ["build/qlabs.mojopkg"] depends-on = ["create_build_dir", "format_src"] [tasks.clean] # Clean the package files and Build directory -cmd = "rm build/* && rmdir build/ && rm examples/qlabs.mojopkg && rm tests/qlabs.mojopkg && rm benchmarks/qlabs.mojopkg" +cmd = "rm build/* && rmdir build/ && rm examples/qlabs.mojopkg && rm tests/qlabs.mojopkg && rm benchmarks/qlabs.mojopkg || true" [tasks.install] # Install the package in the necessary directories cmd = "cp build/qlabs.mojopkg examples/qlabs.mojopkg && cp build/qlabs.mojopkg tests/qlabs.mojopkg && cp build/qlabs.mojopkg benchmarks/qlabs.mojopkg" diff --git a/src/base/__init__.mojo b/src/base/__init__.mojo index eecf74b..2bd1408 100644 --- a/src/base/__init__.mojo +++ b/src/base/__init__.mojo @@ -18,6 +18,7 @@ from .gate import ( from .qubits_operations import ( qubit_wise_multiply, + qubit_wise_multiply_inplace, qubit_wise_multiply_extended, apply_swap, partial_trace, diff --git a/src/base/gpu/__init__.mojo b/src/base/gpu/__init__.mojo index bb72767..09d6bd9 100644 --- a/src/base/gpu/__init__.mojo +++ b/src/base/gpu/__init__.mojo @@ -1,4 +1,3 @@ from .qubits_operations import ( - qubit_wise_multiply_gpu, - qubit_wise_multiply_gpu_2, + qubit_wise_multiply_inplace_gpu, ) diff --git a/src/base/gpu/qubits_operations.mojo b/src/base/gpu/qubits_operations.mojo index 4b64ff6..2a980d6 100644 --- 
a/src/base/gpu/qubits_operations.mojo +++ b/src/base/gpu/qubits_operations.mojo @@ -1,6 +1,6 @@ from bit import count_trailing_zeros -from gpu import thread_idx, block_dim, block_idx +from gpu import thread_idx, block_dim, block_idx, global_idx, barrier from gpu.host import DeviceContext from layout import Layout, LayoutTensor @@ -11,146 +11,320 @@ alias STATE_VECTOR_SIZE = 8 alias NUMBER_CONTROL_BITS = 1 alias gate_1qubit_layout = Layout.row_major(GATE_SIZE, GATE_SIZE) -alias state_vector_3qubits_layout = Layout.row_major(STATE_VECTOR_SIZE, 1) +alias state_vector_3qubits_layout = Layout.row_major(STATE_VECTOR_SIZE) alias control_bits_layout = Layout.row_major(NUMBER_CONTROL_BITS, 2) +alias CIRCUIT_NUMBER_CONTROL_GATES = 2 +alias circuit_control_bits_layout = Layout.row_major( + CIRCUIT_NUMBER_CONTROL_GATES, NUMBER_CONTROL_BITS, 2 +) + alias GATE_SET_SIZE = 3 alias gate_set_1qubit_layout = Layout.row_major( GATE_SET_SIZE, GATE_SIZE, GATE_SIZE ) +alias gate_set_1qubit_vectorized_layout = Layout.row_major( + GATE_SET_SIZE, GATE_SIZE, GATE_SIZE, 2 +) -fn qubit_wise_multiply_gpu( - gate_re: LayoutTensor[mut=True, dtype, gate_1qubit_layout], - gate_im: LayoutTensor[mut=True, dtype, gate_1qubit_layout], - gate_size: Int, - target_qubit: Int, - quantum_state_re: LayoutTensor[ - mut=True, dtype, state_vector_3qubits_layout - ], - quantum_state_im: LayoutTensor[ - mut=True, dtype, state_vector_3qubits_layout - ], - number_qubits: Int, - quantum_state_size: Int, - quantum_state_out_re: LayoutTensor[ - mut=True, dtype, state_vector_3qubits_layout - ], - quantum_state_out_im: LayoutTensor[ - mut=True, dtype, state_vector_3qubits_layout - ], - control_bits: LayoutTensor[mut=True, DType.int32, control_bits_layout], - # control_bits: LayoutTensor[mut=True, DType.int32, control_bits_layout], - number_control_bits: Int, -) -> None: - """Applies a quantum gate to specific qubits in the quantum state. 
- - It will apply the gate starting from the target qubit assuming that the other - qubits that the gate acts on are following the target qubit. - - Args: - gate_re: Real part of the gate matrix. - gate_im: Imaginary part of the gate matrix. - gate_size: Size of the gate (2^n, where n is the number of qubits the gate acts on). - target_qubit: The index of the target qubit to apply the gate to. - quantum_state_re: Real part of the quantum state vector. - quantum_state_im: Imaginary part of the quantum state vector. - number_qubits: Total number of qubits in the quantum state. - quantum_state_size: Size of the quantum state vector (2^number_qubits). - quantum_state_out_re: Output real part of the quantum state vector after applying the gate. - quantum_state_out_im: Output imaginary part of the quantum state vector after applying the gate. - control_bits: List of control bits, where each control bit is a list containing - [wire_index, flag] (1 for control, 0 for anti-control). - number_control_bits: Number of control bits. - - """ - print("Inside qubit_wise_multiply_gpu") - target_qubits_count: Int = count_trailing_zeros(gate_size) - if (target_qubit < 0) or (target_qubit >= number_qubits): - print( - "Error: target_qubit index out of bounds. 
Must be between 0 and", - number_qubits - 1, - ) - print("Skipping gate application.") - return - - print("AAAAA") - inclusion_mask: Int = 0 - desired_value_mask: Int = 0 - for i in range(number_control_bits): - print("before") - wire_index, flag = control_bits[i, 0], control_bits[i, 1] - print("after") - bit: Int = 1 << Int( - wire_index - ) # efficient way of computing 2^wire_index - inclusion_mask |= bit # turn on the bit - if flag == 1: - desired_value_mask |= bit # turn on the bit - - print("BBBBB") - size_of_state_vector: Int = quantum_state_size - size_of_half_block: Int = 1 << target_qubit # 2^target_qubit - size_of_block: Int = size_of_half_block << target_qubits_count - print("CCCC") - # copies all amplitudes from quantum_state to quantum_state_out - for i in range(size_of_state_vector): - quantum_state_out_re[i, 0] = quantum_state_re[i, 0] - quantum_state_out_im[i, 0] = quantum_state_im[i, 0] - - print("before loop") - for block_start in range(0, size_of_state_vector, size_of_block): - # print("block_start:", block_start) - for offset in range(size_of_half_block): - # print("offset:", offset) - i1: Int = ( - block_start | offset - ) # faster than, but equivalent to, block_start + offset - - if (i1 & inclusion_mask) != desired_value_mask: - continue # skip this iteration if the control bits do not match - - i2: Int = ( - i1 | size_of_half_block - ) # equivalent to i1 + size_of_half_block - - print("i1:", i1, "i2:", i2) - - quantum_state_out_re[i1, 0] = ( - (gate_re[0, 0] * quantum_state_re[i1, 0]) - - (gate_im[0, 0] * quantum_state_im[i1, 0]) - + (gate_re[0, 1] * quantum_state_re[i2, 0]) - - (gate_im[0, 1] * quantum_state_im[i2, 0]) - ) - - quantum_state_out_im[i1, 0] = ( - (gate_re[0, 0] * quantum_state_im[i1, 0]) - - (gate_im[0, 0] * quantum_state_re[i1, 0]) - + (gate_re[0, 1] * quantum_state_im[i2, 0]) - - (gate_im[0, 1] * quantum_state_re[i2, 0]) - ) - - quantum_state_out_re[i2, 0] = ( - (gate_re[1, 0] * quantum_state_re[i1, 0]) - - (gate_im[1, 0] * 
quantum_state_im[i1, 0]) - + (gate_re[1, 1] * quantum_state_re[i2, 0]) - - (gate_im[1, 1] * quantum_state_im[i2, 0]) - ) - - quantum_state_out_im[i2, 0] = ( - (gate_re[1, 0] * quantum_state_im[i1, 0]) - - (gate_im[1, 0] * quantum_state_re[i1, 0]) - + (gate_re[1, 1] * quantum_state_im[i2, 0]) - - (gate_im[1, 1] * quantum_state_re[i2, 0]) - ) - - -fn qubit_wise_multiply_gpu_2[ +# fn qubit_wise_multiply_inplace_gpu[ +# number_control_bits: Int +# ]( +# gate_set_re: LayoutTensor[mut=False, dtype, gate_set_1qubit_layout], +# gate_set_im: LayoutTensor[mut=False, dtype, gate_set_1qubit_layout], +# gate_index: Int, +# gate_size: Int, +# target_qubit: Int, +# quantum_state_re: LayoutTensor[ +# mut=True, dtype, state_vector_3qubits_layout +# ], +# quantum_state_im: LayoutTensor[ +# mut=True, dtype, state_vector_3qubits_layout +# ], +# number_qubits: Int, +# quantum_state_size: Int, +# quantum_state_out_re: LayoutTensor[ +# mut=True, dtype, state_vector_3qubits_layout +# ], +# quantum_state_out_im: LayoutTensor[ +# mut=True, dtype, state_vector_3qubits_layout +# ], +# # control_bits: LayoutTensor[mut=True, DType.int32, control_bits_layout], +# control_bits_circuit: LayoutTensor[ +# mut=False, DType.int32, circuit_control_bits_layout +# ], +# current_control_gate_circuit: LayoutTensor[ +# mut=True, DType.int32, Layout.row_major(1) +# ], +# ) -> None: +# """Applies a quantum gate to specific qubits in the quantum state. + +# It will apply the gate starting from the target qubit assuming that the other +# qubits that the gate acts on are following the target qubit. + +# Parameters: +# number_control_bits: Number of control bits. + +# Args: +# gate_set_re: All unique gates applied in the circuit, real part. +# gate_set_im: All unique gates applied in the circuit, imaginary part. +# gate_index: Index of the gate in the gate set to apply. +# gate_size: Size of the gate (2^n, where n is the number of qubits the gate acts on). 
+# target_qubit: The index of the target qubit to apply the gate to. +# quantum_state_re: Real part of the quantum state vector. +# quantum_state_im: Imaginary part of the quantum state vector. +# number_qubits: Total number of qubits in the quantum state. +# quantum_state_size: Size of the quantum state vector (2^number_qubits). +# quantum_state_out_re: Output real part of the quantum state vector after applying the gate. +# quantum_state_out_im: Output imaginary part of the quantum state vector after applying the gate. +# control_bits_circuit: Control bits, where each control bit contains +# [wire_index, flag] (1 for control, 0 for anti-control). +# current_control_gate_circuit: Current control gate circuit index, +# used to track the position in the control_bits_circuit. +# """ +# print("Inside qubit_wise_multiply_gpu") +# target_qubits_count: Int = count_trailing_zeros(gate_size) +# if (target_qubit < 0) or (target_qubit >= number_qubits): +# print( +# "Error: target_qubit index out of bounds. 
Must be between 0 and", +# number_qubits - 1, +# ) +# print("Skipping gate application.") +# return + +# print("AAAAA") +# inclusion_mask: Int = 0 +# desired_value_mask: Int = 0 + +# @parameter +# for i in range(number_control_bits): +# print("before") +# wire_index, flag = ( +# control_bits_circuit[Int(current_control_gate_circuit[0]), i, 0], +# control_bits_circuit[Int(current_control_gate_circuit[0]), i, 1], +# ) +# current_control_gate_circuit[0] += 1 +# print("after") +# bit: Int = 1 << Int( +# wire_index +# ) # efficient way of computing 2^wire_index +# inclusion_mask |= bit # turn on the bit +# if flag == 1: +# desired_value_mask |= bit # turn on the bit + +# print("BBBBB") +# size_of_state_vector: Int = quantum_state_size +# size_of_half_block: Int = 1 << target_qubit # 2^target_qubit +# size_of_block: Int = size_of_half_block << target_qubits_count + +# print("CCCC") +# # copies all amplitudes from quantum_state to quantum_state_out +# for i in range(size_of_state_vector): +# quantum_state_out_re[i] = quantum_state_re[i] +# quantum_state_out_im[i] = quantum_state_im[i] + +# print("before loop") +# for block_start in range(0, size_of_state_vector, size_of_block): +# # print("block_start:", block_start) +# for offset in range(size_of_half_block): +# # print("offset:", offset) +# i1: Int = ( +# block_start | offset +# ) # faster than, but equivalent to, block_start + offset + +# if (i1 & inclusion_mask) != desired_value_mask: +# continue # skip this iteration if the control bits do not match + +# i2: Int = ( +# i1 | size_of_half_block +# ) # equivalent to i1 + size_of_half_block + +# print("i1:", i1, "i2:", i2) + +# # new_state_vector[i1] = ( +# # gate[0, 0] * quantum_state[i1] + gate[0, 1] * quantum_state[i2] +# # ) +# # new_state_vector[i2] = ( +# # gate[1, 0] * quantum_state[i1] + gate[1, 1] * quantum_state[i2] +# # ) + +# quantum_state_out_re[i1] = ( +# (gate_set_re[gate_index, 0, 0] * quantum_state_re[i1]) +# - (gate_set_im[gate_index, 0, 0] * 
quantum_state_im[i1]) +# + (gate_set_re[gate_index, 0, 1] * quantum_state_re[i2]) +# - (gate_set_im[gate_index, 0, 1] * quantum_state_im[i2]) +# ) + +# quantum_state_out_im[i1] = ( +# (gate_set_re[gate_index, 0, 0] * quantum_state_im[i1]) +# - (gate_set_im[gate_index, 0, 0] * quantum_state_re[i1]) +# + (gate_set_re[gate_index, 0, 1] * quantum_state_im[i2]) +# - (gate_set_im[gate_index, 0, 1] * quantum_state_re[i2]) +# ) + +# quantum_state_out_re[i2] = ( +# (gate_set_re[gate_index, 1, 0] * quantum_state_re[i1]) +# - (gate_set_im[gate_index, 1, 0] * quantum_state_im[i1]) +# + (gate_set_re[gate_index, 1, 1] * quantum_state_re[i2]) +# - (gate_set_im[gate_index, 1, 1] * quantum_state_im[i2]) +# ) + +# quantum_state_out_im[i2] = ( +# (gate_set_re[gate_index, 1, 0] * quantum_state_im[i1]) +# - (gate_set_im[gate_index, 1, 0] * quantum_state_re[i1]) +# + (gate_set_re[gate_index, 1, 1] * quantum_state_im[i2]) +# - (gate_set_im[gate_index, 1, 1] * quantum_state_re[i2]) +# ) + + +# NOTE: Works with +# alias BLOCKS_PER_GRID = 1 +# alias THREADS_PER_BLOCK = (1, 1) +# fn qubit_wise_multiply_inplace_gpu[ +# number_control_bits: Int +# ]( +# gate_set_re: LayoutTensor[mut=False, dtype, gate_set_1qubit_layout], +# gate_set_im: LayoutTensor[mut=False, dtype, gate_set_1qubit_layout], +# gate_index: Int, +# gate_size: Int, +# target_qubit: Int, +# quantum_state_re: LayoutTensor[ +# mut=True, dtype, state_vector_3qubits_layout +# ], +# quantum_state_im: LayoutTensor[ +# mut=True, dtype, state_vector_3qubits_layout +# ], +# number_qubits: Int, +# quantum_state_size: Int, +# quantum_state_out_re: LayoutTensor[ +# mut=True, dtype, state_vector_3qubits_layout +# ], +# quantum_state_out_im: LayoutTensor[ +# mut=True, dtype, state_vector_3qubits_layout +# ], +# # control_bits: LayoutTensor[mut=True, DType.int32, control_bits_layout], +# control_bits_circuit: LayoutTensor[ +# mut=False, DType.int32, circuit_control_bits_layout +# ], +# current_control_gate_circuit: LayoutTensor[ +# mut=True, 
DType.int32, Layout.row_major(1) +# ], +# ) -> None: +# """Applies a quantum gate to specific qubits in the quantum state. + +# It will apply the gate starting from the target qubit assuming that the other +# qubits that the gate acts on are following the target qubit. + +# Parameters: +# number_control_bits: Number of control bits. + +# Args: +# gate_set_re: All unique gates applied in the circuit, real part. +# gate_set_im: All unique gates applied in the circuit, imaginary part. +# gate_index: Index of the gate in the gate set to apply. +# gate_size: Size of the gate (2^n, where n is the number of qubits the gate acts on). +# target_qubit: The index of the target qubit to apply the gate to. +# quantum_state_re: Real part of the quantum state vector. +# quantum_state_im: Imaginary part of the quantum state vector. +# number_qubits: Total number of qubits in the quantum state. +# quantum_state_size: Size of the quantum state vector (2^number_qubits). +# quantum_state_out_re: Output real part of the quantum state vector after applying the gate. +# quantum_state_out_im: Output imaginary part of the quantum state vector after applying the gate. +# control_bits_circuit: Control bits, where each control bit contains +# [wire_index, flag] (1 for control, 0 for anti-control). +# current_control_gate_circuit: Current control gate circuit index, +# used to track the position in the control_bits_circuit. +# """ +# target_qubits_count: Int = count_trailing_zeros(gate_size) +# if (target_qubit < 0) or (target_qubit >= number_qubits): +# print( +# "Error: target_qubit index out of bounds. 
Must be between 0 and", +# number_qubits - 1, +# "(Skipping gate application)", +# ) +# return + +# # global_i = block_dim.x * block_idx.x + thread_idx.x +# global_i = global_idx.x +# local_i = thread_idx.x + +# inclusion_mask: Int = 0 +# desired_value_mask: Int = 0 + +# @parameter +# for i in range(number_control_bits): +# wire_index, flag = ( +# control_bits_circuit[Int(current_control_gate_circuit[0]), i, 0], +# control_bits_circuit[Int(current_control_gate_circuit[0]), i, 1], +# ) +# current_control_gate_circuit[0] += 1 +# bit: Int = 1 << Int( +# wire_index +# ) # efficient way of computing 2^wire_index +# inclusion_mask |= bit # turn on the bit +# if flag == 1: +# desired_value_mask |= bit # turn on the bit + +# size_of_state_vector: Int = quantum_state_size +# size_of_half_block: Int = 1 << target_qubit # 2^target_qubit +# size_of_block: Int = size_of_half_block << target_qubits_count + +# # copies all amplitudes from quantum_state to quantum_state_out +# for i in range(size_of_state_vector): +# quantum_state_out_re[i] = quantum_state_re[i] +# quantum_state_out_im[i] = quantum_state_im[i] + +# for block_start in range(0, size_of_state_vector, size_of_block): +# # print("block_start:", block_start) +# for offset in range(size_of_half_block): +# # print("offset:", offset) +# i1: Int = ( +# block_start | offset +# ) # faster than, but equivalent to, block_start + offset + +# if (i1 & inclusion_mask) != desired_value_mask: +# continue # skip this iteration if the control bits do not match + +# i2: Int = ( +# i1 | size_of_half_block +# ) # equivalent to i1 + size_of_half_block + +# quantum_state_out_re[i1] = ( +# (gate_set_re[gate_index, 0, 0] * quantum_state_re[i1]) +# - (gate_set_im[gate_index, 0, 0] * quantum_state_im[i1]) +# + (gate_set_re[gate_index, 0, 1] * quantum_state_re[i2]) +# - (gate_set_im[gate_index, 0, 1] * quantum_state_im[i2]) +# ) + +# quantum_state_out_im[i1] = ( +# (gate_set_re[gate_index, 0, 0] * quantum_state_im[i1]) +# + 
(gate_set_im[gate_index, 0, 0] * quantum_state_re[i1]) +# + (gate_set_re[gate_index, 0, 1] * quantum_state_im[i2]) +# + (gate_set_im[gate_index, 0, 1] * quantum_state_re[i2]) +# ) + +# quantum_state_out_re[i2] = ( +# (gate_set_re[gate_index, 1, 0] * quantum_state_re[i1]) +# - (gate_set_im[gate_index, 1, 0] * quantum_state_im[i1]) +# + (gate_set_re[gate_index, 1, 1] * quantum_state_re[i2]) +# - (gate_set_im[gate_index, 1, 1] * quantum_state_im[i2]) +# ) + +# quantum_state_out_im[i2] = ( +# (gate_set_re[gate_index, 1, 0] * quantum_state_im[i1]) +# + (gate_set_im[gate_index, 1, 0] * quantum_state_re[i1]) +# + (gate_set_re[gate_index, 1, 1] * quantum_state_im[i2]) +# + (gate_set_im[gate_index, 1, 1] * quantum_state_re[i2]) +# ) + + +fn qubit_wise_multiply_inplace_gpu[ number_control_bits: Int ]( - gate_set_re: LayoutTensor[mut=True, dtype, gate_set_1qubit_layout], - gate_set_im: LayoutTensor[mut=True, dtype, gate_set_1qubit_layout], + gate_set_re: LayoutTensor[mut=False, dtype, gate_set_1qubit_layout], + gate_set_im: LayoutTensor[mut=False, dtype, gate_set_1qubit_layout], gate_index: Int, gate_size: Int, target_qubit: Int, @@ -168,7 +342,12 @@ fn qubit_wise_multiply_gpu_2[ quantum_state_out_im: LayoutTensor[ mut=True, dtype, state_vector_3qubits_layout ], - control_bits: LayoutTensor[mut=True, DType.int32, control_bits_layout], + control_bits_circuit: LayoutTensor[ + mut=False, DType.int32, circuit_control_bits_layout + ], + current_control_gate_circuit: LayoutTensor[ + mut=True, DType.int32, Layout.row_major(1) + ], ) -> None: """Applies a quantum gate to specific qubits in the quantum state. @@ -190,28 +369,41 @@ fn qubit_wise_multiply_gpu_2[ quantum_state_size: Size of the quantum state vector (2^number_qubits). quantum_state_out_re: Output real part of the quantum state vector after applying the gate. quantum_state_out_im: Output imaginary part of the quantum state vector after applying the gate. 
- control_bits: List of control bits, where each control bit is a list containing - [wire_index, flag] (1 for control, 0 for anti-control). + control_bits_circuit: Control bits, where each control bit contains + [wire_index, flag] (1 for control, 0 for anti-control). + current_control_gate_circuit: Current control gate circuit index, + used to track the position in the control_bits_circuit. """ - print("Inside qubit_wise_multiply_gpu") - target_qubits_count: Int = count_trailing_zeros(gate_size) + # target_qubits_count: Int = count_trailing_zeros(gate_size) if (target_qubit < 0) or (target_qubit >= number_qubits): print( "Error: target_qubit index out of bounds. Must be between 0 and", number_qubits - 1, + "(Skipping gate application)", ) - print("Skipping gate application.") return - print("AAAAA") + # global_i = block_dim.x * block_idx.x + thread_idx.x + global_i = global_idx.x + # local_i = thread_idx.x + + # print("global_i:", global_i, "local_i:", local_i) + inclusion_mask: Int = 0 desired_value_mask: Int = 0 + # CPU implementation @parameter - for i in range(number_control_bits): - print("before") - wire_index, flag = control_bits[i, 0], control_bits[i, 1] - print("after") + for control_qubit in range(number_control_bits): + wire_index, flag = ( + control_bits_circuit[ + Int(current_control_gate_circuit[0]), control_qubit, 0 + ], + control_bits_circuit[ + Int(current_control_gate_circuit[0]), control_qubit, 1 + ], + ) + current_control_gate_circuit[0] += 1 bit: Int = 1 << Int( wire_index ) # efficient way of computing 2^wire_index @@ -219,59 +411,289 @@ fn qubit_wise_multiply_gpu_2[ if flag == 1: desired_value_mask |= bit # turn on the bit - print("BBBBB") + # # GPU implementation + # if global_i < number_control_bits: + # wire_index, flag = ( + # control_bits_circuit[ + # Int(current_control_gate_circuit[0]), global_i, 0 + # ], + # control_bits_circuit[ + # Int(current_control_gate_circuit[0]), global_i, 1 + # ], + # ) + # current_control_gate_circuit[0] += 
1 + # bit: Int = 1 << Int( + # wire_index + # ) # efficient way of computing 2^wire_index + # inclusion_mask |= bit # turn on the bit + # if flag == 1: + # desired_value_mask |= bit # turn on the bit + size_of_state_vector: Int = quantum_state_size size_of_half_block: Int = 1 << target_qubit # 2^target_qubit - size_of_block: Int = size_of_half_block << target_qubits_count - print("CCCC") # copies all amplitudes from quantum_state to quantum_state_out - for i in range(size_of_state_vector): - quantum_state_out_re[i, 0] = quantum_state_re[i, 0] - quantum_state_out_im[i, 0] = quantum_state_im[i, 0] - - print("before loop") - for block_start in range(0, size_of_state_vector, size_of_block): - # print("block_start:", block_start) - for offset in range(size_of_half_block): - # print("offset:", offset) - i1: Int = ( - block_start | offset - ) # faster than, but equivalent to, block_start + offset - - if (i1 & inclusion_mask) != desired_value_mask: - continue # skip this iteration if the control bits do not match - - i2: Int = ( - i1 | size_of_half_block - ) # equivalent to i1 + size_of_half_block - - print("i1:", i1, "i2:", i2) - - quantum_state_out_re[i1, 0] = ( - (gate_set_re[gate_index, 0, 0] * quantum_state_re[i1, 0]) - - (gate_set_im[gate_index, 0, 0] * quantum_state_im[i1, 0]) - + (gate_set_re[gate_index, 0, 1] * quantum_state_re[i2, 0]) - - (gate_set_im[gate_index, 0, 1] * quantum_state_im[i2, 0]) - ) - - quantum_state_out_im[i1, 0] = ( - (gate_set_re[gate_index, 0, 0] * quantum_state_im[i1, 0]) - - (gate_set_im[gate_index, 0, 0] * quantum_state_re[i1, 0]) - + (gate_set_re[gate_index, 0, 1] * quantum_state_im[i2, 0]) - - (gate_set_im[gate_index, 0, 1] * quantum_state_re[i2, 0]) - ) - - quantum_state_out_re[i2, 0] = ( - (gate_set_re[gate_index, 1, 0] * quantum_state_re[i1, 0]) - - (gate_set_im[gate_index, 1, 0] * quantum_state_im[i1, 0]) - + (gate_set_re[gate_index, 1, 1] * quantum_state_re[i2, 0]) - - (gate_set_im[gate_index, 1, 1] * quantum_state_im[i2, 0]) - ) - 
- quantum_state_out_im[i2, 0] = ( - (gate_set_re[gate_index, 1, 0] * quantum_state_im[i1, 0]) - - (gate_set_im[gate_index, 1, 0] * quantum_state_re[i1, 0]) - + (gate_set_re[gate_index, 1, 1] * quantum_state_im[i2, 0]) - - (gate_set_im[gate_index, 1, 1] * quantum_state_re[i2, 0]) - ) + + # # CPU implementation + # for i in range(size_of_state_vector): + # quantum_state_out_re[i] = quantum_state_re[i] + # quantum_state_out_im[i] = quantum_state_im[i] + + # GPU implementation + # Parallel copy of the entire state vector + if global_i < size_of_state_vector: + quantum_state_out_re[global_i] = quantum_state_re[global_i] + quantum_state_out_im[global_i] = quantum_state_im[global_i] + + # Synchronize all threads to ensure the copy is complete before proceeding. + barrier() + + # if global_i > 0: + # return # Only the first thread in the block will execute the function + + # size_of_block: Int = size_of_half_block << target_qubits_count + # for block_start in range(0, size_of_state_vector, size_of_block): + # # print("block_start:", block_start) + # for offset in range(size_of_half_block): + # # print("offset:", offset) + # i1: Int = ( + # block_start | offset + # ) # faster than, but equivalent to, block_start + offset + + # if (i1 & inclusion_mask) != desired_value_mask: + # continue # skip this iteration if the control bits do not match + + # i2: Int = ( + # i1 | size_of_half_block + # ) # equivalent to i1 + size_of_half_block + + # quantum_state_out_re[i1] = ( + # (gate_set_re[gate_index, 0, 0] * quantum_state_re[i1]) + # - (gate_set_im[gate_index, 0, 0] * quantum_state_im[i1]) + # + (gate_set_re[gate_index, 0, 1] * quantum_state_re[i2]) + # - (gate_set_im[gate_index, 0, 1] * quantum_state_im[i2]) + # ) + + # quantum_state_out_im[i1] = ( + # (gate_set_re[gate_index, 0, 0] * quantum_state_im[i1]) + # + (gate_set_im[gate_index, 0, 0] * quantum_state_re[i1]) + # + (gate_set_re[gate_index, 0, 1] * quantum_state_im[i2]) + # + (gate_set_im[gate_index, 0, 1] * 
quantum_state_re[i2]) + # ) + + # quantum_state_out_re[i2] = ( + # (gate_set_re[gate_index, 1, 0] * quantum_state_re[i1]) + # - (gate_set_im[gate_index, 1, 0] * quantum_state_im[i1]) + # + (gate_set_re[gate_index, 1, 1] * quantum_state_re[i2]) + # - (gate_set_im[gate_index, 1, 1] * quantum_state_im[i2]) + # ) + + # quantum_state_out_im[i2] = ( + # (gate_set_re[gate_index, 1, 0] * quantum_state_im[i1]) + # + (gate_set_im[gate_index, 1, 0] * quantum_state_re[i1]) + # + (gate_set_re[gate_index, 1, 1] * quantum_state_im[i2]) + # + (gate_set_im[gate_index, 1, 1] * quantum_state_re[i2]) + # ) + + # Parallel Gate Application + # Constants used by all threads + # target_qubits_count: Int = count_trailing_zeros(gate_size) + # size_of_half_block: Int = 1 << target_qubit + + # Each thread works on one index `global_i`. + # We only need to proceed if the thread is within the state vector bounds. + if global_i < quantum_state_size: + # The core parallelization pattern: + # Only threads whose index has a '0' at the target_qubit position will do the work. + # These are the 'i1' indices. + is_i1_thread = (global_i & size_of_half_block) == 0 + + if is_i1_thread: + # This thread is responsible for an `i1` index. + i1: Int = global_i + + # Check if the control bit condition is met for this pair. + if (i1 & inclusion_mask) == desired_value_mask: + # The condition is met, so we apply the gate. + # First, find the partner index `i2`. 
+ i2: Int = i1 | size_of_half_block + + # Fetch state vector values for the pair (ψ1, ψ2) + psi1_re = quantum_state_re[i1] + psi1_im = quantum_state_im[i1] + psi2_re = quantum_state_re[i2] + psi2_im = quantum_state_im[i2] + + # Fetch gate matrix elements (g00, g01, g10, g11) + g00_re = gate_set_re[gate_index, 0, 0] + g00_im = gate_set_im[gate_index, 0, 0] + g01_re = gate_set_re[gate_index, 0, 1] + g01_im = gate_set_im[gate_index, 0, 1] + g10_re = gate_set_re[gate_index, 1, 0] + g10_im = gate_set_im[gate_index, 1, 0] + g11_re = gate_set_re[gate_index, 1, 1] + g11_im = gate_set_im[gate_index, 1, 1] + + # Perform the 2x2 matrix-vector multiplication: + # [ out1 ] = [ g00 g01 ] [ psi1 ] + # [ out2 ] [ g10 g11 ] [ psi2 ] + + # Calculate out1 = g00 * psi1 + g01 * psi2 + # Real part: (g00_re*psi1_re - g00_im*psi1_im) + (g01_re*psi2_re - g01_im*psi2_im) + quantum_state_out_re[i1] = ( + g00_re * psi1_re - g00_im * psi1_im + ) + (g01_re * psi2_re - g01_im * psi2_im) + + # Imaginary part: (g00_re*psi1_im + g00_im*psi1_re) + (g01_re*psi2_im + g01_im*psi2_re) + # NOTE: This uses the standard complex multiplication rule (ad+bc). + quantum_state_out_im[i1] = ( + g00_re * psi1_im + g00_im * psi1_re + ) + (g01_re * psi2_im + g01_im * psi2_re) + + # Calculate out2 = g10 * psi1 + g11 * psi2 + # Real part: (g10_re*psi1_re - g10_im*psi1_im) + (g11_re*psi2_re - g11_im*psi2_im) + quantum_state_out_re[i2] = ( + g10_re * psi1_re - g10_im * psi1_im + ) + (g11_re * psi2_re - g11_im * psi2_im) + + # Imaginary part: (g10_re*psi1_im + g10_im*psi1_re) + (g11_re*psi2_im + g11_im*psi2_re) + quantum_state_out_im[i2] = ( + g10_re * psi1_im + g10_im * psi1_re + ) + (g11_re * psi2_im + g11_im * psi2_re) + # If control bits do not match, we do nothing. The values already + # copied to quantum_state_out are correct. 
+ + +# # TODO one day, but maybe it will become memory bound if we do that since we have to create +# # intermediary values for complex multiplications +# fn qubit_wise_multiply_gpu_3[ +# number_control_bits: Int +# ]( +# gate_set: LayoutTensor[mut=False, dtype, gate_set_1qubit_vectorized_layout], +# gate_index: Int, +# gate_size: Int, +# target_qubit: Int, +# quantum_state_re: LayoutTensor[ +# mut=True, dtype, state_vector_3qubits_layout +# ], +# quantum_state_im: LayoutTensor[ +# mut=True, dtype, state_vector_3qubits_layout +# ], +# number_qubits: Int, +# quantum_state_size: Int, +# quantum_state_out_re: LayoutTensor[ +# mut=True, dtype, state_vector_3qubits_layout +# ], +# quantum_state_out_im: LayoutTensor[ +# mut=True, dtype, state_vector_3qubits_layout +# ], +# # control_bits: LayoutTensor[mut=True, DType.int32, control_bits_layout], +# control_bits_circuit: LayoutTensor[ +# mut=False, DType.int32, circuit_control_bits_layout +# ], +# current_control_gate_circuit: LayoutTensor[ +# mut=True, DType.int32, Layout.row_major(1) +# ], +# ) -> None: +# """Applies a quantum gate to specific qubits in the quantum state. + +# It will apply the gate starting from the target qubit assuming that the other +# qubits that the gate acts on are following the target qubit. + +# Parameters: +# number_control_bits: Number of control bits. + +# Args: +# gate_set: All unique gates applied in the circuit, real and imaginary parts +# to be treated as a SIMD vector. +# gate_index: Index of the gate in the gate set to apply. +# gate_size: Size of the gate (2^n, where n is the number of qubits the gate acts on). +# target_qubit: The index of the target qubit to apply the gate to. +# quantum_state_re: Real part of the quantum state vector. +# quantum_state_im: Imaginary part of the quantum state vector. +# number_qubits: Total number of qubits in the quantum state. +# quantum_state_size: Size of the quantum state vector (2^number_qubits). 
+# quantum_state_out_re: Output real part of the quantum state vector after applying the gate. +# quantum_state_out_im: Output imaginary part of the quantum state vector after applying the gate. +# control_bits_circuit: Control bits, where each control bit contains +# [wire_index, flag] (1 for control, 0 for anti-control). +# current_control_gate_circuit: Current control gate circuit index, +# used to track the position in the control_bits_circuit. +# """ +# print("Inside qubit_wise_multiply_gpu") +# target_qubits_count: Int = count_trailing_zeros(gate_size) +# if (target_qubit < 0) or (target_qubit >= number_qubits): +# print( +# "Error: target_qubit index out of bounds. Must be between 0 and", +# number_qubits - 1, +# ) +# print("Skipping gate application.") +# return + +# print("AAAAA") +# inclusion_mask: Int = 0 +# desired_value_mask: Int = 0 + +# @parameter +# for i in range(number_control_bits): +# print("before") +# wire_index, flag = ( +# control_bits_circuit[Int(current_control_gate_circuit[0]), i, 0], +# control_bits_circuit[Int(current_control_gate_circuit[0]), i, 1], +# ) +# current_control_gate_circuit[0] += 1 +# print("after") +# bit: Int = 1 << Int( +# wire_index +# ) # efficient way of computing 2^wire_index +# inclusion_mask |= bit # turn on the bit +# if flag == 1: +# desired_value_mask |= bit # turn on the bit + +# print("BBBBB") +# size_of_state_vector: Int = quantum_state_size +# size_of_half_block: Int = 1 << target_qubit # 2^target_qubit +# size_of_block: Int = size_of_half_block << target_qubits_count + +# print("CCCC") +# # copies all amplitudes from quantum_state to quantum_state_out +# for i in range(size_of_state_vector): +# quantum_state_out_re[i] = quantum_state_re[i] +# quantum_state_out_im[i] = quantum_state_im[i] + +# print("before loop") +# for block_start in range(0, size_of_state_vector, size_of_block): +# # print("block_start:", block_start) +# for offset in range(size_of_half_block): +# # print("offset:", offset) +# i1: Int = ( 
+# block_start | offset +# ) # faster than, but equivalent to, block_start + offset + +# if (i1 & inclusion_mask) != desired_value_mask: +# continue # skip this iteration if the control bits do not match + +# i2: Int = ( +# i1 | size_of_half_block +# ) # equivalent to i1 + size_of_half_block + +# print("i1:", i1, "i2:", i2) + +# # new_state_vector[i1] = ( +# # gate[0, 0] * quantum_state[i1] + gate[0, 1] * quantum_state[i2] +# # ) +# # new_state_vector[i2] = ( +# # gate[1, 0] * quantum_state[i1] + gate[1, 1] * quantum_state[i2] +# # ) + +# right_part = gate_set[gate_index, 0, 0] * quantum_state_im[i1] +# right_part_re = right_part +# right_part_re[1] = 0 +# right_part_im = right_part +# right_part_im[0] = 0 + +# quantum_state_out_re[i1] = ( +# (gate_set[gate_index, 0, 0] * quantum_state_re[i1]) +# - right_part_re +# + right_part_im +# ) diff --git a/src/base/qubits_operations.mojo b/src/base/qubits_operations.mojo index f39e221..91168b5 100644 --- a/src/base/qubits_operations.mojo +++ b/src/base/qubits_operations.mojo @@ -320,6 +320,74 @@ fn qubit_wise_multiply( return new_state_vector +fn qubit_wise_multiply_inplace( + gate: ComplexMatrix, + target_qubit: Int, + mut quantum_state_in: StateVector, + mut quantum_state_out: StateVector, + control_bits: List[List[Int]] = [], +) -> None: + """Applies a quantum gate to specific qubits in the quantum state. + + It will apply the gate starting from the target qubit assuming that the other + qubits that the gate acts on are following the target qubit. + + Args: + gate: The 2x2 matrix representing the quantum gate. + target_qubit: The index of the qubit on which the gate is applied. + quantum_state_in: The current state of the quantum system. + quantum_state_out: The state vector to store the result of the gate application. + control_bits: A list of control bits, where each bit is represented as + [wire_index, flag]. If flag is 1, it is a control bit; if 0, + it is an anti-control bit. 
+ """ + gate_size: Int = gate.size() + target_qubits_count: Int = count_trailing_zeros(gate_size) + if (target_qubit < 0) or (target_qubit >= quantum_state_in.number_qubits()): + print( + "Error: target_qubit index out of bounds. Must be between 0 and", + quantum_state_in.number_qubits() - 1, + ) + print("Skipping gate application.") + return + + inclusion_mask: Int = 0 + desired_value_mask: Int = 0 + for control in control_bits: + wire_index, flag = control[0], control[1] + bit: Int = 1 << wire_index # efficient way of computing 2^wire_index + inclusion_mask |= bit # turn on the bit + if flag == 1: + desired_value_mask |= bit # turn on the bit + + size_of_state_vector: Int = quantum_state_in.size() + size_of_half_block: Int = 1 << target_qubit # 2^target_qubit + size_of_block: Int = size_of_half_block << target_qubits_count + quantum_state_out = quantum_state_in + + for block_start in range(0, size_of_state_vector, size_of_block): + for offset in range(size_of_half_block): + i1: Int = ( + block_start | offset + ) # faster than, but equivalent to, block_start + offset + + if (i1 & inclusion_mask) != desired_value_mask: + continue # skip this iteration if the control bits do not match + + i2: Int = ( + i1 | size_of_half_block + ) # equivalent to i1 + size_of_half_block + + quantum_state_out[i1] = ( + gate[0, 0] * quantum_state_in[i1] + + gate[0, 1] * quantum_state_in[i2] + ) + quantum_state_out[i2] = ( + gate[1, 0] * quantum_state_in[i1] + + gate[1, 1] * quantum_state_in[i2] + ) + + # fn invert_gate_endian() #TODO From 0a8c5a41c477fda303b671920b202bea18df900a Mon Sep 17 00:00:00 2001 From: ttrenty <154608953+ttrenty@users.noreply.github.com> Date: Sun, 29 Jun 2025 03:35:29 -0600 Subject: [PATCH 4/7] feat: clean gpu code + add gpu tests + fix benchmarks + add plotting --- .gitignore | 1 + benchmarks/all_benchmarks.mojo | 26 +- benchmarks/bench_qubit_wise_multiply.mojo | 434 +-------- benchmarks/bench_qubit_wise_multiply_gpu.mojo | 378 ++++++++ 
benchmarks/plot_results.py | 104 ++ examples/gpu_examples.mojo | 709 ++------------ examples/main.mojo | 7 +- pixi.lock | 899 ++++++++++++++++++ pixi.toml | 9 + src/base/gpu/qubits_operations.mojo | 518 +++------- tests/base/test_gpu_qubits_operations.mojo | 635 +++++++++++++ tests/base/test_qubit_operations.mojo | 34 +- tests/base/testing_state_vector.mojo | 33 + 13 files changed, 2316 insertions(+), 1471 deletions(-) create mode 100644 benchmarks/bench_qubit_wise_multiply_gpu.mojo create mode 100644 benchmarks/plot_results.py create mode 100644 tests/base/test_gpu_qubits_operations.mojo diff --git a/.gitignore b/.gitignore index 49dabf2..1ae1f98 100644 --- a/.gitignore +++ b/.gitignore @@ -6,6 +6,7 @@ # Binary files build/ build/* +data/ main *.mojopkg diff --git a/benchmarks/all_benchmarks.mojo b/benchmarks/all_benchmarks.mojo index 6fefb3c..a03ebfb 100644 --- a/benchmarks/all_benchmarks.mojo +++ b/benchmarks/all_benchmarks.mojo @@ -4,21 +4,41 @@ from bench_simulate_random_circuit import bench_simulate_random_circuit from bench_qubit_wise_multiply import ( bench_qubit_wise_multiply, bench_qubit_wise_multiply_inplace, - bench_qubit_wise_multiply_inplace_gpu, bench_qubit_wise_multiply_extended, ) +from bench_qubit_wise_multiply_gpu import ( + bench_qubit_wise_multiply_inplace_gpu, +) def main(): print("Running all benchmarks...") # bench_qubit_wise_multiply() - # bench_qubit_wise_multiply_inplace() + bench_qubit_wise_multiply_inplace[ + min_number_qubits=5, + max_number_qubits=25, + number_qubits_step_size=2, + min_number_layers=5, + max_number_layers=3000, + number_layers_step_size=400, + fixed_number_qubits=10, + fixed_number_layers=5, + ]() @parameter if not has_accelerator(): print("No compatible GPU found") else: - bench_qubit_wise_multiply_inplace_gpu() + bench_qubit_wise_multiply_inplace_gpu[ + min_number_qubits=5, + max_number_qubits=25, + number_qubits_step_size=2, + min_number_layers=5, + max_number_layers=3000, + number_layers_step_size=400, + 
fixed_number_qubits=10, + fixed_number_layers=5, + ]() # bench_qubit_wise_multiply_extended() # bench_simulate_random_circuit() diff --git a/benchmarks/bench_qubit_wise_multiply.mojo b/benchmarks/bench_qubit_wise_multiply.mojo index bd723d8..ef6d3b7 100644 --- a/benchmarks/bench_qubit_wise_multiply.mojo +++ b/benchmarks/bench_qubit_wise_multiply.mojo @@ -5,6 +5,7 @@ from layout import Layout, LayoutTensor, IntTuple from benchmark import Bench, BenchConfig, Bencher, BenchId, keep from pathlib import Path +from os import makedirs import random @@ -30,8 +31,6 @@ from qlabs.base import ( partial_trace, ) -from qlabs.base.gpu import qubit_wise_multiply_inplace_gpu - from qlabs.abstractions import ( GateCircuit, StateVectorSimulator, @@ -41,38 +40,6 @@ from qlabs.abstractions import ( ) -# alias BLOCKS_PER_GRID = 1 -# alias THREADS_PER_BLOCK = (1, 1) -alias dtype = DType.float32 - -alias GATE_SIZE = 2 -alias NUMBER_CONTROL_BITS = 1 -# TODO have NUMBER_CONTROL_BITS be a list defining each gates specific control bits count -alias CIRCUIT_NUMBER_CONTROL_GATES = 1 -alias circuit_control_bits_layout = Layout.row_major( - CIRCUIT_NUMBER_CONTROL_GATES, NUMBER_CONTROL_BITS, 2 -) - -alias gate_1qubit_layout = Layout.row_major(GATE_SIZE, GATE_SIZE) -alias STATE_VECTOR_SIZE = 8 -alias state_vector_3qubits_layout = Layout.row_major(STATE_VECTOR_SIZE) -alias control_bits_layout = Layout.row_major(NUMBER_CONTROL_BITS, 2) - -alias gate_set_dic: Dict[String, Int] = { - Hadamard.symbol: 0, - PauliX.symbol: 1, - PauliY.symbol: 2, - PauliZ.symbol: 3, -} -alias GATE_SET_SIZE = 4 -alias gate_set_1qubit_layout = Layout.row_major( - GATE_SET_SIZE, GATE_SIZE, GATE_SIZE -) -alias gate_set_1qubit_vectorized_layout = Layout.row_major( - GATE_SET_SIZE, GATE_SIZE, GATE_SIZE, 2 -) - - @parameter @always_inline fn benchmark_qubit_wise_multiply[ @@ -237,321 +204,6 @@ fn benchmark_qubit_wise_multiply_inplace[ b.iter_custom[qubit_wise_multiply_inplace_workflow](bench_ctx) -@parameter -@always_inline -fn 
benchmark_qubit_wise_multiply_inplace_gpu[ - num_qubits: Int, number_layers: Int -](mut b: Bencher) raises: - # gates_list: List[Gate] = [Hadamard, PauliX, PauliY, PauliZ] - - # indexes: UnsafePointer[Int8] = UnsafePointer[Int8].alloc( - # number_layers * 1 * num_qubits - # ) - # random.seed() # Seed on current time - # random.randint( - # indexes, number_layers * 2 * num_qubits, 0, len(gates_list) - 1 - # ) - - bench_ctx = DeviceContext() - - alias state_vector_size = 1 << num_qubits - alias state_vector_layout = Layout.row_major(state_vector_size) - - alias total_threads = 2 * state_vector_size - - # alias max_threads_per_block = 1024 - alias sm_count = bench_ctx.device_info.sm_count - alias max_blocks_per_multiprocessor = bench_ctx.device_info.max_blocks_per_multiprocessor - - # alias max_number_blocks = sm_count * max_blocks_per_multiprocessor - alias max_number_blocks = 128 - alias max_threads_per_block = bench_ctx.device_info.max_thread_block_size - - print("state_vector_size:", state_vector_size) - - # try: - # print("BEFORE:") - # (free, total) = bench_ctx.get_memory_info() - # print("Free memory:", free / (1024 * 1024), "MB") - # print("Total memory:", total / (1024 * 1024), "MB") - # except: - # print("Failed to get memory information") - - @parameter - @always_inline - fn qubit_wise_multiply_inplace_gpu_workflow(ctx: DeviceContext) raises: - """Simulates on GPU a random quantum circuit with the specified number of qubits and layers. 
- """ - # gate_set: List[Gate] = [Hadamard, PauliX, PauliY, PauliZ] - - try: - print("1.BEFORE ALLOCATING:") - (free, total) = ctx.get_memory_info() - print("Free memory:", free / (1024 * 1024), "MB") - print("Total memory:", total / (1024 * 1024), "MB") - except: - print("Failed to get memory information") - - # blocks_per_grid = ( - # total_threads + max_threads_per_block - 1 - # ) // max_threads_per_block - - # if blocks_per_grid >= max_number_blocks: - # blocks_per_grid = max_number_blocks - 1 - - blocks_per_grid = 1 - - threads_per_block = ( - 1, - 1, - 1, - ) - - # @parameter - # if total_threads < max_threads_per_block: - # threads_per_block = ( - # total_threads, - # 1, - # 1, - # ) # 1D block of threads - - # print( - # "blocks_per_grid:", - # blocks_per_grid, - # "max_number_blocks:", - # max_number_blocks, - # ) - # print("threads_per_block[0]:", threads_per_block[0]) - - # var control_bits_list: List[List[List[Int]]] = [ - # [[1, 1]], - # # [[1, 1]], - # ] - - # control_bits_list: List[List[List[Int]]] = [] - - # -- Create GPU variables -- # - - host_quantum_state_re = ctx.enqueue_create_host_buffer[dtype]( - state_vector_size - ) - host_quantum_state_im = ctx.enqueue_create_host_buffer[dtype]( - state_vector_size - ) - - host_gate_set_re = ctx.enqueue_create_host_buffer[dtype]( - GATE_SET_SIZE * GATE_SIZE * GATE_SIZE - ) - host_gate_set_im = ctx.enqueue_create_host_buffer[dtype]( - GATE_SET_SIZE * GATE_SIZE * GATE_SIZE - ) - - # host_control_bits_circuit = ctx.enqueue_create_host_buffer[DType.int32]( - # CIRCUIT_NUMBER_CONTROL_GATES * NUMBER_CONTROL_BITS * 2 - # ) - - quantum_state: StateVector = StateVector.from_bitstring("000") - - # Wait for host buffers to be ready - ctx.synchronize() - - # -- Fill host buffers -- # - - for i in range(state_vector_size): - host_quantum_state_re[i] = quantum_state[i].re - host_quantum_state_im[i] = quantum_state[i].im - - # for i in range(GATE_SET_SIZE): - # gate = gate_set[i] - # for j in range(GATE_SIZE): - # for 
k in range(GATE_SIZE): - # index = gate_set_1qubit_layout( - # IntTuple(i, j, k) - # ) # Get the index in the 1D buffer - # host_gate_set_re[index] = gate[j, k].re - # host_gate_set_im[index] = gate[j, k].im - - # for i in range(CIRCUIT_NUMBER_CONTROL_GATES): - # for j in range(NUMBER_CONTROL_BITS): - # for k in range(2): - # index = circuit_control_bits_layout(IntTuple(i, j, k)) - # host_control_bits_circuit[index] = control_bits_list[i][j][ - # k - # ] - - # -- Copy host buffers to device buffers -- # - quantum_state_re = ctx.enqueue_create_buffer[dtype](state_vector_size) - quantum_state_im = ctx.enqueue_create_buffer[dtype](state_vector_size) - - gate_set_re = ctx.enqueue_create_buffer[dtype]( - GATE_SET_SIZE * GATE_SIZE * GATE_SIZE - ) - gate_set_im = ctx.enqueue_create_buffer[dtype]( - GATE_SET_SIZE * GATE_SIZE * GATE_SIZE - ) - - control_bits_circuit = ctx.enqueue_create_buffer[DType.int32]( - CIRCUIT_NUMBER_CONTROL_GATES * NUMBER_CONTROL_BITS * 2 - ) - current_control_gate_circuit = ctx.enqueue_create_buffer[DType.int32](1) - - # Create other buffers for functions - - quantum_state_out_re = ctx.enqueue_create_buffer[dtype]( - state_vector_size - ) - quantum_state_out_im = ctx.enqueue_create_buffer[dtype]( - state_vector_size - ) - - quantum_state_re.enqueue_copy_from(host_quantum_state_re) - quantum_state_im.enqueue_copy_from(host_quantum_state_im) - - # gate_set_re.enqueue_copy_from(host_gate_set_re) - # gate_set_im.enqueue_copy_from(host_gate_set_im) - ctx.enqueue_memset(gate_set_re, 0.0) - ctx.enqueue_memset(gate_set_im, 0.0) - - # control_bits_circuit.enqueue_copy_from(host_control_bits_circuit) - ctx.enqueue_memset(control_bits_circuit, 0) - - # TODO report that this create a runtime error only in this context not when - # running the same code in a standalone script - # ctx.enqueue_memset(current_control_gate_circuit, 0.0) - ctx.enqueue_memset(current_control_gate_circuit, 0) - ctx.enqueue_memset(quantum_state_out_re, 0.0) - 
ctx.enqueue_memset(quantum_state_out_im, 0.0) - - # -- Create layout tensors for GPU operations -- # - gate_set_re_tensor = LayoutTensor[ - mut=False, dtype, gate_set_1qubit_layout - ](gate_set_re.unsafe_ptr()) - gate_set_im_tensor = LayoutTensor[ - mut=False, dtype, gate_set_1qubit_layout - ](gate_set_im.unsafe_ptr()) - - quantum_state_re_tensor = LayoutTensor[ - mut=True, dtype, state_vector_layout - ](quantum_state_re.unsafe_ptr()) - quantum_state_im_tensor = LayoutTensor[ - mut=True, dtype, state_vector_layout - ](quantum_state_im.unsafe_ptr()) - - quantum_state_out_re_tensor = LayoutTensor[ - mut=True, dtype, state_vector_layout - ](quantum_state_out_re.unsafe_ptr()) - quantum_state_out_im_tensor = LayoutTensor[ - mut=True, dtype, state_vector_layout - ](quantum_state_out_im.unsafe_ptr()) - - control_bits_circuit_tensor = LayoutTensor[ - mut=False, DType.int32, circuit_control_bits_layout - ](control_bits_circuit.unsafe_ptr()) - current_control_gate_circuit_tensor = LayoutTensor[ - mut=True, DType.int32, Layout.row_major(1) - ](current_control_gate_circuit.unsafe_ptr()) - - # -- Apply circuit operations -- # - - ctx.synchronize() - try: - print("2.AFTER ALLOCATING:") - (free, total) = ctx.get_memory_info() - print("Free memory:", free / (1024 * 1024), "MB") - print("Total memory:", total / (1024 * 1024), "MB") - except: - print("Failed to get memory information") - - # print("HERE") - current_state = 0 - for layer in range(number_layers): - # print("Layer:", layer, "out of", number_layers) - for qubit in range(num_qubits): - # print("Applying gate on qubit:", i, "of", num_qubits) - # print( - # "gate symbol: ", - # gates_list[Int(indexes[layer * num_qubits + i])].symbol, - # ) - # print( - # "gate index: ", - # gate_set_dic[ - # gates_list[Int(indexes[layer * num_qubits + i])].symbol - # ], - # ) - if current_state == 0: - ctx.enqueue_function[ - qubit_wise_multiply_inplace_gpu[number_control_bits=0] - ]( - gate_set_re_tensor, - gate_set_im_tensor, - 
gate_set_dic[Hadamard.symbol], - # gate_set_dic[ - # gates_list[ - # Int(indexes[layer * num_qubits + qubit]) - # ].symbol - # ], - GATE_SIZE, - qubit, # target_qubit - quantum_state_re_tensor, - quantum_state_im_tensor, - num_qubits, # number_qubits - state_vector_size, # quantum_state_size - quantum_state_out_re_tensor, - quantum_state_out_im_tensor, - control_bits_circuit_tensor, - current_control_gate_circuit_tensor, - grid_dim=blocks_per_grid, - block_dim=threads_per_block, - ) - current_state = 1 - else: - ctx.enqueue_function[ - qubit_wise_multiply_inplace_gpu[number_control_bits=0] - ]( - gate_set_re_tensor, - gate_set_im_tensor, - gate_set_dic[Hadamard.symbol], - # gate_set_dic[ - # gates_list[ - # Int(indexes[layer * num_qubits + qubit]) - # ].symbol - # ], - GATE_SIZE, - qubit, # target_qubit - quantum_state_out_re_tensor, - quantum_state_out_im_tensor, - num_qubits, # number_qubits - state_vector_size, # quantum_state_size - quantum_state_re_tensor, - quantum_state_im_tensor, - control_bits_circuit_tensor, - current_control_gate_circuit_tensor, - grid_dim=blocks_per_grid, - block_dim=threads_per_block, - ) - current_state = 0 - - keep(quantum_state_re.unsafe_ptr()) - keep(quantum_state_im.unsafe_ptr()) - keep(quantum_state_out_re.unsafe_ptr()) - keep(quantum_state_out_im.unsafe_ptr()) - keep(gate_set_re.unsafe_ptr()) - keep(gate_set_im.unsafe_ptr()) - keep(control_bits_circuit.unsafe_ptr()) - keep(current_control_gate_circuit.unsafe_ptr()) - - ctx.synchronize() - try: - print("3. 
AFTER AUTOMATIC FREE:") - (free, total) = ctx.get_memory_info() - print("Free memory:", free / (1024 * 1024), "MB") - print("Total memory:", total / (1024 * 1024), "MB") - except: - print("Failed to get memory information") - - b.iter_custom[qubit_wise_multiply_inplace_gpu_workflow](bench_ctx) - - @parameter @always_inline fn benchmark_qubit_wise_multiply_extended[ @@ -709,22 +361,25 @@ def bench_qubit_wise_multiply[ def bench_qubit_wise_multiply_inplace[ - # max_number_qubits: Int = 10, - # max_number_layers: Int = 20, - # fixed_number_qubits: Int = 5, - # fixed_number_layers: Int = 10, - max_number_qubits: Int = 16, + min_number_qubits: Int = 15, + max_number_qubits: Int = 25, + number_qubits_step_size: Int = 1, + min_number_layers: Int = 1, max_number_layers: Int = 2000, + number_layers_step_size: Int = 200, fixed_number_qubits: Int = 5, - fixed_number_layers: Int = 200, + fixed_number_layers: Int = 2, ](): print("Running qubit_wise_multiply_inplace() Benchmarks...") print("-" * 80) bench_config = BenchConfig(max_iters=10, min_warmuptime_secs=0.2) bench = Bench(bench_config) + makedirs("data", exist_ok=True) @parameter - for number_qubits in range(1, max_number_qubits + 1, 5): + for number_qubits in range( + min_number_qubits, max_number_qubits + 1, number_qubits_step_size + ): bench.bench_function[ benchmark_qubit_wise_multiply_inplace[ number_qubits, fixed_number_layers @@ -739,62 +394,18 @@ def bench_qubit_wise_multiply_inplace[ ) ) - @parameter - for number_layers in range(1, max_number_layers + 1, 200): - bench.bench_function[ - benchmark_qubit_wise_multiply_inplace[ - fixed_number_qubits, number_layers - ] - ]( - BenchId( - "qubit_wise_multiply_inplace_" - + String(fixed_number_qubits) - + "q_" - + String(number_layers) - + "l" - ) - ) + # print(bench) + bench.config.out_file = Path("data/qubit_wise_multiply_inplace_qubits.csv") + bench.dump_report() - print(bench) - - # bench.config.out_file = Path("out.csv") - # bench.dump_report() - - 
print("qubit_wise_multiply_inplace() Benchmarks completed!") - print("-" * 80) - - -def bench_qubit_wise_multiply_inplace_gpu[ - max_number_qubits: Int = 25, - max_number_layers: Int = 2000, - fixed_number_qubits: Int = 5, - fixed_number_layers: Int = 2, -](): - print("Running qubit_wise_multiply_inplace() Benchmarks...") - print("-" * 80) - bench_config = BenchConfig(max_iters=10, min_warmuptime_secs=0.2) bench = Bench(bench_config) @parameter - for number_qubits in range(15, max_number_qubits + 1, 1): - bench.bench_function[ - benchmark_qubit_wise_multiply_inplace_gpu[ - number_qubits, fixed_number_layers - ] - ]( - BenchId( - "qubit_wise_multiply_inplace_" - + String(number_qubits) - + "q_" - + String(fixed_number_layers) - + "l" - ) - ) - - @parameter - for number_layers in range(1, max_number_layers + 1, 200): + for number_layers in range( + min_number_layers, max_number_layers + 1, number_layers_step_size + ): bench.bench_function[ - benchmark_qubit_wise_multiply_inplace_gpu[ + benchmark_qubit_wise_multiply_inplace[ fixed_number_qubits, number_layers ] ]( @@ -807,12 +418,11 @@ def bench_qubit_wise_multiply_inplace_gpu[ ) ) - print(bench) + # print(bench) + bench.config.out_file = Path("data/qubit_wise_multiply_inplace_layers.csv") + bench.dump_report() - # bench.config.out_file = Path("out.csv") - # bench.dump_report() - - print("qubit_wise_multiply_inplace_gpu() Benchmarks completed!") + print("qubit_wise_multiply_inplace() Benchmarks completed!") print("-" * 80) diff --git a/benchmarks/bench_qubit_wise_multiply_gpu.mojo b/benchmarks/bench_qubit_wise_multiply_gpu.mojo new file mode 100644 index 0000000..3b37ac4 --- /dev/null +++ b/benchmarks/bench_qubit_wise_multiply_gpu.mojo @@ -0,0 +1,378 @@ +from gpu.host import DeviceContext + +from layout import Layout, LayoutTensor, IntTuple + +from benchmark import Bench, BenchConfig, Bencher, BenchId, keep + +from pathlib import Path +from os import makedirs + +import random + +from qlabs.base import ( + StateVector, 
+ Gate, + Hadamard, + PauliX, + PauliY, + PauliZ, +) + +from qlabs.base.gpu import qubit_wise_multiply_inplace_gpu + +from qlabs.abstractions import ( + GateCircuit, + StateVectorSimulator, + ShowAfterEachGate, + ShowAfterEachLayer, + ShowOnlyEnd, +) + + +alias dtype = DType.float32 + +alias GATE_SIZE = 2 +alias NUMBER_CONTROL_BITS = 1 + + +@parameter +@always_inline +fn benchmark_qubit_wise_multiply_inplace_gpu[ + num_qubits: Int, number_layers: Int +](mut b: Bencher) raises: + # gates_list: List[Gate] = [Hadamard, PauliX, PauliY, PauliZ] + + # indexes: UnsafePointer[Int8] = UnsafePointer[Int8].alloc( + # number_layers * 1 * num_qubits + # ) + # random.seed() # Seed on current time + # random.randint( + # indexes, number_layers * 2 * num_qubits, 0, len(gates_list) - 1 + # ) + + @parameter + @always_inline + fn qubit_wise_multiply_inplace_gpu_workflow(ctx: DeviceContext) raises: + """Simulates on GPU a random quantum circuit with the specified number of qubits and layers. + """ + alias circuit_number_control_gates = 2 + alias circuit_control_bits_layout = Layout.row_major( + circuit_number_control_gates, NUMBER_CONTROL_BITS, 2 + ) + + gate_set: List[Gate] = [Hadamard, PauliX, PauliZ] + gate_set_dic: Dict[String, Int] = { + Hadamard.symbol: 0, + PauliX.symbol: 1, + PauliZ.symbol: 2, + } + alias gate_set_size = 3 + alias gate_set_1qubit_layout = Layout.row_major( + gate_set_size, GATE_SIZE, GATE_SIZE + ) + + alias state_vector_size = 1 << num_qubits + alias state_vector_layout = Layout.row_major(state_vector_size) + + alias total_threads = state_vector_size + + alias max_threads_per_block = ctx.device_info.max_thread_block_size + # alias max_threads_per_block = 1024 # Maximum threads per block in CUDA + + # alias sm_count = ctx.device_info.sm_count + # alias max_blocks_per_multiprocessor = ctx.device_info.max_blocks_per_multiprocessor + # alias max_number_blocks = sm_count * max_blocks_per_multiprocessor + + alias blocks_per_grid = ( + total_threads + 
max_threads_per_block - 1 + ) // max_threads_per_block + + threads_per_block = ( + max_threads_per_block, + 1, + 1, + ) + + if total_threads < max_threads_per_block: + threads_per_block = ( + total_threads, + 1, + 1, + ) + + # alias blocks_per_grid = (1) + + # threads_per_block = ( + # 1, + # 1, + # 1, + # ) + + # print("vector size:", state_vector_size) + # print("blocks per grid:", blocks_per_grid) + # print("threads per block[0]:", threads_per_block[0]) + + var control_bits_list: List[List[List[Int]]] = [ + [[1, 1]], # Control on qubit 1 and is control because flag=1 + [[1, 1]], # Control on qubit 1 and is control because flag=1 + ] + + # -- Create GPU variables -- # + # These don't need to be initialized to zero, they will be filled later + + host_quantum_state_re = ctx.enqueue_create_host_buffer[dtype]( + state_vector_size + ) + host_quantum_state_im = ctx.enqueue_create_host_buffer[dtype]( + state_vector_size + ) + + host_gate_set_re = ctx.enqueue_create_host_buffer[dtype]( + gate_set_size * GATE_SIZE * GATE_SIZE + ) + host_gate_set_im = ctx.enqueue_create_host_buffer[dtype]( + gate_set_size * GATE_SIZE * GATE_SIZE + ) + + host_control_bits_circuit = ctx.enqueue_create_host_buffer[DType.int32]( + circuit_number_control_gates * NUMBER_CONTROL_BITS * 2 + ) + + # -- Initialize the quantum circuit to the |000⟩ state -- # + quantum_state: StateVector = StateVector.from_bitstring( + "0" * num_qubits + ) + # print("Initial quantum state:\n", quantum_state) + + # Wait for host buffers to be ready + ctx.synchronize() + + # -- Fill host buffers -- # + + for i in range(state_vector_size): + host_quantum_state_re[i] = quantum_state[i].re + host_quantum_state_im[i] = quantum_state[i].im + + for i in range(gate_set_size): + gate = gate_set[i] + for j in range(GATE_SIZE): + for k in range(GATE_SIZE): + index = gate_set_1qubit_layout( + IntTuple(i, j, k) + ) # Get the index in the 1D buffer + host_gate_set_re[index] = gate[j, k].re + host_gate_set_im[index] = gate[j, k].im + 
+ for i in range(circuit_number_control_gates): + for j in range(NUMBER_CONTROL_BITS): + for k in range(2): + index = circuit_control_bits_layout(IntTuple(i, j, k)) + host_control_bits_circuit[index] = control_bits_list[i][j][ + k + ] + + # -- Copy host buffers to device buffers -- # + quantum_state_re = ctx.enqueue_create_buffer[dtype](state_vector_size) + quantum_state_im = ctx.enqueue_create_buffer[dtype](state_vector_size) + + gate_set_re = ctx.enqueue_create_buffer[dtype]( + gate_set_size * GATE_SIZE * GATE_SIZE + ) + gate_set_im = ctx.enqueue_create_buffer[dtype]( + gate_set_size * GATE_SIZE * GATE_SIZE + ) + + control_bits_circuit = ctx.enqueue_create_buffer[DType.int32]( + circuit_number_control_gates * NUMBER_CONTROL_BITS * 2 + ) + current_control_gate_circuit = ctx.enqueue_create_buffer[DType.int32](1) + + # Create other buffers for functions + + quantum_state_out_re = ctx.enqueue_create_buffer[dtype]( + state_vector_size + ) + quantum_state_out_im = ctx.enqueue_create_buffer[dtype]( + state_vector_size + ) + + quantum_state_re.enqueue_copy_from(host_quantum_state_re) + quantum_state_im.enqueue_copy_from(host_quantum_state_im) + + gate_set_re.enqueue_copy_from(host_gate_set_re) + gate_set_im.enqueue_copy_from(host_gate_set_im) + + control_bits_circuit.enqueue_copy_from(host_control_bits_circuit) + + ctx.enqueue_memset(current_control_gate_circuit, 0) + ctx.enqueue_memset(quantum_state_out_re, 0.0) + ctx.enqueue_memset(quantum_state_out_im, 0.0) + + # -- Create layout tensors for GPU operations -- # + gate_set_re_tensor = LayoutTensor[ + mut=False, dtype, gate_set_1qubit_layout + ](gate_set_re.unsafe_ptr()) + gate_set_im_tensor = LayoutTensor[ + mut=False, dtype, gate_set_1qubit_layout + ](gate_set_im.unsafe_ptr()) + + quantum_state_re_tensor = LayoutTensor[ + mut=True, dtype, state_vector_layout + ](quantum_state_re.unsafe_ptr()) + quantum_state_im_tensor = LayoutTensor[ + mut=True, dtype, state_vector_layout + ](quantum_state_im.unsafe_ptr()) + + 
quantum_state_out_re_tensor = LayoutTensor[ + mut=True, dtype, state_vector_layout + ](quantum_state_out_re.unsafe_ptr()) + quantum_state_out_im_tensor = LayoutTensor[ + mut=True, dtype, state_vector_layout + ](quantum_state_out_im.unsafe_ptr()) + + control_bits_circuit_tensor = LayoutTensor[ + mut=False, DType.int32, circuit_control_bits_layout + ](control_bits_circuit.unsafe_ptr()) + current_control_gate_circuit_tensor = LayoutTensor[ + mut=True, DType.int32, Layout.row_major(1) + ](current_control_gate_circuit.unsafe_ptr()) + + # -- Apply circuit operations -- # + + current_state = 0 + for layer in range(number_layers): + for qubit in range(num_qubits): + if current_state == 0: + ctx.enqueue_function[ + qubit_wise_multiply_inplace_gpu[number_control_bits=0] + ]( + gate_set_re_tensor, + gate_set_im_tensor, + gate_set_dic[Hadamard.symbol], + # gate_set_dic[ + # gates_list[ + # Int(indexes[layer * num_qubits + qubit]) + # ].symbol + # ], + GATE_SIZE, + qubit, # target_qubit + quantum_state_re_tensor, + quantum_state_im_tensor, + num_qubits, # number_qubits + state_vector_size, # quantum_state_size + quantum_state_out_re_tensor, + quantum_state_out_im_tensor, + control_bits_circuit_tensor, + current_control_gate_circuit_tensor, + grid_dim=blocks_per_grid, + block_dim=threads_per_block, + ) + current_state = 1 + else: + ctx.enqueue_function[ + qubit_wise_multiply_inplace_gpu[number_control_bits=0] + ]( + gate_set_re_tensor, + gate_set_im_tensor, + gate_set_dic[Hadamard.symbol], + # gate_set_dic[ + # gates_list[ + # Int(indexes[layer * num_qubits + qubit]) + # ].symbol + # ], + GATE_SIZE, + qubit, # target_qubit + quantum_state_out_re_tensor, + quantum_state_out_im_tensor, + num_qubits, # number_qubits + state_vector_size, # quantum_state_size + quantum_state_re_tensor, + quantum_state_im_tensor, + control_bits_circuit_tensor, + current_control_gate_circuit_tensor, + grid_dim=blocks_per_grid, + block_dim=threads_per_block, + ) + current_state = 0 + + 
keep(quantum_state_re.unsafe_ptr()) + keep(quantum_state_im.unsafe_ptr()) + keep(quantum_state_out_re.unsafe_ptr()) + keep(quantum_state_out_im.unsafe_ptr()) + keep(gate_set_re.unsafe_ptr()) + keep(gate_set_im.unsafe_ptr()) + keep(control_bits_circuit.unsafe_ptr()) + keep(current_control_gate_circuit.unsafe_ptr()) + + ctx.synchronize() + + bench_ctx = DeviceContext() + b.iter_custom[qubit_wise_multiply_inplace_gpu_workflow](bench_ctx) + + +def bench_qubit_wise_multiply_inplace_gpu[ + min_number_qubits: Int = 15, + max_number_qubits: Int = 25, + number_qubits_step_size: Int = 1, + min_number_layers: Int = 1, + max_number_layers: Int = 2000, + number_layers_step_size: Int = 200, + fixed_number_qubits: Int = 5, + fixed_number_layers: Int = 2, +](): + print("Running qubit_wise_multiply_inplace_gpu() Benchmarks...") + print("-" * 80) + bench_config = BenchConfig(max_iters=10, min_warmuptime_secs=0.2) + bench = Bench(bench_config) + + makedirs("data", exist_ok=True) + + @parameter + for number_qubits in range( + min_number_qubits, max_number_qubits + 1, number_qubits_step_size + ): + bench.bench_function[ + benchmark_qubit_wise_multiply_inplace_gpu[ + number_qubits, fixed_number_layers + ] + ]( + BenchId( + "qubit_wise_multiply_inplace_gpu_" + + String(number_qubits) + + "q_" + + String(fixed_number_layers) + + "l" + ) + ) + + bench.config.out_file = Path( + "data/qubit_wise_multiply_inplace_gpu_qubits.csv" + ) + bench.dump_report() + bench = Bench(bench_config) + + @parameter + for number_layers in range( + min_number_layers, max_number_layers + 1, number_layers_step_size + ): + bench.bench_function[ + benchmark_qubit_wise_multiply_inplace_gpu[ + fixed_number_qubits, number_layers + ] + ]( + BenchId( + "qubit_wise_multiply_inplace_gpu_" + + String(fixed_number_qubits) + + "q_" + + String(number_layers) + + "l" + ) + ) + + bench.config.out_file = Path( + "data/qubit_wise_multiply_inplace_gpu_layers.csv" + ) + bench.dump_report() + + 
print("qubit_wise_multiply_inplace_gpu() Benchmarks completed!") + print("-" * 80) diff --git a/benchmarks/plot_results.py b/benchmarks/plot_results.py new file mode 100644 index 0000000..9adcb34 --- /dev/null +++ b/benchmarks/plot_results.py @@ -0,0 +1,104 @@ +import pandas as pd +import matplotlib.pyplot as plt + +# --- 2. Data Loading and Parsing --- + + +def process_benchmark_data(filepath): + """ + Reads a benchmark CSV, extracts qubit and layer counts from the 'name' + column, and returns a clean, sorted DataFrame. + """ + # Read the CSV file + df = pd.read_csv(filepath) + + # Rename column for easier access (removes space and parentheses) + df = df.rename(columns={"met (ms)": "time_ms"}) + + # Use regular expressions to extract numbers of qubits and layers + # '(\d+)q' finds a sequence of digits followed by 'q' + # '(\d+)l' finds a sequence of digits followed by 'l' + df["qubits"] = df["name"].str.extract(r"(\d+)q").astype(int) + df["layers"] = df["name"].str.extract(r"(\d+)l").astype(int) + + # Sort values for correct line plotting + if "layers" in filepath: + df = df.sort_values("layers") + elif "qubits" in filepath: + df = df.sort_values("qubits") + return df + + +# Load and process all four data files +layers_gpu_df = process_benchmark_data( + "data/qubit_wise_multiply_inplace_gpu_layers.csv" +) +qubits_gpu_df = process_benchmark_data( + "data/qubit_wise_multiply_inplace_gpu_qubits.csv" +) +layers_cpu_df = process_benchmark_data("data/qubit_wise_multiply_inplace_layers.csv") +qubits_cpu_df = process_benchmark_data("data/qubit_wise_multiply_inplace_qubits.csv") + + +# --- 3. Plotting --- + +# Create a figure with two subplots side-by-side +fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(16, 6)) +fig.suptitle("Qubit-wise Multiplication Benchmark", fontsize=16) + +# Plot 1: Performance vs. 
Number of Layers +ax1.plot( + layers_cpu_df["layers"], + layers_cpu_df["time_ms"], + marker="o", + linestyle="-", + label="CPU", +) +ax1.plot( + layers_gpu_df["layers"], + layers_gpu_df["time_ms"], + marker="s", + linestyle="--", + label="GPU", +) +ax1.set_title("Performance vs. Number of Layers (13 Qubits)") +ax1.set_xlabel("Number of Layers") +ax1.set_ylabel("Mean Execution Time (ms)") +ax1.legend() +ax1.grid(True, linestyle="--", alpha=0.6) + +# Plot 2: Performance vs. Number of Qubits +ax2.plot( + qubits_cpu_df["qubits"], + qubits_cpu_df["time_ms"], + marker="o", + linestyle="-", + label="CPU", +) +ax2.plot( + qubits_gpu_df["qubits"], + qubits_gpu_df["time_ms"], + marker="s", + linestyle="--", + label="GPU", +) +ax2.set_title("Performance vs. Number of Qubits (20 Layers)") +ax2.set_xlabel("Number of Qubits") +# We can make the y-axis a log scale if the values vary widely +ax2.set_ylabel("Mean Execution Time (ms) - Log Scale") +ax2.set_yscale("log") # Use a logarithmic scale to better see the differences +ax2.legend() +ax2.grid(True, which="both", linestyle="--", alpha=0.6) + + +# Adjust layout to prevent labels from overlapping +plt.tight_layout(rect=[0, 0.03, 1, 0.95]) # Adjust rect to make space for suptitle + +# --- 4. 
Saving and Displaying --- +pdf_filename = "data/benchmark_results.pdf" +plt.savefig(pdf_filename, bbox_inches="tight") + +print(f"\nPlot successfully saved as '{pdf_filename}'") + +# # Display the plot on the screen +# plt.show() diff --git a/examples/gpu_examples.mojo b/examples/gpu_examples.mojo index 5a64153..44be57b 100644 --- a/examples/gpu_examples.mojo +++ b/examples/gpu_examples.mojo @@ -12,695 +12,95 @@ from qlabs.base import ( PauliX, PauliY, PauliZ, - NOT, - H, - X, - Y, - Z, - SWAP, - iSWAP, ) from qlabs.base.gpu import qubit_wise_multiply_inplace_gpu -alias BLOCKS_PER_GRID = 1 -alias THREADS_PER_BLOCK = (1, 1) +from qlabs.local_stdlib.complex import ComplexFloat32 + alias dtype = DType.float32 alias GATE_SIZE = 2 -alias STATE_VECTOR_SIZE = 8 alias NUMBER_CONTROL_BITS = 1 # TODO have NUMBER_CONTROL_BITS be a list defining each gates specific control bits count -alias CIRCUIT_NUMBER_CONTROL_GATES = 2 -alias circuit_control_bits_layout = Layout.row_major( - CIRCUIT_NUMBER_CONTROL_GATES, NUMBER_CONTROL_BITS, 2 -) - -alias gate_1qubit_layout = Layout.row_major(GATE_SIZE, GATE_SIZE) -alias state_vector_3qubits_layout = Layout.row_major(STATE_VECTOR_SIZE) -alias control_bits_layout = Layout.row_major(NUMBER_CONTROL_BITS, 2) - -alias gate_set: List[Gate] = [Hadamard, PauliX, PauliZ] -alias gate_set_dic: Dict[String, Int] = { - Hadamard.symbol: 0, - PauliX.symbol: 1, - PauliZ.symbol: 2, -} -alias GATE_SET_SIZE = 3 -alias gate_set_1qubit_layout = Layout.row_major( - GATE_SET_SIZE, GATE_SIZE, GATE_SIZE -) -alias gate_set_1qubit_vectorized_layout = Layout.row_major( - GATE_SET_SIZE, GATE_SIZE, GATE_SIZE, 2 -) -def simulate_figure1_circuit_gpu(): - """Simulates the circuit from Figure 1 in the paper.""" +def simulate_figure1_circuit_gpu[num_qubits: Int](): + """Simulates a circuit of arbitrary number of qubits""" @parameter if not has_accelerator(): print("No compatible GPU found") else: - print("Simulating Figure 1 circuit.\nCircuit design:") - print( - """ - |0> 
-------|X|--|Z|-- - | - |0> --|H|---*----*--- - | - |0> --|X|-------|X|-- - """ - ) - var control_bits_list: List[List[List[Int]]] = [ - [[1, 1]], # Control on qubit 1 and is control because flag=1 - [[1, 1]], # Control on qubit 1 and is control because flag=1 - ] - - ctx = DeviceContext() - print("Using GPU:", ctx.name()) - - # -- Create GPU variables -- # - # These don't need to be initialized to zero, they will be filled later - - host_quantum_state_re = ctx.enqueue_create_host_buffer[dtype]( - STATE_VECTOR_SIZE - ) - host_quantum_state_im = ctx.enqueue_create_host_buffer[dtype]( - STATE_VECTOR_SIZE - ) - - host_gate_set_re = ctx.enqueue_create_host_buffer[dtype]( - GATE_SET_SIZE * GATE_SIZE * GATE_SIZE - ) - host_gate_set_im = ctx.enqueue_create_host_buffer[dtype]( - GATE_SET_SIZE * GATE_SIZE * GATE_SIZE - ) - - host_control_bits_circuit = ctx.enqueue_create_host_buffer[DType.int32]( - CIRCUIT_NUMBER_CONTROL_GATES * NUMBER_CONTROL_BITS * 2 - ) - - # -- Initialize the quantum circuit to the |000⟩ state -- # - quantum_state: StateVector = StateVector.from_bitstring("000") - print("Initial quantum state:\n", quantum_state) - - # Wait for host buffers to be ready - ctx.synchronize() - - # -- Fill host buffers -- # - - for i in range(STATE_VECTOR_SIZE): - host_quantum_state_re[i] = quantum_state[i].re - host_quantum_state_im[i] = quantum_state[i].im - - print("Initial state real part:", host_quantum_state_re) - print("Initial state imaginary part:", host_quantum_state_im) - - for i in range(GATE_SET_SIZE): - gate = gate_set[i] - for j in range(GATE_SIZE): - for k in range(GATE_SIZE): - index = gate_set_1qubit_layout( - IntTuple(i, j, k) - ) # Get the index in the 1D buffer - host_gate_set_re[index] = gate[j, k].re - host_gate_set_im[index] = gate[j, k].im - - for i in range(CIRCUIT_NUMBER_CONTROL_GATES): - for j in range(NUMBER_CONTROL_BITS): - for k in range(2): - index = circuit_control_bits_layout(IntTuple(i, j, k)) - host_control_bits_circuit[index] = 
control_bits_list[i][j][ - k - ] - - # -- Copy host buffers to device buffers -- # - quantum_state_re = ctx.enqueue_create_buffer[dtype](STATE_VECTOR_SIZE) - quantum_state_im = ctx.enqueue_create_buffer[dtype](STATE_VECTOR_SIZE) - - gate_set_re = ctx.enqueue_create_buffer[dtype]( - GATE_SET_SIZE * GATE_SIZE * GATE_SIZE - ) - gate_set_im = ctx.enqueue_create_buffer[dtype]( - GATE_SET_SIZE * GATE_SIZE * GATE_SIZE - ) - - control_bits_circuit = ctx.enqueue_create_buffer[DType.int32]( - CIRCUIT_NUMBER_CONTROL_GATES * NUMBER_CONTROL_BITS * 2 - ) - current_control_gate_circuit = ctx.enqueue_create_buffer[DType.int32](1) - - # Create other buffers for functions - - quantum_state_out_re = ctx.enqueue_create_buffer[dtype]( - STATE_VECTOR_SIZE - ) - quantum_state_out_im = ctx.enqueue_create_buffer[dtype]( - STATE_VECTOR_SIZE - ) - - quantum_state_re.enqueue_copy_from(host_quantum_state_re) - quantum_state_im.enqueue_copy_from(host_quantum_state_im) - - gate_set_re.enqueue_copy_from(host_gate_set_re) - gate_set_im.enqueue_copy_from(host_gate_set_im) - - control_bits_circuit.enqueue_copy_from(host_control_bits_circuit) - - ctx.enqueue_memset(current_control_gate_circuit, 0) - ctx.enqueue_memset(quantum_state_out_re, 0.0) - ctx.enqueue_memset(quantum_state_out_im, 0.0) - - # -- Create layout tensors for GPU operations -- # - gate_set_re_tensor = LayoutTensor[ - mut=False, dtype, gate_set_1qubit_layout - ](gate_set_re.unsafe_ptr()) - gate_set_im_tensor = LayoutTensor[ - mut=False, dtype, gate_set_1qubit_layout - ](gate_set_im.unsafe_ptr()) - - quantum_state_re_tensor = LayoutTensor[ - mut=True, dtype, state_vector_3qubits_layout - ](quantum_state_re.unsafe_ptr()) - quantum_state_im_tensor = LayoutTensor[ - mut=True, dtype, state_vector_3qubits_layout - ](quantum_state_im.unsafe_ptr()) - - quantum_state_out_re_tensor = LayoutTensor[ - mut=True, dtype, state_vector_3qubits_layout - ](quantum_state_out_re.unsafe_ptr()) - quantum_state_out_im_tensor = LayoutTensor[ - mut=True, 
dtype, state_vector_3qubits_layout - ](quantum_state_out_im.unsafe_ptr()) - - control_bits_circuit_tensor = LayoutTensor[ - mut=False, DType.int32, circuit_control_bits_layout - ](control_bits_circuit.unsafe_ptr()) - current_control_gate_circuit_tensor = LayoutTensor[ - mut=True, DType.int32, Layout.row_major(1) - ](current_control_gate_circuit.unsafe_ptr()) - - # -- Apply circuit operations -- # - - # Gate 0 - # quantum_state = qubit_wise_multiply_gpu( - # Hadamard.matrix, 1, quantum_state - # ) - ctx.enqueue_function[ - qubit_wise_multiply_inplace_gpu[number_control_bits=0] - ]( - gate_set_re_tensor, - gate_set_im_tensor, - gate_set_dic[Hadamard.symbol], - GATE_SIZE, - 1, # target_qubit - quantum_state_re_tensor, - quantum_state_im_tensor, - 3, # number_qubits - STATE_VECTOR_SIZE, # quantum_state_size - quantum_state_out_re_tensor, - quantum_state_out_im_tensor, - control_bits_circuit_tensor, - current_control_gate_circuit_tensor, - grid_dim=BLOCKS_PER_GRID, - block_dim=THREADS_PER_BLOCK, - ) + ctx: DeviceContext = DeviceContext() - # # It works - # with quantum_state_out_re.map_to_host() as host_re, quantum_state_out_im.map_to_host() as host_im: - # print( - # "After Hadamard gate on qubit 1\nreal part:\n", - # host_re, - # "\nimaginary part:\n", - # host_im, - # ) - - # Gate 1 (reverse the states input <-> output) - # quantum_state = qubit_wise_multiply(PauliX.matrix, 2, quantum_state) - ctx.enqueue_function[ - qubit_wise_multiply_inplace_gpu[number_control_bits=0] - ]( - gate_set_re_tensor, - gate_set_im_tensor, - gate_set_dic[PauliX.symbol], - GATE_SIZE, - 2, # target_qubit - quantum_state_out_re_tensor, - quantum_state_out_im_tensor, - 3, # number_qubits - STATE_VECTOR_SIZE, # quantum_state_size - quantum_state_re_tensor, - quantum_state_im_tensor, - control_bits_circuit_tensor, - current_control_gate_circuit_tensor, - grid_dim=BLOCKS_PER_GRID, - block_dim=THREADS_PER_BLOCK, + alias circuit_number_control_gates = 2 + alias circuit_control_bits_layout = 
Layout.row_major( + circuit_number_control_gates, NUMBER_CONTROL_BITS, 2 ) - # with quantum_state_re.map_to_host() as host_re, quantum_state_im.map_to_host() as host_im: - # print( - # "After Pauli-X gate on qubit 2:", - # "\nreal part:\n", - # host_re, - # "\nimaginary part:\n", - # host_im, - # ) - - # # Gate 2 - # quantum_state = qubit_wise_multiply( - # PauliX.matrix, 0, quantum_state, [[1, 1]] - # ) - ctx.enqueue_function[ - qubit_wise_multiply_inplace_gpu[number_control_bits=1] - ]( - gate_set_re_tensor, - gate_set_im_tensor, - gate_set_dic[PauliX.symbol], - GATE_SIZE, - 0, # target_qubit - quantum_state_re_tensor, - quantum_state_im_tensor, - 3, # number_qubits - STATE_VECTOR_SIZE, # quantum_state_size - quantum_state_out_re_tensor, - quantum_state_out_im_tensor, - control_bits_circuit_tensor, - current_control_gate_circuit_tensor, - grid_dim=BLOCKS_PER_GRID, - block_dim=THREADS_PER_BLOCK, - ) - - # with quantum_state_out_re.map_to_host() as host_re, quantum_state_out_im.map_to_host() as host_im: - # print( - # "After Pauli-X gate on qubit 0 with control on qubit 1:", - # "\nreal part:\n", - # host_re, - # "\nimaginary part:\n", - # host_im, - # ) - - # Gate 3 - # quantum_state = qubit_wise_multiply(PauliZ.matrix, 0, quantum_state) - ctx.enqueue_function[ - qubit_wise_multiply_inplace_gpu[number_control_bits=0] - ]( - gate_set_re_tensor, - gate_set_im_tensor, - gate_set_dic[PauliZ.symbol], - GATE_SIZE, - 0, # target_qubit - quantum_state_out_re_tensor, - quantum_state_out_im_tensor, - 3, # number_qubits - STATE_VECTOR_SIZE, # quantum_state_size - quantum_state_re_tensor, - quantum_state_im_tensor, - control_bits_circuit_tensor, - current_control_gate_circuit_tensor, - grid_dim=BLOCKS_PER_GRID, - block_dim=THREADS_PER_BLOCK, + gate_set: List[Gate] = [Hadamard, PauliX, PauliZ] + gate_set_dic: Dict[String, Int] = { + Hadamard.symbol: 0, + PauliX.symbol: 1, + PauliZ.symbol: 2, + } + alias gate_set_size = 3 + alias gate_set_1qubit_layout = Layout.row_major( + 
gate_set_size, GATE_SIZE, GATE_SIZE ) - - # with quantum_state_re.map_to_host() as host_re, quantum_state_im.map_to_host() as host_im: - # print( - # "After Pauli-Z gate on qubit 0:\nreal part:\n", - # host_re, - # "\nimaginary part:\n", - # host_im, - # ) - - # Gate 4 - # quantum_state = qubit_wise_multiply( - # PauliX.matrix, 2, quantum_state, [[1, 1]] + # alias gate_set_1qubit_vectorized_layout = Layout.row_major( + # gate_set_size, GATE_SIZE, GATE_SIZE, 2 # ) - ctx.enqueue_function[ - qubit_wise_multiply_inplace_gpu[number_control_bits=1] - ]( - gate_set_re_tensor, - gate_set_im_tensor, - gate_set_dic[PauliX.symbol], - GATE_SIZE, - 2, # target_qubit - quantum_state_re_tensor, - quantum_state_im_tensor, - 3, # number_qubits - STATE_VECTOR_SIZE, # quantum_state_size - quantum_state_out_re_tensor, - quantum_state_out_im_tensor, - control_bits_circuit_tensor, - current_control_gate_circuit_tensor, - grid_dim=BLOCKS_PER_GRID, - block_dim=THREADS_PER_BLOCK, - ) - with quantum_state_out_re.map_to_host() as host_re, quantum_state_out_im.map_to_host() as host_im: - print( - ( - "After Pauli-X gate on qubit 2 with control on qubit 1" - " (Final State):\nreal part:\n" - ), - host_re, - "\nimaginary part:\n", - host_im, - ) - - -# def run_gpu_not_abstract_3(): -# """Simulates the circuit from Figure 1 in the paper.""" - -# @parameter -# if not has_accelerator(): -# print("No compatible GPU found") -# else: -# print("Simulating Figure 1 circuit.\nCircuit design:") -# print( -# """ -# |0> -------|X|--|Z|-- -# | -# |0> --|H|---*----*--- -# | -# |0> --|X|-------|X|-- -# """ -# ) -# var control_bits_list: List[List[List[Int]]] = [ -# [[1, 1]], # Control on qubit 1 and is control because flag=1 -# [[1, 1]], # Control on qubit 1 and is control because flag=1 -# ] - -# ctx = DeviceContext() -# print("Using GPU:", ctx.name()) - -# # -- Create GPU variables -- # -# ctx = DeviceContext() - -# # -- Initialize the quantum circuit to the |000⟩ state -- # -# quantum_state: StateVector = 
StateVector.from_bitstring("000") -# print("Initial quantum state:\n", quantum_state) - -# host_quantum_state_re = ctx.enqueue_create_host_buffer[dtype]( -# STATE_VECTOR_SIZE -# ).enqueue_fill(0) -# host_quantum_state_im = ctx.enqueue_create_host_buffer[dtype]( -# STATE_VECTOR_SIZE -# ).enqueue_fill(0) - -# host_gate_set = ctx.enqueue_create_host_buffer[dtype]( -# GATE_SET_SIZE * GATE_SIZE * GATE_SIZE * 2 -# ).enqueue_fill(0) - -# host_control_bits_circuit = ctx.enqueue_create_host_buffer[DType.int32]( -# CIRCUIT_NUMBER_CONTROL_GATES * NUMBER_CONTROL_BITS * 2 -# ).enqueue_fill(0) - -# # Wait for host buffers to be ready -# ctx.synchronize() - -# # -- Fill host buffers -- # - -# for i in range(STATE_VECTOR_SIZE): -# host_quantum_state_re[i] = quantum_state[i].re -# host_quantum_state_im[i] = quantum_state[i].im - -# print("Initial state real part:", host_quantum_state_re) -# print("Initial state imaginary part:", host_quantum_state_im) - -# for i in range(GATE_SET_SIZE): -# gate = gate_set[i] -# for j in range(GATE_SIZE): -# for k in range(GATE_SIZE): -# index = gate_set_1qubit_layout( -# IntTuple(i, j, k) -# ) # Get the index in the 1D buffer -# host_gate_set[index][0] = gate[j, k].re -# host_gate_set[index][1] = gate[j, k].im - -# for i in range(CIRCUIT_NUMBER_CONTROL_GATES): -# for j in range(NUMBER_CONTROL_BITS): -# for k in range(2): -# index = circuit_control_bits_layout(IntTuple(i, j, k)) -# host_control_bits_circuit[index] = control_bits_list[i][j][ -# k -# ] - -# # -- Copy host buffers to device buffers -- # -# quantum_state_re = ctx.enqueue_create_buffer[dtype]( -# STATE_VECTOR_SIZE -# ).enqueue_fill(0) -# quantum_state_im = ctx.enqueue_create_buffer[dtype]( -# STATE_VECTOR_SIZE -# ).enqueue_fill(0) - -# gate_set = ctx.enqueue_create_buffer[dtype]( -# GATE_SET_SIZE * GATE_SIZE * GATE_SIZE * 2 -# ).enqueue_fill(0) - -# control_bits_circuit = ctx.enqueue_create_buffer[DType.int32]( -# CIRCUIT_NUMBER_CONTROL_GATES * NUMBER_CONTROL_BITS * 2 -# 
).enqueue_fill(0) -# current_control_gate_circuit = ctx.enqueue_create_buffer[DType.int32]( -# 1 -# ).enqueue_fill(0) - -# # Create other buffers for functions - -# quantum_state_out_re = ctx.enqueue_create_buffer[dtype]( -# STATE_VECTOR_SIZE -# ).enqueue_fill(0) -# quantum_state_out_im = ctx.enqueue_create_buffer[dtype]( -# STATE_VECTOR_SIZE -# ).enqueue_fill(0) - -# quantum_state_re.enqueue_copy_from(host_quantum_state_re) -# quantum_state_im.enqueue_copy_from(host_quantum_state_im) - -# gate_set.enqueue_copy_from(host_gate_set) - -# control_bits_circuit.enqueue_copy_from(host_control_bits_circuit) - -# # -- Create layout tensors for GPU operations -- # -# gate_set_tensor = LayoutTensor[ -# mut=False, dtype, gate_set_1qubit_vectorized_layout -# ](gate_set.unsafe_ptr()) - -# quantum_state_re_tensor = LayoutTensor[ -# mut=True, dtype, state_vector_3qubits_layout -# ](quantum_state_re.unsafe_ptr()) -# quantum_state_im_tensor = LayoutTensor[ -# mut=True, dtype, state_vector_3qubits_layout -# ](quantum_state_im.unsafe_ptr()) - -# quantum_state_out_re_tensor = LayoutTensor[ -# mut=True, dtype, state_vector_3qubits_layout -# ](quantum_state_out_re.unsafe_ptr()) -# quantum_state_out_im_tensor = LayoutTensor[ -# mut=True, dtype, state_vector_3qubits_layout -# ](quantum_state_out_im.unsafe_ptr()) - -# control_bits_circuit_tensor = LayoutTensor[ -# mut=False, DType.int32, circuit_control_bits_layout -# ](control_bits_circuit.unsafe_ptr()) -# current_control_gate_circuit_tensor = LayoutTensor[ -# mut=True, DType.int32, Layout.row_major(1) -# ](current_control_gate_circuit.unsafe_ptr()) - -# # -- Apply circuit operations -- # - -# # Gate 0 -# # quantum_state = qubit_wise_multiply_gpu( -# # Hadamard.matrix, 1, quantum_state -# # ) -# ctx.enqueue_function[qubit_wise_multiply_inplace_gpu[number_control_bits=0]]( -# gate_set_tensor, -# gate_set_dic[Hadamard.symbol], -# GATE_SIZE, -# 1, # target_qubit -# quantum_state_re_tensor, -# quantum_state_im_tensor, -# 3, # number_qubits -# 
STATE_VECTOR_SIZE, # quantum_state_size -# quantum_state_out_re_tensor, -# quantum_state_out_im_tensor, -# control_bits_circuit_tensor, -# current_control_gate_circuit_tensor, -# grid_dim=BLOCKS_PER_GRID, -# block_dim=THREADS_PER_BLOCK, -# ) - -# # # It works -# # with quantum_state_out_re.map_to_host() as host_re, quantum_state_out_im.map_to_host() as host_im: -# # print( -# # "After Hadamard gate on qubit 1\nreal part:\n", -# # host_re, -# # "\nimaginary part:\n", -# # host_im, -# # ) - -# # Gate 1 (reverse the states input <-> output) -# # quantum_state = qubit_wise_multiply(PauliX.matrix, 2, quantum_state) -# ctx.enqueue_function[qubit_wise_multiply_inplace_gpu[number_control_bits=0]]( -# gate_set_tensor, -# gate_set_dic[PauliX.symbol], -# GATE_SIZE, -# 2, # target_qubit -# quantum_state_out_re_tensor, -# quantum_state_out_im_tensor, -# 3, # number_qubits -# STATE_VECTOR_SIZE, # quantum_state_size -# quantum_state_re_tensor, -# quantum_state_im_tensor, -# control_bits_circuit_tensor, -# current_control_gate_circuit_tensor, -# grid_dim=BLOCKS_PER_GRID, -# block_dim=THREADS_PER_BLOCK, -# ) - -# # with quantum_state_re.map_to_host() as host_re, quantum_state_im.map_to_host() as host_im: -# # print( -# # "After Pauli-X gate on qubit 2:", -# # "\nreal part:\n", -# # host_re, -# # "\nimaginary part:\n", -# # host_im, -# # ) - -# # # Gate 2 -# # quantum_state = qubit_wise_multiply( -# # PauliX.matrix, 0, quantum_state, [[1, 1]] -# # ) -# ctx.enqueue_function[qubit_wise_multiply_inplace_gpu[number_control_bits=1]]( -# gate_set_tensor, -# gate_set_dic[PauliX.symbol], -# GATE_SIZE, -# 0, # target_qubit -# quantum_state_re_tensor, -# quantum_state_im_tensor, -# 3, # number_qubits -# STATE_VECTOR_SIZE, # quantum_state_size -# quantum_state_out_re_tensor, -# quantum_state_out_im_tensor, -# control_bits_circuit_tensor, -# current_control_gate_circuit_tensor, -# grid_dim=BLOCKS_PER_GRID, -# block_dim=THREADS_PER_BLOCK, -# ) - -# # with quantum_state_out_re.map_to_host() as 
host_re, quantum_state_out_im.map_to_host() as host_im: -# # print( -# # "After Pauli-X gate on qubit 0 with control on qubit 1:", -# # "\nreal part:\n", -# # host_re, -# # "\nimaginary part:\n", -# # host_im, -# # ) - -# # Gate 3 -# # quantum_state = qubit_wise_multiply(PauliZ.matrix, 0, quantum_state) -# ctx.enqueue_function[qubit_wise_multiply_inplace_gpu[number_control_bits=0]]( -# gate_set_tensor, -# gate_set_dic[PauliZ.symbol], -# GATE_SIZE, -# 0, # target_qubit -# quantum_state_out_re_tensor, -# quantum_state_out_im_tensor, -# 3, # number_qubits -# STATE_VECTOR_SIZE, # quantum_state_size -# quantum_state_re_tensor, -# quantum_state_im_tensor, -# control_bits_circuit_tensor, -# current_control_gate_circuit_tensor, -# grid_dim=BLOCKS_PER_GRID, -# block_dim=THREADS_PER_BLOCK, -# ) - -# # with quantum_state_re.map_to_host() as host_re, quantum_state_im.map_to_host() as host_im: -# # print( -# # "After Pauli-Z gate on qubit 0:\nreal part:\n", -# # host_re, -# # "\nimaginary part:\n", -# # host_im, -# # ) - -# # Gate 4 -# # quantum_state = qubit_wise_multiply( -# # PauliX.matrix, 2, quantum_state, [[1, 1]] -# # ) -# ctx.enqueue_function[qubit_wise_multiply_inplace_gpu[number_control_bits=1]]( -# gate_set_tensor, -# gate_set_dic[PauliX.symbol], -# GATE_SIZE, -# 2, # target_qubit -# quantum_state_re_tensor, -# quantum_state_im_tensor, -# 3, # number_qubits -# STATE_VECTOR_SIZE, # quantum_state_size -# quantum_state_out_re_tensor, -# quantum_state_out_im_tensor, -# control_bits_circuit_tensor, -# current_control_gate_circuit_tensor, -# grid_dim=BLOCKS_PER_GRID, -# block_dim=THREADS_PER_BLOCK, -# ) - -# with quantum_state_out_re.map_to_host() as host_re, quantum_state_out_im.map_to_host() as host_im: -# print( -# ( -# "After Pauli-X gate on qubit 2 with control on qubit 1" -# " (Final State):\nreal part:\n" -# ), -# host_re, -# "\nimaginary part:\n", -# host_im, -# ) - - -def simulate_any_size_circuit_gpu[num_qubits: Int](): - """Simulates a circuit of arbitrary 
number of qubits""" - - @parameter - if not has_accelerator(): - print("No compatible GPU found") - else: alias state_vector_size = 1 << num_qubits alias state_vector_layout = Layout.row_major(state_vector_size) - alias total_threads = 2 * state_vector_size + alias total_threads = state_vector_size + + alias max_threads_per_block = ctx.device_info.max_thread_block_size + # alias max_threads_per_block = 1024 # Maximum threads per block in CUDA + + # alias sm_count = ctx.device_info.sm_count + # alias max_blocks_per_multiprocessor = ctx.device_info.max_blocks_per_multiprocessor + # alias max_number_blocks = sm_count * max_blocks_per_multiprocessor - alias max_threads_per_block = 1024 # Maximum threads per block in CUDA alias blocks_per_grid = ( total_threads + max_threads_per_block - 1 ) // max_threads_per_block - alias threads_per_block = ( + threads_per_block = ( max_threads_per_block, 1, 1, ) - @parameter if total_threads < max_threads_per_block: - alias threads_per_block = ( + threads_per_block = ( total_threads, 1, 1, - ) # 1D block of threads + ) + + print("state_vector_size:", state_vector_size) + print("blocks_per_grid:", blocks_per_grid) + print("threads_per_block[0]:", threads_per_block[0]) var control_bits_list: List[List[List[Int]]] = [ [[1, 1]], # Control on qubit 1 and is control because flag=1 [[1, 1]], # Control on qubit 1 and is control because flag=1 ] - ctx = DeviceContext() print("Using GPU:", ctx.name()) - print("ctx.device_info:", ctx.device_info) - print( - "ctx.device_info.max_thread_block_size:", - ctx.device_info.max_thread_block_size, - ) - print( - "ctx.device_info.max_blocks_per_multiprocessor:", - ctx.device_info.max_blocks_per_multiprocessor, - ) + # print("ctx.device_info:", ctx.device_info) + # print( + # "ctx.device_info.max_thread_block_size:", + # ctx.device_info.max_thread_block_size, + # ) + # print( + # "ctx.device_info.max_blocks_per_multiprocessor:", + # ctx.device_info.max_blocks_per_multiprocessor, + # ) try: (free, total) = 
ctx.get_memory_info() print("Free memory:", free / (1024 * 1024), "MB") @@ -719,14 +119,14 @@ def simulate_any_size_circuit_gpu[num_qubits: Int](): ) host_gate_set_re = ctx.enqueue_create_host_buffer[dtype]( - GATE_SET_SIZE * GATE_SIZE * GATE_SIZE + gate_set_size * GATE_SIZE * GATE_SIZE ) host_gate_set_im = ctx.enqueue_create_host_buffer[dtype]( - GATE_SET_SIZE * GATE_SIZE * GATE_SIZE + gate_set_size * GATE_SIZE * GATE_SIZE ) host_control_bits_circuit = ctx.enqueue_create_host_buffer[DType.int32]( - CIRCUIT_NUMBER_CONTROL_GATES * NUMBER_CONTROL_BITS * 2 + circuit_number_control_gates * NUMBER_CONTROL_BITS * 2 ) # -- Initialize the quantum circuit to the |000⟩ state -- # @@ -747,7 +147,7 @@ def simulate_any_size_circuit_gpu[num_qubits: Int](): print("Initial state real part:", host_quantum_state_re) print("Initial state imaginary part:", host_quantum_state_im) - for i in range(GATE_SET_SIZE): + for i in range(gate_set_size): gate = gate_set[i] for j in range(GATE_SIZE): for k in range(GATE_SIZE): @@ -757,7 +157,7 @@ def simulate_any_size_circuit_gpu[num_qubits: Int](): host_gate_set_re[index] = gate[j, k].re host_gate_set_im[index] = gate[j, k].im - for i in range(CIRCUIT_NUMBER_CONTROL_GATES): + for i in range(circuit_number_control_gates): for j in range(NUMBER_CONTROL_BITS): for k in range(2): index = circuit_control_bits_layout(IntTuple(i, j, k)) @@ -770,14 +170,14 @@ def simulate_any_size_circuit_gpu[num_qubits: Int](): quantum_state_im = ctx.enqueue_create_buffer[dtype](state_vector_size) gate_set_re = ctx.enqueue_create_buffer[dtype]( - GATE_SET_SIZE * GATE_SIZE * GATE_SIZE + gate_set_size * GATE_SIZE * GATE_SIZE ) gate_set_im = ctx.enqueue_create_buffer[dtype]( - GATE_SET_SIZE * GATE_SIZE * GATE_SIZE + gate_set_size * GATE_SIZE * GATE_SIZE ) control_bits_circuit = ctx.enqueue_create_buffer[DType.int32]( - CIRCUIT_NUMBER_CONTROL_GATES * NUMBER_CONTROL_BITS * 2 + circuit_number_control_gates * NUMBER_CONTROL_BITS * 2 ) current_control_gate_circuit = 
ctx.enqueue_create_buffer[DType.int32](1) @@ -991,6 +391,13 @@ def simulate_any_size_circuit_gpu[num_qubits: Int](): " (Final State):\nreal part:\n" ), host_re, + "\nhost_rere[3]:", + host_re[3], + "\nhost_rere[4]:", + host_re[4], "\nimaginary part:\n", host_im, ) + for i in range(state_vector_size): + quantum_state[i] = ComplexFloat32(host_re[i], host_im[i]) + print("Final quantum state:\n", quantum_state) diff --git a/examples/main.mojo b/examples/main.mojo index 688f064..451005b 100644 --- a/examples/main.mojo +++ b/examples/main.mojo @@ -41,7 +41,6 @@ from qlabs.abstractions import ( from gpu_examples import ( simulate_figure1_circuit_gpu, - simulate_any_size_circuit_gpu, ) # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # @@ -642,7 +641,7 @@ def main(): # simulate_figure1_circuit_abstract() - # simulate_random_circuit(number_qubits, number_layers) + simulate_random_circuit(number_qubits, number_layers) # simulate_figure4_circuit() @@ -656,6 +655,4 @@ def main(): # debug_something() - # simulate_figure1_circuit_gpu() - - simulate_any_size_circuit_gpu[4]() + simulate_figure1_circuit_gpu[3]() diff --git a/pixi.lock b/pixi.lock index 3c27d8b..8175f37 100644 --- a/pixi.lock +++ b/pixi.lock @@ -13,6 +13,7 @@ environments: - conda: https://conda.anaconda.org/conda-forge/noarch/aiohappyeyeballs-2.6.1-pyhd8ed1ab_0.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/aiohttp-3.12.13-py312h178313f_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/aiosignal-1.3.2-pyhd8ed1ab_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/alsa-lib-1.2.14-hb9d3cd8_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/annotated-types-0.7.0-pyhd8ed1ab_1.conda - conda: https://conda.anaconda.org/conda-forge/noarch/anyio-4.9.0-pyh29332c3_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/asgiref-3.8.1-pyhd8ed1ab_1.conda @@ -38,15 +39,19 @@ environments: - conda: https://conda.anaconda.org/conda-forge/noarch/backoff-2.2.1-pyhd8ed1ab_1.conda - 
conda: https://conda.anaconda.org/conda-forge/linux-64/binutils_impl_linux-64-2.43-h4bf12b8_5.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/binutils_linux-64-2.43-h4852527_5.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/brotli-1.1.0-hb9d3cd8_3.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/brotli-bin-1.1.0-hb9d3cd8_3.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/brotli-python-1.1.0-py312h2ec8cdc_3.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/bzip2-1.0.8-h4bc722e_7.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/c-ares-1.34.5-hb9d3cd8_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/ca-certificates-2025.6.15-hbd8a1cb_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/cairo-1.18.4-h3394656_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/certifi-2025.6.15-pyhd8ed1ab_0.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/cffi-1.17.1-py312h06ac9bb_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/charset-normalizer-3.4.2-pyhd8ed1ab_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/click-8.2.1-pyh707e725_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/colorama-0.4.6-pyhd8ed1ab_1.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/contourpy-1.3.2-py312h68727a3_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/cpython-3.12.11-py312hd8ed1ab_0.conda - conda: https://conda.anaconda.org/nvidia/linux-64/cuda-cccl_linux-64-12.9.27-0.conda - conda: https://conda.anaconda.org/nvidia/linux-64/cuda-command-line-tools-12.9.0-0.conda @@ -93,11 +98,14 @@ environments: - conda: https://conda.anaconda.org/nvidia/linux-64/cuda-tools-12.9.0-0.conda - conda: https://conda.anaconda.org/nvidia/noarch/cuda-version-12.9-3.conda - conda: https://conda.anaconda.org/nvidia/linux-64/cuda-visual-tools-12.9.0-0.conda + - conda: 
https://conda.anaconda.org/conda-forge/noarch/cycler-0.12.1-pyhd8ed1ab_1.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/cyrus-sasl-2.1.28-hd9c7081_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/datasets-2.14.4-pyhd8ed1ab_0.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/dbus-1.16.2-h3c4dab8_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/deprecated-1.2.18-pyhd8ed1ab_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/dill-0.3.7-pyhd8ed1ab_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/dnspython-2.7.0-pyhff2d567_1.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/double-conversion-3.3.1-h5888daf_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/email-validator-2.2.0-pyhd8ed1ab_1.conda - conda: https://conda.anaconda.org/conda-forge/noarch/email_validator-2.2.0-hd8ed1ab_1.conda - conda: https://conda.anaconda.org/conda-forge/noarch/exceptiongroup-1.3.0-pyhd8ed1ab_0.conda @@ -105,7 +113,14 @@ environments: - conda: https://conda.anaconda.org/conda-forge/noarch/fastapi-0.115.14-pyhe01879c_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/fastapi-cli-0.0.7-pyhd8ed1ab_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/filelock-3.18.0-pyhd8ed1ab_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/font-ttf-dejavu-sans-mono-2.37-hab24e00_0.tar.bz2 + - conda: https://conda.anaconda.org/conda-forge/noarch/font-ttf-inconsolata-3.000-h77eed37_0.tar.bz2 + - conda: https://conda.anaconda.org/conda-forge/noarch/font-ttf-source-code-pro-2.038-h77eed37_0.tar.bz2 + - conda: https://conda.anaconda.org/conda-forge/noarch/font-ttf-ubuntu-0.83-h77eed37_3.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/fontconfig-2.15.0-h7e30c49_1.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/fonts-conda-ecosystem-1-0.tar.bz2 + - conda: https://conda.anaconda.org/conda-forge/noarch/fonts-conda-forge-1-0.tar.bz2 + - 
conda: https://conda.anaconda.org/conda-forge/linux-64/fonttools-4.58.4-py312h178313f_0.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/freetype-2.13.3-ha770c72_1.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/frozenlist-1.6.0-py312hb9e946c_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/fsspec-2025.5.1-pyhd8ed1ab_0.conda @@ -120,6 +135,7 @@ environments: - conda: https://conda.anaconda.org/conda-forge/linux-64/gmp-6.3.0-hac33072_2.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/gmpy2-2.2.1-py312h7201bc8_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/googleapis-common-protos-1.70.0-pyhd8ed1ab_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/graphite2-1.3.14-h5888daf_0.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/grpcio-1.71.0-py312hdcb7bd4_1.conda - conda: https://conda.anaconda.org/conda-forge/noarch/grpcio-reflection-1.71.0-pyhd8ed1ab_0.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/grpcio-tools-1.71.0-py312h2a0d124_1.conda @@ -127,6 +143,7 @@ environments: - conda: https://conda.anaconda.org/conda-forge/linux-64/gxx_linux-64-13.3.0-hb14504d_11.conda - conda: https://conda.anaconda.org/conda-forge/noarch/h11-0.16.0-pyhd8ed1ab_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/h2-4.2.0-pyhd8ed1ab_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/harfbuzz-11.2.1-h3beb420_0.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/hf-transfer-0.1.9-py312h5bc9d60_1.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/hf-xet-1.1.5-py39h260a9e5_3.conda - conda: https://conda.anaconda.org/conda-forge/noarch/hpack-4.1.0-pyhd8ed1ab_0.conda @@ -143,6 +160,7 @@ environments: - conda: https://conda.anaconda.org/conda-forge/noarch/jupyter_core-5.8.1-pyh31011fe_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/kernel-headers_linux-64-3.10.0-he073ed8_18.conda - conda: 
https://conda.anaconda.org/conda-forge/linux-64/keyutils-1.6.1-h166bdaf_0.tar.bz2 + - conda: https://conda.anaconda.org/conda-forge/linux-64/kiwisolver-1.4.8-py312h84d6215_0.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/krb5-1.21.3-h659f571_0.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/lame-3.100-h166bdaf_1003.tar.bz2 - conda: https://conda.anaconda.org/conda-forge/linux-64/lcms2-2.17-h717163a_0.conda @@ -160,6 +178,8 @@ environments: - conda: https://conda.anaconda.org/conda-forge/linux-64/libbrotlidec-1.1.0-hb9d3cd8_3.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/libbrotlienc-1.1.0-hb9d3cd8_3.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/libcblas-3.9.0-32_h372d94f_mkl.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libclang-cpp20.1-20.1.7-default_h1df26ce_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libclang13-20.1.7-default_he06ed0a_0.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/libcrc32c-1.1.2-h9c3ff4c_0.tar.bz2 - conda: https://conda.anaconda.org/nvidia/linux-64/libcublas-12.9.0.13-0.conda - conda: https://conda.anaconda.org/nvidia/linux-64/libcublas-dev-12.9.0.13-0.conda @@ -167,6 +187,7 @@ environments: - conda: https://conda.anaconda.org/nvidia/linux-64/libcufft-dev-11.4.0.6-0.conda - conda: https://conda.anaconda.org/nvidia/linux-64/libcufile-1.14.0.30-4.conda - conda: https://conda.anaconda.org/nvidia/linux-64/libcufile-dev-1.14.0.30-4.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libcups-2.3.3-hb8b1518_5.conda - conda: https://conda.anaconda.org/nvidia/linux-64/libcurand-10.3.10.19-0.conda - conda: https://conda.anaconda.org/nvidia/linux-64/libcurand-dev-10.3.10.19-0.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/libcurl-8.14.1-h332b0f4_0.conda @@ -175,7 +196,9 @@ environments: - conda: https://conda.anaconda.org/nvidia/linux-64/libcusparse-12.5.9.5-0.conda - conda: 
https://conda.anaconda.org/nvidia/linux-64/libcusparse-dev-12.5.9.5-0.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/libdeflate-1.24-h86f0d12_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libdrm-2.4.125-hb9d3cd8_0.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/libedit-3.1.20250104-pl5321h7949ede_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libegl-1.7.0-ha4b6fd6_2.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/libev-4.33-hd590300_2.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/libevent-2.1.12-hf998b51_1.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/libexpat-2.7.0-h5888daf_0.conda @@ -190,7 +213,10 @@ environments: - conda: https://conda.anaconda.org/conda-forge/linux-64/libgettextpo-devel-0.24.1-h5888daf_0.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/libgfortran-15.1.0-h69a702a_3.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/libgfortran5-15.1.0-hcea5267_3.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libgl-1.7.0-ha4b6fd6_2.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/libglib-2.84.2-h3618099_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libglvnd-1.7.0-ha4b6fd6_2.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libglx-1.7.0-ha4b6fd6_2.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/libgomp-15.1.0-h767d61c_3.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/libgoogle-cloud-2.36.0-hc4361e1_1.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/libgoogle-cloud-storage-2.36.0-h0121fbd_1.conda @@ -199,11 +225,13 @@ environments: - conda: https://conda.anaconda.org/conda-forge/linux-64/libiconv-1.18-h4ce23a2_1.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/libjpeg-turbo-3.1.0-hb9d3cd8_0.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/liblapack-3.9.0-32_hc41d3b0_mkl.conda + - 
conda: https://conda.anaconda.org/conda-forge/linux-64/libllvm20-20.1.7-he9d0ab4_0.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/liblzma-5.8.1-hb9d3cd8_2.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/libnghttp2-1.64.0-h161d5f1_0.conda - conda: https://conda.anaconda.org/nvidia/linux-64/libnpp-12.4.0.27-0.conda - conda: https://conda.anaconda.org/nvidia/linux-64/libnpp-dev-12.4.0.27-0.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/libnsl-2.0.1-hb9d3cd8_1.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libntlm-1.8-hb9d3cd8_0.conda - conda: https://conda.anaconda.org/nvidia/linux-64/libnvfatbin-12.9.19-0.conda - conda: https://conda.anaconda.org/nvidia/linux-64/libnvfatbin-dev-12.9.19-0.conda - conda: https://conda.anaconda.org/nvidia/linux-64/libnvjitlink-12.9.41-0.conda @@ -211,11 +239,14 @@ environments: - conda: https://conda.anaconda.org/nvidia/linux-64/libnvjpeg-12.4.0.16-0.conda - conda: https://conda.anaconda.org/nvidia/linux-64/libnvjpeg-dev-12.4.0.16-0.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/libogg-1.3.5-hd0c01bc_1.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libopengl-1.7.0-ha4b6fd6_2.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/libopentelemetry-cpp-1.21.0-hd1b1c89_0.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/libopentelemetry-cpp-headers-1.21.0-ha770c72_0.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/libopus-1.5.2-hd0c01bc_0.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/libparquet-20.0.0-h3f30f2e_8_cuda.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libpciaccess-0.18-hb9d3cd8_0.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/libpng-1.6.49-h943b412_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libpq-17.5-h27ae623_0.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/libprotobuf-5.29.3-h501fc15_1.conda - conda: 
https://conda.anaconda.org/conda-forge/linux-64/libre2-11-2025.06.26-hba17884_0.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/libsanitizer-13.3.0-he8ea267_2.conda @@ -239,11 +270,14 @@ environments: - conda: https://conda.anaconda.org/conda-forge/linux-64/libxcrypt-4.4.36-hd590300_1.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/libxkbcommon-1.10.0-h65c71a3_0.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/libxml2-2.13.8-h4bc477f_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libxslt-1.1.39-h76b75d6_0.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/libzlib-1.3.1-hb9d3cd8_2.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/llvm-openmp-20.1.7-h024ca30_0.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/lz4-c-1.10.0-h5888daf_1.conda - conda: https://conda.anaconda.org/conda-forge/noarch/markdown-it-py-3.0.0-pyhd8ed1ab_1.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/markupsafe-3.0.2-py312h178313f_1.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/matplotlib-3.10.3-py312h7900ff3_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/matplotlib-base-3.10.3-py312hd3ec401_0.conda - conda: https://conda.modular.com/max-nightly/noarch/max-25.5.0.dev2025062705-release.conda - conda: https://conda.modular.com/max-nightly/linux-64/max-core-25.5.0.dev2025062705-release.conda - conda: https://conda.modular.com/max-nightly/noarch/max-pipelines-25.5.0.dev2025062705-release.conda @@ -260,6 +294,7 @@ environments: - conda: https://conda.anaconda.org/conda-forge/linux-64/msgspec-0.19.0-py312h66e93f0_1.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/multidict-6.5.1-py312h178313f_0.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/multiprocess-0.70.15-py312h98912ed_1.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/munkres-1.1.4-pyhd8ed1ab_1.conda - conda: 
https://conda.anaconda.org/conda-forge/noarch/mypy_extensions-1.1.0-pyha770c72_0.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/ncurses-6.5-h2d0b736_3.conda - conda: https://conda.anaconda.org/conda-forge/noarch/networkx-3.5-pyhe01879c_0.conda @@ -272,6 +307,7 @@ environments: - conda: https://conda.anaconda.org/conda-forge/linux-64/ocl-icd-2.3.3-hb9d3cd8_0.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/opencl-headers-2025.06.13-h5888daf_0.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/openjpeg-2.5.3-h5fbd93e_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/openldap-2.6.10-he970967_0.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/openssl-3.5.0-h7b32b05_1.conda - conda: https://conda.anaconda.org/conda-forge/noarch/opentelemetry-api-1.30.0-pyhd8ed1ab_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/opentelemetry-exporter-otlp-proto-common-1.30.0-pyhd8ed1ab_0.conda @@ -287,6 +323,7 @@ environments: - conda: https://conda.anaconda.org/conda-forge/noarch/pathspec-0.12.1-pyhd8ed1ab_1.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/pcre2-10.45-hc749103_0.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/pillow-11.2.1-py312h80c1187_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/pixman-0.46.2-h29eaf8c_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/platformdirs-4.3.8-pyhe01879c_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/prometheus-async-25.1.0-pyh29332c3_0.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/prometheus-cpp-1.3.0-ha5d0236_0.conda @@ -305,6 +342,8 @@ environments: - conda: https://conda.anaconda.org/conda-forge/noarch/pydantic-settings-2.10.1-pyh3cfb1c2_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/pygments-2.19.2-pyhd8ed1ab_0.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/pyinstrument-5.0.2-py312h66e93f0_0.conda + - conda: 
https://conda.anaconda.org/conda-forge/noarch/pyparsing-3.2.3-pyhd8ed1ab_1.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/pyside6-6.9.1-py312hdb827e4_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/pysocks-1.7.1-pyha55dd90_7.conda - conda: https://conda.anaconda.org/conda-forge/noarch/pysoundfile-0.13.1-pyhd8ed1ab_0.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/python-3.12.11-h9e4cc4f_0_cpython.conda @@ -320,6 +359,8 @@ environments: - conda: https://conda.anaconda.org/conda-forge/noarch/pytz-2025.2-pyhd8ed1ab_0.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/pyyaml-6.0.2-py312h178313f_2.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/pyzmq-27.0.0-py312hbf22597_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/qhull-2020.2-h434a139_5.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/qt6-main-6.9.1-h0384650_1.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/re2-2025.06.26-h9925aae_0.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/readline-8.2-h8c095d6_2.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/regex-2024.11.6-py312h66e93f0_0.conda @@ -360,18 +401,38 @@ environments: - conda: https://conda.anaconda.org/conda-forge/noarch/typing-inspection-0.4.1-pyhd8ed1ab_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/typing_extensions-4.14.0-pyhe01879c_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/tzdata-2025b-h78e105d_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/unicodedata2-16.0.0-py312h66e93f0_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/urllib3-2.5.0-pyhd8ed1ab_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/uvicorn-0.34.3-pyh31011fe_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/uvicorn-standard-0.34.3-h31011fe_0.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/uvloop-0.21.0-py312h66e93f0_1.conda - 
conda: https://conda.anaconda.org/conda-forge/linux-64/watchfiles-1.1.0-py312h12e396e_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/wayland-1.23.1-h3e06ad9_1.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/websockets-15.0.1-py312h66e93f0_0.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/wrapt-1.17.2-py312h66e93f0_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/xcb-util-0.4.1-h4f16b4b_2.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/xcb-util-cursor-0.1.5-hb9d3cd8_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/xcb-util-image-0.4.0-hb711507_2.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/xcb-util-keysyms-0.4.1-hb711507_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/xcb-util-renderutil-0.3.10-hb711507_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/xcb-util-wm-0.4.2-hb711507_0.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/xgrammar-0.1.19-py312he346f12_0.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/xkeyboard-config-2.45-hb9d3cd8_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/xorg-libice-1.1.2-hb9d3cd8_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/xorg-libsm-1.2.6-he73a12e_0.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/xorg-libx11-1.8.12-h4f16b4b_0.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/xorg-libxau-1.0.12-hb9d3cd8_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/xorg-libxcomposite-0.4.6-hb9d3cd8_2.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/xorg-libxcursor-1.2.3-hb9d3cd8_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/xorg-libxdamage-1.1.6-hb9d3cd8_0.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/xorg-libxdmcp-1.1.5-hb9d3cd8_0.conda + - conda: 
https://conda.anaconda.org/conda-forge/linux-64/xorg-libxext-1.3.6-hb9d3cd8_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/xorg-libxfixes-6.0.1-hb9d3cd8_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/xorg-libxi-1.8.2-hb9d3cd8_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/xorg-libxrandr-1.5.4-hb9d3cd8_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/xorg-libxrender-0.9.12-hb9d3cd8_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/xorg-libxtst-1.2.5-hb9d3cd8_3.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/xorg-libxxf86vm-1.1.6-hb9d3cd8_0.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/xxhash-0.8.3-hb47aa4a_0.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/yaml-0.2.5-h7f98852_2.tar.bz2 - conda: https://conda.anaconda.org/conda-forge/linux-64/yarl-1.20.1-py312h178313f_0.conda @@ -393,6 +454,7 @@ environments: - conda: https://conda.anaconda.org/conda-forge/noarch/aiohappyeyeballs-2.6.1-pyhd8ed1ab_0.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/aiohttp-3.12.13-py312h178313f_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/aiosignal-1.3.2-pyhd8ed1ab_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/alsa-lib-1.2.14-hb9d3cd8_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/annotated-types-0.7.0-pyhd8ed1ab_1.conda - conda: https://conda.anaconda.org/conda-forge/noarch/anyio-4.9.0-pyh29332c3_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/asgiref-3.8.1-pyhd8ed1ab_1.conda @@ -418,15 +480,19 @@ environments: - conda: https://conda.anaconda.org/conda-forge/noarch/backoff-2.2.1-pyhd8ed1ab_1.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/binutils_impl_linux-64-2.43-h4bf12b8_5.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/binutils_linux-64-2.43-h4852527_5.conda + - conda: 
https://conda.anaconda.org/conda-forge/linux-64/brotli-1.1.0-hb9d3cd8_3.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/brotli-bin-1.1.0-hb9d3cd8_3.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/brotli-python-1.1.0-py312h2ec8cdc_3.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/bzip2-1.0.8-h4bc722e_7.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/c-ares-1.34.5-hb9d3cd8_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/ca-certificates-2025.6.15-hbd8a1cb_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/cairo-1.18.4-h3394656_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/certifi-2025.6.15-pyhd8ed1ab_0.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/cffi-1.17.1-py312h06ac9bb_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/charset-normalizer-3.4.2-pyhd8ed1ab_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/click-8.2.1-pyh707e725_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/colorama-0.4.6-pyhd8ed1ab_1.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/contourpy-1.3.2-py312h68727a3_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/cpython-3.12.11-py312hd8ed1ab_0.conda - conda: https://conda.anaconda.org/nvidia/linux-64/cuda-cccl_linux-64-12.9.27-0.conda - conda: https://conda.anaconda.org/nvidia/linux-64/cuda-command-line-tools-12.9.0-0.conda @@ -473,11 +539,14 @@ environments: - conda: https://conda.anaconda.org/nvidia/linux-64/cuda-tools-12.9.0-0.conda - conda: https://conda.anaconda.org/nvidia/noarch/cuda-version-12.9-3.conda - conda: https://conda.anaconda.org/nvidia/linux-64/cuda-visual-tools-12.9.0-0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/cycler-0.12.1-pyhd8ed1ab_1.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/cyrus-sasl-2.1.28-hd9c7081_0.conda - conda: 
https://conda.anaconda.org/conda-forge/noarch/datasets-2.14.4-pyhd8ed1ab_0.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/dbus-1.16.2-h3c4dab8_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/deprecated-1.2.18-pyhd8ed1ab_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/dill-0.3.7-pyhd8ed1ab_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/dnspython-2.7.0-pyhff2d567_1.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/double-conversion-3.3.1-h5888daf_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/email-validator-2.2.0-pyhd8ed1ab_1.conda - conda: https://conda.anaconda.org/conda-forge/noarch/email_validator-2.2.0-hd8ed1ab_1.conda - conda: https://conda.anaconda.org/conda-forge/noarch/exceptiongroup-1.3.0-pyhd8ed1ab_0.conda @@ -485,7 +554,14 @@ environments: - conda: https://conda.anaconda.org/conda-forge/noarch/fastapi-0.115.14-pyhe01879c_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/fastapi-cli-0.0.7-pyhd8ed1ab_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/filelock-3.18.0-pyhd8ed1ab_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/font-ttf-dejavu-sans-mono-2.37-hab24e00_0.tar.bz2 + - conda: https://conda.anaconda.org/conda-forge/noarch/font-ttf-inconsolata-3.000-h77eed37_0.tar.bz2 + - conda: https://conda.anaconda.org/conda-forge/noarch/font-ttf-source-code-pro-2.038-h77eed37_0.tar.bz2 + - conda: https://conda.anaconda.org/conda-forge/noarch/font-ttf-ubuntu-0.83-h77eed37_3.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/fontconfig-2.15.0-h7e30c49_1.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/fonts-conda-ecosystem-1-0.tar.bz2 + - conda: https://conda.anaconda.org/conda-forge/noarch/fonts-conda-forge-1-0.tar.bz2 + - conda: https://conda.anaconda.org/conda-forge/linux-64/fonttools-4.58.4-py312h178313f_0.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/freetype-2.13.3-ha770c72_1.conda - 
conda: https://conda.anaconda.org/conda-forge/linux-64/frozenlist-1.6.0-py312hb9e946c_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/fsspec-2025.5.1-pyhd8ed1ab_0.conda @@ -500,6 +576,7 @@ environments: - conda: https://conda.anaconda.org/conda-forge/linux-64/gmp-6.3.0-hac33072_2.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/gmpy2-2.2.1-py312h7201bc8_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/googleapis-common-protos-1.70.0-pyhd8ed1ab_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/graphite2-1.3.14-h5888daf_0.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/grpcio-1.71.0-py312hdcb7bd4_1.conda - conda: https://conda.anaconda.org/conda-forge/noarch/grpcio-reflection-1.71.0-pyhd8ed1ab_0.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/grpcio-tools-1.71.0-py312h2a0d124_1.conda @@ -507,6 +584,7 @@ environments: - conda: https://conda.anaconda.org/conda-forge/linux-64/gxx_linux-64-13.3.0-hb14504d_11.conda - conda: https://conda.anaconda.org/conda-forge/noarch/h11-0.16.0-pyhd8ed1ab_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/h2-4.2.0-pyhd8ed1ab_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/harfbuzz-11.2.1-h3beb420_0.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/hf-transfer-0.1.9-py312h5bc9d60_1.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/hf-xet-1.1.5-py39h260a9e5_3.conda - conda: https://conda.anaconda.org/conda-forge/noarch/hpack-4.1.0-pyhd8ed1ab_0.conda @@ -523,6 +601,7 @@ environments: - conda: https://conda.anaconda.org/conda-forge/noarch/jupyter_core-5.8.1-pyh31011fe_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/kernel-headers_linux-64-3.10.0-he073ed8_18.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/keyutils-1.6.1-h166bdaf_0.tar.bz2 + - conda: https://conda.anaconda.org/conda-forge/linux-64/kiwisolver-1.4.8-py312h84d6215_0.conda - conda: 
https://conda.anaconda.org/conda-forge/linux-64/krb5-1.21.3-h659f571_0.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/lame-3.100-h166bdaf_1003.tar.bz2 - conda: https://conda.anaconda.org/conda-forge/linux-64/lcms2-2.17-h717163a_0.conda @@ -540,6 +619,8 @@ environments: - conda: https://conda.anaconda.org/conda-forge/linux-64/libbrotlidec-1.1.0-hb9d3cd8_3.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/libbrotlienc-1.1.0-hb9d3cd8_3.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/libcblas-3.9.0-32_h372d94f_mkl.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libclang-cpp20.1-20.1.7-default_h1df26ce_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libclang13-20.1.7-default_he06ed0a_0.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/libcrc32c-1.1.2-h9c3ff4c_0.tar.bz2 - conda: https://conda.anaconda.org/nvidia/linux-64/libcublas-12.9.0.13-0.conda - conda: https://conda.anaconda.org/nvidia/linux-64/libcublas-dev-12.9.0.13-0.conda @@ -547,6 +628,7 @@ environments: - conda: https://conda.anaconda.org/nvidia/linux-64/libcufft-dev-11.4.0.6-0.conda - conda: https://conda.anaconda.org/nvidia/linux-64/libcufile-1.14.0.30-4.conda - conda: https://conda.anaconda.org/nvidia/linux-64/libcufile-dev-1.14.0.30-4.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libcups-2.3.3-hb8b1518_5.conda - conda: https://conda.anaconda.org/nvidia/linux-64/libcurand-10.3.10.19-0.conda - conda: https://conda.anaconda.org/nvidia/linux-64/libcurand-dev-10.3.10.19-0.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/libcurl-8.14.1-h332b0f4_0.conda @@ -555,7 +637,9 @@ environments: - conda: https://conda.anaconda.org/nvidia/linux-64/libcusparse-12.5.9.5-0.conda - conda: https://conda.anaconda.org/nvidia/linux-64/libcusparse-dev-12.5.9.5-0.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/libdeflate-1.24-h86f0d12_0.conda + - conda: 
https://conda.anaconda.org/conda-forge/linux-64/libdrm-2.4.125-hb9d3cd8_0.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/libedit-3.1.20250104-pl5321h7949ede_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libegl-1.7.0-ha4b6fd6_2.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/libev-4.33-hd590300_2.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/libevent-2.1.12-hf998b51_1.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/libexpat-2.7.0-h5888daf_0.conda @@ -570,7 +654,10 @@ environments: - conda: https://conda.anaconda.org/conda-forge/linux-64/libgettextpo-devel-0.24.1-h5888daf_0.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/libgfortran-15.1.0-h69a702a_3.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/libgfortran5-15.1.0-hcea5267_3.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libgl-1.7.0-ha4b6fd6_2.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/libglib-2.84.2-h3618099_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libglvnd-1.7.0-ha4b6fd6_2.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libglx-1.7.0-ha4b6fd6_2.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/libgomp-15.1.0-h767d61c_3.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/libgoogle-cloud-2.36.0-hc4361e1_1.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/libgoogle-cloud-storage-2.36.0-h0121fbd_1.conda @@ -579,11 +666,13 @@ environments: - conda: https://conda.anaconda.org/conda-forge/linux-64/libiconv-1.18-h4ce23a2_1.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/libjpeg-turbo-3.1.0-hb9d3cd8_0.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/liblapack-3.9.0-32_hc41d3b0_mkl.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libllvm20-20.1.7-he9d0ab4_0.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/liblzma-5.8.1-hb9d3cd8_2.conda - 
conda: https://conda.anaconda.org/conda-forge/linux-64/libnghttp2-1.64.0-h161d5f1_0.conda - conda: https://conda.anaconda.org/nvidia/linux-64/libnpp-12.4.0.27-0.conda - conda: https://conda.anaconda.org/nvidia/linux-64/libnpp-dev-12.4.0.27-0.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/libnsl-2.0.1-hb9d3cd8_1.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libntlm-1.8-hb9d3cd8_0.conda - conda: https://conda.anaconda.org/nvidia/linux-64/libnvfatbin-12.9.19-0.conda - conda: https://conda.anaconda.org/nvidia/linux-64/libnvfatbin-dev-12.9.19-0.conda - conda: https://conda.anaconda.org/nvidia/linux-64/libnvjitlink-12.9.41-0.conda @@ -591,11 +680,14 @@ environments: - conda: https://conda.anaconda.org/nvidia/linux-64/libnvjpeg-12.4.0.16-0.conda - conda: https://conda.anaconda.org/nvidia/linux-64/libnvjpeg-dev-12.4.0.16-0.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/libogg-1.3.5-hd0c01bc_1.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libopengl-1.7.0-ha4b6fd6_2.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/libopentelemetry-cpp-1.21.0-hd1b1c89_0.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/libopentelemetry-cpp-headers-1.21.0-ha770c72_0.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/libopus-1.5.2-hd0c01bc_0.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/libparquet-20.0.0-h3f30f2e_8_cuda.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libpciaccess-0.18-hb9d3cd8_0.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/libpng-1.6.49-h943b412_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libpq-17.5-h27ae623_0.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/libprotobuf-5.29.3-h501fc15_1.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/libre2-11-2025.06.26-hba17884_0.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/libsanitizer-13.3.0-he8ea267_2.conda @@ 
-619,11 +711,14 @@ environments: - conda: https://conda.anaconda.org/conda-forge/linux-64/libxcrypt-4.4.36-hd590300_1.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/libxkbcommon-1.10.0-h65c71a3_0.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/libxml2-2.13.8-h4bc477f_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libxslt-1.1.39-h76b75d6_0.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/libzlib-1.3.1-hb9d3cd8_2.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/llvm-openmp-20.1.7-h024ca30_0.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/lz4-c-1.10.0-h5888daf_1.conda - conda: https://conda.anaconda.org/conda-forge/noarch/markdown-it-py-3.0.0-pyhd8ed1ab_1.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/markupsafe-3.0.2-py312h178313f_1.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/matplotlib-3.10.3-py312h7900ff3_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/matplotlib-base-3.10.3-py312hd3ec401_0.conda - conda: https://conda.modular.com/max-nightly/noarch/max-25.5.0.dev2025062705-release.conda - conda: https://conda.modular.com/max-nightly/linux-64/max-core-25.5.0.dev2025062705-release.conda - conda: https://conda.modular.com/max-nightly/noarch/max-pipelines-25.5.0.dev2025062705-release.conda @@ -640,6 +735,7 @@ environments: - conda: https://conda.anaconda.org/conda-forge/linux-64/msgspec-0.19.0-py312h66e93f0_1.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/multidict-6.5.1-py312h178313f_0.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/multiprocess-0.70.15-py312h98912ed_1.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/munkres-1.1.4-pyhd8ed1ab_1.conda - conda: https://conda.anaconda.org/conda-forge/noarch/mypy_extensions-1.1.0-pyha770c72_0.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/ncurses-6.5-h2d0b736_3.conda - conda: 
https://conda.anaconda.org/conda-forge/noarch/networkx-3.5-pyhe01879c_0.conda @@ -652,6 +748,7 @@ environments: - conda: https://conda.anaconda.org/conda-forge/linux-64/ocl-icd-2.3.3-hb9d3cd8_0.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/opencl-headers-2025.06.13-h5888daf_0.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/openjpeg-2.5.3-h5fbd93e_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/openldap-2.6.10-he970967_0.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/openssl-3.5.0-h7b32b05_1.conda - conda: https://conda.anaconda.org/conda-forge/noarch/opentelemetry-api-1.30.0-pyhd8ed1ab_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/opentelemetry-exporter-otlp-proto-common-1.30.0-pyhd8ed1ab_0.conda @@ -667,6 +764,7 @@ environments: - conda: https://conda.anaconda.org/conda-forge/noarch/pathspec-0.12.1-pyhd8ed1ab_1.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/pcre2-10.45-hc749103_0.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/pillow-11.2.1-py312h80c1187_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/pixman-0.46.2-h29eaf8c_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/platformdirs-4.3.8-pyhe01879c_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/prometheus-async-25.1.0-pyh29332c3_0.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/prometheus-cpp-1.3.0-ha5d0236_0.conda @@ -685,6 +783,8 @@ environments: - conda: https://conda.anaconda.org/conda-forge/noarch/pydantic-settings-2.10.1-pyh3cfb1c2_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/pygments-2.19.2-pyhd8ed1ab_0.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/pyinstrument-5.0.2-py312h66e93f0_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/pyparsing-3.2.3-pyhd8ed1ab_1.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/pyside6-6.9.1-py312hdb827e4_0.conda - conda: 
https://conda.anaconda.org/conda-forge/noarch/pysocks-1.7.1-pyha55dd90_7.conda - conda: https://conda.anaconda.org/conda-forge/noarch/pysoundfile-0.13.1-pyhd8ed1ab_0.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/python-3.12.11-h9e4cc4f_0_cpython.conda @@ -700,6 +800,8 @@ environments: - conda: https://conda.anaconda.org/conda-forge/noarch/pytz-2025.2-pyhd8ed1ab_0.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/pyyaml-6.0.2-py312h178313f_2.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/pyzmq-27.0.0-py312hbf22597_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/qhull-2020.2-h434a139_5.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/qt6-main-6.9.1-h0384650_1.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/re2-2025.06.26-h9925aae_0.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/readline-8.2-h8c095d6_2.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/regex-2024.11.6-py312h66e93f0_0.conda @@ -740,18 +842,38 @@ environments: - conda: https://conda.anaconda.org/conda-forge/noarch/typing-inspection-0.4.1-pyhd8ed1ab_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/typing_extensions-4.14.0-pyhe01879c_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/tzdata-2025b-h78e105d_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/unicodedata2-16.0.0-py312h66e93f0_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/urllib3-2.5.0-pyhd8ed1ab_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/uvicorn-0.34.3-pyh31011fe_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/uvicorn-standard-0.34.3-h31011fe_0.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/uvloop-0.21.0-py312h66e93f0_1.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/watchfiles-1.1.0-py312h12e396e_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/wayland-1.23.1-h3e06ad9_1.conda - 
conda: https://conda.anaconda.org/conda-forge/linux-64/websockets-15.0.1-py312h66e93f0_0.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/wrapt-1.17.2-py312h66e93f0_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/xcb-util-0.4.1-h4f16b4b_2.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/xcb-util-cursor-0.1.5-hb9d3cd8_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/xcb-util-image-0.4.0-hb711507_2.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/xcb-util-keysyms-0.4.1-hb711507_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/xcb-util-renderutil-0.3.10-hb711507_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/xcb-util-wm-0.4.2-hb711507_0.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/xgrammar-0.1.19-py312he346f12_0.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/xkeyboard-config-2.45-hb9d3cd8_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/xorg-libice-1.1.2-hb9d3cd8_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/xorg-libsm-1.2.6-he73a12e_0.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/xorg-libx11-1.8.12-h4f16b4b_0.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/xorg-libxau-1.0.12-hb9d3cd8_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/xorg-libxcomposite-0.4.6-hb9d3cd8_2.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/xorg-libxcursor-1.2.3-hb9d3cd8_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/xorg-libxdamage-1.1.6-hb9d3cd8_0.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/xorg-libxdmcp-1.1.5-hb9d3cd8_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/xorg-libxext-1.3.6-hb9d3cd8_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/xorg-libxfixes-6.0.1-hb9d3cd8_0.conda + - conda: 
https://conda.anaconda.org/conda-forge/linux-64/xorg-libxi-1.8.2-hb9d3cd8_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/xorg-libxrandr-1.5.4-hb9d3cd8_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/xorg-libxrender-0.9.12-hb9d3cd8_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/xorg-libxtst-1.2.5-hb9d3cd8_3.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/xorg-libxxf86vm-1.1.6-hb9d3cd8_0.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/xxhash-0.8.3-hb47aa4a_0.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/yaml-0.2.5-h7f98852_2.tar.bz2 - conda: https://conda.anaconda.org/conda-forge/linux-64/yarl-1.20.1-py312h178313f_0.conda @@ -828,6 +950,16 @@ packages: license_family: APACHE size: 13229 timestamp: 1734342253061 +- conda: https://conda.anaconda.org/conda-forge/linux-64/alsa-lib-1.2.14-hb9d3cd8_0.conda + sha256: b9214bc17e89bf2b691fad50d952b7f029f6148f4ac4fe7c60c08f093efdf745 + md5: 76df83c2a9035c54df5d04ff81bcc02d + depends: + - __glibc >=2.17,<3.0.a0 + - libgcc >=13 + license: LGPL-2.1-or-later + license_family: GPL + size: 566531 + timestamp: 1744668655747 - conda: https://conda.anaconda.org/conda-forge/noarch/annotated-types-0.7.0-pyhd8ed1ab_1.conda sha256: e0ea1ba78fbb64f17062601edda82097fcf815012cf52bb704150a2668110d48 md5: 2934f256a8acfe48f6ebb4fce6cde29c @@ -1144,6 +1276,31 @@ packages: license_family: GPL size: 36038 timestamp: 1749852914153 +- conda: https://conda.anaconda.org/conda-forge/linux-64/brotli-1.1.0-hb9d3cd8_3.conda + sha256: c969baaa5d7a21afb5ed4b8dd830f82b78e425caaa13d717766ed07a61630bec + md5: 5d08a0ac29e6a5a984817584775d4131 + depends: + - __glibc >=2.17,<3.0.a0 + - brotli-bin 1.1.0 hb9d3cd8_3 + - libbrotlidec 1.1.0 hb9d3cd8_3 + - libbrotlienc 1.1.0 hb9d3cd8_3 + - libgcc >=13 + license: MIT + license_family: MIT + size: 19810 + timestamp: 1749230148642 +- conda: 
https://conda.anaconda.org/conda-forge/linux-64/brotli-bin-1.1.0-hb9d3cd8_3.conda + sha256: ab74fa8c3d1ca0a055226be89e99d6798c65053e2d2d3c6cb380c574972cd4a7 + md5: 58178ef8ba927229fba6d84abf62c108 + depends: + - __glibc >=2.17,<3.0.a0 + - libbrotlidec 1.1.0 hb9d3cd8_3 + - libbrotlienc 1.1.0 hb9d3cd8_3 + - libgcc >=13 + license: MIT + license_family: MIT + size: 19390 + timestamp: 1749230137037 - conda: https://conda.anaconda.org/conda-forge/linux-64/brotli-python-1.1.0-py312h2ec8cdc_3.conda sha256: dc27c58dc717b456eee2d57d8bc71df3f562ee49368a2351103bc8f1b67da251 md5: a32e0c069f6c3dcac635f7b0b0dac67e @@ -1187,6 +1344,31 @@ packages: license: ISC size: 151069 timestamp: 1749990087500 +- conda: https://conda.anaconda.org/conda-forge/linux-64/cairo-1.18.4-h3394656_0.conda + sha256: 3bd6a391ad60e471de76c0e9db34986c4b5058587fbf2efa5a7f54645e28c2c7 + md5: 09262e66b19567aff4f592fb53b28760 + depends: + - __glibc >=2.17,<3.0.a0 + - fontconfig >=2.15.0,<3.0a0 + - fonts-conda-ecosystem + - freetype >=2.12.1,<3.0a0 + - icu >=75.1,<76.0a0 + - libexpat >=2.6.4,<3.0a0 + - libgcc >=13 + - libglib >=2.82.2,<3.0a0 + - libpng >=1.6.47,<1.7.0a0 + - libstdcxx >=13 + - libxcb >=1.17.0,<2.0a0 + - libzlib >=1.3.1,<2.0a0 + - pixman >=0.44.2,<1.0a0 + - xorg-libice >=1.1.2,<2.0a0 + - xorg-libsm >=1.2.5,<2.0a0 + - xorg-libx11 >=1.8.11,<2.0a0 + - xorg-libxext >=1.3.6,<2.0a0 + - xorg-libxrender >=0.9.12,<0.10.0a0 + license: LGPL-2.1-only or MPL-1.1 + size: 978114 + timestamp: 1741554591855 - conda: https://conda.anaconda.org/conda-forge/noarch/certifi-2025.6.15-pyhd8ed1ab_0.conda sha256: d71c85835813072cd6d7ce4b24be34215cd90c104785b15a5d58f4cd0cb50778 md5: 781d068df0cc2407d4db0ecfbb29225b @@ -1237,6 +1419,20 @@ packages: license_family: BSD size: 27011 timestamp: 1733218222191 +- conda: https://conda.anaconda.org/conda-forge/linux-64/contourpy-1.3.2-py312h68727a3_0.conda + sha256: 4c8f2aa34aa031229e6f8aa18f146bce7987e26eae9c6503053722a8695ebf0c + md5: e688276449452cdfe9f8f5d3e74c23f6 + depends: 
+ - __glibc >=2.17,<3.0.a0 + - libgcc >=13 + - libstdcxx >=13 + - numpy >=1.23 + - python >=3.12,<3.13.0a0 + - python_abi 3.12.* *_cp312 + license: BSD-3-Clause + license_family: BSD + size: 276533 + timestamp: 1744743235779 - conda: https://conda.anaconda.org/conda-forge/noarch/cpython-3.12.11-py312hd8ed1ab_0.conda noarch: generic sha256: 7e7bc8e73a2f3736444a8564cbece7216464c00f0bc38e604b0c792ff60d621a @@ -1770,6 +1966,30 @@ packages: license: LicenseRef-NVIDIA-End-User-License-Agreement size: 16987 timestamp: 1745696228542 +- conda: https://conda.anaconda.org/conda-forge/noarch/cycler-0.12.1-pyhd8ed1ab_1.conda + sha256: 9827efa891e507a91a8a2acf64e210d2aff394e1cde432ad08e1f8c66b12293c + md5: 44600c4667a319d67dbe0681fc0bc833 + depends: + - python >=3.9 + license: BSD-3-Clause + license_family: BSD + size: 13399 + timestamp: 1733332563512 +- conda: https://conda.anaconda.org/conda-forge/linux-64/cyrus-sasl-2.1.28-hd9c7081_0.conda + sha256: ee09ad7610c12c7008262d713416d0b58bf365bc38584dce48950025850bdf3f + md5: cae723309a49399d2949362f4ab5c9e4 + depends: + - __glibc >=2.17,<3.0.a0 + - krb5 >=1.21.3,<1.22.0a0 + - libgcc >=13 + - libntlm >=1.8,<2.0a0 + - libstdcxx >=13 + - libxcrypt >=4.4.36 + - openssl >=3.5.0,<4.0a0 + license: BSD-3-Clause-Attribution + license_family: BSD + size: 209774 + timestamp: 1750239039316 - conda: https://conda.anaconda.org/conda-forge/noarch/datasets-2.14.4-pyhd8ed1ab_0.conda sha256: 7e09bd083a609138b780fcc4535924cb96814d2c908a36d4c64a2ba9ee3efe7f md5: 3e087f072ce03c43a9b60522f5d0ca2f @@ -1846,6 +2066,17 @@ packages: license_family: OTHER size: 172172 timestamp: 1733256829961 +- conda: https://conda.anaconda.org/conda-forge/linux-64/double-conversion-3.3.1-h5888daf_0.conda + sha256: 1bcc132fbcc13f9ad69da7aa87f60ea41de7ed4d09f3a00ff6e0e70e1c690bc2 + md5: bfd56492d8346d669010eccafe0ba058 + depends: + - __glibc >=2.17,<3.0.a0 + - libgcc >=13 + - libstdcxx >=13 + license: BSD-3-Clause + license_family: BSD + size: 69544 + timestamp: 
1739569648873 - conda: https://conda.anaconda.org/conda-forge/noarch/email-validator-2.2.0-pyhd8ed1ab_1.conda sha256: b91a19eb78edfc2dbb36de9a67f74ee2416f1b5273dd7327abe53f2dbf864736 md5: da16dd3b0b71339060cd44cb7110ddf9 @@ -1922,6 +2153,34 @@ packages: license: Unlicense size: 17887 timestamp: 1741969612334 +- conda: https://conda.anaconda.org/conda-forge/noarch/font-ttf-dejavu-sans-mono-2.37-hab24e00_0.tar.bz2 + sha256: 58d7f40d2940dd0a8aa28651239adbf5613254df0f75789919c4e6762054403b + md5: 0c96522c6bdaed4b1566d11387caaf45 + license: BSD-3-Clause + license_family: BSD + size: 397370 + timestamp: 1566932522327 +- conda: https://conda.anaconda.org/conda-forge/noarch/font-ttf-inconsolata-3.000-h77eed37_0.tar.bz2 + sha256: c52a29fdac682c20d252facc50f01e7c2e7ceac52aa9817aaf0bb83f7559ec5c + md5: 34893075a5c9e55cdafac56607368fc6 + license: OFL-1.1 + license_family: Other + size: 96530 + timestamp: 1620479909603 +- conda: https://conda.anaconda.org/conda-forge/noarch/font-ttf-source-code-pro-2.038-h77eed37_0.tar.bz2 + sha256: 00925c8c055a2275614b4d983e1df637245e19058d79fc7dd1a93b8d9fb4b139 + md5: 4d59c254e01d9cde7957100457e2d5fb + license: OFL-1.1 + license_family: Other + size: 700814 + timestamp: 1620479612257 +- conda: https://conda.anaconda.org/conda-forge/noarch/font-ttf-ubuntu-0.83-h77eed37_3.conda + sha256: 2821ec1dc454bd8b9a31d0ed22a7ce22422c0aef163c59f49dfdf915d0f0ca14 + md5: 49023d73832ef61042f6a237cb2687e7 + license: LicenseRef-Ubuntu-Font-Licence-Version-1.0 + license_family: Other + size: 1620504 + timestamp: 1727511233259 - conda: https://conda.anaconda.org/conda-forge/linux-64/fontconfig-2.15.0-h7e30c49_1.conda sha256: 7093aa19d6df5ccb6ca50329ef8510c6acb6b0d8001191909397368b65b02113 md5: 8f5b0b297b59e1ac160ad4beec99dbee @@ -1936,6 +2195,42 @@ packages: license_family: MIT size: 265599 timestamp: 1730283881107 +- conda: https://conda.anaconda.org/conda-forge/noarch/fonts-conda-ecosystem-1-0.tar.bz2 + sha256: 
a997f2f1921bb9c9d76e6fa2f6b408b7fa549edd349a77639c9fe7a23ea93e61 + md5: fee5683a3f04bd15cbd8318b096a27ab + depends: + - fonts-conda-forge + license: BSD-3-Clause + license_family: BSD + size: 3667 + timestamp: 1566974674465 +- conda: https://conda.anaconda.org/conda-forge/noarch/fonts-conda-forge-1-0.tar.bz2 + sha256: 53f23a3319466053818540bcdf2091f253cbdbab1e0e9ae7b9e509dcaa2a5e38 + md5: f766549260d6815b0c52253f1fb1bb29 + depends: + - font-ttf-dejavu-sans-mono + - font-ttf-inconsolata + - font-ttf-source-code-pro + - font-ttf-ubuntu + license: BSD-3-Clause + license_family: BSD + size: 4102 + timestamp: 1566932280397 +- conda: https://conda.anaconda.org/conda-forge/linux-64/fonttools-4.58.4-py312h178313f_0.conda + sha256: aa29952ac29ab4c4dad091794513241c1f732c55c58ba109f02550bc83081dc9 + md5: 223a4616e3db7336569eafefac04ebbf + depends: + - __glibc >=2.17,<3.0.a0 + - brotli + - libgcc >=13 + - munkres + - python >=3.12,<3.13.0a0 + - python_abi 3.12.* *_cp312 + - unicodedata2 >=15.1.0 + license: MIT + license_family: MIT + size: 2864513 + timestamp: 1749848613494 - conda: https://conda.anaconda.org/conda-forge/linux-64/freetype-2.13.3-ha770c72_1.conda sha256: 7ef7d477c43c12a5b4cddcf048a83277414512d1116aba62ebadfa7056a7d84f md5: 9ccd736d31e0c6e41f54e704e5312811 @@ -2100,6 +2395,17 @@ packages: license_family: APACHE size: 142129 timestamp: 1744688907411 +- conda: https://conda.anaconda.org/conda-forge/linux-64/graphite2-1.3.14-h5888daf_0.conda + sha256: cac69f3ff7756912bbed4c28363de94f545856b35033c0b86193366b95f5317d + md5: 951ff8d9e5536896408e89d63230b8d5 + depends: + - __glibc >=2.17,<3.0.a0 + - libgcc >=13 + - libstdcxx >=13 + license: LGPL-2.0-or-later + license_family: LGPL + size: 98419 + timestamp: 1750079957535 - conda: https://conda.anaconda.org/conda-forge/linux-64/grpcio-1.71.0-py312hdcb7bd4_1.conda sha256: fabc35be513624005d9bc8585f807c3d8386bcf2f172631750305bf2f890e90f md5: 5aa1cb5ae0ce3986f70c155608865134 @@ -2190,6 +2496,26 @@ packages: license_family: 
MIT size: 53888 timestamp: 1738578623567 +- conda: https://conda.anaconda.org/conda-forge/linux-64/harfbuzz-11.2.1-h3beb420_0.conda + sha256: 5bd0f3674808862838d6e2efc0b3075e561c34309c5c2f4c976f7f1f57c91112 + md5: 0e6e192d4b3d95708ad192d957cf3163 + depends: + - __glibc >=2.17,<3.0.a0 + - cairo >=1.18.4,<2.0a0 + - freetype + - graphite2 + - icu >=75.1,<76.0a0 + - libexpat >=2.7.0,<3.0a0 + - libfreetype >=2.13.3 + - libfreetype6 >=2.13.3 + - libgcc >=13 + - libglib >=2.84.1,<3.0a0 + - libstdcxx >=13 + - libzlib >=1.3.1,<2.0a0 + license: MIT + license_family: MIT + size: 1730226 + timestamp: 1747091044218 - conda: https://conda.anaconda.org/conda-forge/linux-64/hf-transfer-0.1.9-py312h5bc9d60_1.conda sha256: 21acb87a6403f88b2dbdefb79a537bc8fe871b86c60f9b690206eaf7ad1f009c md5: 3639aa7b1297e680220f52c2b8a21200 @@ -2380,6 +2706,19 @@ packages: license: LGPL-2.1-or-later size: 117831 timestamp: 1646151697040 +- conda: https://conda.anaconda.org/conda-forge/linux-64/kiwisolver-1.4.8-py312h84d6215_0.conda + sha256: 3ce99d721c1543f6f8f5155e53eef11be47b2f5942a8d1060de6854f9d51f246 + md5: 6713467dc95509683bfa3aca08524e8a + depends: + - __glibc >=2.17,<3.0.a0 + - libgcc >=13 + - libstdcxx >=13 + - python >=3.12,<3.13.0a0 + - python_abi 3.12.* *_cp312 + license: BSD-3-Clause + license_family: BSD + size: 71649 + timestamp: 1736908364705 - conda: https://conda.anaconda.org/conda-forge/linux-64/krb5-1.21.3-h659f571_0.conda sha256: 99df692f7a8a5c27cd14b5fb1374ee55e756631b9c3d659ed3ee60830249b238 md5: 3f43953b7d3fb3aaa1d0d0723d91e368 @@ -2626,6 +2965,30 @@ packages: license_family: BSD size: 17280 timestamp: 1750388682101 +- conda: https://conda.anaconda.org/conda-forge/linux-64/libclang-cpp20.1-20.1.7-default_h1df26ce_0.conda + sha256: 4194c75a91a9c790cbe96c3c33fc2f388274d1be85ec884ce7c88d7e8f9d96f2 + md5: f9ef7bce54a7673cdbc2fadd8bca1956 + depends: + - __glibc >=2.17,<3.0.a0 + - libgcc >=13 + - libllvm20 >=20.1.7,<20.2.0a0 + - libstdcxx >=13 + license: Apache-2.0 WITH 
LLVM-exception + license_family: Apache + size: 20925717 + timestamp: 1749876303353 +- conda: https://conda.anaconda.org/conda-forge/linux-64/libclang13-20.1.7-default_he06ed0a_0.conda + sha256: 6541d19a1659062dbf8823d6a1206e28f788369bcf7af9171d7c9069c1d35932 + md5: 846875a174de6b6ff19e205a7d90eb74 + depends: + - __glibc >=2.17,<3.0.a0 + - libgcc >=13 + - libllvm20 >=20.1.7,<20.2.0a0 + - libstdcxx >=13 + license: Apache-2.0 WITH LLVM-exception + license_family: Apache + size: 12116245 + timestamp: 1749876520951 - conda: https://conda.anaconda.org/conda-forge/linux-64/libcrc32c-1.1.2-h9c3ff4c_0.tar.bz2 sha256: fd1d153962764433fe6233f34a72cdeed5dcf8a883a85769e8295ce940b5b0c5 md5: c965a5aa0d5c1c37ffc62dff36e28400 @@ -2714,6 +3077,19 @@ packages: license: LicenseRef-NVIDIA-End-User-License-Agreement size: 30710 timestamp: 1744072713584 +- conda: https://conda.anaconda.org/conda-forge/linux-64/libcups-2.3.3-hb8b1518_5.conda + sha256: cb83980c57e311783ee831832eb2c20ecb41e7dee6e86e8b70b8cef0e43eab55 + md5: d4a250da4737ee127fb1fa6452a9002e + depends: + - __glibc >=2.17,<3.0.a0 + - krb5 >=1.21.3,<1.22.0a0 + - libgcc >=13 + - libstdcxx >=13 + - libzlib >=1.3.1,<2.0a0 + license: Apache-2.0 + license_family: Apache + size: 4523621 + timestamp: 1749905341688 - conda: https://conda.anaconda.org/nvidia/linux-64/libcurand-10.3.10.19-0.conda sha256: 65d4f3e3286af1165679baee2cb81ba518a186d27362293321c8cfe1317f87b6 md5: 21dcbdfb0482a369aab2b1e0abb6a399 @@ -2820,6 +3196,17 @@ packages: license_family: MIT size: 72573 timestamp: 1747040452262 +- conda: https://conda.anaconda.org/conda-forge/linux-64/libdrm-2.4.125-hb9d3cd8_0.conda + sha256: f53458db897b93b4a81a6dbfd7915ed8fa4a54951f97c698dde6faa028aadfd2 + md5: 4c0ab57463117fbb8df85268415082f5 + depends: + - __glibc >=2.17,<3.0.a0 + - libgcc >=13 + - libpciaccess >=0.18,<0.19.0a0 + license: MIT + license_family: MIT + size: 246161 + timestamp: 1749904704373 - conda: 
https://conda.anaconda.org/conda-forge/linux-64/libedit-3.1.20250104-pl5321h7949ede_0.conda sha256: d789471216e7aba3c184cd054ed61ce3f6dac6f87a50ec69291b9297f8c18724 md5: c277e0a4d549b03ac1e9d6cbbe3d017b @@ -2832,6 +3219,15 @@ packages: license_family: BSD size: 134676 timestamp: 1738479519902 +- conda: https://conda.anaconda.org/conda-forge/linux-64/libegl-1.7.0-ha4b6fd6_2.conda + sha256: 7fd5408d359d05a969133e47af580183fbf38e2235b562193d427bb9dad79723 + md5: c151d5eb730e9b7480e6d48c0fc44048 + depends: + - __glibc >=2.17,<3.0.a0 + - libglvnd 1.7.0 ha4b6fd6_2 + license: LicenseRef-libglvnd + size: 44840 + timestamp: 1731330973553 - conda: https://conda.anaconda.org/conda-forge/linux-64/libev-4.33-hd590300_2.conda sha256: 1cd6048169fa0395af74ed5d8f1716e22c19a81a8a36f934c110ca3ad4dd27b4 md5: 172bf1cd1ff8629f2b1179945ed45055 @@ -2978,6 +3374,16 @@ packages: license: GPL-3.0-only WITH GCC-exception-3.1 size: 1565627 timestamp: 1750808236464 +- conda: https://conda.anaconda.org/conda-forge/linux-64/libgl-1.7.0-ha4b6fd6_2.conda + sha256: dc2752241fa3d9e40ce552c1942d0a4b5eeb93740c9723873f6fcf8d39ef8d2d + md5: 928b8be80851f5d8ffb016f9c81dae7a + depends: + - __glibc >=2.17,<3.0.a0 + - libglvnd 1.7.0 ha4b6fd6_2 + - libglx 1.7.0 ha4b6fd6_2 + license: LicenseRef-libglvnd + size: 134712 + timestamp: 1731330998354 - conda: https://conda.anaconda.org/conda-forge/linux-64/libglib-2.84.2-h3618099_0.conda sha256: a6b5cf4d443044bc9a0293dd12ca2015f0ebe5edfdc9c4abdde0b9947f9eb7bd md5: 072ab14a02164b7c0c089055368ff776 @@ -2993,6 +3399,24 @@ packages: license: LGPL-2.1-or-later size: 3955066 timestamp: 1747836671118 +- conda: https://conda.anaconda.org/conda-forge/linux-64/libglvnd-1.7.0-ha4b6fd6_2.conda + sha256: 1175f8a7a0c68b7f81962699751bb6574e6f07db4c9f72825f978e3016f46850 + md5: 434ca7e50e40f4918ab701e3facd59a0 + depends: + - __glibc >=2.17,<3.0.a0 + license: LicenseRef-libglvnd + size: 132463 + timestamp: 1731330968309 +- conda: 
https://conda.anaconda.org/conda-forge/linux-64/libglx-1.7.0-ha4b6fd6_2.conda + sha256: 2d35a679624a93ce5b3e9dd301fff92343db609b79f0363e6d0ceb3a6478bfa7 + md5: c8013e438185f33b13814c5c488acd5c + depends: + - __glibc >=2.17,<3.0.a0 + - libglvnd 1.7.0 ha4b6fd6_2 + - xorg-libx11 >=1.8.10,<2.0a0 + license: LicenseRef-libglvnd + size: 75504 + timestamp: 1731330988898 - conda: https://conda.anaconda.org/conda-forge/linux-64/libgomp-15.1.0-h767d61c_3.conda sha256: 43710ab4de0cd7ff8467abff8d11e7bb0e36569df04ce1c099d48601818f11d1 md5: 3cd1a7238a0dd3d0860fdefc496cc854 @@ -3106,6 +3530,20 @@ packages: license_family: BSD size: 17284 timestamp: 1750388691797 +- conda: https://conda.anaconda.org/conda-forge/linux-64/libllvm20-20.1.7-he9d0ab4_0.conda + sha256: 5c51416c10e84ac6a73560c82e20f99788b1395ce431c450391966d07a444fa6 + md5: 63f1accca4913e6b66a2d546c30ff4db + depends: + - __glibc >=2.17,<3.0.a0 + - libgcc >=13 + - libstdcxx >=13 + - libxml2 >=2.13.8,<2.14.0a0 + - libzlib >=1.3.1,<2.0a0 + - zstd >=1.5.7,<1.6.0a0 + license: Apache-2.0 WITH LLVM-exception + license_family: Apache + size: 43026762 + timestamp: 1749836200754 - conda: https://conda.anaconda.org/conda-forge/linux-64/liblzma-5.8.1-hb9d3cd8_2.conda sha256: f2591c0069447bbe28d4d696b7fcb0c5bd0b4ac582769b89addbcf26fb3430d8 md5: 1a580f7796c7bf6393fddb8bbbde58dc @@ -3168,6 +3606,15 @@ packages: license_family: GPL size: 33731 timestamp: 1750274110928 +- conda: https://conda.anaconda.org/conda-forge/linux-64/libntlm-1.8-hb9d3cd8_0.conda + sha256: 3b3f19ced060013c2dd99d9d46403be6d319d4601814c772a3472fe2955612b0 + md5: 7c7927b404672409d9917d49bff5f2d6 + depends: + - __glibc >=2.17,<3.0.a0 + - libgcc >=13 + license: LGPL-2.1-or-later + size: 33418 + timestamp: 1734670021371 - conda: https://conda.anaconda.org/nvidia/linux-64/libnvfatbin-12.9.19-0.conda sha256: 6518f895024b6c03f384369e4870b8b2068c04e39677f074b96a969d84c1c055 md5: fb75d10b7d3ae4bdb7697dfcdfa8991c @@ -3251,6 +3698,15 @@ packages: license_family: BSD size: 
218500 timestamp: 1745825989535 +- conda: https://conda.anaconda.org/conda-forge/linux-64/libopengl-1.7.0-ha4b6fd6_2.conda + sha256: 215086c108d80349e96051ad14131b751d17af3ed2cb5a34edd62fa89bfe8ead + md5: 7df50d44d4a14d6c31a2c54f2cd92157 + depends: + - __glibc >=2.17,<3.0.a0 + - libglvnd 1.7.0 ha4b6fd6_2 + license: LicenseRef-libglvnd + size: 50757 + timestamp: 1731330993524 - conda: https://conda.anaconda.org/conda-forge/linux-64/libopentelemetry-cpp-1.21.0-hd1b1c89_0.conda sha256: b88de51fa55513483e7c80c43d38ddd3559f8d17921879e4c99909ba66e1c16b md5: 4b25cd8720fd8d5319206e4f899f2707 @@ -3303,6 +3759,16 @@ packages: license: Apache-2.0 size: 1214865 timestamp: 1750865956895 +- conda: https://conda.anaconda.org/conda-forge/linux-64/libpciaccess-0.18-hb9d3cd8_0.conda + sha256: 0bd91de9b447a2991e666f284ae8c722ffb1d84acb594dbd0c031bd656fa32b2 + md5: 70e3400cbbfa03e96dcde7fc13e38c7b + depends: + - __glibc >=2.17,<3.0.a0 + - libgcc >=13 + license: MIT + license_family: MIT + size: 28424 + timestamp: 1749901812541 - conda: https://conda.anaconda.org/conda-forge/linux-64/libpng-1.6.49-h943b412_0.conda sha256: c8f5dc929ba5fcee525a66777498e03bbcbfefc05a0773e5163bb08ac5122f1a md5: 37511c874cf3b8d0034c8d24e73c0884 @@ -3313,6 +3779,19 @@ packages: license: zlib-acknowledgement size: 289506 timestamp: 1750095629466 +- conda: https://conda.anaconda.org/conda-forge/linux-64/libpq-17.5-h27ae623_0.conda + sha256: 2dbcef0db82e0e7b6895b6c0dadd3d36c607044c40290c7ca10656f3fca3166f + md5: 6458be24f09e1b034902ab44fe9de908 + depends: + - __glibc >=2.17,<3.0.a0 + - icu >=75.1,<76.0a0 + - krb5 >=1.21.3,<1.22.0a0 + - libgcc >=13 + - openldap >=2.6.9,<2.7.0a0 + - openssl >=3.5.0,<4.0a0 + license: PostgreSQL + size: 2680582 + timestamp: 1746743259857 - conda: https://conda.anaconda.org/conda-forge/linux-64/libprotobuf-5.29.3-h501fc15_1.conda sha256: 691af28446345674c6b3fb864d0e1a1574b6cc2f788e0f036d73a6b05dcf81cf md5: edb86556cf4a0c133e7932a1597ff236 @@ -3598,6 +4077,16 @@ packages: 
license_family: MIT size: 690864 timestamp: 1746634244154 +- conda: https://conda.anaconda.org/conda-forge/linux-64/libxslt-1.1.39-h76b75d6_0.conda + sha256: 684e9b67ef7b9ca0ca993762eeb39705ec58e2e7f958555c758da7ef416db9f3 + md5: e71f31f8cfb0a91439f2086fc8aa0461 + depends: + - libgcc-ng >=12 + - libxml2 >=2.12.1,<2.14.0a0 + license: MIT + license_family: MIT + size: 254297 + timestamp: 1701628814990 - conda: https://conda.anaconda.org/conda-forge/linux-64/libzlib-1.3.1-hb9d3cd8_2.conda sha256: d4bfe88d7cb447768e31650f06257995601f89076080e76df55e3112d4e47dc4 md5: edb0dca6bc32e4f4789199455a1dbeb8 @@ -3656,6 +4145,47 @@ packages: license_family: BSD size: 24604 timestamp: 1733219911494 +- conda: https://conda.anaconda.org/conda-forge/linux-64/matplotlib-3.10.3-py312h7900ff3_0.conda + sha256: 2255888d215fb1438b968bd7e5fd89580c25eb90f4010aad38dda8aac7b642c8 + md5: 40e02247b1467ce6fff28cad870dc833 + depends: + - matplotlib-base >=3.10.3,<3.10.4.0a0 + - pyside6 >=6.7.2 + - python >=3.12,<3.13.0a0 + - python_abi 3.12.* *_cp312 + - tornado >=5 + license: PSF-2.0 + license_family: PSF + size: 17376 + timestamp: 1746820703075 +- conda: https://conda.anaconda.org/conda-forge/linux-64/matplotlib-base-3.10.3-py312hd3ec401_0.conda + sha256: 3b5be100ddfcd5697140dbb8d4126e3afd0147d4033defd6c6eeac78fe089bd2 + md5: 2d69618b52d70970c81cc598e4b51118 + depends: + - __glibc >=2.17,<3.0.a0 + - contourpy >=1.0.1 + - cycler >=0.10 + - fonttools >=4.22.0 + - freetype + - kiwisolver >=1.3.1 + - libfreetype >=2.13.3 + - libfreetype6 >=2.13.3 + - libgcc >=13 + - libstdcxx >=13 + - numpy >=1.19,<3 + - numpy >=1.23 + - packaging >=20.0 + - pillow >=8 + - pyparsing >=2.3.1 + - python >=3.12,<3.13.0a0 + - python-dateutil >=2.7 + - python_abi 3.12.* *_cp312 + - qhull >=2020.2,<2020.3.0a0 + - tk >=8.6.13,<8.7.0a0 + license: PSF-2.0 + license_family: PSF + size: 8188885 + timestamp: 1746820680864 - conda: https://conda.modular.com/max-nightly/noarch/max-25.5.0.dev2025062705-release.conda noarch: 
python sha256: 311e01e00ce7302eb97c263c021700c7baa1a6d9be9477f60edcfd17fbf4b49d @@ -3910,6 +4440,15 @@ packages: license_family: BSD size: 335666 timestamp: 1695459025249 +- conda: https://conda.anaconda.org/conda-forge/noarch/munkres-1.1.4-pyhd8ed1ab_1.conda + sha256: d09c47c2cf456de5c09fa66d2c3c5035aa1fa228a1983a433c47b876aa16ce90 + md5: 37293a85a0f4f77bbd9cf7aaefc62609 + depends: + - python >=3.9 + license: Apache-2.0 + license_family: Apache + size: 15851 + timestamp: 1749895533014 - conda: https://conda.anaconda.org/conda-forge/noarch/mypy_extensions-1.1.0-pyha770c72_0.conda sha256: 6ed158e4e5dd8f6a10ad9e525631e35cee8557718f83de7a4e3966b1f772c4b1 md5: e9c622e0d00fa24a6292279af3ab6d06 @@ -4059,6 +4598,20 @@ packages: license_family: BSD size: 342988 timestamp: 1733816638720 +- conda: https://conda.anaconda.org/conda-forge/linux-64/openldap-2.6.10-he970967_0.conda + sha256: cb0b07db15e303e6f0a19646807715d28f1264c6350309a559702f4f34f37892 + md5: 2e5bf4f1da39c0b32778561c3c4e5878 + depends: + - __glibc >=2.17,<3.0.a0 + - cyrus-sasl >=2.1.27,<3.0a0 + - krb5 >=1.21.3,<1.22.0a0 + - libgcc >=13 + - libstdcxx >=13 + - openssl >=3.5.0,<4.0a0 + license: OLDAP-2.8 + license_family: BSD + size: 780253 + timestamp: 1748010165522 - conda: https://conda.anaconda.org/conda-forge/linux-64/openssl-3.5.0-h7b32b05_1.conda sha256: b4491077c494dbf0b5eaa6d87738c22f2154e9277e5293175ec187634bd808a0 md5: de356753cfdbffcde5bb1e86e3aa6cd0 @@ -4288,6 +4841,17 @@ packages: license: HPND size: 42506161 timestamp: 1746646366556 +- conda: https://conda.anaconda.org/conda-forge/linux-64/pixman-0.46.2-h29eaf8c_0.conda + sha256: 6cb261595b5f0ae7306599f2bb55ef6863534b6d4d1bc0dcfdfa5825b0e4e53d + md5: 39b4228a867772d610c02e06f939a5b8 + depends: + - __glibc >=2.17,<3.0.a0 + - libgcc >=13 + - libstdcxx >=13 + license: MIT + license_family: MIT + size: 402222 + timestamp: 1749552884791 - conda: https://conda.anaconda.org/conda-forge/noarch/platformdirs-4.3.8-pyhe01879c_0.conda sha256: 
0f48999a28019c329cd3f6fd2f01f09fc32cc832f7d6bbe38087ddac858feaa3 md5: 424844562f5d337077b445ec6b1398a7 @@ -4517,6 +5081,36 @@ packages: license_family: BSD size: 184288 timestamp: 1748126832356 +- conda: https://conda.anaconda.org/conda-forge/noarch/pyparsing-3.2.3-pyhd8ed1ab_1.conda + sha256: b92afb79b52fcf395fd220b29e0dd3297610f2059afac45298d44e00fcbf23b6 + md5: 513d3c262ee49b54a8fec85c5bc99764 + depends: + - python >=3.9 + license: MIT + license_family: MIT + size: 95988 + timestamp: 1743089832359 +- conda: https://conda.anaconda.org/conda-forge/linux-64/pyside6-6.9.1-py312hdb827e4_0.conda + sha256: 782c46d57daf2e027cd4d6a7c440ccecf09aca34e200d209b1d1a4ebb0548789 + md5: 843ad8ae4523f47a7f636f576750c487 + depends: + - __glibc >=2.17,<3.0.a0 + - libclang13 >=20.1.6 + - libegl >=1.7.0,<2.0a0 + - libgcc >=13 + - libgl >=1.7.0,<2.0a0 + - libopengl >=1.7.0,<2.0a0 + - libstdcxx >=13 + - libxml2 >=2.13.8,<2.14.0a0 + - libxslt >=1.1.39,<2.0a0 + - python >=3.12,<3.13.0a0 + - python_abi 3.12.* *_cp312 + - qt6-main 6.9.1.* + - qt6-main >=6.9.1,<6.10.0a0 + license: LGPL-3.0-only + license_family: LGPL + size: 10133664 + timestamp: 1749047343971 - conda: https://conda.anaconda.org/conda-forge/noarch/pysocks-1.7.1-pyha55dd90_7.conda sha256: ba3b032fa52709ce0d9fd388f63d330a026754587a2f461117cac9ab73d8d0d8 md5: 461219d1a5bd61342293efa2c0c90eac @@ -4719,6 +5313,76 @@ packages: license_family: BSD size: 378610 timestamp: 1749898590652 +- conda: https://conda.anaconda.org/conda-forge/linux-64/qhull-2020.2-h434a139_5.conda + sha256: 776363493bad83308ba30bcb88c2552632581b143e8ee25b1982c8c743e73abc + md5: 353823361b1d27eb3960efb076dfcaf6 + depends: + - __glibc >=2.17,<3.0.a0 + - libgcc-ng >=12 + - libstdcxx-ng >=12 + license: LicenseRef-Qhull + size: 552937 + timestamp: 1720813982144 +- conda: https://conda.anaconda.org/conda-forge/linux-64/qt6-main-6.9.1-h0384650_1.conda + sha256: 820338eadfdac82e0ec208e41a7f02cc3a7adb8fc0dcf107a57b2a1cdec9f89e + md5: 3610aa92d2de36047886f30e99342f21 
+ depends: + - __glibc >=2.17,<3.0.a0 + - alsa-lib >=1.2.14,<1.3.0a0 + - dbus >=1.16.2,<2.0a0 + - double-conversion >=3.3.1,<3.4.0a0 + - fontconfig >=2.15.0,<3.0a0 + - fonts-conda-ecosystem + - harfbuzz >=11.0.1 + - icu >=75.1,<76.0a0 + - krb5 >=1.21.3,<1.22.0a0 + - libclang-cpp20.1 >=20.1.7,<20.2.0a0 + - libclang13 >=20.1.7 + - libcups >=2.3.3,<2.4.0a0 + - libdrm >=2.4.125,<2.5.0a0 + - libegl >=1.7.0,<2.0a0 + - libfreetype >=2.13.3 + - libfreetype6 >=2.13.3 + - libgcc >=13 + - libgl >=1.7.0,<2.0a0 + - libglib >=2.84.2,<3.0a0 + - libjpeg-turbo >=3.1.0,<4.0a0 + - libllvm20 >=20.1.7,<20.2.0a0 + - libpng >=1.6.49,<1.7.0a0 + - libpq >=17.5,<18.0a0 + - libsqlite >=3.50.1,<4.0a0 + - libstdcxx >=13 + - libtiff >=4.7.0,<4.8.0a0 + - libwebp-base >=1.5.0,<2.0a0 + - libxcb >=1.17.0,<2.0a0 + - libxkbcommon >=1.10.0,<2.0a0 + - libxml2 >=2.13.8,<2.14.0a0 + - libzlib >=1.3.1,<2.0a0 + - openssl >=3.5.0,<4.0a0 + - pcre2 >=10.45,<10.46.0a0 + - wayland >=1.23.1,<2.0a0 + - xcb-util >=0.4.1,<0.5.0a0 + - xcb-util-cursor >=0.1.5,<0.2.0a0 + - xcb-util-image >=0.4.0,<0.5.0a0 + - xcb-util-keysyms >=0.4.1,<0.5.0a0 + - xcb-util-renderutil >=0.3.10,<0.4.0a0 + - xcb-util-wm >=0.4.2,<0.5.0a0 + - xorg-libice >=1.1.2,<2.0a0 + - xorg-libsm >=1.2.6,<2.0a0 + - xorg-libx11 >=1.8.12,<2.0a0 + - xorg-libxcomposite >=0.4.6,<1.0a0 + - xorg-libxcursor >=1.2.3,<2.0a0 + - xorg-libxdamage >=1.1.6,<2.0a0 + - xorg-libxext >=1.3.6,<2.0a0 + - xorg-libxrandr >=1.5.4,<2.0a0 + - xorg-libxtst >=1.2.5,<2.0a0 + - xorg-libxxf86vm >=1.1.6,<2.0a0 + - zstd >=1.5.7,<1.6.0a0 + constrains: + - qt 6.9.1 + license: LGPL-3.0-only + size: 52006560 + timestamp: 1750920502800 - conda: https://conda.anaconda.org/conda-forge/linux-64/re2-2025.06.26-h9925aae_0.conda sha256: 7a0b82cb162229e905f500f18e32118ef581e1fd182036f3298510b8e8663134 md5: 2b4249747a9091608dbff2bd22afde44 @@ -5207,6 +5871,18 @@ packages: license: LicenseRef-Public-Domain size: 122968 timestamp: 1742727099393 +- conda: 
https://conda.anaconda.org/conda-forge/linux-64/unicodedata2-16.0.0-py312h66e93f0_0.conda + sha256: 638916105a836973593547ba5cf4891d1f2cb82d1cf14354fcef93fd5b941cdc + md5: 617f5d608ff8c28ad546e5d9671cbb95 + depends: + - __glibc >=2.17,<3.0.a0 + - libgcc >=13 + - python >=3.12,<3.13.0a0 + - python_abi 3.12.* *_cp312 + license: Apache-2.0 + license_family: Apache + size: 404401 + timestamp: 1736692621599 - conda: https://conda.anaconda.org/conda-forge/noarch/urllib3-2.5.0-pyhd8ed1ab_0.conda sha256: 4fb9789154bd666ca74e428d973df81087a697dbb987775bc3198d2215f240f8 md5: 436c165519e140cb08d246a4472a9d6a @@ -5276,6 +5952,19 @@ packages: license_family: MIT size: 420196 timestamp: 1750054006450 +- conda: https://conda.anaconda.org/conda-forge/linux-64/wayland-1.23.1-h3e06ad9_1.conda + sha256: 73d809ec8056c2f08e077f9d779d7f4e4c2b625881cad6af303c33dc1562ea01 + md5: a37843723437ba75f42c9270ffe800b1 + depends: + - __glibc >=2.17,<3.0.a0 + - libexpat >=2.7.0,<3.0a0 + - libffi >=3.4.6,<3.5.0a0 + - libgcc >=13 + - libstdcxx >=13 + license: MIT + license_family: MIT + size: 321099 + timestamp: 1745806602179 - conda: https://conda.anaconda.org/conda-forge/linux-64/websockets-15.0.1-py312h66e93f0_0.conda sha256: d55c82992553720a4c2f49d383ce8260a4ce1fa39df0125edb71f78ff2ee3682 md5: b986da7551224417af6b7da4021d8050 @@ -5300,6 +5989,72 @@ packages: license_family: BSD size: 63590 timestamp: 1736869574299 +- conda: https://conda.anaconda.org/conda-forge/linux-64/xcb-util-0.4.1-h4f16b4b_2.conda + sha256: ad8cab7e07e2af268449c2ce855cbb51f43f4664936eff679b1f3862e6e4b01d + md5: fdc27cb255a7a2cc73b7919a968b48f0 + depends: + - __glibc >=2.17,<3.0.a0 + - libgcc >=13 + - libxcb >=1.17.0,<2.0a0 + license: MIT + license_family: MIT + size: 20772 + timestamp: 1750436796633 +- conda: https://conda.anaconda.org/conda-forge/linux-64/xcb-util-cursor-0.1.5-hb9d3cd8_0.conda + sha256: c7b35db96f6e32a9e5346f97adc968ef2f33948e3d7084295baebc0e33abdd5b + md5: eb44b3b6deb1cab08d72cb61686fe64c + depends: + - 
__glibc >=2.17,<3.0.a0 + - libgcc >=13 + - libxcb >=1.13 + - libxcb >=1.16,<2.0.0a0 + - xcb-util-image >=0.4.0,<0.5.0a0 + - xcb-util-renderutil >=0.3.10,<0.4.0a0 + license: MIT + license_family: MIT + size: 20296 + timestamp: 1726125844850 +- conda: https://conda.anaconda.org/conda-forge/linux-64/xcb-util-image-0.4.0-hb711507_2.conda + sha256: 94b12ff8b30260d9de4fd7a28cca12e028e572cbc504fd42aa2646ec4a5bded7 + md5: a0901183f08b6c7107aab109733a3c91 + depends: + - libgcc-ng >=12 + - libxcb >=1.16,<2.0.0a0 + - xcb-util >=0.4.1,<0.5.0a0 + license: MIT + license_family: MIT + size: 24551 + timestamp: 1718880534789 +- conda: https://conda.anaconda.org/conda-forge/linux-64/xcb-util-keysyms-0.4.1-hb711507_0.conda + sha256: 546e3ee01e95a4c884b6401284bb22da449a2f4daf508d038fdfa0712fe4cc69 + md5: ad748ccca349aec3e91743e08b5e2b50 + depends: + - libgcc-ng >=12 + - libxcb >=1.16,<2.0.0a0 + license: MIT + license_family: MIT + size: 14314 + timestamp: 1718846569232 +- conda: https://conda.anaconda.org/conda-forge/linux-64/xcb-util-renderutil-0.3.10-hb711507_0.conda + sha256: 2d401dadc43855971ce008344a4b5bd804aca9487d8ebd83328592217daca3df + md5: 0e0cbe0564d03a99afd5fd7b362feecd + depends: + - libgcc-ng >=12 + - libxcb >=1.16,<2.0.0a0 + license: MIT + license_family: MIT + size: 16978 + timestamp: 1718848865819 +- conda: https://conda.anaconda.org/conda-forge/linux-64/xcb-util-wm-0.4.2-hb711507_0.conda + sha256: 31d44f297ad87a1e6510895740325a635dd204556aa7e079194a0034cdd7e66a + md5: 608e0ef8256b81d04456e8d211eee3e8 + depends: + - libgcc-ng >=12 + - libxcb >=1.16,<2.0.0a0 + license: MIT + license_family: MIT + size: 51689 + timestamp: 1718844051451 - conda: https://conda.anaconda.org/conda-forge/linux-64/xgrammar-0.1.19-py312he346f12_0.conda sha256: 8805e1c82c17b8721bd43f90eec1eda4b863315ac64a5bd2a10d74542410b8cd md5: 113121a0cbf37538bdf921d8e527cf63 @@ -5331,6 +6086,28 @@ packages: license_family: MIT size: 392406 timestamp: 1749375847832 +- conda: 
https://conda.anaconda.org/conda-forge/linux-64/xorg-libice-1.1.2-hb9d3cd8_0.conda + sha256: c12396aabb21244c212e488bbdc4abcdef0b7404b15761d9329f5a4a39113c4b + md5: fb901ff28063514abb6046c9ec2c4a45 + depends: + - __glibc >=2.17,<3.0.a0 + - libgcc >=13 + license: MIT + license_family: MIT + size: 58628 + timestamp: 1734227592886 +- conda: https://conda.anaconda.org/conda-forge/linux-64/xorg-libsm-1.2.6-he73a12e_0.conda + sha256: 277841c43a39f738927145930ff963c5ce4c4dacf66637a3d95d802a64173250 + md5: 1c74ff8c35dcadf952a16f752ca5aa49 + depends: + - __glibc >=2.17,<3.0.a0 + - libgcc >=13 + - libuuid >=2.38.1,<3.0a0 + - xorg-libice >=1.1.2,<2.0a0 + license: MIT + license_family: MIT + size: 27590 + timestamp: 1741896361728 - conda: https://conda.anaconda.org/conda-forge/linux-64/xorg-libx11-1.8.12-h4f16b4b_0.conda sha256: 51909270b1a6c5474ed3978628b341b4d4472cd22610e5f22b506855a5e20f67 md5: db038ce880f100acc74dba10302b5630 @@ -5352,6 +6129,44 @@ packages: license_family: MIT size: 14780 timestamp: 1734229004433 +- conda: https://conda.anaconda.org/conda-forge/linux-64/xorg-libxcomposite-0.4.6-hb9d3cd8_2.conda + sha256: 753f73e990c33366a91fd42cc17a3d19bb9444b9ca5ff983605fa9e953baf57f + md5: d3c295b50f092ab525ffe3c2aa4b7413 + depends: + - __glibc >=2.17,<3.0.a0 + - libgcc >=13 + - xorg-libx11 >=1.8.10,<2.0a0 + - xorg-libxfixes >=6.0.1,<7.0a0 + license: MIT + license_family: MIT + size: 13603 + timestamp: 1727884600744 +- conda: https://conda.anaconda.org/conda-forge/linux-64/xorg-libxcursor-1.2.3-hb9d3cd8_0.conda + sha256: 832f538ade441b1eee863c8c91af9e69b356cd3e9e1350fff4fe36cc573fc91a + md5: 2ccd714aa2242315acaf0a67faea780b + depends: + - __glibc >=2.17,<3.0.a0 + - libgcc >=13 + - xorg-libx11 >=1.8.10,<2.0a0 + - xorg-libxfixes >=6.0.1,<7.0a0 + - xorg-libxrender >=0.9.11,<0.10.0a0 + license: MIT + license_family: MIT + size: 32533 + timestamp: 1730908305254 +- conda: https://conda.anaconda.org/conda-forge/linux-64/xorg-libxdamage-1.1.6-hb9d3cd8_0.conda + sha256: 
43b9772fd6582bf401846642c4635c47a9b0e36ca08116b3ec3df36ab96e0ec0 + md5: b5fcc7172d22516e1f965490e65e33a4 + depends: + - __glibc >=2.17,<3.0.a0 + - libgcc >=13 + - xorg-libx11 >=1.8.10,<2.0a0 + - xorg-libxext >=1.3.6,<2.0a0 + - xorg-libxfixes >=6.0.1,<7.0a0 + license: MIT + license_family: MIT + size: 13217 + timestamp: 1727891438799 - conda: https://conda.anaconda.org/conda-forge/linux-64/xorg-libxdmcp-1.1.5-hb9d3cd8_0.conda sha256: 6b250f3e59db07c2514057944a3ea2044d6a8cdde8a47b6497c254520fade1ee md5: 8035c64cb77ed555e3f150b7b3972480 @@ -5362,6 +6177,90 @@ packages: license_family: MIT size: 19901 timestamp: 1727794976192 +- conda: https://conda.anaconda.org/conda-forge/linux-64/xorg-libxext-1.3.6-hb9d3cd8_0.conda + sha256: da5dc921c017c05f38a38bd75245017463104457b63a1ce633ed41f214159c14 + md5: febbab7d15033c913d53c7a2c102309d + depends: + - __glibc >=2.17,<3.0.a0 + - libgcc >=13 + - xorg-libx11 >=1.8.10,<2.0a0 + license: MIT + license_family: MIT + size: 50060 + timestamp: 1727752228921 +- conda: https://conda.anaconda.org/conda-forge/linux-64/xorg-libxfixes-6.0.1-hb9d3cd8_0.conda + sha256: 2fef37e660985794617716eb915865ce157004a4d567ed35ec16514960ae9271 + md5: 4bdb303603e9821baf5fe5fdff1dc8f8 + depends: + - __glibc >=2.17,<3.0.a0 + - libgcc >=13 + - xorg-libx11 >=1.8.10,<2.0a0 + license: MIT + license_family: MIT + size: 19575 + timestamp: 1727794961233 +- conda: https://conda.anaconda.org/conda-forge/linux-64/xorg-libxi-1.8.2-hb9d3cd8_0.conda + sha256: 1a724b47d98d7880f26da40e45f01728e7638e6ec69f35a3e11f92acd05f9e7a + md5: 17dcc85db3c7886650b8908b183d6876 + depends: + - __glibc >=2.17,<3.0.a0 + - libgcc >=13 + - xorg-libx11 >=1.8.10,<2.0a0 + - xorg-libxext >=1.3.6,<2.0a0 + - xorg-libxfixes >=6.0.1,<7.0a0 + license: MIT + license_family: MIT + size: 47179 + timestamp: 1727799254088 +- conda: https://conda.anaconda.org/conda-forge/linux-64/xorg-libxrandr-1.5.4-hb9d3cd8_0.conda + sha256: ac0f037e0791a620a69980914a77cb6bb40308e26db11698029d6708f5aa8e0d + md5: 
2de7f99d6581a4a7adbff607b5c278ca + depends: + - __glibc >=2.17,<3.0.a0 + - libgcc >=13 + - xorg-libx11 >=1.8.10,<2.0a0 + - xorg-libxext >=1.3.6,<2.0a0 + - xorg-libxrender >=0.9.11,<0.10.0a0 + license: MIT + license_family: MIT + size: 29599 + timestamp: 1727794874300 +- conda: https://conda.anaconda.org/conda-forge/linux-64/xorg-libxrender-0.9.12-hb9d3cd8_0.conda + sha256: 044c7b3153c224c6cedd4484dd91b389d2d7fd9c776ad0f4a34f099b3389f4a1 + md5: 96d57aba173e878a2089d5638016dc5e + depends: + - __glibc >=2.17,<3.0.a0 + - libgcc >=13 + - xorg-libx11 >=1.8.10,<2.0a0 + license: MIT + license_family: MIT + size: 33005 + timestamp: 1734229037766 +- conda: https://conda.anaconda.org/conda-forge/linux-64/xorg-libxtst-1.2.5-hb9d3cd8_3.conda + sha256: 752fdaac5d58ed863bbf685bb6f98092fe1a488ea8ebb7ed7b606ccfce08637a + md5: 7bbe9a0cc0df0ac5f5a8ad6d6a11af2f + depends: + - __glibc >=2.17,<3.0.a0 + - libgcc >=13 + - xorg-libx11 >=1.8.10,<2.0a0 + - xorg-libxext >=1.3.6,<2.0a0 + - xorg-libxi >=1.7.10,<2.0a0 + license: MIT + license_family: MIT + size: 32808 + timestamp: 1727964811275 +- conda: https://conda.anaconda.org/conda-forge/linux-64/xorg-libxxf86vm-1.1.6-hb9d3cd8_0.conda + sha256: 8a4e2ee642f884e6b78c20c0892b85dd9b2a6e64a6044e903297e616be6ca35b + md5: 5efa5fa6243a622445fdfd72aee15efa + depends: + - __glibc >=2.17,<3.0.a0 + - libgcc >=13 + - xorg-libx11 >=1.8.10,<2.0a0 + - xorg-libxext >=1.3.6,<2.0a0 + license: MIT + license_family: MIT + size: 17819 + timestamp: 1734214575628 - conda: https://conda.anaconda.org/conda-forge/linux-64/xxhash-0.8.3-hb47aa4a_0.conda sha256: 08e12f140b1af540a6de03dd49173c0e5ae4ebc563cabdd35ead0679835baf6f md5: 607e13a8caac17f9a664bcab5302ce06 diff --git a/pixi.toml b/pixi.toml index 8d44315..0c5386a 100644 --- a/pixi.toml +++ b/pixi.toml @@ -14,6 +14,9 @@ cuda-toolkit = "12.*" # for compute-sanitizer etc. 
[dependencies] modular = ">=25.5.0.dev2025062705,<26" max = ">=25.5.0.dev2025062405,<26" +python = ">=3.11,<3.13" +matplotlib = ">=3.10.3,<4" +pandas = ">=2.3.0,<3" [environments] cuda = { features = ["cuda"] } @@ -88,6 +91,7 @@ depends-on = ["install", "format_tests"] [tasks.bench] # Run all benchmarks cmd = "./build/all_benchmarks" +inputs = ["benchmarks/**/*.mojo"] depends-on = [ "install", "format_benchmarks", @@ -98,6 +102,11 @@ depends-on = [ ] }, ] +[tasks.plot] # Plot the results of the benchmarks +cmd = "python3 benchmarks/plot_results.py" +inputs = ["data/**/*.csv"] +depends-on = ["install", "bench"] + # # Benches # bench_decimal = "clear && pixi run package && cd benches/decimal && pixi run mojo -I ../ bench.mojo && cd ../.. && pixi run clean" diff --git a/src/base/gpu/qubits_operations.mojo b/src/base/gpu/qubits_operations.mojo index 2a980d6..2c5a3c2 100644 --- a/src/base/gpu/qubits_operations.mojo +++ b/src/base/gpu/qubits_operations.mojo @@ -29,299 +29,8 @@ alias gate_set_1qubit_vectorized_layout = Layout.row_major( ) -# fn qubit_wise_multiply_inplace_gpu[ -# number_control_bits: Int -# ]( -# gate_set_re: LayoutTensor[mut=False, dtype, gate_set_1qubit_layout], -# gate_set_im: LayoutTensor[mut=False, dtype, gate_set_1qubit_layout], -# gate_index: Int, -# gate_size: Int, -# target_qubit: Int, -# quantum_state_re: LayoutTensor[ -# mut=True, dtype, state_vector_3qubits_layout -# ], -# quantum_state_im: LayoutTensor[ -# mut=True, dtype, state_vector_3qubits_layout -# ], -# number_qubits: Int, -# quantum_state_size: Int, -# quantum_state_out_re: LayoutTensor[ -# mut=True, dtype, state_vector_3qubits_layout -# ], -# quantum_state_out_im: LayoutTensor[ -# mut=True, dtype, state_vector_3qubits_layout -# ], -# # control_bits: LayoutTensor[mut=True, DType.int32, control_bits_layout], -# control_bits_circuit: LayoutTensor[ -# mut=False, DType.int32, circuit_control_bits_layout -# ], -# current_control_gate_circuit: LayoutTensor[ -# mut=True, DType.int32, 
Layout.row_major(1) -# ], -# ) -> None: -# """Applies a quantum gate to specific qubits in the quantum state. - -# It will apply the gate starting from the target qubit assuming that the other -# qubits that the gate acts on are following the target qubit. - -# Parameters: -# number_control_bits: Number of control bits. - -# Args: -# gate_set_re: All unique gates applied in the circuit, real part. -# gate_set_im: All unique gates applied in the circuit, imaginary part. -# gate_index: Index of the gate in the gate set to apply. -# gate_size: Size of the gate (2^n, where n is the number of qubits the gate acts on). -# target_qubit: The index of the target qubit to apply the gate to. -# quantum_state_re: Real part of the quantum state vector. -# quantum_state_im: Imaginary part of the quantum state vector. -# number_qubits: Total number of qubits in the quantum state. -# quantum_state_size: Size of the quantum state vector (2^number_qubits). -# quantum_state_out_re: Output real part of the quantum state vector after applying the gate. -# quantum_state_out_im: Output imaginary part of the quantum state vector after applying the gate. -# control_bits_circuit: Control bits, where each control bit contains -# [wire_index, flag] (1 for control, 0 for anti-control). -# current_control_gate_circuit: Current control gate circuit index, -# used to track the position in the control_bits_circuit. -# """ -# print("Inside qubit_wise_multiply_gpu") -# target_qubits_count: Int = count_trailing_zeros(gate_size) -# if (target_qubit < 0) or (target_qubit >= number_qubits): -# print( -# "Error: target_qubit index out of bounds. 
Must be between 0 and", -# number_qubits - 1, -# ) -# print("Skipping gate application.") -# return - -# print("AAAAA") -# inclusion_mask: Int = 0 -# desired_value_mask: Int = 0 - -# @parameter -# for i in range(number_control_bits): -# print("before") -# wire_index, flag = ( -# control_bits_circuit[Int(current_control_gate_circuit[0]), i, 0], -# control_bits_circuit[Int(current_control_gate_circuit[0]), i, 1], -# ) -# current_control_gate_circuit[0] += 1 -# print("after") -# bit: Int = 1 << Int( -# wire_index -# ) # efficient way of computing 2^wire_index -# inclusion_mask |= bit # turn on the bit -# if flag == 1: -# desired_value_mask |= bit # turn on the bit - -# print("BBBBB") -# size_of_state_vector: Int = quantum_state_size -# size_of_half_block: Int = 1 << target_qubit # 2^target_qubit -# size_of_block: Int = size_of_half_block << target_qubits_count - -# print("CCCC") -# # copies all amplitudes from quantum_state to quantum_state_out -# for i in range(size_of_state_vector): -# quantum_state_out_re[i] = quantum_state_re[i] -# quantum_state_out_im[i] = quantum_state_im[i] - -# print("before loop") -# for block_start in range(0, size_of_state_vector, size_of_block): -# # print("block_start:", block_start) -# for offset in range(size_of_half_block): -# # print("offset:", offset) -# i1: Int = ( -# block_start | offset -# ) # faster than, but equivalent to, block_start + offset - -# if (i1 & inclusion_mask) != desired_value_mask: -# continue # skip this iteration if the control bits do not match - -# i2: Int = ( -# i1 | size_of_half_block -# ) # equivalent to i1 + size_of_half_block - -# print("i1:", i1, "i2:", i2) - -# # new_state_vector[i1] = ( -# # gate[0, 0] * quantum_state[i1] + gate[0, 1] * quantum_state[i2] -# # ) -# # new_state_vector[i2] = ( -# # gate[1, 0] * quantum_state[i1] + gate[1, 1] * quantum_state[i2] -# # ) - -# quantum_state_out_re[i1] = ( -# (gate_set_re[gate_index, 0, 0] * quantum_state_re[i1]) -# - (gate_set_im[gate_index, 0, 0] * 
quantum_state_im[i1]) -# + (gate_set_re[gate_index, 0, 1] * quantum_state_re[i2]) -# - (gate_set_im[gate_index, 0, 1] * quantum_state_im[i2]) -# ) - -# quantum_state_out_im[i1] = ( -# (gate_set_re[gate_index, 0, 0] * quantum_state_im[i1]) -# - (gate_set_im[gate_index, 0, 0] * quantum_state_re[i1]) -# + (gate_set_re[gate_index, 0, 1] * quantum_state_im[i2]) -# - (gate_set_im[gate_index, 0, 1] * quantum_state_re[i2]) -# ) - -# quantum_state_out_re[i2] = ( -# (gate_set_re[gate_index, 1, 0] * quantum_state_re[i1]) -# - (gate_set_im[gate_index, 1, 0] * quantum_state_im[i1]) -# + (gate_set_re[gate_index, 1, 1] * quantum_state_re[i2]) -# - (gate_set_im[gate_index, 1, 1] * quantum_state_im[i2]) -# ) - -# quantum_state_out_im[i2] = ( -# (gate_set_re[gate_index, 1, 0] * quantum_state_im[i1]) -# - (gate_set_im[gate_index, 1, 0] * quantum_state_re[i1]) -# + (gate_set_re[gate_index, 1, 1] * quantum_state_im[i2]) -# - (gate_set_im[gate_index, 1, 1] * quantum_state_re[i2]) -# ) - - -# NOTE: Works with -# alias BLOCKS_PER_GRID = 1 -# alias THREADS_PER_BLOCK = (1, 1) -# fn qubit_wise_multiply_inplace_gpu[ -# number_control_bits: Int -# ]( -# gate_set_re: LayoutTensor[mut=False, dtype, gate_set_1qubit_layout], -# gate_set_im: LayoutTensor[mut=False, dtype, gate_set_1qubit_layout], -# gate_index: Int, -# gate_size: Int, -# target_qubit: Int, -# quantum_state_re: LayoutTensor[ -# mut=True, dtype, state_vector_3qubits_layout -# ], -# quantum_state_im: LayoutTensor[ -# mut=True, dtype, state_vector_3qubits_layout -# ], -# number_qubits: Int, -# quantum_state_size: Int, -# quantum_state_out_re: LayoutTensor[ -# mut=True, dtype, state_vector_3qubits_layout -# ], -# quantum_state_out_im: LayoutTensor[ -# mut=True, dtype, state_vector_3qubits_layout -# ], -# # control_bits: LayoutTensor[mut=True, DType.int32, control_bits_layout], -# control_bits_circuit: LayoutTensor[ -# mut=False, DType.int32, circuit_control_bits_layout -# ], -# current_control_gate_circuit: LayoutTensor[ -# mut=True, 
DType.int32, Layout.row_major(1) -# ], -# ) -> None: -# """Applies a quantum gate to specific qubits in the quantum state. - -# It will apply the gate starting from the target qubit assuming that the other -# qubits that the gate acts on are following the target qubit. - -# Parameters: -# number_control_bits: Number of control bits. - -# Args: -# gate_set_re: All unique gates applied in the circuit, real part. -# gate_set_im: All unique gates applied in the circuit, imaginary part. -# gate_index: Index of the gate in the gate set to apply. -# gate_size: Size of the gate (2^n, where n is the number of qubits the gate acts on). -# target_qubit: The index of the target qubit to apply the gate to. -# quantum_state_re: Real part of the quantum state vector. -# quantum_state_im: Imaginary part of the quantum state vector. -# number_qubits: Total number of qubits in the quantum state. -# quantum_state_size: Size of the quantum state vector (2^number_qubits). -# quantum_state_out_re: Output real part of the quantum state vector after applying the gate. -# quantum_state_out_im: Output imaginary part of the quantum state vector after applying the gate. -# control_bits_circuit: Control bits, where each control bit contains -# [wire_index, flag] (1 for control, 0 for anti-control). -# current_control_gate_circuit: Current control gate circuit index, -# used to track the position in the control_bits_circuit. -# """ -# target_qubits_count: Int = count_trailing_zeros(gate_size) -# if (target_qubit < 0) or (target_qubit >= number_qubits): -# print( -# "Error: target_qubit index out of bounds. 
Must be between 0 and", -# number_qubits - 1, -# "(Skipping gate application)", -# ) -# return - -# # global_i = block_dim.x * block_idx.x + thread_idx.x -# global_i = global_idx.x -# local_i = thread_idx.x - -# inclusion_mask: Int = 0 -# desired_value_mask: Int = 0 - -# @parameter -# for i in range(number_control_bits): -# wire_index, flag = ( -# control_bits_circuit[Int(current_control_gate_circuit[0]), i, 0], -# control_bits_circuit[Int(current_control_gate_circuit[0]), i, 1], -# ) -# current_control_gate_circuit[0] += 1 -# bit: Int = 1 << Int( -# wire_index -# ) # efficient way of computing 2^wire_index -# inclusion_mask |= bit # turn on the bit -# if flag == 1: -# desired_value_mask |= bit # turn on the bit - -# size_of_state_vector: Int = quantum_state_size -# size_of_half_block: Int = 1 << target_qubit # 2^target_qubit -# size_of_block: Int = size_of_half_block << target_qubits_count - -# # copies all amplitudes from quantum_state to quantum_state_out -# for i in range(size_of_state_vector): -# quantum_state_out_re[i] = quantum_state_re[i] -# quantum_state_out_im[i] = quantum_state_im[i] - -# for block_start in range(0, size_of_state_vector, size_of_block): -# # print("block_start:", block_start) -# for offset in range(size_of_half_block): -# # print("offset:", offset) -# i1: Int = ( -# block_start | offset -# ) # faster than, but equivalent to, block_start + offset - -# if (i1 & inclusion_mask) != desired_value_mask: -# continue # skip this iteration if the control bits do not match - -# i2: Int = ( -# i1 | size_of_half_block -# ) # equivalent to i1 + size_of_half_block - -# quantum_state_out_re[i1] = ( -# (gate_set_re[gate_index, 0, 0] * quantum_state_re[i1]) -# - (gate_set_im[gate_index, 0, 0] * quantum_state_im[i1]) -# + (gate_set_re[gate_index, 0, 1] * quantum_state_re[i2]) -# - (gate_set_im[gate_index, 0, 1] * quantum_state_im[i2]) -# ) - -# quantum_state_out_im[i1] = ( -# (gate_set_re[gate_index, 0, 0] * quantum_state_im[i1]) -# + 
(gate_set_im[gate_index, 0, 0] * quantum_state_re[i1]) -# + (gate_set_re[gate_index, 0, 1] * quantum_state_im[i2]) -# + (gate_set_im[gate_index, 0, 1] * quantum_state_re[i2]) -# ) - -# quantum_state_out_re[i2] = ( -# (gate_set_re[gate_index, 1, 0] * quantum_state_re[i1]) -# - (gate_set_im[gate_index, 1, 0] * quantum_state_im[i1]) -# + (gate_set_re[gate_index, 1, 1] * quantum_state_re[i2]) -# - (gate_set_im[gate_index, 1, 1] * quantum_state_im[i2]) -# ) - -# quantum_state_out_im[i2] = ( -# (gate_set_re[gate_index, 1, 0] * quantum_state_im[i1]) -# + (gate_set_im[gate_index, 1, 0] * quantum_state_re[i1]) -# + (gate_set_re[gate_index, 1, 1] * quantum_state_im[i2]) -# + (gate_set_im[gate_index, 1, 1] * quantum_state_re[i2]) -# ) - - fn qubit_wise_multiply_inplace_gpu[ - number_control_bits: Int + number_control_bits: Int, use_one_thread: Bool = False ]( gate_set_re: LayoutTensor[mut=False, dtype, gate_set_1qubit_layout], gate_set_im: LayoutTensor[mut=False, dtype, gate_set_1qubit_layout], @@ -356,6 +65,8 @@ fn qubit_wise_multiply_inplace_gpu[ Parameters: number_control_bits: Number of control bits. + use_one_thread: If True, only the first thread will perform the operation. + If False, all threads will participate in the operation. Args: gate_set_re: All unique gates applied in the circuit, real part. @@ -374,7 +85,24 @@ fn qubit_wise_multiply_inplace_gpu[ current_control_gate_circuit: Current control gate circuit index, used to track the position in the control_bits_circuit. """ - # target_qubits_count: Int = count_trailing_zeros(gate_size) + # global_i = block_dim.x * block_idx.x + thread_idx.x + global_i = global_idx.x + # local_i = thread_idx.x + + size_of_half_block: Int = 1 << target_qubit # 2^target_qubit + + @parameter + if use_one_thread: + if global_i > 0: + return # Skip this thread if it is not the first one + else: + if global_i < quantum_state_size: + # Only threads whose index has a '0' at the target_qubit position will do the work. 
+ # These are the 'i1' indices. + is_i1_thread = (global_i & size_of_half_block) == 0 + if not is_i1_thread: + return # Skip this thread if it is not an 'i1' thread + if (target_qubit < 0) or (target_qubit >= number_qubits): print( "Error: target_qubit index out of bounds. Must be between 0 and", @@ -383,16 +111,10 @@ fn qubit_wise_multiply_inplace_gpu[ ) return - # global_i = block_dim.x * block_idx.x + thread_idx.x - global_i = global_idx.x - # local_i = thread_idx.x - - # print("global_i:", global_i, "local_i:", local_i) - inclusion_mask: Int = 0 desired_value_mask: Int = 0 - # CPU implementation + # CPU like implementation @parameter for control_qubit in range(number_control_bits): wire_index, flag = ( @@ -430,85 +152,73 @@ fn qubit_wise_multiply_inplace_gpu[ # desired_value_mask |= bit # turn on the bit size_of_state_vector: Int = quantum_state_size - size_of_half_block: Int = 1 << target_qubit # 2^target_qubit # copies all amplitudes from quantum_state to quantum_state_out + @parameter + if use_one_thread: + # CPU like implementation + for i in range(size_of_state_vector): + quantum_state_out_re[i] = quantum_state_re[i] + quantum_state_out_im[i] = quantum_state_im[i] + + target_qubits_count: Int = count_trailing_zeros(gate_size) + size_of_block: Int = size_of_half_block << target_qubits_count + for block_start in range(0, size_of_state_vector, size_of_block): + for offset in range(size_of_half_block): + i1: Int = ( + block_start | offset + ) # faster than, but equivalent to, block_start + offset + + if (i1 & inclusion_mask) != desired_value_mask: + continue # skip this iteration if the control bits do not match + + i2: Int = ( + i1 | size_of_half_block + ) # equivalent to i1 + size_of_half_block - # # CPU implementation - # for i in range(size_of_state_vector): - # quantum_state_out_re[i] = quantum_state_re[i] - # quantum_state_out_im[i] = quantum_state_im[i] - - # GPU implementation - # Parallel copy of the entire state vector - if global_i < 
size_of_state_vector: - quantum_state_out_re[global_i] = quantum_state_re[global_i] - quantum_state_out_im[global_i] = quantum_state_im[global_i] - - # Synchronize all threads to ensure the copy is complete before proceeding. - barrier() - - # if global_i > 0: - # return # Only the first thread in the block will execute the function - - # size_of_block: Int = size_of_half_block << target_qubits_count - # for block_start in range(0, size_of_state_vector, size_of_block): - # # print("block_start:", block_start) - # for offset in range(size_of_half_block): - # # print("offset:", offset) - # i1: Int = ( - # block_start | offset - # ) # faster than, but equivalent to, block_start + offset - - # if (i1 & inclusion_mask) != desired_value_mask: - # continue # skip this iteration if the control bits do not match - - # i2: Int = ( - # i1 | size_of_half_block - # ) # equivalent to i1 + size_of_half_block - - # quantum_state_out_re[i1] = ( - # (gate_set_re[gate_index, 0, 0] * quantum_state_re[i1]) - # - (gate_set_im[gate_index, 0, 0] * quantum_state_im[i1]) - # + (gate_set_re[gate_index, 0, 1] * quantum_state_re[i2]) - # - (gate_set_im[gate_index, 0, 1] * quantum_state_im[i2]) - # ) - - # quantum_state_out_im[i1] = ( - # (gate_set_re[gate_index, 0, 0] * quantum_state_im[i1]) - # + (gate_set_im[gate_index, 0, 0] * quantum_state_re[i1]) - # + (gate_set_re[gate_index, 0, 1] * quantum_state_im[i2]) - # + (gate_set_im[gate_index, 0, 1] * quantum_state_re[i2]) - # ) - - # quantum_state_out_re[i2] = ( - # (gate_set_re[gate_index, 1, 0] * quantum_state_re[i1]) - # - (gate_set_im[gate_index, 1, 0] * quantum_state_im[i1]) - # + (gate_set_re[gate_index, 1, 1] * quantum_state_re[i2]) - # - (gate_set_im[gate_index, 1, 1] * quantum_state_im[i2]) - # ) - - # quantum_state_out_im[i2] = ( - # (gate_set_re[gate_index, 1, 0] * quantum_state_im[i1]) - # + (gate_set_im[gate_index, 1, 0] * quantum_state_re[i1]) - # + (gate_set_re[gate_index, 1, 1] * quantum_state_im[i2]) - # + 
(gate_set_im[gate_index, 1, 1] * quantum_state_re[i2]) - # ) + quantum_state_out_re[i1] = ( + (gate_set_re[gate_index, 0, 0] * quantum_state_re[i1]) + - (gate_set_im[gate_index, 0, 0] * quantum_state_im[i1]) + + (gate_set_re[gate_index, 0, 1] * quantum_state_re[i2]) + - (gate_set_im[gate_index, 0, 1] * quantum_state_im[i2]) + ) - # Parallel Gate Application - # Constants used by all threads - # target_qubits_count: Int = count_trailing_zeros(gate_size) - # size_of_half_block: Int = 1 << target_qubit + quantum_state_out_im[i1] = ( + (gate_set_re[gate_index, 0, 0] * quantum_state_im[i1]) + + (gate_set_im[gate_index, 0, 0] * quantum_state_re[i1]) + + (gate_set_re[gate_index, 0, 1] * quantum_state_im[i2]) + + (gate_set_im[gate_index, 0, 1] * quantum_state_re[i2]) + ) - # Each thread works on one index `global_i`. - # We only need to proceed if the thread is within the state vector bounds. - if global_i < quantum_state_size: - # The core parallelization pattern: - # Only threads whose index has a '0' at the target_qubit position will do the work. - # These are the 'i1' indices. 
- is_i1_thread = (global_i & size_of_half_block) == 0 + quantum_state_out_re[i2] = ( + (gate_set_re[gate_index, 1, 0] * quantum_state_re[i1]) + - (gate_set_im[gate_index, 1, 0] * quantum_state_im[i1]) + + (gate_set_re[gate_index, 1, 1] * quantum_state_re[i2]) + - (gate_set_im[gate_index, 1, 1] * quantum_state_im[i2]) + ) - if is_i1_thread: + quantum_state_out_im[i2] = ( + (gate_set_re[gate_index, 1, 0] * quantum_state_im[i1]) + + (gate_set_im[gate_index, 1, 0] * quantum_state_re[i1]) + + (gate_set_re[gate_index, 1, 1] * quantum_state_im[i2]) + + (gate_set_im[gate_index, 1, 1] * quantum_state_re[i2]) + ) + else: + # GPU implementation + # Parallel copy of the entire state vector + if global_i < size_of_state_vector: + quantum_state_out_re[global_i] = quantum_state_re[global_i] + quantum_state_out_im[global_i] = quantum_state_im[global_i] + + # Synchronize all threads to ensure the copy is complete before proceeding. + barrier() + + # Each thread works on one index `global_i`. + # We only need to proceed if the thread is within the state vector bounds. + if global_i < quantum_state_size: + # The core parallelization pattern: + # Only threads whose index has a '0' at the target_qubit position will do the work. + # We already know that these are the 'i1' indices. # This thread is responsible for an `i1` index. i1: Int = global_i @@ -563,6 +273,68 @@ fn qubit_wise_multiply_inplace_gpu[ # If control bits do not match, we do nothing. The values already # copied to quantum_state_out are correct. + # Parallel Gate Application + # target_qubits_count: Int = count_trailing_zeros(gate_size) + # size_of_half_block: Int = 1 << target_qubit + + # # Total number of pairs to calculate + # num_pairs = quantum_state_size // 2 + + # if global_i < quantum_state_size: + # # 1. Identify group and role + # group_id = global_i // 4 + # local_id_in_group = global_i % 4 + + # # Only proceed if we are part of a valid group for a pair + # if group_id < num_pairs: + # # 2. 
Map group_id to state vector indices i1, i2 + # block_id = group_id // size_of_half_block + # offset_in_block = group_id % size_of_half_block + # i1: Int = (block_id * size_of_block) + offset_in_block + + # if (i1 & inclusion_mask) == desired_value_mask: + # i2: Int = i1 | size_of_half_block + + # # Fetch state vector values for the pair (ψ1, ψ2) + # psi1_re = quantum_state_re[i1] + # psi1_im = quantum_state_im[i1] + # psi2_re = quantum_state_re[i2] + # psi2_im = quantum_state_im[i2] + + # # 4. Divide the calculation + # if local_id_in_group == 0: # quantum_state_out_re[i1] + # g00_re = gate_set_re[gate_index, 0, 0] + # g00_im = gate_set_im[gate_index, 0, 0] + # g01_re = gate_set_re[gate_index, 0, 1] + # g01_im = gate_set_im[gate_index, 0, 1] + # quantum_state_out_re[i1] = ( + # g00_re * psi1_re - g00_im * psi1_im + # ) + (g01_re * psi2_re - g01_im * psi2_im) + # elif local_id_in_group == 1: # quantum_state_out_im[i1] + # g00_re = gate_set_re[gate_index, 0, 0] + # g00_im = gate_set_im[gate_index, 0, 0] + # g01_re = gate_set_re[gate_index, 0, 1] + # g01_im = gate_set_im[gate_index, 0, 1] + # quantum_state_out_im[i1] = ( + # g00_re * psi1_im + g00_im * psi1_re + # ) + (g01_re * psi2_im + g01_im * psi2_re) + # elif local_id_in_group == 2: # quantum_state_out_re[i2] + # g10_re = gate_set_re[gate_index, 1, 0] + # g10_im = gate_set_im[gate_index, 1, 0] + # g11_re = gate_set_re[gate_index, 1, 1] + # g11_im = gate_set_im[gate_index, 1, 1] + # quantum_state_out_re[i2] = ( + # g10_re * psi1_re - g10_im * psi1_im + # ) + (g11_re * psi2_re - g11_im * psi2_im) + # elif local_id_in_group == 3: # quantum_state_out_im[i2] + # g10_re = gate_set_re[gate_index, 1, 0] + # g10_im = gate_set_im[gate_index, 1, 0] + # g11_re = gate_set_re[gate_index, 1, 1] + # g11_im = gate_set_im[gate_index, 1, 1] + # quantum_state_out_im[i2] = ( + # g10_re * psi1_im + g10_im * psi1_re + # ) + (g11_re * psi2_im + g11_im * psi2_re) + # # TODO one day, but maybe it will become memory bound if we do that 
since we have to create # # intermediary values for complex multiplications diff --git a/tests/base/test_gpu_qubits_operations.mojo b/tests/base/test_gpu_qubits_operations.mojo new file mode 100644 index 0000000..eb60bd8 --- /dev/null +++ b/tests/base/test_gpu_qubits_operations.mojo @@ -0,0 +1,635 @@ +from testing import ( + assert_true, + assert_false, + assert_equal, + assert_not_equal, + assert_almost_equal, +) +from testing_matrix import assert_matrix_almost_equal +from testing_state_vector import ( + assert_state_vector_almost_equal, + test_qubit_wise_multiply_0_reference, + test_qubit_wise_multiply_figure1_reference, +) + +from sys import has_accelerator +from gpu import thread_idx, block_dim, block_idx +from gpu.host import DeviceContext +from layout import Layout, LayoutTensor, IntTuple, print_layout + +from bit import count_trailing_zeros + +from qlabs.base import ( + StateVector, + Gate, + Hadamard, + PauliX, + PauliY, + PauliZ, +) + +from qlabs.base.gpu import qubit_wise_multiply_inplace_gpu + +from qlabs.local_stdlib.complex import ComplexFloat32 + +alias dtype = DType.float32 + +alias GATE_SIZE = 2 +alias NUMBER_CONTROL_BITS = 1 + + +def test_qubit_wise_multiply_0(): + """Simulate a small circuit""" + + @parameter + if not has_accelerator(): + print("No compatible GPU found") + return + + gate_set: List[Gate] = [Hadamard, PauliX] + alias gate_set_dic: Dict[String, Int] = { + Hadamard.symbol: 0, + PauliX.symbol: 1, + } + alias gate_set_size = 2 + alias gate_set_1qubit_layout = Layout.row_major( + gate_set_size, GATE_SIZE, GATE_SIZE + ) + + alias CIRCUIT_NUMBER_CONTROL_GATES = 1 + alias circuit_control_bits_layout = Layout.row_major( + CIRCUIT_NUMBER_CONTROL_GATES, NUMBER_CONTROL_BITS, 2 + ) + + alias num_qubits = 3 + alias state_vector_size = 1 << num_qubits + alias state_vector_layout = Layout.row_major(state_vector_size) + + alias total_threads = state_vector_size + + alias blocks_per_grid = 1 + alias threads_per_block = ( + total_threads, + 1, + 1, + 
) + + ctx = DeviceContext() + + host_quantum_state_re = ctx.enqueue_create_host_buffer[dtype]( + state_vector_size + ) + host_quantum_state_im = ctx.enqueue_create_host_buffer[dtype]( + state_vector_size + ) + + host_gate_set_re = ctx.enqueue_create_host_buffer[dtype]( + gate_set_size * GATE_SIZE * GATE_SIZE + ) + host_gate_set_im = ctx.enqueue_create_host_buffer[dtype]( + gate_set_size * GATE_SIZE * GATE_SIZE + ) + + host_control_bits_circuit = ctx.enqueue_create_host_buffer[DType.int32]( + CIRCUIT_NUMBER_CONTROL_GATES * NUMBER_CONTROL_BITS * 2 + ) + + quantum_state: StateVector = StateVector.from_bitstring("000") + var control_bits_list: List[List[List[Int]]] = [ + [[0, 1]], + ] + + ctx.synchronize() + + # -- Fill host buffers -- # + + for i in range(state_vector_size): + host_quantum_state_re[i] = quantum_state[i].re + host_quantum_state_im[i] = quantum_state[i].im + + for i in range(gate_set_size): + gate = gate_set[i] + for j in range(GATE_SIZE): + for k in range(GATE_SIZE): + index = gate_set_1qubit_layout( + IntTuple(i, j, k) + ) # Get the index in the 1D buffer + host_gate_set_re[index] = gate[j, k].re + host_gate_set_im[index] = gate[j, k].im + + for i in range(CIRCUIT_NUMBER_CONTROL_GATES): + for j in range(NUMBER_CONTROL_BITS): + for k in range(2): + index = circuit_control_bits_layout(IntTuple(i, j, k)) + host_control_bits_circuit[index] = control_bits_list[i][j][k] + + # -- Copy host buffers to device buffers -- # + quantum_state_re = ctx.enqueue_create_buffer[dtype](state_vector_size) + quantum_state_im = ctx.enqueue_create_buffer[dtype](state_vector_size) + + gate_set_re = ctx.enqueue_create_buffer[dtype]( + gate_set_size * GATE_SIZE * GATE_SIZE + ) + gate_set_im = ctx.enqueue_create_buffer[dtype]( + gate_set_size * GATE_SIZE * GATE_SIZE + ) + + control_bits_circuit = ctx.enqueue_create_buffer[DType.int32]( + CIRCUIT_NUMBER_CONTROL_GATES * NUMBER_CONTROL_BITS * 2 + ) + current_control_gate_circuit = ctx.enqueue_create_buffer[DType.int32](1) + + # 
Create other buffers for functions + + quantum_state_out_re = ctx.enqueue_create_buffer[dtype](state_vector_size) + quantum_state_out_im = ctx.enqueue_create_buffer[dtype](state_vector_size) + + quantum_state_re.enqueue_copy_from(host_quantum_state_re) + quantum_state_im.enqueue_copy_from(host_quantum_state_im) + + gate_set_re.enqueue_copy_from(host_gate_set_re) + gate_set_im.enqueue_copy_from(host_gate_set_im) + + control_bits_circuit.enqueue_copy_from(host_control_bits_circuit) + + ctx.enqueue_memset(current_control_gate_circuit, 0) + ctx.enqueue_memset(quantum_state_out_re, 0.0) + ctx.enqueue_memset(quantum_state_out_im, 0.0) + + # -- Create layout tensors for GPU operations -- # + gate_set_re_tensor = LayoutTensor[mut=False, dtype, gate_set_1qubit_layout]( + gate_set_re.unsafe_ptr() + ) + gate_set_im_tensor = LayoutTensor[mut=False, dtype, gate_set_1qubit_layout]( + gate_set_im.unsafe_ptr() + ) + + quantum_state_re_tensor = LayoutTensor[ + mut=True, dtype, state_vector_layout + ](quantum_state_re.unsafe_ptr()) + quantum_state_im_tensor = LayoutTensor[ + mut=True, dtype, state_vector_layout + ](quantum_state_im.unsafe_ptr()) + + quantum_state_out_re_tensor = LayoutTensor[ + mut=True, dtype, state_vector_layout + ](quantum_state_out_re.unsafe_ptr()) + quantum_state_out_im_tensor = LayoutTensor[ + mut=True, dtype, state_vector_layout + ](quantum_state_out_im.unsafe_ptr()) + + control_bits_circuit_tensor = LayoutTensor[ + mut=False, DType.int32, circuit_control_bits_layout + ](control_bits_circuit.unsafe_ptr()) + current_control_gate_circuit_tensor = LayoutTensor[ + mut=True, DType.int32, Layout.row_major(1) + ](current_control_gate_circuit.unsafe_ptr()) + + # quantum_state = qubit_wise_multiply(Hadamard.matrix, 0, quantum_state) + ctx.enqueue_function[ + qubit_wise_multiply_inplace_gpu[number_control_bits=0] + ]( + gate_set_re_tensor, + gate_set_im_tensor, + gate_set_dic[Hadamard.symbol], + GATE_SIZE, + 0, # target_qubit + quantum_state_re_tensor, + 
quantum_state_im_tensor, + num_qubits, # number_qubits + state_vector_size, # quantum_state_size + quantum_state_out_re_tensor, + quantum_state_out_im_tensor, + control_bits_circuit_tensor, + current_control_gate_circuit_tensor, + grid_dim=blocks_per_grid, + block_dim=threads_per_block, + ) + + # quantum_state = qubit_wise_multiply( + # PauliX.matrix, 1, quantum_state, [[0, 1]] + # ) + ctx.enqueue_function[ + qubit_wise_multiply_inplace_gpu[number_control_bits=1] + ]( + gate_set_re_tensor, + gate_set_im_tensor, + gate_set_dic[PauliX.symbol], + GATE_SIZE, + 1, # target_qubit + quantum_state_out_re_tensor, + quantum_state_out_im_tensor, + num_qubits, # number_qubits + state_vector_size, # quantum_state_size + quantum_state_re_tensor, + quantum_state_im_tensor, + control_bits_circuit_tensor, + current_control_gate_circuit_tensor, + grid_dim=blocks_per_grid, + block_dim=threads_per_block, + ) + + # quantum_state = qubit_wise_multiply(Hadamard.matrix, 2, quantum_state) + ctx.enqueue_function[ + qubit_wise_multiply_inplace_gpu[number_control_bits=0] + ]( + gate_set_re_tensor, + gate_set_im_tensor, + gate_set_dic[Hadamard.symbol], + GATE_SIZE, + 2, # target_qubit + quantum_state_re_tensor, + quantum_state_im_tensor, + num_qubits, # number_qubits + state_vector_size, # quantum_state_size + quantum_state_out_re_tensor, + quantum_state_out_im_tensor, + control_bits_circuit_tensor, + current_control_gate_circuit_tensor, + grid_dim=blocks_per_grid, + block_dim=threads_per_block, + ) + + with quantum_state_out_re.map_to_host() as host_re, quantum_state_out_im.map_to_host() as host_im: + print( + ( + "After Pauli-X gate on qubit 2 with control on qubit 1" + " (Final State):\nreal part:\n" + ), + host_re, + "\nimaginary part:\n", + host_im, + ) + for i in range(state_vector_size): + quantum_state[i] = ComplexFloat32(host_re[i], host_im[i]) + + assert_state_vector_almost_equal( + quantum_state, test_qubit_wise_multiply_0_reference + ) + + +def test_qubit_wise_multiply_figure1(): + 
"""Simulates a circuit of arbitrary number of qubits""" + + @parameter + if not has_accelerator(): + print("No compatible GPU found") + return + + alias num_qubits = 3 + + alias circuit_number_control_gates = 2 + alias circuit_control_bits_layout = Layout.row_major( + circuit_number_control_gates, NUMBER_CONTROL_BITS, 2 + ) + + gate_set: List[Gate] = [Hadamard, PauliX, PauliZ] + gate_set_dic: Dict[String, Int] = { + Hadamard.symbol: 0, + PauliX.symbol: 1, + PauliZ.symbol: 2, + } + alias gate_set_size = 3 + alias gate_set_1qubit_layout = Layout.row_major( + gate_set_size, GATE_SIZE, GATE_SIZE + ) + + alias state_vector_size = 1 << num_qubits + alias state_vector_layout = Layout.row_major(state_vector_size) + + alias total_threads = state_vector_size + + alias max_threads_per_block = 1024 # Maximum threads per block in CUDA + alias blocks_per_grid = ( + total_threads + max_threads_per_block - 1 + ) // max_threads_per_block + + alias threads_per_block = ( + max_threads_per_block, + 1, + 1, + ) + + @parameter + if total_threads < max_threads_per_block: + alias threads_per_block = ( + total_threads, + 1, + 1, + ) # 1D block of threads + + print("state_vector_size:", state_vector_size) + print("blocks_per_grid:", blocks_per_grid) + print("threads_per_block[0]:", threads_per_block[0]) + + var control_bits_list: List[List[List[Int]]] = [ + [[1, 1]], # Control on qubit 1 and is control because flag=1 + [[1, 1]], # Control on qubit 1 and is control because flag=1 + ] + + ctx = DeviceContext() + print("Using GPU:", ctx.name()) + print("ctx.device_info:", ctx.device_info) + print( + "ctx.device_info.max_thread_block_size:", + ctx.device_info.max_thread_block_size, + ) + print( + "ctx.device_info.max_blocks_per_multiprocessor:", + ctx.device_info.max_blocks_per_multiprocessor, + ) + try: + (free, total) = ctx.get_memory_info() + print("Free memory:", free / (1024 * 1024), "MB") + print("Total memory:", total / (1024 * 1024), "MB") + except: + print("Failed to get memory 
information") + + # -- Create GPU variables -- # + # These don't need to be initialized to zero, they will be filled later + + host_quantum_state_re = ctx.enqueue_create_host_buffer[dtype]( + state_vector_size + ) + host_quantum_state_im = ctx.enqueue_create_host_buffer[dtype]( + state_vector_size + ) + + host_gate_set_re = ctx.enqueue_create_host_buffer[dtype]( + gate_set_size * GATE_SIZE * GATE_SIZE + ) + host_gate_set_im = ctx.enqueue_create_host_buffer[dtype]( + gate_set_size * GATE_SIZE * GATE_SIZE + ) + + host_control_bits_circuit = ctx.enqueue_create_host_buffer[DType.int32]( + circuit_number_control_gates * NUMBER_CONTROL_BITS * 2 + ) + + # -- Initialize the quantum circuit to the |000⟩ state -- # + quantum_state: StateVector = StateVector.from_bitstring("0" * num_qubits) + # print("Initial quantum state:\n", quantum_state) + + # Wait for host buffers to be ready + ctx.synchronize() + + # -- Fill host buffers -- # + + for i in range(state_vector_size): + host_quantum_state_re[i] = quantum_state[i].re + host_quantum_state_im[i] = quantum_state[i].im + + print("Initial state real part:", host_quantum_state_re) + print("Initial state imaginary part:", host_quantum_state_im) + + for i in range(gate_set_size): + gate = gate_set[i] + for j in range(GATE_SIZE): + for k in range(GATE_SIZE): + index = gate_set_1qubit_layout( + IntTuple(i, j, k) + ) # Get the index in the 1D buffer + host_gate_set_re[index] = gate[j, k].re + host_gate_set_im[index] = gate[j, k].im + + for i in range(circuit_number_control_gates): + for j in range(NUMBER_CONTROL_BITS): + for k in range(2): + index = circuit_control_bits_layout(IntTuple(i, j, k)) + host_control_bits_circuit[index] = control_bits_list[i][j][k] + + # -- Copy host buffers to device buffers -- # + quantum_state_re = ctx.enqueue_create_buffer[dtype](state_vector_size) + quantum_state_im = ctx.enqueue_create_buffer[dtype](state_vector_size) + + gate_set_re = ctx.enqueue_create_buffer[dtype]( + gate_set_size * GATE_SIZE * 
GATE_SIZE + ) + gate_set_im = ctx.enqueue_create_buffer[dtype]( + gate_set_size * GATE_SIZE * GATE_SIZE + ) + + control_bits_circuit = ctx.enqueue_create_buffer[DType.int32]( + circuit_number_control_gates * NUMBER_CONTROL_BITS * 2 + ) + current_control_gate_circuit = ctx.enqueue_create_buffer[DType.int32](1) + + # Create other buffers for functions + + quantum_state_out_re = ctx.enqueue_create_buffer[dtype](state_vector_size) + quantum_state_out_im = ctx.enqueue_create_buffer[dtype](state_vector_size) + + quantum_state_re.enqueue_copy_from(host_quantum_state_re) + quantum_state_im.enqueue_copy_from(host_quantum_state_im) + + gate_set_re.enqueue_copy_from(host_gate_set_re) + gate_set_im.enqueue_copy_from(host_gate_set_im) + + control_bits_circuit.enqueue_copy_from(host_control_bits_circuit) + + ctx.enqueue_memset(current_control_gate_circuit, 0) + ctx.enqueue_memset(quantum_state_out_re, 0.0) + ctx.enqueue_memset(quantum_state_out_im, 0.0) + + # -- Create layout tensors for GPU operations -- # + gate_set_re_tensor = LayoutTensor[mut=False, dtype, gate_set_1qubit_layout]( + gate_set_re.unsafe_ptr() + ) + gate_set_im_tensor = LayoutTensor[mut=False, dtype, gate_set_1qubit_layout]( + gate_set_im.unsafe_ptr() + ) + + quantum_state_re_tensor = LayoutTensor[ + mut=True, dtype, state_vector_layout + ](quantum_state_re.unsafe_ptr()) + quantum_state_im_tensor = LayoutTensor[ + mut=True, dtype, state_vector_layout + ](quantum_state_im.unsafe_ptr()) + + quantum_state_out_re_tensor = LayoutTensor[ + mut=True, dtype, state_vector_layout + ](quantum_state_out_re.unsafe_ptr()) + quantum_state_out_im_tensor = LayoutTensor[ + mut=True, dtype, state_vector_layout + ](quantum_state_out_im.unsafe_ptr()) + + control_bits_circuit_tensor = LayoutTensor[ + mut=False, DType.int32, circuit_control_bits_layout + ](control_bits_circuit.unsafe_ptr()) + current_control_gate_circuit_tensor = LayoutTensor[ + mut=True, DType.int32, Layout.row_major(1) + ](current_control_gate_circuit.unsafe_ptr()) 
+ + # -- Apply circuit operations -- # + + # Gate 0 + # quantum_state = qubit_wise_multiply_gpu( + # Hadamard.matrix, 1, quantum_state + # ) + ctx.enqueue_function[ + qubit_wise_multiply_inplace_gpu[number_control_bits=0] + ]( + gate_set_re_tensor, + gate_set_im_tensor, + gate_set_dic[Hadamard.symbol], + GATE_SIZE, + 1, # target_qubit + quantum_state_re_tensor, + quantum_state_im_tensor, + num_qubits, # number_qubits + state_vector_size, # quantum_state_size + quantum_state_out_re_tensor, + quantum_state_out_im_tensor, + control_bits_circuit_tensor, + current_control_gate_circuit_tensor, + grid_dim=blocks_per_grid, + block_dim=threads_per_block, + ) + + # # It works + # with quantum_state_out_re.map_to_host() as host_re, quantum_state_out_im.map_to_host() as host_im: + # print( + # "After Hadamard gate on qubit 1\nreal part:\n", + # host_re, + # "\nimaginary part:\n", + # host_im, + # ) + + # Gate 1 (reverse the states input <-> output) + # quantum_state = qubit_wise_multiply(PauliX.matrix, 2, quantum_state) + ctx.enqueue_function[ + qubit_wise_multiply_inplace_gpu[number_control_bits=0] + ]( + gate_set_re_tensor, + gate_set_im_tensor, + gate_set_dic[PauliX.symbol], + GATE_SIZE, + 2, # target_qubit + quantum_state_out_re_tensor, + quantum_state_out_im_tensor, + num_qubits, # number_qubits + state_vector_size, # quantum_state_size + quantum_state_re_tensor, + quantum_state_im_tensor, + control_bits_circuit_tensor, + current_control_gate_circuit_tensor, + grid_dim=blocks_per_grid, + block_dim=threads_per_block, + ) + + # with quantum_state_re.map_to_host() as host_re, quantum_state_im.map_to_host() as host_im: + # print( + # "After Pauli-X gate on qubit 2:", + # "\nreal part:\n", + # host_re, + # "\nimaginary part:\n", + # host_im, + # ) + + # # Gate 2 + # quantum_state = qubit_wise_multiply( + # PauliX.matrix, 0, quantum_state, [[1, 1]] + # ) + ctx.enqueue_function[ + qubit_wise_multiply_inplace_gpu[number_control_bits=1] + ]( + gate_set_re_tensor, + 
gate_set_im_tensor, + gate_set_dic[PauliX.symbol], + GATE_SIZE, + 0, # target_qubit + quantum_state_re_tensor, + quantum_state_im_tensor, + num_qubits, # number_qubits + state_vector_size, # quantum_state_size + quantum_state_out_re_tensor, + quantum_state_out_im_tensor, + control_bits_circuit_tensor, + current_control_gate_circuit_tensor, + grid_dim=blocks_per_grid, + block_dim=threads_per_block, + ) + + # with quantum_state_out_re.map_to_host() as host_re, quantum_state_out_im.map_to_host() as host_im: + # print( + # "After Pauli-X gate on qubit 0 with control on qubit 1:", + # "\nreal part:\n", + # host_re, + # "\nimaginary part:\n", + # host_im, + # ) + + # Gate 3 + # quantum_state = qubit_wise_multiply(PauliZ.matrix, 0, quantum_state) + ctx.enqueue_function[ + qubit_wise_multiply_inplace_gpu[number_control_bits=0] + ]( + gate_set_re_tensor, + gate_set_im_tensor, + gate_set_dic[PauliZ.symbol], + GATE_SIZE, + 0, # target_qubit + quantum_state_out_re_tensor, + quantum_state_out_im_tensor, + num_qubits, # number_qubits + state_vector_size, # quantum_state_size + quantum_state_re_tensor, + quantum_state_im_tensor, + control_bits_circuit_tensor, + current_control_gate_circuit_tensor, + grid_dim=blocks_per_grid, + block_dim=threads_per_block, + ) + + # with quantum_state_re.map_to_host() as host_re, quantum_state_im.map_to_host() as host_im: + # print( + # "After Pauli-Z gate on qubit 0:\nreal part:\n", + # host_re, + # "\nimaginary part:\n", + # host_im, + # ) + + # Gate 4 + # quantum_state = qubit_wise_multiply( + # PauliX.matrix, 2, quantum_state, [[1, 1]] + # ) + ctx.enqueue_function[ + qubit_wise_multiply_inplace_gpu[number_control_bits=1] + ]( + gate_set_re_tensor, + gate_set_im_tensor, + gate_set_dic[PauliX.symbol], + GATE_SIZE, + 2, # target_qubit + quantum_state_re_tensor, + quantum_state_im_tensor, + num_qubits, # number_qubits + state_vector_size, # quantum_state_size + quantum_state_out_re_tensor, + quantum_state_out_im_tensor, + 
control_bits_circuit_tensor, + current_control_gate_circuit_tensor, + grid_dim=blocks_per_grid, + block_dim=threads_per_block, + ) + + with quantum_state_out_re.map_to_host() as host_re, quantum_state_out_im.map_to_host() as host_im: + print( + ( + "After Pauli-X gate on qubit 2 with control on qubit 1" + " (Final State):\nreal part:\n" + ), + host_re, + "\nhost_rere[3]:", + host_re[4], + "\nhost_rere[4]:", + host_re[5], + "\nimaginary part:\n", + host_im, + ) + for i in range(state_vector_size): + quantum_state[i] = ComplexFloat32(host_re[i], host_im[i]) + + assert_state_vector_almost_equal( + quantum_state, test_qubit_wise_multiply_figure1_reference + ) diff --git a/tests/base/test_qubit_operations.mojo b/tests/base/test_qubit_operations.mojo index 89e6126..8f55f13 100644 --- a/tests/base/test_qubit_operations.mojo +++ b/tests/base/test_qubit_operations.mojo @@ -8,7 +8,11 @@ from testing import ( from testing_matrix import assert_matrix_almost_equal -from testing_state_vector import assert_state_vector_almost_equal +from testing_state_vector import ( + assert_state_vector_almost_equal, + test_qubit_wise_multiply_0_reference, + test_qubit_wise_multiply_figure1_reference, +) from math import sqrt @@ -50,19 +54,7 @@ def test_qubit_wise_multiply_0(): assert_state_vector_almost_equal( quantum_state, - StateVector( - 3, - CustomList[ComplexFloat32, hint_trivial_type=True]( - ComplexFloat32(0.5, 0), - ComplexFloat32(0, 0), - ComplexFloat32(0, 0), - ComplexFloat32(0.5, 0), - ComplexFloat32(0.5, 0), - ComplexFloat32(0, 0), - ComplexFloat32(0, 0), - ComplexFloat32(0.5, 0), - ), - ), + test_qubit_wise_multiply_0_reference, ) @@ -101,19 +93,7 @@ def test_qubit_wise_multiply_figure1(): assert_state_vector_almost_equal( quantum_state, - StateVector( - 3, - CustomList[ComplexFloat32, hint_trivial_type=True]( - ComplexFloat32(0, 0), - ComplexFloat32(0, 0), - ComplexFloat32(0, 0), - ComplexFloat32(-1.0 / Float32(sqrt(2.0)), 0), - ComplexFloat32(1.0 / Float32(sqrt(2.0)), 0), - 
ComplexFloat32(0, 0), - ComplexFloat32(0, 0), - ComplexFloat32(0, 0), - ), - ), + test_qubit_wise_multiply_figure1_reference, ) diff --git a/tests/base/testing_state_vector.mojo b/tests/base/testing_state_vector.mojo index 7cabe7d..242ca23 100644 --- a/tests/base/testing_state_vector.mojo +++ b/tests/base/testing_state_vector.mojo @@ -1,3 +1,5 @@ +from math import sqrt + from testing import ( assert_true, assert_false, @@ -8,6 +10,37 @@ from testing import ( from qlabs.base import StateVector +from qlabs.local_stdlib.complex import ComplexFloat32 +from qlabs.local_stdlib import CustomList + +alias test_qubit_wise_multiply_0_reference = StateVector( + 3, + CustomList[ComplexFloat32, hint_trivial_type=True]( + ComplexFloat32(0.5, 0), + ComplexFloat32(0, 0), + ComplexFloat32(0, 0), + ComplexFloat32(0.5, 0), + ComplexFloat32(0.5, 0), + ComplexFloat32(0, 0), + ComplexFloat32(0, 0), + ComplexFloat32(0.5, 0), + ), +) + +alias test_qubit_wise_multiply_figure1_reference = StateVector( + 3, + CustomList[ComplexFloat32, hint_trivial_type=True]( + ComplexFloat32(0, 0), + ComplexFloat32(0, 0), + ComplexFloat32(0, 0), + ComplexFloat32(-1.0 / Float32(sqrt(2.0)), 0), + ComplexFloat32(1.0 / Float32(sqrt(2.0)), 0), + ComplexFloat32(0, 0), + ComplexFloat32(0, 0), + ComplexFloat32(0, 0), + ), +) + def assert_state_vector_almost_equal( reference_state: StateVector, state: StateVector, message: String = "" From f3e1ed44b618232711bd1271906f9cc9f6582297 Mon Sep 17 00:00:00 2001 From: ttrenty <154608953+ttrenty@users.noreply.github.com> Date: Sun, 29 Jun 2025 03:43:01 -0600 Subject: [PATCH 5/7] fix: update main ci --- .github/workflows/main_ci.yml | 10 +++++++--- benchmarks/all_benchmarks.mojo | 14 +++++++------- pixi.toml | 14 +------------- 3 files changed, 15 insertions(+), 23 deletions(-) diff --git a/.github/workflows/main_ci.yml b/.github/workflows/main_ci.yml index 4460468..88b6ed2 100644 --- a/.github/workflows/main_ci.yml +++ b/.github/workflows/main_ci.yml @@ -11,6 +11,9 @@ jobs: 
build-and-test: runs-on: ubuntu-latest + env: + CONDA_OVERRIDE_CUDA: "12.0" # Mock CUDA version 12.0 + steps: - name: Checkout code uses: actions/checkout@v4 @@ -20,7 +23,9 @@ jobs: with: pixi-version: latest cache: true - cache-key: ${{ runner.os }}-pixi-v1-${{ hashFiles('**/pixi.lock') }} + cache-key: ${{ runner.os }}-pixi-${{ hashFiles('**/pixi.lock') }} + restore-keys: | + ${{ runner.os }}-pixi- - name: Check formatting run: pixi run lint @@ -32,5 +37,4 @@ jobs: run: pixi run main - name: Run tests - run: pixi run test - + run: pixi run test \ No newline at end of file diff --git a/benchmarks/all_benchmarks.mojo b/benchmarks/all_benchmarks.mojo index a03ebfb..b288c69 100644 --- a/benchmarks/all_benchmarks.mojo +++ b/benchmarks/all_benchmarks.mojo @@ -19,25 +19,25 @@ def main(): max_number_qubits=25, number_qubits_step_size=2, min_number_layers=5, - max_number_layers=3000, + max_number_layers=4000, number_layers_step_size=400, - fixed_number_qubits=10, - fixed_number_layers=5, + fixed_number_qubits=11, + fixed_number_layers=50, ]() @parameter if not has_accelerator(): print("No compatible GPU found") else: - bench_qubit_wise_multiply_inplace_gpu[ + bench_qubit_wise_multiply_inplace[ min_number_qubits=5, max_number_qubits=25, number_qubits_step_size=2, min_number_layers=5, - max_number_layers=3000, + max_number_layers=4000, number_layers_step_size=400, - fixed_number_qubits=10, - fixed_number_layers=5, + fixed_number_qubits=11, + fixed_number_layers=50, ]() # bench_qubit_wise_multiply_extended() diff --git a/pixi.toml b/pixi.toml index 0c5386a..5b45626 100644 --- a/pixi.toml +++ b/pixi.toml @@ -107,21 +107,9 @@ cmd = "python3 benchmarks/plot_results.py" inputs = ["data/**/*.csv"] depends-on = ["install", "bench"] - -# # Benches -# bench_decimal = "clear && pixi run package && cd benches/decimal && pixi run mojo -I ../ bench.mojo && cd ../.. 
&& pixi run clean" -# bench_bigint = "clear && pixi run package && cd benches/bigint && pixi run mojo -I ../ bench.mojo && cd ../.. && pixi run clean" -# bench_biguint = "clear && pixi run package && cd benches/biguint && pixi run mojo -I ../ bench.mojo && cd ../.. && pixi run clean" -# bench_bigdecimal = "clear && pixi run package && cd benches/bigdecimal && pixi run mojo -I ../ bench.mojo && cd ../.. && pixi run clean" -# bench_dec = "pixi run bench_decimal" -# bench_bint = "pixi run bench_bigint" -# bench_buint = "pixi run bench_biguint" -# bench_bdec = "pixi run bench_bigdecimal" - - [tasks] tests = [{ task = "test" }] p = [{ task = "clear" }, { task = "package" }] m = [{ task = "clear" }, { task = "main" }] -# t = "clear && pixi run package && pixi run mojo test tests --filter" +t = [{ task = "clear" }, { task = "test" }] From 10826d7a3485fc6c235a7915cb24f2ed1f981f9d Mon Sep 17 00:00:00 2001 From: ttrenty <154608953+ttrenty@users.noreply.github.com> Date: Sun, 29 Jun 2025 03:51:23 -0600 Subject: [PATCH 6/7] fix: remove restore-keys in ci + add back format task --- .github/workflows/main_ci.yml | 2 -- pixi.toml | 8 ++++++++ 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/.github/workflows/main_ci.yml b/.github/workflows/main_ci.yml index 88b6ed2..1fe3768 100644 --- a/.github/workflows/main_ci.yml +++ b/.github/workflows/main_ci.yml @@ -24,8 +24,6 @@ jobs: pixi-version: latest cache: true cache-key: ${{ runner.os }}-pixi-${{ hashFiles('**/pixi.lock') }} - restore-keys: | - ${{ runner.os }}-pixi- - name: Check formatting run: pixi run lint diff --git a/pixi.toml b/pixi.toml index 5b45626..074a7bd 100644 --- a/pixi.toml +++ b/pixi.toml @@ -44,6 +44,14 @@ inputs = ["./tests/**/*.mojo"] cmd = "pixi run mojo format ./benchmarks" inputs = ["./benchmarks/**/*.mojo"] +[tasks.format] +depends-on = [ + "format_examples", + "format_src", + "format_tests", + "format_benchmarks", +] # Format all the code + [tasks.create_build_dir] cmd = "mkdir -p build/" From 
95439a8f24dd66e15e15f5b3a44b88a7b462d610 Mon Sep 17 00:00:00 2001 From: ttrenty <154608953+ttrenty@users.noreply.github.com> Date: Sun, 29 Jun 2025 03:54:53 -0600 Subject: [PATCH 7/7] fix: try to fix gpu tests for ci --- benchmarks/all_benchmarks.mojo | 4 +- tests/base/test_gpu_qubits_operations.mojo | 1139 ++++++++++---------- 2 files changed, 579 insertions(+), 564 deletions(-) diff --git a/benchmarks/all_benchmarks.mojo b/benchmarks/all_benchmarks.mojo index b288c69..062bfd5 100644 --- a/benchmarks/all_benchmarks.mojo +++ b/benchmarks/all_benchmarks.mojo @@ -22,7 +22,7 @@ def main(): max_number_layers=4000, number_layers_step_size=400, fixed_number_qubits=11, - fixed_number_layers=50, + fixed_number_layers=20, ]() @parameter @@ -37,7 +37,7 @@ def main(): max_number_layers=4000, number_layers_step_size=400, fixed_number_qubits=11, - fixed_number_layers=50, + fixed_number_layers=20, ]() # bench_qubit_wise_multiply_extended() diff --git a/tests/base/test_gpu_qubits_operations.mojo b/tests/base/test_gpu_qubits_operations.mojo index eb60bd8..f92a817 100644 --- a/tests/base/test_gpu_qubits_operations.mojo +++ b/tests/base/test_gpu_qubits_operations.mojo @@ -46,226 +46,233 @@ def test_qubit_wise_multiply_0(): print("No compatible GPU found") return - gate_set: List[Gate] = [Hadamard, PauliX] - alias gate_set_dic: Dict[String, Int] = { - Hadamard.symbol: 0, - PauliX.symbol: 1, - } - alias gate_set_size = 2 - alias gate_set_1qubit_layout = Layout.row_major( - gate_set_size, GATE_SIZE, GATE_SIZE - ) - - alias CIRCUIT_NUMBER_CONTROL_GATES = 1 - alias circuit_control_bits_layout = Layout.row_major( - CIRCUIT_NUMBER_CONTROL_GATES, NUMBER_CONTROL_BITS, 2 - ) - - alias num_qubits = 3 - alias state_vector_size = 1 << num_qubits - alias state_vector_layout = Layout.row_major(state_vector_size) - - alias total_threads = state_vector_size - - alias blocks_per_grid = 1 - alias threads_per_block = ( - total_threads, - 1, - 1, - ) - - ctx = DeviceContext() - - host_quantum_state_re = 
ctx.enqueue_create_host_buffer[dtype]( - state_vector_size - ) - host_quantum_state_im = ctx.enqueue_create_host_buffer[dtype]( - state_vector_size - ) - - host_gate_set_re = ctx.enqueue_create_host_buffer[dtype]( - gate_set_size * GATE_SIZE * GATE_SIZE - ) - host_gate_set_im = ctx.enqueue_create_host_buffer[dtype]( - gate_set_size * GATE_SIZE * GATE_SIZE - ) - - host_control_bits_circuit = ctx.enqueue_create_host_buffer[DType.int32]( - CIRCUIT_NUMBER_CONTROL_GATES * NUMBER_CONTROL_BITS * 2 - ) - - quantum_state: StateVector = StateVector.from_bitstring("000") - var control_bits_list: List[List[List[Int]]] = [ - [[0, 1]], - ] - - ctx.synchronize() - - # -- Fill host buffers -- # - - for i in range(state_vector_size): - host_quantum_state_re[i] = quantum_state[i].re - host_quantum_state_im[i] = quantum_state[i].im - - for i in range(gate_set_size): - gate = gate_set[i] - for j in range(GATE_SIZE): - for k in range(GATE_SIZE): - index = gate_set_1qubit_layout( - IntTuple(i, j, k) - ) # Get the index in the 1D buffer - host_gate_set_re[index] = gate[j, k].re - host_gate_set_im[index] = gate[j, k].im - - for i in range(CIRCUIT_NUMBER_CONTROL_GATES): - for j in range(NUMBER_CONTROL_BITS): - for k in range(2): - index = circuit_control_bits_layout(IntTuple(i, j, k)) - host_control_bits_circuit[index] = control_bits_list[i][j][k] - - # -- Copy host buffers to device buffers -- # - quantum_state_re = ctx.enqueue_create_buffer[dtype](state_vector_size) - quantum_state_im = ctx.enqueue_create_buffer[dtype](state_vector_size) - - gate_set_re = ctx.enqueue_create_buffer[dtype]( - gate_set_size * GATE_SIZE * GATE_SIZE - ) - gate_set_im = ctx.enqueue_create_buffer[dtype]( - gate_set_size * GATE_SIZE * GATE_SIZE - ) - - control_bits_circuit = ctx.enqueue_create_buffer[DType.int32]( - CIRCUIT_NUMBER_CONTROL_GATES * NUMBER_CONTROL_BITS * 2 - ) - current_control_gate_circuit = ctx.enqueue_create_buffer[DType.int32](1) - - # Create other buffers for functions - - quantum_state_out_re 
= ctx.enqueue_create_buffer[dtype](state_vector_size) - quantum_state_out_im = ctx.enqueue_create_buffer[dtype](state_vector_size) - - quantum_state_re.enqueue_copy_from(host_quantum_state_re) - quantum_state_im.enqueue_copy_from(host_quantum_state_im) - - gate_set_re.enqueue_copy_from(host_gate_set_re) - gate_set_im.enqueue_copy_from(host_gate_set_im) - - control_bits_circuit.enqueue_copy_from(host_control_bits_circuit) - - ctx.enqueue_memset(current_control_gate_circuit, 0) - ctx.enqueue_memset(quantum_state_out_re, 0.0) - ctx.enqueue_memset(quantum_state_out_im, 0.0) - - # -- Create layout tensors for GPU operations -- # - gate_set_re_tensor = LayoutTensor[mut=False, dtype, gate_set_1qubit_layout]( - gate_set_re.unsafe_ptr() - ) - gate_set_im_tensor = LayoutTensor[mut=False, dtype, gate_set_1qubit_layout]( - gate_set_im.unsafe_ptr() - ) - - quantum_state_re_tensor = LayoutTensor[ - mut=True, dtype, state_vector_layout - ](quantum_state_re.unsafe_ptr()) - quantum_state_im_tensor = LayoutTensor[ - mut=True, dtype, state_vector_layout - ](quantum_state_im.unsafe_ptr()) - - quantum_state_out_re_tensor = LayoutTensor[ - mut=True, dtype, state_vector_layout - ](quantum_state_out_re.unsafe_ptr()) - quantum_state_out_im_tensor = LayoutTensor[ - mut=True, dtype, state_vector_layout - ](quantum_state_out_im.unsafe_ptr()) - - control_bits_circuit_tensor = LayoutTensor[ - mut=False, DType.int32, circuit_control_bits_layout - ](control_bits_circuit.unsafe_ptr()) - current_control_gate_circuit_tensor = LayoutTensor[ - mut=True, DType.int32, Layout.row_major(1) - ](current_control_gate_circuit.unsafe_ptr()) - - # quantum_state = qubit_wise_multiply(Hadamard.matrix, 0, quantum_state) - ctx.enqueue_function[ - qubit_wise_multiply_inplace_gpu[number_control_bits=0] - ]( - gate_set_re_tensor, - gate_set_im_tensor, - gate_set_dic[Hadamard.symbol], - GATE_SIZE, - 0, # target_qubit - quantum_state_re_tensor, - quantum_state_im_tensor, - num_qubits, # number_qubits - 
state_vector_size, # quantum_state_size - quantum_state_out_re_tensor, - quantum_state_out_im_tensor, - control_bits_circuit_tensor, - current_control_gate_circuit_tensor, - grid_dim=blocks_per_grid, - block_dim=threads_per_block, - ) - - # quantum_state = qubit_wise_multiply( - # PauliX.matrix, 1, quantum_state, [[0, 1]] - # ) - ctx.enqueue_function[ - qubit_wise_multiply_inplace_gpu[number_control_bits=1] - ]( - gate_set_re_tensor, - gate_set_im_tensor, - gate_set_dic[PauliX.symbol], - GATE_SIZE, - 1, # target_qubit - quantum_state_out_re_tensor, - quantum_state_out_im_tensor, - num_qubits, # number_qubits - state_vector_size, # quantum_state_size - quantum_state_re_tensor, - quantum_state_im_tensor, - control_bits_circuit_tensor, - current_control_gate_circuit_tensor, - grid_dim=blocks_per_grid, - block_dim=threads_per_block, - ) - - # quantum_state = qubit_wise_multiply(Hadamard.matrix, 2, quantum_state) - ctx.enqueue_function[ - qubit_wise_multiply_inplace_gpu[number_control_bits=0] - ]( - gate_set_re_tensor, - gate_set_im_tensor, - gate_set_dic[Hadamard.symbol], - GATE_SIZE, - 2, # target_qubit - quantum_state_re_tensor, - quantum_state_im_tensor, - num_qubits, # number_qubits - state_vector_size, # quantum_state_size - quantum_state_out_re_tensor, - quantum_state_out_im_tensor, - control_bits_circuit_tensor, - current_control_gate_circuit_tensor, - grid_dim=blocks_per_grid, - block_dim=threads_per_block, - ) - - with quantum_state_out_re.map_to_host() as host_re, quantum_state_out_im.map_to_host() as host_im: - print( - ( - "After Pauli-X gate on qubit 2 with control on qubit 1" - " (Final State):\nreal part:\n" - ), - host_re, - "\nimaginary part:\n", - host_im, + else: + gate_set: List[Gate] = [Hadamard, PauliX] + alias gate_set_dic: Dict[String, Int] = { + Hadamard.symbol: 0, + PauliX.symbol: 1, + } + alias gate_set_size = 2 + alias gate_set_1qubit_layout = Layout.row_major( + gate_set_size, GATE_SIZE, GATE_SIZE + ) + + alias CIRCUIT_NUMBER_CONTROL_GATES 
= 1 + alias circuit_control_bits_layout = Layout.row_major( + CIRCUIT_NUMBER_CONTROL_GATES, NUMBER_CONTROL_BITS, 2 + ) + + alias num_qubits = 3 + alias state_vector_size = 1 << num_qubits + alias state_vector_layout = Layout.row_major(state_vector_size) + + alias total_threads = state_vector_size + + alias blocks_per_grid = 1 + alias threads_per_block = ( + total_threads, + 1, + 1, + ) + + ctx = DeviceContext() + + host_quantum_state_re = ctx.enqueue_create_host_buffer[dtype]( + state_vector_size + ) + host_quantum_state_im = ctx.enqueue_create_host_buffer[dtype]( + state_vector_size + ) + + host_gate_set_re = ctx.enqueue_create_host_buffer[dtype]( + gate_set_size * GATE_SIZE * GATE_SIZE + ) + host_gate_set_im = ctx.enqueue_create_host_buffer[dtype]( + gate_set_size * GATE_SIZE * GATE_SIZE + ) + + host_control_bits_circuit = ctx.enqueue_create_host_buffer[DType.int32]( + CIRCUIT_NUMBER_CONTROL_GATES * NUMBER_CONTROL_BITS * 2 ) + + quantum_state: StateVector = StateVector.from_bitstring("000") + var control_bits_list: List[List[List[Int]]] = [ + [[0, 1]], + ] + + ctx.synchronize() + + # -- Fill host buffers -- # + for i in range(state_vector_size): - quantum_state[i] = ComplexFloat32(host_re[i], host_im[i]) + host_quantum_state_re[i] = quantum_state[i].re + host_quantum_state_im[i] = quantum_state[i].im + + for i in range(gate_set_size): + gate = gate_set[i] + for j in range(GATE_SIZE): + for k in range(GATE_SIZE): + index = gate_set_1qubit_layout( + IntTuple(i, j, k) + ) # Get the index in the 1D buffer + host_gate_set_re[index] = gate[j, k].re + host_gate_set_im[index] = gate[j, k].im + + for i in range(CIRCUIT_NUMBER_CONTROL_GATES): + for j in range(NUMBER_CONTROL_BITS): + for k in range(2): + index = circuit_control_bits_layout(IntTuple(i, j, k)) + host_control_bits_circuit[index] = control_bits_list[i][j][ + k + ] + + # -- Copy host buffers to device buffers -- # + quantum_state_re = ctx.enqueue_create_buffer[dtype](state_vector_size) + quantum_state_im = 
ctx.enqueue_create_buffer[dtype](state_vector_size) + + gate_set_re = ctx.enqueue_create_buffer[dtype]( + gate_set_size * GATE_SIZE * GATE_SIZE + ) + gate_set_im = ctx.enqueue_create_buffer[dtype]( + gate_set_size * GATE_SIZE * GATE_SIZE + ) + + control_bits_circuit = ctx.enqueue_create_buffer[DType.int32]( + CIRCUIT_NUMBER_CONTROL_GATES * NUMBER_CONTROL_BITS * 2 + ) + current_control_gate_circuit = ctx.enqueue_create_buffer[DType.int32](1) + + # Create other buffers for functions + + quantum_state_out_re = ctx.enqueue_create_buffer[dtype]( + state_vector_size + ) + quantum_state_out_im = ctx.enqueue_create_buffer[dtype]( + state_vector_size + ) + + quantum_state_re.enqueue_copy_from(host_quantum_state_re) + quantum_state_im.enqueue_copy_from(host_quantum_state_im) + + gate_set_re.enqueue_copy_from(host_gate_set_re) + gate_set_im.enqueue_copy_from(host_gate_set_im) + + control_bits_circuit.enqueue_copy_from(host_control_bits_circuit) + + ctx.enqueue_memset(current_control_gate_circuit, 0) + ctx.enqueue_memset(quantum_state_out_re, 0.0) + ctx.enqueue_memset(quantum_state_out_im, 0.0) + + # -- Create layout tensors for GPU operations -- # + gate_set_re_tensor = LayoutTensor[ + mut=False, dtype, gate_set_1qubit_layout + ](gate_set_re.unsafe_ptr()) + gate_set_im_tensor = LayoutTensor[ + mut=False, dtype, gate_set_1qubit_layout + ](gate_set_im.unsafe_ptr()) + + quantum_state_re_tensor = LayoutTensor[ + mut=True, dtype, state_vector_layout + ](quantum_state_re.unsafe_ptr()) + quantum_state_im_tensor = LayoutTensor[ + mut=True, dtype, state_vector_layout + ](quantum_state_im.unsafe_ptr()) + + quantum_state_out_re_tensor = LayoutTensor[ + mut=True, dtype, state_vector_layout + ](quantum_state_out_re.unsafe_ptr()) + quantum_state_out_im_tensor = LayoutTensor[ + mut=True, dtype, state_vector_layout + ](quantum_state_out_im.unsafe_ptr()) + + control_bits_circuit_tensor = LayoutTensor[ + mut=False, DType.int32, circuit_control_bits_layout + ](control_bits_circuit.unsafe_ptr()) 
+ current_control_gate_circuit_tensor = LayoutTensor[ + mut=True, DType.int32, Layout.row_major(1) + ](current_control_gate_circuit.unsafe_ptr()) + + # quantum_state = qubit_wise_multiply(Hadamard.matrix, 0, quantum_state) + ctx.enqueue_function[ + qubit_wise_multiply_inplace_gpu[number_control_bits=0] + ]( + gate_set_re_tensor, + gate_set_im_tensor, + gate_set_dic[Hadamard.symbol], + GATE_SIZE, + 0, # target_qubit + quantum_state_re_tensor, + quantum_state_im_tensor, + num_qubits, # number_qubits + state_vector_size, # quantum_state_size + quantum_state_out_re_tensor, + quantum_state_out_im_tensor, + control_bits_circuit_tensor, + current_control_gate_circuit_tensor, + grid_dim=blocks_per_grid, + block_dim=threads_per_block, + ) + + # quantum_state = qubit_wise_multiply( + # PauliX.matrix, 1, quantum_state, [[0, 1]] + # ) + ctx.enqueue_function[ + qubit_wise_multiply_inplace_gpu[number_control_bits=1] + ]( + gate_set_re_tensor, + gate_set_im_tensor, + gate_set_dic[PauliX.symbol], + GATE_SIZE, + 1, # target_qubit + quantum_state_out_re_tensor, + quantum_state_out_im_tensor, + num_qubits, # number_qubits + state_vector_size, # quantum_state_size + quantum_state_re_tensor, + quantum_state_im_tensor, + control_bits_circuit_tensor, + current_control_gate_circuit_tensor, + grid_dim=blocks_per_grid, + block_dim=threads_per_block, + ) - assert_state_vector_almost_equal( - quantum_state, test_qubit_wise_multiply_0_reference - ) + # quantum_state = qubit_wise_multiply(Hadamard.matrix, 2, quantum_state) + ctx.enqueue_function[ + qubit_wise_multiply_inplace_gpu[number_control_bits=0] + ]( + gate_set_re_tensor, + gate_set_im_tensor, + gate_set_dic[Hadamard.symbol], + GATE_SIZE, + 2, # target_qubit + quantum_state_re_tensor, + quantum_state_im_tensor, + num_qubits, # number_qubits + state_vector_size, # quantum_state_size + quantum_state_out_re_tensor, + quantum_state_out_im_tensor, + control_bits_circuit_tensor, + current_control_gate_circuit_tensor, + 
grid_dim=blocks_per_grid, + block_dim=threads_per_block, + ) + + with quantum_state_out_re.map_to_host() as host_re, quantum_state_out_im.map_to_host() as host_im: + print( + ( + "After Pauli-X gate on qubit 2 with control on qubit 1" + " (Final State):\nreal part:\n" + ), + host_re, + "\nimaginary part:\n", + host_im, + ) + for i in range(state_vector_size): + quantum_state[i] = ComplexFloat32(host_re[i], host_im[i]) + + assert_state_vector_almost_equal( + quantum_state, test_qubit_wise_multiply_0_reference + ) def test_qubit_wise_multiply_figure1(): @@ -275,361 +282,369 @@ def test_qubit_wise_multiply_figure1(): if not has_accelerator(): print("No compatible GPU found") return + else: + alias num_qubits = 3 - alias num_qubits = 3 - - alias circuit_number_control_gates = 2 - alias circuit_control_bits_layout = Layout.row_major( - circuit_number_control_gates, NUMBER_CONTROL_BITS, 2 - ) - - gate_set: List[Gate] = [Hadamard, PauliX, PauliZ] - gate_set_dic: Dict[String, Int] = { - Hadamard.symbol: 0, - PauliX.symbol: 1, - PauliZ.symbol: 2, - } - alias gate_set_size = 3 - alias gate_set_1qubit_layout = Layout.row_major( - gate_set_size, GATE_SIZE, GATE_SIZE - ) + alias circuit_number_control_gates = 2 + alias circuit_control_bits_layout = Layout.row_major( + circuit_number_control_gates, NUMBER_CONTROL_BITS, 2 + ) - alias state_vector_size = 1 << num_qubits - alias state_vector_layout = Layout.row_major(state_vector_size) + gate_set: List[Gate] = [Hadamard, PauliX, PauliZ] + gate_set_dic: Dict[String, Int] = { + Hadamard.symbol: 0, + PauliX.symbol: 1, + PauliZ.symbol: 2, + } + alias gate_set_size = 3 + alias gate_set_1qubit_layout = Layout.row_major( + gate_set_size, GATE_SIZE, GATE_SIZE + ) - alias total_threads = state_vector_size + alias state_vector_size = 1 << num_qubits + alias state_vector_layout = Layout.row_major(state_vector_size) - alias max_threads_per_block = 1024 # Maximum threads per block in CUDA - alias blocks_per_grid = ( - total_threads + 
max_threads_per_block - 1 - ) // max_threads_per_block + alias total_threads = state_vector_size - alias threads_per_block = ( - max_threads_per_block, - 1, - 1, - ) + alias max_threads_per_block = 1024 # Maximum threads per block in CUDA + alias blocks_per_grid = ( + total_threads + max_threads_per_block - 1 + ) // max_threads_per_block - @parameter - if total_threads < max_threads_per_block: alias threads_per_block = ( - total_threads, + max_threads_per_block, 1, 1, - ) # 1D block of threads - - print("state_vector_size:", state_vector_size) - print("blocks_per_grid:", blocks_per_grid) - print("threads_per_block[0]:", threads_per_block[0]) - - var control_bits_list: List[List[List[Int]]] = [ - [[1, 1]], # Control on qubit 1 and is control because flag=1 - [[1, 1]], # Control on qubit 1 and is control because flag=1 - ] - - ctx = DeviceContext() - print("Using GPU:", ctx.name()) - print("ctx.device_info:", ctx.device_info) - print( - "ctx.device_info.max_thread_block_size:", - ctx.device_info.max_thread_block_size, - ) - print( - "ctx.device_info.max_blocks_per_multiprocessor:", - ctx.device_info.max_blocks_per_multiprocessor, - ) - try: - (free, total) = ctx.get_memory_info() - print("Free memory:", free / (1024 * 1024), "MB") - print("Total memory:", total / (1024 * 1024), "MB") - except: - print("Failed to get memory information") - - # -- Create GPU variables -- # - # These don't need to be initialized to zero, they will be filled later - - host_quantum_state_re = ctx.enqueue_create_host_buffer[dtype]( - state_vector_size - ) - host_quantum_state_im = ctx.enqueue_create_host_buffer[dtype]( - state_vector_size - ) - - host_gate_set_re = ctx.enqueue_create_host_buffer[dtype]( - gate_set_size * GATE_SIZE * GATE_SIZE - ) - host_gate_set_im = ctx.enqueue_create_host_buffer[dtype]( - gate_set_size * GATE_SIZE * GATE_SIZE - ) - - host_control_bits_circuit = ctx.enqueue_create_host_buffer[DType.int32]( - circuit_number_control_gates * NUMBER_CONTROL_BITS * 2 - ) - - # 
-- Initialize the quantum circuit to the |000⟩ state -- # - quantum_state: StateVector = StateVector.from_bitstring("0" * num_qubits) - # print("Initial quantum state:\n", quantum_state) - - # Wait for host buffers to be ready - ctx.synchronize() - - # -- Fill host buffers -- # - - for i in range(state_vector_size): - host_quantum_state_re[i] = quantum_state[i].re - host_quantum_state_im[i] = quantum_state[i].im - - print("Initial state real part:", host_quantum_state_re) - print("Initial state imaginary part:", host_quantum_state_im) - - for i in range(gate_set_size): - gate = gate_set[i] - for j in range(GATE_SIZE): - for k in range(GATE_SIZE): - index = gate_set_1qubit_layout( - IntTuple(i, j, k) - ) # Get the index in the 1D buffer - host_gate_set_re[index] = gate[j, k].re - host_gate_set_im[index] = gate[j, k].im - - for i in range(circuit_number_control_gates): - for j in range(NUMBER_CONTROL_BITS): - for k in range(2): - index = circuit_control_bits_layout(IntTuple(i, j, k)) - host_control_bits_circuit[index] = control_bits_list[i][j][k] - - # -- Copy host buffers to device buffers -- # - quantum_state_re = ctx.enqueue_create_buffer[dtype](state_vector_size) - quantum_state_im = ctx.enqueue_create_buffer[dtype](state_vector_size) - - gate_set_re = ctx.enqueue_create_buffer[dtype]( - gate_set_size * GATE_SIZE * GATE_SIZE - ) - gate_set_im = ctx.enqueue_create_buffer[dtype]( - gate_set_size * GATE_SIZE * GATE_SIZE - ) - - control_bits_circuit = ctx.enqueue_create_buffer[DType.int32]( - circuit_number_control_gates * NUMBER_CONTROL_BITS * 2 - ) - current_control_gate_circuit = ctx.enqueue_create_buffer[DType.int32](1) - - # Create other buffers for functions - - quantum_state_out_re = ctx.enqueue_create_buffer[dtype](state_vector_size) - quantum_state_out_im = ctx.enqueue_create_buffer[dtype](state_vector_size) - - quantum_state_re.enqueue_copy_from(host_quantum_state_re) - quantum_state_im.enqueue_copy_from(host_quantum_state_im) - - 
gate_set_re.enqueue_copy_from(host_gate_set_re) - gate_set_im.enqueue_copy_from(host_gate_set_im) - - control_bits_circuit.enqueue_copy_from(host_control_bits_circuit) - - ctx.enqueue_memset(current_control_gate_circuit, 0) - ctx.enqueue_memset(quantum_state_out_re, 0.0) - ctx.enqueue_memset(quantum_state_out_im, 0.0) - - # -- Create layout tensors for GPU operations -- # - gate_set_re_tensor = LayoutTensor[mut=False, dtype, gate_set_1qubit_layout]( - gate_set_re.unsafe_ptr() - ) - gate_set_im_tensor = LayoutTensor[mut=False, dtype, gate_set_1qubit_layout]( - gate_set_im.unsafe_ptr() - ) - - quantum_state_re_tensor = LayoutTensor[ - mut=True, dtype, state_vector_layout - ](quantum_state_re.unsafe_ptr()) - quantum_state_im_tensor = LayoutTensor[ - mut=True, dtype, state_vector_layout - ](quantum_state_im.unsafe_ptr()) - - quantum_state_out_re_tensor = LayoutTensor[ - mut=True, dtype, state_vector_layout - ](quantum_state_out_re.unsafe_ptr()) - quantum_state_out_im_tensor = LayoutTensor[ - mut=True, dtype, state_vector_layout - ](quantum_state_out_im.unsafe_ptr()) - - control_bits_circuit_tensor = LayoutTensor[ - mut=False, DType.int32, circuit_control_bits_layout - ](control_bits_circuit.unsafe_ptr()) - current_control_gate_circuit_tensor = LayoutTensor[ - mut=True, DType.int32, Layout.row_major(1) - ](current_control_gate_circuit.unsafe_ptr()) - - # -- Apply circuit operations -- # - - # Gate 0 - # quantum_state = qubit_wise_multiply_gpu( - # Hadamard.matrix, 1, quantum_state - # ) - ctx.enqueue_function[ - qubit_wise_multiply_inplace_gpu[number_control_bits=0] - ]( - gate_set_re_tensor, - gate_set_im_tensor, - gate_set_dic[Hadamard.symbol], - GATE_SIZE, - 1, # target_qubit - quantum_state_re_tensor, - quantum_state_im_tensor, - num_qubits, # number_qubits - state_vector_size, # quantum_state_size - quantum_state_out_re_tensor, - quantum_state_out_im_tensor, - control_bits_circuit_tensor, - current_control_gate_circuit_tensor, - grid_dim=blocks_per_grid, - 
block_dim=threads_per_block, - ) - - # # It works - # with quantum_state_out_re.map_to_host() as host_re, quantum_state_out_im.map_to_host() as host_im: - # print( - # "After Hadamard gate on qubit 1\nreal part:\n", - # host_re, - # "\nimaginary part:\n", - # host_im, - # ) - - # Gate 1 (reverse the states input <-> output) - # quantum_state = qubit_wise_multiply(PauliX.matrix, 2, quantum_state) - ctx.enqueue_function[ - qubit_wise_multiply_inplace_gpu[number_control_bits=0] - ]( - gate_set_re_tensor, - gate_set_im_tensor, - gate_set_dic[PauliX.symbol], - GATE_SIZE, - 2, # target_qubit - quantum_state_out_re_tensor, - quantum_state_out_im_tensor, - num_qubits, # number_qubits - state_vector_size, # quantum_state_size - quantum_state_re_tensor, - quantum_state_im_tensor, - control_bits_circuit_tensor, - current_control_gate_circuit_tensor, - grid_dim=blocks_per_grid, - block_dim=threads_per_block, - ) - - # with quantum_state_re.map_to_host() as host_re, quantum_state_im.map_to_host() as host_im: - # print( - # "After Pauli-X gate on qubit 2:", - # "\nreal part:\n", - # host_re, - # "\nimaginary part:\n", - # host_im, - # ) - - # # Gate 2 - # quantum_state = qubit_wise_multiply( - # PauliX.matrix, 0, quantum_state, [[1, 1]] - # ) - ctx.enqueue_function[ - qubit_wise_multiply_inplace_gpu[number_control_bits=1] - ]( - gate_set_re_tensor, - gate_set_im_tensor, - gate_set_dic[PauliX.symbol], - GATE_SIZE, - 0, # target_qubit - quantum_state_re_tensor, - quantum_state_im_tensor, - num_qubits, # number_qubits - state_vector_size, # quantum_state_size - quantum_state_out_re_tensor, - quantum_state_out_im_tensor, - control_bits_circuit_tensor, - current_control_gate_circuit_tensor, - grid_dim=blocks_per_grid, - block_dim=threads_per_block, - ) - - # with quantum_state_out_re.map_to_host() as host_re, quantum_state_out_im.map_to_host() as host_im: - # print( - # "After Pauli-X gate on qubit 0 with control on qubit 1:", - # "\nreal part:\n", - # host_re, - # "\nimaginary 
part:\n", - # host_im, - # ) - - # Gate 3 - # quantum_state = qubit_wise_multiply(PauliZ.matrix, 0, quantum_state) - ctx.enqueue_function[ - qubit_wise_multiply_inplace_gpu[number_control_bits=0] - ]( - gate_set_re_tensor, - gate_set_im_tensor, - gate_set_dic[PauliZ.symbol], - GATE_SIZE, - 0, # target_qubit - quantum_state_out_re_tensor, - quantum_state_out_im_tensor, - num_qubits, # number_qubits - state_vector_size, # quantum_state_size - quantum_state_re_tensor, - quantum_state_im_tensor, - control_bits_circuit_tensor, - current_control_gate_circuit_tensor, - grid_dim=blocks_per_grid, - block_dim=threads_per_block, - ) - - # with quantum_state_re.map_to_host() as host_re, quantum_state_im.map_to_host() as host_im: - # print( - # "After Pauli-Z gate on qubit 0:\nreal part:\n", - # host_re, - # "\nimaginary part:\n", - # host_im, - # ) - - # Gate 4 - # quantum_state = qubit_wise_multiply( - # PauliX.matrix, 2, quantum_state, [[1, 1]] - # ) - ctx.enqueue_function[ - qubit_wise_multiply_inplace_gpu[number_control_bits=1] - ]( - gate_set_re_tensor, - gate_set_im_tensor, - gate_set_dic[PauliX.symbol], - GATE_SIZE, - 2, # target_qubit - quantum_state_re_tensor, - quantum_state_im_tensor, - num_qubits, # number_qubits - state_vector_size, # quantum_state_size - quantum_state_out_re_tensor, - quantum_state_out_im_tensor, - control_bits_circuit_tensor, - current_control_gate_circuit_tensor, - grid_dim=blocks_per_grid, - block_dim=threads_per_block, - ) - - with quantum_state_out_re.map_to_host() as host_re, quantum_state_out_im.map_to_host() as host_im: + ) + + @parameter + if total_threads < max_threads_per_block: + alias threads_per_block = ( + total_threads, + 1, + 1, + ) # 1D block of threads + + print("state_vector_size:", state_vector_size) + print("blocks_per_grid:", blocks_per_grid) + print("threads_per_block[0]:", threads_per_block[0]) + + var control_bits_list: List[List[List[Int]]] = [ + [[1, 1]], # Control on qubit 1 and is control because flag=1 + [[1, 1]], # 
Control on qubit 1 and is control because flag=1 + ] + + ctx = DeviceContext() + print("Using GPU:", ctx.name()) + print("ctx.device_info:", ctx.device_info) print( - ( - "After Pauli-X gate on qubit 2 with control on qubit 1" - " (Final State):\nreal part:\n" - ), - host_re, - "\nhost_rere[3]:", - host_re[4], - "\nhost_rere[4]:", - host_re[5], - "\nimaginary part:\n", - host_im, + "ctx.device_info.max_thread_block_size:", + ctx.device_info.max_thread_block_size, + ) + print( + "ctx.device_info.max_blocks_per_multiprocessor:", + ctx.device_info.max_blocks_per_multiprocessor, + ) + try: + (free, total) = ctx.get_memory_info() + print("Free memory:", free / (1024 * 1024), "MB") + print("Total memory:", total / (1024 * 1024), "MB") + except: + print("Failed to get memory information") + + # -- Create GPU variables -- # + # These don't need to be initialized to zero, they will be filled later + + host_quantum_state_re = ctx.enqueue_create_host_buffer[dtype]( + state_vector_size ) + host_quantum_state_im = ctx.enqueue_create_host_buffer[dtype]( + state_vector_size + ) + + host_gate_set_re = ctx.enqueue_create_host_buffer[dtype]( + gate_set_size * GATE_SIZE * GATE_SIZE + ) + host_gate_set_im = ctx.enqueue_create_host_buffer[dtype]( + gate_set_size * GATE_SIZE * GATE_SIZE + ) + + host_control_bits_circuit = ctx.enqueue_create_host_buffer[DType.int32]( + circuit_number_control_gates * NUMBER_CONTROL_BITS * 2 + ) + + # -- Initialize the quantum circuit to the |000⟩ state -- # + quantum_state: StateVector = StateVector.from_bitstring( + "0" * num_qubits + ) + # print("Initial quantum state:\n", quantum_state) + + # Wait for host buffers to be ready + ctx.synchronize() + + # -- Fill host buffers -- # + for i in range(state_vector_size): - quantum_state[i] = ComplexFloat32(host_re[i], host_im[i]) + host_quantum_state_re[i] = quantum_state[i].re + host_quantum_state_im[i] = quantum_state[i].im + + print("Initial state real part:", host_quantum_state_re) + print("Initial state 
imaginary part:", host_quantum_state_im) + + for i in range(gate_set_size): + gate = gate_set[i] + for j in range(GATE_SIZE): + for k in range(GATE_SIZE): + index = gate_set_1qubit_layout( + IntTuple(i, j, k) + ) # Get the index in the 1D buffer + host_gate_set_re[index] = gate[j, k].re + host_gate_set_im[index] = gate[j, k].im + + for i in range(circuit_number_control_gates): + for j in range(NUMBER_CONTROL_BITS): + for k in range(2): + index = circuit_control_bits_layout(IntTuple(i, j, k)) + host_control_bits_circuit[index] = control_bits_list[i][j][ + k + ] + + # -- Copy host buffers to device buffers -- # + quantum_state_re = ctx.enqueue_create_buffer[dtype](state_vector_size) + quantum_state_im = ctx.enqueue_create_buffer[dtype](state_vector_size) + + gate_set_re = ctx.enqueue_create_buffer[dtype]( + gate_set_size * GATE_SIZE * GATE_SIZE + ) + gate_set_im = ctx.enqueue_create_buffer[dtype]( + gate_set_size * GATE_SIZE * GATE_SIZE + ) + + control_bits_circuit = ctx.enqueue_create_buffer[DType.int32]( + circuit_number_control_gates * NUMBER_CONTROL_BITS * 2 + ) + current_control_gate_circuit = ctx.enqueue_create_buffer[DType.int32](1) + + # Create other buffers for functions + + quantum_state_out_re = ctx.enqueue_create_buffer[dtype]( + state_vector_size + ) + quantum_state_out_im = ctx.enqueue_create_buffer[dtype]( + state_vector_size + ) + + quantum_state_re.enqueue_copy_from(host_quantum_state_re) + quantum_state_im.enqueue_copy_from(host_quantum_state_im) + + gate_set_re.enqueue_copy_from(host_gate_set_re) + gate_set_im.enqueue_copy_from(host_gate_set_im) + + control_bits_circuit.enqueue_copy_from(host_control_bits_circuit) + + ctx.enqueue_memset(current_control_gate_circuit, 0) + ctx.enqueue_memset(quantum_state_out_re, 0.0) + ctx.enqueue_memset(quantum_state_out_im, 0.0) + + # -- Create layout tensors for GPU operations -- # + gate_set_re_tensor = LayoutTensor[ + mut=False, dtype, gate_set_1qubit_layout + ](gate_set_re.unsafe_ptr()) + gate_set_im_tensor = 
LayoutTensor[ + mut=False, dtype, gate_set_1qubit_layout + ](gate_set_im.unsafe_ptr()) + + quantum_state_re_tensor = LayoutTensor[ + mut=True, dtype, state_vector_layout + ](quantum_state_re.unsafe_ptr()) + quantum_state_im_tensor = LayoutTensor[ + mut=True, dtype, state_vector_layout + ](quantum_state_im.unsafe_ptr()) + + quantum_state_out_re_tensor = LayoutTensor[ + mut=True, dtype, state_vector_layout + ](quantum_state_out_re.unsafe_ptr()) + quantum_state_out_im_tensor = LayoutTensor[ + mut=True, dtype, state_vector_layout + ](quantum_state_out_im.unsafe_ptr()) + + control_bits_circuit_tensor = LayoutTensor[ + mut=False, DType.int32, circuit_control_bits_layout + ](control_bits_circuit.unsafe_ptr()) + current_control_gate_circuit_tensor = LayoutTensor[ + mut=True, DType.int32, Layout.row_major(1) + ](current_control_gate_circuit.unsafe_ptr()) + + # -- Apply circuit operations -- # + + # Gate 0 + # quantum_state = qubit_wise_multiply_gpu( + # Hadamard.matrix, 1, quantum_state + # ) + ctx.enqueue_function[ + qubit_wise_multiply_inplace_gpu[number_control_bits=0] + ]( + gate_set_re_tensor, + gate_set_im_tensor, + gate_set_dic[Hadamard.symbol], + GATE_SIZE, + 1, # target_qubit + quantum_state_re_tensor, + quantum_state_im_tensor, + num_qubits, # number_qubits + state_vector_size, # quantum_state_size + quantum_state_out_re_tensor, + quantum_state_out_im_tensor, + control_bits_circuit_tensor, + current_control_gate_circuit_tensor, + grid_dim=blocks_per_grid, + block_dim=threads_per_block, + ) + + # # It works + # with quantum_state_out_re.map_to_host() as host_re, quantum_state_out_im.map_to_host() as host_im: + # print( + # "After Hadamard gate on qubit 1\nreal part:\n", + # host_re, + # "\nimaginary part:\n", + # host_im, + # ) + + # Gate 1 (reverse the states input <-> output) + # quantum_state = qubit_wise_multiply(PauliX.matrix, 2, quantum_state) + ctx.enqueue_function[ + qubit_wise_multiply_inplace_gpu[number_control_bits=0] + ]( + gate_set_re_tensor, + 
gate_set_im_tensor, + gate_set_dic[PauliX.symbol], + GATE_SIZE, + 2, # target_qubit + quantum_state_out_re_tensor, + quantum_state_out_im_tensor, + num_qubits, # number_qubits + state_vector_size, # quantum_state_size + quantum_state_re_tensor, + quantum_state_im_tensor, + control_bits_circuit_tensor, + current_control_gate_circuit_tensor, + grid_dim=blocks_per_grid, + block_dim=threads_per_block, + ) + + # with quantum_state_re.map_to_host() as host_re, quantum_state_im.map_to_host() as host_im: + # print( + # "After Pauli-X gate on qubit 2:", + # "\nreal part:\n", + # host_re, + # "\nimaginary part:\n", + # host_im, + # ) + + # # Gate 2 + # quantum_state = qubit_wise_multiply( + # PauliX.matrix, 0, quantum_state, [[1, 1]] + # ) + ctx.enqueue_function[ + qubit_wise_multiply_inplace_gpu[number_control_bits=1] + ]( + gate_set_re_tensor, + gate_set_im_tensor, + gate_set_dic[PauliX.symbol], + GATE_SIZE, + 0, # target_qubit + quantum_state_re_tensor, + quantum_state_im_tensor, + num_qubits, # number_qubits + state_vector_size, # quantum_state_size + quantum_state_out_re_tensor, + quantum_state_out_im_tensor, + control_bits_circuit_tensor, + current_control_gate_circuit_tensor, + grid_dim=blocks_per_grid, + block_dim=threads_per_block, + ) - assert_state_vector_almost_equal( - quantum_state, test_qubit_wise_multiply_figure1_reference - ) + # with quantum_state_out_re.map_to_host() as host_re, quantum_state_out_im.map_to_host() as host_im: + # print( + # "After Pauli-X gate on qubit 0 with control on qubit 1:", + # "\nreal part:\n", + # host_re, + # "\nimaginary part:\n", + # host_im, + # ) + + # Gate 3 + # quantum_state = qubit_wise_multiply(PauliZ.matrix, 0, quantum_state) + ctx.enqueue_function[ + qubit_wise_multiply_inplace_gpu[number_control_bits=0] + ]( + gate_set_re_tensor, + gate_set_im_tensor, + gate_set_dic[PauliZ.symbol], + GATE_SIZE, + 0, # target_qubit + quantum_state_out_re_tensor, + quantum_state_out_im_tensor, + num_qubits, # number_qubits + 
state_vector_size, # quantum_state_size + quantum_state_re_tensor, + quantum_state_im_tensor, + control_bits_circuit_tensor, + current_control_gate_circuit_tensor, + grid_dim=blocks_per_grid, + block_dim=threads_per_block, + ) + + # with quantum_state_re.map_to_host() as host_re, quantum_state_im.map_to_host() as host_im: + # print( + # "After Pauli-Z gate on qubit 0:\nreal part:\n", + # host_re, + # "\nimaginary part:\n", + # host_im, + # ) + + # Gate 4 + # quantum_state = qubit_wise_multiply( + # PauliX.matrix, 2, quantum_state, [[1, 1]] + # ) + ctx.enqueue_function[ + qubit_wise_multiply_inplace_gpu[number_control_bits=1] + ]( + gate_set_re_tensor, + gate_set_im_tensor, + gate_set_dic[PauliX.symbol], + GATE_SIZE, + 2, # target_qubit + quantum_state_re_tensor, + quantum_state_im_tensor, + num_qubits, # number_qubits + state_vector_size, # quantum_state_size + quantum_state_out_re_tensor, + quantum_state_out_im_tensor, + control_bits_circuit_tensor, + current_control_gate_circuit_tensor, + grid_dim=blocks_per_grid, + block_dim=threads_per_block, + ) + + with quantum_state_out_re.map_to_host() as host_re, quantum_state_out_im.map_to_host() as host_im: + print( + ( + "After Pauli-X gate on qubit 2 with control on qubit 1" + " (Final State):\nreal part:\n" + ), + host_re, + "\nhost_rere[3]:", + host_re[4], + "\nhost_rere[4]:", + host_re[5], + "\nimaginary part:\n", + host_im, + ) + for i in range(state_vector_size): + quantum_state[i] = ComplexFloat32(host_re[i], host_im[i]) + + assert_state_vector_almost_equal( + quantum_state, test_qubit_wise_multiply_figure1_reference + )