diff --git a/.github/containers/x86_64-tutorial/Dockerfile b/.github/containers/x86_64-tutorial/Dockerfile new file mode 100644 index 00000000..c7092954 --- /dev/null +++ b/.github/containers/x86_64-tutorial/Dockerfile @@ -0,0 +1,134 @@ +# ┌──────────────┐ +# │ BUILDER │ +# └──────────────┘ + +FROM fluxrm/flux-core:latest AS builder +USER root +SHELL ["/bin/bash", "-lc"] + +ARG num_jobs +ARG hypre_version + +# Install MariaDB + tools +# Install extra packages (e.g., git, python3-pip) +RUN apt-get update && \ + apt-get install -y lsb-release gnupg software-properties-common \ + git python3 python3-pip mariadb-server mariadb-client \ + curl \ + rabbitmq-server supervisor \ + python3-venv \ + nlohmann-json3-dev \ + build-essential cmake \ + libopenmpi-dev libmetis-dev libhypre-dev \ + libblas-dev liblapack-dev \ + libhdf5-dev hdf5-tools && \ + apt-get clean && \ + rm -rf /var/lib/apt/lists/* + +RUN mkdir -p /run/mysqld \ + && chown mysql:mysql /run/mysqld + +RUN rabbitmq-plugins enable --offline rabbitmq_management +WORKDIR /opt/archives +#HYPRE +RUN curl -L https://github.com/hypre-space/hypre/archive/refs/tags/v${hypre_version}.tar.gz > /opt/archives/hypre-v${hypre_version}.tar.gz && \ + tar xzf hypre-v${hypre_version}.tar.gz && cd hypre-${hypre_version}/src && \ + ./configure --prefix /usr/local --enable-shared --disable-static && \ + make -j ${num_jobs} && make install && \ + rm -rf hypre-v${hypre_version}* + +#MFEM +RUN git clone --depth=1 https://github.com/mfem/mfem.git /opt/archives/mfem && \ + mkdir -p /opt/archives/mfem/build && cd /opt/archives/mfem/build/ && \ + cmake -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=/usr/local -DMFEM_USE_MPI=ON -DMFEM_USE_METIS_5=ON -DMFEM_USE_HYPRE=ON .. && \ + make -j"$(nproc)" && \ + make install + +#CALIPER +RUN cd /opt/archives/ && \ + git clone --depth 1 https://github.com/LLNL/Caliper.git && \ + cd Caliper && \ + mkdir build && cd build && \ + cmake -DCMAKE_INSTALL_PREFIX=/usr/local/ .. 
&& \ + make && \ + make install + +#AMQCPP +RUN cd /opt/archives/ && \ + git clone --depth=1 https://github.com/CopernicaMarketingSoftware/AMQP-CPP.git && \ + mkdir -p /opt/archives/AMQP-CPP/build/ && \ + cd /opt/archives/AMQP-CPP/build/ && \ + cmake -DCMAKE_CXX_FLAGS="-D_GLIBCXX_USE_CXX11_ABI=0" -DAMQP-CPP_LINUX_TCP=ON -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=/usr/local -DCMAKE_POSITION_INDEPENDENT_CODE=ON -DAMQP-CPP_BUILD_SHARED=On .. && \ + make && \ + make install + +RUN python3 -m venv /app/venv --system-site-packages +ENV PATH="/app/venv/bin:$PATH" +RUN source /app/venv/bin/activate && \ + python3 -m pip install --no-cache-dir torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu && \ + python -m pip install --no-cache-dir h5py + +ENV AMS_MFEM_PATH=/usr/local/ +ENV AMS_TORCH_PATH=/app/venv/lib/python3.11/site-packages/torch/share/cmake/Torch + +RUN cd /opt/archives/ && \ + source /app/venv/bin/activate && \ + git clone --depth 1 https://github.com/LLNL/AMS.git -b features/tutorial && \ + mkdir -p /opt/archives/AMS/build/ && \ + cd /opt/archives/AMS/build/ && \ + cmake \ + -DWITH_RMQ=On \ + -DBUILD_SHARED_LIBS=On \ + -DWITH_CALIPER=On \ + -DWITH_HDF5=On \ + -DWITH_AMS_DEBUG=On \ + -DCMAKE_INSTALL_RPATH_USE_LINK_PATH=On \ + -DCMAKE_INSTALL_PREFIX=/usr/local \ + -DCMAKE_BUILD_TYPE=Release \ + -DWITH_CUDA=Off \ + -DMFEM_DIR=$AMS_MFEM_PATH \ + -DWITH_MPI=On \ + -DWITH_TORCH=On \ + -DTorch_DIR=$AMS_TORCH_PATH \ + -DWITH_WORKFLOW=On \ + ../ && \ + make && \ + make install + +# ┌──────────────┐ +# │ RUNTIME │ +# └──────────────┘ +FROM fluxrm/flux-core:latest AS runtime +USER root +SHELL ["/bin/bash", "-lc"] + +# 1) Install _only_ runtime deps +RUN apt-get update && \ + apt-get install -y lsb-release gnupg software-properties-common \ + git python3 python3-pip mariadb-server mariadb-client \ + curl \ + rabbitmq-server supervisor \ + python3-venv \ + nlohmann-json3-dev \ + build-essential cmake \ + libopenmpi-dev libmetis-dev 
libhypre-dev \ + libblas-dev liblapack-dev \ + libhdf5-dev hdf5-tools && \ + apt-get clean && \ + rm -rf /var/lib/apt/lists/* + +# 2) Copy in your built artifacts +COPY --from=builder /usr/local /usr/local +COPY --from=builder /app/venv /app/venv + +# 3) Copy configs & entrypoint +COPY entrypoint.sh /usr/local/bin/entrypoint.sh +COPY rabbitmq.conf /etc/rabbitmq/rabbitmq.conf +COPY supervisord.conf /etc/supervisor/conf.d/supervisord.conf +ENV PATH="/app/venv/bin:$PATH" + +RUN chmod +x /usr/local/bin/entrypoint.sh + +ENTRYPOINT ["/usr/local/bin/entrypoint.sh"] +CMD ["supervisord", "-n", "-c", "/etc/supervisor/conf.d/supervisord.conf"] + diff --git a/.github/containers/x86_64-tutorial/README.md b/.github/containers/x86_64-tutorial/README.md new file mode 100644 index 00000000..2400dbe7 --- /dev/null +++ b/.github/containers/x86_64-tutorial/README.md @@ -0,0 +1,13 @@ +# Container with all necessary AMS dependencies installed and with AMS + +The container runs an RMQ and a MariaDB server on startup. To build, issue: + +```bash +docker build -t ams-tutorial:latest . --build-arg hypre_version=2.33.0 +``` + +To run, issue: +```bash +docker run --rm -it -v "$(pwd)":/workspace -w /workspace ams-tutorial:latest bash +``` + diff --git a/.github/containers/x86_64-tutorial/entrypoint.sh b/.github/containers/x86_64-tutorial/entrypoint.sh new file mode 100644 index 00000000..83cea3a3 --- /dev/null +++ b/.github/containers/x86_64-tutorial/entrypoint.sh @@ -0,0 +1,37 @@ +#!/usr/bin/env bash +set -e + +# 1) ensure the socket directory exists and is owned by mysql +mkdir -p /run/mysqld +chown mysql:mysql /run/mysqld + +# 2) start MariaDB directly +# this will background itself (via mysqld_safe) +exec /usr/bin/mysqld_safe --datadir=/var/lib/mysql & + +# 3) wait until it's up +while ! mysqladmin ping -uroot --silent; do + sleep 1 +done +echo "MariaDB is up!" 
+ +: "${MYSQL_ROOT_PASSWORD:=root}" # default, if not passed-in +mysql -uroot <<-EOSQL + ALTER USER 'root'@'localhost' IDENTIFIED BY '${MYSQL_ROOT_PASSWORD}'; + FLUSH PRIVILEGES; +EOSQL +echo "Root password set to '${MYSQL_ROOT_PASSWORD}'" + +# 4) start RabbitMQ in detached mode +rabbitmq-server -detached + +# 5) Load the python venv +source /app/venv/bin/activate + +# 6) drop into a shell (or run passed-in command) +if [ $# -gt 0 ]; then + exec "$@" +else + exec bash +fi + diff --git a/.github/containers/x86_64-tutorial/init.sql b/.github/containers/x86_64-tutorial/init.sql new file mode 100644 index 00000000..8401db2c --- /dev/null +++ b/.github/containers/x86_64-tutorial/init.sql @@ -0,0 +1,6 @@ +ALTER USER 'root'@'localhost' IDENTIFIED BY 'secret'; +CREATE DATABASE IF NOT EXISTS testdb; +CREATE USER IF NOT EXISTS 'testuser'@'localhost' IDENTIFIED BY 'testpass'; +GRANT ALL PRIVILEGES ON testdb.* TO 'testuser'@'localhost'; +FLUSH PRIVILEGES; + diff --git a/.github/containers/x86_64-tutorial/rabbitmq.conf b/.github/containers/x86_64-tutorial/rabbitmq.conf new file mode 100644 index 00000000..3d5e1f4f --- /dev/null +++ b/.github/containers/x86_64-tutorial/rabbitmq.conf @@ -0,0 +1,5 @@ +# rabbitmq.conf +listeners.tcp.default = 127.0.0.1:5672 +management.listener.ip = 127.0.0.1 +management.listener.port = 15672 + diff --git a/.github/containers/x86_64-tutorial/supervisord.conf b/.github/containers/x86_64-tutorial/supervisord.conf new file mode 100644 index 00000000..c6c24358 --- /dev/null +++ b/.github/containers/x86_64-tutorial/supervisord.conf @@ -0,0 +1,22 @@ +[supervisord] +nodaemon=true +logfile=/var/log/supervisord.log + +[program:mariadb] +; run MariaDB as the mysql user so mysqld won’t abort +user = mysql +command = /usr/bin/mysqld_safe --datadir=/var/lib/mysql +stdout_logfile = /dev/stdout +stderr_logfile = /dev/stderr +priority = 10 +autorestart = true + +[program:rabbitmq] +; run RabbitMQ as the rabbitmq user +user = rabbitmq +command = 
/usr/sbin/rabbitmq-server +stdout_logfile = /dev/stdout +stderr_logfile = /dev/stderr +priority = 20 +autorestart = true + diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index c868adb0..9c099418 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -13,10 +13,15 @@ jobs: runs-on: ubuntu-latest container: ghcr.io/llnl/ams-ci-almalinux8:latest - # Steps represent a sequence of tasks that will be executed as part of the job + strategy: + matrix: + with_hdf5: [ON, OFF] + with_caliper: [ON, OFF] + with_mpi : [ON, OFF] + steps: - uses: actions/checkout@v3 - - name: Build Torch=On FAISS=On HDF5=On AMS + - name: Build WITH_HDF5=${{ matrix.with_hdf5 }}, WITH_CALIPER=${{ matrix.with_caliper }} WITH_MPI=${{ matrix.with_mpi }} shell: bash -l {0} run: | module load gcc/11.2.1 @@ -32,105 +37,46 @@ jobs: export AMS_HDF5_PATH=$(spack location -i hdf5) cmake \ -DBUILD_SHARED_LIBS=On \ - -DCMAKE_PREFIX_PATH=$INSTALL_DIR \ - -DWITH_CALIPER=On \ - -DWITH_HDF5=On \ - -DWITH_EXAMPLES=On \ - -DAMS_HDF5_DIR=$AMS_HDF5_PATH \ -DCMAKE_INSTALL_PREFIX=./install \ + -DWITH_CALIPER=${{ matrix.with_caliper }} \ + -DWITH_HDF5=${{ matrix.with_hdf5 }} \ + -DAMS_HDF5_DIR=$AMS_HDF5_PATH \ -DCMAKE_BUILD_TYPE=Release \ -DWITH_CUDA=Off \ - -DUMPIRE_DIR=$AMS_UMPIRE_PATH \ - -DMFEM_DIR=$AMS_MFEM_PATH \ - -DWITH_FAISS=On \ - -DWITH_MPI=On \ - -DWITH_TORCH=On \ + -DWITH_MPI=${{ matrix.with_mpi }} \ -DWITH_TESTS=On \ -DTorch_DIR=$AMS_TORCH_PATH \ - -DFAISS_DIR=$AMS_FAISS_PATH \ -DWITH_AMS_DEBUG=On \ - -DWITH_WORKFLOW=On \ - -DWITH_ADIAK=On \ + -DWITH_WORKFLOW=Off \ $GITHUB_WORKSPACE - make - - name: Run tests Torch=On FAISS=On HDF5=On AMS + make -j$(nproc) + - name: Run Tests WITH_HDF5=${{ matrix.with_hdf5 }}, WITH_CALIPER=${{ matrix.with_caliper }} WITH_MPI=${{ matrix.with_mpi }} run: | cd build source /spack/share/spack/setup-env.sh spack env activate -p /ams-spack-env env CTEST_OUTPUT_ON_FAILURE=1 make test - - name: Build Torch=On FAISS=On HDF5=On MPI=Off AMS - shell: 
bash -l {0} + + - name: Install AMS WITH_HDF5=${{ matrix.with_hdf5 }}, WITH_CALIPER=${{ matrix.with_caliper }} WITH_MPI=${{ matrix.with_mpi }} run: | - module load gcc/11.2.1 - source /spack/share/spack/setup-env.sh - spack env activate -p /ams-spack-env - rm -rf build/ - mkdir build cd build - export AMS_MFEM_PATH=$(spack location -i mfem) - export AMS_TORCH_PATH=$(spack location -i py-torch)/lib/python3.10/site-packages/torch/share/cmake/Torch - export AMS_FAISS_PATH=$(spack location -i faiss) - export AMS_UMPIRE_PATH=$(spack location -i umpire) - export AMS_HDF5_PATH=$(spack location -i hdf5) - cmake \ - -DBUILD_SHARED_LIBS=On \ - -DCMAKE_PREFIX_PATH=$INSTALL_DIR \ - -DWITH_CALIPER=On \ - -DWITH_HDF5=On \ - -DWITH_EXAMPLES=On \ - -DAMS_HDF5_DIR=$AMS_HDF5_PATH \ - -DCMAKE_INSTALL_PREFIX=./install \ - -DCMAKE_BUILD_TYPE=Release \ - -DWITH_CUDA=Off \ - -DUMPIRE_DIR=$AMS_UMPIRE_PATH \ - -DMFEM_DIR=$AMS_MFEM_PATH \ - -DWITH_FAISS=On \ - -DWITH_MPI=Off \ - -DWITH_TORCH=On \ - -DWITH_TESTS=On \ - -DTorch_DIR=$AMS_TORCH_PATH \ - -DFAISS_DIR=$AMS_FAISS_PATH \ - -DWITH_AMS_DEBUG=On \ - -DWITH_WORKFLOW=On \ - -DWITH_ADIAK=On \ - $GITHUB_WORKSPACE - make - - name: Build CALIPER=Off Torch=Off FAISS=On HDF5=On AMS - shell: bash -l {0} - run: | - module load gcc/11.2.1 source /spack/share/spack/setup-env.sh spack env activate -p /ams-spack-env - rm -rf build/ - mkdir build - cd build - export AMS_MFEM_PATH=$(spack location -i mfem) - export AMS_FAISS_PATH=$(spack location -i faiss) - export AMS_UMPIRE_PATH=$(spack location -i umpire) - export AMS_HDF5_PATH=$(spack location -i hdf5) - cmake \ - -DBUILD_SHARED_LIBS=On \ - -DCMAKE_PREFIX_PATH=$INSTALL_DIR \ - -DWITH_CALIPER=Off \ - -DWITH_HDF5=On \ - -DWITH_EXAMPLES=On \ - -DAMS_HDF5_DIR=$AMS_HDF5_PATH \ - -DCMAKE_INSTALL_PREFIX=./install \ - -DCMAKE_BUILD_TYPE=Release \ - -DWITH_CUDA=Off \ - -DUMPIRE_DIR=$AMS_UMPIRE_PATH \ - -DMFEM_DIR=$AMS_MFEM_PATH \ - -DWITH_FAISS=On \ - -DWITH_MPI=On \ - -DWITH_TORCH=Off \ - -DWITH_TESTS=On 
\ - -DFAISS_DIR=$AMS_FAISS_PATH \ - -DWITH_AMS_DEBUG=On \ - -DWITH_WORKFLOW=On \ - $GITHUB_WORKSPACE - make - - name: Build Torch=Off FAISS=On HDF5=On AMS + make -j$(nproc) install + + build-cuda-tests: + # The type of runner that the job will run on + runs-on: ubuntu-latest + container: ghcr.io/llnl/ams-ci-cuda11.6.1:latest + + strategy: + matrix: + with_hdf5: [ON, OFF] + with_caliper: [ON, OFF] + with_mpi : [ON, OFF] + steps: + - uses: actions/checkout@v3 + - name: Build WITH_CUDA=On WITH_HDF5=${{ matrix.with_hdf5 }}, WITH_CALIPER=${{ matrix.with_caliper }} WITH_MPI=${{ matrix.with_mpi }} shell: bash -l {0} run: | module load gcc/11.2.1 @@ -140,38 +86,44 @@ jobs: mkdir build cd build export AMS_MFEM_PATH=$(spack location -i mfem) + export AMS_TORCH_PATH=$(spack location -i py-torch)/lib/python3.10/site-packages/torch/share/cmake/Torch export AMS_FAISS_PATH=$(spack location -i faiss) export AMS_UMPIRE_PATH=$(spack location -i umpire) export AMS_HDF5_PATH=$(spack location -i hdf5) cmake \ -DBUILD_SHARED_LIBS=On \ - -DCMAKE_PREFIX_PATH=$INSTALL_DIR \ - -DWITH_CALIPER=On \ - -DWITH_HDF5=On \ - -DWITH_EXAMPLES=On \ - -DAMS_HDF5_DIR=$AMS_HDF5_PATH \ -DCMAKE_INSTALL_PREFIX=./install \ + -DWITH_CALIPER=${{ matrix.with_caliper }} \ + -DWITH_HDF5=${{ matrix.with_hdf5 }} \ + -DAMS_HDF5_DIR=$AMS_HDF5_PATH \ -DCMAKE_BUILD_TYPE=Release \ - -DWITH_CUDA=Off \ - -DUMPIRE_DIR=$AMS_UMPIRE_PATH \ - -DMFEM_DIR=$AMS_MFEM_PATH \ - -DWITH_FAISS=On \ - -DWITH_MPI=On \ - -DWITH_TORCH=Off \ + -DWITH_CUDA=On \ + -DWITH_MPI=${{ matrix.with_mpi }} \ -DWITH_TESTS=On \ - -DFAISS_DIR=$AMS_FAISS_PATH \ + -DTorch_DIR=$AMS_TORCH_PATH \ -DWITH_AMS_DEBUG=On \ - -DWITH_WORKFLOW=On \ - -DWITH_ADIAK=On \ + -DWITH_WORKFLOW=Off \ $GITHUB_WORKSPACE - make - - name: Run tests Torch=Off FAISS=On HDF5=On AMS + make -j$(nproc) + - name: Install AMS WITH_CUDA=On WITH_HDF5=${{ matrix.with_hdf5 }}, WITH_CALIPER=${{ matrix.with_caliper }} WITH_MPI=${{ matrix.with_mpi }} run: | cd build source 
/spack/share/spack/setup-env.sh spack env activate -p /ams-spack-env - env CTEST_OUTPUT_ON_FAILURE=1 make test - - name: Build Torch=Off FAISS=Off HDF5=On AMS + make -j$(nproc) install + + + install-link-cpu-tests: + # The type of runner that the job will run on + runs-on: ubuntu-latest + container: ghcr.io/llnl/ams-ci-almalinux8:latest + + strategy: + matrix: + with_shared: [ON, OFF] + steps: + - uses: actions/checkout@v3 + - name: Build SharedLibray=${{ matrix.with_shared }} shell: bash -l {0} run: | module load gcc/11.2.1 @@ -180,217 +132,116 @@ jobs: rm -rf build/ mkdir build cd build + export AMS_INSTALL_PATH=$(pwd)/install + echo "AMS_INSTALL_PATH=${AMS_INSTALL_PATH}" export AMS_MFEM_PATH=$(spack location -i mfem) + export AMS_TORCH_PATH=$(spack location -i py-torch)/lib/python3.10/site-packages/torch/share/cmake/Torch + export AMS_FAISS_PATH=$(spack location -i faiss) export AMS_UMPIRE_PATH=$(spack location -i umpire) export AMS_HDF5_PATH=$(spack location -i hdf5) + export AMS_AMQPCPP_PATH=$(spack location -i amqp-cpp)/cmake cmake \ - -DBUILD_SHARED_LIBS=On \ - -DCMAKE_PREFIX_PATH=$INSTALL_DIR \ + -DBUILD_SHARED_LIBS=${{ matrix.with_shared }} \ + -DCMAKE_INSTALL_RPATH_USE_LINK_PATH=${{ matrix.with_shared }} \ + -DCMAKE_INSTALL_PREFIX=$AMS_INSTALL_PATH \ + -DWITH_RMQ=On \ + -Damqpcpp_DIR=$AMS_AMQPCPP_PATH \ -DWITH_CALIPER=On \ - -DWITH_EXAMPLES=On \ - -DAMS_HDF5_DIR=$AMS_HDF5_PATH \ -DWITH_HDF5=On \ - -DCMAKE_INSTALL_PREFIX=./install \ + -DAMS_HDF5_DIR=$AMS_HDF5_PATH \ -DCMAKE_BUILD_TYPE=Release \ -DWITH_CUDA=Off \ - -DUMPIRE_DIR=$AMS_UMPIRE_PATH \ - -DMFEM_DIR=$AMS_MFEM_PATH \ - -DWITH_FAISS=Off \ -DWITH_MPI=On \ - -DWITH_TORCH=Off \ -DWITH_TESTS=On \ + -DTorch_DIR=$AMS_TORCH_PATH \ -DWITH_AMS_DEBUG=On \ - -DWITH_WORKFLOW=On \ - -DWITH_ADIAK=On \ + -DWITH_WORKFLOW=Off \ $GITHUB_WORKSPACE - make - - name: Run tests Torch=Off FAISS=Off HDF5=On AMS - run: | - cd build - source /spack/share/spack/setup-env.sh - spack env activate -p /ams-spack-env - make test - 
- name: Build Torch=Off FAISS=Off HDF5=Off AMS + make -j$(nproc) + make -j$(nproc) install + echo "AMS_INSTALL_PATH=${AMS_INSTALL_PATH}" + echo "AMS_INSTALL_PATH=${AMS_INSTALL_PATH}" >> $GITHUB_ENV + - name: Install IdealGas SharedLibray=${{ matrix.with_shared }} shell: bash -l {0} run: | module load gcc/11.2.1 source /spack/share/spack/setup-env.sh spack env activate -p /ams-spack-env - rm -rf build/ - mkdir build - cd build + mkdir build_idealgas/ + cd build_idealgas/ export AMS_MFEM_PATH=$(spack location -i mfem) - export AMS_UMPIRE_PATH=$(spack location -i umpire) - cmake \ - -DBUILD_SHARED_LIBS=On \ - -DCMAKE_PREFIX_PATH=$INSTALL_DIR \ - -DWITH_CALIPER=On \ - -DWITH_EXAMPLES=On \ - -DCMAKE_INSTALL_PREFIX=./install \ - -DCMAKE_BUILD_TYPE=Release \ - -DWITH_CUDA=Off \ - -DUMPIRE_DIR=$AMS_UMPIRE_PATH \ - -DMFEM_DIR=$AMS_MFEM_PATH \ - -DWITH_FAISS=Off \ - -DWITH_MPI=On \ - -DWITH_TORCH=Off \ - -DWITH_TESTS=On \ - -DWITH_AMS_DEBUG=On \ - -DWITH_WORKFLOW=On \ - -DWITH_ADIAK=Off \ - $GITHUB_WORKSPACE - make - - name: Run tests Torch=Off FAISS=Off HDF5=Off AMS - run: | - cd build - source /spack/share/spack/setup-env.sh - spack env activate -p /ams-spack-env - make test + echo "AMS_INSTALL_PATH=${AMS_INSTALL_PATH}" + cmake -DWITH_CUDA=Off -DAMS_DIR=${AMS_INSTALL_PATH}/lib64/cmake/AMS -DMFEM_DIR=${AMS_MFEM_PATH} $GITHUB_WORKSPACE/examples/ideal_gas/ + make -j$(nproc) VERBOSE=1 - build-cuda-tests: + install-link-cuda-tests: # The type of runner that the job will run on runs-on: ubuntu-latest container: ghcr.io/llnl/ams-ci-cuda11.6.1:latest - # Steps represent a sequence of tasks that will be executed as part of the job + strategy: + matrix: + with_shared: [ON, OFF] steps: - uses: actions/checkout@v3 - - name: Build Torch=On FAISS=On HDF5=On AMS + - name: Build SharedLibray=${{ matrix.with_shared }} shell: bash -l {0} run: | + module load gcc/11.2.1 source /spack/share/spack/setup-env.sh spack env activate -p /ams-spack-env rm -rf build/ mkdir build cd build + export 
AMS_INSTALL_PATH=$(pwd)/install + echo "AMS_INSTALL_PATH=${AMS_INSTALL_PATH}" export AMS_MFEM_PATH=$(spack location -i mfem) export AMS_TORCH_PATH=$(spack location -i py-torch)/lib/python3.10/site-packages/torch/share/cmake/Torch export AMS_FAISS_PATH=$(spack location -i faiss) export AMS_UMPIRE_PATH=$(spack location -i umpire) export AMS_HDF5_PATH=$(spack location -i hdf5) + echo "Current directory is: $(pwd)" cmake \ - -DBUILD_SHARED_LIBS=On \ - -DCMAKE_PREFIX_PATH=$INSTALL_DIR \ + -DBUILD_SHARED_LIBS=${{ matrix.with_shared }} \ + -DCMAKE_INSTALL_RPATH_USE_LINK_PATH=${{ matrix.with_shared }} \ + -DCMAKE_INSTALL_PREFIX=$AMS_INSTALL_PATH \ -DWITH_CALIPER=On \ -DWITH_HDF5=On \ - -DWITH_EXAMPLES=On \ -DAMS_HDF5_DIR=$AMS_HDF5_PATH \ - -DCMAKE_INSTALL_PREFIX=./install \ -DCMAKE_BUILD_TYPE=Release \ - -DCUDA_ARCH=70 \ -DWITH_CUDA=On \ - -DUMPIRE_DIR=$AMS_UMPIRE_PATH \ - -DMFEM_DIR=$AMS_MFEM_PATH \ - -DWITH_FAISS=On \ -DWITH_MPI=On \ - -DWITH_TORCH=On \ -DWITH_TESTS=On \ -DTorch_DIR=$AMS_TORCH_PATH \ - -DFAISS_DIR=$AMS_FAISS_PATH \ -DWITH_AMS_DEBUG=On \ - -DWITH_WORKFLOW=On \ - -DWITH_ADIAK=On \ + -DWITH_WORKFLOW=Off \ $GITHUB_WORKSPACE - make - - name: Build Torch=Off FAISS=On HDF5=On AMS + make -j$(nproc) + make -j$(nproc) install + echo "AMS_INSTALL_PATH=${AMS_INSTALL_PATH}" >> $GITHUB_ENV + ls ${AMS_INSTALL_PATH} + - name: Install IdealGas SharedLibray=${{ matrix.with_shared }} shell: bash -l {0} run: | + module load gcc/11.2.1 source /spack/share/spack/setup-env.sh spack env activate -p /ams-spack-env - rm -rf build/ - mkdir build - cd build + mkdir build_idealgas/ + cd build_idealgas/ export AMS_MFEM_PATH=$(spack location -i mfem) - export AMS_FAISS_PATH=$(spack location -i faiss) - export AMS_UMPIRE_PATH=$(spack location -i umpire) - export AMS_HDF5_PATH=$(spack location -i hdf5) - cmake \ - -DBUILD_SHARED_LIBS=On \ - -DCMAKE_PREFIX_PATH=$INSTALL_DIR \ - -DWITH_CALIPER=On \ - -DWITH_HDF5=On \ - -DWITH_EXAMPLES=On \ - -DAMS_HDF5_DIR=$AMS_HDF5_PATH \ - 
-DCMAKE_INSTALL_PREFIX=./install \ - -DCMAKE_BUILD_TYPE=Release \ - -DCUDA_ARCH=70 \ - -DWITH_CUDA=On \ - -DUMPIRE_DIR=$AMS_UMPIRE_PATH \ - -DMFEM_DIR=$AMS_MFEM_PATH \ - -DWITH_FAISS=On \ - -DWITH_MPI=On \ - -DWITH_TORCH=Off \ - -DWITH_TESTS=On \ - -DFAISS_DIR=$AMS_FAISS_PATH \ - -DWITH_AMS_DEBUG=On \ - -DWITH_WORKFLOW=On \ - -DWITH_ADIAK=On \ - $GITHUB_WORKSPACE - make - - name: Build Torch=Off FAISS=Off HDF5=On AMS - shell: bash -l {0} - run: | - source /spack/share/spack/setup-env.sh - spack env activate -p /ams-spack-env - rm -rf build/ - mkdir build - cd build - export AMS_MFEM_PATH=$(spack location -i mfem) - export AMS_UMPIRE_PATH=$(spack location -i umpire) - export AMS_HDF5_PATH=$(spack location -i hdf5) - cmake \ - -DBUILD_SHARED_LIBS=On \ - -DCMAKE_PREFIX_PATH=$INSTALL_DIR \ - -DWITH_CALIPER=On \ - -DWITH_EXAMPLES=On \ - -DAMS_HDF5_DIR=$AMS_HDF5_PATH \ - -DWITH_HDF5=On \ - -DCMAKE_INSTALL_PREFIX=./install \ - -DCMAKE_BUILD_TYPE=Release \ - -DCUDA_ARCH=70 \ - -DWITH_CUDA=On \ - -DUMPIRE_DIR=$AMS_UMPIRE_PATH \ - -DMFEM_DIR=$AMS_MFEM_PATH \ - -DWITH_FAISS=Off \ - -DWITH_MPI=On \ - -DWITH_TORCH=Off \ - -DWITH_TESTS=On \ - -DWITH_AMS_DEBUG=On \ - -DWITH_WORKFLOW=On \ - -DWITH_ADIAK=On \ - $GITHUB_WORKSPACE - make - - name: Build Torch=Off FAISS=Off HDF5=Off AMS + cmake -DCMAKE_CUDA_ARCHITECTURES=70 -DAMS_DIR=${AMS_INSTALL_PATH}/lib64/cmake/AMS -DMFEM_DIR=${AMS_MFEM_PATH} $GITHUB_WORKSPACE/examples/ideal_gas/ + make -j$(nproc) VERBOSE=1 + - name: Install BOptions SharedLibray=${{ matrix.with_shared }} shell: bash -l {0} run: | + module load gcc/11.2.1 source /spack/share/spack/setup-env.sh spack env activate -p /ams-spack-env - rm -rf build/ - mkdir build - cd build - export AMS_MFEM_PATH=$(spack location -i mfem) - export AMS_UMPIRE_PATH=$(spack location -i umpire) - cmake \ - -DBUILD_SHARED_LIBS=On \ - -DCMAKE_PREFIX_PATH=$INSTALL_DIR \ - -DWITH_CALIPER=On \ - -DWITH_EXAMPLES=On \ - -DCMAKE_INSTALL_PREFIX=./install \ - -DCMAKE_BUILD_TYPE=Release \ - 
-DCUDA_ARCH=70 \ - -DWITH_CUDA=On \ - -DUMPIRE_DIR=$AMS_UMPIRE_PATH \ - -DMFEM_DIR=$AMS_MFEM_PATH \ - -DWITH_FAISS=Off \ - -DWITH_MPI=On \ - -DWITH_TORCH=Off \ - -DWITH_TESTS=On \ - -DWITH_AMS_DEBUG=On \ - -DWITH_WORKFLOW=On \ - -DWITH_ADIAK=Off \ - $GITHUB_WORKSPACE - make - + mkdir build_bnm_opt/ + cd build_bnm_opt/ + cmake -DCMAKE_CUDA_ARCHITECTURES=70 -DAMS_DIR=${AMS_INSTALL_PATH}/lib64/cmake/AMS $GITHUB_WORKSPACE/examples/bnm_opt/ + make -j$(nproc) VERBOSE=1 + build-rmq-tests: # The type of runner that the job will run on runs-on: ubuntu-latest @@ -410,7 +261,6 @@ jobs: RABBITMQ_PASS: ams RABBITMQ_HOST: rabbitmq RABBITMQ_PORT: 5672 - steps: - uses: actions/checkout@v4 - name: Build Torch=On FAISS=On RMQ=On AMS @@ -429,27 +279,33 @@ jobs: export AMS_UMPIRE_PATH=$(spack location -i umpire) export AMS_HDF5_PATH=$(spack location -i hdf5) export AMS_AMQPCPP_PATH=$(spack location -i amqp-cpp)/cmake + echo """{ + \"rabbitmq-user\": \"${RABBITMQ_USER}\", + \"rabbitmq-password\": \"${RABBITMQ_PASS}\", + \"service-port\": ${RABBITMQ_PORT}, + \"service-host\": \"${RABBITMQ_HOST}\", + \"rabbitmq-vhost\": \"/\", + \"rabbitmq-queue-physics\": \"test-ci\", + \"rabbitmq-exchange-training\": \"ams-fanout\", + \"rabbitmq-key-training\": \"training\" + }""" > rmq.json + jq < rmq.json + export AMS_RMQ_CONFIG=$(cat ./rmq.json) cmake \ -DBUILD_SHARED_LIBS=On \ -DCMAKE_PREFIX_PATH=$INSTALL_DIR \ -DWITH_CALIPER=On \ -DWITH_HDF5=On \ - -DWITH_EXAMPLES=On \ -DAMS_HDF5_DIR=$AMS_HDF5_PATH \ -DCMAKE_INSTALL_PREFIX=./install \ -DCMAKE_BUILD_TYPE=Release \ -DWITH_CUDA=Off \ - -DUMPIRE_DIR=$AMS_UMPIRE_PATH \ - -DMFEM_DIR=$AMS_MFEM_PATH \ - -DWITH_FAISS=On \ -DWITH_MPI=On \ -DWITH_TORCH=On \ -DWITH_TESTS=On \ -DTorch_DIR=$AMS_TORCH_PATH \ - -DFAISS_DIR=$AMS_FAISS_PATH \ -DWITH_AMS_DEBUG=On \ -DWITH_WORKFLOW=On \ -DWITH_ADIAK=Off \ -DWITH_RMQ=On \ -Damqpcpp_DIR=$AMS_AMQPCPP_PATH \ $GITHUB_WORKSPACE @@ -460,34 +316,13 @@ jobs: export SPACK_ROOT=/spack/ source /spack/share/spack/setup-env.sh 
spack env activate -p /ams-spack-env - # We overwrite the rmq.json created by CMake - echo """{ - \"db\": { - \"dbType\": \"rmq\", - \"rmq_config\": { - \"rabbitmq-user\": \"${RABBITMQ_USER}\", - \"rabbitmq-password\": \"${RABBITMQ_PASS}\", - \"service-port\": ${RABBITMQ_PORT}, - \"service-host\": \"${RABBITMQ_HOST}\", - \"rabbitmq-vhost\": \"/\", - \"rabbitmq-queue-physics\": \"test-ci\", - \"rabbitmq-exchange-training\": \"ams-fanout\", - \"rabbitmq-key-training\": \"training\" - }, - \"update_surrogate\": false - }, - \"ml_models\": {}, - \"domain_models\": {} - }""" > $GITHUB_WORKSPACE/build/tests/AMSlib/rmq.json - ctest --output-on-failure ./ env: RABBITMQ_USER: ams RABBITMQ_PASS: ams RABBITMQ_HOST: rabbitmq RABBITMQ_PORT: 5672 - sqlite-tests: runs-on: ubuntu-latest container: ghcr.io/llnl/ams-ci-flux-mariadb:latest @@ -534,4 +369,3 @@ jobs: run: | source /venv/bin/activate python -m unittest discover -s tests/AMSWorkflow -p "test_*.py" -v - diff --git a/.gitlab/custom-jobs-and-variables.yml b/.gitlab/custom-jobs-and-variables.yml index c0eaf759..f752c33c 100644 --- a/.gitlab/custom-jobs-and-variables.yml +++ b/.gitlab/custom-jobs-and-variables.yml @@ -48,7 +48,7 @@ variables: # Lassen uses a different job scheduler (spectrum lsf) that does not allow # pre-allocation the same way slurm does. Arguments for job level allocation - LASSEN_JOB_ALLOC: "1 -W 30 -q pci" + LASSEN_JOB_ALLOC: "1 -W 45 -q pci" # Add variables that should apply to all the jobs on a machine: # LASSEN_MY_VAR: "..." diff --git a/CMakeLists.txt b/CMakeLists.txt index bb52c561..eef2edef 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -4,7 +4,7 @@ # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception cmake_minimum_required(VERSION 3.18) -project(AMS VERSION 0.1.0 LANGUAGES CXX C) +project(AMS VERSION 0.1.1 LANGUAGES CXX C) cmake_policy(SET CMP0074 NEW) # NOTE: This may break some of our integrations with the applications. 
But flux requires > C++20, RMQ requires C++17 and although AMS does not have @@ -33,128 +33,81 @@ set(AMS_EXAMPLE_INCLUDES "") # ------------------------------------------------------------------------------ option(WITH_CUDA "Option to enable CUDA" OFF) -option(WITH_EXAMPLES "Build examples" OFF) option(WITH_MPI "Option to enable MPI" OFF) option(WITH_CALIPER "Use Caliper for Profiling" OFF) -option(WITH_FAISS "Use C/C++ FAISS interface for HD Cache" OFF) -option(WITH_TORCH "Use C/C++ Torch interface for Surrogate Model Inference" OFF) -option(WITH_TORCH_DEBUG "Compute RMSE of Surrogate Model and Physics Module" OFF) option(WITH_TESTS "Compile tests" OFF) -option(WITH_REDIS "Use REDIS as a database back end" OFF) option(WITH_HDF5 "Use HDF5 as a database back end" OFF) option(WITH_RMQ "Use RabbitMQ as a database back end (require a reachable and running RabbitMQ server service)" OFF) option(WITH_AMS_DEBUG "Enable verbose messages" OFF) option(WITH_PERFFLOWASPECT "Use PerfFlowAspect for Profiling" OFF) option(WITH_WORKFLOW "Install python drivers used by the outer workflow" OFF) option(WITH_AMS_LIB "Install C++ library to support scientific applications" ON) -option(WITH_ADIAK "Use Adiak for recording metadata" OFF) -option(BUILD_SHARED_LIBS "Build using shared libraries" ON) -if (WITH_MPI) - # SET(CMAKE_CXX_COMPILER "${MPI_CXX_COMPILER}" CACHE FILEPATH "CXX compiler overridden with MPI C++ wrapper") - #SET(CMAKE_C_COMPILER "${MPI_C_COMPILER}" CACHE FILEPATH "CXX compiler overridden with MPI C++ wrapper") +# ------------------------------------------------------------------------------ +find_package(nlohmann_json REQUIRED) +list(APPEND AMS_APP_LIBRARIES nlohmann_json::nlohmann_json) +# ------------------------------------------------------------------------------ +find_package(Threads REQUIRED) +# ------------------------------------------------------------------------------ + +set(MPI_DIRECTORIES "") +if (WITH_MPI) find_package(MPI REQUIRED) message(STATUS "MPICC: 
${MPI_C_COMPILER}") message(STATUS "MPICXX: ${MPI_CXX_COMPILER}") - list(APPEND AMS_APP_LIBRARIES MPI::MPI_CXX) message(STATUS "MPI Library used: " MPI::MPI_CXX) - list(APPEND AMS_APP_DEFINES "-D__ENABLE_MPI__") + foreach(LIBRARY ${MPI_C_LIBRARIES}) + get_filename_component(LIB_DIR ${LIBRARY} DIRECTORY) + list(APPEND MPI_DIRECTORIES ${LIB_DIR}) + endforeach() + list(REMOVE_DUPLICATES MPI_DIRECTORIES) + message(STATUS "MPI Liraries: ${MPI_DIRECTORIES}") + list(APPEND AMS_APP_DEFINES "__AMS_ENABLE_MPI__") endif() # ------------------------------------------------------------------------------ if (WITH_CUDA) - if (NOT DEFINED AMS_CUDA_ARCH) - message(WARNING "CUDA ARCH IS NOT DEFINED, USING 70") - set(AMS_CUDA_ARCH 70) - endif() - - if (BUILD_SHARED_LIBS) - set(CUDA_RUNTIME_LIBRARY "Shared") - else() - set(CUDA_RUNTIME_LIBRARY "Static") - endif() - - # we need to enable nvcc - enable_language(CUDA) - set(CMAKE_CUDA_STANDARD 14) - # Turn off message of mfem - set(ENABLE_CUDA True) - list(APPEND AMS_APP_LIBRARIES ${CUDA_LIBRARIES} cuda) - list(APPEND AMS_APP_DEFINES "-D__ENABLE_CUDA__") - set(THRUST_IGNORE_CUB_VERSION_CHECK True) + find_package(CUDAToolkit) endif() + # ------------------------------------------------------------------------------ if (WITH_CALIPER) find_package(caliper REQUIRED) - list(APPEND AMS_APP_INCLUDES ${caliper_INCLUDE_DIR}) - list(APPEND AMS_APP_LIBRARIES caliper) - list(APPEND AMS_APP_DEFINES "-D__ENABLE_CALIPER__") + message(STATUS "Caliper Directory is ${caliper_DIR}") + list(APPEND AMS_APP_DEFINES "__AMS_ENABLE_CALIPER__") endif() if (WITH_AMS_DEBUG) - list(APPEND AMS_APP_DEFINES "-DLIBAMS_VERBOSE") + list(APPEND AMS_APP_DEFINES "LIBAMS_VERBOSE") endif() -# ------------------------------------------------------------------------------ -list(APPEND AMS_APP_DEFINES "-D__ENABLE_DB__") - -if (WITH_REDIS) - # Temprorary fix for:= the following error which happens when using Cuda 11.6 and Redis backend - # error: #error The version of CUB in your 
include path is not compatible with this release of Thrust. - if (WITH_CUDA) - add_compile_definitions(THRUST_IGNORE_CUB_VERSION_CHECK) - endif() - if (HIREDIS_DIR) - find_path(HIREDIS_HEADER NAMES hiredis HINTS ${HIREDIS_DIR} PATH_SUFFIXES include) - find_library(HIREDIS_LIB NAMES hiredis HINTS ${HIREDIS_DIR} PATH_SUFFIXES lib) - else() - find_package(hiredis REQUIRED) - find_path(HIREDIS_HEADER hiredis) - find_library(HIREDIS_LIB hiredis) - endif() - message(STATUS "Hiredis library is ${HIREDIS_LIB}") - message(STATUS "Hiredis headers are ${HIREDIS_HEADER}") - list(APPEND AMS_APP_INCLUDES ${HIREDIS_HEADER}) - list(APPEND AMS_APP_LIBRARIES ${HIREDIS_LIB}) - list(APPEND AMS_APP_DEFINES "-D__ENABLE_REDIS__") - - if (REDIS_PLUS_PLUS_DIR) - find_path(REDIS_PLUS_PLUS_HEADER NAMES sw PATHS ${REDIS_PLUS_PLUS_DIR} PATH_SUFFIXES include) - find_library(REDIS_PLUS_PLUS_LIB NAMES redis++ PATHS ${REDIS_PLUS_PLUS_DIR} PATH_SUFFIXES lib) +if (WITH_HDF5) + if (AMS_HDF5_DIR) + if (HDF5_USE_STATIC_LIBRARIES) + find_package(HDF5 COMPONENTS C static NO_DEFAULT_PATH PATHS ${AMS_HDF5_DIR} ${AMS_HDF5_DIR}/share/cmake) + set(AMS_HDF5_TARGET hdf5-static) + set(AMS_HDF5_LIB_TYPE "static") + else() + find_package(HDF5 COMPONENTS C shared NO_DEFAULT_PATH PATHS ${AMS_HDF5_DIR} ${AMS_HDF5_DIR}/share/cmake) + set(AMS_HDF5_LIB_TYPE "shared") + set(AMS_HDF5_TARGET hdf5-shared) + endif() else() - find_path(REDIS_PLUS_PLUS_HEADER sw) - find_library(REDIS_PLUS_PLUS_LIB redis++) + set(AMS_HDF5_LIB_TYPE "default") + find_package(HDF5 REQUIRED COMPONENTS C CXX) + set(AMS_HDF5_TARGET HDF5::HDF5) endif() - message(STATUS "Redis++ library is ${REDIS_PLUS_PLUS_LIB}") - list(APPEND AMS_APP_INCLUDES ${REDIS_PLUS_PLUS_HEADER}) - list(APPEND AMS_APP_LIBRARIES ${REDIS_PLUS_PLUS_LIB}) -endif() # WITH_REDIS - - -if (WITH_HDF5) - if (HDF5_USE_STATIC_LIBRARIES) - find_package(HDF5 NAMES hdf5 COMPONENTS C static NO_DEFAULT_PATH PATHS ${AMS_HDF5_DIR} ${AMS_HDF5_DIR}/share/cmake) - list(APPEND AMS_APP_LIBRARIES 
${HDF5_C_STATIC_LIBRARY}) - message(STATUS "HDF5 Static Library : ${HDF5_C_STATIC_LIBRARY}") - set(AMS_HDF5_LIB_TYPE "static") -else() - find_package(HDF5 NAMES hdf5 COMPONENTS C shared NO_DEFAULT_PATH PATHS ${AMS_HDF5_DIR} ${AMS_HDF5_DIR}/share/cmake) - list(APPEND AMS_APP_LIBRARIES ${HDF5_C_SHARED_LIBRARY}) - message(STATUS "HDF5 Shared Library : ${HDF5_C_SHARED_LIBRARY}") - set(AMS_HDF5_LIB_TYPE "shared") -endif() - list(APPEND AMS_APP_INCLUDES ${HDF5_INCLUDE_DIR}) - list(APPEND AMS_APP_DEFINES "-D__ENABLE_HDF5__") - message(STATUS "HDF5 Include directories: ${HDF5_INCLUDE_DIR}") + message(STATUS "HDF5 Dir is ${HDF5_FOUND}") + list(APPEND AMS_APP_DEFINES "__AMS_ENABLE_HDF5__") endif() # WITH_HDF5 if (WITH_RMQ) if (WITH_CUDA) add_compile_definitions(THRUST_IGNORE_CUB_VERSION_CHECK) endif() - list(APPEND AMS_APP_DEFINES "-D__ENABLE_RMQ__") + list(APPEND AMS_APP_DEFINES "__AMS_ENABLE_RMQ__") find_package(amqpcpp REQUIRED) get_target_property(amqpcpp_INCLUDE_DIR amqpcpp INTERFACE_INCLUDE_DIRECTORIES) @@ -163,88 +116,28 @@ if (WITH_RMQ) find_package(OpenSSL REQUIRED) set(AMS_OPENSSL_FOUND_ROOT "") if (OPENSSL_FOUND) - list(APPEND AMS_APP_INCLUDES ${OPENSSL_INCLUDE_DIR}) - list(APPEND AMS_APP_LIBRARIES "${OPENSSL_LIBRARIES}") - list(APPEND AMS_APP_LIBRARIES ssl) + message(STATUS "OpenSSL libraries found: ${OPENSSL_LIBRARIES}") + message(STATUS "OpenSSL includes found: " ${OPENSSL_INCLUDE_DIR}) get_filename_component(AMS_OPENSSL_FOUND_ROOT "${OPENSSL_SSL_LIBRARY}" DIRECTORY) get_filename_component(AMS_OPENSSL_FOUND_ROOT "${AMS_OPENSSL_FOUND_ROOT}" DIRECTORY) - message(STATUS "OpenSSL includes found: " ${OPENSSL_INCLUDE_DIR}) - message(STATUS "OpenSSL libraries found: " ${OPENSSL_LIBRARIES}) + message(STATUS "OPENSSL Root dir is ${AMS_OPENSSL_FOUND_ROOT}") else() message(STATUS "OpenSSL Not Found") endif() - + set(AMS_LIBEVENT_HINTS ${MPI_DIRECTORIES}) + find_package(libevent REQUIRED) # event loop library - list(APPEND AMS_APP_INCLUDES ${LIBEVENT_INCLUDE_DIR}) - 
list(APPEND AMS_APP_LIBRARIES "${LIBEVENT_LIBRARIES}") - list(APPEND AMS_APP_LIBRARIES amqpcpp event_pthreads event) endif() # WITH_RMQ -if(NOT DEFINED UMPIRE_DIR) - message(FATAL_ERROR "Missing required 'UMPIRE_DIR' variable pointing to an installed Umpire ${UMPIRE_DIR}") -endif() - -find_package(UMPIRE REQUIRED - NO_DEFAULT_PATH - PATHS ${UMPIRE_DIR}/share/umpire/cmake ${UMPIRE_DIR}/lib/cmake/umpire) -list(APPEND AMS_APP_LIBRARIES umpire) -list(APPEND AMS_APP_INCLUDES ${UMPIRE_INCLUDE_DIR}) - # ------------------------------------------------------------------------------ -find_package(nlohmann_json REQUIRED) -list(APPEND AMS_APP_LIBRARIES nlohmann_json::nlohmann_json) +find_package(Torch REQUIRED) +# This is annoying, torch populates all my cuda flags +# and resets them +set(CMAKE_CUDA_FLAGS "") +set(CMAKE_CUDA_ARCHITECTURES ON) -# ------------------------------------------------------------------------------ -find_package(Threads REQUIRED) # ------------------------------------------------------------------------------ -if (WITH_TORCH) - find_package(Torch REQUIRED) - # This is annoying, torch populates all my cuda flags - # and resets them - set(CMAKE_CUDA_FLAGS "") - set(CMAKE_CUDA_ARCHITECTURES ON) - - list(APPEND AMS_APP_INCLUDES "${TORCH_INCLUDE_DIRS}") - list(APPEND AMS_APP_LIBRARIES "${TORCH_LIBRARIES}") - - list(APPEND AMS_APP_DEFINES "-D__ENABLE_TORCH__") - set(BLA_VENDER OpenBLAS) - find_package(BLAS REQUIRED) - list(APPEND AMS_APP_LIBRARIES "${BLAS_LIBRARIES}") -endif() - -# ------------------------------------------------------------------------------ -if (WITH_FAISS) - ## TODO: still need to create FindFaiss.cmake - #find_package(FAISS REQUIRED HINTS ${FAISS_DIR}) - #include(${PROJECT_SOURCE_DIR}/cmake/FindFAISS.cmake) - - if (FAISS_DIR) - message(STATUS "FAISS_DIR = ${FAISS_DIR}") - set(FAISS_INCLUDE_DIRS "${FAISS_DIR}/include") - set(FAISS_LIB_DIR "${FAISS_DIR}/lib") - set(FAISS_LIB64_DIR "${FAISS_DIR}/lib64") - set(FAISS_LIBRARIES "faiss") - 
else() - message(FATAL_ERROR "Cannot find FAISS. FAISS_DIR is not defined.") - endif() - - ## TODO: we need to make a FindFaiss.cmake - list(APPEND AMS_APP_INCLUDES "${FAISS_INCLUDE_DIRS}") - list(APPEND AMS_APP_LIB_DIRS "${FAISS_LIB_DIR}" "${FAISS_LIB64_DIR}") - list(APPEND AMS_APP_LIBRARIES "${FAISS_LIBRARIES}") - list(APPEND AMS_APP_DEFINES "-D__ENABLE_FAISS__") - - - find_package(OpenMP) - if (OPENMP_FOUND) - set (CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OpenMP_C_FLAGS}") - set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}") - set (CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${OpenMP_EXE_LINKER_FLAGS}") - endif() -endif() - if (WITH_RZ) find_package(MPI REQUIRED) add_subdirectory(rz) @@ -257,7 +150,7 @@ endif() if (WITH_PERFFLOWASPECT) find_package(perfflowaspect CONFIG REQUIRED) - list(APPEND AMS_APP_DEFINES "-D__ENABLE_PERFFLOWASPECT__") + list(APPEND AMS_APP_DEFINES "__AMS_ENABLE_PERFFLOWASPECT__") list(APPEND AMS_APP_LIB_DIRS "${PERFFLOWASPECT_LIB_DIR}") list(APPEND AMS_APP_LIBRARIES "perfflow_runtime") @@ -266,80 +159,7 @@ endif() add_subdirectory(src) -# ------------------------------------------------------------------------------ -if (WITH_AMS_LIB) -if (WITH_EXAMPLES) - #The AMS Runtime does not depend on MFEM. The - #examples we use do. 
- if (WITH_MPI) - list(APPEND AMS_EXAMPLE_LIBRARIES MPI::MPI_CXX) - list(APPEND AMS_EXAMPLE_DEFINES "-D__ENABLE_MPI__") - endif() - - list(APPEND AMS_EXAMPLE_LIBRARIES umpire) - list(APPEND AMS_EXAMPLE_INCLUDES ${UMPIRE_INCLUDE_DIR}) - - if (MFEM_DIR) - include(${PROJECT_SOURCE_DIR}/cmake/FindMFEM.cmake) - else() - find_package(MFEM REQUIRED) - endif() - list(APPEND AMS_EXAMPLE_INCLUDES "${MFEM_INCLUDE_DIRS}") - list(APPEND AMS_EXAMPLE_LIBRARIES "${MFEM_LIBRARIES}") - list(APPEND AMS_EXAMPLE_LIB_DIRS "${MFEM_LIB_DIR}") - - if (WITH_REDIS) - list(APPEND AMS_EXAMPLE_DEFINES "-D__ENABLE_REDIS__") - endif() - - if (WITH_RMQ) - list(APPEND AMS_EXAMPLE_DEFINES "-D__ENABLE_RMQ__") - list(APPEND AMS_EXAMPLE_INCLUDES ${amqpcpp_INCLUDE_DIR}) - list(APPEND AMS_EXAMPLE_INCLUDES ${OPENSSL_INCLUDE_DIR}) - list(APPEND AMS_EXAMPLE_INCLUDES ${LIBEVENT_INCLUDE_DIR}) - list(APPEND AMS_EXAMPLE_LIBRARIES OpenSSL::SSL amqpcpp pthread event_pthreads event) - if (WITH_MPI) - list(APPEND AMS_EXAMPLE_LIBRARIES MPI::MPI_CXX) - endif() - endif() - - if (WITH_CUDA) - list(APPEND AMS_EXAMPLE_LIBRARIES ${CUDA_LIBRARIES} cuda) - list(APPEND AMS_EXAMPLE_DEFINES "-D__ENABLE_CUDA__") - endif() - - if (WITH_CALIPER) - list(APPEND AMS_EXAMPLE_INCLUDES ${caliper_INCLUDE_DIR}) - list(APPEND AMS_EXAMPLE_LIBRARIES caliper) - list(APPEND AMS_EXAMPLE_DEFINES "-D__ENABLE_CALIPER__") - endif() - - if (WITH_FAISS) - list(APPEND AMS_EXAMPLE_DEFINES "-D__ENABLE_FAISS__") - endif() - - if (WITH_TORCH) - list(APPEND AMS_EXAMPLE_DEFINES "-D__ENABLE_TORCH__") - endif() - - if (WITH_PERFFLOWASPECT) - list(APPEND AMS_EXAMPLE_DEFINES "-D__ENABLE_PERFFLOWASPECT__") - endif() - - if (WITH_ADIAK) - find_package(adiak REQUIRED) - list(APPEND AMS_EXAMPLE_DEFINES "-D__ENABLE_ADIAK__") - list(APPEND AMS_EXAMPLE_INCLUDES ${adiak_INCLUDE_DIR}) - list(APPEND AMS_EXAMPLE_LIBRARIES adiak::adiak) - endif() - - add_subdirectory(examples) -endif() -endif() - if (WITH_TESTS) include(CTest) add_subdirectory(tests) endif() - -# 
------------------------------------------------------------------------------ diff --git a/cmake/AMSConfig.cmake.in b/cmake/AMSConfig.cmake.in index b94b6308..a87e5bd0 100644 --- a/cmake/AMSConfig.cmake.in +++ b/cmake/AMSConfig.cmake.in @@ -10,7 +10,6 @@ set(AMS_WITH_HDF5 @WITH_HDF5@) set(AMS_WITH_RMQ @WITH_RMQ@) include(CMakeFindDependencyMacro) -find_dependency(nlohmann_json REQUIRED) if (NOT TARGET Torch) set(AMS_TORCH_DIR @Torch_DIR@) if (NOT Torch_DIR) @@ -40,7 +39,9 @@ if (AMS_WITH_HDF5) set(HDF5_DIR ${AMS_HDF5_DIR}) endif() - if (${AMS_HDF5_LIB_TYPE} STREQUAL "static") + if (${AMS_HDF5_LIB_TYPE} STREQUAL "default") + find_dependency(HDF5 COMPONENTS C) + elseif (${AMS_HDF5_LIB_TYPE} STREQUAL "static") find_dependency(HDF5 COMPONENTS C static HINTS @AMS_HDF5_DIR@ @AMS_HDF5_DIR@/share/cmake) else() find_dependency(HDF5 COMPONENTS C shared HINTS @AMS_HDF5_DIR@ @AMS_HDF5_DIR@/share/cmake) diff --git a/cmake/Findlibevent.cmake b/cmake/Findlibevent.cmake index 849434ab..553d8ec1 100644 --- a/cmake/Findlibevent.cmake +++ b/cmake/Findlibevent.cmake @@ -6,16 +6,38 @@ # LIBEVENT_FOUND - System has LibEvent # LIBEVENT_INCLUDE_DIR - the LibEvent include directory # LIBEVENT_LIBRARIES 0 The libraries needed to use LibEvent -find_path (LIBEVENT_INCLUDE_DIR NAMES event.h) -find_library (LIBEVENT_LIBRARY NAMES event) -find_library (LIBEVENT_CORE NAMES event_core) -find_library (LIBEVENT_EXTRA NAMES event_extra) +find_path (LIBEVENT_INCLUDE_DIR NAMES event.h HINTS ${AMS_LIBEVENT_HINTS}) +find_library (LIBEVENT_LIBRARY NAMES event HINTS ${AMS_LIBEVENT_HINTS} NO_CMAKE_SYSTEM_PATH) + +if(NOT LIBEVENT_LIBRARY) + find_library(LIBEVENT_LIBRARY NAMES event) +endif() + +find_library (LIBEVENT_CORE NAMES event_core HINTS ${AMS_LIBEVENT_HINTS} NO_CMAKE_SYSTEM_PATH) +if(NOT LIBEVENT_CORE) + find_library(LIBEVENT_CORE NAMES event_core) +endif() + + +find_library (LIBEVENT_EXTRA NAMES event_extra HINTS ${AMS_LIBEVENT_HINTS} NO_CMAKE_SYSTEM_PATH) +if(NOT LIBEVENT_EXTRA) + 
find_library(LIBEVENT_EXTRA NAMES event_extra) +endif() + if (NOT EVHTP_DISABLE_EVTHR) - find_library (LIBEVENT_THREAD NAMES event_pthreads) + find_library (LIBEVENT_THREAD NAMES event_pthreads HINTS ${AMS_LIBEVENT_HINTS} NO_CMAKE_SYSTEM_PATH) + if(NOT LIBEVENT_THREAD) + find_library(LIBEVENT_THREAD NAMES event_pthreads) + endif() endif() + if (NOT EVHTP_DISABLE_SSL) - find_library (LIBEVENT_SSL NAMES event_openssl) + find_library (LIBEVENT_SSL NAMES event_openssl HINTS ${AMS_LIBEVENT_HINTS} NO_CMAKE_SYSTEM_PATH) + if(NOT LIBEVENT_SSL) + find_library(LIBEVENT_SSL NAMES event_openssl) + endif() endif() + include (FindPackageHandleStandardArgs) set (LIBEVENT_INCLUDE_DIRS ${LIBEVENT_INCLUDE_DIR}) set (LIBEVENT_LIBRARIES diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt deleted file mode 100644 index 6b1d7d0e..00000000 --- a/examples/CMakeLists.txt +++ /dev/null @@ -1,10 +0,0 @@ -# Copyright 2021-2023 Lawrence Livermore National Security, LLC and other -# AMSLib Project Developers -# -# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception - -add_subdirectory(ideal_gas) -if (WITH_CUDA) -add_subdirectory(bnm_opt) -endif() - diff --git a/examples/bnm_opt/CMakeLists.txt b/examples/bnm_opt/CMakeLists.txt index 0f147c2b..2f5a2712 100644 --- a/examples/bnm_opt/CMakeLists.txt +++ b/examples/bnm_opt/CMakeLists.txt @@ -3,25 +3,66 @@ # # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -set(binomial_options_src binomial_options.cpp kernel.cpp) - -function(ADDExec binary_name definitions) - target_include_directories(${binary_name} PUBLIC "${AMS_EXAMPLE_INCLUDES}") - target_compile_definitions(${binary_name} PRIVATE ${definitions}) - target_compile_definitions(${binary_name} PRIVATE ${AMS_EXAMPLE_DEFINES}) - target_link_directories(${binary_name} PRIVATE ${AMS_EXAMPLE_LIB_DIRS}) - target_link_libraries(${binary_name} PUBLIC AMS ${AMS_EXAMPLE_LIBRARIES}) - - if (WITH_CUDA) - set_source_files_properties(kernel.cpp PROPERTIES LANGUAGE CUDA) - 
set_target_properties(${binary_name} PROPERTIES CUDA_ARCHITECTURES "${AMS_CUDA_ARCH}") + +cmake_minimum_required(VERSION 3.10) +cmake_policy(SET CMP0104 NEW) + +# Define the project +project(BOptions LANGUAGES CXX) + +set(AMS_EXAMPLE_SRC binomial_options.cpp kernel.cpp) + + +option(WITH_MPI "Option to enable MPI" OFF) +option(WITH_CALIPER "Use Caliper for Profiling" OFF) + +set(CMAKE_CXX_STANDARD 14) # Enable C++14 +set(CMAKE_CXX_STANDARD_REQUIRED ON) # Require the specified standard + +enable_language(CUDA) + +find_package(AMS REQUIRED) +find_package(CUDAToolkit REQUIRED) + +if (WITH_CALIPER) + find_package(caliper REQUIRED) +endif() + +if (WITH_MPI) + find_package(MPI REQUIRED) +endif() + +function(ADDExec binary_name use_ams) + target_link_libraries(${binary_name} PRIVATE ${MFEM_LIBRARIES}) + target_include_directories(${binary_name} PRIVATE ${MFEM_INCLUDE_DIRS}) + # we always use ams to avoid if def conflicts. + # but we only enable it for the use_ams case + target_link_libraries(${binary_name} PRIVATE AMS::AMS) + if (${use_ams}) + target_compile_definitions(${binary_name} PRIVATE USE_AMS) + endif() + + if (WITH_MPI) + target_link_libraries(${binary_name} PRIVATE MPI::MPI_CXX) + target_compile_definitions(${binary_name} PRIVATE "__ENABLE_MPI__") endif() + + + target_link_libraries(${binary_name} PRIVATE + CUDA::cudart + CUDA::cublas + CUDA::cusparse + CUDA::curand + ) + set_source_files_properties(kernel.cpp PROPERTIES LANGUAGE CUDA) + set_target_properties(${binary_name} PROPERTIES CUDA_ARCHITECTURES "${CMAKE_CUDA_ARCHITECTURES}") + target_compile_definitions(${binary_name} PRIVATE "__ENABLE_CUDA__") endfunction() -add_executable(no_ams_bo ${binomial_options_src} ${MINIAPP_INCLUDES}) -ADDExec(no_ams_bo minibude "${AMS_EXAMPLE_DEFINES}") +add_executable(no_ams_boptions ${AMS_EXAMPLE_SRC} ${MINIAPP_INCLUDES}) +ADDExec(no_ams_boptions FALSE) + +add_executable(ams_boptions ${AMS_EXAMPLE_SRC} ${MINIAPP_INCLUDES}) +ADDExec(ams_boptions TRUE) -list(APPEND 
AMS_EXAMPLE_DEFINES "-DUSE_AMS") -add_executable(ams_bo ${binomial_options_src} ${MINIAPP_INCLUDES}) -ADDExec(ams_bo "${AMS_EXAMPLE_DEFINES}") diff --git a/examples/bnm_opt/binomial_options.cpp b/examples/bnm_opt/binomial_options.cpp index f9a2a4e0..8ff8182d 100644 --- a/examples/bnm_opt/binomial_options.cpp +++ b/examples/bnm_opt/binomial_options.cpp @@ -253,7 +253,7 @@ int main(int argc, char **argv) #endif if (argc != 5) { - std::cout << "USAGE: " << argv[0] << " num-options batch_size"; + std::cout << "USAGE: " << argv[0] << " num-options batch_size num_fractions fraction_id"; return EXIT_FAILURE; } diff --git a/examples/bnm_opt/kernel.cpp b/examples/bnm_opt/kernel.cpp index a2e9631e..0364b622 100644 --- a/examples/bnm_opt/kernel.cpp +++ b/examples/bnm_opt/kernel.cpp @@ -26,6 +26,8 @@ #include #endif +using namespace ams; + // Overloaded shortcut functions for different precision modes #ifndef DOUBLE_PRECISION @@ -169,6 +171,7 @@ BinomialOptions::BinomialOptions(unsigned int batchSize, #ifdef USE_AMS const char *model_name = std::getenv("BO_MODEL_NAME"); + std::cout << "Model name is " << model_name << "\n"; if (model_name) { model = AMSQueryModel(model_name); } else { @@ -176,9 +179,6 @@ BinomialOptions::BinomialOptions(unsigned int batchSize, } wf = AMSCreateExecutor(model, - AMSDType::AMS_DOUBLE, - AMSResourceType::AMS_DEVICE, - (AMSPhysicFn)(BinomialOptions::AMSRun), rank, worldSize); #endif @@ -219,18 +219,38 @@ void BinomialOptions::run(real *callValue, cudaMemcpy(d_X, _X, sizeof(real) * optN, cudaMemcpyHostToDevice); #ifdef USE_AMS - std::vector inputs({(const real *)d_S, - (const real *)d_X, - (const real *)d_R, - (const real *)d_V, - (const real *)d_T}); + + SmallVector inputs; + SmallVector inout; + SmallVector outputs; + inputs.push_back(std::move(AMSTensor::view(d_S, {static_cast(optN), 1L}, {1, 1}, AMSResourceType::AMS_DEVICE))); + inputs.push_back(std::move(AMSTensor::view(d_X, {static_cast(optN), 1L}, {1, 1}, AMSResourceType::AMS_DEVICE))); + 
inputs.push_back(std::move(AMSTensor::view(d_R, {static_cast(optN), 1L}, {1, 1}, AMSResourceType::AMS_DEVICE))); + inputs.push_back(std::move(AMSTensor::view(d_V, {static_cast(optN), 1L}, {1, 1}, AMSResourceType::AMS_DEVICE))); + inputs.push_back(std::move(AMSTensor::view(d_T, {static_cast(optN), 1L}, {1, 1}, AMSResourceType::AMS_DEVICE))); + + + outputs.push_back(std::move(AMSTensor::view(d_CallValue, {static_cast(optN), 1}, {1, 1}, AMSResourceType::AMS_DEVICE))); + + EOSLambda OrigComputation = [&, this](const SmallVector &ams_ins, + SmallVector &ams_inouts, + SmallVector &ams_outs) { + binomialOptionsGPU(ams_outs[0].data(), + ams_ins[0].data(), + ams_ins[1].data(), + ams_ins[2].data(), + ams_ins[3].data(), + ams_ins[4].data(), + d_vDt, + d_puByDf, + d_pdByDf, + ams_outs[0].shape()[0]); + }; + + AMSExecute(wf, - (void *)this, - optN, - reinterpret_cast(inputs.data()), - reinterpret_cast(&d_CallValue), - inputs.size(), - 1); + OrigComputation, + inputs, inout, outputs); #else binomialOptionsGPU( d_CallValue, d_S, d_X, d_R, d_V, d_T, d_puByDf, d_pdByDf, d_vDt, optN); diff --git a/examples/bnm_opt/kernel.hpp b/examples/bnm_opt/kernel.hpp index 939dd5f6..0f4176a0 100644 --- a/examples/bnm_opt/kernel.hpp +++ b/examples/bnm_opt/kernel.hpp @@ -2,9 +2,7 @@ #include "realtype.h" -#ifdef USE_AMS #include -#endif class BinomialOptions { @@ -18,10 +16,8 @@ class BinomialOptions real *d_X; real *d_CallValue; -#ifdef USE_AMS - AMSCAbstrModel model; - AMSExecutor wf; -#endif + ams::AMSCAbstrModel model; + ams::AMSExecutor wf; public: BinomialOptions(unsigned int batchSize, int rank, int worldSize); @@ -33,9 +29,7 @@ class BinomialOptions real *_T, size_t optN); -#ifdef USE_AMS static void AMSRun(void *cls, long numOptions, void **inputs, void **outputs); -#endif ~BinomialOptions(); }; diff --git a/examples/ideal_gas/CMakeLists.txt b/examples/ideal_gas/CMakeLists.txt index d0a41287..dfa32e0c 100644 --- a/examples/ideal_gas/CMakeLists.txt +++ b/examples/ideal_gas/CMakeLists.txt 
@@ -1,49 +1,85 @@ -# Copyright 2021-2023 Lawrence Livermore National Security, LLC and other +# Copyright 2021-2023 Lawrence Livermore National Security, LLC and otherMS # AMSLib Project Developers # # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -file(GLOB_RECURSE AMS_CURRENT_EXAMPLE_INCLUDES "*.hpp") +cmake_minimum_required(VERSION 3.10) +cmake_policy(SET CMP0104 NEW) + +# Define the project +project(IdealGasExample LANGUAGES CXX) + + +option(WITH_CUDA "Option to enable CUDA" On) +option(WITH_MPI "Option to enable MPI" OFF) +option(WITH_CALIPER "Use Caliper for Profiling" OFF) + +set(CMAKE_CXX_STANDARD 14) # Enable C++14 +set(CMAKE_CXX_STANDARD_REQUIRED ON) # Require the specified standard + +if (WITH_CUDA) + message(WARNING "CUDA is ${CMAKE_CUDA_ARCHITECTURES}") + enable_language(CUDA) + find_package(CUDAToolkit) + message(WARNING "CUDA is ${CMAKE_CUDA_ARCHITECTURES}") +endif() + +if (MFEM_DIR) + include(${PROJECT_SOURCE_DIR}/cmake/FindMFEM.cmake) +else() + find_package(MFEM REQUIRED) +endif() + set(AMS_EXAMPLE_SRC ${MINIAPP_INCLUDES} main.cpp app/eos_ams.cpp) +if (WITH_CALIPER) + find_package(caliper REQUIRED) +endif() + +if (WITH_MPI) + find_package(MPI REQUIRED) +endif() + +find_package(AMS REQUIRED) + +function(ADDExec binary_name use_ams) + target_link_libraries(${binary_name} PRIVATE ${MFEM_LIBRARIES}) + target_include_directories(${binary_name} PRIVATE ${MFEM_INCLUDE_DIRS}) + # we always use ams to avoid if def conflicts. 
+ # but we only enable it for the use_ams case + target_link_libraries(${binary_name} PRIVATE AMS::AMS) + if (${use_ams}) + target_compile_definitions(${binary_name} PRIVATE USE_AMS) + endif() -function(ADDExec binary_name definitions) - if (WITH_RZ) - list(APPEND AMS_EXAMPLE_SRC ${RZ_AMS_SOURCES} ${AMS_CURRENT_EXAMPLE_INCLUDES}) + if (WITH_MPI) + target_link_libraries(${binary_name} PRIVATE MPI::MPI_CXX) + target_compile_definitions(${binary_name} PRIVATE "__ENABLE_MPI__") endif() - target_include_directories(${binary_name} PRIVATE ${AMS_EXAMPLE_INCLUDES} - ${CMAKE_CURRENT_SOURCE_DIR} ${PROJECT_BINARY_DIR}/include) - target_compile_definitions(${binary_name} PRIVATE ${definitions}) - target_link_directories(${binary_name} PRIVATE ${AMS_EXAMPLE_LIB_DIRS}) - target_link_libraries(${binary_name} PUBLIC AMS ${AMS_EXAMPLE_LIBRARIES}) if (WITH_CUDA) - set_source_files_properties(main.cpp PROPERTIES LANGUAGE CUDA) - set_source_files_properties(main.cpp PROPERTIES COMPILE_FLAGS "--expt-extended-lambda") - set_source_files_properties(app/eos_ams.cpp PROPERTIES LANGUAGE CUDA) - set_source_files_properties(app/eos_ams.cpp PROPERTIES COMPILE_FLAGS "--expt-extended-lambda") - set_target_properties(${binary_name} PROPERTIES CUDA_ARCHITECTURES "${AMS_CUDA_ARCH}") - - if (WITH_RZ) - set_source_files_properties(${RZ_AMS_SOURCES} PROPERTIES LANGUAGE CUDA) - set_source_files_properties(${RZ_AMS_SOURCES} PROPERTIES COMPILE_FLAGS "--expt-extended-lambda") - set_property(TARGET ${binary_name} PROPERTY CUDA_SEPARABLE_COMPILATION ON) - endif() - - if (WITH_PERFFLOWASPECT) - set_property(SOURCE ${AMS_EXAMPLE_SRC} APPEND_STRING PROPERTY COMPILE_FLAGS " -Xcompiler=-Xclang -Xcompiler=-load -Xcompiler=-Xclang -Xcompiler=${PERFFLOWASPECT_LIB_DIR}/libWeavePass.so") - endif() + target_link_libraries(${binary_name} PRIVATE + CUDA::cudart # CUDA Runtime + CUDA::cublas # cuBLAS + CUDA::cusparse # cuSPARSE + CUDA::curand # cuRAND + ) + set_source_files_properties(main.cpp PROPERTIES LANGUAGE CUDA) + 
set_source_files_properties(main.cpp PROPERTIES COMPILE_FLAGS "--expt-extended-lambda") + set_source_files_properties(app/eos_ams.cpp PROPERTIES LANGUAGE CUDA) + set_source_files_properties(app/eos_ams.cpp PROPERTIES COMPILE_FLAGS "--expt-extended-lambda") + set_target_properties(${binary_name} PROPERTIES CUDA_ARCHITECTURES "${CMAKE_CUDA_ARCHITECTURES}") + target_compile_definitions(${binary_name} PRIVATE "__ENABLE_CUDA__") endif() endfunction() add_executable(no_ams_example ${AMS_EXAMPLE_SRC} ${MINIAPP_INCLUDES}) -ADDExec(no_ams_example "${AMS_EXAMPLE_DEFINES}") +ADDExec(no_ams_example FALSE) -list(APPEND AMS_EXAMPLE_DEFINES "-DUSE_AMS") add_executable(ams_example ${AMS_EXAMPLE_SRC} ${MINIAPP_INCLUDES}) -ADDExec(ams_example "${AMS_EXAMPLE_DEFINES}") +ADDExec(ams_example TRUE) if (WITH_WORKFLOW) set(TRAIN_DEVICE "cpu") diff --git a/examples/ideal_gas/app/eos_ams.cpp b/examples/ideal_gas/app/eos_ams.cpp index cf7b9de5..3f5735b9 100644 --- a/examples/ideal_gas/app/eos_ams.cpp +++ b/examples/ideal_gas/app/eos_ams.cpp @@ -5,64 +5,27 @@ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception */ -#include "eos_ams.hpp" - +#include #include -template -void callBack(void *cls, - long elements, - const void *const *inputs, - void *const *outputs) -{ - static_cast *>(cls)->Eval(elements, - static_cast(inputs[0]), - static_cast(inputs[1]), - static_cast(outputs[0]), - static_cast(outputs[1]), - static_cast(outputs[2]), - static_cast(outputs[3])); -} +#include "eos_ams.hpp" +using namespace ams; template -AMSEOS::AMSEOS(EOS *model, - const AMSDBType db_type, - const AMSDType dtype, +AMSEOS::AMSEOS(const AMSDBType db_type, + const AMSResourceType resource, const AMSExecPolicy exec_policy, - const AMSResourceType res_type, const AMSUQPolicy uq_policy, - const int k_nearest, const int mpi_task, const int mpi_nproc, const double threshold, - const char *surrogate_path, - const char *uq_path) - : model_(model) + const char *surrogate_path) + : res_(resource), IdealGas(1.6, 1.4) { 
- AMSCAbstrModel model_descr = AMSRegisterAbstractModel("ideal_gas", - uq_policy, - threshold, - surrogate_path, - uq_path, - "ideal_gas", - k_nearest); -#ifdef __ENABLE_MPI__ - wf_ = AMSCreateDistributedExecutor(model_descr, - dtype, - res_type, - (AMSPhysicFn)callBack, - MPI_COMM_WORLD, - mpi_task, - mpi_nproc); -#else - wf_ = AMSCreateExecutor(model_descr, - dtype, - res_type, - (AMSPhysicFn)callBack, - mpi_task, - mpi_nproc); -#endif + AMSCAbstrModel model_descr = AMSRegisterAbstractModel( + "ideal_gas", uq_policy, threshold, surrogate_path, "ideal_gas"); + wf_ = AMSCreateExecutor(model_descr, mpi_task, mpi_nproc); } template @@ -77,16 +40,40 @@ void AMSEOS::Eval(const int length, FPType *bulkmod, FPType *temperature) const { - std::vector inputs = {density, energy}; - std::vector outputs = {pressure, soundspeed2, bulkmod, temperature}; + SmallVector inputs; + inputs.push_back( + std::move(AMSTensor::view(density, {length, 1}, {1, 1}, res_))); + inputs.push_back( + std::move(AMSTensor::view(density, {length, 1}, {1, 1}, res_))); + + SmallVector inout; + SmallVector outputs; + outputs.push_back( + std::move(AMSTensor::view(pressure, {length, 1}, {1, 1}, res_))); + outputs.push_back( + std::move(AMSTensor::view(soundspeed2, {length, 1}, {1, 1}, res_))); + outputs.push_back( + std::move(AMSTensor::view(bulkmod, {length, 1}, {1, 1}, res_))); + outputs.push_back( + std::move(AMSTensor::view(temperature, {length, 1}, {1, 1}, res_))); + + EOSLambda OrigComputation = [&, this](const SmallVector &ams_ins, + SmallVector &ams_inouts, + SmallVector &ams_outs) { + std::cout << "Shape is " << ams_ins[0].shape()[0] << ", " + << ams_ins[1].shape()[1] << "\n"; + IdealGas::Eval( + ams_ins[0].shape()[0], + static_cast(ams_ins[0].data()), + static_cast(ams_ins[1].data()), + static_cast(ams_outs[0].data()), + static_cast(ams_outs[1].data()), + static_cast(ams_outs[2].data()), + static_cast(ams_outs[3].data())); + }; + - AMSExecute(wf_, - (void *)model_, - length, - 
reinterpret_cast(inputs.data()), - reinterpret_cast(outputs.data()), - inputs.size(), - outputs.size()); + AMSExecute(wf_, OrigComputation, inputs, inout, outputs); } template class AMSEOS; diff --git a/examples/ideal_gas/app/eos_ams.hpp b/examples/ideal_gas/app/eos_ams.hpp index c799568c..ac9748d1 100644 --- a/examples/ideal_gas/app/eos_ams.hpp +++ b/examples/ideal_gas/app/eos_ams.hpp @@ -11,49 +11,31 @@ #include #include "AMS.h" -#include "eos.hpp" +#include "eos_idealgas.hpp" template -class AMSEOS : public EOS +class AMSEOS : public IdealGas { - AMSExecutor wf_; - EOS *model_ = nullptr; + ams::AMSExecutor wf_; + ams::AMSResourceType res_; public: - AMSEOS(EOS *model, - const AMSDBType db_type, - const AMSDType dtype, - const AMSExecPolicy exec_policy, - const AMSResourceType res_type, - const AMSUQPolicy uq_policy, - const int k_nearest, + AMSEOS(const ams::AMSDBType db_type, + const ams::AMSResourceType resource, + const ams::AMSExecPolicy exec_policy, + const ams::AMSUQPolicy uq_policy, const int mpi_task, const int mpi_nproc, const double threshold, - const char *surrogate_path, - const char *uq_path); - - virtual ~AMSEOS() { delete model_; } - - void Eval(const int length, - const FPType *density, - const FPType *energy, - FPType *pressure, - FPType *soundspeed2, - FPType *bulkmod, - FPType *temperature) const override; - - void Eval_with_filter(const int length, - const FPType *density, - const FPType *energy, - const bool *filter, - FPType *pressure, - FPType *soundspeed2, - FPType *bulkmod, - FPType *temperature) const override - { - throw std::runtime_error("AMSEOS: Eval_with_filter is not implemented"); - } + const char *surrogate_path); + + virtual void Eval(const int length, + const FPType *density, + const FPType *energy, + FPType *pressure, + FPType *soundspeed2, + FPType *bulkmod, + FPType *temperature) const override; }; #endif // _AMS_EOS_HPP_ diff --git a/examples/ideal_gas/app/eos_idealgas.hpp b/examples/ideal_gas/app/eos_idealgas.hpp index 
d1af5e43..d7b203f0 100644 --- a/examples/ideal_gas/app/eos_idealgas.hpp +++ b/examples/ideal_gas/app/eos_idealgas.hpp @@ -39,6 +39,7 @@ class IdealGas : public EOS { const FPType gamma = gamma_; const FPType specific_heat = specific_heat_; + std::cout << "Evaluating for " << length << " elements\n"; using mfem::ForallWrap; MFEM_FORALL(i, length, { @@ -47,6 +48,13 @@ class IdealGas : public EOS bulkmod[i] = gamma * pressure[i]; temperature[i] = energy[i] / specific_heat; }); + + for (int i = 0; i < length; i++) { + std::cout << "( " << i << ") Outputs are (" << pressure[i] << ", "; + std::cout << soundspeed2[i] << ", "; + std::cout << bulkmod[i] << ", "; + std::cout << temperature[i] << ")\n"; + } } #ifdef __ENABLE_PERFFLOWASPECT__ diff --git a/cmake/FindMFEM.cmake b/examples/ideal_gas/cmake/FindMFEM.cmake similarity index 100% rename from cmake/FindMFEM.cmake rename to examples/ideal_gas/cmake/FindMFEM.cmake diff --git a/examples/ideal_gas/main.cpp b/examples/ideal_gas/main.cpp index b35d4293..ab41bc4e 100644 --- a/examples/ideal_gas/main.cpp +++ b/examples/ideal_gas/main.cpp @@ -5,9 +5,6 @@ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception */ -#ifdef __AMS_ENABLE_ADIAK__ -#include -#endif #include #include #include @@ -31,7 +28,20 @@ // this macro completely bypasses all AMS functionality // this allows us to check how easy is it to test ams -#include "AMS.h" +#include +using namespace ams; + +#define CALIPER(stmt) + +#ifdef __ENABLE_MPI__ +#include +#define MPI_CALL(stmt) \ + if (stmt != MPI_SUCCESS) { \ + fprintf(stderr, "Error in MPI-Call (File: %s, %d)\n", __FILE__, __LINE__); \ + } +#else +#define MPI_CALL(stm) +#endif void printMemory(std::unordered_set &allocators) { @@ -56,68 +66,16 @@ void createUmpirePool(std::string parent_name, std::string pool_name) pool_name, rm.getAllocator(parent_name)); } -std::unordered_set createMemoryAllocators( - std::string pool, - std::string &physics_host_alloc, - std::string &physics_device_alloc, - std::string 
&physics_pinned_alloc, - std::string &ams_host_alloc, - std::string &ams_device_alloc, - std::string &ams_pinned_alloc) +void createMemoryAllocators(std::string &physics_host_alloc, + std::string &physics_device_alloc, + std::string &physics_pinned_alloc) { std::unordered_set allocator_names; - if (pool == "default") { - physics_host_alloc = ams_host_alloc = "HOST"; - allocator_names.insert(ams_host_alloc); + physics_host_alloc = "HOST"; #ifdef __ENABLE_CUDA__ - physics_device_alloc = ams_device_alloc = "DEVICE"; - allocator_names.insert(ams_device_alloc); - physics_pinned_alloc = ams_pinned_alloc = "PINNED"; - allocator_names.insert(ams_pinned_alloc); + physics_device_alloc = "DEVICE"; + physics_pinned_alloc = "PINNED"; #endif - } else if (pool == "split") { - physics_host_alloc = "phys-host"; - createUmpirePool("HOST", "phys-host"); - allocator_names.insert(physics_host_alloc); - - ams_host_alloc = "ams-host"; - createUmpirePool("HOST", ams_host_alloc); - allocator_names.insert(ams_host_alloc); - -#ifdef __ENABLE_CUDA__ - physics_device_alloc = "phys-device"; - createUmpirePool("DEVICE", physics_device_alloc); - allocator_names.insert(physics_device_alloc); - - physics_pinned_alloc = "phys-pinned"; - createUmpirePool("PINNED", physics_pinned_alloc); - allocator_names.insert(physics_pinned_alloc); - - ams_device_alloc = "ams-device"; - createUmpirePool("DEVICE", ams_device_alloc); - allocator_names.insert(ams_device_alloc); - - ams_pinned_alloc = "ams-pinned"; - createUmpirePool("PINNED", ams_pinned_alloc); - allocator_names.insert(ams_pinned_alloc); -#endif - } else if (pool == "same") { - physics_host_alloc = ams_host_alloc = "common-host"; - createUmpirePool("HOST", "common-host"); - allocator_names.insert(physics_host_alloc); -#ifdef __ENABLE_CUDA__ - physics_device_alloc = ams_device_alloc = "common-device"; - createUmpirePool("DEVICE", "common-device"); - allocator_names.insert(ams_device_alloc); - physics_pinned_alloc = ams_pinned_alloc = "common-pinned"; - 
createUmpirePool("PINNED", "common-pinned"); - allocator_names.insert(ams_pinned_alloc); -#endif - } else { - std::cout << "Stategy is " << pool << "\n"; - throw std::runtime_error("Pool strategy does not exist\n"); - } - return std::move(allocator_names); } int computeNumElements(int globalNumElements, int id, int numRanks) @@ -148,7 +106,6 @@ template int run(const char *device_name, const char *db_type, const char *uq_policy_opt, - AMSDType precision, int seed, int rId, int imbalance, @@ -156,7 +113,6 @@ int run(const char *device_name, double avg, double stdDev, double threshold, - const char *pool, int num_mats, int num_elems, int num_qpts, @@ -165,11 +121,9 @@ int run(const char *device_name, const char *eos_name, int stop_cycle, bool pack_sparse_mats, - const char *hdcache_path, const char *model_path, const char *db_config, - bool lbalance, - int k_nearest) + ams::AMSExecPolicy ams_loadBalance) { // ------------------------------------------------------------------------- // setup @@ -180,9 +134,7 @@ int run(const char *device_name, const bool use_device = std::strcmp(device_name, "cpu") != 0; AMSDBType dbType = AMSDBType::AMS_NONE; - if (std::strcmp(db_type, "csv") == 0) { - dbType = AMSDBType::AMS_CSV; - } else if (std::strcmp(db_type, "hdf5") == 0) { + if (std::strcmp(db_type, "hdf5") == 0) { dbType = AMSDBType::AMS_HDF5; } else if (std::strcmp(db_type, "rmq") == 0) { dbType = AMSDBType::AMS_RMQ; @@ -196,12 +148,7 @@ int run(const char *device_name, } AMSUQPolicy uq_policy; - - if (strcmp(uq_policy_opt, "faiss-max") == 0) - uq_policy = AMSUQPolicy::AMS_FAISS_MAX; - else if (strcmp(uq_policy_opt, "faiss-mean") == 0) - uq_policy = AMSUQPolicy::AMS_FAISS_MEAN; - else if (strcmp(uq_policy_opt, "deltauq-max") == 0) + if (strcmp(uq_policy_opt, "deltauq-max") == 0) uq_policy = AMSUQPolicy::AMS_DELTAUQ_MAX; else if (strcmp(uq_policy_opt, "deltauq-mean") == 0) uq_policy = AMSUQPolicy::AMS_DELTAUQ_MEAN; @@ -246,17 +193,9 @@ int run(const char *device_name, 
std::string physics_device_alloc; std::string physics_pinned_alloc; - std::string ams_host_alloc; - std::string ams_device_alloc; - std::string ams_pinned_alloc; - - auto allocator_names = createMemoryAllocators(std::string(pool), - physics_host_alloc, - physics_device_alloc, - physics_pinned_alloc, - ams_host_alloc, - ams_device_alloc, - ams_pinned_alloc); + createMemoryAllocators(physics_host_alloc, + physics_device_alloc, + physics_pinned_alloc); mfem::MemoryManager::SetUmpireHostAllocatorName(physics_host_alloc.c_str()); @@ -266,17 +205,6 @@ int run(const char *device_name, } - // When we are not allocating from parent/root umpire allocator - // we need to inform AMS about the pool allocators. - if (strcmp(pool, "default") != 0) { - AMSSetAllocator(AMSResourceType::AMS_HOST, ams_host_alloc.c_str()); - - if (use_device) { - AMSSetAllocator(AMSResourceType::AMS_DEVICE, ams_device_alloc.c_str()); - AMSSetAllocator(AMSResourceType::AMS_PINNED, ams_pinned_alloc.c_str()); - } - } - mfem::Device::SetMemoryTypes(mfem::MemoryType::HOST_UMPIRE, mfem::MemoryType::DEVICE_UMPIRE); @@ -325,27 +253,16 @@ int run(const char *device_name, // --------------------------------------------------------------------- // setup AMS options // --------------------------------------------------------------------- -#ifdef USE_AMS - constexpr bool use_ams = true; - const char *uq_path = nullptr; const char *surrogate_path = nullptr; const char *db_path = nullptr; -#ifdef __ENABLE_FAISS__ - uq_path = (strlen(hdcache_path) > 0) ? hdcache_path : nullptr; -#endif - std::cout << "surrogate Path is : " << model_path << "\n"; -#ifdef __ENABLE_TORCH__ surrogate_path = (strlen(model_path) > 0) ? model_path : nullptr; -#endif db_path = (strlen(db_config) > 0) ? 
db_config : nullptr; - AMSResourceType ams_device = AMSResourceType::AMS_HOST; - if (use_device) ams_device = AMSResourceType::AMS_DEVICE; - AMSExecPolicy ams_loadBalance = AMSExecPolicy::AMS_UBALANCED; - if (lbalance) ams_loadBalance = AMSExecPolicy::AMS_BALANCED; +#ifdef USE_AMS + constexpr bool use_ams = true; #else constexpr bool use_ams = false; #endif @@ -358,26 +275,23 @@ int run(const char *device_name, EOS *base; if (eos_name == std::string("ideal_gas")) { base = new IdealGas(1.6, 1.4); - } else if (eos_name == std::string("constant_host")) { - base = new ConstantEOSOnHost(physics_host_alloc.c_str(), 1.0); } else { std::cerr << "unknown eos `" << eos_name << "'" << std::endl; return 1; } #ifdef USE_AMS if (use_ams) { - eoses[mat_idx] = new AMSEOS(base, - dbType, - precision, - ams_loadBalance, - ams_device, - uq_policy, - k_nearest, - rId, - wS, - threshold, - surrogate_path, - uq_path); + eoses[mat_idx] = + new AMSEOS(dbType, + use_device == true + ? ams::AMSResourceType::AMS_DEVICE + : ams::AMSResourceType::AMS_HOST, + ams_loadBalance, + uq_policy, + rId, + wS, + threshold, + surrogate_path); } else #endif @@ -578,7 +492,6 @@ int run(const char *device_name, } CALIPER(CALI_MARK_END("Cycle");) MPI_CALL(MPI_Barrier(MPI_COMM_WORLD)); - printMemory(allocator_names); } // TODO: Add smart-pointers @@ -592,7 +505,6 @@ int run(const char *device_name, return 0; } -PERFFASPECT() int main(int argc, char **argv) { // ------------------------------------------------------------------------- @@ -617,15 +529,12 @@ int main(int argc, char **argv) const char *device_name = "cpu"; const char *eos_name = "ideal_gas"; const char *model_path = ""; - const char *hdcache_path = ""; const char *db_config = ""; const char *db_type = ""; const char *precision_opt = "double"; - AMSDType precision = AMSDType::AMS_DOUBLE; const char *uq_policy_opt = ""; - int k_nearest = 5; int seed = 0; double empty_element_ratio = -1; @@ -651,30 +560,6 @@ int main(int argc, char **argv) bool 
verbose = false; -#ifdef __AMS_ENABLE_ADIAK__ - // add adiak init here - adiak::init(NULL); - - // replace with adiak::collect_all(); once adiak v0.4.0 - adiak::uid(); - adiak::launchdate(); - adiak::launchday(); - adiak::executable(); - adiak::executablepath(); - adiak::workdir(); - adiak::libraries(); - adiak::cmdline(); - adiak::hostname(); - adiak::clustername(); - adiak::walltime(); - adiak::systime(); - adiak::cputime(); - adiak::jobsize(); - adiak::hostlist(); - adiak::numhosts(); - adiak::value("compiler", std::string("@RAJAPERF_COMPILER@")); -#endif - // ------------------------------------------------------------------------- // setup command line parser // ------------------------------------------------------------------------- @@ -689,7 +574,6 @@ int main(int argc, char **argv) // surrogate model args.AddOption(&model_path, "-S", "--surrogate", "Path to surrogate model"); - args.AddOption(&hdcache_path, "-H", "--hdcache", "Path to hdcache index"); // eos model and length of simulation args.AddOption(&eos_name, "-z", "--eos", "EOS model type"); @@ -735,13 +619,6 @@ int main(int argc, char **argv) "--stdev", "Standard deviation of random number generator of imbalance "); - args.AddOption(&lbalance, - "-lb", - "--with-load-balance", - "-nlb", - "--without-load-balance", - "Enable Load balance module in AMS"); - args.AddOption(&threshold, "-t", "--threshold", @@ -759,25 +636,13 @@ int main(int argc, char **argv) "-dt", "--dbtype", "Configuration option of the different DB types:\n" - "\t 'csv' Use csv as back end\n" "\t 'hdf5': use hdf5 as a back end\n" "\t 'rmq': use RabbitMQ as a back end\n"); - args.AddOption(&k_nearest, - "-knn", - "--k-nearest-neighbors", - "Number of closest neightbors we should look at"); - args.AddOption(&uq_policy_opt, "-uq", "--uqtype", "Types of UQ to select from: \n" - "\t 'faiss-mean' Uncertainty is computed in comparison " - "against the " - "mean distance of k-nearest neighbors\n" - "\t 'faiss-max': Uncertainty is computed 
in comparison with " - "the " - "k'st cluster \n" "\t 'deltauq-mean': Uncertainty through DUQ using mean\n" "\t 'deltauq-max': Uncertainty through DUQ using max\n" "\t 'random': Uncertainty throug a random model\n"); @@ -785,14 +650,6 @@ int main(int argc, char **argv) args.AddOption( &verbose, "-v", "--verbose", "-qu", "--quiet", "Print extra stuff"); - args.AddOption(&pool, - "-ptype", - "--pool-type", - "How to assign memory pools to AMSlib:\n" - "\t 'default' Use the default Umpire pool\n" - "\t 'split' provide a separate pool to AMSlib\n" - "\t 'same': assign the same with physics to AMS\n"); - // ------------------------------------------------------------------------- // parse arguments // ------------------------------------------------------------------------- @@ -807,6 +664,9 @@ int main(int argc, char **argv) std::cout << std::endl; } + ams::AMSExecPolicy ams_loadBalance = ams::AMSExecPolicy::AMS_UBALANCED; + if (lbalance) ams_loadBalance = ams::AMSExecPolicy::AMS_BALANCED; + // ------------------------------------------------------------------------- // additional argument validation // ------------------------------------------------------------------------- @@ -842,21 +702,11 @@ int main(int argc, char **argv) std::cout << "Total computed elements across all ranks: " << wS * num_elems << "(Weak Scaling)\n"; - if (strcmp(precision_opt, "single") == 0) - precision = AMSDType::AMS_SINGLE; - else if (strcmp(precision_opt, "double") == 0) - precision = AMSDType::AMS_DOUBLE; - else { - std::cerr << "Invalid precision " << precision_opt << "\n"; - return -1; - } - int ret = 0; - if (precision == AMSDType::AMS_SINGLE) + if (strcmp(precision_opt, "single") == 0) ret = run(device_name, db_type, uq_policy_opt, - precision, seed, rId, imbalance, @@ -864,7 +714,6 @@ int main(int argc, char **argv) avg, stdDev, threshold, - pool, num_mats, num_elems, num_qpts, @@ -873,16 +722,13 @@ int main(int argc, char **argv) eos_name, stop_cycle, pack_sparse_mats, - 
hdcache_path, model_path, db_config, - lbalance, - k_nearest); - else if (precision == AMSDType::AMS_DOUBLE) + ams_loadBalance); + else if (strcmp(precision_opt, "double") == 0) ret = run(device_name, db_type, uq_policy_opt, - precision, seed, rId, imbalance, @@ -890,7 +736,6 @@ int main(int argc, char **argv) avg, stdDev, threshold, - pool, num_mats, num_elems, num_qpts, @@ -899,22 +744,14 @@ int main(int argc, char **argv) eos_name, stop_cycle, pack_sparse_mats, - hdcache_path, model_path, db_config, - lbalance, - k_nearest); + ams_loadBalance); else { std::cerr << "Invalid precision " << precision_opt << "\n"; return -1; } - // --------------------------------------------------------------------------- -#ifdef __AMS_ENABLE_ADIAK__ - // adiak finalize - adiak::fini(); -#endif - MPI_CALL(MPI_Finalize()); AMSFinalize(); return ret; diff --git a/scripts/gitlab/ci-build-test.sh b/scripts/gitlab/ci-build-test.sh index e3ba63b3..ca85960e 100755 --- a/scripts/gitlab/ci-build-test.sh +++ b/scripts/gitlab/ci-build-test.sh @@ -1,4 +1,5 @@ #!/bin/bash +echo ${CI_PROJECT_DIR} source scripts/gitlab/setup-env.sh @@ -16,22 +17,21 @@ cleanup() { } build_and_test() { - WITH_TORCH=${1} - WITH_FAISS=${2} - WITH_HDF5=${3} - WITH_MPI=${4} + WITH_HDF5=${1} + WITH_MPI=${2} + WITH_WORKFLOW=${3} echo "*******************************************************************************************" echo "Build configuration" \ - "WITH_TORCH ${WITH_TORCH}" \ - "WITH_FAISS ${WITH_FAISS}" \ "WITH_HDF5 ${WITH_HDF5}" \ "WITH_MPI ${WITH_MPI}" \ + "WITH_MPI ${WITH_WORKFLOW}" \ "WITH_CUDA ${WITH_CUDA}" echo "*******************************************************************************************" - mkdir -p /tmp/ams - pushd /tmp/ams + build_dir="/tmp/ams/$(uuidgen)" + mkdir -p ${build_dir} + pushd ${build_dir} cleanup @@ -45,23 +45,16 @@ build_and_test() { -DCMAKE_PREFIX_PATH=$INSTALL_DIR \ -DWITH_CALIPER=On \ -DWITH_HDF5=${WITH_HDF5} \ - -DWITH_EXAMPLES=On \ -DAMS_HDF5_DIR=$AMS_HDF5_PATH \ 
-DCMAKE_INSTALL_PREFIX=./install \ -DCMAKE_BUILD_TYPE=Release \ -DCUDA_ARCH=$AMS_CUDA_ARCH \ -DWITH_CUDA=${WITH_CUDA} \ - -DUMPIRE_DIR=$AMS_UMPIRE_PATH \ - -DMFEM_DIR=$AMS_MFEM_PATH \ - -DWITH_FAISS=${WITH_FAISS} \ -DWITH_MPI=${WITH_MPI} \ - -DWITH_TORCH=${WITH_TORCH} \ -DWITH_TESTS=On \ -DTorch_DIR=$AMS_TORCH_PATH \ - -DFAISS_DIR=$AMS_FAISS_PATH \ -DWITH_AMS_DEBUG=On \ - -DWITH_WORKFLOW=On \ - -DWITH_ADIAK=On \ + -DWITH_WORKFLOW=${WITH_WORKFLOW} \ ${CI_PROJECT_DIR} || { echo "CMake failed"; exit 1; } make -j || { echo "Building failed"; exit 1; } @@ -71,13 +64,13 @@ build_and_test() { cleanup popd - rm -rf /tmp/ams + rm -rf ${build_dir} } -# build_and_test WITH_TORCH WITH_FAISS WITH_HDF5 WITH_MPI -build_and_test "On" "On" "On" "On" -build_and_test "On" "On" "On" "Off" -build_and_test "Off" "On" "On" "On" -build_and_test "Off" "Off" "On" "On" -build_and_test "Off" "Off" "Off" "On" +# build_and_test WITH_HDF5 WITH_MPI +build_and_test "On" "On" "Off" +build_and_test "On" "Off" "Off" +build_and_test "Off" "On" "Off" +build_and_test "Off" "Off" "Off" +build_and_test "Off" "Off" "On" diff --git a/src/AMSWorkflow/ams/faccessors.py b/src/AMSWorkflow/ams/faccessors.py index c5d7a218..e250d881 100644 --- a/src/AMSWorkflow/ams/faccessors.py +++ b/src/AMSWorkflow/ams/faccessors.py @@ -5,7 +5,6 @@ # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception import argparse -import csv from abc import ABC, abstractmethod from pathlib import Path @@ -57,61 +56,6 @@ def load(self) -> tuple: raise NotImplementedError -class CSVReader(FileReader): - """ - A CSV File Reader - """ - - suffix = "csv" - - def __init__(self, file_name: str, delimiter: str = ":"): - super().__init__() - self.file_name = file_name - self.delimiter = delimiter - self.fd = None - - def open(self): - self.fd = open(self.file_name, "r") - return self - - def close(self): - self.fd.close() - - def __enter__(self): - self.open() - return self - - def __exit__(self, exc_type, exc_val, exc_tb): - self.close() - - 
def load(self) -> tuple: - """ - load the data in the file and return a tupple of the inputs, outputs - - We assume the file is produced by the C/C++ front-end. Thus the file - will have a generic header row specifying the inputs/outputs - - Returns: - A tuple of None, input, output data values - """ - - if self.fd and self.fd.closed: - return None, None - - file_data = list(csv.reader(self.fd, delimiter=self.delimiter)) - header = file_data[0] - data = file_data[1:] - output_start = header.index("output_0") - data = np.array(data) - input_data = data[:, :output_start] - output_data = data[:, output_start:] - return (None, input_data.astype(np.float64), output_data.astype(np.float64)) - - @classmethod - def get_file_format_suffix(cls): - return cls.suffix - - class HDF5CLibReader(FileReader): """ An HDF5 reader for files generated directly by the C/C++ code. @@ -270,68 +214,6 @@ def store(self, inputs, outputs) -> int: raise NotImplementedError -class CSVWriter(FileWriter): - """ - A simple CSV backend. 
- """ - - suffix = "csv" - - def __init__(self, file_name: str, delimiter: str = ":"): - super().__init__() - self.file_name = file_name - self.delimiter = delimiter - self.fd = None - self.write_header = False - - def __str__(self) -> str: - return f"{__class__.__name__}(fd={self.fd}, delimiter={self.delimiter})" - - def open(self): - if not Path(self.file_name).exists(): - self.write_header = True - - self.fd = open(self.file_name, "a") - return self - - def close(self): - self.write_header = False - self.fd.close() - - def __enter__(self): - return self.open() - - def __exit__(self, exc_type, exc_val, exc_tb): - self.close() - - def store(self, inputs: np.array, outputs: np.array) -> int: - """Store the two arrays in a CSV file""" - assert len(inputs) == len(outputs) - if self.fd and self.fd.closed: - return 0 - if self.write_header: - writer = csv.DictWriter( - self.fd, - fieldnames=[f"input_{i}" for i in range(inputs.shape[-1])] - + [f"output_{i}" for i in range(outputs.shape[-1])], - delimiter=self.delimiter, - ) - writer.writeheader() - self.write_header = False - - csvwriter = csv.writer(self.fd, delimiter=self.delimiter, quotechar="'", quoting=csv.QUOTE_MINIMAL) - nelem = len(inputs) - elem_wrote: int = 0 - # We follow the mini-app format, inputs elem and then output elems - for i in range(nelem): - elem_wrote += csvwriter.writerow(np.concatenate((inputs[i], outputs[i]), axis=0)) - return elem_wrote - - @classmethod - def get_file_format_suffix(cls): - return cls.suffix - - class HDF5Writer(FileWriter): """ A simple hdf5 backend. 
@@ -422,7 +304,7 @@ def get_reader(ftype="dhdf5"): Factory method return a AMS file reader depending on the requested filetype """ - readers = {"shdf5": HDF5CLibReader, "dhdf5": HDF5PackedReader, "csv": CSVReader} + readers = {"shdf5": HDF5CLibReader, "dhdf5": HDF5PackedReader} return readers[ftype] @@ -431,21 +313,19 @@ def get_writer(ftype="shdf5"): Factory method return a AMS file writer depending on the requested filetype """ - writers = {"shdf5": HDF5Writer, "dhdf5": HDF5PackedWriter, "csv": CSVWriter} + writers = {"shdf5": HDF5Writer, "dhdf5": HDF5PackedWriter} return writers[ftype] def main(): parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) - parser.add_argument("--type", "-t", help="version to assign to data file", choices=["hdf5", "csv"], required=True) + parser.add_argument("--type", "-t", help="version to assign to data file", choices=["hdf5"], required=True) parser.add_argument("--action", "-a", help="action", choices=["write", "read"], required=True) parser.add_argument("filename") args = parser.parse_args() if args.action == "write": - if args.type == "csv": - db = CSVWriter - elif args.type == "hdf5": + if args.type == "hdf5": db = HDF5PackedWriter with db(args.filename) as fd: @@ -454,9 +334,7 @@ def main(): fd.store(inputs, outputs) elif args.action == "read": - if args.type == "csv": - db = CSVReader - elif args.type == "hdf5": + if args.type == "hdf5": db = HDF5PackedReader with db(args.filename) as fd: diff --git a/src/AMSWorkflow/ams/rmq.py b/src/AMSWorkflow/ams/rmq.py index d342140b..06f4505e 100644 --- a/src/AMSWorkflow/ams/rmq.py +++ b/src/AMSWorkflow/ams/rmq.py @@ -53,24 +53,22 @@ def header_format(self) -> str: This string represents the AMS format in Python pack format: See https://docs.python.org/3/library/struct.html#format-characters - 1 byte is the size of the header (here 12). Limit max: 255 - - 1 byte is the precision (4 for float, 8 for double). 
Limit max: 255 - 2 bytes are the MPI rank (0 if AMS is not running with MPI). Limit max: 65535 - 2 bytes to store the size of the MSG domain name. Limit max: 65535 - - 4 bytes are the number of elements in the message. Limit max: 2^32 - 1 - 2 bytes are the input dimension. Limit max: 65535 - 2 bytes are the output dimension. Limit max: 65535 - - 2 bytes are for aligning memory to 8 + - 3 bytes are for aligning memory to 4 - |_Header_|_Datatype_|_Rank_|_DomainSize_|_#elems_|_InDim_|_OutDim_|_Pad_|_DomainName_|.Real_Data.| + |_Header_|_Rank_|_DomainSize_|_InDim_|_OutDim_|_Pad_|_DomainName_|.Real_Data.| - Then the data starts at byte 16 with the domain name, then the real data and + Then the data starts at byte 12 with the domain name, then the real data and is structured as pairs of input/outputs. Let K be the total number of elements, - then we have K pairs of inputs/outputs (either float or double): + then we have K inputs followed by K outputs: - |__Header_(16B)__|_Domain_Name_|__Input 1__|__Output 1__|...|__Input_K__|__Output_K__| + |__Header_(16B)__|_Domain_Name_|__Input 1__|...|__Input_K__|__Output_1__|... 
""" - return "BBHHIHHH" + return "BHHHH3x" def endianness(self) -> str: """ @@ -90,7 +88,7 @@ def encode( """ For debugging and testing purposes, this function encode a message identical to what AMS would send """ - header_format = self.ams_endianness() + self.ams_header_format() + header_format = self.endianness() + self.ams_header_format() hsize = struct.calcsize(header_format) assert dtype_byte in [4, 8] dt = "f" if dtype_byte == 4 else "d" @@ -127,29 +125,13 @@ def _parse_header(self, body: str) -> dict: # Parse header ( res["hsize"], - res["datatype"], res["mpirank"], res["domain_size"], - res["num_element"], res["input_dim"], res["output_dim"], - res["padding"], ) = struct.unpack(fmt, body[:hsize]) - assert hsize == res["hsize"], f"Hsize is {hsize} expected value is {res['hsize']}" - assert res["datatype"] in [4, 8] - if len(body) < hsize: - print(f"Incomplete message of size {len(body)}. Header should be of size {hsize}. skipping") - return {} - - # Theoritical size in Bytes for the incoming message (without the header) - # Int() is needed otherwise we might overflow here (because of uint16 / uint8) - res["dsize"] = int(res["datatype"]) * int(res["num_element"]) * (int(res["input_dim"]) + int(res["output_dim"])) - res["msg_size"] = hsize + res["dsize"] - res["multiple_msg"] = len(body) != res["msg_size"] - self.num_elements = int(res["num_element"]) self.hsize = int(res["hsize"]) - self.dtype_byte = int(res["datatype"]) self.mpi_rank = int(res["mpirank"]) self.domain_name_size = int(res["domain_size"]) self.input_dim = int(res["input_dim"]) @@ -157,53 +139,74 @@ def _parse_header(self, body: str) -> dict: return res + def _parse_tensor(self, body: str, offset: int): + start = offset + (num_dims,) = struct.unpack_from(self.endianness() + "Q", body, offset) + offset += 8 + + (total_bytes,) = struct.unpack_from(self.endianness() + "Q", body, offset) + offset += 8 + + shapes_fmt = self.endianness() + "Q" * num_dims + shapes = struct.unpack_from(shapes_fmt, body, 
offset) + offset += 8 * num_dims + + strides = struct.unpack_from(shapes_fmt, body, offset) + offset += 8 * num_dims + + tensor_data = body[offset : offset + total_bytes] + offset += total_bytes + + return num_dims, shapes, strides, tensor_data, offset + def _parse_data(self, body: str, header_info: dict) -> Tuple[str, np.array, np.array]: data = np.array([]) if len(body) == 0: return data hsize = header_info["hsize"] - dsize = header_info["dsize"] domain_name_size = header_info["domain_size"] domain_name = body[hsize : hsize + domain_name_size] domain_name = domain_name.decode("utf-8") - try: - if header_info["datatype"] == 4: # if datatype takes 4 bytes (float) - data = np.frombuffer( - body[hsize + domain_name_size : hsize + domain_name_size + dsize], - dtype=np.float32, - ) - else: - data = np.frombuffer( - body[hsize + domain_name_size : hsize + domain_name_size + dsize], - dtype=np.float64, - ) - except ValueError as e: - print(f"Error: {e} => {header_info}") - return np.array([]) - - idim = header_info["input_dim"] - odim = header_info["output_dim"] - data = data.reshape((idim + odim, -1)).transpose() + inputs = [] + offset = hsize + domain_name_size + dtype = np.dtype(self.endianness() + "f4") + + for i in range(0, header_info["input_dim"]): + num_dims, shapes, strides, data, offset = self._parse_tensor(body, offset) + ndarray = np.ndarray( + shape=shapes, dtype=dtype, buffer=data, strides=tuple(s * dtype.itemsize for s in strides) + ) + inputs.append(ndarray) + + outputs = [] + for i in range(0, header_info["output_dim"]): + num_dims, shapes, strides, data, offset = self._parse_tensor(body, offset) + ndarray = np.ndarray( + shape=shapes, dtype=dtype, buffer=data, strides=tuple(s * dtype.itemsize for s in strides) + ) + outputs.append(ndarray) + # Return input, output - return (domain_name, data[:, :idim], data[:, idim:]) + + return offset, (domain_name, np.concatenate(inputs, axis=-1), np.concatenate(outputs, axis=-1)) def _decode(self, body: str) -> 
Tuple[np.array]: - input = [] - output = [] + inputs = [] + outputs = [] # Multiple AMS messages could be packed in one RMQ message # TODO: we should manage potential mutliple messages per AMSMessage better + while body: header_info = self._parse_header(body) - domain_name, temp_input, temp_output = self._parse_data(body, header_info) + offset, (domain_name, temp_input, temp_output) = self._parse_data(body, header_info) # print(f"MSG: {domain_name} input shape {temp_input.shape} outpute shape {temp_output.shape}") # total size of byte we read for that message - chunk_size = header_info["hsize"] + header_info["dsize"] + header_info["domain_size"] - input.append(temp_input) - output.append(temp_output) + inputs.append(temp_input) + outputs.append(temp_output) # We remove the current message and keep going - body = body[chunk_size:] + body = body[offset:] self.domain_names.append(domain_name) - return domain_name, np.concatenate(input), np.concatenate(output) + return domain_name, np.concatenate(inputs), np.concatenate(outputs) def decode(self) -> Tuple[str, np.array, np.array]: return self._decode(self.body) @@ -211,6 +214,7 @@ def decode(self) -> Tuple[str, np.array, np.array]: def default_ams_callback(method, properties, body): """Simple callback that decode incoming message assuming they are AMS binary messages""" + return AMSMessage(body) diff --git a/src/AMSWorkflow/ams/stage.py b/src/AMSWorkflow/ams/stage.py index 758099a8..c7103cf8 100644 --- a/src/AMSWorkflow/ams/stage.py +++ b/src/AMSWorkflow/ams/stage.py @@ -703,7 +703,7 @@ class Pipeline(ABC): """ supported_policies = {"sequential", "thread", "process"} - supported_writers = {"shdf5", "dhdf5", "csv"} + supported_writers = {"shdf5", "dhdf5"} def __init__(self, application_name, dest_dir, db_url, db_type="dhdf5"): """ @@ -965,7 +965,7 @@ class FSPipeline(Pipeline): src_type: The file format of the source data """ - supported_readers = ("shdf5", "dhdf5", "csv") + supported_readers = ("shdf5", "dhdf5") def 
__init__( self, application_name, dest_dir, db_url, db_type, src, src_type, pattern diff --git a/src/AMSlib/AMS.cpp b/src/AMSlib/AMS.cpp index 3da6bc4e..c1817395 100644 --- a/src/AMSlib/AMS.cpp +++ b/src/AMSlib/AMS.cpp @@ -5,10 +5,9 @@ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception */ -#include "AMS.h" - #include -#ifdef __ENABLE_MPI__ + +#ifdef __AMS_ENABLE_MPI__ #include #endif #include @@ -22,27 +21,17 @@ #include #include -#include "include/AMS.h" -#include "ml/uq.hpp" +#include "AMS.h" #include "wf/basedb.hpp" #include "wf/debug.h" #include "wf/logger.hpp" #include "wf/resource_manager.hpp" #include "wf/workflow.hpp" -static int get_rank_id() +using namespace ams; + +namespace { - if (const char *flux_id = std::getenv("FLUX_TASK_RANK")) { - return std::stoi(flux_id); - } else if (const char *rid = std::getenv("SLURM_PROCID")) { - return std::stoi(rid); - } else if (const char *jsm = std::getenv("JSM_NAMESPACE_RANK")) { - return std::stoi(jsm); - } else if (const char *pmi = std::getenv("PMIX_RANK")) { - return std::stoi(pmi); - } - return 0; -} struct AMSAbstractModel { enum UQAggrType { @@ -53,19 +42,15 @@ struct AMSAbstractModel { public: std::string SPath; - std::string UQPath; std::string DBLabel; bool DebugDB; double threshold; AMSUQPolicy uqPolicy; - int nClusters; static AMSUQPolicy getUQType(std::string type) { if (type.compare("deltaUQ") == 0) { return AMSUQPolicy::AMS_DELTAUQ_MEAN; - } else if (type.compare("faiss") == 0) { - return AMSUQPolicy::AMS_FAISS_MEAN; } else if (type.compare("random") == 0) { return AMSUQPolicy::AMS_RANDOM; } else { @@ -102,33 +87,18 @@ struct AMSAbstractModel { } - void parseUQPaths(AMSUQPolicy policy, nlohmann::json &jRoot) + std::string parseSurrogatePaths(nlohmann::json &jRoot) { - /* - * Empty models can exist in cases were the user annotates - * the code without having data to train a model. 
In such a case, - * the user deploys without specifying the model and lib AMS will - * collect everything - */ - if (!jRoot.contains("model_path")) { - SPath = ""; - } else { - SPath = jRoot["model_path"].get(); - } - - DBG(AMS, "Model Is Random or DeltaUQ %s %u", SPath.c_str(), policy); - if (BaseUQ::isRandomUQ(policy) || BaseUQ::isDeltaUQ(policy)) { - UQPath = ""; - return; + std::string path = ""; + if (jRoot.contains("model_path")) { + path = jRoot["model_path"].get(); + CFATAL(AMS, + (!path.empty() && !fs::exists(path)), + "Path '%s' to model does not exist\n", + path.c_str()); } - - if (!jRoot.contains("faiss_path")) { - THROW(std::runtime_error, - "Model is of UQ type 'faiss' and thus expecting a path to FAISS"); - } - - UQPath = jRoot["faiss_path"].get(); + return path; } @@ -150,7 +120,7 @@ struct AMSAbstractModel { THROW(std::runtime_error, "Model must specify the UQ type"); } - if (!BaseUQ::isUQPolicy(policy)) { + if (!UQ::isUQPolicy(policy)) { THROW(std::runtime_error, "UQ Policy is not supported"); } @@ -162,26 +132,18 @@ struct AMSAbstractModel { } - if ((BaseUQ::isDeltaUQ(policy) || BaseUQ::isFaissUQ(policy)) && - uqAggregate == UQAggrType::Unknown) { + if (UQ::isDeltaUQ(policy) && uqAggregate == UQAggrType::Unknown) { THROW(std::runtime_error, "UQ Type should be defined or set to undefined value"); } - if (uqAggregate == Max) { - if (BaseUQ::isDeltaUQ(policy)) { + if (UQ::isDeltaUQ(policy)) { + if (uqAggregate == Max) policy = AMSUQPolicy::AMS_DELTAUQ_MAX; - } else if (BaseUQ::isFaissUQ(policy)) { - policy = AMSUQPolicy::AMS_FAISS_MAX; - } - } else if (uqAggregate == Mean) { - if (BaseUQ::isDeltaUQ(policy)) { + else if (uqAggregate == Mean) policy = AMSUQPolicy::AMS_DELTAUQ_MEAN; - } else if (BaseUQ::isFaissUQ(policy)) { - policy = AMSUQPolicy::AMS_FAISS_MEAN; - } } - DBG(AMS, "UQ Policy is %s", BaseUQ::UQPolicyToStr(policy).c_str()) + DBG(AMS, "UQ Policy is %s", UQ::UQPolicyToStr(policy).c_str()) return policy; } @@ -192,10 +154,6 @@ struct 
AMSAbstractModel { uqPolicy = parseUQPolicy(value); - if (BaseUQ::isFaissUQ(uqPolicy)) { - nClusters = parseClusters(value); - } - if (!value.contains("threshold")) { THROW(std::runtime_error, "Model must define threshold value (threshold < 0 always " @@ -203,7 +161,7 @@ struct AMSAbstractModel { "model)"); } threshold = value["threshold"].get(); - parseUQPaths(uqPolicy, value); + SPath = parseSurrogatePaths(value); DBLabel = parseDBLabel(value); DebugDB = parseDebugDB(value); @@ -216,10 +174,8 @@ struct AMSAbstractModel { AMSAbstractModel(AMSUQPolicy uq_policy, const char *surrogate_path, - const char *uq_path, const char *db_label, - double threshold, - int num_clusters) + double threshold) { DebugDB = false; if (db_label == nullptr) @@ -227,7 +183,7 @@ struct AMSAbstractModel { DBLabel = std::string(db_label); - if (!BaseUQ::isUQPolicy(uq_policy)) { + if (!UQ::isUQPolicy(uq_policy)) { FATAL(AMS, "Invalid UQ policy %d", uq_policy) } @@ -235,13 +191,10 @@ struct AMSAbstractModel { if (surrogate_path != nullptr) SPath = std::string(surrogate_path); - if (uq_path != nullptr) UQPath = std::string(uq_path); - this->threshold = threshold; - nClusters = num_clusters; DBG(AMS, "Registered Model %s %g", - BaseUQ::UQPolicyToStr(uqPolicy).c_str(), + UQ::UQPolicyToStr(uqPolicy).c_str(), threshold); } @@ -249,13 +202,11 @@ struct AMSAbstractModel { void dump() { if (!SPath.empty()) DBG(AMS, "Surrogate Model Path: %s", SPath.c_str()); - if (!UQPath.empty()) DBG(AMS, "UQ-Model: %s", UQPath.c_str()); DBG(AMS, - "db-Label: %s threshold %f UQ-Policy: %u nClusters: %d", + "db-Label: %s threshold %f UQ-Policy: %u", DBLabel.c_str(), threshold, - uqPolicy, - nClusters); + uqPolicy); } }; @@ -269,7 +220,7 @@ class AMSWrap using json = nlohmann::json; public: - std::vector> executors; + std::vector executors; std::vector> registered_models; std::unordered_map ams_candidate_models; AMSDBType dbType = AMSDBType::AMS_NONE; @@ -393,13 +344,16 @@ class AMSWrap rmq_cert = getEntry(rmq_entry, 
"rabbitmq-cert"); CFATAL(AMS, - (exchange == "" || routing_key == "") && update_surrogate, - "Found empty RMQ exchange / routing-key, model update is not possible. " - "Please provide a RMQ exchange or deactivate surrogate model " - "update.") - - if(exchange == "" || routing_key == "") { - WARNING(AMS, "Found empty RMQ exchange or routing-key, deactivating model update") + (exchange == "" || routing_key == "") && update_surrogate, + "Found empty RMQ exchange / routing-key, model update is not " + "possible. " + "Please provide a RMQ exchange or deactivate surrogate model " + "update.") + + if (exchange == "" || routing_key == "") { + WARNING(AMS, + "Found empty RMQ exchange or routing-key, deactivating model " + "update") update_surrogate = false; } @@ -431,93 +385,21 @@ class AMSWrap switch (dbType) { case AMSDBType::AMS_NONE: return; - case AMSDBType::AMS_CSV: case AMSDBType::AMS_HDF5: setupFSDB(entry, dbStrType); break; case AMSDBType::AMS_RMQ: setupRMQ(entry, dbStrType); break; - case AMSDBType::AMS_REDIS: - FATAL(AMS, "Cannot connect to REDIS database, missing implementation"); + default: + FATAL(AMS, "Unknown db-type"); } return; } - std::pair setup_loggers() - { - const char *ams_logger_level = std::getenv("AMS_LOG_LEVEL"); - const char *ams_logger_dir = std::getenv("AMS_LOG_DIR"); - const char *ams_logger_prefix = std::getenv("AMS_LOG_PREFIX"); - std::string log_fn(""); - std::string log_path("./"); - - auto logger = ams::util::Logger::getActiveLogger(); - bool enable_log = false; - - if (ams_logger_level) { - auto log_lvl = ams::util::getVerbosityLevel(ams_logger_level); - logger->setLoggingMsgLevel(log_lvl); - enable_log = true; - } - - // In the case we specify a directory and we do not specify a file - // by default we write to a file. 
- if (ams_logger_dir && !ams_logger_prefix) { - ams_logger_prefix = "ams"; - } - - if (ams_logger_prefix) { - // We are going to redirect stdout to some file - // By default we store to the current directory - std::string pattern(""); - std::string log_prefix(ams_logger_prefix); - - if (ams_logger_dir) { - log_path = std::string(ams_logger_dir); - } - - char hostname[HOST_NAME_MAX]; - if (gethostname(hostname, HOST_NAME_MAX) != 0) { - FATAL(AMS, "Get hostname returns error"); - } - - int id = 0; - if (log_prefix.find("") != std::string::npos) { - pattern = std::string(""); - id = get_rank_id(); - } else if (log_prefix.find("") != std::string::npos) { - pattern = std::string(""); - id = getpid(); - } - - // Combine hostname and pid - std::ostringstream combined; - combined << "." << hostname << "." << id; - - if (!pattern.empty()) { - log_path = fs::absolute(log_path).string(); - log_fn = - std::regex_replace(log_prefix, std::regex(pattern), combined.str()); - } else { - log_path = fs::absolute(log_path).string(); - log_fn = log_prefix + combined.str(); - } - } - logger->initialize_std_io_err(enable_log, log_path, log_fn); - - return std::make_pair(enable_log, log_path); - } - public: AMSWrap() : memManager(ams::ResourceManager::getInstance()) { - auto log_stats = setup_loggers(); - DBG(AMS, - "Enable Log %d stored under %s", - log_stats.first, - log_stats.second.c_str()) - memManager.init(); if (const char *object_descr = std::getenv("AMS_OBJECTS")) { @@ -542,9 +424,7 @@ class AMSWrap AMSUQPolicy uq_policy, double threshold, const char *surrogate_path, - const char *uq_path, - const char *db_label, - int num_clusters) + const char *db_label) { auto model = ams_candidate_models.find(domain_name); if (model != ams_candidate_models.end()) { @@ -554,13 +434,9 @@ class AMSWrap domain_name, registered_models[model->second].second.SPath.c_str()); } - registered_models.push_back(std::make_pair(std::string(domain_name), - AMSAbstractModel(uq_policy, - surrogate_path, - 
uq_path, - db_label, - threshold, - num_clusters))); + registered_models.push_back(std::make_pair( + std::string(domain_name), + AMSAbstractModel(uq_policy, surrogate_path, db_label, threshold))); ams_candidate_models.emplace(std::string(domain_name), registered_models.size() - 1); return registered_models.size() - 1; @@ -586,13 +462,7 @@ class AMSWrap ~AMSWrap() { for (auto E : executors) { - if (E.second != nullptr) { - if (E.first == AMSDType::AMS_DOUBLE) { - delete reinterpret_cast *>(E.second); - } else { - delete reinterpret_cast *>(E.second); - } - } + delete reinterpret_cast(E); } ams::util::close(); } @@ -602,166 +472,101 @@ static std::once_flag _amsInitFlag; static std::once_flag _amsFinalizeFlag; static std::unique_ptr _amsWrap; -void AMSInit() { - std::call_once(_amsInitFlag, [&]() { - DBG(AMS, "Initialization of AMS") - _amsWrap = std::make_unique(); - }); -} +ams::AMSWorkflow *_AMSCreateExecutor(AMSCAbstrModel model, + int process_id, + int world_size) +{ + CFATAL(AMS, _amsWrap == nullptr, "AMSInit has not been called.") + auto &model_descr = _amsWrap->get_model(model); -void AMSFinalize() { - std::call_once(_amsFinalizeFlag, [&]() { - DBG(AMS, "Finalization of AMS") - _amsWrap.reset(); - }); + ams::AMSWorkflow *WF = new ams::AMSWorkflow(model_descr.second.SPath, + model_descr.first, + model_descr.second.DBLabel, + model_descr.second.threshold, + model_descr.second.uqPolicy, + process_id, + world_size); + return WF; } -void _AMSExecute(AMSExecutor executor, - void *probDescr, - const int numElements, - const void **input_data, - void **output_data, - int inputDim, - int outputDim) +AMSExecutor _AMSRegisterExecutor(ams::AMSWorkflow *workflow) { CFATAL(AMS, _amsWrap == nullptr, "AMSInit has not been called.") - int64_t index = static_cast(executor); - if (index >= _amsWrap->executors.size()) - throw std::runtime_error("AMS Executor identifier does not exist\n"); - auto currExec = _amsWrap->executors[index]; - - if (currExec.first == 
AMSDType::AMS_DOUBLE) { - ams::AMSWorkflow *dWF = - reinterpret_cast *>(currExec.second); - dWF->evaluate(probDescr, - numElements, - reinterpret_cast(input_data), - reinterpret_cast(output_data), - inputDim, - outputDim); - } else if (currExec.first == AMSDType::AMS_SINGLE) { - ams::AMSWorkflow *sWF = - reinterpret_cast *>(currExec.second); - sWF->evaluate(probDescr, - numElements, - reinterpret_cast(input_data), - reinterpret_cast(output_data), - inputDim, - outputDim); - } else { - throw std::invalid_argument("Data type is not supported by AMSLib!"); - return; - } + _amsWrap->executors.push_back(static_cast(workflow)); + return static_cast(_amsWrap->executors.size()) - 1L; } +} // namespace -template -ams::AMSWorkflow *_AMSCreateExecutor(AMSCAbstrModel model, - AMSDType data_type, - AMSResourceType resource_type, - AMSPhysicFn call_back, - int process_id, - int world_size) +namespace ams { - CFATAL(AMS, _amsWrap == nullptr, "AMSInit has not been called.") - auto &model_descr = _amsWrap->get_model(model); - ams::AMSWorkflow *WF = - new ams::AMSWorkflow(call_back, - model_descr.second.UQPath, - model_descr.second.SPath, - model_descr.first, - model_descr.second.DBLabel, - model_descr.second.DebugDB, - resource_type, - model_descr.second.threshold, - model_descr.second.uqPolicy, - model_descr.second.nClusters, - process_id, - world_size); - return WF; +void AMSInit() +{ + std::call_once(_amsInitFlag, [&]() { + DBG(AMS, "Initialization of AMS") + _amsWrap = std::make_unique(); + }); } -template -AMSExecutor _AMSRegisterExecutor(AMSDType data_type, - ams::AMSWorkflow *workflow) +void AMSFinalize() { - CFATAL(AMS, _amsWrap == nullptr, "AMSInit has not been called.") - _amsWrap->executors.push_back( - std::make_pair(data_type, static_cast(workflow))); - return static_cast(_amsWrap->executors.size()) - 1L; + std::call_once(_amsFinalizeFlag, [&]() { + DBG(AMS, "Finalization of AMS") + _amsWrap.reset(); + }); } -#ifdef __cplusplus -extern "C" { -#endif - AMSExecutor 
AMSCreateExecutor(AMSCAbstrModel model, - AMSDType data_type, - AMSResourceType resource_type, - AMSPhysicFn call_back, int process_id, int world_size) { - if (data_type == AMSDType::AMS_DOUBLE) { - auto *dWF = _AMSCreateExecutor( - model, data_type, resource_type, call_back, process_id, world_size); - return _AMSRegisterExecutor(data_type, dWF); - - } else if (data_type == AMSDType::AMS_SINGLE) { - auto *sWF = _AMSCreateExecutor( - model, data_type, resource_type, call_back, process_id, world_size); - return _AMSRegisterExecutor(data_type, sWF); - } else { - throw std::invalid_argument("Data type is not supported by AMSLib!"); - return static_cast(-1); - } + auto *dWF = _AMSCreateExecutor(model, process_id, world_size); + return _AMSRegisterExecutor(dWF); } -#ifdef __ENABLE_MPI__ -AMSExecutor AMSCreateDistributedExecutor(AMSCAbstrModel model, - AMSDType data_type, - AMSResourceType resource_type, - AMSPhysicFn call_back, - MPI_Comm Comm, - int process_id, - int world_size) + +void AMSExecute(AMSExecutor executor, + EOSLambda &OrigComputation, + const ams::SmallVector &ins, + ams::SmallVector &inouts, + ams::SmallVector &outs) { - if (data_type == AMSDType::AMS_DOUBLE) { - auto *dWF = _AMSCreateExecutor( - model, data_type, resource_type, call_back, process_id, world_size); - dWF->set_communicator(Comm); - return _AMSRegisterExecutor(data_type, dWF); - - } else if (data_type == AMSDType::AMS_SINGLE) { - auto *sWF = _AMSCreateExecutor( - model, data_type, resource_type, call_back, process_id, world_size); - sWF->set_communicator(Comm); - return _AMSRegisterExecutor(data_type, sWF); - } else { - throw std::invalid_argument("Data type is not supported by AMSLib!"); - return static_cast(-1); - } + int64_t index = static_cast(executor); + if (index >= _amsWrap->executors.size()) + throw std::runtime_error("AMS Executor identifier does not exist\n"); + auto currExec = _amsWrap->executors[index]; + + ams::AMSWorkflow *workflow = reinterpret_cast(currExec); + DBG(AMS, + 
"Calling AMS with in:%ld, inout:%ld, out:%ld", + ins.size(), + inouts.size(), + outs.size()); + + callAMS(workflow, OrigComputation, ins, inouts, outs); } -#endif -void AMSExecute(AMSExecutor executor, - void *probDescr, - const int numElements, - const void **input_data, - void **output_data, - int inputDim, - int outputDim) +void AMSCExecute(AMSExecutor executor, + EOSCFn OrigCComputation, + void *args, + const ams::SmallVector &ins, + ams::SmallVector &inouts, + ams::SmallVector &outs) { - _AMSExecute(executor, - probDescr, - numElements, - input_data, - output_data, - inputDim, - outputDim); + + // Define the lambda and let the compiler deduce the type conversion to std::function + EOSLambda OrigComputation = + [&](const ams::SmallVector &ams_ins, + ams::SmallVector &ams_inouts, + ams::SmallVector &ams_outs) { + OrigCComputation(args, ams_ins, ams_inouts, ams_outs); + }; + + AMSExecute(executor, OrigComputation, ins, inouts, outs); } + void AMSDestroyExecutor(AMSExecutor executor) { CFATAL(AMS, _amsWrap == nullptr, "AMSInit has not been called.") @@ -770,14 +575,7 @@ void AMSDestroyExecutor(AMSExecutor executor) throw std::runtime_error("AMS Executor identifier does not exist\n"); auto currExec = _amsWrap->executors[index]; - if (currExec.first == AMSDType::AMS_DOUBLE) { - delete reinterpret_cast *>(currExec.second); - } else if (currExec.first == AMSDType::AMS_SINGLE) { - delete reinterpret_cast *>(currExec.second); - } else { - throw std::invalid_argument("Data type is not supported by AMSLib!"); - return; - } + delete reinterpret_cast(currExec); } @@ -798,21 +596,13 @@ AMSCAbstrModel AMSRegisterAbstractModel(const char *domain_name, AMSUQPolicy uq_policy, double threshold, const char *surrogate_path, - const char *uq_path, - const char *db_label, - int num_clusters) + const char *db_label) { - CFATAL(AMS, _amsWrap == nullptr, "AMSInit has not been called.") - std::cout << "_amsWrap = " << _amsWrap.get() << std::endl; + CFATAL(AMS, !_amsWrap, "AMSInit has not 
been called.") auto id = _amsWrap->get_model_index(domain_name); if (id == -1) { - id = _amsWrap->register_model(domain_name, - uq_policy, - threshold, - surrogate_path, - uq_path, - db_label, - num_clusters); + id = _amsWrap->register_model( + domain_name, uq_policy, threshold, surrogate_path, db_label); } return id; @@ -831,6 +621,17 @@ void AMSConfigureFSDatabase(AMSDBType db_type, const char *db_path) db_instance.instantiate_fs_db(db_type, std::string(db_path)); } -#ifdef __cplusplus + +#ifdef __AMS_ENABLE_MPI__ +AMSExecutor AMSCreateDistributedExecutor(AMSCAbstrModel model, + MPI_Comm Comm, + int process_id, + int world_size) + +{ + auto *dWF = _AMSCreateExecutor(model, process_id, world_size); + dWF->set_communicator(Comm); + return _AMSRegisterExecutor(dWF); } #endif +} // namespace ams diff --git a/src/AMSlib/AMSTensor.cpp b/src/AMSlib/AMSTensor.cpp new file mode 100644 index 00000000..20fa882b --- /dev/null +++ b/src/AMSlib/AMSTensor.cpp @@ -0,0 +1,215 @@ +#include + +#include "AMS.h" +#include "AMSTensor.hpp" +#include "ArrayRef.hpp" +#include "SmallVector.hpp" +#include "include/AMSTensor.hpp" +#include "wf/resource_manager.hpp" +#include "wf/utils.hpp" + +using namespace ams; + +/** + * @brief Computes the number of elements in the tensor given its shape. + * @param[in] shapes The shape of the tensor as an array reference. + * @return The total number of elements in the tensor. 
+ */ +template +static inline AMSTensor::IntDimType computeNumElements(ams::ArrayRef shapes) +{ + return std::accumulate(shapes.begin(), + shapes.end(), + 1, + std::multiplies()); +} +// Helper function to check if the tensor is contiguous in memory +bool AMSTensor::isContiguous(AMSTensor::IntDimType expected_stride) const +{ + for (int i = _shape.size() - 1; i >= 0; --i) { + if (_strides[i] != expected_stride) return false; + expected_stride *= _shape[i]; + } + return true; +} + +AMSTensor::AMSTensor(uint8_t* data, + ams::ArrayRef shapes, + ams::ArrayRef strides, + AMSDType dType, + AMSResourceType location, + bool view) + : _data(data), + _element_size(dtype_to_size(dType)), + _shape(shapes), + _strides(strides), + _dType(dType), + _location(location), + _owned(!view) +{ + _bytes = _elements * _element_size; + _elements = computeNumElements(shapes); + if (!_data) { + throw std::runtime_error("Generating tensor with Null Pointer AMSTensor."); + } +} + +template +AMSTensor AMSTensor::create(ams::ArrayRef shapes, + ams::ArrayRef strides, + AMSResourceType location) +{ + auto numElements = computeNumElements(shapes); + auto& rm = ams::ResourceManager::getInstance(); + if constexpr ((std::is_same_v) || + (std::is_same_v)) { + float* _data = rm.allocate(numElements, location, sizeof(float)); + return AMSTensor((uint8_t*)_data, shapes, strides, AMS_SINGLE, location); + } else if constexpr ((std::is_same_v) || + (std::is_same_v)) { + double* _data = rm.allocate(numElements, location, sizeof(double)); + return AMSTensor((uint8_t*)_data, shapes, strides, AMS_DOUBLE, location); + } else { + // This should never happen due to the type restriction + static_assert(std::is_same_v || + std::is_same_v, + "AMSTensor only supports float or double tensor creation"); + } +} + + +template +AMSTensor AMSTensor::view(FPType* data, + ams::ArrayRef shapes, + ams::ArrayRef strides, + AMSResourceType location) +{ + if constexpr ((std::is_same_v) || + (std::is_same_v)) { + return AMSTensor( 
+ (uint8_t*)data, shapes, strides, AMS_SINGLE, location, true); + } else if constexpr ((std::is_same_v) || + (std::is_same_v)) { + return AMSTensor( + (uint8_t*)data, shapes, strides, AMS_DOUBLE, location, true); + } else { + static_assert(std::is_same_v || + std::is_same_v || + std::is_same_v || + std::is_same_v, + "AMSTensor only supports float or double tensor view"); + } + throw std::runtime_error("Should never get here\n"); +} + +AMSTensor AMSTensor::view(AMSTensor& tensor) +{ + if (tensor._dType == AMS_DOUBLE) + return AMSTensor::view((double*)tensor._data, + tensor._shape, + tensor._strides, + tensor._location); + else if (tensor._dType == AMS_SINGLE) + return AMSTensor::view((float*)tensor._data, + tensor._shape, + tensor._strides, + tensor._location); + throw std::runtime_error( + "Creating view through copying constructor has incorrect dtype"); +} + +AMSTensor::~AMSTensor() +{ + // Only release whenwe own the pointer + if (_owned && _data) { + auto& rm = ams::ResourceManager::getInstance(); + rm.deallocate(_data, _location); + _data = nullptr; + _owned = false; + } +} + +AMSTensor::AMSTensor(AMSTensor&& other) noexcept + : _data(other._data), + _elements(other._elements), + _element_size(other._element_size), + _shape(std::move(other._shape)), + _strides(std::move(other._strides)), + _dType(other._dType), + _location(other._location), + _owned(other._owned) +{ + other._data = nullptr; + other._owned = false; +} + +AMSTensor& AMSTensor::operator=(AMSTensor&& other) noexcept +{ + if (this != &other) { + // Free existing resources + + // Steal resources from `other` + _data = other._data; + _elements = other._elements; + _element_size = other._element_size; + _shape = std::move(other._shape); + _strides = std::move(other._strides); + _dType = other._dType; + _location = other._location; + _owned = other._owned; + + other._data = nullptr; + other._owned = false; + } + return *this; +} + +AMSTensor AMSTensor::transpose(AMSTensor::IntDimType axis1, + 
AMSTensor::IntDimType axis2) const +{ + // Ensure the axes are within bounds + if (axis1 >= _shape.size() || axis2 >= _shape.size()) { + throw std::out_of_range("Transpose axes are out of bounds"); + } + + // Create new shape and strides for the transposed tensor + auto newShape = _shape; + auto newStrides = _strides; + + // Swap the specified axes in both shape and strides + std::swap(newShape[axis1], newShape[axis2]); + std::swap(newStrides[axis1], newStrides[axis2]); + + // Create a new tensor with the same data, new shape, and strides + if (dType() == AMSDType::AMS_DOUBLE) + return view((double*)_data, newShape, newStrides, _location); + else if (dType() == AMSDType::AMS_SINGLE) + return view((float*)_data, newShape, newStrides, _location); + + throw std::runtime_error("Unknow data type in transpose\n"); +} + +template AMSTensor AMSTensor::create(ams::ArrayRef, + ams::ArrayRef, + AMSResourceType); +template AMSTensor AMSTensor::create(ams::ArrayRef, + ams::ArrayRef, + AMSResourceType); + +template AMSTensor AMSTensor::view(float*, + ams::ArrayRef, + ams::ArrayRef, + AMSResourceType); +template AMSTensor AMSTensor::view(double*, + ams::ArrayRef, + ams::ArrayRef, + AMSResourceType); + +template AMSTensor AMSTensor::view(const float*, + ams::ArrayRef, + ams::ArrayRef, + AMSResourceType); +template AMSTensor AMSTensor::view(const double*, + ams::ArrayRef, + ams::ArrayRef, + AMSResourceType); diff --git a/src/AMSlib/CMakeLists.txt b/src/AMSlib/CMakeLists.txt index d48ca998..8e94c075 100644 --- a/src/AMSlib/CMakeLists.txt +++ b/src/AMSlib/CMakeLists.txt @@ -4,14 +4,7 @@ # ------------------------------------------------------------------------------ # handle sources and headers -file(GLOB_RECURSE MINIAPP_INCLUDES "*.hpp") -#set global library path to link with tests if necessary -set(LIBRARY_OUTPUT_PATH ${AMS_LIB_OUT_PATH}) -set(AMS_LIB_SRC ${MINIAPP_INCLUDES} AMS.cpp wf/resource_manager.cpp wf/debug.cpp wf/basedb.cpp wf/logger.cpp wf/utils.cpp util/SmallVector.cpp) 
- -if (WITH_CUDA) - list(APPEND AMS_LIB_SRC wf/cuda/utilities.cpp) -endif() +set(AMS_LIB_SRC wf/debug.cpp wf/logger.cpp wf/utils.cpp wf/SmallVector.cpp ml/surrogate.cpp wf/basedb.cpp AMSTensor.cpp wf/interface.cpp wf/resource_manager.cpp AMS.cpp) if (WITH_HDF5) list(APPEND AMS_LIB_SRC wf/hdf5db.cpp) @@ -21,74 +14,74 @@ if (WITH_RMQ) list(APPEND AMS_LIB_SRC wf/rmqdb.cpp) endif() +add_library(AMS ${AMS_LIB_SRC}) - -# two targets: a shared lib and an exec -add_library(AMS ${AMS_LIB_SRC} ${MINIAPP_INCLUDES}) +get_target_property(AMS_TYPE AMS TYPE) # ------------------------------------------------------------------------------ -if (WITH_CUDA) - - set_target_properties(AMS PROPERTIES CUDA_ARCHITECTURES ${AMS_CUDA_ARCH}) +# setup the lib first +message(STATUS "ALL INCLUDES ARE ${AMS_APP_INCLUDES}") +target_compile_definitions(AMS PRIVATE ${AMS_APP_DEFINES}) +message(STATUS "All AMS Internal Defines are ${AMS_APP_DEFINES}") +target_include_directories(AMS PRIVATE + $) +target_include_directories(AMS PUBLIC + $ + $ +) - # if (BUILD_SHARED_LIBS) - # set_target_properties(AMS PROPERTIES CUDA_SEPARABLE_COMPILATION ON) - # else() - # set_target_properties(AMS PROPERTIES CUDA_SEPARABLE_COMPILATION ON) - # set_target_properties(AMS PROPERTIES CUDA_RESOLVE_DEVICE_SYMBOLS ON) - # endif() - set_source_files_properties(wf/cuda/utilities.cpp PROPERTIES LANGUAGE CUDA) - set_source_files_properties(wf/cuda/utilities.cpp PROPERTIES CUDA_ARCHITECTURES ${AMS_CUDA_ARCH}) - set_source_files_properties(wf/cuda/utilities.cpp PROPERTIES COMPILE_FLAGS "--expt-extended-lambda") +if (WITH_CUDA) + target_link_libraries(AMS PRIVATE CUDA::cudart) +endif() - if (WITH_PERFFLOWASPECT) - set_property(SOURCE AMS.cpp APPEND_STRING PROPERTY COMPILE_FLAGS " -Xcompiler=-Xclang -Xcompiler=-load -Xcompiler=-Xclang -Xcompiler=${PERFFLOWASPECT_LIB_DIR}/libWeavePass.so") - set_source_files_properties(wf/resource_manager.cpp COMPILE_FLAGS "-Xclang -load -Xclang ${PERFFLOWASPECT_LIB_DIR}/libWeavePass.so") - endif() 
+if (WITH_HDF5) + target_link_libraries(AMS PUBLIC $ PRIVATE $) endif() -# ------------------------------------------------------------------------------ -# setup the lib first -message(STATUS "ALL INCLUDES ARE ${AMS_APP_INCLUDES}") -target_compile_definitions(AMS PRIVATE ${AMS_APP_DEFINES}) -target_include_directories(AMS PRIVATE ${AMS_APP_INCLUDES}) -target_include_directories(AMS PUBLIC - $ - $) -target_include_directories(AMS PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}) -target_link_directories(AMS PUBLIC ${AMS_APP_LIB_DIRS}) -target_link_libraries(AMS PRIVATE ${AMS_APP_LIBRARIES} stdc++fs) - -#------------------------------------------------------------------------------- -# create the configuration header file with the respective information -#------------------------------------------------------------------------------- -set(CALIPER_DEFINES "// #define __AMS_ENABLE_CALIPER__") -set(MPI_DEFINES "// #define __AMS_ENABLE_MPI__") -set(PERFF_DEFINES "// #define __AMS_ENABLE_PERFFLOWASPECT__") - -if (${WITH_CALIPER}) - set(CALIPER_DEFINES "#define __AMS_ENABLE_CALIPER__") +if (WITH_CALIPER) + target_link_libraries(AMS PUBLIC $ PRIVATE $) endif() -if (${WITH_MPI}) - set(MPI_DEFINES "#define __AMS_ENABLE_MPI__") +if (WITH_MPI) + target_link_libraries(AMS PUBLIC $ PRIVATE $) endif() -if (${WITH_PERFFLOWASPECT}) - set(PERFF_DEFINES "#define __AMS_ENABLE_PERFFLOWASPECT__") +if (WITH_RMQ) + target_link_libraries(AMS PUBLIC $ PRIVATE $) + + if (OPENSSL_FOUND) + target_link_libraries(AMS PUBLIC $ PRIVATE + $) + endif() + # NOTE: We set here the event/event pthreads as public. As there is no easy way + # to do a find package(libevent) and RMQ is not exposing that properly. 
+ message(STATUS "Event libs are ${LIBEVENT_LIBRARY}") + message(STATUS "Event libs are ${LIBEVENT_LIBRARY} and ${LIBEVENT_THREAD}") + target_link_libraries(AMS PUBLIC ${LIBEVENT_LIBRARY} ${LIBEVENT_THREAD}) endif() +target_link_libraries(AMS PUBLIC + $ PRIVATE + $) + + configure_file ("${CMAKE_CURRENT_SOURCE_DIR}/include/AMS-config.h.in" "${PROJECT_BINARY_DIR}/include/AMS-config.h") configure_file ("${CMAKE_CURRENT_SOURCE_DIR}/include/AMS.h" "${PROJECT_BINARY_DIR}/include/AMS.h" COPYONLY) +configure_file ("${CMAKE_CURRENT_SOURCE_DIR}/include/AMSTypes.hpp" "${PROJECT_BINARY_DIR}/include/AMSTypes.hpp" COPYONLY) +configure_file ("${CMAKE_CURRENT_SOURCE_DIR}/include/SmallVector.hpp" "${PROJECT_BINARY_DIR}/include/SmallVector.hpp" COPYONLY) +configure_file ("${CMAKE_CURRENT_SOURCE_DIR}/include/ArrayRef.hpp" "${PROJECT_BINARY_DIR}/include/ArrayRef.hpp" COPYONLY) +configure_file ("${CMAKE_CURRENT_SOURCE_DIR}/include/AMSTensor.hpp" "${PROJECT_BINARY_DIR}/include/AMSTensor.hpp" COPYONLY) # setup the exec #SET(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -Wl,-rpath -Wl,$ORIGIN") # ------------------------------------------------------------------------------ # installation paths -# Install the AMS library + include(GNUInstallDirs) include(CMakePackageConfigHelpers) + +# Install the AMS library install(TARGETS AMS EXPORT AMSTargets LIBRARY DESTINATION "${CMAKE_INSTALL_LIBDIR}" # For shared libraries @@ -106,7 +99,10 @@ install(EXPORT AMSTargets # Install the public headers install(FILES ${PROJECT_BINARY_DIR}/include/AMS.h - ${PROJECT_BINARY_DIR}/include/AMS-config.h + ${PROJECT_BINARY_DIR}/include/AMSTensor.hpp + ${PROJECT_BINARY_DIR}/include/AMSTypes.hpp + ${PROJECT_BINARY_DIR}/include/ArrayRef.hpp + ${PROJECT_BINARY_DIR}/include/SmallVector.hpp DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}/" # Headers installed into AMS subdir ) @@ -131,3 +127,4 @@ install(FILES "${CMAKE_CURRENT_BINARY_DIR}/AMSConfigVersion.cmake" DESTINATION "${CMAKE_INSTALL_LIBDIR}/cmake/AMS" ) + diff 
--git a/src/AMSlib/include/AMS.h b/src/AMSlib/include/AMS.h index 671ea4cb..5583f801 100644 --- a/src/AMSlib/include/AMS.h +++ b/src/AMSlib/include/AMS.h @@ -4,119 +4,63 @@ * * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception */ - -#ifndef __AMS__ -#define __AMS__ +#pragma once #include -#include "AMS-config.h" - -#ifdef __AMS_ENABLE_CALIPER__ -#include -#include -#define CALIPER(stmt) stmt -#else -#define CALIPER(stmt) -#endif - -#ifdef __AMS_ENABLE_MPI__ -#include -#define MPI_CALL(stmt) \ - if (stmt != MPI_SUCCESS) { \ - fprintf(stderr, "Error in MPI-Call (File: %s, %d)\n", __FILE__, __LINE__); \ - } -#else -typedef void *MPI_Comm; -#define MPI_CALL(stm) -#endif - -#ifdef __AMS_ENABLE_PERFFLOWASPECT__ -#define PERFFASPECT() __attribute__((annotate("@critical_path()"))) -#else -#define PERFFASPECT() -#endif - -#ifdef __cplusplus -extern "C" { -#endif - -typedef void (*AMSPhysicFn)(void *, long, const void *const *, void *const *); - -typedef int64_t AMSExecutor; -typedef int AMSCAbstrModel; - -typedef enum { AMS_SINGLE = 0, AMS_DOUBLE } AMSDType; - -typedef enum { - AMS_UNKNOWN = -1, - AMS_HOST = 0, - AMS_DEVICE = 1, - AMS_PINNED = 2, - AMS_RSEND -} AMSResourceType; - -typedef enum { AMS_UBALANCED = 0, AMS_BALANCED } AMSExecPolicy; - -typedef enum { AMS_NONE = 0, AMS_CSV, AMS_REDIS, AMS_HDF5, AMS_RMQ } AMSDBType; - -enum struct AMSUQPolicy { - AMS_UQ_BEGIN = 0, - AMS_FAISS_MEAN, - AMS_FAISS_MAX, - AMS_DELTAUQ_MEAN, - AMS_DELTAUQ_MAX, - AMS_RANDOM, - AMS_UQ_END -}; +#include "AMSTensor.hpp" +#include "AMSTypes.hpp" + +namespace ams +{ + +using EOSLambda = + std::function & /*inputs */, + ams::SmallVector & /*input - outputs */, + ams::SmallVector & /* outputs */)>; + + +using EOSCFn = void (*)(void *, + const ams::SmallVector &, + ams::SmallVector &, + ams::SmallVector &); + +using AMSExecutor = int64_t; +using AMSCAbstrModel = int; void AMSInit(); void AMSFinalize(); + AMSExecutor AMSCreateExecutor(AMSCAbstrModel model, - AMSDType data_type, - 
AMSResourceType resource_type, - AMSPhysicFn call_back, int process_id, int world_size); -#ifdef __AMS_ENABLE_MPI__ -AMSExecutor AMSCreateDistributedExecutor(AMSCAbstrModel model, - AMSDType data_type, - AMSResourceType resource_type, - AMSPhysicFn call_back, - MPI_Comm comm, - int process_id, - int world_size); -#endif - - AMSCAbstrModel AMSRegisterAbstractModel(const char *domain_name, AMSUQPolicy uq_policy, double threshold, const char *surrogate_path, - const char *uq_path, - const char *db_label, - int num_clusters); + const char *db_label); AMSCAbstrModel AMSQueryModel(const char *domain_model); void AMSExecute(AMSExecutor executor, - void *probDescr, - const int numElements, - const void **input_data, - void **output_data, - int inputDim, - int outputDim); + EOSLambda &OrigComputation, + const ams::SmallVector &ins, + ams::SmallVector &inouts, + ams::SmallVector &outs); + +void AMSCExecute(AMSExecutor executor, + EOSCFn OrigComputation, + void *args, + const ams::SmallVector &ins, + ams::SmallVector &inouts, + ams::SmallVector &outs); void AMSDestroyExecutor(AMSExecutor executor); -void AMSSetAllocator(AMSResourceType resource, const char *alloc_name); -const char *AMSGetAllocatorName(AMSResourceType device); +void AMSSetAllocator(ams::AMSResourceType resource, const char *alloc_name); +const char *AMSGetAllocatorName(ams::AMSResourceType device); void AMSConfigureFSDatabase(AMSDBType db_type, const char *db_path); -#ifdef __cplusplus -} -#endif - -#endif +}; // namespace ams diff --git a/src/AMSlib/include/AMSTensor.hpp b/src/AMSlib/include/AMSTensor.hpp new file mode 100644 index 00000000..53483838 --- /dev/null +++ b/src/AMSlib/include/AMSTensor.hpp @@ -0,0 +1,153 @@ +#pragma once +#include + +#include +#include +#include + +#include "AMSTypes.hpp" +#include "ArrayRef.hpp" +#include "SmallVector.hpp" + +namespace ams +{ + +class AMSTensor +{ +public: + using IntDimType = long int; + IntDimType elements() const { return _elements; } + IntDimType 
element_size() const { return _element_size; } + AMSDType dType() const { return _dType; } + AMSResourceType location() const { return _location; } + ams::ArrayRef strides() const { return _strides; } + ams::ArrayRef shape() const { return _shape; } + bool contiguous() const { return _contiguous; } + + +private: + uint8_t* _data; + IntDimType _elements; + IntDimType _element_size; + ams::SmallVector _shape; + ams::SmallVector _strides; + AMSDType _dType; // AMS_SINGLE/AMS_DOUBLE + AMSResourceType _location; // CPU/GPU/Pinned + bool _owned; + bool _contiguous; + bool _bytes; + + // Helper function to check if the tensor is contiguous in memory + bool isContiguous(IntDimType expected_stride) const; + + /** + * @brief Constructs a new AMSTensor with the specified shape, strides, data type, and location. + * This constructor is private and intended for internal use, such as creating views. + * @param[in] shapes The shape of the tensor. + * @param[in] strides The strides of the tensor. + * @param[in] dType The data type of the tensor elements. + * @param[in] location The memory location (e.g., CPU, GPU). + * @param[in] view Set to true if this tensor is a view of another tensor (non-owning). + */ + explicit AMSTensor(uint8_t* data, + ams::ArrayRef shapes, + ams::ArrayRef strides, + AMSDType dType, + AMSResourceType location, + bool view = false); + + +public: + /** + * @brief Creates a new AMSTensor and allocates the tensor memory. + * @param[in] shapes The shape of the tensor. + * @param[in] strides The strides of the tensor. + * @param[in] dType The data type of the tensor elements. + * @param[in] location The memory location (e.g., CPU, GPU). + * @return A new AMSTensor with allocated memory. + */ + template ::value>> + static AMSTensor create(ams::ArrayRef shapes, + ams::ArrayRef strides, + AMSResourceType location); + + /** + * @brief Creates a view on an existing memory buffer. + * @param[in] data Pointer to the existing data to be viewed. 
+ * @param[in] shapes The shape of the view tensor. + * @param[in] strides The strides of the view tensor. + * @param[in] dType The data type of the tensor elements. + * @param[in] location The memory location (e.g., CPU, GPU). + * @return A new AMSTensor that acts as a view of the existing data. + */ + template ::value>> + static AMSTensor view(FPType* data, + ams::ArrayRef shapes, + ams::ArrayRef strides, + AMSResourceType location); + + + static AMSTensor view(AMSTensor& tensor); + + /** + * @brief Destructor for AMSTensor, deallocates memory if this tensor owns it. + */ + ~AMSTensor(); + + + /** + * @brief Deleted copy assignment operator to prevent copying of tensors. + */ + AMSTensor(const AMSTensor&) = delete; + + /** + * @brief Move constructor for AMSTensor, transfers ownership of data. + * @param[in,out] other The tensor to move from. It will be left in a valid but unspecified state. + */ + AMSTensor& operator=(const AMSTensor&) = delete; + + /** + * @brief Move assignment operator for AMSTensor, transfers ownership of data. + * @param[in,out] other The tensor to move from. It will be left in a valid but unspecified state. + * @return A reference to the updated tensor after move assignment. + */ + AMSTensor(AMSTensor&& other) noexcept; + + // Define move assignment operator + AMSTensor& operator=(AMSTensor&& other) noexcept; + + /** + * @brief Retrieves a typed pointer to the underlying data. + * @tparam T The data type to retrieve. + * @return A typed pointer to the tensor's data. + */ + template + T* data() const + { + return reinterpret_cast(_data); + } + + void* raw_data() const { return reinterpret_cast(_data); } + + /** + * @brief Creates a transposed view of the tensor by swapping two specified axes. + * @param[in] axis1 The first axis to swap in the transposition. + * @param[in] axis2 The second axis to swap in the transposition. + * @return A new AMSTensor that is a transposed view of the original tensor. 
+ * @throw std::out_of_range if any axis is out of bounds. + */ + AMSTensor transpose(IntDimType axis1 = 0, IntDimType axis2 = 1) const; +}; + +// Explicit instantiation declarations +extern template AMSTensor AMSTensor::create( + ams::ArrayRef shapes, + ams::ArrayRef strides, + AMSResourceType location); +extern template AMSTensor AMSTensor::create( + ams::ArrayRef shapes, + ams::ArrayRef strides, + AMSResourceType location); +} // namespace ams diff --git a/src/AMSlib/include/AMSTypes.hpp b/src/AMSlib/include/AMSTypes.hpp new file mode 100644 index 00000000..35b036ce --- /dev/null +++ b/src/AMSlib/include/AMSTypes.hpp @@ -0,0 +1,27 @@ +#pragma once + +namespace ams +{ +typedef enum { AMS_SINGLE = 0, AMS_DOUBLE, AMS_UNKNOWN_TYPE } AMSDType; + +typedef enum { + AMS_UNKNOWN = -1, + AMS_HOST = 0, + AMS_DEVICE = 1, + AMS_PINNED = 2, + AMS_RSEND +} AMSResourceType; + +typedef enum { AMS_UBALANCED = 0, AMS_BALANCED } AMSExecPolicy; + +typedef enum { AMS_NONE = 0, AMS_HDF5, AMS_RMQ } AMSDBType; + +enum struct AMSUQPolicy { + AMS_UQ_BEGIN = 0, + AMS_DELTAUQ_MEAN, + AMS_DELTAUQ_MAX, + AMS_RANDOM, + AMS_UQ_END +}; + +} // namespace ams diff --git a/src/AMSlib/util/ArrayRef.h b/src/AMSlib/include/ArrayRef.hpp similarity index 83% rename from src/AMSlib/util/ArrayRef.h rename to src/AMSlib/include/ArrayRef.hpp index b9c6eef7..87c63d6a 100644 --- a/src/AMSlib/util/ArrayRef.h +++ b/src/AMSlib/include/ArrayRef.hpp @@ -19,12 +19,12 @@ #include #include -#include "SmallVector.h" +#include "SmallVector.hpp" namespace ams { template -class [[nodiscard]] MutableArrayRef; +class MutableArrayRef; /// ArrayRef - Represent a constant reference to an array (0 or more elements /// consecutively in memory), i.e. a start pointer and a length. It allows @@ -38,7 +38,7 @@ class [[nodiscard]] MutableArrayRef; /// This is intended to be trivially copyable, so it should be passed by /// value. 
template -class [[nodiscard]] ArrayRef +class ArrayRef { public: using value_type = T; @@ -67,9 +67,6 @@ class [[nodiscard]] ArrayRef /// Construct an empty ArrayRef. /*implicit*/ ArrayRef() = default; - /// Construct an empty ArrayRef from std::nullopt. - /*implicit*/ ArrayRef(std::nullopt_t) {} - /// Construct an ArrayRef from a single element. /*implicit*/ ArrayRef(const T &OneElt) : Data(&OneElt), Length(1) {} @@ -327,7 +324,7 @@ class [[nodiscard]] ArrayRef /// This is intended to be trivially copyable, so it should be passed by /// value. template -class [[nodiscard]] MutableArrayRef : public ArrayRef +class MutableArrayRef : public ArrayRef { public: using value_type = T; @@ -345,9 +342,6 @@ class [[nodiscard]] MutableArrayRef : public ArrayRef /// Construct an empty MutableArrayRef. /*implicit*/ MutableArrayRef() = default; - /// Construct an empty MutableArrayRef from std::nullopt. - /*implicit*/ MutableArrayRef(std::nullopt_t) : ArrayRef() {} - /// Construct a MutableArrayRef from a single element. /*implicit*/ MutableArrayRef(T &OneElt) : ArrayRef(OneElt) {} @@ -511,85 +505,6 @@ class OwningArrayRef : public MutableArrayRef ~OwningArrayRef() { delete[] this->data(); } }; -/// @name ArrayRef Deduction guides -/// @{ -/// Deduction guide to construct an ArrayRef from a single element. 
-template -ArrayRef(const T &OneElt) -> ArrayRef; - -/// Deduction guide to construct an ArrayRef from a pointer and length -template -ArrayRef(const T *data, size_t length) -> ArrayRef; - -/// Deduction guide to construct an ArrayRef from a range -template -ArrayRef(const T *data, const T *end) -> ArrayRef; - -/// Deduction guide to construct an ArrayRef from a SmallVector -template -ArrayRef(const SmallVectorImpl &Vec) -> ArrayRef; - -/// Deduction guide to construct an ArrayRef from a SmallVector -template -ArrayRef(const SmallVector &Vec) -> ArrayRef; - -/// Deduction guide to construct an ArrayRef from a std::vector -template -ArrayRef(const std::vector &Vec) -> ArrayRef; - -/// Deduction guide to construct an ArrayRef from a std::array -template -ArrayRef(const std::array &Vec) -> ArrayRef; - -/// Deduction guide to construct an ArrayRef from an ArrayRef (const) -template -ArrayRef(const ArrayRef &Vec) -> ArrayRef; - -/// Deduction guide to construct an ArrayRef from an ArrayRef -template -ArrayRef(ArrayRef &Vec) -> ArrayRef; - -/// Deduction guide to construct an ArrayRef from a C array. -template -ArrayRef(const T (&Arr)[N]) -> ArrayRef; - -/// @} - -/// @name ArrayRef Convenience constructors -/// @{ -/// Construct an ArrayRef from a single element. - -/// @name MutableArrayRef Deduction guides -/// @{ -/// Deduction guide to construct a `MutableArrayRef` from a single element -template -MutableArrayRef(T &OneElt) -> MutableArrayRef; - -/// Deduction guide to construct a `MutableArrayRef` from a pointer and -/// length. -template -MutableArrayRef(T *data, size_t length) -> MutableArrayRef; - -/// Deduction guide to construct a `MutableArrayRef` from a `SmallVector`. -template -MutableArrayRef(SmallVectorImpl &Vec) -> MutableArrayRef; - -template -MutableArrayRef(SmallVector &Vec) -> MutableArrayRef; - -/// Deduction guide to construct a `MutableArrayRef` from a `std::vector`. 
-template -MutableArrayRef(std::vector &Vec) -> MutableArrayRef; - -/// Deduction guide to construct a `MutableArrayRef` from a `std::array`. -template -MutableArrayRef(std::array &Vec) -> MutableArrayRef; - -/// Deduction guide to construct a `MutableArrayRef` from a C array. -template -MutableArrayRef(T (&Arr)[N]) -> MutableArrayRef; - -/// @} template inline bool operator==(ArrayRef LHS, ArrayRef RHS) diff --git a/src/AMSlib/util/SmallVector.h b/src/AMSlib/include/SmallVector.hpp similarity index 99% rename from src/AMSlib/util/SmallVector.h rename to src/AMSlib/include/SmallVector.hpp index c43afd55..685b0f37 100644 --- a/src/AMSlib/util/SmallVector.h +++ b/src/AMSlib/include/SmallVector.hpp @@ -405,10 +405,14 @@ class SmallVectorTemplateBase : public SmallVectorTemplateCommon /// Move the range [I, E) into the uninitialized memory starting with "Dest", /// constructing elements as needed. + /// We expose a C++14 API, so we cannot directly call uninitialized_move that was intorduced + /// on C++17. This is equivalent. 
template static void uninitialized_move(It1 I, It1 E, It2 Dest) { - std::uninitialized_move(I, E, Dest); + std::uninitialized_copy(std::make_move_iterator(I), + std::make_move_iterator(E), + Dest); } /// Copy the range [I, E) onto the uninitialized memory starting with "Dest", diff --git a/src/AMSlib/macro.h b/src/AMSlib/macro.h new file mode 100644 index 00000000..33483688 --- /dev/null +++ b/src/AMSlib/macro.h @@ -0,0 +1,24 @@ +#ifdef __AMS_ENABLE_CALIPER__ +#include +#include +#define CALIPER(stmt) stmt +#else +#define CALIPER(stmt) +#endif + +#ifdef __AMS_ENABLE_MPI__ +#include +#define MPI_CALL(stmt) \ + if (stmt != MPI_SUCCESS) { \ + fprintf(stderr, "Error in MPI-Call (File: %s, %d)\n", __FILE__, __LINE__); \ + } +#else +typedef void *MPI_Comm; +#define MPI_CALL(stm) +#endif + +#ifdef __AMS_ENABLE_PERFFLOWASPECT__ +#define PERFFASPECT() __attribute__((annotate("@critical_path()"))) +#else +#define PERFFASPECT() +#endif diff --git a/src/AMSlib/ml/hdcache.hpp b/src/AMSlib/ml/hdcache.hpp deleted file mode 100644 index 31719979..00000000 --- a/src/AMSlib/ml/hdcache.hpp +++ /dev/null @@ -1,574 +0,0 @@ -/* - * Copyright 2021-2023 Lawrence Livermore National Security, LLC and other - * AMSLib Project Developers - * - * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception - */ - -#ifndef __AMS_HDCACHE_HPP__ -#define __AMS_HDCACHE_HPP__ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#ifdef __ENABLE_FAISS__ -#include -#include -#include - -#ifdef __ENABLE_CUDA__ -#include -#include -#include -#include - -#include "wf/device.hpp" -#endif -#endif - -#include "AMS.h" -#include "wf/data_handler.hpp" -#include "wf/resource_manager.hpp" -#include "wf/utils.hpp" - -//! ---------------------------------------------------------------------------- -//! An implementation of FAISS-based HDCache -//! 
---------------------------------------------------------------------------- -template -class HDCache -{ - - static_assert(std::is_floating_point::value, - "HDCache supports floating-point values (floats, doubles, and " - "long doubles) only!"); - -#ifdef __ENABLE_FAISS__ - using Index = faiss::Index; - using TypeIndex = faiss::Index::idx_t; // 64-bit int - using TypeValue = float; // faiss uses floats -#ifdef __ENABLE_CUDA__ - faiss::gpu::StandardGpuResources res; - faiss::gpu::GpuClonerOptions copyOptions; -#endif -#else - using Index = void; - using TypeIndex = uint64_t; - using TypeValue = TypeInValue; -#endif - using data_handler = - ams::DataHandler; // utils to handle float data - - Index *m_index = nullptr; - const uint8_t m_dim; - - const int m_knbrs = 0; - const AMSUQPolicy m_policy = AMSUQPolicy::AMS_FAISS_MEAN; - - AMSResourceType cache_location; - - const TypeValue acceptable_error; - - -#ifdef __ENABLE_FAISS__ - const char *index_key = "IVF4096,Flat"; - // const char* index_key = "IndexFlatL2"; - // const char* index_key = "IndexFlatL2"; - // const char* index_key = "GpuIndexFlatL2"; - - // faiss::gpu::StandardGpuResources resources; - // faiss::gpu::GpuIndexIVFPQConfig config; - // faiss::IndexIVFPQ* index_cpu; - // faiss::gpu::GpuIndexIVFPQ *index_gpu; -#endif - -protected: - // A mechanism to keep track of all unique HDCaches - static std::unordered_map>> - instances; - - //! ------------------------------------------------------------------------ - //! constructors - //! 
------------------------------------------------------------------------ -#ifdef __ENABLE_FAISS__ - HDCache(const std::string &cache_path, - AMSResourceType resource, - const AMSUQPolicy uqPolicy, - int knbrs, - TypeInValue threshold = 0.5) - : m_index(load_cache(cache_path)), - m_dim(m_index->d), - m_knbrs(knbrs), - m_policy(uqPolicy), - cache_location(resource), - acceptable_error(threshold) - { -#ifdef __ENABLE_CUDA__ - // Copy index to device side - if (cache_location == AMSResourceType::AMS_DEVICE) { - faiss::gpu::GpuClonerOptions copyOptions; - faiss::gpu::ToGpuCloner cloner(&res, 0, copyOptions); - m_index = cloner.clone_Index(m_index); - } -#endif - print(); - } -#else // Disabled FAISS - HDCache(const std::string &cache_path, - AMSResourceType resource, - const AMSUQPolicy uqPolicy, - int knbrs, - TypeInValue threshold = 0.5) - : m_index(load_cache(cache_path)), - m_dim(0), - m_knbrs(knbrs), - m_policy(uqPolicy), - cache_location(resource), - acceptable_error(threshold) - { - WARNING(UQModule, "Ignoring cache path because FAISS is not available") - print(); - } -#endif - -public: - static std::shared_ptr> find_cache( - const std::string &cache_path, - AMSResourceType resource, - const AMSUQPolicy uqPolicy, - int knbrs, - TypeInValue threshold = 0.5) - { - auto model = HDCache::instances.find(cache_path); - - if (model != instances.end()) { - // Model Found - auto cache = model->second; - if (resource != cache->cache_location) - throw std::runtime_error( - "Currently we do not support loading the same index on different " - "devices."); - - if (uqPolicy != cache->m_policy) - throw std::runtime_error( - "We do not support caches of different policies."); - - if (knbrs != cache->m_knbrs) - throw std::runtime_error( - "We do not support caches of different number of neighbors."); - - // FIXME: Here we need to cast both to float. 
FAISS index only works for - // single precision and we shoehorn FAISS inability to support arbitary real - // types by forcing TypeValue to be 'float'. In our case this results in having - // cases where input data are of type(TypeInValue) double. Thus here, threshold can - // be of different type than 'acceptable_error' and at compile time we cannot decide - // which overloaded function to pick. - if (!is_real_equal(static_cast(threshold), - static_cast(cache->acceptable_error))) - throw std::runtime_error( - "We do not support caches of different thresholds"); - - return cache; - } - return nullptr; - } - - static std::shared_ptr> getInstance( - const std::string &cache_path, - AMSResourceType resource, - const AMSUQPolicy uqPolicy, - int knbrs, - TypeInValue threshold = 0.5) - { - - // Cache does not exist. We need to create one - // - std::shared_ptr> cache = - find_cache(cache_path, resource, uqPolicy, knbrs, threshold); - if (cache) { - DBG(UQModule, "Returning existing cache under (%s)", cache_path.c_str()) - return cache; - } - - if (uqPolicy != AMSUQPolicy::AMS_FAISS_MEAN && - uqPolicy != AMSUQPolicy::AMS_FAISS_MAX) - THROW(std::invalid_argument, - "Invalid UQ policy for hdcache" + - std::to_string(static_cast(uqPolicy))); - - DBG(UQModule, "Generating new cache under (%s)", cache_path.c_str()) - std::shared_ptr> new_cache = - std::shared_ptr>(new HDCache( - cache_path, resource, uqPolicy, knbrs, threshold)); - - instances.insert(std::make_pair(cache_path, new_cache)); - return new_cache; - } - - ~HDCache() - { - DBG(UQModule, "Deleting UQ-Module"); -#ifdef __ENABLE_FAISS__ - if (m_index) { - DBG(UQModule, "Deleting HD-Cache"); - /// TODO: Deleting the cache on device can, and does - /// result in C++ destructor. - if (cache_location != AMSResourceType::AMS_DEVICE) { - m_index->reset(); - delete m_index; - } - } -#endif - } - - //! ------------------------------------------------------------------------ - //! simple queries - //! 
------------------------------------------------------------------------ - inline void print() const - { - std::string info("index = null"); - if (has_index()) { - info = "npoints = " + std::to_string(count()); - } - DBG(UQModule, "HDCache (on_device = %d %s)", cache_location, info.c_str()); - } - - inline bool has_index() const - { -#ifdef __ENABLE_FAISS__ - return m_index != nullptr && m_index->is_trained; -#endif - return true; - } - - inline size_t count() const - { -#ifdef __ENABLE_FAISS__ - return m_index->ntotal; -#endif - return 0; - } - - inline uint8_t dim() const { return m_dim; } - - //! ------------------------------------------------------------------------ - //! load/save faiss cache - //! ------------------------------------------------------------------------ - static inline Index *load_cache(const std::string &filename) - { -#ifdef __ENABLE_FAISS__ - DBG(UQModule, "Loading HDCache: %s", filename.c_str()); - return faiss::read_index(filename.c_str()); -#else - return nullptr; -#endif - } - - inline void save_cache(const std::string &filename) const - { -#ifdef __ENABLE_FAISS__ - print(); - DBG(UQModule, "Saving HDCache to: %s", filename.c_str()); - faiss::write_index(m_index, filename.c_str()); -#endif - } - - //! ----------------------------------------------------------------------- - //! add points to the faiss cache - //! ----------------------------------------------------------------------- - //! add the data that comes as linearized features - PERFFASPECT() - void add(const size_t ndata, const size_t d, TypeInValue *data) - { - DBG(UQModule, "Add %ld %ld points to HDCache", ndata, d); - CFATAL(UQModule, d != m_dim, "Mismatch in data dimensionality!") - CFATAL(UQModule, - !has_index(), - "HDCache does not have a valid and trained index!") - - _add(ndata, data); - } - - //! 
add the data that comes as separate features (a vector of pointers) - PERFFASPECT() - void add(const size_t ndata, const std::vector &inputs) - { - if (inputs.size() != m_dim) - CFATAL(UQModule, - inputs.size() != m_dim, - "Mismatch in data dimensionality") - CFATAL(UQModule, - !has_index(), - "HDCache does not have a valid and trained index!") - - TypeValue *lin_data = - data_handler::linearize_features(cache_location, ndata, inputs); - _add(ndata, lin_data); - auto &rm = ams::ResourceManager::getInstance(); - rm.deallocate(lin_data, cache_location); - } - - //! ----------------------------------------------------------------------- - //! train a faiss cache - //! ----------------------------------------------------------------------- - //! train on data that comes as linearized features - PERFFASPECT() - void train(const size_t ndata, const size_t d, TypeInValue *data) - { - DBG(UQModule, "Add %ld %ld points to HDCache", ndata, d); - CFATAL(UQModule, d != m_dim, "Mismatch in data dimensionality!") - CFATAL(UQModule, - !has_index(), - "HDCache does not have a valid and trained index!") - - _train(ndata, data); - DBG(UQModule, "Successfully Trained HDCache"); - } - - //! train on data that comes separate features (a vector of pointers) - PERFFASPECT() - void train(const size_t ndata, const std::vector &inputs) - { - TypeValue *lin_data = - data_handler::linearize_features(cache_location, ndata, inputs); - _train(ndata, lin_data); - auto &rm = ams::ResourceManager::getInstance(); - rm.deallocate(lin_data, cache_location); - } - - //! ------------------------------------------------------------------------ - //! evaluate uncertainty using the cache - //! ------------------------------------------------------------------------ - //! train on data that comes as linearized features - //! it looks like faiss can work directly on torch tensor - //! https://github.com/facebookresearch/faiss/wiki/Faiss-on-the-GPU#passing-in-pytorch-tensors - //! 
so, we should use Dino's code to linearize data into torch tensor and then - //! pass it here - PERFFASPECT() - void evaluate(const size_t ndata, - const size_t d, - TypeInValue *data, - bool *is_acceptable) const - { - - CFATAL(UQModule, - !has_index(), - "HDCache does not have a valid and trained index!") - DBG(UQModule, "Evaluating %ld %ld points using HDCache", ndata, d); - - CFATAL(UQModule, (d != m_dim), "Mismatch in data dimensionality!") - - _evaluate(ndata, data, is_acceptable); - - if (cache_location == AMSResourceType::AMS_DEVICE) { - ams::deviceCheckErrors(__FILE__, __LINE__); - } - - DBG(UQModule, "Done with evalution of uq") - } - - //! train on data that comes separate features (a vector of pointers) - PERFFASPECT() - void evaluate(const size_t ndata, - const std::vector &inputs, - bool *is_acceptable) const - { - - CFATAL(UQModule, - !has_index(), - "HDCache does not have a valid and trained index!") - DBG(UQModule, - "Evaluating %ld %ld points using HDCache configured with %d neighbors, " - "%f threshold, %d policy", - ndata, - inputs.size(), - m_knbrs, - acceptable_error, - m_policy); - CFATAL(UQModule, - (inputs.size() != m_dim), - "Mismatch in data dimensionality!") - - TypeValue *lin_data = - data_handler::linearize_features(cache_location, ndata, inputs); - _evaluate(ndata, lin_data, is_acceptable); - auto &rm = ams::ResourceManager::getInstance(); - rm.deallocate(lin_data, cache_location); - DBG(UQModule, "Done with evalution of uq"); - } - -private: -#ifdef __ENABLE_FAISS__ - //! ------------------------------------------------------------------------ - //! core faiss functionality. - //! ------------------------------------------------------------------------ - - inline uint8_t _dim() const { return (m_index != nullptr) ? m_index->d : 0; } - - //! add points to index when (data type = TypeValue) - template ::value> * = nullptr> - PERFFASPECT() - inline void _add(const size_t ndata, const T *data) - { - m_index->add(ndata, data); - } - - //! 
add points to index when (data type != TypeValue) - template ::value> * = nullptr> - PERFFASPECT() - inline void _add(const size_t ndata, const T *data) - { - TypeValue *vdata = - data_handler::cast_to_typevalue(cache_location, ndata, data); - _add(ndata, vdata); - delete[] vdata; - } - - - //! train an index when (data type = TypeValue) - template ::value> * = nullptr> - PERFFASPECT() - inline void _train(const size_t ndata, const T *data) - { - - if (m_index != nullptr && m_index->is_trained) - throw std::invalid_argument("!"); - - CFATAL(UQModule, - (m_index != nullptr && m_index->is_trained), - "Trying to re-train an already trained index") - - m_index = faiss::index_factory(m_dim, index_key); - m_index->train(ndata, data); - - CFATAL(UQModule, ((!m_index->is_trained)), "Failed to train index") - } - - //! train an index when (data type != TypeValue) - template ::value> * = nullptr> - PERFFASPECT() - inline void _train(const size_t ndata, const T *data) - { - TypeValue *vdata = - data_handler::cast_to_typevalue(cache_location, ndata, data); - _train(ndata, vdata); - delete[] vdata; - } - - // ------------------------------------------------------------------------- - //! evaluate cache uncertainty when (data type = TypeValue) - template ::value> * = nullptr> - PERFFASPECT() - void _evaluate(const size_t ndata, T *data, bool *is_acceptable) const - { - - const size_t knbrs = static_cast(m_knbrs); - static const TypeValue ook = 1.0 / TypeValue(knbrs); - auto &rm = ams::ResourceManager::getInstance(); - TypeValue *kdists = rm.allocate(ndata * knbrs, cache_location); - TypeIndex *kidxs = rm.allocate(ndata * knbrs, cache_location); - - // query faiss - // TODO: This is a HACK. When searching more than 65535 - // items in the GPU case, faiss is throwing an exception. - const unsigned int MAGIC_NUMBER = 65535; - for (int start = 0; start < ndata; start += MAGIC_NUMBER) { - unsigned int nElems = - ((ndata - start) < MAGIC_NUMBER) ? 
ndata - start : MAGIC_NUMBER; - DBG(UQModule, "Running for %d elements %d %d", nElems, start, m_dim); - m_index->search(nElems, - &data[start * m_dim], - knbrs, - &kdists[start * knbrs], - &kidxs[start * knbrs]); - } -#ifdef __ENABLE_CUDA__ - faiss::gpu::synchronizeAllDevices(); -#endif - - // compute means - if (cache_location == AMSResourceType::AMS_HOST) { - for (size_t i = 0; i < ndata; ++i) { - if (m_policy == AMSUQPolicy::AMS_FAISS_MEAN) { - TypeValue mean_dist = std::accumulate(kdists + i * knbrs, - kdists + (i + 1) * knbrs, - 0.) * - ook; - is_acceptable[i] = mean_dist < acceptable_error; - } else if (m_policy == AMSUQPolicy::AMS_FAISS_MAX) { - // Take the furtherst cluster as the distance metric - TypeValue max_dist = - *std::max_element(&kdists[i * knbrs], - &kdists[i * knbrs + knbrs - 1]); - is_acceptable[i] = (max_dist) < acceptable_error; - } - } - } else { - CFATAL(UQModule, - m_policy == AMSUQPolicy::AMS_FAISS_MAX, - "FAISS Max on device is not supported yet"); - - ams::Device::computePredicate( - kdists, is_acceptable, ndata, knbrs, acceptable_error); - } - - rm.deallocate(kdists, cache_location); - rm.deallocate(kidxs, cache_location); - } - - //! 
evaluate cache uncertainty when (data type != TypeValue) - template ::value> * = nullptr> - inline void _evaluate(const size_t ndata, T *data, bool *is_acceptable) const - { - TypeValue *vdata = - data_handler::cast_to_typevalue(cache_location, ndata, data); - _evaluate(ndata, data, is_acceptable); - delete[] vdata; - } - -#else - // ------------------------------------------------------------------------- - // fucntionality for randomized cache - // ------------------------------------------------------------------------- - inline uint8_t _dim() const { return 0; } - - template - PERFFASPECT() - inline void _add(const size_t, const T *) - { - } - - template - PERFFASPECT() - inline void _train(const size_t, const T *) - { - } - - template - PERFFASPECT() - inline void _evaluate(const size_t, T *, bool *) const - { - } -#endif - // ------------------------------------------------------------------------- -}; - -template -std::unordered_map>> - HDCache::instances; - -#endif diff --git a/src/AMSlib/ml/surrogate.cpp b/src/AMSlib/ml/surrogate.cpp new file mode 100644 index 00000000..db653a0f --- /dev/null +++ b/src/AMSlib/ml/surrogate.cpp @@ -0,0 +1,334 @@ + +#include +#include + +#include +#include +#include +#include +#include + +#include "AMS.h" +#include "surrogate.hpp" +#include "wf/debug.h" +#include "wf/utils.hpp" + +using namespace ams; +static std::string getDTypeAsString(torch::Dtype dtype) +{ + if (dtype == torch::kFloat32) return "float32"; + if (dtype == torch::kFloat64) return "float64"; + if (dtype == torch::kInt32) return "int32"; + if (dtype == torch::kInt64) return "int64"; + if (dtype == torch::kBool) return "bool"; + if (dtype == torch::kUInt8) return "uint8"; + if (dtype == torch::kInt8) return "int8"; + + // Add other types as needed + return "unknown"; +} + +static std::string getAMSDTypeAsString(AMSDType dType) +{ + if (dType == AMS_SINGLE) + return "float32"; + else if (dType == AMS_DOUBLE) + return "float64"; + return "unknown"; +} + +static 
std::string getAMSResourceTypeAsString(AMSResourceType res) +{ + if (res == ams::AMS_DEVICE) + return "device"; + else if (res == ams::AMS_HOST) + return "host"; + return "unknown-device"; +} + + +SurrogateModel::SurrogateModel(std::string& model_path, bool isDeltaUQ) + : _model_path(model_path), _is_DeltaUQ(isDeltaUQ) +{ + + std::experimental::filesystem::path Path(model_path); + std::error_code ec; + + if (!std::experimental::filesystem::exists(Path, ec)) { + FATAL(Surrogate, + "Path to Surrogate Model (%s) Does not exist", + model_path.c_str()) + } + + try { + module = torch::jit::load(model_path); + } catch (const c10::Error& e) { + printf("Error opening %s\n", model_path.c_str()); + } + std::tie(model_device, torch_device) = getModelResourceType(); + std::tie(model_dtype, torch_dtype) = getModelDataType(); + DBG(SurrogateModel, + "Loaded model with type %s on device %s", + getAMSDTypeAsString(model_dtype).c_str(), + getAMSResourceTypeAsString(model_device).c_str()); +} + +std::tuple SurrogateModel:: + getModelResourceType() +{ + // Iterate through the parameters to determine the device + for (const auto& parameter : module.parameters()) { + // Return the device of the first parameter found + switch (parameter.device().type()) { + case c10::DeviceType::CUDA: + case c10::DeviceType::HIP: + return std::make_tuple(AMS_DEVICE, parameter.device().type()); + case c10::DeviceType::CPU: + return std::make_tuple(AMS_HOST, parameter.device().type()); + default: + continue; + } + } + + // If no parameters are found, check the buffers + for (const auto& buffer : module.buffers()) { + switch (buffer.device().type()) { + case c10::DeviceType::CUDA: + case c10::DeviceType::HIP: + return std::make_tuple(AMS_DEVICE, buffer.device().type()); + case c10::DeviceType::CPU: + return std::make_tuple(AMS_HOST, buffer.device().type()); + default: + continue; + } + } + + // If no parameters or buffers are found, default to unknown + FATAL(Surrogate, + "Cannot determine device type of 
model %s", + _model_path.c_str()); + return std::make_tuple(AMS_UNKNOWN, + c10::DeviceType::COMPILE_TIME_MAX_DEVICE_TYPES); +} + +std::tuple SurrogateModel::getModelDataType() +{ + AMSDType dParamType = AMSDType::AMS_DOUBLE; + torch::Dtype torchType = at::kDouble; + for (const auto& parameter : module.parameters()) { + // Return the device of the first parameter found + if (parameter.dtype() == at::kFloat) { + dParamType = AMS_SINGLE; + torchType = at::kFloat; + } else if (parameter.dtype() == at::kDouble) { + dParamType = AMS_DOUBLE; + torchType = at::kDouble; + } else { + throw std::runtime_error(std::string("Invalid datatype ") + + std::string(parameter.dtype().name())); + } + } + + // Verify + for (const auto& parameter : module.parameters()) { + if (parameter.dtype() != torchType) + throw std::runtime_error("Provided model has mixed data types"); + } + + AMSDType dBufferType = dParamType; + for (const auto& buffer : module.buffers()) { + // Return the device of the first parameter found + if (buffer.dtype() == at::kFloat) { + dBufferType = AMS_SINGLE; + torchType = at::kFloat; + } else if (buffer.dtype() == at::kDouble) { + dBufferType = AMS_DOUBLE; + torchType = at::kDouble; + } else { + throw std::runtime_error(std::string("Invalid datatype ") + + std::string(buffer.dtype().name())); + } + } + // Verify + for (const auto& buffer : module.buffers()) { + if (buffer.dtype() != torchType) + throw std::runtime_error("Provided model has mixed data types"); + } + + if (dParamType != dBufferType) + throw std::runtime_error( + "Provided model has mixed data types between parameters and buffers"); + + DBG(Surrogate, + "Detected model data type %s %s", + getDTypeAsString(torchType).c_str(), + getAMSDTypeAsString(dParamType).c_str()); + return std::make_tuple(dParamType, torchType); +} + + +std::tuple SurrogateModel::_computeDetlaUQ( + c10::IValue& deltaUQTuple, + AMSUQPolicy policy, + float threshold) +{ + at::Tensor output_mean_tensor = deltaUQTuple.toTuple() + 
->elements()[0] + .toTensor() + .set_requires_grad(false) + .detach(); + at::Tensor output_stdev_tensor = deltaUQTuple.toTuple() + ->elements()[1] + .toTensor() + .set_requires_grad(false) + .detach(); + auto outer_dim = output_stdev_tensor.sizes().size() - 1; + if (policy != AMSUQPolicy::AMS_DELTAUQ_MAX && + policy != AMSUQPolicy::AMS_DELTAUQ_MEAN) + throw std::runtime_error("Invalid DELTA_UQ policy"); + + if (policy == AMSUQPolicy::AMS_DELTAUQ_MEAN) { + auto mean = output_stdev_tensor.mean(outer_dim); + auto predicate = mean < threshold; + return std::make_tuple(std::move(output_mean_tensor), std::move(predicate)); + } else if (policy == AMSUQPolicy::AMS_DELTAUQ_MAX) { + auto tmp = output_stdev_tensor.max(outer_dim); + torch::Tensor max = std::get<0>(tmp); + auto predicate = max < threshold; + return std::make_tuple(std::move(output_mean_tensor), std::move(predicate)); + } + throw std::runtime_error("Invalid DELTA_UQ policy"); +} + + +std::tuple SurrogateModel::_evaluate( + torch::Tensor& inputs, + AMSUQPolicy policy, + float threshold) +{ + if (inputs.dtype() != torch_dtype) { + throw std::runtime_error( + "Received inputs of wrong dType. 
Model is expecting " + + getDTypeAsString(torch::typeMetaToScalarType(inputs.dtype())) + + " and model is " + getDTypeAsString(torch_dtype)); + } + c10::InferenceMode guard(true); + auto out = module.forward({inputs}); + if (_is_DeltaUQ) { + return _computeDetlaUQ(out, policy, threshold); + } + + at::Tensor output_tensor = out.toTensor().set_requires_grad(false).detach(); + // Randomly select indices to set to True + torch::Tensor predicate = + torch::zeros({output_tensor.sizes()[0], 1}, torch::kBool); + auto indices = torch::randperm(output_tensor.sizes()[0]) + .slice(0, 0, threshold * output_tensor.sizes()[0]); + + // Set selected indices to True + predicate.index_put_({indices, 0}, true); + return std::make_tuple(std::move(output_tensor), std::move(predicate)); +} + + +std::tuple SurrogateModel::evaluate( + ams::MutableArrayRef Inputs, + AMSUQPolicy policy, + float threshold) +{ + if (Inputs.size() == 0) { + throw std::invalid_argument( + "Input Vector should always contain at least one tensor"); + } + + torch::DeviceType InputDevice = Inputs[0].device().type(); + torch::Dtype InputDType = torch::typeMetaToScalarType(Inputs[0].dtype()); + auto CAxis = Inputs[0].sizes().size() - 1; + + // Verify input/device matching + for (auto& In : Inputs) { + if (InputDevice != In.device().type()) { + throw std::invalid_argument( + "Unsupported feature, application domain tensors are on different " + "devices\n"); + } + if (InputDType != torch::typeMetaToScalarType(In.dtype())) { + throw std::invalid_argument( + "Unsupported feature, application domain tensors have different data " + "types\n"); + } + } + c10::SmallVector ConvertedInputs(Inputs.begin(), Inputs.end()); + // If either the model's execution device or the data type differ + // in respect to the inputs we need to handle this separately. 
+ if (InputDevice != torch_device || InputDType != torch_dtype) { + for (int i = 0; i < ConvertedInputs.size(); i++) { + ConvertedInputs[i] = ConvertedInputs[i].to(torch_device, torch_dtype); + } + } + + auto ITensor = torch::cat(ConvertedInputs, CAxis); + DBG(Surrogate, + "Input concatenated tensor is %s", + shapeToString(ITensor).c_str()); + + auto [OTensor, Predicate] = _evaluate(ITensor, policy, threshold); + if (InputDevice != torch_device) { + OTensor = OTensor.to(InputDevice); + Predicate = Predicate.to(InputDevice); + } + return std::make_tuple(std::move(OTensor), std::move(Predicate)); +} + + +std::unordered_map> + SurrogateModel::instances; + +#if 0 +//#include + +//#include +//#include +//#include +//#include +//#include // One-stop header. +// #include + + +//struct C10_API AMSCPUAllocator final : at::Allocator { +// AMSCPUAllocator() = default; +// at::DataPtr allocate(size_t nbytes) const override +// { +// auto& rm = ams::ResourceManager::getInstance(); +// uint8_t* data = rm.allocate(nbytes, AMSResourceType::AMS_HOST); +// +// return {(void*)data, +// (void*)data, +// &ReportAndDelete, +// at::Device(at::DeviceType::CPU)}; +// } +// +// static void ReportAndDelete(void* ptr) +// { +// if (!ptr) { +// return; +// } +// auto& rm = ams::ResourceManager::getInstance(); +// rm.deallocate(ptr, AMSResourceType::AMS_HOST); +// } +// +// at::DeleterFnPtr raw_deleter() const override { return &ReportAndDelete; } +//}; +// +// +//AMSCPUAllocator ams_torch; +// +// +//void set_cpu_torch_allocator() +//{ +// SetAllocator(c10::DeviceType::CPU, &ams_torch, (uint8_t)(2 ^ 8 - 1)); +// SetCPUAllocator(&ams_torch, (uint8_t)(2 ^ 8 - 1)); +//} +// +// +#endif diff --git a/src/AMSlib/ml/surrogate.hpp b/src/AMSlib/ml/surrogate.hpp index 95c84c7a..c1b18981 100644 --- a/src/AMSlib/ml/surrogate.hpp +++ b/src/AMSlib/ml/surrogate.hpp @@ -8,366 +8,109 @@ #ifndef __AMS_SURROGATE_HPP__ #define __AMS_SURROGATE_HPP__ +#include +#include +#include +#include // One-stop header. 
+ #include #include #include #include +#include #include #include "AMS.h" -#include "wf/device.hpp" +#include "ArrayRef.hpp" +#include "wf/debug.h" -#ifdef __ENABLE_TORCH__ -#include -#include -#include -#include // One-stop header. -#endif -#include "wf/data_handler.hpp" -#include "wf/debug.h" +namespace UQ +{ +static inline bool isDeltaUQ(ams::AMSUQPolicy policy) +{ + if (policy >= ams::AMSUQPolicy::AMS_DELTAUQ_MEAN && + policy <= ams::AMSUQPolicy::AMS_DELTAUQ_MAX) { + return true; + } + return false; +} + +static inline bool isRandomUQ(ams::AMSUQPolicy policy) +{ + return policy == ams::AMSUQPolicy::AMS_RANDOM; +} + + +static inline bool isUQPolicy(ams::AMSUQPolicy policy) +{ + if (ams::AMSUQPolicy::AMS_UQ_BEGIN < policy && + policy < ams::AMSUQPolicy::AMS_UQ_END) + return true; + return false; +} + +static std::string UQPolicyToStr(ams::AMSUQPolicy policy) +{ + if (policy == ams::AMSUQPolicy::AMS_RANDOM) + return "random"; + else if (policy == ams::AMSUQPolicy::AMS_DELTAUQ_MEAN) + return "deltaUQ (mean)"; + else if (policy == ams::AMSUQPolicy::AMS_DELTAUQ_MAX) + return "deltaUQ (max)"; + return "Unknown"; +} + +static ams::AMSUQPolicy UQPolicyFromStr(std::string& policy) +{ + if (policy.compare("random") == 0) + return ams::AMSUQPolicy::AMS_RANDOM; + + else if (policy.compare("deltaUQ (mean)") == 0) + return ams::AMSUQPolicy::AMS_DELTAUQ_MEAN; + else if (policy.compare("deltaUQ (max)") == 0) + return ams::AMSUQPolicy::AMS_DELTAUQ_MAX; + return ams::AMSUQPolicy::AMS_UQ_END; +} +}; // namespace UQ //! ---------------------------------------------------------------------------- //! An implementation for a surrogate model //! 
---------------------------------------------------------------------------- -template class SurrogateModel { - static_assert(std::is_floating_point::value, - "SurrogateModel supports floating-point values (floats, " - "doubles, or long doubles) only!"); - - using data_handler = - ams::DataHandler; // utils to handle float data - private: - const std::string model_path; - AMSResourceType model_resource; + const std::string _model_path; + ams::AMSResourceType model_device; + torch::DeviceType torch_device; + ams::AMSDType model_dtype; + torch::Dtype torch_dtype; const bool _is_DeltaUQ; -#ifdef __ENABLE_TORCH__ // ------------------------------------------------------------------------- // variables to store the torch model // ------------------------------------------------------------------------- torch::jit::script::Module module; - c10::TensorOptions tensorOptions; - - - // ------------------------------------------------------------------------- - // conversion to and from torch - // ------------------------------------------------------------------------- - PERFFASPECT() - inline at::Tensor arrayToTensor(long numRows, - long numCols, - TypeInValue** array) - { - c10::SmallVector Tensors; - for (int i = 0; i < numCols; i++) { - Tensors.push_back(torch::from_blob((TypeInValue*)array[i], - {numRows, 1}, - tensorOptions)); - } - at::Tensor tensor = at::reshape(at::cat(Tensors, 1), {numRows, numCols}); - return tensor; - } - - PERFFASPECT() - inline at::Tensor arrayToTensor(long numRows, - long numCols, - const TypeInValue** array) - { - c10::SmallVector Tensors; - CALIPER(CALI_MARK_BEGIN("ARRAY_BLOB");) - for (int i = 0; i < numCols; i++) { - Tensors.push_back(torch::from_blob((TypeInValue*)array[i], - {numRows, 1}, - tensorOptions)); - } - CALIPER(CALI_MARK_END("ARRAY_BLOB");) - - CALIPER(CALI_MARK_BEGIN("ARRAY_RESHAPE");) - at::Tensor tensor = at::reshape(at::cat(Tensors, 1), {numRows, numCols}); - CALIPER(CALI_MARK_END("ARRAY_RESHAPE");) - return tensor; - } - - 
PERFFASPECT() - inline void tensorToArray(at::Tensor tensor, - long numRows, - long numCols, - TypeInValue** array) - { - // Transpose to get continuous memory and - // perform single memcpy. - auto& rm = ams::ResourceManager::getInstance(); - tensor = tensor.transpose(1, 0); - for (long j = 0; j < numCols; j++) { - auto tmp = tensor[j].contiguous(); - TypeInValue* ptr = tmp.data_ptr(); - rm.copy(ptr, model_resource, array[j], model_resource, numRows); - } - } - - // ------------------------------------------------------------------------- - // loading a surrogate model! - // ------------------------------------------------------------------------- - PERFFASPECT() - void _load_torch(const std::string& model_path, - c10::Device&& device, - at::ScalarType dType) - { - try { - module = torch::jit::load(model_path); - module.to(device); - module.to(dType); - tensorOptions = - torch::TensorOptions().dtype(dType).device(device).requires_grad( - false); - } catch (const c10::Error& e) { - FATAL("Error loding torch model:%s", model_path.c_str()) - } - } - - template ::value>* = nullptr> - PERFFASPECT() - inline void _load(const std::string& model_path, - const std::string& device_name) - { - DBG(Surrogate, "Using model at double precision: %s", model_path.c_str()); - _load_torch(model_path, torch::Device(device_name), torch::kFloat64); - } - - template ::value>* = nullptr> - PERFFASPECT() - inline void _load(const std::string& model_path, - const std::string& device_name) - { - DBG(Surrogate, "Using model at single precision: %s", model_path.c_str()); - _load_torch(model_path, torch::Device(device_name), torch::kFloat32); - } - - // ------------------------------------------------------------------------- - // compute delta uq predicates - // ------------------------------------------------------------------------- - void computeDeltaUQPredicates(AMSUQPolicy uq_policy, - const TypeInValue* __restrict__ outputs_stdev, - bool* __restrict__ predicates, - const size_t nrows, - 
const size_t ncols, - const double threshold) - { - auto computeDeltaUQMeanPredicatesHost = [&]() { - for (size_t i = 0; i < nrows; ++i) { - double mean = 0.0; - for (size_t j = 0; j < ncols; ++j) - mean += outputs_stdev[j + i * ncols]; - mean /= ncols; - - predicates[i] = (mean < threshold); - } - }; - - auto computeDeltaUQMaxPredicatesHost = [&]() { - for (size_t i = 0; i < nrows; ++i) { - predicates[i] = true; - for (size_t j = 0; j < ncols; ++j) - if (outputs_stdev[j + i * ncols] >= threshold) { - predicates[i] = false; - break; - } - } - }; - - if (uq_policy == AMSUQPolicy::AMS_DELTAUQ_MEAN) { - if (model_resource == AMSResourceType::AMS_DEVICE) { -#ifdef __ENABLE_CUDA__ - DBG(Surrogate, "Compute mean delta uq predicates on device\n"); - // TODO: use combined routine when it lands. - ams::Device::computeDeltaUQMeanPredicatesDevice( - outputs_stdev, predicates, nrows, ncols, threshold); -#else - THROW(std::runtime_error, - "Expected CUDA is enabled when model data are on DEVICE"); -#endif - } else { - DBG(Surrogate, "Compute mean delta uq predicates on host\n"); - computeDeltaUQMeanPredicatesHost(); - } - } else if (uq_policy == AMSUQPolicy::AMS_DELTAUQ_MAX) { - if (model_resource == AMSResourceType::AMS_DEVICE) { -#ifdef __ENABLE_CUDA__ - DBG(Surrogate, "Compute max delta uq predicates on device\n"); - // TODO: use combined routine when it lands. 
- ams::Device::computeDeltaUQMaxPredicatesDevice( - outputs_stdev, predicates, nrows, ncols, threshold); -#else - THROW(std::runtime_error, - "Expected CUDA is enabled when model data are on DEVICE"); -#endif - } else { - DBG(Surrogate, "Compute max delta uq predicates on host\n"); - computeDeltaUQMaxPredicatesHost(); - } - } else - THROW(std::runtime_error, - "Invalid uq_policy to compute delta uq predicates"); - } - - // ------------------------------------------------------------------------- - // evaluate a torch model - // ------------------------------------------------------------------------- - PERFFASPECT() - inline void _evaluate(long num_elements, - size_t num_in, - size_t num_out, - const TypeInValue** inputs, - TypeInValue** outputs, - AMSUQPolicy uq_policy, - bool* predicates, - double threshold) - { - //torch::NoGradGuard no_grad; - c10::InferenceMode guard(true); - CALIPER(CALI_MARK_BEGIN("ARRAY_TO_TENSOR");) - auto input = arrayToTensor(num_elements, num_in, inputs); - CALIPER(CALI_MARK_END("ARRAY_TO_TENSOR");) - - input.set_requires_grad(false); - if (_is_DeltaUQ) { - // The deltauq surrogate returns a tuple of (outputs, outputs_stdev) - CALIPER(CALI_MARK_BEGIN("SURROGATE-EVAL");) - auto output_tuple = module.forward({input}).toTuple(); - CALIPER(CALI_MARK_END("SURROGATE-EVAL");) - - at::Tensor output_mean_tensor = - output_tuple->elements()[0].toTensor().detach(); - at::Tensor output_stdev_tensor = - output_tuple->elements()[1].toTensor().detach().contiguous(); - CALIPER(CALI_MARK_BEGIN("TENSOR_TO_ARRAY");) - - computeDeltaUQPredicates(uq_policy, - output_stdev_tensor.data_ptr(), - predicates, - num_elements, - num_out, - threshold); - tensorToArray(output_mean_tensor, num_elements, num_out, outputs); - CALIPER(CALI_MARK_END("TENSOR_TO_ARRAY");) - } else { - CALIPER(CALI_MARK_BEGIN("SURROGATE-EVAL");) - at::Tensor output = module.forward({input}).toTensor().detach(); - CALIPER(CALI_MARK_END("SURROGATE-EVAL");) - - 
CALIPER(CALI_MARK_BEGIN("TENSOR_TO_ARRAY");) - tensorToArray(output, num_elements, num_out, outputs); - CALIPER(CALI_MARK_END("TENSOR_TO_ARRAY");) - } - - if (is_device()) { - ams::deviceCheckErrors(__FILE__, __LINE__); - } - - DBG(Surrogate, - "Evaluate surrogate model (%ld, %ld) -> (%ld, %ld)", - num_elements, - num_in, - num_elements, - num_out); - } - -#else - template - PERFFASPECT() - inline void _load(const std::string& model_path, - const std::string& device_name) - { - } - - PERFFASPECT() - inline void _evaluate(long num_elements, - long num_in, - size_t num_out, - const TypeInValue** inputs, - TypeInValue** outputs, - AMSUQPolicy uq_policy, - bool* predicates, - double threshold) - { - } - -#endif - - SurrogateModel(std::string& model_path, - AMSResourceType resource = AMSResourceType::AMS_HOST, - bool is_DeltaUQ = false) - : model_path(model_path), - model_resource(resource), - _is_DeltaUQ(is_DeltaUQ) - { - - std::experimental::filesystem::path Path(model_path); - std::error_code ec; - - if (!std::experimental::filesystem::exists(Path, ec)) { - FATAL(Surrogate, - "Path to Surrogate Model (%s) Does not exist", - model_path.c_str()) - } - - if (resource != AMSResourceType::AMS_DEVICE) - _load(model_path, "cpu"); - else - _load(model_path, "cuda"); - } protected: - template ::value>* = nullptr> - static bool same_type(bool is_double) - { - return !is_double; - } - - template ::value>* = nullptr> - static bool same_type(bool is_double) - { - return is_double; - } - - static std::unordered_map>> + static std::unordered_map> instances; + SurrogateModel(std::string& model_path, bool is_DeltaUQ = false); + public: // ------------------------------------------------------------------------- // public interface // ------------------------------------------------------------------------- - static std::shared_ptr> getInstance( - std::string& model_path, - AMSResourceType resource = AMSResourceType::AMS_HOST, - bool is_DeltaUQ = false) + static std::shared_ptr 
getInstance(std::string& model_path, + bool is_DeltaUQ = false) { - auto model = - SurrogateModel::instances.find(std::string(model_path)); + auto model = SurrogateModel::instances.find(std::string(model_path)); if (model != instances.end()) { // Model Found auto torch_model = model->second; - if (resource != torch_model->model_resource) - throw std::runtime_error( - "Currently we are not supporting loading the same model file on " - "different devices."); - - if (is_DeltaUQ != torch_model->is_DeltaUQ()) - THROW(std::runtime_error, "Loaded model instance is not DeltaUQ"); - - if (!same_type(torch_model->is_double())) - throw std::runtime_error( - "Requesting model loading of different data types."); DBG(Surrogate, "Returning existing model represented under (%s)", @@ -377,116 +120,79 @@ class SurrogateModel // Model does not exist. We need to create one DBG(Surrogate, "Generating new model under (%s)", model_path.c_str()); - std::shared_ptr> torch_model = - std::shared_ptr>( - new SurrogateModel(model_path, resource, is_DeltaUQ)); + std::shared_ptr torch_model = + std::shared_ptr( + new SurrogateModel(model_path, is_DeltaUQ)); instances.insert(std::make_pair(std::string(model_path), torch_model)); return torch_model; }; ~SurrogateModel() { - DBG(Surrogate, "Destroying surrogate model at %s", model_path.c_str()); + DBG(Surrogate, "Destroying surrogate model at %s", _model_path.c_str()); } + std::tuple _computeDetlaUQ( + c10::IValue& deltaUQTuple, + ams::AMSUQPolicy policy, + float threshold); - PERFFASPECT() - inline void evaluate(long num_elements, - size_t num_in, - size_t num_out, - const TypeInValue** inputs, - TypeInValue** outputs, - AMSUQPolicy uq_policy = AMSUQPolicy::AMS_UQ_BEGIN, - bool* predicates = nullptr, - double threshold = 0.0) - { - _evaluate(num_elements, - num_in, - num_out, - inputs, - outputs, - uq_policy, - predicates, - threshold); - } + std::tuple _evaluate(torch::Tensor& inputs, + ams::AMSUQPolicy policy, + const float threshold); - 
PERFFASPECT() - inline void evaluate(long num_elements, - std::vector inputs, - std::vector outputs, - AMSUQPolicy uq_policy, - bool* predicates, - double threshold) - { - _evaluate(num_elements, - inputs.size(), - outputs.size(), - static_cast(inputs.data()), - static_cast(outputs.data()), - uq_policy, - predicates, - threshold); - } + std::tuple evaluate( + ams::MutableArrayRef Inputs, + ams::AMSUQPolicy policy, + const float threshold); - PERFFASPECT() - inline void evaluate(long num_elements, - std::vector inputs, - std::vector outputs) + + inline bool is_gpu() const { - _evaluate(num_elements, - inputs.size(), - outputs.size(), - static_cast(inputs.data()), - static_cast(outputs.data()), - AMSUQPolicy::AMS_UQ_BEGIN, - nullptr, - 0.0); + return model_device == ams::AMSResourceType::AMS_DEVICE; } -#ifdef __ENABLE_TORCH__ - bool is_double() { return (tensorOptions.dtype() == torch::kFloat64); } -#else - bool is_double() + inline bool is_cpu() const { - if (typeid(TypeInValue) == typeid(double)) return true; - return false; + return model_device == ams::AMSResourceType::AMS_HOST; } -#endif - - inline bool is_device() const + inline bool is_resource(ams::AMSResourceType rType) const { -#ifdef __ENABLE_TORCH__ - return model_resource == AMSResourceType::AMS_DEVICE; -#else - return false; -#endif + return model_device == rType; } - bool is_DeltaUQ() { return _is_DeltaUQ; } - - void update(const std::string& new_path) + inline bool is_float() const { return model_dtype == ams::AMS_SINGLE; } + inline bool is_double() const { return model_dtype == ams::AMS_DOUBLE; } + inline bool is_type(ams::AMSDType dType) const { - /* This function updates the underlying torch model, - * with a new one pointed at location modelPath. The previous - * one is destructed automatically. - * - * TODO: I decided to not update the model path on the ``instances'' - * map. As we currently expect this change will be agnostic to the application - * user. 
But, in any case we should keep track of which model has been used at which - * invocation. This is currently not done. - */ - if (model_resource != AMSResourceType::AMS_DEVICE) - _load(new_path, "cpu"); - else - _load(new_path, "cuda"); + return model_dtype == dType; } - AMSResourceType getModelResource() const { return model_resource; } -}; -template -std::unordered_map>> - SurrogateModel::instances; + // + bool is_DeltaUQ() { return _is_DeltaUQ; } + // + // void update(const std::string& new_path) + // { + // /* This function updates the underlying torch model, + // * with a new one pointed at location modelPath. The previous + // * one is destructed automatically. + // * + // * TODO: I decided to not update the model path on the ``instances'' + // * map. As we currently expect this change will be agnostic to the application + // * user. But, in any case we should keep track of which model has been used at which + // * invocation. This is currently not done. + // */ + // //if (model_device != AMSResourceType::AMS_DEVICE) + // // _load(new_path, "cpu"); + // //else + // // _load(new_path, "cuda"); + // } + + // AMSResourceType getModelResource() const { return model_device; } + std::tuple getModelResourceType(); + std::tuple getModelDataType(); +}; #endif diff --git a/src/AMSlib/ml/uq.hpp b/src/AMSlib/ml/uq.hpp index ca73e64f..0cd708e0 100644 --- a/src/AMSlib/ml/uq.hpp +++ b/src/AMSlib/ml/uq.hpp @@ -82,7 +82,6 @@ class BaseUQ } }; -template class UQ : public BaseUQ { public: @@ -91,7 +90,7 @@ class UQ : public BaseUQ std::string &uqPath, const int nClusters, std::string &surrogatePath, - FPTypeValue threshold) + float threshold) : uqPolicy(uq_policy), threshold(threshold) { if (surrogatePath.empty()) { @@ -113,9 +112,9 @@ class UQ : public BaseUQ bool is_DeltaUQ = isDeltaUQ(uqPolicy); - surrogate = SurrogateModel::getInstance(surrogatePath, - resourceLocation, - is_DeltaUQ); + surrogate = SurrogateModel::getInstance(surrogatePath, + resourceLocation, + 
is_DeltaUQ); if (isFaissUQ(uqPolicy)) { if (uqPath.empty()) diff --git a/src/AMSlib/util/SmallVector.cpp b/src/AMSlib/wf/SmallVector.cpp similarity index 99% rename from src/AMSlib/util/SmallVector.cpp rename to src/AMSlib/wf/SmallVector.cpp index 4276dd0b..6959b86a 100644 --- a/src/AMSlib/util/SmallVector.cpp +++ b/src/AMSlib/wf/SmallVector.cpp @@ -22,7 +22,7 @@ #include #include -#include "SmallVector.h" +#include "SmallVector.hpp" using namespace ams; // Check that no bytes are wasted and everything is well-aligned. diff --git a/src/AMSlib/wf/basedb.cpp b/src/AMSlib/wf/basedb.cpp index da8b8a10..0edaa158 100644 --- a/src/AMSlib/wf/basedb.cpp +++ b/src/AMSlib/wf/basedb.cpp @@ -15,10 +15,6 @@ AMSDBType getDBType(std::string type) { if (type.compare("hdf5") == 0) { return AMSDBType::AMS_HDF5; - } else if (type.compare("csv") == 0) { - return AMSDBType::AMS_CSV; - } else if (type.compare("redis") == 0) { - return AMSDBType::AMS_REDIS; } else if (type.compare("rmq") == 0) { return AMSDBType::AMS_RMQ; } @@ -30,14 +26,10 @@ std::string getDBTypeAsStr(AMSDBType type) switch (type) { case AMSDBType::AMS_NONE: return "None"; - case AMSDBType::AMS_CSV: - return "csv"; case AMSDBType::AMS_HDF5: return "hdf5"; case AMSDBType::AMS_RMQ: return "rmq"; - case AMSDBType::AMS_REDIS: - return "redis"; } return "Unknown"; } diff --git a/src/AMSlib/wf/basedb.hpp b/src/AMSlib/wf/basedb.hpp index 5ad149ca..8e337bca 100644 --- a/src/AMSlib/wf/basedb.hpp +++ b/src/AMSlib/wf/basedb.hpp @@ -8,6 +8,10 @@ #ifndef __AMS_BASE_DB__ #define __AMS_BASE_DB__ +#include +#include +#include + #include #include #include @@ -21,21 +25,16 @@ #include #include "AMS.h" +#include "ArrayRef.hpp" #include "debug.h" +#include "macro.h" #include "wf/debug.h" #include "wf/resource_manager.hpp" #include "wf/utils.hpp" namespace fs = std::experimental::filesystem; -#ifdef __ENABLE_REDIS__ -#include - -#include -#warning Redis is currently not supported/tested -#endif - -#ifdef __ENABLE_HDF5__ +#ifdef 
__AMS_ENABLE_HDF5__ #include #include #define HDF5_ERROR(Eid) \ @@ -50,7 +49,7 @@ namespace fs = std::experimental::filesystem; #include #endif -#ifdef __ENABLE_RMQ__ +#ifdef __AMS_ENABLE_RMQ__ #include #include #include @@ -73,7 +72,7 @@ namespace fs = std::experimental::filesystem; #include #include -#endif // __ENABLE_RMQ__ +#endif // __AMS_ENABLE_RMQ__ namespace ams { @@ -114,8 +113,7 @@ class BaseDB virtual AMSDBType dbType() = 0; /** - * @brief Takes an input and an output vector each holding 1-D vectors data, and - * store. them in persistent data storage. + * @brief Takes an input and an output Tensor. * @param[in] num_elements Number of elements of each 1-D vector * @param[in] inputs Vector of 1-D vectors containing the inputs to be stored * @param[in] inputs Vector of 1-D vectors, each 1-D vectors contains @@ -124,16 +122,9 @@ class BaseDB * 'num_elements' values to be stored */ - virtual void store(size_t num_elements, - std::vector& inputs, - std::vector& outputs, - bool* predicate = nullptr) = 0; - + virtual void store(ArrayRef Inputs, + ArrayRef Outputs) = 0; - virtual void store(size_t num_elements, - std::vector& inputs, - std::vector& outputs, - bool* predicate = nullptr) = 0; uint64_t getId() const { return id; } @@ -143,7 +134,7 @@ class BaseDB virtual std::string getLatestModel() { return {}; } - virtual bool storePredicate() const { return false; } + virtual std::string getFilename() const { return ""; } }; /** @@ -210,162 +201,29 @@ class FileDB : public BaseDB this->fn = fs::absolute(Path).string(); DBG(DB, "File System DB writes to file %s", this->fn.c_str()) } -}; - - -class csvDB final : public FileDB -{ -private: - /** @brief file descriptor */ - bool writeHeader; - std::fstream fd; - - PERFFASPECT() - template - void _store(size_t num_elements, - std::vector& inputs, - std::vector& outputs) - { - DBG(DB, - "DB of type %s stores %ld elements of input/output dimensions (%lu, " - "%lu)", - type().c_str(), - num_elements, - inputs.size(), - 
outputs.size()) - - CALIPER(CALI_MARK_BEGIN("STORE_CSV");) - const size_t num_in = inputs.size(); - const size_t num_out = outputs.size(); - - if (writeHeader) { - for (size_t i = 0; i < num_in; i++) - fd << "input_" << i << ":"; - for (size_t i = 0; i < num_out - 1; i++) - fd << "output_" << i << ":"; - fd << "output_" << num_out - 1 << "\n"; - writeHeader = false; - } - - for (size_t i = 0; i < num_elements; i++) { - for (size_t j = 0; j < num_in; j++) { - fd << inputs[j][i] << ":"; - } - - for (size_t j = 0; j < num_out - 1; j++) { - fd << outputs[j][i] << ":"; - } - fd << outputs[num_out - 1][i] << "\n"; - } - CALIPER(CALI_MARK_END("STORE_CSV");) - } - - -public: - csvDB(const csvDB&) = delete; - csvDB& operator=(const csvDB&) = delete; - - /** - * @brief constructs the class and opens the file to write to - * @param[in] fn Name of the file to store data to - * @param[in] rId a unique Id for each process taking part in a distributed - * execution (rank-id) - */ - csvDB(std::string path, std::string fn, uint64_t rId) - : FileDB(path, fn, ".csv", rId) - { - writeHeader = !fs::exists(this->fn); - fd.open(this->fn, std::ios_base::app | std::ios_base::out); - if (!fd.is_open()) { - std::cerr << "Cannot open db file: " << this->fn << std::endl; - } - DBG(DB, "DB Type: %s", type().c_str()) - } - - /** - * @brief deconstructs the class and closes the file - */ - ~csvDB() - { - DBG(DB, "Closing File: %s %s", type().c_str(), this->fn.c_str()) - fd.close(); - } - - virtual void store(size_t num_elements, - std::vector& inputs, - std::vector& outputs, - bool* predicate = nullptr) override - { - CFATAL(CSV, - predicate != nullptr, - "CSV database does not support storing uq-predicates") - - _store(num_elements, inputs, outputs); - } - - virtual void store(size_t num_elements, - std::vector& inputs, - std::vector& outputs, - bool* predicate = nullptr) override - { - - CFATAL(CSV, - predicate != nullptr, - "CSV database does not support storing uq-predicates") - - 
_store(num_elements, inputs, outputs); - } - - - /** - * @brief Define the type of the DB (File, Redis etc) - */ - std::string type() override { return "csv"; } - /** - * @brief Return the DB enumerationt type (File, Redis etc) - */ - AMSDBType dbType() override { return AMSDBType::AMS_CSV; }; - - /** - * @brief Takes an input and an output vector each holding 1-D vectors data, and - * store them into a csv file delimited by ':'. This should never be used for - * large scale simulations as txt/csv format will be extremely slow. - * @param[in] num_elements Number of elements of each 1-D vector - * @param[in] inputs Vector of 1-D vectors containing the inputs to bestored - * @param[in] inputs Vector of 1-D vectors, each 1-D vectors contains - * 'num_elements' values to be stored - * @param[in] outputs Vector of 1-D vectors, each 1-D vectors contains - * 'num_elements' values to be stored - */ + std::string getFilename() const { return fn; } }; -#ifdef __ENABLE_HDF5__ +#ifdef __AMS_ENABLE_HDF5__ class hdf5DB final : public FileDB { private: /** @brief file descriptor */ hid_t HFile; - /** @brief vector holding the hdf5 dataset descriptor. - * We currently store every input on a separate dataset + /** @brief The hdf5 dataset descriptor for input data. */ - std::vector HDIsets; + hid_t HDIset; - /** @brief vector holding the hdf5 dataset descriptor. - * We currently store every output on a separate dataset + /** @brief the hdf5 dataset descriptor for output data. */ - std::vector HDOsets; - - /** @brief Total number of elements we have in our file */ - hsize_t totalElements; + hid_t HDOset; hid_t HDType; - /** @brief the dataset descriptor of the predicates */ - hid_t pSet; - - const bool predicateStore; + ams::SmallVector currentInputShape; + ams::SmallVector currentOutputShape; /** @brief create or get existing hdf5 dataset with the provided name * storing data as Ckunked pieces. 
The Chunk value controls the chunking @@ -378,6 +236,8 @@ class hdf5DB final : public FileDB */ hid_t getDataSet(hid_t group, std::string dName, + ams::SmallVector& currentShape, + const at::IntArrayRef Shape, hid_t dataType, const size_t Chunk = 1024L); @@ -389,9 +249,8 @@ class hdf5DB final : public FileDB * @param[in] numIn number of input 1-D vectors * @param[in] numOut number of output 1-D vectors */ - void createDataSets(size_t numElements, - const size_t numIn, - const size_t numOut); + void createDataSets(const at::IntArrayRef InShapes, + const at::IntArrayRef OutShapes); /** * @brief Write all the data in the vectors in the respective datasets. @@ -401,25 +260,13 @@ class hdf5DB final : public FileDB * to be written in the db. * @param[in] numElements The number of elements each vector has */ - template - void writeDataToDataset(std::vector& dsets, - std::vector& data, - size_t numElements); - /** @brief Writes a single 1-D vector to the dataset - * @param[in] dSet the dataset to write the data to - * @param[in] data the data we need to write - * @param[in] elements the number of data elements we have - * @param[in] datatype of elements we will write - */ - void writeVecToDataset(hid_t dSet, void* data, size_t elements, hid_t DType); + void writeDataToDataset(ams::MutableArrayRef currentShape, + hid_t& dset, + const at::Tensor& tensor_data); PERFFASPECT() - template - void _store(size_t num_elements, - std::vector& inputs, - std::vector& outputs, - bool* predicate = nullptr); + void _store(const at::Tensor& inputs, const at::Tensor& outputs); public: // Delete copy constructors. 
We do not want to copy the DB around @@ -436,8 +283,7 @@ class hdf5DB final : public FileDB hdf5DB(std::string path, std::string domain_name, std::string fn, - uint64_t rId, - bool predicate = false); + uint64_t rId); /** * @brief deconstructs the class and closes the file @@ -456,209 +302,20 @@ class hdf5DB final : public FileDB /** - * @brief Takes an input and an output vector each holding 1-D vectors data, - * and store them into a hdf5 file delimited by ':'. This should never be used - * for large scale simulations as txt/hdf5 format will be extremely slow. - * @param[in] num_elements Number of elements of each 1-D vector - * @param[in] inputs Vector of 1-D vectors containing the inputs to bestored - * @param[in] inputs Vector of 1-D vectors, each 1-D vectors contains - * 'num_elements' values to be stored - * @param[in] outputs Vector of 1-D vectors, each 1-D vectors contains - * 'num_elements' values to be stored + * @brief Takes an input and an output tensor each holding data, + * and stores them into a hdf5 file. + * @param[in] inputs Tensor containing the inputs to bestored + * @param[in] outputs Tensor containing the outputs to bestored */ - void store(size_t num_elements, - std::vector& inputs, - std::vector& outputs, - bool* predicate = nullptr) override; - - - /** - * @brief Takes an input and an output vector each holding 1-D vectors data, - * and store them into a hdf5 file delimited by ':'. This should never be used - * for large scale simulations as txt/hdf5 format will be extremely slow. 
- * @param[in] num_elements Number of elements of each 1-D vector - * @param[in] inputs Vector of 1-D vectors containing the inputs to bestored - * @param[in] inputs Vector of 1-D vectors, each 1-D vectors contains - * 'num_elements' values to be stored - * @param[in] outputs Vector of 1-D vectors, each 1-D vectors contains - * 'num_elements' values to be stored - */ - void store(size_t num_elements, - std::vector& inputs, - std::vector& outputs, - bool* predicate = nullptr) override; - - /** - * @brief Returns whether the DB can also store predicate information for debug - * purposes - */ - bool storePredicate() const override { return predicateStore; } + virtual void store(ArrayRef Inputs, + ArrayRef Outputs) override; }; -#endif - -#ifdef __ENABLE_REDIS__ -template -class RedisDB : public BaseDB -{ - const std::string _fn; // path to the file storing the DB access config - uint64_t _dbid; - sw::redis::Redis* _redis; - uint64_t keyId; - -public: - RedisDB(const RedisDB&) = delete; - RedisDB& operator=(const RedisDB&) = delete; - - /** - * @brief constructs the class and opens the file to write to - * @param[in] fn Name of the file to store data to - * @param[in] rId a unique Id for each process taking part in a distributed - * execution (rank-id) - */ - RedisDB(std::string fn, uint64_t rId) - : BaseDB(rId), _fn(fn), _redis(nullptr), keyId(0) - { - _dbid = reinterpret_cast(this); - auto connection_info = read_json(fn); - - sw::redis::ConnectionOptions connection_options; - connection_options.type = sw::redis::ConnectionType::TCP; - connection_options.host = connection_info["host"]; - connection_options.port = std::stoi(connection_info["service-port"]); - connection_options.password = connection_info["database-password"]; - connection_options.db = 0; // Optionnal, 0 is the default - connection_options.tls.enabled = - true; // Required to connect to PDS within LC - connection_options.tls.cacert = connection_info["cert"]; - - sw::redis::ConnectionPoolOptions 
pool_options; - pool_options.size = 100; // Pool size, i.e. max number of connections. - - _redis = new sw::redis::Redis(connection_options, pool_options); - } - - ~RedisDB() - { - std::cerr << "Deleting RedisDB object\n"; - delete _redis; - } - - inline std::string type() override { return "RedisDB"; } - - /** - * @brief Return the DB enumerationt type (File, Redis etc) - */ - AMSDBType dbType() { return AMSDBType::REDIS; }; - - - inline std::string info() { return _redis->info(); } - - // Return the number of keys in the DB - inline long long dbsize() { return _redis->dbsize(); } - - /* ! - * ! WARNING: Flush the entire Redis, accross all DBs! - * ! - */ - inline void flushall() { _redis->flushall(); } - - /* - * ! WARNING: Flush the entire current DB! - * ! - */ - inline void flushdb() { _redis->flushdb(); } - - std::unordered_map read_json(std::string fn) - { - std::ifstream config; - std::unordered_map connection_info = { - {"database-password", ""}, - {"host", ""}, - {"service-port", ""}, - {"cert", ""}, - }; - - config.open(fn, std::ifstream::in); - if (config.is_open()) { - std::string line; - // Quite inefficient parsing (to say the least..) 
but the file to parse is - // small (4 lines) - // TODO: maybe use Boost or another JSON library - while (std::getline(config, line)) { - if (line.find("{") != std::string::npos || - line.find("}") != std::string::npos) { - continue; - } - line.erase(std::remove(line.begin(), line.end(), ' '), line.end()); - line.erase(std::remove(line.begin(), line.end(), ','), line.end()); - line.erase(std::remove(line.begin(), line.end(), '"'), line.end()); - - std::string key = line.substr(0, line.find(':')); - line.erase(0, line.find(":") + 1); - connection_info[key] = line; - // std::cerr << "key=" << key << " and value=" << line << std::endl; - } - config.close(); - } else { - std::cerr << "Config located at: " << fn << std::endl; - throw std::runtime_error("Could not open Redis config file"); - } - return connection_info; - } - - void store(size_t num_elements, - std::vector& inputs, - std::vector& outputs, - bool predicate = nullptr) override - { - - CFATAL(REDIS, - predicate != nullptr, - "REDIS database does not support storing uq-predicates") - - const size_t num_in = inputs.size(); - const size_t num_out = outputs.size(); - - // TODO: - // Make insertion more efficient. 
- // Right now it's pretty naive and expensive - auto start = std::chrono::high_resolution_clock::now(); - - for (size_t i = 0; i < num_elements; i++) { - std::string key = std::to_string(_dbid) + ":" + std::to_string(keyId) + - ":" + - std::to_string(i); // In Redis a key must be a string - std::ostringstream fd; - for (size_t j = 0; j < num_in; j++) { - fd << inputs[j][i] << ":"; - } - for (size_t j = 0; j < num_out - 1; j++) { - fd << outputs[j][i] << ":"; - } - fd << outputs[num_out - 1][i]; - std::string val(fd.str()); - _redis->set(key, val); - } - - keyId += 1; - - auto stop = std::chrono::high_resolution_clock::now(); - auto duration = - std::chrono::duration_cast(stop - start); - auto nb_keys = this->dbsize(); - - std::cout << std::setprecision(2) << "Inserted " << num_elements - << " keys [Total keys = " << nb_keys << "] into RedisDB [Total " - << duration.count() << "ms, " - << static_cast(num_elements) / duration.count() - << " key/ms]" << std::endl; - } -}; +#endif -#endif // __ENABLE_REDIS__ -#ifdef __ENABLE_RMQ__ +#ifdef __AMS_ENABLE_RMQ__ +// TODO IMPLEMENT THIS AFTER everything else is working enum class ConnectionStatus { FAILED, CONNECTED, CLOSED, ERROR }; @@ -666,17 +323,15 @@ enum class ConnectionStatus { FAILED, CONNECTED, CLOSED, ERROR }; * @brief AMS represents the header as follows: * The header is 16 bytes long: * - 1 byte is the size of the header (here 16). Limit max: 255 - * - 1 byte is the precision (4 for float, 8 for double). Limit max: 255 * - 2 bytes are the MPI rank (0 if AMS is not running with MPI). Limit max: 65535 * - 2 bytes to store the size of the MSG domain name. Limit max: 65535 - * - 4 bytes are the number of elements in the message. Limit max: 2^32 - 1 - * - 2 bytes are the input dimension. Limit max: 65535 - * - 2 bytes are the output dimension. Limit max: 65535 - * - 2 bytes for padding. Limit max: 2^16 - 1 + * - 2 bytes are the number of input tensors . 
Limit max: 65535 + * - 2 bytes are the number of output tensors . Limit max: 65535 + * - 3 bytes for padding. Limit max: 2^16 - 1 * - * |_Header_|_Datatype_|___Rank___|__DomainSize__|__#elems__|___InDim____|___OutDim___|_Pad_|.real data.| + * |_Header_|___Rank___|__DomainSize__|___InDim____|___OutDim___|_Pad_|.real data.| * ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ - * | Byte 1 | Byte 2 | Byte 3-4 | Byte 5-6 |Byte 6-10 | Byte 10-12 | Byte 12-14 |-----| Byte 16-k | + * | Byte 1 | Byte 2-3 | Byte 4-5 | Byte 6-7 |Byte 8-9 | Byte 10-12 | Byte 12-14 |-----| Byte 16-k | * * where X = datatype * num_element * (InDim + OutDim). Total message size is 16+k. * @@ -689,17 +344,13 @@ enum class ConnectionStatus { FAILED, CONNECTED, CLOSED, ERROR }; struct AMSMsgHeader { /** @brief Header size (bytes) */ uint8_t hsize; - /** @brief Data type size (bytes) */ - uint8_t dtype; /** @brief MPI rank */ uint16_t mpi_rank; /** @brief Domain Name Size */ uint16_t domain_size; - /** @brief Number of elements */ - uint32_t num_elem; - /** @brief Inputs dimension */ + /** @brief Number of input tensors*/ uint16_t in_dim; - /** @brief Outputs dimension */ + /** @brief Number of ouput tensors */ uint16_t out_dim; /** @@ -711,10 +362,8 @@ struct AMSMsgHeader { */ AMSMsgHeader(size_t mpi_rank, size_t domain_size, - size_t num_elem, size_t in_dim, - size_t out_dim, - size_t type_size); + size_t out_dim); /** * @brief Constructor for AMSMsgHeader @@ -725,10 +374,8 @@ struct AMSMsgHeader { */ AMSMsgHeader(uint16_t mpi_rank, uint16_t domain_size, - uint32_t num_elem, uint16_t in_dim, - uint16_t out_dim, - uint8_t type_size); + uint16_t out_dim); /** * @brief Return the size of a header in the AMS protocol. 
@@ -736,11 +383,10 @@ struct AMSMsgHeader { */ static size_t constexpr size() { - return ((sizeof(hsize) + sizeof(dtype) + sizeof(mpi_rank) + - sizeof(domain_size) + sizeof(num_elem) + sizeof(in_dim) + - sizeof(out_dim) + sizeof(double) - 1) / - sizeof(double)) * - sizeof(double); + return ((sizeof(hsize) + sizeof(mpi_rank) + sizeof(domain_size) + + sizeof(in_dim) + sizeof(out_dim) + sizeof(float) - 1) / + sizeof(float)) * + sizeof(float); } /** @@ -758,12 +404,57 @@ struct AMSMsgHeader { static AMSMsgHeader decode(uint8_t* data_blob); }; +template +static inline size_t serialize_data(uint8_t* dest, T src) +{ + uint8_t* ptr = reinterpret_cast(&src); + for (int i = 0; i < sizeof(T); i++) { + dest[i] = ptr[i]; + } + + return sizeof(T); +} /** * @brief Class representing a message for the AMSLib */ class AMSMessage { +private: + static size_t computeSerializedSize(const torch::Tensor& tensor) + { + // First we need to store how many dimensions this tensor has. + size_t totalBytes = sizeof(size_t); + // Next we need to get the required bytes to store both shape and strides. + totalBytes += tensor.sizes().size() * sizeof(size_t) * 2; + // Next we need to store the number of bytes of this tensor. + totalBytes += sizeof(size_t); + // And finally the size of the data themselves. 
+ return totalBytes + tensor.nbytes(); + } + + static void serializeTensorHeader(const torch::Tensor& tensor, uint8_t*& blob) + { + blob += serialize_data(blob, static_cast(tensor.sizes().size())); + blob += serialize_data(blob, static_cast(tensor.nbytes())); + for (auto& V : tensor.sizes()) { + blob += serialize_data(blob, static_cast(V)); + } + for (auto& V : tensor.strides()) { + blob += serialize_data(blob, static_cast(V)); + } + } + + static void serializeTensor(const torch::Tensor& tensor, uint8_t*& blob) + { + auto start = blob; + serializeTensorHeader(tensor, blob); + auto afterHeader = blob; + std::memcpy(blob, tensor.data_ptr(), tensor.nbytes()); + blob += tensor.nbytes(); + auto afterData = blob; + } + public: /** @brief message ID */ int _id; @@ -773,8 +464,6 @@ class AMSMessage uint8_t* _data; /** @brief The total size of the binary blob in bytes */ size_t _total_size; - /** @brief The number of input/output pairs */ - size_t _num_elements; /** @brief The dimensions of inputs */ size_t _input_dim; /** @brief The dimensions of outputs */ @@ -786,7 +475,6 @@ class AMSMessage AMSMessage() : _id(0), _rank(0), - _num_elements(0), _input_dim(0), _output_dim(0), _data(nullptr), @@ -802,31 +490,39 @@ class AMSMessage * @param[in] inputs Inputs * @param[in] outputs Outputs */ - template AMSMessage(int id, uint64_t rId, std::string& domain_name, - size_t num_elements, - const std::vector& inputs, - const std::vector& outputs) + ArrayRef Inputs, + ArrayRef Outputs) : _id(id), _rank(rId), - _num_elements(num_elements), - _input_dim(inputs.size()), - _output_dim(outputs.size()), + _input_dim(Inputs.size()), + _output_dim(Outputs.size()), _data(nullptr), _total_size(0) { - CALIPER(CALI_MARK_BEGIN("AMS_MESSAGE");) - AMSMsgHeader header(_rank, - domain_name.size(), - _num_elements, - _input_dim, - _output_dim, - sizeof(TypeValue)); - - _total_size = AMSMsgHeader::size() + domain_name.size() + - getTotalElements() * sizeof(TypeValue); + SmallVector _inputs; + 
SmallVector _outputs; + auto tOptions = torch::TensorOptions() + .dtype(torch::kFloat32) + .device(c10::DeviceType::CPU); + + for (auto& tensor : Inputs) + _inputs.push_back(tensor.contiguous().to(tOptions)); + + for (auto& tensor : Outputs) + _outputs.push_back(tensor.contiguous().to(tOptions)); + + AMSMsgHeader header(_rank, domain_name.size(), _input_dim, _output_dim); + + _total_size = AMSMsgHeader::size() + domain_name.size(); + + for (auto& tensor : _inputs) + _total_size += computeSerializedSize(tensor); + for (auto& tensor : _outputs) + _total_size += computeSerializedSize(tensor); + auto& rm = ams::ResourceManager::getInstance(); _data = rm.allocate(_total_size, AMSResourceType::AMS_HOST); @@ -835,9 +531,19 @@ class AMSMessage domain_name.c_str(), domain_name.size()); current_offset += domain_name.size(); - current_offset += encode_data(_data + current_offset, inputs, outputs); - DBG(AMSMessage, "Allocated message %d: %p", _id, _data); - CALIPER(CALI_MARK_END("AMS_MESSAGE");) + + uint8_t* blob = _data + current_offset; + + + for (auto& tensor : _inputs) + serializeTensor(tensor, blob); + for (auto& tensor : _outputs) + serializeTensor(tensor, blob); + DBG(AMSMessage, + "Allocated message %d: %p with size: %ld", + _id, + _data, + reinterpret_cast(blob) - reinterpret_cast(_data)); } /** @@ -899,49 +605,6 @@ class AMSMessage return *this; } - /** - * @brief Fill a buffer with a data section starting at a given position. 
- * @param[in] data_blob The buffer to fill - * @param[in] offset Position where to start writing in the buffer - * @param[in] inputs Inputs - * @param[in] outputs Outputs - * @return The number of bytes in the message or 0 if error - */ - template - size_t encode_data(uint8_t* data_blob, - const std::vector& inputs, - const std::vector& outputs) - { - if (!data_blob) return 0; - size_t offset = 0; - - // Creating the body part of the message - for (size_t i = 0; i < _input_dim; i++) { - std::memcpy(data_blob + offset, - inputs[i], - _num_elements * sizeof(TypeValue)); - offset += (_num_elements * sizeof(TypeValue)); - } - - for (size_t i = 0; i < _output_dim; i++) { - std::memcpy(data_blob + offset, - outputs[i], - _num_elements * sizeof(TypeValue)); - offset += (_num_elements * sizeof(TypeValue)); - } - - return ((_input_dim + _output_dim) * _num_elements) * sizeof(TypeValue); - } - - /** - * @brief Return the total number of elements in this message - * @return Size in bytes of the data portion - */ - size_t getTotalElements() const - { - return (_num_elements * (_input_dim + _output_dim)); - } - /** * @brief Return the underlying data pointer * @return Data pointer (binary blob) @@ -1697,7 +1360,7 @@ class ConnectionManagerAMQP * @brief Class that manages a RabbitMQ broker and handles connection, event * loop and set up various handlers. * @details This class handles a specific type of database backend in AMSLib. - * Instead of writing inputs/outputs directly to files (CSV or HDF5), we + * Instead of writing inputs/outputs directly to files (HDF5), we * send these elements (a collection of inputs and their corresponding outputs) * to a service called RabbitMQ which is listening on a given IP and port. 
* @@ -1855,22 +1518,19 @@ class RMQInterface * @param[in] inputs A vector containing arrays of inputs, each array has num_elements elements * @param[in] outputs A vector containing arrays of outputs, each array has num_elements elements */ - template void publish(std::string& domain_name, - size_t num_elements, - std::vector& inputs, - std::vector& outputs) + ArrayRef Inputs, + ArrayRef Outputs) { + CALIPER(CALI_MARK_BEGIN("STORE_RMQ");) DBG(RMQInterface, "[tag=%d] stores %ld elements of input/output " "dimensions (%ld, %ld)", _msg_tag, - num_elements, - inputs.size(), - outputs.size()) + Inputs.size(), + Outputs.size()) - CALIPER(CALI_MARK_BEGIN("STORE_RMQ");) - AMSMessage msg(_msg_tag, _rId, domain_name, num_elements, inputs, outputs); + AMSMessage msg(_msg_tag, _rId, domain_name, Inputs, Outputs); // TODO: we could simplify the logic here // AMSMessage could directly produce a shared ptr @@ -1964,27 +1624,12 @@ class RabbitMQDB final : public BaseDB * @param[in] predicate (NOT SUPPORTED YET) Series of predicate */ PERFFASPECT() - void store(size_t num_elements, - std::vector& inputs, - std::vector& outputs, - bool* predicate = nullptr) override + virtual void store(ArrayRef Inputs, + ArrayRef Outputs) { - CFATAL(RMQDB, - predicate != nullptr, - "RMQ database does not support storing uq-predicates") - interface.publish(appDomain, num_elements, inputs, outputs); + interface.publish(appDomain, Inputs, Outputs); } - void store(size_t num_elements, - std::vector& inputs, - std::vector& outputs, - bool* predicate = nullptr) override - { - CFATAL(RMQDB, - predicate != nullptr, - "RMQ database does not support storing uq-predicates") - interface.publish(appDomain, num_elements, inputs, outputs); - } /** * @brief Return the type of this broker @@ -2019,7 +1664,7 @@ class RMQInterface void close() {} }; -#endif // __ENABLE_RMQ__ +#endif // __AMS_ENABLE_RMQ__ class FilesystemInterface { @@ -2064,7 +1709,7 @@ class FilesystemInterface class DBManager { -#ifdef 
__ENABLE_RMQ__ +#ifdef __AMS_ENABLE_RMQ__ friend RabbitMQDB; #endif @@ -2075,7 +1720,7 @@ class DBManager /** @brief If True, the DB is allowed to update the surrogate model */ bool updateSurrogate; - DBManager() : dbType(AMSDBType::AMS_NONE), updateSurrogate(false) {}; + DBManager() : dbType(AMSDBType::AMS_NONE), updateSurrogate(false){}; protected: RMQInterface rmq_interface; @@ -2129,18 +1774,12 @@ class DBManager std::shared_ptr createDB(std::string& domainName, std::string& dbLabel, AMSDBType dbType, - uint64_t rId = 0, - bool isDebug = false) + uint64_t rId = 0) { - CWARNING(DBManager, - (isDebug && dbType != AMSDBType::AMS_HDF5), - "Requesting debug database but %d db type does not support it", - dbType); -#ifdef __ENABLE_DB__ + DBG(DBManager, "Instantiating data base"); - if ((dbType == AMSDBType::AMS_CSV || dbType == AMSDBType::AMS_HDF5) && - !fs_interface.isConnected()) { + if ((dbType == AMSDBType::AMS_HDF5) && !fs_interface.isConnected()) { THROW(std::runtime_error, "File System is not configured, Please specify output directory"); } else if (dbType == AMSDBType::AMS_RMQ && !rmq_interface.isConnected()) { @@ -2148,14 +1787,14 @@ class DBManager } switch (dbType) { - case AMSDBType::AMS_CSV: - return std::make_shared(fs_interface.path(), dbLabel, rId); -#ifdef __ENABLE_HDF5__ +#ifdef __AMS_ENABLE_HDF5__ case AMSDBType::AMS_HDF5: - return std::make_shared( - fs_interface.path(), domainName, dbLabel, rId, isDebug); + return std::make_shared(fs_interface.path(), + domainName, + dbLabel, + rId); #endif -#ifdef __ENABLE_RMQ__ +#ifdef __AMS_ENABLE_RMQ__ case AMSDBType::AMS_RMQ: return std::make_shared(rmq_interface, domainName, @@ -2165,7 +1804,6 @@ class DBManager default: return nullptr; } -#endif return nullptr; } @@ -2180,8 +1818,7 @@ class DBManager */ std::shared_ptr getDB(std::string& domainName, std::string& dbLabel, - uint64_t rId = 0, - bool isDebug = false) + uint64_t rId = 0) { DBG(DBManager, "Requested DB for domain: '%s' Under Name: '%s' DB 
Configured to " @@ -2199,7 +1836,7 @@ class DBManager auto db_iter = db_instances.find(std::string(key)); if (db_iter == db_instances.end()) { - auto db = createDB(domainName, dbLabel, dbType, rId, isDebug); + auto db = createDB(domainName, dbLabel, dbType, rId); db_instances.insert(std::make_pair(std::string(domainName), db)); DBG(DBManager, "Creating new Database writting to file: %s", @@ -2270,7 +1907,7 @@ class DBManager rmq_cert.c_str()); dbType = AMSDBType::AMS_RMQ; updateSurrogate = update_surrogate; -#ifdef __ENABLE_RMQ__ +#ifdef __AMS_ENABLE_RMQ__ rmq_interface.connect(rmq_user, rmq_pass, rmq_vhost, @@ -2287,6 +1924,9 @@ class DBManager "enabled") #endif } + + size_t getNumInstances() const { return db_instances.size(); } + void clean() { db_instances.clear(); } }; } // namespace db diff --git a/src/AMSlib/wf/cuda/utilities.cpp b/src/AMSlib/wf/cuda/utilities.cpp deleted file mode 100644 index 0af826da..00000000 --- a/src/AMSlib/wf/cuda/utilities.cpp +++ /dev/null @@ -1,768 +0,0 @@ -/* - * Copyright 2021-2023 Lawrence Livermore National Security, LLC and other - * AMSLib Project Developers - * - * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception - */ - -#ifndef __DEVICE_UTILITIES__ -#define __DEVICE_UTILITIES__ - - -#include -#include -#include -#include -#include - -#include - -#include "wf/device.hpp" -#include "wf/resource_manager.hpp" - -namespace ams -{ -namespace Device -{ - -const int warpSize = 32; -const unsigned int fullMask = 0xffffffff; - -__host__ int divup(int x, int y) { return (x + y - 1) / y; } - -__device__ __inline__ int pow2i(int e) { return 1 << e; } - -// Define this to turn on error checking -#define CUDA_ERROR_CHECK - -#define CUDASAFECALL(err) __cudaSafeCall(err, __FILE__, __LINE__) -#define CUDACHECKERROR() __cudaCheckError(__FILE__, __LINE__) - -inline void __cudaSafeCall(cudaError err, const char* file, const int line) -{ -#ifdef CUDA_ERROR_CHECK - if (cudaSuccess != err) { - fprintf(stderr, - "cudaSafeCall() failed at 
%s:%i : %s\n", - file, - line, - cudaGetErrorString(err)); - - fprintf(stdout, - "cudaSafeCall() failed at %s:%i : %s\n", - file, - line, - cudaGetErrorString(err)); - exit(-1); - } -#endif - - return; -} - -struct is_true { - __host__ __device__ bool operator()(const int x) { return x; } -}; - -struct is_false { - __host__ __device__ bool operator()(const int x) { return !x; } -}; - - -inline void __cudaCheckError(const char* file, const int line) -{ -#ifdef CUDA_ERROR_CHECK - cudaError err = cudaGetLastError(); - if (cudaSuccess != err) { - fprintf(stderr, - "cudaCheckError() failed at %s:%i : %s\n", - file, - line, - cudaGetErrorString(err)); - exit(-1); - } - - // More careful checking. However, this will affect performance. - // Comment away if needed. - err = cudaDeviceSynchronize(); - if (cudaSuccess != err) { - fprintf(stderr, - "cudaCheckError() with sync failed at %s:%i : %s\n", - file, - line, - cudaGetErrorString(err)); - exit(-1); - } -#endif - - return; -} - -__global__ void srand_dev(curandState* states, const int total_threads) -{ - int id = threadIdx.x + blockDim.x * blockIdx.x; - if (id < total_threads) { - int seed = id; // different seed per thread - curand_init(seed, id, 0, &states[id]); - } -} - -__global__ void initIndices(int* ind, int length) -{ - int id = threadIdx.x + blockDim.x * blockIdx.x; - if (id < length) ind[id] = id; -} - -template -__global__ void fillRandom(bool* predicate, - const int total_threads, - curandState* states, - const size_t length, - T threshold) -{ - int id = threadIdx.x + blockDim.x * blockIdx.x; - if (id < total_threads) { - for (int i = id; i < length; i += total_threads) { - float x = curand_uniform(&states[id]); - predicate[i] = (x <= threshold); - } - } -} - -template -__global__ void computeBlockCounts(bool cond, - T* d_input, - int length, - int* d_BlockCounts) -{ - int idx = threadIdx.x + blockIdx.x * blockDim.x; - if (idx < length) { - int pred = (d_input[idx] == cond); - int BC = 
__syncthreads_count(pred); - - if (threadIdx.x == 0) { - d_BlockCounts[blockIdx.x] = - BC; // BC will contain the number of valid elements in all threads of this thread block - } - } -} - -template -__global__ void assignK(T** sparse, - T** dense, - int* indices, - size_t length, - int dims, - bool isReverse) -{ - int idx = threadIdx.x + blockIdx.x * blockDim.x; - if (idx < length) { - int index = indices[idx]; - if (!isReverse) { - for (int i = 0; i < dims; i++) { - dense[i][idx] = sparse[i][index]; - } - } else { - for (int i = 0; i < dims; i++) { - sparse[i][index] = dense[i][idx]; - } - } - } -} - -template -__global__ void device_compactK(bool cond, - T** d_input, - T** d_output, - const bool* predicates, - const size_t length, - int dims, - int* d_BlocksOffset, - bool reverse) -{ - int idx = threadIdx.x + blockIdx.x * blockDim.x; - extern __shared__ int warpTotals[]; - if (idx < length) { - int pred = (predicates[idx] == cond); - int w_i = threadIdx.x / warpSize; //warp index - int w_l = idx % warpSize; //thread index within a warp - - // compute exclusive prefix sum based on predicate validity to get output offset for thread in warp - int t_m = fullMask >> (warpSize - w_l); //thread mask -#if (CUDART_VERSION < 9000) - int b = __ballot(pred) & t_m; //ballot result = number whose ith bit - //is one if the ith's thread pred is true - //masked up to the current index in warp -#else - int b = __ballot_sync(fullMask, pred) & t_m; -#endif - int t_u = __popc( - b); // popc count the number of bit one. 
simply count the number predicated true BEFORE MY INDEX - - // last thread in warp computes total valid counts for the warp - if (w_l == warpSize - 1) { - warpTotals[w_i] = t_u + pred; - } - - // need all warps in thread block to fill in warpTotals before proceeding - __syncthreads(); - - // first numWarps threads in first warp compute exclusive prefix sum to get output offset for each warp in thread block - int numWarps = blockDim.x / warpSize; - unsigned int numWarpsMask = fullMask >> (warpSize - numWarps); - if (w_i == 0 && w_l < numWarps) { - int w_i_u = 0; - for (int j = 0; j <= 5; j++) { -#if (CUDART_VERSION < 9000) - int b_j = __ballot( - warpTotals[w_l] & - pow2i(j)); //# of the ones in the j'th digit of the warp offsets -#else - int b_j = __ballot_sync(numWarpsMask, warpTotals[w_l] & pow2i(j)); -#endif - w_i_u += (__popc(b_j & t_m)) << j; - } - warpTotals[w_l] = w_i_u; - } - - // need all warps in thread block to wait until prefix sum is calculated in warpTotals - __syncthreads(); - - // if valid element, place the element in proper destination address based on thread offset in warp, warp offset in block, and block offset in grid - if (pred) { - if (!reverse) { - for (int i = 0; i < dims; i++) - d_output[i][t_u + warpTotals[w_i] + d_BlocksOffset[blockIdx.x]] = - d_input[i][idx]; - } else { - for (int i = 0; i < dims; i++) - d_input[i][idx] = - d_output[i][t_u + warpTotals[w_i] + d_BlocksOffset[blockIdx.x]]; - } - } - } -} - - -template -void __global__ linearizeK(TypeOutValue* output, - const TypeInValue* const* inputs, - size_t dims, - size_t elements) -{ - int idx = threadIdx.x + blockIdx.x * blockDim.x; - if (idx >= elements) return; - - for (int i = 0; i < dims; i++) { - output[idx * dims + i] = static_cast(inputs[i][idx]); - } -} - - -void __global__ compute_predicate(float* data, - bool* predicate, - size_t nData, - const size_t kneigh, - float threshold) -{ - int idx = threadIdx.x + blockIdx.x * blockDim.x; - if (idx >= nData) return; - - int index 
= idx * kneigh; - float acc = 0.0f; - for (int i = 0; i < kneigh; i++) { - acc += data[index + i]; - } - - acc /= static_cast(kneigh); - - bool pred = acc < threshold ? true : false; - - predicate[idx] = pred; -} - -template -int device_compact(bool cond, - const T** sparse, - T** dense, - const bool* dPredicate, - const size_t length, - int dims, - int blockSize, - bool isReverse) -{ - int numBlocks = divup(length, blockSize); - auto& rm = ams::ResourceManager::getInstance(); - int* d_BlocksCount = rm.allocate(numBlocks, AMSResourceType::AMS_DEVICE); - int* d_BlocksOffset = - rm.allocate(numBlocks, AMSResourceType::AMS_DEVICE); - // determine number of elements in the compacted list - int* h_BlocksCount = rm.allocate(numBlocks, AMSResourceType::AMS_HOST); - int* h_BlocksOffset = rm.allocate(numBlocks, AMSResourceType::AMS_HOST); - - T** d_dense = rm.allocate(dims, AMSResourceType::AMS_DEVICE); - T** d_sparse = rm.allocate(dims, AMSResourceType::AMS_DEVICE); - - - rm.copy(dense, - AMSResourceType::AMS_HOST, - d_dense, - AMSResourceType::AMS_DEVICE, - dims); - rm.copy(const_cast(sparse), - AMSResourceType::AMS_HOST, - d_sparse, - AMSResourceType::AMS_DEVICE, - dims); - - thrust::device_ptr thrustPrt_bCount(d_BlocksCount); - thrust::device_ptr thrustPrt_bOffset(d_BlocksOffset); - - //phase 1: count number of valid elements in each thread block - computeBlockCounts<<>>(cond, - dPredicate, - length, - d_BlocksCount); - - //phase 2: compute exclusive prefix sum of valid block counts to get output offset for each thread block in grid - thrust::exclusive_scan(thrust::device, - d_BlocksCount, - d_BlocksCount + numBlocks, - d_BlocksOffset); - - //phase 3: compute output offset for each thread in warp and each warp in thread block, then output valid elements - device_compactK<<>>(cond, - d_sparse, - d_dense, - dPredicate, - length, - dims, - d_BlocksOffset, - isReverse); - cudaDeviceSynchronize(); - CUDACHECKERROR(); - - rm.copy(d_BlocksCount, - AMSResourceType::AMS_DEVICE, 
- h_BlocksCount, - AMSResourceType::AMS_HOST, - numBlocks); - rm.copy(d_BlocksOffset, - AMSResourceType::AMS_DEVICE, - h_BlocksOffset, - AMSResourceType::AMS_HOST, - numBlocks); - - int compact_length = - h_BlocksOffset[numBlocks - 1] + thrustPrt_bCount[numBlocks - 1]; - - rm.deallocate(d_BlocksCount, AMSResourceType::AMS_DEVICE); - rm.deallocate(d_BlocksOffset, AMSResourceType::AMS_DEVICE); - - rm.deallocate(h_BlocksCount, AMSResourceType::AMS_HOST); - rm.deallocate(h_BlocksOffset, AMSResourceType::AMS_HOST); - - rm.deallocate(d_dense, AMSResourceType::AMS_DEVICE); - rm.deallocate(d_sparse, AMSResourceType::AMS_DEVICE); - - cudaDeviceSynchronize(); - CUDACHECKERROR(); - - return compact_length; -} - -template -int device_compact(bool cond, - T** sparse, - T** dense, - int* indices, - const size_t length, - int dims, - int blockSize, - const bool* dPredicate, - bool isReverse) -{ - int numBlocks = divup(length, blockSize); - size_t sparseElements = length; - - if (!isReverse) { - initIndices<<>>(indices, length); - if (cond) { - auto last = thrust::copy_if(thrust::device, - indices, - indices + sparseElements, - dPredicate, - indices, - is_true()); - sparseElements = last - indices; - } else { - auto last = thrust::copy_if(thrust::device, - indices, - indices + sparseElements, - dPredicate, - indices, - is_false()); - sparseElements = last - indices; - } - } - - assignK<<>>( - sparse, dense, indices, sparseElements, dims, isReverse); - cudaDeviceSynchronize(); - CUDACHECKERROR(); - - return sparseElements; -} - -template -void device_linearize(TypeOutValue* output, - const TypeInValue* const* inputs, - size_t dims, - size_t elements) -{ - // TODO: Fix "magic number". - const int NT = 256; - // TODO: We should add a max number of blocks typically this should be around 3K. 
- int NB = (elements + NT - 1) / NT; - DBG(Device, - "Linearize using %ld blocks %ld threads to transpose %ld, %ld matrix", - NB, - NT, - dims, - elements); - - linearizeK<<>>(output, inputs, dims, elements); - cudaDeviceSynchronize(); - CUDACHECKERROR(); -} - -template -void cuda_rand_init(bool* predicate, const size_t length, T threshold) -{ - static curandState* dev_random = NULL; - const int TS = 4096; - const int BS = 128; - int numBlocks = divup(TS, BS); - auto& rm = ams::ResourceManager::getInstance(); - if (!dev_random) { - dev_random = rm.allocate(4096, AMSResourceType::AMS_DEVICE); - srand_dev<<>>(dev_random, TS); - } - - DBG(Device, - "Random Fill using %ld blocks %ld threads to randomly initialize %ld " - "elements", - numBlocks, - BS, - length); - fillRandom<<>>(predicate, TS, dev_random, length, threshold); - cudaDeviceSynchronize(); - CUDACHECKERROR(); -} - - -void device_compute_predicate(float* data, - bool* predicate, - size_t nData, - const size_t kneigh, - float threshold) -{ - const int NT = 256; - int NB = (nData + NT - 1) / NT; - DBG(Device, - "Compute predicate for %d elements with threshold %f", - nData, - threshold); - compute_predicate<<>>(data, predicate, nData, kneigh, threshold); - cudaDeviceSynchronize(); - CUDACHECKERROR(); -} - -__global__ void random_uq_device(int seed, - bool* uq_flags, - int ndata, - double acceptable_error) -{ - - /* CUDA's random number library uses curandState_t to keep track of the seed - value we will store a random state for every thread */ - curandState_t state; - int id = threadIdx.x + blockDim.x * blockIdx.x; - - if (id >= ndata) return; - - /* we have to initialize the state */ - curand_init( - seed + - id, /* the seed controls the sequence of random values that are produced */ - 0, /* the sequence number is only important with multiple cores */ - 0, /* the offset is how much extra we advance in the sequence for each - call, can be 0 */ - &state); - - float x = curand_uniform(&state); - uq_flags[id] = 
(x <= acceptable_error); -} - - -template -__global__ void computeDeltaUQMeanPredicatesKernel( - const scalar_t* __restrict__ outputs_stdev, - bool* __restrict__ predicates, - const size_t nrows, - const size_t ncols, - const double threshold) -{ - - size_t idx = blockDim.x * blockIdx.x + threadIdx.x; - size_t stride = blockDim.x * gridDim.x; - // Compute mean over columns, strided loop. - for (size_t i = idx; i < nrows; i += stride) { - double mean = 0.0; - for (size_t j = 0; j < ncols; ++j) - mean += outputs_stdev[j + i * ncols]; - mean /= ncols; - - predicates[i] = (mean < threshold); - } -} - -template -__global__ void computeDeltaUQMaxPredicatesKernel( - const scalar_t* __restrict__ outputs_stdev, - bool* __restrict__ predicates, - const size_t nrows, - const size_t ncols, - const double threshold) -{ - - size_t idx = blockDim.x * blockIdx.x + threadIdx.x; - size_t stride = blockDim.x * gridDim.x; - // Compute max delta uq over columns, strided loop. - for (size_t i = idx; i < nrows; i += stride) { - predicates[i] = true; - for (size_t j = 0; j < ncols; ++j) - if (outputs_stdev[j + i * ncols] >= threshold) { - predicates[i] = false; - break; - } - } -} - - -template -void rand_init(bool* predicate, const size_t n, TypeValue threshold) -{ - cuda_rand_init(predicate, n, threshold); - return; -} - -template -void computeDeltaUQMeanPredicatesDevice( - const scalar_t* __restrict__ outputs_stdev, - bool* __restrict__ predicates, - const size_t nrows, - const size_t ncols, - const double threshold) -{ - constexpr int block_size = 256; - int grid_size = divup(nrows, block_size); - computeDeltaUQMeanPredicatesKernel<<>>( - outputs_stdev, predicates, nrows, ncols, threshold); - cudaDeviceSynchronize(); - CUDACHECKERROR(); -}; - -template -void computeDeltaUQMaxPredicatesDevice( - const scalar_t* __restrict__ outputs_stdev, - bool* __restrict__ predicates, - const size_t nrows, - const size_t ncols, - const double threshold) -{ - constexpr int block_size = 256; - int 
grid_size = divup(nrows, block_size); - computeDeltaUQMaxPredicatesKernel<<>>( - outputs_stdev, predicates, nrows, ncols, threshold); - cudaDeviceSynchronize(); - CUDACHECKERROR(); -} - - -// Specializations - -template void computeDeltaUQMaxPredicatesDevice( - const float* __restrict__ outputs_stdev, - bool* __restrict__ predicates, - const size_t nrows, - const size_t ncols, - const double threshold); - -template void computeDeltaUQMaxPredicatesDevice( - const double* __restrict__ outputs_stdev, - bool* __restrict__ predicates, - const size_t nrows, - const size_t ncols, - const double threshold); - -template void computeDeltaUQMeanPredicatesDevice( - const float* __restrict__ outputs_stdev, - bool* __restrict__ predicates, - const size_t nrows, - const size_t ncols, - const double threshold); - -template void computeDeltaUQMeanPredicatesDevice( - const double* __restrict__ outputs_stdev, - bool* __restrict__ predicates, - const size_t nrows, - const size_t ncols, - const double threshold); - -template void cuda_rand_init(bool* predicate, - const size_t length, - float threshold); - -template void cuda_rand_init(bool* predicate, - const size_t length, - double threshold); - - -template void device_linearize(float* output, - const float* const* inputs, - size_t dims, - size_t elements); - -template void device_linearize(double* output, - const float* const* inputs, - size_t dims, - size_t elements); - -template void device_linearize(double* output, - const double* const* inputs, - size_t dims, - size_t elements); - -template void device_linearize(float* output, - const double* const* inputs, - size_t dims, - size_t elements); - -template int device_compact(bool cond, - const double** sparse, - double** dense, - const bool* dPredicate, - const size_t length, - int dims, - int blockSize, - bool isReverse); - -template int device_compact(bool cond, - const float** sparse, - float** dense, - const bool* dPredicate, - const size_t length, - int dims, - int blockSize, - 
bool isReverse); - -template int device_compact(bool cond, - double** sparse, - double** dense, - int* indices, - const size_t length, - int dims, - int blockSize, - const bool* dPredicate, - bool isReverse); - -template int device_compact(bool cond, - float** sparse, - float** dense, - int* indices, - const size_t length, - int dims, - int blockSize, - const bool* dPredicate, - bool isReverse); - - -template void rand_init(bool* predicate, - const size_t n, - double threshold); - -template void rand_init(bool* predicate, - const size_t n, - float threshold); - -} // namespace Device - - -void DtoDMemcpy(void* dest, void* src, size_t nBytes) -{ - cudaMemcpy(dest, src, nBytes, cudaMemcpyDeviceToDevice); -} - -void HtoHMemcpy(void* dest, void* src, size_t nBytes) -{ - std::memcpy(dest, src, nBytes); -} - -void HtoDMemcpy(void* dest, void* src, size_t nBytes) -{ - cudaMemcpy(dest, src, nBytes, cudaMemcpyHostToDevice); -}; - -void DtoHMemcpy(void* dest, void* src, size_t nBytes) -{ - cudaMemcpy(dest, src, nBytes, cudaMemcpyDeviceToHost); -} - -void* DeviceAllocate(size_t nBytes) -{ - void* devPtr; - cudaMalloc(&devPtr, nBytes); - return devPtr; -} - -void DeviceFree(void* ptr) -{ - cudaFree(ptr); - return; -} - -void* DevicePinnedAlloc(size_t nBytes) -{ - void* ptr; - cudaHostAlloc(&ptr, nBytes, cudaHostAllocPortable); - return ptr; -} - -void DeviceFreePinned(void* ptr) { cudaFreeHost(ptr); } - - -void deviceCheckErrors(const char* file, int line) -{ - ams::Device::__cudaCheckError(file, line); -} - -void device_random_uq(int seed, - bool* uq_flags, - int ndata, - double acceptable_error) -{ - size_t block_size = 256; - size_t blocks = ams::Device::divup(ndata, block_size); - ams::Device::random_uq_device<<>>(seed, - uq_flags, - ndata, - acceptable_error); -} - - -} // namespace ams - -#endif diff --git a/src/AMSlib/wf/data_handler.hpp b/src/AMSlib/wf/data_handler.hpp deleted file mode 100644 index d03405b0..00000000 --- a/src/AMSlib/wf/data_handler.hpp +++ /dev/null 
@@ -1,353 +0,0 @@ -/* - * Copyright 2021-2023 Lawrence Livermore National Security, LLC and other - * AMSLib Project Developers - * - * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception - */ - -#ifndef __AMS_UTILS_DATA_HPP__ -#define __AMS_UTILS_DATA_HPP__ - -#include -#include -#include - -#include "wf/device.hpp" -#include "wf/resource_manager.hpp" -#include "wf/utils.hpp" - -namespace ams -{ -/** - * @brief A "utility" class that transforms data into - * various formats. For example moving from sparse to dense - * representations - */ -template -class DataHandler -{ - -public: - /* @brief Casts C-vector from one type to another type. - * - * This function uses tempalte metaprogramming. When both - * the class Templated 'TypeValue' and the functions template - * 'TypeInValue' have the same type, we return directly - * the same memory. - * - * @tparam TypeInValue Type of the source value. - * @param[in] n The number of elements of the vector. - * @param[in] data A pointer pointing to the C-vector to be casted. - * @return A pointer to a C-vector containing the casted values. - * - */ - template < - class TypeInValue, - std::enable_if_t::value>* = nullptr> - static inline TypeValue* cast_to_typevalue(AMSResourceType resource, - const size_t n, - TypeInValue* data) - { - return data; - } - - /* @brief Casts C-vector from one type to another type. - * - * This function uses tempalte metaprogramming. Both - * the class Templated 'TypeValue' and the functions template - * 'TypeInValue' have a different type, thus we allocate a new - * vector of 'TypeInValue' type and we cast each element of the vector - * to the desired ('TypeValue') type. - * - * @tparam TypeInValue Type of the source value. - * @param[in] n The number of elements of the vector. - * @param[in] data A pointer pointing to the C-vector to be casted. - * @return A pointer to a C-vector containing the casted values. 
- * - */ - template < - typename TypeInValue, - std::enable_if_t::value>* = nullptr> - static inline TypeValue* cast_to_typevalue(AMSResourceType resource, - const size_t n, - TypeInValue* data) - { - auto& rm = ams::ResourceManager::getInstance(); - TypeValue* fdata = rm.allocate(resource, n); - std::transform(data, data + n, fdata, [&](const TypeInValue& v) { - return static_cast(v); - }); - return fdata; - } - - /* @brief Casts all elements of a C-vector from one type to - * the other type and stores them to the 'dest' vector. - * - * This function uses tempalte metaprogramming. In this function - * template datatypes match, thus we just copy data from - * one vector to another. - * - * @tparam TypeInValue Type of the source value. - * @param[in] n The number of elements of the vectors. - * @param[out] dest The destination vector. - * @param[in] src The source vector. - * @return A pointer to a C-vector containing the casted values. - */ - template < - typename TypeInValue, - std::enable_if_t::value>* = nullptr> - static inline void cast_from_typevalue(const size_t n, - TypeInValue* dest, - TypeValue* src) - { - std::transform(src, src + n, dest, [&](const TypeInValue& v) { return v; }); - } - - /* @brief Casts all elements of a C-vector from one type to - * the other type and stores them to the 'dest' vector. - * - * This function uses tempalte metaprogramming. In this function - * template datatypes do not match, thus we cast each element - * and store it to destination vector - * - * @tparam TypeInValue Type of the source value. - * @param[in] n The number of elements of the vectors. - * @param[out] dest The destination vector. - * @param[in] src The source vector. - * @return A pointer to a C-vector containing the casted values. 
- */ - template < - typename TypeInValue, - std::enable_if_t::value>* = nullptr> - static inline void cast_from_typevalue(const size_t n, - TypeInValue* dest, - TypeValue* src) - { - std::transform(src, src + n, dest, [&](const TypeInValue& v) { - return static_cast(v); - }); - } - - /* @brief linearize all elements of a vector of C-vectors - * in a single C-vector. Data are transposed. - * - * @tparam TypeInValue Type of the source value. - * @param[in] n The number of elements of the vectors. - * @param[in] features A vector containing C-vector of feature values. - * @return A pointer to a C-vector containing the linearized values. The - * C-vector is_same resident in the same device as the input feature pointers. - */ - template - PERFFASPECT() - static inline TypeValue* linearize_features( - AMSResourceType resource, - const size_t n, - const std::vector& features) - { - - const size_t nfeatures = features.size(); - const size_t nvalues = n * nfeatures; - - auto& rm = ams::ResourceManager::getInstance(); - TypeValue* data = rm.allocate(nvalues, resource); - - if (resource == AMSResourceType::AMS_HOST) { - for (size_t d = 0; d < nfeatures; d++) { - for (size_t i = 0; i < n; i++) { - data[i * nfeatures + d] = static_cast(features[d][i]); - } - } - } else { - ams::Device::linearize(data, features.data(), nfeatures, n); - } - return data; - } - - /* @brief The function stores all elements of the sparse - * vector in the dense vector if the respective index - * of the predicate vector is equal to 'denseVal. - * - * @param[in] dataLocation Location of the data - * @param[in] predicate A boolean vector storing which elements in the vector - * should be dropped. - * @param[in] n The number of elements of the C-vectors. 
- * @param[in] sparse A vector containing C-vectors whose elements will be - * dropped - * @param[out] dense A vector containing C-vectors with the remaining elements - * @param[in] denseVal The condition the predicate needs to meet for the index - * to be stored in the dense vector - * @return Total number of elements stored in the dense vector - * */ - PERFFASPECT() - static inline size_t pack(AMSResourceType dataLocation, - const bool* predicate, - const size_t n, - std::vector& sparse, - std::vector& dense, - bool denseVal = false) - { - if (sparse.size() != dense.size()) - throw std::invalid_argument("Packing arrays size mismatch"); - - size_t npacked = 0; - size_t dims = sparse.size(); - - if (dataLocation != AMSResourceType::AMS_DEVICE) { - for (size_t i = 0; i < n; i++) { - if (predicate[i] == denseVal) { - for (size_t j = 0; j < dims; j++) - dense[j][npacked] = sparse[j][i]; - npacked++; - } - } - } else { - npacked = ams::Device::pack(denseVal, - predicate, - n, - static_cast(sparse.data()), - dense.data(), - dims); - } - return npacked; - } - - /* @brief The function stores all elements from the dense - * vector to the sparse vector. - * - * @param[in] dataLocation Location of the data - * @param[in] predicate A boolean vector storing which elements in the vector - * should be kept. - * @param[in] n The number of elements of the C-vectors. - * dropped - * @param[in] dense A vector containing C-vectors with elements - * to be stored in the sparse vector - * @param[out] sparse A vector containing C-vectors whose elements will be - * @param[in] denseVal The condition the predicate needs to meet for the index - * to be copied to the sparse vectors. 
- * */ - PERFFASPECT() - static inline void unpack(AMSResourceType dataLocation, - const bool* predicate, - const size_t n, - std::vector& dense, - std::vector& sparse, - bool denseVal = false) - { - - if (sparse.size() != dense.size()) - throw std::invalid_argument("Packing arrays size mismatch"); - - size_t npacked = 0; - size_t dims = sparse.size(); - if (dataLocation != AMSResourceType::AMS_DEVICE) { - for (size_t i = 0; i < n; i++) { - if (predicate[i] == denseVal) { - for (size_t j = 0; j < dims; j++) - sparse[j][i] = dense[j][npacked]; - npacked++; - } - } - } else { - npacked = ams::Device::unpack( - denseVal, predicate, n, sparse.data(), dense.data(), dims); - } - return; - } - - /* @brief The function stores all elements of the sparse - * vector in the dense vector if the respective index - * of the predicate vector is equal to 'denseVal. - * - * @param[in] dataLocation Location of the data - * @param[in] predicate A boolean vector storing which elements in the vector - * @param[out] sparse_indices A vector storing the mapping from dense elements - * to sparse elements. - * @param[in] n The number of elements of the C-vectors. 
- * @param[in] sparse A vector containing C-vectors whose elements will be - * dropped - * @param[out] dense A vector containing C-vectors with the remaining elements - * @param[in] denseVal The condition the predicate needs to meet for the index - * to be stored in the dense vector - * @return Total number of elements stored in the dense vector - * */ - PERFFASPECT() - static inline size_t pack(AMSResourceType dataLocation, - const bool* predicate, - int* sparse_indices, - const size_t n, - std::vector& sparse, - std::vector& dense, - bool denseVal = false) - { - - if (sparse.size() != dense.size()) - throw std::invalid_argument("Packing arrays size mismatch"); - - size_t npacked = 0; - int dims = sparse.size(); - - if (dataLocation != AMSResourceType::AMS_DEVICE) { - for (size_t i = 0; i < n; i++) { - if (predicate[i] == denseVal) { - for (size_t j = 0; j < dims; j++) - dense[j][npacked] = sparse[j][i]; - sparse_indices[npacked++] = i; - } - } - } else { - npacked = ams::Device::pack(denseVal, - predicate, - n, - sparse.data(), - dense.data(), - sparse_indices, - dims); - } - - return npacked; - } - - /* @brief The function copies all elements from the dense - * vector to the sparse vector. - * - * @param[in] dataLocation Location of the data - * @param[in] sparse_indices A vector storing the mapping from sparse to - * dense. - * @param[in] n The number of elements of the C-vectors. - * dropped - * @param[in] dense A vector containing C-vectors with elements - * to be stored in the sparse vector - * @param[out] sparse A vector containing C-vectors whose elements will be - * @param[in] denseVal The condition the predicate needs to meet for the index - * to be copied to the sparse vectors. 
- * */ - PERFFASPECT() - static inline void unpack(AMSResourceType dataLocation, - int* sparse_indices, - const size_t nPacked, - std::vector& dense, - std::vector& sparse, - bool denseVal = false) - { - - if (sparse.size() != dense.size()) - throw std::invalid_argument("Packing arrays size mismatch"); - - int dims = sparse.size(); - - if (dataLocation != AMSResourceType::AMS_DEVICE) { - for (size_t i = 0; i < nPacked; i++) - for (size_t j = 0; j < dims; j++) - sparse[j][sparse_indices[i]] = dense[j][i]; - } else { - ams::Device::unpack( - denseVal, nPacked, sparse.data(), dense.data(), sparse_indices, dims); - } - - return; - } -}; -} // namespace ams - -// ----------------------------------------------------------------------------- -#endif diff --git a/src/AMSlib/wf/device.hpp b/src/AMSlib/wf/device.hpp index d197bd21..893075d3 100644 --- a/src/AMSlib/wf/device.hpp +++ b/src/AMSlib/wf/device.hpp @@ -18,7 +18,7 @@ #define UNDEFINED_FUNC -1 -#ifdef __ENABLE_CUDA__ +#ifdef __AMS_ENABLE_CUDA__ namespace ams { void DtoDMemcpy(void *dest, void *src, size_t nBytes); @@ -44,291 +44,7 @@ void device_random_uq(int seed, int ndata, double acceptable_error); -namespace Device -{ - -template -void computeDeltaUQMeanPredicatesDevice( - const scalar_t *__restrict__ outputs_stdev, - bool *__restrict__ predicates, - const size_t nrows, - const size_t ncols, - const double threshold); - - -template -void computeDeltaUQMaxPredicatesDevice( - const scalar_t *__restrict__ outputs_stdev, - bool *__restrict__ predicates, - const size_t nrows, - const size_t ncols, - const double threshold); - -void device_compute_predicate(float *data, - bool *predicate, - size_t nData, - const size_t kneigh, - float threshold); - -template -PERFFASPECT() -void rand_init(bool *predicate, const size_t n, TypeValue threshold); - -template -void device_linearize(TypeOutValue *output, - const TypeInValue *const *inputs, - size_t dims, - size_t elements); - -template -int device_compact(bool cond, - const 
T **sparse, - T **dense, - const bool *dPredicate, - const size_t length, - int dims, - int blockSize, - bool isReverse = false); - -template -int device_compact(bool cond, - T **sparse, - T **dense, - int *indices, - const size_t length, - int dims, - int blockSize, - const bool *dPredicate, - bool isReverse = false); - - -PERFFASPECT() -inline void computePredicate(float *data, - bool *predicate, - size_t nData, - const size_t kneigh, - float threshold) -{ - return device_compute_predicate(data, predicate, nData, kneigh, threshold); -} - - -template -PERFFASPECT() -inline void linearize(TypeOutValue *output, - const TypeInValue *const *inputs, - size_t dims, - size_t elements) -{ - return device_linearize(output, inputs, dims, elements); -} - -template -PERFFASPECT() -inline int pack(bool cond, - const bool *predicate, - const size_t n, - const TypeValue **sparse, - TypeValue **dense, - int dims) -{ - return device_compact(cond, sparse, dense, predicate, n, dims, 1024); -} - -template -PERFFASPECT() -inline int pack(bool cond, - const bool *predicate, - const size_t n, - TypeValue **sparse, - TypeValue **dense, - int *sparse_indices, - int dims) -{ - return device_compact( - cond, sparse, dense, sparse_indices, n, dims, 1024, predicate); -} - -template -PERFFASPECT() -inline int unpack(bool cond, - const bool *predicate, - const size_t n, - TypeValue **sparse, - TypeValue **dense, - int dims) -{ - return device_compact(cond, - const_cast(sparse), - dense, - predicate, - n, - dims, - 1024, - true); -} - -template -PERFFASPECT() -inline int unpack(bool cond, - const size_t n, - TypeValue **sparse, - TypeValue **dense, - int *sparse_indices, - int dims) -{ - return device_compact( - cond, sparse, dense, sparse_indices, n, dims, 1024, NULL, true); -} - -} // namespace Device -} // namespace ams - -#else - -namespace ams -{ - - -PERFFASPECT() -inline void DtoDMemcpy(void *dest, void *src, size_t nBytes) -{ - FATAL(Device, "DtoD Memcpy Not Enabled"); -} - 
-PERFFASPECT() -inline void HtoHMemcpy(void *dest, void *src, size_t nBytes) -{ - std::memcpy(dest, src, nBytes); -} - -PERFFASPECT() -inline void HtoDMemcpy(void *dest, void *src, size_t nBytes) -{ - FATAL(Device, "HtoD Memcpy Not Enabled"); -} - -PERFFASPECT() -inline void DtoHMemcpy(void *dest, void *src, size_t nBytes) -{ - FATAL(Device, "DtoH Memcpy Not Enabled"); -} - - -inline void *DeviceAllocate(size_t nBytes) -{ - FATAL(Device, "DtoH Memcpy Not Enabled"); -} - - -PERFFASPECT() -inline void DeviceFree(void *ptr) { FATAL(Device, "DtoH Memcpy Not Enabled"); } - -PERFFASPECT() -inline void *DevicePinnedAlloc(size_t nBytes) -{ - FATAL(Device, "Pinned Alloc Not Enabled"); -} - -PERFFASPECT() -inline void DeviceFreePinned(void *ptr) -{ - FATAL(Device, "Pinned Free Pinned Not Enabled"); -} - -inline void device_random_uq(int seed, - bool *uq_flags, - int ndata, - double acceptable_error) -{ - FATAL(Device, "Called Device Runtime UQ without enabling Device compilation"); -} - - -inline void deviceCheckErrors(const char *file, int line) { return; } - -namespace Device -{ -PERFFASPECT() -inline void computePredicate(float *data, - bool *predicate, - size_t nData, - const size_t kneigh, - float threshold) -{ - FATAL(Device, "Called device code when CUDA disabled"); - return; -} - - -template -PERFFASPECT() -inline void linearize(TypeOutValue *output, - const TypeInValue *const *inputs, - size_t dims, - size_t elements) -{ - FATAL(Device, "Called device code when CUDA disabled"); - return; -} - -template -PERFFASPECT() -inline int pack(bool cond, - const bool *predicate, - const size_t n, - const TypeValue **sparse, - TypeValue **dense, - int dims) -{ - FATAL(Device, "Called device code when CUDA disabled"); - return UNDEFINED_FUNC; -} - -template -PERFFASPECT() -inline int pack(bool cond, - const bool *predicate, - const size_t n, - TypeValue **sparse, - TypeValue **dense, - int *sparse_indices, - int dims) -{ - FATAL(Device, "Called device code when CUDA disabled"); 
- return UNDEFINED_FUNC; -} - -template -PERFFASPECT() -inline int unpack(bool cond, - const bool *predicate, - const size_t n, - TypeValue **sparse, - TypeValue **dense, - int dims) -{ - FATAL(Device, "Called device code when CUDA disabled"); - return UNDEFINED_FUNC; -} - -template -PERFFASPECT() -inline int unpack(bool cond, - const size_t n, - TypeValue **sparse, - TypeValue **dense, - int *sparse_indices, - int dims) -{ - FATAL(Device, "Called device code when CUDA disabled"); - return UNDEFINED_FUNC; -} - -} // namespace Device } // namespace ams #endif - - #endif diff --git a/src/AMSlib/wf/hdf5db.cpp b/src/AMSlib/wf/hdf5db.cpp index f59e78b7..b7b5fccf 100644 --- a/src/AMSlib/wf/hdf5db.cpp +++ b/src/AMSlib/wf/hdf5db.cpp @@ -5,17 +5,96 @@ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception */ +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "ArrayRef.hpp" #include "wf/basedb.hpp" + using namespace ams::db; +using namespace ams; + +static std::string SmallVectorToString(ams::MutableArrayRef shape) +{ + std::ostringstream oss; + oss << "["; + for (size_t i = 0; i < shape.size(); ++i) { + oss << shape[i]; + if (i < shape.size() - 1) { + oss << ", "; + } + } + oss << "]"; + return oss.str(); +} + +static std::string tensorSizeToString(const at::IntArrayRef shape) +{ + std::ostringstream oss; + oss << "["; + for (size_t i = 0; i < shape.size(); ++i) { + oss << shape[i]; + if (i < shape.size() - 1) { + oss << ", "; + } + } + oss << "]"; + return oss.str(); +} + +// Helper function to convert torch::Dtype to a string +static std::string dtypeToString(torch::Dtype dtype) +{ + static const std::unordered_map dtypeMap = { + {torch::kFloat32, "float32"}, + {torch::kFloat, "float32"}, // Alias for float32 + {torch::kFloat64, "float64"}, + {torch::kDouble, "float64"}, // Alias for float64 + {torch::kInt32, "int32"}, + {torch::kInt64, "int64"}, + {torch::kBool, "bool"}, + {torch::kUInt8, "uint8"}, + {torch::kInt8, "int8"}, + 
{torch::kHalf, "float16"}, + {torch::kBFloat16, "bfloat16"}}; + return dtypeMap.count(dtype) ? dtypeMap.at(dtype) : "unknown dtype"; +} +// Helper function to convert torch::Dtype to a string +static hid_t torchDTypeToHDF5Type(torch::Dtype dtype) +{ + static const std::unordered_map dtypeMap = { + {torch::kFloat32, H5T_NATIVE_FLOAT}, + {torch::kFloat, H5T_NATIVE_FLOAT}, // Alias for float32 + {torch::kFloat64, H5T_NATIVE_DOUBLE}, + {torch::kDouble, H5T_NATIVE_DOUBLE}, // Alias for float64 + {torch::kInt32, H5T_NATIVE_INT}, + {torch::kInt64, H5T_NATIVE_LONG}, + {torch::kBool, H5T_NO_CLASS}, + {torch::kUInt8, H5T_NO_CLASS}, + {torch::kInt8, H5T_NO_CLASS}, + {torch::kHalf, H5T_NO_CLASS}, + {torch::kBFloat16, H5T_NO_CLASS}}; + return dtypeMap.count(dtype) ? dtypeMap.at(dtype) : H5T_NO_CLASS; +} hid_t hdf5DB::getDataSet(hid_t group, std::string dName, + ams::SmallVector& currentShape, + const at::IntArrayRef Shape, hid_t dataType, const size_t Chunk) { - // Our datasets a.t.m are 1-D vectors - const int nDims = 1; + const int nDims = Shape.size(); + currentShape.resize(nDims); + currentShape.assign(nDims, 0); // We always start from 0 hsize_t dims = 0; hid_t dset = -1; @@ -26,172 +105,178 @@ hid_t hdf5DB::getDataSet(hid_t group, dset = H5Dopen(group, dName.c_str(), H5P_DEFAULT); HDF5_ERROR(dset); // We are assuming symmetrical data sets a.t.m - if (totalElements == 0) { - hid_t dspace = H5Dget_space(dset); - const int ndims = H5Sget_simple_extent_ndims(dspace); - hsize_t dims[ndims]; - H5Sget_simple_extent_dims(dspace, dims, NULL); - totalElements = dims[0]; + hid_t dspace = H5Dget_space(dset); + const int file_ndims = H5Sget_simple_extent_ndims(dspace); + if (file_ndims != nDims) { + throw std::runtime_error( + "File system file with current tensor shape to not match"); } + hsize_t dims[nDims]; + H5Sget_simple_extent_dims(dspace, dims, NULL); + currentShape[0] = dims[0]; return dset; - } else { - // We will extend the data-set size, so we use unlimited option - hsize_t 
maxDims = H5S_UNLIMITED; - hid_t fileSpace = H5Screate_simple(nDims, &dims, &maxDims); - HDF5_ERROR(fileSpace); - - hid_t pList = H5Pcreate(H5P_DATASET_CREATE); - HDF5_ERROR(pList); - - herr_t ec = H5Pset_layout(pList, H5D_CHUNKED); - HDF5_ERROR(ec); - - // cDims impacts performance considerably. - // TODO: Align this with the caching mechanism for this option to work - // out. - hsize_t cDims = Chunk; - H5Pset_chunk(pList, nDims, &cDims); - dset = H5Dcreate(group, - dName.c_str(), - dataType, - fileSpace, - H5P_DEFAULT, - pList, - H5P_DEFAULT); - HDF5_ERROR(dset); - H5Sclose(fileSpace); - H5Pclose(pList); } + + // We will extend the data-set size, so we use unlimited option + hsize_t max_dims[Shape.size()]; + hsize_t initial_shape[Shape.size()]; + for (int i = 0; i < Shape.size(); i++) { + max_dims[i] = Shape[i]; + initial_shape[i] = 0; + } + max_dims[0] = H5S_UNLIMITED; + hid_t fileSpace = H5Screate_simple(nDims, initial_shape, max_dims); + HDF5_ERROR(fileSpace); + + hid_t pList = H5Pcreate(H5P_DATASET_CREATE); + HDF5_ERROR(pList); + + herr_t ec = H5Pset_layout(pList, H5D_CHUNKED); + HDF5_ERROR(ec); + + // cDims impacts performance considerably. + // TODO: Align this with the caching mechanism for this option to work + // out. 
+ max_dims[0] = Chunk; + H5Pset_chunk(pList, nDims, max_dims); + dset = H5Dcreate(group, + dName.c_str(), + dataType, + fileSpace, + H5P_DEFAULT, + pList, + H5P_DEFAULT); + HDF5_ERROR(dset); + H5Sclose(fileSpace); + H5Pclose(pList); return dset; } -void hdf5DB::createDataSets(size_t numElements, - const size_t numIn, - const size_t numOut) +void hdf5DB::createDataSets(at::IntArrayRef InShapes, at::IntArrayRef OutShapes) { - for (int i = 0; i < numIn; i++) { - hid_t dSet = - getDataSet(HFile, std::string("input_") + std::to_string(i), HDType); - HDIsets.push_back(dSet); - } + HDIset = getDataSet(HFile, "input_data", currentInputShape, InShapes, HDType); + + HDOset = + getDataSet(HFile, "output_data", currentOutputShape, OutShapes, HDType); +} + +void hdf5DB::writeDataToDataset(ams::MutableArrayRef currentShape, + hid_t& dset, + const at::Tensor& tensor_data) +{ + herr_t status; + + // Ensure tensor is contiguous + torch::Tensor tensor_contiguous = tensor_data.contiguous(); - for (int i = 0; i < numOut; i++) { - hid_t dSet = - getDataSet(HFile, std::string("output_") + std::to_string(i), HDType); - HDOsets.push_back(dSet); + // Get tensor dimensions + std::vector tensor_dims(tensor_contiguous.sizes().begin(), + tensor_contiguous.sizes().end()); + int rank = tensor_dims.size(); + + // Initialize currentShape if it's empty (e.g., first write or reopening an existing file) + if (currentShape.empty()) { + hid_t fileSpace = H5Dget_space(dset); + if (fileSpace < 0) { + throw std::runtime_error("Failed to get dataspace from dataset."); + } + if (H5Sget_simple_extent_dims(fileSpace, currentShape.data(), NULL) < 0) { + H5Sclose(fileSpace); + throw std::runtime_error("Failed to retrieve dataset dimensions."); + } + H5Sclose(fileSpace); } - if (storePredicate()) { - pSet = getDataSet(HFile, "predicate", H5T_NATIVE_HBOOL); + // Create a memory representation for the data to be stored + hid_t memSpace = H5Screate_simple(rank, tensor_dims.data(), NULL); + if (memSpace < 0) { + 
throw std::runtime_error("Failed to create memory dataspace."); } -} -template -void hdf5DB::writeDataToDataset(std::vector& dsets, - std::vector& data, - size_t numElements) -{ - int index = 0; - for (auto* I : data) { - writeVecToDataset(dsets[index++], - static_cast(I), - numElements, - HDType); + // Prepare the dataset for new data + ams::SmallVector newShape(tensor_dims.begin(), tensor_dims.end()); + + newShape[0] += currentShape[0]; // Update the first dimension + + status = H5Dset_extent(dset, newShape.data()); + if (status < 0) { + throw std::runtime_error("Failed to extend dataset's dimensions."); } -} -void hdf5DB::writeVecToDataset(hid_t dSet, - void* data, - size_t elements, - hid_t DType) -{ - const int nDims = 1; - hsize_t dims = elements; - hsize_t start; - hsize_t count; - hid_t memSpace = H5Screate_simple(nDims, &dims, NULL); - HDF5_ERROR(memSpace); - dims = totalElements + elements; - H5Dset_extent(dSet, &dims); + // Refresh fileSpace after extending + hid_t fileSpace = H5Dget_space(dset); + if (fileSpace < 0) { + throw std::runtime_error( + "Failed to get refreshed dataspace after extending dataset."); + } - hid_t fileSpace = H5Dget_space(dSet); - HDF5_ERROR(fileSpace); + // Debugging: Check dimensions of fileSpace + std::vector file_dims(rank); + H5Sget_simple_extent_dims(fileSpace, file_dims.data(), NULL); - // Data set starts at offset totalElements - start = totalElements; - // And we append additional elements - count = elements; // Select hyperslab - herr_t err = H5Sselect_hyperslab( - fileSpace, H5S_SELECT_SET, &start, NULL, &count, NULL); - HDF5_ERROR(err); + herr_t err = H5Sselect_hyperslab(fileSpace, + H5S_SELECT_SET, + currentShape.data(), + NULL, + tensor_dims.data(), + NULL); + if (err < 0) { + H5Sclose(fileSpace); + H5Sclose(memSpace); + throw std::runtime_error("Failed to select hyperslab."); + } - H5Dwrite(dSet, DType, memSpace, fileSpace, H5P_DEFAULT, data); + // Write the tensor data to the dataset + status = H5Dwrite(dset, + 
HDType, + memSpace, + fileSpace, + H5P_DEFAULT, + tensor_contiguous.data_ptr()); + if (status < 0) { + throw std::runtime_error("Failed to write data to dataset."); + } + + // Update currentShape + currentShape[0] = newShape[0]; + + // Close HDF5 objects + H5Sclose(memSpace); H5Sclose(fileSpace); } -template -void hdf5DB::_store(size_t num_elements, - std::vector& inputs, - std::vector& outputs, - bool* predicate) +void hdf5DB::_store(const at::Tensor& inputs, const at::Tensor& outputs) { - CALIPER(CALI_MARK_BEGIN("STORE_HDF5");) - if (isDouble::default_value()) - HDType = H5T_NATIVE_DOUBLE; - else - HDType = H5T_NATIVE_FLOAT; - - - CFATAL(HDF5DB, - storePredicate() && predicate == nullptr, - "DB Configured to store predicates, predicate is not provided") - - DBG(DB, - "DB of type %s stores %ld elements of input/output dimensions (%lu, " - "%lu)", + "DB of type %s stores input/output tensors of shapes %s, " + "%s", type().c_str(), - num_elements, - inputs.size(), - outputs.size()) - const size_t num_in = inputs.size(); - const size_t num_out = outputs.size(); - - if (HDIsets.empty()) { - createDataSets(num_elements, num_in, num_out); - } + tensorSizeToString(inputs.sizes()).c_str(), + tensorSizeToString(outputs.sizes()).c_str()); - CFATAL(HDF5DB, - (HDIsets.size() != num_in || HDOsets.size() != num_out), - "The data dimensionality is different than the one in the " - "DB") - - writeDataToDataset(HDIsets, inputs, num_elements); - writeDataToDataset(HDOsets, outputs, num_elements); - - if (storePredicate() && predicate != nullptr) { - writeVecToDataset(pSet, - static_cast(predicate), - num_elements, - H5T_NATIVE_HBOOL); + if (HDIset == -1 || HDOset == -1) { + createDataSets(inputs.sizes(), outputs.sizes()); } - totalElements += num_elements; - CALIPER(CALI_MARK_END("STORE_HDF5");) + writeDataToDataset(currentInputShape, HDIset, inputs); + writeDataToDataset(currentOutputShape, HDOset, outputs); + DBG(DB, + "DB (file:%s) next elements to be stored at Input:%s Output: 
%s", + fn.c_str(), + SmallVectorToString(currentOutputShape).c_str(), + SmallVectorToString(currentInputShape).c_str()); } hdf5DB::hdf5DB(std::string path, std::string domain_name, std::string fn, - uint64_t rId, - bool predicate) - : FileDB(path, fn, predicate ? ".debug.h5" : ".h5", rId), - predicateStore(predicate) + uint64_t rId) + : FileDB(path, fn, ".h5", rId), HDOset(-1), HDIset(-1) { std::error_code ec; bool exists = fs::exists(this->fn); @@ -220,7 +305,6 @@ hdf5DB::hdf5DB(std::string path, H5Sclose(dataspace_id); } HDF5_ERROR(HFile); - totalElements = 0; HDType = -1; } @@ -233,32 +317,41 @@ hdf5DB::~hdf5DB() // HDF5_ERROR(err); } -void hdf5DB::store(size_t num_elements, - std::vector& inputs, - std::vector& outputs, - bool* predicate) +void hdf5DB::store(ArrayRef Inputs, + ArrayRef Outputs) { - if (HDType == -1) { - HDType = H5T_NATIVE_FLOAT; - } - CFATAL(HDF5DB, - HDType != H5T_NATIVE_FLOAT, - "Database %s initialized to work on 'float' received different " - "datatypes", - fn.c_str()); + auto tOptions = torch::TensorOptions() + .dtype(torch::kFloat32) + .device(c10::DeviceType::CPU); - _store(num_elements, inputs, outputs, predicate); -} + c10::SmallVector ConvertedInputs(Inputs.begin(), Inputs.end()); + c10::SmallVector ConvertedOutputs(Outputs.begin(), + Outputs.end()); + auto inputs = + torch::cat(ConvertedInputs, Inputs[0].sizes().size() - 1).to(tOptions); + auto outputs = + torch::cat(ConvertedOutputs, Outputs[0].sizes().size() - 1).to(tOptions); + + + if (inputs.dtype() != outputs.dtype()) { + throw std::invalid_argument( + "Storing into HDF5 database requires all tensors to have the same " + "datatype. 
Now they have:" + + dtypeToString(torch::typeMetaToScalarType(inputs.dtype())) + " and " + + dtypeToString(torch::typeMetaToScalarType(outputs.dtype()))); + } -void hdf5DB::store(size_t num_elements, - std::vector& inputs, - std::vector& outputs, - bool* predicate) -{ if (HDType == -1) { - HDType = H5T_NATIVE_DOUBLE; + HDType = torchDTypeToHDF5Type(torch::typeMetaToScalarType(inputs.dtype())); } - _store(num_elements, inputs, outputs, predicate); + + if (HDType == -1 || HDType == H5T_NO_CLASS) + throw std::invalid_argument( + "Data base can not deduce the data type of the tensors" + + dtypeToString(torch::typeMetaToScalarType(inputs.dtype())) + " and " + + dtypeToString(torch::typeMetaToScalarType(outputs.dtype()))); + + _store(inputs, outputs); } diff --git a/src/AMSlib/wf/interface.cpp b/src/AMSlib/wf/interface.cpp new file mode 100644 index 00000000..3bdba47a --- /dev/null +++ b/src/AMSlib/wf/interface.cpp @@ -0,0 +1,140 @@ +#include +#include +#include +#include + +#include + +#include "AMS.h" +#include "AMSTensor.hpp" +#include "wf/workflow.hpp" + +using namespace ams; + +static AMSResourceType torchDeviceToAMSDevice(c10::DeviceType dType) +{ + switch (dType) { + case c10::DeviceType::CUDA: + case c10::DeviceType::HIP: + return AMSResourceType::AMS_DEVICE; + case c10::DeviceType::CPU: + return AMSResourceType::AMS_HOST; + default: + return AMSResourceType::AMS_UNKNOWN; + } + return AMSResourceType::AMS_UNKNOWN; +} + +static AMSDType torchDTypeToAMSType(torch::Dtype dtype) +{ + static const std::unordered_map dtypeMap = { + {torch::kFloat32, AMSDType::AMS_SINGLE}, + {torch::kFloat, AMSDType::AMS_SINGLE}, // Alias for float32 + {torch::kFloat64, AMSDType::AMS_DOUBLE}, + {torch::kDouble, AMSDType::AMS_DOUBLE}, // Alias for float64 + {torch::kInt32, AMSDType::AMS_UNKNOWN_TYPE}, + {torch::kInt64, AMSDType::AMS_UNKNOWN_TYPE}, + {torch::kBool, AMSDType::AMS_UNKNOWN_TYPE}, + {torch::kUInt8, AMSDType::AMS_UNKNOWN_TYPE}, + {torch::kInt8, AMSDType::AMS_UNKNOWN_TYPE}, + 
{torch::kHalf, AMSDType::AMS_UNKNOWN_TYPE}, + {torch::kBFloat16, AMSDType::AMS_UNKNOWN_TYPE}}; + + return dtypeMap.count(dtype) ? dtypeMap.at(dtype) + : AMSDType::AMS_UNKNOWN_TYPE; +} + +static c10::DeviceType amsToTorchDevice(const ams::AMSResourceType resource) +{ + if (resource == ams::AMSResourceType::AMS_HOST) + return c10::DeviceType::CPU; + else if (resource == ams::AMSResourceType::AMS_DEVICE) + return c10::DeviceType::CUDA; + + throw std::runtime_error("Unknown ams resource type"); + return c10::DeviceType::COMPILE_TIME_MAX_DEVICE_TYPES; +} + +static c10::ScalarType amsToTorchDType(const ams::AMSDType dType) +{ + if (dType == ams::AMSDType::AMS_SINGLE) + return torch::kFloat32; + else if (dType == ams::AMSDType::AMS_DOUBLE) + return torch::kFloat64; + + throw std::runtime_error("Unknown ams data type"); + return torch::kHalf; +} + + +static ams::SmallVector torchToAMSTensors( + ams::MutableArrayRef tensorVector) +{ + ams::SmallVector ams_tensors; + for (auto tensor : tensorVector) { + // We should be able to completely remove these conversion by using some template "magic." + // I will leave these for later though + auto dType = torchDTypeToAMSType(tensor.scalar_type()); + auto rType = torchDeviceToAMSDevice(tensor.device().type()); + // In both cases, I am effectively only forwarding the pointer of begin/end to ams. + // this is a cheap operating. 
It should boil down to: shapes.start = tensor.sizes.start, shapes.end = tensor.sizes.end; + auto shapes = ArrayRef(tensor.sizes().begin(), tensor.strides().size()); + auto strides = ArrayRef(tensor.strides().begin(), tensor.strides().size()); + if (dType == AMSDType::AMS_SINGLE) { + ams_tensors.push_back( + AMSTensor::view(tensor.data_ptr(), shapes, strides, rType)); + } else if (dType == AMSDType::AMS_DOUBLE) { + ams_tensors.push_back( + AMSTensor::view(tensor.data_ptr(), shapes, strides, rType)); + } + } + return ams_tensors; +} + +static ams::SmallVector amsToTorchTensors( + const ams::SmallVector &amsTensorVector) +{ + ams::SmallVector ams_tensors; + for (auto &tensor : amsTensorVector) { + // We should be able to completely remove these conversion by using some template "magic." + // I will leave these for later though + auto dType = amsToTorchDType(tensor.dType()); + auto deviceType = amsToTorchDevice(tensor.location()); + // In both cases, I am effectively only forwarding the pointer of begin/end to ams. + // this is a cheap operating. 
It should boil down to: shapes.start = tensor.sizes.start, shapes.end = tensor.sizes.end; + c10::SmallVector shapes(tensor.shape().begin(), tensor.shape().end()); + c10::SmallVector strides(tensor.strides().begin(), + tensor.strides().end()); + ams_tensors.push_back(torch::from_blob( + tensor.raw_data(), + shapes, + strides, + torch::TensorOptions().dtype(dType).device(deviceType))); + } + return std::move(ams_tensors); +} + +void callApplication(ams::EOSLambda CallBack, + ams::MutableArrayRef Ins, + ams::MutableArrayRef InOuts, + ams::MutableArrayRef Outs) +{ + auto AMSIns = torchToAMSTensors(Ins); + auto AMSInOuts = torchToAMSTensors(InOuts); + auto AMSOuts = torchToAMSTensors(Outs); + CallBack(AMSIns, AMSInOuts, AMSOuts); + return; +} + +void callAMS(ams::AMSWorkflow *executor, + EOSLambda Physics, + const ams::SmallVector &ins, + ams::SmallVector &inouts, + ams::SmallVector &outs) +{ + ams::SmallVector tins = amsToTorchTensors(ins); + ams::SmallVector tinouts = amsToTorchTensors(inouts); + ams::SmallVector touts = amsToTorchTensors(outs); + + executor->evaluate(Physics, tins, tinouts, touts); +} diff --git a/src/AMSlib/wf/interface.hpp b/src/AMSlib/wf/interface.hpp new file mode 100644 index 00000000..c1e3ec7b --- /dev/null +++ b/src/AMSlib/wf/interface.hpp @@ -0,0 +1,22 @@ +#pragma once +#include +#include + +#include "AMS.h" + +namespace ams +{ +class AMSWorkflow; +} + +void callApplication(ams::EOSLambda CallBack, + ams::MutableArrayRef Ins, + ams::MutableArrayRef InOuts, + ams::MutableArrayRef Outs); + + +void callAMS(ams::AMSWorkflow *executor, + ams::EOSLambda Physics, + const ams::SmallVector &ins, + ams::SmallVector &inouts, + ams::SmallVector &outs); diff --git a/src/AMSlib/wf/logger.cpp b/src/AMSlib/wf/logger.cpp index 478326e0..f01c4627 100644 --- a/src/AMSlib/wf/logger.cpp +++ b/src/AMSlib/wf/logger.cpp @@ -6,13 +6,18 @@ */ +#include +#include + #include // for std::equal #include // for std::toupper #include // for getenv() #include #include 
#include +#include #include +#include #include #include "debug.h" @@ -20,6 +25,20 @@ #include "wf/debug.h" #include "wf/logger.hpp" +static int get_rank_id() +{ + if (const char* flux_id = std::getenv("FLUX_TASK_RANK")) { + return std::stoi(flux_id); + } else if (const char* rid = std::getenv("SLURM_PROCID")) { + return std::stoi(rid); + } else if (const char* jsm = std::getenv("JSM_NAMESPACE_RANK")) { + return std::stoi(jsm); + } else if (const char* pmi = std::getenv("PMIX_RANK")) { + return std::stoi(pmi); + } + return 0; +} + namespace ams { @@ -87,6 +106,8 @@ void Logger::setLoggingMsgLevel(LogVerbosityLevel level) Logger* Logger::getActiveLogger() { static Logger logger; + static std::once_flag _amsLogger; + std::call_once(_amsLogger, [&]() { logger.setup_loggers(); }); return &logger; } @@ -119,6 +140,73 @@ void Logger::initialize_std_io_err(const bool enable_log, } } + +void Logger::setup_loggers() +{ + namespace fs = std::experimental::filesystem; + const char* ams_logger_level = std::getenv("AMS_LOG_LEVEL"); + const char* ams_logger_dir = std::getenv("AMS_LOG_DIR"); + const char* ams_logger_prefix = std::getenv("AMS_LOG_PREFIX"); + std::string log_fn(""); + std::string log_path("./"); + + bool enable_log = false; + + if (ams_logger_level) { + auto log_lvl = ams::util::getVerbosityLevel(ams_logger_level); + setLoggingMsgLevel(log_lvl); + enable_log = true; + } + + // In the case we specify a directory and we do not specify a file + // by default we write to a file. 
+ if (ams_logger_dir && !ams_logger_prefix) { + ams_logger_prefix = "ams"; + } + + if (ams_logger_prefix) { + // We are going to redirect stdout to some file + // By default we store to the current directory + std::string pattern(""); + std::string log_prefix(ams_logger_prefix); + + if (ams_logger_dir) { + log_path = std::string(ams_logger_dir); + } + + char hostname[HOST_NAME_MAX]; + if (gethostname(hostname, HOST_NAME_MAX) != 0) { + FATAL(AMS, "Get hostname returns error"); + } + + int id = 0; + if (log_prefix.find("") != std::string::npos) { + pattern = std::string(""); + id = get_rank_id(); + } else if (log_prefix.find("") != std::string::npos) { + pattern = std::string(""); + id = getpid(); + } + + // Combine hostname and pid + std::ostringstream combined; + combined << "." << hostname << "." << id; + + if (!pattern.empty()) { + log_path = fs::absolute(log_path).string(); + log_fn = + std::regex_replace(log_prefix, std::regex(pattern), combined.str()); + } else { + log_path = fs::absolute(log_path).string(); + log_fn = log_prefix + combined.str(); + } + } + initialize_std_io_err(enable_log, log_path, log_fn); + + return; +} + + void Logger::flush() { if (ams_out != nullptr && ams_out != stdout) fflush(ams_out); diff --git a/src/AMSlib/wf/logger.hpp b/src/AMSlib/wf/logger.hpp index 5f1be46a..80715ce2 100644 --- a/src/AMSlib/wf/logger.hpp +++ b/src/AMSlib/wf/logger.hpp @@ -62,6 +62,7 @@ class Logger private: Logger() noexcept; + void setup_loggers(); bool m_is_enabled[LogVerbosityLevel::Num_Levels]; FILE *ams_out, *ams_err; diff --git a/src/AMSlib/wf/resource_manager.cpp b/src/AMSlib/wf/resource_manager.cpp index 1deeaf0a..a23a9384 100644 --- a/src/AMSlib/wf/resource_manager.cpp +++ b/src/AMSlib/wf/resource_manager.cpp @@ -8,8 +8,11 @@ #include #include +#ifdef __AMS_ENABLE_CUDA__ +#include +#endif + #include "debug.h" -#include "device.hpp" #include "resource_manager.hpp" namespace ams @@ -25,15 +28,29 @@ const std::string AMSAllocator::getName() const { return 
name; } struct AMSDefaultDeviceAllocator final : AMSAllocator { - AMSDefaultDeviceAllocator(std::string name) : AMSAllocator(name){}; + AMSDefaultDeviceAllocator(std::string name) : AMSAllocator(name) {}; ~AMSDefaultDeviceAllocator() { DBG(AMSDefaultDeviceAllocator, "Destroying default device allocator"); }; - void *allocate(size_t num_bytes) { return DeviceAllocate(num_bytes); } + void *allocate(size_t num_bytes, size_t alignment) + { +#ifdef __AMS_ENABLE_CUDA__ + void *devPtr; + cudaMalloc(&devPtr, num_bytes); + return devPtr; +#else + return nullptr; +#endif + } - void deallocate(void *ptr) { return DeviceFree(ptr); } + void deallocate(void *ptr) + { +#ifdef __AMS_ENABLE_CUDA__ + cudaFree(ptr); +#endif + } }; struct AMSDefaultHostAllocator final : AMSAllocator { @@ -43,9 +60,9 @@ struct AMSDefaultHostAllocator final : AMSAllocator { DBG(AMSDefaultDeviceAllocator, "Destroying default host allocator"); } - void *allocate(size_t num_bytes) + void *allocate(size_t num_bytes, size_t alignment) { - return aligned_alloc(8, roundUp(num_bytes, 8)); + return aligned_alloc(alignment, roundUp(num_bytes, alignment)); } void deallocate(void *ptr) { free(ptr); } @@ -55,9 +72,23 @@ struct AMSDefaultPinnedAllocator final : AMSAllocator { AMSDefaultPinnedAllocator(std::string name) : AMSAllocator(name) {} ~AMSDefaultPinnedAllocator() = default; - void *allocate(size_t num_bytes) { return DevicePinnedAlloc(num_bytes); } + void *allocate(size_t num_bytes, size_t alignment) + { +#ifdef __AMS_ENABLE_CUDA__ + void *ptr; + cudaHostAlloc(&ptr, num_bytes, cudaHostAllocPortable); + return ptr; +#else + return nullptr; +#endif + } - void deallocate(void *ptr) { DeviceFreePinned(ptr); } + void deallocate(void *ptr) + { +#ifdef __AMS_ENABLE_CUDA__ + cudaFreeHost(ptr); +#endif + } }; @@ -78,26 +109,30 @@ void _raw_copy(void *src, std::memcpy(dest, src, num_bytes); break; case AMSResourceType::AMS_DEVICE: - HtoDMemcpy(dest, src, num_bytes); +#ifdef __AMS_ENABLE_CUDA__ + cudaMemcpy(dest, src, 
num_bytes, cudaMemcpyHostToDevice); +#endif break; default: FATAL(ResourceManager, "Unknown device type to copy to from HOST"); break; } break; +#ifdef __AMS_ENABLE_CUDA__ case AMSResourceType::AMS_DEVICE: switch (dest_dev) { case AMSResourceType::AMS_DEVICE: - DtoDMemcpy(dest, src, num_bytes); + cudaMemcpy(dest, src, num_bytes, cudaMemcpyDeviceToDevice); break; case AMSResourceType::AMS_HOST: case AMSResourceType::AMS_PINNED: - DtoHMemcpy(dest, src, num_bytes); + cudaMemcpy(dest, src, num_bytes, cudaMemcpyDeviceToHost); break; default: FATAL(ResourceManager, "Unknown device type to copy to from DEVICE"); break; } +#endif break; default: FATAL(ResourceManager, "Unknown device type to copy from"); diff --git a/src/AMSlib/wf/resource_manager.hpp b/src/AMSlib/wf/resource_manager.hpp index 813c8d56..20eb64f4 100644 --- a/src/AMSlib/wf/resource_manager.hpp +++ b/src/AMSlib/wf/resource_manager.hpp @@ -13,6 +13,7 @@ #include #include "AMS.h" +#include "macro.h" #include "wf/debug.h" @@ -44,7 +45,7 @@ struct AMSAllocator { AMSAllocator(std::string& alloc_name) : name(alloc_name) {} virtual ~AMSAllocator() = default; - virtual void* allocate(size_t num_bytes) = 0; + virtual void* allocate(size_t num_bytes, size_t alignment) = 0; virtual void deallocate(void* ptr) = 0; const std::string getName() const; @@ -91,10 +92,12 @@ class ResourceManager */ template PERFFASPECT() - TypeInValue* allocate(size_t nvalues, AMSResourceType dev) + TypeInValue* allocate(size_t nvalues, + AMSResourceType dev, + size_t alignment = sizeof(TypeInValue)) { return static_cast( - RMAllocators[dev]->allocate(nvalues * sizeof(TypeInValue))); + RMAllocators[dev]->allocate(nvalues * sizeof(TypeInValue), alignment)); } /** @brief deallocates pointer from the specified device. 
@@ -152,7 +155,7 @@ class ResourceManager std::string pinned_alloc("PINNED"); if (!RMAllocators[AMSResourceType::AMS_HOST]) setAllocator(host_alloc, AMSResourceType::AMS_HOST); -#ifdef __ENABLE_CUDA__ +#ifdef __AMS_ENABLE_CUDA__ if (!RMAllocators[AMSResourceType::AMS_DEVICE]) setAllocator(host_alloc, AMSResourceType::AMS_DEVICE); diff --git a/src/AMSlib/wf/rmqdb.cpp b/src/AMSlib/wf/rmqdb.cpp index 4f3cdf62..b398f99d 100644 --- a/src/AMSlib/wf/rmqdb.cpp +++ b/src/AMSlib/wf/rmqdb.cpp @@ -5,6 +5,8 @@ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception */ +#include + #include "wf/basedb.hpp" using namespace ams::db; @@ -15,15 +17,11 @@ using namespace ams::db; AMSMsgHeader::AMSMsgHeader(size_t mpi_rank, size_t domain_size, - size_t num_elem, size_t in_dim, - size_t out_dim, - size_t type_size) + size_t out_dim) : hsize(static_cast(AMSMsgHeader::size())), - dtype(static_cast(type_size)), mpi_rank(static_cast(mpi_rank)), domain_size(static_cast(domain_size)), - num_elem(static_cast(num_elem)), in_dim(static_cast(in_dim)), out_dim(static_cast(out_dim)) { @@ -31,15 +29,11 @@ AMSMsgHeader::AMSMsgHeader(size_t mpi_rank, AMSMsgHeader::AMSMsgHeader(uint16_t mpi_rank, uint16_t domain_size, - uint32_t num_elem, uint16_t in_dim, - uint16_t out_dim, - uint8_t type_size) + uint16_t out_dim) : hsize(static_cast(AMSMsgHeader::size())), - dtype(type_size), mpi_rank(mpi_rank), domain_size(domain_size), - num_elem(num_elem), in_dim(in_dim), out_dim(out_dim) { @@ -50,33 +44,20 @@ size_t AMSMsgHeader::encode(uint8_t* data_blob) if (!data_blob) return 0; size_t current_offset = 0; - // Header size (should be 1 bytes) - data_blob[current_offset] = hsize; - current_offset += sizeof(hsize); - // Data type (should be 1 bytes) - data_blob[current_offset] = dtype; - current_offset += sizeof(dtype); // MPI rank (should be 2 bytes) - std::memcpy(data_blob + current_offset, &(mpi_rank), sizeof(mpi_rank)); - current_offset += sizeof(mpi_rank); + current_offset += 
serialize_data(&data_blob[current_offset], hsize); + current_offset += serialize_data(&data_blob[current_offset], mpi_rank); + current_offset += serialize_data(&data_blob[current_offset], domain_size); + current_offset += + serialize_data(&data_blob[current_offset], static_cast(in_dim)); + current_offset += + serialize_data(&data_blob[current_offset], static_cast(out_dim)); + // Domain Size (should be 2 bytes) DBG(AMSMsgHeader, - "Generating domain name of size %d --- %d offset %d", + "Generating domain name of size %d --- %lu", domain_size, - sizeof(domain_size), - current_offset); - std::memcpy(data_blob + current_offset, &(domain_size), sizeof(domain_size)); - current_offset += sizeof(domain_size); - // Num elem (should be 4 bytes) - std::memcpy(data_blob + current_offset, &(num_elem), sizeof(num_elem)); - current_offset += sizeof(num_elem); - // Input dim (should be 2 bytes) - std::memcpy(data_blob + current_offset, &(in_dim), sizeof(in_dim)); - current_offset += sizeof(in_dim); - // Output dim (should be 2 bytes) - std::memcpy(data_blob + current_offset, &(out_dim), sizeof(out_dim)); - current_offset += sizeof(out_dim); - + sizeof(domain_size)); return AMSMsgHeader::size(); } @@ -117,12 +98,7 @@ AMSMsgHeader AMSMsgHeader::decode(uint8_t* data_blob) uint16_t new_out_dim; std::memcpy(&new_out_dim, data_blob + current_offset, sizeof(uint16_t)); - return AMSMsgHeader(new_mpirank, - new_domain_size, - new_num_elem, - new_in_dim, - new_out_dim, - new_dtype); + return AMSMsgHeader(new_mpirank, new_domain_size, new_in_dim, new_out_dim); } /** @@ -133,40 +109,12 @@ void AMSMessage::swap(const AMSMessage& other) { _id = other._id; _rank = other._rank; - _num_elements = other._num_elements; _input_dim = other._input_dim; _output_dim = other._output_dim; _total_size = other._total_size; _data = other._data; } -AMSMessage::AMSMessage(int id, uint64_t rId, uint8_t* data) - : _id(id), - _num_elements(0), - _input_dim(0), - _output_dim(0), - _data(data), - _total_size(0) -{ - 
auto header = AMSMsgHeader::decode(data); - - int current_rank = rId; - _rank = header.mpi_rank; - CWARNING(AMSMessage, - _rank != current_rank, - "MPI rank are not matching (using %d)", - _rank) - - _num_elements = header.num_elem; - _input_dim = header.in_dim; - _output_dim = header.out_dim; - _data = data; - auto type_value = header.dtype; - - _total_size = AMSMsgHeader::size() + getTotalElements() * type_value; - - DBG(AMSMessage, "Allocated message %d: %p", _id, _data); -} /** * AMSMessageInbound @@ -183,7 +131,7 @@ AMSMessageInbound::AMSMessageInbound(uint64_t id, body(std::move(body)), exchange(std::move(exchange)), routing_key(std::move(routing_key)), - redelivered(redelivered) {}; + redelivered(redelivered){}; bool AMSMessageInbound::empty() { return body.empty() || routing_key.empty(); } @@ -340,4 +288,4 @@ void AMQPHandler::onClosed(AMQP::TcpConnection* connection) void AMQPHandler::onReady(AMQP::TcpConnection* connection) { DBG(AMQPHandler, "Connection established and ready") -} \ No newline at end of file +} diff --git a/src/AMSlib/wf/utils.cpp b/src/AMSlib/wf/utils.cpp index 0dc8f8ad..9b2884e2 100644 --- a/src/AMSlib/wf/utils.cpp +++ b/src/AMSlib/wf/utils.cpp @@ -7,7 +7,7 @@ #include "wf/utils.hpp" -void random_uq_host(bool *uq_flags, int ndata, double acceptable_error) +void random_uq_host(bool* uq_flags, int ndata, double acceptable_error) { for (int i = 0; i < ndata; i++) { diff --git a/src/AMSlib/wf/utils.hpp b/src/AMSlib/wf/utils.hpp index 06ea0527..a51e733f 100644 --- a/src/AMSlib/wf/utils.hpp +++ b/src/AMSlib/wf/utils.hpp @@ -8,12 +8,17 @@ #ifndef __AMS_UTILS_HPP__ #define __AMS_UTILS_HPP__ +#include + #include #include #include #include #include +#include "AMS.h" +#include "SmallVector.hpp" + // ----------------------------------------------------------------------------- // ----------------------------------------------------------------------------- @@ -47,13 +52,38 @@ class isDouble // 
----------------------------------------------------------------------------- // ----------------------------------------------------------------------------- -void random_uq_host(bool *uq_flags, int ndata, double acceptable_error); - template inline bool is_real_equal(T l, T r) { return r == std::nextafter(l, r); } -// ----------------------------------------------------------------------------- + +static inline size_t dtype_to_size(ams::AMSDType dType) +{ + switch (dType) { + case ams::AMSDType::AMS_DOUBLE: + return sizeof(double); + case ams::AMSDType::AMS_SINGLE: + return sizeof(float); + default: + throw std::runtime_error("Requesting the size of unknown object"); + } +} + +static inline std::string shapeToString(const at::Tensor& tensor) +{ + std::ostringstream oss; + oss << tensor.sizes(); + return oss.str(); +} + +namespace ams +{ +namespace tensor +{ +SmallVector maskTensor(at::Tensor& Src, at::Tensor& Mask); +} // namespace tensor +} // namespace ams + #endif diff --git a/src/AMSlib/wf/workflow.hpp b/src/AMSlib/wf/workflow.hpp index 543ecb80..bcdbb17d 100644 --- a/src/AMSlib/wf/workflow.hpp +++ b/src/AMSlib/wf/workflow.hpp @@ -8,29 +8,21 @@ #ifndef __AMS_WORKFLOW_HPP__ #define __AMS_WORKFLOW_HPP__ -#include "debug.h" -#ifdef __AMS_ENABLE_CALIPER__ -#include -#endif +#include +#include -#include -#include -#include -#include -#include -#include +#include +#include #include "AMS.h" -#include "ml/uq.hpp" +#include "ArrayRef.hpp" +#include "SmallVector.hpp" +#include "interface.hpp" +#include "macro.h" +#include "ml/surrogate.hpp" #include "resource_manager.hpp" +#include "utils.hpp" #include "wf/basedb.hpp" - -#ifdef __ENABLE_MPI__ -#include - -#include "wf/redist_load.hpp" -#endif - #include "wf/debug.h" //! ---------------------------------------------------------------------------- @@ -41,20 +33,9 @@ //! 
---------------------------------------------------------------------------- namespace ams { -template class AMSWorkflow { - static_assert(std::is_floating_point::value, - "HDCache supports floating-point values (floats, doubles, and " - "long doubles) only!"); - - using data_handler = ams::DataHandler; - - /** @brief The application call back to perform the original SPMD physics - * execution */ - AMSPhysicFn AppCall; - /** @brief A string identifier describing the domain-model being solved. */ std::string domainName; @@ -62,29 +43,30 @@ class AMSWorkflow std::string dbLabel; /** @brief The module that performs uncertainty quantification (UQ) */ - std::unique_ptr> UQModel; + std::shared_ptr MLModel; /** The metric/type of UQ we will use to select between physics and ml computations **/ const AMSUQPolicy uqPolicy = AMSUQPolicy::AMS_UQ_END; /** @brief The database to store data for which we cannot apply the current - * model */ + * model */ std::shared_ptr DB; /** @brief The process id. For MPI runs this is the rank */ const int rId; /** @brief The total number of processes participating in the simulation - * (world_size for MPI) */ + * (world_size for MPI) */ int wSize; - /** @brief Location of the original application data (CPU or GPU) */ - AMSResourceType appDataLoc; - /** @brief execution policy of the distributed system. Load balance or not. */ AMSExecPolicy ePolicy; -#ifdef __ENABLE_MPI__ + + /** @brief The maximum distance of the predicate for a sample prediction to be considered as valid **/ + const float threshold; + +#ifdef __AMS_ENABLE_MPI__ /** @brief MPI Communicator for all ranks that call collectively the evaluate function **/ MPI_Comm comm; #endif @@ -92,91 +74,37 @@ class AMSWorkflow /** @brief Is the evaluate a distributed execution **/ bool isDistributed; - /** \brief Store the data in the database and copies - * data from the GPU to the CPU and then to the database. 
- * To store GPU resident data we use a 1MB of "pinned" - * memory as a buffer - * @param[in] num_elements Number of elements of each 1-D vector - * @param[in] inputs vector to 1-D vectors storing num_elements - * items to be stored in the database - * @param[in] outputs vector to 1-D vectors storing num_elements - * items to be stored in the database - */ - void store(size_t num_elements, - std::vector &inputs, - std::vector &outputs, - bool *predicate = nullptr) - { - // 1 MB of buffer size; - // TODO: Fix magic number - // TODO: This is likely not efficient for RabbitMQ backend at scale - // We could just linearize the whole input+output and do one send (or two) per cycle - static const long bSize = 1 * 1024 * 1024; - const int numIn = inputs.size(); - const int numOut = outputs.size(); - auto &rm = ams::ResourceManager::getInstance(); - - // No database, so just de-allocate and return - if (!DB) return; - - std::vector hInputs, hOutputs; - bool *hPredicate = nullptr; - - if (appDataLoc == AMSResourceType::AMS_HOST) { - return DB->store(num_elements, inputs, outputs, predicate); - } - - for (int i = 0; i < inputs.size(); i++) { - FPTypeValue *pPtr = - rm.allocate(num_elements, AMSResourceType::AMS_HOST); - rm.copy(inputs[i], AMS_DEVICE, pPtr, AMS_HOST, num_elements); - hInputs.push_back(pPtr); - } - - for (int i = 0; i < outputs.size(); i++) { - FPTypeValue *pPtr = - rm.allocate(num_elements, AMSResourceType::AMS_HOST); - rm.copy(outputs[i], AMS_DEVICE, pPtr, AMS_HOST, num_elements); - hOutputs.push_back(pPtr); - } - - if (predicate) { - hPredicate = rm.allocate(num_elements, AMSResourceType::AMS_HOST); - rm.copy(predicate, AMS_DEVICE, hPredicate, AMS_HOST, num_elements); - } - - // Store to database - DB->store(num_elements, hInputs, hOutputs, hPredicate); - rm.deallocate(hInputs, AMSResourceType::AMS_HOST); - rm.deallocate(hOutputs, AMSResourceType::AMS_HOST); - if (predicate) rm.deallocate(hPredicate, AMSResourceType::AMS_HOST); - - return; - } - - void 
store(size_t num_elements, - std::vector &inputs, - std::vector &outputs, - bool *predicate = nullptr) + void storeComputedData(ArrayRef Ins, + ArrayRef InOutsBefore, + ArrayRef Outs, + ArrayRef InOutsAfter) { - std::vector mInputs; - for (auto I : inputs) { - mInputs.push_back(const_cast(I)); + CALIPER(CALI_MARK_BEGIN("DBSTORE");) + SmallVector StoreInputTensors(Ins.begin(), Ins.end()); + SmallVector StoreOutputTensors(Outs.begin(), Outs.end()); + for (auto Tensor : InOutsBefore) + StoreInputTensors.push_back(Tensor); + for (auto Tensor : InOutsAfter) { + StoreOutputTensors.push_back(Tensor); } - store(num_elements, mInputs, outputs, predicate); + DBG(Workflow, + "Storing data (#elements = %ld) to database", + StoreInputTensors[0].sizes()[0]); + DB->store(StoreInputTensors, StoreOutputTensors); + CALIPER(CALI_MARK_END("DBSTORE");) } /** \brief Check if we can perform a surrogate model update. - * AMS can update surrogate model only when all MPI ranks have received - * the latest model from RabbitMQ. - * @return True if surrogate model can be updated - */ + * AMS can update surrogate model only when all MPI ranks have received + * the latest model from RabbitMQ. 
+ * @return True if surrogate model can be updated + */ bool updateModel() { if (!DB || !DB->allowModelUpdate()) return false; bool local = DB->updateModel(); -#ifdef __ENABLE_MPI__ +#ifdef __AMS_ENABLE_MPI__ bool global = false; MPI_Allreduce(&local, &global, 1, MPI_CXX_BOOL, MPI_LAND, comm); return global; @@ -186,52 +114,44 @@ class AMSWorkflow } public: - AMSWorkflow() - : AppCall(nullptr), - DB(nullptr), - appDataLoc(AMSResourceType::AMS_HOST), -#ifdef __ENABLE_MPI__ - comm(MPI_COMM_NULL), -#endif - ePolicy(AMSExecPolicy::AMS_UBALANCED) - { - } - - AMSWorkflow(AMSPhysicFn _AppCall, - std::string &uq_path, - std::string &surrogate_path, + AMSWorkflow(std::string &surrogate_path, std::string &domain_name, std::string &db_label, - bool isDebugDB, - AMSResourceType app_data_loc, - FPTypeValue threshold, + float threshold, const AMSUQPolicy uq_policy, - const int nClusters, int _pId = 0, int _wSize = 1) - : AppCall(_AppCall), - domainName(domain_name), + : domainName(domain_name), dbLabel(db_label), rId(_pId), wSize(_wSize), - appDataLoc(app_data_loc), uqPolicy(uq_policy), -#ifdef __ENABLE_MPI__ +#ifdef __AMS_ENABLE_MPI__ comm(MPI_COMM_NULL), #endif + threshold(threshold), ePolicy(AMSExecPolicy::AMS_UBALANCED) { DB = nullptr; auto &dbm = ams::db::DBManager::getInstance(); - DB = dbm.getDB(domainName, dbLabel, rId, isDebugDB); - UQModel = std::make_unique>( - appDataLoc, uqPolicy, uq_path, nClusters, surrogate_path, threshold); + DB = dbm.getDB(domainName, dbLabel, rId); + MLModel = nullptr; + if (!surrogate_path.empty()) + MLModel = SurrogateModel::getInstance( + surrogate_path, + uqPolicy == AMSUQPolicy::AMS_DELTAUQ_MAX || + uqPolicy == AMSUQPolicy::AMS_DELTAUQ_MEAN); + } + + std::string getDBFilename() const + { + if (!DB) return ""; + return DB->getFilename(); } - void set_physics(AMSPhysicFn _AppCall) { AppCall = _AppCall; } -#ifdef __ENABLE_MPI__ +#ifdef __AMS_ENABLE_MPI__ void set_communicator(MPI_Comm communicator) { comm = communicator; } #endif @@ -239,100 
+159,170 @@ class AMSWorkflow bool should_load_balance() const { -#ifdef __ENABLE_MPI__ +#ifdef __AMS_ENABLE_MPI__ return (comm != MPI_COMM_NULL && ePolicy == AMSExecPolicy::AMS_BALANCED); #else return false; #endif } + + static SmallVector subSelectTensors( + ArrayRef Tensors, + torch::Tensor &Mask) + { + SmallVector NewVector; + for (auto O : Tensors) { + NewVector.push_back(O.index({Mask}).view({-1, O.sizes()[O.dim() - 1]})); + } + return NewVector; + } + + static void ScatterPhysicOutputsToOrigDomain( + ArrayRef computedDomain, + torch::Tensor &Predicate, + MutableArrayRef entireDomain) + { + if (computedDomain.size() != entireDomain.size()) { + throw std::runtime_error( + "Expecting equal sized tensors when composing Original and domain " + "memories\n"); + } + for (int i = 0; i < computedDomain.size(); i++) { + auto indexed_shape = computedDomain[i].sizes(); + entireDomain[i].index_put_({Predicate}, + computedDomain[i].view(indexed_shape)); + } + } + + + static int MLDomainToApplication(torch::Tensor Src, + MutableArrayRef Dest, + torch::Tensor Predicate, + int offset) + { + int outerDim = Src.dim() - 1; + for (auto &dst : Dest) { + int ConcatAxisSize = dst.sizes()[dst.dim() - 1]; + torch::Tensor Slice = + Src.narrow(outerDim, offset, ConcatAxisSize).to(dst.options()); + dst.index_put_({Predicate}, Slice.index({Predicate})); + offset += ConcatAxisSize; + } + return offset; + } + + ~AMSWorkflow() { DBG(Workflow, "Destroying Workflow Handler"); } /** @brief This is the main entry point of AMSLib and replaces the original - * execution path of the application. - * @param[in] probDescr an opaque type that will be forwarded to the - * application upcall - * @param[in] totalElements the total number of elements to apply the SPMD - * function on - * @param[in] inputs the inputs of the computation. - * @param[out] outputs the computed outputs. - * @param[in] Comm The MPI Communicatotor for all ranks participating in the - * SPMD execution. 
- * - * @details The function corresponds to the main driver of the AMSLib. - * Assuming an original 'foo' function void foo ( void *cls, int numElements, - * void **inputs, void **outputs){ parallel_for(I : numElements){ - * cls->physics(inputs[0][I], outputs[0][I]); - * } - * } - * - * The AMS transformation would functionaly look like this: - * void AMSfoo ( void *cls, int numElements, void **inputs, void **outputs){ - * parallel_for(I : numElements){ - * if ( UQ (I) ){ - * Surrogate(inputs[0][I], outputs[0][I]) - * } - * else{ - * cls->physics(inputs[0][I], outputs[0][I]); - * DB->Store(inputs[0][I], outputs[0][I]); - * } - * } - * } - * - * Yet, AMS assumes a SPMD physics function (in the example cls->physics). - * Therefore, the AMS transformation is taking place at the level of the SPMD - * execution. The following transformation is equivalent void AMSfoo( void - * *cls, int numElements, void **inputs, void **outputs){ predicates = - * UQ(inputs, numElements); modelInputs, physicsInputs = partition(predicates, - * inputs); modelOuputs, physicsOutputs = partition(predicates, output); - * foo(cls, physicsInputs.size(), physicsInputs, physicsOutputs); - * surrogate(modelInputs, modelOuputs, modelOuputs.size()); - * DB->Store(physicsInputs, physicsOutputs); - * concatenate(outptuts, modelOuputs, predicate); - * } - * - * This transformation can exploit the parallel nature of all the required - * steps. - */ - void evaluate(void *probDescr, - const int totalElements, - const FPTypeValue **inputs, - FPTypeValue **outputs, - int inputDim, - int outputDim) + * execution path of the application. + * @param[in] probDescr an opaque type that will be forwarded to the + * application upcall + * @param[in] totalElements the total number of elements to apply the SPMD + * function on + * @param[in] inputs the inputs of the computation. + * @param[out] outputs the computed outputs. 
+ * @param[in] Comm The MPI Communicatotor for all ranks participating in the + * SPMD execution. + * + * @details The function corresponds to the main driver of the AMSLib. + * Assuming an original 'foo' function void foo ( void *cls, int numElements, + * void **inputs, void **outputs){ parallel_for(I : numElements){ + * cls->physics(inputs[0][I], outputs[0][I]); + * } + * } + * + * The AMS transformation would functionaly look like this: + * void AMSfoo ( void *cls, int numElements, void **inputs, void **outputs){ + * parallel_for(I : numElements){ + * if ( UQ (I) ){ + * Surrogate(inputs[0][I], outputs[0][I]) + * } + * else{ + * cls->physics(inputs[0][I], outputs[0][I]); + * DB->Store(inputs[0][I], outputs[0][I]); + * } + * } + * } + * + * Yet, AMS assumes a SPMD physics function (in the example cls->physics). + * Therefore, the AMS transformation is taking place at the level of the SPMD + * execution. The following transformation is equivalent void AMSfoo( void + * *cls, int numElements, void **inputs, void **outputs){ predicates = + * UQ(inputs, numElements); modelInputs, physicsInputs = partition(predicates, + * inputs); modelOuputs, physicsOutputs = partition(predicates, output); + * foo(cls, physicsInputs.size(), physicsInputs, physicsOutputs); + * surrogate(modelInputs, modelOuputs, modelOuputs.size()); + * DB->Store(physicsInputs, physicsOutputs); + * concatenate(outptuts, modelOuputs, predicate); + * } + * + * This transformation can exploit the parallel nature of all the required + * steps. 
+ */ + void evaluate(EOSLambda CallBack, + ams::MutableArrayRef Ins, + ams::MutableArrayRef InOuts, + ams::MutableArrayRef Outs) { CALIPER(CALI_MARK_BEGIN("AMSEvaluate");) + DBG(Workflow, + "Entering Workflow with TorchIn:%ld, TochInOut:%ld, TorchOut:%ld", + Ins.size(), + InOuts.size(), + Outs.size()); + + std::string msg{"ApplicationInput: [ "}; + for (auto &TI : Ins) + msg += shapeToString(TI) + " "; + msg += "]"; + DBG(Workflow, "%s", msg.c_str()); + + msg = "ApplicationInOut: [ "; + for (auto &TIO : InOuts) + msg += shapeToString(TIO) + " "; + msg += "]"; + DBG(Workflow, "%s", msg.c_str()); + + msg = "ApplicationOutput: [ "; + for (auto &TO : Outs) + msg += shapeToString(TO) + " "; + msg += "]"; + DBG(Workflow, "%s", msg.c_str()); + + + SmallVector InputTensors(Ins.begin(), Ins.end()); + SmallVector OutputTensors(Outs.begin(), Outs.end()); + DBG(Workflow, + "Entering Workflow with TorchIn:%ld, TorchOut:%ld", + InputTensors.size(), + OutputTensors.size()); + for (auto Tensor : InOuts) { + InputTensors.push_back(Tensor); + OutputTensors.push_back(Tensor); + } - CDEBUG(Workflow, - rId == 0, - "Entering Evaluate " - "with problem dimensions [(%d, %d, %d, %d)]", - totalElements, - inputDim, - totalElements, - outputDim); - // To move around the inputs, outputs we bundle them as std::vectors - std::vector origInputs(inputs, inputs + inputDim); - std::vector origOutputs(outputs, outputs + outputDim); - auto &rm = ams::ResourceManager::getInstance(); + // Here we create a copy of the inputs/outputs. This is "necessary". To correctly handle + // input-output cases and to also to set them to right precision. 
REPORT_MEM_USAGE(Workflow, "Start") - if (!UQModel->hasSurrogate()) { - FPTypeValue **tmpInputs = const_cast(inputs); + if (!MLModel) { + DBG(Workflow, "Model does not exist, calling entire application"); + // We need to clone only inout data to guarantee + // we have a copy of them when writting the database + SmallVector PhysicInOutsBefore; + for (auto S : InOuts) + PhysicInOutsBefore.push_back(S.clone()); - std::vector tmpIn(tmpInputs, tmpInputs + inputDim); - DBG(Workflow, "No-Model, I am calling Physics code (for all data)"); + // We call the application here CALIPER(CALI_MARK_BEGIN("PHYSICS MODULE");) - AppCall(probDescr, - totalElements, - reinterpret_cast(origInputs.data()), - reinterpret_cast(origOutputs.data())); + callApplication(CallBack, Ins, InOuts, Outs); CALIPER(CALI_MARK_END("PHYSICS MODULE");) + if (DB) { - CALIPER(CALI_MARK_BEGIN("DBSTORE");) - store(totalElements, tmpIn, origOutputs); - CALIPER(CALI_MARK_END("DBSTORE");) + storeComputedData(Ins, PhysicInOutsBefore, Outs, InOuts); } CALIPER(CALI_MARK_END("AMSEvaluate");) return; @@ -345,133 +335,85 @@ class AMSWorkflow rId == 0, "Updating surrogate model with %s", model.c_str()) - UQModel->updateModel(model); + // UQModel->updateModel(model); } CALIPER(CALI_MARK_END("UPDATEMODEL");) - // The predicate with which we will split the data on a later step - bool *predicate = rm.allocate(totalElements, appDataLoc); - // ------------------------------------------------------------- - // STEP 1: call the UQ module to look at input uncertainties - // to decide if making a ML inference makes sense + // STEP 1: call the ML Model to get both the prediction and the predicates. 
// ------------------------------------------------------------- - CALIPER(CALI_MARK_BEGIN("UQ_MODULE");) - UQModel->evaluate(totalElements, origInputs, origOutputs, predicate); - CALIPER(CALI_MARK_END("UQ_MODULE");) - - DBG(Workflow, "Computed Predicates") + CALIPER(CALI_MARK_BEGIN("SURROGATE");) + // The predicate with which we will split the data on a lateMLInputsr step + auto [MLOutputs, Predicate] = + MLModel->evaluate(InputTensors, uqPolicy, threshold); - // Pointer values which store input data values - // to be computed using the eos function. - std::vector packedInputs; + CALIPER(CALI_MARK_END("SURROGATE");) - for (int i = 0; i < inputDim; i++) { - packedInputs.emplace_back( - rm.allocate(totalElements, appDataLoc)); - } + //Copy out the results of the ML Model to the correct indices, this needs to happen + // NOTE: squeezing the predicate is important for the operations next. As they require + // this shape. We cannot call the inpace operator as the Predicate is generated by the model, + // in inference mode, and thus it has the read-only property set. + Predicate = Predicate.squeeze(); + CALIPER(CALI_MARK_BEGIN("MLDomainToApplication");) + int offset = MLDomainToApplication(MLOutputs, Outs, Predicate, 0); + MLDomainToApplication(MLOutputs, InOuts, Predicate, offset); + CALIPER(CALI_MARK_END("MLDomainToApplication");) - DBG(Workflow, "Allocated input resources") + // Revert pedicates and use it to pick the Physic points outputs. 
+ auto WrongMLIndices = torch::logical_not(Predicate); + if (WrongMLIndices.sum().item() == 0) return; - // ----------------------------------------------------------------- - // STEP 3: call physics module only where predicate = false - // ----------------------------------------------------------------- - // ---- 3a: we need to pack the sparse data based on the uq flag + // Physis* tensors have the points which the model could not accurately predict CALIPER(CALI_MARK_BEGIN("PACK");) - const long packedElements = data_handler::pack( - appDataLoc, predicate, totalElements, origInputs, packedInputs); + SmallVector PhysicIns(subSelectTensors(Ins, WrongMLIndices)); + SmallVector PhysicInOuts( + subSelectTensors(InOuts, WrongMLIndices)); + // TODO: Outs does not need sub select, we will write all of these from scratch + SmallVector PhysicOuts( + subSelectTensors(Outs, WrongMLIndices)); CALIPER(CALI_MARK_END("PACK");) - // Pointer values which store output data values - // to be computed using the eos function. - std::vector packedOutputs; - for (int i = 0; i < outputDim; i++) { - packedOutputs.emplace_back( - rm.allocate(packedElements, appDataLoc)); - } + // Copy and clone. This important to take place before AppCall is executed. To keep a copy of the input values + // that will be overwritten. + SmallVector PhysicInOutsBefore; + for (auto S : PhysicInOuts) + PhysicInOutsBefore.push_back(S.clone()); - { - void **iPtr = reinterpret_cast(packedInputs.data()); - void **oPtr = reinterpret_cast(packedOutputs.data()); - long lbElements = packedElements; - - // FIXME: I don't like the way we separate code here. - // Simple modification can make it easier to read. 
- // if (should_load_balance) -> Code for load balancing - // else -> current code -#ifdef __ENABLE_MPI__ - CALIPER(CALI_MARK_BEGIN("LOAD BALANCE MODULE");) - AMSLoadBalancer lBalancer(rId, wSize, packedElements, comm); - if (should_load_balance()) { - lBalancer.init(inputDim, outputDim, appDataLoc); - lBalancer.scatterInputs(packedInputs, appDataLoc); - iPtr = reinterpret_cast(lBalancer.inputs()); - oPtr = reinterpret_cast(lBalancer.outputs()); - lbElements = lBalancer.getBalancedSize(); - } - CALIPER(CALI_MARK_END("LOAD BALANCE MODULE");) -#endif - // ---- 3b: call the physics module and store in the data base - if (packedElements > 0) { - CALIPER(CALI_MARK_BEGIN("PHYSICS MODULE");) - AppCall(probDescr, lbElements, iPtr, oPtr); - CALIPER(CALI_MARK_END("PHYSICS MODULE");) - } + // We call the application here + CALIPER(CALI_MARK_BEGIN("PHYSICS MODULE");) + callApplication(CallBack, PhysicIns, PhysicInOuts, PhysicOuts); + CALIPER(CALI_MARK_END("PHYSICS MODULE");) -#ifdef __ENABLE_MPI__ - CALIPER(CALI_MARK_BEGIN("LOAD BALANCE MODULE");) - if (should_load_balance()) { - lBalancer.gatherOutputs(packedOutputs, appDataLoc); - } - CALIPER(CALI_MARK_END("LOAD BALANCE MODULE");) -#endif - } - // ---- 3c: unpack the data CALIPER(CALI_MARK_BEGIN("UNPACK");) - data_handler::unpack( - appDataLoc, predicate, totalElements, packedOutputs, origOutputs); + // Copy out the computation results to the original tensors/buffers + ScatterPhysicOutputsToOrigDomain(PhysicOuts, + WrongMLIndices.squeeze_(), + Outs); + ScatterPhysicOutputsToOrigDomain(PhysicInOuts, WrongMLIndices, InOuts); CALIPER(CALI_MARK_END("UNPACK");) + DBG(Workflow, "Finished physics evaluation") if (DB) { - CALIPER(CALI_MARK_BEGIN("DBSTORE");) - if (!DB->storePredicate()) { - DBG(Workflow, - "Storing data (#elements = %d) to database", - packedElements); - store(packedElements, packedInputs, packedOutputs); - } else { - DBG(Workflow, - "Storing data (#elements = %d) to database including predicates", - totalElements); 
- store(totalElements, origInputs, origOutputs, predicate); - } - - CALIPER(CALI_MARK_END("DBSTORE");) + storeComputedData(PhysicIns, + PhysicInOutsBefore, + PhysicOuts, + PhysicInOuts); } - // ----------------------------------------------------------------- - // Deallocate temporal data - // ----------------------------------------------------------------- - for (int i = 0; i < inputDim; i++) - rm.deallocate(packedInputs[i], appDataLoc); - for (int i = 0; i < outputDim; i++) - rm.deallocate(packedOutputs[i], appDataLoc); - - rm.deallocate(predicate, appDataLoc); DBG(Workflow, "Finished AMSExecution") CINFO(Workflow, rId == 0, "Computed %ld " "using physics out of the %ld items (%.2f)", - packedElements, - totalElements, - (float)(packedElements) / float(totalElements)) + PhysicIns[0].sizes()[0], + InputTensors[0].sizes()[0], + (float)(PhysicIns[0].sizes()[0]) / float(InputTensors[0].sizes()[0])); REPORT_MEM_USAGE(Workflow, "End") CALIPER(CALI_MARK_END("AMSEvaluate");) diff --git a/tests/AMSWorkflow/test_faccessors.py b/tests/AMSWorkflow/test_faccessors.py index 97386dc8..299cf171 100644 --- a/tests/AMSWorkflow/test_faccessors.py +++ b/tests/AMSWorkflow/test_faccessors.py @@ -1,4 +1,3 @@ -import csv import pathlib import unittest @@ -34,27 +33,6 @@ def _store(self, cls, fn): return inputs, outputs -class TestCSVWriter(TestWriter): - def test_csv_open_close( - self, - ): - super()._open_close(faccessors.CSVWriter, "ams_test." + faccessors.CSVReader.get_file_format_suffix()) - - def test_csv_store(self): - fn = "ams_test." + faccessors.CSVReader.get_file_format_suffix() - inputs, outputs = super()._store(faccessors.CSVWriter, fn) - with open(fn, "r") as x: - sample_data = list(csv.reader(x, delimiter=":")) - data = np.delete(np.array(sample_data), (0), axis=0).astype(inputs.dtype) - _cdata = np.hstack((inputs, outputs)) - self.assertTrue(np.array_equal(data, _cdata), msg="Writting data loses information") - - def tearDown(self): - fn = pathlib.Path("ams_test." 
+ faccessors.CSVReader.get_file_format_suffix()) - if fn.exists(): - fn.unlink() - - class TestHDF5Writer(TestWriter): def _pack_dsets_to_list(self, dsets, selector): data = [dsets[k[0]] for k in selector] @@ -67,12 +45,12 @@ def _map_name_to_index(self, dsets_keys, name): keys = [(k, int(k.split("_")[-1])) for k in keys] return keys - def test_csv_open_close( + def test_hdf5_open_close( self, ): super()._open_close(faccessors.HDF5Writer, "ams_test." + faccessors.HDF5Writer.get_file_format_suffix()) - def test_csv_store(self): + def test_hdf5_store(self): fn = "ams_test." + faccessors.HDF5Writer.get_file_format_suffix() inputs, outputs = super()._store(faccessors.HDF5Writer, fn) @@ -93,14 +71,14 @@ def tearDown(self): class TestH5PackedWriter(TestWriter): - def test_csv_open_close( + def test_hdf5_open_close( self, ): super()._open_close( faccessors.HDF5PackedWriter, "ams_test." + faccessors.HDF5PackedWriter.get_file_format_suffix() ) - def test_csv_store(self): + def test_hdf5_store(self): fn = "ams_test." + faccessors.HDF5PackedWriter.get_file_format_suffix() inputs, outputs = super()._store(faccessors.HDF5PackedWriter, fn) @@ -149,18 +127,6 @@ def _cmp(self, writer_cls, reader_cls, fn): ) -class TestCSVReader(TestReader): - def test_load(self): - fn = "ams_test." + faccessors.CSVReader.get_file_format_suffix() - super()._cmp(faccessors.CSVWriter, faccessors.CSVReader, fn) - - def tearDown(self): - fn = "ams_test." + faccessors.CSVReader.get_file_format_suffix() - fn = pathlib.Path(fn) - if fn.exists(): - fn.unlink() - - class TestHDF5Reader(TestReader): def test_load(self): fn = "ams_test." 
+ faccessors.HDF5CLibReader.get_file_format_suffix() diff --git a/tests/AMSlib/CMakeLists.txt b/tests/AMSlib/CMakeLists.txt index 1b50c711..1cbfbca0 100644 --- a/tests/AMSlib/CMakeLists.txt +++ b/tests/AMSlib/CMakeLists.txt @@ -3,265 +3,11 @@ # # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -function(JSON_TESTS db_type) - set(FS_PATH "${CMAKE_CURRENT_BINARY_DIR}") - set(AMS_DB_TEST_TYPE ${db_type}) - set(JSON_FP "${CMAKE_CURRENT_BINARY_DIR}/${db_type}.json") - configure_file("${CMAKE_CURRENT_SOURCE_DIR}/json_configs/env_2_models_fs_rand_uq.json.in" "${JSON_FP}" @ONLY) - - # Tests Random models with different percentages both models store to file - add_test(NAME AMSEndToEndFromJSON::Random10::Random50::Double::DB::${db_type}::HOST COMMAND bash -c "AMS_OBJECTS=${JSON_FP} ${CMAKE_CURRENT_BINARY_DIR}/ams_end_to_end_env 0 8 9 \"double\" 1 1024 app_random_10 app_random_50;AMS_OBJECTS=${JSON_FP} python3 ${CMAKE_CURRENT_SOURCE_DIR}/verify_ete.py 0 8 9 \"double\" 1024 app_random_10 app_random_50") - - - # Tests delta-uq models with different aggregation both models store to file - add_test(NAME AMSEndToEndFromJSON::DuqMean::DuqMax::Double::DB::${db_type}::HOST COMMAND bash -c "AMS_OBJECTS=${JSON_FP} ${CMAKE_CURRENT_BINARY_DIR}/ams_end_to_end_env 0 8 9 \"double\" 1 1024 app_uq_mean app_uq_max;AMS_OBJECTS=${JSON_FP} python3 ${CMAKE_CURRENT_SOURCE_DIR}/verify_ete.py 0 8 9 \"double\" 1024 app_uq_mean app_uq_max") - - - # Tests detla uq model with a random uq model both models store to files - add_test(NAME AMSEndToEndFromJSON::Random::DuqMax::Double::DB::${db_type}::HOST COMMAND bash -c "AMS_OBJECTS=${JSON_FP} ${CMAKE_CURRENT_BINARY_DIR}/ams_end_to_end_env 0 8 9 \"double\" 1 1024 app_random_10 app_uq_max;AMS_OBJECTS=${JSON_FP} python3 ${CMAKE_CURRENT_SOURCE_DIR}/verify_ete.py 0 8 9 \"double\" 1024 app_random_10 app_uq_max") - - # Tests detla uq model with no model. 
uq model both store to files - add_test(NAME AMSEndToEndFromJSON::Random::NoModel::Double::DB::${db_type}::HOST COMMAND bash -c "AMS_OBJECTS=${JSON_FP} ${CMAKE_CURRENT_BINARY_DIR}/ams_end_to_end_env 0 8 9 \"double\" 1 1024 app_random_10 app_no_model;AMS_OBJECTS=${JSON_FP} python3 ${CMAKE_CURRENT_SOURCE_DIR}/verify_ete.py 0 8 9 \"double\" 1024 app_random_10 app_no_model") - - # Tests 2 delta uq models with no deb . uq model both store to files - add_test(NAME AMSEndToEndFromJSON::DuqMean::DuqMax::Double::NODB::${db_type}::HOST COMMAND bash -c "AMS_OBJECTS=${JSON_FP} ${CMAKE_CURRENT_BINARY_DIR}/ams_end_to_end_env 0 8 9 \"double\" 1 1024 app_uq_mean_ndb app_uq_max_ndb;AMS_OBJECTS=${JSON_FP} python3 ${CMAKE_CURRENT_SOURCE_DIR}/verify_ete.py 0 8 9 \"double\" 1024 app_uq_mean_ndb app_uq_max_ndb") - - # Tests null models null dbs - add_test(NAME AMSEndToEndFromJSON::None::None::Double::NODB::${db_type}::HOST COMMAND bash -c "AMS_OBJECTS=${JSON_FP} ${CMAKE_CURRENT_BINARY_DIR}/ams_end_to_end_env 0 8 9 \"double\" 1 1024 app_no_model_no_db app_no_model_no_db ;AMS_OBJECTS=${JSON_FP} python3 ${CMAKE_CURRENT_SOURCE_DIR}/verify_ete.py 0 8 9 \"double\" 1024 app_no_model_no_db app_no_model_no_db") - - unset(AMS_DB_TEST_TYPE) - unset(JSON_FP) -endfunction() - -function(CHECK_RMQ_CONFIG file) - # Read the JSON file. - file(READ ${file} MY_JSON_STRING) - message(STATUS "RabbitMQ config ${file}") - - string(JSON DB_CONF GET ${MY_JSON_STRING} db) - string(JSON DB_CONF GET ${DB_CONF} rmq_config) - string(JSON RMQ_HOST GET ${DB_CONF} "service-host") - string(JSON RMQ_PORT GET ${DB_CONF} "service-port") - - if(NOT "${RMQ_HOST}" STREQUAL "" AND NOT "${RMQ_PORT}" STREQUAL "0") - message(STATUS "RabbitMQ config ${file}: ${RMQ_HOST}:${RMQ_PORT}") - else() - message(WARNING "RabbitMQ config file ${file} looks empty! 
Make sure to fill these fields before running the tests") - endif() -endfunction() - -function(INTEGRATION_TEST_ENV) - JSON_TESTS("csv") - if (WITH_HDF5) - JSON_TESTS("hdf5") - set(JSON_FP "${CMAKE_CURRENT_BINARY_DIR}/hdf5.json") - # Tests delta-uq models with different aggregation both models store to file with debug option set to on. - add_test(NAME AMSEndToEndFromJSON::DuqMean::DuqMax::Double::DB::hdf5-debug::HOST COMMAND bash -c "AMS_OBJECTS=${JSON_FP} ${CMAKE_CURRENT_BINARY_DIR}/ams_end_to_end_env 0 8 9 \"double\" 1 1024 app_uq_mean_debug app_uq_max_debug;AMS_OBJECTS=${JSON_FP} python3 ${CMAKE_CURRENT_SOURCE_DIR}/verify_ete.py 0 8 9 \"double\" 1024 app_uq_mean_debug app_uq_max_debug") - unset(JSON_FP) - endif() -endfunction() - -function(INTEGRATION_TEST_RMQ) - if (WITH_RMQ) - if(EXISTS "${CMAKE_CURRENT_BINARY_DIR}/rmq.json") - # If file exists we do not overwrite it - message(STATUS "Ctest will use ${CMAKE_CURRENT_BINARY_DIR}/rmq.json as RabbitMQ configuration for testing. Make sure RabbitMQ parameters are valid.") - else() - message(STATUS "Copying empty configuration to ${CMAKE_CURRENT_BINARY_DIR}/rmq.json") - configure_file("${CMAKE_CURRENT_SOURCE_DIR}/json_configs/rmq.json.in" "rmq.json" @ONLY) - endif() - set(JSON_FP "${CMAKE_CURRENT_BINARY_DIR}/rmq.json") - CHECK_RMQ_CONFIG(${JSON_FP}) - add_test(NAME AMSEndToEndFromJSON::NoModel::Double::DB::rmq::HOST COMMAND bash -c "AMS_OBJECTS=${JSON_FP} ${CMAKE_CURRENT_BINARY_DIR}/ams_rmq 0 2 2 \"double\" 2 10; AMS_OBJECTS=${JSON_FP} python3 ${CMAKE_CURRENT_SOURCE_DIR}/verify_rmq.py 0 2 2 \"double\" 2 10") - endif() -endfunction() - -function (INTEGRATION_TEST) - ####################################################### - # TEST: output format - # UQ: Random - ####################################################### - - add_test(NAME AMSEndToEnd::Random::Double::DB::OnlyPhysics::None::HOST COMMAND bash -c "${CMAKE_CURRENT_BINARY_DIR}/ams_end_to_end 0 8 9 ${CMAKE_CURRENT_SOURCE_DIR}/linear_scripted_cpu.pt \"double\" 
\"random\" 0.0 1 1024 \"none\" \"./\"; python3 ${CMAKE_CURRENT_SOURCE_DIR}/verify_ete.py 0 8 9 ${CMAKE_CURRENT_SOURCE_DIR}/linear_scripted_cpu.pt \"double\" \"random\" 0.0 1 1024 \"none\" \"./\"") - add_test(NAME AMSEndToEnd::Random::Double::DB::OnlyModel::None::HOST COMMAND bash -c "${CMAKE_CURRENT_BINARY_DIR}/ams_end_to_end 0 8 9 ${CMAKE_CURRENT_SOURCE_DIR}/linear_scripted_cpu.pt \"double\" \"random\" 0.0 1 1024 \"none\" \"./\"; python3 ${CMAKE_CURRENT_SOURCE_DIR}/verify_ete.py 0 8 9 ${CMAKE_CURRENT_SOURCE_DIR}/linear_scripted_cpu.pt \"double\" \"random\" 1.0 1 1024 \"none\" \"./\"") - - - ####################################################### - # TEST: CSV output format - # UQ: Random - # 3 Thresholds: 0, 0.5, 1.0 - ####################################################### - - add_test(NAME AMSEndToEnd::Random::Double::DB::OnlyPhysics::CSV::HOST COMMAND bash -c "${CMAKE_CURRENT_BINARY_DIR}/ams_end_to_end 0 8 9 ${CMAKE_CURRENT_SOURCE_DIR}/linear_scripted_cpu.pt \"double\" \"random\" 0.0 1 1024 \"csv\" \"./\"; python3 ${CMAKE_CURRENT_SOURCE_DIR}/verify_ete.py 0 8 9 ${CMAKE_CURRENT_SOURCE_DIR}/linear_scripted_cpu.pt \"double\" \"random\" 0.0 1 1024 \"csv\" \"./\"") - add_test(NAME AMSEndToEnd::Random::Double::DB::OnlyModel::CSV::HOST COMMAND bash -c "${CMAKE_CURRENT_BINARY_DIR}/ams_end_to_end 0 8 9 ${CMAKE_CURRENT_SOURCE_DIR}/linear_scripted_cpu.pt \"double\" \"random\" 1.0 1 1024 \"csv\" \"./\"; python3 ${CMAKE_CURRENT_SOURCE_DIR}/verify_ete.py 0 8 9 ${CMAKE_CURRENT_SOURCE_DIR}/linear_scripted_cpu.pt \"double\" \"random\" 1.0 1 1024 \"csv\" \"./\"") - add_test(NAME AMSEndToEnd::Random::Double::DB::HALF::CSV::HOST COMMAND bash -c "${CMAKE_CURRENT_BINARY_DIR}/ams_end_to_end 0 8 9 ${CMAKE_CURRENT_SOURCE_DIR}/linear_scripted_cpu.pt \"double\" \"random\" 0.5 1 1024 \"csv\" \"./\"; python3 ${CMAKE_CURRENT_SOURCE_DIR}/verify_ete.py 0 8 9 ${CMAKE_CURRENT_SOURCE_DIR}/linear_scripted_cpu.pt \"double\" \"random\" 0.5 1 1024 \"csv\" \"./\"") - - - if (WITH_HDF5) - 
####################################################### - # TEST: hdf5 output format - # UQ: Random - # 3 Thresholds: 0, 0.5, 1.0 - # precision: double - ####################################################### - - add_test(NAME AMSEndToEnd::Random::Double::DB::OnlyPhysics::HDF5::HOST COMMAND bash -c "${CMAKE_CURRENT_BINARY_DIR}/ams_end_to_end 0 8 9 ${CMAKE_CURRENT_SOURCE_DIR}/linear_scripted_cpu.pt \"double\" \"random\" 0.0 1 1024 \"hdf5\" \"./\"; python3 ${CMAKE_CURRENT_SOURCE_DIR}/verify_ete.py 0 8 9 ${CMAKE_CURRENT_SOURCE_DIR}/linear_scripted_cpu.pt \"double\" \"random\" 0.0 1 1024 \"hdf5\" \"./\"") - add_test(NAME AMSEndToEnd::Random::Double::DB::OnlyModel::HDF5::HOST COMMAND bash -c "${CMAKE_CURRENT_BINARY_DIR}/ams_end_to_end 0 8 9 ${CMAKE_CURRENT_SOURCE_DIR}/linear_scripted_cpu.pt \"double\" \"random\" 1.0 1 1024 \"hdf5\" \"./\"; python3 ${CMAKE_CURRENT_SOURCE_DIR}/verify_ete.py 0 8 9 ${CMAKE_CURRENT_SOURCE_DIR}/linear_scripted_cpu.pt \"double\" \"random\" 1.0 1 1024 \"hdf5\" \"./\"") - add_test(NAME AMSEndToEnd::Random::Double::DB::HALF::HDF5::HOST COMMAND bash -c "${CMAKE_CURRENT_BINARY_DIR}/ams_end_to_end 0 8 9 ${CMAKE_CURRENT_SOURCE_DIR}/linear_scripted_cpu.pt \"double\" \"random\" 0.5 1 1024 \"hdf5\" \"./\"; python3 ${CMAKE_CURRENT_SOURCE_DIR}/verify_ete.py 0 8 9 ${CMAKE_CURRENT_SOURCE_DIR}/linear_scripted_cpu.pt \"double\" \"random\" 0.5 1 1024 \"hdf5\" \"./\"") - - ####################################################### - # TEST: hdf5 output format - # UQ: Random - # 3 Thresholds: 0, 0.5, 1.0 - # precision: single - ####################################################### - - - add_test(NAME AMSEndToEnd::Random::Single::DB::OnlyPhysics::HDF5::HOST COMMAND bash -c "${CMAKE_CURRENT_BINARY_DIR}/ams_end_to_end 0 8 9 ${CMAKE_CURRENT_SOURCE_DIR}/linear_scripted_cpu.pt \"float\" \"random\" 0.0 1 1024 \"hdf5\" \"./\"; python3 ${CMAKE_CURRENT_SOURCE_DIR}/verify_ete.py 0 8 9 ${CMAKE_CURRENT_SOURCE_DIR}/linear_scripted_cpu.pt \"float\" \"random\" 0.0 1 1024 
\"hdf5\" \"./\"") - add_test(NAME AMSEndToEnd::Random::Single::DB::OnlyModel::HDF5::HOST COMMAND bash -c "${CMAKE_CURRENT_BINARY_DIR}/ams_end_to_end 0 8 9 ${CMAKE_CURRENT_SOURCE_DIR}/linear_scripted_cpu.pt \"float\" \"random\" 1.0 1 1024 \"hdf5\" \"./\"; python3 ${CMAKE_CURRENT_SOURCE_DIR}/verify_ete.py 0 8 9 ${CMAKE_CURRENT_SOURCE_DIR}/linear_scripted_cpu.pt \"float\" \"random\" 1.0 1 1024 \"hdf5\" \"./\"") - add_test(NAME AMSEndToEnd::Random::Single::DB::HALF::HDF5::HOST COMMAND bash -c "${CMAKE_CURRENT_BINARY_DIR}/ams_end_to_end 0 8 9 ${CMAKE_CURRENT_SOURCE_DIR}/linear_scripted_cpu.pt \"float\" \"random\" 0.5 1 1024 \"hdf5\" \"./\"; python3 ${CMAKE_CURRENT_SOURCE_DIR}/verify_ete.py 0 8 9 ${CMAKE_CURRENT_SOURCE_DIR}/linear_scripted_cpu.pt \"float\" \"random\" 0.5 1 1024 \"hdf5\" \"./\"") - endif() - - - ####################################################### - # TEST: hdf5 output format - # UQ: deltauq-mean - # 3 Thresholds: 0, 0.5, 1.0 - # precision: double - ####################################################### - add_test(NAME AMSEndToEnd::DeltaUQMean::Double::DB::OnlyPhysics::HDF5::HOST COMMAND bash -c "${CMAKE_CURRENT_BINARY_DIR}/ams_end_to_end 0 8 9 ${CMAKE_CURRENT_SOURCE_DIR}/uq_mean_double_cpu.pt \"double\" \"deltaUQ (mean)\" 0.0 1 1024 \"hdf5\" \"./\"; python3 ${CMAKE_CURRENT_SOURCE_DIR}/verify_ete.py 0 8 9 ${CMAKE_CURRENT_SOURCE_DIR}/uq_mean_double_cpu.pt \"double\" \"deltaUQ (mean)\" 0.0 1 1024 \"hdf5\" \"./\"") - add_test(NAME AMSEndToEnd::DeltaUQMean::Double::DB::OnlyModel::HDF5::HOST COMMAND bash -c "${CMAKE_CURRENT_BINARY_DIR}/ams_end_to_end 0 8 9 ${CMAKE_CURRENT_SOURCE_DIR}/uq_mean_double_cpu.pt \"double\" \"deltaUQ (mean)\" 1.0 1 1024 \"hdf5\" \"./\"; python3 ${CMAKE_CURRENT_SOURCE_DIR}/verify_ete.py 0 8 9 ${CMAKE_CURRENT_SOURCE_DIR}/uq_mean_double_cpu.pt \"double\" \"deltaUQ (mean)\" 1.0 1 1024 \"hdf5\" \"./\"") - add_test(NAME AMSEndToEnd::DeltaUQMean::Double::DB::HALF::HDF5::HOST COMMAND bash -c "${CMAKE_CURRENT_BINARY_DIR}/ams_end_to_end 0 8 
9 ${CMAKE_CURRENT_SOURCE_DIR}/uq_mean_double_cpu.pt \"double\" \"deltaUQ (mean)\" 0.5 1 1024 \"hdf5\" \"./\"; python3 ${CMAKE_CURRENT_SOURCE_DIR}/verify_ete.py 0 8 9 ${CMAKE_CURRENT_SOURCE_DIR}/uq_mean_double_cpu.pt \"double\" \"deltaUQ (mean)\" 0.5 1 1024 \"hdf5\" \"./\"") - - ####################################################### - # TEST: hdf5 output format - # UQ: deltauq-mean - # 3 Thresholds: 0, 0.5, 1.0 - # precision: single - ####################################################### - add_test(NAME AMSEndToEnd::DeltaUQMean::Single::DB::OnlyPhysics::HDF5::HOST COMMAND bash -c "${CMAKE_CURRENT_BINARY_DIR}/ams_end_to_end 0 8 9 ${CMAKE_CURRENT_SOURCE_DIR}/uq_mean_single_cpu.pt \"float\" \"deltaUQ (mean)\" 0.0 1 1024 \"hdf5\" \"./\"; python3 ${CMAKE_CURRENT_SOURCE_DIR}/verify_ete.py 0 8 9 ${CMAKE_CURRENT_SOURCE_DIR}/uq_mean_single_cpu.pt \"float\" \"deltaUQ (mean)\" 0.0 1 1024 \"hdf5\" \"./\"") - add_test(NAME AMSEndToEnd::DeltaUQMean::Single::DB::OnlyModel::HDF5::HOST COMMAND bash -c "${CMAKE_CURRENT_BINARY_DIR}/ams_end_to_end 0 8 9 ${CMAKE_CURRENT_SOURCE_DIR}/uq_mean_single_cpu.pt \"float\" \"deltaUQ (mean)\" 1.0 1 1024 \"hdf5\" \"./\"; python3 ${CMAKE_CURRENT_SOURCE_DIR}/verify_ete.py 0 8 9 ${CMAKE_CURRENT_SOURCE_DIR}/uq_mean_single_cpu.pt \"float\" \"deltaUQ (mean)\" 1.0 1 1024 \"hdf5\" \"./\"") - add_test(NAME AMSEndToEnd::DeltaUQMean::Single::DB::HALF::HDF5::HOST COMMAND bash -c "${CMAKE_CURRENT_BINARY_DIR}/ams_end_to_end 0 8 9 ${CMAKE_CURRENT_SOURCE_DIR}/uq_mean_single_cpu.pt \"float\" \"deltaUQ (mean)\" 0.5 1 1024 \"hdf5\" \"./\"; python3 ${CMAKE_CURRENT_SOURCE_DIR}/verify_ete.py 0 8 9 ${CMAKE_CURRENT_SOURCE_DIR}/uq_mean_single_cpu.pt \"float\" \"deltaUQ (mean)\" 0.5 1 1024 \"hdf5\" \"./\"") - - ####################################################### - # TEST: hdf5 output format - # UQ: deltauq-max - # 3 Thresholds: 0, 0.5, 1.0 - # precision: double - ####################################################### - add_test(NAME 
AMSEndToEnd::DeltaUQMax::Double::DB::OnlyPhysics::HDF5::HOST COMMAND bash -c "${CMAKE_CURRENT_BINARY_DIR}/ams_end_to_end 0 8 9 ${CMAKE_CURRENT_SOURCE_DIR}/uq_max_double_cpu.pt \"double\" \"deltaUQ (max)\" 0.0 1 1024 \"hdf5\" \"./\"; python3 ${CMAKE_CURRENT_SOURCE_DIR}/verify_ete.py 0 8 9 ${CMAKE_CURRENT_SOURCE_DIR}/uq_max_double_cpu.pt \"double\" \"deltaUQ (max)\" 0.0 1 1024 \"hdf5\" \"./\"") - add_test(NAME AMSEndToEnd::DeltaUQMax::Double::DB::OnlyModel::HDF5::HOST COMMAND bash -c "${CMAKE_CURRENT_BINARY_DIR}/ams_end_to_end 0 8 9 ${CMAKE_CURRENT_SOURCE_DIR}/uq_max_double_cpu.pt \"double\" \"deltaUQ (max)\" 1.0 1 1024 \"hdf5\" \"./\"; python3 ${CMAKE_CURRENT_SOURCE_DIR}/verify_ete.py 0 8 9 ${CMAKE_CURRENT_SOURCE_DIR}/uq_max_double_cpu.pt \"double\" \"deltaUQ (max)\" 1.0 1 1024 \"hdf5\" \"./\"") - add_test(NAME AMSEndToEnd::DeltaUQMax::Double::DB::HALF::HDF5::HOST COMMAND bash -c "${CMAKE_CURRENT_BINARY_DIR}/ams_end_to_end 0 8 9 ${CMAKE_CURRENT_SOURCE_DIR}/uq_max_double_cpu.pt \"double\" \"deltaUQ (max)\" 0.5 1 1024 \"hdf5\" \"./\"; python3 ${CMAKE_CURRENT_SOURCE_DIR}/verify_ete.py 0 8 9 ${CMAKE_CURRENT_SOURCE_DIR}/uq_max_double_cpu.pt \"double\" \"deltaUQ (max)\" 0.5 1 1024 \"hdf5\" \"./\"") - - ####################################################### - # TEST: hdf5 output format - # UQ: deltauq-max - # 3 Thresholds: 0, 0.5, 1.0 - # precision: single - ####################################################### - add_test(NAME AMSEndToEnd::DeltaUQMax::Single::DB::OnlyPhysics::HDF5::HOST COMMAND bash -c "${CMAKE_CURRENT_BINARY_DIR}/ams_end_to_end 0 8 9 ${CMAKE_CURRENT_SOURCE_DIR}/uq_max_single_cpu.pt \"float\" \"deltaUQ (max)\" 0.0 1 1024 \"hdf5\" \"./\"; python3 ${CMAKE_CURRENT_SOURCE_DIR}/verify_ete.py 0 8 9 ${CMAKE_CURRENT_SOURCE_DIR}/uq_max_single_cpu.pt \"float\" \"deltaUQ (max)\" 0.0 1 1024 \"hdf5\" \"./\"") - add_test(NAME AMSEndToEnd::DeltaUQMax::Single::DB::OnlyModel::HDF5::HOST COMMAND bash -c "${CMAKE_CURRENT_BINARY_DIR}/ams_end_to_end 0 8 9 
${CMAKE_CURRENT_SOURCE_DIR}/uq_max_single_cpu.pt \"float\" \"deltaUQ (max)\" 1.0 1 1024 \"hdf5\" \"./\"; python3 ${CMAKE_CURRENT_SOURCE_DIR}/verify_ete.py 0 8 9 ${CMAKE_CURRENT_SOURCE_DIR}/uq_max_single_cpu.pt \"float\" \"deltaUQ (max)\" 1.0 1 1024 \"hdf5\" \"./\"") - add_test(NAME AMSEndToEnd::DeltaUQMax::Single::DB::HALF::HDF5::HOST COMMAND bash -c "${CMAKE_CURRENT_BINARY_DIR}/ams_end_to_end 0 8 9 ${CMAKE_CURRENT_SOURCE_DIR}/uq_max_single_cpu.pt \"float\" \"deltaUQ (max)\" 0.5 1 1024 \"hdf5\" \"./\"; python3 ${CMAKE_CURRENT_SOURCE_DIR}/verify_ete.py 0 8 9 ${CMAKE_CURRENT_SOURCE_DIR}/uq_max_single_cpu.pt \"float\" \"deltaUQ (max)\" 0.5 1 1024 \"hdf5\" \"./\"") -endfunction() - - - -function(BUILD_TEST exe source) - add_executable(${exe} ${source}) - target_include_directories(${exe} PRIVATE "${PROJECT_SOURCE_DIR}/src/AMSlib/" umpire ${caliper_INCLUDE_DIR} ${MPI_INCLUDE_PATH}) - target_link_directories(${exe} PRIVATE ${AMS_APP_LIB_DIRS}) - target_link_libraries(${exe} PRIVATE AMS ${AMS_APP_LIBRARIES}) - - target_compile_definitions(${exe} PRIVATE ${AMS_APP_DEFINES}) - - if(WITH_CUDA) - set_target_properties(${exe} PROPERTIES CUDA_ARCHITECTURES "${AMS_CUDA_ARCH}") - set_property(TARGET ${exe} PROPERTY CUDA_SEPARABLE_COMPILATION ON) - set_source_files_properties(${source} PROPERTIES LANGUAGE CUDA) - - target_compile_definitions(${exe} PRIVATE "-D__ENABLE_CUDA__ -DLIBAMS_VERBOSE") - endif() -endfunction() - -function(ADDTEST exe test_name) - add_test(NAME "${test_name}::HOST" COMMAND ${exe} 0 ${ARGN}) - - if(WITH_CUDA) - add_test(NAME "${test_name}::DEVICE" COMMAND ${exe} 1 ${ARGN}) - endif() -endfunction() - -# This test requires Allocate -# TODO: Include tests once we re-instate a pool -#BUILD_TEST(ams_allocator_test ams_allocate.cpp) -#ADDTEST(ams_allocator_test AMSAllocate) -BUILD_TEST(ams_packing_test cpu_packing_test.cpp AMSPack) -ADDTEST(ams_packing_test AMSPack) - -# AMS Database benchmark (RMQ and/or HDF5 + MPI / No ML models used) 
-BUILD_TEST(ams_benchmark_db ams_bench_db.cpp) -# The AMS DB Benchmark requires mfem -# TODO: Remove mfem requirement from the benchmark -target_link_libraries(ams_benchmark_db PRIVATE AMS ${AMS_EXAMPLE_LIBRARIES}) - -if(WITH_TORCH) - BUILD_TEST(ams_inference_test torch_model.cpp) - ADDTEST(ams_inference_test AMSInferDouble ${CMAKE_CURRENT_SOURCE_DIR}/debug_model.pt "double") - ADDTEST(ams_inference_test AMSInferSingle ${CMAKE_CURRENT_SOURCE_DIR}/debug_model.pt "single") - if (WITH_EXAMPLES) - add_test(NAME AMSExampleSingleDeltaUQ::HOST COMMAND ams_example --precision single --uqtype deltauq-mean -db "./" -S ${CMAKE_CURRENT_SOURCE_DIR}/tuple-single.torchscript -e 100) - add_test(NAME AMSExampleSingleRandomUQ::HOST COMMAND ams_example --precision single --uqtype random -S ${CMAKE_CURRENT_SOURCE_DIR}/debug_model.pt -e 100) - add_test(NAME AMSExampleDoubleRandomUQ::HOST COMMAND ams_example --precision double --uqtype random -S ${CMAKE_CURRENT_SOURCE_DIR}/debug_model.pt -e 100) -endif() - -# These are integration tests. 
Try to use an end to end AMS run and realize issues between multiple pieces - BUILD_TEST(ams_end_to_end ams_ete.cpp) - INTEGRATION_TEST() - BUILD_TEST(ams_end_to_end_env ams_ete_env.cpp) - INTEGRATION_TEST_ENV() - BUILD_TEST(ams_rmq ams_rmq_env.cpp) - INTEGRATION_TEST_RMQ() - - - # UQ Tests - BUILD_TEST(ams_delta_uq_test ams_uq_model.cpp) - - if(WITH_TORCH) - add_test(NAME AMSDeltaUQDoubleMean::HOST COMMAND ams_delta_uq_test 0 ${CMAKE_CURRENT_SOURCE_DIR}/torch.duq.cuda "double" 8 9 3 0.0) - add_test(NAME AMSDeltaUQDoubleMax::HOST COMMAND ams_delta_uq_test 0 ${CMAKE_CURRENT_SOURCE_DIR}/torch.duq.cuda "double" 8 9 4 0.0) - add_test(NAME AMSDeltaUQDoubleMean_2::HOST COMMAND ams_delta_uq_test 0 ${CMAKE_CURRENT_SOURCE_DIR}/tuple.duq "double" 2 4 3 0.5) - add_test(NAME AMSDeltaUQDoubleMax_2::HOST COMMAND ams_delta_uq_test 0 ${CMAKE_CURRENT_SOURCE_DIR}/tuple.duq "double" 2 4 4 0.1) - - if(WITH_CUDA) - add_test(NAME AMSDeltaUQDoubleMean::DEVICE COMMAND ams_delta_uq_test 1 ${CMAKE_CURRENT_SOURCE_DIR}/torch.duq.cuda "double" 8 9 3 0.0) - add_test(NAME AMSDeltaUQDoubleMax::DEVICE COMMAND ams_delta_uq_test 1 ${CMAKE_CURRENT_SOURCE_DIR}/torch.duq.cuda "double" 8 9 4 0.0) - add_test(NAME AMSDeltaUQDoubleMean_2::DEVICE COMMAND ams_delta_uq_test 1 ${CMAKE_CURRENT_SOURCE_DIR}/tuple.duq.cuda "double" 2 4 3 0.5) - add_test(NAME AMSDeltaUQDoubleMax_2::DEVICE COMMAND ams_delta_uq_test 1 ${CMAKE_CURRENT_SOURCE_DIR}/tuple.duq.cuda "double" 2 4 4 0.1) - endif() - endif() - - # TODO Add tests with cpu model - BUILD_TEST(ams_update_model ams_update_model.cpp) - ADDTEST(ams_update_model AMSUpdateModelDouble "double" ${CMAKE_CURRENT_SOURCE_DIR}/ConstantZeroModel_cpu.pt ${CMAKE_CURRENT_SOURCE_DIR}/ConstantOneModel_cpu.pt) -endif() - -if(WITH_FAISS) - BUILD_TEST(ams_hdcache_test test_hdcache.cpp) - ADDTEST(ams_hdcache_test AMSHDCacheMeanPolicyDouble ${CMAKE_CURRENT_SOURCE_DIR}/faiss_debug.pt "double" 1 10 4.0 4 5) - - ADDTEST(ams_hdcache_test AMSHDCacheMeanPolicySingle 
${CMAKE_CURRENT_SOURCE_DIR}/faiss_debug.pt "single" 1 10 4.0 4 5) - - ADDTEST(ams_hdcache_test AMSHDCacheMaxPolicyDouble ${CMAKE_CURRENT_SOURCE_DIR}/faiss_debug.pt "double" 2 10 4.0 4 5) - ADDTEST(ams_hdcache_test AMSHDCacheMaxPolicySingle ${CMAKE_CURRENT_SOURCE_DIR}/faiss_debug.pt "single" 2 10 4.0 4 5) - # The max case fails on DEVICE. We should be aware about this when adding support for CI for GPUs - if (WITH_CUDA) - set_tests_properties(AMSHDCacheMaxPolicySingle::DEVICE AMSHDCacheMaxPolicyDouble::DEVICE PROPERTIES DISABLED TRUE) - endif() - - if(WITH_TORCH) - if (WITH_EXAMPLES) - add_test(NAME AMSExampleFaissInferSingle::HOST COMMAND ams_example --precision single --uqtype faiss-mean -S ${CMAKE_CURRENT_SOURCE_DIR}/debug_model.pt -H ${CMAKE_CURRENT_SOURCE_DIR}/example_faiss.idx -e 100) - add_test(NAME AMSExampleFaissInferDouble::HOST COMMAND ams_example --precision double --uqtype faiss-mean -S ${CMAKE_CURRENT_SOURCE_DIR}/debug_model.pt -H ${CMAKE_CURRENT_SOURCE_DIR}/example_faiss.idx -e 100) - endif() - endif() +add_subdirectory(models) +add_subdirectory(torch) +add_subdirectory(db) +add_subdirectory(wf) +add_subdirectory(perf_regression) +if (WITH_HDF5) +add_subdirectory(ams_interface) endif() diff --git a/tests/AMSlib/ConstantOneModel_cpu.pt b/tests/AMSlib/ConstantOneModel_cpu.pt deleted file mode 100644 index 05131615..00000000 Binary files a/tests/AMSlib/ConstantOneModel_cpu.pt and /dev/null differ diff --git a/tests/AMSlib/ConstantZeroModel_cpu.pt b/tests/AMSlib/ConstantZeroModel_cpu.pt deleted file mode 100644 index 5eeafd2a..00000000 Binary files a/tests/AMSlib/ConstantZeroModel_cpu.pt and /dev/null differ diff --git a/tests/AMSlib/ams_allocate.cpp b/tests/AMSlib/ams_allocate.cpp deleted file mode 100644 index df5ff2ea..00000000 --- a/tests/AMSlib/ams_allocate.cpp +++ /dev/null @@ -1,83 +0,0 @@ -/* - * Copyright 2021-2023 Lawrence Livermore National Security, LLC and other - * AMSLib Project Developers - * - * SPDX-License-Identifier: Apache-2.0 WITH 
LLVM-exception - */ - -#include - -constexpr int ERROR = 1; -constexpr int SUCCESS = 0; - -#include -#include -#include -#include -#include -#include - -#include "../utils.hpp" - -int test_allocation(AMSResourceType resource, std::string pool_name) -{ - std::cout << "Testing Pool: " << pool_name << "\n"; - auto& rm = umpire::ResourceManager::getInstance(); - auto& ams_rm = ams::ResourceManager::getInstance(); - double* data = ams_rm.allocate(1, resource); - auto found_allocator = rm.getAllocator(data); - if (ams_rm.getAllocatorName(resource) != found_allocator.getName()) { - std::cout << "Allocator Name" << ams_rm.getAllocatorName(resource) - << "Actual Allocation " << found_allocator.getName() << "\n"; - return 1; - } - - - if (ams_rm.getAllocatorName(resource) != pool_name) { - std::cout << "Allocator Name" << ams_rm.getAllocatorName(resource) - << "is not equal to pool name " << pool_name << "\n"; - return 1; - } - - ams_rm.deallocate(data, resource); - return 0; -} - -int main(int argc, char* argv[]) -{ - installSignals(); - AMSInit(); - int device = std::atoi(argv[1]); - // Testing with global umpire allocators - auto& ams_rm = ams::ResourceManager::getInstance(); - ams_rm.init(); - if (device == 1) { - if (test_allocation(AMSResourceType::AMS_DEVICE, "DEVICE") != 0) return 1; - } else if (device == 0) { - if (test_allocation(AMSResourceType::AMS_HOST, "HOST") != 0) return 1; - } - - // Testing with pools - - if (device == 1) { - auto& rm = umpire::ResourceManager::getInstance(); - auto alloc_resource = rm.makeAllocator( - "test-device", rm.getAllocator("DEVICE")); - ams_rm.setAllocator("test-device", AMSResourceType::AMS_DEVICE); - if (test_allocation(AMSResourceType::AMS_DEVICE, "test-device") != 0) - return ERROR; - } else if (device == 0) { - auto& rm = umpire::ResourceManager::getInstance(); - auto alloc_resource = rm.makeAllocator( - "test-host", rm.getAllocator("HOST")); - ams_rm.setAllocator("test-host", AMSResourceType::AMS_HOST); - if 
(test_allocation(AMSResourceType::AMS_HOST, "test-host") != 0) - return ERROR; - } else { - std::cout << "Unknown device type " << device - << "should be either 1 for GPU device or 0 for HOST\n"; - } - - AMSFinalize(); - return 0; -} diff --git a/tests/AMSlib/ams_ete.cpp b/tests/AMSlib/ams_ete.cpp deleted file mode 100644 index ed5a91ad..00000000 --- a/tests/AMSlib/ams_ete.cpp +++ /dev/null @@ -1,201 +0,0 @@ -#ifdef __AMS_ENABLE_MPI__ -#include -#endif -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "AMS.h" -#include "wf/debug.h" - -#include "../utils.hpp" - -void createUmpirePool(std::string parent_name, std::string pool_name) -{ - auto &rm = umpire::ResourceManager::getInstance(); - auto alloc_resource = rm.makeAllocator( - pool_name, rm.getAllocator(parent_name)); -} - - -AMSDType getDataType(char *d_type) -{ - AMSDType dType = AMSDType::AMS_DOUBLE; - if (std::strcmp(d_type, "float") == 0) { - dType = AMSDType::AMS_SINGLE; - } else if (std::strcmp(d_type, "double") == 0) { - dType = AMSDType::AMS_DOUBLE; - } else { - assert(false && "Unknown data type"); - } - return dType; -} - -template -struct Problem { - int num_inputs; - int num_outputs; - int multiplier; - Problem(int ni, int no) : num_inputs(ni), num_outputs(no), multiplier(100) {} - - void run(long num_elements, DType **inputs, DType **outputs) - { - for (int i = 0; i < num_elements; i++) { - DType sum = 0; - for (int j = 0; j < num_inputs; j++) { - sum += inputs[j][i]; - } - - for (int j = 0; j < num_outputs; j++) { - outputs[j][i] = sum; - } - } - } - - - const DType *initialize_inputs(DType *inputs, long length) - { - for (int i = 0; i < length; i++) { - inputs[i] = static_cast(i); - } - return inputs; - } - - void ams_run(AMSExecutor &wf, - AMSResourceType resource, - int iterations, - int num_elements) - { - auto &rm = umpire::ResourceManager::getInstance(); - - for (int i = 0; i < iterations; i++) { - int elements = num_elements; // 
* ((DType)(rand()) / RAND_MAX) + 1; - std::vector inputs; - std::vector outputs; - - // Allocate Input memory - for (int j = 0; j < num_inputs; j++) { - DType *data = new DType[elements]; - inputs.push_back(initialize_inputs(data, elements)); - } - - // Allocate Output memory - for (int j = 0; j < num_outputs; j++) { - outputs.push_back(new DType[elements]); - } - - AMSExecute(wf, - (void *)this, - elements, - reinterpret_cast(inputs.data()), - reinterpret_cast(outputs.data()), - inputs.size(), - outputs.size()); - - for (int i = 0; i < num_outputs; i++) { - delete outputs[i]; - } - - - for (int i = 0; i < num_inputs; i++) { - delete inputs[i]; - } - } - } -}; - -void callBackDouble(void *cls, long elements, void **inputs, void **outputs) -{ - std::cout << "Called the double model\n"; - static_cast *>(cls)->run(elements, - (double **)(inputs), - (double **)(outputs)); -} - - -void callBackSingle(void *cls, long elements, void **inputs, void **outputs) -{ - std::cout << "Called the single model\n"; - static_cast *>(cls)->run(elements, - (float **)(inputs), - (float **)(outputs)); -} - - -int main(int argc, char **argv) -{ - if (argc != 12) { - std::cout << "Wrong cli\n"; - std::cout << argv[0] - << " use_device(0|1) num_inputs num_outputs model_path " - "data_type(float|double) uq_policy(random|deltaUQ " - "(mean)|deltaUQ (max)) threshold(0) " - "num_iterations avg_num_values db_type(none|csv|hdf5) " - "db_path(path to existing path to store data)"; - return -1; - } - - installSignals(); - AMSInit(); - - int use_device = std::atoi(argv[1]); - int num_inputs = std::atoi(argv[2]); - int num_outputs = std::atoi(argv[3]); - char *model_path = argv[4]; - AMSDType data_type = getDataType(argv[5]); - std::string uq_name = std::string(argv[6]); - const AMSUQPolicy uq_policy = BaseUQ::UQPolicyFromStr(uq_name); - float threshold = std::atof(argv[7]); - int num_iterations = std::atoi(argv[8]); - int avg_elements = std::atoi(argv[9]); - std::string db_type_str = 
std::string(argv[10]); - std::string fs_path = std::string(argv[11]); - AMSDBType db_type = ams::db::getDBType(db_type_str); - AMSResourceType resource = AMSResourceType::AMS_HOST; - srand(time(NULL)); - - AMSConfigureFSDatabase(db_type, fs_path.c_str()); - - assert((uq_policy == AMSUQPolicy::AMS_DELTAUQ_MAX || - uq_policy == AMSUQPolicy::AMS_DELTAUQ_MEAN || - uq_policy == AMSUQPolicy::AMS_RANDOM) && - "Test only supports duq models"); - - createUmpirePool("HOST", "TEST_HOST"); - AMSSetAllocator(AMSResourceType::AMS_HOST, "TEST_HOST"); - - AMSCAbstrModel model_descr = AMSRegisterAbstractModel( - "test", uq_policy, threshold, model_path, nullptr, "test", -1); - - if (data_type == AMSDType::AMS_SINGLE) { - Problem prob(num_inputs, num_outputs); - - AMSExecutor wf = AMSCreateExecutor(model_descr, - AMSDType::AMS_SINGLE, - resource, - (AMSPhysicFn)callBackSingle, - 0, - 1); - - prob.ams_run(wf, resource, num_iterations, avg_elements); - } else { - Problem prob(num_inputs, num_outputs); - AMSExecutor wf = AMSCreateExecutor(model_descr, - AMSDType::AMS_DOUBLE, - resource, - (AMSPhysicFn)callBackDouble, - 0, - 1); - prob.ams_run(wf, resource, num_iterations, avg_elements); - } - AMSFinalize(); - return 0; -} diff --git a/tests/AMSlib/ams_interface/CMakeLists.txt b/tests/AMSlib/ams_interface/CMakeLists.txt new file mode 100644 index 00000000..9761a257 --- /dev/null +++ b/tests/AMSlib/ams_interface/CMakeLists.txt @@ -0,0 +1,211 @@ +function(ADD_API_UNIT_TEST gname name cmd) + add_test(NAME ${name} COMMAND bash -c "${cmd}") + set_tests_properties(${name} PROPERTIES LABELS ${gname}) +endfunction() + + +function(JSON_TESTS db_type) + configure_file("${CMAKE_CURRENT_SOURCE_DIR}/json_configs/env_2_models_fs_rand_uq.json.in" "${JSON_FP}" @ONLY) + + # Tests Random models with different percentages both models store to file + ADD_API_UNIT_TEST(APIEnvAPI AMS::ENV::Random10::Random50::Double::DB::${db_type}::HOST "AMS_OBJECTS=${JSON_FP} ${CMAKE_CURRENT_BINARY_DIR}/ams_end_to_end_env 
0 8 8 \"double\" 1 128 app_random_10 app_random_50;AMS_OBJECTS=${JSON_FP} python3 ${CMAKE_CURRENT_SOURCE_DIR}/verify_ete.py 0 8 8 \"double\" 128 app_random_10 app_random_50") + + + # Tests delta-uq models with different aggregation both models store to file + ADD_API_UNIT_TEST(APIEnvAPI AMS::ENV::DuqMean::DuqMax::Double::DB::${db_type}::HOST "AMS_OBJECTS=${JSON_FP} ${CMAKE_CURRENT_BINARY_DIR}/ams_end_to_end_env 0 8 8 \"double\" 1 128 app_uq_mean app_uq_max;AMS_OBJECTS=${JSON_FP} python3 ${CMAKE_CURRENT_SOURCE_DIR}/verify_ete.py 0 8 8 \"double\" 128 app_uq_mean app_uq_max") + + + # Tests detla uq model with a random uq model both models store to files + ADD_API_UNIT_TEST(APIEnvAPI AMS::ENV::Random::DuqMax::Double::DB::${db_type}::HOST "AMS_OBJECTS=${JSON_FP} ${CMAKE_CURRENT_BINARY_DIR}/ams_end_to_end_env 0 8 8 \"double\" 1 128 app_random_10 app_uq_max;AMS_OBJECTS=${JSON_FP} python3 ${CMAKE_CURRENT_SOURCE_DIR}/verify_ete.py 0 8 8 \"double\" 128 app_random_10 app_uq_max") + + # Tests detla uq model with no model. uq model both store to files + ADD_API_UNIT_TEST(APIEnvAPI AMS::ENV::Random::NoModel::Double::DB::${db_type}::HOST "AMS_OBJECTS=${JSON_FP} ${CMAKE_CURRENT_BINARY_DIR}/ams_end_to_end_env 0 8 8 \"double\" 1 128 app_random_10 app_no_model;AMS_OBJECTS=${JSON_FP} python3 ${CMAKE_CURRENT_SOURCE_DIR}/verify_ete.py 0 8 8 \"double\" 128 app_random_10 app_no_model") + + # Tests 2 delta uq models with no deb . 
uq model both store to files + ADD_API_UNIT_TEST(APIEnvAPI AMS::ENV::DuqMean::DuqMax::Double::NODB::${db_type}::HOST "AMS_OBJECTS=${JSON_FP} ${CMAKE_CURRENT_BINARY_DIR}/ams_end_to_end_env 0 8 8 \"double\" 1 128 app_uq_mean_ndb app_uq_max_ndb;AMS_OBJECTS=${JSON_FP} python3 ${CMAKE_CURRENT_SOURCE_DIR}/verify_ete.py 0 8 8 \"double\" 128 app_uq_mean_ndb app_uq_max_ndb") + + # Tests null models null dbs + ADD_API_UNIT_TEST(APIEnvAPI AMS::ENV::None::None::Double::NODB::${db_type}::HOST "AMS_OBJECTS=${JSON_FP} ${CMAKE_CURRENT_BINARY_DIR}/ams_end_to_end_env 0 8 8 \"double\" 1 128 app_no_model_no_db app_no_model_no_db ;AMS_OBJECTS=${JSON_FP} python3 ${CMAKE_CURRENT_SOURCE_DIR}/verify_ete.py 0 8 8 \"double\" 128 app_no_model_no_db app_no_model_no_db") + + unset(AMS_DB_TEST_TYPE) + unset(JSON_FP) +endfunction() + + +function(CHECK_RMQ_CONFIG file) + # Read the JSON file. + file(READ ${file} MY_JSON_STRING) + message(STATUS "RabbitMQ config ${file}") + string(JSON DB_CONF GET ${MY_JSON_STRING} "db") + string(JSON DB_CONF GET ${DB_CONF} "rmq_config") + string(JSON RMQ_HOST GET ${DB_CONF} "service-host") + string(JSON RMQ_PORT GET ${DB_CONF} "service-port") + + if(NOT "${RMQ_HOST}" STREQUAL "" AND NOT "${RMQ_PORT}" STREQUAL "0") + message(STATUS "RabbitMQ config ${file}: ${RMQ_HOST}:${RMQ_PORT}") + else() + message(WARNING "RabbitMQ config file ${file} looks empty! 
Make sure to fill these fields before running the tests") + endif() +endfunction() + + +function(INTEGRATION_TEST_ENV) + set(DB_CONFIG "\"fs_path\" : \"${CMAKE_CURRENT_BINARY_DIR}\"") + set(AMS_DB_TEST_TYPE "hdf5") + set(JSON_FP "${CMAKE_CURRENT_BINARY_DIR}/hdf5.json") + JSON_TESTS("hdf5") + unset(JSON_FP) +endfunction() + + +function(CONFIGURE_RMQ_FILE) + if(DEFINED ENV{AMS_RMQ_CONFIG}) + message(STATUS "Env variable for rmq is set: $ENV{AMS_RMQ_CONFIG}") + set(AMS_DB_TEST_TYPE "rmq") + set(DB_CONFIG "\"rmq_config\" : $ENV{AMS_RMQ_CONFIG},\n \"update_surrogate\": false\n") + set(JSON_FP "${CMAKE_CURRENT_BINARY_DIR}/rmq.json") + JSON_TESTS("rmq") + CHECK_RMQ_CONFIG(${JSON_FP}) + unset(JSON_FP) + else() + message(WARNING "Copying empty configuration to ${CMAKE_CURRENT_BINARY_DIR}/rmq.json") + configure_file("${CMAKE_CURRENT_SOURCE_DIR}/json_configs/rmq.json.in" "rmq.json" @ONLY) + set(JSON_FP "${CMAKE_CURRENT_BINARY_DIR}/rmq.json") + JSON_TESTS("rmq") + unset(JSON_FP) + endif() +endfunction() + +function(INTEGRATION_TEST_RMQ) + if(EXISTS "${CMAKE_CURRENT_BINARY_DIR}/rmq.json") + # If file exists we do not overwrite it + message(STATUS "Ctest will use ${CMAKE_CURRENT_BINARY_DIR}/rmq.json as RabbitMQ configuration for testing. 
Make sure RabbitMQ parameters are valid.") + set(JSON_FP "${CMAKE_CURRENT_BINARY_DIR}/rmq.json") + JSON_TESTS("rmq") + unset(JSON_FP) + else() + CONFIGURE_RMQ_FILE() + endif() +endfunction() + +function (INTEGRATION_TEST) + ####################################################### + # TEST: output format + # UQ: Random + ####################################################### + + ADD_API_UNIT_TEST(API_DIRECT_QUERY AMS::API::ModelDefine::Random::Double::DB::OnlyPhysics::None::HOST "${CMAKE_CURRENT_BINARY_DIR}/ams_end_to_end 0 8 8 ${TORCH_MODEL_DIR}/linear_scripted_single_cpu_random.pt \"double\" \"random\" 0.0 1 128 \"none\" \"./\"; python3 ${CMAKE_CURRENT_SOURCE_DIR}/verify_ete.py 0 8 8 ${CMAKE_CURRENT_SOURCE_DIR}/linear_scripted_cpu.pt \"double\" \"random\" 0.0 1 128 \"none\" \"./\"") + ADD_API_UNIT_TEST(API_DIRECT_QUERY AMS::API::ModelDefine::Random::Double::DB::OnlyModel::None::HOST "${CMAKE_CURRENT_BINARY_DIR}/ams_end_to_end 0 8 8 ${TORCH_MODEL_DIR}/linear_scripted_single_cpu_random.pt \"double\" \"random\" 0.0 1 128 \"none\" \"./\"; python3 ${CMAKE_CURRENT_SOURCE_DIR}/verify_ete.py 0 8 8 ${CMAKE_CURRENT_SOURCE_DIR}/linear_scripted_cpu.pt \"double\" \"random\" 1.0 1 128 \"none\" \"./\"") + + + if (WITH_HDF5) + ####################################################### + # TEST: hdf5 output format + # UQ: Random + # 3 Thresholds: 0, 0.5, 1.0 + # precision: double + ####################################################### + + ADD_API_UNIT_TEST(API_DIRECT_QUERY AMS::API::ModelDefine::Random::Double::DB::OnlyPhysics::HDF5::HOST "${CMAKE_CURRENT_BINARY_DIR}/ams_end_to_end 0 8 8 ${TORCH_MODEL_DIR}/linear_scripted_single_cpu_random.pt \"double\" \"random\" 0.0 1 128 \"hdf5\" \"./\"; python3 ${CMAKE_CURRENT_SOURCE_DIR}/verify_ete.py 0 8 8 ${CMAKE_CURRENT_SOURCE_DIR}/linear_scripted_cpu.pt \"double\" \"random\" 0.0 1 128 \"hdf5\" \"./\"") + ADD_API_UNIT_TEST(API_DIRECT_QUERY AMS::API::ModelDefine::Random::Double::DB::OnlyModel::HDF5::HOST 
"${CMAKE_CURRENT_BINARY_DIR}/ams_end_to_end 0 8 8 ${TORCH_MODEL_DIR}/linear_scripted_single_cpu_random.pt \"double\" \"random\" 1.0 1 128 \"hdf5\" \"./\"; python3 ${CMAKE_CURRENT_SOURCE_DIR}/verify_ete.py 0 8 8 ${CMAKE_CURRENT_SOURCE_DIR}/linear_scripted_cpu.pt \"double\" \"random\" 1.0 1 128 \"hdf5\" \"./\"") + ADD_API_UNIT_TEST(API_DIRECT_QUERY AMS::API::ModelDefine::Random::Double::DB::HALF::HDF5::HOST "${CMAKE_CURRENT_BINARY_DIR}/ams_end_to_end 0 8 8 ${TORCH_MODEL_DIR}/linear_scripted_single_cpu_random.pt \"double\" \"random\" 0.5 1 128 \"hdf5\" \"./\"; python3 ${CMAKE_CURRENT_SOURCE_DIR}/verify_ete.py 0 8 8 ${CMAKE_CURRENT_SOURCE_DIR}/linear_scripted_cpu.pt \"double\" \"random\" 0.5 1 128 \"hdf5\" \"./\"") + + ####################################################### + # TEST: hdf5 output format + # UQ: Random + # 3 Thresholds: 0, 0.5, 1.0 + # precision: single + ####################################################### + + + ADD_API_UNIT_TEST(API_DIRECT_QUERY AMS::API::ModelDefine::Random::Single::DB::OnlyPhysics::HDF5::HOST "${CMAKE_CURRENT_BINARY_DIR}/ams_end_to_end 0 8 8 ${TORCH_MODEL_DIR}/linear_scripted_single_cpu_random.pt \"float\" \"random\" 0.0 1 128 \"hdf5\" \"./\"; python3 ${CMAKE_CURRENT_SOURCE_DIR}/verify_ete.py 0 8 8 ${CMAKE_CURRENT_SOURCE_DIR}/linear_scripted_cpu.pt \"float\" \"random\" 0.0 1 128 \"hdf5\" \"./\"") + ADD_API_UNIT_TEST(API_DIRECT_QUERY AMS::API::ModelDefine::Random::Single::DB::OnlyModel::HDF5::HOST "${CMAKE_CURRENT_BINARY_DIR}/ams_end_to_end 0 8 8 ${TORCH_MODEL_DIR}/linear_scripted_single_cpu_random.pt \"float\" \"random\" 1.0 1 128 \"hdf5\" \"./\"; python3 ${CMAKE_CURRENT_SOURCE_DIR}/verify_ete.py 0 8 8 ${CMAKE_CURRENT_SOURCE_DIR}/linear_scripted_cpu.pt \"float\" \"random\" 1.0 1 128 \"hdf5\" \"./\"") + ADD_API_UNIT_TEST(API_DIRECT_QUERY AMS::API::ModelDefine::Random::Single::DB::HALF::HDF5::HOST "${CMAKE_CURRENT_BINARY_DIR}/ams_end_to_end 0 8 8 ${TORCH_MODEL_DIR}/linear_scripted_single_cpu_random.pt \"float\" \"random\" 0.5 1 128 
\"hdf5\" \"./\"; python3 ${CMAKE_CURRENT_SOURCE_DIR}/verify_ete.py 0 8 8 ${CMAKE_CURRENT_SOURCE_DIR}/linear_scripted_cpu.pt \"float\" \"random\" 0.5 1 128 \"hdf5\" \"./\"") + endif() + + + ####################################################### + # TEST: hdf5 output format + # UQ: deltauq-mean + # 3 Thresholds: 0, 0.5, 1.0 + # precision: double + ####################################################### + ADD_API_UNIT_TEST(API_DIRECT_QUERY AMS::API::ModelDefine::DeltaUQMean::Double::DB::OnlyPhysics::HDF5::HOST "${CMAKE_CURRENT_BINARY_DIR}/ams_end_to_end 0 8 8 ${TORCH_MODEL_DIR}/double_cpu_duq_mean.pt \"double\" \"deltaUQ (mean)\" 0.0 1 128 \"hdf5\" \"./\"; python3 ${CMAKE_CURRENT_SOURCE_DIR}/verify_ete.py 0 8 8 ${TORCH_MODEL_DIR}/double_cpu_duq_mean.pt \"double\" \"deltaUQ (mean)\" 0.0 1 128 \"hdf5\" \"./\"") + ADD_API_UNIT_TEST(API_DIRECT_QUERY AMS::API::ModelDefine::DeltaUQMean::Double::DB::OnlyModel::HDF5::HOST "${CMAKE_CURRENT_BINARY_DIR}/ams_end_to_end 0 8 8 ${TORCH_MODEL_DIR}/double_cpu_duq_mean.pt \"double\" \"deltaUQ (mean)\" 1.0 1 128 \"hdf5\" \"./\"; python3 ${CMAKE_CURRENT_SOURCE_DIR}/verify_ete.py 0 8 8 ${TORCH_MODEL_DIR}/double_cpu_duq_mean.pt \"double\" \"deltaUQ (mean)\" 1.0 1 128 \"hdf5\" \"./\"") + ADD_API_UNIT_TEST(API_DIRECT_QUERY AMS::API::ModelDefine::DeltaUQMean::Double::DB::HALF::HDF5::HOST "${CMAKE_CURRENT_BINARY_DIR}/ams_end_to_end 0 8 8 ${TORCH_MODEL_DIR}/double_cpu_duq_mean.pt \"double\" \"deltaUQ (mean)\" 0.5 1 128 \"hdf5\" \"./\"; python3 ${CMAKE_CURRENT_SOURCE_DIR}/verify_ete.py 0 8 8 ${TORCH_MODEL_DIR}/double_cpu_duq_mean.pt \"double\" \"deltaUQ (mean)\" 0.5 1 128 \"hdf5\" \"./\"") + + ####################################################### + # TEST: hdf5 output format + # UQ: deltauq-mean + # 3 Thresholds: 0, 0.5, 1.0 + # precision: single + ####################################################### + ADD_API_UNIT_TEST(API_DIRECT_QUERY AMS::API::ModelDefine::DeltaUQMean::Single::DB::OnlyPhysics::HDF5::HOST 
"${CMAKE_CURRENT_BINARY_DIR}/ams_end_to_end 0 8 8 ${TORCH_MODEL_DIR}/single_cpu_duq_mean.pt \"float\" \"deltaUQ (mean)\" 0.0 1 128 \"hdf5\" \"./\"; python3 ${CMAKE_CURRENT_SOURCE_DIR}/verify_ete.py 0 8 8 ${TORCH_MODEL_DIR}/single_cpu_duq_mean.pt \"float\" \"deltaUQ (mean)\" 0.0 1 128 \"hdf5\" \"./\"") + ADD_API_UNIT_TEST(API_DIRECT_QUERY AMS::API::ModelDefine::DeltaUQMean::Single::DB::OnlyModel::HDF5::HOST "${CMAKE_CURRENT_BINARY_DIR}/ams_end_to_end 0 8 8 ${TORCH_MODEL_DIR}/single_cpu_duq_mean.pt \"float\" \"deltaUQ (mean)\" 1.0 1 128 \"hdf5\" \"./\"; python3 ${CMAKE_CURRENT_SOURCE_DIR}/verify_ete.py 0 8 8 ${TORCH_MODEL_DIR}/single_cpu_duq_mean.pt \"float\" \"deltaUQ (mean)\" 1.0 1 128 \"hdf5\" \"./\"") + ADD_API_UNIT_TEST(API_DIRECT_QUERY AMS::API::ModelDefine::DeltaUQMean::Single::DB::HALF::HDF5::HOST "${CMAKE_CURRENT_BINARY_DIR}/ams_end_to_end 0 8 8 ${TORCH_MODEL_DIR}/single_cpu_duq_mean.pt \"float\" \"deltaUQ (mean)\" 0.5 1 128 \"hdf5\" \"./\"; python3 ${CMAKE_CURRENT_SOURCE_DIR}/verify_ete.py 0 8 8 ${TORCH_MODEL_DIR}/single_cpu_duq_mean.pt \"float\" \"deltaUQ (mean)\" 0.5 1 128 \"hdf5\" \"./\"") + + ####################################################### + # TEST: hdf5 output format + # UQ: deltauq-max + # 3 Thresholds: 0, 0.5, 1.0 + # precision: double + ####################################################### + ADD_API_UNIT_TEST(API_DIRECT_QUERY AMS::API::ModelDefine::DeltaUQMax::Double::DB::OnlyPhysics::HDF5::HOST "${CMAKE_CURRENT_BINARY_DIR}/ams_end_to_end 0 8 8 ${TORCH_MODEL_DIR}/double_cpu_duq_max.pt \"double\" \"deltaUQ (max)\" 0.0 1 128 \"hdf5\" \"./\"; python3 ${CMAKE_CURRENT_SOURCE_DIR}/verify_ete.py 0 8 8 ${TORCH_MODEL_DIR}/double_cpu_duq_max.pt \"double\" \"deltaUQ (max)\" 0.0 1 128 \"hdf5\" \"./\"") + ADD_API_UNIT_TEST(API_DIRECT_QUERY AMS::API::ModelDefine::DeltaUQMax::Double::DB::OnlyModel::HDF5::HOST "${CMAKE_CURRENT_BINARY_DIR}/ams_end_to_end 0 8 8 ${TORCH_MODEL_DIR}/double_cpu_duq_max.pt \"double\" \"deltaUQ (max)\" 1.0 1 128 \"hdf5\" \"./\"; 
python3 ${CMAKE_CURRENT_SOURCE_DIR}/verify_ete.py 0 8 8 ${TORCH_MODEL_DIR}/double_cpu_duq_max.pt \"double\" \"deltaUQ (max)\" 1.0 1 128 \"hdf5\" \"./\"") + ADD_API_UNIT_TEST(API_DIRECT_QUERY AMS::API::ModelDefine::DeltaUQMax::Double::DB::HALF::HDF5::HOST "${CMAKE_CURRENT_BINARY_DIR}/ams_end_to_end 0 8 8 ${TORCH_MODEL_DIR}/double_cpu_duq_max.pt \"double\" \"deltaUQ (max)\" 0.5 1 128 \"hdf5\" \"./\"; python3 ${CMAKE_CURRENT_SOURCE_DIR}/verify_ete.py 0 8 8 ${TORCH_MODEL_DIR}/double_cpu_duq_max.pt \"double\" \"deltaUQ (max)\" 0.5 1 128 \"hdf5\" \"./\"") + + ####################################################### + # TEST: hdf5 output format + # UQ: deltauq-max + # 3 Thresholds: 0, 0.5, 1.0 + # precision: single + ####################################################### + ADD_API_UNIT_TEST(API_DIRECT_QUERY AMS::API::ModelDefine::DeltaUQMax::Single::DB::OnlyPhysics::HDF5::HOST "${CMAKE_CURRENT_BINARY_DIR}/ams_end_to_end 0 8 8 ${TORCH_MODEL_DIR}/single_cpu_duq_max.pt \"float\" \"deltaUQ (max)\" 0.0 1 128 \"hdf5\" \"./\"; python3 ${CMAKE_CURRENT_SOURCE_DIR}/verify_ete.py 0 8 8 ${TORCH_MODEL_DIR}/single_cpu_duq_max.pt \"float\" \"deltaUQ (max)\" 0.0 1 128 \"hdf5\" \"./\"") + ADD_API_UNIT_TEST(API_DIRECT_QUERY AMS::API::ModelDefine::DeltaUQMax::Single::DB::OnlyModel::HDF5::HOST "${CMAKE_CURRENT_BINARY_DIR}/ams_end_to_end 0 8 8 ${TORCH_MODEL_DIR}/single_cpu_duq_max.pt \"float\" \"deltaUQ (max)\" 1.0 1 128 \"hdf5\" \"./\"; python3 ${CMAKE_CURRENT_SOURCE_DIR}/verify_ete.py 0 8 8 ${TORCH_MODEL_DIR}/single_cpu_duq_max.pt \"float\" \"deltaUQ (max)\" 1.0 1 128 \"hdf5\" \"./\"") + ADD_API_UNIT_TEST(API_DIRECT_QUERY AMS::API::ModelDefine::DeltaUQMax::Single::DB::HALF::HDF5::HOST "${CMAKE_CURRENT_BINARY_DIR}/ams_end_to_end 0 8 8 ${TORCH_MODEL_DIR}/single_cpu_duq_max.pt \"float\" \"deltaUQ (max)\" 0.5 1 128 \"hdf5\" \"./\"; python3 ${CMAKE_CURRENT_SOURCE_DIR}/verify_ete.py 0 8 8 ${TORCH_MODEL_DIR}/single_cpu_duq_max.pt \"float\" \"deltaUQ (max)\" 0.5 1 128 \"hdf5\" \"./\"") + + + # 
Random model with inout arguments that represent 2D data. + ADD_API_UNIT_TEST(API_DIRECT_QUERY AMS::API::ModelDefine::Inout2D::Random::Double::DB::OnlyPhysics::HDF5::HOST "${CMAKE_CURRENT_BINARY_DIR}/ams_inout_2d 0 8 8 ${TORCH_MODEL_DIR}/linear_scripted_single_cpu_random.pt \"double\" \"random\" 0.0 1 128 \"hdf5\" \"./\"; python3 ${CMAKE_CURRENT_SOURCE_DIR}/verify_ete.py 0 8 8 ${CMAKE_CURRENT_SOURCE_DIR}/linear_scripted_cpu.pt \"double\" \"random\" 0.0 1 128 \"hdf5\" \"./\"") + ADD_API_UNIT_TEST(API_DIRECT_QUERY AMS::API::ModelDefine::Inout2D::Random::Double::DB::OnlyModel::HDF5::HOST "${CMAKE_CURRENT_BINARY_DIR}/ams_inout_2d 0 8 8 ${TORCH_MODEL_DIR}/linear_scripted_single_cpu_random.pt \"double\" \"random\" 1.0 1 128 \"hdf5\" \"./\"; python3 ${CMAKE_CURRENT_SOURCE_DIR}/verify_ete.py 0 8 8 ${CMAKE_CURRENT_SOURCE_DIR}/linear_scripted_cpu.pt \"double\" \"random\" 1.0 1 128 \"hdf5\" \"./\"") + ADD_API_UNIT_TEST(API_DIRECT_QUERY AMS::API::ModelDefine::Inout2D::Random::Double::DB::HALF::HDF5::HOST "${CMAKE_CURRENT_BINARY_DIR}/ams_inout_2d 0 8 8 ${TORCH_MODEL_DIR}/linear_scripted_single_cpu_random.pt \"double\" \"random\" 0.5 1 128 \"hdf5\" \"./\"; python3 ${CMAKE_CURRENT_SOURCE_DIR}/verify_ete.py 0 8 8 ${CMAKE_CURRENT_SOURCE_DIR}/linear_scripted_cpu.pt \"double\" \"random\" 0.5 1 128 \"hdf5\" \"./\"") + +# Random model with inout arguments that represent 2D data. 
+ ADD_API_UNIT_TEST(API_DIRECT_QUERY AMS::API::ModelDefine::Broadcast::Random::Double::DB::OnlyPhysics::HDF5::HOST "${CMAKE_CURRENT_BINARY_DIR}/ams_ete_broadcast 0 8 8 ${TORCH_MODEL_DIR}/linear_scripted_single_cpu_random.pt \"double\" \"random\" 0.0 1 128 \"hdf5\" \"./\"; python3 ${CMAKE_CURRENT_SOURCE_DIR}/verify_ete.py 0 8 8 ${CMAKE_CURRENT_SOURCE_DIR}/linear_scripted_cpu.pt \"double\" \"random\" 0.0 1 128 \"hdf5\" \"./\"") + ADD_API_UNIT_TEST(API_DIRECT_QUERY AMS::API::ModelDefine::Broadcast::Random::Double::DB::OnlyModel::HDF5::HOST "${CMAKE_CURRENT_BINARY_DIR}/ams_ete_broadcast 0 8 8 ${TORCH_MODEL_DIR}/linear_scripted_single_cpu_random.pt \"double\" \"random\" 1.0 1 128 \"hdf5\" \"./\"; python3 ${CMAKE_CURRENT_SOURCE_DIR}/verify_ete.py 0 8 8 ${CMAKE_CURRENT_SOURCE_DIR}/linear_scripted_cpu.pt \"double\" \"random\" 1.0 1 128 \"hdf5\" \"./\"") + ADD_API_UNIT_TEST(API_DIRECT_QUERY AMS::API::ModelDefine::Broadcast::Random::Double::DB::HALF::HDF5::HOST "${CMAKE_CURRENT_BINARY_DIR}/ams_ete_broadcast 0 8 8 ${TORCH_MODEL_DIR}/linear_scripted_single_cpu_random.pt \"double\" \"random\" 0.5 1 128 \"hdf5\" \"./\"; python3 ${CMAKE_CURRENT_SOURCE_DIR}/verify_ete.py 0 8 8 ${CMAKE_CURRENT_SOURCE_DIR}/linear_scripted_cpu.pt \"double\" \"random\" 0.5 1 128 \"hdf5\" \"./\"") + +endfunction() + +function(BUILD_TEST exe source) + add_executable(${exe} ${source}) + target_include_directories(${exe} PRIVATE "${PROJECT_SOURCE_DIR}/src/AMSlib/" "${PROJECT_SOURCE_DIR}/src/AMSlib/include" ${caliper_INCLUDE_DIR} ${MPI_INCLUDE_PATH}) + target_link_libraries(${exe} PRIVATE AMS torch) + if (WITH_HDF5) + target_link_libraries(${exe} PRIVATE ${AMS_HDF5_TARGET}) + endif() + target_compile_definitions(${exe} PRIVATE ${AMS_APP_DEFINES}) +endfunction() + +BUILD_TEST(ams_physics ams_physics.cpp) +BUILD_TEST(ams_end_to_end ams_ete.cpp) +BUILD_TEST(ams_inout_2d ams_ete_2d.cpp) +BUILD_TEST(ams_ete_broadcast ams_ete_broadcast.cpp) +INTEGRATION_TEST() +BUILD_TEST(ams_end_to_end_env ams_ete_env.cpp) +if 
(WITH_HDF5) +INTEGRATION_TEST_ENV() +endif() +if (WITH_RMQ) +INTEGRATION_TEST_RMQ() +endif() +BUILD_TEST(ams_multi_model_end_to_end ams_multi_model_ete.cpp) +#ADD_API_UNIT_TEST(API_DIRECT_QUERY AMS::API::MultiModelDefine::Random::Double::DB::OnlyPhysics::None::HOST "${CMAKE_CURRENT_BINARY_DIR}/ams_multi_model_end_to_end 0 8 8 ${TORCH_MODEL_DIR}/linear_scripted_single_cpu_random.pt \"double\" \"random\" 0.0 1 128 \"none\" \"./\"; python3 ${CMAKE_CURRENT_SOURCE_DIR}/verify_ete.py 0 8 8 ${CMAKE_CURRENT_SOURCE_DIR}/linear_scripted_cpu.pt \"double\" \"random\" 0.0 1 128 \"none\" \"./\"") + + +BUILD_TEST(ams_multi_model_end_to_end_env ams_multi_model_ete_env.cpp) +#ADD_API_UNIT_TEST(APIEnvAPI AMS::ENV::MultiModelDefine::Random10::Random50::Double::DB::${db_type}::HOST "AMS_OBJECTS=${JSON_FP} ${CMAKE_CURRENT_BINARY_DIR}/ams_end_to_end_env 0 8 8 \"double\" 1 128 app_random_10 app_random_50;AMS_OBJECTS=${JSON_FP} python3 ${CMAKE_CURRENT_SOURCE_DIR}/verify_ete.py 0 8 8 \"double\" 128 app_random_10 app_random_50") + + +#BUILD_TEST(ams_rmq ams_rmq_env.cpp) +# diff --git a/tests/AMSlib/ams_interface/ams_ete.cpp b/tests/AMSlib/ams_interface/ams_ete.cpp new file mode 100644 index 00000000..e88e96d8 --- /dev/null +++ b/tests/AMSlib/ams_interface/ams_ete.cpp @@ -0,0 +1,171 @@ +#include +#include + +#include +#include +#include +#include + +#include "../utils.hpp" +#include "AMS.h" + +using namespace ams; + + +template +struct Problem { + int num_inputs; + int num_outputs; + int multiplier; + Problem(int ni, int no) : num_inputs(ni), num_outputs(no), multiplier(100) {} + + void run(long num_elements, DType **inputs, DType **outputs) + { + for (int i = 0; i < num_elements; i++) { + DType sum = 0; + for (int j = 0; j < num_inputs; j++) { + sum += inputs[j][i]; + } + + for (int j = 0; j < num_outputs; j++) { + outputs[j][i] = sum; + } + } + } + + + DType *initialize_inputs(DType *inputs, long length) + { + for (int i = 0; i < length; i++) { + inputs[i] = static_cast(i); + } + return 
inputs; + } + + void ams_run(AMSExecutor &wf, + AMSResourceType resource, + int iterations, + int num_elements) + { + for (int i = 0; i < iterations; i++) { + int elements = num_elements; // * ((DType)(rand()) / RAND_MAX) + 1; + SmallVector input_tensors; + SmallVector output_tensors; + + // Allocate Input memory + for (int j = 0; j < num_inputs; j++) { + DType *data = new DType[elements]; + DType *ptr = initialize_inputs(data, elements); + input_tensors.push_back(AMSTensor::view( + ptr, + SmallVector({num_elements, 1}), + SmallVector({1, 1}), + resource)); + } + + // Allocate Output memory + for (int j = 0; j < num_outputs; j++) { + auto tmp = new DType[elements]; + output_tensors.push_back(AMSTensor::view( + initialize_inputs(tmp, elements), + SmallVector({num_elements, 1}), + SmallVector({1, 1}), + resource)); + } + + EOSLambda OrigComputation = + [&](const ams::SmallVector &ams_ins, + ams::SmallVector &ams_inouts, + ams::SmallVector &ams_outs) { + DType *ins[num_inputs]; + DType *outs[num_outputs]; + if (num_inputs != ams_ins.size()) + throw std::runtime_error( + "Expecting dimensions of inputs to remain the same"); + else if (num_outputs != ams_outs.size()) + throw std::runtime_error( + "Expecting dimensions of outputs to remain the same"); + + // Here I can use domain knowledge (inouts is empty) + int num_elements = ams_ins[0].shape()[0]; + for (int i = 0; i < num_inputs; i++) { + ins[i] = ams_ins[i].data(); + if (ams_ins[i].shape()[0] != num_elements) + throw std::runtime_error( + "Expected tensors to have the same shape"); + } + for (int i = 0; i < num_outputs; i++) { + outs[i] = ams_outs[i].data(); + if (ams_outs[i].shape()[0] != num_elements) + throw std::runtime_error( + "Expected tensors to have the same shape"); + } + run(num_elements, ins, outs); + }; + + ams::SmallVector inouts; + AMSExecute(wf, OrigComputation, input_tensors, inouts, output_tensors); + + for (int i = 0; i < input_tensors.size(); i++) { + delete input_tensors[i].data(); + } + + + for 
(int i = 0; i < output_tensors.size(); i++) { + delete output_tensors[i].data(); + } + } + } +}; + +int main(int argc, char **argv) +{ + if (argc != 12) { + std::cout << "Wrong cli\n"; + std::cout << argv[0] + << " use_device(0|1) num_inputs num_outputs model_path " + "data_type(float|double) uq_policy(random|deltaUQ " + "(mean)|deltaUQ (max)) threshold(0) " + "num_iterations avg_num_values db_type(none|hdf5) " + "db_path(path to existing path to store data)"; + return -1; + } + + AMSInit(); + int use_device = std::atoi(argv[1]); + int num_inputs = std::atoi(argv[2]); + int num_outputs = std::atoi(argv[3]); + char *model_path = argv[4]; + AMSDType data_type = getDataType(argv[5]); + std::string uq_name = std::string(argv[6]); + const AMSUQPolicy uq_policy = UQPolicyFromStr(uq_name); + float threshold = std::atof(argv[7]); + int num_iterations = std::atoi(argv[8]); + int avg_elements = std::atoi(argv[9]); + std::string db_type_str = std::string(argv[10]); + std::string fs_path = std::string(argv[11]); + AMSDBType db_type = getDBType(db_type_str); + AMSResourceType resource = AMSResourceType::AMS_HOST; + srand(time(NULL)); + + AMSConfigureFSDatabase(db_type, fs_path.c_str()); + + assert((uq_policy == AMSUQPolicy::AMS_DELTAUQ_MAX || + uq_policy == AMSUQPolicy::AMS_DELTAUQ_MEAN || + uq_policy == AMSUQPolicy::AMS_RANDOM) && + "Test only supports duq models"); + + AMSCAbstrModel model_descr = AMSRegisterAbstractModel( + "test", uq_policy, threshold, model_path, "test"); + + AMSExecutor wf = AMSCreateExecutor(model_descr, 0, 1); + if (data_type == AMSDType::AMS_SINGLE) { + Problem prob(num_inputs, num_outputs); + prob.ams_run(wf, resource, num_iterations, avg_elements); + } else { + Problem prob(num_inputs, num_outputs); + prob.ams_run(wf, resource, num_iterations, avg_elements); + } + AMSFinalize(); + return 0; +} diff --git a/tests/AMSlib/ams_interface/ams_ete_2d.cpp b/tests/AMSlib/ams_interface/ams_ete_2d.cpp new file mode 100644 index 00000000..351f2de8 --- /dev/null 
+++ b/tests/AMSlib/ams_interface/ams_ete_2d.cpp @@ -0,0 +1,243 @@ +#include +#ifdef __AMS_ENABLE_MPI__ +#include +#endif +#include + +#include +#include +#include +#include +#include +#include + +#include "AMS.h" +#include "ml/surrogate.hpp" +#include "wf/debug.h" + +using namespace ams; + + +AMSDType getDataType(char *d_type) +{ + AMSDType dType = AMSDType::AMS_DOUBLE; + if (std::strcmp(d_type, "float") == 0) { + dType = AMSDType::AMS_SINGLE; + } else if (std::strcmp(d_type, "double") == 0) { + dType = AMSDType::AMS_DOUBLE; + } else { + assert(false && "Unknown data type"); + } + return dType; +} + +template +struct Problem { + int num_inputs; + int num_inouts; + int num_outputs; + int multiplier; + Problem(int ni, int nio, int no) + : num_inputs(ni), num_inouts(nio), num_outputs(no), multiplier(100) + { + } + + void run(long num_elements, + DType *input1, + DType *input2, + DType *inout, + DType *out1, + DType *out2, + int num_inout) + { + for (int i = 0; i < num_elements; i++) { + DType sum = input1[i] + input2[i]; + for (int j = 0; j < num_inout; j++) + sum += inout[i * num_inout + j]; + + out1[i] = sum; + out2[i] = sum; + for (int j = 0; j < num_inout; j++) + inout[i * num_inout + j] = sum; + } + } + + + DType *initialize_inputs(DType *inputs, long length) + { + for (int i = 0; i < length; i++) { + inputs[i] = static_cast(i); + } + return inputs; + } + + DType *initialize_inout(DType *inputs, long length, int elements_per_row) + { + for (int i = 0; i < length; i++) { + for (int j = 0; j < elements_per_row; j++) { + inputs[i * elements_per_row + j] = static_cast(i); + } + } + return inputs; + } + + + void ams_run(AMSExecutor &wf, + AMSResourceType resource, + int iterations, + int num_elements) + { + + for (int i = 0; i < iterations; i++) { + int elements = num_elements; // * ((DType)(rand()) / RAND_MAX) + 1; + SmallVector input_tensors; + SmallVector inout_tensors; + SmallVector output_tensors; + + // Allocate Input memory + for (int j = 0; j < num_inputs; 
j++) { + DType *data = new DType[elements]; + input_tensors.push_back(AMSTensor::view( + initialize_inputs(data, elements), + SmallVector({num_elements, 1}), + SmallVector({1, 1}), + resource)); + } + + DType *inout_data = new DType[elements * num_inouts]; + inout_tensors.push_back(AMSTensor::view( + initialize_inout(inout_data, elements, num_inouts), + SmallVector({num_elements, num_inouts}), + SmallVector({num_inouts, 1}), + resource)); + + // Allocate Output memory + for (int j = 0; j < num_outputs; j++) { + auto tmp = new DType[elements]; + output_tensors.push_back(AMSTensor::view( + initialize_inputs(tmp, elements), + SmallVector({num_elements, 1}), + SmallVector({1, 1}), + resource)); + } + + EOSLambda OrigComputation = [&](const ams::SmallVector + &ams_ins, + ams::SmallVector + &ams_inouts, + ams::SmallVector + &ams_outs) { + DType *ins[num_inputs]; + DType *outs[num_outputs]; + DType *inout; + + if (ams_inouts.size() != 1) { + throw std::runtime_error("Expecting a single inout tensor"); + } + + if (ams_inouts[0].shape()[1] != num_inouts) + throw std::runtime_error("Inout shape should be 'num_inout'"); + + inout = ams_inouts[0].data(); + int num_elements = ams_inouts[0].shape()[0]; + for (int i = 0; i < num_inputs; i++) { + ins[i] = ams_ins[i].data(); + if (ams_ins[i].shape()[0] != num_elements) + throw std::runtime_error("Expected tensors to have the same shape"); + } + for (int i = 0; i < num_outputs; i++) { + outs[i] = ams_outs[i].data(); + if (ams_outs[i].shape()[0] != num_elements) + throw std::runtime_error("Expected tensors to have the same shape"); + } + run(num_elements, + ins[0], + ins[1], + inout, + outs[0], + outs[1], + // I have access to inouts, because we captured everything by reference. 
+ num_inouts); + }; + + AMSExecute( + wf, OrigComputation, input_tensors, inout_tensors, output_tensors); + + for (int i = 0; i < input_tensors.size(); i++) { + delete input_tensors[i].data(); + } + + inout_tensors.clear(); + delete[] inout_data; + + + for (int i = 0; i < output_tensors.size(); i++) { + delete output_tensors[i].data(); + } + } + } +}; + +int main(int argc, char **argv) +{ + AMSInit(); + if (argc != 12) { + std::cout << "Wrong cli\n"; + std::cout << argv[0] + << " use_device(0|1) num_inputs num_outputs model_path " + "data_type(float|double) uq_policy(random|deltaUQ " + "(mean)|deltaUQ (max)) threshold(0) " + "num_iterations avg_num_values db_type(none|hdf5) " + "db_path(path to existing path to store data)"; + return -1; + } + + + int use_device = std::atoi(argv[1]); + int num_inputs = std::atoi(argv[2]); + int num_outputs = std::atoi(argv[3]); + char *model_path = argv[4]; + AMSDType data_type = getDataType(argv[5]); + std::string uq_name = std::string(argv[6]); + const AMSUQPolicy uq_policy = UQ::UQPolicyFromStr(uq_name); + float threshold = std::atof(argv[7]); + int num_iterations = std::atoi(argv[8]); + int avg_elements = std::atoi(argv[9]); + std::string db_type_str = std::string(argv[10]); + std::string fs_path = std::string(argv[11]); + AMSDBType db_type = ams::db::getDBType(db_type_str); + AMSResourceType resource = AMSResourceType::AMS_HOST; + srand(time(NULL)); + + int num_inouts = 6; + assert(num_inputs == 8 && "Num Inputs should always be 8"); + assert(num_outputs == 8 && "Num outputs should always be 8"); + + AMSConfigureFSDatabase(db_type, fs_path.c_str()); + + assert((uq_policy == AMSUQPolicy::AMS_DELTAUQ_MAX || + uq_policy == AMSUQPolicy::AMS_DELTAUQ_MEAN || + uq_policy == AMSUQPolicy::AMS_RANDOM) && + "Test only supports duq models"); + + AMSCAbstrModel model_descr = AMSRegisterAbstractModel( + "test", uq_policy, threshold, model_path, "test"); + + AMSExecutor wf = AMSCreateExecutor(model_descr, 0, 1); + if (data_type == 
AMSDType::AMS_SINGLE) { + Problem prob(num_inputs - num_inouts, + num_inouts, + num_outputs - num_inouts); + + + prob.ams_run(wf, resource, num_iterations, avg_elements); + } else { + Problem prob(num_inputs - num_inouts, + num_inouts, + num_outputs - num_inouts); + prob.ams_run(wf, resource, num_iterations, avg_elements); + } + + AMSFinalize(); + return 0; +} diff --git a/tests/AMSlib/ams_interface/ams_ete_broadcast.cpp b/tests/AMSlib/ams_interface/ams_ete_broadcast.cpp new file mode 100644 index 00000000..b3c64fb3 --- /dev/null +++ b/tests/AMSlib/ams_interface/ams_ete_broadcast.cpp @@ -0,0 +1,199 @@ +#include + +#include +#include +#include +#include +#include +#include + +#include "AMS.h" +#include "ml/surrogate.hpp" +#include "wf/debug.h" + +using namespace ams; + +AMSDType getDataType(char *d_type) +{ + AMSDType dType = AMSDType::AMS_DOUBLE; + if (std::strcmp(d_type, "float") == 0) { + dType = AMSDType::AMS_SINGLE; + } else if (std::strcmp(d_type, "double") == 0) { + dType = AMSDType::AMS_DOUBLE; + } else { + assert(false && "Unknown data type"); + } + return dType; +} + +template +struct Problem { + int num_inputs; + int num_outputs; + int multiplier; + Problem(int ni, int no) : num_inputs(ni), num_outputs(no), multiplier(100) {} + + void run(long num_elements, DType **inputs, DType **outputs, DType constant) + { + for (int i = 0; i < num_elements; i++) { + DType sum = constant; + for (int j = 0; j < num_inputs - 1; j++) { + sum += inputs[j][i]; + } + + for (int j = 0; j < num_outputs; j++) { + outputs[j][i] = sum; + } + } + } + + + DType *initialize_inputs(DType *inputs, long length) + { + for (int i = 0; i < length; i++) { + inputs[i] = static_cast(i); + } + return inputs; + } + + void ams_run(AMSExecutor &wf, + AMSResourceType resource, + int iterations, + int num_elements) + { + DType value = 1.0; + for (int i = 0; i < iterations; i++) { + int elements = num_elements; // * ((DType)(rand()) / RAND_MAX) + 1; + SmallVector input_tensors; + SmallVector 
output_tensors; + + // Allocate Input memory + for (int j = 0; j < num_inputs - 1; j++) { + DType *data = new DType[elements]; + input_tensors.push_back(AMSTensor::view( + initialize_inputs(data, elements), + SmallVector({num_elements, 1}), + SmallVector({1, 1}), + resource)); + } + value = num_inputs - 1; + input_tensors.push_back(AMSTensor::view( + &value, + SmallVector({num_elements, 1}), + SmallVector({0, 0}), + resource)); + + + // Allocate Output memory + for (int j = 0; j < num_outputs; j++) { + auto tmp = new DType[elements]; + output_tensors.push_back(AMSTensor::view( + initialize_inputs(tmp, elements), + SmallVector({num_elements, 1}), + SmallVector({1, 1}), + resource)); + } + + EOSLambda OrigComputation = + [&](const ams::SmallVector &ams_ins, + ams::SmallVector &ams_inouts, + ams::SmallVector &ams_outs) { + DType *ins[num_inputs - 1]; + DType *outs[num_outputs]; + if (num_inputs != ams_ins.size()) + throw std::runtime_error( + "Expecting dimensions of inputs to remain the same"); + else if (num_outputs != ams_outs.size()) + throw std::runtime_error( + "Expecting dimensions of outputs to remain the same"); + + // Here I can use domain knowledge (inouts is empty) + int num_elements = ams_ins[0].shape()[0]; + for (int i = 0; i < num_inputs - 1; i++) { + ins[i] = ams_ins[i].data(); + if (ams_ins[i].shape()[0] != num_elements) + throw std::runtime_error( + "Expected tensors to have the same shape"); + } + for (int i = 0; i < num_outputs; i++) { + outs[i] = ams_outs[i].data(); + if (ams_outs[i].shape()[0] != num_elements) + throw std::runtime_error( + "Expected tensors to have the same shape"); + } + run(num_elements, + ins, + outs, + *ams_ins[num_inputs - 1].data()); + }; + + ams::SmallVector inouts; + AMSExecute(wf, OrigComputation, input_tensors, inouts, output_tensors); + + for (int i = 0; i < input_tensors.size() - 1; i++) { + delete input_tensors[i].data(); + } + + + for (int i = 0; i < output_tensors.size(); i++) { + delete output_tensors[i].data(); + 
} + } + } +}; + +int main(int argc, char **argv) +{ + if (argc != 12) { + std::cout << "Wrong cli\n"; + std::cout << argv[0] + << " use_device(0|1) num_inputs num_outputs model_path " + "data_type(float|double) uq_policy(random|deltaUQ " + "(mean)|deltaUQ (max)) threshold(0) " + "num_iterations avg_num_values db_type(none|hdf5) " + "db_path(path to existing path to store data)"; + return -1; + } + AMSInit(); + + + int use_device = std::atoi(argv[1]); + int num_inputs = std::atoi(argv[2]); + int num_outputs = std::atoi(argv[3]); + char *model_path = argv[4]; + AMSDType data_type = getDataType(argv[5]); + std::string uq_name = std::string(argv[6]); + const AMSUQPolicy uq_policy = UQ::UQPolicyFromStr(uq_name); + float threshold = std::atof(argv[7]); + int num_iterations = std::atoi(argv[8]); + int avg_elements = std::atoi(argv[9]); + std::string db_type_str = std::string(argv[10]); + std::string fs_path = std::string(argv[11]); + AMSDBType db_type = ams::db::getDBType(db_type_str); + AMSResourceType resource = AMSResourceType::AMS_HOST; + srand(time(NULL)); + + AMSConfigureFSDatabase(db_type, fs_path.c_str()); + + assert((uq_policy == AMSUQPolicy::AMS_DELTAUQ_MAX || + uq_policy == AMSUQPolicy::AMS_DELTAUQ_MEAN || + uq_policy == AMSUQPolicy::AMS_RANDOM) && + "Test only supports duq models"); + + AMSCAbstrModel model_descr = AMSRegisterAbstractModel( + "test", uq_policy, threshold, model_path, "test"); + + AMSExecutor wf = AMSCreateExecutor(model_descr, 0, 1); + if (data_type == AMSDType::AMS_SINGLE) { + Problem prob(num_inputs, num_outputs); + + + prob.ams_run(wf, resource, num_iterations, avg_elements); + } else { + Problem prob(num_inputs, num_outputs); + prob.ams_run(wf, resource, num_iterations, avg_elements); + } + + AMSFinalize(); + return 0; +} diff --git a/tests/AMSlib/ams_ete_env.cpp b/tests/AMSlib/ams_interface/ams_ete_env.cpp similarity index 57% rename from tests/AMSlib/ams_ete_env.cpp rename to tests/AMSlib/ams_interface/ams_ete_env.cpp index 
b1331a78..4da14c06 100644 --- a/tests/AMSlib/ams_ete_env.cpp +++ b/tests/AMSlib/ams_interface/ams_ete_env.cpp @@ -1,30 +1,12 @@ -#ifdef __AMS_ENABLE_MPI__ -#include -#endif #include #include #include #include -#include -#include -#include -#include -#include -#include #include "AMS.h" -#include "wf/debug.h" - -#include "../utils.hpp" - -void createUmpirePool(std::string parent_name, std::string pool_name) -{ - auto &rm = umpire::ResourceManager::getInstance(); - auto alloc_resource = rm.makeAllocator( - pool_name, rm.getAllocator(parent_name)); -} +using namespace ams; AMSDType getDataType(char *d_type) { @@ -61,7 +43,7 @@ struct Problem { } - const DType *initialize_inputs(DType *inputs, long length) + DType *initialize_inputs(DType *inputs, long length) { for (int i = 0; i < length; i++) { inputs[i] = static_cast(i); @@ -74,41 +56,71 @@ struct Problem { int iterations, int num_elements) { - auto &rm = umpire::ResourceManager::getInstance(); - for (int i = 0; i < iterations; i++) { int elements = num_elements; // * ((DType)(rand()) / RAND_MAX) + 1; - std::vector inputs; - std::vector outputs; + SmallVector input_tensors; + SmallVector output_tensors; // Allocate Input memory for (int j = 0; j < num_inputs; j++) { DType *data = new DType[elements]; - inputs.push_back(initialize_inputs(data, elements)); + input_tensors.push_back(AMSTensor::view( + initialize_inputs(data, elements), + SmallVector({num_elements, 1}), + SmallVector({1, 1}), + resource)); } // Allocate Output memory for (int j = 0; j < num_outputs; j++) { - outputs.push_back(new DType[elements]); + auto tmp = new DType[elements]; + output_tensors.push_back(AMSTensor::view( + initialize_inputs(tmp, elements), + SmallVector({num_elements, 1}), + SmallVector({1, 1}), + resource)); } - AMSExecute(wf, - (void *)this, - elements, - reinterpret_cast(inputs.data()), - reinterpret_cast(outputs.data()), - inputs.size(), - outputs.size()); - - for (int i = 0; i < num_outputs; i++) { - delete[] outputs[i]; - 
outputs[i] = nullptr; + EOSLambda OrigComputation = + [&](const ams::SmallVector &ams_ins, + ams::SmallVector &ams_inouts, + ams::SmallVector &ams_outs) { + DType *ins[num_inputs]; + DType *outs[num_outputs]; + if (num_inputs != ams_ins.size()) + throw std::runtime_error( + "Expecting dimensions of inputs to remain the same"); + else if (num_outputs != ams_outs.size()) + throw std::runtime_error( + "Expecting dimensions of outputs to remain the same"); + + // Here I can use domain knowledge (inouts is empty) + int num_elements = ams_ins[0].shape()[0]; + for (int i = 0; i < num_inputs; i++) { + ins[i] = ams_ins[i].data(); + if (ams_ins[i].shape()[0] != num_elements) + throw std::runtime_error( + "Expected tensors to have the same shape"); + } + for (int i = 0; i < num_outputs; i++) { + outs[i] = ams_outs[i].data(); + if (ams_outs[i].shape()[0] != num_elements) + throw std::runtime_error( + "Expected tensors to have the same shape"); + } + run(num_elements, ins, outs); + }; + + ams::SmallVector inouts; + AMSExecute(wf, OrigComputation, input_tensors, inouts, output_tensors); + + for (int i = 0; i < input_tensors.size(); i++) { + delete input_tensors[i].data(); } - for (int i = 0; i < num_inputs; i++) { - delete[] inputs[i]; - inputs[i] = nullptr; + for (int i = 0; i < output_tensors.size(); i++) { + delete output_tensors[i].data(); } } } @@ -135,6 +147,7 @@ void callBackSingle(void *cls, long elements, void **inputs, void **outputs) int main(int argc, char **argv) { + AMSInit(); if (argc != 9) { std::cout << "Wrong cli\n"; std::cout << argv[0] @@ -144,8 +157,6 @@ int main(int argc, char **argv) return -1; } - installSignals(); - AMSInit(); int use_device = std::atoi(argv[1]); int num_inputs = std::atoi(argv[2]); @@ -158,31 +169,15 @@ int main(int argc, char **argv) AMSResourceType resource = AMSResourceType::AMS_HOST; srand(time(NULL)); - - createUmpirePool("HOST", "TEST_HOST"); - AMSSetAllocator(AMSResourceType::AMS_HOST, "TEST_HOST"); - AMSCAbstrModel models[] = 
{AMSQueryModel(model1), AMSQueryModel(model2)}; for (int i = 0; i < 2; i++) { + AMSExecutor wf = AMSCreateExecutor(models[i], 0, 1); if (data_type == AMSDType::AMS_SINGLE) { Problem prob(num_inputs, num_outputs); - AMSExecutor wf = AMSCreateExecutor(models[i], - AMSDType::AMS_SINGLE, - resource, - (AMSPhysicFn)callBackSingle, - 0, - 1); - prob.ams_run(wf, resource, num_iterations, avg_elements); } else { Problem prob(num_inputs, num_outputs); - AMSExecutor wf = AMSCreateExecutor(models[i], - AMSDType::AMS_DOUBLE, - resource, - (AMSPhysicFn)callBackDouble, - 0, - 1); prob.ams_run(wf, resource, num_iterations, avg_elements); } } diff --git a/tests/AMSlib/ams_interface/ams_multi_model_ete.cpp b/tests/AMSlib/ams_interface/ams_multi_model_ete.cpp new file mode 100644 index 00000000..25ee21d2 --- /dev/null +++ b/tests/AMSlib/ams_interface/ams_multi_model_ete.cpp @@ -0,0 +1,204 @@ +#include +#ifdef __AMS_ENABLE_MPI__ +#include +#endif +#include + +#include +#include +#include +#include +#include +#include + +#include "AMS.h" +#include "ml/surrogate.hpp" + +using namespace ams; + +AMSDType getDataType(char *d_type) +{ + AMSDType dType = AMSDType::AMS_DOUBLE; + if (std::strcmp(d_type, "float") == 0) { + dType = AMSDType::AMS_SINGLE; + } else if (std::strcmp(d_type, "double") == 0) { + dType = AMSDType::AMS_DOUBLE; + } else { + assert(false && "Unknown data type"); + } + return dType; +} + +template +struct Problem { + int num_inputs; + int num_outputs; + int multiplier; + Problem(int ni, int no) : num_inputs(ni), num_outputs(no), multiplier(100) {} + + void run(long num_elements, DType **inputs, DType **outputs, DType scalar) + { + for (int i = 0; i < num_elements; i++) { + DType sum = 0; + for (int j = 0; j < num_inputs; j++) { + sum += inputs[j][i]; + } + + for (int j = 0; j < num_outputs; j++) { + outputs[j][i] = sum + scalar; + } + } + } + + + DType *initialize_inputs(DType *inputs, long length) + { + for (int i = 0; i < length; i++) { + inputs[i] = static_cast(i); + } 
+ return inputs; + } + + void ams_run(AMSExecutor &wf, + AMSResourceType resource, + int iterations, + int num_elements, + int scalar) + { + for (int i = 0; i < iterations; i++) { + int elements = num_elements; // * ((DType)(rand()) / RAND_MAX) + 1; + SmallVector input_tensors; + SmallVector output_tensors; + + // Allocate Input memory + for (int j = 0; j < num_inputs; j++) { + DType *data = new DType[elements]; + DType *ptr = initialize_inputs(data, elements); + input_tensors.push_back(AMSTensor::view( + ptr, + SmallVector({num_elements, 1}), + SmallVector({1, 1}), + resource)); + } + + // Allocate Output memory + for (int j = 0; j < num_outputs; j++) { + auto tmp = new DType[elements]; + output_tensors.push_back(AMSTensor::view( + initialize_inputs(tmp, elements), + SmallVector({num_elements, 1}), + SmallVector({1, 1}), + resource)); + } + + EOSLambda OrigComputation = + [&](const ams::SmallVector &ams_ins, + ams::SmallVector &ams_inouts, + ams::SmallVector &ams_outs) { + DType *ins[num_inputs]; + DType *outs[num_outputs]; + if (num_inputs != ams_ins.size()) + throw std::runtime_error( + "Expecting dimensions of inputs to remain the same"); + else if (num_outputs != ams_outs.size()) + throw std::runtime_error( + "Expecting dimensions of outputs to remain the same"); + + // Here I can use domain knowledge (inouts is empty) + int num_elements = ams_ins[0].shape()[0]; + for (int i = 0; i < num_inputs; i++) { + ins[i] = ams_ins[i].data(); + if (ams_ins[i].shape()[0] != num_elements) + throw std::runtime_error( + "Expected tensors to have the same shape"); + } + for (int i = 0; i < num_outputs; i++) { + outs[i] = ams_outs[i].data(); + if (ams_outs[i].shape()[0] != num_elements) + throw std::runtime_error( + "Expected tensors to have the same shape"); + } + run(num_elements, ins, outs, scalar); + }; + + ams::SmallVector inouts; + AMSExecute(wf, OrigComputation, input_tensors, inouts, output_tensors); + + for (int i = 0; i < input_tensors.size(); i++) { + delete 
input_tensors[i].data(); + } + + + for (int i = 0; i < output_tensors.size(); i++) { + delete output_tensors[i].data(); + } + } + } +}; + +int main(int argc, char **argv) +{ + if (argc != 12) { + std::cout << "Wrong cli\n"; + std::cout << argv[0] + << " use_device(0|1) num_inputs num_outputs model_path " + "data_type(float|double) uq_policy(random|deltaUQ " + "(mean)|deltaUQ (max)) threshold(0) " + "num_iterations avg_num_values db_type(none|hdf5) " + "db_path(path to existing path to store data)"; + return -1; + } + + AMSInit(); + + int use_device = std::atoi(argv[1]); + int num_inputs = std::atoi(argv[2]); + int num_outputs = std::atoi(argv[3]); + char *model_path = argv[4]; + AMSDType data_type = getDataType(argv[5]); + std::string uq_name = std::string(argv[6]); + const AMSUQPolicy uq_policy = UQ::UQPolicyFromStr(uq_name); + float threshold = std::atof(argv[7]); + int num_iterations = std::atoi(argv[8]); + int avg_elements = std::atoi(argv[9]); + std::string db_type_str = std::string(argv[10]); + std::string fs_path = std::string(argv[11]); + AMSDBType db_type = ams::db::getDBType(db_type_str); + AMSResourceType resource = AMSResourceType::AMS_HOST; + srand(time(NULL)); + + AMSConfigureFSDatabase(db_type, fs_path.c_str()); + + assert((uq_policy == AMSUQPolicy::AMS_DELTAUQ_MAX || + uq_policy == AMSUQPolicy::AMS_DELTAUQ_MEAN || + uq_policy == AMSUQPolicy::AMS_RANDOM) && + "Test only supports duq models"); + + AMSCAbstrModel model_descr = AMSRegisterAbstractModel( + "test_1", uq_policy, threshold, nullptr, "test_1"); + + AMSCAbstrModel model_descr1 = AMSRegisterAbstractModel( + "test_2", uq_policy, threshold, nullptr, "test_2"); + + std::cout << "Running with " << num_iterations << "\n"; + AMSExecutor wf1 = AMSCreateExecutor(model_descr, 0, 1); + AMSExecutor wf2 = AMSCreateExecutor(model_descr1, 0, 1); + for (int i = 0; i < 10; i++) { + if (data_type == AMSDType::AMS_SINGLE) { + Problem prob1(num_inputs, num_outputs); + Problem prob2(num_inputs + 1, num_outputs + 
1); + + + prob1.ams_run(wf1, resource, num_iterations, avg_elements, 0); + prob2.ams_run(wf2, resource, num_iterations, avg_elements, 1); + } else { + Problem prob1(num_inputs, num_outputs); + Problem prob2(num_inputs + 1, num_outputs + 1); + prob2.ams_run(wf2, resource, num_iterations, avg_elements, 1); + prob1.ams_run(wf1, resource, num_iterations, avg_elements, 0); + } + } + + AMSFinalize(); + return 0; +} diff --git a/tests/AMSlib/ams_interface/ams_multi_model_ete_env.cpp b/tests/AMSlib/ams_interface/ams_multi_model_ete_env.cpp new file mode 100644 index 00000000..18921472 --- /dev/null +++ b/tests/AMSlib/ams_interface/ams_multi_model_ete_env.cpp @@ -0,0 +1,208 @@ +#include +#ifdef __AMS_ENABLE_MPI__ +#include +#endif +#include + +#include +#include +#include +#include +#include +#include + +#include "AMS.h" +#include "ml/surrogate.hpp" + +using namespace ams; + +AMSDType getDataType(char *d_type) +{ + AMSDType dType = AMSDType::AMS_DOUBLE; + if (std::strcmp(d_type, "float") == 0) { + dType = AMSDType::AMS_SINGLE; + } else if (std::strcmp(d_type, "double") == 0) { + dType = AMSDType::AMS_DOUBLE; + } else { + assert(false && "Unknown data type"); + } + return dType; +} + +template +void OrigComputation(void *cls, + const ams::SmallVector &ams_ins, + ams::SmallVector &ams_inouts, + ams::SmallVector &ams_outs); + +template +struct Problem { + int num_inputs; + int num_outputs; + int multiplier; + int scalar; + Problem(int ni, int no) : num_inputs(ni), num_outputs(no), multiplier(100) {} + + void run(long num_elements, DType **inputs, DType **outputs, DType scalar) + { + std::cout << "In run " << num_inputs << " " << num_outputs << "\n"; + for (int i = 0; i < num_elements; i++) { + DType sum = 0; + for (int j = 0; j < num_inputs; j++) { + sum += inputs[j][i]; + } + + for (int j = 0; j < num_outputs; j++) { + outputs[j][i] = sum + scalar; + } + } + } + + + DType *initialize_inputs(DType *inputs, long length) + { + for (int i = 0; i < length; i++) { + inputs[i] = 
static_cast(i); + } + return inputs; + } + + void ams_run(AMSExecutor &wf, + AMSResourceType resource, + int iterations, + int num_elements, + int scalar) + { + this->scalar = scalar; + for (int i = 0; i < iterations; i++) { + int elements = num_elements; // * ((DType)(rand()) / RAND_MAX) + 1; + SmallVector input_tensors; + SmallVector output_tensors; + std::cout << "Num Inputs are " << num_inputs << " num outputs are " + << num_outputs << "\n"; + // Allocate Input memory + for (int j = 0; j < num_inputs; j++) { + DType *data = new DType[elements]; + DType *ptr = initialize_inputs(data, elements); + std::cout << "Input_" << j << " is " << std::hex << ptr << std::dec + << "\n"; + input_tensors.push_back(AMSTensor::view( + ptr, + SmallVector({num_elements, 1}), + SmallVector({1, 1}), + resource)); + } + + // Allocate Output memory + for (int j = 0; j < num_outputs; j++) { + auto tmp = new DType[elements]; + std::cout << "output " << j << " is " << std::hex << tmp << std::dec + << "\n"; + output_tensors.push_back(AMSTensor::view( + initialize_inputs(tmp, elements), + SmallVector({num_elements, 1}), + SmallVector({1, 1}), + resource)); + } + + ams::SmallVector inouts; + AMSCExecute(wf, + OrigComputation, + (void *)this, + input_tensors, + inouts, + output_tensors); + + for (int i = 0; i < input_tensors.size(); i++) { + delete input_tensors[i].data(); + } + + + for (int i = 0; i < output_tensors.size(); i++) { + delete output_tensors[i].data(); + } + } + } +}; + +template +void OrigComputation(void *cls, + const ams::SmallVector &ams_ins, + ams::SmallVector &ams_inouts, + ams::SmallVector &ams_outs) +{ + std::cout << "Num Inputs are " << ams_ins.size() + ams_inouts.size() << "\n"; + std::cout << "Num Ouputs are " << ams_outs.size() + ams_inouts.size() << "\n"; + DType *ins[ams_ins.size() + ams_inouts.size()]; + DType *outs[ams_outs.size() + ams_inouts.size()]; + Problem *Prob = (Problem *)cls; + + + // Here I can use domain knowledge (inouts is empty) + int num_elements 
= ams_ins[0].shape()[0]; + for (int i = 0; i < ams_ins.size(); i++) { + ins[i] = ams_ins[i].data(); + std::cout << "Input_" << i << " is " << std::hex << ins[i] << std::dec + << "\n"; + if (ams_ins[i].shape()[0] != num_elements) + throw std::runtime_error("Expected tensors to have the same shape"); + } + for (int i = 0; i < ams_outs.size(); i++) { + outs[i] = ams_outs[i].data(); + std::cout << "Output_" << i << " is " << std::hex << outs[i] << std::dec + << "\n"; + if (ams_outs[i].shape()[0] != num_elements) + throw std::runtime_error("Expected tensors to have the same shape"); + } + Prob->run(num_elements, ins, outs, Prob->scalar); +}; + + +int main(int argc, char **argv) +{ + + if (argc != 9) { + std::cout << "Wrong cli\n"; + std::cout << argv[0] + << " use_device(0|1) num_inputs num_outputs " + "data_type(float|double)" + "num_iterations avg_num_values 'model-name-1' 'model-name-2'"; + return -1; + } + + AMSInit(); + + int use_device = std::atoi(argv[1]); + int num_inputs = std::atoi(argv[2]); + int num_outputs = std::atoi(argv[3]); + AMSDType data_type = getDataType(argv[4]); + int num_iterations = std::atoi(argv[5]); + int avg_elements = std::atoi(argv[6]); + const char *model1 = argv[7]; + const char *model2 = argv[8]; + AMSResourceType resource = AMSResourceType::AMS_HOST; + srand(time(NULL)); + + AMSCAbstrModel model_descr = AMSQueryModel(model1); + AMSCAbstrModel model_descr1 = AMSQueryModel(model2); + + std::cout << "Running with " << num_iterations << "\n"; + AMSExecutor wf1 = AMSCreateExecutor(model_descr, 0, 1); + AMSExecutor wf2 = AMSCreateExecutor(model_descr1, 0, 1); + for (int i = 0; i < 10; i++) { + if (data_type == AMSDType::AMS_SINGLE) { + Problem prob1(num_inputs, num_outputs); + Problem prob2(num_inputs + 1, num_outputs + 1); + prob1.ams_run(wf1, resource, num_iterations, avg_elements, 0); + prob2.ams_run(wf2, resource, num_iterations, avg_elements, 1); + } else { + Problem prob1(num_inputs, num_outputs); + Problem prob2(num_inputs + 1, 
num_outputs + 1); + prob2.ams_run(wf2, resource, num_iterations, avg_elements, 1); + prob1.ams_run(wf1, resource, num_iterations, avg_elements, 0); + } + } + + AMSFinalize(); + return 0; +} diff --git a/tests/AMSlib/ams_interface/ams_physics.cpp b/tests/AMSlib/ams_interface/ams_physics.cpp new file mode 100644 index 00000000..d9aea739 --- /dev/null +++ b/tests/AMSlib/ams_interface/ams_physics.cpp @@ -0,0 +1,187 @@ +#include +#include +#include +#include + +#include "AMS.h" + +using real_t = double; +using namespace ams; + +void eval(real_t *density, + real_t *e_mass, + real_t *qc, + real_t deltaTime, + real_t **mat, + int NumComps, + int NumZones) +{ + // Density is a 0->vector. + real_t *Dense = density; + real_t *eMass = e_mass; + real_t *QC = qc; + + for (int j = 0; j < NumZones; j++) { + real_t A = Dense[j]; // Reactant A + for (int i = 0; i < NumComps; i++) { + real_t k = mat[j][i]; // Reaction rate constant + real_t reaction_rate = k * A * deltaTime; + Dense[j] -= reaction_rate; + eMass[j] = reaction_rate * k; + QC[j] += reaction_rate; + } + } +} + +void eval_ams(AMSExecutor &wf, + real_t *density, + real_t *e_mass, + real_t *qc, + real_t deltaTime, + real_t **mat, + int NumComps, + int NumZones) +{ + // Density is a 0->vector. + SmallVector input_tensors; + SmallVector inout_tensors; + SmallVector output_tensors; + // Density is inout. + inout_tensors.push_back( + AMSTensor::view(density, + SmallVector({NumZones, 1}), + SmallVector({1, 1}), + AMSResourceType::AMS_HOST)); + // QC is inout + inout_tensors.push_back( + AMSTensor::view(qc, + SmallVector({NumZones, 1}), + SmallVector({1, 1}), + AMSResourceType::AMS_HOST)); + + input_tensors.push_back(AMSTensor::view( + &mat[0][0], + SmallVector({NumZones, NumComps}), + SmallVector({NumComps, 1}), + AMSResourceType::AMS_HOST)); + + // deltaTime is a scalar input, I BROADCAST it now with 0 strides. 
+ input_tensors.push_back( + AMSTensor::view(&deltaTime, + SmallVector({NumZones, 1}), + SmallVector({0, 0}), + AMSResourceType::AMS_HOST)); + + // e_mass is just an output + output_tensors.push_back( + AMSTensor::view(e_mass, + SmallVector({NumZones, 1}), + SmallVector({1, 1}), + AMSResourceType::AMS_HOST)); + + + EOSLambda OrigComputation = [&](const ams::SmallVector + &ams_ins, + ams::SmallVector &ams_inouts, + ams::SmallVector &ams_outs) { + int prunedZones = ams_ins[0].shape()[0]; + std::cout << "Pruned are " << prunedZones << "\n"; + real_t *pruned_mat[prunedZones]; + // The 2D data of materials are unnder a c_vector. + real_t *c_mats = ams_ins[0].data(); + // We need this as eval requires a c like 2D vector + for (int i = 0; i < prunedZones; i++) { + pruned_mat[i] = &c_mats[i * ams_ins[0].shape()[1]]; + } + eval(ams_inouts[0].data(), // density was the first entry in inout + ams_outs[0].data(), + ams_inouts[1].data(), // qc was the second entry in inout + *ams_ins[1].data(), + pruned_mat, + NumComps, + prunedZones); + }; + // After I call this, I expect the database to have the following order: + // input_Data: **input_tensors, **inout_tensors + // input_Data: **output_tensors, **inout_tensors + // In this example the database will have the following: + // Input: |Mat_0|Mat_1|dt|density|qc| Output : |e_mass|density|qc| + AMSExecute(wf, OrigComputation, input_tensors, inout_tensors, output_tensors); +} + +void initializeRandom(real_t *data, + size_t NumElements, + real_t minVal = 0.0, + real_t maxVal = 1.0) +{ + std::random_device rd; + std::mt19937 gen(0); + std::uniform_real_distribution dist(minVal, maxVal); + for (size_t i = 0; i < NumElements; i++) { + data[i] = dist(gen); + } +} + + +int main(int argc, char *argv[]) +{ + AMSInit(); + int numZones = std::atoi(argv[1]); + int numComps = std::atoi(argv[2]); + real_t *actualDensity = new real_t[numZones]; + initializeRandom(actualDensity, numZones); + real_t *eMass = new real_t[numZones]; + 
initializeRandom(eMass, numZones); + real_t *qc = new real_t[numZones]; + initializeRandom(qc, numZones); + real_t dt = 1.0; + ams::AMSConfigureFSDatabase(ams::AMSDBType::AMS_HDF5, "./"); + ams::AMSCAbstrModel model_descr = AMSRegisterAbstractModel( + "test", ams::AMSUQPolicy::AMS_RANDOM, 0.0, nullptr, "test"); + ams::AMSExecutor wf = ams::AMSCreateExecutor(model_descr, 0, 1); + + // Here I am uncertain if materials are NumComps or NumZones. + // NOTE: Materials may or may not be contineous on the outer dimension. + // We take a worst case scenario here, in which data are non contineous. + real_t *materials[numZones]; + real_t *tmpData = new real_t[numZones * numComps]; + for (int i = 0; i < numZones; i++) { + materials[i] = &tmpData[i * numComps]; + initializeRandom(materials[i], numComps); + } + +#if 0 + // THIS WE DO NOT SUPPORT CAUSE the materials data will be a non contineous vector + real_t *materials[numZones]; + for (int i = 0; i < numZones; i++) { + materials[i] = new real_t[numComps]; + initializeRandom(materials[i], numComps); + } +#endif + std::cout << std::fixed << std::setprecision(2); + + std::cout << "Before\n"; + for (int i = 0; i < numZones; i++) { + std::cout << "Dense: " << actualDensity[i] << " eMass:" << eMass[i] + << " QC:" << qc[i]; + for (int j = 0; j < numComps; j++) { + std::cout << " Mat_" << j << " " << materials[i][j]; + } + std::cout << "\n"; + } + + + eval_ams(wf, actualDensity, eMass, qc, dt, materials, numComps, numZones); + + std::cout << "After\n"; + for (int i = 0; i < numZones; i++) { + std::cout << "Dense: " << actualDensity[i] << " eMass:" << eMass[i] + << " QC:" << qc[i]; + for (int j = 0; j < numComps; j++) { + std::cout << " Mat_" << j << " " << materials[i][j]; + } + std::cout << "\n"; + } + AMSFinalize(); + return 0; +} diff --git a/tests/AMSlib/json_configs/env_2_models_fs_rand_uq.json.in b/tests/AMSlib/ams_interface/json_configs/env_2_models_fs_rand_uq.json.in similarity index 59% rename from 
tests/AMSlib/json_configs/env_2_models_fs_rand_uq.json.in rename to tests/AMSlib/ams_interface/json_configs/env_2_models_fs_rand_uq.json.in index 90961bdf..e382c013 100644 --- a/tests/AMSlib/json_configs/env_2_models_fs_rand_uq.json.in +++ b/tests/AMSlib/ams_interface/json_configs/env_2_models_fs_rand_uq.json.in @@ -1,77 +1,61 @@ { "db" : { "dbType" : "@AMS_DB_TEST_TYPE@", - "fs_path" : "@FS_PATH@" + @DB_CONFIG@ }, "ml_models" : { "random_50": { "uq_type": "random", - "model_path": "@CMAKE_CURRENT_SOURCE_DIR@/linear_scripted_cpu.pt", + "model_path": "@TORCH_MODEL_DIR@/linear_scripted_single_cpu_random.pt", "uq_aggregate": "mean", "threshold": 0.5, "db_label" : "random_50" }, "random_10": { "uq_type": "random", - "model_path": "@CMAKE_CURRENT_SOURCE_DIR@/linear_scripted_cpu.pt", + "model_path": "@TORCH_MODEL_DIR@/linear_scripted_single_cpu_random.pt", "uq_aggregate": "mean", "threshold": 0.1, "db_label" : "random_10" }, "duq_mean": { "uq_type": "deltaUQ", - "model_path": "@CMAKE_CURRENT_SOURCE_DIR@/uq_mean_double_cpu.pt", + "model_path": "@TORCH_MODEL_DIR@/double_cpu_duq_mean.pt", "uq_aggregate": "mean", "threshold": 0.5, "db_label" : "duq_mean" }, "duq_max": { "uq_type": "deltaUQ", - "model_path": "@CMAKE_CURRENT_SOURCE_DIR@/uq_max_double_cpu.pt", + "model_path": "@TORCH_MODEL_DIR@/double_cpu_duq_max.pt", "uq_aggregate": "max", "threshold": 0.5, "db_label" : "duq_max" }, - "duq_max_debug": { - "uq_type": "deltaUQ", - "model_path": "@CMAKE_CURRENT_SOURCE_DIR@/uq_max_double_cpu.pt", - "uq_aggregate": "max", - "threshold": 0.5, - "db_label" : "duq_max", - "debug_db" : true - }, - "duq_mean_debug": { - "uq_type": "deltaUQ", - "model_path": "@CMAKE_CURRENT_SOURCE_DIR@/uq_mean_double_cpu.pt", - "uq_aggregate": "mean", - "threshold": 0.5, - "db_label" : "duq_mean", - "debug_db" : true - }, "random_no_db_10": { "uq_type": "random", - "model_path": "@CMAKE_CURRENT_SOURCE_DIR@/linear_scripted_cpu.pt", + "model_path": "@TORCH_MODEL_DIR@/linear_scripted_single_cpu_random.pt", 
"uq_aggregate": "mean", "threshold": 0.1, "db_label" : "" }, "random_no_db_50": { "uq_type": "random", - "model_path": "@CMAKE_CURRENT_SOURCE_DIR@/linear_scripted_cpu.pt", + "model_path": "@TORCH_MODEL_DIR@/linear_scripted_single_cpu_random.pt", "uq_aggregate": "mean", "threshold": 0.5, "db_label" : "" }, "duq_mean_no_db": { "uq_type": "deltaUQ", - "model_path": "@CMAKE_CURRENT_SOURCE_DIR@/uq_mean_double_cpu.pt", + "model_path": "@TORCH_MODEL_DIR@/double_cpu_duq_mean.pt", "uq_aggregate": "mean", "threshold": 0.5, "db_label" : "" }, "duq_max_no_db": { "uq_type": "deltaUQ", - "model_path": "@CMAKE_CURRENT_SOURCE_DIR@/uq_max_double_cpu.pt", + "model_path": "@TORCH_MODEL_DIR@/double_cpu_duq_max.pt", "uq_aggregate": "max", "threshold": 0.5, "db_label" : "" @@ -99,9 +83,6 @@ "app_uq_mean_ndb" : "duq_mean_no_db", "app_uq_max_ndb" : "duq_max_no_db", "app_no_model" : "no_model", - "app_no_model_no_db" : "no_model_no_db", - "app_uq_mean_debug" : "duq_mean_debug", - "app_uq_max_debug" : "duq_max_debug" - + "app_no_model_no_db" : "no_model_no_db" } } diff --git a/tests/AMSlib/ams_interface/json_configs/objects.json.in b/tests/AMSlib/ams_interface/json_configs/objects.json.in new file mode 100644 index 00000000..b86eefe6 --- /dev/null +++ b/tests/AMSlib/ams_interface/json_configs/objects.json.in @@ -0,0 +1,26 @@ +{ + "db" : { + "dbType" : "@DBTYPE@", + "fs_path" : "@FS_PATH@" + }, + "ml_models" : { + "model_1": { + "uq_type": "@UQ_TYPE@", + "model_path": "model_1", + "uq_aggregate": "mean", + "threshold": 0.5, + "db_label" : "tatb_eos" + }, + "model_2": { + "uq_type": "@UQ_TYPE@", + "model_path": "model_2", + "uq_aggregate": "max", + "threshold": 0.5, + "db_label" : "tatb_eos" + } + }, + "domain_models" : { + "test_1": "model_1", + "test_2" : "model_2" + } +} diff --git a/tests/AMSlib/json_configs/rmq.json.in b/tests/AMSlib/ams_interface/json_configs/rmq.json.in similarity index 64% rename from tests/AMSlib/json_configs/rmq.json.in rename to 
tests/AMSlib/ams_interface/json_configs/rmq.json.in index c3ac11c3..72e8aa2d 100644 --- a/tests/AMSlib/json_configs/rmq.json.in +++ b/tests/AMSlib/ams_interface/json_configs/rmq.json.in @@ -18,72 +18,56 @@ "ml_models" : { "random_50": { "uq_type": "random", - "model_path": "@CMAKE_CURRENT_SOURCE_DIR@/linear_scripted_cpu.pt", + "model_path": "@TORCH_MODEL_DIR@/linear_scripted_single_cpu_random.pt", "uq_aggregate": "mean", "threshold": 0.5, "db_label" : "random_50" }, "random_10": { "uq_type": "random", - "model_path": "@CMAKE_CURRENT_SOURCE_DIR@/linear_scripted_cpu.pt", + "model_path": "@TORCH_MODEL_DIR@/linear_scripted_single_cpu_random.pt", "uq_aggregate": "mean", "threshold": 0.1, "db_label" : "random_10" }, "duq_mean": { "uq_type": "deltaUQ", - "model_path": "@CMAKE_CURRENT_SOURCE_DIR@/uq_mean_double_cpu.pt", + "model_path": "@TORCH_MODEL_DIR@/double_cpu_duq_mean.pt", "uq_aggregate": "mean", "threshold": 0.5, "db_label" : "duq_mean" }, "duq_max": { "uq_type": "deltaUQ", - "model_path": "@CMAKE_CURRENT_SOURCE_DIR@/uq_max_double_cpu.pt", + "model_path": "@TORCH_MODEL_DIR@/double_cpu_duq_max.pt", "uq_aggregate": "max", "threshold": 0.5, "db_label" : "duq_max" }, - "duq_max_debug": { - "uq_type": "deltaUQ", - "model_path": "@CMAKE_CURRENT_SOURCE_DIR@/uq_max_double_cpu.pt", - "uq_aggregate": "max", - "threshold": 0.5, - "db_label" : "duq_max", - "debug_db" : true - }, - "duq_mean_debug": { - "uq_type": "deltaUQ", - "model_path": "@CMAKE_CURRENT_SOURCE_DIR@/uq_mean_double_cpu.pt", - "uq_aggregate": "mean", - "threshold": 0.5, - "db_label" : "duq_mean", - "debug_db" : true - }, "random_no_db_10": { "uq_type": "random", - "model_path": "@CMAKE_CURRENT_SOURCE_DIR@/linear_scripted_cpu.pt", + "model_path": "@TORCH_MODEL_DIR@/linear_scripted_single_cpu_random.pt", "uq_aggregate": "mean", "threshold": 0.1, "db_label" : "" }, "random_no_db_50": { "uq_type": "random", - "model_path": "@CMAKE_CURRENT_SOURCE_DIR@/linear_scripted_cpu.pt", + "model_path": 
"@TORCH_MODEL_DIR@/linear_scripted_single_cpu_random.pt", "uq_aggregate": "mean", "threshold": 0.5, "db_label" : "" }, "duq_mean_no_db": { "uq_type": "deltaUQ", - "model_path": "@CMAKE_CURRENT_SOURCE_DIR@/uq_mean_double_cpu.pt", + "model_path": "@TORCH_MODEL_DIR@/double_cpu_duq_mean.pt", "uq_aggregate": "mean", "threshold": 0.5, "db_label" : "" }, "duq_max_no_db": { "uq_type": "deltaUQ", - "model_path": "@CMAKE_CURRENT_SOURCE_DIR@/uq_max_double_cpu.pt", + "model_path": "@TORCH_MODEL_DIR@/double_cpu_duq_max.pt", "uq_aggregate": "max", "threshold": 0.5, "db_label" : "" @@ -111,9 +95,6 @@ "app_uq_mean_ndb" : "duq_mean_no_db", "app_uq_max_ndb" : "duq_max_no_db", "app_no_model" : "no_model", - "app_no_model_no_db" : "no_model_no_db", - "app_uq_mean_debug" : "duq_mean_debug", - "app_uq_max_debug" : "duq_max_debug" - + "app_no_model_no_db" : "no_model_no_db" } } diff --git a/tests/AMSlib/ams_interface/verify_ete.py b/tests/AMSlib/ams_interface/verify_ete.py new file mode 100644 index 00000000..524d7a64 --- /dev/null +++ b/tests/AMSlib/ams_interface/verify_ete.py @@ -0,0 +1,394 @@ +import sys +import json +from pathlib import Path +import h5py +import numpy as np +import os +from typing import Tuple, Optional + + +def get_suffix(db_type): + if db_type == "none": + return "none" + if db_type == "hdf5": + return "h5" + return "unknown" + + +def verify_fs_db(db_type: str, fs_path: str, name: str) -> Tuple[bool, bool, str]: + """ + @brief verifies that the db file exists and is correct in format + + @param db_type The type of the database ('can be only hdf5') + @param fs_path: The path of the database + @param name: The filename prefix of the db + + @return [continue, has_error] Whether we should continue processing the test or stop and whether the formats etc are correct + db_type: + """ + + if not Path(fs_path).exists(): + print("Expecting output directory to exist") + return True, 1, None + + suffix = get_suffix(db_type) + if suffix == "none": + return False, 0, None + fn = 
f"{name}_0.{suffix}" + fp = Path(f"{fs_path}/{fn}") + + if name == "": + # We don't have a file and we should return to stop analysis + return False, 0, None + + if name == "" and fp.exists(): + print(f"I was expecting file({fp}) to not exist") + fp.unlink() + return False, 1, None + + return True, 0, str(fp) + + +def read_file(fp: str, threshold: float) -> Tuple[Optional[Tuple[np.ndarray, np.ndarray]], bool]: + """ + @brief verifies that the db file has the appropriate format + + @param fp the path to the file + @param threshold The threshold of UQ + + @return [input, output], has_error] The data in the file and whether there is some error + """ + + with h5py.File(fp, "r") as fd: + dsets = fd.keys() + if threshold == 1.0: + dsets = fd.keys() + print(dsets) + assert len(dsets) == 1, "Expected input, output and domain_name dset" + return (None, None), 0 + print(dsets) + if len(dsets) != 3: + print(f"Expected input, output and domain_name dset") + return (None, None), 1 + + data = {} + for d in {"input_data", "output_data"}: + if d not in dsets: + print(f"Expected {d} to be in dataset names {dests}") + return (None, None), 1 + data[d] = fd[d] + print(d, fd[d].shape) + input_data = np.array(data["input_data"]) + output_data = np.array(data["output_data"]) + print(input_data.shape) + print(output_data.shape) + return (input_data, output_data), 0 + + +def verify_data_collection(fs_path, db_type, num_inputs, num_outputs, threshold, name="test", debug_db=False): + # Returns a tuple of the input/ouput data and 0/1 for correct incorrect file. 
+ # Checks whether the files also have the right number of columns + if not Path(fs_path).exists(): + print("Expecting output directory to exist") + return None, 1 + + suffix = get_suffix(db_type) + if suffix == "none": + return None, 0 + + fn = f"{name}_0.{suffix}" + if debug_db and db_type != "hdf5": + print("Debug DB is only supported on hdf5") + return None, 1 + elif debug_db: + fn = f"{name}_0.debug.{suffix}" + + fp = Path(f"{fs_path}/{fn}") + + if name == "" and fp.exists(): + print(f"I was expecting file({fp}) to not exist") + fp.unlink() + return None, 1 + elif name == "": + return (np.empty((0, 0), dtype=np.float32), np.empty((0, 0), dtype=np.float32)), 0 + elif not fp.exists(): + print(f"File path {fn} does not exist") + return None, 1 + + if db_type == "hdf5": + with h5py.File(fp, "r") as fd: + dsets = fd.keys() + if threshold == 1.0: + dsets = fd.keys() + print(dsets) + assert len(dsets) == 1, "Expected input, output and domain_name dset" + + return None, 0 + + print(dsets) + assert len(dsets) == 3, "Expected input, output and domain_name dset" + data = {} + for d in {"input_data", "output_data"}: + assert d in dsets, f"Expected {d} to be in dataset names {dests}" + data[d] = fd[d] + print(d, fd[d].shape) + input_data = np.array(data["input_data"]) + output_data = np.array(data["output_data"]) + print(input_data.shape) + print(output_data.shape) + fp.unlink() + if debug_db: + return (input_data, output_data, predicate), 0 + return (input_data, output_data), 0 + + else: + return None, 1 + + +def verify_data( + threshold: float, uq_name, num_elements, inputs: np.ndarray, outputs: np.ndarray, num_inputs: int, num_outputs: int +) -> bool: + """ + @brief verifies that the collected data have the expected values + + @param threshold The threshold of UQ + @param uq_name The uq name/technology being used + @param num_elements The number of elements we executed our simulation with + @param inputs: The data collected as inputs + @param outputs: The data collected 
as outputs + @param num_inputs: The number of inputs on the outer dimension + @param num_outputs: The number of outputs on the outer dimension + + @return has_error Whether we are correct or not + """ + if threshold == 0.0: + # Threshold 0 means collect all data. Verify the sizes. + assert ( + inputs.shape[0] == num_elements and outputs.shape[0] == num_elements + ), f"Did not collect all expected data, Input Size: {inputs.shape} : Output Size: {outputs.shape}, expected size: {num_elements}" + + elif threshold == 1.0: + # Threshold 1.0 means to not collect any data. Verify the sizes. + assert inputs.shape[0] == 0 and outputs.shape[0] == 0, "Num elements should be zero" + # There is nothing else we can check here + return 0 + else: + # Compute a theoritical range of possible values in the db. + # The duq/faiss tests have specific settings. The random one can have a + # bound. This checks for all these cases + lb = num_elements * (1 - threshold) - num_elements * 0.05 + ub = num_elements * (1 - threshold) + num_elements * 0.05 + assert ( + inputs.shape[0] > lb and inputs.shape[0] < ub + ), f"Not in the bounds of correct items {lb} {ub} {inputs.shape[0]}" + assert ( + outputs.shape[0] > lb and outputs.shape[0] < ub + ), f"Not in the bounds of correct items {lb} {ub} {outputs.shape[0]}" + + if "delta" in uq_name: + assert "mean" in uq_name or "max" in uq_name, "unknown Delta UQ mechanism" + d_type = np.float32 + # Our DUQ-mean model skips odd evaluations. + # Here we set on verify_inputs the inputs of those evaluations + verify_inputs = np.zeros(inputs.shape, dtype=d_type) + if threshold == 0.0: + step = 1 + elif threshold == 0.5: + verify_inputs[0] = np.ones(num_inputs, dtype=d_type) + step = 2 + for i in range(1, len(inputs)): + verify_inputs[i] = verify_inputs[i - 1] + step + # Compare whether the results match our base function. 
+ diff_sum = np.sum(np.abs(verify_inputs - inputs)) + assert np.isclose(diff_sum, 0.0), "Mean Input data do not match" + verify_output = np.sum(inputs, axis=1).T * num_outputs + outputs = np.sum(outputs, axis=1) + diff_sum = np.sum(np.abs(outputs - verify_output)) + assert np.isclose(diff_sum, 0.0), "Mean Output data do not match" + else: + return 0 + + +def get_fs_data(db_type, fs_path, model_path, threshold, name="test"): + if db_type != "hdf5" and db_type != "none": + print(f"Wrong db_type, we support only hdf5 instead got {db_type}") + return 1 + + if model_path == None or model_path == "": + threshold = 0.0 + + cont, has_error, fp = verify_fs_db(db_type, fs_path, name) + + if has_error: + return (None, None), threshold, 1 + + if not cont: + return (None, None), threshold, 0 + + (_in, _out), has_error = read_file(fp, threshold) + + Path(fp).unlink() + return (_in, _out), threshold, has_error + + +def from_cli(argv): + use_device = int(argv[0]) + num_inputs = int(argv[1]) + num_outputs = int(argv[2]) + model_path = argv[3] + data_type = argv[4] + uq_name = argv[5] + threshold = float(argv[6]) + num_iterations = int(argv[7]) + num_elements = int(argv[8]) + db_type = argv[9] + fs_path = argv[10] + + (_in, _out), thresh, has_error = get_fs_data(db_type, fs_path, model_path, threshold) + + if has_error: + return 1 + + if (_in is None) and (_out is None): + return 0 + + if _in is None: + print("In is None, Out is not") + return 1 + + if _out is None: + print("Out is None, In is not") + return 1 + + error = verify_data(thresh, uq_name, num_elements, _in, _out, num_inputs, num_outputs) + return error + + +def get_rmq_data(ams_config, domain_names, num_iterations, timeout=1): + from ams.rmq import BlockingClient, default_ams_callback + + rmq_json = ams_config["db"]["rmq_config"] + print(rmq_json) + host = rmq_json["service-host"] + vhost = rmq_json["rabbitmq-vhost"] + port = rmq_json["service-port"] + user = rmq_json["rabbitmq-user"] + password = 
rmq_json["rabbitmq-password"] + queue = rmq_json["rabbitmq-queue-physics"] + cert = None + if "rabbitmq-cert" in rmq_json: + cert = rmq_json["rabbitmq-cert"] + cert = None if cert == "" else cert + with BlockingClient(host, port, vhost, user, password, cert, default_ams_callback) as client: + with client.connect(queue) as channel: + msgs = channel.receive(n_msg=num_iterations * len(domain_names), timeout=timeout) + + dns = set(domain_names) + + _data = {k: ([], []) for k in dns} + + # sim_data[] = (_in, _out, thresh, uq_type) + for msg in msgs: + domain, input_data, output_data = msg.decode() + print(domain) + print(msg) + if domain not in dns: + raise RuntimeError(f"Received unknown domain name {domain}") + _data[domain][0].append(input_data) + _data[domain][1].append(output_data) + + sim_data = {} + for d in _data.keys(): + ml_id = ams_config["domain_models"][d] + model = ams_config["ml_models"][ml_id] + threshold = model["threshold"] + model_path = model.get("model_path", None) + uq_type = model["uq_type"] + if "uq_aggregate" in model: + uq_type += " ({0})".format(model["uq_aggregate"]) + + if model_path == None or model_path == "": + threshold = 0.0 + + print("Type for domain", d, type(_data[d]), len(_data[d])) + if model["db_label"] == "" and len(_data[d][0]) != 0: + raise RuntimeError("Expected data to not exist") + elif model["db_label"] != "": + inputs = np.vstack(_data[d][0]) + outputs = np.vstack(_data[d][1]) + sim_data[d] = (inputs, outputs, threshold, uq_type) + + return sim_data + + +def from_json(argv): + print(argv) + use_device = int(argv[0]) + num_inputs = int(argv[1]) + num_outputs = int(argv[2]) + data_type = argv[3] + num_elements = int(argv[4]) + model_1 = argv[5] + model_2 = argv[6] + + env_file = Path(os.environ["AMS_OBJECTS"]) + if not env_file.exists(): + print("Environment file does not exist") + return -1 + + with open(env_file, "r") as fd: + data = json.load(fd) + + db_type = data["db"]["dbType"] + + sim_data = {} + if db_type == "hdf5" 
or db_type == "none": + db_path = data["db"]["fs_path"] + for m in [model_1, model_2]: + print("Reading Model", m) + ml_id = data["domain_models"][m] + model = data["ml_models"][ml_id] + uq_type = model["uq_type"] + if "uq_aggregate" in model: + uq_type += " ({0})".format(model["uq_aggregate"]) + + threshold = model["threshold"] + db_label = model["db_label"] + model_path = model.get("model_path", None) + + (_in, _out), thresh, has_error = get_fs_data(db_type, db_path, model_path, threshold, model["db_label"]) + + if has_error: + return 1 + + if (_in is None) and (_out is None): + # This means we returned 0 as an error and we don't have any data + # to analyze, so we skip + continue + + if _in is None: + print("In is None, Out is not") + return 1 + + if _out is None: + print("Out is None, In is not") + return 1 + + sim_data[m] = (_in, _out, thresh, uq_type) + elif db_type == "rmq": + print("RMQ") + sim_data = get_rmq_data(data, [model_1, model_2], 1) + + for m, (_in, _out, thresh, uq_type) in sim_data.items(): + print("Verify data of Model", m) + error = verify_data(thresh, uq_type, num_elements, _in, _out, num_inputs, num_outputs) + if error: + print("Error when verify_data") + return error + return 0 + + +if __name__ == "__main__": + if "AMS_OBJECTS" in os.environ: + sys.exit(from_json(sys.argv[1:])) + sys.exit(from_cli(sys.argv[1:])) diff --git a/tests/AMSlib/ams_rmq_env.cpp b/tests/AMSlib/ams_rmq_env.cpp deleted file mode 100644 index 6db24edb..00000000 --- a/tests/AMSlib/ams_rmq_env.cpp +++ /dev/null @@ -1,194 +0,0 @@ -#ifdef __AMS_ENABLE_MPI__ -#include -#endif -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "AMS.h" -#include "wf/debug.h" - -#include "../utils.hpp" - -void createUmpirePool(std::string parent_name, std::string pool_name) -{ - auto &rm = umpire::ResourceManager::getInstance(); - auto alloc_resource = rm.makeAllocator( - pool_name, rm.getAllocator(parent_name)); -} - - -AMSDType 
getDataType(char *d_type) -{ - AMSDType dType = AMSDType::AMS_DOUBLE; - if (std::strcmp(d_type, "float") == 0) { - dType = AMSDType::AMS_SINGLE; - } else if (std::strcmp(d_type, "double") == 0) { - dType = AMSDType::AMS_DOUBLE; - } else { - assert(false && "Unknown data type"); - } - return dType; -} - -template -struct Problem { - int num_inputs; - int num_outputs; - Problem(int ni, int no) : num_inputs(ni), num_outputs(no) {} - - void run(long num_elements, DType **inputs, DType **outputs) - { - for (int i = 0; i < num_elements; i++) { - DType sum = 0; - for (int j = 0; j < num_inputs; j++) { - sum += inputs[j][i]; - } - - for (int j = 0; j < num_outputs; j++) { - outputs[j][i] = sum; - } - } - } - - - const DType *initialize_inputs(DType *inputs, long length) - { - for (int i = 0; i < length; i++) { - inputs[i] = static_cast(i); - } - return inputs; - } - - void ams_run(AMSExecutor &wf, - AMSResourceType resource, - int iterations, - int num_elements) - { - auto &rm = umpire::ResourceManager::getInstance(); - - for (int i = 0; i < iterations; i++) { - int elements = num_elements; // * ((DType)(rand()) / RAND_MAX) + 1; - std::vector inputs; - std::vector outputs; - - // Allocate Input memory - for (int j = 0; j < num_inputs; j++) { - DType *data = new DType[elements]; - inputs.push_back(initialize_inputs(data, elements)); - } - - // Allocate Output memory - for (int j = 0; j < num_outputs; j++) { - outputs.push_back(new DType[elements]); - } - - AMSExecute(wf, - (void *)this, - elements, - reinterpret_cast(inputs.data()), - reinterpret_cast(outputs.data()), - inputs.size(), - outputs.size()); - - for (int i = 0; i < num_outputs; i++) { - delete[] outputs[i]; - outputs[i] = nullptr; - } - - - for (int i = 0; i < num_inputs; i++) { - delete[] inputs[i]; - inputs[i] = nullptr; - } - } - } -}; - -void callBackDouble(void *cls, long elements, void **inputs, void **outputs) -{ - std::cout << "Called the double precision model\n"; - static_cast *>(cls)->run(elements, - 
(double **)(inputs), - (double **)(outputs)); -} - - -void callBackSingle(void *cls, long elements, void **inputs, void **outputs) -{ - std::cout << "Called the single precision model\n"; - static_cast *>(cls)->run(elements, - (float **)(inputs), - (float **)(outputs)); -} - - -int main(int argc, char **argv) -{ - - if (argc != 7) { - std::cout << "Wrong cli\n"; - std::cout << argv[0] - << " use_device(0|1) num_inputs num_outputs " - "data_type(float|double) " - "num_iterations num_elements" - << std::endl; - return -1; - } - - installSignals(); - AMSInit(); - - int use_device = std::atoi(argv[1]); - int num_inputs = std::atoi(argv[2]); - int num_outputs = std::atoi(argv[3]); - AMSDType data_type = getDataType(argv[4]); - int num_iterations = std::atoi(argv[5]); - int num_elements = std::atoi(argv[6]); - AMSResourceType resource = AMSResourceType::AMS_HOST; - srand(time(NULL)); - - createUmpirePool("HOST", "TEST_HOST"); - AMSSetAllocator(AMSResourceType::AMS_HOST, "TEST_HOST"); - - AMSCAbstrModel ams_model = AMSRegisterAbstractModel("rmq_db_no_model", - AMSUQPolicy::AMS_RANDOM, - 0.5, - "", - "", - "rmq_db_no_model", - 1); - - if (data_type == AMSDType::AMS_SINGLE) { - Problem prob(num_inputs, num_outputs); - AMSExecutor wf = AMSCreateExecutor(ams_model, - AMSDType::AMS_SINGLE, - resource, - (AMSPhysicFn)callBackSingle, - 0, - 1); - - prob.ams_run(wf, resource, num_iterations, num_elements); - } else { - Problem prob(num_inputs, num_outputs); - AMSExecutor wf = AMSCreateExecutor(ams_model, - AMSDType::AMS_DOUBLE, - resource, - (AMSPhysicFn)callBackDouble, - 0, - 1); - prob.ams_run(wf, resource, num_iterations, num_elements); - } - - std::this_thread::sleep_for(std::chrono::milliseconds(1000)); - AMSFinalize(); - return 0; -} diff --git a/tests/AMSlib/ams_update_model.cpp b/tests/AMSlib/ams_update_model.cpp deleted file mode 100644 index 9a1b1c2d..00000000 --- a/tests/AMSlib/ams_update_model.cpp +++ /dev/null @@ -1,123 +0,0 @@ -#include -#include -#include -#include 
- -#include -#include -#include -#include -#include -#include -#include - -#include "../utils.hpp" - -#define SIZE (32L) - -template -bool inference(SurrogateModel &model, - AMSResourceType resource, - std::string update_path) -{ - using namespace ams; - - std::vector inputs; - std::vector outputs; - auto &ams_rm = ams::ResourceManager::getInstance(); - - for (int i = 0; i < 2; i++) - inputs.push_back(ams_rm.allocate(SIZE, resource)); - - for (int i = 0; i < 4 * 2; i++) - outputs.push_back(ams_rm.allocate(SIZE, resource)); - - for (int repeat = 0; repeat < 2; repeat++) { - model.evaluate( - SIZE, inputs.size(), 4, inputs.data(), &(outputs.data()[repeat * 4])); - if (repeat == 0) model.update(update_path); - } - - // Verify - bool errors = false; - for (int i = 0; i < 4; i++) { - T *first_model_out = outputs[i]; - T *second_model_out = outputs[i + 4]; - if (resource == AMSResourceType::AMS_DEVICE) { - first_model_out = ams_rm.allocate(SIZE, AMSResourceType::AMS_HOST); - second_model_out = ams_rm.allocate(SIZE, AMSResourceType::AMS_HOST); - ams_rm.copy(outputs[i], - resource, - first_model_out, - AMSResourceType::AMS_HOST, - SIZE); - ams_rm.copy(outputs[i + 4], - resource, - second_model_out, - AMSResourceType::AMS_HOST, - SIZE); - } - - for (int j = 0; j < SIZE; j++) { - if (first_model_out[j] != 1.0) { - errors = true; - std::cout << "One Model " << first_model_out << " " << j << " " - << first_model_out[j] << "\n"; - } - if (second_model_out[j] != 0.0) { - std::cout << "Zero Model " << second_model_out << " " << j << " " - << second_model_out[j] << "\n"; - errors = true; - } - } - - if (resource == AMSResourceType::AMS_DEVICE) { - ams_rm.deallocate(first_model_out, AMSResourceType::AMS_HOST); - ams_rm.deallocate(second_model_out, AMSResourceType::AMS_HOST); - } - } - - for (int i = 0; i < 2; i++) - ams_rm.deallocate(const_cast(inputs[i]), resource); - - for (int i = 0; i < 4 * 2; i++) - ams_rm.deallocate(outputs[i], resource); - - return errors; -} - - -int 
main(int argc, char *argv[]) -{ - using namespace ams; - installSignals(); - AMSInit(); - - auto &ams_rm = ams::ResourceManager::getInstance(); - int use_device = std::atoi(argv[1]); - std::string data_type(argv[2]); - std::string zero_model(argv[3]); - std::string one_model(argv[4]); - - AMSResourceType resource = AMSResourceType::AMS_HOST; - if (use_device == 1) { - resource = AMSResourceType::AMS_DEVICE; - } - - int ret = 0; - if (data_type.compare("double") == 0) { - std::shared_ptr> model = - SurrogateModel::getInstance(one_model, resource); - assert(model->is_double()); - ret = inference(*model, resource, zero_model); - } else if (data_type.compare("single") == 0) { - std::shared_ptr> model = - SurrogateModel::getInstance(one_model, resource); - assert(!model->is_double()); - ret = inference(*model, resource, zero_model); - } - std::cout << "Zero Model is " << zero_model << "\n"; - std::cout << "One Model is " << one_model << "\n"; - AMSFinalize(); - return ret; -} diff --git a/tests/AMSlib/ams_uq_model.cpp b/tests/AMSlib/ams_uq_model.cpp deleted file mode 100644 index 0c033f39..00000000 --- a/tests/AMSlib/ams_uq_model.cpp +++ /dev/null @@ -1,80 +0,0 @@ -#include - -#include -#include -#include -#include -#include -#include -#include -#include - -#include "../utils.hpp" - -#define SIZE (32L * 1024L + 3L) - -template -void model(UQ &model, - AMSResourceType resource, - int num_inputs, - int num_outputs) -{ - std::vector inputs; - std::vector outputs; - auto &ams_rm = ams::ResourceManager::getInstance(); - - for (int i = 0; i < num_inputs; i++) - inputs.push_back(ams_rm.allocate(SIZE, resource)); - - for (int i = 0; i < num_outputs; i++) - outputs.push_back(ams_rm.allocate(SIZE, resource)); - - bool *predicates = ams_rm.allocate(SIZE, resource); - - std::cout << "We are calling evaluate\n"; - model.evaluate(SIZE, inputs, outputs, predicates); - - - for (int i = 0; i < num_inputs; i++) - ams_rm.deallocate(const_cast(inputs[i]), resource); - - for (int i = 0; i < 
num_outputs; i++) - ams_rm.deallocate(outputs[i], resource); - - ams_rm.deallocate(predicates, resource); -} - - -int main(int argc, char *argv[]) -{ - using namespace ams; - installSignals(); - AMSInit(); - - auto &ams_rm = ResourceManager::getInstance(); - int use_device = std::atoi(argv[1]); - std::string model_path(argv[2]); - std::string data_type(argv[3]); - std::string uq_path; - int num_inputs = std::atoi(argv[4]); - int num_outputs = std::atoi(argv[5]); - const AMSUQPolicy uq_policy = static_cast(std::atoi(argv[6])); - float threshold = std::atof(argv[7]); - - std::cout << "Executing on device " << use_device << "\n"; - - AMSResourceType resource = AMSResourceType::AMS_HOST; - if (use_device == 1) { - resource = AMSResourceType::AMS_DEVICE; - } - - if (data_type.compare("double") == 0) { - UQ UQModel(resource, uq_policy, uq_path, -1, model_path, threshold); - model(UQModel, resource, num_inputs, num_outputs); - } else if (data_type.compare("single") == 0) { - UQ UQModel(resource, uq_policy, uq_path, -1, model_path, threshold); - model(UQModel, resource, num_inputs, num_outputs); - } - AMSFinalize(); - return 0; -} diff --git a/tests/AMSlib/cpu_packing_test.cpp b/tests/AMSlib/cpu_packing_test.cpp deleted file mode 100644 index a84eaa3d..00000000 --- a/tests/AMSlib/cpu_packing_test.cpp +++ /dev/null @@ -1,184 +0,0 @@ -/* - * Copyright 2021-2023 Lawrence Livermore National Security, LLC and other - * AMSLib Project Developers - * - * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception - */ - -#include - -#include -#include -#include -#include -#include - -#include "../utils.hpp" - -#define SIZE (32 * 1024 + 1) - -void initPredicate(bool* ptr, double* data, int size) -{ - for (int i = 0; i < size; i++) { - ptr[i] = i % 2 == 0; - data[i] = i; - } -} - -int verify(double* dense, int size, int flag) -{ - for (int i = 0; i < size; i++) { - if (dense[i] != (i * 2 + (!flag))) { - std::cout << i << " Expected " << i * 2 << " gotten " << dense[i] << "\n"; - return 
1; - } - } - return 0; -} - -int verify(bool* pred, double* d1, double* d2, int size, int flag) -{ - for (int i = 0; i < size; i++) { - if ((pred[i] == flag) && d1[i] != d2[i]) { - std::cout << pred[i] << " dense " << d1[i] << " sparse " << d2[i] << "\n"; - return 1; - } - } - return 0; -} - -int main(int argc, char* argv[]) -{ - using namespace ams; - installSignals(); - AMSInit(); - - using data_handler = DataHandler; - const size_t size = SIZE; - int device = std::atoi(argv[1]); - auto& rm = ams::ResourceManager::getInstance(); - rm.init(); - if (device == 0) { - AMSResourceType resource = AMSResourceType::AMS_HOST; - bool* predicate = rm.allocate(SIZE, resource); - double* dense = rm.allocate(SIZE, resource); - double* sparse = rm.allocate(SIZE, resource); - double* rsparse = rm.allocate(SIZE, resource); - - initPredicate(predicate, sparse, SIZE); - std::vector s_data({const_cast(sparse)}); - std::vector sr_data({rsparse}); - std::vector d_data({dense}); - int elements; - - for (int flag = 0; flag < 2; flag++) { - elements = - data_handler::pack(resource, predicate, size, s_data, d_data, flag); - - if (elements != (SIZE + flag) / 2) { - std::cout << "Did not compute dense number correctly " << elements - << "\n"; - return 1; - } - - if (verify(dense, elements, flag)) { - std::cout << "Dense elements do not have the correct values\n"; - return 1; - } - - data_handler::unpack(resource, predicate, size, d_data, sr_data, flag); - - if (verify(predicate, sparse, rsparse, size, flag)) { - std::cout << "Unpacking packed data does not match initial values\n"; - return 1; - } - } - - rm.deallocate(predicate, AMSResourceType::AMS_HOST); - rm.deallocate(dense, AMSResourceType::AMS_HOST); - rm.deallocate(sparse, AMSResourceType::AMS_HOST); - rm.deallocate(rsparse, AMSResourceType::AMS_HOST); - } else if (device == 1) { - AMSResourceType resource = AMSResourceType::AMS_DEVICE; - bool* h_predicate = rm.allocate(SIZE, AMSResourceType::AMS_HOST); - double* h_dense = 
rm.allocate(SIZE, AMSResourceType::AMS_HOST); - double* h_sparse = rm.allocate(SIZE, AMSResourceType::AMS_HOST); - double* h_rsparse = rm.allocate(SIZE, AMSResourceType::AMS_HOST); - - initPredicate(h_predicate, h_sparse, SIZE); - - bool* predicate = rm.allocate(SIZE, resource); - double* dense = rm.allocate(SIZE, resource); - double* sparse = rm.allocate(SIZE, resource); - double* rsparse = rm.allocate(SIZE, resource); - int* reindex = rm.allocate(SIZE, resource); - - rm.copy(h_predicate, - AMSResourceType::AMS_HOST, - predicate, - AMSResourceType::AMS_DEVICE, - SIZE); - rm.copy(h_sparse, - AMSResourceType::AMS_HOST, - sparse, - AMSResourceType::AMS_DEVICE, - SIZE); - - std::vector s_data({const_cast(sparse)}); - std::vector sr_data({rsparse}); - std::vector d_data({dense}); - - for (int flag = 0; flag < 2; flag++) { - int elements; - elements = - data_handler::pack(resource, predicate, size, s_data, d_data, flag); - - if (elements != (SIZE + flag) / 2) { - std::cout << "Did not compute dense number correctly(" << elements - << ")\n"; - return 1; - } - - rm.copy(dense, - AMSResourceType::AMS_DEVICE, - h_dense, - AMSResourceType::AMS_HOST, - elements); - - if (verify(h_dense, elements, flag)) { - std::cout << "Dense elements do not have the correct values\n"; - return 1; - } - - data_handler::unpack(resource, predicate, size, d_data, sr_data, flag); - - rm.copy(rsparse, - AMSResourceType::AMS_DEVICE, - h_rsparse, - AMSResourceType::AMS_HOST, - size); - - if (verify(h_predicate, h_sparse, h_rsparse, size, flag)) { - // for ( int k = 0; k < SIZE; k++){ - // std::cout << k << " " << h_sparse[k] << " " << h_rsparse[k] << - // "\n"; - // } - std::cout << "Unpacking packed data does not match initial values\n"; - return 1; - } - } - - rm.deallocate(predicate, AMSResourceType::AMS_DEVICE); - rm.deallocate(h_predicate, AMSResourceType::AMS_HOST); - rm.deallocate(dense, AMSResourceType::AMS_DEVICE); - rm.deallocate(h_dense, AMSResourceType::AMS_HOST); - 
rm.deallocate(sparse, AMSResourceType::AMS_DEVICE); - rm.deallocate(h_sparse, AMSResourceType::AMS_HOST); - rm.deallocate(rsparse, AMSResourceType::AMS_DEVICE); - rm.deallocate(h_rsparse, AMSResourceType::AMS_HOST); - rm.deallocate(reindex, AMSResourceType::AMS_DEVICE); - } - - AMSFinalize(); - return 0; -} diff --git a/tests/AMSlib/db/CMakeLists.txt b/tests/AMSlib/db/CMakeLists.txt new file mode 100644 index 00000000..0f2f66c1 --- /dev/null +++ b/tests/AMSlib/db/CMakeLists.txt @@ -0,0 +1,49 @@ +function(BUILD_UNIT_TEST exe source) + add_executable(${exe} ${source}) + + target_include_directories(${exe} PRIVATE ${caliper_INCLUDE_DIR} ${MPI_INCLUDE_PATH}) + target_include_directories(${exe} PRIVATE ${CMAKE_SOURCE_DIR}/src/AMSlib/) + target_include_directories(${exe} PRIVATE ${CMAKE_BINARY_DIR}/include/) + target_link_libraries(${exe} PRIVATE stdc++fs AMS torch) + + if (WITH_HDF5) + target_link_libraries(${exe} PRIVATE ${AMS_HDF5_TARGET}) + endif() + + if (WITH_CALIPER) + message(STATUS "BUilding witth calier ${exe}") + target_link_libraries(${exe} PRIVATE caliper) + endif() + + if (WITH_RMQ) + target_link_libraries(${exe} PRIVATE amqpcpp) + if (OPENSSL_FOUND) + target_link_libraries(${exe} PRIVATE OpenSSL::SSL OpenSSL::Crypto) + endif() + # NOTE: We set here the event/event pthreads as public. As there is no easy way + # to do a find package(libevent) and RMQ is not exposing that properly. 
+ target_link_libraries(${exe} PRIVATE ${LIBEVENT_LIBRARY} ${LIBEVENT_THREAD}) + endif() + + if (WITH_MPI) + target_link_libraries(${exe} PRIVATE MPI::MPI_CXX) + endif() + + target_compile_definitions(${exe} PRIVATE ${AMS_APP_DEFINES}) +endfunction() + +function(ADD_HDF5_UNIT_TEST name exec) + add_test(NAME ${name} COMMAND ${exec} ${ARGN}) + + set_tests_properties(${name} PROPERTIES LABELS HDF5_UNIT_TEST) +endfunction() + +if (WITH_HDF5) +BUILD_UNIT_TEST(hdf5_create hdf5_open.cpp) +BUILD_UNIT_TEST(hdf5_store hdf5_store.cpp) +BUILD_UNIT_TEST(db_manager_hdf5 db_manager_hdf5.cpp) +ADD_HDF5_UNIT_TEST(DB::HDF5::CREATE COMMAND bash -c "rm -f ${CMAKE_CURRENT_BINARY_DIR}/create_test*.h5\;${CMAKE_CURRENT_BINARY_DIR}/hdf5_create ${CMAKE_CURRENT_BINARY_DIR} test create_test") +ADD_HDF5_UNIT_TEST(DB::HDF5::STORE COMMAND bash -c "rm -f ${CMAKE_CURRENT_BINARY_DIR}/store_test*.h5\;${CMAKE_CURRENT_BINARY_DIR}/hdf5_store ${CMAKE_CURRENT_BINARY_DIR} test store_test") +ADD_HDF5_UNIT_TEST(DB::HDF5::DBManager COMMAND bash -c "rm -f ${CMAKE_CURRENT_BINARY_DIR}/domain_*.h5\;${CMAKE_CURRENT_BINARY_DIR}/db_manager_hdf5 ${CMAKE_CURRENT_BINARY_DIR}/") +endif() + diff --git a/tests/AMSlib/db/db_manager_hdf5.cpp b/tests/AMSlib/db/db_manager_hdf5.cpp new file mode 100644 index 00000000..d3be2ad3 --- /dev/null +++ b/tests/AMSlib/db/db_manager_hdf5.cpp @@ -0,0 +1,48 @@ +#include +#include +#include +#include + +#include "AMSTypes.hpp" +#include "wf/basedb.hpp" + +int main(int argc, char* argv[]) +{ + if (argc != 2) { + std::cout << "Wrong command line argument :\n"; + std::cout << argv[0] << " \n"; + return -1; + } + std::string db_path(argv[1]); + auto& db_instance = ams::db::DBManager::getInstance(); + db_instance.instantiate_fs_db(ams::AMSDBType::AMS_HDF5, db_path); + for (auto dn : {std::string("domain_1"), + std::string("domain_2"), + std::string("domain_1"), + std::string("domain_2")}) { + auto file_db = db_instance.getDB(dn, dn); + } + + if (db_instance.getNumInstances() != 2) { + 
std::cout << "Wrong number of instances\n"; + return -1; + } + + // This is done to internally call the de-constructors of the respective DB. + db_instance.clean(); + if (db_instance.getNumInstances() != 0) { + std::cout << "DB Instances did not reset \n"; + return -1; + } + + for (auto dn : {std::string("domain_1"), std::string("domain_2")}) { + auto fn = db_path + dn + "_0.h5"; + + if (!std::filesystem::exists(fn)) { + std::cout << "File " << fn << "does not exists.\n"; + return -1; + } + } + + return 0; +} diff --git a/tests/AMSlib/db/hdf5_open.cpp b/tests/AMSlib/db/hdf5_open.cpp new file mode 100644 index 00000000..c453b975 --- /dev/null +++ b/tests/AMSlib/db/hdf5_open.cpp @@ -0,0 +1,124 @@ +#include +#include + +#include +#include +#include +#include + +#include "wf/basedb.hpp" + +template +int testReadHDF5Dataset(const std::string& filePath, + const std::string& datasetName, + hid_t DataType, + std::vector correct_contents) +{ + // Open the HDF5 file + hid_t file_id = H5Fopen(filePath.c_str(), H5F_ACC_RDONLY, H5P_DEFAULT); + if (file_id < 0) { + std::cerr << "Failed to open HDF5 file: " << filePath << std::endl; + return -1; + } + + // Open the dataset + hid_t dataset_id = H5Dopen(file_id, datasetName.c_str(), H5P_DEFAULT); + if (dataset_id < 0) { + std::cerr << "Failed to open dataset: " << datasetName << std::endl; + H5Fclose(file_id); + return -1; + } + + // Get the dataspace of the dataset + hid_t dataspace_id = H5Dget_space(dataset_id); + if (dataspace_id < 0) { + std::cerr << "Failed to get dataspace for dataset: " << datasetName + << std::endl; + H5Dclose(dataset_id); + H5Fclose(file_id); + return -1; + } + + // Get the number of dimensions and size of each dimension + int ndims = H5Sget_simple_extent_ndims(dataspace_id); + std::vector dims(ndims); + H5Sget_simple_extent_dims(dataspace_id, dims.data(), nullptr); + + // Print dimensions + for (size_t i = 0; i < dims.size(); ++i) { + std::cout << dims[i] << (i < dims.size() - 1 ? 
" x " : "\n"); + } + + // Determine the datatype of the dataset + hid_t datatype_id = H5Dget_type(dataset_id); + // Get the size of the datatype + size_t datatype_size = H5Tget_size(datatype_id); + + + // Only handle floating-point data for this example + std::vector data; + if (datatype_size == sizeof(T)) { + // Allocate memory for the dataset + size_t total_elements = 1; + for (auto dim : dims) { + total_elements *= dim; + } + data.resize(total_elements); + // Read the dataset + if (H5Dread( + dataset_id, DataType, H5S_ALL, H5S_ALL, H5P_DEFAULT, data.data()) < + 0) + std::cerr << "Failed to read dataset: " << datasetName << std::endl; + } else { + std::cerr << "Unsupported data type for dataset: " << datasetName + << std::endl; + return -1; + } + + // Close HDF5 objects + H5Tclose(datatype_id); + H5Sclose(dataspace_id); + H5Dclose(dataset_id); + H5Fclose(file_id); + std::cout << "Read: " << std::string(data.begin(), data.end()); + return (data == correct_contents) ? 0 : -1; +} + + +int main(int argc, char* argv[]) +{ + if (argc != 4) { + std::cerr << "Wrong command line, correct one should be:\n"; + std::cerr << argv[0] + << " "; + } + std::string directory(argv[1]); + std::string domain_name(argv[2]); + std::string file_prefix(argv[3]); + std::string filename; + { + // Open non existing file + auto db = ams::db::hdf5DB(directory, domain_name, file_prefix, 0); + filename = db.getFilename(); + } + + std::cout << "Filename is:" << filename << "\n"; + // Check if file exists + if (!std::filesystem::exists(filename)) { + std::cout << "File does not exist." << std::endl; + return -1; + } + + // Open existing file + { + auto db = ams::db::hdf5DB(directory, domain_name, file_prefix, 0); + filename = db.getFilename(); + } + if (!std::filesystem::exists(filename)) { + std::cout << "File does not exist." 
<< std::endl; + return -1; + } + std::string dn("domain_name"); + std::vector _dn(domain_name.begin(), domain_name.end()); + return testReadHDF5Dataset(filename, dn, H5T_NATIVE_CHAR, _dn); +} diff --git a/tests/AMSlib/db/hdf5_store.cpp b/tests/AMSlib/db/hdf5_store.cpp new file mode 100644 index 00000000..7b18d050 --- /dev/null +++ b/tests/AMSlib/db/hdf5_store.cpp @@ -0,0 +1,223 @@ +#include +#include +#include + +#include +#include +#include +#include + +#include "wf/basedb.hpp" + +template +int testReadHDF5Dataset(const std::string& filePath, + const std::string& datasetName, + hid_t DataType, + std::vector correct_contents) +{ + // Open the HDF5 file + hid_t file_id = H5Fopen(filePath.c_str(), H5F_ACC_RDONLY, H5P_DEFAULT); + if (file_id < 0) { + std::cerr << "Failed to open HDF5 file: " << filePath << std::endl; + return -1; + } + + // Open the dataset + hid_t dataset_id = H5Dopen(file_id, datasetName.c_str(), H5P_DEFAULT); + if (dataset_id < 0) { + std::cerr << "Failed to open dataset: " << datasetName << std::endl; + H5Fclose(file_id); + return -1; + } + + // Get the dataspace of the dataset + hid_t dataspace_id = H5Dget_space(dataset_id); + if (dataspace_id < 0) { + std::cerr << "Failed to get dataspace for dataset: " << datasetName + << std::endl; + H5Dclose(dataset_id); + H5Fclose(file_id); + return -1; + } + + // Get the number of dimensions and size of each dimension + int ndims = H5Sget_simple_extent_ndims(dataspace_id); + std::vector dims(ndims); + H5Sget_simple_extent_dims(dataspace_id, dims.data(), nullptr); + + // Print dimensions + for (size_t i = 0; i < dims.size(); ++i) { + std::cout << dims[i] << (i < dims.size() - 1 ? 
" x " : "\n"); + } + + // Determine the datatype of the dataset + hid_t datatype_id = H5Dget_type(dataset_id); + // Get the size of the datatype + size_t datatype_size = H5Tget_size(datatype_id); + + + // Only handle floating-point data for this example + std::vector data; + if (datatype_size == sizeof(T)) { + // Allocate memory for the dataset + size_t total_elements = 1; + for (auto dim : dims) { + total_elements *= dim; + } + data.resize(total_elements); + // Read the dataset + if (H5Dread( + dataset_id, DataType, H5S_ALL, H5S_ALL, H5P_DEFAULT, data.data()) < + 0) + std::cerr << "Failed to read dataset: " << datasetName << std::endl; + } else { + std::cerr << "Unsupported data type for dataset: " << datasetName + << std::endl; + return -1; + } + + // Close HDF5 objects + H5Tclose(datatype_id); + H5Sclose(dataspace_id); + H5Dclose(dataset_id); + H5Fclose(file_id); + std::cout << "Read: " << std::string(data.begin(), data.end()); + return (data == correct_contents) ? 0 : -1; +} + + +#include +#include + +#include +#include +#include + +// Function to read a dataset and compare it with the expected tensor +bool verifyDatasetContents(const std::string& fileName, + const std::string& datasetName, + const std::vector& expectedTensors) +{ + // Open the HDF5 file + hid_t file_id = H5Fopen(fileName.c_str(), H5F_ACC_RDONLY, H5P_DEFAULT); + if (file_id < 0) { + throw std::runtime_error("Failed to open HDF5 file."); + } + + // Open the dataset + hid_t dset_id = H5Dopen2(file_id, datasetName.c_str(), H5P_DEFAULT); + if (dset_id < 0) { + H5Fclose(file_id); + throw std::runtime_error("Failed to open dataset."); + } + + // Get the dataspace + hid_t space_id = H5Dget_space(dset_id); + if (space_id < 0) { + H5Dclose(dset_id); + H5Fclose(file_id); + throw std::runtime_error("Failed to get dataspace."); + } + + // Get the dataset dimensions + int ndims = H5Sget_simple_extent_ndims(space_id); + if (ndims < 0) { + H5Sclose(space_id); + H5Dclose(dset_id); + H5Fclose(file_id); + throw 
std::runtime_error("Failed to get number of dimensions."); + } + + std::vector dims(ndims); + if (H5Sget_simple_extent_dims(space_id, dims.data(), NULL) < 0) { + H5Sclose(space_id); + H5Dclose(dset_id); + H5Fclose(file_id); + throw std::runtime_error("Failed to get dataset dimensions."); + } + + // Close dataspace + H5Sclose(space_id); + + // Flatten the dataset dimensions into a total size + size_t totalSize = 1; + for (const auto& dim : dims) { + totalSize *= dim; + } + + // Allocate a tensor to read the dataset + auto readTensor = + torch::empty({static_cast(totalSize)}, torch::kFloat); + + // Read the dataset into the tensor + herr_t status = H5Dread(dset_id, + H5T_NATIVE_FLOAT, + H5S_ALL, + H5S_ALL, + H5P_DEFAULT, + readTensor.data_ptr()); + if (status < 0) { + H5Dclose(dset_id); + H5Fclose(file_id); + throw std::runtime_error("Failed to read dataset."); + } + + // Close dataset and file + H5Dclose(dset_id); + H5Fclose(file_id); + + // Concatenate all expected tensors into one + auto expectedTensor = torch::cat(expectedTensors).flatten(); + + // Compare the tensors + if (!torch::allclose(readTensor, expectedTensor)) { + throw std::runtime_error( + "Dataset contents do not match the expected tensors."); + } + + std::cout << "Dataset contents match the expected tensors!" 
<< std::endl; + return true; +} + + +int main(int argc, char* argv[]) +{ + if (argc != 4) { + std::cerr << "Wrong command line, correct one should be:\n"; + std::cerr << argv[0] + << " "; + } + std::string directory(argv[1]); + std::string domain_name(argv[2]); + std::string file_prefix(argv[3]); + std::string filename; + + std::vector inputTensors, outputTensors; + for (int i = 0; i < 2; i++) { + { + // Scope it to automatically close C++ deconstructor and close file + auto db = ams::db::hdf5DB(directory, domain_name, file_prefix, 0); + torch::Tensor IData = + torch::rand({21, 4}, torch::TensorOptions().dtype(torch::kFloat)); + torch::Tensor OData = + torch::rand({21, 4}, torch::TensorOptions().dtype(torch::kFloat)); + + // Test 1. Open file and write data to it. + filename = db.getFilename(); + db.store(IData, OData); + inputTensors.emplace_back(std::move(IData)); + outputTensors.emplace_back(std::move(OData)); + } + if (!verifyDatasetContents(filename, "input_data", inputTensors) || + !verifyDatasetContents(filename, "output_data", outputTensors)) + return -1; + std::cout << ((i == 0) ? 
"Creating empty file and checking contents is " + "correct\n" + : "Opening existing file and checking contents is " + "correct\n"); + } + + std::string dn("domain_name"); + std::vector _dn(domain_name.begin(), domain_name.end()); + return testReadHDF5Dataset(filename, dn, H5T_NATIVE_CHAR, _dn); +} diff --git a/tests/AMSlib/debug_model.pt b/tests/AMSlib/debug_model.pt deleted file mode 100644 index 28b510db..00000000 Binary files a/tests/AMSlib/debug_model.pt and /dev/null differ diff --git a/tests/AMSlib/example_faiss.idx b/tests/AMSlib/example_faiss.idx deleted file mode 100644 index e3f07e66..00000000 Binary files a/tests/AMSlib/example_faiss.idx and /dev/null differ diff --git a/tests/AMSlib/faiss_debug.pt b/tests/AMSlib/faiss_debug.pt deleted file mode 100644 index 7ea0c6eb..00000000 Binary files a/tests/AMSlib/faiss_debug.pt and /dev/null differ diff --git a/tests/AMSlib/generate_constant_model.py b/tests/AMSlib/generate_constant_model.py deleted file mode 100644 index fbeb69af..00000000 --- a/tests/AMSlib/generate_constant_model.py +++ /dev/null @@ -1,64 +0,0 @@ -import torch -import os -import sys -import numpy as np -from torch.autograd import Variable -from torch import jit - -class ConstantModel(torch.nn.Module): - def __init__(self, inputSize, outputSize, constant): - super(ConstantModel, self).__init__() - self.linear = torch.nn.Linear(inputSize, outputSize) - self.linear.weight.data.fill_(0.0) - self.linear.bias.data.fill_(constant) - - def forward(self, x): - y = self.linear(x) - return y - -def main(args): - inputDim = int(args[1]) - outputDim = int(args[2]) - device = args[3] - enable_cuda = True - if device == "cuda": - enable_cuda = True - suffix = '_gpu' - elif device == "cpu": - enable_cuda = False - suffix = '_cpu' - - model = ConstantModel(inputDim, outputDim, 1.0).double() - if torch.cuda.is_available() and enable_cuda: - model = model.cuda() - - model.eval() - with torch.jit.optimized_execution(True): - traced = torch.jit.trace(model, 
(torch.randn(inputDim, dtype=torch.double).to(device), )) - traced.save(f"ConstantOneModel_{suffix}.pt") - - model = ConstantModel(inputDim, outputDim, 0.0).double() - if torch.cuda.is_available() and enable_cuda: - model = model.cuda() - - model.eval() - with torch.jit.optimized_execution(True): - traced = torch.jit.trace(model, (torch.randn(inputDim, dtype=torch.double).to(device), )) - traced.save(f"ConstantZeroModel_{suffix}.pt") - - inputs = Variable(torch.from_numpy(np.zeros((1, inputDim))).to(device)) - zero_model = jit.load(f"ConstantZeroModel_{suffix}.pt") - print("ZeroModel", zero_model(inputs)) - - one_model = jit.load(f"ConstantOneModel_{suffix}.pt") - print("OneModel", one_model(inputs)) - - - - -if __name__ == '__main__': - main(sys.argv) - - - - diff --git a/tests/AMSlib/generate_faiss.py b/tests/AMSlib/generate_faiss.py deleted file mode 100644 index 05d66ff1..00000000 --- a/tests/AMSlib/generate_faiss.py +++ /dev/null @@ -1,73 +0,0 @@ -#!/usr/bin/env python3 -# Copyright 2021-2023 Lawrence Livermore National Security, LLC and other -# AMSLib Project Developers -# -# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception - -import argparse -import numpy as np -import faiss # make faiss available - - -def create_elements(nb, d, distance=10, offset=0): - centers = [] - for i in range(nb): - c = ((i + 1) * distance) + offset - centers.append(np.random.uniform(low=c - 0.5, high=c + 0.5, size=(100, d))) - - xb = np.vstack(centers).astype("float32") - return xb - - -def main(): - parser = argparse.ArgumentParser( - description="Faiss index generator for test purposes" - ) - parser.add_argument( - "-d", "--dimensions", type=int, help="Dimensions of our vectors", default=4 - ) - parser.add_argument( - "-dst", - "--distance", - type=int, - help="Jump between centers in Faiss index", - default=10, - ) - parser.add_argument( - "-f", - "--file", - type=str, - help="File to store FAISS index", - default="faiss_debug.pt", - ) - parser.add_argument( - "-c", - 
"--num-centers", - type=int, - help="Number of centers in FAISS index", - default=10, - ) - args = parser.parse_args() - - xb = create_elements(args.num_centers, args.dimensions) - index = faiss.IndexFlatL2(args.dimensions) # build the index - index.add(xb) # add vectors to the index - faiss.write_index(index, args.file) - # print(xb) - - # k=1 - # xq_false = create_elements(args.num_centers, args.dimensions, args.distance, 5) - # xq_true = create_elements(args.num_centers,args.dimensions) - # xq = np.vstack((xq_false, xq_true)).astype('float32') - # print("Search for") - # print(xq) - # D, I = index.search(xq, k) - # print("Distance is", D) - # print("Indexes are", I) - - # predicate = np.any(D < 4, axis=1) - # print(predicate) - - -if __name__ == "__main__": - main() diff --git a/tests/AMSlib/generate_tupple_model.py b/tests/AMSlib/generate_tupple_model.py deleted file mode 100644 index 104a3537..00000000 --- a/tests/AMSlib/generate_tupple_model.py +++ /dev/null @@ -1,97 +0,0 @@ -import torch -import os -import sys -import numpy as np -from torch.autograd import Variable -from torch import jit -from torch import Tensor -from typing import Tuple - - -# Ugly code that expands the fake_uq to the shape we need as an output -def to_tupple(y: Tensor, fake_uq: Tensor) -> Tuple[Tensor, Tensor]: - outer_dim = y.shape[0] - fake_uq_dim = fake_uq.shape[0] - tmp = fake_uq.clone().detach() - additional_dims = torch.div(outer_dim, fake_uq_dim, rounding_mode="floor") + outer_dim % fake_uq_dim - final_shape = (additional_dims * fake_uq_dim, *fake_uq.shape[1:]) - tmp = tmp.unsqueeze(0) - my_list = [1] * len(fake_uq.shape) - new_dims = (additional_dims, *my_list) - tmp = tmp.repeat(new_dims) - tmp = tmp.reshape(final_shape) - std = tmp[: y.shape[0], ...] 
- return y, std - - -class TuppleModel(torch.nn.Module): - def __init__(self, inputSize, outputSize, fake_uq): - super(TuppleModel, self).__init__() - self.linear = torch.nn.Linear(inputSize, outputSize) - self.linear.weight.data.fill_(0.0) - self.linear.bias.data.fill_(0.0) - self.fake_uq = torch.nn.Parameter(fake_uq, requires_grad=False) - - def forward(self, x): - y = self.linear(x) - return to_tupple(y, self.fake_uq) - - -def main(args): - inputDim = int(args[1]) - outputDim = int(args[2]) - device = args[3] - uq_type = args[4] - precision = args[5] - output_name = args[6] - enable_cuda = True - if device == "cuda": - enable_cuda = True - suffix = "_gpu" - elif device == "cpu": - enable_cuda = False - suffix = "_cpu" - prec = torch.float32 - if precision == "double": - prec = torch.double - - fake_uq = torch.rand(2, outputDim, dtype=prec) - if uq_type == "mean": - # This sets odd uq to less than 0.5 - fake_uq[0, ...] *= 0.5 - # This sets even uq to larger than 0.5 - fake_uq[1, ...] = 0.5 + 0.5 * (fake_uq[1, ...]) - elif uq_type == "max": - max_val = torch.max(fake_uq, axis=1).values - scale = 0.49 / max_val - fake_uq *= scale.unsqueeze(0).T - fake_uq[0, int(outputDim / 2)] = 0.51 - else: - print("Unknown uq type") - sys.exit() - if precision == "double": - model = TuppleModel(inputDim, outputDim, fake_uq).double() - else: - model = TuppleModel(inputDim, outputDim, fake_uq) - - if torch.cuda.is_available() and enable_cuda: - model = model.cuda() - - model.eval() - - data = torch.randn(1023, inputDim, dtype=prec) - - with torch.jit.optimized_execution(True): - traced = torch.jit.trace(model, (torch.randn(inputDim, dtype=prec).to(device),)) - traced.save(f"{output_name}") - - data = torch.zeros(2, inputDim, dtype=prec) - inputs = Variable(data.to(device)) - model = jit.load(f"{output_name}") - model.eval() - with torch.no_grad(): - print("Ouput", model(inputs)) - - -if __name__ == "__main__": - main(sys.argv) diff --git a/tests/AMSlib/gpu_packing_test.cpp 
b/tests/AMSlib/gpu_packing_test.cpp deleted file mode 100644 index 63635ded..00000000 --- a/tests/AMSlib/gpu_packing_test.cpp +++ /dev/null @@ -1,132 +0,0 @@ -/* - * Copyright 2021-2023 Lawrence Livermore National Security, LLC and other - * AMSLib Project Developers - * - * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception - */ - -#include -#include -#include -#include - -#include "utils/allocator.hpp" -#include "utils/utils_data.hpp" - -#include "../utils.hpp" - -// All allocations and operations will happen on top of this 'SIZE'. -// Keep a small value for test to be 'fast'. -#define SIZE (3280) - -void initPredicate(bool* ptr, double* data, int size) -{ - for (int i = 0; i < size; i++) { - ptr[i] = i % 2 == 0; - data[i] = i; - } -} - -int verify(double* dense, int size) -{ - for (int i = 0; i < size; i++) { - if (dense[i] != i * 2) { - return 1; - } - } - return 0; -} - -int verify(bool* pred, double* d1, double* d2, int size) -{ - for (int i = 0; i < size; i++) { - if (pred[i] && d1[i] != d2[i]) { - std::cout << i << " " << d1[i] << " " << d2[i] << "\n"; - return 1; - } - } - return 0; -} - -int main(int argc, char* argv[]) -{ - using namespace ams; - installSignals(); - AMSInit(); - - using data_handler = DataHandler; - auto& rm = umpire::ResourceManager::getInstance(); - auto& ams_rm = ams::ResourceManager::getInstance(); - const size_t size = SIZE; - int dims = std::atoi(argv[1]); - - bool* h_predicate = - ams_rm.allocate(SIZE, - ResourceManager::ResourceType::HOST); - double* h_dense = ams_rm.allocate( - SIZE, ResourceManager::ResourceType::HOST); - double* h_sparse = ams_rm.allocate( - SIZE, ResourceManager::ResourceType::HOST); - double* h_rsparse = ams_rm.allocate( - SIZE, ResourceManager::ResourceType::HOST); - - initPredicate(h_predicate, h_sparse, SIZE); - - bool* predicate = ams_rm.allocate(SIZE); - double* dense = ams_rm.allocate(SIZE); - double* sparse = ams_rm.allocate(SIZE); - double* rsparse = ams_rm.allocate(SIZE); - int* reindex = 
ams_rm.allocate(SIZE); - - rm.copy(predicate, h_predicate); - rm.copy(sparse, h_sparse); - - std::vector s_data({sparse}); - std::vector sr_data({rsparse}); - std::vector d_data({dense}); - - int elements; - if (use_reindex) - elements = data_handler::pack(predicate, reindex, size, s_data, d_data); - else - elements = data_handler::pack(predicate, size, s_data, d_data); - - if (elements != (SIZE + 1) / 2) { - std::cout << "Did not compute dense number correctly(" << elements << ")\n"; - return 1; - } - - rm.copy(h_dense, dense); - if (verify(h_dense, elements)) { - std::cout << "Dense elements do not have the correct values\n"; - return 1; - } - - if (use_reindex) - data_handler::unpack(reindex, elements, d_data, sr_data); - else - data_handler::unpack(predicate, size, d_data, sr_data); - - rm.copy(h_rsparse, rsparse); - if (verify(h_predicate, h_sparse, h_rsparse, size)) { - std::cout << "Unpacking packed data does not match initial values\n"; - return 1; - } - - ams_rm.deallocate(predicate); - ams_rm.deallocate(h_predicate, - ResourceManager::ResourceType::HOST); - ams_rm.deallocate(dense); - ams_rm.deallocate(h_dense, - ResourceManager::ResourceType::HOST); - ams_rm.deallocate(sparse); - ams_rm.deallocate(h_sparse, - ResourceManager::ResourceType::HOST); - ams_rm.deallocate(rsparse); - ams_rm.deallocate(h_rsparse, - ResourceManager::ResourceType::HOST); - ams_rm.deallocate(reindex); - - AMSFinalize(); - return 0; -} diff --git a/tests/AMSlib/lb.cpp b/tests/AMSlib/lb.cpp deleted file mode 100644 index a48749a5..00000000 --- a/tests/AMSlib/lb.cpp +++ /dev/null @@ -1,112 +0,0 @@ -#include -#include - -#include -#include -#include -#include -#include -#include - -#include "wf/redist_load.hpp" - -#include "../utils.hpp" - -#define SIZE (10) - -void init(double *data, int elements, double value) -{ - for (int i = 0; i < elements; i++) { - data[i] = value; - } -} - -void evaluate(double *data, double *src, int elements) -{ - auto &rm = 
ams::ResourceManager::getInstance(); - rm.copy(src, data, elements * sizeof(double)); -} - -int verify(double *data, double *src, int elements, int rId) -{ - return std::memcmp(data, src, elements * sizeof(double)); -} - -int main(int argc, char *argv[]) -{ - using namespace ams; - int device = std::atoi(argv[1]); - - installSignals(); - AMSInit(); - - MPI_Init(&argc, &argv); - AMSSetupAllocator(AMSResourceType::AMS_HOST); - AMSResourceType resource = AMSResourceType::AMS_HOST; - AMSSetDefaultAllocator(AMSResourceType::AMS_HOST); - int rId, wS; - MPI_Comm_size(MPI_COMM_WORLD, &wS); - MPI_Comm_rank(MPI_COMM_WORLD, &rId); - srand(rId); - std::default_random_engine generator; - std::normal_distribution distribution(0.5, 0.3); - srand(rId); - double threshold; - for (int i = 0; i <= rId; i++) { - threshold = distribution(generator); - } - - int computeElements = (threshold * SIZE); // / sizeof(double); - - double *srcData, *destData; - double *srcHData = srcData = - ResourceManager::allocate(computeElements, - AMSResourceType::AMS_HOST); - double *destHData = destData = - ResourceManager::allocate(computeElements, - AMSResourceType::AMS_HOST); - - init(srcHData, computeElements, static_cast(rId)); - - if (device == 1) { - AMSSetupAllocator(AMSResourceType::AMS_DEVICE); - AMSSetDefaultAllocator(AMSResourceType::AMS_DEVICE); - resource = AMSResourceType::AMS_DEVICE; - srcData = ResourceManager::allocate(computeElements, - AMSResourceType::AMS_DEVICE); - destData = ResourceManager::allocate(computeElements, - AMSResourceType::AMS_DEVICE); - } - - std::vector inputs({srcData}); - std::vector outputs({destData}); - - { - - std::cerr << "Resource is " << resource << "\n"; - AMSLoadBalancer lBalancer( - rId, wS, computeElements, MPI_COMM_WORLD, 1, 1, resource); - lBalancer.scatterInputs(inputs, resource); - double **lbInputs = lBalancer.inputs(); - double **lbOutputs = lBalancer.outputs(); - evaluate(*lbOutputs, *lbInputs, lBalancer.getBalancedSize()); - 
lBalancer.gatherOutputs(outputs, resource); - } - - if (device == 1) { - ResourceManager::copy(destData, - destHData, - computeElements * sizeof(double)); - ResourceManager::deallocate(destData, AMSResourceType::AMS_DEVICE); - ResourceManager::deallocate(srcData, AMSResourceType::AMS_DEVICE); - } - - int ret = verify(destHData, srcHData, computeElements, rId); - - ResourceManager::deallocate(destHData, AMSResourceType::AMS_HOST); - ResourceManager::deallocate(srcHData, AMSResourceType::AMS_HOST); - - MPI_Finalize(); - AMSFinalize(); - return ret; -} diff --git a/tests/AMSlib/linear_scripted_cpu.pt b/tests/AMSlib/linear_scripted_cpu.pt deleted file mode 100644 index fe67b26d..00000000 Binary files a/tests/AMSlib/linear_scripted_cpu.pt and /dev/null differ diff --git a/tests/AMSlib/models/CMakeLists.txt b/tests/AMSlib/models/CMakeLists.txt new file mode 100644 index 00000000..c365e1c0 --- /dev/null +++ b/tests/AMSlib/models/CMakeLists.txt @@ -0,0 +1,82 @@ +# Output files for generated models (example: .pt files) + +set(TORCH_MODEL_DIR "${CMAKE_CURRENT_BINARY_DIR}" CACHE STRING "Directory to store torch generated models") + +set(GENERATED_CPU_MODELS + ${CMAKE_CURRENT_BINARY_DIR}/double_cpu_duq_max.pt + ${CMAKE_CURRENT_BINARY_DIR}/double_cpu_duq_mean.pt + ${CMAKE_CURRENT_BINARY_DIR}/double_cpu_random.pt + ${CMAKE_CURRENT_BINARY_DIR}/single_cpu_duq_max.pt + ${CMAKE_CURRENT_BINARY_DIR}/single_cpu_duq_mean.pt + ${CMAKE_CURRENT_BINARY_DIR}/single_cpu_random.pt + + ${CMAKE_CURRENT_BINARY_DIR}/linear_traced_double_cpu_duq_max.pt + ${CMAKE_CURRENT_BINARY_DIR}/linear_traced_double_cpu_duq_mean.pt + ${CMAKE_CURRENT_BINARY_DIR}/linear_traced_double_cpu_random.pt + ${CMAKE_CURRENT_BINARY_DIR}/linear_traced_single_cpu_duq_max.pt + ${CMAKE_CURRENT_BINARY_DIR}/linear_traced_single_cpu_duq_mean.pt + ${CMAKE_CURRENT_BINARY_DIR}/linear_traced_single_cpu_random.pt + + ${CMAKE_CURRENT_BINARY_DIR}/linear_scripted_double_cpu_duq_max.pt + 
${CMAKE_CURRENT_BINARY_DIR}/linear_scripted_double_cpu_duq_mean.pt + ${CMAKE_CURRENT_BINARY_DIR}/linear_scripted_double_cpu_random.pt + ${CMAKE_CURRENT_BINARY_DIR}/linear_scripted_single_cpu_duq_max.pt + ${CMAKE_CURRENT_BINARY_DIR}/linear_scripted_single_cpu_duq_mean.pt + ${CMAKE_CURRENT_BINARY_DIR}/linear_scripted_single_cpu_random.pt + + +) + +# Custom command to generate models +add_custom_command( + OUTPUT ${GENERATED_CPU_MODELS} + COMMAND ${CMAKE_CURRENT_SOURCE_DIR}/generate.sh ${CMAKE_CURRENT_BINARY_DIR}/ "cpu" + DEPENDS "${CMAKE_CURRENT_SOURCE_DIR}/generate.sh;${CMAKE_CURRENT_SOURCE_DIR}/generate.py;${CMAKE_CURRENT_SOURCE_DIR}/generate_linear_model.py" + COMMENT "Generating PyTorch models..." +) + +# Define a target to generate models +add_custom_target(generate_cpu_models ALL + DEPENDS ${GENERATED_CPU_MODELS} + COMMENT "Generate cpu models before ctests..." +) + +if (WITH_CUDA) +set(GENERATED_GPU_MODELS + ${CMAKE_CURRENT_BINARY_DIR}/double_gpu_duq_max.pt + ${CMAKE_CURRENT_BINARY_DIR}/double_gpu_duq_mean.pt + ${CMAKE_CURRENT_BINARY_DIR}/double_gpu_random.pt + ${CMAKE_CURRENT_BINARY_DIR}/single_gpu_duq_max.pt + ${CMAKE_CURRENT_BINARY_DIR}/single_gpu_duq_mean.pt + ${CMAKE_CURRENT_BINARY_DIR}/single_gpu_random.pt + + ${CMAKE_CURRENT_BINARY_DIR}/linear_traced_double_gpu_duq_max.pt + ${CMAKE_CURRENT_BINARY_DIR}/linear_traced_double_gpu_duq_mean.pt + ${CMAKE_CURRENT_BINARY_DIR}/linear_traced_double_gpu_random.pt + ${CMAKE_CURRENT_BINARY_DIR}/linear_traced_single_gpu_duq_max.pt + ${CMAKE_CURRENT_BINARY_DIR}/linear_traced_single_gpu_duq_mean.pt + ${CMAKE_CURRENT_BINARY_DIR}/linear_traced_single_gpu_random.pt + + ${CMAKE_CURRENT_BINARY_DIR}/linear_scripted_double_gpu_duq_max.pt + ${CMAKE_CURRENT_BINARY_DIR}/linear_scripted_double_gpu_duq_mean.pt + ${CMAKE_CURRENT_BINARY_DIR}/linear_scripted_double_gpu_random.pt + ${CMAKE_CURRENT_BINARY_DIR}/linear_scripted_single_gpu_duq_max.pt + ${CMAKE_CURRENT_BINARY_DIR}/linear_scripted_single_gpu_duq_mean.pt + 
${CMAKE_CURRENT_BINARY_DIR}/linear_scripted_single_gpu_random.pt +) + +# Custom command to generate models +add_custom_command( + OUTPUT ${GENERATED_GPU_MODELS} + COMMAND ${CMAKE_CURRENT_SOURCE_DIR}/generate.sh ${CMAKE_CURRENT_BINARY_DIR}/ "gpu" + DEPENDS "${CMAKE_CURRENT_SOURCE_DIR}/generate.sh;${CMAKE_CURRENT_SOURCE_DIR}/generate.py" + COMMENT "Generating PyTorch models..." +) + +# Define a target to generate models +add_custom_target(generate_gpu_models ALL + DEPENDS ${GENERATED_GPU_MODELS} + COMMENT "Generate gpu models before ctests..." +) +endif() + diff --git a/tests/AMSlib/models/generate.py b/tests/AMSlib/models/generate.py new file mode 100644 index 00000000..fd042416 --- /dev/null +++ b/tests/AMSlib/models/generate.py @@ -0,0 +1,127 @@ +import torch +import sys +import torch.nn as nn +import argparse +from torch import Tensor +from typing import Tuple + + +# Ugly code that expands the fake_uq to the shape we need as an output +def to_tupple(y: Tensor, fake_uq: Tensor) -> Tuple[Tensor, Tensor]: + outer_dim = y.shape[0] + fake_uq_dim = fake_uq.shape[0] + tmp = fake_uq.clone().detach() + additional_dims = torch.div(outer_dim, fake_uq_dim, rounding_mode="floor") + outer_dim % fake_uq_dim + final_shape = (additional_dims * fake_uq_dim, *fake_uq.shape[1:]) + tmp = tmp.unsqueeze(0) + my_list = [1] * len(fake_uq.shape) + new_dims = (additional_dims, *my_list) + tmp = tmp.repeat(new_dims) + tmp = tmp.reshape(final_shape) + std = tmp[: y.shape[0], ...] 
+ return y, std + + +class TuppleModel(torch.nn.Module): + def __init__(self, inputSize, outputSize, fake_uq): + super(TuppleModel, self).__init__() + self.linear = torch.nn.Linear(inputSize, outputSize, False) + self.fake_uq = torch.nn.Parameter(fake_uq, requires_grad=False) + self.initialize_weights() + + def initialize_weights(self): + # Check if in_features == out_features for identity initialization + if self.linear.weight.shape[0] == self.linear.weight.shape[1]: + nn.init.eye_(self.linear.weight) # Initialize with identity matrix + else: + raise ValueError("Identity initialization requires in_features == out_features") + + def forward(self, x): + y = self.linear(x) + return to_tupple(y, self.fake_uq) + + +# Define a simple model +class SimpleModel(nn.Module): + def __init__(self, in_features, out_features): + super(SimpleModel, self).__init__() + self.fc = nn.Linear(in_features, out_features, False) + self.initialize_weights() + + def initialize_weights(self): + # Check if in_features == out_features for identity initialization + if self.fc.weight.shape[0] == self.fc.weight.shape[1]: + nn.init.eye_(self.fc.weight) # Initialize with identity matrix + else: + raise ValueError("Identity initialization requires in_features == out_features") + + def forward(self, x): + return self.fc(x) + + +def main(): + # Parse command-line arguments + parser = argparse.ArgumentParser(description="Generate and save a scripted model.") + parser.add_argument("precision", choices=["single", "double"], help="Model precision: 'single' or 'double'.") + parser.add_argument("device", choices=["cpu", "gpu"], help="Device: 'cpu' or 'gpu'.") + parser.add_argument("directory", type=str, help="Directory to save the model.") + parser.add_argument("uq", choices=["random", "duq_mean", "duq_max"], help="The UQ Type to use") + args = parser.parse_args() + + # Initialize model + if args.uq == "duq_mean": + fake_uq = torch.rand(2, 8) + # This sets odd uq to less than 0.5 + fake_uq[0, ...] 
*= 0.5 + # This sets even uq to larger than 0.5 + fake_uq[1, ...] = 0.5 + 0.5 * (fake_uq[1, ...]) + model = TuppleModel(8, 8, fake_uq) + elif args.uq == "duq_max": + fake_uq = torch.rand(2, 8) + max_val = torch.max(fake_uq, axis=1).values + scale = 0.49 / max_val + fake_uq *= scale.unsqueeze(0).T + fake_uq[1, 2] = 0.51 + model = TuppleModel(8, 8, fake_uq) + elif args.uq == "random": + model = SimpleModel(8, 8) + else: + sys.exit(-1) + print("I am missing valid uq method") + + # Set the precision based on command-line argument + if args.precision == "single": + model = model.float() # Set to single precision (float32) + prec = torch.float32 + elif args.precision == "double": + model = model.double() # Set to double precision (float64) + prec = torch.float64 + + # Set the device based on command-line argument + if args.device == "gpu" and torch.cuda.is_available(): + device = torch.device("cuda") + else: + device = torch.device("cpu") + + # Move model to the appropriate device + model.to(device) + + # Create example input tensor + example_input = torch.randn(2, 8, device=device, dtype=prec) + + # Trace the model + scripted_model = torch.jit.trace(model, example_input) + + # Generate the file name + file_name = f"{args.precision}_{args.device}_{args.uq}.pt" + file_path = f"{args.directory}/{file_name}" + + # Save the scripted model + scripted_model.save(file_path) + + print(f"Model saved to {file_path}") + + +if __name__ == "__main__": + main() + sys.exit(0) diff --git a/tests/AMSlib/models/generate.sh b/tests/AMSlib/models/generate.sh new file mode 100755 index 00000000..8379f16d --- /dev/null +++ b/tests/AMSlib/models/generate.sh @@ -0,0 +1,33 @@ +#!/bin/bash + +if [[ $# -ne 2 ]]; then + echo "Expecting at least 2 CLI argument" + echo "$0 cpu|gpu" + exit +fi + +directory=$1 +device=$2 +root_dir=$(dirname $0) + +mkdir -p $directory +echo $device + +python ${root_dir}/generate.py single $device ${directory} duq_mean +python ${root_dir}/generate.py single $device 
${directory} duq_max +python ${root_dir}/generate.py single $device ${directory} random + +python ${root_dir}/generate.py double $device ${directory} duq_mean +python ${root_dir}/generate.py double $device ${directory} duq_max +python ${root_dir}/generate.py double $device ${directory} random + + +python ${root_dir}/generate_linear_model.py single $device ${directory} duq_mean 8 9 +python ${root_dir}/generate_linear_model.py single $device ${directory} duq_max 8 9 +python ${root_dir}/generate_linear_model.py single $device ${directory} random 8 9 + +python ${root_dir}/generate_linear_model.py double $device ${directory} duq_mean 8 9 +python ${root_dir}/generate_linear_model.py double $device ${directory} duq_max 8 9 +python ${root_dir}/generate_linear_model.py double $device ${directory} random 8 9 + + diff --git a/tests/AMSlib/models/generate_linear_model.py b/tests/AMSlib/models/generate_linear_model.py new file mode 100644 index 00000000..7a5657f5 --- /dev/null +++ b/tests/AMSlib/models/generate_linear_model.py @@ -0,0 +1,64 @@ +import torch +import sys +import numpy as np +import math +import argparse + + +class linearRegression(torch.nn.Module): + def __init__(self, inputSize, outputSize): + super(linearRegression, self).__init__() + self.linear = torch.nn.Linear(inputSize, outputSize, bias=True) + + def forward(self, x): + y = self.linear(x) + return y + + +def main(args): + parser = argparse.ArgumentParser(description="Generate and save a scripted model.") + parser.add_argument("precision", choices=["single", "double"], help="Model precision: 'single' or 'double'.") + parser.add_argument("device", choices=["cpu", "gpu"], help="Device: 'cpu' or 'gpu'.") + parser.add_argument("directory", type=str, help="Directory to save the model.") + parser.add_argument("uq", choices=["random", "duq_mean", "duq_max"], help="The UQ Type to use (this is ignored a.t.m)") + parser.add_argument("inputDim", type=int, help="The dimensions of the input data") + 
parser.add_argument("outputDim", type=int, help="the dimensions of the output data") + args = parser.parse_args() + + model = linearRegression(args.inputDim, args.outputDim) + + # Set the precision based on command-line argument + if args.precision == "single": + model = model.float() # Set to single precision (float32) + prec = torch.float32 + elif args.precision == "double": + model = model.double() # Set to double precision (float64) + prec = torch.float64 + + # Set the device based on command-line argument + if args.device == "gpu" and torch.cuda.is_available(): + device = torch.device("cuda") + model = model.cuda() + else: + device = torch.device("cpu") + + model.eval() + + x = torch.rand((1, args.inputDim), device=device, dtype=prec) + y_before_jit = model(x) + + # Generate the file name + file_name = f"{args.precision}_{args.device}_{args.uq}.pt" + + + with torch.jit.optimized_execution(True): + scripted = torch.jit.script(model) + file_path = f"{args.directory}/linear_scripted_{file_name}" + scripted.save(file_path) + file_path = f"{args.directory}/linear_traced_{file_name}" + traced = torch.jit.trace(model, (torch.randn(args.inputDim, dtype=prec).to(device),)) + traced.save(file_path) + + +if __name__ == "__main__": + main(sys.argv) diff --git a/tests/AMSlib/perf_regression/CMakeLists.txt b/tests/AMSlib/perf_regression/CMakeLists.txt new file mode 100644 index 00000000..0a0a56b6 --- /dev/null +++ b/tests/AMSlib/perf_regression/CMakeLists.txt @@ -0,0 +1,33 @@ +function(BUILD_TEST exe source) + add_executable(${exe} ${source}) + target_include_directories(${exe} PRIVATE "${PROJECT_SOURCE_DIR}/src/AMSlib/" "${PROJECT_SOURCE_DIR}/src/AMSlib/include" ${caliper_INCLUDE_DIR} ${MPI_INCLUDE_PATH}) + target_compile_definitions(${exe} PRIVATE ${AMS_APP_DEFINES}) + message("On test defines are ${AMS_APP_DEFINES}") + target_link_libraries(${exe} PRIVATE AMS torch) + if (WITH_HDF5) + target_link_libraries(${exe} PRIVATE ${AMS_HDF5_TARGET}) + endif() + if (WITH_CALIPER) 
+ target_link_libraries(${exe} PRIVATE caliper) + endif() + if (WITH_RMQ) + target_link_libraries(${exe} PRIVATE amqpcpp) + if (OPENSSL_FOUND) + target_link_libraries(${exe} PRIVATE OpenSSL::SSL OpenSSL::Crypto) + endif() + # NOTE: We set here the event/event pthreads as public. As there is no easy way + # to do a find package(libevent) and RMQ is not exposing that properly. + target_link_libraries(${exe} PRIVATE ${LIBEVENT_LIBRARY} ${LIBEVENT_THREAD}) + endif() + + if (WITH_MPI) + target_link_libraries(${exe} PRIVATE MPI::MPI_CXX) + endif() +endfunction() + + +# AMS Database benchmark (RMQ and/or HDF5 + MPI / No ML models used) +BUILD_TEST(ams_benchmark_db ams_bench_db.cpp) +# The AMS DB Benchmark requires mfem +# TODO: Remove mfem requirement from the benchmark +target_link_libraries(ams_benchmark_db PRIVATE AMS ${AMS_EXAMPLE_LIBRARIES}) diff --git a/tests/AMSlib/ams_bench_db.cpp b/tests/AMSlib/perf_regression/ams_bench_db.cpp similarity index 55% rename from tests/AMSlib/ams_bench_db.cpp rename to tests/AMSlib/perf_regression/ams_bench_db.cpp index 9efc1444..6b729dc7 100644 --- a/tests/AMSlib/ams_bench_db.cpp +++ b/tests/AMSlib/perf_regression/ams_bench_db.cpp @@ -6,45 +6,23 @@ #include #endif +#include + #include -#include #include +#include #include -#include +#include #include +#include #include - -#include - -#include "AMS.h" +#include +#include #include "../utils.hpp" +#include "AMS.h" +#include "macro.h" -AMSDType getDataType(const char *d_type) -{ - AMSDType dType = AMSDType::AMS_DOUBLE; - if (std::strcmp(d_type, "float") == 0) { - dType = AMSDType::AMS_SINGLE; - } else if (d_type == "double") { - dType = AMSDType::AMS_DOUBLE; - } else { - assert(false && "Unknown data type (must be 'float' or 'double')"); - } - return dType; -} - -AMSDBType getDBType(const char *db_type) -{ - AMSDBType dbType = AMSDBType::AMS_NONE; - if (std::strcmp(db_type, "csv") == 0) { - dbType = AMSDBType::AMS_CSV; - } else if (std::strcmp(db_type, "hdf5") == 0) { - dbType = 
AMSDBType::AMS_HDF5; - } else if (std::strcmp(db_type, "rmq") == 0) { - dbType = AMSDBType::AMS_RMQ; - } - return dbType; -} template struct Problem { @@ -71,7 +49,7 @@ struct Problem { std::this_thread::sleep_for(std::chrono::milliseconds(sleep_msec)); } - const DType *initialize_inputs(DType *inputs, long length) + DType *initialize_inputs(DType *inputs, long length) { for (int i = 0; i < length; i++) { inputs[i] = static_cast(i); @@ -84,65 +62,83 @@ struct Problem { int iterations, int num_elements) { - CALIPER(CALI_CXX_MARK_FUNCTION;) + CALIPER(CALI_CXX_MARK_FUNCTION); CALIPER(CALI_CXX_MARK_LOOP_BEGIN(mainloop_id, "mainloop");) for (int i = 0; i < iterations; i++) { std::cout << "Iteration [" << i << "]\n"; - CALIPER(CALI_CXX_MARK_LOOP_ITERATION(mainloop_id, i);) - int elements = num_elements; // * ((DType)(rand()) / RAND_MAX) + 1; - std::vector inputs; - std::vector outputs; + + SmallVector input_tensors; + SmallVector output_tensors; // Allocate Input memory for (int j = 0; j < num_inputs; j++) { - DType *data = new DType[elements]; - inputs.push_back(initialize_inputs(data, elements)); + DType *data = new DType[num_elements]; + DType *ptr = initialize_inputs(data, num_elements); + input_tensors.push_back(AMSTensor::view( + ptr, + SmallVector({num_elements, 1}), + SmallVector({1, 1}), + resource)); } // Allocate Output memory for (int j = 0; j < num_outputs; j++) { - outputs.push_back(new DType[elements]); + auto tmp = new DType[num_elements]; + output_tensors.push_back(AMSTensor::view( + initialize_inputs(tmp, num_elements), + SmallVector({num_elements, 1}), + SmallVector({1, 1}), + resource)); } - AMSExecute(wf, - (void *)this, - elements, - reinterpret_cast(inputs.data()), - reinterpret_cast(outputs.data()), - inputs.size(), - outputs.size()); - - for (int i = 0; i < num_outputs; i++) { - delete[] outputs[i]; - outputs[i] = nullptr; + EOSLambda OrigComputation = + [&](const ams::SmallVector &ams_ins, + ams::SmallVector &ams_inouts, + ams::SmallVector 
&ams_outs) { + DType *ins[num_inputs]; + DType *outs[num_outputs]; + if (num_inputs != ams_ins.size()) + throw std::runtime_error( + "Expecting dimensions of inputs to remain the same"); + else if (num_outputs != ams_outs.size()) + throw std::runtime_error( + "Expecting dimensions of outputs to remain the same"); + + // Here I can use domain knowledge (inouts is empty) + int num_elements = ams_ins[0].shape()[0]; + for (int i = 0; i < num_inputs; i++) { + ins[i] = ams_ins[i].data(); + if (ams_ins[i].shape()[0] != num_elements) + throw std::runtime_error( + "Expected tensors to have the same shape"); + } + for (int i = 0; i < num_outputs; i++) { + outs[i] = ams_outs[i].data(); + if (ams_outs[i].shape()[0] != num_elements) + throw std::runtime_error( + "Expected tensors to have the same shape"); + } + run(num_elements, ins, outs); + }; + + ams::SmallVector inouts; + AMSExecute(wf, OrigComputation, input_tensors, inouts, output_tensors); + + for (int i = 0; i < input_tensors.size(); i++) { + delete input_tensors[i].data(); } - for (int i = 0; i < num_inputs; i++) { - delete[] inputs[i]; - inputs[i] = nullptr; + + + for (int i = 0; i < output_tensors.size(); i++) { + delete output_tensors[i].data(); } + CALIPER(CALI_CXX_MARK_LOOP_ITERATION(mainloop_id, i);) } - CALIPER(CALI_CXX_MARK_LOOP_END(mainloop_id);) } }; -void callBackDouble(void *cls, long elements, void **inputs, void **outputs) -{ - std::cout << " > Called the double precision model\n"; - static_cast *>(cls)->run(elements, - (double **)(inputs), - (double **)(outputs)); -} - -void callBackSingle(void *cls, long elements, void **inputs, void **outputs) -{ - std::cout << " > Called the single precision model\n"; - static_cast *>(cls)->run(elements, - (float **)(inputs), - (float **)(outputs)); -} - int main(int argc, char **argv) { // Number of ranks in this run @@ -164,10 +160,10 @@ int main(int argc, char **argv) std::cout.setstate(std::ios::failbit); } - const char *device_name = "cpu"; - const char 
*db_config = ""; - const char *db_type = ""; - const char *precision_opt = "double"; + std::string device_name = "cpu"; + std::string db_config = ""; + std::string db_type = ""; + std::string precision_opt = "double"; int seed = 0; int num_elems = 1024; @@ -180,7 +176,7 @@ int main(int argc, char **argv) // ------------------------------------------------------------------------- // setup command line parser // ------------------------------------------------------------------------- - mfem::OptionsParser args(argc, argv); + TestArgs args; args.AddOption(&device_name, "-d", "--device", @@ -215,28 +211,22 @@ int main(int argc, char **argv) "-dt", "--dbtype", "Configuration option of the different DB types:\n" - "\t 'csv': use CSV as a back end\n" "\t 'hdf5': use HDF5 as a back end\n" "\t 'rmq': use RabbitMQ as a back end\n"); - args.AddOption(&verbose, - "-v", - "--verbose", - "-qu", - "--quiet", - "Enable more verbose benchmark"); + args.AddOption(&verbose, "-v", "--verbose", "Enable more verbose benchmark"); // ------------------------------------------------------------------------- // parse arguments // ------------------------------------------------------------------------- - args.Parse(); + args.Parse(argc, argv); if (!args.Good()) { - args.PrintUsage(std::cout); + args.PrintOptions(); return -1; } if (rId == 0) { - args.PrintOptions(std::cout); + args.PrintUsage(); std::cout << std::endl; } @@ -261,9 +251,9 @@ int main(int argc, char **argv) // AMS allocators setup // ------------------------------------------------------------------------- AMSResourceType resource = AMSResourceType::AMS_HOST; - const bool use_device = std::strcmp(device_name, "cpu") != 0; + const bool use_device = device_name == "cpu"; if (use_device) { -#ifdef __ENABLE_CUDA__ +#ifdef __AMS_ENABLE_CUDA__ resource = AMSResourceType::AMS_DEVICE; #else std::cerr << "Error: Benchmark has not been compiled with CUDA support\n"; @@ -275,55 +265,24 @@ int main(int argc, char **argv) 
AMSUQPolicy::AMS_RANDOM, 0.5, "", - "", - "bench_db_no_model", - 1); + "bench_db_no_model"); + std::cout << "Total elements across all " << wS << " ranks: " << wS * num_elems << "\n"; std::cout << "Total elements per rank: " << num_elems << "\n"; + AMSExecutor wf = AMSCreateExecutor(ams_model, rId, wS); if (data_type == AMSDType::AMS_SINGLE) { Problem prob(num_inputs, num_outputs, sleep_msec); -#ifdef __ENABLE_MPI__ - AMSExecutor wf = AMSCreateDistributedExecutor(ams_model, - AMSDType::AMS_SINGLE, - resource, - (AMSPhysicFn)callBackSingle, - MPI_COMM_WORLD, - rId, - wS); -#else - AMSExecutor wf = AMSCreateExecutor(ams_model, - AMSDType::AMS_SINGLE, - resource, - (AMSPhysicFn)callBackSingle, - rId, - wS); -#endif prob.ams_run(wf, resource, num_iterations, num_elems); } else { Problem prob(num_inputs, num_outputs, sleep_msec); -#ifdef __ENABLE_MPI__ - AMSExecutor wf = AMSCreateDistributedExecutor(ams_model, - AMSDType::AMS_DOUBLE, - resource, - (AMSPhysicFn)callBackDouble, - MPI_COMM_WORLD, - rId, - wS); -#else - AMSExecutor wf = AMSCreateExecutor(ams_model, - AMSDType::AMS_DOUBLE, - resource, - (AMSPhysicFn)callBackDouble, - rId, - wS); -#endif + prob.ams_run(wf, resource, num_iterations, num_elems); prob.ams_run(wf, resource, num_iterations, num_elems); } MPI_CALL(MPI_Finalize()); AMSFinalize(); return 0; -} \ No newline at end of file +} diff --git a/tests/AMSlib/test_hdcache.cpp b/tests/AMSlib/test_hdcache.cpp deleted file mode 100644 index 27c82155..00000000 --- a/tests/AMSlib/test_hdcache.cpp +++ /dev/null @@ -1,188 +0,0 @@ -/* - * Copyright 2021-2023 Lawrence Livermore National Security, LLC and other - * AMSLib Project Developers - * - * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception - */ - -#include - -#include -#include -#include -#include -#include -#include -#include -#include - -#include "../utils.hpp" - -template -std::vector generate_vectors(const int num_clusters, - int elements, - int dims) -{ - std::vector v_data; - auto &rm = 
ams::ResourceManager::getInstance(); - // This are fixed to mimic the way the faiss was generated - // The code below generates data values that are either within - // the distance of the faiss index or just outside of it. - const T distance = 10.0; - const T offset = 5.0; - for (int i = 0; i < dims; i++) { - T *data = - rm.allocate(num_clusters * elements, AMSResourceType::AMS_HOST); - for (int j = 0; j < elements; j++) { - // Generate a value for every cluster center - for (int k = 0; k < num_clusters; k++) { - T tmp = ((T)rand()) / INT_MAX; - tmp += (k + 1) * num_clusters; - if ((j % 2) == 0) { - tmp += offset; - } - data[j * num_clusters + k] = tmp; - } - } - v_data.push_back(data); - } - return std::move(v_data); -} - -template -void print_vectors(std::vector &vec, int num_elements, int num_clusters) -{ - for (int i = 0; i < num_elements; i++) { - for (int c = 0; c < num_clusters; c++) { - for (auto v : vec) { - std::cout << v[i * num_clusters + c] << ":"; - } - std::cout << "\n"; - } - } -} - - -bool validate(const int num_clusters, const int elements, bool *predicates) -{ - bool res = true; - for (int j = 0; j < elements; j++) { - // Generate a value for every cluster center - for (int k = 0; k < num_clusters; k++) { - if (j % 2 == 0 && predicates[j * num_clusters + k] == true) { - res = false; - } else if (j % 2 == 1 && predicates[j * num_clusters + k] == false) { - res = false; - } - } - } - return res; -} - -template -bool do_faiss(std::shared_ptr> &index, - AMSResourceType resource, - int nClusters, - int nDims, - int nElements, - float threshold) -{ - - std::vector orig_data = - generate_vectors(nClusters, nElements, nDims); - std::vector data = orig_data; - auto &rm = ams::ResourceManager::getInstance(); - - bool *predicates = rm.allocate(nClusters * nElements, resource); - - if (resource == AMSResourceType::AMS_DEVICE) { - for (int i = 0; i < orig_data.size(); i++) { - T *d_data = rm.allocate(nClusters * nElements, resource); - 
rm.copy(const_cast(orig_data[i]), - AMSResourceType::AMS_HOST, - d_data, - AMSResourceType::AMS_DEVICE, - nClusters * nElements); - data[i] = d_data; - } - } - - - index->evaluate(nClusters * nElements, data, predicates); - - bool *h_predicates = predicates; - - if (resource == AMSResourceType::AMS_DEVICE) { - h_predicates = - rm.allocate(nClusters * nElements, AMSResourceType::AMS_HOST); - rm.copy(predicates, - AMSResourceType::AMS_DEVICE, - h_predicates, - AMSResourceType::AMS_HOST, - nClusters * nElements); - for (auto d : data) { - rm.deallocate(const_cast(d), AMSResourceType::AMS_DEVICE); - } - rm.deallocate(predicates, AMSResourceType::AMS_DEVICE); - } - - - for (auto h_d : orig_data) - rm.deallocate(const_cast(h_d), AMSResourceType::AMS_HOST); - - bool res = validate(nClusters, nElements, h_predicates); - - rm.deallocate(h_predicates, AMSResourceType::AMS_HOST); - return res; -} - - -int main(int argc, char *argv[]) -{ - using namespace ams; - installSignals(); - AMSInit(); - - if (argc < 8) { - std::cerr << "Wrong CLI\n"; - std::cerr << argv[0] - << " 'use device' 'path to faiss' 'data type (double|float)' " - "'UQPolicy (0:Mean, 1:Max)' 'Num Clusters' 'Threshold' " - "'number of dimensions' 'num elements'"; - abort(); - } - auto &rm = umpire::ResourceManager::getInstance(); - int use_device = std::atoi(argv[1]); - char *faiss_path = argv[2]; - char *data_type = argv[3]; - AMSUQPolicy uq_policy = static_cast(std::atoi(argv[4])); - int nClusters = std::atoi(argv[5]); - float threshold = std::atoi(argv[6]); - int nDims = std::atoi(argv[7]); - int nElements = std::atoi(argv[8]); - - AMSResourceType resource = AMSResourceType::AMS_HOST; - if (use_device == 1) resource = AMSResourceType::AMS_DEVICE; - - auto &ams_rm = ResourceManager::getInstance(); - ams_rm.init(); - - if (std::strcmp("double", data_type) == 0) { - std::shared_ptr> cache = HDCache::getInstance( - faiss_path, resource, uq_policy, 10, threshold); - bool result = - do_faiss(cache, resource, 
nClusters, nDims, nElements, threshold); - cache.reset(); - return !result; - } else if (std::strcmp("single", data_type) == 0) { - std::shared_ptr> cache = HDCache::getInstance( - faiss_path, resource, uq_policy, 10, threshold); - bool result = - do_faiss(cache, resource, nClusters, nDims, nElements, threshold); - cache.reset(); - return !result; - } - - AMSFinalize(); - return 0; -} diff --git a/tests/AMSlib/torch.duq b/tests/AMSlib/torch.duq deleted file mode 100644 index 58ee24fc..00000000 Binary files a/tests/AMSlib/torch.duq and /dev/null differ diff --git a/tests/AMSlib/torch.duq.cuda b/tests/AMSlib/torch.duq.cuda deleted file mode 100644 index 1a80a5a1..00000000 Binary files a/tests/AMSlib/torch.duq.cuda and /dev/null differ diff --git a/tests/AMSlib/torch/CMakeLists.txt b/tests/AMSlib/torch/CMakeLists.txt new file mode 100644 index 00000000..710e8076 --- /dev/null +++ b/tests/AMSlib/torch/CMakeLists.txt @@ -0,0 +1,112 @@ + +function(ADD_TORCH_UNIT_TEST name exec) + add_test(NAME ${name} COMMAND ${exec} ${ARGN}) + set_tests_properties(${name} PROPERTIES LABELS TORCH_UNIT_TEST) +endfunction() + + +function(BUILD_UNIT_TEST exe source) + add_executable(${exe} ${source}) + add_dependencies(${exe} generate_cpu_models) + + if (WITH_CUDA) + add_dependencies(${exe} generate_gpu_models) + target_link_libraries(${exe} PRIVATE CUDA::cudart) + endif() + + target_include_directories(${exe} PRIVATE ${CMAKE_SOURCE_DIR}/src/AMSlib/) + target_include_directories(${exe} PRIVATE ${CMAKE_BINARY_DIR}/include/) + target_link_libraries(${exe} PRIVATE stdc++fs AMS torch) + if (WITH_HDF5) + target_link_libraries(${exe} PRIVATE ${AMS_HDF5_TARGET}) + endif() + + + target_compile_definitions(${exe} PRIVATE ${AMS_APP_DEFINES}) +endfunction() + + +BUILD_UNIT_TEST(ams_verify_dtype verify_dtype.cpp) +BUILD_UNIT_TEST(ams_verify_device verify_device.cpp) +BUILD_UNIT_TEST(ams_evaluate_model evaluate_model.cpp) +BUILD_UNIT_TEST(ams_convert_and_evaluate_model evalute_model_conversions.cpp) + 
+#detect datatype +ADD_TORCH_UNIT_TEST(Surrogate::DType::HOST::Double::Random ams_verify_dtype double ${TORCH_MODEL_DIR}/double_cpu_random.pt) +ADD_TORCH_UNIT_TEST(Surrogate::DType::HOST::Double::Duq_mean ams_verify_dtype double ${TORCH_MODEL_DIR}/double_cpu_duq_mean.pt) +ADD_TORCH_UNIT_TEST(Surrogate::DType::HOST::Double::Duq_max ams_verify_dtype double ${TORCH_MODEL_DIR}/double_cpu_duq_max.pt) + +ADD_TORCH_UNIT_TEST(Surrogate::DType::HOST::Single::Random ams_verify_dtype single ${TORCH_MODEL_DIR}/single_cpu_random.pt) +ADD_TORCH_UNIT_TEST(Surrogate::DType::HOST::Single::Duq_mean ams_verify_dtype single ${TORCH_MODEL_DIR}/single_cpu_duq_mean.pt) +ADD_TORCH_UNIT_TEST(Surrogate::DType::HOST::Single::Duq_max ams_verify_dtype single ${TORCH_MODEL_DIR}/single_cpu_duq_max.pt) + +#detect device +ADD_TORCH_UNIT_TEST(Surrogate::DResource::HOST::Double::Random ams_verify_device cpu ${TORCH_MODEL_DIR}/double_cpu_random.pt) +ADD_TORCH_UNIT_TEST(Surrogate::DResource::HOST::Double::Duq_mean ams_verify_device cpu ${TORCH_MODEL_DIR}/double_cpu_duq_mean.pt) +ADD_TORCH_UNIT_TEST(Surrogate::DResource::HOST::Double::Duq_max ams_verify_device cpu ${TORCH_MODEL_DIR}/double_cpu_duq_max.pt) + +ADD_TORCH_UNIT_TEST(Surrogate::DResource::HOST::Single::Random ams_verify_device cpu ${TORCH_MODEL_DIR}/single_cpu_random.pt) +ADD_TORCH_UNIT_TEST(Surrogate::DResource::HOST::Single::Duq_mean ams_verify_device cpu ${TORCH_MODEL_DIR}/single_cpu_duq_mean.pt) +ADD_TORCH_UNIT_TEST(Surrogate::DResource::HOST::Single::Duq_max ams_verify_device cpu ${TORCH_MODEL_DIR}/single_cpu_duq_max.pt) + + +ADD_TORCH_UNIT_TEST(Surrogate::Evaluate::HOST::Random::Double ams_evaluate_model "2000,8" "2000,8" ${TORCH_MODEL_DIR}/double_cpu_random.pt "random") +ADD_TORCH_UNIT_TEST(Surrogate::Evaluate::HOST::Random::Single ams_evaluate_model "2000,8" "2000,8" ${TORCH_MODEL_DIR}/single_cpu_random.pt "random") + +ADD_TORCH_UNIT_TEST(Surrogate::Evaluate::HOST::UQMean::Double ams_evaluate_model "2000,8" "2000,8" 
${TORCH_MODEL_DIR}/double_cpu_duq_mean.pt "duq_mean") +ADD_TORCH_UNIT_TEST(Surrogate::Evaluate::HOST::UQMean::Single ams_evaluate_model "2000,8" "2000,8" ${TORCH_MODEL_DIR}/single_cpu_duq_mean.pt "duq_mean") + +ADD_TORCH_UNIT_TEST(Surrogate::Evaluate::HOST::UQMax::Double ams_evaluate_model "2000,8" "2000,8" ${TORCH_MODEL_DIR}/double_cpu_duq_max.pt "duq_max") +ADD_TORCH_UNIT_TEST(Surrogate::Evaluate::HOST::UQMax::Single ams_evaluate_model "2000,8" "2000,8" ${TORCH_MODEL_DIR}/single_cpu_duq_max.pt "duq_max") + +ADD_TORCH_UNIT_TEST(Surrogate::EvaluateAndCat::HOST::Random::Double ams_convert_and_evaluate_model "2000,8" "2000,8" ${TORCH_MODEL_DIR}/double_cpu_random.pt "random") +ADD_TORCH_UNIT_TEST(Surrogate::EvaluateAndCat::HOST::Random::Single ams_convert_and_evaluate_model "2000,8" "2000,8" ${TORCH_MODEL_DIR}/single_cpu_random.pt "random") + +ADD_TORCH_UNIT_TEST(Surrogate::EvaluateAndCat::HOST::UQMean::Double ams_convert_and_evaluate_model "2000,8" "2000,8" ${TORCH_MODEL_DIR}/double_cpu_duq_mean.pt "duq_mean") +ADD_TORCH_UNIT_TEST(Surrogate::EvaluateAndCat::HOST::UQMean::Single ams_convert_and_evaluate_model "2000,8" "2000,8" ${TORCH_MODEL_DIR}/single_cpu_duq_mean.pt "duq_mean") + +ADD_TORCH_UNIT_TEST(Surrogate::EvaluateAndCat::HOST::UQMax::Double ams_convert_and_evaluate_model "2000,8" "2000,8" ${TORCH_MODEL_DIR}/double_cpu_duq_max.pt "duq_max") +ADD_TORCH_UNIT_TEST(Surrogate::EvaluateAndCat::HOST::UQMax::Single ams_convert_and_evaluate_model "2000,8" "2000,8" ${TORCH_MODEL_DIR}/single_cpu_duq_max.pt "duq_max") + +if(WITH_CUDA) + +#detect datatype +ADD_TORCH_UNIT_TEST(Surrogate::DType::DEVICE::Double::Random ams_verify_dtype double ${TORCH_MODEL_DIR}/double_gpu_random.pt) +ADD_TORCH_UNIT_TEST(Surrogate::DType::DEVICE::Double::Duq_mean ams_verify_dtype double ${TORCH_MODEL_DIR}/double_gpu_duq_mean.pt) +ADD_TORCH_UNIT_TEST(Surrogate::DType::DEVICE::Double::Duq_max ams_verify_dtype double ${TORCH_MODEL_DIR}/double_gpu_duq_max.pt) + 
+ADD_TORCH_UNIT_TEST(Surrogate::DType::DEVICE::Single::Random ams_verify_dtype single ${TORCH_MODEL_DIR}/single_gpu_random.pt) +ADD_TORCH_UNIT_TEST(Surrogate::DType::DEVICE::Single::Duq_mean ams_verify_dtype single ${TORCH_MODEL_DIR}/single_gpu_duq_mean.pt) +ADD_TORCH_UNIT_TEST(Surrogate::DType::DEVICE::Single::Duq_max ams_verify_dtype single ${TORCH_MODEL_DIR}/single_gpu_duq_max.pt) + +#detect device +ADD_TORCH_UNIT_TEST(Surrogate::DResource::DEVICE::Double::Random ams_verify_device gpu ${TORCH_MODEL_DIR}/double_gpu_random.pt) +ADD_TORCH_UNIT_TEST(Surrogate::DResource::DEVICE::Double::Duq_mean ams_verify_device gpu ${TORCH_MODEL_DIR}/double_gpu_duq_mean.pt) +ADD_TORCH_UNIT_TEST(Surrogate::DResource::DEVICE::Double::Duq_max ams_verify_device gpu ${TORCH_MODEL_DIR}/double_gpu_duq_max.pt) + +ADD_TORCH_UNIT_TEST(Surrogate::DResource::DEVICE::Single::Random ams_verify_device gpu ${TORCH_MODEL_DIR}/single_gpu_random.pt) +ADD_TORCH_UNIT_TEST(Surrogate::DResource::DEVICE::Single::Duq_mean ams_verify_device gpu ${TORCH_MODEL_DIR}/single_gpu_duq_mean.pt) +ADD_TORCH_UNIT_TEST(Surrogate::DResource::DEVICE::Single::Duq_max ams_verify_device gpu ${TORCH_MODEL_DIR}/single_gpu_duq_max.pt) + + +ADD_TORCH_UNIT_TEST(Surrogate::Evaluate::DEVICE::Random::Double ams_evaluate_model "2000,8" "2000,8" ${TORCH_MODEL_DIR}/double_gpu_random.pt "random") +ADD_TORCH_UNIT_TEST(Surrogate::Evaluate::DEVICE::Random::Single ams_evaluate_model "2000,8" "2000,8" ${TORCH_MODEL_DIR}/single_gpu_random.pt "random") + +ADD_TORCH_UNIT_TEST(Surrogate::Evaluate::DEVICE::UQMean::Double ams_evaluate_model "2000,8" "2000,8" ${TORCH_MODEL_DIR}/double_gpu_duq_mean.pt "duq_mean") +ADD_TORCH_UNIT_TEST(Surrogate::Evaluate::DEVICE::UQMean::Single ams_evaluate_model "2000,8" "2000,8" ${TORCH_MODEL_DIR}/single_gpu_duq_mean.pt "duq_mean") + +ADD_TORCH_UNIT_TEST(Surrogate::Evaluate::DEVICE::UQMax::Double ams_evaluate_model "2000,8" "2000,8" ${TORCH_MODEL_DIR}/double_gpu_duq_max.pt "duq_max") 
+ADD_TORCH_UNIT_TEST(Surrogate::Evaluate::DEVICE::UQMax::Single ams_evaluate_model "2000,8" "2000,8" ${TORCH_MODEL_DIR}/single_gpu_duq_max.pt "duq_max") + +ADD_TORCH_UNIT_TEST(Surrogate::EvaluateAndCat::DEVICE::Random::Double ams_convert_and_evaluate_model "2000,8" "2000,8" ${TORCH_MODEL_DIR}/double_gpu_random.pt "random") +ADD_TORCH_UNIT_TEST(Surrogate::EvaluateAndCat::DEVICE::Random::Single ams_convert_and_evaluate_model "2000,8" "2000,8" ${TORCH_MODEL_DIR}/single_gpu_random.pt "random") + +ADD_TORCH_UNIT_TEST(Surrogate::EvaluateAndCat::DEVICE::UQMean::Double ams_convert_and_evaluate_model "2000,8" "2000,8" ${TORCH_MODEL_DIR}/double_gpu_duq_mean.pt "duq_mean") +ADD_TORCH_UNIT_TEST(Surrogate::EvaluateAndCat::DEVICE::UQMean::Single ams_convert_and_evaluate_model "2000,8" "2000,8" ${TORCH_MODEL_DIR}/single_gpu_duq_mean.pt "duq_mean") + +ADD_TORCH_UNIT_TEST(Surrogate::EvaluateAndCat::DEVICE::UQMax::Double ams_convert_and_evaluate_model "2000,8" "2000,8" ${TORCH_MODEL_DIR}/double_gpu_duq_max.pt "duq_max") +ADD_TORCH_UNIT_TEST(Surrogate::EvaluateAndCat::DEVICE::UQMax::Single ams_convert_and_evaluate_model "2000,8" "2000,8" ${TORCH_MODEL_DIR}/single_gpu_duq_max.pt "duq_max") + +endif() + + diff --git a/tests/AMSlib/torch/evaluate_model.cpp b/tests/AMSlib/torch/evaluate_model.cpp new file mode 100644 index 00000000..9d995209 --- /dev/null +++ b/tests/AMSlib/torch/evaluate_model.cpp @@ -0,0 +1,99 @@ +#include + +#include +#include +#include + +#include "AMS.h" +#include "ml/surrogate.hpp" + +std::vector getDims(const std::string input, char delimiter) +{ + std::vector tokens; + std::stringstream ss(input); + std::string token; + + while (std::getline(ss, token, delimiter)) { + tokens.push_back(std::stoi(token)); + } + + return tokens; +} + + +bool verify(torch::Tensor& input, + torch::Tensor& output, + torch::Tensor& predicate, + float threshold) +{ + if (!torch::equal(input, output)) + throw std::runtime_error("Tensors are not identical"); + + torch::Tensor float_tensor 
= predicate.to(torch::kDouble); + // Calculate the probability (mean of the tensor) + double probability = float_tensor.mean().item(); + std::cout << "probability is " << probability << "\n"; + if (probability != threshold) + throw std::runtime_error( + "Expecing a probability of 0.0 in the case of threshold <0>"); + return true; +} + +void test(SurrogateModel& model, + std::vector& iDims, + std::vector& oDims, + ams::AMSUQPolicy policy) +{ + auto model_type = model.getModelDataType(); + auto model_device = model.getModelResourceType(); + torch::Tensor input = torch::rand(iDims, + torch::TensorOptions() + .dtype(std::get<1>(model_type)) + .device(std::get<1>(model_device))); + { + std::cout << "Staring Test-1 with random-uq and threshold of 0.0\n"; + auto [out, predicate] = model._evaluate(input, policy, 0.0); + verify(input, out, predicate, 0.0); + std::cout << "SUCCESS\n"; + } + { + std::cout << "Staring Test-2 with random-uq and threshold of 0.5\n"; + auto [out, predicate] = model._evaluate(input, policy, 0.5); + verify(input, out, predicate, 0.5); + std::cout << "SUCCESS\n"; + } + { + std::cout << "Staring Test-3 with random-uq and threshold of 1.0\n"; + auto [out, predicate] = model._evaluate(input, policy, 1.0); + verify(input, out, predicate, 1.0); + std::cout << "SUCCESS\n"; + } +} + +int main(int argc, char* argv[]) +{ + if (argc != 5) { + std::cerr << "Wrong command line, expecting , " + " (1024, 2, " + "6) \n"; + return -1; + } + + std::vector iShape(getDims(argv[1], ',')); + std::vector oShape(getDims(argv[2], ',')); + std::string model_path(argv[3]); + std::string uq(argv[4]); + bool isDeltaUQ = true; + if (uq.compare("random") == 0) isDeltaUQ = false; + auto model = SurrogateModel::getInstance(model_path, isDeltaUQ); + if (std::string(argv[4]).compare("duq_mean") == 0) { + test(*model, iShape, oShape, ams::AMSUQPolicy::AMS_DELTAUQ_MEAN); + } else if (std::string(argv[4]).compare("duq_max") == 0) { + test(*model, iShape, oShape, 
ams::AMSUQPolicy::AMS_DELTAUQ_MAX); + } else if (std::string(argv[4]).compare("random") == 0) { + test(*model, iShape, oShape, ams::AMSUQPolicy::AMS_RANDOM); + } else { + std::cout << "Unknown dUQ \n"; + return 1; + } +} diff --git a/tests/AMSlib/torch/evalute_model_conversions.cpp b/tests/AMSlib/torch/evalute_model_conversions.cpp new file mode 100644 index 00000000..f86e5546 --- /dev/null +++ b/tests/AMSlib/torch/evalute_model_conversions.cpp @@ -0,0 +1,229 @@ +#include +#include +#include +#include + +#include // For std::shuffle +#include +#include // For std::mt19937 and std::uniform_int_distribution +#include +#include + +#include "AMS.h" +#include "ml/surrogate.hpp" + +void printTensorShape(const torch::Tensor& tensor) +{ + std::cout << "Tensor shape: ["; + for (size_t i = 0; i < tensor.sizes().size(); ++i) { + std::cout << tensor.size(i); + if (i != tensor.sizes().size() - 1) { + std::cout << ", "; + } + } + std::cout << "]" << std::endl; +} + +// Helper function to convert torch::Dtype to a string +std::string dtypeToString(torch::Dtype dtype) +{ + static const std::unordered_map dtypeMap = { + {torch::kFloat32, "float32"}, + {torch::kFloat, "float32"}, // Alias for float32 + {torch::kFloat64, "float64"}, + {torch::kDouble, "float64"}, // Alias for float64 + {torch::kInt32, "int32"}, + {torch::kInt64, "int64"}, + {torch::kBool, "bool"}, + {torch::kUInt8, "uint8"}, + {torch::kInt8, "int8"}, + {torch::kHalf, "float16"}, + {torch::kBFloat16, "bfloat16"}}; + return dtypeMap.count(dtype) ? dtypeMap.at(dtype) : "unknown dtype"; +} + +// Helper function to convert c10::DeviceType to a string +std::string deviceTypeToString(c10::DeviceType deviceType) +{ + static const std::unordered_map deviceMap = { + {c10::DeviceType::CPU, "CPU"}, + {c10::DeviceType::CUDA, "CUDA"}, + {c10::DeviceType::HIP, "HIP"}, + {c10::DeviceType::FPGA, "FPGA"}, + {c10::DeviceType::XLA, "XLA"}, + {c10::DeviceType::Meta, "Meta"}}; + return deviceMap.count(deviceType) ? 
deviceMap.at(deviceType) + : "unknown device"; +} + +std::vector getDims(const std::string input, char delimiter) +{ + std::vector tokens; + std::stringstream ss(input); + std::string token; + + while (std::getline(ss, token, delimiter)) { + tokens.push_back(std::stoi(token)); + } + + return tokens; +} + +std::vector generateRandomVector(int target_sum, int size) +{ + if (target_sum < size) { + throw std::invalid_argument( + "Target sum must be at least equal to the size of the vector."); + } + + std::vector result( + size, 1); // Start with each element as 1 (minimum positive integer). + target_sum -= + size; // Reduce the remaining sum by the size (since all elements are 1). + + std::random_device rd; + std::mt19937 gen(0); + std::uniform_int_distribution<> dis(0, target_sum); + + // Generate random values and distribute the remaining sum + for (int i = 0; i < target_sum; ++i) { + int index = dis(gen) % size; // Pick a random index + ++result[index]; // Increment the value at the chosen index + } + + // Shuffle the vector for more randomness + std::shuffle(result.begin(), result.end(), gen); + + return result; +} + + +bool verify(torch::Tensor& input, + torch::Tensor& output, + torch::Tensor& predicate, + float threshold, + torch::Dtype model_dtype) +{ + + // our current 'interface' always return the dtype of the model. 
+ if (output.dtype() != model_dtype) { + throw ::std::runtime_error( + "Tensors should have the data type of the model"); + } + + if (input.dtype() != output.dtype()) { + output = output.to(input.dtype()); + } + + bool close = torch::allclose(input, output, 1e-5, 1e-8); + + if (!close) throw std::runtime_error("Tensors are not identical"); + + torch::Tensor float_tensor = predicate.to(torch::kDouble); + // Calculate the probability (mean of the tensor) + double probability = float_tensor.mean().item(); + std::cout << "probability is " << probability << "\n"; + if (probability != threshold) + throw std::runtime_error( + "Expecing a probability of 0.0 in the case of threshold <0>"); + return true; +} + +void test(SurrogateModel& model, + std::vector& iDims, + std::vector& oDims, + ams::AMSUQPolicy policy) +{ + auto model_type = model.getModelDataType(); + auto model_device = model.getModelResourceType(); + std::vector SupportedDTypes = {torch::kFloat32, + torch::kFloat64}; + auto inputShapes = generateRandomVector(iDims[iDims.size() - 1], 3); + std::vector SupportedDevices = {c10::DeviceType::CPU}; + if (torch::cuda::is_available() && torch::cuda::device_count() > 0) { + SupportedDevices.push_back(c10::DeviceType::CUDA); + } + + for (auto type : SupportedDTypes) { + for (auto device : SupportedDevices) { + ams::SmallVector inputs; + for (auto outer : inputShapes) { + std::vector partialShape(iDims.begin(), iDims.end()); + partialShape[partialShape.size() - 1] = outer; + auto inp = + torch::rand(partialShape, + torch::TensorOptions().dtype(type).device(device)); + printTensorShape(inp); + inputs.push_back(inp); + } + + std::cout << "Testing with input tensor type: " << dtypeToString(type) + << " on device: " << deviceTypeToString(device) << "\n"; + + std::cout << "Testing with model parameter types : " + << dtypeToString(std::get<1>(model_type)) << " on device: " + << deviceTypeToString(std::get<1>(model_device)) << "\n"; + + + { + std::cout << "Staring Test-1 with 
random-uq and threshold of 0.0\n"; + auto [out, predicate] = model.evaluate(inputs, policy, 0.0); + c10::SmallVector data(inputs.begin(), inputs.end()); + auto input = torch::cat(data, iDims.size() - 1); + std::cout << "Output of model is of type " + << dtypeToString((torch::typeMetaToScalarType(out.dtype()))) + << "\n"; + std::cout << "Input of model is of type " + << dtypeToString( + (torch::typeMetaToScalarType(inputs[0].dtype()))) + << "\n"; + verify(input, out, predicate, 0.0, std::get<1>(model_type)); + std::cout << "SUCCESS\n"; + } + { + std::cout << "Staring Test-2 with random-uq and threshold of 0.5\n"; + auto [out, predicate] = model.evaluate(inputs, policy, 0.5); + c10::SmallVector data(inputs.begin(), inputs.end()); + auto input = torch::cat(data, iDims.size() - 1); + verify(input, out, predicate, 0.5, std::get<1>(model_type)); + std::cout << "SUCCESS\n"; + } + { + std::cout << "Staring Test-3 with random-uq and threshold of 1.0\n"; + auto [out, predicate] = model.evaluate(inputs, policy, 1.0); + c10::SmallVector data(inputs.begin(), inputs.end()); + auto input = torch::cat(data, iDims.size() - 1); + verify(input, out, predicate, 1.0, std::get<1>(model_type)); + std::cout << "SUCCESS\n"; + } + } + } +} + +int main(int argc, char* argv[]) +{ + if (argc != 5) { + std::cerr << "Wrong command line, expecting , " + " (1024, 2, " + "6) \n"; + return -1; + } + + std::vector iShape(getDims(argv[1], ',')); + std::vector oShape(getDims(argv[2], ',')); + std::string model_path(argv[3]); + std::string uq(argv[4]); + bool isDeltaUQ = true; + if (uq.compare("random") == 0) isDeltaUQ = false; + auto model = SurrogateModel::getInstance(model_path, isDeltaUQ); + if (std::string(argv[4]).compare("duq_mean") == 0) { + test(*model, iShape, oShape, ams::AMSUQPolicy::AMS_DELTAUQ_MEAN); + } else if (std::string(argv[4]).compare("duq_max") == 0) { + test(*model, iShape, oShape, ams::AMSUQPolicy::AMS_DELTAUQ_MAX); + } else if (std::string(argv[4]).compare("random") == 0) { + 
test(*model, iShape, oShape, ams::AMSUQPolicy::AMS_RANDOM); + } else { + std::cout << "Unknown dUQ \n"; + return 1; + } +} diff --git a/tests/AMSlib/torch/verify_device.cpp b/tests/AMSlib/torch/verify_device.cpp new file mode 100644 index 00000000..6e2e5f7c --- /dev/null +++ b/tests/AMSlib/torch/verify_device.cpp @@ -0,0 +1,33 @@ +#include +#include + +#include "ml/surrogate.hpp" + + +int main(int argc, char *argv[]) +{ + if (argc != 3) { + std::cerr << "Wrong command line, expecting " + "\n"; + return -1; + } + std::string device(argv[1]); + std::string model_path(argv[2]); + std::cout << "Opening model under " << model_path; + std::cout << " on device " << device << "\n"; + + auto model = SurrogateModel::getInstance(model_path); + if (device.compare("gpu") == 0 && model->is_gpu()) { + std::cout << "SUCCESS: " << "Model will execute on device\n"; + return 0; + } + if (device.compare("cpu") == 0 && model->is_cpu()) { + std::cout << "SUCCESS: " << "Model will execute on host\n"; + return 0; + } + + if (device.compare("cpu") != 0 && device.compare("gpu") != 0) + std::cout << "AMS Surrogate does not support " << device + << " as a execution device\n"; + return 1; +} diff --git a/tests/AMSlib/torch/verify_dtype.cpp b/tests/AMSlib/torch/verify_dtype.cpp new file mode 100644 index 00000000..22062eb5 --- /dev/null +++ b/tests/AMSlib/torch/verify_dtype.cpp @@ -0,0 +1,26 @@ +#include +#include + +#include "ml/surrogate.hpp" + +int main(int argc, char *argv[]) +{ + if (argc != 3) { + std::cerr << "Wrong command line, expecting \n"; + return -1; + } + std::string precision(argv[1]); + std::string model_path(argv[2]); + std::cout << "Opening model under " << model_path; + std::cout << " with precision " << precision << "\n"; + auto model = SurrogateModel::getInstance(model_path); + if (precision.compare("single") == 0 && model->is_float()) return 0; + if (precision.compare("double") == 0 && model->is_double()) return 0; + + if (precision.compare("single") != 0 && 
precision.compare("double") != 0) { + std::cout << "AMS Surrogate does not support " << precision + << " as a dataype\n"; + } + return 1; +} diff --git a/tests/AMSlib/torch_model.cpp b/tests/AMSlib/torch_model.cpp deleted file mode 100644 index 47c84178..00000000 --- a/tests/AMSlib/torch_model.cpp +++ /dev/null @@ -1,89 +0,0 @@ -/* - * Copyright 2021-2023 Lawrence Livermore National Security, LLC and other - * AMSLib Project Developers - * - * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception - */ - -#include - -#include -#include -#include -#include -#include -#include -#include - -#include "../utils.hpp" - -#define SIZE (32L * 1024L + 3L) - -template -void inference(SurrogateModel &model, AMSResourceType resource) -{ - using namespace ams; - - std::vector inputs; - std::vector outputs; - auto &ams_rm = ams::ResourceManager::getInstance(); - - for (int i = 0; i < 2; i++) - inputs.push_back(ams_rm.allocate(SIZE, resource)); - - for (int i = 0; i < 4; i++) - outputs.push_back(ams_rm.allocate(SIZE, resource)); - - model.evaluate( - SIZE, inputs.size(), outputs.size(), inputs.data(), outputs.data()); - - - for (int i = 0; i < 2; i++) - ams_rm.deallocate(const_cast(inputs[i]), resource); - - for (int i = 0; i < 4; i++) - ams_rm.deallocate(outputs[i], resource); -} - -int main(int argc, char *argv[]) -{ - using namespace ams; - installSignals(); - AMSInit(); - - if (argc != 4) { - std::cout << "Wrong cli, correct one: \n"; - std::cout << argv[0] << " " - << "use-device(0|1) model-path data_type('double'|'single') \n"; - } - - auto &ams_rm = ams::ResourceManager::getInstance(); - int use_device = std::atoi(argv[1]); - std::string model_path(argv[2]); - char *data_type = argv[3]; - - AMSResourceType resource = AMSResourceType::AMS_HOST; - if (use_device == 1) { - resource = AMSResourceType::AMS_DEVICE; - } - - ams_rm.init(); - - if (std::strcmp("double", data_type) == 0) { - std::shared_ptr> model = - SurrogateModel::getInstance(model_path, resource); - 
assert(model->is_double()); - inference(*model, resource); - } else if (std::strcmp("single", data_type) == 0) { - std::shared_ptr> model = - SurrogateModel::getInstance(model_path, resource); - assert(!model->is_double()); - inference(*model, resource); - } else { - std::cout << "Unknown data type " << data_type << "\n"; - return 1; - } - - AMSFinalize(); - return 0; -} diff --git a/tests/AMSlib/tuple-single.torchscript b/tests/AMSlib/tuple-single.torchscript deleted file mode 100644 index c40411b4..00000000 Binary files a/tests/AMSlib/tuple-single.torchscript and /dev/null differ diff --git a/tests/AMSlib/tuple.duq b/tests/AMSlib/tuple.duq deleted file mode 100644 index dd16fef9..00000000 Binary files a/tests/AMSlib/tuple.duq and /dev/null differ diff --git a/tests/AMSlib/tuple.duq.cuda b/tests/AMSlib/tuple.duq.cuda deleted file mode 100644 index 46a0c959..00000000 Binary files a/tests/AMSlib/tuple.duq.cuda and /dev/null differ diff --git a/tests/AMSlib/uq_max_cpu.pt b/tests/AMSlib/uq_max_cpu.pt deleted file mode 100644 index 4c46f93a..00000000 Binary files a/tests/AMSlib/uq_max_cpu.pt and /dev/null differ diff --git a/tests/AMSlib/uq_max_double_cpu.pt b/tests/AMSlib/uq_max_double_cpu.pt deleted file mode 100644 index ecc83c31..00000000 Binary files a/tests/AMSlib/uq_max_double_cpu.pt and /dev/null differ diff --git a/tests/AMSlib/uq_max_single_cpu.pt b/tests/AMSlib/uq_max_single_cpu.pt deleted file mode 100644 index 84e94e19..00000000 Binary files a/tests/AMSlib/uq_max_single_cpu.pt and /dev/null differ diff --git a/tests/AMSlib/uq_mean_cpu.pt b/tests/AMSlib/uq_mean_cpu.pt deleted file mode 100644 index e4c479e1..00000000 Binary files a/tests/AMSlib/uq_mean_cpu.pt and /dev/null differ diff --git a/tests/AMSlib/uq_mean_double_cpu.pt b/tests/AMSlib/uq_mean_double_cpu.pt deleted file mode 100644 index 19dcecfb..00000000 Binary files a/tests/AMSlib/uq_mean_double_cpu.pt and /dev/null differ diff --git a/tests/AMSlib/uq_mean_single_cpu.pt 
b/tests/AMSlib/uq_mean_single_cpu.pt deleted file mode 100644 index 0637ef97..00000000 Binary files a/tests/AMSlib/uq_mean_single_cpu.pt and /dev/null differ diff --git a/tests/AMSlib/utils.hpp b/tests/AMSlib/utils.hpp new file mode 100644 index 00000000..ba0716b1 --- /dev/null +++ b/tests/AMSlib/utils.hpp @@ -0,0 +1,267 @@ +/* + * Copyright 2021-2023 Lawrence Livermore National Security, LLC and other + * AMSLib Project Developers + * + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + */ + +#ifndef __TEST_UTILS__ +#define __TEST_UTILS__ + +#include +#include + +#include + +#include "AMS.h" + +#ifdef __AMS_ENABLE_MPI__ +#include +#define MPI_CALL(stmt) \ + if (stmt != MPI_SUCCESS) { \ + fprintf(stderr, "Error in MPI-Call (File: %s, %d)\n", __FILE__, __LINE__); \ + } +#else +typedef void* MPI_Comm; +#define MPI_CALL(stm) +#endif + +using namespace ams; + +static inline AMSDType getDataType(const char* d_type) +{ + AMSDType dType = AMSDType::AMS_DOUBLE; + if (std::strcmp(d_type, "float") == 0) { + dType = AMSDType::AMS_SINGLE; + } else if (std::strcmp(d_type, "double") == 0) { + dType = AMSDType::AMS_DOUBLE; + } else { + assert(false && "Unknown data type (must be 'float' or 'double')"); + } + return dType; +} + +static inline AMSDType getDataType(std::string& d_type) +{ + AMSDType dType = AMSDType::AMS_DOUBLE; + if (d_type == "float") { + dType = AMSDType::AMS_SINGLE; + } else if (d_type == "double") { + dType = AMSDType::AMS_DOUBLE; + } else { + assert(false && "Unknown data type (must be 'float' or 'double')"); + } + return dType; +} + + +static inline AMSDBType getDBType(const char* db_type) +{ + AMSDBType dbType = AMSDBType::AMS_NONE; + if (std::strcmp(db_type, "hdf5") == 0) { + dbType = AMSDBType::AMS_HDF5; + } else if (std::strcmp(db_type, "rmq") == 0) { + dbType = AMSDBType::AMS_RMQ; + } + return dbType; +} + +static inline AMSDBType getDBType(std::string db_type) +{ + AMSDBType dbType = AMSDBType::AMS_NONE; + if (db_type == "hdf5") { + dbType = 
AMSDBType::AMS_HDF5; + } else if (db_type == "rmq") { + dbType = AMSDBType::AMS_RMQ; + } + return dbType; +} + +static std::string UQPolicyToStr(AMSUQPolicy policy) +{ + if (policy == AMSUQPolicy::AMS_RANDOM) + return "random"; + else if (policy == AMSUQPolicy::AMS_DELTAUQ_MEAN) + return "deltaUQ (mean)"; + else if (policy == AMSUQPolicy::AMS_DELTAUQ_MAX) + return "deltaUQ (max)"; + return "Unknown"; +} + +static AMSUQPolicy UQPolicyFromStr(std::string &policy) +{ + if (policy.compare("random") == 0) + return AMSUQPolicy::AMS_RANDOM; + else if (policy.compare("deltaUQ (mean)") == 0) + return AMSUQPolicy::AMS_DELTAUQ_MEAN; + else if (policy.compare("deltaUQ (max)") == 0) + return AMSUQPolicy::AMS_DELTAUQ_MAX; + return AMSUQPolicy::AMS_UQ_END; +} + + +// Signal handler to print the stack trace +static inline void signalHandler(int signum) +{ + const char* msg = "[signalHandler] Caught signal\n"; + write(STDERR_FILENO, msg, strlen(msg)); + + // Obtain the backtrace + const int maxFrames = 128; + void* addrlist[maxFrames]; + + // Get void*'s for all entries on the stack + int addrlen = backtrace(addrlist, maxFrames); + + if (addrlen == 0) { + const char* no_stack = "No stack trace available\n"; + write(STDERR_FILENO, no_stack, strlen(no_stack)); + _exit(1); // _exit() Cannot be trap, interrupted + } + + // Print out all the frames to stderr + backtrace_symbols_fd(addrlist, addrlen, STDERR_FILENO); + _exit(1); +} + + +static inline void installSignals() +{ + std::signal(SIGSEGV, signalHandler); // segmentation fault + std::signal(SIGABRT, signalHandler); // abort() + std::signal(SIGFPE, signalHandler); // floating-point exception + std::signal(SIGILL, signalHandler); // illegal instruction + std::signal(SIGINT, signalHandler); // interrupt (e.g., Ctrl+C) + std::signal(SIGTERM, signalHandler); // termination request + std::signal(SIGPIPE, signalHandler); // broken pipe +} + +class TestArgs +{ +public: + void PrintOptions() const + { + std::cout << "Available options:\n"; 
+ for (const auto& opt : registered_) { + std::cout << " "; + for (size_t i = 0; i < opt.keys.size(); ++i) { + std::cout << opt.keys[i]; + if (i + 1 < opt.keys.size()) std::cout << ", "; + } + std::cout << "\n " << opt.help << "\n"; + } + } + + void Parse(int argc, char** argv) + { + for (int i = 1; i < argc; ++i) { + std::string arg = argv[i]; + if (arg[0] != '-') continue; + + std::string key = arg; + std::string value; + + // If the next item isn't an option, treat it as a value + if (i + 1 < argc && argv[i + 1][0] != '-') { + value = argv[++i]; + } else { + value = "true"; // Boolean flag + } + + options_[key] = value; + } + + // Set parsed values into variables + for (auto& opt : registered_) { + const auto& keys = opt.keys; + for (const auto& key : keys) { + if (options_.count(key)) { + opt.setter(options_[key]); + opt.wasset = true; + break; + } + } + } + } + + template + void AddOption(T* out, + std::string short_opt, + std::string long_opt, + std::string help, + bool required = true) + { + registered_.push_back( + {{short_opt, long_opt}, + [out](const std::string& val) { parseValue(val, out); }, + [out]() { return toString(*out); }, + help, + required, + false}); + } + + bool Good() const + { + for (const auto& opt : registered_) { + if (opt.required && !opt.wasset) { + return false; + } + } + return true; + } + + void PrintUsage() const + { + std::cout << "Parsed arguments:\n"; + for (const auto& opt : registered_) { + std::cout << " "; + for (size_t i = 0; i < opt.keys.size(); ++i) { + std::cout << opt.keys[i]; + if (i + 1 < opt.keys.size()) std::cout << ", "; + } + std::cout << " = " << opt.getter() << "\n"; + } + } + +private: + struct RegisteredOption { + std::vector keys; + std::function setter; + std::function getter; + std::string help; + bool required; + bool wasset; + }; + + std::vector registered_; + std::unordered_map options_; + + // Parser helper + template + static void parseValue(const std::string& s, T* out); + + static void 
parseValue(const std::string& s, std::string* out) { *out = s; } + + static void parseValue(const std::string& s, int* out) + { + *out = std::stoi(s); + } + + static void parseValue(const std::string& s, bool* out) + { + *out = (s == "true" || s == "1"); + } + + static void parseValue(const std::string& s, double* out) + { + *out = std::stod(s); + } + + + static std::string toString(const std::string& val) { return val; } + static std::string toString(bool val) { return val ? "true" : "false"; } + static std::string toString(int val) { return std::to_string(val); } + static std::string toString(double val) { return std::to_string(val); } +}; + +#endif diff --git a/tests/AMSlib/verify_ete.py b/tests/AMSlib/verify_ete.py deleted file mode 100644 index a7899718..00000000 --- a/tests/AMSlib/verify_ete.py +++ /dev/null @@ -1,316 +0,0 @@ -import sys -import json -from pathlib import Path -import pandas as pd -import h5py -import numpy as np -import os - - -def get_suffix(db_type): - if db_type == "csv": - return "csv" - if db_type == "none": - return "none" - if db_type == "hdf5": - return "h5" - return "unknown" - - -def verify_data_collection(fs_path, db_type, num_inputs, num_outputs, name="test", debug_db=False): - # Returns a tuple of the input/ouput data and 0/1 for correct incorrect file. 
- # Checks whether the files also have the right number of columns - if not Path(fs_path).exists(): - print("Expecting output directory to exist") - return None, 1 - - suffix = get_suffix(db_type) - if suffix == "none": - return None, 0 - - fn = f"{name}_0.{suffix}" - if debug_db and db_type != "hdf5": - print("Debug DB is only supported on hdf5") - return None, 1 - elif debug_db: - fn = f"{name}_0.debug.{suffix}" - - fp = Path(f"{fs_path}/{fn}") - - if name == "" and fp.exists(): - print(f"I was expecting file({fp}) to not exist") - fp.unlink() - return None, 1 - elif name == "": - return (np.empty((0, 0)), np.empty((0, 0))), 0 - elif not fp.exists(): - print(f"File path {fn} does not exist") - return None, 1 - - if db_type == "csv": - df = pd.read_csv(str(fp), sep=":") - assert len(df.columns) == (num_inputs + num_outputs), "Expected equal number of inputs/outputs" - - inputs = sum(1 for s in df.columns if "input" in s) - assert inputs == num_inputs, "Expected equal number of inputs" - outputs = sum(1 for s in df.columns if "output" in s) - assert outputs == num_outputs, "Expected equal number of outputs" - input_data = df[[f"input_{i}" for i in range(inputs)]].to_numpy() - output_data = df[[f"output_{i}" for i in range(outputs)]].to_numpy() - fp.unlink() - return (input_data, output_data), 0 - elif db_type == "hdf5": - with h5py.File(fp, "r") as fd: - dsets = fd.keys() - assert len(dsets) == ( - num_inputs + num_outputs + int(debug_db) + 1 - ), "Expected equal number of inputs/outputs" - inputs = sum(1 for s in dsets if "input" in s) - assert inputs == num_inputs, "Expected equal number of inputs" - outputs = sum(1 for s in dsets if "output" in s) - assert outputs == num_outputs, "Expected equal number of outputs" - input_data = [[] for _ in range(num_inputs)] - output_data = [[] for _ in range(num_outputs)] - for d in dsets: - if not ("input_" in d or "output_" in d): - continue - loc = int(d.split("_")[1]) - if len(fd[d]): - if "input" in d: - input_data[loc] 
= fd[d][:] - elif "output" in d: - output_data[loc] = fd[d][:] - predicate = None - if debug_db: - predicate = np.array(fd["predicate"][:]) - input_data = np.array(input_data) - output_data = np.array(output_data) - fp.unlink() - if debug_db: - return (input_data.T, output_data.T, predicate), 0 - return (input_data.T, output_data.T), 0 - - else: - return None, 1 - - -def verify( - use_device, - num_inputs, - num_outputs, - model_path, - data_type, - uq_name, - threshold, - num_iterations, - num_elements, - db_type, - fs_path, - name="test", - debug_db=False, -): - print("debug db is", debug_db) - # When AMS has no model path it always calls the domain solution. - # As such it behaves identically with threshold 0 - if model_path == None or model_path == "": - threshold = 0.0 - - # Name maps to the db-name. When empty it means we did not want to collect any data - if name == "": - threshold = 1.0 - - if db_type != "none": - data, correct = verify_data_collection(fs_path, db_type, num_inputs, num_outputs, name, debug_db) - if correct: - return 1 - inputs = data[0] - outputs = data[1] - - if (model_path == None or model_path == "") and name == "": - return 0 - - # Check data type. - if db_type == "hdf5": - if "data_type" == "double": - assert inputs.dtype == np.float64, "Data types do not match" - elif "data_type" == "float": - assert inputs.dtype == np.float32, "Data types do not match" - - # When debug db is set, we store always all elements - if debug_db: - predicate = data[2] - assert ( - len(predicate) == num_elements - ), f"debug db should always contain all data but now it has {len(predicate)}" - assert ( - len(inputs) == num_elements and len(outputs) == num_elements - ), f"Num elements should be the same as experiment {len(inputs)} {num_elements}" - # Predicate points to 'true' when we use the model. 
The sum should be "equal" to - # the threshold multiplied by the number of elements - arg = sum(predicate) - actual_elems = int(threshold * num_elements) - assert arg == actual_elems, "Predicate does not accumulate to the expected value" - # Over here I pick the values from input/outputs that will be selected for "domain" evaluation - # This will allow the code to verify that predicates pick the "right" values - inputs = inputs[np.logical_not(predicate)] - outputs = outputs[np.logical_not(predicate)] - elif threshold == 0.0: - # Threshold 0 means collect all data. Verify the sizes. - assert ( - len(inputs) == num_elements and len(outputs) == num_elements - ), f"Num elements should be the same as experiment {len(inputs)} {num_elements}" - - elif threshold == 1.0: - # Threshold 1.0 means to not collect any data. Verify the sizes. - assert len(inputs) == 0 and len(outputs) == 0, "Num elements should be zero" - # There is nothing else we can check here - return 0 - else: - # Compute a theoritical range of possible values in the db. - # The duq/faiss tests have specific settings. The random one can have a - # bound. This checks for all these cases - lb = num_elements * (1 - threshold) - num_elements * 0.05 - ub = num_elements * (1 - threshold) + num_elements * 0.05 - assert ( - len(inputs) > lb and len(inputs) < ub - ), f"Not in the bounds of correct items {lb} {ub} {len(inputs)} {name}" - assert ( - len(outputs) > lb and len(outputs) < ub - ), f"Not in the bounds of correct items {lb} {ub} {len(inputs)} {name}" - - if "delta" in uq_name: - assert "mean" in uq_name or "max" in uq_name, "unknown Delta UQ mechanism" - d_type = np.float32 - if data_type == "double": - d_type = np.float64 - - if "mean" in uq_name: - # Our DUQ-mean model skips odd evaluations. 
- # Here we set on verify_inputs the inputs of those evaluations - verify_inputs = np.zeros((len(inputs), num_inputs), dtype=d_type) - if threshold == 0.0: - step = 1 - elif threshold == 0.5: - verify_inputs[0] = np.ones(num_inputs, dtype=d_type) - step = 2 - for i in range(1, len(inputs)): - verify_inputs[i] = verify_inputs[i - 1] + step - # Compare whether the results match our base function. - diff_sum = np.sum(np.abs(verify_inputs - inputs)) - assert np.isclose(diff_sum, 0.0), "Mean Input data do not match" - verify_output = np.sum(inputs, axis=1).T * num_outputs - outputs = np.sum(outputs, axis=1) - diff_sum = np.sum(np.abs(outputs - verify_output)) - assert np.isclose(diff_sum, 0.0), "Mean Output data do not match" - elif "max" in uq_name: - # Our DUQ-max model skips even evaluations. - # Here we set on verify_inputs the inputs of those evaluations - verify_inputs = np.zeros((len(inputs), num_inputs), dtype=d_type) - if threshold == 0.0: - step = 1 - elif threshold == 0.5: - step = 2 - for i in range(1, len(inputs)): - verify_inputs[i] = verify_inputs[i - 1] + step - diff_sum = np.sum(np.abs(verify_inputs - inputs)) - assert np.isclose(diff_sum, 0.0), "Max Input data do not match" - verify_output = np.sum(inputs, axis=1).T * num_outputs - outputs = np.sum(outputs, axis=1) - diff_sum = np.sum(np.abs(outputs - verify_output)) - assert np.isclose(diff_sum, 0.0), "Max Output data do not match" - else: - return 0 - - return 0 - - -def from_cli(argv): - use_device = int(argv[0]) - num_inputs = int(argv[1]) - num_outputs = int(argv[2]) - model_path = argv[3] - data_type = argv[4] - uq_name = argv[5] - threshold = float(argv[6]) - num_iterations = int(argv[7]) - num_elements = int(argv[8]) - db_type = argv[9] - fs_path = argv[10] - - return verify( - use_device, - num_inputs, - num_outputs, - model_path, - data_type, - uq_name, - threshold, - num_iterations, - num_elements, - db_type, - fs_path, - ) - - -def from_json(argv): - print(argv) - use_device = int(argv[0]) 
- num_inputs = int(argv[1]) - num_outputs = int(argv[2]) - data_type = argv[3] - num_elements = int(argv[4]) - model_1 = argv[5] - model_2 = argv[6] - - env_file = Path(os.environ["AMS_OBJECTS"]) - if not env_file.exists(): - print("Environment file does not exist") - return -1 - - with open(env_file, "r") as fd: - data = json.load(fd) - - db_type = data["db"]["dbType"] - fs_path = data["db"]["fs_path"] - - for m in [model_1, model_2]: - print("Testing Model", m) - ml_id = data["domain_models"][m] - model = data["ml_models"][ml_id] - - uq_type = model["uq_type"] - print(json.dumps(model, indent=6)) - if "uq_aggregate" in model: - uq_type += " ({0})".format(model["uq_aggregate"]) - - print(uq_type) - - threshold = model["threshold"] - db_label = model["db_label"] - model_path = model.get("model_path", None) - is_debug = model.get("debug_db", False) - res = verify( - use_device, - num_inputs, - num_outputs, - model_path, - data_type, - uq_type, - threshold, - -1, - num_elements, - db_type, - fs_path, - db_label, - is_debug, - ) - if res != 0: - return res - print("[Success] Model", m) - return 0 - - -if __name__ == "__main__": - if "AMS_OBJECTS" in os.environ: - sys.exit(from_json(sys.argv[1:])) - sys.exit(from_cli(sys.argv[1:])) diff --git a/tests/AMSlib/verify_rmq.py b/tests/AMSlib/verify_rmq.py deleted file mode 100644 index 6917e8cd..00000000 --- a/tests/AMSlib/verify_rmq.py +++ /dev/null @@ -1,111 +0,0 @@ -import sys -import json -from pathlib import Path -import os -import numpy as np - -from ams.rmq import BlockingClient, default_ams_callback - -def verify( - use_device, - num_inputs, - num_outputs, - data_type, - num_iterations, - num_elements, - rmq_json, - timeout = None, - domain_test = "rmq_db_no_model" # defined in ams_rmq_env.cpp -): - host = rmq_json["service-host"] - vhost = rmq_json["rabbitmq-vhost"] - port = rmq_json["service-port"] - user = rmq_json["rabbitmq-user"] - password = rmq_json["rabbitmq-password"] - queue = 
rmq_json["rabbitmq-queue-physics"] - cert = None - if "rabbitmq-cert" in rmq_json: - cert = rmq_json["rabbitmq-cert"] - cert = None if cert == "" else cert - - dtype = 4 - if data_type == "double": - dtype = 8 - - with BlockingClient(host, port, vhost, user, password, cert, default_ams_callback) as client: - with client.connect(queue) as channel: - msgs = channel.receive(n_msg = num_iterations, timeout = timeout) - - assert len(msgs) == num_iterations, f"Received incorrect number of messsages ({len(msgs)}): expected #msgs ({num_iterations})" - - expected_input = np.array([[0., 0.], - [1., 1.], - [2., 2.], - [3., 3.], - [4., 4.], - [5., 5.], - [6., 6.], - [7., 7.], - [8., 8.], - [9., 9.]] - ) - - expected_output = np.array([[ 0., 0.], - [ 2., 2.], - [ 4., 4.], - [ 6., 6.], - [ 8., 8.], - [10., 10.], - [12., 12.], - [14., 14.], - [16., 16.], - [18., 18.]] - ) - - for i, msg in enumerate(msgs): - domain, input_data, output_data = msg.decode() - assert msg.num_elements == num_elements, f"Message #{i}: incorrect #elements ({msg.num_element}) vs. expected #elem {num_elements})" - assert msg.input_dim == num_inputs, f"Message #{i}: incorrect #inputs ({msg.input_dim}) vs. expected #inputs {num_inputs})" - assert msg.output_dim == num_outputs, f"Message #{i}: incorrect #outputs ({msg.output_dim}) vs. expected #outputs {num_outputs})" - assert msg.dtype_byte == dtype, f"Message #{i}: incorrect datatype ({msg.dtype_byte} bytes) vs. 
expected type {dtype} bytes)" - assert domain == domain_test, f"Message #{i}: incorrect domain name (got {domain}) expected rmq_db_no_model)" - assert np.array_equal(input_data, expected_input), f"Message #{i}: incorrect incorrect input data" - assert np.array_equal(output_data, expected_output), f"Message #{i}: incorrect incorrect output data" - - return 0 - -def from_json(argv): - use_device = int(argv[0]) - num_inputs = int(argv[1]) - num_outputs = int(argv[2]) - data_type = argv[3] - num_iterations = int(argv[4]) - num_elements = int(argv[5]) - - env_file = Path(os.environ["AMS_OBJECTS"]) - if not env_file.exists(): - print("Environment file does not exist") - return -1 - - with open(env_file, "r") as fd: - rmq_json = json.load(fd) - - res = verify( - use_device, - num_inputs, - num_outputs, - data_type, - num_iterations, - num_elements, - rmq_json["db"]["rmq_config"], - timeout = 20 # in seconds - ) - if res != 0: - return res - print("[Success] rmq test received") - return 0 - -if __name__ == "__main__": - if "AMS_OBJECTS" in os.environ: - sys.exit(from_json(sys.argv[1:])) - sys.exit(1) diff --git a/tests/AMSlib/wf/CMakeLists.txt b/tests/AMSlib/wf/CMakeLists.txt new file mode 100644 index 00000000..4160b6d1 --- /dev/null +++ b/tests/AMSlib/wf/CMakeLists.txt @@ -0,0 +1,192 @@ +function(ADD_WORKFLOW_UNIT_TEST name exec) + string(JOIN " " args ${ARGN}) + add_test(NAME ${name} COMMAND bash -c "rm -f ${CMAKE_CURRENT_BINARY_DIR}/db_label*.h5; ${CMAKE_CURRENT_BINARY_DIR}/${exec} ${args}") + set_tests_properties(${name} PROPERTIES LABELS WORKFLOW_UNIT_TEST) +endfunction() + +function(BUILD_UNIT_TEST exe source) + add_executable(${exe} ${source}) + + target_include_directories(${exe} PRIVATE ${CMAKE_SOURCE_DIR}/src/AMSlib/) + target_include_directories(${exe} PRIVATE ${CMAKE_BINARY_DIR}/include/) + target_link_libraries(${exe} PRIVATE ${AMS_APP_LIBRARIES} stdc++fs AMS torch) + target_compile_definitions(${exe} PRIVATE ${AMS_APP_DEFINES}) + + if (WITH_HDF5) + 
target_link_libraries(${exe} PRIVATE ${AMS_HDF5_TARGET}) + endif() + + + if(WITH_CUDA) + target_link_libraries(${exe} PRIVATE CUDA::cudart) + endif() + + if (WITH_CALIPER) + target_link_libraries(${exe} PRIVATE caliper) + endif() + + if (WITH_RMQ) + target_link_libraries(${exe} PRIVATE amqpcpp) + if (OPENSSL_FOUND) + target_link_libraries(${exe} PRIVATE OpenSSL::SSL OpenSSL::Crypto) + endif() + # NOTE: We set here the event/event pthreads as public. As there is no easy way + # to do a find package(libevent) and RMQ is not exposing that properly. + target_link_libraries(${exe} PRIVATE ${LIBEVENT_LIBRARY} ${LIBEVENT_THREAD}) + endif() + + if (WITH_MPI) + target_link_libraries(${exe} PRIVATE MPI::MPI_CXX) + endif() +endfunction() + +BUILD_UNIT_TEST(domain_to_application domain_to_application.cpp) +BUILD_UNIT_TEST(subselect_tensors subselect_tensors.cpp) +BUILD_UNIT_TEST(scatter_physics scatter_physics.cpp) +if (WITH_HDF5) +BUILD_UNIT_TEST(evaluate_in_and_outs evaluate_in_and_outs.cpp) +endif() + +ADD_WORKFLOW_UNIT_TEST(WORKFLOW::HOST::DomainToApplication::FLOAT::FLOAT domain_to_application float float cpu) +ADD_WORKFLOW_UNIT_TEST(WORKFLOW::HOST::DomainToApplication::DOUBLE::FLOAT domain_to_application double float cpu) +ADD_WORKFLOW_UNIT_TEST(WORKFLOW::HOST::DomainToApplication::FLOAT::DOUBLE domain_to_application float double cpu) +ADD_WORKFLOW_UNIT_TEST(WORKFLOW::HOST::DomainToApplication::DOUBLE::DOUBLE domain_to_application double double cpu) + +ADD_WORKFLOW_UNIT_TEST(WORKFLOW::HOST::SubSelect::Double subselect_tensors double cpu) +ADD_WORKFLOW_UNIT_TEST(WORKFLOW::HOST::SubSelect::Float subselect_tensors float cpu) + +ADD_WORKFLOW_UNIT_TEST(WORKFLOW::HOST::ScatterPhysics::Double scatter_physics double cpu) +ADD_WORKFLOW_UNIT_TEST(WORKFLOW::HOST::ScatterPhysics::Float scatter_physics float cpu) + +if (WITH_HDF5) +ADD_WORKFLOW_UNIT_TEST(WORKFLOW::PhHOST::MLHost::Evaluate::InAndOuts::PhFloat::MLFloat::AllPhysics evaluate_in_and_outs float cpu 
${TORCH_MODEL_DIR}/single_cpu_duq_max.pt "duq_max" 0.0 "1,8" "1,8" ${CMAKE_CURRENT_BINARY_DIR}/ "0") +ADD_WORKFLOW_UNIT_TEST(WORKFLOW::PhHOST::MLHost::Evaluate::InInoutsAndOuts::PhFloat::MLFloat::AllPhysics evaluate_in_and_outs float cpu ${TORCH_MODEL_DIR}/single_cpu_duq_max.pt "duq_max" 0.0 "1,8" "1,8" ${CMAKE_CURRENT_BINARY_DIR}/ "1") +ADD_WORKFLOW_UNIT_TEST(WORKFLOW::PhHOST::MLHost::Evaluate::Inouts::PhFloat::MLFloat::AllPhysics evaluate_in_and_outs float cpu ${TORCH_MODEL_DIR}/single_cpu_duq_max.pt "duq_max" 0.0 "1,8" "1,8" ${CMAKE_CURRENT_BINARY_DIR}/ "8") +ADD_WORKFLOW_UNIT_TEST(WORKFLOW::PhHOST::MLHost::Evaluate::InAndOuts::PhFloat::MLDouble::AllPhysics evaluate_in_and_outs float cpu ${TORCH_MODEL_DIR}/double_cpu_duq_max.pt "duq_max" 0.0 "1,8" "1,8" ${CMAKE_CURRENT_BINARY_DIR}/ "0") +ADD_WORKFLOW_UNIT_TEST(WORKFLOW::PhHOST::MLHost::Evaluate::InInoutsAndOuts::PhFloat::MLDouble::AllPhysics evaluate_in_and_outs float cpu ${TORCH_MODEL_DIR}/double_cpu_duq_max.pt "duq_max" 0.0 "1,8" "1,8" ${CMAKE_CURRENT_BINARY_DIR}/ "1") +ADD_WORKFLOW_UNIT_TEST(WORKFLOW::PhHOST::MLHost::Evaluate::Inouts::PhFloat::MLDouble::AllPhysics evaluate_in_and_outs float cpu ${TORCH_MODEL_DIR}/double_cpu_duq_max.pt "duq_max" 0.0 "1,8" "1,8" ${CMAKE_CURRENT_BINARY_DIR}/ "8") +ADD_WORKFLOW_UNIT_TEST(WORKFLOW::PhHOST::MLHost::Evaluate::InAndOuts::PhDouble::MLDouble::AllPhysics evaluate_in_and_outs double cpu ${TORCH_MODEL_DIR}/double_cpu_duq_max.pt "duq_max" 0.0 "1,8" "1,8" ${CMAKE_CURRENT_BINARY_DIR}/ "0") +ADD_WORKFLOW_UNIT_TEST(WORKFLOW::PhHOST::MLHost::Evaluate::InInoutsAndOuts::PhDouble::MLDouble::AllPhysics evaluate_in_and_outs double cpu ${TORCH_MODEL_DIR}/double_cpu_duq_max.pt "duq_max" 0.0 "1,8" "1,8" ${CMAKE_CURRENT_BINARY_DIR}/ "1") +ADD_WORKFLOW_UNIT_TEST(WORKFLOW::PhHOST::MLHost::Evaluate::Inouts::PhDouble::MLDouble::AllPhysics evaluate_in_and_outs double cpu ${TORCH_MODEL_DIR}/double_cpu_duq_max.pt "duq_max" 0.0 "1,8" "1,8" ${CMAKE_CURRENT_BINARY_DIR}/ "8") 
+ADD_WORKFLOW_UNIT_TEST(WORKFLOW::PhHOST::MLHost::Evaluate::InAndOuts::PhDouble::MLFloat::AllPhysics evaluate_in_and_outs double cpu ${TORCH_MODEL_DIR}/single_cpu_duq_max.pt "duq_max" 0.0 "1,8" "1,8" ${CMAKE_CURRENT_BINARY_DIR}/ "0") +ADD_WORKFLOW_UNIT_TEST(WORKFLOW::PhHOST::MLHost::Evaluate::InInoutsAndOuts::PhDouble::MLFloat::AllPhysics evaluate_in_and_outs double cpu ${TORCH_MODEL_DIR}/single_cpu_duq_max.pt "duq_max" 0.0 "1,8" "1,8" ${CMAKE_CURRENT_BINARY_DIR}/ "1") +ADD_WORKFLOW_UNIT_TEST(WORKFLOW::PhHOST::MLHost::Evaluate::Inouts::PhDouble::MLFloat::AllPhysics evaluate_in_and_outs double cpu ${TORCH_MODEL_DIR}/single_cpu_duq_max.pt "duq_max" 0.0 "1,8" "1,8" ${CMAKE_CURRENT_BINARY_DIR}/ "8") + +ADD_WORKFLOW_UNIT_TEST(WORKFLOW::PhHOST::MLHost::Evaluate::InAndOuts::PhFloat::MLFloat::BothModelPhysics evaluate_in_and_outs float cpu ${TORCH_MODEL_DIR}/single_cpu_duq_max.pt "duq_max" 0.5 "1,8" "1,8" ${CMAKE_CURRENT_BINARY_DIR}/ "0") +ADD_WORKFLOW_UNIT_TEST(WORKFLOW::PhHOST::MLHost::Evaluate::InInoutsAndOuts::PhFloat::MLFloat::BothModelPhysics evaluate_in_and_outs float cpu ${TORCH_MODEL_DIR}/single_cpu_duq_max.pt "duq_max" 0.5 "1,8" "1,8" ${CMAKE_CURRENT_BINARY_DIR}/ "1") +ADD_WORKFLOW_UNIT_TEST(WORKFLOW::PhHOST::MLHost::Evaluate::Inouts::PhFloat::MLFloat::BothModelPhysics evaluate_in_and_outs float cpu ${TORCH_MODEL_DIR}/single_cpu_duq_max.pt "duq_max" 0.5 "1,8" "1,8" ${CMAKE_CURRENT_BINARY_DIR}/ "8") +ADD_WORKFLOW_UNIT_TEST(WORKFLOW::PhHOST::MLHost::Evaluate::InAndOuts::PhFloat::MLDouble::BothModelPhysics evaluate_in_and_outs float cpu ${TORCH_MODEL_DIR}/double_cpu_duq_max.pt "duq_max" 0.5 "1,8" "1,8" ${CMAKE_CURRENT_BINARY_DIR}/ "0") +ADD_WORKFLOW_UNIT_TEST(WORKFLOW::PhHOST::MLHost::Evaluate::InInoutsAndOuts::PhFloat::MLDouble::BothModelPhysics evaluate_in_and_outs float cpu ${TORCH_MODEL_DIR}/double_cpu_duq_max.pt "duq_max" 0.5 "1,8" "1,8" ${CMAKE_CURRENT_BINARY_DIR}/ "1") 
+ADD_WORKFLOW_UNIT_TEST(WORKFLOW::PhHOST::MLHost::Evaluate::Inouts::PhFloat::MLDouble::BothModelPhysics evaluate_in_and_outs float cpu ${TORCH_MODEL_DIR}/double_cpu_duq_max.pt "duq_max" 0.5 "1,8" "1,8" ${CMAKE_CURRENT_BINARY_DIR}/ "8") +ADD_WORKFLOW_UNIT_TEST(WORKFLOW::PhHOST::MLHost::Evaluate::InAndOuts::PhDouble::MLDouble::BothModelPhysics evaluate_in_and_outs double cpu ${TORCH_MODEL_DIR}/double_cpu_duq_max.pt "duq_max" 0.5 "1,8" "1,8" ${CMAKE_CURRENT_BINARY_DIR}/ "0") +ADD_WORKFLOW_UNIT_TEST(WORKFLOW::PhHOST::MLHost::Evaluate::InInoutsAndOuts::PhDouble::MLDouble::BothModelPhysics evaluate_in_and_outs double cpu ${TORCH_MODEL_DIR}/double_cpu_duq_max.pt "duq_max" 0.5 "1,8" "1,8" ${CMAKE_CURRENT_BINARY_DIR}/ "1") +ADD_WORKFLOW_UNIT_TEST(WORKFLOW::PhHOST::MLHost::Evaluate::Inouts::PhDouble::MLDouble::BothModelPhysics evaluate_in_and_outs double cpu ${TORCH_MODEL_DIR}/double_cpu_duq_max.pt "duq_max" 0.5 "1,8" "1,8" ${CMAKE_CURRENT_BINARY_DIR}/ "8") +ADD_WORKFLOW_UNIT_TEST(WORKFLOW::PhHOST::MLHost::Evaluate::InAndOuts::PhDouble::MLFloat::BothModelPhysics evaluate_in_and_outs double cpu ${TORCH_MODEL_DIR}/single_cpu_duq_max.pt "duq_max" 0.5 "1,8" "1,8" ${CMAKE_CURRENT_BINARY_DIR}/ "0") +ADD_WORKFLOW_UNIT_TEST(WORKFLOW::PhHOST::MLHost::Evaluate::InInoutsAndOuts::PhDouble::MLFloat::BothModelPhysics evaluate_in_and_outs double cpu ${TORCH_MODEL_DIR}/single_cpu_duq_max.pt "duq_max" 0.5 "1,8" "1,8" ${CMAKE_CURRENT_BINARY_DIR}/ "1") +ADD_WORKFLOW_UNIT_TEST(WORKFLOW::PhHOST::MLHost::Evaluate::Inouts::PhDouble::MLFloat::BothModelPhysics evaluate_in_and_outs double cpu ${TORCH_MODEL_DIR}/single_cpu_duq_max.pt "duq_max" 0.5 "1,8" "1,8" ${CMAKE_CURRENT_BINARY_DIR}/ "8") + +ADD_WORKFLOW_UNIT_TEST(WORKFLOW::PhHOST::MLHost::Evaluate::InAndOuts::PhFloat::MLFloat::AllModel evaluate_in_and_outs float cpu ${TORCH_MODEL_DIR}/single_cpu_duq_max.pt "duq_max" 1.0 "1,8" "1,8" ${CMAKE_CURRENT_BINARY_DIR}/ "0") 
+ADD_WORKFLOW_UNIT_TEST(WORKFLOW::PhHOST::MLHost::Evaluate::InInoutsAndOuts::PhFloat::MLFloat::AllModel evaluate_in_and_outs float cpu ${TORCH_MODEL_DIR}/single_cpu_duq_max.pt "duq_max" 1.0 "1,8" "1,8" ${CMAKE_CURRENT_BINARY_DIR}/ "1") +ADD_WORKFLOW_UNIT_TEST(WORKFLOW::PhHOST::MLHost::Evaluate::Inouts::PhFloat::MLFloat::AllModel evaluate_in_and_outs float cpu ${TORCH_MODEL_DIR}/single_cpu_duq_max.pt "duq_max" 1.0 "1,8" "1,8" ${CMAKE_CURRENT_BINARY_DIR}/ "8") +ADD_WORKFLOW_UNIT_TEST(WORKFLOW::PhHOST::MLHost::Evaluate::InAndOuts::PhFloat::MLDouble::AllModel evaluate_in_and_outs float cpu ${TORCH_MODEL_DIR}/double_cpu_duq_max.pt "duq_max" 1.0 "1,8" "1,8" ${CMAKE_CURRENT_BINARY_DIR}/ "0") +ADD_WORKFLOW_UNIT_TEST(WORKFLOW::PhHOST::MLHost::Evaluate::InInoutsAndOuts::PhFloat::MLDouble::AllModel evaluate_in_and_outs float cpu ${TORCH_MODEL_DIR}/double_cpu_duq_max.pt "duq_max" 1.0 "1,8" "1,8" ${CMAKE_CURRENT_BINARY_DIR}/ "1") +ADD_WORKFLOW_UNIT_TEST(WORKFLOW::PhHOST::MLHost::Evaluate::Inouts::PhFloat::MLDouble::AllModel evaluate_in_and_outs float cpu ${TORCH_MODEL_DIR}/double_cpu_duq_max.pt "duq_max" 1.0 "1,8" "1,8" ${CMAKE_CURRENT_BINARY_DIR}/ "8") +ADD_WORKFLOW_UNIT_TEST(WORKFLOW::PhHOST::MLHost::Evaluate::InAndOuts::PhDouble::MLDouble::AllModel evaluate_in_and_outs double cpu ${TORCH_MODEL_DIR}/double_cpu_duq_max.pt "duq_max" 1.0 "1,8" "1,8" ${CMAKE_CURRENT_BINARY_DIR}/ "0") +ADD_WORKFLOW_UNIT_TEST(WORKFLOW::PhHOST::MLHost::Evaluate::InInoutsAndOuts::PhDouble::MLDouble::AllModel evaluate_in_and_outs double cpu ${TORCH_MODEL_DIR}/double_cpu_duq_max.pt "duq_max" 1.0 "1,8" "1,8" ${CMAKE_CURRENT_BINARY_DIR}/ "1") +ADD_WORKFLOW_UNIT_TEST(WORKFLOW::PhHOST::MLHost::Evaluate::Inouts::PhDouble::MLDouble::AllModel evaluate_in_and_outs double cpu ${TORCH_MODEL_DIR}/double_cpu_duq_max.pt "duq_max" 1.0 "1,8" "1,8" ${CMAKE_CURRENT_BINARY_DIR}/ "8") +ADD_WORKFLOW_UNIT_TEST(WORKFLOW::PhHOST::MLHost::Evaluate::InAndOuts::PhDouble::MLFloat::AllModel evaluate_in_and_outs double cpu 
${TORCH_MODEL_DIR}/single_cpu_duq_max.pt "duq_max" 1.0 "1,8" "1,8" ${CMAKE_CURRENT_BINARY_DIR}/ "0") +ADD_WORKFLOW_UNIT_TEST(WORKFLOW::PhHOST::MLHost::Evaluate::InInoutsAndOuts::PhDouble::MLFloat::AllModel evaluate_in_and_outs double cpu ${TORCH_MODEL_DIR}/single_cpu_duq_max.pt "duq_max" 1.0 "1,8" "1,8" ${CMAKE_CURRENT_BINARY_DIR}/ "1") +ADD_WORKFLOW_UNIT_TEST(WORKFLOW::PhHOST::MLHost::Evaluate::Inouts::PhDouble::MLFloat::AllModel evaluate_in_and_outs double cpu ${TORCH_MODEL_DIR}/single_cpu_duq_max.pt "duq_max" 1.0 "1,8" "1,8" ${CMAKE_CURRENT_BINARY_DIR}/ "8") +endif() + +if (WITH_CUDA) +ADD_WORKFLOW_UNIT_TEST(WORKFLOW::DEVICE::DomainToApplication::FLOAT::FLOAT domain_to_application float float cuda) +ADD_WORKFLOW_UNIT_TEST(WORKFLOW::DEVICE::DomainToApplication::DOUBLE::FLOAT domain_to_application double float cuda) +ADD_WORKFLOW_UNIT_TEST(WORKFLOW::DEVICE::DomainToApplication::FLOAT::DOUBLE domain_to_application float double cuda) +ADD_WORKFLOW_UNIT_TEST(WORKFLOW::DEVICE::DomainToApplication::DOUBLE::DOUBLE domain_to_application double double cuda) + +ADD_WORKFLOW_UNIT_TEST(WORKFLOW::DEVICE::SubSelect::Double subselect_tensors double cuda) +ADD_WORKFLOW_UNIT_TEST(WORKFLOW::DEVICE::SubSelect::Float subselect_tensors float cuda) + +ADD_WORKFLOW_UNIT_TEST(WORKFLOW::DEVICE::ScatterPhysics::Double scatter_physics double cuda) +ADD_WORKFLOW_UNIT_TEST(WORKFLOW::DEVICE::ScatterPhysics::Float scatter_physics float cuda) + +if (WITH_HDF5) +ADD_WORKFLOW_UNIT_TEST(WORKFLOW::PhHOST::MLDevice::Evaluate::InAndOuts::PhFloat::MLFloat::AllPhysics evaluate_in_and_outs float cpu ${TORCH_MODEL_DIR}/single_gpu_duq_max.pt "duq_max" 0.0 "1,8" "1,8" ${CMAKE_CURRENT_BINARY_DIR}/ "0") +ADD_WORKFLOW_UNIT_TEST(WORKFLOW::PhHOST::MLDevice::Evaluate::InInoutsAndOuts::PhFloat::MLFloat::AllPhysics evaluate_in_and_outs float cpu ${TORCH_MODEL_DIR}/single_gpu_duq_max.pt "duq_max" 0.0 "1,8" "1,8" ${CMAKE_CURRENT_BINARY_DIR}/ "1") 
+ADD_WORKFLOW_UNIT_TEST(WORKFLOW::PhHOST::MLDevice::Evaluate::Inouts::PhFloat::MLFloat::AllPhysics evaluate_in_and_outs float cpu ${TORCH_MODEL_DIR}/single_gpu_duq_max.pt "duq_max" 0.0 "1,8" "1,8" ${CMAKE_CURRENT_BINARY_DIR}/ "8") +ADD_WORKFLOW_UNIT_TEST(WORKFLOW::PhHOST::MLDevice::Evaluate::InAndOuts::PhFloat::MLDouble::AllPhysics evaluate_in_and_outs float cpu ${TORCH_MODEL_DIR}/double_gpu_duq_max.pt "duq_max" 0.0 "1,8" "1,8" ${CMAKE_CURRENT_BINARY_DIR}/ "0") +ADD_WORKFLOW_UNIT_TEST(WORKFLOW::PhHOST::MLDevice::Evaluate::InInoutsAndOuts::PhFloat::MLDouble::AllPhysics evaluate_in_and_outs float cpu ${TORCH_MODEL_DIR}/double_gpu_duq_max.pt "duq_max" 0.0 "1,8" "1,8" ${CMAKE_CURRENT_BINARY_DIR}/ "1") +ADD_WORKFLOW_UNIT_TEST(WORKFLOW::PhHOST::MLDevice::Evaluate::Inouts::PhFloat::MLDouble::AllPhysics evaluate_in_and_outs float cpu ${TORCH_MODEL_DIR}/double_gpu_duq_max.pt "duq_max" 0.0 "1,8" "1,8" ${CMAKE_CURRENT_BINARY_DIR}/ "8") +ADD_WORKFLOW_UNIT_TEST(WORKFLOW::PhHOST::MLDevice::Evaluate::InAndOuts::PhDouble::MLDouble::AllPhysics evaluate_in_and_outs double cpu ${TORCH_MODEL_DIR}/double_gpu_duq_max.pt "duq_max" 0.0 "1,8" "1,8" ${CMAKE_CURRENT_BINARY_DIR}/ "0") +ADD_WORKFLOW_UNIT_TEST(WORKFLOW::PhHOST::MLDevice::Evaluate::InInoutsAndOuts::PhDouble::MLDouble::AllPhysics evaluate_in_and_outs double cpu ${TORCH_MODEL_DIR}/double_gpu_duq_max.pt "duq_max" 0.0 "1,8" "1,8" ${CMAKE_CURRENT_BINARY_DIR}/ "1") +ADD_WORKFLOW_UNIT_TEST(WORKFLOW::PhHOST::MLDevice::Evaluate::Inouts::PhDouble::MLDouble::AllPhysics evaluate_in_and_outs double cpu ${TORCH_MODEL_DIR}/double_gpu_duq_max.pt "duq_max" 0.0 "1,8" "1,8" ${CMAKE_CURRENT_BINARY_DIR}/ "8") +ADD_WORKFLOW_UNIT_TEST(WORKFLOW::PhHOST::MLDevice::Evaluate::InAndOuts::PhDouble::MLFloat::AllPhysics evaluate_in_and_outs double cpu ${TORCH_MODEL_DIR}/single_gpu_duq_max.pt "duq_max" 0.0 "1,8" "1,8" ${CMAKE_CURRENT_BINARY_DIR}/ "0") +ADD_WORKFLOW_UNIT_TEST(WORKFLOW::PhHOST::MLDevice::Evaluate::InInoutsAndOuts::PhDouble::MLFloat::AllPhysics 
evaluate_in_and_outs double cpu ${TORCH_MODEL_DIR}/single_gpu_duq_max.pt "duq_max" 0.0 "1,8" "1,8" ${CMAKE_CURRENT_BINARY_DIR}/ "1") +ADD_WORKFLOW_UNIT_TEST(WORKFLOW::PhHOST::MLDevice::Evaluate::Inouts::PhDouble::MLFloat::AllPhysics evaluate_in_and_outs double cpu ${TORCH_MODEL_DIR}/single_gpu_duq_max.pt "duq_max" 0.0 "1,8" "1,8" ${CMAKE_CURRENT_BINARY_DIR}/ "8") +ADD_WORKFLOW_UNIT_TEST(WORKFLOW::PhDevice::MLDevice::Evaluate::InAndOuts::PhFloat::MLFloat::AllPhysics evaluate_in_and_outs float gpu ${TORCH_MODEL_DIR}/single_gpu_duq_max.pt "duq_max" 0.0 "1,8" "1,8" ${CMAKE_CURRENT_BINARY_DIR}/ "0") +ADD_WORKFLOW_UNIT_TEST(WORKFLOW::PhDevice::MLDevice::Evaluate::InInoutsAndOuts::PhFloat::MLFloat::AllPhysics evaluate_in_and_outs float gpu ${TORCH_MODEL_DIR}/single_gpu_duq_max.pt "duq_max" 0.0 "1,8" "1,8" ${CMAKE_CURRENT_BINARY_DIR}/ "1") +ADD_WORKFLOW_UNIT_TEST(WORKFLOW::PhDevice::MLDevice::Evaluate::Inouts::PhFloat::MLFloat::AllPhysics evaluate_in_and_outs float gpu ${TORCH_MODEL_DIR}/single_gpu_duq_max.pt "duq_max" 0.0 "1,8" "1,8" ${CMAKE_CURRENT_BINARY_DIR}/ "8") +ADD_WORKFLOW_UNIT_TEST(WORKFLOW::PhDevice::MLDevice::Evaluate::InAndOuts::PhFloat::MLDouble::AllPhysics evaluate_in_and_outs float gpu ${TORCH_MODEL_DIR}/double_gpu_duq_max.pt "duq_max" 0.0 "1,8" "1,8" ${CMAKE_CURRENT_BINARY_DIR}/ "0") +ADD_WORKFLOW_UNIT_TEST(WORKFLOW::PhDevice::MLDevice::Evaluate::InInoutsAndOuts::PhFloat::MLDouble::AllPhysics evaluate_in_and_outs float gpu ${TORCH_MODEL_DIR}/double_gpu_duq_max.pt "duq_max" 0.0 "1,8" "1,8" ${CMAKE_CURRENT_BINARY_DIR}/ "1") +ADD_WORKFLOW_UNIT_TEST(WORKFLOW::PhDevice::MLDevice::Evaluate::Inouts::PhFloat::MLDouble::AllPhysics evaluate_in_and_outs float gpu ${TORCH_MODEL_DIR}/double_gpu_duq_max.pt "duq_max" 0.0 "1,8" "1,8" ${CMAKE_CURRENT_BINARY_DIR}/ "8") +ADD_WORKFLOW_UNIT_TEST(WORKFLOW::PhDevice::MLDevice::Evaluate::InAndOuts::PhDouble::MLDouble::AllPhysics evaluate_in_and_outs double gpu ${TORCH_MODEL_DIR}/double_gpu_duq_max.pt "duq_max" 0.0 "1,8" "1,8" 
${CMAKE_CURRENT_BINARY_DIR}/ "0") +ADD_WORKFLOW_UNIT_TEST(WORKFLOW::PhDevice::MLDevice::Evaluate::InInoutsAndOuts::PhDouble::MLDouble::AllPhysics evaluate_in_and_outs double gpu ${TORCH_MODEL_DIR}/double_gpu_duq_max.pt "duq_max" 0.0 "1,8" "1,8" ${CMAKE_CURRENT_BINARY_DIR}/ "1") +ADD_WORKFLOW_UNIT_TEST(WORKFLOW::PhDevice::MLDevice::Evaluate::Inouts::PhDouble::MLDouble::AllPhysics evaluate_in_and_outs double gpu ${TORCH_MODEL_DIR}/double_gpu_duq_max.pt "duq_max" 0.0 "1,8" "1,8" ${CMAKE_CURRENT_BINARY_DIR}/ "8") +ADD_WORKFLOW_UNIT_TEST(WORKFLOW::PhDevice::MLDevice::Evaluate::InAndOuts::PhDouble::MLFloat::AllPhysics evaluate_in_and_outs double gpu ${TORCH_MODEL_DIR}/single_gpu_duq_max.pt "duq_max" 0.0 "1,8" "1,8" ${CMAKE_CURRENT_BINARY_DIR}/ "0") +ADD_WORKFLOW_UNIT_TEST(WORKFLOW::PhDevice::MLDevice::Evaluate::InInoutsAndOuts::PhDouble::MLFloat::AllPhysics evaluate_in_and_outs double gpu ${TORCH_MODEL_DIR}/single_gpu_duq_max.pt "duq_max" 0.0 "1,8" "1,8" ${CMAKE_CURRENT_BINARY_DIR}/ "1") +ADD_WORKFLOW_UNIT_TEST(WORKFLOW::PhDevice::MLDevice::Evaluate::Inouts::PhDouble::MLFloat::AllPhysics evaluate_in_and_outs double gpu ${TORCH_MODEL_DIR}/single_gpu_duq_max.pt "duq_max" 0.0 "1,8" "1,8" ${CMAKE_CURRENT_BINARY_DIR}/ "8") + +ADD_WORKFLOW_UNIT_TEST(WORKFLOW::PhHOST::MLDevice::Evaluate::InAndOuts::PhFloat::MLFloat::BothModelPhysics evaluate_in_and_outs float cpu ${TORCH_MODEL_DIR}/single_gpu_duq_max.pt "duq_max" 0.5 "1,8" "1,8" ${CMAKE_CURRENT_BINARY_DIR}/ "0") +ADD_WORKFLOW_UNIT_TEST(WORKFLOW::PhHOST::MLDevice::Evaluate::InInoutsAndOuts::PhFloat::MLFloat::BothModelPhysics evaluate_in_and_outs float cpu ${TORCH_MODEL_DIR}/single_gpu_duq_max.pt "duq_max" 0.5 "1,8" "1,8" ${CMAKE_CURRENT_BINARY_DIR}/ "1") +ADD_WORKFLOW_UNIT_TEST(WORKFLOW::PhHOST::MLDevice::Evaluate::Inouts::PhFloat::MLFloat::BothModelPhysics evaluate_in_and_outs float cpu ${TORCH_MODEL_DIR}/single_gpu_duq_max.pt "duq_max" 0.5 "1,8" "1,8" ${CMAKE_CURRENT_BINARY_DIR}/ "8") 
+ADD_WORKFLOW_UNIT_TEST(WORKFLOW::PhHOST::MLDevice::Evaluate::InAndOuts::PhFloat::MLDouble::BothModelPhysics evaluate_in_and_outs float cpu ${TORCH_MODEL_DIR}/double_gpu_duq_max.pt "duq_max" 0.5 "1,8" "1,8" ${CMAKE_CURRENT_BINARY_DIR}/ "0") +ADD_WORKFLOW_UNIT_TEST(WORKFLOW::PhHOST::MLDevice::Evaluate::InInoutsAndOuts::PhFloat::MLDouble::BothModelPhysics evaluate_in_and_outs float cpu ${TORCH_MODEL_DIR}/double_gpu_duq_max.pt "duq_max" 0.5 "1,8" "1,8" ${CMAKE_CURRENT_BINARY_DIR}/ "1") +ADD_WORKFLOW_UNIT_TEST(WORKFLOW::PhHOST::MLDevice::Evaluate::Inouts::PhFloat::MLDouble::BothModelPhysics evaluate_in_and_outs float cpu ${TORCH_MODEL_DIR}/double_gpu_duq_max.pt "duq_max" 0.5 "1,8" "1,8" ${CMAKE_CURRENT_BINARY_DIR}/ "8") +ADD_WORKFLOW_UNIT_TEST(WORKFLOW::PhHOST::MLDevice::Evaluate::InAndOuts::PhDouble::MLDouble::BothModelPhysics evaluate_in_and_outs double cpu ${TORCH_MODEL_DIR}/double_gpu_duq_max.pt "duq_max" 0.5 "1,8" "1,8" ${CMAKE_CURRENT_BINARY_DIR}/ "0") +ADD_WORKFLOW_UNIT_TEST(WORKFLOW::PhHOST::MLDevice::Evaluate::InInoutsAndOuts::PhDouble::MLDouble::BothModelPhysics evaluate_in_and_outs double cpu ${TORCH_MODEL_DIR}/double_gpu_duq_max.pt "duq_max" 0.5 "1,8" "1,8" ${CMAKE_CURRENT_BINARY_DIR}/ "1") +ADD_WORKFLOW_UNIT_TEST(WORKFLOW::PhHOST::MLDevice::Evaluate::Inouts::PhDouble::MLDouble::BothModelPhysics evaluate_in_and_outs double cpu ${TORCH_MODEL_DIR}/double_gpu_duq_max.pt "duq_max" 0.5 "1,8" "1,8" ${CMAKE_CURRENT_BINARY_DIR}/ "8") +ADD_WORKFLOW_UNIT_TEST(WORKFLOW::PhHOST::MLDevice::Evaluate::InAndOuts::PhDouble::MLFloat::BothModelPhysics evaluate_in_and_outs double cpu ${TORCH_MODEL_DIR}/single_gpu_duq_max.pt "duq_max" 0.5 "1,8" "1,8" ${CMAKE_CURRENT_BINARY_DIR}/ "0") +ADD_WORKFLOW_UNIT_TEST(WORKFLOW::PhHOST::MLDevice::Evaluate::InInoutsAndOuts::PhDouble::MLFloat::BothModelPhysics evaluate_in_and_outs double cpu ${TORCH_MODEL_DIR}/single_gpu_duq_max.pt "duq_max" 0.5 "1,8" "1,8" ${CMAKE_CURRENT_BINARY_DIR}/ "1") 
+ADD_WORKFLOW_UNIT_TEST(WORKFLOW::PhHOST::MLDevice::Evaluate::Inouts::PhDouble::MLFloat::BothModelPhysics evaluate_in_and_outs double cpu ${TORCH_MODEL_DIR}/single_gpu_duq_max.pt "duq_max" 0.5 "1,8" "1,8" ${CMAKE_CURRENT_BINARY_DIR}/ "8") +ADD_WORKFLOW_UNIT_TEST(WORKFLOW::PhDevice::MLDevice::Evaluate::InAndOuts::PhFloat::MLFloat::BothModelPhysics evaluate_in_and_outs float gpu ${TORCH_MODEL_DIR}/single_gpu_duq_max.pt "duq_max" 0.5 "1,8" "1,8" ${CMAKE_CURRENT_BINARY_DIR}/ "0") +ADD_WORKFLOW_UNIT_TEST(WORKFLOW::PhDevice::MLDevice::Evaluate::InInoutsAndOuts::PhFloat::MLFloat::BothModelPhysics evaluate_in_and_outs float gpu ${TORCH_MODEL_DIR}/single_gpu_duq_max.pt "duq_max" 0.5 "1,8" "1,8" ${CMAKE_CURRENT_BINARY_DIR}/ "1") +ADD_WORKFLOW_UNIT_TEST(WORKFLOW::PhDevice::MLDevice::Evaluate::Inouts::PhFloat::MLFloat::BothModelPhysics evaluate_in_and_outs float gpu ${TORCH_MODEL_DIR}/single_gpu_duq_max.pt "duq_max" 0.5 "1,8" "1,8" ${CMAKE_CURRENT_BINARY_DIR}/ "8") +ADD_WORKFLOW_UNIT_TEST(WORKFLOW::PhDevice::MLDevice::Evaluate::InAndOuts::PhFloat::MLDouble::BothModelPhysics evaluate_in_and_outs float gpu ${TORCH_MODEL_DIR}/double_gpu_duq_max.pt "duq_max" 0.5 "1,8" "1,8" ${CMAKE_CURRENT_BINARY_DIR}/ "0") +ADD_WORKFLOW_UNIT_TEST(WORKFLOW::PhDevice::MLDevice::Evaluate::InInoutsAndOuts::PhFloat::MLDouble::BothModelPhysics evaluate_in_and_outs float gpu ${TORCH_MODEL_DIR}/double_gpu_duq_max.pt "duq_max" 0.5 "1,8" "1,8" ${CMAKE_CURRENT_BINARY_DIR}/ "1") +ADD_WORKFLOW_UNIT_TEST(WORKFLOW::PhDevice::MLDevice::Evaluate::Inouts::PhFloat::MLDouble::BothModelPhysics evaluate_in_and_outs float gpu ${TORCH_MODEL_DIR}/double_gpu_duq_max.pt "duq_max" 0.5 "1,8" "1,8" ${CMAKE_CURRENT_BINARY_DIR}/ "8") +ADD_WORKFLOW_UNIT_TEST(WORKFLOW::PhDevice::MLDevice::Evaluate::InAndOuts::PhDouble::MLDouble::BothModelPhysics evaluate_in_and_outs double gpu ${TORCH_MODEL_DIR}/double_gpu_duq_max.pt "duq_max" 0.5 "1,8" "1,8" ${CMAKE_CURRENT_BINARY_DIR}/ "0") 
+ADD_WORKFLOW_UNIT_TEST(WORKFLOW::PhDevice::MLDevice::Evaluate::InInoutsAndOuts::PhDouble::MLDouble::BothModelPhysics evaluate_in_and_outs double gpu ${TORCH_MODEL_DIR}/double_gpu_duq_max.pt "duq_max" 0.5 "1,8" "1,8" ${CMAKE_CURRENT_BINARY_DIR}/ "1") +ADD_WORKFLOW_UNIT_TEST(WORKFLOW::PhDevice::MLDevice::Evaluate::Inouts::PhDouble::MLDouble::BothModelPhysics evaluate_in_and_outs double gpu ${TORCH_MODEL_DIR}/double_gpu_duq_max.pt "duq_max" 0.5 "1,8" "1,8" ${CMAKE_CURRENT_BINARY_DIR}/ "8") +ADD_WORKFLOW_UNIT_TEST(WORKFLOW::PhDevice::MLDevice::Evaluate::InAndOuts::PhDouble::MLFloat::BothModelPhysics evaluate_in_and_outs double gpu ${TORCH_MODEL_DIR}/single_gpu_duq_max.pt "duq_max" 0.5 "1,8" "1,8" ${CMAKE_CURRENT_BINARY_DIR}/ "0") +ADD_WORKFLOW_UNIT_TEST(WORKFLOW::PhDevice::MLDevice::Evaluate::InInoutsAndOuts::PhDouble::MLFloat::BothModelPhysics evaluate_in_and_outs double gpu ${TORCH_MODEL_DIR}/single_gpu_duq_max.pt "duq_max" 0.5 "1,8" "1,8" ${CMAKE_CURRENT_BINARY_DIR}/ "1") +ADD_WORKFLOW_UNIT_TEST(WORKFLOW::PhDevice::MLDevice::Evaluate::Inouts::PhDouble::MLFloat::BothModelPhysics evaluate_in_and_outs double gpu ${TORCH_MODEL_DIR}/single_gpu_duq_max.pt "duq_max" 0.5 "1,8" "1,8" ${CMAKE_CURRENT_BINARY_DIR}/ "8") + +ADD_WORKFLOW_UNIT_TEST(WORKFLOW::PhHOST::MLDevice::Evaluate::InAndOuts::PhFloat::MLFloat::AllModel evaluate_in_and_outs float cpu ${TORCH_MODEL_DIR}/single_gpu_duq_max.pt "duq_max" 1.0 "1,8" "1,8" ${CMAKE_CURRENT_BINARY_DIR}/ "0") +ADD_WORKFLOW_UNIT_TEST(WORKFLOW::PhHOST::MLDevice::Evaluate::InInoutsAndOuts::PhFloat::MLFloat::AllModel evaluate_in_and_outs float cpu ${TORCH_MODEL_DIR}/single_gpu_duq_max.pt "duq_max" 1.0 "1,8" "1,8" ${CMAKE_CURRENT_BINARY_DIR}/ "1") +ADD_WORKFLOW_UNIT_TEST(WORKFLOW::PhHOST::MLDevice::Evaluate::Inouts::PhFloat::MLFloat::AllModel evaluate_in_and_outs float cpu ${TORCH_MODEL_DIR}/single_gpu_duq_max.pt "duq_max" 1.0 "1,8" "1,8" ${CMAKE_CURRENT_BINARY_DIR}/ "8") 
+ADD_WORKFLOW_UNIT_TEST(WORKFLOW::PhHOST::MLDevice::Evaluate::InAndOuts::PhFloat::MLDouble::AllModel evaluate_in_and_outs float cpu ${TORCH_MODEL_DIR}/double_gpu_duq_max.pt "duq_max" 1.0 "1,8" "1,8" ${CMAKE_CURRENT_BINARY_DIR}/ "0") +ADD_WORKFLOW_UNIT_TEST(WORKFLOW::PhHOST::MLDevice::Evaluate::InInoutsAndOuts::PhFloat::MLDouble::AllModel evaluate_in_and_outs float cpu ${TORCH_MODEL_DIR}/double_gpu_duq_max.pt "duq_max" 1.0 "1,8" "1,8" ${CMAKE_CURRENT_BINARY_DIR}/ "1") +ADD_WORKFLOW_UNIT_TEST(WORKFLOW::PhHOST::MLDevice::Evaluate::Inouts::PhFloat::MLDouble::AllModel evaluate_in_and_outs float cpu ${TORCH_MODEL_DIR}/double_gpu_duq_max.pt "duq_max" 1.0 "1,8" "1,8" ${CMAKE_CURRENT_BINARY_DIR}/ "8") +ADD_WORKFLOW_UNIT_TEST(WORKFLOW::PhHOST::MLDevice::Evaluate::InAndOuts::PhDouble::MLDouble::AllModel evaluate_in_and_outs double cpu ${TORCH_MODEL_DIR}/double_gpu_duq_max.pt "duq_max" 1.0 "1,8" "1,8" ${CMAKE_CURRENT_BINARY_DIR}/ "0") +ADD_WORKFLOW_UNIT_TEST(WORKFLOW::PhHOST::MLDevice::Evaluate::InInoutsAndOuts::PhDouble::MLDouble::AllModel evaluate_in_and_outs double cpu ${TORCH_MODEL_DIR}/double_gpu_duq_max.pt "duq_max" 1.0 "1,8" "1,8" ${CMAKE_CURRENT_BINARY_DIR}/ "1") +ADD_WORKFLOW_UNIT_TEST(WORKFLOW::PhHOST::MLDevice::Evaluate::Inouts::PhDouble::MLDouble::AllModel evaluate_in_and_outs double cpu ${TORCH_MODEL_DIR}/double_gpu_duq_max.pt "duq_max" 1.0 "1,8" "1,8" ${CMAKE_CURRENT_BINARY_DIR}/ "8") +ADD_WORKFLOW_UNIT_TEST(WORKFLOW::PhHOST::MLDevice::Evaluate::InAndOuts::PhDouble::MLFloat::AllModel evaluate_in_and_outs double cpu ${TORCH_MODEL_DIR}/single_gpu_duq_max.pt "duq_max" 1.0 "1,8" "1,8" ${CMAKE_CURRENT_BINARY_DIR}/ "0") +ADD_WORKFLOW_UNIT_TEST(WORKFLOW::PhHOST::MLDevice::Evaluate::InInoutsAndOuts::PhDouble::MLFloat::AllModel evaluate_in_and_outs double cpu ${TORCH_MODEL_DIR}/single_gpu_duq_max.pt "duq_max" 1.0 "1,8" "1,8" ${CMAKE_CURRENT_BINARY_DIR}/ "1") +ADD_WORKFLOW_UNIT_TEST(WORKFLOW::PhHOST::MLDevice::Evaluate::Inouts::PhDouble::MLFloat::AllModel 
evaluate_in_and_outs double cpu ${TORCH_MODEL_DIR}/single_gpu_duq_max.pt "duq_max" 1.0 "1,8" "1,8" ${CMAKE_CURRENT_BINARY_DIR}/ "8") +ADD_WORKFLOW_UNIT_TEST(WORKFLOW::PhDevice::MLDevice::Evaluate::InAndOuts::PhFloat::MLFloat::AllModel evaluate_in_and_outs float gpu ${TORCH_MODEL_DIR}/single_gpu_duq_max.pt "duq_max" 1.0 "1,8" "1,8" ${CMAKE_CURRENT_BINARY_DIR}/ "0") +ADD_WORKFLOW_UNIT_TEST(WORKFLOW::PhDevice::MLDevice::Evaluate::InInoutsAndOuts::PhFloat::MLFloat::AllModel evaluate_in_and_outs float gpu ${TORCH_MODEL_DIR}/single_gpu_duq_max.pt "duq_max" 1.0 "1,8" "1,8" ${CMAKE_CURRENT_BINARY_DIR}/ "1") +ADD_WORKFLOW_UNIT_TEST(WORKFLOW::PhDevice::MLDevice::Evaluate::Inouts::PhFloat::MLFloat::AllModel evaluate_in_and_outs float gpu ${TORCH_MODEL_DIR}/single_gpu_duq_max.pt "duq_max" 1.0 "1,8" "1,8" ${CMAKE_CURRENT_BINARY_DIR}/ "8") +ADD_WORKFLOW_UNIT_TEST(WORKFLOW::PhDevice::MLDevice::Evaluate::InAndOuts::PhFloat::MLDouble::AllModel evaluate_in_and_outs float gpu ${TORCH_MODEL_DIR}/double_gpu_duq_max.pt "duq_max" 1.0 "1,8" "1,8" ${CMAKE_CURRENT_BINARY_DIR}/ "0") +ADD_WORKFLOW_UNIT_TEST(WORKFLOW::PhDevice::MLDevice::Evaluate::InInoutsAndOuts::PhFloat::MLDouble::AllModel evaluate_in_and_outs float gpu ${TORCH_MODEL_DIR}/double_gpu_duq_max.pt "duq_max" 1.0 "1,8" "1,8" ${CMAKE_CURRENT_BINARY_DIR}/ "1") +ADD_WORKFLOW_UNIT_TEST(WORKFLOW::PhDevice::MLDevice::Evaluate::Inouts::PhFloat::MLDouble::AllModel evaluate_in_and_outs float gpu ${TORCH_MODEL_DIR}/double_gpu_duq_max.pt "duq_max" 1.0 "1,8" "1,8" ${CMAKE_CURRENT_BINARY_DIR}/ "8") +ADD_WORKFLOW_UNIT_TEST(WORKFLOW::PhDevice::MLDevice::Evaluate::InAndOuts::PhDouble::MLDouble::AllModel evaluate_in_and_outs double gpu ${TORCH_MODEL_DIR}/double_gpu_duq_max.pt "duq_max" 1.0 "1,8" "1,8" ${CMAKE_CURRENT_BINARY_DIR}/ "0") +ADD_WORKFLOW_UNIT_TEST(WORKFLOW::PhDevice::MLDevice::Evaluate::InInoutsAndOuts::PhDouble::MLDouble::AllModel evaluate_in_and_outs double gpu ${TORCH_MODEL_DIR}/double_gpu_duq_max.pt "duq_max" 1.0 "1,8" "1,8" 
${CMAKE_CURRENT_BINARY_DIR}/ "1") +ADD_WORKFLOW_UNIT_TEST(WORKFLOW::PhDevice::MLDevice::Evaluate::Inouts::PhDouble::MLDouble::AllModel evaluate_in_and_outs double gpu ${TORCH_MODEL_DIR}/double_gpu_duq_max.pt "duq_max" 1.0 "1,8" "1,8" ${CMAKE_CURRENT_BINARY_DIR}/ "8") +ADD_WORKFLOW_UNIT_TEST(WORKFLOW::PhDevice::MLDevice::Evaluate::InAndOuts::PhDouble::MLFloat::AllModel evaluate_in_and_outs double gpu ${TORCH_MODEL_DIR}/single_gpu_duq_max.pt "duq_max" 1.0 "1,8" "1,8" ${CMAKE_CURRENT_BINARY_DIR}/ "0") +ADD_WORKFLOW_UNIT_TEST(WORKFLOW::PhDevice::MLDevice::Evaluate::InInoutsAndOuts::PhDouble::MLFloat::AllModel evaluate_in_and_outs double gpu ${TORCH_MODEL_DIR}/single_gpu_duq_max.pt "duq_max" 1.0 "1,8" "1,8" ${CMAKE_CURRENT_BINARY_DIR}/ "1") +ADD_WORKFLOW_UNIT_TEST(WORKFLOW::PhDevice::MLDevice::Evaluate::Inouts::PhDouble::MLFloat::AllModel evaluate_in_and_outs double gpu ${TORCH_MODEL_DIR}/single_gpu_duq_max.pt "duq_max" 1.0 "1,8" "1,8" ${CMAKE_CURRENT_BINARY_DIR}/ "8") +endif() + +endif() + diff --git a/tests/AMSlib/wf/domain_to_application.cpp b/tests/AMSlib/wf/domain_to_application.cpp new file mode 100644 index 00000000..105d5981 --- /dev/null +++ b/tests/AMSlib/wf/domain_to_application.cpp @@ -0,0 +1,97 @@ +#include +#include +#include +#include + +#include +#include +#include + +#include "wf/workflow.hpp" + +std::vector generateRandomVector(int target_sum, int size) +{ + if (target_sum < size) { + throw std::invalid_argument( + "Target sum must be at least equal to the size of the vector."); + } + + std::vector result( + size, 1); // Start with each element as 1 (minimum positive integer). + target_sum -= + size; // Reduce the remaining sum by the size (since all elements are 1). 
+ + std::random_device rd; + std::mt19937 gen(0); + std::uniform_int_distribution<> dis(0, target_sum); + + // Generate random values and distribute the remaining sum + for (int i = 0; i < target_sum; ++i) { + int index = dis(gen) % size; // Pick a random index + ++result[index]; // Increment the value at the chosen index + } + + // Shuffle the vector for more randomness + std::shuffle(result.begin(), result.end(), gen); + + return result; +} + + +int main(int argc, char *argv[]) +{ + + if (argc != 4) { + std::cout << "Wrong command line\n"; + std::cout << argv[0] + << " \n"; + return -1; + } + torch::Dtype mlDType = torch::kFloat32; + torch::Dtype phDType = torch::kFloat32; + torch::DeviceType dev = c10::DeviceType::CPU; + + std::string mlType(argv[1]); + std::string phType(argv[2]); + std::string device(argv[3]); + + + if (mlType.compare("double") == 0) { + mlDType = torch::kFloat64; + } + + if (phType.compare("double") == 0) phDType = torch::kFloat64; + + if (device.compare("cuda") == 0) dev = c10::DeviceType::CUDA; + + torch::Tensor Src = + torch::rand({32, 11}, torch::TensorOptions().dtype(mlDType).device(dev)); + auto shapes = generateRandomVector(11, 8); + ams::SmallVector Dest; + auto tmp = torch::arange(0, 32, torch::kInt64) % 2; + auto Predicate = tmp.to(torch::kBool) != 0; + + + for (auto V : shapes) { + Dest.push_back( + torch::zeros({32, V}, + torch::TensorOptions().dtype(phDType).device(dev))); + } + ams::SmallVector subset(Dest.begin(), Dest.end() - 1); + int offset = + ams::AMSWorkflow::MLDomainToApplication(Src, subset, Predicate, 0); + ams::AMSWorkflow::MLDomainToApplication(Src, + {Dest[Dest.size() - 1]}, + Predicate, + offset); + + auto Input = torch::cat(Dest, Dest[0].sizes().size() - 1); + + auto result = torch::cat(Dest, 1).to(at::TensorOptions().dtype(mlDType)); + auto inverted = ~Predicate; + Src.index_put_({inverted}, torch::zeros({1, Src.size(1)}, Src.options())); + + bool close = torch::allclose(Src, result, 1e-5, 1e-8); + if (close) 
return 0; + return 1; +} diff --git a/tests/AMSlib/wf/evaluate_in_and_outs.cpp b/tests/AMSlib/wf/evaluate_in_and_outs.cpp new file mode 100644 index 00000000..1bcad5e7 --- /dev/null +++ b/tests/AMSlib/wf/evaluate_in_and_outs.cpp @@ -0,0 +1,413 @@ +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include "AMS.h" +#include "wf/workflow.hpp" + +using namespace ams; + +#define SIZE 32 + + +std::vector getDims(const std::string input, char delimiter) +{ + std::vector tokens; + std::stringstream ss(input); + std::string token; + + while (std::getline(ss, token, delimiter)) { + tokens.push_back(std::stoi(token)); + } + + return tokens; +} + + +// Function to read a dataset and compare it with the expected tensor +bool verifyDatasetContents(const std::string& fileName, + const std::string& datasetName, + torch::Tensor& expectedTensor) +{ + // Open the HDF5 file + hid_t file_id = H5Fopen(fileName.c_str(), H5F_ACC_RDONLY, H5P_DEFAULT); + if (file_id < 0) { + throw std::runtime_error("Failed to open HDF5 file."); + } + + // Open the dataset + hid_t dset_id = H5Dopen2(file_id, datasetName.c_str(), H5P_DEFAULT); + if (dset_id < 0) { + H5Fclose(file_id); + throw std::runtime_error("Failed to open dataset."); + } + + // Get the dataspace + hid_t space_id = H5Dget_space(dset_id); + if (space_id < 0) { + H5Dclose(dset_id); + H5Fclose(file_id); + throw std::runtime_error("Failed to get dataspace."); + } + + // Get the dataset dimensions + int ndims = H5Sget_simple_extent_ndims(space_id); + if (ndims < 0) { + H5Sclose(space_id); + H5Dclose(dset_id); + H5Fclose(file_id); + throw std::runtime_error("Failed to get number of dimensions."); + } + + std::vector dims(ndims); + if (H5Sget_simple_extent_dims(space_id, dims.data(), NULL) < 0) { + H5Sclose(space_id); + H5Dclose(dset_id); + H5Fclose(file_id); + throw std::runtime_error("Failed to get dataset dimensions."); + } + + // Close dataspace + H5Sclose(space_id); + + // Flatten the dataset 
dimensions into a total size + size_t totalSize = 1; + for (const auto& dim : dims) { + totalSize *= dim; + } + + // Allocate a tensor to read the dataset + auto readTensor = + torch::empty({static_cast(totalSize)}, torch::kFloat); + + // Read the dataset into the tensor + herr_t status = H5Dread(dset_id, + H5T_NATIVE_FLOAT, + H5S_ALL, + H5S_ALL, + H5P_DEFAULT, + readTensor.data_ptr()); + if (status < 0) { + H5Dclose(dset_id); + H5Fclose(file_id); + throw std::runtime_error("Failed to read dataset."); + } + + // Close dataset and file + H5Dclose(dset_id); + H5Fclose(file_id); + + // Concatenate all expected tensors into one + expectedTensor = expectedTensor.flatten(); + + // Compare the tensors + if (!torch::allclose(readTensor, expectedTensor)) { + throw std::runtime_error( + "Dataset contents do not match the expected tensors."); + } + + std::cout << "Dataset contents match the expected tensors!" << std::endl; + return true; +} + + +template +void compute(ams::AMSWorkflow& wf, + std::vector& orig_in, + std::vector& orig_inout, + std::vector& orig_out, + T& broadcastVal, + bool has_broadcast = false) +{ + + auto callBack = [&](const ams::SmallVector& pruned_ins, + ams::SmallVector& pruned_inouts, + ams::SmallVector& pruned_outs) { + int numIn = pruned_ins.size(); + int numInOut = pruned_inouts.size(); + int numOut = pruned_outs.size(); + int numElements = 0; + std::cout << "Num ins are " << numIn << "\n"; + std::cout << "Num inouts are " << numInOut << "\n"; + std::cout << "Num outs are " << numOut << "\n"; + if (pruned_ins.size() != 0) { + numElements = pruned_ins[0].shape()[0]; + } else if (pruned_inouts.size() != 0) { + numElements = pruned_inouts[0].shape()[0]; + } else { + throw std::runtime_error( + "call back should be called at least with some elements in batch " + "axis"); + } + + // I am converthing all ams - tensors to torch - tensors. This is a conveniency for testing, + // as I can execute arbitary GPU code. 
+ std::vector in; + for (auto& V : pruned_ins) { + c10::IntArrayRef shape(V.shape().begin(), V.shape().size()); + std::cout << "Pointer of in " << V.data() << "\n"; + in.push_back(torch::from_blob((void*)V.data(), + shape, + torch::TensorOptions().dtype(DType).device( + DeviceType))); + } + + std::vector inout; + for (auto& V : pruned_inouts) { + std::cout << "Pointer of inout " << V.data() << "\n"; + c10::IntArrayRef shape(V.shape().begin(), V.shape().size()); + inout.push_back(torch::from_blob( + (void*)V.data(), + shape, + torch::TensorOptions().dtype(DType).device(DeviceType))); + } + + std::vector out; + for (auto& V : pruned_outs) { + c10::IntArrayRef shape(V.shape().begin(), V.shape().size()); + std::cout << "Pointer of out " << V.data() << "\n"; + out.push_back(torch::from_blob((void*)V.data(), + shape, + torch::TensorOptions().dtype(DType).device( + DeviceType))); + } + + + torch::Tensor identity_matrix = + torch::eye(out.size() + inout.size(), + torch::TensorOptions().dtype(DType).device(DeviceType)); + + // Iterate over all elements + for (int i = 0; i < numElements; i++) { + // Create a tensor to aggregate input values + torch::Tensor aggregate = + torch::zeros({1, numIn + numInOut}, + torch::TensorOptions().dtype(DType).device(DeviceType)); + + // Fill aggregate with cumulative sums from `in` tensors + for (int j = 0; j < numIn; j++) { + aggregate[0][j] = in[j][i][0]; + } + + + // Continue filling aggregate with cumulative sums from `inout` tensors + for (int j = 0; j < numInOut; j++) { + aggregate[0][numIn + j] = inout[j][i][0]; + } + + std::cout << "Aggr:" << aggregate << "\n"; + std::cout << "IDM" << identity_matrix << "\n"; + auto res = aggregate.matmul(identity_matrix) * 13.0; + std::cout << "Res " << res << "\n"; + + // Assign to `out` tensors using modulo indexing + for (int j = 0; j < numOut; j++) { + out[j][i][0] = res[0][j]; + } + + // Update `inout` tensors using modulo indexing + for (int j = 0; j < numInOut; j++) { + std::cout << "Setting 
in out for res" << res[0] << "\n"; + inout[j][i][0] = res[0][numOut + j]; + } + } + }; + wf.evaluate(callBack, orig_in, orig_inout, orig_out); +} + + +int main(int argc, char* argv[]) +{ + + if (argc != 10) { + std::cout << "Wrong command line\n"; + std::cout << argv[0] + << " " + " " + " iShape(getDims(argv[6], ',')); + std::vector oShape(getDims(argv[7], ',')); + std::string db_path(argv[8]); + int numInOuts = std::atoi(argv[9]); + auto& db_instance = ams::db::DBManager::getInstance(); + db_instance.instantiate_fs_db(AMSDBType::AMS_HDF5, db_path); + + + AMSUQPolicy duq; + + if (duq_type.compare("duq_mean") == 0) { + duq = AMSUQPolicy::AMS_DELTAUQ_MEAN; + } else if (duq_type.compare("duq_max") == 0) { + duq = AMSUQPolicy::AMS_DELTAUQ_MAX; + } else if (duq_type.compare("random") == 0) { + duq = AMSUQPolicy::AMS_RANDOM; + } else { + std::cout << "Unknown dUQ \n"; + return 1; + } + + + if (Type.compare("double") == 0) { + DType = torch::kFloat64; + } + + if (Type.compare("double") == 0) DType = torch::kFloat64; + + if (device.compare("cuda") == 0) dev = c10::DeviceType::CUDA; + std::string domain_name("test"); + std::string db_label("db_label"); + + + auto tOptions = torch::TensorOptions().dtype(DType).device(dev); + std::string filename; + + { + ams::AMSWorkflow wf = ams::AMSWorkflow( + model_path, domain_name, db_label, threshold, duq, 0, 1); + + + filename = wf.getDBFilename(); + + // How many numInOuts are we going to have in this test + std::vector in; + std::vector inout; + std::vector out; + // Get the number of inputs for this test + int numIn = iShape[iShape.size() - 1] - numInOuts; + for (auto i = 0; i < numIn; i++) { + in.push_back(torch::ones({SIZE, 1}, tOptions)); + } + for (auto i = 0l; i < numInOuts; i++) { + inout.push_back(torch::ones({SIZE, 1}, tOptions)); + } + + int numOut = oShape[oShape.size() - 1] - numInOuts; + for (auto i = 0; i < numOut; i++) { + out.push_back(torch::zeros({SIZE, 1}, tOptions)); + } + + // Call compute_torch + float 
fbroadcastVal = 0.0; + double dbroadcastVal = 0.0; + std::cout << "Creating workflow with:\n"; + std::cout << "NumIn " << numIn << " " << in.size() << "\n"; + std::cout << "NumOut " << numOut << " " << out.size() << "\n"; + std::cout << "NumInOut " << numInOuts << " " << inout.size() << "\n"; + if (DType == torch::kFloat64 && dev == c10::DeviceType::CUDA) + compute( + wf, in, inout, out, dbroadcastVal, false); + else if (DType == torch::kFloat32 && dev == c10::DeviceType::CUDA) + compute( + wf, in, inout, out, fbroadcastVal, false); + else if (DType == torch::kFloat64 && dev == c10::DeviceType::CPU) + compute( + wf, in, inout, out, dbroadcastVal, false); + else if (DType == torch::kFloat32 && dev == c10::DeviceType::CPU) + compute( + wf, in, inout, out, fbroadcastVal, false); + + // We do this, as AMS should ignore completely the threshold + // value when it doesn't have a model + if (model_path.empty()) threshold = 0.0; + + for (auto& V : {inout, out}) { + for (auto i = 0; i < V.size(); i++) { + auto data = V[i]; + if (threshold == 0.0) { + auto correct = torch::ones(data.sizes(), data.options()) * 13; + bool close = torch::allclose(correct, data, 1e-5, 1e-8); + if (!close) { + std::cout << "Values are not close\n"; + std::cout << data << "\n"; + std::cout << "Correct data are " + << "\n"; + std::cout << correct << "\n"; + return -1; + } + } else if (threshold == 0.5) { + auto correct = torch::ones(data.sizes(), data.options()); + // Create a tensor with values [0, 1, 2, ..., size-1] + auto indices = torch::arange(data.sizes()[0], data.options()); + + auto alternating_tensor = (indices % 2) * 12; + alternating_tensor = alternating_tensor.reshape({data.sizes()[0], 1}); + correct += alternating_tensor; + // Use modulo operation to create alternating 0s and 1s + bool close = torch::allclose(correct, data, 1e-5, 1e-8); + if (!close) { + std::cout << "Values are not close\n"; + std::cout << data << "\n"; + std::cout << "Correct data are " + << "\n"; + std::cout << 
correct << "\n"; + return -1; + } + } else if (threshold == 1.0) { + auto correct = torch::ones(data.sizes(), data.options()); + bool close = torch::allclose(correct, data, 1e-5, 1e-8); + if (!close) { + std::cout << "Values are not close\n"; + std::cout << data << "\n"; + std::cout << "Correct data are " + << "\n"; + std::cout << correct << "\n"; + return -1; + } + } else { + std::cout << "Unknown threshold value\n"; + } + } + } + in.clear(); + inout.clear(); + out.clear(); + } + + // Reverse to compute how many physics we want. + threshold = 1 - threshold; + + if (threshold > 0) { + int numIn = iShape[iShape.size() - 1]; + auto expectedInput = + torch::ones({(long)(SIZE * threshold), numIn}, + torch::TensorOptions().dtype(torch::kFloat32)); + + int numOut = iShape[oShape.size() - 1]; + auto expectedOutput = + torch::ones({(long)(SIZE * threshold), numOut}, + torch::TensorOptions().dtype(torch::kFloat32)) * + 13; + + std::cout << "Output size :\n" << expectedOutput.sizes() << "\n"; + std::cout << "Input size :\n" << expectedInput.sizes() << "\n"; + + auto& dbg_mg = ams::db::DBManager::getInstance(); + dbg_mg.clean(); + if (threshold != 1.0) + if (!verifyDatasetContents(filename, "input_data", expectedInput) || + !verifyDatasetContents(filename, "output_data", expectedOutput)) { + std::cout << "Could not verify outputs\n"; + return -1; + } + } + + return 0; +} diff --git a/tests/AMSlib/wf/scatter_physics.cpp b/tests/AMSlib/wf/scatter_physics.cpp new file mode 100644 index 00000000..545b3929 --- /dev/null +++ b/tests/AMSlib/wf/scatter_physics.cpp @@ -0,0 +1,61 @@ +#include +#include +#include +#include + +#include +#include +#include + +#include "wf/workflow.hpp" + + +int main(int argc, char *argv[]) +{ + + if (argc != 3) { + std::cout << "Wrong command line\n"; + std::cout << argv[0] << " \n"; + return -1; + } + torch::Dtype DType = torch::kFloat32; + torch::DeviceType dev = c10::DeviceType::CPU; + + std::string Type(argv[1]); + std::string device(argv[2]); + + + 
if (Type.compare("double") == 0) { + DType = torch::kFloat64; + } + + if (device.compare("cuda") == 0) dev = c10::DeviceType::CUDA; + + ams::SmallVector entireDomain; + ams::SmallVector computedDomain; + for (int i = 0; i < 4; i++) { + entireDomain.push_back( + torch::zeros({128, 7}, + torch::TensorOptions().dtype(DType).device(dev))); + computedDomain.push_back( + torch::rand({64, 7}, torch::TensorOptions().dtype(DType).device(dev))); + } + + auto tmp = torch::arange(0, 128, torch::kInt64) % 2; + auto Predicate = tmp.to(torch::kBool) != 0; + + + ams::AMSWorkflow::ScatterPhysicOutputsToOrigDomain(computedDomain, + Predicate, + entireDomain); + + for (int i = 0; i < computedDomain.size(); i++) { + auto cd = computedDomain[i]; + auto ed = entireDomain[i].index({Predicate}); + bool close = torch::allclose(ed, cd, 1e-5, 1e-8); + + if (!close) return 1; + } + + return 0; +} diff --git a/tests/AMSlib/wf/subselect_tensors.cpp b/tests/AMSlib/wf/subselect_tensors.cpp new file mode 100644 index 00000000..a9b427f2 --- /dev/null +++ b/tests/AMSlib/wf/subselect_tensors.cpp @@ -0,0 +1,58 @@ + +#include +#include +#include +#include + +#include +#include +#include + +#include "wf/workflow.hpp" + + +int main(int argc, char *argv[]) +{ + + if (argc != 3) { + std::cout << "Wrong command line\n"; + std::cout << argv[0] << " \n"; + return -1; + } + torch::Dtype DType = torch::kFloat32; + torch::DeviceType dev = c10::DeviceType::CPU; + + std::string Type(argv[1]); + std::string device(argv[2]); + + + if (Type.compare("double") == 0) { + DType = torch::kFloat64; + } + + if (Type.compare("double") == 0) DType = torch::kFloat64; + + if (device.compare("cuda") == 0) dev = c10::DeviceType::CUDA; + + ams::SmallVector vectors; + for (int i = 0; i < 4; i++) + vectors.push_back( + torch::rand({32, 11}, torch::TensorOptions().dtype(DType).device(dev))); + + auto tmp = torch::arange(0, 32, torch::kInt64) % 2; + auto Predicate = tmp.to(torch::kBool) != 0; + + auto subselectedTensors = + 
ams::AMSWorkflow::subSelectTensors(vectors, Predicate); + + for (int i = 0; i < vectors.size(); i++) { + auto sb = subselectedTensors[i]; + auto orig = vectors[i]; + orig = orig.index({Predicate}); + bool close = torch::allclose(orig, sb, 1e-5, 1e-8); + + if (!close) return 1; + } + + return 0; +} diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 103bac3e..f4e11848 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -3,6 +3,26 @@ # # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +find_package(Python3 REQUIRED COMPONENTS Interpreter) +message(STATUS "Python executable: ${Python3_EXECUTABLE}") + +# Check if h5py is installed +execute_process( + COMMAND ${Python3_EXECUTABLE} -c "import h5py" + RESULT_VARIABLE H5PY_CHECK_RESULT + OUTPUT_QUIET + ERROR_QUIET +) + +if(NOT H5PY_CHECK_RESULT EQUAL 0) + message(FATAL_ERROR + "Missing Python dependency: h5py\n" + "Try running:\n" + " pip install h5py\n" + "Or ensure it is available in your environment." + ) +endif() + if (WITH_AMS_LIB) add_subdirectory(AMSlib) endif() diff --git a/tests/utils.hpp b/tests/utils.hpp deleted file mode 100644 index bef7b939..00000000 --- a/tests/utils.hpp +++ /dev/null @@ -1,49 +0,0 @@ -/* - * Copyright 2021-2023 Lawrence Livermore National Security, LLC and other - * AMSLib Project Developers - * - * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception - */ - - #ifndef __TEST_UTILS__ - #define __TEST_UTILS__ - -#include -#include -#include - -// Signal handler to print the stack trace -void signalHandler(int signum) { - const char* msg = "[signalHandler] Caught signal\n"; - write(STDERR_FILENO, msg, sizeof(msg)); - - // Obtain the backtrace - const int maxFrames = 128; - void *addrlist[maxFrames]; - - // Get void*'s for all entries on the stack - int addrlen = backtrace(addrlist, maxFrames); - - if (addrlen == 0) { - const char* no_stack = "No stack trace available\n"; - write(STDERR_FILENO, no_stack, sizeof(no_stack)); - _exit(1); // _exit() Cannot be trap, 
interrupted - } - - // Print out all the frames to stderr - backtrace_symbols_fd(addrlist, addrlen, STDERR_FILENO); - _exit(1); -} - - -void installSignals() { - std::signal(SIGSEGV, signalHandler); // segmentation fault - std::signal(SIGABRT, signalHandler); // abort() - std::signal(SIGFPE, signalHandler); // floating-point exception - std::signal(SIGILL, signalHandler); // illegal instruction - std::signal(SIGINT, signalHandler); // interrupt (e.g., Ctrl+C) - std::signal(SIGTERM, signalHandler); // termination request - std::signal(SIGPIPE, signalHandler); // broken pipe -} - -#endif \ No newline at end of file diff --git a/tutorial/Chapter.1.DataFlow/0.Build/CMakeLists.txt b/tutorial/Chapter.1.DataFlow/0.Build/CMakeLists.txt new file mode 100644 index 00000000..7feb67e4 --- /dev/null +++ b/tutorial/Chapter.1.DataFlow/0.Build/CMakeLists.txt @@ -0,0 +1,34 @@ +# Copyright 2021-2023 Lawrence Livermore National Security, LLC and other +# AMSLib Project Developers +# +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + + +cmake_minimum_required(VERSION 3.10) +cmake_policy(SET CMP0104 NEW) + +# Define the project +project(EX0 LANGUAGES CXX C) + +set(AMS_EXAMPLE_SRC ex0.cpp) + + +option(WITH_MPI "Option to enable MPI" OFF) +option(WITH_CALIPER "Use Caliper for Profiling" OFF) + +set(CMAKE_CXX_STANDARD 14) # Enable C++14 +set(CMAKE_CXX_STANDARD_REQUIRED ON) # Require the specified standard + +find_package(AMS REQUIRED) + +if (WITH_CALIPER) + find_package(caliper REQUIRED) +endif() + +if (WITH_MPI) + find_package(MPI REQUIRED) +endif() + + +add_executable(EX0 ${AMS_EXAMPLE_SRC}) +target_link_libraries(EX0 PRIVATE AMS::AMS) diff --git a/tutorial/Chapter.1.DataFlow/0.Build/README.md b/tutorial/Chapter.1.DataFlow/0.Build/README.md new file mode 100644 index 00000000..4e485628 --- /dev/null +++ b/tutorial/Chapter.1.DataFlow/0.Build/README.md @@ -0,0 +1,59 @@ +# Build and Link with AMS + +AMS is a standard cmake package providing a `AMS-config.cmake` file and the AMS 
target. +We provide an example C++ code and the respective `cmake` file to build and link with AMS. + +## Include and link with AMS + +To use AMS we need to `find_package(AMS REQUIRED)` ([here](./CMakeLists.txt#L22)) and then use the `AMS::AMS` target in the `cmake` function `target_link_libraries`([here](./CMakeLists.txt#L36)). + +## Example Code + +The [example](./ex0.cpp) code is very simple. The user provides in the *cli* the length of 2 vectors, the example initializes the `input` vector from *0-length-1* values and then assigns these values to the [output vector](./ex0.cpp#L52) finally it computes the sum of all elements in the output vector and prints the sum at the terminal with the expected output value. + +## Configure and link + +To configure the example please provide these commands: + +``` +mkdir build +cd build +cmake ../ +``` + +If AMS is not installed in a default cmake directory please also provide the path to the AMS installation by passing the option +`-DAMS_DIR=` to the cmake command. + +## Execute example + +To execute the example please provide the following *cli*: + +``` +./EX0 -l 10 +``` + +And the expected output should be: + +``` +[Example] Expected output is 45 and computed 45 +``` + +## Enable Logger + +The container's AMS version is linked and working with the AMS logger to enable it you can execute: + +``` +AMS_LOG_LEVEL=debug ./EX0 -l 10 +``` + +The output should look like this: + +``` +[AMS:DEBUG:ResourceManager] Initialization of allocators +[AMS:DEBUG:ResourceManager] Set Allocator [0] to pool with name : HOST +[Example] Expected output is 45 and computed 45 +[AMS:DEBUG:AMS] Finalization of AMS +[AMS:DEBUG:AMSDefaultDeviceAllocator] Destroying default host allocator +``` + +There are the following log levels, debug, info, warning. 
diff --git a/tutorial/Chapter.1.DataFlow/0.Build/common.hpp b/tutorial/Chapter.1.DataFlow/0.Build/common.hpp new file mode 100644 index 00000000..ae8ef4f3 --- /dev/null +++ b/tutorial/Chapter.1.DataFlow/0.Build/common.hpp @@ -0,0 +1,132 @@ +#include +#include +#include +#include + +class ExampleArgs +{ +public: + void PrintOptions() const + { + std::cout << "Available options:\n"; + for (const auto& opt : registered_) { + std::cout << " "; + for (size_t i = 0; i < opt.keys.size(); ++i) { + std::cout << opt.keys[i]; + if (i + 1 < opt.keys.size()) std::cout << ", "; + } + std::cout << "\n " << opt.help << "\n"; + } + } + + void Parse(int argc, char** argv) + { + for (int i = 1; i < argc; ++i) { + std::string arg = argv[i]; + if (arg[0] != '-') continue; + + std::string key = arg; + std::string value; + + // If the next item isn't an option, treat it as a value + if (i + 1 < argc && argv[i + 1][0] != '-') { + value = argv[++i]; + } else { + value = "true"; // Boolean flag + } + + options_[key] = value; + } + + // Set parsed values into variables + for (auto& opt : registered_) { + const auto& keys = opt.keys; + for (const auto& key : keys) { + if (options_.count(key)) { + opt.setter(options_[key]); + opt.wasset = true; + break; + } + } + } + } + + template + void AddOption(T* out, + std::string short_opt, + std::string long_opt, + std::string help, + bool required = true) + { + registered_.push_back( + {{short_opt, long_opt}, + [out](const std::string& val) { parseValue(val, out); }, + [out]() { return toString(*out); }, + help, + required, + false}); + } + + bool Good() const + { + for (const auto& opt : registered_) { + if (opt.required && !opt.wasset) { + return false; + } + } + return true; + } + + void PrintUsage() const + { + std::cout << "Parsed arguments:\n"; + for (const auto& opt : registered_) { + std::cout << " "; + for (size_t i = 0; i < opt.keys.size(); ++i) { + std::cout << opt.keys[i]; + if (i + 1 < opt.keys.size()) std::cout << ", "; + } + std::cout 
<< " = " << opt.getter() << "\n"; + } + } + +private: + struct RegisteredOption { + std::vector keys; + std::function setter; + std::function getter; + std::string help; + bool required; + bool wasset; + }; + + std::vector registered_; + std::unordered_map options_; + + // Parser helper + template + static void parseValue(const std::string& s, T* out); + + static void parseValue(const std::string& s, std::string* out) { *out = s; } + + static void parseValue(const std::string& s, int* out) + { + *out = std::stoi(s); + } + + static void parseValue(const std::string& s, bool* out) + { + *out = (s == "true" || s == "1"); + } + + static void parseValue(const std::string& s, double* out) + { + *out = std::stod(s); + } + + + static std::string toString(const std::string& val) { return val; } + static std::string toString(bool val) { return val ? "true" : "false"; } + static std::string toString(int val) { return std::to_string(val); } + static std::string toString(double val) { return std::to_string(val); } +}; diff --git a/tutorial/Chapter.1.DataFlow/0.Build/ex0.cpp b/tutorial/Chapter.1.DataFlow/0.Build/ex0.cpp new file mode 100644 index 00000000..90bb304b --- /dev/null +++ b/tutorial/Chapter.1.DataFlow/0.Build/ex0.cpp @@ -0,0 +1,65 @@ +#include + +#include + +#include "common.hpp" + +void InitMemBlob(double* ptr, int size) +{ + for (int i = 0; i < size; i++) { + ptr[i] = i; + } +} + +void ExampleCompute(double* in, double* out, int size) +{ + for (int i = 0; i < size; i++) { + out[i] = in[i]; + } +} + + +double ComputeSum(double* out, int size) +{ + double sum = 0; + for (int i = 0; i < size; i++) { + sum += out[i]; + } + return sum; +} + +int main(int argc, char* argv[]) +{ + int length; + ExampleArgs args; + args.AddOption(&length, + "-l", + "--length", + "The size of the vectors to be initialized"); + args.Parse(argc, argv); + if (!args.Good()) { + std::cout << "Wrong command line arguments\n"; + args.PrintOptions(); + return -1; + } + + ams::AMSInit(); + + double* 
input = new double[length]; + double* output = new double[length]; + + InitMemBlob(input, length); + ExampleCompute(input, output, length); + auto sum = ComputeSum(output, length); + + std::cout << "[Example] Expected output is " << (length * (length - 1)) / 2 + << " and computed " << sum << "\n"; + + + delete[] input; + delete[] output; + ams::AMSFinalize(); + + + return 0; +} diff --git a/tutorial/Chapter.1.DataFlow/1.AMSTensor/CMakeLists.txt b/tutorial/Chapter.1.DataFlow/1.AMSTensor/CMakeLists.txt new file mode 100644 index 00000000..ce83ac62 --- /dev/null +++ b/tutorial/Chapter.1.DataFlow/1.AMSTensor/CMakeLists.txt @@ -0,0 +1,38 @@ +# Copyright 2021-2023 Lawrence Livermore National Security, LLC and other +# AMSLib Project Developers +# +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + + +cmake_minimum_required(VERSION 3.10) +cmake_policy(SET CMP0104 NEW) + +# Define the project +project(EX1 LANGUAGES CXX C) + +set(AMS_EXAMPLE_SRC ex1.cpp) +set(AMS_SOL_SRC ex1.cpp) + +option(WITH_MPI "Option to enable MPI" OFF) +option(WITH_CALIPER "Use Caliper for Profiling" OFF) + +set(CMAKE_CXX_STANDARD 14) # Enable C++14 +set(CMAKE_CXX_STANDARD_REQUIRED ON) # Require the specified standard + +find_package(AMS REQUIRED) + +if (WITH_CALIPER) + find_package(caliper REQUIRED) +endif() + +if (WITH_MPI) + find_package(MPI REQUIRED) +endif() + + +add_executable(EX1 ${AMS_EXAMPLE_SRC}) +target_link_libraries(EX1 PRIVATE AMS::AMS) + +add_executable(SOL1 ${AMS_SOL_SRC}) +target_link_libraries(SOL1 PRIVATE AMS::AMS) + diff --git a/tutorial/Chapter.1.DataFlow/1.AMSTensor/README.md b/tutorial/Chapter.1.DataFlow/1.AMSTensor/README.md new file mode 100644 index 00000000..5a78a0b6 --- /dev/null +++ b/tutorial/Chapter.1.DataFlow/1.AMSTensor/README.md @@ -0,0 +1,17 @@ +# AMS Tensors + +AMS (partially) replaces some arbitary function with Machine Learning (Torch) model. 
To properly perform the replacement AMS requires information regarding the memory accesses of the computational functions. The lower building block is the `AMSTensor` a simple abstraction that describes continuous blobs of memory and their access pattern. + +## The AMSTensor + +The AMSTensor is a C++ abstraction that associates a contineous memory blob with some access pattern and reshaping. In other words, it represents the memory as a tensor. The AMSTensor is a shim layer on top of the torch tensor representation and currently only isolates the binary linkage of the example/application code to the torch librarry. + +### Use AMSTensors in the example code + +Introduce a function that takes as arguments AMSTensor and performs the same computation as the original `ExampleCompute`. Modify the [example code](./ex1.cpp) accordingly. Here is the signature of the requested function: + +` +void ExampleAMSTensorCompute(ams::AMSTensor& in, ams::AMSTensor& out) +` + + diff --git a/tutorial/Chapter.1.DataFlow/1.AMSTensor/common.hpp b/tutorial/Chapter.1.DataFlow/1.AMSTensor/common.hpp new file mode 100644 index 00000000..ae8ef4f3 --- /dev/null +++ b/tutorial/Chapter.1.DataFlow/1.AMSTensor/common.hpp @@ -0,0 +1,132 @@ +#include +#include +#include +#include + +class ExampleArgs +{ +public: + void PrintOptions() const + { + std::cout << "Available options:\n"; + for (const auto& opt : registered_) { + std::cout << " "; + for (size_t i = 0; i < opt.keys.size(); ++i) { + std::cout << opt.keys[i]; + if (i + 1 < opt.keys.size()) std::cout << ", "; + } + std::cout << "\n " << opt.help << "\n"; + } + } + + void Parse(int argc, char** argv) + { + for (int i = 1; i < argc; ++i) { + std::string arg = argv[i]; + if (arg[0] != '-') continue; + + std::string key = arg; + std::string value; + + // If the next item isn't an option, treat it as a value + if (i + 1 < argc && argv[i + 1][0] != '-') { + value = argv[++i]; + } else { + value = "true"; // Boolean flag + } + + options_[key] = value; 
+ } + + // Set parsed values into variables + for (auto& opt : registered_) { + const auto& keys = opt.keys; + for (const auto& key : keys) { + if (options_.count(key)) { + opt.setter(options_[key]); + opt.wasset = true; + break; + } + } + } + } + + template + void AddOption(T* out, + std::string short_opt, + std::string long_opt, + std::string help, + bool required = true) + { + registered_.push_back( + {{short_opt, long_opt}, + [out](const std::string& val) { parseValue(val, out); }, + [out]() { return toString(*out); }, + help, + required, + false}); + } + + bool Good() const + { + for (const auto& opt : registered_) { + if (opt.required && !opt.wasset) { + return false; + } + } + return true; + } + + void PrintUsage() const + { + std::cout << "Parsed arguments:\n"; + for (const auto& opt : registered_) { + std::cout << " "; + for (size_t i = 0; i < opt.keys.size(); ++i) { + std::cout << opt.keys[i]; + if (i + 1 < opt.keys.size()) std::cout << ", "; + } + std::cout << " = " << opt.getter() << "\n"; + } + } + +private: + struct RegisteredOption { + std::vector keys; + std::function setter; + std::function getter; + std::string help; + bool required; + bool wasset; + }; + + std::vector registered_; + std::unordered_map options_; + + // Parser helper + template + static void parseValue(const std::string& s, T* out); + + static void parseValue(const std::string& s, std::string* out) { *out = s; } + + static void parseValue(const std::string& s, int* out) + { + *out = std::stoi(s); + } + + static void parseValue(const std::string& s, bool* out) + { + *out = (s == "true" || s == "1"); + } + + static void parseValue(const std::string& s, double* out) + { + *out = std::stod(s); + } + + + static std::string toString(const std::string& val) { return val; } + static std::string toString(bool val) { return val ? 
"true" : "false"; } + static std::string toString(int val) { return std::to_string(val); } + static std::string toString(double val) { return std::to_string(val); } +}; diff --git a/tutorial/Chapter.1.DataFlow/1.AMSTensor/ex1.cpp b/tutorial/Chapter.1.DataFlow/1.AMSTensor/ex1.cpp new file mode 100644 index 00000000..90bb304b --- /dev/null +++ b/tutorial/Chapter.1.DataFlow/1.AMSTensor/ex1.cpp @@ -0,0 +1,65 @@ +#include + +#include + +#include "common.hpp" + +void InitMemBlob(double* ptr, int size) +{ + for (int i = 0; i < size; i++) { + ptr[i] = i; + } +} + +void ExampleCompute(double* in, double* out, int size) +{ + for (int i = 0; i < size; i++) { + out[i] = in[i]; + } +} + + +double ComputeSum(double* out, int size) +{ + double sum = 0; + for (int i = 0; i < size; i++) { + sum += out[i]; + } + return sum; +} + +int main(int argc, char* argv[]) +{ + int length; + ExampleArgs args; + args.AddOption(&length, + "-l", + "--length", + "The size of the vectors to be initialized"); + args.Parse(argc, argv); + if (!args.Good()) { + std::cout << "Wrong command line arguments\n"; + args.PrintOptions(); + return -1; + } + + ams::AMSInit(); + + double* input = new double[length]; + double* output = new double[length]; + + InitMemBlob(input, length); + ExampleCompute(input, output, length); + auto sum = ComputeSum(output, length); + + std::cout << "[Example] Expected output is " << (length * (length - 1)) / 2 + << " and computed " << sum << "\n"; + + + delete[] input; + delete[] output; + ams::AMSFinalize(); + + + return 0; +} diff --git a/tutorial/Chapter.1.DataFlow/1.AMSTensor/sol1.cpp b/tutorial/Chapter.1.DataFlow/1.AMSTensor/sol1.cpp new file mode 100644 index 00000000..1d3ddf61 --- /dev/null +++ b/tutorial/Chapter.1.DataFlow/1.AMSTensor/sol1.cpp @@ -0,0 +1,86 @@ +#include + +#include + +#include "common.hpp" + +void InitMemBlob(double* ptr, int size) +{ + for (int i = 0; i < size; i++) { + ptr[i] = i; + } +} + +void ExampleCompute(double* in, double* out, int size) +{ + 
for (int i = 0; i < size; i++) { + out[i] = in[i]; + } +} + +void ExampleAMSTensorCompute(ams::AMSTensor& in, ams::AMSTensor& out) +{ + ExampleCompute(in.data(), out.data(), in.shape()[0]); +} + +double ComputeSum(double* out, int size) +{ + double sum = 0; + for (int i = 0; i < size; i++) { + sum += out[i]; + } + return sum; +} + +int main(int argc, char* argv[]) +{ + int length; + ExampleArgs args; + args.AddOption(&length, + "-l", + "--length", + "The size of the vectors to be initialized"); + args.Parse(argc, argv); + if (!args.Good()) { + std::cout << "Wrong command line arguments\n"; + args.PrintOptions(); + return -1; + } + + ams::AMSInit(); + + double* input = new double[length]; + double* output = new double[length]; + + InitMemBlob(input, length); + + /* + * Create AMS tensors for memory blobs + */ + + // We represet both input/output as blobs of lenth 'samples', each sample as 1 element. + ams::AMSTensor InT = ams::AMSTensor::view(input, + {length, 1}, + {1, 1}, + ams::AMSResourceType::AMS_HOST); + + ams::AMSTensor OutT = ams::AMSTensor::view(output, + {length, 1}, + {1, 1}, + ams::AMSResourceType::AMS_HOST); + + + ExampleAMSTensorCompute(InT, OutT); + auto sum = ComputeSum(output, length); + + std::cout << "[Example] Expected output is " << (length * (length - 1)) / 2 + << " and computed " << sum << "\n"; + + + delete[] input; + delete[] output; + ams::AMSFinalize(); + + + return 0; +} diff --git a/tutorial/Chapter.1.DataFlow/2.DataFlow/CMakeLists.txt b/tutorial/Chapter.1.DataFlow/2.DataFlow/CMakeLists.txt new file mode 100644 index 00000000..17209ed1 --- /dev/null +++ b/tutorial/Chapter.1.DataFlow/2.DataFlow/CMakeLists.txt @@ -0,0 +1,38 @@ +# Copyright 2022-2023 Lawrence Livermore National Security, LLC and other +# AMSLib Project Developers +# +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + + +cmake_minimum_required(VERSION 3.20) +cmake_policy(SET CMP0104 NEW) + +# Define the project +project(EX2 LANGUAGES CXX C) + +set(AMS_EXAMPLE_SRC 
ex2.cpp) +set(AMS_SOL_SRC ex2.cpp) + +option(WITH_MPI "Option to enable MPI" OFF) +option(WITH_CALIPER "Use Caliper for Profiling" OFF) + +set(CMAKE_CXX_STANDARD 14) # Enable C++14 +set(CMAKE_CXX_STANDARD_REQUIRED ON) # Require the specified standard + +find_package(AMS REQUIRED) + +if (WITH_CALIPER) + find_package(caliper REQUIRED) +endif() + +if (WITH_MPI) + find_package(MPI REQUIRED) +endif() + + +add_executable(EX2 ${AMS_EXAMPLE_SRC}) +target_link_libraries(EX2 PRIVATE AMS::AMS) + +add_executable(SOL2 ${AMS_SOL_SRC}) +target_link_libraries(SOL2 PRIVATE AMS::AMS) + diff --git a/tutorial/Chapter.1.DataFlow/2.DataFlow/README.md b/tutorial/Chapter.1.DataFlow/2.DataFlow/README.md new file mode 100644 index 00000000..4dd0ef9e --- /dev/null +++ b/tutorial/Chapter.1.DataFlow/2.DataFlow/README.md @@ -0,0 +1,25 @@ +# AMS DataFlow +AMS (partially) replaces some arbitary function with Machine Learning (Torch) model. The `AMSTensor` provides information to the AMS runtime regarding the memory access pattern of a single memory blob. + +However, an arbitary function can access multiple memory blobs with different intentions. AMS categorizes intention in 3 categories: + +1. Memory locations that are being written by the underlying computation and are considered a result of the function. In the example this would be `out`. +2. Memory locations that are being read by the underlying computation and is "necessary" for the mathematical formulation of the underlying result. Intermediate inputs or temporal variables can be ignored. In the example this would be `in`. +3. Memory locations that are being read AND written by the underlying computation and is "necessary" for the mathematical formulation of the underlying result. Intermediate inputs or temporal variables can be ignored. + +## The SmallVector + +Multiple memory blobs of the same intention can be packed together in a vector. 
AMS instead of using the `std::vector` uses ams::SmallVector a lightweight C++ vector abstraction (originated from the LLVM project) that can be allocated in the stack and is more efficient. + +### Extend the example code to define a EOSLambda. + +Create a C++ lambda that takes 3 input parameters (1 for each memory category), each of Smallvector type storing the AMSTensors. The signature of the lambda should look like this: + +```cpp +auto compute = [&](const ams::SmallVector &, + ams::SmallVector &, + ams::SmallVector &); +``` + + + diff --git a/tutorial/Chapter.1.DataFlow/2.DataFlow/common.hpp b/tutorial/Chapter.1.DataFlow/2.DataFlow/common.hpp new file mode 100644 index 00000000..ae8ef4f3 --- /dev/null +++ b/tutorial/Chapter.1.DataFlow/2.DataFlow/common.hpp @@ -0,0 +1,132 @@ +#include +#include +#include +#include + +class ExampleArgs +{ +public: + void PrintOptions() const + { + std::cout << "Available options:\n"; + for (const auto& opt : registered_) { + std::cout << " "; + for (size_t i = 0; i < opt.keys.size(); ++i) { + std::cout << opt.keys[i]; + if (i + 1 < opt.keys.size()) std::cout << ", "; + } + std::cout << "\n " << opt.help << "\n"; + } + } + + void Parse(int argc, char** argv) + { + for (int i = 1; i < argc; ++i) { + std::string arg = argv[i]; + if (arg[0] != '-') continue; + + std::string key = arg; + std::string value; + + // If the next item isn't an option, treat it as a value + if (i + 1 < argc && argv[i + 1][0] != '-') { + value = argv[++i]; + } else { + value = "true"; // Boolean flag + } + + options_[key] = value; + } + + // Set parsed values into variables + for (auto& opt : registered_) { + const auto& keys = opt.keys; + for (const auto& key : keys) { + if (options_.count(key)) { + opt.setter(options_[key]); + opt.wasset = true; + break; + } + } + } + } + + template + void AddOption(T* out, + std::string short_opt, + std::string long_opt, + std::string help, + bool required = true) + { + registered_.push_back( + {{short_opt, long_opt}, + 
[out](const std::string& val) { parseValue(val, out); }, + [out]() { return toString(*out); }, + help, + required, + false}); + } + + bool Good() const + { + for (const auto& opt : registered_) { + if (opt.required && !opt.wasset) { + return false; + } + } + return true; + } + + void PrintUsage() const + { + std::cout << "Parsed arguments:\n"; + for (const auto& opt : registered_) { + std::cout << " "; + for (size_t i = 0; i < opt.keys.size(); ++i) { + std::cout << opt.keys[i]; + if (i + 1 < opt.keys.size()) std::cout << ", "; + } + std::cout << " = " << opt.getter() << "\n"; + } + } + +private: + struct RegisteredOption { + std::vector keys; + std::function setter; + std::function getter; + std::string help; + bool required; + bool wasset; + }; + + std::vector registered_; + std::unordered_map options_; + + // Parser helper + template + static void parseValue(const std::string& s, T* out); + + static void parseValue(const std::string& s, std::string* out) { *out = s; } + + static void parseValue(const std::string& s, int* out) + { + *out = std::stoi(s); + } + + static void parseValue(const std::string& s, bool* out) + { + *out = (s == "true" || s == "1"); + } + + static void parseValue(const std::string& s, double* out) + { + *out = std::stod(s); + } + + + static std::string toString(const std::string& val) { return val; } + static std::string toString(bool val) { return val ? 
"true" : "false"; } + static std::string toString(int val) { return std::to_string(val); } + static std::string toString(double val) { return std::to_string(val); } +}; diff --git a/tutorial/Chapter.1.DataFlow/2.DataFlow/ex2.cpp b/tutorial/Chapter.1.DataFlow/2.DataFlow/ex2.cpp new file mode 100644 index 00000000..1d3ddf61 --- /dev/null +++ b/tutorial/Chapter.1.DataFlow/2.DataFlow/ex2.cpp @@ -0,0 +1,86 @@ +#include + +#include + +#include "common.hpp" + +void InitMemBlob(double* ptr, int size) +{ + for (int i = 0; i < size; i++) { + ptr[i] = i; + } +} + +void ExampleCompute(double* in, double* out, int size) +{ + for (int i = 0; i < size; i++) { + out[i] = in[i]; + } +} + +void ExampleAMSTensorCompute(ams::AMSTensor& in, ams::AMSTensor& out) +{ + ExampleCompute(in.data(), out.data(), in.shape()[0]); +} + +double ComputeSum(double* out, int size) +{ + double sum = 0; + for (int i = 0; i < size; i++) { + sum += out[i]; + } + return sum; +} + +int main(int argc, char* argv[]) +{ + int length; + ExampleArgs args; + args.AddOption(&length, + "-l", + "--length", + "The size of the vectors to be initialized"); + args.Parse(argc, argv); + if (!args.Good()) { + std::cout << "Wrong command line arguments\n"; + args.PrintOptions(); + return -1; + } + + ams::AMSInit(); + + double* input = new double[length]; + double* output = new double[length]; + + InitMemBlob(input, length); + + /* + * Create AMS tensors for memory blobs + */ + + // We represet both input/output as blobs of lenth 'samples', each sample as 1 element. 
+ ams::AMSTensor InT = ams::AMSTensor::view(input, + {length, 1}, + {1, 1}, + ams::AMSResourceType::AMS_HOST); + + ams::AMSTensor OutT = ams::AMSTensor::view(output, + {length, 1}, + {1, 1}, + ams::AMSResourceType::AMS_HOST); + + + ExampleAMSTensorCompute(InT, OutT); + auto sum = ComputeSum(output, length); + + std::cout << "[Example] Expected output is " << (length * (length - 1)) / 2 + << " and computed " << sum << "\n"; + + + delete[] input; + delete[] output; + ams::AMSFinalize(); + + + return 0; +} diff --git a/tutorial/Chapter.1.DataFlow/2.DataFlow/sol2.cpp b/tutorial/Chapter.1.DataFlow/2.DataFlow/sol2.cpp new file mode 100644 index 00000000..60316b88 --- /dev/null +++ b/tutorial/Chapter.1.DataFlow/2.DataFlow/sol2.cpp @@ -0,0 +1,95 @@ +#include + +#include + +#include "common.hpp" + +void InitMemBlob(double* ptr, int size) +{ + for (int i = 0; i < size; i++) { + ptr[i] = i; + } +} + +void ExampleCompute(double* in, double* out, int size) +{ + for (int i = 0; i < size; i++) { + out[i] = in[i]; + } +} + +void ExampleAMSTensorCompute(ams::AMSTensor& in, ams::AMSTensor& out) +{ + ExampleCompute(in.data(), out.data(), in.shape()[0]); +} + +double ComputeSum(double* out, int size) +{ + double sum = 0; + for (int i = 0; i < size; i++) { + sum += out[i]; + } + return sum; +} + +int main(int argc, char* argv[]) +{ + int length; + ExampleArgs args; + args.AddOption(&length, + "-l", + "--length", + "The size of the vectors to be initialized"); + args.Parse(argc, argv); + if (!args.Good()) { + std::cout << "Wrong command line arguments\n"; + args.PrintOptions(); + return -1; + } + + ams::AMSInit(); + + double* input = new double[length]; + double* output = new double[length]; + + InitMemBlob(input, length); + + /* + * Create AMS tensors for memory blobs + */ + + // We represet both input/output as blobs of lenth 'samples', each sample as 1 element. 
+ SmallVector input_tensors; + SmallVector inout_tensors; + SmallVector output_tensors; + + input_tensors.push_back(ams::AMSTensor::view(input, + {length, 1}, + {1, 1}, + ams::AMSResourceType::AMS_HOST)); + + output_tensors.push_back(ams::AMSTensor::view(output, + {length, 1}, + {1, 1}, + ams::AMSResourceType::AMS_HOST)); + + auto Computation = + [&](const ams::SmallVector &ams_ins, + ams::SmallVector &ams_inouts, + ams::SmallVector &ams_outs) { + ExampleAMSTensorCompute(ams_ins[0], ams_outs[0]); + } + Computation(input_tensors, output_tensors); + auto sum = ComputeSum(output, length); + + std::cout << "[Example] Expected output is " << (length * (length - 1)) / 2 + << " and computed " << sum << "\n"; + + + delete[] input; + delete[] output; + ams::AMSFinalize(); + + + return 0; +} diff --git a/tutorial/Chapter.1.DataFlow/3.CompoundModel/CMakeLists.txt b/tutorial/Chapter.1.DataFlow/3.CompoundModel/CMakeLists.txt new file mode 100644 index 00000000..e4b51d05 --- /dev/null +++ b/tutorial/Chapter.1.DataFlow/3.CompoundModel/CMakeLists.txt @@ -0,0 +1,38 @@ +# Copyright 2022-2023 Lawrence Livermore National Security, LLC and other +# AMSLib Project Developers +# +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + + +cmake_minimum_required(VERSION 3.20) +cmake_policy(SET CMP0104 NEW) + +# Define the project +project(EX2 LANGUAGES CXX C) + +set(AMS_EXAMPLE_SRC ex3.cpp) +set(AMS_SOL_SRC sol3.cpp) + +option(WITH_MPI "Option to enable MPI" OFF) +option(WITH_CALIPER "Use Caliper for Profiling" OFF) + +set(CMAKE_CXX_STANDARD 14) # Enable C++14 +set(CMAKE_CXX_STANDARD_REQUIRED ON) # Require the specified standard + +find_package(AMS REQUIRED) + +if (WITH_CALIPER) + find_package(caliper REQUIRED) +endif() + +if (WITH_MPI) + find_package(MPI REQUIRED) +endif() + + +add_executable(EX3 ${AMS_EXAMPLE_SRC}) +target_link_libraries(EX3 PRIVATE AMS::AMS) + +add_executable(SOL3 ${AMS_SOL_SRC}) +target_link_libraries(SOL3 PRIVATE AMS::AMS) + diff --git 
a/tutorial/Chapter.1.DataFlow/3.CompoundModel/README.md b/tutorial/Chapter.1.DataFlow/3.CompoundModel/README.md new file mode 100644 index 00000000..c91063d8 --- /dev/null +++ b/tutorial/Chapter.1.DataFlow/3.CompoundModel/README.md @@ -0,0 +1,36 @@ +# AMS Core Concepts + + +## AMS Compound Model +AMS (partially) replaces some arbitrary function with a surrogate ML (torch) model. However, AMS does more than that; it is responsible to perform: +1. Data collection for that specific function +2. Uncertainty quantification +3. Model inference. + +To model all these 3 entities AMS provides a simple mapping between the original computation (the function to be replaced) with an *AMSCompoundModel*. The *AMSCompoundModel* encapsulates all these 3 capabilities and the AMS API associates the application function (referred to in AMS terminology as a domain) with the respective compound model. *In the AMS API we represent a function with a user-provided string; this is necessary so that the same function can be referred to consistently across the API.* + +The AMS Compound Model is a persistent data structure that will persist inside the application during application execution time and does not define any computation. + + +### Create an empty Compound Model and associate it with a domain. + +Use the AMS API to create an AMS Compound model and register it with AMS. + +#### Discussion about solution + + +In the solution we are using the following API call: + +```CPP + AMSCAbstrModel model_descr = + AMSRegisterAbstractModel( + "compute", ams::AMSUQPolicy::AMS_RANDOM, -1.0, "", "compute"); +``` + + +Most of the parameters passed to the `AMSRegisterAbstractModel` are placeholders and in the next examples we will fill them in accordingly. + +## AMS Executor + +Besides the compound model, AMS provides the notion of the `AMSExecutor`; the executor is responsible to perform all necessary actions described in the compound model and when uncertain it will fall back to the existing solution. 
+Essentially the executor associates the domain name, domain function with a compound model and performs the necessary actions. diff --git a/tutorial/Chapter.1.DataFlow/3.CompoundModel/common.hpp b/tutorial/Chapter.1.DataFlow/3.CompoundModel/common.hpp new file mode 100644 index 00000000..ae8ef4f3 --- /dev/null +++ b/tutorial/Chapter.1.DataFlow/3.CompoundModel/common.hpp @@ -0,0 +1,132 @@ +#include +#include +#include +#include + +class ExampleArgs +{ +public: + void PrintOptions() const + { + std::cout << "Available options:\n"; + for (const auto& opt : registered_) { + std::cout << " "; + for (size_t i = 0; i < opt.keys.size(); ++i) { + std::cout << opt.keys[i]; + if (i + 1 < opt.keys.size()) std::cout << ", "; + } + std::cout << "\n " << opt.help << "\n"; + } + } + + void Parse(int argc, char** argv) + { + for (int i = 1; i < argc; ++i) { + std::string arg = argv[i]; + if (arg[0] != '-') continue; + + std::string key = arg; + std::string value; + + // If the next item isn't an option, treat it as a value + if (i + 1 < argc && argv[i + 1][0] != '-') { + value = argv[++i]; + } else { + value = "true"; // Boolean flag + } + + options_[key] = value; + } + + // Set parsed values into variables + for (auto& opt : registered_) { + const auto& keys = opt.keys; + for (const auto& key : keys) { + if (options_.count(key)) { + opt.setter(options_[key]); + opt.wasset = true; + break; + } + } + } + } + + template + void AddOption(T* out, + std::string short_opt, + std::string long_opt, + std::string help, + bool required = true) + { + registered_.push_back( + {{short_opt, long_opt}, + [out](const std::string& val) { parseValue(val, out); }, + [out]() { return toString(*out); }, + help, + required, + false}); + } + + bool Good() const + { + for (const auto& opt : registered_) { + if (opt.required && !opt.wasset) { + return false; + } + } + return true; + } + + void PrintUsage() const + { + std::cout << "Parsed arguments:\n"; + for (const auto& opt : registered_) { + std::cout 
<< " "; + for (size_t i = 0; i < opt.keys.size(); ++i) { + std::cout << opt.keys[i]; + if (i + 1 < opt.keys.size()) std::cout << ", "; + } + std::cout << " = " << opt.getter() << "\n"; + } + } + +private: + struct RegisteredOption { + std::vector keys; + std::function setter; + std::function getter; + std::string help; + bool required; + bool wasset; + }; + + std::vector registered_; + std::unordered_map options_; + + // Parser helper + template + static void parseValue(const std::string& s, T* out); + + static void parseValue(const std::string& s, std::string* out) { *out = s; } + + static void parseValue(const std::string& s, int* out) + { + *out = std::stoi(s); + } + + static void parseValue(const std::string& s, bool* out) + { + *out = (s == "true" || s == "1"); + } + + static void parseValue(const std::string& s, double* out) + { + *out = std::stod(s); + } + + + static std::string toString(const std::string& val) { return val; } + static std::string toString(bool val) { return val ? 
"true" : "false"; } + static std::string toString(int val) { return std::to_string(val); } + static std::string toString(double val) { return std::to_string(val); } +}; diff --git a/tutorial/Chapter.1.DataFlow/3.CompoundModel/ex3.cpp b/tutorial/Chapter.1.DataFlow/3.CompoundModel/ex3.cpp new file mode 100644 index 00000000..a67c0079 --- /dev/null +++ b/tutorial/Chapter.1.DataFlow/3.CompoundModel/ex3.cpp @@ -0,0 +1,92 @@ +#include + +#include + +#include "common.hpp" + +void InitMemBlob(double* ptr, int size) +{ + for (int i = 0; i < size; i++) { + ptr[i] = i; + } +} + +void ExampleCompute(double* in, double* out, int size) +{ + for (int i = 0; i < size; i++) { + out[i] = in[i]; + } +} + +void ExampleAMSTensorCompute(ams::AMSTensor& in, ams::AMSTensor& out) +{ + ExampleCompute(in.data(), out.data(), in.shape()[0]); +} + +double ComputeSum(double* out, int size) +{ + double sum = 0; + for (int i = 0; i < size; i++) { + sum += out[i]; + } + return sum; +} + +int main(int argc, char* argv[]) +{ + using namespace ams; + int length; + ExampleArgs args; + args.AddOption(&length, + "-l", + "--length", + "The size of the vectors to be initialized"); + args.Parse(argc, argv); + if (!args.Good()) { + std::cout << "Wrong command line arguments\n"; + args.PrintOptions(); + return -1; + } + + ams::AMSInit(); + + double* input = new double[length]; + double* output = new double[length]; + + InitMemBlob(input, length); + + /* + * Create AMS tensors for memory blobs + */ + + // We represet both input/output as blobs of lenth 'samples', each sample as 1 element. 
+ SmallVector input_tensors; + SmallVector inout_tensors; + SmallVector output_tensors; + + input_tensors.push_back(ams::AMSTensor::view( + input, {length, 1}, {1, 1}, ams::AMSResourceType::AMS_HOST)); + + output_tensors.push_back(ams::AMSTensor::view( + output, {length, 1}, {1, 1}, ams::AMSResourceType::AMS_HOST)); + + auto Computation = [&](ams::SmallVector& ams_ins, + ams::SmallVector& ams_inouts, + ams::SmallVector& ams_outs) { + ExampleAMSTensorCompute(ams_ins[0], ams_outs[0]); + }; + + Computation(input_tensors, inout_tensors, output_tensors); + auto sum = ComputeSum(output, length); + + std::cout << "[Example] Expected output is " << (length * (length - 1)) / 2 + << " and computed " << sum << "\n"; + + + delete[] input; + delete[] output; + ams::AMSFinalize(); + + + return 0; +} diff --git a/tutorial/Chapter.1.DataFlow/3.CompoundModel/sol3.cpp b/tutorial/Chapter.1.DataFlow/3.CompoundModel/sol3.cpp new file mode 100644 index 00000000..b773aaa5 --- /dev/null +++ b/tutorial/Chapter.1.DataFlow/3.CompoundModel/sol3.cpp @@ -0,0 +1,100 @@ +#include + +#include + +#include "common.hpp" + +void InitMemBlob(double* ptr, int size) +{ + for (int i = 0; i < size; i++) { + ptr[i] = i; + } +} + +void ExampleCompute(double* in, double* out, int size) +{ + for (int i = 0; i < size; i++) { + out[i] = in[i]; + } +} + +void ExampleAMSTensorCompute(const ams::AMSTensor& in, ams::AMSTensor& out) +{ + ExampleCompute(in.data(), out.data(), in.shape()[0]); +} + +double ComputeSum(double* out, int size) +{ + double sum = 0; + for (int i = 0; i < size; i++) { + sum += out[i]; + } + return sum; +} + +int main(int argc, char* argv[]) +{ + using namespace ams; + int length; + ExampleArgs args; + args.AddOption(&length, + "-l", + "--length", + "The size of the vectors to be initialized"); + args.Parse(argc, argv); + if (!args.Good()) { + std::cout << "Wrong command line arguments\n"; + args.PrintOptions(); + return -1; + } + + ams::AMSInit(); + + double* input = new double[length]; + 
double* output = new double[length]; + + AMSConfigureFSDatabase(ams::AMSDBType::AMS_NONE, ""); + InitMemBlob(input, length); + + AMSCAbstrModel model_descr = AMSRegisterAbstractModel( + "compute", ams::AMSUQPolicy::AMS_RANDOM, -1.0, "", ""); + + /* + * Create AMS tensors for memory blobs + */ + + // We represet both input/output as blobs of lenth 'samples', each sample as 1 element. + SmallVector input_tensors; + SmallVector inout_tensors; + SmallVector output_tensors; + + input_tensors.push_back(ams::AMSTensor::view( + input, {length, 1}, {1, 1}, ams::AMSResourceType::AMS_HOST)); + + output_tensors.push_back(ams::AMSTensor::view( + output, {length, 1}, {1, 1}, ams::AMSResourceType::AMS_HOST)); + + EOSLambda Computation = [&](const ams::SmallVector& ams_ins, + ams::SmallVector& ams_inouts, + ams::SmallVector& ams_outs) { + ExampleAMSTensorCompute(ams_ins[0], ams_outs[0]); + }; + + AMSExecutor wf = AMSCreateExecutor(model_descr, 0, 1); + std::cout << "Calling AMS Execute\n"; + AMSExecute(wf, Computation, input_tensors, inout_tensors, output_tensors); + std::cout << "Called AMS Execute\n"; + + auto sum = ComputeSum(output, length); + + std::cout << "[Example] Expected output is " << (length * (length - 1)) / 2 + << " and computed " << sum << "\n"; + + + delete[] input; + delete[] output; + ams::AMSFinalize(); + + + return 0; +} diff --git a/tutorial/Chapter.2.DB/1.HDF5/CMakeLists.txt b/tutorial/Chapter.2.DB/1.HDF5/CMakeLists.txt new file mode 100644 index 00000000..7417eb8b --- /dev/null +++ b/tutorial/Chapter.2.DB/1.HDF5/CMakeLists.txt @@ -0,0 +1,38 @@ +# Copyright 2022-2023 Lawrence Livermore National Security, LLC and other +# AMSLib Project Developers +# +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + + +cmake_minimum_required(VERSION 3.20) +cmake_policy(SET CMP0104 NEW) + +# Define the project +project(EX4 LANGUAGES CXX C) + +set(AMS_EXAMPLE_SRC ex4.cpp) +set(AMS_SOL_SRC sol4.cpp) + +option(WITH_MPI "Option to enable MPI" OFF) +option(WITH_CALIPER 
"Use Caliper for Profiling" OFF) + +set(CMAKE_CXX_STANDARD 14) # Enable C++14 +set(CMAKE_CXX_STANDARD_REQUIRED ON) # Require the specified standard + +find_package(AMS REQUIRED) + +if (WITH_CALIPER) + find_package(caliper REQUIRED) +endif() + +if (WITH_MPI) + find_package(MPI REQUIRED) +endif() + + +add_executable(EX4 ${AMS_EXAMPLE_SRC}) +target_link_libraries(EX4 PRIVATE AMS::AMS) + +add_executable(SOL4 ${AMS_SOL_SRC}) +target_link_libraries(SOL4 PRIVATE AMS::AMS) + diff --git a/tutorial/Chapter.2.DB/1.HDF5/README.md b/tutorial/Chapter.2.DB/1.HDF5/README.md new file mode 100644 index 00000000..d23e3b18 --- /dev/null +++ b/tutorial/Chapter.2.DB/1.HDF5/README.md @@ -0,0 +1,14 @@ +# Data collection through the hdf5 interface + +AMS provides file system databases and in-situ data processing capabilities. File system databases are created in an existing directory and files follow the hdf5 file format. + +Within the hdf5 files there will exist 2 *datasets* the inputs and the outputs. +1. Inputs will contain all input tensors and inout tensors as defined by the application executor. +2. Outputs contain both output tensors and inout tensors as defined by the application executor. + +Please extend the example code to perform data collection through the AMS interface. + +## Extend the solution cli to accept a directory to store all data to and the file name prefix. + +You will need to instantiate a file system database and assign a database label to the compound model. The database label will act as the prefix of the filename. 
+ diff --git a/tutorial/Chapter.2.DB/1.HDF5/common.hpp b/tutorial/Chapter.2.DB/1.HDF5/common.hpp new file mode 100644 index 00000000..ae8ef4f3 --- /dev/null +++ b/tutorial/Chapter.2.DB/1.HDF5/common.hpp @@ -0,0 +1,132 @@ +#include +#include +#include +#include + +class ExampleArgs +{ +public: + void PrintOptions() const + { + std::cout << "Available options:\n"; + for (const auto& opt : registered_) { + std::cout << " "; + for (size_t i = 0; i < opt.keys.size(); ++i) { + std::cout << opt.keys[i]; + if (i + 1 < opt.keys.size()) std::cout << ", "; + } + std::cout << "\n " << opt.help << "\n"; + } + } + + void Parse(int argc, char** argv) + { + for (int i = 1; i < argc; ++i) { + std::string arg = argv[i]; + if (arg[0] != '-') continue; + + std::string key = arg; + std::string value; + + // If the next item isn't an option, treat it as a value + if (i + 1 < argc && argv[i + 1][0] != '-') { + value = argv[++i]; + } else { + value = "true"; // Boolean flag + } + + options_[key] = value; + } + + // Set parsed values into variables + for (auto& opt : registered_) { + const auto& keys = opt.keys; + for (const auto& key : keys) { + if (options_.count(key)) { + opt.setter(options_[key]); + opt.wasset = true; + break; + } + } + } + } + + template + void AddOption(T* out, + std::string short_opt, + std::string long_opt, + std::string help, + bool required = true) + { + registered_.push_back( + {{short_opt, long_opt}, + [out](const std::string& val) { parseValue(val, out); }, + [out]() { return toString(*out); }, + help, + required, + false}); + } + + bool Good() const + { + for (const auto& opt : registered_) { + if (opt.required && !opt.wasset) { + return false; + } + } + return true; + } + + void PrintUsage() const + { + std::cout << "Parsed arguments:\n"; + for (const auto& opt : registered_) { + std::cout << " "; + for (size_t i = 0; i < opt.keys.size(); ++i) { + std::cout << opt.keys[i]; + if (i + 1 < opt.keys.size()) std::cout << ", "; + } + std::cout << " = " << 
opt.getter() << "\n"; + } + } + +private: + struct RegisteredOption { + std::vector keys; + std::function setter; + std::function getter; + std::string help; + bool required; + bool wasset; + }; + + std::vector registered_; + std::unordered_map options_; + + // Parser helper + template + static void parseValue(const std::string& s, T* out); + + static void parseValue(const std::string& s, std::string* out) { *out = s; } + + static void parseValue(const std::string& s, int* out) + { + *out = std::stoi(s); + } + + static void parseValue(const std::string& s, bool* out) + { + *out = (s == "true" || s == "1"); + } + + static void parseValue(const std::string& s, double* out) + { + *out = std::stod(s); + } + + + static std::string toString(const std::string& val) { return val; } + static std::string toString(bool val) { return val ? "true" : "false"; } + static std::string toString(int val) { return std::to_string(val); } + static std::string toString(double val) { return std::to_string(val); } +}; diff --git a/tutorial/Chapter.2.DB/1.HDF5/ex4.cpp b/tutorial/Chapter.2.DB/1.HDF5/ex4.cpp new file mode 100644 index 00000000..22a47278 --- /dev/null +++ b/tutorial/Chapter.2.DB/1.HDF5/ex4.cpp @@ -0,0 +1,101 @@ +#include + +#include + +#include "common.hpp" + +void InitMemBlob(double* ptr, int size) +{ + for (int i = 0; i < size; i++) { + ptr[i] = i; + } +} + +void ExampleCompute(double* in, double* out, int size) +{ + for (int i = 0; i < size; i++) { + out[i] = in[i]; + } +} + +void ExampleAMSTensorCompute(const ams::AMSTensor& in, ams::AMSTensor& out) +{ + ExampleCompute(in.data(), out.data(), in.shape()[0]); +} + +double ComputeSum(double* out, int size) +{ + double sum = 0; + for (int i = 0; i < size; i++) { + sum += out[i]; + } + return sum; +} + +int main(int argc, char* argv[]) +{ + using namespace ams; + int length; + ExampleArgs args; + args.AddOption(&length, + "-l", + "--length", + "The size of the vectors to be initialized"); + args.Parse(argc, argv); + if 
(!args.Good()) { + std::cout << "Wrong command line arguments\n"; + args.PrintOptions(); + return -1; + } + + ams::AMSInit(); + + double* input = new double[length]; + double* output = new double[length]; + + AMSConfigureFSDatabase(ams::AMSDBType::AMS_NONE, ""); + InitMemBlob(input, length); + + AMSCAbstrModel model_descr = AMSRegisterAbstractModel( + "compute", ams::AMSUQPolicy::AMS_RANDOM, -1.0, "", ""); + + + /* + * Create AMS tensors for memory blobs + */ + + // We represet both input/output as blobs of lenth 'samples', each sample as 1 element. + SmallVector input_tensors; + SmallVector inout_tensors; + SmallVector output_tensors; + + input_tensors.push_back(ams::AMSTensor::view( + input, {length, 1}, {1, 1}, ams::AMSResourceType::AMS_HOST)); + + output_tensors.push_back(ams::AMSTensor::view( + output, {length, 1}, {1, 1}, ams::AMSResourceType::AMS_HOST)); + + EOSLambda Computation = [&](const ams::SmallVector& ams_ins, + ams::SmallVector& ams_inouts, + ams::SmallVector& ams_outs) { + ExampleAMSTensorCompute(ams_ins[0], ams_outs[0]); + }; + + AMSExecutor wf = AMSCreateExecutor(model_descr, 0, 1); + std::cout << "Calling AMS Execute\n"; + AMSExecute(wf, Computation, input_tensors, inout_tensors, output_tensors); + std::cout << "Called AMS Execute\n"; + + auto sum = ComputeSum(output, length); + + std::cout << "[Example] Expected output is " << (length * (length - 1)) / 2 + << " and computed " << sum << "\n"; + + + delete[] input; + delete[] output; + ams::AMSFinalize(); + + + return 0; +} diff --git a/tutorial/Chapter.2.DB/1.HDF5/sol4.cpp b/tutorial/Chapter.2.DB/1.HDF5/sol4.cpp new file mode 100644 index 00000000..dabf228e --- /dev/null +++ b/tutorial/Chapter.2.DB/1.HDF5/sol4.cpp @@ -0,0 +1,112 @@ +#include + +#include + +#include "common.hpp" + +void InitMemBlob(double* ptr, int size) +{ + for (int i = 0; i < size; i++) { + ptr[i] = i; + } +} + +void ExampleCompute(double* in, double* out, int size) +{ + for (int i = 0; i < size; i++) { + out[i] = in[i]; + } 
+} + +void ExampleAMSTensorCompute(const ams::AMSTensor& in, ams::AMSTensor& out) +{ + ExampleCompute(in.data(), out.data(), in.shape()[0]); +} + +double ComputeSum(double* out, int size) +{ + double sum = 0; + for (int i = 0; i < size; i++) { + sum += out[i]; + } + return sum; +} + +int main(int argc, char* argv[]) +{ + using namespace ams; + int length; + std::string db_path; + std::string fn; + ExampleArgs args; + args.AddOption(&length, + "-l", + "--length", + "The size of the vectors to be initialized"); + args.AddOption(&db_path, + "-p", + "--db-path", + "The path of the file system database to store data to"); + args.AddOption(&fn, + "-f", + "--filename-prefix", + "The path of the file system database to store data to"); + + args.Parse(argc, argv); + if (!args.Good()) { + std::cout << "Wrong command line arguments\n"; + args.PrintOptions(); + return -1; + } + + ams::AMSInit(); + + double* input = new double[length]; + double* output = new double[length]; + + AMSConfigureFSDatabase(ams::AMSDBType::AMS_HDF5, db_path.c_str()); + InitMemBlob(input, length); + + AMSCAbstrModel model_descr = AMSRegisterAbstractModel( + "compute", ams::AMSUQPolicy::AMS_RANDOM, -1.0, "", fn.c_str()); + + + /* + * Create AMS tensors for memory blobs + */ + + // We represet both input/output as blobs of lenth 'samples', each sample as 1 element. 
+ SmallVector input_tensors; + SmallVector inout_tensors; + SmallVector output_tensors; + + input_tensors.push_back(ams::AMSTensor::view( + input, {length, 1}, {1, 1}, ams::AMSResourceType::AMS_HOST)); + + output_tensors.push_back(ams::AMSTensor::view( + output, {length, 1}, {1, 1}, ams::AMSResourceType::AMS_HOST)); + + EOSLambda Computation = [&](const ams::SmallVector& ams_ins, + ams::SmallVector& ams_inouts, + ams::SmallVector& ams_outs) { + ExampleAMSTensorCompute(ams_ins[0], ams_outs[0]); + }; + + AMSExecutor wf = AMSCreateExecutor(model_descr, 0, 1); + std::cout << "Calling AMS Execute\n"; + AMSExecute(wf, Computation, input_tensors, inout_tensors, output_tensors); + std::cout << "Called AMS Execute\n"; + + auto sum = ComputeSum(output, length); + + std::cout << "[Example] Expected output is " << (length * (length - 1)) / 2 + << " and computed " << sum << "\n"; + + + delete[] input; + delete[] output; + ams::AMSFinalize(); + + + return 0; +} diff --git a/tutorial/Chapter.2.DB/2.RMQ/README.md b/tutorial/Chapter.2.DB/2.RMQ/README.md new file mode 100644 index 00000000..77fc024a --- /dev/null +++ b/tutorial/Chapter.2.DB/2.RMQ/README.md @@ -0,0 +1 @@ +#TBD diff --git a/tutorial/Chapter.3.Inference/1.HDF5/CMakeLists.txt b/tutorial/Chapter.3.Inference/1.HDF5/CMakeLists.txt new file mode 100644 index 00000000..9e7f0bdd --- /dev/null +++ b/tutorial/Chapter.3.Inference/1.HDF5/CMakeLists.txt @@ -0,0 +1,33 @@ +# Copyright 2022-2023 Lawrence Livermore National Security, LLC and other +# AMSLib Project Developers +# +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + + +cmake_minimum_required(VERSION 3.20) +cmake_policy(SET CMP0104 NEW) + +# Define the project +project(EX5 LANGUAGES CXX C) + +set(AMS_EXAMPLE_SRC ex5.cpp) + +option(WITH_MPI "Option to enable MPI" OFF) +option(WITH_CALIPER "Use Caliper for Profiling" OFF) + +set(CMAKE_CXX_STANDARD 14) # Enable C++14 +set(CMAKE_CXX_STANDARD_REQUIRED ON) # Require the specified standard + +find_package(AMS REQUIRED) 
+ +if (WITH_CALIPER) + find_package(caliper REQUIRED) +endif() + +if (WITH_MPI) + find_package(MPI REQUIRED) +endif() + + +add_executable(EX5 ${AMS_EXAMPLE_SRC}) +target_link_libraries(EX5 PRIVATE AMS::AMS) diff --git a/tutorial/Chapter.3.Inference/1.HDF5/README.md b/tutorial/Chapter.3.Inference/1.HDF5/README.md new file mode 100644 index 00000000..d27fd8d3 --- /dev/null +++ b/tutorial/Chapter.3.Inference/1.HDF5/README.md @@ -0,0 +1,11 @@ +# Data collection through the hdf5 interface + +This example uses the solution of the data collection hdf5 chapter to create data to train our model on. + +Please build the example code and execute the following: + +``` +AMS_LOG_LEVEL=debug ./EX5 -l 100 -f db -p ../../ +``` + +to generate a file `db_0.h5` 2 directories above the current one that will contain 1000 samples. diff --git a/tutorial/Chapter.3.Inference/1.HDF5/common.hpp b/tutorial/Chapter.3.Inference/1.HDF5/common.hpp new file mode 100644 index 00000000..ae8ef4f3 --- /dev/null +++ b/tutorial/Chapter.3.Inference/1.HDF5/common.hpp @@ -0,0 +1,132 @@ +#include +#include +#include +#include + +class ExampleArgs +{ +public: + void PrintOptions() const + { + std::cout << "Available options:\n"; + for (const auto& opt : registered_) { + std::cout << " "; + for (size_t i = 0; i < opt.keys.size(); ++i) { + std::cout << opt.keys[i]; + if (i + 1 < opt.keys.size()) std::cout << ", "; + } + std::cout << "\n " << opt.help << "\n"; + } + } + + void Parse(int argc, char** argv) + { + for (int i = 1; i < argc; ++i) { + std::string arg = argv[i]; + if (arg[0] != '-') continue; + + std::string key = arg; + std::string value; + + // If the next item isn't an option, treat it as a value + if (i + 1 < argc && argv[i + 1][0] != '-') { + value = argv[++i]; + } else { + value = "true"; // Boolean flag + } + + options_[key] = value; + } + + // Set parsed values into variables + for (auto& opt : registered_) { + const auto& keys = opt.keys; + for (const auto& key : keys) { + if 
(options_.count(key)) { + opt.setter(options_[key]); + opt.wasset = true; + break; + } + } + } + } + + template + void AddOption(T* out, + std::string short_opt, + std::string long_opt, + std::string help, + bool required = true) + { + registered_.push_back( + {{short_opt, long_opt}, + [out](const std::string& val) { parseValue(val, out); }, + [out]() { return toString(*out); }, + help, + required, + false}); + } + + bool Good() const + { + for (const auto& opt : registered_) { + if (opt.required && !opt.wasset) { + return false; + } + } + return true; + } + + void PrintUsage() const + { + std::cout << "Parsed arguments:\n"; + for (const auto& opt : registered_) { + std::cout << " "; + for (size_t i = 0; i < opt.keys.size(); ++i) { + std::cout << opt.keys[i]; + if (i + 1 < opt.keys.size()) std::cout << ", "; + } + std::cout << " = " << opt.getter() << "\n"; + } + } + +private: + struct RegisteredOption { + std::vector keys; + std::function setter; + std::function getter; + std::string help; + bool required; + bool wasset; + }; + + std::vector registered_; + std::unordered_map options_; + + // Parser helper + template + static void parseValue(const std::string& s, T* out); + + static void parseValue(const std::string& s, std::string* out) { *out = s; } + + static void parseValue(const std::string& s, int* out) + { + *out = std::stoi(s); + } + + static void parseValue(const std::string& s, bool* out) + { + *out = (s == "true" || s == "1"); + } + + static void parseValue(const std::string& s, double* out) + { + *out = std::stod(s); + } + + + static std::string toString(const std::string& val) { return val; } + static std::string toString(bool val) { return val ? 
"true" : "false"; } + static std::string toString(int val) { return std::to_string(val); } + static std::string toString(double val) { return std::to_string(val); } +}; diff --git a/tutorial/Chapter.3.Inference/1.HDF5/ex5.cpp b/tutorial/Chapter.3.Inference/1.HDF5/ex5.cpp new file mode 100644 index 00000000..dabf228e --- /dev/null +++ b/tutorial/Chapter.3.Inference/1.HDF5/ex5.cpp @@ -0,0 +1,112 @@ +#include + +#include + +#include "common.hpp" + +void InitMemBlob(double* ptr, int size) +{ + for (int i = 0; i < size; i++) { + ptr[i] = i; + } +} + +void ExampleCompute(double* in, double* out, int size) +{ + for (int i = 0; i < size; i++) { + out[i] = in[i]; + } +} + +void ExampleAMSTensorCompute(const ams::AMSTensor& in, ams::AMSTensor& out) +{ + ExampleCompute(in.data(), out.data(), in.shape()[0]); +} + +double ComputeSum(double* out, int size) +{ + double sum = 0; + for (int i = 0; i < size; i++) { + sum += out[i]; + } + return sum; +} + +int main(int argc, char* argv[]) +{ + using namespace ams; + int length; + std::string db_path; + std::string fn; + ExampleArgs args; + args.AddOption(&length, + "-l", + "--length", + "The size of the vectors to be initialized"); + args.AddOption(&db_path, + "-p", + "--db-path", + "The path of the file system database to store data to"); + args.AddOption(&fn, + "-f", + "--filename-prefix", + "The path of the file system database to store data to"); + + args.Parse(argc, argv); + if (!args.Good()) { + std::cout << "Wrong command line arguments\n"; + args.PrintOptions(); + return -1; + } + + ams::AMSInit(); + + double* input = new double[length]; + double* output = new double[length]; + + AMSConfigureFSDatabase(ams::AMSDBType::AMS_HDF5, db_path.c_str()); + InitMemBlob(input, length); + + AMSCAbstrModel model_descr = AMSRegisterAbstractModel( + "compute", ams::AMSUQPolicy::AMS_RANDOM, -1.0, "", fn.c_str()); + + + /* + * Create AMS tensors for memory blobs + */ + + // We represet both input/output as blobs of lenth 'samples', each 
sample as 1 element. + SmallVector input_tensors; + SmallVector inout_tensors; + SmallVector output_tensors; + + input_tensors.push_back(ams::AMSTensor::view( + input, {length, 1}, {1, 1}, ams::AMSResourceType::AMS_HOST)); + + output_tensors.push_back(ams::AMSTensor::view( + output, {length, 1}, {1, 1}, ams::AMSResourceType::AMS_HOST)); + + EOSLambda Computation = [&](const ams::SmallVector& ams_ins, + ams::SmallVector& ams_inouts, + ams::SmallVector& ams_outs) { + ExampleAMSTensorCompute(ams_ins[0], ams_outs[0]); + }; + + AMSExecutor wf = AMSCreateExecutor(model_descr, 0, 1); + std::cout << "Calling AMS Execute\n"; + AMSExecute(wf, Computation, input_tensors, inout_tensors, output_tensors); + std::cout << "Called AMS Execute\n"; + + auto sum = ComputeSum(output, length); + + std::cout << "[Example] Expected output is " << (length * (length - 1)) / 2 + << " and computed " << sum << "\n"; + + + delete[] input; + delete[] output; + ams::AMSFinalize(); + + + return 0; +} diff --git a/tutorial/Chapter.3.Inference/2.Train/README.md b/tutorial/Chapter.3.Inference/2.Train/README.md new file mode 100644 index 00000000..24c1642b --- /dev/null +++ b/tutorial/Chapter.3.Inference/2.Train/README.md @@ -0,0 +1,4 @@ +# Data collection through the hdf5 interface + +Please write a python script to train a model using the data produced by the previous step. The script needs to store the final model into a torch::jit file. +We provide as an example the [train](train.py) script which trains a model and properly stores it into a file. 
diff --git a/tutorial/Chapter.3.Inference/2.Train/train.py b/tutorial/Chapter.3.Inference/2.Train/train.py new file mode 100644 index 00000000..cc66c4e4 --- /dev/null +++ b/tutorial/Chapter.3.Inference/2.Train/train.py @@ -0,0 +1,82 @@ +import argparse +from pathlib import Path +import h5py +import torch +from torch import nn, optim +from torch.utils.data import TensorDataset, DataLoader + + +class SimpleModel(nn.Module): + # A simple model that contains a single linear layer + def __init__(self, in_features, out_features): + super(SimpleModel, self).__init__() + self.fc = nn.Linear(in_features, out_features, False) + # self.initialize_weights() + + def initialize_weights(self): + # Check if in_features == out_features for identity initialization + if self.fc.weight.shape[0] == self.fc.weight.shape[1]: + nn.init.eye_(self.fc.weight) # Initialize with identity matrix + else: + raise ValueError("Identity initialization requires in_features == out_features") + + def forward(self, x): + return self.fc(x) + + +def main(): + # Parse command-line arguments + parser = argparse.ArgumentParser(description="Train and store a model.") + parser.add_argument("--filename", "-fn", type=str, required=True, help="The file containing the data to train on") + parser.add_argument("--epochs", "-e", type=int, help="Number of epochs", default=10) + parser.add_argument("--model-file", "-m", help="Filename of model", default="model") + args = parser.parse_args() + if not Path(args.filename).exists(): + raise RuntimeError(f"Please provide an existing file, file {args.filename} does not exist") + device = "cpu" + + with h5py.File(args.filename, "r") as fd: + X, y = fd["input_data"][:], fd["output_data"][:] + dataset = TensorDataset( + torch.from_numpy(X), + torch.from_numpy(y), + ) + model = SimpleModel(X.shape[-1], y.shape[-1]) + optimizer = optim.Adam(model.parameters(), lr=1e-3) + criterion = nn.MSELoss() + loader = DataLoader(dataset, batch_size=64, shuffle=True) + + for epoch in range(1, 
args.epochs + 1): + model.train() + running_loss = 0.0 + + for xb, yb in loader: + xb, yb = xb.to(device), yb.to(device) + + optimizer.zero_grad() + preds = model(xb) + loss = criterion(preds, yb) + loss.backward() + optimizer.step() + + running_loss += loss.item() * xb.size(0) + + epoch_loss = running_loss / len(loader.dataset) + print(f"Epoch {epoch:2d}/{args.epochs} — Loss: {epoch_loss:.4f}") + model = model.float() + prec = torch.float32 + example_input = torch.randn(1, 1, device=device, dtype=prec) + + # Trace the model + scripted_model = torch.jit.trace(model, example_input) + + # Generate the file name + file_path = f"{args.model_file}.pt" + # Save the scripted model + scripted_model.save(file_path) + + print(f"Model saved to {file_path}") + + +if __name__ == "__main__": + main() diff --git a/tutorial/Chapter.3.Inference/3.Serve/CMakeLists.txt b/tutorial/Chapter.3.Inference/3.Serve/CMakeLists.txt new file mode 100644 index 00000000..d4752b09 --- /dev/null +++ b/tutorial/Chapter.3.Inference/3.Serve/CMakeLists.txt @@ -0,0 +1,38 @@ +# Copyright 2022-2023 Lawrence Livermore National Security, LLC and other +# AMSLib Project Developers +# +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + + +cmake_minimum_required(VERSION 3.20) +cmake_policy(SET CMP0104 NEW) + +# Define the project +project(EX6 LANGUAGES CXX C) + +set(AMS_EXAMPLE_SRC ex6.cpp) +set(AMS_SOL_SRC sol6.cpp) + +option(WITH_MPI "Option to enable MPI" OFF) +option(WITH_CALIPER "Use Caliper for Profiling" OFF) + +set(CMAKE_CXX_STANDARD 14) # Enable C++14 +set(CMAKE_CXX_STANDARD_REQUIRED ON) # Require the specified standard + +find_package(AMS REQUIRED) + +if (WITH_CALIPER) + find_package(caliper REQUIRED) +endif() + +if (WITH_MPI) + find_package(MPI REQUIRED) +endif() + + +add_executable(EX6 ${AMS_EXAMPLE_SRC}) +target_link_libraries(EX6 PRIVATE AMS::AMS) + +add_executable(SOL6 ${AMS_SOL_SRC}) +target_link_libraries(SOL6 PRIVATE AMS::AMS) + diff --git 
a/tutorial/Chapter.3.Inference/3.Serve/README.md b/tutorial/Chapter.3.Inference/3.Serve/README.md new file mode 100644 index 00000000..175970cd --- /dev/null +++ b/tutorial/Chapter.3.Inference/3.Serve/README.md @@ -0,0 +1,14 @@ +# Model inference through the AMS interface. + +Please modify the example code to create an AMSCompoundModel with the model trained in the previous example. Since we have already defined compounding and the matching of the executors with the underlying function, the changes should be minimal. + +## Threshold and Uncertainty. + +In AMS every compound model is accompanied by some UQ method. When models do not define their own uncertainty approach, we will define by default *random* uncertainty. Although the implementation of each ML UQ mechanism will be different, the interface remains the same. + + +### Random Uncertainty + +In the case of random uncertainty *AMS* computes a random value in the interval [0,1] for every computed sample. When the uncertainty (the randomly generated value) is higher than a user-provided threshold, AMS considers this a highly uncertain inference and only computes the underlying function for that specific sample. + +In the provided [solution](./sol6.cpp) there is an extra CLI parameter that defines the uncertainty threshold for the compound model. Use the parameter and explore both the reported values and how the size of the database increases.
diff --git a/tutorial/Chapter.3.Inference/3.Serve/common.hpp b/tutorial/Chapter.3.Inference/3.Serve/common.hpp new file mode 100644 index 00000000..253bb0b8 --- /dev/null +++ b/tutorial/Chapter.3.Inference/3.Serve/common.hpp @@ -0,0 +1,137 @@ +#include +#include +#include +#include + +class ExampleArgs +{ +public: + void PrintOptions() const + { + std::cout << "Available options:\n"; + for (const auto& opt : registered_) { + std::cout << " "; + for (size_t i = 0; i < opt.keys.size(); ++i) { + std::cout << opt.keys[i]; + if (i + 1 < opt.keys.size()) std::cout << ", "; + } + std::cout << "\n " << opt.help << "\n"; + } + } + + void Parse(int argc, char** argv) + { + for (int i = 1; i < argc; ++i) { + std::string arg = argv[i]; + if (arg[0] != '-') continue; + + std::string key = arg; + std::string value; + + // If the next item isn't an option, treat it as a value + if (i + 1 < argc && argv[i + 1][0] != '-') { + value = argv[++i]; + } else { + value = "true"; // Boolean flag + } + + options_[key] = value; + } + + // Set parsed values into variables + for (auto& opt : registered_) { + const auto& keys = opt.keys; + for (const auto& key : keys) { + if (options_.count(key)) { + opt.setter(options_[key]); + opt.wasset = true; + break; + } + } + } + } + + template + void AddOption(T* out, + std::string short_opt, + std::string long_opt, + std::string help, + bool required = true) + { + registered_.push_back( + {{short_opt, long_opt}, + [out](const std::string& val) { parseValue(val, out); }, + [out]() { return toString(*out); }, + help, + required, + false}); + } + + bool Good() const + { + for (const auto& opt : registered_) { + if (opt.required && !opt.wasset) { + return false; + } + } + return true; + } + + void PrintUsage() const + { + std::cout << "Parsed arguments:\n"; + for (const auto& opt : registered_) { + std::cout << " "; + for (size_t i = 0; i < opt.keys.size(); ++i) { + std::cout << opt.keys[i]; + if (i + 1 < opt.keys.size()) std::cout << ", "; + } + 
std::cout << " = " << opt.getter() << "\n"; + } + } + +private: + struct RegisteredOption { + std::vector keys; + std::function setter; + std::function getter; + std::string help; + bool required; + bool wasset; + }; + + std::vector registered_; + std::unordered_map options_; + + // Parser helper + template + static void parseValue(const std::string& s, T* out); + + static void parseValue(const std::string& s, std::string* out) { *out = s; } + + static void parseValue(const std::string& s, int* out) + { + *out = std::stoi(s); + } + + static void parseValue(const std::string& s, float* out) + { + *out = std::atof(s.c_str()); + } + + static void parseValue(const std::string& s, bool* out) + { + *out = (s == "true" || s == "1"); + } + + static void parseValue(const std::string& s, double* out) + { + *out = std::stod(s); + } + + + static std::string toString(const std::string& val) { return val; } + static std::string toString(bool val) { return val ? "true" : "false"; } + static std::string toString(int val) { return std::to_string(val); } + static std::string toString(double val) { return std::to_string(val); } +}; diff --git a/tutorial/Chapter.3.Inference/3.Serve/ex6.cpp b/tutorial/Chapter.3.Inference/3.Serve/ex6.cpp new file mode 100644 index 00000000..dabf228e --- /dev/null +++ b/tutorial/Chapter.3.Inference/3.Serve/ex6.cpp @@ -0,0 +1,112 @@ +#include + +#include + +#include "common.hpp" + +void InitMemBlob(double* ptr, int size) +{ + for (int i = 0; i < size; i++) { + ptr[i] = i; + } +} + +void ExampleCompute(double* in, double* out, int size) +{ + for (int i = 0; i < size; i++) { + out[i] = in[i]; + } +} + +void ExampleAMSTensorCompute(const ams::AMSTensor& in, ams::AMSTensor& out) +{ + ExampleCompute(in.data(), out.data(), in.shape()[0]); +} + +double ComputeSum(double* out, int size) +{ + double sum = 0; + for (int i = 0; i < size; i++) { + sum += out[i]; + } + return sum; +} + +int main(int argc, char* argv[]) +{ + using namespace ams; + int length; + 
std::string db_path; + std::string fn; + ExampleArgs args; + args.AddOption(&length, + "-l", + "--length", + "The size of the vectors to be initialized"); + args.AddOption(&db_path, + "-p", + "--db-path", + "The path of the file system database to store data to"); + args.AddOption(&fn, + "-f", + "--filename-prefix", + "The path of the file system database to store data to"); + + args.Parse(argc, argv); + if (!args.Good()) { + std::cout << "Wrong command line arguments\n"; + args.PrintOptions(); + return -1; + } + + ams::AMSInit(); + + double* input = new double[length]; + double* output = new double[length]; + + AMSConfigureFSDatabase(ams::AMSDBType::AMS_HDF5, db_path.c_str()); + InitMemBlob(input, length); + + AMSCAbstrModel model_descr = AMSRegisterAbstractModel( + "compute", ams::AMSUQPolicy::AMS_RANDOM, -1.0, "", fn.c_str()); + + + /* + * Create AMS tensors for memory blobs + */ + + // We represet both input/output as blobs of lenth 'samples', each sample as 1 element. + SmallVector input_tensors; + SmallVector inout_tensors; + SmallVector output_tensors; + + input_tensors.push_back(ams::AMSTensor::view( + input, {length, 1}, {1, 1}, ams::AMSResourceType::AMS_HOST)); + + output_tensors.push_back(ams::AMSTensor::view( + output, {length, 1}, {1, 1}, ams::AMSResourceType::AMS_HOST)); + + EOSLambda Computation = [&](const ams::SmallVector& ams_ins, + ams::SmallVector& ams_inouts, + ams::SmallVector& ams_outs) { + ExampleAMSTensorCompute(ams_ins[0], ams_outs[0]); + }; + + AMSExecutor wf = AMSCreateExecutor(model_descr, 0, 1); + std::cout << "Calling AMS Execute\n"; + AMSExecute(wf, Computation, input_tensors, inout_tensors, output_tensors); + std::cout << "Called AMS Execute\n"; + + auto sum = ComputeSum(output, length); + + std::cout << "[Example] Expected output is " << (length * (length - 1)) / 2 + << " and computed " << sum << "\n"; + + + delete[] input; + delete[] output; + ams::AMSFinalize(); + + + return 0; +} diff --git 
a/tutorial/Chapter.3.Inference/3.Serve/sol6.cpp b/tutorial/Chapter.3.Inference/3.Serve/sol6.cpp new file mode 100644 index 00000000..4c3bbc3c --- /dev/null +++ b/tutorial/Chapter.3.Inference/3.Serve/sol6.cpp @@ -0,0 +1,124 @@ +#include + +#include + +#include "common.hpp" + +void InitMemBlob(double* ptr, int size) +{ + for (int i = 0; i < size; i++) { + ptr[i] = i; + } +} + +void ExampleCompute(double* in, double* out, int size) +{ + for (int i = 0; i < size; i++) { + out[i] = in[i]; + } +} + +void ExampleAMSTensorCompute(const ams::AMSTensor& in, ams::AMSTensor& out) +{ + ExampleCompute(in.data(), out.data(), in.shape()[0]); +} + +double ComputeSum(double* out, int size) +{ + double sum = 0; + for (int i = 0; i < size; i++) { + sum += out[i]; + } + return sum; +} + +int main(int argc, char* argv[]) +{ + using namespace ams; + int length; + std::string db_path; + std::string fn; + std::string model_path; + float threshold; + ExampleArgs args; + args.AddOption(&length, + "-l", + "--length", + "The size of the vectors to be initialized"); + args.AddOption(&db_path, + "-p", + "--db-path", + "The path of the file system database to store data to"); + args.AddOption(&fn, + "-f", + "--filename-prefix", + "The path of the file system database to store data to"); + args.AddOption(&model_path, + "-m", + "--model-path", + "The path to a torchscript model"); + args.AddOption(&threshold, "-t", "--threshold", "Model uncertainty"); + + + args.Parse(argc, argv); + if (!args.Good()) { + std::cout << "Wrong command line arguments\n"; + args.PrintOptions(); + return -1; + } + + ams::AMSInit(); + + double* input = new double[length]; + double* output = new double[length]; + + AMSConfigureFSDatabase(ams::AMSDBType::AMS_HDF5, db_path.c_str()); + InitMemBlob(input, length); + + AMSCAbstrModel model_descr = + AMSRegisterAbstractModel("compute", + ams::AMSUQPolicy::AMS_RANDOM, + threshold, + model_path.c_str(), + fn.c_str()); + + + /* + * Create AMS tensors for memory blobs + */ + + // We 
represet both input/output as blobs of lenth 'samples', each sample as 1 element. + SmallVector input_tensors; + SmallVector inout_tensors; + SmallVector output_tensors; + + input_tensors.push_back(ams::AMSTensor::view( + input, {length, 1}, {1, 1}, ams::AMSResourceType::AMS_HOST)); + + output_tensors.push_back(ams::AMSTensor::view( + output, {length, 1}, {1, 1}, ams::AMSResourceType::AMS_HOST)); + + EOSLambda Computation = [&](const ams::SmallVector& ams_ins, + ams::SmallVector& ams_inouts, + ams::SmallVector& ams_outs) { + ExampleAMSTensorCompute(ams_ins[0], ams_outs[0]); + }; + + AMSExecutor wf = AMSCreateExecutor(model_descr, 0, 1); + std::cout << "Calling AMS Execute\n"; + AMSExecute(wf, Computation, input_tensors, inout_tensors, output_tensors); + std::cout << "Called AMS Execute\n"; + + auto sum = ComputeSum(output, length); + + std::cout << "[Example] Expected output is " << (length * (length - 1)) / 2 + << " and computed " << sum << "\n"; + + + delete[] input; + delete[] output; + ams::AMSFinalize(); + + + return 0; +} diff --git a/tutorial/Chapter.3.Inference/3.Serve/train.py b/tutorial/Chapter.3.Inference/3.Serve/train.py new file mode 100644 index 00000000..ec76735a --- /dev/null +++ b/tutorial/Chapter.3.Inference/3.Serve/train.py @@ -0,0 +1,82 @@ +import argparse +from pathlib import Path +import h5py +import torch +from torch import nn, optim +from torch.utils.data import TensorDataset, DataLoader + + +class SimpleModel(nn.Module): + # A simple model that contains a single linear layer + def __init__(self, in_features, out_features): + super(SimpleModel, self).__init__() + self.fc = nn.Linear(in_features, out_features, False) + # self.initialize_weights() + + def initialize_weights(self): + # Check if in_features == out_features for identity initialization + if self.fc.weight.shape[0] == self.fc.weight.shape[1]: + nn.init.eye_(self.fc.weight) # Initialize with identity matrix + else: + raise ValueError("Identity initialization requires in_features == 
out_features") + + def forward(self, x): + return self.fc(x) + + +def main(): + # Parse command-line arguments + parser = argparse.ArgumentParser(description="Train and store a model.") + parser.add_argument("--filename", "-fn", type=str, required=True, help="The file containing the data to train on") + parser.add_argument("--epochs", "-e", type=int, help="Number of epochs", default=10) + parser.add_argument("--model-file", "-m", help="Filename of model", default="model") + args = parser.parse_args() + if not Path(args.filename).exists(): + raise RuntimeError(f"Please provide an existing file, file {args.filename} does not exist") + device = "cpu" + + with h5py.File(args.filename, "r") as fd: + X, y = fd["input_data"][:], fd["output_data"][:] + dataset = TensorDataset( + torch.from_numpy(X), + torch.from_numpy(y), + ) + model = SimpleModel(X.shape[-1], y.shape[-1]) + optimizer = optim.Adam(model.parameters(), lr=1e-3) + criterion = nn.MSELoss() + loader = DataLoader(dataset, batch_size=64, shuffle=True) + + for epoch in range(1, args.epochs + 1): + model.train() + running_loss = 0.0 + + for xb, yb in loader: + xb, yb = xb.to(device), yb.to(device) + + optimizer.zero_grad() + preds = model(xb) + loss = criterion(preds, yb) + loss.backward() + optimizer.step() + + running_loss += loss.item() * xb.size(0) + + epoch_loss = running_loss / len(loader.dataset) + print(f"Epoch {epoch:2d}/{args.epochs} — Loss: {epoch_loss:.4f}") + model = model.double() # Set to double precision (float64) + prec = torch.float64 + example_input = torch.randn(1, 1, device=device, dtype=prec) + + # Trace the model + scripted_model = torch.jit.trace(model, example_input) + + # Generate the file name + file_path = f"{args.model_file}.pt" + # Save the scripted model + scripted_model.save(file_path) + + print(f"Model saved to {file_path}") + + +if __name__ == "__main__": + main() diff --git a/tutorial/Chapter.3.Inference/4.UQ/CMakeLists.txt b/tutorial/Chapter.3.Inference/4.UQ/CMakeLists.txt new file mode
100644 index 00000000..8b3ba2ae --- /dev/null +++ b/tutorial/Chapter.3.Inference/4.UQ/CMakeLists.txt @@ -0,0 +1,34 @@ +# Copyright 2022-2023 Lawrence Livermore National Security, LLC and other +# AMSLib Project Developers +# +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + + +cmake_minimum_required(VERSION 3.20) +cmake_policy(SET CMP0104 NEW) + +# Define the project +project(EX6 LANGUAGES CXX C) + +set(AMS_EXAMPLE_SRC ex7.cpp) + +option(WITH_MPI "Option to enable MPI" OFF) +option(WITH_CALIPER "Use Caliper for Profiling" OFF) + +set(CMAKE_CXX_STANDARD 14) # Enable C++14 +set(CMAKE_CXX_STANDARD_REQUIRED ON) # Require the specified standard + +find_package(AMS REQUIRED) + +if (WITH_CALIPER) + find_package(caliper REQUIRED) +endif() + +if (WITH_MPI) + find_package(MPI REQUIRED) +endif() + + +add_executable(EX7 ${AMS_EXAMPLE_SRC}) +target_link_libraries(EX7 PRIVATE AMS::AMS) + diff --git a/tutorial/Chapter.3.Inference/4.UQ/README.md b/tutorial/Chapter.3.Inference/4.UQ/README.md new file mode 100644 index 00000000..7b54cd56 --- /dev/null +++ b/tutorial/Chapter.3.Inference/4.UQ/README.md @@ -0,0 +1,13 @@ +# Model inference through AMS interface. + +Please modify the example code to create a AMSCompoundModel with the model trained on the previous example. Since we have already defined compounding and the matching of the executors with the underlying function. The changes should be minimal. + +## Threshold and Uncertainty. + +In AMS every compound model is accompanied by some UQ method. When models do not define their own uncertainty approach, we will define by default *random* uncertainty. Although the implementation of the ML UQ mechanism will be different the implementation remains the same. + +### Torch Uncertainty + +In the case of random uncertainty *AMS* computes a random value in the interval [0,1] for every computed sample. 
When the uncertainty (the randomly generated value) is higher than a user provided threshold, AMS considers this as a highly uncertain inference and only computes the underlying function for that specific sample. + +We can though implement our own uncertainty quantification mechanism and attach it on the torch trained model. diff --git a/tutorial/Chapter.3.Inference/4.UQ/common.hpp b/tutorial/Chapter.3.Inference/4.UQ/common.hpp new file mode 100644 index 00000000..253bb0b8 --- /dev/null +++ b/tutorial/Chapter.3.Inference/4.UQ/common.hpp @@ -0,0 +1,137 @@ +#include +#include +#include +#include + +class ExampleArgs +{ +public: + void PrintOptions() const + { + std::cout << "Available options:\n"; + for (const auto& opt : registered_) { + std::cout << " "; + for (size_t i = 0; i < opt.keys.size(); ++i) { + std::cout << opt.keys[i]; + if (i + 1 < opt.keys.size()) std::cout << ", "; + } + std::cout << "\n " << opt.help << "\n"; + } + } + + void Parse(int argc, char** argv) + { + for (int i = 1; i < argc; ++i) { + std::string arg = argv[i]; + if (arg[0] != '-') continue; + + std::string key = arg; + std::string value; + + // If the next item isn't an option, treat it as a value + if (i + 1 < argc && argv[i + 1][0] != '-') { + value = argv[++i]; + } else { + value = "true"; // Boolean flag + } + + options_[key] = value; + } + + // Set parsed values into variables + for (auto& opt : registered_) { + const auto& keys = opt.keys; + for (const auto& key : keys) { + if (options_.count(key)) { + opt.setter(options_[key]); + opt.wasset = true; + break; + } + } + } + } + + template + void AddOption(T* out, + std::string short_opt, + std::string long_opt, + std::string help, + bool required = true) + { + registered_.push_back( + {{short_opt, long_opt}, + [out](const std::string& val) { parseValue(val, out); }, + [out]() { return toString(*out); }, + help, + required, + false}); + } + + bool Good() const + { + for (const auto& opt : registered_) { + if (opt.required && 
!opt.wasset) { + return false; + } + } + return true; + } + + void PrintUsage() const + { + std::cout << "Parsed arguments:\n"; + for (const auto& opt : registered_) { + std::cout << " "; + for (size_t i = 0; i < opt.keys.size(); ++i) { + std::cout << opt.keys[i]; + if (i + 1 < opt.keys.size()) std::cout << ", "; + } + std::cout << " = " << opt.getter() << "\n"; + } + } + +private: + struct RegisteredOption { + std::vector keys; + std::function setter; + std::function getter; + std::string help; + bool required; + bool wasset; + }; + + std::vector registered_; + std::unordered_map options_; + + // Parser helper + template + static void parseValue(const std::string& s, T* out); + + static void parseValue(const std::string& s, std::string* out) { *out = s; } + + static void parseValue(const std::string& s, int* out) + { + *out = std::stoi(s); + } + + static void parseValue(const std::string& s, float* out) + { + *out = std::atof(s.c_str()); + } + + static void parseValue(const std::string& s, bool* out) + { + *out = (s == "true" || s == "1"); + } + + static void parseValue(const std::string& s, double* out) + { + *out = std::stod(s); + } + + + static std::string toString(const std::string& val) { return val; } + static std::string toString(bool val) { return val ? 
"true" : "false"; } + static std::string toString(int val) { return std::to_string(val); } + static std::string toString(double val) { return std::to_string(val); } +}; diff --git a/tutorial/Chapter.3.Inference/4.UQ/ex7.cpp b/tutorial/Chapter.3.Inference/4.UQ/ex7.cpp new file mode 100644 index 00000000..06df7e10 --- /dev/null +++ b/tutorial/Chapter.3.Inference/4.UQ/ex7.cpp @@ -0,0 +1,124 @@ +#include + +#include + +#include "common.hpp" + +void InitMemBlob(double* ptr, int size) +{ + for (int i = 0; i < size; i++) { + ptr[i] = i; + } +} + +void ExampleCompute(double* in, double* out, int size) +{ + for (int i = 0; i < size; i++) { + out[i] = in[i]; + } +} + +void ExampleAMSTensorCompute(const ams::AMSTensor& in, ams::AMSTensor& out) +{ + ExampleCompute(in.data(), out.data(), in.shape()[0]); +} + +double ComputeSum(double* out, int size) +{ + double sum = 0; + for (int i = 0; i < size; i++) { + sum += out[i]; + } + return sum; +} + +int main(int argc, char* argv[]) +{ + using namespace ams; + int length; + std::string db_path; + std::string fn; + std::string model_path; + float threshold; + ExampleArgs args; + args.AddOption(&length, + "-l", + "--length", + "The size of the vectors to be initialized"); + args.AddOption(&db_path, + "-p", + "--db-path", + "The path of the file system database to store data to"); + args.AddOption(&fn, + "-f", + "--filename-prefix", + "The path of the file system database to store data to"); + args.AddOption(&model_path, + "-m", + "--model-path", + "The path to a torchscript model"); + args.AddOption(&threshold, "-t", "--threshold", "Model uncertainty"); + + + args.Parse(argc, argv); + if (!args.Good()) { + std::cout << "Wrong command line arguments\n"; + args.PrintOptions(); + return -1; + } + + ams::AMSInit(); + + double* input = new double[length]; + double* output = new double[length]; + + AMSConfigureFSDatabase(ams::AMSDBType::AMS_HDF5, db_path.c_str()); + InitMemBlob(input, length); + + AMSCAbstrModel model_descr = + 
AMSRegisterAbstractModel("compute", + ams::AMSUQPolicy::AMS_DELTAUQ_MEAN, + threshold, + model_path.c_str(), + fn.c_str()); + + + /* + * Create AMS tensors for memory blobs + */ + + // We represet both input/output as blobs of lenth 'samples', each sample as 1 element. + SmallVector input_tensors; + SmallVector inout_tensors; + SmallVector output_tensors; + + input_tensors.push_back(ams::AMSTensor::view( + input, {length, 1}, {1, 1}, ams::AMSResourceType::AMS_HOST)); + + output_tensors.push_back(ams::AMSTensor::view( + output, {length, 1}, {1, 1}, ams::AMSResourceType::AMS_HOST)); + + EOSLambda Computation = [&](const ams::SmallVector& ams_ins, + ams::SmallVector& ams_inouts, + ams::SmallVector& ams_outs) { + ExampleAMSTensorCompute(ams_ins[0], ams_outs[0]); + }; + + AMSExecutor wf = AMSCreateExecutor(model_descr, 0, 1); + std::cout << "Calling AMS Execute\n"; + AMSExecute(wf, Computation, input_tensors, inout_tensors, output_tensors); + std::cout << "Called AMS Execute\n"; + + auto sum = ComputeSum(output, length); + + std::cout << "[Example] Expected output is " << (length * (length - 1)) / 2 + << " and computed " << sum << "\n"; + + + delete[] input; + delete[] output; + ams::AMSFinalize(); + + + return 0; +} diff --git a/tutorial/Chapter.3.Inference/4.UQ/train.py b/tutorial/Chapter.3.Inference/4.UQ/train.py new file mode 100644 index 00000000..84f4aa05 --- /dev/null +++ b/tutorial/Chapter.3.Inference/4.UQ/train.py @@ -0,0 +1,96 @@ +import argparse +from pathlib import Path +import h5py +import torch +from torch import nn, optim +from torch.utils.data import TensorDataset, DataLoader + + +class SimpleModel(nn.Module): + # A simple model that contains a single linear layer + def __init__(self, in_features, out_features): + super(SimpleModel, self).__init__() + self.fc = nn.Linear(in_features, out_features, False) + self.initialize_weights() + + def initialize_weights(self): + # Check if in_features == out_features for identity initialization + if 
self.fc.weight.shape[0] == self.fc.weight.shape[1]: + nn.init.eye_(self.fc.weight) # Initialize with identity matrix + else: + raise ValueError("Identity initialization requires in_features == out_features") + + def forward(self, x): + return self.fc(x) + + +class UQModel(nn.Module): + def __init__(self, _base): + super(UQModel, self).__init__() + self.base = _base + + def forward(self, x): + uncertainty = x / 1000 + out = self.base(x) + # AMS Uncertainty requires that the uncertainty will have the same shape as + # the output. So we have a value for every partial component (dimension) of the output + return out, torch.abs(x-out) + + +def main(): + # Parse command-line arguments + parser = argparse.ArgumentParser(description="Train and store a model.") + parser.add_argument("--filename", "-fn", type=str, required=True, help="The file containing the data to train on") + parser.add_argument("--epochs", "-e", type=int, help="Number of epochs", default=10) + parser.add_argument("--model-file", "-m", help="Filename of model", default="model") + args = parser.parse_args() + if not Path(args.filename).exists(): + raise RuntimeError(f"Please provide an existing file, file {args.filename} does not exist") + device = "cpu" + + with h5py.File(args.filename, "r") as fd: + X, y = fd["input_data"][:], fd["output_data"][:] + dataset = TensorDataset( + torch.from_numpy(X), + torch.from_numpy(y), + ) + model = SimpleModel(X.shape[-1], y.shape[-1]) + optimizer = optim.Adam(model.parameters(), lr=1e-3) + criterion = nn.MSELoss() + loader = DataLoader(dataset, batch_size=64, shuffle=True) + + for epoch in range(1, args.epochs + 1): + model.train() + running_loss = 0.0 + + for xb, yb in loader: + xb, yb = xb.to(device), yb.to(device) + + optimizer.zero_grad() + preds = model(xb) + loss = criterion(preds, yb) + loss.backward() + optimizer.step() + + running_loss += loss.item() * xb.size(0) + + epoch_loss = running_loss / len(loader.dataset) + print(f"Epoch {epoch:2d}/{args.epochs} — Loss: 
{epoch_loss:.4f}") + + model = UQModel(model) # Wrap the trained base model with the UQ head (forward now also returns per-sample uncertainty) + prec = torch.float32 + example_input = torch.from_numpy(X) + + # Trace the model + scripted_model = torch.jit.trace(model, example_input) + + # Generate the file name + file_path = f"{args.model_file}.pt" + # Save the scripted model + scripted_model.save(file_path) + + print(f"Model saved to {file_path}") + + +if __name__ == "__main__": + main() diff --git a/tutorial/README.md b/tutorial/README.md new file mode 100644 index 00000000..11296e12 --- /dev/null +++ b/tutorial/README.md @@ -0,0 +1,60 @@ +# AMS Tutorial + +The tutorial provides a Docker container with all the necessary software to build applications against AMS and run them on a CPU system. + +## Download container + +To download the container please issue the following command on your terminal: + +``` +docker pull ghcr.io/llnl/ams-x86-tutorial:latest +``` + +The expected output should be similar to: + +``` +154ef9065217: Already exists +4e338da1ac28: Already exists +068a1074ab96: Already exists +dd6c369f83fa: Already exists +a112725ae27e: Already exists +c948d35905b3: Already exists +edcd2ec99d0b: Already exists + +Digest: sha256:24950b5ebb5ee90657fdd17d007921732b36e7d8e3820a6778779ed9357d2b9f +Status: Downloaded newer image for ghcr.io/llnl/ams-x86-tutorial:latest +``` + +## Run container interactively + +``` +docker run --rm -it \ + -v "$(pwd)":/workspace -w /workspace \ + ghcr.io/llnl/ams-x86-tutorial:latest \ + bash +``` + +The command should provide an interactive `bash` shell and the output should look like the following: + +``` +250428 15:31:30 mysqld_safe Logging to syslog. +250428 15:31:30 mysqld_safe Starting mariadbd daemon with databases from /var/lib/mysql +mysqld is alive +MariaDB is up! +Root password set to 'root' +``` + +## Contents + +1. [Basic API](./Chapter.1.DataFlow/) + 1. [Build](./Chapter.1.DataFlow/0.Build) + 2. [AMSTensor](./Chapter.1.DataFlow/1.AMSTensor) + 3. 
[DataFlow](./Chapter.1.DataFlow/2.DataFlow) + 4. [AbstractCompoundModel](./Chapter.1.DataFlow/3.CompoundModel/) +2. [AMS Data Collection](./Chapter.2.DB) + 1. [HDF5](./Chapter.2.DB/1.HDF5/) +3. [AMS Model Serving](./Chapter.3.Inference) + 1. [Data Collection](./Chapter.3.Inference/1.HDF5) + 2. [Train](./Chapter.3.Inference/2.Train) + 3. [Serve Inference](./Chapter.3.Inference/3.Serve) + 4. [UQ](./Chapter.3.Inference/4.UQ)