diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index b1ce3b48df..edf567f301 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -4,7 +4,7 @@ jobs: build_docker: strategy: matrix: - target: [gcc12_debug, gcc13, clang13, clang14_debug, clang15, rocm6, rocm6_desul, intel2024, intel2024_debug, intel2024_sycl] + target: [gcc12, gcc12_debug, gcc13, gcc13_desul, clang14_debug, clang14_style, clang14_desul, intel2024_0, intel2024_0_debug, intel2024_0_sycl, rocm6_4_3_desul] runs-on: ubuntu-latest steps: - run: | diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 21a3991cd3..22f6664407 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -78,7 +78,7 @@ stages: include: - local: '.gitlab/custom-jobs-and-variables.yml' - project: 'radiuss/radiuss-shared-ci' - ref: 'v2025.06.0' + ref: 'v2025.09.1' file: 'pipelines/${CI_MACHINE}.yml' - artifact: '${CI_MACHINE}-jobs.yml' job: 'generate-job-lists' @@ -107,7 +107,7 @@ include: file: 'id_tokens.yml' # [Optional] checks preliminary to running the actual CI test - project: 'radiuss/radiuss-shared-ci' - ref: 'v2025.06.0' + ref: 'v2025.09.1' file: 'utilities/preliminary-ignore-draft-pr.yml' # pipelines subscribed by the project - local: '.gitlab/subscribed-pipelines.yml' diff --git a/.gitlab/custom-jobs-and-variables.yml b/.gitlab/custom-jobs-and-variables.yml index 51531b2132..b578273ed2 100644 --- a/.gitlab/custom-jobs-and-variables.yml +++ b/.gitlab/custom-jobs-and-variables.yml @@ -22,6 +22,16 @@ variables: # Project specific deps for dane PROJECT_DANE_DEPS: +# Matrix +# Arguments for top level allocation + MATRIX_SHARED_ALLOC: "--exclusive --partition=pci --time=60 --nodes=1" +# Arguments for job level allocation + MATRIX_JOB_ALLOC: "--partition=pci --nodes=1" +# Project specific variants for matrix + PROJECT_MATRIX_VARIANTS: "~shared +cuda cuda_arch=90 +tests" +# Project specific deps for matrix + PROJECT_MATRIX_DEPS: + # Corona # Arguments for top level allocation CORONA_SHARED_ALLOC: "--exclusive --time-limit=60m --nodes=1 -o per-resource.count=2" @@ -44,7 +54,7 @@ variables: # Tuolumne # Arguments for top level allocation - TUOLUMNE_SHARED_ALLOC: "--exclusive --queue=pci --time-limit=60m --nodes=1 -o per-resource.count=2" + TUOLUMNE_SHARED_ALLOC: "--exclusive --queue=pci --time-limit=90m --nodes=1 -o per-resource.count=2" # Arguments for job level allocation TUOLUMNE_JOB_ALLOC: "--nodes=1 --begin-time=+5s" # Project specific variants for corona diff --git a/.gitlab/jobs/corona.yml b/.gitlab/jobs/corona.yml index 352ba7135a..64daeb9fd1 100644 --- a/.gitlab/jobs/corona.yml +++ b/.gitlab/jobs/corona.yml @@ -17,7 +17,7 @@ # project. We keep ${PROJECT__VARIANTS} and ${PROJECT__DEPS} # So that the comparison with the original job is easier. -# No overridden jobs so far. +## rocm 5.7.1 job is in the shared RSC set of jobs ############ # Extra jobs @@ -26,16 +26,20 @@ # ${PROJECT__DEPS} in the extra jobs. There is no reason not to fully # describe the spec here. 
+rocmcc_5_7_1_hip_openmp: + variables: + SPEC: " ~shared +rocm +openmp +tests +vectorization amdgpu_target=gfx906 %llvm-amdgpu@=5.7.1 ^hip@5.7.1" + extends: .job_on_corona + rocmcc_5_7_1_hip_desul_atomics: variables: - SPEC: " ~shared +rocm ~openmp +tests +desul amdgpu_target=gfx906 %rocmcc@=5.7.1 ^hip@5.7.1" + SPEC: " ~shared +rocm ~openmp +tests +desul amdgpu_target=gfx906 %llvm-amdgpu@=5.7.1 ^hip@5.7.1" extends: .job_on_corona -clang_20_0_0_sycl_gcc_10_3_1_rocmcc_6_0_2_hip: +clang_22_0_0_sycl_gcc_10_3_1_rocmcc_6_4_2: variables: - SPEC: " ~shared +sycl ~openmp +tests %clang@=20.0.0 cxxflags==\"-w -fsycl -fsycl-unnamed-lambda -fsycl-targets=amdgcn-amd-amdhsa -Xsycl-target-backend --offload-arch=gfx906\"" - MODULE_LIST: "rocm/6.0.2" - SYCL_PATH: "/usr/WS2/raja-dev/clang_sycl_730cd3a5275f_hip_gcc10.3.1_rocm6.0.2/" + SPEC: " ~shared +sycl ~openmp +tests cxxflags==\"-w -fsycl -fsycl-unnamed-lambda -fsycl-targets=amdgcn-amd-amdhsa -Xsycl-target-backend --offload-arch=gfx906\" %sycl-clang-22-gcc-10" + MODULE_LIST: "rocm/6.4.2" + SYCL_PATH: "/usr/workspace/raja-dev/clang_sycl_16b7bcb09915_hip_gcc10.3.1_rocm6.4.2" LD_LIBRARY_PATH: "${SYCL_PATH}/lib:${SYCL_PATH}/lib64:${LD_LIBRARY_PATH}" extends: .job_on_corona - diff --git a/.gitlab/jobs/dane.yml b/.gitlab/jobs/dane.yml index b5b45541e7..33dbfd46f8 100644 --- a/.gitlab/jobs/dane.yml +++ b/.gitlab/jobs/dane.yml @@ -17,24 +17,23 @@ # project. We keep ${PROJECT__VARIANTS} and ${PROJECT__DEPS} # when possible so that the comparison with the original job is easier. -# Identical to shared job, but use OpenMP tasks and no vectorization -clang_14_0_6: - variables: - SPEC: " ~shared +openmp +omptask ~vectorization +tests %clang@=14.0.6 ${PROJECT_POODLE_DEPS}" +# Shared job uses %clang-19-gcc-13, replaced with %clang-19 below +clang_19_1_3_gcc_13_3_1: extends: .job_on_dane - -# Identical to shared job, but use OpenMP tasks and no vectorization -gcc_10_3_1: variables: - SPEC: " ~shared +openmp +omptask ~vectorization +tests %gcc@=10.3.1 ${PROJECT_POODLE_DEPS}" + ON_DANE: "OFF" + +# Shared job uses %oneapi-2025-gcc-10, replaced with %oneapi@=2025.2 below +oneapi_2025_2_0_gcc_10_3_1: extends: .job_on_dane + variables: + ON_DANE: "OFF" + +## gcc 13.3.1 job is in the shared RSC set of jobs -# custom variant +# Similar to shared RSC job, but use OpenMP task, turn off vectorization, +# and lower optimization level to get correct results # https://github.com/LLNL/RAJA/pull/1712#issuecomment-2292006843 -oneapi_2023_2_1: - variables: - SPEC: "${PROJECT_POODLE_VARIANTS} +lowopttest cxxflags==-fp-model=precise %oneapi@=2023.2.1 ${PROJECT_POODLE_DEPS}" - extends: .job_on_dane ############ # Extra jobs @@ -43,18 +42,32 @@ oneapi_2023_2_1: # ${PROJECT__DEPS} in the extra jobs. There is no reason not to fully # describe the spec here. 
-clang_14_0_6_openmp_off: +clang_14_0_6: + variables: + SPEC: " ~shared +openmp +omptask ~vectorization +tests %llvm@=14.0.6" + extends: .job_on_dane + +clang_19_1_3: + variables: + SPEC: " ~shared +openmp +tests %llvm@=19.1.3" + extends: .job_on_dane + +gcc_10_3_1: + variables: + SPEC: " ~shared +openmp +omptask ~vectorization +tests %gcc@=10.3.1 ${PROJECT_DANE_DEPS}" + extends: .job_on_dane + +gcc_12_1_1: variables: - SPEC: " ~shared ~openmp +tests %clang@=14.0.6" + SPEC: " ~shared +openmp +tests %gcc@=12.1.1" extends: .job_on_dane -gcc_10_3_1_openmp_default: +clang_14_0_6_gcc_11_2_1_desul_atomics: variables: - SPEC: " ~shared +tests %gcc@=10.3.1" + SPEC: " ~shared +openmp +tests +desul %clang-14-gcc-11" extends: .job_on_dane -# OTHERS -clang_14_0_6_gcc_10_3_1_desul_atomics: +oneapi_2025_2_0: variables: - SPEC: " ~shared +openmp +tests +desul %clang@=14.0.6.gcc.10.3.1" + SPEC: "~shared +openmp +vectorization +tests cxxflags==-fp-model=precise %oneapi@=2025.2.0" extends: .job_on_dane diff --git a/.gitlab/jobs/matrix.yml b/.gitlab/jobs/matrix.yml new file mode 100644 index 0000000000..aab48b372c --- /dev/null +++ b/.gitlab/jobs/matrix.yml @@ -0,0 +1,39 @@ +############################################################################## +# Copyright (c) 2016-25, Lawrence Livermore National Security, LLC +# and RAJA project contributors. See the RAJA/LICENSE file for details. +# +# SPDX-License-Identifier: (BSD-3-Clause) +############################################################################## + +# Override reproducer section to define project specific variables. +.matrix_reproducer_vars: + script: + - !reference [.reproducer_vars, script] + +######################## +# Overridden shared jobs +######################## +# We duplicate the shared jobs description and add necessary changes for this +# project. We keep ${PROJECT__VARIANTS} and ${PROJECT__DEPS} +# when possible so that the comparison with the original job is easier. + +# Shared job uses toolchain %clang-18-gcc-13, replaced with %llvm@=18.1.8 below +clang_18_1_1_gcc_13_cuda_12_6_0: + extends: .job_on_matrix + variables: + ON_MATRIX: "OFF" + +############ +# Extra jobs +############ +# We do not recommend using ${PROJECT__VARIANTS} and +# ${PROJECT__DEPS} in the extra jobs. There is no reason not to fully +# describe the spec here. + +# clang_18_1_8_cuda_12_6_0 +clang_18_1_8_cuda_12_6_0: + variables: + SPEC: " ~shared +cuda cuda_arch=90 %llvm@=18.1.8 ^cuda@12.6.0+allow-unsupported-compilers" + extends: .job_on_matrix + allow_failure: true + diff --git a/.gitlab/jobs/tioga.yml b/.gitlab/jobs/tioga.yml index bc4abc8755..68fdc4a8f6 100644 --- a/.gitlab/jobs/tioga.yml +++ b/.gitlab/jobs/tioga.yml @@ -17,13 +17,9 @@ # project. We keep ${PROJECT__VARIANTS} and ${PROJECT__DEPS} # So that the comparison with the original job is easier. -# Compiler error preventing a test to succeed. -# https://github.com/LLNL/RAJA/pull/1712#issuecomment-2316335119 -cce_18_0_1: - variables: - SPEC: "${PROJECT_TIOGA_VARIANTS} %cce@=18.0.1 ${PROJECT_TIOGA_DEPS}" - extends: .job_on_tioga - allow_failure: true +## NOTE: rocm 6.4.1 job is in the shared RSC set of jobs + +## NOTE: cce 20.0.0 job is in the shared RSC set of jobs ############ # Extra jobs @@ -32,17 +28,12 @@ cce_18_0_1: # ${PROJECT__DEPS} in the extra jobs. There is no reason not to fully # describe the spec here. 
-cce_19_0_0: - variables: - SPEC: "${PROJECT_TIOGA_VARIANTS} %cce@=19.0.0 ${PROJECT_TIOGA_DEPS}" - extends: .job_on_tioga - rocmcc_6_4_1_hip_desul_atomics: variables: - SPEC: "~shared +rocm ~openmp +desul +tests amdgpu_target=gfx90a %rocmcc@=6.4.1 ^hip@6.4.1" + SPEC: "~shared +rocm ~openmp +desul +tests amdgpu_target=gfx90a %llvm-amdgpu@=6.4.1 ^hip@6.4.1" extends: .job_on_tioga rocmcc_6_4_1_hip_openmp: variables: - SPEC: "~shared +rocm +openmp +omptask +tests amdgpu_target=gfx90a %rocmcc@=6.4.1 ^hip@6.4.1" + SPEC: "~shared +rocm +openmp +omptask +tests amdgpu_target=gfx90a %llvm-amdgpu@=6.4.1 ^hip@6.4.1" extends: .job_on_tioga diff --git a/.gitlab/jobs/tuolumne.yml b/.gitlab/jobs/tuolumne.yml index 7ea3f58752..59c1c462b7 100644 --- a/.gitlab/jobs/tuolumne.yml +++ b/.gitlab/jobs/tuolumne.yml @@ -17,13 +17,9 @@ # project. We keep ${PROJECT__VARIANTS} and ${PROJECT__DEPS} # So that the comparison with the original job is easier. -# Compiler error preventing a test to succeed. -# https://github.com/LLNL/RAJA/pull/1712#issuecomment-2316335119 -cce_18_0_1: - variables: - SPEC: "${PROJECT_TUOLUMNE_VARIANTS} %cce@=18.0.1 ${PROJECT_TUOLUMNE_DEPS}" - extends: .job_on_tuolumne - allow_failure: true +## cce 20.0.0 job is in the shared RSC set of jobs + +## rocm 6.4.1 job is in the shared RSC set of jobs ############ # Extra jobs @@ -32,17 +28,12 @@ cce_18_0_1: # ${PROJECT__DEPS} in the extra jobs. There is no reason not to fully # describe the spec here. -cce_19_0_0: - variables: - SPEC: "${PROJECT_TUOLUMNE_VARIANTS} %cce@=19.0.0 ${PROJECT_TUOLUMNE_DEPS}" - extends: .job_on_tuolumne - rocmcc_6_4_1_hip_desul_atomics: variables: - SPEC: "~shared +rocm ~openmp +desul +tests amdgpu_target=gfx942 %rocmcc@=6.4.1 ^hip@6.4.1" + SPEC: "~shared +rocm ~openmp +desul +tests amdgpu_target=gfx942 %llvm-amdgpu@=6.4.1 ^hip@6.4.1" extends: .job_on_tuolumne rocmcc_6_4_1_hip_openmp: variables: - SPEC: "~shared +rocm +openmp +omptask +tests amdgpu_target=gfx942 %rocmcc@=6.4.1 ^hip@6.4.1" + SPEC: "~shared +rocm +openmp +omptask +tests amdgpu_target=gfx942 %llvm-amdgpu@=6.4.1 ^hip@6.4.1" extends: .job_on_tuolumne diff --git a/.gitlab/subscribed-pipelines.yml b/.gitlab/subscribed-pipelines.yml index 95d8a65348..1502195750 100644 --- a/.gitlab/subscribed-pipelines.yml +++ b/.gitlab/subscribed-pipelines.yml @@ -38,12 +38,14 @@ generate-job-lists: LOCAL_JOBS_PATH: ".gitlab/jobs" script: - cat ${RADIUSS_JOBS_PATH}/dane.yml ${LOCAL_JOBS_PATH}/dane.yml > dane-jobs.yml + - cat ${RADIUSS_JOBS_PATH}/matrix.yml ${LOCAL_JOBS_PATH}/matrix.yml > matrix-jobs.yml - cat ${RADIUSS_JOBS_PATH}/corona.yml ${LOCAL_JOBS_PATH}/corona.yml > corona-jobs.yml - cat ${RADIUSS_JOBS_PATH}/tioga.yml ${LOCAL_JOBS_PATH}/tioga.yml > tioga-jobs.yml - cat ${RADIUSS_JOBS_PATH}/tuolumne.yml ${LOCAL_JOBS_PATH}/tuolumne.yml > tuolumne-jobs.yml artifacts: paths: - dane-jobs.yml + - matrix-jobs.yml - corona-jobs.yml - tioga-jobs.yml - tuolumne-jobs.yml @@ -53,45 +55,101 @@ dane-up-check: variables: CI_MACHINE: "dane" extends: [.machine-check] + rules: + # Runs except if we explicitly deactivate dane by variable. + - if: '$ON_DANE == "OFF"' + when: never + - when: on_success dane-build-and-test: variables: CI_MACHINE: "dane" needs: [dane-up-check, generate-job-lists] extends: [.build-and-test] + rules: + # Runs except if we explicitly deactivate dane by variable. 
+ - if: '$ON_DANE == "OFF"' + when: never + - when: on_success + +# MATRIX +matrix-up-check: + variables: + CI_MACHINE: "matrix" + extends: [.machine-check] + rules: + # Runs except if we explicitly deactivate matrix by variable. + - if: '$ON_MATRIX == "OFF"' + when: never + - when: on_success + +matrix-build-and-test: + variables: + CI_MACHINE: "matrix" + needs: [matrix-up-check, generate-job-lists] + extends: [.build-and-test] + rules: + # Runs except if we explicitly deactivate matrix by variable. + - if: '$ON_MATRIX == "OFF"' + when: never + - when: on_success # CORONA corona-up-check: variables: CI_MACHINE: "corona" extends: [.machine-check] + rules: + - if: '$ON_CORONA == "OFF"' + when: never + - when: on_success corona-build-and-test: variables: CI_MACHINE: "corona" needs: [corona-up-check, generate-job-lists] extends: [.build-and-test] + rules: + - if: '$ON_CORONA == "OFF"' + when: never + - when: on_success # TIOGA tioga-up-check: variables: CI_MACHINE: "tioga" extends: [.machine-check] + rules: + - if: '$ON_TIOGA == "OFF"' + when: never + - when: on_success tioga-build-and-test: variables: CI_MACHINE: "tioga" needs: [tioga-up-check, generate-job-lists] extends: [.build-and-test] + rules: + - if: '$ON_TIOGA == "OFF"' + when: never + - when: on_success # TUOLUMNE tuolumne-up-check: variables: CI_MACHINE: "tuolumne" extends: [.machine-check] + rules: + - if: '$ON_TUOLUMNE == "OFF"' + when: never + - when: on_success tuolumne-build-and-test: variables: CI_MACHINE: "tuolumne" needs: [tuolumne-up-check, generate-job-lists] extends: [.build-and-test] + rules: + - if: '$ON_TUOLUMNE == "OFF"' + when: never + - when: on_success diff --git a/.uberenv_config.json b/.uberenv_config.json index 6f64e983a6..b16440eb3f 100644 --- a/.uberenv_config.json +++ b/.uberenv_config.json @@ -4,8 +4,8 @@ "package_final_phase" : "initconfig", "package_source_dir" : "../..", "spack_url": "https://github.com/spack/spack.git", -"spack_commit": "280017a9ba3f6a969743deca0eebc96e7c0e5cfd", +"spack_branch": "v1.0.2", "spack_configs_path": "scripts/radiuss-spack-configs", -"spack_packages_path": "scripts/radiuss-spack-configs/packages", +"spack_packages_path": "scripts/radiuss-spack-configs/spack_repo/llnl_radiuss/packages", "spack_setup_clingo": false } diff --git a/CMakeLists.txt b/CMakeLists.txt index 6a2d862ab0..9438c31cdd 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -15,8 +15,8 @@ include(CMakeDependentOption) # Set version number set(RAJA_VERSION_MAJOR 2025) -set(RAJA_VERSION_MINOR 09) -set(RAJA_VERSION_PATCHLEVEL 1) +set(RAJA_VERSION_MINOR 12) +set(RAJA_VERSION_PATCHLEVEL 0) if (RAJA_LOADED AND (NOT RAJA_LOADED STREQUAL "${RAJA_VERSION_MAJOR}.${RAJA_VERSION_MINOR}.${RAJA_VERSION_PATCHLEVEL}")) message(FATAL_ERROR "You are mixing RAJA versions. Loaded is ${RAJA_LOADED}, expected ${RAJA_VERSION_MAJOR}.${RAJA_VERSION_MINOR}.${RAJA_VERSION_PATCHLEVEL}") diff --git a/Dockerfile b/Dockerfile index d20809d639..b605df6fda 100644 --- a/Dockerfile +++ b/Dockerfile @@ -6,68 +6,53 @@ ############################################################################### ## -## Note that we build with 'make -j 16' on GitHub Actions and -## with 'make -j 6' on Azure. This is reflected in the 'make' commands below. -## This seems to work best for throughput. +## Note that we build with 'make -j 16' for newer targets and 'make -j 6' +## for older targets on GitHub Actions. This is reflected in the 'make' +## commands below. This seems to work best for throughput. 
## -FROM ghcr.io/llnl/radiuss:gcc-11-ubuntu-22.04 AS gcc11 +FROM ghcr.io/llnl/radiuss:gcc-12-ubuntu-24.04 AS gcc12 ENV GTEST_COLOR=1 COPY . /home/raja/workspace WORKDIR /home/raja/workspace/build -RUN cmake -DCMAKE_CXX_COMPILER=g++ -DCMAKE_BUILD_TYPE=Release -DRAJA_ENABLE_WARNINGS=On -DRAJA_ENABLE_WARNINGS_AS_ERRORS=On -DENABLE_OPENMP=On .. && \ - make -j 6 &&\ +RUN cmake -DCMAKE_CXX_COMPILER=g++ -DCMAKE_BUILD_TYPE=Release -DRAJA_ENABLE_WARNINGS=On -DRAJA_ENABLE_WARNINGS_AS_ERRORS=On -DENABLE_OPENMP=On -DBLT_CXX_STD=c++17 .. && \ + make -j 16 &&\ ctest -T test --output-on-failure && \ make clean -FROM ghcr.io/llnl/radiuss:gcc-12-ubuntu-22.04 AS gcc12 +FROM ghcr.io/llnl/radiuss:gcc-12-ubuntu-24.04 AS gcc12_debug ENV GTEST_COLOR=1 COPY . /home/raja/workspace WORKDIR /home/raja/workspace/build -RUN cmake -DCMAKE_CXX_COMPILER=g++ -DCMAKE_BUILD_TYPE=Release -DRAJA_ENABLE_WARNINGS=On -DRAJA_ENABLE_WARNINGS_AS_ERRORS=On -DENABLE_OPENMP=On .. && \ - make -j 6 &&\ +RUN cmake -DCMAKE_CXX_COMPILER=g++ -DCMAKE_BUILD_TYPE=Debug -DRAJA_ENABLE_WARNINGS=On -DRAJA_ENABLE_WARNINGS_AS_ERRORS=On -DENABLE_OPENMP=On -DBLT_CXX_STD=c++17 .. && \ + make -j 16 &&\ ctest -T test --output-on-failure && \ make clean -FROM ghcr.io/llnl/radiuss:gcc-12-ubuntu-22.04 AS gcc12_debug +FROM ghcr.io/llnl/radiuss:gcc-13-ubuntu-24.04 AS gcc13 ENV GTEST_COLOR=1 COPY . /home/raja/workspace WORKDIR /home/raja/workspace/build -RUN cmake -DCMAKE_CXX_COMPILER=g++ -DCMAKE_BUILD_TYPE=Debug -DRAJA_ENABLE_WARNINGS=On -DRAJA_ENABLE_WARNINGS_AS_ERRORS=On -DENABLE_OPENMP=On .. && \ +RUN cmake -DCMAKE_CXX_COMPILER=g++ -DCMAKE_BUILD_TYPE=Release -DRAJA_ENABLE_WARNINGS=On -DRAJA_ENABLE_WARNINGS_AS_ERRORS=On -DENABLE_OPENMP=On -DBLT_CXX_STD=c++17 .. && \ make -j 16 &&\ - ctest -T test --output-on-failure - -FROM ghcr.io/llnl/radiuss:gcc-12-ubuntu-22.04 AS gcc12_desul -ENV GTEST_COLOR=1 -COPY . /home/raja/workspace -WORKDIR /home/raja/workspace/build -RUN cmake -DCMAKE_CXX_COMPILER=g++ -DCMAKE_BUILD_TYPE=Release -DRAJA_ENABLE_WARNINGS=On -DRAJA_ENABLE_WARNINGS_AS_ERRORS=On -DENABLE_OPENMP=On -DRAJA_ENABLE_DESUL_ATOMICS=On .. && \ - make -j 6 &&\ ctest -T test --output-on-failure && \ make clean -FROM ghcr.io/llnl/radiuss:gcc-13-ubuntu-22.04 AS gcc13 +FROM ghcr.io/llnl/radiuss:gcc-13-ubuntu-24.04 AS gcc13_desul ENV GTEST_COLOR=1 COPY . /home/raja/workspace WORKDIR /home/raja/workspace/build -RUN cmake -DCMAKE_CXX_COMPILER=g++ -DCMAKE_BUILD_TYPE=Release -DRAJA_ENABLE_WARNINGS=On -DRAJA_ENABLE_WARNINGS_AS_ERRORS=On -DENABLE_OPENMP=On .. && \ +RUN cmake -DCMAKE_CXX_COMPILER=g++ -DCMAKE_BUILD_TYPE=Release -DRAJA_ENABLE_WARNINGS=On -DRAJA_ENABLE_WARNINGS_AS_ERRORS=On -DENABLE_OPENMP=On -DRAJA_ENABLE_DESUL_ATOMICS=On -DBLT_CXX_STD=c++17 .. && \ make -j 16 &&\ - ctest -T test --output-on-failure - -FROM ghcr.io/llnl/radiuss:clang-13-ubuntu-22.04 AS clang13 -ENV GTEST_COLOR=1 -COPY . /home/raja/workspace -WORKDIR /home/raja/workspace/build -RUN cmake -DCMAKE_CXX_COMPILER=clang++ -DCMAKE_BUILD_TYPE=Release -DENABLE_OPENMP=On .. && \ - make -j 16 &&\ - ctest -T test --output-on-failure + ctest -T test --output-on-failure && \ + make clean FROM ghcr.io/llnl/radiuss:clang-14-ubuntu-22.04 AS clang14_debug ENV GTEST_COLOR=1 COPY . /home/raja/workspace WORKDIR /home/raja/workspace/build -RUN cmake -DCMAKE_CXX_COMPILER=clang++ -DCMAKE_BUILD_TYPE=Debug -DENABLE_OPENMP=On .. && \ - make -j 6 &&\ +RUN cmake -DCMAKE_CXX_COMPILER=clang++ -DCMAKE_BUILD_TYPE=Debug -DENABLE_OPENMP=On -DBLT_CXX_STD=c++17 .. 
&& \ + make -j 16 &&\ ctest -T test --output-on-failure && \ make clean @@ -80,72 +65,49 @@ RUN clang-format --version && \ cmake -DENABLE_CLANGFORMAT=ON ../ && \ make check -FROM ghcr.io/llnl/radiuss:clang-15-ubuntu-22.04 AS clang15 +FROM ghcr.io/llnl/radiuss:clang-14-ubuntu-22.04 AS clang14_desul ENV GTEST_COLOR=1 COPY . /home/raja/workspace WORKDIR /home/raja/workspace/build -RUN cmake -DCMAKE_CXX_COMPILER=clang++ -DCMAKE_BUILD_TYPE=Release -DRAJA_ENABLE_RUNTIME_PLUGINS=ON -DENABLE_OPENMP=On .. && \ +RUN cmake -DCMAKE_CXX_COMPILER=clang++ -DCMAKE_BUILD_TYPE=Release -DENABLE_OPENMP=On -DRAJA_ENABLE_DESUL_ATOMICS=On -DBLT_CXX_STD=c++17 .. && \ make -j 16 &&\ - ctest -T test --output-on-failure - -FROM ghcr.io/llnl/radiuss:clang-15-ubuntu-22.04 AS clang15_desul -ENV GTEST_COLOR=1 -COPY . /home/raja/workspace -WORKDIR /home/raja/workspace/build -RUN cmake -DCMAKE_CXX_COMPILER=clang++ -DCMAKE_BUILD_TYPE=Release -DENABLE_OPENMP=On -DRAJA_ENABLE_DESUL_ATOMICS=On .. && \ - make -j 6 &&\ ctest -T test --output-on-failure && \ make clean -## Test run failure in RAJA launch tests with new reducer interface. -## Need to figure out best way to handle that. -FROM ghcr.io/llnl/radiuss:ubuntu-20.04-intel-2024.0 AS intel2024 +## Don't run tests due to failure in RAJA launch tests with new reducer interface. +FROM ghcr.io/llnl/radiuss:intel-2024.0-ubuntu-20.04 AS intel2024_0 ENV GTEST_COLOR=1 COPY . /home/raja/workspace WORKDIR /home/raja/workspace/build RUN /bin/bash -c "source /opt/intel/oneapi/setvars.sh 2>&1 > /dev/null && \ - cmake -DCMAKE_CXX_COMPILER=icpx -DCMAKE_BUILD_TYPE=Release -DENABLE_OPENMP=On .. && \ - make -j 16" -## make -j 16 &&\ -## ctest -T test --output-on-failure" + cmake -DCMAKE_CXX_COMPILER=icpx -DCMAKE_BUILD_TYPE=Release -DENABLE_OPENMP=On -DBLT_CXX_STD=c++17 .. && \ + make -j 16 &&\ + make clean" -## Test run failure in RAJA launch tests with new reducer interface. -## Need to figure out best way to handle that. -FROM ghcr.io/llnl/radiuss:ubuntu-20.04-intel-2024.0 AS intel2024_debug +## Don't run tests due to failure in RAJA launch tests with new reducer interface. +FROM ghcr.io/llnl/radiuss:intel-2024.0-ubuntu-20.04 AS intel2024_0_debug ENV GTEST_COLOR=1 COPY . /home/raja/workspace WORKDIR /home/raja/workspace/build RUN /bin/bash -c "source /opt/intel/oneapi/setvars.sh 2>&1 > /dev/null && \ - cmake -DCMAKE_CXX_COMPILER=icpx -DCMAKE_BUILD_TYPE=Debug -DENABLE_OPENMP=On .. && \ - make -j 16" -## make -j 16 &&\ -## ctest -T test --output-on-failure" - -## -## Need to find a viable cuda image to test... -## + cmake -DCMAKE_CXX_COMPILER=icpx -DCMAKE_BUILD_TYPE=Debug -DENABLE_OPENMP=On -DBLT_CXX_STD=c++17 .. && \ + make -j 16 &&\ + make clean" -FROM ghcr.io/llnl/radiuss:hip-6.0.2-ubuntu-20.04 AS rocm6 +FROM ghcr.io/llnl/radiuss:intel-2024.0-ubuntu-20.04 AS intel2024_0_sycl ENV GTEST_COLOR=1 -ENV HCC_AMDGPU_TARGET=gfx900 COPY . /home/raja/workspace WORKDIR /home/raja/workspace/build -RUN cmake -DCMAKE_CXX_COMPILER=/opt/rocm-6.0.2/bin/amdclang++ -DROCM_PATH=/opt/rocm-6.0.2 -DCMAKE_BUILD_TYPE=Release -DENABLE_HIP=On -DRAJA_ENABLE_WARNINGS_AS_ERRORS=Off .. && \ - make -j 16 +RUN /bin/bash -c "source /opt/intel/oneapi/setvars.sh 2>&1 > /dev/null && \ + cmake -DCMAKE_CXX_COMPILER=dpcpp -DCMAKE_BUILD_TYPE=Release -DENABLE_OPENMP=Off -DRAJA_ENABLE_SYCL=On -DRAJA_ENABLE_DESUL_ATOMICS=On -DBLT_CXX_STD=c++17 .. 
&& \ + make -j 16 &&\ + make clean" -FROM ghcr.io/llnl/radiuss:hip-6.0.2-ubuntu-20.04 AS rocm6_desul +FROM ghcr.io/llnl/radiuss:hip-6.4.3-ubuntu-24.04 AS rocm6_4_3_desul ENV GTEST_COLOR=1 ENV HCC_AMDGPU_TARGET=gfx900 COPY . /home/raja/workspace WORKDIR /home/raja/workspace/build -RUN cmake -DCMAKE_CXX_COMPILER=/opt/rocm-6.0.2/bin/amdclang++ -DROCM_PATH=/opt/rocm-6.0.2 -DCMAKE_BUILD_TYPE=Release -DENABLE_HIP=On -DRAJA_ENABLE_DESUL_ATOMICS=On -DRAJA_ENABLE_WARNINGS_AS_ERRORS=Off .. && \ - make -j 16 - -FROM ghcr.io/llnl/radiuss:intel-2024.0-ubuntu-20.04 AS intel2024_sycl -ENV GTEST_COLOR=1 -COPY . /home/raja/workspace -WORKDIR /home/raja/workspace/build -RUN /bin/bash -c "source /opt/intel/oneapi/setvars.sh 2>&1 > /dev/null && \ - cmake -DCMAKE_CXX_COMPILER=dpcpp -DCMAKE_BUILD_TYPE=Release -DENABLE_OPENMP=Off -DRAJA_ENABLE_SYCL=On -DBLT_CXX_STD=c++17 -DRAJA_ENABLE_DESUL_ATOMICS=On .. && \ - make -j 4" - +RUN cmake -DCMAKE_CXX_COMPILER=/opt/rocm-6.4.3/bin/amdclang++ -DROCM_PATH=/opt/rocm-6.4.3 -DCMAKE_BUILD_TYPE=Release -DENABLE_HIP=On -DRAJA_ENABLE_DESUL_ATOMICS=On -DRAJA_ENABLE_WARNINGS_AS_ERRORS=Off -DBLT_CXX_STD=c++17 .. && \ + make -j 16 &&\ + make clean diff --git a/README.md b/README.md index d788dd65d4..1e2097d408 100644 --- a/README.md +++ b/README.md @@ -9,7 +9,6 @@ # RAJA -[![Azure Pipeline Build Status](https://dev.azure.com/LLNL-RAJA/RAJA/_apis/build/status/LLNL.RAJA?branchName=develop)](https://dev.azure.com/LLNL-RAJA/RAJA/_build/latest?definitionId=1&branchName=develop) [![Documentation Status](https://readthedocs.org/projects/raja/badge/?version=develop)](https://raja.readthedocs.io/en/develop/?badge=develop) [![Coverage](https://img.shields.io/codecov/c/github/LLNL/RAJA/develop.svg)](https://codecov.io/gh/LLNL/RAJA) [![Join the chat at https://gitter.im/llnl/raja](https://badges.gitter.im/llnl/raja.svg)](https://gitter.im/llnl/raja?utm_source=badge&utm_medium=badge&utm_campaign=pr-badge&utm_content=badge) diff --git a/RELEASE b/RELEASE index 74b3e9c0b8..ff39101772 100644 --- a/RELEASE +++ b/RELEASE @@ -1,6 +1,6 @@ ******************************************************************************* -RAJA: ................................, version 2025.09.0 +RAJA: ................................, version 2025.12.0 Copyright (c) 2016-25, Lawrence Livermore National Security, LLC. Produced at the Lawrence Livermore National Laboratory. diff --git a/RELEASE_NOTES.md b/RELEASE_NOTES.md index 950b701469..c72ac952b0 100644 --- a/RELEASE_NOTES.md +++ b/RELEASE_NOTES.md @@ -20,6 +20,31 @@ Notable changes include: * Bug fixes/improvements: +Version 2025.12.0 -- Release date 2025-12-19 +============================================ + +This release contains mostly improvements to code robustness and testing, +including evolving internal code implementations to use C++17. + +Notable changes include: + + * New features / API changes: + + * Build changes/improvements: + * Update Camp submodule to v2025.12.0 release. + * Improve CMake support for configuring with Caliper and fix issue reported + by a user. + * Bump minimum CMake version required to 3.24. + + * Bug fixes/improvements: + * Fix compilation failure when a downstream library or application is + built without OpenMP enabled when RAJA was built with OpenMP enabled. + There may still be some corner cases that violate C++ ODR that we have not + resolved and that are not being exposed by users. + * Various internal code cleanups, simplifications, and improvements using + C++17 features, with an eye toward C++20.
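The OpenMP fix noted above is implemented later in this diff via a new ``RAJA_OPENMP_ACTIVE`` macro in ``include/RAJA/config.hpp.in``, which is defined only when RAJA was configured with OpenMP and ``_OPENMP`` is defined in the current compilation. Below is a minimal sketch of the resulting guard pattern; the downstream header and the ``max_threads`` helper are hypothetical and shown for illustration only.

```cpp
// Minimal sketch (hypothetical downstream code, not RAJA source).
// RAJA_OPENMP_ACTIVE is defined by RAJA/config.hpp only when RAJA was
// configured with OpenMP *and* _OPENMP is defined for this translation
// unit, so a downstream build without OpenMP no longer hits a hard #error.
#include "RAJA/config.hpp"

#if defined(RAJA_OPENMP_ACTIVE)
#include <omp.h>
#endif

inline int max_threads()
{
#if defined(RAJA_OPENMP_ACTIVE)
  return omp_get_max_threads();  // OpenMP available in this build
#else
  return 1;                      // sequential fallback
#endif
}
```

This mirrors the switch from ``RAJA_ENABLE_OPENMP``/``RAJA_ENABLE_SYCL`` to the ``*_ACTIVE`` macros in ``launch.hpp`` and ``launch_core.hpp`` later in the diff.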
+ + Version 2025.09.1 -- Release date 2025-10-01 ============================================ diff --git a/azure-pipelines.yml b/azure-pipelines.yml deleted file mode 100644 index 100566af90..0000000000 --- a/azure-pipelines.yml +++ /dev/null @@ -1,92 +0,0 @@ -jobs: -- job: Windows - strategy: - matrix: - shared: - SHARED_ARGS: '-DBUILD_SHARED_LIBS=On' -########################### -## Windows static build is run on GitHub Actions -## static: -## SHARED_ARGS: '-DBUILD_SHARED_LIBS=Off' - pool: - vmImage: 'windows-2022' - variables: - CMAKE_EXTRA_FLAGS: '-DRAJA_ENABLE_WARNINGS_AS_ERRORS=Off -DBLT_CXX_STD="" -DCMAKE_CXX_STANDARD=17' - steps: - - checkout: self - clean: boolean - submodules: recursive - - task: CMake@1 - inputs: - workingDir: 'build' - cmakeArgs: '$(CMAKE_EXTRA_FLAGS) $(SHARED_ARGS) ../' - - task: CMake@1 - inputs: - workingDir: 'build' - cmakeArgs: '--build . --config Release --verbose -j 4' - - task: CmdLine@2 - inputs: - script: 'ctest.exe -T test -C Release' - workingDirectory: 'build' - - task: PublishTestResults@2 - inputs: - testResultsFormat: 'cTest' - testResultsFiles: '**/Test.xml' -- job: Docker - timeoutInMinutes: 360 - strategy: - matrix: - gcc11: - docker_target: gcc11 - gcc12: - docker_target: gcc12 - gcc12_desul: - docker_target: gcc12_desul - clang15_desul: - docker_target: clang15_desul - clang14_style: - docker_target: clang14_style - pool: - vmImage: 'ubuntu-latest' - variables: - DOCKER_BUILDKIT: '1' - CMAKE_EXTRA_FLAGS: '-DENABLE_DEVELOPER_BENCHMARKS=On -DENABLE_DEVELOPER_DEFAULTS=On -DCMAKE_CXX_STANDARD=17' - steps: - - checkout: self - clean: boolean - submodules: recursive - - task: Docker@1 - inputs: - command: build - dockerFile: 'Dockerfile' - arguments: '--target $(docker_target)' - - task: PublishTestResults@2 - inputs: - testResultsFormat: 'cTest' - testResultsFiles: '**/Test.xml' -- job: Mac - pool: - vmImage: 'macOS-latest' - variables: - CMAKE_EXTRA_FLAGS: '-DENABLE_OPENMP=Off -DCMAKE_CXX_STANDARD=17' - steps: - - checkout: self - clean: boolean - submodules: recursive - - task: CMake@1 - inputs: - workingDir: 'build' - cmakeArgs: '$(CMAKE_EXTRA_FLAGS) ../' - - script: | - cd build - make -j 4 - displayName: 'OSX Build' - condition: eq( variables['Agent.OS'], 'Darwin') - - script: | - cd build - ctest -T test --output-on-failure - displayName: 'OSX Test' - - task: PublishTestResults@2 - inputs: - testResultsFormat: 'cTest' - testResultsFiles: '**/Test.xml' diff --git a/docs/conf.py b/docs/conf.py index 3e98107236..6df0fef485 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -87,9 +87,9 @@ # built documents. # # The short X.Y version. -version = u'2025.09' +version = u'2025.12' # The full version, including alpha/beta/rc tags. -release = u'2025.09.1' +release = u'2025.12.0' # The language for content autogenerated by Sphinx. Refer to documentation # for a list of supported languages. diff --git a/docs/sphinx/dev_guide/build_configurations.rst b/docs/sphinx/dev_guide/build_configurations.rst index f68b4bbe8b..60167809a5 100644 --- a/docs/sphinx/dev_guide/build_configurations.rst +++ b/docs/sphinx/dev_guide/build_configurations.rst @@ -233,13 +233,13 @@ Reproducing Docker Builds Locally ================================== RAJA uses Docker container images that it shares with other LLNL GitHub projects -for Azure and GitHub Actions CI testing (see :ref:`azure_ci-label` for more -information). 
We use Azure Pipelines and GitHub Actions for Linux, Windows, -and MacOS builds for build environments and compiler versions that -are not available on LLNL LC machines. +for GitHub Actions CI testing (see :ref:`github_actions_ci-label` for more +information). We use GitHub Actions for Linux, Windows, and MacOS builds for +build environments and compiler versions that are not available on LLNL LC +machines. -You can reproduce Azure and GitHub Actions builds locally for testing using -the following steps if you have access to Docker. +You can reproduce GitHub Actions builds locally for testing using the +following steps if you have access to Docker. #. Run the command to build a local Docker image:: diff --git a/docs/sphinx/dev_guide/ci.rst b/docs/sphinx/dev_guide/ci.rst index 1b0fff75bc..883ab8d016 100644 --- a/docs/sphinx/dev_guide/ci.rst +++ b/docs/sphinx/dev_guide/ci.rst @@ -17,23 +17,23 @@ Continuous Integration (CI) Testing viewed by clicking the appropriate link in the **checks** section of a GitHub pull request. -The RAJA project uses three CI tools to run tests: - - * **Azure Pipelines** and **GitHub Actions** runs builds and tests for Linux, - Windows, and MacOS environments using compilers in container images - maintained in the `RADIUSS Docker Project `_. - While we do some GPU builds on Azure and GitHub Actions for CUDA, HIP, and - SYCL, RAJA tests are only run for CPU-only builds. The current set of - builds run on Azure and GitHub Actions can be seen by looking at the - ``RAJA/azure-pipelines.yml`` and ``RAJA/.github/workflows/build.yml`` files, - respectively. The ``RAJA/Dockerfile`` file contains the CMake options used - to generate the build environment for each of the builds. +The RAJA project uses two CI tools to run tests: + + * **GitHub Actions** runs builds and tests for Linux, Windows, and MacOS + environments using compilers in container images maintained in the + `RADIUSS Docker Project `_. + While we do some GPU builds on GitHub Actions for CUDA, HIP, and SYCL, + RAJA tests are only run for CPU-only builds. The current set of builds + run on GitHub Actions can be seen by looking at the + ``RAJA/.github/workflows/build.yml`` file. The ``RAJA/Dockerfile`` file + contains the CMake options used to generate the build environment for + each of the builds. * **GitLab** instance in the Collaboration Zone (CZ) of the Livermore Computing (LC) Center runs builds and tests on LC platforms using software stacks (compilers, etc.) important to many RAJA user applications. - GitLab build configurations are more complex than Azure; they will be - described in detail in :ref:`gitlab_ci-label`. + GitLab build configurations are more complex than GitHub Actions; they + will be described in detail in :ref:`gitlab_ci-label`. These tools integrate with the RAJA GitHub project and automatically run RAJA builds and tests when a PR is created and when changes are pushed to a PR @@ -303,79 +303,22 @@ Also, recall that to generate a host-config file, Spack uses packages and specs in the `RADIUSS Spack Configs project `_ (a RAJA submodule), plus RAJA-specific specs defined in files in the `RAJA/.gitlab/jobs `_ directory, as described earlier. -.. _azure_ci-label: +.. _github_actions_ci-label: -====================================== -Azure Pipelines and GitHub Actions CI -====================================== +================== +GitHub Actions CI +================== -We use Azure Pipelines and GitHub Actions to run builds and tests for Linux, -Windows, and MacOS environments. 
We use these tools to run Linux builds and -tests for various less-common configurations, such as compiler versions that are -not available on LC systems. While we do builds for CUDA, HIP, and SYCL RAJA -GPU back-ends in the Azure and GitHub Actions Linux environments, RAJA tests -are only run for CPU-only pipelines. +We use GitHub Actions to run builds and tests for Linux, Windows, and MacOS +environments. We use this tool to run Linux builds and tests for various +less-common configurations, such as compiler versions that are not available +on LC systems. While we do builds for CUDA, HIP, and SYCL RAJA GPU back-ends +in the GitHub Actions Linux environment, RAJA tests are only run for CPU-only +pipelines. -.. note:: Azure Pipelines and GitHub Actions CI test jobs are run on every - RAJA pull request, regardless of whether it was made from a branch in the - RAJA project repo or from a fork of the repo. - -Azure Pipelines Testing Workflow --------------------------------- - -The jobs run in the Azure Pipelines testing workflow for RAJA are specified in -the `RAJA/azure-pipelines.yml `_ file. This file defines the job steps, commands, -compilers, etc. for each OS environment in the associated ``- job:`` section. -A summary of the configurations we build are: - - * **Windows.** The ``- job: Windows`` Windows section contains information - for the Windows test builds. For example, we build and test RAJA as - a static and/or shared library. This is indicated in the Windows - ``strategy`` section:: - - strategy: - matrix: - shared: - ... - static: - ... - - We use the Windows/compiler image provided by the Azure application - indicated the ``pool`` section; for example:: - - pool: - vmImage: 'windows-2019' - - **MacOS.** The ``- job: Mac`` section contains information for Mac test - builds. For example, we build RAJA using the MacOS/compiler - image provided by the Azure application indicated in the ``pool`` section; - for example:: - - pool: - vmImage: 'macOS-latest' - - **Linux.** The ``- job: Docker`` section contains information for Linux - test builds. We build and test RAJA using Docker container images generated - with recent versions of various compilers. The RAJA project shares these - images with other open-source LLNL RADIUSS projects and they are maintained - in the `RADIUSS Docker `_ - project on GitHub. The builds we do at any point in time are located in - the ``strategy`` block:: - - strategy: - matrix: - gcc11: - docker_target: ... - ... - clang14: - docker_target: ... - ... - - The Linux OS image used is indicated in the ``pool`` section; - for example:: - - pool: - vmImage: 'ubuntu-latest' +.. note:: GitHub Actions CI test jobs are run on every RAJA pull request, + regardless of whether it was made from a branch in the RAJA project repo + or from a fork of the repo. GitHub Actions Testing Workflow -------------------------------- diff --git a/docs/sphinx/dev_guide/ci_tasks.rst b/docs/sphinx/dev_guide/ci_tasks.rst index 16b3a6bad5..e5c77da403 100644 --- a/docs/sphinx/dev_guide/ci_tasks.rst +++ b/docs/sphinx/dev_guide/ci_tasks.rst @@ -191,92 +191,107 @@ Building the Compiler that the head of the SYCL branch will fail to build. In the event that it does not build, try checking out an earlier commit. On the Intel/LLVM GitHub page, one can see which of their commits builds by checking the status - badge next to each commit. Look for a commit that passes. + badge next to each commit. Look for a commit that passes. -#. 
Load the module of the version of GCC headers that you want to use. For example, we typically use the system default, which on corona is gcc/10.3.1-magic:: - module load gcc/10.3.1-magic +#. On LC machines, the good neighbor policy is to do your build on a compute node. -#. Load the module of the version of ROCm that you want to use. For example:: + Use an appropriate bank to get an interactive node, e.g., on Corona:: - module load rocm/5.7.1 + flux alloc -t 60 -N 1 --bank=wbronze +#. Load the module of the version of GCC headers that you want to use. We typically use the system default, which on Corona at the time of writing is gcc/10.3.1-magic:: + + GCC_VERSION=10.3.1 + module load gcc/${GCC_VERSION}-magic + +#. Load the module of the version of ROCm that you want to use:: + ROCM_VERSION=6.4.2 + module load rocm/${ROCM_VERSION} + +#. Load the Python module you want to use. At the time of writing, the LLVM configure requires at least version 3.7. We use the Corona default:: + PYTHON_VERSION=3.9.12 + module load python/${PYTHON_VERSION} + #. Clone the SYCL branch of Intel's LLVM compiler:: git clone https://github.com/intel/llvm -b sycl -#. cd into the LLVM folder:: - - cd llvm +#. cd into the LLVM folder and extract the GIT SHA for naming the install directories. The install directory uses the naming convention ``clang_sycl_<GIT_SHA>_hip_gcc<GCC_VERSION>_rocm<ROCM_VERSION>``:: - In the event that the head of the sycl branch does not build, run - ``git checkout `` to checkout a version that does build. + cd llvm + GIT_SHA=$(git rev-parse --short=12 HEAD) + INSTALL_PREFIX=/usr/workspace/raja-dev/clang_sycl_${GIT_SHA}_hip_gcc${GCC_VERSION}_rocm${ROCM_VERSION} #. Build the compiler. - Note that in this example, we are using rocm5.7.1, but one can change the - version they wish to use simply by changing the paths in the configure step - a. Configure - .. code-block:: bash - - srun -n1 /usr/bin/python3 buildbot/configure.py --hip -o buildrocm5.7.1 \ - --cmake-gen "Unix Makefiles" \ - --cmake-opt=-DSYCL_BUILD_PI_HIP_ROCM_DIR=/opt/rocm-5.7.1 \ - --cmake-opt=-DSYCL_BUILD_PI_HIP_ROCM_INCLUDE_DIR=/opt/rocm-5.7.1/include \ - --cmake-opt=-DSYCL_BUILD_PI_HIP_ROCM_LIB_DIR=/opt/rocm-5.7.1/lib \ - --cmake-opt=-DSYCL_BUILD_PI_HIP_INCLUDE_DIR=/opt/rocm-5.7.1/include \ - --cmake-opt=-DSYCL_BUILD_PI_HIP_HSA_INCLUDE_DIR=/opt/rocm-5.7.1/hsa/include/hsa \ - --cmake-opt=-DSYCL_BUILD_PI_HIP_LIB_DIR=/opt/rocm-5.7.1/lib \ - --cmake-opt=-DUR_HIP_ROCM_DIR=/opt/rocm-5.7.1 \ - --cmake-opt=-DUR_HIP_INCLUDE_DIR=/opt/rocm-5.7.1/include \ - --cmake-opt=-DUR_HIP_HSA_INCLUDE_DIR=/opt/rocm-5.7.1/hsa/include/hsa \ - --cmake-opt=-DUR_HIP_LIB_DIR=/opt/rocm-5.7.1/lib - - b. Build - ..
code-block:: bash - srun -n1 /usr/bin/python3 buildbot/compile.py -o buildrocm5.7.1 + python3 buildbot/configure.py --hip -o buildrocm${ROCM_VERSION} \ + --cmake-gen "Unix Makefiles" \ + --cmake-opt=-DSYCL_BUILD_PI_HIP_ROCM_DIR=/opt/rocm-${ROCM_VERSION} \ + --cmake-opt=-DSYCL_BUILD_PI_HIP_ROCM_INCLUDE_DIR=/opt/rocm-${ROCM_VERSION}/include \ + --cmake-opt=-DSYCL_BUILD_PI_HIP_ROCM_LIB_DIR=/opt/rocm-${ROCM_VERSION}/lib \ + --cmake-opt=-DSYCL_BUILD_PI_HIP_INCLUDE_DIR=/opt/rocm-${ROCM_VERSION}/include \ + --cmake-opt=-DSYCL_BUILD_PI_HIP_HSA_INCLUDE_DIR=/opt/rocm-${ROCM_VERSION}/hsa/include/hsa \ + --cmake-opt=-DSYCL_BUILD_PI_HIP_LIB_DIR=/opt/rocm-${ROCM_VERSION}/lib \ + --cmake-opt=-DUR_HIP_ROCM_DIR=/opt/rocm-${ROCM_VERSION} \ + --cmake-opt=-DUR_HIP_INCLUDE_DIR=/opt/rocm-${ROCM_VERSION}/include \ + --cmake-opt=-DUR_HIP_HSA_INCLUDE_DIR=/opt/rocm-${ROCM_VERSION}/hsa/include/hsa \ + --cmake-opt=-DUR_HIP_LIB_DIR=/opt/rocm-${ROCM_VERSION}/lib -#. Test the compiler + #. Build:: - Follow the steps in the `Using the compiler`_ section to test the installation + python buildbot/compile.py -o buildrocm${ROCM_VERSION} -#. Install + #. Install:: + + cp -rp buildrocm${ROCM_VERSION}/install ${INSTALL_PREFIX} + cd .. - a. The build step will install the compiler in the folder ``buildrocm/install``. Copy this folder to the ``/usr/workspace/raja-dev/`` directory using the naming scheme ``clang_sycl__hip_gcc_rocm`` +#. Set the permissions of the folder, and everything in it to 750:: - #. Set the permissions of the folder, and everything in it to 750:: + chmod 750 ${INSTALL_PREFIX} -R - chmod 750 /usr/workspace/raja-dev// -R +#. Change the group of the folder and everything in it to raja-dev:: - #. Change the group of the folder and everything in it to raja-dev:: + chgrp raja-dev ${INSTALL_PREFIX} -R - chgrp raja-dev /usr/workspace/raja-dev// -R +#. Test the compiler + Follow the steps in the `Using the compiler`_ section to test the installation Using the compiler ^^^^^^^^^^^^^^^^^^ -#. Load the version of rocm that you used when building the compiler, for example:: +#. Load the version of ROCm that you used when building the compiler, for example:: - module load rocm/5.7.1 + ROCM_VERSION=6.4.2 + module load rocm/${ROCM_VERSION} #. Navigate to the root of your local RAJA checkout space:: cd /path/to/raja +#. Determine where you installed the compiler. + + This is the ``INSTALL_PREFIX`` used above. For example:: + + SYCL_INSTALL_PREFIX=/usr/workspace/raja-dev/clang_sycl_16b7bcb09915_hip_gcc10.3.1_rocm6.4.2 + #. Run the test config script:: - ./scripts/lc-builds/corona_sycl.sh /usr/workspace/raja-dev/clang_sycl_2f03ef85fee5_hip_gcc10.3.1_rocm5.7.1 + ./scripts/lc-builds/corona_sycl.sh ${SYCL_INSTALL_PREFIX} - Note that at the time of writing, the newest compiler we had built was at ``clang_sycl_2f03ef85fee5_hip_gcc10.3.1_rocm5.7.1`` +#. As indicated in the output of the ``corona_sycl.sh`` script the SYCL compiler libraries need to be on the ``LD_LIBRARY_PATH``:: + + export LD_LIBRARY_PATH=${SYCL_INSTALL_PREFIX}/lib:${SYCL_INSTALL_PREFIX}/lib64:$LD_LIBRARY_PATH #. cd into the generated build directory:: - cd {build directory} + cd build_corona-sycl_${USER} #. 
Build the code and run the RAJA tests:: @@ -284,46 +299,28 @@ Using the compiler make test -============================================ -Azure Pipelines and GitHub Actions CI Tasks -============================================ +======================== +GitHub Actions CI Tasks +======================== -The tasks in this section apply to RAJA Azure Pipelines and GitHub Actions -CI testing that was described in :ref:`azure_ci-label` +The tasks in this section apply to RAJA GitHub Actions CI testing that was +described in :ref:`github_actions_ci-label` Changing Builds/Container Images -------------------------------- -The builds we run in Azure are defined in the `RAJA/azure-pipelines.yml `_ file. - -The builds we run in GitHub Actions are defined in the `RAJA/.github/workflows/build.yml `_ file. +The builds we run in GitHub Actions are defined in the +`RAJA/.github/workflows/build.yml `_ file. Linux/Docker ^^^^^^^^^^^^ -To update or add a new compiler / job to Azure Pipelines or GitHub Actions CI, -we need to edit either the ``RAJA/azure-pipelines.yml`` file or the -``RAJA/.github/workflows/build.yml`` file and the ``RAJA/Dockerfile``, if +To update or add a new compiler / job to GitHub Actions CI, we need to edit +the ``RAJA/.github/workflows/build.yml`` file and the ``RAJA/Dockerfile``, if changes are needed there. -If we want to add a new Azure pipeline to build with ``compilerX``, then in the -``RAJA/azure-pipelines.yml`` file we can add the job like so:: - - -job: Docker - ... - strategy: - matrix: - ... - compilerX: - docker_target: compilerX - -Here, ``compilerX`` defines the name of a job in Azure. ``docker_target: compilerX`` defines a variable ``docker_target``, which is used to determine which -entry in the ``Dockerfile`` file to use, where the name after ``docker_target`` -is the shorthand name of job in the ``Dockerfile`` file following the word -``AS``. - -Similarly, for GitHub Actions, we add the name of the job to the job list in -the ``RAJA/.github/workflows/build.yaml`` file:: +For GitHub Actions, we add the name of the job to the job list in the +``RAJA/.github/workflows/build.yml`` file:: jobs: build_docker: @@ -355,24 +352,9 @@ list of currently available images. Windows / MacOS ^^^^^^^^^^^^^^^ -We run our Windows and MacOS builds directly on the provided Azure machine -instances. To change the versions, change the ``pool`` under ``-job: Windows`` -or ``-job: Mac`` in the ``RAJA/azure-pipelines.yml`` file:: - - -job: Windows - ... - pool: - vmImage: 'windows-2019' - ... - - -job: Mac - ... - pool: - vmImage: 'macOS-latest' - -Similarly, in GitHub Actions, we run our Windows and MacOS builds directly on -the provided machine instances. To change the versions, change the -appropriate lines in the ``RAJA/.github/workflows/build.yml`` file:: +In GitHub Actions, we run our Windows and MacOS builds directly on the +provided machine instances. To change the versions, change the appropriate +lines in the ``RAJA/.github/workflows/build.yml`` file:: build_mac: runs-on: macos-latest @@ -410,7 +392,9 @@ Windows / MacOS ^^^^^^^^^^^^^^^ Windows and MacOS build / run parameters can be configured directly in the -``RAJA/azure-pipelines.yml`` or ``RAJA/.github/workflows/build.yml`` file. CMake options can be configured with ``CMAKE_EXTRA_FLAGS`` for each job. The ``-j`` value can also be edited directly in these files for each job. +``RAJA/.github/workflows/build.yml`` file. CMake options can be configured +in the workflow file for each job. 
The parallel build value can also be +edited directly in the workflow file for each job. .. _rajaperf_ci_tasks-label: @@ -433,7 +417,7 @@ Specifically, `_ directory. * The `RAJAPerf/Dockerfile `_ drives the - Azure testing pipelines. + GitHub Actions testing pipelines. The Performance Suite GitLab CI uses the ``uberenv`` and ``radiuss-spack-configs`` versions located in the RAJA submodule to make the diff --git a/docs/sphinx/dev_guide/contributing.rst b/docs/sphinx/dev_guide/contributing.rst index d26deb0459..69386776f4 100644 --- a/docs/sphinx/dev_guide/contributing.rst +++ b/docs/sphinx/dev_guide/contributing.rst @@ -71,10 +71,10 @@ of the contributor. The process involves four main steps: #. A RAJA contributor makes a PR on the RAJA GitHub project to merge a branch on which she has developed a contribution into another RAJA branch, typically, the develop branch. - #. When a PR is created, GitHub triggers Azure and GitHub Actions CI test - checks and GitLab CI checks if the branch is part of the RAJA GitHub repo. - Running and pass/fail status for all checks is reported back to the - corresponding GitHub pull request where it can be viewed and monitored. + #. When a PR is created, GitHub triggers GitHub Actions CI test checks and + GitLab CI checks if the branch is part of the RAJA GitHub repo. Running + and pass/fail status for all checks is reported back to the corresponding + GitHub pull request where it can be viewed and monitored. #. Meanwhile, RAJA team members and other contributors review the PR, suggesting changes and/or approving when they think it is ready to merge. #. When all checks pass and the PR is approved, the PR may be merged. diff --git a/host-configs/lc-builds/toss4/cce_omptarget_X.cmake b/host-configs/lc-builds/toss4/cce_omptarget_X.cmake new file mode 100644 index 0000000000..695e8838c8 --- /dev/null +++ b/host-configs/lc-builds/toss4/cce_omptarget_X.cmake @@ -0,0 +1,15 @@ +############################################################################### +# Copyright (c) 2016-25, Lawrence Livermore National Security, LLC +# and RAJA project contributors. See the RAJA/LICENSE file for details. +# +# SPDX-License-Identifier: (BSD-3-Clause) +############################################################################### + +set(RAJA_COMPILER "RAJA_COMPILER_CLANG" CACHE STRING "") + +set(CMAKE_CXX_FLAGS_RELEASE "-O3 -haccel=amd_${HIP_ARCH}" CACHE STRING "") +set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "-O3 -g -haccel=amd_${HIP_ARCH}" CACHE STRING "") +set(CMAKE_CXX_FLAGS_DEBUG "-O0 -g -haccel=amd_${HIP_ARCH}" CACHE STRING "") + +# hcpu flag needs more experimentation, can cause runtime vectorization failures. +# -hcpu=x86-genoa diff --git a/host-configs/lc-builds/toss4/nvcc_clang_X.cmake b/host-configs/lc-builds/toss4/nvcc_clang_X.cmake new file mode 100755 index 0000000000..6df45ac600 --- /dev/null +++ b/host-configs/lc-builds/toss4/nvcc_clang_X.cmake @@ -0,0 +1,26 @@ +############################################################################### +# Copyright (c) 2016-25, Lawrence Livermore National Security, LLC +# and RAJA project contributors. See the RAJA/LICENSE file for details. 
+# +# SPDX-License-Identifier: (BSD-3-Clause) +############################################################################### + +set(RAJA_COMPILER "RAJA_COMPILER_CLANG" CACHE STRING "") + +set(CMAKE_CXX_FLAGS_RELEASE "-O3 -march=native -funroll-loops -finline-functions" CACHE STRING "") +set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "-O3 -g -march=native -funroll-loops -finline-functions" CACHE STRING "") +set(CMAKE_CXX_FLAGS_DEBUG "-O0 -g" CACHE STRING "") + +set(HOST_OPT_FLAGS "-Xcompiler -O3 ") + +if(ENABLE_OPENMP) + set(HOST_OPT_FLAGS "${HOST_OPT_FLAGS} -Xcompiler -fopenmp") +endif() + +set(CMAKE_CUDA_FLAGS_RELEASE "-O3 ${HOST_OPT_FLAGS}" CACHE STRING "") +set(CMAKE_CUDA_FLAGS_DEBUG "-g -G -O0" CACHE STRING "") +set(CMAKE_CUDA_FLAGS_RELWITHDEBINFO "-g -lineinfo -O3 ${HOST_OPT_FLAGS}" CACHE STRING "") + +set(RAJA_DATA_ALIGN 64 CACHE STRING "") + +set(RAJA_HOST_CONFIG_LOADED On CACHE BOOL "") diff --git a/host-configs/lc-builds/toss4/nvcc_gcc_X.cmake b/host-configs/lc-builds/toss4/nvcc_gcc_X.cmake new file mode 100755 index 0000000000..080c8376a9 --- /dev/null +++ b/host-configs/lc-builds/toss4/nvcc_gcc_X.cmake @@ -0,0 +1,26 @@ +############################################################################### +# Copyright (c) 2016-25, Lawrence Livermore National Security, LLC +# and RAJA project contributors. See the RAJA/LICENSE file for details. +# +# SPDX-License-Identifier: (BSD-3-Clause) +############################################################################### + +set(RAJA_COMPILER "RAJA_COMPILER_GNU" CACHE STRING "") + +set(CMAKE_CXX_FLAGS_RELEASE "-Ofast -finline-functions" CACHE STRING "") +set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "-Ofast -g -finline-functions" CACHE STRING "") +set(CMAKE_CXX_FLAGS_DEBUG "-O0 -g" CACHE STRING "") + +set(HOST_OPT_FLAGS "-Xcompiler -O3 -Xcompiler -finline-functions ") + +if(ENABLE_OPENMP) + set(HOST_OPT_FLAGS "${HOST_OPT_FLAGS} -Xcompiler -fopenmp") +endif() + +set(CMAKE_CUDA_FLAGS_RELEASE "-O3 ${HOST_OPT_FLAGS}" CACHE STRING "") +set(CMAKE_CUDA_FLAGS_DEBUG "-g -G -O0" CACHE STRING "") +set(CMAKE_CUDA_FLAGS_RELWITHDEBINFO "-g -lineinfo -O3 ${HOST_OPT_FLAGS}" CACHE STRING "") + +set(RAJA_DATA_ALIGN 64 CACHE STRING "") + +set(RAJA_HOST_CONFIG_LOADED On CACHE BOOL "") diff --git a/include/RAJA/RAJA.hpp b/include/RAJA/RAJA.hpp index abc9f30738..e09b88151c 100644 --- a/include/RAJA/RAJA.hpp +++ b/include/RAJA/RAJA.hpp @@ -135,6 +135,10 @@ // #include "RAJA/pattern/atomic.hpp" +// Thread operations support +// +#include "RAJA/pattern/thread.hpp" + // // Shared memory view patterns // diff --git a/include/RAJA/config.hpp.in b/include/RAJA/config.hpp.in index 4987f00a50..36e4836378 100644 --- a/include/RAJA/config.hpp.in +++ b/include/RAJA/config.hpp.in @@ -167,8 +167,15 @@ static_assert(RAJA_HAS_SOME_CXX14, * * \brief Programming model back-ends. * + * \note The RAJA_ENABLE_ macros indicate if the back-end was + * enabled when RAJA was configured. RAJA__ACTIVE macros + * indicate if the back-end is present when the application/library + * using RAJA is compiled. An application/library can use a subset of + * back-ends that are present in a RAJA install. 
+ * ****************************************************************************** */ + #cmakedefine RAJA_ENABLE_OPENMP #cmakedefine RAJA_ENABLE_TARGET_OPENMP #cmakedefine RAJA_ENABLE_CUDA @@ -264,14 +271,14 @@ namespace RAJA { #if defined(RAJA_ENABLE_OPENMP) && !defined(__HIP_DEVICE_COMPILE__) #if defined(_OPENMP) +// RAJA build can be configured but downstream packages may not; guard OpenMP +#define RAJA_OPENMP_ACTIVE #if (_OPENMP >= 200805) #if defined(RAJA_ENABLE_OPENMP_TASK) #define RAJA_ENABLE_OPENMP_TASK_INTERNAL #endif #endif // _OPENMP >= 200805 -#else -#error RAJA configured with RAJA_ENABLE_OPENMP, but _OPENMP is not defined in this code section -#endif // else +#endif // OPENMP #endif // RAJA_ENABLE_OPENMP && !__HIP_DEVICE_COMPILE__ #if defined(RAJA_ENABLE_CUDA) && defined(__CUDACC__) diff --git a/include/RAJA/internal/ThreadUtils_CPU.hpp b/include/RAJA/internal/ThreadUtils_CPU.hpp index 8d420fa113..fe262363a5 100644 --- a/include/RAJA/internal/ThreadUtils_CPU.hpp +++ b/include/RAJA/internal/ThreadUtils_CPU.hpp @@ -21,9 +21,9 @@ #include "RAJA/config.hpp" -#if defined(RAJA_ENABLE_OPENMP) -#include -#endif +#include "RAJA/pattern/thread.hpp" +#include "RAJA/policy/openmp/thread.hpp" +#include "RAJA/policy/sequential/thread.hpp" namespace RAJA { @@ -35,16 +35,10 @@ namespace RAJA * ************************************************************************* */ -RAJA_INLINE -int getMaxOMPThreadsCPU() +template +RAJA_INLINE int getMaxOMPThreadsCPU() { - int nthreads = 1; - -#if defined(RAJA_ENABLE_OPENMP) - nthreads = omp_get_max_threads(); -#endif - - return nthreads; + return RAJA::get_max_threads(); } } // namespace RAJA diff --git a/include/RAJA/pattern/detail/TypeTraits.hpp b/include/RAJA/pattern/detail/TypeTraits.hpp new file mode 100644 index 0000000000..15a60812f0 --- /dev/null +++ b/include/RAJA/pattern/detail/TypeTraits.hpp @@ -0,0 +1,99 @@ +/*! + ****************************************************************************** + * + * \file + * + * \brief Header containing helper type traits for work with Reducers + * + ****************************************************************************** + */ + +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2016-25, Lawrence Livermore National Security, LLC +// and RAJA project contributors. See the RAJA/LICENSE file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + + +#ifndef RAJA_TYPETRAITS_HPP +#define RAJA_TYPETRAITS_HPP + +#include +#include + +namespace RAJA +{ +namespace expt +{ +//=========================================================================== +// +// +// Forward declarations for types used by type trait helpers +// +// + +// Forward declaration of ForallParamPack +template +struct ForallParamPack; + +// Forward declaration of Reducer +namespace detail +{ +template +struct Reducer; +} + +//=========================================================================== +// +// +// Type traits for SFINAE work. 
+// +// +namespace type_traits +{ +template +struct is_ForallParamPack : std::false_type +{}; + +template +struct is_ForallParamPack> : std::true_type +{}; + +template +struct is_ForallParamPack_empty : std::true_type +{}; + +template +struct is_ForallParamPack_empty> + : std::false_type +{}; + +template<> +struct is_ForallParamPack_empty> : std::true_type +{}; +} // namespace type_traits + +template +struct is_instance_of_Reducer : std::false_type +{}; + +template +struct is_instance_of_Reducer> : std::true_type +{}; + +template +struct tuple_contains_Reducers : std::false_type +{}; + +template +struct tuple_contains_Reducers> + : std::integral_constant< + bool, + camp::concepts::any_of...>::value> +{}; + +} // namespace expt +} // namespace RAJA + +#endif // RAJA_TYPETRAITS_HPP diff --git a/include/RAJA/pattern/kernel.hpp b/include/RAJA/pattern/kernel.hpp index abcdd1ac61..8ae119e063 100644 --- a/include/RAJA/pattern/kernel.hpp +++ b/include/RAJA/pattern/kernel.hpp @@ -204,6 +204,6 @@ kernel(SegmentTuple&& segments, Bodies&&... bodies) #include "RAJA/pattern/kernel/Region.hpp" #include "RAJA/pattern/kernel/Tile.hpp" #include "RAJA/pattern/kernel/TileTCount.hpp" -#include "RAJA/pattern/kernel/type-traits.hpp" +#include "RAJA/pattern/kernel/TypeTraits.hpp" #endif /* RAJA_pattern_kernel_HPP */ diff --git a/include/RAJA/pattern/kernel/type-traits.hpp b/include/RAJA/pattern/kernel/TypeTraits.hpp similarity index 97% rename from include/RAJA/pattern/kernel/type-traits.hpp rename to include/RAJA/pattern/kernel/TypeTraits.hpp index 74a54a8a43..2d7ec81a30 100644 --- a/include/RAJA/pattern/kernel/type-traits.hpp +++ b/include/RAJA/pattern/kernel/TypeTraits.hpp @@ -39,7 +39,7 @@ template struct loop_data_has_reducers< LoopData> - : RAJA::expt::tuple_contains_reducers + : RAJA::expt::tuple_contains_Reducers {}; template diff --git a/include/RAJA/pattern/launch.hpp b/include/RAJA/pattern/launch.hpp index 153e6e8376..66adf395e3 100644 --- a/include/RAJA/pattern/launch.hpp +++ b/include/RAJA/pattern/launch.hpp @@ -34,11 +34,11 @@ #include "RAJA/policy/hip/launch.hpp" #endif -#if defined(RAJA_ENABLE_OPENMP) +#if defined(RAJA_OPENMP_ACTIVE) #include "RAJA/policy/openmp/launch.hpp" #endif -#if defined(RAJA_ENABLE_SYCL) +#if defined(RAJA_SYCL_ACTIVE) #include "RAJA/policy/sycl/launch.hpp" #endif diff --git a/include/RAJA/pattern/launch/launch_core.hpp b/include/RAJA/pattern/launch/launch_core.hpp index bfce94057c..67d9b74ce2 100644 --- a/include/RAJA/pattern/launch/launch_core.hpp +++ b/include/RAJA/pattern/launch/launch_core.hpp @@ -31,7 +31,7 @@ // Odd dependecy with atomics is breaking CI builds //#include "RAJA/util/View.hpp" -#if defined(RAJA_GPU_DEVICE_COMPILE_PASS_ACTIVE) && !defined(RAJA_ENABLE_SYCL) +#if defined(RAJA_GPU_DEVICE_COMPILE_PASS_ACTIVE) && !defined(RAJA_SYCL_ACTIVE) #define RAJA_TEAM_SHARED __shared__ #else #define RAJA_TEAM_SHARED @@ -185,7 +185,8 @@ class LaunchContext void* shared_mem_ptr; -#if defined(RAJA_ENABLE_SYCL) +#if defined(RAJA_SYCL_ACTIVE) + // SGS ODR issue mutable ::sycl::nd_item<3>* itm; #endif @@ -231,11 +232,12 @@ class LaunchContext RAJA_HOST_DEVICE void teamSync() { -#if defined(RAJA_GPU_DEVICE_COMPILE_PASS_ACTIVE) && defined(RAJA_ENABLE_SYCL) + // SGS ODR Issue +#if defined(RAJA_GPU_DEVICE_COMPILE_PASS_ACTIVE) && defined(RAJA_SYCL_ACTIVE) itm->barrier(::sycl::access::fence_space::local_space); #endif -#if defined(RAJA_GPU_DEVICE_COMPILE_PASS_ACTIVE) && !defined(RAJA_ENABLE_SYCL) +#if defined(RAJA_GPU_DEVICE_COMPILE_PASS_ACTIVE) && !defined(RAJA_SYCL_ACTIVE) 
__syncthreads(); #endif } diff --git a/include/RAJA/pattern/params/forall.hpp b/include/RAJA/pattern/params/forall.hpp index b8b12a5c29..166594b6df 100644 --- a/include/RAJA/pattern/params/forall.hpp +++ b/include/RAJA/pattern/params/forall.hpp @@ -2,6 +2,7 @@ #define FORALL_PARAM_HPP +#include "RAJA/pattern/detail/TypeTraits.hpp" #include "RAJA/pattern/params/reducer.hpp" #include "RAJA/util/CombiningAdapter.hpp" #include "camp/camp.hpp" @@ -23,7 +24,7 @@ namespace detail template RAJA_HOST_DEVICE constexpr auto filter_reducers(camp::tuple& params) { - return camp::get_refs_to_elements_by_type_trait( + return camp::get_refs_to_elements_by_type_trait( params); } @@ -35,8 +36,9 @@ void resolve_params_helper(ParamTuple& params_tuple, const camp::idx_seq&, Args&&... args) { - CAMP_EXPAND(param_resolve(ExecPol {}, camp::get(params_tuple), - std::forward(args)...)); + (param_resolve(ExecPol {}, camp::get(params_tuple), + std::forward(args)...), + ...); } template @@ -57,8 +59,9 @@ void init_params_helper(ParamTuple& params_tuple, const camp::idx_seq&, Args&&... args) { - CAMP_EXPAND(param_init(ExecPol {}, camp::get(params_tuple), - std::forward(args)...)); + (param_init(ExecPol {}, camp::get(params_tuple), + std::forward(args)...), + ...); } template @@ -75,12 +78,12 @@ template RAJA_HOST_DEVICE void combine_params_helper(const camp::idx_seq&, ParamTuple& params_tuple) { - CAMP_EXPAND(param_combine(ExecPol {}, camp::get(params_tuple))); + (param_combine(ExecPol {}, camp::get(params_tuple)), ...); } template camp::concepts::enable_if< - concepts::negate>>, + concepts::negate>>, concepts::negate>> param_combine(EXEC_POL const&, T&, const T&) {} @@ -90,8 +93,9 @@ RAJA_HOST_DEVICE void combine_params_helper(const camp::idx_seq&, ParamTuple& params_tuple, const ParamTuple& params_tuple_in) { - CAMP_EXPAND(param_combine(ExecPol {}, camp::get(params_tuple), - camp::get(params_tuple_in))); + (param_combine(ExecPol {}, camp::get(params_tuple), + camp::get(params_tuple_in)), + ...); } template @@ -142,8 +146,9 @@ struct ForallParamPack ForallParamPack& f_params, Args&&... args) { - CAMP_EXPAND(param_init(pol, camp::get(f_params.param_tup), - std::forward(args)...)); + (param_init(pol, camp::get(f_params.param_tup), + std::forward(args)...), + ...); } // Combine @@ -154,8 +159,9 @@ struct ForallParamPack ForallParamPack& out, const ForallParamPack& in) { - CAMP_EXPAND(param_combine(pol, camp::get(out.param_tup), - camp::get(in.param_tup))); + (param_combine(pol, camp::get(out.param_tup), + camp::get(in.param_tup)), + ...); } template @@ -164,7 +170,7 @@ struct ForallParamPack camp::idx_seq, ForallParamPack& f_params) { - CAMP_EXPAND(param_combine(pol, camp::get(f_params.param_tup))); + (param_combine(pol, camp::get(f_params.param_tup)), ...); } // Resolve @@ -174,8 +180,9 @@ struct ForallParamPack ForallParamPack& f_params, Args&&... args) { - CAMP_EXPAND(param_resolve(pol, camp::get(f_params.param_tup), - std::forward(args)...)); + (param_resolve(pol, camp::get(f_params.param_tup), + std::forward(args)...), + ...); } // Used to construct the argument TYPES that will be invoked with the lambda. @@ -254,8 +261,13 @@ struct ParamMultiplexer ForallParamPack& f_params, Args&&... 
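The forall.hpp helpers above swap the CAMP_EXPAND macro for C++17 comma fold expressions over the pack's index sequence; an empty sequence folds to nothing, which is what lets later code call these hooks unconditionally. A self-contained sketch of the same idiom using std::tuple rather than RAJA's parameter tuple:

    #include <iostream>
    #include <tuple>
    #include <utility>

    template <typename Tuple, std::size_t... Seq>
    void bump_each(Tuple& t, std::index_sequence<Seq...>)
    {
      // Comma fold: expands to (op(get<0>), op(get<1>), ...); for an empty
      // index sequence the whole expression disappears.
      ((std::get<Seq>(t) += 1), ...);
    }

    int main()
    {
      std::tuple<int, int, int> t {1, 2, 3};
      bump_each(t, std::make_index_sequence<3> {});
      std::cout << std::get<0>(t) << " " << std::get<1>(t) << " "
                << std::get<2>(t) << "\n";   // prints: 2 3 4
      return 0;
    }
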
args) { - FP::parampack_init(pol, typename FP::params_seq(), f_params, - std::forward(args)...); + constexpr bool has_reducers = + !RAJA::expt::type_traits::is_ForallParamPack_empty::value; + if constexpr (has_reducers) + { + FP::parampack_init(pol, typename FP::params_seq(), f_params, + std::forward(args)...); + } } template& f_params, Args&&... args) { - FP::parampack_combine(pol, typename FP::params_seq(), f_params, - std::forward(args)...); + constexpr bool has_reducers = + !RAJA::expt::type_traits::is_ForallParamPack_empty::value; + if constexpr (has_reducers) + { + FP::parampack_combine(pol, typename FP::params_seq(), f_params, + std::forward(args)...); + } } template& f_params, Args&&... args) { - FP::parampack_resolve(pol, typename FP::params_seq(), f_params, - std::forward(args)...); + constexpr bool has_reducers = + !RAJA::expt::type_traits::is_ForallParamPack_empty::value; + if constexpr (has_reducers) + { + FP::parampack_resolve(pol, typename FP::params_seq(), f_params, + std::forward(args)...); + } } }; @@ -535,39 +557,6 @@ constexpr void check_forall_optional_args(Lambda&& l, ForallParams& fpp) //=========================================================================== -//=========================================================================== -// -// -// Type trailts for SFINAE work. -// -// -namespace type_traits -{ -template -struct is_ForallParamPack : std::false_type -{}; - -template -struct is_ForallParamPack> : std::true_type -{}; - -template -struct is_ForallParamPack_empty : std::true_type -{}; - -template -struct is_ForallParamPack_empty> - : std::false_type -{}; - -template<> -struct is_ForallParamPack_empty> : std::true_type -{}; -} // namespace type_traits - -//=========================================================================== - - //=========================================================================== // // @@ -601,10 +590,20 @@ RAJA_HOST_DEVICE constexpr auto invoke_body(Params&& params, Fn&& f, Ts&&... 
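ParamMultiplexer's parampack_init/parampack_combine/parampack_resolve now test is_ForallParamPack_empty with if constexpr, so calls on an empty pack compile to nothing and the back ends below no longer need separate reducer and non-reducer overloads. A simplified analogue of that guard, using stand-in types rather than RAJA's real ones:

    #include <type_traits>

    struct EmptyPack {};          // stand-in for ForallParamPack<>
    struct ReducerPack            // stand-in for a pack that carries reducers
    {
      void init() {}
    };

    template <typename Pack>
    void init_if_needed(Pack& pack)
    {
      if constexpr (!std::is_same<Pack, EmptyPack>::value)
      {
        // The branch is discarded for EmptyPack, so EmptyPack never needs an
        // init() member and no dead call is emitted.
        pack.init();
      }
    }
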
extra) { - return detail::invoke_with_order( - camp::forward(params), camp::forward(f), - typename camp::decay::lambda_arg_seq(), - camp::forward(extra)...); + using FPType = camp::decay; + constexpr bool has_reducers = + !RAJA::expt::type_traits::is_ForallParamPack_empty::value; + if constexpr (has_reducers) + { + return detail::invoke_with_order( + camp::forward(params), camp::forward(f), + typename camp::decay::lambda_arg_seq(), + camp::forward(extra)...); + } + else + { + return f(camp::forward(extra)...); + } } //=========================================================================== diff --git a/include/RAJA/pattern/params/reducer.hpp b/include/RAJA/pattern/params/reducer.hpp index 21042a591b..4298994285 100644 --- a/include/RAJA/pattern/params/reducer.hpp +++ b/include/RAJA/pattern/params/reducer.hpp @@ -3,6 +3,7 @@ #include +#include "RAJA/pattern/detail/TypeTraits.hpp" #include "RAJA/pattern/params/params_base.hpp" #include "RAJA/util/SoAPtr.hpp" @@ -247,24 +248,6 @@ auto constexpr ReduceLoc(T* target, IndexType* index) target, index); } -template -struct is_instance_of_reducer : std::false_type -{}; - -template -struct is_instance_of_reducer> : std::true_type -{}; - -template -struct tuple_contains_reducers : std::false_type -{}; - -template -struct tuple_contains_reducers> - : std::integral_constant< - bool, - camp::concepts::any_of...>::value> -{}; } // namespace expt diff --git a/include/RAJA/pattern/thread.hpp b/include/RAJA/pattern/thread.hpp new file mode 100644 index 0000000000..0fc15fa83c --- /dev/null +++ b/include/RAJA/pattern/thread.hpp @@ -0,0 +1,95 @@ +/*! + ****************************************************************************** + * + * \file + * + * \brief RAJA header file defining thread operations. + * + ****************************************************************************** + */ + +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2016-25, Lawrence Livermore National Security, LLC +// and RAJA project contributors. See the RAJA/LICENSE file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#ifndef RAJA_pattern_thread_HPP +#define RAJA_pattern_thread_HPP + +#include "RAJA/config.hpp" + +#include "RAJA/util/macros.hpp" + +#include "RAJA/policy/thread_auto.hpp" + +namespace RAJA +{ +/*! + * \file + * Thread operation functions in the namespace RAJA::thread + * + * The dispatch of all of these is: + * + * int get_thread_num() -- User facing API + * + * calls + * + * int get_thread_num(Policy{}) -- Policy specific implementation + * + * + * With the exception of the auto_thread policy which then calls the + * "appropriate" policy implementation. + * + * + * Current supported policies include: + * + * auto_thread -- Attempts to do "the right thing" + * + * omp_thread -- Available (and default) when OpenMP is active + * these are safe inside and outside of OMP parallel + * regions + * + * seq_thread -- Non-thread + * + * + * The implementation code lives in: + * RAJA/policy/thread_auto.hpp -- for auto_thread + * RAJA/policy/XXX/thread.hpp -- for omp_thread + * + */ + +/*! + * @brief Get maximum number of threads + + * This is based on OpenMP threading model. This value is also an + * upper bound on the number of threads that could be used to form a + * new team if a parallel region without a num_threads clause were + * encountered after execution returns from this routine. + * + * Returns 1 if OMP is not active. 
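The dispatch described in the new pattern/thread.hpp comment is plain tag dispatch: a user-facing function template forwards to a policy-specific overload selected by a default-constructed tag object. A generic sketch of that layering, with hypothetical tag names standing in for omp_thread and seq_thread:

    // Policy tags (stand-ins for RAJA::omp_thread / RAJA::seq_thread).
    struct serial_tag {};
    struct threaded_tag {};

    // Policy-specific implementations.
    inline int thread_count(serial_tag)   { return 1; }
    inline int thread_count(threaded_tag) { return 8; /* e.g. omp_get_max_threads() */ }

    // User-facing API: dispatches on a default-constructed tag.
    template <typename Policy = serial_tag>
    inline int thread_count()
    {
      return thread_count(Policy {});
    }
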
+ * @return Maximum number of threads + */ +template +RAJA_INLINE RAJA_HOST_DEVICE int get_max_threads() +{ + return RAJA::get_max_threads(Policy {}); +} + +/*! + * @brief Get current thread number + * This is based on the OpenMP threading model. Within a parallel team + * executing a parallel region the threads are numbered 0-N. Returns 0 if called + * in sequential part of a program or OMP is not active + * @return Current thread number + */ +template +RAJA_INLINE RAJA_HOST_DEVICE int get_thread_num() +{ + return RAJA::get_thread_num(Policy {}); +} + +} // namespace RAJA + +#endif diff --git a/include/RAJA/policy/cuda/MemUtils_CUDA.hpp b/include/RAJA/policy/cuda/MemUtils_CUDA.hpp index be8eafc58f..49bf62e5aa 100644 --- a/include/RAJA/policy/cuda/MemUtils_CUDA.hpp +++ b/include/RAJA/policy/cuda/MemUtils_CUDA.hpp @@ -27,11 +27,12 @@ #include #include #include +#include #include #include + #include "RAJA/util/basic_mempool.hpp" -#include "RAJA/util/mutex.hpp" #include "RAJA/util/types.hpp" #include "RAJA/util/macros.hpp" #include "RAJA/util/resource.hpp" @@ -182,17 +183,12 @@ struct cudaInfo struct cudaStatusInfo : cudaInfo { -#if defined(RAJA_ENABLE_OPENMP) - omp::mutex lock; -#endif + std::mutex lock; }; extern cudaStatusInfo g_status; -extern cudaStatusInfo tl_status; -#if defined(RAJA_ENABLE_OPENMP) -#pragma omp threadprivate(tl_status) -#endif +thread_local extern cudaStatusInfo tl_status; // stream to synchronization status: true synchronized, false running extern std::unordered_map g_stream_info_map; @@ -206,9 +202,7 @@ void synchronize_impl(::RAJA::resources::Cuda res) { res.wait(); } RAJA_INLINE void synchronize() { -#if defined(RAJA_ENABLE_OPENMP) - lock_guard lock(detail::g_status.lock); -#endif + std::lock_guard lock(detail::g_status.lock); bool synchronize = false; for (auto& val : detail::g_stream_info_map) { @@ -228,9 +222,7 @@ void synchronize() RAJA_INLINE void synchronize(::RAJA::resources::Cuda res) { -#if defined(RAJA_ENABLE_OPENMP) - lock_guard lock(detail::g_status.lock); -#endif + std::lock_guard lock(detail::g_status.lock); auto iter = detail::g_stream_info_map.find(res.get_stream()); if (iter != detail::g_stream_info_map.end()) { @@ -250,9 +242,7 @@ void synchronize(::RAJA::resources::Cuda res) RAJA_INLINE void launch(::RAJA::resources::Cuda res, bool async = true) { -#if defined(RAJA_ENABLE_OPENMP) - lock_guard lock(detail::g_status.lock); -#endif + std::lock_guard lock(detail::g_status.lock); auto iter = detail::g_stream_info_map.find(res.get_stream()); if (iter != detail::g_stream_info_map.end()) { diff --git a/include/RAJA/policy/cuda/atomic.hpp b/include/RAJA/policy/cuda/atomic.hpp index dfa148320a..2c7b38d727 100644 --- a/include/RAJA/policy/cuda/atomic.hpp +++ b/include/RAJA/policy/cuda/atomic.hpp @@ -38,7 +38,8 @@ #include "RAJA/policy/sequential/atomic.hpp" #include "RAJA/policy/atomic_builtin.hpp" -#if defined(RAJA_ENABLE_OPENMP) + +#if defined(RAJA_OPENMP_ACTIVE) #include "RAJA/policy/openmp/atomic.hpp" #endif diff --git a/include/RAJA/policy/cuda/forall.hpp b/include/RAJA/policy/cuda/forall.hpp index cfa2c953d5..2af88ca941 100644 --- a/include/RAJA/policy/cuda/forall.hpp +++ b/include/RAJA/policy/cuda/forall.hpp @@ -390,23 +390,13 @@ __launch_bounds__(BlockSize, BlocksPerSM) __global__ auto privatizer = thread_privatize(loop_body); auto& body = privatizer.get_priv(); auto ii = IterationGetter::template index(); - constexpr bool is_forall_parampack_empty = - RAJA::expt::type_traits::is_ForallParamPack_empty::value; + if (ii < length) { - if constexpr 
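Throughout the CUDA/HIP utility headers, the OpenMP-specific omp::mutex plus "#pragma omp threadprivate" pair is replaced by standard C++ facilities: std::mutex with std::lock_guard for the shared status, and a thread_local object for per-thread status. A minimal sketch of that pattern outside RAJA:

    #include <mutex>
    #include <unordered_map>

    struct Status
    {
      std::mutex lock;                       // was omp::mutex under RAJA_ENABLE_OPENMP
      std::unordered_map<int, bool> streams;
    };

    inline Status g_status;                  // shared across threads
    thread_local inline Status tl_status;    // was #pragma omp threadprivate

    inline void mark_running(int stream)
    {
      std::lock_guard<std::mutex> guard(g_status.lock);  // no #ifdef needed
      g_status.streams[stream] = false;
    }
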
(is_forall_parampack_empty) - { - body(idx[ii]); - } - else - { - RAJA::expt::invoke_body(f_params, body, idx[ii]); - } - } - if constexpr (!is_forall_parampack_empty) - { - RAJA::expt::ParamMultiplexer::parampack_combine(EXEC_POL {}, f_params); + RAJA::expt::invoke_body(f_params, body, idx[ii]); } + + RAJA::expt::ParamMultiplexer::parampack_combine(EXEC_POL {}, f_params); } /// @@ -432,23 +422,13 @@ __global__ void forallp_cuda_kernel( auto privatizer = thread_privatize(loop_body); auto& body = privatizer.get_priv(); auto ii = IterationGetter::template index(); - constexpr bool is_forall_parampack_empty = - RAJA::expt::type_traits::is_ForallParamPack_empty::value; + if (ii < length) { - if constexpr (is_forall_parampack_empty) - { - body(idx[ii]); - } - else - { - RAJA::expt::invoke_body(f_params, body, idx[ii]); - } - } - if constexpr (!is_forall_parampack_empty) - { - RAJA::expt::ParamMultiplexer::parampack_combine(EXEC_POL {}, f_params); + RAJA::expt::invoke_body(f_params, body, idx[ii]); } + + RAJA::expt::ParamMultiplexer::parampack_combine(EXEC_POL {}, f_params); } /// @@ -476,24 +456,14 @@ __launch_bounds__(BlockSize, BlocksPerSM) __global__ using RAJA::internal::thread_privatize; auto privatizer = thread_privatize(loop_body); auto& body = privatizer.get_priv(); - constexpr bool is_forall_parampack_empty = - RAJA::expt::type_traits::is_ForallParamPack_empty::value; + for (auto ii = IterationGetter::template index(); ii < length; ii += IterationGetter::template size()) { - if constexpr (is_forall_parampack_empty) - { - body(idx[ii]); - } - else - { - RAJA::expt::invoke_body(f_params, body, idx[ii]); - } - } - if constexpr (!is_forall_parampack_empty) - { - RAJA::expt::ParamMultiplexer::parampack_combine(EXEC_POL {}, f_params); + RAJA::expt::invoke_body(f_params, body, idx[ii]); } + + RAJA::expt::ParamMultiplexer::parampack_combine(EXEC_POL {}, f_params); } /// @@ -521,24 +491,14 @@ __global__ void forallp_cuda_kernel( using RAJA::internal::thread_privatize; auto privatizer = thread_privatize(loop_body); auto& body = privatizer.get_priv(); - constexpr bool is_forall_parampack_empty = - RAJA::expt::type_traits::is_ForallParamPack_empty::value; + for (auto ii = IterationGetter::template index(); ii < length; ii += IterationGetter::template size()) { - if constexpr (is_forall_parampack_empty) - { - body(idx[ii]); - } - else - { - RAJA::expt::invoke_body(f_params, body, idx[ii]); - } - } - if constexpr (!is_forall_parampack_empty) - { - RAJA::expt::ParamMultiplexer::parampack_combine(EXEC_POL {}, f_params); + RAJA::expt::invoke_body(f_params, body, idx[ii]); } + + RAJA::expt::ParamMultiplexer::parampack_combine(EXEC_POL {}, f_params); } } // namespace impl @@ -590,8 +550,6 @@ forall_impl(resources::Cuda cuda_res, Iterator begin = std::begin(iter); Iterator end = std::end(iter); IndexType len = std::distance(begin, end); - constexpr bool is_forallparampack_empty = - RAJA::expt::type_traits::is_ForallParamPack_empty::value; // Only launch kernel if we have something to iterate over if (len > 0) @@ -620,11 +578,7 @@ forall_impl(resources::Cuda cuda_res, launch_info.res = cuda_res; { - if constexpr (!is_forallparampack_empty) - { - RAJA::expt::ParamMultiplexer::parampack_init(pol, f_params, - launch_info); - } + RAJA::expt::ParamMultiplexer::parampack_init(pol, f_params, launch_info); // // Privatize the loop_body, using make_launch_body to setup reductions @@ -641,11 +595,8 @@ forall_impl(resources::Cuda cuda_res, RAJA::cuda::launch(func, dims.blocks, dims.threads, args, shmem, cuda_res, 
Async); - if constexpr (!is_forallparampack_empty) - { - RAJA::expt::ParamMultiplexer::parampack_resolve(pol, f_params, - launch_info); - } + RAJA::expt::ParamMultiplexer::parampack_resolve(pol, f_params, + launch_info); } RAJA_FT_END; diff --git a/include/RAJA/policy/cuda/launch.hpp b/include/RAJA/policy/cuda/launch.hpp index 32e4920731..d9cca09216 100644 --- a/include/RAJA/policy/cuda/launch.hpp +++ b/include/RAJA/policy/cuda/launch.hpp @@ -28,22 +28,6 @@ namespace RAJA { -template -__global__ void launch_global_fcn(const RAJA_CUDA_GRID_CONSTANT BODY body_in) -{ - LaunchContext ctx; - - using RAJA::internal::thread_privatize; - auto privatizer = thread_privatize(body_in); - auto& body = privatizer.get_priv(); - - // Set pointer to shared memory - extern __shared__ char raja_shmem_ptr[]; - ctx.shared_mem_ptr = raja_shmem_ptr; - - body(ctx); -} - template __global__ void launch_new_reduce_global_fcn(const RAJA_CUDA_GRID_CONSTANT BODY body_in, @@ -76,71 +60,7 @@ struct LaunchExecute< template static concepts::enable_if_t< resources::EventProxy, - RAJA::expt::type_traits::is_ForallParamPack, - RAJA::expt::type_traits::is_ForallParamPack_empty> - exec(RAJA::resources::Resource res, - const LaunchParams& params, - BODY_IN&& body_in, - ReduceParams& RAJA_UNUSED_ARG(launch_reducers)) - { - using BODY = camp::decay; - - auto func = reinterpret_cast(&launch_global_fcn); - - resources::Cuda cuda_res = res.get(); - - // - // Compute the number of blocks and threads - // - - cuda_dim_t gridSize {static_cast(params.teams.value[0]), - static_cast(params.teams.value[1]), - static_cast(params.teams.value[2])}; - - cuda_dim_t blockSize { - static_cast(params.threads.value[0]), - static_cast(params.threads.value[1]), - static_cast(params.threads.value[2])}; - - // Only launch kernel if we have something to iterate over - constexpr cuda_dim_member_t zero = 0; - if (gridSize.x > zero && gridSize.y > zero && gridSize.z > zero && - blockSize.x > zero && blockSize.y > zero && blockSize.z > zero) - { - - RAJA_FT_BEGIN; - - { - size_t shared_mem_size = params.shared_mem_size; - - // - // Privatize the loop_body, using make_launch_body to setup reductions - // - BODY body = RAJA::cuda::make_launch_body( - func, gridSize, blockSize, shared_mem_size, cuda_res, - std::forward(body_in)); - - // - // Launch the kernel - // - void* args[] = {(void*)&body}; - RAJA::cuda::launch(func, gridSize, blockSize, args, shared_mem_size, - cuda_res, async); - } - - RAJA_FT_END; - } - - return resources::EventProxy(res); - } - - // Version with explicit reduction parameters.. 
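With the empty-pack handling folded into ParamMultiplexer and invoke_body, the launch back ends delete the non-reducer kernel and keep a single exec overload constrained only on is_ForallParamPack. The shape of that simplification, sketched with std::enable_if on stand-in traits:

    #include <type_traits>

    template <typename Pack> struct is_param_pack : std::false_type {};
    struct MyPack {};
    template <> struct is_param_pack<MyPack> : std::true_type {};

    // Before: two overloads, one for empty packs and one for non-empty packs.
    // After: one overload; the pack-aware path handles both cases internally,
    // because its init/combine/resolve hooks compile away for empty packs.
    template <typename Pack>
    typename std::enable_if<is_param_pack<Pack>::value, void>::type
    exec(Pack& pack)
    {
      // launch the reduce-aware kernel unconditionally
    }
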
- template - static concepts::enable_if_t< - resources::EventProxy, - RAJA::expt::type_traits::is_ForallParamPack, - concepts::negate< - RAJA::expt::type_traits::is_ForallParamPack_empty>> + RAJA::expt::type_traits::is_ForallParamPack> exec(RAJA::resources::Resource res, const LaunchParams& launch_params, BODY_IN&& body_in, @@ -214,23 +134,6 @@ struct LaunchExecute< } }; -template -__launch_bounds__(num_threads, BLOCKS_PER_SM) __global__ - void launch_global_fcn_fixed(const RAJA_CUDA_GRID_CONSTANT BODY body_in) -{ - LaunchContext ctx; - - using RAJA::internal::thread_privatize; - auto privatizer = thread_privatize(body_in); - auto& body = privatizer.get_priv(); - - // Set pointer to shared memory - extern __shared__ char raja_shmem_ptr[]; - ctx.shared_mem_ptr = raja_shmem_ptr; - - body(ctx); -} - template static concepts::enable_if_t< resources::EventProxy, - RAJA::expt::type_traits::is_ForallParamPack, - RAJA::expt::type_traits::is_ForallParamPack_empty> - exec(RAJA::resources::Resource res, - const LaunchParams& params, - BODY_IN&& body_in, - ReduceParams& RAJA_UNUSED_ARG(launch_reducers)) - { - using BODY = camp::decay; - - auto func = reinterpret_cast( - &launch_global_fcn_fixed); - - resources::Cuda cuda_res = res.get(); - - // - // Compute the number of blocks and threads - // - - cuda_dim_t gridSize {static_cast(params.teams.value[0]), - static_cast(params.teams.value[1]), - static_cast(params.teams.value[2])}; - - cuda_dim_t blockSize { - static_cast(params.threads.value[0]), - static_cast(params.threads.value[1]), - static_cast(params.threads.value[2])}; - - // Only launch kernel if we have something to iterate over - constexpr cuda_dim_member_t zero = 0; - if (gridSize.x > zero && gridSize.y > zero && gridSize.z > zero && - blockSize.x > zero && blockSize.y > zero && blockSize.z > zero) - { - - RAJA_FT_BEGIN; - - { - size_t shared_mem_size = params.shared_mem_size; - - // - // Privatize the loop_body, using make_launch_body to setup reductions - // - BODY body = RAJA::cuda::make_launch_body( - func, gridSize, blockSize, shared_mem_size, cuda_res, - std::forward(body_in)); - - // - // Launch the kernel - // - void* args[] = {(void*)&body}; - RAJA::cuda::launch(func, gridSize, blockSize, args, shared_mem_size, - cuda_res, async); - } - - RAJA_FT_END; - } - - return resources::EventProxy(res); - } - - // Version with explicit reduction parameters.. - template - static concepts::enable_if_t< - resources::EventProxy, - RAJA::expt::type_traits::is_ForallParamPack, - concepts::negate< - RAJA::expt::type_traits::is_ForallParamPack_empty>> + RAJA::expt::type_traits::is_ForallParamPack> exec(RAJA::resources::Resource res, const LaunchParams& launch_params, BODY_IN&& body_in, diff --git a/include/RAJA/policy/cuda/multi_reduce.hpp b/include/RAJA/policy/cuda/multi_reduce.hpp index 4aad6e6f0b..f4729d5dff 100644 --- a/include/RAJA/policy/cuda/multi_reduce.hpp +++ b/include/RAJA/policy/cuda/multi_reduce.hpp @@ -27,14 +27,15 @@ #include #include +#include #include #include + #include #include "RAJA/util/macros.hpp" #include "RAJA/util/math.hpp" -#include "RAJA/util/mutex.hpp" #include "RAJA/util/types.hpp" #include "RAJA/util/reduce.hpp" #include "RAJA/util/OffsetOperators.hpp" @@ -51,6 +52,8 @@ #include "RAJA/policy/cuda/atomic.hpp" #endif +#include "RAJA/pattern/thread.hpp" + #include "RAJA/policy/cuda/policy.hpp" #include "RAJA/policy/cuda/raja_cudaerrchk.hpp" @@ -204,7 +207,10 @@ RAJA_DEVICE RAJA_INLINE void grid_multi_reduce_shmem_to_global_atomic( // //! 
MultiReduction data for Cuda Offload -- stores value, host pointer -template +template struct MultiReduceGridAtomicHostInit_TallyData { //! setup permanent settings, allocate and initialize tally memory @@ -319,10 +325,7 @@ struct MultiReduceGridAtomicHostInit_TallyData static int get_tally_replication() { - int min_tally_replication = 1; -#if defined(RAJA_ENABLE_OPENMP) - min_tally_replication = omp_get_max_threads(); -#endif + int min_tally_replication = RAJA::get_max_threads(); struct { @@ -410,7 +413,10 @@ struct MultiReduceGridAtomicHostInit_TallyData }; //! MultiReduction data for Cuda Offload -- stores value, host pointer -template +template struct MultiReduceGridAtomicHostInit_Data : MultiReduceGridAtomicHostInit_TallyData { @@ -451,10 +457,7 @@ struct MultiReduceGridAtomicHostInit_Data //! combine value on host, combine a value into the tally void combine_host(int bin, T value) { - int tally_rep = 0; -#if defined(RAJA_ENABLE_OPENMP) - tally_rep = omp_get_thread_num(); -#endif + int tally_rep = RAJA::get_thread_num(); int tally_offset = GetTallyOffset {}(bin, m_tally_bins, tally_rep, m_tally_replication); Combiner {}(m_tally_mem[tally_offset], value); @@ -472,7 +475,10 @@ struct MultiReduceGridAtomicHostInit_Data }; //! MultiReduction data for Cuda Offload -- stores value, host pointer -template +template struct MultiReduceBlockThenGridAtomicHostInit_Data : MultiReduceGridAtomicHostInit_TallyData { @@ -597,10 +603,7 @@ struct MultiReduceBlockThenGridAtomicHostInit_Data //! combine value on host, combine a value into the tally void combine_host(int bin, T value) { - int tally_rep = 0; -#if defined(RAJA_ENABLE_OPENMP) - tally_rep = omp_get_thread_num(); -#endif + int tally_rep = RAJA::get_thread_num(); int tally_offset = GetTallyOffset {}(bin, m_tally_bins, tally_rep, m_tally_replication); Combiner {}(m_tally_mem[tally_offset], value); diff --git a/include/RAJA/policy/cuda/reduce.hpp b/include/RAJA/policy/cuda/reduce.hpp index 25de2a9a5c..2dd1ece75f 100644 --- a/include/RAJA/policy/cuda/reduce.hpp +++ b/include/RAJA/policy/cuda/reduce.hpp @@ -26,6 +26,7 @@ #if defined(RAJA_ENABLE_CUDA) #include +#include #include @@ -33,7 +34,6 @@ #include "RAJA/util/SoAArray.hpp" #include "RAJA/util/SoAPtr.hpp" #include "RAJA/util/basic_mempool.hpp" -#include "RAJA/util/mutex.hpp" #include "RAJA/util/types.hpp" #include "RAJA/util/reduce.hpp" @@ -600,9 +600,7 @@ class PinnedTally //! 
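The multi-reduce tally code replaces direct omp_get_max_threads()/omp_get_thread_num() calls with the policy-dispatched RAJA::get_max_threads()/RAJA::get_thread_num(), keeping one tally slot per host thread so host-side combines stay contention-free. A compact illustration of per-thread tally replication; the offset math here is illustrative, not the GetTallyOffset functor from the patch:

    #include <cstddef>
    #include <vector>

    // One tally slot per (bin, thread) pair; each host thread writes its own
    // column, and a final pass folds the columns together.
    struct Tally
    {
      int bins, replication;
      std::vector<double> data;

      Tally(int num_bins, int num_threads)
          : bins(num_bins), replication(num_threads),
            data(static_cast<std::size_t>(num_bins) * num_threads, 0.0)
      {}

      void combine(int bin, int thread, double value)
      {
        data[static_cast<std::size_t>(bin) * replication + thread] += value;
      }

      double get(int bin) const
      {
        double sum = 0.0;
        for (int t = 0; t < replication; ++t)
        {
          sum += data[static_cast<std::size_t>(bin) * replication + t];
        }
        return sum;
      }
    };
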
get new value for use in resource auto new_value(::RAJA::resources::Cuda res) -> T (&)[num_slots] { -#if defined(RAJA_ENABLE_OPENMP) - lock_guard lock(m_mutex); -#endif + std::lock_guard lock(m_mutex); ResourceNode* rn = resource_list; while (rn) { @@ -652,9 +650,7 @@ class PinnedTally ~PinnedTally() { free_list(); } -#if defined(RAJA_ENABLE_OPENMP) - omp::mutex m_mutex; -#endif + std::mutex m_mutex; private: ResourceNode* resource_list; @@ -1099,9 +1095,7 @@ class Reduce { if (val.value != val.identity) { -#if defined(RAJA_ENABLE_OPENMP) - lock_guard lock(tally_or_val_ptr.list->m_mutex); -#endif + std::lock_guard lock(tally_or_val_ptr.list->m_mutex); parent->combine(val.value); } } diff --git a/include/RAJA/policy/hip/MemUtils_HIP.hpp b/include/RAJA/policy/hip/MemUtils_HIP.hpp index 559cc640c8..9824498aff 100644 --- a/include/RAJA/policy/hip/MemUtils_HIP.hpp +++ b/include/RAJA/policy/hip/MemUtils_HIP.hpp @@ -27,11 +27,12 @@ #include #include #include +#include #include #include + #include "RAJA/util/basic_mempool.hpp" -#include "RAJA/util/mutex.hpp" #include "RAJA/util/types.hpp" #include "RAJA/util/macros.hpp" #include "RAJA/util/resource.hpp" @@ -175,17 +176,12 @@ struct hipInfo struct hipStatusInfo : hipInfo { -#if defined(RAJA_ENABLE_OPENMP) - omp::mutex lock; -#endif + std::mutex lock; }; extern hipStatusInfo g_status; -extern hipStatusInfo tl_status; -#if defined(RAJA_ENABLE_OPENMP) -#pragma omp threadprivate(tl_status) -#endif +thread_local extern hipStatusInfo tl_status; // stream to synchronization status: true synchronized, false running extern std::unordered_map g_stream_info_map; @@ -199,9 +195,7 @@ void synchronize_impl(::RAJA::resources::Hip res) { res.wait(); } RAJA_INLINE void synchronize() { -#if defined(RAJA_ENABLE_OPENMP) - lock_guard lock(detail::g_status.lock); -#endif + std::lock_guard lock(detail::g_status.lock); bool synchronize = false; for (auto& val : detail::g_stream_info_map) { @@ -221,9 +215,7 @@ void synchronize() RAJA_INLINE void synchronize(::RAJA::resources::Hip res) { -#if defined(RAJA_ENABLE_OPENMP) - lock_guard lock(detail::g_status.lock); -#endif + std::lock_guard lock(detail::g_status.lock); auto iter = detail::g_stream_info_map.find(res.get_stream()); if (iter != detail::g_stream_info_map.end()) { @@ -243,9 +235,7 @@ void synchronize(::RAJA::resources::Hip res) RAJA_INLINE void launch(::RAJA::resources::Hip res, bool async = true) { -#if defined(RAJA_ENABLE_OPENMP) - lock_guard lock(detail::g_status.lock); -#endif + std::lock_guard lock(detail::g_status.lock); auto iter = detail::g_stream_info_map.find(res.get_stream()); if (iter != detail::g_stream_info_map.end()) { diff --git a/include/RAJA/policy/hip/atomic.hpp b/include/RAJA/policy/hip/atomic.hpp index 5d0a91e140..18a510de09 100644 --- a/include/RAJA/policy/hip/atomic.hpp +++ b/include/RAJA/policy/hip/atomic.hpp @@ -31,7 +31,8 @@ #include "RAJA/policy/sequential/atomic.hpp" #include "RAJA/policy/atomic_builtin.hpp" -#if defined(RAJA_ENABLE_OPENMP) + +#if defined(RAJA_OPENMP_ACTIVE) #include "RAJA/policy/openmp/atomic.hpp" #endif diff --git a/include/RAJA/policy/hip/forall.hpp b/include/RAJA/policy/hip/forall.hpp index f421eb5038..6f098a9a11 100644 --- a/include/RAJA/policy/hip/forall.hpp +++ b/include/RAJA/policy/hip/forall.hpp @@ -383,26 +383,15 @@ __launch_bounds__(BlockSize, 1) __global__ auto privatizer = thread_privatize(loop_body); auto& body = privatizer.get_priv(); auto ii = IterationGetter::template index(); - constexpr bool is_forall_parampack_empty = - 
RAJA::expt::type_traits::is_ForallParamPack_empty::value; + if (ii < length) { - if constexpr (is_forall_parampack_empty) - { - body(idx[ii]); - } - else - { - RAJA::expt::invoke_body(f_params, body, idx[ii]); - } - } - if constexpr (!is_forall_parampack_empty) - { - RAJA::expt::ParamMultiplexer::parampack_combine(EXEC_POL {}, f_params); + RAJA::expt::invoke_body(f_params, body, idx[ii]); } + + RAJA::expt::ParamMultiplexer::parampack_combine(EXEC_POL {}, f_params); } -/// template(); - constexpr bool is_forall_parampack_empty = - RAJA::expt::type_traits::is_ForallParamPack_empty::value; + ; if (ii < length) { - if constexpr (is_forall_parampack_empty) - { - body(idx[ii]); - } - else - { - RAJA::expt::invoke_body(f_params, body, idx[ii]); - } - } - if constexpr (!is_forall_parampack_empty) - { - RAJA::expt::ParamMultiplexer::parampack_combine(EXEC_POL {}, f_params); + RAJA::expt::invoke_body(f_params, body, idx[ii]); } + RAJA::expt::ParamMultiplexer::parampack_combine(EXEC_POL {}, f_params); } -/// template< typename EXEC_POL, typename Iterator, @@ -466,27 +443,15 @@ __launch_bounds__(BlockSize, 1) __global__ using RAJA::internal::thread_privatize; auto privatizer = thread_privatize(loop_body); auto& body = privatizer.get_priv(); - constexpr bool is_forall_parampack_empty = - RAJA::expt::type_traits::is_ForallParamPack_empty::value; + for (auto ii = IterationGetter::template index(); ii < length; ii += IterationGetter::template size()) { - if constexpr (is_forall_parampack_empty) - { - body(idx[ii]); - } - else - { - RAJA::expt::invoke_body(f_params, body, idx[ii]); - } - } - if constexpr (!is_forall_parampack_empty) - { - RAJA::expt::ParamMultiplexer::parampack_combine(EXEC_POL {}, f_params); + RAJA::expt::invoke_body(f_params, body, idx[ii]); } + RAJA::expt::ParamMultiplexer::parampack_combine(EXEC_POL {}, f_params); } -/// template< typename EXEC_POL, typename Iterator, @@ -509,24 +474,15 @@ __global__ void forallp_hip_kernel(const LOOP_BODY loop_body, using RAJA::internal::thread_privatize; auto privatizer = thread_privatize(loop_body); auto& body = privatizer.get_priv(); - constexpr bool is_forall_parampack_empty = - RAJA::expt::type_traits::is_ForallParamPack_empty::value; + for (auto ii = IterationGetter::template index(); ii < length; ii += IterationGetter::template size()) { - if constexpr (is_forall_parampack_empty) - { - body(idx[ii]); - } - else - { - RAJA::expt::invoke_body(f_params, body, idx[ii]); - } - } - if constexpr (!is_forall_parampack_empty) - { - RAJA::expt::ParamMultiplexer::parampack_combine(EXEC_POL {}, f_params); + + RAJA::expt::invoke_body(f_params, body, idx[ii]); } + + RAJA::expt::ParamMultiplexer::parampack_combine(EXEC_POL {}, f_params); } } // namespace impl @@ -576,8 +532,7 @@ forall_impl(resources::Hip hip_res, Iterator begin = std::begin(iter); Iterator end = std::end(iter); IndexType len = std::distance(begin, end); - constexpr bool is_forallparampack_empty = - RAJA::expt::type_traits::is_ForallParamPack_empty::value; + // Only launch kernel if we have something to iterate over if (len > 0) { @@ -605,11 +560,7 @@ forall_impl(resources::Hip hip_res, launch_info.res = hip_res; { - if constexpr (!is_forallparampack_empty) - { - RAJA::expt::ParamMultiplexer::parampack_init(pol, f_params, - launch_info); - } + RAJA::expt::ParamMultiplexer::parampack_init(pol, f_params, launch_info); // // Privatize the loop_body, using make_launch_body to setup reductions @@ -625,11 +576,9 @@ forall_impl(resources::Hip hip_res, (void*)&f_params}; RAJA::hip::launch(func, 
dims.blocks, dims.threads, args, shmem, hip_res, Async); - if constexpr (!is_forallparampack_empty) - { - RAJA::expt::ParamMultiplexer::parampack_resolve(pol, f_params, - launch_info); - } + + RAJA::expt::ParamMultiplexer::parampack_resolve(pol, f_params, + launch_info); } RAJA_FT_END; diff --git a/include/RAJA/policy/hip/launch.hpp b/include/RAJA/policy/hip/launch.hpp index 040386a324..f3ae8f87c1 100644 --- a/include/RAJA/policy/hip/launch.hpp +++ b/include/RAJA/policy/hip/launch.hpp @@ -28,22 +28,6 @@ namespace RAJA { -template -__global__ void launch_global_fcn(const BODY body_in) -{ - LaunchContext ctx; - - using RAJA::internal::thread_privatize; - auto privatizer = thread_privatize(body_in); - auto& body = privatizer.get_priv(); - - // Set pointer to shared memory - extern __shared__ char raja_shmem_ptr[]; - ctx.shared_mem_ptr = raja_shmem_ptr; - - body(ctx); -} - template __global__ void launch_new_reduce_global_fcn(const BODY body_in, ReduceParams reduce_params) @@ -73,71 +57,7 @@ struct LaunchExecute< template static concepts::enable_if_t< resources::EventProxy, - RAJA::expt::type_traits::is_ForallParamPack, - RAJA::expt::type_traits::is_ForallParamPack_empty> - exec(RAJA::resources::Resource res, - const LaunchParams& params, - BODY_IN&& body_in, - ReduceParams& RAJA_UNUSED_ARG(launch_reducers)) - { - using BODY = camp::decay; - - auto func = reinterpret_cast(&launch_global_fcn); - - resources::Hip hip_res = res.get(); - - // - // Compute the number of blocks and threads - // - - hip_dim_t gridSize {static_cast(params.teams.value[0]), - static_cast(params.teams.value[1]), - static_cast(params.teams.value[2])}; - - hip_dim_t blockSize { - static_cast(params.threads.value[0]), - static_cast(params.threads.value[1]), - static_cast(params.threads.value[2])}; - - // Only launch kernel if we have something to iterate over - constexpr hip_dim_member_t zero = 0; - if (gridSize.x > zero && gridSize.y > zero && gridSize.z > zero && - blockSize.x > zero && blockSize.y > zero && blockSize.z > zero) - { - - RAJA_FT_BEGIN; - - { - size_t shared_mem_size = params.shared_mem_size; - - // - // Privatize the loop_body, using make_launch_body to setup reductions - // - BODY body = RAJA::hip::make_launch_body(func, gridSize, blockSize, - shared_mem_size, hip_res, - std::forward(body_in)); - - // - // Launch the kernel - // - void* args[] = {(void*)&body}; - RAJA::hip::launch(func, gridSize, blockSize, args, shared_mem_size, - hip_res, async); - } - - RAJA_FT_END; - } - - return resources::EventProxy(res); - } - - // Version with explicit reduction parameters.. 
- template - static concepts::enable_if_t< - resources::EventProxy, - RAJA::expt::type_traits::is_ForallParamPack, - concepts::negate< - RAJA::expt::type_traits::is_ForallParamPack_empty>> + RAJA::expt::type_traits::is_ForallParamPack> exec(RAJA::resources::Resource res, const LaunchParams& launch_params, BODY_IN&& body_in, @@ -212,23 +132,6 @@ struct LaunchExecute< } }; -template -__launch_bounds__(num_threads, 1) __global__ - void launch_global_fcn_fixed(const BODY body_in) -{ - LaunchContext ctx; - - using RAJA::internal::thread_privatize; - auto privatizer = thread_privatize(body_in); - auto& body = privatizer.get_priv(); - - // Set pointer to shared memory - extern __shared__ char raja_shmem_ptr[]; - ctx.shared_mem_ptr = raja_shmem_ptr; - - body(ctx); -} - template __launch_bounds__(num_threads, 1) __global__ void launch_new_reduce_global_fcn_fixed(const BODY body_in, @@ -258,72 +161,7 @@ struct LaunchExecute> template static concepts::enable_if_t< resources::EventProxy, - RAJA::expt::type_traits::is_ForallParamPack, - RAJA::expt::type_traits::is_ForallParamPack_empty> - exec(RAJA::resources::Resource res, - const LaunchParams& params, - BODY_IN&& body_in, - ReduceParams& RAJA_UNUSED_ARG(launch_reducers)) - { - using BODY = camp::decay; - - auto func = - reinterpret_cast(&launch_global_fcn_fixed); - - resources::Hip hip_res = res.get(); - - // - // Compute the number of blocks and threads - // - - hip_dim_t gridSize {static_cast(params.teams.value[0]), - static_cast(params.teams.value[1]), - static_cast(params.teams.value[2])}; - - hip_dim_t blockSize { - static_cast(params.threads.value[0]), - static_cast(params.threads.value[1]), - static_cast(params.threads.value[2])}; - - // Only launch kernel if we have something to iterate over - constexpr hip_dim_member_t zero = 0; - if (gridSize.x > zero && gridSize.y > zero && gridSize.z > zero && - blockSize.x > zero && blockSize.y > zero && blockSize.z > zero) - { - - RAJA_FT_BEGIN; - - { - size_t shared_mem_size = params.shared_mem_size; - - // - // Privatize the loop_body, using make_launch_body to setup reductions - // - BODY body = RAJA::hip::make_launch_body(func, gridSize, blockSize, - shared_mem_size, hip_res, - std::forward(body_in)); - - // - // Launch the kernel - // - void* args[] = {(void*)&body}; - RAJA::hip::launch(func, gridSize, blockSize, args, shared_mem_size, - hip_res, async); - } - - RAJA_FT_END; - } - - return resources::EventProxy(res); - } - - // Version with explicit reduction parameters.. 
- template - static concepts::enable_if_t< - resources::EventProxy, - RAJA::expt::type_traits::is_ForallParamPack, - concepts::negate< - RAJA::expt::type_traits::is_ForallParamPack_empty>> + RAJA::expt::type_traits::is_ForallParamPack> exec(RAJA::resources::Resource res, const LaunchParams& launch_params, BODY_IN&& body_in, diff --git a/include/RAJA/policy/hip/multi_reduce.hpp b/include/RAJA/policy/hip/multi_reduce.hpp index e374191324..41f0772764 100644 --- a/include/RAJA/policy/hip/multi_reduce.hpp +++ b/include/RAJA/policy/hip/multi_reduce.hpp @@ -27,14 +27,15 @@ #include #include +#include #include #include + #include "hip/hip_runtime.h" #include "RAJA/util/macros.hpp" #include "RAJA/util/math.hpp" -#include "RAJA/util/mutex.hpp" #include "RAJA/util/types.hpp" #include "RAJA/util/reduce.hpp" #include "RAJA/util/OffsetOperators.hpp" @@ -54,6 +55,8 @@ #include "RAJA/policy/hip/policy.hpp" #include "RAJA/policy/hip/raja_hiperrchk.hpp" +#include "RAJA/pattern/thread.hpp" + namespace RAJA { @@ -204,7 +207,10 @@ RAJA_DEVICE RAJA_INLINE void grid_multi_reduce_shmem_to_global_atomic( // //! MultiReduction data for Hip Offload -- stores value, host pointer -template +template struct MultiReduceGridAtomicHostInit_TallyData { //! setup permanent settings, allocate and initialize tally memory @@ -319,10 +325,7 @@ struct MultiReduceGridAtomicHostInit_TallyData static int get_tally_replication() { - int min_tally_replication = 1; -#if defined(RAJA_ENABLE_OPENMP) - min_tally_replication = omp_get_max_threads(); -#endif + int min_tally_replication = RAJA::get_max_threads(); struct { @@ -410,7 +413,10 @@ struct MultiReduceGridAtomicHostInit_TallyData }; //! MultiReduction data for Hip Offload -- stores value, host pointer -template +template struct MultiReduceGridAtomicHostInit_Data : MultiReduceGridAtomicHostInit_TallyData { @@ -451,10 +457,7 @@ struct MultiReduceGridAtomicHostInit_Data //! combine value on host, combine a value into the tally void combine_host(int bin, T value) { - int tally_rep = 0; -#if defined(RAJA_ENABLE_OPENMP) - tally_rep = omp_get_thread_num(); -#endif + int tally_rep = RAJA::get_thread_num(); int tally_offset = GetTallyOffset {}(bin, m_tally_bins, tally_rep, m_tally_replication); Combiner {}(m_tally_mem[tally_offset], value); @@ -472,7 +475,10 @@ struct MultiReduceGridAtomicHostInit_Data }; //! MultiReduction data for Hip Offload -- stores value, host pointer -template +template struct MultiReduceBlockThenGridAtomicHostInit_Data : MultiReduceGridAtomicHostInit_TallyData { @@ -597,10 +603,7 @@ struct MultiReduceBlockThenGridAtomicHostInit_Data //! combine value on host, combine a value into the tally void combine_host(int bin, T value) { - int tally_rep = 0; -#if defined(RAJA_ENABLE_OPENMP) - tally_rep = omp_get_thread_num(); -#endif + int tally_rep = RAJA::get_thread_num(); int tally_offset = GetTallyOffset {}(bin, m_tally_bins, tally_rep, m_tally_replication); Combiner {}(m_tally_mem[tally_offset], value); diff --git a/include/RAJA/policy/hip/reduce.hpp b/include/RAJA/policy/hip/reduce.hpp index f67ba380e2..f2f15468ae 100644 --- a/include/RAJA/policy/hip/reduce.hpp +++ b/include/RAJA/policy/hip/reduce.hpp @@ -26,6 +26,7 @@ #if defined(RAJA_ENABLE_HIP) #include +#include #include @@ -33,7 +34,6 @@ #include "RAJA/util/SoAArray.hpp" #include "RAJA/util/SoAPtr.hpp" #include "RAJA/util/basic_mempool.hpp" -#include "RAJA/util/mutex.hpp" #include "RAJA/util/types.hpp" #include "RAJA/util/reduce.hpp" @@ -593,9 +593,7 @@ class PinnedTally //! 
get new value for use in resource auto new_value(::RAJA::resources::Hip res) -> T (&)[num_slots] { -#if defined(RAJA_ENABLE_OPENMP) - lock_guard lock(m_mutex); -#endif + std::lock_guard lock(m_mutex); ResourceNode* rn = resource_list; while (rn) { @@ -645,9 +643,7 @@ class PinnedTally ~PinnedTally() { free_list(); } -#if defined(RAJA_ENABLE_OPENMP) - omp::mutex m_mutex; -#endif + std::mutex m_mutex; private: ResourceNode* resource_list; @@ -1091,9 +1087,7 @@ class Reduce { if (val.value != val.identity) { -#if defined(RAJA_ENABLE_OPENMP) - lock_guard lock(tally_or_val_ptr.list->m_mutex); -#endif + std::lock_guard lock(tally_or_val_ptr.list->m_mutex); parent->combine(val.value); } } diff --git a/include/RAJA/policy/openmp.hpp b/include/RAJA/policy/openmp.hpp index 5fe73ceff0..21f760d516 100644 --- a/include/RAJA/policy/openmp.hpp +++ b/include/RAJA/policy/openmp.hpp @@ -42,6 +42,7 @@ #include "RAJA/policy/openmp/scan.hpp" #include "RAJA/policy/openmp/sort.hpp" #include "RAJA/policy/openmp/synchronize.hpp" +#include "RAJA/policy/openmp/thread.hpp" #include "RAJA/policy/openmp/launch.hpp" #include "RAJA/policy/openmp/WorkGroup.hpp" #include "RAJA/policy/openmp/params/reduce.hpp" diff --git a/include/RAJA/policy/openmp/forall.hpp b/include/RAJA/policy/openmp/forall.hpp index 5887bc20bb..27b3b9760e 100644 --- a/include/RAJA/policy/openmp/forall.hpp +++ b/include/RAJA/policy/openmp/forall.hpp @@ -33,7 +33,7 @@ #include "RAJA/util/types.hpp" #include "RAJA/internal/fault_tolerance.hpp" -#include "RAJA/pattern/kernel/type-traits.hpp" +#include "RAJA/pattern/kernel/TypeTraits.hpp" #include "RAJA/index/IndexSet.hpp" #include "RAJA/index/ListSegment.hpp" diff --git a/include/RAJA/policy/openmp/launch.hpp b/include/RAJA/policy/openmp/launch.hpp index 2092c87bb3..2ba8066fe9 100644 --- a/include/RAJA/policy/openmp/launch.hpp +++ b/include/RAJA/policy/openmp/launch.hpp @@ -28,68 +28,69 @@ template<> struct LaunchExecute { - template - static concepts::enable_if_t< - resources::EventProxy, - RAJA::expt::type_traits::is_ForallParamPack, - RAJA::expt::type_traits::is_ForallParamPack_empty> - exec(RAJA::resources::Resource res, - LaunchParams const& params, - BODY const& body, - ReduceParams& RAJA_UNUSED_ARG(launch_reducers)) - { - RAJA::region([&]() { - LaunchContext ctx; - - using RAJA::internal::thread_privatize; - auto loop_body = thread_privatize(body); - - ctx.shared_mem_ptr = (char*)malloc(params.shared_mem_size); - - loop_body.get_priv()(ctx); - - free(ctx.shared_mem_ptr); - ctx.shared_mem_ptr = nullptr; - }); - - return resources::EventProxy(res); - } template static concepts::enable_if_t< resources::EventProxy, - RAJA::expt::type_traits::is_ForallParamPack, - concepts::negate< - RAJA::expt::type_traits::is_ForallParamPack_empty>> + RAJA::expt::type_traits::is_ForallParamPack> exec(RAJA::resources::Resource res, LaunchParams const& launch_params, BODY const& body, ReduceParams& f_params) { + using RAJA::internal::thread_privatize; + constexpr bool has_reducers = + !RAJA::expt::type_traits::is_ForallParamPack_empty::value; using EXEC_POL = RAJA::omp_launch_t; EXEC_POL pol {}; + using BodyType = decltype(thread_privatize(body)); - expt::ParamMultiplexer::parampack_init(pol, f_params); - - // reducer object must be named f_params as expected by macro below - RAJA_OMP_DECLARE_REDUCTION_COMBINE; - -#pragma omp parallel reduction(combine : f_params) - { - + auto parallel_section = [&](ReduceParams& f_params, auto func) { LaunchContext ctx; - - using RAJA::internal::thread_privatize; auto loop_body = 
thread_privatize(body); + static_assert(std::is_invocable::value, + "Internal RAJA error: Check the parallel kernel passed to " + "OpenMP Parallel section in openmp/launch.hpp"); ctx.shared_mem_ptr = (char*)malloc(launch_params.shared_mem_size); - expt::invoke_body(f_params, loop_body.get_priv(), ctx); + func(f_params, loop_body, ctx); free(ctx.shared_mem_ptr); ctx.shared_mem_ptr = nullptr; - } + }; + // Init reducers if present + expt::ParamMultiplexer::parampack_init(pol, f_params); + + // reducer object must be named f_params as expected by macro below + if constexpr (has_reducers) + { + RAJA_OMP_DECLARE_REDUCTION_COMBINE; +#pragma omp parallel reduction(combine : f_params) + { + // This "extra lambda" has to be declared within the scope of the OpenMP + // pragma so that the reduction parameter pack it operates on is the + // version tracked by the combine OpenMP syntax + auto parallel_kernel = [&](ReduceParams& f_params, BodyType& body, + LaunchContext& ctx) { + expt::invoke_body(f_params, body.get_priv(), ctx); + }; + parallel_section(f_params, parallel_kernel); + } + } + else + { + RAJA::region([&]() { + auto parallel_kernel = [&](ReduceParams&, BodyType& body, + LaunchContext& ctx) { + body.get_priv()(ctx); + }; + parallel_section(f_params, parallel_kernel); + }); + } + // Resolve reducers if present expt::ParamMultiplexer::parampack_resolve(pol, f_params); return resources::EventProxy(res); diff --git a/include/RAJA/policy/openmp/params/forall.hpp b/include/RAJA/policy/openmp/params/forall.hpp index a80e6ff5af..0a99133ad6 100644 --- a/include/RAJA/policy/openmp/params/forall.hpp +++ b/include/RAJA/policy/openmp/params/forall.hpp @@ -64,8 +64,7 @@ template class ExecPol, int ChunkSize, typename ForallParam> RAJA_INLINE concepts::enable_if< - std::is_same, RAJA::policy::omp::Static>, - std::integral_constant> + std::is_same, RAJA::policy::omp::Static>> forall_impl(const ExecPol& p, Iterable&& iter, Func&& loop_body, @@ -83,48 +82,21 @@ forall_impl(const ExecPol& p, using RAJA::internal::thread_privatize; auto body = thread_privatize(loop_body); -#pragma omp for schedule(static) reduction(combine : f_params) - for (decltype(distance_it) i = 0; i < distance_it; ++i) + if constexpr (ChunkSize > 0) { - RAJA::expt::invoke_body(f_params, body.get_priv(), begin_it[i]); - } - } - - RAJA::expt::ParamMultiplexer::parampack_resolve(p, f_params); -} - -// -// omp for schedule(static, ChunkSize) -// -template class ExecPol, - typename Iterable, - typename Func, - int ChunkSize, - typename ForallParam> -RAJA_INLINE concepts::enable_if< - std::is_same, RAJA::policy::omp::Static>, - std::integral_constant 0)>> -forall_impl(const ExecPol& p, - Iterable&& iter, - Func&& loop_body, - ForallParam&& f_params) -{ - using EXEC_POL = camp::decay; - - RAJA::expt::ParamMultiplexer::parampack_init(p, f_params); - RAJA_OMP_DECLARE_REDUCTION_COMBINE; - - RAJA_EXTRACT_BED_IT(iter); -#pragma omp parallel - { - - using RAJA::internal::thread_privatize; - auto body = thread_privatize(loop_body); - #pragma omp for schedule(static, ChunkSize) reduction(combine : f_params) - for (decltype(distance_it) i = 0; i < distance_it; ++i) + for (decltype(distance_it) i = 0; i < distance_it; ++i) + { + RAJA::expt::invoke_body(f_params, body.get_priv(), begin_it[i]); + } + } + else { - RAJA::expt::invoke_body(f_params, body.get_priv(), begin_it[i]); +#pragma omp for schedule(static) reduction(combine : f_params) + for (decltype(distance_it) i = 0; i < distance_it; ++i) + { + RAJA::expt::invoke_body(f_params, body.get_priv(), 
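The OpenMP launch path keeps RAJA_OMP_DECLARE_REDUCTION_COMBINE and the "reduction(combine : f_params)" clause only in the reducer branch, and the combiner is declared in the same scope as the pragma so it operates on the privatized pack. A standalone example of the underlying OpenMP user-defined reduction mechanism on a plain struct (this is the general feature, not RAJA's macro):

    struct Accum
    {
      double v = 0.0;
      void combine(const Accum& other) { v += other.v; }
    };

    // User-defined reduction named "combine"; omp_out/omp_in are the two
    // private copies being merged. With no initializer clause, each private
    // copy is default-constructed.
    #pragma omp declare reduction(combine : Accum : omp_out.combine(omp_in))

    double sum_of_squares(const double* x, int n)
    {
      Accum a;
    #pragma omp parallel for reduction(combine : a)
      for (int i = 0; i < n; ++i)
      {
        a.v += x[i] * x[i];
      }
      return a.v;
    }
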
begin_it[i]); + } } } @@ -193,49 +165,10 @@ RAJA_INLINE void forall_impl_nowait(const ::RAJA::policy::omp::Auto& p, RAJA::expt::ParamMultiplexer::parampack_resolve(p, f_params); } -// -// omp for schedule(dynamic) -// -template::type* = nullptr> -RAJA_INLINE void forall_impl(const ::RAJA::policy::omp::Dynamic& p, - Iterable&& iter, - Func&& loop_body, - ForallParam&& f_params) -{ - using EXEC_POL = camp::decay; - - RAJA::expt::ParamMultiplexer::parampack_init(p, f_params); - RAJA_OMP_DECLARE_REDUCTION_COMBINE; - - RAJA_EXTRACT_BED_IT(iter); -#pragma omp parallel - { - - using RAJA::internal::thread_privatize; - auto body = thread_privatize(loop_body); - -#pragma omp for schedule(dynamic) reduction(combine : f_params) - for (decltype(distance_it) i = 0; i < distance_it; ++i) - { - RAJA::expt::invoke_body(f_params, body.get_priv(), begin_it[i]); - } - } - - RAJA::expt::ParamMultiplexer::parampack_resolve(p, f_params); -} - // // omp for schedule(dynamic, ChunkSize) // -template 0)>::type* = nullptr> +template RAJA_INLINE void forall_impl(const ::RAJA::policy::omp::Dynamic& p, Iterable&& iter, Func&& loop_body, @@ -252,25 +185,32 @@ RAJA_INLINE void forall_impl(const ::RAJA::policy::omp::Dynamic& p, using RAJA::internal::thread_privatize; auto body = thread_privatize(loop_body); - + if constexpr (ChunkSize > 0) + { #pragma omp for schedule(dynamic, ChunkSize) reduction(combine : f_params) - for (decltype(distance_it) i = 0; i < distance_it; ++i) + for (decltype(distance_it) i = 0; i < distance_it; ++i) + { + RAJA::expt::invoke_body(f_params, body.get_priv(), begin_it[i]); + } + } + else { - RAJA::expt::invoke_body(f_params, body.get_priv(), begin_it[i]); +#pragma omp for schedule(dynamic) reduction(combine : f_params) + for (decltype(distance_it) i = 0; i < distance_it; ++i) + { + RAJA::expt::invoke_body(f_params, body.get_priv(), begin_it[i]); + } } } + RAJA::expt::ParamMultiplexer::parampack_resolve(p, f_params); } // // omp for schedule(guided) // -template::type* = nullptr> +template RAJA_INLINE void forall_impl(const ::RAJA::policy::omp::Guided& p, Iterable&& iter, Func&& loop_body, @@ -287,46 +227,21 @@ RAJA_INLINE void forall_impl(const ::RAJA::policy::omp::Guided& p, using RAJA::internal::thread_privatize; auto body = thread_privatize(loop_body); - -#pragma omp for schedule(guided) reduction(combine : f_params) - for (decltype(distance_it) i = 0; i < distance_it; ++i) + if constexpr (ChunkSize <= 0) { - RAJA::expt::invoke_body(f_params, body.get_priv(), begin_it[i]); +#pragma omp for schedule(guided) reduction(combine : f_params) + for (decltype(distance_it) i = 0; i < distance_it; ++i) + { + RAJA::expt::invoke_body(f_params, body.get_priv(), begin_it[i]); + } } - } - - RAJA::expt::ParamMultiplexer::parampack_resolve(p, f_params); -} - -// -// omp for schedule(guided, ChunkSize) -// -template 0)>::type* = nullptr> -RAJA_INLINE void forall_impl(const ::RAJA::policy::omp::Guided& p, - Iterable&& iter, - Func&& loop_body, - ForallParam&& f_params) -{ - using EXEC_POL = camp::decay; - - RAJA::expt::ParamMultiplexer::parampack_init(p, f_params); - RAJA_OMP_DECLARE_REDUCTION_COMBINE; - - RAJA_EXTRACT_BED_IT(iter); -#pragma omp parallel - { - - using RAJA::internal::thread_privatize; - auto body = thread_privatize(loop_body); - -#pragma omp for schedule(guided, ChunkSize) reduction(combine : f_params) - for (decltype(distance_it) i = 0; i < distance_it; ++i) + else { - RAJA::expt::invoke_body(f_params, body.get_priv(), begin_it[i]); +#pragma omp for schedule(guided, ChunkSize) 
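The duplicated schedule(static/dynamic/guided) and schedule(..., ChunkSize) overloads collapse into single functions that branch on ChunkSize with if constexpr, exactly as in the hunks above; the discarded branch and its pragma never apply. A reduced sketch of the same dispatch on a hypothetical loop:

    template <int ChunkSize>
    void scaled_add(double* y, const double* x, double a, int n)
    {
    #pragma omp parallel
      {
        if constexpr (ChunkSize > 0)
        {
    #pragma omp for schedule(dynamic, ChunkSize)
          for (int i = 0; i < n; ++i) { y[i] += a * x[i]; }
        }
        else
        {
    #pragma omp for schedule(dynamic)
          for (int i = 0; i < n; ++i) { y[i] += a * x[i]; }
        }
      }
    }
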
reduction(combine : f_params) + for (decltype(distance_it) i = 0; i < distance_it; ++i) + { + RAJA::expt::invoke_body(f_params, body.get_priv(), begin_it[i]); + } } } @@ -336,11 +251,7 @@ RAJA_INLINE void forall_impl(const ::RAJA::policy::omp::Guided& p, // // omp for schedule(static) nowait // -template::type* = nullptr> +template RAJA_INLINE void forall_impl_nowait( const ::RAJA::policy::omp::Static& p, Iterable&& iter, @@ -358,53 +269,25 @@ RAJA_INLINE void forall_impl_nowait( using RAJA::internal::thread_privatize; auto body = thread_privatize(loop_body); - -#pragma omp for schedule(static) nowait reduction(combine : f_params) - for (decltype(distance_it) i = 0; i < distance_it; ++i) + if constexpr (ChunkSize <= 0) { - RAJA::expt::invoke_body(f_params, body.get_priv(), begin_it[i]); +#pragma omp for schedule(static) nowait reduction(combine : f_params) + for (decltype(distance_it) i = 0; i < distance_it; ++i) + { + RAJA::expt::invoke_body(f_params, body.get_priv(), begin_it[i]); + } } - } - - RAJA::expt::ParamMultiplexer::parampack_resolve(p, f_params); -} - -// -// omp for schedule(static, ChunkSize) nowait -// -template 0)>::type* = nullptr> -RAJA_INLINE void forall_impl_nowait( - const ::RAJA::policy::omp::Static& p, - Iterable&& iter, - Func&& loop_body, - ForallParam&& f_params) -{ - using EXEC_POL = camp::decay; - - RAJA::expt::ParamMultiplexer::parampack_init(p, f_params); - RAJA_OMP_DECLARE_REDUCTION_COMBINE; - - RAJA_EXTRACT_BED_IT(iter); -#pragma omp parallel - { - - using RAJA::internal::thread_privatize; - auto body = thread_privatize(loop_body); - -#pragma omp for schedule(static, ChunkSize) nowait reduction(combine : f_params) - for (decltype(distance_it) i = 0; i < distance_it; ++i) + else { - RAJA::expt::invoke_body(f_params, body.get_priv(), begin_it[i]); +#pragma omp for schedule(static, ChunkSize) nowait reduction(combine : f_params) + for (decltype(distance_it) i = 0; i < distance_it; ++i) + { + RAJA::expt::invoke_body(f_params, body.get_priv(), begin_it[i]); + } } } - RAJA::expt::ParamMultiplexer::parampack_resolve(p, f_params); } - } // namespace internal template using omp_multi_reduce_tuning = @@ -385,6 +387,11 @@ using omp_multi_reduce = omp_multi_reduce_unordered; /// using policy::omp::omp_atomic; +/// +/// Type alias for thread +/// +using policy::omp::omp_thread; + /// /// Type aliases to simplify common omp parallel for loop execution /// diff --git a/include/RAJA/policy/openmp/thread.hpp b/include/RAJA/policy/openmp/thread.hpp new file mode 100644 index 0000000000..faf58d38ba --- /dev/null +++ b/include/RAJA/policy/openmp/thread.hpp @@ -0,0 +1,47 @@ +/*! + ****************************************************************************** + * + * \file + * + * \brief RAJA header file defining OpenMP thread operations. + * + ****************************************************************************** + */ + +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2016-25, Lawrence Livermore National Security, LLC +// and RAJA project contributors. See the RAJA/LICENSE file for details. 
+// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#ifndef RAJA_policy_openmp_thread_HPP +#define RAJA_policy_openmp_thread_HPP + +#include "RAJA/config.hpp" + +#if defined(RAJA_OPENMP_ACTIVE) + +#include "RAJA/util/macros.hpp" + +#include "RAJA/policy/thread_auto.hpp" + +namespace RAJA +{ + +template<> +RAJA_HOST_DEVICE RAJA_INLINE int get_max_threads(omp_thread) +{ + return omp_get_max_threads(); +} + +template<> +RAJA_HOST_DEVICE RAJA_INLINE int get_thread_num(omp_thread) +{ + return omp_get_thread_num(); +} + +} // namespace RAJA + +#endif // RAJA_ENABLE_OPENMP +#endif // guard diff --git a/include/RAJA/policy/sequential.hpp b/include/RAJA/policy/sequential.hpp index 4497db42c8..e9ac98afea 100644 --- a/include/RAJA/policy/sequential.hpp +++ b/include/RAJA/policy/sequential.hpp @@ -31,6 +31,7 @@ #include "RAJA/policy/sequential/multi_reduce.hpp" #include "RAJA/policy/sequential/scan.hpp" #include "RAJA/policy/sequential/sort.hpp" +#include "RAJA/policy/sequential/thread.hpp" #include "RAJA/policy/sequential/launch.hpp" #include "RAJA/policy/sequential/WorkGroup.hpp" #include "RAJA/policy/sequential/params/reduce.hpp" diff --git a/include/RAJA/policy/sequential/forall.hpp b/include/RAJA/policy/sequential/forall.hpp index 57bccc9f21..1fd9b08d1d 100644 --- a/include/RAJA/policy/sequential/forall.hpp +++ b/include/RAJA/policy/sequential/forall.hpp @@ -58,45 +58,33 @@ namespace sequential template RAJA_INLINE concepts::enable_if_t< resources::EventProxy, - expt::type_traits::is_ForallParamPack, - concepts::negate>> + expt::type_traits::is_ForallParamPack> forall_impl(resources::Host host_res, const seq_exec& pol, Iterable&& iter, Func&& body, ForallParam f_params) { + constexpr bool has_reducers = + !expt::type_traits::is_ForallParamPack_empty::value; expt::ParamMultiplexer::parampack_init(pol, f_params); RAJA_EXTRACT_BED_IT(iter); for (decltype(distance_it) i = 0; i < distance_it; ++i) { - expt::invoke_body(f_params, body, *(begin_it + i)); + if constexpr (has_reducers) + { + expt::invoke_body(f_params, body, *(begin_it + i)); + } + else + { + body(*(begin_it + i)); + } } - expt::ParamMultiplexer::parampack_resolve(pol, f_params); - return resources::EventProxy(host_res); -} - -template -RAJA_INLINE concepts::enable_if_t< - resources::EventProxy, - expt::type_traits::is_ForallParamPack, - expt::type_traits::is_ForallParamPack_empty> -forall_impl(resources::Host host_res, - const seq_exec&, - Iterable&& iter, - Func&& body, - ForallParam) -{ - RAJA_EXTRACT_BED_IT(iter); - - for (decltype(distance_it) i = 0; i < distance_it; ++i) - { - body(*(begin_it + i)); - } + expt::ParamMultiplexer::parampack_resolve(pol, f_params); return resources::EventProxy(host_res); } diff --git a/include/RAJA/policy/sequential/policy.hpp b/include/RAJA/policy/sequential/policy.hpp index ed735ba10d..9ead6375ae 100644 --- a/include/RAJA/policy/sequential/policy.hpp +++ b/include/RAJA/policy/sequential/policy.hpp @@ -124,6 +124,9 @@ struct seq_multi_reduce_policy : make_policy_pattern_launch_platform_t< struct seq_atomic {}; +struct seq_thread +{}; + template using seq_multi_reduce_tuning = seq_multi_reduce_policy>; @@ -147,6 +150,7 @@ using policy::sequential::seq_multi_reduce; using policy::sequential::seq_reduce; using policy::sequential::seq_region; using policy::sequential::seq_segit; +using policy::sequential::seq_thread; using policy::sequential::seq_work; diff --git a/include/RAJA/policy/sequential/thread.hpp 
b/include/RAJA/policy/sequential/thread.hpp new file mode 100644 index 0000000000..a181b2bbaf --- /dev/null +++ b/include/RAJA/policy/sequential/thread.hpp @@ -0,0 +1,44 @@ +/*! + ****************************************************************************** + * + * \file + * + * \brief RAJA header file defining sequential thread operations. + * + ****************************************************************************** + */ + +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2016-25, Lawrence Livermore National Security, LLC +// and RAJA project contributors. See the RAJA/LICENSE file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#ifndef RAJA_policy_sequential_thread_HPP +#define RAJA_policy_sequential_thread_HPP + +#include "RAJA/config.hpp" + +#include "RAJA/util/macros.hpp" + +#include "RAJA/policy/thread_auto.hpp" + +namespace RAJA +{ +template<> +RAJA_HOST_DEVICE RAJA_INLINE int get_max_threads(seq_thread) +{ + return 1; +} + +template<> +RAJA_HOST_DEVICE RAJA_INLINE int get_thread_num(seq_thread) +{ + return 0; +} + +} // namespace RAJA + + +#endif // guard diff --git a/include/RAJA/policy/sycl/MemUtils_SYCL.hpp b/include/RAJA/policy/sycl/MemUtils_SYCL.hpp index 29bebfac9e..0062eb322d 100644 --- a/include/RAJA/policy/sycl/MemUtils_SYCL.hpp +++ b/include/RAJA/policy/sycl/MemUtils_SYCL.hpp @@ -28,11 +28,12 @@ #include #include #include +#include #include #include + #include "RAJA/util/basic_mempool.hpp" -#include "RAJA/util/mutex.hpp" #include "RAJA/util/types.hpp" #include "RAJA/policy/sycl/policy.hpp" @@ -53,15 +54,11 @@ struct syclInfo sycl_dim_t blockDim {0}; ::sycl::queue qu = ::sycl::queue(); bool setup_reducers = false; -#if defined(RAJA_ENABLE_OPENMP) - syclInfo* thread_states = nullptr; - omp::mutex lock; -#endif }; extern syclInfo g_status; -extern syclInfo tl_status; +thread_local extern syclInfo tl_status; extern std::unordered_map<::sycl::queue, bool> g_queue_info_map; diff --git a/include/RAJA/policy/thread_auto.hpp b/include/RAJA/policy/thread_auto.hpp new file mode 100644 index 0000000000..f5b69b7897 --- /dev/null +++ b/include/RAJA/policy/thread_auto.hpp @@ -0,0 +1,60 @@ +/*! + ****************************************************************************** + * + * \file + * + * \brief RAJA header file defining automatic thread operations. + * + ****************************************************************************** + */ + +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2016-25, Lawrence Livermore National Security, LLC +// and RAJA project contributors. See the RAJA/LICENSE file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#ifndef RAJA_policy_thread_auto_HPP +#define RAJA_policy_thread_auto_HPP + +#include "RAJA/config.hpp" + +#include "RAJA/util/macros.hpp" + +#ifdef RAJA_OPENMP_ACTIVE +#include "RAJA/policy/openmp/policy.hpp" +#endif + +#include "RAJA/policy/sequential/policy.hpp" + +namespace RAJA +{ + +namespace detail +{ +/*! + * Provides priority between thread policies that should do the "right thing" + * + * If OpenMP is active we always use the omp_thread. 
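// ---------------------------------------------------------------------------
// Illustrative sketch, not part of the patch: the new omp_thread / seq_thread
// policies added in these files expose thread queries through tag dispatch,
// and thread_auto.hpp picks the OpenMP-backed tag when it is available.  The
// tag and function names below are stand-ins for that pattern, not RAJA's
// actual declarations.
// ---------------------------------------------------------------------------
#include <cstdio>
#if defined(_OPENMP)
#include <omp.h>
#endif

struct seq_tag {};                       // sequential fallback: a single "thread"
inline int max_threads(seq_tag) { return 1; }
inline int thread_num(seq_tag) { return 0; }

#if defined(_OPENMP)
struct omp_tag {};                       // OpenMP-backed implementation
inline int max_threads(omp_tag) { return omp_get_max_threads(); }
inline int thread_num(omp_tag) { return omp_get_thread_num(); }
using auto_tag = omp_tag;                // analogous to detail::active_auto_thread
#else
using auto_tag = seq_tag;                // no OpenMP: fall back to the sequential tag
#endif

int main()
{
  // Generic code asks the "auto" policy and gets the right answer per build.
  std::printf("max threads: %d\n", max_threads(auto_tag{}));
  std::printf("this thread: %d\n", thread_num(auto_tag{}));
  return 0;
}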
+ * + * Fallback to seq_thread, which performs non-thread operations + * assumes there is no thread safety issues + */ +#if defined(RAJA_OPENMP_ACTIVE) +using active_auto_thread = RAJA::omp_thread; +#else +using active_auto_thread = RAJA::seq_thread; +#endif + +} // namespace detail + +template +RAJA_HOST_DEVICE RAJA_INLINE int get_max_threads(AtomicPolicy); + +template +RAJA_HOST_DEVICE RAJA_INLINE int get_thread_num(AtomicPolicy); + +} // namespace RAJA + +#endif diff --git a/include/RAJA/util/Layout.hpp b/include/RAJA/util/Layout.hpp index 9bcc1ce132..6aa6d63336 100644 --- a/include/RAJA/util/Layout.hpp +++ b/include/RAJA/util/Layout.hpp @@ -77,9 +77,9 @@ struct LayoutBase_impl, IdxLin, StrideOneDim> using IndexLinear = IdxLin; using IndexRange = camp::make_idx_seq_t; - static constexpr size_t n_dims = sizeof...(RangeInts); - static constexpr IdxLin limit = RAJA::operators::limits::max(); - static constexpr ptrdiff_t stride_one_dim = StrideOneDim; + static inline constexpr size_t n_dims = sizeof...(RangeInts); + static inline constexpr IdxLin limit = RAJA::operators::limits::max(); + static inline constexpr ptrdiff_t stride_one_dim = StrideOneDim; IdxLin sizes[n_dims] = {0}; IdxLin strides[n_dims] = {0}; @@ -90,11 +90,12 @@ struct LayoutBase_impl, IdxLin, StrideOneDim> /*! * Default constructor with zero sizes and strides. */ - constexpr RAJA_INLINE LayoutBase_impl() = default; - constexpr RAJA_INLINE LayoutBase_impl(LayoutBase_impl const&) = default; - constexpr RAJA_INLINE LayoutBase_impl(LayoutBase_impl&&) = default; - RAJA_INLINE LayoutBase_impl& operator=(LayoutBase_impl const&) = default; - RAJA_INLINE LayoutBase_impl& operator=(LayoutBase_impl&&) = default; + RAJA_INLINE constexpr LayoutBase_impl() = default; + RAJA_INLINE constexpr LayoutBase_impl(LayoutBase_impl const&) = default; + RAJA_INLINE constexpr LayoutBase_impl(LayoutBase_impl&&) = default; + RAJA_INLINE constexpr LayoutBase_impl& operator=(LayoutBase_impl const&) = + default; + RAJA_INLINE constexpr LayoutBase_impl& operator=(LayoutBase_impl&&) = default; /*! * Construct a layout given the size of each dimension. @@ -116,10 +117,10 @@ struct LayoutBase_impl, IdxLin, StrideOneDim> * Templated copy ctor from simillar layout. */ template - constexpr RAJA_INLINE RAJA_HOST_DEVICE - LayoutBase_impl(const LayoutBase_impl, - CIdxLin, - CStrideOneDim>& rhs) + RAJA_INLINE RAJA_HOST_DEVICE constexpr LayoutBase_impl( + const LayoutBase_impl, + CIdxLin, + CStrideOneDim>& rhs) : sizes {static_cast(rhs.sizes[RangeInts])...}, strides {static_cast(rhs.strides[RangeInts])...}, inv_strides {static_cast(rhs.inv_strides[RangeInts])...}, @@ -267,12 +268,6 @@ struct LayoutBase_impl, IdxLin, StrideOneDim> } }; -template -constexpr size_t - LayoutBase_impl, IdxLin, StrideOneDim>::n_dims; -template -constexpr IdxLin - LayoutBase_impl, IdxLin, StrideOneDim>::limit; } // namespace detail /*! 
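// ---------------------------------------------------------------------------
// Illustrative sketch, not part of the patch: the Layout.hpp hunks above turn
// n_dims / limit / stride_one_dim into C++17 `static inline constexpr`
// members, which is why the out-of-class definitions at the bottom of the
// struct can be deleted.  The toy class template below shows the rule; the
// names are hypothetical.
// ---------------------------------------------------------------------------
#include <cstddef>
#include <cstdio>

template <typename T>
struct LayoutSketch
{
  // Since C++17 this is a definition, so the member can be ODR-used (e.g.
  // have its address taken) without any separate definition.
  static inline constexpr std::size_t n_dims = 3;

  // Pre-C++17, some translation unit would also have needed:
  //   template <typename T> constexpr std::size_t LayoutSketch<T>::n_dims;
};

int main()
{
  const std::size_t* p = &LayoutSketch<int>::n_dims;  // ODR-use, still links
  std::printf("n_dims = %zu\n", *p);
  return 0;
}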
@@ -400,8 +395,8 @@ struct TypedLayout, StrideOne> * */ template -RAJA_INLINE Layout make_stride_one( - Layout const& l) +RAJA_INLINE RAJA_HOST_DEVICE constexpr Layout +make_stride_one(Layout const& l) { return Layout(l); } @@ -411,8 +406,8 @@ RAJA_INLINE Layout make_stride_one( * */ template -RAJA_INLINE TypedLayout make_stride_one( - TypedLayout const& l) +RAJA_INLINE RAJA_HOST_DEVICE constexpr TypedLayout +make_stride_one(TypedLayout const& l) { // strip l to it's base-class type using Base = typename TypedLayout::Base; diff --git a/include/RAJA/util/OffsetLayout.hpp b/include/RAJA/util/OffsetLayout.hpp index a5f27b12bf..4237614a15 100644 --- a/include/RAJA/util/OffsetLayout.hpp +++ b/include/RAJA/util/OffsetLayout.hpp @@ -49,14 +49,14 @@ struct OffsetLayout_impl, IdxLin> using Base = RAJA::detail::LayoutBase_impl; Base base_; - static constexpr camp::idx_t stride_one_dim = Base::stride_one_dim; + static inline constexpr camp::idx_t stride_one_dim = Base::stride_one_dim; - static constexpr size_t n_dims = sizeof...(RangeInts); - IdxLin offsets[n_dims] = {0}; // If not specified set to zero + static inline constexpr size_t n_dims = sizeof...(RangeInts); + IdxLin offsets[n_dims] = {0}; // If not specified set to zero - constexpr RAJA_INLINE OffsetLayout_impl( - std::array begin, - std::array end) + constexpr RAJA_INLINE RAJA_HOST_DEVICE + OffsetLayout_impl(std::array begin, + std::array end) : base_ {(end[RangeInts] - begin[RangeInts])...}, offsets {begin[RangeInts]...} {} @@ -66,7 +66,8 @@ struct OffsetLayout_impl, IdxLin> offsets {c.offsets[RangeInts]...} {} - void shift(std::array shift) + constexpr RAJA_INLINE RAJA_HOST_DEVICE void shift( + std::array shift) { for (size_t i = 0; i < n_dims; ++i) offsets[i] += shift[i]; @@ -116,10 +117,11 @@ struct OffsetLayout_impl, IdxLin> camp::sink((indices = (offsets[RangeInts] + indices))...); } - static RAJA_INLINE OffsetLayout_impl - from_layout_and_offsets( - const std::array& offsets_in, - const Layout& rhs) + static constexpr RAJA_INLINE RAJA_HOST_DEVICE + OffsetLayout_impl + from_layout_and_offsets( + const std::array& offsets_in, + const Layout& rhs) { OffsetLayout_impl ret {rhs}; camp::sink((ret.offsets[RangeInts] = offsets_in[RangeInts])...); @@ -202,8 +204,8 @@ struct TypedOffsetLayout> using OffsetLayout::OffsetLayout; #endif - RAJA_INLINE RAJA_HOST_DEVICE constexpr IdxLin operator()( - DimTypes... indices) const + RAJA_INLINE RAJA_HOST_DEVICE RAJA_BOUNDS_CHECK_constexpr IdxLin + operator()(DimTypes... 
indices) const { return IdxLin(Base::operator()(stripIndexType(indices)...)); } @@ -230,17 +232,18 @@ struct TypedOffsetLayout> }; template -auto make_offset_layout(const std::array& begin, - const std::array& end) - -> OffsetLayout +RAJA_INLINE RAJA_HOST_DEVICE constexpr auto make_offset_layout( + const std::array& begin, + const std::array& end) -> OffsetLayout { return OffsetLayout {begin, end}; } template -auto make_permuted_offset_layout(const std::array& begin, - const std::array& end, - const std::array& permutation) +RAJA_INLINE RAJA_HOST_DEVICE constexpr auto make_permuted_offset_layout( + const std::array& begin, + const std::array& end, + const std::array& permutation) -> decltype(make_offset_layout(begin, end)) { std::array sizes; diff --git a/include/RAJA/util/PermutedLayout.hpp b/include/RAJA/util/PermutedLayout.hpp index adbd13ff4c..c542e59359 100644 --- a/include/RAJA/util/PermutedLayout.hpp +++ b/include/RAJA/util/PermutedLayout.hpp @@ -61,9 +61,9 @@ namespace RAJA * */ template -auto make_permuted_layout(std::array sizes, - std::array permutation) - -> Layout +constexpr RAJA_INLINE RAJA_HOST_DEVICE auto make_permuted_layout( + std::array sizes, + std::array permutation) -> Layout { std::array strides; std::array folded_strides; diff --git a/include/RAJA/util/SoAArray.hpp b/include/RAJA/util/SoAArray.hpp index 28c23d1ef6..f507e131c2 100644 --- a/include/RAJA/util/SoAArray.hpp +++ b/include/RAJA/util/SoAArray.hpp @@ -41,9 +41,12 @@ class SoAArray using value_type = T; public: - RAJA_HOST_DEVICE value_type get(size_t i) const { return mem[i]; } + constexpr RAJA_HOST_DEVICE value_type get(size_t i) const { return mem[i]; } - RAJA_HOST_DEVICE void set(size_t i, value_type val) { mem[i] = val; } + constexpr RAJA_HOST_DEVICE void set(size_t i, value_type val) + { + mem[i] = val; + } private: value_type mem[size]; @@ -60,12 +63,12 @@ class SoAArray<::RAJA::reduce::detail::ValueLoc, size> using second_type = IndexType; public: - RAJA_HOST_DEVICE value_type get(size_t i) const + constexpr RAJA_HOST_DEVICE value_type get(size_t i) const { return value_type(mem[i], mem_idx[i]); } - RAJA_HOST_DEVICE void set(size_t i, value_type val) + constexpr RAJA_HOST_DEVICE void set(size_t i, value_type val) { mem[i] = val; mem_idx[i] = val.getLoc(); diff --git a/include/RAJA/util/SoAPtr.hpp b/include/RAJA/util/SoAPtr.hpp index caeb99d4a6..fb613013a5 100644 --- a/include/RAJA/util/SoAPtr.hpp +++ b/include/RAJA/util/SoAPtr.hpp @@ -90,12 +90,12 @@ class SoAPtr RAJA_HOST_DEVICE bool allocated() const { return mem != nullptr; } - RAJA_HOST_DEVICE value_type get(size_t i) const + constexpr RAJA_HOST_DEVICE value_type get(size_t i) const { return accessor::get(mem, i); } - RAJA_HOST_DEVICE void set(size_t i, value_type val) + constexpr RAJA_HOST_DEVICE void set(size_t i, value_type val) { accessor::set(mem, i, val); } @@ -166,12 +166,12 @@ class SoAPtr, RAJA_HOST_DEVICE bool allocated() const { return mem != nullptr; } - RAJA_HOST_DEVICE value_type get(size_t i) const + constexpr RAJA_HOST_DEVICE value_type get(size_t i) const { return value_type(accessor::get(mem, i), accessor::get(mem_idx, i)); } - RAJA_HOST_DEVICE void set(size_t i, value_type val) + constexpr RAJA_HOST_DEVICE void set(size_t i, value_type val) { accessor::set(mem, i, first_type(val)); accessor::set(mem_idx, i, val.getLoc()); @@ -238,12 +238,12 @@ class SoAPtr, mempool, accessor> RAJA_HOST_DEVICE bool allocated() const { return mem != nullptr; } - RAJA_HOST_DEVICE value_type get(size_t i) const + constexpr RAJA_HOST_DEVICE 
value_type get(size_t i) const { return value_type(accessor::get(mem, i), accessor::get(mem_idx, i)); } - RAJA_HOST_DEVICE void set(size_t i, value_type val) + constexpr RAJA_HOST_DEVICE void set(size_t i, value_type val) { accessor::set(mem, i, val.getVal()); accessor::set(mem_idx, i, val.getLoc()); diff --git a/include/RAJA/util/Span.hpp b/include/RAJA/util/Span.hpp index 741327a4c0..69ce145051 100644 --- a/include/RAJA/util/Span.hpp +++ b/include/RAJA/util/Span.hpp @@ -72,94 +72,122 @@ struct Span static_assert(type_traits::is_random_access_iterator::value, "IterType must model RandomAccessIterator"); - RAJA_HOST_DEVICE Span(iterator begin, iterator end) + constexpr RAJA_HOST_DEVICE Span(iterator begin, iterator end) : m_begin {begin}, m_end {end} {} - RAJA_HOST_DEVICE Span(iterator begin, size_type size) + constexpr RAJA_HOST_DEVICE Span(iterator begin, size_type size) : m_begin {begin}, m_end {begin + size} {} - RAJA_HOST_DEVICE RAJA_INLINE iterator begin() { return m_begin; } + constexpr RAJA_HOST_DEVICE RAJA_INLINE iterator begin() { return m_begin; } - RAJA_HOST_DEVICE RAJA_INLINE iterator end() { return m_end; } + constexpr RAJA_HOST_DEVICE RAJA_INLINE iterator end() { return m_end; } - RAJA_HOST_DEVICE RAJA_INLINE const_iterator begin() const { return m_begin; } + constexpr RAJA_HOST_DEVICE RAJA_INLINE const_iterator begin() const + { + return m_begin; + } - RAJA_HOST_DEVICE RAJA_INLINE const_iterator end() const { return m_end; } + constexpr RAJA_HOST_DEVICE RAJA_INLINE const_iterator end() const + { + return m_end; + } - RAJA_HOST_DEVICE RAJA_INLINE const_iterator cbegin() const { return m_begin; } + constexpr RAJA_HOST_DEVICE RAJA_INLINE const_iterator cbegin() const + { + return m_begin; + } - RAJA_HOST_DEVICE RAJA_INLINE const_iterator cend() const { return m_end; } + constexpr RAJA_HOST_DEVICE RAJA_INLINE const_iterator cend() const + { + return m_end; + } - RAJA_HOST_DEVICE RAJA_INLINE friend iterator begin(Span& s) + constexpr RAJA_HOST_DEVICE RAJA_INLINE friend iterator begin(Span& s) { return s.begin(); } - RAJA_HOST_DEVICE RAJA_INLINE friend iterator end(Span& s) { return s.end(); } + constexpr RAJA_HOST_DEVICE RAJA_INLINE friend iterator end(Span& s) + { + return s.end(); + } - RAJA_HOST_DEVICE RAJA_INLINE friend const_iterator begin(const Span& s) + constexpr RAJA_HOST_DEVICE RAJA_INLINE friend const_iterator begin( + const Span& s) { return s.begin(); } - RAJA_HOST_DEVICE RAJA_INLINE friend const_iterator end(const Span& s) + constexpr RAJA_HOST_DEVICE RAJA_INLINE friend const_iterator end( + const Span& s) { return s.end(); } - RAJA_HOST_DEVICE RAJA_INLINE friend const_iterator cbegin(const Span& s) + constexpr RAJA_HOST_DEVICE RAJA_INLINE friend const_iterator cbegin( + const Span& s) { return s.cbegin(); } - RAJA_HOST_DEVICE RAJA_INLINE friend const_iterator cend(const Span& s) + constexpr RAJA_HOST_DEVICE RAJA_INLINE friend const_iterator cend( + const Span& s) { return s.cend(); } - RAJA_HOST_DEVICE RAJA_INLINE reference front() const { return *begin(); } + constexpr RAJA_HOST_DEVICE RAJA_INLINE reference front() const + { + return *begin(); + } - RAJA_HOST_DEVICE RAJA_INLINE reference back() const { return *(end() - 1); } + constexpr RAJA_HOST_DEVICE RAJA_INLINE reference back() const + { + return *(end() - 1); + } - RAJA_HOST_DEVICE RAJA_INLINE reference operator[](size_type i) const + constexpr RAJA_HOST_DEVICE RAJA_INLINE reference operator[](size_type i) const { return data()[i]; } - RAJA_HOST_DEVICE RAJA_INLINE iterator data() const { return m_begin; } + 
constexpr RAJA_HOST_DEVICE RAJA_INLINE iterator data() const + { + return m_begin; + } - RAJA_HOST_DEVICE RAJA_INLINE size_type size() const + constexpr RAJA_HOST_DEVICE RAJA_INLINE size_type size() const { return static_cast(m_end - m_begin); } - RAJA_HOST_DEVICE RAJA_INLINE bool empty() const + constexpr RAJA_HOST_DEVICE RAJA_INLINE bool empty() const { return size() == static_cast(0); } - RAJA_HOST_DEVICE RAJA_INLINE Span first(size_type count) const + constexpr RAJA_HOST_DEVICE RAJA_INLINE Span first(size_type count) const { return slice(0, count); } - RAJA_HOST_DEVICE RAJA_INLINE Span last(size_type count) const + constexpr RAJA_HOST_DEVICE RAJA_INLINE Span last(size_type count) const { return slice(size() - count, count); } - RAJA_HOST_DEVICE RAJA_INLINE Span subspan(size_type begin, - size_type length) const + constexpr RAJA_HOST_DEVICE RAJA_INLINE Span subspan(size_type begin, + size_type length) const { return slice(begin, length); } - RAJA_HOST_DEVICE RAJA_INLINE Span slice(size_type begin, - size_type length) const + constexpr RAJA_HOST_DEVICE RAJA_INLINE Span slice(size_type begin, + size_type length) const { auto start = m_begin + begin; auto end = start + length > m_end ? m_end : start + length; @@ -192,14 +220,15 @@ struct Span * */ template -RAJA_HOST_DEVICE RAJA_INLINE Span make_span(IterType begin, - IndexType size) +constexpr RAJA_HOST_DEVICE RAJA_INLINE Span make_span( + IterType begin, + IndexType size) { return Span(begin, size); } template -RAJA_INLINE auto make_span(Iter& iterable) +constexpr RAJA_HOST_DEVICE RAJA_INLINE auto make_span(Iter& iterable) { using std::begin; using std::distance; diff --git a/include/RAJA/util/TypedViewBase.hpp b/include/RAJA/util/TypedViewBase.hpp index 359783845f..8cbb05622f 100644 --- a/include/RAJA/util/TypedViewBase.hpp +++ b/include/RAJA/util/TypedViewBase.hpp @@ -208,12 +208,10 @@ struct ViewReturnHelper, { using return_type = ElementType&; - RAJA_INLINE - - RAJA_HOST_DEVICE - static constexpr return_type make_return(LayoutType const& layout, - PointerType const& data, - Args const&... args) + RAJA_INLINE RAJA_HOST_DEVICE static constexpr return_type make_return( + LayoutType const& layout, + PointerType const& data, + Args const&... args) { return data[stripIndexType(layout(args...))]; } @@ -268,12 +266,10 @@ struct ViewReturnHelper, using return_type = internal::expt::ET::TensorLoadStore; - RAJA_INLINE - - RAJA_HOST_DEVICE - static constexpr return_type make_return(LayoutType const& layout, - PointerType const& data, - Args const&... args) + RAJA_INLINE RAJA_HOST_DEVICE static constexpr return_type make_return( + LayoutType const& layout, + PointerType const& data, + Args const&... args) { return return_type(ref_type { @@ -385,11 +381,7 @@ struct ViewReturnHelper< using return_type = internal::expt::ET::TensorLoadStore; - - RAJA_INLINE - - RAJA_HOST_DEVICE - static constexpr return_type make_return( + RAJA_INLINE RAJA_HOST_DEVICE static constexpr return_type make_return( LayoutType const& layout, PointerType const& data, RAJA::expt::StaticTensorIndex const&... 
args) @@ -610,26 +602,22 @@ class ViewBase RAJA_INLINE constexpr ViewBase() {}; - RAJA_HOST_DEVICE - RAJA_INLINE ViewBase(ViewBase const& c) + RAJA_HOST_DEVICE RAJA_INLINE constexpr ViewBase(ViewBase const& c) : m_layout(c.m_layout), m_data(c.m_data) {} - RAJA_HOST_DEVICE - - RAJA_INLINE - ViewBase& operator=(ViewBase const& c) + RAJA_HOST_DEVICE RAJA_INLINE constexpr ViewBase& operator=(ViewBase const& c) { m_layout = c.m_layout; m_data = c.m_data; } #else - constexpr ViewBase() = default; - RAJA_INLINE constexpr ViewBase(ViewBase const&) = default; - RAJA_INLINE constexpr ViewBase(ViewBase&&) = default; - RAJA_INLINE ViewBase& operator=(ViewBase const&) = default; - RAJA_INLINE ViewBase& operator=(ViewBase&&) = default; + constexpr ViewBase() = default; + RAJA_INLINE constexpr ViewBase(ViewBase const&) = default; + RAJA_INLINE constexpr ViewBase(ViewBase&&) = default; + RAJA_INLINE constexpr ViewBase& operator=(ViewBase const&) = default; + RAJA_INLINE constexpr ViewBase& operator=(ViewBase&&) = default; #endif @@ -655,23 +643,25 @@ class ViewBase m_layout(rhs.get_layout()) {} - RAJA_HOST_DEVICE - RAJA_INLINE void set_data(PointerType data_ptr) { m_data = data_ptr; } - - RAJA_HOST_DEVICE - - RAJA_INLINE - constexpr pointer_type const& get_data() const { return m_data; } - - RAJA_HOST_DEVICE + RAJA_HOST_DEVICE RAJA_INLINE constexpr void set_data(PointerType data_ptr) + { + m_data = data_ptr; + } - RAJA_INLINE - constexpr layout_type const& get_layout() const { return m_layout; } + RAJA_HOST_DEVICE RAJA_INLINE constexpr pointer_type const& get_data() const + { + return m_data; + } - RAJA_HOST_DEVICE + RAJA_HOST_DEVICE RAJA_INLINE constexpr layout_type const& get_layout() const + { + return m_layout; + } - RAJA_INLINE - constexpr linear_index_type size() const { return m_layout.size(); } + RAJA_HOST_DEVICE RAJA_INLINE constexpr linear_index_type size() const + { + return m_layout.size(); + } template RAJA_HOST_DEVICE RAJA_INLINE constexpr linear_index_type get_dim_size() const @@ -709,9 +699,11 @@ class ViewBase m_layout, m_data, args...); } + // why doesn't this return a shifted copy instead of shifting this view? template - RAJA_INLINE ShiftedView shift(const std::array& shift) + RAJA_HOST_DEVICE RAJA_INLINE constexpr ShiftedView shift( + const std::array& shift) { static_assert(n_dims == layout_type::n_dims, "Dimension mismatch in ViewBase shift"); @@ -802,9 +794,11 @@ class TypedViewBase(args)...); } + // why doesn't this return a shifted copy instead of shifting this view? template - RAJA_INLINE ShiftedView shift(const std::array& shift) + RAJA_HOST_DEVICE RAJA_INLINE constexpr ShiftedView shift( + const std::array& shift) { static_assert(n_dims == layout_type::n_dims, "Dimension mismatch in TypedViewBase shift"); diff --git a/include/RAJA/util/View.hpp b/include/RAJA/util/View.hpp index d00fa0cf65..0283b514cf 100644 --- a/include/RAJA/util/View.hpp +++ b/include/RAJA/util/View.hpp @@ -57,7 +57,8 @@ using TypedView = internal:: TypedViewBase>; template -RAJA_INLINE View> make_view(ValueType* ptr) +RAJA_HOST_DEVICE RAJA_INLINE constexpr View> +make_view(ValueType* ptr) { return View>(ptr, 1); } @@ -66,7 +67,9 @@ template -RAJA_INLINE View> +RAJA_HOST_DEVICE RAJA_INLINE constexpr View< + ValueType, + IndexLayout> make_index_view(ValueType* ptr, IndexLayout index_layout) { @@ -77,9 +80,9 @@ make_index_view(ValueType* ptr, // select certain indices from a tuple, given a curated index sequence // returns linear index of layout(ar...) 
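// ---------------------------------------------------------------------------
// Illustrative usage sketch, not part of the patch: the ViewBase members made
// constexpr/host-device above sit underneath the ordinary RAJA::View
// interface, whose day-to-day use is unchanged.  Sizes and data below are
// arbitrary; this is standard documented View/Layout usage.
// ---------------------------------------------------------------------------
#include <cstdio>
#include <vector>
#include "RAJA/RAJA.hpp"

int main()
{
  constexpr int N = 4;
  constexpr int M = 3;
  std::vector<double> data(N * M, 0.0);

  // Two-dimensional view over a flat allocation; Layout<2> maps (i, j) to a
  // row-major linear index.
  RAJA::View<double, RAJA::Layout<2>> A(data.data(), N, M);

  for (int i = 0; i < N; ++i)
  {
    for (int j = 0; j < M; ++j)
    {
      A(i, j) = static_cast<double>(i * M + j);
    }
  }

  std::printf("A(2, 1) = %f\n", A(2, 1));
  return 0;
}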
template -RAJA_HOST_DEVICE RAJA_INLINE auto selecttuple(Lay lyout, - Tup&& tup, - camp::idx_seq) +RAJA_HOST_DEVICE RAJA_INLINE constexpr auto selecttuple(Lay lyout, + Tup&& tup, + camp::idx_seq) -> decltype(lyout(camp::get(std::forward(tup))...)) { return lyout(camp::get(std::forward(tup))...); @@ -114,7 +117,7 @@ using offset_seq_t = typename offset_seq::type; // remove the Nth index in a parameter pack // returns linear index of layout(ar...) template -RAJA_HOST_DEVICE RAJA_INLINE auto removenth(Lay lyout, Tup&& tup) +RAJA_HOST_DEVICE RAJA_INLINE constexpr auto removenth(Lay lyout, Tup&& tup) -> decltype(selecttuple( lyout, std::forward(tup), @@ -165,12 +168,14 @@ struct MultiView MultiView() = default; template - RAJA_INLINE constexpr MultiView(pointer_type data_ptr, Args... dim_sizes) + RAJA_HOST_DEVICE RAJA_INLINE constexpr MultiView(pointer_type data_ptr, + Args... dim_sizes) : layout(dim_sizes...), data(nc_pointer_type(data_ptr)) {} - RAJA_INLINE constexpr MultiView(pointer_type data_ptr, layout_type const& ly) + RAJA_HOST_DEVICE RAJA_INLINE constexpr MultiView(pointer_type data_ptr, + layout_type const& ly) : layout(ly), data(nc_pointer_type(data_ptr)) {} @@ -179,46 +184,46 @@ struct MultiView // arrays. For now, this becomes an ambiguous call to constructor error. // template::value> - // RAJA_INLINE constexpr MultiView( + // RAJA_HOST_DEVICE RAJA_INLINE constexpr MultiView( // std::enable_if_t data_ptr, // Args... dim_sizes) // : layout(dim_sizes...), // data(nc_pointer_type(data_ptr)) //{} // template::value> - // RAJA_INLINE constexpr MultiView( + // RAJA_HOST_DEVICE RAJA_INLINE constexpr MultiView( // std::enable_if_t data_ptr, // layout_type const& ly) // : layout(ly), // data(nc_pointer_type(data_ptr)) //{} - RAJA_INLINE constexpr MultiView(MultiView const&) = default; - RAJA_INLINE constexpr MultiView(MultiView&&) = default; - RAJA_INLINE MultiView& operator=(MultiView const&) = default; - RAJA_INLINE MultiView& operator=(MultiView&&) = default; + RAJA_INLINE constexpr MultiView(MultiView const&) = default; + RAJA_INLINE constexpr MultiView(MultiView&&) = default; + RAJA_INLINE constexpr MultiView& operator=(MultiView const&) = default; + RAJA_INLINE constexpr MultiView& operator=(MultiView&&) = default; template::value> - RAJA_INLINE constexpr MultiView( + RAJA_HOST_DEVICE RAJA_INLINE constexpr MultiView( std::enable_if_t const& rhs) : layout(rhs.layout), data(nc_pointer_type(rhs.data)) {} - RAJA_HOST_DEVICE RAJA_INLINE void set_layout(layout_type const& ly) + RAJA_HOST_DEVICE RAJA_INLINE constexpr void set_layout(layout_type const& ly) { layout = ly; } template::value> - RAJA_HOST_DEVICE RAJA_INLINE void set_data( + RAJA_HOST_DEVICE RAJA_INLINE constexpr void set_data( std::enable_if_t data_ptr) { data = nc_pointer_type( data_ptr); // This data_ptr should already be non-const. } - RAJA_HOST_DEVICE RAJA_INLINE void set_data(pointer_type data_ptr) + RAJA_HOST_DEVICE RAJA_INLINE constexpr void set_data(pointer_type data_ptr) { data = nc_pointer_type(data_ptr); } @@ -233,8 +238,9 @@ struct MultiView return pointer_type(data); } + // why doesn't this return a shifted copy instead of shifting this view? 
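// ---------------------------------------------------------------------------
// Illustrative sketch, not part of the patch: the MultiView constructors and
// setters made constexpr/host-device above wrap several arrays behind one
// view, where the first index of the access operator selects the array.  This
// follows RAJA's documented MultiView usage, but treat the details here as a
// sketch rather than a definitive reference.
// ---------------------------------------------------------------------------
#include <cstdio>
#include <vector>
#include "RAJA/RAJA.hpp"

int main()
{
  constexpr int N = 8;
  std::vector<double> a0(N, 0.0);
  std::vector<double> a1(N, 0.0);

  // Array-of-pointers backing store; by default the leading index of the
  // access operator selects which pointer is used.
  double* arrays[2] = {a0.data(), a1.data()};
  RAJA::MultiView<double, RAJA::Layout<1>> mv(arrays, N);

  for (int i = 0; i < N; ++i)
  {
    mv(0, i) = 1.0 * i;  // writes into a0
    mv(1, i) = 2.0 * i;  // writes into a1
  }

  std::printf("a0[3] = %f, a1[3] = %f\n", mv(0, 3), mv(1, 3));
  return 0;
}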
template - RAJA_INLINE RAJA:: + RAJA_HOST_DEVICE RAJA_INLINE constexpr RAJA:: MultiView::type, P2Pidx> shift(const std::array& shift) { @@ -279,10 +285,15 @@ struct AtomicViewWrapper base_type base_; - RAJA_INLINE - constexpr explicit AtomicViewWrapper(ViewType view) : base_(view) {} + RAJA_HOST_DEVICE RAJA_INLINE constexpr explicit AtomicViewWrapper( + ViewType view) + : base_(view) + {} - RAJA_INLINE void set_data(pointer_type data_ptr) { base_.set_data(data_ptr); } + RAJA_HOST_DEVICE RAJA_INLINE constexpr void set_data(pointer_type data_ptr) + { + base_.set_data(data_ptr); + } template RAJA_HOST_DEVICE RAJA_INLINE atomic_type operator()(ARGS&&... args) const @@ -305,10 +316,15 @@ struct AtomicViewWrapper base_type base_; - RAJA_INLINE - constexpr explicit AtomicViewWrapper(ViewType const& view) : base_ {view} {} + RAJA_HOST_DEVICE RAJA_INLINE constexpr explicit AtomicViewWrapper( + ViewType const& view) + : base_ {view} + {} - RAJA_INLINE void set_data(pointer_type data_ptr) { base_.set_data(data_ptr); } + RAJA_HOST_DEVICE RAJA_INLINE constexpr void set_data(pointer_type data_ptr) + { + base_.set_data(data_ptr); + } template RAJA_HOST_DEVICE RAJA_INLINE value_type& operator()(ARGS&&... args) const @@ -318,8 +334,8 @@ struct AtomicViewWrapper }; template -RAJA_INLINE AtomicViewWrapper make_atomic_view( - ViewType const& view) +RAJA_HOST_DEVICE RAJA_INLINE constexpr AtomicViewWrapper +make_atomic_view(ViewType const& view) { return RAJA::AtomicViewWrapper(view); @@ -338,13 +354,15 @@ template struct PermutedViewHelper; template -constexpr auto get_last_index(T last) +RAJA_HOST_DEVICE RAJA_INLINE constexpr auto get_last_index(T last) { return last; } template -constexpr auto get_last_index(T0, T1 t1, Args... args) +RAJA_HOST_DEVICE RAJA_INLINE constexpr auto get_last_index(T0, + T1 t1, + Args... args) { return get_last_index(t1, args...); } @@ -353,7 +371,8 @@ template struct PermutedViewHelper> { template - static auto get(T* ptr, Extents&&... extents) + static RAJA_HOST_DEVICE RAJA_INLINE constexpr auto get(T* ptr, + Extents&&... extents) { constexpr int N = sizeof...(Extents); @@ -372,7 +391,8 @@ template<> struct PermutedViewHelper { template - static auto get(T* ptr, Extents&&... extents) + static RAJA_HOST_DEVICE RAJA_INLINE constexpr auto get(T* ptr, + Extents&&... extents) { constexpr int N = sizeof...(Extents); using view_t = RAJA::View>; @@ -382,7 +402,8 @@ struct PermutedViewHelper }; template -constexpr auto make_reverse_array(std::index_sequence) +RAJA_HOST_DEVICE RAJA_INLINE constexpr auto make_reverse_array( + std::index_sequence) { return std::array {sizeof...(idx) - 1U - idx...}; } @@ -391,7 +412,8 @@ template<> struct PermutedViewHelper { template - static auto get(T* ptr, Extents&&... extents) + static RAJA_HOST_DEVICE RAJA_INLINE constexpr auto get(T* ptr, + Extents&&... extents) { constexpr int N = sizeof...(Extents); @@ -410,7 +432,9 @@ template -auto make_permuted_view(T* ptr, Extents&&... extents) +RAJA_HOST_DEVICE RAJA_INLINE constexpr auto make_permuted_view( + T* ptr, + Extents&&... 
extents) { return detail::PermutedViewHelper::template get( ptr, std::forward(extents)...); diff --git a/include/RAJA/util/basic_mempool.hpp b/include/RAJA/util/basic_mempool.hpp index 898672f6d7..d3c2011957 100644 --- a/include/RAJA/util/basic_mempool.hpp +++ b/include/RAJA/util/basic_mempool.hpp @@ -24,9 +24,9 @@ #include #include #include +#include #include "RAJA/util/align.hpp" -#include "RAJA/util/mutex.hpp" namespace RAJA { @@ -329,9 +329,7 @@ class MemPool /// Free all backing allocations, even if they are currently in use void free_chunks() { -#if defined(RAJA_ENABLE_OPENMP) - lock_guard lock(m_mutex); -#endif + std::lock_guard lock(m_mutex); while (!m_arenas.empty()) { @@ -343,18 +341,14 @@ class MemPool size_t arena_size() { -#if defined(RAJA_ENABLE_OPENMP) - lock_guard lock(m_mutex); -#endif + std::lock_guard lock(m_mutex); return m_default_arena_size; } size_t arena_size(size_t new_size) { -#if defined(RAJA_ENABLE_OPENMP) - lock_guard lock(m_mutex); -#endif + std::lock_guard lock(m_mutex); size_t prev_size = m_default_arena_size; m_default_arena_size = new_size; @@ -364,9 +358,7 @@ class MemPool template T* malloc(size_t nTs, size_t alignment = alignof(T)) { -#if defined(RAJA_ENABLE_OPENMP) - lock_guard lock(m_mutex); -#endif + std::lock_guard lock(m_mutex); const size_t size = nTs * sizeof(T); void* ptr = nullptr; @@ -398,9 +390,7 @@ class MemPool void free(const void* cptr) { -#if defined(RAJA_ENABLE_OPENMP) - lock_guard lock(m_mutex); -#endif + std::lock_guard lock(m_mutex); void* ptr = const_cast(cptr); arena_container_type::iterator end = m_arenas.end(); @@ -422,9 +412,7 @@ class MemPool private: using arena_container_type = std::list; -#if defined(RAJA_ENABLE_OPENMP) - omp::mutex m_mutex; -#endif + std::mutex m_mutex; arena_container_type m_arenas; size_t m_default_arena_size; diff --git a/include/RAJA/util/for_each.hpp b/include/RAJA/util/for_each.hpp index 708d3115e8..cc950d5ebf 100644 --- a/include/RAJA/util/for_each.hpp +++ b/include/RAJA/util/for_each.hpp @@ -39,9 +39,9 @@ namespace detail // runtime loop applying func to each element in the range in order RAJA_SUPPRESS_HD_WARN template -RAJA_HOST_DEVICE RAJA_INLINE UnaryFunc for_each(Iter begin, - Iter end, - UnaryFunc func) +constexpr RAJA_HOST_DEVICE RAJA_INLINE UnaryFunc for_each(Iter begin, + Iter end, + UnaryFunc func) { for (; begin != end; ++begin) { @@ -54,8 +54,8 @@ RAJA_HOST_DEVICE RAJA_INLINE UnaryFunc for_each(Iter begin, // compile time expansion applying func to a each type in the list in order RAJA_SUPPRESS_HD_WARN template -RAJA_HOST_DEVICE RAJA_INLINE UnaryFunc for_each_type(camp::list const&, - UnaryFunc func) +constexpr RAJA_HOST_DEVICE RAJA_INLINE UnaryFunc +for_each_type(camp::list const&, UnaryFunc func) { // braced init lists are evaluated in order int seq_unused_array[] = {0, (func(Ts {}), 0)...}; @@ -67,9 +67,8 @@ RAJA_HOST_DEVICE RAJA_INLINE UnaryFunc for_each_type(camp::list const&, // compile time expansion applying func to a each type in the tuple in order RAJA_SUPPRESS_HD_WARN template -RAJA_HOST_DEVICE RAJA_INLINE UnaryFunc for_each_tuple(Tuple&& t, - UnaryFunc func, - camp::idx_seq) +constexpr RAJA_HOST_DEVICE RAJA_INLINE UnaryFunc +for_each_tuple(Tuple&& t, UnaryFunc func, camp::idx_seq) { using camp::get; // braced init lists are evaluated in order @@ -88,7 +87,7 @@ RAJA_HOST_DEVICE RAJA_INLINE UnaryFunc for_each_tuple(Tuple&& t, */ RAJA_SUPPRESS_HD_WARN template -RAJA_HOST_DEVICE RAJA_INLINE +constexpr RAJA_HOST_DEVICE RAJA_INLINE concepts::enable_if_t> for_each(Container&& c, 
UnaryFunc func) { @@ -104,8 +103,8 @@ RAJA_HOST_DEVICE RAJA_INLINE */ RAJA_SUPPRESS_HD_WARN template -RAJA_HOST_DEVICE RAJA_INLINE UnaryFunc for_each_type(camp::list const& c, - UnaryFunc func) +constexpr RAJA_HOST_DEVICE RAJA_INLINE UnaryFunc +for_each_type(camp::list const& c, UnaryFunc func) { return detail::for_each_type(c, std::move(func)); } @@ -116,7 +115,8 @@ RAJA_HOST_DEVICE RAJA_INLINE UnaryFunc for_each_type(camp::list const& c, */ RAJA_SUPPRESS_HD_WARN template -RAJA_HOST_DEVICE RAJA_INLINE UnaryFunc for_each_tuple(Tuple&& t, UnaryFunc func) +constexpr RAJA_HOST_DEVICE RAJA_INLINE UnaryFunc for_each_tuple(Tuple&& t, + UnaryFunc func) { return detail::for_each_tuple( std::forward(t), std::move(func), diff --git a/include/RAJA/util/mutex.hpp b/include/RAJA/util/mutex.hpp deleted file mode 100644 index 1786ab4cd7..0000000000 --- a/include/RAJA/util/mutex.hpp +++ /dev/null @@ -1,84 +0,0 @@ -/*! -****************************************************************************** -* -* \file -* -* \brief Header file providing functionality similar to std mutex header. -* -****************************************************************************** -*/ - -//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2016-25, Lawrence Livermore National Security, LLC -// and RAJA project contributors. See the RAJA/LICENSE file for details. -// -// SPDX-License-Identifier: (BSD-3-Clause) -//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// - -#ifndef RAJA_util_mutex_HPP -#define RAJA_util_mutex_HPP - -#include "RAJA/config.hpp" - -#if defined(RAJA_ENABLE_OPENMP) -#include -#endif - -namespace RAJA -{ - -#if defined(RAJA_ENABLE_OPENMP) -namespace omp -{ - -//! class wrapping omp_lock_t with std::mutex interface -class mutex -{ -public: - using native_handle_type = omp_lock_t; - - mutex() { omp_init_lock(&m_lock); } - - mutex(const mutex&) = delete; - mutex(mutex&&) = delete; - mutex& operator=(const mutex&) = delete; - mutex& operator=(mutex&&) = delete; - - void lock() { omp_set_lock(&m_lock); } - - bool try_lock() { return omp_test_lock(&m_lock) != 0; } - - void unlock() { omp_unset_lock(&m_lock); } - - native_handle_type& native_handle() { return m_lock; } - - ~mutex() { omp_destroy_lock(&m_lock); } - -private: - native_handle_type m_lock; -}; - -} // namespace omp -#endif // closing endif for if defined(RAJA_ENABLE_OPENMP) - -//! 
class providing functionality of std::lock_guard -template -class lock_guard -{ -public: - explicit lock_guard(mutex_type& m) : m_mutex(m) { m_mutex.lock(); } - - lock_guard(const lock_guard&) = delete; - lock_guard(lock_guard&&) = delete; - lock_guard& operator=(const lock_guard&) = delete; - lock_guard& operator=(lock_guard&&) = delete; - - ~lock_guard() { m_mutex.unlock(); } - -private: - mutex_type& m_mutex; -}; - -} // namespace RAJA - -#endif // closing endif for header file include guard diff --git a/include/RAJA/util/sycl_compat.hpp b/include/RAJA/util/sycl_compat.hpp index 2e6e03d1e7..cc553f54fd 100644 --- a/include/RAJA/util/sycl_compat.hpp +++ b/include/RAJA/util/sycl_compat.hpp @@ -18,6 +18,7 @@ #ifndef RAJA_util_sycl_compat_HPP #define RAJA_util_sycl_compat_HPP +#if defined(RAJA_SYCL_ACTIVE) #if (__INTEL_CLANG_COMPILER && __INTEL_CLANG_COMPILER < 20230000) // older version, use legacy header locations #include @@ -25,5 +26,6 @@ // SYCL 2020 standard header #include #endif +#endif #endif // RAJA_util_sycl_compat_HPP diff --git a/include/RAJA/util/types.hpp b/include/RAJA/util/types.hpp index 6d99862d7a..6486565fd0 100644 --- a/include/RAJA/util/types.hpp +++ b/include/RAJA/util/types.hpp @@ -333,11 +333,11 @@ class ConstRestrictRealPtr /// Ctors and assignment op. /// - ConstRestrictRealPtr() : dptr(0) { ; } + constexpr ConstRestrictRealPtr() : dptr(0) { ; } - ConstRestrictRealPtr(const Real_type* d) : dptr(d) { ; } + constexpr ConstRestrictRealPtr(const Real_type* d) : dptr(d) { ; } - ConstRestrictRealPtr& operator=(const Real_type* d) + constexpr ConstRestrictRealPtr& operator=(const Real_type* d) { ConstRestrictRealPtr copy(d); std::swap(dptr, copy.dptr); @@ -351,18 +351,18 @@ class ConstRestrictRealPtr /// /// Implicit conversion operator to bare const pointer. /// - operator const Real_type*() { return dptr; } + constexpr operator const Real_type*() { return dptr; } /// /// "Explicit conversion operator" to bare const pointer, /// consistent with boost shared ptr. /// - const Real_type* get() const { return dptr; } + constexpr const Real_type* get() const { return dptr; } /// /// Bracket operator. /// - const Real_type& operator[](Index_type i) const + constexpr const Real_type& operator[](Index_type i) const { return ((const Real_type* RAJA_RESTRICT)dptr)[i]; } @@ -370,7 +370,7 @@ class ConstRestrictRealPtr /// /// + operator for pointer arithmetic. /// - const Real_type* operator+(Index_type i) const { return dptr + i; } + constexpr const Real_type* operator+(Index_type i) const { return dptr + i; } private: const Real_type* dptr; @@ -390,11 +390,11 @@ class RestrictRealPtr /// Ctors and assignment op. /// - RestrictRealPtr() : dptr(0) { ; } + constexpr RestrictRealPtr() : dptr(0) { ; } - RestrictRealPtr(Real_type* d) : dptr(d) { ; } + constexpr RestrictRealPtr(Real_type* d) : dptr(d) { ; } - RestrictRealPtr& operator=(Real_type* d) + constexpr RestrictRealPtr& operator=(Real_type* d) { RestrictRealPtr copy(d); std::swap(dptr, copy.dptr); @@ -408,35 +408,38 @@ class RestrictRealPtr /// /// Implicit conversion operator to (non-const) bare pointer. /// - operator Real_type*() { return dptr; } + constexpr operator Real_type*() { return dptr; } /// /// Implicit conversion operator to const bare pointer. /// - operator const Real_type*() const { return dptr; } + constexpr operator const Real_type*() const { return dptr; } /// /// "Explicit conversion operator" to (non-const) bare pointer, /// consistent with boost shared ptr. 
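// ---------------------------------------------------------------------------
// Illustrative sketch, not part of the patch: with RAJA/util/mutex.hpp removed
// (see the deletion above), the mempool hunks guard their critical sections
// with an unconditional std::mutex instead of the OpenMP-only RAJA::omp::mutex.
// The class below is a stand-in for that locking pattern, not RAJA's MemPool.
// ---------------------------------------------------------------------------
#include <cstddef>
#include <mutex>

class PoolSketch
{
public:
  std::size_t arena_size()
  {
    // Previously the guard existed only under RAJA_ENABLE_OPENMP:
    //   #if defined(RAJA_ENABLE_OPENMP)
    //     lock_guard<omp::mutex> lock(m_mutex);
    //   #endif
    // Now the lock is always taken, independent of the OpenMP build option.
    std::lock_guard<std::mutex> lock(m_mutex);
    return m_default_arena_size;
  }

  std::size_t arena_size(std::size_t new_size)
  {
    std::lock_guard<std::mutex> lock(m_mutex);
    std::size_t prev = m_default_arena_size;
    m_default_arena_size = new_size;
    return prev;
  }

private:
  std::mutex m_mutex;
  std::size_t m_default_arena_size = 32 * 1024;
};

int main()
{
  PoolSketch pool;
  std::size_t prev = pool.arena_size(64 * 1024);
  return (prev == 32 * 1024 && pool.arena_size() == 64 * 1024) ? 0 : 1;
}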
/// - Real_type* get() { return dptr; } + constexpr Real_type* get() { return dptr; } /// /// "Explicit conversion operator" to const bare pointer, /// consistent with boost shared ptr. /// - const Real_type* get() const { return dptr; } + constexpr const Real_type* get() const { return dptr; } /// /// Operator that enables implicit conversion from RestrictRealPtr to /// RestrictRealConstPtr. /// - operator ConstRestrictRealPtr() { return ConstRestrictRealPtr(dptr); } + constexpr operator ConstRestrictRealPtr() + { + return ConstRestrictRealPtr(dptr); + } /// /// Bracket operator. /// - Real_type& operator[](Index_type i) + constexpr Real_type& operator[](Index_type i) { return ((Real_type * RAJA_RESTRICT) dptr)[i]; } @@ -444,12 +447,12 @@ class RestrictRealPtr /// /// + operator for (non-const) pointer arithmetic. /// - Real_type* operator+(Index_type i) { return dptr + i; } + constexpr Real_type* operator+(Index_type i) { return dptr + i; } /// /// + operator for const pointer arithmetic. /// - const Real_type* operator+(Index_type i) const { return dptr + i; } + constexpr const Real_type* operator+(Index_type i) const { return dptr + i; } private: Real_type* dptr; @@ -469,11 +472,11 @@ class ConstRestrictAlignedRealPtr /// Ctors and assignment op. /// - ConstRestrictAlignedRealPtr() : dptr(0) { ; } + constexpr ConstRestrictAlignedRealPtr() : dptr(0) { ; } - ConstRestrictAlignedRealPtr(const Real_type* d) : dptr(d) { ; } + constexpr ConstRestrictAlignedRealPtr(const Real_type* d) : dptr(d) { ; } - ConstRestrictAlignedRealPtr& operator=(const Real_type* d) + constexpr ConstRestrictAlignedRealPtr& operator=(const Real_type* d) { ConstRestrictAlignedRealPtr copy(d); std::swap(dptr, copy.dptr); @@ -487,13 +490,13 @@ class ConstRestrictAlignedRealPtr /// /// Implicit conversion operator to bare const pointer. /// - operator const Real_type*() { return dptr; } + constexpr operator const Real_type*() { return dptr; } /// /// "Explicit conversion operator" to bare const pointer, /// consistent with boost shared ptr. /// - const Real_type* get() const { return dptr; } + constexpr const Real_type* get() const { return dptr; } /// /// Compiler-specific bracket operators. @@ -501,7 +504,7 @@ class ConstRestrictAlignedRealPtr #if defined(RAJA_COMPILER_ICC) /// - const Real_type& operator[](Index_type i) const + constexpr const Real_type& operator[](Index_type i) const { #if __ICC < 1300 // use alignment intrinsic RAJA_ALIGN_DATA(dptr); @@ -513,7 +516,7 @@ class ConstRestrictAlignedRealPtr #elif defined(RAJA_COMPILER_GNU) /// - const Real_type& operator[](Index_type i) const + constexpr const Real_type& operator[](Index_type i) const { #if 1 // NOTE: alignment instrinsic not available for older GNU compilers return ((const Real_type* RAJA_RESTRICT)RAJA_ALIGN_DATA(dptr))[i]; @@ -523,14 +526,14 @@ class ConstRestrictAlignedRealPtr } #elif defined(RAJA_COMPILER_XLC) - const Real_type& operator[](Index_type i) const + constexpr const Real_type& operator[](Index_type i) const { RAJA_ALIGN_DATA(dptr); return ((const Real_type* RAJA_RESTRICT)dptr)[i]; } #elif defined(RAJA_COMPILER_CLANG) - const Real_type& operator[](Index_type i) const + constexpr const Real_type& operator[](Index_type i) const { return ((const_TDRAReal_ptr)dptr)[i]; } @@ -543,7 +546,7 @@ class ConstRestrictAlignedRealPtr /// /// + operator for pointer arithmetic. 
/// - const Real_type* operator+(Index_type i) const { return dptr + i; } + constexpr const Real_type* operator+(Index_type i) const { return dptr + i; } private: const Real_type* dptr; @@ -563,11 +566,11 @@ class RestrictAlignedRealPtr /// Ctors and assignment op. /// - RestrictAlignedRealPtr() : dptr(0) { ; } + constexpr RestrictAlignedRealPtr() : dptr(0) { ; } - RestrictAlignedRealPtr(Real_type* d) : dptr(d) { ; } + constexpr RestrictAlignedRealPtr(Real_type* d) : dptr(d) { ; } - RestrictAlignedRealPtr& operator=(Real_type* d) + constexpr RestrictAlignedRealPtr& operator=(Real_type* d) { RestrictAlignedRealPtr copy(d); std::swap(dptr, copy.dptr); @@ -581,30 +584,30 @@ class RestrictAlignedRealPtr /// /// Implicit conversion operator to (non-const) bare pointer. /// - operator Real_type*() { return dptr; } + constexpr operator Real_type*() { return dptr; } /// /// Implicit conversion operator to const bare pointer. /// - operator const Real_type*() const { return dptr; } + constexpr operator const Real_type*() const { return dptr; } /// /// "Explicit conversion operator" to (non-const) bare pointer, /// consistent with boost shared ptr. /// - Real_type* get() { return dptr; } + constexpr Real_type* get() { return dptr; } /// /// "Explicit conversion operator" to const bare pointer, /// consistent with boost shared ptr. /// - const Real_type* get() const { return dptr; } + constexpr const Real_type* get() const { return dptr; } /// /// Operator that enables implicit conversion from /// RestrictAlignedRealPtr to RestrictAlignedRealConstPtr. /// - operator ConstRestrictAlignedRealPtr() + constexpr operator ConstRestrictAlignedRealPtr() { return ConstRestrictAlignedRealPtr(dptr); } @@ -615,7 +618,7 @@ class RestrictAlignedRealPtr #if defined(RAJA_COMPILER_ICC) /// - Real_type& operator[](Index_type i) + constexpr Real_type& operator[](Index_type i) { #if __ICC < 1300 // use alignment intrinsic RAJA_ALIGN_DATA(dptr); @@ -626,7 +629,7 @@ class RestrictAlignedRealPtr } /// - const Real_type& operator[](Index_type i) const + constexpr const Real_type& operator[](Index_type i) const { #if __ICC < 1300 // use alignment intrinsic RAJA_ALIGN_DATA(dptr); @@ -638,7 +641,7 @@ class RestrictAlignedRealPtr #elif defined(RAJA_COMPILER_GNU) /// - Real_type& operator[](Index_type i) + constexpr Real_type& operator[](Index_type i) { #if 1 // NOTE: alignment instrinsic not available for older GNU compilers return ((Real_type * RAJA_RESTRICT) RAJA_ALIGN_DATA(dptr))[i]; @@ -648,7 +651,7 @@ class RestrictAlignedRealPtr } /// - const Real_type& operator[](Index_type i) const + constexpr const Real_type& operator[](Index_type i) const { #if 1 // NOTE: alignment instrinsic not available for older GNU compilers return ((Real_type * RAJA_RESTRICT) RAJA_ALIGN_DATA(dptr))[i]; @@ -659,14 +662,14 @@ class RestrictAlignedRealPtr #elif defined(RAJA_COMPILER_XLC) /// - Real_type& operator[](Index_type i) + constexpr Real_type& operator[](Index_type i) { RAJA_ALIGN_DATA(dptr); return ((Real_type * RAJA_RESTRICT) dptr)[i]; } /// - const Real_type& operator[](Index_type i) const + constexpr const Real_type& operator[](Index_type i) const { RAJA_ALIGN_DATA(dptr); return ((Real_type * RAJA_RESTRICT) dptr)[i]; @@ -674,7 +677,10 @@ class RestrictAlignedRealPtr #elif defined(RAJA_COMPILER_CLANG) /// - Real_type& operator[](Index_type i) { return ((TDRAReal_ptr)dptr)[i]; } + constexpr Real_type& operator[](Index_type i) + { + return ((TDRAReal_ptr)dptr)[i]; + } /// const Real_type& operator[](Index_type i) const @@ -690,12 +696,12 
@@ class RestrictAlignedRealPtr /// /// + operator for (non-const) pointer arithmetic. /// - Real_type* operator+(Index_type i) { return dptr + i; } + constexpr Real_type* operator+(Index_type i) { return dptr + i; } /// /// + operator for const pointer arithmetic. /// - const Real_type* operator+(Index_type i) const { return dptr + i; } + constexpr const Real_type* operator+(Index_type i) const { return dptr + i; } private: Real_type* dptr; @@ -716,11 +722,11 @@ class ConstRestrictComplexPtr /// Ctors and assignment op. /// - ConstRestrictComplexPtr() : dptr(0) { ; } + constexpr ConstRestrictComplexPtr() : dptr(0) { ; } - ConstRestrictComplexPtr(const Complex_type* d) : dptr(d) { ; } + constexpr ConstRestrictComplexPtr(const Complex_type* d) : dptr(d) { ; } - ConstRestrictComplexPtr& operator=(const Complex_type* d) + constexpr ConstRestrictComplexPtr& operator=(const Complex_type* d) { ConstRestrictComplexPtr copy(d); std::swap(dptr, copy.dptr); @@ -734,18 +740,18 @@ class ConstRestrictComplexPtr /// /// Implicit conversion operator to bare const pointer. /// - operator const Complex_type*() const { return dptr; } + constexpr operator const Complex_type*() const { return dptr; } /// /// "Explicit conversion operator" to bare const pointer, /// consistent with boost shared ptr. /// - const Complex_type* get() const { return dptr; } + constexpr const Complex_type* get() const { return dptr; } /// /// Bracket operator. /// - const Complex_type& operator[](Index_type i) const + constexpr const Complex_type& operator[](Index_type i) const { return ((const Complex_type* RAJA_RESTRICT)dptr)[i]; } @@ -753,7 +759,10 @@ class ConstRestrictComplexPtr /// /// + operator for pointer arithmetic. /// - const Complex_type* operator+(Index_type i) const { return dptr + i; } + constexpr const Complex_type* operator+(Index_type i) const + { + return dptr + i; + } private: const Complex_type* dptr; @@ -773,11 +782,11 @@ class RestrictComplexPtr /// Ctors and assignment op. /// - RestrictComplexPtr() : dptr(0) { ; } + constexpr RestrictComplexPtr() : dptr(0) { ; } - RestrictComplexPtr(Complex_type* d) : dptr(d) { ; } + constexpr RestrictComplexPtr(Complex_type* d) : dptr(d) { ; } - RestrictComplexPtr& operator=(Complex_type* d) + constexpr RestrictComplexPtr& operator=(Complex_type* d) { RestrictComplexPtr copy(d); std::swap(dptr, copy.dptr); @@ -791,35 +800,38 @@ class RestrictComplexPtr /// /// Implicit conversion operator to (non-const) bare pointer. /// - operator Complex_type*() { return dptr; } + constexpr operator Complex_type*() { return dptr; } /// /// Implicit conversion operator to const bare pointer. /// - operator const Complex_type*() const { return dptr; } + constexpr operator const Complex_type*() const { return dptr; } /// /// "Explicit conversion operator" to (non-const) bare pointer, /// consistent with boost shared ptr. /// - Complex_type* get() { return dptr; } + constexpr Complex_type* get() { return dptr; } /// /// "Explicit conversion operator" to const bare pointer, /// consistent with boost shared ptr. /// - const Complex_type* get() const { return dptr; } + constexpr const Complex_type* get() const { return dptr; } /// /// Operator that enables implicit conversion from RestrictComplexPtr to /// RestrictComplexConstPtr. /// - operator ConstRestrictComplexPtr() { return ConstRestrictComplexPtr(dptr); } + constexpr operator ConstRestrictComplexPtr() + { + return ConstRestrictComplexPtr(dptr); + } /// /// (Non-const) bracket operator. 
/// - Complex_type& operator[](Index_type i) + constexpr Complex_type& operator[](Index_type i) { return ((Complex_type * RAJA_RESTRICT) dptr)[i]; } @@ -827,7 +839,7 @@ class RestrictComplexPtr /// /// Const bracket operator. /// - const Complex_type& operator[](Index_type i) const + constexpr const Complex_type& operator[](Index_type i) const { return ((Complex_type * RAJA_RESTRICT) dptr)[i]; } @@ -835,12 +847,15 @@ class RestrictComplexPtr /// /// + operator for (non-const) pointer arithmetic. /// - Complex_type* operator+(Index_type i) { return dptr + i; } + constexpr Complex_type* operator+(Index_type i) { return dptr + i; } /// /// + operator for const pointer arithmetic. /// - const Complex_type* operator+(Index_type i) const { return dptr + i; } + constexpr const Complex_type* operator+(Index_type i) const + { + return dptr + i; + } private: Complex_type* dptr; @@ -920,13 +935,15 @@ namespace detail struct DefaultAccessor { template - static RAJA_HOST_DEVICE RAJA_INLINE T get(T* ptr, size_t i) + static RAJA_HOST_DEVICE RAJA_INLINE constexpr T get(T* ptr, size_t i) { return ptr[i]; } template - static RAJA_HOST_DEVICE RAJA_INLINE void set(T* ptr, size_t i, T val) + static RAJA_HOST_DEVICE RAJA_INLINE constexpr void set(T* ptr, + size_t i, + T val) { ptr[i] = val; } @@ -987,14 +1004,14 @@ struct AsIntegerArray return num_integer_type; } - RAJA_HOST_DEVICE constexpr T get_value() const + RAJA_HOST_DEVICE T get_value() const { T value; memcpy(&value, &array[0], sizeof(T)); return value; } - RAJA_HOST_DEVICE constexpr void set_value(T value) + RAJA_HOST_DEVICE void set_value(T value) { memcpy(&array[0], &value, sizeof(T)); } @@ -1007,14 +1024,14 @@ struct AsIntegerArray template struct ScopedAssignment { - ScopedAssignment(T& val, T const& new_val) + constexpr ScopedAssignment(T& val, T const& new_val) : m_ref_to_val(val), m_prev_val(std::move(val)) { m_ref_to_val = new_val; } - ScopedAssignment(T& val, T&& new_val) + constexpr ScopedAssignment(T& val, T&& new_val) : m_ref_to_val(val), m_prev_val(std::move(val)) { @@ -1026,6 +1043,7 @@ struct ScopedAssignment ScopedAssignment& operator=(ScopedAssignment const&) = delete; ScopedAssignment& operator=(ScopedAssignment&&) = delete; + // constexpr in c++20 ~ScopedAssignment() { m_ref_to_val = std::move(m_prev_val); } private: diff --git a/include/RAJA/util/zip.hpp b/include/RAJA/util/zip.hpp index 8bbfbf2c01..83da60f8d4 100644 --- a/include/RAJA/util/zip.hpp +++ b/include/RAJA/util/zip.hpp @@ -55,112 +55,125 @@ struct ZipIterator zip_ref::reference...>; using iterator_category = std::random_access_iterator_tag; - RAJA_HOST_DEVICE inline ZipIterator() : m_iterators() {} + RAJA_HOST_DEVICE RAJA_INLINE constexpr ZipIterator() : m_iterators() {} template...>> - RAJA_HOST_DEVICE inline ZipIterator(Args&&... args) + RAJA_HOST_DEVICE RAJA_INLINE constexpr ZipIterator(Args&&... args) : m_iterators(std::forward(args)...) 
{} - RAJA_HOST_DEVICE inline ZipIterator(const ZipIterator& rhs) + RAJA_HOST_DEVICE RAJA_INLINE constexpr ZipIterator(const ZipIterator& rhs) : m_iterators(rhs.m_iterators) {} - RAJA_HOST_DEVICE inline ZipIterator(ZipIterator&& rhs) + RAJA_HOST_DEVICE RAJA_INLINE constexpr ZipIterator(ZipIterator&& rhs) : m_iterators(std::move(rhs.m_iterators)) {} - RAJA_HOST_DEVICE inline ZipIterator& operator=(const ZipIterator& rhs) + RAJA_HOST_DEVICE RAJA_INLINE constexpr ZipIterator& operator=( + const ZipIterator& rhs) { m_iterators = rhs.m_iterators; return *this; } - RAJA_HOST_DEVICE inline ZipIterator& operator=(ZipIterator&& rhs) + RAJA_HOST_DEVICE RAJA_INLINE constexpr ZipIterator& operator=( + ZipIterator&& rhs) { m_iterators = std::move(rhs.m_iterators); return *this; } - RAJA_HOST_DEVICE inline difference_type get_stride() const { return 1; } + RAJA_HOST_DEVICE RAJA_INLINE constexpr difference_type get_stride() const + { + return 1; + } - RAJA_HOST_DEVICE inline bool operator==(const ZipIterator& rhs) const + RAJA_HOST_DEVICE RAJA_INLINE constexpr bool operator==( + const ZipIterator& rhs) const { return RAJA::get<0>(m_iterators) == RAJA::get<0>(rhs.m_iterators); } - RAJA_HOST_DEVICE inline bool operator!=(const ZipIterator& rhs) const + RAJA_HOST_DEVICE RAJA_INLINE constexpr bool operator!=( + const ZipIterator& rhs) const { return RAJA::get<0>(m_iterators) != RAJA::get<0>(rhs.m_iterators); } - RAJA_HOST_DEVICE inline bool operator>(const ZipIterator& rhs) const + RAJA_HOST_DEVICE RAJA_INLINE constexpr bool operator>( + const ZipIterator& rhs) const { return RAJA::get<0>(m_iterators) > RAJA::get<0>(rhs.m_iterators); } - RAJA_HOST_DEVICE inline bool operator<(const ZipIterator& rhs) const + RAJA_HOST_DEVICE RAJA_INLINE constexpr bool operator<( + const ZipIterator& rhs) const { return RAJA::get<0>(m_iterators) < RAJA::get<0>(rhs.m_iterators); } - RAJA_HOST_DEVICE inline bool operator>=(const ZipIterator& rhs) const + RAJA_HOST_DEVICE RAJA_INLINE constexpr bool operator>=( + const ZipIterator& rhs) const { return RAJA::get<0>(m_iterators) >= RAJA::get<0>(rhs.m_iterators); } - RAJA_HOST_DEVICE inline bool operator<=(const ZipIterator& rhs) const + RAJA_HOST_DEVICE RAJA_INLINE constexpr bool operator<=( + const ZipIterator& rhs) const { return RAJA::get<0>(m_iterators) <= RAJA::get<0>(rhs.m_iterators); } - RAJA_HOST_DEVICE inline ZipIterator& operator++() + RAJA_HOST_DEVICE RAJA_INLINE constexpr ZipIterator& operator++() { detail::zip_for_each(m_iterators, detail::PreInc {}); return *this; } - RAJA_HOST_DEVICE inline ZipIterator& operator--() + RAJA_HOST_DEVICE RAJA_INLINE constexpr ZipIterator& operator--() { detail::zip_for_each(m_iterators, detail::PreDec {}); return *this; } - RAJA_HOST_DEVICE inline ZipIterator operator++(int) + RAJA_HOST_DEVICE RAJA_INLINE constexpr ZipIterator operator++(int) { ZipIterator tmp(*this); ++(*this); return tmp; } - RAJA_HOST_DEVICE inline ZipIterator operator--(int) + RAJA_HOST_DEVICE RAJA_INLINE constexpr ZipIterator operator--(int) { ZipIterator tmp(*this); --(*this); return tmp; } - RAJA_HOST_DEVICE inline ZipIterator& operator+=(const difference_type& rhs) + RAJA_HOST_DEVICE RAJA_INLINE constexpr ZipIterator& operator+=( + const difference_type& rhs) { detail::zip_for_each(m_iterators, detail::PlusEq {rhs}); return *this; } - RAJA_HOST_DEVICE inline ZipIterator& operator-=(const difference_type& rhs) + RAJA_HOST_DEVICE RAJA_INLINE constexpr ZipIterator& operator-=( + const difference_type& rhs) { detail::zip_for_each(m_iterators, detail::MinusEq {rhs}); 
return *this; } - RAJA_HOST_DEVICE inline difference_type operator-( + RAJA_HOST_DEVICE RAJA_INLINE constexpr difference_type operator-( const ZipIterator& rhs) const { return RAJA::get<0>(m_iterators) - RAJA::get<0>(rhs.m_iterators); } - RAJA_HOST_DEVICE inline ZipIterator operator+( + RAJA_HOST_DEVICE RAJA_INLINE constexpr ZipIterator operator+( const difference_type& rhs) const { ZipIterator tmp(*this); @@ -168,7 +181,7 @@ struct ZipIterator return tmp; } - RAJA_HOST_DEVICE inline ZipIterator operator-( + RAJA_HOST_DEVICE RAJA_INLINE constexpr ZipIterator operator-( const difference_type& rhs) const { ZipIterator tmp(*this); @@ -176,31 +189,34 @@ struct ZipIterator return tmp; } - RAJA_HOST_DEVICE friend ZipIterator operator+(difference_type lhs, - const ZipIterator& rhs) + RAJA_HOST_DEVICE RAJA_INLINE constexpr friend ZipIterator operator+( + difference_type lhs, + const ZipIterator& rhs) { ZipIterator tmp(rhs); tmp += lhs; return tmp; } - RAJA_HOST_DEVICE inline reference operator*() const + RAJA_HOST_DEVICE RAJA_INLINE constexpr reference operator*() const { return deref_helper(camp::make_idx_seq_t {}); } // TODO:: figure out what to do with this - // RAJA_HOST_DEVICE inline reference operator->() const + // RAJA_HOST_DEVICE RAJA_INLINE constexpr reference operator->() const // { // return *(*this); // } - RAJA_HOST_DEVICE reference operator[](difference_type rhs) const + RAJA_HOST_DEVICE RAJA_INLINE constexpr reference operator[]( + difference_type rhs) const { return *((*this) + rhs); } - RAJA_HOST_DEVICE friend inline void safe_iter_swap(ZipIterator lhs, - ZipIterator rhs) + RAJA_HOST_DEVICE RAJA_INLINE constexpr friend void safe_iter_swap( + ZipIterator lhs, + ZipIterator rhs) { detail::zip_for_each(lhs.m_iterators, rhs.m_iterators, detail::IterSwap {}); } @@ -209,7 +225,8 @@ struct ZipIterator zip_val...> m_iterators; template - RAJA_HOST_DEVICE inline reference deref_helper(camp::idx_seq) const + RAJA_HOST_DEVICE RAJA_INLINE constexpr reference deref_helper( + camp::idx_seq) const { return reference(*RAJA::get(m_iterators)...); } @@ -220,7 +237,8 @@ struct ZipIterator a single ZipIterator object. */ template -RAJA_HOST_DEVICE auto zip(Args&&... args) -> ZipIterator...> +RAJA_HOST_DEVICE RAJA_INLINE constexpr auto zip(Args&&... args) + -> ZipIterator...> { return {std::forward(args)...}; } @@ -230,7 +248,7 @@ RAJA_HOST_DEVICE auto zip(Args&&... args) -> ZipIterator...> ZipIterator objects. */ template -RAJA_HOST_DEVICE RAJA_INLINE auto zip_span(Args&&... args) +RAJA_HOST_DEVICE RAJA_INLINE constexpr auto zip_span(Args&&... args) -> Span>...>, typename ZipIterator< detail::ContainerIter>...>::difference_type> @@ -251,9 +269,12 @@ RAJA_HOST_DEVICE RAJA_INLINE auto zip_span(Args&&... args) template struct CompareFirst { - RAJA_HOST_DEVICE inline CompareFirst(Compare comp_) : comp(comp_) {} + RAJA_HOST_DEVICE RAJA_INLINE constexpr CompareFirst(Compare comp_) + : comp(comp_) + {} - RAJA_HOST_DEVICE inline bool operator()(T const& lhs, T const& rhs) const + RAJA_HOST_DEVICE RAJA_INLINE constexpr bool operator()(T const& lhs, + T const& rhs) const { return comp(RAJA::get<0>(lhs), RAJA::get<0>(rhs)); } @@ -267,7 +288,8 @@ struct CompareFirst like objects of type T. 
*/ template -RAJA_HOST_DEVICE auto compare_first(Compare comp) -> CompareFirst +RAJA_HOST_DEVICE RAJA_INLINE constexpr auto compare_first(Compare comp) + -> CompareFirst { return {comp}; } diff --git a/include/RAJA/util/zip_tuple.hpp b/include/RAJA/util/zip_tuple.hpp index 03bd098d99..fb2245f978 100644 --- a/include/RAJA/util/zip_tuple.hpp +++ b/include/RAJA/util/zip_tuple.hpp @@ -49,15 +49,15 @@ using zip_tuple_element_t = typename zip_tuple_element::type; // the reference type returned by get depends on the reference type // of the zip_tuple that get is called on template -RAJA_HOST_DEVICE constexpr RAJA::zip_tuple_element_t>& -get(zip_tuple& z) noexcept +RAJA_HOST_DEVICE RAJA_INLINE constexpr RAJA:: + zip_tuple_element_t>& + get(zip_tuple& z) noexcept { return z.template get(); } template -RAJA_HOST_DEVICE constexpr RAJA:: +RAJA_HOST_DEVICE RAJA_INLINE constexpr RAJA:: zip_tuple_element_t> const& get(zip_tuple const& z) noexcept { @@ -65,7 +65,7 @@ RAJA_HOST_DEVICE constexpr RAJA:: } template -RAJA_HOST_DEVICE constexpr std::remove_reference_t< +RAJA_HOST_DEVICE RAJA_INLINE constexpr std::remove_reference_t< RAJA::zip_tuple_element_t>>&& get(zip_tuple&& z) noexcept { @@ -73,7 +73,7 @@ get(zip_tuple&& z) noexcept } template -RAJA_HOST_DEVICE constexpr std::remove_reference_t< +RAJA_HOST_DEVICE RAJA_INLINE constexpr std::remove_reference_t< RAJA::zip_tuple_element_t>> const&& get(zip_tuple const&& z) noexcept { @@ -86,7 +86,7 @@ namespace detail struct PassThrough { template - RAJA_HOST_DEVICE RAJA_INLINE auto operator()(T&& t) const + RAJA_HOST_DEVICE RAJA_INLINE constexpr auto operator()(T&& t) const -> decltype(std::forward(t)) { return std::forward(t); @@ -96,7 +96,7 @@ struct PassThrough struct Move { template - RAJA_HOST_DEVICE RAJA_INLINE auto operator()(T&& t) const + RAJA_HOST_DEVICE RAJA_INLINE constexpr auto operator()(T&& t) const -> decltype(std::move(t)) { return std::move(t); @@ -106,7 +106,7 @@ struct Move struct PreInc { template - RAJA_HOST_DEVICE inline auto operator()(Iter&& iter) const + RAJA_HOST_DEVICE RAJA_INLINE constexpr auto operator()(Iter&& iter) const -> decltype(++std::forward(iter)) { return ++std::forward(iter); @@ -116,7 +116,7 @@ struct PreInc struct PreDec { template - RAJA_HOST_DEVICE inline auto operator()(Iter&& iter) const + RAJA_HOST_DEVICE RAJA_INLINE constexpr auto operator()(Iter&& iter) const -> decltype(--std::forward(iter)) { return --std::forward(iter); @@ -129,7 +129,7 @@ struct PlusEq const difference_type& rhs; template - RAJA_HOST_DEVICE inline auto operator()(Iter&& iter) const + RAJA_HOST_DEVICE RAJA_INLINE constexpr auto operator()(Iter&& iter) const -> decltype(std::forward(iter) += rhs) { return std::forward(iter) += rhs; @@ -142,7 +142,7 @@ struct MinusEq const difference_type& rhs; template - RAJA_HOST_DEVICE inline auto operator()(Iter&& iter) const + RAJA_HOST_DEVICE RAJA_INLINE constexpr auto operator()(Iter&& iter) const -> decltype(std::forward(iter) -= rhs) { return std::forward(iter) -= rhs; @@ -152,7 +152,7 @@ struct MinusEq struct DeRef { template - RAJA_HOST_DEVICE inline auto operator()(Iter&& iter) const + RAJA_HOST_DEVICE RAJA_INLINE constexpr auto operator()(Iter&& iter) const -> decltype(*std::forward(iter)) { return *std::forward(iter); @@ -162,7 +162,7 @@ struct DeRef struct Swap { template - RAJA_HOST_DEVICE inline int operator()(T0&& t0, T1&& t1) const + RAJA_HOST_DEVICE RAJA_INLINE constexpr int operator()(T0&& t0, T1&& t1) const { using camp::safe_swap; safe_swap(std::forward(t0), std::forward(t1)); @@ -173,7 
+173,7 @@ struct Swap struct IterSwap { template - RAJA_HOST_DEVICE inline int operator()(T0&& t0, T1&& t1) const + RAJA_HOST_DEVICE RAJA_INLINE constexpr int operator()(T0&& t0, T1&& t1) const { using RAJA::safe_iter_swap; safe_iter_swap(std::forward(t0), std::forward(t1)); @@ -185,9 +185,10 @@ struct IterSwap \brief Call f on each member of t (f(t)...). */ template -RAJA_HOST_DEVICE inline void zip_for_each_impl(Tuple&& t, - F&& f, - camp::idx_seq) +RAJA_HOST_DEVICE RAJA_INLINE constexpr void zip_for_each_impl( + Tuple&& t, + F&& f, + camp::idx_seq) { camp::sink(std::forward(f)(RAJA::get(std::forward(t)))...); } @@ -196,10 +197,11 @@ RAJA_HOST_DEVICE inline void zip_for_each_impl(Tuple&& t, \brief Call f on each member of t0 and t1 (f(t0, t1)...). */ template -RAJA_HOST_DEVICE inline void zip_for_each_impl(Tuple0&& t0, - Tuple1&& t1, - F&& f, - camp::idx_seq) +RAJA_HOST_DEVICE RAJA_INLINE constexpr void zip_for_each_impl( + Tuple0&& t0, + Tuple1&& t1, + F&& f, + camp::idx_seq) { camp::sink(std::forward(f)(RAJA::get(std::forward(t0)), RAJA::get(std::forward(t1)))...); @@ -209,7 +211,7 @@ RAJA_HOST_DEVICE inline void zip_for_each_impl(Tuple0&& t0, \brief Call f on each member of t (f(t)...). */ template -RAJA_HOST_DEVICE inline void zip_for_each(Tuple&& t, F&& f) +RAJA_HOST_DEVICE RAJA_INLINE constexpr void zip_for_each(Tuple&& t, F&& f) { zip_for_each_impl(std::forward(t), std::forward(f), typename camp::decay::IdxSeq {}); @@ -219,7 +221,9 @@ RAJA_HOST_DEVICE inline void zip_for_each(Tuple&& t, F&& f) \brief Call f on each member of t0 and t1 (f(t0, t1)...). */ template -RAJA_HOST_DEVICE inline void zip_for_each(Tuple0&& t0, Tuple1&& t1, F&& f) +RAJA_HOST_DEVICE RAJA_INLINE constexpr void zip_for_each(Tuple0&& t0, + Tuple1&& t1, + F&& f) { static_assert(std::is_same::IdxSeq, typename camp::decay::IdxSeq>::value, @@ -257,7 +261,7 @@ struct zip_tuple template< typename... Os, typename = concepts::enable_if...>> - RAJA_HOST_DEVICE RAJA_INLINE zip_tuple(Os&&... os) + RAJA_HOST_DEVICE RAJA_INLINE constexpr zip_tuple(Os&&... os) : m_tuple(std::forward(os)...) {} @@ -266,63 +270,67 @@ struct zip_tuple typename = concepts::enable_if::type>...>> - zip_tuple& assign(Os&&... os) + RAJA_HOST_DEVICE RAJA_INLINE constexpr zip_tuple& assign(Os&&... 
os) { return assign_helper(IdxSeq {}, std::forward(os)...); } // copy and move constructors - RAJA_HOST_DEVICE RAJA_INLINE zip_tuple(zip_tuple& o) : zip_tuple(o, IdxSeq {}) + RAJA_HOST_DEVICE RAJA_INLINE constexpr zip_tuple(zip_tuple& o) + : zip_tuple(o, IdxSeq {}) {} - RAJA_HOST_DEVICE RAJA_INLINE zip_tuple(zip_tuple const& o) + RAJA_HOST_DEVICE RAJA_INLINE constexpr zip_tuple(zip_tuple const& o) : zip_tuple(o, IdxSeq {}) {} - RAJA_HOST_DEVICE RAJA_INLINE zip_tuple(zip_tuple&& o) + RAJA_HOST_DEVICE RAJA_INLINE constexpr zip_tuple(zip_tuple&& o) : zip_tuple(std::move(o), IdxSeq {}) {} // move if is_val, pass-through otherwise // copy and move assignment operators - RAJA_HOST_DEVICE RAJA_INLINE zip_tuple& operator=(zip_tuple& o) + RAJA_HOST_DEVICE RAJA_INLINE constexpr zip_tuple& operator=(zip_tuple& o) { return assign_helper(o, IdxSeq {}); } - RAJA_HOST_DEVICE RAJA_INLINE zip_tuple& operator=(zip_tuple const& o) + RAJA_HOST_DEVICE RAJA_INLINE constexpr zip_tuple& operator=( + zip_tuple const& o) { return assign_helper(o, IdxSeq {}); } - RAJA_HOST_DEVICE RAJA_INLINE zip_tuple& operator=(zip_tuple&& o) + RAJA_HOST_DEVICE RAJA_INLINE constexpr zip_tuple& operator=(zip_tuple&& o) { return assign_helper(std::move(o), IdxSeq {}); } // copy and move constructors from opp_tuple type zip_tuples - RAJA_HOST_DEVICE RAJA_INLINE zip_tuple(opp_tuple& o) : zip_tuple(o, IdxSeq {}) + RAJA_HOST_DEVICE RAJA_INLINE constexpr zip_tuple(opp_tuple& o) + : zip_tuple(o, IdxSeq {}) {} - RAJA_HOST_DEVICE RAJA_INLINE zip_tuple(opp_tuple const& o) + RAJA_HOST_DEVICE RAJA_INLINE constexpr zip_tuple(opp_tuple const& o) : zip_tuple(o, IdxSeq {}) {} - RAJA_HOST_DEVICE RAJA_INLINE zip_tuple(opp_tuple&& o) + RAJA_HOST_DEVICE RAJA_INLINE constexpr zip_tuple(opp_tuple&& o) : zip_tuple(std::move(o), IdxSeq {}) {} // move if is_val, pass-through otherwise // copy and move assignment operators from opp_tuple type zip_tuples - RAJA_HOST_DEVICE RAJA_INLINE zip_tuple& operator=(opp_tuple& o) + RAJA_HOST_DEVICE RAJA_INLINE constexpr zip_tuple& operator=(opp_tuple& o) { return assign_helper(o, IdxSeq {}); } - RAJA_HOST_DEVICE RAJA_INLINE zip_tuple& operator=(opp_tuple const& o) + RAJA_HOST_DEVICE RAJA_INLINE constexpr zip_tuple& operator=( + opp_tuple const& o) { return assign_helper(o, IdxSeq {}); } - RAJA_HOST_DEVICE RAJA_INLINE zip_tuple& operator=(opp_tuple&& o) + RAJA_HOST_DEVICE RAJA_INLINE constexpr zip_tuple& operator=(opp_tuple&& o) { return assign_helper(std::move(o), IdxSeq {}); } @@ -331,21 +339,22 @@ struct zip_tuple // the reference type returned by get depends on the reference type // of the zip_tuple that get is called on template - RAJA_HOST_DEVICE constexpr RAJA::tuple_element_t& + RAJA_HOST_DEVICE RAJA_INLINE constexpr RAJA::tuple_element_t& get() & noexcept { return RAJA::get(m_tuple); } template - RAJA_HOST_DEVICE constexpr RAJA::tuple_element_t const& get() - const& noexcept + RAJA_HOST_DEVICE RAJA_INLINE constexpr RAJA:: + tuple_element_t const& + get() const& noexcept { return RAJA::get(m_tuple); } template - RAJA_HOST_DEVICE constexpr std::remove_reference_t< + RAJA_HOST_DEVICE RAJA_INLINE constexpr std::remove_reference_t< RAJA::tuple_element_t>&& get() && noexcept { @@ -353,7 +362,7 @@ struct zip_tuple } template - RAJA_HOST_DEVICE constexpr std::remove_reference_t< + RAJA_HOST_DEVICE RAJA_INLINE constexpr std::remove_reference_t< RAJA::tuple_element_t> const&& get() const&& noexcept { @@ -361,16 +370,16 @@ struct zip_tuple } // safe_swap that calls swap on each pair in the tuple - RAJA_HOST_DEVICE friend 
RAJA_INLINE void safe_swap(zip_tuple& lhs, - zip_tuple& rhs) + RAJA_HOST_DEVICE RAJA_INLINE constexpr friend void safe_swap(zip_tuple& lhs, + zip_tuple& rhs) { detail::zip_for_each(lhs, rhs, detail::Swap {}); } // safe_swap for swapping zip_tuples with opposite is_val // calls swap on each pair in the tuple - RAJA_HOST_DEVICE friend RAJA_INLINE void safe_swap(zip_tuple& lhs, - opp_tuple& rhs) + RAJA_HOST_DEVICE RAJA_INLINE constexpr friend void safe_swap(zip_tuple& lhs, + opp_tuple& rhs) { detail::zip_for_each(lhs, rhs, detail::Swap {}); } @@ -391,7 +400,9 @@ struct zip_tuple // assignment helper from types convertible to Ts template - zip_tuple& assign_helper(camp::idx_seq, Os&&... os) + RAJA_HOST_DEVICE RAJA_INLINE constexpr zip_tuple& assign_helper( + camp::idx_seq, + Os&&... os) { camp::sink(get() = std::forward(os)...); return *this; @@ -399,25 +410,28 @@ struct zip_tuple // copy and move constructor helpers template - RAJA_HOST_DEVICE RAJA_INLINE zip_tuple(zip_tuple& o, camp::idx_seq) + RAJA_HOST_DEVICE RAJA_INLINE constexpr zip_tuple(zip_tuple& o, + camp::idx_seq) : zip_tuple(RAJA::get(o)...) {} template - RAJA_HOST_DEVICE RAJA_INLINE zip_tuple(zip_tuple const& o, - camp::idx_seq) + RAJA_HOST_DEVICE RAJA_INLINE constexpr zip_tuple(zip_tuple const& o, + camp::idx_seq) : zip_tuple(RAJA::get(o)...) {} template - RAJA_HOST_DEVICE RAJA_INLINE zip_tuple(zip_tuple&& o, camp::idx_seq) + RAJA_HOST_DEVICE RAJA_INLINE constexpr zip_tuple(zip_tuple&& o, + camp::idx_seq) : zip_tuple(RAJA::get(IsValMover {}(o))...) {} // move if is_val, pass-through otherwise // copy and move assignment operator helpers template - RAJA_HOST_DEVICE RAJA_INLINE zip_tuple& assign_helper(zip_tuple& o, - camp::idx_seq) + RAJA_HOST_DEVICE RAJA_INLINE constexpr zip_tuple& assign_helper( + zip_tuple& o, + camp::idx_seq) { if (this != &o) { @@ -427,8 +441,9 @@ struct zip_tuple } template - RAJA_HOST_DEVICE RAJA_INLINE zip_tuple& assign_helper(zip_tuple const& o, - camp::idx_seq) + RAJA_HOST_DEVICE RAJA_INLINE constexpr zip_tuple& assign_helper( + zip_tuple const& o, + camp::idx_seq) { if (this != &o) { @@ -438,8 +453,9 @@ struct zip_tuple } template - RAJA_HOST_DEVICE RAJA_INLINE zip_tuple& assign_helper(zip_tuple&& o, - camp::idx_seq) + RAJA_HOST_DEVICE RAJA_INLINE constexpr zip_tuple& assign_helper( + zip_tuple&& o, + camp::idx_seq) { if (this != &o) { @@ -450,41 +466,46 @@ struct zip_tuple // copy and move constructor helpers from opp_tuple type zip_tuples template - RAJA_HOST_DEVICE RAJA_INLINE zip_tuple(opp_tuple& o, camp::idx_seq) + RAJA_HOST_DEVICE RAJA_INLINE constexpr zip_tuple(opp_tuple& o, + camp::idx_seq) : zip_tuple(RAJA::get(o)...) {} template - RAJA_HOST_DEVICE RAJA_INLINE zip_tuple(opp_tuple const& o, - camp::idx_seq) + RAJA_HOST_DEVICE RAJA_INLINE constexpr zip_tuple(opp_tuple const& o, + camp::idx_seq) : zip_tuple(RAJA::get(o)...) {} template - RAJA_HOST_DEVICE RAJA_INLINE zip_tuple(opp_tuple&& o, camp::idx_seq) + RAJA_HOST_DEVICE RAJA_INLINE constexpr zip_tuple(opp_tuple&& o, + camp::idx_seq) : zip_tuple(RAJA::get(IsValMover {}(o))...) 
{} // move if is_val, pass-through otherwise // copy and move assignment operator helpers from opp_tuple type zip_tuples template - RAJA_HOST_DEVICE RAJA_INLINE zip_tuple& assign_helper(opp_tuple& o, - camp::idx_seq) + RAJA_HOST_DEVICE RAJA_INLINE constexpr zip_tuple& assign_helper( + opp_tuple& o, + camp::idx_seq) { camp::sink(get() = RAJA::get(o)...); return *this; } template - RAJA_HOST_DEVICE RAJA_INLINE zip_tuple& assign_helper(opp_tuple const& o, - camp::idx_seq) + RAJA_HOST_DEVICE RAJA_INLINE constexpr zip_tuple& assign_helper( + opp_tuple const& o, + camp::idx_seq) { camp::sink(get() = RAJA::get(o)...); return *this; } template - RAJA_HOST_DEVICE RAJA_INLINE zip_tuple& assign_helper(opp_tuple&& o, - camp::idx_seq) + RAJA_HOST_DEVICE RAJA_INLINE constexpr zip_tuple& assign_helper( + opp_tuple&& o, + camp::idx_seq) { camp::sink(get() = RAJA::get(std::move(o))...); return *this; diff --git a/scripts/gitlab/build_and_test.sh b/scripts/gitlab/build_and_test.sh index 3297214ff3..6bb684e9ec 100755 --- a/scripts/gitlab/build_and_test.sh +++ b/scripts/gitlab/build_and_test.sh @@ -190,7 +190,7 @@ then timed_message "Cleaning working directory" # Map CPU core allocations - declare -A core_counts=(["lassen"]=40 ["poodle"]=28 ["dane"]=28 ["corona"]=32 ["rzansel"]=48 ["tioga"]=32 ["tuolumne"]=48) + declare -A core_counts=(["lassen"]=40 ["poodle"]=28 ["dane"]=28 ["matrix"]=28 ["corona"]=32 ["rzansel"]=48 ["tioga"]=32 ["tuolumne"]=48) # If building, then delete everything first # NOTE: 'cmake --build . -j core_counts' attempts to reduce individual build resources. @@ -204,7 +204,7 @@ then # Shared allocation: Allows build_and_test.sh to run within a sub-allocation (see CI config). # Use /dev/shm: Prevent MPI tests from running on a node where the build dir doesn't exist. cmake_options="" - if [[ "${truehostname}" == "poodle" || "${truehostname}" == "dane" ]] + if [[ "${truehostname}" == "poodle" || "${truehostname}" == "dane" || "${truehostname}" == "matrix" ]] then cmake_options="-DBLT_MPI_COMMAND_APPEND:STRING=--overlap" fi @@ -257,24 +257,19 @@ then echo "[Error]: Failure(s) while running CTest" && exit 1 fi - if grep -q -i "ENABLE_HIP.*ON" ${hostconfig_path} + if [[ ! -d ${install_dir} ]] then - echo "[Warning]: Not testing install with HIP" - else - if [[ ! -d ${install_dir} ]] - then - echo "[Error]: Install directory not found : ${install_dir}" && exit 1 - fi - - cd ${install_dir}/examples/RAJA/using-with-cmake - mkdir build && cd build - if ! $cmake_exe -C ../host-config.cmake ..; then - echo "[Error]: Running $cmake_exe for using-with-cmake test" && exit 1 - fi - - if ! make; then - echo "[Error]: Running make for using-with-cmake test" && exit 1 - fi + echo "[Error]: Install directory not found : ${install_dir}" && exit 1 + fi + + cd ${install_dir}/examples/RAJA/using-with-cmake + mkdir build && cd build + if ! $cmake_exe -C ../host-config.cmake ..; then + echo "[Error]: Running $cmake_exe for using-with-cmake test" && exit 1 + fi + + if ! 
make; then
+    echo "[Error]: Running make for using-with-cmake test" && exit 1
   fi
 
   timed_message "RAJA tests completed"
diff --git a/scripts/lc-builds/corona_sycl.sh b/scripts/lc-builds/corona_sycl.sh
index b84d594b1a..b375b7ef45 100755
--- a/scripts/lc-builds/corona_sycl.sh
+++ b/scripts/lc-builds/corona_sycl.sh
@@ -13,7 +13,7 @@ if [[ $# -lt 1 ]]; then
   echo "  1) SYCL compiler installation path"
   echo
   echo "For example: "
-  echo "    corona_sycl.sh /usr/workspace/raja-dev/clang_sycl_730cd3a5275f_hip_gcc10.3.1_rocm6.0.2"
+  echo "    corona_sycl.sh /usr/workspace/raja-dev/clang_sycl_16b7bcb09915_hip_gcc10.3.1_rocm6.4.2"
   exit
 fi
 
diff --git a/scripts/lc-builds/toss4_cce_omptarget.sh b/scripts/lc-builds/toss4_cce_omptarget.sh
new file mode 100755
index 0000000000..96a9712f85
--- /dev/null
+++ b/scripts/lc-builds/toss4_cce_omptarget.sh
@@ -0,0 +1,66 @@
+#!/usr/bin/env bash
+
+###############################################################################
+# Copyright (c) 2016-25, Lawrence Livermore National Security, LLC
+# and RAJA project contributors. See the RAJA/LICENSE file for details.
+#
+# SPDX-License-Identifier: (BSD-3-Clause)
+###############################################################################
+
+if [[ $# -lt 2 ]]; then
+  echo
+  echo "You must pass 2 or more arguments to the script (in this order): "
+  echo "  1) compiler version number"
+  echo "  2) HIP compute architecture"
+  echo "  3...) optional arguments to cmake"
+  echo
+  echo "For example: "
+  echo "    toss4_cce_omptarget.sh 20.0.0-magic gfx942"
+  exit
+fi
+
+COMP_VER=$1
+HIP_ARCH=$2
+shift 2
+
+HOSTCONFIG="cce_omptarget_X"
+
+BUILD_SUFFIX=lc_toss4-cce-${COMP_VER}-${HIP_ARCH}-omptarget
+
+echo
+echo "Creating build directory build_${BUILD_SUFFIX} and generating configuration in it"
+echo "Configuration extra arguments:"
+echo "   $@"
+echo
+
+rm -rf build_${BUILD_SUFFIX} >/dev/null
+mkdir build_${BUILD_SUFFIX} && cd build_${BUILD_SUFFIX}
+
+
+module load cmake/3.24.2
+
+cmake \
+  -DCMAKE_BUILD_TYPE=Release \
+  -DHIP_ARCH=${HIP_ARCH} \
+  -DCMAKE_C_COMPILER="/usr/tce/packages/cce/cce-${COMP_VER}/bin/craycc" \
+  -DCMAKE_CXX_COMPILER="/usr/tce/packages/cce/cce-${COMP_VER}/bin/crayCC" \
+  -DBLT_CXX_STD=c++17 \
+  -DENABLE_CLANGFORMAT=Off \
+  -C "../host-configs/lc-builds/toss4/${HOSTCONFIG}.cmake" \
+  -DENABLE_HIP=OFF \
+  -DENABLE_OPENMP=ON \
+  -DRAJA_ENABLE_TARGET_OPENMP=ON \
+  -DENABLE_CUDA=OFF \
+  -DENABLE_BENCHMARKS=On \
+  -DCMAKE_INSTALL_PREFIX=../install_${BUILD_SUFFIX} \
+  "$@" \
+  ..
+
+echo
+echo "***********************************************************************"
+echo
+echo "cd into directory build_${BUILD_SUFFIX} and run make to build RAJA"
+echo
+echo "  srun -n1 make"
+echo
+echo "***********************************************************************"
diff --git a/scripts/lc-builds/toss4_nvcc_clang.sh b/scripts/lc-builds/toss4_nvcc_clang.sh
new file mode 100755
index 0000000000..dac9fef11b
--- /dev/null
+++ b/scripts/lc-builds/toss4_nvcc_clang.sh
@@ -0,0 +1,70 @@
+#!/usr/bin/env bash
+
+###############################################################################
+# Copyright (c) 2016-25, Lawrence Livermore National Security, LLC
+# and RAJA project contributors. See the RAJA/LICENSE file for details.
+#
+# SPDX-License-Identifier: (BSD-3-Clause)
+###############################################################################
+
+if [[ $# -lt 3 ]]; then
+  echo
+  echo "You must pass 3 arguments to the script (in this order): "
+  echo "  1) compiler version number for nvcc"
+  echo "  2) CUDA compute architecture (number only, e.g., '90' not 'sm_90')"
+  echo "  3) compiler version number for clang"
+  echo
+  echo "For example: "
+  echo "    toss4_nvcc_clang.sh 12.6.0 90 14.0.6"
+  echo
+  echo "    toss4_nvcc_clang.sh 12.9.1 90 19.1.3-magic"
+  echo "    (note: a compilation issue with one RAJA benchmark code)"
+  exit
+fi
+
+COMP_NVCC_VER=$1
+COMP_ARCH=$2
+COMP_CLANG_VER=$3
+shift 3
+
+BUILD_SUFFIX=lc_toss4-nvcc${COMP_NVCC_VER}-${COMP_ARCH}-clang${COMP_CLANG_VER}
+
+echo
+echo "Creating build directory build_${BUILD_SUFFIX} and generating configuration in it"
+echo "Configuration extra arguments:"
+echo "   $@"
+echo
+
+rm -rf build_${BUILD_SUFFIX} >/dev/null
+mkdir build_${BUILD_SUFFIX} && cd build_${BUILD_SUFFIX}
+
+module load cmake/3.25.2
+
+cmake \
+  -DCMAKE_BUILD_TYPE=Release \
+  -DCMAKE_CXX_COMPILER=/usr/tce/packages/clang/clang-${COMP_CLANG_VER}/bin/clang++ \
+  -DBLT_CXX_STD=c++17 \
+  -C ../host-configs/lc-builds/toss4/nvcc_clang_X.cmake \
+  -DENABLE_CLANGFORMAT=Off \
+  -DCLANGFORMAT_EXECUTABLE=/usr/tce/packages/clang/clang-14.0.6/bin/clang-format \
+  -DENABLE_OPENMP=On \
+  -DENABLE_CUDA=On \
+  -DRAJA_ENABLE_NVTX=On \
+  -DCUDA_TOOLKIT_ROOT_DIR=/usr/tce/packages/cuda/cuda-${COMP_NVCC_VER} \
+  -DCMAKE_CUDA_COMPILER=/usr/tce/packages/cuda/cuda-${COMP_NVCC_VER}/bin/nvcc \
+  -DCMAKE_CUDA_ARCHITECTURES=${COMP_ARCH} \
+  -DENABLE_BENCHMARKS=On \
+  -DCMAKE_INSTALL_PREFIX=../install_${BUILD_SUFFIX} \
+  "$@" \
+  ..
+
+echo
+echo "***********************************************************************"
+echo
+echo "cd into directory build_${BUILD_SUFFIX} and run make to build RAJA"
+echo
+echo "On LC machines, you are likely to hit cudafe++ signal errors due to"
+echo "insufficient virtual memory if you try to build in parallel using all"
+echo "available cores; i.e., make -j. Try backing off to make -j 16."
+echo
+echo "***********************************************************************"
diff --git a/scripts/lc-builds/toss4_nvcc_gcc.sh b/scripts/lc-builds/toss4_nvcc_gcc.sh
new file mode 100755
index 0000000000..d5b78b5da3
--- /dev/null
+++ b/scripts/lc-builds/toss4_nvcc_gcc.sh
@@ -0,0 +1,69 @@
+#!/usr/bin/env bash
+
+###############################################################################
+# Copyright (c) 2016-25, Lawrence Livermore National Security, LLC
+# and RAJA project contributors. See the RAJA/LICENSE file for details.
+#
+# SPDX-License-Identifier: (BSD-3-Clause)
+###############################################################################
+
+if [[ $# -lt 3 ]]; then
+  echo
+  echo "You must pass 3 arguments to the script (in this order): "
+  echo "  1) compiler version number for nvcc"
+  echo "  2) CUDA compute architecture (number only, e.g., '90' not 'sm_90')"
+  echo "  3) compiler version number for gcc"
+  echo
+  echo "For example: "
+  echo "    toss4_nvcc_gcc.sh 12.6.0 90 13.3.1-magic"
+  echo
+  echo "    toss4_nvcc_gcc.sh 12.9.1 90 13.3.1-magic"
+  echo "    (note: a compilation issue with one RAJA benchmark code)"
+  exit
+fi
+
+COMP_NVCC_VER=$1
+COMP_ARCH=$2
+COMP_GCC_VER=$3
+shift 3
+
+BUILD_SUFFIX=lc_toss4-nvcc${COMP_NVCC_VER}-${COMP_ARCH}-gcc${COMP_GCC_VER}
+
+echo
+echo "Creating build directory build_${BUILD_SUFFIX} and generating configuration in it"
+echo "Configuration extra arguments:"
+echo "   $@"
+echo
+
+rm -rf build_${BUILD_SUFFIX} >/dev/null
+mkdir build_${BUILD_SUFFIX} && cd build_${BUILD_SUFFIX}
+
+module load cmake/3.25.2
+
+cmake \
+  -DCMAKE_BUILD_TYPE=Release \
+  -DCMAKE_CXX_COMPILER=/usr/tce/packages/gcc/gcc-${COMP_GCC_VER}/bin/g++ \
+  -DBLT_CXX_STD=c++17 \
+  -C ../host-configs/lc-builds/toss4/nvcc_gcc_X.cmake \
+  -DENABLE_CLANGFORMAT=Off \
+  -DENABLE_OPENMP=On \
+  -DENABLE_CUDA=On \
+  -DRAJA_ENABLE_NVTX=On \
+  -DCUDA_TOOLKIT_ROOT_DIR=/usr/tce/packages/cuda/cuda-${COMP_NVCC_VER} \
+  -DCMAKE_CUDA_COMPILER=/usr/tce/packages/cuda/cuda-${COMP_NVCC_VER}/bin/nvcc \
+  -DCMAKE_CUDA_ARCHITECTURES=${COMP_ARCH} \
+  -DENABLE_BENCHMARKS=On \
+  -DCMAKE_INSTALL_PREFIX=../install_${BUILD_SUFFIX} \
+  "$@" \
+  ..
+
+echo
+echo "***********************************************************************"
+echo
+echo "cd into directory build_${BUILD_SUFFIX} and run make to build RAJA"
+echo
+echo "On LC machines, you are likely to hit cudafe++ signal errors due to"
+echo "insufficient virtual memory if you try to build in parallel using all"
+echo "available cores; i.e., make -j. Try backing off to make -j 16."
+echo
+echo "***********************************************************************"
diff --git a/scripts/radiuss-spack-configs b/scripts/radiuss-spack-configs
index b7b88725bc..fddc4f16ee 160000
--- a/scripts/radiuss-spack-configs
+++ b/scripts/radiuss-spack-configs
@@ -1 +1 @@
-Subproject commit b7b88725bc2e5045fd3adef2b971a0f25f3b96e1
+Subproject commit fddc4f16ee987abc9c1c61879eaf8a2d6a8253d9
diff --git a/scripts/uberenv b/scripts/uberenv
index 6ba67dcbd7..bec05e20bf 160000
--- a/scripts/uberenv
+++ b/scripts/uberenv
@@ -1 +1 @@
-Subproject commit 6ba67dcbd7ccbb9c03920b89de19ac959e2c3bdd
+Subproject commit bec05e20bf2a1634d97ead358a9072c36f1fdcac
diff --git a/share/raja/cmake/RAJA-config.cmake.in b/share/raja/cmake/RAJA-config.cmake.in
index ca30ea323e..54531c8c60 100644
--- a/share/raja/cmake/RAJA-config.cmake.in
+++ b/share/raja/cmake/RAJA-config.cmake.in
@@ -39,25 +39,39 @@ include(CMakeFindDependencyMacro)
 
 if (NOT TARGET camp)
   set(RAJA_CAMP_DIR "@camp_DIR@")
-  if(NOT camp_DIR) 
-    set(camp_DIR ${RAJA_CAMP_DIR}) 
+  if(NOT camp_DIR)
+    set(camp_DIR ${RAJA_CAMP_DIR})
   endif()
-  find_dependency(camp CONFIG NO_DEFAULT_PATH PATHS 
+  find_dependency(camp CONFIG NO_DEFAULT_PATH PATHS
     ${camp_DIR}
     ${camp_DIR}/lib/cmake/camp
    ${RAJA_PACKAGE_PREFIX_DIR}
    ${RAJA_PACKAGE_PREFIX_DIR}/lib/cmake/camp)
 endif ()
 
+if (@RAJA_ENABLE_CALIPER@)
+  if (NOT TARGET caliper)
+    set(RAJA_CALIPER_DIR "@caliper_DIR@")
+    if(NOT caliper_DIR)
+      set(caliper_DIR ${RAJA_CALIPER_DIR})
+    endif()
+    find_dependency(caliper CONFIG NO_DEFAULT_PATH PATHS
+      ${caliper_DIR}
+      ${caliper_DIR}/share/cmake/caliper
+      ${RAJA_PACKAGE_PREFIX_DIR}
+      ${RAJA_PACKAGE_PREFIX_DIR}/share/cmake/caliper)
+  endif ()
+endif ()
+
 if (@RAJA_ENABLE_DESUL_ATOMICS@)
   if (NOT TARGET desul_atomics)
     set(RAJA_DESUL_DIR "@desul_DIR@")
-    if(NOT desul_DIR) 
-      set(desul_DIR ${RAJA_DESUL_DIR}) 
+    if(NOT desul_DIR)
+      set(desul_DIR ${RAJA_DESUL_DIR})
     endif()
-    find_dependency(desul_atomics CONFIG NO_DEFAULT_PATH PATHS 
+    find_dependency(desul_atomics CONFIG NO_DEFAULT_PATH PATHS
       ${desul_DIR}
       ${desul_DIR}/lib/cmake/desul
       ${RAJA_PACKAGE_PREFIX_DIR}
diff --git a/src/MemUtils_CUDA.cpp b/src/MemUtils_CUDA.cpp
index f327d5b22d..de857b2fe4 100644
--- a/src/MemUtils_CUDA.cpp
+++ b/src/MemUtils_CUDA.cpp
@@ -44,10 +44,7 @@ namespace detail
 cudaStatusInfo g_status;
 
 //! State of the host code in this thread
-cudaStatusInfo tl_status;
-#if defined(RAJA_ENABLE_OPENMP)
-#pragma omp threadprivate(tl_status)
-#endif
+thread_local cudaStatusInfo tl_status;
 
 //! State of raja cuda stream synchronization for cuda reducer objects
 std::unordered_map g_stream_info_map;
diff --git a/src/MemUtils_HIP.cpp b/src/MemUtils_HIP.cpp
index ccb02126dc..d9e72bef05 100644
--- a/src/MemUtils_HIP.cpp
+++ b/src/MemUtils_HIP.cpp
@@ -44,10 +44,7 @@ namespace detail
 hipStatusInfo g_status;
 
 //! State of the host code in this thread
-hipStatusInfo tl_status;
-#if defined(RAJA_ENABLE_OPENMP)
-#pragma omp threadprivate(tl_status)
-#endif
+thread_local hipStatusInfo tl_status;
 
 //! State of raja hip stream synchronization for hip reducer objects
 std::unordered_map g_stream_info_map;
diff --git a/src/MemUtils_SYCL.cpp b/src/MemUtils_SYCL.cpp
index 568e86e528..d7a5628456 100644
--- a/src/MemUtils_SYCL.cpp
+++ b/src/MemUtils_SYCL.cpp
@@ -42,10 +42,7 @@ namespace detail
 syclInfo g_status;
 
 //! State of the host code in this thread
-syclInfo tl_status;
-#if defined(RAJA_ENABLE_OPENMP)
-#pragma omp threadprivate(tl_status)
-#endif
+thread_local syclInfo tl_status;
 
 //! State of raja sycl queue synchronization for sycl reducer objects
 std::unordered_map<::sycl::queue, bool> g_queue_info_map {
diff --git a/test/install/using-with-cmake/CMakeLists.txt b/test/install/using-with-cmake/CMakeLists.txt
index bc07743edf..5c42c01632 100644
--- a/test/install/using-with-cmake/CMakeLists.txt
+++ b/test/install/using-with-cmake/CMakeLists.txt
@@ -5,22 +5,18 @@
 # SPDX-License-Identifier: (BSD-3-Clause)
 ###############################################################################
 
-if (ENABLE_HIP)
-  cmake_minimum_required(VERSION 3.23)
-else()
-  cmake_minimum_required(VERSION 3.20)
-endif()
+cmake_minimum_required(VERSION 3.23)
 
-  project(using_with_cmake)
+project(using_with_cmake)
 
-  if(NOT DEFINED RAJA_DIR OR NOT EXISTS ${RAJA_DIR}/lib/cmake/raja/raja-config.cmake)
-    message(FATAL_ERROR "Missing required 'RAJA_DIR' variable pointing to an installed RAJA")
-  endif()
+if(NOT DEFINED RAJA_DIR OR NOT EXISTS ${RAJA_DIR}/lib/cmake/raja/raja-config.cmake)
+  message(FATAL_ERROR "Missing required 'RAJA_DIR' variable pointing to an installed RAJA")
+endif()
 
-  find_package(RAJA REQUIRED
-               NO_DEFAULT_PATH
-               PATHS ${RAJA_DIR}/lib/cmake/raja)
+find_package(RAJA REQUIRED
+             NO_DEFAULT_PATH
+             PATHS ${RAJA_DIR}/lib/cmake/raja)
 
-  add_executable(using-with-cmake using-with-cmake.cpp)
-  target_link_libraries(using-with-cmake RAJA)
+add_executable(using-with-cmake using-with-cmake.cpp)
+target_link_libraries(using-with-cmake RAJA)
 
diff --git a/test/install/using-with-cmake/host-config.cmake.in b/test/install/using-with-cmake/host-config.cmake.in
index 6a939d69dc..c2e3c42fab 100644
--- a/test/install/using-with-cmake/host-config.cmake.in
+++ b/test/install/using-with-cmake/host-config.cmake.in
@@ -16,6 +16,17 @@ set(CMAKE_CXX_FLAGS "@CMAKE_CXX_FLAGS@" CACHE STRING "")
 set(CMAKE_Fortran_FLAGS "@CMAKE_Fortran_FLAGS@" CACHE STRING "")
 set(CMAKE_EXE_LINKER_FLAGS "@CMAKE_EXE_LINKER_FLAGS@" CACHE STRING "")
 
+# CUDA
+set(CMAKE_CUDA_COMPILER "@CMAKE_CUDA_COMPILER@" CACHE PATH "")
+set(CMAKE_CUDA_HOST_COMPILER "@CMAKE_CXX_COMPILER@" CACHE PATH "")
+set(CUDA_TOOLKIT_ROOT_DIR "@CUDAToolkit_ROOT@" CACHE PATH "")
+set(CMAKE_CUDA_ARCHITECTURES "@CMAKE_CUDA_ARCHITECTURES@" CACHE STRING "")
+
+# HIP
+set(ROCM_PATH "@ROCM_PATH@" CACHE PATH "")
+set(CMAKE_HIP_COMPILER "@CMAKE_HIP_COMPILER@" CACHE PATH "")
+set(CMAKE_HIP_ARCHITECTURES "@CMAKE_HIP_ARCHITECTURES@" CACHE STRING "")
+
 # MPI
 set(ENABLE_MPI @ENABLE_MPI@ CACHE BOOL "")
 set(MPI_C_COMPILER "@MPI_C_COMPILER@" CACHE PATH "")
diff --git a/tpl/camp b/tpl/camp
index 4070ce93a8..a8caefa9f4 160000
--- a/tpl/camp
+++ b/tpl/camp
@@ -1 +1 @@
-Subproject commit 4070ce93a802849d61037310a87c50cc24c9e498
+Subproject commit a8caefa9f4c811b1a114b4ed2c9b681d40f12325