diff --git a/projects/clr/hipamd/include/hip/amd_detail/amd_device_functions.h b/projects/clr/hipamd/include/hip/amd_detail/amd_device_functions.h
index 9da7185be84..581160fccb1 100644
--- a/projects/clr/hipamd/include/hip/amd_detail/amd_device_functions.h
+++ b/projects/clr/hipamd/include/hip/amd_detail/amd_device_functions.h
@@ -894,4 +894,13 @@ static inline __device__ void* memset(void* ptr, int val, size_t size) {
 }
 #endif  // !__OPENMP_AMDGCN__
 
+#define HIP_IMPL_GENERATE_SCAN_FUNC(OP, TYPE_ALIAS, TYPE) \
+  extern "C" __device__ __attribute__((const)) TYPE __ockl_wfscan_ ## OP ## _ ## TYPE_ALIAS(TYPE, bool);\
+\
+    template <bool Inclusive>\
+    __device__ __forceinline__ TYPE scan_ ## OP(TYPE val)\
+    {\
+      return __ockl_wfscan_ ## OP ## _ ## TYPE_ALIAS(val, Inclusive);\
+    }
+
 #endif
diff --git a/projects/clr/hipamd/include/hip/amd_detail/amd_hip_cooperative_groups.h b/projects/clr/hipamd/include/hip/amd_detail/amd_hip_cooperative_groups.h
index af5c73288f9..e5e200c99e9 100644
--- a/projects/clr/hipamd/include/hip/amd_detail/amd_hip_cooperative_groups.h
+++ b/projects/clr/hipamd/include/hip/amd_detail/amd_hip_cooperative_groups.h
@@ -22,6 +22,11 @@
 #endif
 
 namespace cooperative_groups {
+  class thread_group;
+namespace impl {
+  template <typename TyGroup>
+__CG_QUALIFIER__ unsigned long long groupMask(const TyGroup&);
+}
 
 /** \brief The base type of all cooperative group types.
  *
@@ -32,12 +37,13 @@ namespace cooperative_groups {
  *         on Microsoft Windows.
  */
 class thread_group {
+  template <typename TyGroup>
+  friend __CG_QUALIFIER__ unsigned long long cooperative_groups::impl::groupMask(const TyGroup&);
  protected:
   __hip_uint32_t _type;         //! Type of the thread_group.
   __hip_uint32_t _num_threads;  //! Total number of threads in the thread_group.
   __hip_uint64_t _mask;         //! Lanemask for coalesced and tiled partitioned group types,
                                 //! LSB represents lane 0, and MSB represents lane 63
-
   //! Construct a thread group, and set thread group type and other essential
   //! thread group properties. This generic thread group is directly constructed
   //! only when the group is supposed to contain only the calling thread
@@ -387,6 +393,7 @@ class coalesced_group : public thread_group {
   friend __CG_QUALIFIER__ coalesced_group tiled_partition(const coalesced_group& parent,
                                                           unsigned int tile_size);
   friend __CG_QUALIFIER__ coalesced_group binary_partition(const coalesced_group& cgrp, bool pred);
+
   template <unsigned int fsize, class fparent> friend __CG_QUALIFIER__ coalesced_group
   binary_partition(const thread_block_tile<fsize, fparent>& tgrp, bool pred);
 
@@ -1044,6 +1051,36 @@ template <unsigned int size, class ParentCGTy> class thread_block_tile_internal
   __CG_QUALIFIER__ thread_block_tile_internal(const thread_block& g)
       : thread_block_tile_type<size, ParentCGTy>() {}
 };
+
+// becomes to std::true_type if the group is tiled and has a size known at compile time
+template <class TyGroup>
+struct isTiledGroup : __hip_internal::false_type {
+};
+
+template <unsigned int N, class ParentCGTy>
+struct isTiledGroup<cooperative_groups::thread_block_tile<N, ParentCGTy>>
+  : __hip_internal::integral_constant<bool,
+        (N == 1  || N == 2  || N == 4  || N == 8 ||
+         N == 16 || N == 32 || N == 64)> {
+};
+
+// returns the size of tile_group provided it is known at compile time
+template <class TyGroup>
+struct tiledGroupSize : __hip_internal::integral_constant<int, 0> {
+
+};
+template <unsigned int N, class ParentCGTy>
+struct tiledGroupSize<cooperative_groups::thread_block_tile<N, ParentCGTy>>
+  : __hip_internal::integral_constant<int, N> {
+};
+
+template <class TyGroup>
+struct isCoalescedGroup : __hip_internal::false_type {
+};
+
+template <>
+struct isCoalescedGroup<cooperative_groups::coalesced_group> : __hip_internal::true_type {
+};
 }  // namespace impl
 
 /** \brief    Group type - thread_block_tile
@@ -1287,6 +1324,7 @@ __CG_QUALIFIER__ coalesced_group binary_partition(const thread_block_tile<size,
   }
 }
 
+
 template <class T>
 struct plus {
   __CG_QUALIFIER__ T operator()(T lhs, T rhs) const
@@ -1334,6 +1372,153 @@ struct bit_or {
     return lhs | rhs;
   }
 };
+
+namespace impl {
+// when instantiated with two parameter types, allows to know if there are the same, regardless
+// of const/volatile qualifiers
+template <typename T, typename U>
+using is_param_type_same = __hip_internal::is_same<typename __hip_internal::remove_cvref<T>,
+                                                   typename __hip_internal::remove_cvref<U>>;
+
+template <class T, class Op>
+struct isArithmeticFunc : __hip_internal::false_type {
+};
+
+template <class T>
+struct isArithmeticFunc<T, cooperative_groups::plus<T>> : __hip_internal::true_type {
+};
+
+template <class T>
+struct isArithmeticFunc<T, cooperative_groups::less<T>> : __hip_internal::true_type {
+};
+
+template <class T>
+struct isArithmeticFunc<T, cooperative_groups::greater<T>> : __hip_internal::true_type {
+};
+
+template <class T, class Op>
+struct isBooleanFunc : __hip_internal::false_type {
+};
+
+template <class T>
+struct isBooleanFunc<T, cooperative_groups::bit_and<T>> : __hip_internal::true_type {
+};
+
+template <class T>
+struct isBooleanFunc<T, cooperative_groups::bit_or<T>> : __hip_internal::true_type {
+};
+
+template <class T>
+struct isBooleanFunc<T, cooperative_groups::bit_xor<T>> : __hip_internal::true_type {
+};
+
+// this is the value to return in exclusive_scan, for lane 0
+template <class T, class Op>
+struct CGIdentity {
+  __CG_QUALIFIER__ T operator()()
+  {
+    T result = {};
+    return result;
+  }
+};
+
+template <class T>
+struct CGIdentity<T, cooperative_groups::bit_and<T>> {
+  __CG_QUALIFIER__ T operator()()
+  {
+    T result {};
+    return ~result;
+  }
+};
+
+template <class T>
+struct CGIdentity<T, cooperative_groups::less<T>> {
+  __CG_QUALIFIER__ T operator()()
+  {
+    // CUDA would return 0 in this case. But in our case we mimic what __ockl_wfscan_*
+    // would do
+    return __hip_internal::NumericLimits<T>::maximum();
+  }
+};
+
+template <class T>
+struct CGIdentity<T, cooperative_groups::greater<T>> {
+  __CG_QUALIFIER__ T operator()()
+  {
+    return __hip_internal::NumericLimits<T>::minimum();
+  }
+};
+
+// calculates the necessary warp mask for cooperative groups that support reduce(), or
+// inclusive/exlcusive_scan()
+template <typename TyGroup>
+__CG_QUALIFIER__ unsigned long long groupMask(const TyGroup& group)
+{
+  unsigned long long mask = ~0ull;
+
+  if constexpr (impl::isCoalescedGroup<TyGroup>::value) {
+    mask = group.coalesced_info.member_mask;
+  } else {
+    // we cannot simply just use the __activemask() here, because more than one tile could have active
+    // threads at a time; we need to mask away the threads that not part of this tile first
+    mask >>= (64 - group.num_threads());
+    mask <<= (((internal::workgroup::thread_rank() % warpSize) / group.num_threads()) * group.num_threads());
+  }
+
+  return mask;
+}
+
+// backward permute implementation for cooperative group operations, i.e.
+// for up to 32 bytes (__hip_ds_bpermute() can only do 4 bytes at a time,
+// this function calls it (or the floating point version) multiple times to
+// implement it for bigger sizes
+template <bool isPrimitiveType, class T, size_t NumPermutes, typename __hip_internal::enable_if<NumPermutes == 0, int>::type = 0>
+__CG_QUALIFIER__ void bPermute(T&, T, int from)
+{
+}
+
+// trivial case: the type fits within the permute size
+template <bool IsPrimitiveType, class T, size_t NumPermutes, typename __hip_internal::enable_if<IsPrimitiveType && NumPermutes == 1, int>::type = 0>
+__CG_QUALIFIER__ void bPermute(T& permuteResult, T result, int from)
+{
+  auto backwardPermute = [](int index, T arg) {
+      if constexpr (__hip_internal::is_floating_point<T>::value &&
+                    sizeof(T) <= 4) {
+        return __hip_ds_bpermutef(index, arg);
+      } else {
+        return __hip_ds_bpermute(index, arg);
+      }
+    };
+
+  if constexpr (sizeof(T) == 2) {
+    union {
+      int i;
+      T f;
+    } tmp;
+
+    tmp.f = result;
+    tmp.i = __hip_ds_bpermute(from << 2, tmp.i);
+    permuteResult = tmp.f;
+  } else if constexpr (sizeof(T) == 4) {
+    auto bPermuteResult = backwardPermute(from << 2, result);
+    __builtin_memcpy(&permuteResult, &bPermuteResult, sizeof(result));
+  } else {
+    static_assert(__hip_internal::is_void<T>::value, "Unexpected type");
+  }
+}
+
+// Overload when we need multiple ds_permute, because one is not enough
+template <bool IsPrimitiveType, class T, size_t NumPermutes>
+__CG_QUALIFIER__ void bPermute(T permuteResult[NumPermutes], T result[NumPermutes], int from)
+{
+  // ds_bpermute only deals with 32-bit sizes, so for other sizes
+  // we need to call the permute multiple times
+  for (int i = 0; i < NumPermutes; i++) {
+    permuteResult[i] = __hip_ds_bpermute(from << 2, result[i]);
+  }
+}
+
+}  // namespace impl
 #endif
 
 /**
diff --git a/projects/clr/hipamd/include/hip/amd_detail/amd_hip_cooperative_groups_reduce.h b/projects/clr/hipamd/include/hip/amd_detail/amd_hip_cooperative_groups_reduce.h
index bdcacd4274f..a0a3b60f406 100644
--- a/projects/clr/hipamd/include/hip/amd_detail/amd_hip_cooperative_groups_reduce.h
+++ b/projects/clr/hipamd/include/hip/amd_detail/amd_hip_cooperative_groups_reduce.h
@@ -18,10 +18,6 @@
 
 namespace cooperative_groups {
 namespace impl {
-  template <typename T, typename U>
-  using is_param_type_same = __hip_internal::is_same<typename __hip_internal::remove_cvref<T>,
-                                                     typename __hip_internal::remove_cvref<U>>;
-
   template <typename T, typename = void>
   struct has_add : __hip_internal::false_type {
   };
@@ -76,25 +72,6 @@ namespace impl {
                  __hip_internal::void_t<decltype(__reduce_xor_sync<unsigned long long>(0ull, T {}))>
     > : __hip_internal::true_type {};
 
-  // we can call reduce() only the block tiles that have a compile-time size
-  template <class TyGroup>
-  struct isTiledGroup : __hip_internal::false_type {
-  };
-
-  template <unsigned int N, class ParentCGTy>
-  struct isTiledGroup<cooperative_groups::thread_block_tile<N, ParentCGTy>>
-    : __hip_internal::integral_constant<bool,
-          (N == 1  || N == 2  || N == 4  || N == 8 ||
-           N == 16 || N == 32 || N == 64)> {
-  };
-
-  template <class TyGroup>
-  struct isCoalescedGroup : __hip_internal::false_type {
-  };
-
-  template <>
-  struct isCoalescedGroup<cooperative_groups::coalesced_group> : __hip_internal::true_type {
-  };
 }
 
 /** \ingroup CooperativeGAPI
@@ -126,15 +103,7 @@ __CG_QUALIFIER__ auto reduce(const TyGroup& group, TyVal&& val, TyFn&& op) -> de
 
   unsigned long long mask = ~0ull;
 
-  // we cannot simply just use the __activemask() here, because more than one tile could have active
-  // threads at a time; we need to mask away the threads that not part of this tile first
-  if constexpr (!__hip_internal::is_same<TyGroup, cooperative_groups::coalesced_group>::value) {
-    mask >>= (64 - group.num_threads());
-    mask <<= (((threadIdx.x % warpSize) / group.num_threads()) * group.num_threads());
-  }
-
-  // for coalesced_groups, the mask is simply the activemask
-  mask &= __activemask();
+  mask = impl::groupMask(group);
 
   if constexpr (__hip_internal::is_same<Op, cooperative_groups::plus<Val>>::value &&
                 impl::has_add<Val>::value) {
diff --git a/projects/clr/hipamd/include/hip/amd_detail/amd_hip_cooperative_groups_scan.h b/projects/clr/hipamd/include/hip/amd_detail/amd_hip_cooperative_groups_scan.h
new file mode 100644
index 00000000000..3a9c2eabdcd
--- /dev/null
+++ b/projects/clr/hipamd/include/hip/amd_detail/amd_hip_cooperative_groups_scan.h
@@ -0,0 +1,304 @@
+#ifndef HIP_INCLUDE_HIP_AMD_DETAIL_HIP_COOPERATIVE_GROUPS_SCAN_H
+#define HIP_INCLUDE_HIP_AMD_DETAIL_HIP_COOPERATIVE_GROUPS_SCAN_H
+
+#if __cplusplus
+#if !defined(__HIPCC_RTC__)
+#include <hip/amd_detail/hip_cooperative_groups_helper.h>
+#include <hip/amd_detail/amd_hip_cooperative_groups.h>
+#endif
+
+#if !defined(__HIPCC_RTC__)
+#include <type_traits>
+#endif
+
+namespace cooperative_groups {
+namespace impl {
+  // these functions allow to make use of C++ function overloads, instead of having to code
+  // a big if-constexpr according to operand type
+  HIP_IMPL_GENERATE_SCAN_FUNC(add, i32, int);
+  HIP_IMPL_GENERATE_SCAN_FUNC(add, u32, unsigned int);
+
+  HIP_IMPL_GENERATE_SCAN_FUNC(min, i32, int);
+  HIP_IMPL_GENERATE_SCAN_FUNC(min, u32, unsigned int);
+
+  HIP_IMPL_GENERATE_SCAN_FUNC(max, i32, int);
+  HIP_IMPL_GENERATE_SCAN_FUNC(max, u32, unsigned int);
+
+  HIP_IMPL_GENERATE_SCAN_FUNC(and, i32, int);
+  HIP_IMPL_GENERATE_SCAN_FUNC(and, u32, unsigned int);
+
+  HIP_IMPL_GENERATE_SCAN_FUNC(or, i32, int);
+  HIP_IMPL_GENERATE_SCAN_FUNC(or, u32, unsigned int);
+
+  HIP_IMPL_GENERATE_SCAN_FUNC(xor, i32, int);
+  HIP_IMPL_GENERATE_SCAN_FUNC(xor, u32, unsigned int);
+
+  // extra types. Unlike cg::reduce() which depends on reduce_*_sync() functions being defined with
+  // HIP_ENABLE_EXTRA_WARP_SYNC_TYPES to be able to use the ockl intrinsics, for scan we always
+  // define them here
+  HIP_IMPL_GENERATE_SCAN_FUNC(add, i64, long long);
+  HIP_IMPL_GENERATE_SCAN_FUNC(add, u64, unsigned long long);
+  HIP_IMPL_GENERATE_SCAN_FUNC(add, f32, float);
+  HIP_IMPL_GENERATE_SCAN_FUNC(add, f64, double);
+
+  HIP_IMPL_GENERATE_SCAN_FUNC(min, i64, long long);
+  HIP_IMPL_GENERATE_SCAN_FUNC(min, u64, unsigned long long);
+  HIP_IMPL_GENERATE_SCAN_FUNC(min, f32, float);
+  HIP_IMPL_GENERATE_SCAN_FUNC(min, f64, double);
+
+  HIP_IMPL_GENERATE_SCAN_FUNC(max, i64, long long);
+  HIP_IMPL_GENERATE_SCAN_FUNC(max, u64, unsigned long long);
+  HIP_IMPL_GENERATE_SCAN_FUNC(max, f32, float);
+  HIP_IMPL_GENERATE_SCAN_FUNC(max, f64, double);
+
+  HIP_IMPL_GENERATE_SCAN_FUNC(and, i64, long long);
+  HIP_IMPL_GENERATE_SCAN_FUNC(and, u64, unsigned long long);
+
+  HIP_IMPL_GENERATE_SCAN_FUNC(or, i64, long long);
+  HIP_IMPL_GENERATE_SCAN_FUNC(or, u64, unsigned long long);
+
+  HIP_IMPL_GENERATE_SCAN_FUNC(xor, i64, long long);
+  HIP_IMPL_GENERATE_SCAN_FUNC(xor, u64, unsigned long long);
+
+  // not all types could be used with wfscan (e.g. user defined types), this predicate
+  // indicates whether that is the case
+  template <typename T, typename = void>
+  struct has_arithmetic_scan : __hip_internal::false_type {
+  };
+
+  // all the arithmetic operations accept the same types, so we just based it on the overload being
+  // present for scan_add
+  template <typename T>
+  struct has_arithmetic_scan<T,
+                 __hip_internal::void_t<decltype(scan_add<false>(T {}))>
+    > : __hip_internal::true_type {
+  };
+
+  template <typename T, typename = void>
+  struct has_boolean_scan : __hip_internal::false_type {
+  };
+
+  // all the arithmetic operations accept the same types, so we just based it on the overload being
+  // present for scan_and
+  template <typename T>
+  struct has_boolean_scan<T,
+                 __hip_internal::void_t<decltype(scan_and<false>(T {}))>
+    > : __hip_internal::true_type {
+  };
+
+  // given cooperative_groups template parameter, calls the right impl::scan function (that would contain __ockl_wfscan_*)
+  // and that is overloaded by type
+  template <class TyVal, class Op, bool Inclusive>
+  __CG_QUALIFIER__ TyVal call_scan(const TyVal& val)
+  {
+    using Val = typename __hip_internal::remove_cvref<TyVal>::type;
+
+    if constexpr (__hip_internal::is_same<Op, cooperative_groups::plus<Val>>::value) {
+      return impl::scan_add<Inclusive>(val);
+    } else if constexpr (__hip_internal::is_same<Op, cooperative_groups::less<Val>>::value) {
+      return impl::scan_min<Inclusive>(val);
+    } else if constexpr (__hip_internal::is_same<Op, cooperative_groups::greater<Val>>::value) {
+      return impl::scan_max<Inclusive>(val);
+    } else if constexpr (__hip_internal::is_same<Op, cooperative_groups::bit_and<Val>>::value) {
+      return impl::scan_and<Inclusive>(val);
+    } else if constexpr (__hip_internal::is_same<Op, cooperative_groups::bit_or<Val>>::value) {
+      return impl::scan_or<Inclusive>(val);
+    } else if constexpr (__hip_internal::is_same<Op, cooperative_groups::bit_xor<Val>>::value) {
+      return impl::scan_xor<Inclusive>(val);
+    }
+  }
+
+  template <bool Inclusive, typename TyGroup, typename TyVal, typename TyFn>
+  __CG_QUALIFIER__ auto scan(const TyGroup& group, TyVal&& val, TyFn&& op) -> decltype(op(val, val))
+  {
+    using Op = typename __hip_internal::remove_cvref<TyFn>::type;
+    using Val = typename __hip_internal::remove_cvref<TyVal>::type;
+
+    constexpr bool isPrimitiveType = impl::has_arithmetic_scan<Val>::value;
+    using permuteType = typename __hip_internal::conditional<isPrimitiveType && (sizeof(Val) == 4 || sizeof(Val) == 2), Val, unsigned int>::type;
+
+    // the number of backward permutes will be: size(Val) / 4 rounded up
+    static constexpr int kNumOfPermutes = (sizeof(Val) <= 4)?
+                                          1 :
+                                          (sizeof(Val) + sizeof(unsigned int) - 1) / sizeof(unsigned int);
+    static_assert(cooperative_groups::impl::is_param_type_same<Val, decltype(op(val, val))>::value, "Operator input and output types differ");
+    static_assert(__hip_internal::is_trivially_copyable<Val>::value, "val must be trivially copyable");
+    static_assert(sizeof(Val) <= 32, "scan only operate on values of size up to 32 bytes");
+
+    if constexpr (!cooperative_groups::impl::isTiledGroup<TyGroup>::value && !cooperative_groups::impl::isCoalescedGroup<TyGroup>::value) {
+      static_assert(__hip_internal::is_void<TyGroup>::value, "This group does not exclusively represent a tile");
+    }
+
+    unsigned int maskNumBits;
+    int numIterations;
+    // next bit to aggregate with
+    unsigned int maskIdx;
+    unsigned int laneId = __lane_id();
+    int nextBit = laneId;
+    unsigned long long mask = ~0ull;
+
+    mask = impl::groupMask(group);
+    maskNumBits = __popcll(mask);
+    maskIdx = __popcll(((1ull << laneId) - 1) & mask);
+
+    if (laneId) {
+      mask <<= 64 - laneId;
+    }
+
+#ifdef __OPTIMIZE__  // at the time of this writing the ockl wfscan functions do not compile when
+                      // using -O0
+    if (impl::isTiledGroup<TyGroup>::value) {
+      // for tiled_groups we know at compile time that whether we can call the ockl intrinsics or
+      // not; if the block tile is actually the whole warp
+      if (impl::tiledGroupSize<TyGroup>::value == warpSize) {
+        if constexpr (impl::isArithmeticFunc<Val, Op>::value && impl::has_arithmetic_scan<Val>::value ||
+                      impl::isBooleanFunc<Val, Op>::value && impl::has_boolean_scan<Val>::value) {
+          return impl::call_scan<Val, Op, Inclusive>(val);
+        }
+      }
+    } else if constexpr (impl::isCoalescedGroup<TyGroup>::value) {
+      // for the coalesced_group case we do need to check at runtime, adding a slight overhead on
+      // this branch
+      if (maskNumBits == warpSize) {
+        if constexpr (impl::isArithmeticFunc<Val, Op>::value && impl::has_arithmetic_scan<Val>::value ||
+                      impl::isBooleanFunc<Val, Op>::value && impl::has_boolean_scan<Val>::value) {
+          return impl::call_scan<Val, Op, Inclusive>(val);
+        }
+      }
+    }
+#endif
+
+    // unsigned int[N] is used in some cases, e.g. when T is wider than 32-bit
+    using ResultType = typename __hip_internal::conditional<
+                         isPrimitiveType && (sizeof(Val) == 4 || sizeof(Val) == 2), permuteType,
+                         permuteType[kNumOfPermutes]>::type;
+    static constexpr int alignment = alignof(Val) <= 4? 4 : alignof(Val);
+    alignas(alignment) ResultType result;
+    alignas(alignment) ResultType permuteResult;
+
+    if constexpr (isPrimitiveType && (sizeof(Val) == 2 || sizeof(Val) == 4)) {
+      result = val;
+    } else {
+      __builtin_memcpy(result, &val, sizeof(Val));
+    }
+
+    // the number of iterations needs to be at least log2(number of bits on)
+    numIterations = sizeof(int) * 8 - __clz(maskNumBits);
+
+    if constexpr (impl::isTiledGroup<TyGroup>::value) {
+      // the number of bits in the mask is always a power of 2 when
+      // tiled blocks are used and in that case we need an iteration less
+      numIterations -= 1;
+    } else {
+      // in the coalesced_threads case it depends, we are not sure whether
+      // it is a power of 2, or not so we check
+      if (!(maskNumBits & (maskNumBits - 1))) {
+        numIterations -= 1;
+      }
+    }
+
+    int modulo = 1;
+
+    while (numIterations) {
+      int offset = modulo >> 1;
+      int increment = modulo - offset;
+      int nextPos = maskIdx - offset - increment;
+      bool insideLanes = nextPos >= 0;
+
+      if (insideLanes) {
+        int next;
+
+        // find the position to aggregate with
+        for (int i = 0; i < increment; i++) {
+          next = __builtin_clzll(mask) + 1;
+          mask <<= next;
+          nextBit -= next;
+        }
+      }
+
+      // clamp index; if out of bounds, the thread read its own value
+      nextBit = (nextBit < (laneId & ~(warpSize - 1))) ? laneId : nextBit;
+      bPermute<isPrimitiveType, permuteType, kNumOfPermutes>(permuteResult, result, nextBit);
+
+      if (insideLanes) {
+        if constexpr (!isPrimitiveType) {
+          Val toReturn;
+          toReturn = op(*reinterpret_cast<Val*>(permuteResult), *reinterpret_cast<Val*>(result));
+        __builtin_memcpy(result, &toReturn, sizeof(Val));
+        } else if constexpr (sizeof(Val) == 4 || sizeof(Val) == 2) {
+          result = op(permuteResult, result);
+        } else if constexpr (sizeof(Val) == 8) {
+          Val tmp;
+          unsigned long long rhs =
+              (static_cast<unsigned long long>(permuteResult[1]) << 32) | permuteResult[0];
+          __builtin_memcpy(&tmp, result, sizeof(Val));
+          tmp = op(*reinterpret_cast<Val*>(&rhs), tmp);
+          __builtin_memcpy(result, &tmp, sizeof(Val));
+        }
+      }
+
+      modulo <<= 1;
+      numIterations--;
+    }
+
+    if constexpr (Inclusive) {
+      return *reinterpret_cast<Val*>(&result);
+    } else {
+      int nextBit = laneId;
+
+      mask = impl::groupMask(group);
+
+      if (laneId) {
+        mask <<= 64 - laneId;
+        nextBit -= mask? __builtin_clzll(mask) + 1 : 0;
+      } else {
+        mask = 0ull;
+      }
+
+      // clamp index; if out of bounds, the thread read its own value
+      nextBit = (nextBit < (laneId & ~(warpSize - 1))) ? laneId : nextBit;
+      bPermute<isPrimitiveType, permuteType, kNumOfPermutes>(permuteResult, result, nextBit);
+
+      if (mask) {
+        return *reinterpret_cast<Val*>(&permuteResult);
+      } else {
+        impl::CGIdentity<Val, Op> identity;
+
+        return identity();
+      }
+    }
+  }
+}
+
+  template <typename TyGroup, typename TyVal, typename TyFn>
+  __CG_QUALIFIER__ auto inclusive_scan(const TyGroup& group, TyVal&& val, TyFn&& op)
+    -> decltype(op(val, val))
+  {
+    return impl::scan<true>(group, val, op);
+  }
+
+  template <typename TyGroup, typename TyVal, typename TyFn>
+  __CG_QUALIFIER__ auto exclusive_scan(const TyGroup& group, TyVal&& val, TyFn&& op)
+    -> decltype(op(val, val))
+  {
+    return impl::scan<false>(group, val, op);
+  }
+
+  template <typename TyGroup, typename TyVal>
+  __CG_QUALIFIER__ auto inclusive_scan(const TyGroup& group, TyVal&& val)
+  {
+    using Val = typename __hip_internal::remove_cvref<TyVal>::type;
+
+    return impl::scan<true>(group, val, cooperative_groups::plus<Val>());
+  }
+
+  template <typename TyGroup, typename TyVal>
+  __CG_QUALIFIER__ auto exclusive_scan(const TyGroup& group, TyVal&& val)
+  {
+    using Val = typename __hip_internal::remove_cvref<TyVal>::type;
+
+    return impl::scan<false>(group, val, cooperative_groups::plus<Val>());
+  }
+}
+#endif  // __cplusplus
+#endif  // HIP_INCLUDE_HIP_AMD_DETAIL_HIP_COOPERATIVE_GROUPS_SCAN_H
diff --git a/projects/clr/hipamd/include/hip/amd_detail/amd_hip_fp16.h b/projects/clr/hipamd/include/hip/amd_detail/amd_hip_fp16.h
index d1ba8e16a09..e4ccbc37da7 100644
--- a/projects/clr/hipamd/include/hip/amd_detail/amd_hip_fp16.h
+++ b/projects/clr/hipamd/include/hip/amd_detail/amd_hip_fp16.h
@@ -940,6 +940,20 @@ inline __device__ __half unsafeAtomicAdd(__half* address, __half value) {
   if (is_lower) return __low2half(out);
   return __high2half(out);
 }
+
+namespace __hip_internal {
+template <>
+struct NumericLimits<__half> {
+    static constexpr __half maximum() {
+      __half_raw raw { .x = 0x7C00U };
+      return __half(raw);
+    }
+    static constexpr __half minimum() {
+      __half_raw raw { .x = 0xFC00U };
+      return __half(raw);
+    }
+};
+}  // namespace __hip_internal
 #endif  // defined(__clang__) && defined(__HIP__)
 
 // Math functions
@@ -1163,8 +1177,17 @@ template <typename MaskT> __device__ inline __half __reduce_max_sync(MaskT mask,
 
   return __reduce_op_sync(mask, val, op, wfReduce);
 }
+#endif
 
-#endif  // __HIP_NO_HALF_OPERATORS__
+#if !defined(__HIP_NO_HALF_OPERATORS__)
+namespace cooperative_groups {
+namespace impl {
+HIP_IMPL_GENERATE_SCAN_FUNC(add, f16, __half);
+HIP_IMPL_GENERATE_SCAN_FUNC(min, f16, __half);
+HIP_IMPL_GENERATE_SCAN_FUNC(max, f16, __half);
+}
+}
+#endif
 
 #endif  // defined(__cplusplus)
 #elif defined(__GNUC__) || defined(_MSC_VER)
diff --git a/projects/clr/hipamd/include/hip/amd_detail/amd_warp_sync_functions.h b/projects/clr/hipamd/include/hip/amd_detail/amd_warp_sync_functions.h
index 535da836851..930c2853c3b 100644
--- a/projects/clr/hipamd/include/hip/amd_detail/amd_warp_sync_functions.h
+++ b/projects/clr/hipamd/include/hip/amd_detail/amd_warp_sync_functions.h
@@ -314,6 +314,7 @@ __device__ inline T __reduce_op_sync(MaskT mask, T val, BinaryOp op, WfReduce wf
                                         1 :
                                         (sizeof(T) + sizeof(unsigned int) - 1) / sizeof(unsigned int);
   static constexpr auto kMaskNumBits = sizeof(MaskT) * 8;
+  static constexpr auto alignment = alignof(T) <= 4? 4 : alignof(T);
   static_assert(__hip_internal::is_integral<MaskT>::value && sizeof(MaskT) == 8,
                 "The mask must be a 64-bit integer. "
                 "Implicitly promoting a smaller integer is almost always an error.");
@@ -330,9 +331,13 @@ __device__ inline T __reduce_op_sync(MaskT mask, T val, BinaryOp op, WfReduce wf
   int lastLane = kMaskNumBits - leadingZeroes - 1;
   int maskNumBits;
   int numIterations;
+
   // unsigned int[N] is used in some cases, e.g. when T is wider than 32-bit
-  typename __hip_internal::conditional<isPrimitiveType && (sizeof(T) == 4 || sizeof(T) == 2), permuteType,
-                                       permuteType[kNumOfPermutes]>::type result, permuteResult;
+  using ResultType = typename __hip_internal::conditional<
+                       isPrimitiveType && (sizeof(T) == 4 || sizeof(T) == 2),
+                       permuteType, permuteType[kNumOfPermutes]>::type;
+  alignas(alignment) ResultType result;
+  alignas(alignment) ResultType permuteResult;
   auto backwardPermute = [](int index, permuteType arg) {
     if constexpr (__hip_internal::is_floating_point<T>::value &&
                   sizeof(T) <= 4) {
@@ -372,7 +377,7 @@ __device__ inline T __reduce_op_sync(MaskT mask, T val, BinaryOp op, WfReduce wf
   if constexpr (isPrimitiveType && (sizeof(T) == 2 || sizeof(T) == 4)) { 
     result = val;
   } else {
-    __builtin_memcpy(result, &val, sizeof(result));
+    __builtin_memcpy(result, &val, sizeof(T));
   }
 
   // add the values from the lanes using a reduction tree (first the threads with even-numbered
diff --git a/projects/clr/hipamd/include/hip/amd_detail/host_defines.h b/projects/clr/hipamd/include/hip/amd_detail/host_defines.h
index f0d1b573753..bc81d4ace54 100644
--- a/projects/clr/hipamd/include/hip/amd_detail/host_defines.h
+++ b/projects/clr/hipamd/include/hip/amd_detail/host_defines.h
@@ -309,6 +309,50 @@ template <size_t... Ints>
 constexpr index_sequence<Ints...> make_index_sequence_value(index_sequence<Ints...>) {
   return {};
 }
+
+// An equivalent of std::numeric_limits<T>::max() and lowest(). Note that the
+// class name and methods have been changed intentionally to reflect the fact that is not
+// a one-to-one replacement of std::numeric_limits and also to avoid a name collision with
+// the Win32 max() macro
+template <typename T>
+struct NumericLimits;
+
+template <>
+struct NumericLimits<int> {
+    static constexpr int maximum() { return 0x7FFFFFFF; }
+    static constexpr int minimum() { return ~0x7FFFFFFF; }
+};
+
+template <>
+struct NumericLimits<unsigned int> {
+    static constexpr unsigned int maximum()    { return 0xFFFFFFFFu; }
+    static constexpr unsigned int minimum() { return 0u; }
+};
+
+template <>
+struct NumericLimits<long long> {
+    static constexpr long long maximum() { return 0x7FFFFFFFFFFFFFFFLL; }
+    static constexpr long long minimum() { return ~0x7FFFFFFFFFFFFFFFLL; }
+};
+
+template <>
+struct NumericLimits<unsigned long long> {
+    static constexpr unsigned long long maximum()    { return 0xFFFFFFFFFFFFFFFFull; }
+    static constexpr unsigned long long minimum() { return 0ull; }
+};
+
+template <>
+struct NumericLimits<float> {
+  static constexpr float maximum()    { return __builtin_bit_cast(float, 0x7f800000); }
+  static constexpr float minimum()    { return -maximum(); }
+};
+
+template <>
+struct NumericLimits<double> {
+  static constexpr double maximum()    { return __builtin_bit_cast(double, 0x7FF0000000000000LL); }
+  static constexpr double minimum()    { return -maximum(); }
+};
+
 }  // namespace __hip_internal
 typedef __hip_internal::uint8_t __hip_uint8_t;
 typedef __hip_internal::uint16_t __hip_uint16_t;
diff --git a/projects/clr/hipamd/src/hiprtc/CMakeLists.txt b/projects/clr/hipamd/src/hiprtc/CMakeLists.txt
index ca021702f98..872115e9182 100644
--- a/projects/clr/hipamd/src/hiprtc/CMakeLists.txt
+++ b/projects/clr/hipamd/src/hiprtc/CMakeLists.txt
@@ -165,6 +165,7 @@ ${PROJECT_SOURCE_DIR}/include/hip/amd_detail/amd_warp_sync_functions.h
 ${PROJECT_SOURCE_DIR}/include/hip/amd_detail/hip_cooperative_groups_helper.h
 ${PROJECT_SOURCE_DIR}/include/hip/amd_detail/amd_hip_cooperative_groups.h
 ${PROJECT_SOURCE_DIR}/include/hip/amd_detail/amd_hip_cooperative_groups_reduce.h
+${PROJECT_SOURCE_DIR}/include/hip/amd_detail/amd_hip_cooperative_groups_scan.h
 ${PROJECT_SOURCE_DIR}/include/hip/amd_detail/amd_hip_unsafe_atomics.h
 ${PROJECT_SOURCE_DIR}/include/hip/amd_detail/amd_hip_atomic.h
 ${PROJECT_SOURCE_DIR}/include/hip/amd_detail/math_fwd.h
diff --git a/projects/hip-tests/catch/config/configs/unit/cooperativeGrps.yaml b/projects/hip-tests/catch/config/configs/unit/cooperativeGrps.yaml
index 8df088707f1..ce7ba045700 100644
--- a/projects/hip-tests/catch/config/configs/unit/cooperativeGrps.yaml
+++ b/projects/hip-tests/catch/config/configs/unit/cooperativeGrps.yaml
@@ -208,6 +208,7 @@ cooperativeGrps:
   Unit_Thread_Block_Coalesced_Reduce_arithmetic: *level_2
   Unit_Thread_Block_Tile_Reduce_Basic: *level_2
   Unit_Thread_Block_Tile_Reduce_Custom_Op: *level_2
+  Unit_Thread_Block_Tile_Scan_Custom_Op: *level_2
   Unit_Thread_Block_Tile_Reduce_Random_arithmetic: *level_2
   Unit_Thread_Block_Tile_Reduce_Random_boolean: *level_2
   Unit_Thread_Block_Coalesced_Reduce_arithmetic:
@@ -222,6 +223,10 @@ cooperativeGrps:
     <<: *level_2
     # Rock_Window_Failures_on_gfx1151
     disabled: [amd_windows, amd_wsl]
+  Unit_Thread_Block_Tile_Scan_All_Parameter_Sizes:
+    <<: *level_2
+    # Rock_Window_Failures_on_gfx1151
+    disabled: [amd_windows, amd_wsl]
   Unit_Thread_Block_Tile_Reduce_Basic:
     <<: *level_2
     # Rock_Window_Failures_on_gfx1151
@@ -230,6 +235,10 @@ cooperativeGrps:
     <<: *level_2
     # Rock_Window_Failures_on_gfx1151
     disabled: [amd_windows, amd_wsl]
+  Unit_Thread_Block_Tile_Scan_Custom_Op:
+    <<: *level_2
+    # Rock_Window_Failures_on_gfx1151
+    disabled: [amd_windows, amd_wsl]
   Unit_Thread_Block_Tile_Reduce_Standard_Op_Custom_Type:
     <<: *level_2
     # Rock_Window_Failures_on_gfx1151
@@ -238,6 +247,10 @@ cooperativeGrps:
     <<: *level_2
     # Rock_Window_Failures_on_gfx1151
     disabled: [amd_windows, amd_wsl]
+  Unit_Thread_Block_Tile_Scan_Trivially_Copyable_Parameters:
+    <<: *level_2
+    # Rock_Window_Failures_on_gfx1151
+    disabled: [amd_windows, amd_wsl]
   Unit_Thread_Block_Tile_Reduce_All_Parameter_Sizes:
     <<: *level_2
     # Rock_Window_Failures_on_gfx1151
@@ -250,3 +263,39 @@ cooperativeGrps:
     <<: *level_2
     # Rock_Window_Failures_on_gfx1151
     disabled: [amd_windows, amd_wsl]
+  Unit_Thread_Block_Tile_Multi_Dimensional_Reduce:
+    <<: *level_2
+    # Rock_Window_Failures_on_gfx1151
+    disabled: [amd_windows, amd_wsl]
+  Unit_Thread_Block_Tile_Inclusive_Scan_Basic:
+    <<: *level_2
+    # Rock_Window_Failures_on_gfx1151
+    disabled: [amd_windows, amd_wsl]
+  Unit_Thread_Block_Tile_Exclusive_Scan_Basic:
+    <<: *level_2
+    # Rock_Window_Failures_on_gfx1151
+    disabled: [amd_windows, amd_wsl]
+  Unit_Thread_Block_Tile_Scan_Random_arithmetic:
+    <<: *level_2
+    # Rock_Window_Failures_on_gfx1151
+    disabled: [amd_windows, amd_wsl]
+  Unit_Thread_Block_Tile_Scan_Random_boolean:
+    <<: *level_2
+    # Rock_Window_Failures_on_gfx1151
+    disabled: [amd_windows, amd_wsl]
+  Unit_Thread_Block_Tile_2D_3D_Blocks:
+    <<: *level_2
+    # Rock_Window_Failures_on_gfx1151
+    disabled: [amd_windows, amd_wsl]
+  Unit_Thread_Block_Coalesced_Scan_arithmetic:
+    <<: *level_2
+    # Rock_Window_Failures_on_gfx1151
+    disabled: [amd_windows, amd_wsl]
+  Unit_Thread_Block_Coalesced_Scan_boolean:
+    <<: *level_2
+    # Rock_Window_Failures_on_gfx1151
+    disabled: [amd_windows, amd_wsl]
+  Unit_Thread_Block_Scan_partition:
+    <<: *level_2
+    # Rock_Window_Failures_on_gfx1151
+    disabled: [amd_windows, amd_wsl]
diff --git a/projects/hip-tests/catch/config/configs/unit/rtc.yaml b/projects/hip-tests/catch/config/configs/unit/rtc.yaml
index 425dcdb4a06..a22da00d221 100644
--- a/projects/hip-tests/catch/config/configs/unit/rtc.yaml
+++ b/projects/hip-tests/catch/config/configs/unit/rtc.yaml
@@ -215,3 +215,9 @@ rtc:
     tags: [compile]
   Unit_Rtc_CoopReduce:
     <<: *level_2
+    # Rock_Window_Failures_on_gfx1151
+    disabled: [amd_windows]
+  Unit_Rtc_CoopScan:
+    <<: *level_2
+    # Rock_Window_Failures_on_gfx1151
+    disabled: [amd_windows]
diff --git a/projects/hip-tests/catch/include/warp_common.hh b/projects/hip-tests/catch/include/warp_common.hh
index 67ff46d696b..1d8a72c3452 100644
--- a/projects/hip-tests/catch/include/warp_common.hh
+++ b/projects/hip-tests/catch/include/warp_common.hh
@@ -12,6 +12,7 @@
 
 #include <hip_test_common.hh>
 #include <resource_guards.hh>
+#include <hip/cooperative_groups/hip_reduce.h>
 #include <hip/hip_cooperative_groups.h>
 #include <hip/hip_fp16.h>
 #include <random>
@@ -247,6 +248,14 @@ struct MaxOfAbsolute {
   }
 };
 
+template <class T>
+struct NonCommutativeOp {
+  T __host__ __device__ operator()(T i, T j)
+  {
+    return std::abs(i) * j;
+  }
+};
+
 // typeid(T).name() does seem to return a very descriptive name for primitive types,
 // at least on clang, so we roll out an equivalent
 template<class T>
@@ -302,6 +311,8 @@ const char* opToString()
 #endif
   else if constexpr (std::is_same<Op, MaxOfAbsolute<T>>::value)
     return "MaxOfAbsolute";
+  else if constexpr (std::is_same<Op, NonCommutativeOp<T>>::value)
+    return "NonCommutativeOp";
   else {
     return "unknown operator";
   }
@@ -315,7 +326,7 @@ void genRandomMasks(LinearAllocGuard<T>& d_buf,
 {
   // masks must be != 0, hence passing 1 as the 'a' distribution parameter
   int wavefrontSize = getWarpSize();
-  std::uniform_int_distribution<unsigned long long> dist(1, wavefrontSize == 64? ~0ull : (1ul << 32) - 1);
+  std::uniform_int_distribution<unsigned long long> dist(1, wavefrontSize == 64? ~0ull : (1ull << 32) - 1);
   std::uniform_int_distribution<unsigned long long> distNoHoles(1, getWarpSize() - 2);
   int numBytes = numItems * sizeof(T);
   LinearAllocGuard<T> tmp(LinearAllocs::malloc, numBytes);
@@ -391,82 +402,220 @@ void genRandomBuffers(LinearAllocGuard<T>& d_buf,
   HIP_CHECK(hipMemcpy(d_buf.ptr(), buf.ptr(), numBytes, hipMemcpyHostToDevice));
 }
 
+enum class AggregationType { Reduce,
+                             InclusiveScan,
+                             ExclusiveScan,
+                             /// @brief an inclusive scan with the default operator, i.e. cg::plus
+                             InclusiveScanDefault,
+                             /// @brief an exclusive scan with the default operator, i.e. cg::plus
+                             ExclusiveScanDefault };
+
+inline bool isInclusive(AggregationType aggType)
+{
+  switch (aggType) {
+  case AggregationType::Reduce:
+    return true;
+  case AggregationType::InclusiveScan:
+    return true;
+  case AggregationType::ExclusiveScan:
+    return false;
+  case AggregationType::InclusiveScanDefault:
+    return true;
+  case AggregationType::ExclusiveScanDefault:
+    return false;
+  default:
+    assert(false && "Unknown aggregation type");
+    return "unknown";
+  }
+}
+
+inline const char* aggregationTypeToStr(AggregationType aggType)
+{
+  switch (aggType) {
+  case AggregationType::Reduce:
+    return "reduce";
+  case AggregationType::InclusiveScan:
+    return "inclusive scan";
+  case AggregationType::ExclusiveScan:
+    return "exclusive scan";
+  case AggregationType::InclusiveScanDefault:
+    return "inclusive scan plus";
+  case AggregationType::ExclusiveScanDefault:
+    return "exclusive scan plus";
+  default:
+    assert(false && "Unknown aggregation type");
+    return "unknown";
+  }
+}
+
+constexpr uint64_t nextPowerOf2(uint64_t v) {
+  v += (v == 0);
+  v--;
+  v |= v >> 1;
+  v |= v >> 2;
+  v |= v >> 4;
+  v |= v >> 8;
+  v |= v >> 16;
+  v |= v >> 32;
+  return ++v;
+}
+
+// Sets id to the expected value exclusive_scan returns for the first active lane
+template <class T, class Op>
+void scanIdentity(T& id)
+{
+  T result = {};
+
+  if constexpr (std::is_same<Op, MinOp<T>>::value || std::is_same<Op, cooperative_groups::less<T>>::value) {
+    if constexpr (std::is_same<T, __half>::value) {
+      result = HIPRT_INF_FP16;
+    } else if (std::is_floating_point<T>::value) {
+      result = std::numeric_limits<T>::infinity();
+    } else {
+      result = std::numeric_limits<T>::max();
+    }
+  } else if constexpr (std::is_same<Op, MaxOp<T>>::value || std::is_same<Op, cooperative_groups::greater<T>>::value) {
+    if constexpr (std::is_same<T, __half>::value) {
+      result = -HIPRT_INF_FP16;
+    } else if (std::is_floating_point<T>::value) {
+      result = -std::numeric_limits<T>::infinity();
+    } else {
+      result = std::numeric_limits<T>::lowest();
+    }
+  } else if constexpr (std::is_same<Op, std::bit_and<T>>::value) {
+    result = ~result;
+  } else if constexpr (std::is_same<Op, std::bit_or<T>>::value) {
+  } else {
+    std::memset(&result, 0, sizeof(T));
+  }
+
+  id = result;
+}
+
 // given an operation produces the expected result of the warp-wide reduction
 // @mask indicates the lanes that will participate in the computation
+// @return the result associated the lane with the highest index that is active according to the
+//         mask
 template <class T, class Op>
-T calculateExpected(const T* input, Op& op, unsigned long long mask)
+T calculateExpected(T* output,
+                    const T* input,
+                    Op op,
+                    unsigned long long mask,
+                    AggregationType aggType)
 {
   T result;
-  int wavefrontSize = getWarpSize();
+  bool inclusive = aggType != AggregationType::ExclusiveScan && aggType != AggregationType::ExclusiveScanDefault;
+  int lastLane = 64 - __builtin_clzll(mask) - 1;
+  T aggregation[64];
+  // the results for the previous step of the aggregation
+  T lastAggregation[64];
+  // identity value for this type and operator
+  T id {};
 
-  if constexpr (std::is_same<Op, std::plus<T>>::value
-#if HT_AMD
-      || std::is_same<Op, cooperative_groups::plus<T>>::value
-#endif
-  ) {
-    T tmp[64] = { 0 };
+  std::memset(aggregation, 0, 64 * sizeof(T));
+  std::memset(lastAggregation, 0, 64 * sizeof(T));
+
+  scanIdentity<T, Op>(id);
+
+  for (int i = 0; i < 64; i++) {
+    output[i] = id;
+  }
 
-    for (int i = 0; i < wavefrontSize; i++) {
-       if (mask & (1ul << i)) {
-         tmp[i] = input[i];
+  if constexpr (std::is_same<Op, std::plus<T>>::value ||
+                std::is_same<Op, cooperative_groups::plus<T>>::value ) {
+    for (int i = 0; i < lastLane + 1; i++) {
+       if (mask & (1ull << i)) {
+         aggregation[i] = input[i];
+         lastAggregation[i] = input[i];
        }
     }
 
-    for (int modulo = 2; modulo <= wavefrontSize; modulo *= 2) {
-      for (int i = 0; i < wavefrontSize; i += modulo) {
-        int j = i + modulo / 2;
+    for (int modulo = 2; modulo <= nextPowerOf2(lastLane + 1); modulo *= 2) {
+      for (int i = 0; i < lastLane + 1; i += 1) {
+        int j = i - modulo / 2;
 
-        if (j < wavefrontSize)
-          tmp[i] += tmp[j];
+        if (j >= 0) {
+          aggregation[i] += lastAggregation[j];
+        }
       }
+      std::memcpy(lastAggregation, aggregation, sizeof(lastAggregation));
     }
-    result = tmp[0];
-#if HT_AMD
+
+    for (int i = 0; i < lastLane + 1; i += 1) {
+      if (inclusive) {
+        output[i] = aggregation[i];
+      } else if (i > 0) {
+        output[i] = aggregation[i - 1];
+      }
+    }
+
+    result = output[lastLane];
   } else if constexpr (std::is_same<Op, cooperative_groups::less<T>>::value) {
     MinOp<T> minOp;
-    return calculateExpected(input, minOp, mask);
+    return calculateExpected(output, input, minOp, mask, aggType);
   } else if constexpr (std::is_same<Op, cooperative_groups::greater<T>>::value) {
     MaxOp<T> maxOp;
-    return calculateExpected(input, maxOp, mask);
+    return calculateExpected(output, input, maxOp, mask, aggType);
   } else if constexpr (std::is_same<Op, cooperative_groups::bit_xor<T>>::value) {
     std::bit_xor<T> xorOp;
-    return calculateExpected(input, xorOp, mask);
+    return calculateExpected(output, input, xorOp, mask, aggType);
   } else if constexpr (std::is_same<Op, cooperative_groups::bit_or<T>>::value) {
     std::bit_or<T> orOp;
-    return calculateExpected(input, orOp, mask);
+    return calculateExpected(output, input, orOp, mask, aggType);
   } else if constexpr (std::is_same<Op, cooperative_groups::bit_and<T>>::value) {
     std::bit_and<T> andOp;
-    return calculateExpected(input, andOp, mask);
-#endif
+    return calculateExpected(output, input, andOp, mask, aggType);
   } else {
     bool initialized = false;
 
-    for (int i = 0; i < wavefrontSize; i++) {
-      if (mask & (1ul << i)) {
-        if (initialized)
-          result = op(input[i], result);
-        else {
+    result = id;
+
+    for (int i = 0; i < lastLane + 1; i++) {
+      if (mask & (1ull << i)) {
+        if (initialized) {
+          if (inclusive) {
+            result = op(result, input[i]);
+            output[i] = result;
+          } else {
+            output[i] = result;
+            result = op(result, input[i]);
+          }
+        } else {
           result = input[i];
+
+          if (inclusive) {
+            output[i] = result;
+          } else {
+            output[i] = id;
+          }
           initialized = true;
         }
       }
     }
   }
+
+  if (aggType == AggregationType::Reduce) {
+    for (int i = 0; i < lastLane + 1; i++) {
+      output[i] = result;
+    }
+  }
+
   return result;
 }
 
 template <class T>
-void printMismatch(const T& result, const T& expected, const T* input, unsigned long long mask)
+void printMismatch(const T& result, const T& expected, const T* input, unsigned long long mask, int laneId)
 {
   std::ios init(NULL);
 
   init.copyfmt(std::cout);
-  std::cout << "\nMismatch\n";
+  std::cout << "\nMismatch at lane: " << laneId << "\n";
   std::cout << "Mask: 0x" << std::hex << std::setfill('0') << std::setw(16) << mask << "\n";
+  std::cout << "Input:\n";
   std::cout.copyfmt(init);
 
   for (int i = 0; i < getWarpSize(); i++) {
-    if ((1ul << i) & mask) {
+    if ((1ull << i) & mask) {
       if constexpr (std::is_same<T, __half>::value) {
         const unsigned char* ptr = reinterpret_cast<const unsigned char*>(&input[i]);
 
@@ -490,10 +639,12 @@ void printMismatch(const T& result, const T& expected, const T* input, unsigned
   }
 }
 
-template <class T>
-void compareFloatingPoint(const T& result, const T& expected, unsigned long long mask, const T* input)
+template <class Op, class T>
+void compareFloatingPoint(const T& result, const T& expected, unsigned long long mask, const T* input, int laneId)
 {
   using namespace Catch::Matchers;
+  std::string opName = opToString<T, Op>();
+
   if constexpr (std::is_same<T, __half>::value) {
     float resultFloat = __half2float(result);
     float expectedFloat = __half2float(expected);
@@ -501,18 +652,24 @@ void compareFloatingPoint(const T& result, const T& expected, unsigned long long
     float relativeEpsilon = 0.1 * fmax(resultFloat, expectedFloat);
     float eps = 0.01f;
 
+    if constexpr (std::is_same<T, __half>::value) {
+      INFO("result: 0x" << std::hex << __half_as_ushort(result));
+    } else {
+      INFO("result: 0x" << std::hex << result);
+    }
+
     REQUIRE(!__hisnan(result));
-    REQUIRE(!__hisinf(result));
 
     if (relativeEpsilon > eps) {
       if (absDifference > 0.0001) {
         if (absDifference >= eps * fabs(fmax(resultFloat, expectedFloat))) {
-          printMismatch(result, expected, input, mask);
+          printMismatch(result, expected, input, mask, laneId);
           std::cout << "Relative epsilon: " << relativeEpsilon << "\n";
           std::cout << "Difference: " << absDifference << "\n";
         }
-       }
+      }
 
+      INFO("Operator: " << opName << " mask: 0x" << std::hex << mask);
       REQUIRE_THAT(__half2float(resultFloat), WithinRel(expectedFloat, eps));
     }
   } else {
@@ -524,11 +681,12 @@ void compareFloatingPoint(const T& result, const T& expected, unsigned long long
     if (relativeEpsilon > eps) {
       if (absDifference > 0.0001) {
         if (absDifference >= eps * fabs(fmax(result, expected))) {
-          printMismatch(result, expected, input, mask);
+          printMismatch(result, expected, input, mask, laneId);
           std::cout << "Relative epsilon: " << relativeEpsilon << "\n";
           std::cout << "Difference: " << absDifference << "\n";
         }
 
+        INFO("Operator: " << opName << " mask: 0x" << std::hex << mask);
         REQUIRE_THAT(result, WithinRel(expected, eps));
       }
     }
@@ -557,6 +715,7 @@ void runTestReduce(int iteration, Reduce reduce)
   LinearAllocGuard<T> input, d_input;
   LinearAllocGuard<unsigned long long> masks, d_masks;
   Op<T> op;
+  std::string opName = opToString<T, Op<T>>();
   int numReduce = 0;
 
   genRandomBuffers(d_input, input, dist, gen, kNumReduces * wavefrontSize);
@@ -566,15 +725,20 @@ void runTestReduce(int iteration, Reduce reduce)
   HIP_CHECK(hipMemcpy(output.ptr(), d_output.ptr(), d_output.size_bytes(), hipMemcpyDeviceToHost));
 
   while (numReduce < kNumReduces) {
+    T expectedByLane[64];
     T* waveInput = &input.ptr()[numReduce * wavefrontSize];
-    T expected = calculateExpected<T>(waveInput, op, masks.ptr()[numReduce]);
+    T expected = calculateExpected<T>(expectedByLane,
+                                      waveInput,
+                                      op,
+                                      masks.ptr()[numReduce],
+                                      AggregationType::Reduce);
     int lane = 0;
 
     while (lane < wavefrontSize) {
       auto result = output.ptr()[numReduce * wavefrontSize + lane];
       unsigned long long mask = masks.ptr()[numReduce];
 
-      if ((1ul << lane) & mask) {
+      if ((1ull << lane) & mask) {
         if constexpr (std::is_integral<T>::value || std::is_same<Op<T>, MinOp<T>>::value ||
                       std::is_same<Op<T>, MaxOp<T>>::value) {
           // for integral types or min/max the result should match exactly
@@ -582,12 +746,13 @@ void runTestReduce(int iteration, Reduce reduce)
             REQUIRE(__half2float(result) == __half2float(expected));
           else {
             if (result != expected) {
-              printMismatch(result, expected, waveInput, mask);
+              printMismatch(result, expected, waveInput, mask, lane);
+              INFO("Operator: " << opName << " mask: 0x" << std::hex << mask);
               REQUIRE(result == expected);
             }
           }
         } else
-          compareFloatingPoint(result, expected, mask, waveInput);
+          compareFloatingPoint<Op<T>>(result, expected, mask, waveInput, lane);
 
       }
       lane++;
diff --git a/projects/hip-tests/catch/performance/api/warpSync/warpSync.cc b/projects/hip-tests/catch/performance/api/warpSync/warpSync.cc
index 315633fb188..5af8dbc9702 100644
--- a/projects/hip-tests/catch/performance/api/warpSync/warpSync.cc
+++ b/projects/hip-tests/catch/performance/api/warpSync/warpSync.cc
@@ -10,6 +10,7 @@
 #include "warp_common.hh"
 #include <hip/hip_runtime.h>
 #include <hip/cooperative_groups/hip_reduce.h>
+#include <hip/cooperative_groups/hip_scan.h>
 #include <hip/hip_fp16.h>
 #include <hip_test_common.hh>
 #include <performance_common.hh>
@@ -32,6 +33,22 @@
 
 static constexpr int kBlockDim = 1024;
 
+unsigned long long fullMask, halfHighBitsOn, halfBitsOn, high16BitsOn, high8BitsOn, high4BitsOn,
+                   allButOne;
+
+static void initializeMasks()
+{
+  int wavefrontSize = getWarpSize();
+  int halfWaveSize = wavefrontSize / 2;
+
+  fullMask = (getWarpSize() == 64)? ~0ull : (1ull << 32) - 1;
+  halfBitsOn = (1ull << (wavefrontSize / 2)) - 1;
+  halfHighBitsOn = halfBitsOn << halfWaveSize;
+  high16BitsOn = halfBitsOn << (wavefrontSize - 16);
+  high8BitsOn = halfBitsOn << (wavefrontSize - 8);
+  high4BitsOn = halfBitsOn << (wavefrontSize - 4);
+  allButOne = fullMask & ~1;
+}
 template <class T> struct AtomicAddOp {
   __device__ T operator()(T* lhs, const T& rhs) { return atomicAdd(lhs, rhs); }
 };
@@ -80,9 +97,10 @@ __global__ void reduceAtomics(T* __restrict__ output, const T* __restrict__ inpu
 
   __syncthreads();
 
-  uint lane = __lane_id();
+  uint lane = (threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.x * blockDim.y) %
+              warpSize;
 
-  if (mask & (1ul << lane)) op(&result[numWarp], input[idx]);
+  if (mask & (1ull << lane)) op(&result[numWarp], input[idx]);
 
   __syncthreads();
 
@@ -94,8 +112,10 @@ __global__ void reduceOpSync(T* __restrict__ output, const T* __restrict__ input
                              unsigned long long mask) {
   int idx = threadIdx.x + blockIdx.x * kBlockDim;
   T result;
+  int laneId = (threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.x * blockDim.y) %
+               warpSize;
 
-  if (mask & (1ul << __lane_id())) {
+  if (mask & (1ull << laneId)) {
     if constexpr (std::is_same<Op<T>, std::plus<T>>::value)
       result = __reduce_add_sync(mask, input[idx]);
     else if constexpr (std::is_same<Op<T>, MinOp<T>>::value)
@@ -115,7 +135,7 @@ __global__ void reduceOpSync(T* __restrict__ output, const T* __restrict__ input
   }
 }
 
-template <size_t TileSize, bool ExcludeFirst, class Functor, class T>
+template <size_t TileSize, class Functor, class T, AggregationType aggType>
 __global__ void reduceCoop(T* __restrict__ output, const T* __restrict__ input)
 {
   namespace cg = cooperative_groups;
@@ -126,14 +146,16 @@ __global__ void reduceCoop(T* __restrict__ output, const T* __restrict__ input)
   const int numTiles = warpSize / TileSize;
   int laneId = threadIdx.x % warpSize;
 
-  if constexpr (ExcludeFirst) {
-    if (laneId == 0) {
-      return;
-    }
+  if constexpr (aggType == AggregationType::Reduce) {
+    result = cg::reduce(mytile, input[idx], Functor());
+  } else if constexpr (aggType == AggregationType::InclusiveScan) {
+    result = cg::inclusive_scan(mytile, input[idx], Functor());
+  } else if constexpr (aggType == AggregationType::ExclusiveScan) {
+    result = cg::exclusive_scan(mytile, input[idx], Functor());
+  } else {
+    static_assert(__hip_internal::is_void<T>::value, "Unsupported aggType");
   }
 
-  result = cg::reduce(mytile, input[idx], Functor());
-
   if (laneId / TileSize  == numTiles - 1) {
     // it's always the higher order tile reduction the one that gets calculated
     output[idx / warpSize] = result;
@@ -190,11 +212,15 @@ template <class T, template <typename> class Op> class ReduceSyncBenchmark
 };
 
 template <class T, template <typename> class Op>
-void checkResults(T* d_lhs, T* d_rhs, size_t numBytes, unsigned long long mask) {
+void checkResults(T* d_lhs, T* d_rhs,
+                  size_t numBytes,
+                  unsigned long long mask,
+                  AggregationType aggType) {
   using namespace Catch::Matchers;
   LinearAllocGuard<T> h_lhs(LinearAllocs::malloc, numBytes);
   LinearAllocGuard<T> h_rhs(LinearAllocs::malloc, numBytes);
   bool memcmpResult;
+  std::string opName = opToString<T, Op<T>>();
 
   assert(numBytes % sizeof(T) == 0 && "numBytes needs to be a multiple of sizeof(T)");
   HIP_CHECK(hipMemcpy(h_lhs.ptr(), d_lhs, numBytes, hipMemcpyDeviceToHost));
@@ -205,6 +231,11 @@ void checkResults(T* d_lhs, T* d_rhs, size_t numBytes, unsigned long long mask)
     for (int i = 0; i < numBytes / sizeof(T); i++) {
       auto& lhsResult = h_lhs.ptr()[i];
       auto& rhsResult = h_rhs.ptr()[i];
+      INFO(" mask: 0x" << std::hex << mask);
+      INFO(" index: " << i);
+      INFO(" operator: " << opName);
+      INFO(" when checking: " << aggregationTypeToStr(aggType));
+
 
       if constexpr (std::is_integral<T>::value || std::is_same<Op<T>, MinOp<T>>::value ||
                     std::is_same<Op<T>, MaxOp<T>>::value)
@@ -219,9 +250,15 @@ void checkResults(T* d_lhs, T* d_rhs, size_t numBytes, unsigned long long mask)
 
 // in this case, instead of using masks, the TileSize would define the mask
 // at compile time
-template <size_t TileSize, bool ExcludeFirst, class T, template <typename> class Op> class CoopBenchmark
-  : public Benchmark<CoopBenchmark<TileSize, ExcludeFirst, T, Op>> {
+template <size_t TileSize, class T, template <typename> class Op, AggregationType aggType> class CoopBenchmark
+  : public Benchmark<CoopBenchmark<TileSize, T, Op, aggType>> {
 public:
+
+  CoopBenchmark()
+  {
+    initializeMasks();
+  }
+
   void operator()(T* output, const T* input, int numItems)
   {
     namespace cg = cooperative_groups;
@@ -233,17 +270,17 @@ template <size_t TileSize, bool ExcludeFirst, class T, template <typename> class
 
     TIMED_SECTION(kTimerTypeEvent) {
       if constexpr (std::is_same<Op<T>, std::plus<T>>::value) {
-        reduceCoop<TileSize, ExcludeFirst, cg::plus<T>, T><<<gridDim, blockDim>>>(output, input);
+        reduceCoop<TileSize, cg::plus<T>, T, aggType><<<gridDim, blockDim>>>(output, input);
       } else if constexpr (std::is_same<Op<T>, MinOp<T>>::value) {
-        reduceCoop<TileSize, ExcludeFirst, cg::less<T>, T><<<gridDim, blockDim>>>(output, input);
+        reduceCoop<TileSize, cg::less<T>, T, aggType><<<gridDim, blockDim>>>(output, input);
       } else if constexpr (std::is_same<Op<T>, MaxOp<T>>::value) {
-        reduceCoop<TileSize, ExcludeFirst, cg::greater<T>, T><<<gridDim, blockDim>>>(output, input);
+        reduceCoop<TileSize, cg::greater<T>, T, aggType><<<gridDim, blockDim>>>(output, input);
       } else if constexpr (std::is_same<Op<T>, AndOp<T>>::value) {
-        reduceCoop<TileSize, ExcludeFirst, cg::bit_and<T>, T><<<gridDim, blockDim>>>(output, input);
+        reduceCoop<TileSize, cg::bit_and<T>, T, aggType><<<gridDim, blockDim>>>(output, input);
       } else if constexpr (std::is_same<Op<T>, OrOp<T>>::value) {
-        reduceCoop<TileSize, ExcludeFirst, cg::bit_or<T>, T><<<gridDim, blockDim>>>(output, input);
+        reduceCoop<TileSize, cg::bit_or<T>, T, aggType><<<gridDim, blockDim>>>(output, input);
       } else if constexpr (std::is_same<Op<T>, XorOp<T>>::value) {
-        reduceCoop<TileSize, ExcludeFirst, cg::bit_xor<T>, T><<<gridDim, blockDim>>>(output, input);
+        reduceCoop<TileSize, cg::bit_xor<T>, T, aggType><<<gridDim, blockDim>>>(output, input);
       } else {
         static_assert(std::is_void<T>::value, "Unsupported operator");
       }
@@ -283,7 +320,47 @@ template <> struct HasAtomicOps<long long> {
   static constexpr bool value = false;
 };
 
+template <class T, template <typename> class Op, AggregationType aggType>
+void benchmarkCoop(LinearAllocGuard<T>* d_outputCoop,
+                   const LinearAllocGuard<T>& d_input,
+                   const std::map<std::string, unsigned long long>& masks,
+                   int numItems)
+{
+  int wavefrontSize = getWarpSize();
+
+  for (const auto& mask : masks) {
+    printf("%s %llx\n", mask.first.c_str(), mask.second);
+    unsigned long long warpMask = wavefrontSize == 64? ~0ull : 0xFFFFFFFF;
+
+    if (mask.second == (fullMask & warpMask)) {
+      if (wavefrontSize == 64) {
+        CoopBenchmark<64, T, Op, aggType> benchmark;
+        benchmark.Run((d_outputCoop++)->ptr(), d_input.ptr(), numItems);
+      } else {
+        CoopBenchmark<32, T, Op, aggType> benchmark;
+        benchmark.Run((d_outputCoop++)->ptr(), d_input.ptr(), numItems);
+      }
+    } else if (wavefrontSize == 64 && mask.second == (halfHighBitsOn & warpMask)) {
+      CoopBenchmark<32, T, Op, aggType> benchmark;
+      benchmark.Run((d_outputCoop++)->ptr(), d_input.ptr(), numItems);
+    } else if (mask.second == (high16BitsOn & warpMask)) {
+      CoopBenchmark<16, T, Op, aggType> benchmark;
+      benchmark.Run((d_outputCoop++)->ptr(), d_input.ptr(), numItems);
+    } else if (mask.second == (high8BitsOn & warpMask)) {
+      CoopBenchmark<8, T, Op, aggType> benchmark;
+      benchmark.Run((d_outputCoop++)->ptr(), d_input.ptr(), numItems);
+    } else if (mask.second == (high4BitsOn & warpMask)) {
+      CoopBenchmark<4, T, Op, aggType> benchmark;
+      benchmark.Run((d_outputCoop++)->ptr(), d_input.ptr(), numItems);
+    }
+  }
+}
 template <class T, template <typename> class Op> struct ReduceBenchmark {
+  ReduceBenchmark()
+  {
+    initializeMasks();
+  }
+
   void Run() {
     static constexpr int numMasks = 6;
     using distribution = typename DistributionType<T>::type;
@@ -302,12 +379,6 @@ template <class T, template <typename> class Op> struct ReduceBenchmark {
     LinearAllocGuard<T>* d_outputCoop = &d_outputsCoop[0];
     std::mt19937_64 gen(123);
     distribution dist;
-    int halfWaveSize = wavefrontSize / 2;
-    unsigned long long halfBitsOn = (1ul << (wavefrontSize / 2)) - 1;
-    unsigned long long fullMask = -1ul, halfHighBitsOn = halfBitsOn << halfWaveSize,
-                       high16BitsOn = halfBitsOn << (wavefrontSize - 16),
-                       high8BitsOn = halfBitsOn << (wavefrontSize - 8),
-                       high4BitsOn = halfBitsOn << (wavefrontSize - 4), allButOne = -1 & ~1;
     const char* typeStr = typeToString<T>();
     const char* opStr = opToString<T, Op<T>>();
     std::map<std::string, unsigned long long> masks;
@@ -372,56 +443,39 @@ template <class T, template <typename> class Op> struct ReduceBenchmark {
     }
 
     printf("\n--- reduce cooperative groups %s %s--- \n", opStr, typeStr);
-
-    for (const auto& mask : masks) {
-      printf("%s %llx\n", mask.first.c_str(), mask.second);
-      unsigned long warpMask = wavefrontSize == 64? ~0ull : 0xFFFFFFFF;
-
-      if (mask.second == (fullMask & warpMask)) {
-
-        if (wavefrontSize == 64) {
-          CoopBenchmark<64, false, T, Op> benchmarkCoop;
-          benchmarkCoop.Run((d_outputCoop++)->ptr(), d_input.ptr(), numItems);
-        } else {
-          CoopBenchmark<32, false, T, Op> benchmarkCoop;
-          benchmarkCoop.Run((d_outputCoop++)->ptr(), d_input.ptr(), numItems);
-        }
-      } else if (wavefrontSize == 64 && mask.second == (halfHighBitsOn & warpMask)) {
-        CoopBenchmark<32, false, T, Op> benchmarkCoop;
-        benchmarkCoop.Run((d_outputCoop++)->ptr(), d_input.ptr(), numItems);
-      } else if (mask.second == (high16BitsOn & warpMask)) {
-        CoopBenchmark<16, false, T, Op> benchmarkCoop;
-        benchmarkCoop.Run((d_outputCoop++)->ptr(), d_input.ptr(), numItems);
-      } else if (mask.second == (high8BitsOn & warpMask)) {
-        CoopBenchmark<8, false, T, Op> benchmarkCoop;
-        benchmarkCoop.Run((d_outputCoop++)->ptr(), d_input.ptr(), numItems);
-      } else if (mask.second == (high4BitsOn & warpMask)) {
-        CoopBenchmark<4, false, T, Op> benchmarkCoop;
-        benchmarkCoop.Run((d_outputCoop++)->ptr(), d_input.ptr(), numItems);
-      } else if (mask.second == (allButOne & warpMask)) {
-        if (wavefrontSize == 64) {
-          CoopBenchmark<64, true, T, Op> benchmarkCoop; 
-          benchmarkCoop.Run((d_outputCoop++)->ptr(), d_input.ptr(), numItems);
-        } else {
-          CoopBenchmark<32, true, T, Op> benchmarkCoop; 
-          benchmarkCoop.Run((d_outputCoop++)->ptr(), d_input.ptr(), numItems);
-        }
-      }
-    }
-
+    benchmarkCoop<T, Op, AggregationType::Reduce>(d_outputCoop, d_input, masks, numItems);
     printf("\n");
 
     if constexpr (HasAtomicOps<T>::value) {
       printf("Checking results...\n");
 
       for (const auto& mask : masks) {
-        checkResults<T, Op>(d_outputsAtomic[pos].ptr(), d_outputsReduce[pos].ptr(), outputNumBytes,
-                            mask.second);
-        checkResults<T, Op>(d_outputsReduce[pos].ptr(), d_outputsCoop[pos].ptr(), outputNumBytes,
-                            mask.second);
+        checkResults<T, Op>(d_outputsAtomic[pos].ptr(),
+                            d_outputsReduce[pos].ptr(),
+                            outputNumBytes,
+                            mask.second,
+                            AggregationType::Reduce);
+
+        if (mask.second != allButOne) {
+          // allButOne is not supported in tiled cooperative groups; all the threads of the tile must
+          // be active
+          checkResults<T, Op>(d_outputsReduce[pos].ptr(),
+                              d_outputsCoop[pos].ptr(),
+                              outputNumBytes,
+                              mask.second,
+                              AggregationType::Reduce);
+        }
         pos++;
       }
     }
+
+    printf("\n--- inclusive scan cooperative groups %s %s--- \n", opStr, typeStr);
+    benchmarkCoop<T, Op, AggregationType::InclusiveScan>(d_outputCoop, d_input, masks, numItems);
+    printf("\n");
+
+    printf("\n--- exclusive scan cooperative groups %s %s--- \n", opStr, typeStr);
+    benchmarkCoop<T, Op, AggregationType::ExclusiveScan>(d_outputCoop, d_input, masks, numItems);
+    printf("\n");
   }
 };
 
diff --git a/projects/hip-tests/catch/unit/cooperativeGrps/cooperative_groups_common.hh b/projects/hip-tests/catch/unit/cooperativeGrps/cooperative_groups_common.hh
index eead915c9eb..f7d5ce39f1a 100644
--- a/projects/hip-tests/catch/unit/cooperativeGrps/cooperative_groups_common.hh
+++ b/projects/hip-tests/catch/unit/cooperativeGrps/cooperative_groups_common.hh
@@ -9,6 +9,7 @@
 #include <hip_test_common.hh>
 #include <hip/hip_cooperative_groups.h>
 #include <hip/cooperative_groups/hip_reduce.h>
+#include <hip/cooperative_groups/hip_scan.h>
 #include <cmd_options.hh>
 #include "../math/math_common.hh"
 namespace {
diff --git a/projects/hip-tests/catch/unit/cooperativeGrps/thread_block_tile.cc b/projects/hip-tests/catch/unit/cooperativeGrps/thread_block_tile.cc
index c0231d48294..7dfb28033b4 100644
--- a/projects/hip-tests/catch/unit/cooperativeGrps/thread_block_tile.cc
+++ b/projects/hip-tests/catch/unit/cooperativeGrps/thread_block_tile.cc
@@ -570,14 +570,24 @@ void testReduceForTileSize()
   void* devicePtr = d_result.ptr();
   void* args[] = { &devicePtr };
 
-  HIP_CHECK(hipLaunchCooperativeKernel(reinterpret_cast<void*>(simpleSum<TileSize>), gridDim, blockDim, args, 0, nullptr));
+  HIP_CHECK(hipLaunchCooperativeKernel(reinterpret_cast<void*>(simpleSum<TileSize>),
+                                       gridDim,
+                                       blockDim,
+                                       args,
+                                       0,
+                                       nullptr));
   HIP_CHECK(hipDeviceSynchronize());
   HIP_CHECK(hipGetLastError());
   HIP_CHECK(hipMemcpy(h_result.host_ptr(), d_result.ptr(),
                       h_result.size_bytes(), hipMemcpyDeviceToHost));
   REQUIRE(*h_result.host_ptr() == TileSize);
 
-  HIP_CHECK(hipLaunchCooperativeKernel(reinterpret_cast<void*>(simpleSumSubtiles<TileSize>), gridDim, blockDim, args, 0, nullptr));
+  HIP_CHECK(hipLaunchCooperativeKernel(reinterpret_cast<void*>(simpleSumSubtiles<TileSize>),
+                                       gridDim,
+                                       blockDim,
+                                       args,
+                                       0,
+                                       nullptr));
   HIP_CHECK(hipDeviceSynchronize());
   HIP_CHECK(hipGetLastError());
   HIP_CHECK(hipMemcpy(h_result.host_ptr(), d_result.ptr(),
@@ -601,10 +611,15 @@ HIP_TEMPLATE_TEST_CASE(Unit_Thread_Block_Tile_Reduce_Basic, int)
   }
 }
 
+// @extraMasks  when testing coalesced_threads, this can be use to simulate
+//              divergence
 template <size_t TileSize, class Functor, class T>
-void __global__ reduceKernel(T* output, const T* input)
+void __global__ reduceKernel(T* output,
+                             const T* input,
+                             unsigned long long* extraMasks,
+                             AggregationType* aggType)
 {
-  int tid = threadIdx.x;
+  int tid = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.x * blockDim.y;
   int laneId = tid % warpSize;
   cg::thread_block mygroup = cg::this_thread_block();
   auto mytile = cg::tiled_partition<TileSize>(mygroup);
@@ -612,16 +627,37 @@ void __global__ reduceKernel(T* output, const T* input)
   for (int i = 0; i < kNumReduces; i++) {
     int idx = warpSize * i + laneId;
     T& result = output[idx];
+    unsigned long long mask = extraMasks[i];
 
-    result = cg::reduce(mytile, input[idx], Functor());
+    if ((1ull << laneId) & mask) {
+      switch (*aggType) {
+      case AggregationType::Reduce:
+        result = cg::reduce(mytile, input[idx], Functor());
+        break;
+      case AggregationType::InclusiveScan:
+        result = cg::inclusive_scan(mytile, input[idx], Functor());
+        break;
+      case AggregationType::ExclusiveScan:
+        result = cg::exclusive_scan(mytile, input[idx], Functor());
+        break;
+      default:
+        assert(false && "Unsupported enumeration");
+      }
+
+    } else {
+      result = 0;
+    }
   }
 }
 
 // @extraMasks  used to simulate divergence when using coalesced_threads
 template <class Functor, class T>
-void __global__ reduceKernelCoalesced(T* output, const T* input, unsigned long long* extraMasks)
+void __global__ reduceKernelCoalesced(T* output,
+                                      const T* input,
+                                      unsigned long long* extraMasks,
+                                      AggregationType* aggType)
 {
-  int tid = threadIdx.x;
+  int tid = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.x * blockDim.y;
   int laneId = tid % warpSize;
 
   for (int i = 0; i < kNumReduces; i++) {
@@ -632,7 +668,19 @@ void __global__ reduceKernelCoalesced(T* output, const T* input, unsigned long l
     if ((1ull << laneId) & mask) {
       auto coalesced = cg::coalesced_threads();
 
-      result = cg::reduce(coalesced, input[idx], Functor());
+      switch (*aggType) {
+      case AggregationType::Reduce:
+        result = cg::reduce(coalesced, input[idx], Functor());
+        break;
+      case AggregationType::InclusiveScan:
+        result = cg::inclusive_scan(coalesced, input[idx], Functor());
+        break;
+      case AggregationType::ExclusiveScan:
+        result = cg::exclusive_scan(coalesced, input[idx], Functor());
+        break;
+      default:
+        assert(false && "Unsupported enumeration");
+      }
     } else {
       result = 0;
     }
@@ -641,7 +689,8 @@ void __global__ reduceKernelCoalesced(T* output, const T* input, unsigned long l
 
 // @TileSize the tile size or 0 when testing coalesced groups
 template <size_t TileSize, class Op, class T>
-void reduceForTypeAndOp()
+void aggregateForTypeAndOp(AggregationType aggType,
+                           dim3 blockDim)
 {
   using distribution = typename DistributionType<T>::type;
   int wavefrontSize = getWarpSize();
@@ -654,18 +703,20 @@ void reduceForTypeAndOp()
                                                     kNumReduces * sizeof(unsigned long long));
   LinearAllocGuard<unsigned long long> h_extraMasks(LinearAllocs::malloc,
                                                     kNumReduces * sizeof(unsigned long long));
+  LinearAllocGuard<AggregationType> d_aggType(LinearAllocs::hipMalloc, sizeof(AggregationType));
   std::mt19937_64 gen(Catch::rngSeed());
   dim3 gridDim = { 1 };
-  dim3 blockDim = { static_cast<unsigned short>(wavefrontSize) };
   hipError_t status;
   typename distribution::result_type a = std::is_same<T, half>::value? std::numeric_limits<unsigned short>::lowest() :
                                          (std::is_signed<T>::value? -1023 : 0);
   typename distribution::result_type b = std::is_same<T, half>::value? std::numeric_limits<unsigned short>::max() :
                                          1023;
-  distribution distInput(a, b);
-  int numReduce = 0;
+  distribution distInput {a, b};
+  int numAggregation = 0;
   void* kernelPtr;
+  T expected[64];
 
+  HIP_CHECK(hipMemcpy(d_aggType.ptr(), &aggType, sizeof(aggType), hipMemcpyHostToDevice));
   genRandomBuffers(d_input, h_input, distInput, gen, kNumReduces * wavefrontSize);
 
   if (TileSize) {
@@ -680,7 +731,7 @@ void reduceForTypeAndOp()
                    kNumReduces);
   }
 
-  std::array<void*, 3> devicePtrs = { d_result.ptr(), d_input.ptr(), d_extraMasks.ptr() };
+  std::array<void*, 4> devicePtrs = { d_result.ptr(), d_input.ptr(), d_extraMasks.ptr(), d_aggType.ptr() };
   void* args[devicePtrs.size()];
 
   for (int i = 0; i < devicePtrs.size(); i++) {
@@ -705,96 +756,123 @@ void reduceForTypeAndOp()
   HIP_CHECK(hipMemcpy(h_result.host_ptr(), d_result.ptr(),
                       h_result.size_bytes(), hipMemcpyDeviceToHost));
 
-  while (numReduce < kNumReduces) {
+  while (numAggregation < kNumReduces) {
     for (int laneId = 0; laneId < wavefrontSize; laneId++) {
+      T result = h_result.host_ptr()[numAggregation * wavefrontSize + laneId];
+      int lastLane;
+      const T* input = &h_input.host_ptr()[numAggregation * wavefrontSize];
       unsigned long long mask = ~0ull;
-      T result = h_result.host_ptr()[numReduce * wavefrontSize + laneId], expected = 0;
-      const T* input = &h_input.host_ptr()[numReduce * wavefrontSize];
+      Op op {};
+      std::string opName = opToString<T, Op>();
+      int resultLane;
 
       if constexpr (TileSize > 0) {
         mask >>= (64 - TileSize);
         mask <<= ((laneId % wavefrontSize) / TileSize) * TileSize;
       }
 
-      mask &= h_extraMasks.host_ptr()[numReduce];
+      mask &= h_extraMasks.host_ptr()[numAggregation];
+      lastLane = 64 - __builtin_clzll(mask) - 1;
 
-      if ((1ull << laneId) & mask) {
-        Op op {};
-        expected = calculateExpected(input, op, mask);
+      if (laneId > lastLane) {
+        continue;
       }
 
-      if constexpr (std::is_integral<T>::value) {
-        // for integral types the result should match exactly
-        if (result != expected) {
-          std::string opName = opToString<T, Op>();
-          printMismatch(result, expected, input, mask);
-          INFO("Operator: " << opName);
-          REQUIRE(result == expected);
-        }
+      calculateExpected<T>(expected, input, op, mask, aggType);
+
+      if (aggType == AggregationType::Reduce) {
+        resultLane = lastLane;
       } else {
-        compareFloatingPoint(result, expected, mask, h_input.host_ptr());
+        resultLane = laneId;
+      }
+
+      if ((1ull << laneId) & mask) {
+        if constexpr (std::is_integral<T>::value) {
+          // for integral types the result should match exactly
+          // for reduce, the result would be in the last lane whose first bit is on in the mask
+          // for scans, the associated result is different in each lane
+          if (result != expected[resultLane]) {
+            INFO("Aggregation type: " << aggregationTypeToStr(aggType));
+            printMismatch(result, expected[resultLane], input, mask, laneId);
+            INFO("Operator: " << opName << " mask: 0x" << std::hex << mask);
+            REQUIRE(result == expected[resultLane]);
+          }
+        } else {
+          compareFloatingPoint<Op>(result,
+                                   expected[resultLane],
+                                   mask,
+                                   h_input.host_ptr(),
+                                   laneId);
+        }
       }
     }
 
-    numReduce++;
+    numAggregation++;
   }
 }
 
 template <class Op, class T>
-void reduceCoopTiles(const std::index_sequence<>)
+void aggregateCoopTiles(AggregationType, dim3, const std::index_sequence<>)
 {
 }
 
 template <class Op, class T, size_t TileSize, size_t... TileSizes>
-void reduceCoopTiles(const std::index_sequence<TileSize, TileSizes...>)
+void aggregateCoopTiles(AggregationType aggType, dim3 blockDim, const std::index_sequence<TileSize, TileSizes...>)
 {
   const std::index_sequence<TileSizes...> remainingTiles;
 
-  reduceForTypeAndOp<TileSize, Op, T>();
-  reduceCoopTiles<Op, T>(remainingTiles);
+  aggregateForTypeAndOp<TileSize, Op, T>(aggType, blockDim);
+  aggregateCoopTiles<Op, T>(aggType, blockDim, remainingTiles);
 }
 
-template <bool Coalesced, class Op, class T, int WarpSize>
-void runReduceRandomForType()
+template <bool Coalesced, class Op, class T, int MaxTileSize>
+void runAggregationRandomForType(AggregationType aggType, dim3 blockDim)
 {
   if constexpr (Coalesced) {
-    reduceForTypeAndOp<0, Op, T>();
-  } else if constexpr (WarpSize <= 32) {
+    aggregateForTypeAndOp<0, Op, T>(aggType, blockDim);
+  } else if constexpr (MaxTileSize <= 4) {
+    std::index_sequence<1, 2, 4> tileSizes;
+    aggregateCoopTiles<Op, T>(aggType, blockDim, tileSizes);
+  } else if constexpr (MaxTileSize <= 32) {
     std::index_sequence<1, 2, 4, 8, 16, 32> tileSizes;
-    reduceCoopTiles<Op, T>(tileSizes);
+    aggregateCoopTiles<Op, T>(aggType, blockDim, tileSizes);
   } else {
     std::index_sequence<1, 2, 4, 8, 16, 32, 64> tileSizes;
-    reduceCoopTiles<Op, T>(tileSizes);
+    aggregateCoopTiles<Op, T>(aggType, blockDim, tileSizes);
   }
 }
 
 template <bool Coalesced, class T, int WarpSize, class Op = void>
-void runReduceRandomForOps(const std::tuple<>)
+void runAggregationRandomForOps(AggregationType aggType, const std::tuple<>)
 {
 }
 
 template <bool Coalesced, class T, int WarpSize, class Op, class... Ops>
-void runReduceRandomForOps(const std::tuple<Op, Ops...>)
+void runAggregationRandomForOps(AggregationType aggType, const std::tuple<Op, Ops...>)
 {
   const std::tuple<Ops...> remainingOps;
+  int wavefrontSize = getWarpSize();
+  dim3 blockDim = {static_cast<unsigned int>(wavefrontSize)};
 
-  runReduceRandomForType<Coalesced, Op, T, WarpSize>();
-  runReduceRandomForOps<Coalesced, T, WarpSize>(remainingOps);
+  runAggregationRandomForType<Coalesced, Op, T, WarpSize>(aggType, blockDim);
+  runAggregationRandomForOps<Coalesced, T, WarpSize>(aggType, remainingOps);
 }
 
 // for all the tile sizes and all input types, using random input values, calculates the reduce()
 // values. Additionally, randomly make some threads not participate for the coalesced_threads case
 HIP_TEMPLATE_TEST_CASE(Unit_Thread_Block_Tile_Reduce_Random_arithmetic, int, unsigned int, long long,
-                   unsigned long long, float, half, double)
+                       unsigned long long, float, half, double)
 {
   std::tuple<cooperative_groups::plus<TestType>,
              cooperative_groups::less<TestType>,
              cooperative_groups::greater<TestType>> types;
+  int wavefrontSize = getWarpSize();
+  dim3 blockDim = {static_cast<unsigned int>(wavefrontSize)};
 
-  if (getWarpSize() == 32) {
-    runReduceRandomForOps<false, TestType, 32>(types);
+  if (wavefrontSize == 32) {
+    runAggregationRandomForOps<false, TestType, 32>(AggregationType::Reduce, types);
   } else {
-    runReduceRandomForOps<false, TestType, 64>(types);
+    runAggregationRandomForOps<false, TestType, 64>(AggregationType::Reduce, types);
   }
 }
 
@@ -806,22 +884,33 @@ HIP_TEMPLATE_TEST_CASE(Unit_Thread_Block_Tile_Reduce_Random_boolean, int, unsign
              cooperative_groups::bit_xor<TestType>> types;
 
   if (getWarpSize() == 32) {
-    runReduceRandomForOps<false, TestType, 32>(types);
+    runAggregationRandomForOps<false, TestType, 32>(AggregationType::Reduce, types);
   } else {
-    runReduceRandomForOps<false, TestType, 64>(types);
+    runAggregationRandomForOps<false, TestType, 64>(AggregationType::Reduce, types);
   }
 }
 
 // passes a custom operator to cooperative_groups::reduce()
 HIP_TEST_CASE(Unit_Thread_Block_Tile_Reduce_Custom_Op)
 {
-  if (getWarpSize() == 32) {
-    runReduceRandomForType<false, MaxOfAbsolute<int>, int, 32>();
+  int wavefrontSize = getWarpSize();
+
+  dim3 blockDim = {static_cast<unsigned int>(wavefrontSize)};
+  if (wavefrontSize == 32) {
+    runAggregationRandomForType<false, MaxOfAbsolute<int>, int, 32>(AggregationType::Reduce, blockDim);
   } else {
-    runReduceRandomForType<false, MaxOfAbsolute<int>, int, 64>();
+    runAggregationRandomForType<false, MaxOfAbsolute<int>, int, 64>(AggregationType::Reduce, blockDim);
   }
 }
 
+HIP_TEST_CASE(Unit_Thread_Block_Tile_Scan_Custom_Op)
+{
+  dim3 blockDim = {static_cast<unsigned int>(getWarpSize())};
+  // only using 4 threads to avoid long long overflows
+  runAggregationRandomForType<false, NonCommutativeOp<long long>, long long, 4>(
+    AggregationType::InclusiveScan, blockDim);
+}
+
 struct Vector {
   int x;
   int y;
@@ -838,7 +927,7 @@ struct MaxMagnitude {
   }
 };
 
-void __global__ maxMagnitude(Vector* result)
+void __global__ maxMagnitude(Vector* result, AggregationType* aggregationType)
 {
   cg::thread_block mygroup = cg::this_thread_block();
   auto mytile = cg::tiled_partition<4>(mygroup);
@@ -848,27 +937,102 @@ void __global__ maxMagnitude(Vector* result)
                     { 0, 7 },
                     { 4, 1} };
 
-  *result = cg::reduce(mytile, input[threadIdx.x], op);
+  switch (*aggregationType) {
+  case AggregationType::Reduce:
+    result[threadIdx.x] = cg::reduce(mytile, input[threadIdx.x], op);
+    break;
+  case AggregationType::InclusiveScan:
+    result[threadIdx.x] = cg::inclusive_scan(mytile, input[threadIdx.x], op);
+    break;
+  case AggregationType::ExclusiveScan:
+    result[threadIdx.x] = cg::exclusive_scan(mytile, input[threadIdx.x], op);
+    break;
+  default:
+    assert(false && "Unexpected aggType");
+  }
 }
 
 // tests that we can pass trivially copyable structs as values to reduce
 HIP_TEST_CASE(Unit_Thread_Block_Tile_Reduce_Trivially_Copyable_Parameters)
 {
-  LinearAllocGuard<Vector> h_result(LinearAllocs::malloc, sizeof(Vector));
-  LinearAllocGuard<Vector> d_result(LinearAllocs::hipMalloc, sizeof(Vector));
   dim3 gridDim = { 1 };
   dim3 blockDim = { 4 };
-  void* devicePtr = d_result.ptr();
-  void* args[] = { &devicePtr };
+  LinearAllocGuard<Vector> h_result(LinearAllocs::malloc, sizeof(Vector) * blockDim.x);
+  LinearAllocGuard<Vector> d_result(LinearAllocs::hipMalloc, sizeof(Vector) * blockDim.x);
+  LinearAllocGuard<AggregationType> d_aggType(LinearAllocs::hipMalloc, sizeof(AggregationType));
+  std::array<void*, 2> devicePtrs = { d_result.ptr(), d_aggType.ptr() };
+  void* args[devicePtrs.size()];
   Vector* result;
+  AggregationType aggType = AggregationType::Reduce;
 
+  for (int i = 0; i < devicePtrs.size(); i++) {
+    args[i] = &devicePtrs[i];
+  }
+
+  HIP_CHECK(hipMemcpy(d_aggType.ptr(), &aggType, sizeof(aggType), hipMemcpyHostToDevice));
   HIP_CHECK(hipLaunchCooperativeKernel(reinterpret_cast<void*>(maxMagnitude), gridDim, blockDim, args, 0, nullptr));
   HIP_CHECK(hipDeviceSynchronize());
   HIP_CHECK(hipGetLastError());
   HIP_CHECK(hipMemcpy(h_result.host_ptr(), d_result.ptr(),
                       h_result.size_bytes(), hipMemcpyDeviceToHost));
-  result = &h_result.host_ptr()[0];
-  REQUIRE((result->x == 1 && result->y == 9));
+
+  for (unsigned int idx = 0; idx < blockDim.x; idx++) {
+    result = &h_result.host_ptr()[idx];
+    REQUIRE((result->x == 1 && result->y == 9));
+  }
+}
+
+TEST_CASE(Unit_Thread_Block_Tile_Scan_Trivially_Copyable_Parameters)
+{
+  dim3 gridDim = { 1 };
+  dim3 blockDim = { 4 };
+  LinearAllocGuard<Vector> h_result(LinearAllocs::malloc, sizeof(Vector) * blockDim.x);
+  LinearAllocGuard<Vector> d_result(LinearAllocs::hipMalloc, sizeof(Vector) * blockDim.x);
+  LinearAllocGuard<AggregationType> d_aggType(LinearAllocs::hipMalloc, sizeof(AggregationType));
+  std::array<void*, 2> devicePtrs = { d_result.ptr(), d_aggType.ptr() };
+  void* args[devicePtrs.size()];
+  Vector* result;
+  AggregationType aggType;
+
+  for (int i = 0; i < devicePtrs.size(); i++) {
+    args[i] = &devicePtrs[i];
+  }
+
+  SECTION("inclusive") {
+    aggType = AggregationType::InclusiveScan;
+    HIP_CHECK(hipMemcpy(d_aggType.ptr(), &aggType, sizeof(aggType), hipMemcpyHostToDevice));
+    HIP_CHECK(hipLaunchCooperativeKernel(reinterpret_cast<void*>(maxMagnitude), gridDim, blockDim, args, 0, nullptr));
+    HIP_CHECK(hipDeviceSynchronize());
+    HIP_CHECK(hipGetLastError());
+    HIP_CHECK(hipMemcpy(h_result.host_ptr(), d_result.ptr(),
+                        h_result.size_bytes(), hipMemcpyDeviceToHost));
+    result = &h_result.host_ptr()[0];
+    REQUIRE((result->x == 2 && result->y == 3));
+    result = &h_result.host_ptr()[1];
+    REQUIRE((result->x == 1 && result->y == 9));
+    result = &h_result.host_ptr()[2];
+    REQUIRE((result->x == 1 && result->y == 9));
+    result = &h_result.host_ptr()[3];
+    REQUIRE((result->x == 1 && result->y == 9));
+  }
+
+  SECTION("exclusive") {
+    aggType = AggregationType::ExclusiveScan;
+    HIP_CHECK(hipMemcpy(d_aggType.ptr(), &aggType, sizeof(aggType), hipMemcpyHostToDevice));
+    HIP_CHECK(hipLaunchCooperativeKernel(reinterpret_cast<void*>(maxMagnitude), gridDim, blockDim, args, 0, nullptr));
+    HIP_CHECK(hipDeviceSynchronize());
+    HIP_CHECK(hipGetLastError());
+    HIP_CHECK(hipMemcpy(h_result.host_ptr(), d_result.ptr(),
+                        h_result.size_bytes(), hipMemcpyDeviceToHost));
+    result = &h_result.host_ptr()[0];
+    REQUIRE((result->x == 0 && result->y == 0));
+    result = &h_result.host_ptr()[1];
+    REQUIRE((result->x == 2 && result->y == 3));
+    result = &h_result.host_ptr()[2];
+    REQUIRE((result->x == 1 && result->y == 9));
+    result = &h_result.host_ptr()[3];
+    REQUIRE((result->x == 1 && result->y == 9));
+  }
 }
 
 template <size_t NumElems>
@@ -897,9 +1061,7 @@ struct Max {
     ArrayContainer<NumElems> result;
 
     for (int i = 0; i < NumElems; i++) {
-      for (int i = 0; i < NumElems; i++) {
         result[i] = std::max(lhs[i], rhs[i]);
-      }
     }
 
     return result;
@@ -916,13 +1078,50 @@ __global__ void applyFunctor(ArrayContainer<NumElems>* result)
 
   if (threadIdx.x < NumElems) {
     input[threadIdx.x] = threadIdx.x;
-    __syncwarp();
+  }
+
+  mytile.sync();
+
+  if (threadIdx.x < NumElems) {
     *result = cg::reduce(mytile, input, op);
   }
 }
 
+template <size_t NumElems, class Functor>
+__global__ void applyScanFunctor(ArrayContainer<NumElems>** result,
+                                 AggregationType* aggType)
+{
+  cg::thread_block mygroup = cg::this_thread_block();
+  auto mytile = cg::tiled_partition<NumElems>(mygroup);
+  __shared__ ArrayContainer<NumElems> input;
+  Functor op;
+  unsigned int tid = threadIdx.x +
+                     threadIdx.y * blockDim.x +
+                     threadIdx.z * blockDim.x * blockDim.y;
+
+  if (threadIdx.x < NumElems) {
+    input[threadIdx.x] = threadIdx.x;
+  }
+
+  mytile.sync();
+
+  if (threadIdx.x < NumElems) {
+    switch (*aggType) {
+    case AggregationType::InclusiveScan:
+      *(result[tid]) = cg::inclusive_scan(mytile, input, op);
+      break;
+    case AggregationType::ExclusiveScan:
+      *(result[tid]) = cg::exclusive_scan(mytile, input, op);
+      break;
+    default:
+      assert(false && "AggregationType not supported");
+    }
+  }
+}
+
+// tests aggregations of arguments of different sizes (types <= 32 bytes are accepted)
 template <size_t NumElems, template <size_t> class Functor>
-void testReduceSizes()
+void testArgsDifferentSizesReduce()
 {
   LinearAllocGuard<ArrayContainer<NumElems>> h_result(LinearAllocs::malloc, sizeof(ArrayContainer<NumElems>));
   LinearAllocGuard<ArrayContainer<NumElems>> d_result(LinearAllocs::hipMalloc, sizeof(ArrayContainer<NumElems>));
@@ -932,7 +1131,8 @@ void testReduceSizes()
   void* args[] = { &devicePtr };
   ArrayContainer<NumElems>* result;
 
-  HIP_CHECK(hipLaunchCooperativeKernel(reinterpret_cast<void*>(applyFunctor<NumElems, Functor<NumElems>>), gridDim, blockDim, args, 0, nullptr));
+  HIP_CHECK(hipLaunchCooperativeKernel(reinterpret_cast<void*>(applyFunctor<NumElems, Functor<NumElems>>),
+                                       gridDim, blockDim, args, 0, nullptr));
   HIP_CHECK(hipDeviceSynchronize());
   HIP_CHECK(hipGetLastError());
   HIP_CHECK(hipMemcpy(h_result.host_ptr(), d_result.ptr(),
@@ -953,7 +1153,101 @@ void testReduceSizes()
   }
 
   if constexpr (NumElems > 1) {
-    testReduceSizes<NumElems / 2, Functor>();
+    testArgsDifferentSizesReduce<NumElems / 2, Functor>();
+  }
+}
+
+// in this case, as opposed to the reduction, were we were only saving one result per reduction,
+// we save one result per lane. i.e. we save one ArrayContainer per lane, as each lane will return
+// a different ArrayContainer value
+template <size_t NumElems, template <size_t> class Functor>
+void testArgsDifferentSizesScan(AggregationType aggType)
+{
+  int wavefrontSize = getWarpSize();
+  // one per lane
+  LinearAllocGuard<ArrayContainer<NumElems>> h_result[64];
+  LinearAllocGuard<ArrayContainer<NumElems>> d_result[64];
+
+  // as we cannot pass d_result directly (because it is an array of
+  // LinearAllocGuards), we convert to an array of raw device pointers
+  ArrayContainer<NumElems>* h_devicePtrs[64];
+  LinearAllocGuard<ArrayContainer<NumElems>*> d_devicePtrs(LinearAllocs::hipMalloc, sizeof(void*) * 64);
+  LinearAllocGuard<AggregationType> d_aggType(LinearAllocs::hipMalloc, sizeof(AggregationType));
+  dim3 gridDim = { 1 };
+  dim3 blockDim = { 32 };
+  ArrayContainer<NumElems>* result;
+
+  for (int i = 0; i < wavefrontSize; i++) {
+    h_result[i] = LinearAllocGuard<ArrayContainer<NumElems>>(LinearAllocs::malloc,
+                                                             sizeof(ArrayContainer<NumElems>));
+    d_result[i] = LinearAllocGuard<ArrayContainer<NumElems>>(LinearAllocs::hipMalloc,
+                                                             sizeof(ArrayContainer<NumElems>));
+  }
+
+  for (int i = 0; i < wavefrontSize; i++) {
+    h_devicePtrs[i] = d_result[i].ptr();
+  }
+
+  HIP_CHECK(hipMemcpy(d_aggType.ptr(), &aggType, sizeof(aggType), hipMemcpyHostToDevice));
+  HIP_CHECK(hipMemcpy(d_devicePtrs.ptr(),
+                      h_devicePtrs,
+                      d_devicePtrs.size_bytes(),
+                      hipMemcpyHostToDevice));
+
+  std::array<void*, 2> devicePtrs = { d_devicePtrs.ptr(), d_aggType.ptr() };
+  void* args[devicePtrs.size()];
+
+  for (int i = 0; i < devicePtrs.size(); i++) {
+    args[i] = &devicePtrs[i];
+  }
+  HIP_CHECK(hipLaunchCooperativeKernel(reinterpret_cast<void*>(applyScanFunctor<NumElems, Functor<NumElems>>),
+                                       gridDim, blockDim, args, 0, nullptr));
+  HIP_CHECK(hipDeviceSynchronize());
+  HIP_CHECK(hipGetLastError());
+
+  for (int i = 0; i < wavefrontSize; i++) {
+    HIP_CHECK(hipMemcpy(h_result[i].host_ptr(), d_result[i].ptr(),
+                        h_result[i].size_bytes(), hipMemcpyDeviceToHost));
+  }
+
+  INFO("T is of size: " << NumElems);
+
+  for (int laneId = 0; laneId < NumElems; laneId++) {
+    result = &(h_result[laneId].host_ptr()[0]);
+
+    if (aggType == AggregationType::InclusiveScan) {
+      INFO("Lane: " << laneId);
+
+      if (std::is_same<Functor<NumElems>, Sum<NumElems>>::value) {
+        // the result can be calculated with an arithmetic series formula, modulo 256
+        // (we do overflow unsigned char for some indices, but that is defined behaviour)
+        REQUIRE((*result)[laneId] == (((laneId + 1) * (laneId + laneId) / 2) % 256));
+      } else {
+        REQUIRE((*result)[laneId] == laneId);
+      }
+    } else if (aggType == AggregationType::ExclusiveScan) {
+      if (std::is_same<Functor<NumElems>, Sum<NumElems>>::value) {
+        INFO("Lane: " << laneId);
+
+        // the result can be calculated with an arithmetic series formula, modulo 256
+        // (we do overflow unsigned char for some indices, but that is defined behaviour)
+        if (laneId == 0) {
+          REQUIRE((*result)[laneId] == 0);
+        } else {
+          REQUIRE((*result)[laneId] == (((laneId) * (laneId + laneId) / 2) % 256));
+        }
+      } else {
+        if (laneId == 0) {
+          REQUIRE((*result)[laneId] == 0);
+        } else {
+          REQUIRE((*result)[laneId] == laneId);
+        }
+      }
+    }
+  }
+
+  if constexpr (NumElems > 1) {
+    testArgsDifferentSizesScan<NumElems / 2, Functor>(aggType);
   }
 }
 
@@ -962,11 +1256,24 @@ void testReduceSizes()
 HIP_TEST_CASE(Unit_Thread_Block_Tile_Reduce_All_Parameter_Sizes)
 {
   SECTION("sum") {
-    testReduceSizes<32, Sum>();
+    testArgsDifferentSizesReduce<32, Sum>();
+  }
+
+  SECTION("max") {
+    testArgsDifferentSizesReduce<32, Max>();
+  }
+}
+
+TEST_CASE(Unit_Thread_Block_Tile_Scan_All_Parameter_Sizes)
+{
+  SECTION("sum") {
+    testArgsDifferentSizesScan<32, Sum>(AggregationType::InclusiveScan);
+    testArgsDifferentSizesScan<32, Sum>(AggregationType::ExclusiveScan);
   }
 
   SECTION("max") {
-    testReduceSizes<32, Max>();
+    testArgsDifferentSizesScan<32, Max>(AggregationType::InclusiveScan);
+    testArgsDifferentSizesScan<32, Max>(AggregationType::ExclusiveScan);
   }
 }
 
@@ -1026,24 +1333,463 @@ HIP_TEMPLATE_TEST_CASE(Unit_Thread_Block_Coalesced_Reduce_arithmetic, int, unsig
              cooperative_groups::greater<TestType>> ops;
 
   if (getWarpSize() == 32) {
-    runReduceRandomForOps<true, TestType, 32>(ops);
+    runAggregationRandomForOps<true, TestType, 32>(AggregationType::Reduce, ops);
   } else {
-    runReduceRandomForOps<true, TestType, 64>(ops);
+    runAggregationRandomForOps<true, TestType, 64>(AggregationType::Reduce, ops);
   }
 }
 
-
 HIP_TEMPLATE_TEST_CASE(Unit_Thread_Block_Coalesced_Reduce_boolean, int, unsigned int, long long, unsigned long long)
 {
   std::tuple<cooperative_groups::bit_and<TestType>,
              cooperative_groups::bit_or<TestType>,
              cooperative_groups::bit_xor<TestType>> ops;
   if (getWarpSize() == 32) {
-    runReduceRandomForOps<true, TestType, 32>(ops);
+    runAggregationRandomForOps<true, TestType, 32>(AggregationType::Reduce, ops);
   } else {
-    runReduceRandomForOps<true, TestType, 64>(ops);
+    runAggregationRandomForOps<true, TestType, 64>(AggregationType::Reduce, ops);
+  }
+}
+
+template <size_t TileSize, AggregationType AggType, class Op, class T>
+void __global__ simpleScan(T* result)
+{
+  T value = threadIdx.x;
+  cg::thread_block mygroup = cg::this_thread_block();
+  auto mytile = cg::tiled_partition<TileSize>(mygroup);
+  Op op;
+
+  if constexpr (AggType == AggregationType::InclusiveScan) {
+    result[threadIdx.x] = cg::inclusive_scan(mytile, value, op);
+  } else if constexpr (AggType == AggregationType::ExclusiveScan) {
+    result[threadIdx.x] = cg::exclusive_scan(mytile, value, op);
+  } else if constexpr (AggType == AggregationType::InclusiveScanDefault) {
+    result[threadIdx.x] = cg::inclusive_scan(mytile, value);
+  } else if constexpr (AggType == AggregationType::ExclusiveScanDefault) {
+    result[threadIdx.x] = cg::exclusive_scan(mytile, value);
+  } else {
+    assert(false && "Unexpected aggType");
+  }
+}
+
+/// @tparam Op either std::plus or std::less
+template <size_t TileSize, AggregationType AggType, class Op, class T>
+void testScanForTileSize()
+{
+  LinearAllocGuard<T> h_result(LinearAllocs::malloc, sizeof(T) * getWarpSize());
+  LinearAllocGuard<T> d_result(LinearAllocs::hipMalloc, h_result.size_bytes());
+  dim3 gridDim = { 1 };
+  dim3 blockDim = { static_cast<unsigned short>(getWarpSize()) };
+  T id = 0;
+  T accum = 0;
+  int pos = 0;
+  LinearAllocGuard<AggregationType> d_aggType(LinearAllocs::hipMalloc, sizeof(AggregationType));
+  std::array<void*, 2> devicePtrs = { d_result.ptr(), d_aggType.ptr() };
+  void* args[devicePtrs.size()];
+  AggregationType aggType = AggType;
+
+  if (!isInclusive(aggType)) {
+    scanIdentity<T, Op>(id);
   }
-} 
+
+  accum = id;
+
+  for (int i = 0; i < devicePtrs.size(); i++) {
+    args[i] = &devicePtrs[i];
+  }
+
+  HIP_CHECK(hipMemcpy(d_aggType.ptr(), &aggType, sizeof(AggType), hipMemcpyHostToDevice));
+  HIP_CHECK(hipLaunchCooperativeKernel(reinterpret_cast<void*>(simpleScan<TileSize, AggType, Op, T>),
+                                       gridDim,
+                                       blockDim,
+                                       args,
+                                       0,
+                                       nullptr));
+  HIP_CHECK(hipDeviceSynchronize());
+  HIP_CHECK(hipGetLastError());
+  HIP_CHECK(hipMemcpy(h_result.host_ptr(), d_result.ptr(),
+                      h_result.size_bytes(), hipMemcpyDeviceToHost));
+
+  while (pos < getWarpSize()) {
+    for (int i = 0; i < TileSize; i++) {
+      UNSCOPED_INFO("Index: " << pos + i << " tile size: " << TileSize);
+
+      if (isInclusive(aggType)) {
+        accum += pos + i;
+        INFO("Inclusive scan");
+        REQUIRE(h_result.host_ptr()[pos + i] == accum);
+      } else {
+        using ComparisonType = typename std::conditional<std::is_same<T, half>::value, float, T>::type;
+        ComparisonType result, expected;
+
+        INFO("Exclusive scan");
+
+        if constexpr (std::is_same<T, half>::value) {
+          // Catch2 cannot print fp16 if there is an error
+          result = __half2float(h_result.host_ptr()[pos + i]);
+          expected = __half2float(__half2float(accum));
+        } else {
+          result = h_result.host_ptr()[pos + i];
+          expected = accum;
+        }
+
+        INFO("Lane: " << (pos + i));
+        REQUIRE(result == expected);
+
+        if (i == 0) {
+          accum = id;
+        }
+
+        if constexpr (std::is_same<Op, cooperative_groups::less<T>>::value) {
+          accum = std::min(__half2float(accum), __half2float(pos + i));
+        } else {
+          accum += pos + i;
+        }
+      }
+    }
+
+    accum = id;
+    pos += TileSize;
+  }
+}
+
+TEST_CASE(Unit_Thread_Block_Tile_Inclusive_Scan_Basic)
+{
+  using Op = cooperative_groups::plus<int>;
+  static constexpr AggregationType AggType = AggregationType::InclusiveScan;
+
+  testScanForTileSize<1, AggType, Op, int>();
+  testScanForTileSize<2, AggType, Op, int>();
+  testScanForTileSize<4, AggType, Op, int>();
+  testScanForTileSize<8, AggType, Op, int>();
+  testScanForTileSize<16, AggType, Op, int>();
+  testScanForTileSize<32, AggType, Op, int>();
+  testScanForTileSize<32, AggregationType::InclusiveScanDefault, Op, int>();
+
+  if (getWarpSize() == 64) {
+    testScanForTileSize<64, AggType, Op, int>();
+  }
+}
+
+TEMPLATE_TEST_CASE(Unit_Thread_Block_Tile_Exclusive_Scan_Basic, int, half)
+{
+  static constexpr AggregationType AggType = AggregationType::ExclusiveScan;
+
+  SECTION("plus") {
+    using Op = cooperative_groups::plus<TestType>;
+    testScanForTileSize<1, AggType, Op, TestType>();
+    testScanForTileSize<2, AggType, Op, TestType>();
+    testScanForTileSize<4, AggType, Op, TestType>();
+    testScanForTileSize<8, AggType, Op, TestType>();
+    testScanForTileSize<16, AggType, Op, TestType>();
+    testScanForTileSize<32, AggType, Op, TestType>();
+    testScanForTileSize<32, AggregationType::ExclusiveScanDefault, Op, int>();
+
+    if (getWarpSize() == 64) {
+      testScanForTileSize<64, AggType, Op, TestType>();
+    }
+  }
+
+  SECTION("less") {
+    using Op = cooperative_groups::less<TestType>;
+    testScanForTileSize<1, AggType, Op, TestType>();
+    testScanForTileSize<2, AggType, Op, TestType>();
+    testScanForTileSize<4, AggType, Op, TestType>();
+    testScanForTileSize<8, AggType, Op, TestType>();
+    testScanForTileSize<16, AggType, Op, TestType>();
+    testScanForTileSize<32, AggType, Op, TestType>();
+
+    if (getWarpSize() == 64) {
+      testScanForTileSize<64, AggType, Op, TestType>();
+    }
+  }
+}
+
+// for all the tile sizes and all input types, using random input values, calculates the scan
+// values. Additionally, randomly make some threads not participate for the coalesced_threads case
+TEMPLATE_TEST_CASE(Unit_Thread_Block_Tile_Scan_Random_arithmetic,  int, unsigned int, long long,
+                   unsigned long long, float, half, double)
+{
+  std::tuple<cooperative_groups::plus<TestType>,
+             cooperative_groups::less<TestType>,
+             cooperative_groups::greater<TestType>> types;
+
+  SECTION("inclusive") {
+    if (getWarpSize() == 32) {
+      runAggregationRandomForOps<false, TestType, 32>(AggregationType::InclusiveScan, types);
+    } else {
+      runAggregationRandomForOps<false, TestType, 64>(AggregationType::InclusiveScan, types);
+    }
+  }
+
+  SECTION("exclusive") {
+    if (getWarpSize() == 32) {
+      runAggregationRandomForOps<false, TestType, 32>(AggregationType::ExclusiveScan, types);
+    } else {
+      runAggregationRandomForOps<false, TestType, 64>(AggregationType::ExclusiveScan, types);
+    }
+  }
+}
+
+TEMPLATE_TEST_CASE(Unit_Thread_Block_Tile_Scan_Random_boolean, int, unsigned int, long long,
+                   unsigned long long)
+{
+  std::tuple<cooperative_groups::bit_and<TestType>,
+             cooperative_groups::bit_or<TestType>,
+             cooperative_groups::bit_xor<TestType>> types;
+
+  SECTION("inclusive") {
+    if (getWarpSize() == 32) {
+      runAggregationRandomForOps<false, TestType, 32>(AggregationType::InclusiveScan, types);
+    } else {
+      runAggregationRandomForOps<false, TestType, 64>(AggregationType::InclusiveScan, types);
+    }
+  }
+
+  SECTION("exclusive") {
+    if (getWarpSize() == 32) {
+      runAggregationRandomForOps<false, TestType, 32>(AggregationType::ExclusiveScan, types);
+    } else {
+      runAggregationRandomForOps<false, TestType, 64>(AggregationType::ExclusiveScan, types);
+    }
+  }
+}
+
+// make sures that tiled blocks that use the y or z dimension work correctly
+TEST_CASE(Unit_Thread_Block_Tile_2D_3D_Blocks)
+{
+  int wavefrontSize = getWarpSize();
+
+  SECTION("2D") {
+    dim3 blockDim {1, 2, static_cast<unsigned int>(wavefrontSize / 2)};
+
+    if (wavefrontSize == 32) {
+      runAggregationRandomForType<false, std::plus<int>, int, 32>(AggregationType::Reduce, blockDim);
+      runAggregationRandomForType<false, std::plus<int>, int, 32>(AggregationType::InclusiveScan, blockDim);
+      runAggregationRandomForType<false, std::plus<int>, int, 32>(AggregationType::ExclusiveScan, blockDim);
+    } else {
+      runAggregationRandomForType<false, std::plus<int>, int, 64>(AggregationType::Reduce, blockDim);
+      runAggregationRandomForType<false, std::plus<int>, int, 64>(AggregationType::InclusiveScan, blockDim);
+      runAggregationRandomForType<false, std::plus<int>, int, 64>(AggregationType::ExclusiveScan, blockDim);
+    }
+  }
+
+  SECTION("3D") {
+    dim3 blockDim {static_cast<unsigned int>(wavefrontSize) / 2, 2, 1};
+
+    if (wavefrontSize == 32) {
+      runAggregationRandomForType<false, std::plus<int>, int, 32>(AggregationType::Reduce, blockDim);
+      runAggregationRandomForType<false, std::plus<int>, int, 32>(AggregationType::InclusiveScan, blockDim);
+      runAggregationRandomForType<false, std::plus<int>, int, 32>(AggregationType::ExclusiveScan, blockDim);
+    } else {
+      runAggregationRandomForType<false, std::plus<int>, int, 64>(AggregationType::Reduce, blockDim);
+      runAggregationRandomForType<false, std::plus<int>, int, 64>(AggregationType::InclusiveScan, blockDim);
+      runAggregationRandomForType<false, std::plus<int>, int, 64>(AggregationType::ExclusiveScan, blockDim);
+    }
+  }
+}
+
+TEMPLATE_TEST_CASE(Unit_Thread_Block_Coalesced_Scan_arithmetic, int, unsigned int, long long,
+                   unsigned long long, float, half, double)
+{
+  std::tuple<cooperative_groups::plus<TestType>,
+             cooperative_groups::less<TestType>,
+             cooperative_groups::greater<TestType>> ops;
+
+  SECTION("inclusive") {
+    if (getWarpSize() == 32) {
+      runAggregationRandomForOps<true, TestType, 32>(AggregationType::InclusiveScan, ops);
+    } else {
+      runAggregationRandomForOps<true, TestType, 64>(AggregationType::InclusiveScan, ops);
+    }
+  }
+
+  SECTION("exclusive") {
+    if (getWarpSize() == 32) {
+      runAggregationRandomForOps<true, TestType, 32>(AggregationType::ExclusiveScan, ops);
+    } else {
+      runAggregationRandomForOps<true, TestType, 64>(AggregationType::ExclusiveScan, ops);
+    }
+  }
+}
+
+TEMPLATE_TEST_CASE(Unit_Thread_Block_Coalesced_Scan_boolean, int, unsigned int, long long,
+                   unsigned long long)
+{
+  std::tuple<cooperative_groups::bit_and<TestType>,
+             cooperative_groups::bit_or<TestType>,
+             cooperative_groups::bit_xor<TestType>> ops;
+
+  SECTION("inclusive") {
+    if (getWarpSize() == 32) {
+      runAggregationRandomForOps<true, TestType, 32>(AggregationType::InclusiveScan, ops);
+    } else {
+      runAggregationRandomForOps<true, TestType, 64>(AggregationType::InclusiveScan, ops);
+    }
+  }
+
+  SECTION("exclusive") {
+    if (getWarpSize() == 32) {
+      runAggregationRandomForOps<true, TestType, 32>(AggregationType::ExclusiveScan, ops);
+    } else {
+      runAggregationRandomForOps<true, TestType, 64>(AggregationType::ExclusiveScan, ops);
+    }
+  }
+}
+
+void __global__ binaryPartitionCoalesced(int* out, int* ranks)
+{
+   if (threadIdx.x >= warpSize / 2) {
+     // this group will contain the upper part of the threads
+     auto coalesced = cg::coalesced_threads();
+
+     // this group is subsequently split in two: on even and odd indexes
+     auto partitioned = cg::binary_partition(coalesced, threadIdx.x % 2);
+
+     ranks[threadIdx.x] = partitioned.thread_rank();
+     out[threadIdx.x] = cg::inclusive_scan(partitioned, threadIdx.x);
+   } else {
+     ranks[threadIdx.x] = -1;
+     out[threadIdx.x] = -1;
+   }
+}
+
+template <unsigned int WarpSize>
+void __global__ binaryPartitionTiled(int* out, int* ranks)
+{
+  cg::thread_block mygroup = cg::this_thread_block();
+  auto tile = cg::tiled_partition<WarpSize / 2>(mygroup);
+
+  if (tile.meta_group_rank() == 1) {
+    auto partitioned = cg::binary_partition(tile, threadIdx.x % 2);
+    ranks[threadIdx.x] = partitioned.thread_rank();
+    out[threadIdx.x] = cg::inclusive_scan(partitioned, threadIdx.x);
+  } else {
+    ranks[threadIdx.x] = -1;
+    out[threadIdx.x] = -1;
+  }
+}
+
+TEST_CASE(Unit_Thread_Block_Scan_partition)
+{
+  int wavefrontSize = getWarpSize();
+  LinearAllocGuard<int> h_result(LinearAllocs::malloc, sizeof(int) * wavefrontSize);
+  LinearAllocGuard<int> d_result(LinearAllocs::hipMalloc, h_result.size_bytes());
+  LinearAllocGuard<int> h_ranks(LinearAllocs::malloc, sizeof(int) * wavefrontSize);
+  LinearAllocGuard<int> d_ranks(LinearAllocs::hipMalloc, h_ranks.size_bytes());
+  dim3 gridDim = { 1 };
+  dim3 blockDim = { static_cast<unsigned short>(getWarpSize()) };
+  void* resultsPtr = d_result.ptr();
+  void* ranksPtr = d_ranks.ptr();
+  void* args[] = { &resultsPtr, &ranksPtr };
+  auto checkResults = [&]() {
+    int accumEven = 0;
+    int accumOdd = 0;
+
+    for (int laneId = 0; laneId < getWarpSize(); laneId++) {
+      if (laneId >= wavefrontSize / 2) {
+        if (laneId % 2 == 0) {
+          accumEven += laneId;
+        } else {
+          accumOdd += laneId;
+        }
+
+        INFO("laneId: " << laneId);
+        REQUIRE(h_ranks.host_ptr()[laneId] == (laneId - wavefrontSize / 2) / 2);
+        REQUIRE(h_result.host_ptr()[laneId] == (laneId % 2 ? accumOdd : accumEven));
+      } else {
+        INFO("laneId: " << laneId);
+        REQUIRE(h_ranks.host_ptr()[laneId] == -1);
+        REQUIRE(h_result.host_ptr()[laneId] == -1);
+      }
+    }
+  };
+
+  // the result of both sections must be the same; in one we use coalesced_threads to sub-divide
+  // into higher-order and lower-order threads, in another we just use cg::tiled_partition to do the
+  // equivalent
+  SECTION("coalesced") {
+    HIP_CHECK(hipLaunchCooperativeKernel(reinterpret_cast<void*>(binaryPartitionCoalesced),
+                                        gridDim,
+                                        blockDim,
+                                        args,
+                                        0,
+                                        nullptr));
+    HIP_CHECK(hipDeviceSynchronize());
+    HIP_CHECK(hipGetLastError());
+    HIP_CHECK(hipMemcpy(h_ranks.host_ptr(), d_ranks.ptr(),
+                        h_ranks.size_bytes(), hipMemcpyDeviceToHost));
+    HIP_CHECK(hipMemcpy(h_result.host_ptr(), d_result.ptr(),
+                        h_result.size_bytes(), hipMemcpyDeviceToHost));
+    checkResults();
+  }
+
+  SECTION("tiled") {
+    void* kernelPtr = reinterpret_cast<void*>(wavefrontSize == 32?
+                                              binaryPartitionTiled<32> : binaryPartitionTiled<64>);
+    HIP_CHECK(hipLaunchCooperativeKernel(kernelPtr,
+                                         gridDim,
+                                         blockDim,
+                                         args,
+                                         0,
+                                         nullptr));
+    HIP_CHECK(hipDeviceSynchronize());
+    HIP_CHECK(hipGetLastError());
+    HIP_CHECK(hipMemcpy(h_ranks.host_ptr(), d_ranks.ptr(),
+                        h_ranks.size_bytes(), hipMemcpyDeviceToHost));
+    HIP_CHECK(hipMemcpy(h_result.host_ptr(), d_result.ptr(),
+                        h_result.size_bytes(), hipMemcpyDeviceToHost));
+    checkResults();
+  }
+}
+
+__global__ void multiDimReduce(int* output)
+{
+  cg::thread_block mygroup = cg::this_thread_block();
+  auto mytile = cg::tiled_partition<16>(mygroup);
+  int tid = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.x * blockDim.y;
+
+  output[tid] = cooperative_groups::reduce(mytile, tid, cooperative_groups::plus<int>());
+}
+
+// test tiles reduces on multidimensional blocks. Basically splits the 128 threads in the block
+// into 16 threads tiles, each one contributing the tid to their reduces
+HIP_TEST_CASE(Unit_Thread_Block_Tile_Multi_Dimensional_Reduce)
+{
+  LinearAllocGuard<int> h_result(LinearAllocs::malloc, sizeof(int) * 128);
+  LinearAllocGuard<int> d_result(LinearAllocs::hipMalloc, h_result.size_bytes());
+  dim3 gridDim = { 1 };
+  // use a block size bigger than the warp size
+  dim3 blockDim = { 16, 4, 2 };
+  void* devicePtr = d_result.ptr();
+  void* args[1] = { &devicePtr };
+  hipError_t status;
+
+  status = hipLaunchCooperativeKernel(multiDimReduce,
+                                      gridDim,
+                                      blockDim,
+                                      args,
+                                      0,
+                                      nullptr);
+  HIP_CHECK(status);
+  HIP_CHECK(hipDeviceSynchronize());
+  HIP_CHECK(hipGetLastError());
+  HIP_CHECK(hipMemcpy(h_result.host_ptr(), d_result.ptr(),
+                      h_result.size_bytes(), hipMemcpyDeviceToHost));
+
+  for (int row = 0; row < 8; row++) {
+    int rowResult = 0;
+
+    for (int index = 0; index < 16; index++) {
+      rowResult += row * 16 + index;
+    }
+
+    for (int index = 0; index < 16; index++) {
+      INFO("row: " << row << " index: " << index);
+      REQUIRE(h_result.host_ptr()[row * 16 + index] == rowResult);
+    }
+  }
+}
 
 /**
  * End doxygen group DeviceLanguageTest.
diff --git a/projects/hip-tests/catch/unit/rtc/rtc_coop.cc b/projects/hip-tests/catch/unit/rtc/rtc_coop.cc
index 364ed259714..5c55a057676 100644
--- a/projects/hip-tests/catch/unit/rtc/rtc_coop.cc
+++ b/projects/hip-tests/catch/unit/rtc/rtc_coop.cc
@@ -34,7 +34,7 @@ void compileProgram(hiprtcProgram& prog, const std::tuple<T, Types...>&) {
   std::string expression;
   std::tuple<Types...> remainingTypes;
 
-  expression = std::string("reduceCoopKernel<") +
+  expression = std::string("aggregationCoopKernel<") +
                functorToString<T, Op>() +
                ", " +
                typeToString<T>() +
@@ -64,7 +64,7 @@ void compileProgram(hiprtcProgram& prog, const std::tuple<>&) {
 }
 
 template <class T, template <typename> class Op>
-void runReduce(hiprtcProgram& prog) {
+void runAggregation(hiprtcProgram& prog, AggregationType aggType) {
   using distribution = typename DistributionType<T>::type;
 
   static constexpr std::array<int, 7> tileSizes = {1, 2, 4, 8, 16, 32, 64};
@@ -76,6 +76,7 @@ void runReduce(hiprtcProgram& prog) {
   LinearAllocGuard<T> input(LinearAllocs::malloc, d_input.size_bytes());
   LinearAllocGuard<T> d_output(LinearAllocs::hipMalloc, wavefrontSize * sizeof(T) * tileSizes.size());
   LinearAllocGuard<T> output(LinearAllocs::malloc, d_output.size_bytes());
+  LinearAllocGuard<AggregationType> d_aggType(LinearAllocs::hipMalloc, sizeof(AggregationType));
   std::mt19937_64 gen(Catch::rngSeed());
   // for float16, we generate any random unsigned short, but cap the exponent later on
   // to keep it in the range (-8.0..8.0) (just to avoid overflows)
@@ -84,15 +85,16 @@ void runReduce(hiprtcProgram& prog) {
   T b = std::is_same<T, half>::value? std::numeric_limits<unsigned short>::max() : 1023;
   distribution dist(a, b);
 
+  HIP_CHECK(hipMemcpy(d_aggType.ptr(), &aggType, sizeof(aggType), hipMemcpyHostToDevice));
   genRandomBuffers(d_input, input, dist, gen, wavefrontSize);
-  std::vector<const void*> args = { d_output.ptr(), d_input.ptr() };
+  std::vector<const void*> args = { d_output.ptr(), d_input.ptr(), d_aggType.ptr() };
   std::size_t sizeBytes = args.size() * sizeof(void*);
   void* config[] = {HIP_LAUNCH_PARAM_BUFFER_POINTER, args.data(), HIP_LAUNCH_PARAM_BUFFER_SIZE, &sizeBytes,
                     HIP_LAUNCH_PARAM_END};
   std::vector<char> code;
   size_t codeSize;
   std::string expression =
-      std::string("reduceCoopKernel<") +
+      std::string("aggregationCoopKernel<") +
       functorToString<T, Op>() +
       ", " +
       typeToString<T>() +
@@ -116,21 +118,13 @@ void runReduce(hiprtcProgram& prog) {
 
   INFO("Type: " << typeToString<T>());
   for (auto tileSize : tileSizes) {
-    UNSCOPED_INFO("Tile size: " << tileSize);
-
     for (unsigned int laneId = 0; laneId < wavefrontSize; laneId++) {
-      unsigned long long mask = ~0ull >> (64 - tileSize);
-
-      mask <<= (((laneId % wavefrontSize) / tileSize) * tileSize);
-
       if (tileSize <= wavefrontSize) {
         std::string inputStr;
 
         if constexpr (!std::is_same<T, half>::value) {
           inputStr = std::string(" input: ") + std::to_string(input.host_ptr()[laneId]);
         }
-
-        UNSCOPED_INFO("laneId: " << laneId << " mask: " << mask << inputStr);
       }
     }
 
@@ -139,11 +133,24 @@ void runReduce(hiprtcProgram& prog) {
 
       mask <<= ((laneId % wavefrontSize) / tileSize) * tileSize;
 
-      if (tileSize <= wavefrontSize) {
-        T expected;
-        Op<T> op;
-        expected = calculateExpected(input.host_ptr(), op, mask);
-        REQUIRE(output.host_ptr()[numTile * wavefrontSize + laneId] == expected);
+      if ((1ull << laneId) & mask) {
+        if (tileSize <= wavefrontSize) {
+          T expectedByLane[64];
+          Op<T> op;
+          T result = output.host_ptr()[numTile * wavefrontSize + laneId];
+
+          calculateExpected(expectedByLane, input.host_ptr(), op, mask, aggType);
+
+          if constexpr (std::is_integral<T>::value) {
+            REQUIRE(result == expectedByLane[laneId]);
+          } else {
+            compareFloatingPoint<Op<T>>(result,
+                                        expectedByLane[laneId],
+                                        mask,
+                                        input.host_ptr(),
+                                        laneId);
+          }
+        }
       }
     }
 
@@ -152,61 +159,75 @@ void runReduce(hiprtcProgram& prog) {
 }
 
 template <template <typename> class Op, class Type = void>
-void runCoopTestReduceForTypes(hiprtcProgram&, const std::tuple<>) {}
+void runCoopAggregationForTypes(hiprtcProgram&, AggregationType, const std::tuple<>) {}
 
 template <template <typename> class Op, class T, typename... Types>
-void runCoopTestReduceForTypes(hiprtcProgram& prog, const std::tuple<T, Types...>) {
+void runCoopAggregationForTypes(hiprtcProgram& prog, AggregationType aggType, const std::tuple<T, Types...>) {
   std::tuple<Types...> remainingTypes;
 
-  runReduce<T, Op>(prog);
-  runCoopTestReduceForTypes<Op>(prog, remainingTypes);
+  runAggregation<T, Op>(prog, aggType);
+  runCoopAggregationForTypes<Op>(prog, aggType, remainingTypes);
 }
 
 template <template <typename> class Op, typename... Types>
-void runAndCompileTest(const std::tuple<Types...> types) {
+void runAndCompileTest(AggregationType aggType, const std::tuple<Types...> types) {
   std::string kernelStr;
   hiprtcProgram prog;
 
   kernelStr = R"(
     namespace cg = cooperative_groups;
 
+    enum class AggregationType { Reduce, InclusiveScan, ExclusiveScan};
+
     template <template <typename> class Op, class T>
-    __device__ void reduceTiles(T*, const T*, const __hip_internal::index_sequence<>) 
+    __device__ void aggregateTiles(T*,
+                                   const T*,
+                                   AggregationType* aggType,
+                                   const __hip_internal::index_sequence<>)
     {
     }
 
-    // run reduce for a specific type and for different tile sizes as a variadic template parameter
+    // run aggregation for a specific type and for different tile sizes as a variadic template parameter
     // @output the result, per lane
     template <template <typename> class Op, class T, size_t TileSize, size_t... TileSizes>
-    __device__ void reduceTiles(T* output, const T* input, const __hip_internal::index_sequence<TileSize, TileSizes...>) 
+    __device__ void aggregateTiles(T* output,
+                                   const T* input,
+                                   AggregationType* aggType,
+                                   const __hip_internal::index_sequence<TileSize, TileSizes...>)
     {
       const __hip_internal::index_sequence<TileSizes...> remainingTiles;
       cg::thread_block group = cg::this_thread_block();
       auto tile = cg::tiled_partition<TileSize>(group);
       Op<T> op;
 
-      output[threadIdx.x] = cg::reduce(tile, input[threadIdx.x], op);
-      reduceTiles<Op, T>(output + warpSize, input, remainingTiles);
+      if (*aggType == AggregationType::Reduce) {
+        output[threadIdx.x] = cg::reduce(tile, input[threadIdx.x], op);
+      } else if (*aggType == AggregationType::InclusiveScan) {
+        output[threadIdx.x] = cg::inclusive_scan(tile, input[threadIdx.x], op);
+      } else if (*aggType == AggregationType::ExclusiveScan) {
+        output[threadIdx.x] = cg::exclusive_scan(tile, input[threadIdx.x], op);
+      }
+      aggregateTiles<Op, T>(output + warpSize, input, aggType, remainingTiles);
     }
 
     // @output will receive a different result per tile size
     template <template <typename> class Op, class T, int WarpSize>
-    __global__ void reduceCoopKernel(T* output, const T* input)
+    __global__ void aggregationCoopKernel(T* output, const T* input, AggregationType* aggType)
     {
       if constexpr (WarpSize <= 32) {
         __hip_internal::index_sequence<1, 2, 4, 8, 16, 32> tileSizes;
-        reduceTiles<Op, T>(output, input, tileSizes);
+        aggregateTiles<Op, T>(output, input, aggType, tileSizes);
       } else {
         __hip_internal::index_sequence<1, 2, 4, 8, 16, 32, 64> tileSizes;
-        reduceTiles<Op, T>(output, input, tileSizes);
+        aggregateTiles<Op, T>(output, input, aggType, tileSizes);
       }
     }
   )";
 
   HIPRTC_CHECK(
-      hiprtcCreateProgram(&prog, kernelStr.c_str(), "coop_reduce.hip", 0, nullptr, nullptr));
+      hiprtcCreateProgram(&prog, kernelStr.c_str(), "coop_aggregation.hip", 0, nullptr, nullptr));
   compileProgram<Op>(prog, types);
-  runCoopTestReduceForTypes<Op>(prog, types);
+  runCoopAggregationForTypes<Op>(prog, aggType, types);
   HIPRTC_CHECK(hiprtcDestroyProgram(&prog));
 }
 
@@ -216,26 +237,62 @@ HIP_TEST_CASE(Unit_Rtc_CoopReduce)
   const std::tuple<int, unsigned int, long long, unsigned long long> integralTypes;
 
   SECTION("add") {
-    runAndCompileTest<cooperative_groups::plus>(allTypes);
+    runAndCompileTest<cooperative_groups::plus>(AggregationType::Reduce, allTypes);
+  }
+
+  SECTION("less") {
+    runAndCompileTest<cooperative_groups::less>(AggregationType::Reduce, allTypes);
+  }
+
+  SECTION("greater") {
+    runAndCompileTest<cooperative_groups::greater>(AggregationType::Reduce, allTypes);
+  }
+
+  SECTION("and") {
+    runAndCompileTest<cooperative_groups::bit_and>(AggregationType::Reduce, integralTypes);
+  }
+
+  SECTION("or") {
+    runAndCompileTest<cooperative_groups::bit_or>(AggregationType::Reduce, integralTypes);
+  }
+
+  SECTION("xor") {
+    runAndCompileTest<cooperative_groups::bit_xor>(AggregationType::Reduce, integralTypes);
+  }
+}
+
+TEST_CASE("Unit_Rtc_CoopScan")
+{
+  const std::tuple<int, unsigned int, long long, unsigned long long, float, half, double> allTypes;
+  const std::tuple<int, unsigned int, long long, unsigned long long> integralTypes;
+
+  SECTION("add") {
+    runAndCompileTest<cooperative_groups::plus>(AggregationType::InclusiveScan, allTypes);
+    runAndCompileTest<cooperative_groups::plus>(AggregationType::ExclusiveScan, allTypes);
   }
 
   SECTION("less") {
-    runAndCompileTest<cooperative_groups::less>(allTypes);
+    runAndCompileTest<cooperative_groups::less>(AggregationType::InclusiveScan, allTypes);
+    runAndCompileTest<cooperative_groups::less>(AggregationType::ExclusiveScan, allTypes);
   }
 
   SECTION("greater") {
-    runAndCompileTest<cooperative_groups::greater>(allTypes);
+    runAndCompileTest<cooperative_groups::greater>(AggregationType::InclusiveScan, allTypes);
+    runAndCompileTest<cooperative_groups::greater>(AggregationType::ExclusiveScan, allTypes);
   }
 
   SECTION("and") {
-    runAndCompileTest<cooperative_groups::bit_and>(integralTypes);
+    runAndCompileTest<cooperative_groups::bit_and>(AggregationType::InclusiveScan, integralTypes);
+    runAndCompileTest<cooperative_groups::bit_and>(AggregationType::ExclusiveScan, integralTypes);
   }
 
   SECTION("or") {
-    runAndCompileTest<cooperative_groups::bit_or>(integralTypes);
+    runAndCompileTest<cooperative_groups::bit_or>(AggregationType::InclusiveScan, integralTypes);
+    runAndCompileTest<cooperative_groups::bit_or>(AggregationType::ExclusiveScan, integralTypes);
   }
 
   SECTION("xor") {
-    runAndCompileTest<cooperative_groups::bit_xor>(integralTypes);
+    runAndCompileTest<cooperative_groups::bit_xor>(AggregationType::InclusiveScan, integralTypes);
+    runAndCompileTest<cooperative_groups::bit_xor>(AggregationType::ExclusiveScan, integralTypes);
   }
 }
diff --git a/projects/hip-tests/catch/unit/rtc/rtc_reduce.cc b/projects/hip-tests/catch/unit/rtc/rtc_reduce.cc
index cd0c0245f88..ff6f99db176 100644
--- a/projects/hip-tests/catch/unit/rtc/rtc_reduce.cc
+++ b/projects/hip-tests/catch/unit/rtc/rtc_reduce.cc
@@ -86,7 +86,7 @@ void runTestReduceForTypes(hiprtcProgram& prog, const std::tuple<T, Types...>) {
 }
 
 template <class T, template <typename> class Op>
-void opToString(std::string& scalarName, std::string& intrinsicName) {
+void reduceOpToString(std::string& scalarName, std::string& intrinsicName) {
   if constexpr (std::is_same<Op<T>, std::plus<T>>::value) {
     scalarName = "std::plus";
     intrinsicName = "__reduce_add_sync";
@@ -105,9 +105,6 @@ void opToString(std::string& scalarName, std::string& intrinsicName) {
   } else if constexpr (std::is_same<Op<T>, XorOp<T>>::value) {
     scalarName = "std::bit_xor";
     intrinsicName = "__reduce_xor_sync";
-  } else if constexpr (std::is_same<Op<T>, XorOp<T>>::value) {
-    scalarName = "std::bit_xor";
-    intrinsicName = "__reduce_xor_sync";
   } else
     static_assert(std::is_void<T>::value, "Unexpected operator");
 }
@@ -119,7 +116,7 @@ void compileProgram(hiprtcProgram& prog, const std::tuple<>&) {
   hiprtcResult compileResult;
   const char* options[] = {"-DHIP_ENABLE_WARP_SYNC_BUILTINS", "-DHIP_ENABLE_EXTRA_WARP_SYNC_TYPES"};
 
-  opToString<int, Op>(scalarName, intrinsicName);
+  reduceOpToString<int, Op>(scalarName, intrinsicName);
   compileResult = hiprtcResult{hiprtcCompileProgram(prog, NELEMS(options), options)};
   HIPRTC_CHECK(hiprtcGetProgramLogSize(prog, &logSize));
 
@@ -139,7 +136,7 @@ void runAndCompileTest(const std::tuple<Types...> types) {
   std::string scalarName, intrinsicName, kernelStr;
   hiprtcProgram prog;
 
-  opToString<int, Op>(scalarName, intrinsicName);
+  reduceOpToString<int, Op>(scalarName, intrinsicName);
   kernelStr = R"(
     template <class T, class MaskType>
     __global__ void reduceRtcKernel(T* output, const T* input, const MaskType* masks, int* numReduces)
diff --git a/projects/hip-tests/catch/unit/warp/warp_reduce.cc b/projects/hip-tests/catch/unit/warp/warp_reduce.cc
index 1309ecaa51d..55a7f015b9b 100644
--- a/projects/hip-tests/catch/unit/warp/warp_reduce.cc
+++ b/projects/hip-tests/catch/unit/warp/warp_reduce.cc
@@ -90,6 +90,7 @@ template <class T> void runTestMultipleMasks(unsigned long long masks[], int num
   distribution distInput(a, b);
   dim3 blkDim{wavefrontSize};
   dim3 grdDim{1u};
+  T expectedByLane[64];
 
   HIP_CHECK(hipMemcpy(d_masks.ptr(), &masks[0], d_masks.size_bytes(), hipMemcpyHostToDevice));
   genRandomBuffers(d_input, input, distInput, gen, wavefrontSize);
@@ -99,7 +100,7 @@ template <class T> void runTestMultipleMasks(unsigned long long masks[], int num
 
   for (int numMask = 0; numMask < numMasks; numMask++) {
     unsigned long long mask = masks[numMask];
-    T expected = calculateExpected<T>(input.ptr(), op, mask);
+    T expected = calculateExpected<T>(expectedByLane, input.ptr(), op, mask, AggregationType::Reduce);
     int lane = 0;
 
     while (lane < wavefrontSize) {
@@ -109,11 +110,12 @@ template <class T> void runTestMultipleMasks(unsigned long long masks[], int num
         if constexpr (std::is_integral<T>::value) {
           // for integral types the result should match exactly
           if (result != expected) {
-            printMismatch(result, expected, input.ptr(), mask);
+            printMismatch(result, expected, input.ptr(), mask, lane);
             REQUIRE(result == expected);
           }
-        } else
-          compareFloatingPoint(result, expected, mask, input.ptr());
+        } else {
+          compareFloatingPoint<std::plus<T>>(result, expected, mask, input.ptr(), lane);
+        }
       }
 
       lane++;
diff --git a/projects/hip/include/hip/cooperative_groups/hip_scan.h b/projects/hip/include/hip/cooperative_groups/hip_scan.h
new file mode 100644
index 00000000000..70f9118dc4c
--- /dev/null
+++ b/projects/hip/include/hip/cooperative_groups/hip_scan.h
@@ -0,0 +1,30 @@
+/*
+ * Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
+ *
+ * SPDX-License-Identifier: MIT
+ */
+
+/**
+ *  @file  hip_scan.h
+ *
+ *  @brief Defines cooperative_groups scan device API
+ */
+
+#ifndef HIP_INCLUDE_HIP_HIP_COOPERATIVE_GROUP_SCAN_H
+#define HIP_INCLUDE_HIP_HIP_COOPERATIVE_GROUP_SCAN_H
+
+#include <hip/hip_version.h>
+#include <hip/hip_common.h>
+
+#if defined(__HIP_PLATFORM_AMD__) && !defined(__HIP_PLATFORM_NVIDIA__)
+#if __cplusplus && defined(__clang__) && defined(__HIP__)
+#include <hip/amd_detail/amd_hip_cooperative_groups_scan.h>
+#endif
+
+#elif !defined(__HIP_PLATFORM_AMD__) && defined(__HIP_PLATFORM_NVIDIA__)
+#include <hip/nvidia_detail/nvidia_hip_cooperative_groups_scan.h>
+#else
+#error ("Must define exactly one of __HIP_PLATFORM_AMD__ or __HIP_PLATFORM_NVIDIA__");
+#endif
+
+#endif // HIP_INCLUDE_HIP_HIP_COOPERATIVE_GROUP_SCAN_H
diff --git a/projects/hipother/hipnv/include/hip/nvidia_detail/nvidia_hip_cooperative_groups_scan.h b/projects/hipother/hipnv/include/hip/nvidia_detail/nvidia_hip_cooperative_groups_scan.h
new file mode 100644
index 00000000000..cae4495b44b
--- /dev/null
+++ b/projects/hipother/hipnv/include/hip/nvidia_detail/nvidia_hip_cooperative_groups_scan.h
@@ -0,0 +1,17 @@
+/*
+ * Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
+ *
+ * SPDX-License-Identifier: MIT
+ */
+
+#ifndef HIP_INCLUDE_HIP_NVIDIA_DETAIL_HIP_COOPERATIVE_GROUPS_SCAN_H
+#define HIP_INCLUDE_HIP_NVIDIA_DETAIL_HIP_COOPERATIVE_GROUPS_SCAN_H
+
+#include <cooperative_groups/scan.h>
+#include <cooperative_groups.h>
+#include <cuda_runtime.h>
+#include <hip/hip_runtime.h>
+#include <hip/hip_runtime_api.h>
+
+#endif // HIP_INCLUDE_HIP_NVIDIA_DETAIL_HIP_COOPERATIVE_GROUPS_SCAN_H
+