diff --git a/.github/workflows/build-cachelib-centos-long.yml b/.github/workflows/build-cachelib-centos-long.yml
new file mode 100644
index 0000000000..92165f603b
--- /dev/null
+++ b/.github/workflows/build-cachelib-centos-long.yml
@@ -0,0 +1,39 @@
+name: build-cachelib-centos-latest
+on:
+  schedule:
+    - cron: '0 7 * * *'
+
+jobs:
+  build-cachelib-centos8-latest:
+    name: "CentOS/latest - Build CacheLib with all dependencies"
+    runs-on: ubuntu-latest
+    # Docker container image name
+    container: "centos:latest"
+    steps:
+      - name: "update packages"
+        run: dnf upgrade -y
+      - name: "install sudo,git"
+        run: dnf install -y sudo git cmake gcc
+      - name: "System Information"
+        run: |
+          echo === uname ===
+          uname -a
+          echo === /etc/os-release ===
+          cat /etc/os-release
+          echo === df -hl ===
+          df -hl
+          echo === free -h ===
+          free -h
+          echo === top ===
+          top -b -n1 -1 -Eg || timeout 1 top -b -n1
+          echo === env ===
+          env
+          echo === gcc -v ===
+          gcc -v
+      - name: "checkout sources"
+        uses: actions/checkout@v2
+      - name: "build CacheLib using build script"
+        run: ./contrib/build.sh -j -v -T
+      - name: "run tests"
+        timeout-minutes: 60
+        run: cd opt/cachelib/tests && ../../../run_tests.sh long
diff --git a/.github/workflows/build-cachelib-debian.yml b/.github/workflows/build-cachelib-debian.yml
new file mode 100644
index 0000000000..5bc3ad3c70
--- /dev/null
+++ b/.github/workflows/build-cachelib-debian.yml
@@ -0,0 +1,43 @@
+name: build-cachelib-debian-10
+on:
+  schedule:
+    - cron: '30 5 * * 0,3'
+
+jobs:
+  build-cachelib-debian-10:
+    name: "Debian/Buster - Build CacheLib with all dependencies"
+    runs-on: ubuntu-latest
+    # Docker container image name
+    container: "debian:buster-slim"
+    steps:
+      - name: "update packages"
+        run: apt-get update
+      - name: "upgrade packages"
+        run: apt-get -y upgrade
+      - name: "install sudo,git"
+        run: apt-get install -y sudo git procps
+      - name: "System Information"
+        run: |
+          echo === uname ===
+          uname -a
+          echo === /etc/os-release ===
+          cat /etc/os-release
+          echo === df -hl ===
+          df -hl
+          echo === free -h ===
+          free -h
+          echo === top ===
+          top -b -n1 -1 -Eg || timeout 1 top -b -n1 ; true
+          echo === env ===
+          env
+          echo === cc -v ===
+          cc -v || true
+          echo === g++ -v ===
+          g++ -v || true
+      - name: "checkout sources"
+        uses: actions/checkout@v2
+      - name: "build CacheLib using build script"
+        run: ./contrib/build.sh -j -v -T
+      - name: "run tests"
+        timeout-minutes: 60
+        run: cd opt/cachelib/tests && ../../../run_tests.sh
diff --git a/.github/workflows/build-cachelib-docker.yml b/.github/workflows/build-cachelib-docker.yml
new file mode 100644
index 0000000000..be28bc233c
--- /dev/null
+++ b/.github/workflows/build-cachelib-docker.yml
@@ -0,0 +1,49 @@
+name: build-cachelib-docker
+on:
+  push:
+  pull_request:
+
+jobs:
+  build-cachelib-docker:
+    name: "CentOS/latest - Build CacheLib with all dependencies"
+    runs-on: ubuntu-latest
+    env:
+      REPO: cachelib
+      GITHUB_REPO: intel/CacheLib
+      CONTAINER_REG: ghcr.io/pmem/cachelib
+      CONTAINER_REG_USER: ${{ secrets.GH_CR_USER }}
+      CONTAINER_REG_PASS: ${{ secrets.GH_CR_PAT }}
+      FORCE_IMAGE_ACTION: ${{ secrets.FORCE_IMAGE_ACTION }}
+      HOST_WORKDIR: ${{ github.workspace }}
+      WORKDIR: docker
+      IMG_VER: devel
+    strategy:
+      matrix:
+        CONFIG: ["OS=centos OS_VER=8streams PUSH_IMAGE=1"]
+    steps:
+      - name: "System Information"
+        run: |
+          echo === uname ===
+          uname -a
+          echo === /etc/os-release ===
+          cat /etc/os-release
+          echo === df -hl ===
+          df -hl
+          echo === free -h ===
+          free -h
+          echo === top ===
+          top -b -n1 -1 -Eg || timeout 1 top -b -n1
+          echo === env ===
+          env
+          echo === gcc -v ===
+          gcc -v
+      - name: "checkout sources"
+        uses: actions/checkout@v2
+        with:
+          fetch-depth: 0
+
+      - name: Pull the image or rebuild and push it
+        run: cd $WORKDIR && ${{ matrix.CONFIG }} ./pull-or-rebuild-image.sh $FORCE_IMAGE_ACTION
+
+      - name: Run the build
+        run: cd $WORKDIR && ${{ matrix.CONFIG }} ./build.sh
diff --git a/.github/workflows/clang-format-check.yml b/.github/workflows/clang-format-check.yml
index 4b4897b610..90c8d739c6 100644
--- a/.github/workflows/clang-format-check.yml
+++ b/.github/workflows/clang-format-check.yml
@@ -1,6 +1,6 @@
 # From: https://github.com/marketplace/actions/clang-format-check#multiple-paths
 name: clang-format Check
-on: [pull_request]
+on: []
 jobs:
   formatting-check:
     name: Formatting Check
diff --git a/MultiTierDataMovement.md b/MultiTierDataMovement.md
new file mode 100644
index 0000000000..6976c9ddc9
--- /dev/null
+++ b/MultiTierDataMovement.md
@@ -0,0 +1,113 @@
+# Background Data Movement
+
+To reduce the number of online evictions and to support asynchronous
+promotion, we have added two periodic workers that handle eviction and
+promotion in the background.
+
+The diagram below shows a simplified version of how the background evictor
+thread (green) is integrated into the CacheLib architecture.
+
+*(Figure: the BackgroundEvictor worker integrated into the CacheLib architecture.)*
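+Both workers are started from `CacheAllocator::initWorkers()` using the
+`backgroundEvictor*` and `backgroundPromoter*` config fields (see the
+`CacheAllocator-inl.h` changes below). As a minimal sketch - the setter names
+here are illustrative assumptions, not the exact API - enabling both workers
+could look like:
+
+```cpp
+// Illustrative only: setter names are assumptions; check the config API.
+LruAllocator::Config config;
+config.enableBackgroundEvictor(
+    std::make_shared<FreeThresholdStrategy>(), // default eviction strategy
+    std::chrono::milliseconds{10},             // backgroundEvictorIntervalMilSec
+    1);                                        // evictorThreads
+config.enableBackgroundPromoter(
+    std::make_shared<FreeThresholdStrategy>(), // promotion strategy (placeholder)
+    std::chrono::milliseconds{10},             // backgroundPromoterIntervalMilSec
+    1);                                        // promoterThreads
+```
+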
+
+## Background Evictors
+
+The background evictors scan each class to see if there are objects to move to the
+next (lower) tier using a given strategy. Here we document the general parameters
+and the parameters for the different strategies.
+
+- `backgroundEvictorIntervalMilSec`: The interval that this thread runs for - by default
+the background evictor threads will wake up every 10 ms to scan the AllocationClasses. Also,
+the background evictor thread will be woken up every time there is a failed allocation (from
+a request handling thread) and the current percentage of free memory for the
+AllocationClass is lower than `lowEvictionAcWatermark`. This may make the interval parameter
+less important when many allocations are occurring from request handling threads.
+
+- `evictorThreads`: The number of background evictors to run - each thread is assigned
+a set of AllocationClasses to scan and evict objects from. Currently, each thread gets
+an equal number of classes to scan - but as the object size distribution may be unequal - future
+versions will attempt to balance the classes among threads. The range is 1 to the number of AllocationClasses.
+The default is 1.
+
+- `maxEvictionBatch`: The number of objects to remove in a given eviction call. The
+default is 40. The lower bound is 10 and the upper bound is 1000. Too low and we might not
+remove objects at a reasonable rate; too high and it might increase contention with user threads.
+
+- `minEvictionBatch`: Minimum number of items to evict at any time (if there are any
+candidates).
+
+- `maxEvictionPromotionHotness`: Maximum candidates to consider for eviction. This is similar to `maxEvictionBatch`
+but it specifies how many candidates will be taken into consideration, not the actual number of items to evict.
+This option can be used to configure the duration of the critical section on the LRU lock.
+
+
+### FreeThresholdStrategy (default)
+
+- `lowEvictionAcWatermark`: Triggers the background eviction thread to run
+when this percentage of the AllocationClass is free.
+The default is `2.0`; to avoid wasting capacity we don't set this above `10.0`.
+
+- `highEvictionAcWatermark`: Stop the evictions from an AllocationClass when this
+percentage of the AllocationClass is free. The default is `5.0`; to avoid wasting capacity we
+don't set this above `10.0`.
+
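+To make the two watermarks concrete: the free percentage of an AllocationClass
+is derived from its usage fraction, as in `shouldWakeupBgEvictor()` in the
+`CacheAllocator-inl.h` changes below. A small sketch of the gating logic,
+with the watermarks shown as free variables (illustrative, not the exact code):
+
+```cpp
+// Free percentage of an allocation class, as used by the watermark checks.
+double freePct = 100.0 * (1.0 - acStats.usageFraction());
+
+// The evictor wakes when free space drops to the low watermark and keeps
+// evicting until the high watermark is reached.
+bool wakeEvictor  = freePct <= lowEvictionAcWatermark;  // default 2.0
+bool stopEvicting = freePct >= highEvictionAcWatermark; // default 5.0
+```
+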
+
+## Background Promoters
+
+The background promoters scan each class to see if there are objects to move to the
+upper (faster) tier using a given strategy. Here we document the general parameters
+and the parameters for the different strategies.
+
+- `backgroundPromoterIntervalMilSec`: The interval that this thread runs for - by default
+the background promoter threads will wake up every 10 ms to scan the AllocationClasses for
+objects to promote.
+
+- `promoterThreads`: The number of background promoters to run - each thread is assigned
+a set of AllocationClasses to scan and promote objects from. Currently, each thread gets
+an equal number of classes to scan - but as the object size distribution may be unequal - future
+versions will attempt to balance the classes among threads. The range is `1` to the number of AllocationClasses. The default is `1`.
+
+- `maxPromotionBatch`: The number of objects to promote in a given promotion call. The
+default is 40. The lower bound is 10 and the upper bound is 1000. Too low and we might not
+promote objects at a reasonable rate; too high and it might increase contention with user threads.
+
+- `minPromotionBatch`: Minimum number of items to promote at any time (if there are any
+candidates).
+
+- `numDuplicateElements`: This allows us to promote items that have existing (read-only)
+handles, since we won't need to modify the data when a user is done with the data.
+Therefore, for a short time the data could reside in both tiers until it is evicted from
+its current tier. The default is to not allow this (0). Setting the value to 100 will
+enable duplicate elements in tiers.
+
+### Background Promotion Strategy (only one currently)
+
+- `promotionAcWatermark`: Promote items if at least this
+percentage of the AllocationClass is free. The promotion thread will attempt to move `maxPromotionBatch` number of objects
+to that tier. The objects are chosen from the head of the LRU. The default is `4.0`.
+This value should correlate with `lowEvictionAcWatermark`, `highEvictionAcWatermark`, `minAcAllocationWatermark`, `maxAcAllocationWatermark`.
+- `maxPromotionBatch`: The number of objects to promote in a batch during background promotion. Analogous to
+`maxEvictionBatch`. Its value should be lower to decrease contention on hot items.
+
+## Allocation policies
+
+- `maxAcAllocationWatermark`: The item is always allocated in the topmost tier if at least this
+percentage of the AllocationClass is free.
+- `minAcAllocationWatermark`: The item is always allocated in the bottom tier if no more than this
+percentage of the AllocationClass is free. If the percentage of free memory in the AllocationClass
+is between `minAcAllocationWatermark` and `maxAcAllocationWatermark`, extra checks (described below)
+are performed to decide where to put the element.
+
+By default, allocation will always be performed from the upper tier.
+
+- `acTopTierEvictionWatermark`: If there is less than this percentage of free memory in the topmost tier, cachelib will attempt to evict from the top tier. This option takes precedence over the allocation watermarks.
+
+### Extra policies (used only when the percentage of free memory is between `minAcAllocationWatermark` and `maxAcAllocationWatermark`)
+- `sizeThresholdPolicy`: If the item is smaller than this value, always allocate it in the upper tier.
+- `defaultTierChancePercentage`: Chance (0-100%) of allocating the item in the top tier.
+
+## MMContainer options
+
+- `lruInsertionPointSpec`: Can be set per tier when LRU2Q is used. Determines where new items are
+inserted. 0 = insert to hot queue, 1 = insert to warm queue, 2 = insert to cold queue.
+- `markUsefulChance`: Per tier; determines the chance of moving an item to the head of the LRU on access.
+
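+Putting the allocation policies together: the tier decision implemented by
+`getTargetTierForItem()` in the `CacheAllocator-inl.h` changes below roughly
+follows this shape. This is a simplified sketch of that logic (the `Config`
+holder and `pickTier` name are illustrative), not a drop-in copy; it assumes
+`<folly/Random.h>`:
+
+```cpp
+// Simplified two-tier decision (tier 0 is the topmost tier).
+TierId pickTier(double freePct, uint32_t requiredSize, const Config& cfg) {
+  if (freePct >= cfg.maxAcAllocationWatermark) return 0; // enough room on top
+  if (freePct <= cfg.minAcAllocationWatermark) return 1; // top tier nearly full
+  if (cfg.sizeThresholdPolicy)                           // small items stay on top
+    return requiredSize < cfg.sizeThresholdPolicy ? 0 : 1;
+  // Otherwise allocate on top with probability defaultTierChancePercentage.
+  return (folly::Random::rand32() % 100) < cfg.defaultTierChancePercentage ? 0 : 1;
+}
+```
+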
diff --git a/cachelib/CMakeLists.txt b/cachelib/CMakeLists.txt
index 36df0dc19f..e77c25085c 100644
--- a/cachelib/CMakeLists.txt
+++ b/cachelib/CMakeLists.txt
@@ -85,6 +85,11 @@ set(CMAKE_MODULE_PATH
 set(CMAKE_CXX_STANDARD 17)
 set(CMAKE_CXX_STANDARD_REQUIRED True)
 
+if(COVERAGE_ENABLED)
+  # Add code coverage
+  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} --coverage -fprofile-arcs -ftest-coverage")
+endif()
+
 # include(fb_cxx_flags)
 message(STATUS "Update CXXFLAGS: ${CMAKE_CXX_FLAGS}")
 
diff --git a/cachelib/allocator/BackgroundMover-inl.h b/cachelib/allocator/BackgroundMover-inl.h
new file mode 100644
index 0000000000..b77436635f
--- /dev/null
+++ b/cachelib/allocator/BackgroundMover-inl.h
@@ -0,0 +1,112 @@
+/*
+ * Copyright (c) Intel and its affiliates.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+namespace facebook {
+namespace cachelib {
+
+template <typename CacheT>
+BackgroundMover<CacheT>::BackgroundMover(
+    Cache& cache,
+    std::shared_ptr<BackgroundMoverStrategy> strategy,
+    MoverDir direction)
+    : cache_(cache), strategy_(strategy), direction_(direction) {
+  if (direction_ == MoverDir::Evict) {
+    moverFunc = BackgroundMoverAPIWrapper<CacheT>::traverseAndEvictItems;
+
+  } else if (direction_ == MoverDir::Promote) {
+    moverFunc = BackgroundMoverAPIWrapper<CacheT>::traverseAndPromoteItems;
+  }
+}
+
+template <typename CacheT>
+BackgroundMover<CacheT>::~BackgroundMover() {
+  stop(std::chrono::seconds(0));
+}
+
+template <typename CacheT>
+void BackgroundMover<CacheT>::work() {
+  try {
+    checkAndRun();
+  } catch (const std::exception& ex) {
+    XLOGF(ERR, "BackgroundMover interrupted due to exception: {}", ex.what());
+  }
+}
+
+template <typename CacheT>
+void BackgroundMover<CacheT>::setAssignedMemory(
+    std::vector<MemoryDescriptorType>&& assignedMemory) {
+  XLOG(INFO, "Class assigned to background worker:");
+  for (auto [tid, pid, cid] : assignedMemory) {
+    XLOGF(INFO, "Tid: {}, Pid: {}, Cid: {}", tid, pid, cid);
+  }
+
+  mutex.lock_combine([this, &assignedMemory] {
+    this->assignedMemory_ = std::move(assignedMemory);
+  });
+}
+
+// Look for classes that exceed the target memory capacity
+// and return those for eviction
+template <typename CacheT>
+void BackgroundMover<CacheT>::checkAndRun() {
+  auto assignedMemory = mutex.lock_combine([this] { return assignedMemory_; });
+
+  unsigned int moves = 0;
+  std::set<ClassId> classes{};
+  auto batches = strategy_->calculateBatchSizes(cache_, assignedMemory);
+
+  for (size_t i = 0; i < batches.size(); i++) {
+    const auto [tid, pid, cid] = assignedMemory[i];
+    const auto batch = batches[i];
+
+    classes.insert(cid);
+    const auto& mpStats = cache_.getPoolByTid(pid, tid).getStats();
+
+    if (!batch) {
+      continue;
+    }
+
+    // try moving BATCH items from the class in order to reach free target
+    auto moved = moverFunc(cache_, tid, pid, cid, batch);
+    moves += moved;
+    moves_per_class_[tid][pid][cid] += moved;
+    totalBytesMoved.add(moved * mpStats.acStats.at(cid).allocSize);
+  }
+
+  numTraversals.inc();
+  numMovedItems.add(moves);
+  totalClasses.add(classes.size());
+}
+
+template <typename CacheT>
+BackgroundMoverStats BackgroundMover<CacheT>::getStats() const noexcept {
+  BackgroundMoverStats stats;
+  stats.numMovedItems = numMovedItems.get();
+  stats.runCount = numTraversals.get();
+  stats.totalBytesMoved = totalBytesMoved.get();
+  stats.totalClasses = totalClasses.get();
+
+  return stats;
+}
+
+template <typename CacheT>
+std::map<TierId, std::map<PoolId, std::map<ClassId, uint64_t>>>
+BackgroundMover<CacheT>::getClassStats() const noexcept {
+  return moves_per_class_;
+}
+
+} // namespace cachelib
+} // namespace facebook
diff --git a/cachelib/allocator/BackgroundMover.h b/cachelib/allocator/BackgroundMover.h
new file mode 100644
index 0000000000..1246676d6e
--- /dev/null
+++ b/cachelib/allocator/BackgroundMover.h
@@ -0,0 +1,103 @@
+/*
+ * Copyright (c) Intel and its affiliates.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <folly/logging/xlog.h>
+#include <folly/synchronization/DistributedMutex.h>
+
+#include "cachelib/allocator/BackgroundMoverStrategy.h"
+#include "cachelib/allocator/CacheStats.h"
+#include "cachelib/common/AtomicCounter.h"
+#include "cachelib/common/PeriodicWorker.h"
+
+namespace facebook {
+namespace cachelib {
+
+// wrapper that exposes the private APIs of CacheType that are specifically
+// needed for the cache api
+template <typename C>
+struct BackgroundMoverAPIWrapper {
+  static size_t traverseAndEvictItems(C& cache,
+                                      unsigned int tid,
+                                      unsigned int pid,
+                                      unsigned int cid,
+                                      size_t batch) {
+    return cache.traverseAndEvictItems(tid, pid, cid, batch);
+  }
+
+  static size_t traverseAndPromoteItems(C& cache,
+                                        unsigned int tid,
+                                        unsigned int pid,
+                                        unsigned int cid,
+                                        size_t batch) {
+    return cache.traverseAndPromoteItems(tid, pid, cid, batch);
+  }
+};
+
+enum class MoverDir { Evict = 0, Promote };
+
+// Periodic worker that evicts items from tiers in batches.
+// The primary aim is to reduce insertion times for new items in the
+// cache.
+template <typename CacheT>
+class BackgroundMover : public PeriodicWorker {
+ public:
+  using Cache = CacheT;
+  // @param cache     the cache interface
+  // @param strategy  the strategy class that defines how objects are
+  //                  moved (promoted vs. evicted and how much)
+  // @param direction whether this worker evicts or promotes
+  BackgroundMover(Cache& cache,
+                  std::shared_ptr<BackgroundMoverStrategy> strategy,
+                  MoverDir direction);
+
+  ~BackgroundMover() override;
+
+  BackgroundMoverStats getStats() const noexcept;
+  std::map<TierId, std::map<PoolId, std::map<ClassId, uint64_t>>>
+  getClassStats() const noexcept;
+
+  void setAssignedMemory(
+      std::vector<MemoryDescriptorType>&& assignedMemory);
+
+ private:
+  std::map<TierId, std::map<PoolId, std::map<ClassId, uint64_t>>>
+      moves_per_class_;
+  // cache allocator's interface for evicting
+  using Item = typename Cache::Item;
+
+  Cache& cache_;
+  std::shared_ptr<BackgroundMoverStrategy> strategy_;
+  MoverDir direction_;
+
+  std::function<size_t(
+      Cache&, unsigned int, unsigned int, unsigned int, size_t)>
+      moverFunc;
+
+  // implements the actual logic of running the background evictor
+  void work() override final;
+  void checkAndRun();
+
+  AtomicCounter numMovedItems{0};
+  AtomicCounter numTraversals{0};
+  AtomicCounter totalClasses{0};
+  AtomicCounter totalBytesMoved{0};
+
+  std::vector<MemoryDescriptorType> assignedMemory_;
+  folly::DistributedMutex mutex;
+};
+} // namespace cachelib
+} // namespace facebook
+
+#include "cachelib/allocator/BackgroundMover-inl.h"
diff --git a/cachelib/allocator/BackgroundMoverStrategy.h b/cachelib/allocator/BackgroundMoverStrategy.h
new file mode 100644
index 0000000000..7706a625a5
--- /dev/null
+++ b/cachelib/allocator/BackgroundMoverStrategy.h
@@ -0,0 +1,42 @@
+/*
+ * Copyright (c) Facebook, Inc. and its affiliates.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "cachelib/allocator/Cache.h"
+
+
+namespace facebook {
+namespace cachelib {
+
+struct MemoryDescriptorType {
+  MemoryDescriptorType(TierId tid, PoolId pid, ClassId cid) :
+      tid_(tid), pid_(pid), cid_(cid) {}
+  TierId tid_;
+  PoolId pid_;
+  ClassId cid_;
+};
+
+// Base class for background eviction strategy.
+class BackgroundMoverStrategy {
+ public:
+  virtual std::vector<size_t> calculateBatchSizes(
+      const CacheBase& cache,
+      std::vector<MemoryDescriptorType> acVec) = 0;
+};
+
+} // namespace cachelib
+} // namespace facebook
diff --git a/cachelib/allocator/CCacheAllocator.cpp b/cachelib/allocator/CCacheAllocator.cpp
index 2709bde377..dd1986114b 100644
--- a/cachelib/allocator/CCacheAllocator.cpp
+++ b/cachelib/allocator/CCacheAllocator.cpp
@@ -36,7 +36,9 @@ CCacheAllocator::CCacheAllocator(MemoryAllocator& allocator,
       currentChunksIndex_(0) {
   auto& currentChunks = chunks_[currentChunksIndex_];
   for (auto chunk : *object.chunks()) {
-    currentChunks.push_back(allocator_.unCompress(CompressedPtr(chunk)));
+    // TODO : pass multi-tier flag when compact cache supports multi-tier config
+    currentChunks.push_back(
+        allocator_.unCompress(CompressedPtr(chunk), false /* isMultiTier */));
   }
 }
 
@@ -97,7 +99,9 @@ CCacheAllocator::SerializationType CCacheAllocator::saveState() {
   std::lock_guard guard(resizeLock_);
 
   for (auto chunk : getCurrentChunks()) {
-    object.chunks()->push_back(allocator_.compress(chunk).saveState());
+    // TODO : pass multi-tier flag when compact cache supports multi-tier config
+    object.chunks()->push_back(
+        allocator_.compress(chunk, false /* isMultiTier */).saveState());
   }
   return object;
 }
diff --git a/cachelib/allocator/CMakeLists.txt b/cachelib/allocator/CMakeLists.txt
index 78cfa7ca06..6103cdc823 100644
--- a/cachelib/allocator/CMakeLists.txt
+++ b/cachelib/allocator/CMakeLists.txt
@@ -35,6 +35,7 @@ add_library (cachelib_allocator
     CCacheManager.cpp
     ContainerTypes.cpp
     FreeMemStrategy.cpp
+    FreeThresholdStrategy.cpp
     HitsPerSlabStrategy.cpp
     LruTailAgeStrategy.cpp
     MarginalHitsOptimizeStrategy.cpp
@@ -117,6 +118,8 @@ if (BUILD_TESTS)
   add_test (tests/ChainedHashTest.cpp)
   add_test (tests/AllocatorResizeTypeTest.cpp)
   add_test (tests/AllocatorHitStatsTypeTest.cpp)
+  add_test (tests/AllocatorMemoryTiersTest.cpp)
+  add_test (tests/MemoryTiersTest.cpp)
   add_test (tests/MultiAllocatorTest.cpp)
   add_test (tests/NvmAdmissionPolicyTest.cpp)
   add_test (tests/CacheAllocatorConfigTest.cpp)
diff --git a/cachelib/allocator/Cache.h b/cachelib/allocator/Cache.h
index e225ba8a01..c871358189 100644
--- a/cachelib/allocator/Cache.h
+++ b/cachelib/allocator/Cache.h
@@ -85,6 +85,9 @@ class CacheBase {
   CacheBase(CacheBase&&) = default;
   CacheBase& operator=(CacheBase&&) = default;
 
+  // TODO: come up with some reasonable number
+  static constexpr unsigned kMaxTiers = 2;
+
   // Get a string referring to the cache name for this cache
   virtual const std::string getCacheName() const = 0;
 
@@ -95,6 +98,12 @@ class CacheBase {
   //
   // @param poolId The pool id to query
   virtual const MemoryPool& getPool(PoolId poolId) const = 0;
+
+  // Get the reference to a memory pool using a tier id, for stats purposes
+  //
+  // @param poolId The pool id to query
+  // @param tid    The tier of the pool id
+  virtual const MemoryPool& getPoolByTid(PoolId poolId, TierId tid) const = 0;
 
   // Get Pool specific stats (regular pools). This includes stats from the
   // Memory Pool and also the cache.
@@ -102,6 +111,12 @@ class CacheBase { // @param poolId the pool id virtual PoolStats getPoolStats(PoolId poolId) const = 0; + // Get Allocation Class specific stats. + // + // @param poolId the pool id + // @param classId the class id + virtual ACStats getACStats(TierId tid, PoolId poolId, ClassId classId) const = 0; + // @param poolId the pool id virtual AllSlabReleaseEvents getAllSlabReleaseEvents(PoolId poolId) const = 0; diff --git a/cachelib/allocator/CacheAllocator-inl.h b/cachelib/allocator/CacheAllocator-inl.h index 1d89593268..614f031aeb 100644 --- a/cachelib/allocator/CacheAllocator-inl.h +++ b/cachelib/allocator/CacheAllocator-inl.h @@ -16,6 +16,8 @@ #pragma once +#include + namespace facebook { namespace cachelib { @@ -35,6 +37,7 @@ CacheAllocator::CacheAllocator(SharedMemNewT, Config config) template CacheAllocator::CacheAllocator(SharedMemAttachT, Config config) : CacheAllocator(InitMemType::kMemAttach, config) { + /* TODO - per tier? */ for (auto pid : *metadata_.compactCachePools()) { isCompactCachePool_[pid] = true; } @@ -53,12 +56,13 @@ CacheAllocator::CacheAllocator( : isOnShm_{type != InitMemType::kNone ? true : config.memMonitoringEnabled()}, config_(config.validate()), + memoryTierConfigs(config.getMemoryTierConfigs()), tempShm_(type == InitMemType::kNone && isOnShm_ - ? std::make_unique(config_.size) + ? std::make_unique(config_.getCacheSize()) : nullptr), shmManager_(type != InitMemType::kNone ? std::make_unique(config_.cacheDir, - config_.usePosixShm) + config_.isUsingPosixShm()) : nullptr), deserializer_(type == InitMemType::kMemAttach ? createDeserializer() : nullptr), @@ -67,12 +71,12 @@ CacheAllocator::CacheAllocator( : serialization::CacheAllocatorMetadata{}}, allocator_(initAllocator(type)), compactCacheManager_(type != InitMemType::kMemAttach - ? std::make_unique(*allocator_) - : restoreCCacheManager()), + ? std::make_unique(*allocator_[0] /* TODO: per tier */) + : restoreCCacheManager(0/* TODO: per tier */)), compressor_(createPtrCompressor()), mmContainers_(type == InitMemType::kMemAttach ? deserializeMMContainers(*deserializer_, compressor_) - : MMContainers{}), + : MMContainers{getNumTiers()}), accessContainer_(initAccessContainer( type, detail::kShmHashTableName, config.accessConfig)), chainedItemAccessContainer_( @@ -81,6 +85,8 @@ CacheAllocator::CacheAllocator( config.chainedItemAccessConfig)), chainedItemLocks_(config_.chainedItemsLockPower, std::make_shared()), + movesMap_(kShards), + moveLock_(kShards), cacheCreationTime_{ type != InitMemType::kMemAttach ? 
util::getCurrentTimeSec() @@ -105,48 +111,98 @@ CacheAllocator::~CacheAllocator() { } template -ShmSegmentOpts CacheAllocator::createShmCacheOpts() { +ShmSegmentOpts CacheAllocator::createShmCacheOpts(TierId tid) { ShmSegmentOpts opts; opts.alignment = sizeof(Slab); auto memoryTierConfigs = config_.getMemoryTierConfigs(); // TODO: we support single tier so far - XDCHECK_EQ(memoryTierConfigs.size(), 1ul); - opts.memBindNumaNodes = memoryTierConfigs[0].getMemBind(); - + if (memoryTierConfigs.size() > 2) { + throw std::invalid_argument("CacheLib only supports two memory tiers"); + } + opts.memBindNumaNodes = memoryTierConfigs[tid].getMemBind(); return opts; } +template +size_t CacheAllocator::memoryTierSize(TierId tid) const { + auto partitions = std::accumulate(memoryTierConfigs.begin(), memoryTierConfigs.end(), 0UL, + [](const size_t i, const MemoryTierCacheConfig& config){ + return i + config.getRatio(); + }); + + return memoryTierConfigs[tid].calculateTierSize(config_.getCacheSize(), partitions); +} + +template +std::vector> +CacheAllocator::createPrivateAllocator() { + std::vector> allocators; + + if (isOnShm_) + allocators.emplace_back(std::make_unique( + getAllocatorConfig(config_), + tempShm_->getAddr(), + config_.getCacheSize())); + else + allocators.emplace_back(std::make_unique( + getAllocatorConfig(config_), + config_.getCacheSize())); + + return allocators; +} + template std::unique_ptr -CacheAllocator::createNewMemoryAllocator() { +CacheAllocator::createNewMemoryAllocator(TierId tid) { + size_t tierSize = memoryTierSize(tid); return std::make_unique( getAllocatorConfig(config_), shmManager_ - ->createShm(detail::kShmCacheName, config_.size, - config_.slabMemoryBaseAddr, createShmCacheOpts()) + ->createShm(detail::kShmCacheName + std::to_string(tid), + tierSize, config_.slabMemoryBaseAddr, + createShmCacheOpts(tid)) .addr, - config_.size); + tierSize); } template std::unique_ptr -CacheAllocator::restoreMemoryAllocator() { +CacheAllocator::restoreMemoryAllocator(TierId tid) { return std::make_unique( deserializer_->deserialize(), shmManager_ - ->attachShm(detail::kShmCacheName, config_.slabMemoryBaseAddr, - createShmCacheOpts()) - .addr, - config_.size, + ->attachShm(detail::kShmCacheName + std::to_string(tid), + config_.slabMemoryBaseAddr, createShmCacheOpts(tid)).addr, + memoryTierSize(tid), config_.disableFullCoredump); } +template +std::vector> +CacheAllocator::createAllocators() { + std::vector> allocators; + for (int tid = 0; tid < getNumTiers(); tid++) { + allocators.emplace_back(createNewMemoryAllocator(tid)); + } + return allocators; +} + +template +std::vector> +CacheAllocator::restoreAllocators() { + std::vector> allocators; + for (int tid = 0; tid < getNumTiers(); tid++) { + allocators.emplace_back(restoreMemoryAllocator(tid)); + } + return allocators; +} + template std::unique_ptr -CacheAllocator::restoreCCacheManager() { +CacheAllocator::restoreCCacheManager(TierId tid) { return std::make_unique( deserializer_->deserialize(), - *allocator_); + *allocator_[tid]); } template @@ -235,23 +291,30 @@ void CacheAllocator::initWorkers() { config_.poolOptimizeStrategy, config_.ccacheOptimizeStepSizePercent); } + + if (config_.backgroundEvictorEnabled()) { + startNewBackgroundEvictor(config_.backgroundEvictorInterval, + config_.backgroundEvictorStrategy, + config_.backgroundEvictorThreads); + } + + if (config_.backgroundPromoterEnabled()) { + startNewBackgroundPromoter(config_.backgroundPromoterInterval, + config_.backgroundPromoterStrategy, + 
config_.backgroundPromoterThreads); + } } template -std::unique_ptr CacheAllocator::initAllocator( +std::vector> +CacheAllocator::initAllocator( InitMemType type) { if (type == InitMemType::kNone) { - if (isOnShm_ == true) { - return std::make_unique( - getAllocatorConfig(config_), tempShm_->getAddr(), config_.size); - } else { - return std::make_unique(getAllocatorConfig(config_), - config_.size); - } + return createPrivateAllocator(); } else if (type == InitMemType::kMemNew) { - return createNewMemoryAllocator(); + return createAllocators(); } else if (type == InitMemType::kMemAttach) { - return restoreMemoryAllocator(); + return restoreAllocators(); } // Invalid type @@ -318,13 +381,31 @@ CacheAllocator::allocate(PoolId poolId, ttlSecs == 0 ? 0 : creationTime + ttlSecs); } +template +bool CacheAllocator::shouldWakeupBgEvictor(TierId tid, PoolId pid, ClassId cid) { + // TODO: should we also work on lower tiers? should we have separate set of params? + if (tid == 1) return false; + return (1-getACStats(tid, pid, cid).usageFraction())*100 <= config_.lowEvictionAcWatermark; +} + +template +size_t CacheAllocator::backgroundWorkerId(TierId tid, PoolId pid, ClassId cid, size_t numWorkers) { + XDCHECK(numWorkers); + + // TODO: came up with some better sharding (use some hashing) + return (tid + pid + cid) % numWorkers; +} + + template typename CacheAllocator::WriteHandle -CacheAllocator::allocateInternal(PoolId pid, - typename Item::Key key, - uint32_t size, - uint32_t creationTime, - uint32_t expiryTime) { +CacheAllocator::allocateInternalTier(TierId tid, + PoolId pid, + typename Item::Key key, + uint32_t size, + uint32_t creationTime, + uint32_t expiryTime, + bool fromBgThread) { util::LatencyTracker tracker{stats().allocateLatency_}; SCOPE_FAIL { stats_.invalidAllocs.inc(); }; @@ -333,13 +414,32 @@ CacheAllocator::allocateInternal(PoolId pid, const auto requiredSize = Item::getRequiredSize(key, size); // the allocation class in our memory allocator. - const auto cid = allocator_->getAllocationClassId(pid, requiredSize); + const auto cid = allocator_[tid]->getAllocationClassId(pid, requiredSize); + util::RollingLatencyTracker rollTracker{ + (*stats_.classAllocLatency)[tid][pid][cid]}; + // TODO: per-tier (*stats_.allocAttempts)[pid][cid].inc(); + + void *memory = nullptr; + + if (tid == 0 && config_.acTopTierEvictionWatermark > 0.0 + && 100.0 * (1 - getACStats(tid, pid, cid).usageFraction()) < config_.acTopTierEvictionWatermark) { + memory = findEviction(tid, pid, cid); + } - void* memory = allocator_->allocate(pid, requiredSize); if (memory == nullptr) { - memory = findEviction(pid, cid); + // TODO: should we try allocate item even if this will result in violating + // acTopTierEvictionWatermark? + memory = allocator_[tid]->allocate(pid, requiredSize); + } + + if (backgroundEvictor_.size() && !fromBgThread && (memory == nullptr || shouldWakeupBgEvictor(tid, pid, cid))) { + backgroundEvictor_[backgroundWorkerId(tid, pid, cid, backgroundEvictor_.size())]->wakeUp(); + } + + if (memory == nullptr) { + memory = findEviction(tid, pid, cid); } WriteHandle handle; @@ -350,7 +450,7 @@ CacheAllocator::allocateInternal(PoolId pid, // for example. SCOPE_FAIL { // free back the memory to the allocator since we failed. - allocator_->free(memory); + allocator_[tid]->free(memory); }; handle = acquire(new (memory) Item(key, size, creationTime, expiryTime)); @@ -361,7 +461,7 @@ CacheAllocator::allocateInternal(PoolId pid, } } else { // failed to allocate memory. 
- (*stats_.allocFailures)[pid][cid].inc(); + (*stats_.allocFailures)[pid][cid].inc(); // TODO: per-tier // wake up rebalancer if (poolRebalancer_) { poolRebalancer_->wakeUp(); @@ -378,6 +478,90 @@ CacheAllocator::allocateInternal(PoolId pid, return handle; } +template +TierId +CacheAllocator::getTargetTierForItem(PoolId pid, + typename Item::Key key, + uint32_t size, + uint32_t creationTime, + uint32_t expiryTime) { + if (getNumTiers() == 1) + return 0; + + if (config_.forceAllocationTier != UINT64_MAX) { + return config_.forceAllocationTier; + } + + const TierId defaultTargetTier = 0; + + const auto requiredSize = Item::getRequiredSize(key, size); + const auto cid = allocator_[defaultTargetTier]->getAllocationClassId(pid, requiredSize); + + auto freePercentage = 100.0 * (1 - getACStats(defaultTargetTier, pid, cid).usageFraction()); + + // TODO: COULD we implement BG worker which would move slabs around + // so that there is similar amount of free space in each pool/ac. + // Should this be responsibility of BG evictor? + + if (freePercentage >= config_.maxAcAllocationWatermark) + return defaultTargetTier; + + if (freePercentage <= config_.minAcAllocationWatermark) + return defaultTargetTier + 1; + + // TODO: we can think about creating different allocation classes for different memory tiers + // and we could look at possible fragmentation when deciding where to put the item + if (config_.sizeThresholdPolicy) + return requiredSize < config_.sizeThresholdPolicy ? defaultTargetTier : defaultTargetTier + 1; + + // TODO: (e.g. always put chained items to second tier) + // if (chainedItemsPolicy) + // return item.isChainedItem() ? defaultTargetTier + 1 : defaultTargetTier; + + // TODO: + // if (expiryTimePolicy) + // return (expiryTime - creationTime) < expiryTimePolicy ? defaultTargetTier : defaultTargetTier + 1; + + // TODO: + // if (keyPolicy) // this can be based on key length or some other properties + // return getTargetTierForKey(key); + + // TODO: + // if (compressabilityPolicy) // if compresses well store on CXL? latency will be higher anyway + // return TODO; + + if (config_.defaultTierChancePercentage >= 100.00) { + return 0; + } + + return (folly::Random::rand32() % 100) < config_.defaultTierChancePercentage ? defaultTargetTier : defaultTargetTier + 1; +} + +template +bool CacheAllocator::shouldEvictToNextMemoryTier( + TierId sourceTierId, TierId targetTierId, PoolId pid, Item& item) +{ + if (config_.disableEvictionToMemory) + return false; + + // TODO: implement more advanced admission policies for memory tiers, for example: + // - always evict big items to NVMe only + // - do not evict an item if pressure on target Tier is high + return true; +} + +template +typename CacheAllocator::WriteHandle +CacheAllocator::allocateInternal(PoolId pid, + typename Item::Key key, + uint32_t size, + uint32_t creationTime, + uint32_t expiryTime, + bool fromBgThread) { + auto tid = getTargetTierForItem(pid, key, size, creationTime, expiryTime); + return allocateInternalTier(tid, pid, key, size, creationTime, expiryTime, fromBgThread); +} + template typename CacheAllocator::WriteHandle CacheAllocator::allocateChainedItem(const ReadHandle& parent, @@ -408,21 +592,29 @@ CacheAllocator::allocateChainedItemInternal( // number of bytes required for this item const auto requiredSize = ChainedItem::getRequiredSize(size); - const auto pid = allocator_->getAllocInfo(parent->getMemory()).poolId; - const auto cid = allocator_->getAllocationClassId(pid, requiredSize); + // TODO: is this correct? 
+ auto tid = getTierId(*parent); + const auto pid = allocator_[tid]->getAllocInfo(parent->getMemory()).poolId; + const auto cid = allocator_[tid]->getAllocationClassId(pid, requiredSize); + + util::RollingLatencyTracker rollTracker{ + (*stats_.classAllocLatency)[tid][pid][cid]}; + + // TODO: per-tier? Right now stats_ are not used in any public periodic + // worker (*stats_.allocAttempts)[pid][cid].inc(); - void* memory = allocator_->allocate(pid, requiredSize); + void* memory = allocator_[tid]->allocate(pid, requiredSize); if (memory == nullptr) { - memory = findEviction(pid, cid); + memory = findEviction(tid, pid, cid); } if (memory == nullptr) { (*stats_.allocFailures)[pid][cid].inc(); return WriteHandle{}; } - SCOPE_FAIL { allocator_->free(memory); }; + SCOPE_FAIL { allocator_[tid]->free(memory); }; auto child = acquire( new (memory) ChainedItem(compressor_.compress(parent.getInternal()), size, @@ -466,14 +658,15 @@ void CacheAllocator::addChainedItem(WriteHandle& parent, // Count a new child stats_.numChainedChildItems.inc(); - insertInMMContainer(*child); - // Increment refcount since this chained item is now owned by the parent // Parent will decrement the refcount upon release. Since this is an // internal refcount, we dont include it in active handle tracking. - child->incRef(); + auto ret = child->incRef(true); + XDCHECK(ret == RefcountWithFlags::incResult::incOk); XDCHECK_EQ(2u, child->getRefCount()); + insertInMMContainer(*child); + invalidateNvm(*parent); if (auto eventTracker = getEventTracker()) { eventTracker->record(AllocatorApiEvent::ADD_CHAINED, parent->getKey(), @@ -717,7 +910,8 @@ CacheAllocator::replaceChainedItemLocked(Item& oldItem, // Since this is an internal refcount, we dont include it in active handle // tracking. - newItemHdl->incRef(); + auto ret = newItemHdl->incRef(true); + XDCHECK(ret == RefcountWithFlags::incResult::incOk); return oldItemHdl; } @@ -731,8 +925,8 @@ CacheAllocator::releaseBackToAllocator(Item& it, throw std::runtime_error( folly::sformat("cannot release this item: {}", it.toString())); } - - const auto allocInfo = allocator_->getAllocInfo(it.getMemory()); + const auto tid = getTierId(it); + const auto allocInfo = allocator_[tid]->getAllocInfo(it.getMemory()); if (ctx == RemoveContext::kEviction) { const auto timeNow = util::getCurrentTimeSec(); @@ -756,8 +950,7 @@ CacheAllocator::releaseBackToAllocator(Item& it, folly::sformat("Can not recycle a chained item {}, toRecyle", it.toString(), toRecycle->toString())); } - - allocator_->free(&it); + allocator_[tid]->free(&it); return ReleaseRes::kReleased; } @@ -826,26 +1019,29 @@ CacheAllocator::releaseBackToAllocator(Item& it, auto next = head->getNext(compressor_); const auto childInfo = - allocator_->getAllocInfo(static_cast(head)); + allocator_[tid]->getAllocInfo(static_cast(head)); (*stats_.fragmentationSize)[childInfo.poolId][childInfo.classId].sub( util::getFragmentation(*this, *head)); removeFromMMContainer(*head); - // If this chained item is marked as exclusive, we will not free it. - // We must capture the exclusive state before we do the decRef when - // we know the item must still be valid - const bool wasExclusive = head->isExclusive(); + // If this chained item is marked as moving, we will not free it. + // We must capture the moving state before we do the decRef when + // we know the item must still be valid. Item cannot be marked as + // exclusive. Only parent can be marked as such and even parent needs + // to be unmark prior to calling releaseBackToAllocator. 
+ const bool wasMoving = head->isMoving(); + XDCHECK(!head->isMarkedForEviction()); // Decref and check if we were the last reference. Now if the item - // was marked exclusive, after decRef, it will be free to be released + // was marked moving, after decRef, it will be free to be released // by slab release thread const auto childRef = head->decRef(); - // If the item is already exclusive and we already decremented the + // If the item is already moving and we already decremented the // refcount, we don't need to free this item. We'll let the slab // release thread take care of that - if (!wasExclusive) { + if (!wasMoving) { if (childRef != 0) { throw std::runtime_error(folly::sformat( "chained item refcount is not zero. We cannot proceed! " @@ -853,13 +1049,13 @@ CacheAllocator::releaseBackToAllocator(Item& it, childRef, head->toString())); } - // Item is not exclusive and refcount is 0, we can proceed to + // Item is not moving and refcount is 0, we can proceed to // free it or recylce the memory if (head == toRecycle) { XDCHECK(ReleaseRes::kReleased != res); res = ReleaseRes::kRecycled; } else { - allocator_->free(head); + allocator_[tid]->free(head); } } @@ -874,16 +1070,19 @@ CacheAllocator::releaseBackToAllocator(Item& it, res = ReleaseRes::kRecycled; } else { XDCHECK(it.isDrained()); - allocator_->free(&it); + allocator_[tid]->free(&it); } return res; } template -void CacheAllocator::incRef(Item& it) { - it.incRef(); - ++handleCount_.tlStats(); +RefcountWithFlags::incResult CacheAllocator::incRef(Item& it, bool failIfMoving) { + auto ret = it.incRef(failIfMoving); + if (ret == RefcountWithFlags::incResult::incOk) { + ++handleCount_.tlStats(); + } + return ret; } template @@ -903,8 +1102,18 @@ CacheAllocator::acquire(Item* it) { SCOPE_FAIL { stats_.numRefcountOverflow.inc(); }; - incRef(*it); - return WriteHandle{it, *this}; + // TODO: do not block incRef for child items to avoid deadlock + auto failIfMoving = getNumTiers() > 1 && !it->isChainedItem(); + auto incRes = incRef(*it, failIfMoving); + if (LIKELY(incRes == RefcountWithFlags::incResult::incOk)) { + return WriteHandle{it, *this}; + } else if (incRes == RefcountWithFlags::incResult::incFailedEviction){ + // item is being evicted + return WriteHandle{}; + } else { + // item is being moved - wait for completion + return handleWithWaitContextForMovingItem(*it); + } } template @@ -946,6 +1155,25 @@ bool CacheAllocator::replaceInMMContainer(Item& oldItem, } } +template +bool CacheAllocator::replaceInMMContainer(Item* oldItem, + Item& newItem) { + return replaceInMMContainer(*oldItem, newItem); +} + +template +bool CacheAllocator::replaceInMMContainer(EvictionIterator& oldItemIt, + Item& newItem) { + auto& oldContainer = getMMContainer(*oldItemIt); + auto& newContainer = getMMContainer(newItem); + + // This function is used for eviction across tiers + XDCHECK(&oldContainer != &newContainer); + oldContainer.remove(oldItemIt); + + return newContainer.add(newItem); +} + template bool CacheAllocator::replaceChainedItemInMMContainer( Item& oldItem, Item& newItem) { @@ -1091,6 +1319,165 @@ CacheAllocator::insertOrReplace(const WriteHandle& handle) { return replaced; } +/* Next two methods are used to asynchronously move Item between memory tiers. + * + * The thread, which moves Item, allocates new Item in the tier we are moving to + * and calls moveRegularItemWithSync() method. This method does the following: + * 1. Update the access container with the new item from the tier we are + * moving to. This Item has moving flag set. + * 2. 
Copy data from the old Item to the new one. + * + * Concurrent threads which are getting handle to the same key: + * 1. When a handle is created it checks if the moving flag is set + * 2. If so, Handle implementation creates waitContext and adds it to the + * MoveCtx by calling handleWithWaitContextForMovingItem() method. + * 3. Wait until the moving thread will complete its job. + */ +template +typename CacheAllocator::WriteHandle +CacheAllocator::handleWithWaitContextForMovingItem(Item& item) { + auto shard = getShardForKey(item.getKey()); + auto& movesMap = getMoveMapForShard(shard); + { + auto lock = getMoveLockForShard(shard); + + WriteHandle hdl{*this}; + auto waitContext = hdl.getItemWaitContext(); + + auto ret = movesMap.try_emplace(item.getKey(), std::make_unique()); + ret.first->second->addWaiter(std::move(waitContext)); + + return hdl; + } +} + +template +size_t CacheAllocator::wakeUpWaitersLocked(folly::StringPiece key, + WriteHandle&& handle) { + std::unique_ptr ctx; + auto shard = getShardForKey(key); + auto& movesMap = getMoveMapForShard(shard); + { + auto lock = getMoveLockForShard(shard); + movesMap.eraseInto(key, [&](auto &&key, auto &&value) { + ctx = std::move(value); + }); + } + + if (ctx) { + ctx->setItemHandle(std::move(handle)); + return ctx->numWaiters(); + } + + return 0; +} + +template +bool CacheAllocator::moveRegularItemWithSync( + Item& oldItem, WriteHandle& newItemHdl) { + //on function exit - the new item handle is no longer moving + //and other threads may access it - but in case where + //we failed to replace in access container we can give the + //new item back to the allocator + auto guard = folly::makeGuard([&]() { + auto ref = newItemHdl->unmarkMoving(); + if (UNLIKELY(ref == 0)) { + const auto res = + releaseBackToAllocator(*newItemHdl, RemoveContext::kNormal, false); + XDCHECK(res == ReleaseRes::kReleased); + } + }); + + XDCHECK(oldItem.isMoving()); + XDCHECK(!oldItem.isExpired()); + // TODO: should we introduce new latency tracker. E.g. evictRegularLatency_ + // ??? util::LatencyTracker tracker{stats_.evictRegularLatency_}; + + XDCHECK_EQ(newItemHdl->getSize(), oldItem.getSize()); + + // take care of the flags before we expose the item to be accessed. this + // will ensure that when another thread removes the item from RAM, we issue + // a delete accordingly. See D7859775 for an example + if (oldItem.isNvmClean()) { + newItemHdl->markNvmClean(); + } + + // mark new item as moving to block readers until the data is copied + // (moveCb is called). Mark item in MMContainer temporarily (TODO: should + // we remove markMoving requirement for the item to be linked?) + newItemHdl->markInMMContainer(); + auto marked = newItemHdl->markMoving(false /* there is already a handle */); + newItemHdl->unmarkInMMContainer(); + XDCHECK(marked); + + auto predicate = [&](const Item& item){ + // we rely on moving flag being set (it should block all readers) + XDCHECK(item.getRefCount() == 0); + return true; + }; + + auto replaced = accessContainer_->replaceIf(oldItem, *newItemHdl, + predicate); + // another thread may have called insertOrReplace which could have + // marked this item as unaccessible causing the replaceIf + // in the access container to fail - in this case we want + // to abort the move since the item is no longer valid + if (!replaced) { + return false; + } + // what if another thread calls insertOrReplace now when + // the item is moving and already replaced in the hash table? + // 1. 
it succeeds in updating the hash table - so there is + // no guarentee that isAccessible() is true + // 2. it will then try to remove from MM container + // - this operation will wait for newItemHdl to + // be unmarkedMoving via the waitContext + // 3. replaced handle is returned and eventually drops + // ref to 0 and the item is recycled back to allocator. + + if (config_.moveCb) { + // Execute the move callback. We cannot make any guarantees about the + // consistency of the old item beyond this point, because the callback can + // do more than a simple memcpy() e.g. update external references. If there + // are any remaining handles to the old item, it is the caller's + // responsibility to invalidate them. The move can only fail after this + // statement if the old item has been removed or replaced, in which case it + // should be fine for it to be left in an inconsistent state. + config_.moveCb(oldItem, *newItemHdl, nullptr); + } else { + std::memcpy(newItemHdl->getMemory(), oldItem.getMemory(), + oldItem.getSize()); + } + + // Adding the item to mmContainer has to succeed since no one can remove the item + auto& newContainer = getMMContainer(*newItemHdl); + auto mmContainerAdded = newContainer.add(*newItemHdl); + XDCHECK(mmContainerAdded); + + // no one can add or remove chained items at this point + if (oldItem.hasChainedItem()) { + // safe to acquire handle for a moving Item + auto incRes = incRef(oldItem, false); + XDCHECK(incRes == RefcountWithFlags::incResult::incOk); + auto oldHandle = WriteHandle{&oldItem,*this}; + XDCHECK_EQ(1u, oldHandle->getRefCount()) << oldHandle->toString(); + XDCHECK(!newItemHdl->hasChainedItem()) << newItemHdl->toString(); + try { + auto l = chainedItemLocks_.lockExclusive(oldItem.getKey()); + transferChainLocked(oldHandle, newItemHdl); + } catch (const std::exception& e) { + // this should never happen because we drained all the handles. + XLOGF(DFATAL, "{}", e.what()); + throw; + } + + XDCHECK(!oldItem.hasChainedItem()); + XDCHECK(newItemHdl->hasChainedItem()); + } + newItemHdl.unmarkNascent(); + return true; +} + template bool CacheAllocator::moveRegularItem(Item& oldItem, WriteHandle& newItemHdl) { @@ -1122,15 +1509,15 @@ bool CacheAllocator::moveRegularItem(Item& oldItem, config_.moveCb(oldItem, *newItemHdl, nullptr); // Inside the access container's lock, this checks if the old item is - // accessible and its refcount is zero. If the item is not accessible, + // accessible and its refcount is one. If the item is not accessible, // there is no point to replace it since it had already been removed // or in the process of being removed. If the item is in cache but the - // refcount is non-zero, it means user could be attempting to remove + // refcount is non-one, it means user could be attempting to remove // this item through an API such as remove(itemHandle). In this case, // it is unsafe to replace the old item with a new one, so we should // also abort. 
if (!accessContainer_->replaceIf(oldItem, *newItemHdl, - itemExclusivePredicate)) { + itemSlabMovePredicate)) { return false; } @@ -1151,13 +1538,12 @@ bool CacheAllocator::moveRegularItem(Item& oldItem, // no one can add or remove chained items at this point if (oldItem.hasChainedItem()) { - // safe to acquire handle for a moving Item - auto oldHandle = acquire(&oldItem); - XDCHECK_EQ(1u, oldHandle->getRefCount()) << oldHandle->toString(); + auto oldItemHdl = acquire(&oldItem); + XDCHECK_EQ(1u, oldItemHdl->getRefCount()) << oldItemHdl->toString(); XDCHECK(!newItemHdl->hasChainedItem()) << newItemHdl->toString(); try { auto l = chainedItemLocks_.lockExclusive(oldItem.getKey()); - transferChainLocked(oldHandle, newItemHdl); + transferChainLocked(oldItemHdl, newItemHdl); } catch (const std::exception& e) { // this should never happen because we drained all the handles. XLOGF(DFATAL, "{}", e.what()); @@ -1179,18 +1565,18 @@ bool CacheAllocator::moveChainedItem(ChainedItem& oldItem, // This item has been unlinked from its parent and we're the only // owner of it, so we're done here - if (!oldItem.isInMMContainer() || oldItem.isOnlyExclusive()) { + if (!oldItem.isInMMContainer() || oldItem.isOnlyMoving()) { return false; } - const auto parentKey = oldItem.getParentItem(compressor_).getKey(); - - // Grab lock to prevent anyone else from modifying the chain + auto& expectedParent = oldItem.getParentItem(compressor_); + const auto parentKey = expectedParent.getKey(); auto l = chainedItemLocks_.lockExclusive(parentKey); + // verify old item under the lock auto parentHandle = validateAndGetParentHandleForChainedMoveLocked(oldItem, parentKey); - if (!parentHandle) { + if (!parentHandle || &expectedParent != parentHandle.get()) { return false; } @@ -1210,7 +1596,7 @@ bool CacheAllocator::moveChainedItem(ChainedItem& oldItem, // In case someone else had removed this chained item from its parent by now // So we check again to see if the it has been unlinked from its parent - if (!oldItem.isInMMContainer() || oldItem.isOnlyExclusive()) { + if (!oldItem.isInMMContainer() || oldItem.isOnlyMoving()) { return false; } @@ -1226,90 +1612,187 @@ bool CacheAllocator::moveChainedItem(ChainedItem& oldItem, // parent's chain and the MMContainer. auto oldItemHandle = replaceChainedItemLocked(oldItem, std::move(newItemHdl), *parentHandle); - XDCHECK(oldItemHandle->isExclusive()); + XDCHECK(oldItemHandle->isMoving()); XDCHECK(!oldItemHandle->isInMMContainer()); return true; } template -typename CacheAllocator::Item* -CacheAllocator::findEviction(PoolId pid, ClassId cid) { - auto& mmContainer = getMMContainer(pid, cid); +typename CacheAllocator::NvmCacheT::PutToken +CacheAllocator::createPutToken(Item& item) { + const bool evictToNvmCache = shouldWriteToNvmCache(item); + return evictToNvmCache ? nvmCache_->createPutToken(item.getKey()) + : typename NvmCacheT::PutToken{}; +} + +template +void CacheAllocator::unlinkItemForEviction(Item& it) { + XDCHECK(it.isMarkedForEviction()); + XDCHECK(it.getRefCount() == 0); + accessContainer_->remove(it); + removeFromMMContainer(it); + // Since we managed to mark the item for eviction we must be the only + // owner of the item. 
+ const auto ref = it.unmarkForEviction(); + XDCHECK(ref == 0u); +} + +template +typename CacheAllocator::Item* +CacheAllocator::findEviction(TierId tid, PoolId pid, ClassId cid) { + auto& mmContainer = getMMContainer(tid, pid, cid); + bool lastTier = tid+1 >= getNumTiers(); // Keep searching for a candidate until we were able to evict it // or until the search limit has been exhausted unsigned int searchTries = 0; - auto itr = mmContainer.getEvictionIterator(); while ((config_.evictionSearchTries == 0 || - config_.evictionSearchTries > searchTries) && - itr) { - ++searchTries; - (*stats_.evictionAttempts)[pid][cid].inc(); + config_.evictionSearchTries > searchTries)) { + + Item* toRecycle = nullptr; + Item* candidate = nullptr; + typename NvmCacheT::PutToken token; + + mmContainer.withEvictionIterator([this, pid, cid, &candidate, &toRecycle, + &searchTries, &mmContainer, &lastTier, + &token](auto&& itr) { + if (!itr) { + ++searchTries; + (*stats_.evictionAttempts)[pid][cid].inc(); + return; + } - Item* toRecycle = itr.get(); + while ((config_.evictionSearchTries == 0 || + config_.evictionSearchTries > searchTries) && + itr) { + ++searchTries; + (*stats_.evictionAttempts)[pid][cid].inc(); + + auto* toRecycle_ = itr.get(); + auto* candidate_ = + toRecycle_->isChainedItem() + ? &toRecycle_->asChainedItem().getParentItem(compressor_) + : toRecycle_; + + if (lastTier) { + // if it's last tier, the item will be evicted + // need to create put token before marking it exclusive + token = createPutToken(*candidate_); + } - Item* candidate = - toRecycle->isChainedItem() - ? &toRecycle->asChainedItem().getParentItem(compressor_) - : toRecycle; + if (lastTier && shouldWriteToNvmCache(*candidate_) && !token.isValid()) { + stats_.evictFailConcurrentFill.inc(); + } else if ( (lastTier && candidate_->markForEviction()) || + (!lastTier && candidate_->markMoving(true)) ) { + XDCHECK(candidate_->isMoving() || candidate_->isMarkedForEviction()); + // markForEviction to make sure no other thead is evicting the item + // nor holding a handle to that item if this is last tier + // since we won't be moving the item to the next tier + toRecycle = toRecycle_; + candidate = candidate_; + + // Check if parent changed for chained items - if yes, we cannot + // remove the child from the mmContainer as we will not be evicting + // it. We could abort right here, but we need to cleanup in case + // unmarkForEviction() returns 0 - so just go through normal path. + if (!toRecycle_->isChainedItem() || + &toRecycle->asChainedItem().getParentItem(compressor_) == + candidate) + mmContainer.remove(itr); + return; + } - // make sure no other thead is evicting the item - if (candidate->getRefCount() != 0 || !candidate->markExclusive()) { - ++itr; - continue; - } + if (candidate_->hasChainedItem()) { + stats_.evictFailParentAC.inc(); + } else { + stats_.evictFailAC.inc(); + } - // for chained items, the ownership of the parent can change. We try to - // evict what we think as parent and see if the eviction of parent - // recycles the child we intend to. - bool evictionSuccessful = false; - { - auto toReleaseHandle = - itr->isChainedItem() - ? advanceIteratorAndTryEvictChainedItem(itr) - : advanceIteratorAndTryEvictRegularItem(mmContainer, itr); - evictionSuccessful = toReleaseHandle != nullptr; - // destroy toReleseHandle. The item won't be released to allocator - // since we marked it as exclusive. 
- } - - const auto ref = candidate->unmarkExclusive(); - if (ref == 0u) { - // Invalidate iterator since later on we may use this mmContainer - // again, which cannot be done unless we drop this iterator - itr.destroy(); - - // recycle the item. it's safe to do so, even if toReleaseHandle was - // NULL. If `ref` == 0 then it means that we are the last holder of - // that item. - if (candidate->hasChainedItem()) { - (*stats_.chainedItemEvictions)[pid][cid].inc(); - } else { - (*stats_.regularItemEvictions)[pid][cid].inc(); + ++itr; + XDCHECK(toRecycle == nullptr); + XDCHECK(candidate == nullptr); } + }); - if (auto eventTracker = getEventTracker()) { - eventTracker->record(AllocatorApiEvent::DRAM_EVICT, candidate->getKey(), - AllocatorApiResult::EVICTED, candidate->getSize(), - candidate->getConfiguredTTL().count()); - } + if (!toRecycle) + continue; - // check if by releasing the item we intend to, we actually - // recycle the candidate. - if (ReleaseRes::kRecycled == - releaseBackToAllocator(*candidate, RemoveContext::kEviction, - /* isNascent */ false, toRecycle)) { - return toRecycle; + XDCHECK(toRecycle); + XDCHECK(candidate); + + auto evictedToNext = lastTier ? nullptr + : tryEvictToNextMemoryTier(*candidate, false); + if (!evictedToNext) { + //if insertOrReplace was called during move + //then candidate will not be accessible (failed replace during tryEvict) + // - therefore this was why we failed to + // evict to the next tier and insertOrReplace + // will remove from NVM cache + //however, if candidate is accessible + //that means the allocation in the next + //tier failed - so we will continue to + //evict the item to NVM cache + bool failedToReplace = !candidate->isAccessible(); + if (!token.isValid() && !failedToReplace) { + token = createPutToken(*candidate); } + // tryEvictToNextMemoryTier can fail if: + // a) allocation of the new item fails in that case, + // it should be still possible to mark item for eviction. + // b) another thread calls insertOrReplace and the item + // is no longer accessible + // + // in case that we are on the last tier, we whould have already marked + // as exclusive since we will not be moving the item to the next tier + // but rather just evicting all together, no need to + // markForEvictionWhenMoving + auto ret = lastTier ? true : candidate->markForEvictionWhenMoving(); + XDCHECK(ret); + + unlinkItemForEviction(*candidate); + + if (token.isValid() && shouldWriteToNvmCacheExclusive(*candidate) + && !failedToReplace) { + nvmCache_->put(*candidate, std::move(token)); + } + // wake up any readers that wait for the move to complete + // it's safe to do now, as we have the item marked exclusive and + // no other reader can be added to the waiters list + wakeUpWaiters(*candidate, {}); + } else { - XDCHECK(!evictionSuccessful); + XDCHECK(!evictedToNext->isMarkedForEviction() && !evictedToNext->isMoving()); + XDCHECK(!candidate->isMarkedForEviction() && !candidate->isMoving()); + XDCHECK(!candidate->isAccessible()); + XDCHECK(candidate->getKey() == evictedToNext->getKey()); + + wakeUpWaiters(*candidate, std::move(evictedToNext)); } - // If we destroyed the itr to possibly evict and failed, we restart - // from the beginning again - if (!itr) { - itr.resetToBegin(); + XDCHECK(!candidate->isMarkedForEviction() && !candidate->isMoving()); + + // recycle the item. it's safe to do so, even if toReleaseHandle was + // NULL. If `ref` == 0 then it means that we are the last holder of + // that item. 
+ if (candidate->hasChainedItem()) {
+ (*stats_.chainedItemEvictions)[pid][cid].inc();
+ } else {
+ (*stats_.regularItemEvictions)[pid][cid].inc();
+ }
+
+ if (auto eventTracker = getEventTracker()) {
+ eventTracker->record(AllocatorApiEvent::DRAM_EVICT, candidate->getKey(),
+ AllocatorApiResult::EVICTED, candidate->getSize(),
+ candidate->getConfiguredTTL().count());
+ }
+
+ // check if by releasing the item we intend to, we actually
+ // recycle the candidate.
+ auto ret = releaseBackToAllocator(*candidate, RemoveContext::kEviction,
+ /* isNascent */ false, toRecycle);
+ if (ret == ReleaseRes::kRecycled) {
+ return toRecycle;
 }
 }
 return nullptr;
@@ -1363,6 +1846,97 @@ bool CacheAllocator::shouldWriteToNvmCacheExclusive(
 return true;
 }
+template
+typename CacheAllocator::WriteHandle
+CacheAllocator::tryEvictToNextMemoryTier(
+ TierId tid, PoolId pid, Item& item, bool fromBgThread) {
+ XDCHECK(item.isMoving());
+ XDCHECK(item.getRefCount() == 0);
+ if(item.hasChainedItem()) return WriteHandle{}; // TODO: We do not support ChainedItem yet
+ if(item.isExpired()) {
+ accessContainer_->remove(item);
+ item.unmarkMoving();
+ return acquire(&item);
+ }
+
+ TierId nextTier = tid;
+ while (++nextTier < getNumTiers()) { // try to evict down to the next memory tier
+ if (!shouldEvictToNextMemoryTier(tid, nextTier, pid, item))
+ continue;
+
+ // allocateInternalTier might trigger another eviction
+ auto newItemHdl = allocateInternalTier(nextTier, pid,
+ item.getKey(),
+ item.getSize(),
+ item.getCreationTime(),
+ item.getExpiryTime(),
+ fromBgThread);
+
+ if (newItemHdl) {
+ XDCHECK_EQ(newItemHdl->getSize(), item.getSize());
+ if (!moveRegularItemWithSync(item, newItemHdl)) {
+ return WriteHandle{};
+ }
+ XDCHECK_EQ(newItemHdl->getKey(), item.getKey());
+ item.unmarkMoving();
+ return newItemHdl;
+ } else {
+ return WriteHandle{};
+ }
+ }
+
+ return {};
+}
+
+template
+typename CacheAllocator::WriteHandle
+CacheAllocator::tryEvictToNextMemoryTier(Item& item, bool fromBgThread) {
+ auto tid = getTierId(item);
+ auto pid = allocator_[tid]->getAllocInfo(item.getMemory()).poolId;
+ return tryEvictToNextMemoryTier(tid, pid, item, fromBgThread);
+}
+
+template
+typename CacheAllocator::WriteHandle
+CacheAllocator::tryPromoteToNextMemoryTier(
+ TierId tid, PoolId pid, Item& item, bool fromBgThread) {
+ if(item.isExpired()) { return {}; }
+ TierId nextTier = tid;
+ while (nextTier > 0) { // try to promote up to the next memory tier
+ auto toPromoteTier = nextTier - 1;
+ --nextTier;
+
+ // allocateInternalTier might trigger another eviction
+ auto newItemHdl = allocateInternalTier(toPromoteTier, pid,
+ item.getKey(),
+ item.getSize(),
+ item.getCreationTime(),
+ item.getExpiryTime(),
+ fromBgThread);
+
+ if (newItemHdl) {
+ XDCHECK_EQ(newItemHdl->getSize(), item.getSize());
+ if (!moveRegularItemWithSync(item, newItemHdl)) {
+ return WriteHandle{};
+ }
+ item.unmarkMoving();
+ return newItemHdl;
+ } else {
+ return WriteHandle{};
+ }
+ }
+
+ return {};
+}
+
+template
+typename CacheAllocator::WriteHandle
+CacheAllocator::tryPromoteToNextMemoryTier(Item& item, bool fromBgThread) {
+ auto tid = getTierId(item);
+ auto pid = allocator_[tid]->getAllocInfo(item.getMemory()).poolId;
+ return tryPromoteToNextMemoryTier(tid, pid, item, fromBgThread);
+}
+
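Note that the eviction path above no longer holds a container iterator across the whole search; candidates are visited inside `withEvictionIterator`, which runs the callback under the container's internal lock. A minimal sketch of that pattern (illustrative only; the predicate and variable names are not part of this diff):

```cpp
// Illustrative sketch of the withEvictionIterator pattern. Any state needed
// after the callback returns must be copied out, because the iterator is
// only valid while the container lock is held.
Item* victim = nullptr;
mmContainer.withEvictionIterator([&victim](auto&& itr) {
  while (itr) {
    if (itr->getRefCount() == 0) { // plausible predicate, for illustration
      victim = itr.get();          // capture the raw pointer
      return;                      // iterator dies when the callback exits
    }
    ++itr;
  }
});
// `victim` could become stale once the lock is released; findEviction()
// guards against that by marking the item moving/for-eviction while still
// inside the callback.
```
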
 template
 typename CacheAllocator::RemoveRes
 CacheAllocator::remove(typename Item::Key key) {
@@ -1453,7 +2027,7 @@ bool CacheAllocator::pushToNvmCacheFromRamForTesting(
 if (handle && nvmCache_ && shouldWriteToNvmCache(*handle) &&
 shouldWriteToNvmCacheExclusive(*handle)) {
- nvmCache_->put(handle, nvmCache_->createPutToken(handle->getKey()));
+ nvmCache_->put(*handle, nvmCache_->createPutToken(handle->getKey()));
 return true;
 }
 return false;
@@ -1563,21 +2137,57 @@ void CacheAllocator::invalidateNvm(Item& item) {
 }
 }
+template
+TierId
+CacheAllocator::getTierId(const Item& item) const {
+ return getTierId(item.getMemory());
+}
+
+template
+TierId
+CacheAllocator::getTierId(const void* ptr) const {
+ for (TierId tid = 0; tid < getNumTiers(); tid++) {
+ if (allocator_[tid]->isMemoryInAllocator(ptr))
+ return tid;
+ }
+
+ throw std::invalid_argument("Item does not belong to any tier!");
+}
+
 template
 typename CacheAllocator::MMContainer&
 CacheAllocator::getMMContainer(const Item& item) const noexcept {
+ const auto tid = getTierId(item);
 const auto allocInfo =
- allocator_->getAllocInfo(static_cast(&item));
- return getMMContainer(allocInfo.poolId, allocInfo.classId);
+ allocator_[tid]->getAllocInfo(static_cast(&item));
+ return getMMContainer(tid, allocInfo.poolId, allocInfo.classId);
 }
 
 template
 typename CacheAllocator::MMContainer&
-CacheAllocator::getMMContainer(PoolId pid,
+CacheAllocator::getMMContainer(TierId tid,
+ PoolId pid,
 ClassId cid) const noexcept {
- XDCHECK_LT(static_cast(pid), mmContainers_.size());
- XDCHECK_LT(static_cast(cid), mmContainers_[pid].size());
- return *mmContainers_[pid][cid];
+ XDCHECK_LT(static_cast(tid), mmContainers_.size());
+ XDCHECK_LT(static_cast(pid), mmContainers_[tid].size());
+ XDCHECK_LT(static_cast(cid), mmContainers_[tid][pid].size());
+ return *mmContainers_[tid][pid][cid];
+}
+
+template
+MMContainerStat CacheAllocator::getMMContainerStat(
+ TierId tid, PoolId pid, ClassId cid) const noexcept {
+ if(static_cast(tid) >= mmContainers_.size()) {
+ return MMContainerStat{};
+ }
+ if (static_cast(pid) >= mmContainers_[tid].size()) {
+ return MMContainerStat{};
+ }
+ if (static_cast(cid) >= mmContainers_[tid][pid].size()) {
+ return MMContainerStat{};
+ }
+ return mmContainers_[tid][pid][cid] ? mmContainers_[tid][pid][cid]->getStats()
+ : MMContainerStat{};
 }
 
 template
@@ -1763,8 +2373,9 @@ void CacheAllocator::markUseful(const ReadHandle& handle,
 template
 bool CacheAllocator::recordAccessInMMContainer(Item& item,
 AccessMode mode) {
+ const auto tid = getTierId(item);
 const auto allocInfo =
- allocator_->getAllocInfo(static_cast(&item));
+ allocator_[tid]->getAllocInfo(static_cast(&item));
 (*stats_.cacheHits)[allocInfo.poolId][allocInfo.classId].inc();
 
 // track recently accessed items if needed
@@ -1772,14 +2383,15 @@ bool CacheAllocator::recordAccessInMMContainer(Item& item,
 ring_->trackItem(reinterpret_cast(&item), item.getSize());
 }
 
- auto& mmContainer = getMMContainer(allocInfo.poolId, allocInfo.classId);
+ auto& mmContainer = getMMContainer(tid, allocInfo.poolId, allocInfo.classId);
 return mmContainer.recordAccess(item, mode);
 }
 
 template
 uint32_t CacheAllocator::getUsableSize(const Item& item) const {
+ const auto tid = getTierId(item);
 const auto allocSize =
- allocator_->getAllocInfo(static_cast(&item)).allocSize;
+ allocator_[tid]->getAllocInfo(static_cast(&item)).allocSize;
 return item.isChainedItem()
 ? allocSize - ChainedItem::getRequiredSize(0)
 : allocSize - Item::getRequiredSize(item.getKey(), 0);
@@ -1788,8 +2400,10 @@ uint32_t CacheAllocator::getUsableSize(const Item& item) const {
 template
 typename CacheAllocator::SampleItem
 CacheAllocator::getSampleItem() {
- size_t nvmCacheSize = nvmCache_ ?
nvmCache_->getUsableSize() : 0; - size_t ramCacheSize = allocator_->getMemorySizeInclAdvised(); + // TODO: is using random tier a good idea? + auto tid = folly::Random::rand32() % getNumTiers(); + static size_t nvmCacheSize = nvmCache_ ? nvmCache_->getUsableSize() : 0; + static size_t ramCacheSize = allocator_[tid]->getMemorySizeInclAdvised(); bool fromNvm = folly::Random::rand64(0, nvmCacheSize + ramCacheSize) >= ramCacheSize; @@ -1798,19 +2412,18 @@ CacheAllocator::getSampleItem() { } // Sampling from DRAM cache - auto item = reinterpret_cast(allocator_->getRandomAlloc()); + auto item = reinterpret_cast(allocator_[tid]->getRandomAlloc()); if (!item) { return SampleItem{false /* fromNvm */}; } // Check that item returned is the same that was sampled - auto sharedHdl = std::make_shared(findInternal(item->getKey())); if (sharedHdl->get() != item) { return SampleItem{false /* fromNvm */}; } - const auto allocInfo = allocator_->getAllocInfo(item->getMemory()); + const auto allocInfo = allocator_[tid]->getAllocInfo(item->getMemory()); // Convert the Item to IOBuf to make SampleItem auto iobuf = folly::IOBuf{ @@ -1829,28 +2442,33 @@ CacheAllocator::getSampleItem() { template std::vector CacheAllocator::dumpEvictionIterator( - PoolId pid, ClassId cid, size_t numItems) { + PoolId pid, ClassId cid, size_t numItems) { if (numItems == 0) { return {}; } - if (static_cast(pid) >= mmContainers_.size() || - static_cast(cid) >= mmContainers_[pid].size()) { + // Always evict from the lowest layer. + int tid = getNumTiers() - 1; + + if (static_cast(tid) >= mmContainers_.size() || + static_cast(pid) >= mmContainers_[tid].size() || + static_cast(cid) >= mmContainers_[tid][pid].size()) { throw std::invalid_argument( - folly::sformat("Invalid PoolId: {} and ClassId: {}.", pid, cid)); + folly::sformat("Invalid TierId: {} and PoolId: {} and ClassId: {}.", tid, pid, cid)); } std::vector content; - auto& mm = *mmContainers_[pid][cid]; - auto evictItr = mm.getEvictionIterator(); - size_t i = 0; - while (evictItr && i < numItems) { - content.push_back(evictItr->toString()); - ++evictItr; - ++i; + while (tid >= 0) { + auto& mm = *mmContainers_[tid][pid][cid]; + mm.withEvictionIterator([&content, numItems](auto&& itr) { + while (itr && content.size() < numItems) { + content.push_back(itr->toString()); + ++itr; + } + }); + --tid; } - return content; } @@ -2026,19 +2644,50 @@ PoolId CacheAllocator::addPool( std::shared_ptr resizeStrategy, bool ensureProvisionable) { folly::SharedMutex::WriteHolder w(poolsResizeAndRebalanceLock_); - auto pid = allocator_->addPool(name, size, allocSizes, ensureProvisionable); + + PoolId pid = 0; + size_t totalCacheSize = 0; + + for (TierId tid = 0; tid < getNumTiers(); tid++) { + totalCacheSize += allocator_[tid]->getMemorySize(); + } + + for (TierId tid = 0; tid < getNumTiers(); tid++) { + auto tierSizeRatio = + static_cast(allocator_[tid]->getMemorySize()) / totalCacheSize; + size_t tierPoolSize = static_cast(tierSizeRatio * size); + + // TODO: what if we manage to add pool only in one tier? 
+ // we should probably remove that on failure + auto res = allocator_[tid]->addPool( + name, tierPoolSize, allocSizes, ensureProvisionable); + XDCHECK(tid == 0 || res == pid); + pid = res; + } + createMMContainers(pid, std::move(config)); setRebalanceStrategy(pid, std::move(rebalanceStrategy)); setResizeStrategy(pid, std::move(resizeStrategy)); + + if (backgroundEvictor_.size()) { + for (size_t id = 0; id < backgroundEvictor_.size(); id++) + backgroundEvictor_[id]->setAssignedMemory(getAssignedMemoryToBgWorker(id, backgroundEvictor_.size(), 0)); + } + + if (backgroundPromoter_.size()) { + for (size_t id = 0; id < backgroundPromoter_.size(); id++) + backgroundPromoter_[id]->setAssignedMemory(getAssignedMemoryToBgWorker(id, backgroundPromoter_.size(), 1)); + } + return pid; } template void CacheAllocator::overridePoolRebalanceStrategy( PoolId pid, std::shared_ptr rebalanceStrategy) { - if (static_cast(pid) >= mmContainers_.size()) { + if (static_cast(pid) >= mmContainers_[0].size()) { throw std::invalid_argument(folly::sformat( - "Invalid PoolId: {}, size of pools: {}", pid, mmContainers_.size())); + "Invalid PoolId: {}, size of pools: {}", pid, mmContainers_[0].size())); } setRebalanceStrategy(pid, std::move(rebalanceStrategy)); } @@ -2046,9 +2695,9 @@ void CacheAllocator::overridePoolRebalanceStrategy( template void CacheAllocator::overridePoolResizeStrategy( PoolId pid, std::shared_ptr resizeStrategy) { - if (static_cast(pid) >= mmContainers_.size()) { + if (static_cast(pid) >= mmContainers_[0].size()) { throw std::invalid_argument(folly::sformat( - "Invalid PoolId: {}, size of pools: {}", pid, mmContainers_.size())); + "Invalid PoolId: {}, size of pools: {}", pid, mmContainers_[0].size())); } setResizeStrategy(pid, std::move(resizeStrategy)); } @@ -2060,14 +2709,14 @@ void CacheAllocator::overridePoolOptimizeStrategy( } template -void CacheAllocator::overridePoolConfig(PoolId pid, +void CacheAllocator::overridePoolConfig(TierId tid, PoolId pid, const MMConfig& config) { - if (static_cast(pid) >= mmContainers_.size()) { + // TODO: add generic tier id checking + if (static_cast(pid) >= mmContainers_[tid].size()) { throw std::invalid_argument(folly::sformat( - "Invalid PoolId: {}, size of pools: {}", pid, mmContainers_.size())); + "Invalid PoolId: {}, size of pools: {}", pid, mmContainers_[tid].size())); } - - auto& pool = allocator_->getPool(pid); + auto& pool = allocator_[tid]->getPool(pid); for (unsigned int cid = 0; cid < pool.getNumClassId(); ++cid) { MMConfig mmConfig = config; mmConfig.addExtraConfig( @@ -2075,29 +2724,35 @@ void CacheAllocator::overridePoolConfig(PoolId pid, ? pool.getAllocationClass(static_cast(cid)) .getAllocsPerSlab() : 0); - DCHECK_NOTNULL(mmContainers_[pid][cid].get()); - mmContainers_[pid][cid]->setConfig(mmConfig); + DCHECK_NOTNULL(mmContainers_[tid][pid][cid].get()); + mmContainers_[tid][pid][cid]->setConfig(mmConfig); } } template void CacheAllocator::createMMContainers(const PoolId pid, MMConfig config) { - auto& pool = allocator_->getPool(pid); + // pools on each layer should have the same number of class id, etc. + // TODO: think about deduplication + auto& pool = allocator_[0]->getPool(pid); + for (unsigned int cid = 0; cid < pool.getNumClassId(); ++cid) { config.addExtraConfig( config_.trackTailHits ? 
pool.getAllocationClass(static_cast(cid)) .getAllocsPerSlab() : 0); - mmContainers_[pid][cid].reset(new MMContainer(config, compressor_)); + for (TierId tid = 0; tid < getNumTiers(); tid++) { + mmContainers_[tid][pid][cid].reset(new MMContainer(config, compressor_)); + } } } template PoolId CacheAllocator::getPoolId( folly::StringPiece name) const noexcept { - return allocator_->getPoolId(name.str()); + // each tier has the same pools + return allocator_[0]->getPoolId(name.str()); } // The Function returns a consolidated vector of Release Slab @@ -2140,7 +2795,9 @@ std::set CacheAllocator::filterCompactCachePools( template std::set CacheAllocator::getRegularPoolIds() const { folly::SharedMutex::ReadHolder r(poolsResizeAndRebalanceLock_); - return filterCompactCachePools(allocator_->getPoolIds()); + // TODO - get rid of the duplication - right now, each tier + // holds pool objects with mostly the same info + return filterCompactCachePools(allocator_[0]->getPoolIds()); } template @@ -2165,10 +2822,9 @@ std::set CacheAllocator::getRegularPoolIdsForResize() // getAdvisedMemorySize - then pools may be overLimit even when // all slabs are not allocated. Otherwise, pools may be overLimit // only after all slabs are allocated. - // - return (allocator_->allSlabsAllocated()) || - (allocator_->getAdvisedMemorySize() != 0) - ? filterCompactCachePools(allocator_->getPoolsOverLimit()) + return (allocator_[currentTier()]->allSlabsAllocated()) || + (allocator_[currentTier()]->getAdvisedMemorySize() != 0) + ? filterCompactCachePools(allocator_[currentTier()]->getPoolsOverLimit()) : std::set{}; } @@ -2177,9 +2833,19 @@ const std::string CacheAllocator::getCacheName() const { return config_.cacheName; } +template +size_t CacheAllocator::getPoolSize(PoolId poolId) const { + size_t poolSize = 0; + for (auto& allocator: allocator_) { + const auto& pool = allocator->getPool(poolId); + poolSize += pool.getPoolSize(); + } + return poolSize; +} + template PoolStats CacheAllocator::getPoolStats(PoolId poolId) const { - const auto& pool = allocator_->getPool(poolId); + const auto& pool = allocator_[currentTier()]->getPool(poolId); const auto& allocSizes = pool.getAllocSizes(); auto mpStats = pool.getStats(); const auto& classIds = mpStats.classIds; @@ -2198,7 +2864,7 @@ PoolStats CacheAllocator::getPoolStats(PoolId poolId) const { if (!isCompactCache) { for (const ClassId cid : classIds) { uint64_t classHits = (*stats_.cacheHits)[poolId][cid].get(); - XDCHECK(mmContainers_[poolId][cid], + XDCHECK(mmContainers_[currentTier()][poolId][cid], folly::sformat("Pid {}, Cid {} not initialized.", poolId, cid)); cacheStats.insert( {cid, @@ -2208,16 +2874,14 @@ PoolStats CacheAllocator::getPoolStats(PoolId poolId) const { (*stats_.fragmentationSize)[poolId][cid].get(), classHits, (*stats_.chainedItemEvictions)[poolId][cid].get(), (*stats_.regularItemEvictions)[poolId][cid].get(), - mmContainers_[poolId][cid]->getStats()} - - }); + getMMContainerStat(currentTier(), poolId, cid)}}); totalHits += classHits; } } PoolStats ret; ret.isCompactCache = isCompactCache; - ret.poolName = allocator_->getPoolName(poolId); + ret.poolName = allocator_[currentTier()]->getPoolName(poolId); ret.poolSize = pool.getPoolSize(); ret.poolUsableSize = pool.getPoolUsableSize(); ret.poolAdvisedSize = pool.getPoolAdvisedSize(); @@ -2229,29 +2893,39 @@ PoolStats CacheAllocator::getPoolStats(PoolId poolId) const { return ret; } +template +ACStats CacheAllocator::getACStats(TierId tid, + PoolId poolId, + ClassId classId) const { + const auto& pool = 
allocator_[tid]->getPool(poolId); + const auto& ac = pool.getAllocationClass(classId); + + auto stats = ac.getStats(); + stats.allocLatencyNs = (*stats_.classAllocLatency)[tid][poolId][classId]; + return stats; +} + template PoolEvictionAgeStats CacheAllocator::getPoolEvictionAgeStats( PoolId pid, unsigned int slabProjectionLength) const { PoolEvictionAgeStats stats; - - const auto& pool = allocator_->getPool(pid); + const auto& pool = allocator_[currentTier()]->getPool(pid); const auto& allocSizes = pool.getAllocSizes(); for (ClassId cid = 0; cid < static_cast(allocSizes.size()); ++cid) { - auto& mmContainer = getMMContainer(pid, cid); + auto& mmContainer = getMMContainer(currentTier(), pid, cid); const auto numItemsPerSlab = - allocator_->getPool(pid).getAllocationClass(cid).getAllocsPerSlab(); + allocator_[currentTier()]->getPool(pid).getAllocationClass(cid).getAllocsPerSlab(); const auto projectionLength = numItemsPerSlab * slabProjectionLength; stats.classEvictionAgeStats[cid] = mmContainer.getEvictionAgeStat(projectionLength); } - return stats; } template CacheMetadata CacheAllocator::getCacheMetadata() const noexcept { return CacheMetadata{kCachelibVersion, kCacheRamFormatVersion, - kCacheNvmFormatVersion, config_.size}; + kCacheNvmFormatVersion, config_.getCacheSize()}; } template @@ -2283,7 +2957,7 @@ void CacheAllocator::releaseSlab(PoolId pid, } try { - auto releaseContext = allocator_->startSlabRelease( + auto releaseContext = allocator_[currentTier()]->startSlabRelease( pid, victim, receiver, mode, hint, [this]() -> bool { return shutDownInProgress_; }); @@ -2292,15 +2966,15 @@ void CacheAllocator::releaseSlab(PoolId pid, return; } - releaseSlabImpl(releaseContext); - if (!allocator_->allAllocsFreed(releaseContext)) { + releaseSlabImpl(currentTier(), releaseContext); + if (!allocator_[currentTier()]->allAllocsFreed(releaseContext)) { throw std::runtime_error( folly::sformat("Was not able to free all allocs. PoolId: {}, AC: {}", releaseContext.getPoolId(), releaseContext.getClassId())); } - allocator_->completeSlabRelease(releaseContext); + allocator_[currentTier()]->completeSlabRelease(releaseContext); } catch (const exception::SlabReleaseAborted& e) { stats_.numAbortedSlabReleases.inc(); throw exception::SlabReleaseAborted(folly::sformat( @@ -2311,8 +2985,7 @@ void CacheAllocator::releaseSlab(PoolId pid, } template -SlabReleaseStats CacheAllocator::getSlabReleaseStats() - const noexcept { +SlabReleaseStats CacheAllocator::getSlabReleaseStats() const noexcept { std::lock_guard l(workersMutex_); return SlabReleaseStats{stats_.numActiveSlabReleases.get(), stats_.numReleasedForRebalance.get(), @@ -2330,7 +3003,7 @@ SlabReleaseStats CacheAllocator::getSlabReleaseStats() } template -void CacheAllocator::releaseSlabImpl( +void CacheAllocator::releaseSlabImpl(TierId tid, const SlabReleaseContext& releaseContext) { auto startTime = std::chrono::milliseconds(util::getCurrentTimeMs()); bool releaseStuck = false; @@ -2358,10 +3031,11 @@ void CacheAllocator::releaseSlabImpl( // 3. If 2 is successful, Move or Evict // 4. 
Move on to the next item if current item is freed for (auto alloc : releaseContext.getActiveAllocations()) { + auto startTimeSec = util::getCurrentTimeSec(); // Need to mark an item for release before proceeding // If we can't mark as moving, it means the item is already freed const bool isAlreadyFreed = - !markExclusiveForSlabRelease(releaseContext, alloc, throttler); + !markMovingForSlabRelease(releaseContext, alloc, throttler); if (isAlreadyFreed) { continue; } @@ -2375,7 +3049,7 @@ void CacheAllocator::releaseSlabImpl( if (!isMoved) { evictForSlabRelease(releaseContext, item, throttler); } - XDCHECK(allocator_->isAllocFreed(releaseContext, alloc)); + XDCHECK(allocator_[tid]->isAllocFreed(releaseContext, alloc)); } } @@ -2389,6 +3063,14 @@ void CacheAllocator::throttleWith(util::Throttler& t, } } +template +typename RefcountWithFlags::Value CacheAllocator::unmarkMovingAndWakeUpWaiters(Item &item, WriteHandle handle) +{ + auto ret = item.unmarkMoving(); + wakeUpWaiters(item, std::move(handle)); + return ret; +} + template bool CacheAllocator::moveForSlabRelease( const SlabReleaseContext& ctx, Item& oldItem, util::Throttler& throttler) { @@ -2398,7 +3080,7 @@ bool CacheAllocator::moveForSlabRelease( bool isMoved = false; auto startTime = util::getCurrentTimeSec(); - WriteHandle newItemHdl = allocateNewItemForOldItem(oldItem); + WriteHandle newItemHdl{}; for (unsigned int itemMovingAttempts = 0; itemMovingAttempts < config_.movingTries; @@ -2406,16 +3088,24 @@ bool CacheAllocator::moveForSlabRelease( stats_.numMoveAttempts.inc(); // Nothing to move and the key is likely also bogus for chained items. - if (oldItem.isOnlyExclusive()) { - oldItem.unmarkExclusive(); + if (oldItem.isOnlyMoving()) { + auto ret = unmarkMovingAndWakeUpWaiters(oldItem, {}); + XDCHECK(ret == 0); const auto res = releaseBackToAllocator(oldItem, RemoveContext::kNormal, false); XDCHECK(res == ReleaseRes::kReleased); return true; } + throttleWith(throttler, [&] { + XLOGF(WARN, + "Spent {} seconds, slab release still trying to move Item: {}. " + "Pool: {}, Class: {}.", + util::getCurrentTimeSec() - startTime, oldItem.toString(), + ctx.getPoolId(), ctx.getClassId()); + }); + if (!newItemHdl) { - // try to allocate again if it previously wasn't successful newItemHdl = allocateNewItemForOldItem(oldItem); } @@ -2426,14 +3116,6 @@ bool CacheAllocator::moveForSlabRelease( break; } } - - throttleWith(throttler, [&] { - XLOGF(WARN, - "Spent {} seconds, slab release still trying to move Item: {}. " - "Pool: {}, Class: {}.", - util::getCurrentTimeSec() - startTime, oldItem.toString(), - ctx.getPoolId(), ctx.getClassId()); - }); } // Return false if we've exhausted moving tries. @@ -2446,7 +3128,7 @@ bool CacheAllocator::moveForSlabRelease( // that's identical to this one to replace it. Here we just need to wait // until all users have dropped the item handles before we can proceed. 
startTime = util::getCurrentTimeSec(); - while (!oldItem.isOnlyExclusive()) { + while (!oldItem.isOnlyMoving()) { throttleWith(throttler, [&] { XLOGF(WARN, "Spent {} seconds, slab release still waiting for refcount to " @@ -2455,8 +3137,12 @@ bool CacheAllocator::moveForSlabRelease( ctx.getPoolId(), ctx.getClassId()); }); } - const auto allocInfo = allocator_->getAllocInfo(oldItem.getMemory()); - allocator_->free(&oldItem); + auto tid = getTierId(oldItem); + auto ref = unmarkMovingAndWakeUpWaiters(oldItem, std::move(newItemHdl)); + XDCHECK(ref == 0); + + const auto allocInfo = allocator_[tid]->getAllocInfo(oldItem.getMemory()); + allocator_[tid]->free(&oldItem); (*stats_.fragmentationSize)[allocInfo.poolId][allocInfo.classId].sub( util::getFragmentation(*this, oldItem)); @@ -2465,10 +3151,10 @@ bool CacheAllocator::moveForSlabRelease( } template -typename CacheAllocator::ReadHandle +typename CacheAllocator::WriteHandle CacheAllocator::validateAndGetParentHandleForChainedMoveLocked( const ChainedItem& item, const Key& parentKey) { - ReadHandle parentHandle{}; + WriteHandle parentHandle{}; try { parentHandle = findInternal(parentKey); // If the parent is not the same as the parent of the chained item, @@ -2487,6 +3173,7 @@ CacheAllocator::validateAndGetParentHandleForChainedMoveLocked( template typename CacheAllocator::WriteHandle CacheAllocator::allocateNewItemForOldItem(const Item& oldItem) { + XDCHECK(oldItem.isMoving()); if (oldItem.isChainedItem()) { const auto& oldChainedItem = oldItem.asChainedItem(); const auto parentKey = oldChainedItem.getParentItem(compressor_).getKey(); @@ -2500,33 +3187,35 @@ CacheAllocator::allocateNewItemForOldItem(const Item& oldItem) { return {}; } - // Set up the destination for the move. Since oldChainedItem would have - // the exclusive bit set, it won't be picked for eviction. + // Set up the destination for the move. Since oldChainedItem would + // be marked as moving, it won't be picked for eviction. auto newItemHdl = - allocateChainedItemInternal(parentHandle, oldChainedItem.getSize()); + allocateChainedItemInternal(parentHandle, oldItem.getSize()); if (!newItemHdl) { return {}; } - XDCHECK_EQ(newItemHdl->getSize(), oldChainedItem.getSize()); + XDCHECK_EQ(newItemHdl->getSize(), oldItem.getSize()); auto parentPtr = parentHandle.getInternal(); XDCHECK_EQ(reinterpret_cast(parentPtr), reinterpret_cast( - &oldChainedItem.getParentItem(compressor_))); + &newItemHdl->asChainedItem().getParentItem(compressor_))); return newItemHdl; } const auto allocInfo = - allocator_->getAllocInfo(static_cast(&oldItem)); + allocator_[getTierId(oldItem)]->getAllocInfo(static_cast(&oldItem)); // Set up the destination for the move. Since oldItem would have the moving // bit set, it won't be picked for eviction. - auto newItemHdl = allocateInternal(allocInfo.poolId, - oldItem.getKey(), - oldItem.getSize(), - oldItem.getCreationTime(), - oldItem.getExpiryTime()); + auto newItemHdl = allocateInternalTier(getTierId(oldItem), + allocInfo.poolId, + oldItem.getKey(), + oldItem.getSize(), + oldItem.getCreationTime(), + oldItem.getExpiryTime(), + false); if (!newItemHdl) { return {}; } @@ -2545,7 +3234,8 @@ bool CacheAllocator::tryMovingForSlabRelease( // a regular item or chained item is synchronized with any potential // user-side mutation. 
std::unique_ptr syncObj;
- if (config_.movingSync) {
+ if (config_.movingSync && getNumTiers() == 1) {
+ // TODO: use moving-bit synchronization for single tier as well
 if (!oldItem.isChainedItem()) {
 syncObj = config_.movingSync(oldItem.getKey());
 } else {
 // item is still valid.
 const std::string parentKey =
 oldItem.asChainedItem().getParentItem(compressor_).getKey().str();
- if (oldItem.isOnlyExclusive()) {
+ if (oldItem.isOnlyMoving()) {
 // If chained item no longer has a refcount, its parent is already
 // being released, so we abort this try to moving.
 return false;
 }
@@ -2570,67 +3260,59 @@ bool CacheAllocator::tryMovingForSlabRelease(
 }
 }
 
- return oldItem.isChainedItem()
- ? moveChainedItem(oldItem.asChainedItem(), newItemHdl)
- : moveRegularItem(oldItem, newItemHdl);
+ // TODO: we can unify move*Item and move*ItemWithSync by always
+ // using the moving bit to block readers.
+ if (getNumTiers() == 1) {
+ return oldItem.isChainedItem()
+ ? moveChainedItem(oldItem.asChainedItem(), newItemHdl)
+ : moveRegularItem(oldItem, newItemHdl);
+ } else {
+ if (oldItem.isChainedItem() || oldItem.hasChainedItem()) {
+ // TODO: add support for chained items
+ return false;
+ } else {
+ // the move can fail if another thread calls insertOrReplace;
+ // in that case oldItem is no longer valid (not accessible,
+ // it gets removed from the MMContainer) and evictForSlabRelease
+ // will send it back to the allocator
+ bool ret = moveRegularItemWithSync(oldItem, newItemHdl);
+ if (!ret) {
+ // we failed to move - newItemHdl was released back to the
+ // allocator by moveRegularItemWithSync, but oldItem is not
+ // accessible and no longer valid - we need to clean it up here
+ XDCHECK(!oldItem.isAccessible());
+ oldItem.markForEvictionWhenMoving();
+ unlinkItemForEviction(oldItem);
+ wakeUpWaiters(oldItem, {});
+ } else {
+ removeFromMMContainer(oldItem);
+ }
+ return ret;
+ }
+ }
+}
+
+template
+void CacheAllocator::wakeUpWaiters(Item& item, WriteHandle handle)
+{
+ // readers do not block on 'moving' items in case there is only one tier
+ if (getNumTiers() > 1) {
+ wakeUpWaitersLocked(item.getKey(), std::move(handle));
+ }
}

 template
 void CacheAllocator::evictForSlabRelease(
 const SlabReleaseContext& ctx, Item& item, util::Throttler& throttler) {
 auto startTime = util::getCurrentTimeSec();
+ while (true) {
+ XDCHECK(item.isMoving());
 stats_.numEvictionAttempts.inc();
- // if the item is already in a state where only the exclusive bit is set,
- // nothing needs to be done. We simply need to unmark exclusive bit and free
- // the item.
- if (item.isOnlyExclusive()) {
- item.unmarkExclusive();
- const auto res =
- releaseBackToAllocator(item, RemoveContext::kNormal, false);
- XDCHECK(ReleaseRes::kReleased == res);
- return;
- }
-
- // Since we couldn't move, we now evict this item. Owning handle will be
- // the item's handle for regular/normal items and will be the parent
- // handle for chained items.
- auto owningHandle =
- item.isChainedItem()
- ? evictChainedItemForSlabRelease(item.asChainedItem())
- : evictNormalItemForSlabRelease(item);
-
- // we managed to evict the corresponding owner of the item and have the
- // last handle for the owner. 
- if (owningHandle) { - const auto allocInfo = - allocator_->getAllocInfo(static_cast(&item)); - if (owningHandle->hasChainedItem()) { - (*stats_.chainedItemEvictions)[allocInfo.poolId][allocInfo.classId] - .inc(); - } else { - (*stats_.regularItemEvictions)[allocInfo.poolId][allocInfo.classId] - .inc(); - } - - stats_.numEvictionSuccesses.inc(); - - // we have the last handle. no longer need to hold on to the exclusive bit - item.unmarkExclusive(); - - // manually decrement the refcount to call releaseBackToAllocator - const auto ref = decRef(*owningHandle); - XDCHECK(ref == 0); - const auto res = releaseBackToAllocator(*owningHandle.release(), - RemoveContext::kEviction, false); - XDCHECK(res == ReleaseRes::kReleased); - return; - } - if (shutDownInProgress_) { - item.unmarkExclusive(); - allocator_->abortSlabRelease(ctx); + auto ref = unmarkMovingAndWakeUpWaiters(item, {}); + allocator_[getTierId(item)]->abortSlabRelease(ctx); throw exception::SlabReleaseAborted( folly::sformat("Slab Release aborted while trying to evict" " Item: {} Pool: {}, Class: {}.", @@ -2649,268 +3331,141 @@ void CacheAllocator::evictForSlabRelease( .toString()) : ""); }); - } -} - -template -typename CacheAllocator::WriteHandle -CacheAllocator::advanceIteratorAndTryEvictRegularItem( - MMContainer& mmContainer, EvictionIterator& itr) { - // we should flush this to nvmcache if it is not already present in nvmcache - // and the item is not expired. - Item& item = *itr; - const bool evictToNvmCache = shouldWriteToNvmCache(item); - - auto token = evictToNvmCache ? nvmCache_->createPutToken(item.getKey()) - : typename NvmCacheT::PutToken{}; - - // record the in-flight eviciton. If not, we move on to next item to avoid - // stalling eviction. - if (evictToNvmCache && !token.isValid()) { - ++itr; - stats_.evictFailConcurrentFill.inc(); - return WriteHandle{}; - } - - // If there are other accessors, we should abort. Acquire a handle here since - // if we remove the item from both access containers and mm containers - // below, we will need a handle to ensure proper cleanup in case we end up - // not evicting this item - auto evictHandle = accessContainer_->removeIf(item, &itemExclusivePredicate); - if (!evictHandle) { - ++itr; - stats_.evictFailAC.inc(); - return evictHandle; - } - - mmContainer.remove(itr); - XDCHECK_EQ(reinterpret_cast(evictHandle.get()), - reinterpret_cast(&item)); - XDCHECK(!evictHandle->isInMMContainer()); - XDCHECK(!evictHandle->isAccessible()); - - // Invalidate iterator since later on if we are not evicting this - // item, we may need to rely on the handle we created above to ensure - // proper cleanup if the item's raw refcount has dropped to 0. - // And since this item may be a parent item that has some child items - // in this very same mmContainer, we need to make sure we drop this - // exclusive iterator so we can gain access to it when we're cleaning - // up the child items - itr.destroy(); - - // Ensure that there are no accessors after removing from the access - // container - XDCHECK(evictHandle->getRefCount() == 1); - - if (evictToNvmCache && shouldWriteToNvmCacheExclusive(item)) { - XDCHECK(token.isValid()); - nvmCache_->put(evictHandle, std::move(token)); - } - return evictHandle; -} - -template -typename CacheAllocator::WriteHandle -CacheAllocator::advanceIteratorAndTryEvictChainedItem( - EvictionIterator& itr) { - XDCHECK(itr->isChainedItem()); - - ChainedItem* candidate = &itr->asChainedItem(); - ++itr; - - // The parent could change at any point through transferChain. 
However, if
- // that happens, we would realize that the releaseBackToAllocator return
- // kNotRecycled and we would try another chained item, leading to transient
- // failure.
- auto& parent = candidate->getParentItem(compressor_);
+ // if the item is already in a state where only the moving bit is set,
+ // nothing needs to be done. We simply need to call unmarkMoving and free
+ // the item.
+ if (item.isOnlyMoving()) {
+ auto ref = unmarkMovingAndWakeUpWaiters(item, {});
+ XDCHECK(ref == 0);
+ const auto res =
+ releaseBackToAllocator(item, RemoveContext::kNormal, false);
+ XDCHECK(ReleaseRes::kReleased == res);
+ return;
+ }
- const bool evictToNvmCache = shouldWriteToNvmCache(parent);
+ typename NvmCacheT::PutToken token;
+ Item* evicted;
+ if (item.isChainedItem()) {
+ auto& expectedParent = item.asChainedItem().getParentItem(compressor_);
+
+ if (getNumTiers() == 1) {
+ // TODO: unify this with multi-tier implementation
+ // right now, taking a chained item lock here would lead to deadlock
+ const std::string parentKey = expectedParent.getKey().str();
+ auto l = chainedItemLocks_.lockExclusive(parentKey);
+
+ // check if the child is still in mmContainer and the expected parent is
+ // valid under the chained item lock.
+ if (expectedParent.getKey() != parentKey || !item.isInMMContainer() ||
+ item.isOnlyMoving() ||
+ &expectedParent != &item.asChainedItem().getParentItem(compressor_) ||
+ !expectedParent.isAccessible() || !expectedParent.hasChainedItem()) {
+ continue;
+ }
- auto token = evictToNvmCache ? nvmCache_->createPutToken(parent.getKey())
- : typename NvmCacheT::PutToken{};
+ // search if the child is present in the chain
+ {
+ auto parentHandle = findInternal(parentKey);
+ if (!parentHandle || parentHandle != &expectedParent) {
+ continue;
+ }
+
+ ChainedItem* head = nullptr;
+ { // scope for the handle
+ auto headHandle = findChainedItem(expectedParent);
+ head = headHandle ? &headHandle->asChainedItem() : nullptr;
+ }
+
+ bool found = false;
+ while (head) {
+ if (head == &item) {
+ found = true;
+ break;
+ }
+ head = head->getNext(compressor_);
+ }
+
+ if (!found) {
+ continue;
+ }
+ }
+ }
- // if token is invalid, return. iterator is already advanced.
- stats_.evictFailConcurrentFill.inc();
- return WriteHandle{};
- }
- // check if the parent exists in the hashtable and refcount is drained. 
- auto parentHandle = - accessContainer_->removeIf(parent, &itemExclusivePredicate); - if (!parentHandle) { - stats_.evictFailParentAC.inc(); - return parentHandle; - } + token = createPutToken(*evicted); + if (evicted->markForEvictionWhenMoving()) { + unlinkItemForEviction(*evicted); + wakeUpWaiters(*evicted, {}); + } else { + continue; + } + } - // Invalidate iterator since later on we may use the mmContainer - // associated with this iterator which cannot be done unless we - // drop this iterator - // - // This must be done once we know the parent is not nullptr. - // Since we can very well be the last holder of this parent item, - // which may have a chained item that is linked in this MM container. - itr.destroy(); + if (token.isValid() && shouldWriteToNvmCacheExclusive(*evicted)) { + nvmCache_->put(*evicted, std::move(token)); + } - // Ensure we have the correct parent and we're the only user of the - // parent, then free it from access container. Otherwise, we abort - XDCHECK_EQ(reinterpret_cast(&parent), - reinterpret_cast(parentHandle.get())); - XDCHECK_EQ(1u, parent.getRefCount()); + const auto allocInfo = + allocator_[getTierId(*evicted)]->getAllocInfo(static_cast(evicted)); + if (evicted->hasChainedItem()) { + (*stats_.chainedItemEvictions)[allocInfo.poolId][allocInfo.classId].inc(); + } else { + (*stats_.regularItemEvictions)[allocInfo.poolId][allocInfo.classId].inc(); + } - removeFromMMContainer(*parentHandle); + stats_.numEvictionSuccesses.inc(); - XDCHECK(!parent.isInMMContainer()); - XDCHECK(!parent.isAccessible()); + XDCHECK(evicted->getRefCount() == 0); + const auto res = + releaseBackToAllocator(*evicted, RemoveContext::kEviction, false); - if (evictToNvmCache && shouldWriteToNvmCacheExclusive(*parentHandle)) { - XDCHECK(token.isValid()); - XDCHECK(parentHandle->hasChainedItem()); - nvmCache_->put(parentHandle, std::move(token)); + if (getNumTiers() == 1) { + XDCHECK(res == ReleaseRes::kReleased); + } else { + const bool isAlreadyFreed = + !markMovingForSlabRelease(ctx, &item, throttler); + if (!isAlreadyFreed) { + continue; + } + } + + return; } - - return parentHandle; } template +template typename CacheAllocator::WriteHandle -CacheAllocator::evictNormalItemForSlabRelease(Item& item) { - XDCHECK(item.isExclusive()); - - if (item.isOnlyExclusive()) { - return WriteHandle{}; - } - - auto predicate = [](const Item& it) { return it.getRefCount() == 0; }; - - const bool evictToNvmCache = shouldWriteToNvmCache(item); - auto token = evictToNvmCache ? nvmCache_->createPutToken(item.getKey()) - : typename NvmCacheT::PutToken{}; - - // We remove the item from both access and mm containers. It doesn't matter - // if someone else calls remove on the item at this moment, the item cannot - // be freed as long as we have the exclusive bit set. - auto handle = accessContainer_->removeIf(item, std::move(predicate)); +CacheAllocator::removeIf(Item& item, Fn&& predicate) { + auto handle = accessContainer_->removeIf(item, std::forward(predicate)); - if (!handle) { - return handle; - } - - XDCHECK_EQ(reinterpret_cast(handle.get()), + if (handle) { + XDCHECK_EQ(reinterpret_cast(handle.get()), reinterpret_cast(&item)); - XDCHECK_EQ(1u, handle->getRefCount()); - removeFromMMContainer(item); - - // now that we are the only handle and we actually removed something from - // the RAM cache, we enqueue it to nvmcache. 
- if (evictToNvmCache && shouldWriteToNvmCacheExclusive(item)) { - nvmCache_->put(handle, std::move(token)); + removeFromMMContainer(item); } return handle; } -template -typename CacheAllocator::WriteHandle -CacheAllocator::evictChainedItemForSlabRelease(ChainedItem& child) { - XDCHECK(child.isExclusive()); - - // We have the child marked as moving, but dont know anything about the - // state of the parent. Unlike the case of regular eviction where we are - // sure that the child is inside the MMContainer, ensuring its parent is - // valid, we can not make any assumptions here. We try to find the parent - // first through the access container and then verify that the parent's - // chain points to the child before cleaning up the parent. If the parent - // was in the process of being re-allocated or child was being removed - // concurrently, we would synchronize here on one of the checks. - Item& expectedParent = child.getParentItem(compressor_); - - // Grab exclusive lock since we are modifying the chain. at this point, we - // dont know the state of the parent. so we need to do some validity checks - // after we have the chained item lock to ensure that we got the lock off of - // a valid state. - const std::string parentKey = expectedParent.getKey().str(); - auto l = chainedItemLocks_.lockExclusive(parentKey); - - // check if the child is still in mmContainer and the expected parent is - // valid under the chained item lock. - if (expectedParent.getKey() != parentKey || !child.isInMMContainer() || - child.isOnlyExclusive() || - &expectedParent != &child.getParentItem(compressor_) || - !expectedParent.isAccessible() || !expectedParent.hasChainedItem()) { - return {}; - } - - // search if the child is present in the chain - auto parentHandle = findInternal(parentKey); - if (!parentHandle || parentHandle != &expectedParent) { - return {}; - } - - ChainedItem* head = nullptr; - { // scope for the handle - auto headHandle = findChainedItem(expectedParent); - head = headHandle ? &headHandle->asChainedItem() : nullptr; - } - - bool found = false; - while (head) { - if (head == &child) { - found = true; - break; - } - head = head->getNext(compressor_); - } - - if (!found) { - return {}; - } - - // if we found the child in the parent's chain, we remove it and ensure that - // the handle we obtained was the last one. Before that, create a put token - // to guard any racing cache find to avoid item re-appearing in NvmCache. - const bool evictToNvmCache = shouldWriteToNvmCache(expectedParent); - - auto token = evictToNvmCache - ? nvmCache_->createPutToken(expectedParent.getKey()) - : typename NvmCacheT::PutToken{}; - - if (!accessContainer_->removeIf(expectedParent, - parentEvictForSlabReleasePredicate)) { - return {}; - } - - // at this point, we should be the last handle owner - XDCHECK_EQ(1u, parentHandle->getRefCount()); - - // We remove the parent from both access and mm containers. It doesn't - // matter if someone else calls remove on the parent at this moment, it - // cannot be freed since we hold an active item handle - removeFromMMContainer(*parentHandle); - - // In case someone else had removed this chained item from its parent by now - // So we check again to see if it has been unlinked from its parent - if (!child.isInMMContainer() || child.isOnlyExclusive()) { - return {}; - } - - // check after removing from the MMContainer that the parent is still not - // being marked as moving. If parent is moving, it will release the child - // item and we will wait for that. 
- if (parentHandle->isExclusive()) { - return {}; - } - - // now that we are the only handle and we actually removed something from - // the RAM cache, we enqueue it to nvmcache. - if (evictToNvmCache && shouldWriteToNvmCacheExclusive(*parentHandle)) { - DCHECK(parentHandle->hasChainedItem()); - nvmCache_->put(parentHandle, std::move(token)); - } - - return parentHandle; -} - template bool CacheAllocator::removeIfExpired(const ReadHandle& handle) { if (!handle) { @@ -2919,18 +3474,11 @@ bool CacheAllocator::removeIfExpired(const ReadHandle& handle) { // We remove the item from both access and mm containers. // We want to make sure the caller is the only one holding the handle. - auto removedHandle = - accessContainer_->removeIf(*(handle.getInternal()), itemExpiryPredicate); - if (removedHandle) { - removeFromMMContainer(*(handle.getInternal())); - return true; - } - - return false; + return (bool)removeIf(*(handle.getInternal()), itemExpiryPredicate); } template -bool CacheAllocator::markExclusiveForSlabRelease( +bool CacheAllocator::markMovingForSlabRelease( const SlabReleaseContext& ctx, void* alloc, util::Throttler& throttler) { // MemoryAllocator::processAllocForRelease will execute the callback // if the item is not already free. So there are three outcomes here: @@ -2945,18 +3493,23 @@ bool CacheAllocator::markExclusiveForSlabRelease( // At first, we assume this item was already freed bool itemFreed = true; bool markedMoving = false; - const auto fn = [&markedMoving, &itemFreed](void* memory) { + TierId tid = getTierId(alloc); + auto numTiers = getNumTiers(); + const auto fn = [&markedMoving, &itemFreed, numTiers](void* memory) { // Since this callback is executed, the item is not yet freed itemFreed = false; Item* item = static_cast(memory); - if (item->markExclusive()) { + // TODO: for chained items, moving bit is only used to avoid + // freeing the item prematurely + auto failIfRefNotZero = numTiers > 1 && !item->isChainedItem(); + if (item->markMoving(failIfRefNotZero)) { markedMoving = true; } }; auto startTime = util::getCurrentTimeSec(); while (true) { - allocator_->processAllocForRelease(ctx, alloc, fn); + allocator_[tid]->processAllocForRelease(ctx, alloc, fn); // If item is already freed we give up trying to mark the item moving // and return false, otherwise if marked as moving, we return true. @@ -2971,7 +3524,7 @@ bool CacheAllocator::markExclusiveForSlabRelease( itemFreed = true; if (shutDownInProgress_) { - allocator_->abortSlabRelease(ctx); + allocator_[tid]->abortSlabRelease(ctx); throw exception::SlabReleaseAborted( folly::sformat("Slab Release aborted while still trying to mark" " as moving for Item: {}. Pool: {}, Class: {}.", @@ -2994,12 +3547,15 @@ template CCacheT* CacheAllocator::addCompactCache(folly::StringPiece name, size_t size, Args&&... 
args) { + if (getNumTiers() != 1) + throw std::runtime_error("TODO: compact cache for multi-tier Cache not supported."); + if (!config_.isCompactCacheEnabled()) { throw std::logic_error("Compact cache is not enabled"); } folly::SharedMutex::WriteHolder lock(compactCachePoolsLock_); - auto poolId = allocator_->addPool(name, size, {Slab::kSize}); + auto poolId = allocator_[0]->addPool(name, size, {Slab::kSize}); isCompactCachePool_[poolId] = true; auto ptr = std::make_unique( @@ -3108,12 +3664,15 @@ folly::IOBufQueue CacheAllocator::saveStateToIOBuf() { *metadata_.numChainedChildItems() = stats_.numChainedChildItems.get(); *metadata_.numAbortedSlabReleases() = stats_.numAbortedSlabReleases.get(); + // TODO: implement serialization for multiple tiers auto serializeMMContainers = [](MMContainers& mmContainers) { MMSerializationTypeContainer state; - for (unsigned int i = 0; i < mmContainers.size(); ++i) { + for (unsigned int i = 0; i < 1 /* TODO: */ ; ++i) { for (unsigned int j = 0; j < mmContainers[i].size(); ++j) { - if (mmContainers[i][j]) { - state.pools_ref()[i][j] = mmContainers[i][j]->saveState(); + for (unsigned int k = 0; k < mmContainers[i][j].size(); ++k) { + if (mmContainers[i][j][k]) { + state.pools_ref()[j][k] = mmContainers[i][j][k]->saveState(); + } } } } @@ -3123,7 +3682,8 @@ folly::IOBufQueue CacheAllocator::saveStateToIOBuf() { serializeMMContainers(mmContainers_); AccessSerializationType accessContainerState = accessContainer_->saveState(); - MemoryAllocator::SerializationType allocatorState = allocator_->saveState(); + // TODO: foreach allocator + MemoryAllocator::SerializationType allocatorState = allocator_[0]->saveState(); CCacheManager::SerializationType ccState = compactCacheManager_->saveState(); AccessSerializationType chainedItemAccessContainerState = @@ -3148,6 +3708,8 @@ bool CacheAllocator::stopWorkers(std::chrono::seconds timeout) { success &= stopPoolResizer(timeout); success &= stopMemMonitor(timeout); success &= stopReaper(timeout); + success &= stopBackgroundEvictor(timeout); + success &= stopBackgroundPromoter(timeout); return success; } @@ -3185,6 +3747,8 @@ CacheAllocator::shutDown() { (shmShutDownStatus == ShmShutDownRes::kSuccess); shmManager_.reset(); + // TODO: save per-tier state + if (shmShutDownSucceeded) { if (!nvmShutDownStatusOpt || *nvmShutDownStatusOpt) return ShutDownStatus::kSuccess; @@ -3248,22 +3812,26 @@ CacheAllocator::deserializeMMContainers( const auto container = deserializer.deserialize(); - MMContainers mmContainers; + /* TODO: right now, we create empty containers because deserialization + * only works for a single (topmost) tier. */ + MMContainers mmContainers{getNumTiers()}; for (auto& kvPool : *container.pools_ref()) { auto i = static_cast(kvPool.first); auto& pool = getPool(i); for (auto& kv : kvPool.second) { auto j = static_cast(kv.first); - MMContainerPtr ptr = - std::make_unique(kv.second, - compressor); - auto config = ptr->getConfig(); - config.addExtraConfig(config_.trackTailHits - ? pool.getAllocationClass(j).getAllocsPerSlab() - : 0); - ptr->setConfig(config); - mmContainers[i][j] = std::move(ptr); + for (TierId tid = 0; tid < getNumTiers(); tid++) { + MMContainerPtr ptr = + std::make_unique(kv.second, + compressor); + auto config = ptr->getConfig(); + config.addExtraConfig(config_.trackTailHits + ? pool.getAllocationClass(j).getAllocsPerSlab() + : 0); + ptr->setConfig(config); + mmContainers[tid][i][j] = std::move(ptr); + } } } // We need to drop the unevictableMMContainer in the desierializer. 
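The save/restore path above still walks only the first tier (`i < 1 /* TODO: */`). A sketch of what the full multi-tier serialization loop might look like once the TODO is addressed; `tiers_ref` is a hypothetical field that does not exist in the current Thrift schema:

```cpp
// Hypothetical extension, assuming the serialized state grows a tier
// dimension (state.tiers_ref() is illustrative, not part of this diff).
for (unsigned int tid = 0; tid < mmContainers.size(); ++tid) {
  for (unsigned int pid = 0; pid < mmContainers[tid].size(); ++pid) {
    for (unsigned int cid = 0; cid < mmContainers[tid][pid].size(); ++cid) {
      if (mmContainers[tid][pid][cid]) {
        state.tiers_ref()[tid][pid][cid] =
            mmContainers[tid][pid][cid]->saveState();
      }
    }
  }
}
```
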
@@ -3403,6 +3971,8 @@ GlobalCacheStats CacheAllocator::getGlobalCacheStats() const {
 ret.nvmUpTime = currTime - nvmCacheState_.getCreationTime();
 ret.nvmCacheEnabled = nvmCache_ ? nvmCache_->isEnabled() : false;
 ret.reaperStats = getReaperStats();
+ ret.evictionStats = getBackgroundMoverStats(MoverDir::Evict);
+ ret.promotionStats = getBackgroundMoverStats(MoverDir::Promote);
 ret.numActiveHandles = getNumActiveHandles();
 ret.isNewRamCache = cacheCreationTime_ == cacheInstanceCreationTime_;
@@ -3414,11 +3984,14 @@ GlobalCacheStats CacheAllocator::getGlobalCacheStats() const {
 
 template
 CacheMemoryStats CacheAllocator::getCacheMemoryStats() const {
- const auto totalCacheSize = allocator_->getMemorySize();
- const auto configuredTotalCacheSize = allocator_->getMemorySizeInclAdvised();
-
+ size_t totalCacheSize = 0;
+ size_t configuredTotalCacheSize = 0;
+ for(auto& allocator: allocator_) {
+ totalCacheSize += allocator->getMemorySize();
+ configuredTotalCacheSize += allocator->getMemorySizeInclAdvised();
+ }
 auto addSize = [this](size_t a, PoolId pid) {
- return a + allocator_->getPool(pid).getPoolSize();
+ return a + allocator_[currentTier()]->getPool(pid).getPoolSize();
 };
 const auto regularPoolIds = getRegularPoolIds();
 const auto ccCachePoolIds = getCCachePoolIds();
@@ -3431,9 +4004,9 @@ CacheMemoryStats CacheAllocator::getCacheMemoryStats() const {
 configuredTotalCacheSize,
 configuredRegularCacheSize,
 configuredCompactCacheSize,
- allocator_->getAdvisedMemorySize(),
+ allocator_[currentTier()]->getAdvisedMemorySize(),
 memMonitor_ ? memMonitor_->getMaxAdvisePct() : 0,
- allocator_->getUnreservedMemorySize(),
+ allocator_[currentTier()]->getUnreservedMemorySize(),
 nvmCache_ ? nvmCache_->getSize() : 0,
 util::getMemAvailable(),
 util::getRSSBytes()};
@@ -3586,6 +4159,64 @@ bool CacheAllocator::startNewReaper(
 return true;
 }
 
+template
+auto CacheAllocator::getAssignedMemoryToBgWorker(size_t evictorId, size_t numWorkers, TierId tid)
+{
+ std::vector assignedMemory;
+ // TODO: for now, only evict from tier 0
+ auto pools = filterCompactCachePools(allocator_[tid]->getPoolIds());
+ for (const auto pid : pools) {
+ const auto& mpStats = getPoolByTid(pid,tid).getStats();
+ for (const auto cid : mpStats.classIds) {
+ if (backgroundWorkerId(tid, pid, cid, numWorkers) == evictorId) {
+ assignedMemory.emplace_back(tid, pid, cid);
+ }
+ }
+ }
+ return assignedMemory;
+}
+
+template
+bool CacheAllocator::startNewBackgroundEvictor(
+ std::chrono::milliseconds interval,
+ std::shared_ptr strategy,
+ size_t threads) {
+ XDCHECK(threads > 0);
+ backgroundEvictor_.resize(threads);
+ bool result = true;
+
+ for (size_t i = 0; i < threads; i++) {
+ auto ret = startNewWorker("BackgroundEvictor" + std::to_string(i), backgroundEvictor_[i], interval, strategy, MoverDir::Evict);
+ result = result && ret;
+
+ if (result) {
+ backgroundEvictor_[i]->setAssignedMemory(getAssignedMemoryToBgWorker(i, backgroundEvictor_.size(), 0));
+ }
+ }
+ return result;
+}
+
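getAssignedMemoryToBgWorker partitions (tier, pool, class) triples across workers via backgroundWorkerId, whose definition is not part of this hunk. A plausible minimal implementation would hash the triple and reduce it modulo the worker count (a sketch, not the diff's actual code; requires folly/Hash.h):

```cpp
// Sketch only - the real definition lives outside this diff.
// Maps a (tid, pid, cid) triple to one of numWorkers background workers so
// that each allocation class is owned by exactly one worker.
template <typename CacheTrait>
size_t CacheAllocator<CacheTrait>::backgroundWorkerId(TierId tid,
                                                      PoolId pid,
                                                      ClassId cid,
                                                      size_t numWorkers) {
  XDCHECK(numWorkers);
  // Stable assignment: the same class always maps to the same worker.
  return folly::hash::hash_combine(tid, pid, cid) % numWorkers;
}
```
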
+template
+bool CacheAllocator::startNewBackgroundPromoter(
+ std::chrono::milliseconds interval,
+ std::shared_ptr strategy,
+ size_t threads) {
+ XDCHECK(threads > 0);
+ XDCHECK(getNumTiers() > 1);
+ backgroundPromoter_.resize(threads);
+ bool result = true;
+
+ for (size_t i = 0; i < threads; i++) {
+ auto ret = startNewWorker("BackgroundPromoter" + std::to_string(i), backgroundPromoter_[i], interval, strategy, MoverDir::Promote);
+ result = result && ret;
+
+ if (result) {
+ backgroundPromoter_[i]->setAssignedMemory(getAssignedMemoryToBgWorker(i, backgroundPromoter_.size(), 1));
+ }
+ }
+ return result;
+}
+
 template
 bool CacheAllocator::stopPoolRebalancer(
 std::chrono::seconds timeout) {
@@ -3613,6 +4244,26 @@ bool CacheAllocator::stopReaper(std::chrono::seconds timeout) {
 return stopWorker("Reaper", reaper_, timeout);
 }
 
+template
+bool CacheAllocator::stopBackgroundEvictor(std::chrono::seconds timeout) {
+ bool result = true;
+ for (size_t i = 0; i < backgroundEvictor_.size(); i++) {
+ auto ret = stopWorker("BackgroundEvictor", backgroundEvictor_[i], timeout);
+ result = result && ret;
+ }
+ return result;
+}
+
+template
+bool CacheAllocator::stopBackgroundPromoter(std::chrono::seconds timeout) {
+ bool result = true;
+ for (size_t i = 0; i < backgroundPromoter_.size(); i++) {
+ auto ret = stopWorker("BackgroundPromoter", backgroundPromoter_[i], timeout);
+ result = result && ret;
+ }
+ return result;
+}
+
 template
 bool CacheAllocator::cleanupStrayShmSegments(
 const std::string& cacheDir, bool posix) {
@@ -3621,6 +4272,8 @@ bool CacheAllocator::cleanupStrayShmSegments(
 // cache dir exists. clean up only if there are no other processes
 // attached. if another process was attached, the following would fail.
 ShmManager::cleanup(cacheDir, posix);
+
+ // TODO: cleanup per-tier state
 } catch (const std::exception& e) {
 XLOGF(ERR, "Error cleaning up {}. Exception: ", cacheDir, e.what());
 return false;
@@ -3630,7 +4283,8 @@ bool CacheAllocator::cleanupStrayShmSegments(
 // Any other concurrent process can not be attached to the segments or
 // even if it does, we want to mark it for destruction.
 ShmManager::removeByName(cacheDir, detail::kShmInfoName, posix);
- ShmManager::removeByName(cacheDir, detail::kShmCacheName, posix);
+ ShmManager::removeByName(cacheDir, detail::kShmCacheName
+ + std::to_string(0 /* TODO: per tier */), posix);
 ShmManager::removeByName(cacheDir, detail::kShmHashTableName, posix);
 ShmManager::removeByName(cacheDir, detail::kShmChainedItemHashTableName,
 posix);
@@ -3644,14 +4298,16 @@ uint64_t CacheAllocator::getItemPtrAsOffset(const void* ptr) {
 // the two differ (e.g. Mac OS 12) - causing templating instantiation
 // errors downstream.
+ auto tid = getTierId(ptr);
+
 // if this succeeeds, the address is valid within the cache. 
- allocator_->getAllocInfo(ptr); + allocator_[tid]->getAllocInfo(ptr); if (!isOnShm_ || !shmManager_) { throw std::invalid_argument("Shared memory not used"); } - const auto& shm = shmManager_->getShmByName(detail::kShmCacheName); + const auto& shm = shmManager_->getShmByName(detail::kShmCacheName + std::to_string(tid)); return reinterpret_cast(ptr) - reinterpret_cast(shm.getCurrentMapping().addr); diff --git a/cachelib/allocator/CacheAllocator.h b/cachelib/allocator/CacheAllocator.h index ed0096390a..d02c093c12 100644 --- a/cachelib/allocator/CacheAllocator.h +++ b/cachelib/allocator/CacheAllocator.h @@ -22,6 +22,8 @@ #include #include #include +#include +#include #include #include @@ -38,6 +40,7 @@ #include #pragma GCC diagnostic pop +#include "cachelib/allocator/BackgroundMover.h" #include "cachelib/allocator/CCacheManager.h" #include "cachelib/allocator/Cache.h" #include "cachelib/allocator/CacheAllocatorConfig.h" @@ -710,6 +713,11 @@ class CacheAllocator : public CacheBase { // @return the full usable size for this item uint32_t getUsableSize(const Item& item) const; + // gets the allocation class assigned to BG worker + auto getAssignedMemoryToBgWorker(size_t evictorId, size_t numWorkers, TierId tid); + bool shouldWakeupBgEvictor(TierId tid, PoolId pid, ClassId cid); + size_t backgroundWorkerId(TierId tid, PoolId pid, ClassId cid, size_t numWorkers); + // Get a random item from memory // This is useful for profiling and sampling cachelib managed memory // @@ -806,7 +814,7 @@ class CacheAllocator : public CacheBase { // @param config new config for the pool // // @throw std::invalid_argument if the poolId is invalid - void overridePoolConfig(PoolId pid, const MMConfig& config); + void overridePoolConfig(TierId tid, PoolId pid, const MMConfig& config); // update an existing pool's rebalance strategy // @@ -847,8 +855,9 @@ class CacheAllocator : public CacheBase { // @return true if the operation succeeded. false if the size of the pool is // smaller than _bytes_ // @throw std::invalid_argument if the poolId is invalid. + // TODO: should call shrinkPool for specific tier? bool shrinkPool(PoolId pid, size_t bytes) { - return allocator_->shrinkPool(pid, bytes); + return allocator_[currentTier()]->shrinkPool(pid, bytes); } // grow an existing pool by _bytes_. This will fail if there is no @@ -857,8 +866,9 @@ class CacheAllocator : public CacheBase { // @return true if the pool was grown. false if the necessary number of // bytes were not available. // @throw std::invalid_argument if the poolId is invalid. + // TODO: should call growPool for specific tier? bool growPool(PoolId pid, size_t bytes) { - return allocator_->growPool(pid, bytes); + return allocator_[currentTier()]->growPool(pid, bytes); } // move bytes from one pool to another. The source pool should be at least @@ -871,7 +881,7 @@ class CacheAllocator : public CacheBase { // correct size to do the transfer. 
   // @throw std::invalid_argument if src or dest is invalid pool
   bool resizePools(PoolId src, PoolId dest, size_t bytes) override {
-    return allocator_->resizePools(src, dest, bytes);
+    return allocator_[currentTier()]->resizePools(src, dest, bytes);
   }
 
   // Add a new compact cache with given name and size
@@ -1053,6 +1063,11 @@ class CacheAllocator : public CacheBase {
   // @param reaperThrottleConfig throttling config
   bool startNewReaper(std::chrono::milliseconds interval,
                       util::Throttler::Config reaperThrottleConfig);
+
+  bool startNewBackgroundPromoter(std::chrono::milliseconds interval,
+      std::shared_ptr strategy, size_t threads);
+  bool startNewBackgroundEvictor(std::chrono::milliseconds interval,
+      std::shared_ptr strategy, size_t threads);
 
   // Stop existing workers with a timeout
   bool stopPoolRebalancer(std::chrono::seconds timeout = std::chrono::seconds{
@@ -1062,6 +1077,8 @@
       0});
   bool stopMemMonitor(std::chrono::seconds timeout = std::chrono::seconds{0});
   bool stopReaper(std::chrono::seconds timeout = std::chrono::seconds{0});
+  bool stopBackgroundEvictor(std::chrono::seconds timeout = std::chrono::seconds{0});
+  bool stopBackgroundPromoter(std::chrono::seconds timeout = std::chrono::seconds{0});
 
   // Set pool optimization to either true or false
   //
@@ -1076,12 +1093,13 @@
   // @throw std::invalid_argument if the memory does not belong to this
   //        cache allocator
   AllocInfo getAllocInfo(const void* memory) const {
-    return allocator_->getAllocInfo(memory);
+    return allocator_[getTierId(memory)]->getAllocInfo(memory);
   }
 
   // return the ids for the set of existing pools in this cache.
   std::set getPoolIds() const override final {
-    return allocator_->getPoolIds();
+    // all tiers have the same pool ids. TODO: deduplicate
+    return allocator_[0]->getPoolIds();
   }
 
   // return a list of pool ids that are backing compact caches. This includes
@@ -1093,18 +1111,22 @@
   // return the pool with specified id.
   const MemoryPool& getPool(PoolId pid) const override final {
-    return allocator_->getPool(pid);
+    return allocator_[currentTier()]->getPool(pid);
+  }
+
+  const MemoryPool& getPoolByTid(PoolId pid, TierId tid) const override final {
+    return allocator_[tid]->getPool(pid);
   }
 
   // calculate the number of slabs to be advised/reclaimed in each pool
   PoolAdviseReclaimData calcNumSlabsToAdviseReclaim() override final {
     auto regularPoolIds = getRegularPoolIds();
-    return allocator_->calcNumSlabsToAdviseReclaim(regularPoolIds);
+    return allocator_[currentTier()]->calcNumSlabsToAdviseReclaim(regularPoolIds);
   }
 
   // update number of slabs to advise in the cache
   void updateNumSlabsToAdvise(int32_t numSlabsToAdvise) override final {
-    allocator_->updateNumSlabsToAdvise(numSlabsToAdvise);
+    allocator_[currentTier()]->updateNumSlabsToAdvise(numSlabsToAdvise);
   }
 
   // returns a valid PoolId corresponding to the name or kInvalidPoolId if the
@@ -1112,8 +1134,9 @@
   PoolId getPoolId(folly::StringPiece name) const noexcept;
 
   // returns the pool's name by its poolId.
-  std::string getPoolName(PoolId poolId) const override {
-    return allocator_->getPoolName(poolId);
+  std::string getPoolName(PoolId poolId) const {
+    // all tiers have the same pool names.
+    return allocator_[0]->getPoolName(poolId);
   }
 
   // get stats related to all kinds of slab release events.
@@ -1145,6 +1168,52 @@
     auto stats = reaper_ ?
 reaper_->getStats() : ReaperStats{};
     return stats;
   }
+
+  // returns the background mover stats
+  BackgroundMoverStats getBackgroundMoverStats(MoverDir direction) const {
+
+    auto stats = BackgroundMoverStats{};
+    if (direction == MoverDir::Evict) {
+      for (auto &bg : backgroundEvictor_)
+        stats += bg->getStats();
+    } else if (direction == MoverDir::Promote) {
+      for (auto &bg : backgroundPromoter_)
+        stats += bg->getStats();
+    }
+    return stats;
+
+  }
+
+
+  std::map>>
+  getBackgroundMoverClassStats(MoverDir direction) const {
+    std::map>> stats;
+
+    if (direction == MoverDir::Evict) {
+      for (auto &bg : backgroundEvictor_) {
+        for (auto &tid : bg->getClassStats()) {
+          for (auto &pid : tid.second) {
+            for (auto &cid : pid.second) {
+              stats[tid.first][pid.first][cid.first] += cid.second;
+            }
+          }
+        }
+      }
+    } else if (direction == MoverDir::Promote) {
+      for (auto &bg : backgroundPromoter_) {
+        for (auto &tid : bg->getClassStats()) {
+          for (auto &pid : tid.second) {
+            for (auto &cid : pid.second) {
+              stats[tid.first][pid.first][cid.first] += cid.second;
+            }
+          }
+        }
+      }
+    }
+
+    return stats;
+  }
+
   // return the LruType of an item
   typename MMType::LruType getItemLruType(const Item& item) const;
 
@@ -1159,6 +1228,9 @@
   // whether it is object-cache
   bool isObjectCache() const override final { return false; }
 
+  // combined pool size for all memory tiers
+  size_t getPoolSize(PoolId pid) const;
+
   // pool stats by pool id
   PoolStats getPoolStats(PoolId pid) const override final;
 
@@ -1175,6 +1247,9 @@
   // return cache's memory usage stats
   CacheMemoryStats getCacheMemoryStats() const override final;
 
+  // return stats for Allocation Class
+  ACStats getACStats(TierId tid, PoolId pid, ClassId cid) const override final;
+
   // return the nvm cache stats map
   util::StatsMap getNvmCacheStatsMap() const override final;
 
@@ -1284,6 +1359,7 @@
                     sizeof(typename RefcountWithFlags::Value) + sizeof(uint32_t) +
                     sizeof(uint32_t) + sizeof(KAllocation)) == sizeof(Item),
                 "vtable overhead");
+  // Check for CompressedPtr single/multi tier support
   static_assert(32 == sizeof(Item), "item overhead is 32 bytes");
 
   // make sure there is no overhead in ChainedItem on top of a regular Item
@@ -1308,7 +1384,7 @@
 
  private:
   // wrapper around Item's refcount and active handle tracking
-  FOLLY_ALWAYS_INLINE void incRef(Item& it);
+  FOLLY_ALWAYS_INLINE RefcountWithFlags::incResult incRef(Item& it, bool failIfMoving);
   FOLLY_ALWAYS_INLINE RefcountWithFlags::Value decRef(Item& it);
 
   // drops the refcount and if needed, frees the allocation back to the memory
@@ -1359,6 +1435,12 @@
                                  bool nascent = false,
                                  const Item* toRecycle = nullptr);
 
+  // Must be called by the thread which called markForEviction and
+  // succeeded. After this call, the item is unlinked from Access and
+  // MM Containers. The item is no longer marked as exclusive and its
+  // ref count is 0 - it's available for recycling.
+  void unlinkItemForEviction(Item& it);
+
   // acquires a handle on the item. returns an empty handle if it is null.
   // @param it   pointer to an item
   // @return WriteHandle   return a handle to this item
@@ -1378,11 +1460,14 @@
   using MMContainerPtr = std::unique_ptr;
   using MMContainers =
-      std::array,
-                 MemoryPoolManager::kMaxPools>;
+      std::vector,
+                  MemoryPoolManager::kMaxPools>>;
 
   void createMMContainers(const PoolId pid, MMConfig config);
 
+  TierId getTierId(const Item& item) const;
+  TierId getTierId(const void* ptr) const;
+
   // acquire the MMContainer corresponding to the Item's class and pool.
   //
   // @return pointer to the MMContainer.
@@ -1390,7 +1475,12 @@
   //         allocation from the memory allocator.
   MMContainer& getMMContainer(const Item& item) const noexcept;
 
-  MMContainer& getMMContainer(PoolId pid, ClassId cid) const noexcept;
+  MMContainer& getMMContainer(TierId tid, PoolId pid, ClassId cid) const noexcept;
+
+  // Get stats of the specified pid and cid.
+  // If such mmcontainer is not valid (pool id or cid out of bound)
+  // or the mmcontainer is not initialized, return an empty stat.
+  MMContainerStat getMMContainerStat(TierId tid, PoolId pid, ClassId cid) const noexcept;
 
   // create a new cache allocation. The allocation can be initialized
   // appropriately and made accessible through insert or insertOrReplace.
@@ -1420,7 +1510,20 @@
                               Key key,
                               uint32_t size,
                               uint32_t creationTime,
-                              uint32_t expiryTime);
+                              uint32_t expiryTime,
+                              bool fromBgThread = false);
+
+  // create a new cache allocation on a specific memory tier.
+  // For description see allocateInternal.
+  //
+  // @param tid id of a memory tier
+  WriteHandle allocateInternalTier(TierId tid,
+                                   PoolId id,
+                                   Key key,
+                                   uint32_t size,
+                                   uint32_t creationTime,
+                                   uint32_t expiryTime,
+                                   bool fromBgThread);
 
   // Allocate a chained item
   //
@@ -1448,17 +1551,17 @@
   // @return handle to the parent item if the validations pass
   //         otherwise, an empty Handle is returned.
   //
-  ReadHandle validateAndGetParentHandleForChainedMoveLocked(
+  WriteHandle validateAndGetParentHandleForChainedMoveLocked(
       const ChainedItem& item, const Key& parentKey);
 
   // Given an existing item, allocate a new one for the
   // existing one to later be moved into.
   //
-  // @param oldItem    the item we want to allocate a new item for
+  // @param item    reference to the item we want to allocate a new item for
   //
   // @return  handle to the newly allocated item
   //
-  WriteHandle allocateNewItemForOldItem(const Item& oldItem);
+  WriteHandle allocateNewItemForOldItem(const Item& item);
 
   // internal helper that grabs a refcounted handle to the item. This does
   // not record the access to reflect in the mmContainer.
@@ -1507,12 +1610,21 @@
   //          not exist.
   FOLLY_ALWAYS_INLINE WriteHandle findFastImpl(Key key, AccessMode mode);
 
+  // Moves a regular item to a different memory tier.
+  //
+  // @param oldItem     Reference to the item being moved
+  // @param newItemHdl  Reference to the handle of the new item being moved into
+  //
+  // @return true  If the move was completed, and the containers were updated
+  //               successfully.
+  bool moveRegularItemWithSync(Item& oldItem, WriteHandle& newItemHdl);
+
   // Moves a regular item to a different slab. This should only be used during
   // slab release after the item's exclusive bit has been set. The user supplied
   // callback is responsible for copying the contents and fixing the semantics
   // of chained item.
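// Editorial sketch of the move flow (simplified; not the authoritative
// implementation): a mover first allocates a destination item, then invokes
// the synchronizing move and relies on the containers being updated.
//
//   WriteHandle newItemHdl = allocateNewItemForOldItem(oldItem);
//   if (newItemHdl && moveRegularItemWithSync(oldItem, newItemHdl)) {
//     // success: containers now reference the new item and the old
//     // allocation can be released back to its tier
//   }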
   //
-  // @param oldItem     Reference to the item being moved
+  // @param oldItem     item being moved
   // @param newItemHdl  Reference to the handle of the new item being moved into
   //
   // @return true  If the move was completed, and the containers were updated
   //               successfully.
@@ -1592,6 +1704,10 @@
   //         false if the item is not in MMContainer
   bool removeFromMMContainer(Item& item);
 
+  using EvictionIterator = typename MMContainer::LockedIterator;
+
+  WriteHandle acquire(EvictionIterator& it) { return acquire(it.get()); }
+
   // Replaces an item in the MMContainer with another item, at the same
   // position.
   //
@@ -1602,6 +1718,17 @@
   //         destination item did not exist in the container, or if the
   //         source item already existed.
   bool replaceInMMContainer(Item& oldItem, Item& newItem);
+  bool replaceInMMContainer(Item* oldItem, Item& newItem);
+  bool replaceInMMContainer(EvictionIterator& oldItemIt, Item& newItem);
+
+  TierId getTargetTierForItem(PoolId pid,
+                              typename Item::Key key,
+                              uint32_t size,
+                              uint32_t creationTime,
+                              uint32_t expiryTime);
+
+  bool shouldEvictToNextMemoryTier(
+      TierId sourceTierId, TierId targetTierId, PoolId pid, Item& item);
 
   // Replaces an item in the MMContainer with another item, at the same
   // position. Or, if the two chained items belong to two different MM
@@ -1658,28 +1785,41 @@
   // @param  pid  the id of the pool to look for evictions inside
   // @param  cid  the id of the class to look for evictions inside
   // @return An evicted item or nullptr if there is no suitable candidate.
-  Item* findEviction(PoolId pid, ClassId cid);
+  Item* findEviction(TierId tid, PoolId pid, ClassId cid);
 
-  using EvictionIterator = typename MMContainer::LockedIterator;
+  // Try to move the item down to the next memory tier
+  //
+  // @param tid   current tier ID of the item
+  // @param pid   the pool ID the item belongs to.
+  // @param item  the item to evict
+  //
+  // @return valid handle to the item. This will be the last
+  //         handle to the item. On failure an empty handle.
+  WriteHandle tryEvictToNextMemoryTier(TierId tid, PoolId pid, Item& item, bool fromBgThread);
+
+  WriteHandle tryPromoteToNextMemoryTier(TierId tid, PoolId pid, Item& item, bool fromBgThread);
 
-  // Advance the current iterator and try to evict a regular item
+  WriteHandle tryPromoteToNextMemoryTier(Item& item, bool fromBgThread);
+
+  // Wakes up waiters if there are any
   //
-  // @param mmContainer  the container to look for evictions.
-  // @param itr          iterator holding the item
+  // @param item    wakes waiters that are waiting on that item
+  // @param handle  handle to pass to the waiters
+  void wakeUpWaiters(Item& item, WriteHandle handle);
+
+  // Unmarks item as moving and wakes up any waiters waiting on that item
   //
-  // @return valid handle to regular item on success. This will be the last
-  //         handle to the item. On failure an empty handle.
-  WriteHandle advanceIteratorAndTryEvictRegularItem(MMContainer& mmContainer,
-                                                    EvictionIterator& itr);
+  // @param item    wakes waiters that are waiting on that item
+  // @param handle  handle to pass to the waiters
+  typename RefcountWithFlags::Value unmarkMovingAndWakeUpWaiters(Item &item, WriteHandle handle);
 
-  // Advance the current iterator and try to evict a chained item
-  // Iterator may also be reset during the course of this function
+  // Try to move the item down to the next memory tier
   //
-  // @param itr iterator holding the item
+  // @param item the item to evict
   //
-  // @return valid handle to the parent item on success. This will be the last
-  //         handle to the item
-  WriteHandle advanceIteratorAndTryEvictChainedItem(EvictionIterator& itr);
+  // @return valid handle to the item. This will be the last
+  //         handle to the item. On failure an empty handle.
+  WriteHandle tryEvictToNextMemoryTier(Item& item, bool fromBgThread);
 
   // Deserializer CacheAllocatorMetadata and verify the version
   //
@@ -1693,7 +1833,7 @@
       const typename Item::PtrCompressor& compressor);
 
   unsigned int reclaimSlabs(PoolId id, size_t numSlabs) final {
-    return allocator_->reclaimSlabsAndGrow(id, numSlabs);
+    return allocator_[currentTier()]->reclaimSlabsAndGrow(id, numSlabs);
   }
 
   FOLLY_ALWAYS_INLINE EventTracker* getEventTracker() const {
@@ -1752,26 +1892,27 @@
                         const void* hint = nullptr) final;
 
   // @param releaseContext slab release context
-  void releaseSlabImpl(const SlabReleaseContext& releaseContext);
+  void releaseSlabImpl(TierId tid, const SlabReleaseContext& releaseContext);
 
   // @return true when successfully marked as moving,
   //         false when this item has already been freed
-  bool markExclusiveForSlabRelease(const SlabReleaseContext& ctx,
-                                   void* alloc,
-                                   util::Throttler& throttler);
+  bool markMovingForSlabRelease(const SlabReleaseContext& ctx,
+                                void* alloc,
+                                util::Throttler& throttler);
 
   // "Move" (by copying) the content in this item to another memory
   // location by invoking the move callback.
   //
   //
   // @param ctx       slab release context
-  // @param item      old item to be moved elsewhere
+  // @param oldItem   old item to be moved elsewhere
+  // @param handle    handle to the item or to its parent (if chained)
+  // @param throttler slow this function down so as not to take too much cpu
   //
   // @return    true  if the item has been moved
   //            false if we have exhausted moving attempts
   bool moveForSlabRelease(const SlabReleaseContext& ctx,
-                          Item& item,
+                          Item& oldItem,
                           util::Throttler& throttler);
 
   // "Move" (by copying) the content in this item to another memory
@@ -1794,18 +1935,13 @@
                            Item& item,
                            util::Throttler& throttler);
 
-  // Helper function to evict a normal item for slab release
-  //
-  // @return last handle for corresponding to item on success. empty handle on
-  //         failure. caller can retry if needed.
-  WriteHandle evictNormalItemForSlabRelease(Item& item);
+  typename NvmCacheT::PutToken createPutToken(Item& item);
 
-  // Helper function to evict a child item for slab release
-  // As a side effect, the parent item is also evicted
+  // Helper function to remove an item if the predicate is true.
   //
-  // @return last handle to the parent item of the child on success. empty
-  //         handle on failure. caller can retry.
-  WriteHandle evictChainedItemForSlabRelease(ChainedItem& item);
+  // @return last handle to the item on success. empty handle on failure.
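// Editorial sketch: the expiry path can be expressed through this
// predicate-based helper, e.g.
//
//   auto handle = removeIf(item, itemExpiryPredicate);
//
// where itemExpiryPredicate admits items holding a single reference whose
// TTL has passed.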
+  template
+  WriteHandle removeIf(Item& item, Fn&& predicate);
 
   // Helper function to remove an item if it is expired.
   //
@@ -1824,10 +1960,141 @@
     // primitives. So we consciously exempt ourselves here from TSAN data race
     // detection.
     folly::annotate_ignore_thread_sanitizer_guard g(__FILE__, __LINE__);
-    auto slabsSkipped = allocator_->forEachAllocation(std::forward(f));
+    auto slabsSkipped = allocator_[currentTier()]->forEachAllocation(std::forward(f));
     stats().numReaperSkippedSlabs.add(slabsSkipped);
   }
 
+  // exposed for the background evictor to iterate through the memory and evict
+  // in batch. This should improve the insertion path for tiered memory configs
+  size_t traverseAndEvictItems(unsigned int tid, unsigned int pid, unsigned int cid, size_t batch) {
+    auto& mmContainer = getMMContainer(tid, pid, cid);
+    size_t evictions = 0;
+    size_t evictionCandidates = 0;
+    std::vector candidates;
+    candidates.reserve(batch);
+
+    size_t tries = 0;
+    mmContainer.withEvictionIterator([&tries, &candidates, &batch, &mmContainer, this](auto &&itr) {
+      while (candidates.size() < batch &&
+             (config_.maxEvictionPromotionHotness == 0 || tries < config_.maxEvictionPromotionHotness) &&
+             itr) {
+        tries++;
+        Item* candidate = itr.get();
+        XDCHECK(candidate);
+
+        if (candidate->isChainedItem()) {
+          throw std::runtime_error("Not supported for chained items");
+        }
+
+        if (candidate->markMoving(true)) {
+          mmContainer.remove(itr);
+          candidates.push_back(candidate);
+        } else {
+          ++itr;
+        }
+      }
+    });
+
+    for (Item *candidate : candidates) {
+      auto evictedToNext = tryEvictToNextMemoryTier(*candidate, true /* from BgThread */);
+      if (!evictedToNext) {
+        auto token = createPutToken(*candidate);
+
+        auto ret = candidate->markForEvictionWhenMoving();
+        XDCHECK(ret);
+
+        unlinkItemForEviction(*candidate);
+        // wake up any readers that wait for the move to complete
+        // it's safe to do now, as we have the item marked exclusive and
+        // no other reader can be added to the waiters list
+        wakeUpWaiters(*candidate, WriteHandle{});
+
+        if (token.isValid() && shouldWriteToNvmCacheExclusive(*candidate)) {
+          nvmCache_->put(*candidate, std::move(token));
+        }
+      } else {
+        evictions++;
+        XDCHECK(!evictedToNext->isMarkedForEviction() && !evictedToNext->isMoving());
+        XDCHECK(!candidate->isMarkedForEviction() && !candidate->isMoving());
+        XDCHECK(!candidate->isAccessible());
+        XDCHECK(candidate->getKey() == evictedToNext->getKey());
+
+        wakeUpWaiters(*candidate, std::move(evictedToNext));
+      }
+      XDCHECK(!candidate->isMarkedForEviction() && !candidate->isMoving());
+
+      if (candidate->hasChainedItem()) {
+        (*stats_.chainedItemEvictions)[pid][cid].inc();
+      } else {
+        (*stats_.regularItemEvictions)[pid][cid].inc();
+      }
+
+      // it's safe to recycle the item here as there are no more
+      // references and the item could not have been marked as moving
+      // by another thread since it's detached from the MMContainer.
+      auto res = releaseBackToAllocator(*candidate, RemoveContext::kEviction,
+                                        /* isNascent */ false);
+      XDCHECK(res == ReleaseRes::kReleased);
+    }
+    return evictions;
+  }
+
+  size_t traverseAndPromoteItems(unsigned int tid, unsigned int pid, unsigned int cid, size_t batch) {
+    auto& mmContainer = getMMContainer(tid, pid, cid);
+    size_t promotions = 0;
+    std::vector candidates;
+    candidates.reserve(batch);
+
+    size_t tries = 0;
+
+    mmContainer.withPromotionIterator([&tries, &candidates, &batch, &mmContainer, this](auto &&itr){
+      while (candidates.size() < batch && (config_.maxEvictionPromotionHotness == 0 || tries < config_.maxEvictionPromotionHotness) && itr) {
+        tries++;
+        Item* candidate = itr.get();
+        XDCHECK(candidate);
+
+        if (candidate->isChainedItem()) {
+          throw std::runtime_error("Not supported for chained items");
+        }
+
+        // TODO: only allow it for read-only items?
+        // or implement mvcc
+        if (candidate->markMoving(true)) {
+          candidates.push_back(candidate);
+        }
+
+        ++itr;
+      }
+    });
+
+    for (Item *candidate : candidates) {
+      auto promoted = tryPromoteToNextMemoryTier(*candidate, true);
+      if (promoted) {
+        promotions++;
+        removeFromMMContainer(*candidate);
+        XDCHECK(!candidate->isMarkedForEviction() && !candidate->isMoving());
+        // it's safe to recycle the item here as there are no more
+        // references and the item could not have been marked as moving
+        // by another thread since it's detached from the MMContainer.
+        auto res = releaseBackToAllocator(*candidate, RemoveContext::kEviction,
+                                          /* isNascent */ false);
+        XDCHECK(res == ReleaseRes::kReleased);
+        wakeUpWaiters(*candidate, std::move(promoted));
+      } else {
+        // we failed to allocate a new item, this item is no longer moving
+        auto ref = unmarkMovingAndWakeUpWaiters(*candidate, {});
+        if (UNLIKELY(ref == 0)) {
+          const auto res =
+              releaseBackToAllocator(*candidate,
+                                     RemoveContext::kNormal, false);
+          XDCHECK(res == ReleaseRes::kReleased);
+        }
+      }
+
+    }
+    return promotions;
+  }
+
   // returns true if nvmcache is enabled and we should write this item to
   // nvmcache.
   bool shouldWriteToNvmCache(const Item& item);
@@ -1868,10 +2135,10 @@
                   std::unique_ptr& worker,
                   std::chrono::seconds timeout = std::chrono::seconds{0});
 
-  ShmSegmentOpts createShmCacheOpts();
-  std::unique_ptr createNewMemoryAllocator();
-  std::unique_ptr restoreMemoryAllocator();
-  std::unique_ptr restoreCCacheManager();
+  ShmSegmentOpts createShmCacheOpts(TierId tid);
+  std::unique_ptr createNewMemoryAllocator(TierId tid);
+  std::unique_ptr restoreMemoryAllocator(TierId tid);
+  std::unique_ptr restoreCCacheManager(TierId tid);
 
   PoolIds filterCompactCachePools(const PoolIds& poolIds) const;
 
@@ -1891,7 +2158,7 @@
   }
 
   typename Item::PtrCompressor createPtrCompressor() const {
-    return allocator_->createPtrCompressor();
+    return typename Item::PtrCompressor(allocator_);
   }
 
   // helper utility to throttle and optionally log.
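// Editorial sketch of a background evictor iteration (assumed shape, not the
// authoritative BackgroundMover implementation; strategy_ and assignedMemory
// are illustrative names): per-class batch sizes come from the strategy and
// are fed to traverseAndEvictItems().
//
//   auto batches = strategy_->calculateBatchSizes(cache, assignedMemory);
//   for (size_t i = 0; i < batches.size(); i++) {
//     const auto [tid, pid, cid] = assignedMemory[i];
//     if (batches[i] > 0) {
//       cache.traverseAndEvictItems(tid, pid, cid, batches[i]);
//     }
//   }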
@@ -1914,9 +2181,14 @@
   // @param  type the type of initialization
   // @return nullptr if the type is invalid
-  // @return pointer to memory allocator
+  // @return vector of pointers to memory allocator
   // @throw std::runtime_error if type is invalid
-  std::unique_ptr initAllocator(InitMemType type);
+  std::vector> initAllocator(InitMemType type);
+
+  std::vector> createPrivateAllocator();
+  std::vector> createAllocators();
+  std::vector> restoreAllocators();
+
   // @param  type the type of initialization
   // @return nullptr if the type is invalid
   // @return pointer to access container
@@ -1928,18 +2200,14 @@
   std::optional saveNvmCache();
   void saveRamCache();
 
-  static bool itemExclusivePredicate(const Item& item) {
-    return item.getRefCount() == 0;
+  static bool itemSlabMovePredicate(const Item& item) {
+    return item.isMoving() && item.getRefCount() == 0;
   }
 
   static bool itemExpiryPredicate(const Item& item) {
    return item.getRefCount() == 1 && item.isExpired();
   }
 
-  static bool parentEvictForSlabReleasePredicate(const Item& item) {
-    return item.getRefCount() == 1 && !item.isExclusive();
-  }
-
   std::unique_ptr createDeserializer();
 
   // Execute func on each item. `func` can throw exception but must ensure
@@ -1978,6 +2246,100 @@
 
   // BEGIN private members
 
+  TierId currentTier() const {
+    // TODO: every function which calls this method should be refactored.
+    // We should go case by case and either make such function work on
+    // all tiers or expose separate parameter to describe the tier ID.
+    return 0;
+  }
+
+  unsigned getNumTiers() const {
+    return memoryTierConfigs.size();
+  }
+
+  size_t memoryTierSize(TierId tid) const;
+
+  WriteHandle handleWithWaitContextForMovingItem(Item& item);
+
+  size_t wakeUpWaitersLocked(folly::StringPiece key, WriteHandle&& handle);
+
+  class MoveCtx {
+   public:
+    MoveCtx() {}
+
+    ~MoveCtx() {
+      // prevent any further enqueue to waiters
+      // Note: we don't need to hold locks since no one can enqueue
+      // after this point.
+      wakeUpWaiters();
+    }
+
+    // record the item handle. Upon destruction we will wake up the waiters
+    // and pass a clone of the handle to the callBack. By default we pass
+    // a null handle
+    void setItemHandle(WriteHandle _it) { it = std::move(_it); }
+
+    // enqueue a waiter into the waiter list
+    // @param  waiter       WaitContext
+    void addWaiter(std::shared_ptr> waiter) {
+      XDCHECK(waiter);
+      waiters.push_back(std::move(waiter));
+    }
+
+    size_t numWaiters() const { return waiters.size(); }
+
+   private:
+    // notify all pending waiters that are waiting for the fetch.
+    void wakeUpWaiters() {
+      bool refcountOverflowed = false;
+      for (auto& w : waiters) {
+        // If refcount overflowed earlier, then we will return a miss to
+        // all subsequent waiters.
+        if (refcountOverflowed) {
+          w->set(WriteHandle{});
+          continue;
+        }
+
+        try {
+          w->set(it.clone());
+        } catch (const exception::RefcountOverflow&) {
+          // We'll return a miss to the user's pending read,
+          // so we should enqueue a delete via NvmCache.
+          // TODO: cache.remove(it);
+          refcountOverflowed = true;
+        }
+      }
+    }
+
+    WriteHandle it; // will be set when Context is being filled
+    std::vector>> waiters; // list of
+                                                      // waiters
+  };
+  using MoveMap =
+      folly::F14ValueMap,
+                         folly::HeterogeneousAccessHash>;
+
+  static size_t getShardForKey(folly::StringPiece key) {
+    return folly::Hash()(key) % kShards;
+  }
+
+  MoveMap& getMoveMapForShard(size_t shard) {
+    return movesMap_[shard].movesMap_;
+  }
+
+  MoveMap& getMoveMap(folly::StringPiece key) {
+    return getMoveMapForShard(getShardForKey(key));
+  }
+
+  std::unique_lock getMoveLockForShard(size_t shard) {
+    return std::unique_lock(moveLock_[shard].moveLock_);
+  }
+
+  std::unique_lock getMoveLock(folly::StringPiece key) {
+    return getMoveLockForShard(getShardForKey(key));
+  }
+
   // Whether the memory allocator for this cache allocator was created on shared
   // memory. The hash table, chained item hash table etc. are also created on
   // shared memory except for temporary shared memory mode when they're created
@@ -1986,6 +2348,8 @@
 
   Config config_{};
 
+  const typename Config::MemoryTierConfigs memoryTierConfigs;
+
   // Manages the temporary shared memory segment for memory allocator that
   // is not persisted when cache process exits.
   std::unique_ptr tempShm_;
@@ -2003,9 +2367,10 @@
   const MMConfig mmConfig_{};
 
   // the memory allocator for allocating out of the available memory.
-  std::unique_ptr allocator_;
+  std::vector> allocator_;
 
   // compact cache allocator manager
+  // TODO: per tier?
   std::unique_ptr compactCacheManager_;
 
   // compact cache instances reside here when user "add" or "attach" compact
@@ -2055,6 +2420,10 @@
 
   // free memory monitor
   std::unique_ptr memMonitor_;
+
+  // background evictor
+  std::vector>> backgroundEvictor_;
+  std::vector>> backgroundPromoter_;
 
   // check whether a pool is a slabs pool
   std::array isCompactCachePool_{};
@@ -2067,6 +2436,22 @@
   // poolResizer_, poolOptimizer_, memMonitor_, reaper_
   mutable std::mutex workersMutex_;
 
+  static constexpr size_t kShards = 8192; // TODO: need to define the right value
+
+  struct MovesMapShard {
+    alignas(folly::hardware_destructive_interference_size) MoveMap movesMap_;
+  };
+
+  struct MoveLock {
+    alignas(folly::hardware_destructive_interference_size) std::mutex moveLock_;
+  };
+
+  // a map of all pending moves
+  std::vector movesMap_;
+
+  // a map of move locks for each shard
+  std::vector moveLock_;
+
   // time when the ram cache was first created
   const uint32_t cacheCreationTime_{0};
 
@@ -2100,6 +2485,7 @@
   // Make this friend to give access to acquire and release
   friend ReadHandle;
   friend ReaperAPIWrapper;
+  friend BackgroundMoverAPIWrapper;
   friend class CacheAPIWrapperForNvm;
   friend class FbInternalRuntimeUpdateWrapper;
   friend class objcache2::ObjectCache;
diff --git a/cachelib/allocator/CacheAllocatorConfig.h b/cachelib/allocator/CacheAllocatorConfig.h
index ec44ff8467..503b121ccf 100644
--- a/cachelib/allocator/CacheAllocatorConfig.h
+++ b/cachelib/allocator/CacheAllocatorConfig.h
@@ -31,6 +31,7 @@
 #include "cachelib/allocator/MemoryTierCacheConfig.h"
 #include "cachelib/allocator/NvmAdmissionPolicy.h"
 #include "cachelib/allocator/PoolOptimizeStrategy.h"
+#include "cachelib/allocator/BackgroundMoverStrategy.h"
 #include "cachelib/allocator/RebalanceStrategy.h"
 #include "cachelib/allocator/Util.h"
 #include
"cachelib/common/EventInterface.h" @@ -207,6 +208,9 @@ class CacheAllocatorConfig { // Accepts vector of MemoryTierCacheConfig. Each vector element describes // configuration for a single memory cache tier. Tier sizes are specified as // ratios, the number of parts of total cache size each tier would occupy. + // @throw std::invalid_argument if: + // - the size of configs is 0 + // - the size of configs is greater than kMaxCacheMemoryTiers CacheAllocatorConfig& configureMemoryTiers(const MemoryTierConfigs& configs); // Return reference to MemoryTierCacheConfigs. @@ -262,6 +266,16 @@ class CacheAllocatorConfig { std::chrono::seconds regularInterval, std::chrono::seconds ccacheInterval, uint32_t ccacheStepSizePercent); + + // Enable the background evictor - scans a tier to look for objects + // to evict to the next tier + CacheAllocatorConfig& enableBackgroundEvictor( + std::shared_ptr backgroundMoverStrategy, + std::chrono::milliseconds regularInterval, size_t threads); + + CacheAllocatorConfig& enableBackgroundPromoter( + std::shared_ptr backgroundMoverStrategy, + std::chrono::milliseconds regularInterval, size_t threads); // This enables an optimization for Pool rebalancing and resizing. // The rough idea is to ensure only the least useful items are evicted when @@ -337,6 +351,17 @@ class CacheAllocatorConfig { poolOptimizeStrategy != nullptr; } + // @return whether background evictor thread is enabled + bool backgroundEvictorEnabled() const noexcept { + return backgroundEvictorInterval.count() > 0 && + backgroundEvictorStrategy != nullptr; + } + + bool backgroundPromoterEnabled() const noexcept { + return backgroundPromoterInterval.count() > 0 && + backgroundPromoterStrategy != nullptr; + } + // @return whether memory monitor is enabled bool memMonitoringEnabled() const noexcept { return memMonitorConfig.mode != MemoryMonitor::Disabled && @@ -374,8 +399,7 @@ class CacheAllocatorConfig { std::map serialize() const; // The max number of memory cache tiers - // TODO: increase this number when multi-tier configs are enabled - inline static const size_t kMaxCacheMemoryTiers = 1; + inline static const size_t kMaxCacheMemoryTiers = 2; // Cache name for users to indentify their own cache. std::string cacheName{""}; @@ -448,6 +472,16 @@ class CacheAllocatorConfig { // The slab release process is considered as being stuck if it does not // make any progress for the below threshold std::chrono::milliseconds slabReleaseStuckThreshold{std::chrono::seconds(60)}; + + // rebalance to avoid alloc fialures. 
+  std::shared_ptr backgroundEvictorStrategy;
+  std::shared_ptr backgroundPromoterStrategy;
+  // time interval to sleep between runs of the background evictor
+  std::chrono::milliseconds backgroundEvictorInterval{std::chrono::milliseconds{1000}};
+  std::chrono::milliseconds backgroundPromoterInterval{std::chrono::milliseconds{1000}};
+
+  size_t backgroundEvictorThreads{1};
+  size_t backgroundPromoterThreads{1};
 
   // time interval to sleep between iterations of pool size optimization,
   // for regular pools and compact caches
@@ -587,6 +621,32 @@ class CacheAllocatorConfig {
   // If true, we will delay worker start until user explicitly calls
   // CacheAllocator::startCacheWorkers()
   bool delayCacheWorkersStart{false};
+
+  // see MultiTierDataMovement.md
+  double promotionAcWatermark{4.0};
+  double lowEvictionAcWatermark{2.0};
+  double highEvictionAcWatermark{5.0};
+  double numDuplicateElements{0.0}; // inclusiveness of the cache
+  double syncPromotion{0.0}; // whether promotion can be done synchronously in the user thread
+
+  uint64_t evictorThreads{1};
+  uint64_t promoterThreads{1};
+
+  uint64_t maxEvictionBatch{40};
+  uint64_t maxPromotionBatch{10};
+
+  uint64_t minEvictionBatch{1};
+  uint64_t minPromotionBatch{1};
+
+  uint64_t maxEvictionPromotionHotness{60};
+
+  bool disableEvictionToMemory{false};
+  double minAcAllocationWatermark{0.0};
+  double maxAcAllocationWatermark{0.0};
+  double acTopTierEvictionWatermark{0.0}; // TODO: make it per TIER?
+  uint64_t sizeThresholdPolicy{0};
+  double defaultTierChancePercentage{100.0}; // TODO: default could be based on ratio
+  uint64_t forceAllocationTier{UINT64_MAX};
 
   friend CacheT;
 
@@ -924,6 +984,26 @@ CacheAllocatorConfig& CacheAllocatorConfig::enablePoolRebalancing(
   return *this;
 }
 
+template
+CacheAllocatorConfig& CacheAllocatorConfig::enableBackgroundEvictor(
+    std::shared_ptr strategy,
+    std::chrono::milliseconds interval, size_t evictorThreads) {
+  backgroundEvictorStrategy = strategy;
+  backgroundEvictorInterval = interval;
+  backgroundEvictorThreads = evictorThreads;
+  return *this;
+}
+
+template
+CacheAllocatorConfig& CacheAllocatorConfig::enableBackgroundPromoter(
+    std::shared_ptr strategy,
+    std::chrono::milliseconds interval, size_t promoterThreads) {
+  backgroundPromoterStrategy = strategy;
+  backgroundPromoterInterval = interval;
+  backgroundPromoterThreads = promoterThreads;
+  return *this;
+}
+
 template
 CacheAllocatorConfig& CacheAllocatorConfig::enablePoolResizing(
     std::shared_ptr resizeStrategy,
@@ -1085,7 +1165,7 @@ std::map CacheAllocatorConfig::serialize() const {
 
   configMap["size"] = std::to_string(size);
   configMap["cacheDir"] = cacheDir;
-  configMap["posixShm"] = usePosixShm ? "set" : "empty";
+  configMap["posixShm"] = isUsingPosixShm() ?
"set" : "empty"; configMap["defaultAllocSizes"] = ""; // Stringify std::set diff --git a/cachelib/allocator/CacheItem-inl.h b/cachelib/allocator/CacheItem-inl.h index f59fa9d599..b33c1ea28a 100644 --- a/cachelib/allocator/CacheItem-inl.h +++ b/cachelib/allocator/CacheItem-inl.h @@ -148,15 +148,16 @@ std::string CacheItem::toString() const { return folly::sformat( "item: " "memory={}:raw-ref={}:size={}:key={}:hex-key={}:" - "isInMMContainer={}:isAccessible={}:isExclusive={}:references={}:ctime=" + "isInMMContainer={}:isAccessible={}:isMarkedForEviction={}:" + "isMoving={}:references={}:ctime=" "{}:" "expTime={}:updateTime={}:isNvmClean={}:isNvmEvicted={}:hasChainedItem=" "{}", this, getRefCountAndFlagsRaw(), getSize(), folly::humanify(getKey().str()), folly::hexlify(getKey()), - isInMMContainer(), isAccessible(), isExclusive(), getRefCount(), - getCreationTime(), getExpiryTime(), getLastAccessTime(), isNvmClean(), - isNvmEvicted(), hasChainedItem()); + isInMMContainer(), isAccessible(), isMarkedForEviction(), isMoving(), + getRefCount(), getCreationTime(), getExpiryTime(), getLastAccessTime(), + isNvmClean(), isNvmEvicted(), hasChainedItem()); } } @@ -217,23 +218,43 @@ bool CacheItem::isInMMContainer() const noexcept { } template -bool CacheItem::markExclusive() noexcept { - return ref_.markExclusive(); +bool CacheItem::markForEviction() noexcept { + return ref_.markForEviction(); } template -RefcountWithFlags::Value CacheItem::unmarkExclusive() noexcept { - return ref_.unmarkExclusive(); +RefcountWithFlags::Value CacheItem::unmarkForEviction() noexcept { + return ref_.unmarkForEviction(); } template -bool CacheItem::isExclusive() const noexcept { - return ref_.isExclusive(); +bool CacheItem::isMarkedForEviction() const noexcept { + return ref_.isMarkedForEviction(); } template -bool CacheItem::isOnlyExclusive() const noexcept { - return ref_.isOnlyExclusive(); +bool CacheItem::markForEvictionWhenMoving() { + return ref_.markForEvictionWhenMoving(); +} + +template +bool CacheItem::markMoving(bool failIfRefNotZero) { + return ref_.markMoving(failIfRefNotZero); +} + +template +RefcountWithFlags::Value CacheItem::unmarkMoving() noexcept { + return ref_.unmarkMoving(); +} + +template +bool CacheItem::isMoving() const noexcept { + return ref_.isMoving(); +} + +template +bool CacheItem::isOnlyMoving() const noexcept { + return ref_.isOnlyMoving(); } template @@ -335,7 +356,7 @@ bool CacheItem::updateExpiryTime(uint32_t expiryTimeSecs) noexcept { // check for moving to make sure we are not updating the expiry time while at // the same time re-allocating the item with the old state of the expiry time // in moveRegularItem(). 
 See D6852328
-  if (isExclusive() || !isInMMContainer() || isChainedItem()) {
+  if (isMoving() || isMarkedForEviction() || !isInMMContainer() || isChainedItem()) {
     return false;
   }
 
   // attempt to atomically update the value of expiryTime
@@ -451,12 +472,14 @@ std::string CacheChainedItem::toString() const {
   return folly::sformat(
       "chained item: "
       "memory={}:raw-ref={}:size={}:parent-compressed-ptr={}:"
-      "isInMMContainer={}:isAccessible={}:isExclusive={}:references={}:ctime={}"
+      "isInMMContainer={}:isAccessible={}:isMarkedForEviction={}:"
+      "isMoving={}:references={}:ctime={}"
      ":"
      "expTime={}:updateTime={}",
      this, Item::getRefCountAndFlagsRaw(), Item::getSize(), cPtr.getRaw(),
-      Item::isInMMContainer(), Item::isAccessible(), Item::isExclusive(),
-      Item::getRefCount(), Item::getCreationTime(), Item::getExpiryTime(),
+      Item::isInMMContainer(), Item::isAccessible(),
+      Item::isMarkedForEviction(), Item::isMoving(), Item::getRefCount(),
+      Item::getCreationTime(), Item::getExpiryTime(),
      Item::getLastAccessTime());
 }
diff --git a/cachelib/allocator/CacheItem.h b/cachelib/allocator/CacheItem.h
index 06136db032..6728b654eb 100644
--- a/cachelib/allocator/CacheItem.h
+++ b/cachelib/allocator/CacheItem.h
@@ -46,6 +46,9 @@ class BaseAllocatorTest;
 template
 class AllocatorHitStatsTest;
 
+template
+class AllocatorMemoryTiersTest;
+
 template
 class MapTest;
 
@@ -305,12 +308,17 @@ class CACHELIB_PACKED_ATTR CacheItem {
    */
   RefcountWithFlags::Value getRefCountAndFlagsRaw() const noexcept;
 
-  FOLLY_ALWAYS_INLINE void incRef() {
-    if (LIKELY(ref_.incRef())) {
-      return;
+  // Increments item's ref count
+  //
+  // @return true on success, failure if the item is marked as exclusive
+  // @throw exception::RefcountOverflow on ref count overflow
+  FOLLY_ALWAYS_INLINE RefcountWithFlags::incResult incRef(bool failIfMoving) {
+    try {
+      return ref_.incRef(failIfMoving);
+    } catch (exception::RefcountOverflow& e) {
+      throw exception::RefcountOverflow(
+          folly::sformat("{} item: {}", e.what(), toString()));
     }
-    throw exception::RefcountOverflow(
-        folly::sformat("Refcount maxed out. item: {}", toString()));
   }
 
   FOLLY_ALWAYS_INLINE RefcountWithFlags::Value decRef() {
@@ -344,23 +352,43 @@ class CACHELIB_PACKED_ATTR CacheItem {
   /**
    * The following two functions correspond to whether or not an item is
-   * currently in the process of being moved. This happens during a slab
-   * rebalance, eviction or resize operation.
+   * currently in the process of being evicted.
    *
-   * An item can only be marked exclusive when `isInMMContainer` returns true.
+   * An item can only be marked exclusive when `isInMMContainer` returns true
+   * and the item is not already exclusive or moving and the ref count is 0.
    * This operation is atomic.
   *
-   * User can also query if an item "isOnlyExclusive". This returns true only
-   * if the refcount is 0 and only the exclusive bit is set.
-   *
-   * Unmarking exclusive does not depend on `isInMMContainer`.
+   * Unmarking exclusive does not depend on `isInMMContainer`
   * Unmarking exclusive will also return the refcount at the moment of
   * unmarking.
   */
-  bool markExclusive() noexcept;
-  RefcountWithFlags::Value unmarkExclusive() noexcept;
-  bool isExclusive() const noexcept;
-  bool isOnlyExclusive() const noexcept;
+  bool markForEviction() noexcept;
+  RefcountWithFlags::Value unmarkForEviction() noexcept;
+  bool isMarkedForEviction() const noexcept;
+
+  /**
+   * The following functions correspond to whether or not an item is
+   * currently in the process of being moved. When moving, ref count
+   * is always >= 1.
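 * Editorial sketch of the lifecycle: markMoving(true) fails if the ref
 * count is non-zero; once it succeeds, the mover itself accounts for one
 * reference. A reader that encounters a moving item parks on a wait
 * context (see handleWithWaitContextForMovingItem()) until the mover calls
 * wakeUpWaiters() with either the new handle or an empty one.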
+   *
+   * An item can only be marked moving when `isInMMContainer` returns true
+   * and the item is not already exclusive or moving.
+   *
+   * User can also query if an item "isOnlyMoving". This returns true only
+   * if the refcount is one and only the exclusive bit is set.
+   *
+   * Unmarking moving does not depend on `isInMMContainer`
+   * Unmarking moving will also return the refcount at the moment of
+   * unmarking.
+   */
+  bool markMoving(bool failIfRefNotZero);
+  RefcountWithFlags::Value unmarkMoving() noexcept;
+  bool isMoving() const noexcept;
+  bool isOnlyMoving() const noexcept;
+
+  /** This function attempts to mark the item as exclusive.
+   * Can only be called on an item that is moving. */
+  bool markForEvictionWhenMoving();
 
   /**
    * Item cannot be marked both chained allocation and
@@ -448,6 +476,8 @@ class CACHELIB_PACKED_ATTR CacheItem {
   FRIEND_TEST(ItemTest, NonStringKey);
   template
   friend class facebook::cachelib::tests::AllocatorHitStatsTest;
+  template
+  friend class facebook::cachelib::tests::AllocatorMemoryTiersTest;
 };
 
 // A chained item has a hook pointing to the next chained item. The hook is
diff --git a/cachelib/allocator/CacheStats.cpp b/cachelib/allocator/CacheStats.cpp
index c16149df6b..b4770a3480 100644
--- a/cachelib/allocator/CacheStats.cpp
+++ b/cachelib/allocator/CacheStats.cpp
@@ -44,6 +44,8 @@ void Stats::init() {
   initToZero(*fragmentationSize);
   initToZero(*chainedItemEvictions);
   initToZero(*regularItemEvictions);
+
+  classAllocLatency = std::make_unique();
 }
 
 template
@@ -51,7 +53,7 @@ struct SizeVerify {};
 
 void Stats::populateGlobalCacheStats(GlobalCacheStats& ret) const {
 #ifndef SKIP_SIZE_VERIFY
-  SizeVerify a = SizeVerify<16176>{};
+  SizeVerify a = SizeVerify<16192>{};
   std::ignore = a;
 #endif
   ret.numCacheGets = numCacheGets.get();
diff --git a/cachelib/allocator/CacheStats.h b/cachelib/allocator/CacheStats.h
index fb9955b805..46c051be14 100644
--- a/cachelib/allocator/CacheStats.h
+++ b/cachelib/allocator/CacheStats.h
@@ -25,6 +25,7 @@
 #include "cachelib/allocator/memory/Slab.h"
 #include "cachelib/common/FastStats.h"
 #include "cachelib/common/PercentileStats.h"
+#include "cachelib/common/RollingStats.h"
 #include "cachelib/common/Time.h"
 
 namespace facebook {
 namespace cachelib {
@@ -289,6 +290,27 @@ struct ReaperStats {
   uint64_t avgTraversalTimeMs{0};
 };
 
+// Mover Stats
+struct BackgroundMoverStats {
+  // the number of items this worker moved by looking at pools/classes stats
+  uint64_t numMovedItems{0};
+  // number of times the background worker ran
+  uint64_t runCount{0};
+  // total number of classes
+  uint64_t totalClasses{0};
+  // total number of bytes moved
+  uint64_t totalBytesMoved{0};
+
+  BackgroundMoverStats& operator+=(const BackgroundMoverStats& rhs) {
+    numMovedItems += rhs.numMovedItems;
+    runCount += rhs.runCount;
+    totalClasses += rhs.totalClasses;
+    totalBytesMoved += rhs.totalBytesMoved;
+    return *this;
+  }
+};
+
 // CacheMetadata type to export
 struct CacheMetadata {
   // allocator_version
@@ -309,6 +331,11 @@ struct Stats;
 // Stats that apply globally in cache and
 // the ones that are aggregated over all pools
 struct GlobalCacheStats {
+  // background eviction stats
+  BackgroundMoverStats evictionStats;
+
+  BackgroundMoverStats promotionStats;
+
   // number of calls to CacheAllocator::find
   uint64_t numCacheGets{0};
 
diff --git a/cachelib/allocator/CacheStatsInternal.h b/cachelib/allocator/CacheStatsInternal.h
index b2a5f8c469..19a15fbbd4 100644
--- a/cachelib/allocator/CacheStatsInternal.h
+++ b/cachelib/allocator/CacheStatsInternal.h
@@ -21,6 +21,7 @@
 #include "cachelib/allocator/Cache.h"
 #include "cachelib/allocator/memory/MemoryAllocator.h"
 #include "cachelib/common/AtomicCounter.h"
+#include "cachelib/common/RollingStats.h"
 
 namespace facebook {
 namespace cachelib {
@@ -229,6 +230,14 @@ struct Stats {
   std::unique_ptr chainedItemEvictions{};
   std::unique_ptr regularItemEvictions{};
 
+  using PerTierPoolClassRollingStats = std::array<
+      std::array,
+                 MemoryPoolManager::kMaxPools>,
+      CacheBase::kMaxTiers>;
+
+  // rolling latency tracking for every alloc class in every pool
+  std::unique_ptr classAllocLatency{};
+
   // Eviction failures due to parent cannot be removed from access container
   AtomicCounter evictFailParentAC{0};
 
diff --git a/cachelib/allocator/FreeThresholdStrategy.cpp b/cachelib/allocator/FreeThresholdStrategy.cpp
new file mode 100644
index 0000000000..4a900c2cb1
--- /dev/null
+++ b/cachelib/allocator/FreeThresholdStrategy.cpp
@@ -0,0 +1,74 @@
+/*
+ * Copyright (c) Intel and its affiliates.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "cachelib/allocator/FreeThresholdStrategy.h"
+
+#include
+
+namespace facebook {
+namespace cachelib {
+
+FreeThresholdStrategy::FreeThresholdStrategy(double lowEvictionAcWatermark,
+                                             double highEvictionAcWatermark,
+                                             uint64_t maxEvictionBatch,
+                                             uint64_t minEvictionBatch)
+    : lowEvictionAcWatermark(lowEvictionAcWatermark),
+      highEvictionAcWatermark(highEvictionAcWatermark),
+      maxEvictionBatch(maxEvictionBatch),
+      minEvictionBatch(minEvictionBatch) {}
+
+std::vector FreeThresholdStrategy::calculateBatchSizes(
+    const CacheBase& cache,
+    std::vector acVec) {
+  std::vector batches{};
+  for (auto [tid, pid, cid] : acVec) {
+    auto stats = cache.getACStats(tid, pid, cid);
+    if ((1 - stats.usageFraction()) * 100 >= highEvictionAcWatermark) {
+      batches.push_back(0);
+    } else {
+      auto toFreeMemPercent = highEvictionAcWatermark - (1 - stats.usageFraction()) * 100;
+      auto toFreeItems = static_cast(
+          toFreeMemPercent * (stats.totalSlabs() * Slab::kSize) / stats.allocSize);
+      batches.push_back(toFreeItems);
+    }
+  }
+
+  if (batches.size() == 0) {
+    return batches;
+  }
+
+  auto maxBatch = *std::max_element(batches.begin(), batches.end());
+  if (maxBatch == 0)
+    return batches;
+
+  std::transform(
+      batches.begin(), batches.end(), batches.begin(), [&](auto numItems) {
+        if (numItems == 0) {
+          return 0UL;
+        }
+
+        auto cappedBatchSize = maxEvictionBatch * numItems / maxBatch;
+        if (cappedBatchSize < minEvictionBatch)
+          return minEvictionBatch;
+        else
+          return cappedBatchSize;
+      });
+
+  return batches;
+}
+
+} // namespace cachelib
+} // namespace facebook
diff --git a/cachelib/allocator/FreeThresholdStrategy.h b/cachelib/allocator/FreeThresholdStrategy.h
new file mode 100644
index 0000000000..94316bfe82
--- /dev/null
+++ b/cachelib/allocator/FreeThresholdStrategy.h
@@ -0,0 +1,46 @@
+/*
+ * Copyright (c) Facebook, Inc. and its affiliates.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "cachelib/allocator/BackgroundMoverStrategy.h"
+#include "cachelib/allocator/Cache.h"
+
+namespace facebook {
+namespace cachelib {
+
+// Background mover strategy that frees memory in an allocation class until
+// its free space reaches the configured high watermark.
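// Editorial worked example (illustrative numbers, assuming 4 MiB slabs):
// with highEvictionAcWatermark = 5.0 and an allocation class whose
// usageFraction() is 0.97 (3% free), toFreeMemPercent = 5 - 3 = 2. With
// totalSlabs() = 10 and allocSize = 4096, the raw batch is
//
//   2 * (10 * 4 MiB) / 4096 = 20480 items,
//
// after which all batches are rescaled against the largest one, so the
// final batch never exceeds maxEvictionBatch and non-zero batches never
// drop below minEvictionBatch.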
+class FreeThresholdStrategy : public BackgroundMoverStrategy {
+ public:
+  FreeThresholdStrategy(double lowEvictionAcWatermark,
+                        double highEvictionAcWatermark,
+                        uint64_t maxEvictionBatch,
+                        uint64_t minEvictionBatch);
+  ~FreeThresholdStrategy() {}
+
+  std::vector calculateBatchSizes(
+      const CacheBase& cache,
+      std::vector acVecs);
+
+ private:
+  double lowEvictionAcWatermark{2.0};
+  double highEvictionAcWatermark{5.0};
+  uint64_t maxEvictionBatch{40};
+  uint64_t minEvictionBatch{5};
+};
+
+} // namespace cachelib
+} // namespace facebook
diff --git a/cachelib/allocator/Handle.h b/cachelib/allocator/Handle.h
index 11d2bed2be..06c21bffe4 100644
--- a/cachelib/allocator/Handle.h
+++ b/cachelib/allocator/Handle.h
@@ -400,6 +400,12 @@ struct ReadHandleImpl {
     }
   }
 
+ protected:
+  friend class ReadHandleImpl;
+  // Method used only by ReadHandleImpl ctor
+  void discard() {
+    it_.store(nullptr, std::memory_order_relaxed);
+  }
  private:
   // we are waiting on Item* to be set to a value. One of the valid values is
   // nullptr. So choose something that we don't expect to indicate a ptr
@@ -479,7 +485,8 @@ struct ReadHandleImpl {
 
   // Handle which has the item already
   FOLLY_ALWAYS_INLINE ReadHandleImpl(Item* it, CacheT& alloc) noexcept
-      : alloc_(&alloc), it_(it) {}
+      : alloc_(&alloc), it_(it) {
+  }
 
   // handle that has a wait context allocated. Used for async handles
   // In this case, the it_ will be filled in asynchronously and multiple
diff --git a/cachelib/allocator/MM2Q-inl.h b/cachelib/allocator/MM2Q-inl.h
index ba388d40a4..33b356077d 100644
--- a/cachelib/allocator/MM2Q-inl.h
+++ b/cachelib/allocator/MM2Q-inl.h
@@ -14,6 +14,8 @@
  * limitations under the License.
  */
 
+#include
+
 namespace facebook {
 namespace cachelib {
 
@@ -104,6 +106,9 @@ bool MM2Q::Container::recordAccess(T& node,
       return false;
     }
 
+    if (config_.markUsefulChance < 100.0 && folly::Random::rand32() % 100 >= config_.markUsefulChance)
+      return false;
+
     return lruMutex_->lock_combine(func);
   }
   return false;
@@ -223,15 +228,32 @@ void MM2Q::Container::rebalance() noexcept {
 
 template T::*HookPtr>
 bool MM2Q::Container::add(T& node) noexcept {
   const auto currTime = static_cast