Commit 7180fab
Background data movement for the tiers

Part 1.
-------
This adds the following:
1. tryPromoteToNextTier. This could go with multi-tier part 2.
2. Promotion iterators. This could go with the MM2Q promotion iterators patch.

It also enables background workers in the cache config. Future changes to the
background workers can be merged with this patch.

Background evictors multi-tier, Part 2.
---------------------------------------
This should be rolled into background evictors part 1. Improved the background
stats structure and cachebench output. Adds the following:
- approximate usage stat
- evictions / attempts per class

Background evictors multi-tier, Part 3.
---------------------------------------
Use the approximate usage fraction.

1 parent 6e0fc13  commit 7180fab

22 files changed: +1028 −268 lines changed

MultiTierDataMovement.md

Lines changed: 90 additions & 0 deletions
@@ -0,0 +1,90 @@
# Background Data Movement

In order to reduce the number of online evictions and support asynchronous
promotion, we have added two periodic workers to handle eviction and promotion.

The diagram below shows a simplified version of how the background evictor
thread (green) is integrated into the CacheLib architecture.

<p align="center">
  <img width="640" height="360" alt="BackgroundEvictor" src="cachelib-background-evictor.png">
</p>
## Background Evictors

The background evictors scan each class to see if there are objects to move to the
next (lower) tier using a given strategy. Here we document the general parameters
and the parameters for the different strategies.

- `backgroundEvictorIntervalMilSec`: The interval at which this thread runs. By
default the background evictor threads wake up every 10 ms to scan the
AllocationClasses. A background evictor thread is also woken up every time an
allocation fails (in a request handling thread) and the current percentage of free
memory for the AllocationClass is lower than `lowEvictionAcWatermark`. This can make
the interval parameter less important when many allocations occur from request
handling threads.
25+
26+
- `evictorThreads`: The number of background evictors to run - each thread is a assigned
27+
a set of AllocationClasses to scan and evict objects from. Currently, each thread gets
28+
an equal number of classes to scan - but as object size distribution may be unequal - future
29+
versions will attempt to balance the classes among threads. The range is 1 to number of AllocationClasses.
30+
The default is 1.
31+
32+
- `maxEvictionBatch`: The number of objects to remove in a given eviction call. The
33+
default is 40. Lower range is 10 and the upper range is 1000. Too low and we might not
34+
remove objects at a reasonable rate, too high and it might increase contention with user threads.
35+
36+
- `minEvictionBatch`: Minimum number of items to evict at any time (if there are any
37+
candidates)
38+
39+
- `maxEvictionPromotionHotness`: Maximum candidates to consider for eviction. This is similar to `maxEvictionBatch`
40+
but it specifies how many candidates will be taken into consideration, not the actual number of items to evict.
41+
This option can be used to configure duration of critical section on LRU lock.
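The interplay of the batch limits above can be sketched as follows. The helper name
and the exact clamping rule are ours, inferred from the parameter descriptions; this
is not CacheLib code:

```cpp
#include <algorithm>
#include <cstdint>

// Hypothetical helper (not part of CacheLib): bound a strategy-computed
// batch by the configured minEvictionBatch / maxEvictionBatch limits.
// A batch of 0 means "no work for this class", so the minimum only
// applies when there are candidates to evict.
uint64_t clampEvictionBatch(uint64_t computed,
                            uint64_t minEvictionBatch,
                            uint64_t maxEvictionBatch) {
  if (computed == 0) {
    return 0; // nothing to evict in this class
  }
  return std::clamp(computed, minEvictionBatch, maxEvictionBatch);
}
```

With the defaults described above, a computed batch of 3 would be raised to the
minimum, and a computed batch of 5000 would be capped at `maxEvictionBatch`.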
### FreeThresholdStrategy (default)

- `lowEvictionAcWatermark`: Triggers the background eviction thread to run when less
than this percentage of the AllocationClass is free. The default is `2.0`; to avoid
wasting capacity, we don't set this above `10.0`.

- `highEvictionAcWatermark`: Stops evictions from an AllocationClass when at least
this percentage of the AllocationClass is free. The default is `5.0`; to avoid
wasting capacity, we don't set this above `10.0`.
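The two watermarks form a start/stop band: eviction starts below the low watermark
and continues until the high watermark is reached. A minimal sketch of that
hysteresis, with struct and member names of our own invention (not the actual
strategy code):

```cpp
#include <cstdint>

// Sketch of the watermark logic described above: eviction begins when the
// free percentage of an AllocationClass drops below lowEvictionAcWatermark
// and keeps going until at least highEvictionAcWatermark percent is free.
struct FreeThresholdSketch {
  double lowEvictionAcWatermark{2.0};
  double highEvictionAcWatermark{5.0};
  bool evicting{false};

  // Feed the current free percentage; returns whether to evict now.
  bool shouldEvict(double freePercent) {
    if (!evicting && freePercent < lowEvictionAcWatermark) {
      evicting = true; // too little free memory, start evicting
    } else if (evicting && freePercent >= highEvictionAcWatermark) {
      evicting = false; // enough headroom reclaimed, stop
    }
    return evicting;
  }
};
```

Starting at 3% free, nothing happens; dropping to 1% starts eviction, which then
continues through 4% and only stops once 5% is free again.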
## Background Promoters

The background promoters scan each class to see if there are objects to move to the
next (upper) tier using a given strategy. Here we document the general parameters
and the parameters for the different strategies.
- `backgroundPromoterIntervalMilSec`: The interval at which this thread runs. By
default the background promoter threads wake up every 10 ms to scan the
AllocationClasses for objects to promote.

- `promoterThreads`: The number of background promoter threads to run. Each thread is
assigned a set of AllocationClasses to scan and promote objects from. Currently, each
thread gets an equal number of classes to scan, but since the object size
distribution may be unequal, future versions will attempt to balance the classes
among threads. The range is `1` to the number of AllocationClasses. The default is `1`.

- `maxPromotionBatch`: The number of objects to promote in a given promotion call.
The default is `40`; the lower bound is `10` and the upper bound is `1000`. Too low
and we might not promote objects at a reasonable rate; too high and it may increase
contention with user threads.

- `minPromotionBatch`: The minimum number of items to promote at any time (if there
are any candidates).
- `numDuplicateElements`: This allows us to promote items that have existing
read-only handles, since we won't need to modify the data while a user still holds
it. Therefore, for a short time the data can reside in both tiers until it is
evicted from its current tier. The default is `0` (duplicates not allowed); setting
the value to `100` enables duplicate elements across tiers.
### Background Promotion Strategy (only one currently)

- `promotionAcWatermark`: Promote items if at least this percentage of the
AllocationClass is free. The promotion thread will attempt to move up to
`maxPromotionBatch` objects to that tier. The objects are chosen from the head of
the LRU. The default is `4.0`. This value should correlate with
`lowEvictionAcWatermark`, `highEvictionAcWatermark`, `minAcAllocationWatermark`, and
`maxAcAllocationWatermark`.

- `maxPromotionBatch`: The number of objects to promote in a batch during background
promotion. Analogous to `maxEvictionBatch`; its value should be lower to decrease
contention on hot items.

cachelib/allocator/BackgroundMover.h

Lines changed: 81 additions & 34 deletions
```diff
@@ -18,7 +18,6 @@
 
 #include "cachelib/allocator/BackgroundMoverStrategy.h"
 #include "cachelib/allocator/CacheStats.h"
-#include "cachelib/common/AtomicCounter.h"
 #include "cachelib/common/PeriodicWorker.h"
 
 namespace facebook::cachelib {
@@ -51,6 +50,7 @@ enum class MoverDir { Evict = 0, Promote };
 template <typename CacheT>
 class BackgroundMover : public PeriodicWorker {
  public:
+  using ClassBgStatsType = std::map<MemoryDescriptorType, uint64_t>;
   using Cache = CacheT;
   // @param cache      the cache interface
   // @param strategy   the strategy class that defines how objects are
@@ -62,8 +62,9 @@ class BackgroundMover : public PeriodicWorker {
   ~BackgroundMover() override;
 
   BackgroundMoverStats getStats() const noexcept;
-  std::map<TierId, std::map<PoolId, std::map<ClassId, uint64_t>>>
-  getClassStats() const noexcept;
+  ClassBgStatsType getClassStats() const noexcept {
+    return movesPerClass_;
+  }
 
   void setAssignedMemory(std::vector<MemoryDescriptorType>&& assignedMemory);
 
@@ -72,8 +73,27 @@ class BackgroundMover : public PeriodicWorker {
   static size_t workerId(TierId tid, PoolId pid, ClassId cid, size_t numWorkers);
 
  private:
-  std::map<TierId, std::map<PoolId, std::map<ClassId, uint64_t>>>
-      movesPerClass_;
+  ClassBgStatsType movesPerClass_;
+
+  struct TraversalStats {
+    // record a traversal and its time taken
+    void recordTraversalTime(uint64_t nsTaken);
+
+    uint64_t getAvgTraversalTimeNs(uint64_t numTraversals) const;
+    uint64_t getMinTraversalTimeNs() const { return minTraversalTimeNs_; }
+    uint64_t getMaxTraversalTimeNs() const { return maxTraversalTimeNs_; }
+    uint64_t getLastTraversalTimeNs() const { return lastTraversalTimeNs_; }
+
+   private:
+    // time it took us the last time to traverse the cache.
+    uint64_t lastTraversalTimeNs_{0};
+    uint64_t minTraversalTimeNs_{std::numeric_limits<uint64_t>::max()};
+    uint64_t maxTraversalTimeNs_{0};
+    uint64_t totalTraversalTimeNs_{0};
+  };
+
+  TraversalStats traversalStats_;
   // cache allocator's interface for evicting
   using Item = typename Cache::Item;
 
@@ -89,9 +109,10 @@ class BackgroundMover : public PeriodicWorker {
   void work() override final;
   void checkAndRun();
 
-  AtomicCounter numMovedItems_{0};
-  AtomicCounter numTraversals_{0};
-  AtomicCounter totalBytesMoved_{0};
+  uint64_t numMovedItems{0};
+  uint64_t numTraversals{0};
+  uint64_t totalClasses{0};
+  uint64_t totalBytesMoved{0};
 
   std::vector<MemoryDescriptorType> assignedMemory_;
   folly::DistributedMutex mutex_;
@@ -111,6 +132,20 @@ BackgroundMover<CacheT>::BackgroundMover(
   }
 }
 
+template <typename CacheT>
+void BackgroundMover<CacheT>::TraversalStats::recordTraversalTime(uint64_t nsTaken) {
+  lastTraversalTimeNs_ = nsTaken;
+  minTraversalTimeNs_ = std::min(minTraversalTimeNs_, nsTaken);
+  maxTraversalTimeNs_ = std::max(maxTraversalTimeNs_, nsTaken);
+  totalTraversalTimeNs_ += nsTaken;
+}
+
+template <typename CacheT>
+uint64_t BackgroundMover<CacheT>::TraversalStats::getAvgTraversalTimeNs(
+    uint64_t numTraversals) const {
+  return numTraversals ? totalTraversalTimeNs_ / numTraversals : 0;
+}
+
 template <typename CacheT>
 BackgroundMover<CacheT>::~BackgroundMover() {
   stop(std::chrono::seconds(0));
@@ -144,44 +179,56 @@ template <typename CacheT>
 void BackgroundMover<CacheT>::checkAndRun() {
   auto assignedMemory = mutex_.lock_combine([this] { return assignedMemory_; });
 
-  unsigned int moves = 0;
-  auto batches = strategy_->calculateBatchSizes(cache_, assignedMemory);
-
-  for (size_t i = 0; i < batches.size(); i++) {
-    const auto [tid, pid, cid] = assignedMemory[i];
-    const auto batch = batches[i];
+  while (true) {
+    unsigned int moves = 0;
+    std::set<ClassId> classes{};
+    auto batches = strategy_->calculateBatchSizes(cache_, assignedMemory);
+
+    const auto begin = util::getCurrentTimeNs();
+    for (size_t i = 0; i < batches.size(); i++) {
+      const auto [tid, pid, cid] = assignedMemory[i];
+      const auto batch = batches[i];
+      if (!batch) {
+        continue;
+      }
+
+      // try moving BATCH items from the class in order to reach free target
+      auto moved = moverFunc(cache_, tid, pid, cid, batch);
+      moves += moved;
+      movesPerClass_[assignedMemory[i]] += moved;
+    }
+    auto end = util::getCurrentTimeNs();
+    if (moves > 0) {
+      traversalStats_.recordTraversalTime(end > begin ? end - begin : 0);
+      numMovedItems += moves;
+      numTraversals++;
+    }
 
-    if (batch == 0) {
-      continue;
+    // we didn't move any objects, so we are done with this run
+    if (moves == 0 || shouldStopWork()) {
+      break;
     }
-    const auto& mpStats = cache_.getPoolByTid(pid, tid).getStats();
-    // try moving BATCH items from the class in order to reach free target
-    auto moved = moverFunc(cache_, tid, pid, cid, batch);
-    moves += moved;
-    movesPerClass_[tid][pid][cid] += moved;
-    totalBytesMoved_.add(moved * mpStats.acStats.at(cid).allocSize);
   }
 }
 
 template <typename CacheT>
 BackgroundMoverStats BackgroundMover<CacheT>::getStats() const noexcept {
   BackgroundMoverStats stats;
-  stats.numMovedItems = numMovedItems_.get();
-  stats.runCount = numTraversals_.get();
-  stats.totalBytesMoved = totalBytesMoved_.get();
+  stats.numMovedItems = numMovedItems;
+  stats.totalBytesMoved = totalBytesMoved;
+  stats.totalClasses = totalClasses;
+  auto runCount = getRunCount();
+  stats.runCount = runCount;
+  stats.numTraversals = numTraversals;
+  stats.avgItemsMoved = (double)stats.numMovedItems / (double)runCount;
+  stats.lastTraversalTimeNs = traversalStats_.getLastTraversalTimeNs();
+  stats.avgTraversalTimeNs = traversalStats_.getAvgTraversalTimeNs(numTraversals);
+  stats.minTraversalTimeNs = traversalStats_.getMinTraversalTimeNs();
+  stats.maxTraversalTimeNs = traversalStats_.getMaxTraversalTimeNs();
 
   return stats;
 }
 
-template <typename CacheT>
-std::map<TierId, std::map<PoolId, std::map<ClassId, uint64_t>>>
-BackgroundMover<CacheT>::getClassStats() const noexcept {
-  return movesPerClass_;
-}
-
 template <typename CacheT>
 size_t BackgroundMover<CacheT>::workerId(TierId tid,
                                          PoolId pid,
```
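The TraversalStats bookkeeping added in this diff can be exercised in isolation.
The following standalone copy mirrors the struct from the patch (inlined, without
the enclosing template) so the min/max/avg arithmetic is easy to sanity-check:

```cpp
#include <algorithm>
#include <cstdint>
#include <limits>

// Standalone copy of the TraversalStats struct from the diff above.
struct TraversalStats {
  // record a traversal and its time taken
  void recordTraversalTime(uint64_t nsTaken) {
    lastTraversalTimeNs_ = nsTaken;
    minTraversalTimeNs_ = std::min(minTraversalTimeNs_, nsTaken);
    maxTraversalTimeNs_ = std::max(maxTraversalTimeNs_, nsTaken);
    totalTraversalTimeNs_ += nsTaken;
  }

  // average over the recorded traversals; 0 if none have happened yet
  uint64_t getAvgTraversalTimeNs(uint64_t numTraversals) const {
    return numTraversals ? totalTraversalTimeNs_ / numTraversals : 0;
  }
  uint64_t getMinTraversalTimeNs() const { return minTraversalTimeNs_; }
  uint64_t getMaxTraversalTimeNs() const { return maxTraversalTimeNs_; }
  uint64_t getLastTraversalTimeNs() const { return lastTraversalTimeNs_; }

 private:
  uint64_t lastTraversalTimeNs_{0};
  uint64_t minTraversalTimeNs_{std::numeric_limits<uint64_t>::max()};
  uint64_t maxTraversalTimeNs_{0};
  uint64_t totalTraversalTimeNs_{0};
};
```

Note that the min starts at `uint64_t` max so the first recorded traversal always
becomes the minimum, and the average is guarded against division by zero.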

cachelib/allocator/BackgroundMoverStrategy.h

Lines changed: 29 additions & 8 deletions
```diff
@@ -21,14 +21,6 @@
 namespace facebook {
 namespace cachelib {
 
-struct MemoryDescriptorType {
-  MemoryDescriptorType(TierId tid, PoolId pid, ClassId cid) :
-    tid_(tid), pid_(pid), cid_(cid) {}
-  TierId tid_;
-  PoolId pid_;
-  ClassId cid_;
-};
-
 // Base class for background eviction strategy.
 class BackgroundMoverStrategy {
  public:
@@ -46,5 +38,34 @@ class BackgroundMoverStrategy {
   virtual ~BackgroundMoverStrategy() = default;
 };
 
+class DefaultBackgroundMoverStrategy : public BackgroundMoverStrategy {
+ public:
+  DefaultBackgroundMoverStrategy(uint64_t batchSize, double targetFree)
+      : batchSize_(batchSize), targetFree_((double)targetFree / 100.0) {}
+  ~DefaultBackgroundMoverStrategy() {}
+
+  std::vector<size_t> calculateBatchSizes(
+      const CacheBase& cache,
+      std::vector<MemoryDescriptorType> acVec) {
+    std::vector<size_t> batches{};
+    for (auto [tid, pid, cid] : acVec) {
+      double usage = cache.getPoolByTid(pid, tid).getApproxUsage(cid);
+      uint32_t perSlab = cache.getPoolByTid(pid, tid).getPerSlab(cid);
+      if (usage >= (1.0 - targetFree_)) {
+        uint32_t batch = batchSize_ > perSlab ? perSlab : batchSize_;
+        batches.push_back(batch);
+      } else {
+        // no work to be done since there is already
+        // at least targetFree remaining in the class
+        batches.push_back(0);
+      }
+    }
+    return batches;
+  }
+
+ private:
+  uint64_t batchSize_{100};
+  double targetFree_{0.05};
+};
+
 } // namespace cachelib
 } // namespace facebook
```
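The core thresholding inside `DefaultBackgroundMoverStrategy::calculateBatchSizes`
can be isolated as a free function. This sketch is our own framing: the cache
lookups (`getApproxUsage`, `getPerSlab`) are replaced by plain arguments so the
decision rule is testable on its own:

```cpp
#include <algorithm>
#include <cstdint>
#include <cstddef>

// Per-class batch decision, extracted from the strategy in the diff above:
// if the class is fuller than (1 - targetFree), evict up to batchSize items,
// capped at one slab's worth; otherwise there is nothing to do.
size_t batchForClass(double usage,        // approximate usage fraction [0.0, 1.0]
                     uint32_t perSlab,    // allocations that fit in one slab
                     uint64_t batchSize,  // configured batch size
                     double targetFree) { // e.g. 0.05 for 5% free
  if (usage >= 1.0 - targetFree) {
    return std::min<uint64_t>(batchSize, perSlab);
  }
  // already at least targetFree free in this class
  return 0;
}
```

With `targetFree = 0.05`, a class at 98% usage gets a full batch (capped by the
per-slab count), while a class at 90% usage yields a batch of zero.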

cachelib/allocator/Cache.h

Lines changed: 16 additions & 0 deletions
```diff
@@ -73,6 +73,22 @@ enum class DestructorContext {
   kRemovedFromNVM
 };
 
+struct MemoryDescriptorType {
+  MemoryDescriptorType(TierId tid, PoolId pid, ClassId cid)
+      : tid_(tid), pid_(pid), cid_(cid) {}
+  TierId tid_;
+  PoolId pid_;
+  ClassId cid_;
+
+  bool operator<(const MemoryDescriptorType& rhs) const {
+    return std::make_tuple(tid_, pid_, cid_) <
+           std::make_tuple(rhs.tid_, rhs.pid_, rhs.cid_);
+  }
+
+  bool operator==(const MemoryDescriptorType& rhs) const {
+    return std::make_tuple(tid_, pid_, cid_) ==
+           std::make_tuple(rhs.tid_, rhs.pid_, rhs.cid_);
+  }
+};
+
 // A base class of cache exposing members and status agnostic of template type.
 class CacheBase {
  public:
```
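The new `operator<` is what lets `MemoryDescriptorType` serve as the key of the
flattened `movesPerClass_` map. A self-contained sketch, with the
`TierId`/`PoolId`/`ClassId` aliases stubbed as plain integers (an assumption for
illustration; the real typedefs live elsewhere in CacheLib):

```cpp
#include <cstdint>
#include <map>
#include <tuple>

// Stand-in aliases so the struct compiles on its own.
using TierId = int32_t;
using PoolId = int8_t;
using ClassId = int16_t;

struct MemoryDescriptorType {
  MemoryDescriptorType(TierId tid, PoolId pid, ClassId cid)
      : tid_(tid), pid_(pid), cid_(cid) {}
  TierId tid_;
  PoolId pid_;
  ClassId cid_;

  // tuple comparison gives the strict weak ordering std::map requires
  bool operator<(const MemoryDescriptorType& rhs) const {
    return std::make_tuple(tid_, pid_, cid_) <
           std::make_tuple(rhs.tid_, rhs.pid_, rhs.cid_);
  }
  bool operator==(const MemoryDescriptorType& rhs) const {
    return std::make_tuple(tid_, pid_, cid_) ==
           std::make_tuple(rhs.tid_, rhs.pid_, rhs.cid_);
  }
};

// movesPerClass_[descriptor] += moved, as done in checkAndRun()
std::map<MemoryDescriptorType, uint64_t> movesPerClass;
```

Two descriptors with the same (tier, pool, class) triple hit the same map entry, so
per-class move counts accumulate correctly.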
