diff --git a/.github/workflows/build-cachelib-centos-long.yml b/.github/workflows/build-cachelib-centos-long.yml
new file mode 100644
index 0000000000..92165f603b
--- /dev/null
+++ b/.github/workflows/build-cachelib-centos-long.yml
@@ -0,0 +1,39 @@
+name: build-cachelib-centos-latest
+on:
+ schedule:
+ - cron: '0 7 * * *'
+
+jobs:
+ build-cachelib-centos8-latest:
+ name: "CentOS/latest - Build CacheLib with all dependencies"
+ runs-on: ubuntu-latest
+ # Docker container image name
+ container: "centos:latest"
+ steps:
+ - name: "update packages"
+ run: dnf upgrade -y
+ - name: "install sudo,git"
+ run: dnf install -y sudo git cmake gcc
+ - name: "System Information"
+ run: |
+ echo === uname ===
+ uname -a
+ echo === /etc/os-release ===
+ cat /etc/os-release
+ echo === df -hl ===
+ df -hl
+ echo === free -h ===
+ free -h
+ echo === top ===
+ top -b -n1 -1 -Eg || timeout 1 top -b -n1
+ echo === env ===
+ env
+ echo === gcc -v ===
+ gcc -v
+ - name: "checkout sources"
+ uses: actions/checkout@v2
+ - name: "build CacheLib using build script"
+ run: ./contrib/build.sh -j -v -T
+ - name: "run tests"
+ timeout-minutes: 60
+ run: cd opt/cachelib/tests && ../../../run_tests.sh long
diff --git a/.github/workflows/build-cachelib-debian.yml b/.github/workflows/build-cachelib-debian.yml
new file mode 100644
index 0000000000..5bc3ad3c70
--- /dev/null
+++ b/.github/workflows/build-cachelib-debian.yml
@@ -0,0 +1,43 @@
+name: build-cachelib-debian-10
+on:
+ schedule:
+ - cron: '30 5 * * 0,3'
+
+jobs:
+ build-cachelib-debian-10:
+ name: "Debian/Buster - Build CacheLib with all dependencies"
+ runs-on: ubuntu-latest
+ # Docker container image name
+ container: "debian:buster-slim"
+ steps:
+ - name: "update packages"
+ run: apt-get update
+ - name: "upgrade packages"
+ run: apt-get -y upgrade
+ - name: "install sudo,git"
+ run: apt-get install -y sudo git procps
+ - name: "System Information"
+ run: |
+ echo === uname ===
+ uname -a
+ echo === /etc/os-release ===
+ cat /etc/os-release
+ echo === df -hl ===
+ df -hl
+ echo === free -h ===
+ free -h
+ echo === top ===
+ top -b -n1 -1 -Eg || timeout 1 top -b -n1 ; true
+ echo === env ===
+ env
+ echo === cc -v ===
+ cc -v || true
+ echo === g++ -v ===
+ g++ -v || true
+ - name: "checkout sources"
+ uses: actions/checkout@v2
+ - name: "build CacheLib using build script"
+ run: ./contrib/build.sh -j -v -T
+ - name: "run tests"
+ timeout-minutes: 60
+ run: cd opt/cachelib/tests && ../../../run_tests.sh
diff --git a/.github/workflows/build-cachelib-docker.yml b/.github/workflows/build-cachelib-docker.yml
new file mode 100644
index 0000000000..be28bc233c
--- /dev/null
+++ b/.github/workflows/build-cachelib-docker.yml
@@ -0,0 +1,49 @@
+name: build-cachelib-docker
+on:
+ push:
+ pull_request:
+
+jobs:
+ build-cachelib-docker:
+ name: "CentOS/latest - Build CacheLib with all dependencies"
+ runs-on: ubuntu-latest
+ env:
+ REPO: cachelib
+ GITHUB_REPO: intel/CacheLib
+ CONTAINER_REG: ghcr.io/pmem/cachelib
+ CONTAINER_REG_USER: ${{ secrets.GH_CR_USER }}
+ CONTAINER_REG_PASS: ${{ secrets.GH_CR_PAT }}
+ FORCE_IMAGE_ACTION: ${{ secrets.FORCE_IMAGE_ACTION }}
+ HOST_WORKDIR: ${{ github.workspace }}
+ WORKDIR: docker
+ IMG_VER: devel
+ strategy:
+ matrix:
+ CONFIG: ["OS=centos OS_VER=8streams PUSH_IMAGE=1"]
+ steps:
+ - name: "System Information"
+ run: |
+ echo === uname ===
+ uname -a
+ echo === /etc/os-release ===
+ cat /etc/os-release
+ echo === df -hl ===
+ df -hl
+ echo === free -h ===
+ free -h
+ echo === top ===
+ top -b -n1 -1 -Eg || timeout 1 top -b -n1
+ echo === env ===
+ env
+ echo === gcc -v ===
+ gcc -v
+ - name: "checkout sources"
+ uses: actions/checkout@v2
+ with:
+ fetch-depth: 0
+
+ - name: Pull the image or rebuild and push it
+ run: cd $WORKDIR && ${{ matrix.CONFIG }} ./pull-or-rebuild-image.sh $FORCE_IMAGE_ACTION
+
+ - name: Run the build
+ run: cd $WORKDIR && ${{ matrix.CONFIG }} ./build.sh
diff --git a/.github/workflows/clang-format-check.yml b/.github/workflows/clang-format-check.yml
index 4b4897b610..90c8d739c6 100644
--- a/.github/workflows/clang-format-check.yml
+++ b/.github/workflows/clang-format-check.yml
@@ -1,6 +1,6 @@
# From: https://github.com/marketplace/actions/clang-format-check#multiple-paths
name: clang-format Check
-on: [pull_request]
+on: []
jobs:
formatting-check:
name: Formatting Check
diff --git a/MultiTierDataMovement.md b/MultiTierDataMovement.md
new file mode 100644
index 0000000000..6976c9ddc9
--- /dev/null
+++ b/MultiTierDataMovement.md
@@ -0,0 +1,113 @@
+# Background Data Movement
+
+To reduce the number of online evictions and to support asynchronous
+promotion, we have added two periodic workers to handle eviction and promotion.
+
+The diagram below shows a simplified version of how the background evictor
+thread (green) is integrated into the CacheLib architecture.
+
+
+
+
+
+## Background Evictors
+
+The background evictors scan each class to see if there are objects to move to the next
+(lower) tier using a given strategy. Here we document the general parameters and the
+parameters for the different strategies; a configuration sketch follows at the end of
+this section.
+
+- `backgroundEvictorIntervalMilSec`: The interval at which this thread runs. By default,
+the background evictor threads will wake up every 10 ms to scan the AllocationClasses. Also,
+the background evictor thread will be woken up every time there is a failed allocation (from
+a request handling thread) and the current percentage of free memory for the
+AllocationClass is lower than `lowEvictionAcWatermark`. This may make the interval parameter
+less important when there are many allocations occurring from request handling threads.
+
+- `evictorThreads`: The number of background evictors to run. Each thread is assigned
+a set of AllocationClasses to scan and evict objects from. Currently, each thread gets
+an equal number of classes to scan, but as the object size distribution may be unequal, future
+versions will attempt to balance the classes among threads. The range is 1 to the number of
+AllocationClasses. The default is 1.
+
+- `maxEvictionBatch`: The number of objects to remove in a given eviction call. The
+default is 40; the lower bound is 10 and the upper bound is 1000. If it is too low, we might not
+remove objects at a reasonable rate; if it is too high, it might increase contention with user threads.
+
+- `minEvictionBatch`: Minimum number of items to evict at any time (if there are any
+candidates).
+
+- `maxEvictionPromotionHotness`: Maximum number of candidates to consider for eviction. This is similar to `maxEvictionBatch`,
+but it specifies how many candidates will be taken into consideration, not the actual number of items to evict.
+This option can be used to configure the duration of the critical section on the LRU lock.
+
+
+### FreeThresholdStrategy (default)
+
+- `lowEvictionAcWatermark`: Triggers the background eviction thread to run
+when this percentage of the AllocationClass is free.
+The default is `2.0`; to avoid wasting capacity, we don't set this above `10.0`.
+
+- `highEvictionAcWatermark`: Stop evictions from an AllocationClass when this
+percentage of the AllocationClass is free. The default is `5.0`; to avoid wasting capacity, we
+don't set this above `10.0`.
+
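+Below is a minimal sketch of how the eviction-related settings above could be wired
+into a cache config. The setter name (`enableBackgroundEvictor`), the include path, and
+the `FreeThresholdStrategy` constructor arguments are assumptions for illustration and
+may not match the exact API.
+
+```cpp
+#include <chrono>
+#include <memory>
+
+#include "cachelib/allocator/CacheAllocator.h"
+#include "cachelib/allocator/FreeThresholdStrategy.h"
+
+using Cache = facebook::cachelib::LruAllocator;
+
+void configureBackgroundEviction(Cache::Config& config) {
+  // Watermarks and batch sizes mirror the parameters documented above;
+  // the minEvictionBatch value here is purely illustrative.
+  auto strategy = std::make_shared<facebook::cachelib::FreeThresholdStrategy>(
+      2.0 /* lowEvictionAcWatermark */, 5.0 /* highEvictionAcWatermark */,
+      40 /* maxEvictionBatch */, 10 /* minEvictionBatch */);
+
+  // Hypothetical setter: one evictor thread waking up every 10 ms
+  // (backgroundEvictorIntervalMilSec).
+  config.enableBackgroundEvictor(strategy, std::chrono::milliseconds(10),
+                                 1 /* evictorThreads */);
+}
+```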
+
+## Background Promoters
+
+The background promoters scan each class to see if there are objects to move to an upper
+tier using a given strategy. Here we document the general parameters and the parameters
+for the different strategies; a configuration sketch follows at the end of this section.
+
+- `backgroundPromoterIntervalMilSec`: The interval at which this thread runs. By default,
+the background promoter threads will wake up every 10 ms to scan the AllocationClasses for
+objects to promote.
+
+- `promoterThreads`: The number of background promoters to run. Each thread is assigned
+a set of AllocationClasses to scan and promote objects from. Currently, each thread gets
+an equal number of classes to scan, but as the object size distribution may be unequal, future
+versions will attempt to balance the classes among threads. The range is `1` to the number of AllocationClasses. The default is `1`.
+
+- `maxPromotionBatch`: The number of objects to promote in a given promotion call. The
+default is 40; the lower bound is 10 and the upper bound is 1000. If it is too low, we might not
+promote objects at a reasonable rate; if it is too high, it might increase contention with user threads.
+
+- `minPromotionBatch`: Minimum number of items to promote at any time (if there are any
+candidates).
+
+- `numDuplicateElements`: This allows us to promote items that have existing (read-only) handles, since
+we won't need to modify the data once a user is done with it. Therefore, for a short time
+the data could reside in both tiers until it is evicted from its current tier. The default is to
+not allow this (0). Setting the value to 100 will enable duplicate elements across tiers.
+
+### Background Promotion Strategy (only one currently)
+
+- `promotionAcWatermark`: Promote items if at least this
+percentage of the AllocationClass is free. The promotion thread will attempt to move `maxPromotionBatch` objects
+to that tier. The objects are chosen from the head of the LRU. The default is `4.0`.
+This value should correlate with `lowEvictionAcWatermark`, `highEvictionAcWatermark`, `minAcAllocationWatermark`, and `maxAcAllocationWatermark`.
+- `maxPromotionBatch`: The number of objects to promote in a batch during background promotion. Analogous to
+`maxEvictionBatch`. Its value should be lower to decrease contention on hot items.
+
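+Below is a matching sketch for the promotion side. As with the eviction example, the
+setter name (`enableBackgroundPromoter`) and the strategy type (`PromotionStrategy`)
+are assumptions used only to illustrate how the parameters above fit together.
+
+```cpp
+#include <chrono>
+#include <memory>
+
+#include "cachelib/allocator/CacheAllocator.h"
+
+using Cache = facebook::cachelib::LruAllocator;
+
+void configureBackgroundPromotion(Cache::Config& config) {
+  // Hypothetical strategy mirroring promotionAcWatermark and the batch sizes
+  // documented above (kept lower than the eviction batch).
+  auto strategy = std::make_shared<facebook::cachelib::PromotionStrategy>(
+      4.0 /* promotionAcWatermark */, 10 /* maxPromotionBatch */,
+      5 /* minPromotionBatch */);
+
+  // One promoter thread waking up every 10 ms (backgroundPromoterIntervalMilSec).
+  config.enableBackgroundPromoter(strategy, std::chrono::milliseconds(10),
+                                  1 /* promoterThreads */);
+}
+```
+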
+## Allocation policies
+
+- `maxAcAllocationWatermark`: An item is always allocated in the topmost tier if at least this
+percentage of the AllocationClass is free.
+- `minAcAllocationWatermark`: An item is always allocated in the bottom tier if at most this percentage
+of the AllocationClass is free. If the percentage of free space is between `maxAcAllocationWatermark`
+and `minAcAllocationWatermark`, then extra checks (described below) are performed to decide where to put the element.
+
+By default, allocation will always be performed from the upper tier.
+
+- `acTopTierEvictionWatermark`: If less than this percentage of memory is free in the topmost tier, CacheLib will attempt to evict from the top tier. This option takes precedence over the allocation watermarks.
+
+### Extra policies
+
+These policies are used only when the percentage of free space is between `maxAcAllocationWatermark`
+and `minAcAllocationWatermark` (see the sketch below).
+- `sizeThresholdPolicy`: If an item is smaller than this value, always allocate it in the upper tier.
+- `defaultTierChancePercentage`: Chance (0-100%) of allocating the item in the top tier.
+
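+The snippet below illustrates how the watermarks and extra policies combine when
+choosing the target tier for a new allocation. It mirrors the allocator's
+`getTargetTierForItem()` logic in this patch; the standalone function and its
+parameters are illustrative only.
+
+```cpp
+#include <cstdint>
+#include <random>
+
+// Returns 0 for the top (upper) tier, 1 for the bottom (lower) tier.
+int pickTargetTier(double freePct,         // % of the top-tier AllocationClass that is free
+                   uint32_t requiredSize,  // bytes needed for the new item
+                   double maxAcAllocationWatermark,
+                   double minAcAllocationWatermark,
+                   uint64_t sizeThresholdPolicy,
+                   double defaultTierChancePercentage) {
+  if (freePct >= maxAcAllocationWatermark) return 0;  // plenty of room on top
+  if (freePct <= minAcAllocationWatermark) return 1;  // top tier nearly full
+  // Between the watermarks: apply the extra policies.
+  if (sizeThresholdPolicy != 0)
+    return requiredSize < sizeThresholdPolicy ? 0 : 1;  // small items stay on top
+  static thread_local std::mt19937 gen{std::random_device{}()};
+  return std::uniform_real_distribution<double>(0.0, 100.0)(gen) <
+                 defaultTierChancePercentage
+             ? 0
+             : 1;
+}
+```
+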
+## MMContainer options
+
+- `lruInsertionPointSpec`: Can be set per tier when LRU2Q is used. Determines where new items are
+inserted: 0 = insert to hot queue, 1 = insert to warm queue, 2 = insert to cold queue.
+- `markUsefulChance`: Set per tier; determines the chance of moving an item to the head of the LRU on access.
+
diff --git a/cachelib/CMakeLists.txt b/cachelib/CMakeLists.txt
index 36df0dc19f..e77c25085c 100644
--- a/cachelib/CMakeLists.txt
+++ b/cachelib/CMakeLists.txt
@@ -85,6 +85,11 @@ set(CMAKE_MODULE_PATH
set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CXX_STANDARD_REQUIRED True)
+if(COVERAGE_ENABLED)
+ # Add code coverage
+ set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} --coverage -fprofile-arcs -ftest-coverage")
+endif()
+
# include(fb_cxx_flags)
message(STATUS "Update CXXFLAGS: ${CMAKE_CXX_FLAGS}")
diff --git a/cachelib/allocator/BackgroundMover-inl.h b/cachelib/allocator/BackgroundMover-inl.h
new file mode 100644
index 0000000000..b77436635f
--- /dev/null
+++ b/cachelib/allocator/BackgroundMover-inl.h
@@ -0,0 +1,112 @@
+/*
+ * Copyright (c) Intel and its affiliates.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+namespace facebook {
+namespace cachelib {
+
+template <typename CacheT>
+BackgroundMover<CacheT>::BackgroundMover(
+ Cache& cache,
+ std::shared_ptr<BackgroundMoverStrategy> strategy,
+ MoverDir direction)
+ : cache_(cache), strategy_(strategy), direction_(direction) {
+ if (direction_ == MoverDir::Evict) {
+ moverFunc = BackgroundMoverAPIWrapper<CacheT>::traverseAndEvictItems;
+
+ } else if (direction_ == MoverDir::Promote) {
+ moverFunc = BackgroundMoverAPIWrapper<CacheT>::traverseAndPromoteItems;
+ }
+}
+
+template <typename CacheT>
+BackgroundMover<CacheT>::~BackgroundMover() {
+ stop(std::chrono::seconds(0));
+}
+
+template <typename CacheT>
+void BackgroundMover<CacheT>::work() {
+ try {
+ checkAndRun();
+ } catch (const std::exception& ex) {
+ XLOGF(ERR, "BackgroundMover interrupted due to exception: {}", ex.what());
+ }
+}
+
+template <typename CacheT>
+void BackgroundMover<CacheT>::setAssignedMemory(
+ std::vector<MemoryDescriptorType>&& assignedMemory) {
+ XLOG(INFO, "Class assigned to background worker:");
+ for (auto [tid, pid, cid] : assignedMemory) {
+ XLOGF(INFO, "Tid: {}, Pid: {}, Cid: {}", tid, pid, cid);
+ }
+
+ mutex.lock_combine([this, &assignedMemory] {
+ this->assignedMemory_ = std::move(assignedMemory);
+ });
+}
+
+// Look for classes that exceed the target memory capacity
+// and return those for eviction
+template <typename CacheT>
+void BackgroundMover<CacheT>::checkAndRun() {
+ auto assignedMemory = mutex.lock_combine([this] { return assignedMemory_; });
+
+ unsigned int moves = 0;
+ std::set<ClassId> classes{};
+ auto batches = strategy_->calculateBatchSizes(cache_, assignedMemory);
+
+ for (size_t i = 0; i < batches.size(); i++) {
+ const auto [tid, pid, cid] = assignedMemory[i];
+ const auto batch = batches[i];
+
+ classes.insert(cid);
+ const auto& mpStats = cache_.getPoolByTid(pid, tid).getStats();
+
+ if (!batch) {
+ continue;
+ }
+
+ // try moving BATCH items from the class in order to reach free target
+ auto moved = moverFunc(cache_, tid, pid, cid, batch);
+ moves += moved;
+ moves_per_class_[tid][pid][cid] += moved;
+ totalBytesMoved.add(moved * mpStats.acStats.at(cid).allocSize);
+ }
+
+ numTraversals.inc();
+ numMovedItems.add(moves);
+ totalClasses.add(classes.size());
+}
+
+template <typename CacheT>
+BackgroundMoverStats BackgroundMover<CacheT>::getStats() const noexcept {
+ BackgroundMoverStats stats;
+ stats.numMovedItems = numMovedItems.get();
+ stats.runCount = numTraversals.get();
+ stats.totalBytesMoved = totalBytesMoved.get();
+ stats.totalClasses = totalClasses.get();
+
+ return stats;
+}
+
+template <typename CacheT>
+std::map<TierId, std::map<PoolId, std::map<ClassId, uint64_t>>>
+BackgroundMover<CacheT>::getClassStats() const noexcept {
+ return moves_per_class_;
+}
+
+} // namespace cachelib
+} // namespace facebook
diff --git a/cachelib/allocator/BackgroundMover.h b/cachelib/allocator/BackgroundMover.h
new file mode 100644
index 0000000000..1246676d6e
--- /dev/null
+++ b/cachelib/allocator/BackgroundMover.h
@@ -0,0 +1,103 @@
+/*
+ * Copyright (c) Intel and its affiliates.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "cachelib/allocator/BackgroundMoverStrategy.h"
+#include "cachelib/allocator/CacheStats.h"
+#include "cachelib/common/AtomicCounter.h"
+#include "cachelib/common/PeriodicWorker.h"
+
+namespace facebook {
+namespace cachelib {
+
+// wrapper that exposes the private APIs of CacheType that are specifically
+// needed for the cache api
+template <typename C>
+struct BackgroundMoverAPIWrapper {
+ static size_t traverseAndEvictItems(C& cache,
+ unsigned int tid,
+ unsigned int pid,
+ unsigned int cid,
+ size_t batch) {
+ return cache.traverseAndEvictItems(tid, pid, cid, batch);
+ }
+
+ static size_t traverseAndPromoteItems(C& cache,
+ unsigned int tid,
+ unsigned int pid,
+ unsigned int cid,
+ size_t batch) {
+ return cache.traverseAndPromoteItems(tid, pid, cid, batch);
+ }
+};
+
+enum class MoverDir { Evict = 0, Promote };
+
+// Periodic worker that evicts items from tiers in batches
+// The primary aim is to reduce insertion times for new items in the
+// cache
+template <typename CacheT>
+class BackgroundMover : public PeriodicWorker {
+ public:
+ using Cache = CacheT;
+ // @param cache the cache interface
+ // @param strategy the strategy class that defines how objects are
+ // moved,
+ // (promoted vs. evicted and how much)
+ BackgroundMover(Cache& cache,
+ std::shared_ptr<BackgroundMoverStrategy> strategy,
+ MoverDir direction_);
+
+ ~BackgroundMover() override;
+
+ BackgroundMoverStats getStats() const noexcept;
+ std::map<TierId, std::map<PoolId, std::map<ClassId, uint64_t>>>
+ getClassStats() const noexcept;
+
+ void setAssignedMemory(
+ std::vector<MemoryDescriptorType>&& assignedMemory);
+
+ private:
+ std::map<TierId, std::map<PoolId, std::map<ClassId, uint64_t>>>
+ moves_per_class_;
+ // cache allocator's interface for evicting
+ using Item = typename Cache::Item;
+
+ Cache& cache_;
+ std::shared_ptr<BackgroundMoverStrategy> strategy_;
+ MoverDir direction_;
+
+ std::function<size_t(Cache&, unsigned int, unsigned int, unsigned int, size_t)>
+ moverFunc;
+
+ // implements the actual logic of running the background evictor
+ void work() override final;
+ void checkAndRun();
+
+ AtomicCounter numMovedItems{0};
+ AtomicCounter numTraversals{0};
+ AtomicCounter totalClasses{0};
+ AtomicCounter totalBytesMoved{0};
+
+ std::vector<MemoryDescriptorType> assignedMemory_;
+ folly::DistributedMutex mutex;
+};
+} // namespace cachelib
+} // namespace facebook
+
+#include "cachelib/allocator/BackgroundMover-inl.h"
diff --git a/cachelib/allocator/BackgroundMoverStrategy.h b/cachelib/allocator/BackgroundMoverStrategy.h
new file mode 100644
index 0000000000..7706a625a5
--- /dev/null
+++ b/cachelib/allocator/BackgroundMoverStrategy.h
@@ -0,0 +1,42 @@
+/*
+ * Copyright (c) Facebook, Inc. and its affiliates.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "cachelib/allocator/Cache.h"
+
+
+namespace facebook {
+namespace cachelib {
+
+struct MemoryDescriptorType {
+ MemoryDescriptorType(TierId tid, PoolId pid, ClassId cid) :
+ tid_(tid), pid_(pid), cid_(cid) {}
+ TierId tid_;
+ PoolId pid_;
+ ClassId cid_;
+};
+
+// Base class for background eviction strategy.
+class BackgroundMoverStrategy {
+ public:
+ virtual std::vector<size_t> calculateBatchSizes(
+ const CacheBase& cache,
+ std::vector<MemoryDescriptorType> acVec) = 0;
+};
+
+} // namespace cachelib
+} // namespace facebook
diff --git a/cachelib/allocator/CCacheAllocator.cpp b/cachelib/allocator/CCacheAllocator.cpp
index 2709bde377..dd1986114b 100644
--- a/cachelib/allocator/CCacheAllocator.cpp
+++ b/cachelib/allocator/CCacheAllocator.cpp
@@ -36,7 +36,9 @@ CCacheAllocator::CCacheAllocator(MemoryAllocator& allocator,
currentChunksIndex_(0) {
auto& currentChunks = chunks_[currentChunksIndex_];
for (auto chunk : *object.chunks()) {
- currentChunks.push_back(allocator_.unCompress(CompressedPtr(chunk)));
+ // TODO : pass multi-tier flag when compact cache supports multi-tier config
+ currentChunks.push_back(
+ allocator_.unCompress(CompressedPtr(chunk), false /* isMultiTier */));
}
}
@@ -97,7 +99,9 @@ CCacheAllocator::SerializationType CCacheAllocator::saveState() {
std::lock_guard guard(resizeLock_);
for (auto chunk : getCurrentChunks()) {
- object.chunks()->push_back(allocator_.compress(chunk).saveState());
+ // TODO : pass multi-tier flag when compact cache supports multi-tier config
+ object.chunks()->push_back(
+ allocator_.compress(chunk, false /* isMultiTier */).saveState());
}
return object;
}
diff --git a/cachelib/allocator/CMakeLists.txt b/cachelib/allocator/CMakeLists.txt
index 78cfa7ca06..6103cdc823 100644
--- a/cachelib/allocator/CMakeLists.txt
+++ b/cachelib/allocator/CMakeLists.txt
@@ -35,6 +35,7 @@ add_library (cachelib_allocator
CCacheManager.cpp
ContainerTypes.cpp
FreeMemStrategy.cpp
+ FreeThresholdStrategy.cpp
HitsPerSlabStrategy.cpp
LruTailAgeStrategy.cpp
MarginalHitsOptimizeStrategy.cpp
@@ -117,6 +118,8 @@ if (BUILD_TESTS)
add_test (tests/ChainedHashTest.cpp)
add_test (tests/AllocatorResizeTypeTest.cpp)
add_test (tests/AllocatorHitStatsTypeTest.cpp)
+ add_test (tests/AllocatorMemoryTiersTest.cpp)
+ add_test (tests/MemoryTiersTest.cpp)
add_test (tests/MultiAllocatorTest.cpp)
add_test (tests/NvmAdmissionPolicyTest.cpp)
add_test (tests/CacheAllocatorConfigTest.cpp)
diff --git a/cachelib/allocator/Cache.h b/cachelib/allocator/Cache.h
index e225ba8a01..c871358189 100644
--- a/cachelib/allocator/Cache.h
+++ b/cachelib/allocator/Cache.h
@@ -85,6 +85,9 @@ class CacheBase {
CacheBase(CacheBase&&) = default;
CacheBase& operator=(CacheBase&&) = default;
+ // TODO: come up with some reasonable number
+ static constexpr unsigned kMaxTiers = 2;
+
// Get a string referring to the cache name for this cache
virtual const std::string getCacheName() const = 0;
@@ -95,6 +98,12 @@ class CacheBase {
//
// @param poolId The pool id to query
virtual const MemoryPool& getPool(PoolId poolId) const = 0;
+
+ // Get the reference to a memory pool using a tier id, for stats purposes
+ //
+ // @param poolId The pool id to query
+ // @param tierId The tier of the pool id
+ virtual const MemoryPool& getPoolByTid(PoolId poolId, TierId tid) const = 0;
// Get Pool specific stats (regular pools). This includes stats from the
// Memory Pool and also the cache.
@@ -102,6 +111,12 @@ class CacheBase {
// @param poolId the pool id
virtual PoolStats getPoolStats(PoolId poolId) const = 0;
+ // Get Allocation Class specific stats.
+ //
+ // @param poolId the pool id
+ // @param classId the class id
+ virtual ACStats getACStats(TierId tid, PoolId poolId, ClassId classId) const = 0;
+
// @param poolId the pool id
virtual AllSlabReleaseEvents getAllSlabReleaseEvents(PoolId poolId) const = 0;
diff --git a/cachelib/allocator/CacheAllocator-inl.h b/cachelib/allocator/CacheAllocator-inl.h
index 1d89593268..614f031aeb 100644
--- a/cachelib/allocator/CacheAllocator-inl.h
+++ b/cachelib/allocator/CacheAllocator-inl.h
@@ -16,6 +16,8 @@
#pragma once
+#include
+
namespace facebook {
namespace cachelib {
@@ -35,6 +37,7 @@ CacheAllocator::CacheAllocator(SharedMemNewT, Config config)
template
CacheAllocator::CacheAllocator(SharedMemAttachT, Config config)
: CacheAllocator(InitMemType::kMemAttach, config) {
+ /* TODO - per tier? */
for (auto pid : *metadata_.compactCachePools()) {
isCompactCachePool_[pid] = true;
}
@@ -53,12 +56,13 @@ CacheAllocator::CacheAllocator(
: isOnShm_{type != InitMemType::kNone ? true
: config.memMonitoringEnabled()},
config_(config.validate()),
+ memoryTierConfigs(config.getMemoryTierConfigs()),
tempShm_(type == InitMemType::kNone && isOnShm_
- ? std::make_unique(config_.size)
+ ? std::make_unique(config_.getCacheSize())
: nullptr),
shmManager_(type != InitMemType::kNone
? std::make_unique(config_.cacheDir,
- config_.usePosixShm)
+ config_.isUsingPosixShm())
: nullptr),
deserializer_(type == InitMemType::kMemAttach ? createDeserializer()
: nullptr),
@@ -67,12 +71,12 @@ CacheAllocator::CacheAllocator(
: serialization::CacheAllocatorMetadata{}},
allocator_(initAllocator(type)),
compactCacheManager_(type != InitMemType::kMemAttach
- ? std::make_unique(*allocator_)
- : restoreCCacheManager()),
+ ? std::make_unique(*allocator_[0] /* TODO: per tier */)
+ : restoreCCacheManager(0/* TODO: per tier */)),
compressor_(createPtrCompressor()),
mmContainers_(type == InitMemType::kMemAttach
? deserializeMMContainers(*deserializer_, compressor_)
- : MMContainers{}),
+ : MMContainers{getNumTiers()}),
accessContainer_(initAccessContainer(
type, detail::kShmHashTableName, config.accessConfig)),
chainedItemAccessContainer_(
@@ -81,6 +85,8 @@ CacheAllocator::CacheAllocator(
config.chainedItemAccessConfig)),
chainedItemLocks_(config_.chainedItemsLockPower,
std::make_shared()),
+ movesMap_(kShards),
+ moveLock_(kShards),
cacheCreationTime_{
type != InitMemType::kMemAttach
? util::getCurrentTimeSec()
@@ -105,48 +111,98 @@ CacheAllocator::~CacheAllocator() {
}
template
-ShmSegmentOpts CacheAllocator::createShmCacheOpts() {
+ShmSegmentOpts CacheAllocator::createShmCacheOpts(TierId tid) {
ShmSegmentOpts opts;
opts.alignment = sizeof(Slab);
auto memoryTierConfigs = config_.getMemoryTierConfigs();
// TODO: we support single tier so far
- XDCHECK_EQ(memoryTierConfigs.size(), 1ul);
- opts.memBindNumaNodes = memoryTierConfigs[0].getMemBind();
-
+ if (memoryTierConfigs.size() > 2) {
+ throw std::invalid_argument("CacheLib only supports two memory tiers");
+ }
+ opts.memBindNumaNodes = memoryTierConfigs[tid].getMemBind();
return opts;
}
+template
+size_t CacheAllocator::memoryTierSize(TierId tid) const {
+ auto partitions = std::accumulate(memoryTierConfigs.begin(), memoryTierConfigs.end(), 0UL,
+ [](const size_t i, const MemoryTierCacheConfig& config){
+ return i + config.getRatio();
+ });
+
+ return memoryTierConfigs[tid].calculateTierSize(config_.getCacheSize(), partitions);
+}
+
+template
+std::vector>
+CacheAllocator::createPrivateAllocator() {
+ std::vector> allocators;
+
+ if (isOnShm_)
+ allocators.emplace_back(std::make_unique(
+ getAllocatorConfig(config_),
+ tempShm_->getAddr(),
+ config_.getCacheSize()));
+ else
+ allocators.emplace_back(std::make_unique(
+ getAllocatorConfig(config_),
+ config_.getCacheSize()));
+
+ return allocators;
+}
+
template
std::unique_ptr
-CacheAllocator::createNewMemoryAllocator() {
+CacheAllocator::createNewMemoryAllocator(TierId tid) {
+ size_t tierSize = memoryTierSize(tid);
return std::make_unique(
getAllocatorConfig(config_),
shmManager_
- ->createShm(detail::kShmCacheName, config_.size,
- config_.slabMemoryBaseAddr, createShmCacheOpts())
+ ->createShm(detail::kShmCacheName + std::to_string(tid),
+ tierSize, config_.slabMemoryBaseAddr,
+ createShmCacheOpts(tid))
.addr,
- config_.size);
+ tierSize);
}
template
std::unique_ptr
-CacheAllocator::restoreMemoryAllocator() {
+CacheAllocator::restoreMemoryAllocator(TierId tid) {
return std::make_unique(
deserializer_->deserialize(),
shmManager_
- ->attachShm(detail::kShmCacheName, config_.slabMemoryBaseAddr,
- createShmCacheOpts())
- .addr,
- config_.size,
+ ->attachShm(detail::kShmCacheName + std::to_string(tid),
+ config_.slabMemoryBaseAddr, createShmCacheOpts(tid)).addr,
+ memoryTierSize(tid),
config_.disableFullCoredump);
}
+template
+std::vector>
+CacheAllocator::createAllocators() {
+ std::vector> allocators;
+ for (int tid = 0; tid < getNumTiers(); tid++) {
+ allocators.emplace_back(createNewMemoryAllocator(tid));
+ }
+ return allocators;
+}
+
+template
+std::vector>
+CacheAllocator::restoreAllocators() {
+ std::vector> allocators;
+ for (int tid = 0; tid < getNumTiers(); tid++) {
+ allocators.emplace_back(restoreMemoryAllocator(tid));
+ }
+ return allocators;
+}
+
template
std::unique_ptr
-CacheAllocator::restoreCCacheManager() {
+CacheAllocator::restoreCCacheManager(TierId tid) {
return std::make_unique(
deserializer_->deserialize(),
- *allocator_);
+ *allocator_[tid]);
}
template
@@ -235,23 +291,30 @@ void CacheAllocator::initWorkers() {
config_.poolOptimizeStrategy,
config_.ccacheOptimizeStepSizePercent);
}
+
+ if (config_.backgroundEvictorEnabled()) {
+ startNewBackgroundEvictor(config_.backgroundEvictorInterval,
+ config_.backgroundEvictorStrategy,
+ config_.backgroundEvictorThreads);
+ }
+
+ if (config_.backgroundPromoterEnabled()) {
+ startNewBackgroundPromoter(config_.backgroundPromoterInterval,
+ config_.backgroundPromoterStrategy,
+ config_.backgroundPromoterThreads);
+ }
}
template
-std::unique_ptr CacheAllocator::initAllocator(
+std::vector>
+CacheAllocator::initAllocator(
InitMemType type) {
if (type == InitMemType::kNone) {
- if (isOnShm_ == true) {
- return std::make_unique(
- getAllocatorConfig(config_), tempShm_->getAddr(), config_.size);
- } else {
- return std::make_unique(getAllocatorConfig(config_),
- config_.size);
- }
+ return createPrivateAllocator();
} else if (type == InitMemType::kMemNew) {
- return createNewMemoryAllocator();
+ return createAllocators();
} else if (type == InitMemType::kMemAttach) {
- return restoreMemoryAllocator();
+ return restoreAllocators();
}
// Invalid type
@@ -318,13 +381,31 @@ CacheAllocator::allocate(PoolId poolId,
ttlSecs == 0 ? 0 : creationTime + ttlSecs);
}
+template
+bool CacheAllocator::shouldWakeupBgEvictor(TierId tid, PoolId pid, ClassId cid) {
+ // TODO: should we also work on lower tiers? should we have separate set of params?
+ if (tid == 1) return false;
+ return (1-getACStats(tid, pid, cid).usageFraction())*100 <= config_.lowEvictionAcWatermark;
+}
+
+template
+size_t CacheAllocator::backgroundWorkerId(TierId tid, PoolId pid, ClassId cid, size_t numWorkers) {
+ XDCHECK(numWorkers);
+
+ // TODO: come up with some better sharding (use some hashing)
+ return (tid + pid + cid) % numWorkers;
+}
+
+
template
typename CacheAllocator::WriteHandle
-CacheAllocator::allocateInternal(PoolId pid,
- typename Item::Key key,
- uint32_t size,
- uint32_t creationTime,
- uint32_t expiryTime) {
+CacheAllocator::allocateInternalTier(TierId tid,
+ PoolId pid,
+ typename Item::Key key,
+ uint32_t size,
+ uint32_t creationTime,
+ uint32_t expiryTime,
+ bool fromBgThread) {
util::LatencyTracker tracker{stats().allocateLatency_};
SCOPE_FAIL { stats_.invalidAllocs.inc(); };
@@ -333,13 +414,32 @@ CacheAllocator::allocateInternal(PoolId pid,
const auto requiredSize = Item::getRequiredSize(key, size);
// the allocation class in our memory allocator.
- const auto cid = allocator_->getAllocationClassId(pid, requiredSize);
+ const auto cid = allocator_[tid]->getAllocationClassId(pid, requiredSize);
+ util::RollingLatencyTracker rollTracker{
+ (*stats_.classAllocLatency)[tid][pid][cid]};
+ // TODO: per-tier
(*stats_.allocAttempts)[pid][cid].inc();
+
+ void *memory = nullptr;
+
+ if (tid == 0 && config_.acTopTierEvictionWatermark > 0.0
+ && 100.0 * (1 - getACStats(tid, pid, cid).usageFraction()) < config_.acTopTierEvictionWatermark) {
+ memory = findEviction(tid, pid, cid);
+ }
- void* memory = allocator_->allocate(pid, requiredSize);
if (memory == nullptr) {
- memory = findEviction(pid, cid);
+ // TODO: should we try allocate item even if this will result in violating
+ // acTopTierEvictionWatermark?
+ memory = allocator_[tid]->allocate(pid, requiredSize);
+ }
+
+ if (backgroundEvictor_.size() && !fromBgThread && (memory == nullptr || shouldWakeupBgEvictor(tid, pid, cid))) {
+ backgroundEvictor_[backgroundWorkerId(tid, pid, cid, backgroundEvictor_.size())]->wakeUp();
+ }
+
+ if (memory == nullptr) {
+ memory = findEviction(tid, pid, cid);
}
WriteHandle handle;
@@ -350,7 +450,7 @@ CacheAllocator::allocateInternal(PoolId pid,
// for example.
SCOPE_FAIL {
// free back the memory to the allocator since we failed.
- allocator_->free(memory);
+ allocator_[tid]->free(memory);
};
handle = acquire(new (memory) Item(key, size, creationTime, expiryTime));
@@ -361,7 +461,7 @@ CacheAllocator::allocateInternal(PoolId pid,
}
} else { // failed to allocate memory.
- (*stats_.allocFailures)[pid][cid].inc();
+ (*stats_.allocFailures)[pid][cid].inc(); // TODO: per-tier
// wake up rebalancer
if (poolRebalancer_) {
poolRebalancer_->wakeUp();
@@ -378,6 +478,90 @@ CacheAllocator::allocateInternal(PoolId pid,
return handle;
}
+template
+TierId
+CacheAllocator::getTargetTierForItem(PoolId pid,
+ typename Item::Key key,
+ uint32_t size,
+ uint32_t creationTime,
+ uint32_t expiryTime) {
+ if (getNumTiers() == 1)
+ return 0;
+
+ if (config_.forceAllocationTier != UINT64_MAX) {
+ return config_.forceAllocationTier;
+ }
+
+ const TierId defaultTargetTier = 0;
+
+ const auto requiredSize = Item::getRequiredSize(key, size);
+ const auto cid = allocator_[defaultTargetTier]->getAllocationClassId(pid, requiredSize);
+
+ auto freePercentage = 100.0 * (1 - getACStats(defaultTargetTier, pid, cid).usageFraction());
+
+ // TODO: COULD we implement BG worker which would move slabs around
+ // so that there is similar amount of free space in each pool/ac.
+ // Should this be responsibility of BG evictor?
+
+ if (freePercentage >= config_.maxAcAllocationWatermark)
+ return defaultTargetTier;
+
+ if (freePercentage <= config_.minAcAllocationWatermark)
+ return defaultTargetTier + 1;
+
+ // TODO: we can think about creating different allocation classes for different memory tiers
+ // and we could look at possible fragmentation when deciding where to put the item
+ if (config_.sizeThresholdPolicy)
+ return requiredSize < config_.sizeThresholdPolicy ? defaultTargetTier : defaultTargetTier + 1;
+
+ // TODO: (e.g. always put chained items to second tier)
+ // if (chainedItemsPolicy)
+ // return item.isChainedItem() ? defaultTargetTier + 1 : defaultTargetTier;
+
+ // TODO:
+ // if (expiryTimePolicy)
+ // return (expiryTime - creationTime) < expiryTimePolicy ? defaultTargetTier : defaultTargetTier + 1;
+
+ // TODO:
+ // if (keyPolicy) // this can be based on key length or some other properties
+ // return getTargetTierForKey(key);
+
+ // TODO:
+ // if (compressabilityPolicy) // if compresses well store on CXL? latency will be higher anyway
+ // return TODO;
+
+ if (config_.defaultTierChancePercentage >= 100.00) {
+ return 0;
+ }
+
+ return (folly::Random::rand32() % 100) < config_.defaultTierChancePercentage ? defaultTargetTier : defaultTargetTier + 1;
+}
+
+template
+bool CacheAllocator::shouldEvictToNextMemoryTier(
+ TierId sourceTierId, TierId targetTierId, PoolId pid, Item& item)
+{
+ if (config_.disableEvictionToMemory)
+ return false;
+
+ // TODO: implement more advanced admission policies for memory tiers, for example:
+ // - always evict big items to NVMe only
+ // - do not evict an item if pressure on target Tier is high
+ return true;
+}
+
+template
+typename CacheAllocator::WriteHandle
+CacheAllocator::allocateInternal(PoolId pid,
+ typename Item::Key key,
+ uint32_t size,
+ uint32_t creationTime,
+ uint32_t expiryTime,
+ bool fromBgThread) {
+ auto tid = getTargetTierForItem(pid, key, size, creationTime, expiryTime);
+ return allocateInternalTier(tid, pid, key, size, creationTime, expiryTime, fromBgThread);
+}
+
template
typename CacheAllocator::WriteHandle
CacheAllocator::allocateChainedItem(const ReadHandle& parent,
@@ -408,21 +592,29 @@ CacheAllocator::allocateChainedItemInternal(
// number of bytes required for this item
const auto requiredSize = ChainedItem::getRequiredSize(size);
- const auto pid = allocator_->getAllocInfo(parent->getMemory()).poolId;
- const auto cid = allocator_->getAllocationClassId(pid, requiredSize);
+ // TODO: is this correct?
+ auto tid = getTierId(*parent);
+ const auto pid = allocator_[tid]->getAllocInfo(parent->getMemory()).poolId;
+ const auto cid = allocator_[tid]->getAllocationClassId(pid, requiredSize);
+
+ util::RollingLatencyTracker rollTracker{
+ (*stats_.classAllocLatency)[tid][pid][cid]};
+
+ // TODO: per-tier? Right now stats_ are not used in any public periodic
+ // worker
(*stats_.allocAttempts)[pid][cid].inc();
- void* memory = allocator_->allocate(pid, requiredSize);
+ void* memory = allocator_[tid]->allocate(pid, requiredSize);
if (memory == nullptr) {
- memory = findEviction(pid, cid);
+ memory = findEviction(tid, pid, cid);
}
if (memory == nullptr) {
(*stats_.allocFailures)[pid][cid].inc();
return WriteHandle{};
}
- SCOPE_FAIL { allocator_->free(memory); };
+ SCOPE_FAIL { allocator_[tid]->free(memory); };
auto child = acquire(
new (memory) ChainedItem(compressor_.compress(parent.getInternal()), size,
@@ -466,14 +658,15 @@ void CacheAllocator::addChainedItem(WriteHandle& parent,
// Count a new child
stats_.numChainedChildItems.inc();
- insertInMMContainer(*child);
-
// Increment refcount since this chained item is now owned by the parent
// Parent will decrement the refcount upon release. Since this is an
// internal refcount, we dont include it in active handle tracking.
- child->incRef();
+ auto ret = child->incRef(true);
+ XDCHECK(ret == RefcountWithFlags::incResult::incOk);
XDCHECK_EQ(2u, child->getRefCount());
+ insertInMMContainer(*child);
+
invalidateNvm(*parent);
if (auto eventTracker = getEventTracker()) {
eventTracker->record(AllocatorApiEvent::ADD_CHAINED, parent->getKey(),
@@ -717,7 +910,8 @@ CacheAllocator::replaceChainedItemLocked(Item& oldItem,
// Since this is an internal refcount, we dont include it in active handle
// tracking.
- newItemHdl->incRef();
+ auto ret = newItemHdl->incRef(true);
+ XDCHECK(ret == RefcountWithFlags::incResult::incOk);
return oldItemHdl;
}
@@ -731,8 +925,8 @@ CacheAllocator::releaseBackToAllocator(Item& it,
throw std::runtime_error(
folly::sformat("cannot release this item: {}", it.toString()));
}
-
- const auto allocInfo = allocator_->getAllocInfo(it.getMemory());
+ const auto tid = getTierId(it);
+ const auto allocInfo = allocator_[tid]->getAllocInfo(it.getMemory());
if (ctx == RemoveContext::kEviction) {
const auto timeNow = util::getCurrentTimeSec();
@@ -756,8 +950,7 @@ CacheAllocator::releaseBackToAllocator(Item& it,
folly::sformat("Can not recycle a chained item {}, toRecyle",
it.toString(), toRecycle->toString()));
}
-
- allocator_->free(&it);
+ allocator_[tid]->free(&it);
return ReleaseRes::kReleased;
}
@@ -826,26 +1019,29 @@ CacheAllocator::releaseBackToAllocator(Item& it,
auto next = head->getNext(compressor_);
const auto childInfo =
- allocator_->getAllocInfo(static_cast(head));
+ allocator_[tid]->getAllocInfo(static_cast(head));
(*stats_.fragmentationSize)[childInfo.poolId][childInfo.classId].sub(
util::getFragmentation(*this, *head));
removeFromMMContainer(*head);
- // If this chained item is marked as exclusive, we will not free it.
- // We must capture the exclusive state before we do the decRef when
- // we know the item must still be valid
- const bool wasExclusive = head->isExclusive();
+ // If this chained item is marked as moving, we will not free it.
+ // We must capture the moving state before we do the decRef when
+ // we know the item must still be valid. Item cannot be marked as
+ // exclusive. Only parent can be marked as such and even parent needs
+ // to be unmark prior to calling releaseBackToAllocator.
+ const bool wasMoving = head->isMoving();
+ XDCHECK(!head->isMarkedForEviction());
// Decref and check if we were the last reference. Now if the item
- // was marked exclusive, after decRef, it will be free to be released
+ // was marked moving, after decRef, it will be free to be released
// by slab release thread
const auto childRef = head->decRef();
- // If the item is already exclusive and we already decremented the
+ // If the item is already moving and we already decremented the
// refcount, we don't need to free this item. We'll let the slab
// release thread take care of that
- if (!wasExclusive) {
+ if (!wasMoving) {
if (childRef != 0) {
throw std::runtime_error(folly::sformat(
"chained item refcount is not zero. We cannot proceed! "
@@ -853,13 +1049,13 @@ CacheAllocator::releaseBackToAllocator(Item& it,
childRef, head->toString()));
}
- // Item is not exclusive and refcount is 0, we can proceed to
+ // Item is not moving and refcount is 0, we can proceed to
// free it or recylce the memory
if (head == toRecycle) {
XDCHECK(ReleaseRes::kReleased != res);
res = ReleaseRes::kRecycled;
} else {
- allocator_->free(head);
+ allocator_[tid]->free(head);
}
}
@@ -874,16 +1070,19 @@ CacheAllocator::releaseBackToAllocator(Item& it,
res = ReleaseRes::kRecycled;
} else {
XDCHECK(it.isDrained());
- allocator_->free(&it);
+ allocator_[tid]->free(&it);
}
return res;
}
template
-void CacheAllocator::incRef(Item& it) {
- it.incRef();
- ++handleCount_.tlStats();
+RefcountWithFlags::incResult CacheAllocator::incRef(Item& it, bool failIfMoving) {
+ auto ret = it.incRef(failIfMoving);
+ if (ret == RefcountWithFlags::incResult::incOk) {
+ ++handleCount_.tlStats();
+ }
+ return ret;
}
template
@@ -903,8 +1102,18 @@ CacheAllocator::acquire(Item* it) {
SCOPE_FAIL { stats_.numRefcountOverflow.inc(); };
- incRef(*it);
- return WriteHandle{it, *this};
+ // TODO: do not block incRef for child items to avoid deadlock
+ auto failIfMoving = getNumTiers() > 1 && !it->isChainedItem();
+ auto incRes = incRef(*it, failIfMoving);
+ if (LIKELY(incRes == RefcountWithFlags::incResult::incOk)) {
+ return WriteHandle{it, *this};
+ } else if (incRes == RefcountWithFlags::incResult::incFailedEviction){
+ // item is being evicted
+ return WriteHandle{};
+ } else {
+ // item is being moved - wait for completion
+ return handleWithWaitContextForMovingItem(*it);
+ }
}
template
@@ -946,6 +1155,25 @@ bool CacheAllocator::replaceInMMContainer(Item& oldItem,
}
}
+template
+bool CacheAllocator::replaceInMMContainer(Item* oldItem,
+ Item& newItem) {
+ return replaceInMMContainer(*oldItem, newItem);
+}
+
+template
+bool CacheAllocator::replaceInMMContainer(EvictionIterator& oldItemIt,
+ Item& newItem) {
+ auto& oldContainer = getMMContainer(*oldItemIt);
+ auto& newContainer = getMMContainer(newItem);
+
+ // This function is used for eviction across tiers
+ XDCHECK(&oldContainer != &newContainer);
+ oldContainer.remove(oldItemIt);
+
+ return newContainer.add(newItem);
+}
+
template
bool CacheAllocator::replaceChainedItemInMMContainer(
Item& oldItem, Item& newItem) {
@@ -1091,6 +1319,165 @@ CacheAllocator::insertOrReplace(const WriteHandle& handle) {
return replaced;
}
+/* Next two methods are used to asynchronously move Item between memory tiers.
+ *
+ * The thread, which moves Item, allocates new Item in the tier we are moving to
+ * and calls moveRegularItemWithSync() method. This method does the following:
+ * 1. Update the access container with the new item from the tier we are
+ * moving to. This Item has moving flag set.
+ * 2. Copy data from the old Item to the new one.
+ *
+ * Concurrent threads which are getting handle to the same key:
+ * 1. When a handle is created it checks if the moving flag is set
+ * 2. If so, Handle implementation creates waitContext and adds it to the
+ * MoveCtx by calling handleWithWaitContextForMovingItem() method.
+ * 3. Wait until the moving thread will complete its job.
+ */
+template
+typename CacheAllocator::WriteHandle
+CacheAllocator::handleWithWaitContextForMovingItem(Item& item) {
+ auto shard = getShardForKey(item.getKey());
+ auto& movesMap = getMoveMapForShard(shard);
+ {
+ auto lock = getMoveLockForShard(shard);
+
+ WriteHandle hdl{*this};
+ auto waitContext = hdl.getItemWaitContext();
+
+ auto ret = movesMap.try_emplace(item.getKey(), std::make_unique());
+ ret.first->second->addWaiter(std::move(waitContext));
+
+ return hdl;
+ }
+}
+
+template
+size_t CacheAllocator::wakeUpWaitersLocked(folly::StringPiece key,
+ WriteHandle&& handle) {
+ std::unique_ptr ctx;
+ auto shard = getShardForKey(key);
+ auto& movesMap = getMoveMapForShard(shard);
+ {
+ auto lock = getMoveLockForShard(shard);
+ movesMap.eraseInto(key, [&](auto &&key, auto &&value) {
+ ctx = std::move(value);
+ });
+ }
+
+ if (ctx) {
+ ctx->setItemHandle(std::move(handle));
+ return ctx->numWaiters();
+ }
+
+ return 0;
+}
+
+template
+bool CacheAllocator::moveRegularItemWithSync(
+ Item& oldItem, WriteHandle& newItemHdl) {
+ //on function exit - the new item handle is no longer moving
+ //and other threads may access it - but in case where
+ //we failed to replace in access container we can give the
+ //new item back to the allocator
+ auto guard = folly::makeGuard([&]() {
+ auto ref = newItemHdl->unmarkMoving();
+ if (UNLIKELY(ref == 0)) {
+ const auto res =
+ releaseBackToAllocator(*newItemHdl, RemoveContext::kNormal, false);
+ XDCHECK(res == ReleaseRes::kReleased);
+ }
+ });
+
+ XDCHECK(oldItem.isMoving());
+ XDCHECK(!oldItem.isExpired());
+ // TODO: should we introduce new latency tracker. E.g. evictRegularLatency_
+ // ??? util::LatencyTracker tracker{stats_.evictRegularLatency_};
+
+ XDCHECK_EQ(newItemHdl->getSize(), oldItem.getSize());
+
+ // take care of the flags before we expose the item to be accessed. this
+ // will ensure that when another thread removes the item from RAM, we issue
+ // a delete accordingly. See D7859775 for an example
+ if (oldItem.isNvmClean()) {
+ newItemHdl->markNvmClean();
+ }
+
+ // mark new item as moving to block readers until the data is copied
+ // (moveCb is called). Mark item in MMContainer temporarily (TODO: should
+ // we remove markMoving requirement for the item to be linked?)
+ newItemHdl->markInMMContainer();
+ auto marked = newItemHdl->markMoving(false /* there is already a handle */);
+ newItemHdl->unmarkInMMContainer();
+ XDCHECK(marked);
+
+ auto predicate = [&](const Item& item){
+ // we rely on moving flag being set (it should block all readers)
+ XDCHECK(item.getRefCount() == 0);
+ return true;
+ };
+
+ auto replaced = accessContainer_->replaceIf(oldItem, *newItemHdl,
+ predicate);
+ // another thread may have called insertOrReplace which could have
+ // marked this item as unaccessible causing the replaceIf
+ // in the access container to fail - in this case we want
+ // to abort the move since the item is no longer valid
+ if (!replaced) {
+ return false;
+ }
+ // what if another thread calls insertOrReplace now when
+ // the item is moving and already replaced in the hash table?
+ // 1. it succeeds in updating the hash table - so there is
+ // no guarantee that isAccessible() is true
+ // 2. it will then try to remove from MM container
+ // - this operation will wait for newItemHdl to
+ // be unmarkedMoving via the waitContext
+ // 3. replaced handle is returned and eventually drops
+ // ref to 0 and the item is recycled back to allocator.
+
+ if (config_.moveCb) {
+ // Execute the move callback. We cannot make any guarantees about the
+ // consistency of the old item beyond this point, because the callback can
+ // do more than a simple memcpy() e.g. update external references. If there
+ // are any remaining handles to the old item, it is the caller's
+ // responsibility to invalidate them. The move can only fail after this
+ // statement if the old item has been removed or replaced, in which case it
+ // should be fine for it to be left in an inconsistent state.
+ config_.moveCb(oldItem, *newItemHdl, nullptr);
+ } else {
+ std::memcpy(newItemHdl->getMemory(), oldItem.getMemory(),
+ oldItem.getSize());
+ }
+
+ // Adding the item to mmContainer has to succeed since no one can remove the item
+ auto& newContainer = getMMContainer(*newItemHdl);
+ auto mmContainerAdded = newContainer.add(*newItemHdl);
+ XDCHECK(mmContainerAdded);
+
+ // no one can add or remove chained items at this point
+ if (oldItem.hasChainedItem()) {
+ // safe to acquire handle for a moving Item
+ auto incRes = incRef(oldItem, false);
+ XDCHECK(incRes == RefcountWithFlags::incResult::incOk);
+ auto oldHandle = WriteHandle{&oldItem,*this};
+ XDCHECK_EQ(1u, oldHandle->getRefCount()) << oldHandle->toString();
+ XDCHECK(!newItemHdl->hasChainedItem()) << newItemHdl->toString();
+ try {
+ auto l = chainedItemLocks_.lockExclusive(oldItem.getKey());
+ transferChainLocked(oldHandle, newItemHdl);
+ } catch (const std::exception& e) {
+ // this should never happen because we drained all the handles.
+ XLOGF(DFATAL, "{}", e.what());
+ throw;
+ }
+
+ XDCHECK(!oldItem.hasChainedItem());
+ XDCHECK(newItemHdl->hasChainedItem());
+ }
+ newItemHdl.unmarkNascent();
+ return true;
+}
+
template
bool CacheAllocator::moveRegularItem(Item& oldItem,
WriteHandle& newItemHdl) {
@@ -1122,15 +1509,15 @@ bool CacheAllocator::moveRegularItem(Item& oldItem,
config_.moveCb(oldItem, *newItemHdl, nullptr);
// Inside the access container's lock, this checks if the old item is
- // accessible and its refcount is zero. If the item is not accessible,
+ // accessible and its refcount is one. If the item is not accessible,
// there is no point to replace it since it had already been removed
// or in the process of being removed. If the item is in cache but the
- // refcount is non-zero, it means user could be attempting to remove
+ // refcount is non-one, it means user could be attempting to remove
// this item through an API such as remove(itemHandle). In this case,
// it is unsafe to replace the old item with a new one, so we should
// also abort.
if (!accessContainer_->replaceIf(oldItem, *newItemHdl,
- itemExclusivePredicate)) {
+ itemSlabMovePredicate)) {
return false;
}
@@ -1151,13 +1538,12 @@ bool CacheAllocator::moveRegularItem(Item& oldItem,
// no one can add or remove chained items at this point
if (oldItem.hasChainedItem()) {
- // safe to acquire handle for a moving Item
- auto oldHandle = acquire(&oldItem);
- XDCHECK_EQ(1u, oldHandle->getRefCount()) << oldHandle->toString();
+ auto oldItemHdl = acquire(&oldItem);
+ XDCHECK_EQ(1u, oldItemHdl->getRefCount()) << oldItemHdl->toString();
XDCHECK(!newItemHdl->hasChainedItem()) << newItemHdl->toString();
try {
auto l = chainedItemLocks_.lockExclusive(oldItem.getKey());
- transferChainLocked(oldHandle, newItemHdl);
+ transferChainLocked(oldItemHdl, newItemHdl);
} catch (const std::exception& e) {
// this should never happen because we drained all the handles.
XLOGF(DFATAL, "{}", e.what());
@@ -1179,18 +1565,18 @@ bool CacheAllocator::moveChainedItem(ChainedItem& oldItem,
// This item has been unlinked from its parent and we're the only
// owner of it, so we're done here
- if (!oldItem.isInMMContainer() || oldItem.isOnlyExclusive()) {
+ if (!oldItem.isInMMContainer() || oldItem.isOnlyMoving()) {
return false;
}
- const auto parentKey = oldItem.getParentItem(compressor_).getKey();
-
- // Grab lock to prevent anyone else from modifying the chain
+ auto& expectedParent = oldItem.getParentItem(compressor_);
+ const auto parentKey = expectedParent.getKey();
auto l = chainedItemLocks_.lockExclusive(parentKey);
+ // verify old item under the lock
auto parentHandle =
validateAndGetParentHandleForChainedMoveLocked(oldItem, parentKey);
- if (!parentHandle) {
+ if (!parentHandle || &expectedParent != parentHandle.get()) {
return false;
}
@@ -1210,7 +1596,7 @@ bool CacheAllocator::moveChainedItem(ChainedItem& oldItem,
// In case someone else had removed this chained item from its parent by now
// So we check again to see if the it has been unlinked from its parent
- if (!oldItem.isInMMContainer() || oldItem.isOnlyExclusive()) {
+ if (!oldItem.isInMMContainer() || oldItem.isOnlyMoving()) {
return false;
}
@@ -1226,90 +1612,187 @@ bool CacheAllocator::moveChainedItem(ChainedItem& oldItem,
// parent's chain and the MMContainer.
auto oldItemHandle =
replaceChainedItemLocked(oldItem, std::move(newItemHdl), *parentHandle);
- XDCHECK(oldItemHandle->isExclusive());
+ XDCHECK(oldItemHandle->isMoving());
XDCHECK(!oldItemHandle->isInMMContainer());
return true;
}
template
-typename CacheAllocator::Item*
-CacheAllocator::findEviction(PoolId pid, ClassId cid) {
- auto& mmContainer = getMMContainer(pid, cid);
+typename CacheAllocator::NvmCacheT::PutToken
+CacheAllocator::createPutToken(Item& item) {
+ const bool evictToNvmCache = shouldWriteToNvmCache(item);
+ return evictToNvmCache ? nvmCache_->createPutToken(item.getKey())
+ : typename NvmCacheT::PutToken{};
+}
+
+template
+void CacheAllocator::unlinkItemForEviction(Item& it) {
+ XDCHECK(it.isMarkedForEviction());
+ XDCHECK(it.getRefCount() == 0);
+ accessContainer_->remove(it);
+ removeFromMMContainer(it);
+ // Since we managed to mark the item for eviction we must be the only
+ // owner of the item.
+ const auto ref = it.unmarkForEviction();
+ XDCHECK(ref == 0u);
+}
+
+template
+typename CacheAllocator::Item*
+CacheAllocator::findEviction(TierId tid, PoolId pid, ClassId cid) {
+ auto& mmContainer = getMMContainer(tid, pid, cid);
+ bool lastTier = tid+1 >= getNumTiers();
// Keep searching for a candidate until we were able to evict it
// or until the search limit has been exhausted
unsigned int searchTries = 0;
- auto itr = mmContainer.getEvictionIterator();
while ((config_.evictionSearchTries == 0 ||
- config_.evictionSearchTries > searchTries) &&
- itr) {
- ++searchTries;
- (*stats_.evictionAttempts)[pid][cid].inc();
+ config_.evictionSearchTries > searchTries)) {
+
+ Item* toRecycle = nullptr;
+ Item* candidate = nullptr;
+ typename NvmCacheT::PutToken token;
+
+ mmContainer.withEvictionIterator([this, pid, cid, &candidate, &toRecycle,
+ &searchTries, &mmContainer, &lastTier,
+ &token](auto&& itr) {
+ if (!itr) {
+ ++searchTries;
+ (*stats_.evictionAttempts)[pid][cid].inc();
+ return;
+ }
- Item* toRecycle = itr.get();
+ while ((config_.evictionSearchTries == 0 ||
+ config_.evictionSearchTries > searchTries) &&
+ itr) {
+ ++searchTries;
+ (*stats_.evictionAttempts)[pid][cid].inc();
+
+ auto* toRecycle_ = itr.get();
+ auto* candidate_ =
+ toRecycle_->isChainedItem()
+ ? &toRecycle_->asChainedItem().getParentItem(compressor_)
+ : toRecycle_;
+
+ if (lastTier) {
+ // if it's last tier, the item will be evicted
+ // need to create put token before marking it exclusive
+ token = createPutToken(*candidate_);
+ }
- Item* candidate =
- toRecycle->isChainedItem()
- ? &toRecycle->asChainedItem().getParentItem(compressor_)
- : toRecycle;
+ if (lastTier && shouldWriteToNvmCache(*candidate_) && !token.isValid()) {
+ stats_.evictFailConcurrentFill.inc();
+ } else if ( (lastTier && candidate_->markForEviction()) ||
+ (!lastTier && candidate_->markMoving(true)) ) {
+ XDCHECK(candidate_->isMoving() || candidate_->isMarkedForEviction());
+ // markForEviction to make sure no other thead is evicting the item
+ // nor holding a handle to that item if this is last tier
+ // since we won't be moving the item to the next tier
+ toRecycle = toRecycle_;
+ candidate = candidate_;
+
+ // Check if parent changed for chained items - if yes, we cannot
+ // remove the child from the mmContainer as we will not be evicting
+ // it. We could abort right here, but we need to cleanup in case
+ // unmarkForEviction() returns 0 - so just go through normal path.
+ if (!toRecycle_->isChainedItem() ||
+ &toRecycle->asChainedItem().getParentItem(compressor_) ==
+ candidate)
+ mmContainer.remove(itr);
+ return;
+ }
- // make sure no other thead is evicting the item
- if (candidate->getRefCount() != 0 || !candidate->markExclusive()) {
- ++itr;
- continue;
- }
+ if (candidate_->hasChainedItem()) {
+ stats_.evictFailParentAC.inc();
+ } else {
+ stats_.evictFailAC.inc();
+ }
- // for chained items, the ownership of the parent can change. We try to
- // evict what we think as parent and see if the eviction of parent
- // recycles the child we intend to.
- bool evictionSuccessful = false;
- {
- auto toReleaseHandle =
- itr->isChainedItem()
- ? advanceIteratorAndTryEvictChainedItem(itr)
- : advanceIteratorAndTryEvictRegularItem(mmContainer, itr);
- evictionSuccessful = toReleaseHandle != nullptr;
- // destroy toReleseHandle. The item won't be released to allocator
- // since we marked it as exclusive.
- }
-
- const auto ref = candidate->unmarkExclusive();
- if (ref == 0u) {
- // Invalidate iterator since later on we may use this mmContainer
- // again, which cannot be done unless we drop this iterator
- itr.destroy();
-
- // recycle the item. it's safe to do so, even if toReleaseHandle was
- // NULL. If `ref` == 0 then it means that we are the last holder of
- // that item.
- if (candidate->hasChainedItem()) {
- (*stats_.chainedItemEvictions)[pid][cid].inc();
- } else {
- (*stats_.regularItemEvictions)[pid][cid].inc();
+ ++itr;
+ XDCHECK(toRecycle == nullptr);
+ XDCHECK(candidate == nullptr);
}
+ });
- if (auto eventTracker = getEventTracker()) {
- eventTracker->record(AllocatorApiEvent::DRAM_EVICT, candidate->getKey(),
- AllocatorApiResult::EVICTED, candidate->getSize(),
- candidate->getConfiguredTTL().count());
- }
+ if (!toRecycle)
+ continue;
- // check if by releasing the item we intend to, we actually
- // recycle the candidate.
- if (ReleaseRes::kRecycled ==
- releaseBackToAllocator(*candidate, RemoveContext::kEviction,
- /* isNascent */ false, toRecycle)) {
- return toRecycle;
+ XDCHECK(toRecycle);
+ XDCHECK(candidate);
+
+ auto evictedToNext = lastTier ? nullptr
+ : tryEvictToNextMemoryTier(*candidate, false);
+ if (!evictedToNext) {
+ //if insertOrReplace was called during move
+ //then candidate will not be accessible (failed replace during tryEvict)
+ // - therefore this was why we failed to
+ // evict to the next tier and insertOrReplace
+ // will remove from NVM cache
+ //however, if candidate is accessible
+ //that means the allocation in the next
+ //tier failed - so we will continue to
+ //evict the item to NVM cache
+ bool failedToReplace = !candidate->isAccessible();
+ if (!token.isValid() && !failedToReplace) {
+ token = createPutToken(*candidate);
}
+ // tryEvictToNextMemoryTier can fail if:
+ // a) allocation of the new item fails; in that case it should still be
+ //    possible to mark the item for eviction.
+ // b) another thread calls insertOrReplace and the item is no longer
+ //    accessible.
+ //
+ // if we are on the last tier, we would have already marked the item
+ // as exclusive since we will not be moving it to the next tier but
+ // rather evicting it altogether, so there is no need to call
+ // markForEvictionWhenMoving.
+ auto ret = lastTier ? true : candidate->markForEvictionWhenMoving();
+ XDCHECK(ret);
+
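+ // unlink the candidate from the cache's containers so no new references
+ // can be taken to it before it is recycled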
+ unlinkItemForEviction(*candidate);
+
+ if (token.isValid() && shouldWriteToNvmCacheExclusive(*candidate)
+ && !failedToReplace) {
+ nvmCache_->put(*candidate, std::move(token));
+ }
+ // wake up any readers that wait for the move to complete
+ // it's safe to do now, as we have the item marked exclusive and
+ // no other reader can be added to the waiters list
+ wakeUpWaiters(*candidate, {});
+
} else {
- XDCHECK(!evictionSuccessful);
+ XDCHECK(!evictedToNext->isMarkedForEviction() && !evictedToNext->isMoving());
+ XDCHECK(!candidate->isMarkedForEviction() && !candidate->isMoving());
+ XDCHECK(!candidate->isAccessible());
+ XDCHECK(candidate->getKey() == evictedToNext->getKey());
+
+ wakeUpWaiters(*candidate, std::move(evictedToNext));
}
- // If we destroyed the itr to possibly evict and failed, we restart
- // from the beginning again
- if (!itr) {
- itr.resetToBegin();
+ XDCHECK(!candidate->isMarkedForEviction() && !candidate->isMoving());
+
+ // recycle the item. it's safe to do so since at this point we are the
+ // last holder of the item.
+ if (candidate->hasChainedItem()) {
+ (*stats_.chainedItemEvictions)[pid][cid].inc();
+ } else {
+ (*stats_.regularItemEvictions)[pid][cid].inc();
+ }
+
+ if (auto eventTracker = getEventTracker()) {
+ eventTracker->record(AllocatorApiEvent::DRAM_EVICT, candidate->getKey(),
+ AllocatorApiResult::EVICTED, candidate->getSize(),
+ candidate->getConfiguredTTL().count());
+ }
+
+ // check if by releasing the item we intend to, we actually
+ // recycle the candidate.
+ auto ret = releaseBackToAllocator(*candidate, RemoveContext::kEviction,
+ /* isNascent */ false, toRecycle);
+ if (ret == ReleaseRes::kRecycled) {
+ return toRecycle;
}
}
return nullptr;
@@ -1363,6 +1846,97 @@ bool CacheAllocator<CacheTrait>::shouldWriteToNvmCacheExclusive(
return true;
}
+template <typename CacheTrait>
+typename CacheAllocator<CacheTrait>::WriteHandle
+CacheAllocator<CacheTrait>::tryEvictToNextMemoryTier(
+ TierId tid, PoolId pid, Item& item, bool fromBgThread) {
+ XDCHECK(item.isMoving());
+ XDCHECK(item.getRefCount() == 0);
+ if (item.hasChainedItem()) return WriteHandle{}; // TODO: We do not support ChainedItem yet
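+ // expired items do not need to be moved to the next tier: drop them from
+ // the access container and return a handle so the caller can recycle them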
+ if (item.isExpired()) {
+ accessContainer_->remove(item);
+ item.unmarkMoving();
+ return acquire(&item);
+ }
+
+ TierId nextTier = tid;
+ while (++nextTier < getNumTiers()) { // try to evict down to the next memory tiers
+ if (!shouldEvictToNextMemoryTier(tid, nextTier, pid, item))
+ continue;
+
+ // allocateInternal might trigger another eviction
+ auto newItemHdl = allocateInternalTier(nextTier, pid,
+ item.getKey(),
+ item.getSize(),
+ item.getCreationTime(),
+ item.getExpiryTime(),
+ fromBgThread);
+
+ if (newItemHdl) {
+ XDCHECK_EQ(newItemHdl->getSize(), item.getSize());
+ if (!moveRegularItemWithSync(item, newItemHdl)) {
+ return WriteHandle{};
+ }
+ XDCHECK_EQ(newItemHdl->getKey(), item.getKey());
+ item.unmarkMoving();
+ return newItemHdl;
+ } else {
+ return WriteHandle{};
+ }
+ }
+
+ return {};
+}
+
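+// convenience overload: derives the item's current tier and pool from its
+// memory location and delegates to the overload above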
+template <typename CacheTrait>
+typename CacheAllocator<CacheTrait>::WriteHandle
+CacheAllocator<CacheTrait>::tryEvictToNextMemoryTier(Item& item, bool fromBgThread) {
+ auto tid = getTierId(item);
+ auto pid = allocator_[tid]->getAllocInfo(item.getMemory()).poolId;
+ return tryEvictToNextMemoryTier(tid, pid, item, fromBgThread);
+}
+
+template <typename CacheTrait>
+typename CacheAllocator<CacheTrait>::WriteHandle
+CacheAllocator<CacheTrait>::tryPromoteToNextMemoryTier(
+ TierId tid, PoolId pid, Item& item, bool fromBgThread) {
+ if (item.isExpired()) { return {}; }
+ TierId nextTier = tid;
+ while (nextTier > 0) { // try to promote up to the next memory tier
+ auto toPromoteTier = nextTier - 1;
+ --nextTier;
+
+ // allocateInternal might trigger another eviction
+ auto newItemHdl = allocateInternalTier(toPromoteTier, pid,
+ item.getKey(),
+ item.getSize(),
+ item.getCreationTime(),
+ item.getExpiryTime(),
+ fromBgThread);
+
+ if (newItemHdl) {
+ XDCHECK_EQ(newItemHdl->getSize(), item.getSize());
+ if (!moveRegularItemWithSync(item, newItemHdl)) {
+ return WriteHandle{};
+ }
+ item.unmarkMoving();
+ return newItemHdl;
+ } else {
+ return WriteHandle{};
+ }
+ }
+
+ return {};
+}
+
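+// convenience overload for promotion: looks up the item's current tier and
+// pool before delegating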
+template <typename CacheTrait>
+typename CacheAllocator<CacheTrait>::WriteHandle
+CacheAllocator<CacheTrait>::tryPromoteToNextMemoryTier(Item& item, bool fromBgThread) {
+ auto tid = getTierId(item);
+ auto pid = allocator_[tid]->getAllocInfo(item.getMemory()).poolId;
+ return tryPromoteToNextMemoryTier(tid, pid, item, fromBgThread);
+}
+
template <typename CacheTrait>
typename CacheAllocator<CacheTrait>::RemoveRes
CacheAllocator<CacheTrait>::remove(typename Item::Key key) {
@@ -1453,7 +2027,7 @@ bool CacheAllocator<CacheTrait>::pushToNvmCacheFromRamForTesting(
if (handle && nvmCache_ && shouldWriteToNvmCache(*handle) &&
shouldWriteToNvmCacheExclusive(*handle)) {
- nvmCache_->put(handle, nvmCache_->createPutToken(handle->getKey()));
+ nvmCache_->put(*handle, nvmCache_->createPutToken(handle->getKey()));
return true;
}
return false;
@@ -1563,21 +2137,57 @@ void CacheAllocator<CacheTrait>::invalidateNvm(Item& item) {
}
}
+template <typename CacheTrait>
+TierId
+CacheAllocator<CacheTrait>::getTierId(const Item& item) const {
+ return getTierId(item.getMemory());
+}
+
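+// find the tier that owns a given memory address by checking which tier's
+// allocator contains the pointer; throws if no tier owns it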
+template <typename CacheTrait>
+TierId
+CacheAllocator<CacheTrait>::getTierId(const void* ptr) const {
+ for (TierId tid = 0; tid < getNumTiers(); tid++) {
+ if (allocator_[tid]->isMemoryInAllocator(ptr))
+ return tid;
+ }
+
+ throw std::invalid_argument("Item does not belong to any tier!");
+}
+
template <typename CacheTrait>
typename CacheAllocator<CacheTrait>::MMContainer&
CacheAllocator<CacheTrait>::getMMContainer(const Item& item) const noexcept {
+ const auto tid = getTierId(item);
const auto allocInfo =
- allocator_->getAllocInfo(static_cast<const void*>(&item));
- return getMMContainer(allocInfo.poolId, allocInfo.classId);
+ allocator_[tid]->getAllocInfo(static_cast<const void*>(&item));
+ return getMMContainer(tid, allocInfo.poolId, allocInfo.classId);
}
template <typename CacheTrait>
typename CacheAllocator<CacheTrait>::MMContainer&
-CacheAllocator<CacheTrait>::getMMContainer(PoolId pid,
+CacheAllocator<CacheTrait>::getMMContainer(TierId tid,
+ PoolId pid,
ClassId cid) const noexcept {
- XDCHECK_LT(static_cast<size_t>(pid), mmContainers_.size());
- XDCHECK_LT(static_cast<size_t>(cid), mmContainers_[pid].size());
- return *mmContainers_[pid][cid];
+ XDCHECK_LT(static_cast<size_t>(tid), mmContainers_.size());
+ XDCHECK_LT(static_cast<size_t>(pid), mmContainers_[tid].size());
+ XDCHECK_LT(static_cast<size_t>(cid), mmContainers_[tid][pid].size());
+ return *mmContainers_[tid][pid][cid];
+}
+
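+// bounds are checked defensively here: out-of-range (tid, pid, cid) ids
+// yield empty stats instead of an assertion or exception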
+template <typename CacheTrait>
+MMContainerStat CacheAllocator<CacheTrait>::getMMContainerStat(
+ TierId tid, PoolId pid, ClassId cid) const noexcept {
+ if (static_cast<size_t>(tid) >= mmContainers_.size()) {
+ return MMContainerStat{};
+ }
+ if (static_cast<size_t>(pid) >= mmContainers_[tid].size()) {
+ return MMContainerStat{};
+ }
+ if (static_cast<size_t>(cid) >= mmContainers_[tid][pid].size()) {
+ return MMContainerStat{};
+ }
+ return mmContainers_[tid][pid][cid] ? mmContainers_[tid][pid][cid]->getStats()
+ : MMContainerStat{};
}
template <typename CacheTrait>
@@ -1763,8 +2373,9 @@ void CacheAllocator<CacheTrait>::markUseful(const ReadHandle& handle,
template <typename CacheTrait>
bool CacheAllocator<CacheTrait>::recordAccessInMMContainer(Item& item,
AccessMode mode) {
+ const auto tid = getTierId(item);
const auto allocInfo =
- allocator_->getAllocInfo(static_cast<const void*>(&item));
+ allocator_[tid]->getAllocInfo(static_cast<const void*>(&item));
(*stats_.cacheHits)[allocInfo.poolId][allocInfo.classId].inc();
// track recently accessed items if needed
@@ -1772,14 +2383,15 @@ bool CacheAllocator<CacheTrait>::recordAccessInMMContainer(Item& item,
ring_->trackItem(reinterpret_cast<uintptr_t>(&item), item.getSize());
}
- auto& mmContainer = getMMContainer(allocInfo.poolId, allocInfo.classId);
+ auto& mmContainer = getMMContainer(tid, allocInfo.poolId, allocInfo.classId);
return mmContainer.recordAccess(item, mode);
}
template <typename CacheTrait>
uint32_t CacheAllocator<CacheTrait>::getUsableSize(const Item& item) const {
+ const auto tid = getTierId(item);
const auto allocSize =
- allocator_->getAllocInfo(static_cast<const void*>(&item)).allocSize;
+ allocator_[tid]->getAllocInfo(static_cast<const void*>(&item)).allocSize;
return item.isChainedItem()
? allocSize - ChainedItem::getRequiredSize(0)
: allocSize - Item::getRequiredSize(item.getKey(), 0);
@@ -1788,8 +2400,10 @@ uint32_t CacheAllocator<CacheTrait>::getUsableSize(const Item& item) const {
template <typename CacheTrait>
typename CacheAllocator<CacheTrait>::SampleItem
CacheAllocator<CacheTrait>::getSampleItem() {
- size_t nvmCacheSize = nvmCache_ ? nvmCache_->getUsableSize() : 0;
- size_t ramCacheSize = allocator_->getMemorySizeInclAdvised();
+ // TODO: is using random tier a good idea?
+ auto tid = folly::Random::rand32() % getNumTiers();
+ static size_t nvmCacheSize = nvmCache_ ? nvmCache_->getUsableSize() : 0;
+ static size_t ramCacheSize = allocator_[tid]->getMemorySizeInclAdvised();
bool fromNvm =
folly::Random::rand64(0, nvmCacheSize + ramCacheSize) >= ramCacheSize;
@@ -1798,19 +2412,18 @@ CacheAllocator<CacheTrait>::getSampleItem() {
}
// Sampling from DRAM cache
- auto item = reinterpret_cast<const Item*>(allocator_->getRandomAlloc());
+ auto item = reinterpret_cast<const Item*>(allocator_[tid]->getRandomAlloc());
if (!item) {
return SampleItem{false /* fromNvm */};
}
// Check that item returned is the same that was sampled
-
auto sharedHdl = std::make_shared<ReadHandle>(findInternal(item->getKey()));
if (sharedHdl->get() != item) {
return SampleItem{false /* fromNvm */};
}
- const auto allocInfo = allocator_->getAllocInfo(item->getMemory());
+ const auto allocInfo = allocator_[tid]->getAllocInfo(item->getMemory());
// Convert the Item to IOBuf to make SampleItem
auto iobuf = folly::IOBuf{
@@ -1829,28 +2442,33 @@ CacheAllocator<CacheTrait>::getSampleItem() {
template <typename CacheTrait>
std::vector<std::string> CacheAllocator<CacheTrait>::dumpEvictionIterator(
- PoolId pid, ClassId cid, size_t numItems) {
+ PoolId pid, ClassId cid, size_t numItems) {
if (numItems == 0) {
return {};
}
- if (static_cast<size_t>(pid) >= mmContainers_.size() ||
- static_cast<size_t>(cid) >= mmContainers_[pid].size()) {
+ // Eviction happens from the lowest tier first, so start dumping there and walk up.
+ int tid = getNumTiers() - 1;
+
+ if (static_cast<size_t>(tid) >= mmContainers_.size() ||
+ static_cast<size_t>(pid) >= mmContainers_[tid].size() ||
+ static_cast<size_t>(cid) >= mmContainers_[tid][pid].size()) {
throw std::invalid_argument(
- folly::sformat("Invalid PoolId: {} and ClassId: {}.", pid, cid));
+ folly::sformat("Invalid TierId: {} and PoolId: {} and ClassId: {}.", tid, pid, cid));
}
std::vector<std::string> content;
- auto& mm = *mmContainers_[pid][cid];
- auto evictItr = mm.getEvictionIterator();
- size_t i = 0;
- while (evictItr && i < numItems) {
- content.push_back(evictItr->toString());
- ++evictItr;
- ++i;
+ while (tid >= 0) {
+ auto& mm = *mmContainers_[tid][pid][cid];
+ mm.withEvictionIterator([&content, numItems](auto&& itr) {
+ while (itr && content.size() < numItems) {
+ content.push_back(itr->toString());
+ ++itr;
+ }
+ });
+ --tid;
}
-
return content;
}
@@ -2026,19 +2644,50 @@ PoolId CacheAllocator<CacheTrait>::addPool(
std::shared_ptr<RebalanceStrategy> resizeStrategy,
bool ensureProvisionable) {
folly::SharedMutex::WriteHolder w(poolsResizeAndRebalanceLock_);
- auto pid = allocator_->addPool(name, size, allocSizes, ensureProvisionable);
+
+ PoolId pid = 0;
+ size_t totalCacheSize = 0;
+
+ for (TierId tid = 0; tid < getNumTiers(); tid++) {
+ totalCacheSize += allocator_[tid]->getMemorySize();
+ }
+
+ for (TierId tid = 0; tid < getNumTiers(); tid++) {
+ auto tierSizeRatio =
+ static_cast<double>(allocator_[tid]->getMemorySize()) / totalCacheSize;
+ size_t tierPoolSize = static_cast<size_t>(tierSizeRatio * size);
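+ // e.g. with two tiers sized 3:1, a 4GB pool is split into a 3GB part
+ // in tier 0 and a 1GB part in tier 1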
+
+ // TODO: what if we manage to add pool only in one tier?
+ // we should probably remove that on failure
+ auto res = allocator_[tid]->addPool(
+ name, tierPoolSize, allocSizes, ensureProvisionable);
+ XDCHECK(tid == 0 || res == pid);
+ pid = res;
+ }
+
createMMContainers(pid, std::move(config));
setRebalanceStrategy(pid, std::move(rebalanceStrategy));
setResizeStrategy(pid, std::move(resizeStrategy));
+
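+ // adding a pool changes the memory assigned to each background worker,
+ // so refresh the ranges handed to the background evictors and promoters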
+ if (backgroundEvictor_.size()) {
+ for (size_t id = 0; id < backgroundEvictor_.size(); id++)
+ backgroundEvictor_[id]->setAssignedMemory(getAssignedMemoryToBgWorker(id, backgroundEvictor_.size(), 0));
+ }
+
+ if (backgroundPromoter_.size()) {
+ for (size_t id = 0; id < backgroundPromoter_.size(); id++)
+ backgroundPromoter_[id]->setAssignedMemory(getAssignedMemoryToBgWorker(id, backgroundPromoter_.size(), 1));
+ }
+
return pid;
}
template <typename CacheTrait>
void CacheAllocator<CacheTrait>::overridePoolRebalanceStrategy(
PoolId pid, std::shared_ptr<RebalanceStrategy> rebalanceStrategy) {
- if (static_cast<size_t>(pid) >= mmContainers_.size()) {
+ if (static_cast<size_t>(pid) >= mmContainers_[0].size()) {
throw std::invalid_argument(folly::sformat(
- "Invalid PoolId: {}, size of pools: {}", pid, mmContainers_.size()));
+ "Invalid PoolId: {}, size of pools: {}", pid, mmContainers_[0].size()));
}
setRebalanceStrategy(pid, std::move(rebalanceStrategy));
}
@@ -2046,9 +2695,9 @@ void CacheAllocator<CacheTrait>::overridePoolRebalanceStrategy(
template <typename CacheTrait>
void CacheAllocator<CacheTrait>::overridePoolResizeStrategy(
PoolId pid, std::shared_ptr<RebalanceStrategy> resizeStrategy) {
- if (static_cast<size_t>(pid) >= mmContainers_.size()) {
+ if (static_cast<size_t>(pid) >= mmContainers_[0].size()) {
throw std::invalid_argument(folly::sformat(
- "Invalid PoolId: {}, size of pools: {}", pid, mmContainers_.size()));
+ "Invalid PoolId: {}, size of pools: {}", pid, mmContainers_[0].size()));
}
setResizeStrategy(pid, std::move(resizeStrategy));
}
@@ -2060,14 +2709,14 @@ void CacheAllocator<CacheTrait>::overridePoolOptimizeStrategy(
}
template <typename CacheTrait>
-void CacheAllocator<CacheTrait>::overridePoolConfig(PoolId pid,
+void CacheAllocator