diff --git a/.github/workflows/build-cachelib-centos-long.yml b/.github/workflows/build-cachelib-centos-long.yml
new file mode 100644
index 0000000000..92165f603b
--- /dev/null
+++ b/.github/workflows/build-cachelib-centos-long.yml
@@ -0,0 +1,39 @@
+name: build-cachelib-centos-latest
+on:
+  schedule:
+    - cron: '0 7 * * *'
+
+jobs:
+  build-cachelib-centos8-latest:
+    name: "CentOS/latest - Build CacheLib with all dependencies"
+    runs-on: ubuntu-latest
+    # Docker container image name
+    container: "centos:latest"
+    steps:
+      - name: "update packages"
+        run: dnf upgrade -y
+      - name: "install sudo,git"
+        run: dnf install -y sudo git cmake gcc
+      - name: "System Information"
+        run: |
+          echo === uname ===
+          uname -a
+          echo === /etc/os-release ===
+          cat /etc/os-release
+          echo === df -hl ===
+          df -hl
+          echo === free -h ===
+          free -h
+          echo === top ===
+          top -b -n1 -1 -Eg || timeout 1 top -b -n1
+          echo === env ===
+          env
+          echo === gcc -v ===
+          gcc -v
+      - name: "checkout sources"
+        uses: actions/checkout@v2
+      - name: "build CacheLib using build script"
+        run: ./contrib/build.sh -j -v -T
+      - name: "run tests"
+        timeout-minutes: 60
+        run: cd opt/cachelib/tests && ../../../run_tests.sh long
diff --git a/.github/workflows/build-cachelib-debian.yml b/.github/workflows/build-cachelib-debian.yml
new file mode 100644
index 0000000000..5bc3ad3c70
--- /dev/null
+++ b/.github/workflows/build-cachelib-debian.yml
@@ -0,0 +1,43 @@
+name: build-cachelib-debian-10
+on:
+  schedule:
+    - cron: '30 5 * * 0,3'
+
+jobs:
+  build-cachelib-debian-10:
+    name: "Debian/Buster - Build CacheLib with all dependencies"
+    runs-on: ubuntu-latest
+    # Docker container image name
+    container: "debian:buster-slim"
+    steps:
+      - name: "update packages"
+        run: apt-get update
+      - name: "upgrade packages"
+        run: apt-get -y upgrade
+      - name: "install sudo,git"
+        run: apt-get install -y sudo git procps
+      - name: "System Information"
+        run: |
+          echo === uname ===
+          uname -a
+          echo === /etc/os-release ===
+          cat /etc/os-release
+          echo === df -hl ===
+          df -hl
+          echo === free -h ===
+          free -h
+          echo === top ===
+          top -b -n1 -1 -Eg || timeout 1 top -b -n1 ; true
+          echo === env ===
+          env
+          echo === cc -v ===
+          cc -v || true
+          echo === g++ -v ===
+          g++ -v || true
+      - name: "checkout sources"
+        uses: actions/checkout@v2
+      - name: "build CacheLib using build script"
+        run: ./contrib/build.sh -j -v -T
+      - name: "run tests"
+        timeout-minutes: 60
+        run: cd opt/cachelib/tests && ../../../run_tests.sh
diff --git a/.github/workflows/build-cachelib-docker.yml b/.github/workflows/build-cachelib-docker.yml
new file mode 100644
index 0000000000..be28bc233c
--- /dev/null
+++ b/.github/workflows/build-cachelib-docker.yml
@@ -0,0 +1,49 @@
+name: build-cachelib-docker
+on:
+  push:
+  pull_request:
+
+jobs:
+  build-cachelib-docker:
+    name: "CentOS/latest - Build CacheLib with all dependencies"
+    runs-on: ubuntu-latest
+    env:
+      REPO: cachelib
+      GITHUB_REPO: intel/CacheLib
+      CONTAINER_REG: ghcr.io/pmem/cachelib
+      CONTAINER_REG_USER: ${{ secrets.GH_CR_USER }}
+      CONTAINER_REG_PASS: ${{ secrets.GH_CR_PAT }}
+      FORCE_IMAGE_ACTION: ${{ secrets.FORCE_IMAGE_ACTION }}
+      HOST_WORKDIR: ${{ github.workspace }}
+      WORKDIR: docker
+      IMG_VER: devel
+    strategy:
+      matrix:
+        CONFIG: ["OS=centos OS_VER=8streams PUSH_IMAGE=1"]
+    steps:
+      - name: "System Information"
+        run: |
+          echo === uname ===
+          uname -a
+          echo === /etc/os-release ===
+          cat /etc/os-release
+          echo === df -hl ===
+          df -hl
+          echo === free -h ===
+          free -h
+          echo === top ===
+          top -b -n1 -1 -Eg || timeout 1 top -b -n1
+          echo === env ===
+          env
+          echo === gcc -v ===
+          gcc -v
+      - name: "checkout sources"
+        uses: actions/checkout@v2
+        with:
+          fetch-depth: 0
+
+      - name: Pull the image or rebuild and push it
+        run: cd $WORKDIR && ${{ matrix.CONFIG }} ./pull-or-rebuild-image.sh $FORCE_IMAGE_ACTION
+
+      - name: Run the build
+        run: cd $WORKDIR && ${{ matrix.CONFIG }} ./build.sh
diff --git a/.github/workflows/clang-format-check.yml b/.github/workflows/clang-format-check.yml
index 4b4897b610..90c8d739c6 100644
--- a/.github/workflows/clang-format-check.yml
+++ b/.github/workflows/clang-format-check.yml
@@ -1,6 +1,6 @@
 # From: https://github.com/marketplace/actions/clang-format-check#multiple-paths
 name: clang-format Check
-on: [pull_request]
+on: []
 jobs:
   formatting-check:
     name: Formatting Check
diff --git a/MultiTierDataMovement.md b/MultiTierDataMovement.md
new file mode 100644
index 0000000000..6976c9ddc9
--- /dev/null
+++ b/MultiTierDataMovement.md
@@ -0,0 +1,113 @@
+# Background Data Movement
+
+To reduce the number of online evictions and to support asynchronous
+promotion, we have added two periodic workers that handle eviction and
+promotion in the background.
+
+The diagram below shows a simplified version of how the background evictor
+thread (green) is integrated into the CacheLib architecture.
+
+*(Figure: the BackgroundEvictor worker integrated into the CacheLib architecture.)*
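+Both workers are started from `CacheAllocator::initWorkers()` using the
+`backgroundEvictor*` and `backgroundPromoter*` config fields (see the
+`CacheAllocator-inl.h` changes below). As a minimal sketch - the setter names
+here are illustrative assumptions, not the exact API - enabling both workers
+could look like:
+
+```cpp
+// Illustrative only: setter names are assumptions; check the config API.
+LruAllocator::Config config;
+config.enableBackgroundEvictor(
+    std::make_shared<FreeThresholdStrategy>(), // default eviction strategy
+    std::chrono::milliseconds{10},             // backgroundEvictorIntervalMilSec
+    1);                                        // evictorThreads
+config.enableBackgroundPromoter(
+    std::make_shared<FreeThresholdStrategy>(), // promotion strategy (placeholder)
+    std::chrono::milliseconds{10},             // backgroundPromoterIntervalMilSec
+    1);                                        // promoterThreads
+```
+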
+
+## Background Evictors
+
+The background evictors scan each class to see if there are objects to move to the
+next (lower) tier using a given strategy. Here we document the general parameters
+and the parameters for the different strategies.
+
+- `backgroundEvictorIntervalMilSec`: The interval that this thread runs for - by default
+the background evictor threads will wake up every 10 ms to scan the AllocationClasses. Also,
+the background evictor thread will be woken up every time there is a failed allocation (from
+a request handling thread) and the current percentage of free memory for the
+AllocationClass is lower than `lowEvictionAcWatermark`. This may make the interval parameter
+less important when many allocations are occurring from request handling threads.
+
+- `evictorThreads`: The number of background evictors to run - each thread is assigned
+a set of AllocationClasses to scan and evict objects from. Currently, each thread gets
+an equal number of classes to scan - but as the object size distribution may be unequal - future
+versions will attempt to balance the classes among threads. The range is 1 to the number of AllocationClasses.
+The default is 1.
+
+- `maxEvictionBatch`: The number of objects to remove in a given eviction call. The
+default is 40. The lower bound is 10 and the upper bound is 1000. Too low and we might not
+remove objects at a reasonable rate; too high and it might increase contention with user threads.
+
+- `minEvictionBatch`: Minimum number of items to evict at any time (if there are any
+candidates).
+
+- `maxEvictionPromotionHotness`: Maximum candidates to consider for eviction. This is similar to `maxEvictionBatch`
+but it specifies how many candidates will be taken into consideration, not the actual number of items to evict.
+This option can be used to configure the duration of the critical section on the LRU lock.
+
+
+### FreeThresholdStrategy (default)
+
+- `lowEvictionAcWatermark`: Triggers the background eviction thread to run
+when this percentage of the AllocationClass is free.
+The default is `2.0`; to avoid wasting capacity we don't set this above `10.0`.
+
+- `highEvictionAcWatermark`: Stop the evictions from an AllocationClass when this
+percentage of the AllocationClass is free. The default is `5.0`; to avoid wasting capacity we
+don't set this above `10.0`.
+
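+To make the two watermarks concrete: the free percentage of an AllocationClass
+is derived from its usage fraction, as in `shouldWakeupBgEvictor()` in the
+`CacheAllocator-inl.h` changes below. A small sketch of the gating logic,
+with the watermarks shown as free variables (illustrative, not the exact code):
+
+```cpp
+// Free percentage of an allocation class, as used by the watermark checks.
+double freePct = 100.0 * (1.0 - acStats.usageFraction());
+
+// The evictor wakes when free space drops to the low watermark and keeps
+// evicting until the high watermark is reached.
+bool wakeEvictor  = freePct <= lowEvictionAcWatermark;  // default 2.0
+bool stopEvicting = freePct >= highEvictionAcWatermark; // default 5.0
+```
+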
+
+## Background Promoters
+
+The background promoters scan each class to see if there are objects to move to the
+upper (faster) tier using a given strategy. Here we document the general parameters
+and the parameters for the different strategies.
+
+- `backgroundPromoterIntervalMilSec`: The interval that this thread runs for - by default
+the background promoter threads will wake up every 10 ms to scan the AllocationClasses for
+objects to promote.
+
+- `promoterThreads`: The number of background promoters to run - each thread is assigned
+a set of AllocationClasses to scan and promote objects from. Currently, each thread gets
+an equal number of classes to scan - but as the object size distribution may be unequal - future
+versions will attempt to balance the classes among threads. The range is `1` to the number of AllocationClasses. The default is `1`.
+
+- `maxPromotionBatch`: The number of objects to promote in a given promotion call. The
+default is 40. The lower bound is 10 and the upper bound is 1000. Too low and we might not
+promote objects at a reasonable rate; too high and it might increase contention with user threads.
+
+- `minPromotionBatch`: Minimum number of items to promote at any time (if there are any
+candidates).
+
+- `numDuplicateElements`: This allows us to promote items that have existing (read-only)
+handles, since we won't need to modify the data when a user is done with the data.
+Therefore, for a short time the data could reside in both tiers until it is evicted from
+its current tier. The default is to not allow this (0). Setting the value to 100 will
+enable duplicate elements in tiers.
+
+### Background Promotion Strategy (only one currently)
+
+- `promotionAcWatermark`: Promote items if at least this
+percentage of the AllocationClass is free. The promotion thread will attempt to move `maxPromotionBatch` number of objects
+to that tier. The objects are chosen from the head of the LRU. The default is `4.0`.
+This value should correlate with `lowEvictionAcWatermark`, `highEvictionAcWatermark`, `minAcAllocationWatermark`, `maxAcAllocationWatermark`.
+- `maxPromotionBatch`: The number of objects to promote in a batch during background promotion. Analogous to
+`maxEvictionBatch`. Its value should be lower to decrease contention on hot items.
+
+## Allocation policies
+
+- `maxAcAllocationWatermark`: The item is always allocated in the topmost tier if at least this
+percentage of the AllocationClass is free.
+- `minAcAllocationWatermark`: The item is always allocated in the bottom tier if no more than this
+percentage of the AllocationClass is free. If the percentage of free memory in the AllocationClass
+is between `minAcAllocationWatermark` and `maxAcAllocationWatermark`, extra checks (described below)
+are performed to decide where to put the element.
+
+By default, allocation will always be performed from the upper tier.
+
+- `acTopTierEvictionWatermark`: If there is less than this percentage of free memory in the topmost tier, cachelib will attempt to evict from the top tier. This option takes precedence over the allocation watermarks.
+
+### Extra policies (used only when the percentage of free memory is between `minAcAllocationWatermark` and `maxAcAllocationWatermark`)
+- `sizeThresholdPolicy`: If the item is smaller than this value, always allocate it in the upper tier.
+- `defaultTierChancePercentage`: Chance (0-100%) of allocating the item in the top tier.
+
+## MMContainer options
+
+- `lruInsertionPointSpec`: Can be set per tier when LRU2Q is used. Determines where new items are
+inserted. 0 = insert to hot queue, 1 = insert to warm queue, 2 = insert to cold queue.
+- `markUsefulChance`: Per tier; determines the chance of moving an item to the head of the LRU on access.
+
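+Putting the allocation policies together: the tier decision implemented by
+`getTargetTierForItem()` in the `CacheAllocator-inl.h` changes below roughly
+follows this shape. This is a simplified sketch of that logic (the `Config`
+holder and `pickTier` name are illustrative), not a drop-in copy; it assumes
+`<folly/Random.h>`:
+
+```cpp
+// Simplified two-tier decision (tier 0 is the topmost tier).
+TierId pickTier(double freePct, uint32_t requiredSize, const Config& cfg) {
+  if (freePct >= cfg.maxAcAllocationWatermark) return 0; // enough room on top
+  if (freePct <= cfg.minAcAllocationWatermark) return 1; // top tier nearly full
+  if (cfg.sizeThresholdPolicy)                           // small items stay on top
+    return requiredSize < cfg.sizeThresholdPolicy ? 0 : 1;
+  // Otherwise allocate on top with probability defaultTierChancePercentage.
+  return (folly::Random::rand32() % 100) < cfg.defaultTierChancePercentage ? 0 : 1;
+}
+```
+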
diff --git a/cachelib/CMakeLists.txt b/cachelib/CMakeLists.txt
index 36df0dc19f..e77c25085c 100644
--- a/cachelib/CMakeLists.txt
+++ b/cachelib/CMakeLists.txt
@@ -85,6 +85,11 @@ set(CMAKE_MODULE_PATH
 set(CMAKE_CXX_STANDARD 17)
 set(CMAKE_CXX_STANDARD_REQUIRED True)
 
+if(COVERAGE_ENABLED)
+  # Add code coverage
+  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} --coverage -fprofile-arcs -ftest-coverage")
+endif()
+
 # include(fb_cxx_flags)
 message(STATUS "Update CXXFLAGS: ${CMAKE_CXX_FLAGS}")
 
diff --git a/cachelib/allocator/BackgroundMover-inl.h b/cachelib/allocator/BackgroundMover-inl.h
new file mode 100644
index 0000000000..b77436635f
--- /dev/null
+++ b/cachelib/allocator/BackgroundMover-inl.h
@@ -0,0 +1,112 @@
+/*
+ * Copyright (c) Intel and its affiliates.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+namespace facebook {
+namespace cachelib {
+
+template <typename CacheT>
+BackgroundMover<CacheT>::BackgroundMover(
+    Cache& cache,
+    std::shared_ptr<BackgroundMoverStrategy> strategy,
+    MoverDir direction)
+    : cache_(cache), strategy_(strategy), direction_(direction) {
+  if (direction_ == MoverDir::Evict) {
+    moverFunc = BackgroundMoverAPIWrapper<CacheT>::traverseAndEvictItems;
+
+  } else if (direction_ == MoverDir::Promote) {
+    moverFunc = BackgroundMoverAPIWrapper<CacheT>::traverseAndPromoteItems;
+  }
+}
+
+template <typename CacheT>
+BackgroundMover<CacheT>::~BackgroundMover() {
+  stop(std::chrono::seconds(0));
+}
+
+template <typename CacheT>
+void BackgroundMover<CacheT>::work() {
+  try {
+    checkAndRun();
+  } catch (const std::exception& ex) {
+    XLOGF(ERR, "BackgroundMover interrupted due to exception: {}", ex.what());
+  }
+}
+
+template <typename CacheT>
+void BackgroundMover<CacheT>::setAssignedMemory(
+    std::vector<MemoryDescriptorType>&& assignedMemory) {
+  XLOG(INFO, "Class assigned to background worker:");
+  for (auto [tid, pid, cid] : assignedMemory) {
+    XLOGF(INFO, "Tid: {}, Pid: {}, Cid: {}", tid, pid, cid);
+  }
+
+  mutex.lock_combine([this, &assignedMemory] {
+    this->assignedMemory_ = std::move(assignedMemory);
+  });
+}
+
+// Look for classes that exceed the target memory capacity
+// and return those for eviction
+template <typename CacheT>
+void BackgroundMover<CacheT>::checkAndRun() {
+  auto assignedMemory = mutex.lock_combine([this] { return assignedMemory_; });
+
+  unsigned int moves = 0;
+  std::set<ClassId> classes{};
+  auto batches = strategy_->calculateBatchSizes(cache_, assignedMemory);
+
+  for (size_t i = 0; i < batches.size(); i++) {
+    const auto [tid, pid, cid] = assignedMemory[i];
+    const auto batch = batches[i];
+
+    classes.insert(cid);
+    const auto& mpStats = cache_.getPoolByTid(pid, tid).getStats();
+
+    if (!batch) {
+      continue;
+    }
+
+    // try moving BATCH items from the class in order to reach free target
+    auto moved = moverFunc(cache_, tid, pid, cid, batch);
+    moves += moved;
+    moves_per_class_[tid][pid][cid] += moved;
+    totalBytesMoved.add(moved * mpStats.acStats.at(cid).allocSize);
+  }
+
+  numTraversals.inc();
+  numMovedItems.add(moves);
+  totalClasses.add(classes.size());
+}
+
+template <typename CacheT>
+BackgroundMoverStats BackgroundMover<CacheT>::getStats() const noexcept {
+  BackgroundMoverStats stats;
+  stats.numMovedItems = numMovedItems.get();
+  stats.runCount = numTraversals.get();
+  stats.totalBytesMoved = totalBytesMoved.get();
+  stats.totalClasses = totalClasses.get();
+
+  return stats;
+}
+
+template <typename CacheT>
+std::map<TierId, std::map<PoolId, std::map<ClassId, uint64_t>>>
+BackgroundMover<CacheT>::getClassStats() const noexcept {
+  return moves_per_class_;
+}
+
+} // namespace cachelib
+} // namespace facebook
diff --git a/cachelib/allocator/BackgroundMover.h b/cachelib/allocator/BackgroundMover.h
new file mode 100644
index 0000000000..1246676d6e
--- /dev/null
+++ b/cachelib/allocator/BackgroundMover.h
@@ -0,0 +1,103 @@
+/*
+ * Copyright (c) Intel and its affiliates.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <folly/logging/xlog.h>
+#include <folly/synchronization/DistributedMutex.h>
+
+#include "cachelib/allocator/BackgroundMoverStrategy.h"
+#include "cachelib/allocator/CacheStats.h"
+#include "cachelib/common/AtomicCounter.h"
+#include "cachelib/common/PeriodicWorker.h"
+
+namespace facebook {
+namespace cachelib {
+
+// wrapper that exposes the private APIs of CacheType that are specifically
+// needed for the cache api
+template <typename C>
+struct BackgroundMoverAPIWrapper {
+  static size_t traverseAndEvictItems(C& cache,
+                                      unsigned int tid,
+                                      unsigned int pid,
+                                      unsigned int cid,
+                                      size_t batch) {
+    return cache.traverseAndEvictItems(tid, pid, cid, batch);
+  }
+
+  static size_t traverseAndPromoteItems(C& cache,
+                                        unsigned int tid,
+                                        unsigned int pid,
+                                        unsigned int cid,
+                                        size_t batch) {
+    return cache.traverseAndPromoteItems(tid, pid, cid, batch);
+  }
+};
+
+enum class MoverDir { Evict = 0, Promote };
+
+// Periodic worker that evicts items from tiers in batches.
+// The primary aim is to reduce insertion times for new items in the
+// cache.
+template <typename CacheT>
+class BackgroundMover : public PeriodicWorker {
+ public:
+  using Cache = CacheT;
+  // @param cache     the cache interface
+  // @param strategy  the strategy class that defines how objects are
+  //                  moved (promoted vs. evicted and how much)
+  // @param direction whether this worker evicts or promotes
+  BackgroundMover(Cache& cache,
+                  std::shared_ptr<BackgroundMoverStrategy> strategy,
+                  MoverDir direction);
+
+  ~BackgroundMover() override;
+
+  BackgroundMoverStats getStats() const noexcept;
+  std::map<TierId, std::map<PoolId, std::map<ClassId, uint64_t>>>
+  getClassStats() const noexcept;
+
+  void setAssignedMemory(
+      std::vector<MemoryDescriptorType>&& assignedMemory);
+
+ private:
+  std::map<TierId, std::map<PoolId, std::map<ClassId, uint64_t>>>
+      moves_per_class_;
+  // cache allocator's interface for evicting
+  using Item = typename Cache::Item;
+
+  Cache& cache_;
+  std::shared_ptr<BackgroundMoverStrategy> strategy_;
+  MoverDir direction_;
+
+  std::function<size_t(
+      Cache&, unsigned int, unsigned int, unsigned int, size_t)>
+      moverFunc;
+
+  // implements the actual logic of running the background evictor
+  void work() override final;
+  void checkAndRun();
+
+  AtomicCounter numMovedItems{0};
+  AtomicCounter numTraversals{0};
+  AtomicCounter totalClasses{0};
+  AtomicCounter totalBytesMoved{0};
+
+  std::vector<MemoryDescriptorType> assignedMemory_;
+  folly::DistributedMutex mutex;
+};
+} // namespace cachelib
+} // namespace facebook
+
+#include "cachelib/allocator/BackgroundMover-inl.h"
diff --git a/cachelib/allocator/BackgroundMoverStrategy.h b/cachelib/allocator/BackgroundMoverStrategy.h
new file mode 100644
index 0000000000..7706a625a5
--- /dev/null
+++ b/cachelib/allocator/BackgroundMoverStrategy.h
@@ -0,0 +1,42 @@
+/*
+ * Copyright (c) Facebook, Inc. and its affiliates.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "cachelib/allocator/Cache.h"
+
+
+namespace facebook {
+namespace cachelib {
+
+struct MemoryDescriptorType {
+  MemoryDescriptorType(TierId tid, PoolId pid, ClassId cid) :
+      tid_(tid), pid_(pid), cid_(cid) {}
+  TierId tid_;
+  PoolId pid_;
+  ClassId cid_;
+};
+
+// Base class for background eviction strategy.
+class BackgroundMoverStrategy {
+ public:
+  virtual std::vector<size_t> calculateBatchSizes(
+      const CacheBase& cache,
+      std::vector<MemoryDescriptorType> acVec) = 0;
+};
+
+} // namespace cachelib
+} // namespace facebook
diff --git a/cachelib/allocator/CCacheAllocator.cpp b/cachelib/allocator/CCacheAllocator.cpp
index 2709bde377..dd1986114b 100644
--- a/cachelib/allocator/CCacheAllocator.cpp
+++ b/cachelib/allocator/CCacheAllocator.cpp
@@ -36,7 +36,9 @@ CCacheAllocator::CCacheAllocator(MemoryAllocator& allocator,
       currentChunksIndex_(0) {
   auto& currentChunks = chunks_[currentChunksIndex_];
   for (auto chunk : *object.chunks()) {
-    currentChunks.push_back(allocator_.unCompress(CompressedPtr(chunk)));
+    // TODO : pass multi-tier flag when compact cache supports multi-tier config
+    currentChunks.push_back(
+        allocator_.unCompress(CompressedPtr(chunk), false /* isMultiTier */));
   }
 }
 
@@ -97,7 +99,9 @@ CCacheAllocator::SerializationType CCacheAllocator::saveState() {
   std::lock_guard guard(resizeLock_);
 
   for (auto chunk : getCurrentChunks()) {
-    object.chunks()->push_back(allocator_.compress(chunk).saveState());
+    // TODO : pass multi-tier flag when compact cache supports multi-tier config
+    object.chunks()->push_back(
+        allocator_.compress(chunk, false /* isMultiTier */).saveState());
   }
   return object;
 }
diff --git a/cachelib/allocator/CMakeLists.txt b/cachelib/allocator/CMakeLists.txt
index 78cfa7ca06..6103cdc823 100644
--- a/cachelib/allocator/CMakeLists.txt
+++ b/cachelib/allocator/CMakeLists.txt
@@ -35,6 +35,7 @@ add_library (cachelib_allocator
     CCacheManager.cpp
     ContainerTypes.cpp
     FreeMemStrategy.cpp
+    FreeThresholdStrategy.cpp
     HitsPerSlabStrategy.cpp
     LruTailAgeStrategy.cpp
     MarginalHitsOptimizeStrategy.cpp
@@ -117,6 +118,8 @@ if (BUILD_TESTS)
   add_test (tests/ChainedHashTest.cpp)
   add_test (tests/AllocatorResizeTypeTest.cpp)
   add_test (tests/AllocatorHitStatsTypeTest.cpp)
+  add_test (tests/AllocatorMemoryTiersTest.cpp)
+  add_test (tests/MemoryTiersTest.cpp)
   add_test (tests/MultiAllocatorTest.cpp)
   add_test (tests/NvmAdmissionPolicyTest.cpp)
   add_test (tests/CacheAllocatorConfigTest.cpp)
diff --git a/cachelib/allocator/Cache.h b/cachelib/allocator/Cache.h
index e225ba8a01..c871358189 100644
--- a/cachelib/allocator/Cache.h
+++ b/cachelib/allocator/Cache.h
@@ -85,6 +85,9 @@ class CacheBase {
   CacheBase(CacheBase&&) = default;
   CacheBase& operator=(CacheBase&&) = default;
 
+  // TODO: come up with some reasonable number
+  static constexpr unsigned kMaxTiers = 2;
+
   // Get a string referring to the cache name for this cache
   virtual const std::string getCacheName() const = 0;
 
@@ -95,6 +98,12 @@ class CacheBase {
   //
   // @param poolId The pool id to query
   virtual const MemoryPool& getPool(PoolId poolId) const = 0;
+
+  // Get the reference to a memory pool using a tier id, for stats purposes
+  //
+  // @param poolId The pool id to query
+  // @param tid    The tier of the pool id
+  virtual const MemoryPool& getPoolByTid(PoolId poolId, TierId tid) const = 0;
 
   // Get Pool specific stats (regular pools). This includes stats from the
   // Memory Pool and also the cache.
@@ -102,6 +111,12 @@ class CacheBase { // @param poolId the pool id virtual PoolStats getPoolStats(PoolId poolId) const = 0; + // Get Allocation Class specific stats. + // + // @param poolId the pool id + // @param classId the class id + virtual ACStats getACStats(TierId tid, PoolId poolId, ClassId classId) const = 0; + // @param poolId the pool id virtual AllSlabReleaseEvents getAllSlabReleaseEvents(PoolId poolId) const = 0; diff --git a/cachelib/allocator/CacheAllocator-inl.h b/cachelib/allocator/CacheAllocator-inl.h index 1d89593268..614f031aeb 100644 --- a/cachelib/allocator/CacheAllocator-inl.h +++ b/cachelib/allocator/CacheAllocator-inl.h @@ -16,6 +16,8 @@ #pragma once +#include + namespace facebook { namespace cachelib { @@ -35,6 +37,7 @@ CacheAllocator::CacheAllocator(SharedMemNewT, Config config) template CacheAllocator::CacheAllocator(SharedMemAttachT, Config config) : CacheAllocator(InitMemType::kMemAttach, config) { + /* TODO - per tier? */ for (auto pid : *metadata_.compactCachePools()) { isCompactCachePool_[pid] = true; } @@ -53,12 +56,13 @@ CacheAllocator::CacheAllocator( : isOnShm_{type != InitMemType::kNone ? true : config.memMonitoringEnabled()}, config_(config.validate()), + memoryTierConfigs(config.getMemoryTierConfigs()), tempShm_(type == InitMemType::kNone && isOnShm_ - ? std::make_unique(config_.size) + ? std::make_unique(config_.getCacheSize()) : nullptr), shmManager_(type != InitMemType::kNone ? std::make_unique(config_.cacheDir, - config_.usePosixShm) + config_.isUsingPosixShm()) : nullptr), deserializer_(type == InitMemType::kMemAttach ? createDeserializer() : nullptr), @@ -67,12 +71,12 @@ CacheAllocator::CacheAllocator( : serialization::CacheAllocatorMetadata{}}, allocator_(initAllocator(type)), compactCacheManager_(type != InitMemType::kMemAttach - ? std::make_unique(*allocator_) - : restoreCCacheManager()), + ? std::make_unique(*allocator_[0] /* TODO: per tier */) + : restoreCCacheManager(0/* TODO: per tier */)), compressor_(createPtrCompressor()), mmContainers_(type == InitMemType::kMemAttach ? deserializeMMContainers(*deserializer_, compressor_) - : MMContainers{}), + : MMContainers{getNumTiers()}), accessContainer_(initAccessContainer( type, detail::kShmHashTableName, config.accessConfig)), chainedItemAccessContainer_( @@ -81,6 +85,8 @@ CacheAllocator::CacheAllocator( config.chainedItemAccessConfig)), chainedItemLocks_(config_.chainedItemsLockPower, std::make_shared()), + movesMap_(kShards), + moveLock_(kShards), cacheCreationTime_{ type != InitMemType::kMemAttach ? 
util::getCurrentTimeSec() @@ -105,48 +111,98 @@ CacheAllocator::~CacheAllocator() { } template -ShmSegmentOpts CacheAllocator::createShmCacheOpts() { +ShmSegmentOpts CacheAllocator::createShmCacheOpts(TierId tid) { ShmSegmentOpts opts; opts.alignment = sizeof(Slab); auto memoryTierConfigs = config_.getMemoryTierConfigs(); // TODO: we support single tier so far - XDCHECK_EQ(memoryTierConfigs.size(), 1ul); - opts.memBindNumaNodes = memoryTierConfigs[0].getMemBind(); - + if (memoryTierConfigs.size() > 2) { + throw std::invalid_argument("CacheLib only supports two memory tiers"); + } + opts.memBindNumaNodes = memoryTierConfigs[tid].getMemBind(); return opts; } +template +size_t CacheAllocator::memoryTierSize(TierId tid) const { + auto partitions = std::accumulate(memoryTierConfigs.begin(), memoryTierConfigs.end(), 0UL, + [](const size_t i, const MemoryTierCacheConfig& config){ + return i + config.getRatio(); + }); + + return memoryTierConfigs[tid].calculateTierSize(config_.getCacheSize(), partitions); +} + +template +std::vector> +CacheAllocator::createPrivateAllocator() { + std::vector> allocators; + + if (isOnShm_) + allocators.emplace_back(std::make_unique( + getAllocatorConfig(config_), + tempShm_->getAddr(), + config_.getCacheSize())); + else + allocators.emplace_back(std::make_unique( + getAllocatorConfig(config_), + config_.getCacheSize())); + + return allocators; +} + template std::unique_ptr -CacheAllocator::createNewMemoryAllocator() { +CacheAllocator::createNewMemoryAllocator(TierId tid) { + size_t tierSize = memoryTierSize(tid); return std::make_unique( getAllocatorConfig(config_), shmManager_ - ->createShm(detail::kShmCacheName, config_.size, - config_.slabMemoryBaseAddr, createShmCacheOpts()) + ->createShm(detail::kShmCacheName + std::to_string(tid), + tierSize, config_.slabMemoryBaseAddr, + createShmCacheOpts(tid)) .addr, - config_.size); + tierSize); } template std::unique_ptr -CacheAllocator::restoreMemoryAllocator() { +CacheAllocator::restoreMemoryAllocator(TierId tid) { return std::make_unique( deserializer_->deserialize(), shmManager_ - ->attachShm(detail::kShmCacheName, config_.slabMemoryBaseAddr, - createShmCacheOpts()) - .addr, - config_.size, + ->attachShm(detail::kShmCacheName + std::to_string(tid), + config_.slabMemoryBaseAddr, createShmCacheOpts(tid)).addr, + memoryTierSize(tid), config_.disableFullCoredump); } +template +std::vector> +CacheAllocator::createAllocators() { + std::vector> allocators; + for (int tid = 0; tid < getNumTiers(); tid++) { + allocators.emplace_back(createNewMemoryAllocator(tid)); + } + return allocators; +} + +template +std::vector> +CacheAllocator::restoreAllocators() { + std::vector> allocators; + for (int tid = 0; tid < getNumTiers(); tid++) { + allocators.emplace_back(restoreMemoryAllocator(tid)); + } + return allocators; +} + template std::unique_ptr -CacheAllocator::restoreCCacheManager() { +CacheAllocator::restoreCCacheManager(TierId tid) { return std::make_unique( deserializer_->deserialize(), - *allocator_); + *allocator_[tid]); } template @@ -235,23 +291,30 @@ void CacheAllocator::initWorkers() { config_.poolOptimizeStrategy, config_.ccacheOptimizeStepSizePercent); } + + if (config_.backgroundEvictorEnabled()) { + startNewBackgroundEvictor(config_.backgroundEvictorInterval, + config_.backgroundEvictorStrategy, + config_.backgroundEvictorThreads); + } + + if (config_.backgroundPromoterEnabled()) { + startNewBackgroundPromoter(config_.backgroundPromoterInterval, + config_.backgroundPromoterStrategy, + 
config_.backgroundPromoterThreads); + } } template -std::unique_ptr CacheAllocator::initAllocator( +std::vector> +CacheAllocator::initAllocator( InitMemType type) { if (type == InitMemType::kNone) { - if (isOnShm_ == true) { - return std::make_unique( - getAllocatorConfig(config_), tempShm_->getAddr(), config_.size); - } else { - return std::make_unique(getAllocatorConfig(config_), - config_.size); - } + return createPrivateAllocator(); } else if (type == InitMemType::kMemNew) { - return createNewMemoryAllocator(); + return createAllocators(); } else if (type == InitMemType::kMemAttach) { - return restoreMemoryAllocator(); + return restoreAllocators(); } // Invalid type @@ -318,13 +381,31 @@ CacheAllocator::allocate(PoolId poolId, ttlSecs == 0 ? 0 : creationTime + ttlSecs); } +template +bool CacheAllocator::shouldWakeupBgEvictor(TierId tid, PoolId pid, ClassId cid) { + // TODO: should we also work on lower tiers? should we have separate set of params? + if (tid == 1) return false; + return (1-getACStats(tid, pid, cid).usageFraction())*100 <= config_.lowEvictionAcWatermark; +} + +template +size_t CacheAllocator::backgroundWorkerId(TierId tid, PoolId pid, ClassId cid, size_t numWorkers) { + XDCHECK(numWorkers); + + // TODO: came up with some better sharding (use some hashing) + return (tid + pid + cid) % numWorkers; +} + + template typename CacheAllocator::WriteHandle -CacheAllocator::allocateInternal(PoolId pid, - typename Item::Key key, - uint32_t size, - uint32_t creationTime, - uint32_t expiryTime) { +CacheAllocator::allocateInternalTier(TierId tid, + PoolId pid, + typename Item::Key key, + uint32_t size, + uint32_t creationTime, + uint32_t expiryTime, + bool fromBgThread) { util::LatencyTracker tracker{stats().allocateLatency_}; SCOPE_FAIL { stats_.invalidAllocs.inc(); }; @@ -333,13 +414,32 @@ CacheAllocator::allocateInternal(PoolId pid, const auto requiredSize = Item::getRequiredSize(key, size); // the allocation class in our memory allocator. - const auto cid = allocator_->getAllocationClassId(pid, requiredSize); + const auto cid = allocator_[tid]->getAllocationClassId(pid, requiredSize); + util::RollingLatencyTracker rollTracker{ + (*stats_.classAllocLatency)[tid][pid][cid]}; + // TODO: per-tier (*stats_.allocAttempts)[pid][cid].inc(); + + void *memory = nullptr; + + if (tid == 0 && config_.acTopTierEvictionWatermark > 0.0 + && 100.0 * (1 - getACStats(tid, pid, cid).usageFraction()) < config_.acTopTierEvictionWatermark) { + memory = findEviction(tid, pid, cid); + } - void* memory = allocator_->allocate(pid, requiredSize); if (memory == nullptr) { - memory = findEviction(pid, cid); + // TODO: should we try allocate item even if this will result in violating + // acTopTierEvictionWatermark? + memory = allocator_[tid]->allocate(pid, requiredSize); + } + + if (backgroundEvictor_.size() && !fromBgThread && (memory == nullptr || shouldWakeupBgEvictor(tid, pid, cid))) { + backgroundEvictor_[backgroundWorkerId(tid, pid, cid, backgroundEvictor_.size())]->wakeUp(); + } + + if (memory == nullptr) { + memory = findEviction(tid, pid, cid); } WriteHandle handle; @@ -350,7 +450,7 @@ CacheAllocator::allocateInternal(PoolId pid, // for example. SCOPE_FAIL { // free back the memory to the allocator since we failed. - allocator_->free(memory); + allocator_[tid]->free(memory); }; handle = acquire(new (memory) Item(key, size, creationTime, expiryTime)); @@ -361,7 +461,7 @@ CacheAllocator::allocateInternal(PoolId pid, } } else { // failed to allocate memory. 
- (*stats_.allocFailures)[pid][cid].inc(); + (*stats_.allocFailures)[pid][cid].inc(); // TODO: per-tier // wake up rebalancer if (poolRebalancer_) { poolRebalancer_->wakeUp(); @@ -378,6 +478,90 @@ CacheAllocator::allocateInternal(PoolId pid, return handle; } +template +TierId +CacheAllocator::getTargetTierForItem(PoolId pid, + typename Item::Key key, + uint32_t size, + uint32_t creationTime, + uint32_t expiryTime) { + if (getNumTiers() == 1) + return 0; + + if (config_.forceAllocationTier != UINT64_MAX) { + return config_.forceAllocationTier; + } + + const TierId defaultTargetTier = 0; + + const auto requiredSize = Item::getRequiredSize(key, size); + const auto cid = allocator_[defaultTargetTier]->getAllocationClassId(pid, requiredSize); + + auto freePercentage = 100.0 * (1 - getACStats(defaultTargetTier, pid, cid).usageFraction()); + + // TODO: COULD we implement BG worker which would move slabs around + // so that there is similar amount of free space in each pool/ac. + // Should this be responsibility of BG evictor? + + if (freePercentage >= config_.maxAcAllocationWatermark) + return defaultTargetTier; + + if (freePercentage <= config_.minAcAllocationWatermark) + return defaultTargetTier + 1; + + // TODO: we can think about creating different allocation classes for different memory tiers + // and we could look at possible fragmentation when deciding where to put the item + if (config_.sizeThresholdPolicy) + return requiredSize < config_.sizeThresholdPolicy ? defaultTargetTier : defaultTargetTier + 1; + + // TODO: (e.g. always put chained items to second tier) + // if (chainedItemsPolicy) + // return item.isChainedItem() ? defaultTargetTier + 1 : defaultTargetTier; + + // TODO: + // if (expiryTimePolicy) + // return (expiryTime - creationTime) < expiryTimePolicy ? defaultTargetTier : defaultTargetTier + 1; + + // TODO: + // if (keyPolicy) // this can be based on key length or some other properties + // return getTargetTierForKey(key); + + // TODO: + // if (compressabilityPolicy) // if compresses well store on CXL? latency will be higher anyway + // return TODO; + + if (config_.defaultTierChancePercentage >= 100.00) { + return 0; + } + + return (folly::Random::rand32() % 100) < config_.defaultTierChancePercentage ? defaultTargetTier : defaultTargetTier + 1; +} + +template +bool CacheAllocator::shouldEvictToNextMemoryTier( + TierId sourceTierId, TierId targetTierId, PoolId pid, Item& item) +{ + if (config_.disableEvictionToMemory) + return false; + + // TODO: implement more advanced admission policies for memory tiers, for example: + // - always evict big items to NVMe only + // - do not evict an item if pressure on target Tier is high + return true; +} + +template +typename CacheAllocator::WriteHandle +CacheAllocator::allocateInternal(PoolId pid, + typename Item::Key key, + uint32_t size, + uint32_t creationTime, + uint32_t expiryTime, + bool fromBgThread) { + auto tid = getTargetTierForItem(pid, key, size, creationTime, expiryTime); + return allocateInternalTier(tid, pid, key, size, creationTime, expiryTime, fromBgThread); +} + template typename CacheAllocator::WriteHandle CacheAllocator::allocateChainedItem(const ReadHandle& parent, @@ -408,21 +592,29 @@ CacheAllocator::allocateChainedItemInternal( // number of bytes required for this item const auto requiredSize = ChainedItem::getRequiredSize(size); - const auto pid = allocator_->getAllocInfo(parent->getMemory()).poolId; - const auto cid = allocator_->getAllocationClassId(pid, requiredSize); + // TODO: is this correct? 
+ auto tid = getTierId(*parent); + const auto pid = allocator_[tid]->getAllocInfo(parent->getMemory()).poolId; + const auto cid = allocator_[tid]->getAllocationClassId(pid, requiredSize); + + util::RollingLatencyTracker rollTracker{ + (*stats_.classAllocLatency)[tid][pid][cid]}; + + // TODO: per-tier? Right now stats_ are not used in any public periodic + // worker (*stats_.allocAttempts)[pid][cid].inc(); - void* memory = allocator_->allocate(pid, requiredSize); + void* memory = allocator_[tid]->allocate(pid, requiredSize); if (memory == nullptr) { - memory = findEviction(pid, cid); + memory = findEviction(tid, pid, cid); } if (memory == nullptr) { (*stats_.allocFailures)[pid][cid].inc(); return WriteHandle{}; } - SCOPE_FAIL { allocator_->free(memory); }; + SCOPE_FAIL { allocator_[tid]->free(memory); }; auto child = acquire( new (memory) ChainedItem(compressor_.compress(parent.getInternal()), size, @@ -466,14 +658,15 @@ void CacheAllocator::addChainedItem(WriteHandle& parent, // Count a new child stats_.numChainedChildItems.inc(); - insertInMMContainer(*child); - // Increment refcount since this chained item is now owned by the parent // Parent will decrement the refcount upon release. Since this is an // internal refcount, we dont include it in active handle tracking. - child->incRef(); + auto ret = child->incRef(true); + XDCHECK(ret == RefcountWithFlags::incResult::incOk); XDCHECK_EQ(2u, child->getRefCount()); + insertInMMContainer(*child); + invalidateNvm(*parent); if (auto eventTracker = getEventTracker()) { eventTracker->record(AllocatorApiEvent::ADD_CHAINED, parent->getKey(), @@ -717,7 +910,8 @@ CacheAllocator::replaceChainedItemLocked(Item& oldItem, // Since this is an internal refcount, we dont include it in active handle // tracking. - newItemHdl->incRef(); + auto ret = newItemHdl->incRef(true); + XDCHECK(ret == RefcountWithFlags::incResult::incOk); return oldItemHdl; } @@ -731,8 +925,8 @@ CacheAllocator::releaseBackToAllocator(Item& it, throw std::runtime_error( folly::sformat("cannot release this item: {}", it.toString())); } - - const auto allocInfo = allocator_->getAllocInfo(it.getMemory()); + const auto tid = getTierId(it); + const auto allocInfo = allocator_[tid]->getAllocInfo(it.getMemory()); if (ctx == RemoveContext::kEviction) { const auto timeNow = util::getCurrentTimeSec(); @@ -756,8 +950,7 @@ CacheAllocator::releaseBackToAllocator(Item& it, folly::sformat("Can not recycle a chained item {}, toRecyle", it.toString(), toRecycle->toString())); } - - allocator_->free(&it); + allocator_[tid]->free(&it); return ReleaseRes::kReleased; } @@ -826,26 +1019,29 @@ CacheAllocator::releaseBackToAllocator(Item& it, auto next = head->getNext(compressor_); const auto childInfo = - allocator_->getAllocInfo(static_cast(head)); + allocator_[tid]->getAllocInfo(static_cast(head)); (*stats_.fragmentationSize)[childInfo.poolId][childInfo.classId].sub( util::getFragmentation(*this, *head)); removeFromMMContainer(*head); - // If this chained item is marked as exclusive, we will not free it. - // We must capture the exclusive state before we do the decRef when - // we know the item must still be valid - const bool wasExclusive = head->isExclusive(); + // If this chained item is marked as moving, we will not free it. + // We must capture the moving state before we do the decRef when + // we know the item must still be valid. Item cannot be marked as + // exclusive. Only parent can be marked as such and even parent needs + // to be unmark prior to calling releaseBackToAllocator. 
+ const bool wasMoving = head->isMoving(); + XDCHECK(!head->isMarkedForEviction()); // Decref and check if we were the last reference. Now if the item - // was marked exclusive, after decRef, it will be free to be released + // was marked moving, after decRef, it will be free to be released // by slab release thread const auto childRef = head->decRef(); - // If the item is already exclusive and we already decremented the + // If the item is already moving and we already decremented the // refcount, we don't need to free this item. We'll let the slab // release thread take care of that - if (!wasExclusive) { + if (!wasMoving) { if (childRef != 0) { throw std::runtime_error(folly::sformat( "chained item refcount is not zero. We cannot proceed! " @@ -853,13 +1049,13 @@ CacheAllocator::releaseBackToAllocator(Item& it, childRef, head->toString())); } - // Item is not exclusive and refcount is 0, we can proceed to + // Item is not moving and refcount is 0, we can proceed to // free it or recylce the memory if (head == toRecycle) { XDCHECK(ReleaseRes::kReleased != res); res = ReleaseRes::kRecycled; } else { - allocator_->free(head); + allocator_[tid]->free(head); } } @@ -874,16 +1070,19 @@ CacheAllocator::releaseBackToAllocator(Item& it, res = ReleaseRes::kRecycled; } else { XDCHECK(it.isDrained()); - allocator_->free(&it); + allocator_[tid]->free(&it); } return res; } template -void CacheAllocator::incRef(Item& it) { - it.incRef(); - ++handleCount_.tlStats(); +RefcountWithFlags::incResult CacheAllocator::incRef(Item& it, bool failIfMoving) { + auto ret = it.incRef(failIfMoving); + if (ret == RefcountWithFlags::incResult::incOk) { + ++handleCount_.tlStats(); + } + return ret; } template @@ -903,8 +1102,18 @@ CacheAllocator::acquire(Item* it) { SCOPE_FAIL { stats_.numRefcountOverflow.inc(); }; - incRef(*it); - return WriteHandle{it, *this}; + // TODO: do not block incRef for child items to avoid deadlock + auto failIfMoving = getNumTiers() > 1 && !it->isChainedItem(); + auto incRes = incRef(*it, failIfMoving); + if (LIKELY(incRes == RefcountWithFlags::incResult::incOk)) { + return WriteHandle{it, *this}; + } else if (incRes == RefcountWithFlags::incResult::incFailedEviction){ + // item is being evicted + return WriteHandle{}; + } else { + // item is being moved - wait for completion + return handleWithWaitContextForMovingItem(*it); + } } template @@ -946,6 +1155,25 @@ bool CacheAllocator::replaceInMMContainer(Item& oldItem, } } +template +bool CacheAllocator::replaceInMMContainer(Item* oldItem, + Item& newItem) { + return replaceInMMContainer(*oldItem, newItem); +} + +template +bool CacheAllocator::replaceInMMContainer(EvictionIterator& oldItemIt, + Item& newItem) { + auto& oldContainer = getMMContainer(*oldItemIt); + auto& newContainer = getMMContainer(newItem); + + // This function is used for eviction across tiers + XDCHECK(&oldContainer != &newContainer); + oldContainer.remove(oldItemIt); + + return newContainer.add(newItem); +} + template bool CacheAllocator::replaceChainedItemInMMContainer( Item& oldItem, Item& newItem) { @@ -1091,6 +1319,165 @@ CacheAllocator::insertOrReplace(const WriteHandle& handle) { return replaced; } +/* Next two methods are used to asynchronously move Item between memory tiers. + * + * The thread, which moves Item, allocates new Item in the tier we are moving to + * and calls moveRegularItemWithSync() method. This method does the following: + * 1. Update the access container with the new item from the tier we are + * moving to. This Item has moving flag set. + * 2. 
Copy data from the old Item to the new one. + * + * Concurrent threads which are getting handle to the same key: + * 1. When a handle is created it checks if the moving flag is set + * 2. If so, Handle implementation creates waitContext and adds it to the + * MoveCtx by calling handleWithWaitContextForMovingItem() method. + * 3. Wait until the moving thread will complete its job. + */ +template +typename CacheAllocator::WriteHandle +CacheAllocator::handleWithWaitContextForMovingItem(Item& item) { + auto shard = getShardForKey(item.getKey()); + auto& movesMap = getMoveMapForShard(shard); + { + auto lock = getMoveLockForShard(shard); + + WriteHandle hdl{*this}; + auto waitContext = hdl.getItemWaitContext(); + + auto ret = movesMap.try_emplace(item.getKey(), std::make_unique()); + ret.first->second->addWaiter(std::move(waitContext)); + + return hdl; + } +} + +template +size_t CacheAllocator::wakeUpWaitersLocked(folly::StringPiece key, + WriteHandle&& handle) { + std::unique_ptr ctx; + auto shard = getShardForKey(key); + auto& movesMap = getMoveMapForShard(shard); + { + auto lock = getMoveLockForShard(shard); + movesMap.eraseInto(key, [&](auto &&key, auto &&value) { + ctx = std::move(value); + }); + } + + if (ctx) { + ctx->setItemHandle(std::move(handle)); + return ctx->numWaiters(); + } + + return 0; +} + +template +bool CacheAllocator::moveRegularItemWithSync( + Item& oldItem, WriteHandle& newItemHdl) { + //on function exit - the new item handle is no longer moving + //and other threads may access it - but in case where + //we failed to replace in access container we can give the + //new item back to the allocator + auto guard = folly::makeGuard([&]() { + auto ref = newItemHdl->unmarkMoving(); + if (UNLIKELY(ref == 0)) { + const auto res = + releaseBackToAllocator(*newItemHdl, RemoveContext::kNormal, false); + XDCHECK(res == ReleaseRes::kReleased); + } + }); + + XDCHECK(oldItem.isMoving()); + XDCHECK(!oldItem.isExpired()); + // TODO: should we introduce new latency tracker. E.g. evictRegularLatency_ + // ??? util::LatencyTracker tracker{stats_.evictRegularLatency_}; + + XDCHECK_EQ(newItemHdl->getSize(), oldItem.getSize()); + + // take care of the flags before we expose the item to be accessed. this + // will ensure that when another thread removes the item from RAM, we issue + // a delete accordingly. See D7859775 for an example + if (oldItem.isNvmClean()) { + newItemHdl->markNvmClean(); + } + + // mark new item as moving to block readers until the data is copied + // (moveCb is called). Mark item in MMContainer temporarily (TODO: should + // we remove markMoving requirement for the item to be linked?) + newItemHdl->markInMMContainer(); + auto marked = newItemHdl->markMoving(false /* there is already a handle */); + newItemHdl->unmarkInMMContainer(); + XDCHECK(marked); + + auto predicate = [&](const Item& item){ + // we rely on moving flag being set (it should block all readers) + XDCHECK(item.getRefCount() == 0); + return true; + }; + + auto replaced = accessContainer_->replaceIf(oldItem, *newItemHdl, + predicate); + // another thread may have called insertOrReplace which could have + // marked this item as unaccessible causing the replaceIf + // in the access container to fail - in this case we want + // to abort the move since the item is no longer valid + if (!replaced) { + return false; + } + // what if another thread calls insertOrReplace now when + // the item is moving and already replaced in the hash table? + // 1. 
it succeeds in updating the hash table - so there is + // no guarentee that isAccessible() is true + // 2. it will then try to remove from MM container + // - this operation will wait for newItemHdl to + // be unmarkedMoving via the waitContext + // 3. replaced handle is returned and eventually drops + // ref to 0 and the item is recycled back to allocator. + + if (config_.moveCb) { + // Execute the move callback. We cannot make any guarantees about the + // consistency of the old item beyond this point, because the callback can + // do more than a simple memcpy() e.g. update external references. If there + // are any remaining handles to the old item, it is the caller's + // responsibility to invalidate them. The move can only fail after this + // statement if the old item has been removed or replaced, in which case it + // should be fine for it to be left in an inconsistent state. + config_.moveCb(oldItem, *newItemHdl, nullptr); + } else { + std::memcpy(newItemHdl->getMemory(), oldItem.getMemory(), + oldItem.getSize()); + } + + // Adding the item to mmContainer has to succeed since no one can remove the item + auto& newContainer = getMMContainer(*newItemHdl); + auto mmContainerAdded = newContainer.add(*newItemHdl); + XDCHECK(mmContainerAdded); + + // no one can add or remove chained items at this point + if (oldItem.hasChainedItem()) { + // safe to acquire handle for a moving Item + auto incRes = incRef(oldItem, false); + XDCHECK(incRes == RefcountWithFlags::incResult::incOk); + auto oldHandle = WriteHandle{&oldItem,*this}; + XDCHECK_EQ(1u, oldHandle->getRefCount()) << oldHandle->toString(); + XDCHECK(!newItemHdl->hasChainedItem()) << newItemHdl->toString(); + try { + auto l = chainedItemLocks_.lockExclusive(oldItem.getKey()); + transferChainLocked(oldHandle, newItemHdl); + } catch (const std::exception& e) { + // this should never happen because we drained all the handles. + XLOGF(DFATAL, "{}", e.what()); + throw; + } + + XDCHECK(!oldItem.hasChainedItem()); + XDCHECK(newItemHdl->hasChainedItem()); + } + newItemHdl.unmarkNascent(); + return true; +} + template bool CacheAllocator::moveRegularItem(Item& oldItem, WriteHandle& newItemHdl) { @@ -1122,15 +1509,15 @@ bool CacheAllocator::moveRegularItem(Item& oldItem, config_.moveCb(oldItem, *newItemHdl, nullptr); // Inside the access container's lock, this checks if the old item is - // accessible and its refcount is zero. If the item is not accessible, + // accessible and its refcount is one. If the item is not accessible, // there is no point to replace it since it had already been removed // or in the process of being removed. If the item is in cache but the - // refcount is non-zero, it means user could be attempting to remove + // refcount is non-one, it means user could be attempting to remove // this item through an API such as remove(itemHandle). In this case, // it is unsafe to replace the old item with a new one, so we should // also abort. 
if (!accessContainer_->replaceIf(oldItem, *newItemHdl, - itemExclusivePredicate)) { + itemSlabMovePredicate)) { return false; } @@ -1151,13 +1538,12 @@ bool CacheAllocator::moveRegularItem(Item& oldItem, // no one can add or remove chained items at this point if (oldItem.hasChainedItem()) { - // safe to acquire handle for a moving Item - auto oldHandle = acquire(&oldItem); - XDCHECK_EQ(1u, oldHandle->getRefCount()) << oldHandle->toString(); + auto oldItemHdl = acquire(&oldItem); + XDCHECK_EQ(1u, oldItemHdl->getRefCount()) << oldItemHdl->toString(); XDCHECK(!newItemHdl->hasChainedItem()) << newItemHdl->toString(); try { auto l = chainedItemLocks_.lockExclusive(oldItem.getKey()); - transferChainLocked(oldHandle, newItemHdl); + transferChainLocked(oldItemHdl, newItemHdl); } catch (const std::exception& e) { // this should never happen because we drained all the handles. XLOGF(DFATAL, "{}", e.what()); @@ -1179,18 +1565,18 @@ bool CacheAllocator::moveChainedItem(ChainedItem& oldItem, // This item has been unlinked from its parent and we're the only // owner of it, so we're done here - if (!oldItem.isInMMContainer() || oldItem.isOnlyExclusive()) { + if (!oldItem.isInMMContainer() || oldItem.isOnlyMoving()) { return false; } - const auto parentKey = oldItem.getParentItem(compressor_).getKey(); - - // Grab lock to prevent anyone else from modifying the chain + auto& expectedParent = oldItem.getParentItem(compressor_); + const auto parentKey = expectedParent.getKey(); auto l = chainedItemLocks_.lockExclusive(parentKey); + // verify old item under the lock auto parentHandle = validateAndGetParentHandleForChainedMoveLocked(oldItem, parentKey); - if (!parentHandle) { + if (!parentHandle || &expectedParent != parentHandle.get()) { return false; } @@ -1210,7 +1596,7 @@ bool CacheAllocator::moveChainedItem(ChainedItem& oldItem, // In case someone else had removed this chained item from its parent by now // So we check again to see if the it has been unlinked from its parent - if (!oldItem.isInMMContainer() || oldItem.isOnlyExclusive()) { + if (!oldItem.isInMMContainer() || oldItem.isOnlyMoving()) { return false; } @@ -1226,90 +1612,187 @@ bool CacheAllocator::moveChainedItem(ChainedItem& oldItem, // parent's chain and the MMContainer. auto oldItemHandle = replaceChainedItemLocked(oldItem, std::move(newItemHdl), *parentHandle); - XDCHECK(oldItemHandle->isExclusive()); + XDCHECK(oldItemHandle->isMoving()); XDCHECK(!oldItemHandle->isInMMContainer()); return true; } template -typename CacheAllocator::Item* -CacheAllocator::findEviction(PoolId pid, ClassId cid) { - auto& mmContainer = getMMContainer(pid, cid); +typename CacheAllocator::NvmCacheT::PutToken +CacheAllocator::createPutToken(Item& item) { + const bool evictToNvmCache = shouldWriteToNvmCache(item); + return evictToNvmCache ? nvmCache_->createPutToken(item.getKey()) + : typename NvmCacheT::PutToken{}; +} + +template +void CacheAllocator::unlinkItemForEviction(Item& it) { + XDCHECK(it.isMarkedForEviction()); + XDCHECK(it.getRefCount() == 0); + accessContainer_->remove(it); + removeFromMMContainer(it); + // Since we managed to mark the item for eviction we must be the only + // owner of the item. 
+ const auto ref = it.unmarkForEviction(); + XDCHECK(ref == 0u); +} + +template +typename CacheAllocator::Item* +CacheAllocator::findEviction(TierId tid, PoolId pid, ClassId cid) { + auto& mmContainer = getMMContainer(tid, pid, cid); + bool lastTier = tid+1 >= getNumTiers(); // Keep searching for a candidate until we were able to evict it // or until the search limit has been exhausted unsigned int searchTries = 0; - auto itr = mmContainer.getEvictionIterator(); while ((config_.evictionSearchTries == 0 || - config_.evictionSearchTries > searchTries) && - itr) { - ++searchTries; - (*stats_.evictionAttempts)[pid][cid].inc(); + config_.evictionSearchTries > searchTries)) { + + Item* toRecycle = nullptr; + Item* candidate = nullptr; + typename NvmCacheT::PutToken token; + + mmContainer.withEvictionIterator([this, pid, cid, &candidate, &toRecycle, + &searchTries, &mmContainer, &lastTier, + &token](auto&& itr) { + if (!itr) { + ++searchTries; + (*stats_.evictionAttempts)[pid][cid].inc(); + return; + } - Item* toRecycle = itr.get(); + while ((config_.evictionSearchTries == 0 || + config_.evictionSearchTries > searchTries) && + itr) { + ++searchTries; + (*stats_.evictionAttempts)[pid][cid].inc(); + + auto* toRecycle_ = itr.get(); + auto* candidate_ = + toRecycle_->isChainedItem() + ? &toRecycle_->asChainedItem().getParentItem(compressor_) + : toRecycle_; + + if (lastTier) { + // if it's last tier, the item will be evicted + // need to create put token before marking it exclusive + token = createPutToken(*candidate_); + } - Item* candidate = - toRecycle->isChainedItem() - ? &toRecycle->asChainedItem().getParentItem(compressor_) - : toRecycle; + if (lastTier && shouldWriteToNvmCache(*candidate_) && !token.isValid()) { + stats_.evictFailConcurrentFill.inc(); + } else if ( (lastTier && candidate_->markForEviction()) || + (!lastTier && candidate_->markMoving(true)) ) { + XDCHECK(candidate_->isMoving() || candidate_->isMarkedForEviction()); + // markForEviction to make sure no other thead is evicting the item + // nor holding a handle to that item if this is last tier + // since we won't be moving the item to the next tier + toRecycle = toRecycle_; + candidate = candidate_; + + // Check if parent changed for chained items - if yes, we cannot + // remove the child from the mmContainer as we will not be evicting + // it. We could abort right here, but we need to cleanup in case + // unmarkForEviction() returns 0 - so just go through normal path. + if (!toRecycle_->isChainedItem() || + &toRecycle->asChainedItem().getParentItem(compressor_) == + candidate) + mmContainer.remove(itr); + return; + } - // make sure no other thead is evicting the item - if (candidate->getRefCount() != 0 || !candidate->markExclusive()) { - ++itr; - continue; - } + if (candidate_->hasChainedItem()) { + stats_.evictFailParentAC.inc(); + } else { + stats_.evictFailAC.inc(); + } - // for chained items, the ownership of the parent can change. We try to - // evict what we think as parent and see if the eviction of parent - // recycles the child we intend to. - bool evictionSuccessful = false; - { - auto toReleaseHandle = - itr->isChainedItem() - ? advanceIteratorAndTryEvictChainedItem(itr) - : advanceIteratorAndTryEvictRegularItem(mmContainer, itr); - evictionSuccessful = toReleaseHandle != nullptr; - // destroy toReleseHandle. The item won't be released to allocator - // since we marked it as exclusive. 
- } - - const auto ref = candidate->unmarkExclusive(); - if (ref == 0u) { - // Invalidate iterator since later on we may use this mmContainer - // again, which cannot be done unless we drop this iterator - itr.destroy(); - - // recycle the item. it's safe to do so, even if toReleaseHandle was - // NULL. If `ref` == 0 then it means that we are the last holder of - // that item. - if (candidate->hasChainedItem()) { - (*stats_.chainedItemEvictions)[pid][cid].inc(); - } else { - (*stats_.regularItemEvictions)[pid][cid].inc(); + ++itr; + XDCHECK(toRecycle == nullptr); + XDCHECK(candidate == nullptr); } + }); - if (auto eventTracker = getEventTracker()) { - eventTracker->record(AllocatorApiEvent::DRAM_EVICT, candidate->getKey(), - AllocatorApiResult::EVICTED, candidate->getSize(), - candidate->getConfiguredTTL().count()); - } + if (!toRecycle) + continue; - // check if by releasing the item we intend to, we actually - // recycle the candidate. - if (ReleaseRes::kRecycled == - releaseBackToAllocator(*candidate, RemoveContext::kEviction, - /* isNascent */ false, toRecycle)) { - return toRecycle; + XDCHECK(toRecycle); + XDCHECK(candidate); + + auto evictedToNext = lastTier ? nullptr + : tryEvictToNextMemoryTier(*candidate, false); + if (!evictedToNext) { + //if insertOrReplace was called during move + //then candidate will not be accessible (failed replace during tryEvict) + // - therefore this was why we failed to + // evict to the next tier and insertOrReplace + // will remove from NVM cache + //however, if candidate is accessible + //that means the allocation in the next + //tier failed - so we will continue to + //evict the item to NVM cache + bool failedToReplace = !candidate->isAccessible(); + if (!token.isValid() && !failedToReplace) { + token = createPutToken(*candidate); } + // tryEvictToNextMemoryTier can fail if: + // a) allocation of the new item fails in that case, + // it should be still possible to mark item for eviction. + // b) another thread calls insertOrReplace and the item + // is no longer accessible + // + // in case that we are on the last tier, we whould have already marked + // as exclusive since we will not be moving the item to the next tier + // but rather just evicting all together, no need to + // markForEvictionWhenMoving + auto ret = lastTier ? true : candidate->markForEvictionWhenMoving(); + XDCHECK(ret); + + unlinkItemForEviction(*candidate); + + if (token.isValid() && shouldWriteToNvmCacheExclusive(*candidate) + && !failedToReplace) { + nvmCache_->put(*candidate, std::move(token)); + } + // wake up any readers that wait for the move to complete + // it's safe to do now, as we have the item marked exclusive and + // no other reader can be added to the waiters list + wakeUpWaiters(*candidate, {}); + } else { - XDCHECK(!evictionSuccessful); + XDCHECK(!evictedToNext->isMarkedForEviction() && !evictedToNext->isMoving()); + XDCHECK(!candidate->isMarkedForEviction() && !candidate->isMoving()); + XDCHECK(!candidate->isAccessible()); + XDCHECK(candidate->getKey() == evictedToNext->getKey()); + + wakeUpWaiters(*candidate, std::move(evictedToNext)); } - // If we destroyed the itr to possibly evict and failed, we restart - // from the beginning again - if (!itr) { - itr.resetToBegin(); + XDCHECK(!candidate->isMarkedForEviction() && !candidate->isMoving()); + + // recycle the item. it's safe to do so, even if toReleaseHandle was + // NULL. If `ref` == 0 then it means that we are the last holder of + // that item. 
+ if (candidate->hasChainedItem()) {
+ (*stats_.chainedItemEvictions)[pid][cid].inc();
+ } else {
+ (*stats_.regularItemEvictions)[pid][cid].inc();
+ }
+
+ if (auto eventTracker = getEventTracker()) {
+ eventTracker->record(AllocatorApiEvent::DRAM_EVICT, candidate->getKey(),
+ AllocatorApiResult::EVICTED, candidate->getSize(),
+ candidate->getConfiguredTTL().count());
+ }
+
+ // check if by releasing the item we intend to, we actually
+ // recycle the candidate.
+ auto ret = releaseBackToAllocator(*candidate, RemoveContext::kEviction,
+ /* isNascent */ false, toRecycle);
+ if (ret == ReleaseRes::kRecycled) {
+ return toRecycle;
 }
 }
 return nullptr;
@@ -1363,6 +1846,97 @@ bool CacheAllocator::shouldWriteToNvmCacheExclusive(
 return true;
 }
+template
+typename CacheAllocator::WriteHandle
+CacheAllocator::tryEvictToNextMemoryTier(
+ TierId tid, PoolId pid, Item& item, bool fromBgThread) {
+ XDCHECK(item.isMoving());
+ XDCHECK(item.getRefCount() == 0);
+ if(item.hasChainedItem()) return WriteHandle{}; // TODO: We do not support ChainedItem yet
+ if(item.isExpired()) {
+ accessContainer_->remove(item);
+ item.unmarkMoving();
+ return acquire(&item);
+ }
+
+ TierId nextTier = tid;
+ while (++nextTier < getNumTiers()) { // try to evict down to the next memory tier
+ if (!shouldEvictToNextMemoryTier(tid, nextTier, pid, item))
+ continue;
+
+ // allocateInternalTier might trigger another eviction
+ auto newItemHdl = allocateInternalTier(nextTier, pid,
+ item.getKey(),
+ item.getSize(),
+ item.getCreationTime(),
+ item.getExpiryTime(),
+ fromBgThread);
+
+ if (newItemHdl) {
+ XDCHECK_EQ(newItemHdl->getSize(), item.getSize());
+ if (!moveRegularItemWithSync(item, newItemHdl)) {
+ return WriteHandle{};
+ }
+ XDCHECK_EQ(newItemHdl->getKey(), item.getKey());
+ item.unmarkMoving();
+ return newItemHdl;
+ } else {
+ return WriteHandle{};
+ }
+ }
+
+ return {};
+}
+
+template
+typename CacheAllocator::WriteHandle
+CacheAllocator::tryEvictToNextMemoryTier(Item& item, bool fromBgThread) {
+ auto tid = getTierId(item);
+ auto pid = allocator_[tid]->getAllocInfo(item.getMemory()).poolId;
+ return tryEvictToNextMemoryTier(tid, pid, item, fromBgThread);
+}
+
+template
+typename CacheAllocator::WriteHandle
+CacheAllocator::tryPromoteToNextMemoryTier(
+ TierId tid, PoolId pid, Item& item, bool fromBgThread) {
+ if(item.isExpired()) { return {}; }
+ TierId nextTier = tid;
+ while (nextTier > 0) { // try to promote up to the next memory tier
+ auto toPromoteTier = nextTier - 1;
+ --nextTier;
+
+ // allocateInternalTier might trigger another eviction
+ auto newItemHdl = allocateInternalTier(toPromoteTier, pid,
+ item.getKey(),
+ item.getSize(),
+ item.getCreationTime(),
+ item.getExpiryTime(),
+ fromBgThread);
+
+ if (newItemHdl) {
+ XDCHECK_EQ(newItemHdl->getSize(), item.getSize());
+ if (!moveRegularItemWithSync(item, newItemHdl)) {
+ return WriteHandle{};
+ }
+ item.unmarkMoving();
+ return newItemHdl;
+ } else {
+ return WriteHandle{};
+ }
+ }
+
+ return {};
+}
+
+template
+typename CacheAllocator::WriteHandle
+CacheAllocator::tryPromoteToNextMemoryTier(Item& item, bool fromBgThread) {
+ auto tid = getTierId(item);
+ auto pid = allocator_[tid]->getAllocInfo(item.getMemory()).poolId;
+ return tryPromoteToNextMemoryTier(tid, pid, item, fromBgThread);
+}
+
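Note that the eviction path above no longer holds a container iterator across the whole search; candidates are visited inside `withEvictionIterator`, which runs the callback under the container's internal lock. A minimal sketch of that pattern (illustrative only; the predicate and variable names are not part of this diff):

```cpp
// Illustrative sketch of the withEvictionIterator pattern. Any state needed
// after the callback returns must be copied out, because the iterator is
// only valid while the container lock is held.
Item* victim = nullptr;
mmContainer.withEvictionIterator([&victim](auto&& itr) {
  while (itr) {
    if (itr->getRefCount() == 0) { // plausible predicate, for illustration
      victim = itr.get();          // capture the raw pointer
      return;                      // iterator dies when the callback exits
    }
    ++itr;
  }
});
// `victim` could become stale once the lock is released; findEviction()
// guards against that by marking the item moving/for-eviction while still
// inside the callback.
```
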
 template
 typename CacheAllocator::RemoveRes
 CacheAllocator::remove(typename Item::Key key) {
@@ -1453,7 +2027,7 @@ bool CacheAllocator::pushToNvmCacheFromRamForTesting(
 if (handle && nvmCache_ && shouldWriteToNvmCache(*handle) &&
 shouldWriteToNvmCacheExclusive(*handle)) {
- nvmCache_->put(handle, nvmCache_->createPutToken(handle->getKey()));
+ nvmCache_->put(*handle, nvmCache_->createPutToken(handle->getKey()));
 return true;
 }
 return false;
@@ -1563,21 +2137,57 @@ void CacheAllocator::invalidateNvm(Item& item) {
 }
 }
+template
+TierId
+CacheAllocator::getTierId(const Item& item) const {
+ return getTierId(item.getMemory());
+}
+
+template
+TierId
+CacheAllocator::getTierId(const void* ptr) const {
+ for (TierId tid = 0; tid < getNumTiers(); tid++) {
+ if (allocator_[tid]->isMemoryInAllocator(ptr))
+ return tid;
+ }
+
+ throw std::invalid_argument("Item does not belong to any tier!");
+}
+
 template
 typename CacheAllocator::MMContainer&
 CacheAllocator::getMMContainer(const Item& item) const noexcept {
+ const auto tid = getTierId(item);
 const auto allocInfo =
- allocator_->getAllocInfo(static_cast(&item));
- return getMMContainer(allocInfo.poolId, allocInfo.classId);
+ allocator_[tid]->getAllocInfo(static_cast(&item));
+ return getMMContainer(tid, allocInfo.poolId, allocInfo.classId);
 }
 
 template
 typename CacheAllocator::MMContainer&
-CacheAllocator::getMMContainer(PoolId pid,
+CacheAllocator::getMMContainer(TierId tid,
+ PoolId pid,
 ClassId cid) const noexcept {
- XDCHECK_LT(static_cast(pid), mmContainers_.size());
- XDCHECK_LT(static_cast(cid), mmContainers_[pid].size());
- return *mmContainers_[pid][cid];
+ XDCHECK_LT(static_cast(tid), mmContainers_.size());
+ XDCHECK_LT(static_cast(pid), mmContainers_[tid].size());
+ XDCHECK_LT(static_cast(cid), mmContainers_[tid][pid].size());
+ return *mmContainers_[tid][pid][cid];
+}
+
+template
+MMContainerStat CacheAllocator::getMMContainerStat(
+ TierId tid, PoolId pid, ClassId cid) const noexcept {
+ if(static_cast(tid) >= mmContainers_.size()) {
+ return MMContainerStat{};
+ }
+ if (static_cast(pid) >= mmContainers_[tid].size()) {
+ return MMContainerStat{};
+ }
+ if (static_cast(cid) >= mmContainers_[tid][pid].size()) {
+ return MMContainerStat{};
+ }
+ return mmContainers_[tid][pid][cid] ? mmContainers_[tid][pid][cid]->getStats()
+ : MMContainerStat{};
 }
 
 template
@@ -1763,8 +2373,9 @@ void CacheAllocator::markUseful(const ReadHandle& handle,
 template
 bool CacheAllocator::recordAccessInMMContainer(Item& item,
 AccessMode mode) {
+ const auto tid = getTierId(item);
 const auto allocInfo =
- allocator_->getAllocInfo(static_cast(&item));
+ allocator_[tid]->getAllocInfo(static_cast(&item));
 (*stats_.cacheHits)[allocInfo.poolId][allocInfo.classId].inc();
 
 // track recently accessed items if needed
@@ -1772,14 +2383,15 @@ bool CacheAllocator::recordAccessInMMContainer(Item& item,
 ring_->trackItem(reinterpret_cast(&item), item.getSize());
 }
 
- auto& mmContainer = getMMContainer(allocInfo.poolId, allocInfo.classId);
+ auto& mmContainer = getMMContainer(tid, allocInfo.poolId, allocInfo.classId);
 return mmContainer.recordAccess(item, mode);
 }
 
 template
 uint32_t CacheAllocator::getUsableSize(const Item& item) const {
+ const auto tid = getTierId(item);
 const auto allocSize =
- allocator_->getAllocInfo(static_cast(&item)).allocSize;
+ allocator_[tid]->getAllocInfo(static_cast(&item)).allocSize;
 return item.isChainedItem()
 ? allocSize - ChainedItem::getRequiredSize(0)
 : allocSize - Item::getRequiredSize(item.getKey(), 0);
@@ -1788,8 +2400,10 @@ uint32_t CacheAllocator::getUsableSize(const Item& item) const {
 template
 typename CacheAllocator::SampleItem
 CacheAllocator::getSampleItem() {
- size_t nvmCacheSize = nvmCache_ ?
nvmCache_->getUsableSize() : 0; - size_t ramCacheSize = allocator_->getMemorySizeInclAdvised(); + // TODO: is using random tier a good idea? + auto tid = folly::Random::rand32() % getNumTiers(); + static size_t nvmCacheSize = nvmCache_ ? nvmCache_->getUsableSize() : 0; + static size_t ramCacheSize = allocator_[tid]->getMemorySizeInclAdvised(); bool fromNvm = folly::Random::rand64(0, nvmCacheSize + ramCacheSize) >= ramCacheSize; @@ -1798,19 +2412,18 @@ CacheAllocator::getSampleItem() { } // Sampling from DRAM cache - auto item = reinterpret_cast(allocator_->getRandomAlloc()); + auto item = reinterpret_cast(allocator_[tid]->getRandomAlloc()); if (!item) { return SampleItem{false /* fromNvm */}; } // Check that item returned is the same that was sampled - auto sharedHdl = std::make_shared(findInternal(item->getKey())); if (sharedHdl->get() != item) { return SampleItem{false /* fromNvm */}; } - const auto allocInfo = allocator_->getAllocInfo(item->getMemory()); + const auto allocInfo = allocator_[tid]->getAllocInfo(item->getMemory()); // Convert the Item to IOBuf to make SampleItem auto iobuf = folly::IOBuf{ @@ -1829,28 +2442,33 @@ CacheAllocator::getSampleItem() { template std::vector CacheAllocator::dumpEvictionIterator( - PoolId pid, ClassId cid, size_t numItems) { + PoolId pid, ClassId cid, size_t numItems) { if (numItems == 0) { return {}; } - if (static_cast(pid) >= mmContainers_.size() || - static_cast(cid) >= mmContainers_[pid].size()) { + // Always evict from the lowest layer. + int tid = getNumTiers() - 1; + + if (static_cast(tid) >= mmContainers_.size() || + static_cast(pid) >= mmContainers_[tid].size() || + static_cast(cid) >= mmContainers_[tid][pid].size()) { throw std::invalid_argument( - folly::sformat("Invalid PoolId: {} and ClassId: {}.", pid, cid)); + folly::sformat("Invalid TierId: {} and PoolId: {} and ClassId: {}.", tid, pid, cid)); } std::vector content; - auto& mm = *mmContainers_[pid][cid]; - auto evictItr = mm.getEvictionIterator(); - size_t i = 0; - while (evictItr && i < numItems) { - content.push_back(evictItr->toString()); - ++evictItr; - ++i; + while (tid >= 0) { + auto& mm = *mmContainers_[tid][pid][cid]; + mm.withEvictionIterator([&content, numItems](auto&& itr) { + while (itr && content.size() < numItems) { + content.push_back(itr->toString()); + ++itr; + } + }); + --tid; } - return content; } @@ -2026,19 +2644,50 @@ PoolId CacheAllocator::addPool( std::shared_ptr resizeStrategy, bool ensureProvisionable) { folly::SharedMutex::WriteHolder w(poolsResizeAndRebalanceLock_); - auto pid = allocator_->addPool(name, size, allocSizes, ensureProvisionable); + + PoolId pid = 0; + size_t totalCacheSize = 0; + + for (TierId tid = 0; tid < getNumTiers(); tid++) { + totalCacheSize += allocator_[tid]->getMemorySize(); + } + + for (TierId tid = 0; tid < getNumTiers(); tid++) { + auto tierSizeRatio = + static_cast(allocator_[tid]->getMemorySize()) / totalCacheSize; + size_t tierPoolSize = static_cast(tierSizeRatio * size); + + // TODO: what if we manage to add pool only in one tier? 
+ // we should probably remove that on failure + auto res = allocator_[tid]->addPool( + name, tierPoolSize, allocSizes, ensureProvisionable); + XDCHECK(tid == 0 || res == pid); + pid = res; + } + createMMContainers(pid, std::move(config)); setRebalanceStrategy(pid, std::move(rebalanceStrategy)); setResizeStrategy(pid, std::move(resizeStrategy)); + + if (backgroundEvictor_.size()) { + for (size_t id = 0; id < backgroundEvictor_.size(); id++) + backgroundEvictor_[id]->setAssignedMemory(getAssignedMemoryToBgWorker(id, backgroundEvictor_.size(), 0)); + } + + if (backgroundPromoter_.size()) { + for (size_t id = 0; id < backgroundPromoter_.size(); id++) + backgroundPromoter_[id]->setAssignedMemory(getAssignedMemoryToBgWorker(id, backgroundPromoter_.size(), 1)); + } + return pid; } template void CacheAllocator::overridePoolRebalanceStrategy( PoolId pid, std::shared_ptr rebalanceStrategy) { - if (static_cast(pid) >= mmContainers_.size()) { + if (static_cast(pid) >= mmContainers_[0].size()) { throw std::invalid_argument(folly::sformat( - "Invalid PoolId: {}, size of pools: {}", pid, mmContainers_.size())); + "Invalid PoolId: {}, size of pools: {}", pid, mmContainers_[0].size())); } setRebalanceStrategy(pid, std::move(rebalanceStrategy)); } @@ -2046,9 +2695,9 @@ void CacheAllocator::overridePoolRebalanceStrategy( template void CacheAllocator::overridePoolResizeStrategy( PoolId pid, std::shared_ptr resizeStrategy) { - if (static_cast(pid) >= mmContainers_.size()) { + if (static_cast(pid) >= mmContainers_[0].size()) { throw std::invalid_argument(folly::sformat( - "Invalid PoolId: {}, size of pools: {}", pid, mmContainers_.size())); + "Invalid PoolId: {}, size of pools: {}", pid, mmContainers_[0].size())); } setResizeStrategy(pid, std::move(resizeStrategy)); } @@ -2060,14 +2709,14 @@ void CacheAllocator::overridePoolOptimizeStrategy( } template -void CacheAllocator::overridePoolConfig(PoolId pid, +void CacheAllocator::overridePoolConfig(TierId tid, PoolId pid, const MMConfig& config) { - if (static_cast(pid) >= mmContainers_.size()) { + // TODO: add generic tier id checking + if (static_cast(pid) >= mmContainers_[tid].size()) { throw std::invalid_argument(folly::sformat( - "Invalid PoolId: {}, size of pools: {}", pid, mmContainers_.size())); + "Invalid PoolId: {}, size of pools: {}", pid, mmContainers_[tid].size())); } - - auto& pool = allocator_->getPool(pid); + auto& pool = allocator_[tid]->getPool(pid); for (unsigned int cid = 0; cid < pool.getNumClassId(); ++cid) { MMConfig mmConfig = config; mmConfig.addExtraConfig( @@ -2075,29 +2724,35 @@ void CacheAllocator::overridePoolConfig(PoolId pid, ? pool.getAllocationClass(static_cast(cid)) .getAllocsPerSlab() : 0); - DCHECK_NOTNULL(mmContainers_[pid][cid].get()); - mmContainers_[pid][cid]->setConfig(mmConfig); + DCHECK_NOTNULL(mmContainers_[tid][pid][cid].get()); + mmContainers_[tid][pid][cid]->setConfig(mmConfig); } } template void CacheAllocator::createMMContainers(const PoolId pid, MMConfig config) { - auto& pool = allocator_->getPool(pid); + // pools on each layer should have the same number of class id, etc. + // TODO: think about deduplication + auto& pool = allocator_[0]->getPool(pid); + for (unsigned int cid = 0; cid < pool.getNumClassId(); ++cid) { config.addExtraConfig( config_.trackTailHits ? 
pool.getAllocationClass(static_cast(cid)) .getAllocsPerSlab() : 0); - mmContainers_[pid][cid].reset(new MMContainer(config, compressor_)); + for (TierId tid = 0; tid < getNumTiers(); tid++) { + mmContainers_[tid][pid][cid].reset(new MMContainer(config, compressor_)); + } } } template PoolId CacheAllocator::getPoolId( folly::StringPiece name) const noexcept { - return allocator_->getPoolId(name.str()); + // each tier has the same pools + return allocator_[0]->getPoolId(name.str()); } // The Function returns a consolidated vector of Release Slab @@ -2140,7 +2795,9 @@ std::set CacheAllocator::filterCompactCachePools( template std::set CacheAllocator::getRegularPoolIds() const { folly::SharedMutex::ReadHolder r(poolsResizeAndRebalanceLock_); - return filterCompactCachePools(allocator_->getPoolIds()); + // TODO - get rid of the duplication - right now, each tier + // holds pool objects with mostly the same info + return filterCompactCachePools(allocator_[0]->getPoolIds()); } template @@ -2165,10 +2822,9 @@ std::set CacheAllocator::getRegularPoolIdsForResize() // getAdvisedMemorySize - then pools may be overLimit even when // all slabs are not allocated. Otherwise, pools may be overLimit // only after all slabs are allocated. - // - return (allocator_->allSlabsAllocated()) || - (allocator_->getAdvisedMemorySize() != 0) - ? filterCompactCachePools(allocator_->getPoolsOverLimit()) + return (allocator_[currentTier()]->allSlabsAllocated()) || + (allocator_[currentTier()]->getAdvisedMemorySize() != 0) + ? filterCompactCachePools(allocator_[currentTier()]->getPoolsOverLimit()) : std::set{}; } @@ -2177,9 +2833,19 @@ const std::string CacheAllocator::getCacheName() const { return config_.cacheName; } +template +size_t CacheAllocator::getPoolSize(PoolId poolId) const { + size_t poolSize = 0; + for (auto& allocator: allocator_) { + const auto& pool = allocator->getPool(poolId); + poolSize += pool.getPoolSize(); + } + return poolSize; +} + template PoolStats CacheAllocator::getPoolStats(PoolId poolId) const { - const auto& pool = allocator_->getPool(poolId); + const auto& pool = allocator_[currentTier()]->getPool(poolId); const auto& allocSizes = pool.getAllocSizes(); auto mpStats = pool.getStats(); const auto& classIds = mpStats.classIds; @@ -2198,7 +2864,7 @@ PoolStats CacheAllocator::getPoolStats(PoolId poolId) const { if (!isCompactCache) { for (const ClassId cid : classIds) { uint64_t classHits = (*stats_.cacheHits)[poolId][cid].get(); - XDCHECK(mmContainers_[poolId][cid], + XDCHECK(mmContainers_[currentTier()][poolId][cid], folly::sformat("Pid {}, Cid {} not initialized.", poolId, cid)); cacheStats.insert( {cid, @@ -2208,16 +2874,14 @@ PoolStats CacheAllocator::getPoolStats(PoolId poolId) const { (*stats_.fragmentationSize)[poolId][cid].get(), classHits, (*stats_.chainedItemEvictions)[poolId][cid].get(), (*stats_.regularItemEvictions)[poolId][cid].get(), - mmContainers_[poolId][cid]->getStats()} - - }); + getMMContainerStat(currentTier(), poolId, cid)}}); totalHits += classHits; } } PoolStats ret; ret.isCompactCache = isCompactCache; - ret.poolName = allocator_->getPoolName(poolId); + ret.poolName = allocator_[currentTier()]->getPoolName(poolId); ret.poolSize = pool.getPoolSize(); ret.poolUsableSize = pool.getPoolUsableSize(); ret.poolAdvisedSize = pool.getPoolAdvisedSize(); @@ -2229,29 +2893,39 @@ PoolStats CacheAllocator::getPoolStats(PoolId poolId) const { return ret; } +template +ACStats CacheAllocator::getACStats(TierId tid, + PoolId poolId, + ClassId classId) const { + const auto& pool = 
allocator_[tid]->getPool(poolId); + const auto& ac = pool.getAllocationClass(classId); + + auto stats = ac.getStats(); + stats.allocLatencyNs = (*stats_.classAllocLatency)[tid][poolId][classId]; + return stats; +} + template PoolEvictionAgeStats CacheAllocator::getPoolEvictionAgeStats( PoolId pid, unsigned int slabProjectionLength) const { PoolEvictionAgeStats stats; - - const auto& pool = allocator_->getPool(pid); + const auto& pool = allocator_[currentTier()]->getPool(pid); const auto& allocSizes = pool.getAllocSizes(); for (ClassId cid = 0; cid < static_cast(allocSizes.size()); ++cid) { - auto& mmContainer = getMMContainer(pid, cid); + auto& mmContainer = getMMContainer(currentTier(), pid, cid); const auto numItemsPerSlab = - allocator_->getPool(pid).getAllocationClass(cid).getAllocsPerSlab(); + allocator_[currentTier()]->getPool(pid).getAllocationClass(cid).getAllocsPerSlab(); const auto projectionLength = numItemsPerSlab * slabProjectionLength; stats.classEvictionAgeStats[cid] = mmContainer.getEvictionAgeStat(projectionLength); } - return stats; } template CacheMetadata CacheAllocator::getCacheMetadata() const noexcept { return CacheMetadata{kCachelibVersion, kCacheRamFormatVersion, - kCacheNvmFormatVersion, config_.size}; + kCacheNvmFormatVersion, config_.getCacheSize()}; } template @@ -2283,7 +2957,7 @@ void CacheAllocator::releaseSlab(PoolId pid, } try { - auto releaseContext = allocator_->startSlabRelease( + auto releaseContext = allocator_[currentTier()]->startSlabRelease( pid, victim, receiver, mode, hint, [this]() -> bool { return shutDownInProgress_; }); @@ -2292,15 +2966,15 @@ void CacheAllocator::releaseSlab(PoolId pid, return; } - releaseSlabImpl(releaseContext); - if (!allocator_->allAllocsFreed(releaseContext)) { + releaseSlabImpl(currentTier(), releaseContext); + if (!allocator_[currentTier()]->allAllocsFreed(releaseContext)) { throw std::runtime_error( folly::sformat("Was not able to free all allocs. PoolId: {}, AC: {}", releaseContext.getPoolId(), releaseContext.getClassId())); } - allocator_->completeSlabRelease(releaseContext); + allocator_[currentTier()]->completeSlabRelease(releaseContext); } catch (const exception::SlabReleaseAborted& e) { stats_.numAbortedSlabReleases.inc(); throw exception::SlabReleaseAborted(folly::sformat( @@ -2311,8 +2985,7 @@ void CacheAllocator::releaseSlab(PoolId pid, } template -SlabReleaseStats CacheAllocator::getSlabReleaseStats() - const noexcept { +SlabReleaseStats CacheAllocator::getSlabReleaseStats() const noexcept { std::lock_guard l(workersMutex_); return SlabReleaseStats{stats_.numActiveSlabReleases.get(), stats_.numReleasedForRebalance.get(), @@ -2330,7 +3003,7 @@ SlabReleaseStats CacheAllocator::getSlabReleaseStats() } template -void CacheAllocator::releaseSlabImpl( +void CacheAllocator::releaseSlabImpl(TierId tid, const SlabReleaseContext& releaseContext) { auto startTime = std::chrono::milliseconds(util::getCurrentTimeMs()); bool releaseStuck = false; @@ -2358,10 +3031,11 @@ void CacheAllocator::releaseSlabImpl( // 3. If 2 is successful, Move or Evict // 4. 
Move on to the next item if current item is freed for (auto alloc : releaseContext.getActiveAllocations()) { + auto startTimeSec = util::getCurrentTimeSec(); // Need to mark an item for release before proceeding // If we can't mark as moving, it means the item is already freed const bool isAlreadyFreed = - !markExclusiveForSlabRelease(releaseContext, alloc, throttler); + !markMovingForSlabRelease(releaseContext, alloc, throttler); if (isAlreadyFreed) { continue; } @@ -2375,7 +3049,7 @@ void CacheAllocator::releaseSlabImpl( if (!isMoved) { evictForSlabRelease(releaseContext, item, throttler); } - XDCHECK(allocator_->isAllocFreed(releaseContext, alloc)); + XDCHECK(allocator_[tid]->isAllocFreed(releaseContext, alloc)); } } @@ -2389,6 +3063,14 @@ void CacheAllocator::throttleWith(util::Throttler& t, } } +template +typename RefcountWithFlags::Value CacheAllocator::unmarkMovingAndWakeUpWaiters(Item &item, WriteHandle handle) +{ + auto ret = item.unmarkMoving(); + wakeUpWaiters(item, std::move(handle)); + return ret; +} + template bool CacheAllocator::moveForSlabRelease( const SlabReleaseContext& ctx, Item& oldItem, util::Throttler& throttler) { @@ -2398,7 +3080,7 @@ bool CacheAllocator::moveForSlabRelease( bool isMoved = false; auto startTime = util::getCurrentTimeSec(); - WriteHandle newItemHdl = allocateNewItemForOldItem(oldItem); + WriteHandle newItemHdl{}; for (unsigned int itemMovingAttempts = 0; itemMovingAttempts < config_.movingTries; @@ -2406,16 +3088,24 @@ bool CacheAllocator::moveForSlabRelease( stats_.numMoveAttempts.inc(); // Nothing to move and the key is likely also bogus for chained items. - if (oldItem.isOnlyExclusive()) { - oldItem.unmarkExclusive(); + if (oldItem.isOnlyMoving()) { + auto ret = unmarkMovingAndWakeUpWaiters(oldItem, {}); + XDCHECK(ret == 0); const auto res = releaseBackToAllocator(oldItem, RemoveContext::kNormal, false); XDCHECK(res == ReleaseRes::kReleased); return true; } + throttleWith(throttler, [&] { + XLOGF(WARN, + "Spent {} seconds, slab release still trying to move Item: {}. " + "Pool: {}, Class: {}.", + util::getCurrentTimeSec() - startTime, oldItem.toString(), + ctx.getPoolId(), ctx.getClassId()); + }); + if (!newItemHdl) { - // try to allocate again if it previously wasn't successful newItemHdl = allocateNewItemForOldItem(oldItem); } @@ -2426,14 +3116,6 @@ bool CacheAllocator::moveForSlabRelease( break; } } - - throttleWith(throttler, [&] { - XLOGF(WARN, - "Spent {} seconds, slab release still trying to move Item: {}. " - "Pool: {}, Class: {}.", - util::getCurrentTimeSec() - startTime, oldItem.toString(), - ctx.getPoolId(), ctx.getClassId()); - }); } // Return false if we've exhausted moving tries. @@ -2446,7 +3128,7 @@ bool CacheAllocator::moveForSlabRelease( // that's identical to this one to replace it. Here we just need to wait // until all users have dropped the item handles before we can proceed. 
startTime = util::getCurrentTimeSec(); - while (!oldItem.isOnlyExclusive()) { + while (!oldItem.isOnlyMoving()) { throttleWith(throttler, [&] { XLOGF(WARN, "Spent {} seconds, slab release still waiting for refcount to " @@ -2455,8 +3137,12 @@ bool CacheAllocator::moveForSlabRelease( ctx.getPoolId(), ctx.getClassId()); }); } - const auto allocInfo = allocator_->getAllocInfo(oldItem.getMemory()); - allocator_->free(&oldItem); + auto tid = getTierId(oldItem); + auto ref = unmarkMovingAndWakeUpWaiters(oldItem, std::move(newItemHdl)); + XDCHECK(ref == 0); + + const auto allocInfo = allocator_[tid]->getAllocInfo(oldItem.getMemory()); + allocator_[tid]->free(&oldItem); (*stats_.fragmentationSize)[allocInfo.poolId][allocInfo.classId].sub( util::getFragmentation(*this, oldItem)); @@ -2465,10 +3151,10 @@ bool CacheAllocator::moveForSlabRelease( } template -typename CacheAllocator::ReadHandle +typename CacheAllocator::WriteHandle CacheAllocator::validateAndGetParentHandleForChainedMoveLocked( const ChainedItem& item, const Key& parentKey) { - ReadHandle parentHandle{}; + WriteHandle parentHandle{}; try { parentHandle = findInternal(parentKey); // If the parent is not the same as the parent of the chained item, @@ -2487,6 +3173,7 @@ CacheAllocator::validateAndGetParentHandleForChainedMoveLocked( template typename CacheAllocator::WriteHandle CacheAllocator::allocateNewItemForOldItem(const Item& oldItem) { + XDCHECK(oldItem.isMoving()); if (oldItem.isChainedItem()) { const auto& oldChainedItem = oldItem.asChainedItem(); const auto parentKey = oldChainedItem.getParentItem(compressor_).getKey(); @@ -2500,33 +3187,35 @@ CacheAllocator::allocateNewItemForOldItem(const Item& oldItem) { return {}; } - // Set up the destination for the move. Since oldChainedItem would have - // the exclusive bit set, it won't be picked for eviction. + // Set up the destination for the move. Since oldChainedItem would + // be marked as moving, it won't be picked for eviction. auto newItemHdl = - allocateChainedItemInternal(parentHandle, oldChainedItem.getSize()); + allocateChainedItemInternal(parentHandle, oldItem.getSize()); if (!newItemHdl) { return {}; } - XDCHECK_EQ(newItemHdl->getSize(), oldChainedItem.getSize()); + XDCHECK_EQ(newItemHdl->getSize(), oldItem.getSize()); auto parentPtr = parentHandle.getInternal(); XDCHECK_EQ(reinterpret_cast(parentPtr), reinterpret_cast( - &oldChainedItem.getParentItem(compressor_))); + &newItemHdl->asChainedItem().getParentItem(compressor_))); return newItemHdl; } const auto allocInfo = - allocator_->getAllocInfo(static_cast(&oldItem)); + allocator_[getTierId(oldItem)]->getAllocInfo(static_cast(&oldItem)); // Set up the destination for the move. Since oldItem would have the moving // bit set, it won't be picked for eviction. - auto newItemHdl = allocateInternal(allocInfo.poolId, - oldItem.getKey(), - oldItem.getSize(), - oldItem.getCreationTime(), - oldItem.getExpiryTime()); + auto newItemHdl = allocateInternalTier(getTierId(oldItem), + allocInfo.poolId, + oldItem.getKey(), + oldItem.getSize(), + oldItem.getCreationTime(), + oldItem.getExpiryTime(), + false); if (!newItemHdl) { return {}; } @@ -2545,7 +3234,8 @@ bool CacheAllocator::tryMovingForSlabRelease( // a regular item or chained item is synchronized with any potential // user-side mutation. 
std::unique_ptr syncObj;
- if (config_.movingSync) {
+ if (config_.movingSync && getNumTiers() == 1) {
+ // TODO: use moving-bit synchronization for single tier as well
 if (!oldItem.isChainedItem()) {
 syncObj = config_.movingSync(oldItem.getKey());
 } else {
 // item is still valid.
 const std::string parentKey =
 oldItem.asChainedItem().getParentItem(compressor_).getKey().str();
- if (oldItem.isOnlyExclusive()) {
+ if (oldItem.isOnlyMoving()) {
 // If chained item no longer has a refcount, its parent is already
 // being released, so we abort this try to moving.
 return false;
 }
@@ -2570,67 +3260,59 @@ bool CacheAllocator::tryMovingForSlabRelease(
 }
 }
 
- return oldItem.isChainedItem()
- ? moveChainedItem(oldItem.asChainedItem(), newItemHdl)
- : moveRegularItem(oldItem, newItemHdl);
+ // TODO: we can unify move*Item and move*ItemWithSync by always
+ // using the moving bit to block readers.
+ if (getNumTiers() == 1) {
+ return oldItem.isChainedItem()
+ ? moveChainedItem(oldItem.asChainedItem(), newItemHdl)
+ : moveRegularItem(oldItem, newItemHdl);
+ } else {
+ if (oldItem.isChainedItem() || oldItem.hasChainedItem()) {
+ // TODO: add support for chained items
+ return false;
+ } else {
+ // the move can fail if another thread calls insertOrReplace;
+ // in that case oldItem is no longer valid (not accessible,
+ // it gets removed from the MMContainer) and evictForSlabRelease
+ // will send it back to the allocator
+ bool ret = moveRegularItemWithSync(oldItem, newItemHdl);
+ if (!ret) {
+ // we failed to move - newItemHdl was released back to the
+ // allocator by moveRegularItemWithSync, but oldItem is not
+ // accessible and no longer valid - we need to clean it up here
+ XDCHECK(!oldItem.isAccessible());
+ oldItem.markForEvictionWhenMoving();
+ unlinkItemForEviction(oldItem);
+ wakeUpWaiters(oldItem, {});
+ } else {
+ removeFromMMContainer(oldItem);
+ }
+ return ret;
+ }
+ }
+}
+
+template
+void CacheAllocator::wakeUpWaiters(Item& item, WriteHandle handle)
+{
+ // readers do not block on 'moving' items in case there is only one tier
+ if (getNumTiers() > 1) {
+ wakeUpWaitersLocked(item.getKey(), std::move(handle));
+ }
}

 template
 void CacheAllocator::evictForSlabRelease(
 const SlabReleaseContext& ctx, Item& item, util::Throttler& throttler) {
 auto startTime = util::getCurrentTimeSec();
+ while (true) {
+ XDCHECK(item.isMoving());
 stats_.numEvictionAttempts.inc();
- // if the item is already in a state where only the exclusive bit is set,
- // nothing needs to be done. We simply need to unmark exclusive bit and free
- // the item.
- if (item.isOnlyExclusive()) {
- item.unmarkExclusive();
- const auto res =
- releaseBackToAllocator(item, RemoveContext::kNormal, false);
- XDCHECK(ReleaseRes::kReleased == res);
- return;
- }
-
- // Since we couldn't move, we now evict this item. Owning handle will be
- // the item's handle for regular/normal items and will be the parent
- // handle for chained items.
- auto owningHandle =
- item.isChainedItem()
- ? evictChainedItemForSlabRelease(item.asChainedItem())
- : evictNormalItemForSlabRelease(item);
-
- // we managed to evict the corresponding owner of the item and have the
- // last handle for the owner. 
- if (owningHandle) { - const auto allocInfo = - allocator_->getAllocInfo(static_cast(&item)); - if (owningHandle->hasChainedItem()) { - (*stats_.chainedItemEvictions)[allocInfo.poolId][allocInfo.classId] - .inc(); - } else { - (*stats_.regularItemEvictions)[allocInfo.poolId][allocInfo.classId] - .inc(); - } - - stats_.numEvictionSuccesses.inc(); - - // we have the last handle. no longer need to hold on to the exclusive bit - item.unmarkExclusive(); - - // manually decrement the refcount to call releaseBackToAllocator - const auto ref = decRef(*owningHandle); - XDCHECK(ref == 0); - const auto res = releaseBackToAllocator(*owningHandle.release(), - RemoveContext::kEviction, false); - XDCHECK(res == ReleaseRes::kReleased); - return; - } - if (shutDownInProgress_) { - item.unmarkExclusive(); - allocator_->abortSlabRelease(ctx); + auto ref = unmarkMovingAndWakeUpWaiters(item, {}); + allocator_[getTierId(item)]->abortSlabRelease(ctx); throw exception::SlabReleaseAborted( folly::sformat("Slab Release aborted while trying to evict" " Item: {} Pool: {}, Class: {}.", @@ -2649,268 +3331,141 @@ void CacheAllocator::evictForSlabRelease( .toString()) : ""); }); - } -} - -template -typename CacheAllocator::WriteHandle -CacheAllocator::advanceIteratorAndTryEvictRegularItem( - MMContainer& mmContainer, EvictionIterator& itr) { - // we should flush this to nvmcache if it is not already present in nvmcache - // and the item is not expired. - Item& item = *itr; - const bool evictToNvmCache = shouldWriteToNvmCache(item); - - auto token = evictToNvmCache ? nvmCache_->createPutToken(item.getKey()) - : typename NvmCacheT::PutToken{}; - - // record the in-flight eviciton. If not, we move on to next item to avoid - // stalling eviction. - if (evictToNvmCache && !token.isValid()) { - ++itr; - stats_.evictFailConcurrentFill.inc(); - return WriteHandle{}; - } - - // If there are other accessors, we should abort. Acquire a handle here since - // if we remove the item from both access containers and mm containers - // below, we will need a handle to ensure proper cleanup in case we end up - // not evicting this item - auto evictHandle = accessContainer_->removeIf(item, &itemExclusivePredicate); - if (!evictHandle) { - ++itr; - stats_.evictFailAC.inc(); - return evictHandle; - } - - mmContainer.remove(itr); - XDCHECK_EQ(reinterpret_cast(evictHandle.get()), - reinterpret_cast(&item)); - XDCHECK(!evictHandle->isInMMContainer()); - XDCHECK(!evictHandle->isAccessible()); - - // Invalidate iterator since later on if we are not evicting this - // item, we may need to rely on the handle we created above to ensure - // proper cleanup if the item's raw refcount has dropped to 0. - // And since this item may be a parent item that has some child items - // in this very same mmContainer, we need to make sure we drop this - // exclusive iterator so we can gain access to it when we're cleaning - // up the child items - itr.destroy(); - - // Ensure that there are no accessors after removing from the access - // container - XDCHECK(evictHandle->getRefCount() == 1); - - if (evictToNvmCache && shouldWriteToNvmCacheExclusive(item)) { - XDCHECK(token.isValid()); - nvmCache_->put(evictHandle, std::move(token)); - } - return evictHandle; -} - -template -typename CacheAllocator::WriteHandle -CacheAllocator::advanceIteratorAndTryEvictChainedItem( - EvictionIterator& itr) { - XDCHECK(itr->isChainedItem()); - - ChainedItem* candidate = &itr->asChainedItem(); - ++itr; - - // The parent could change at any point through transferChain. 
However, if
- // that happens, we would realize that the releaseBackToAllocator return
- // kNotRecycled and we would try another chained item, leading to transient
- // failure.
- auto& parent = candidate->getParentItem(compressor_);
+ // if the item is already in a state where only the moving bit is set,
+ // nothing needs to be done. We simply need to call unmarkMoving and free
+ // the item.
+ if (item.isOnlyMoving()) {
+ auto ref = unmarkMovingAndWakeUpWaiters(item, {});
+ XDCHECK(ref == 0);
+ const auto res =
+ releaseBackToAllocator(item, RemoveContext::kNormal, false);
+ XDCHECK(ReleaseRes::kReleased == res);
+ return;
+ }
- const bool evictToNvmCache = shouldWriteToNvmCache(parent);
+ typename NvmCacheT::PutToken token;
+ Item* evicted;
+ if (item.isChainedItem()) {
+ auto& expectedParent = item.asChainedItem().getParentItem(compressor_);
+
+ if (getNumTiers() == 1) {
+ // TODO: unify this with multi-tier implementation
+ // right now, taking a chained item lock here would lead to deadlock
+ const std::string parentKey = expectedParent.getKey().str();
+ auto l = chainedItemLocks_.lockExclusive(parentKey);
+
+ // check if the child is still in mmContainer and the expected parent is
+ // valid under the chained item lock.
+ if (expectedParent.getKey() != parentKey || !item.isInMMContainer() ||
+ item.isOnlyMoving() ||
+ &expectedParent != &item.asChainedItem().getParentItem(compressor_) ||
+ !expectedParent.isAccessible() || !expectedParent.hasChainedItem()) {
+ continue;
+ }
- auto token = evictToNvmCache ? nvmCache_->createPutToken(parent.getKey())
- : typename NvmCacheT::PutToken{};
+ // search if the child is present in the chain
+ {
+ auto parentHandle = findInternal(parentKey);
+ if (!parentHandle || parentHandle != &expectedParent) {
+ continue;
+ }
+
+ ChainedItem* head = nullptr;
+ { // scope for the handle
+ auto headHandle = findChainedItem(expectedParent);
+ head = headHandle ? &headHandle->asChainedItem() : nullptr;
+ }
+
+ bool found = false;
+ while (head) {
+ if (head == &item) {
+ found = true;
+ break;
+ }
+ head = head->getNext(compressor_);
+ }
+
+ if (!found) {
+ continue;
+ }
+ }
+ }
- // if token is invalid, return. iterator is already advanced.
- stats_.evictFailConcurrentFill.inc();
- return WriteHandle{};
- }
- // check if the parent exists in the hashtable and refcount is drained. 
- auto parentHandle = - accessContainer_->removeIf(parent, &itemExclusivePredicate); - if (!parentHandle) { - stats_.evictFailParentAC.inc(); - return parentHandle; - } + token = createPutToken(*evicted); + if (evicted->markForEvictionWhenMoving()) { + unlinkItemForEviction(*evicted); + wakeUpWaiters(*evicted, {}); + } else { + continue; + } + } - // Invalidate iterator since later on we may use the mmContainer - // associated with this iterator which cannot be done unless we - // drop this iterator - // - // This must be done once we know the parent is not nullptr. - // Since we can very well be the last holder of this parent item, - // which may have a chained item that is linked in this MM container. - itr.destroy(); + if (token.isValid() && shouldWriteToNvmCacheExclusive(*evicted)) { + nvmCache_->put(*evicted, std::move(token)); + } - // Ensure we have the correct parent and we're the only user of the - // parent, then free it from access container. Otherwise, we abort - XDCHECK_EQ(reinterpret_cast(&parent), - reinterpret_cast(parentHandle.get())); - XDCHECK_EQ(1u, parent.getRefCount()); + const auto allocInfo = + allocator_[getTierId(*evicted)]->getAllocInfo(static_cast(evicted)); + if (evicted->hasChainedItem()) { + (*stats_.chainedItemEvictions)[allocInfo.poolId][allocInfo.classId].inc(); + } else { + (*stats_.regularItemEvictions)[allocInfo.poolId][allocInfo.classId].inc(); + } - removeFromMMContainer(*parentHandle); + stats_.numEvictionSuccesses.inc(); - XDCHECK(!parent.isInMMContainer()); - XDCHECK(!parent.isAccessible()); + XDCHECK(evicted->getRefCount() == 0); + const auto res = + releaseBackToAllocator(*evicted, RemoveContext::kEviction, false); - if (evictToNvmCache && shouldWriteToNvmCacheExclusive(*parentHandle)) { - XDCHECK(token.isValid()); - XDCHECK(parentHandle->hasChainedItem()); - nvmCache_->put(parentHandle, std::move(token)); + if (getNumTiers() == 1) { + XDCHECK(res == ReleaseRes::kReleased); + } else { + const bool isAlreadyFreed = + !markMovingForSlabRelease(ctx, &item, throttler); + if (!isAlreadyFreed) { + continue; + } + } + + return; } - - return parentHandle; } template +template typename CacheAllocator::WriteHandle -CacheAllocator::evictNormalItemForSlabRelease(Item& item) { - XDCHECK(item.isExclusive()); - - if (item.isOnlyExclusive()) { - return WriteHandle{}; - } - - auto predicate = [](const Item& it) { return it.getRefCount() == 0; }; - - const bool evictToNvmCache = shouldWriteToNvmCache(item); - auto token = evictToNvmCache ? nvmCache_->createPutToken(item.getKey()) - : typename NvmCacheT::PutToken{}; - - // We remove the item from both access and mm containers. It doesn't matter - // if someone else calls remove on the item at this moment, the item cannot - // be freed as long as we have the exclusive bit set. - auto handle = accessContainer_->removeIf(item, std::move(predicate)); +CacheAllocator::removeIf(Item& item, Fn&& predicate) { + auto handle = accessContainer_->removeIf(item, std::forward(predicate)); - if (!handle) { - return handle; - } - - XDCHECK_EQ(reinterpret_cast(handle.get()), + if (handle) { + XDCHECK_EQ(reinterpret_cast(handle.get()), reinterpret_cast(&item)); - XDCHECK_EQ(1u, handle->getRefCount()); - removeFromMMContainer(item); - - // now that we are the only handle and we actually removed something from - // the RAM cache, we enqueue it to nvmcache. 
- if (evictToNvmCache && shouldWriteToNvmCacheExclusive(item)) { - nvmCache_->put(handle, std::move(token)); + removeFromMMContainer(item); } return handle; } -template -typename CacheAllocator::WriteHandle -CacheAllocator::evictChainedItemForSlabRelease(ChainedItem& child) { - XDCHECK(child.isExclusive()); - - // We have the child marked as moving, but dont know anything about the - // state of the parent. Unlike the case of regular eviction where we are - // sure that the child is inside the MMContainer, ensuring its parent is - // valid, we can not make any assumptions here. We try to find the parent - // first through the access container and then verify that the parent's - // chain points to the child before cleaning up the parent. If the parent - // was in the process of being re-allocated or child was being removed - // concurrently, we would synchronize here on one of the checks. - Item& expectedParent = child.getParentItem(compressor_); - - // Grab exclusive lock since we are modifying the chain. at this point, we - // dont know the state of the parent. so we need to do some validity checks - // after we have the chained item lock to ensure that we got the lock off of - // a valid state. - const std::string parentKey = expectedParent.getKey().str(); - auto l = chainedItemLocks_.lockExclusive(parentKey); - - // check if the child is still in mmContainer and the expected parent is - // valid under the chained item lock. - if (expectedParent.getKey() != parentKey || !child.isInMMContainer() || - child.isOnlyExclusive() || - &expectedParent != &child.getParentItem(compressor_) || - !expectedParent.isAccessible() || !expectedParent.hasChainedItem()) { - return {}; - } - - // search if the child is present in the chain - auto parentHandle = findInternal(parentKey); - if (!parentHandle || parentHandle != &expectedParent) { - return {}; - } - - ChainedItem* head = nullptr; - { // scope for the handle - auto headHandle = findChainedItem(expectedParent); - head = headHandle ? &headHandle->asChainedItem() : nullptr; - } - - bool found = false; - while (head) { - if (head == &child) { - found = true; - break; - } - head = head->getNext(compressor_); - } - - if (!found) { - return {}; - } - - // if we found the child in the parent's chain, we remove it and ensure that - // the handle we obtained was the last one. Before that, create a put token - // to guard any racing cache find to avoid item re-appearing in NvmCache. - const bool evictToNvmCache = shouldWriteToNvmCache(expectedParent); - - auto token = evictToNvmCache - ? nvmCache_->createPutToken(expectedParent.getKey()) - : typename NvmCacheT::PutToken{}; - - if (!accessContainer_->removeIf(expectedParent, - parentEvictForSlabReleasePredicate)) { - return {}; - } - - // at this point, we should be the last handle owner - XDCHECK_EQ(1u, parentHandle->getRefCount()); - - // We remove the parent from both access and mm containers. It doesn't - // matter if someone else calls remove on the parent at this moment, it - // cannot be freed since we hold an active item handle - removeFromMMContainer(*parentHandle); - - // In case someone else had removed this chained item from its parent by now - // So we check again to see if it has been unlinked from its parent - if (!child.isInMMContainer() || child.isOnlyExclusive()) { - return {}; - } - - // check after removing from the MMContainer that the parent is still not - // being marked as moving. If parent is moving, it will release the child - // item and we will wait for that. 
- if (parentHandle->isExclusive()) { - return {}; - } - - // now that we are the only handle and we actually removed something from - // the RAM cache, we enqueue it to nvmcache. - if (evictToNvmCache && shouldWriteToNvmCacheExclusive(*parentHandle)) { - DCHECK(parentHandle->hasChainedItem()); - nvmCache_->put(parentHandle, std::move(token)); - } - - return parentHandle; -} - template bool CacheAllocator::removeIfExpired(const ReadHandle& handle) { if (!handle) { @@ -2919,18 +3474,11 @@ bool CacheAllocator::removeIfExpired(const ReadHandle& handle) { // We remove the item from both access and mm containers. // We want to make sure the caller is the only one holding the handle. - auto removedHandle = - accessContainer_->removeIf(*(handle.getInternal()), itemExpiryPredicate); - if (removedHandle) { - removeFromMMContainer(*(handle.getInternal())); - return true; - } - - return false; + return (bool)removeIf(*(handle.getInternal()), itemExpiryPredicate); } template -bool CacheAllocator::markExclusiveForSlabRelease( +bool CacheAllocator::markMovingForSlabRelease( const SlabReleaseContext& ctx, void* alloc, util::Throttler& throttler) { // MemoryAllocator::processAllocForRelease will execute the callback // if the item is not already free. So there are three outcomes here: @@ -2945,18 +3493,23 @@ bool CacheAllocator::markExclusiveForSlabRelease( // At first, we assume this item was already freed bool itemFreed = true; bool markedMoving = false; - const auto fn = [&markedMoving, &itemFreed](void* memory) { + TierId tid = getTierId(alloc); + auto numTiers = getNumTiers(); + const auto fn = [&markedMoving, &itemFreed, numTiers](void* memory) { // Since this callback is executed, the item is not yet freed itemFreed = false; Item* item = static_cast(memory); - if (item->markExclusive()) { + // TODO: for chained items, moving bit is only used to avoid + // freeing the item prematurely + auto failIfRefNotZero = numTiers > 1 && !item->isChainedItem(); + if (item->markMoving(failIfRefNotZero)) { markedMoving = true; } }; auto startTime = util::getCurrentTimeSec(); while (true) { - allocator_->processAllocForRelease(ctx, alloc, fn); + allocator_[tid]->processAllocForRelease(ctx, alloc, fn); // If item is already freed we give up trying to mark the item moving // and return false, otherwise if marked as moving, we return true. @@ -2971,7 +3524,7 @@ bool CacheAllocator::markExclusiveForSlabRelease( itemFreed = true; if (shutDownInProgress_) { - allocator_->abortSlabRelease(ctx); + allocator_[tid]->abortSlabRelease(ctx); throw exception::SlabReleaseAborted( folly::sformat("Slab Release aborted while still trying to mark" " as moving for Item: {}. Pool: {}, Class: {}.", @@ -2994,12 +3547,15 @@ template CCacheT* CacheAllocator::addCompactCache(folly::StringPiece name, size_t size, Args&&... 
args) { + if (getNumTiers() != 1) + throw std::runtime_error("TODO: compact cache for multi-tier Cache not supported."); + if (!config_.isCompactCacheEnabled()) { throw std::logic_error("Compact cache is not enabled"); } folly::SharedMutex::WriteHolder lock(compactCachePoolsLock_); - auto poolId = allocator_->addPool(name, size, {Slab::kSize}); + auto poolId = allocator_[0]->addPool(name, size, {Slab::kSize}); isCompactCachePool_[poolId] = true; auto ptr = std::make_unique( @@ -3108,12 +3664,15 @@ folly::IOBufQueue CacheAllocator::saveStateToIOBuf() { *metadata_.numChainedChildItems() = stats_.numChainedChildItems.get(); *metadata_.numAbortedSlabReleases() = stats_.numAbortedSlabReleases.get(); + // TODO: implement serialization for multiple tiers auto serializeMMContainers = [](MMContainers& mmContainers) { MMSerializationTypeContainer state; - for (unsigned int i = 0; i < mmContainers.size(); ++i) { + for (unsigned int i = 0; i < 1 /* TODO: */ ; ++i) { for (unsigned int j = 0; j < mmContainers[i].size(); ++j) { - if (mmContainers[i][j]) { - state.pools_ref()[i][j] = mmContainers[i][j]->saveState(); + for (unsigned int k = 0; k < mmContainers[i][j].size(); ++k) { + if (mmContainers[i][j][k]) { + state.pools_ref()[j][k] = mmContainers[i][j][k]->saveState(); + } } } } @@ -3123,7 +3682,8 @@ folly::IOBufQueue CacheAllocator::saveStateToIOBuf() { serializeMMContainers(mmContainers_); AccessSerializationType accessContainerState = accessContainer_->saveState(); - MemoryAllocator::SerializationType allocatorState = allocator_->saveState(); + // TODO: foreach allocator + MemoryAllocator::SerializationType allocatorState = allocator_[0]->saveState(); CCacheManager::SerializationType ccState = compactCacheManager_->saveState(); AccessSerializationType chainedItemAccessContainerState = @@ -3148,6 +3708,8 @@ bool CacheAllocator::stopWorkers(std::chrono::seconds timeout) { success &= stopPoolResizer(timeout); success &= stopMemMonitor(timeout); success &= stopReaper(timeout); + success &= stopBackgroundEvictor(timeout); + success &= stopBackgroundPromoter(timeout); return success; } @@ -3185,6 +3747,8 @@ CacheAllocator::shutDown() { (shmShutDownStatus == ShmShutDownRes::kSuccess); shmManager_.reset(); + // TODO: save per-tier state + if (shmShutDownSucceeded) { if (!nvmShutDownStatusOpt || *nvmShutDownStatusOpt) return ShutDownStatus::kSuccess; @@ -3248,22 +3812,26 @@ CacheAllocator::deserializeMMContainers( const auto container = deserializer.deserialize(); - MMContainers mmContainers; + /* TODO: right now, we create empty containers because deserialization + * only works for a single (topmost) tier. */ + MMContainers mmContainers{getNumTiers()}; for (auto& kvPool : *container.pools_ref()) { auto i = static_cast(kvPool.first); auto& pool = getPool(i); for (auto& kv : kvPool.second) { auto j = static_cast(kv.first); - MMContainerPtr ptr = - std::make_unique(kv.second, - compressor); - auto config = ptr->getConfig(); - config.addExtraConfig(config_.trackTailHits - ? pool.getAllocationClass(j).getAllocsPerSlab() - : 0); - ptr->setConfig(config); - mmContainers[i][j] = std::move(ptr); + for (TierId tid = 0; tid < getNumTiers(); tid++) { + MMContainerPtr ptr = + std::make_unique(kv.second, + compressor); + auto config = ptr->getConfig(); + config.addExtraConfig(config_.trackTailHits + ? pool.getAllocationClass(j).getAllocsPerSlab() + : 0); + ptr->setConfig(config); + mmContainers[tid][i][j] = std::move(ptr); + } } } // We need to drop the unevictableMMContainer in the desierializer. 
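The save/restore path above still walks only the first tier (`i < 1 /* TODO: */`). A sketch of what the full multi-tier serialization loop might look like once the TODO is addressed; `tiers_ref` is a hypothetical field that does not exist in the current Thrift schema:

```cpp
// Hypothetical extension, assuming the serialized state grows a tier
// dimension (state.tiers_ref() is illustrative, not part of this diff).
for (unsigned int tid = 0; tid < mmContainers.size(); ++tid) {
  for (unsigned int pid = 0; pid < mmContainers[tid].size(); ++pid) {
    for (unsigned int cid = 0; cid < mmContainers[tid][pid].size(); ++cid) {
      if (mmContainers[tid][pid][cid]) {
        state.tiers_ref()[tid][pid][cid] =
            mmContainers[tid][pid][cid]->saveState();
      }
    }
  }
}
```
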
@@ -3403,6 +3971,8 @@ GlobalCacheStats CacheAllocator::getGlobalCacheStats() const {
 ret.nvmUpTime = currTime - nvmCacheState_.getCreationTime();
 ret.nvmCacheEnabled = nvmCache_ ? nvmCache_->isEnabled() : false;
 ret.reaperStats = getReaperStats();
+ ret.evictionStats = getBackgroundMoverStats(MoverDir::Evict);
+ ret.promotionStats = getBackgroundMoverStats(MoverDir::Promote);
 ret.numActiveHandles = getNumActiveHandles();
 ret.isNewRamCache = cacheCreationTime_ == cacheInstanceCreationTime_;
@@ -3414,11 +3984,14 @@ GlobalCacheStats CacheAllocator::getGlobalCacheStats() const {
 
 template
 CacheMemoryStats CacheAllocator::getCacheMemoryStats() const {
- const auto totalCacheSize = allocator_->getMemorySize();
- const auto configuredTotalCacheSize = allocator_->getMemorySizeInclAdvised();
-
+ size_t totalCacheSize = 0;
+ size_t configuredTotalCacheSize = 0;
+ for(auto& allocator: allocator_) {
+ totalCacheSize += allocator->getMemorySize();
+ configuredTotalCacheSize += allocator->getMemorySizeInclAdvised();
+ }
 auto addSize = [this](size_t a, PoolId pid) {
- return a + allocator_->getPool(pid).getPoolSize();
+ return a + allocator_[currentTier()]->getPool(pid).getPoolSize();
 };
 const auto regularPoolIds = getRegularPoolIds();
 const auto ccCachePoolIds = getCCachePoolIds();
@@ -3431,9 +4004,9 @@ CacheMemoryStats CacheAllocator::getCacheMemoryStats() const {
 configuredTotalCacheSize,
 configuredRegularCacheSize,
 configuredCompactCacheSize,
- allocator_->getAdvisedMemorySize(),
+ allocator_[currentTier()]->getAdvisedMemorySize(),
 memMonitor_ ? memMonitor_->getMaxAdvisePct() : 0,
- allocator_->getUnreservedMemorySize(),
+ allocator_[currentTier()]->getUnreservedMemorySize(),
 nvmCache_ ? nvmCache_->getSize() : 0,
 util::getMemAvailable(),
 util::getRSSBytes()};
@@ -3586,6 +4159,64 @@ bool CacheAllocator::startNewReaper(
 return true;
 }
 
+template
+auto CacheAllocator::getAssignedMemoryToBgWorker(size_t evictorId, size_t numWorkers, TierId tid)
+{
+ std::vector assignedMemory;
+ // TODO: for now, only evict from tier 0
+ auto pools = filterCompactCachePools(allocator_[tid]->getPoolIds());
+ for (const auto pid : pools) {
+ const auto& mpStats = getPoolByTid(pid,tid).getStats();
+ for (const auto cid : mpStats.classIds) {
+ if (backgroundWorkerId(tid, pid, cid, numWorkers) == evictorId) {
+ assignedMemory.emplace_back(tid, pid, cid);
+ }
+ }
+ }
+ return assignedMemory;
+}
+
+template
+bool CacheAllocator::startNewBackgroundEvictor(
+ std::chrono::milliseconds interval,
+ std::shared_ptr strategy,
+ size_t threads) {
+ XDCHECK(threads > 0);
+ backgroundEvictor_.resize(threads);
+ bool result = true;
+
+ for (size_t i = 0; i < threads; i++) {
+ auto ret = startNewWorker("BackgroundEvictor" + std::to_string(i), backgroundEvictor_[i], interval, strategy, MoverDir::Evict);
+ result = result && ret;
+
+ if (result) {
+ backgroundEvictor_[i]->setAssignedMemory(getAssignedMemoryToBgWorker(i, backgroundEvictor_.size(), 0));
+ }
+ }
+ return result;
+}
+
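getAssignedMemoryToBgWorker partitions (tier, pool, class) triples across workers via backgroundWorkerId, whose definition is not part of this hunk. A plausible minimal implementation would hash the triple and reduce it modulo the worker count (a sketch, not the diff's actual code; requires folly/Hash.h):

```cpp
// Sketch only - the real definition lives outside this diff.
// Maps a (tid, pid, cid) triple to one of numWorkers background workers so
// that each allocation class is owned by exactly one worker.
template <typename CacheTrait>
size_t CacheAllocator<CacheTrait>::backgroundWorkerId(TierId tid,
                                                      PoolId pid,
                                                      ClassId cid,
                                                      size_t numWorkers) {
  XDCHECK(numWorkers);
  // Stable assignment: the same class always maps to the same worker.
  return folly::hash::hash_combine(tid, pid, cid) % numWorkers;
}
```
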
+template
+bool CacheAllocator::startNewBackgroundPromoter(
+ std::chrono::milliseconds interval,
+ std::shared_ptr strategy,
+ size_t threads) {
+ XDCHECK(threads > 0);
+ XDCHECK(getNumTiers() > 1);
+ backgroundPromoter_.resize(threads);
+ bool result = true;
+
+ for (size_t i = 0; i < threads; i++) {
+ auto ret = startNewWorker("BackgroundPromoter" + std::to_string(i), backgroundPromoter_[i], interval, strategy, MoverDir::Promote);
+ result = result && ret;
+
+ if (result) {
+ backgroundPromoter_[i]->setAssignedMemory(getAssignedMemoryToBgWorker(i, backgroundPromoter_.size(), 1));
+ }
+ }
+ return result;
+}
+
 template
 bool CacheAllocator::stopPoolRebalancer(
 std::chrono::seconds timeout) {
@@ -3613,6 +4244,26 @@ bool CacheAllocator::stopReaper(std::chrono::seconds timeout) {
 return stopWorker("Reaper", reaper_, timeout);
 }
 
+template
+bool CacheAllocator::stopBackgroundEvictor(std::chrono::seconds timeout) {
+ bool result = true;
+ for (size_t i = 0; i < backgroundEvictor_.size(); i++) {
+ auto ret = stopWorker("BackgroundEvictor", backgroundEvictor_[i], timeout);
+ result = result && ret;
+ }
+ return result;
+}
+
+template
+bool CacheAllocator::stopBackgroundPromoter(std::chrono::seconds timeout) {
+ bool result = true;
+ for (size_t i = 0; i < backgroundPromoter_.size(); i++) {
+ auto ret = stopWorker("BackgroundPromoter", backgroundPromoter_[i], timeout);
+ result = result && ret;
+ }
+ return result;
+}
+
 template
 bool CacheAllocator::cleanupStrayShmSegments(
 const std::string& cacheDir, bool posix) {
@@ -3621,6 +4272,8 @@ bool CacheAllocator::cleanupStrayShmSegments(
 // cache dir exists. clean up only if there are no other processes
 // attached. if another process was attached, the following would fail.
 ShmManager::cleanup(cacheDir, posix);
+
+ // TODO: cleanup per-tier state
 } catch (const std::exception& e) {
 XLOGF(ERR, "Error cleaning up {}. Exception: ", cacheDir, e.what());
 return false;
@@ -3630,7 +4283,8 @@ bool CacheAllocator::cleanupStrayShmSegments(
 // Any other concurrent process can not be attached to the segments or
 // even if it does, we want to mark it for destruction.
 ShmManager::removeByName(cacheDir, detail::kShmInfoName, posix);
- ShmManager::removeByName(cacheDir, detail::kShmCacheName, posix);
+ ShmManager::removeByName(cacheDir, detail::kShmCacheName
+ + std::to_string(0 /* TODO: per tier */), posix);
 ShmManager::removeByName(cacheDir, detail::kShmHashTableName, posix);
 ShmManager::removeByName(cacheDir, detail::kShmChainedItemHashTableName,
 posix);
@@ -3644,14 +4298,16 @@ uint64_t CacheAllocator::getItemPtrAsOffset(const void* ptr) {
 // the two differ (e.g. Mac OS 12) - causing templating instantiation
 // errors downstream.
+ auto tid = getTierId(ptr);
+
 // if this succeeeds, the address is valid within the cache. 
- allocator_->getAllocInfo(ptr); + allocator_[tid]->getAllocInfo(ptr); if (!isOnShm_ || !shmManager_) { throw std::invalid_argument("Shared memory not used"); } - const auto& shm = shmManager_->getShmByName(detail::kShmCacheName); + const auto& shm = shmManager_->getShmByName(detail::kShmCacheName + std::to_string(tid)); return reinterpret_cast(ptr) - reinterpret_cast(shm.getCurrentMapping().addr); diff --git a/cachelib/allocator/CacheAllocator.h b/cachelib/allocator/CacheAllocator.h index ed0096390a..d02c093c12 100644 --- a/cachelib/allocator/CacheAllocator.h +++ b/cachelib/allocator/CacheAllocator.h @@ -22,6 +22,8 @@ #include #include #include +#include +#include #include #include @@ -38,6 +40,7 @@ #include #pragma GCC diagnostic pop +#include "cachelib/allocator/BackgroundMover.h" #include "cachelib/allocator/CCacheManager.h" #include "cachelib/allocator/Cache.h" #include "cachelib/allocator/CacheAllocatorConfig.h" @@ -710,6 +713,11 @@ class CacheAllocator : public CacheBase { // @return the full usable size for this item uint32_t getUsableSize(const Item& item) const; + // gets the allocation class assigned to BG worker + auto getAssignedMemoryToBgWorker(size_t evictorId, size_t numWorkers, TierId tid); + bool shouldWakeupBgEvictor(TierId tid, PoolId pid, ClassId cid); + size_t backgroundWorkerId(TierId tid, PoolId pid, ClassId cid, size_t numWorkers); + // Get a random item from memory // This is useful for profiling and sampling cachelib managed memory // @@ -806,7 +814,7 @@ class CacheAllocator : public CacheBase { // @param config new config for the pool // // @throw std::invalid_argument if the poolId is invalid - void overridePoolConfig(PoolId pid, const MMConfig& config); + void overridePoolConfig(TierId tid, PoolId pid, const MMConfig& config); // update an existing pool's rebalance strategy // @@ -847,8 +855,9 @@ class CacheAllocator : public CacheBase { // @return true if the operation succeeded. false if the size of the pool is // smaller than _bytes_ // @throw std::invalid_argument if the poolId is invalid. + // TODO: should call shrinkPool for specific tier? bool shrinkPool(PoolId pid, size_t bytes) { - return allocator_->shrinkPool(pid, bytes); + return allocator_[currentTier()]->shrinkPool(pid, bytes); } // grow an existing pool by _bytes_. This will fail if there is no @@ -857,8 +866,9 @@ class CacheAllocator : public CacheBase { // @return true if the pool was grown. false if the necessary number of // bytes were not available. // @throw std::invalid_argument if the poolId is invalid. + // TODO: should call growPool for specific tier? bool growPool(PoolId pid, size_t bytes) { - return allocator_->growPool(pid, bytes); + return allocator_[currentTier()]->growPool(pid, bytes); } // move bytes from one pool to another. The source pool should be at least @@ -871,7 +881,7 @@ class CacheAllocator : public CacheBase { // correct size to do the transfer. 
   // @throw std::invalid_argument if src or dest is invalid pool
   bool resizePools(PoolId src, PoolId dest, size_t bytes) override {
-    return allocator_->resizePools(src, dest, bytes);
+    return allocator_[currentTier()]->resizePools(src, dest, bytes);
   }
 
   // Add a new compact cache with given name and size
@@ -1053,6 +1063,11 @@ class CacheAllocator : public CacheBase {
   // @param reaperThrottleConfig throttling config
   bool startNewReaper(std::chrono::milliseconds interval,
                       util::Throttler::Config reaperThrottleConfig);
+
+  bool startNewBackgroundPromoter(std::chrono::milliseconds interval,
+      std::shared_ptr strategy, size_t threads);
+  bool startNewBackgroundEvictor(std::chrono::milliseconds interval,
+      std::shared_ptr strategy, size_t threads);
 
   // Stop existing workers with a timeout
   bool stopPoolRebalancer(std::chrono::seconds timeout = std::chrono::seconds{
@@ -1062,6 +1077,8 @@
       0});
   bool stopMemMonitor(std::chrono::seconds timeout = std::chrono::seconds{0});
   bool stopReaper(std::chrono::seconds timeout = std::chrono::seconds{0});
+  bool stopBackgroundEvictor(std::chrono::seconds timeout = std::chrono::seconds{0});
+  bool stopBackgroundPromoter(std::chrono::seconds timeout = std::chrono::seconds{0});
 
   // Set pool optimization to either true or false
   //
@@ -1076,12 +1093,13 @@
   // @throw std::invalid_argument if the memory does not belong to this
   //        cache allocator
   AllocInfo getAllocInfo(const void* memory) const {
-    return allocator_->getAllocInfo(memory);
+    return allocator_[getTierId(memory)]->getAllocInfo(memory);
   }
 
   // return the ids for the set of existing pools in this cache.
   std::set getPoolIds() const override final {
-    return allocator_->getPoolIds();
+    // all tiers have the same pool ids. TODO: deduplicate
+    return allocator_[0]->getPoolIds();
   }
 
   // return a list of pool ids that are backing compact caches. This includes
@@ -1093,18 +1111,22 @@
   // return the pool with specified id.
   const MemoryPool& getPool(PoolId pid) const override final {
-    return allocator_->getPool(pid);
+    return allocator_[currentTier()]->getPool(pid);
+  }
+
+  const MemoryPool& getPoolByTid(PoolId pid, TierId tid) const override final {
+    return allocator_[tid]->getPool(pid);
   }
 
   // calculate the number of slabs to be advised/reclaimed in each pool
   PoolAdviseReclaimData calcNumSlabsToAdviseReclaim() override final {
     auto regularPoolIds = getRegularPoolIds();
-    return allocator_->calcNumSlabsToAdviseReclaim(regularPoolIds);
+    return allocator_[currentTier()]->calcNumSlabsToAdviseReclaim(regularPoolIds);
   }
 
   // update number of slabs to advise in the cache
   void updateNumSlabsToAdvise(int32_t numSlabsToAdvise) override final {
-    allocator_->updateNumSlabsToAdvise(numSlabsToAdvise);
+    allocator_[currentTier()]->updateNumSlabsToAdvise(numSlabsToAdvise);
   }
 
   // returns a valid PoolId corresponding to the name or kInvalidPoolId if the
@@ -1112,8 +1134,9 @@
   PoolId getPoolId(folly::StringPiece name) const noexcept;
 
   // returns the pool's name by its poolId.
-  std::string getPoolName(PoolId poolId) const override {
-    return allocator_->getPoolName(poolId);
+  std::string getPoolName(PoolId poolId) const {
+    // all tiers have the same pool names.
+    return allocator_[0]->getPoolName(poolId);
   }
 
   // get stats related to all kinds of slab release events.
@@ -1145,6 +1168,52 @@
     auto stats = reaper_ ?
 reaper_->getStats() : ReaperStats{};
     return stats;
   }
+
+  // returns the background mover stats
+  BackgroundMoverStats getBackgroundMoverStats(MoverDir direction) const {
+
+    auto stats = BackgroundMoverStats{};
+    if (direction == MoverDir::Evict) {
+      for (auto &bg : backgroundEvictor_)
+        stats += bg->getStats();
+    } else if (direction == MoverDir::Promote) {
+      for (auto &bg : backgroundPromoter_)
+        stats += bg->getStats();
+    }
+    return stats;
+
+  }
+
+
+  std::map>>
+  getBackgroundMoverClassStats(MoverDir direction) const {
+    std::map>> stats;
+
+    if (direction == MoverDir::Evict) {
+      for (auto &bg : backgroundEvictor_) {
+        for (auto &tid : bg->getClassStats()) {
+          for (auto &pid : tid.second) {
+            for (auto &cid : pid.second) {
+              stats[tid.first][pid.first][cid.first] += cid.second;
+            }
+          }
+        }
+      }
+    } else if (direction == MoverDir::Promote) {
+      for (auto &bg : backgroundPromoter_) {
+        for (auto &tid : bg->getClassStats()) {
+          for (auto &pid : tid.second) {
+            for (auto &cid : pid.second) {
+              stats[tid.first][pid.first][cid.first] += cid.second;
+            }
+          }
+        }
+      }
+    }
+
+    return stats;
+  }
+
   // return the LruType of an item
   typename MMType::LruType getItemLruType(const Item& item) const;
 
@@ -1159,6 +1228,9 @@
   // whether it is object-cache
   bool isObjectCache() const override final { return false; }
 
+  // combined pool size for all memory tiers
+  size_t getPoolSize(PoolId pid) const;
+
   // pool stats by pool id
   PoolStats getPoolStats(PoolId pid) const override final;
 
@@ -1175,6 +1247,9 @@
   // return cache's memory usage stats
   CacheMemoryStats getCacheMemoryStats() const override final;
 
+  // return stats for Allocation Class
+  ACStats getACStats(TierId tid, PoolId pid, ClassId cid) const override final;
+
   // return the nvm cache stats map
   util::StatsMap getNvmCacheStatsMap() const override final;
 
@@ -1284,6 +1359,7 @@
                     sizeof(typename RefcountWithFlags::Value) + sizeof(uint32_t) +
                     sizeof(uint32_t) + sizeof(KAllocation)) == sizeof(Item),
                 "vtable overhead");
+  // Check for CompressedPtr single/multi tier support
   static_assert(32 == sizeof(Item), "item overhead is 32 bytes");
 
   // make sure there is no overhead in ChainedItem on top of a regular Item
@@ -1308,7 +1384,7 @@
 
  private:
   // wrapper around Item's refcount and active handle tracking
-  FOLLY_ALWAYS_INLINE void incRef(Item& it);
+  FOLLY_ALWAYS_INLINE RefcountWithFlags::incResult incRef(Item& it, bool failIfMoving);
   FOLLY_ALWAYS_INLINE RefcountWithFlags::Value decRef(Item& it);
 
   // drops the refcount and if needed, frees the allocation back to the memory
@@ -1359,6 +1435,12 @@
                                  bool nascent = false,
                                  const Item* toRecycle = nullptr);
 
+  // Must be called by the thread which called markForEviction and
+  // succeeded. After this call, the item is unlinked from Access and
+  // MM Containers. The item is no longer marked as exclusive and its
+  // ref count is 0 - it's available for recycling.
+  void unlinkItemForEviction(Item& it);
+
   // acquires a handle on the item. returns an empty handle if it is null.
   // @param it   pointer to an item
   // @return WriteHandle   return a handle to this item
@@ -1378,11 +1460,14 @@
   using MMContainerPtr = std::unique_ptr;
   using MMContainers =
-      std::array,
-                 MemoryPoolManager::kMaxPools>;
+      std::vector,
+                  MemoryPoolManager::kMaxPools>>;
 
   void createMMContainers(const PoolId pid, MMConfig config);
 
+  TierId getTierId(const Item& item) const;
+  TierId getTierId(const void* ptr) const;
+
   // acquire the MMContainer corresponding to the Item's class and pool.
   //
   // @return pointer to the MMContainer.
@@ -1390,7 +1475,12 @@
   //         allocation from the memory allocator.
   MMContainer& getMMContainer(const Item& item) const noexcept;
 
-  MMContainer& getMMContainer(PoolId pid, ClassId cid) const noexcept;
+  MMContainer& getMMContainer(TierId tid, PoolId pid, ClassId cid) const noexcept;
+
+  // Get stats of the specified pid and cid.
+  // If such mmcontainer is not valid (pool id or cid out of bound)
+  // or the mmcontainer is not initialized, return an empty stat.
+  MMContainerStat getMMContainerStat(TierId tid, PoolId pid, ClassId cid) const noexcept;
 
   // create a new cache allocation. The allocation can be initialized
   // appropriately and made accessible through insert or insertOrReplace.
@@ -1420,7 +1510,20 @@
                               Key key,
                               uint32_t size,
                               uint32_t creationTime,
-                              uint32_t expiryTime);
+                              uint32_t expiryTime,
+                              bool fromBgThread = false);
+
+  // create a new cache allocation on a specific memory tier.
+  // For description see allocateInternal.
+  //
+  // @param tid id of a memory tier
+  WriteHandle allocateInternalTier(TierId tid,
+                                   PoolId id,
+                                   Key key,
+                                   uint32_t size,
+                                   uint32_t creationTime,
+                                   uint32_t expiryTime,
+                                   bool fromBgThread);
 
   // Allocate a chained item
   //
@@ -1448,17 +1551,17 @@
   // @return handle to the parent item if the validations pass
   //         otherwise, an empty Handle is returned.
   //
-  ReadHandle validateAndGetParentHandleForChainedMoveLocked(
+  WriteHandle validateAndGetParentHandleForChainedMoveLocked(
       const ChainedItem& item, const Key& parentKey);
 
   // Given an existing item, allocate a new one for the
   // existing one to later be moved into.
   //
-  // @param oldItem    the item we want to allocate a new item for
+  // @param item    reference to the item we want to allocate a new item for
   //
   // @return  handle to the newly allocated item
   //
-  WriteHandle allocateNewItemForOldItem(const Item& oldItem);
+  WriteHandle allocateNewItemForOldItem(const Item& item);
 
   // internal helper that grabs a refcounted handle to the item. This does
   // not record the access to reflect in the mmContainer.
@@ -1507,12 +1610,21 @@
   //          not exist.
   FOLLY_ALWAYS_INLINE WriteHandle findFastImpl(Key key, AccessMode mode);
 
+  // Moves a regular item to a different memory tier.
+  //
+  // @param oldItem     Reference to the item being moved
+  // @param newItemHdl  Reference to the handle of the new item being moved into
+  //
+  // @return true  If the move was completed, and the containers were updated
+  //               successfully.
+  bool moveRegularItemWithSync(Item& oldItem, WriteHandle& newItemHdl);
+
   // Moves a regular item to a different slab. This should only be used during
   // slab release after the item's exclusive bit has been set. The user supplied
   // callback is responsible for copying the contents and fixing the semantics
   // of chained item.
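// Editorial sketch of the move flow (simplified; not the authoritative
// implementation): a mover first allocates a destination item, then invokes
// the synchronizing move and relies on the containers being updated.
//
//   WriteHandle newItemHdl = allocateNewItemForOldItem(oldItem);
//   if (newItemHdl && moveRegularItemWithSync(oldItem, newItemHdl)) {
//     // success: containers now reference the new item and the old
//     // allocation can be released back to its tier
//   }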
   //
-  // @param oldItem     Reference to the item being moved
+  // @param oldItem     item being moved
   // @param newItemHdl  Reference to the handle of the new item being moved into
   //
   // @return true  If the move was completed, and the containers were updated
   //               successfully.
@@ -1592,6 +1704,10 @@
   //         false if the item is not in MMContainer
   bool removeFromMMContainer(Item& item);
 
+  using EvictionIterator = typename MMContainer::LockedIterator;
+
+  WriteHandle acquire(EvictionIterator& it) { return acquire(it.get()); }
+
   // Replaces an item in the MMContainer with another item, at the same
   // position.
   //
@@ -1602,6 +1718,17 @@
   //         destination item did not exist in the container, or if the
   //         source item already existed.
   bool replaceInMMContainer(Item& oldItem, Item& newItem);
+  bool replaceInMMContainer(Item* oldItem, Item& newItem);
+  bool replaceInMMContainer(EvictionIterator& oldItemIt, Item& newItem);
+
+  TierId getTargetTierForItem(PoolId pid,
+                              typename Item::Key key,
+                              uint32_t size,
+                              uint32_t creationTime,
+                              uint32_t expiryTime);
+
+  bool shouldEvictToNextMemoryTier(
+      TierId sourceTierId, TierId targetTierId, PoolId pid, Item& item);
 
   // Replaces an item in the MMContainer with another item, at the same
   // position. Or, if the two chained items belong to two different MM
@@ -1658,28 +1785,41 @@
   // @param  pid  the id of the pool to look for evictions inside
   // @param  cid  the id of the class to look for evictions inside
   // @return An evicted item or nullptr if there is no suitable candidate.
-  Item* findEviction(PoolId pid, ClassId cid);
+  Item* findEviction(TierId tid, PoolId pid, ClassId cid);
 
-  using EvictionIterator = typename MMContainer::LockedIterator;
+  // Try to move the item down to the next memory tier
+  //
+  // @param tid   current tier ID of the item
+  // @param pid   the pool ID the item belongs to.
+  // @param item  the item to evict
+  //
+  // @return valid handle to the item. This will be the last
+  //         handle to the item. On failure an empty handle.
+  WriteHandle tryEvictToNextMemoryTier(TierId tid, PoolId pid, Item& item, bool fromBgThread);
+
+  WriteHandle tryPromoteToNextMemoryTier(TierId tid, PoolId pid, Item& item, bool fromBgThread);
 
-  // Advance the current iterator and try to evict a regular item
+  WriteHandle tryPromoteToNextMemoryTier(Item& item, bool fromBgThread);
+
+  // Wakes up waiters if there are any
   //
-  // @param mmContainer  the container to look for evictions.
-  // @param itr          iterator holding the item
+  // @param item    wakes waiters that are waiting on that item
+  // @param handle  handle to pass to the waiters
+  void wakeUpWaiters(Item& item, WriteHandle handle);
+
+  // Unmarks item as moving and wakes up any waiters waiting on that item
   //
-  // @return valid handle to regular item on success. This will be the last
-  //         handle to the item. On failure an empty handle.
-  WriteHandle advanceIteratorAndTryEvictRegularItem(MMContainer& mmContainer,
-                                                    EvictionIterator& itr);
+  // @param item    wakes waiters that are waiting on that item
+  // @param handle  handle to pass to the waiters
+  typename RefcountWithFlags::Value unmarkMovingAndWakeUpWaiters(Item &item, WriteHandle handle);
 
-  // Advance the current iterator and try to evict a chained item
-  // Iterator may also be reset during the course of this function
+  // Try to move the item down to the next memory tier
   //
-  // @param itr iterator holding the item
+  // @param item the item to evict
   //
-  // @return valid handle to the parent item on success. This will be the last
-  //         handle to the item
-  WriteHandle advanceIteratorAndTryEvictChainedItem(EvictionIterator& itr);
+  // @return valid handle to the item. This will be the last
+  //         handle to the item. On failure an empty handle.
+  WriteHandle tryEvictToNextMemoryTier(Item& item, bool fromBgThread);
 
   // Deserializer CacheAllocatorMetadata and verify the version
   //
@@ -1693,7 +1833,7 @@
       const typename Item::PtrCompressor& compressor);
 
   unsigned int reclaimSlabs(PoolId id, size_t numSlabs) final {
-    return allocator_->reclaimSlabsAndGrow(id, numSlabs);
+    return allocator_[currentTier()]->reclaimSlabsAndGrow(id, numSlabs);
   }
 
   FOLLY_ALWAYS_INLINE EventTracker* getEventTracker() const {
@@ -1752,26 +1892,27 @@
                         const void* hint = nullptr) final;
 
   // @param releaseContext slab release context
-  void releaseSlabImpl(const SlabReleaseContext& releaseContext);
+  void releaseSlabImpl(TierId tid, const SlabReleaseContext& releaseContext);
 
   // @return true when successfully marked as moving,
   //         false when this item has already been freed
-  bool markExclusiveForSlabRelease(const SlabReleaseContext& ctx,
-                                   void* alloc,
-                                   util::Throttler& throttler);
+  bool markMovingForSlabRelease(const SlabReleaseContext& ctx,
+                                void* alloc,
+                                util::Throttler& throttler);
 
   // "Move" (by copying) the content in this item to another memory
   // location by invoking the move callback.
   //
   //
   // @param ctx       slab release context
-  // @param item      old item to be moved elsewhere
+  // @param oldItem   old item to be moved elsewhere
+  // @param handle    handle to the item or to its parent (if chained)
+  // @param throttler slow this function down so as not to take too much cpu
   //
   // @return    true  if the item has been moved
   //            false if we have exhausted moving attempts
   bool moveForSlabRelease(const SlabReleaseContext& ctx,
-                          Item& item,
+                          Item& oldItem,
                           util::Throttler& throttler);
 
   // "Move" (by copying) the content in this item to another memory
@@ -1794,18 +1935,13 @@
                            Item& item,
                            util::Throttler& throttler);
 
-  // Helper function to evict a normal item for slab release
-  //
-  // @return last handle for corresponding to item on success. empty handle on
-  //         failure. caller can retry if needed.
-  WriteHandle evictNormalItemForSlabRelease(Item& item);
+  typename NvmCacheT::PutToken createPutToken(Item& item);
 
-  // Helper function to evict a child item for slab release
-  // As a side effect, the parent item is also evicted
+  // Helper function to remove an item if the predicate is true.
   //
-  // @return last handle to the parent item of the child on success. empty
-  //         handle on failure. caller can retry.
-  WriteHandle evictChainedItemForSlabRelease(ChainedItem& item);
+  // @return last handle to the item on success. empty handle on failure.
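// Editorial sketch: the expiry path can be expressed through this
// predicate-based helper, e.g.
//
//   auto handle = removeIf(item, itemExpiryPredicate);
//
// where itemExpiryPredicate admits items holding a single reference whose
// TTL has passed.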
+  template
+  WriteHandle removeIf(Item& item, Fn&& predicate);
 
   // Helper function to remove an item if it is expired.
   //
@@ -1824,10 +1960,141 @@
     // primitives. So we consciously exempt ourselves here from TSAN data race
     // detection.
     folly::annotate_ignore_thread_sanitizer_guard g(__FILE__, __LINE__);
-    auto slabsSkipped = allocator_->forEachAllocation(std::forward(f));
+    auto slabsSkipped = allocator_[currentTier()]->forEachAllocation(std::forward(f));
     stats().numReaperSkippedSlabs.add(slabsSkipped);
   }
 
+  // exposed for the background evictor to iterate through the memory and evict
+  // in batch. This should improve the insertion path for tiered memory configs
+  size_t traverseAndEvictItems(unsigned int tid, unsigned int pid, unsigned int cid, size_t batch) {
+    auto& mmContainer = getMMContainer(tid, pid, cid);
+    size_t evictions = 0;
+    size_t evictionCandidates = 0;
+    std::vector candidates;
+    candidates.reserve(batch);
+
+    size_t tries = 0;
+    mmContainer.withEvictionIterator([&tries, &candidates, &batch, &mmContainer, this](auto &&itr) {
+      while (candidates.size() < batch &&
+             (config_.maxEvictionPromotionHotness == 0 || tries < config_.maxEvictionPromotionHotness) &&
+             itr) {
+        tries++;
+        Item* candidate = itr.get();
+        XDCHECK(candidate);
+
+        if (candidate->isChainedItem()) {
+          throw std::runtime_error("Not supported for chained items");
+        }
+
+        if (candidate->markMoving(true)) {
+          mmContainer.remove(itr);
+          candidates.push_back(candidate);
+        } else {
+          ++itr;
+        }
+      }
+    });
+
+    for (Item *candidate : candidates) {
+      auto evictedToNext = tryEvictToNextMemoryTier(*candidate, true /* from BgThread */);
+      if (!evictedToNext) {
+        auto token = createPutToken(*candidate);
+
+        auto ret = candidate->markForEvictionWhenMoving();
+        XDCHECK(ret);
+
+        unlinkItemForEviction(*candidate);
+        // wake up any readers that wait for the move to complete
+        // it's safe to do now, as we have the item marked exclusive and
+        // no other reader can be added to the waiters list
+        wakeUpWaiters(*candidate, WriteHandle{});
+
+        if (token.isValid() && shouldWriteToNvmCacheExclusive(*candidate)) {
+          nvmCache_->put(*candidate, std::move(token));
+        }
+      } else {
+        evictions++;
+        XDCHECK(!evictedToNext->isMarkedForEviction() && !evictedToNext->isMoving());
+        XDCHECK(!candidate->isMarkedForEviction() && !candidate->isMoving());
+        XDCHECK(!candidate->isAccessible());
+        XDCHECK(candidate->getKey() == evictedToNext->getKey());
+
+        wakeUpWaiters(*candidate, std::move(evictedToNext));
+      }
+      XDCHECK(!candidate->isMarkedForEviction() && !candidate->isMoving());
+
+      if (candidate->hasChainedItem()) {
+        (*stats_.chainedItemEvictions)[pid][cid].inc();
+      } else {
+        (*stats_.regularItemEvictions)[pid][cid].inc();
+      }
+
+      // it's safe to recycle the item here as there are no more
+      // references and the item could not have been marked as moving
+      // by another thread since it's detached from the MMContainer.
+      auto res = releaseBackToAllocator(*candidate, RemoveContext::kEviction,
+                                        /* isNascent */ false);
+      XDCHECK(res == ReleaseRes::kReleased);
+    }
+    return evictions;
+  }
+
+  size_t traverseAndPromoteItems(unsigned int tid, unsigned int pid, unsigned int cid, size_t batch) {
+    auto& mmContainer = getMMContainer(tid, pid, cid);
+    size_t promotions = 0;
+    std::vector candidates;
+    candidates.reserve(batch);
+
+    size_t tries = 0;
+
+    mmContainer.withPromotionIterator([&tries, &candidates, &batch, &mmContainer, this](auto &&itr){
+      while (candidates.size() < batch && (config_.maxEvictionPromotionHotness == 0 || tries < config_.maxEvictionPromotionHotness) && itr) {
+        tries++;
+        Item* candidate = itr.get();
+        XDCHECK(candidate);
+
+        if (candidate->isChainedItem()) {
+          throw std::runtime_error("Not supported for chained items");
+        }
+
+        // TODO: only allow it for read-only items?
+        // or implement mvcc
+        if (candidate->markMoving(true)) {
+          candidates.push_back(candidate);
+        }
+
+        ++itr;
+      }
+    });
+
+    for (Item *candidate : candidates) {
+      auto promoted = tryPromoteToNextMemoryTier(*candidate, true);
+      if (promoted) {
+        promotions++;
+        removeFromMMContainer(*candidate);
+        XDCHECK(!candidate->isMarkedForEviction() && !candidate->isMoving());
+        // it's safe to recycle the item here as there are no more
+        // references and the item could not have been marked as moving
+        // by another thread since it's detached from the MMContainer.
+        auto res = releaseBackToAllocator(*candidate, RemoveContext::kEviction,
+                                          /* isNascent */ false);
+        XDCHECK(res == ReleaseRes::kReleased);
+        wakeUpWaiters(*candidate, std::move(promoted));
+      } else {
+        // we failed to allocate a new item, this item is no longer moving
+        auto ref = unmarkMovingAndWakeUpWaiters(*candidate, {});
+        if (UNLIKELY(ref == 0)) {
+          const auto res =
+              releaseBackToAllocator(*candidate,
+                                     RemoveContext::kNormal, false);
+          XDCHECK(res == ReleaseRes::kReleased);
+        }
+      }
+
+    }
+    return promotions;
+  }
+
   // returns true if nvmcache is enabled and we should write this item to
   // nvmcache.
   bool shouldWriteToNvmCache(const Item& item);
@@ -1868,10 +2135,10 @@
                   std::unique_ptr& worker,
                   std::chrono::seconds timeout = std::chrono::seconds{0});
 
-  ShmSegmentOpts createShmCacheOpts();
-  std::unique_ptr createNewMemoryAllocator();
-  std::unique_ptr restoreMemoryAllocator();
-  std::unique_ptr restoreCCacheManager();
+  ShmSegmentOpts createShmCacheOpts(TierId tid);
+  std::unique_ptr createNewMemoryAllocator(TierId tid);
+  std::unique_ptr restoreMemoryAllocator(TierId tid);
+  std::unique_ptr restoreCCacheManager(TierId tid);
 
   PoolIds filterCompactCachePools(const PoolIds& poolIds) const;
 
@@ -1891,7 +2158,7 @@
   }
 
   typename Item::PtrCompressor createPtrCompressor() const {
-    return allocator_->createPtrCompressor();
+    return typename Item::PtrCompressor(allocator_);
   }
 
   // helper utility to throttle and optionally log.
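// Editorial sketch of a background evictor iteration (assumed shape, not the
// authoritative BackgroundMover implementation; strategy_ and assignedMemory
// are illustrative names): per-class batch sizes come from the strategy and
// are fed to traverseAndEvictItems().
//
//   auto batches = strategy_->calculateBatchSizes(cache, assignedMemory);
//   for (size_t i = 0; i < batches.size(); i++) {
//     const auto [tid, pid, cid] = assignedMemory[i];
//     if (batches[i] > 0) {
//       cache.traverseAndEvictItems(tid, pid, cid, batches[i]);
//     }
//   }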
@@ -1914,9 +2181,14 @@
   // @param  type the type of initialization
   // @return nullptr if the type is invalid
-  // @return pointer to memory allocator
+  // @return vector of pointers to memory allocator
   // @throw std::runtime_error if type is invalid
-  std::unique_ptr initAllocator(InitMemType type);
+  std::vector> initAllocator(InitMemType type);
+
+  std::vector> createPrivateAllocator();
+  std::vector> createAllocators();
+  std::vector> restoreAllocators();
+
   // @param  type the type of initialization
   // @return nullptr if the type is invalid
   // @return pointer to access container
@@ -1928,18 +2200,14 @@
   std::optional saveNvmCache();
   void saveRamCache();
 
-  static bool itemExclusivePredicate(const Item& item) {
-    return item.getRefCount() == 0;
+  static bool itemSlabMovePredicate(const Item& item) {
+    return item.isMoving() && item.getRefCount() == 0;
   }
 
   static bool itemExpiryPredicate(const Item& item) {
    return item.getRefCount() == 1 && item.isExpired();
   }
 
-  static bool parentEvictForSlabReleasePredicate(const Item& item) {
-    return item.getRefCount() == 1 && !item.isExclusive();
-  }
-
   std::unique_ptr createDeserializer();
 
   // Execute func on each item. `func` can throw exception but must ensure
@@ -1978,6 +2246,100 @@
 
   // BEGIN private members
 
+  TierId currentTier() const {
+    // TODO: every function which calls this method should be refactored.
+    // We should go case by case and either make such function work on
+    // all tiers or expose separate parameter to describe the tier ID.
+    return 0;
+  }
+
+  unsigned getNumTiers() const {
+    return memoryTierConfigs.size();
+  }
+
+  size_t memoryTierSize(TierId tid) const;
+
+  WriteHandle handleWithWaitContextForMovingItem(Item& item);
+
+  size_t wakeUpWaitersLocked(folly::StringPiece key, WriteHandle&& handle);
+
+  class MoveCtx {
+   public:
+    MoveCtx() {}
+
+    ~MoveCtx() {
+      // prevent any further enqueue to waiters
+      // Note: we don't need to hold locks since no one can enqueue
+      // after this point.
+      wakeUpWaiters();
+    }
+
+    // record the item handle. Upon destruction we will wake up the waiters
+    // and pass a clone of the handle to the callBack. By default we pass
+    // a null handle
+    void setItemHandle(WriteHandle _it) { it = std::move(_it); }
+
+    // enqueue a waiter into the waiter list
+    // @param  waiter       WaitContext
+    void addWaiter(std::shared_ptr> waiter) {
+      XDCHECK(waiter);
+      waiters.push_back(std::move(waiter));
+    }
+
+    size_t numWaiters() const { return waiters.size(); }
+
+   private:
+    // notify all pending waiters that are waiting for the fetch.
+    void wakeUpWaiters() {
+      bool refcountOverflowed = false;
+      for (auto& w : waiters) {
+        // If refcount overflowed earlier, then we will return a miss to
+        // all subsequent waiters.
+        if (refcountOverflowed) {
+          w->set(WriteHandle{});
+          continue;
+        }
+
+        try {
+          w->set(it.clone());
+        } catch (const exception::RefcountOverflow&) {
+          // We'll return a miss to the user's pending read,
+          // so we should enqueue a delete via NvmCache.
+          // TODO: cache.remove(it);
+          refcountOverflowed = true;
+        }
+      }
+    }
+
+    WriteHandle it; // will be set when Context is being filled
+    std::vector>> waiters; // list of
+                                                      // waiters
+  };
+  using MoveMap =
+      folly::F14ValueMap,
+                         folly::HeterogeneousAccessHash>;
+
+  static size_t getShardForKey(folly::StringPiece key) {
+    return folly::Hash()(key) % kShards;
+  }
+
+  MoveMap& getMoveMapForShard(size_t shard) {
+    return movesMap_[shard].movesMap_;
+  }
+
+  MoveMap& getMoveMap(folly::StringPiece key) {
+    return getMoveMapForShard(getShardForKey(key));
+  }
+
+  std::unique_lock getMoveLockForShard(size_t shard) {
+    return std::unique_lock(moveLock_[shard].moveLock_);
+  }
+
+  std::unique_lock getMoveLock(folly::StringPiece key) {
+    return getMoveLockForShard(getShardForKey(key));
+  }
+
   // Whether the memory allocator for this cache allocator was created on shared
   // memory. The hash table, chained item hash table etc. are also created on
   // shared memory except for temporary shared memory mode when they're created
@@ -1986,6 +2348,8 @@
 
   Config config_{};
 
+  const typename Config::MemoryTierConfigs memoryTierConfigs;
+
   // Manages the temporary shared memory segment for memory allocator that
   // is not persisted when cache process exits.
   std::unique_ptr tempShm_;
@@ -2003,9 +2367,10 @@
   const MMConfig mmConfig_{};
 
   // the memory allocator for allocating out of the available memory.
-  std::unique_ptr allocator_;
+  std::vector> allocator_;
 
   // compact cache allocator manager
+  // TODO: per tier?
   std::unique_ptr compactCacheManager_;
 
   // compact cache instances reside here when user "add" or "attach" compact
@@ -2055,6 +2420,10 @@
 
   // free memory monitor
   std::unique_ptr memMonitor_;
+
+  // background evictor
+  std::vector>> backgroundEvictor_;
+  std::vector>> backgroundPromoter_;
 
   // check whether a pool is a slabs pool
   std::array isCompactCachePool_{};
@@ -2067,6 +2436,22 @@
   // poolResizer_, poolOptimizer_, memMonitor_, reaper_
   mutable std::mutex workersMutex_;
 
+  static constexpr size_t kShards = 8192; // TODO: need to define the right value
+
+  struct MovesMapShard {
+    alignas(folly::hardware_destructive_interference_size) MoveMap movesMap_;
+  };
+
+  struct MoveLock {
+    alignas(folly::hardware_destructive_interference_size) std::mutex moveLock_;
+  };
+
+  // a map of all pending moves
+  std::vector movesMap_;
+
+  // a map of move locks for each shard
+  std::vector moveLock_;
+
   // time when the ram cache was first created
   const uint32_t cacheCreationTime_{0};
 
@@ -2100,6 +2485,7 @@
   // Make this friend to give access to acquire and release
   friend ReadHandle;
   friend ReaperAPIWrapper;
+  friend BackgroundMoverAPIWrapper;
   friend class CacheAPIWrapperForNvm;
   friend class FbInternalRuntimeUpdateWrapper;
   friend class objcache2::ObjectCache;
diff --git a/cachelib/allocator/CacheAllocatorConfig.h b/cachelib/allocator/CacheAllocatorConfig.h
index ec44ff8467..503b121ccf 100644
--- a/cachelib/allocator/CacheAllocatorConfig.h
+++ b/cachelib/allocator/CacheAllocatorConfig.h
@@ -31,6 +31,7 @@
 #include "cachelib/allocator/MemoryTierCacheConfig.h"
 #include "cachelib/allocator/NvmAdmissionPolicy.h"
 #include "cachelib/allocator/PoolOptimizeStrategy.h"
+#include "cachelib/allocator/BackgroundMoverStrategy.h"
 #include "cachelib/allocator/RebalanceStrategy.h"
 #include "cachelib/allocator/Util.h"
 #include
"cachelib/common/EventInterface.h" @@ -207,6 +208,9 @@ class CacheAllocatorConfig { // Accepts vector of MemoryTierCacheConfig. Each vector element describes // configuration for a single memory cache tier. Tier sizes are specified as // ratios, the number of parts of total cache size each tier would occupy. + // @throw std::invalid_argument if: + // - the size of configs is 0 + // - the size of configs is greater than kMaxCacheMemoryTiers CacheAllocatorConfig& configureMemoryTiers(const MemoryTierConfigs& configs); // Return reference to MemoryTierCacheConfigs. @@ -262,6 +266,16 @@ class CacheAllocatorConfig { std::chrono::seconds regularInterval, std::chrono::seconds ccacheInterval, uint32_t ccacheStepSizePercent); + + // Enable the background evictor - scans a tier to look for objects + // to evict to the next tier + CacheAllocatorConfig& enableBackgroundEvictor( + std::shared_ptr backgroundMoverStrategy, + std::chrono::milliseconds regularInterval, size_t threads); + + CacheAllocatorConfig& enableBackgroundPromoter( + std::shared_ptr backgroundMoverStrategy, + std::chrono::milliseconds regularInterval, size_t threads); // This enables an optimization for Pool rebalancing and resizing. // The rough idea is to ensure only the least useful items are evicted when @@ -337,6 +351,17 @@ class CacheAllocatorConfig { poolOptimizeStrategy != nullptr; } + // @return whether background evictor thread is enabled + bool backgroundEvictorEnabled() const noexcept { + return backgroundEvictorInterval.count() > 0 && + backgroundEvictorStrategy != nullptr; + } + + bool backgroundPromoterEnabled() const noexcept { + return backgroundPromoterInterval.count() > 0 && + backgroundPromoterStrategy != nullptr; + } + // @return whether memory monitor is enabled bool memMonitoringEnabled() const noexcept { return memMonitorConfig.mode != MemoryMonitor::Disabled && @@ -374,8 +399,7 @@ class CacheAllocatorConfig { std::map serialize() const; // The max number of memory cache tiers - // TODO: increase this number when multi-tier configs are enabled - inline static const size_t kMaxCacheMemoryTiers = 1; + inline static const size_t kMaxCacheMemoryTiers = 2; // Cache name for users to indentify their own cache. std::string cacheName{""}; @@ -448,6 +472,16 @@ class CacheAllocatorConfig { // The slab release process is considered as being stuck if it does not // make any progress for the below threshold std::chrono::milliseconds slabReleaseStuckThreshold{std::chrono::seconds(60)}; + + // rebalance to avoid alloc fialures. 
+  std::shared_ptr backgroundEvictorStrategy;
+  std::shared_ptr backgroundPromoterStrategy;
+  // time interval to sleep between runs of the background evictor
+  std::chrono::milliseconds backgroundEvictorInterval{std::chrono::milliseconds{1000}};
+  std::chrono::milliseconds backgroundPromoterInterval{std::chrono::milliseconds{1000}};
+
+  size_t backgroundEvictorThreads{1};
+  size_t backgroundPromoterThreads{1};
 
   // time interval to sleep between iterations of pool size optimization,
   // for regular pools and compact caches
@@ -587,6 +621,32 @@ class CacheAllocatorConfig {
   // If true, we will delay worker start until user explicitly calls
   // CacheAllocator::startCacheWorkers()
   bool delayCacheWorkersStart{false};
+
+  // see MultiTierDataMovement.md
+  double promotionAcWatermark{4.0};
+  double lowEvictionAcWatermark{2.0};
+  double highEvictionAcWatermark{5.0};
+  double numDuplicateElements{0.0}; // inclusiveness of the cache
+  double syncPromotion{0.0}; // whether promotion can be done synchronously in the user thread
+
+  uint64_t evictorThreads{1};
+  uint64_t promoterThreads{1};
+
+  uint64_t maxEvictionBatch{40};
+  uint64_t maxPromotionBatch{10};
+
+  uint64_t minEvictionBatch{1};
+  uint64_t minPromotionBatch{1};
+
+  uint64_t maxEvictionPromotionHotness{60};
+
+  bool disableEvictionToMemory{false};
+  double minAcAllocationWatermark{0.0};
+  double maxAcAllocationWatermark{0.0};
+  double acTopTierEvictionWatermark{0.0}; // TODO: make it per TIER?
+  uint64_t sizeThresholdPolicy{0};
+  double defaultTierChancePercentage{100.0}; // TODO: default could be based on ratio
+  uint64_t forceAllocationTier{UINT64_MAX};
 
   friend CacheT;
 
@@ -924,6 +984,26 @@ CacheAllocatorConfig& CacheAllocatorConfig::enablePoolRebalancing(
   return *this;
 }
 
+template
+CacheAllocatorConfig& CacheAllocatorConfig::enableBackgroundEvictor(
+    std::shared_ptr strategy,
+    std::chrono::milliseconds interval, size_t evictorThreads) {
+  backgroundEvictorStrategy = strategy;
+  backgroundEvictorInterval = interval;
+  backgroundEvictorThreads = evictorThreads;
+  return *this;
+}
+
+template
+CacheAllocatorConfig& CacheAllocatorConfig::enableBackgroundPromoter(
+    std::shared_ptr strategy,
+    std::chrono::milliseconds interval, size_t promoterThreads) {
+  backgroundPromoterStrategy = strategy;
+  backgroundPromoterInterval = interval;
+  backgroundPromoterThreads = promoterThreads;
+  return *this;
+}
+
 template
 CacheAllocatorConfig& CacheAllocatorConfig::enablePoolResizing(
     std::shared_ptr resizeStrategy,
@@ -1085,7 +1165,7 @@ std::map CacheAllocatorConfig::serialize() const {
 
   configMap["size"] = std::to_string(size);
   configMap["cacheDir"] = cacheDir;
-  configMap["posixShm"] = usePosixShm ? "set" : "empty";
+  configMap["posixShm"] = isUsingPosixShm() ?
"set" : "empty"; configMap["defaultAllocSizes"] = ""; // Stringify std::set diff --git a/cachelib/allocator/CacheItem-inl.h b/cachelib/allocator/CacheItem-inl.h index f59fa9d599..b33c1ea28a 100644 --- a/cachelib/allocator/CacheItem-inl.h +++ b/cachelib/allocator/CacheItem-inl.h @@ -148,15 +148,16 @@ std::string CacheItem::toString() const { return folly::sformat( "item: " "memory={}:raw-ref={}:size={}:key={}:hex-key={}:" - "isInMMContainer={}:isAccessible={}:isExclusive={}:references={}:ctime=" + "isInMMContainer={}:isAccessible={}:isMarkedForEviction={}:" + "isMoving={}:references={}:ctime=" "{}:" "expTime={}:updateTime={}:isNvmClean={}:isNvmEvicted={}:hasChainedItem=" "{}", this, getRefCountAndFlagsRaw(), getSize(), folly::humanify(getKey().str()), folly::hexlify(getKey()), - isInMMContainer(), isAccessible(), isExclusive(), getRefCount(), - getCreationTime(), getExpiryTime(), getLastAccessTime(), isNvmClean(), - isNvmEvicted(), hasChainedItem()); + isInMMContainer(), isAccessible(), isMarkedForEviction(), isMoving(), + getRefCount(), getCreationTime(), getExpiryTime(), getLastAccessTime(), + isNvmClean(), isNvmEvicted(), hasChainedItem()); } } @@ -217,23 +218,43 @@ bool CacheItem::isInMMContainer() const noexcept { } template -bool CacheItem::markExclusive() noexcept { - return ref_.markExclusive(); +bool CacheItem::markForEviction() noexcept { + return ref_.markForEviction(); } template -RefcountWithFlags::Value CacheItem::unmarkExclusive() noexcept { - return ref_.unmarkExclusive(); +RefcountWithFlags::Value CacheItem::unmarkForEviction() noexcept { + return ref_.unmarkForEviction(); } template -bool CacheItem::isExclusive() const noexcept { - return ref_.isExclusive(); +bool CacheItem::isMarkedForEviction() const noexcept { + return ref_.isMarkedForEviction(); } template -bool CacheItem::isOnlyExclusive() const noexcept { - return ref_.isOnlyExclusive(); +bool CacheItem::markForEvictionWhenMoving() { + return ref_.markForEvictionWhenMoving(); +} + +template +bool CacheItem::markMoving(bool failIfRefNotZero) { + return ref_.markMoving(failIfRefNotZero); +} + +template +RefcountWithFlags::Value CacheItem::unmarkMoving() noexcept { + return ref_.unmarkMoving(); +} + +template +bool CacheItem::isMoving() const noexcept { + return ref_.isMoving(); +} + +template +bool CacheItem::isOnlyMoving() const noexcept { + return ref_.isOnlyMoving(); } template @@ -335,7 +356,7 @@ bool CacheItem::updateExpiryTime(uint32_t expiryTimeSecs) noexcept { // check for moving to make sure we are not updating the expiry time while at // the same time re-allocating the item with the old state of the expiry time // in moveRegularItem(). 
 See D6852328
-  if (isExclusive() || !isInMMContainer() || isChainedItem()) {
+  if (isMoving() || isMarkedForEviction() || !isInMMContainer() || isChainedItem()) {
     return false;
   }
 
   // attempt to atomically update the value of expiryTime
@@ -451,12 +472,14 @@ std::string CacheChainedItem::toString() const {
   return folly::sformat(
       "chained item: "
       "memory={}:raw-ref={}:size={}:parent-compressed-ptr={}:"
-      "isInMMContainer={}:isAccessible={}:isExclusive={}:references={}:ctime={}"
+      "isInMMContainer={}:isAccessible={}:isMarkedForEviction={}:"
+      "isMoving={}:references={}:ctime={}"
      ":"
      "expTime={}:updateTime={}",
      this, Item::getRefCountAndFlagsRaw(), Item::getSize(), cPtr.getRaw(),
-      Item::isInMMContainer(), Item::isAccessible(), Item::isExclusive(),
-      Item::getRefCount(), Item::getCreationTime(), Item::getExpiryTime(),
+      Item::isInMMContainer(), Item::isAccessible(),
+      Item::isMarkedForEviction(), Item::isMoving(), Item::getRefCount(),
+      Item::getCreationTime(), Item::getExpiryTime(),
      Item::getLastAccessTime());
 }
diff --git a/cachelib/allocator/CacheItem.h b/cachelib/allocator/CacheItem.h
index 06136db032..6728b654eb 100644
--- a/cachelib/allocator/CacheItem.h
+++ b/cachelib/allocator/CacheItem.h
@@ -46,6 +46,9 @@ class BaseAllocatorTest;
 template
 class AllocatorHitStatsTest;
 
+template
+class AllocatorMemoryTiersTest;
+
 template
 class MapTest;
 
@@ -305,12 +308,17 @@ class CACHELIB_PACKED_ATTR CacheItem {
    */
   RefcountWithFlags::Value getRefCountAndFlagsRaw() const noexcept;
 
-  FOLLY_ALWAYS_INLINE void incRef() {
-    if (LIKELY(ref_.incRef())) {
-      return;
+  // Increments item's ref count
+  //
+  // @return true on success, failure if the item is marked as exclusive
+  // @throw exception::RefcountOverflow on ref count overflow
+  FOLLY_ALWAYS_INLINE RefcountWithFlags::incResult incRef(bool failIfMoving) {
+    try {
+      return ref_.incRef(failIfMoving);
+    } catch (exception::RefcountOverflow& e) {
+      throw exception::RefcountOverflow(
+          folly::sformat("{} item: {}", e.what(), toString()));
     }
-    throw exception::RefcountOverflow(
-        folly::sformat("Refcount maxed out. item: {}", toString()));
   }
 
   FOLLY_ALWAYS_INLINE RefcountWithFlags::Value decRef() {
@@ -344,23 +352,43 @@ class CACHELIB_PACKED_ATTR CacheItem {
   /**
    * The following two functions correspond to whether or not an item is
-   * currently in the process of being moved. This happens during a slab
-   * rebalance, eviction or resize operation.
+   * currently in the process of being evicted.
    *
-   * An item can only be marked exclusive when `isInMMContainer` returns true.
+   * An item can only be marked exclusive when `isInMMContainer` returns true
+   * and the item is not already exclusive or moving and the ref count is 0.
    * This operation is atomic.
   *
-   * User can also query if an item "isOnlyExclusive". This returns true only
-   * if the refcount is 0 and only the exclusive bit is set.
-   *
-   * Unmarking exclusive does not depend on `isInMMContainer`.
+   * Unmarking exclusive does not depend on `isInMMContainer`
   * Unmarking exclusive will also return the refcount at the moment of
   * unmarking.
   */
-  bool markExclusive() noexcept;
-  RefcountWithFlags::Value unmarkExclusive() noexcept;
-  bool isExclusive() const noexcept;
-  bool isOnlyExclusive() const noexcept;
+  bool markForEviction() noexcept;
+  RefcountWithFlags::Value unmarkForEviction() noexcept;
+  bool isMarkedForEviction() const noexcept;
+
+  /**
+   * The following functions correspond to whether or not an item is
+   * currently in the process of being moved. When moving, ref count
+   * is always >= 1.
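 * Editorial sketch of the lifecycle: markMoving(true) fails if the ref
 * count is non-zero; once it succeeds, the mover itself accounts for one
 * reference. A reader that encounters a moving item parks on a wait
 * context (see handleWithWaitContextForMovingItem()) until the mover calls
 * wakeUpWaiters() with either the new handle or an empty one.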
+   *
+   * An item can only be marked moving when `isInMMContainer` returns true
+   * and the item is not already exclusive or moving.
+   *
+   * User can also query if an item "isOnlyMoving". This returns true only
+   * if the refcount is one and only the exclusive bit is set.
+   *
+   * Unmarking moving does not depend on `isInMMContainer`
+   * Unmarking moving will also return the refcount at the moment of
+   * unmarking.
+   */
+  bool markMoving(bool failIfRefNotZero);
+  RefcountWithFlags::Value unmarkMoving() noexcept;
+  bool isMoving() const noexcept;
+  bool isOnlyMoving() const noexcept;
+
+  /** This function attempts to mark the item as exclusive.
+   * Can only be called on an item that is moving. */
+  bool markForEvictionWhenMoving();
 
   /**
    * Item cannot be marked both chained allocation and
@@ -448,6 +476,8 @@ class CACHELIB_PACKED_ATTR CacheItem {
   FRIEND_TEST(ItemTest, NonStringKey);
   template
   friend class facebook::cachelib::tests::AllocatorHitStatsTest;
+  template
+  friend class facebook::cachelib::tests::AllocatorMemoryTiersTest;
 };
 
 // A chained item has a hook pointing to the next chained item. The hook is
diff --git a/cachelib/allocator/CacheStats.cpp b/cachelib/allocator/CacheStats.cpp
index c16149df6b..b4770a3480 100644
--- a/cachelib/allocator/CacheStats.cpp
+++ b/cachelib/allocator/CacheStats.cpp
@@ -44,6 +44,8 @@ void Stats::init() {
   initToZero(*fragmentationSize);
   initToZero(*chainedItemEvictions);
   initToZero(*regularItemEvictions);
+
+  classAllocLatency = std::make_unique();
 }
 
 template
@@ -51,7 +53,7 @@ struct SizeVerify {};
 
 void Stats::populateGlobalCacheStats(GlobalCacheStats& ret) const {
 #ifndef SKIP_SIZE_VERIFY
-  SizeVerify a = SizeVerify<16176>{};
+  SizeVerify a = SizeVerify<16192>{};
   std::ignore = a;
 #endif
   ret.numCacheGets = numCacheGets.get();
diff --git a/cachelib/allocator/CacheStats.h b/cachelib/allocator/CacheStats.h
index fb9955b805..46c051be14 100644
--- a/cachelib/allocator/CacheStats.h
+++ b/cachelib/allocator/CacheStats.h
@@ -25,6 +25,7 @@
 #include "cachelib/allocator/memory/Slab.h"
 #include "cachelib/common/FastStats.h"
 #include "cachelib/common/PercentileStats.h"
+#include "cachelib/common/RollingStats.h"
 #include "cachelib/common/Time.h"
 
 namespace facebook {
 namespace cachelib {
@@ -289,6 +290,27 @@ struct ReaperStats {
   uint64_t avgTraversalTimeMs{0};
 };
 
+// Mover Stats
+struct BackgroundMoverStats {
+  // the number of items this worker moved by looking at pools/classes stats
+  uint64_t numMovedItems{0};
+  // number of times the background worker ran
+  uint64_t runCount{0};
+  // total number of classes
+  uint64_t totalClasses{0};
+  // total number of bytes moved
+  uint64_t totalBytesMoved{0};
+
+  BackgroundMoverStats& operator+=(const BackgroundMoverStats& rhs) {
+    numMovedItems += rhs.numMovedItems;
+    runCount += rhs.runCount;
+    totalClasses += rhs.totalClasses;
+    totalBytesMoved += rhs.totalBytesMoved;
+    return *this;
+  }
+};
+
 // CacheMetadata type to export
 struct CacheMetadata {
   // allocator_version
@@ -309,6 +331,11 @@ struct Stats;
 // Stats that apply globally in cache and
 // the ones that are aggregated over all pools
 struct GlobalCacheStats {
+  // background eviction stats
+  BackgroundMoverStats evictionStats;
+
+  BackgroundMoverStats promotionStats;
+
   // number of calls to CacheAllocator::find
   uint64_t numCacheGets{0};
 
diff --git a/cachelib/allocator/CacheStatsInternal.h b/cachelib/allocator/CacheStatsInternal.h
index b2a5f8c469..19a15fbbd4 100644
--- a/cachelib/allocator/CacheStatsInternal.h
+++ b/cachelib/allocator/CacheStatsInternal.h
@@ -21,6 +21,7 @@
 #include "cachelib/allocator/Cache.h"
 #include "cachelib/allocator/memory/MemoryAllocator.h"
 #include "cachelib/common/AtomicCounter.h"
+#include "cachelib/common/RollingStats.h"
 
 namespace facebook {
 namespace cachelib {
@@ -229,6 +230,14 @@ struct Stats {
   std::unique_ptr chainedItemEvictions{};
   std::unique_ptr regularItemEvictions{};
 
+  using PerTierPoolClassRollingStats = std::array<
+      std::array,
+                 MemoryPoolManager::kMaxPools>,
+      CacheBase::kMaxTiers>;
+
+  // rolling latency tracking for every alloc class in every pool
+  std::unique_ptr classAllocLatency{};
+
   // Eviction failures due to parent cannot be removed from access container
   AtomicCounter evictFailParentAC{0};
 
diff --git a/cachelib/allocator/FreeThresholdStrategy.cpp b/cachelib/allocator/FreeThresholdStrategy.cpp
new file mode 100644
index 0000000000..4a900c2cb1
--- /dev/null
+++ b/cachelib/allocator/FreeThresholdStrategy.cpp
@@ -0,0 +1,74 @@
+/*
+ * Copyright (c) Intel and its affiliates.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "cachelib/allocator/FreeThresholdStrategy.h"
+
+#include
+
+namespace facebook {
+namespace cachelib {
+
+FreeThresholdStrategy::FreeThresholdStrategy(double lowEvictionAcWatermark,
+                                             double highEvictionAcWatermark,
+                                             uint64_t maxEvictionBatch,
+                                             uint64_t minEvictionBatch)
+    : lowEvictionAcWatermark(lowEvictionAcWatermark),
+      highEvictionAcWatermark(highEvictionAcWatermark),
+      maxEvictionBatch(maxEvictionBatch),
+      minEvictionBatch(minEvictionBatch) {}
+
+std::vector FreeThresholdStrategy::calculateBatchSizes(
+    const CacheBase& cache,
+    std::vector acVec) {
+  std::vector batches{};
+  for (auto [tid, pid, cid] : acVec) {
+    auto stats = cache.getACStats(tid, pid, cid);
+    if ((1 - stats.usageFraction()) * 100 >= highEvictionAcWatermark) {
+      batches.push_back(0);
+    } else {
+      auto toFreeMemPercent = highEvictionAcWatermark - (1 - stats.usageFraction()) * 100;
+      auto toFreeItems = static_cast(
+          toFreeMemPercent * (stats.totalSlabs() * Slab::kSize) / stats.allocSize);
+      batches.push_back(toFreeItems);
+    }
+  }
+
+  if (batches.size() == 0) {
+    return batches;
+  }
+
+  auto maxBatch = *std::max_element(batches.begin(), batches.end());
+  if (maxBatch == 0)
+    return batches;
+
+  std::transform(
+      batches.begin(), batches.end(), batches.begin(), [&](auto numItems) {
+        if (numItems == 0) {
+          return 0UL;
+        }
+
+        auto cappedBatchSize = maxEvictionBatch * numItems / maxBatch;
+        if (cappedBatchSize < minEvictionBatch)
+          return minEvictionBatch;
+        else
+          return cappedBatchSize;
+      });
+
+  return batches;
+}
+
+} // namespace cachelib
+} // namespace facebook
diff --git a/cachelib/allocator/FreeThresholdStrategy.h b/cachelib/allocator/FreeThresholdStrategy.h
new file mode 100644
index 0000000000..94316bfe82
--- /dev/null
+++ b/cachelib/allocator/FreeThresholdStrategy.h
@@ -0,0 +1,46 @@
+/*
+ * Copyright (c) Facebook, Inc. and its affiliates.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "cachelib/allocator/BackgroundMoverStrategy.h"
+#include "cachelib/allocator/Cache.h"
+
+namespace facebook {
+namespace cachelib {
+
+// Background mover strategy that frees memory in an allocation class until
+// its free space reaches the configured high watermark.
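// Editorial worked example (illustrative numbers, assuming 4 MiB slabs):
// with highEvictionAcWatermark = 5.0 and an allocation class whose
// usageFraction() is 0.97 (3% free), toFreeMemPercent = 5 - 3 = 2. With
// totalSlabs() = 10 and allocSize = 4096, the raw batch is
//
//   2 * (10 * 4 MiB) / 4096 = 20480 items,
//
// after which all batches are rescaled against the largest one, so the
// final batch never exceeds maxEvictionBatch and non-zero batches never
// drop below minEvictionBatch.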
+class FreeThresholdStrategy : public BackgroundMoverStrategy {
+ public:
+  FreeThresholdStrategy(double lowEvictionAcWatermark,
+                        double highEvictionAcWatermark,
+                        uint64_t maxEvictionBatch,
+                        uint64_t minEvictionBatch);
+  ~FreeThresholdStrategy() {}
+
+  std::vector calculateBatchSizes(
+      const CacheBase& cache,
+      std::vector acVecs);
+
+ private:
+  double lowEvictionAcWatermark{2.0};
+  double highEvictionAcWatermark{5.0};
+  uint64_t maxEvictionBatch{40};
+  uint64_t minEvictionBatch{5};
+};
+
+} // namespace cachelib
+} // namespace facebook
diff --git a/cachelib/allocator/Handle.h b/cachelib/allocator/Handle.h
index 11d2bed2be..06c21bffe4 100644
--- a/cachelib/allocator/Handle.h
+++ b/cachelib/allocator/Handle.h
@@ -400,6 +400,12 @@ struct ReadHandleImpl {
     }
   }
 
+ protected:
+  friend class ReadHandleImpl;
+  // Method used only by ReadHandleImpl ctor
+  void discard() {
+    it_.store(nullptr, std::memory_order_relaxed);
+  }
  private:
   // we are waiting on Item* to be set to a value. One of the valid values is
   // nullptr. So choose something that we don't expect to indicate a ptr
@@ -479,7 +485,8 @@ struct ReadHandleImpl {
 
   // Handle which has the item already
   FOLLY_ALWAYS_INLINE ReadHandleImpl(Item* it, CacheT& alloc) noexcept
-      : alloc_(&alloc), it_(it) {}
+      : alloc_(&alloc), it_(it) {
+  }
 
   // handle that has a wait context allocated. Used for async handles
   // In this case, the it_ will be filled in asynchronously and multiple
diff --git a/cachelib/allocator/MM2Q-inl.h b/cachelib/allocator/MM2Q-inl.h
index ba388d40a4..33b356077d 100644
--- a/cachelib/allocator/MM2Q-inl.h
+++ b/cachelib/allocator/MM2Q-inl.h
@@ -14,6 +14,8 @@
  * limitations under the License.
  */
 
+#include
+
 namespace facebook {
 namespace cachelib {
 
@@ -104,6 +106,9 @@ bool MM2Q::Container::recordAccess(T& node,
       return false;
     }
 
+    if (config_.markUsefulChance < 100.0 && folly::Random::rand32() % 100 >= config_.markUsefulChance)
+      return false;
+
     return lruMutex_->lock_combine(func);
   }
   return false;
@@ -223,15 +228,32 @@ void MM2Q::Container::rebalance() noexcept {
 
 template T::*HookPtr>
 bool MM2Q::Container::add(T& node) noexcept {
   const auto currTime = static_cast