diff --git a/.github/workflows/build-cachelib-centos-8-1.yml b/.github/workflows/build-cachelib-centos-8-1.yml index 5eb1090b0a..3983e0c78b 100644 --- a/.github/workflows/build-cachelib-centos-8-1.yml +++ b/.github/workflows/build-cachelib-centos-8-1.yml @@ -14,6 +14,7 @@ name: build-cachelib-centos-8-1 on: # push: + pull_request: schedule: - cron: '0 11 * * 1,3,5' jobs: diff --git a/.github/workflows/build-cachelib-centos-8-5.yml b/.github/workflows/build-cachelib-centos-8-5.yml index 3ffee37765..4e6c2d12e1 100644 --- a/.github/workflows/build-cachelib-centos-8-5.yml +++ b/.github/workflows/build-cachelib-centos-8-5.yml @@ -14,6 +14,7 @@ name: build-cachelib-centos-8.5 on: # push: + pull_request: schedule: - cron: '0 9 * * *' jobs: diff --git a/.github/workflows/build-cachelib-centos-long.yml b/.github/workflows/build-cachelib-centos-long.yml new file mode 100644 index 0000000000..92165f603b --- /dev/null +++ b/.github/workflows/build-cachelib-centos-long.yml @@ -0,0 +1,39 @@ +name: build-cachelib-centos-latest +on: + schedule: + - cron: '0 7 * * *' + +jobs: + build-cachelib-centos8-latest: + name: "CentOS/latest - Build CacheLib with all dependencies" + runs-on: ubuntu-latest + # Docker container image name + container: "centos:latest" + steps: + - name: "update packages" + run: dnf upgrade -y + - name: "install sudo,git" + run: dnf install -y sudo git cmake gcc + - name: "System Information" + run: | + echo === uname === + uname -a + echo === /etc/os-release === + cat /etc/os-release + echo === df -hl === + df -hl + echo === free -h === + free -h + echo === top === + top -b -n1 -1 -Eg || timeout 1 top -b -n1 + echo === env === + env + echo === gcc -v === + gcc -v + - name: "checkout sources" + uses: actions/checkout@v2 + - name: "build CacheLib using build script" + run: ./contrib/build.sh -j -v -T + - name: "run tests" + timeout-minutes: 60 + run: cd opt/cachelib/tests && ../../../run_tests.sh long diff --git a/.github/workflows/build-cachelib-debian-10.yml b/.github/workflows/build-cachelib-debian-10.yml index c7c67e0724..7f0ab29a6c 100644 --- a/.github/workflows/build-cachelib-debian-10.yml +++ b/.github/workflows/build-cachelib-debian-10.yml @@ -14,6 +14,7 @@ name: build-cachelib-debian-10 on: # push: + pull_request: schedule: - cron: '0 13 * * *' jobs: @@ -51,6 +52,9 @@ jobs: g++ - || true - name: "checkout sources" uses: actions/checkout@v2 + - name: "Add Git safe directory" + # Workaround for Docker image bug (GitHub issue #199). 
+ run: git config --system --add safe.directory $GITHUB_WORKSPACE - name: "Install Prerequisites" run: ./contrib/build.sh -S -B - name: "Test: update-submodules" diff --git a/.github/workflows/build-cachelib-debian.yml b/.github/workflows/build-cachelib-debian.yml new file mode 100644 index 0000000000..5bc3ad3c70 --- /dev/null +++ b/.github/workflows/build-cachelib-debian.yml @@ -0,0 +1,43 @@ +name: build-cachelib-debian-10 +on: + schedule: + - cron: '30 5 * * 0,3' + +jobs: + build-cachelib-debian-10: + name: "Debian/Buster - Build CacheLib with all dependencies" + runs-on: ubuntu-latest + # Docker container image name + container: "debian:buster-slim" + steps: + - name: "update packages" + run: apt-get update + - name: "upgrade packages" + run: apt-get -y upgrade + - name: "install sudo,git" + run: apt-get install -y sudo git procps + - name: "System Information" + run: | + echo === uname === + uname -a + echo === /etc/os-release === + cat /etc/os-release + echo === df -hl === + df -hl + echo === free -h === + free -h + echo === top === + top -b -n1 -1 -Eg || timeout 1 top -b -n1 ; true + echo === env === + env + echo === cc -v === + cc -v || true + echo === g++ -v === + g++ - || true + - name: "checkout sources" + uses: actions/checkout@v2 + - name: "build CacheLib using build script" + run: ./contrib/build.sh -j -v -T + - name: "run tests" + timeout-minutes: 60 + run: cd opt/cachelib/tests && ../../../run_tests.sh diff --git a/.github/workflows/build-cachelib-docker.yml b/.github/workflows/build-cachelib-docker.yml new file mode 100644 index 0000000000..be28bc233c --- /dev/null +++ b/.github/workflows/build-cachelib-docker.yml @@ -0,0 +1,49 @@ +name: build-cachelib-docker +on: + push: + pull_request: + +jobs: + build-cachelib-docker: + name: "CentOS/latest - Build CacheLib with all dependencies" + runs-on: ubuntu-latest + env: + REPO: cachelib + GITHUB_REPO: intel/CacheLib + CONTAINER_REG: ghcr.io/pmem/cachelib + CONTAINER_REG_USER: ${{ secrets.GH_CR_USER }} + CONTAINER_REG_PASS: ${{ secrets.GH_CR_PAT }} + FORCE_IMAGE_ACTION: ${{ secrets.FORCE_IMAGE_ACTION }} + HOST_WORKDIR: ${{ github.workspace }} + WORKDIR: docker + IMG_VER: devel + strategy: + matrix: + CONFIG: ["OS=centos OS_VER=8streams PUSH_IMAGE=1"] + steps: + - name: "System Information" + run: | + echo === uname === + uname -a + echo === /etc/os-release === + cat /etc/os-release + echo === df -hl === + df -hl + echo === free -h === + free -h + echo === top === + top -b -n1 -1 -Eg || timeout 1 top -b -n1 + echo === env === + env + echo === gcc -v === + gcc -v + - name: "checkout sources" + uses: actions/checkout@v2 + with: + fetch-depth: 0 + + - name: Pull the image or rebuild and push it + run: cd $WORKDIR && ${{ matrix.CONFIG }} ./pull-or-rebuild-image.sh $FORCE_IMAGE_ACTION + + - name: Run the build + run: cd $WORKDIR && ${{ matrix.CONFIG }} ./build.sh diff --git a/.github/workflows/build-cachelib-fedora-36.yml b/.github/workflows/build-cachelib-fedora-36.yml index 216dbf5841..f8c0424400 100644 --- a/.github/workflows/build-cachelib-fedora-36.yml +++ b/.github/workflows/build-cachelib-fedora-36.yml @@ -14,6 +14,7 @@ name: build-cachelib-fedora-36 on: # push: + pull_request: schedule: - cron: '0 19 * * *' jobs: diff --git a/.github/workflows/build-cachelib-rockylinux-8.yml b/.github/workflows/build-cachelib-rockylinux-8.yml index 879dc27566..c8af12327d 100644 --- a/.github/workflows/build-cachelib-rockylinux-8.yml +++ b/.github/workflows/build-cachelib-rockylinux-8.yml @@ -14,6 +14,7 @@ name: build-cachelib-rockylinux-8.6 on: 
# push: + pull_request: schedule: - cron: '0 15 * * 2,4,6' jobs: diff --git a/.github/workflows/build-cachelib-rockylinux-9.yml b/.github/workflows/build-cachelib-rockylinux-9.yml index f6a86d75a0..e26eac6ff1 100644 --- a/.github/workflows/build-cachelib-rockylinux-9.yml +++ b/.github/workflows/build-cachelib-rockylinux-9.yml @@ -14,6 +14,7 @@ name: build-cachelib-rockylinux-9.0 on: # push: + pull_request: schedule: - cron: '0 17 * * *' jobs: diff --git a/.github/workflows/build-cachelib-ubuntu-18.yml b/.github/workflows/build-cachelib-ubuntu-18.yml index fad34c0897..ad068278a4 100644 --- a/.github/workflows/build-cachelib-ubuntu-18.yml +++ b/.github/workflows/build-cachelib-ubuntu-18.yml @@ -19,6 +19,7 @@ name: build-cachelib-ubuntu-18 on: # push: + pull_request: schedule: - cron: '0 5 * * 2,4,6' jobs: diff --git a/.github/workflows/build-cachelib-ubuntu-20.yml b/.github/workflows/build-cachelib-ubuntu-20.yml index 35a3f507e2..a8380fdb96 100644 --- a/.github/workflows/build-cachelib-ubuntu-20.yml +++ b/.github/workflows/build-cachelib-ubuntu-20.yml @@ -15,6 +15,7 @@ name: build-cachelib-ubuntu-20 on: # push: + pull_request: schedule: - cron: '0 5 * * 1,3,5' jobs: diff --git a/.github/workflows/build-cachelib-ubuntu-22.yml b/.github/workflows/build-cachelib-ubuntu-22.yml index b4374a5b96..4db194431d 100644 --- a/.github/workflows/build-cachelib-ubuntu-22.yml +++ b/.github/workflows/build-cachelib-ubuntu-22.yml @@ -15,6 +15,7 @@ name: build-cachelib-ubuntu-22 on: # push: + pull_request: schedule: - cron: '0 7 * * *' jobs: diff --git a/.github/workflows/clang-format-check.yml b/.github/workflows/clang-format-check.yml index 4b4897b610..90c8d739c6 100644 --- a/.github/workflows/clang-format-check.yml +++ b/.github/workflows/clang-format-check.yml @@ -1,6 +1,6 @@ # From: https://github.com/marketplace/actions/clang-format-check#multiple-paths name: clang-format Check -on: [pull_request] +on: [] jobs: formatting-check: name: Formatting Check diff --git a/MultiTierDataMovement.md b/MultiTierDataMovement.md new file mode 100644 index 0000000000..cccc14b947 --- /dev/null +++ b/MultiTierDataMovement.md @@ -0,0 +1,90 @@ +# Background Data Movement + +In order to reduce the number of online evictions and support asynchronous +promotion - we have added two periodic workers to handle eviction and promotion. + +The diagram below shows a simplified version of how the background evictor +thread (green) is integrated to the CacheLib architecture. + +
+*(Figure: diagram of the background evictor (BackgroundEvictor) thread integrated into the CacheLib architecture.)*
+
+## Background Evictors
+
+The background evictors scan each class to see if there are objects to move to the
+next (lower) tier using a given strategy. Here we document the general parameters
+and the parameters for the different strategies.
+
+- `backgroundEvictorIntervalMilSec`: The interval at which this thread runs. By default,
+the background evictor threads wake up every 10 ms to scan the AllocationClasses. In
+addition, the background evictor thread is woken up every time an allocation fails (in
+a request handling thread) while the current percentage of free memory for the
+AllocationClass is lower than `lowEvictionAcWatermark`. This may make the interval
+parameter less important when many allocations occur from request handling threads.
+
+- `evictorThreads`: The number of background evictors to run. Each thread is assigned
+a set of AllocationClasses to scan and evict objects from. Currently, each thread gets
+an equal number of classes to scan, but since the object size distribution may be
+unequal, future versions will attempt to balance the classes among threads. The range
+is 1 to the number of AllocationClasses. The default is 1.
+
+- `maxEvictionBatch`: The number of objects to remove in a given eviction call. The
+default is 40; the lower bound is 10 and the upper bound is 1000. Too low and we might
+not remove objects at a reasonable rate; too high and it might increase contention with
+user threads.
+
+- `minEvictionBatch`: Minimum number of items to evict at any time (if there are any
+candidates).
+
+- `maxEvictionPromotionHotness`: Maximum number of candidates to consider for eviction.
+This is similar to `maxEvictionBatch`, but it specifies how many candidates will be
+taken into consideration, not the actual number of items to evict. This option can be
+used to configure the duration of the critical section on the LRU lock.
+
+### FreeThresholdStrategy (default)
+
+- `lowEvictionAcWatermark`: Triggers the background eviction thread to run when this
+percentage of the AllocationClass is free. The default is `2.0`; to avoid wasting
+capacity we don't set this above `10.0`.
+
+- `highEvictionAcWatermark`: Stop evictions from an AllocationClass when this
+percentage of the AllocationClass is free. The default is `5.0`; to avoid wasting
+capacity we don't set this above `10.0`.
+
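+These two watermarks form a hysteresis band around a class's free space. A minimal
+sketch of how they interact (illustrative C++; the function names are assumptions for
+this example, not part of the CacheLib API):
+
+```cpp
+// Both thresholds are percentages of free memory in one AllocationClass.
+bool shouldStartEvicting(double freePct, double lowWatermark /* default 2.0 */) {
+  // Background eviction kicks in once free space drops to the low watermark.
+  return freePct <= lowWatermark;
+}
+
+bool shouldStopEvicting(double freePct, double highWatermark /* default 5.0 */) {
+  // ...and stops once the class has recovered to the high watermark.
+  return freePct >= highWatermark;
+}
+```
+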
+## Background Promoters
+
+The background promoters scan each class to see if there are objects to move to the
+next (upper) tier using a given strategy. Here we document the general parameters
+and the parameters for the different strategies.
+
+- `backgroundPromoterIntervalMilSec`: The interval at which this thread runs. By
+default, the background promoter threads wake up every 10 ms to scan the
+AllocationClasses for objects to promote.
+
+- `promoterThreads`: The number of background promoters to run. Each thread is assigned
+a set of AllocationClasses to scan and promote objects from. Currently, each thread
+gets an equal number of classes to scan, but since the object size distribution may be
+unequal, future versions will attempt to balance the classes among threads. The range
+is `1` to the number of AllocationClasses. The default is `1`.
+
+- `maxPromotionBatch`: The number of objects to promote in a given promotion call. The
+default is 40; the lower bound is 10 and the upper bound is 1000. Too low and we might
+not promote objects at a reasonable rate; too high and it might increase contention
+with user threads.
+
+- `minPromotionBatch`: Minimum number of items to promote at any time (if there are any
+candidates).
+
+- `numDuplicateElements`: This allows us to promote items that have existing (read-only)
+handles, since we won't need to modify the data while a user is still reading it.
+Therefore, for a short time the data could reside in both tiers until it is evicted from
+its current tier. The default is to not allow this (0). Setting the value to 100 will
+enable duplicate elements in tiers.
+
+### Background Promotion Strategy (only one currently)
+
+- `promotionAcWatermark`: Promote items if at least this percentage of the
+AllocationClass is free. The promotion thread will attempt to move `maxPromotionBatch`
+objects to that tier. The objects are chosen from the head of the LRU. The default is
+`4.0`. This value should correlate with `lowEvictionAcWatermark`,
+`highEvictionAcWatermark`, `minAcAllocationWatermark`, `maxAcAllocationWatermark`.
+- `maxPromotionBatch`: The number of objects to promote in a batch during background
+promotion. Analogous to `maxEvictionBatch`. Its value should be lower to decrease
+contention on hot items.
+
 diff --git a/README.md b/README.md
index e05c932d1e..523b8b20a1 100644
--- a/README.md
+++ b/README.md
@@ -50,7 +50,7 @@ cd CacheLib
 Re-running `./contrib/build.sh` will update CacheLib and its dependencies
 to their latest versions and rebuild them.
 
-See [build](https://cachelib.org/docs/installation/installation) for more details about
+See [build](https://cachelib.org/docs/installation/) for more details about
 the building and installation process.
 
diff --git a/cachelib/CMakeLists.txt b/cachelib/CMakeLists.txt
index 36df0dc19f..e77c25085c 100644
--- a/cachelib/CMakeLists.txt
+++ b/cachelib/CMakeLists.txt
@@ -85,6 +85,11 @@ set(CMAKE_MODULE_PATH
 set(CMAKE_CXX_STANDARD 17)
 set(CMAKE_CXX_STANDARD_REQUIRED True)
 
+if(COVERAGE_ENABLED)
+  # Add code coverage
+  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} --coverage -fprofile-arcs -ftest-coverage")
+endif()
+
 # include(fb_cxx_flags)
 message(STATUS "Update CXXFLAGS: ${CMAKE_CXX_FLAGS}")
 
diff --git a/cachelib/allocator/BackgroundMover-inl.h b/cachelib/allocator/BackgroundMover-inl.h
new file mode 100644
index 0000000000..b77436635f
--- /dev/null
+++ b/cachelib/allocator/BackgroundMover-inl.h
@@ -0,0 +1,112 @@
+/*
+ * Copyright (c) Intel and its affiliates.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */ + +namespace facebook { +namespace cachelib { + +template +BackgroundMover::BackgroundMover( + Cache& cache, + std::shared_ptr strategy, + MoverDir direction) + : cache_(cache), strategy_(strategy), direction_(direction) { + if (direction_ == MoverDir::Evict) { + moverFunc = BackgroundMoverAPIWrapper::traverseAndEvictItems; + + } else if (direction_ == MoverDir::Promote) { + moverFunc = BackgroundMoverAPIWrapper::traverseAndPromoteItems; + } +} + +template +BackgroundMover::~BackgroundMover() { + stop(std::chrono::seconds(0)); +} + +template +void BackgroundMover::work() { + try { + checkAndRun(); + } catch (const std::exception& ex) { + XLOGF(ERR, "BackgroundMover interrupted due to exception: {}", ex.what()); + } +} + +template +void BackgroundMover::setAssignedMemory( + std::vector&& assignedMemory) { + XLOG(INFO, "Class assigned to background worker:"); + for (auto [tid, pid, cid] : assignedMemory) { + XLOGF(INFO, "Tid: {}, Pid: {}, Cid: {}", tid, pid, cid); + } + + mutex.lock_combine([this, &assignedMemory] { + this->assignedMemory_ = std::move(assignedMemory); + }); +} + +// Look for classes that exceed the target memory capacity +// and return those for eviction +template +void BackgroundMover::checkAndRun() { + auto assignedMemory = mutex.lock_combine([this] { return assignedMemory_; }); + + unsigned int moves = 0; + std::set classes{}; + auto batches = strategy_->calculateBatchSizes(cache_, assignedMemory); + + for (size_t i = 0; i < batches.size(); i++) { + const auto [tid, pid, cid] = assignedMemory[i]; + const auto batch = batches[i]; + + classes.insert(cid); + const auto& mpStats = cache_.getPoolByTid(pid, tid).getStats(); + + if (!batch) { + continue; + } + + // try moving BATCH items from the class in order to reach free target + auto moved = moverFunc(cache_, tid, pid, cid, batch); + moves += moved; + moves_per_class_[tid][pid][cid] += moved; + totalBytesMoved.add(moved * mpStats.acStats.at(cid).allocSize); + } + + numTraversals.inc(); + numMovedItems.add(moves); + totalClasses.add(classes.size()); +} + +template +BackgroundMoverStats BackgroundMover::getStats() const noexcept { + BackgroundMoverStats stats; + stats.numMovedItems = numMovedItems.get(); + stats.runCount = numTraversals.get(); + stats.totalBytesMoved = totalBytesMoved.get(); + stats.totalClasses = totalClasses.get(); + + return stats; +} + +template +std::map>> +BackgroundMover::getClassStats() const noexcept { + return moves_per_class_; +} + +} // namespace cachelib +} // namespace facebook diff --git a/cachelib/allocator/BackgroundMover.h b/cachelib/allocator/BackgroundMover.h new file mode 100644 index 0000000000..1246676d6e --- /dev/null +++ b/cachelib/allocator/BackgroundMover.h @@ -0,0 +1,103 @@ +/* + * Copyright (c) Intel and its affiliates. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include "cachelib/allocator/BackgroundMoverStrategy.h" +#include "cachelib/allocator/CacheStats.h" +#include "cachelib/common/AtomicCounter.h" +#include "cachelib/common/PeriodicWorker.h" + +namespace facebook { +namespace cachelib { + +// wrapper that exposes the private APIs of CacheType that are specifically +// needed for the cache api +template +struct BackgroundMoverAPIWrapper { + static size_t traverseAndEvictItems(C& cache, + unsigned int tid, + unsigned int pid, + unsigned int cid, + size_t batch) { + return cache.traverseAndEvictItems(tid, pid, cid, batch); + } + + static size_t traverseAndPromoteItems(C& cache, + unsigned int tid, + unsigned int pid, + unsigned int cid, + size_t batch) { + return cache.traverseAndPromoteItems(tid, pid, cid, batch); + } +}; + +enum class MoverDir { Evict = 0, Promote }; + +// Periodic worker that evicts items from tiers in batches +// The primary aim is to reduce insertion times for new items in the +// cache +template +class BackgroundMover : public PeriodicWorker { + public: + using Cache = CacheT; + // @param cache the cache interface + // @param strategy the stragey class that defines how objects are + // moved, + // (promoted vs. evicted and how much) + BackgroundMover(Cache& cache, + std::shared_ptr strategy, + MoverDir direction_); + + ~BackgroundMover() override; + + BackgroundMoverStats getStats() const noexcept; + std::map>> + getClassStats() const noexcept; + + void setAssignedMemory( + std::vector&& assignedMemory); + + private: + std::map>> + moves_per_class_; + // cache allocator's interface for evicting + using Item = typename Cache::Item; + + Cache& cache_; + std::shared_ptr strategy_; + MoverDir direction_; + + std::function + moverFunc; + + // implements the actual logic of running the background evictor + void work() override final; + void checkAndRun(); + + AtomicCounter numMovedItems{0}; + AtomicCounter numTraversals{0}; + AtomicCounter totalClasses{0}; + AtomicCounter totalBytesMoved{0}; + + std::vector assignedMemory_; + folly::DistributedMutex mutex; +}; +} // namespace cachelib +} // namespace facebook + +#include "cachelib/allocator/BackgroundMover-inl.h" diff --git a/cachelib/allocator/BackgroundMoverStrategy.h b/cachelib/allocator/BackgroundMoverStrategy.h new file mode 100644 index 0000000000..7706a625a5 --- /dev/null +++ b/cachelib/allocator/BackgroundMoverStrategy.h @@ -0,0 +1,42 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include "cachelib/allocator/Cache.h" + + +namespace facebook { +namespace cachelib { + +struct MemoryDescriptorType { + MemoryDescriptorType(TierId tid, PoolId pid, ClassId cid) : + tid_(tid), pid_(pid), cid_(cid) {} + TierId tid_; + PoolId pid_; + ClassId cid_; +}; + +// Base class for background eviction strategy. 
+class BackgroundMoverStrategy { + public: + virtual std::vector calculateBatchSizes( + const CacheBase& cache, + std::vector acVec) = 0; +}; + +} // namespace cachelib +} // namespace facebook diff --git a/cachelib/allocator/CCacheAllocator.cpp b/cachelib/allocator/CCacheAllocator.cpp index 2709bde377..dd1986114b 100644 --- a/cachelib/allocator/CCacheAllocator.cpp +++ b/cachelib/allocator/CCacheAllocator.cpp @@ -36,7 +36,9 @@ CCacheAllocator::CCacheAllocator(MemoryAllocator& allocator, currentChunksIndex_(0) { auto& currentChunks = chunks_[currentChunksIndex_]; for (auto chunk : *object.chunks()) { - currentChunks.push_back(allocator_.unCompress(CompressedPtr(chunk))); + // TODO : pass multi-tier flag when compact cache supports multi-tier config + currentChunks.push_back( + allocator_.unCompress(CompressedPtr(chunk), false /* isMultiTier */)); } } @@ -97,7 +99,9 @@ CCacheAllocator::SerializationType CCacheAllocator::saveState() { std::lock_guard guard(resizeLock_); for (auto chunk : getCurrentChunks()) { - object.chunks()->push_back(allocator_.compress(chunk).saveState()); + // TODO : pass multi-tier flag when compact cache supports multi-tier config + object.chunks()->push_back( + allocator_.compress(chunk, false /* isMultiTier */).saveState()); } return object; } diff --git a/cachelib/allocator/CMakeLists.txt b/cachelib/allocator/CMakeLists.txt index 78cfa7ca06..6103cdc823 100644 --- a/cachelib/allocator/CMakeLists.txt +++ b/cachelib/allocator/CMakeLists.txt @@ -35,6 +35,7 @@ add_library (cachelib_allocator CCacheManager.cpp ContainerTypes.cpp FreeMemStrategy.cpp + FreeThresholdStrategy.cpp HitsPerSlabStrategy.cpp LruTailAgeStrategy.cpp MarginalHitsOptimizeStrategy.cpp @@ -117,6 +118,8 @@ if (BUILD_TESTS) add_test (tests/ChainedHashTest.cpp) add_test (tests/AllocatorResizeTypeTest.cpp) add_test (tests/AllocatorHitStatsTypeTest.cpp) + add_test (tests/AllocatorMemoryTiersTest.cpp) + add_test (tests/MemoryTiersTest.cpp) add_test (tests/MultiAllocatorTest.cpp) add_test (tests/NvmAdmissionPolicyTest.cpp) add_test (tests/CacheAllocatorConfigTest.cpp) diff --git a/cachelib/allocator/Cache.h b/cachelib/allocator/Cache.h index e225ba8a01..c871358189 100644 --- a/cachelib/allocator/Cache.h +++ b/cachelib/allocator/Cache.h @@ -85,6 +85,9 @@ class CacheBase { CacheBase(CacheBase&&) = default; CacheBase& operator=(CacheBase&&) = default; + // TODO: come up with some reasonable number + static constexpr unsigned kMaxTiers = 2; + // Get a string referring to the cache name for this cache virtual const std::string getCacheName() const = 0; @@ -95,6 +98,12 @@ class CacheBase { // // @param poolId The pool id to query virtual const MemoryPool& getPool(PoolId poolId) const = 0; + + // Get the reference to a memory pool using a tier id, for stats purposes + // + // @param poolId The pool id to query + // @param tierId The tier of the pool id + virtual const MemoryPool& getPoolByTid(PoolId poolId, TierId tid) const = 0; // Get Pool specific stats (regular pools). This includes stats from the // Memory Pool and also the cache. @@ -102,6 +111,12 @@ class CacheBase { // @param poolId the pool id virtual PoolStats getPoolStats(PoolId poolId) const = 0; + // Get Allocation Class specific stats. 
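+  // @param tid the tier id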
+ // + // @param poolId the pool id + // @param classId the class id + virtual ACStats getACStats(TierId tid, PoolId poolId, ClassId classId) const = 0; + // @param poolId the pool id virtual AllSlabReleaseEvents getAllSlabReleaseEvents(PoolId poolId) const = 0; diff --git a/cachelib/allocator/CacheAllocator-inl.h b/cachelib/allocator/CacheAllocator-inl.h index 1d89593268..01a5f5e49e 100644 --- a/cachelib/allocator/CacheAllocator-inl.h +++ b/cachelib/allocator/CacheAllocator-inl.h @@ -16,6 +16,8 @@ #pragma once +#include + namespace facebook { namespace cachelib { @@ -35,6 +37,7 @@ CacheAllocator::CacheAllocator(SharedMemNewT, Config config) template CacheAllocator::CacheAllocator(SharedMemAttachT, Config config) : CacheAllocator(InitMemType::kMemAttach, config) { + /* TODO - per tier? */ for (auto pid : *metadata_.compactCachePools()) { isCompactCachePool_[pid] = true; } @@ -53,12 +56,13 @@ CacheAllocator::CacheAllocator( : isOnShm_{type != InitMemType::kNone ? true : config.memMonitoringEnabled()}, config_(config.validate()), + memoryTierConfigs(config.getMemoryTierConfigs()), tempShm_(type == InitMemType::kNone && isOnShm_ - ? std::make_unique(config_.size) + ? std::make_unique(config_.getCacheSize()) : nullptr), shmManager_(type != InitMemType::kNone ? std::make_unique(config_.cacheDir, - config_.usePosixShm) + config_.isUsingPosixShm()) : nullptr), deserializer_(type == InitMemType::kMemAttach ? createDeserializer() : nullptr), @@ -67,12 +71,12 @@ CacheAllocator::CacheAllocator( : serialization::CacheAllocatorMetadata{}}, allocator_(initAllocator(type)), compactCacheManager_(type != InitMemType::kMemAttach - ? std::make_unique(*allocator_) - : restoreCCacheManager()), + ? std::make_unique(*allocator_[0] /* TODO: per tier */) + : restoreCCacheManager(0/* TODO: per tier */)), compressor_(createPtrCompressor()), mmContainers_(type == InitMemType::kMemAttach ? deserializeMMContainers(*deserializer_, compressor_) - : MMContainers{}), + : MMContainers{getNumTiers()}), accessContainer_(initAccessContainer( type, detail::kShmHashTableName, config.accessConfig)), chainedItemAccessContainer_( @@ -81,6 +85,8 @@ CacheAllocator::CacheAllocator( config.chainedItemAccessConfig)), chainedItemLocks_(config_.chainedItemsLockPower, std::make_shared()), + movesMap_(kShards), + moveLock_(kShards), cacheCreationTime_{ type != InitMemType::kMemAttach ? 
util::getCurrentTimeSec() @@ -105,48 +111,98 @@ CacheAllocator::~CacheAllocator() { } template -ShmSegmentOpts CacheAllocator::createShmCacheOpts() { +ShmSegmentOpts CacheAllocator::createShmCacheOpts(TierId tid) { ShmSegmentOpts opts; opts.alignment = sizeof(Slab); auto memoryTierConfigs = config_.getMemoryTierConfigs(); // TODO: we support single tier so far - XDCHECK_EQ(memoryTierConfigs.size(), 1ul); - opts.memBindNumaNodes = memoryTierConfigs[0].getMemBind(); - + if (memoryTierConfigs.size() > 2) { + throw std::invalid_argument("CacheLib only supports two memory tiers"); + } + opts.memBindNumaNodes = memoryTierConfigs[tid].getMemBind(); return opts; } +template +size_t CacheAllocator::memoryTierSize(TierId tid) const { + auto partitions = std::accumulate(memoryTierConfigs.begin(), memoryTierConfigs.end(), 0UL, + [](const size_t i, const MemoryTierCacheConfig& config){ + return i + config.getRatio(); + }); + + return memoryTierConfigs[tid].calculateTierSize(config_.getCacheSize(), partitions); +} + +template +std::vector> +CacheAllocator::createPrivateAllocator() { + std::vector> allocators; + + if (isOnShm_) + allocators.emplace_back(std::make_unique( + getAllocatorConfig(config_), + tempShm_->getAddr(), + config_.getCacheSize())); + else + allocators.emplace_back(std::make_unique( + getAllocatorConfig(config_), + config_.getCacheSize())); + + return allocators; +} + template std::unique_ptr -CacheAllocator::createNewMemoryAllocator() { +CacheAllocator::createNewMemoryAllocator(TierId tid) { + size_t tierSize = memoryTierSize(tid); return std::make_unique( getAllocatorConfig(config_), shmManager_ - ->createShm(detail::kShmCacheName, config_.size, - config_.slabMemoryBaseAddr, createShmCacheOpts()) + ->createShm(detail::kShmCacheName + std::to_string(tid), + tierSize, config_.slabMemoryBaseAddr, + createShmCacheOpts(tid)) .addr, - config_.size); + tierSize); } template std::unique_ptr -CacheAllocator::restoreMemoryAllocator() { +CacheAllocator::restoreMemoryAllocator(TierId tid) { return std::make_unique( deserializer_->deserialize(), shmManager_ - ->attachShm(detail::kShmCacheName, config_.slabMemoryBaseAddr, - createShmCacheOpts()) - .addr, - config_.size, + ->attachShm(detail::kShmCacheName + std::to_string(tid), + config_.slabMemoryBaseAddr, createShmCacheOpts(tid)).addr, + memoryTierSize(tid), config_.disableFullCoredump); } +template +std::vector> +CacheAllocator::createAllocators() { + std::vector> allocators; + for (int tid = 0; tid < getNumTiers(); tid++) { + allocators.emplace_back(createNewMemoryAllocator(tid)); + } + return allocators; +} + +template +std::vector> +CacheAllocator::restoreAllocators() { + std::vector> allocators; + for (int tid = 0; tid < getNumTiers(); tid++) { + allocators.emplace_back(restoreMemoryAllocator(tid)); + } + return allocators; +} + template std::unique_ptr -CacheAllocator::restoreCCacheManager() { +CacheAllocator::restoreCCacheManager(TierId tid) { return std::make_unique( deserializer_->deserialize(), - *allocator_); + *allocator_[tid]); } template @@ -235,23 +291,30 @@ void CacheAllocator::initWorkers() { config_.poolOptimizeStrategy, config_.ccacheOptimizeStepSizePercent); } + + if (config_.backgroundEvictorEnabled()) { + startNewBackgroundEvictor(config_.backgroundEvictorInterval, + config_.backgroundEvictorStrategy, + config_.backgroundEvictorThreads); + } + + if (config_.backgroundPromoterEnabled()) { + startNewBackgroundPromoter(config_.backgroundPromoterInterval, + config_.backgroundPromoterStrategy, + 
config_.backgroundPromoterThreads); + } } template -std::unique_ptr CacheAllocator::initAllocator( +std::vector> +CacheAllocator::initAllocator( InitMemType type) { if (type == InitMemType::kNone) { - if (isOnShm_ == true) { - return std::make_unique( - getAllocatorConfig(config_), tempShm_->getAddr(), config_.size); - } else { - return std::make_unique(getAllocatorConfig(config_), - config_.size); - } + return createPrivateAllocator(); } else if (type == InitMemType::kMemNew) { - return createNewMemoryAllocator(); + return createAllocators(); } else if (type == InitMemType::kMemAttach) { - return restoreMemoryAllocator(); + return restoreAllocators(); } // Invalid type @@ -318,13 +381,31 @@ CacheAllocator::allocate(PoolId poolId, ttlSecs == 0 ? 0 : creationTime + ttlSecs); } +template +bool CacheAllocator::shouldWakeupBgEvictor(TierId tid, PoolId pid, ClassId cid) { + // TODO: should we also work on lower tiers? should we have separate set of params? + if (tid == 1) return false; + return (1-getACStats(tid, pid, cid).usageFraction())*100 <= config_.lowEvictionAcWatermark; +} + +template +size_t CacheAllocator::backgroundWorkerId(TierId tid, PoolId pid, ClassId cid, size_t numWorkers) { + XDCHECK(numWorkers); + + // TODO: came up with some better sharding (use some hashing) + return (tid + pid + cid) % numWorkers; +} + + template typename CacheAllocator::WriteHandle -CacheAllocator::allocateInternal(PoolId pid, - typename Item::Key key, - uint32_t size, - uint32_t creationTime, - uint32_t expiryTime) { +CacheAllocator::allocateInternalTier(TierId tid, + PoolId pid, + typename Item::Key key, + uint32_t size, + uint32_t creationTime, + uint32_t expiryTime, + bool fromBgThread) { util::LatencyTracker tracker{stats().allocateLatency_}; SCOPE_FAIL { stats_.invalidAllocs.inc(); }; @@ -333,13 +414,21 @@ CacheAllocator::allocateInternal(PoolId pid, const auto requiredSize = Item::getRequiredSize(key, size); // the allocation class in our memory allocator. - const auto cid = allocator_->getAllocationClassId(pid, requiredSize); + const auto cid = allocator_[tid]->getAllocationClassId(pid, requiredSize); + util::RollingLatencyTracker rollTracker{ + (*stats_.classAllocLatency)[tid][pid][cid]}; + // TODO: per-tier (*stats_.allocAttempts)[pid][cid].inc(); - - void* memory = allocator_->allocate(pid, requiredSize); + + void* memory = allocator_[tid]->allocate(pid, requiredSize); + + if (backgroundEvictor_.size() && !fromBgThread && (memory == nullptr || shouldWakeupBgEvictor(tid, pid, cid))) { + backgroundEvictor_[backgroundWorkerId(tid, pid, cid, backgroundEvictor_.size())]->wakeUp(); + } + if (memory == nullptr) { - memory = findEviction(pid, cid); + memory = findEviction(tid, pid, cid); } WriteHandle handle; @@ -350,7 +439,7 @@ CacheAllocator::allocateInternal(PoolId pid, // for example. SCOPE_FAIL { // free back the memory to the allocator since we failed. - allocator_->free(memory); + allocator_[tid]->free(memory); }; handle = acquire(new (memory) Item(key, size, creationTime, expiryTime)); @@ -361,7 +450,7 @@ CacheAllocator::allocateInternal(PoolId pid, } } else { // failed to allocate memory. 
- (*stats_.allocFailures)[pid][cid].inc(); + (*stats_.allocFailures)[pid][cid].inc(); // TODO: per-tier // wake up rebalancer if (poolRebalancer_) { poolRebalancer_->wakeUp(); @@ -378,6 +467,22 @@ CacheAllocator::allocateInternal(PoolId pid, return handle; } +template +typename CacheAllocator::WriteHandle +CacheAllocator::allocateInternal(PoolId pid, + typename Item::Key key, + uint32_t size, + uint32_t creationTime, + uint32_t expiryTime, + bool fromBgThread) { + auto tid = 0; /* TODO: consult admission policy */ + for(TierId tid = 0; tid < getNumTiers(); ++tid) { + auto handle = allocateInternalTier(tid, pid, key, size, creationTime, expiryTime, fromBgThread); + if (handle) return handle; + } + return {}; +} + template typename CacheAllocator::WriteHandle CacheAllocator::allocateChainedItem(const ReadHandle& parent, @@ -408,21 +513,29 @@ CacheAllocator::allocateChainedItemInternal( // number of bytes required for this item const auto requiredSize = ChainedItem::getRequiredSize(size); - const auto pid = allocator_->getAllocInfo(parent->getMemory()).poolId; - const auto cid = allocator_->getAllocationClassId(pid, requiredSize); + // TODO: is this correct? + auto tid = getTierId(*parent); + + const auto pid = allocator_[tid]->getAllocInfo(parent->getMemory()).poolId; + const auto cid = allocator_[tid]->getAllocationClassId(pid, requiredSize); + util::RollingLatencyTracker rollTracker{ + (*stats_.classAllocLatency)[tid][pid][cid]}; + + // TODO: per-tier? Right now stats_ are not used in any public periodic + // worker (*stats_.allocAttempts)[pid][cid].inc(); - void* memory = allocator_->allocate(pid, requiredSize); + void* memory = allocator_[tid]->allocate(pid, requiredSize); if (memory == nullptr) { - memory = findEviction(pid, cid); + memory = findEviction(tid, pid, cid); } if (memory == nullptr) { (*stats_.allocFailures)[pid][cid].inc(); return WriteHandle{}; } - SCOPE_FAIL { allocator_->free(memory); }; + SCOPE_FAIL { allocator_[tid]->free(memory); }; auto child = acquire( new (memory) ChainedItem(compressor_.compress(parent.getInternal()), size, @@ -466,14 +579,15 @@ void CacheAllocator::addChainedItem(WriteHandle& parent, // Count a new child stats_.numChainedChildItems.inc(); - insertInMMContainer(*child); - // Increment refcount since this chained item is now owned by the parent // Parent will decrement the refcount upon release. Since this is an // internal refcount, we dont include it in active handle tracking. - child->incRef(); + auto ret = child->incRef(true); + XDCHECK(ret == RefcountWithFlags::incResult::incOk); XDCHECK_EQ(2u, child->getRefCount()); + insertInMMContainer(*child); + invalidateNvm(*parent); if (auto eventTracker = getEventTracker()) { eventTracker->record(AllocatorApiEvent::ADD_CHAINED, parent->getKey(), @@ -717,7 +831,8 @@ CacheAllocator::replaceChainedItemLocked(Item& oldItem, // Since this is an internal refcount, we dont include it in active handle // tracking. 
- newItemHdl->incRef(); + auto ret = newItemHdl->incRef(true); + XDCHECK(ret == RefcountWithFlags::incResult::incOk); return oldItemHdl; } @@ -731,8 +846,8 @@ CacheAllocator::releaseBackToAllocator(Item& it, throw std::runtime_error( folly::sformat("cannot release this item: {}", it.toString())); } - - const auto allocInfo = allocator_->getAllocInfo(it.getMemory()); + const auto tid = getTierId(it); + const auto allocInfo = allocator_[tid]->getAllocInfo(it.getMemory()); if (ctx == RemoveContext::kEviction) { const auto timeNow = util::getCurrentTimeSec(); @@ -756,8 +871,7 @@ CacheAllocator::releaseBackToAllocator(Item& it, folly::sformat("Can not recycle a chained item {}, toRecyle", it.toString(), toRecycle->toString())); } - - allocator_->free(&it); + allocator_[tid]->free(&it); return ReleaseRes::kReleased; } @@ -826,26 +940,29 @@ CacheAllocator::releaseBackToAllocator(Item& it, auto next = head->getNext(compressor_); const auto childInfo = - allocator_->getAllocInfo(static_cast(head)); + allocator_[tid]->getAllocInfo(static_cast(head)); (*stats_.fragmentationSize)[childInfo.poolId][childInfo.classId].sub( util::getFragmentation(*this, *head)); removeFromMMContainer(*head); - // If this chained item is marked as exclusive, we will not free it. - // We must capture the exclusive state before we do the decRef when - // we know the item must still be valid - const bool wasExclusive = head->isExclusive(); + // If this chained item is marked as moving, we will not free it. + // We must capture the moving state before we do the decRef when + // we know the item must still be valid. Item cannot be marked as + // exclusive. Only parent can be marked as such and even parent needs + // to be unmark prior to calling releaseBackToAllocator. + const bool wasMoving = head->isMoving(); + XDCHECK(!head->isMarkedForEviction()); // Decref and check if we were the last reference. Now if the item - // was marked exclusive, after decRef, it will be free to be released + // was marked moving, after decRef, it will be free to be released // by slab release thread const auto childRef = head->decRef(); - // If the item is already exclusive and we already decremented the + // If the item is already moving and we already decremented the // refcount, we don't need to free this item. We'll let the slab // release thread take care of that - if (!wasExclusive) { + if (!wasMoving) { if (childRef != 0) { throw std::runtime_error(folly::sformat( "chained item refcount is not zero. We cannot proceed! 
" @@ -853,13 +970,13 @@ CacheAllocator::releaseBackToAllocator(Item& it, childRef, head->toString())); } - // Item is not exclusive and refcount is 0, we can proceed to + // Item is not moving and refcount is 0, we can proceed to // free it or recylce the memory if (head == toRecycle) { XDCHECK(ReleaseRes::kReleased != res); res = ReleaseRes::kRecycled; } else { - allocator_->free(head); + allocator_[tid]->free(head); } } @@ -874,16 +991,19 @@ CacheAllocator::releaseBackToAllocator(Item& it, res = ReleaseRes::kRecycled; } else { XDCHECK(it.isDrained()); - allocator_->free(&it); + allocator_[tid]->free(&it); } return res; } template -void CacheAllocator::incRef(Item& it) { - it.incRef(); - ++handleCount_.tlStats(); +RefcountWithFlags::incResult CacheAllocator::incRef(Item& it, bool failIfMoving) { + auto ret = it.incRef(failIfMoving); + if (ret == RefcountWithFlags::incResult::incOk) { + ++handleCount_.tlStats(); + } + return ret; } template @@ -903,8 +1023,18 @@ CacheAllocator::acquire(Item* it) { SCOPE_FAIL { stats_.numRefcountOverflow.inc(); }; - incRef(*it); - return WriteHandle{it, *this}; + // TODO: do not block incRef for child items to avoid deadlock + auto failIfMoving = getNumTiers() > 1 && !it->isChainedItem(); + auto incRes = incRef(*it, failIfMoving); + if (LIKELY(incRes == RefcountWithFlags::incResult::incOk)) { + return WriteHandle{it, *this}; + } else if (incRes == RefcountWithFlags::incResult::incFailedEviction){ + // item is being evicted + return WriteHandle{}; + } else { + // item is being moved - wait for completion + return handleWithWaitContextForMovingItem(*it); + } } template @@ -946,6 +1076,25 @@ bool CacheAllocator::replaceInMMContainer(Item& oldItem, } } +template +bool CacheAllocator::replaceInMMContainer(Item* oldItem, + Item& newItem) { + return replaceInMMContainer(*oldItem, newItem); +} + +template +bool CacheAllocator::replaceInMMContainer(EvictionIterator& oldItemIt, + Item& newItem) { + auto& oldContainer = getMMContainer(*oldItemIt); + auto& newContainer = getMMContainer(newItem); + + // This function is used for eviction across tiers + XDCHECK(&oldContainer != &newContainer); + oldContainer.remove(oldItemIt); + + return newContainer.add(newItem); +} + template bool CacheAllocator::replaceChainedItemInMMContainer( Item& oldItem, Item& newItem) { @@ -1091,6 +1240,165 @@ CacheAllocator::insertOrReplace(const WriteHandle& handle) { return replaced; } +/* Next two methods are used to asynchronously move Item between memory tiers. + * + * The thread, which moves Item, allocates new Item in the tier we are moving to + * and calls moveRegularItemWithSync() method. This method does the following: + * 1. Update the access container with the new item from the tier we are + * moving to. This Item has moving flag set. + * 2. Copy data from the old Item to the new one. + * + * Concurrent threads which are getting handle to the same key: + * 1. When a handle is created it checks if the moving flag is set + * 2. If so, Handle implementation creates waitContext and adds it to the + * MoveCtx by calling handleWithWaitContextForMovingItem() method. + * 3. Wait until the moving thread will complete its job. 
+ */ +template +typename CacheAllocator::WriteHandle +CacheAllocator::handleWithWaitContextForMovingItem(Item& item) { + auto shard = getShardForKey(item.getKey()); + auto& movesMap = getMoveMapForShard(shard); + { + auto lock = getMoveLockForShard(shard); + + WriteHandle hdl{*this}; + auto waitContext = hdl.getItemWaitContext(); + + auto ret = movesMap.try_emplace(item.getKey(), std::make_unique()); + ret.first->second->addWaiter(std::move(waitContext)); + + return hdl; + } +} + +template +size_t CacheAllocator::wakeUpWaitersLocked(folly::StringPiece key, + WriteHandle&& handle) { + std::unique_ptr ctx; + auto shard = getShardForKey(key); + auto& movesMap = getMoveMapForShard(shard); + { + auto lock = getMoveLockForShard(shard); + movesMap.eraseInto(key, [&](auto &&key, auto &&value) { + ctx = std::move(value); + }); + } + + if (ctx) { + ctx->setItemHandle(std::move(handle)); + return ctx->numWaiters(); + } + + return 0; +} + +template +bool CacheAllocator::moveRegularItemWithSync( + Item& oldItem, WriteHandle& newItemHdl) { + //on function exit - the new item handle is no longer moving + //and other threads may access it - but in case where + //we failed to replace in access container we can give the + //new item back to the allocator + auto guard = folly::makeGuard([&]() { + auto ref = newItemHdl->unmarkMoving(); + if (UNLIKELY(ref == 0)) { + const auto res = + releaseBackToAllocator(*newItemHdl, RemoveContext::kNormal, false); + XDCHECK(res == ReleaseRes::kReleased); + } + }); + + XDCHECK(oldItem.isMoving()); + XDCHECK(!oldItem.isExpired()); + // TODO: should we introduce new latency tracker. E.g. evictRegularLatency_ + // ??? util::LatencyTracker tracker{stats_.evictRegularLatency_}; + + XDCHECK_EQ(newItemHdl->getSize(), oldItem.getSize()); + + // take care of the flags before we expose the item to be accessed. this + // will ensure that when another thread removes the item from RAM, we issue + // a delete accordingly. See D7859775 for an example + if (oldItem.isNvmClean()) { + newItemHdl->markNvmClean(); + } + + // mark new item as moving to block readers until the data is copied + // (moveCb is called). Mark item in MMContainer temporarily (TODO: should + // we remove markMoving requirement for the item to be linked?) + newItemHdl->markInMMContainer(); + auto marked = newItemHdl->markMoving(false /* there is already a handle */); + newItemHdl->unmarkInMMContainer(); + XDCHECK(marked); + + auto predicate = [&](const Item& item){ + // we rely on moving flag being set (it should block all readers) + XDCHECK(item.getRefCount() == 0); + return true; + }; + + auto replaced = accessContainer_->replaceIf(oldItem, *newItemHdl, + predicate); + // another thread may have called insertOrReplace which could have + // marked this item as unaccessible causing the replaceIf + // in the access container to fail - in this case we want + // to abort the move since the item is no longer valid + if (!replaced) { + return false; + } + // what if another thread calls insertOrReplace now when + // the item is moving and already replaced in the hash table? + // 1. it succeeds in updating the hash table - so there is + // no guarentee that isAccessible() is true + // 2. it will then try to remove from MM container + // - this operation will wait for newItemHdl to + // be unmarkedMoving via the waitContext + // 3. replaced handle is returned and eventually drops + // ref to 0 and the item is recycled back to allocator. + + if (config_.moveCb) { + // Execute the move callback. 
We cannot make any guarantees about the + // consistency of the old item beyond this point, because the callback can + // do more than a simple memcpy() e.g. update external references. If there + // are any remaining handles to the old item, it is the caller's + // responsibility to invalidate them. The move can only fail after this + // statement if the old item has been removed or replaced, in which case it + // should be fine for it to be left in an inconsistent state. + config_.moveCb(oldItem, *newItemHdl, nullptr); + } else { + std::memcpy(newItemHdl->getMemory(), oldItem.getMemory(), + oldItem.getSize()); + } + + // Adding the item to mmContainer has to succeed since no one can remove the item + auto& newContainer = getMMContainer(*newItemHdl); + auto mmContainerAdded = newContainer.add(*newItemHdl); + XDCHECK(mmContainerAdded); + + // no one can add or remove chained items at this point + if (oldItem.hasChainedItem()) { + // safe to acquire handle for a moving Item + auto incRes = incRef(oldItem, false); + XDCHECK(incRes == RefcountWithFlags::incResult::incOk); + auto oldHandle = WriteHandle{&oldItem,*this}; + XDCHECK_EQ(1u, oldHandle->getRefCount()) << oldHandle->toString(); + XDCHECK(!newItemHdl->hasChainedItem()) << newItemHdl->toString(); + try { + auto l = chainedItemLocks_.lockExclusive(oldItem.getKey()); + transferChainLocked(oldHandle, newItemHdl); + } catch (const std::exception& e) { + // this should never happen because we drained all the handles. + XLOGF(DFATAL, "{}", e.what()); + throw; + } + + XDCHECK(!oldItem.hasChainedItem()); + XDCHECK(newItemHdl->hasChainedItem()); + } + newItemHdl.unmarkNascent(); + return true; +} + template bool CacheAllocator::moveRegularItem(Item& oldItem, WriteHandle& newItemHdl) { @@ -1122,15 +1430,15 @@ bool CacheAllocator::moveRegularItem(Item& oldItem, config_.moveCb(oldItem, *newItemHdl, nullptr); // Inside the access container's lock, this checks if the old item is - // accessible and its refcount is zero. If the item is not accessible, + // accessible and its refcount is one. If the item is not accessible, // there is no point to replace it since it had already been removed // or in the process of being removed. If the item is in cache but the - // refcount is non-zero, it means user could be attempting to remove + // refcount is non-one, it means user could be attempting to remove // this item through an API such as remove(itemHandle). In this case, // it is unsafe to replace the old item with a new one, so we should // also abort. if (!accessContainer_->replaceIf(oldItem, *newItemHdl, - itemExclusivePredicate)) { + itemSlabMovePredicate)) { return false; } @@ -1151,13 +1459,12 @@ bool CacheAllocator::moveRegularItem(Item& oldItem, // no one can add or remove chained items at this point if (oldItem.hasChainedItem()) { - // safe to acquire handle for a moving Item - auto oldHandle = acquire(&oldItem); - XDCHECK_EQ(1u, oldHandle->getRefCount()) << oldHandle->toString(); + auto oldItemHdl = acquire(&oldItem); + XDCHECK_EQ(1u, oldItemHdl->getRefCount()) << oldItemHdl->toString(); XDCHECK(!newItemHdl->hasChainedItem()) << newItemHdl->toString(); try { auto l = chainedItemLocks_.lockExclusive(oldItem.getKey()); - transferChainLocked(oldHandle, newItemHdl); + transferChainLocked(oldItemHdl, newItemHdl); } catch (const std::exception& e) { // this should never happen because we drained all the handles. 
XLOGF(DFATAL, "{}", e.what()); @@ -1179,18 +1486,18 @@ bool CacheAllocator::moveChainedItem(ChainedItem& oldItem, // This item has been unlinked from its parent and we're the only // owner of it, so we're done here - if (!oldItem.isInMMContainer() || oldItem.isOnlyExclusive()) { + if (!oldItem.isInMMContainer() || oldItem.isOnlyMoving()) { return false; } - const auto parentKey = oldItem.getParentItem(compressor_).getKey(); - - // Grab lock to prevent anyone else from modifying the chain + auto& expectedParent = oldItem.getParentItem(compressor_); + const auto parentKey = expectedParent.getKey(); auto l = chainedItemLocks_.lockExclusive(parentKey); + // verify old item under the lock auto parentHandle = validateAndGetParentHandleForChainedMoveLocked(oldItem, parentKey); - if (!parentHandle) { + if (!parentHandle || &expectedParent != parentHandle.get()) { return false; } @@ -1210,7 +1517,7 @@ bool CacheAllocator::moveChainedItem(ChainedItem& oldItem, // In case someone else had removed this chained item from its parent by now // So we check again to see if the it has been unlinked from its parent - if (!oldItem.isInMMContainer() || oldItem.isOnlyExclusive()) { + if (!oldItem.isInMMContainer() || oldItem.isOnlyMoving()) { return false; } @@ -1226,90 +1533,187 @@ bool CacheAllocator::moveChainedItem(ChainedItem& oldItem, // parent's chain and the MMContainer. auto oldItemHandle = replaceChainedItemLocked(oldItem, std::move(newItemHdl), *parentHandle); - XDCHECK(oldItemHandle->isExclusive()); + XDCHECK(oldItemHandle->isMoving()); XDCHECK(!oldItemHandle->isInMMContainer()); return true; } template -typename CacheAllocator::Item* -CacheAllocator::findEviction(PoolId pid, ClassId cid) { - auto& mmContainer = getMMContainer(pid, cid); +typename CacheAllocator::NvmCacheT::PutToken +CacheAllocator::createPutToken(Item& item) { + const bool evictToNvmCache = shouldWriteToNvmCache(item); + return evictToNvmCache ? nvmCache_->createPutToken(item.getKey()) + : typename NvmCacheT::PutToken{}; +} + +template +void CacheAllocator::unlinkItemForEviction(Item& it) { + XDCHECK(it.isMarkedForEviction()); + XDCHECK(it.getRefCount() == 0); + accessContainer_->remove(it); + removeFromMMContainer(it); + // Since we managed to mark the item for eviction we must be the only + // owner of the item. 
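+  // unmarkForEviction() returns the remaining refcount; since we are the
+  // sole owner of the item, we expect it to be zero.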
+ const auto ref = it.unmarkForEviction(); + XDCHECK(ref == 0u); +} + +template +typename CacheAllocator::Item* +CacheAllocator::findEviction(TierId tid, PoolId pid, ClassId cid) { + auto& mmContainer = getMMContainer(tid, pid, cid); + bool lastTier = tid+1 >= getNumTiers(); // Keep searching for a candidate until we were able to evict it // or until the search limit has been exhausted unsigned int searchTries = 0; - auto itr = mmContainer.getEvictionIterator(); while ((config_.evictionSearchTries == 0 || - config_.evictionSearchTries > searchTries) && - itr) { - ++searchTries; - (*stats_.evictionAttempts)[pid][cid].inc(); + config_.evictionSearchTries > searchTries)) { + + Item* toRecycle = nullptr; + Item* candidate = nullptr; + typename NvmCacheT::PutToken token; + + mmContainer.withEvictionIterator([this, pid, cid, &candidate, &toRecycle, + &searchTries, &mmContainer, &lastTier, + &token](auto&& itr) { + if (!itr) { + ++searchTries; + (*stats_.evictionAttempts)[pid][cid].inc(); + return; + } + + while ((config_.evictionSearchTries == 0 || + config_.evictionSearchTries > searchTries) && + itr) { + ++searchTries; + (*stats_.evictionAttempts)[pid][cid].inc(); + + auto* toRecycle_ = itr.get(); + auto* candidate_ = + toRecycle_->isChainedItem() + ? &toRecycle_->asChainedItem().getParentItem(compressor_) + : toRecycle_; + + if (lastTier) { + // if it's last tier, the item will be evicted + // need to create put token before marking it exclusive + token = createPutToken(*candidate_); + } - Item* toRecycle = itr.get(); + if (lastTier && shouldWriteToNvmCache(*candidate_) && !token.isValid()) { + stats_.evictFailConcurrentFill.inc(); + } else if ( (lastTier && candidate_->markForEviction()) || + (!lastTier && candidate_->markMoving(true)) ) { + XDCHECK(candidate_->isMoving() || candidate_->isMarkedForEviction()); + // markForEviction to make sure no other thead is evicting the item + // nor holding a handle to that item if this is last tier + // since we won't be moving the item to the next tier + toRecycle = toRecycle_; + candidate = candidate_; + + // Check if parent changed for chained items - if yes, we cannot + // remove the child from the mmContainer as we will not be evicting + // it. We could abort right here, but we need to cleanup in case + // unmarkForEviction() returns 0 - so just go through normal path. + if (!toRecycle_->isChainedItem() || + &toRecycle->asChainedItem().getParentItem(compressor_) == + candidate) + mmContainer.remove(itr); + return; + } - Item* candidate = - toRecycle->isChainedItem() - ? &toRecycle->asChainedItem().getParentItem(compressor_) - : toRecycle; + if (candidate_->hasChainedItem()) { + stats_.evictFailParentAC.inc(); + } else { + stats_.evictFailAC.inc(); + } - // make sure no other thead is evicting the item - if (candidate->getRefCount() != 0 || !candidate->markExclusive()) { - ++itr; + ++itr; + XDCHECK(toRecycle == nullptr); + XDCHECK(candidate == nullptr); + } + }); + + if (!toRecycle) continue; - } - // for chained items, the ownership of the parent can change. We try to - // evict what we think as parent and see if the eviction of parent - // recycles the child we intend to. - bool evictionSuccessful = false; - { - auto toReleaseHandle = - itr->isChainedItem() - ? advanceIteratorAndTryEvictChainedItem(itr) - : advanceIteratorAndTryEvictRegularItem(mmContainer, itr); - evictionSuccessful = toReleaseHandle != nullptr; - // destroy toReleseHandle. The item won't be released to allocator - // since we marked it as exclusive. 
- } - - const auto ref = candidate->unmarkExclusive(); - if (ref == 0u) { - // Invalidate iterator since later on we may use this mmContainer - // again, which cannot be done unless we drop this iterator - itr.destroy(); - - // recycle the item. it's safe to do so, even if toReleaseHandle was - // NULL. If `ref` == 0 then it means that we are the last holder of - // that item. - if (candidate->hasChainedItem()) { - (*stats_.chainedItemEvictions)[pid][cid].inc(); - } else { - (*stats_.regularItemEvictions)[pid][cid].inc(); + XDCHECK(toRecycle); + XDCHECK(candidate); + + auto evictedToNext = lastTier ? nullptr + : tryEvictToNextMemoryTier(*candidate, false); + if (!evictedToNext) { + //if insertOrReplace was called during move + //then candidate will not be accessible (failed replace during tryEvict) + // - therefore this was why we failed to + // evict to the next tier and insertOrReplace + // will remove from NVM cache + //however, if candidate is accessible + //that means the allocation in the next + //tier failed - so we will continue to + //evict the item to NVM cache + bool failedToReplace = !candidate->isAccessible(); + if (!token.isValid() && !failedToReplace) { + token = createPutToken(*candidate); } - - if (auto eventTracker = getEventTracker()) { - eventTracker->record(AllocatorApiEvent::DRAM_EVICT, candidate->getKey(), - AllocatorApiResult::EVICTED, candidate->getSize(), - candidate->getConfiguredTTL().count()); + // tryEvictToNextMemoryTier can fail if: + // a) allocation of the new item fails in that case, + // it should be still possible to mark item for eviction. + // b) another thread calls insertOrReplace and the item + // is no longer accessible + // + // in case that we are on the last tier, we whould have already marked + // as exclusive since we will not be moving the item to the next tier + // but rather just evicting all together, no need to + // markForEvictionWhenMoving + auto ret = lastTier ? true : candidate->markForEvictionWhenMoving(); + XDCHECK(ret); + + unlinkItemForEviction(*candidate); + + if (token.isValid() && shouldWriteToNvmCacheExclusive(*candidate) + && !failedToReplace) { + nvmCache_->put(*candidate, std::move(token)); } + // wake up any readers that wait for the move to complete + // it's safe to do now, as we have the item marked exclusive and + // no other reader can be added to the waiters list + wakeUpWaiters(*candidate, {}); - // check if by releasing the item we intend to, we actually - // recycle the candidate. - if (ReleaseRes::kRecycled == - releaseBackToAllocator(*candidate, RemoveContext::kEviction, - /* isNascent */ false, toRecycle)) { - return toRecycle; - } } else { - XDCHECK(!evictionSuccessful); + XDCHECK(!evictedToNext->isMarkedForEviction() && !evictedToNext->isMoving()); + XDCHECK(!candidate->isMarkedForEviction() && !candidate->isMoving()); + XDCHECK(!candidate->isAccessible()); + XDCHECK(candidate->getKey() == evictedToNext->getKey()); + + wakeUpWaiters(*candidate, std::move(evictedToNext)); + } + + XDCHECK(!candidate->isMarkedForEviction() && !candidate->isMoving()); + + // recycle the item. it's safe to do so, even if toReleaseHandle was + // NULL. If `ref` == 0 then it means that we are the last holder of + // that item. 
+ if (candidate->hasChainedItem()) { + (*stats_.chainedItemEvictions)[pid][cid].inc(); + } else { + (*stats_.regularItemEvictions)[pid][cid].inc(); + } + + if (auto eventTracker = getEventTracker()) { + eventTracker->record(AllocatorApiEvent::DRAM_EVICT, candidate->getKey(), + AllocatorApiResult::EVICTED, candidate->getSize(), + candidate->getConfiguredTTL().count()); } - // If we destroyed the itr to possibly evict and failed, we restart - // from the beginning again - if (!itr) { - itr.resetToBegin(); + // check if by releasing the item we intend to, we actually + // recycle the candidate. + auto ret = releaseBackToAllocator(*candidate, RemoveContext::kEviction, + /* isNascent */ false, toRecycle); + if (ret == ReleaseRes::kRecycled) { + return toRecycle; + } } return nullptr; @@ -1363,6 +1767,94 @@ bool CacheAllocator::shouldWriteToNvmCacheExclusive( return true; } +template +typename CacheAllocator::WriteHandle +CacheAllocator::tryEvictToNextMemoryTier( + TierId tid, PoolId pid, Item& item, bool fromBgThread) { + XDCHECK(item.isMoving()); + XDCHECK(item.getRefCount() == 0); + if (item.hasChainedItem()) return WriteHandle{}; // TODO: We do not support ChainedItem yet + if (item.isExpired()) { + accessContainer_->remove(item); + item.unmarkMoving(); + return acquire(&item); + } + + TierId nextTier = tid; // TODO - calculate this based on some admission policy + while (++nextTier < getNumTiers()) { // try to evict down to the next memory tiers + // allocateInternal might trigger another eviction + auto newItemHdl = allocateInternalTier(nextTier, pid, + item.getKey(), + item.getSize(), + item.getCreationTime(), + item.getExpiryTime(), + fromBgThread); + + if (newItemHdl) { + XDCHECK_EQ(newItemHdl->getSize(), item.getSize()); + if (!moveRegularItemWithSync(item, newItemHdl)) { + return WriteHandle{}; + } + XDCHECK_EQ(newItemHdl->getKey(), item.getKey()); + item.unmarkMoving(); + return newItemHdl; + } else { + return WriteHandle{}; + } + } + + return {}; +} + +template +typename CacheAllocator::WriteHandle +CacheAllocator::tryEvictToNextMemoryTier(Item& item, bool fromBgThread) { + auto tid = getTierId(item); + auto pid = allocator_[tid]->getAllocInfo(item.getMemory()).poolId; + return tryEvictToNextMemoryTier(tid, pid, item, fromBgThread); +} + +template +typename CacheAllocator::WriteHandle +CacheAllocator::tryPromoteToNextMemoryTier( + TierId tid, PoolId pid, Item& item, bool fromBgThread) { + if (item.isExpired()) { return {}; } + TierId nextTier = tid; + while (nextTier > 0) { // try to promote up to the next faster memory tier + auto toPromoteTier = nextTier - 1; + --nextTier; + + // allocateInternal might trigger another eviction + auto newItemHdl = allocateInternalTier(toPromoteTier, pid, + item.getKey(), + item.getSize(), + item.getCreationTime(), + item.getExpiryTime(), + fromBgThread); + + if (newItemHdl) { + XDCHECK_EQ(newItemHdl->getSize(), item.getSize()); + if (!moveRegularItemWithSync(item, newItemHdl)) { + return WriteHandle{}; + } + item.unmarkMoving(); + return newItemHdl; + } else { + return WriteHandle{}; + } + } + + return {}; +} + +template +typename CacheAllocator::WriteHandle +CacheAllocator::tryPromoteToNextMemoryTier(Item& item, bool fromBgThread) { + auto tid = getTierId(item); + auto pid = allocator_[tid]->getAllocInfo(item.getMemory()).poolId; + return tryPromoteToNextMemoryTier(tid, pid, item, fromBgThread); +} + template typename CacheAllocator::RemoveRes CacheAllocator::remove(typename Item::Key key) { @@ -1453,7 +1945,7 @@ bool
CacheAllocator::pushToNvmCacheFromRamForTesting( if (handle && nvmCache_ && shouldWriteToNvmCache(*handle) && shouldWriteToNvmCacheExclusive(*handle)) { - nvmCache_->put(handle, nvmCache_->createPutToken(handle->getKey())); + nvmCache_->put(*handle, nvmCache_->createPutToken(handle->getKey())); return true; } return false; @@ -1563,21 +2055,57 @@ void CacheAllocator::invalidateNvm(Item& item) { } } +template +TierId +CacheAllocator::getTierId(const Item& item) const { + return getTierId(item.getMemory()); +} + +template +TierId +CacheAllocator::getTierId(const void* ptr) const { + for (TierId tid = 0; tid < getNumTiers(); tid++) { + if (allocator_[tid]->isMemoryInAllocator(ptr)) + return tid; + } + + throw std::invalid_argument("Item does not belong to any tier!"); +} + template typename CacheAllocator::MMContainer& CacheAllocator::getMMContainer(const Item& item) const noexcept { + const auto tid = getTierId(item); const auto allocInfo = - allocator_->getAllocInfo(static_cast(&item)); - return getMMContainer(allocInfo.poolId, allocInfo.classId); + allocator_[tid]->getAllocInfo(static_cast(&item)); + return getMMContainer(tid, allocInfo.poolId, allocInfo.classId); } template typename CacheAllocator::MMContainer& -CacheAllocator::getMMContainer(PoolId pid, +CacheAllocator::getMMContainer(TierId tid, + PoolId pid, ClassId cid) const noexcept { - XDCHECK_LT(static_cast(pid), mmContainers_.size()); - XDCHECK_LT(static_cast(cid), mmContainers_[pid].size()); - return *mmContainers_[pid][cid]; + XDCHECK_LT(static_cast(tid), mmContainers_.size()); + XDCHECK_LT(static_cast(pid), mmContainers_[tid].size()); + XDCHECK_LT(static_cast(cid), mmContainers_[tid][pid].size()); + return *mmContainers_[tid][pid][cid]; +} + +template +MMContainerStat CacheAllocator::getMMContainerStat( + TierId tid, PoolId pid, ClassId cid) const noexcept { + if(static_cast(tid) >= mmContainers_.size()) { + return MMContainerStat{}; + } + if (static_cast(pid) >= mmContainers_[tid].size()) { + return MMContainerStat{}; + } + if (static_cast(cid) >= mmContainers_[tid][pid].size()) { + return MMContainerStat{}; + } + return mmContainers_[tid][pid][cid] ? mmContainers_[tid][pid][cid]->getStats() + : MMContainerStat{}; } template @@ -1763,8 +2291,9 @@ void CacheAllocator::markUseful(const ReadHandle& handle, template bool CacheAllocator::recordAccessInMMContainer(Item& item, AccessMode mode) { + const auto tid = getTierId(item); const auto allocInfo = - allocator_->getAllocInfo(static_cast(&item)); + allocator_[tid]->getAllocInfo(static_cast(&item)); (*stats_.cacheHits)[allocInfo.poolId][allocInfo.classId].inc(); // track recently accessed items if needed @@ -1772,14 +2301,15 @@ bool CacheAllocator::recordAccessInMMContainer(Item& item, ring_->trackItem(reinterpret_cast(&item), item.getSize()); } - auto& mmContainer = getMMContainer(allocInfo.poolId, allocInfo.classId); + auto& mmContainer = getMMContainer(tid, allocInfo.poolId, allocInfo.classId); return mmContainer.recordAccess(item, mode); } template uint32_t CacheAllocator::getUsableSize(const Item& item) const { + const auto tid = getTierId(item); const auto allocSize = - allocator_->getAllocInfo(static_cast(&item)).allocSize; + allocator_[tid]->getAllocInfo(static_cast(&item)).allocSize; return item.isChainedItem() ? 
allocSize - ChainedItem::getRequiredSize(0) : allocSize - Item::getRequiredSize(item.getKey(), 0); @@ -1788,8 +2318,10 @@ uint32_t CacheAllocator::getUsableSize(const Item& item) const { template typename CacheAllocator::SampleItem CacheAllocator::getSampleItem() { - size_t nvmCacheSize = nvmCache_ ? nvmCache_->getUsableSize() : 0; - size_t ramCacheSize = allocator_->getMemorySizeInclAdvised(); + // TODO: is using random tier a good idea? + auto tid = folly::Random::rand32() % getNumTiers(); + static size_t nvmCacheSize = nvmCache_ ? nvmCache_->getUsableSize() : 0; + static size_t ramCacheSize = allocator_[tid]->getMemorySizeInclAdvised(); bool fromNvm = folly::Random::rand64(0, nvmCacheSize + ramCacheSize) >= ramCacheSize; @@ -1798,19 +2330,18 @@ CacheAllocator::getSampleItem() { } // Sampling from DRAM cache - auto item = reinterpret_cast(allocator_->getRandomAlloc()); + auto item = reinterpret_cast(allocator_[tid]->getRandomAlloc()); if (!item) { return SampleItem{false /* fromNvm */}; } // Check that item returned is the same that was sampled - auto sharedHdl = std::make_shared(findInternal(item->getKey())); if (sharedHdl->get() != item) { return SampleItem{false /* fromNvm */}; } - const auto allocInfo = allocator_->getAllocInfo(item->getMemory()); + const auto allocInfo = allocator_[tid]->getAllocInfo(item->getMemory()); // Convert the Item to IOBuf to make SampleItem auto iobuf = folly::IOBuf{ @@ -1829,28 +2360,33 @@ CacheAllocator::getSampleItem() { template std::vector CacheAllocator::dumpEvictionIterator( - PoolId pid, ClassId cid, size_t numItems) { + PoolId pid, ClassId cid, size_t numItems) { if (numItems == 0) { return {}; } - if (static_cast(pid) >= mmContainers_.size() || - static_cast(cid) >= mmContainers_[pid].size()) { + // Always evict from the lowest layer. + int tid = getNumTiers() - 1; + + if (static_cast(tid) >= mmContainers_.size() || + static_cast(pid) >= mmContainers_[tid].size() || + static_cast(cid) >= mmContainers_[tid][pid].size()) { throw std::invalid_argument( - folly::sformat("Invalid PoolId: {} and ClassId: {}.", pid, cid)); + folly::sformat("Invalid TierId: {} and PoolId: {} and ClassId: {}.", tid, pid, cid)); } std::vector content; - auto& mm = *mmContainers_[pid][cid]; - auto evictItr = mm.getEvictionIterator(); - size_t i = 0; - while (evictItr && i < numItems) { - content.push_back(evictItr->toString()); - ++evictItr; - ++i; + while (tid >= 0) { + auto& mm = *mmContainers_[tid][pid][cid]; + mm.withEvictionIterator([&content, numItems](auto&& itr) { + while (itr && content.size() < numItems) { + content.push_back(itr->toString()); + ++itr; + } + }); + --tid; } - return content; } @@ -2026,19 +2562,50 @@ PoolId CacheAllocator::addPool( std::shared_ptr resizeStrategy, bool ensureProvisionable) { folly::SharedMutex::WriteHolder w(poolsResizeAndRebalanceLock_); - auto pid = allocator_->addPool(name, size, allocSizes, ensureProvisionable); + + PoolId pid = 0; + size_t totalCacheSize = 0; + + for (TierId tid = 0; tid < getNumTiers(); tid++) { + totalCacheSize += allocator_[tid]->getMemorySize(); + } + + for (TierId tid = 0; tid < getNumTiers(); tid++) { + auto tierSizeRatio = + static_cast(allocator_[tid]->getMemorySize()) / totalCacheSize; + size_t tierPoolSize = static_cast(tierSizeRatio * size); + + // TODO: what if we manage to add pool only in one tier? 
+ // we should probably remove that on failure + auto res = allocator_[tid]->addPool( + name, tierPoolSize, allocSizes, ensureProvisionable); + XDCHECK(tid == 0 || res == pid); + pid = res; + } + createMMContainers(pid, std::move(config)); setRebalanceStrategy(pid, std::move(rebalanceStrategy)); setResizeStrategy(pid, std::move(resizeStrategy)); + + if (backgroundEvictor_.size()) { + for (size_t id = 0; id < backgroundEvictor_.size(); id++) + backgroundEvictor_[id]->setAssignedMemory(getAssignedMemoryToBgWorker(id, backgroundEvictor_.size(), 0)); + } + + if (backgroundPromoter_.size()) { + for (size_t id = 0; id < backgroundPromoter_.size(); id++) + backgroundPromoter_[id]->setAssignedMemory(getAssignedMemoryToBgWorker(id, backgroundPromoter_.size(), 1)); + } + return pid; } template void CacheAllocator::overridePoolRebalanceStrategy( PoolId pid, std::shared_ptr rebalanceStrategy) { - if (static_cast(pid) >= mmContainers_.size()) { + if (static_cast(pid) >= mmContainers_[0].size()) { throw std::invalid_argument(folly::sformat( - "Invalid PoolId: {}, size of pools: {}", pid, mmContainers_.size())); + "Invalid PoolId: {}, size of pools: {}", pid, mmContainers_[0].size())); } setRebalanceStrategy(pid, std::move(rebalanceStrategy)); } @@ -2046,9 +2613,9 @@ void CacheAllocator::overridePoolRebalanceStrategy( template void CacheAllocator::overridePoolResizeStrategy( PoolId pid, std::shared_ptr resizeStrategy) { - if (static_cast(pid) >= mmContainers_.size()) { + if (static_cast(pid) >= mmContainers_[0].size()) { throw std::invalid_argument(folly::sformat( - "Invalid PoolId: {}, size of pools: {}", pid, mmContainers_.size())); + "Invalid PoolId: {}, size of pools: {}", pid, mmContainers_[0].size())); } setResizeStrategy(pid, std::move(resizeStrategy)); } @@ -2060,14 +2627,14 @@ void CacheAllocator::overridePoolOptimizeStrategy( } template -void CacheAllocator::overridePoolConfig(PoolId pid, +void CacheAllocator::overridePoolConfig(TierId tid, PoolId pid, const MMConfig& config) { - if (static_cast(pid) >= mmContainers_.size()) { + // TODO: add generic tier id checking + if (static_cast(pid) >= mmContainers_[tid].size()) { throw std::invalid_argument(folly::sformat( - "Invalid PoolId: {}, size of pools: {}", pid, mmContainers_.size())); + "Invalid PoolId: {}, size of pools: {}", pid, mmContainers_[tid].size())); } - - auto& pool = allocator_->getPool(pid); + auto& pool = allocator_[tid]->getPool(pid); for (unsigned int cid = 0; cid < pool.getNumClassId(); ++cid) { MMConfig mmConfig = config; mmConfig.addExtraConfig( @@ -2075,29 +2642,35 @@ void CacheAllocator::overridePoolConfig(PoolId pid, ? pool.getAllocationClass(static_cast(cid)) .getAllocsPerSlab() : 0); - DCHECK_NOTNULL(mmContainers_[pid][cid].get()); - mmContainers_[pid][cid]->setConfig(mmConfig); + DCHECK_NOTNULL(mmContainers_[tid][pid][cid].get()); + mmContainers_[tid][pid][cid]->setConfig(mmConfig); } } template void CacheAllocator::createMMContainers(const PoolId pid, MMConfig config) { - auto& pool = allocator_->getPool(pid); + // pools on each layer should have the same number of class id, etc. + // TODO: think about deduplication + auto& pool = allocator_[0]->getPool(pid); + for (unsigned int cid = 0; cid < pool.getNumClassId(); ++cid) { config.addExtraConfig( config_.trackTailHits ? 
pool.getAllocationClass(static_cast(cid)) .getAllocsPerSlab() : 0); - mmContainers_[pid][cid].reset(new MMContainer(config, compressor_)); + for (TierId tid = 0; tid < getNumTiers(); tid++) { + mmContainers_[tid][pid][cid].reset(new MMContainer(config, compressor_)); + } } } template PoolId CacheAllocator::getPoolId( folly::StringPiece name) const noexcept { - return allocator_->getPoolId(name.str()); + // each tier has the same pools + return allocator_[0]->getPoolId(name.str()); } // The Function returns a consolidated vector of Release Slab @@ -2140,7 +2713,9 @@ std::set CacheAllocator::filterCompactCachePools( template std::set CacheAllocator::getRegularPoolIds() const { folly::SharedMutex::ReadHolder r(poolsResizeAndRebalanceLock_); - return filterCompactCachePools(allocator_->getPoolIds()); + // TODO - get rid of the duplication - right now, each tier + // holds pool objects with mostly the same info + return filterCompactCachePools(allocator_[0]->getPoolIds()); } template @@ -2165,10 +2740,9 @@ std::set CacheAllocator::getRegularPoolIdsForResize() // getAdvisedMemorySize - then pools may be overLimit even when // all slabs are not allocated. Otherwise, pools may be overLimit // only after all slabs are allocated. - // - return (allocator_->allSlabsAllocated()) || - (allocator_->getAdvisedMemorySize() != 0) - ? filterCompactCachePools(allocator_->getPoolsOverLimit()) + return (allocator_[currentTier()]->allSlabsAllocated()) || + (allocator_[currentTier()]->getAdvisedMemorySize() != 0) + ? filterCompactCachePools(allocator_[currentTier()]->getPoolsOverLimit()) : std::set{}; } @@ -2177,9 +2751,19 @@ const std::string CacheAllocator::getCacheName() const { return config_.cacheName; } +template +size_t CacheAllocator::getPoolSize(PoolId poolId) const { + size_t poolSize = 0; + for (auto& allocator: allocator_) { + const auto& pool = allocator->getPool(poolId); + poolSize += pool.getPoolSize(); + } + return poolSize; +} + template PoolStats CacheAllocator::getPoolStats(PoolId poolId) const { - const auto& pool = allocator_->getPool(poolId); + const auto& pool = allocator_[currentTier()]->getPool(poolId); const auto& allocSizes = pool.getAllocSizes(); auto mpStats = pool.getStats(); const auto& classIds = mpStats.classIds; @@ -2198,7 +2782,7 @@ PoolStats CacheAllocator::getPoolStats(PoolId poolId) const { if (!isCompactCache) { for (const ClassId cid : classIds) { uint64_t classHits = (*stats_.cacheHits)[poolId][cid].get(); - XDCHECK(mmContainers_[poolId][cid], + XDCHECK(mmContainers_[currentTier()][poolId][cid], folly::sformat("Pid {}, Cid {} not initialized.", poolId, cid)); cacheStats.insert( {cid, @@ -2208,16 +2792,14 @@ PoolStats CacheAllocator::getPoolStats(PoolId poolId) const { (*stats_.fragmentationSize)[poolId][cid].get(), classHits, (*stats_.chainedItemEvictions)[poolId][cid].get(), (*stats_.regularItemEvictions)[poolId][cid].get(), - mmContainers_[poolId][cid]->getStats()} - - }); + getMMContainerStat(currentTier(), poolId, cid)}}); totalHits += classHits; } } PoolStats ret; ret.isCompactCache = isCompactCache; - ret.poolName = allocator_->getPoolName(poolId); + ret.poolName = allocator_[currentTier()]->getPoolName(poolId); ret.poolSize = pool.getPoolSize(); ret.poolUsableSize = pool.getPoolUsableSize(); ret.poolAdvisedSize = pool.getPoolAdvisedSize(); @@ -2229,29 +2811,39 @@ PoolStats CacheAllocator::getPoolStats(PoolId poolId) const { return ret; } +template +ACStats CacheAllocator::getACStats(TierId tid, + PoolId poolId, + ClassId classId) const { + const auto& pool = 
allocator_[tid]->getPool(poolId); + const auto& ac = pool.getAllocationClass(classId); + + auto stats = ac.getStats(); + stats.allocLatencyNs = (*stats_.classAllocLatency)[tid][poolId][classId]; + return stats; +} + template PoolEvictionAgeStats CacheAllocator::getPoolEvictionAgeStats( PoolId pid, unsigned int slabProjectionLength) const { PoolEvictionAgeStats stats; - - const auto& pool = allocator_->getPool(pid); + const auto& pool = allocator_[currentTier()]->getPool(pid); const auto& allocSizes = pool.getAllocSizes(); for (ClassId cid = 0; cid < static_cast(allocSizes.size()); ++cid) { - auto& mmContainer = getMMContainer(pid, cid); + auto& mmContainer = getMMContainer(currentTier(), pid, cid); const auto numItemsPerSlab = - allocator_->getPool(pid).getAllocationClass(cid).getAllocsPerSlab(); + allocator_[currentTier()]->getPool(pid).getAllocationClass(cid).getAllocsPerSlab(); const auto projectionLength = numItemsPerSlab * slabProjectionLength; stats.classEvictionAgeStats[cid] = mmContainer.getEvictionAgeStat(projectionLength); } - return stats; } template CacheMetadata CacheAllocator::getCacheMetadata() const noexcept { return CacheMetadata{kCachelibVersion, kCacheRamFormatVersion, - kCacheNvmFormatVersion, config_.size}; + kCacheNvmFormatVersion, config_.getCacheSize()}; } template @@ -2283,7 +2875,7 @@ void CacheAllocator::releaseSlab(PoolId pid, } try { - auto releaseContext = allocator_->startSlabRelease( + auto releaseContext = allocator_[currentTier()]->startSlabRelease( pid, victim, receiver, mode, hint, [this]() -> bool { return shutDownInProgress_; }); @@ -2292,15 +2884,15 @@ void CacheAllocator::releaseSlab(PoolId pid, return; } - releaseSlabImpl(releaseContext); - if (!allocator_->allAllocsFreed(releaseContext)) { + releaseSlabImpl(currentTier(), releaseContext); + if (!allocator_[currentTier()]->allAllocsFreed(releaseContext)) { throw std::runtime_error( folly::sformat("Was not able to free all allocs. PoolId: {}, AC: {}", releaseContext.getPoolId(), releaseContext.getClassId())); } - allocator_->completeSlabRelease(releaseContext); + allocator_[currentTier()]->completeSlabRelease(releaseContext); } catch (const exception::SlabReleaseAborted& e) { stats_.numAbortedSlabReleases.inc(); throw exception::SlabReleaseAborted(folly::sformat( @@ -2311,8 +2903,7 @@ void CacheAllocator::releaseSlab(PoolId pid, } template -SlabReleaseStats CacheAllocator::getSlabReleaseStats() - const noexcept { +SlabReleaseStats CacheAllocator::getSlabReleaseStats() const noexcept { std::lock_guard l(workersMutex_); return SlabReleaseStats{stats_.numActiveSlabReleases.get(), stats_.numReleasedForRebalance.get(), @@ -2330,7 +2921,7 @@ SlabReleaseStats CacheAllocator::getSlabReleaseStats() } template -void CacheAllocator::releaseSlabImpl( +void CacheAllocator::releaseSlabImpl(TierId tid, const SlabReleaseContext& releaseContext) { auto startTime = std::chrono::milliseconds(util::getCurrentTimeMs()); bool releaseStuck = false; @@ -2358,10 +2949,11 @@ void CacheAllocator::releaseSlabImpl( // 3. If 2 is successful, Move or Evict // 4. 
Move on to the next item if current item is freed for (auto alloc : releaseContext.getActiveAllocations()) { + auto startTimeSec = util::getCurrentTimeSec(); // Need to mark an item for release before proceeding // If we can't mark as moving, it means the item is already freed const bool isAlreadyFreed = - !markExclusiveForSlabRelease(releaseContext, alloc, throttler); + !markMovingForSlabRelease(releaseContext, alloc, throttler); if (isAlreadyFreed) { continue; } @@ -2375,7 +2967,7 @@ void CacheAllocator::releaseSlabImpl( if (!isMoved) { evictForSlabRelease(releaseContext, item, throttler); } - XDCHECK(allocator_->isAllocFreed(releaseContext, alloc)); + XDCHECK(allocator_[tid]->isAllocFreed(releaseContext, alloc)); } } @@ -2389,6 +2981,14 @@ void CacheAllocator::throttleWith(util::Throttler& t, } } +template +typename RefcountWithFlags::Value CacheAllocator::unmarkMovingAndWakeUpWaiters(Item &item, WriteHandle handle) +{ + auto ret = item.unmarkMoving(); + wakeUpWaiters(item, std::move(handle)); + return ret; +} + template bool CacheAllocator::moveForSlabRelease( const SlabReleaseContext& ctx, Item& oldItem, util::Throttler& throttler) { @@ -2398,7 +2998,7 @@ bool CacheAllocator::moveForSlabRelease( bool isMoved = false; auto startTime = util::getCurrentTimeSec(); - WriteHandle newItemHdl = allocateNewItemForOldItem(oldItem); + WriteHandle newItemHdl{}; for (unsigned int itemMovingAttempts = 0; itemMovingAttempts < config_.movingTries; @@ -2406,16 +3006,24 @@ bool CacheAllocator::moveForSlabRelease( stats_.numMoveAttempts.inc(); // Nothing to move and the key is likely also bogus for chained items. - if (oldItem.isOnlyExclusive()) { - oldItem.unmarkExclusive(); + if (oldItem.isOnlyMoving()) { + auto ret = unmarkMovingAndWakeUpWaiters(oldItem, {}); + XDCHECK(ret == 0); const auto res = releaseBackToAllocator(oldItem, RemoveContext::kNormal, false); XDCHECK(res == ReleaseRes::kReleased); return true; } + throttleWith(throttler, [&] { + XLOGF(WARN, + "Spent {} seconds, slab release still trying to move Item: {}. " + "Pool: {}, Class: {}.", + util::getCurrentTimeSec() - startTime, oldItem.toString(), + ctx.getPoolId(), ctx.getClassId()); + }); + if (!newItemHdl) { - // try to allocate again if it previously wasn't successful newItemHdl = allocateNewItemForOldItem(oldItem); } @@ -2426,14 +3034,6 @@ bool CacheAllocator::moveForSlabRelease( break; } } - - throttleWith(throttler, [&] { - XLOGF(WARN, - "Spent {} seconds, slab release still trying to move Item: {}. " - "Pool: {}, Class: {}.", - util::getCurrentTimeSec() - startTime, oldItem.toString(), - ctx.getPoolId(), ctx.getClassId()); - }); } // Return false if we've exhausted moving tries. @@ -2446,7 +3046,7 @@ bool CacheAllocator::moveForSlabRelease( // that's identical to this one to replace it. Here we just need to wait // until all users have dropped the item handles before we can proceed. 
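  // The throttled loop below simply polls the item's state: once
  // isOnlyMoving() holds, the moving mark is the only remaining reference
  // and the old allocation can safely be freed.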
startTime = util::getCurrentTimeSec(); - while (!oldItem.isOnlyExclusive()) { + while (!oldItem.isOnlyMoving()) { throttleWith(throttler, [&] { XLOGF(WARN, "Spent {} seconds, slab release still waiting for refcount to " @@ -2455,8 +3055,12 @@ bool CacheAllocator::moveForSlabRelease( ctx.getPoolId(), ctx.getClassId()); }); } - const auto allocInfo = allocator_->getAllocInfo(oldItem.getMemory()); - allocator_->free(&oldItem); + auto tid = getTierId(oldItem); + auto ref = unmarkMovingAndWakeUpWaiters(oldItem, std::move(newItemHdl)); + XDCHECK(ref == 0); + + const auto allocInfo = allocator_[tid]->getAllocInfo(oldItem.getMemory()); + allocator_[tid]->free(&oldItem); (*stats_.fragmentationSize)[allocInfo.poolId][allocInfo.classId].sub( util::getFragmentation(*this, oldItem)); @@ -2465,10 +3069,10 @@ bool CacheAllocator::moveForSlabRelease( } template -typename CacheAllocator::ReadHandle +typename CacheAllocator::WriteHandle CacheAllocator::validateAndGetParentHandleForChainedMoveLocked( const ChainedItem& item, const Key& parentKey) { - ReadHandle parentHandle{}; + WriteHandle parentHandle{}; try { parentHandle = findInternal(parentKey); // If the parent is not the same as the parent of the chained item, @@ -2487,6 +3091,7 @@ CacheAllocator::validateAndGetParentHandleForChainedMoveLocked( template typename CacheAllocator::WriteHandle CacheAllocator::allocateNewItemForOldItem(const Item& oldItem) { + XDCHECK(oldItem.isMoving()); if (oldItem.isChainedItem()) { const auto& oldChainedItem = oldItem.asChainedItem(); const auto parentKey = oldChainedItem.getParentItem(compressor_).getKey(); @@ -2500,33 +3105,35 @@ CacheAllocator::allocateNewItemForOldItem(const Item& oldItem) { return {}; } - // Set up the destination for the move. Since oldChainedItem would have - // the exclusive bit set, it won't be picked for eviction. + // Set up the destination for the move. Since oldChainedItem would + // be marked as moving, it won't be picked for eviction. auto newItemHdl = - allocateChainedItemInternal(parentHandle, oldChainedItem.getSize()); + allocateChainedItemInternal(parentHandle, oldItem.getSize()); if (!newItemHdl) { return {}; } - XDCHECK_EQ(newItemHdl->getSize(), oldChainedItem.getSize()); + XDCHECK_EQ(newItemHdl->getSize(), oldItem.getSize()); auto parentPtr = parentHandle.getInternal(); XDCHECK_EQ(reinterpret_cast(parentPtr), reinterpret_cast( - &oldChainedItem.getParentItem(compressor_))); + &newItemHdl->asChainedItem().getParentItem(compressor_))); return newItemHdl; } const auto allocInfo = - allocator_->getAllocInfo(static_cast(&oldItem)); + allocator_[getTierId(oldItem)]->getAllocInfo(static_cast(&oldItem)); // Set up the destination for the move. Since oldItem would have the moving // bit set, it won't be picked for eviction. - auto newItemHdl = allocateInternal(allocInfo.poolId, - oldItem.getKey(), - oldItem.getSize(), - oldItem.getCreationTime(), - oldItem.getExpiryTime()); + auto newItemHdl = allocateInternalTier(getTierId(oldItem), + allocInfo.poolId, + oldItem.getKey(), + oldItem.getSize(), + oldItem.getCreationTime(), + oldItem.getExpiryTime(), + false); if (!newItemHdl) { return {}; } @@ -2545,7 +3152,8 @@ bool CacheAllocator::tryMovingForSlabRelease( // a regular item or chained item is synchronized with any potential // user-side mutation. 
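  // Note that with more than one memory tier the moving bit itself provides
  // this synchronization, so the user-supplied movingSync is only consulted
  // for single-tier setups (see the getNumTiers() == 1 check below).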
std::unique_ptr syncObj; - if (config_.movingSync) { + if (config_.movingSync && getNumTiers() == 1) { + // TODO: use moving-bit synchronization for single tier as well if (!oldItem.isChainedItem()) { syncObj = config_.movingSync(oldItem.getKey()); } else { @@ -2553,7 +3161,7 @@ bool CacheAllocator::tryMovingForSlabRelease( // item is still valid. const std::string parentKey = oldItem.asChainedItem().getParentItem(compressor_).getKey().str(); - if (oldItem.isOnlyExclusive()) { + if (oldItem.isOnlyMoving()) { // If chained item no longer has a refcount, its parent is already // being released, so we abort this try to moving. return false; @@ -2570,67 +3178,59 @@ bool CacheAllocator::tryMovingForSlabRelease( } } - return oldItem.isChainedItem() - ? moveChainedItem(oldItem.asChainedItem(), newItemHdl) - : moveRegularItem(oldItem, newItemHdl); + // TODO: we can unify move*Item and move*ItemWithSync by always + // using the moving bit to block readers. + if (getNumTiers() == 1) { + return oldItem.isChainedItem() + ? moveChainedItem(oldItem.asChainedItem(), newItemHdl) + : moveRegularItem(oldItem, newItemHdl); + } else { + if (oldItem.isChainedItem() || oldItem.hasChainedItem()) { + // TODO: add support for chained items + return false; + } else { + // the move can fail if another thread calls insertOrReplace; in + // this case oldItem is no longer valid (not accessible) - it gets + // removed from the MMContainer and evictForSlabRelease will send + // it back to the allocator + bool ret = moveRegularItemWithSync(oldItem, newItemHdl); + if (!ret) { + // we failed to move - newItemHdl was released back to the + // allocator by moveRegularItemWithSync, but oldItem is not + // accessible and no longer valid - we need to clean it up here + XDCHECK(!oldItem.isAccessible()); + oldItem.markForEvictionWhenMoving(); + unlinkItemForEviction(oldItem); + wakeUpWaiters(oldItem, {}); + } else { + removeFromMMContainer(oldItem); + } + return ret; + } + } +} + +template +void CacheAllocator::wakeUpWaiters(Item& item, WriteHandle handle) +{ + // readers do not block on 'moving' items in case there is only one tier + if (getNumTiers() > 1) { + wakeUpWaitersLocked(item.getKey(), std::move(handle)); + } } template void CacheAllocator::evictForSlabRelease( const SlabReleaseContext& ctx, Item& item, util::Throttler& throttler) { auto startTime = util::getCurrentTimeSec(); + while (true) { + XDCHECK(item.isMoving()); stats_.numEvictionAttempts.inc(); - // if the item is already in a state where only the exclusive bit is set, - // nothing needs to be done. We simply need to unmark exclusive bit and free - // the item. - if (item.isOnlyExclusive()) { - item.unmarkExclusive(); - const auto res = - releaseBackToAllocator(item, RemoveContext::kNormal, false); - XDCHECK(ReleaseRes::kReleased == res); - return; - } - - // Since we couldn't move, we now evict this item. Owning handle will be - // the item's handle for regular/normal items and will be the parent - // handle for chained items. - auto owningHandle = - item.isChainedItem() - ? evictChainedItemForSlabRelease(item.asChainedItem()) - : evictNormalItemForSlabRelease(item); - - // we managed to evict the corresponding owner of the item and have the - // last handle for the owner.
- if (owningHandle) { - const auto allocInfo = - allocator_->getAllocInfo(static_cast(&item)); - if (owningHandle->hasChainedItem()) { - (*stats_.chainedItemEvictions)[allocInfo.poolId][allocInfo.classId] - .inc(); - } else { - (*stats_.regularItemEvictions)[allocInfo.poolId][allocInfo.classId] - .inc(); - } - - stats_.numEvictionSuccesses.inc(); - - // we have the last handle. no longer need to hold on to the exclusive bit - item.unmarkExclusive(); - - // manually decrement the refcount to call releaseBackToAllocator - const auto ref = decRef(*owningHandle); - XDCHECK(ref == 0); - const auto res = releaseBackToAllocator(*owningHandle.release(), - RemoveContext::kEviction, false); - XDCHECK(res == ReleaseRes::kReleased); - return; - } - if (shutDownInProgress_) { - item.unmarkExclusive(); - allocator_->abortSlabRelease(ctx); + auto ref = unmarkMovingAndWakeUpWaiters(item, {}); + allocator_[getTierId(item)]->abortSlabRelease(ctx); throw exception::SlabReleaseAborted( folly::sformat("Slab Release aborted while trying to evict" " Item: {} Pool: {}, Class: {}.", @@ -2649,268 +3249,141 @@ void CacheAllocator::evictForSlabRelease( .toString()) : ""); }); - } -} - -template -typename CacheAllocator::WriteHandle -CacheAllocator::advanceIteratorAndTryEvictRegularItem( - MMContainer& mmContainer, EvictionIterator& itr) { - // we should flush this to nvmcache if it is not already present in nvmcache - // and the item is not expired. - Item& item = *itr; - const bool evictToNvmCache = shouldWriteToNvmCache(item); - - auto token = evictToNvmCache ? nvmCache_->createPutToken(item.getKey()) - : typename NvmCacheT::PutToken{}; - - // record the in-flight eviciton. If not, we move on to next item to avoid - // stalling eviction. - if (evictToNvmCache && !token.isValid()) { - ++itr; - stats_.evictFailConcurrentFill.inc(); - return WriteHandle{}; - } - - // If there are other accessors, we should abort. Acquire a handle here since - // if we remove the item from both access containers and mm containers - // below, we will need a handle to ensure proper cleanup in case we end up - // not evicting this item - auto evictHandle = accessContainer_->removeIf(item, &itemExclusivePredicate); - if (!evictHandle) { - ++itr; - stats_.evictFailAC.inc(); - return evictHandle; - } - - mmContainer.remove(itr); - XDCHECK_EQ(reinterpret_cast(evictHandle.get()), - reinterpret_cast(&item)); - XDCHECK(!evictHandle->isInMMContainer()); - XDCHECK(!evictHandle->isAccessible()); - - // Invalidate iterator since later on if we are not evicting this - // item, we may need to rely on the handle we created above to ensure - // proper cleanup if the item's raw refcount has dropped to 0. - // And since this item may be a parent item that has some child items - // in this very same mmContainer, we need to make sure we drop this - // exclusive iterator so we can gain access to it when we're cleaning - // up the child items - itr.destroy(); - - // Ensure that there are no accessors after removing from the access - // container - XDCHECK(evictHandle->getRefCount() == 1); - - if (evictToNvmCache && shouldWriteToNvmCacheExclusive(item)) { - XDCHECK(token.isValid()); - nvmCache_->put(evictHandle, std::move(token)); - } - return evictHandle; -} - -template -typename CacheAllocator::WriteHandle -CacheAllocator::advanceIteratorAndTryEvictChainedItem( - EvictionIterator& itr) { - XDCHECK(itr->isChainedItem()); - - ChainedItem* candidate = &itr->asChainedItem(); - ++itr; - - // The parent could change at any point through transferChain. 
However, if - // that happens, we would realize that the releaseBackToAllocator return - // kNotRecycled and we would try another chained item, leading to transient - // failure. - auto& parent = candidate->getParentItem(compressor_); + // if the item is already in a state where only the moving bit is set, + // nothing needs to be done. We simply need to call unmarkMoving and free + // the item. + if (item.isOnlyMoving()) { + auto ref = unmarkMovingAndWakeUpWaiters(item, {}); + XDCHECK(ref == 0); + const auto res = + releaseBackToAllocator(item, RemoveContext::kNormal, false); + XDCHECK(ReleaseRes::kReleased == res); + return; + } - const bool evictToNvmCache = shouldWriteToNvmCache(parent); + typename NvmCacheT::PutToken token; + Item* evicted; + if (item.isChainedItem()) { + auto& expectedParent = item.asChainedItem().getParentItem(compressor_); + + if (getNumTiers() == 1) { + // TODO: unify this with multi-tier implementation + // right now, taking a chained item lock here would lead to deadlock + const std::string parentKey = expectedParent.getKey().str(); + auto l = chainedItemLocks_.lockExclusive(parentKey); + + // check if the child is still in mmContainer and the expected parent is + // valid under the chained item lock. + if (expectedParent.getKey() != parentKey || !item.isInMMContainer() || + item.isOnlyMoving() || + &expectedParent != &item.asChainedItem().getParentItem(compressor_) || + !expectedParent.isAccessible() || !expectedParent.hasChainedItem()) { + continue; + } - auto token = evictToNvmCache ? nvmCache_->createPutToken(parent.getKey()) - : typename NvmCacheT::PutToken{}; + // search if the child is present in the chain + { + auto parentHandle = findInternal(parentKey); + if (!parentHandle || parentHandle != &expectedParent) { + continue; + } + + ChainedItem* head = nullptr; + { // scope for the handle + auto headHandle = findChainedItem(expectedParent); + head = headHandle ? &headHandle->asChainedItem() : nullptr; + } + + bool found = false; + while (head) { + if (head == &item) { + found = true; + break; + } + head = head->getNext(compressor_); + } + + if (!found) { + continue; + } + } + } - // if token is invalid, return. iterator is already advanced. - if (evictToNvmCache && !token.isValid()) { - stats_.evictFailConcurrentFill.inc(); - return WriteHandle{}; - } - // check if the parent exists in the hashtable and refcount is drained.
- auto parentHandle = - accessContainer_->removeIf(parent, &itemExclusivePredicate); - if (!parentHandle) { - stats_.evictFailParentAC.inc(); - return parentHandle; - } + evicted = &expectedParent; + token = createPutToken(*evicted); + if (evicted->markForEviction()) { + // unmark the child so it will be freed + item.unmarkMoving(); + unlinkItemForEviction(*evicted); + // wake up any readers that wait for the move to complete + // it's safe to do now, as we have the item marked for eviction and + // no other reader can be added to the waiters list + wakeUpWaiters(*evicted, {}); + } else { + // TODO: potential deadlock with markUseful for parent item + // for now, we do not block any reader on child items but this + // should probably be fixed + continue; + } + } else { + evicted = &item; - // Invalidate iterator since later on we may use the mmContainer - // associated with this iterator which cannot be done unless we - // drop this iterator - // - // This must be done once we know the parent is not nullptr. - // Since we can very well be the last holder of this parent item, - // which may have a chained item that is linked in this MM container. - itr.destroy(); + token = createPutToken(*evicted); + if (evicted->markForEvictionWhenMoving()) { + unlinkItemForEviction(*evicted); + wakeUpWaiters(*evicted, {}); + } else { + continue; + } + } - // Ensure we have the correct parent and we're the only user of the - // parent, then free it from access container. Otherwise, we abort - XDCHECK_EQ(reinterpret_cast(&parent), - reinterpret_cast(parentHandle.get())); - XDCHECK_EQ(1u, parent.getRefCount()); + if (token.isValid() && shouldWriteToNvmCacheExclusive(*evicted)) { + nvmCache_->put(*evicted, std::move(token)); + } - removeFromMMContainer(*parentHandle); + const auto allocInfo = + allocator_[getTierId(*evicted)]->getAllocInfo(static_cast(evicted)); - XDCHECK(!parent.isInMMContainer()); - XDCHECK(!parent.isAccessible()); + if (evicted->hasChainedItem()) { + (*stats_.chainedItemEvictions)[allocInfo.poolId][allocInfo.classId].inc(); + } else { + (*stats_.regularItemEvictions)[allocInfo.poolId][allocInfo.classId].inc(); + } - if (evictToNvmCache && shouldWriteToNvmCacheExclusive(*parentHandle)) { - XDCHECK(token.isValid()); - XDCHECK(parentHandle->hasChainedItem()); - nvmCache_->put(parentHandle, std::move(token)); + stats_.numEvictionSuccesses.inc(); + + XDCHECK(evicted->getRefCount() == 0); + const auto res = + releaseBackToAllocator(*evicted, RemoveContext::kEviction, false); + + if (getNumTiers() == 1) { + XDCHECK(res == ReleaseRes::kReleased); + } else { + const bool isAlreadyFreed = + !markMovingForSlabRelease(ctx, &item, throttler); + if (!isAlreadyFreed) { + continue; + } + } + + return; } - - return parentHandle; } template +template typename CacheAllocator::WriteHandle -CacheAllocator::evictNormalItemForSlabRelease(Item& item) { - XDCHECK(item.isExclusive()); - - if (item.isOnlyExclusive()) { - return WriteHandle{}; - } - - auto predicate = [](const Item& it) { return it.getRefCount() == 0; }; - - const bool evictToNvmCache = shouldWriteToNvmCache(item); - auto token = evictToNvmCache ? nvmCache_->createPutToken(item.getKey()) - : typename NvmCacheT::PutToken{}; - - // We remove the item from both access and mm containers. It doesn't matter - // if someone else calls remove on the item at this moment, the item cannot - // be freed as long as we have the exclusive bit set. - auto handle = accessContainer_->removeIf(item, std::move(predicate)); +CacheAllocator::removeIf(Item& item, Fn&& predicate) { + auto handle = accessContainer_->removeIf(item, std::forward(predicate)); - if (!handle) { - return handle; - } - - XDCHECK_EQ(reinterpret_cast(handle.get()), + if (handle) { + XDCHECK_EQ(reinterpret_cast(handle.get()), reinterpret_cast(&item)); - XDCHECK_EQ(1u, handle->getRefCount()); - removeFromMMContainer(item); - - // now that we are the only handle and we actually removed something from - // the RAM cache, we enqueue it to nvmcache.
- if (evictToNvmCache && shouldWriteToNvmCacheExclusive(item)) { - nvmCache_->put(handle, std::move(token)); + removeFromMMContainer(item); } return handle; } -template -typename CacheAllocator::WriteHandle -CacheAllocator::evictChainedItemForSlabRelease(ChainedItem& child) { - XDCHECK(child.isExclusive()); - - // We have the child marked as moving, but dont know anything about the - // state of the parent. Unlike the case of regular eviction where we are - // sure that the child is inside the MMContainer, ensuring its parent is - // valid, we can not make any assumptions here. We try to find the parent - // first through the access container and then verify that the parent's - // chain points to the child before cleaning up the parent. If the parent - // was in the process of being re-allocated or child was being removed - // concurrently, we would synchronize here on one of the checks. - Item& expectedParent = child.getParentItem(compressor_); - - // Grab exclusive lock since we are modifying the chain. at this point, we - // dont know the state of the parent. so we need to do some validity checks - // after we have the chained item lock to ensure that we got the lock off of - // a valid state. - const std::string parentKey = expectedParent.getKey().str(); - auto l = chainedItemLocks_.lockExclusive(parentKey); - - // check if the child is still in mmContainer and the expected parent is - // valid under the chained item lock. - if (expectedParent.getKey() != parentKey || !child.isInMMContainer() || - child.isOnlyExclusive() || - &expectedParent != &child.getParentItem(compressor_) || - !expectedParent.isAccessible() || !expectedParent.hasChainedItem()) { - return {}; - } - - // search if the child is present in the chain - auto parentHandle = findInternal(parentKey); - if (!parentHandle || parentHandle != &expectedParent) { - return {}; - } - - ChainedItem* head = nullptr; - { // scope for the handle - auto headHandle = findChainedItem(expectedParent); - head = headHandle ? &headHandle->asChainedItem() : nullptr; - } - - bool found = false; - while (head) { - if (head == &child) { - found = true; - break; - } - head = head->getNext(compressor_); - } - - if (!found) { - return {}; - } - - // if we found the child in the parent's chain, we remove it and ensure that - // the handle we obtained was the last one. Before that, create a put token - // to guard any racing cache find to avoid item re-appearing in NvmCache. - const bool evictToNvmCache = shouldWriteToNvmCache(expectedParent); - - auto token = evictToNvmCache - ? nvmCache_->createPutToken(expectedParent.getKey()) - : typename NvmCacheT::PutToken{}; - - if (!accessContainer_->removeIf(expectedParent, - parentEvictForSlabReleasePredicate)) { - return {}; - } - - // at this point, we should be the last handle owner - XDCHECK_EQ(1u, parentHandle->getRefCount()); - - // We remove the parent from both access and mm containers. It doesn't - // matter if someone else calls remove on the parent at this moment, it - // cannot be freed since we hold an active item handle - removeFromMMContainer(*parentHandle); - - // In case someone else had removed this chained item from its parent by now - // So we check again to see if it has been unlinked from its parent - if (!child.isInMMContainer() || child.isOnlyExclusive()) { - return {}; - } - - // check after removing from the MMContainer that the parent is still not - // being marked as moving. If parent is moving, it will release the child - // item and we will wait for that. 
- if (parentHandle->isExclusive()) { - return {}; - } - - // now that we are the only handle and we actually removed something from - // the RAM cache, we enqueue it to nvmcache. - if (evictToNvmCache && shouldWriteToNvmCacheExclusive(*parentHandle)) { - DCHECK(parentHandle->hasChainedItem()); - nvmCache_->put(parentHandle, std::move(token)); - } - - return parentHandle; -} - template bool CacheAllocator::removeIfExpired(const ReadHandle& handle) { if (!handle) { @@ -2919,18 +3392,11 @@ bool CacheAllocator::removeIfExpired(const ReadHandle& handle) { // We remove the item from both access and mm containers. // We want to make sure the caller is the only one holding the handle. - auto removedHandle = - accessContainer_->removeIf(*(handle.getInternal()), itemExpiryPredicate); - if (removedHandle) { - removeFromMMContainer(*(handle.getInternal())); - return true; - } - - return false; + return (bool)removeIf(*(handle.getInternal()), itemExpiryPredicate); } template -bool CacheAllocator::markExclusiveForSlabRelease( +bool CacheAllocator::markMovingForSlabRelease( const SlabReleaseContext& ctx, void* alloc, util::Throttler& throttler) { // MemoryAllocator::processAllocForRelease will execute the callback // if the item is not already free. So there are three outcomes here: @@ -2945,18 +3411,23 @@ bool CacheAllocator::markExclusiveForSlabRelease( // At first, we assume this item was already freed bool itemFreed = true; bool markedMoving = false; - const auto fn = [&markedMoving, &itemFreed](void* memory) { + TierId tid = getTierId(alloc); + auto numTiers = getNumTiers(); + const auto fn = [&markedMoving, &itemFreed, numTiers](void* memory) { // Since this callback is executed, the item is not yet freed itemFreed = false; Item* item = static_cast(memory); - if (item->markExclusive()) { + // TODO: for chained items, moving bit is only used to avoid + // freeing the item prematurely + auto failIfRefNotZero = numTiers > 1 && !item->isChainedItem(); + if (item->markMoving(failIfRefNotZero)) { markedMoving = true; } }; auto startTime = util::getCurrentTimeSec(); while (true) { - allocator_->processAllocForRelease(ctx, alloc, fn); + allocator_[tid]->processAllocForRelease(ctx, alloc, fn); // If item is already freed we give up trying to mark the item moving // and return false, otherwise if marked as moving, we return true. @@ -2971,7 +3442,7 @@ bool CacheAllocator::markExclusiveForSlabRelease( itemFreed = true; if (shutDownInProgress_) { - allocator_->abortSlabRelease(ctx); + allocator_[tid]->abortSlabRelease(ctx); throw exception::SlabReleaseAborted( folly::sformat("Slab Release aborted while still trying to mark" " as moving for Item: {}. Pool: {}, Class: {}.", @@ -2994,12 +3465,15 @@ template CCacheT* CacheAllocator::addCompactCache(folly::StringPiece name, size_t size, Args&&... 
args) { + if (getNumTiers() != 1) + throw std::runtime_error("TODO: compact cache for multi-tier Cache not supported."); + if (!config_.isCompactCacheEnabled()) { throw std::logic_error("Compact cache is not enabled"); } folly::SharedMutex::WriteHolder lock(compactCachePoolsLock_); - auto poolId = allocator_->addPool(name, size, {Slab::kSize}); + auto poolId = allocator_[0]->addPool(name, size, {Slab::kSize}); isCompactCachePool_[poolId] = true; auto ptr = std::make_unique( @@ -3108,12 +3582,15 @@ folly::IOBufQueue CacheAllocator::saveStateToIOBuf() { *metadata_.numChainedChildItems() = stats_.numChainedChildItems.get(); *metadata_.numAbortedSlabReleases() = stats_.numAbortedSlabReleases.get(); + // TODO: implement serialization for multiple tiers auto serializeMMContainers = [](MMContainers& mmContainers) { MMSerializationTypeContainer state; - for (unsigned int i = 0; i < mmContainers.size(); ++i) { + for (unsigned int i = 0; i < 1 /* TODO: */ ; ++i) { for (unsigned int j = 0; j < mmContainers[i].size(); ++j) { - if (mmContainers[i][j]) { - state.pools_ref()[i][j] = mmContainers[i][j]->saveState(); + for (unsigned int k = 0; k < mmContainers[i][j].size(); ++k) { + if (mmContainers[i][j][k]) { + state.pools_ref()[j][k] = mmContainers[i][j][k]->saveState(); + } } } } @@ -3123,7 +3600,8 @@ folly::IOBufQueue CacheAllocator::saveStateToIOBuf() { serializeMMContainers(mmContainers_); AccessSerializationType accessContainerState = accessContainer_->saveState(); - MemoryAllocator::SerializationType allocatorState = allocator_->saveState(); + // TODO: foreach allocator + MemoryAllocator::SerializationType allocatorState = allocator_[0]->saveState(); CCacheManager::SerializationType ccState = compactCacheManager_->saveState(); AccessSerializationType chainedItemAccessContainerState = @@ -3148,6 +3626,8 @@ bool CacheAllocator::stopWorkers(std::chrono::seconds timeout) { success &= stopPoolResizer(timeout); success &= stopMemMonitor(timeout); success &= stopReaper(timeout); + success &= stopBackgroundEvictor(timeout); + success &= stopBackgroundPromoter(timeout); return success; } @@ -3185,6 +3665,8 @@ CacheAllocator::shutDown() { (shmShutDownStatus == ShmShutDownRes::kSuccess); shmManager_.reset(); + // TODO: save per-tier state + if (shmShutDownSucceeded) { if (!nvmShutDownStatusOpt || *nvmShutDownStatusOpt) return ShutDownStatus::kSuccess; @@ -3248,22 +3730,26 @@ CacheAllocator::deserializeMMContainers( const auto container = deserializer.deserialize(); - MMContainers mmContainers; + /* TODO: right now, we create empty containers because deserialization + * only works for a single (topmost) tier. */ + MMContainers mmContainers{getNumTiers()}; for (auto& kvPool : *container.pools_ref()) { auto i = static_cast(kvPool.first); auto& pool = getPool(i); for (auto& kv : kvPool.second) { auto j = static_cast(kv.first); - MMContainerPtr ptr = - std::make_unique(kv.second, - compressor); - auto config = ptr->getConfig(); - config.addExtraConfig(config_.trackTailHits - ? pool.getAllocationClass(j).getAllocsPerSlab() - : 0); - ptr->setConfig(config); - mmContainers[i][j] = std::move(ptr); + for (TierId tid = 0; tid < getNumTiers(); tid++) { + MMContainerPtr ptr = + std::make_unique(kv.second, + compressor); + auto config = ptr->getConfig(); + config.addExtraConfig(config_.trackTailHits + ? pool.getAllocationClass(j).getAllocsPerSlab() + : 0); + ptr->setConfig(config); + mmContainers[tid][i][j] = std::move(ptr); + } } } // We need to drop the unevictableMMContainer in the desierializer. 
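The hunk above serializes only the topmost tier, as its TODO notes. For illustration, a possible multi-tier variant is sketched below; the per-tier thrift field `tiers_ref()` is an assumption for the sketch, not an API introduced by this patch:

// Sketch only: persist every tier's MMContainer state, keyed by tier id.
// Assumes the serialized type gains a tier dimension (tiers_ref).
auto serializeAllTiers = [](MMContainers& mmContainers) {
  MMSerializationTypeContainer state;
  for (unsigned int tid = 0; tid < mmContainers.size(); ++tid) {
    for (unsigned int pid = 0; pid < mmContainers[tid].size(); ++pid) {
      for (unsigned int cid = 0; cid < mmContainers[tid][pid].size(); ++cid) {
        if (mmContainers[tid][pid][cid]) {
          state.tiers_ref()[tid][pid][cid] =
              mmContainers[tid][pid][cid]->saveState();
        }
      }
    }
  }
  return state;
};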
@@ -3403,6 +3889,8 @@ GlobalCacheStats CacheAllocator::getGlobalCacheStats() const { ret.nvmUpTime = currTime - nvmCacheState_.getCreationTime(); ret.nvmCacheEnabled = nvmCache_ ? nvmCache_->isEnabled() : false; ret.reaperStats = getReaperStats(); + ret.evictionStats = getBackgroundMoverStats(MoverDir::Evict); + ret.promotionStats = getBackgroundMoverStats(MoverDir::Promote); ret.numActiveHandles = getNumActiveHandles(); ret.isNewRamCache = cacheCreationTime_ == cacheInstanceCreationTime_; @@ -3414,11 +3902,14 @@ template CacheMemoryStats CacheAllocator::getCacheMemoryStats() const { - const auto totalCacheSize = allocator_->getMemorySize(); - const auto configuredTotalCacheSize = allocator_->getMemorySizeInclAdvised(); - + size_t totalCacheSize = 0; + size_t configuredTotalCacheSize = 0; + for (auto& allocator : allocator_) { + totalCacheSize += allocator->getMemorySize(); + configuredTotalCacheSize += allocator->getMemorySizeInclAdvised(); + } auto addSize = [this](size_t a, PoolId pid) { - return a + allocator_->getPool(pid).getPoolSize(); + return a + allocator_[currentTier()]->getPool(pid).getPoolSize(); }; const auto regularPoolIds = getRegularPoolIds(); const auto ccCachePoolIds = getCCachePoolIds(); @@ -3431,9 +3922,9 @@ CacheMemoryStats CacheAllocator::getCacheMemoryStats() const { configuredTotalCacheSize, configuredRegularCacheSize, configuredCompactCacheSize, - allocator_->getAdvisedMemorySize(), + allocator_[currentTier()]->getAdvisedMemorySize(), memMonitor_ ? memMonitor_->getMaxAdvisePct() : 0, - allocator_->getUnreservedMemorySize(), + allocator_[currentTier()]->getUnreservedMemorySize(), nvmCache_ ? nvmCache_->getSize() : 0, util::getMemAvailable(), util::getRSSBytes()}; @@ -3586,6 +4077,64 @@ bool CacheAllocator::startNewReaper( return true; } +template +auto CacheAllocator::getAssignedMemoryToBgWorker(size_t evictorId, size_t numWorkers, TierId tid) +{ + std::vector assignedMemory; + // TODO: for now, only evict from tier 0 + auto pools = filterCompactCachePools(allocator_[tid]->getPoolIds()); + for (const auto pid : pools) { + const auto& mpStats = getPoolByTid(pid, tid).getStats(); + for (const auto cid : mpStats.classIds) { + if (backgroundWorkerId(tid, pid, cid, numWorkers) == evictorId) { + assignedMemory.emplace_back(tid, pid, cid); + } + } + } + return assignedMemory; +} + +template +bool CacheAllocator::startNewBackgroundEvictor( + std::chrono::milliseconds interval, + std::shared_ptr strategy, + size_t threads) { + XDCHECK(threads > 0); + backgroundEvictor_.resize(threads); + bool result = true; + + for (size_t i = 0; i < threads; i++) { + auto ret = startNewWorker("BackgroundEvictor" + std::to_string(i), backgroundEvictor_[i], interval, strategy, MoverDir::Evict); + result = result && ret; + + if (result) { + backgroundEvictor_[i]->setAssignedMemory(getAssignedMemoryToBgWorker(i, backgroundEvictor_.size(), 0)); + } + } + return result; +} + +template +bool CacheAllocator::startNewBackgroundPromoter( + std::chrono::milliseconds interval, + std::shared_ptr strategy, + size_t threads) { + XDCHECK(threads > 0); + XDCHECK(getNumTiers() > 1); + backgroundPromoter_.resize(threads); + bool result = true; + + for (size_t i = 0; i < threads; i++) { + auto ret = startNewWorker("BackgroundPromoter" + std::to_string(i), backgroundPromoter_[i], interval, strategy, MoverDir::Promote); + result = result && ret; + + if (result) { + backgroundPromoter_[i]->setAssignedMemory(getAssignedMemoryToBgWorker(i,
backgroundPromoter_.size(), 1)); + } + } + return result; +} + template bool CacheAllocator::stopPoolRebalancer( std::chrono::seconds timeout) { @@ -3613,6 +4162,26 @@ bool CacheAllocator::stopReaper(std::chrono::seconds timeout) { return stopWorker("Reaper", reaper_, timeout); } +template +bool CacheAllocator::stopBackgroundEvictor(std::chrono::seconds timeout) { + bool result = true; + for (size_t i = 0; i < backgroundEvictor_.size(); i++) { + auto ret = stopWorker("BackgroundEvictor", backgroundEvictor_[i], timeout); + result = result && ret; + } + return result; +} + +template +bool CacheAllocator::stopBackgroundPromoter(std::chrono::seconds timeout) { + bool result = true; + for (size_t i = 0; i < backgroundPromoter_.size(); i++) { + auto ret = stopWorker("BackgroundPromoter", backgroundPromoter_[i], timeout); + result = result && ret; + } + return result; +} + template bool CacheAllocator::cleanupStrayShmSegments( const std::string& cacheDir, bool posix) { @@ -3621,6 +4190,8 @@ bool CacheAllocator::cleanupStrayShmSegments( // cache dir exists. clean up only if there are no other processes // attached. if another process was attached, the following would fail. ShmManager::cleanup(cacheDir, posix); + + // TODO: cleanup per-tier state } catch (const std::exception& e) { XLOGF(ERR, "Error cleaning up {}. Exception: ", cacheDir, e.what()); return false; @@ -3630,7 +4201,8 @@ bool CacheAllocator::cleanupStrayShmSegments( // Any other concurrent process can not be attached to the segments or // even if it does, we want to mark it for destruction. ShmManager::removeByName(cacheDir, detail::kShmInfoName, posix); - ShmManager::removeByName(cacheDir, detail::kShmCacheName, posix); + ShmManager::removeByName(cacheDir, detail::kShmCacheName + + std::to_string(0 /* TODO: per tier */), posix); ShmManager::removeByName(cacheDir, detail::kShmHashTableName, posix); ShmManager::removeByName(cacheDir, detail::kShmChainedItemHashTableName, posix); @@ -3644,14 +4216,16 @@ uint64_t CacheAllocator::getItemPtrAsOffset(const void* ptr) { // the two differ (e.g. Mac OS 12) - causing templating instantiation // errors downstream. + auto tid = getTierId(ptr); + // if this succeeeds, the address is valid within the cache. 
- allocator_->getAllocInfo(ptr); + allocator_[tid]->getAllocInfo(ptr); if (!isOnShm_ || !shmManager_) { throw std::invalid_argument("Shared memory not used"); } - const auto& shm = shmManager_->getShmByName(detail::kShmCacheName); + const auto& shm = shmManager_->getShmByName(detail::kShmCacheName + std::to_string(tid)); return reinterpret_cast(ptr) - reinterpret_cast(shm.getCurrentMapping().addr); diff --git a/cachelib/allocator/CacheAllocator.h b/cachelib/allocator/CacheAllocator.h index ed0096390a..39a1c4881b 100644 --- a/cachelib/allocator/CacheAllocator.h +++ b/cachelib/allocator/CacheAllocator.h @@ -22,6 +22,8 @@ #include #include #include +#include +#include #include #include @@ -38,6 +40,7 @@ #include #pragma GCC diagnostic pop +#include "cachelib/allocator/BackgroundMover.h" #include "cachelib/allocator/CCacheManager.h" #include "cachelib/allocator/Cache.h" #include "cachelib/allocator/CacheAllocatorConfig.h" @@ -710,6 +713,11 @@ class CacheAllocator : public CacheBase { // @return the full usable size for this item uint32_t getUsableSize(const Item& item) const; + // gets the allocation class assigned to BG worker + auto getAssignedMemoryToBgWorker(size_t evictorId, size_t numWorkers, TierId tid); + bool shouldWakeupBgEvictor(TierId tid, PoolId pid, ClassId cid); + size_t backgroundWorkerId(TierId tid, PoolId pid, ClassId cid, size_t numWorkers); + // Get a random item from memory // This is useful for profiling and sampling cachelib managed memory // @@ -806,7 +814,7 @@ class CacheAllocator : public CacheBase { // @param config new config for the pool // // @throw std::invalid_argument if the poolId is invalid - void overridePoolConfig(PoolId pid, const MMConfig& config); + void overridePoolConfig(TierId tid, PoolId pid, const MMConfig& config); // update an existing pool's rebalance strategy // @@ -847,8 +855,9 @@ class CacheAllocator : public CacheBase { // @return true if the operation succeeded. false if the size of the pool is // smaller than _bytes_ // @throw std::invalid_argument if the poolId is invalid. + // TODO: should call shrinkPool for specific tier? bool shrinkPool(PoolId pid, size_t bytes) { - return allocator_->shrinkPool(pid, bytes); + return allocator_[currentTier()]->shrinkPool(pid, bytes); } // grow an existing pool by _bytes_. This will fail if there is no @@ -857,8 +866,9 @@ class CacheAllocator : public CacheBase { // @return true if the pool was grown. false if the necessary number of // bytes were not available. // @throw std::invalid_argument if the poolId is invalid. + // TODO: should call growPool for specific tier? bool growPool(PoolId pid, size_t bytes) { - return allocator_->growPool(pid, bytes); + return allocator_[currentTier()]->growPool(pid, bytes); } // move bytes from one pool to another. The source pool should be at least @@ -871,7 +881,7 @@ class CacheAllocator : public CacheBase { // correct size to do the transfer. 
// @throw std::invalid_argument if src or dest is invalid pool
bool resizePools(PoolId src, PoolId dest, size_t bytes) override {
- return allocator_->resizePools(src, dest, bytes);
+ return allocator_[currentTier()]->resizePools(src, dest, bytes);
}
// Add a new compact cache with given name and size @@ -1053,6 +1063,11 @@ class CacheAllocator : public CacheBase { // @param reaperThrottleConfig throttling config bool startNewReaper(std::chrono::milliseconds interval, util::Throttler::Config reaperThrottleConfig);
+
+ bool startNewBackgroundPromoter(std::chrono::milliseconds interval,
+ std::shared_ptr strategy, size_t threads);
+ bool startNewBackgroundEvictor(std::chrono::milliseconds interval,
+ std::shared_ptr strategy, size_t threads);
// Stop existing workers with a timeout bool stopPoolRebalancer(std::chrono::seconds timeout = std::chrono::seconds{ @@ -1062,6 +1077,8 @@ 0}); bool stopMemMonitor(std::chrono::seconds timeout = std::chrono::seconds{0}); bool stopReaper(std::chrono::seconds timeout = std::chrono::seconds{0});
+ bool stopBackgroundEvictor(std::chrono::seconds timeout = std::chrono::seconds{0});
+ bool stopBackgroundPromoter(std::chrono::seconds timeout = std::chrono::seconds{0});
// Set pool optimization to either true or false // @@ -1076,12 +1093,13 @@ // @throw std::invalid_argument if the memory does not belong to this // cache allocator AllocInfo getAllocInfo(const void* memory) const {
- return allocator_->getAllocInfo(memory);
+ return allocator_[getTierId(memory)]->getAllocInfo(memory);
}
// return the ids for the set of existing pools in this cache. std::set getPoolIds() const override final {
- return allocator_->getPoolIds();
+ // all tiers have the same pool ids. TODO: deduplicate
+ return allocator_[0]->getPoolIds();
}
// return a list of pool ids that are backing compact caches. This includes @@ -1093,18 +1111,22 @@ // return the pool with specified id. const MemoryPool& getPool(PoolId pid) const override final {
- return allocator_->getPool(pid);
+ return allocator_[currentTier()]->getPool(pid);
+ }
+
+ const MemoryPool& getPoolByTid(PoolId pid, TierId tid) const override final {
+ return allocator_[tid]->getPool(pid);
}
// calculate the number of slabs to be advised/reclaimed in each pool PoolAdviseReclaimData calcNumSlabsToAdviseReclaim() override final { auto regularPoolIds = getRegularPoolIds();
- return allocator_->calcNumSlabsToAdviseReclaim(regularPoolIds);
+ return allocator_[currentTier()]->calcNumSlabsToAdviseReclaim(regularPoolIds);
}
// update number of slabs to advise in the cache void updateNumSlabsToAdvise(int32_t numSlabsToAdvise) override final {
- allocator_->updateNumSlabsToAdvise(numSlabsToAdvise);
+ allocator_[currentTier()]->updateNumSlabsToAdvise(numSlabsToAdvise);
}
// returns a valid PoolId corresponding to the name or kInvalidPoolId if the @@ -1112,8 +1134,9 @@ PoolId getPoolId(folly::StringPiece name) const noexcept; // returns the pool's name by its poolId.
- std::string getPoolName(PoolId poolId) const override {
- return allocator_->getPoolName(poolId);
+ std::string getPoolName(PoolId poolId) const {
+ // all tiers have the same pool names.
+ return allocator_[0]->getPoolName(poolId);
}
// get stats related to all kinds of slab release events. @@ -1145,6 +1168,52 @@ auto stats = reaper_ ?
reaper_->getStats() : ReaperStats{};
return stats;
}
+
+ // returns the background mover stats
+ BackgroundMoverStats getBackgroundMoverStats(MoverDir direction) const {
+ auto stats = BackgroundMoverStats{};
+ if (direction == MoverDir::Evict) {
+ for (auto &bg : backgroundEvictor_)
+ stats += bg->getStats();
+ } else if (direction == MoverDir::Promote) {
+ for (auto &bg : backgroundPromoter_)
+ stats += bg->getStats();
+ }
+ return stats;
+ }
+
+ std::map>>
+ getBackgroundMoverClassStats(MoverDir direction) const {
+ std::map>> stats;
+
+ if (direction == MoverDir::Evict) {
+ for (auto &bg : backgroundEvictor_) {
+ for (auto &tid : bg->getClassStats()) {
+ for (auto &pid : tid.second) {
+ for (auto &cid : pid.second) {
+ stats[tid.first][pid.first][cid.first] += cid.second;
+ }
+ }
+ }
+ }
+ } else if (direction == MoverDir::Promote) {
+ for (auto &bg : backgroundPromoter_) {
+ for (auto &tid : bg->getClassStats()) {
+ for (auto &pid : tid.second) {
+ for (auto &cid : pid.second) {
+ stats[tid.first][pid.first][cid.first] += cid.second;
+ }
+ }
+ }
+ }
+ }
+
+ return stats;
+ }
+
// return the LruType of an item typename MMType::LruType getItemLruType(const Item& item) const; @@ -1159,6 +1228,9 @@ // whether it is object-cache bool isObjectCache() const override final { return false; }
+ // combined pool size for all memory tiers
+ size_t getPoolSize(PoolId pid) const;
+
// pool stats by pool id PoolStats getPoolStats(PoolId pid) const override final; @@ -1175,6 +1247,9 @@ // return cache's memory usage stats CacheMemoryStats getCacheMemoryStats() const override final;
+ // return stats for Allocation Class
+ ACStats getACStats(TierId tid, PoolId pid, ClassId cid) const override final;
+
// return the nvm cache stats map util::StatsMap getNvmCacheStatsMap() const override final; @@ -1284,6 +1359,7 @@ sizeof(typename RefcountWithFlags::Value) + sizeof(uint32_t) + sizeof(uint32_t) + sizeof(KAllocation)) == sizeof(Item), "vtable overhead");
+ // Check for CompressedPtr single/multi tier support
static_assert(32 == sizeof(Item), "item overhead is 32 bytes"); // make sure there is no overhead in ChainedItem on top of a regular Item @@ -1308,7 +1384,7 @@ private: // wrapper around Item's refcount and active handle tracking
- FOLLY_ALWAYS_INLINE void incRef(Item& it);
+ FOLLY_ALWAYS_INLINE RefcountWithFlags::incResult incRef(Item& it, bool failIfMoving);
FOLLY_ALWAYS_INLINE RefcountWithFlags::Value decRef(Item& it); // drops the refcount and if needed, frees the allocation back to the memory @@ -1359,6 +1435,12 @@ bool nascent = false, const Item* toRecycle = nullptr);
+ // Must be called by the thread which called markForEviction and
+ // succeeded. After this call, the item is unlinked from Access and
+ // MM Containers. The item is no longer marked as exclusive and its
+ // ref count is 0 - it's available for recycling.
+ void unlinkItemForEviction(Item& it);
+
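A caller that wants one flat, per-class view across all mover threads can fold the nested map this aggregation returns; a small sketch (assuming the leaf values are plain counters, with the template arguments that the patch elides):

// Sum the items evicted per class across all background evictor threads.
uint64_t totalEvicted = 0;
for (const auto& [tid, pools] : cache.getBackgroundMoverClassStats(MoverDir::Evict)) {
  for (const auto& [pid, classes] : pools) {
    for (const auto& [cid, count] : classes) {
      totalEvicted += count;
    }
  }
}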
// acquires a handle on the item. returns an empty handle if it is null.
// @param it pointer to an item
// @return WriteHandle return a handle to this item
@@ -1378,11 +1460,14 @@ using MMContainerPtr = std::unique_ptr; using MMContainers =
- std::array,
- MemoryPoolManager::kMaxPools>;
+ std::vector,
+ MemoryPoolManager::kMaxPools>>;
void createMMContainers(const PoolId pid, MMConfig config);
+ TierId getTierId(const Item& item) const;
+ TierId getTierId(const void* ptr) const;
+
// acquire the MMContainer corresponding to the Item's class and pool. // // @return pointer to the MMContainer. @@ -1390,7 +1475,12 @@ // allocation from the memory allocator. MMContainer& getMMContainer(const Item& item) const noexcept;
- MMContainer& getMMContainer(PoolId pid, ClassId cid) const noexcept;
+ MMContainer& getMMContainer(TierId tid, PoolId pid, ClassId cid) const noexcept;
+
+ // Get stats of the specified pid and cid.
+ // If such mmcontainer is not valid (pool id or class id out of bounds)
+ // or the mmcontainer is not initialized, return an empty stat.
+ MMContainerStat getMMContainerStat(TierId tid, PoolId pid, ClassId cid) const noexcept;
// create a new cache allocation. The allocation can be initialized appropriately and made accessible through insert or insertOrReplace. @@ -1420,7 +1510,20 @@ Key key, uint32_t size, uint32_t creationTime,
- uint32_t expiryTime);
+ uint32_t expiryTime,
+ bool fromBgThread = false);
+
+ // create a new cache allocation on a specific memory tier.
+ // For description see allocateInternal.
+ //
+ // @param tid id of the memory tier
+ WriteHandle allocateInternalTier(TierId tid,
+ PoolId id,
+ Key key,
+ uint32_t size,
+ uint32_t creationTime,
+ uint32_t expiryTime,
+ bool fromBgThread);
// Allocate a chained item // @@ -1448,17 +1551,17 @@ // @return handle to the parent item if the validations pass // otherwise, an empty Handle is returned. //
- ReadHandle validateAndGetParentHandleForChainedMoveLocked(
+ WriteHandle validateAndGetParentHandleForChainedMoveLocked(
const ChainedItem& item, const Key& parentKey);
// Given an existing item, allocate a new one for the // existing one to later be moved into. //
- // @param oldItem the item we want to allocate a new item for
+ // @param item reference to the item we want to allocate a new item for
//
// @return handle to the newly allocated item //
- WriteHandle allocateNewItemForOldItem(const Item& oldItem);
+ WriteHandle allocateNewItemForOldItem(const Item& item);
// internal helper that grabs a refcounted handle to the item. This does // not record the access to reflect in the mmContainer. @@ -1507,12 +1610,21 @@ // not exist. FOLLY_ALWAYS_INLINE WriteHandle findFastImpl(Key key, AccessMode mode);
+ // Moves a regular item to a different memory tier.
+ //
+ // @param oldItem Reference to the item being moved
+ // @param newItemHdl Reference to the handle of the new item being moved into
+ //
+ // @return true If the move was completed, and the containers were updated
+ // successfully.
+ bool moveRegularItemWithSync(Item& oldItem, WriteHandle& newItemHdl);
+
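allocateInternalTier suggests the natural allocation flow across tiers: try the topmost tier first and fall back downward. A hypothetical simplification of how allocateInternal could chain the per-tier attempts (not the patch's exact code):

// Try each tier in order until one can satisfy the allocation.
WriteHandle allocateAcrossTiers(PoolId pid, Key key, uint32_t size,
                                uint32_t creationTime, uint32_t expiryTime) {
  for (TierId tid = 0; tid < getNumTiers(); ++tid) {
    if (auto handle = allocateInternalTier(tid, pid, key, size, creationTime,
                                           expiryTime, false /* fromBgThread */)) {
      return handle;
    }
  }
  return WriteHandle{}; // every tier was full and eviction failed
}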
// Moves a regular item to a different slab. This should only be used during
// slab release after the item's exclusive bit has been set. The user supplied
// callback is responsible for copying the contents and fixing the semantics
// of chained item.
//
- // @param oldItem Reference to the item being moved
+ // @param oldItem item being moved
// @param newItemHdl Reference to the handle of the new item being moved into
//
// @return true If the move was completed, and the containers were updated
// successfully.
@@ -1592,6 +1704,10 @@ // false if the item is not in MMContainer bool removeFromMMContainer(Item& item);
+ using EvictionIterator = typename MMContainer::LockedIterator;
+
+ WriteHandle acquire(EvictionIterator& it) { return acquire(it.get()); }
+
// Replaces an item in the MMContainer with another item, at the same // position. // @@ -1602,6 +1718,8 @@ // destination item did not exist in the container, or if the // source item already existed. bool replaceInMMContainer(Item& oldItem, Item& newItem);
+ bool replaceInMMContainer(Item* oldItem, Item& newItem);
+ bool replaceInMMContainer(EvictionIterator& oldItemIt, Item& newItem);
// Replaces an item in the MMContainer with another item, at the same // position. Or, if the two chained items belong to two different MM @@ -1658,28 +1776,41 @@ // @param pid the id of the pool to look for evictions inside // @param cid the id of the class to look for evictions inside // @return An evicted item or nullptr if there is no suitable candidate.
- Item* findEviction(PoolId pid, ClassId cid);
+ Item* findEviction(TierId tid, PoolId pid, ClassId cid);
- using EvictionIterator = typename MMContainer::LockedIterator;
+ // Try to move the item down to the next memory tier
+ //
+ // @param tid current tier ID of the item
+ // @param pid the pool ID the item belongs to.
+ // @param item the item to evict
+ //
+ // @return valid handle to the item. This will be the last
+ // handle to the item. On failure an empty handle.
+ WriteHandle tryEvictToNextMemoryTier(TierId tid, PoolId pid, Item& item, bool fromBgThread);
+
+ WriteHandle tryPromoteToNextMemoryTier(TierId tid, PoolId pid, Item& item, bool fromBgThread);
+
+ WriteHandle tryPromoteToNextMemoryTier(Item& item, bool fromBgThread);
- // Advance the current iterator and try to evict a regular item
+ // Wakes up waiters if there are any
//
- // @param mmContainer the container to look for evictions.
- // @param itr iterator holding the item
+ // @param item wakes waiters that are waiting on that item
+ // @param handle handle to pass to the waiters
+ void wakeUpWaiters(Item& item, WriteHandle handle);
+
+ // Unmarks item as moving and wakes up any waiters waiting on that item
//
- // @return valid handle to regular item on success. This will be the last
- // handle to the item. On failure an empty handle.
- WriteHandle advanceIteratorAndTryEvictRegularItem(MMContainer& mmContainer,
- EvictionIterator& itr);
+ // @param item wakes waiters that are waiting on that item
+ // @param handle handle to pass to the waiters
+ typename RefcountWithFlags::Value unmarkMovingAndWakeUpWaiters(Item &item, WriteHandle handle);
- // Advance the current iterator and try to evict a chained item
- // Iterator may also be reset during the course of this function
+ // Try to move the item down to the next memory tier
//
- // @param itr iterator holding the item
+ // @param item the item to evict
//
- // @return valid handle to the parent item on success. This will be the last
- // handle to the item
- WriteHandle advanceIteratorAndTryEvictChainedItem(EvictionIterator& itr);
+ // @return valid handle to the item. This will be the last
+ // handle to the item.
On failure an empty handle.
+ WriteHandle tryEvictToNextMemoryTier(Item& item, bool fromBgThread);
// Deserialize CacheAllocatorMetadata and verify the version // @@ -1693,7 +1824,7 @@ const typename Item::PtrCompressor& compressor); unsigned int reclaimSlabs(PoolId id, size_t numSlabs) final {
- return allocator_->reclaimSlabsAndGrow(id, numSlabs);
+ return allocator_[currentTier()]->reclaimSlabsAndGrow(id, numSlabs);
}
FOLLY_ALWAYS_INLINE EventTracker* getEventTracker() const { @@ -1752,26 +1883,27 @@ const void* hint = nullptr) final; // @param releaseContext slab release context
- void releaseSlabImpl(const SlabReleaseContext& releaseContext);
+ void releaseSlabImpl(TierId tid, const SlabReleaseContext& releaseContext);
// @return true when successfully marked as moving, // false when this item has already been freed
- bool markExclusiveForSlabRelease(const SlabReleaseContext& ctx,
- void* alloc,
- util::Throttler& throttler);
+ bool markMovingForSlabRelease(const SlabReleaseContext& ctx,
+ void* alloc,
+ util::Throttler& throttler);
// "Move" (by copying) the content in this item to another memory // location by invoking the move callback. // // // @param ctx slab release context
- // @param item old item to be moved elsewhere
+ // @param oldItem old item to be moved elsewhere
+ // @param handle handle to the item or to its parent (if chained)
// @param throttler slow this function down as not to take too much cpu // // @return true if the item has been moved // false if we have exhausted moving attempts bool moveForSlabRelease(const SlabReleaseContext& ctx,
- Item& item,
+ Item& oldItem,
util::Throttler& throttler);
// "Move" (by copying) the content in this item to another memory @@ -1794,18 +1926,13 @@ Item& item, util::Throttler& throttler);
- // Helper function to evict a normal item for slab release
- //
- // @return last handle for corresponding to item on success. empty handle on
- // failure. caller can retry if needed.
- WriteHandle evictNormalItemForSlabRelease(Item& item);
+ typename NvmCacheT::PutToken createPutToken(Item& item);
- // Helper function to evict a child item for slab release
- // As a side effect, the parent item is also evicted
+ // Helper function to remove an item if the predicate is true.
//
- // @return last handle to the parent item of the child on success. empty
- // handle on failure. caller can retry.
- WriteHandle evictChainedItemForSlabRelease(ChainedItem& item);
+ // @return last handle to the item on success. empty handle on failure.
+ template
+ WriteHandle removeIf(Item& item, Fn&& predicate);
// Helper function to remove an item if expired. // @@ -1824,10 +1951,141 @@ // primitives. So we consciously exempt ourselves here from TSAN data race // detection. folly::annotate_ignore_thread_sanitizer_guard g(__FILE__, __LINE__);
- auto slabsSkipped = allocator_->forEachAllocation(std::forward(f));
+ auto slabsSkipped = allocator_[currentTier()]->forEachAllocation(std::forward(f));
stats().numReaperSkippedSlabs.add(slabsSkipped);
}
+ // exposed for the background evictor to iterate through the memory and evict
+ // in batch. This should improve the insertion path for tiered memory configs.
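The worker threads presumably glue the strategy and this traversal together: the strategy sizes the batches, and the traversal performs them. A hypothetical single pass of an evictor thread (the descriptor type name is an assumption, the structured binding mirrors the strategies shown later in this patch):

// One pass of a background evictor: ask the strategy how much to evict per
// allocation class, then traverse each class and evict that many items.
void runEvictionPass(CacheT& cache, BackgroundMoverStrategy& strategy,
                     const std::vector<MemoryDescriptorType>& assigned) {
  auto batches = strategy.calculateBatchSizes(cache, assigned);
  for (size_t i = 0; i < assigned.size(); ++i) {
    const auto& [tid, pid, cid] = assigned[i];
    if (batches[i] > 0) {
      cache.traverseAndEvictItems(tid, pid, cid, batches[i]);
    }
  }
}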
+ size_t traverseAndEvictItems(unsigned int tid, unsigned int pid, unsigned int cid, size_t batch) {
+ auto& mmContainer = getMMContainer(tid, pid, cid);
+ size_t evictions = 0;
+ size_t evictionCandidates = 0;
+ std::vector candidates;
+ candidates.reserve(batch);
+
+ size_t tries = 0;
+ mmContainer.withEvictionIterator([&tries, &candidates, &batch, &mmContainer, this](auto &&itr) {
+ while (candidates.size() < batch &&
+ (config_.maxEvictionPromotionHotness == 0 || tries < config_.maxEvictionPromotionHotness) &&
+ itr) {
+ tries++;
+ Item* candidate = itr.get();
+ XDCHECK(candidate);
+
+ if (candidate->isChainedItem()) {
+ throw std::runtime_error("Not supported for chained items");
+ }
+
+ if (candidate->markMoving(true)) {
+ mmContainer.remove(itr);
+ candidates.push_back(candidate);
+ } else {
+ ++itr;
+ }
+ }
+ });
+
+ for (Item *candidate : candidates) {
+ auto evictedToNext = tryEvictToNextMemoryTier(*candidate, true /* from BgThread */);
+ if (!evictedToNext) {
+ auto token = createPutToken(*candidate);
+
+ auto ret = candidate->markForEvictionWhenMoving();
+ XDCHECK(ret);
+
+ unlinkItemForEviction(*candidate);
+ // wake up any readers that wait for the move to complete
+ // it's safe to do now, as we have the item marked exclusive and
+ // no other reader can be added to the waiters list
+ wakeUpWaiters(*candidate, WriteHandle{});
+
+ if (token.isValid() && shouldWriteToNvmCacheExclusive(*candidate)) {
+ nvmCache_->put(*candidate, std::move(token));
+ }
+ } else {
+ evictions++;
+ XDCHECK(!evictedToNext->isMarkedForEviction() && !evictedToNext->isMoving());
+ XDCHECK(!candidate->isMarkedForEviction() && !candidate->isMoving());
+ XDCHECK(!candidate->isAccessible());
+ XDCHECK(candidate->getKey() == evictedToNext->getKey());
+
+ wakeUpWaiters(*candidate, std::move(evictedToNext));
+ }
+ XDCHECK(!candidate->isMarkedForEviction() && !candidate->isMoving());
+
+ if (candidate->hasChainedItem()) {
+ (*stats_.chainedItemEvictions)[pid][cid].inc();
+ } else {
+ (*stats_.regularItemEvictions)[pid][cid].inc();
+ }
+
+ // it's safe to recycle the item here as there are no more
+ // references and the item could not have been marked as moving
+ // by another thread since it's detached from the MMContainer.
+ auto res = releaseBackToAllocator(*candidate, RemoveContext::kEviction,
+ /* isNascent */ false);
+ XDCHECK(res == ReleaseRes::kReleased);
+ }
+ return evictions;
+ }
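Promotion, shown next, mirrors this loop with one notable difference: a promotion candidate stays linked in its MMContainer until the copy to the upper tier succeeds, while an eviction candidate is unlinked as soon as it is claimed. A condensed view of the two claim paths (illustrative only):

// Eviction: claim and unlink immediately; readers can no longer find the item.
if (candidate->markMoving(true)) { mmContainer.remove(itr); }
// Promotion: claim only; removal happens after tryPromoteToNextMemoryTier succeeds.
if (candidate->markMoving(true)) { /* item stays visible until promoted */ }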
+
+ size_t traverseAndPromoteItems(unsigned int tid, unsigned int pid, unsigned int cid, size_t batch) {
+ auto& mmContainer = getMMContainer(tid, pid, cid);
+ size_t promotions = 0;
+ std::vector candidates;
+ candidates.reserve(batch);
+
+ size_t tries = 0;
+
+ mmContainer.withPromotionIterator([&tries, &candidates, &batch, &mmContainer, this](auto &&itr){
+ while (candidates.size() < batch && (config_.maxEvictionPromotionHotness == 0 || tries < config_.maxEvictionPromotionHotness) && itr) {
+ tries++;
+ Item* candidate = itr.get();
+ XDCHECK(candidate);
+
+ if (candidate->isChainedItem()) {
+ throw std::runtime_error("Not supported for chained items");
+ }
+
+ // TODO: only allow it for read-only items?
+ // or implement mvcc
+ if (candidate->markMoving(true)) {
+ candidates.push_back(candidate);
+ }
+
+ ++itr;
+ }
+ });
+
+ for (Item *candidate : candidates) {
+ auto promoted = tryPromoteToNextMemoryTier(*candidate, true);
+ if (promoted) {
+ promotions++;
+ removeFromMMContainer(*candidate);
+ XDCHECK(!candidate->isMarkedForEviction() && !candidate->isMoving());
+ // it's safe to recycle the item here as there are no more
+ // references and the item could not have been marked as moving
+ // by another thread since it's detached from the MMContainer.
+ auto res = releaseBackToAllocator(*candidate, RemoveContext::kEviction,
+ /* isNascent */ false);
+ XDCHECK(res == ReleaseRes::kReleased);
+ wakeUpWaiters(*candidate, std::move(promoted));
+ } else {
+ // we failed to allocate a new item, this item is no longer moving
+ auto ref = unmarkMovingAndWakeUpWaiters(*candidate, {});
+ if (UNLIKELY(ref == 0)) {
+ const auto res =
+ releaseBackToAllocator(*candidate,
+ RemoveContext::kNormal, false);
+ XDCHECK(res == ReleaseRes::kReleased);
+ }
+ }
+
+ }
+ return promotions;
+ }
+
// returns true if nvmcache is enabled and we should write this item to // nvmcache. bool shouldWriteToNvmCache(const Item& item); @@ -1868,10 +2126,10 @@ std::unique_ptr& worker, std::chrono::seconds timeout = std::chrono::seconds{0});
- ShmSegmentOpts createShmCacheOpts();
- std::unique_ptr createNewMemoryAllocator();
- std::unique_ptr restoreMemoryAllocator();
- std::unique_ptr restoreCCacheManager();
+ ShmSegmentOpts createShmCacheOpts(TierId tid);
+ std::unique_ptr createNewMemoryAllocator(TierId tid);
+ std::unique_ptr restoreMemoryAllocator(TierId tid);
+ std::unique_ptr restoreCCacheManager(TierId tid);
PoolIds filterCompactCachePools(const PoolIds& poolIds) const; @@ -1891,7 +2149,7 @@ } typename Item::PtrCompressor createPtrCompressor() const {
- return allocator_->createPtrCompressor();
+ return typename Item::PtrCompressor(allocator_);
}
// helper utility to throttle and optionally log. @@ -1914,9 +2172,14 @@ // @param type the type of initialization // @return nullptr if the type is invalid
- // @return pointer to memory allocator
+ // @return vector of pointers to memory allocator
// @throw std::runtime_error if type is invalid
- std::unique_ptr initAllocator(InitMemType type);
+ std::vector> initAllocator(InitMemType type);
+
+ std::vector> createPrivateAllocator();
+ std::vector> createAllocators();
+ std::vector> restoreAllocators();
+
// @param type the type of initialization // @return nullptr if the type is invalid // @return pointer to access container @@ -1928,18 +2191,14 @@ std::optional saveNvmCache(); void saveRamCache();
- static bool itemExclusivePredicate(const Item& item) {
- return item.getRefCount() == 0;
+ static bool itemSlabMovePredicate(const Item& item) {
+ return item.isMoving() && item.getRefCount() == 0;
}
static bool itemExpiryPredicate(const Item& item) {
return item.getRefCount() == 1 && item.isExpired();
}
- static bool parentEvictForSlabReleasePredicate(const Item& item) {
- return item.getRefCount() == 1 && !item.isExclusive();
- }
-
std::unique_ptr createDeserializer();
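These predicates feed the generic removeIf helper introduced above; reaping an expired item, for instance, reduces to the following (a sketch of the internal API, shown only to illustrate the predicate contract):

// Remove the item only if the caller holds the sole reference and it expired.
auto removedHandle = removeIf(item, itemExpiryPredicate);
if (removedHandle) {
  // item was atomically unlinked; this is the last handle to it
}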
// Execute func on each item. `func` can throw exception but must ensure @@ -1978,6 +2237,100 @@ // BEGIN private members
+ TierId currentTier() const {
+ // TODO: every function which calls this method should be refactored.
+ // We should go case by case and either make such function work on
+ // all tiers or expose separate parameter to describe the tier ID.
+ return 0;
+ }
+
+ unsigned getNumTiers() const {
+ return memoryTierConfigs.size();
+ }
+
+ size_t memoryTierSize(TierId tid) const;
+
+ WriteHandle handleWithWaitContextForMovingItem(Item& item);
+
+ size_t wakeUpWaitersLocked(folly::StringPiece key, WriteHandle&& handle);
+
+ class MoveCtx {
+ public:
+ MoveCtx() {}
+
+ ~MoveCtx() {
+ // prevent any further enqueue to waiters
+ // Note: we don't need to hold locks since no one can enqueue
+ // after this point.
+ wakeUpWaiters();
+ }
+
+ // record the item handle. Upon destruction we will wake up the waiters
+ // and pass a clone of the handle to the callback. By default we pass
+ // a null handle
+ void setItemHandle(WriteHandle _it) { it = std::move(_it); }
+
+ // enqueue a waiter into the waiter list
+ // @param waiter WaitContext
+ void addWaiter(std::shared_ptr> waiter) {
+ XDCHECK(waiter);
+ waiters.push_back(std::move(waiter));
+ }
+
+ size_t numWaiters() const { return waiters.size(); }
+
+ private:
+ // notify all pending waiters that are waiting for the fetch.
+ void wakeUpWaiters() {
+ bool refcountOverflowed = false;
+ for (auto& w : waiters) {
+ // If refcount overflowed earlier, then we will return miss to
+ // all subsequent waiters.
+ if (refcountOverflowed) {
+ w->set(WriteHandle{});
+ continue;
+ }
+
+ try {
+ w->set(it.clone());
+ } catch (const exception::RefcountOverflow&) {
+ // We'll return a miss to the user's pending read,
+ // so we should enqueue a delete via NvmCache.
+ // TODO: cache.remove(it);
+ refcountOverflowed = true;
+ }
+ }
+ }
+
+ WriteHandle it; // will be set when Context is being filled
+ std::vector>> waiters; // list of
+ // waiters
+ };
+ using MoveMap =
+ folly::F14ValueMap,
+ folly::HeterogeneousAccessHash>;
+
+ static size_t getShardForKey(folly::StringPiece key) {
+ return folly::Hash()(key) % kShards;
+ }
+
+ MoveMap& getMoveMapForShard(size_t shard) {
+ return movesMap_[shard].movesMap_;
+ }
+
+ MoveMap& getMoveMap(folly::StringPiece key) {
+ return getMoveMapForShard(getShardForKey(key));
+ }
+
+ std::unique_lock getMoveLockForShard(size_t shard) {
+ return std::unique_lock(moveLock_[shard].moveLock_);
+ }
+
+ std::unique_lock getMoveLock(folly::StringPiece key) {
+ return getMoveLockForShard(getShardForKey(key));
+ }
+
// Whether the memory allocator for this cache allocator was created on shared // memory. The hash table, chained item hash table etc is also created on // shared memory except for temporary shared memory mode when they're created @@ -1986,6 +2339,8 @@ Config config_{};
+ const typename Config::MemoryTierConfigs memoryTierConfigs;
+
// Manages the temporary shared memory segment for memory allocator that // is not persisted when cache process exits. std::unique_ptr tempShm_; @@ -2003,9 +2358,10 @@ const MMConfig mmConfig_{}; // the memory allocator for allocating out of the available memory.
- std::unique_ptr allocator_;
+ std::vector> allocator_;
// compact cache allocator manager
+ // TODO: per tier?
std::unique_ptr compactCacheManager_; // compact cache instances reside here when user "add" or "attach" compact @@ -2055,6 +2411,10 @@ class CacheAllocator : public CacheBase { // free memory monitor std::unique_ptr memMonitor_; + + // background evictor + std::vector>> backgroundEvictor_; + std::vector>> backgroundPromoter_; // check whether a pool is a slabs pool std::array isCompactCachePool_{}; @@ -2067,6 +2427,22 @@ class CacheAllocator : public CacheBase { // poolResizer_, poolOptimizer_, memMonitor_, reaper_ mutable std::mutex workersMutex_; + static constexpr size_t kShards = 8192; // TODO: need to define right value + + struct MovesMapShard { + alignas(folly::hardware_destructive_interference_size) MoveMap movesMap_; + }; + + struct MoveLock { + alignas(folly::hardware_destructive_interference_size) std::mutex moveLock_; + }; + + // a map of all pending moves + std::vector movesMap_; + + // a map of move locks for each shard + std::vector moveLock_; + // time when the ram cache was first created const uint32_t cacheCreationTime_{0}; @@ -2100,6 +2476,7 @@ class CacheAllocator : public CacheBase { // Make this friend to give access to acquire and release friend ReadHandle; friend ReaperAPIWrapper; + friend BackgroundMoverAPIWrapper; friend class CacheAPIWrapperForNvm; friend class FbInternalRuntimeUpdateWrapper; friend class objcache2::ObjectCache; diff --git a/cachelib/allocator/CacheAllocatorConfig.h b/cachelib/allocator/CacheAllocatorConfig.h index ec44ff8467..a089e754c0 100644 --- a/cachelib/allocator/CacheAllocatorConfig.h +++ b/cachelib/allocator/CacheAllocatorConfig.h @@ -31,6 +31,7 @@ #include "cachelib/allocator/MemoryTierCacheConfig.h" #include "cachelib/allocator/NvmAdmissionPolicy.h" #include "cachelib/allocator/PoolOptimizeStrategy.h" +#include "cachelib/allocator/BackgroundMoverStrategy.h" #include "cachelib/allocator/RebalanceStrategy.h" #include "cachelib/allocator/Util.h" #include "cachelib/common/EventInterface.h" @@ -207,6 +208,9 @@ class CacheAllocatorConfig { // Accepts vector of MemoryTierCacheConfig. Each vector element describes // configuration for a single memory cache tier. Tier sizes are specified as // ratios, the number of parts of total cache size each tier would occupy. + // @throw std::invalid_argument if: + // - the size of configs is 0 + // - the size of configs is greater than kMaxCacheMemoryTiers CacheAllocatorConfig& configureMemoryTiers(const MemoryTierConfigs& configs); // Return reference to MemoryTierCacheConfigs. @@ -262,6 +266,16 @@ class CacheAllocatorConfig { std::chrono::seconds regularInterval, std::chrono::seconds ccacheInterval, uint32_t ccacheStepSizePercent); + + // Enable the background evictor - scans a tier to look for objects + // to evict to the next tier + CacheAllocatorConfig& enableBackgroundEvictor( + std::shared_ptr backgroundMoverStrategy, + std::chrono::milliseconds regularInterval, size_t threads); + + CacheAllocatorConfig& enableBackgroundPromoter( + std::shared_ptr backgroundMoverStrategy, + std::chrono::milliseconds regularInterval, size_t threads); // This enables an optimization for Pool rebalancing and resizing. 
// The rough idea is to ensure only the least useful items are evicted when @@ -337,6 +351,17 @@ poolOptimizeStrategy != nullptr; }
+ // @return whether background evictor thread is enabled
+ bool backgroundEvictorEnabled() const noexcept {
+ return backgroundEvictorInterval.count() > 0 &&
+ backgroundEvictorStrategy != nullptr;
+ }
+
+ bool backgroundPromoterEnabled() const noexcept {
+ return backgroundPromoterInterval.count() > 0 &&
+ backgroundPromoterStrategy != nullptr;
+ }
+
// @return whether memory monitor is enabled bool memMonitoringEnabled() const noexcept { return memMonitorConfig.mode != MemoryMonitor::Disabled && @@ -374,8 +399,7 @@ std::map serialize() const; // The max number of memory cache tiers
- // TODO: increase this number when multi-tier configs are enabled
- inline static const size_t kMaxCacheMemoryTiers = 1;
+ inline static const size_t kMaxCacheMemoryTiers = 2;
// Cache name for users to identify their own cache. std::string cacheName{""}; @@ -448,6 +472,16 @@ // The slab release process is considered as being stuck if it does not // make any progress for the below threshold std::chrono::milliseconds slabReleaseStuckThreshold{std::chrono::seconds(60)};
+
+ // rebalance to avoid alloc failures.
+ std::shared_ptr backgroundEvictorStrategy;
+ std::shared_ptr backgroundPromoterStrategy;
+ // time interval to sleep between runs of the background evictor
+ std::chrono::milliseconds backgroundEvictorInterval{std::chrono::milliseconds{1000}};
+ std::chrono::milliseconds backgroundPromoterInterval{std::chrono::milliseconds{1000}};
+
+ size_t backgroundEvictorThreads{1};
+ size_t backgroundPromoterThreads{1};
// time interval to sleep between iterations of pool size optimization, // for regular pools and compact caches @@ -587,6 +621,25 @@ // If true, we will delay worker start until user explicitly calls // CacheAllocator::startCacheWorkers() bool delayCacheWorkersStart{false};
+
+ // see MultiTierDataMovement.md
+ double promotionAcWatermark{4.0};
+ double lowEvictionAcWatermark{2.0};
+ double highEvictionAcWatermark{5.0};
+ double numDuplicateElements{0.0}; // inclusiveness of the cache
+ double syncPromotion{0.0}; // whether promotion can be done synchronously in the user thread
+
+ uint64_t evictorThreads{1};
+ uint64_t promoterThreads{1};
+
+ uint64_t maxEvictionBatch{40};
+ uint64_t maxPromotionBatch{10};
+
+ uint64_t minEvictionBatch{1};
+ uint64_t minPromotionBatch{1};
+
+ uint64_t maxEvictionPromotionHotness{60};
+
friend CacheT; @@ -924,6 +977,26 @@ CacheAllocatorConfig& CacheAllocatorConfig::enablePoolRebalancing( return *this; }
+template
+CacheAllocatorConfig& CacheAllocatorConfig::enableBackgroundEvictor(
+ std::shared_ptr strategy,
+ std::chrono::milliseconds interval, size_t evictorThreads) {
+ backgroundEvictorStrategy = strategy;
+ backgroundEvictorInterval = interval;
+ backgroundEvictorThreads = evictorThreads;
+ return *this;
+}
+
+template
+CacheAllocatorConfig& CacheAllocatorConfig::enableBackgroundPromoter(
+ std::shared_ptr strategy,
+ std::chrono::milliseconds interval, size_t promoterThreads) {
+ backgroundPromoterStrategy = strategy;
+ backgroundPromoterInterval = interval;
+ backgroundPromoterThreads = promoterThreads;
+ return *this;
+}
+
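Wiring both workers up from user code would look roughly like this (a sketch; the strategy parameters simply reuse the defaults declared above):

// Enable both movers: scan every second, one thread each.
config
    .enableBackgroundEvictor(
        std::make_shared<FreeThresholdStrategy>(2.0, 5.0, 40, 1),
        std::chrono::milliseconds{1000}, 1)
    .enableBackgroundPromoter(
        std::make_shared<PromotionStrategy>(4, 10, 1),
        std::chrono::milliseconds{1000}, 1);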
template
CacheAllocatorConfig& CacheAllocatorConfig::enablePoolResizing(
std::shared_ptr resizeStrategy,
@@ -1085,7 +1158,7 @@ std::map CacheAllocatorConfig::serialize() const {
configMap["size"] = std::to_string(size);
configMap["cacheDir"] = cacheDir;
- configMap["posixShm"] = usePosixShm ? "set" : "empty";
+ configMap["posixShm"] = isUsingPosixShm() ? "set" : "empty";
configMap["defaultAllocSizes"] = "";
// Stringify std::set
diff --git a/cachelib/allocator/CacheItem-inl.h b/cachelib/allocator/CacheItem-inl.h
index f59fa9d599..b33c1ea28a 100644
--- a/cachelib/allocator/CacheItem-inl.h
+++ b/cachelib/allocator/CacheItem-inl.h
@@ -148,15 +148,16 @@ std::string CacheItem::toString() const {
return folly::sformat(
"item: "
"memory={}:raw-ref={}:size={}:key={}:hex-key={}:"
- "isInMMContainer={}:isAccessible={}:isExclusive={}:references={}:ctime="
+ "isInMMContainer={}:isAccessible={}:isMarkedForEviction={}:"
+ "isMoving={}:references={}:ctime="
"{}:"
"expTime={}:updateTime={}:isNvmClean={}:isNvmEvicted={}:hasChainedItem="
"{}",
this, getRefCountAndFlagsRaw(), getSize(),
folly::humanify(getKey().str()), folly::hexlify(getKey()),
- isInMMContainer(), isAccessible(), isExclusive(), getRefCount(),
- getCreationTime(), getExpiryTime(), getLastAccessTime(), isNvmClean(),
- isNvmEvicted(), hasChainedItem());
+ isInMMContainer(), isAccessible(), isMarkedForEviction(), isMoving(),
+ getRefCount(), getCreationTime(), getExpiryTime(), getLastAccessTime(),
+ isNvmClean(), isNvmEvicted(), hasChainedItem());
}
}
@@ -217,23 +218,43 @@ bool CacheItem::isInMMContainer() const noexcept { }
template
-bool CacheItem::markExclusive() noexcept {
- return ref_.markExclusive();
+bool CacheItem::markForEviction() noexcept {
+ return ref_.markForEviction();
}
template
-RefcountWithFlags::Value CacheItem::unmarkExclusive() noexcept {
- return ref_.unmarkExclusive();
+RefcountWithFlags::Value CacheItem::unmarkForEviction() noexcept {
+ return ref_.unmarkForEviction();
}
template
-bool CacheItem::isExclusive() const noexcept {
- return ref_.isExclusive();
+bool CacheItem::isMarkedForEviction() const noexcept {
+ return ref_.isMarkedForEviction();
}
template
-bool CacheItem::isOnlyExclusive() const noexcept {
- return ref_.isOnlyExclusive();
+bool CacheItem::markForEvictionWhenMoving() {
+ return ref_.markForEvictionWhenMoving();
+}
+
+template
+bool CacheItem::markMoving(bool failIfRefNotZero) {
+ return ref_.markMoving(failIfRefNotZero);
+}
+
+template
+RefcountWithFlags::Value CacheItem::unmarkMoving() noexcept {
+ return ref_.unmarkMoving();
+}
+
+template
+bool CacheItem::isMoving() const noexcept {
+ return ref_.isMoving();
+}
+
+template
+bool CacheItem::isOnlyMoving() const noexcept {
+ return ref_.isOnlyMoving();
}
template
@@ -335,7 +356,7 @@ bool CacheItem::updateExpiryTime(uint32_t expiryTimeSecs) noexcept {
// check for moving to make sure we are not updating the expiry time while at
// the same time re-allocating the item with the old state of the expiry time
// in moveRegularItem().
See D6852328
- if (isExclusive() || !isInMMContainer() || isChainedItem()) {
+ if (isMoving() || isMarkedForEviction() || !isInMMContainer() || isChainedItem()) {
return false;
}
// attempt to atomically update the value of expiryTime @@ -451,12 +472,14 @@ std::string CacheChainedItem::toString() const { return folly::sformat( "chained item: " "memory={}:raw-ref={}:size={}:parent-compressed-ptr={}:"
- "isInMMContainer={}:isAccessible={}:isExclusive={}:references={}:ctime={}"
+ "isInMMContainer={}:isAccessible={}:isMarkedForEviction={}:"
+ "isMoving={}:references={}:ctime={}"
":"
"expTime={}:updateTime={}",
this, Item::getRefCountAndFlagsRaw(), Item::getSize(), cPtr.getRaw(),
- Item::isInMMContainer(), Item::isAccessible(), Item::isExclusive(),
- Item::getRefCount(), Item::getCreationTime(), Item::getExpiryTime(),
+ Item::isInMMContainer(), Item::isAccessible(),
+ Item::isMarkedForEviction(), Item::isMoving(), Item::getRefCount(),
+ Item::getCreationTime(), Item::getExpiryTime(),
Item::getLastAccessTime());
}
diff --git a/cachelib/allocator/CacheItem.h b/cachelib/allocator/CacheItem.h
index 06136db032..6728b654eb 100644
--- a/cachelib/allocator/CacheItem.h
+++ b/cachelib/allocator/CacheItem.h
@@ -46,6 +46,9 @@ class BaseAllocatorTest;
template
class AllocatorHitStatsTest;
+template
+class AllocatorMemoryTiersTest;
+
template
class MapTest;
@@ -305,12 +308,17 @@ class CACHELIB_PACKED_ATTR CacheItem {
*/
RefcountWithFlags::Value getRefCountAndFlagsRaw() const noexcept;
- FOLLY_ALWAYS_INLINE void incRef() {
- if (LIKELY(ref_.incRef())) {
- return;
+ // Increments item's ref count
+ //
+ // @return incOk on success; incFailedMoving or incFailedEviction if the
+ // item is marked as exclusive
+ // @throw exception::RefcountOverflow on ref count overflow
+ FOLLY_ALWAYS_INLINE RefcountWithFlags::incResult incRef(bool failIfMoving) {
+ try {
+ return ref_.incRef(failIfMoving);
+ } catch (exception::RefcountOverflow& e) {
+ throw exception::RefcountOverflow(
+ folly::sformat("{} item: {}", e.what(), toString()));
}
- throw exception::RefcountOverflow(
- folly::sformat("Refcount maxed out. item: {}", toString()));
}
FOLLY_ALWAYS_INLINE RefcountWithFlags::Value decRef() { @@ -344,23 +352,43 @@
/**
* The following two functions correspond to whether or not an item is
- * currently in the process of being moved. This happens during a slab
- * rebalance, eviction or resize operation.
+ * currently in the process of being evicted.
*
- * An item can only be marked exclusive when `isInMMContainer` returns true.
+ * An item can only be marked exclusive when `isInMMContainer` returns true
+ * and the item is not already exclusive or moving and the ref count is 0.
* This operation is atomic.
*
- * User can also query if an item "isOnlyExclusive". This returns true only
- * if the refcount is 0 and only the exclusive bit is set.
- *
- * Unmarking exclusive does not depend on `isInMMContainer`.
+ * Unmarking exclusive does not depend on `isInMMContainer`
* Unmarking exclusive will also return the refcount at the moment of
* unmarking.
*/
- bool markExclusive() noexcept;
- RefcountWithFlags::Value unmarkExclusive() noexcept;
- bool isExclusive() const noexcept;
- bool isOnlyExclusive() const noexcept;
+ bool markForEviction() noexcept;
+ RefcountWithFlags::Value unmarkForEviction() noexcept;
+ bool isMarkedForEviction() const noexcept;
+
+ /**
+ * The following functions correspond to whether or not an item is
+ * currently in the process of being moved. When moving, ref count
+ * is always >= 1.
+ *
+ * An item can only be marked moving when `isInMMContainer` returns true
+ * and the item is not already exclusive or moving.
+ *
+ * User can also query if an item "isOnlyMoving". This returns true only
+ * if the refcount is one and only the exclusive bit is set.
+ *
+ * Unmarking moving does not depend on `isInMMContainer`
+ * Unmarking moving will also return the refcount at the moment of
+ * unmarking.
+ */
+ bool markMoving(bool failIfRefNotZero);
+ RefcountWithFlags::Value unmarkMoving() noexcept;
+ bool isMoving() const noexcept;
+ bool isOnlyMoving() const noexcept;
+
+ /** This function attempts to mark the item as exclusive.
+ * Can only be called on an item that is moving.*/
+ bool markForEvictionWhenMoving();
/**
* Item cannot be marked both chained allocation and
@@ -448,6 +476,8 @@ FRIEND_TEST(ItemTest, NonStringKey); template friend class facebook::cachelib::tests::AllocatorHitStatsTest;
+ template
+ friend class facebook::cachelib::tests::AllocatorMemoryTiersTest;
};
// A chained item has a hook pointing to the next chained item. The hook is
diff --git a/cachelib/allocator/CacheStats.cpp b/cachelib/allocator/CacheStats.cpp
index c16149df6b..b4770a3480 100644
--- a/cachelib/allocator/CacheStats.cpp
+++ b/cachelib/allocator/CacheStats.cpp
@@ -44,6 +44,8 @@ void Stats::init() {
initToZero(*fragmentationSize);
initToZero(*chainedItemEvictions);
initToZero(*regularItemEvictions);
+
+ classAllocLatency = std::make_unique();
}
template
@@ -51,7 +53,7 @@ struct SizeVerify {};
void Stats::populateGlobalCacheStats(GlobalCacheStats& ret) const {
#ifndef SKIP_SIZE_VERIFY
- SizeVerify a = SizeVerify<16176>{};
+ SizeVerify a = SizeVerify<16192>{};
std::ignore = a;
#endif
ret.numCacheGets = numCacheGets.get();
diff --git a/cachelib/allocator/CacheStats.h b/cachelib/allocator/CacheStats.h
index fb9955b805..46c051be14 100644
--- a/cachelib/allocator/CacheStats.h
+++ b/cachelib/allocator/CacheStats.h
@@ -25,6 +25,7 @@
#include "cachelib/allocator/memory/Slab.h"
#include "cachelib/common/FastStats.h"
#include "cachelib/common/PercentileStats.h"
+#include "cachelib/common/RollingStats.h"
#include "cachelib/common/Time.h"
namespace facebook {
namespace cachelib {
@@ -289,6 +290,27 @@ struct ReaperStats {
uint64_t avgTraversalTimeMs{0};
};
+// Mover Stats
+struct BackgroundMoverStats {
+ // the number of items this worker moved by looking at pools/classes stats
+ uint64_t numMovedItems{0};
+ // number of times the worker thread ran //TODO: is this def correct?
+ uint64_t runCount{0};
+ // total number of classes
+ uint64_t totalClasses{0};
+ // total number of bytes moved
+ uint64_t totalBytesMoved{0};
+
+ BackgroundMoverStats& operator+=(const BackgroundMoverStats& rhs) {
+ numMovedItems += rhs.numMovedItems;
+ runCount += rhs.runCount;
+ totalClasses += rhs.totalClasses;
+ totalBytesMoved += rhs.totalBytesMoved;
+ return *this;
+ }
+};
+
// CacheMetadata type to export struct CacheMetadata { // allocator_version @@ -309,6 +331,11 @@ struct Stats; // Stats that apply globally in cache and // the ones that are aggregated over all pools struct GlobalCacheStats {
+ // background eviction stats
+ BackgroundMoverStats evictionStats;
+
+ BackgroundMoverStats promotionStats;
+
// number of calls to CacheAllocator::find uint64_t numCacheGets{0}; diff --git a/cachelib/allocator/CacheStatsInternal.h b/cachelib/allocator/CacheStatsInternal.h index b2a5f8c469..19a15fbbd4 100644 --- a/cachelib/allocator/CacheStatsInternal.h +++ b/cachelib/allocator/CacheStatsInternal.h @@ -21,6 +21,7 @@
#include "cachelib/allocator/Cache.h"
#include "cachelib/allocator/memory/MemoryAllocator.h"
#include "cachelib/common/AtomicCounter.h"
+#include "cachelib/common/RollingStats.h"
namespace facebook { namespace cachelib { @@ -229,6 +230,14 @@ struct Stats { std::unique_ptr chainedItemEvictions{}; std::unique_ptr regularItemEvictions{};
+ using PerTierPoolClassRollingStats = std::array<
+ std::array,
+ MemoryPoolManager::kMaxPools>,
+ CacheBase::kMaxTiers>;
+
+ // rolling latency tracking for every alloc class in every pool
+ std::unique_ptr classAllocLatency{};
+
// Eviction failures due to parent cannot be removed from access container AtomicCounter evictFailParentAC{0}; diff --git a/cachelib/allocator/FreeThresholdStrategy.cpp b/cachelib/allocator/FreeThresholdStrategy.cpp new file mode 100644 index 0000000000..4a900c2cb1 --- /dev/null +++ b/cachelib/allocator/FreeThresholdStrategy.cpp @@ -0,0 +1,74 @@
+/*
+ * Copyright (c) Intel and its affiliates.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "cachelib/allocator/FreeThresholdStrategy.h"
+
+#include
+
+namespace facebook {
+namespace cachelib {
+
+FreeThresholdStrategy::FreeThresholdStrategy(double lowEvictionAcWatermark,
+ double highEvictionAcWatermark,
+ uint64_t maxEvictionBatch,
+ uint64_t minEvictionBatch)
+ : lowEvictionAcWatermark(lowEvictionAcWatermark),
+ highEvictionAcWatermark(highEvictionAcWatermark),
+ maxEvictionBatch(maxEvictionBatch),
+ minEvictionBatch(minEvictionBatch) {}
+
+std::vector FreeThresholdStrategy::calculateBatchSizes(
+ const CacheBase& cache,
+ std::vector acVec) {
+ std::vector batches{};
+ for (auto [tid, pid, cid] : acVec) {
+ auto stats = cache.getACStats(tid, pid, cid);
+ if ((1-stats.usageFraction())*100 >= highEvictionAcWatermark) {
+ batches.push_back(0);
+ } else {
+ auto toFreeMemPercent = highEvictionAcWatermark - (1-stats.usageFraction())*100;
+ auto toFreeItems = static_cast(
+ toFreeMemPercent * (stats.totalSlabs() * Slab::kSize) / stats.allocSize);
+ batches.push_back(toFreeItems);
+ }
+ }
+
+ if (batches.size() == 0) {
+ return batches;
+ }
+
+ auto maxBatch = *std::max_element(batches.begin(), batches.end());
+ if (maxBatch == 0)
+ return batches;
+
+ std::transform(
+ batches.begin(), batches.end(), batches.begin(), [&](auto numItems) {
+ if (numItems == 0) {
+ return 0UL;
+ }
+
+ auto cappedBatchSize = maxEvictionBatch * numItems / maxBatch;
+ if (cappedBatchSize < minEvictionBatch)
+ return minEvictionBatch;
+ else
+ return cappedBatchSize;
+ });
+
+ return batches;
+}
+
+} // namespace cachelib
+} // namespace facebook
diff --git a/cachelib/allocator/FreeThresholdStrategy.h b/cachelib/allocator/FreeThresholdStrategy.h new file mode 100644 index 0000000000..94316bfe82 --- /dev/null +++ b/cachelib/allocator/FreeThresholdStrategy.h @@ -0,0 +1,46 @@
+/*
+ * Copyright (c) Facebook, Inc. and its affiliates.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "cachelib/allocator/BackgroundMoverStrategy.h"
+#include "cachelib/allocator/Cache.h"
+
+namespace facebook {
+namespace cachelib {
+
+// Background mover strategy that sizes eviction batches from free-space
+// (watermark) thresholds per allocation class.
+class FreeThresholdStrategy : public BackgroundMoverStrategy {
+ public:
+ FreeThresholdStrategy(double lowEvictionAcWatermark,
+ double highEvictionAcWatermark,
+ uint64_t maxEvictionBatch,
+ uint64_t minEvictionBatch);
+ ~FreeThresholdStrategy() {}
+
+ std::vector calculateBatchSizes(
+ const CacheBase& cache,
+ std::vector acVecs);
+
+ private:
+ double lowEvictionAcWatermark{2.0};
+ double highEvictionAcWatermark{5.0};
+ uint64_t maxEvictionBatch{40};
+ uint64_t minEvictionBatch{5};
+};
+
+} // namespace cachelib
+} // namespace facebook
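Concretely, with highEvictionAcWatermark at 5.0 and a class whose usage fraction is 0.98 (2% free), toFreeMemPercent is 3.0; the raw score for a class with 10 slabs of 4 MiB and a 4 KiB allocSize works out as below (assumed values). Note the raw scores are later rescaled relative to the largest class's score and capped to maxEvictionBatch, so only the ratios between classes matter:

// Worked example of the raw batch score computed above.
const double usageFraction = 0.98;                            // 98% used
const double freePercent = (1 - usageFraction) * 100;         // 2.0
const double toFreeMemPercent = 5.0 - freePercent;            // 3.0
const uint64_t totalSlabs = 10;
const uint64_t slabSize = 4ull * 1024 * 1024;                 // Slab::kSize
const uint32_t allocSize = 4096;
const auto rawScore = static_cast<size_t>(
    toFreeMemPercent * (totalSlabs * slabSize) / allocSize);  // 30720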
diff --git a/cachelib/allocator/Handle.h b/cachelib/allocator/Handle.h
index a125ace1b7..06c21bffe4 100644
--- a/cachelib/allocator/Handle.h
+++ b/cachelib/allocator/Handle.h
@@ -242,8 +242,6 @@ struct ReadHandleImpl {
return hdl;
}
- bool isWriteHandle() const { return false; }
-
protected:
// accessor. Calling getInternal() on handle with isReady() == false blocks // the thread until the handle is ready. @@ -402,6 +400,12 @@ } }
+ protected:
+ friend class ReadHandleImpl;
+ // Method used only by ReadHandleImpl ctor
+ void discard() {
+ it_.store(nullptr, std::memory_order_relaxed);
+ }
private:
// we are waiting on Item* to be set to a value. One of the valid values is // nullptr. So choose something that we don't expect to indicate a ptr @@ -481,7 +485,8 @@ // Handle which has the item already FOLLY_ALWAYS_INLINE ReadHandleImpl(Item* it, CacheT& alloc) noexcept
- : alloc_(&alloc), it_(it) {}
+ : alloc_(&alloc), it_(it) {
+ }
// handle that has a wait context allocated. Used for async handles // In this case, the it_ will be filled in asynchronously and multiple @@ -571,8 +576,6 @@ struct WriteHandleImpl : public ReadHandleImpl { // creating this item handle. WriteHandleImpl clone() const { return WriteHandleImpl{ReadHandle::clone()}; }
- bool isWriteHandle() const { return true; }
-
// Friends friend ReadHandle; // Only CacheAllocator and NvmCache can create non-default constructed handles diff --git a/cachelib/allocator/MM2Q-inl.h b/cachelib/allocator/MM2Q-inl.h index ba388d40a4..07aae775f7 100644 --- a/cachelib/allocator/MM2Q-inl.h +++ b/cachelib/allocator/MM2Q-inl.h @@ -258,6 +258,16 @@ void MM2Q::Container::withEvictionIterator(F&& fun) { } }
+// returns the head of the hot queue for promotion
+template T::*HookPtr>
+template
+void
+MM2Q::Container::withPromotionIterator(F&& fun) {
+ lruMutex_->lock_combine([this, &fun]() {
+ fun(LockedIterator{LockHolder{}, lru_.begin(LruType::Hot)});
+ });
+}
+
template T::*HookPtr> void MM2Q::Container::removeLocked(T& node, bool doRebalance) noexcept { diff --git a/cachelib/allocator/MM2Q.h b/cachelib/allocator/MM2Q.h index 982eca21f9..62b644f9c6 100644 --- a/cachelib/allocator/MM2Q.h +++ b/cachelib/allocator/MM2Q.h @@ -68,6 +68,7 @@ class MM2Q { enum LruType { Warm, WarmTail, Hot, Cold, ColdTail, NumTypes }; // Config class for MM2Q
+ // TODO: implement support for useCombinedLockForIterators
struct Config { // Create from serialized config explicit Config(SerializationConfigType configState) @@ -501,6 +502,11 @@ class MM2Q { // Iterator passed as parameter. template void withEvictionIterator(F&& f);
+
+ // Execute provided function under container lock. Function gets
+ // iterator passed as parameter.
+ template
+ void withPromotionIterator(F&& f);
// get the current config as a copy Config getConfig() const; diff --git a/cachelib/allocator/MMLru-inl.h b/cachelib/allocator/MMLru-inl.h index d35759f212..751bcca5c1 100644 --- a/cachelib/allocator/MMLru-inl.h +++ b/cachelib/allocator/MMLru-inl.h @@ -229,6 +229,18 @@ void MMLru::Container::withEvictionIterator(F&& fun) { } }
+template T::*HookPtr>
+template
+void
+MMLru::Container::withPromotionIterator(F&& fun) {
+ if (config_.useCombinedLockForIterators) {
+ lruMutex_->lock_combine([this, &fun]() { fun(Iterator{lru_.begin()}); });
+ } else {
+ LockHolder lck{*lruMutex_};
+ fun(Iterator{lru_.begin()});
+ }
+}
+
template T::*HookPtr> void MMLru::Container::ensureNotInsertionPoint(T& node) noexcept { // If we are removing the insertion point node, grow tail before we remove diff --git a/cachelib/allocator/MMLru.h b/cachelib/allocator/MMLru.h index 29c6d02689..cf3253349a 100644 --- a/cachelib/allocator/MMLru.h +++ b/cachelib/allocator/MMLru.h @@ -230,12 +230,13 @@ class MMLru { // lruInsertionPointSpec = 2, we insert at a point 1/4th from tail uint8_t lruInsertionPointSpec{0};
+ // Whether to use combined locking for withEvictionIterator.
+ bool useCombinedLockForIterators{true};
+
// Minimum interval between reconfigurations. If 0, reconfigure is never // called. std::chrono::seconds mmReconfigureIntervalSecs{};
-
- // Whether to use combined locking for withEvictionIterator.
- bool useCombinedLockForIterators{false};
};
// The container object which can be used to keep track of objects of type @@ -376,6 +377,9 @@ class MMLru { template void withEvictionIterator(F&& f);
+ template
+ void withPromotionIterator(F&& f);
+
// get copy of current config Config getConfig() const; diff --git a/cachelib/allocator/MMTinyLFU-inl.h b/cachelib/allocator/MMTinyLFU-inl.h index 46640b24ca..9203a54dd6 100644 --- a/cachelib/allocator/MMTinyLFU-inl.h +++ b/cachelib/allocator/MMTinyLFU-inl.h @@ -227,6 +227,13 @@ void MMTinyLFU::Container::withEvictionIterator(F&& fun) { fun(getEvictionIterator()); }
+template T::*HookPtr>
+template
+void
+MMTinyLFU::Container::withPromotionIterator(F&& fun) {
+ throw std::runtime_error("Not supported");
+}
+
template T::*HookPtr> void MMTinyLFU::Container::removeLocked(T& node) noexcept { if (isTiny(node)) { diff --git a/cachelib/allocator/MMTinyLFU.h b/cachelib/allocator/MMTinyLFU.h index c8f2699264..eb45cefd22 100644 --- a/cachelib/allocator/MMTinyLFU.h +++ b/cachelib/allocator/MMTinyLFU.h @@ -496,6 +496,9 @@ class MMTinyLFU { // iterator passed as parameter. template void withEvictionIterator(F&& f);
+
+ template
+ void withPromotionIterator(F&& f);
// for saving the state of the lru //
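Not every MM policy can produce a promotion order: MM2Q hands out the head of its Hot queue, MMLru simply starts from the LRU head, and MMTinyLFU throws. Callers therefore have to treat the promotion iterator as optional (sketch):

// Guarded use of withPromotionIterator for containers that may not support it.
try {
  mmContainer.withPromotionIterator([](auto&& itr) {
    // walk promotion candidates here
  });
} catch (const std::runtime_error&) {
  // MMTinyLFU does not support promotion iteration; skip this container.
}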
diff --git a/cachelib/allocator/MemoryTierCacheConfig.h b/cachelib/allocator/MemoryTierCacheConfig.h
index a60fb64d3e..1b9477c048 100644
--- a/cachelib/allocator/MemoryTierCacheConfig.h
+++ b/cachelib/allocator/MemoryTierCacheConfig.h
@@ -23,10 +23,7 @@ namespace cachelib {
class MemoryTierCacheConfig {
public:
// Creates instance of MemoryTierCacheConfig for Posix/SysV Shared memory.
- static MemoryTierCacheConfig fromShm() {
- // TODO: expand this method when adding support for file-mapped memory
- return MemoryTierCacheConfig();
- }
+ static MemoryTierCacheConfig fromShm() { return MemoryTierCacheConfig(); }
// Specifies ratio of this memory tier to other tiers. Absolute size // of each tier can be calculated as: @@ -49,7 +46,7 @@ const NumaBitMask& getMemBind() const noexcept { return numaNodes; }
- size_t calculateTierSize(size_t totalCacheSize, size_t partitionNum) {
+ size_t calculateTierSize(size_t totalCacheSize, size_t partitionNum) const {
// TODO: Call this method when tiers are enabled in allocator // to calculate tier sizes in bytes. if (!partitionNum) { diff --git a/cachelib/allocator/PoolOptimizer.cpp b/cachelib/allocator/PoolOptimizer.cpp index 8d67762be8..d101231a04 100644 --- a/cachelib/allocator/PoolOptimizer.cpp +++ b/cachelib/allocator/PoolOptimizer.cpp @@ -51,6 +51,8 @@ void PoolOptimizer::optimizeRegularPoolSizes() { void PoolOptimizer::optimizeCompactCacheSizes() { try {
+ // TODO: should optimizer look at each tier individually?
+ // If yes, then resizePools should be per-tier
auto strategy = cache_.getPoolOptimizeStrategy(); if (!strategy) { strategy = strategy_; diff --git a/cachelib/allocator/PromotionStrategy.h b/cachelib/allocator/PromotionStrategy.h new file mode 100644 index 0000000000..1022aca0f8 --- /dev/null +++ b/cachelib/allocator/PromotionStrategy.h @@ -0,0 +1,84 @@
+/*
+ * Copyright (c) Facebook, Inc. and its affiliates.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "cachelib/allocator/BackgroundMoverStrategy.h"
+#include "cachelib/allocator/Cache.h"
+
+namespace facebook {
+namespace cachelib {
+
+// Background mover strategy that sizes promotion batches between tiers.
+class PromotionStrategy : public BackgroundMoverStrategy {
+ public:
+ PromotionStrategy(uint64_t promotionAcWatermark,
+ uint64_t maxPromotionBatch,
+ uint64_t minPromotionBatch)
+ : promotionAcWatermark(promotionAcWatermark),
+ maxPromotionBatch(maxPromotionBatch),
+ minPromotionBatch(minPromotionBatch) {}
+ ~PromotionStrategy() {}
+
+ std::vector calculateBatchSizes(
+ const CacheBase& cache,
+ std::vector acVec) {
+ std::vector batches{};
+ for (auto [tid, pid, cid] : acVec) {
+ XDCHECK(tid > 0);
+ auto stats = cache.getACStats(tid - 1, pid, cid);
+ if ((1-stats.usageFraction())*100 < promotionAcWatermark)
+ batches.push_back(0);
+ else {
+ auto maxPossibleItemsToPromote = static_cast(
+ (promotionAcWatermark - (1-stats.usageFraction())*100) *
+ (stats.totalSlabs() * Slab::kSize) / stats.allocSize);
+ batches.push_back(maxPossibleItemsToPromote);
+ }
+ }
+
+ if (batches.size() == 0) {
+ return batches;
+ }
+
+ auto maxBatch = *std::max_element(batches.begin(), batches.end());
+ if (maxBatch == 0)
+ return batches;
+
+ std::transform(
+ batches.begin(), batches.end(), batches.begin(), [&](auto numItems) {
+ if (numItems == 0) {
+ return 0UL;
+ }
+
+ auto cappedBatchSize = maxPromotionBatch * numItems / maxBatch;
+ if (cappedBatchSize < minPromotionBatch)
+ return minPromotionBatch;
+ else
+ return cappedBatchSize;
+ });
+
+ return batches;
+ }
+
+ private:
+ double promotionAcWatermark{4.0};
+ uint64_t maxPromotionBatch{40};
+ uint64_t minPromotionBatch{5};
+};
+
+} // namespace cachelib
+} // namespace facebook
diff --git a/cachelib/allocator/Refcount.h b/cachelib/allocator/Refcount.h
index c60dea34f1..8251ef15ba 100644
--- a/cachelib/allocator/Refcount.h
+++ b/cachelib/allocator/Refcount.h
@@ -130,34 +130,41 @@ class FOLLY_PACK_ATTR RefcountWithFlags {
RefcountWithFlags& operator=(const RefcountWithFlags&) = delete;
RefcountWithFlags(RefcountWithFlags&&) = delete;
RefcountWithFlags& operator=(RefcountWithFlags&&) = delete;
-
+ enum incResult {
+ incOk,
+ incFailedMoving,
+ incFailedEviction
+ };
// Bumps up the reference count only if the new count will be strictly less
- // than or equal to the maxCount.
- // @return true if refcount is bumped. false otherwise.
- FOLLY_ALWAYS_INLINE bool incRef() noexcept {
- Value* const refPtr = &refCount_;
- unsigned int nCASFailures = 0;
- constexpr bool isWeak = false;
- Value oldVal = __atomic_load_n(refPtr, __ATOMIC_RELAXED);
-
- while (true) {
- const Value newCount = oldVal + static_cast(1);
- if (UNLIKELY((oldVal & kAccessRefMask) == (kAccessRefMask))) {
- return false;
- }
-
- if (__atomic_compare_exchange_n(refPtr, &oldVal, newCount, isWeak,
- __ATOMIC_ACQ_REL, __ATOMIC_RELAXED)) {
- return true;
- }
-
- if ((++nCASFailures % 4) == 0) {
- // this pause takes up to 40 clock cycles on intel and the lock cmpxchgl
- // above should take about 100 clock cycles. we pause once every 400
- // cycles or so if we are extremely unlucky.
- folly::asm_volatile_pause();
- }
- }
+ // than or equal to the maxCount and the item is not exclusive
+ // @return incOk if the refcount is bumped;
diff --git a/cachelib/allocator/Refcount.h b/cachelib/allocator/Refcount.h
index c60dea34f1..8251ef15ba 100644
--- a/cachelib/allocator/Refcount.h
+++ b/cachelib/allocator/Refcount.h
@@ -130,34 +130,41 @@ class FOLLY_PACK_ATTR RefcountWithFlags {
   RefcountWithFlags& operator=(const RefcountWithFlags&) = delete;
   RefcountWithFlags(RefcountWithFlags&&) = delete;
   RefcountWithFlags& operator=(RefcountWithFlags&&) = delete;
-
+  enum incResult {
+    incOk,
+    incFailedMoving,
+    incFailedEviction
+  };
   // Bumps up the reference count only if the new count will be strictly less
-  // than or equal to the maxCount.
-  // @return true if refcount is bumped. false otherwise.
-  FOLLY_ALWAYS_INLINE bool incRef() noexcept {
-    Value* const refPtr = &refCount_;
-    unsigned int nCASFailures = 0;
-    constexpr bool isWeak = false;
-    Value oldVal = __atomic_load_n(refPtr, __ATOMIC_RELAXED);
-
-    while (true) {
-      const Value newCount = oldVal + static_cast<Value>(1);
-      if (UNLIKELY((oldVal & kAccessRefMask) == (kAccessRefMask))) {
-        return false;
-      }
-
-      if (__atomic_compare_exchange_n(refPtr, &oldVal, newCount, isWeak,
-                                      __ATOMIC_ACQ_REL, __ATOMIC_RELAXED)) {
-        return true;
-      }
-
-      if ((++nCASFailures % 4) == 0) {
-        // this pause takes up to 40 clock cycles on intel and the lock cmpxchgl
-        // above should take about 100 clock cycles. we pause once every 400
-        // cycles or so if we are extremely unlucky.
-        folly::asm_volatile_pause();
-      }
-    }
+  // than or equal to the maxCount and the item is not exclusive.
+  // @return incOk if the refcount is bumped; incFailedMoving or
+  //         incFailedEviction otherwise (if the item is exclusive)
+  // @throw exception::RefcountOverflow if the new count would be greater
+  //        than maxCount
+  FOLLY_ALWAYS_INLINE incResult incRef(bool failIfMoving) {
+    incResult res = incOk;
+    auto predicate = [failIfMoving, &res](const Value curValue) {
+      Value bitMask = getAdminRef<kExclusive>();
+
+      const bool exclusiveBitIsSet = curValue & bitMask;
+      if (UNLIKELY((curValue & kAccessRefMask) == (kAccessRefMask))) {
+        throw exception::RefcountOverflow("Refcount maxed out.");
+      } else if (exclusiveBitIsSet && (curValue & kAccessRefMask) == 0) {
+        res = incFailedEviction;
+        return false;
+      } else if (exclusiveBitIsSet && failIfMoving) {
+        res = incFailedMoving;
+        return false;
+      }
+      res = incOk;
+      return true;
+    };
+
+    auto newValue = [](const Value curValue) {
+      return (curValue + static_cast<Value>(1));
+    };
+
+    atomicUpdateValue(predicate, newValue);
+    return res;
   }
 
   // Bumps down the reference count
@@ -167,33 +174,38 @@ class FOLLY_PACK_ATTR RefcountWithFlags {
   //
   // @throw RefcountUnderflow when we are trying to decrement from 0
   //        refcount and have a refcount leak.
   FOLLY_ALWAYS_INLINE Value decRef() {
-    Value* const refPtr = &refCount_;
-    unsigned int nCASFailures = 0;
-    constexpr bool isWeak = false;
-
-    Value oldVal = __atomic_load_n(refPtr, __ATOMIC_RELAXED);
-    while (true) {
-      const Value newCount = oldVal - static_cast<Value>(1);
-      if ((oldVal & kAccessRefMask) == 0) {
+    auto predicate = [](const Value curValue) {
+      if ((curValue & kAccessRefMask) == 0) {
         throw exception::RefcountUnderflow(
             "Trying to decRef with no refcount. RefCount Leak!");
       }
+      return true;
+    };
 
-      if (__atomic_compare_exchange_n(refPtr, &oldVal, newCount, isWeak,
-                                      __ATOMIC_ACQ_REL, __ATOMIC_RELAXED)) {
-        return newCount & kRefMask;
-      }
-      if ((++nCASFailures % 4) == 0) {
-        // this pause takes up to 40 clock cycles on intel and the lock cmpxchgl
-        // above should take about 100 clock cycles. we pause once every 400
-        // cycles or so if we are extremely unlucky
-        folly::asm_volatile_pause();
-      }
-    }
+    Value retValue;
+    auto newValue = [&retValue](const Value curValue) {
+      retValue = (curValue - static_cast<Value>(1));
+      return retValue;
+    };
+
+    auto updated = atomicUpdateValue(predicate, newValue);
+    XDCHECK(updated);
+
+    return retValue & kRefMask;
   }
 
-  // Return refcount excluding control bits and flags
-  Value getAccessRef() const noexcept { return getRaw() & kAccessRefMask; }
+  // Return refcount excluding moving refcount, control bits and flags.
+  Value getAccessRef() const noexcept {
+    auto raw = getRaw();
+    auto accessRef = raw & kAccessRefMask;
+
+    if ((raw & getAdminRef<kExclusive>()) && accessRef >= 1) {
+      // if item is moving, ignore the extra ref
+      return accessRef - static_cast<Value>(1);
+    } else {
+      return accessRef;
+    }
+  }
 
   // Return access ref and the admin ref bits
   Value getRefWithAccessAndAdmin() const noexcept {
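Both rewritten functions funnel through a predicate/newValue CAS helper (atomicUpdateValue, defined later in this header). A self-contained sketch of that pattern using std::atomic in place of the GCC builtins; the names and the max count of 3 are illustrative:

    #include <atomic>
    #include <cassert>
    #include <cstdint>

    // Stand-alone version of the predicate/newValue update loop: check the
    // current value, compute the replacement, and retry on CAS failure.
    template <typename P, typename F>
    bool atomicUpdate(std::atomic<uint32_t>& v, P&& predicate, F&& newValue) {
      uint32_t cur = v.load(std::memory_order_relaxed);
      while (true) {
        if (!predicate(cur)) {
          return false; // leave the value unmodified
        }
        // On failure, compare_exchange_weak reloads `cur` and we re-check.
        if (v.compare_exchange_weak(cur, newValue(cur),
                                    std::memory_order_acq_rel,
                                    std::memory_order_relaxed)) {
          return true;
        }
      }
    }

    int main() {
      std::atomic<uint32_t> ref{0};
      auto bump = [&] {
        return atomicUpdate(
            ref, [](uint32_t c) { return c < 3; }, // refuse past a max of 3
            [](uint32_t c) { return c + 1; });
      };
      assert(bump() && bump() && bump()); // three bumps succeed
      assert(!bump());                    // fourth fails the predicate
      return 0;
    }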
@@ -246,65 +258,145 @@ class FOLLY_PACK_ATTR RefcountWithFlags {
   }
 
   /**
-   * The following four functions are used to track whether or not
-   * an item is currently in the process of being moved. This happens during a
-   * slab rebalance or resize operation or during eviction.
+   * The following two functions correspond to whether or not an item is
+   * currently in the process of being evicted. When an item is marked for
+   * eviction, the `kExclusive` bit is set and the ref count is zero.
    *
-   * An item can only be marked exclusive when `isInMMContainer` returns true
-   * and the item is not yet marked as exclusive. This operation is atomic.
+   * An item can only be marked for eviction when `isInMMContainer` and
+   * `isAccessible` return true, the item is not already marked for eviction
+   * nor moving, and the ref count is 0. This operation is atomic.
    *
-   * User can also query if an item "isOnlyExclusive". This returns true only
-   * if the refcount is 0 and only the exclusive bit is set.
-   *
-   * Unmarking exclusive does not depend on `isInMMContainer`.
-   * Unmarking exclusive will also return the refcount at the moment of
-   * unmarking.
+   * Unmarking eviction depends on neither `isInMMContainer` nor
+   * `isAccessible`.
    */
-  bool markExclusive() noexcept {
-    Value bitMask = getAdminRef<kExclusive>();
-    Value conditionBitMask = getAdminRef<kLinked>();
-
-    Value* const refPtr = &refCount_;
-    unsigned int nCASFailures = 0;
-    constexpr bool isWeak = false;
-    Value curValue = __atomic_load_n(refPtr, __ATOMIC_RELAXED);
-    while (true) {
+  bool markForEviction() noexcept {
+    auto predicate = [](const Value curValue) {
+      Value conditionBitMask = getAdminRef<kLinked>();
       const bool flagSet = curValue & conditionBitMask;
-      const bool alreadyExclusive = curValue & bitMask;
+      const bool alreadyExclusive = curValue & getAdminRef<kExclusive>();
+      const bool accessible = curValue & getAdminRef<kAccessible>();
+
       if (!flagSet || alreadyExclusive) {
         return false;
       }
-
-      const Value newValue = curValue | bitMask;
-      if (__atomic_compare_exchange_n(refPtr, &curValue, newValue, isWeak,
-                                      __ATOMIC_ACQ_REL, __ATOMIC_RELAXED)) {
-        XDCHECK(newValue & conditionBitMask);
-        return true;
+      if ((curValue & kAccessRefMask) != 0) {
+        return false;
       }
-
-      if ((++nCASFailures % 4) == 0) {
-        // this pause takes up to 40 clock cycles on intel and the lock cmpxchgl
-        // above should take about 100 clock cycles. we pause once every 400
-        // cycles or so if we are extremely unlucky.
-        folly::asm_volatile_pause();
+      if (!accessible) {
+        return false;
       }
-    }
+
+      return true;
+    };
+
+    auto newValue = [](const Value curValue) {
+      return curValue | getAdminRef<kExclusive>();
+    };
+
+    return atomicUpdateValue(predicate, newValue);
   }
-  Value unmarkExclusive() noexcept {
+
+  Value unmarkForEviction() noexcept {
+    XDCHECK(isMarkedForEviction());
    Value bitMask = ~getAdminRef<kExclusive>();
    return __atomic_and_fetch(&refCount_, bitMask, __ATOMIC_ACQ_REL) & kRefMask;
  }
 
-  bool isExclusive() const noexcept {
-    return getRaw() & getAdminRef<kExclusive>();
+
+  bool isMarkedForEviction() const noexcept {
+    auto raw = getRaw();
+    return (raw & getAdminRef<kExclusive>()) && ((raw & kAccessRefMask) == 0);
+  }
+
+  /**
+   * The following functions correspond to whether or not an item is
+   * currently in the process of being moved. When moving, the internal
+   * ref count is always >= 1 and the `kExclusive` bit is set. getRefCount
+   * does not return the extra ref (it can return 0).
+   *
+   * An item can only be marked moving when `isInMMContainer` returns true
+   * and the item is not already marked for eviction nor moving.
+   *
+   * User can also query if an item "isOnlyMoving". This returns true only
+   * if the refcount is one and only the exclusive bit is set.
+   *
+   * Unmarking moving does not depend on `isInMMContainer`.
+   */
+  bool markMoving(bool failIfRefNotZero) {
+    auto predicate = [failIfRefNotZero](const Value curValue) {
+      Value conditionBitMask = getAdminRef<kLinked>();
+      const bool flagSet = curValue & conditionBitMask;
+      const bool alreadyExclusive = curValue & getAdminRef<kExclusive>();
+      if (failIfRefNotZero && (curValue & kAccessRefMask) != 0) {
+        return false;
+      }
+      if (!flagSet || alreadyExclusive) {
+        return false;
+      }
+      if (UNLIKELY((curValue & kAccessRefMask) == (kAccessRefMask))) {
+        throw exception::RefcountOverflow("Refcount maxed out.");
+      }
+
+      return true;
+    };
+
+    auto newValue = [](const Value curValue) {
+      // Set the exclusive flag and make the ref count non-zero (to
+      // distinguish from the eviction case). This extra ref will not be
+      // reported to the user.
+      return (curValue + static_cast<Value>(1)) | getAdminRef<kExclusive>();
+    };
+
+    return atomicUpdateValue(predicate, newValue);
+  }
+
+  Value unmarkMoving() noexcept {
+    XDCHECK(isMoving());
+    auto predicate = [](const Value curValue) {
+      XDCHECK((curValue & kAccessRefMask) != 0);
+      return true;
+    };
+
+    Value retValue;
+    auto newValue = [&retValue](const Value curValue) {
+      retValue =
+          (curValue - static_cast<Value>(1)) & ~getAdminRef<kExclusive>();
+      return retValue;
+    };
+
+    auto updated = atomicUpdateValue(predicate, newValue);
+    XDCHECK(updated);
+
+    return retValue & kRefMask;
   }
 
-  bool isOnlyExclusive() const noexcept {
-    // An item is only exclusive when its refcount is zero and only the
-    // exclusive bit among all the control bits is set. This indicates an item
-    // is exclusive to the current thread. No other thread is allowed to
-    // do anything with it.
+
+  bool isMoving() const noexcept {
+    auto raw = getRaw();
+    return (raw & getAdminRef<kExclusive>()) && ((raw & kAccessRefMask) != 0);
+  }
+
+  /** This function attempts to mark the item for eviction.
+   *  Can only be called on an item that is moving. */
+  bool markForEvictionWhenMoving() {
+    XDCHECK(isMoving());
+
+    auto predicate = [](const Value curValue) {
+      return (curValue & kAccessRefMask) == 1;
+    };
+
+    auto newValue = [](const Value curValue) {
+      XDCHECK((curValue & kAccessRefMask) == 1);
+      return (curValue - static_cast<Value>(1));
+    };
+
+    return atomicUpdateValue(predicate, newValue);
+  }
+
+  bool isOnlyMoving() const noexcept {
+    // An item is only moving when its refcount is one and only the exclusive
+    // bit among all the control bits is set. This indicates an item is already
+    // on its way out of cache and does not need to be moved.
    auto ref = getRefWithAccessAndAdmin();
-    bool anyOtherBitSet = ref & ~getAdminRef<kExclusive>();
-    if (anyOtherBitSet) {
+    Value valueWithoutExclusiveBit = ref & ~getAdminRef<kExclusive>();
+    if (valueWithoutExclusiveBit != 1) {
      return false;
    }
    return ref & getAdminRef<kExclusive>();
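Taken together, the markings form a small state machine over the kExclusive bit plus the access ref count: eviction is exclusive with zero refs, moving is exclusive with a hidden extra ref. A self-contained model with invented bit positions (CacheLib's real masks differ):

    #include <cassert>
    #include <cstdint>

    // Simplified model of how the patch distinguishes "marked for eviction"
    // from "moving" with a single exclusive bit. Bit layout is illustrative.
    int main() {
      const uint32_t kAccessRefMask = (1u << 16) - 1; // low bits: ref count
      const uint32_t kExclusiveBit = 1u << 20;        // one admin bit

      uint32_t markedForEviction = kExclusiveBit | 0; // exclusive, refs == 0
      uint32_t moving = kExclusiveBit | 1;            // exclusive, refs >= 1

      auto isMarkedForEviction = [&](uint32_t v) {
        return (v & kExclusiveBit) && (v & kAccessRefMask) == 0;
      };
      auto isMoving = [&](uint32_t v) {
        return (v & kExclusiveBit) && (v & kAccessRefMask) != 0;
      };

      assert(isMarkedForEviction(markedForEviction) &&
             !isMoving(markedForEviction));
      assert(isMoving(moving) && !isMarkedForEviction(moving));
      return 0;
    }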
@@ -370,6 +462,39 @@ class FOLLY_PACK_ATTR RefcountWithFlags {
   }
 
  private:
+  /**
+   * Helper function to modify refCount_ atomically.
+   *
+   * If predicate(currentValue) is true, then it atomically assigns the
+   * result of newValueF(currentValue) to refCount_ and returns true.
+   * Otherwise it returns false and leaves refCount_ unmodified.
+   */
+  template <typename P, typename F>
+  bool atomicUpdateValue(P&& predicate, F&& newValueF) {
+    Value* const refPtr = &refCount_;
+    unsigned int nCASFailures = 0;
+    constexpr bool isWeak = false;
+    Value curValue = __atomic_load_n(refPtr, __ATOMIC_RELAXED);
+    while (true) {
+      if (!predicate(curValue)) {
+        return false;
+      }
+
+      const Value newValue = newValueF(curValue);
+      if (__atomic_compare_exchange_n(refPtr, &curValue, newValue, isWeak,
+                                      __ATOMIC_ACQ_REL, __ATOMIC_RELAXED)) {
+        return true;
+      }
+
+      if ((++nCASFailures % 4) == 0) {
+        // this pause takes up to 40 clock cycles on intel and the lock cmpxchgl
+        // above should take about 100 clock cycles. we pause once every 400
+        // cycles or so if we are extremely unlucky.
+        folly::asm_volatile_pause();
+      }
+    }
+  }
+
   template <Flags flagBit>
   static Value getFlag() noexcept {
     static_assert(flagBit >= kNumAccessRefBits + kNumAdminRefBits,
diff --git a/cachelib/allocator/datastruct/DList.h b/cachelib/allocator/datastruct/DList.h
index 2e872c8ee0..4d862b1908 100644
--- a/cachelib/allocator/datastruct/DList.h
+++ b/cachelib/allocator/datastruct/DList.h
@@ -221,6 +221,10 @@ class DList {
       curr_ = dir_ == Direction::FROM_HEAD ? dlist_->head_ : dlist_->tail_;
     }
 
+    Direction getDirection() noexcept {
+      return dir_;
+    }
+
    protected:
     void goForward() noexcept;
     void goBackward() noexcept;
diff --git a/cachelib/allocator/datastruct/MultiDList-inl.h b/cachelib/allocator/datastruct/MultiDList-inl.h
index e20510d4fc..cd79b600c5 100644
--- a/cachelib/allocator/datastruct/MultiDList-inl.h
+++ b/cachelib/allocator/datastruct/MultiDList-inl.h
@@ -25,12 +25,26 @@ void MultiDList<T, HookPtr>::Iterator::goForward() noexcept {
   }
   // Move iterator forward
   ++currIter_;
-  // If we land at the rend of this list, move to the previous list.
-  while (index_ != kInvalidIndex &&
-         currIter_ == mlist_.lists_[index_]->rend()) {
-    --index_;
-    if (index_ != kInvalidIndex) {
-      currIter_ = mlist_.lists_[index_]->rbegin();
+
+  if (currIter_.getDirection() == DListIterator::Direction::FROM_HEAD) {
+    // If we land at the end of this list, move to the next list.
+    while (index_ != kInvalidIndex && index_ != mlist_.lists_.size() &&
+           currIter_ == mlist_.lists_[index_]->end()) {
+      ++index_;
+      if (index_ != kInvalidIndex && index_ != mlist_.lists_.size()) {
+        currIter_ = mlist_.lists_[index_]->begin();
+      } else {
+        return;
+      }
+    }
+  } else {
+    // If we land at the rend of this list, move to the previous list.
+    while (index_ != kInvalidIndex &&
+           currIter_ == mlist_.lists_[index_]->rend()) {
+      --index_;
+      if (index_ != kInvalidIndex) {
+        currIter_ = mlist_.lists_[index_]->rbegin();
+      }
     }
   }
 }
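The FROM_HEAD branch gives MultiDList a forward traversal to complement the existing tail-first one. A toy model of the two visit orders over a vector of lists, skipping empty lists the way the iterator does (this is not the real iterator type):

    #include <cassert>
    #include <list>
    #include <vector>

    // rbegin()-style iteration starts at the tail of the last list and walks
    // backward; the new begin()-style starts at the head of the first list
    // and walks forward. Empty lists are skipped in both directions.
    int main() {
      std::vector<std::list<int>> lists{{1, 2}, {}, {3}};

      std::vector<int> fromHead;
      for (auto& l : lists)
        for (int v : l) fromHead.push_back(v);
      assert((fromHead == std::vector<int>{1, 2, 3}));

      std::vector<int> fromTail;
      for (auto it = lists.rbegin(); it != lists.rend(); ++it)
        for (auto vit = it->rbegin(); vit != it->rend(); ++vit)
          fromTail.push_back(*vit);
      assert((fromTail == std::vector<int>{3, 2, 1}));
      return 0;
    }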
@@ -71,6 +85,25 @@ void MultiDList<T, HookPtr>::Iterator::initToValidRBeginFrom(
                  : mlist_.lists_[index_]->rbegin();
 }
 
+template <typename T, DListHook<T> T::*HookPtr>
+void MultiDList<T, HookPtr>::Iterator::initToValidBeginFrom(
+    size_t listIdx) noexcept {
+  // Find the first non-empty list.
+  index_ = listIdx;
+  while (index_ != mlist_.lists_.size() &&
+         mlist_.lists_[index_]->size() == 0) {
+    ++index_;
+  }
+  if (index_ == mlist_.lists_.size()) {
+    // we reached the end - the iterator should be set to the invalid index
+    index_ = std::numeric_limits<size_t>::max();
+  }
+  currIter_ = index_ == std::numeric_limits<size_t>::max()
+                  ? mlist_.lists_[0]->begin()
+                  : mlist_.lists_[index_]->begin();
+}
+
 template <typename T, DListHook<T> T::*HookPtr>
 typename MultiDList<T, HookPtr>::Iterator&
 MultiDList<T, HookPtr>::Iterator::operator++() noexcept {
@@ -97,7 +130,16 @@ typename MultiDList<T, HookPtr>::Iterator MultiDList<T, HookPtr>::rbegin(
   if (listIdx >= lists_.size()) {
     throw std::invalid_argument("Invalid list index for MultiDList iterator.");
   }
-  return MultiDList::Iterator(*this, listIdx);
+  return MultiDList::Iterator(*this, listIdx, false);
+}
+
+template <typename T, DListHook<T> T::*HookPtr>
+typename MultiDList<T, HookPtr>::Iterator MultiDList<T, HookPtr>::begin(
+    size_t listIdx) const {
+  if (listIdx >= lists_.size()) {
+    throw std::invalid_argument("Invalid list index for MultiDList iterator.");
+  }
+  return MultiDList::Iterator(*this, listIdx, true);
 }
 
 template <typename T, DListHook<T> T::*HookPtr>
diff --git a/cachelib/allocator/datastruct/MultiDList.h b/cachelib/allocator/datastruct/MultiDList.h
index 1a59baa715..bd7be00bd4 100644
--- a/cachelib/allocator/datastruct/MultiDList.h
+++ b/cachelib/allocator/datastruct/MultiDList.h
@@ -110,14 +110,18 @@ class MultiDList {
     }
 
     explicit Iterator(const MultiDList& mlist,
-                      size_t listIdx) noexcept
+                      size_t listIdx, bool head) noexcept
         : currIter_(mlist.lists_[mlist.lists_.size() - 1]->rbegin()),
           mlist_(mlist) {
       XDCHECK_LT(listIdx, mlist.lists_.size());
-      initToValidRBeginFrom(listIdx);
+      if (head) {
+        initToValidBeginFrom(listIdx);
+      } else {
+        initToValidRBeginFrom(listIdx);
+      }
       // We should either point to an element or the end() iterator
       // which has an invalid index_.
-      XDCHECK(index_ == kInvalidIndex || currIter_.get() != nullptr);
+      XDCHECK(index_ == kInvalidIndex || index_ == mlist.lists_.size() ||
+              currIter_.get() != nullptr);
     }
     virtual ~Iterator() = default;
 
@@ -169,6 +173,9 @@ class MultiDList {
 
     // reset iterator to the beginning of a specific queue
     void initToValidRBeginFrom(size_t listIdx) noexcept;
+
+    // reset iterator to the head of a specific queue
+    void initToValidBeginFrom(size_t listIdx) noexcept;
 
     // Index of current list
     size_t index_{0};
@@ -184,6 +191,9 @@ class MultiDList {
 
   // provides an iterator starting from the tail of a specific list.
   Iterator rbegin(size_t idx) const;
+
+  // provides an iterator starting from the head of a specific list.
+  Iterator begin(size_t idx) const;
 
   // Iterator to compare against for the end.
   Iterator rend() const noexcept;
diff --git a/cachelib/allocator/datastruct/serialize/objects.thrift b/cachelib/allocator/datastruct/serialize/objects.thrift
index bd2c8b79bc..223b804e5b 100644
--- a/cachelib/allocator/datastruct/serialize/objects.thrift
+++ b/cachelib/allocator/datastruct/serialize/objects.thrift
@@ -22,17 +22,17 @@ namespace cpp2 facebook.cachelib.serialization
 
 // Saved state for an SList
 struct SListObject {
-  2: required i64 size,
-  3: required i64 compressedHead, // Pointer to the head element
+  2: required i64 size;
+  3: required i64 compressedHead; // Pointer to the head element
   // TODO(bwatling): remove the default value and clean up SList::SList() once
   // we can rely on 'compressedTail' always being valid.
- 4: i64 compressedTail = -1, // Pointer to the tail element + 4: i64 compressedTail = -1; // Pointer to the tail element } struct DListObject { - 1: required i64 compressedHead, - 2: required i64 compressedTail, - 3: required i64 size, + 1: required i64 compressedHead; + 2: required i64 compressedTail; + 3: required i64 size; } struct MultiDListObject { diff --git a/cachelib/allocator/memory/AllocationClass.cpp b/cachelib/allocator/memory/AllocationClass.cpp index 71089153e9..512df86bbe 100644 --- a/cachelib/allocator/memory/AllocationClass.cpp +++ b/cachelib/allocator/memory/AllocationClass.cpp @@ -50,7 +50,7 @@ AllocationClass::AllocationClass(ClassId classId, poolId_(poolId), allocationSize_(allocSize), slabAlloc_(s), - freedAllocations_{slabAlloc_.createPtrCompressor()} { + freedAllocations_{slabAlloc_.createSingleTierPtrCompressor()} { checkState(); } @@ -102,7 +102,7 @@ AllocationClass::AllocationClass( currSlab_(s.getSlabForIdx(*object.currSlabIdx())), slabAlloc_(s), freedAllocations_(*object.freedAllocationsObject(), - slabAlloc_.createPtrCompressor()), + slabAlloc_.createSingleTierPtrCompressor()), canAllocate_(*object.canAllocate()) { if (!slabAlloc_.isRestorable()) { throw std::logic_error("The allocation class cannot be restored."); @@ -356,9 +356,10 @@ std::pair> AllocationClass::pruneFreeAllocs( // allocated slab, release any freed allocations belonging to this slab. // Set the bit to true if the corresponding allocation is freed, false // otherwise. - FreeList freeAllocs{slabAlloc_.createPtrCompressor()}; - FreeList notInSlab{slabAlloc_.createPtrCompressor()}; - FreeList inSlab{slabAlloc_.createPtrCompressor()}; + FreeList freeAllocs{slabAlloc_.createSingleTierPtrCompressor()}; + FreeList notInSlab{slabAlloc_.createSingleTierPtrCompressor()}; + FreeList inSlab{slabAlloc_.createSingleTierPtrCompressor()}; + lock_->lock_combine([&]() { // Take the allocation class free list offline diff --git a/cachelib/allocator/memory/AllocationClass.h b/cachelib/allocator/memory/AllocationClass.h index b602e4d66d..269887f207 100644 --- a/cachelib/allocator/memory/AllocationClass.h +++ b/cachelib/allocator/memory/AllocationClass.h @@ -94,14 +94,6 @@ class AllocationClass { return static_cast(Slab::kSize / allocationSize_); } - // total number of slabs under this AllocationClass. - unsigned int getNumSlabs() const { - return lock_->lock_combine([this]() { - return static_cast(freeSlabs_.size() + - allocatedSlabs_.size()); - }); - } - // fetch stats about this allocation class. ACStats getStats() const; @@ -453,7 +445,7 @@ class AllocationClass { struct CACHELIB_PACKED_ATTR FreeAlloc { using CompressedPtr = facebook::cachelib::CompressedPtr; using PtrCompressor = - facebook::cachelib::PtrCompressor; + facebook::cachelib::SingleTierPtrCompressor; SListHook hook_{}; }; diff --git a/cachelib/allocator/memory/CompressedPtr.h b/cachelib/allocator/memory/CompressedPtr.h index 96d39ae2b9..d664063ea3 100644 --- a/cachelib/allocator/memory/CompressedPtr.h +++ b/cachelib/allocator/memory/CompressedPtr.h @@ -27,18 +27,32 @@ namespace cachelib { class SlabAllocator; -// the following are for pointer compression for the memory allocator. We -// compress pointers by storing the slab index and the alloc index of the -// allocation inside the slab. With slab worth kNumSlabBits of data, if we -// have the min allocation size as 64 bytes, that requires kNumSlabBits - 6 -// bits for storing the alloc index. This leaves the remaining (32 - -// (kNumSlabBits - 6)) bits for the slab index. 
Hence we can index 256 GiB
-// of memory in slabs and index anything more than 64 byte allocations inside
-// the slab using a 32 bit representation.
-//
+template <typename PtrType, typename AllocatorContainer>
+class PtrCompressor;
+
 // This CompressedPtr makes decompression fast by staying away from division and
-// modulo arithmetic and doing those during the compression time. We most often
-// decompress a CompressedPtr than compress a pointer while creating one.
+// modulo arithmetic and doing those during the compression time. We more often
+// decompress a CompressedPtr than compress a pointer while creating one. This
+// is used for pointer compression by the memory allocator.
+
+// We compress pointers by storing the tier index, slab index and alloc index
+// of the allocation inside the slab.
+
+// In the original design (without memory tiers):
+// Each slab addresses 22 bits of allocations (kNumSlabBits). This is split
+// into allocation index and allocation size. If we have the min allocation
+// size of 64 bytes (kMinAllocPower = 6 bits), the remaining kNumSlabBits(22) -
+// kMinAllocPower(6) = 16 bits store the alloc index. This leaves the
+// remaining 32 - (kNumSlabBits - kMinAllocPower) = 16 bits for the slab
+// index. Hence we can index 256 GiB of memory.
+
+// In the multi-tier design:
+// kNumSlabBits and kMinAllocPower remain unchanged. The tier id occupies the
+// 32nd bit only, since its value cannot exceed kMaxTiers(2). This leaves the
+// remaining 32 - (kNumSlabBits - kMinAllocPower) - 1 bit for tier id = 15 bits
+// for the slab index. Hence we can index 128 GiB of memory per tier in a
+// multi-tier configuration.
+
 class CACHELIB_PACKED_ATTR CompressedPtr {
  public:
   using PtrType = uint32_t;
@@ -62,9 +76,10 @@ class CACHELIB_PACKED_ATTR CompressedPtr {
     return static_cast<uint32_t>(1) << (Slab::kMinAllocPower);
   }
 
-  // maximum adressable memory for pointer compression to work.
+  // maximum addressable memory for pointer compression to work.
   static constexpr size_t getMaxAddressableSize() noexcept {
-    return static_cast<size_t>(1) << (kNumSlabIdxBits + Slab::kNumSlabBits);
+    return static_cast<size_t>(1)
+           << (numSlabIdxBits(false) + Slab::kNumSlabBits);
   }
 
   // default construct to nullptr.
@@ -89,8 +104,11 @@ class CACHELIB_PACKED_ATTR CompressedPtr {
   PtrType ptr_{kNull};
 
   // create a compressed pointer for a valid memory allocation.
-  CompressedPtr(uint32_t slabIdx, uint32_t allocIdx)
-      : ptr_(compress(slabIdx, allocIdx)) {}
+  CompressedPtr(uint32_t slabIdx,
+                uint32_t allocIdx,
+                bool isMultiTiered,
+                TierId tid = 0)
+      : ptr_(compress(slabIdx, allocIdx, isMultiTiered, tid)) {}
 
   constexpr explicit CompressedPtr(PtrType ptr) noexcept : ptr_{ptr} {}
 
@@ -100,55 +118,88 @@ class CACHELIB_PACKED_ATTR CompressedPtr {
   static constexpr unsigned int kNumAllocIdxBits =
       Slab::kNumSlabBits - Slab::kMinAllocPower;
 
+  // Use the 32nd bit position for the TierId
+  static constexpr unsigned int kNumTierIdxOffset = 31;
+
   static constexpr PtrType kAllocIdxMask = ((PtrType)1 << kNumAllocIdxBits) - 1;
 
-  // Number of bits for the slab index. This will be the top 16 bits of the
-  // compressed ptr.
-  static constexpr unsigned int kNumSlabIdxBits =
-      NumBits<PtrType>::value - kNumAllocIdxBits;
+  // kNumTierIdxBits most significant bits
+  static constexpr PtrType kTierIdxMask = (PtrType)1 << kNumTierIdxOffset;
+
+  // Number of bits for the slab index.
+  // If CacheLib is single-tiered, the slab index will be the top 16 bits
+  // of the compressed ptr.
+  // Else if CacheLib is multi-tiered, the topmost 32nd bit will be
+  // reserved for the tier id.
The following 15 bits will be reserved for + // the slab index. + static constexpr unsigned int numSlabIdxBits(bool isMultiTiered) { + return kNumTierIdxOffset - kNumAllocIdxBits + (!isMultiTiered); + } // Compress the given slabIdx and allocIdx into a 32-bit compressed // pointer. - static PtrType compress(uint32_t slabIdx, uint32_t allocIdx) noexcept { + static PtrType compress(uint32_t slabIdx, + uint32_t allocIdx, + bool isMultiTiered, + TierId tid) noexcept { XDCHECK_LE(allocIdx, kAllocIdxMask); - XDCHECK_LT(slabIdx, (1u << kNumSlabIdxBits) - 1); - return (slabIdx << kNumAllocIdxBits) + allocIdx; + XDCHECK_LT(slabIdx, (1u << numSlabIdxBits(isMultiTiered)) - 1); + if (!isMultiTiered) { + return (slabIdx << kNumAllocIdxBits) + allocIdx; + } + return (static_cast(tid) << kNumTierIdxOffset) + + (slabIdx << kNumAllocIdxBits) + allocIdx; } // Get the slab index of the compressed ptr - uint32_t getSlabIdx() const noexcept { + uint32_t getSlabIdx(bool isMultiTiered) const noexcept { XDCHECK(!isNull()); - return static_cast(ptr_ >> kNumAllocIdxBits); + auto noTierIdPtr = isMultiTiered ? ptr_ & ~kTierIdxMask : ptr_; + return static_cast(noTierIdPtr >> kNumAllocIdxBits); } // Get the allocation index of the compressed ptr uint32_t getAllocIdx() const noexcept { XDCHECK(!isNull()); + // Note: tid check not required in ptr_ since only + // the lower 16 bits are being read here. return static_cast(ptr_ & kAllocIdxMask); } + uint32_t getTierId(bool isMultiTiered) const noexcept { + XDCHECK(!isNull()); + return isMultiTiered ? static_cast(ptr_ >> kNumTierIdxOffset) : 0; + } + + void setTierId(TierId tid) noexcept { + ptr_ += static_cast(tid) << kNumTierIdxOffset; + } + friend SlabAllocator; + template + friend class PtrCompressor; }; template -class PtrCompressor { +class SingleTierPtrCompressor { public: - explicit PtrCompressor(const AllocatorT& allocator) noexcept + explicit SingleTierPtrCompressor(const AllocatorT& allocator) noexcept : allocator_(allocator) {} const CompressedPtr compress(const PtrType* uncompressed) const { - return allocator_.compress(uncompressed); + return allocator_.compress(uncompressed, false /* isMultiTiered */); } PtrType* unCompress(const CompressedPtr compressed) const { - return static_cast(allocator_.unCompress(compressed)); + return static_cast( + allocator_.unCompress(compressed, false /* isMultiTiered */)); } - bool operator==(const PtrCompressor& rhs) const noexcept { + bool operator==(const SingleTierPtrCompressor& rhs) const noexcept { return &allocator_ == &rhs.allocator_; } - bool operator!=(const PtrCompressor& rhs) const noexcept { + bool operator!=(const SingleTierPtrCompressor& rhs) const noexcept { return !(*this == rhs); } @@ -156,5 +207,53 @@ class PtrCompressor { // memory allocator that does the pointer compression. 
   const AllocatorT& allocator_;
 };
+
+template <typename PtrType, typename AllocatorContainer>
+class PtrCompressor {
+ public:
+  explicit PtrCompressor(const AllocatorContainer& allocators) noexcept
+      : allocators_(allocators) {}
+
+  const CompressedPtr compress(const PtrType* uncompressed) const {
+    if (uncompressed == nullptr)
+      return CompressedPtr{};
+
+    TierId tid;
+    for (tid = 0; tid < allocators_.size(); tid++) {
+      if (allocators_[tid]->isMemoryInAllocator(
+              static_cast<const void*>(uncompressed)))
+        break;
+    }
+
+    bool isMultiTiered = allocators_.size() > 1;
+    auto cptr = allocators_[tid]->compress(uncompressed, isMultiTiered);
+    if (isMultiTiered) { // config has multiple tiers
+      cptr.setTierId(tid);
+    }
+    return cptr;
+  }
+
+  PtrType* unCompress(const CompressedPtr compressed) const {
+    if (compressed.isNull()) {
+      return nullptr;
+    }
+    bool isMultiTiered = allocators_.size() > 1;
+    auto& allocator = *allocators_[compressed.getTierId(isMultiTiered)];
+    return static_cast<PtrType*>(
+        allocator.unCompress(compressed, isMultiTiered));
+  }
+
+  bool operator==(const PtrCompressor& rhs) const noexcept {
+    return &allocators_ == &rhs.allocators_;
+  }
+
+  bool operator!=(const PtrCompressor& rhs) const noexcept {
+    return !(*this == rhs);
+  }
+
+ private:
+  // the memory allocators that do the pointer compression, one per tier.
+  const AllocatorContainer& allocators_;
+};
 } // namespace cachelib
 } // namespace facebook
diff --git a/cachelib/allocator/memory/MemoryAllocator.h b/cachelib/allocator/memory/MemoryAllocator.h
index 509664afa6..a77d23494c 100644
--- a/cachelib/allocator/memory/MemoryAllocator.h
+++ b/cachelib/allocator/memory/MemoryAllocator.h
@@ -516,12 +516,13 @@ class MemoryAllocator {
   using CompressedPtr = facebook::cachelib::CompressedPtr;
   template <typename PtrType>
   using PtrCompressor =
-      facebook::cachelib::PtrCompressor<PtrType, SlabAllocator>;
-
+      facebook::cachelib::PtrCompressor<
+          PtrType, std::vector<std::unique_ptr<MemoryAllocator>>>;
+
   template <typename PtrType>
-  PtrCompressor<PtrType> createPtrCompressor() {
-    return slabAllocator_.createPtrCompressor<PtrType>();
-  }
+  using SingleTierPtrCompressor =
+      facebook::cachelib::PtrCompressor<PtrType, SlabAllocator>;
 
   // compress a given pointer to a valid allocation made out of this allocator
   // through an allocate() or nullptr. Calling this otherwise with invalid
@@ -534,8 +535,9 @@ class MemoryAllocator {
   // as the original pointer is valid.
   //
   // @throw std::invalid_argument if the ptr is invalid.
-  CompressedPtr CACHELIB_INLINE compress(const void* ptr) const {
-    return slabAllocator_.compress(ptr);
+  CompressedPtr CACHELIB_INLINE compress(const void* ptr,
+                                         bool isMultiTiered) const {
+    return slabAllocator_.compress(ptr, isMultiTiered);
   }
 
   // retrieve the raw pointer corresponding to the compressed pointer. This is
@@ -546,8 +548,9 @@ class MemoryAllocator {
   // @return the raw pointer corresponding to this compressed pointer.
   //
   // @throw std::invalid_argument if the compressed pointer is invalid.
-  void* CACHELIB_INLINE unCompress(const CompressedPtr cPtr) const {
-    return slabAllocator_.unCompress(cPtr);
+  void* CACHELIB_INLINE unCompress(const CompressedPtr cPtr,
+                                   bool isMultiTiered) const {
+    return slabAllocator_.unCompress(cPtr, isMultiTiered);
   }
 
   // a special implementation of pointer compression for benchmarking purposes.
@@ -644,6 +647,13 @@ class MemoryAllocator {
     memoryPoolManager_.updateNumSlabsToAdvise(numSlabs);
   }
 
+  // returns true if ptr points to memory which is managed by this
+  // allocator
+  bool isMemoryInAllocator(const void* ptr) {
+    return ptr && ptr >= slabAllocator_.getSlabMemoryBegin() &&
+           ptr < slabAllocator_.getSlabMemoryEnd();
+  }
+
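The compress/unCompress bit gymnastics are easier to verify with concrete values. A self-contained round trip over the layout described in CompressedPtr, assuming the constants stated there (22 slab bits, 6-bit minimum alloc power, bit 31 for the tier id):

    #include <cassert>
    #include <cstdint>

    // Worked example of the 32-bit layout: [tier:1][slab idx:15][alloc idx:16].
    int main() {
      const unsigned kNumAllocIdxBits = 22 - 6; // 16
      const unsigned kNumTierIdxOffset = 31;
      const uint32_t kAllocIdxMask = (1u << kNumAllocIdxBits) - 1;

      uint32_t slabIdx = 12, allocIdx = 345, tid = 1;
      uint32_t ptr = (tid << kNumTierIdxOffset) |
                     (slabIdx << kNumAllocIdxBits) | allocIdx;

      // Decompression peels the fields back off.
      assert((ptr >> kNumTierIdxOffset) == tid);
      assert(((ptr & ~(1u << kNumTierIdxOffset)) >> kNumAllocIdxBits) == slabIdx);
      assert((ptr & kAllocIdxMask) == allocIdx);
      return 0;
    }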
  private:
   // @param memory  pointer to the memory.
   // @return the MemoryPool corresponding to the memory.
diff --git a/cachelib/allocator/memory/MemoryAllocatorStats.h b/cachelib/allocator/memory/MemoryAllocatorStats.h
index 74ebbe64dd..acda9ee530 100644
--- a/cachelib/allocator/memory/MemoryAllocatorStats.h
+++ b/cachelib/allocator/memory/MemoryAllocatorStats.h
@@ -20,6 +20,7 @@
 #include
 
 #include "cachelib/allocator/memory/Slab.h"
+#include "cachelib/common/RollingStats.h"
 
 namespace facebook {
 namespace cachelib {
@@ -47,6 +48,9 @@ struct ACStats {
   // true if the allocation class is full.
   bool full;
 
+  // Rolling allocation latency (in ns)
+  util::RollingStats allocLatencyNs;
+
   constexpr unsigned long long totalSlabs() const noexcept {
     return freeSlabs + usedSlabs;
   }
@@ -54,6 +58,17 @@ struct ACStats {
   constexpr size_t getTotalFreeMemory() const noexcept {
     return Slab::kSize * freeSlabs + freeAllocs * allocSize;
   }
+
+  constexpr double usageFraction() const noexcept {
+    if (usedSlabs == 0)
+      return 0.0;
+
+    return static_cast<double>(activeAllocs) / (usedSlabs * allocsPerSlab);
+  }
+
+  constexpr size_t totalAllocatedSize() const noexcept {
+    return activeAllocs * allocSize;
+  }
 };
 
 // structure to query stats corresponding to a MemoryPool
diff --git a/cachelib/allocator/memory/Slab.h b/cachelib/allocator/memory/Slab.h
index 4784bee8e9..897ad4e349 100644
--- a/cachelib/allocator/memory/Slab.h
+++ b/cachelib/allocator/memory/Slab.h
@@ -50,6 +50,8 @@ namespace cachelib {
  * independently by the SlabAllocator.
  */
 
+// identifier for the memory tier
+using TierId = int8_t;
 // identifier for the memory pool
 using PoolId = int8_t;
 // identifier for the allocation class
diff --git a/cachelib/allocator/memory/SlabAllocator.cpp b/cachelib/allocator/memory/SlabAllocator.cpp
index ade5a8e535..080a3ee174 100644
--- a/cachelib/allocator/memory/SlabAllocator.cpp
+++ b/cachelib/allocator/memory/SlabAllocator.cpp
@@ -40,7 +40,9 @@ using namespace facebook::cachelib;
 
 namespace {
-size_t roundDownToSlabSize(size_t size) { return size - (size % sizeof(Slab)); }
+static inline size_t roundDownToSlabSize(size_t size) {
+  return size - (size % sizeof(Slab));
+}
 } // namespace
 
 // definitions to avoid ODR violation.
@@ -48,7 +50,6 @@ using PtrType = CompressedPtr::PtrType;
 constexpr uint64_t SlabAllocator::kAddressMask;
 constexpr PtrType CompressedPtr::kAllocIdxMask;
 constexpr unsigned int CompressedPtr::kNumAllocIdxBits;
-constexpr unsigned int CompressedPtr::kNumSlabIdxBits;
 constexpr unsigned int SlabAllocator::kLockSleepMS;
 constexpr size_t SlabAllocator::kPagesPerStep;
 
diff --git a/cachelib/allocator/memory/SlabAllocator.h b/cachelib/allocator/memory/SlabAllocator.h
index 5f5bf3265a..a80a54672c 100644
--- a/cachelib/allocator/memory/SlabAllocator.h
+++ b/cachelib/allocator/memory/SlabAllocator.h
@@ -225,7 +225,8 @@ class SlabAllocator {
   // the corresponding memory allocator. trying to inline this just increases
   // the code size and does not move the needle on the benchmarks much.
   // Calling this with invalid input in optimized build is undefined behavior.
-  CompressedPtr CACHELIB_INLINE compress(const void* ptr) const {
+  CompressedPtr CACHELIB_INLINE compress(const void* ptr,
+                                         bool isMultiTiered) const {
     if (ptr == nullptr) {
       return CompressedPtr{};
     }
@@ -246,18 +247,23 @@ class SlabAllocator {
         static_cast<uint32_t>(reinterpret_cast<const uint8_t*>(ptr) -
                               reinterpret_cast<const uint8_t*>(slab)) /
         allocSize;
-    return CompressedPtr{slabIndex, allocIdx};
+    return CompressedPtr{slabIndex, allocIdx, isMultiTiered};
   }
 
   // uncompress the pointer and return the raw ptr.
This function never throws // in optimized build and assumes that the caller is responsible for calling // it with a valid compressed pointer. - void* CACHELIB_INLINE unCompress(const CompressedPtr ptr) const { + void* CACHELIB_INLINE unCompress(const CompressedPtr ptr, + bool isMultiTiered) const { if (ptr.isNull()) { return nullptr; } - const SlabIdx slabIndex = ptr.getSlabIdx(); + /* TODO: isMultiTiered set to false by default. + Multi-tiering flag will have no impact till + rest of the multi-tiering changes are merged. + */ + const SlabIdx slabIndex = ptr.getSlabIdx(isMultiTiered); const uint32_t allocIdx = ptr.getAllocIdx(); const Slab* slab = &slabMemoryStart_[slabIndex]; @@ -312,8 +318,19 @@ class SlabAllocator { } template - PtrCompressor createPtrCompressor() const { - return PtrCompressor(*this); + SingleTierPtrCompressor createSingleTierPtrCompressor() const { + return SingleTierPtrCompressor(*this); + } + + // returns starting address of memory we own. + const Slab* getSlabMemoryBegin() const noexcept { + return reinterpret_cast(memoryStart_); + } + + // returns first byte after the end of memory region we own. + const Slab* getSlabMemoryEnd() const noexcept { + return reinterpret_cast(reinterpret_cast(memoryStart_) + + memorySize_); } private: @@ -333,12 +350,6 @@ class SlabAllocator { // @throw std::invalid_argument if the state is invalid. void checkState() const; - // returns first byte after the end of memory region we own. - const Slab* getSlabMemoryEnd() const noexcept { - return reinterpret_cast(reinterpret_cast(memoryStart_) + - memorySize_); - } - // returns true if we have slabbed all the memory that is available to us. // false otherwise. bool allMemorySlabbed() const noexcept { diff --git a/cachelib/allocator/memory/tests/MemoryAllocatorTest.cpp b/cachelib/allocator/memory/tests/MemoryAllocatorTest.cpp index cbd8cbef17..a19b9749a6 100644 --- a/cachelib/allocator/memory/tests/MemoryAllocatorTest.cpp +++ b/cachelib/allocator/memory/tests/MemoryAllocatorTest.cpp @@ -401,13 +401,28 @@ TEST_F(MemoryAllocatorTest, PointerCompression) { for (const auto& pool : poolAllocs) { const auto& allocs = pool.second; for (const auto* alloc : allocs) { - CompressedPtr ptr = m.compress(alloc); + CompressedPtr ptr = m.compress(alloc, false /* isMultiTiered */); ASSERT_FALSE(ptr.isNull()); - ASSERT_EQ(alloc, m.unCompress(ptr)); + ASSERT_EQ(alloc, m.unCompress(ptr, false /* isMultiTiered */)); } } - ASSERT_EQ(nullptr, m.unCompress(m.compress(nullptr))); + ASSERT_EQ(nullptr, + m.unCompress(m.compress(nullptr, false /* isMultiTiered */), + false /* isMultiTiered */)); + + // test pointer compression with multi-tier + for (const auto& pool : poolAllocs) { + const auto& allocs = pool.second; + for (const auto* alloc : allocs) { + CompressedPtr ptr = m.compress(alloc, true /* isMultiTiered */); + ASSERT_FALSE(ptr.isNull()); + ASSERT_EQ(alloc, m.unCompress(ptr, true /* isMultiTiered */)); + } + } + + ASSERT_EQ(nullptr, m.unCompress(m.compress(nullptr, true /* isMultiTiered */), + true /* isMultiTiered */)); } TEST_F(MemoryAllocatorTest, Restorable) { diff --git a/cachelib/allocator/nvmcache/NvmCache-inl.h b/cachelib/allocator/nvmcache/NvmCache-inl.h index b79712e607..3cef6b7ad0 100644 --- a/cachelib/allocator/nvmcache/NvmCache-inl.h +++ b/cachelib/allocator/nvmcache/NvmCache-inl.h @@ -460,19 +460,18 @@ uint32_t NvmCache::getStorageSizeInNvm(const Item& it) { } template -std::unique_ptr NvmCache::makeNvmItem(const WriteHandle& hdl) { - const auto& item = *hdl; +std::unique_ptr 
NvmCache::makeNvmItem(const Item& item) { auto poolId = cache_.getAllocInfo((void*)(&item)).poolId; if (item.isChainedItem()) { throw std::invalid_argument(folly::sformat( - "Chained item can not be flushed separately {}", hdl->toString())); + "Chained item can not be flushed separately {}", item.toString())); } auto chainedItemRange = - CacheAPIWrapperForNvm::viewAsChainedAllocsRange(cache_, *hdl); + CacheAPIWrapperForNvm::viewAsChainedAllocsRange(cache_, item); if (config_.encodeCb && !config_.encodeCb(EncodeDecodeContext{ - *(hdl.getInternal()), chainedItemRange})) { + const_cast(item), chainedItemRange})) { return nullptr; } @@ -496,12 +495,10 @@ std::unique_ptr NvmCache::makeNvmItem(const WriteHandle& hdl) { } template -void NvmCache::put(WriteHandle& hdl, PutToken token) { +void NvmCache::put(Item& item, PutToken token) { util::LatencyTracker tracker(stats().nvmInsertLatency_); - HashedKey hk{hdl->getKey()}; + HashedKey hk{item.getKey()}; - XDCHECK(hdl); - auto& item = *hdl; // for regular items that can only write to nvmcache upon eviction, we // should not be recording a write for an nvmclean item unless it is marked // as evicted from nvmcache. @@ -526,7 +523,7 @@ void NvmCache::put(WriteHandle& hdl, PutToken token) { return; } - auto nvmItem = makeNvmItem(hdl); + auto nvmItem = makeNvmItem(item); if (!nvmItem) { stats().numNvmPutEncodeFailure.inc(); return; diff --git a/cachelib/allocator/nvmcache/NvmCache.h b/cachelib/allocator/nvmcache/NvmCache.h index c9f5c753f0..245431f7d0 100644 --- a/cachelib/allocator/nvmcache/NvmCache.h +++ b/cachelib/allocator/nvmcache/NvmCache.h @@ -158,11 +158,11 @@ class NvmCache { PutToken createPutToken(folly::StringPiece key); // store the given item in navy - // @param hdl handle to cache item. should not be null + // @param item cache item // @param token the put token for the item. this must have been // obtained before enqueueing the put to maintain // consistency - void put(WriteHandle& hdl, PutToken token); + void put(Item& item, PutToken token); // returns the current state of whether nvmcache is enabled or not. nvmcache // can be disabled if the backend implementation ends up in a corrupt state @@ -286,7 +286,7 @@ class NvmCache { // returns true if there is tombstone entry for the key. bool hasTombStone(HashedKey hk); - std::unique_ptr makeNvmItem(const WriteHandle& handle); + std::unique_ptr makeNvmItem(const Item& item); // wrap an item into a blob for writing into navy. 
Blob makeBlob(const Item& it); diff --git a/cachelib/allocator/nvmcache/tests/NvmCacheTests.cpp b/cachelib/allocator/nvmcache/tests/NvmCacheTests.cpp index 7355627fea..ec74c51980 100644 --- a/cachelib/allocator/nvmcache/tests/NvmCacheTests.cpp +++ b/cachelib/allocator/nvmcache/tests/NvmCacheTests.cpp @@ -245,7 +245,13 @@ TEST_F(NvmCacheTest, EvictToNvmGetCheckCtime) { ASSERT_NE(nullptr, it); cache_->insertOrReplace(it); keyToCtime.insert({key, it->getCreationTime()}); + // Avoid any nvm eviction being dropped due to the race with still + // outstanding remove operation for insertion + if (i % 100 == 0) { + nvm.flushNvmCache(); + } } + nvm.flushNvmCache(); const auto nEvictions = this->evictionCount() - evictBefore; ASSERT_LT(0, nEvictions); @@ -331,6 +337,11 @@ TEST_F(NvmCacheTest, Delete) { auto it = nvm.allocate(pid, key, 15 * 1024); ASSERT_NE(nullptr, it); nvm.insertOrReplace(it); + // Avoid any nvm eviction being dropped due to the race with still + // outstanding remove operation for insertion + if (i % 100 == 0) { + nvm.flushNvmCache(); + } } nvm.flushNvmCache(); @@ -533,6 +544,11 @@ TEST_F(NvmCacheTest, NvmEvicted) { auto it = nvm.allocate(pid, key, allocSize); ASSERT_NE(nullptr, it); nvm.insertOrReplace(it); + // Avoid any nvm eviction being dropped due to the race with still + // outstanding remove operation for insertion + if (i % 100 == 0) { + nvm.flushNvmCache(); + } } nvm.flushNvmCache(); diff --git a/cachelib/allocator/nvmcache/tests/NvmTestBase.h b/cachelib/allocator/nvmcache/tests/NvmTestBase.h index fd88875fa9..70f00f2e52 100644 --- a/cachelib/allocator/nvmcache/tests/NvmTestBase.h +++ b/cachelib/allocator/nvmcache/tests/NvmTestBase.h @@ -108,7 +108,7 @@ class NvmCacheTest : public testing::Test { void pushToNvmCacheFromRamForTesting(WriteHandle& handle) { auto nvmCache = getNvmCache(); if (nvmCache) { - nvmCache->put(handle, nvmCache->createPutToken(handle->getKey())); + nvmCache->put(*handle, nvmCache->createPutToken(handle->getKey())); } } @@ -127,7 +127,7 @@ class NvmCacheTest : public testing::Test { } std::unique_ptr makeNvmItem(const WriteHandle& handle) { - return getNvmCache()->makeNvmItem(handle); + return getNvmCache()->makeNvmItem(*handle); } std::unique_ptr createItemAsIOBuf(folly::StringPiece key, diff --git a/cachelib/allocator/tests/AllocatorMemoryTiersTest.cpp b/cachelib/allocator/tests/AllocatorMemoryTiersTest.cpp new file mode 100644 index 0000000000..ad845d94df --- /dev/null +++ b/cachelib/allocator/tests/AllocatorMemoryTiersTest.cpp @@ -0,0 +1,36 @@ +/* + * Copyright (c) Intel Corporation. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "cachelib/allocator/tests/AllocatorMemoryTiersTest.h" + +namespace facebook { +namespace cachelib { +namespace tests { + +using LruAllocatorMemoryTiersTest = AllocatorMemoryTiersTest; +//using LruTestAllocatorMemoryTiersTest = AllocatorMemoryTiersTest; +// TODO(MEMORY_TIER): add more tests with different eviction policies +TEST_F(LruAllocatorMemoryTiersTest, MultiTiersInvalid) { this->testMultiTiersInvalid(); } +TEST_F(LruAllocatorMemoryTiersTest, MultiTiersValid) { this->testMultiTiersValid(); } +TEST_F(LruAllocatorMemoryTiersTest, MultiTiersValidMixed) { this->testMultiTiersValidMixed(); } +TEST_F(LruAllocatorMemoryTiersTest, MultiTiersBackgroundMovers ) { this->testMultiTiersBackgroundMovers(); } +TEST_F(LruAllocatorMemoryTiersTest, MultiTiersRemoveDuringEviction) { this->testMultiTiersRemoveDuringEviction(); } +TEST_F(LruAllocatorMemoryTiersTest, MultiTiersReplaceDuringEviction) { this->testMultiTiersReplaceDuringEviction(); } +TEST_F(LruAllocatorMemoryTiersTest, MultiTiersReplaceDuringEvictionWithReader) { this->testMultiTiersReplaceDuringEvictionWithReader(); } + +} // end of namespace tests +} // end of namespace cachelib +} // end of namespace facebook diff --git a/cachelib/allocator/tests/AllocatorMemoryTiersTest.h b/cachelib/allocator/tests/AllocatorMemoryTiersTest.h new file mode 100644 index 0000000000..c724cd9617 --- /dev/null +++ b/cachelib/allocator/tests/AllocatorMemoryTiersTest.h @@ -0,0 +1,367 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include "cachelib/allocator/CacheAllocatorConfig.h" +#include "cachelib/allocator/MemoryTierCacheConfig.h" +#include "cachelib/allocator/tests/TestBase.h" +#include "cachelib/allocator/FreeThresholdStrategy.h" +#include "cachelib/allocator/PromotionStrategy.h" + +#include +#include +#include +#include +#include + +namespace facebook { +namespace cachelib { +namespace tests { + +template +class AllocatorMemoryTiersTest : public AllocatorTest { + private: + template + void testMultiTiersAsyncOpDuringMove(std::unique_ptr& alloc, + PoolId& pool, bool& quit, MvCallback&& moveCb) { + typename AllocatorT::Config config; + config.setCacheSize(4 * Slab::kSize); + config.enableCachePersistence("/tmp"); + config.configureMemoryTiers({ + MemoryTierCacheConfig::fromShm() + .setRatio(1).setMemBind(std::string("0")), + MemoryTierCacheConfig::fromShm() + .setRatio(1).setMemBind(std::string("0")) + }); + + config.enableMovingOnSlabRelease(moveCb, {} /* ChainedItemsMoveSync */, + -1 /* movingAttemptsLimit */); + + alloc = std::make_unique(AllocatorT::SharedMemNew, config); + ASSERT(alloc != nullptr); + pool = alloc->addPool("default", alloc->getCacheMemoryStats().ramCacheSize); + + int i = 0; + while(!quit) { + auto handle = alloc->allocate(pool, std::to_string(++i), std::string("value").size()); + ASSERT(handle != nullptr); + ASSERT_NO_THROW(alloc->insertOrReplace(handle)); + } + } + + public: + void testMultiTiersInvalid() { + typename AllocatorT::Config config; + config.setCacheSize(100 * Slab::kSize); + ASSERT_NO_THROW(config.configureMemoryTiers( + {MemoryTierCacheConfig::fromShm().setRatio(1).setMemBind( + std::string("0")), + MemoryTierCacheConfig::fromShm().setRatio(1).setMemBind( + std::string("0"))})); + } + + void testMultiTiersValid() { + typename AllocatorT::Config config; + config.setCacheSize(100 * Slab::kSize); + config.enableCachePersistence("/tmp"); + ASSERT_NO_THROW(config.configureMemoryTiers( + {MemoryTierCacheConfig::fromShm().setRatio(1).setMemBind( + std::string("0")), + MemoryTierCacheConfig::fromShm().setRatio(1).setMemBind( + std::string("0"))})); + + auto alloc = std::make_unique(AllocatorT::SharedMemNew, config); + ASSERT(alloc != nullptr); + + auto pool = alloc->addPool("default", alloc->getCacheMemoryStats().ramCacheSize); + auto handle = alloc->allocate(pool, "key", std::string("value").size()); + ASSERT(handle != nullptr); + ASSERT_NO_THROW(alloc->insertOrReplace(handle)); + } + + void testMultiTiersBackgroundMovers() { + typename AllocatorT::Config config; + config.setCacheSize(10 * Slab::kSize); + config.enableCachePersistence("/tmp"); + config.usePosixForShm(); + config.configureMemoryTiers({ + MemoryTierCacheConfig::fromShm() + .setRatio(1).setMemBind(std::string("0")), + MemoryTierCacheConfig::fromShm() + .setRatio(1).setMemBind(std::string("0")) + }); + config.enableBackgroundEvictor(std::make_shared(2, 10, 100, 40), + std::chrono::milliseconds(10),1); + config.enableBackgroundPromoter(std::make_shared(5, 4, 2), + std::chrono::milliseconds(10),1); + + auto allocator = std::make_unique(AllocatorT::SharedMemNew, config); + ASSERT(allocator != nullptr); + const size_t numBytes = allocator->getCacheMemoryStats().ramCacheSize; + + auto poolId = allocator->addPool("default", numBytes); + + const unsigned int keyLen = 100; + const unsigned int size = 100; + unsigned int allocs = 0; + + //we should work on pool stats because filluppooluntil evictions + //will finish once we evict an item from tier 0 to tier 1 and + //there will be unallocated memory 
left.
+    while (allocs < 174760) {
+      const auto key = this->getRandomNewKey(*allocator, keyLen);
+      ASSERT_EQ(allocator->find(key), nullptr);
+      auto handle = util::allocateAccessible(*allocator, poolId, key, size);
+      allocs++;
+    }
+
+    const auto key = this->getRandomNewKey(*allocator, keyLen);
+    auto handle = util::allocateAccessible(*allocator, poolId, key, size);
+    ASSERT_NE(nullptr, handle);
+    const uint8_t cid = allocator->getAllocInfo(handle->getMemory()).classId;
+    ASSERT_EQ(cid, 5);
+    auto stats = allocator->getGlobalCacheStats();
+    auto slabStats = allocator->getACStats(0, 0, cid);
+    const auto& mpStats = allocator->getPoolByTid(poolId, 0).getStats();
+    // cache is 10MB; the evictor should move about 1MB to reach 10% free
+    uint32_t approxEvict = (1024 * 1024) / mpStats.acStats.at(cid).allocSize;
+    while (stats.evictionStats.numMovedItems < approxEvict * 0.95 &&
+           (1 - slabStats.usageFraction()) >= 0.095) {
+      std::this_thread::sleep_for(std::chrono::seconds(1));
+      stats = allocator->getGlobalCacheStats();
+      slabStats = allocator->getACStats(0, 0, cid);
+    }
+    ASSERT_GE(1 - slabStats.usageFraction(), 0.095);
+
+    auto perclassEstats = allocator->getBackgroundMoverClassStats(MoverDir::Evict);
+    auto perclassPstats = allocator->getBackgroundMoverClassStats(MoverDir::Promote);
+
+    ASSERT_GE(stats.evictionStats.numMovedItems, 1);
+    ASSERT_GE(stats.evictionStats.runCount, 1);
+    ASSERT_GE(stats.promotionStats.numMovedItems, 1);
+
+    ASSERT_GE(perclassEstats[0][0][cid], 1);
+    ASSERT_GE(perclassPstats[1][0][cid], 1);
+  }
+
+  void testMultiTiersValidMixed() {
+    typename AllocatorT::Config config;
+    config.setCacheSize(100 * Slab::kSize);
+    config.enableCachePersistence("/tmp");
+    ASSERT_NO_THROW(config.configureMemoryTiers(
+        {MemoryTierCacheConfig::fromShm().setRatio(1).setMemBind(
+             std::string("0")),
+         MemoryTierCacheConfig::fromShm().setRatio(1).setMemBind(
+             std::string("0"))}));
+
+    auto alloc = std::make_unique<AllocatorT>(AllocatorT::SharedMemNew, config);
+    ASSERT(alloc != nullptr);
+
+    auto pool = alloc->addPool("default", alloc->getCacheMemoryStats().ramCacheSize);
+    auto handle = alloc->allocate(pool, "key", std::string("value").size());
+    ASSERT(handle != nullptr);
+    ASSERT_NO_THROW(alloc->insertOrReplace(handle));
+  }
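The mover wiring in testMultiTiersBackgroundMovers is parameterized by watermarks. A self-contained sketch of the decision logic as I read FreeThresholdStrategy(2, 10, 100, 40) and PromotionStrategy(5, 4, 2); the parameter meanings are my assumption and should be checked against the strategy constructors:

    #include <cassert>

    // Assumed reading: the evictor keeps each tier-0 class between 2% and
    // 10% free; the promoter refills upward while the destination (upper)
    // tier still has at least 5% free space.
    int main() {
      const double lowEvictionAcWatermark = 2.0;
      const double highEvictionAcWatermark = 10.0;
      const double promotionAcWatermark = 5.0;

      double tier0FreePct = 1.0; // tier 0 nearly full
      double tier1FreePct = 8.0; // destination tier has head-room

      bool evictorRuns = tier0FreePct < highEvictionAcWatermark;
      bool evictorDone = tier0FreePct >= lowEvictionAcWatermark;
      bool promoterRuns = tier1FreePct >= promotionAcWatermark;

      assert(evictorRuns && !evictorDone && promoterRuns);
      return 0;
    }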
+
+  void testMultiTiersRemoveDuringEviction() {
+    std::unique_ptr<AllocatorT> alloc;
+    PoolId pool;
+    std::unique_ptr<std::thread> t;
+    folly::Latch latch(1);
+    bool quit = false;
+
+    auto moveCb = [&](typename AllocatorT::Item& oldItem,
+                      typename AllocatorT::Item& newItem,
+                      typename AllocatorT::Item* /* parentPtr */) {
+      auto key = oldItem.getKey();
+      t = std::make_unique<std::thread>([&]() {
+        // remove() is blocked by the wait context until the item is moved
+        // to the next tier, so we must notify the latch before calling
+        // remove().
+        latch.count_down();
+        alloc->remove(key);
+      });
+      // wait till the async thread is running
+      latch.wait();
+      memcpy(newItem.getMemory(), oldItem.getMemory(), oldItem.getSize());
+      quit = true;
+    };
+
+    testMultiTiersAsyncOpDuringMove(alloc, pool, quit, moveCb);
+
+    t->join();
+  }
+
+  void testMultiTiersReplaceDuringEviction() {
+    std::unique_ptr<AllocatorT> alloc;
+    PoolId pool;
+    std::unique_ptr<std::thread> t;
+    folly::Latch latch(1);
+    bool quit = false;
+
+    auto moveCb = [&](typename AllocatorT::Item& oldItem,
+                      typename AllocatorT::Item& newItem,
+                      typename AllocatorT::Item* /* parentPtr */) {
+      auto key = oldItem.getKey();
+      if (!quit) {
+        // we need to replace only once because subsequent allocate calls
+        // will cause evictions recursively
+        quit = true;
+        t = std::make_unique<std::thread>([&]() {
+          auto handle = alloc->allocate(pool, key, std::string("new value").size());
+          // insertOrReplace() is blocked by the wait context until the item
+          // is moved to the next tier, so we must notify the latch before
+          // calling insertOrReplace().
+          latch.count_down();
+          ASSERT_NO_THROW(alloc->insertOrReplace(handle));
+        });
+        // wait till the async thread is running
+        latch.wait();
+      }
+      memcpy(newItem.getMemory(), oldItem.getMemory(), oldItem.getSize());
+    };
+
+    testMultiTiersAsyncOpDuringMove(alloc, pool, quit, moveCb);
+
+    t->join();
+  }
+
+  void gdb_sync1() {}
+  void gdb_sync2() {}
+  void gdb_sync3() {}
+  using ReadHandle = typename AllocatorT::ReadHandle;
+  void testMultiTiersReplaceDuringEvictionWithReader() {
+    sem_unlink("/gdb1_sem");
+    sem_t* sem = sem_open("/gdb1_sem", O_CREAT | O_EXCL, S_IRUSR | S_IWUSR, 0);
+    int gdbfd = open("/tmp/gdb1.gdb", O_CREAT | O_TRUNC | O_RDWR, S_IRUSR | S_IWUSR);
+    char gdbcmds[] =
+        "set attached=1\n"
+        "break gdb_sync1\n"
+        "break gdb_sync2\n"
+        "break moveRegularItemWithSync\n"
+        "c\n"
+        "set scheduler-locking on\n"
+        "thread 1\n"
+        "c\n"
+        "thread 4\n"
+        "c\n"
+        "thread 5\n"
+        "break nativeFutexWaitImpl thread 5\n"
+        "c\n"
+        "thread 4\n"
+        "break nativeFutexWaitImpl thread 4\n"
+        "c\n"
+        "thread 1\n"
+        "break releaseBackToAllocator\n"
+        "c\n"
+        "c\n"
+        "thread 5\n"
+        "c\n"
+        "thread 4\n"
+        "c\n"
+        "thread 1\n"
+        "break gdb_sync3\n"
+        "c\n"
+        "quit\n";
+    int ret = write(gdbfd, gdbcmds, strlen(gdbcmds));
+    int ppid = getpid(); // parent pid
+    int pid = fork();
+    if (pid == 0) {
+      sem_wait(sem);
+      sem_close(sem);
+      sem_unlink("/gdb1_sem");
+      char cmdpid[256];
+      sprintf(cmdpid, "%d", ppid);
+      int f = execlp("gdb", "gdb", "--pid", cmdpid, "--batch-silent",
+                     "--command=/tmp/gdb1.gdb", (char*)0);
+      ASSERT(f != -1);
+    }
+    sem_post(sem);
+    // wait for gdb to attach (the script above sets `attached` to 1)
+    volatile int attached = 0;
+    while (attached == 0);
+
+    std::unique_ptr<AllocatorT> alloc;
+    PoolId pool;
+    bool quit = false;
+
+    typename AllocatorT::Config config;
+    config.setCacheSize(4 * Slab::kSize);
+    config.enableCachePersistence("/tmp");
+    config.configureMemoryTiers({
+        MemoryTierCacheConfig::fromShm()
+            .setRatio(1).setMemBind(std::string("0")),
+        MemoryTierCacheConfig::fromShm()
+            .setRatio(1).setMemBind(std::string("0"))
+    });
+
+    alloc = std::make_unique<AllocatorT>(AllocatorT::SharedMemNew, config);
+    ASSERT(alloc != nullptr);
+    pool = alloc->addPool("default", alloc->getCacheMemoryStats().ramCacheSize);
+
+    int i = 0;
+    typename AllocatorT::Item* evicted;
+    std::unique_ptr<std::thread> t;
+    std::unique_ptr<std::thread> r;
+    while (!quit) {
+      auto handle = alloc->allocate(pool, std::to_string(++i), std::string("value").size());
+      ASSERT(handle != nullptr);
+      if (i == 1) {
+        evicted = static_cast<typename AllocatorT::Item*>(handle.get());
+        folly::Latch latch_t(1);
+        t = std::make_unique<std::thread>([&]() {
+          auto handleNew = alloc->allocate(pool, std::to_string(1), std::string("new value").size());
+          ASSERT(handleNew != nullptr);
+          latch_t.count_down();
+          // the first breakpoint will be this one, because thread 1 still
+          // has more items to allocate before the cache fills up and an
+          // item is evicted
+          gdb_sync1();
+          ASSERT(evicted->isMoving());
+          // need to suspend thread 1, which is doing the eviction;
+          // gdb will do this for us
+          folly::Latch latch(1);
+          r = std::make_unique<std::thread>([&]() {
+            ASSERT(evicted->isMoving());
+            latch.count_down();
+            auto handleEvict = alloc->find(std::to_string(1));
+            // find() blocks until the item is done moving
+            while (evicted->isMarkedForEviction()); // the move will fail
+            XDCHECK(handleEvict == nullptr) << handleEvict->toString();
+            ASSERT(handleEvict == nullptr);
+          });
+          latch.wait();
+          gdb_sync2();
+          alloc->insertOrReplace(handleNew);
+          ASSERT(!evicted->isAccessible()); // the move failed
+          quit = true;
+        });
+        latch_t.wait();
+      }
+      ASSERT_NO_THROW(alloc->insertOrReplace(handle));
+    }
+    t->join();
+    r->join();
+    gdb_sync3();
+  }
+};
+} // namespace tests
+} // namespace cachelib
+} // namespace facebook
diff --git a/cachelib/allocator/tests/AllocatorResizeTest.h b/cachelib/allocator/tests/AllocatorResizeTest.h
index d65205ac74..883dd9c056 100644
--- a/cachelib/allocator/tests/AllocatorResizeTest.h
+++ b/cachelib/allocator/tests/AllocatorResizeTest.h
@@ -966,23 +966,23 @@ class AllocatorResizeTest : public AllocatorTest {
       for (i = 1; i <= numItersToMaxAdviseAway + 1; i++) {
         alloc.memMonitor_->adviseAwaySlabs();
         std::this_thread::sleep_for(std::chrono::seconds{2});
-        ASSERT_EQ(alloc.allocator_->getAdvisedMemorySize(), i * perIterAdvSize);
+        ASSERT_EQ(alloc.allocator_[0 /* TODO - extend test */]->getAdvisedMemorySize(),
+                  i * perIterAdvSize);
       }
       i--;
       // This should fail
       alloc.memMonitor_->adviseAwaySlabs();
       std::this_thread::sleep_for(std::chrono::seconds{2});
-      auto totalAdvisedAwayMemory = alloc.allocator_->getAdvisedMemorySize();
+      auto totalAdvisedAwayMemory =
+          alloc.allocator_[0 /* TODO - extend test */]->getAdvisedMemorySize();
       ASSERT_EQ(totalAdvisedAwayMemory, i * perIterAdvSize);
 
       // Try to reclaim back
       for (i = 1; i <= numItersToMaxAdviseAway + 1; i++) {
         alloc.memMonitor_->reclaimSlabs();
         std::this_thread::sleep_for(std::chrono::seconds{2});
-        ASSERT_EQ(alloc.allocator_->getAdvisedMemorySize(),
+        ASSERT_EQ(alloc.allocator_[0 /* TODO - extend test */]->getAdvisedMemorySize(),
                   totalAdvisedAwayMemory - i * perIterAdvSize);
       }
-      totalAdvisedAwayMemory = alloc.allocator_->getAdvisedMemorySize();
+      totalAdvisedAwayMemory =
+          alloc.allocator_[0 /* TODO - extend test */]->getAdvisedMemorySize();
       ASSERT_EQ(totalAdvisedAwayMemory, 0);
     }
   }
diff --git a/cachelib/allocator/tests/AllocatorTypeTest.cpp b/cachelib/allocator/tests/AllocatorTypeTest.cpp
index 97e08cd7ed..a572485e8d 100644
--- a/cachelib/allocator/tests/AllocatorTypeTest.cpp
+++ b/cachelib/allocator/tests/AllocatorTypeTest.cpp
@@ -14,6 +14,7 @@
  * limitations under the License.
*/ +#include "cachelib/allocator/MemoryTierCacheConfig.h" #include "cachelib/allocator/tests/BaseAllocatorTest.h" #include "cachelib/allocator/tests/TestBase.h" @@ -226,6 +227,11 @@ TYPED_TEST(BaseAllocatorTest, ReaperSkippingSlabTraversalWhileSlabReleasing) { } TYPED_TEST(BaseAllocatorTest, ReaperShutDown) { this->testReaperShutDown(); } +TYPED_TEST(BaseAllocatorTest, ReaperShutDownFile) { + this->testReaperShutDown( + {MemoryTierCacheConfig::fromShm().setRatio(1).setMemBind( + std::string("0"))}); +} TYPED_TEST(BaseAllocatorTest, ShutDownWithActiveHandles) { this->testShutDownWithActiveHandles(); @@ -403,6 +409,7 @@ TYPED_TEST(BaseAllocatorTest, RateMap) { this->testRateMap(); } TYPED_TEST(BaseAllocatorTest, StatSnapshotTest) { this->testStatSnapshotTest(); } +TYPED_TEST(BaseAllocatorTest, BasicMultiTier) {this->testBasicMultiTier(); } namespace { // the tests that cannot be done by TYPED_TEST. diff --git a/cachelib/allocator/tests/BaseAllocatorTest.h b/cachelib/allocator/tests/BaseAllocatorTest.h index d684545cb9..5733798e98 100644 --- a/cachelib/allocator/tests/BaseAllocatorTest.h +++ b/cachelib/allocator/tests/BaseAllocatorTest.h @@ -713,35 +713,29 @@ class BaseAllocatorTest : public AllocatorTest { auto handle = alloc.find("key"); ASSERT_NE(handle, nullptr); ASSERT_TRUE(isConst(handle->getMemory())); - ASSERT_EQ(handle.isWriteHandle(), false); // read handle clone auto handle2 = handle.clone(); ASSERT_TRUE(isConst(handle2->getMemory())); - ASSERT_EQ(handle2.isWriteHandle(), false); // upgrade a read handle to a write handle auto handle3 = std::move(handle).toWriteHandle(); ASSERT_FALSE(isConst(handle3->getMemory())); - ASSERT_EQ(handle3.isWriteHandle(), true); } { auto handle = alloc.findToWrite("key"); ASSERT_NE(handle, nullptr); ASSERT_FALSE(isConst(handle->getMemory())); - ASSERT_EQ(handle.isWriteHandle(), true); // write handle clone auto handle2 = handle.clone(); ASSERT_FALSE(isConst(handle2->getMemory())); - ASSERT_EQ(handle2.isWriteHandle(), true); // downgrade a write handle to a read handle ReadHandle handle3 = handle.clone(); ASSERT_NE(handle3, nullptr); ASSERT_TRUE(isConst(handle3->getMemory())); - ASSERT_EQ(handle3.isWriteHandle(), false); } { @@ -752,7 +746,7 @@ class BaseAllocatorTest : public AllocatorTest { // This is like doing a "clone" and setting it into wait context waitContext->set(alloc.find("key")); auto handle2 = std::move(handle).toWriteHandle(); - ASSERT_EQ(handle2.isWriteHandle(), true); + ASSERT_FALSE(isConst(handle2->getMemory())); } } @@ -1259,7 +1253,8 @@ class BaseAllocatorTest : public AllocatorTest { this->testLruLength(alloc, poolId, sizes, keyLen, evictedKeys); } - void testReaperShutDown() { + void testReaperShutDown( + typename AllocatorT::Config::MemoryTierConfigs cfgs = {}) { const size_t nSlabs = 20; const size_t size = nSlabs * Slab::kSize; @@ -1269,6 +1264,8 @@ class BaseAllocatorTest : public AllocatorTest { config.setAccessConfig({8, 8}); config.enableCachePersistence(this->cacheDir_); config.enableItemReaperInBackground(std::chrono::seconds(1), {}); + if (cfgs.size()) + config.configureMemoryTiers(cfgs); std::vector keys; { AllocatorT alloc(AllocatorT::SharedMemNew, config); @@ -4080,15 +4077,16 @@ class BaseAllocatorTest : public AllocatorTest { // Check that item is in the expected container. 
bool findItem(AllocatorT& allocator, typename AllocatorT::Item* item) { auto& container = allocator.getMMContainer(*item); - auto itr = container.getEvictionIterator(); bool found = false; - while (itr) { - if (itr.get() == item) { - found = true; - break; + container.withEvictionIterator([&found, &item](auto&& itr) { + while (itr) { + if (itr.get() == item) { + found = true; + break; + } + ++itr; } - ++itr; - } + }); return found; } @@ -4238,13 +4236,13 @@ class BaseAllocatorTest : public AllocatorTest { // Had a bug: D4799860 where we allocated the wrong size for chained item { const auto parentAllocInfo = - alloc.allocator_->getAllocInfo(itemHandle->getMemory()); + alloc.allocator_[0 /* TODO - extend test */]->getAllocInfo(itemHandle->getMemory()); const auto child1AllocInfo = - alloc.allocator_->getAllocInfo(chainedItemHandle->getMemory()); + alloc.allocator_[0 /* TODO - extend test */]->getAllocInfo(chainedItemHandle->getMemory()); const auto child2AllocInfo = - alloc.allocator_->getAllocInfo(chainedItemHandle2->getMemory()); + alloc.allocator_[0 /* TODO - extend test */]->getAllocInfo(chainedItemHandle2->getMemory()); const auto child3AllocInfo = - alloc.allocator_->getAllocInfo(chainedItemHandle3->getMemory()); + alloc.allocator_[0 /* TODO - extend test */]->getAllocInfo(chainedItemHandle3->getMemory()); const auto parentCid = parentAllocInfo.classId; const auto child1Cid = child1AllocInfo.classId; @@ -5476,8 +5474,12 @@ class BaseAllocatorTest : public AllocatorTest { ASSERT_TRUE(big->isInMMContainer()); auto& mmContainer = alloc.getMMContainer(*big); - auto itr = mmContainer.getEvictionIterator(); - ASSERT_EQ(big.get(), &(*itr)); + + typename AllocatorT::Item* evictionCandidate = nullptr; + mmContainer.withEvictionIterator( + [&evictionCandidate](auto&& itr) { evictionCandidate = itr.get(); }); + + ASSERT_EQ(big.get(), evictionCandidate); alloc.remove("hello"); } @@ -5491,8 +5493,11 @@ class BaseAllocatorTest : public AllocatorTest { ASSERT_TRUE(small2->isInMMContainer()); auto& mmContainer = alloc.getMMContainer(*small2); - auto itr = mmContainer.getEvictionIterator(); - ASSERT_EQ(small2.get(), &(*itr)); + + typename AllocatorT::Item* evictionCandidate = nullptr; + mmContainer.withEvictionIterator( + [&evictionCandidate](auto&& itr) { evictionCandidate = itr.get(); }); + ASSERT_EQ(small2.get(), evictionCandidate); alloc.remove("hello"); } @@ -6290,6 +6295,86 @@ class BaseAllocatorTest : public AllocatorTest { }); EXPECT_EQ(intervalNameExists, 4); } + + void testSingleTierMemoryAllocatorSize() { + typename AllocatorT::Config config; + static constexpr size_t cacheSize = 100 * 1024 * 1024; /* 100 MB */ + config.setCacheSize(cacheSize); + config.enableCachePersistence(folly::sformat("/tmp/single-tier-test/{}", ::getpid())); + + AllocatorT alloc(AllocatorT::SharedMemNew, config); + + EXPECT_LE(alloc.allocator_[0]->getMemorySize(), cacheSize); + } + + void testSingleTierMemoryAllocatorSizeAnonymous() { + typename AllocatorT::Config config; + static constexpr size_t cacheSize = 100 * 1024 * 1024; /* 100 MB */ + config.setCacheSize(cacheSize); + + AllocatorT alloc(config); + + EXPECT_LE(alloc.allocator_[0]->getMemorySize(), cacheSize); + } + + void testBasicMultiTier() { + using Item = typename AllocatorT::Item; + const static std::string data = "data"; + + std::set movedKeys; + auto moveCb = [&](const Item& oldItem, Item& newItem, Item* /* parentPtr */) { + std::memcpy(newItem.getMemory(), oldItem.getMemory(), oldItem.getSize()); + movedKeys.insert(oldItem.getKey().str()); + }; + + 
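+    // Builds a two-tier cache below: two equal-ratio shm tiers, both bound to
+    // NUMA node 0, with slab-release moving enabled so that moveCb records
+    // every key moved between tiers.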
typename AllocatorT::Config config; + static constexpr size_t cacheSize = 100 * 1024 * 1024; /* 100 MB */ + config.setCacheSize(100 * 1024 * 1024); /* 100 MB */ + config.enableCachePersistence(folly::sformat("/tmp/multi-tier-test/{}", ::getpid())); + config.configureMemoryTiers({ + MemoryTierCacheConfig::fromShm().setRatio(1) + .setMemBind(std::string("0")), + MemoryTierCacheConfig::fromShm().setRatio(1) + .setMemBind(std::string("0")), + }); + config.enableMovingOnSlabRelease(moveCb); + + AllocatorT alloc(AllocatorT::SharedMemNew, config); + + EXPECT_EQ(alloc.allocator_.size(), 2); + EXPECT_LE(alloc.allocator_[0]->getMemorySize(), cacheSize / 2); + EXPECT_LE(alloc.allocator_[1]->getMemorySize(), cacheSize / 2); + + const size_t numBytes = alloc.getCacheMemoryStats().ramCacheSize; + auto pid = alloc.addPool("default", numBytes); + + static constexpr size_t numOps = cacheSize / 1024; + for (int i = 0; i < numOps; i++) { + std::string key = std::to_string(i); + auto h = alloc.allocate(pid, key, 1024); + EXPECT_TRUE(h); + + std::memcpy(h->getMemory(), data.data(), data.size()); + + alloc.insertOrReplace(h); + } + + EXPECT_TRUE(movedKeys.size() > 0); + + size_t movedButStillInMemory = 0; + for (const auto &k : movedKeys) { + auto h = alloc.find(k); + + if (h) { + movedButStillInMemory++; + /* All moved elements should be in the second tier. */ + EXPECT_TRUE(alloc.allocator_[1]->isMemoryInAllocator(h->getMemory())); + EXPECT_EQ(data, std::string((char*)h->getMemory(), data.size())); + } + } + + EXPECT_TRUE(movedButStillInMemory > 0); + } }; } // namespace tests } // namespace cachelib diff --git a/cachelib/allocator/tests/CacheBaseTest.cpp b/cachelib/allocator/tests/CacheBaseTest.cpp index 928fcc0c67..dae14c5335 100644 --- a/cachelib/allocator/tests/CacheBaseTest.cpp +++ b/cachelib/allocator/tests/CacheBaseTest.cpp @@ -33,7 +33,10 @@ class CacheBaseTest : public CacheBase, public SlabAllocatorTestBase { const std::string getCacheName() const override { return cacheName; } bool isObjectCache() const override { return false; } const MemoryPool& getPool(PoolId) const override { return memoryPool_; } + //TODO: support tiers + const MemoryPool& getPoolByTid(PoolId, TierId tid) const override { return memoryPool_; } PoolStats getPoolStats(PoolId) const override { return PoolStats(); } + ACStats getACStats(TierId, PoolId, ClassId) const { return ACStats(); }; AllSlabReleaseEvents getAllSlabReleaseEvents(PoolId) const override { return AllSlabReleaseEvents{}; } diff --git a/cachelib/allocator/tests/ItemHandleTest.cpp b/cachelib/allocator/tests/ItemHandleTest.cpp index d992a84011..5213166816 100644 --- a/cachelib/allocator/tests/ItemHandleTest.cpp +++ b/cachelib/allocator/tests/ItemHandleTest.cpp @@ -39,6 +39,8 @@ struct TestItem { using ChainedItem = int; void reset() {} + + folly::StringPiece getKey() const { return folly::StringPiece(); } }; struct TestNvmCache; @@ -80,6 +82,12 @@ struct TestAllocator { void adjustHandleCountForThread_private(int i) { tlRef_.tlStats() += i; } + bool addWaitContextForMovingItem( + folly::StringPiece key, + std::shared_ptr> waiter) { + return false; + } + util::FastStats tlRef_; }; } // namespace diff --git a/cachelib/allocator/tests/ItemTest.cpp b/cachelib/allocator/tests/ItemTest.cpp index b0f3a2fdec..2f25cc07f0 100644 --- a/cachelib/allocator/tests/ItemTest.cpp +++ b/cachelib/allocator/tests/ItemTest.cpp @@ -82,11 +82,23 @@ TEST(ItemTest, ExpiryTime) { EXPECT_TRUE(result); EXPECT_EQ(tenMins, item->getConfiguredTTL()); + // So that exclusive bit will be set + 
item->markAccessible(); // Test that writes fail while the item is moving - item->markExclusive(); + result = item->markMoving(true); + EXPECT_TRUE(result); + result = item->updateExpiryTime(0); + EXPECT_FALSE(result); + item->unmarkMoving(); + + // Test that writes fail while the item is marked for eviction + item->markAccessible(); + result = item->markForEviction(); + EXPECT_TRUE(result); result = item->updateExpiryTime(0); EXPECT_FALSE(result); - item->unmarkExclusive(); + item->unmarkForEviction(); + item->unmarkAccessible(); // Test that writes fail while the item is not in an MMContainer item->unmarkInMMContainer(); diff --git a/cachelib/allocator/tests/MM2QTest.cpp b/cachelib/allocator/tests/MM2QTest.cpp index e11dd95f5a..0e01ffa56f 100644 --- a/cachelib/allocator/tests/MM2QTest.cpp +++ b/cachelib/allocator/tests/MM2QTest.cpp @@ -223,6 +223,19 @@ void MMTypeTest::testIterate(std::vector>& nodes, } } +template +void MMTypeTest::testIterateHot(std::vector>& nodes, + Container& c) { + auto it = nodes.rbegin(); + c.withPromotionIterator([&it,&c](auto &&it2q) { + while (it2q && c.isHot(*it2q)) { + ASSERT_EQ(it2q->getId(), (*it)->getId()); + ++it2q; + ++it; + } + }); +} + template void MMTypeTest::testMatch(std::string expected, MMTypeTest::Container& c) { @@ -238,6 +251,23 @@ void MMTypeTest::testMatch(std::string expected, ASSERT_EQ(expected, actual); } +template +void MMTypeTest::testMatchHot(std::string expected, + MMTypeTest::Container& c) { + int index = -1; + std::string actual; + c.withPromotionIterator([&c,&actual,&index](auto &&it2q) { + while (it2q) { + ++index; + actual += folly::stringPrintf( + "%d:%s, ", it2q->getId(), + (c.isHot(*it2q) ? "H" : (c.isCold(*it2q) ? "C" : "W"))); + ++it2q; + } + }); + ASSERT_EQ(expected, actual); +} + TEST_F(MM2QTest, DetailedTest) { MM2Q::Config config; config.lruRefreshTime = 0; @@ -259,8 +289,11 @@ TEST_F(MM2QTest, DetailedTest) { } testIterate(nodes, c); + testIterateHot(nodes, c); testMatch("0:C, 1:C, 2:C, 3:C, 4:H, 5:H, ", c); + testMatchHot("5:H, 4:H, 3:C, 2:C, 1:C, 0:C, ", c); + // Move 3 to top of the hot cache c.recordAccess(*(nodes[4]), AccessMode::kRead); testMatch("0:C, 1:C, 2:C, 3:C, 5:H, 4:H, ", c); diff --git a/cachelib/allocator/tests/MMTypeTest.h b/cachelib/allocator/tests/MMTypeTest.h index d38f6ce2c1..dbc55677ea 100644 --- a/cachelib/allocator/tests/MMTypeTest.h +++ b/cachelib/allocator/tests/MMTypeTest.h @@ -147,7 +147,9 @@ class MMTypeTest : public testing::Test { void testRecordAccessBasic(Config c); void testSerializationBasic(Config c); void testIterate(std::vector>& nodes, Container& c); + void testIterateHot(std::vector>& nodes, Container& c); void testMatch(std::string expected, Container& c); + void testMatchHot(std::string expected, Container& c); size_t getListSize(const Container& c, typename MMType::LruType list); void verifyIterationVariants(Container& c); }; diff --git a/cachelib/allocator/tests/MemoryTiersTest.cpp b/cachelib/allocator/tests/MemoryTiersTest.cpp new file mode 100644 index 0000000000..298195378c --- /dev/null +++ b/cachelib/allocator/tests/MemoryTiersTest.cpp @@ -0,0 +1,255 @@ +/* + * Copyright (c) Intel Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +#include + +#include "cachelib/allocator/CacheAllocator.h" +#include "cachelib/allocator/tests/TestBase.h" + +namespace facebook { +namespace cachelib { +namespace tests { + +using LruAllocatorConfig = CacheAllocatorConfig; +using LruMemoryTierConfigs = LruAllocatorConfig::MemoryTierConfigs; +using Strings = std::vector; +using Ratios = std::vector; + +constexpr size_t MB = 1024ULL * 1024ULL; +constexpr size_t GB = MB * 1024ULL; + +const size_t defaultTotalCacheSize{1 * GB}; +const std::string defaultCacheDir{"/tmp/metadataDir"}; + +template +class MemoryTiersTest : public AllocatorTest { + public: + void basicCheck(LruAllocatorConfig& actualConfig, + size_t expectedTotalCacheSize = defaultTotalCacheSize, + const std::string& expectedCacheDir = defaultCacheDir) { + EXPECT_EQ(actualConfig.getCacheSize(), expectedTotalCacheSize); + auto configs = actualConfig.getMemoryTierConfigs(); + + size_t sum_ratios = std::accumulate( + configs.begin(), configs.end(), 0UL, + [](const size_t i, const MemoryTierCacheConfig& config) { + return i + config.getRatio(); + }); + size_t sum_sizes = std::accumulate( + configs.begin(), configs.end(), 0UL, + [&](const size_t i, const MemoryTierCacheConfig& config) { + return i + config.calculateTierSize(actualConfig.getCacheSize(), + sum_ratios); + }); + + EXPECT_GE(expectedTotalCacheSize, sum_ratios * Slab::kSize); + EXPECT_LE(sum_sizes, expectedTotalCacheSize); + EXPECT_GE(sum_sizes, expectedTotalCacheSize - configs.size() * Slab::kSize); + } + + LruAllocatorConfig createTestCacheConfig( + const Ratios& tierRatios = {1}, + bool setPosixForShm = true, + size_t cacheSize = defaultTotalCacheSize, + const std::string& cacheDir = defaultCacheDir) { + LruAllocatorConfig cfg; + cfg.setCacheSize(cacheSize).enableCachePersistence(cacheDir); + + if (setPosixForShm) + cfg.usePosixForShm(); + LruMemoryTierConfigs tierConfigs; + tierConfigs.reserve(tierRatios.size()); + for (auto i = 0; i < tierRatios.size(); ++i) { + tierConfigs.push_back(MemoryTierCacheConfig::fromShm() + .setRatio(tierRatios[i]) + .setMemBind(std::string("0"))); + } + + cfg.configureMemoryTiers(tierConfigs); + return cfg; + } + + LruAllocatorConfig createTieredCacheConfig(size_t totalCacheSize, + size_t numTiers = 2) { + LruAllocatorConfig tieredCacheConfig{}; + std::vector configs; + for (auto i = 1; i <= numTiers; ++i) { + configs.push_back(MemoryTierCacheConfig::fromShm().setRatio(1).setMemBind( + std::string("0"))); + } + tieredCacheConfig.setCacheSize(totalCacheSize) + .enableCachePersistence( + folly::sformat("/tmp/multi-tier-test/{}", ::getpid())) + .usePosixForShm() + .configureMemoryTiers(configs); + return tieredCacheConfig; + } + + LruAllocatorConfig createDramCacheConfig(size_t totalCacheSize) { + LruAllocatorConfig dramConfig{}; + dramConfig.setCacheSize(totalCacheSize); + return dramConfig; + } + + void validatePoolSize(PoolId poolId, + std::unique_ptr& allocator, + size_t expectedSize) { + size_t actualSize = allocator->getPoolSize(poolId); + EXPECT_EQ(actualSize, expectedSize); + } + + void testAddPool(std::unique_ptr& alloc, + size_t poolSize, + bool 
isSizeValid = true,
+                   size_t numTiers = 2) {
+    if (isSizeValid) {
+      auto pool = alloc->addPool("validPoolSize", poolSize);
+      EXPECT_LE(alloc->getPoolSize(pool), poolSize);
+      if (poolSize >= numTiers * Slab::kSize)
+        EXPECT_GE(alloc->getPoolSize(pool),
+                  poolSize - numTiers * Slab::kSize);
+    } else {
+      EXPECT_THROW(alloc->addPool("invalidPoolSize", poolSize),
+                   std::invalid_argument);
+      // TODO: test this for all tiers
+      EXPECT_EQ(alloc->getPoolIds().size(), 0);
+    }
+  }
+};
+
+using LruMemoryTiersTest = MemoryTiersTest<LruAllocator>;
+
+TEST_F(LruMemoryTiersTest, TestValid1TierConfig) {
+  LruAllocatorConfig cfg = createTestCacheConfig().validate();
+  basicCheck(cfg);
+}
+
+TEST_F(LruMemoryTiersTest, TestValid2TierConfig) {
+  LruAllocatorConfig cfg = createTestCacheConfig({1, 1});
+  basicCheck(cfg);
+}
+
+TEST_F(LruMemoryTiersTest, TestValid2TierRatioConfig) {
+  LruAllocatorConfig cfg = createTestCacheConfig({5, 2});
+  basicCheck(cfg);
+}
+
+TEST_F(LruMemoryTiersTest, TestInvalid2TierConfigNumberOfPartitionsTooLarge) {
+  EXPECT_THROW(createTestCacheConfig({defaultTotalCacheSize, 1}).validate(),
+               std::invalid_argument);
+}
+
+TEST_F(LruMemoryTiersTest, TestInvalid2TierConfigSizesAndRatioNotSet) {
+  EXPECT_THROW(createTestCacheConfig({1, 0}), std::invalid_argument);
+}
+
+TEST_F(LruMemoryTiersTest, TestInvalid2TierConfigRatiosCacheSizeNotSet) {
+  EXPECT_THROW(createTestCacheConfig({1, 1}, true,
+                                     /* cacheSize */ 0)
+                   .validate(),
+               std::invalid_argument);
+}
+
+TEST_F(LruMemoryTiersTest, TestInvalid2TierConfigRatioNotSet) {
+  EXPECT_THROW(createTestCacheConfig({1, 0}), std::invalid_argument);
+}
+
+TEST_F(LruMemoryTiersTest, TestInvalid2TierConfigSizesNeCacheSize) {
+  EXPECT_THROW(createTestCacheConfig({0, 0}), std::invalid_argument);
+}
+
+TEST_F(LruMemoryTiersTest, TestPoolAllocations) {
+  std::vector<size_t> totalCacheSizes = {8 * GB, 2 * GB};
+
+  static const size_t numExtraSizes = 4;
+  static const size_t numExtraSlabs = 20;
+
+  for (size_t i = 0; i < numExtraSizes; i++) {
+    totalCacheSizes.push_back(totalCacheSizes.back() +
+                              (folly::Random::rand64() % numExtraSlabs) *
+                                  Slab::kSize);
+  }
+
+  size_t min_ratio = 1;
+  size_t max_ratio = 111;
+
+  static const size_t numCombinations = 10;
+
+  for (auto totalCacheSize : totalCacheSizes) {
+    for (size_t k = 0; k < numCombinations; k++) {
+      const size_t i = folly::Random::rand32() % max_ratio + min_ratio;
+      const size_t j = folly::Random::rand32() % max_ratio + min_ratio;
+      LruAllocatorConfig cfg =
+          createTestCacheConfig({i, j},
+                                /* usePosix */ true, totalCacheSize);
+      basicCheck(cfg, totalCacheSize);
+
+      std::unique_ptr<LruAllocator> alloc = std::unique_ptr<LruAllocator>(
+          new LruAllocator(LruAllocator::SharedMemNew, cfg));
+
+      size_t size = (folly::Random::rand64() %
+                     (alloc->getCacheMemoryStats().ramCacheSize - Slab::kSize)) +
+                    Slab::kSize;
+      testAddPool(alloc, size, true);
+    }
+  }
+}
+
+TEST_F(LruMemoryTiersTest, TestPoolInvalidAllocations) {
+  std::vector<size_t> totalCacheSizes = {48 * MB, 51 * MB, 256 * MB,
+                                         1 * GB, 5 * GB, 8 * GB};
+  size_t min_ratio = 1;
+  size_t max_ratio = 111;
+
+  static const size_t numCombinations = 10;
+
+  for (auto totalCacheSize : totalCacheSizes) {
+    for (size_t k = 0; k < numCombinations; k++) {
+      const size_t i = folly::Random::rand32() % max_ratio + min_ratio;
+      const size_t j = folly::Random::rand32() % max_ratio + min_ratio;
+      LruAllocatorConfig cfg =
+          createTestCacheConfig({i, j},
+                                /* usePosix */ true, totalCacheSize);
+
+      std::unique_ptr<LruAllocator> alloc = nullptr;
+      try {
+        alloc = std::unique_ptr<LruAllocator>(
+            new LruAllocator(LruAllocator::SharedMemNew, cfg));
+      } catch (...) {
+        // an exception is expected only if the cache is too small
+        size_t sum_ratios = std::accumulate(
+            cfg.getMemoryTierConfigs().begin(), cfg.getMemoryTierConfigs().end(), 0UL,
+            [](const size_t i, const MemoryTierCacheConfig& config) {
+              return i + config.getRatio();
+            });
+        auto tier1slabs = cfg.getMemoryTierConfigs()[0].calculateTierSize(cfg.getCacheSize(), sum_ratios) / Slab::kSize;
+        auto tier2slabs = cfg.getMemoryTierConfigs()[1].calculateTierSize(cfg.getCacheSize(), sum_ratios) / Slab::kSize;
+        EXPECT_TRUE(tier1slabs <= 2 || tier2slabs <= 2);
+
+        continue;
+      }
+
+      size_t size = (folly::Random::rand64() % (100 * GB)) +
+                    alloc->getCacheMemoryStats().ramCacheSize;
+      testAddPool(alloc, size, false);
+    }
+  }
+}
+} // namespace tests
+} // namespace cachelib
+} // namespace facebook
diff --git a/cachelib/allocator/tests/NvmTestUtils.h b/cachelib/allocator/tests/NvmTestUtils.h
index 6d6242aadf..cad96c41d4 100644
--- a/cachelib/allocator/tests/NvmTestUtils.h
+++ b/cachelib/allocator/tests/NvmTestUtils.h
@@ -27,7 +27,7 @@ namespace utils {
 using NavyConfig = navy::NavyConfig;
 inline NavyConfig getNvmTestConfig(const std::string& cacheDir) {
   NavyConfig config{};
-  config.setSimpleFile(cacheDir + "/navy", 100 * 1024ULL * 1024ULL);
+  config.setSimpleFile(cacheDir + "/navy", 200 * 1024ULL * 1024ULL);
   config.setDeviceMetadataSize(4 * 1024 * 1024);
   config.setBlockSize(1024);
   config.setNavyReqOrderingShards(10);
diff --git a/cachelib/allocator/tests/RefCountTest.cpp b/cachelib/allocator/tests/RefCountTest.cpp
index b355a48a8e..862271b03d 100644
--- a/cachelib/allocator/tests/RefCountTest.cpp
+++ b/cachelib/allocator/tests/RefCountTest.cpp
@@ -30,6 +30,7 @@ class RefCountTest : public AllocTestBase {
  public:
   static void testMultiThreaded();
   static void testBasic();
+  static void testMarkForEvictionAndMoving();
 };

 void RefCountTest::testMultiThreaded() {
@@ -51,7 +52,7 @@ void RefCountTest::testMultiThreaded() {
         nLocalRef--;
         ref.markAccessible();
       } else {
-        ref.incRef();
+        ref.incRef(true);
         nLocalRef++;
         ref.unmarkAccessible();
       }
@@ -81,7 +82,7 @@ void RefCountTest::testBasic() {
   ASSERT_EQ(0, ref.getRaw());
   ASSERT_FALSE(ref.isInMMContainer());
   ASSERT_FALSE(ref.isAccessible());
-  ASSERT_FALSE(ref.isExclusive());
+  ASSERT_FALSE(ref.isMoving());
   ASSERT_FALSE(ref.template isFlagSet());
   ASSERT_FALSE(ref.template isFlagSet());
@@ -89,7 +90,7 @@
   ref.markInMMContainer();
   ASSERT_TRUE(ref.isInMMContainer());
   ASSERT_FALSE(ref.isAccessible());
-  ASSERT_FALSE(ref.isExclusive());
+  ASSERT_FALSE(ref.isMoving());
   ASSERT_EQ(0, ref.getAccessRef());
   ASSERT_FALSE(ref.template isFlagSet());
   ASSERT_FALSE(ref.template isFlagSet());
@@ -100,18 +101,18 @@
   ASSERT_FALSE(ref.template isFlagSet());

   for (uint32_t i = 0; i < RefcountWithFlags::kAccessRefMask; i++) {
-    ASSERT_TRUE(ref.incRef());
+    ASSERT_EQ(ref.incRef(true), RefcountWithFlags::incOk);
   }

   // Incrementing past the max will fail
   auto rawRef = ref.getRaw();
-  ASSERT_FALSE(ref.incRef());
+  ASSERT_THROW(ref.incRef(true), std::overflow_error);
   ASSERT_EQ(rawRef, ref.getRaw());

   // Bumping up access ref shouldn't affect admin ref and flags
   ASSERT_TRUE(ref.isInMMContainer());
   ASSERT_FALSE(ref.isAccessible());
-  ASSERT_FALSE(ref.isExclusive());
+  ASSERT_FALSE(ref.isMoving());
   ASSERT_EQ(RefcountWithFlags::kAccessRefMask, ref.getAccessRef());
   ASSERT_TRUE(ref.template isFlagSet());
   ASSERT_FALSE(ref.template isFlagSet());
@@ -128,7 +129,7 @@
   // Bumping down access ref shouldn't affect admin ref
and flags ASSERT_TRUE(ref.isInMMContainer()); ASSERT_FALSE(ref.isAccessible()); - ASSERT_FALSE(ref.isExclusive()); + ASSERT_FALSE(ref.isMoving()); ASSERT_EQ(0, ref.getAccessRef()); ASSERT_TRUE(ref.template isFlagSet()); ASSERT_FALSE(ref.template isFlagSet()); @@ -136,7 +137,7 @@ void RefCountTest::testBasic() { ref.template unSetFlag(); ASSERT_TRUE(ref.isInMMContainer()); ASSERT_FALSE(ref.isAccessible()); - ASSERT_FALSE(ref.isExclusive()); + ASSERT_FALSE(ref.isMoving()); ASSERT_EQ(0, ref.getAccessRef()); ASSERT_FALSE(ref.template isFlagSet()); ASSERT_FALSE(ref.template isFlagSet()); @@ -145,33 +146,119 @@ void RefCountTest::testBasic() { ASSERT_EQ(0, ref.getRaw()); ASSERT_FALSE(ref.isInMMContainer()); ASSERT_FALSE(ref.isAccessible()); - ASSERT_FALSE(ref.isExclusive()); + ASSERT_FALSE(ref.isMoving()); ASSERT_EQ(0, ref.getAccessRef()); ASSERT_FALSE(ref.template isFlagSet()); ASSERT_FALSE(ref.template isFlagSet()); // conditionally set flags - ASSERT_FALSE((ref.markExclusive())); + ASSERT_FALSE(ref.markMoving(true)); ref.markInMMContainer(); - ASSERT_TRUE((ref.markExclusive())); - ASSERT_FALSE((ref.isOnlyExclusive())); + // only first one succeeds + ASSERT_TRUE(ref.markMoving(true)); + ASSERT_FALSE(ref.markMoving(true)); ref.unmarkInMMContainer(); + ref.template setFlag(); - // Have no other admin refcount but with a flag still means "isOnlyExclusive" - ASSERT_TRUE((ref.isOnlyExclusive())); + // Have no other admin refcount but with a flag still means "isOnlyMoving" + ASSERT_TRUE((ref.isOnlyMoving())); - // Set some flags and verify that "isOnlyExclusive" does not care about flags + // Set some flags and verify that "isOnlyMoving" does not care about flags ref.markIsChainedItem(); ASSERT_TRUE(ref.isChainedItem()); - ASSERT_TRUE((ref.isOnlyExclusive())); + ASSERT_TRUE((ref.isOnlyMoving())); ref.unmarkIsChainedItem(); ASSERT_FALSE(ref.isChainedItem()); - ASSERT_TRUE((ref.isOnlyExclusive())); + ASSERT_TRUE((ref.isOnlyMoving())); +} + +void RefCountTest::testMarkForEvictionAndMoving() { + { + // cannot mark for eviction when not accessible or not in MMContainer + RefcountWithFlags ref; + ASSERT_FALSE(ref.markForEviction()); + + ref.markInMMContainer(); + ASSERT_FALSE(ref.markForEviction()); + ref.unmarkInMMContainer(); + + ref.markAccessible(); + ASSERT_FALSE(ref.markForEviction()); + } + + { + // can mark for eviction when accessible and in MMContainer + // and unmarkForEviction return value contains admin bits + RefcountWithFlags ref; + ref.markInMMContainer(); + ref.markAccessible(); + ASSERT_TRUE(ref.markForEviction()); + ASSERT_TRUE(ref.unmarkForEviction() > 0); + } + + { + // cannot mark for eviction when moving + RefcountWithFlags ref; + ref.markInMMContainer(); + ref.markAccessible(); + + ASSERT_TRUE(ref.markMoving(true)); + ASSERT_FALSE(ref.markForEviction()); + + ref.unmarkInMMContainer(); + ref.unmarkAccessible(); + auto ret = ref.unmarkMoving(); + ASSERT_EQ(ret, 0); + } + + { + // cannot mark moving when marked for eviction + RefcountWithFlags ref; + ref.markInMMContainer(); + ref.markAccessible(); + + ASSERT_TRUE(ref.markForEviction()); + ASSERT_FALSE(ref.markMoving(true)); + + ref.unmarkInMMContainer(); + ref.unmarkAccessible(); + auto ret = ref.unmarkForEviction(); + ASSERT_EQ(ret, 0); + } + + { + // can mark moving when ref count > 0 + RefcountWithFlags ref; + ref.markInMMContainer(); + ref.markAccessible(); + + ref.incRef(true); + + ASSERT_TRUE(ref.markMoving(false)); + + ref.unmarkInMMContainer(); + ref.unmarkAccessible(); + auto ret = ref.unmarkMoving(); + ASSERT_EQ(ret, 1); + 
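+    // unmarkMoving() reports the number of access refs still outstanding;
+    // the single incRef(true) above is still held, hence ret == 1.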
} + + { + // cannot mark for eviction when ref count > 0 + RefcountWithFlags ref; + ref.markInMMContainer(); + ref.markAccessible(); + + ref.incRef(true); + ASSERT_FALSE(ref.markForEviction()); + } } } // namespace TEST_F(RefCountTest, MutliThreaded) { testMultiThreaded(); } TEST_F(RefCountTest, Basic) { testBasic(); } +TEST_F(RefCountTest, MarkForEvictionAndMoving) { + testMarkForEvictionAndMoving(); +} } // namespace tests } // namespace cachelib } // namespace facebook diff --git a/cachelib/allocator/tests/TestBase-inl.h b/cachelib/allocator/tests/TestBase-inl.h index bf7355c87d..4d45e981bc 100644 --- a/cachelib/allocator/tests/TestBase-inl.h +++ b/cachelib/allocator/tests/TestBase-inl.h @@ -312,7 +312,7 @@ void AllocatorTest::testShmIsRemoved( ASSERT_FALSE(AllocatorT::ShmManager::segmentExists( config.getCacheDir(), detail::kShmHashTableName, config.usePosixShm)); ASSERT_FALSE(AllocatorT::ShmManager::segmentExists( - config.getCacheDir(), detail::kShmCacheName, config.usePosixShm)); + config.getCacheDir(), detail::kShmCacheName + std::to_string(0), config.usePosixShm)); ASSERT_FALSE(AllocatorT::ShmManager::segmentExists( config.getCacheDir(), detail::kShmChainedItemHashTableName, config.usePosixShm)); @@ -326,7 +326,7 @@ void AllocatorTest::testShmIsNotRemoved( ASSERT_TRUE(AllocatorT::ShmManager::segmentExists( config.getCacheDir(), detail::kShmHashTableName, config.usePosixShm)); ASSERT_TRUE(AllocatorT::ShmManager::segmentExists( - config.getCacheDir(), detail::kShmCacheName, config.usePosixShm)); + config.getCacheDir(), detail::kShmCacheName + std::to_string(0), config.usePosixShm)); ASSERT_TRUE(AllocatorT::ShmManager::segmentExists( config.getCacheDir(), detail::kShmChainedItemHashTableName, config.usePosixShm)); diff --git a/cachelib/benchmarks/PtrCompressionBench.cpp b/cachelib/benchmarks/PtrCompressionBench.cpp index aeaa2c3b11..5daefc146f 100644 --- a/cachelib/benchmarks/PtrCompressionBench.cpp +++ b/cachelib/benchmarks/PtrCompressionBench.cpp @@ -61,7 +61,8 @@ void buildAllocs(size_t poolSize) { void* alloc = ma->allocate(pid, size); XDCHECK_GE(size, CompressedPtr::getMinAllocSize()); if (alloc != nullptr) { - validAllocs.push_back({alloc, ma->compress(alloc)}); + validAllocs.push_back( + {alloc, ma->compress(alloc, false /* isMultiTiered */)}); validAllocsAlt.push_back({alloc, ma->compressAlt(alloc)}); numAllocations++; } @@ -83,7 +84,7 @@ BENCHMARK(CompressionAlt) { BENCHMARK_RELATIVE(Compression) { for (const auto& alloc : validAllocs) { - CompressedPtr c = m->compress(alloc.first); + CompressedPtr c = m->compress(alloc.first, false /* isMultiTiered */); folly::doNotOptimizeAway(c); } } @@ -97,7 +98,7 @@ BENCHMARK(DeCompressAlt) { BENCHMARK_RELATIVE(DeCompress) { for (const auto& alloc : validAllocs) { - void* ptr = m->unCompress(alloc.second); + void* ptr = m->unCompress(alloc.second, false /* isMultiTiered */); folly::doNotOptimizeAway(ptr); } } diff --git a/cachelib/cachebench/cache/Cache-inl.h b/cachelib/cachebench/cache/Cache-inl.h index ed8bfd1b04..a73763bcc4 100644 --- a/cachelib/cachebench/cache/Cache-inl.h +++ b/cachelib/cachebench/cache/Cache-inl.h @@ -46,6 +46,16 @@ Cache::Cache(const CacheConfig& config, config_.getRebalanceStrategy(), std::chrono::seconds(config_.poolRebalanceIntervalSec)); + allocatorConfig_.enableBackgroundEvictor( + config_.getBackgroundEvictorStrategy(), + std::chrono::milliseconds(config_.backgroundEvictorIntervalMilSec), + config_.evictorThreads); + + allocatorConfig_.enableBackgroundPromoter( + config_.getBackgroundPromoterStrategy(), + 
std::chrono::milliseconds(config_.backgroundPromoterIntervalMilSec),
+      config_.promoterThreads);
+
   if (config_.moveOnSlabRelease && movingSync != nullptr) {
     allocatorConfig_.enableMovingOnSlabRelease(
         [](Item& oldItem, Item& newItem, Item* parentPtr) {
@@ -98,6 +108,12 @@ Cache::Cache(const CacheConfig& config,
         }
       });

+  allocatorConfig_.maxEvictionBatch = config_.maxEvictionBatch;
+  allocatorConfig_.maxPromotionBatch = config_.maxPromotionBatch;
+  allocatorConfig_.minEvictionBatch = config_.minEvictionBatch;
+  allocatorConfig_.minPromotionBatch = config_.minPromotionBatch;
+  allocatorConfig_.maxEvictionPromotionHotness = config_.maxEvictionPromotionHotness;
+
   if (config_.enableItemDestructorCheck) {
     auto removeCB = [&](const typename Allocator::DestructorData& data) {
       if (!itemRecords_.validate(data)) {
@@ -620,10 +636,36 @@ Stats Cache::getStats() const {
     aggregate += poolStats;
   }

+  std::map<TierId, std::map<PoolId, std::map<ClassId, ACStats>>> allocationClassStats{};
+
+  for (size_t pid = 0; pid < pools_.size(); pid++) {
+    auto cids = cache_->getPoolStats(static_cast<PoolId>(pid)).getClassIds();
+    for (TierId tid = 0; tid < cache_->getNumTiers(); tid++) {
+      for (auto cid : cids)
+        allocationClassStats[tid][pid][cid] = cache_->getACStats(tid, pid, cid);
+    }
+  }
+
   const auto cacheStats = cache_->getGlobalCacheStats();
   const auto rebalanceStats = cache_->getSlabReleaseStats();
   const auto navyStats = cache_->getNvmCacheStatsMap().toMap();

+  ret.allocationClassStats = allocationClassStats;
+
+  ret.backgndEvicStats.nEvictedItems =
+      cacheStats.evictionStats.numMovedItems;
+  ret.backgndEvicStats.nTraversals =
+      cacheStats.evictionStats.runCount;
+  ret.backgndEvicStats.nClasses =
+      cacheStats.evictionStats.totalClasses;
+  ret.backgndEvicStats.evictionSize =
+      cacheStats.evictionStats.totalBytesMoved;
+
+  ret.backgndPromoStats.nPromotedItems =
+      cacheStats.promotionStats.numMovedItems;
+  ret.backgndPromoStats.nTraversals =
+      cacheStats.promotionStats.runCount;
+
   ret.numEvictions = aggregate.numEvictions();
   ret.numItems = aggregate.numItems();
   ret.evictAttempts = cacheStats.evictionAttempts;
@@ -677,6 +719,9 @@ Stats Cache::getStats() const {
     ret.nvmCounters = cache_->getNvmCacheStatsMap().toMap();
   }

+  ret.backgroundEvictionClasses = cache_->getBackgroundMoverClassStats(MoverDir::Evict);
+  ret.backgroundPromotionClasses = cache_->getBackgroundMoverClassStats(MoverDir::Promote);
+
   // nvm stats from navy
   if (!isRamOnly() && !navyStats.empty()) {
     auto lookup = [&navyStats](const std::string& key) {
diff --git a/cachelib/cachebench/cache/Cache.cpp b/cachelib/cachebench/cache/Cache.cpp
index 70feb0f764..ea9d6b106c 100644
--- a/cachelib/cachebench/cache/Cache.cpp
+++ b/cachelib/cachebench/cache/Cache.cpp
@@ -20,6 +20,12 @@
 DEFINE_bool(report_api_latency,
             false,
             "Enable reporting cache API latency tracking");

+DEFINE_string(
+    report_ac_memory_usage_stats,
+    "",
+    "Enable reporting statistics for each allocation class. Set to "
+    "'human_readable' to print KB/MB/GB or to 'raw' to print in bytes.");
+
 namespace facebook {
 namespace cachelib {
 namespace cachebench {} // namespace cachebench
diff --git a/cachelib/cachebench/cache/Cache.h b/cachelib/cachebench/cache/Cache.h
index a86dd06c49..a85c1efb66 100644
--- a/cachelib/cachebench/cache/Cache.h
+++ b/cachelib/cachebench/cache/Cache.h
@@ -44,6 +44,7 @@
 #include "cachelib/cachebench/util/NandWrites.h"

 DECLARE_bool(report_api_latency);
+DECLARE_string(report_ac_memory_usage_stats);

 namespace facebook {
 namespace cachelib {
@@ -324,6 +325,10 @@ class Cache {
   // return the stats for the pool.
   PoolStats getPoolStats(PoolId pid) const { return cache_->getPoolStats(pid); }

+  ACStats getACStats(TierId tid, PoolId pid, ClassId cid) const {
+    return cache_->getACStats(tid, pid, cid);
+  }
+
   // return the total number of inconsistent operations detected since start.
   unsigned int getInconsistencyCount() const {
     return inconsistencyCount_.load(std::memory_order_relaxed);
diff --git a/cachelib/cachebench/cache/CacheStats.h b/cachelib/cachebench/cache/CacheStats.h
index d6c9e53584..993f3845a3 100644
--- a/cachelib/cachebench/cache/CacheStats.h
+++ b/cachelib/cachebench/cache/CacheStats.h
@@ -21,11 +21,38 @@
 #include "cachelib/common/PercentileStats.h"

 DECLARE_bool(report_api_latency);
+DECLARE_string(report_ac_memory_usage_stats);

 namespace facebook {
 namespace cachelib {
 namespace cachebench {
+
+struct BackgroundEvictionStats {
+  // the number of items this worker evicted by looking at pools/classes stats
+  uint64_t nEvictedItems{0};
+
+  // the number of times the background eviction worker ran
+  uint64_t nTraversals{0};
+
+  // number of classes
+  uint64_t nClasses{0};
+
+  // size of evicted items
+  uint64_t evictionSize{0};
+};
+
+struct BackgroundPromotionStats {
+  // the number of items this worker promoted by looking at pools/classes stats
+  uint64_t nPromotedItems{0};
+
+  // the number of times the background promotion worker ran
+  uint64_t nTraversals{0};
+};
+
 struct Stats {
+  BackgroundEvictionStats backgndEvicStats;
+  BackgroundPromotionStats backgndPromoStats;
+
   uint64_t numEvictions{0};
   uint64_t numItems{0};
@@ -100,11 +127,16 @@ struct Stats {
   uint64_t invalidDestructorCount{0};
   int64_t unDestructedItemCount{0};

+  std::map<TierId, std::map<PoolId, std::map<ClassId, ACStats>>> allocationClassStats;
+
   // populate the counters related to nvm usage. Cache implementation can decide
   // what to populate since not all of those are interesting when running
   // cachebench.
   std::unordered_map nvmCounters;

+  std::map<TierId, std::map<PoolId, std::map<ClassId, uint64_t>>> backgroundEvictionClasses;
+  std::map<TierId, std::map<PoolId, std::map<ClassId, uint64_t>>> backgroundPromotionClasses;
+
   // errors from the nvm engine.
std::unordered_map nvmErrors; @@ -125,12 +157,80 @@ struct Stats { << std::endl; out << folly::sformat("RAM Evictions : {:,}", numEvictions) << std::endl; + auto foreachAC = [&](auto &map, auto cb) { + for (auto &tidStats : map) { + for (auto &pidStat : tidStats.second) { + for (auto &cidStat : pidStat.second) { + cb(tidStats.first, pidStat.first, cidStat.first, cidStat.second); + } + } + } + }; + for (auto pid = 0U; pid < poolUsageFraction.size(); pid++) { out << folly::sformat("Fraction of pool {:,} used : {:.2f}", pid, poolUsageFraction[pid]) << std::endl; } + if (FLAGS_report_ac_memory_usage_stats != "") { + auto formatMemory = [&](size_t bytes) -> std::tuple { + if (FLAGS_report_ac_memory_usage_stats == "raw") { + return {"B", bytes}; + } + + constexpr double KB = 1024.0; + constexpr double MB = 1024.0 * 1024; + constexpr double GB = 1024.0 * 1024 * 1024; + + if (bytes >= GB) { + return {"GB", static_cast(bytes) / GB}; + } else if (bytes >= MB) { + return {"MB", static_cast(bytes) / MB}; + } else if (bytes >= KB) { + return {"KB", static_cast(bytes) / KB}; + } else { + return {"B", bytes}; + } + }; + + auto foreachAC = [&](auto cb) { + for (auto& tidStat : allocationClassStats) { + for (auto& pidStat : tidStat.second) { + for (auto& cidStat : pidStat.second) { + cb(tidStat.first, pidStat.first, cidStat.first, cidStat.second); + } + } + } + }; + + + foreachAC([&](auto tid, auto pid, auto cid, auto stats) { + auto [allocSizeSuffix, allocSize] = formatMemory(stats.allocSize); + auto [memorySizeSuffix, memorySize] = + formatMemory(stats.totalAllocatedSize()); + + // If the pool is not full, extrapolate usageFraction for AC assuming it + // will grow at the same rate. This value will be the same for all ACs. + auto acUsageFraction = (poolUsageFraction[pid] < 1.0) + ? 
poolUsageFraction[pid] + : stats.usageFraction(); + + out << folly::sformat( + "tid{:2} pid{:2} cid{:4} {:8.2f}{} usageFraction: {:4.2f} " + "memorySize: {:8.2f}{} " + "rollingAvgAllocLatency: {:8.2f}ns", + tid, pid, cid, allocSize, allocSizeSuffix, acUsageFraction, + memorySize, memorySizeSuffix, + stats.allocLatencyNs.estimate()) + << std::endl; + }); + } + + out << folly::sformat("Tier 0 Background Evicted Items : {:,}", + backgndEvicStats.nEvictedItems) << std::endl; + out << folly::sformat("Tier 0 Background Traversals : {:,}", + backgndEvicStats.nTraversals) << std::endl; if (numCacheGets > 0) { out << folly::sformat("Cache Gets : {:,}", numCacheGets) << std::endl; out << folly::sformat("Hit Ratio : {:6.2f}%", overallHitRatio) @@ -161,6 +261,22 @@ struct Stats { } } + if (!backgroundEvictionClasses.empty() && backgndEvicStats.nEvictedItems > 0 ) { + out << "== Class Background Eviction Counters Map ==" << std::endl; + foreachAC(backgroundEvictionClasses, [&](auto tid, auto pid, auto cid, auto evicted){ + out << folly::sformat("tid{:2} pid{:2} cid{:4} evicted: {:4}", + tid, pid, cid, evicted) << std::endl; + }); + } + + if (!backgroundPromotionClasses.empty() && backgndPromoStats.nPromotedItems > 0) { + out << "== Class Background Promotion Counters Map ==" << std::endl; + foreachAC(backgroundPromotionClasses, [&](auto tid, auto pid, auto cid, auto promoted){ + out << folly::sformat("tid{:2} pid{:2} cid{:4} promoted: {:4}", + tid, pid, cid, promoted) << std::endl; + }); + } + if (numNvmGets > 0 || numNvmDeletes > 0 || numNvmPuts > 0) { const double ramHitRatio = invertPctFn(numCacheGetMiss, numCacheGets); const double nvmHitRatio = invertPctFn(numNvmGetMiss, numNvmGets); diff --git a/cachelib/cachebench/runner/AsyncCacheStressor.h b/cachelib/cachebench/runner/AsyncCacheStressor.h index 5b50db43b4..830b503795 100644 --- a/cachelib/cachebench/runner/AsyncCacheStressor.h +++ b/cachelib/cachebench/runner/AsyncCacheStressor.h @@ -287,6 +287,10 @@ class AsyncCacheStressor : public Stressor { ++stats.get; auto lock = chainedItemAcquireUniqueLock(*key); + // This was moved outside the lambda, as otherwise gcc-8.x crashes with an + // internal compiler error here (suspected regression in folly). 
+ XDCHECK(req->sizeBegin + 1 != req->sizeEnd); + auto onReadyFn = [&, req, key, l = std::move(lock), pid](auto hdl) { WriteHandle wHdl; if (hdl == nullptr) { @@ -303,7 +307,6 @@ class AsyncCacheStressor : public Stressor { } else { wHdl = std::move(hdl).toWriteHandle(); } - XDCHECK(req->sizeBegin + 1 != req->sizeEnd); bool chainSuccessful = false; for (auto j = req->sizeBegin + 1; j != req->sizeEnd; j++) { ++stats.addChained; diff --git a/cachelib/cachebench/test_configs/consistency/navy-multi-tier.json b/cachelib/cachebench/test_configs/consistency/navy-multi-tier.json new file mode 100644 index 0000000000..076550bc5c --- /dev/null +++ b/cachelib/cachebench/test_configs/consistency/navy-multi-tier.json @@ -0,0 +1,54 @@ +{ + "cache_config" : { + "cacheSizeMB" : 300, + "poolRebalanceIntervalSec" : 1, + "moveOnSlabRelease" : true, + + "cacheDir": "/tmp/mem-tier2", + "memoryTiers" : [ + { + "ratio": 1, + "memBindNodes": 0 + }, + { + "ratio": 1, + "memBindNodes": 0 + } + ], + + "numPools" : 2, + "poolSizes" : [0.5, 0.5], + "allocFactor" : 2.0, + "nvmCacheSizeMB" : 1024 + }, + "test_config" : + { + + "checkConsistency" : true, + + "numOps" : 60000, + "numThreads" : 20, + "numKeys" : 200000, + + + "keySizeRange" : [1, 8, 64], + "keySizeRangeProbability" : [0.5, 0.5], + + "valSizeRange" : [256, 1024, 4096, 8192], + "valSizeRangeProbability" : [0.2, 0.7, 0.1], + + "chainedItemLengthRange" : [1, 2, 4, 32], + "chainedItemLengthRangeProbability" : [0.8, 0.18, 0.02], + + "chainedItemValSizeRange" : [1, 128, 256, 1024, 4096, 20480], + "chainedItemValSizeRangeProbability" : [0.1, 0.1, 0.2, 0.3, 0.3], + + "getRatio" : 0.8, + "setRatio" : 0.1, + "delRatio" : 0.0, + "addChainedRatio" : 0.05, + "keyPoolDistribution": [0.5, 0.5], + "opPoolDistribution" : [0.5, 0.5] + } + +} diff --git a/cachelib/cachebench/test_configs/consistency/navy.json b/cachelib/cachebench/test_configs/consistency/navy.json index 73b016a50f..b95b056d31 100644 --- a/cachelib/cachebench/test_configs/consistency/navy.json +++ b/cachelib/cachebench/test_configs/consistency/navy.json @@ -14,8 +14,8 @@ "checkConsistency" : true, - "numOps" : 30000000, - "numThreads" : 40, + "numOps" : 600000, + "numThreads" : 20, "numKeys" : 200000, diff --git a/cachelib/cachebench/test_configs/hit_ratio/graph_cache_leader_fbobj/config-4GB-DRAM-4GB-PMEM.json b/cachelib/cachebench/test_configs/hit_ratio/graph_cache_leader_fbobj/config-4GB-DRAM-4GB-PMEM.json new file mode 100644 index 0000000000..d9acdf7c6c --- /dev/null +++ b/cachelib/cachebench/test_configs/hit_ratio/graph_cache_leader_fbobj/config-4GB-DRAM-4GB-PMEM.json @@ -0,0 +1,42 @@ +{ + "cache_config": { + "cacheSizeMB": 8192, + "poolRebalanceIntervalSec": 0, + "cacheDir": "/tmp/mem-tiers", + "memoryTiers" : [ + { + "ratio": 1, + "memBindNodes": 0 + }, + { + "ratio": 1, + "memBindNodes": 0 + } + ] + }, + "test_config": + { + "addChainedRatio": 0.0, + "delRatio": 0.0, + "enableLookaside": true, + "getRatio": 0.7684563460126871, + "keySizeRange": [ + 1, + 8, + 64 + ], + "keySizeRangeProbability": [ + 0.3, + 0.7 + ], + "loneGetRatio": 0.2315436539873129, + "numKeys": 71605574, + "numOps": 5000000, + "numThreads": 24, + "popDistFile": "pop.json", + + "setRatio": 0.0, + "valSizeDistFile": "sizes.json" + } + +} diff --git a/cachelib/cachebench/test_configs/hit_ratio/graph_cache_leader_fbobj/config-8GB-DRAM.json b/cachelib/cachebench/test_configs/hit_ratio/graph_cache_leader_fbobj/config-8GB-DRAM.json new file mode 100644 index 0000000000..6d47e08b74 --- /dev/null +++ 
b/cachelib/cachebench/test_configs/hit_ratio/graph_cache_leader_fbobj/config-8GB-DRAM.json @@ -0,0 +1,32 @@ +{ + "cache_config": { + "cacheSizeMB": 8192, + "poolRebalanceIntervalSec": 0, + "cacheDir": "/tmp/mem-tier" + }, + "test_config": + { + "addChainedRatio": 0.0, + "delRatio": 0.0, + "enableLookaside": true, + "getRatio": 0.7684563460126871, + "keySizeRange": [ + 1, + 8, + 64 + ], + "keySizeRangeProbability": [ + 0.3, + 0.7 + ], + "loneGetRatio": 0.2315436539873129, + "numKeys": 71605574, + "numOps": 5000000, + "numThreads": 24, + "popDistFile": "pop.json", + + "setRatio": 0.0, + "valSizeDistFile": "sizes.json" + } + +} diff --git a/cachelib/cachebench/test_configs/hit_ratio/graph_cache_leader_fbobj/config-8GB-PMEM.json b/cachelib/cachebench/test_configs/hit_ratio/graph_cache_leader_fbobj/config-8GB-PMEM.json new file mode 100644 index 0000000000..4feab55154 --- /dev/null +++ b/cachelib/cachebench/test_configs/hit_ratio/graph_cache_leader_fbobj/config-8GB-PMEM.json @@ -0,0 +1,38 @@ +{ + "cache_config": { + "cacheSizeMB": 8192, + "poolRebalanceIntervalSec": 0, + "cacheDir": "/tmp/mem-tier", + "memoryTiers" : [ + { + "ratio": 1, + "memBindNodes": 0 + } + ] + }, + "test_config": + { + "addChainedRatio": 0.0, + "delRatio": 0.0, + "enableLookaside": true, + "getRatio": 0.7684563460126871, + "keySizeRange": [ + 1, + 8, + 64 + ], + "keySizeRangeProbability": [ + 0.3, + 0.7 + ], + "loneGetRatio": 0.2315436539873129, + "numKeys": 71605574, + "numOps": 5000000, + "numThreads": 24, + "popDistFile": "pop.json", + + "setRatio": 0.0, + "valSizeDistFile": "sizes.json" + } + +} diff --git a/cachelib/cachebench/test_configs/simple_tiers_test.json b/cachelib/cachebench/test_configs/simple_tiers_test.json index 182bb514cb..58302b9f20 100644 --- a/cachelib/cachebench/test_configs/simple_tiers_test.json +++ b/cachelib/cachebench/test_configs/simple_tiers_test.json @@ -1,14 +1,18 @@ // @nolint instantiates a small cache and runs a quick run of basic operations. 
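+// Each "memoryTiers" entry is sized from cacheSizeMB in proportion to its
+// "ratio"; "memBindNodes" NUMA-binds that tier's shm segment (node 0 here).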
{ "cache_config" : { - "cacheSizeMB" : 512, - "usePosixShm" : false, + "cacheSizeMB" : 1024, "cacheDir" : "/tmp/mem-tiers", "memoryTiers" : [ + { + "ratio": 1, + "memBindNodes": "0" + }, { "ratio": 1, "memBindNodes": "0" } + ], "poolRebalanceIntervalSec" : 1, "moveOnSlabRelease" : false, @@ -19,7 +23,7 @@ "test_config" : { "numOps" : 100000, "numThreads" : 32, - "numKeys" : 1000000, + "numKeys" : 2000000, "keySizeRange" : [1, 8, 64], "keySizeRangeProbability" : [0.3, 0.7], @@ -33,4 +37,4 @@ "keyPoolDistribution": [0.4, 0.6], "opPoolDistribution" : [0.5, 0.5] } - } \ No newline at end of file + } diff --git a/cachelib/cachebench/util/CacheConfig.cpp b/cachelib/cachebench/util/CacheConfig.cpp index b9ba839218..fc37e5cc30 100644 --- a/cachelib/cachebench/util/CacheConfig.cpp +++ b/cachelib/cachebench/util/CacheConfig.cpp @@ -19,6 +19,8 @@ #include "cachelib/allocator/HitsPerSlabStrategy.h" #include "cachelib/allocator/LruTailAgeStrategy.h" #include "cachelib/allocator/RandomStrategy.h" +#include "cachelib/allocator/FreeThresholdStrategy.h" +#include "cachelib/allocator/PromotionStrategy.h" namespace facebook { namespace cachelib { @@ -28,6 +30,9 @@ CacheConfig::CacheConfig(const folly::dynamic& configJson) { JSONSetVal(configJson, cacheDir); JSONSetVal(configJson, cacheSizeMB); JSONSetVal(configJson, poolRebalanceIntervalSec); + JSONSetVal(configJson, backgroundEvictorIntervalMilSec); + JSONSetVal(configJson, backgroundPromoterIntervalMilSec); + JSONSetVal(configJson, backgroundEvictorStrategy); JSONSetVal(configJson, moveOnSlabRelease); JSONSetVal(configJson, rebalanceStrategy); JSONSetVal(configJson, rebalanceMinSlabs); @@ -101,10 +106,27 @@ CacheConfig::CacheConfig(const folly::dynamic& configJson) { JSONSetVal(configJson, nvmAdmissionRetentionTimeThreshold); JSONSetVal(configJson, customConfigJson); + + //Background related configs + JSONSetVal(configJson, lowEvictionAcWatermark); + JSONSetVal(configJson, highEvictionAcWatermark); + JSONSetVal(configJson, minAcAllocationWatermark); + JSONSetVal(configJson, maxAcAllocationWatermark); + JSONSetVal(configJson, numDuplicateElements); + JSONSetVal(configJson, syncPromotion); + JSONSetVal(configJson, evictorThreads); + JSONSetVal(configJson, promoterThreads); + JSONSetVal(configJson, promotionAcWatermark); + JSONSetVal(configJson, maxEvictionBatch); + JSONSetVal(configJson, maxPromotionBatch); + JSONSetVal(configJson, minEvictionBatch); + JSONSetVal(configJson, minPromotionBatch); + JSONSetVal(configJson, maxEvictionPromotionHotness); + // if you added new fields to the configuration, update the JSONSetVal // to make them available for the json configs and increment the size // below - checkCorrectSize(); + checkCorrectSize(); if (numPools != poolSizes.size()) { throw std::invalid_argument(folly::sformat( @@ -140,6 +162,20 @@ MemoryTierConfig::MemoryTierConfig(const folly::dynamic& configJson) { checkCorrectSize(); } + +std::shared_ptr CacheConfig::getBackgroundEvictorStrategy() const { + if (backgroundEvictorIntervalMilSec == 0) { + return nullptr; + } + return std::make_shared(lowEvictionAcWatermark, highEvictionAcWatermark, maxEvictionBatch, minEvictionBatch); +} + +std::shared_ptr CacheConfig::getBackgroundPromoterStrategy() const { + if (backgroundPromoterIntervalMilSec == 0) { + return nullptr; + } + return std::make_shared(promotionAcWatermark, maxPromotionBatch, minPromotionBatch); +} } // namespace cachebench } // namespace cachelib } // namespace facebook diff --git a/cachelib/cachebench/util/CacheConfig.h 
b/cachelib/cachebench/util/CacheConfig.h index a1b8f52011..a4713a1cc4 100644 --- a/cachelib/cachebench/util/CacheConfig.h +++ b/cachelib/cachebench/util/CacheConfig.h @@ -20,6 +20,7 @@ #include "cachelib/allocator/CacheAllocator.h" #include "cachelib/allocator/RebalanceStrategy.h" +#include "cachelib/allocator/BackgroundMoverStrategy.h" #include "cachelib/cachebench/util/JSONConfig.h" #include "cachelib/common/Ticker.h" #include "cachelib/navy/common/Device.h" @@ -71,7 +72,10 @@ struct CacheConfig : public JSONConfig { uint64_t cacheSizeMB{0}; uint64_t poolRebalanceIntervalSec{0}; + uint64_t backgroundEvictorIntervalMilSec{0}; + uint64_t backgroundPromoterIntervalMilSec{0}; std::string rebalanceStrategy; + std::string backgroundEvictorStrategy; uint64_t rebalanceMinSlabs{1}; double rebalanceDiffRatio{0.25}; bool moveOnSlabRelease{false}; @@ -92,7 +96,7 @@ struct CacheConfig : public JSONConfig { bool lruUpdateOnWrite{false}; bool lruUpdateOnRead{true}; bool tryLockUpdate{false}; - bool useCombinedLockForIterators{false}; + bool useCombinedLockForIterators{true}; // LRU param uint64_t lruIpSpec{0}; @@ -249,6 +253,27 @@ struct CacheConfig : public JSONConfig { // eviction-age is more than this threshold. 0 means no threshold uint32_t nvmAdmissionRetentionTimeThreshold{0}; + // See BackgroundMovers.md for complete description + double promotionAcWatermark{4.0}; + double lowEvictionAcWatermark{2.0}; + double highEvictionAcWatermark{5.0}; + double minAcAllocationWatermark{0.0}; + double maxAcAllocationWatermark{0.0}; + + double numDuplicateElements{0.0}; // inclusivness of the cache + double syncPromotion{0.0}; // can promotion be done synchronously in user thread + + uint64_t evictorThreads{1}; + uint64_t promoterThreads{1}; + + uint64_t maxEvictionBatch{40}; + uint64_t maxPromotionBatch{10}; + + uint64_t minEvictionBatch{5}; + uint64_t minPromotionBatch{5}; + + uint64_t maxEvictionPromotionHotness{60}; + // // Options below are not to be populated with JSON // @@ -284,6 +309,8 @@ struct CacheConfig : public JSONConfig { CacheConfig() {} std::shared_ptr getRebalanceStrategy() const; + std::shared_ptr getBackgroundEvictorStrategy() const; + std::shared_ptr getBackgroundPromoterStrategy() const; }; } // namespace cachebench } // namespace cachelib diff --git a/cachelib/cachebench/util/NandWrites.cpp b/cachelib/cachebench/util/NandWrites.cpp index ae82aca65c..370ddfa2b6 100644 --- a/cachelib/cachebench/util/NandWrites.cpp +++ b/cachelib/cachebench/util/NandWrites.cpp @@ -400,6 +400,7 @@ uint64_t nandWriteBytes(const folly::StringPiece& deviceName, const folly::StringPiece&)>> vendorMap{{"samsung", samsungWriteBytes}, {"mz1lb960hbjr-", samsungWriteBytes}, + {"mzol23t8hcls-", samsungWriteBytes}, // The Samsung PM983a doesn't include Samsung in the model // number at this time, but it's a Samsung device. 
{"liteon", liteonWriteBytes}, diff --git a/cachelib/cachebench/util/tests/NandWritesTest.cpp b/cachelib/cachebench/util/tests/NandWritesTest.cpp index 0002e8a837..af09593f41 100644 --- a/cachelib/cachebench/util/tests/NandWritesTest.cpp +++ b/cachelib/cachebench/util/tests/NandWritesTest.cpp @@ -240,6 +240,96 @@ TEST_F(NandWritesTest, nandWriteBytes_handlesSamsungPM983aDevice) { EXPECT_EQ(nandWriteBytes("nvme1n1", kNvmePath, mockFactory_), 35061362294784); } +TEST_F(NandWritesTest, nandWriteBytes_handlesSamsungPM9A3Device) { + constexpr auto& kListOutput = R"EOF({ + "Devices" : [ + { + "DevicePath" : "/dev/nvme0n1", + "Firmware" : "P1FB007", + "Index" : 0, + "NameSpace" : 1, + "ModelNumber" : "MTFDHBA512TCK", + "ProductName" : "Non-Volatile memory controller: Micron Technology Inc Device 0x5410", + "SerialNumber" : " 21062E6B8061", + "UsedBytes" : 512110190592, + "MaximumLBA" : 1000215216, + "PhysicalSize" : 512110190592, + "SectorSize" : 512 + }, + { + "DevicePath" : "/dev/nvme1n1", + "Firmware" : "GDA82F2Q", + "Index" : 1, + "NameSpace" : 1, + "ModelNumber" : "MZOL23T8HCLS-00AFB", + "ProductName" : "Unknown device", + "SerialNumber" : "S5X9NG0T116005", + "UsedBytes" : 104910848, + "MaximumLBA" : 918149526, + "PhysicalSize" : 3760740458496, + "SectorSize" : 4096 + }, + { + "DevicePath" : "/dev/nvme2n1", + "Firmware" : "GDA82F2Q", + "Index" : 2, + "NameSpace" : 1, + "ModelNumber" : "MZOL23T8HCLS-00AFB", + "ProductName" : "Unknown device", + "SerialNumber" : "S5X9NG0T116027", + "UsedBytes" : 0, + "MaximumLBA" : 918149526, + "PhysicalSize" : 3760740458496, + "SectorSize" : 4096 + } + ] +})EOF"; + + constexpr auto& kSmartLogOutput = R"EOF( +[015:000] PhysicallyWrittenBytes : 241393664 +[031:016] Physically Read Bytes : 106217472 +[037:032] Bad NAND Block Count (Raw Value) : 0 +[039:038] Bad NAND Block Count (Normalized Value) : 100 +[047:040] Uncorrectable Read Error Count : 0 +[055:048] Soft ECC Error Count : 0 +[059:056] SSD End to end Correction Count (Detected Errors) : 0 +[063:060] SSD End to end Correction Count (Corrected Errors): 0 +[064:064] System Data Percentage Used : 0 +[068:065] User Data Erase Count (Min) : 0 +[072:069] User Data Erase Count (Max) : 1 +[080:073] Refresh Count : 0 +[086:081] Program Fail Count (Raw Value) : 0 +[088:087] Program Fail Count (Normalized Value) : 100 +[094:089] User Data Erase Fail Count (Raw Value) : 0 +[096:095] User Data Erase Fail Count (Normalized Value) : 100 +[102:097] System Area Erase Fail Count (Raw Value) : 0 +[104:103] System Area Erase Fail Count (Normalized value) : 100 +[105:105] Thermal Throttling Status : 0 +[106:106] Thermal Throttling Count : 0 +[108:107] PHY Error Count : 0 +[110:109] Bad DLLP Count : 0 +[112:111] Bad TLP Count : 0 +[114:113] Reserved : 0 +[118:115] Incomplete Shutdowns : 0 +[119:119] % Free Blocks : 96 +[121:120] PCIe Correctable Error Count (RTS) : 0 +[123:122] PCIe Correctable Error Count (RRS) : 0 +[131:124] XOR Recovery Count : 0 +[137:132] Bad System NAND block count (Raw Value) : 0 +[139:138] Bad System NAND block count (Normalized Value) : 100 +[141:140] Capacitor Health : 163 +[157:142] Endurance Estimate : 28862181 +[165:158] Security Version Number : 4294967296 +[167:166] Log Page Version : 1 +)EOF"; + + mockFactory_->expectedCommands( + {{{kNvmePath, "list", "-o", "json"}, kListOutput}, + {{kNvmePath, "samsung", "vs-smart-add-log", "/dev/nvme1n1"}, + kSmartLogOutput}}); + EXPECT_EQ(nandWriteBytes("nvme1n1", kNvmePath, mockFactory_), 241393664); +} + TEST_F(NandWritesTest, 
nandWriteBytes_handlesSeagateDevice) {
   constexpr auto& kListOutput = R"EOF({
   "Devices" : [
diff --git a/cachelib/cachebench/workload/KVReplayGenerator.h b/cachelib/cachebench/workload/KVReplayGenerator.h
index 4b12970081..a9124e2bd7 100644
--- a/cachelib/cachebench/workload/KVReplayGenerator.h
+++ b/cachelib/cachebench/workload/KVReplayGenerator.h
@@ -230,10 +230,10 @@ inline bool KVReplayGenerator::parseRequest(const std::string& line,

   // Set op
   const auto& op = fields[SampleFields::OP];
-  // TODO only memcache optypes are supported
-  if (!op.compare("GET")) {
+  // TODO implement GET_LEASE and SET_LEASE emulations
+  if (!op.compare("GET") || !op.compare("GET_LEASE")) {
     req->req_.setOp(OpType::kGet);
-  } else if (!op.compare("SET")) {
+  } else if (!op.compare("SET") || !op.compare("SET_LEASE")) {
     req->req_.setOp(OpType::kSet);
   } else if (!op.compare("DELETE")) {
     req->req_.setOp(OpType::kDel);
diff --git a/cachelib/cachebench/workload/tests/KVReplayGeneratorTest.cpp b/cachelib/cachebench/workload/tests/KVReplayGeneratorTest.cpp
index 72a55a4020..16e4e52060 100644
--- a/cachelib/cachebench/workload/tests/KVReplayGeneratorTest.cpp
+++ b/cachelib/cachebench/workload/tests/KVReplayGeneratorTest.cpp
@@ -56,6 +56,18 @@ struct TraceEntry {
     size_t expKeySize = std::max(keySize_, reqKey.size());
     expKeySize = std::min(expKeySize, 256);
     ASSERT_EQ(reqKey.size(), expKeySize);
+    ASSERT_EQ(req.req_.getOp(), getOpType());
+  }
+
+  OpType getOpType() {
+    if (!op_.compare("GET") || !op_.compare("GET_LEASE")) {
+      return OpType::kGet;
+    } else if (!op_.compare("SET") || !op_.compare("SET_LEASE")) {
+      return OpType::kSet;
+    } else if (!op_.compare("DELETE")) {
+      return OpType::kDel;
+    }
+    return OpType::kSize;
   }

   std::string key_;
@@ -86,8 +98,11 @@ TEST(KVReplayGeneratorTest, BasicFormat) {
       // <key size>, <op>, <size>, <op count>, <ttl>, <expect valid>
       {7, "GET", 0, 2, std::nullopt, true},
       {7, "GET", 0, 2, 50, true},
+      {7, "GET_LEASE", 0, 2, 50, true},
       {20, "SET", 100, 35, std::nullopt, true},
       {20, "SET", 100, 35, 3600, true},
+      {20, "SAT", 100, 35, 3600, false}, // invalid op name
+      {20, "SET_LEASE", 100, 35, 3600, true},
       {7, "GET", 0, 0, std::nullopt, false}, // invalid op count
       {7, "GET", 0, 0, 600, false},          // invalid op count
       {1024, "SET", 100, 35, 300, true},     // key truncated
diff --git a/cachelib/common/RollingStats.h b/cachelib/common/RollingStats.h
new file mode 100644
index 0000000000..4d179681ad
--- /dev/null
+++ b/cachelib/common/RollingStats.h
@@ -0,0 +1,90 @@
+/*
+ * Copyright (c) Facebook, Inc. and its affiliates.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <chrono>
+
+#include <limits>
+
+#include "cachelib/common/Utils.h"
+
+namespace facebook {
+namespace cachelib {
+namespace util {
+
+class RollingStats {
+ public:
+  // track a latency by taking the duration's value directly.
+  void trackValue(double value) {
+    // This is the highly unlikely scenario where cnt_ reaches its
+    // numerical limit: reset the counter and skip this update.
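+    // In the normal path below, the rolling mean is updated incrementally:
+    //   avg_new = avg_old * cnt / (cnt + 1) + value / (cnt + 1)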
+    if (cnt_ == std::numeric_limits<uint64_t>::max()) {
+      cnt_ = 0;
+      return;
+    }
+    auto ratio = static_cast<double>(cnt_) / (cnt_ + 1);
+    avg_ *= ratio;
+    ++cnt_;
+    avg_ += value / cnt_;
+  }
+
+  // Return the rolling average.
+  double estimate() { return avg_; }
+
+ private:
+  double avg_{0};
+  uint64_t cnt_{0};
+};
+
+class RollingLatencyTracker {
+ public:
+  explicit RollingLatencyTracker(RollingStats& stats)
+      : stats_(&stats), begin_(std::chrono::steady_clock::now()) {}
+  RollingLatencyTracker() {}
+  ~RollingLatencyTracker() {
+    if (stats_) {
+      auto tp = std::chrono::steady_clock::now();
+      auto diffNanos =
+          std::chrono::duration_cast<std::chrono::nanoseconds>(tp - begin_)
+              .count();
+      stats_->trackValue(static_cast<double>(diffNanos));
+    }
+  }
+
+  RollingLatencyTracker(const RollingLatencyTracker&) = delete;
+  RollingLatencyTracker& operator=(const RollingLatencyTracker&) = delete;
+
+  RollingLatencyTracker(RollingLatencyTracker&& rhs) noexcept
+      : stats_(rhs.stats_), begin_(rhs.begin_) {
+    rhs.stats_ = nullptr;
+  }
+
+  RollingLatencyTracker& operator=(RollingLatencyTracker&& rhs) noexcept {
+    if (this != &rhs) {
+      this->~RollingLatencyTracker();
+      new (this) RollingLatencyTracker(std::move(rhs));
+    }
+    return *this;
+  }
+
+ private:
+  RollingStats* stats_{nullptr};
+  std::chrono::time_point<std::chrono::steady_clock> begin_;
+};
+} // namespace util
+} // namespace cachelib
+} // namespace facebook
diff --git a/cachelib/experimental/objcache2/ObjectCache-inl.h b/cachelib/experimental/objcache2/ObjectCache-inl.h
index 345a27d528..9f1b91631d 100644
--- a/cachelib/experimental/objcache2/ObjectCache-inl.h
+++ b/cachelib/experimental/objcache2/ObjectCache-inl.h
@@ -128,8 +128,8 @@ std::shared_ptr ObjectCache::find(folly::StringPiece key) {

   succL1Lookups_.inc();

   auto ptr = found->template getMemoryAs()->objectPtr;
-  // Just release the handle. Cache destorys object when all handles released.
-  auto deleter = [h = std::move(found)](const T*) {};
+  // Use custom deleter
+  auto deleter = Deleter(std::move(found));
   return std::shared_ptr(reinterpret_cast(ptr), std::move(deleter));
 }

@@ -146,19 +146,20 @@ std::shared_ptr ObjectCache::findToWrite(

   succL1Lookups_.inc();

   auto ptr = found->template getMemoryAs()->objectPtr;
-  // Just release the handle. Cache destorys object when all handles released.
diff --git a/cachelib/experimental/objcache2/ObjectCache-inl.h b/cachelib/experimental/objcache2/ObjectCache-inl.h
index 345a27d528..9f1b91631d 100644
--- a/cachelib/experimental/objcache2/ObjectCache-inl.h
+++ b/cachelib/experimental/objcache2/ObjectCache-inl.h
@@ -128,8 +128,8 @@ std::shared_ptr<const T> ObjectCache<AllocatorT>::find(folly::StringPiece key) {
   succL1Lookups_.inc();
 
   auto ptr = found->template getMemoryAs<ObjectCacheItem>()->objectPtr;
-  // Just release the handle. Cache destorys object when all handles released.
-  auto deleter = [h = std::move(found)](const T*) {};
+  // Use custom deleter
+  auto deleter = Deleter<const T>(std::move(found));
   return std::shared_ptr<const T>(reinterpret_cast<const T*>(ptr),
                                   std::move(deleter));
 }
@@ -146,19 +146,20 @@ std::shared_ptr<T> ObjectCache<AllocatorT>::findToWrite(
   succL1Lookups_.inc();
 
   auto ptr = found->template getMemoryAs<ObjectCacheItem>()->objectPtr;
-  // Just release the handle. Cache destorys object when all handles released.
-  auto deleter = [h = std::move(found)](T*) {};
+  // Use custom deleter
+  auto deleter = Deleter<T>(std::move(found));
   return std::shared_ptr<T>(reinterpret_cast<T*>(ptr), std::move(deleter));
 }
 
 template <typename AllocatorT>
 template <typename T>
-std::pair<typename ObjectCache<AllocatorT>::AllocStatus, std::shared_ptr<T>>
+std::tuple<typename ObjectCache<AllocatorT>::AllocStatus,
+           std::shared_ptr<T>,
+           std::shared_ptr<T>>
 ObjectCache<AllocatorT>::insertOrReplace(folly::StringPiece key,
                                          std::unique_ptr<T> object,
                                          size_t objectSize,
-                                         uint32_t ttlSecs,
-                                         std::shared_ptr<T>* replacedPtr) {
+                                         uint32_t ttlSecs) {
   if (config_.objectSizeTrackingEnabled && objectSize == 0) {
     throw std::invalid_argument(
         "Object size tracking is enabled but object size is set to be 0.");
@@ -176,7 +177,8 @@ ObjectCache<AllocatorT>::insertOrReplace(folly::StringPiece key,
       allocateFromL1(key, ttlSecs, 0 /* use current time as creationTime */);
   if (!handle) {
     insertErrors_.inc();
-    return {AllocStatus::kAllocError, std::shared_ptr<T>(std::move(object))};
+    return {AllocStatus::kAllocError, std::shared_ptr<T>(std::move(object)),
+            nullptr};
   }
   // We don't release the object here because insertOrReplace could throw when
   // the replaced item is out of refcount; in this case, the object isn't
@@ -187,21 +189,17 @@ ObjectCache<AllocatorT>::insertOrReplace(folly::StringPiece key,
 
   auto replaced = this->l1Cache_->insertOrReplace(handle);
 
+  std::shared_ptr<T> replacedPtr = nullptr;
   if (replaced) {
     replaces_.inc();
-    if (replacedPtr) {
-      auto itemPtr = reinterpret_cast<ObjectCacheItem*>(replaced->getMemory());
-      // Just release the handle. Cache destorys object when all handles
-      // released.
-      auto deleter = [h = std::move(replaced)](T*) {};
-      *replacedPtr = std::shared_ptr<T>(
-          reinterpret_cast<T*>(itemPtr->objectPtr), std::move(deleter));
-    }
+    auto itemPtr = reinterpret_cast<ObjectCacheItem*>(replaced->getMemory());
+    // Just release the handle. Cache destroys object when all handles
+    // released.
+    auto deleter = [h = std::move(replaced)](T*) {};
+    replacedPtr = std::shared_ptr<T>(reinterpret_cast<T*>(itemPtr->objectPtr),
+                                     std::move(deleter));
   }
 
-  // Just release the handle. Cache destorys object when all handles released.
-  auto deleter = [h = std::move(handle)](T*) {};
-
   // update total object size
   if (config_.objectSizeTrackingEnabled) {
     totalObjectSizeBytes_.fetch_add(objectSize, std::memory_order_relaxed);
@@ -209,7 +207,11 @@ ObjectCache<AllocatorT>::insertOrReplace(folly::StringPiece key,
 
   // Release the object as it has been successfully inserted to the cache.
   object.release();
-  return {AllocStatus::kSuccess, std::shared_ptr<T>(ptr, std::move(deleter))};
+
+  // Use custom deleter
+  auto deleter = Deleter<T>(std::move(handle));
+  return {AllocStatus::kSuccess, std::shared_ptr<T>(ptr, std::move(deleter)),
+          replacedPtr};
 }
 
 template <typename AllocatorT>
@@ -254,8 +256,8 @@ ObjectCache<AllocatorT>::insert(folly::StringPiece key,
     object.release();
   }
 
-  // Just release the handle. Cache destorys object when all handles released.
-  auto deleter = [h = std::move(handle)](T*) {};
+  // Use custom deleter
+  auto deleter = Deleter<T>(std::move(handle));
   return {success ? AllocStatus::kSuccess : AllocStatus::kKeyAlreadyExists,
           std::shared_ptr<T>(ptr, std::move(deleter))};
 }
diff --git a/cachelib/experimental/objcache2/ObjectCache.h b/cachelib/experimental/objcache2/ObjectCache.h
index 85abac068e..1253213284 100644
--- a/cachelib/experimental/objcache2/ObjectCache.h
+++ b/cachelib/experimental/objcache2/ObjectCache.h
@@ -94,6 +94,43 @@ class ObjectCache : public ObjectCacheBase<AllocatorT> {
   // make constructor private, but constructable by std::make_unique
   struct InternalConstructor {};
 
+  template <typename T>
+  class Deleter {
+   public:
+    using ReadHandle = typename AllocatorT::ReadHandle;
+    using WriteHandle = typename AllocatorT::WriteHandle;
+    using Handle = std::variant<ReadHandle, WriteHandle>;
+
+    explicit Deleter(typename AllocatorT::ReadHandle&& hdl)
+        : hdl_(std::move(hdl)) {}
+    explicit Deleter(typename AllocatorT::WriteHandle&& hdl)
+        : hdl_(std::move(hdl)) {}
+
+    void operator()(T*) {
+      // Just release the handle.
+      // Cache destroys object when all handles released.
+      std::holds_alternative<ReadHandle>(hdl_)
+          ? std::get<ReadHandle>(hdl_).reset()
+          : std::get<WriteHandle>(hdl_).reset();
+    }
+
+    WriteHandle& getWriteHandleRef() {
+      if (std::holds_alternative<ReadHandle>(hdl_)) {
+        hdl_ = std::move(std::get<ReadHandle>(hdl_)).toWriteHandle();
+      }
+      return std::get<WriteHandle>(hdl_);
+    }
+
+    ReadHandle& getReadHandleRef() {
+      return std::holds_alternative<ReadHandle>(hdl_)
+                 ? std::get<ReadHandle>(hdl_)
+                 : std::get<WriteHandle>(hdl_);
+    }
+
+   private:
+    Handle hdl_;
+  };
+
  public:
   using ItemDestructor = std::function<void(ObjectCacheDestructorData)>;
   using Key = KAllocation::Key;
@@ -146,22 +183,20 @@ class ObjectCache : public ObjectCacheBase<AllocatorT> {
   //                    if objectSizeTracking is enabled, a non-zero value must
   //                    be passed.
   // @param ttlSecs     object expiring seconds.
-  // @param replacedPtr a pointer to a shared_ptr, if it is not nullptr it will
-  //                    be assigned to the replaced object.
   //
   // @throw cachelib::exception::RefcountOverflow if the item we are replacing
   //        is already out of refcounts.
   // @throw std::invalid_argument if objectSizeTracking is enabled but
   //        objectSize is 0.
-  // @return a pair of allocation status and shared_ptr of newly inserted
-  //         object.
+  // @return a tuple of allocation status, shared_ptr of newly inserted
+  //         object and shared_ptr of old object that has been replaced
+  //         (nullptr if no replacement happened)
   template <typename T>
-  std::pair<AllocStatus, std::shared_ptr<T>> insertOrReplace(
-      folly::StringPiece key,
-      std::unique_ptr<T> object,
-      size_t objectSize = 0,
-      uint32_t ttlSecs = 0,
-      std::shared_ptr<T>* replacedPtr = nullptr);
+  std::tuple<AllocStatus, std::shared_ptr<T>, std::shared_ptr<T>>
+  insertOrReplace(folly::StringPiece key,
+                  std::unique_ptr<T> object,
+                  size_t objectSize = 0,
+                  uint32_t ttlSecs = 0);
 
   // Insert the object into the cache with given key. If the key exists in the
   // cache, the new object won't be inserted.
@@ -232,6 +267,63 @@ class ObjectCache : public ObjectCacheBase<AllocatorT> {
                : sizeController_->getCurrentEntriesLimit();
   }
 
+  // Get the expiry timestamp of the object
+  // @param object   object shared pointer returned from ObjectCache APIs
+  //
+  // @return the expiry timestamp in seconds of the object
+  //         0 if object is nullptr
+  template <typename T>
+  uint32_t getExpiryTimeSec(const std::shared_ptr<T>& object) const {
+    if (object == nullptr) {
+      return 0;
+    }
+    return getReadHandleRefInternal<T>(object)->getExpiryTime();
+  }
+
+  // Get the configured TTL of the object
+  // @param object   object shared pointer returned from ObjectCache APIs
+  //
+  // @return the configured TTL in seconds of the object
+  //         0 if object is nullptr
+  template <typename T>
+  std::chrono::seconds getConfiguredTtl(
+      const std::shared_ptr<T>& object) const {
+    if (object == nullptr) {
+      return std::chrono::seconds{0};
+    }
+    return getReadHandleRefInternal<T>(object)->getConfiguredTTL();
+  }
+
+  // Update the expiry timestamp of an object
+  //
+  // @param object          object shared pointer returned from ObjectCache APIs
+  // @param expiryTimeSecs  the expiryTime in seconds to update
+  //
+  // @return boolean indicating whether expiry time was successfully updated
+  template <typename T>
+  bool updateExpiryTimeSec(std::shared_ptr<T>& object,
+                           uint32_t expiryTimeSecs) {
+    if (object == nullptr) {
+      return false;
+    }
+    return getWriteHandleRefInternal<T>(object)->updateExpiryTime(
+        expiryTimeSecs);
+  }
+
+  // Update expiry time to @ttl seconds from now.
+  //
+  // @param object  object shared pointer returned from ObjectCache APIs
+  // @param ttl     TTL in seconds (from now)
+  //
+  // @return boolean indicating whether TTL was successfully extended
+  template <typename T>
+  bool extendTtl(std::shared_ptr<T>& object, std::chrono::seconds ttl) {
+    if (object == nullptr) {
+      return false;
+    }
+    return getWriteHandleRefInternal<T>(object)->extendTTL(ttl);
+  }
+
  protected:
   // Serialize cache allocator config for exporting to Scuba
   std::map<std::string, std::string> serializeConfigParams() const override;
@@ -272,6 +364,28 @@ class ObjectCache : public ObjectCacheBase<AllocatorT> {
   bool stopSizeController(std::chrono::seconds timeout = std::chrono::seconds{
                               0});
 
+  // Get a ReadHandle reference from the object shared_ptr
+  template <typename T>
+  typename AllocatorT::ReadHandle& getReadHandleRefInternal(
+      const std::shared_ptr<T>& object) const {
+    auto* deleter = std::get_deleter<Deleter<T>>(object);
+    XDCHECK(deleter != nullptr);
+    auto& hdl = deleter->getReadHandleRef();
+    XDCHECK(hdl != nullptr);
+    return hdl;
+  }
+
+  // Get a WriteHandle reference from the object shared_ptr
+  template <typename T>
+  typename AllocatorT::WriteHandle& getWriteHandleRefInternal(
+      std::shared_ptr<T>& object) {
+    auto* deleter = std::get_deleter<Deleter<T>>(object);
+    XDCHECK(deleter != nullptr);
+    auto& hdl = deleter->getWriteHandleRef();
+    XDCHECK(hdl != nullptr);
+    return hdl;
+  }
+
   // Config passed to the cache.
   Config config_{};
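Call sites now destructure a three-element tuple instead of passing an out-parameter. The following runnable toy (a map-backed stand-in, not the real ObjectCache) demonstrates just the return-value protocol that the updated tests below exercise:

```cpp
// Toy model of the new insertOrReplace() return shape:
// {status, newly inserted object, replaced object (nullptr if none)}.
#include <cassert>
#include <memory>
#include <string>
#include <tuple>
#include <unordered_map>

enum class AllocStatus { kSuccess, kKeyAlreadyExists, kAllocError };

struct Foo { int a{0}; };

class ToyCache {
 public:
  std::tuple<AllocStatus, std::shared_ptr<Foo>, std::shared_ptr<Foo>>
  insertOrReplace(const std::string& key, std::unique_ptr<Foo> obj) {
    // For a new key, map_[key] default-constructs a null shared_ptr,
    // so `replaced` stays nullptr; otherwise the old object comes back.
    std::shared_ptr<Foo> replaced = std::move(map_[key]);
    map_[key] = std::shared_ptr<Foo>(std::move(obj));
    return {AllocStatus::kSuccess, map_[key], replaced};
  }

 private:
  std::unordered_map<std::string, std::shared_ptr<Foo>> map_;
};

int main() {
  ToyCache cache;
  auto [s1, p1, r1] = cache.insertOrReplace("k", std::make_unique<Foo>());
  assert(s1 == AllocStatus::kSuccess && p1 != nullptr && r1 == nullptr);
  auto [s2, p2, r2] = cache.insertOrReplace("k", std::make_unique<Foo>());
  assert(r2 != nullptr); // the first object comes back as the replaced one
}
```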
diff --git a/cachelib/experimental/objcache2/persistence/Serialization.h b/cachelib/experimental/objcache2/persistence/Serialization.h
index cccb414b45..4edad88e4b 100644
--- a/cachelib/experimental/objcache2/persistence/Serialization.h
+++ b/cachelib/experimental/objcache2/persistence/Serialization.h
@@ -68,9 +68,9 @@ struct ObjectDeserializer {
     Deserializer deserializer{reinterpret_cast<const uint8_t*>(payload.begin()),
                               reinterpret_cast<const uint8_t*>(payload.end())};
     auto ptr = std::make_unique<T>(deserializer.deserialize<T>());
-    auto [allocStatus, _] =
+    auto res =
         objCache_.insertOrReplace(key, std::move(ptr), objectSize, ttlSecs);
-    return allocStatus == ObjectCache::AllocStatus::kSuccess;
+    return std::get<0>(res) == ObjectCache::AllocStatus::kSuccess;
   }
 
   // cache key of the object to be deserialized
diff --git a/cachelib/experimental/objcache2/tests/ObjectCacheTest.cpp b/cachelib/experimental/objcache2/tests/ObjectCacheTest.cpp
index 0bcc120de9..701654cd1c 100644
--- a/cachelib/experimental/objcache2/tests/ObjectCacheTest.cpp
+++ b/cachelib/experimental/objcache2/tests/ObjectCacheTest.cpp
@@ -206,12 +206,12 @@ class ObjectCacheTest : public ::testing::Test {
     foo->a = 1;
     foo->b = 2;
     foo->c = 3;
-    auto res = objcache->insertOrReplace("Foo", std::move(foo));
-    EXPECT_EQ(ObjectCache::AllocStatus::kSuccess, res.first);
-    ASSERT_NE(nullptr, res.second);
-    EXPECT_EQ(1, res.second->a);
-    EXPECT_EQ(2, res.second->b);
-    EXPECT_EQ(3, res.second->c);
+    auto [allocRes, ptr, _] = objcache->insertOrReplace("Foo", std::move(foo));
+    EXPECT_EQ(ObjectCache::AllocStatus::kSuccess, allocRes);
+    ASSERT_NE(nullptr, ptr);
+    EXPECT_EQ(1, ptr->a);
+    EXPECT_EQ(2, ptr->b);
+    EXPECT_EQ(3, ptr->c);
 
     auto found2 = objcache->template find<Foo>("Foo");
     ASSERT_NE(nullptr, found2);
@@ -238,7 +238,7 @@ class ObjectCacheTest : public ::testing::Test {
     foo->b = 2;
     foo->c = 3;
     auto res1 = objcache->insertOrReplace("Foo", std::move(foo));
-    EXPECT_EQ(ObjectCache::AllocStatus::kSuccess, res1.first);
+    EXPECT_EQ(ObjectCache::AllocStatus::kSuccess, std::get<0>(res1));
 
     auto found1 = objcache->template find<Foo>("Foo");
     ASSERT_NE(nullptr, found1);
@@ -251,7 +251,7 @@ class ObjectCacheTest : public ::testing::Test {
     foo2->e = 5;
     foo2->f = 6;
     auto res2 = objcache->insertOrReplace("Foo2", std::move(foo2));
-    EXPECT_EQ(ObjectCache::AllocStatus::kSuccess, res2.first);
+    EXPECT_EQ(ObjectCache::AllocStatus::kSuccess, std::get<0>(res2));
 
     auto found2 = objcache->template find<Foo2>("Foo2");
     ASSERT_NE(nullptr, found2);
@@ -272,7 +272,7 @@ class ObjectCacheTest : public ::testing::Test {
     foo4->b = 2;
     foo4->c = 3;
     auto res1 = objcache->insertOrReplace("Foo4", std::move(foo4));
-    EXPECT_EQ(ObjectCache::AllocStatus::kSuccess, res1.first);
+    EXPECT_EQ(ObjectCache::AllocStatus::kSuccess, std::get<0>(res1));
 
     auto found1 = objcache->template find<Foo>("Foo4");
     ASSERT_NE(nullptr, found1);
@@ -285,7 +285,7 @@ class ObjectCacheTest : public ::testing::Test {
     foo5->e = 5;
     foo5->f = 6;
     auto res2 = objcache->insertOrReplace("Foo5", std::move(foo5));
-    EXPECT_EQ(ObjectCache::AllocStatus::kSuccess, res2.first);
+    EXPECT_EQ(ObjectCache::AllocStatus::kSuccess, std::get<0>(res2));
 
     auto found2 = objcache->template find<Foo2>("Foo5");
     ASSERT_NE(nullptr, found2);
@@ -385,11 +385,14 @@ class ObjectCacheTest : public ::testing::Test {
     foo1->a = 1;
     foo1->b = 2;
     foo1->c = 3;
-    std::shared_ptr<Foo> replaced;
-    auto res =
-        objcache->insertOrReplace("Foo", std::move(foo1), 0, 0, &replaced);
-    EXPECT_EQ(ObjectCache::AllocStatus::kSuccess, res.first);
-    EXPECT_EQ(nullptr, replaced);
+
+    auto [res1, ptr1, replaced1] =
+        objcache->insertOrReplace("Foo", std::move(foo1));
+    EXPECT_EQ(ObjectCache::AllocStatus::kSuccess, res1);
+    EXPECT_EQ(1, ptr1->a);
+    EXPECT_EQ(2, ptr1->b);
+    EXPECT_EQ(3, ptr1->c);
+    EXPECT_EQ(nullptr, replaced1);
 
     auto found1 = objcache->template find<Foo>("Foo");
     ASSERT_NE(nullptr, found1);
@@ -401,12 +404,16 @@ class ObjectCacheTest : public ::testing::Test {
     foo2->a = 10;
     foo2->b = 20;
     foo2->c = 30;
-    res = objcache->insertOrReplace("Foo", std::move(foo2), 0, 0, &replaced);
-    EXPECT_EQ(ObjectCache::AllocStatus::kSuccess, res.first);
-    ASSERT_NE(nullptr, replaced);
-    EXPECT_EQ(1, replaced->a);
-    EXPECT_EQ(2, replaced->b);
-    EXPECT_EQ(3, replaced->c);
+    auto [res2, ptr2, replaced2] =
+        objcache->insertOrReplace("Foo", std::move(foo2));
+    EXPECT_EQ(ObjectCache::AllocStatus::kSuccess, res2);
+    EXPECT_EQ(10, ptr2->a);
+    EXPECT_EQ(20, ptr2->b);
+    EXPECT_EQ(30, ptr2->c);
+    ASSERT_NE(nullptr, replaced2);
+    EXPECT_EQ(1, replaced2->a);
+    EXPECT_EQ(2, replaced2->b);
+    EXPECT_EQ(3, replaced2->c);
 
     auto found2 = objcache->template find<Foo>("Foo");
     ASSERT_NE(nullptr, found2);
@@ -497,7 +504,7 @@ class ObjectCacheTest : public ::testing::Test {
     // replace foo1 with foo2
     {
       auto res = objcache->insertOrReplace("Foo", std::move(foo2), foo2Size);
-      ASSERT_EQ(ObjectCache::AllocStatus::kSuccess, res.first);
+      ASSERT_EQ(ObjectCache::AllocStatus::kSuccess, std::get<0>(res));
 
       auto found = objcache->template find<Foo>("Foo");
       ASSERT_NE(nullptr, found);
@@ -879,6 +886,69 @@ class ObjectCacheTest : public ::testing::Test {
     }
   }
 
+  void testGetTtl() {
+    const uint32_t ttlSecs = 600;
+
+    ObjectCacheConfig config;
+    config.setCacheName("test").setCacheCapacity(10'000).setItemDestructor(
+        [&](ObjectCacheDestructorData data) { data.deleteObject<Foo>(); });
+    auto objcache = ObjectCache::create(config);
+
+    auto before = util::getCurrentTimeSec();
+    std::this_thread::sleep_for(std::chrono::seconds{3});
+    objcache->insertOrReplace("Foo", std::make_unique<Foo>(),
+                              0 /*object size*/, ttlSecs);
+
+    // lookup via find API
+    auto found1 = objcache->template find<Foo>("Foo");
+    ASSERT_NE(nullptr, found1);
+
+    // get TTL info
+    EXPECT_EQ(ttlSecs, objcache->getConfiguredTtl(found1).count());
+    EXPECT_LE(before + ttlSecs, objcache->getExpiryTimeSec(found1));
+
+    // lookup via findToWrite API
+    auto found2 = objcache->template findToWrite<Foo>("Foo");
+    ASSERT_NE(nullptr, found2);
+
+    // get TTL info
+    EXPECT_EQ(ttlSecs, objcache->getConfiguredTtl(found2).count());
+    EXPECT_LE(before + ttlSecs, objcache->getExpiryTimeSec(found2));
+  }
+
+  void testUpdateTtl() {
+    const uint32_t ttlSecs = 600;
+
+    ObjectCacheConfig config;
+    config.setCacheName("test").setCacheCapacity(10'000).setItemDestructor(
+        [&](ObjectCacheDestructorData data) { data.deleteObject<Foo>(); });
+    auto objcache = ObjectCache::create(config);
+
+    auto insertionTime = util::getCurrentTimeSec();
+    objcache->insertOrReplace("Foo", std::make_unique<Foo>(),
+                              0 /*object size*/, ttlSecs);
+
+    auto found = objcache->template find<Foo>("Foo");
+    ASSERT_NE(nullptr, found);
+
+    // get TTL info
+    EXPECT_EQ(ttlSecs, objcache->getConfiguredTtl(found).count());
+    EXPECT_LE(insertionTime + ttlSecs, objcache->getExpiryTimeSec(found));
+
+    // update expiry time
+    auto currExpTime = objcache->getExpiryTimeSec(found);
+    EXPECT_TRUE(objcache->updateExpiryTimeSec(found, currExpTime + ttlSecs));
+    EXPECT_EQ(2 * ttlSecs, objcache->getConfiguredTtl(found).count());
+    EXPECT_EQ(currExpTime + ttlSecs, objcache->getExpiryTimeSec(found));
+
+    // extend TTL
+    auto now = util::getCurrentTimeSec();
+    std::this_thread::sleep_for(std::chrono::seconds{3});
+    EXPECT_TRUE(objcache->extendTtl(found, std::chrono::seconds(3 * ttlSecs)));
+    EXPECT_LE(now + ttlSecs, objcache->getExpiryTimeSec(found));
+    EXPECT_LE(3 * ttlSecs, objcache->getConfiguredTtl(found).count());
+  }
+
   void testMultithreadReplace() {
     // Sanity test to see if insertOrReplace across multiple
     // threads are safe.
@@ -1072,6 +1142,32 @@ class ObjectCacheTest : public ::testing::Test {
       fs[i].join();
     }
   }
+
+  void testMultithreadUpdateTtl() {
+    // Sanity test to see if update TTL across multiple
+    // threads is safe.
+    ObjectCacheConfig config;
+    config.setCacheName("test").setCacheCapacity(10'000).setItemDestructor(
+        [&](ObjectCacheDestructorData data) { data.deleteObject<Foo>(); });
+    auto objcache = ObjectCache::create(config);
+    objcache->insertOrReplace("key", std::make_unique<Foo>(), 0, 60);
+
+    auto runUpdateTtlOps = [&] {
+      for (int i = 0; i < 2000; i++) {
+        auto found = objcache->template find<Foo>("key");
+        auto configuredTtlSecs = objcache->getConfiguredTtl(found).count();
+        objcache->extendTtl(found, std::chrono::seconds{configuredTtlSecs});
+      }
+    };
+
+    std::vector<std::thread> ts;
+    for (int i = 0; i < 10; i++) {
+      ts.push_back(std::thread{runUpdateTtlOps});
+    }
+    for (int i = 0; i < 10; i++) {
+      ts[i].join();
+    }
+  }
 };
 
 using AllocatorTypes = ::testing::Types<LruAllocator>;
 TYPED_TEST(ObjectCacheTest, PersistenceHighLoad) { this->testPersistenceHighLoad(); }
+TYPED_TEST(ObjectCacheTest, GetTtl) { this->testGetTtl(); }
+TYPED_TEST(ObjectCacheTest, UpdateTtl) { this->testUpdateTtl(); }
+
 TYPED_TEST(ObjectCacheTest, MultithreadReplace) {
   this->testMultithreadReplace();
 }
@@ -1128,6 +1227,9 @@ TYPED_TEST(ObjectCacheTest, MultithreadFindAndEviction) {
 TYPED_TEST(ObjectCacheTest, MultithreadFindAndReplaceWith10Shards) {
   this->testMultithreadFindAndReplaceWith10Shards();
 }
+TYPED_TEST(ObjectCacheTest, MultithreadUpdateTtl) {
+  this->testMultithreadUpdateTtl();
+}
 
 using ObjectCache = ObjectCache<LruAllocator>;
 TEST(ObjectCacheTest, LruEviction) {
diff --git a/cachelib/external/fbthrift b/cachelib/external/fbthrift
index 33a9fbc258..cbc3de581f 160000
--- a/cachelib/external/fbthrift
+++ b/cachelib/external/fbthrift
@@ -1 +1 @@
-Subproject commit 33a9fbc258f21818f20ea03a55c979014882e84d
+Subproject commit cbc3de581fdf36ba474b0c135b9e785e504f1c1e
diff --git a/cachelib/external/fizz b/cachelib/external/fizz
index 9198ca6e7d..287625bd66 160000
--- a/cachelib/external/fizz
+++ b/cachelib/external/fizz
@@ -1 +1 @@
-Subproject commit 9198ca6e7daa50fae6b8413d745b4faaf97dfd10
+Subproject commit 287625bd6676b812e75ad0b088a61f72b4c9e681
diff --git a/cachelib/external/folly b/cachelib/external/folly
index 128cfac6ac..ce2b95715d 160000
--- a/cachelib/external/folly
+++ b/cachelib/external/folly
@@ -1 +1 @@
-Subproject commit 128cfac6ac3d69825bad2af852fced3f63d87411
+Subproject commit ce2b95715de229fcb51bd97410469a3ad4d2bfb2
diff --git a/cachelib/external/wangle b/cachelib/external/wangle
index 6bc77c8d46..44690e7894 160000
--- a/cachelib/external/wangle
+++ b/cachelib/external/wangle
@@ -1 +1 @@
-Subproject commit 6bc77c8d46b5ef68d77e921bb1e3d1e576adb8fe
+Subproject commit 44690e7894842a7127245837b69627d4b964aabd
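A runnable toy model of the TTL helpers' contract exercised by the tests above (plain integer seconds, not the CacheLib implementation): updateExpiryTimeSec() sets an absolute expiry timestamp, while extendTtl() re-bases the expiry to now + ttl.

```cpp
// Toy TTL bookkeeping mirroring the assertions in testUpdateTtl().
#include <cassert>
#include <chrono>
#include <cstdint>

struct TtlBook {
  uint32_t creationTime;
  uint32_t expiryTime; // 0 means "no TTL"

  uint32_t configuredTtl() const {
    return expiryTime == 0 ? 0 : expiryTime - creationTime;
  }
  bool updateExpiryTimeSec(uint32_t newExpiry) {
    expiryTime = newExpiry; // absolute timestamp
    return true;
  }
  bool extendTtl(uint32_t now, std::chrono::seconds ttl) {
    expiryTime = now + static_cast<uint32_t>(ttl.count()); // relative to now
    return true;
  }
};

int main() {
  TtlBook item{/*creationTime=*/1000, /*expiryTime=*/1600}; // ttl = 600s
  assert(item.configuredTtl() == 600);

  item.updateExpiryTimeSec(item.expiryTime + 600); // push expiry out 600s
  assert(item.configuredTtl() == 1200);            // 2 * original ttl

  item.extendTtl(/*now=*/1500, std::chrono::seconds{1800});
  assert(item.expiryTime == 3300); // re-based from "now", not from creation
}
```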
visitor("navy_bc_cleanup_entry_header_checksum_errors", - cleanupEntryHeaderChecksumErrorCount_.get()); + cleanupEntryHeaderChecksumErrorCount_.get(), + CounterVisitor::CounterType::RATE); visitor("navy_bc_cleanup_value_checksum_errors", - cleanupValueChecksumErrorCount_.get()); + cleanupValueChecksumErrorCount_.get(), + CounterVisitor::CounterType::RATE); visitor("navy_bc_succ_lookups", succLookupCount_.get(), CounterVisitor::CounterType::RATE); visitor("navy_bc_removes", removeCount_.get(), @@ -750,7 +752,8 @@ void BlockCache::getCounters(const CounterVisitor& visitor) const { visitor("navy_bc_reinsertion_errors", reinsertionErrorCount_.get(), CounterVisitor::CounterType::RATE); visitor("navy_bc_lookup_for_item_destructor_errors", - lookupForItemDestructorErrorCount_.get()); + lookupForItemDestructorErrorCount_.get(), + CounterVisitor::CounterType::RATE); visitor("navy_bc_remove_attempt_collisions", removeAttemptCollisions_.get(), CounterVisitor::CounterType::RATE); // Allocator visits region manager diff --git a/cachelib/navy/driver/Driver.cpp b/cachelib/navy/driver/Driver.cpp index 1615d1cc48..29215cc161 100644 --- a/cachelib/navy/driver/Driver.cpp +++ b/cachelib/navy/driver/Driver.cpp @@ -273,8 +273,10 @@ void Driver::getCounters(const CounterVisitor& visitor) const { CounterVisitor::CounterType::RATE); visitor("navy_rejected_bytes", rejectedBytes_.get(), CounterVisitor::CounterType::RATE); - visitor("navy_accepted_bytes", acceptedBytes_.get()); - visitor("navy_accepted", acceptedCount_.get()); + visitor("navy_accepted_bytes", acceptedBytes_.get(), + CounterVisitor::CounterType::RATE); + visitor("navy_accepted", acceptedCount_.get(), + CounterVisitor::CounterType::RATE); visitor("navy_parcel_memory", parcelMemory_.get()); visitor("navy_concurrent_inserts", concurrentInserts_.get()); diff --git a/cachelib/persistence/tests/PersistenceCache.h b/cachelib/persistence/tests/PersistenceCache.h index 5400b4d4ea..1db5b5fc8a 100644 --- a/cachelib/persistence/tests/PersistenceCache.h +++ b/cachelib/persistence/tests/PersistenceCache.h @@ -213,7 +213,7 @@ class PersistenceCache { public: const uint32_t kNumKeys = 1024 * 1024; // 1 million const size_t kCacheSize = 100 * kNumKeys; // 100MB - const size_t kCapacity = 4 * kCacheSize; // 400MB + const size_t kCapacity = 5 * kCacheSize; // 500MB std::unique_ptr buffer_; std::string cacheDir_; diff --git a/contrib/build-package.sh b/contrib/build-package.sh index ff487967cb..f0f3283df0 100755 --- a/contrib/build-package.sh +++ b/contrib/build-package.sh @@ -78,9 +78,8 @@ build_tests= show_help= many_jobs= verbose= -PREFIX="$PWD/opt/cachelib/" - -while getopts :BSdhijtvp: param +install_path= +while getopts :BSdhijtvI: param do case $param in i) install=yes ;; @@ -91,7 +90,7 @@ do v) verbose=yes ;; j) many_jobs=yes ;; t) build_tests=yes ;; - p) PREFIX=$OPTARG ;; + I) install_path=${OPTARG} ; install=yes ;; ?) die "unknown option. See -h for help." esac done @@ -102,11 +101,12 @@ test "$#" -eq 0 \ && die "missing dependancy name to build. 
diff --git a/cachelib/persistence/tests/PersistenceCache.h b/cachelib/persistence/tests/PersistenceCache.h
index 5400b4d4ea..1db5b5fc8a 100755
--- a/cachelib/persistence/tests/PersistenceCache.h
+++ b/cachelib/persistence/tests/PersistenceCache.h
@@ -213,7 +213,7 @@ class PersistenceCache {
  public:
   const uint32_t kNumKeys = 1024 * 1024;    // 1 million
   const size_t kCacheSize = 100 * kNumKeys; // 100MB
-  const size_t kCapacity = 4 * kCacheSize;  // 400MB
+  const size_t kCapacity = 5 * kCacheSize;  // 500MB
 
   std::unique_ptr buffer_;
   std::string cacheDir_;
diff --git a/contrib/build-package.sh b/contrib/build-package.sh
index ff487967cb..f0f3283df0 100755
--- a/contrib/build-package.sh
+++ b/contrib/build-package.sh
@@ -78,9 +78,8 @@ build_tests=
 show_help=
 many_jobs=
 verbose=
-PREFIX="$PWD/opt/cachelib/"
-
-while getopts :BSdhijtvp: param
+install_path=
+while getopts :BSdhijtvI: param
 do
   case $param in
   i) install=yes ;;
@@ -91,7 +90,7 @@ do
   v) verbose=yes ;;
   j) many_jobs=yes ;;
   t) build_tests=yes ;;
-  p) PREFIX=$OPTARG ;;
+  I) install_path=${OPTARG} ; install=yes ;;
   ?) die "unknown option. See -h for help."
   esac
 done
@@ -102,11 +101,12 @@ test "$#" -eq 0 \
     && die "missing dependency name to build. See -h for help"
 
 ######################################
-## Check which dependecy was requested
+## Check which dependency was requested
 ######################################
 
 external_git_clone=
 external_git_branch=
+# external_git_tag can also be used for commit hashes
 external_git_tag=
 update_submodules=
 cmake_custom_params=
@@ -160,6 +160,7 @@ case "$1" in
     REPODIR=cachelib/external/$NAME
     SRCDIR=$REPODIR
     external_git_clone=yes
+    external_git_tag="8.0.1"
    cmake_custom_params="-DBUILD_SHARED_LIBS=ON"
     if test "$build_tests" = "yes" ; then
       cmake_custom_params="$cmake_custom_params -DFMT_TEST=YES"
@@ -174,7 +175,10 @@ case "$1" in
     REPODIR=cachelib/external/$NAME
     SRCDIR=$REPODIR/build/cmake
     external_git_clone=yes
-    external_git_branch=release
+    # Previously, we pinned to the release branch. v1.5.4 briefly required
+    # CMake >= 3.18 (later reverted). While waiting for v1.5.5, pin to the
+    # fix: https://github.com/facebook/zstd/pull/3510
+    external_git_tag=8420502e
     if test "$build_tests" = "yes" ; then
       cmake_custom_params="-DZSTD_BUILD_TESTS=ON"
     else
@@ -276,6 +280,7 @@ test -d cachelib || die "expected 'cachelib' directory not found in $PWD"
 
 # After ensuring we are in the correct directory, set the installation prefix
+PREFIX=${install_path:-"$PWD/opt/cachelib/"}
 CMAKE_PARAMS="$CMAKE_PARAMS -DCMAKE_INSTALL_PREFIX=$PREFIX"
 CMAKE_PREFIX_PATH="$PREFIX/lib/cmake:$PREFIX/lib64/cmake:$PREFIX/lib:$PREFIX/lib64:$PREFIX:${CMAKE_PREFIX_PATH:-}"
 export CMAKE_PREFIX_PATH
diff --git a/contrib/prerequisites-arch.sh b/contrib/prerequisites-arch.sh
index 85a8656f7b..249f6c8082 100755
--- a/contrib/prerequisites-arch.sh
+++ b/contrib/prerequisites-arch.sh
@@ -19,4 +19,5 @@ sudo pacman -S --needed --noconfirm cmake \
     boost \
     double-conversion \
     libdwarf \
+    numactl \
     libsodium
diff --git a/contrib/prerequisites-fedora32.sh b/contrib/prerequisites-fedora32.sh
index 235d6c1a8a..942cac0470 100755
--- a/contrib/prerequisites-fedora32.sh
+++ b/contrib/prerequisites-fedora32.sh
@@ -21,6 +21,7 @@ sudo dnf -y install bison flex patch bzip2 cmake \
     zlib-devel lz4-devel xz-devel bzip2-devel \
     jemalloc-devel snappy-devel libsodium-devel libdwarf-devel libaio-devel \
     gmock-devel gflags-devel gtest gtest-devel \
+    numactl-devel \
     fmt fmt-devel
 
 # DO NOT INSTALL glog-devel - need to build from source for the glog-*.cmake files
diff --git a/contrib/prerequisites-fedora34.sh b/contrib/prerequisites-fedora34.sh
index 7e45c8740d..c7182cc513 100755
--- a/contrib/prerequisites-fedora34.sh
+++ b/contrib/prerequisites-fedora34.sh
@@ -19,4 +19,5 @@ sudo dnf -y install bison flex patch bzip2 cmake \
     double-conversion double-conversion-devel make g++ \
     boost-devel libevent-devel openssl-devel libunwind-devel \
     zlib-devel lz4-devel xz-devel bzip2-devel \
-    jemalloc-devel snappy-devel libsodium-devel libdwarf-devel libaio-devel
+    jemalloc-devel snappy-devel libsodium-devel libdwarf-devel libaio-devel \
+    numactl-devel
diff --git a/contrib/prerequisites-rocky9.sh b/contrib/prerequisites-rocky9.sh
index bec5b82011..06720aba2e 100755
--- a/contrib/prerequisites-rocky9.sh
+++ b/contrib/prerequisites-rocky9.sh
@@ -38,7 +38,8 @@ sudo dnf install -y \
   jemalloc-devel \
   libsodium-devel \
   libaio-devel \
-  binutils-devel
+  binutils-devel \
+  numactl-devel
 
 sudo dnf install -y \
diff --git a/docker/build.sh b/docker/build.sh
new file mode 100755
index 0000000000..bb82f0142d
--- /dev/null
+++ b/docker/build.sh
@@ -0,0 +1,97 @@
+#!/usr/bin/env bash
+# SPDX-License-Identifier: BSD-3-Clause
+# Copyright 2022, Intel Corporation
+
+#
+# build.sh - runs a Docker container
from a Docker image with environment +# prepared for running CacheLib builds and tests. It uses Docker image +# tagged as described in ./images/build-image.sh. +# +# Notes: +# - set env var 'HOST_WORKDIR' to where the root of this project is on the host machine, +# - set env var 'OS' and 'OS_VER' properly to a system/Docker you want to build this +# repo on (for proper values take a look at the list of Dockerfiles at the +# utils/docker/images directory in this repo), e.g. OS=ubuntu, OS_VER=20.04, +# - set env var 'CONTAINER_REG' to container registry address +# [and possibly user/org name, and package name], e.g. "/pmem/CacheLib", +# - set env var 'DNS_SERVER' if you use one, +# - set env var 'COMMAND' to execute specific command within Docker container or +# env var 'TYPE' to pick command based on one of the predefined types of build (see below). +# + +set -e + +source $(dirname ${0})/set-ci-vars.sh +IMG_VER=${IMG_VER:-devel} +TAG="${OS}-${OS_VER}-${IMG_VER}" +IMAGE_NAME=${CONTAINER_REG}:${TAG} +CONTAINER_NAME=CacheLib-${OS}-${OS_VER} +WORKDIR=/CacheLib # working dir within Docker container +SCRIPTSDIR=${WORKDIR}/docker + +if [[ -z "${OS}" || -z "${OS_VER}" ]]; then + echo "ERROR: The variables OS and OS_VER have to be set " \ + "(e.g. OS=fedora, OS_VER=32)." + exit 1 +fi + +if [[ -z "${HOST_WORKDIR}" ]]; then + echo "ERROR: The variable HOST_WORKDIR has to contain a path to " \ + "the root of this project on the host machine." + exit 1 +fi + +if [[ -z "${CONTAINER_REG}" ]]; then + echo "ERROR: CONTAINER_REG environment variable is not set " \ + "(e.g. \"//\")." + exit 1 +fi + +# Set command to execute in the Docker container +COMMAND="./run-build.sh"; +echo "COMMAND to execute within Docker container: ${COMMAND}" + +if [ -n "${DNS_SERVER}" ]; then DOCKER_OPTS="${DOCKER_OPTS} --dns=${DNS_SERVER}"; fi + +# Check if we are running on a CI (Travis or GitHub Actions) +[ -n "${GITHUB_ACTIONS}" -o -n "${TRAVIS}" ] && CI_RUN="YES" || CI_RUN="NO" + +# Do not allocate a pseudo-TTY if we are running on GitHub Actions +[ ! 
"${GITHUB_ACTIONS}" ] && DOCKER_OPTS="${DOCKER_OPTS} --tty=true" + + +echo "Running build using Docker image: ${IMAGE_NAME}" + +# Run a container with +# - environment variables set (--env) +# - host directory containing source mounted (-v) +# - working directory set (-w) +docker run --privileged=true --name=${CONTAINER_NAME} -i \ + ${DOCKER_OPTS} \ + --env http_proxy=${http_proxy} \ + --env https_proxy=${https_proxy} \ + --env TERM=xterm-256color \ + --env WORKDIR=${WORKDIR} \ + --env SCRIPTSDIR=${SCRIPTSDIR} \ + --env GITHUB_REPO=${GITHUB_REPO} \ + --env CI_RUN=${CI_RUN} \ + --env TRAVIS=${TRAVIS} \ + --env GITHUB_ACTIONS=${GITHUB_ACTIONS} \ + --env CI_COMMIT=${CI_COMMIT} \ + --env CI_COMMIT_RANGE=${CI_COMMIT_RANGE} \ + --env CI_BRANCH=${CI_BRANCH} \ + --env CI_EVENT_TYPE=${CI_EVENT_TYPE} \ + --env CI_REPO_SLUG=${CI_REPO_SLUG} \ + --env DOC_UPDATE_GITHUB_TOKEN=${DOC_UPDATE_GITHUB_TOKEN} \ + --env DOC_UPDATE_BOT_NAME=${DOC_UPDATE_BOT_NAME} \ + --env DOC_REPO_OWNER=${DOC_REPO_OWNER} \ + --env COVERITY_SCAN_TOKEN=${COVERITY_SCAN_TOKEN} \ + --env COVERITY_SCAN_NOTIFICATION_EMAIL=${COVERITY_SCAN_NOTIFICATION_EMAIL} \ + --env TEST_TIMEOUT=${TEST_TIMEOUT} \ + --env TZ='Europe/Warsaw' \ + --shm-size=4G \ + -v ${HOST_WORKDIR}:${WORKDIR} \ + -v /etc/localtime:/etc/localtime \ + -w ${SCRIPTSDIR} \ + ${IMAGE_NAME} ${COMMAND} + diff --git a/docker/images/build-image.sh b/docker/images/build-image.sh new file mode 100755 index 0000000000..985a6e0ff1 --- /dev/null +++ b/docker/images/build-image.sh @@ -0,0 +1,38 @@ +#!/usr/bin/env bash +# SPDX-License-Identifier: BSD-3-Clause +# Copyright 2016-2021, Intel Corporation +# +# build-image.sh - prepares a Docker image with -based environment for +# testing (or dev) purpose, tagged with ${CONTAINER_REG}:${OS}-${OS_VER}-${IMG_VER}, +# according to the ${OS}-${OS_VER}.Dockerfile file located in the same directory. +# IMG_VER is a version of Docker image (it usually relates to project's release tag) +# and it defaults to "devel". +# + +set -e +IMG_VER=${IMG_VER:-devel} +TAG="${OS}-${OS_VER}-${IMG_VER}" + +if [[ -z "${OS}" || -z "${OS_VER}" ]]; then + echo "ERROR: The variables OS and OS_VER have to be set " \ + "(e.g. OS=fedora, OS_VER=34)." + exit 1 +fi + +if [[ -z "${CONTAINER_REG}" ]]; then + echo "ERROR: CONTAINER_REG environment variable is not set " \ + "(e.g. \"//\")." + exit 1 +fi + +echo "Check if the file ${OS}-${OS_VER}.Dockerfile exists" +if [[ ! -f "${OS}-${OS_VER}.Dockerfile" ]]; then + echo "Error: ${OS}-${OS_VER}.Dockerfile does not exist." + exit 1 +fi + +echo "Build a Docker image tagged with: ${CONTAINER_REG}:${TAG}" +docker build -t ${CONTAINER_REG}:${TAG} \ + --build-arg http_proxy=$http_proxy \ + --build-arg https_proxy=$https_proxy \ + -f ${OS}-${OS_VER}.Dockerfile . 
diff --git a/docker/images/centos-8streams.Dockerfile b/docker/images/centos-8streams.Dockerfile new file mode 100644 index 0000000000..29752c5d98 --- /dev/null +++ b/docker/images/centos-8streams.Dockerfile @@ -0,0 +1,24 @@ +FROM quay.io/centos/centos:stream8 + +RUN dnf install -y \ +cmake \ +sudo \ +git \ +tzdata \ +vim \ +gdb \ +clang \ +python36 \ +glibc-devel.i686 \ +xmlto \ +uuid \ +libuuid-devel \ +json-c-devel \ +perf \ +numactl + +COPY ./install-cachelib-deps.sh ./install-cachelib-deps.sh +RUN ./install-cachelib-deps.sh + +COPY ./install-dsa-deps.sh ./install-dsa-deps.sh +RUN ./install-dsa-deps.sh diff --git a/docker/images/install-cachelib-deps.sh b/docker/images/install-cachelib-deps.sh new file mode 100755 index 0000000000..6d8fbdef7b --- /dev/null +++ b/docker/images/install-cachelib-deps.sh @@ -0,0 +1,14 @@ +#!/usr/bin/env bash +# SPDX-License-Identifier: BSD-3-Clause +# Copyright 2022, Intel Corporation + +git clone -b develop https://github.com/intel/CacheLib CacheLib + +./CacheLib/contrib/prerequisites-centos8.sh + +for pkg in zstd googleflags googlelog googletest sparsemap fmt folly fizz wangle fbthrift ; +do + sudo ./CacheLib/contrib/build-package.sh -j -I /opt/ "$pkg" +done + +rm -rf CacheLib diff --git a/docker/images/install-dsa-deps.sh b/docker/images/install-dsa-deps.sh new file mode 100755 index 0000000000..b4c62ecc93 --- /dev/null +++ b/docker/images/install-dsa-deps.sh @@ -0,0 +1,23 @@ +#!/usr/bin/env bash +# Copyright 2023, Intel Corporation + +# Install idxd-config +git clone https://github.com/intel/idxd-config.git +cd idxd-config +./autogen.sh +./configure CFLAGS='-g -O2' --prefix=/usr --sysconfdir=/etc --libdir=/usr/lib64 +make +make check +sudo make install +cd ../ +rm -rf idxd-config + +# Install DML Library +git clone --recursive https://github.com/intel/DML.git +cd DML +mkdir build +cd build +cmake -DCMAKE_INSTALL_PREFIX=/usr -DCMAKE_BUILD_TYPE=RelWithDebInfo .. +cmake --build . --target install +cd ../../ +rm -rf DML diff --git a/docker/images/push-image.sh b/docker/images/push-image.sh new file mode 100755 index 0000000000..8f516b4205 --- /dev/null +++ b/docker/images/push-image.sh @@ -0,0 +1,49 @@ +#!/usr/bin/env bash +# SPDX-License-Identifier: BSD-3-Clause +# Copyright 2016-2021, Intel Corporation + +# +# push-image.sh - pushes the Docker image tagged as described in +# ./build-image.sh, to the ${CONTAINER_REG}. +# +# The script utilizes ${CONTAINER_REG_USER} and ${CONTAINER_REG_PASS} variables to +# log in to the ${CONTAINER_REG}. The variables can be set in the CI's configuration +# for automated builds. +# + +set -e +IMG_VER=${IMG_VER:-devel} +TAG="${OS}-${OS_VER}-${IMG_VER}" + +if [[ -z "${OS}" || -z "${OS_VER}" ]]; then + echo "ERROR: The variables OS and OS_VER have to be set " \ + "(e.g. OS=fedora, OS_VER=34)." + exit 1 +fi + +if [[ -z "${CONTAINER_REG}" ]]; then + echo "ERROR: CONTAINER_REG environment variable is not set " \ + "(e.g. \"//\")." + exit 1 +fi + +if [[ -z "${CONTAINER_REG_USER}" || -z "${CONTAINER_REG_PASS}" ]]; then + echo "ERROR: variables CONTAINER_REG_USER=\"${CONTAINER_REG_USER}\" and " \ + "CONTAINER_REG_PASS=\"${CONTAINER_REG_PASS}\"" \ + "have to be set properly to allow login to the Container Registry." + exit 1 +fi + +# Check if the image tagged with ${CONTAINER_REG}:${TAG} exists locally +if [[ ! $(docker images -a | awk -v pattern="^${CONTAINER_REG}:${TAG}\$" \ + '$1":"$2 ~ pattern') ]] +then + echo "ERROR: Docker image tagged ${CONTAINER_REG}:${TAG} does not exist locally." 
+    exit 1
+fi
+
+echo "Log in to the Container Registry: ${CONTAINER_REG}"
+echo "${CONTAINER_REG_PASS}" | docker login ghcr.io -u="${CONTAINER_REG_USER}" --password-stdin
+
+echo "Push the image to the Container Registry"
+docker push ${CONTAINER_REG}:${TAG}
diff --git a/docker/pull-or-rebuild-image.sh b/docker/pull-or-rebuild-image.sh
new file mode 100755
index 0000000000..dcdcb40e8c
--- /dev/null
+++ b/docker/pull-or-rebuild-image.sh
@@ -0,0 +1,124 @@
+#!/usr/bin/env bash
+# SPDX-License-Identifier: BSD-3-Clause
+# Copyright 2016-2021, Intel Corporation
+
+#
+# pull-or-rebuild-image.sh - rebuilds the Docker image used in the
+# current build (if necessary) or pulls it from the Container Registry.
+# The Docker image is tagged as described in docker/build-image.sh,
+# but IMG_VER defaults in this script to "latest" (just in case it's
+# used locally without building any images).
+#
+# If the Docker image was rebuilt and all requirements are fulfilled (more
+# details in the push_image function below), the image will be pushed to
+# the ${CONTAINER_REG}.
+#
+# The script rebuilds the Docker image if:
+# 1. the Dockerfile for the current OS version (${OS}-${OS_VER}.Dockerfile)
+#    or any .sh script in the Dockerfiles directory were modified and committed, or
+# 2. the "rebuild" param was passed as the first argument to this script.
+#
+# The script pulls the Docker image if:
+# 1. it does not have to be rebuilt (based on committed changes), or
+# 2. the "pull" param was passed as the first argument to this script.
+#
+
+set -e
+
+source $(dirname ${0})/set-ci-vars.sh
+IMG_VER=${IMG_VER:-latest}
+TAG="${OS}-${OS_VER}-${IMG_VER}"
+IMAGES_DIR_NAME=images
+BASE_DIR=docker/${IMAGES_DIR_NAME}
+
+if [[ -z "${OS}" || -z "${OS_VER}" ]]; then
+    echo "ERROR: The variables OS and OS_VER have to be set properly " \
+        "(e.g. OS=fedora, OS_VER=34)."
+    exit 1
+fi
+
+if [[ -z "${CONTAINER_REG}" ]]; then
+    echo "ERROR: CONTAINER_REG environment variable is not set " \
+        "(e.g. \"//\")."
+    exit 1
+fi
+
+function build_image() {
+    echo "Building the Docker image for the ${OS}-${OS_VER}.Dockerfile"
+    pushd ${IMAGES_DIR_NAME}
+    ./build-image.sh
+    popd
+}
+
+function pull_image() {
+    echo "Pull the image '${CONTAINER_REG}:${TAG}' from the Container Registry."
+    docker pull ${CONTAINER_REG}:${TAG}
+}
+
+function push_image {
+    # Check if the image has to be pushed to the Container Registry:
+    # - only upstream (not forked) repository,
+    # - stable-* or master branch,
+    # - not a pull_request event,
+    # - and PUSH_IMAGE flag was set for current build.
+    if [[ "${CI_REPO_SLUG}" == "${GITHUB_REPO}" \
+        && (${CI_BRANCH} == develop || ${CI_BRANCH} == main) \
+        && ${CI_EVENT_TYPE} != "pull_request" \
+        && ${PUSH_IMAGE} == "1" ]]
+    then
+        echo "The image will be pushed to the Container Registry: ${CONTAINER_REG}"
+        pushd ${IMAGES_DIR_NAME}
+        ./push-image.sh
+        popd
+    else
+        echo "Skip pushing the image to the Container Registry."
+    fi
+}
+
+# If "rebuild" or "pull" are passed to the script as param, force rebuild/pull.
+if [[ "${1}" == "rebuild" ]]; then
+    build_image
+    push_image
+    exit 0
+elif [[ "${1}" == "pull" ]]; then
+    pull_image
+    exit 0
+fi
+
+# Determine if we need to rebuild the image or just pull it from
+# the Container Registry, based on committed changes.
+if [ -n "${CI_COMMIT_RANGE}" ]; then
+    commits=$(git rev-list ${CI_COMMIT_RANGE})
+else
+    commits=${CI_COMMIT}
+fi
+
+if [[ -z "${commits}" ]]; then
+    echo "'commits' variable is empty. Docker image will be pulled."
+fi + +echo "Commits in the commit range:" +for commit in ${commits}; do echo ${commit}; done + +echo "Files modified within the commit range:" +files=$(for commit in ${commits}; do git diff-tree --no-commit-id --name-only \ + -r ${commit}; done | sort -u) +for file in ${files}; do echo ${file}; done + +# Check if committed file modifications require the Docker image to be rebuilt +for file in ${files}; do + # Check if modified files are relevant to the current build + if [[ ${file} =~ ^(${BASE_DIR})\/(${OS})-(${OS_VER})\.Dockerfile$ ]] \ + || [[ ${file} =~ ^(${BASE_DIR})\/.*\.sh$ ]] + then + build_image + push_image + exit 0 + fi +done + +# Getting here means rebuilding the Docker image isn't required (based on changed files). +# Pull the image from the Container Registry or rebuild anyway, if pull fails. +if ! pull_image; then + build_image + push_image +fi diff --git a/docker/run-build.sh b/docker/run-build.sh new file mode 100755 index 0000000000..02c7caf731 --- /dev/null +++ b/docker/run-build.sh @@ -0,0 +1,17 @@ +#!/usr/bin/env bash +# SPDX-License-Identifier: BSD-3-Clause +# Copyright 2022, Intel Corporation + +set -e + +function sudo_password() { + echo ${USERPASS} | sudo -Sk $* +} + +cd .. +mkdir build +cd build +cmake ../cachelib -DBUILD_TESTS=ON -DCMAKE_INSTALL_PREFIX=/opt -DCMAKE_BUILD_TYPE=Debug +sudo_password make install -j$(nproc) + +cd /opt/tests && $WORKDIR/run_tests.sh diff --git a/docker/set-ci-vars.sh b/docker/set-ci-vars.sh new file mode 100755 index 0000000000..f6f52132c8 --- /dev/null +++ b/docker/set-ci-vars.sh @@ -0,0 +1,111 @@ +#!/usr/bin/env bash +# SPDX-License-Identifier: BSD-3-Clause +# Copyright 2020-2021, Intel Corporation + +# +# set-ci-vars.sh -- set CI variables common for both: +# Travis and GitHub Actions CIs +# + +set -e + +function get_commit_range_from_last_merge { + # get commit id of the last merge + LAST_MERGE=$(git log --merges --pretty=%H -1) + LAST_COMMIT=$(git log --pretty=%H -1) + RANGE_END="HEAD" + if [ -n "${GITHUB_ACTIONS}" ] && [ "${GITHUB_EVENT_NAME}" == "pull_request" ] && [ "${LAST_MERGE}" == "${LAST_COMMIT}" ]; then + # GitHub Actions commits its own merge in case of pull requests + # so the first merge commit has to be skipped. + + LAST_COMMIT=$(git log --pretty=%H -2 | tail -n1) + LAST_MERGE=$(git log --merges --pretty=%H -2 | tail -n1) + # If still the last commit is a merge commit it means we're manually + # merging changes (probably back from stable branch). We have to use + # left parent of the merge and the current commit for COMMIT_RANGE. + if [ "${LAST_MERGE}" == "${LAST_COMMIT}" ]; then + LAST_MERGE=$(git log --merges --pretty=%P -2 | tail -n1 | cut -d" " -f1) + RANGE_END=${LAST_COMMIT} + fi + elif [ "${LAST_MERGE}" == "${LAST_COMMIT}" ] && + ([ "${TRAVIS_EVENT_TYPE}" == "push" ] || [ "${GITHUB_EVENT_NAME}" == "push" ]); then + # Other case in which last commit equals last merge, is when committing + # a manual merge. Push events don't set proper COMMIT_RANGE. + # It has to be then set: from merge's left parent to the current commit. + LAST_MERGE=$(git log --merges --pretty=%P -1 | cut -d" " -f1) + fi + if [ "${LAST_MERGE}" == "" ]; then + # possible in case of shallow clones + # or new repos with no merge commits yet + # - pick up the first commit + LAST_MERGE=$(git log --pretty=%H | tail -n1) + fi + COMMIT_RANGE="${LAST_MERGE}..${RANGE_END}" + # make sure it works now + if ! 
git rev-list ${COMMIT_RANGE} >/dev/null; then + COMMIT_RANGE="" + fi + echo ${COMMIT_RANGE} +} + +COMMIT_RANGE_FROM_LAST_MERGE=$(get_commit_range_from_last_merge) + +if [ -n "${TRAVIS}" ]; then + CI_COMMIT=${TRAVIS_COMMIT} + CI_COMMIT_RANGE="${TRAVIS_COMMIT_RANGE/.../..}" + CI_BRANCH=${TRAVIS_BRANCH} + CI_EVENT_TYPE=${TRAVIS_EVENT_TYPE} + CI_REPO_SLUG=${TRAVIS_REPO_SLUG} + + # CI_COMMIT_RANGE is usually invalid for force pushes - fix it when used + # with non-upstream repository + if [ -n "${CI_COMMIT_RANGE}" -a "${CI_REPO_SLUG}" != "${GITHUB_REPO}" ]; then + if ! git rev-list ${CI_COMMIT_RANGE}; then + CI_COMMIT_RANGE=${COMMIT_RANGE_FROM_LAST_MERGE} + fi + fi + + case "${TRAVIS_CPU_ARCH}" in + "amd64") + CI_CPU_ARCH="x86_64" + ;; + *) + CI_CPU_ARCH=${TRAVIS_CPU_ARCH} + ;; + esac + +elif [ -n "${GITHUB_ACTIONS}" ]; then + CI_COMMIT=${GITHUB_SHA} + CI_COMMIT_RANGE=${COMMIT_RANGE_FROM_LAST_MERGE} + CI_BRANCH=$(echo ${GITHUB_REF} | cut -d'/' -f3) + CI_REPO_SLUG=${GITHUB_REPOSITORY} + CI_CPU_ARCH="x86_64" # GitHub Actions supports only x86_64 + + case "${GITHUB_EVENT_NAME}" in + "schedule") + CI_EVENT_TYPE="cron" + ;; + *) + CI_EVENT_TYPE=${GITHUB_EVENT_NAME} + ;; + esac + +else + CI_COMMIT=$(git log --pretty=%H -1) + CI_COMMIT_RANGE=${COMMIT_RANGE_FROM_LAST_MERGE} + CI_CPU_ARCH="x86_64" +fi + +export CI_COMMIT=${CI_COMMIT} +export CI_COMMIT_RANGE=${CI_COMMIT_RANGE} +export CI_BRANCH=${CI_BRANCH} +export CI_EVENT_TYPE=${CI_EVENT_TYPE} +export CI_REPO_SLUG=${CI_REPO_SLUG} +export CI_CPU_ARCH=${CI_CPU_ARCH} + +echo CI_COMMIT=${CI_COMMIT} +echo CI_COMMIT_RANGE=${CI_COMMIT_RANGE} +echo CI_BRANCH=${CI_BRANCH} +echo CI_EVENT_TYPE=${CI_EVENT_TYPE} +echo CI_REPO_SLUG=${CI_REPO_SLUG} +echo CI_CPU_ARCH=${CI_CPU_ARCH} diff --git a/run_code_coverage.sh b/run_code_coverage.sh new file mode 100755 index 0000000000..7722e262bf --- /dev/null +++ b/run_code_coverage.sh @@ -0,0 +1,20 @@ +#!/bin/bash + +#Build CacheLib with flag -DCOVERAGE_ENABLED=ON + +# Track coverage +lcov -c -i -b . -d . -o Coverage.baseline +./run_tests.sh +lcov -c -d . -b . 
-o Coverage.out
+lcov -a Coverage.baseline -a Coverage.out -o Coverage.combined
+
+# Generate report
+COVERAGE_DIR='coverage_report'
+genhtml Coverage.combined -o ${COVERAGE_DIR}
+COVERAGE_REPORT="${COVERAGE_DIR}.tgz"
+tar -zcvf ${COVERAGE_REPORT} ${COVERAGE_DIR}
+echo "Created coverage report ${COVERAGE_REPORT}"
+
+# Cleanup
+rm Coverage.baseline Coverage.out Coverage.combined
+rm -rf ${COVERAGE_DIR}
diff --git a/run_tests.sh b/run_tests.sh
new file mode 100755
index 0000000000..e575dbc62a
--- /dev/null
+++ b/run_tests.sh
@@ -0,0 +1,15 @@
+#!/bin/bash
+
+# Newline separated list of tests to ignore
+BLACKLIST="allocator-test-NavySetupTest
+allocator-test-NvmCacheTests
+shm-test-test_page_size"
+
+if [ "$1" == "long" ]; then
+    find -type f -executable | grep -vF "$BLACKLIST" | xargs -n1 bash -c
+else
+    find -type f \( -not -name "*bench*" -and -not -name "navy*" \) -executable | grep -vF "$BLACKLIST" | xargs -n1 bash -c
+fi
+
+../bin/cachebench --json_test_config ../test_configs/consistency/navy.json
+../bin/cachebench --json_test_config ../test_configs/consistency/navy-multi-tier.json
diff --git a/website/docs/Cache_Library_User_Guides/Cachebench_FB_HW_eval.md b/website/docs/Cache_Library_User_Guides/Cachebench_FB_HW_eval.md
index ce99649133..d17b7ac522 100644
--- a/website/docs/Cache_Library_User_Guides/Cachebench_FB_HW_eval.md
+++ b/website/docs/Cache_Library_User_Guides/Cachebench_FB_HW_eval.md
@@ -17,10 +17,11 @@ sufficient free memory (50+GB) and SSD capacity (1TB).
 * SSD Capacity: 100GB or more available capacity
 * Internet connection capable of accessing github.com and installing packages
 
-## Set up the SSD devices using mdraid
+## Set up the SSD devices
 
-To gather SSD performance metrics, the SSD must be setup first. An example
-below sets up a raid device to handle two ssds being used by CacheBench.
+To gather SSD performance metrics, the SSDs must be set up first. CacheBench (and CacheLib) supports various types of devices for the NVM cache, including raw block devices and regular files. To use multiple SSDs as the NVM cache, CacheLib also provides native support for RAID0 (i.e., striping).
+
+Optionally, as an example, a user can set up and use md devices as follows. In this example, the md device is created from two SSD devices and used as a raw block device by CacheBench.
 
 ```sh
 mdadm --create /dev/md0 --force --raid-devices=2 --level=0 --chunk=256 /dev/nvme1n1 /dev/nvme2n1
@@ -142,7 +143,7 @@ mdadm --create /dev/md0 --force --raid-devices=2 --level=0 --chunk=256 /dev/nvme
   make install
 ```
 
-See [build and installation](/docs/installation/installation) for further details.
+See [build and installation](/docs/installation) for further details.
 
 ## Running the benchmark for SSD perf testing
 
@@ -196,7 +197,6 @@ For a full list of options that can be configured, see [configuring cachebench](
 using the `--progress` and specifying a duration in seconds. If
 `--progress-stats-file` is also specified, on every progress interval,
 `cachebench` would log the internal stats to the specified file.
-
 ## Running cachebench with the trace workload
 
 Meta is sharing anonymized traces captured from large scale production cache services. These traces are licensed under the same license as CacheLib. They are meant to help academic and industry researchers to optimize for our caching workloads. One can freely download them from our AWS S3 bucket and run CacheBench to replay the traces with varying configurations as follows.
diff --git a/website/docs/Cache_Library_User_Guides/Cachebench_Overview.md b/website/docs/Cache_Library_User_Guides/Cachebench_Overview.md index eb72646542..8c878e1be6 100644 --- a/website/docs/Cache_Library_User_Guides/Cachebench_Overview.md +++ b/website/docs/Cache_Library_User_Guides/Cachebench_Overview.md @@ -53,6 +53,6 @@ developer's need. The following are few examples. ## Building cachebench -Follow instructions in [Installation](/docs/installation/installation) to build +Follow instructions in [Installation](/docs/installation) to build cachebench. This should install cachebench in your local machine under ```opt/cachelib/bin/cachebench``` diff --git a/website/docs/installation/testing.md b/website/docs/installation/testing.md index 02b2cb747c..d8730127b4 100644 --- a/website/docs/installation/testing.md +++ b/website/docs/installation/testing.md @@ -11,7 +11,7 @@ of the cache infrastructure. ## Building CacheLib Unit Tests To build the cachelib unit tests, use one of the following commands -(see [installation](docs/installation/installation) instructions for more details): +(see [installation](/docs/installation) instructions for more details): 1. Use `./contrib/build.sh` script with the `-T` option. 2. Use `./contrib/build-package.sh -t cachelib` (with the `-t` option) @@ -42,7 +42,7 @@ Running a single unit test binary: ```sh $ cd opt/cachelib/tests -$ ./allocator-test-ItemTest +$ ./allocator-test-ItemTest [==========] Running 6 tests from 1 test suite. [----------] Global test environment set-up. [----------] 6 tests from ItemTest diff --git a/website/package.json b/website/package.json index 8c58fda9a2..ac9801eeeb 100644 --- a/website/package.json +++ b/website/package.json @@ -43,7 +43,8 @@ "ansi-html": "0.0.8", "ua-parser-js": "^1.0.33", "eta": "^2.0.0", - "http-cache-semantics": "^4.1.1" + "http-cache-semantics": "^4.1.1", + "@braintree/sanitize-url": "^6.0.1" }, "browserslist": { "production": [ diff --git a/website/yarn.lock b/website/yarn.lock index 19f12eb0d5..51e55da8c7 100644 --- a/website/yarn.lock +++ b/website/yarn.lock @@ -1809,10 +1809,10 @@ "@babel/helper-validator-identifier" "^7.18.6" to-fast-properties "^2.0.0" -"@braintree/sanitize-url@^6.0.0": - version "6.0.0" - resolved "https://registry.yarnpkg.com/@braintree/sanitize-url/-/sanitize-url-6.0.0.tgz#fe364f025ba74f6de6c837a84ef44bdb1d61e68f" - integrity sha512-mgmE7XBYY/21erpzhexk4Cj1cyTQ9LzvnTxtzM17BJ7ERMNE6W72mQRo0I1Ud8eFJ+RVVIcBNhLFZ3GX4XFz5w== +"@braintree/sanitize-url@^6.0.0", "@braintree/sanitize-url@^6.0.1": + version "6.0.2" + resolved "https://registry.yarnpkg.com/@braintree/sanitize-url/-/sanitize-url-6.0.2.tgz#6110f918d273fe2af8ea1c4398a88774bb9fc12f" + integrity sha512-Tbsj02wXCbqGmzdnXNk0SOF19ChhRU70BsroIi4Pm6Ehp56in6vch94mfbdQ17DozxkL3BAVjbZ4Qc1a0HFRAg== "@colors/colors@1.5.0": version "1.5.0"