diff --git a/.github/pull_request_template.md b/.github/pull_request_template.md new file mode 100644 index 000000000..26bdfedd5 --- /dev/null +++ b/.github/pull_request_template.md @@ -0,0 +1,29 @@ +## Description + + + +## Type of Change + +* Types + - [ ] Bug fix + - [ ] New feature + - [ ] Transfer Engine + - [ ] Mooncake Store + - [ ] Mooncake EP + - [ ] Integration + - [ ] P2P Store + - [ ] Python Wheel + - [ ] Breaking change + - [ ] CI/CD + - [ ] Documentation update + - [ ] Other + +## How Has This Been Tested? + + + +## Checklist + +- [ ] I have performed a self-review of my own code. +- [ ] I have updated the documentation. +- [ ] I have added tests to prove my changes are effective. diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 11670d901..91151b3ec 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -166,6 +166,28 @@ jobs: deactivate shell: bash + - name: Start Mooncake Master + run: | + source test_env/bin/activate + mkdir -p /tmp/mooncake_storage + mooncake_master \ + --eviction_high_watermark_ratio=0.95 \ + --cluster_id=ci_test_cluster \ + --port 50051 & + sleep 3 + shell: bash + + - name: Run Python Tensor API Performance Test (CI check) + env: + MOONCAKE_MASTER: "127.0.0.1:50051" + MOONCAKE_TE_META_DATA_SERVER: "http://127.0.0.1:8080/metadata" + MOONCAKE_PROTOCOL: "tcp" + LOCAL_HOSTNAME: "127.0.0.1" + run: | + source test_env/bin/activate + python scripts/test_tensor_api.py -n 1 + shell: bash + build-flags: runs-on: ubuntu-22.04 strategy: diff --git a/CMakeLists.txt b/CMakeLists.txt index 13179e792..a664909d3 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -12,6 +12,7 @@ if (BUILD_UNIT_TESTS) enable_testing() endif() +option(WITH_TE "build mooncake transfer engine and sample code" ON) option(WITH_STORE "build mooncake store library and sample code" ON) option(WITH_P2P_STORE "build p2p store library and sample code" OFF) option(WITH_RUST_EXAMPLE "build the Rust interface and sample code for the 
transfer engine" OFF) @@ -45,8 +46,10 @@ add_subdirectory(mooncake-common) include_directories(mooncake-common/etcd) include_directories(mooncake-common/include) -add_subdirectory(mooncake-transfer-engine) -include_directories(mooncake-transfer-engine/include) +if (WITH_TE) + add_subdirectory(mooncake-transfer-engine) + include_directories(mooncake-transfer-engine/include) +endif() if (WITH_STORE) message(STATUS "Mooncake Store will be built") diff --git a/CODE_OF_CONDUCT.md b/CODE_OF_CONDUCT.md new file mode 100644 index 000000000..18c914718 --- /dev/null +++ b/CODE_OF_CONDUCT.md @@ -0,0 +1,128 @@ +# Contributor Covenant Code of Conduct + +## Our Pledge + +We as members, contributors, and leaders pledge to make participation in our +community a harassment-free experience for everyone, regardless of age, body +size, visible or invisible disability, ethnicity, sex characteristics, gender +identity and expression, level of experience, education, socio-economic status, +nationality, personal appearance, race, religion, or sexual identity +and orientation. + +We pledge to act and interact in ways that contribute to an open, welcoming, +diverse, inclusive, and healthy community. 
+ +## Our Standards + +Examples of behavior that contributes to a positive environment for our +community include: + +* Demonstrating empathy and kindness toward other people +* Being respectful of differing opinions, viewpoints, and experiences +* Giving and gracefully accepting constructive feedback +* Accepting responsibility and apologizing to those affected by our mistakes, + and learning from the experience +* Focusing on what is best not just for us as individuals, but for the + overall community + +Examples of unacceptable behavior include: + +* The use of sexualized language or imagery, and sexual attention or + advances of any kind +* Trolling, insulting or derogatory comments, and personal or political attacks +* Public or private harassment +* Publishing others' private information, such as a physical or email + address, without their explicit permission +* Other conduct which could reasonably be considered inappropriate in a + professional setting + +## Enforcement Responsibilities + +Community leaders are responsible for clarifying and enforcing our standards of +acceptable behavior and will take appropriate and fair corrective action in +response to any behavior that they deem inappropriate, threatening, offensive, +or harmful. + +Community leaders have the right and responsibility to remove, edit, or reject +comments, commits, code, wiki edits, issues, and other contributions that are +not aligned to this Code of Conduct, and will communicate reasons for moderation +decisions when appropriate. + +## Scope + +This Code of Conduct applies within all community spaces, and also applies when +an individual is officially representing the community in public spaces. +Examples of representing our community include using an official e-mail address, +posting via an official social media account, or acting as an appointed +representative at an online or offline event. 
+ +## Enforcement + +Instances of abusive, harassing, or otherwise unacceptable behavior may be +reported to the community leaders responsible for enforcement at +. +All complaints will be reviewed and investigated promptly and fairly. + +All community leaders are obligated to respect the privacy and security of the +reporter of any incident. + +## Enforcement Guidelines + +Community leaders will follow these Community Impact Guidelines in determining +the consequences for any action they deem in violation of this Code of Conduct: + +### 1. Correction + +**Community Impact**: Use of inappropriate language or other behavior deemed +unprofessional or unwelcome in the community. + +**Consequence**: A private, written warning from community leaders, providing +clarity around the nature of the violation and an explanation of why the +behavior was inappropriate. A public apology may be requested. + +### 2. Warning + +**Community Impact**: A violation through a single incident or series +of actions. + +**Consequence**: A warning with consequences for continued behavior. No +interaction with the people involved, including unsolicited interaction with +those enforcing the Code of Conduct, for a specified period of time. This +includes avoiding interactions in community spaces as well as external channels +like social media. Violating these terms may lead to a temporary or +permanent ban. + +### 3. Temporary Ban + +**Community Impact**: A serious violation of community standards, including +sustained inappropriate behavior. + +**Consequence**: A temporary ban from any sort of interaction or public +communication with the community for a specified period of time. No public or +private interaction with the people involved, including unsolicited interaction +with those enforcing the Code of Conduct, is allowed during this period. +Violating these terms may lead to a permanent ban. + +### 4. 
Permanent Ban + +**Community Impact**: Demonstrating a pattern of violation of community +standards, including sustained inappropriate behavior, harassment of an +individual, or aggression toward or disparagement of classes of individuals. + +**Consequence**: A permanent ban from any sort of public interaction within +the community. + +## Attribution + +This Code of Conduct is adapted from the [Contributor Covenant][homepage], +version 2.0, available at +https://www.contributor-covenant.org/version/2/0/code_of_conduct.html. + +Community Impact Guidelines were inspired by [Mozilla's code of conduct +enforcement ladder](https://github.com/mozilla/diversity). + +[homepage]: https://www.contributor-covenant.org + +For answers to common questions about this code of conduct, see the FAQ at +https://www.contributor-covenant.org/faq. Translations are available at +https://www.contributor-covenant.org/translations. diff --git a/README.md b/README.md index 0be1e7800..a91b3e7c5 100644 --- a/README.md +++ b/README.md @@ -8,7 +8,7 @@ | Traces | Technical Report | Blog - | Slack + | Slack

@@ -18,6 +18,7 @@ [![PyPI - Downloads](https://img.shields.io/pypi/dm/mooncake-transfer-engine)](https://pypi.org/project/mooncake-transfer-engine) [![Ask DeepWiki](https://deepwiki.com/badge.svg)](https://deepwiki.com/kvcache-ai/Mooncake) [![GitHub commit activity](https://img.shields.io/github/commit-activity/w/kvcache-ai/Mooncake)](https://github.com/kvcache-ai/Mooncake/graphs/commit-activity) + [![license](https://img.shields.io/github/license/kvcache-ai/mooncake.svg)](https://github.com/kvcache-ai/Mooncake/blob/main/LICENSE-APACHE)
diff --git a/doc/en/mooncake-store.md b/doc/en/mooncake-store.md index 30a2a903e..02827b4b6 100644 --- a/doc/en/mooncake-store.md +++ b/doc/en/mooncake-store.md @@ -488,6 +488,18 @@ There are two startup parameters in `master_service` related to the soft pin mec Notably, soft pinned objects can still be removed using APIs such as `Remove` or `RemoveAll`. +### Zombie Object Cleanup + +If a Client crashes or experiences a network failure after sending a `PutStart` request but before it can send the corresponding `PutEnd` or `PutRevoke` request to the Master, the object initiated by `PutStart` enters a "zombie" state—rendering it neither usable nor deletable. The existence of such "zombie objects" not only consumes storage space but also prevents subsequent `Put` operations on the same keys. To mitigate these issues, the Master records the start time of each `PutStart` request and employs two timeout thresholds—`put_start_discard_timeout` and `put_start_release_timeout`—to clean up zombie objects. + +#### `PutStart` Preemption + +If an object receives neither a `PutEnd` nor a `PutRevoke` request within `put_start_discard_timeout` (default: 30 seconds) after its `PutStart`, any subsequent `PutStart` request for the same object will be allowed to "preempt" the previous `PutStart`. This enables the new request to proceed with writing the object, thereby preventing a single faulty Client from permanently blocking access to that object. Note that during such preemption, the storage space allocated by the old `PutStart` is not reused; instead, new space is allocated for the preempting `PutStart`. The space previously allocated by the old `PutStart` will be reclaimed via the mechanism described below. + +#### Space Reclaim + +Replica space allocated during a `PutStart` is considered releasable by the Master if the write operation is neither completed (via `PutEnd`) nor canceled (via `PutRevoke`) within `put_start_release_timeout` (default: 10 minutes) after the `PutStart`. 
When object eviction is triggered—either due to allocation failures or because storage utilization exceeds the configured threshold—these releasable replica spaces are prioritized for release to reclaim storage capacity. + ### Preferred Segment Allocation Mooncake Store provides a **preferred segment allocation** feature that allows users to specify a preferred storage segment (node) for object allocation. This feature is particularly useful for optimizing data locality and reducing network overhead in distributed scenarios. diff --git a/doc/en/transfer-engine.md b/doc/en/transfer-engine.md index 6ccacd92f..4f9bad7e7 100644 --- a/doc/en/transfer-engine.md +++ b/doc/en/transfer-engine.md @@ -443,3 +443,4 @@ For advanced users, TransferEngine provides the following advanced runtime optio - `MC_MIN_PRC_PORT` Specifies the minimum port number for RPC service. The default value is 15000. - `MC_MAX_PRC_PORT` Specifies the maximum port number for RPC service. The default value is 17000. - `MC_PATH_ROUNDROBIN` Use round-robin mode in the RDMA path selection. This may be beneficial for transferring large bulks. +- `MC_ENDPOINT_STORE_TYPE` Choose FIFO Endpoint Store (`FIFO`) or Sieve Endpoint Store (`SIEVE`), default is `SIEVE`. diff --git a/doc/en/troubleshooting.md b/doc/en/troubleshooting.md index 10cad63a7..bd7d5bf90 100644 --- a/doc/en/troubleshooting.md +++ b/doc/en/troubleshooting.md @@ -64,6 +64,13 @@ In addition, if the error `Failed to get description of XXX` is displayed, it in ## SGLang Common Questions +### Do I need RDMA to run SGLang and Mooncake? + +When using Mooncake for KV cache transfer in SGLang PD disaggregation deployments, GPUDirect RDMA (GDR) is required. + +When using Mooncake as a KV cache storage backend in SGLang HiCache, RDMA is recommended for better performance. +However, if RDMA NICs are not available, the TCP protocol is also supported. + ### How to make sure GPUDirect RDMA (GDR) is supported 1. 
Verify the presence of an RDMA-capable NIC (e.g., Mellanox, ERDMA) and drivers. @@ -84,7 +91,7 @@ lsmod | grep peer_mem lsmod | grep nvidia_peer_mem ``` -3. If you use container to run SGLang, please make sure RDMA and GDR driver are installed in the container and run container in previledge mode. Requirements: (1) privileged mode must be enabled. (2) RDMA devices/NVIDIA devices mounted into container +3. If you use container to run SGLang, please make sure RDMA and GDR driver are installed in the container and run container in privileged mode. Requirements: (1) privileged mode must be enabled. (2) RDMA devices/NVIDIA devices mounted into container 4. Check the connectivity Benchmark end-to-end performance using ib_write_bw. @@ -93,7 +100,6 @@ apt install perftest # server side ib_write_bw -d [rdma_device] -R -x gdr # client side -# server side ib_write_bw -d [rdma_device] -R -x gdr [server_ip] ``` Expected Output: diff --git a/doc/zh/mooncake-store.md b/doc/zh/mooncake-store.md index 1568eb60c..1801dfe9d 100644 --- a/doc/zh/mooncake-store.md +++ b/doc/zh/mooncake-store.md @@ -492,6 +492,18 @@ virtual std::unique_ptr Allocate( 被软固定的对象仍然可以通过 `Remove`、`RemoveAll` 等 API 主动删除。 +### 僵尸对象清理机制 + +如果Client因为进程崩溃或网络故障等原因,在发送完`PutStart`请求后无法向Master发送对应的`PutEnd`或`PutRevoke`请求,就会导致`PutStart`的对象处于无法使用也无法删除的“僵尸”状态。“僵尸对象”的存在不仅会占用存储空间,还会导致后续对相同key的`Put`操作无法进行。为了避免这些问题,Master会记录每个对象`PutStart`请求的开始时间,并基于两个超时时间:`put_start_discard_timeout`和`put_start_release_timeout`,对僵尸对象进行清理。 + +#### `PutStart`顶替 + +如果一个对象在`PutStart`后的`put_start_discard_timeout`(默认为30秒)时间内没有收到任何的`PutEnd`或是`PutRevoke`请求,那么后续新来的对该对象的`PutStart`操作将能够“顶替”旧的`PutStart`操作,继续进行对该对象的写入,从而避免单个Client的故障导致一些对象永远无法使用。需要注意的是,在发生“顶替”时,不会复用旧`PutStart`分配的空间,而是会重新分配空间供新`PutStart`使用,旧`PutStart`分配的空间将通过下述机制进行回收。 + +#### 空间回收 + +在`PutStart`中为对象分配的副本空间,如果在`PutStart`后的`put_start_release_timeout`(默认为10分钟)时间内没有完成写入(收到`PutEnd`)或被撤销(收到`PutRevoke`),将会被Master视为是可释放的。在因空间分配失败或空间使用率高于设定水位而触发对象淘汰时,这些可释放的对象副本空间将会被优先释放以回收存储空间。 + ### 首选段分配 Mooncake 
Store 提供了**首选段分配**功能,允许用户为对象分配指定首选的存储段(节点)。此功能特别适用于优化数据局部性和减少分布式场景中的网络开销。 diff --git a/doc/zh/transfer-engine.md b/doc/zh/transfer-engine.md index 97341601f..ba9437536 100644 --- a/doc/zh/transfer-engine.md +++ b/doc/zh/transfer-engine.md @@ -415,4 +415,5 @@ int init(const std::string &metadata_conn_string, - `MC_FORCE_TCP` 强制使用 TCP 作为主要传输方式,无论是否安装了有效的 RDMA 网卡 - `MC_MIN_PRC_PORT` 指定 RPC 服务使用的最小端口号。默认值为 15000。 - `MC_MAX_PRC_PORT` 指定 RPC 服务使用的最大端口号。默认值为 17000。 -- `MC_PATH_ROUNDROBIN` 指定 RDMA 路径选择使用 Round Robin 模式,这对于传输大块数据可能有利。 \ No newline at end of file +- `MC_PATH_ROUNDROBIN` 指定 RDMA 路径选择使用 Round Robin 模式,这对于传输大块数据可能有利。 +- `MC_ENDPOINT_STORE_TYPE` 选择 FIFO Endpoint Store (`FIFO`) 或者 Sieve Endpoint Store (`SIEVE`),默认是 `SIEVE`。 \ No newline at end of file diff --git a/docs/source/design/mooncake-store.md b/docs/source/design/mooncake-store.md index b78a604c8..17a252914 100644 --- a/docs/source/design/mooncake-store.md +++ b/docs/source/design/mooncake-store.md @@ -488,6 +488,18 @@ There are two startup parameters in `master_service` related to the soft pin mec Notably, soft pinned objects can still be removed using APIs such as `Remove` or `RemoveAll`. +### Zombie Object Cleanup + +If a Client crashes or experiences a network failure after sending a `PutStart` request but before it can send the corresponding `PutEnd` or `PutRevoke` request to the Master, the object initiated by `PutStart` enters a "zombie" state—rendering it neither usable nor deletable. The existence of such "zombie objects" not only consumes storage space but also prevents subsequent `Put` operations on the same keys. To mitigate these issues, the Master records the start time of each `PutStart` request and employs two timeout thresholds—`put_start_discard_timeout` and `put_start_release_timeout`—to clean up zombie objects. 
+ +#### `PutStart` Preemption + +If an object receives neither a `PutEnd` nor a `PutRevoke` request within `put_start_discard_timeout` (default: 30 seconds) after its `PutStart`, any subsequent `PutStart` request for the same object will be allowed to "preempt" the previous `PutStart`. This enables the new request to proceed with writing the object, thereby preventing a single faulty Client from permanently blocking access to that object. Note that during such preemption, the storage space allocated by the old `PutStart` is not reused; instead, new space is allocated for the preempting `PutStart`. The space previously allocated by the old `PutStart` will be reclaimed via the mechanism described below. + +#### Space Reclaim + +Replica space allocated during a `PutStart` is considered releasable by the Master if the write operation is neither completed (via `PutEnd`) nor canceled (via `PutRevoke`) within `put_start_release_timeout` (default: 10 minutes) after the `PutStart`. When object eviction is triggered—either due to allocation failures or because storage utilization exceeds the configured threshold—these releasable replica spaces are prioritized for release to reclaim storage capacity. + ### Preferred Segment Allocation Mooncake Store provides a **preferred segment allocation** feature that allows users to specify a preferred storage segment (node) for object allocation. This feature is particularly useful for optimizing data locality and reducing network overhead in distributed scenarios. 
diff --git a/docs/source/getting_started/examples/sglang-integration/hicache-integration-v1.md b/docs/source/getting_started/examples/sglang-integration/hicache-integration-v1.md index 02d9cde19..806a67c70 100644 --- a/docs/source/getting_started/examples/sglang-integration/hicache-integration-v1.md +++ b/docs/source/getting_started/examples/sglang-integration/hicache-integration-v1.md @@ -102,28 +102,43 @@ For more details, please refer to [Mooncake official installation guide](https:/ **Mooncake** is a distributed system that efficiently aggregates memory resources across multiple servers. It can also be deployed on a single server for simpler setups. -When integrated with **SGLang**, the system conceptually consists of four key components: `the master service`, `metadata service`, `store service`, and the `SGLang server`. Among them, the `master service` and `metadata service` are responsible for object and metadata maintenance. The `store service` manages a contiguous memory segment that contributes to the distributed KV cache, making its memory accessible to both local and remote `SGLang servers`. Data transfer occurs directly between the `store service` and `SGLang servers`, bypassing the `master service`. +When integrated with **SGLang**, the system conceptually consists of four key components: `the master service`, `metadata service` (Optional), `store service` (Optional), and the `SGLang server`. Among them, the `master service` and `metadata service` are responsible for object and metadata maintenance. The `store service` manages a contiguous memory segment that contributes to the distributed KV cache, making its memory accessible to both local and remote `SGLang servers`. Data transfer occurs directly between the `store service` and `SGLang servers`, bypassing the `master service`. 
### Single Server Deployment -**Launch Mooncake `metadata service`:** +**Launch Mooncake `metadata service` (Optional):** ```bash python -m mooncake.http_metadata_server ``` +This service is responsible for centralized metadata management including internal connection status and related metadata. + +Deployment of the `metadata service` can be skipped in the following cases: +* Mooncake supports non-centralized metadata management via a P2P handshake mechanism to exchange metadata. When using this mode, deployment of the `metadata service` can be skipped. +* Mooncake also supports embedding `metadata service` into `master service`. In this case, only the `master service` needs to be started. + **Launch Mooncake `master service`:** +The `master service` orchestrates the logical storage space pool across the entire cluster, managing KV cache space allocation and eviction. + +To start `mooncake_master`: + ```bash mooncake_master --eviction_high_watermark_ratio=0.95 ``` +To start `mooncake_master` with embedded `metadata service` (so that a separate `metadata service` deployment can be skipped): + +```bash +mooncake_master --enable_http_metadata_server=true --http_metadata_server_port=8080 --eviction_high_watermark_ratio=0.95 +``` + **Understanding `eviction_high_watermark_ratio`:** When a `PutStart` request fails due to insufficient memory, or when the eviction thread detects that space usage has reached the configured high watermark ratio, an eviction task is triggered to free up space by evicting a portion of objects. -Due to memory fragmentation, allocation failures may occur even when memory usage has not yet reached 100%. The actual threshold depends on the workload. This [benchmark document](https://kvcache-ai.github.io/Mooncake/performance/allocator-benchmark-result.html) - provides memory allocation efficiency results under different scenarios. if excessive allocation failures are observed, consider lowering this parameter accordingly. 
+Due to memory fragmentation, allocation failures may occur even when memory usage has not yet reached 100%. The actual threshold depends on the workload. This [benchmark document](https://kvcache-ai.github.io/Mooncake/performance/allocator-benchmark-result.html) provides memory allocation efficiency results under different scenarios. if excessive allocation failures are observed, consider lowering this parameter accordingly. **Launch Mooncake `store service` (Optional):** @@ -132,88 +147,125 @@ First, create and save a configuration file in JSON format. For example: ```json { "local_hostname": "localhost", - "metadata_server": "http://localhost:8080/metadata", - "master_server_address": "localhost:50051", + "metadata_server": "http://127.0.0.1:8080/metadata", + "master_server_address": "127.0.0.1:50051", "protocol": "rdma", - "device_name": "mlx5_0,mlx5_1", - "global_segment_size": 2684354560, + "device_name": "", + "global_segment_size": "4gb", "local_buffer_size": 0 } ``` -Parameter Explanation: +Note: If the `metadata service` is not deployed, set this field to: -* `local_hostname`: The hostname of the `store service`. -* `metadata_server`: The network address of the `metadata service`. The default port is 8080. -* `master_server_address`: The network address of the `master service`. The default port is 50051. -* `protocol`: The protocol used by the Mooncake. Supported values are `"rdma"` or `"tcp"`. For optimal performance, `"rdma"` is recommended. -* `device_name`: The RDMA devices used by Mooncake. This parameter is required only when the protocol is set to `"rdma"`. Available devices can be listed using the `ibv_devices` command. -* `global_segment_size`: The amount of memory (in bytes) contributed to the global memory pool. A larger value allows Mooncake to cache more KV tensors. -* `local_buffer_size`: Local buffer is used to do request operations such as `Get` or `Put`. 
In this case, it is set to 0 because the instance functions solely as a storage server, contributing memory to the global pool without issuing any request operations. +```json + "metadata_server": "P2PHANDSHAKE", +``` Then start the `store service`: ```bash -python -m mooncake.mooncake_store_service --config=[config_path] +python -m mooncake.mooncake_store_service --config=[config_path] --port=8081 ``` Mooncake `store service` configuration can also be provided via environment variables: ```bash +MOONCAKE_LOCAL_HOSTNAME="localhost" \ MOONCAKE_TE_META_DATA_SERVER="http://127.0.0.1:8080/metadata" \ -MOONCAKE_GLOBAL_SEGMENT_SIZE=4294967296 \ +MOONCAKE_MASTER="127.0.0.1:50051" \ MOONCAKE_PROTOCOL="rdma" \ -MOONCAKE_DEVICE="erdma_0,erdma_1" \ -MOONCAKE_MASTER=127.0.0.1:50051 \ -python -m mooncake.mooncake_store_service +MOONCAKE_DEVICE="" \ +MOONCAKE_GLOBAL_SEGMENT_SIZE="4gb" \ +MOONCAKE_LOCAL_BUFFER_SIZE=0 \ +python -m mooncake.mooncake_store_service --port=8081 ``` +**Parameter Explanation:** + +* `local_hostname`, `MOONCAKE_LOCAL_HOSTNAME`: The hostname of the `store service`. +* `metadata_server`, `MOONCAKE_TE_META_DATA_SERVER` : The network address of the `metadata service`. The default port is 8080. If the `metadata service` is not deployed, set this field to: `"metadata_server": "P2PHANDSHAKE"`. +* `master_server_address`, `MOONCAKE_MASTER`: The network address of the `master service`. The default port is 50051. +* `protocol`, `MOONCAKE_PROTOCOL`: The protocol used by Mooncake. Supported values are `"rdma"` or `"tcp"`. For optimal performance, `"rdma"` is recommended. +* `device_name`, `MOONCAKE_DEVICE`: The RDMA devices used by Mooncake. This field can usually be left empty, as Mooncake automatically discovers available NICs by default. This parameter is required only when the protocol is set to `"rdma"` **and** a specific set of NICs needs to be used. Example: `"device_name": "mlx5_0,mlx5_1"`. To list available devices, run `ibv_devices`. 
**Note:** If the environment variable `MC_MS_AUTO_DISC` is set to `1`, any `device_name` or `MOONCAKE_DEVICE` configuration will be overridden, and Mooncake will switch to auto-discovery mode. +* `global_segment_size`, `MOONCAKE_GLOBAL_SEGMENT_SIZE`: The amount of memory contributed to the global memory pool. Accepts either bytes (integer) or a string with the `gb` suffix, e.g., `"4294967296"` or `"4gb"`. A larger value allows Mooncake to cache more KV tensors. +* `local_buffer_size`, `MOONCAKE_LOCAL_BUFFER_SIZE`: Local buffer is used to do request operations such as `Get` or `Put`. In this case, it is set to 0 because the instance functions solely as a storage server, contributing memory to the global pool without issuing any request operations. + +**Important: Understanding Global Segment Size** + +`global_segment_size` and `MOONCAKE_GLOBAL_SEGMENT_SIZE`: This parameter specifies the amount of memory each instance contributes to the distributed memory pool. The total memory available for KV cache storage across the cluster is the sum of the memory contributed by all instances. + +Adjust this value according to system’s available memory and expected cache requirements. Note: If `MOONCAKE_GLOBAL_SEGMENT_SIZE` is set to a non-zero value when starting the `SGLang server`, launching the `store service` can be skipped. In this case, the `SGLang server` also takes on the role of the `store service`, which simplifies deployment but couples the two components together. Users can choose the deployment approach that best fits their needs. **Start the `SGLang server` with Mooncake enabled:** -Mooncake configuration can be provided via environment variables. Note that, for optimal performance, the Mooncake backend currently supports only the `page_first` layout (which optimizes memory access patterns for KV cache operations). +There are three ways to configure Mooncake: -There are two ways to configure Mooncake: 1. Using environment variables; 2. 
Using extra-config of sglang arguments. +1. Via extra configuration passed through sglang parameters +2. Using JSON configuration files +3. Using environment variables -**Using env variables to configure Mooncake** +Mooncake loads configuration in the following priority order: + +1. If Mooncake-specific options are provided in `--hicache-storage-backend-extra-config`, they are used first. +2. If not, Mooncake checks whether the environment variable `SGLANG_HICACHE_MOONCAKE_CONFIG_PATH` is set, and loads the JSON config file from that path. +3. If neither of the above is provided, Mooncake falls back to environment variables. + +**Using extra-config of sglang arguments to configure Mooncake** ```bash -MOONCAKE_TE_META_DATA_SERVER="http://127.0.0.1:8080/metadata" \ -MOONCAKE_MASTER=127.0.0.1:50051 \ -MOONCAKE_PROTOCOL="rdma" \ -MOONCAKE_DEVICE="mlx5_0,mlx5_1" \ -MOONCAKE_GLOBAL_SEGMENT_SIZE=4294967296 \ python -m sglang.launch_server \ --enable-hierarchical-cache \ - --hicache-storage-backend mooncake\ - --model-path [model_path] + --hicache-storage-backend mooncake \ + --model-path [model_path] \ + --hicache-storage-backend-extra-config '{"master_server_address": "127.0.0.1:50051", "local_hostname": "localhost", "metadata_server": "http://127.0.0.1:8080/metadata", "global_segment_size": "4gb", "protocol": "rdma", "device_name": ""}' ``` -Parameter Explanation: - -* `MOONCAKE_TE_META_DATA_SERVER`: The network address of the `metadata service`. The default port is 8080. -* `MOONCAKE_MASTER`: The network address of the `master service`. The default port is 50051. -* `MOONCAKE_PROTOCOL`: The protocol used by Mooncake. Supported values are `"rdma"` or `"tcp"`. For optimal performance, `"rdma"` is recommended. -* `MOONCAKE_DEVICE`: The RDMA devices used by Mooncake. This parameter is required only when the protocol is set to `"rdma"`. Available devices can be listed using the `ibv_devices` command. 
-* `MOONCAKE_GLOBAL_SEGMENT_SIZE`: The amount of memory (in bytes) contributed to the global memory pool. If at least one `store service` is launched, then this value could be set to `0`. In this case, the `SGLang server` will not contribute any memory to the system. Note that KV tensors cached in the contributed memory will be lost once this process terminates; however, this will not cause any system errors. +**Using JSON file to configure Mooncake** -**Using extra-config of sglang arguments to configure Mooncake** +SGLang server can load Mooncake config from `SGLANG_HICACHE_MOONCAKE_CONFIG_PATH`. ```bash +export SGLANG_HICACHE_MOONCAKE_CONFIG_PATH=/sgl-workspace/sglang/benchmark/hicache/mooncake_config.json + +echo '{ + "local_hostname": "localhost", + "metadata_server": "http://127.0.0.1:8080/metadata", + "master_server_address": "127.0.0.1:50051", + "protocol": "rdma", + "device_name": "", + "global_segment_size": "4gb" +}' > ${SGLANG_HICACHE_MOONCAKE_CONFIG_PATH} + python -m sglang.launch_server \ --enable-hierarchical-cache \ --hicache-storage-backend mooncake \ - --model-path [model_path] \ - --hicache-storage-backend-extra-config '{"master_server_address": "127.0.0.1:50051", "local_hostname": "localhost", "metadata_server": "http://127.0.0.1:8080/metadata", "global_segment_size": 4294967296, "local_buffer_size": 16777216, "protocol": "rdma", "device_name": "mlx5_0,mlx5_1"}' + --model-path [model_path] ``` -**Important: Understanding Global Segment Size** +**Using env variables to configure Mooncake** -`global_segment_size` for `store service` and `MOONCAKE_GLOBAL_SEGMENT_SIZE` for `SGLang service`: This parameter specifies the amount of memory each instance contributes to the distributed memory pool. The total memory available for KV cache storage across the cluster is the sum of the memory contributed by all instances. 
+```bash +MOONCAKE_TE_META_DATA_SERVER="http://127.0.0.1:8080/metadata" \ +MOONCAKE_MASTER="127.0.0.1:50051" \ +MOONCAKE_PROTOCOL="rdma" \ +MOONCAKE_DEVICE="" \ +MOONCAKE_GLOBAL_SEGMENT_SIZE="4gb" \ +python -m sglang.launch_server \ + --enable-hierarchical-cache \ + --hicache-storage-backend mooncake\ + --model-path [model_path] +``` -Adjust this value according to system’s available memory and expected cache requirements. +**Parameter Explanation:** + +The Mooncake parameters used here are essentially the same as those configured for the `store service`. + +In particular, for the `global segment size`, if at least one `store service` instance is running, this value can be set to `0`. In this case, the SGLang server will not contribute any memory to the system. Note that KV tensors stored in this contributed memory will be lost when the process exits; however, this will **not** cause any system errors. + +**Important:** when `tp > 1`, each Tensor Parallel (TP) rank launches its own Mooncake backend instance and contributes `1/tp` of the `global_segment_size` memory. Therefore, the total memory consumption equals `global segment size`. **HiCache Related Parameters for SGLang Server** @@ -281,11 +333,23 @@ python -m sglang_router.launch_router \ ## Troubleshooting -**RDMA Registration Failure**: +**RDMA Registration Failure:** * In some environments, RDMA registration may require root privileges. In this case, try running the program as root. * In certain environments (e.g., eRDMA), there is an upper limit on the total amount of RDMA memory that can be registered. Once this limit is exceeded, registration will fail. To resolve this, you can lower the value of `MOONCAKE_GLOBAL_SEGMENT_SIZE`, or reduce the host memory allocated to HiCache in the `SGLang server` (since this memory is fully registered with RDMA to enable zero-copy). 
+**HiCache CPU Memory Usage:** + +When using HiCache, the default L2 host DRAM (CPU memory) size for KV cache is **2 times** the size of the L1 device memory (GPU memory) for KV cache. + +If the model is small but the GPU memory is large — especially in multi-TP (tensor parallel) setups — this may cause the L1 KV cache to become very large, which in turn can consume excessive CPU DRAM. + +In such cases, you should manually configure an appropriate L2 cache size based on your hardware. This can be done by setting `--hicache-ratio` or `--hicache-size`. + +**More Information:** + +Additional troubleshooting information can be found [here](https://kvcache-ai.github.io/Mooncake/troubleshooting/troubleshooting.html). + ## Test Mooncake Store This test is intended for developers to quickly verify that the MooncakeStore class interfaces are functioning correctly. @@ -296,7 +360,6 @@ First, start the `metadata service` and `master service`. Then run the `test_moo MOONCAKE_TE_META_DATA_SERVER="http://127.0.0.1:8080/metadata" \ MOONCAKE_MASTER=127.0.0.1:50051 \ MOONCAKE_PROTOCOL="rdma" \ -MOONCAKE_DEVICE="mlx5_0,mlx5_1" \ MOONCAKE_GLOBAL_SEGMENT_SIZE=16777216 \ python3 [path of test_mooncake_store.py] ``` diff --git a/docs/source/http-api-reference/http-service.md b/docs/source/http-api-reference/http-service.md new file mode 100644 index 000000000..578b65408 --- /dev/null +++ b/docs/source/http-api-reference/http-service.md @@ -0,0 +1,126 @@ +# Mooncake Store HTTP Service + +The Mooncake Store HTTP Service provides RESTful endpoints for cluster management, monitoring, and data operations. This service is embedded within the `mooncake_master` process and can be enabled alongside the primary RPC services. 
+ +## Overview + +The HTTP service serves multiple purposes: +- **Metrics & Monitoring**: Prometheus-compatible metrics endpoints +- **Cluster Management**: Query and manage distributed storage segments +- **Data Inspection**: Examine stored objects and their replicas +- **Health Checks**: Service availability and status verification + +## HTTP Endpoints + +### Metrics Endpoints + +#### `/metrics` +Prometheus-compatible metrics endpoint providing detailed system metrics in text format. + +**Method**: `GET` +**Content-Type**: `text/plain; version=0.0.4` +**Response**: Comprehensive metrics including request counts, error rates, latency statistics, and resource utilization + +**Example**: +```bash +curl http://localhost:8080/metrics +``` + +#### `/metrics/summary` +Human-readable metrics summary with key performance indicators. + +**Method**: `GET` +**Content-Type**: `text/plain; version=0.0.4` +**Response**: Condensed overview of system health and performance metrics + +**Example**: +```bash +curl http://localhost:8080/metrics/summary +``` + +### Data Management Endpoints + +#### `/query_key` +Retrieve replica information for a specific key, including memory locations and transport endpoints. + +**Method**: `GET` +**Parameters**: `key` (query parameter) - The object key to query +**Content-Type**: `text/plain; version=0.0.4` +**Response**: JSON-formatted replica descriptors for memory replicas + +**Example**: +```bash +curl "http://localhost:8080/query_key?key=my_object" +``` + +**Response Format**: +```json +{ + "transport_endpoint_": "hostname:port", + "buffer_descriptors": [...] +} +``` + +#### `/get_all_keys` +List all keys currently stored in the distributed system. 
+ +**Method**: `GET` +**Content-Type**: `text/plain; version=0.0.4` +**Response**: Newline-separated list of all stored keys + +**Example**: +```bash +curl http://localhost:8080/get_all_keys +``` + +### Segment Management Endpoints + +#### `/get_all_segments` +List all mounted segments in the cluster. + +**Method**: `GET` +**Content-Type**: `text/plain; version=0.0.4` +**Response**: Newline-separated list of segment names + +**Example**: +```bash +curl http://localhost:8080/get_all_segments +``` + +#### `/query_segment` +Query detailed information about a specific segment, including used and available capacity. + +**Method**: `GET` +**Parameters**: `segment` (query parameter) - Segment name to query +**Content-Type**: `text/plain; version=0.0.4` +**Response**: Multi-line text with segment details + +**Example**: +```bash +curl "http://localhost:8080/query_segment?segment=segment_name" +``` + +**Response Format**: +``` +segment_name +Used(bytes): 1073741824 +Capacity(bytes): 4294967296 +``` + +### Health Check Endpoints + +#### `/health` +Basic health check endpoint for service availability verification. 
+ +**Method**: `GET` +**Content-Type**: `text/plain; version=0.0.4` +**Response**: `OK` when service is healthy +**Status Codes**: +- `200 OK`: Service is healthy +- Other: Service may be experiencing issues + +**Example**: +```bash +curl http://localhost:8080/health +``` + diff --git a/docs/source/index.md b/docs/source/index.md index e09185d59..4014d7c7e 100644 --- a/docs/source/index.md +++ b/docs/source/index.md @@ -86,6 +86,7 @@ performance/allocator-benchmark-result.md python-api-reference/mooncake-store python-api-reference/transfer-engine +http-api-reference/http-service ::: % Explanation of Mooncake internals diff --git a/docs/source/troubleshooting/troubleshooting.md b/docs/source/troubleshooting/troubleshooting.md index 3f04a7dd5..e6c50f595 100644 --- a/docs/source/troubleshooting/troubleshooting.md +++ b/docs/source/troubleshooting/troubleshooting.md @@ -71,3 +71,47 @@ If the network state is unstable, some requests may not be delivered, displaying Note: In most cases, the errors output, except for the first occurrence, are `work request flushed error`. This is because when the first error occurs, the RDMA driver sets the connection to an unavailable state, so tasks in the submission queue are blocked from execution and subsequent errors are reported. Therefore, it is recommended to locate the first occurrence of the error and check it. In addition, if the error `Failed to get description of XXX` is displayed, it indicates that the Segment name input by the user when calling the `openSegment` interface cannot be found in the etcd database. For memory read/write scenarios, the Segment name needs to strictly match the `local_hostname` field filled in by the other node during initialization. + +## SGLang Common Questions + +### Do I need RDMA to run SGLang and Mooncake? + +When using Mooncake for KV cache transfer in SGLang PD disaggregation deployments, GPUDirect RDMA (GDR) is required. 
+ +When using Mooncake as a KV cache storage backend in SGLang HiCache, RDMA is recommended for better performance. +However, if RDMA NICs are not available, the TCP protocol is also supported. + +### How to make sure GPUDirect RDMA (GDR) is supported + +1. Verify the presence of an RDMA-capable NIC (e.g., Mellanox, ERDMA) and drivers. +``` +ibv_devices +lspci | grep rdma +lsmod | grep -E 'ib_core|mlx4_core|mlx5_core|nvidia_peer_mem' +``` +If no RDMA devices appear: (1) Confirm physical NIC presence via lspci +(2) Install vendor-specific drivers (e.g., Mellanox MLNX_OFED) + +2. Check that the GDR driver is ready; the peer_memory module (part of MLNX_OFED) should be installed +``` +# Check peer_memory module (from MLNX_OFED) +lsmod | grep peer_mem + +# Verify NVIDIA peer memory module +lsmod | grep nvidia_peer_mem +``` + +3. If you use a container to run SGLang, please make sure the RDMA and GDR drivers are installed in the container and run the container in privileged mode. Requirements: (1) privileged mode must be enabled. (2) RDMA devices/NVIDIA devices mounted into the container + +4. Check the connectivity +Benchmark end-to-end performance using ib_write_bw. +``` +apt install perftest +# server side +ib_write_bw -d [rdma_device] -R -x gdr +# client side +ib_write_bw -d [rdma_device] -R -x gdr [server_ip] +``` +Expected Output: +Successful bidirectional transfer with "BW peak" reported +Errors with -x gdr indicate GDR setup failures diff --git a/mooncake-common/common.cmake b/mooncake-common/common.cmake index a35104d38..d12bdbde3 100644 --- a/mooncake-common/common.cmake +++ b/mooncake-common/common.cmake @@ -74,6 +74,7 @@ option(WITH_RUST_EXAMPLE "build the Rust interface and sample code for the trans option(WITH_METRICS "enable metrics and metrics reporting thread" ON) option(USE_3FS "option for using 3FS storage backend" OFF) option(WITH_NVIDIA_PEERMEM "disable to support RDMA without nvidia-peermem. If WITH_NVIDIA_PEERMEM=OFF then USE_CUDA=ON is required."
ON) +option(USE_EVENT_DRIVEN_COMPLETION "option for using event-driven completion (store & transfer engine)" OFF) option(USE_LRU_MASTER "option for using LRU in master service" OFF) set(LRU_MAX_CAPACITY 1000) @@ -83,6 +84,12 @@ if (USE_LRU_MASTER) add_compile_definitions(LRU_MAX_CAPACITY) endif() +if (USE_EVENT_DRIVEN_COMPLETION) + add_compile_definitions(USE_EVENT_DRIVEN_COMPLETION) + message(STATUS "Event-driven completion is enabled") +else() + message(STATUS "Event-driven completion is disabled") +endif() if (USE_NVMEOF) set(USE_CUDA ON) diff --git a/mooncake-common/src/CMakeLists.txt b/mooncake-common/src/CMakeLists.txt index 13dc70d67..63bb25d05 100644 --- a/mooncake-common/src/CMakeLists.txt +++ b/mooncake-common/src/CMakeLists.txt @@ -10,4 +10,8 @@ add_library(mooncake_common target_link_libraries(mooncake_common PUBLIC yaml-cpp jsoncpp -) \ No newline at end of file +) + +if (BUILD_SHARED_LIBS) + install(TARGETS mooncake_common DESTINATION lib) +endif() diff --git a/mooncake-ep/include/mooncake_backend.h b/mooncake-ep/include/mooncake_backend.h index 47f9faff5..b012eb6f4 100644 --- a/mooncake-ep/include/mooncake_backend.h +++ b/mooncake-ep/include/mooncake_backend.h @@ -8,8 +8,6 @@ namespace mooncake { -std::string getCudaTopologyJson(const std::vector& filter); - class MooncakeBackend final : public ::c10d::Backend { public: struct MooncakeBackendOptions final : ::c10d::Backend::Options { @@ -63,7 +61,6 @@ class MooncakeBackend final : public ::c10d::Backend { static void setHostIp(const std::string& hostIp) { hostIp_ = hostIp; } static void setDeviceFilter(std::vector filters) { - hca_filters_ = filters; engine_.setWhitelistFilters(std::move(filters)); } @@ -75,7 +72,6 @@ class MooncakeBackend final : public ::c10d::Backend { private: static TransferEngine engine_; static Transport* transport_; - static std::vector hca_filters_; static int backendIndex_; bool isCpu_{false}; static std::string hostIp_; diff --git a/mooncake-ep/src/CMakeLists.txt 
b/mooncake-ep/src/CMakeLists.txt index cfbda215e..be988ca5b 100644 --- a/mooncake-ep/src/CMakeLists.txt +++ b/mooncake-ep/src/CMakeLists.txt @@ -1,4 +1,4 @@ -add_library(mooncake_ep mooncake_backend.cpp mooncake_cuda_topology.cpp mooncake_ep_buffer.cpp mooncake_ep_kernel.cu mooncake_worker.cu mooncake_worker_thread.cpp mooncake_ibgda/mlx5gda.cpp) +add_library(mooncake_ep mooncake_backend.cpp mooncake_ep_buffer.cpp mooncake_ep_kernel.cu mooncake_worker.cu mooncake_worker_thread.cpp mooncake_ibgda/mlx5gda.cpp) set_target_properties(mooncake_ep PROPERTIES POSITION_INDEPENDENT_CODE ON) target_link_libraries(mooncake_ep PUBLIC ${TORCH_LIBRARIES} transfer_engine ibverbs mlx5) diff --git a/mooncake-ep/src/mooncake_backend.cpp b/mooncake-ep/src/mooncake_backend.cpp index 0d56a8d28..707ab3fca 100644 --- a/mooncake-ep/src/mooncake_backend.cpp +++ b/mooncake-ep/src/mooncake_backend.cpp @@ -16,9 +16,8 @@ constexpr const char* SPARSE_ERROR_MSG = "Sparse op not supported."; constexpr const char* REDUCE_DTYPE_ERROR_MSG = "Unsupported reduce dtype: "; std::string MooncakeBackend::hostIp_ = "127.0.0.1"; -TransferEngine MooncakeBackend::engine_ = TransferEngine(); +TransferEngine MooncakeBackend::engine_ = TransferEngine(true); Transport* MooncakeBackend::transport_ = nullptr; -std::vector MooncakeBackend::hca_filters_; int MooncakeBackend::backendIndex_ = 0; MooncakeWorker MooncakeBackend::worker_; @@ -34,11 +33,7 @@ MooncakeBackend::MooncakeBackend( // Initialize transfer engine if (!transport_) { engine_.init(P2PHANDSHAKE, hostIp_); - std::string topology = getCudaTopologyJson(hca_filters_); - void** args = (void**)malloc(2 * sizeof(void*)); - args[0] = (void*)topology.c_str(); - args[1] = nullptr; - transport_ = engine_.installTransport("rdma", args); + transport_ = engine_.installTransport("rdma", nullptr); TORCH_CHECK(transport_ != nullptr, c10::str("Failed to install transport")); } diff --git a/mooncake-ep/src/mooncake_cuda_topology.cpp 
b/mooncake-ep/src/mooncake_cuda_topology.cpp deleted file mode 100644 index c7ada6140..000000000 --- a/mooncake-ep/src/mooncake_cuda_topology.cpp +++ /dev/null @@ -1,147 +0,0 @@ -#include -#include -#include -#include -#include - -namespace mooncake { - -struct InfinibandDevice { - std::string name; - std::string pci_bus_id; -}; - -static std::vector listInfiniBandDevices( - const std::vector &filter) { - int num_devices = 0; - std::vector devices; - - struct ibv_device **device_list = ibv_get_device_list(&num_devices); - if (!device_list) { - LOG(WARNING) << "No RDMA devices found, check your device installation"; - return {}; - } - if (device_list && num_devices <= 0) { - LOG(WARNING) << "No RDMA devices found, check your device installation"; - ibv_free_device_list(device_list); - return {}; - } - - for (int i = 0; i < num_devices; ++i) { - std::string device_name = ibv_get_device_name(device_list[i]); - if (!filter.empty() && std::find(filter.begin(), filter.end(), - device_name) == filter.end()) - continue; - char path[PATH_MAX + 32]; - char resolved_path[PATH_MAX]; - // Get the PCI bus id for the infiniband device. Note that - // "/sys/class/infiniband/mlx5_X/" is a symlink to - // "/sys/devices/pciXXXX:XX/XXXX:XX:XX.X/infiniband/mlx5_X/". 
- snprintf(path, sizeof(path), "/sys/class/infiniband/%s/../..", - device_name.c_str()); - if (realpath(path, resolved_path) == NULL) { - PLOG(ERROR) << "listInfiniBandDevices: realpath " << path - << " failed"; - continue; - } - std::string pci_bus_id = basename(resolved_path); - - devices.push_back( - InfinibandDevice{.name = std::move(device_name), - .pci_bus_id = std::move(pci_bus_id)}); - } - ibv_free_device_list(device_list); - return devices; -} - -static int getPciDistance(const char *bus1, const char *bus2) { - char buf[PATH_MAX]; - char path1[PATH_MAX]; - char path2[PATH_MAX]; - snprintf(buf, sizeof(buf), "/sys/bus/pci/devices/%s", bus1); - if (realpath(buf, path1) == NULL) { - return -1; - } - snprintf(buf, sizeof(buf), "/sys/bus/pci/devices/%s", bus2); - if (realpath(buf, path2) == NULL) { - return -1; - } - - char *ptr1 = path1; - char *ptr2 = path2; - while (*ptr1 && *ptr1 == *ptr2) { - ptr1++; - ptr2++; - } - int distance = 0; - for (; *ptr1; ptr1++) { - distance += (*ptr1 == '/'); - } - for (; *ptr2; ptr2++) { - distance += (*ptr2 == '/'); - } - - return distance; -} - -static std::vector discoverCudaTopology( - const std::vector &all_hca) { - std::vector topology; - int device_count; - if (cudaGetDeviceCount(&device_count) != cudaSuccess) { - device_count = 0; - } - for (int i = 0; i < device_count; i++) { - char pci_bus_id[20]; - if (cudaDeviceGetPCIBusId(pci_bus_id, sizeof(pci_bus_id), i) != - cudaSuccess) { - continue; - } - for (char *ch = pci_bus_id; (*ch = tolower(*ch)); ch++); - - std::vector preferred_hca; - std::vector avail_hca; - - // Find HCAs with minimum distance in one pass - int min_distance = INT_MAX; - std::vector min_distance_hcas; - - for (const auto &hca : all_hca) { - int distance = getPciDistance(hca.pci_bus_id.c_str(), pci_bus_id); - if (distance >= 0) { - if (distance < min_distance) { - min_distance = distance; - min_distance_hcas.clear(); - min_distance_hcas.push_back(hca.name); - } else if (distance == min_distance) { - 
min_distance_hcas.push_back(hca.name); - } - } - } - - // Add HCAs with minimum distance to preferred_hca, others to avail_hca - for (const auto &hca : all_hca) { - if (std::find(min_distance_hcas.begin(), min_distance_hcas.end(), - hca.name) != min_distance_hcas.end()) { - preferred_hca.push_back(hca.name); - } else { - avail_hca.push_back(hca.name); - } - } - topology.push_back( - TopologyEntry{.name = "cuda:" + std::to_string(i), - .preferred_hca = std::move(preferred_hca), - .avail_hca = std::move(avail_hca)}); - } - return topology; -} - -std::string getCudaTopologyJson(const std::vector &filter) { - Json::Value topology(Json::objectValue); - auto all_hca = listInfiniBandDevices(filter); - for (auto &ent : discoverCudaTopology(all_hca)) { - topology[ent.name] = ent.toJson(); - } - return topology.toStyledString(); -} -} // namespace mooncake \ No newline at end of file diff --git a/mooncake-integration/CMakeLists.txt b/mooncake-integration/CMakeLists.txt index 5a79ff087..ccdc66606 100644 --- a/mooncake-integration/CMakeLists.txt +++ b/mooncake-integration/CMakeLists.txt @@ -29,15 +29,17 @@ message("${PYTHON_SYS_PATH}") set(PYTHON_PACKAGE_NAME "mooncake") -pybind11_add_module(engine ${SOURCES} ${CACHE_ALLOCATOR_SOURCES} - transfer_engine/transfer_engine_py.cpp -) +if (WITH_TE) + pybind11_add_module(engine ${SOURCES} ${CACHE_ALLOCATOR_SOURCES} + transfer_engine/transfer_engine_py.cpp + ) -target_link_libraries(engine PUBLIC - transfer_engine - glog::glog - gflags::gflags -) + target_link_libraries(engine PUBLIC + transfer_engine + glog::glog + gflags::gflags + ) +endif() set(ALLOCATOR_SO_PATH "${CMAKE_BINARY_DIR}/mooncake-transfer-engine/nvlink-allocator/nvlink_allocator.so") if(USE_MNNVL) @@ -113,4 +115,6 @@ if (WITH_STORE) install(TARGETS store DESTINATION ${PYTHON_SYS_PATH}/${PYTHON_PACKAGE_NAME}) endif() -install(TARGETS engine DESTINATION ${PYTHON_SYS_PATH}/${PYTHON_PACKAGE_NAME}) +if (WITH_TE) + install(TARGETS engine DESTINATION 
${PYTHON_SYS_PATH}/${PYTHON_PACKAGE_NAME}) +endif() diff --git a/mooncake-integration/store/store_py.cpp b/mooncake-integration/store/store_py.cpp index 20c66ee00..ad285c8aa 100644 --- a/mooncake-integration/store/store_py.cpp +++ b/mooncake-integration/store/store_py.cpp @@ -171,6 +171,113 @@ class MooncakeStorePyWrapper { } } + pybind11::list batch_get_tensor(const std::vector &keys) { + if (!store_ || !store_->client_) { + LOG(ERROR) << "Client is not initialized"; + py::list empty_list; + for (size_t i = 0; i < keys.size(); ++i) { + empty_list.append(py::none()); + } + return empty_list; + } + + // Phase 1: Batch Get Buffers (GIL Released) + std::vector> buffer_handles; + { + py::gil_scoped_release release_gil; + // This internal call already handles logging for query failures + buffer_handles = store_->batch_get_buffer(keys); + } + + py::list results_list; + + try { + py::gil_scoped_acquire acquire_gil; + auto torch = torch_module(); + + for (const auto &buffer_handle : buffer_handles) { + if (!buffer_handle) { + results_list.append(py::none()); + continue; + } + + auto total_length = buffer_handle->size(); + if (total_length <= sizeof(TensorMetadata)) { + LOG(ERROR) << "Invalid data format: insufficient data for " + "metadata"; + results_list.append(py::none()); + continue; + } + + char *exported_data = new char[total_length]; + if (!exported_data) { + LOG(ERROR) << "Failed to allocate memory for tensor data"; + results_list.append(py::none()); + continue; + } + + memcpy(exported_data, buffer_handle->ptr(), total_length); + + TensorMetadata metadata; + memcpy(&metadata, exported_data, sizeof(TensorMetadata)); + + if (metadata.ndim < 0 || metadata.ndim > 4) { + delete[] exported_data; + LOG(ERROR) + << "Invalid tensor metadata: ndim=" << metadata.ndim; + results_list.append(py::none()); + continue; + } + + TensorDtype dtype_enum = + static_cast(metadata.dtype); + if (dtype_enum == TensorDtype::UNKNOWN) { + delete[] exported_data; + LOG(ERROR) << "Unknown 
tensor dtype!"; + results_list.append(py::none()); + continue; + } + + size_t tensor_size = total_length - sizeof(TensorMetadata); + if (tensor_size == 0) { + delete[] exported_data; + LOG(ERROR) << "Invalid data format: no tensor data found"; + results_list.append(py::none()); + continue; + } + + pybind11::object np_array; + int dtype_index = static_cast(dtype_enum); + if (dtype_index >= 0 && + dtype_index < static_cast(array_creators.size())) { + // This call MUST take ownership of exported_data + np_array = array_creators[dtype_index]( + exported_data, sizeof(TensorMetadata), tensor_size); + } else { + delete[] exported_data; // Free memory on error + LOG(ERROR) << "Unsupported dtype enum: " << dtype_index; + results_list.append(py::none()); + continue; + } + + if (metadata.ndim > 0) { + std::vector shape_vec; + for (int i = 0; i < metadata.ndim; i++) { + shape_vec.push_back(metadata.shape[i]); + } + py::tuple shape_tuple = py::cast(shape_vec); + np_array = np_array.attr("reshape")(shape_tuple); + } + pybind11::object tensor = torch.attr("from_numpy")(np_array); + results_list.append(tensor); + } + } catch (const pybind11::error_already_set &e) { + LOG(ERROR) << "Failed during batch tensor deserialization: " + << e.what(); + } + return results_list; + } + int put_tensor(const std::string &key, pybind11::object tensor) { if (!store_ || !store_->client_) { LOG(ERROR) << "Client is not initialized"; @@ -240,6 +347,160 @@ class MooncakeStorePyWrapper { return -static_cast(ErrorCode::INVALID_PARAMS); } } + + std::vector batch_put_tensor(const std::vector &keys, + const pybind11::list &tensors_list) { + if (!store_ || !store_->client_) { + LOG(ERROR) << "Client is not initialized"; + return std::vector( + keys.size(), -static_cast(ErrorCode::INVALID_PARAMS)); + } + + if (keys.size() != tensors_list.size()) { + LOG(ERROR) << "Keys and tensors list size mismatch. 
keys=" + << keys.size() << ", tensors=" << tensors_list.size(); + return std::vector( + keys.size(), -static_cast(ErrorCode::INVALID_PARAMS)); + } + + if (keys.empty()) { + return std::vector(); + } + + struct TensorInfo { + uintptr_t data_ptr; + size_t tensor_size; + TensorMetadata metadata; + bool valid = false; // Mark if metadata extraction was successful + }; + + std::vector infos(keys.size()); + std::vector results(keys.size(), 0); // Default to success + + // Phase 1: Extract Metadata (GIL Held) + try { + for (size_t i = 0; i < keys.size(); ++i) { + py::object tensor = tensors_list[i]; + + if (!(tensor.attr("__class__") + .attr("__name__") + .cast() + .find("Tensor") != std::string::npos)) { + LOG(ERROR) + << "Input at index " << i << " is not a PyTorch tensor"; + results[i] = -static_cast(ErrorCode::INVALID_PARAMS); + continue; + } + + uintptr_t data_ptr = + tensor.attr("data_ptr")().cast(); + size_t numel = tensor.attr("numel")().cast(); + size_t element_size = + tensor.attr("element_size")().cast(); + size_t tensor_size = numel * element_size; + + pybind11::object shape_obj = tensor.attr("shape"); + pybind11::object dtype_obj = tensor.attr("dtype"); + + TensorDtype dtype_enum = get_tensor_dtype(dtype_obj); + if (dtype_enum == TensorDtype::UNKNOWN) { + LOG(ERROR) + << "Unsupported tensor dtype for key " << keys[i]; + results[i] = -static_cast(ErrorCode::INVALID_PARAMS); + continue; + } + + pybind11::tuple shape_tuple = + pybind11::cast(shape_obj); + int32_t ndim = static_cast(shape_tuple.size()); + if (ndim > 4) { + LOG(ERROR) << "Tensor " << keys[i] + << " has more than 4 dimensions: " << ndim; + results[i] = -static_cast(ErrorCode::INVALID_PARAMS); + continue; + } + + TensorMetadata metadata; + metadata.dtype = static_cast(dtype_enum); + metadata.ndim = ndim; + + for (int j = 0; j < 4; j++) { + metadata.shape[j] = + (j < ndim) ? 
shape_tuple[j].cast() : -1; + } + + infos[i] = TensorInfo{data_ptr, tensor_size, metadata, true}; + } + } catch (const pybind11::error_already_set &e) { + LOG(ERROR) << "Failed to access tensor data during batch put: " + << e.what(); + return results; + } + + std::vector valid_keys; + std::vector buffer_ptrs; + std::vector buffer_sizes; + std::vector> + temp_handles; // Manages lifetime of allocated buffers + std::vector valid_indices; // To map results back + + { + py::gil_scoped_release release_gil; + + for (size_t i = 0; i < infos.size(); ++i) { + if (!infos[i].valid) { + continue; // Skip items that failed metadata extraction + } + + const auto &info = infos[i]; + size_t total_size = sizeof(TensorMetadata) + info.tensor_size; + + // Allocate a contiguous buffer for this tensor (metadata + + // data) + auto alloc_result = + store_->client_buffer_allocator_->allocate(total_size); + + if (!alloc_result) { + LOG(ERROR) + << "Failed to allocate buffer for key: " << keys[i] + << "size is: " << total_size; + results[i] = -static_cast(ErrorCode::INVALID_PARAMS); + continue; // Skip this item + } + + auto &handle = *alloc_result; + + // Copy metadata + memcpy(handle.ptr(), &info.metadata, sizeof(TensorMetadata)); + // Copy tensor data + memcpy( + static_cast(handle.ptr()) + sizeof(TensorMetadata), + reinterpret_cast(info.data_ptr), info.tensor_size); + + // Add to the list for batch_put_from + valid_keys.push_back(keys[i]); + buffer_ptrs.push_back(handle.ptr()); + buffer_sizes.push_back(total_size); + temp_handles.push_back( + std::make_unique(std::move(handle))); + valid_indices.push_back(i); + } + + if (valid_keys.empty()) { + return results; + } + + std::vector batch_op_results = + store_->batch_put_from(valid_keys, buffer_ptrs, buffer_sizes); + + for (size_t i = 0; i < batch_op_results.size(); ++i) { + size_t original_index = valid_indices[i]; + results[original_index] = batch_op_results[i]; + } + } + + return results; + } }; PYBIND11_MODULE(store, m) { @@ -400,6 
+661,11 @@ PYBIND11_MODULE(store, m) { "Get a PyTorch tensor from the store") .def("put_tensor", &MooncakeStorePyWrapper::put_tensor, py::arg("key"), py::arg("tensor"), "Put a PyTorch tensor into the store") + .def("batch_get_tensor", &MooncakeStorePyWrapper::batch_get_tensor, + py::arg("keys"), "Get a batch of PyTorch tensors from the store") + .def("batch_put_tensor", &MooncakeStorePyWrapper::batch_put_tensor, + py::arg("keys"), py::arg("tensors_list"), + "Put a batch of PyTorch tensors into the store") .def( "register_buffer", [](MooncakeStorePyWrapper &self, uintptr_t buffer_ptr, diff --git a/mooncake-store/include/allocation_strategy.h b/mooncake-store/include/allocation_strategy.h index 3e7fa65dc..4c2b19495 100644 --- a/mooncake-store/include/allocation_strategy.h +++ b/mooncake-store/include/allocation_strategy.h @@ -1,13 +1,12 @@ #pragma once #include -#include #include -#include #include #include #include -#include +#include +#include #include #include "allocator.h" // Contains BufferAllocator declaration @@ -18,7 +17,7 @@ namespace mooncake { /** * @brief Abstract interface for allocation strategy, responsible for - * allocating multiple slices across multiple replicas using available + * allocating a slice (with one or more replicas) using available * BufferAllocators. * * The allocation strategy follows best-effort semantics: if the requested @@ -31,9 +30,8 @@ class AllocationStrategy { virtual ~AllocationStrategy() = default; /** - * @brief Allocates multiple slices across the requested number of replicas - * using best-effort semantics. Each replica will contain all - * requested slices. + * @brief Allocates a slice across the requested number of replicas + * using best-effort semantics. 
* * The allocation follows best-effort semantics: if the full requested * replica count cannot be satisfied, the method will allocate as many @@ -44,7 +42,7 @@ class AllocationStrategy { * @param allocators_by_name Container of mounted allocators, key is * segment_name, value is the corresponding * allocators - * @param slice_sizes Sizes of slices to be allocated in each replica + * @param slice_length Length of the slice to be allocated * @param config Replica configuration containing number of replicas and * placement constraints * @return tl::expected, ErrorCode> containing @@ -60,8 +58,7 @@ class AllocationStrategy { const std::unordered_map< std::string, std::vector>>& allocators_by_name, - const std::vector& slice_sizes, - const ReplicateConfig& config) = 0; + const size_t slice_length, const ReplicateConfig& config) = 0; }; /** @@ -87,189 +84,110 @@ class RandomAllocationStrategy : public AllocationStrategy { const std::unordered_map< std::string, std::vector>>& allocators_by_name, - const std::vector& slice_sizes, const ReplicateConfig& config) { - if (auto validation_error = - validateInput(slice_sizes, config.replica_num)) { - return tl::make_unexpected(*validation_error); + const size_t slice_length, const ReplicateConfig& config) { + // Validate input parameters + if (slice_length == 0 || config.replica_num == 0) { + return tl::make_unexpected(ErrorCode::INVALID_PARAMS); } - std::vector>> - replica_buffers(config.replica_num); - for (auto& replica_buffer : replica_buffers) { - replica_buffer.reserve(slice_sizes.size()); - } - - // Track the actual number of replicas we can allocate - size_t actual_replica_count = config.replica_num; - - // Allocate each slice across replicas - for (size_t slice_idx = 0; slice_idx < slice_sizes.size(); - ++slice_idx) { - auto slice_replicas = allocateSlice(allocators, allocators_by_name, - slice_sizes[slice_idx], - actual_replica_count, config); - - if (slice_replicas.empty()) { - return 
tl::make_unexpected(ErrorCode::NO_AVAILABLE_HANDLE); - } - - if (slice_replicas.size() < actual_replica_count) { - actual_replica_count = slice_replicas.size(); - // NOTE: replica allocation is best effort - VLOG(1) << "Failed to allocate all replicas for slice " - << slice_idx << ", reducing replica count to " - << actual_replica_count; - - // Resize replica_buffers to match the new count - replica_buffers.resize(actual_replica_count); - } - - for (size_t replica_idx = 0; replica_idx < actual_replica_count; - ++replica_idx) { - replica_buffers[replica_idx].push_back( - std::move(slice_replicas[replica_idx])); + // Fast path: single allocator case + if (allocators.size() == 1) { + if (auto buffer = allocators[0]->allocate(slice_length)) { + std::vector result; + result.emplace_back(std::move(buffer), + ReplicaStatus::PROCESSING); + return result; } + return tl::make_unexpected(ErrorCode::NO_AVAILABLE_HANDLE); } std::vector replicas; - replicas.reserve(actual_replica_count); - for (size_t replica_idx = 0; replica_idx < actual_replica_count; - ++replica_idx) { - replicas.emplace_back(std::move(replica_buffers[replica_idx]), - ReplicaStatus::PROCESSING); - } - - return replicas; - } - - std::optional validateInput( - const std::vector& slice_sizes, size_t replica_num) const { - if (replica_num == 0 || slice_sizes.empty() || - std::count(slice_sizes.begin(), slice_sizes.end(), 0) > 0) { - return ErrorCode::INVALID_PARAMS; - } - - return std::nullopt; - } - - /** - * @brief Allocates replicas for a single slice across different segments - */ - std::vector> allocateSlice( - const std::vector>& allocators, - const std::unordered_map< - std::string, std::vector>>& - allocators_by_name, - size_t slice_size, size_t replica_num, const ReplicateConfig& config, - std::unordered_set& used_segments) { - std::vector> buffers; - buffers.reserve(replica_num); - - for (size_t i = 0; i < replica_num; ++i) { - auto buffer = - allocateSingleBuffer(allocators, allocators_by_name, 
slice_size, - config, used_segments); + replicas.reserve(config.replica_num); - if (!buffer) { - break; - } - - used_segments.insert(buffer->getSegmentName()); - buffers.push_back(std::move(buffer)); - } - - return buffers; - } - - std::vector> allocateSlice( - const std::vector>& allocators, - const std::unordered_map< - std::string, std::vector>>& - allocators_by_name, - size_t slice_size, size_t replica_num, const ReplicateConfig& config) { - std::unordered_set empty_segments; - return allocateSlice(allocators, allocators_by_name, slice_size, - replica_num, config, empty_segments); - } - - /** - * @brief Allocates a single buffer respecting preferences and exclusions - */ - std::unique_ptr allocateSingleBuffer( - const std::vector>& allocators, - const std::unordered_map< - std::string, std::vector>>& - allocators_by_name, - size_t size, const ReplicateConfig& config, - const std::unordered_set& excluded_segments) { - // Try preferred segment first - if (!config.preferred_segment.empty() && - !excluded_segments.contains(config.preferred_segment)) { + // Try preferred segment first if specified + if (!config.preferred_segment.empty()) { auto preferred_it = allocators_by_name.find(config.preferred_segment); if (preferred_it != allocators_by_name.end()) { for (auto& allocator : preferred_it->second) { - if (auto buffer = allocator->allocate(size)) { - return buffer; + if (auto buffer = allocator->allocate(slice_length)) { + replicas.emplace_back(std::move(buffer), + ReplicaStatus::PROCESSING); + break; } } } } - return tryRandomAllocate(allocators, size, excluded_segments); - } - - /** - * @brief Attempts allocation with random selection from allocators that can - * fit the size - */ - std::unique_ptr tryRandomAllocate( - const std::vector>& allocators, - size_t size, const std::unordered_set& excluded_segments) { - std::vector eligible_indices; - eligible_indices.reserve(allocators.size()); - for (size_t i = 0; i < allocators.size(); ++i) { - if 
(!excluded_segments.contains(allocators[i]->getSegmentName()) && - allocators[i]->getLargestFreeRegion() >= size) { - eligible_indices.push_back(i); - } + if (replicas.size() == config.replica_num) { + return replicas; } - if (eligible_indices.empty()) { - return nullptr; + // If replica_num is not satisfied, allocate the remaining replicas + // randomly. Randomly select a starting point from allocators_by_name + if (allocators_by_name.empty()) { + if (replicas.empty()) { + return tl::make_unexpected(ErrorCode::NO_AVAILABLE_HANDLE); + } + return replicas; } - // Thread-local random number generator for thread safety - thread_local std::mt19937 rng(std::random_device{}()); - std::shuffle(eligible_indices.begin(), eligible_indices.end(), rng); + static thread_local std::mt19937 generator(clock()); + std::uniform_int_distribution distribution( + 0, allocators_by_name.size() - 1); + size_t start_idx = distribution(generator); + + // Get iterator to the starting point + auto start_it = allocators_by_name.begin(); + std::advance(start_it, start_idx); + + auto it = start_it; + size_t max_retry = std::min(kMaxRetryLimit, allocators_by_name.size()); + size_t retry_count = 0; + + // Try to allocate remaining replicas, starting from random position + // TODO: Change the segment data structure to avoid traversing the + // entire map every time + while (replicas.size() < config.replica_num && + retry_count < max_retry) { + // Skip preferred segment if it was already allocated + if (it->first != config.preferred_segment) { + // Try each allocator in this segment + bool allocated = false; + for (auto& allocator : it->second) { + if (auto buffer = allocator->allocate(slice_length)) { + replicas.emplace_back(std::move(buffer), + ReplicaStatus::PROCESSING); + // Allocate at most one replica per segment + allocated = true; + break; + } + } + if (!allocated) { + ++retry_count; + } + } + // Move to next segment (circular) + ++it; + if (it == allocators_by_name.end()) { + it = 
allocators_by_name.begin(); + } - const size_t max_tries = - std::min(kMaxRetryLimit, eligible_indices.size()); - for (size_t i = 0; i < max_tries; ++i) { - auto& allocator = allocators[eligible_indices[i]]; - if (auto buffer = allocator->allocate(size)) { - return buffer; + // If we have cycled through all segments, break + if (it == start_it) { + break; } - retry_counter_.fetch_add(1); // Track allocation attempts } - return nullptr; + // Return allocated replicas (may be fewer than requested) + if (replicas.empty()) { + return tl::make_unexpected(ErrorCode::NO_AVAILABLE_HANDLE); + } + return replicas; } - /** - * @brief Get the number of allocation retry attempts - */ - uint64_t getRetryCount() const { return retry_counter_.load(); } - - /** - * @brief Reset the retry counter - */ - void resetRetryCount() { retry_counter_.store(0); } - private: - static constexpr size_t kMaxRetryLimit = 10; - // Observer for allocation retries - std::atomic_uint64_t retry_counter_{0}; + static constexpr size_t kMaxRetryLimit = 100; }; -} // namespace mooncake +} // namespace mooncake \ No newline at end of file diff --git a/mooncake-store/include/client.h b/mooncake-store/include/client.h index bb19d0ccf..2aefd76c7 100644 --- a/mooncake-store/include/client.h +++ b/mooncake-store/include/client.h @@ -19,6 +19,7 @@ #include "transfer_task.h" #include "types.h" #include "replica.h" +#include "master_metric_manager.h" namespace mooncake { @@ -254,6 +255,11 @@ class Client { return metrics_->summary_metrics(); } + tl::expected + CalcCacheStats() { + return master_client_.CalcCacheStats(); + } + // For Prometheus-style metrics tl::expected SerializeMetrics() { if (metrics_ == nullptr) { @@ -283,6 +289,7 @@ class Client { const std::string& local_hostname, const std::string& metadata_connstring, const std::string& protocol, const std::optional& device_names); + void InitTransferSubmitter(); ErrorCode TransferData(const Replica::Descriptor& replica_descriptor, std::vector& slices, 
TransferRequest::OpCode op_code); @@ -333,6 +340,9 @@ class Client { const std::vector& query_results, std::unordered_map>& slices); + // Client identification + const UUID client_id_; + // Client-side metrics std::unique_ptr metrics_; @@ -358,10 +368,6 @@ class Client { std::thread ping_thread_; std::atomic ping_running_{false}; void PingThreadMain(bool is_ha_mode, std::string current_master_address); - - // Client identification - UUID client_id_; - bool te_initialized_{false}; }; } // namespace mooncake diff --git a/mooncake-store/include/client_buffer.hpp b/mooncake-store/include/client_buffer.hpp index 0a1ca1e4c..79c393604 100644 --- a/mooncake-store/include/client_buffer.hpp +++ b/mooncake-store/include/client_buffer.hpp @@ -103,11 +103,11 @@ uint64_t calculate_total_size(const Replica::Descriptor& replica); * @brief Allocate slices from a buffer handle based on replica descriptor * @param slices Output vector to store the allocated slices * @param replica The replica descriptor defining the slice structure - * @param buffer_handle The buffer handle to allocate slices from + * @param buffer_ptr The buffer pointer to allocate slices from * @return 0 on success, non-zero on error */ int allocateSlices(std::vector& slices, const Replica::Descriptor& replica, - BufferHandle& buffer_handle); + void* buffer_ptr); } // namespace mooncake \ No newline at end of file diff --git a/mooncake-store/include/master_client.h b/mooncake-store/include/master_client.h index c7f26ecc8..8c1d20cb8 100644 --- a/mooncake-store/include/master_client.h +++ b/mooncake-store/include/master_client.h @@ -11,6 +11,7 @@ #include "replica.h" #include "types.h" #include "rpc_types.h" +#include "master_metric_manager.h" namespace mooncake { @@ -21,7 +22,8 @@ static const std::string kDefaultMasterAddress = "localhost:50051"; */ class MasterClient { public: - MasterClient(MasterClientMetric* metrics = nullptr) : metrics_(metrics) { + MasterClient(const UUID& client_id, MasterClientMetric* 
metrics = nullptr) + : client_id_(client_id), metrics_(metrics) { coro_io::client_pool::pool_config pool_conf{}; const char* value = std::getenv("MC_RPC_PROTOCOL"); @@ -62,6 +64,14 @@ class MasterClient { [[nodiscard]] std::vector> BatchExistKey( const std::vector& object_keys); + /** + * @brief Calculate cache hit rate metrics + * @param object_keys None + * @return Map containing metrics + */ + [[nodiscard]] tl::expected + CalcCacheStats(); + /** * @brief Gets object metadata without transferring data * @param object_key Key to query @@ -179,11 +189,10 @@ class MasterClient { /** * @brief Registers a segment to master for allocation * @param segment Segment to register - * @param client_id The uuid of the client * @return tl::expected indicating success/failure */ [[nodiscard]] tl::expected MountSegment( - const Segment& segment, const UUID& client_id); + const Segment& segment); /** * @brief Re-mount segments, invoked when the client is the first time to @@ -191,20 +200,18 @@ class MasterClient { * to remount. This function is idempotent. Client should retry if the * return code is not ErrorCode::OK. 
* @param segments Segments to remount - * @param client_id The uuid of the client * @return tl::expected indicating success/failure */ [[nodiscard]] tl::expected ReMountSegment( - const std::vector& segments, const UUID& client_id); + const std::vector& segments); /** * @brief Unregisters a memory segment from master * @param segment_id ID of the segment to unmount - * @param client_id The uuid of the client * @return tl::expected indicating success/failure */ [[nodiscard]] tl::expected UnmountSegment( - const UUID& segment_id, const UUID& client_id); + const UUID& segment_id); /** * @brief Gets the cluster ID for the current client to use as subdirectory @@ -215,12 +222,10 @@ class MasterClient { /** * @brief Pings master to check its availability - * @param client_id The uuid of the client * @return tl::expected * containing view version and client status */ - [[nodiscard]] tl::expected Ping( - const UUID& client_id); + [[nodiscard]] tl::expected Ping(); private: /** @@ -275,6 +280,9 @@ class MasterClient { }; RpcClientAccessor client_accessor_; + // The client identification. 
+ const UUID client_id_; + // Metrics for tracking RPC operations MasterClientMetric* metrics_; std::shared_ptr> diff --git a/mooncake-store/include/master_config.h b/mooncake-store/include/master_config.h index c3fdbf3bc..025e111e3 100644 --- a/mooncake-store/include/master_config.h +++ b/mooncake-store/include/master_config.h @@ -37,6 +37,9 @@ struct MasterConfig { bool enable_http_metadata_server; uint32_t http_metadata_server_port; std::string http_metadata_server_host; + + uint64_t put_start_discard_timeout_sec; + uint64_t put_start_release_timeout_sec; }; class MasterServiceSupervisorConfig { @@ -66,6 +69,8 @@ class MasterServiceSupervisorConfig { std::string root_fs_dir = DEFAULT_ROOT_FS_DIR; int64_t global_file_segment_size = DEFAULT_GLOBAL_FILE_SEGMENT_SIZE; BufferAllocatorType memory_allocator = BufferAllocatorType::OFFSET; + uint64_t put_start_discard_timeout_sec = DEFAULT_PUT_START_DISCARD_TIMEOUT; + uint64_t put_start_release_timeout_sec = DEFAULT_PUT_START_RELEASE_TIMEOUT; MasterServiceSupervisorConfig() = default; @@ -102,6 +107,9 @@ class MasterServiceSupervisorConfig { memory_allocator = BufferAllocatorType::OFFSET; } + put_start_discard_timeout_sec = config.put_start_discard_timeout_sec; + put_start_release_timeout_sec = config.put_start_release_timeout_sec; + validate(); } @@ -166,6 +174,8 @@ class WrappedMasterServiceConfig { std::string root_fs_dir = DEFAULT_ROOT_FS_DIR; int64_t global_file_segment_size = DEFAULT_GLOBAL_FILE_SEGMENT_SIZE; BufferAllocatorType memory_allocator = BufferAllocatorType::OFFSET; + uint64_t put_start_discard_timeout_sec = DEFAULT_PUT_START_DISCARD_TIMEOUT; + uint64_t put_start_release_timeout_sec = DEFAULT_PUT_START_RELEASE_TIMEOUT; WrappedMasterServiceConfig() = default; @@ -196,6 +206,9 @@ class WrappedMasterServiceConfig { } else { memory_allocator = mooncake::BufferAllocatorType::OFFSET; } + + put_start_discard_timeout_sec = config.put_start_discard_timeout_sec; + put_start_release_timeout_sec = 
config.put_start_release_timeout_sec; } // From MasterServiceSupervisorConfig, enable_ha is set to true @@ -221,6 +234,8 @@ class WrappedMasterServiceConfig { root_fs_dir = config.root_fs_dir; global_file_segment_size = config.global_file_segment_size; memory_allocator = config.memory_allocator; + put_start_discard_timeout_sec = config.put_start_discard_timeout_sec; + put_start_release_timeout_sec = config.put_start_release_timeout_sec; } }; @@ -244,6 +259,8 @@ class MasterServiceConfigBuilder { std::string root_fs_dir_ = DEFAULT_ROOT_FS_DIR; int64_t global_file_segment_size_ = DEFAULT_GLOBAL_FILE_SEGMENT_SIZE; BufferAllocatorType memory_allocator_ = BufferAllocatorType::OFFSET; + uint64_t put_start_discard_timeout_sec_ = DEFAULT_PUT_START_DISCARD_TIMEOUT; + uint64_t put_start_release_timeout_sec_ = DEFAULT_PUT_START_RELEASE_TIMEOUT; public: MasterServiceConfigBuilder() = default; @@ -312,6 +329,18 @@ class MasterServiceConfigBuilder { return *this; } + MasterServiceConfigBuilder& set_put_start_discard_timeout_sec( + uint64_t put_start_discard_timeout_sec) { + put_start_discard_timeout_sec_ = put_start_discard_timeout_sec; + return *this; + } + + MasterServiceConfigBuilder& set_put_start_release_timeout_sec( + uint64_t put_start_release_timeout_sec) { + put_start_release_timeout_sec_ = put_start_release_timeout_sec; + return *this; + } + MasterServiceConfig build() const; }; @@ -331,6 +360,8 @@ class MasterServiceConfig { std::string root_fs_dir = DEFAULT_ROOT_FS_DIR; int64_t global_file_segment_size = DEFAULT_GLOBAL_FILE_SEGMENT_SIZE; BufferAllocatorType memory_allocator = BufferAllocatorType::OFFSET; + uint64_t put_start_discard_timeout_sec = DEFAULT_PUT_START_DISCARD_TIMEOUT; + uint64_t put_start_release_timeout_sec = DEFAULT_PUT_START_RELEASE_TIMEOUT; MasterServiceConfig() = default; @@ -349,6 +380,8 @@ class MasterServiceConfig { root_fs_dir = config.root_fs_dir; global_file_segment_size = config.global_file_segment_size; memory_allocator = 
config.memory_allocator; + put_start_discard_timeout_sec = config.put_start_discard_timeout_sec; + put_start_release_timeout_sec = config.put_start_release_timeout_sec; } // Static factory method to create a builder @@ -370,6 +403,8 @@ inline MasterServiceConfig MasterServiceConfigBuilder::build() const { config.root_fs_dir = root_fs_dir_; config.global_file_segment_size = global_file_segment_size_; config.memory_allocator = memory_allocator_; + config.put_start_discard_timeout_sec = put_start_discard_timeout_sec_; + config.put_start_release_timeout_sec = put_start_release_timeout_sec_; return config; } diff --git a/mooncake-store/include/master_metric_manager.h b/mooncake-store/include/master_metric_manager.h index e11ceaa33..bcfdca871 100644 --- a/mooncake-store/include/master_metric_manager.h +++ b/mooncake-store/include/master_metric_manager.h @@ -19,6 +19,39 @@ class MasterMetricManager { MasterMetricManager(MasterMetricManager&&) = delete; MasterMetricManager& operator=(MasterMetricManager&&) = delete; + // Memory Storage Metrics(global & segment) + void inc_allocated_mem_size(const std::string& segment, int64_t val = 1); + void dec_allocated_mem_size(const std::string& segment, int64_t val = 1); + void reset_allocated_mem_size(); + void inc_total_mem_capacity(const std::string& segment, int64_t val = 1); + void dec_total_mem_capacity(const std::string& segment, int64_t val = 1); + void reset_total_mem_capacity(); + double get_global_mem_used_ratio(void); + + void inc_mem_cache_hit_nums(int64_t val = 1); + void inc_file_cache_hit_nums(int64_t val = 1); + void inc_mem_cache_nums(int64_t val = 1); + void inc_file_cache_nums(int64_t val = 1); + void dec_mem_cache_nums(int64_t val = 1); + void dec_file_cache_nums(int64_t val = 1); + + void inc_valid_get_nums(int64_t val = 1); + void inc_total_get_nums(int64_t val = 1); + + enum class CacheHitStat { + MEMORY_HITS, + SSD_HITS, + MEMORY_TOTAL, + SSD_TOTAL, + MEMORY_HIT_RATE, + SSD_HIT_RATE, + OVERALL_HIT_RATE, + 
VALID_GET_RATE + }; + using CacheHitStatDict = std::unordered_map; + void add_stat_to_dict(CacheHitStatDict&, CacheHitStat, double); + CacheHitStatDict calculate_cache_stats(); + // Memory Storage Metrics void inc_allocated_mem_size(int64_t val = 1); void dec_allocated_mem_size(int64_t val = 1); @@ -26,7 +59,9 @@ class MasterMetricManager { void dec_total_mem_capacity(int64_t val = 1); int64_t get_allocated_mem_size(); int64_t get_total_mem_capacity(); - double get_global_mem_used_ratio(void); + double get_segment_mem_used_ratio(const std::string& segment); + int64_t get_segment_allocated_mem_size(const std::string& segment); + int64_t get_segment_total_mem_capacity(const std::string& segment); // File Storage Metrics void inc_allocated_file_size(int64_t val = 1); @@ -161,6 +196,15 @@ class MasterMetricManager { int64_t get_evicted_key_count(); int64_t get_evicted_size(); + // PutStart Discard Metrics + void inc_put_start_discard_cnt(int64_t count, int64_t size); + void inc_put_start_release_cnt(int64_t count, int64_t size); + + // PutStart Discard Metrics Getters + int64_t get_put_start_discard_cnt(); + int64_t get_put_start_release_cnt(); + int64_t get_put_start_discarded_staging_size(); + // --- Serialization --- /** * @brief Serializes all managed metrics into Prometheus text format. 
@@ -185,8 +229,16 @@ class MasterMetricManager { // --- Metric Members --- // Memory Storage Metrics - ylt::metric::gauge_t mem_allocated_size_; // Use update for gauge - ylt::metric::gauge_t mem_total_capacity_; // Use update for gauge + ylt::metric::gauge_t + mem_allocated_size_; // Overall memory usage update for gauge + ylt::metric::gauge_t + mem_total_capacity_; // Overall memory capacity update for gauge + ylt::metric::dynamic_gauge_1t + mem_allocated_size_per_segment_; // Segment memory usage update for + // gauge + ylt::metric::dynamic_gauge_1t + mem_total_capacity_per_segment_; // Segment memory capacity update for + // gauge // File Storage Metrics ylt::metric::gauge_t file_allocated_size_; @@ -255,12 +307,36 @@ class MasterMetricManager { ylt::metric::counter_t batch_put_revoke_items_; ylt::metric::counter_t batch_put_revoke_failed_items_; + // cache hit Statistics + ylt::metric::counter_t mem_cache_hit_nums_; + ylt::metric::counter_t file_cache_hit_nums_; + ylt::metric::gauge_t mem_cache_nums_; + ylt::metric::gauge_t file_cache_nums_; + + ylt::metric::counter_t valid_get_nums_; + ylt::metric::counter_t total_get_nums_; + + static const inline std::unordered_map + stat_names_ = {{CacheHitStat::MEMORY_HITS, "memory_hits"}, + {CacheHitStat::SSD_HITS, "ssd_hits"}, + {CacheHitStat::MEMORY_TOTAL, "memory_total"}, + {CacheHitStat::SSD_TOTAL, "ssd_total"}, + {CacheHitStat::MEMORY_HIT_RATE, "memory_hit_rate"}, + {CacheHitStat::SSD_HIT_RATE, "ssd_hit_rate"}, + {CacheHitStat::OVERALL_HIT_RATE, "overall_hit_rate"}, + {CacheHitStat::VALID_GET_RATE, "valid_get_rate"}}; + // Eviction Metrics ylt::metric::counter_t eviction_success_; ylt::metric::counter_t eviction_attempts_; ylt::metric::counter_t evicted_key_count_; ylt::metric::counter_t evicted_size_; + // PutStart Discard Metrics + ylt::metric::counter_t put_start_discard_cnt_; + ylt::metric::counter_t put_start_release_cnt_; + ylt::metric::gauge_t put_start_discarded_staging_size_; + // Some metrics are used only 
in HA mode. Use a flag to control the output // content. bool enable_ha_{false}; diff --git a/mooncake-store/include/master_service.h b/mooncake-store/include/master_service.h index 28c318bbe..348407c99 100644 --- a/mooncake-store/include/master_service.h +++ b/mooncake-store/include/master_service.h @@ -5,6 +5,7 @@ #include #include #include +#include #include #include #include @@ -132,14 +133,14 @@ class MasterService { /** * @brief Start a put operation for an object - * @param[out] replica_list Vector to store replica information for slices + * @param[out] replica_list Vector to store replica information for the + * slice * @return ErrorCode::OK on success, ErrorCode::OBJECT_NOT_FOUND if exists, * ErrorCode::NO_AVAILABLE_HANDLE if allocation fails, * ErrorCode::INVALID_PARAMS if slice size is invalid */ - auto PutStart(const std::string& key, - const std::vector& slice_lengths, - const ReplicateConfig& config) + auto PutStart(const UUID& client_id, const std::string& key, + const uint64_t slice_length, const ReplicateConfig& config) -> tl::expected, ErrorCode>; /** @@ -148,8 +149,8 @@ class MasterService { * @return ErrorCode::OK on success, ErrorCode::OBJECT_NOT_FOUND if not * found, ErrorCode::INVALID_WRITE if replica status is invalid */ - auto PutEnd(const std::string& key, ReplicaType replica_type) - -> tl::expected; + auto PutEnd(const UUID& client_id, const std::string& key, + ReplicaType replica_type) -> tl::expected; /** * @brief Revoke a put operation, replica_type indicates the type of @@ -157,8 +158,8 @@ class MasterService { * @return ErrorCode::OK on success, ErrorCode::OBJECT_NOT_FOUND if not * found, ErrorCode::INVALID_WRITE if replica status is invalid */ - auto PutRevoke(const std::string& key, ReplicaType replica_type) - -> tl::expected; + auto PutRevoke(const UUID& client_id, const std::string& key, + ReplicaType replica_type) -> tl::expected; /** * @brief Complete a batch of put operations @@ -166,7 +167,7 @@ class MasterService { * found, 
ErrorCode::INVALID_WRITE if replica status is invalid */ std::vector> BatchPutEnd( - const std::vector& keys); + const UUID& client_id, const std::vector& keys); /** * @brief Revoke a batch of put operations @@ -174,7 +175,7 @@ class MasterService { * found, ErrorCode::INVALID_WRITE if replica status is invalid */ std::vector> BatchPutRevoke( - const std::vector& keys); + const UUID& client_id, const std::vector& keys); /** * @brief Remove an object and its replicas @@ -245,15 +246,18 @@ class MasterService { if (soft_pin_timeout) { MasterMetricManager::instance().dec_soft_pin_key_count(1); } - MasterMetricManager::instance().dec_allocated_file_size( - disk_replica_size); } ObjectMetadata() = delete; - ObjectMetadata(size_t value_length, std::vector&& reps, - bool enable_soft_pin) - : replicas(std::move(reps)), + ObjectMetadata( + const UUID& client_id_, + const std::chrono::steady_clock::time_point put_start_time_, + size_t value_length, std::vector&& reps, + bool enable_soft_pin) + : client_id(client_id_), + put_start_time(put_start_time_), + replicas(std::move(reps)), size(value_length), lease_timeout(), soft_pin_timeout(std::nullopt) { @@ -263,16 +267,6 @@ class MasterService { MasterMetricManager::instance().inc_soft_pin_key_count(1); } MasterMetricManager::instance().observe_value_size(value_length); - // Automatic update allocated_file_size via RAII - for (const auto& replica : replicas) { - if (replica.is_disk_replica()) { - disk_replica_size += replica.get_descriptor() - .get_disk_descriptor() - .object_size; - } - } - MasterMetricManager::instance().inc_allocated_file_size( - disk_replica_size); } ObjectMetadata(const ObjectMetadata&) = delete; @@ -280,6 +274,9 @@ class MasterService { ObjectMetadata(ObjectMetadata&&) = delete; ObjectMetadata& operator=(ObjectMetadata&&) = delete; + const UUID client_id; + const std::chrono::steady_clock::time_point put_start_time; + std::vector replicas; size_t size; // Default constructor, creates a time_point 
representing @@ -287,7 +284,6 @@ class MasterService { std::chrono::steady_clock::time_point lease_timeout; // hard lease std::optional soft_pin_timeout; // optional soft pin, only set for vip objects - uint64_t disk_replica_size = 0; // Check if there are some replicas with a different status than the // given value. If there are, return the status of the first replica @@ -374,6 +370,31 @@ class MasterService { return replica.status() == ReplicaStatus::COMPLETE; }); } + + bool HasCompletedReplicas() const { + return std::any_of( + replicas.begin(), replicas.end(), [](const Replica& replica) { + return replica.status() == ReplicaStatus::COMPLETE; + }); + } + + std::vector DiscardProcessingReplicas() { + auto partition_point = std::partition( + replicas.begin(), replicas.end(), [](const Replica& replica) { + return replica.status() != ReplicaStatus::PROCESSING; + }); + + std::vector discarded_replicas; + if (partition_point != replicas.end()) { + discarded_replicas.reserve( + std::distance(partition_point, replicas.end())); + std::move(partition_point, replicas.end(), + std::back_inserter(discarded_replicas)); + replicas.erase(partition_point, replicas.end()); + } + + return discarded_replicas; + } }; static constexpr size_t kNumShards = 1024; // Number of metadata shards @@ -383,6 +404,7 @@ class MasterService { mutable Mutex mutex; std::unordered_map metadata GUARDED_BY(mutex); + std::unordered_set processing_keys GUARDED_BY(mutex); }; std::array metadata_shards_; @@ -394,6 +416,19 @@ class MasterService { // Helper to clean up stale handles pointing to unmounted segments bool CleanupStaleHandles(ObjectMetadata& metadata); + /** + * @brief Helper to discard expired processing keys. + */ + void DiscardExpiredProcessingKeys( + MetadataShard& shard, const std::chrono::steady_clock::time_point& now); + + /** + * @brief Helper to release space of expired discarded replicas. 
+ * @return Number of released objects that have memory replicas + */ + uint64_t ReleaseExpiredDiscardedReplicas( + const std::chrono::steady_clock::time_point& now); + // Eviction thread function void EvictionThreadFunc(); @@ -421,20 +456,29 @@ class MasterService { : service_(service), key_(key), shard_idx_(service_->getShardIndex(key)), - lock_(&service_->metadata_shards_[shard_idx_].mutex), - it_(service_->metadata_shards_[shard_idx_].metadata.find(key)) { + shard_(service_->metadata_shards_[shard_idx_]), + lock_(&shard_.mutex), + it_(shard_.metadata.find(key)), + processing_it_(shard_.processing_keys.find(key)) { // Automatically clean up invalid handles - if (it_ != service_->metadata_shards_[shard_idx_].metadata.end()) { + if (it_ != shard_.metadata.end()) { if (service_->CleanupStaleHandles(it_->second)) { - service_->metadata_shards_[shard_idx_].metadata.erase(it_); - it_ = service_->metadata_shards_[shard_idx_].metadata.end(); + this->Erase(); + + if (processing_it_ != shard_.processing_keys.end()) { + this->EraseFromProcessing(); + } } } } // Check if metadata exists bool Exists() const NO_THREAD_SAFETY_ANALYSIS { - return it_ != service_->metadata_shards_[shard_idx_].metadata.end(); + return it_ != shard_.metadata.end(); + } + + bool InProcessing() const NO_THREAD_SAFETY_ANALYSIS { + return processing_it_ != shard_.processing_keys.end(); } // Get metadata (only call when Exists() is true) @@ -442,16 +486,23 @@ class MasterService { // Delete current metadata (for PutRevoke or Remove operations) void Erase() NO_THREAD_SAFETY_ANALYSIS { - service_->metadata_shards_[shard_idx_].metadata.erase(it_); - it_ = service_->metadata_shards_[shard_idx_].metadata.end(); + shard_.metadata.erase(it_); + it_ = shard_.metadata.end(); + } + + void EraseFromProcessing() NO_THREAD_SAFETY_ANALYSIS { + shard_.processing_keys.erase(processing_it_); + processing_it_ = shard_.processing_keys.end(); } private: MasterService* service_; std::string key_; size_t shard_idx_; + 
MetadataShard& shard_; MutexLocker lock_; std::unordered_map::iterator it_; + std::unordered_set::iterator processing_it_; }; friend class MetadataAccessor; @@ -493,6 +544,43 @@ class MasterService { SegmentManager segment_manager_; BufferAllocatorType memory_allocator_type_; std::shared_ptr allocation_strategy_; + + // Discarded replicas management + const std::chrono::seconds put_start_discard_timeout_sec_; + const std::chrono::seconds put_start_release_timeout_sec_; + class DiscardedReplicas { + public: + DiscardedReplicas() = delete; + + DiscardedReplicas(std::vector&& replicas, + std::chrono::steady_clock::time_point ttl) + : replicas_(std::move(replicas)), ttl_(ttl), mem_size_(0) { + for (auto& replica : replicas_) { + mem_size_ += replica.get_memory_buffer_size(); + } + MasterMetricManager::instance().inc_put_start_discard_cnt( + 1, mem_size_); + } + + ~DiscardedReplicas() { + MasterMetricManager::instance().inc_put_start_release_cnt( + 1, mem_size_); + } + + uint64_t memSize() const { return mem_size_; } + + bool isExpired(const std::chrono::steady_clock::time_point& now) const { + return ttl_ <= now; + } + + private: + std::vector replicas_; + std::chrono::steady_clock::time_point ttl_; + uint64_t mem_size_; + }; + std::mutex discarded_replicas_mutex_; + std::list discarded_replicas_ + GUARDED_BY(discarded_replicas_mutex_); }; -} // namespace mooncake +} // namespace mooncake \ No newline at end of file diff --git a/mooncake-store/include/replica.h b/mooncake-store/include/replica.h index c37e6149a..740d0f6df 100644 --- a/mooncake-store/include/replica.h +++ b/mooncake-store/include/replica.h @@ -87,7 +87,7 @@ struct ReplicateConfig { }; struct MemoryReplicaData { - std::vector> buffers; + std::unique_ptr buffer; }; struct DiskReplicaData { @@ -96,8 +96,8 @@ struct DiskReplicaData { }; struct MemoryDescriptor { - std::vector buffer_descriptors; - YLT_REFL(MemoryDescriptor, buffer_descriptors); + AllocatedBuffer::Descriptor buffer_descriptor; + 
YLT_REFL(MemoryDescriptor, buffer_descriptor); }; struct DiskDescriptor { @@ -111,14 +111,57 @@ class Replica { struct Descriptor; // memory replica constructor - Replica(std::vector> buffers, - ReplicaStatus status) - : data_(MemoryReplicaData{std::move(buffers)}), status_(status) {} + Replica(std::unique_ptr buffer, ReplicaStatus status) + : data_(MemoryReplicaData{std::move(buffer)}), status_(status) {} // disk replica constructor Replica(std::string file_path, uint64_t object_size, ReplicaStatus status) : data_(DiskReplicaData{std::move(file_path), object_size}), - status_(status) {} + status_(status) { + // Automatic update allocated_file_size via RAII + MasterMetricManager::instance().inc_allocated_file_size(object_size); + } + + ~Replica() { + if (status_ != ReplicaStatus::UNDEFINED && is_disk_replica()) { + const auto& disk_data = std::get(data_); + MasterMetricManager::instance().dec_allocated_file_size( + disk_data.object_size); + } + } + + // Copy-construction is not allowed. + Replica(const Replica&) = delete; + Replica& operator=(const Replica&) = delete; + + // Move-construction is allowed. + Replica(Replica&& src) noexcept + : data_(std::move(src.data_)), status_(src.status_) { + // Mark the source as moved-from so its destructor doesn't + // double-decrement metrics. + src.status_ = ReplicaStatus::UNDEFINED; + } + + Replica& operator=(Replica&& src) noexcept { + if (this == &src) { + // Same object, skip moving. + return *this; + } + + // Decrement metric for the current object before overwriting. + if (status_ != ReplicaStatus::UNDEFINED && is_disk_replica()) { + const auto& disk_data = std::get(data_); + MasterMetricManager::instance().dec_allocated_file_size( + disk_data.object_size); + } + + data_ = std::move(src.data_); + status_ = src.status_; + // Mark src as moved-from. 
+ src.status_ = ReplicaStatus::UNDEFINED; + + return *this; + } [[nodiscard]] Descriptor get_descriptor() const; @@ -139,15 +182,21 @@ class Replica { [[nodiscard]] bool has_invalid_mem_handle() const { if (is_memory_replica()) { const auto& mem_data = std::get(data_); - return std::any_of( - mem_data.buffers.begin(), mem_data.buffers.end(), - [](const std::unique_ptr& buf_ptr) { - return !buf_ptr->isAllocatorValid(); - }); + return !mem_data.buffer->isAllocatorValid(); } return false; // DiskReplicaData does not have handles } + [[nodiscard]] size_t get_memory_buffer_size() const { + if (is_memory_replica()) { + const auto& mem_data = std::get(data_); + return mem_data.buffer->size(); + } else { + LOG(ERROR) << "Invalid replica type: " << type(); + return 0; + } + } + [[nodiscard]] std::vector> get_segment_names() const; @@ -237,12 +286,13 @@ inline Replica::Descriptor Replica::get_descriptor() const { if (is_memory_replica()) { const auto& mem_data = std::get(data_); MemoryDescriptor mem_desc; - mem_desc.buffer_descriptors.reserve(mem_data.buffers.size()); - for (const auto& buf_ptr : mem_data.buffers) { - if (buf_ptr) { - mem_desc.buffer_descriptors.push_back( - buf_ptr->get_descriptor()); - } + if (mem_data.buffer) { + mem_desc.buffer_descriptor = mem_data.buffer->get_descriptor(); + } else { + mem_desc.buffer_descriptor.size_ = 0; + mem_desc.buffer_descriptor.buffer_address_ = 0; + mem_desc.buffer_descriptor.transport_endpoint_ = ""; + LOG(ERROR) << "Trying to get invalid memory replica descriptor"; } desc.descriptor_variant = std::move(mem_desc); } else if (is_disk_replica()) { @@ -260,15 +310,11 @@ inline std::vector> Replica::get_segment_names() const { if (is_memory_replica()) { const auto& mem_data = std::get(data_); - std::vector> segment_names( - mem_data.buffers.size()); - for (size_t i = 0; i < mem_data.buffers.size(); ++i) { - if (mem_data.buffers[i] && - mem_data.buffers[i]->isAllocatorValid()) { - segment_names[i] = 
mem_data.buffers[i]->getSegmentName(); - } else { - segment_names[i] = std::nullopt; - } + std::vector> segment_names; + if (mem_data.buffer && mem_data.buffer->isAllocatorValid()) { + segment_names.push_back(mem_data.buffer->getSegmentName()); + } else { + segment_names.push_back(std::nullopt); } return segment_names; } @@ -281,10 +327,8 @@ inline std::ostream& operator<<(std::ostream& os, const Replica& replica) { if (replica.is_memory_replica()) { const auto& mem_data = std::get(replica.data_); os << "type: MEMORY, buffers: ["; - for (const auto& buf_ptr : mem_data.buffers) { - if (buf_ptr) { - os << *buf_ptr; - } + if (mem_data.buffer) { + os << *mem_data.buffer; } os << "]"; } else if (replica.is_disk_replica()) { diff --git a/mooncake-store/include/rpc_service.h b/mooncake-store/include/rpc_service.h index 355e444c1..95d32ffa1 100644 --- a/mooncake-store/include/rpc_service.h +++ b/mooncake-store/include/rpc_service.h @@ -26,6 +26,9 @@ class WrappedMasterService { tl::expected ExistKey(const std::string& key); + tl::expected + CalcCacheStats(); + std::vector> BatchExistKey( const std::vector& keys); @@ -41,25 +44,27 @@ class WrappedMasterService { BatchGetReplicaList(const std::vector& keys); tl::expected, ErrorCode> PutStart( - const std::string& key, const std::vector& slice_lengths, - const ReplicateConfig& config); + const UUID& client_id, const std::string& key, + const uint64_t slice_length, const ReplicateConfig& config); - tl::expected PutEnd(const std::string& key, + tl::expected PutEnd(const UUID& client_id, + const std::string& key, ReplicaType replica_type); - tl::expected PutRevoke(const std::string& key, + tl::expected PutRevoke(const UUID& client_id, + const std::string& key, ReplicaType replica_type); std::vector, ErrorCode>> - BatchPutStart(const std::vector& keys, - const std::vector>& slice_lengths, + BatchPutStart(const UUID& client_id, const std::vector& keys, + const std::vector& slice_lengths, const ReplicateConfig& config); 
std::vector> BatchPutEnd( - const std::vector& keys); + const UUID& client_id, const std::vector& keys); std::vector> BatchPutRevoke( - const std::vector& keys); + const UUID& client_id, const std::vector& keys); tl::expected Remove(const std::string& key); @@ -92,4 +97,4 @@ class WrappedMasterService { void RegisterRpcService(coro_rpc::coro_rpc_server& server, mooncake::WrappedMasterService& wrapped_master_service); -} // namespace mooncake +} // namespace mooncake \ No newline at end of file diff --git a/mooncake-store/include/transfer_task.h b/mooncake-store/include/transfer_task.h index a97b47bc8..6b8fc82c3 100644 --- a/mooncake-store/include/transfer_task.h +++ b/mooncake-store/include/transfer_task.h @@ -400,36 +400,35 @@ class TransferSubmitter { /** * @brief Select the optimal transfer strategy */ - TransferStrategy selectStrategy( - const std::vector& handles, - const std::vector& slices) const; + TransferStrategy selectStrategy(const AllocatedBuffer::Descriptor& handle, + const std::vector& slices) const; /** * @brief Check if all handles refer to local segments */ - bool isLocalTransfer( - const std::vector& handles) const; + bool isLocalTransfer(const AllocatedBuffer::Descriptor& handle) const; /** * @brief Validate transfer parameters */ - bool validateTransferParams( - const std::vector& handles, - const std::vector& slices, bool is_multi_buffers = false) const; + bool validateTransferParams(const AllocatedBuffer::Descriptor& handle, + const std::vector& slices) const; /** * @brief Submit memcpy operation asynchronously */ std::optional submitMemcpyOperation( - const std::vector& handles, - std::vector& slices, TransferRequest::OpCode op_code); + const AllocatedBuffer::Descriptor& handle, + const std::vector& slices, + const TransferRequest::OpCode op_code); /** * @brief Submit transfer engine operation asynchronously */ std::optional submitTransferEngineOperation( - const std::vector& handles, - std::vector& slices, TransferRequest::OpCode op_code); 
+ const AllocatedBuffer::Descriptor& handle, + const std::vector& slices, + const TransferRequest::OpCode op_code); std::optional submitFileReadOperation( const Replica::Descriptor& replica, std::vector& slices, diff --git a/mooncake-store/include/types.h b/mooncake-store/include/types.h index 5f9f540ca..100cc13de 100644 --- a/mooncake-store/include/types.h +++ b/mooncake-store/include/types.h @@ -41,6 +41,9 @@ static const int64_t DEFAULT_GLOBAL_FILE_SEGMENT_SIZE = static const std::string PUT_NO_SPACE_HELPER_STR = // A helpful string " due to insufficient space. Consider lowering " "eviction_high_watermark_ratio or mounting more segments."; +static constexpr uint64_t DEFAULT_PUT_START_DISCARD_TIMEOUT = 30; // 30 seconds +static constexpr uint64_t DEFAULT_PUT_START_RELEASE_TIMEOUT = + 600; // 10 minutes // Forward declarations class BufferAllocatorBase; @@ -113,6 +116,7 @@ enum class ErrorCode : int32_t { // Parameter errors (Range: -600 to -699) INVALID_PARAMS = -600, ///< Invalid parameters. + ILLEGAL_CLIENT = -601, ///< Illegal client to do the operation. // Engine operation errors (Range: -700 to -799) INVALID_WRITE = -700, ///< Invalid write operation. 
diff --git a/mooncake-store/src/CMakeLists.txt b/mooncake-store/src/CMakeLists.txt index 385c84045..7d6300909 100644 --- a/mooncake-store/src/CMakeLists.txt +++ b/mooncake-store/src/CMakeLists.txt @@ -48,6 +48,10 @@ if (STORE_USE_ETCD) add_dependencies(mooncake_store build_etcd_wrapper) endif() +if (BUILD_SHARED_LIBS) + install(TARGETS mooncake_store DESTINATION lib) +endif() + # Master binary add_executable(mooncake_master master.cpp) @@ -66,6 +70,7 @@ target_link_libraries(mooncake_master PRIVATE mooncake_store cachelib_memory_allocator pthread + ibverbs mooncake_common ${ETCD_WRAPPER_LIB} ${MASTER_EXTRA_LIBS} @@ -75,4 +80,4 @@ if (STORE_USE_ETCD) add_dependencies(mooncake_master build_etcd_wrapper) endif() -install(TARGETS mooncake_master DESTINATION bin) \ No newline at end of file +install(TARGETS mooncake_master DESTINATION bin) diff --git a/mooncake-store/src/allocator.cpp b/mooncake-store/src/allocator.cpp index 18ac7e00a..6bb951bcc 100644 --- a/mooncake-store/src/allocator.cpp +++ b/mooncake-store/src/allocator.cpp @@ -18,13 +18,13 @@ std::string AllocatedBuffer::getSegmentName() const noexcept { } AllocatedBuffer::~AllocatedBuffer() { + // Note: This is an edge case. If the 'weak_ptr' is released, the segment + // has already been deallocated at this point, and its memory usage details + // (capacity/allocated) no longer need to be maintained. 
auto alloc = allocator_.lock(); if (alloc) { alloc->deallocate(this); VLOG(1) << "buf_handle_deallocated size=" << size_; - } else { - MasterMetricManager::instance().dec_allocated_mem_size(size_); - VLOG(1) << "allocator=expired_or_null in buf_handle_destructor"; } } @@ -92,7 +92,10 @@ CachelibBufferAllocator::CachelibBufferAllocator(std::string segment_name, << static_cast(pool_id_); } -CachelibBufferAllocator::~CachelibBufferAllocator() = default; +CachelibBufferAllocator::~CachelibBufferAllocator() { + MasterMetricManager::instance().dec_allocated_mem_size(segment_name_, + cur_size_); +}; std::unique_ptr CachelibBufferAllocator::allocate( size_t size) { @@ -117,7 +120,7 @@ std::unique_ptr CachelibBufferAllocator::allocate( VLOG(1) << "allocation_succeeded size=" << size << " segment=" << segment_name_ << " address=" << buffer; cur_size_.fetch_add(size); - MasterMetricManager::instance().inc_allocated_mem_size(size); + MasterMetricManager::instance().inc_allocated_mem_size(segment_name_, size); return std::make_unique(shared_from_this(), buffer, size); } @@ -128,7 +131,8 @@ void CachelibBufferAllocator::deallocate(AllocatedBuffer* handle) { size_t freed_size = handle->size_; // Store size before handle might become invalid cur_size_.fetch_sub(freed_size); - MasterMetricManager::instance().dec_allocated_mem_size(freed_size); + MasterMetricManager::instance().dec_allocated_mem_size(segment_name_, + freed_size); VLOG(1) << "deallocation_succeeded address=" << handle->buffer_ptr_ << " size=" << freed_size << " segment=" << segment_name_; } catch (const std::exception& e) { @@ -180,7 +184,10 @@ OffsetBufferAllocator::OffsetBufferAllocator(std::string segment_name, } } -OffsetBufferAllocator::~OffsetBufferAllocator() = default; +OffsetBufferAllocator::~OffsetBufferAllocator() { + MasterMetricManager::instance().dec_allocated_mem_size(segment_name_, + cur_size_); +}; std::unique_ptr OffsetBufferAllocator::allocate(size_t size) { if (!offset_allocator_) { @@ -217,7 
+224,7 @@ std::unique_ptr OffsetBufferAllocator::allocate(size_t size) { } cur_size_.fetch_add(size); - MasterMetricManager::instance().inc_allocated_mem_size(size); + MasterMetricManager::instance().inc_allocated_mem_size(segment_name_, size); return allocated_buffer; } @@ -228,7 +235,8 @@ void OffsetBufferAllocator::deallocate(AllocatedBuffer* handle) { size_t freed_size = handle->size(); handle->offset_handle_.reset(); cur_size_.fetch_sub(freed_size); - MasterMetricManager::instance().dec_allocated_mem_size(freed_size); + MasterMetricManager::instance().dec_allocated_mem_size(segment_name_, + freed_size); VLOG(1) << "deallocation_succeeded address=" << handle->data() << " size=" << freed_size << " segment=" << segment_name_; } catch (const std::exception& e) { diff --git a/mooncake-store/src/client.cpp b/mooncake-store/src/client.cpp index 52bf75215..79a705ea3 100644 --- a/mooncake-store/src/client.cpp +++ b/mooncake-store/src/client.cpp @@ -36,12 +36,13 @@ namespace mooncake { Client::Client(const std::string& local_hostname, const std::string& metadata_connstring) - : metrics_(ClientMetric::Create()), - master_client_(metrics_ ? &metrics_->master_client_metric : nullptr), + : client_id_(generate_uuid()), + metrics_(ClientMetric::Create()), + master_client_(client_id_, + metrics_ ? 
&metrics_->master_client_metric : nullptr), local_hostname_(local_hostname), metadata_connstring_(metadata_connstring), write_thread_pool_(2) { - client_id_ = generate_uuid(); LOG(INFO) << "client_id=" << client_id_; if (metrics_) { @@ -126,16 +127,10 @@ static inline void rtrim(std::string& s) { s.end()); } -static std::vector get_auto_discover_filters(bool auto_discover) { +static std::vector get_auto_discover_filters() { std::vector whitelst_filters; char* ev_ad = std::getenv("MC_MS_FILTERS"); if (ev_ad) { - if (!auto_discover) { - LOG(WARNING) - << "auto discovery not set, but find whitelist filters: " - << ev_ad; - return whitelst_filters; - } LOG(INFO) << "whitelist filters: " << ev_ad; char delimiter = ','; char* end = ev_ad + std::strlen(ev_ad); @@ -233,130 +228,132 @@ ErrorCode Client::InitTransferEngine( const std::string& local_hostname, const std::string& metadata_connstring, const std::string& protocol, const std::optional& device_names) { - if (!te_initialized_) { - // get auto_discover and filters from env - std::optional env_auto_discover = get_auto_discover(); - bool auto_discover = false; - if (env_auto_discover.has_value()) { - // Use user-specified auto-discover setting - auto_discover = env_auto_discover.value(); - } else { - // Enable auto-discover for RDMA if no devices are specified - if (protocol == "rdma" && !device_names.has_value()) { - LOG(INFO) - << "Set auto discovery ON by default for RDMA protocol, " - "since no " - "device names provided"; - auto_discover = true; - } + // get auto_discover and filters from env + std::optional env_auto_discover = get_auto_discover(); + bool auto_discover = false; + if (env_auto_discover.has_value()) { + // Use user-specified auto-discover setting + auto_discover = env_auto_discover.value(); + } else { + // Enable auto-discover for RDMA if no devices are specified + if (protocol == "rdma" && !device_names.has_value()) { + LOG(INFO) << "Set auto discovery ON by default for RDMA protocol, " + "since no 
" + "device names provided"; + auto_discover = true; + } + } + transfer_engine_->setAutoDiscover(auto_discover); + + // Honor filters when auto-discovery is enabled; otherwise warn once + if (auto_discover) { + LOG(INFO) << "Transfer engine auto discovery is enabled for protocol: " + << protocol; + auto filters = get_auto_discover_filters(); + transfer_engine_->setWhitelistFilters(std::move(filters)); + } else { + const char* env_filters = std::getenv("MC_MS_FILTERS"); + if (env_filters && *env_filters != '\0') { + LOG(WARNING) + << "MC_MS_FILTERS is set but auto discovery is disabled; " + << "ignoring whitelist: " << env_filters; } - transfer_engine_->setAutoDiscover(auto_discover); + } - auto [hostname, port] = parseHostNameWithPort(local_hostname); - int rc = transfer_engine_->init(metadata_connstring, local_hostname, - hostname, port); - if (rc != 0) { - LOG(ERROR) << "Failed to initialize transfer engine, rc=" << rc; - return ErrorCode::INTERNAL_ERROR; - } + auto [hostname, port] = parseHostNameWithPort(local_hostname); + int rc = transfer_engine_->init(metadata_connstring, local_hostname, + hostname, port); + if (rc != 0) { + LOG(ERROR) << "Failed to initialize transfer engine, rc=" << rc; + return ErrorCode::INTERNAL_ERROR; + } - if (auto_discover) { - LOG(INFO) - << "Transfer engine auto discovery is enabled for protocol: " - << protocol; - auto filters = get_auto_discover_filters(auto_discover); - transfer_engine_->setWhitelistFilters(std::move(filters)); - } else { - LOG(INFO) - << "Transfer engine auto discovery is disabled for protocol: " - << protocol; + if (!auto_discover) { + LOG(INFO) << "Transfer engine auto discovery is disabled for protocol: " + << protocol; - Transport* transport = nullptr; + Transport* transport = nullptr; - if (protocol == "rdma") { - if (!device_names.has_value() || device_names->empty()) { - LOG(ERROR) - << "RDMA protocol requires device names when auto " - "discovery is disabled"; - return ErrorCode::INVALID_PARAMS; - } + 
if (protocol == "rdma") { + if (!device_names.has_value() || device_names->empty()) { + LOG(ERROR) << "RDMA protocol requires device names when auto " + "discovery is disabled"; + return ErrorCode::INVALID_PARAMS; + } - LOG(INFO) << "Using specified RDMA devices: " - << device_names.value(); + LOG(INFO) << "Using specified RDMA devices: " + << device_names.value(); - std::vector devices = - splitString(device_names.value(), ',', /*skip_empty=*/true); + std::vector devices = + splitString(device_names.value(), ',', /*skip_empty=*/true); - // Manually discover topology with specified devices only - auto topology = transfer_engine_->getLocalTopology(); - if (topology) { - topology->discover(devices); - LOG(INFO) << "Topology discovery complete with specified " - "devices. Found " - << topology->getHcaList().size() << " HCAs"; - } + // Manually discover topology with specified devices only + auto topology = transfer_engine_->getLocalTopology(); + if (topology) { + topology->discover(devices); + LOG(INFO) << "Topology discovery complete with specified " + "devices. 
Found " + << topology->getHcaList().size() << " HCAs"; + } - transport = transfer_engine_->installTransport("rdma", nullptr); - if (!transport) { - LOG(ERROR) - << "Failed to install RDMA transport with specified " - "devices"; - return ErrorCode::INTERNAL_ERROR; - } - } else if (protocol == "tcp") { - if (device_names.has_value()) { - LOG(WARNING) - << "TCP protocol does not use device names, ignoring"; - } + transport = transfer_engine_->installTransport("rdma", nullptr); + if (!transport) { + LOG(ERROR) << "Failed to install RDMA transport with specified " + "devices"; + return ErrorCode::INTERNAL_ERROR; + } + } else if (protocol == "tcp") { + if (device_names.has_value()) { + LOG(WARNING) + << "TCP protocol does not use device names, ignoring"; + } - try { - transport = - transfer_engine_->installTransport("tcp", nullptr); - } catch (std::exception& e) { - LOG(ERROR) - << "tcp_transport_install_failed error_message=\"" - << e.what() << "\""; - return ErrorCode::INTERNAL_ERROR; - } + try { + transport = transfer_engine_->installTransport("tcp", nullptr); + } catch (std::exception& e) { + LOG(ERROR) << "tcp_transport_install_failed error_message=\"" + << e.what() << "\""; + return ErrorCode::INTERNAL_ERROR; + } - if (!transport) { - LOG(ERROR) << "Failed to install TCP transport"; - return ErrorCode::INTERNAL_ERROR; - } - } else if (protocol == "ascend") { - if (device_names.has_value()) { - LOG(WARNING) << "Ascend protocol does not use device " - "names, ignoring"; - } - try { - transport = - transfer_engine_->installTransport("ascend", nullptr); - } catch (std::exception& e) { - LOG(ERROR) - << "ascend_transport_install_failed error_message=\"" - << e.what() << "\""; - return ErrorCode::INTERNAL_ERROR; - } + if (!transport) { + LOG(ERROR) << "Failed to install TCP transport"; + return ErrorCode::INTERNAL_ERROR; + } + } else if (protocol == "ascend") { + if (device_names.has_value()) { + LOG(WARNING) << "Ascend protocol does not use device " + "names, ignoring"; 
+ } + try { + transport = + transfer_engine_->installTransport("ascend", nullptr); + } catch (std::exception& e) { + LOG(ERROR) << "ascend_transport_install_failed error_message=\"" + << e.what() << "\""; + return ErrorCode::INTERNAL_ERROR; + } - if (!transport) { - LOG(ERROR) << "Failed to install Ascend transport"; - return ErrorCode::INTERNAL_ERROR; - } - } else { - LOG(ERROR) << "unsupported_protocol protocol=" << protocol; - return ErrorCode::INVALID_PARAMS; + if (!transport) { + LOG(ERROR) << "Failed to install Ascend transport"; + return ErrorCode::INTERNAL_ERROR; } + } else { + LOG(ERROR) << "unsupported_protocol protocol=" << protocol; + return ErrorCode::INVALID_PARAMS; } } + return ErrorCode::OK; +} + +void Client::InitTransferSubmitter() { // Initialize TransferSubmitter after transfer engine is ready // Keep using logical local_hostname for name-based behaviors; endpoint is // used separately where needed. transfer_submitter_ = std::make_unique( *transfer_engine_, storage_backend_, metrics_ ? &metrics_->transfer_metric : nullptr); - - return ErrorCode::OK; } std::optional> Client::Create( @@ -397,18 +394,20 @@ std::optional> Client::Create( // Initialize transfer engine if (transfer_engine == nullptr) { client->transfer_engine_ = std::make_shared(); + err = client->InitTransferEngine(local_hostname, metadata_connstring, + protocol, device_names); + if (err != ErrorCode::OK) { + LOG(ERROR) << "Failed to initialize transfer engine"; + return std::nullopt; + } } else { client->transfer_engine_ = transfer_engine; - client->te_initialized_ = true; - LOG(INFO) << "Use exist transfer engine instance"; - } - err = client->InitTransferEngine(local_hostname, metadata_connstring, - protocol, device_names); - if (err != ErrorCode::OK) { - LOG(ERROR) << "Failed to initialize transfer engine"; - return std::nullopt; + LOG(INFO) << "Use existing transfer engine instance. 
Skip its " + "initialization."; } + client->InitTransferSubmitter(); + return client; } @@ -594,11 +593,11 @@ std::vector> Client::BatchGetWhenPreferSameNode( continue; } auto& memory_descriptor = replica.get_memory_descriptor(); - if (memory_descriptor.buffer_descriptors.empty()) { + if (memory_descriptor.buffer_descriptor.size_ == 0) { results[i] = tl::unexpected(ErrorCode::INVALID_REPLICA); continue; } - auto& buffer_descriptor = memory_descriptor.buffer_descriptors[0]; + auto& buffer_descriptor = memory_descriptor.buffer_descriptor; auto seg = buffer_descriptor.transport_endpoint_; auto& op = seg_to_op_map[seg]; op.replicas.emplace_back(replica); @@ -1245,12 +1244,11 @@ std::vector> Client::BatchPutWhenPreferSameNode( continue; } auto& memory_descriptor = replica.get_memory_descriptor(); - if (memory_descriptor.buffer_descriptors.empty()) { - op.SetError(ErrorCode::INVALID_PARAMS, - "buffer descriptors is empty."); + if (memory_descriptor.buffer_descriptor.size_ == 0) { + op.SetError(ErrorCode::INVALID_PARAMS, "buffer size is 0."); continue; } - auto& buffer_descriptor = memory_descriptor.buffer_descriptors[0]; + auto& buffer_descriptor = memory_descriptor.buffer_descriptor; auto seg = buffer_descriptor.transport_endpoint_; if (seg_to_ops.find(seg) == seg_to_ops.end()) { seg_to_ops.emplace(seg, PutOperation(op.key, op.slices)); @@ -1291,7 +1289,7 @@ std::vector> Client::BatchPutWhenPreferSameNode( WaitForTransfers(merged_ops); for (auto& op : merged_ops) { auto& memory_descriptor = op.replicas[0].get_memory_descriptor(); - auto& buffer_descriptor = memory_descriptor.buffer_descriptors[0]; + auto& buffer_descriptor = memory_descriptor.buffer_descriptor; auto seg = buffer_descriptor.transport_endpoint_; seg_to_ops.at(seg).state = op.state; } @@ -1300,7 +1298,7 @@ std::vector> Client::BatchPutWhenPreferSameNode( continue; } auto& memory_descriptor = op.replicas[0].get_memory_descriptor(); - auto& buffer_descriptor = memory_descriptor.buffer_descriptors[0]; + auto& 
buffer_descriptor = memory_descriptor.buffer_descriptor; auto seg = buffer_descriptor.transport_endpoint_; op.state = seg_to_ops.at(seg).state; auto state = std::make_shared(); @@ -1424,7 +1422,7 @@ tl::expected Client::MountSegment(const void* buffer, segment.te_endpoint = local_hostname_; } - auto mount_result = master_client_.MountSegment(segment, client_id_); + auto mount_result = master_client_.MountSegment(segment); if (!mount_result) { ErrorCode err = mount_result.error(); LOG(ERROR) << "mount_segment_to_master_failed base=" << buffer @@ -1454,8 +1452,7 @@ tl::expected Client::UnmountSegment(const void* buffer, return tl::unexpected(ErrorCode::INVALID_PARAMS); } - auto unmount_result = - master_client_.UnmountSegment(segment->second.id, client_id_); + auto unmount_result = master_client_.UnmountSegment(segment->second.id); if (!unmount_result) { ErrorCode err = unmount_result.error(); LOG(ERROR) << "Failed to unmount segment from master: " @@ -1617,9 +1614,7 @@ ErrorCode Client::TransferRead(const Replica::Descriptor& replica_descriptor, size_t total_size = 0; if (replica_descriptor.is_memory_replica()) { auto& mem_desc = replica_descriptor.get_memory_descriptor(); - for (const auto& handle : mem_desc.buffer_descriptors) { - total_size += handle.size_; - } + total_size = mem_desc.buffer_descriptor.size_; } else { auto& disk_desc = replica_descriptor.get_disk_descriptor(); total_size = disk_desc.object_size; @@ -1657,8 +1652,7 @@ void Client::PingThreadMain(bool is_ha_mode, auto& segment = it.second; segments.emplace_back(segment); } - auto remount_result = - master_client_.ReMountSegment(segments, client_id_); + auto remount_result = master_client_.ReMountSegment(segments); if (!remount_result) { ErrorCode err = remount_result.error(); LOG(ERROR) << "Failed to remount segments: " << err; @@ -1677,7 +1671,7 @@ void Client::PingThreadMain(bool is_ha_mode, } // Ping master - auto ping_result = master_client_.Ping(client_id_); + auto ping_result = 
master_client_.Ping(); if (ping_result) { // Reset ping failure count ping_fail_count = 0; diff --git a/mooncake-store/src/client_buffer.cpp b/mooncake-store/src/client_buffer.cpp index 5acfc2932..23fad52da 100644 --- a/mooncake-store/src/client_buffer.cpp +++ b/mooncake-store/src/client_buffer.cpp @@ -72,36 +72,29 @@ uint64_t calculate_total_size(const Replica::Descriptor& replica) { auto& disk_descriptor = replica.get_disk_descriptor(); total_length = disk_descriptor.object_size; } else { - for (auto& handle : - replica.get_memory_descriptor().buffer_descriptors) { - total_length += handle.size_; - } + total_length = replica.get_memory_descriptor().buffer_descriptor.size_; } return total_length; } int allocateSlices(std::vector& slices, - const Replica::Descriptor& replica, - BufferHandle& buffer_handle) { - uint64_t offset = 0; + const Replica::Descriptor& replica, void* buffer_ptr) { if (replica.is_memory_replica() == false) { // For disk-based replica, split into slices based on file size + uint64_t offset = 0; uint64_t total_length = replica.get_disk_descriptor().object_size; while (offset < total_length) { auto chunk_size = std::min(total_length - offset, kMaxSliceSize); - void* chunk_ptr = static_cast(buffer_handle.ptr()) + offset; + void* chunk_ptr = static_cast(buffer_ptr) + offset; slices.emplace_back(Slice{chunk_ptr, chunk_size}); offset += chunk_size; } } else { // For memory-based replica, split into slices based on buffer // descriptors - for (auto& handle : - replica.get_memory_descriptor().buffer_descriptors) { - void* chunk_ptr = static_cast(buffer_handle.ptr()) + offset; - slices.emplace_back(Slice{chunk_ptr, handle.size_}); - offset += handle.size_; - } + auto& handle = replica.get_memory_descriptor().buffer_descriptor; + void* chunk_ptr = buffer_ptr; + slices.emplace_back(Slice{chunk_ptr, handle.size_}); } return 0; } diff --git a/mooncake-store/src/http_metadata_server.cpp b/mooncake-store/src/http_metadata_server.cpp index 
2dd1bbe80..e35152edf 100644 --- a/mooncake-store/src/http_metadata_server.cpp +++ b/mooncake-store/src/http_metadata_server.cpp @@ -11,7 +11,7 @@ namespace mooncake { HttpMetadataServer::HttpMetadataServer(uint16_t port, const std::string& host) : port_(port), host_(host), - server_(std::make_unique(4, port)), + server_(std::make_unique(4, port, host_)), running_(false) { init_server(); } diff --git a/mooncake-store/src/master.cpp b/mooncake-store/src/master.cpp index b96b52be2..80f886b48 100644 --- a/mooncake-store/src/master.cpp +++ b/mooncake-store/src/master.cpp @@ -86,6 +86,14 @@ DEFINE_int32(http_metadata_server_port, 8080, DEFINE_string(http_metadata_server_host, "0.0.0.0", "Host for HTTP metadata server to bind to"); +DEFINE_uint64(put_start_discard_timeout_sec, + mooncake::DEFAULT_PUT_START_DISCARD_TIMEOUT, + "Timeout for discarding uncompleted PutStart operations"); +DEFINE_uint64(put_start_release_timeout_sec, + mooncake::DEFAULT_PUT_START_RELEASE_TIMEOUT, + "Timeout for releasing space allocated in uncompleted PutStart " + "operations"); + void InitMasterConf(const mooncake::DefaultConfig& default_config, mooncake::MasterConfig& master_config) { // Initialize the master service configuration from the default config @@ -147,6 +155,12 @@ void InitMasterConf(const mooncake::DefaultConfig& default_config, default_config.GetString("http_metadata_server_host", &master_config.http_metadata_server_host, FLAGS_http_metadata_server_host); + default_config.GetUInt64("put_start_discard_timeout_sec", + &master_config.put_start_discard_timeout_sec, + FLAGS_put_start_discard_timeout_sec); + default_config.GetUInt64("put_start_release_timeout_sec", + &master_config.put_start_release_timeout_sec, + FLAGS_put_start_release_timeout_sec); } void LoadConfigFromCmdline(mooncake::MasterConfig& master_config, @@ -303,6 +317,20 @@ void LoadConfigFromCmdline(mooncake::MasterConfig& master_config, master_config.http_metadata_server_host = FLAGS_http_metadata_server_host; } + if 
((google::GetCommandLineFlagInfo("put_start_discard_timeout_sec", + &info) && + !info.is_default) || + !conf_set) { + master_config.put_start_discard_timeout_sec = + FLAGS_put_start_discard_timeout_sec; + } + if ((google::GetCommandLineFlagInfo("put_start_release_timeout_sec", + &info) && + !info.is_default) || + !conf_set) { + master_config.put_start_release_timeout_sec = + FLAGS_put_start_release_timeout_sec; + } } // Function to start HTTP metadata server @@ -404,7 +432,11 @@ int main(int argc, char* argv[]) { << ", http_metadata_server_port=" << master_config.http_metadata_server_port << ", http_metadata_server_host=" - << master_config.http_metadata_server_host; + << master_config.http_metadata_server_host + << ", put_start_discard_timeout_sec=" + << master_config.put_start_discard_timeout_sec + << ", put_start_release_timeout_sec=" + << master_config.put_start_release_timeout_sec; // Start HTTP metadata server if enabled std::unique_ptr http_metadata_server; diff --git a/mooncake-store/src/master_client.cpp b/mooncake-store/src/master_client.cpp index cb9923409..cb5408bd8 100644 --- a/mooncake-store/src/master_client.cpp +++ b/mooncake-store/src/master_client.cpp @@ -13,6 +13,7 @@ #include "rpc_service.h" #include "types.h" #include "utils/scoped_vlog_timer.h" +#include "master_metric_manager.h" namespace mooncake { @@ -34,6 +35,11 @@ struct RpcNameTraits<&WrappedMasterService::GetReplicaList> { static constexpr const char* value = "GetReplicaList"; }; +template <> +struct RpcNameTraits<&WrappedMasterService::CalcCacheStats> { + static constexpr const char* value = "CalcCacheStats"; +}; + template <> struct RpcNameTraits<&WrappedMasterService::GetReplicaListByRegex> { static constexpr const char* value = "GetReplicaListByRegex"; @@ -253,6 +259,12 @@ std::vector> MasterClient::BatchExistKey( return result; } +tl::expected +MasterClient::CalcCacheStats() { + return invoke_rpc<&WrappedMasterService::CalcCacheStats, + MasterMetricManager::CacheHitStatDict>(); +} 
+ tl::expected>, ErrorCode> MasterClient::GetReplicaListByRegex(const std::string& str) { @@ -297,16 +309,14 @@ MasterClient::PutStart(const std::string& key, ScopedVLogTimer timer(1, "MasterClient::PutStart"); timer.LogRequest("key=", key, ", slice_count=", slice_lengths.size()); - // Convert size_t to uint64_t for RPC - std::vector rpc_slice_lengths; - rpc_slice_lengths.reserve(slice_lengths.size()); - for (const auto& length : slice_lengths) { - rpc_slice_lengths.push_back(length); + uint64_t total_slice_length = 0; + for (const auto& slice_length : slice_lengths) { + total_slice_length += slice_length; } auto result = invoke_rpc<&WrappedMasterService::PutStart, std::vector>( - key, rpc_slice_lengths, config); + client_id_, key, total_slice_length, config); timer.LogResponseExpected(result); return result; } @@ -319,9 +329,19 @@ MasterClient::BatchPutStart( ScopedVLogTimer timer(1, "MasterClient::BatchPutStart"); timer.LogRequest("keys_count=", keys.size()); + std::vector total_slice_lengths; + total_slice_lengths.reserve(slice_lengths.size()); + for (const auto& slice_lengths : slice_lengths) { + uint64_t total_slice_length = 0; + for (const auto& slice_length : slice_lengths) { + total_slice_length += slice_length; + } + total_slice_lengths.emplace_back(total_slice_length); + } + auto result = invoke_batch_rpc<&WrappedMasterService::BatchPutStart, std::vector>( - keys.size(), keys, slice_lengths, config); + keys.size(), client_id_, keys, total_slice_lengths, config); timer.LogResponse("result=", result.size(), " operations"); return result; } @@ -331,8 +351,8 @@ tl::expected MasterClient::PutEnd(const std::string& key, ScopedVLogTimer timer(1, "MasterClient::PutEnd"); timer.LogRequest("key=", key); - auto result = - invoke_rpc<&WrappedMasterService::PutEnd, void>(key, replica_type); + auto result = invoke_rpc<&WrappedMasterService::PutEnd, void>( + client_id_, key, replica_type); timer.LogResponseExpected(result); return result; } @@ -343,7 +363,7 @@ 
std::vector> MasterClient::BatchPutEnd( timer.LogRequest("keys_count=", keys.size()); auto result = invoke_batch_rpc<&WrappedMasterService::BatchPutEnd, void>( - keys.size(), keys); + keys.size(), client_id_, keys); timer.LogResponse("result=", result.size(), " operations"); return result; } @@ -353,8 +373,8 @@ tl::expected MasterClient::PutRevoke( ScopedVLogTimer timer(1, "MasterClient::PutRevoke"); timer.LogRequest("key=", key); - auto result = - invoke_rpc<&WrappedMasterService::PutRevoke, void>(key, replica_type); + auto result = invoke_rpc<&WrappedMasterService::PutRevoke, void>( + client_id_, key, replica_type); timer.LogResponseExpected(result); return result; } @@ -365,7 +385,7 @@ std::vector> MasterClient::BatchPutRevoke( timer.LogRequest("keys_count=", keys.size()); auto result = invoke_batch_rpc<&WrappedMasterService::BatchPutRevoke, void>( - keys.size(), keys); + keys.size(), client_id_, keys); timer.LogResponse("result=", result.size(), " operations"); return result; } @@ -399,48 +419,47 @@ tl::expected MasterClient::RemoveAll() { } tl::expected MasterClient::MountSegment( - const Segment& segment, const UUID& client_id) { + const Segment& segment) { ScopedVLogTimer timer(1, "MasterClient::MountSegment"); timer.LogRequest("base=", segment.base, ", size=", segment.size, ", name=", segment.name, ", id=", segment.id, - ", client_id=", client_id); + ", client_id=", client_id_); auto result = invoke_rpc<&WrappedMasterService::MountSegment, void>( - segment, client_id); + segment, client_id_); timer.LogResponseExpected(result); return result; } tl::expected MasterClient::ReMountSegment( - const std::vector& segments, const UUID& client_id) { + const std::vector& segments) { ScopedVLogTimer timer(1, "MasterClient::ReMountSegment"); timer.LogRequest("segments_num=", segments.size(), - ", client_id=", client_id); + ", client_id=", client_id_); auto result = invoke_rpc<&WrappedMasterService::ReMountSegment, void>( - segments, client_id); + segments, client_id_); 
timer.LogResponseExpected(result); return result; } tl::expected MasterClient::UnmountSegment( - const UUID& segment_id, const UUID& client_id) { + const UUID& segment_id) { ScopedVLogTimer timer(1, "MasterClient::UnmountSegment"); - timer.LogRequest("segment_id=", segment_id, ", client_id=", client_id); + timer.LogRequest("segment_id=", segment_id, ", client_id=", client_id_); auto result = invoke_rpc<&WrappedMasterService::UnmountSegment, void>( - segment_id, client_id); + segment_id, client_id_); timer.LogResponseExpected(result); return result; } -tl::expected MasterClient::Ping( - const UUID& client_id) { +tl::expected MasterClient::Ping() { ScopedVLogTimer timer(1, "MasterClient::Ping"); - timer.LogRequest("client_id=", client_id); + timer.LogRequest("client_id=", client_id_); auto result = - invoke_rpc<&WrappedMasterService::Ping, PingResponse>(client_id); + invoke_rpc<&WrappedMasterService::Ping, PingResponse>(client_id_); timer.LogResponseExpected(result); return result; } diff --git a/mooncake-store/src/master_metric_manager.cpp b/mooncake-store/src/master_metric_manager.cpp index 330f1d841..080309134 100644 --- a/mooncake-store/src/master_metric_manager.cpp +++ b/mooncake-store/src/master_metric_manager.cpp @@ -1,8 +1,10 @@ #include "master_metric_manager.h" +#include #include // For std::fixed, std::setprecision #include // For string building during serialization #include // Required by histogram serialization +#include #include "utils.h" @@ -23,11 +25,17 @@ MasterMetricManager::MasterMetricManager() "Total memory bytes currently allocated across all segments"), mem_total_capacity_("master_total_capacity_bytes", "Total memory capacity across all mounted segments"), - file_total_capacity_("master_total_file_capacity_bytes", - "Total capacity for file storage in 3fs/nfs"), + mem_allocated_size_per_segment_( + "segment_allocated_bytes", + "Total memory bytes currently allocated of the segment", {"segment"}), + mem_total_capacity_per_segment_( + 
"segment_total_capacity_bytes", + "Total memory capacity of the mounted segment", {"segment"}), file_allocated_size_( "master_allocated_file_size_bytes", "Total bytes currently allocated for file storage in 3fs/nfs"), + file_total_capacity_("master_total_file_capacity_bytes", + "Total capacity for file storage in 3fs/nfs"), key_count_("master_key_count", "Total number of keys managed by the master"), soft_pin_key_count_( @@ -106,6 +114,19 @@ MasterMetricManager::MasterMetricManager() ping_failures_("master_ping_failures_total", "Total number of failed ping requests"), + // Initialize cache hit rate metrics + mem_cache_hit_nums_("mem_cache_hit_nums_", + "Total number of cache hits in the memory pool"), + file_cache_hit_nums_("file_cache_hit_nums_", + "Total number of cache hits in the ssd"), + mem_cache_nums_("mem_cache_nums_", + "Total number of cached values in the memory pool"), + file_cache_nums_("file_cache_nums_", + "Total number of cached values in the ssd"), + valid_get_nums_("valid_get_nums_", + "Total number of valid get operations"), + total_get_nums_("total_get_nums_", "Total number of get operations"), + // Initialize Batch Request Counters batch_exist_key_requests_( "master_batch_exist_key_requests_total", @@ -189,23 +210,51 @@ MasterMetricManager::MasterMetricManager() evicted_key_count_("master_evicted_key_count", "Total number of keys evicted"), evicted_size_("master_evicted_size_bytes", - "Total bytes of evicted objects") {} + "Total bytes of evicted objects"), + + // Initialize Discarded Replicas Counters + put_start_discard_cnt_("master_put_start_discard_cnt", + "Total number of discarded PutStart operations"), + put_start_release_cnt_("master_put_start_release_cnt", + "Total number of released PutStart operations"), + put_start_discarded_staging_size_( + "master_put_start_discarded_staging_size", + "Total size of memory replicas in discarded but not yet released " + "PutStart operations") {} // --- Metric Interface Methods --- // Memory Storage 
Metrics -void MasterMetricManager::inc_allocated_mem_size(int64_t val) { +void MasterMetricManager::inc_allocated_mem_size(const std::string& segment, + int64_t val) { mem_allocated_size_.inc(val); + if (!segment.empty()) mem_allocated_size_per_segment_.inc({segment}, val); } -void MasterMetricManager::dec_allocated_mem_size(int64_t val) { + +void MasterMetricManager::dec_allocated_mem_size(const std::string& segment, + int64_t val) { mem_allocated_size_.dec(val); + if (!segment.empty()) mem_allocated_size_per_segment_.dec({segment}, val); +} + +void MasterMetricManager::reset_allocated_mem_size() { + mem_allocated_size_.reset(); } -void MasterMetricManager::inc_total_mem_capacity(int64_t val) { +void MasterMetricManager::inc_total_mem_capacity(const std::string& segment, + int64_t val) { mem_total_capacity_.inc(val); + if (!segment.empty()) mem_total_capacity_per_segment_.inc({segment}, val); } -void MasterMetricManager::dec_total_mem_capacity(int64_t val) { + +void MasterMetricManager::dec_total_mem_capacity(const std::string& segment, + int64_t val) { mem_total_capacity_.dec(val); + if (!segment.empty()) mem_total_capacity_per_segment_.dec({segment}, val); +} + +void MasterMetricManager::reset_total_mem_capacity() { + mem_total_capacity_.reset(); } int64_t MasterMetricManager::get_allocated_mem_size() { @@ -225,6 +274,26 @@ double MasterMetricManager::get_global_mem_used_ratio(void) { return allocated / capacity; } +int64_t MasterMetricManager::get_segment_allocated_mem_size( + const std::string& segment) { + return mem_allocated_size_per_segment_.value({segment}); +} + +int64_t MasterMetricManager::get_segment_total_mem_capacity( + const std::string& segment) { + return mem_total_capacity_per_segment_.value({segment}); +} + +double MasterMetricManager::get_segment_mem_used_ratio( + const std::string& segment) { + double allocated = get_segment_allocated_mem_size(segment); + double capacity = get_segment_total_mem_capacity(segment); + if (capacity == 0) { + 
return 0.0; + } + return allocated / capacity; +} + // File Storage Metrics void MasterMetricManager::inc_allocated_file_size(int64_t val) { file_allocated_size_.inc(val); @@ -291,6 +360,32 @@ int64_t MasterMetricManager::get_active_clients() { return active_clients_.value(); } +// cache hit rate metrics +void MasterMetricManager::inc_mem_cache_hit_nums(int64_t val) { + mem_cache_hit_nums_.inc(val); +} +void MasterMetricManager::inc_file_cache_hit_nums(int64_t val) { + file_cache_hit_nums_.inc(val); +} +void MasterMetricManager::inc_mem_cache_nums(int64_t val) { + mem_cache_nums_.inc(val); +} +void MasterMetricManager::inc_file_cache_nums(int64_t val) { + file_cache_nums_.inc(val); +} +void MasterMetricManager::dec_mem_cache_nums(int64_t val) { + mem_cache_nums_.dec(val); +} +void MasterMetricManager::dec_file_cache_nums(int64_t val) { + file_cache_nums_.dec(val); +} +void MasterMetricManager::inc_valid_get_nums(int64_t val) { + valid_get_nums_.inc(val); +} +void MasterMetricManager::inc_total_get_nums(int64_t val) { + total_get_nums_.inc(val); +} + // Operation Statistics (Counters) void MasterMetricManager::inc_exist_key_requests(int64_t val) { exist_key_requests_.inc(val); @@ -439,6 +534,19 @@ void MasterMetricManager::inc_batch_put_revoke_partial_success( batch_put_revoke_failed_items_.inc(failed_items); } +// PutStart Discard Metrics +void MasterMetricManager::inc_put_start_discard_cnt(int64_t count, + int64_t size) { + put_start_discard_cnt_.inc(count); + put_start_discarded_staging_size_.inc(size); +} + +void MasterMetricManager::inc_put_start_release_cnt(int64_t count, + int64_t size) { + put_start_release_cnt_.inc(count); + put_start_discarded_staging_size_.dec(size); +} + int64_t MasterMetricManager::get_put_start_requests() { return put_start_requests_.value(); } @@ -670,6 +778,19 @@ int64_t MasterMetricManager::get_evicted_size() { return evicted_size_.value(); } +// PutStart Discard Metrics Getters +int64_t 
MasterMetricManager::get_put_start_discard_cnt() { + return put_start_discard_cnt_.value(); +} + +int64_t MasterMetricManager::get_put_start_release_cnt() { + return put_start_release_cnt_.value(); +} + +int64_t MasterMetricManager::get_put_start_discarded_staging_size() { + return put_start_discarded_staging_size_.value(); +} + // --- Setters --- void MasterMetricManager::set_enable_ha(bool enable_ha) { enable_ha_ = enable_ha; @@ -692,6 +813,8 @@ std::string MasterMetricManager::serialize_metrics() { // Serialize Gauges serialize_metric(mem_allocated_size_); serialize_metric(mem_total_capacity_); + serialize_metric(mem_allocated_size_per_segment_); + serialize_metric(mem_total_capacity_per_segment_); serialize_metric(file_allocated_size_); serialize_metric(file_total_capacity_); serialize_metric(key_count_); @@ -751,9 +874,77 @@ std::string MasterMetricManager::serialize_metrics() { serialize_metric(evicted_key_count_); serialize_metric(evicted_size_); + // Serialize PutStart Discard Metrics + serialize_metric(put_start_discard_cnt_); + serialize_metric(put_start_release_cnt_); + serialize_metric(put_start_discarded_staging_size_); + return ss.str(); } +MasterMetricManager::CacheHitStatDict +MasterMetricManager::calculate_cache_stats() { + MasterMetricManager::CacheHitStatDict stats_dict; + int64_t mem_cache_hits = mem_cache_hit_nums_.value(); + int64_t ssd_cache_hits = file_cache_hit_nums_.value(); + int64_t mem_total_cache = mem_cache_nums_.value(); + int64_t ssd_total_cache = file_cache_nums_.value(); + + int64_t total_hits = mem_cache_hits + ssd_cache_hits; + int64_t total_cache = mem_total_cache + ssd_total_cache; + + int64_t valid_get_nums = valid_get_nums_.value(); + int64_t total_get_nums = total_get_nums_.value(); + + double mem_hit_rate = 0.0; + if (mem_total_cache > 0) { + mem_hit_rate = static_cast(mem_cache_hits) / + static_cast(mem_total_cache); + mem_hit_rate = std::round(mem_hit_rate * 100.0) / 100.0; + } + + double ssd_hit_rate = 0.0; + if 
(ssd_total_cache > 0) { + ssd_hit_rate = static_cast(ssd_cache_hits) / + static_cast(ssd_total_cache); + ssd_hit_rate = std::round(ssd_hit_rate * 100.0) / 100.0; + } + + double total_hit_rate = 0.0; + if (total_cache > 0) { + total_hit_rate = + static_cast(total_hits) / static_cast(total_cache); + total_hit_rate = std::round(total_hit_rate * 100.0) / 100.0; + } + + double valid_get_rate = 0.0; + if (total_get_nums > 0) { + valid_get_rate = static_cast(valid_get_nums) / + static_cast(total_get_nums); + valid_get_rate = std::round(valid_get_rate * 100.0) / 100.0; + } + + add_stat_to_dict(stats_dict, CacheHitStat::MEMORY_HITS, mem_cache_hits); + add_stat_to_dict(stats_dict, CacheHitStat::SSD_HITS, ssd_cache_hits); + add_stat_to_dict(stats_dict, CacheHitStat::MEMORY_TOTAL, mem_total_cache); + add_stat_to_dict(stats_dict, CacheHitStat::SSD_TOTAL, ssd_total_cache); + add_stat_to_dict(stats_dict, CacheHitStat::MEMORY_HIT_RATE, mem_hit_rate); + add_stat_to_dict(stats_dict, CacheHitStat::SSD_HIT_RATE, ssd_hit_rate); + add_stat_to_dict(stats_dict, CacheHitStat::OVERALL_HIT_RATE, + total_hit_rate); + add_stat_to_dict(stats_dict, CacheHitStat::VALID_GET_RATE, valid_get_rate); + return stats_dict; +} + +void MasterMetricManager::add_stat_to_dict( + MasterMetricManager::CacheHitStatDict& dict, + MasterMetricManager::CacheHitStat type, double value) { + auto it = stat_names_.find(type); + if (it != stat_names_.end()) { + dict[it->first] = value; + } +} + // --- Human-Readable Summary --- std::string MasterMetricManager::get_summary_string() { std::stringstream ss; @@ -832,6 +1023,12 @@ std::string MasterMetricManager::get_summary_string() { int64_t ping = ping_requests_.value(); int64_t ping_fails = ping_failures_.value(); + // Discard counters + int64_t put_start_discard_cnt = put_start_discard_cnt_.value(); + int64_t put_start_release_cnt = put_start_release_cnt_.value(); + int64_t put_start_discarded_staging_size = + put_start_discarded_staging_size_.value(); + // --- Format 
the summary string --- ss << "Mem Storage: " << byte_size_to_string(mem_allocated) << " / " << byte_size_to_string(mem_capacity); @@ -907,6 +1104,11 @@ std::string MasterMetricManager::get_summary_string() { << eviction_attempts << ", " << "keys=" << evicted_key_count << ", " << "size=" << byte_size_to_string(evicted_size); + // Discard summary + ss << " | Discard: " << "Released/Total=" << put_start_release_cnt << "/" + << put_start_discard_cnt << ", StagingSize=" + << byte_size_to_string(put_start_discarded_staging_size); + return ss.str(); } diff --git a/mooncake-store/src/master_service.cpp b/mooncake-store/src/master_service.cpp index 3346ff7a9..56b9294b7 100644 --- a/mooncake-store/src/master_service.cpp +++ b/mooncake-store/src/master_service.cpp @@ -27,7 +27,9 @@ MasterService::MasterService(const MasterServiceConfig& config) global_file_segment_size_(config.global_file_segment_size), segment_manager_(config.memory_allocator), memory_allocator_type_(config.memory_allocator), - allocation_strategy_(std::make_shared()) { + allocation_strategy_(std::make_shared()), + put_start_discard_timeout_sec_(config.put_start_discard_timeout_sec), + put_start_release_timeout_sec_(config.put_start_release_timeout_sec) { if (eviction_ratio_ < 0.0 || eviction_ratio_ > 1.0) { LOG(ERROR) << "Eviction ratio must be between 0.0 and 1.0, " << "current value: " << eviction_ratio_; @@ -41,6 +43,16 @@ MasterService::MasterService(const MasterServiceConfig& config) throw std::invalid_argument("Invalid eviction high watermark ratio"); } + if (put_start_release_timeout_sec_ <= put_start_discard_timeout_sec_) { + LOG(ERROR) << "put_start_release_timeout=" + << put_start_release_timeout_sec_.count() + << " must be larger than put_start_discard_timeout_sec=" + << put_start_discard_timeout_sec_.count(); + throw std::invalid_argument( + "put_start_release_timeout must be larger than " + "put_start_discard_timeout_sec"); + } + eviction_running_ = true; eviction_thread_ = 
std::thread(&MasterService::EvictionThreadFunc, this); VLOG(1) << "action=start_eviction_thread"; @@ -313,6 +325,9 @@ auto MasterService::GetReplicaListByRegex(const std::string& regex_pattern) auto MasterService::GetReplicaList(std::string_view key) -> tl::expected { MetadataAccessor accessor(this, std::string(key)); + + MasterMetricManager::instance().inc_total_get_nums(); + if (!accessor.Exists()) { VLOG(1) << "key=" << key << ", info=object_not_found"; return tl::make_unexpected(ErrorCode::OBJECT_NOT_FOUND); @@ -332,6 +347,12 @@ auto MasterService::GetReplicaList(std::string_view key) return tl::make_unexpected(ErrorCode::REPLICA_IS_NOT_READY); } + if (replica_list[0].is_memory_replica()) { + MasterMetricManager::instance().inc_mem_cache_hit_nums(); + } else if (replica_list[0].is_disk_replica()) { + MasterMetricManager::instance().inc_file_cache_hit_nums(); + } + MasterMetricManager::instance().inc_valid_get_nums(); // Grant a lease to the object so it will not be removed // when the client is reading it. 
metadata.GrantLease(default_kv_lease_ttl_, default_kv_soft_pin_ttl_); @@ -340,44 +361,59 @@ auto MasterService::GetReplicaList(std::string_view key) default_kv_lease_ttl_); } -auto MasterService::PutStart(const std::string& key, - const std::vector& slice_lengths, +auto MasterService::PutStart(const UUID& client_id, const std::string& key, + const uint64_t slice_length, const ReplicateConfig& config) -> tl::expected, ErrorCode> { - if (config.replica_num == 0 || key.empty() || slice_lengths.empty()) { + if (config.replica_num == 0 || key.empty() || slice_length == 0) { LOG(ERROR) << "key=" << key << ", replica_num=" << config.replica_num - << ", slice_count=" << slice_lengths.size() + << ", slice_length=" << slice_length << ", key_size=" << key.size() << ", error=invalid_params"; return tl::make_unexpected(ErrorCode::INVALID_PARAMS); } // Validate slice lengths uint64_t total_length = 0; - for (size_t i = 0; i < slice_lengths.size(); ++i) { - if ((memory_allocator_type_ == BufferAllocatorType::CACHELIB) && - (slice_lengths[i] > kMaxSliceSize)) { - LOG(ERROR) << "key=" << key << ", slice_index=" << i - << ", slice_size=" << slice_lengths[i] - << ", max_size=" << kMaxSliceSize - << ", error=invalid_slice_size"; - return tl::make_unexpected(ErrorCode::INVALID_PARAMS); - } - total_length += slice_lengths[i]; + if ((memory_allocator_type_ == BufferAllocatorType::CACHELIB) && + (slice_length > kMaxSliceSize)) { + LOG(ERROR) << "key=" << key << ", slice_length=" << slice_length + << ", max_size=" << kMaxSliceSize + << ", error=invalid_slice_size"; + return tl::make_unexpected(ErrorCode::INVALID_PARAMS); } + total_length += slice_length; VLOG(1) << "key=" << key << ", value_length=" << total_length - << ", slice_count=" << slice_lengths.size() << ", config=" << config + << ", slice_length=" << slice_length << ", config=" << config << ", action=put_start_begin"; // Lock the shard and check if object already exists size_t shard_idx = getShardIndex(key); MutexLocker 
lock(&metadata_shards_[shard_idx].mutex); + const auto now = std::chrono::steady_clock::now(); auto it = metadata_shards_[shard_idx].metadata.find(key); if (it != metadata_shards_[shard_idx].metadata.end() && !CleanupStaleHandles(it->second)) { - LOG(INFO) << "key=" << key << ", info=object_already_exists"; - return tl::make_unexpected(ErrorCode::OBJECT_ALREADY_EXISTS); + auto& metadata = it->second; + // If the object's PutStart expired and has not completed any + // replicas, we can discard it and allow the new PutStart to + // go. + if (!metadata.HasCompletedReplicas() && + metadata.put_start_time + put_start_discard_timeout_sec_ < now) { + auto replicas = metadata.DiscardProcessingReplicas(); + if (!replicas.empty()) { + std::lock_guard lock(discarded_replicas_mutex_); + discarded_replicas_.emplace_back( + std::move(replicas), + metadata.put_start_time + put_start_release_timeout_sec_); + } + metadata_shards_[shard_idx].processing_keys.erase(key); + metadata_shards_[shard_idx].metadata.erase(it); + } else { + LOG(INFO) << "key=" << key << ", info=object_already_exists"; + return tl::make_unexpected(ErrorCode::OBJECT_ALREADY_EXISTS); + } } // Allocate replicas @@ -389,7 +425,7 @@ auto MasterService::PutStart(const std::string& key, auto& allocators_by_name = allocator_access.getAllocatorsByName(); auto allocation_result = allocation_strategy_->Allocate( - allocators, allocators_by_name, slice_lengths, config); + allocators, allocators_by_name, slice_length, config); if (!allocation_result.has_value()) { VLOG(1) << "Failed to allocate all replicas for key=" << key @@ -422,12 +458,16 @@ auto MasterService::PutStart(const std::string& key, // PutEnd is called. 
metadata_shards_[shard_idx].metadata.emplace( std::piecewise_construct, std::forward_as_tuple(key), - std::forward_as_tuple(total_length, std::move(replicas), + std::forward_as_tuple(client_id, now, total_length, std::move(replicas), config.with_soft_pin)); + // Also insert the metadata into processing set for monitoring. + metadata_shards_[shard_idx].processing_keys.insert(key); + return replica_list; } -auto MasterService::PutEnd(const std::string& key, ReplicaType replica_type) +auto MasterService::PutEnd(const UUID& client_id, const std::string& key, + ReplicaType replica_type) -> tl::expected { MetadataAccessor accessor(this, key); if (!accessor.Exists()) { @@ -436,11 +476,28 @@ auto MasterService::PutEnd(const std::string& key, ReplicaType replica_type) } auto& metadata = accessor.Get(); + if (client_id != metadata.client_id) { + LOG(ERROR) << "Illegal client " << client_id << " to PutEnd key " << key + << ", was PutStart-ed by " << metadata.client_id; + return tl::make_unexpected(ErrorCode::ILLEGAL_CLIENT); + } + for (auto& replica : metadata.replicas) { if (replica.type() == replica_type) { replica.mark_complete(); } } + + // If the object is completed, remove it from the processing set. + if (metadata.IsAllReplicasComplete() && accessor.InProcessing()) { + accessor.EraseFromProcessing(); + } + + if (replica_type == ReplicaType::MEMORY) { + MasterMetricManager::instance().inc_mem_cache_nums(); + } else if (replica_type == ReplicaType::DISK) { + MasterMetricManager::instance().inc_file_cache_nums(); + } // 1. Set lease timeout to now, indicating that the object has no lease // at beginning. 2. If this object has soft pin enabled, set it to be soft // pinned. 
@@ -448,7 +505,8 @@ auto MasterService::PutEnd(const std::string& key, ReplicaType replica_type) return {}; } -auto MasterService::PutRevoke(const std::string& key, ReplicaType replica_type) +auto MasterService::PutRevoke(const UUID& client_id, const std::string& key, + ReplicaType replica_type) -> tl::expected { MetadataAccessor accessor(this, key); if (!accessor.Exists()) { @@ -457,6 +515,12 @@ auto MasterService::PutRevoke(const std::string& key, ReplicaType replica_type) } auto& metadata = accessor.Get(); + if (client_id != metadata.client_id) { + LOG(ERROR) << "Illegal client " << client_id << " to PutRevoke key " + << key << ", was PutStart-ed by " << metadata.client_id; + return tl::make_unexpected(ErrorCode::ILLEGAL_CLIENT); + } + if (auto status = metadata.HasDiffRepStatus(ReplicaStatus::PROCESSING, replica_type)) { LOG(ERROR) << "key=" << key << ", status=" << *status @@ -464,7 +528,19 @@ auto MasterService::PutRevoke(const std::string& key, ReplicaType replica_type) return tl::make_unexpected(ErrorCode::INVALID_WRITE); } + if (replica_type == ReplicaType::MEMORY) { + MasterMetricManager::instance().dec_mem_cache_nums(); + } else if (replica_type == ReplicaType::DISK) { + MasterMetricManager::instance().dec_file_cache_nums(); + } + metadata.EraseReplica(replica_type); + + // If the object is completed, remove it from the processing set. 
+ if (metadata.IsAllReplicasComplete() && accessor.InProcessing()) { + accessor.EraseFromProcessing(); + } + if (metadata.IsValid() == false) { accessor.Erase(); } @@ -472,21 +548,21 @@ auto MasterService::PutRevoke(const std::string& key, ReplicaType replica_type) } std::vector> MasterService::BatchPutEnd( - const std::vector& keys) { + const UUID& client_id, const std::vector& keys) { std::vector> results; results.reserve(keys.size()); for (const auto& key : keys) { - results.emplace_back(PutEnd(key, ReplicaType::MEMORY)); + results.emplace_back(PutEnd(client_id, key, ReplicaType::MEMORY)); } return results; } std::vector> MasterService::BatchPutRevoke( - const std::vector& keys) { + const UUID& client_id, const std::vector& keys) { std::vector> results; results.reserve(keys.size()); for (const auto& key : keys) { - results.emplace_back(PutRevoke(key, ReplicaType::MEMORY)); + results.emplace_back(PutRevoke(client_id, key, ReplicaType::MEMORY)); } return results; } @@ -578,10 +654,11 @@ long MasterService::RemoveAll() { continue; } - // Only remove objects with expired leases + // Only remove completed objects with expired leases auto it = shard.metadata.begin(); while (it != shard.metadata.end()) { - if (it->second.IsLeaseExpired(now)) { + if (it->second.IsLeaseExpired(now) && + it->second.IsAllReplicasComplete()) { total_freed_size += it->second.size * it->second.GetMemReplicaCount(); it = shard.metadata.erase(it); @@ -680,6 +757,81 @@ void MasterService::EvictionThreadFunc() { VLOG(1) << "action=eviction_thread_stopped"; } +void MasterService::DiscardExpiredProcessingKeys( + MetadataShard& shard, const std::chrono::steady_clock::time_point& now) { + std::list discarded_replicas; + + for (auto key_it = shard.processing_keys.begin(); + key_it != shard.processing_keys.end();) { + auto it = shard.metadata.find(*key_it); + if (it == shard.metadata.end()) { + // The key has been removed from metadata. This should be + // impossible. 
+ LOG(ERROR) << "Key " << *key_it + << " was removed while in processing"; + key_it = shard.processing_keys.erase(key_it); + continue; + } + + auto& metadata = it->second; + // If the object is not valid or not in processing state, just + // remove it from the processing set. + if (!metadata.IsValid() || metadata.IsAllReplicasComplete()) { + if (!metadata.IsValid()) { + shard.metadata.erase(it); + } + key_it = shard.processing_keys.erase(key_it); + continue; + } + + // If the object's PutStart timedout, discard and release it's + // space. Note that instead of releasing the space directly, we + // insert the replicas into the discarded list so that the + // discarding and releasing operations can be recorded in + // statistics. + const auto ttl = + metadata.put_start_time + put_start_release_timeout_sec_; + if (ttl < now) { + auto replicas = metadata.DiscardProcessingReplicas(); + if (!replicas.empty()) { + discarded_replicas.emplace_back(std::move(replicas), ttl); + } + + if (!metadata.IsValid()) { + // All replicas of this object are discarded, just + // remove the whole object. 
+ shard.metadata.erase(it); + } + + key_it = shard.processing_keys.erase(key_it); + continue; + } + + key_it++; + } + + if (!discarded_replicas.empty()) { + std::lock_guard lock(discarded_replicas_mutex_); + discarded_replicas_.splice(discarded_replicas_.end(), + std::move(discarded_replicas)); + } +} + +uint64_t MasterService::ReleaseExpiredDiscardedReplicas( + const std::chrono::steady_clock::time_point& now) { + uint64_t released_cnt = 0; + std::lock_guard lock(discarded_replicas_mutex_); + discarded_replicas_.remove_if( + [&now, &released_cnt](const DiscardedReplicas& item) { + const bool expired = item.isExpired(now); + if (expired && item.memSize() > 0) { + released_cnt++; + } + return expired; + }); + return released_cnt; +} + void MasterService::BatchEvict(double evict_ratio_target, double evict_ratio_lowerbound) { if (evict_ratio_target < evict_ratio_lowerbound) { @@ -708,6 +860,10 @@ void MasterService::BatchEvict(double evict_ratio_target, metadata_shards_[(start_idx + i) % metadata_shards_.size()]; MutexLocker lock(&shard.mutex); + // Discard expired processing keys first so that they won't be counted + // in later evictions. + DiscardExpiredProcessingKeys(shard, now); + // object_count must be updated at beginning as it will be used later // to compute ideal_evict_num object_count += shard.metadata.size(); @@ -786,9 +942,13 @@ void MasterService::BatchEvict(double evict_ratio_target, } } + // Try releasing discarded replicas before we decide whether to do the + // second pass. 
+ uint64_t released_discarded_cnt = ReleaseExpiredDiscardedReplicas(now); + // The ideal number of objects to evict in the second pass - long target_evict_num = - std::ceil(object_count * evict_ratio_lowerbound) - evicted_count; + long target_evict_num = std::ceil(object_count * evict_ratio_lowerbound) - + evicted_count - released_discarded_cnt; // The actual number of objects we can evict in the second pass target_evict_num = std::min(target_evict_num, @@ -909,7 +1069,7 @@ void MasterService::BatchEvict(double evict_ratio_target, } } - if (evicted_count > 0) { + if (evicted_count > 0 || released_discarded_cnt > 0) { need_eviction_ = false; MasterMetricManager::instance().inc_eviction_success(evicted_count, total_freed_size); @@ -1052,4 +1212,4 @@ std::string MasterService::ResolvePath(const std::string& key) const { return full_path.lexically_normal().string(); } -} // namespace mooncake +} // namespace mooncake \ No newline at end of file diff --git a/mooncake-store/src/pybind_client.cpp b/mooncake-store/src/pybind_client.cpp index ef8e074cc..9ac7ca519 100644 --- a/mooncake-store/src/pybind_client.cpp +++ b/mooncake-store/src/pybind_client.cpp @@ -193,6 +193,8 @@ tl::expected PyClient::setup_internal( client_buffer_allocator_ = ClientBufferAllocator::create(local_buffer_size, this->protocol); if (local_buffer_size > 0) { + LOG(INFO) << "Registering local memory: " << local_buffer_size + << " bytes"; auto result = client_->RegisterLocalMemory( client_buffer_allocator_->getBase(), local_buffer_size, kWildcardLocation, false, true); @@ -614,7 +616,7 @@ std::shared_ptr PyClient::get_buffer(const std::string &key) { // Create slices for the allocated buffer std::vector slices; - allocateSlices(slices, replica, buffer_handle); + allocateSlices(slices, replica, buffer_handle.ptr()); // Get the object data auto get_result = client_->Get(key, query_result.value(), slices); @@ -691,7 +693,7 @@ std::vector> PyClient::batch_get_buffer_internal( auto buffer_handle = 
std::make_unique(std::move(*alloc_result)); std::vector slices; - allocateSlices(slices, replica, *buffer_handle); + allocateSlices(slices, replica, buffer_handle->ptr()); valid_ops.emplace_back( KeyOp{.original_index = i, @@ -819,23 +821,7 @@ tl::expected PyClient::get_into_internal( // Step 2: Split user buffer according to object info and create // slices std::vector slices; - uint64_t offset = 0; - - if (replica.is_memory_replica() == false) { - while (offset < total_size) { - auto chunk_size = std::min(total_size - offset, kMaxSliceSize); - void *chunk_ptr = static_cast(buffer) + offset; - slices.emplace_back(Slice{chunk_ptr, chunk_size}); - offset += chunk_size; - } - } else { - for (auto &handle : - replica.get_memory_descriptor().buffer_descriptors) { - void *chunk_ptr = static_cast(buffer) + offset; - slices.emplace_back(Slice{chunk_ptr, handle.size_}); - offset += handle.size_; - } - } + allocateSlices(slices, replica, buffer); // Step 3: Read data directly into user buffer auto get_result = client_->Get(key, query_result.value(), slices); @@ -1064,22 +1050,7 @@ std::vector> PyClient::batch_get_into_internal( // Create slices for this key's buffer std::vector key_slices; - uint64_t offset = 0; - if (replica.is_memory_replica() == false) { - while (offset < total_size) { - auto chunk_size = std::min(total_size - offset, kMaxSliceSize); - void *chunk_ptr = static_cast(buffers[i]) + offset; - key_slices.emplace_back(Slice{chunk_ptr, chunk_size}); - offset += chunk_size; - } - } else { - for (auto &handle : - replica.get_memory_descriptor().buffer_descriptors) { - void *chunk_ptr = static_cast(buffers[i]) + offset; - key_slices.emplace_back(Slice{chunk_ptr, handle.size_}); - offset += handle.size_; - } - } + allocateSlices(key_slices, replica, buffers[i]); // Store operation info for batch processing valid_operations.push_back( @@ -1215,8 +1186,8 @@ std::vector PyClient::batch_put_from_multi_buffers( auto duration_call = std::chrono::duration_cast( 
std::chrono::steady_clock::now() - start); - LOG(INFO) << "batch_put_from_multi_buffers: " << duration_call.count() - << "us"; + VLOG(1) << "batch_put_from_multi_buffers: " << duration_call.count() + << " us"; return results; } @@ -1273,8 +1244,8 @@ std::vector PyClient::batch_get_into_multi_buffers( } auto duration_call = std::chrono::duration_cast( std::chrono::steady_clock::now() - start); - LOG(INFO) << "batch_get_into_multi_buffers: " << duration_call.count() - << "us"; + VLOG(1) << "batch_get_into_multi_buffers: " << duration_call.count() + << " us"; return results; } diff --git a/mooncake-store/src/rpc_service.cpp b/mooncake-store/src/rpc_service.cpp index 057e5908b..88159e9e4 100644 --- a/mooncake-store/src/rpc_service.cpp +++ b/mooncake-store/src/rpc_service.cpp @@ -86,13 +86,11 @@ void WrappedMasterService::init_http_server() { if (replicas[i].is_memory_replica()) { auto& memory_descriptors = replicas[i].get_memory_descriptor(); - for (const auto& handle : - memory_descriptors.buffer_descriptors) { - std::string tmp = ""; - struct_json::to_json(handle, tmp); - ss += tmp; - ss += "\n"; - } + std::string tmp = ""; + struct_json::to_json( + memory_descriptors.buffer_descriptor, tmp); + ss += tmp; + ss += "\n"; } } resp.set_status_and_content(status_type::ok, std::move(ss)); @@ -174,6 +172,11 @@ void WrappedMasterService::init_http_server() { LOG(INFO) << "HTTP metrics server started on port " << http_server_.port(); } +tl::expected +WrappedMasterService::CalcCacheStats() { + return MasterMetricManager::instance().calculate_cache_stats(); +} + tl::expected WrappedMasterService::ExistKey( const std::string& key) { return execute_rpc( @@ -291,51 +294,57 @@ WrappedMasterService::BatchGetReplicaList( } tl::expected, ErrorCode> -WrappedMasterService::PutStart(const std::string& key, - const std::vector& slice_lengths, +WrappedMasterService::PutStart(const UUID& client_id, const std::string& key, + const uint64_t slice_length, const ReplicateConfig& config) { 
return execute_rpc( "PutStart", - [&] { return master_service_.PutStart(key, slice_lengths, config); }, + [&] { + return master_service_.PutStart(client_id, key, slice_length, + config); + }, [&](auto& timer) { - timer.LogRequest("key=", key, - ", slice_lengths=", slice_lengths.size()); + timer.LogRequest("client_id=", client_id, ", key=", key, + ", slice_length=", slice_length); }, [&] { MasterMetricManager::instance().inc_put_start_requests(); }, [] { MasterMetricManager::instance().inc_put_start_failures(); }); } tl::expected WrappedMasterService::PutEnd( - const std::string& key, ReplicaType replica_type) { + const UUID& client_id, const std::string& key, ReplicaType replica_type) { return execute_rpc( - "PutEnd", [&] { return master_service_.PutEnd(key, replica_type); }, + "PutEnd", + [&] { return master_service_.PutEnd(client_id, key, replica_type); }, [&](auto& timer) { - timer.LogRequest("key=", key, ", replica_type=", replica_type); + timer.LogRequest("client_id=", client_id, ", key=", key, + ", replica_type=", replica_type); }, [] { MasterMetricManager::instance().inc_put_end_requests(); }, [] { MasterMetricManager::instance().inc_put_end_failures(); }); } tl::expected WrappedMasterService::PutRevoke( - const std::string& key, ReplicaType replica_type) { + const UUID& client_id, const std::string& key, ReplicaType replica_type) { return execute_rpc( "PutRevoke", - [&] { return master_service_.PutRevoke(key, replica_type); }, + [&] { return master_service_.PutRevoke(client_id, key, replica_type); }, [&](auto& timer) { - timer.LogRequest("key=", key, ", replica_type=", replica_type); + timer.LogRequest("client_id=", client_id, ", key=", key, + ", replica_type=", replica_type); }, [] { MasterMetricManager::instance().inc_put_revoke_requests(); }, [] { MasterMetricManager::instance().inc_put_revoke_failures(); }); } std::vector, ErrorCode>> -WrappedMasterService::BatchPutStart( - const std::vector& keys, - const std::vector>& slice_lengths, - const 
ReplicateConfig& config) { +WrappedMasterService::BatchPutStart(const UUID& client_id, + const std::vector& keys, + const std::vector& slice_lengths, + const ReplicateConfig& config) { ScopedVLogTimer timer(1, "BatchPutStart"); const size_t total_keys = keys.size(); - timer.LogRequest("keys_count=", total_keys); + timer.LogRequest("client_id=", client_id, ", keys_count=", total_keys); MasterMetricManager::instance().inc_batch_put_start_requests(total_keys); std::vector, ErrorCode>> @@ -345,24 +354,17 @@ WrappedMasterService::BatchPutStart( if (config.prefer_alloc_in_same_node) { ReplicateConfig new_config = config; for (size_t i = 0; i < keys.size(); ++i) { - auto& slice_lens = slice_lengths[i]; - std::vector alloc_slice_lens; - size_t all_slice_len = 0; - for (auto& slice_len : slice_lens) { - all_slice_len += slice_len; - } - alloc_slice_lens.emplace_back(all_slice_len); - auto result = - master_service_.PutStart(keys[i], alloc_slice_lens, new_config); + auto result = master_service_.PutStart( + client_id, keys[i], slice_lengths[i], new_config); results.emplace_back(result); if ((i == 0) && result.has_value()) { std::string preferred_segment; for (const auto& replica : result.value()) { if (replica.is_memory_replica()) { auto handles = - replica.get_memory_descriptor().buffer_descriptors; - if (!handles.empty()) { - preferred_segment = handles[0].transport_endpoint_; + replica.get_memory_descriptor().buffer_descriptor; + if (!handles.transport_endpoint_.empty()) { + preferred_segment = handles.transport_endpoint_; } } } @@ -373,8 +375,8 @@ WrappedMasterService::BatchPutStart( } } else { for (size_t i = 0; i < keys.size(); ++i) { - results.emplace_back( - master_service_.PutStart(keys[i], slice_lengths[i], config)); + results.emplace_back(master_service_.PutStart( + client_id, keys[i], slice_lengths[i], config)); } } @@ -416,17 +418,18 @@ WrappedMasterService::BatchPutStart( } std::vector> WrappedMasterService::BatchPutEnd( - const std::vector& keys) { + const 
UUID& client_id, const std::vector& keys) { ScopedVLogTimer timer(1, "BatchPutEnd"); const size_t total_keys = keys.size(); - timer.LogRequest("keys_count=", total_keys); + timer.LogRequest("client_id=", client_id, ", keys_count=", total_keys); MasterMetricManager::instance().inc_batch_put_end_requests(total_keys); std::vector> results; results.reserve(keys.size()); for (const auto& key : keys) { - results.emplace_back(master_service_.PutEnd(key, ReplicaType::MEMORY)); + results.emplace_back( + master_service_.PutEnd(client_id, key, ReplicaType::MEMORY)); } size_t failure_count = 0; @@ -454,10 +457,10 @@ std::vector> WrappedMasterService::BatchPutEnd( } std::vector> WrappedMasterService::BatchPutRevoke( - const std::vector& keys) { + const UUID& client_id, const std::vector& keys) { ScopedVLogTimer timer(1, "BatchPutRevoke"); const size_t total_keys = keys.size(); - timer.LogRequest("keys_count=", total_keys); + timer.LogRequest("client_id=", client_id, ", keys_count=", total_keys); MasterMetricManager::instance().inc_batch_put_revoke_requests(total_keys); std::vector> results; @@ -465,7 +468,7 @@ std::vector> WrappedMasterService::BatchPutRevoke( for (const auto& key : keys) { results.emplace_back( - master_service_.PutRevoke(key, ReplicaType::MEMORY)); + master_service_.PutRevoke(client_id, key, ReplicaType::MEMORY)); } size_t failure_count = 0; @@ -633,4 +636,4 @@ void RegisterRpcService( &wrapped_master_service); } -} // namespace mooncake +} // namespace mooncake \ No newline at end of file diff --git a/mooncake-store/src/segment.cpp b/mooncake-store/src/segment.cpp index 8d57f79ef..25ca33da4 100644 --- a/mooncake-store/src/segment.cpp +++ b/mooncake-store/src/segment.cpp @@ -81,7 +81,7 @@ ErrorCode ScopedSegmentAccess::MountSegment(const Segment& segment, segment_manager_->mounted_segments_[segment.id] = { segment, SegmentStatus::OK, std::move(allocator)}; - MasterMetricManager::instance().inc_total_mem_capacity(size); + 
MasterMetricManager::instance().inc_total_mem_capacity(segment.name, size); return ErrorCode::OK; } @@ -199,12 +199,18 @@ ErrorCode ScopedSegmentAccess::CommitUnmountSegment( << ", error=segment_not_found_in_client_segments"; } + // segment_id -> segment_name + std::string segment_name; + auto&& segment = segment_manager_->mounted_segments_.find(segment_id); + if (segment != segment_manager_->mounted_segments_.end()) { + segment_name = segment->second.segment.name; + } // Remove from mounted_segments_ segment_manager_->mounted_segments_.erase(segment_id); // Decrease the total capacity MasterMetricManager::instance().dec_total_mem_capacity( - metrics_dec_capacity); + segment_name, metrics_dec_capacity); return ErrorCode::OK; } diff --git a/mooncake-store/src/transfer_task.cpp b/mooncake-store/src/transfer_task.cpp index 158487d5b..bd67aa0be 100644 --- a/mooncake-store/src/transfer_task.cpp +++ b/mooncake-store/src/transfer_task.cpp @@ -5,6 +5,7 @@ #include #include #include "transfer_engine.h" +#include "transport/transport.h" namespace mooncake { @@ -244,9 +245,11 @@ void TransferEngineOperationState::check_task_status() { case TransferStatusEnum::FAILED: case TransferStatusEnum::CANCELED: case TransferStatusEnum::INVALID: +#ifndef USE_ASCEND_DIRECT LOG(ERROR) << "Transfer failed for batch " << batch_id_ << " task " << i << " with status " << static_cast(status.s); +#endif has_failure = true; break; default: @@ -284,8 +287,6 @@ void TransferEngineOperationState::set_result_internal(ErrorCode error_code) { VLOG(1) << "Setting transfer result for batch " << batch_id_ << " to " << static_cast(error_code); result_.emplace(error_code); - - cv_.notify_all(); } void TransferEngineOperationState::wait_for_completion() { @@ -293,10 +294,57 @@ void TransferEngineOperationState::wait_for_completion() { return; } - VLOG(1) << "Starting transfer engine polling for batch " << batch_id_; constexpr int64_t timeout_seconds = 60; - constexpr int64_t kOneSecondInNano = 1000 * 1000 * 
1000; +#ifdef USE_EVENT_DRIVEN_COMPLETION + VLOG(1) << "Waiting for transfer engine completion for batch " << batch_id_; + + // Wait directly on BatchDesc's condition variable. + auto& batch_desc = Transport::toBatchDesc(batch_id_); + bool completed; + bool failed = false; + + // Fast path: if already finished, avoid taking the mutex and waiting. + // Use acquire here to pair with the writer's release-store, because this + // path may skip taking the mutex. It ensures all prior updates are visible. + completed = batch_desc.is_finished.load(std::memory_order_acquire); + if (!completed) { + // Use the same mutex as the notifier when updating the predicate to + // avoid missed notifications. The predicate is re-checked under the + // lock. Under the mutex, relaxed is sufficient; the mutex acquire + // orders prior writes. + std::unique_lock lock(batch_desc.completion_mutex); + completed = batch_desc.completion_cv.wait_for( + lock, std::chrono::seconds(timeout_seconds), [&batch_desc] { + return batch_desc.is_finished.load(std::memory_order_relaxed); + }); + } // Explicitly release completion_mutex before acquiring mutex_ + + // Once completion is observed, read failure flag. + if (completed) { + failed = batch_desc.has_failure.load(std::memory_order_relaxed); + } + + ErrorCode error_code = + completed ? (failed ? 
ErrorCode::TRANSFER_FAIL : ErrorCode::OK) + : ErrorCode::TRANSFER_FAIL; + + { + std::lock_guard lock(mutex_); + set_result_internal(error_code); + } + + if (completed) { + VLOG(1) << "Transfer engine operation completed for batch " << batch_id_ + << " with result: " << static_cast(error_code); + } else { + LOG(ERROR) << "Failed to complete transfers after " << timeout_seconds + << " seconds for batch " << batch_id_; + } +#else + VLOG(1) << "Starting transfer engine polling for batch " << batch_id_; + + constexpr int64_t kOneSecondInNano = 1000 * 1000 * 1000; const int64_t start_ts = getCurrentTimeInNano(); while (true) { @@ -320,6 +368,7 @@ void TransferEngineOperationState::wait_for_completion() { VLOG(1) << "Transfer engine operation still pending for batch " << batch_id_; } +#endif } // ============================================================================ @@ -392,23 +441,21 @@ std::optional TransferSubmitter::submit( std::optional future; if (replica.is_memory_replica()) { - std::vector handles; auto& mem_desc = replica.get_memory_descriptor(); - handles = mem_desc.buffer_descriptors; + auto& handle = mem_desc.buffer_descriptor; - if (!validateTransferParams(handles, slices)) { + if (!validateTransferParams(handle, slices)) { return std::nullopt; } - TransferStrategy strategy = selectStrategy(handles, slices); + TransferStrategy strategy = selectStrategy(handle, slices); switch (strategy) { case TransferStrategy::LOCAL_MEMCPY: - future = submitMemcpyOperation(handles, slices, op_code); + future = submitMemcpyOperation(handle, slices, op_code); break; case TransferStrategy::TRANSFER_ENGINE: - future = - submitTransferEngineOperation(handles, slices, op_code); + future = submitTransferEngineOperation(handle, slices, op_code); break; default: LOG(ERROR) << "Unknown transfer strategy: " << strategy; @@ -436,11 +483,10 @@ std::optional TransferSubmitter::submit_batch( auto& replica = replicas[i]; auto& slices = all_slices[i]; auto& mem_desc = 
replica.get_memory_descriptor(); - if (!validateTransferParams(mem_desc.buffer_descriptors, slices, - true)) { + if (!validateTransferParams(mem_desc.buffer_descriptor, slices)) { return std::nullopt; } - auto handle = mem_desc.buffer_descriptors[0]; + auto& handle = mem_desc.buffer_descriptor; uint64_t offset = 0; SegmentHandle seg = engine_.openSegment(handle.transport_endpoint_); if (seg == static_cast(ERR_INVALID_ARGUMENT)) { @@ -470,16 +516,17 @@ std::optional TransferSubmitter::submit_batch( } std::optional TransferSubmitter::submitMemcpyOperation( - const std::vector& handles, - std::vector& slices, TransferRequest::OpCode op_code) { + const AllocatedBuffer::Descriptor& handle, const std::vector& slices, + const TransferRequest::OpCode op_code) { auto state = std::make_shared(); // Create memcpy operations std::vector operations; - operations.reserve(handles.size()); + operations.reserve(slices.size()); + uint64_t base_address = static_cast(handle.buffer_address_); + uint64_t offset = 0; - for (size_t i = 0; i < handles.size(); ++i) { - const auto& handle = handles[i]; + for (size_t i = 0; i < slices.size(); ++i) { const auto& slice = slices[i]; if (slice.ptr == nullptr) continue; @@ -491,23 +538,24 @@ std::optional TransferSubmitter::submitMemcpyOperation( // READ: from handle (remote buffer) to slice (local // buffer) dest = slice.ptr; - src = reinterpret_cast(handle.buffer_address_); + src = reinterpret_cast(base_address + offset); } else { // WRITE: from slice (local buffer) to handle (remote // buffer) - dest = reinterpret_cast(handle.buffer_address_); + dest = reinterpret_cast(base_address + offset); src = slice.ptr; } + offset += slice.size; - operations.emplace_back(dest, src, handle.size_); + operations.emplace_back(dest, src, slice.size); } // Submit memcpy operations to worker pool for async execution MemcpyTask task(std::move(operations), state); memcpy_pool_->submitTask(std::move(task)); - VLOG(1) << "Memcpy transfer submitted to worker pool 
with " - << handles.size() << " operations"; + VLOG(1) << "Memcpy transfer submitted to worker pool with " << slices.size() + << " operations"; return TransferFuture(state); } @@ -548,39 +596,39 @@ std::optional TransferSubmitter::submitTransfer( } std::optional TransferSubmitter::submitTransferEngineOperation( - const std::vector& handles, - std::vector& slices, TransferRequest::OpCode op_code) { + const AllocatedBuffer::Descriptor& handle, const std::vector& slices, + const TransferRequest::OpCode op_code) { + if (handle.transport_endpoint_.empty()) { + LOG(ERROR) << "Transport endpoint is empty for handle with address " + << handle.buffer_address_; + return std::nullopt; + } + SegmentHandle seg = engine_.openSegment(handle.transport_endpoint_); + + if (seg == static_cast(ERR_INVALID_ARGUMENT)) { + LOG(ERROR) << "Failed to open segment for endpoint='" + << handle.transport_endpoint_ << "'"; + return std::nullopt; + } + // Create transfer requests std::vector requests; - requests.reserve(handles.size()); + requests.reserve(slices.size()); + uint64_t base_address = static_cast(handle.buffer_address_); + uint64_t offset = 0; - for (size_t i = 0; i < handles.size(); ++i) { - const auto& handle = handles[i]; + for (size_t i = 0; i < slices.size(); ++i) { const auto& slice = slices[i]; - if (slice.ptr == nullptr) continue; - if (handle.transport_endpoint_.empty()) { - LOG(ERROR) << "Transport endpoint is empty for handle with address " - << handle.buffer_address_; - return std::nullopt; - } - - SegmentHandle seg = engine_.openSegment(handle.transport_endpoint_); - - if (seg == static_cast(ERR_INVALID_ARGUMENT)) { - LOG(ERROR) << "Failed to open segment for endpoint='" - << handle.transport_endpoint_ << "'"; - return std::nullopt; - } - TransferRequest request; request.opcode = op_code; request.source = static_cast(slice.ptr); request.target_id = seg; - request.target_offset = handle.buffer_address_; - request.length = handle.size_; + request.target_offset = 
base_address + offset; + request.length = slice.size; + offset += slice.size; requests.emplace_back(request); } return submitTransfer(requests); @@ -604,7 +652,7 @@ std::optional TransferSubmitter::submitFileReadOperation( } TransferStrategy TransferSubmitter::selectStrategy( - const std::vector& handles, + const AllocatedBuffer::Descriptor& handle, const std::vector& slices) const { // Check if memcpy operations are enabled via environment variable if (!memcpy_enabled_) { @@ -614,7 +662,7 @@ TransferStrategy TransferSubmitter::selectStrategy( } // Check conditions for local memcpy optimization - if (isLocalTransfer(handles)) { + if (isLocalTransfer(handle)) { return TransferStrategy::LOCAL_MEMCPY; } @@ -622,15 +670,12 @@ TransferStrategy TransferSubmitter::selectStrategy( } bool TransferSubmitter::isLocalTransfer( - const std::vector& handles) const { + const AllocatedBuffer::Descriptor& handle) const { std::string local_ep = engine_.getLocalIpAndPort(); if (!local_ep.empty()) { - return std::all_of(handles.begin(), handles.end(), - [&local_ep](const auto& h) { - return !h.transport_endpoint_.empty() && - h.transport_endpoint_ == local_ep; - }); + return !handle.transport_endpoint_.empty() && + handle.transport_endpoint_ == local_ep; } // Without a local endpoint we cannot prove locality; disable memcpy. 
@@ -638,39 +683,17 @@ bool TransferSubmitter::isLocalTransfer( } bool TransferSubmitter::validateTransferParams( - const std::vector& handles, - const std::vector& slices, bool is_multi_buffers) const { - if (handles.empty()) { - LOG(ERROR) << "handles is empty"; - return false; + const AllocatedBuffer::Descriptor& handle, + const std::vector& slices) const { + uint64_t all_slice_len = 0; + for (auto slice : slices) { + all_slice_len += slice.size; } - - if (handles.size() > slices.size()) { - LOG(ERROR) << "invalid_partition_count handles_size=" << handles.size() - << " slices_size=" << slices.size(); + if (handle.size_ != all_slice_len) { + LOG(ERROR) << "handles len:" << handle.size_ + << ", all_slice_len:" << all_slice_len; return false; } - if (is_multi_buffers) { - uint64_t all_slice_len = 0; - for (auto slice : slices) { - all_slice_len += slice.size; - } - if (handles[0].size_ != all_slice_len) { - LOG(ERROR) << "handles len:" << handles[0].size_ - << ", all_slice_len:" << all_slice_len; - return false; - } - } else { - for (size_t i = 0; i < handles.size(); ++i) { - if (handles[i].size_ != slices[i].size) { - LOG(ERROR) << "Size of replica partition " << i << " (" - << handles[i].size_ - << ") does not match provided buffer (" - << slices[i].size << ")"; - return false; - } - } - } return true; } diff --git a/mooncake-store/src/types.cpp b/mooncake-store/src/types.cpp index b54cbc7ec..3508f9c5f 100644 --- a/mooncake-store/src/types.cpp +++ b/mooncake-store/src/types.cpp @@ -19,6 +19,7 @@ const std::string& toString(ErrorCode errorCode) noexcept { {ErrorCode::INVALID_KEY, "INVALID_KEY"}, {ErrorCode::WRITE_FAIL, "WRITE_FAIL"}, {ErrorCode::INVALID_PARAMS, "INVALID_PARAMS"}, + {ErrorCode::ILLEGAL_CLIENT, "ILLEGAL_CLIENT"}, {ErrorCode::INVALID_WRITE, "INVALID_WRITE"}, {ErrorCode::INVALID_READ, "INVALID_READ"}, {ErrorCode::INVALID_REPLICA, "INVALID_REPLICA"}, diff --git a/mooncake-store/tests/CMakeLists.txt b/mooncake-store/tests/CMakeLists.txt index 
28bd63c97..3cf5fea82 100644 --- a/mooncake-store/tests/CMakeLists.txt +++ b/mooncake-store/tests/CMakeLists.txt @@ -5,6 +5,7 @@ function(add_store_test name) cachelib_memory_allocator ${ETCD_WRAPPER_LIB} glog + ibverbs gtest gtest_main pthread diff --git a/mooncake-store/tests/allocation_strategy_test.cpp b/mooncake-store/tests/allocation_strategy_test.cpp index 679ca8064..778dcb8fe 100644 --- a/mooncake-store/tests/allocation_strategy_test.cpp +++ b/mooncake-store/tests/allocation_strategy_test.cpp @@ -3,8 +3,10 @@ #include #include +#include #include #include +#include #include #include "allocator.h" @@ -72,33 +74,6 @@ INSTANTIATE_TEST_SUITE_P( } }); -// Unit test class for testing individual functions -class AllocationStrategyUnitTest : public ::testing::Test { - protected: - void SetUp() override { - strategy_ = std::make_unique(); - } - - // Helper function to create test allocators - std::shared_ptr CreateTestAllocator( - const std::string& segment_name, size_t base_offset, - BufferAllocatorType type, size_t size = 64 * MB) { - const size_t base = 0x100000000ULL + base_offset; // 4GB + offset - switch (type) { - case BufferAllocatorType::CACHELIB: - return std::make_shared( - segment_name, base, size, segment_name); - case BufferAllocatorType::OFFSET: - return std::make_shared( - segment_name, base, size, segment_name); - default: - throw std::invalid_argument("Invalid allocator type"); - } - } - - std::unique_ptr strategy_; -}; - // Test basic functionality with empty allocators map (non-parameterized) TEST_F(AllocationStrategyTest, EmptyAllocatorsMap) { std::unordered_map> empty_allocators; ReplicateConfig config{1, false, "local"}; - std::vector slice_sizes = {100}; + size_t slice_length = 100; auto result = strategy_->Allocate( - empty_allocators, empty_allocators_by_name, slice_sizes, config); + empty_allocators, empty_allocators_by_name, slice_length, config); EXPECT_FALSE(result.has_value()); EXPECT_EQ(result.error(), ErrorCode::NO_AVAILABLE_HANDLE); } 
@@ -122,9 +97,9 @@ TEST_F(AllocationStrategyTest, PreferredSegmentWithEmptyAllocators) { std::vector> empty_allocators; ReplicateConfig config{1, false, "preferred_segment"}; - std::vector slice_sizes = {100}; + size_t slice_length = 100; auto result = strategy_->Allocate( - empty_allocators, empty_allocators_by_name, slice_sizes, config); + empty_allocators, empty_allocators_by_name, slice_length, config); EXPECT_FALSE(result.has_value()); EXPECT_EQ(result.error(), ErrorCode::NO_AVAILABLE_HANDLE); } @@ -145,10 +120,10 @@ TEST_P(AllocationStrategyParameterizedTest, PreferredSegmentAllocation) { allocators.push_back(allocator2); ReplicateConfig config{1, false, "preferred"}; - std::vector slice_sizes = {1024}; + size_t slice_length = 1024; auto result = strategy_->Allocate(allocators, allocators_by_name, - slice_sizes, config); + slice_length, config); ASSERT_TRUE(result.has_value()); EXPECT_EQ(result.value().size(), 1); ASSERT_FALSE(result.value().empty()); @@ -157,9 +132,8 @@ TEST_P(AllocationStrategyParameterizedTest, PreferredSegmentAllocation) { auto descriptor = replica.get_descriptor(); ASSERT_TRUE(descriptor.is_memory_replica()); const auto& mem_desc = descriptor.get_memory_descriptor(); - ASSERT_EQ(mem_desc.buffer_descriptors.size(), 1); - EXPECT_EQ(mem_desc.buffer_descriptors[0].transport_endpoint_, "preferred"); - EXPECT_EQ(mem_desc.buffer_descriptors[0].size_, 1024); + EXPECT_EQ(mem_desc.buffer_descriptor.transport_endpoint_, "preferred"); + EXPECT_EQ(mem_desc.buffer_descriptor.size_, 1024); } // Test fallback to random allocation when preferred segment doesn't exist @@ -178,10 +152,10 @@ TEST_P(AllocationStrategyParameterizedTest, PreferredSegmentNotFound) { allocators.push_back(allocator2); ReplicateConfig config{1, false, "nonexistent"}; - std::vector slice_sizes = {1024}; + size_t slice_length = 1024; auto result = strategy_->Allocate(allocators, allocators_by_name, - slice_sizes, config); + slice_length, config); ASSERT_TRUE(result.has_value()); 
EXPECT_EQ(result.value().size(), 1); @@ -189,14 +163,13 @@ TEST_P(AllocationStrategyParameterizedTest, PreferredSegmentNotFound) { auto descriptor = replica.get_descriptor(); ASSERT_TRUE(descriptor.is_memory_replica()); const auto& mem_desc = descriptor.get_memory_descriptor(); - ASSERT_EQ(mem_desc.buffer_descriptors.size(), 1); - std::string segment_ep = mem_desc.buffer_descriptors[0].transport_endpoint_; + std::string segment_ep = mem_desc.buffer_descriptor.transport_endpoint_; EXPECT_TRUE(segment_ep == "segment1" || segment_ep == "segment2"); - EXPECT_EQ(mem_desc.buffer_descriptors[0].size_, 1024); + EXPECT_EQ(mem_desc.buffer_descriptor.size_, 1024); } -// Test multiple slices allocation -TEST_P(AllocationStrategyParameterizedTest, MultipleSlicesAllocation) { +// Test single slice allocation +TEST_P(AllocationStrategyParameterizedTest, SingleSliceAllocation) { auto allocator1 = CreateTestAllocator("segment1", 0); auto allocator2 = CreateTestAllocator("segment2", 0x10000000ULL); @@ -211,10 +184,10 @@ TEST_P(AllocationStrategyParameterizedTest, MultipleSlicesAllocation) { allocators.push_back(allocator2); ReplicateConfig config{1, false, ""}; - std::vector slice_sizes = {1024, 2048, 512}; + size_t slice_length = 1024; auto result = strategy_->Allocate(allocators, allocators_by_name, - slice_sizes, config); + slice_length, config); ASSERT_TRUE(result.has_value()); EXPECT_EQ(result.value().size(), 1); @@ -222,10 +195,7 @@ TEST_P(AllocationStrategyParameterizedTest, MultipleSlicesAllocation) { auto descriptor = replica.get_descriptor(); ASSERT_TRUE(descriptor.is_memory_replica()); const auto& mem_desc = descriptor.get_memory_descriptor(); - ASSERT_EQ(mem_desc.buffer_descriptors.size(), 3); - EXPECT_EQ(mem_desc.buffer_descriptors[0].size_, 1024); - EXPECT_EQ(mem_desc.buffer_descriptors[1].size_, 2048); - EXPECT_EQ(mem_desc.buffer_descriptors[2].size_, 512); + EXPECT_EQ(mem_desc.buffer_descriptor.size_, 1024); } // Test multiple replicas allocation @@ -247,21 +217,19 
@@ TEST_P(AllocationStrategyParameterizedTest, MultipleReplicasAllocation) { allocators.push_back(allocator3); ReplicateConfig config{3, false, ""}; // Request 3 replicas - std::vector slice_sizes = {1024, 2048}; + size_t slice_length = 1024; auto result = strategy_->Allocate(allocators, allocators_by_name, - slice_sizes, config); + slice_length, config); ASSERT_TRUE(result.has_value()); EXPECT_EQ(result.value().size(), 3); - // Check each replica has all slices + // Check each replica has the correct slice size for (const auto& replica : result.value()) { auto descriptor = replica.get_descriptor(); ASSERT_TRUE(descriptor.is_memory_replica()); const auto& mem_desc = descriptor.get_memory_descriptor(); - ASSERT_EQ(mem_desc.buffer_descriptors.size(), 2); - EXPECT_EQ(mem_desc.buffer_descriptors[0].size_, 1024); - EXPECT_EQ(mem_desc.buffer_descriptors[1].size_, 2048); + EXPECT_EQ(mem_desc.buffer_descriptor.size_, 1024); } // Check that replicas are on different segments @@ -293,31 +261,33 @@ TEST_P(AllocationStrategyParameterizedTest, PreferredSegmentInsufficientSpace) { // First, fill up the preferred allocator ReplicateConfig config{1, false, "preferred"}; - std::vector large_slices = {10 * 1024 * 1024, 10 * 1024 * 1024, - 10 * 1024 * 1024, 10 * 1024 * 1024, - 10 * 1024 * 1024, 10 * 1024 * 1024, - 3 * 1024 * 1024}; // 63MB out of 64MB - - auto large_result = strategy_->Allocate(allocators, allocators_by_name, - large_slices, config); - ASSERT_TRUE(large_result.has_value()); - auto large_desc = large_result.value()[0].get_descriptor(); - ASSERT_TRUE(large_desc.is_memory_replica()); - EXPECT_EQ(large_desc.get_memory_descriptor() - .buffer_descriptors[0] - .transport_endpoint_, - "preferred"); + // Store the results of the allocations to avoid deallocation of the buffers + // before the test is done + std::vector> results; + // Allocate multiple times to fill up the preferred allocator + for (int i = 0; i < 4; ++i) { + size_t large_slice = 15 * 1024 * 1024; // 10MB + 
auto large_result = strategy_->Allocate(allocators, allocators_by_name, + large_slice, config); + ASSERT_TRUE(large_result.has_value()); + auto last_desc = large_result.value()[0].get_descriptor(); + ASSERT_TRUE(last_desc.is_memory_replica()); + EXPECT_EQ(last_desc.get_memory_descriptor() + .buffer_descriptor.transport_endpoint_, + "preferred"); + results.emplace_back(std::move(large_result.value())); + } // Now try to allocate more than remaining space in preferred segment - std::vector small_slice = {2 * 1024 * 1024}; + size_t small_slice = 5 * 1024 * 1024; // 2MB auto result = strategy_->Allocate(allocators, allocators_by_name, small_slice, config); ASSERT_TRUE(result.has_value()); auto small_desc = result.value()[0].get_descriptor(); ASSERT_TRUE(small_desc.is_memory_replica()); const auto& mem_desc = small_desc.get_memory_descriptor(); - EXPECT_EQ(mem_desc.buffer_descriptors[0].transport_endpoint_, "segment1"); - EXPECT_EQ(mem_desc.buffer_descriptors[0].size_, 2 * 1024 * 1024); + EXPECT_EQ(mem_desc.buffer_descriptor.transport_endpoint_, "segment1"); + EXPECT_EQ(mem_desc.buffer_descriptor.size_, small_slice); } // Test allocation when all allocators are full @@ -338,19 +308,20 @@ TEST_P(AllocationStrategyParameterizedTest, AllAllocatorsFull) { ReplicateConfig config{1, false, ""}; // Fill up both allocators - std::vector large_slices = {15 * 1024 * 1024, 15 * 1024 * 1024, - 15 * 1024 * 1024, - 15 * 1024 * 1024}; // 60MB - auto result1 = strategy_->Allocate(allocators, allocators_by_name, - large_slices, config); - ASSERT_TRUE(result1.has_value()); - auto result2 = strategy_->Allocate(allocators, allocators_by_name, - large_slices, config); - ASSERT_TRUE(result2.has_value()); + size_t large_slice = 15 * 1024 * 1024; // 15MB + // Store the results of the allocations to avoid deallocation of the buffers + // before the test is done + std::vector> results; + // Allocate 8 times to use 120MB total + for (int i = 0; i < 8; ++i) { + auto result = 
strategy_->Allocate(allocators, allocators_by_name, + large_slice, config); + ASSERT_TRUE(result.has_value()); + results.emplace_back(std::move(result.value())); + } // Try to allocate more than remaining space - std::vector impossible_slice = {5 * 1024 * - 1024}; // 5MB (more than remaining) + size_t impossible_slice = 5 * 1024 * 1024; // 5MB (more than remaining) auto result = strategy_->Allocate(allocators, allocators_by_name, impossible_slice, config); EXPECT_FALSE(result.has_value()); @@ -369,7 +340,7 @@ TEST_P(AllocationStrategyParameterizedTest, ZeroSizeAllocation) { allocators.push_back(allocator); ReplicateConfig config{1, false, ""}; - std::vector zero_slice = {0}; + size_t zero_slice = 0; auto result = strategy_->Allocate(allocators, allocators_by_name, zero_slice, config); @@ -389,8 +360,7 @@ TEST_P(AllocationStrategyParameterizedTest, VeryLargeSizeAllocation) { allocators.push_back(allocator); ReplicateConfig config{1, false, ""}; - std::vector huge_slice = { - 100 * 1024 * 1024}; // 100MB (larger than 64MB capacity) + size_t huge_slice = 100 * 1024 * 1024; // 100MB (larger than 64MB capacity) auto result = strategy_->Allocate(allocators, allocators_by_name, huge_slice, config); @@ -398,26 +368,7 @@ TEST_P(AllocationStrategyParameterizedTest, VeryLargeSizeAllocation) { EXPECT_EQ(result.error(), ErrorCode::NO_AVAILABLE_HANDLE); } -// Test empty slice sizes -TEST_F(AllocationStrategyTest, EmptySliceSizes) { - auto allocator = std::make_shared( - "segment1", 0x100000000ULL, 64 * MB, "segment1"); - std::unordered_map>> - allocators_by_name; - std::vector> allocators; - - allocators_by_name["segment1"].push_back(allocator); - allocators.push_back(allocator); - - ReplicateConfig config{1, false, ""}; - std::vector empty_slices; - - auto result = strategy_->Allocate(allocators, allocators_by_name, - empty_slices, config); - EXPECT_FALSE(result.has_value()); - EXPECT_EQ(result.error(), ErrorCode::INVALID_PARAMS); -} +// Test zero slice length (already covered 
by ZeroSizeAllocation test) // Test invalid replication count TEST_F(AllocationStrategyTest, InvalidReplicationCount) { @@ -432,10 +383,10 @@ TEST_F(AllocationStrategyTest, InvalidReplicationCount) { allocators.push_back(allocator); ReplicateConfig config{0, false, ""}; // Invalid: 0 replicas - std::vector slice_sizes = {1024}; + size_t slice_length = 1024; auto result = strategy_->Allocate(allocators, allocators_by_name, - slice_sizes, config); + slice_length, config); EXPECT_FALSE(result.has_value()); EXPECT_EQ(result.error(), ErrorCode::INVALID_PARAMS); } @@ -460,10 +411,10 @@ TEST_F(AllocationStrategyTest, InsufficientAllocatorsForReplicas) { ReplicateConfig config{ 5, false, ""}; // Request 5 replicas, but only 2 segments available - std::vector slice_sizes = {1024}; + size_t slice_length = 1024; auto result = strategy_->Allocate(allocators, allocators_by_name, - slice_sizes, config); + slice_length, config); // With best-effort semantics, should succeed with available replicas EXPECT_TRUE(result.has_value()); // Should get 2 replicas (limited by number of segments) @@ -474,8 +425,7 @@ TEST_F(AllocationStrategyTest, InsufficientAllocatorsForReplicas) { auto descriptor = replica.get_descriptor(); ASSERT_TRUE(descriptor.is_memory_replica()); const auto& mem_desc = descriptor.get_memory_descriptor(); - ASSERT_EQ(mem_desc.buffer_descriptors.size(), 1u); - EXPECT_EQ(mem_desc.buffer_descriptors[0].size_, 1024u); + EXPECT_EQ(mem_desc.buffer_descriptor.size_, 1024u); } // Verify replicas are on different segments @@ -483,216 +433,15 @@ TEST_F(AllocationStrategyTest, InsufficientAllocatorsForReplicas) { for (const auto& replica : result.value()) { auto descriptor = replica.get_descriptor(); const auto& mem_desc = descriptor.get_memory_descriptor(); - segment_names.insert( - mem_desc.buffer_descriptors[0].transport_endpoint_); + segment_names.insert(mem_desc.buffer_descriptor.transport_endpoint_); } EXPECT_EQ(2u, segment_names.size()); } 
-TEST_F(AllocationStrategyUnitTest, - AllocateSingleBuffer_PreferredSegmentNotFound) { - auto allocator1 = - CreateTestAllocator("segment1", 0, BufferAllocatorType::OFFSET); - auto allocator2 = CreateTestAllocator("segment2", 0x10000000ULL, - BufferAllocatorType::OFFSET); - - std::vector> allocators = {allocator1, - allocator2}; - std::unordered_map>> - allocators_by_name; - allocators_by_name["segment1"] = {allocator1}; - allocators_by_name["segment2"] = {allocator2}; - - ReplicateConfig config{1, false, "nonexistent"}; - std::unordered_set excluded_segments; - - auto buffer = strategy_->allocateSingleBuffer( - allocators, allocators_by_name, 1024, config, excluded_segments); - - ASSERT_TRUE(buffer != nullptr); - std::string segment_name = buffer->getSegmentName(); - EXPECT_TRUE(segment_name == "segment1" || segment_name == "segment2"); -} - -TEST_F(AllocationStrategyUnitTest, AllocateSingleBuffer_EmptyPreferredSegment) { - auto allocator1 = - CreateTestAllocator("segment1", 0, BufferAllocatorType::OFFSET); - - std::vector> allocators = {allocator1}; - std::unordered_map>> - allocators_by_name; - allocators_by_name["segment1"] = {allocator1}; - - ReplicateConfig config{1, false, ""}; // Empty preferred segment - std::unordered_set excluded_segments; - - auto buffer = strategy_->allocateSingleBuffer( - allocators, allocators_by_name, 1024, config, excluded_segments); - - ASSERT_TRUE(buffer != nullptr); - EXPECT_EQ(buffer->getSegmentName(), "segment1"); -} - -// Test tryRandomAllocate function -TEST_F(AllocationStrategyUnitTest, TryRandomAllocate_Success) { - auto allocator1 = - CreateTestAllocator("segment1", 0, BufferAllocatorType::OFFSET); - auto allocator2 = CreateTestAllocator("segment2", 0x10000000ULL, - BufferAllocatorType::OFFSET); - - std::vector> allocators = {allocator1, - allocator2}; - std::unordered_set excluded_segments; - - auto buffer = - strategy_->tryRandomAllocate(allocators, 1024, excluded_segments); - ASSERT_TRUE(buffer != nullptr); - 
EXPECT_EQ(buffer->size(), 1024); -} - -TEST_F(AllocationStrategyUnitTest, TryRandomAllocate_AllSegmentsExcluded) { - auto allocator1 = - CreateTestAllocator("segment1", 0, BufferAllocatorType::OFFSET); - auto allocator2 = CreateTestAllocator("segment2", 0x10000000ULL, - BufferAllocatorType::OFFSET); - - std::vector> allocators = {allocator1, - allocator2}; - std::unordered_set excluded_segments = {"segment1", - "segment2"}; - - auto buffer = - strategy_->tryRandomAllocate(allocators, 1024, excluded_segments); - EXPECT_TRUE(buffer == nullptr); -} - -TEST_F(AllocationStrategyUnitTest, TryRandomAllocate_InsufficientSpace) { - auto allocator = CreateTestAllocator( - "segment1", 0, BufferAllocatorType::OFFSET, 1024); // Only 1KB - std::vector> allocators = {allocator}; - std::unordered_set excluded_segments; - - auto buffer = strategy_->tryRandomAllocate( - allocators, 2048, excluded_segments); // Request 2KB - EXPECT_TRUE(buffer == nullptr); -} - -// Test allocateSlice function -TEST_F(AllocationStrategyUnitTest, AllocateSlice_SingleReplica) { - auto allocator1 = - CreateTestAllocator("segment1", 0, BufferAllocatorType::OFFSET); - - std::vector> allocators = {allocator1}; - std::unordered_map>> - allocators_by_name; - allocators_by_name["segment1"] = {allocator1}; - - ReplicateConfig config{1, false, ""}; - auto buffers = strategy_->allocateSlice(allocators, allocators_by_name, - 1024, 1, config); - - ASSERT_EQ(buffers.size(), 1); - EXPECT_EQ(buffers[0]->size(), 1024); - EXPECT_EQ(buffers[0]->getSegmentName(), "segment1"); -} - -TEST_F(AllocationStrategyUnitTest, AllocateSlice_MultipleReplicas) { - auto allocator1 = - CreateTestAllocator("segment1", 0, BufferAllocatorType::OFFSET); - auto allocator2 = CreateTestAllocator("segment2", 0x10000000ULL, - BufferAllocatorType::OFFSET); - auto allocator3 = CreateTestAllocator("segment3", 0x20000000ULL, - BufferAllocatorType::OFFSET); - - std::vector> allocators = { - allocator1, allocator2, allocator3}; - std::unordered_map>> 
- allocators_by_name; - allocators_by_name["segment1"] = {allocator1}; - allocators_by_name["segment2"] = {allocator2}; - allocators_by_name["segment3"] = {allocator3}; - - ReplicateConfig config{3, false, ""}; - auto buffers = strategy_->allocateSlice(allocators, allocators_by_name, - 1024, 3, config); - - ASSERT_EQ(buffers.size(), 3); - - // Verify all buffers have correct size - for (const auto& buffer : buffers) { - EXPECT_EQ(buffer->size(), 1024); - } - - // Verify replicas are on different segments - std::unordered_set used_segments; - for (const auto& buffer : buffers) { - used_segments.insert(buffer->getSegmentName()); - } - EXPECT_EQ(used_segments.size(), 3); -} - -TEST_F(AllocationStrategyUnitTest, AllocateSlice_InsufficientAllocators) { - auto allocator1 = - CreateTestAllocator("segment1", 0, BufferAllocatorType::OFFSET); - - std::vector> allocators = {allocator1}; - std::unordered_map>> - allocators_by_name; - allocators_by_name["segment1"] = {allocator1}; - - ReplicateConfig config{3, false, - ""}; // Request 3 replicas but only 1 allocator - auto buffers = strategy_->allocateSlice(allocators, allocators_by_name, - 1024, 3, config); - - // Should allocate as many as possible (best-effort) - ASSERT_EQ(buffers.size(), 1); - EXPECT_EQ(buffers[0]->getSegmentName(), "segment1"); -} - -// Test getLargestFreeRegion() filtering logic with fragmented allocators -TEST_F(AllocationStrategyUnitTest, - TryRandomAllocate_LargestFreeRegionFiltering) { - // Run the test 10 times to account for randomness - for (int run = 0; run < 10; ++run) { - // Create two OffsetBufferAllocators with 10MB each - auto allocator1 = CreateTestAllocator( - "segment1", 0, BufferAllocatorType::OFFSET, 10 * MB); - auto allocator2 = CreateTestAllocator( - "segment2", 0x10000000ULL, BufferAllocatorType::OFFSET, 10 * MB); - - // Fragment allocator1 heavily - leave only small free regions - std::vector> fragments1; - for (int i = 0; i < 9; ++i) { - fragments1.push_back(allocator1->allocate(1 * 
MB)); - } - // allocator1: 9MB allocated, only 1MB free - - // Leave allocator2 with enough contiguous space - auto fragment2 = allocator2->allocate(5 * MB); - // allocator2: 5MB allocated, 5MB contiguous free - - std::vector> allocators = { - allocator1, allocator2}; - std::unordered_set excluded_segments; - - // Reset retry counter before test - strategy_->resetRetryCount(); - - auto buffer = - strategy_->tryRandomAllocate(allocators, 4 * MB, excluded_segments); - - ASSERT_TRUE(buffer != nullptr) << "Failed on run " << run; - EXPECT_EQ(buffer->size(), 4 * MB) << "Failed on run " << run; - EXPECT_EQ(buffer->getSegmentName(), "segment2") - << "Failed on run " << run; - EXPECT_EQ(strategy_->getRetryCount(), 0) << "Failed on run " << run; - } -} +// Note: The following unit tests for internal helper methods have been removed +// because those methods (allocateSingleBuffer, tryRandomAllocate, +// allocateSlice, resetRetryCount, getRetryCount) are no longer part of the +// public API. The functionality is now encapsulated within the Allocate() +// method. 
} // namespace mooncake diff --git a/mooncake-store/tests/client_buffer_test.cpp b/mooncake-store/tests/client_buffer_test.cpp index 6cdd25f27..818c15d5f 100644 --- a/mooncake-store/tests/client_buffer_test.cpp +++ b/mooncake-store/tests/client_buffer_test.cpp @@ -269,25 +269,15 @@ TEST_F(ClientBufferTest, CalculateTotalSizeMemoryReplica) { Replica::Descriptor replica; MemoryDescriptor mem_desc; - // Add some buffer descriptors with proper initialization - AllocatedBuffer::Descriptor buf1; - buf1.size_ = 1024; - buf1.buffer_address_ = 0x1000; + // Set buffer descriptor with proper initialization + mem_desc.buffer_descriptor.size_ = 4096; + mem_desc.buffer_descriptor.buffer_address_ = 0x1000; - AllocatedBuffer::Descriptor buf2; - buf2.size_ = 2048; - buf2.buffer_address_ = 0x2000; - - AllocatedBuffer::Descriptor buf3; - buf3.size_ = 512; - buf3.buffer_address_ = 0x3000; - - mem_desc.buffer_descriptors = {buf1, buf2, buf3}; replica.descriptor_variant = mem_desc; replica.status = ReplicaStatus::COMPLETE; uint64_t total_size = calculate_total_size(replica); - EXPECT_EQ(total_size, 1024 + 2048 + 512); + EXPECT_EQ(total_size, 4096); } // Test calculate_total_size function with disk replica @@ -304,12 +294,13 @@ TEST_F(ClientBufferTest, CalculateTotalSizeDiskReplica) { EXPECT_EQ(total_size, 4096); } -// Test calculate_total_size function with empty memory replica -TEST_F(ClientBufferTest, CalculateTotalSizeEmptyMemoryReplica) { - // Create an empty memory replica descriptor +// Test calculate_total_size function with zero-size memory replica +TEST_F(ClientBufferTest, CalculateTotalSizeZeroSizeMemoryReplica) { + // Create a memory replica descriptor with zero size Replica::Descriptor replica; MemoryDescriptor mem_desc; - // Empty buffer_descriptors vector + mem_desc.buffer_descriptor.size_ = 0; + mem_desc.buffer_descriptor.buffer_address_ = 0x1000; replica.descriptor_variant = mem_desc; replica.status = ReplicaStatus::COMPLETE; @@ -334,34 +325,23 @@ 
TEST_F(ClientBufferTest, AllocateSlicesMemoryReplica) { // Create a memory replica descriptor Replica::Descriptor replica; MemoryDescriptor mem_desc; + mem_desc.buffer_descriptor.size_ = 4096; + mem_desc.buffer_descriptor.buffer_address_ = 0x1000; - AllocatedBuffer::Descriptor buf1; - buf1.size_ = 1024; - AllocatedBuffer::Descriptor buf2; - buf2.size_ = 2048; - AllocatedBuffer::Descriptor buf3; - buf3.size_ = 1024; - - mem_desc.buffer_descriptors = {buf1, buf2, buf3}; replica.descriptor_variant = mem_desc; replica.status = ReplicaStatus::COMPLETE; std::vector slices; - int result = allocateSlices(slices, replica, handle); + int result = allocateSlices(slices, replica, handle.ptr()); EXPECT_EQ(result, 0); - EXPECT_EQ(slices.size(), 3); + EXPECT_EQ(slices.size(), 1); - // Verify slice sizes match buffer descriptors - EXPECT_EQ(slices[0].size, 1024); - EXPECT_EQ(slices[1].size, 2048); - EXPECT_EQ(slices[2].size, 1024); + // Verify slice size matches buffer descriptor + EXPECT_EQ(slices[0].size, 4096); - // Verify slices are contiguous - char* base_ptr = static_cast(handle.ptr()); - EXPECT_EQ(slices[0].ptr, base_ptr); - EXPECT_EQ(slices[1].ptr, base_ptr + 1024); - EXPECT_EQ(slices[2].ptr, base_ptr + 1024 + 2048); + // Verify slice pointer matches buffer pointer + EXPECT_EQ(slices[0].ptr, handle.ptr()); } // Test allocateSlices function with disk replica @@ -386,7 +366,7 @@ TEST_F(ClientBufferTest, AllocateSlicesDiskReplica) { replica.status = ReplicaStatus::COMPLETE; std::vector slices; - int result = allocateSlices(slices, replica, handle); + int result = allocateSlices(slices, replica, handle.ptr()); EXPECT_EQ(result, 0); EXPECT_GE(slices.size(), 1); @@ -403,8 +383,8 @@ TEST_F(ClientBufferTest, AllocateSlicesDiskReplica) { EXPECT_EQ(total_slice_size, 8192); } -// Test allocateSlices function with empty memory replica -TEST_F(ClientBufferTest, AllocateSlicesEmptyMemoryReplica) { +// Test allocateSlices function with zero-size memory replica +TEST_F(ClientBufferTest, 
AllocateSlicesZeroSizeMemoryReplica) { const size_t buffer_size = 1024 * 1024; // 1MB const size_t alloc_size = 1024; // 1KB @@ -416,19 +396,22 @@ TEST_F(ClientBufferTest, AllocateSlicesEmptyMemoryReplica) { BufferHandle handle = std::move(handle_opt.value()); - // Create an empty memory replica descriptor + // Create a memory replica descriptor with zero size Replica::Descriptor replica; MemoryDescriptor mem_desc; - // Empty buffer_descriptors vector + mem_desc.buffer_descriptor.size_ = 0; + mem_desc.buffer_descriptor.buffer_address_ = 0x1000; replica.descriptor_variant = mem_desc; replica.status = ReplicaStatus::COMPLETE; std::vector slices; - int result = allocateSlices(slices, replica, handle); + int result = allocateSlices(slices, replica, handle.ptr()); EXPECT_EQ(result, 0); - EXPECT_EQ(slices.size(), 0); + EXPECT_EQ(slices.size(), 1); + EXPECT_EQ(slices[0].size, 0); + EXPECT_EQ(slices[0].ptr, handle.ptr()); } } // namespace mooncake diff --git a/mooncake-store/tests/client_integration_test.cpp b/mooncake-store/tests/client_integration_test.cpp index e42d50a3b..8df3176ee 100644 --- a/mooncake-store/tests/client_integration_test.cpp +++ b/mooncake-store/tests/client_integration_test.cpp @@ -314,12 +314,9 @@ TEST_F(ClientIntegrationTest, LocalPreferredAllocationTest) { << "Query operation failed: " << toString(query_result.error()); auto replica_list = query_result.value().replicas; ASSERT_EQ(replica_list.size(), 1); - ASSERT_EQ(replica_list[0].get_memory_descriptor().buffer_descriptors.size(), - 1); ASSERT_EQ(replica_list[0] .get_memory_descriptor() - .buffer_descriptors[0] - .transport_endpoint_, + .buffer_descriptor.transport_endpoint_, segment_provider_client_->GetTransportEndpoint()); auto get_result = test_client_->Get(key, query_result.value(), slices); diff --git a/mooncake-store/tests/e2e/client_wrapper.cpp b/mooncake-store/tests/e2e/client_wrapper.cpp index ce3451bcc..74e666370 100644 --- a/mooncake-store/tests/e2e/client_wrapper.cpp +++ 
b/mooncake-store/tests/e2e/client_wrapper.cpp @@ -106,9 +106,9 @@ ErrorCode ClientTestWrapper::Get(const std::string& key, std::string& value) { } // Create slices - const std::vector& descriptors = - replica_list[0].get_memory_descriptor().buffer_descriptors; - SliceGuard slice_guard(descriptors, allocator_); + const AllocatedBuffer::Descriptor& descriptor = + replica_list[0].get_memory_descriptor().buffer_descriptor; + SliceGuard slice_guard(descriptor.size_, allocator_); // Perform get operation auto get_result = diff --git a/mooncake-store/tests/master_metrics_test.cpp b/mooncake-store/tests/master_metrics_test.cpp index dda05c90d..455000ef2 100644 --- a/mooncake-store/tests/master_metrics_test.cpp +++ b/mooncake-store/tests/master_metrics_test.cpp @@ -7,6 +7,7 @@ #include "rpc_service.h" #include "types.h" #include "master_config.h" +#include "master_metric_manager.h" namespace mooncake::test { @@ -90,6 +91,11 @@ TEST_F(MasterMetricsTest, InitialStatusTest) { ASSERT_EQ(metrics.get_batch_put_revoke_partial_successes(), 0); ASSERT_EQ(metrics.get_batch_put_revoke_items(), 0); ASSERT_EQ(metrics.get_batch_put_revoke_failed_items(), 0); + + // PutStart Discard Metrics + ASSERT_EQ(metrics.get_put_start_discard_cnt(), 0); + ASSERT_EQ(metrics.get_put_start_release_cnt(), 0); + ASSERT_EQ(metrics.get_put_start_discarded_staging_size(), 0); } TEST_F(MasterMetricsTest, BasicRequestTest) { @@ -114,7 +120,6 @@ TEST_F(MasterMetricsTest, BasicRequestTest) { std::string key = "test_key"; uint64_t value_length = 1024; - std::vector slice_lengths = {value_length}; ReplicateConfig config; config.replica_num = 1; @@ -124,34 +129,48 @@ TEST_F(MasterMetricsTest, BasicRequestTest) { ASSERT_EQ(metrics.get_allocated_mem_size(), 0); ASSERT_EQ(metrics.get_total_mem_capacity(), kSegmentSize); ASSERT_DOUBLE_EQ(metrics.get_global_mem_used_ratio(), 0.0); + ASSERT_EQ(metrics.get_segment_allocated_mem_size(segment.name), 0); + ASSERT_EQ(metrics.get_segment_total_mem_capacity(segment.name), + 
kSegmentSize); + ASSERT_DOUBLE_EQ(metrics.get_segment_mem_used_ratio(segment.name), 0.0); ASSERT_EQ(metrics.get_mount_segment_requests(), 1); ASSERT_EQ(metrics.get_mount_segment_failures(), 0); // Test PutStart and PutRevoke request - auto put_start_result1 = service_.PutStart(key, slice_lengths, config); + auto put_start_result1 = + service_.PutStart(client_id, key, value_length, config); ASSERT_TRUE(put_start_result1.has_value()); ASSERT_EQ(metrics.get_key_count(), 1); ASSERT_EQ(metrics.get_allocated_mem_size(), value_length); + ASSERT_EQ(metrics.get_segment_allocated_mem_size(segment.name), + value_length); ASSERT_EQ(metrics.get_put_start_requests(), 1); ASSERT_EQ(metrics.get_put_start_failures(), 0); - auto put_revoke_result = service_.PutRevoke(key, ReplicaType::MEMORY); + auto put_revoke_result = + service_.PutRevoke(client_id, key, ReplicaType::MEMORY); ASSERT_TRUE(put_revoke_result.has_value()); ASSERT_EQ(metrics.get_key_count(), 0); ASSERT_EQ(metrics.get_allocated_mem_size(), 0); + ASSERT_EQ(metrics.get_segment_allocated_mem_size(segment.name), 0); ASSERT_EQ(metrics.get_put_revoke_requests(), 1); ASSERT_EQ(metrics.get_put_revoke_failures(), 0); // Test PutStart and PutEnd request - auto put_start_result2 = service_.PutStart(key, slice_lengths, config); + auto put_start_result2 = + service_.PutStart(client_id, key, value_length, config); ASSERT_TRUE(put_start_result2.has_value()); ASSERT_EQ(metrics.get_key_count(), 1); ASSERT_EQ(metrics.get_allocated_mem_size(), value_length); + ASSERT_EQ(metrics.get_segment_allocated_mem_size(segment.name), + value_length); ASSERT_EQ(metrics.get_put_start_requests(), 2); ASSERT_EQ(metrics.get_put_start_failures(), 0); - auto put_end_result = service_.PutEnd(key, ReplicaType::MEMORY); + auto put_end_result = service_.PutEnd(client_id, key, ReplicaType::MEMORY); ASSERT_TRUE(put_end_result.has_value()); ASSERT_EQ(metrics.get_key_count(), 1); ASSERT_EQ(metrics.get_allocated_mem_size(), value_length); + 
ASSERT_EQ(metrics.get_segment_allocated_mem_size(segment.name), + value_length); ASSERT_EQ(metrics.get_put_end_requests(), 1); ASSERT_EQ(metrics.get_put_end_failures(), 0); @@ -176,11 +195,13 @@ TEST_F(MasterMetricsTest, BasicRequestTest) { ASSERT_EQ(metrics.get_remove_failures(), 0); ASSERT_EQ(metrics.get_key_count(), 0); ASSERT_EQ(metrics.get_allocated_mem_size(), 0); + ASSERT_EQ(metrics.get_segment_allocated_mem_size(segment.name), 0); // Test RemoveAll request - auto put_start_result3 = service_.PutStart(key, slice_lengths, config); + auto put_start_result3 = + service_.PutStart(client_id, key, value_length, config); ASSERT_TRUE(put_start_result3.has_value()); - auto put_end_result2 = service_.PutEnd(key, ReplicaType::MEMORY); + auto put_end_result2 = service_.PutEnd(client_id, key, ReplicaType::MEMORY); ASSERT_TRUE(put_end_result2.has_value()); ASSERT_EQ(metrics.get_key_count(), 1); ASSERT_EQ(1, service_.RemoveAll()); @@ -188,11 +209,13 @@ TEST_F(MasterMetricsTest, BasicRequestTest) { ASSERT_EQ(metrics.get_remove_all_failures(), 0); ASSERT_EQ(metrics.get_key_count(), 0); ASSERT_EQ(metrics.get_allocated_mem_size(), 0); + ASSERT_EQ(metrics.get_segment_allocated_mem_size(segment.name), 0); // Test UnmountSegment request - auto put_start_result4 = service_.PutStart(key, slice_lengths, config); + auto put_start_result4 = + service_.PutStart(client_id, key, value_length, config); ASSERT_TRUE(put_start_result4.has_value()); - auto put_end_result3 = service_.PutEnd(key, ReplicaType::MEMORY); + auto put_end_result3 = service_.PutEnd(client_id, key, ReplicaType::MEMORY); ASSERT_TRUE(put_end_result3.has_value()); auto unmount_result = service_.UnmountSegment(segment_id, client_id); ASSERT_TRUE(unmount_result.has_value()); @@ -202,6 +225,74 @@ TEST_F(MasterMetricsTest, BasicRequestTest) { ASSERT_EQ(metrics.get_allocated_mem_size(), 0); ASSERT_EQ(metrics.get_total_mem_capacity(), 0); ASSERT_DOUBLE_EQ(metrics.get_global_mem_used_ratio(), 0.0); + 
ASSERT_EQ(metrics.get_segment_allocated_mem_size(segment.name), 0); + ASSERT_EQ(metrics.get_segment_total_mem_capacity(segment.name), 0); + ASSERT_DOUBLE_EQ(metrics.get_segment_mem_used_ratio(segment.name), 0.0); + + // check segment mem used ratio for non-existent segment + ASSERT_DOUBLE_EQ(metrics.get_segment_mem_used_ratio(""), 0.0); + ASSERT_DOUBLE_EQ(metrics.get_segment_mem_used_ratio("xxxxxx_segment"), 0.0); +} + +TEST_F(MasterMetricsTest, CalcCacheStatsTest) { + const uint64_t default_kv_lease_ttl = 100; + auto& metrics = MasterMetricManager::instance(); + // Use a wrapped master service to test the metrics manager + WrappedMasterServiceConfig service_config; + service_config.default_kv_lease_ttl = default_kv_lease_ttl; + service_config.enable_metric_reporting = true; + WrappedMasterService service_(service_config); + + constexpr size_t kBufferAddress = 0x300000000; + constexpr size_t kSegmentSize = 1024 * 1024 * 16; + std::string segment_name = "test_segment"; + UUID segment_id = generate_uuid(); + Segment segment; + segment.id = segment_id; + segment.name = segment_name; + segment.base = kBufferAddress; + segment.size = kSegmentSize; + UUID client_id = generate_uuid(); + + std::string key = "test_key"; + uint64_t value_length = 1024; + ReplicateConfig config; + config.replica_num = 1; + + auto stats_dict = metrics.calculate_cache_stats(); + ASSERT_EQ(stats_dict[MasterMetricManager::CacheHitStat::MEMORY_HITS], 1); + ASSERT_EQ(stats_dict[MasterMetricManager::CacheHitStat::SSD_HITS], 0); + ASSERT_EQ(stats_dict[MasterMetricManager::CacheHitStat::MEMORY_TOTAL], 2); + ASSERT_EQ(stats_dict[MasterMetricManager::CacheHitStat::SSD_TOTAL], 0); + ASSERT_EQ(stats_dict[MasterMetricManager::CacheHitStat::MEMORY_HIT_RATE], + 0.5); + ASSERT_EQ(stats_dict[MasterMetricManager::CacheHitStat::SSD_HIT_RATE], 0); + ASSERT_EQ(stats_dict[MasterMetricManager::CacheHitStat::OVERALL_HIT_RATE], + 0.5); + ASSERT_EQ(stats_dict[MasterMetricManager::CacheHitStat::VALID_GET_RATE], 1); + + 
auto mount_result = service_.MountSegment(segment, client_id); + auto put_start_result1 = + service_.PutStart(client_id, key, value_length, config); + auto put_end_result1 = service_.PutEnd(client_id, key, ReplicaType::MEMORY); + stats_dict = metrics.calculate_cache_stats(); + + ASSERT_EQ(stats_dict[MasterMetricManager::CacheHitStat::MEMORY_TOTAL], 3); + + auto get_replica_result = service_.GetReplicaList(key); + stats_dict = metrics.calculate_cache_stats(); + ASSERT_EQ(stats_dict[MasterMetricManager::CacheHitStat::MEMORY_HITS], 2); + ASSERT_EQ(stats_dict[MasterMetricManager::CacheHitStat::SSD_HITS], 0); + ASSERT_EQ(stats_dict[MasterMetricManager::CacheHitStat::MEMORY_TOTAL], 3); + ASSERT_EQ(stats_dict[MasterMetricManager::CacheHitStat::SSD_TOTAL], 0); + ASSERT_NEAR(stats_dict[MasterMetricManager::CacheHitStat::MEMORY_HIT_RATE], + 0.67, 0.01); + ASSERT_EQ(stats_dict[MasterMetricManager::CacheHitStat::SSD_HIT_RATE], 0); + ASSERT_NEAR(stats_dict[MasterMetricManager::CacheHitStat::OVERALL_HIT_RATE], + 0.67, 0.01); + ASSERT_EQ(stats_dict[MasterMetricManager::CacheHitStat::VALID_GET_RATE], 1); + + auto remove_result = service_.Remove(key); } TEST_F(MasterMetricsTest, BatchRequestTest) { @@ -223,7 +314,7 @@ TEST_F(MasterMetricsTest, BatchRequestTest) { UUID client_id = generate_uuid(); std::vector keys = {"test_key1", "test_key2", "test_key3"}; - std::vector> slice_lengths = {{1024}, {2048}, {512}}; + std::vector value_lengths = {1024, 2048, 512}; ReplicateConfig config; config.replica_num = 1; @@ -242,7 +333,7 @@ TEST_F(MasterMetricsTest, BatchRequestTest) { // Test BatchPutStart request auto batch_put_start_result = - service_.BatchPutStart(keys, slice_lengths, config); + service_.BatchPutStart(client_id, keys, value_lengths, config); ASSERT_EQ(batch_put_start_result.size(), 3); ASSERT_EQ(metrics.get_batch_put_start_requests(), 1); ASSERT_EQ(metrics.get_batch_put_start_partial_successes(), 0); @@ -260,7 +351,7 @@ TEST_F(MasterMetricsTest, BatchRequestTest) { 
ASSERT_EQ(metrics.get_batch_get_replica_list_failed_items(), 3); // Test BatchPutEnd request - auto batch_put_end_result = service_.BatchPutEnd(keys); + auto batch_put_end_result = service_.BatchPutEnd(client_id, keys); ASSERT_EQ(batch_put_end_result.size(), 3); ASSERT_EQ(metrics.get_batch_put_end_requests(), 1); ASSERT_EQ(metrics.get_batch_put_end_partial_successes(), 0); @@ -287,7 +378,7 @@ TEST_F(MasterMetricsTest, BatchRequestTest) { ASSERT_EQ(metrics.get_batch_get_replica_list_failed_items(), 3); // Test BatchPutRevoke request (should all fail) - auto batch_put_revoke_result = service_.BatchPutRevoke(keys); + auto batch_put_revoke_result = service_.BatchPutRevoke(client_id, keys); ASSERT_EQ(batch_put_revoke_result.size(), 3); ASSERT_EQ(metrics.get_batch_put_revoke_requests(), 1); ASSERT_EQ(metrics.get_batch_put_revoke_partial_successes(), 0); @@ -297,7 +388,7 @@ TEST_F(MasterMetricsTest, BatchRequestTest) { // Test partial success keys.push_back("test_key4"); - slice_lengths.push_back({512}); + value_lengths.push_back(512); auto batch_get_replica_result3 = service_.BatchGetReplicaList(keys); ASSERT_EQ(batch_get_replica_result3.size(), 4); ASSERT_EQ(metrics.get_batch_get_replica_list_requests(), 3); @@ -307,7 +398,7 @@ TEST_F(MasterMetricsTest, BatchRequestTest) { ASSERT_EQ(metrics.get_batch_get_replica_list_failed_items(), 4); auto batch_put_start_result2 = - service_.BatchPutStart(keys, slice_lengths, config); + service_.BatchPutStart(client_id, keys, value_lengths, config); ASSERT_EQ(batch_put_start_result2.size(), 4); ASSERT_EQ(metrics.get_batch_put_start_requests(), 2); ASSERT_EQ(metrics.get_batch_put_start_partial_successes(), 1); diff --git a/mooncake-store/tests/master_service_ssd_test.cpp b/mooncake-store/tests/master_service_ssd_test.cpp index 2ee6079b2..422e1aa10 100644 --- a/mooncake-store/tests/master_service_ssd_test.cpp +++ b/mooncake-store/tests/master_service_ssd_test.cpp @@ -45,11 +45,12 @@ TEST_F(MasterServiceSSDTest, PutEndBothReplica) { 
ASSERT_TRUE(mount_result.has_value()); std::string key = "disk_key"; - std::vector slice_lengths = {1024}; + uint64_t slice_length = 1024; ReplicateConfig config; config.replica_num = 1; - auto put_start_result = service_->PutStart(key, slice_lengths, config); + auto put_start_result = + service_->PutStart(client_id, key, slice_length, config); ASSERT_TRUE(put_start_result.has_value()); auto replicas = put_start_result.value(); ASSERT_EQ(2, replicas.size()); @@ -67,8 +68,10 @@ TEST_F(MasterServiceSSDTest, PutEndBothReplica) { EXPECT_EQ(ErrorCode::REPLICA_IS_NOT_READY, get_result.error()); // PutEnd for both memory and disk - EXPECT_TRUE(service_->PutEnd(key, ReplicaType::MEMORY).has_value()); - EXPECT_TRUE(service_->PutEnd(key, ReplicaType::DISK).has_value()); + EXPECT_TRUE( + service_->PutEnd(client_id, key, ReplicaType::MEMORY).has_value()); + EXPECT_TRUE( + service_->PutEnd(client_id, key, ReplicaType::DISK).has_value()); get_result = service_->GetReplicaList(key); ASSERT_TRUE(get_result.has_value()); @@ -97,19 +100,22 @@ TEST_F(MasterServiceSSDTest, PutRevokeDiskReplica) { ASSERT_TRUE(mount_result.has_value()); std::string key = "revoke_key"; - std::vector slice_lengths = {1024}; + uint64_t slice_length = 1024; ReplicateConfig config; config.replica_num = 1; - ASSERT_TRUE(service_->PutStart(key, slice_lengths, config).has_value()); - EXPECT_TRUE(service_->PutEnd(key, ReplicaType::MEMORY).has_value()); + ASSERT_TRUE( + service_->PutStart(client_id, key, slice_length, config).has_value()); + EXPECT_TRUE( + service_->PutEnd(client_id, key, ReplicaType::MEMORY).has_value()); auto get_result = service_->GetReplicaList(key); ASSERT_TRUE(get_result.has_value()); EXPECT_EQ(1, get_result.value().replicas.size()); ASSERT_TRUE(get_result.value().replicas[0].is_memory_replica()); - EXPECT_TRUE(service_->PutRevoke(key, ReplicaType::DISK).has_value()); + EXPECT_TRUE( + service_->PutRevoke(client_id, key, ReplicaType::DISK).has_value()); get_result = 
service_->GetReplicaList(key); ASSERT_TRUE(get_result.has_value()); @@ -135,18 +141,21 @@ TEST_F(MasterServiceSSDTest, PutRevokeMemoryReplica) { ASSERT_TRUE(mount_result.has_value()); std::string key = "revoke_key"; - std::vector slice_lengths = {1024}; + uint64_t slice_length = 1024; ReplicateConfig config; config.replica_num = 1; - ASSERT_TRUE(service_->PutStart(key, slice_lengths, config).has_value()); - EXPECT_TRUE(service_->PutRevoke(key, ReplicaType::MEMORY).has_value()); + ASSERT_TRUE( + service_->PutStart(client_id, key, slice_length, config).has_value()); + EXPECT_TRUE( + service_->PutRevoke(client_id, key, ReplicaType::MEMORY).has_value()); auto get_result = service_->GetReplicaList(key); ASSERT_FALSE(get_result.has_value()); EXPECT_EQ(ErrorCode::REPLICA_IS_NOT_READY, get_result.error()); - EXPECT_TRUE(service_->PutEnd(key, ReplicaType::DISK).has_value()); + EXPECT_TRUE( + service_->PutEnd(client_id, key, ReplicaType::DISK).has_value()); get_result = service_->GetReplicaList(key); ASSERT_TRUE(get_result.has_value()); EXPECT_EQ(1, get_result.value().replicas.size()); @@ -171,18 +180,21 @@ TEST_F(MasterServiceSSDTest, PutRevokeBothReplica) { ASSERT_TRUE(mount_result.has_value()); std::string key = "revoke_key"; - std::vector slice_lengths = {1024}; + uint64_t slice_length = 1024; ReplicateConfig config; config.replica_num = 1; - ASSERT_TRUE(service_->PutStart(key, slice_lengths, config).has_value()); - EXPECT_TRUE(service_->PutRevoke(key, ReplicaType::DISK).has_value()); + ASSERT_TRUE( + service_->PutStart(client_id, key, slice_length, config).has_value()); + EXPECT_TRUE( + service_->PutRevoke(client_id, key, ReplicaType::DISK).has_value()); auto get_result = service_->GetReplicaList(key); ASSERT_FALSE(get_result.has_value()); EXPECT_EQ(ErrorCode::REPLICA_IS_NOT_READY, get_result.error()); - EXPECT_TRUE(service_->PutRevoke(key, ReplicaType::MEMORY).has_value()); + EXPECT_TRUE( + service_->PutRevoke(client_id, key, ReplicaType::MEMORY).has_value()); 
get_result = service_->GetReplicaList(key); ASSERT_FALSE(get_result.has_value()); EXPECT_EQ(ErrorCode::OBJECT_NOT_FOUND, get_result.error()); @@ -206,13 +218,16 @@ TEST_F(MasterServiceSSDTest, RemoveKey) { ASSERT_TRUE(mount_result.has_value()); std::string key = "remove_key"; - std::vector slice_lengths = {1024}; + uint64_t slice_length = 1024; ReplicateConfig config; config.replica_num = 1; - ASSERT_TRUE(service_->PutStart(key, slice_lengths, config).has_value()); - EXPECT_TRUE(service_->PutEnd(key, ReplicaType::MEMORY).has_value()); - EXPECT_TRUE(service_->PutEnd(key, ReplicaType::DISK).has_value()); + ASSERT_TRUE( + service_->PutStart(client_id, key, slice_length, config).has_value()); + EXPECT_TRUE( + service_->PutEnd(client_id, key, ReplicaType::MEMORY).has_value()); + EXPECT_TRUE( + service_->PutEnd(client_id, key, ReplicaType::DISK).has_value()); EXPECT_TRUE(service_->Remove(key).has_value()); @@ -245,14 +260,16 @@ TEST_F(MasterServiceSSDTest, EvictObject) { int success_puts = 0; for (int i = 0; i < 1024 * 16 + 50; ++i) { std::string key = "test_key" + std::to_string(i); - std::vector slice_lengths = {object_size}; + uint64_t slice_length = object_size; ReplicateConfig config; config.replica_num = 1; - auto put_start_result = service_->PutStart(key, slice_lengths, config); + auto put_start_result = + service_->PutStart(client_id, key, slice_length, config); if (put_start_result.has_value()) { auto put_end_mem_result = - service_->PutEnd(key, ReplicaType::MEMORY); - auto put_end_disk_result = service_->PutEnd(key, ReplicaType::DISK); + service_->PutEnd(client_id, key, ReplicaType::MEMORY); + auto put_end_disk_result = + service_->PutEnd(client_id, key, ReplicaType::DISK); ASSERT_TRUE(put_end_mem_result.has_value()); ASSERT_TRUE(put_end_disk_result.has_value()); success_puts++; @@ -279,6 +296,115 @@ TEST_F(MasterServiceSSDTest, EvictObject) { service_->RemoveAll(); } +TEST_F(MasterServiceSSDTest, PutStartExpires) { + // Reset storage space metrics. 
+ MasterMetricManager::instance().reset_allocated_mem_size(); + MasterMetricManager::instance().reset_total_mem_capacity(); + + MasterServiceConfig master_config; + master_config.root_fs_dir = "/mnt/ssd"; + master_config.put_start_discard_timeout_sec = 3; + master_config.put_start_release_timeout_sec = 5; + std::unique_ptr service_(new MasterService(master_config)); + + constexpr size_t kReplicaCnt = 2; // 1 memory replica + 1 disk replica + constexpr size_t kBaseAddr = 0x300000000; + constexpr size_t kSegmentSize = 1024 * 1024 * 16; // 16MB + + // Mount a segment. + std::string segment_name = "test_segment"; + Segment segment; + segment.id = generate_uuid(); + segment.name = segment_name; + segment.base = kBaseAddr; + segment.size = kSegmentSize; + segment.te_endpoint = segment.name; + auto client_id = generate_uuid(); + auto mount_result = service_->MountSegment(segment, client_id); + ASSERT_TRUE(mount_result.has_value()); + + std::string key = "test_key"; + uint64_t value_length = 16 * 1024 * 1024; // 16MB + uint64_t slice_length = value_length; + ReplicateConfig config; + + auto test_discard_replica = [&](ReplicaType discard_type) { + const auto reserve_type = discard_type == ReplicaType::MEMORY + ? ReplicaType::DISK + : ReplicaType::MEMORY; + + // Put key, should succeed. + auto put_start_result = + service_->PutStart(client_id, key, slice_length, config); + EXPECT_TRUE(put_start_result.has_value()); + auto replica_list = put_start_result.value(); + EXPECT_EQ(replica_list.size(), kReplicaCnt); + for (size_t i = 0; i < kReplicaCnt; i++) { + EXPECT_EQ(ReplicaStatus::PROCESSING, replica_list[i].status); + } + + // Complete the reserved replica. + auto put_end_result = service_->PutEnd(client_id, key, reserve_type); + EXPECT_TRUE(put_end_result.has_value()); + + // Wait for a while until the put-start has expired. + for (size_t i = 0; i <= master_config.put_start_discard_timeout_sec; + i++) { + // Keep mounted segments alive.
+ auto result = service_->Ping(client_id); + EXPECT_TRUE(result.has_value()); + // Protect the key from eviction. + auto get_result = service_->GetReplicaList(key); + EXPECT_TRUE(get_result.has_value()); + std::this_thread::sleep_for(std::chrono::seconds(1)); + } + + // Put key again, should fail because the object has had a completed + // replica. + put_start_result = + service_->PutStart(client_id, key, slice_length, config); + EXPECT_FALSE(put_start_result.has_value()); + EXPECT_EQ(put_start_result.error(), ErrorCode::OBJECT_ALREADY_EXISTS); + + // Wait for a while until the discarded replicas are released. + for (size_t i = 0; i <= master_config.put_start_release_timeout_sec; + i++) { + // Keep mounted segments alive. + auto result = service_->Ping(client_id); + EXPECT_TRUE(result.has_value()); + // Protect the key from eviction. + auto get_result = service_->GetReplicaList(key); + EXPECT_TRUE(get_result.has_value()); + std::this_thread::sleep_for(std::chrono::seconds(1)); + } + + // Try to PutEnd the discarded replica. + put_end_result = service_->PutEnd(client_id, key, discard_type); + EXPECT_TRUE(put_end_result.has_value()); + + // Check that the key has only one replica. + auto get_result = service_->GetReplicaList(key); + EXPECT_TRUE(get_result.has_value()); + EXPECT_EQ(get_result.value().replicas.size(), 1); + if (reserve_type == ReplicaType::MEMORY) { + EXPECT_TRUE(get_result.value().replicas[0].is_memory_replica()); + } else { + EXPECT_TRUE(get_result.value().replicas[0].is_disk_replica()); + } + + // Wait for the key to expire.
+ for (size_t i = 0; i <= DEFAULT_DEFAULT_KV_LEASE_TTL / 1000; i++) { + auto result = service_->Ping(client_id); + EXPECT_TRUE(result.has_value()); + std::this_thread::sleep_for(std::chrono::seconds(1)); + } + service_->RemoveAll(); + }; + + test_discard_replica(ReplicaType::DISK); + test_discard_replica(ReplicaType::MEMORY); +} + } // namespace mooncake::test int main(int argc, char** argv) { diff --git a/mooncake-store/tests/master_service_test.cpp b/mooncake-store/tests/master_service_test.cpp index 1170397ff..91ffb8a6f 100644 --- a/mooncake-store/tests/master_service_test.cpp +++ b/mooncake-store/tests/master_service_test.cpp @@ -58,7 +58,8 @@ class MasterServiceTest : public ::testing::Test { void TearDown() override { google::ShutdownGoogleLogging(); } }; -std::string GenerateKeyForSegment(const std::unique_ptr& service, +std::string GenerateKeyForSegment(const UUID& client_id, + const std::unique_ptr& service, const std::string& segment_name) { static std::atomic counter(0); @@ -73,7 +74,8 @@ std::string GenerateKeyForSegment(const std::unique_ptr& service, } // Attempt to put the key. 
- auto put_result = service->PutStart(key, {1024}, {.replica_num = 1}); + auto put_result = + service->PutStart(client_id, key, {1024}, {.replica_num = 1}); if (put_result.has_value()) { replica_list = std::move(put_result.value()); } @@ -87,14 +89,14 @@ std::string GenerateKeyForSegment(const std::unique_ptr& service, throw std::runtime_error("PutStart failed with code: " + std::to_string(static_cast(code))); } - auto put_end_result = service->PutEnd(key, ReplicaType::MEMORY); + auto put_end_result = + service->PutEnd(client_id, key, ReplicaType::MEMORY); if (!put_end_result.has_value()) { throw std::runtime_error("PutEnd failed"); } if (replica_list[0] .get_memory_descriptor() - .buffer_descriptors[0] - .transport_endpoint_ == segment_name) { + .buffer_descriptor.transport_endpoint_ == segment_name) { return key; } // Clean up failed attempt @@ -304,20 +306,20 @@ TEST_F(MasterServiceTest, ConcurrentMountUnmount) { TEST_F(MasterServiceTest, PutStartInvalidParams) { std::unique_ptr service_(new MasterService()); [[maybe_unused]] const auto context = PrepareSimpleSegment(*service_); + const UUID client_id = generate_uuid(); std::string key = "test_key"; ReplicateConfig config; // Test invalid replica_num config.replica_num = 0; - auto put_result1 = service_->PutStart(key, {1024}, config); + auto put_result1 = service_->PutStart(client_id, key, 1024, config); EXPECT_FALSE(put_result1.has_value()); EXPECT_EQ(ErrorCode::INVALID_PARAMS, put_result1.error()); - // Test empty slice_lengths + // Test zero slice_length config.replica_num = 1; - std::vector empty_slices; - auto put_result2 = service_->PutStart(key, empty_slices, config); + auto put_result2 = service_->PutStart(client_id, key, 0, config); EXPECT_FALSE(put_result2.has_value()); EXPECT_EQ(ErrorCode::INVALID_PARAMS, put_result2.error()); } @@ -325,15 +327,18 @@ TEST_F(MasterServiceTest, PutStartInvalidParams) { TEST_F(MasterServiceTest, PutStartEndFlow) { std::unique_ptr service_(new MasterService()); 
[[maybe_unused]] const auto context = PrepareSimpleSegment(*service_); + const UUID client_id = generate_uuid(); + const UUID invalid_client_id = generate_uuid(); + ASSERT_NE(client_id, invalid_client_id); // Test PutStart std::string key = "test_key"; uint64_t value_length = 1024; - std::vector slice_lengths = {value_length}; ReplicateConfig config; config.replica_num = 1; - auto put_start_result = service_->PutStart(key, slice_lengths, config); + auto put_start_result = + service_->PutStart(client_id, key, value_length, config); EXPECT_TRUE(put_start_result.has_value()); replica_list = put_start_result.value(); EXPECT_FALSE(replica_list.empty()); @@ -347,8 +352,20 @@ TEST_F(MasterServiceTest, PutStartEndFlow) { EXPECT_FALSE(remove_result.has_value()); EXPECT_EQ(ErrorCode::REPLICA_IS_NOT_READY, remove_result.error()); + // PutEnd should fail if the client_id does not match. + auto put_end_fail_result = + service_->PutEnd(invalid_client_id, key, ReplicaType::MEMORY); + EXPECT_FALSE(put_end_fail_result.has_value()); + EXPECT_EQ(put_end_fail_result.error(), ErrorCode::ILLEGAL_CLIENT); + + // PutRevoke should fail if the client_id does not match. 
+ auto put_revoke_fail_result = + service_->PutRevoke(invalid_client_id, key, ReplicaType::MEMORY); + EXPECT_FALSE(put_revoke_fail_result.has_value()); + EXPECT_EQ(put_revoke_fail_result.error(), ErrorCode::ILLEGAL_CLIENT); + // Test PutEnd - auto put_end_result = service_->PutEnd(key, ReplicaType::MEMORY); + auto put_end_result = service_->PutEnd(client_id, key, ReplicaType::MEMORY); EXPECT_TRUE(put_end_result.has_value()); // Verify replica list after PutEnd @@ -361,6 +378,7 @@ TEST_F(MasterServiceTest, PutStartEndFlow) { TEST_F(MasterServiceTest, RandomPutStartEndFlow) { std::unique_ptr service_(new MasterService()); + const UUID client_id = generate_uuid(); // Mount 5 segments, each 16MB constexpr size_t kBaseAddr = 0x300000000; @@ -374,14 +392,14 @@ TEST_F(MasterServiceTest, RandomPutStartEndFlow) { // Test PutStart std::string key = "test_key"; uint64_t value_length = 1024; - std::vector slice_lengths = {value_length}; ReplicateConfig config; std::random_device rd; std::mt19937 gen(rd()); std::uniform_int_distribution<> dis(1, 5); int random_number = dis(gen); config.replica_num = random_number; - auto put_start_result = service_->PutStart(key, slice_lengths, config); + auto put_start_result = + service_->PutStart(client_id, key, value_length, config); EXPECT_TRUE(put_start_result.has_value()); replica_list = put_start_result.value(); EXPECT_FALSE(replica_list.empty()); @@ -394,7 +412,7 @@ TEST_F(MasterServiceTest, RandomPutStartEndFlow) { EXPECT_FALSE(remove_result.has_value()); EXPECT_EQ(ErrorCode::REPLICA_IS_NOT_READY, remove_result.error()); // Test PutEnd - auto put_end_result = service_->PutEnd(key, ReplicaType::MEMORY); + auto put_end_result = service_->PutEnd(client_id, key, ReplicaType::MEMORY); EXPECT_TRUE(put_end_result.has_value()); // Verify replica list after PutEnd auto get_result2 = service_->GetReplicaList(key); @@ -412,6 +430,7 @@ TEST_F(MasterServiceTest, GetReplicaListByRegex) { .set_default_kv_lease_ttl(kv_lease_ttl) .build(); 
std::unique_ptr service_(new MasterService(service_config)); + const UUID client_id = generate_uuid(); // Test getting non-existent key auto get_result = service_->GetReplicaList(".*non_existent.*"); EXPECT_FALSE(get_result.has_value()); @@ -422,12 +441,14 @@ TEST_F(MasterServiceTest, GetReplicaListByRegex) { int times = 10; while (times--) { std::string key = "test_key" + std::to_string(times); - std::vector slice_lengths = {1024}; + uint64_t value_length = 1024; ReplicateConfig config; config.replica_num = 1; - auto put_start_result = service_->PutStart(key, slice_lengths, config); + auto put_start_result = + service_->PutStart(client_id, key, value_length, config); ASSERT_TRUE(put_start_result.has_value()); - auto put_end_result = service_->PutEnd(key, ReplicaType::MEMORY); + auto put_end_result = + service_->PutEnd(client_id, key, ReplicaType::MEMORY); ASSERT_TRUE(put_end_result.has_value()); auto exist_result = service_->ExistKey(key); ASSERT_TRUE(exist_result.has_value()); @@ -443,14 +464,16 @@ TEST_F(MasterServiceTest, GetReplicaListByRegex) { } // Helper function to put an object, making the test cleaner -void put_object(MasterService& service, const std::string& key) { - std::vector slice_lengths = {1024}; +void put_object(MasterService& service, const UUID& client_id, + const std::string& key) { + uint64_t value_length = 1024; ReplicateConfig config; config.replica_num = 1; - auto put_start_result = service.PutStart(key, slice_lengths, config); + auto put_start_result = + service.PutStart(client_id, key, value_length, config); ASSERT_TRUE(put_start_result.has_value()) << "Failed to PutStart for key: " << key; - auto put_end_result = service.PutEnd(key, ReplicaType::MEMORY); + auto put_end_result = service.PutEnd(client_id, key, ReplicaType::MEMORY); ASSERT_TRUE(put_end_result.has_value()) << "Failed to PutEnd for key: " << key; auto exist_result = service.ExistKey(key); @@ -464,6 +487,7 @@ TEST_F(MasterServiceTest, GetReplicaListByRegexComplex) { 
.set_default_kv_lease_ttl(kv_lease_ttl) .build(); auto service_ = std::make_unique(service_config); + const UUID client_id = generate_uuid(); // 1. Mount segment [[maybe_unused]] const auto context = PrepareSimpleSegment(*service_); @@ -484,7 +508,7 @@ TEST_F(MasterServiceTest, GetReplicaListByRegexComplex) { "test-key-extra", "another_key"}; for (const auto& key : keys_to_put) { - put_object(*service_, key); + put_object(*service_, client_id, key); } // Wait for all leases to be written to the underlying KV store. @@ -578,6 +602,7 @@ TEST_F(MasterServiceTest, GetReplicaListByRegexComplex) { TEST_F(MasterServiceTest, GetReplicaList) { std::unique_ptr service_(new MasterService()); + const UUID client_id = generate_uuid(); // Test getting non-existent key auto get_result = service_->GetReplicaList("non_existent"); EXPECT_FALSE(get_result.has_value()); @@ -586,12 +611,13 @@ TEST_F(MasterServiceTest, GetReplicaList) { [[maybe_unused]] const auto context = PrepareSimpleSegment(*service_); std::string key = "test_key"; - std::vector slice_lengths = {1024}; + uint64_t value_length = 1024; ReplicateConfig config; config.replica_num = 1; - auto put_start_result = service_->PutStart(key, slice_lengths, config); + auto put_start_result = + service_->PutStart(client_id, key, value_length, config); ASSERT_TRUE(put_start_result.has_value()); - auto put_end_result = service_->PutEnd(key, ReplicaType::MEMORY); + auto put_end_result = service_->PutEnd(client_id, key, ReplicaType::MEMORY); ASSERT_TRUE(put_end_result.has_value()); // Test getting existing key @@ -604,14 +630,16 @@ TEST_F(MasterServiceTest, GetReplicaList) { TEST_F(MasterServiceTest, RemoveObject) { std::unique_ptr service_(new MasterService()); [[maybe_unused]] const auto context = PrepareSimpleSegment(*service_); + const UUID client_id = generate_uuid(); std::string key = "test_key"; - std::vector slice_lengths = {1024}; + uint64_t value_length = 1024; ReplicateConfig config; config.replica_num = 1; - auto 
put_start_result = service_->PutStart(key, slice_lengths, config); + auto put_start_result = + service_->PutStart(client_id, key, value_length, config); ASSERT_TRUE(put_start_result.has_value()); - auto put_end_result = service_->PutEnd(key, ReplicaType::MEMORY); + auto put_end_result = service_->PutEnd(client_id, key, ReplicaType::MEMORY); ASSERT_TRUE(put_end_result.has_value()); // Test removing the object @@ -632,18 +660,21 @@ TEST_F(MasterServiceTest, RemoveObject) { TEST_F(MasterServiceTest, RandomRemoveObject) { std::unique_ptr service_(new MasterService()); [[maybe_unused]] const auto context = PrepareSimpleSegment(*service_); + const UUID client_id = generate_uuid(); int times = 10; std::random_device rd; std::mt19937 gen(rd()); std::uniform_int_distribution<> dis(1, 1000); while (times--) { std::string key = "test_key" + std::to_string(dis(gen)); - std::vector slice_lengths = {1024}; + uint64_t value_length = 1024; ReplicateConfig config; config.replica_num = 1; - auto put_start_result = service_->PutStart(key, slice_lengths, config); + auto put_start_result = + service_->PutStart(client_id, key, value_length, config); ASSERT_TRUE(put_start_result.has_value()); - auto put_end_result = service_->PutEnd(key, ReplicaType::MEMORY); + auto put_end_result = + service_->PutEnd(client_id, key, ReplicaType::MEMORY); ASSERT_TRUE(put_end_result.has_value()); // Test removing the object @@ -664,15 +695,18 @@ TEST_F(MasterServiceTest, RemoveByRegex) { .build(); std::unique_ptr service_(new MasterService(service_config)); [[maybe_unused]] const auto context = PrepareSimpleSegment(*service_); + const UUID client_id = generate_uuid(); int times = 10; while (times--) { std::string key = "test_key" + std::to_string(times); - std::vector slice_lengths = {1024}; + uint64_t value_length = 1024; ReplicateConfig config; config.replica_num = 1; - auto put_start_result = service_->PutStart(key, slice_lengths, config); + auto put_start_result = + service_->PutStart(client_id, key, 
value_length, config); ASSERT_TRUE(put_start_result.has_value()); - auto put_end_result = service_->PutEnd(key, ReplicaType::MEMORY); + auto put_end_result = + service_->PutEnd(client_id, key, ReplicaType::MEMORY); ASSERT_TRUE(put_end_result.has_value()); auto exist_result = service_->ExistKey(key); ASSERT_TRUE(exist_result.has_value()); @@ -697,6 +731,7 @@ TEST_F(MasterServiceTest, RemoveByRegexComplex) { .set_default_kv_lease_ttl(kv_lease_ttl) .build(); auto service_ = std::make_unique(service_config); + const UUID client_id = generate_uuid(); // 1. Mount segment [[maybe_unused]] const auto context = @@ -719,7 +754,7 @@ TEST_F(MasterServiceTest, RemoveByRegexComplex) { "test-key-extra", "another_key"}; for (const auto& key : keys_to_put) { - put_object(*service_, key); + put_object(*service_, client_id, key); } // Wait for potential lease propagation std::this_thread::sleep_for(std::chrono::milliseconds(kv_lease_ttl)); @@ -873,15 +908,18 @@ TEST_F(MasterServiceTest, RemoveAll) { .build(); std::unique_ptr service_(new MasterService(service_config)); [[maybe_unused]] const auto context = PrepareSimpleSegment(*service_); + const UUID client_id = generate_uuid(); int times = 10; while (times--) { std::string key = "test_key" + std::to_string(times); - std::vector slice_lengths = {1024}; + uint64_t value_length = 1024; ReplicateConfig config; config.replica_num = 1; - auto put_start_result = service_->PutStart(key, slice_lengths, config); + auto put_start_result = + service_->PutStart(client_id, key, value_length, config); ASSERT_TRUE(put_start_result.has_value()); - auto put_end_result = service_->PutEnd(key, ReplicaType::MEMORY); + auto put_end_result = + service_->PutEnd(client_id, key, ReplicaType::MEMORY); ASSERT_TRUE(put_end_result.has_value()); auto exist_result = service_->ExistKey(key); ASSERT_TRUE(exist_result.has_value()); @@ -898,12 +936,13 @@ TEST_F(MasterServiceTest, RemoveAll) { } } -TEST_F(MasterServiceTest, MultiSliceMultiReplicaFlow) { 
+TEST_F(MasterServiceTest, SingleSliceMultiReplicaFlow) { const uint64_t kv_lease_ttl = 50; auto service_config = MasterServiceConfig::builder() .set_default_kv_lease_ttl(kv_lease_ttl) .build(); std::unique_ptr service_(new MasterService(service_config)); + const UUID client_id = generate_uuid(); // Mount 3 segments, each 64MB constexpr size_t kBaseAddr = 0x300000000; @@ -917,22 +956,7 @@ TEST_F(MasterServiceTest, MultiSliceMultiReplicaFlow) { // Test parameters std::string key = "multi_slice_object"; constexpr size_t num_replicas = 3; - constexpr size_t total_size = 1024 * 1024 * 5; // 5MB total size - - // Create multiple slices of different sizes - std::vector slice_lengths = { - 1024 * 1024 * 2, // 2MB - 1024 * 1024 * 1, // 1MB - 1024 * 1024 * 1, // 1MB - 1024 * 1024 * 1 // 1MB - }; - - // Verify total size matches sum of slices - uint64_t sum_slices = 0; - for (const auto& size : slice_lengths) { - sum_slices += size; - } - ASSERT_EQ(total_size, sum_slices); + constexpr size_t slice_length = 1024 * 1024 * 5; // 5MB // Configure replication ReplicateConfig config; @@ -940,7 +964,8 @@ TEST_F(MasterServiceTest, MultiSliceMultiReplicaFlow) { std::vector replica_list; // Test PutStart with multiple slices and replicas - auto put_start_result = service_->PutStart(key, slice_lengths, config); + auto put_start_result = + service_->PutStart(client_id, key, slice_length, config); ASSERT_TRUE(put_start_result.has_value()); replica_list = put_start_result.value(); @@ -950,19 +975,9 @@ TEST_F(MasterServiceTest, MultiSliceMultiReplicaFlow) { // Verify replica status EXPECT_EQ(ReplicaStatus::PROCESSING, replica.status); - // Verify number of handles matches number of slices - ASSERT_EQ(slice_lengths.size(), - replica.get_memory_descriptor().buffer_descriptors.size()); - - // Verify each handle's properties - for (size_t i = 0; - i < replica.get_memory_descriptor().buffer_descriptors.size(); - i++) { - const auto& handle = - 
replica.get_memory_descriptor().buffer_descriptors[i]; - - EXPECT_EQ(slice_lengths[i], handle.size_); - } + // Verify slice length matches buffer descriptor + EXPECT_EQ(slice_length, + replica.get_memory_descriptor().buffer_descriptor.size_); } // Test GetReplicaList during processing (should fail) @@ -971,7 +986,7 @@ TEST_F(MasterServiceTest, MultiSliceMultiReplicaFlow) { EXPECT_EQ(ErrorCode::REPLICA_IS_NOT_READY, get_result.error()); // Complete the put operation - auto put_end_result = service_->PutEnd(key, ReplicaType::MEMORY); + auto put_end_result = service_->PutEnd(client_id, key, ReplicaType::MEMORY); ASSERT_TRUE(put_end_result.has_value()); // Test GetReplicaList after completion @@ -983,8 +998,8 @@ TEST_F(MasterServiceTest, MultiSliceMultiReplicaFlow) { // Verify final state of all replicas for (const auto& replica : retrieved_replicas) { EXPECT_EQ(ReplicaStatus::COMPLETE, replica.status); - ASSERT_EQ(slice_lengths.size(), - replica.get_memory_descriptor().buffer_descriptors.size()); + ASSERT_EQ(slice_length, + replica.get_memory_descriptor().buffer_descriptor.size_); } } @@ -1003,14 +1018,15 @@ TEST_F(MasterServiceTest, CleanupStaleHandlesTest) { // Create an object that will be stored in the segment std::string key = "segment_object"; - std::vector slice_lengths = {1024 * 1024}; // One 1MB slice + uint64_t slice_length = 1024 * 1024; // One 1MB slice ReplicateConfig config; config.replica_num = 1; // One replica // Create the object - auto put_start_result = service_->PutStart(key, slice_lengths, config); + auto put_start_result = + service_->PutStart(client_id, key, slice_length, config); ASSERT_TRUE(put_start_result.has_value()); - auto put_end_result = service_->PutEnd(key, ReplicaType::MEMORY); + auto put_end_result = service_->PutEnd(client_id, key, ReplicaType::MEMORY); ASSERT_TRUE(put_end_result.has_value()); // Verify object exists @@ -1035,9 +1051,11 @@ TEST_F(MasterServiceTest, CleanupStaleHandlesTest) { // Create another object std::string 
key2 = "another_segment_object"; - auto put_start_result2 = service_->PutStart(key2, slice_lengths, config); + auto put_start_result2 = + service_->PutStart(client_id, key2, slice_length, config); ASSERT_TRUE(put_start_result2.has_value()); - auto put_end_result2 = service_->PutEnd(key2, ReplicaType::MEMORY); + auto put_end_result2 = + service_->PutEnd(client_id, key2, ReplicaType::MEMORY); ASSERT_TRUE(put_end_result2.has_value()); // Verify we can get it @@ -1076,16 +1094,16 @@ TEST_F(MasterServiceTest, ConcurrentWriteAndRemoveAll) { for (int j = 0; j < objects_per_thread; ++j) { std::string key = "key_" + std::to_string(i) + "_" + std::to_string(j); - std::vector slice_lengths = {1024}; + uint64_t slice_length = 1024; ReplicateConfig config; config.replica_num = 1; std::vector replica_list; auto put_start_result = - service_->PutStart(key, slice_lengths, config); + service_->PutStart(client_id, key, slice_length, config); if (put_start_result.has_value()) { auto put_end_result = - service_->PutEnd(key, ReplicaType::MEMORY); + service_->PutEnd(client_id, key, ReplicaType::MEMORY); if (put_end_result.has_value()) { success_writes++; } @@ -1146,13 +1164,15 @@ TEST_F(MasterServiceTest, ConcurrentReadAndRemoveAll) { constexpr int num_objects = 1000; for (int i = 0; i < num_objects; ++i) { std::string key = "pre_key_" + std::to_string(i); - std::vector slice_lengths = {1024}; + uint64_t slice_length = 1024; ReplicateConfig config; config.replica_num = 1; - auto put_start_result = service_->PutStart(key, slice_lengths, config); + auto put_start_result = + service_->PutStart(client_id, key, slice_length, config); ASSERT_TRUE(put_start_result.has_value()); - auto put_end_result = service_->PutEnd(key, ReplicaType::MEMORY); + auto put_end_result = + service_->PutEnd(client_id, key, ReplicaType::MEMORY); ASSERT_TRUE(put_end_result.has_value()); } @@ -1225,13 +1245,15 @@ TEST_F(MasterServiceTest, ConcurrentRemoveAllOperations) { constexpr int num_objects = 1000; for (int i = 
0; i < num_objects; ++i) { std::string key = "pre_key_" + std::to_string(i); - std::vector slice_lengths = {1024}; + uint64_t slice_length = 1024; ReplicateConfig config; config.replica_num = 1; - auto put_start_result = service_->PutStart(key, slice_lengths, config); + auto put_start_result = + service_->PutStart(client_id, key, slice_length, config); ASSERT_TRUE(put_start_result.has_value()); - auto put_end_result = service_->PutEnd(key, ReplicaType::MEMORY); + auto put_end_result = + service_->PutEnd(client_id, key, ReplicaType::MEMORY); ASSERT_TRUE(put_end_result.has_value()); } @@ -1281,9 +1303,11 @@ TEST_F(MasterServiceTest, UnmountSegmentImmediateCleanup) { ASSERT_TRUE(mount_result2.has_value()); // Create two objects in the two segments - std::string key1 = GenerateKeyForSegment(service_, segment1.name); - std::string key2 = GenerateKeyForSegment(service_, segment2.name); - std::vector slice_lengths = {1024}; + std::string key1 = + GenerateKeyForSegment(client_id, service_, segment1.name); + std::string key2 = + GenerateKeyForSegment(client_id, service_, segment2.name); + uint64_t slice_length = 1024; ReplicateConfig config; config.replica_num = 1; @@ -1302,18 +1326,19 @@ TEST_F(MasterServiceTest, UnmountSegmentImmediateCleanup) { ASSERT_TRUE(get_result2.has_value()); // Verify put key1 will put into segment2 rather than segment1 - auto put_start_result = service_->PutStart(key1, slice_lengths, config); + auto put_start_result = + service_->PutStart(client_id, key1, slice_length, config); ASSERT_TRUE(put_start_result.has_value()); replica_list = put_start_result.value(); - auto put_end_result = service_->PutEnd(key1, ReplicaType::MEMORY); + auto put_end_result = + service_->PutEnd(client_id, key1, ReplicaType::MEMORY); ASSERT_TRUE(put_end_result.has_value()); auto get_result3 = service_->GetReplicaList(key1); ASSERT_TRUE(get_result3.has_value()); auto retrieved = get_result3.value(); ASSERT_EQ(replica_list[0] .get_memory_descriptor() - 
.buffer_descriptors[0] - .transport_endpoint_, + .buffer_descriptor.transport_endpoint_, segment2.name); } @@ -1336,14 +1361,16 @@ TEST_F(MasterServiceTest, ReadableAfterPartialUnmountWithReplication) { // Put a key with 2 replicas std::string key = "replicated_key"; - std::vector slice_lengths = {object_size}; + uint64_t slice_length = object_size; ReplicateConfig config; config.replica_num = 2; - auto put_start_result = service_->PutStart(key, slice_lengths, config); + auto put_start_result = + service_->PutStart(client_id, key, slice_length, config); ASSERT_TRUE(put_start_result.has_value()); ASSERT_EQ(2u, put_start_result->size()); - ASSERT_TRUE(service_->PutEnd(key, ReplicaType::MEMORY).has_value()); + ASSERT_TRUE( + service_->PutEnd(client_id, key, ReplicaType::MEMORY).has_value()); // Verify two replicas exist and they are on distinct segments auto get_result = service_->GetReplicaList(key); @@ -1354,8 +1381,8 @@ TEST_F(MasterServiceTest, ReadableAfterPartialUnmountWithReplication) { for (const auto& rep : replicas) { ASSERT_EQ(ReplicaStatus::COMPLETE, rep.status); const auto& mem = rep.get_memory_descriptor(); - ASSERT_EQ(1u, mem.buffer_descriptors.size()); - seg_names.insert(mem.buffer_descriptors[0].transport_endpoint_); + ASSERT_EQ(slice_length, mem.buffer_descriptor.size_); + seg_names.insert(mem.buffer_descriptor.transport_endpoint_); } ASSERT_EQ(2u, seg_names.size()) << "Replicas should be on different segments"; @@ -1390,7 +1417,8 @@ TEST_F(MasterServiceTest, UnmountSegmentPerformance) { // Create `kNumKeys` keys for (int i = 0; i < kNumKeys; ++i) { - std::string key = GenerateKeyForSegment(service_, segment_name); + std::string key = + GenerateKeyForSegment(client_id, service_, segment_name); keys.push_back(key); } @@ -1435,16 +1463,18 @@ TEST_F(MasterServiceTest, RemoveLeasedObject) { .build(); std::unique_ptr service_(new MasterService(service_config)); [[maybe_unused]] const auto context = PrepareSimpleSegment(*service_); + const UUID client_id = 
generate_uuid(); std::string key = "test_key"; - std::vector slice_lengths = {1024}; + uint64_t slice_length = 1024; ReplicateConfig config; config.replica_num = 1; // Verify lease is granted on ExistsKey - auto put_start_result = service_->PutStart(key, slice_lengths, config); + auto put_start_result = + service_->PutStart(client_id, key, slice_length, config); ASSERT_TRUE(put_start_result.has_value()); - auto put_end_result = service_->PutEnd(key, ReplicaType::MEMORY); + auto put_end_result = service_->PutEnd(client_id, key, ReplicaType::MEMORY); ASSERT_TRUE(put_end_result.has_value()); auto exist_result = service_->ExistKey(key); ASSERT_TRUE(exist_result.has_value()); @@ -1456,9 +1486,11 @@ TEST_F(MasterServiceTest, RemoveLeasedObject) { EXPECT_TRUE(remove_result2.has_value()); // Verify lease is extended on successive ExistsKey - auto put_start_result2 = service_->PutStart(key, slice_lengths, config); + auto put_start_result2 = + service_->PutStart(client_id, key, slice_length, config); ASSERT_TRUE(put_start_result2.has_value()); - auto put_end_result2 = service_->PutEnd(key, ReplicaType::MEMORY); + auto put_end_result2 = + service_->PutEnd(client_id, key, ReplicaType::MEMORY); ASSERT_TRUE(put_end_result2.has_value()); auto exist_result2 = service_->ExistKey(key); ASSERT_TRUE(exist_result2.has_value()); @@ -1473,9 +1505,11 @@ TEST_F(MasterServiceTest, RemoveLeasedObject) { EXPECT_TRUE(remove_result4.has_value()); // Verify lease is granted on GetReplicaList - auto put_start_result3 = service_->PutStart(key, slice_lengths, config); + auto put_start_result3 = + service_->PutStart(client_id, key, slice_length, config); ASSERT_TRUE(put_start_result3.has_value()); - auto put_end_result3 = service_->PutEnd(key, ReplicaType::MEMORY); + auto put_end_result3 = + service_->PutEnd(client_id, key, ReplicaType::MEMORY); ASSERT_TRUE(put_end_result3.has_value()); auto get_result = service_->GetReplicaList(key); ASSERT_TRUE(get_result.has_value()); @@ -1487,9 +1521,11 @@ 
TEST_F(MasterServiceTest, RemoveLeasedObject) { EXPECT_TRUE(remove_result6.has_value()); // Verify lease is extended on successive GetReplicaList - auto put_start_result4 = service_->PutStart(key, slice_lengths, config); + auto put_start_result4 = + service_->PutStart(client_id, key, slice_length, config); ASSERT_TRUE(put_start_result4.has_value()); - auto put_end_result4 = service_->PutEnd(key, ReplicaType::MEMORY); + auto put_end_result4 = + service_->PutEnd(client_id, key, ReplicaType::MEMORY); ASSERT_TRUE(put_end_result4.has_value()); auto get_result2 = service_->GetReplicaList(key); ASSERT_TRUE(get_result2.has_value()); @@ -1516,14 +1552,17 @@ TEST_F(MasterServiceTest, RemoveAllLeasedObject) { .build(); std::unique_ptr service_(new MasterService(service_config)); [[maybe_unused]] const auto context = PrepareSimpleSegment(*service_); + const UUID client_id = generate_uuid(); for (int i = 0; i < 10; ++i) { std::string key = "test_key" + std::to_string(i); - std::vector slice_lengths = {1024}; + uint64_t slice_length = 1024; ReplicateConfig config; config.replica_num = 1; - auto put_start_result = service_->PutStart(key, slice_lengths, config); + auto put_start_result = + service_->PutStart(client_id, key, slice_length, config); ASSERT_TRUE(put_start_result.has_value()); - auto put_end_result = service_->PutEnd(key, ReplicaType::MEMORY); + auto put_end_result = + service_->PutEnd(client_id, key, ReplicaType::MEMORY); ASSERT_TRUE(put_end_result.has_value()); if (i >= 5) { auto exist_result = service_->ExistKey(key); @@ -1553,6 +1592,7 @@ TEST_F(MasterServiceTest, EvictObject) { .set_default_kv_lease_ttl(kv_lease_ttl) .build(); std::unique_ptr service_(new MasterService(service_config)); + const UUID client_id = generate_uuid(); // Mount a segment that can hold about 1024 * 16 objects. 
// As the eviction is processed separately for each shard, // we need to fill each shard with enough objects to thoroughly @@ -1567,12 +1607,14 @@ TEST_F(MasterServiceTest, EvictObject) { int success_puts = 0; for (int i = 0; i < 1024 * 16 + 50; ++i) { std::string key = "test_key" + std::to_string(i); - std::vector slice_lengths = {object_size}; + uint64_t slice_length = object_size; ReplicateConfig config; config.replica_num = 1; - auto put_start_result = service_->PutStart(key, slice_lengths, config); + auto put_start_result = + service_->PutStart(client_id, key, slice_length, config); if (put_start_result.has_value()) { - auto put_end_result = service_->PutEnd(key, ReplicaType::MEMORY); + auto put_end_result = + service_->PutEnd(client_id, key, ReplicaType::MEMORY); ASSERT_TRUE(put_end_result.has_value()); success_puts++; } else { @@ -1592,6 +1634,7 @@ TEST_F(MasterServiceTest, TryEvictLeasedObject) { .set_default_kv_lease_ttl(kv_lease_ttl) .build(); std::unique_ptr service_(new MasterService(service_config)); + const UUID client_id = generate_uuid(); constexpr size_t buffer = 0x300000000; constexpr size_t size = 1024 * 1024 * 16; constexpr size_t object_size = 1024 * 1024; @@ -1604,12 +1647,14 @@ TEST_F(MasterServiceTest, TryEvictLeasedObject) { std::vector leased_keys; for (int i = 0; i < 16 + 10; ++i) { std::string key = "test_key" + std::to_string(i); - std::vector slice_lengths = {object_size}; + uint64_t slice_length = object_size; ReplicateConfig config; config.replica_num = 1; - auto put_start_result = service_->PutStart(key, slice_lengths, config); + auto put_start_result = + service_->PutStart(client_id, key, slice_length, config); if (put_start_result.has_value()) { - auto put_end_result = service_->PutEnd(key, ReplicaType::MEMORY); + auto put_end_result = + service_->PutEnd(client_id, key, ReplicaType::MEMORY); ASSERT_TRUE(put_end_result.has_value()); // the object is leased auto get_result = service_->GetReplicaList(key); @@ -1645,6 +1690,7 @@ 
TEST_F(MasterServiceTest, RemoveSoftPinObject) { allow_evict_soft_pinned_objects) .build(); std::unique_ptr service_(new MasterService(service_config)); + const UUID client_id = generate_uuid(); // Mount segment and put an object constexpr size_t buffer = 0x300000000; constexpr size_t size = 1024 * 1024 * 16; @@ -1652,19 +1698,23 @@ TEST_F(MasterServiceTest, RemoveSoftPinObject) { PrepareSimpleSegment(*service_, "test_segment", buffer, size); std::string key = "test_key"; - std::vector slice_lengths = {1024}; + uint64_t slice_length = 1024; ReplicateConfig config; config.replica_num = 1; config.with_soft_pin = true; // Verify soft pin does not block remove - ASSERT_TRUE(service_->PutStart(key, slice_lengths, config).has_value()); - ASSERT_TRUE(service_->PutEnd(key, ReplicaType::MEMORY).has_value()); + ASSERT_TRUE( + service_->PutStart(client_id, key, slice_length, config).has_value()); + ASSERT_TRUE( + service_->PutEnd(client_id, key, ReplicaType::MEMORY).has_value()); EXPECT_TRUE(service_->Remove(key).has_value()); // Verify soft pin does not block RemoveAll - ASSERT_TRUE(service_->PutStart(key, slice_lengths, config).has_value()); - ASSERT_TRUE(service_->PutEnd(key, ReplicaType::MEMORY).has_value()); + ASSERT_TRUE( + service_->PutStart(client_id, key, slice_length, config).has_value()); + ASSERT_TRUE( + service_->PutEnd(client_id, key, ReplicaType::MEMORY).has_value()); EXPECT_EQ(1, service_->RemoveAll()); } @@ -1682,6 +1732,7 @@ TEST_F(MasterServiceTest, SoftPinObjectsNotEvictedBeforeOtherObjects) { .set_eviction_ratio(eviction_ratio) .build(); std::unique_ptr service_(new MasterService(service_config)); + const UUID client_id = generate_uuid(); // Mount segment and put an object constexpr size_t buffer = 0x300000000; @@ -1695,28 +1746,32 @@ TEST_F(MasterServiceTest, SoftPinObjectsNotEvictedBeforeOtherObjects) { // Put pin_key first for (int i = 0; i < 2; i++) { std::string pin_key = "pin_key" + std::to_string(i); - std::vector slice_lengths = {value_size}; + 
uint64_t slice_length = value_size; ReplicateConfig soft_pin_config; soft_pin_config.replica_num = 1; soft_pin_config.with_soft_pin = true; + ASSERT_TRUE(service_ + ->PutStart(client_id, pin_key, slice_length, + soft_pin_config) + .has_value()); ASSERT_TRUE( - service_->PutStart(pin_key, slice_lengths, soft_pin_config) + service_->PutEnd(client_id, pin_key, ReplicaType::MEMORY) .has_value()); - ASSERT_TRUE( - service_->PutEnd(pin_key, ReplicaType::MEMORY).has_value()); } // Fill the segment to trigger eviction int failed_puts = 0; for (int i = 0; i < 20; i++) { std::string key = "key" + std::to_string(i); - std::vector slice_lengths = {value_size}; + uint64_t slice_length = value_size; ReplicateConfig config; config.replica_num = 1; - if (service_->PutStart(key, slice_lengths, config).has_value()) { + if (service_->PutStart(client_id, key, slice_length, config) + .has_value()) { ASSERT_TRUE( - service_->PutEnd(key, ReplicaType::MEMORY).has_value()); + service_->PutEnd(client_id, key, ReplicaType::MEMORY) + .has_value()); } else { failed_puts++; } @@ -1750,6 +1805,7 @@ TEST_F(MasterServiceTest, SoftPinObjectsCanBeEvicted) { allow_evict_soft_pinned_objects) .build(); std::unique_ptr service_(new MasterService(service_config)); + const UUID client_id = generate_uuid(); // Mount segment and put an object constexpr size_t buffer = 0x300000000; @@ -1762,12 +1818,14 @@ TEST_F(MasterServiceTest, SoftPinObjectsCanBeEvicted) { int success_puts = 0; for (int i = 0; i < 16 + 50; ++i) { std::string key = "test_key" + std::to_string(i); - std::vector slice_lengths = {value_size}; + uint64_t slice_length = value_size; ReplicateConfig config; config.replica_num = 1; config.with_soft_pin = true; - if (service_->PutStart(key, slice_lengths, config).has_value()) { - ASSERT_TRUE(service_->PutEnd(key, ReplicaType::MEMORY).has_value()); + if (service_->PutStart(client_id, key, slice_length, config) + .has_value()) { + ASSERT_TRUE(service_->PutEnd(client_id, key, ReplicaType::MEMORY) + 
.has_value()); success_puts++; } else { // wait for eviction to work @@ -1797,6 +1855,7 @@ TEST_F(MasterServiceTest, SoftPinExtendedOnGet) { .set_eviction_ratio(eviction_ratio) .build(); std::unique_ptr service_(new MasterService(service_config)); + const UUID client_id = generate_uuid(); // Mount segment and put an object constexpr size_t buffer = 0x300000000; @@ -1810,15 +1869,16 @@ TEST_F(MasterServiceTest, SoftPinExtendedOnGet) { // Put pin_key first for (int i = 0; i < 2; i++) { std::string pin_key = "pin_key" + std::to_string(i); - std::vector slice_lengths = {value_size}; + uint64_t slice_length = value_size; ReplicateConfig soft_pin_config; soft_pin_config.replica_num = 1; soft_pin_config.with_soft_pin = true; + ASSERT_TRUE(service_->PutStart(client_id, pin_key, slice_length, + soft_pin_config)); ASSERT_TRUE( - service_->PutStart(pin_key, slice_lengths, soft_pin_config)); - ASSERT_TRUE( - service_->PutEnd(pin_key, ReplicaType::MEMORY).has_value()); + service_->PutEnd(client_id, pin_key, ReplicaType::MEMORY) + .has_value()); } // Wait for the soft pin to expire @@ -1834,12 +1894,14 @@ TEST_F(MasterServiceTest, SoftPinExtendedOnGet) { int failed_puts = 0; for (int i = 0; i < 16; i++) { std::string key = "key" + std::to_string(i); - std::vector slice_lengths = {value_size}; + uint64_t slice_length = value_size; ReplicateConfig config; config.replica_num = 1; - if (service_->PutStart(key, slice_lengths, config).has_value()) { + if (service_->PutStart(client_id, key, slice_length, config) + .has_value()) { ASSERT_TRUE( - service_->PutEnd(key, ReplicaType::MEMORY).has_value()); + service_->PutEnd(client_id, key, ReplicaType::MEMORY) + .has_value()); } else { failed_puts++; } @@ -1876,6 +1938,7 @@ TEST_F(MasterServiceTest, SoftPinObjectsNotAllowEvict) { allow_evict_soft_pinned_objects) .build(); std::unique_ptr service_(new MasterService(service_config)); + const UUID client_id = generate_uuid(); // Mount segment and put an object constexpr size_t buffer = 
0x300000000; @@ -1888,12 +1951,14 @@ TEST_F(MasterServiceTest, SoftPinObjectsNotAllowEvict) { std::vector success_keys; for (int i = 0; i < 16 + 50; ++i) { std::string key = "test_key" + std::to_string(i); - std::vector slice_lengths = {value_size}; + uint64_t slice_length = value_size; ReplicateConfig config; config.replica_num = 1; config.with_soft_pin = true; - if (service_->PutStart(key, slice_lengths, config).has_value()) { - ASSERT_TRUE(service_->PutEnd(key, ReplicaType::MEMORY).has_value()); + if (service_->PutStart(client_id, key, slice_length, config) + .has_value()) { + ASSERT_TRUE(service_->PutEnd(client_id, key, ReplicaType::MEMORY) + .has_value()); success_keys.push_back(key); } else { // wait for eviction to work @@ -1909,8 +1974,9 @@ TEST_F(MasterServiceTest, SoftPinObjectsNotAllowEvict) { service_->RemoveAll(); } -TEST_F(MasterServiceTest, PerSliceReplicaSegmentsAreUnique) { +TEST_F(MasterServiceTest, ReplicaSegmentsAreUnique) { std::unique_ptr service_(new MasterService()); + const UUID client_id = generate_uuid(); // Mount 20 segments, each 16MB and slab-aligned constexpr size_t kBaseAddr = 0x300000000; @@ -1923,34 +1989,34 @@ TEST_F(MasterServiceTest, PerSliceReplicaSegmentsAreUnique) { // Object with 16 slices of ~1MB and replication factor 10 const std::string key = "replica_uniqueness_test_key"; - std::vector slice_lengths(16, 1024 * 1024 - 16); + uint64_t slice_length = 1024 * 1024 - 16; ReplicateConfig config; config.replica_num = 10; - auto put_start_result = service_->PutStart(key, slice_lengths, config); + auto put_start_result = + service_->PutStart(client_id, key, slice_length, config); ASSERT_TRUE(put_start_result.has_value()); auto replica_list_local = put_start_result.value(); ASSERT_EQ(config.replica_num, replica_list_local.size()); - // For each slice index, segment names across replicas must be unique - for (size_t slice_idx = 0; slice_idx < slice_lengths.size(); ++slice_idx) { - std::unordered_set segment_names; - for (const 
auto& replica : replica_list_local) { - ASSERT_TRUE(replica.is_memory_replica()); - const auto& mem = replica.get_memory_descriptor(); - ASSERT_EQ(slice_lengths.size(), mem.buffer_descriptors.size()); - segment_names.insert( - mem.buffer_descriptors[slice_idx].transport_endpoint_); - } - EXPECT_EQ(segment_names.size(), config.replica_num) - << "Duplicate segment found for slice index " << slice_idx; + // Segment names across replicas must be unique + std::unordered_set segment_names; + for (const auto& replica : replica_list_local) { + ASSERT_TRUE(replica.is_memory_replica()); + const auto& mem = replica.get_memory_descriptor(); + ASSERT_EQ(slice_length, mem.buffer_descriptor.size_); + segment_names.insert(mem.buffer_descriptor.transport_endpoint_); } + EXPECT_EQ(segment_names.size(), config.replica_num) + << "Duplicate segment found"; - ASSERT_TRUE(service_->PutEnd(key, ReplicaType::MEMORY).has_value()); + ASSERT_TRUE( + service_->PutEnd(client_id, key, ReplicaType::MEMORY).has_value()); } TEST_F(MasterServiceTest, ReplicationFactorTwoWithSingleSegment) { std::unique_ptr service_(new MasterService()); + const UUID client_id = generate_uuid(); // Mount a single 16MB segment constexpr size_t kBaseAddr = 0x300000000; @@ -1961,11 +2027,12 @@ TEST_F(MasterServiceTest, ReplicationFactorTwoWithSingleSegment) { // Request replication factor 2 with a single 1KB slice // With best-effort semantics, should succeed with 1 replica const std::string key = "replication_factor_two_single_segment"; - std::vector slice_lengths{1024}; + uint64_t slice_length = 1024; ReplicateConfig config; config.replica_num = 2; - auto put_start_result = service_->PutStart(key, slice_lengths, config); + auto put_start_result = + service_->PutStart(client_id, key, slice_length, config); ASSERT_TRUE(put_start_result.has_value()); auto replicas = put_start_result.value(); @@ -1975,14 +2042,13 @@ TEST_F(MasterServiceTest, ReplicationFactorTwoWithSingleSegment) { // Verify the replica is properly 
allocated on the single segment auto mem_desc = replicas[0].get_memory_descriptor(); - EXPECT_EQ(1u, mem_desc.buffer_descriptors.size()); - EXPECT_EQ("single_segment", - mem_desc.buffer_descriptors[0].transport_endpoint_); - EXPECT_EQ(1024u, mem_desc.buffer_descriptors[0].size_); + EXPECT_EQ("single_segment", mem_desc.buffer_descriptor.transport_endpoint_); + EXPECT_EQ(1024u, mem_desc.buffer_descriptor.size_); } TEST_F(MasterServiceTest, BatchExistKeyTest) { std::unique_ptr service_(new MasterService()); + const UUID client_id = generate_uuid(); // Mount a segment constexpr size_t buffer = 0x300000000; @@ -1997,12 +2063,12 @@ TEST_F(MasterServiceTest, BatchExistKeyTest) { test_keys.push_back("test_key" + std::to_string(i)); ReplicateConfig config; config.replica_num = 1; - std::vector slice_lengths = {value_size}; + uint64_t slice_length = value_size; auto put_start_result = - service_->PutStart(test_keys[i], slice_lengths, config); + service_->PutStart(client_id, test_keys[i], slice_length, config); ASSERT_TRUE(put_start_result.has_value()); auto put_end_result = - service_->PutEnd(test_keys[i], ReplicaType::MEMORY); + service_->PutEnd(client_id, test_keys[i], ReplicaType::MEMORY); ASSERT_TRUE(put_end_result.has_value()); } @@ -2021,6 +2087,153 @@ TEST_F(MasterServiceTest, BatchExistKeyTest) { ASSERT_FALSE(exist_resp[test_object_num].value()); } +TEST_F(MasterServiceTest, PutStartExpiringTest) { + // Reset storage space metrics. + MasterMetricManager::instance().reset_allocated_mem_size(); + MasterMetricManager::instance().reset_total_mem_capacity(); + + MasterServiceConfig master_config; + master_config.put_start_discard_timeout_sec = 3; + master_config.put_start_release_timeout_sec = 5; + std::unique_ptr service_(new MasterService(master_config)); + + constexpr size_t kReplicaCnt = 3; + constexpr size_t kBaseAddr = 0x300000000; + constexpr size_t kSegmentSize = 1024 * 1024 * 16; // 16MB + + // Mount 3 segments. 
+ std::vector contexts; + contexts.reserve(kReplicaCnt); + for (size_t i = 0; i < kReplicaCnt; ++i) { + auto context = PrepareSimpleSegment( + *service_, "segment_" + std::to_string(i), + kBaseAddr + static_cast(i) * kSegmentSize, kSegmentSize); + contexts.push_back(context); + } + + // The client_id used to put objects. + auto client_id = generate_uuid(); + std::string key_1 = "test_key_1", key_2 = "test_key_2"; + uint64_t value_length = 6 * 1024 * 1024; // 6MB + uint64_t slice_length = value_length; + ReplicateConfig config; + config.replica_num = kReplicaCnt; + + // Put key_1, should success. + auto put_start_result = + service_->PutStart(client_id, key_1, slice_length, config); + EXPECT_TRUE(put_start_result.has_value()); + replica_list = put_start_result.value(); + EXPECT_EQ(replica_list.size(), kReplicaCnt); + for (size_t i = 0; i < kReplicaCnt; i++) { + EXPECT_EQ(ReplicaStatus::PROCESSING, replica_list[i].status); + } + + // Put key_1 again, should fail because the key exists. + put_start_result = + service_->PutStart(client_id, key_1, slice_length, config); + EXPECT_FALSE(put_start_result.has_value()); + EXPECT_EQ(put_start_result.error(), ErrorCode::OBJECT_ALREADY_EXISTS); + + // Wait for a while until the put-start expired. + for (size_t i = 0; i <= master_config.put_start_discard_timeout_sec; i++) { + for (auto& context : contexts) { + auto result = service_->Ping(context.client_id); + EXPECT_TRUE(result.has_value()); + } + std::this_thread::sleep_for(std::chrono::seconds(1)); + } + + // Put key_1 again, should success because the old one has expired and will + // be discarded by this put. + put_start_result = + service_->PutStart(client_id, key_1, slice_length, config); + EXPECT_TRUE(put_start_result.has_value()); + replica_list = put_start_result.value(); + EXPECT_EQ(replica_list.size(), kReplicaCnt); + for (size_t i = 0; i < kReplicaCnt; i++) { + EXPECT_EQ(ReplicaStatus::PROCESSING, replica_list[i].status); + } + + // Complete key_1. 
+ auto put_end_result = + service_->PutEnd(client_id, key_1, ReplicaType::MEMORY); + EXPECT_TRUE(put_end_result.has_value()); + + // Protect key_1 from eviction. + auto get_result = service_->GetReplicaList(key_1); + EXPECT_TRUE(get_result.has_value()); + + // Put key_2, should fail because the key_1 occupied 12MB (6MB processing, + // 6MB discarded but not yet released) on each segment. + put_start_result = + service_->PutStart(client_id, key_2, slice_length, config); + EXPECT_FALSE(put_start_result.has_value()); + EXPECT_EQ(put_start_result.error(), ErrorCode::NO_AVAILABLE_HANDLE); + + // Wait for a while until the discarded replicas are released. + for (size_t i = 0; i <= master_config.put_start_release_timeout_sec - + master_config.put_start_discard_timeout_sec; + i++) { + for (auto& context : contexts) { + auto result = service_->Ping(context.client_id); + EXPECT_TRUE(result.has_value()); + } + // Protect key_1 from eviction. + auto get_result = service_->GetReplicaList(key_1); + EXPECT_TRUE(get_result.has_value()); + std::this_thread::sleep_for(std::chrono::seconds(1)); + } + + // Put key_2 again, should success because the discarded replica has been + // released. + put_start_result = + service_->PutStart(client_id, key_2, slice_length, config); + EXPECT_TRUE(put_start_result.has_value()); + replica_list = put_start_result.value(); + EXPECT_EQ(replica_list.size(), kReplicaCnt); + for (size_t i = 0; i < kReplicaCnt; i++) { + EXPECT_EQ(ReplicaStatus::PROCESSING, replica_list[i].status); + } + + // Wait for a while until key_2 can be discarded and released. + for (size_t i = 0; i <= master_config.put_start_release_timeout_sec; i++) { + for (auto& context : contexts) { + auto result = service_->Ping(context.client_id); + EXPECT_TRUE(result.has_value()); + } + // Protect key_1 from eviction. 
+ auto get_result = service_->GetReplicaList(key_1); + EXPECT_TRUE(get_result.has_value()); + std::this_thread::sleep_for(std::chrono::seconds(1)); + } + + // Put key_2 again, should fail because eviction has not been triggered. And + // this PutStart should trigger the eviction. + put_start_result = + service_->PutStart(client_id, key_2, slice_length, config); + EXPECT_FALSE(put_start_result.has_value()); + EXPECT_EQ(put_start_result.error(), ErrorCode::NO_AVAILABLE_HANDLE); + + // Wait a moment for the eviction to complete. + std::this_thread::sleep_for(std::chrono::milliseconds(50)); + + // Put key_2 again, should success because the previous one has been + // discarded and released. + put_start_result = + service_->PutStart(client_id, key_2, slice_length, config); + EXPECT_TRUE(put_start_result.has_value()); + replica_list = put_start_result.value(); + EXPECT_EQ(replica_list.size(), kReplicaCnt); + for (size_t i = 0; i < kReplicaCnt; i++) { + EXPECT_EQ(ReplicaStatus::PROCESSING, replica_list[i].status); + } + + // Complete key_2. 
+ put_end_result = service_->PutEnd(client_id, key_2, ReplicaType::MEMORY); + EXPECT_TRUE(put_end_result.has_value()); +} + } // namespace mooncake::test int main(int argc, char** argv) { diff --git a/mooncake-store/tests/pybind_client_test.cpp b/mooncake-store/tests/pybind_client_test.cpp index 4ae18c0ba..753402f25 100644 --- a/mooncake-store/tests/pybind_client_test.cpp +++ b/mooncake-store/tests/pybind_client_test.cpp @@ -16,6 +16,19 @@ DEFINE_string(device_name, "", "Device name to use, valid if protocol=rdma"); namespace mooncake { namespace testing { +// Helper class to temporarily mute glog output by setting log level to FATAL +class GLogMuter { + public: + GLogMuter() : original_log_level_(FLAGS_minloglevel) { + FLAGS_minloglevel = google::GLOG_FATAL; + } + + ~GLogMuter() { FLAGS_minloglevel = original_log_level_; } + + private: + int original_log_level_; +}; + class PyClientTest : public ::testing::Test { protected: static void SetUpTestSuite() { @@ -181,7 +194,11 @@ TEST_F(PyClientTest, GetWithLeaseTimeOut) { EXPECT_EQ(batch_put_result, 0) << "Batch put operation should succeed"; // Test Batch Get operation using batch_get_buffer - auto buffer_handles = py_client_->batch_get_buffer(keys); + std::vector> buffer_handles; + { + GLogMuter muter; + buffer_handles = py_client_->batch_get_buffer(keys); + } ASSERT_EQ(buffer_handles.size(), num_slices) << "Should return handles for all keys"; int fail_count = 0; @@ -190,13 +207,18 @@ TEST_F(PyClientTest, GetWithLeaseTimeOut) { fail_count++; } } + LOG(INFO) << "Batch get buffer " << fail_count << " out of " << num_slices << " keys failed"; ASSERT_NE(fail_count, 0) << "Should fail for some keys"; // Test Batch Get operation using batch_get_into - auto bytes_read_results = - py_client_->batch_get_into(keys, buffers, sizes); + std::vector bytes_read_results; + { + GLogMuter muter; + bytes_read_results = + py_client_->batch_get_into(keys, buffers, sizes); + } ASSERT_EQ(bytes_read_results.size(), num_slices) << "Should 
return results for all keys"; fail_count = 0; @@ -246,6 +268,8 @@ TEST_F(PyClientTest, ConcurrentPutGetWithLeaseTimeOut) { std::vector threads; std::barrier sync_barrier(num_threads); + GLogMuter muter; + // Start num_threads threads, each repeatedly putting their slice for (int thread_idx = 0; thread_idx < num_threads; ++thread_idx) { threads.emplace_back([this, thread_idx, kv_lease_ttl_, @@ -349,6 +373,8 @@ TEST_F(PyClientTest, ConcurrentPutGetWithLeaseTimeOut) { std::vector threads; std::barrier sync_barrier(num_threads); + GLogMuter muter; + // Start num_threads threads, each putting multiple slices for (int thread_idx = 0; thread_idx < num_threads; ++thread_idx) { threads.emplace_back([this, thread_idx, kv_lease_ttl_, @@ -475,7 +501,17 @@ TEST_F(PyClientTest, TestSetupExistTransferEngine) { ? FLAGS_device_name : std::string(""); auto transfer_engine = std::make_shared("P2PHANDSHAKE"); - transfer_engine->init("P2PHANDSHAKE", "localhost:17813"); + + // The auto discover has some problems in GitHub CI, so disable it here. + transfer_engine->setAutoDiscover(false); + auto init_ret = transfer_engine->init("P2PHANDSHAKE", "localhost:17813"); + ASSERT_EQ(init_ret, 0) << "Transfer engine initialization should succeed"; + if (FLAGS_protocol == "tcp") { + auto transport = transfer_engine->installTransport("tcp", nullptr); + ASSERT_NE(transport, nullptr) << "Install transport should succeed"; + } else { + ASSERT_TRUE(false) << "Unsupported protocol: " << FLAGS_protocol; + } ASSERT_EQ( py_client_->setup("localhost:17813", "P2PHANDSHAKE", 16 * 1024 * 1024, 16 * 1024 * 1024, FLAGS_protocol, rdma_devices, @@ -505,16 +541,24 @@ TEST_F(PyClientTest, TestBatchPutAndGetMultiBuffers) { const std::string rdma_devices = (FLAGS_protocol == std::string("rdma")) ? 
FLAGS_device_name : std::string(""); - auto transfer_engine = std::make_shared("P2PHANDSHAKE"); - transfer_engine->init("P2PHANDSHAKE", "localhost:17813"); - ASSERT_EQ( - py_client_->setup("localhost:17813", "P2PHANDSHAKE", 16 * 1024 * 1024, - 16 * 1024 * 1024, FLAGS_protocol, rdma_devices, - master_address_, transfer_engine), - 0); + ASSERT_EQ(py_client_->setup("localhost:17813", "P2PHANDSHAKE", + 16 * 1024 * 1024, 16 * 1024 * 1024, + FLAGS_protocol, rdma_devices, master_address_), + 0); std::string test_data(1000, '1'); std::string dst_data(1000, '0'); + + // Register buffers for zero-copy operations + int reg_result_test = + py_client_->register_buffer(test_data.data(), test_data.size()); + ASSERT_EQ(reg_result_test, 0) + << "Test data buffer registration should succeed"; + int reg_result_dst = + py_client_->register_buffer(dst_data.data(), dst_data.size()); + ASSERT_EQ(reg_result_dst, 0) + << "Dst data buffer registration should succeed"; + std::vector keys; std::vector> all_ptrs; std::vector> all_dst_ptrs; @@ -551,6 +595,14 @@ TEST_F(PyClientTest, TestBatchPutAndGetMultiBuffers) { EXPECT_EQ(result, 100) << "Get operation should succeed"; } EXPECT_EQ(dst_data, test_data) << "Retrieved data should match original"; + + // Unregister buffers + int unreg_result_test = py_client_->unregister_buffer(test_data.data()); + ASSERT_EQ(unreg_result_test, 0) + << "Test data buffer unregistration should succeed"; + int unreg_result_dst = py_client_->unregister_buffer(dst_data.data()); + ASSERT_EQ(unreg_result_dst, 0) + << "Dst data buffer unregistration should succeed"; } } // namespace testing diff --git a/mooncake-transfer-engine/include/CMakeLists.txt b/mooncake-transfer-engine/include/CMakeLists.txt index 29de02668..4f52e1e5c 100644 --- a/mooncake-transfer-engine/include/CMakeLists.txt +++ b/mooncake-transfer-engine/include/CMakeLists.txt @@ -1 +1,11 @@ install(FILES transfer_engine_c.h DESTINATION include) +install(FILES common.h DESTINATION include) +install(FILES 
config.h DESTINATION include) +install(FILES error.h DESTINATION include) +install(FILES memory_location.h DESTINATION include) +install(FILES multi_transport.h DESTINATION include) +install(FILES topology.h DESTINATION include) +install(FILES transfer_engine.h DESTINATION include) +install(FILES transfer_metadata.h DESTINATION include) +install(FILES common/base/status.h DESTINATION include/common/base) +install(FILES transport/transport.h DESTINATION include/transport) diff --git a/mooncake-transfer-engine/include/config.h b/mooncake-transfer-engine/include/config.h index 6098b552b..41e92ecc6 100644 --- a/mooncake-transfer-engine/include/config.h +++ b/mooncake-transfer-engine/include/config.h @@ -23,6 +23,12 @@ #include namespace mooncake { + +enum class EndpointStoreType { + FIFO = 0, + SIEVE = 1, +}; + struct GlobalConfig { size_t num_cq_per_ctx = 1; size_t num_comp_channels_per_ctx = 1; @@ -50,6 +56,7 @@ struct GlobalConfig { bool use_ipv6 = false; size_t fragment_limit = 16384; bool enable_dest_device_affinity = false; + EndpointStoreType endpoint_store_type = EndpointStoreType::SIEVE; }; void loadGlobalConfig(GlobalConfig &config); @@ -61,6 +68,7 @@ void updateGlobalConfig(ibv_device_attr &device_attr); GlobalConfig &globalConfig(); uint16_t getDefaultHandshakePort(); + } // namespace mooncake -#endif // CONFIG_H \ No newline at end of file +#endif // CONFIG_H diff --git a/mooncake-transfer-engine/include/transport/ascend_transport/ascend_direct_transport/ascend_direct_transport.h b/mooncake-transfer-engine/include/transport/ascend_transport/ascend_direct_transport/ascend_direct_transport.h index fb60f3ee7..ec3b40698 100644 --- a/mooncake-transfer-engine/include/transport/ascend_transport/ascend_direct_transport/ascend_direct_transport.h +++ b/mooncake-transfer-engine/include/transport/ascend_transport/ascend_direct_transport/ascend_direct_transport.h @@ -124,8 +124,8 @@ class AscendDirectTransport : public Transport { int32_t device_logic_id_{}; 
aclrtContext rt_context_{nullptr}; - int32_t connect_timeout_ = 3000; - int32_t transfer_timeout_ = 3000; + int32_t connect_timeout_ = 10000; + int32_t transfer_timeout_ = 10000; std::string local_adxl_engine_name_{}; aclrtStream stream_{}; bool use_buffer_pool_{false}; diff --git a/mooncake-transfer-engine/include/transport/tcp_transport/tcp_transport.h b/mooncake-transfer-engine/include/transport/tcp_transport/tcp_transport.h index adf96c241..f8c95d7e2 100644 --- a/mooncake-transfer-engine/include/transport/tcp_transport/tcp_transport.h +++ b/mooncake-transfer-engine/include/transport/tcp_transport/tcp_transport.h @@ -60,6 +60,8 @@ class TcpTransport : public Transport { std::shared_ptr meta, std::shared_ptr topo); + int startHandshakeDaemon(); + int allocateLocalSegmentID(int tcp_data_port); int registerLocalMemory(void *addr, size_t length, diff --git a/mooncake-transfer-engine/include/transport/transport.h b/mooncake-transfer-engine/include/transport/transport.h index dcccc7789..2f733dead 100644 --- a/mooncake-transfer-engine/include/transport/transport.h +++ b/mooncake-transfer-engine/include/transport/transport.h @@ -26,6 +26,10 @@ #include #include #include +#include +#include +#include +#include #include "common/base/status.h" #include "transfer_metadata.h" @@ -76,8 +80,24 @@ class Transport { size_t transferred_bytes; }; + struct BatchDesc; struct TransferTask; + // NOTE ABOUT BatchID → BatchDesc conversion: + // + // BatchID is an opaque 64‑bit unsigned integer that carries a + // BatchDesc pointer value. For performance reasons, this helper + // reinterprets the integral handle directly as a BatchDesc + // reference. + // + // The conversion intentionally bypasses any map or lookup to + // minimize overhead on hot paths. The caller must ensure that + // the underlying BatchDesc object remains alive and valid for + // as long as the handle is in use. 
+ static inline BatchDesc &toBatchDesc(BatchID id) { + return *reinterpret_cast(id); + } + // Slice must be allocated on heap, as it will delete self on markSuccess // or markFailed. struct Slice { @@ -128,16 +148,76 @@ class Transport { public: void markSuccess() { status = Slice::SUCCESS; - __sync_fetch_and_add(&task->transferred_bytes, length); - __sync_fetch_and_add(&task->success_slice_count, 1); + __atomic_fetch_add(&task->transferred_bytes, length, + __ATOMIC_RELAXED); + __atomic_fetch_add(&task->success_slice_count, 1, __ATOMIC_RELAXED); + + check_batch_completion(false); } void markFailed() { status = Slice::FAILED; - __sync_fetch_and_add(&task->failed_slice_count, 1); + __atomic_fetch_add(&task->failed_slice_count, 1, __ATOMIC_RELAXED); + + check_batch_completion(true); } volatile int64_t ts; + + private: + inline void check_batch_completion(bool is_failed) { +#ifdef USE_EVENT_DRIVEN_COMPLETION + auto &batch_desc = toBatchDesc(task->batch_id); + if (is_failed) { + batch_desc.has_failure.store(true, std::memory_order_relaxed); + } + + // When the last slice of a task completes, check if the entire task + // is done using a single atomic counter to avoid reading + // inconsistent results. + uint64_t prev_completed = __atomic_fetch_add( + &task->completed_slice_count, 1, __ATOMIC_RELAXED); + + // Only the thread completing the final slice will see prev+1 == + // slice_count. + if (prev_completed + 1 == task->slice_count) { + __atomic_store_n(&task->is_finished, true, __ATOMIC_RELAXED); + + // Increment the number of finished tasks in the batch + // (relaxed). This counter does not itself publish data; only + // the thread that observes the last task completion performs + // the release-store on batch_desc.is_finished below. The waiter + // pairs this with an acquire load, which makes all prior writes + // (including relaxed increments) visible. 
+ // + // check if this is the last task in the batch + auto prev = batch_desc.finished_task_count.fetch_add( + 1, std::memory_order_relaxed); + + // Last task in the batch: wake up waiting thread directly + if (prev + 1 == batch_desc.batch_size) { + // Publish completion of the entire batch under the same + // mutex used by the waiter to avoid lost notifications. + // + // Keep a release-store because the reader has a fast path + // that may observe completion without taking the mutex. The + // acquire load in that fast path pairs with this release to + // make all prior updates visible. For the predicate checked + // under the mutex, relaxed would suffice since the mutex + // acquire provides the necessary visibility. + { + std::lock_guard lock( + batch_desc.completion_mutex); + batch_desc.is_finished.store(true, + std::memory_order_release); + } + // Notify after releasing the lock to avoid waking threads + // only to block again on the mutex. + batch_desc.completion_cv.notify_all(); + } + } +#endif + } }; struct ThreadLocalSliceCache { @@ -198,6 +278,10 @@ class Transport { uint64_t total_bytes = 0; BatchID batch_id = 0; +#ifdef USE_EVENT_DRIVEN_COMPLETION + volatile uint64_t completed_slice_count = 0; +#endif + // record the origin request #ifdef USE_ASCEND_HETEROGENEOUS // need to modify the request's source address, changing it from an NPU @@ -220,6 +304,18 @@ class Transport { std::vector task_list; void *context; // for transport implementers. 
int64_t start_timestamp; + +#ifdef USE_EVENT_DRIVEN_COMPLETION + // Event-driven completion: tracks batch progress and notifies waiters + std::atomic finished_task_count{0}; + std::atomic has_failure{false}; + std::atomic is_finished{ + false}; // Completion flag for wait predicate + + // Synchronization primitives for direct notification + std::mutex completion_mutex; + std::condition_variable completion_cv; +#endif }; public: diff --git a/mooncake-transfer-engine/src/config.cpp b/mooncake-transfer-engine/src/config.cpp index 9bc4b76fb..80828402d 100644 --- a/mooncake-transfer-engine/src/config.cpp +++ b/mooncake-transfer-engine/src/config.cpp @@ -14,6 +14,9 @@ #include "config.h" +#include +#include +#include #include #include @@ -278,6 +281,18 @@ void loadGlobalConfig(GlobalConfig &config) { if (std::getenv("MC_ENABLE_DEST_DEVICE_AFFINITY")) { config.enable_dest_device_affinity = true; } + + const char *endpoint_store_type_env = std::getenv("MC_ENDPOINT_STORE_TYPE"); + if (endpoint_store_type_env) { + if (strcmp(endpoint_store_type_env, "FIFO") == 0) { + config.endpoint_store_type = EndpointStoreType::FIFO; + } else if (strcmp(endpoint_store_type_env, "SIEVE") == 0) { + config.endpoint_store_type = EndpointStoreType::SIEVE; + } else { + LOG(WARNING) << "Ignore value from environment variable " + "MC_ENDPOINT_STORE_TYPE, it should be FIFO|SIEVE"; + } + } } std::string mtuLengthToString(ibv_mtu mtu) { @@ -336,4 +351,4 @@ GlobalConfig &globalConfig() { } uint16_t getDefaultHandshakePort() { return globalConfig().handshake_port; } -} // namespace mooncake \ No newline at end of file +} // namespace mooncake diff --git a/mooncake-transfer-engine/src/transport/ascend_transport/ascend_direct_transport/ascend_direct_transport.cpp b/mooncake-transfer-engine/src/transport/ascend_transport/ascend_direct_transport/ascend_direct_transport.cpp index 95157f7b1..dec4e1475 100644 --- 
a/mooncake-transfer-engine/src/transport/ascend_transport/ascend_direct_transport/ascend_direct_transport.cpp +++ b/mooncake-transfer-engine/src/transport/ascend_transport/ascend_direct_transport/ascend_direct_transport.cpp @@ -573,8 +573,14 @@ void AscendDirectTransport::processSliceList( return; } if (target_adxl_engine_name == local_adxl_engine_name_) { - VLOG(1) << "Target is local, use memory copy."; - return localCopy(slice_list[0]->opcode, slice_list); + auto start = std::chrono::steady_clock::now(); + localCopy(slice_list[0]->opcode, slice_list); + LOG(INFO) << "Local copy time: " + << std::chrono::duration_cast<std::chrono::microseconds>( + std::chrono::steady_clock::now() - start) + .count() + << "us"; + return; } int ret = checkAndConnect(target_adxl_engine_name); if (ret != 0) { @@ -608,7 +614,14 @@ .count() << " us"; } else { - LOG(ERROR) << "Transfer slice failed with status: " << status; + if (status == adxl::TIMEOUT) { + LOG(ERROR) << "Transfer timeout to: " << target_adxl_engine_name + << ", you can increase the timeout duration to reduce " + "the failure rate by configuring " + "the ASCEND_TRANSFER_TIMEOUT environment variable."; + } else { + LOG(ERROR) << "Transfer slice failed with status: " << status; + } for (auto &slice : slice_list) { slice->markFailed(); } @@ -816,7 +829,11 @@ } auto status = adxl_->Connect(target_adxl_engine_name.c_str(), connect_timeout_); - if (status != adxl::SUCCESS) { + if (status == adxl::TIMEOUT) { + LOG(ERROR) << "Connect timeout to: " << target_adxl_engine_name + << ", you can increase the timeout duration to reduce " + "the failure rate by configuring the ASCEND_CONNECT_TIMEOUT environment variable."; + } else if (status != adxl::SUCCESS) { LOG(ERROR) << "Failed to connect to target: " << target_adxl_engine_name << ", status: " << status; return -1; diff --git a/mooncake-transfer-engine/src/transport/rdma_transport/rdma_context.cpp 
b/mooncake-transfer-engine/src/transport/rdma_transport/rdma_context.cpp index 6b6e3e1b7..05aab26a2 100644 --- a/mooncake-transfer-engine/src/transport/rdma_transport/rdma_context.cpp +++ b/mooncake-transfer-engine/src/transport/rdma_transport/rdma_context.cpp @@ -62,7 +62,21 @@ RdmaContext::~RdmaContext() { int RdmaContext::construct(size_t num_cq_list, size_t num_comp_channels, uint8_t port, int gid_index, size_t max_cqe, int max_endpoints) { - endpoint_store_ = std::make_shared(max_endpoints); + // Create endpoint store based on configuration + auto &config = globalConfig(); + switch (config.endpoint_store_type) { + case EndpointStoreType::FIFO: + endpoint_store_ = + std::make_shared(max_endpoints); + LOG(INFO) << "Using FIFO endpoint store"; + break; + case EndpointStoreType::SIEVE: + default: + endpoint_store_ = + std::make_shared(max_endpoints); + LOG(INFO) << "Using SIEVE endpoint store"; + break; + } if (openRdmaDevice(device_name_, port, gid_index)) { LOG(ERROR) << "Failed to open device " << device_name_ << " on port " << port << " with GID " << gid_index; diff --git a/mooncake-transfer-engine/src/transport/tcp_transport/tcp_transport.cpp b/mooncake-transfer-engine/src/transport/tcp_transport/tcp_transport.cpp index c4a5bb556..d586823c0 100644 --- a/mooncake-transfer-engine/src/transport/tcp_transport/tcp_transport.cpp +++ b/mooncake-transfer-engine/src/transport/tcp_transport/tcp_transport.cpp @@ -146,8 +146,18 @@ struct Session : public std::enable_shared_from_this { #if defined(USE_CUDA) || defined(USE_MUSA) || defined(USE_HIP) if (isCudaMemory(addr)) { dram_buffer = new char[buffer_size]; - cudaMemcpy(dram_buffer, addr + total_transferred_bytes_, - buffer_size, cudaMemcpyDefault); + cudaError_t cuda_status = + cudaMemcpy(dram_buffer, addr + total_transferred_bytes_, + buffer_size, cudaMemcpyDefault); + if (cuda_status != cudaSuccess) { + LOG(ERROR) + << "Session::writeBody failed to copy from CUDA memory. 
" + << "Error: " << cudaGetErrorString(cuda_status); + if (on_finalize_) on_finalize_(TransferStatusEnum::FAILED); + session_mutex_.unlock(); + delete[] dram_buffer; + return; + } } #endif @@ -223,9 +233,20 @@ struct Session : public std::enable_shared_from_this { session_mutex_.unlock(); return; } + #if defined(USE_CUDA) || defined(USE_MUSA) || defined(USE_HIP) - cudaMemcpy(addr + total_transferred_bytes_, dram_buffer, - transferred_bytes, cudaMemcpyDefault); + cudaError_t cuda_status = + cudaMemcpy(addr + total_transferred_bytes_, dram_buffer, + transferred_bytes, cudaMemcpyDefault); + if (cuda_status != cudaSuccess) { + LOG(ERROR) + << "Session::readBody failed to copy to CUDA memory. " + << "Error: " << cudaGetErrorString(cuda_status); + if (on_finalize_) on_finalize_(TransferStatusEnum::FAILED); + if (is_cuda_memory) delete[] dram_buffer; + session_mutex_.unlock(); + return; + } if (is_cuda_memory) delete[] dram_buffer; #endif total_transferred_bytes_ += transferred_bytes; @@ -236,8 +257,15 @@ struct Session : public std::enable_shared_from_this { struct TcpContext { TcpContext(short port) - : acceptor(io_context, - asio::ip::tcp::endpoint(asio::ip::tcp::v4(), port)) {} + : acceptor(io_context){ + asio::ip::tcp::endpoint(asio::ip::tcp::v6(), port); + acceptor.open(endpoint.protocol()); + acceptor.set_option( + asio::ip::v6_only(false)); // support both IPv4 and IPv6 + acceptor.set_option(asio::ip::tcp::acceptor::reuse_address(true)); + acceptor.bind(endpoint); + acceptor.listen(); + } void doAccept() { acceptor.async_accept([this](asio::error_code ec, tcpsocket socket) { @@ -269,6 +297,12 @@ TcpTransport::~TcpTransport() { metadata_->removeSegmentDesc(local_server_name_); } +int TcpTransport::startHandshakeDaemon() { + return metadata_->startHandshakeDaemon(nullptr, + metadata_->localRpcMeta().rpc_port, + metadata_->localRpcMeta().sockfd); +} + int TcpTransport::install(std::string &local_server_name, std::shared_ptr meta, std::shared_ptr topo) { @@ -288,6 
+322,12 @@ int TcpTransport::install(std::string &local_server_name, return -1; } + ret = startHandshakeDaemon(); + if (ret) { + LOG(ERROR) << "TcpTransport: cannot start handshake daemon"; + return -1; + } + ret = metadata_->updateLocalSegmentDesc(); if (ret) { LOG(ERROR) << "TcpTransport: cannot publish segments, " @@ -463,9 +503,8 @@ void TcpTransport::startTransfer(Slice *slice) { slice->markFailed(); return; } - auto endpoint_iterator = - resolver.resolve(asio::ip::tcp::v4(), meta_entry.ip_or_host_name, - std::to_string(desc->tcp_data_port)); + auto endpoint_iterator = resolver.resolve(meta_entry.ip_or_host_name, + std::to_string(desc->tcp_data_port)); asio::connect(socket, endpoint_iterator); auto session = std::make_shared(std::move(socket)); session->on_finalize_ = [slice](TransferStatusEnum status) { diff --git a/mooncake-wheel/mooncake/mooncake_config.py b/mooncake-wheel/mooncake/mooncake_config.py index 7ff330b21..8a0328cdf 100644 --- a/mooncake-wheel/mooncake/mooncake_config.py +++ b/mooncake-wheel/mooncake/mooncake_config.py @@ -10,7 +10,7 @@ DEFAULT_GLOBAL_SEGMENT_SIZE = 3355443200 # 3.125 GiB DEFAULT_LOCAL_BUFFER_SIZE = 1073741824 # 1.0 GiB -def _parse_global_segment_size(value) -> int: +def _parse_segment_size(value) -> int: if isinstance(value, int): return value if isinstance(value, str): @@ -19,7 +19,7 @@ def _parse_global_segment_size(value) -> int: num = s[:-2].strip() if not num: raise ValueError( - "Invalid global_segment_size: missing number before 'gb'" + "Invalid segment size: missing number before 'gb'" ) return int(num) * 1024 * 1024 * 1024 return int(s) @@ -73,11 +73,12 @@ def from_file(file_path: str) -> 'MooncakeConfig': return MooncakeConfig( local_hostname=config.get("local_hostname"), metadata_server=config.get("metadata_server"), - global_segment_size=_parse_global_segment_size( + global_segment_size=_parse_segment_size( config.get("global_segment_size", DEFAULT_GLOBAL_SEGMENT_SIZE) ), - local_buffer_size=config.get("local_buffer_size", - 
DEFAULT_LOCAL_BUFFER_SIZE), + local_buffer_size=_parse_segment_size( + config.get("local_buffer_size", DEFAULT_LOCAL_BUFFER_SIZE) + ), protocol=config.get("protocol", "tcp"), device_name=config.get("device_name", ""), master_server_address=config.get("master_server_address"), @@ -96,13 +97,14 @@ def load_from_env() -> 'MooncakeConfig': if not os.getenv("MOONCAKE_MASTER"): raise ValueError("Neither the environment variable 'MOONCAKE_CONFIG_PATH' nor 'MOONCAKE_MASTER' is set.") return MooncakeConfig( - local_hostname=os.getenv("LOCAL_HOSTNAME", "localhost"), + local_hostname=os.getenv("MOONCAKE_LOCAL_HOSTNAME", os.getenv("LOCAL_HOSTNAME", "localhost")), metadata_server=os.getenv("MOONCAKE_TE_META_DATA_SERVER", "P2PHANDSHAKE"), - global_segment_size=_parse_global_segment_size( + global_segment_size=_parse_segment_size( os.getenv("MOONCAKE_GLOBAL_SEGMENT_SIZE", DEFAULT_GLOBAL_SEGMENT_SIZE) ), - # Zero copy interface does not need local buffer - local_buffer_size=DEFAULT_LOCAL_BUFFER_SIZE, + local_buffer_size=_parse_segment_size( + os.getenv("MOONCAKE_LOCAL_BUFFER_SIZE", DEFAULT_LOCAL_BUFFER_SIZE) + ), protocol=os.getenv("MOONCAKE_PROTOCOL", "tcp"), device_name=os.getenv("MOONCAKE_DEVICE", ""), master_server_address=os.getenv("MOONCAKE_MASTER"), diff --git a/scripts/test_tensor_api.py b/scripts/test_tensor_api.py new file mode 100644 index 000000000..aa78bf1c8 --- /dev/null +++ b/scripts/test_tensor_api.py @@ -0,0 +1,241 @@ +from dataclasses import dataclass +import json +import torch +from mooncake.store import MooncakeDistributedStore +import os +import sys +import time +import argparse +import numpy as np + +TENSOR_SIZE_MB = 32 +TOTAL_BATCH_SIZE_GB = 1 + +DEFAULT_MOONCAKE_CONFIG_PATH_ENV = "MOONCAKE_CONFIG_PATH" +DEFAULT_GLOBAL_SEGMENT_SIZE = 4 * 1024 * 1024 * 1024 # 4 GiB +DEFAULT_LOCAL_BUFFER_SIZE = 2 * 1024 * 1024 * 1024 # 2 GiB +DEFAULT_MASTER_METRICS_PORT = 9003 +DEFAULT_CHECK_SERVER = False +TENSOR_SIZE_BYTES = int(TENSOR_SIZE_MB * 1024 * 1024) +TOTAL_BATCH_SIZE_BYTES = 
int(TOTAL_BATCH_SIZE_GB * 1024 * 1024 * 1024) + +NUM_TENSORS = TOTAL_BATCH_SIZE_BYTES // TENSOR_SIZE_BYTES + +if TOTAL_BATCH_SIZE_BYTES % TENSOR_SIZE_BYTES != 0: + print(f"Error: Total batch size {TOTAL_BATCH_SIZE_GB} GB is not " + f"evenly divisible by tensor size {TENSOR_SIZE_MB} MB.", + file=sys.stderr) + sys.exit(1) + +def _parse_global_segment_size(value) -> int: + if isinstance(value, int): + return value + if isinstance(value, str): + s = value.strip().lower() + if s.endswith("gb"): + num = s[:-2].strip() + if not num: + raise ValueError( + "Invalid global_segment_size: missing number before 'gb'" + ) + return int(num) * 1024 * 1024 * 1024 + return int(s) + return int(value) + +@dataclass +class MooncakeStoreConfig: + local_hostname: str + metadata_server: str + global_segment_size: int + local_buffer_size: int + protocol: str + device_name: str + master_server_address: str + master_metrics_port: int + check_server: bool + + @staticmethod + def from_file() -> "MooncakeStoreConfig": + """Load the config from a JSON file.""" + file_path = os.getenv(DEFAULT_MOONCAKE_CONFIG_PATH_ENV) + try: + with open(file_path) as fin: + config = json.load(fin) + except Exception as e: + raise RuntimeError(f"Failed to load config from {file_path}: {str(e)}") + + return MooncakeStoreConfig( + local_hostname=config.get("local_hostname"), + metadata_server=config.get("metadata_server"), + global_segment_size=_parse_global_segment_size( + config.get("global_segment_size", DEFAULT_GLOBAL_SEGMENT_SIZE) + ), + # Zero copy interface does not need local buffer + local_buffer_size=DEFAULT_LOCAL_BUFFER_SIZE, + protocol=config.get("protocol", "tcp"), + device_name=config.get("device_name", ""), + master_server_address=config.get("master_server_address"), + master_metrics_port=config.get( + "master_metrics_port", DEFAULT_MASTER_METRICS_PORT + ), + check_server=config.get("check_server", DEFAULT_CHECK_SERVER), + ) + + @staticmethod + def load_from_env() -> "MooncakeStoreConfig": + """Load 
config from a file specified in the environment variable. + export MOONCAKE_MASTER=10.13.3.232:50051 + export MOONCAKE_PROTOCOL="rdma" + export MOONCAKE_DEVICE="" + export MOONCAKE_TE_META_DATA_SERVER="P2PHANDSHAKE" + """ + # other required environment variables... + if not os.getenv("MOONCAKE_MASTER"): + raise ValueError("The environment variable 'MOONCAKE_MASTER' is not set.") + return MooncakeStoreConfig( + local_hostname=os.getenv("LOCAL_HOSTNAME", "localhost"), + metadata_server=os.getenv("MOONCAKE_TE_META_DATA_SERVER", "P2PHANDSHAKE"), + global_segment_size=_parse_global_segment_size( + os.getenv("MOONCAKE_GLOBAL_SEGMENT_SIZE", DEFAULT_GLOBAL_SEGMENT_SIZE) + ), + # Zero copy interface does not need local buffer + local_buffer_size=DEFAULT_LOCAL_BUFFER_SIZE, + protocol=os.getenv("MOONCAKE_PROTOCOL", "tcp"), + device_name=os.getenv("MOONCAKE_DEVICE", ""), + master_server_address=os.getenv("MOONCAKE_MASTER"), + master_metrics_port=int( + os.getenv("MOONCAKE_MASTER_METRICS_PORT", DEFAULT_MASTER_METRICS_PORT) + ), + check_server=bool(os.getenv("MOONCAKE_CHECK_SERVER", DEFAULT_CHECK_SERVER)), + ) + +def run_benchmark(num_iterations): + store = MooncakeDistributedStore() + + print("--- Mooncake Tensor Performance Benchmark ---") + print(f"Configuration:") + print(f" Tensor Size: {TENSOR_SIZE_MB} MB") + print(f" Total Batch Size: {TOTAL_BATCH_SIZE_GB} GB") + print(f" Tensors per Batch: {NUM_TENSORS} (1024MB / 32MB)") + print(f" Iterations: {num_iterations}") + + try: + config = MooncakeStoreConfig.load_from_env() + print(f" Hostname: {config.local_hostname}") + print(f" Metadata Server: {config.metadata_server}") + print(f" Master Address: {config.master_server_address}") + print(f" Protocol: {config.protocol}") + + rc = store.setup( + config.local_hostname, + config.metadata_server, + config.global_segment_size, + config.local_buffer_size, + config.protocol, + config.device_name, + config.master_server_address, + ) + if rc != 0: + print(f"Failed to setup mooncake 
store, error code: {rc}", file=sys.stderr) + sys.exit(1) + print("\nMooncake store setup successful.") + + print("Preparing test data (this may take a moment)...") + elements_per_tensor = TENSOR_SIZE_BYTES // 4 + + tensors_list = [ + torch.randn(elements_per_tensor, dtype=torch.float32) + for _ in range(NUM_TENSORS) + ] + keys_list = [f"perf_tensor_{i}" for i in range(NUM_TENSORS)] + print(f"Data prepared: {NUM_TENSORS} tensors, {TENSOR_SIZE_MB} MB each.") + + # ---------------------------------------- + # Test 1: batch_put_tensor + # ---------------------------------------- + print(f"\n--- Benchmarking batch_put_tensor ({num_iterations} iterations) ---") + put_times = [] + for i in range(num_iterations): + store.remove_all() + + start_time = time.perf_counter() + results = store.batch_put_tensor(keys_list, tensors_list) + end_time = time.perf_counter() + + if not all(r == 0 for r in results): + print(f" Iteration {i+1}: FAILED (rc={results})", file=sys.stderr) + continue + + elapsed_time = end_time - start_time + put_times.append(elapsed_time) + + # (total_bytes * 8 bits/byte) / (time * 1024^3 Giga) = Gbps + throughput_gbps = (TOTAL_BATCH_SIZE_BYTES * 8) / (elapsed_time * (1024**3)) + print(f" Iteration {i+1}: {elapsed_time:.4f} s ({throughput_gbps:.2f} Gbps)") + + if put_times: + avg_put_time = np.mean(put_times) + avg_put_throughput = (TOTAL_BATCH_SIZE_BYTES * 8) / (avg_put_time * (1024**3)) + print(f"Average PUT Time: {avg_put_time:.4f} s") + print(f"Average PUT Throughput: {avg_put_throughput:.2f} Gbps") + else: + print("PUT test failed to complete.") + + # ---------------------------------------- + # Test 2: batch_get_tensor + # ---------------------------------------- + print(f"\n--- Benchmarking batch_get_tensor ({num_iterations} iterations) ---") + + print(" (Pre-populating data for GET test...)") + store.remove_all() + rc = store.batch_put_tensor(keys_list, tensors_list) + if not all(r == 0 for r in rc): + print(" Failed to pre-populate data for GET 
test!", file=sys.stderr) + sys.exit(1) + + get_times = [] + for i in range(num_iterations): + start_time = time.perf_counter() + retrieved_tensors = store.batch_get_tensor(keys_list) + end_time = time.perf_counter() + + if len(retrieved_tensors) != NUM_TENSORS or retrieved_tensors[0] is None: + print(f" Iteration {i+1}: FAILED (Data not retrieved correctly)", file=sys.stderr) + continue + + elapsed_time = end_time - start_time + get_times.append(elapsed_time) + + throughput_gbps = (TOTAL_BATCH_SIZE_BYTES * 8) / (elapsed_time * (1024**3)) + print(f" Iteration {i+1}: {elapsed_time:.4f} s ({throughput_gbps:.2f} Gbps)") + + if get_times: + avg_get_time = np.mean(get_times) + avg_get_throughput = (TOTAL_BATCH_SIZE_BYTES * 8) / (avg_get_time * (1024**3)) + print(f"Average GET Time: {avg_get_time:.4f} s") + print(f"Average GET Throughput: {avg_get_throughput:.2f} Gbps") + else: + print("GET test failed to complete.") + + print("\n✅ Benchmark finished.") + + except Exception as e: + print(f"\n❌ An error occurred: {e}", file=sys.stderr) + sys.exit(1) + finally: + print("Cleaning up and closing store...") + store.remove_all() + store.close() + print("Store closed.") + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="Mooncake Tensor API Performance Benchmark") + parser.add_argument( + "-n", "--iterations", + type=int, + default=5, + help="Number of iterations for each test (default: 5)" + ) + args = parser.parse_args() + + run_benchmark(args.iterations) \ No newline at end of file