docs/source/python-api-reference/mooncake-store.md (+85 −0)

@@ -1004,6 +1004,91 @@ def batch_get_tensor_with_tp(self, base_keys: List[str], tp_rank: int = 0, tp_si

---

### PyTorch Tensor Operations (Zero Copy)

These methods provide direct support for storing and retrieving PyTorch tensors. They handle serialization and metadata automatically, and include built-in support for **Tensor Parallelism (TP)** by splitting and reconstructing tensor shards.

⚠️ **Note**: These methods require `torch` to be installed and available in the environment.

#### get_tensor_into()

Get a PyTorch tensor from the store directly into a pre-allocated buffer.

```python
def get_tensor_with_tp(self, key: str, buffer_ptr: int, size: int) -> torch.Tensor
```

**Parameters:**

- `key` (str): Base identifier of the tensor.
- `buffer_ptr` (int): Pointer to the pre-allocated destination buffer; the buffer must be registered.
- `size` (int): The size of the buffer, in bytes.

**Returns:**

- `torch.Tensor`: The retrieved tensor (or shard). Returns `None` if not found.
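
Below is a minimal usage sketch. It assumes the method is exposed as `get_tensor_into` (matching this section title), that `store` is an already-initialized store client, and that buffers are registered via a `register_buffer(ptr, size)` call; these setup names are assumptions for illustration, not confirmed API.

```python
import torch

# Assumed setup: `store` is an already-initialized store client (construction
# elided), and register_buffer is an assumed name for the registration call.
buf = torch.empty(1 << 20, dtype=torch.uint8)  # 1 MiB destination buffer
size = buf.numel() * buf.element_size()
store.register_buffer(buf.data_ptr(), size)

# Retrieve the stored tensor directly into the registered buffer (zero copy).
t = store.get_tensor_into("my_tensor", buf.data_ptr(), size)
if t is None:
    print("key 'my_tensor' not found")
```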

Comment on lines +1013 to +1030

⚠️ Potential issue | 🟡 Minor

Documentation has incorrect function signatures.

The section title is `get_tensor_into()` but the code block shows `get_tensor_with_tp`. Similarly, the next section `batch_get_tensor()` shows `batch_get_tensor_with_tp` in its signature. These appear to be copy-paste errors.

🔎 Apply this diff to fix the signatures:
```diff
 #### get_tensor_into()
 
 Get a PyTorch tensor from the store directly into a pre-allocated buffer.
 
 ```python
-def get_tensor_with_tp(self, key: str, buffer_ptr: int, size: int) -> torch.Tensor
+def get_tensor_into(self, key: str, buffer_ptr: int, size: int) -> torch.Tensor
```

```diff
-#### batch_get_tensor()
+#### batch_get_tensor_into()
 
 Get a batch of PyTorch tensors from the store directly into pre-allocated buffers.
 
 ```python
-def batch_get_tensor_with_tp(self, base_keys: List[str], buffer_ptrs: List[int], sizes: List[int]) -> List[torch.Tensor]
+def batch_get_tensor_into(self, keys: List[str], buffer_ptrs: List[int], sizes: List[int]) -> List[torch.Tensor]
```

> Committable suggestion skipped: line range outside the PR's diff.


#### batch_get_tensor()

Get a batch of PyTorch tensors from the store directly into pre-allocated buffers.

```python
def batch_get_tensor_with_tp(self, base_keys: List[str], buffer_ptrs: List[int], sizes: List[int]) -> List[torch.Tensor]
```

**Parameters:**

- `base_keys` (List[str]): List of base identifiers.
- `buffer_ptrs` (List[int]): List of pointers to the pre-allocated destination buffers; the buffers must be registered.
- `sizes` (List[int]): List of buffer sizes, in bytes.

**Returns:**

- `List[torch.Tensor]`: List of retrieved tensors (or shards). Contains `None` for missing keys.
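
A batch sketch under the same assumptions (`store` and `register_buffer` as above); the method name follows the zero-copy rename suggested in the review comment above, so adjust it if the actual binding differs.

```python
import torch

keys = ["w1", "w2", "w3"]
bufs = [torch.empty(1 << 20, dtype=torch.uint8) for _ in keys]
sizes = [b.numel() * b.element_size() for b in bufs]
for b, s in zip(bufs, sizes):
    store.register_buffer(b.data_ptr(), s)  # assumed registration API

# One pre-registered destination buffer per key; missing keys come back None.
tensors = store.batch_get_tensor_into(
    keys, [b.data_ptr() for b in bufs], sizes
)
missing = [k for k, t in zip(keys, tensors) if t is None]
```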

#### get_tensor_into_with_tp()

Get a PyTorch tensor from the store, specifically retrieving the shard corresponding to the given Tensor Parallel rank, directly into the pre-allocated buffer.

```python
def get_tensor_with_tp(self, key: str, buffer_ptr: int, size: int, tp_rank: int = 0, tp_size: int = 1, split_dim: int = 0) -> torch.Tensor
```

**Parameters:**

- `key` (str): Base identifier of the tensor.
- `buffer_ptr` (int): Pointer to the pre-allocated destination buffer; the buffer must be registered.
- `size` (int): The size of the buffer, in bytes.
- `tp_rank` (int): The tensor parallel rank to retrieve (default: 0). Fetches key `key_tp_{rank}` if `tp_size > 1`.
- `tp_size` (int): Total tensor parallel size (default: 1).
- `split_dim` (int): The dimension used during splitting (default: 0).

**Returns:**

- `torch.Tensor`: The retrieved tensor (or shard). Returns `None` if not found.
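
A sketch of rank-local shard loading under the same assumed setup (`store`, `register_buffer`); it presumes the shards were previously stored with the matching TP put method so that keys like `model.w_tp_1` exist.

```python
import torch

tp_rank, tp_size = 1, 4  # this process loads shard 1 of a 4-way split
shard_buf = torch.empty(1 << 20, dtype=torch.uint8)
shard_size = shard_buf.numel() * shard_buf.element_size()
store.register_buffer(shard_buf.data_ptr(), shard_size)  # assumed API

# With tp_size > 1 this fetches the shard stored under "model.w_tp_1"
# directly into the registered buffer.
shard = store.get_tensor_into_with_tp(
    "model.w",
    shard_buf.data_ptr(),
    shard_size,
    tp_rank=tp_rank,
    tp_size=tp_size,
    split_dim=0,
)
```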

Comment on lines +1049 to +1069

⚠️ Potential issue | 🟡 Minor

Function signature mismatch for `get_tensor_into_with_tp`.

The code block shows `get_tensor_with_tp` but should be `get_tensor_into_with_tp` to match the section title.

🔎 Apply this diff:

```diff
 ```python
-def get_tensor_with_tp(self, key: str, buffer_ptr: int, size: int, tp_rank: int = 0, tp_size: int = 1, split_dim: int = 0) -> torch.Tensor
+def get_tensor_into_with_tp(self, key: str, buffer_ptr: int, size: int, tp_rank: int = 0, tp_size: int = 1, split_dim: int = 0) -> torch.Tensor
```

#### batch_get_tensor_with_tp()

Get a batch of PyTorch tensor shards from the store for a given Tensor Parallel rank, directly into pre-allocated buffers.

```python
def batch_get_tensor_with_tp(self, base_keys: List[str], buffer_ptrs: List[int], sizes: List[int], tp_rank: int = 0, tp_size: int = 1) -> List[torch.Tensor]
```

**Parameters:**

- `base_keys` (List[str]): List of base identifiers.
- `buffer_ptrs` (List[int]): List of pointers to the pre-allocated destination buffers; the buffers must be registered.
- `sizes` (List[int]): List of buffer sizes, in bytes.
- `tp_rank` (int): The tensor parallel rank to retrieve (default: 0).
- `tp_size` (int): Total tensor parallel size (default: 1).

**Returns:**

- `List[torch.Tensor]`: List of retrieved tensors (or shards). Contains `None` for missing keys.
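
A compact batch variant under the same assumptions; the call below uses this section's signature as written, though the review comment below suggests `batch_get_tensor_into_with_tp` as the zero-copy name.

```python
import torch

base_keys = [f"layer{i}.weight" for i in range(4)]
bufs = [torch.empty(1 << 20, dtype=torch.uint8) for _ in base_keys]
sizes = [b.numel() * b.element_size() for b in bufs]
for b, s in zip(bufs, sizes):
    store.register_buffer(b.data_ptr(), s)  # assumed registration API

# Fetch this rank's shard of every base key in one call; entries are None
# for keys that are missing from the store.
shards = store.batch_get_tensor_with_tp(
    base_keys,
    [b.data_ptr() for b in bufs],
    sizes,
    tp_rank=0,
    tp_size=2,
)
```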

Comment on lines +1070 to +1089

⚠️ Potential issue | 🟡 Minor

Section title should be `batch_get_tensor_into_with_tp` for the zero-copy API.

This section is under "PyTorch Tensor Operations (Zero Copy)" but uses `batch_get_tensor_with_tp`, which is the non-zero-copy variant. Based on the test file patterns, the zero-copy variant should be `batch_get_tensor_into_with_tp`.

🔎 Apply this diff:

```diff
-#### batch_get_tensor_with_tp()
+#### batch_get_tensor_into_with_tp()
 
 Get a batch of PyTorch tensor shards from the store for a given Tensor Parallel rank, directly into pre-allocated buffers.
 
 ```python
-def batch_get_tensor_with_tp(self, base_keys: List[str], buffer_ptrs: List[int], sizes: List[int], tp_rank: int = 0, tp_size: int = 1) -> List[torch.Tensor]
+def batch_get_tensor_into_with_tp(self, base_keys: List[str], buffer_ptrs: List[int], sizes: List[int], tp_rank: int = 0, tp_size: int = 1) -> List[torch.Tensor]
```

---

### Batch Zero-Copy Operations

#### batch_put_from()
mooncake-integration/integration_utils.h (+29 −0)

@@ -62,6 +62,35 @@ static const std::array<ArrayCreatorFunc, 15> array_creators = {{
create_typed_array<uint8_t>, // FLOAT8_E5M2 = 14 (using uint8_t as storage)
}};

// Wraps existing memory as a NumPy array without copying: passing py::none()
// as the base handle makes pybind11 reference the buffer instead of cloning
// it, so the caller must keep the underlying allocation alive.
template <typename T>
py::array create_typed_array_view(char *data_ptr, size_t offset,
                                  size_t total_length) {
    return py::array_t<T>({static_cast<ssize_t>(total_length / sizeof(T))},
                          (T *)(data_ptr + offset), py::none());
}

static const std::array<ArrayCreatorFunc, 16> array_creators_view = {{
create_typed_array_view<float>, // FLOAT32 = 0
create_typed_array_view<double>, // FLOAT64 = 1
create_typed_array_view<int8_t>, // INT8 = 2
create_typed_array_view<uint8_t>, // UINT8 = 3
create_typed_array_view<int16_t>, // INT16 = 4
create_typed_array_view<uint16_t>, // UINT16 = 5
create_typed_array_view<int32_t>, // INT32 = 6
create_typed_array_view<uint32_t>, // UINT32 = 7
create_typed_array_view<int64_t>, // INT64 = 8
create_typed_array_view<uint64_t>, // UINT64 = 9
create_typed_array_view<bool>, // BOOL = 10
create_typed_array_view<uint16_t>, // FLOAT16 = 11 (using uint16_t as
// storage)
create_typed_array_view<uint16_t>, // BFLOAT16 = 12 (using uint16_t as
// storage)
create_typed_array_view<uint8_t>, // FLOAT8_E4M3 = 13 (using uint8_t as
// storage)
create_typed_array_view<uint8_t>, // FLOAT8_E5M2 = 14 (using uint8_t as
// storage)
}};
Comment on lines +72 to +92

⚠️ Potential issue | 🟡 Minor

Size mismatch: `array_creators_view` is declared with size 16 but `array_creators` has size 15.

Both arrays should map the same `TensorDtype` enum values (0–14), yet `array_creators_view` is declared with size 16 while `array_creators` uses size 15. This inconsistency could cause out-of-bounds access or confusion when indexing by dtype.

🔎 Apply this diff to fix the size:

```diff
-static const std::array<ArrayCreatorFunc, 16> array_creators_view = {{
+static const std::array<ArrayCreatorFunc, 15> array_creators_view = {{
     create_typed_array_view<float>,     // FLOAT32 = 0
     create_typed_array_view<double>,    // FLOAT64 = 1
     create_typed_array_view<int8_t>,    // INT8 = 2
     create_typed_array_view<uint8_t>,   // UINT8 = 3
     create_typed_array_view<int16_t>,   // INT16 = 4
     create_typed_array_view<uint16_t>,  // UINT16 = 5
     create_typed_array_view<int32_t>,   // INT32 = 6
     create_typed_array_view<uint32_t>,  // UINT32 = 7
     create_typed_array_view<int64_t>,   // INT64 = 8
     create_typed_array_view<uint64_t>,  // UINT64 = 9
     create_typed_array_view<bool>,      // BOOL = 10
     create_typed_array_view<uint16_t>,  // FLOAT16 = 11 (using uint16_t as
                                         // storage)
     create_typed_array_view<uint16_t>,  // BFLOAT16 = 12 (using uint16_t as
                                         // storage)
     create_typed_array_view<uint8_t>,   // FLOAT8_E4M3 = 13 (using uint8_t as
                                         // storage)
     create_typed_array_view<uint8_t>,   // FLOAT8_E5M2 = 14 (using uint8_t as
                                         // storage)
 }};
```


inline TensorDtype get_tensor_dtype(py::object dtype_obj) {
if (dtype_obj.is_none()) {
return TensorDtype::UNKNOWN;