DeepAuto-AI · gmlwns2000 · Jun 30, 2025 · Jul 1, 2025 · Jul 4, 2025
diff --git a/python/sglang/srt/mem_cache/radix_cache.py b/python/sglang/srt/mem_cache/radix_cache.py
@@ -21,6 +21,7 @@
 
 import heapq
 import time
+import warnings
 from collections import defaultdict
 from functools import partial
 from typing import TYPE_CHECKING, List, Optional, Tuple

diff --git a/python/sglang/srt/models/deepseek_v2.py b/python/sglang/srt/models/deepseek_v2.py
@@ -16,6 +16,7 @@
 # https://github.com/vllm-project/vllm/blob/fb6af8bc086328ca6659e72d11ffd4309ce4de22/vllm/model_executor/models/deepseek_v2.py
 """Inference-only DeepseekV2 model."""
 
+import copy
 import logging
 import os
 from dataclasses import dataclass
@@ -1093,6 +1094,9 @@ def forward_normal_from_cache(
         forward_batch.token_to_kv_pool.set_kv_buffer(
             self.attn_mha, forward_batch.out_cache_loc, latent_cache, None
         )
+
+        k_current = k
+        v_current = v
 
         # Fetch latent cache from memory pool with precomputed chunked kv indices
         latent_cache_buf = forward_batch.token_to_kv_pool.get_key_buffer(
@@ -1110,11 +1114,13 @@ def forward_normal_from_cache(
             chunk_len = forward_batch.extend_seq_lens_cpu[ibatch]
 
             q_chunk = q[acc_chunk_len : acc_chunk_len + chunk_len][None, ...]
+            k_chunk = k_current[acc_chunk_len : acc_chunk_len + chunk_len][None, ...]
+            v_chunk = v_current[acc_chunk_len : acc_chunk_len + chunk_len][None, ...]
 
             acc_chunk_len += chunk_len
 
             latent_cache = latent_cache_buf[
-                block_table[ibatch : ibatch + 1, : prefix_len + chunk_len]
+                block_table[ibatch : ibatch + 1, : prefix_len]
             ]
 
             kv_a_normed, k_pe = latent_cache.split(
@@ -1128,7 +1134,7 @@ def forward_normal_from_cache(
             v = kv[..., self.qk_nope_head_dim :]
             k_nope = kv[..., : self.qk_nope_head_dim]
 
-            k = torch.empty(
+            k = torch.zeros(
                 (
                     k_nope.shape[0],
                     self.num_local_heads,
@@ -1139,8 +1145,34 @@ def forward_normal_from_cache(
             )
             k[..., : self.qk_nope_head_dim] = k_nope
             k[..., self.qk_nope_head_dim :] = k_pe
+
+            # k = k[:-k_chunk.shape[1]]
+            # v = v[:-k_chunk.shape[1]]
+
+            k = torch.cat([k, k_chunk[0]], dim=0)
+            v = torch.cat([v, v_chunk[0]], dim=0)
+
+            current_forward_batch = copy.copy(forward_batch)
+            current_forward_batch.batch_size = 1
+            current_forward_batch.req_pool_indices = forward_batch.req_pool_indices[ibatch:ibatch+1]
+            current_forward_batch.extend_seq_lens = forward_batch.extend_seq_lens[ibatch: ibatch+1]
+            current_forward_batch.extend_seq_lens_cpu = forward_batch.extend_seq_lens_cpu[ibatch: ibatch+1]
+            current_forward_batch.positions = forward_batch.positions[acc_chunk_len:acc_chunk_len + chunk_len]
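+            # Reference: cache_loc would be `forward_batch.out_cache_loc` unless the layer is
+            # cross-attention (then `forward_batch.encoder_out_cache_loc`); only the
+            # non-cross-attention case is handled, which the assert below enforces.
+            # NOTE(review): acc_chunk_len was advanced before these slices (one chunk late?),
+            # and `forward_batch`, not `current_forward_batch`, is passed to attn_mha -- confirm.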
+            assert not self.attn_mha.is_cross_attention
+            current_forward_batch.out_cache_loc = forward_batch.out_cache_loc[acc_chunk_len:acc_chunk_len + chunk_len]
 
-            output = self.attn_mha(q_chunk, k, v, forward_batch, save_kv_cache=False)
+            output = self.attn_mha(
+                q_chunk, 
+                k, 
+                v, 
+                forward_batch, 
+                save_kv_cache=False
+            )
 
             outputs.append(output)
         attn_output = torch.cat(outputs, dim=0)

diff --git a/python/sglang/srt/server_args.py b/python/sglang/srt/server_args.py
@@ -1366,16 +1366,14 @@ def from_cli_args(cls, args: argparse.Namespace):
 
         if args.enable_hip_attention:
             from hip_attn.v1_2 import HiPAttentionConfig
-            
+
             if args.hip_attention_config_path is not None:
                 json_or_path = args.hip_attention_config_path
             else:
-                assert hasattr(args, 'hip_attention_config')
+                assert hasattr(args, "hip_attention_config")
                 json_or_path = args.hip_attention_config
 
-            args.hip_attention_config = HiPAttentionConfig(
-                json_or_path=json_or_path
-            )
+            args.hip_attention_config = HiPAttentionConfig(json_or_path=json_or_path)
             logger.info(
                 f"attention_backend changed {args.attention_backend} -> hip_attention"
             )