Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
140 changes: 103 additions & 37 deletions docker/patch/latest/sglang.patch
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
diff --git a/python/sglang/srt/disaggregation/decode.py b/python/sglang/srt/disaggregation/decode.py
index 199885244..742ad0639 100644
index 1998852..742ad06 100644
Copy link

Copilot AI Jan 13, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The git index hashes have been shortened from 9 characters to 7 characters throughout the patch file (e.g., "199885244..742ad0639" changed to "1998852..742ad06"). 7-character abbreviated hashes are typically sufficient, but the abbreviation length depends on the settings of the Git that generated the patch (`core.abbrev` / `--abbrev`), so regenerating the patch in a different environment changes every `index` line and could cause issues if tools expect a specific format. Ensure that all tools consuming this patch file can handle the shorter hash format.

Suggested change
index 1998852..742ad06 100644
index 199885244..742ad0639 100644

Copilot uses AI. Check for mistakes.
--- a/python/sglang/srt/disaggregation/decode.py
+++ b/python/sglang/srt/disaggregation/decode.py
@@ -314,6 +314,13 @@ class DecodePreallocQueue:
Expand All @@ -17,7 +17,7 @@ index 199885244..742ad0639 100644
"""Add a request to the pending queue."""
if self._check_if_req_exceed_kv_capacity(req):
diff --git a/python/sglang/srt/disaggregation/mooncake/conn.py b/python/sglang/srt/disaggregation/mooncake/conn.py
index 32e8c0b69..df913da7b 100644
index 32e8c0b..df913da 100644
--- a/python/sglang/srt/disaggregation/mooncake/conn.py
+++ b/python/sglang/srt/disaggregation/mooncake/conn.py
@@ -1079,6 +1079,19 @@ class MooncakeKVManager(CommonKVManager):
Expand All @@ -41,7 +41,7 @@ index 32e8c0b69..df913da7b 100644
class MooncakeKVSender(CommonKVSender):

diff --git a/python/sglang/srt/disaggregation/prefill.py b/python/sglang/srt/disaggregation/prefill.py
index ac11013f8..478e469f6 100644
index ac11013..478e469 100644
--- a/python/sglang/srt/disaggregation/prefill.py
+++ b/python/sglang/srt/disaggregation/prefill.py
@@ -309,6 +309,13 @@ class PrefillBootstrapQueue:
Expand All @@ -59,7 +59,7 @@ index ac11013f8..478e469f6 100644
class SchedulerDisaggregationPrefillMixin:
"""
diff --git a/python/sglang/srt/distributed/parallel_state.py b/python/sglang/srt/distributed/parallel_state.py
index 0478526ef..cfb1aa669 100644
index 0478526..cfb1aa6 100644
--- a/python/sglang/srt/distributed/parallel_state.py
+++ b/python/sglang/srt/distributed/parallel_state.py
@@ -1797,7 +1797,10 @@ def get_tensor_model_parallel_world_size():
Expand All @@ -75,7 +75,7 @@ index 0478526ef..cfb1aa669 100644

def get_pipeline_model_parallel_world_size():
diff --git a/python/sglang/srt/entrypoints/engine.py b/python/sglang/srt/entrypoints/engine.py
index 21909706b..8fac5f162 100644
index 2190970..8fac5f1 100644
--- a/python/sglang/srt/entrypoints/engine.py
+++ b/python/sglang/srt/entrypoints/engine.py
@@ -49,6 +49,7 @@ from sglang.srt.managers.io_struct import (
Expand Down Expand Up @@ -112,7 +112,7 @@ index 21909706b..8fac5f162 100644
"""Get weights by parameter name."""
obj = GetWeightsByNameReqInput(name=name, truncate_size=truncate_size)
diff --git a/python/sglang/srt/entrypoints/http_server.py b/python/sglang/srt/entrypoints/http_server.py
index 88705cc35..c8dc052f1 100644
index 88705cc..c8dc052 100644
--- a/python/sglang/srt/entrypoints/http_server.py
+++ b/python/sglang/srt/entrypoints/http_server.py
@@ -107,6 +107,7 @@ from sglang.srt.managers.io_struct import (
Expand Down Expand Up @@ -146,7 +146,7 @@ index 88705cc35..c8dc052f1 100644
@app.post("/update_weight_version")
async def update_weight_version(obj: UpdateWeightVersionReqInput, request: Request):
diff --git a/python/sglang/srt/layers/attention/nsa/nsa_indexer.py b/python/sglang/srt/layers/attention/nsa/nsa_indexer.py
index c9e82e4b1..58270e34a 100644
index c9e82e4..58270e3 100644
--- a/python/sglang/srt/layers/attention/nsa/nsa_indexer.py
+++ b/python/sglang/srt/layers/attention/nsa/nsa_indexer.py
@@ -3,6 +3,7 @@ from __future__ import annotations
Expand Down Expand Up @@ -190,7 +190,7 @@ index c9e82e4b1..58270e34a 100644
if enable_dual_stream:
current_stream = torch.cuda.current_stream()
diff --git a/python/sglang/srt/layers/layernorm.py b/python/sglang/srt/layers/layernorm.py
index b07164c53..8e6722ce0 100644
index b07164c..8e6722c 100644
--- a/python/sglang/srt/layers/layernorm.py
+++ b/python/sglang/srt/layers/layernorm.py
@@ -83,15 +83,12 @@ class RMSNorm(MultiPlatformOp):
Expand Down Expand Up @@ -249,7 +249,7 @@ index b07164c53..8e6722ce0 100644
hidden_size = x.shape[-1]
if hidden_size != self.hidden_size:
diff --git a/python/sglang/srt/layers/logits_processor.py b/python/sglang/srt/layers/logits_processor.py
index fa7431048..cd33ea735 100644
index fa74310..cd33ea7 100644
--- a/python/sglang/srt/layers/logits_processor.py
+++ b/python/sglang/srt/layers/logits_processor.py
@@ -878,11 +878,6 @@ class LogitsProcessor(nn.Module):
Expand All @@ -265,7 +265,7 @@ index fa7431048..cd33ea735 100644
logits = torch.matmul(
hidden_states.to(lm_head.weight.dtype), lm_head.weight.T
diff --git a/python/sglang/srt/layers/moe/fused_moe_triton/fused_moe.py b/python/sglang/srt/layers/moe/fused_moe_triton/fused_moe.py
index a1885fade..14d692365 100644
index a1885fa..14d6923 100644
--- a/python/sglang/srt/layers/moe/fused_moe_triton/fused_moe.py
+++ b/python/sglang/srt/layers/moe/fused_moe_triton/fused_moe.py
@@ -14,6 +14,7 @@ import torch.nn.functional as F
Expand All @@ -289,7 +289,7 @@ index a1885fade..14d692365 100644
intermediate_cache3.view(*intermediate_cache3.shape),
out_hidden_states[begin_chunk_idx:end_chunk_idx],
diff --git a/python/sglang/srt/layers/moe/routed_experts_capturer.py b/python/sglang/srt/layers/moe/routed_experts_capturer.py
index 00bd68755..5a3ca8a67 100644
index 00bd687..5a3ca8a 100644
--- a/python/sglang/srt/layers/moe/routed_experts_capturer.py
+++ b/python/sglang/srt/layers/moe/routed_experts_capturer.py
@@ -1,5 +1,6 @@
Expand Down Expand Up @@ -360,7 +360,7 @@ index 00bd68755..5a3ca8a67 100644

def get_routed_experts(
diff --git a/python/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py b/python/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py
index c5e5a11fc..6b788fb1d 100644
index c5e5a11..6b788fb 100644
--- a/python/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py
+++ b/python/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py
@@ -1016,13 +1016,38 @@ class CompressedTensorsWNA16MoEMethod(CompressedTensorsMoEMethod):
Expand Down Expand Up @@ -456,7 +456,7 @@ index c5e5a11fc..6b788fb1d 100644
def create_moe_runner(
self, layer: torch.nn.Module, moe_runner_config: MoeRunnerConfig
diff --git a/python/sglang/srt/layers/rotary_embedding.py b/python/sglang/srt/layers/rotary_embedding.py
index 56516b41b..cb2ebca60 100644
index 56516b4..cb2ebca 100644
--- a/python/sglang/srt/layers/rotary_embedding.py
+++ b/python/sglang/srt/layers/rotary_embedding.py
@@ -135,9 +135,7 @@ class RotaryEmbedding(MultiPlatformOp):
Expand All @@ -481,7 +481,7 @@ index 56516b41b..cb2ebca60 100644
assert (
fused_set_kv_buffer_arg is None
diff --git a/python/sglang/srt/layers/sampler.py b/python/sglang/srt/layers/sampler.py
index 55bef5652..35ad68b1c 100644
index 55bef56..35ad68b 100644
--- a/python/sglang/srt/layers/sampler.py
+++ b/python/sglang/srt/layers/sampler.py
@@ -108,16 +108,11 @@ class Sampler(nn.Module):
Expand All @@ -505,7 +505,7 @@ index 55bef5652..35ad68b1c 100644
if not get_global_server_args().sampling_backend == "ascend" or (
return_logprob and not SGLANG_RETURN_ORIGINAL_LOGPROB
diff --git a/python/sglang/srt/managers/io_struct.py b/python/sglang/srt/managers/io_struct.py
index 879e1bfa6..de52085fa 100644
index 879e1bf..de52085 100644
--- a/python/sglang/srt/managers/io_struct.py
+++ b/python/sglang/srt/managers/io_struct.py
@@ -1286,6 +1286,19 @@ class UpdateWeightsFromIPCReqOutput(BaseReq):
Expand All @@ -529,7 +529,7 @@ index 879e1bfa6..de52085fa 100644
@dataclass
class InitWeightsSendGroupForRemoteInstanceReqOutput(BaseReq):
diff --git a/python/sglang/srt/managers/schedule_batch.py b/python/sglang/srt/managers/schedule_batch.py
index 468d8fb8a..229a9a2dc 100644
index 468d8fb..229a9a2 100644
--- a/python/sglang/srt/managers/schedule_batch.py
+++ b/python/sglang/srt/managers/schedule_batch.py
@@ -2181,7 +2181,8 @@ class ScheduleBatch(ScheduleBatchDisaggregationDecodeMixin):
Expand All @@ -543,7 +543,7 @@ index 468d8fb8a..229a9a2dc 100644


diff --git a/python/sglang/srt/managers/scheduler.py b/python/sglang/srt/managers/scheduler.py
index bca1c31e6..0c82e37a4 100644
index bca1c31..0c82e37 100644
--- a/python/sglang/srt/managers/scheduler.py
+++ b/python/sglang/srt/managers/scheduler.py
@@ -97,6 +97,7 @@ from sglang.srt.managers.io_struct import (
Expand All @@ -563,7 +563,7 @@ index bca1c31e6..0c82e37a4 100644
(ReleaseMemoryOccupationReqInput, self.release_memory_occupation),
(ResumeMemoryOccupationReqInput, self.resume_memory_occupation),
diff --git a/python/sglang/srt/managers/scheduler_output_processor_mixin.py b/python/sglang/srt/managers/scheduler_output_processor_mixin.py
index e40586c24..32d98aee4 100644
index e40586c..32d98ae 100644
--- a/python/sglang/srt/managers/scheduler_output_processor_mixin.py
+++ b/python/sglang/srt/managers/scheduler_output_processor_mixin.py
@@ -10,6 +10,7 @@ from sglang.srt.disaggregation.utils import DisaggregationMode
Expand All @@ -575,7 +575,7 @@ index e40586c24..32d98aee4 100644
AbortReq,
BatchEmbeddingOutput,
diff --git a/python/sglang/srt/managers/scheduler_update_weights_mixin.py b/python/sglang/srt/managers/scheduler_update_weights_mixin.py
index 293a84350..68911c433 100644
index 293a843..d0404db 100644
--- a/python/sglang/srt/managers/scheduler_update_weights_mixin.py
+++ b/python/sglang/srt/managers/scheduler_update_weights_mixin.py
@@ -1,6 +1,7 @@
Expand Down Expand Up @@ -617,7 +617,7 @@ index 293a84350..68911c433 100644

def get_weights_by_name(self: Scheduler, recv_req: GetWeightsByNameReqInput):
parameter = self.tp_worker.get_weights_by_name(recv_req)
@@ -137,6 +148,13 @@ class SchedulerUpdateWeightsMixin:
@@ -137,11 +148,19 @@ class SchedulerUpdateWeightsMixin:
self.memory_saver_adapter.pause(GPU_MEMORY_TYPE_KV_CACHE)
self.flush_cache()

Expand All @@ -631,7 +631,18 @@ index 293a84350..68911c433 100644
if GPU_MEMORY_TYPE_WEIGHTS in tags:
self.stashed_model_static_state = _export_static_state(
self.tp_worker.model_runner.model
@@ -177,6 +195,13 @@ class SchedulerUpdateWeightsMixin:
)
torch.distributed.barrier(self.tp_cpu_group)
+ self.tp_worker.model_runner.remote_instance_unregister_memory_region()
self.memory_saver_adapter.pause(GPU_MEMORY_TYPE_WEIGHTS)

if GPU_MEMORY_TYPE_CUDA_GRAPH in tags:
@@ -173,10 +192,18 @@ class SchedulerUpdateWeightsMixin:
self.stashed_model_static_state,
)
del self.stashed_model_static_state
+ self.tp_worker.model_runner.remote_instance_register_memory_region()
Comment on lines +635 to +644
Copy link

Copilot AI Jan 13, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The barrier synchronization and the memory region unregistration/registration calls need to be carefully ordered. In the current implementation, the pause path calls unregister immediately after `torch.distributed.barrier(self.tp_cpu_group)`, and the resume path calls register immediately after `del self.stashed_model_static_state`. If one process fails to unregister, this could lead to inconsistent state across processes. Consider adding error handling or verification that all processes successfully complete the memory operations before proceeding.

Copilot uses AI. Check for mistakes.

if GPU_MEMORY_TYPE_KV_CACHE in tags:
self.memory_saver_adapter.resume(GPU_MEMORY_TYPE_KV_CACHE)

Expand All @@ -646,7 +657,7 @@ index 293a84350..68911c433 100644

def check_weights(self: Scheduler, recv_req: CheckWeightsReqInput):
diff --git a/python/sglang/srt/managers/tokenizer_communicator_mixin.py b/python/sglang/srt/managers/tokenizer_communicator_mixin.py
index e5d42bed8..412293b30 100644
index e5d42be..412293b 100644
--- a/python/sglang/srt/managers/tokenizer_communicator_mixin.py
+++ b/python/sglang/srt/managers/tokenizer_communicator_mixin.py
@@ -49,6 +49,8 @@ from sglang.srt.managers.io_struct import (
Expand Down Expand Up @@ -698,7 +709,7 @@ index e5d42bed8..412293b30 100644
self,
obj: InitWeightsSendGroupForRemoteInstanceReqInput,
diff --git a/python/sglang/srt/managers/tokenizer_manager.py b/python/sglang/srt/managers/tokenizer_manager.py
index f4fc29e29..5ef12cca6 100644
index f4fc29e..5ef12cc 100644
--- a/python/sglang/srt/managers/tokenizer_manager.py
+++ b/python/sglang/srt/managers/tokenizer_manager.py
@@ -1652,12 +1652,13 @@ class TokenizerManager(TokenizerCommunicatorMixin, TokenizerManagerMultiItemMixi
Expand All @@ -722,7 +733,7 @@ index f4fc29e29..5ef12cca6 100644
recv_obj.output_token_logprobs_val[recv_obj_index]
)
diff --git a/python/sglang/srt/managers/tp_worker.py b/python/sglang/srt/managers/tp_worker.py
index 1f1875254..51d8651ce 100644
index 1f18752..51d8651 100644
--- a/python/sglang/srt/managers/tp_worker.py
+++ b/python/sglang/srt/managers/tp_worker.py
@@ -27,6 +27,7 @@ from sglang.srt.managers.io_struct import (
Expand All @@ -746,7 +757,7 @@ index 1f1875254..51d8651ce 100644
parameter = self.model_runner.get_weights_by_name(
recv_req.name, recv_req.truncate_size
diff --git a/python/sglang/srt/model_executor/model_runner.py b/python/sglang/srt/model_executor/model_runner.py
index 1d69c0582..c849913e9 100644
index 1d69c05..b516607 100644
--- a/python/sglang/srt/model_executor/model_runner.py
+++ b/python/sglang/srt/model_executor/model_runner.py
@@ -558,7 +558,8 @@ class ModelRunner(ModelRunnerKVCacheMixin):
Expand All @@ -759,7 +770,45 @@ index 1d69c0582..c849913e9 100644

if self.device == "cuda":
self.init_cublas()
@@ -2224,11 +2225,19 @@ class ModelRunner(ModelRunnerKVCacheMixin):
@@ -635,6 +636,37 @@ class ModelRunner(ModelRunnerKVCacheMixin):
f"{local_ip}:{self.remote_instance_transfer_engine.get_rpc_port()}"
)

+ def remote_instance_register_memory_region(self):
+ if self.remote_instance_transfer_engine is None:
+ return
+
+ logger.debug("Registering memory regions to transfer engine after memory saver resume")
+ self.remote_instance_transfer_engine_weight_info = register_memory_region(
+ self.model, self.remote_instance_transfer_engine
+ )
+
+ def remote_instance_unregister_memory_region(self):
Copy link

Copilot AI Jan 13, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

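The method name "remote_instance_unregister_memory_region" uses singular "region" while the method actually handles multiple regions. For consistency with the method's functionality and the logging message within it ("regions" plural), consider renaming to "remote_instance_unregister_memory_regions". Note that the call site in scheduler_update_weights_mixin.py must be updated in the same change, and the sibling "remote_instance_register_memory_region" should be renamed as well so the pair stays consistent.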

Suggested change
+ def remote_instance_unregister_memory_region(self):
+ def remote_instance_unregister_memory_regions(self):

Copilot uses AI. Check for mistakes.
+ if self.remote_instance_transfer_engine is None:
+ return
+
+ logger.debug("Unregistering old memory regions from transfer engine")
Copy link

Copilot AI Jan 13, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The logging message uses "Unregistering old memory regions" which could be misleading. This method is called during memory saver pause operations, not just for "old" regions. Consider making the log message more specific, such as "Unregistering memory regions during memory saver pause" to clarify the context.

Suggested change
+ logger.debug("Unregistering old memory regions from transfer engine")
+ logger.debug("Unregistering memory regions from transfer engine during memory saver pause")

Copilot uses AI. Check for mistakes.
+ registered_blocks = []
+ old_addrs = set()
+ for name, (data_ptr, numel, element_size) in (
+ self.remote_instance_transfer_engine_weight_info.items()
+ ):
+ if data_ptr not in old_addrs:
+ old_addrs.add(data_ptr)
+ registered_blocks.append((data_ptr, numel * element_size))
+
+ for addr, size in registered_blocks:
+ try:
+ self.remote_instance_transfer_engine.unregister_memory(addr)
+ except Exception as e:
+ logger.debug(f"Failed to unregister memory at {addr}: {e}")
Copy link

Copilot AI Jan 13, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The error handling around the `unregister_memory` call in `remote_instance_unregister_memory_region` catches all exceptions but only logs them at debug level. If unregistering memory fails, this could lead to memory leaks or resource management issues. Consider logging at warning or error level instead of debug, and potentially tracking whether unregistration succeeded to handle cleanup failures appropriately.

Suggested change
+ logger.debug(f"Failed to unregister memory at {addr}: {e}")
+ logger.warning(f"Failed to unregister memory at {addr}: {e}")

Copilot uses AI. Check for mistakes.
+
+ self.remote_instance_transfer_engine_weight_info = None
Comment on lines +793 to +806
Copy link

Copilot AI Jan 13, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The method sets remote_instance_transfer_engine_weight_info to None after unregistering memory regions, but there's no check that it is a dictionary before iterating over it. If this method is called twice in a row, or before any registration has happened, the `.items()` call will raise an AttributeError. Consider returning early when the attribute is missing or None.

Copilot uses AI. Check for mistakes.
+
def model_specific_adjustment(self):
server_args = self.server_args

@@ -2224,11 +2256,19 @@ class ModelRunner(ModelRunnerKVCacheMixin):
output.expert_distribution_metrics = recorder_outputs.get("metrics")

# Copy cached routing experts' buffers back to CPU cache
Expand All @@ -784,7 +833,7 @@ index 1d69c0582..c849913e9 100644

if self.eplb_manager is not None:
self.eplb_manager.on_forward_pass_end()
@@ -2436,6 +2445,42 @@ class ModelRunner(ModelRunnerKVCacheMixin):
@@ -2436,6 +2476,42 @@ class ModelRunner(ModelRunnerKVCacheMixin):
logger.error(f"IPC weight update failed: {e}")
return False, str(e)

Expand Down Expand Up @@ -828,7 +877,7 @@ index 1d69c0582..c849913e9 100644
def _model_load_weights_direct(model, named_tensors: List[Tuple[str, torch.Tensor]]):
params_dict = dict(model.named_parameters())
diff --git a/python/sglang/srt/models/deepseek_v2.py b/python/sglang/srt/models/deepseek_v2.py
index 2918461d3..d44c8aaa0 100644
index 2918461..d44c8aa 100644
--- a/python/sglang/srt/models/deepseek_v2.py
+++ b/python/sglang/srt/models/deepseek_v2.py
@@ -2704,7 +2704,11 @@ class DeepseekV2AttentionMLA(nn.Module):
Expand Down Expand Up @@ -873,7 +922,7 @@ index 2918461d3..d44c8aaa0 100644
if is_nextn and enable_nextn_moe_bf16_cast_to_fp8(self.quant_config):
self._mark_nextn_moe_weights_as_ue8m0()
diff --git a/python/sglang/srt/models/qwen2.py b/python/sglang/srt/models/qwen2.py
index a7dbadec6..c83a41338 100644
index a7dbade..c83a413 100644
--- a/python/sglang/srt/models/qwen2.py
+++ b/python/sglang/srt/models/qwen2.py
@@ -90,9 +90,6 @@ class Qwen2MLP(nn.Module):
Expand Down Expand Up @@ -911,7 +960,7 @@ index a7dbadec6..c83a41338 100644
if get_global_server_args().rl_on_policy_target is not None
else {}
diff --git a/python/sglang/srt/models/qwen2_moe.py b/python/sglang/srt/models/qwen2_moe.py
index 3ad9f6736..0b9c7f499 100644
index 3ad9f67..0b9c7f4 100644
--- a/python/sglang/srt/models/qwen2_moe.py
+++ b/python/sglang/srt/models/qwen2_moe.py
@@ -586,7 +586,17 @@ class Qwen2MoeModel(nn.Module):
Expand All @@ -934,7 +983,7 @@ index 3ad9f6736..0b9c7f499 100644
self.norm = PPMissingLayer(return_tuple=True)

diff --git a/python/sglang/srt/models/qwen3.py b/python/sglang/srt/models/qwen3.py
index 9220831f6..47a1a4e4c 100644
index 9220831..47a1a4e 100644
--- a/python/sglang/srt/models/qwen3.py
+++ b/python/sglang/srt/models/qwen3.py
@@ -90,8 +90,8 @@ class Qwen3Attention(nn.Module):
Expand All @@ -960,7 +1009,7 @@ index 9220831f6..47a1a4e4c 100644
if get_global_server_args().rl_on_policy_target is not None
else {}
diff --git a/python/sglang/srt/models/qwen3_moe.py b/python/sglang/srt/models/qwen3_moe.py
index e11678a9e..e277d46f2 100644
index e11678a..e277d46 100644
--- a/python/sglang/srt/models/qwen3_moe.py
+++ b/python/sglang/srt/models/qwen3_moe.py
@@ -22,6 +22,7 @@ import math
Expand Down Expand Up @@ -1070,7 +1119,7 @@ index e11678a9e..e277d46f2 100644

self.layer_communicator = LayerCommunicator(
diff --git a/python/sglang/srt/models/qwen3_vl.py b/python/sglang/srt/models/qwen3_vl.py
index 891913078..c9dbecd23 100644
index 8919130..c9dbecd 100644
--- a/python/sglang/srt/models/qwen3_vl.py
+++ b/python/sglang/srt/models/qwen3_vl.py
@@ -397,28 +397,68 @@ class Qwen3VLMoeVisionModel(nn.Module, RotaryPosMixin):
Expand Down Expand Up @@ -1186,7 +1235,7 @@ index 891913078..c9dbecd23 100644
positions,
hidden_states,
diff --git a/python/sglang/srt/server_args.py b/python/sglang/srt/server_args.py
index 54d4e415a..de7620c20 100644
index 54d4e41..84831ad 100644
--- a/python/sglang/srt/server_args.py
+++ b/python/sglang/srt/server_args.py
@@ -523,6 +523,7 @@ class ServerArgs:
Expand All @@ -1209,8 +1258,25 @@ index 54d4e415a..de7620c20 100644
parser.add_argument(
"--disable-cuda-graph-padding",
action="store_true",
@@ -4972,11 +4978,11 @@ class ServerArgs:
f"Failed to import mooncake.engine. Does not support using TransferEngine as remote instance weight loader backend."
)
return False
- elif self.enable_memory_saver:
- logger.warning(
- "Memory saver is enabled, which is not compatible with TransferEngine. Does not support using TransferEngine as remote instance weight loader backend."
- )
- return False
+ # elif self.enable_memory_saver:
+ # logger.warning(
+ # "Memory saver is enabled, which is not compatible with TransferEngine. Does not support using TransferEngine as remote instance weight loader backend."
+ # )
+ # return False
Comment on lines +1270 to +1274
Copy link

Copilot AI Jan 13, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The commented-out code that checks for memory saver compatibility with TransferEngine represents a significant behavioral change. Removing this safety check without proper justification could lead to runtime issues. The original check prevented using TransferEngine when memory saver is enabled due to incompatibility. If this restriction is being removed, there should be clear evidence that the incompatibility has been resolved, or documentation explaining why this check is no longer necessary.

Suggested change
+ # elif self.enable_memory_saver:
+ # logger.warning(
+ # "Memory saver is enabled, which is not compatible with TransferEngine. Does not support using TransferEngine as remote instance weight loader backend."
+ # )
+ # return False
+ elif self.enable_memory_saver:
+ logger.warning(
+ "Memory saver is enabled, which is not compatible with TransferEngine. Does not support using TransferEngine as remote instance weight loader backend."
+ )
+ return False

Copilot uses AI. Check for mistakes.
else:
return True

diff --git a/python/sglang/srt/speculative/eagle_draft_cuda_graph_runner.py b/python/sglang/srt/speculative/eagle_draft_cuda_graph_runner.py
index 5fe45086c..c95fbd0f6 100644
index 5fe4508..c95fbd0 100644
--- a/python/sglang/srt/speculative/eagle_draft_cuda_graph_runner.py
+++ b/python/sglang/srt/speculative/eagle_draft_cuda_graph_runner.py
@@ -341,7 +341,10 @@ class EAGLEDraftCudaGraphRunner:
Expand All @@ -1237,7 +1303,7 @@ index 5fe45086c..c95fbd0f6 100644
self.req_pool_indices[:raw_bs].copy_(forward_batch.req_pool_indices)

diff --git a/python/sglang/srt/speculative/eagle_info.py b/python/sglang/srt/speculative/eagle_info.py
index 1bf3816e9..b5b41dba4 100644
index 1bf3816..b5b41db 100644
--- a/python/sglang/srt/speculative/eagle_info.py
+++ b/python/sglang/srt/speculative/eagle_info.py
@@ -778,6 +778,10 @@ class EagleDraftInput(SpecInput, EagleDraftInputV2Mixin):
Expand Down Expand Up @@ -1280,7 +1346,7 @@ index 1bf3816e9..b5b41dba4 100644

@dataclass
diff --git a/python/sglang/srt/speculative/eagle_worker.py b/python/sglang/srt/speculative/eagle_worker.py
index a702df4f8..61d9ae366 100644
index a702df4..61d9ae3 100644
--- a/python/sglang/srt/speculative/eagle_worker.py
+++ b/python/sglang/srt/speculative/eagle_worker.py
@@ -231,7 +231,7 @@ class EAGLEWorker(TpModelWorker):
Expand Down