diff --git a/megatron/core/pipeline_parallel/fine_grained_activation_offload.py b/megatron/core/pipeline_parallel/fine_grained_activation_offload.py index 08e46a039e2..99f66d04de9 100644 --- a/megatron/core/pipeline_parallel/fine_grained_activation_offload.py +++ b/megatron/core/pipeline_parallel/fine_grained_activation_offload.py @@ -867,6 +867,8 @@ def tensor_need_offloading_checker(self, tensor): # Respect tensor's offload preference if specified if hasattr(tensor, "offloading_activation") and not tensor.offloading_activation: return False + if getattr(tensor, "_TE_do_not_offload", False): + return False return True def bulk_offload_group(self):