Skip to content

Commit 314c272

Browse files
authored
fix: Fix DCP-to-HF conversion for model-wrapped checkpoints (#1881)
Signed-off-by: ruit <ruit@nvidia.com>
1 parent 2196f40 commit 314c272

3 files changed

Lines changed: 48 additions & 14 deletions

File tree

nemo_rl/models/policy/workers/dtensor_policy_worker_v2.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1081,7 +1081,7 @@ def save_checkpoint(
10811081
optimizer=self.optimizer,
10821082
optimizer_path=optimizer_path,
10831083
scheduler=self.scheduler,
1084-
tokenizer=self.tokenizer if tokenizer_path is None else None,
1084+
tokenizer=self.tokenizer if tokenizer_path else None,
10851085
tokenizer_path=tokenizer_path,
10861086
checkpointing_cfg=checkpointing_cfg,
10871087
lora_enabled=self.lora_enabled,

nemo_rl/utils/native_checkpoint.py

Lines changed: 21 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -236,17 +236,32 @@ def convert_dcp_to_hf(
236236
raise FileExistsError(
237237
f"HF checkpoint already exists at {hf_ckpt_path}. Delete it to run or set overwrite=True."
238238
)
239-
240239
os.makedirs(hf_ckpt_path, exist_ok=True)
240+
241+
# The ckpt path of dtensor v2 is like <ckpt_dir>/model, while v1 is like <ckpt_dir>
242+
# Choose the correct subdir based on the presence of the metadata file.
243+
metadata_path = os.path.join(dcp_ckpt_path, ".metadata")
244+
if not os.path.exists(metadata_path):
245+
model_subdir = os.path.join(dcp_ckpt_path, "model")
246+
model_metadata_path = os.path.join(model_subdir, ".metadata")
247+
if os.path.exists(model_metadata_path):
248+
dcp_ckpt_path = model_subdir
249+
print(f"Using dcp_ckpt_path of Dtensor V2: {model_subdir}")
250+
else:
251+
raise FileNotFoundError(
252+
f"No metadata file found in {dcp_ckpt_path}(Dtensor V1 ckpt path) or {model_subdir}(Dtensor V2 ckpt path)."
253+
)
254+
else:
255+
print(f"Using dcp_ckpt_path of Dtensor V1: {dcp_ckpt_path}")
256+
241257
weights_path = os.path.join(hf_ckpt_path, "pytorch_model.bin")
242258
dcp_to_torch_save(dcp_ckpt_path, weights_path)
243259

244-
# Need to reload and save b/c the state dict is scoped inside the model key {"model": actual_state_dict}
260+
# Reload and save because DCP exports wrap weights under {"model": ...} in dtensor v1
261+
# while dtensor v2 already saves a flat state_dict.
245262
state_dict = torch.load(weights_path)
246-
assert set(state_dict.keys()) == {"model"}, (
247-
f"We expect that the state dict only has the top level model key, but found: {state_dict.keys()}"
248-
)
249-
torch.save(state_dict["model"], weights_path)
263+
if set(state_dict.keys()) == {"model"}:
264+
torch.save(state_dict["model"], weights_path)
250265

251266
config = AutoConfig.from_pretrained(
252267
model_name_or_path, trust_remote_code=True, **hf_overrides

tests/unit/utils/test_native_checkpoint.py

Lines changed: 26 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -54,6 +54,7 @@
5454
},
5555
"dtensor_cfg": {
5656
"enabled": True,
57+
"_v2": False,
5758
"cpu_offload": False,
5859
"sequence_parallel": False,
5960
"activation_checkpointing": False,
@@ -118,12 +119,20 @@ def tokenizer():
118119

119120

120121
@pytest.fixture(scope="function")
121-
def policy(cluster, tokenizer):
122-
"""Initialize the policy."""
122+
def policy(cluster, tokenizer, request):
123+
"""Initialize the policy with dtensor v1/v2."""
124+
use_v2 = bool(getattr(request, "param", False))
125+
config = {
126+
**simple_policy_config,
127+
"dtensor_cfg": {
128+
**simple_policy_config["dtensor_cfg"],
129+
"_v2": use_v2,
130+
},
131+
}
123132
policy = Policy(
124133
cluster=cluster,
125134
tokenizer=tokenizer,
126-
config=simple_policy_config,
135+
config=config,
127136
init_optimizer=True,
128137
init_reference_model=False,
129138
)
@@ -285,7 +294,8 @@ def test_save_and_load_model_and_optimizer(mock_experiment):
285294

286295

287296
@pytest.mark.parametrize("num_gpus", [1, 2], ids=["1gpu", "2gpu"])
288-
def test_convert_dcp_to_hf(policy, num_gpus):
297+
@pytest.mark.parametrize("policy", [False, True], ids=["v1", "v2"], indirect=True)
298+
def test_convert_dcp_to_hf(policy, num_gpus, request):
289299
## warm up with a forward pass
290300
## this is needed before saving a checkpoint because FSDP does some lazy initialization
291301
input_ids = torch.randint(0, 16000, (4, 128)) # 4 sequences, each of length 128
@@ -301,21 +311,30 @@ def test_convert_dcp_to_hf(policy, num_gpus):
301311
}
302312
)
303313
policy.train(dummy_fwd_dict, SimpleLoss())
314+
policy_version_is_v2 = request.node.callspec.params["policy"]
304315

305316
with TemporaryDirectory() as tmp_dir:
306317
policy.save_checkpoint(
307318
os.path.join(tmp_dir, "test_hf_and_dcp"),
319+
checkpointing_cfg={
320+
"enabled": True,
321+
"model_save_format": "torch_save" if policy_version_is_v2 else None,
322+
},
308323
)
309324

310325
# Dynamically create the expected set of distcp files based on num_gpus
311326
expected_distcp_files = {f"__{rank}_0.distcp" for rank in range(num_gpus)}
312327
expected_files = expected_distcp_files.union({".metadata"})
313328

314-
## make sure we save both HF and DCP checkpoints
315-
assert (
316-
set(os.listdir(os.path.join(tmp_dir, "test_hf_and_dcp"))) == expected_files
329+
ckpt_path = (
330+
os.path.join(tmp_dir, "test_hf_and_dcp", "model")
331+
if policy_version_is_v2
332+
else os.path.join(tmp_dir, "test_hf_and_dcp")
317333
)
318334

335+
## make sure we save both HF and DCP checkpoints
336+
assert set(os.listdir(ckpt_path)) == expected_files
337+
319338
offline_converted_model_path = convert_dcp_to_hf(
320339
os.path.join(tmp_dir, "test_hf_and_dcp"),
321340
os.path.join(tmp_dir, "test_hf_and_dcp-hf-offline"),

0 commit comments

Comments
 (0)