
Commit 243675b

Author: puyuan

fix(pu): fix lr target_model_update bug when accumulation_steps>1

1 parent 4a0d1c6 · commit 243675b

File tree: 3 files changed, +13 -13 lines changed


lzero/policy/unizero.py (+5 -6)

@@ -514,17 +514,16 @@ def _forward_learn(self, data: Tuple[torch.Tensor]) -> Dict[str, Union[float, int]]:
 
             self._optimizer_world_model.step()
 
-            if self._cfg.cos_lr_scheduler or self._cfg.piecewise_decay_lr_scheduler:
-                self.lr_scheduler.step()
-
-            # Core target model update step
-            self._target_model.update(self._learn_model.state_dict())
-
             if self.accumulation_steps > 1:
                 torch.cuda.empty_cache()
         else:
             total_grad_norm_before_clip_wm = torch.tensor(0.)
 
+        if self._cfg.cos_lr_scheduler or self._cfg.piecewise_decay_lr_scheduler:
+            self.lr_scheduler.step()
+        # Core target model update step
+        self._target_model.update(self._learn_model.state_dict())
+
         if torch.cuda.is_available():
             torch.cuda.synchronize()
             current_memory_allocated = torch.cuda.memory_allocated()
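For context, a minimal sketch of the pattern this hunk switches to. This is not the actual LZero training loop; the function and argument names below are hypothetical stand-ins. With gradient accumulation, the optimizer only steps every `accumulation_steps` iterations, so anything nested under that condition is skipped on the other iterations; the diff moves the LR scheduler step and the target-model update outside that branch so they run once per learner iteration regardless of the accumulation setting.

import torch

def learn_step(iter_idx, loss, model, target_model, optimizer, scheduler,
               accumulation_steps=4, grad_clip=5.0):
    # Scale the loss so the accumulated gradient matches a full-batch update.
    (loss / accumulation_steps).backward()

    if (iter_idx + 1) % accumulation_steps == 0:
        grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), grad_clip)
        optimizer.step()
        optimizer.zero_grad()
    else:
        grad_norm = torch.tensor(0.)  # no optimizer step on this iteration

    # Mirroring the diff above: these run on every learner iteration,
    # not only on iterations where the optimizer actually steps.
    scheduler.step()
    target_model.load_state_dict(model.state_dict())  # stand-in for self._target_model.update(...)
    return grad_norm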

zoo/jericho/configs/jericho_ppo_config.py (+6 -5)

@@ -1,21 +1,22 @@
 from easydict import EasyDict
 import torch.nn as nn
 
-action_space_size = 10
-max_steps = 50
+
 model_name = 'BAAI/bge-base-en-v1.5'
+evaluator_env_num = 2
+
 # env_id = 'detective.z5'
+action_space_size = 10
+max_steps = 50
 
+env_id = 'zork1.z5'
 action_space_size = 10
 max_steps = 400
-env_id = 'zork1.z5'
 
-evaluator_env_num = 2
 
 # proj train
 # collector_env_num = 18
 # batch_size = 320
-
 collector_env_num = 4
 batch_size = 32
 
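One reading of this reordering (mine, not a statement from the commit): the `action_space_size = 10` / `max_steps = 50` pair now sits next to the commented-out `detective.z5` setting, while the assignments under `env_id = 'zork1.z5'` come later and therefore take effect, since in a plain Python config module the last assignment wins. A toy illustration with the same variable names:

# Toy illustration: later module-level assignments override earlier ones.
action_space_size = 10
max_steps = 50      # grouped with the commented-out detective.z5 setting

env_id = 'zork1.z5'
action_space_size = 10
max_steps = 400     # this is the value the config actually uses

print(max_steps)    # -> 400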

zoo/jericho/envs/jericho_env.py (+2 -2)

@@ -108,7 +108,7 @@ def prepare_obs(self, obs, return_str: bool = False):
 
         action_mask = np.array(action_mask, dtype=np.int8)
 
-        if return_str:  # TODO: UniZero needs 'to_play' ===============
+        if return_str:  # TODO: UniZero needs 'to_play'; PPO must not include 'to_play' ===============
             return {'observation': full_obs, 'action_mask': action_mask, 'to_play': -1}
             # return {'observation': full_obs, 'action_mask': action_mask}
         else:

@@ -172,7 +172,7 @@ def step(self, action: int, return_str: bool = False):
             action_str = self._action_list[action]
         else:
             action_str = 'go'
-            print(f'rank {self.rank}, len(self._action_list) == 0, self._env.get_valid_actions():{self._env.get_valid_actions()}')
+            print(f"rank {self.rank}, len(self._action_list) == 0, self._env.get_valid_actions():{self._env.get_valid_actions()}, so we pass action_str='go'")
 
         # Record the previous observation
         if self.remove_stuck_actions and self.last_observation is not None:
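To make the updated TODO concrete, here is a small sketch of the two observation payloads it distinguishes. The helper below is hypothetical and not code from the repo; it only mirrors the dict shapes visible in the hunk above, where UniZero's pipeline expects a 'to_play' key (set to -1 for this single-player environment) and the PPO pipeline expects the dict without it.

import numpy as np

def build_obs(full_obs: str, action_mask: np.ndarray, for_unizero: bool) -> dict:
    """Hypothetical helper mirroring prepare_obs(): UniZero consumers get
    'to_play' (=-1 in single-player games), PPO consumers do not."""
    obs = {'observation': full_obs, 'action_mask': action_mask}
    if for_unizero:
        obs['to_play'] = -1
    return obs

mask = np.ones(10, dtype=np.int8)
print(build_obs('You are in a maze.', mask, for_unizero=True).keys())   # includes 'to_play'
print(build_obs('You are in a maze.', mask, for_unizero=False).keys())  # no 'to_play'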
