
Commit bb5572b

Author: puyuan
Commit message: fix(pu): fix model.training bug
Parent: 0d15204

File tree

2 files changed, +26 −25 lines


lzero/mcts/buffer/game_buffer_muzero.py

+18 −18
@@ -461,15 +461,15 @@ def _compute_target_reward_value(self, reward_value_context: List[Any], model: A
                 m_output = model.initial_inference(m_obs)


-                if not model.training:
-                    # if not in training, obtain the scalars of the value/reward
-                    [m_output.latent_state, m_output.value, m_output.policy_logits] = to_detach_cpu_numpy(
-                        [
-                            m_output.latent_state,
-                            inverse_scalar_transform(m_output.value, self._cfg.model.support_scale),
-                            m_output.policy_logits
-                        ]
-                    )
+                # if not model.training:
+                # if not in training, obtain the scalars of the value/reward
+                [m_output.latent_state, m_output.value, m_output.policy_logits] = to_detach_cpu_numpy(
+                    [
+                        m_output.latent_state,
+                        inverse_scalar_transform(m_output.value, self._cfg.model.support_scale),
+                        m_output.policy_logits
+                    ]
+                )

                 network_output.append(m_output)

@@ -589,15 +589,15 @@ def _compute_target_policy_reanalyzed(self, policy_re_context: List[Any], model:
             else:
                 m_output = model.initial_inference(m_obs)

-                if not model.training:
-                    # if not in training, obtain the scalars of the value/reward
-                    [m_output.latent_state, m_output.value, m_output.policy_logits] = to_detach_cpu_numpy(
-                        [
-                            m_output.latent_state,
-                            inverse_scalar_transform(m_output.value, self._cfg.model.support_scale),
-                            m_output.policy_logits
-                        ]
-                    )
+                # if not model.training:
+                # if not in training, obtain the scalars of the value/reward
+                [m_output.latent_state, m_output.value, m_output.policy_logits] = to_detach_cpu_numpy(
+                    [
+                        m_output.latent_state,
+                        inverse_scalar_transform(m_output.value, self._cfg.model.support_scale),
+                        m_output.policy_logits
+                    ]
+                )

                 network_output.append(m_output)
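
Effect of the change in game_buffer_muzero.py: the conversion of the initial-inference outputs no longer depends on model.training, so both _compute_target_reward_value and _compute_target_policy_reanalyzed always work with detached CPU numpy arrays and a scalar value, even if reanalysis runs while the model happens to be in training mode (the most likely reading of the "model.training bug" named in the commit message). A minimal sketch of the resulting code path, assuming the LZero helpers to_detach_cpu_numpy and inverse_scalar_transform keep the signatures used above:

```python
# Sketch of the code path after this commit; not the full method, just the
# fragment touched by the two hunks above.
m_output = model.initial_inference(m_obs)

# if not model.training:   # guard removed: the conversion now always runs
[m_output.latent_state, m_output.value, m_output.policy_logits] = to_detach_cpu_numpy(
    [
        m_output.latent_state,
        # map the categorical (support) value back to a scalar
        inverse_scalar_transform(m_output.value, self._cfg.model.support_scale),
        m_output.policy_logits,
    ]
)
network_output.append(m_output)
```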

zoo/atari/config/atari_unizero_multitask_segment_8games_ddp_config.py

+8 −7
@@ -73,8 +73,8 @@ def create_config(env_id, action_space_size, collector_env_num, evaluator_env_nu
             model_path=None,
             num_unroll_steps=num_unroll_steps,
             game_segment_length=20,
-            # update_per_collect=80,
-            update_per_collect=10, # only for debug
+            update_per_collect=80,
+            # update_per_collect=10, # only for debug
             replay_ratio=0.25,
             batch_size=batch_size,
             num_segments=num_segments,
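
In DI-engine-style training loops, update_per_collect sets how many gradient-update steps run after each data-collection phase; this hunk restores the production value of 80 after a temporary debug value of 10. A rough sketch of where these keys sit, assuming the usual LZero Atari config layout with a policy=dict(...) block (the exact nesting is not shown in the diff):

```python
# Hypothetical fragment of the config after this commit (keys from the hunk above).
policy = dict(
    model_path=None,
    num_unroll_steps=num_unroll_steps,
    game_segment_length=20,
    update_per_collect=80,   # restored; was temporarily 10 "only for debug"
    replay_ratio=0.25,
    batch_size=batch_size,
    num_segments=num_segments,
)
```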
@@ -96,7 +96,7 @@ def generate_configs(env_id_list, action_space_size, collector_env_num, n_episod
         norm_type, seed, buffer_reanalyze_freq, reanalyze_batch_size, reanalyze_partition,
         num_segments, total_batch_size):
     configs = []
-    exp_name_prefix = f'data_unizero_atari_mt_20250216/{len(env_id_list)}games_nlayer8_bs64_brf{buffer_reanalyze_freq}_seed{seed}/'
+    exp_name_prefix = f'data_unizero_atari_mt_20250217/{len(env_id_list)}games_nlayer8_bs64_brf{buffer_reanalyze_freq}_seed{seed}_dev-uz-mz/'

     for task_id, env_id in enumerate(env_id_list):
         config = create_config(
@@ -164,8 +164,9 @@ def create_env_manager():
     # num_segments = 2
     # n_episode = 2
     # evaluator_env_num = 2
-    # num_simulations = 2
-    # batch_size = [4, 4, 4, 4, 4, 4, 4, 4]
+    # num_simulations = 5
+    # # batch_size = [4, 4, 4, 4, 4, 4, 4, 4]
+    # batch_size = [4, 4,4,4]


     for seed in [0]:
@@ -175,5 +176,5 @@ def create_env_manager():
                               num_segments, total_batch_size)

     with DDPContext():
-        # train_unizero_multitask_segment_ddp(configs, seed=seed, max_env_step=max_env_step)
-        train_unizero_multitask_segment_ddp(configs[:4], seed=seed, max_env_step=max_env_step) # train on the first four tasks, only for debug
+        train_unizero_multitask_segment_ddp(configs, seed=seed, max_env_step=max_env_step)
+        # train_unizero_multitask_segment_ddp(configs[:4], seed=seed, max_env_step=max_env_step) # train on the first four tasks, only for debug
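
The last hunk switches the DDP launch back from the four-task debug slice to the full config list. A minimal sketch of the effective launch loop after this commit, using only names that appear in this config file (imports shown are assumptions about what the file's header already pulls in; the generate_configs arguments are elided here):

```python
# Assumed imports, as presumably declared at the top of this config file.
from ding.utils import DDPContext
from lzero.entry import train_unizero_multitask_segment_ddp

for seed in [0]:
    configs = generate_configs(...)  # full argument list as in the hunks above
    with DDPContext():
        # Train on all tasks; the configs[:4] debug slice is now commented out.
        train_unizero_multitask_segment_ddp(configs, seed=seed, max_env_step=max_env_step)
```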
