
Commit 25cf29f

Author: puyuan
Commit message: polish(pu): rename step_index to timestep
Parent: 3dfc1f2

File tree: 5 files changed, +34 -20 lines


lzero/mcts/buffer/game_segment.py (+5 -5)

@@ -68,7 +68,7 @@ def __init__(self, action_space: int, game_segment_length: int = 200, config: Ea
 
 self.action_mask_segment = []
 self.to_play_segment = []
-self.step_index_segment = []
+self.timestep_segment = []
 
 self.target_values = []
 self.target_rewards = []
@@ -136,7 +136,7 @@ def append(
 reward: np.ndarray,
 action_mask: np.ndarray = None,
 to_play: int = -1,
-step_index: int = 0,
+timestep: int = 0,
 chance: int = 0,
 ) -> None:
 """
@@ -149,7 +149,7 @@ def append(
 
 self.action_mask_segment.append(action_mask)
 self.to_play_segment.append(to_play)
-self.step_index_segment.append(step_index)
+self.timestep_segment.append(timestep)
 
 if self.use_ture_chance_label_in_chance_encoder:
     self.chance_segment.append(chance)
@@ -300,7 +300,7 @@ def game_segment_to_array(self) -> None:
 
 self.action_mask_segment = np.array(self.action_mask_segment)
 self.to_play_segment = np.array(self.to_play_segment)
-self.step_index_segment = np.array(self.step_index_segment)
+self.timestep_segment = np.array(self.timestep_segment)
 
 if self.use_ture_chance_label_in_chance_encoder:
     self.chance_segment = np.array(self.chance_segment)
@@ -322,7 +322,7 @@ def reset(self, init_observations: np.ndarray) -> None:
 
 self.action_mask_segment = []
 self.to_play_segment = []
-self.step_index_segment = []
+self.timestep_segment = []
 
 if self.use_ture_chance_label_in_chance_encoder:
     self.chance_segment = []
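To make the rename concrete, here is a minimal usage sketch of the new keyword. It assumes `segment` is an already constructed GameSegment and `obs` is one observation dict produced by the env; the positional argument order follows the collector call shown further down in this commit, and the snippet is illustrative rather than standalone-runnable.

# Hedged sketch: `segment`, `action`, `reward`, and `obs` are placeholders.
segment.append(
    action,                          # action executed at this step
    obs['observation'],              # current (stacked) observation
    reward,
    action_mask=obs['action_mask'],
    to_play=obs['to_play'],
    timestep=obs['timestep'],        # renamed keyword, formerly step_index
)
# Internally this now extends segment.timestep_segment rather than segment.step_index_segment.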

lzero/policy/unizero.py (+1)

@@ -182,6 +182,7 @@ class UniZeroPolicy(MuZeroPolicy):
 update_per_collect=None,
 # (float) The ratio of the collected data used for training. Only effective when ``update_per_collect`` is not None.
 replay_ratio=0.25,
+reanalyze_ratio=0,
 # (int) Minibatch size for one gradient descent.
 batch_size=256,
 # (str) Optimizer for training policy network.
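The single change here registers a `reanalyze_ratio` default in the UniZero policy config. A hedged override sketch, with the neighbouring keys copied from the defaults above (interpreting 0 as "no reanalysis" is an assumption, not stated in this diff):

# Hypothetical user-side override of the new default.
policy = dict(
    update_per_collect=None,
    replay_ratio=0.25,
    reanalyze_ratio=0,  # new default in this commit; 0 is assumed to keep reanalysis off
    batch_size=256,
)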

lzero/worker/muzero_collector.py (+11 -11)

@@ -360,7 +360,7 @@ def collect(self,
 
 action_mask_dict = {i: to_ndarray(init_obs[i]['action_mask']) for i in range(env_nums)}
 to_play_dict = {i: to_ndarray(init_obs[i]['to_play']) for i in range(env_nums)}
-step_index_dict = {i: to_ndarray(init_obs[i]['step_index']) for i in range(env_nums)}
+timestep_dict = {i: to_ndarray(init_obs[i]['timestep']) for i in range(env_nums)}
 if self.policy_config.use_ture_chance_label_in_chance_encoder:
     chance_dict = {i: to_ndarray(init_obs[i]['chance']) for i in range(env_nums)}
 
@@ -421,11 +421,11 @@ def collect(self,
 
 action_mask_dict = {env_id: action_mask_dict[env_id] for env_id in ready_env_id}
 to_play_dict = {env_id: to_play_dict[env_id] for env_id in ready_env_id}
-step_index_dict = {env_id: step_index_dict[env_id] for env_id in ready_env_id}
+timestep_dict = {env_id: timestep_dict[env_id] for env_id in ready_env_id}
 
 action_mask = [action_mask_dict[env_id] for env_id in ready_env_id]
 to_play = [to_play_dict[env_id] for env_id in ready_env_id]
-step_index = [step_index_dict[env_id] for env_id in ready_env_id]
+timestep = [timestep_dict[env_id] for env_id in ready_env_id]
 
 if self.policy_config.use_ture_chance_label_in_chance_encoder:
     chance_dict = {env_id: chance_dict[env_id] for env_id in ready_env_id}
@@ -439,13 +439,13 @@ def collect(self,
 # Key policy forward step
 # ==============================================================
 # print(f'ready_env_id:{ready_env_id}')
-policy_output = self._policy.forward(stack_obs, action_mask, temperature, to_play, epsilon, ready_env_id=ready_env_id, step_index=step_index)
+policy_output = self._policy.forward(stack_obs, action_mask, temperature, to_play, epsilon, ready_env_id=ready_env_id, timestep=timestep)
 
 # Extract relevant policy outputs
 actions_with_env_id = {k: v['action'] for k, v in policy_output.items()}
 value_dict_with_env_id = {k: v['searched_value'] for k, v in policy_output.items()}
 pred_value_dict_with_env_id = {k: v['predicted_value'] for k, v in policy_output.items()}
-step_index_dict_with_env_id = {k: v['step_index'] for k, v in policy_output.items()}
+timestep_dict_with_env_id = {k: v['timestep'] for k, v in policy_output.items()}
 
 if self.policy_config.sampled_algo:
     root_sampled_actions_dict_with_env_id = {
@@ -467,7 +467,7 @@ def collect(self,
 actions = {}
 value_dict = {}
 pred_value_dict = {}
-step_index_dict = {}
+timestep_dict = {}
 
 if not collect_with_pure_policy:
     distributions_dict = {}
@@ -485,7 +485,7 @@ def collect(self,
 actions[env_id] = actions_with_env_id.pop(env_id)
 value_dict[env_id] = value_dict_with_env_id.pop(env_id)
 pred_value_dict[env_id] = pred_value_dict_with_env_id.pop(env_id)
-step_index_dict[env_id] = step_index_dict_with_env_id.pop(env_id)
+timestep_dict[env_id] = timestep_dict_with_env_id.pop(env_id)
 
 if not collect_with_pure_policy:
     distributions_dict[env_id] = distributions_dict_with_env_id.pop(env_id)
@@ -536,19 +536,19 @@ def collect(self,
 if self.policy_config.use_ture_chance_label_in_chance_encoder:
     game_segments[env_id].append(
         actions[env_id], to_ndarray(obs['observation']), reward, action_mask_dict[env_id],
-        to_play_dict[env_id], chance_dict[env_id], step_index_dict[env_id]
+        to_play_dict[env_id], chance_dict[env_id], timestep_dict[env_id]
     )
 else:
     game_segments[env_id].append(
         actions[env_id], to_ndarray(obs['observation']), reward, action_mask_dict[env_id],
-        to_play_dict[env_id], step_index_dict[env_id]
+        to_play_dict[env_id], timestep_dict[env_id]
     )
 
 # NOTE: the position of code snippet is very important.
 # the obs['action_mask'] and obs['to_play'] are corresponding to the next action
 action_mask_dict[env_id] = to_ndarray(obs['action_mask'])
 to_play_dict[env_id] = to_ndarray(obs['to_play'])
-step_index_dict[env_id] = to_ndarray(obs['step_index'])
+timestep_dict[env_id] = to_ndarray(obs['timestep'])
 if self.policy_config.use_ture_chance_label_in_chance_encoder:
     chance_dict[env_id] = to_ndarray(obs['chance'])
 
@@ -679,7 +679,7 @@ def collect(self,
 
 action_mask_dict[env_id] = to_ndarray(init_obs[env_id]['action_mask'])
 to_play_dict[env_id] = to_ndarray(init_obs[env_id]['to_play'])
-step_index_dict[env_id] = to_ndarray(init_obs[env_id]['step_index'])
+timestep_dict[env_id] = to_ndarray(init_obs[env_id]['timestep'])
 if self.policy_config.use_ture_chance_label_in_chance_encoder:
     chance_dict[env_id] = to_ndarray(init_obs[env_id]['chance'])
 
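Taken together, these hunks rename one data path end to end inside collect(). A condensed sketch of that path, with the statements copied from the hunks above and the surrounding control flow omitted:

# 1. Read the renamed key from the initial observations.
timestep_dict = {i: to_ndarray(init_obs[i]['timestep']) for i in range(env_nums)}
# 2. Gather the values for the ready envs and pass them to the policy.
timestep = [timestep_dict[env_id] for env_id in ready_env_id]
policy_output = self._policy.forward(
    stack_obs, action_mask, temperature, to_play, epsilon,
    ready_env_id=ready_env_id, timestep=timestep,
)
# 3. Carry the per-env 'timestep' output into the game segments via append(...).
timestep_dict_with_env_id = {k: v['timestep'] for k, v in policy_output.items()}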

zoo/atari/config/atari_unizero_config.py (+16 -4)

@@ -11,14 +11,25 @@ def main(env_id='PongNoFrameskip-v4', seed=0):
 # ==============================================================
 collector_env_num = 8
 game_segment_length = 20
-evaluator_env_num = 5
+evaluator_env_num = 3
 num_simulations = 50
 max_env_step = int(5e5)
 batch_size = 64
 num_unroll_steps = 10
 infer_context_length = 4
 num_layers = 2
 replay_ratio = 0.25
+
+# collector_env_num = 2
+# game_segment_length = 20
+# evaluator_env_num = 1
+# num_simulations = 2
+# max_env_step = int(5e5)
+# batch_size = 2
+# num_unroll_steps = 5
+# infer_context_length = 2
+# num_layers = 1
+# replay_ratio = 0.1
 # ==============================================================
 # end of the most frequently changed config specified by the user
 # ==============================================================
@@ -33,8 +44,8 @@ def main(env_id='PongNoFrameskip-v4', seed=0):
 n_evaluator_episode=evaluator_env_num,
 manager=dict(shared_memory=False, ),
 # TODO: only for debug
-# collect_max_episode_steps=int(20),
-# eval_max_episode_steps=int(20),
+# collect_max_episode_steps=int(50),
+# eval_max_episode_steps=int(50),
 ),
 policy=dict(
 learn=dict(learner=dict(hook=dict(save_ckpt_after_iter=1000000, ), ), ), # default is 10000
@@ -68,6 +79,7 @@ def main(env_id='PongNoFrameskip-v4', seed=0):
 learning_rate=0.0001,
 num_simulations=num_simulations,
 train_start_after_envsteps=2000,
+# train_start_after_envsteps=0, # debug
 game_segment_length=game_segment_length,
 replay_buffer_size=int(1e6),
 eval_freq=int(5e3),
@@ -92,7 +104,7 @@ def main(env_id='PongNoFrameskip-v4', seed=0):
 atari_unizero_create_config = EasyDict(atari_unizero_create_config)
 create_config = atari_unizero_create_config
 
-main_config.exp_name = f'data_unizero/{env_id[:-14]}/{env_id[:-14]}_uz_nlayer{num_layers}_gsl{game_segment_length}_rr{replay_ratio}_Htrain{num_unroll_steps}-Hinfer{infer_context_length}_bs{batch_size}_seed{seed}'
+main_config.exp_name = f'data_unizero_20250211/{env_id[:-14]}/{env_id[:-14]}_uz_poeembed-mergemain_nlayer{num_layers}_gsl{game_segment_length}_rr{replay_ratio}_Htrain{num_unroll_steps}-Hinfer{infer_context_length}_bs{batch_size}_seed{seed}'
 from lzero.entry import train_unizero
 train_unizero([main_config, create_config], seed=seed, model_path=main_config.policy.model_path, max_env_step=max_env_step)
 
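For reference, a hedged invocation sketch; main is the function whose default arguments appear in the hunk headers above, and the __main__ guard is assumed rather than shown in this diff:

# Run the Pong config with the defaults declared by main().
if __name__ == "__main__":
    main(env_id='PongNoFrameskip-v4', seed=0)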

zoo/atari/envs/atari_lightzero_env.py (+1)

@@ -154,6 +154,7 @@ def step(self, action: int) -> BaseEnvTimestep:
 # print(f'self.timestep: {self.timestep}')
 observation = self.observe()
 if done:
+    print(f'done in self.timestep: {self.timestep}')
     info['eval_episode_return'] = self._eval_episode_return
 return BaseEnvTimestep(observation, self.reward, done, info)
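The env-side counterpart of the rename is implicit here: after this commit the collector reads obs['timestep'], so observe() is expected to expose that key. A hedged sketch of the keys the collector consumes (the actual construction inside observe() is not part of this diff, and the values below are placeholders):

# Keys the collector expects in each observation dict (placeholder values).
obs = {
    'observation': stacked_frames,   # processed frame stack
    'action_mask': action_mask,      # legal-action mask
    'to_play': -1,                   # current player id (-1 for single-player Atari)
    'timestep': self.timestep,       # env step counter, formerly 'step_index'
    # 'chance': chance,              # only when use_ture_chance_label_in_chance_encoder is enabled
}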
