@@ -353,7 +353,7 @@ def _forward_learn(self, data: Tuple[torch.Tensor]) -> Dict[str, Union[float, in
self._target_model.train()

current_batch, target_batch, _ = data
- obs_batch_ori, action_batch, target_action_batch, mask_batch, indices, weights, make_time, step_index_batch = current_batch
+ obs_batch_ori, action_batch, target_action_batch, mask_batch, indices, weights, make_time, timestep_batch = current_batch
target_reward, target_value, target_policy = target_batch

# Prepare observations based on frame stack number
@@ -371,7 +371,7 @@ def _forward_learn(self, data: Tuple[torch.Tensor]) -> Dict[str, Union[float, in
# Prepare action batch and convert to torch tensor
action_batch = torch.from_numpy(action_batch).to(self._cfg.device).unsqueeze(
-1).long()  # For discrete action space
- step_index_batch = torch.from_numpy(step_index_batch).to(self._cfg.device).unsqueeze(
+ timestep_batch = torch.from_numpy(timestep_batch).to(self._cfg.device).unsqueeze(
-1).long()  # TODO: only for discrete action space
data_list = [mask_batch, target_reward, target_value, target_policy, weights]
mask_batch, target_reward, target_value, target_policy, weights = to_torch_float_tensor(data_list,
@@ -397,7 +397,7 @@ def _forward_learn(self, data: Tuple[torch.Tensor]) -> Dict[str, Union[float, in
self._cfg.batch_size, -1, *self._cfg.model.observation_shape)

batch_for_gpt['actions'] = action_batch.squeeze(-1)
- batch_for_gpt['step_index'] = step_index_batch.squeeze(-1)
+ batch_for_gpt['timestep'] = timestep_batch.squeeze(-1)


batch_for_gpt['rewards'] = target_reward_categorical[:, :-1]
@@ -569,7 +569,7 @@ def _forward_collect(
to_play: List = [-1],
epsilon: float = 0.25,
ready_env_id: np.ndarray = None,
- step_index: List = [0]
+ timestep: List = [0]
) -> Dict:
"""
Overview:
@@ -581,7 +581,7 @@ def _forward_collect(
- temperature (:obj:`float`): The temperature of the policy.
- to_play (:obj:`int`): The player to play.
- ready_env_id (:obj:`list`): The id of the env that is ready to collect.
- - step_index (:obj:`list`): The step index of the env in one episode
+ - timestep (:obj:`list`): The step index of the env in one episode
Shape:
- data (:obj:`torch.Tensor`):
- For Atari, :math:`(N, C*S, H, W)`, where N is the number of collect_env, C is the number of channels, \
@@ -591,7 +591,7 @@ def _forward_collect(
- temperature: :math:`(1, )`.
- to_play: :math:`(N, 1)`, where N is the number of collect_env.
- ready_env_id: None
- - step_index: :math:`(N, 1)`, where N is the number of collect_env.
+ - timestep: :math:`(N, 1)`, where N is the number of collect_env.
Returns:
- output (:obj:`Dict[int, Any]`): Dict type data, the keys including ``action``, ``distributions``, \
``visit_count_distribution_entropy``, ``value``, ``pred_value``, ``policy_logits``.
@@ -606,7 +606,7 @@ def _forward_collect(
output = {i: None for i in ready_env_id}

with torch.no_grad():
- network_output = self._collect_model.initial_inference(self.last_batch_obs, self.last_batch_action, data, step_index)
+ network_output = self._collect_model.initial_inference(self.last_batch_obs, self.last_batch_action, data, timestep)
latent_state_roots, reward_roots, pred_values, policy_logits = mz_network_output_unpack(network_output)

pred_values = self.inverse_scalar_transform_handle(pred_values).detach().cpu().numpy()
@@ -627,7 +627,7 @@ def _forward_collect(
roots = MCTSPtree.roots(active_collect_env_num, legal_actions)

roots.prepare(self._cfg.root_noise_weight, noises, reward_roots, policy_logits, to_play)
- self._mcts_collect.search(roots, self._collect_model, latent_state_roots, to_play, step_index)
+ self._mcts_collect.search(roots, self._collect_model, latent_state_roots, to_play, timestep)

# list of list, shape: ``{list: batch_size} -> {list: action_space_size}``
roots_visit_count_distributions = roots.get_distributions()
@@ -669,7 +669,7 @@ def _forward_collect(
'searched_value': value,
'predicted_value': pred_values[i],
'predicted_policy_logits': policy_logits[i],
- 'step_index': step_index[i]
+ 'timestep': timestep[i]
}
batch_action.append(action)

@@ -706,7 +706,7 @@ def _init_eval(self) -> None:
self.last_batch_action = [-1 for _ in range(self.evaluator_env_num)]

def _forward_eval(self, data: torch.Tensor, action_mask: list, to_play: int = -1,
- ready_env_id: np.array = None, step_index: int = 0) -> Dict:
+ ready_env_id: np.array = None, timestep: int = 0) -> Dict:
"""
Overview:
The forward function for evaluating the current policy in eval mode. Use model to execute MCTS search.
@@ -734,7 +734,7 @@ def _forward_eval(self, data: torch.Tensor, action_mask: list, to_play: int = -1
ready_env_id = np.arange(active_eval_env_num)
output = {i: None for i in ready_env_id}
with torch.no_grad():
- network_output = self._eval_model.initial_inference(self.last_batch_obs, self.last_batch_action, data, step_index)
+ network_output = self._eval_model.initial_inference(self.last_batch_obs, self.last_batch_action, data, timestep)
latent_state_roots, reward_roots, pred_values, policy_logits = mz_network_output_unpack(network_output)

# if not in training, obtain the scalars of the value/reward
@@ -750,7 +750,7 @@ def _forward_eval(self, data: torch.Tensor, action_mask: list, to_play: int = -1
# python mcts_tree
roots = MCTSPtree.roots(active_eval_env_num, legal_actions)
roots.prepare_no_noise(reward_roots, policy_logits, to_play)
- self._mcts_eval.search(roots, self._eval_model, latent_state_roots, to_play, step_index)
+ self._mcts_eval.search(roots, self._eval_model, latent_state_roots, to_play, timestep)

# list of list, shape: ``{list: batch_size} -> {list: action_space_size}``
roots_visit_count_distributions = roots.get_distributions()
@@ -780,7 +780,7 @@ def _forward_eval(self, data: torch.Tensor, action_mask: list, to_play: int = -1
'searched_value': value,
'predicted_value': pred_values[i],
'predicted_policy_logits': policy_logits[i],
- 'step_index': step_index[i]
+ 'timestep': timestep[i]
}
batch_action.append(action)

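For reference, a minimal standalone sketch of the tensor handling that the renamed `timestep_batch` goes through in `_forward_learn` (this is not the policy's code; it only uses torch and numpy, and the batch values below are made up for illustration):

import numpy as np
import torch

# Hypothetical stand-ins for values that normally come from the replay buffer.
batch_size, num_unroll_steps = 4, 5
timestep_batch = np.arange(batch_size * num_unroll_steps).reshape(batch_size, num_unroll_steps)
device = 'cpu'  # would be self._cfg.device in the policy

# Mirror of the diffed conversion: numpy -> long tensor on the target device with a
# trailing singleton dim, which is then squeezed away when building batch_for_gpt.
timestep_tensor = torch.from_numpy(timestep_batch).to(device).unsqueeze(-1).long()
batch_for_gpt = {'timestep': timestep_tensor.squeeze(-1)}
print(batch_for_gpt['timestep'].shape)  # torch.Size([4, 5])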