Commit 3346e08

Author: puyuan

fix(pu): fix start_pos *2 bug

1 parent ccb21f4 commit 3346e08
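
Summary (read from the diff below): in UniZero's world model each environment timestep occupies two tokens (obs, then action), while start_pos counts timesteps only. The RoPE table freqs_cis is indexed by token position, so start_pos must be doubled before slicing. The scalar and exception paths already multiplied by 2, but the tensor fast path did not, misaligning rotary positions whenever start_pos arrived as a batch of tensors. This commit adds the missing * 2, squeezes a stray singleton dimension out of the stacked freqs_cis slices, and doubles max_seq_len in the Atari config so the table covers token positions rather than timesteps.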

File tree: 2 files changed, +39 -26 lines

lzero/model/unizero_world_models/transformer.py (+14 -8)
@@ -50,11 +50,9 @@ def precompute_freqs_cis(dim: int, end: int, theta: float = 10000.0):
 
 
 def reshape_for_broadcast(freqs_cis: torch.Tensor, x: torch.Tensor):
+    # https://github.com/meta-llama/llama3/blob/main/llama/model.py#L61
     ndim = x.ndim
-    # print(f"freqs_cis shape: {freqs_cis.shape}, x shape: {x.shape}")
-    assert 0 <= 1 < ndim
-    shape = [d if i == 2 or i == ndim - 1 or i == 0 else 1 for i, d in enumerate(x.shape)]
-
+    shape = [d if i == ndim - 1 or i == 2 or i == 0 else 1 for i, d in enumerate(x.shape)]
     return freqs_cis.view(*shape)
 
 
@@ -66,7 +64,9 @@ def apply_rotary_emb(
     xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))
     xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))
     try:
+        # print(f"freqs_cis shape: {freqs_cis.shape}, xq_ shape: {xq_.shape}")
         freqs_cis = reshape_for_broadcast(freqs_cis, xq_)
+        # print(f"new freqs_cis shape: {freqs_cis.shape}")
     except Exception as e:
         print(e)
         print('We are at the reset timestep!')
@@ -137,25 +137,31 @@ def forward(self, sequences: torch.Tensor, past_keys_values: Optional[KeysValues
         # If RoPE is used, slice freqs_cis accordingly
         if self.config.rotary_emb:
             # Fix: if start_pos is a scalar, expand it to the same value across the current batch
-            # *2 because timestep only counts obs, while the sequence is obs + act
+            # t==========*2 because timestep only counts obs, while the sequence is obs + act==========
             if isinstance(start_pos, int) or isinstance(start_pos, float):
                 start_pos_tensor = torch.full((sequences.shape[0],), int(start_pos), device=sequences.device) * 2
             else:
                 # start_pos_tensor = torch.as_tensor(start_pos, device=sequences.device)
                 try:
-                    start_pos_tensor = torch.as_tensor([x.item() for x in start_pos], device=sequences.device)
+                    start_pos_tensor = torch.as_tensor([x.item() for x in start_pos], device=sequences.device) * 2
                 except Exception as e:
                     # print(e)
                     start_pos_tensor = torch.as_tensor(
                         [x.reshape(-1)[0].item() for x in start_pos],  # force-flatten, then take the first element
                         device=sequences.device
                     ) * 2
+
             # For each sample, take the freqs_cis slice for the interval given by its start_pos
             start_pos_tensor = torch.remainder(start_pos_tensor, self.config.max_seq_len)
             # Convert the per-sample start positions to a list
             start_pos_list = start_pos_tensor.tolist()
             freqs_cis_slices = [self.freqs_cis[int(pos): int(pos) + seqlen] for pos in start_pos_list]
             freqs_cis = torch.stack(freqs_cis_slices)
+
+            if freqs_cis.ndim == 3 and freqs_cis.shape[1] == 1:
+                # Convert shape [seq_len, 1, num_pairs] to [seq_len, num_pairs]
+                freqs_cis = freqs_cis.squeeze(1)
+            # print(f'165 freqs_cis.shape:{freqs_cis.shape}')
         else:
             freqs_cis = None
 
@@ -307,8 +313,8 @@ def forward(self, x: torch.Tensor, kv_cache: Optional[KeysValues] = None,
             for i in range(B):
                 mask[i] = self.mask[L:L + T, :L + T].clone()
                 mask[i, :, :(L - valid_context_lengths[i])] = 0  # Set invalid parts to 0.
-                # Adjust mask dimensions to match the last two dimensions of att.
-                # (B, T, L + T) -> (B, 1, T, L + T) -> (B, num_heads, T, L + T)
+            # Adjust mask dimensions to match the last two dimensions of att.
+            # (B, T, L + T) -> (B, 1, T, L + T) -> (B, num_heads, T, L + T)
             mask = mask.unsqueeze(1).expand(-1, att.size(1), -1, -1)
         else:
            # mask.shape: (T, L + T)
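
For orientation, here is a minimal, self-contained sketch of the per-sample slicing that the fixed branch performs. The precompute_freqs_cis helper below is modeled on the Llama 3 reference linked in the diff, not copied from this repository, and the sizes and offsets are made up for illustration:

    import torch

    def precompute_freqs_cis(dim: int, end: int, theta: float = 10000.0):
        # Complex rotations e^(i * t * theta_k), one row per token position t.
        freqs = 1.0 / (theta ** (torch.arange(0, dim, 2)[: dim // 2].float() / dim))
        t = torch.arange(end)
        freqs = torch.outer(t, freqs).float()
        return torch.polar(torch.ones_like(freqs), freqs)  # shape: (end, dim // 2)

    max_seq_len, head_dim, seqlen = 16, 8, 4
    table = precompute_freqs_cis(head_dim, max_seq_len)

    # start_pos counts environment timesteps (obs only); each timestep occupies
    # two tokens (obs, act), so the token-space offset is start_pos * 2.
    start_pos = torch.tensor([0, 3])                         # per-sample timestep offsets
    token_pos = torch.remainder(start_pos * 2, max_seq_len)  # tensor([0, 6])

    slices = [table[int(p): int(p) + seqlen] for p in token_pos]
    freqs_cis = torch.stack(slices)                          # (batch, seqlen, head_dim // 2)
    print(freqs_cis.shape)                                   # torch.Size([2, 4, 4])

Without the * 2, the second sample would read rotations for token positions 3..6 even though, by timestep 3, six tokens (3 obs + 3 act) have already been consumed, so previously cached keys and fresh queries would disagree about absolute position; that is the misalignment the commit title points at.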

zoo/atari/config/atari_unizero_config.py (+25 -18)
@@ -13,23 +13,24 @@ def main(env_id='PongNoFrameskip-v4', seed=0):
     game_segment_length = 20
     evaluator_env_num = 3
     num_simulations = 50
-    max_env_step = int(5e5)
+    max_env_step = int(4e5)
     batch_size = 64
     num_unroll_steps = 10
     infer_context_length = 4
     num_layers = 2
     replay_ratio = 0.25
 
-    # collector_env_num = 2
-    # game_segment_length = 20
-    # evaluator_env_num = 1
-    # num_simulations = 2
-    # max_env_step = int(5e5)
-    # batch_size = 2
-    # num_unroll_steps = 5
-    # infer_context_length = 2
-    # num_layers = 1
-    # replay_ratio = 0.1
+    # only for debug
+    collector_env_num = 2
+    game_segment_length = 20
+    evaluator_env_num = 1
+    num_simulations = 2
+    max_env_step = int(5e5)
+    batch_size = 2
+    num_unroll_steps = 5
+    infer_context_length = 2
+    num_layers = 1
+    replay_ratio = 0.1
     # ==============================================================
     # end of the most frequently changed config specified by the user
     # ==============================================================
@@ -44,16 +45,16 @@ def main(env_id='PongNoFrameskip-v4', seed=0):
             n_evaluator_episode=evaluator_env_num,
             manager=dict(shared_memory=False, ),
             # TODO: only for debug
-            # collect_max_episode_steps=int(50),
-            # eval_max_episode_steps=int(50),
+            collect_max_episode_steps=int(50),
+            eval_max_episode_steps=int(50),
         ),
         policy=dict(
             learn=dict(learner=dict(hook=dict(save_ckpt_after_iter=1000000, ), ), ),  # default is 10000
             model=dict(
                 observation_shape=(3, 96, 96),
                 action_space_size=action_space_size,
                 world_model_cfg=dict(
-                    policy_entropy_weight=5e-3,
+                    policy_entropy_weight=1e-4,
                     continuous_action_space=False,
                     max_blocks=num_unroll_steps,
                     max_tokens=2 * num_unroll_steps,  # NOTE: each timestep has 2 tokens: obs and action
@@ -69,7 +70,10 @@ def main(env_id='PongNoFrameskip-v4', seed=0):
                     # rotary_emb=False,
                     rotary_emb=True,
                     rope_theta=10000,
-                    max_seq_len=2048,
+                    # max_seq_len=2048,
+                    # max_seq_len=4096,
+                    max_seq_len=int(4096*2),
+
                 ),
             ),
             model_path=None,
@@ -78,8 +82,8 @@ def main(env_id='PongNoFrameskip-v4', seed=0):
             batch_size=batch_size,
             learning_rate=0.0001,
             num_simulations=num_simulations,
-            train_start_after_envsteps=2000,
-            # train_start_after_envsteps=0, # debug
+            # train_start_after_envsteps=2000,
+            train_start_after_envsteps=0,  # debug
             game_segment_length=game_segment_length,
             replay_buffer_size=int(1e6),
             eval_freq=int(5e3),
@@ -104,7 +108,10 @@ def main(env_id='PongNoFrameskip-v4', seed=0):
     atari_unizero_create_config = EasyDict(atari_unizero_create_config)
     create_config = atari_unizero_create_config
 
-    main_config.exp_name = f'data_unizero_20250211/{env_id[:-14]}/{env_id[:-14]}_uz_rope-mergemain_nlayer{num_layers}_gsl{game_segment_length}_rr{replay_ratio}_Htrain{num_unroll_steps}-Hinfer{infer_context_length}_bs{batch_size}_seed{seed}'
+    # main_config.exp_name = f'data_unizero_20250211/{env_id[:-14]}/{env_id[:-14]}_uz_posembed-mergemain_nlayer{num_layers}_gsl{game_segment_length}_rr{replay_ratio}_Htrain{num_unroll_steps}-Hinfer{infer_context_length}_bs{batch_size}_seed{seed}'
+    # main_config.exp_name = f'data_unizero_20250211/{env_id[:-14]}/{env_id[:-14]}_uz_rope-mergemain-msl4096*2_nlayer{num_layers}_gsl{game_segment_length}_rr{replay_ratio}_Htrain{num_unroll_steps}-Hinfer{infer_context_length}_bs{batch_size}_seed{seed}'
+    main_config.exp_name = f'data_unizero_20250211_debug/{env_id[:-14]}/{env_id[:-14]}_uz_rope-mergemain-msl4096*2_nlayer{num_layers}_gsl{game_segment_length}_rr{replay_ratio}_Htrain{num_unroll_steps}-Hinfer{infer_context_length}_bs{batch_size}_seed{seed}'
+
     from lzero.entry import train_unizero
     train_unizero([main_config, create_config], seed=seed, model_path=main_config.policy.model_path, max_env_step=max_env_step)
 
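
The max_seq_len bump to int(4096*2) in this config follows the same bookkeeping as the transformer fix: the freqs_cis table is indexed by token position, and each timestep costs two tokens. Because forward wraps offsets with torch.remainder(start_pos_tensor, max_seq_len), a table of N entries yields unique rotary positions for only N/2 timesteps before aliasing. A small illustrative check (config values from above; the start_pos offset of 3000 is hypothetical):

    max_tokens = 2 * 10               # num_unroll_steps = 10 in the non-debug config
    old_table, new_table = 4096, int(4096 * 2)

    start_pos = 3000                  # hypothetical timestep offset (obs count only)
    token_pos = start_pos * 2         # two tokens per timestep -> 6000

    print(token_pos % old_table)      # 1904 -> wrapped: aliases with timestep 952
    print(token_pos % new_table)      # 6000 -> still a unique rotary position

With the previous max_seq_len=2048 the wrap would occur after only 1024 timesteps, so doubling to 8192 entries keeps episodes of up to 4096 timesteps alias-free.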