polish(pu): add rope_embed support for cartpole

puyuan · puyuan · commit 6cd57ddb5433 · 2025-02-11T17:15:52.000+08:00
diff --git a/lzero/mcts/buffer/game_buffer_unizero.py b/lzero/mcts/buffer/game_buffer_unizero.py
@@ -427,15 +427,15 @@ def _compute_target_reward_value(self, reward_value_context: List[Any], model: A
             m_output = model.initial_inference(m_obs, action_batch, start_pos=step_index_batch)  # TODO: step_index
             # ======================================================================
 
-            if not model.training:
-                # if not in training, obtain the scalars of the value/reward
-                [m_output.latent_state, m_output.value, m_output.policy_logits] = to_detach_cpu_numpy(
-                    [
-                        m_output.latent_state,
-                        inverse_scalar_transform(m_output.value, self._cfg.model.support_scale),
-                        m_output.policy_logits
-                    ]
-                )
+            # if not model.training:
+            # if not in training, obtain the scalars of the value/reward
+            [m_output.latent_state, m_output.value, m_output.policy_logits] = to_detach_cpu_numpy(
+                [
+                    m_output.latent_state,
+                    inverse_scalar_transform(m_output.value, self._cfg.model.support_scale),
+                    m_output.policy_logits
+                ]
+            )
             network_output.append(m_output)
 
             # concat the output slices after model inference
diff --git a/lzero/model/unizero_world_models/transformer.py b/lzero/model/unizero_world_models/transformer.py
@@ -33,11 +33,47 @@ class TransformerConfig:
     # for RoPE
     rope_theta: float
     max_seq_len: int
+    rotary_emb: bool = False  # config option controlling whether rotary_emb is used
+
     @property
     def max_tokens(self):
         return self.tokens_per_block * self.max_blocks
 
 
+
+def precompute_freqs_cis(dim: int, end: int, theta: float = 10000.0):  # builds the complex rotation table used by apply_rotary_emb
+    freqs = 1.0 / (theta ** (torch.arange(0, dim, 2)[: (dim // 2)].float() / dim))  # dim // 2 inverse frequencies: theta ** (-2i / dim)
+    t = torch.arange(end, device=freqs.device, dtype=torch.float32)  # positions 0 .. end - 1
+    freqs = torch.outer(t, freqs)  # (end, dim // 2) angles: position * frequency
+    freqs_cis = torch.polar(torch.ones_like(freqs), freqs)  # complex64; magnitude 1, phase = angle
+    return freqs_cis
+
+
+def reshape_for_broadcast(freqs_cis: torch.Tensor, x: torch.Tensor):  # views freqs_cis so it can broadcast against x
+    ndim = x.ndim
+    # assumes x is (batch, heads, seq, head_dim // 2), i.e. axes 0 and 2 are batch and sequence -- TODO confirm against caller
+    assert 0 <= 1 < ndim  # effectively requires x.ndim >= 2
+    shape = [d if i == 2 or i == ndim - 1 or i == 0 else 1 for i, d in enumerate(x.shape)]  # keep x's sizes on axes 0, 2 and last; all other axes become 1
+
+    return freqs_cis.view(*shape)  # .view raises if freqs_cis cannot be viewed with this shape
+
+
+def apply_rotary_emb(
+    xq: torch.Tensor,
+    xk: torch.Tensor,
+    freqs_cis: torch.Tensor,
+) -> Tuple[torch.Tensor, torch.Tensor]:  # multiplies queries/keys by freqs_cis; returns (xq_out, xk_out) cast back to the input dtypes
+    xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))  # pair adjacent channels of the last axis into complex numbers
+    xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))
+    try:
+        freqs_cis = reshape_for_broadcast(freqs_cis, xq_)
+    except Exception:  # was a bare `except:`, which also swallowed KeyboardInterrupt / SystemExit
+        print('We are at the reset timestep!')  # best-effort: continue with freqs_cis left in its incoming shape
+    xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(-2)  # complex product, then unpack (real, imag) pairs back into the last axis
+    xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(-2)
+    return xq_out.type_as(xq), xk_out.type_as(xk)
+
+
 class Transformer(nn.Module):
     """
     Transformer model class.
@@ -59,11 +95,14 @@ def __init__(self, config: TransformerConfig) -> None:
         self.blocks = nn.ModuleList([Block(config) for _ in range(config.num_layers)])
         self.ln_f = nn.LayerNorm(config.embed_dim)
 
-        self.freqs_cis = precompute_freqs_cis(
-            self.config.embed_dim // self.config.num_heads,
-            self.config.max_seq_len * 2,
-            self.config.rope_theta,
-        )
+        # Register as a buffer so that device transfers are managed automatically
+        if self.config.rotary_emb:
+            freqs_cis = precompute_freqs_cis(
+                self.config.embed_dim // self.config.num_heads,
+                self.config.max_seq_len * 2,
+                self.config.rope_theta,
+            )
+            self.register_buffer("freqs_cis", freqs_cis)
 
     def generate_empty_keys_values(self, n: int, max_tokens: int) -> KeysValues:
         """
@@ -93,24 +132,31 @@ def forward(self, sequences: torch.Tensor, past_keys_values: Optional[KeysValues
             - torch.Tensor: Output tensor of shape (batch_size, seq_length, embed_dim).
         """
         seqlen = sequences.shape[1]
-        self.freqs_cis = self.freqs_cis.to(sequences.device)
 
-        # freqs_cis = self.freqs_cis[start_pos: start_pos + seqlen]
-
-        # If the start position is greater than the predefined maximum sequence length, wrap around
-        start_pos = torch.tensor(np.array(start_pos))
-        if len(start_pos.shape) > 1:
-            # TODO: train start pos [0]
-            start_pos = torch.remainder(start_pos, self.config.max_seq_len)[:,0]
+        # If RoPE is used, slice freqs_cis
+        if self.config.rotary_emb:
+            # Fix: if start_pos is a scalar, expand it to the same value for every sample in the current batch
+            # The *2 is because step_index only counts obs, whereas the sequence is obs, act
+            if isinstance(start_pos, int) or isinstance(start_pos, float):
+                start_pos_tensor = torch.full((sequences.shape[0],), int(start_pos), device=sequences.device) * 2
+            else:
+                # start_pos_tensor = torch.as_tensor(start_pos, device=sequences.device)
+                try:
+                    start_pos_tensor = torch.as_tensor([x.item() for x in start_pos], device=sequences.device)
+                except Exception as e:
+                    # print(e)
+                    start_pos_tensor = torch.as_tensor(
+                        [x.reshape(-1)[0].item() for x in start_pos],  # force-flatten, then take the first element
+                        device=sequences.device
+                    ) * 2
+            # For each sample, take the freqs_cis range that corresponds to its start_pos
+            start_pos_tensor = torch.remainder(start_pos_tensor, self.config.max_seq_len)
+            # Convert the per-sample start_pos values to a list
+            start_pos_list = start_pos_tensor.tolist()
+            freqs_cis_slices = [self.freqs_cis[int(pos): int(pos) + seqlen] for pos in start_pos_list]
+            freqs_cis = torch.stack(freqs_cis_slices)
         else:
-            start_pos = torch.remainder(start_pos, self.config.max_seq_len)
-
-        start_pos_list = torch.unbind(start_pos)
-        try:
-            freqs_cis_slices = [self.freqs_cis[int(pos.item()): int(pos.item()) + seqlen] for pos in start_pos_list]
-        except:
-            print('debug')
-        freqs_cis = torch.stack(freqs_cis_slices).squeeze(1)
+            freqs_cis = None
 
         assert past_keys_values is None or len(past_keys_values) == len(self.blocks)
         x = self.drop(sequences)
@@ -181,42 +227,6 @@ def forward(self, x: torch.Tensor, past_keys_values: Optional[KeysValues] = None
         return x
 
 
-def precompute_freqs_cis(dim: int, end: int, theta: float = 10000.0):
-    freqs = 1.0 / (theta ** (torch.arange(0, dim, 2)[: (dim // 2)].float() / dim))
-    t = torch.arange(end, device=freqs.device, dtype=torch.float32)
-    freqs = torch.outer(t, freqs)
-    freqs_cis = torch.polar(torch.ones_like(freqs), freqs)  # complex64
-    return freqs_cis
-
-
-def reshape_for_broadcast(freqs_cis: torch.Tensor, x: torch.Tensor):
-    ndim = x.ndim
-    # print(f"freqs_cis shape: {freqs_cis.shape}, x shape: {x.shape}")
-    assert 0 <= 1 < ndim
-    # assert freqs_cis.shape == (x.shape[2], x.shape[-1])
-    # shape = [d if i == 2 or i == ndim - 1 else 1 for i, d in enumerate(x.shape)]
-    # TODO: check
-    shape = [d if i == 2 or i == ndim - 1 or i == 0 else 1 for i, d in enumerate(x.shape)]
-
-    return freqs_cis.view(*shape)
-
-
-def apply_rotary_emb(
-    xq: torch.Tensor,
-    xk: torch.Tensor,
-    freqs_cis: torch.Tensor,
-) -> Tuple[torch.Tensor, torch.Tensor]:
-    xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))
-    xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))
-    try:
-        freqs_cis = reshape_for_broadcast(freqs_cis, xq_)
-    except:
-        print('We are at the reset timestep!')
-    xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(-2)
-    xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(-2)
-    return xq_out.type_as(xq), xk_out.type_as(xk)
-
-
 class SelfAttention(nn.Module):
     """
     Implements self-attention mechanism for transformers.
diff --git a/lzero/model/unizero_world_models/world_model.py b/lzero/model/unizero_world_models/world_model.py
@@ -59,7 +59,6 @@ def __init__(self, config: TransformerConfig, tokenizer) -> None:
         if not self.config.rotary_emb:
             self.pos_emb = nn.Embedding(config.max_tokens, config.embed_dim, device=self.device)
             self.precompute_pos_emb_diff_kv()
-
             print(f"self.pos_emb.weight.device: {self.pos_emb.weight.device}")
 
         # Initialize action embedding table
@@ -488,7 +487,8 @@ def refresh_kvs_with_initial_latent_state_for_init_infer(self, latent_state: tor
                     self.keys_values_wm_size_list_current = self.trim_and_pad_kv_cache(is_init_infer=True)
 
                     buffer_action = buffer_action[:ready_env_num]
-                    # TODO
+                    
+                    # TODO: the order may be wrong?
                     start_pos = start_pos[:ready_env_num]
 
                     # if ready_env_num < self.env_num:
diff --git a/lzero/policy/unizero.py b/lzero/policy/unizero.py
@@ -723,11 +723,11 @@ def _forward_eval(self, data: torch.Tensor, action_mask: list, to_play: int = -1
             network_output = self._eval_model.initial_inference(self.last_batch_obs, self.last_batch_action, data, step_index)
             latent_state_roots, reward_roots, pred_values, policy_logits = mz_network_output_unpack(network_output)
 
-            if not self._eval_model.training:
-                # if not in training, obtain the scalars of the value/reward
-                pred_values = self.inverse_scalar_transform_handle(pred_values).detach().cpu().numpy()  # shape（B, 1）
-                latent_state_roots = latent_state_roots.detach().cpu().numpy()
-                policy_logits = policy_logits.detach().cpu().numpy().tolist()  # list shape（B, A）
+            # if not self._eval_model.training:
+            # if not in training, obtain the scalars of the value/reward
+            pred_values = self.inverse_scalar_transform_handle(pred_values).detach().cpu().numpy()  # shape（B, 1）
+            latent_state_roots = latent_state_roots.detach().cpu().numpy()
+            policy_logits = policy_logits.detach().cpu().numpy().tolist()  # list shape（B, A）
 
             legal_actions = [[i for i, x in enumerate(action_mask[j]) if x == 1] for j in range(active_eval_env_num)]
             if self._cfg.mcts_ctree:
diff --git a/zoo/classic_control/cartpole/config/cartpole_unizero_config.py b/zoo/classic_control/cartpole/config/cartpole_unizero_config.py
@@ -9,15 +9,14 @@
 update_per_collect = None
 replay_ratio = 0.25
 max_env_step = int(2e5)
-reanalyze_ratio = 0
 batch_size = 256
 num_unroll_steps = 5
 # ==============================================================
 # end of the most frequently changed config specified by the user
 # ==============================================================
 
 cartpole_unizero_config = dict(
-    exp_name=f'data_unizero/cartpole_unizero_ns{num_simulations}_upc{update_per_collect}-rr{replay_ratio}_rer{reanalyze_ratio}_H{num_unroll_steps}_bs{batch_size}_seed0',
+    exp_name=f'data_unizero_debug/cartpole_unizero_pos-embed_ns{num_simulations}_upc{update_per_collect}-rr{replay_ratio}_H{num_unroll_steps}_bs{batch_size}_seed0',
     env=dict(
         env_name='CartPole-v0',
         continuous=False,
@@ -40,16 +39,21 @@
                 max_tokens=2 * 10,
                 context_length=2 * 4,
                 context_length_for_recurrent=2 * 4,
-                device='cpu',
+                device='cuda',
                 action_space_size=2,
                 num_layers=2,
                 num_heads=2,
                 embed_dim=64,
-                env_num=collector_env_num,
+                env_num=max(collector_env_num, evaluator_env_num),
                 collector_env_num=collector_env_num,
                 evaluator_env_num=evaluator_env_num,
                 obs_type='vector',
                 norm_type='BN',
+                # for RoPE
+                rotary_emb=False,
+                # rotary_emb=True,
+                rope_theta=10000,
+                max_seq_len=2048,
             ),
         ),
         # (str) The path of the pretrained model. If None, the model will be initialized by the default model.
@@ -67,7 +71,6 @@
         target_update_freq=100,
         grad_clip_value=5,
         num_simulations=num_simulations,
-        reanalyze_ratio=reanalyze_ratio,
         n_episode=n_episode,
         eval_freq=int(1e3),
         replay_buffer_size=int(1e6),
diff --git a/zoo/classic_control/cartpole/envs/cartpole_lightzero_env.py b/zoo/classic_control/cartpole/envs/cartpole_lightzero_env.py
@@ -51,6 +51,8 @@ def __init__(self, cfg: dict = {}) -> None:
         self._action_space = gym.spaces.Discrete(2)
         self._action_space.seed(0)  # default seed
         self._reward_space = gym.spaces.Box(low=0.0, high=1.0, shape=(1,), dtype=np.float32)
+        self.step_index = 0
+
 
     def reset(self) -> Dict[str, np.ndarray]:
         """
@@ -86,7 +88,9 @@ def reset(self) -> Dict[str, np.ndarray]:
         obs = to_ndarray(obs)
 
         action_mask = np.ones(self.action_space.n, 'int8')
-        obs = {'observation': obs, 'action_mask': action_mask, 'to_play': -1}
+        self.step_index = 0
+
+        obs = {'observation': obs, 'action_mask': action_mask, 'to_play': -1, 'step_index': self.step_index}
 
         return obs
 
@@ -120,7 +124,9 @@ def step(self, action: Union[int, np.ndarray]) -> BaseEnvTimestep:
             info['eval_episode_return'] = self._eval_episode_return
 
         action_mask = np.ones(self.action_space.n, 'int8')
-        obs = {'observation': obs, 'action_mask': action_mask, 'to_play': -1}
+        self.step_index += 1
+
+        obs = {'observation': obs, 'action_mask': action_mask, 'to_play': -1, 'step_index': self.step_index}
 
         return BaseEnvTimestep(obs, rew, done, info)