
Commit ed9d7c1

Author: puyuan
feature(pu): add analysis_dormant_ratio_weight_rank option in single-task setting
Parent: 3a25c08

File tree

4 files changed: +214 -31 lines changed


lzero/model/unizero_world_models/world_model.py (+47 -4)
@@ -12,7 +12,7 @@
 from torch.distributions import TransformedDistribution, TanhTransform

 from lzero.model.common import SimNorm
-from lzero.model.utils import cal_dormant_ratio
+from lzero.model.utils import cal_dormant_ratio, compute_average_weight_magnitude, cal_effective_rank
 from .kv_caching import KeysValues
 from .slicer import Head, PolicyHeadCont
 from .tokenizer import Tokenizer
@@ -97,6 +97,14 @@ def __init__(self, config: TransformerConfig, tokenizer) -> None:
         self.head_policy = self._create_head(self.value_policy_tokens_pattern, self.action_space_size)
         self.head_value = self._create_head(self.value_policy_tokens_pattern, self.support_size)

+        # For the heads, collect every submodule whose name starts with "head_"
+        self.head_modules = {}
+        for name, module in self.named_children():
+            if name.startswith("head_"):
+                self.head_modules[name] = module
+        if self.head_modules:
+            self.head_modules = nn.ModuleDict(self.head_modules)
+
         # Apply weight initialization, the order is important
         self.apply(lambda module: init_weights(module, norm_type=self.config.norm_type))
         self._initialize_last_layer()
@@ -259,7 +267,7 @@ def _initialize_config_parameters(self) -> None:
         self.gamma = self.config.gamma
         self.context_length = self.config.context_length
         self.dormant_threshold = self.config.dormant_threshold
-        self.analysis_dormant_ratio = self.config.analysis_dormant_ratio
+        self.analysis_dormant_ratio_weight_rank = self.config.analysis_dormant_ratio_weight_rank
         self.num_observations_tokens = self.config.tokens_per_block - 1
         self.latent_recon_loss_weight = self.config.latent_recon_loss_weight
         self.perceptual_loss_weight = self.config.perceptual_loss_weight
@@ -1149,18 +1157,43 @@ def compute_loss(self, batch, target_tokenizer: Tokenizer = None, inverse_scalar
         # self.save_as_image_with_timestep(batch['observations'], suffix='visual_match_memlen1-60-15_tsne')

         # ========= logging for analysis =========
-        if self.analysis_dormant_ratio:
+        if self.analysis_dormant_ratio_weight_rank:
             # Calculate dormant ratio of the encoder
             shape = batch['observations'].shape  # (..., C, H, W)
             inputs = batch['observations'].contiguous().view(-1, *shape[-3:])  # (32,5,3,64,64) -> (160,3,64,64)
             dormant_ratio_encoder = cal_dormant_ratio(self.tokenizer.encoder, inputs.detach(),
                                                       dormant_threshold=self.dormant_threshold)
             dormant_ratio_encoder = dormant_ratio_encoder['global']
+
+            # Compute the global average weight magnitude of each part of the model
+            avg_weight_mag_encoder = compute_average_weight_magnitude(self.tokenizer.encoder)
+            avg_weight_mag_transformer = compute_average_weight_magnitude(self.transformer)
+            avg_weight_mag_head = compute_average_weight_magnitude(self.head_modules)
+
+            # Compute the effective rank of the encoder's representation layers; the layer
+            # names must be resolvable via self.tokenizer.encoder.named_modules()
+            e_rank_last_linear = cal_effective_rank(self.tokenizer.encoder, inputs, representation_layer_name="last_linear")
+            e_rank_sim_norm = cal_effective_rank(self.tokenizer.encoder, inputs, representation_layer_name="sim_norm")
+
             self.past_kv_cache_recurrent_infer.clear()
             self.keys_values_wm_list.clear()
             torch.cuda.empty_cache()
         else:
             dormant_ratio_encoder = torch.tensor(0.)
+            avg_weight_mag_encoder = torch.tensor(0.)
+            avg_weight_mag_transformer = torch.tensor(0.)
+            avg_weight_mag_head = torch.tensor(0.)
+            e_rank_last_linear = torch.tensor(0.)
+            e_rank_sim_norm = torch.tensor(0.)

         # Calculate the L2 norm of the latent state roots
         latent_state_l2_norms = torch.norm(obs_embeddings, p=2, dim=2).mean()
@@ -1228,7 +1261,7 @@ def compute_loss(self, batch, target_tokenizer: Tokenizer = None, inverse_scalar
         outputs = self.forward({'obs_embeddings_and_act_tokens': (obs_embeddings, act_tokens)})

         # ========= logging for analysis =========
-        if self.analysis_dormant_ratio:
+        if self.analysis_dormant_ratio_weight_rank:
             # Calculate dormant ratio of the world model
             dormant_ratio_world_model = cal_dormant_ratio(self, {
                 'obs_embeddings_and_act_tokens': (obs_embeddings.detach(), act_tokens.detach())},
@@ -1396,6 +1429,11 @@ def compute_loss(self, batch, target_tokenizer: Tokenizer = None, inverse_scalar
             dormant_ratio_encoder=dormant_ratio_encoder,
             dormant_ratio_transformer=dormant_ratio_transformer,
             dormant_ratio_head=dormant_ratio_head,
+            avg_weight_mag_encoder=avg_weight_mag_encoder,
+            avg_weight_mag_transformer=avg_weight_mag_transformer,
+            avg_weight_mag_head=avg_weight_mag_head,
+            e_rank_last_linear=e_rank_last_linear,
+            e_rank_sim_norm=e_rank_sim_norm,
             latent_state_l2_norms=latent_state_l2_norms,
             policy_mu=mu,
             policy_sigma=sigma,
@@ -1419,6 +1457,11 @@ def compute_loss(self, batch, target_tokenizer: Tokenizer = None, inverse_scalar
             last_step_losses=last_step_losses,
             dormant_ratio_transformer=dormant_ratio_transformer,
             dormant_ratio_head=dormant_ratio_head,
+            avg_weight_mag_encoder=avg_weight_mag_encoder,
+            avg_weight_mag_transformer=avg_weight_mag_transformer,
+            avg_weight_mag_head=avg_weight_mag_head,
+            e_rank_last_linear=e_rank_last_linear,
+            e_rank_sim_norm=e_rank_sim_norm,
             latent_state_l2_norms=latent_state_l2_norms,
         )

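Note: the "head_*" grouping added to __init__ above is what lets compute_average_weight_magnitude and cal_dormant_ratio (below) treat all prediction heads as a single module. A minimal, self-contained sketch of the pattern; ToyModel and its layers are hypothetical stand-ins, not part of this commit:

import torch.nn as nn

class ToyModel(nn.Module):
    def __init__(self):
        super().__init__()
        self.backbone = nn.Linear(8, 8)
        self.head_policy = nn.Linear(8, 4)
        self.head_value = nn.Linear(8, 1)
        # Collect every child whose name starts with "head_" into one ModuleDict
        heads = {name: m for name, m in self.named_children() if name.startswith("head_")}
        if heads:
            self.head_modules = nn.ModuleDict(heads)

model = ToyModel()
print(list(model.head_modules.keys()))  # ['head_policy', 'head_value']

The ModuleDict holds references to the existing children rather than copies, and PyTorch deduplicates shared parameters in model.parameters(), so the heads are not double-counted.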
lzero/model/utils.py (+129 -10)
@@ -9,6 +9,54 @@
 import torch
 import torch.nn as nn

+###############################
+# 1. Compute average_weight_magnitude
+###############################
+def compute_average_weight_magnitude(model: nn.Module) -> float:
+    """
+    Compute the average absolute value of all parameters in the model.
+
+    Arguments:
+        model: the model to evaluate, an nn.Module.
+
+    Returns:
+        The average weight magnitude (float).
+    """
+    num_weights = 0
+    # Use the device of the model's first parameter so the computation stays on one device
+    device = next(model.parameters()).device
+    sum_weight_magnitude = torch.tensor(0.0, device=device)
+
+    for p in model.parameters():
+        num_weights += p.numel()
+        sum_weight_magnitude += torch.sum(torch.abs(p))
+
+    if num_weights == 0:
+        return 0.0
+    return sum_weight_magnitude.cpu().item() / num_weights
+
+###############################
+# 2. Compute effective_rank
+###############################
+def compute_effective_rank(singular_values: np.ndarray) -> float:
+    """
+    Compute the effective rank from an array of singular values, using:
+        effective_rank = exp( - sum_i [p_i * log(p_i)] )
+    where the p_i are the normalized singular values (p_i = s_i / sum_j s_j).
+
+    Arguments:
+        singular_values: the array of singular values, an np.ndarray.
+
+    Returns:
+        The effective rank (float).
+    """
+    norm_sv = singular_values / np.sum(np.abs(singular_values))
+    entropy = 0.0
+    for p in norm_sv:
+        if p > 0.0:
+            entropy -= p * np.log(p)
+    return np.e ** entropy
+

 # A hook class used to capture the outputs of intermediate layers
 class IntermediateOutputHook:
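A quick sanity check on the effective-rank formula above: when a matrix has k equal nonzero singular values, the normalized values are 1/k each, the entropy is log(k), and the effective rank is exactly k. A standalone numeric sketch (hypothetical values, not part of the commit):

import numpy as np

sv = np.array([2.0, 2.0, 2.0, 0.0])    # k = 3 equal nonzero singular values
p = sv / np.sum(np.abs(sv))            # normalized: [1/3, 1/3, 1/3, 0]
entropy = -sum(x * np.log(x) for x in p if x > 0.0)
print(np.e ** entropy)                 # ~3.0, i.e. effective rank 3

Conversely, one dominant singular value drives the effective rank toward 1, which is why a falling e_rank_* curve indicates collapsing diversity in the latent representation.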
@@ -22,6 +70,73 @@ def __call__(self, module: nn.Module, input: Tuple[torch.Tensor], output: torch.
         # Detach here to avoid interfering with backpropagation, and move to CPU for later statistics
         self.outputs.append(output.detach().cpu())

+def cal_effective_rank(
+        model: nn.Module,
+        inputs: Union[torch.Tensor, List[torch.Tensor]],
+        representation_layer_name: str,
+) -> float:
+    """
+    Capture the output of a specified intermediate layer (the representation layer)
+    of the model via a hook, and compute its effective rank.
+
+    Arguments:
+        model: the model to evaluate, an nn.Module.
+        inputs: the inputs for model.forward, a tensor or a list of tensors.
+        representation_layer_name: the name of the representation layer in the model;
+            it must be resolvable via model.named_modules().
+
+    Returns:
+        The effective rank (float).
+    """
+    # Look up the representation-layer module (raise KeyError if the name is missing)
+    module_dict = dict(model.named_modules())
+    if representation_layer_name not in module_dict:
+        raise KeyError(f"Representation layer '{representation_layer_name}' not found in model.named_modules().")
+    representation_module = module_dict[representation_layer_name]
+
+    # Register the hook
+    hook = IntermediateOutputHook()
+    handle = representation_module.register_forward_hook(hook)
+
+    # Run the forward pass
+    model.eval()
+    with torch.no_grad():
+        if isinstance(inputs, (list, tuple)):
+            _ = model(*inputs)
+        else:
+            _ = model(inputs)
+
+    # Remove the hook to avoid memory leaks
+    handle.remove()
+
+    if not hook.outputs:
+        raise RuntimeError("No outputs captured from the representation layer.")
+
+    # One or more forward passes may have been captured (e.g. batched or repeated calls);
+    # concatenate all captured outputs along the batch dimension
+    if len(hook.outputs) > 1:
+        rep_tensor = torch.cat(hook.outputs, dim=0)
+    else:
+        rep_tensor = hook.outputs[0]
+
+    # Flatten the representation into a 2D matrix: (samples, features)
+    rep_tensor = rep_tensor.view(rep_tensor.size(0), -1)
+
+    # Convert to a numpy array for numpy.linalg.svd
+    rep_np = rep_tensor.cpu().numpy()
+
+    # Compute the singular values
+    singular_values = np.linalg.svd(rep_np, full_matrices=False, compute_uv=False)
+
+    # Compute the effective rank
+    e_rank = compute_effective_rank(singular_values)
+
+    # Clear the hook storage (keeps state clean across repeated calls)
+    hook.outputs.clear()
+    return e_rank
+
+
 def compute_dormant_stats(outputs: List[torch.Tensor], threshold: float) -> Tuple[int, int]:
     """
     Element-wise statistics over a group of outputs (the same layer may run forward multiple times).
@@ -70,18 +185,22 @@ def cal_dormant_ratio(
         parts["encoder"] = model.encoder
     if hasattr(model, "transformer"):
         parts["transformer"] = model.transformer
+
     # For the head part, use the "head_*" submodules collected by the model itself
-    head_modules = {}
-    for name, module in model.named_children():
-        if name.startswith("head_"):
-            head_modules[name] = module
+    if hasattr(model, "head_modules"):
+        parts["head"] = model.head_modules

-    if head_modules:
-        parts["head"] = nn.ModuleDict(head_modules)
-    if not hasattr(model, "encoder") and not hasattr(model, "transformer") and not hasattr(model, "head"):
-        parts["model"] = model

-    # Target module types to capture
+    # Target module types to capture  TODO: add more module types
     target_modules = (nn.Conv2d, nn.Linear)

     # Store the hooks of each part (dict: part name -> list of (module_name, hook))
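A behavioral consequence of this hunk worth noting: cal_dormant_ratio no longer wraps an arbitrary model as a fallback, so only modules exposing encoder, transformer, or head_modules attributes contribute parts to the analysis. A hedged sketch of just the selection logic (select_parts is a toy name, not the library API):

import torch.nn as nn

def select_parts(model: nn.Module) -> dict:
    # Mirrors the attribute-based selection above
    parts = {}
    if hasattr(model, "encoder"):
        parts["encoder"] = model.encoder
    if hasattr(model, "transformer"):
        parts["transformer"] = model.transformer
    if hasattr(model, "head_modules"):
        parts["head"] = model.head_modules
    return parts

print(select_parts(nn.Linear(4, 4)))  # {} -> no part is hooked for such a model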
@@ -117,7 +236,7 @@ def cal_dormant_ratio(
         for full_name, hook in hooks:
             layer_total, layer_dormant = compute_dormant_stats(hook.outputs, dormant_threshold)
             # A per-layer log could be printed here, or more detailed statistics recorded
-            print(f"{full_name}: {layer_dormant}/{layer_total} -> {layer_dormant / layer_total * 100.0 if layer_total > 0 else 0.0}%")
+            # print(f"{full_name}: {layer_dormant}/{layer_total} -> {layer_dormant / layer_total * 100.0 if layer_total > 0 else 0.0}%")
             part_total += layer_total
             part_dormant += layer_dormant
         if part_total > 0:

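Putting the two new utilities together: a hedged usage sketch on a toy encoder (TinyEncoder and the layer name "last_linear" are illustrative stand-ins; the import path is the one this commit adds to world_model.py):

import torch
import torch.nn as nn
from lzero.model.utils import compute_average_weight_magnitude, cal_effective_rank

class TinyEncoder(nn.Module):
    def __init__(self):
        super().__init__()
        self.conv = nn.Conv2d(3, 8, kernel_size=3, padding=1)
        self.last_linear = nn.Linear(8 * 64 * 64, 32)

    def forward(self, x):
        h = torch.relu(self.conv(x))
        return self.last_linear(h.flatten(1))

encoder = TinyEncoder()
x = torch.randn(16, 3, 64, 64)

avg_mag = compute_average_weight_magnitude(encoder)  # one float over all parameters
e_rank = cal_effective_rank(encoder, x, representation_layer_name="last_linear")
print(avg_mag, e_rank)

The effective rank is bounded by min(batch size, feature dimension), so a batch that is too small caps the measurable rank; here e_rank is at most 16.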
lzero/policy/unizero.py (+25 -5)
@@ -80,8 +80,8 @@ class UniZeroPolicy(MuZeroPolicy):
             device='cpu',
             # (bool) Whether to analyze simulation normalization.
             analysis_sim_norm=False,
-            # (bool) Whether to analyze dormant ratio.
-            analysis_dormant_ratio=False,
+            # (bool) Whether to analyze the dormant ratio, the average weight magnitude of the networks, and the effective rank of the latent.
+            analysis_dormant_ratio_weight_rank=False,
             # (int) The shape of the action space.
             action_space_size=6,
             # (int) The size of the group, related to simulation normalization.
@@ -119,7 +119,7 @@ class UniZeroPolicy(MuZeroPolicy):
             # (float) The discount factor for future rewards.
             gamma=1,
             # (float) The threshold for a dormant neuron.
-            dormant_threshold=0.025,
+            dormant_threshold=0.01,
         ),
     ),
     # ****** common ******
@@ -415,8 +415,11 @@ def _forward_learn(self, data: Tuple[torch.Tensor]) -> Dict[str, Union[float, in
         )

         weighted_total_loss = losses.loss_total
-        for loss_name, loss_value in losses.intermediate_losses.items():
-            self.intermediate_losses[f"{loss_name}"] = loss_value
+        # Merge the intermediate_losses dict in one step instead of assigning entry by entry
+        self.intermediate_losses.update(losses.intermediate_losses)

         obs_loss = self.intermediate_losses['loss_obs']
         reward_loss = self.intermediate_losses['loss_rewards']
@@ -432,6 +435,11 @@ def _forward_learn(self, data: Tuple[torch.Tensor]) -> Dict[str, Union[float, in
         dormant_ratio_encoder = self.intermediate_losses['dormant_ratio_encoder']
         dormant_ratio_transformer = self.intermediate_losses['dormant_ratio_transformer']
         dormant_ratio_head = self.intermediate_losses['dormant_ratio_head']
+        avg_weight_mag_encoder = self.intermediate_losses['avg_weight_mag_encoder']
+        avg_weight_mag_transformer = self.intermediate_losses['avg_weight_mag_transformer']
+        avg_weight_mag_head = self.intermediate_losses['avg_weight_mag_head']
+        e_rank_last_linear = self.intermediate_losses['e_rank_last_linear']
+        e_rank_sim_norm = self.intermediate_losses['e_rank_sim_norm']
         latent_state_l2_norms = self.intermediate_losses['latent_state_l2_norms']

         assert not torch.isnan(losses.loss_total).any(), "Loss contains NaN values"
@@ -515,6 +523,12 @@ def _forward_learn(self, data: Tuple[torch.Tensor]) -> Dict[str, Union[float, in
             'analysis/dormant_ratio_transformer': dormant_ratio_transformer,#.item(),
             'analysis/dormant_ratio_head': dormant_ratio_head,#.item(),

+            'analysis/avg_weight_mag_encoder': avg_weight_mag_encoder,
+            'analysis/avg_weight_mag_transformer': avg_weight_mag_transformer,
+            'analysis/avg_weight_mag_head': avg_weight_mag_head,
+            'analysis/e_rank_last_linear': e_rank_last_linear,
+            'analysis/e_rank_sim_norm': e_rank_sim_norm,
+
             'analysis/latent_state_l2_norms': latent_state_l2_norms.item(),
             'analysis/l2_norm_before': self.l2_norm_before,
             'analysis/l2_norm_after': self.l2_norm_after,
@@ -896,6 +910,12 @@ def _monitor_vars_learn(self) -> List[str]:
             'analysis/dormant_ratio_transformer',
             'analysis/dormant_ratio_head',

+            'analysis/avg_weight_mag_encoder',
+            'analysis/avg_weight_mag_transformer',
+            'analysis/avg_weight_mag_head',
+            'analysis/e_rank_last_linear',
+            'analysis/e_rank_sim_norm',
+
             'analysis/latent_state_l2_norms',
             'analysis/l2_norm_before',
             'analysis/l2_norm_after',

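To actually produce these curves in a single-task run, the new flag has to be switched on in the policy config. A hypothetical excerpt (the nesting follows the default_config shown above, but field placement in a real experiment config may differ):

from easydict import EasyDict

unizero_config = EasyDict(dict(
    policy=dict(
        model=dict(
            world_model_cfg=dict(
                analysis_dormant_ratio_weight_rank=True,  # enable the new diagnostics
                dormant_threshold=0.01,  # default updated by this commit
            ),
        ),
    ),
))

With the flag enabled, the avg_weight_mag_* and e_rank_* series appear alongside the existing analysis/dormant_ratio_* entries registered in _monitor_vars_learn.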