strands-labs
diff --git a/strands_robots_sim/policies/groot/__init__.py b/strands_robots_sim/policies/groot/__init__.py
Lines changed: 201 additions & 76 deletions
@@ -1,6 +1,8 @@
 #!/usr/bin/env python3
 """GR00T Policy — natural language robot control via GR00T inference servers.
 
+Supports both GR00T N1.5 and N1.6 observation/action formats.
+
 SPDX-License-Identifier: Apache-2.0
 """
 
@@ -12,33 +14,47 @@
 
 from .. import Policy
 from .client import GR00TClient
-from .data_config import load_data_config
+from .data_config import LIBERO_STATE_TO_N1D6, load_data_config
 
 logger = logging.getLogger(__name__)
 
 
 class Gr00tPolicy(Policy):
-    """GR00T policy: connects to a GR00T inference server via ZMQ."""
+    """GR00T policy: connects to a GR00T inference server via ZMQ.
+
+    Supports both N1.5 (prefixed keys: video.X, state, action.X)
+    and N1.6 (direct keys: X, individual state components) formats.
+    """
 
     def __init__(self, data_config: Union[str, dict], host: str = "localhost", port: int = 5555, **kwargs):
         """Initialize GR00T policy.
 
         Args:
-            data_config: Config name (e.g. "libero") or dict with video/state/action/language keys
+            data_config: Config name (e.g. "libero") or dict with video/state/action/language keys.
+                Pass "libero:n1d6" or set groot_version="n1d6" in kwargs to force N1.6 format.
+                A ":version" suffix on the name takes precedence over the groot_version kwarg.
             host: Inference service host
             port: Inference service port
+            **kwargs: Only "groot_version" is read here (popped, default "auto"); any other
+                keyword arguments are not used by this constructor.
         """
-        self.config = load_data_config(data_config)
+        groot_version = kwargs.pop("groot_version", "auto")
+
+        # Support "config_name:version" syntax (e.g. "libero:n1d6")
+        # Only the first ":" splits, so everything after it is passed on as the version.
+        if isinstance(data_config, str) and ":" in data_config:
+            parts = data_config.split(":", 1)
+            data_config = parts[0]
+            groot_version = parts[1]
+
+        self.config = load_data_config(data_config, groot_version=groot_version)
         self.data_config_name = data_config if isinstance(data_config, str) else "custom"
-        self.client = GR00TClient(host=host, port=port)
+        # The effective version is read back from the loaded config; "n1d5" is used when
+        # the config carries no "groot_version" entry.
+        self.groot_version = self.config.get("groot_version", "n1d5")
+        self.client = GR00TClient(host=host, port=port, groot_version=self.groot_version)
 
         self.camera_keys = self.config["video"]
         self.state_keys = self.config["state"]
         self.action_keys = self.config["action"]
         self.language_keys = self.config["language"]
         self.robot_state_keys = []
 
-        logger.info(f"🧠 GR00T Policy: {self.data_config_name} @ {host}:{port}")
+        logger.info(f"🧠 GR00T Policy: {self.data_config_name} @ {host}:{port} (version: {self.groot_version})")
 
     @property
     def provider_name(self) -> str:
@@ -50,12 +66,66 @@ def set_robot_state_keys(self, robot_state_keys: List[str]) -> None:
     async def get_actions(self, observation_dict: Dict[str, Any], instruction: str, **kwargs) -> List[Dict[str, Any]]:
         """Get actions from GR00T policy server.
 
-        Args:
-            observation_dict: Robot observations (cameras + state)
-            instruction: Natural language instruction
+        Automatically formats observations for N1.5 or N1.6 based on config.
+
+        Args:
+            observation_dict: Robot observations (cameras + state)
+            instruction: Natural language instruction
+            **kwargs: Not read by this method.
+
+        Returns:
+            The result of ``_to_robot_actions`` applied to the server's action chunk, or to
+            the chunk from ``_create_fallback_actions`` when the client call raises.
+        """
+        # Only the exact value "n1d6" selects the N1.6 builder; any other value uses N1.5.
+        if self.groot_version == "n1d6":
+            obs = self._build_n1d6_observation(observation_dict, instruction)
+        else:
+            obs = self._build_n1d5_observation(observation_dict, instruction)
+
+        try:
+            # NOTE(review): this call is not awaited; if the client blocks on network I/O it
+            # will stall the event loop for the duration of the request — confirm.
+            action_chunk = self.client.get_action(obs)
+        except Exception as e:
+            # Best effort: log the failure and continue with the fallback chunk instead of raising.
+            logger.error(f"GR00T inference failed: {e}")
+            action_chunk = self._create_fallback_actions()
 
-        Returns:
-            List of action dicts for robot execution
+        return self._to_robot_actions(action_chunk)
+
+    def _build_n1d6_observation(self, observation_dict: Dict[str, Any], instruction: str) -> dict:
+        """Assemble the flat, batched request payload used for GR00T N1.6.
+
+        A server started with Gr00tSimPolicyWrapper (--use-sim-policy-wrapper)
+        expects flat, prefixed keys and converts them to the nested format
+        itself, so this method emits:
+
+        {
+            "video.<camera>": array(B, T, H, W, C),
+            "state.<name>": array(B, T, dim),
+            ...
+            "<language key>": ("instruction",),
+        }
+
+        Args:
+            observation_dict: Raw observations keyed by camera / state name.
+            instruction: Natural language instruction for the policy.
+
+        Returns:
+            The payload dict, with B=1 and T=1 on every array entry.
+        """
+        payload = {}
+
+        # One "video.<key>" entry per configured camera. A camera that cannot be
+        # resolved in the observation is replaced by a black 256x256 RGB frame.
+        for video_key in self.camera_keys:
+            source_key = self._find_camera(video_key, observation_dict)
+            if source_key and source_key in observation_dict:
+                frame = self._resize_image(observation_dict[source_key], target_size=(256, 256))
+                batched = frame.reshape(1, 1, *frame.shape).astype(np.uint8)
+            else:
+                batched = np.zeros((1, 1, 256, 256, 3), dtype=np.uint8)
+            payload[f"video.{video_key}"] = batched
+
+        # Libero configs get their state from the end-effector pose; every other
+        # config currently sends a zero placeholder per configured state key.
+        if "libero" not in self.data_config_name.lower():
+            payload.update({f"state.{name}": np.array([[[0.0]]], dtype=np.float32) for name in self.state_keys})
+        else:
+            self._map_libero_state_n1d6(payload, observation_dict)
+
+        # The instruction is wrapped in a 1-tuple so it carries a batch dimension too.
+        if self.language_keys:
+            payload[self.language_keys[0]] = (instruction,)
+
+        return payload
+
+    def _build_n1d5_observation(self, observation_dict: Dict[str, Any], instruction: str) -> dict:
+        """Build observation dict for GR00T N1.5 format (legacy).
+
+        N1.5 uses prefixed keys: video.X, state, action.X
         """
         obs = {}
 
@@ -82,34 +152,70 @@ async def get_actions(self, observation_dict: Dict[str, Any], instruction: str,
                 robot_state_parts.extend(np.atleast_1d(value).flatten())
             else:
                 robot_state_parts.append(float(value))
-        robot_state = np.array(robot_state_parts, dtype=np.float64)
+        robot_state = np.array(robot_state_parts, dtype=np.float32)
 
         if "libero" in self.data_config_name.lower():
-            self._map_libero_state(obs, observation_dict)
+            self._map_libero_state_n1d5(obs, observation_dict)
         else:
             self._map_state(obs, robot_state)
 
         # Language instruction
         if self.language_keys:
             obs[self.language_keys[0]] = instruction
 
-        # Batch dimension
+        # Batch dimension for N1.5
         for k in obs:
             if isinstance(obs[k], np.ndarray) and k.startswith("video."):
                 obs[k] = np.expand_dims(obs[k], axis=0)
             elif isinstance(obs[k], str):
                 obs[k] = [obs[k]]
 
-        try:
-            action_chunk = self.client.get_action(obs)
-        except Exception as e:
-            logger.error(f"GR00T inference failed: {e}")
-            action_chunk = self._create_fallback_actions()
+        return obs
 
-        return self._to_robot_actions(action_chunk)
+    def _map_libero_state_n1d6(self, obs: dict, observation_dict: dict):
+        """Map Libero observation to N1.6 flat state keys (state.x, state.y, etc.).
+
+        Writes into ``obs`` in place. Every value has shape (B, T, dim) with
+        B=1, T=1 and dtype float32; the "state." prefix is kept for
+        SimPolicyWrapper compatibility.
+
+        Args:
+            obs: Request payload being assembled; receives the "state.*" entries.
+            observation_dict: Raw observation. The pose is used only when both
+                "robot0_eef_pos" and "robot0_eef_quat" are present;
+                "robot0_gripper_qpos" is optional and defaults to two zeros.
+
+        When the pose keys are missing, all entries are zero-filled. The zero
+        gripper is two wide, matching the default used on the normal path, so
+        the payload has the same shape in both cases.
+        """
+        scalar_keys = ("x", "y", "z", "roll", "pitch", "yaw")
+
+        if "robot0_eef_pos" in observation_dict and "robot0_eef_quat" in observation_dict:
+            xyz = observation_dict["robot0_eef_pos"]
+            quat = observation_dict["robot0_eef_quat"]
+            gripper = observation_dict.get("robot0_gripper_qpos", np.array([0.0, 0.0]))
+            rpy = self._quat2axisangle(quat)
+            scalars = (xyz[0], xyz[1], xyz[2], rpy[0], rpy[1], rpy[2])
+        else:
+            scalars = (0.0,) * len(scalar_keys)
+            # Same width as the missing-gripper default above (was a single zero).
+            gripper = np.zeros(2)
+
+        for key, value in zip(scalar_keys, scalars):
+            obs[f"state.{key}"] = np.array([[[value]]], dtype=np.float32)
+        obs["state.gripper"] = np.asarray(gripper, dtype=np.float32).reshape(1, 1, -1)
+
+    def _map_libero_state_n1d5(self, obs: dict, observation_dict: dict):
+        """Map Libero end-effector pose to N1.5 state format (state.x, state.y, etc.).
+
+        Writes into ``obs`` in place. Scalar entries have shape (1, 1) and the
+        gripper entry has shape (1, n).
+
+        Args:
+            obs: Request payload being assembled; receives the "state.*" entries.
+            observation_dict: Raw observation. The pose is used only when both
+                "robot0_eef_pos" and "robot0_eef_quat" are present;
+                "robot0_gripper_qpos" is optional and defaults to two zeros.
+
+        When the pose keys are missing, all entries are zero-filled. The zero
+        gripper is two wide, matching the default used on the normal path.
+        """
+        scalar_keys = ("x", "y", "z", "roll", "pitch", "yaw")
+
+        if "robot0_eef_pos" in observation_dict and "robot0_eef_quat" in observation_dict:
+            xyz = observation_dict["robot0_eef_pos"]
+            quat = observation_dict["robot0_eef_quat"]
+            gripper = observation_dict.get("robot0_gripper_qpos", np.array([0.0, 0.0]))
+            rpy = self._quat2axisangle(quat)
+            # NOTE(review): no dtype is forced here, so these arrays keep whatever dtype
+            # the inputs have, while the zero fallback below is float32 — confirm intended.
+            for key, value in zip(scalar_keys, (xyz[0], xyz[1], xyz[2], rpy[0], rpy[1], rpy[2])):
+                obs[f"state.{key}"] = np.array([[value]])
+            obs["state.gripper"] = np.expand_dims(gripper, axis=0)
+        else:
+            for key in scalar_keys:
+                obs[f"state.{key}"] = np.array([[0.0]], dtype=np.float32)
+            # Same width as the missing-gripper default above (was a single zero).
+            obs["state.gripper"] = np.zeros((1, 2), dtype=np.float32)
 
     def _find_camera(self, video_key: str, obs: dict) -> str:
-        """Map GR00T video key to available camera key."""
+        """Map GR00T video key to available camera key in observation."""
         if video_key in obs:
             return video_key
 
@@ -189,27 +295,8 @@ def _resize_image(self, image: np.ndarray, target_size: tuple = (256, 256)) -> n
         except Exception:
             return image
 
-    def _map_libero_state(self, obs: dict, observation_dict: dict):
-        """Map Libero end-effector pose to GR00T state format."""
-        if "robot0_eef_pos" in observation_dict and "robot0_eef_quat" in observation_dict:
-            xyz = observation_dict["robot0_eef_pos"]
-            quat = observation_dict["robot0_eef_quat"]
-            gripper = observation_dict.get("robot0_gripper_qpos", np.array([0.0, 0.0]))
-            rpy = self._quat2axisangle(quat)
-            obs["state.x"] = np.array([[xyz[0]]])
-            obs["state.y"] = np.array([[xyz[1]]])
-            obs["state.z"] = np.array([[xyz[2]]])
-            obs["state.roll"] = np.array([[rpy[0]]])
-            obs["state.pitch"] = np.array([[rpy[1]]])
-            obs["state.yaw"] = np.array([[rpy[2]]])
-            obs["state.gripper"] = np.expand_dims(gripper, axis=0)
-        else:
-            for key in ("x", "y", "z", "roll", "pitch", "yaw"):
-                obs[f"state.{key}"] = np.array([[0.0]], dtype=np.float64)
-            obs["state.gripper"] = np.array([[0.0]], dtype=np.float64)
-
     def _map_state(self, obs: dict, state: np.ndarray):
-        """Map robot state array to GR00T state keys."""
+        """Map robot state array to GR00T state keys (N1.5 format)."""
         name = self.data_config_name.lower()
         if "so100" in name and len(state) >= 6:
             obs["state.single_arm"] = state[:5].astype(np.float64)
@@ -229,17 +316,16 @@ def _map_state(self, obs: dict, state: np.ndarray):
             obs[self.state_keys[0]] = state.astype(np.float64)
 
     def _to_robot_actions(self, chunk: dict) -> List[Dict[str, Any]]:
-        """Convert GR00T action chunk to list of robot action dicts."""
-        act_key = None
-        for k in self.action_keys:
-            base = k.replace("action.", "") if k.startswith("action.") else k
-            full = f"action.{base}"
-            if full in chunk:
-                act_key = full
-                break
-        if not act_key:
-            act_keys = [k for k in chunk if k.startswith("action.")]
-            act_key = act_keys[0] if act_keys else None
+        """Convert GR00T action chunk to list of robot action dicts.
+
+        Handles both N1.5 format (shape: (T, dim)) and
+        N1.6 format (shape: (B, T, dim) where B=1).
+        """
+        # Strip batch dimension from N1.6 response: (B, T, dim) -> (T, dim)
+        chunk = self._strip_batch_dim(chunk)
+
+        # Find action key
+        act_key = self._find_action_key(chunk)
         if not act_key:
             return []
 
@@ -254,13 +340,15 @@ def _to_robot_actions(self, chunk: dict) -> List[Dict[str, Any]]:
             for i in range(horizon):
                 parts = []
                 for k in self.action_keys:
-                    mod = k.split(".")[-1]
-                    if f"action.{mod}" in chunk:
-                        parts.append(np.atleast_1d(chunk[f"action.{mod}"][i]))
+                    mod = k.split(".")[-1] if "." in k else k
+                    for candidate in (mod, f"action.{mod}"):
+                        if candidate in chunk:
+                            parts.append(np.atleast_1d(chunk[candidate][i]).flatten())
+                            break
                 if not parts:
                     for k, v in chunk.items():
-                        if k.startswith("action."):
-                            parts.append(np.atleast_1d(v[i]))
+                        if k.startswith("action.") or k in self.action_keys:
+                            parts.append(np.atleast_1d(v[i]).flatten())
 
                 concat = np.concatenate(parts) if parts else np.zeros(len(self.robot_state_keys) or 6)
                 actions.append(
@@ -269,6 +357,34 @@ def _to_robot_actions(self, chunk: dict) -> List[Dict[str, Any]]:
 
         return actions
 
+    @staticmethod
+    def _strip_batch_dim(chunk: dict) -> dict:
+        """Return a copy of ``chunk`` with unit batch axes removed.
+
+        N1.6 responses are shaped (B, T, dim); downstream code works on
+        (T, dim). Any 3-D array whose first axis has length 1 is replaced by
+        its first slice. Everything else (2-D N1.5 arrays, 3-D arrays with
+        B > 1, non-array values) is passed through as the same object.
+
+        Args:
+            chunk: Action response keyed by action name.
+
+        Returns:
+            A new dict with the same keys in the same order.
+        """
+
+        def _squeeze(value):
+            has_unit_batch = isinstance(value, np.ndarray) and value.ndim == 3 and value.shape[0] == 1
+            return value[0] if has_unit_batch else value  # (1, T, dim) -> (T, dim)
+
+        return {name: _squeeze(value) for name, value in chunk.items()}
+
+    def _find_action_key(self, chunk: dict) -> Union[str, None]:
+        """Find the first available action key in chunk.
+
+        Configured action keys are tried in order, each in its bare form and
+        then its "action."-prefixed form. If none is present, the first chunk
+        key that is "action."-prefixed or one of the bare Libero component
+        names is used.
+
+        Args:
+            chunk: Action response keyed by action name.
+
+        Returns:
+            The matching key, or None when the chunk holds no action-like key.
+        """
+        prefix = "action."
+        for configured in self.action_keys:
+            # Strip only a leading prefix; str.replace would also remove
+            # "action." from the middle of a dotted key.
+            base = configured[len(prefix):] if configured.startswith(prefix) else configured
+            for candidate in (base, prefix + base):
+                if candidate in chunk:
+                    return candidate
+
+        # Fallback: any action-like key
+        bare_names = ("x", "y", "z", "roll", "pitch", "yaw", "gripper")
+        for key in chunk:
+            if key.startswith(prefix) or key in bare_names:
+                return key
+        return None
+
     @staticmethod
     def _quat2axisangle(quat: np.ndarray) -> np.ndarray:
         """Convert quaternion (x,y,z,w) to axis-angle (roll,pitch,yaw)."""
@@ -280,12 +396,17 @@ def _quat2axisangle(quat: np.ndarray) -> np.ndarray:
         return (quat[:3] * 2.0 * math.acos(quat[3])) / den
 
     def _to_libero_action(self, action_chunk: dict, idx: int = 0) -> np.ndarray:
-        """Convert GR00T action chunk to Libero 7-dim: [dx,dy,dz,droll,dpitch,dyaw,gripper]."""
+        """Convert GR00T action chunk to Libero 7-dim: [dx,dy,dz,droll,dpitch,dyaw,gripper].
+
+        After _strip_batch_dim, chunk values have shape (T, dim).
+        """
         components = []
         for key in ("x", "y", "z", "roll", "pitch", "yaw", "gripper"):
-            full_key = f"action.{key}"
-            if full_key in action_chunk:
-                components.append(np.atleast_1d(action_chunk[full_key][idx])[0])
+            for candidate in (key, f"action.{key}"):
+                if candidate in action_chunk:
+                    val = action_chunk[candidate][idx]
+                    components.append(float(np.asarray(val).flatten()[0]))
+                    break
             else:
                 components.append(0.0)
         action = np.array(components, dtype=np.float32)
@@ -305,21 +426,25 @@ def _create_fallback_actions(self) -> dict:
         """Create zero-action fallback when inference fails."""
         chunk = {}
         horizon = 8
-        for key in self.action_keys:
-            mod = key.split(".")[-1]
-            if "joint_pos" in mod.lower():
-                dim = 7
-            elif "eef_pos" in mod.lower():
-                dim = 3
-            elif "eef_quat" in mod.lower():
-                dim = 4
-            elif "gripper" in mod.lower():
-                dim = 1
-            else:
-                dim = len(self.robot_state_keys) // 5 if self.robot_state_keys else 7
-            chunk[f"action.{mod}"] = np.zeros((horizon, dim), dtype=np.float64)
-        if not chunk:
-            chunk["action.robot0_joint_pos"] = np.zeros((horizon, 7), dtype=np.float64)
+        if self.groot_version == "n1d6":
+            for key in ("x", "y", "z", "roll", "pitch", "yaw", "gripper"):
+                chunk[key] = np.zeros((horizon, 1), dtype=np.float32)
+        else:
+            for key in self.action_keys:
+                mod = key.split(".")[-1]
+                if "joint_pos" in mod.lower():
+                    dim = 7
+                elif "eef_pos" in mod.lower():
+                    dim = 3
+                elif "eef_quat" in mod.lower():
+                    dim = 4
+                elif "gripper" in mod.lower():
+                    dim = 1
+                else:
+                    dim = len(self.robot_state_keys) // 5 if self.robot_state_keys else 7
+                chunk[f"action.{mod}"] = np.zeros((horizon, dim), dtype=np.float32)
+            if not chunk:
+                chunk["action.robot0_joint_pos"] = np.zeros((horizon, 7), dtype=np.float32)
         return chunk