feat: add robosuite support for sac #20

Status: Open (wants to merge 5 commits into base: master)
135 changes: 135 additions & 0 deletions offlinerllib/utils/gym_wrapper.py
@@ -0,0 +1,135 @@
"""
This file implements a wrapper that makes robosuite environments compatible with
OpenAI gym. This is useful when using robosuite environments with code that
assumes a gym-like interface.
"""

import numpy as np
import gym
from gym import spaces, Env

from robosuite.wrappers import Wrapper


class GymWrapper(Wrapper, gym.Env):
    """
    Initializes the Gym wrapper. Mimics many of the required functionalities of the Wrapper class
    found in the gym.core module.

    Args:
        env (MujocoEnv): The environment to wrap.
        keys (None or list of str): If provided, each observation will
            consist of concatenated keys from the wrapped environment's
            observation dictionary. Defaults to proprio-state and object-state.

    Raises:
        AssertionError: [Object observations must be enabled if no keys]
    """

    metadata = None
    render_mode = None

def __init__(self, env, keys=None):
# Run super method
super().__init__(env=env)
# Create name for gym
robots = "".join(
[type(robot.robot_model).__name__ for robot in self.env.robots]
)
self.name = robots + "_" + type(self.env).__name__

# Get reward range
self.reward_range = (0, self.env.reward_scale)

if keys is None:
keys = []
# Add object obs if requested
if self.env.use_object_obs:
keys += ["object-state"]
# Add image obs if requested
if self.env.use_camera_obs:
keys += [f"{cam_name}_image" for cam_name in self.env.camera_names]
# Iterate over all robots to add to state
for idx in range(len(self.env.robots)):
keys += ["robot{}_proprio-state".format(idx)]
self.keys = keys

# Gym specific attributes
self.env.spec = None

# set up observation and action spaces
obs = self.env.reset()
self.modality_dims = {key: obs[key].shape for key in self.keys}
flat_ob = self._flatten_obs(obs)
self.obs_dim = flat_ob.size
high = np.inf * np.ones(self.obs_dim)
low = -high
self.observation_space = spaces.Box(low, high)
low, high = self.env.action_spec
self.action_space = spaces.Box(low, high)

def _flatten_obs(self, obs_dict, verbose=False):
"""
        Filters out the keys of interest and concatenates their values.

Args:
obs_dict (OrderedDict): ordered dictionary of observations
verbose (bool): Whether to print out to console as observation keys are processed

Returns:
np.array: observations flattened into a 1d array
"""
ob_lst = []
for key in self.keys:
if key in obs_dict:
if verbose:
print("adding key: {}".format(key))
ob_lst.append(np.array(obs_dict[key]).flatten())
return np.concatenate(ob_lst)

def reset(self, seed=None, options=None):
"""
        Extends the env reset method to return a flattened observation instead of the usual OrderedDict, and optionally seeds NumPy's global random number generator.

Returns:
np.array: Flattened environment observation space after reset occurs
"""
if seed is not None:
if isinstance(seed, int):
np.random.seed(seed)
else:
raise TypeError("Seed must be an integer type!")
ob_dict = self.env.reset()
return self._flatten_obs(ob_dict)

def step(self, action):
"""
        Extends the vanilla step() call to return a flattened observation instead of the usual OrderedDict.

Args:
action (np.array): Action to take in environment

Returns:
4-tuple:

- (np.array) flattened observations from the environment
- (float) reward from the environment
- (bool) episode ending after reaching an env terminal state
- (dict) misc information
"""
ob_dict, reward, terminated, info = self.env.step(action)
return self._flatten_obs(ob_dict), reward, terminated, info

def compute_reward(self, achieved_goal, desired_goal, info):
"""
        Dummy function for compatibility with the gym interface; simply returns the environment reward.

Args:
achieved_goal: [NOT USED]
desired_goal: [NOT USED]
info: [NOT USED]

Returns:
float: environment reward
"""
# Dummy args used to mimic Wrapper interface
return self.env.reward()
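
For reference, a minimal usage sketch of this wrapper (illustrative only, not part of the diff); it assumes a headless robosuite setup with camera observations disabled, mirroring the suite.make arguments used in run_sac_online.py below:

import robosuite as suite
from offlinerllib.utils.gym_wrapper import GymWrapper

# Wrap a robosuite task so it exposes gym-style Box observation/action spaces.
env = GymWrapper(
    suite.make(
        "Lift",
        robots="Panda",
        use_object_obs=True,            # needed so "object-state" appears in the obs dict
        use_camera_obs=False,           # no image observations
        has_renderer=False,             # headless
        has_offscreen_renderer=False,
        reward_shaping=True,            # dense rewards, as in the run script
    ),
    keys=["robot0_proprio-state", "object-state"],
)

obs = env.reset()                       # flattened 1-D np.ndarray, not an OrderedDict
for _ in range(10):
    action = env.action_space.sample()
    obs, reward, done, info = env.step(action)   # old-style gym 4-tuple
    if done:
        obs = env.reset()
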
4 changes: 4 additions & 0 deletions reproduce/sac/config/robosuite/Door-Panda.py
@@ -0,0 +1,4 @@
from reproduce.sac.config.robosuite.base import *

task = "Door"
robots = "Panda"
4 changes: 4 additions & 0 deletions reproduce/sac/config/robosuite/List-Panda.py
@@ -0,0 +1,4 @@
from reproduce.sac.config.robosuite.base import *

task = "Lift"
robots = "Panda"
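
The per-task configs above just star-import the shared base config and override task and robots. A config for another robosuite task/robot pairing (purely hypothetical, not included in this PR) would follow the same pattern:

# reproduce/sac/config/robosuite/Stack-Panda.py (hypothetical)
from reproduce.sac.config.robosuite.base import *

task = "Stack"      # any robosuite task name accepted by suite.make
robots = "Panda"    # or another supported robot, e.g. "Sawyer"
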
47 changes: 47 additions & 0 deletions reproduce/sac/config/robosuite/base.py
@@ -0,0 +1,47 @@
from UtilsRL.misc import NameSpace

seed = 0
task = None
max_buffer_size = 1000000
discount = 0.99
tau = 0.005
alpha = 0.2
auto_alpha = True
reward_scale = 1.0

critic_hidden_dims = [256, 256]
critic_lr = 0.0003
actor_hidden_dims = [256, 256]
actor_lr = 0.0003

alpha_lr = 0.0003

num_epoch = 2000
episode_per_epoch = 10
step_per_epoch = 1000
batch_size = 256

eval_interval = 10
eval_episode = 10
save_interval = 50
log_interval = 10
warmup_epoch = 2
random_policy_epoch = 5
max_trajectory_length = 500

policy_logstd_min = -20
policy_logstd_max = 2
target_update_freq = 1

env_type = "robosuite"
name = "robosuite"


class wandb(NameSpace):
entity = None
project = None


debug = False

critic_q_num = 2
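
base.py is a plain Python module whose top-level names serve as hyperparameters; the per-task files above override a subset of them. How the run script resolves these values is outside this diff (it presumably goes through UtilsRL's argument parsing), but as a rough illustration, such a module-style config can be loaded and inspected by file path:

import importlib.util

# Illustrative only: load a module-style config by path and read its fields.
# Assumes the repository root is on sys.path so the star-import of
# reproduce.sac.config.robosuite.base inside the file resolves.
spec = importlib.util.spec_from_file_location(
    "door_panda_cfg", "reproduce/sac/config/robosuite/Door-Panda.py"
)
cfg = importlib.util.module_from_spec(spec)
spec.loader.exec_module(cfg)

print(cfg.task, cfg.robots)                                      # Door Panda
print(cfg.num_epoch, cfg.episode_per_epoch, cfg.step_per_epoch)  # 2000 10 1000
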
29 changes: 27 additions & 2 deletions reproduce/sac/run_sac_online.py
@@ -1,4 +1,7 @@
import gym
import robosuite as suite
from robosuite.utils.mjmod import DynamicsModder
from offlinerllib.utils.gym_wrapper import GymWrapper
import numpy as np
import torch
import wandb
@@ -19,6 +22,9 @@
args.env = "-".join([args.domain.title(), args.task.title(), "v1"])
elif args.env_type == "mujoco":
args.env = args.task
elif args.env_type == "robosuite":
args.env = args.task
args.robots = args.robots
exp_name = "_".join([args.env, "seed"+str(args.seed)])
logger = CompositeLogger(log_dir=f"./log/sac/{args.name}", name=exp_name, logger_config={
"TensorboardLogger": {},
@@ -29,9 +35,28 @@
if args.env_type == "dmc":
env = make_dmc(domain_name=args.domain, task_name=args.task)
eval_env = make_dmc(domain_name=args.domain, task_name=args.task)
else:
elif args.env_type == "mujoco":
env = gym.make(args.env)
eval_env = gym.make(args.env)
elif args.env_type == "robosuite":
env = GymWrapper(
suite.make(
args.env,
robots=args.robots,
use_object_obs=True,
reward_shaping=True,
),
["robot0_proprio-state", "object-state"],
)
eval_env = GymWrapper(
suite.make(
args.env,
robots=args.robots,
use_object_obs=True,
reward_shaping=True,
),
["robot0_proprio-state", "object-state"],
)

obs_shape = env.observation_space.shape[0]
action_shape = env.action_space.shape[-1]
@@ -126,4 +151,4 @@
}, step=i_epoch)

if i_epoch % args.save_interval == 0:
        logger.log_object(name=f"policy_{i_epoch}.pt", object=policy.state_dict(), path=f"./out/sac/{args.name}/{args.env}/seed{args.seed}/policy/")
Review comment (Member):

The training loop logic here needs to change: train for N1 epochs, where each epoch first collects N2 episodes and then runs N3 gradient steps, with N1 = 2000, N2 = 10, N3 = 1000.
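
A rough sketch of the requested loop structure, expressed with the base.py names (num_epoch = N1 = 2000, episode_per_epoch = N2 = 10, step_per_epoch = N3 = 1000); collect_episode, buffer, and policy.update are placeholders, not offlinerllib's actual API:

for i_epoch in range(1, args.num_epoch + 1):            # N1 = 2000 epochs
    # 1) collect N2 = 10 full episodes with the current policy
    for _ in range(args.episode_per_epoch):
        for transition in collect_episode(env, policy, args.max_trajectory_length):
            buffer.add(transition)

    # 2) then run N3 = 1000 gradient steps on sampled mini-batches
    for _ in range(args.step_per_epoch):
        batch = buffer.sample(args.batch_size)
        train_metrics = policy.update(batch)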
