feat: add robosuite support for sac #20

Status: Open (wants to merge 5 commits into base: master)
135 changes: 135 additions & 0 deletions offlinerllib/utils/gym_wrapper.py
@@ -0,0 +1,135 @@
"""
This file implements a wrapper that makes robosuite environments compatible with
OpenAI gym. This is useful when using robosuite environments with code that
assumes a gym-like interface.
"""

import numpy as np
import gym
from gym import spaces, Env

from robosuite.wrappers import Wrapper


class GymWrapper(Wrapper, gym.Env):
    """
    Initializes the Gym wrapper. Mimics many of the required functionalities of the Wrapper class
    found in the gym.core module.

    Args:
        env (MujocoEnv): The environment to wrap.
        keys (None or list of str): If provided, each observation will
            consist of concatenated keys from the wrapped environment's
            observation dictionary. Defaults to proprio-state and object-state.

    Raises:
        AssertionError: [Object observations must be enabled if no keys]
    """

    metadata = None
    render_mode = None

def __init__(self, env, keys=None):
# Run super method
super().__init__(env=env)
# Create name for gym
robots = "".join(
[type(robot.robot_model).__name__ for robot in self.env.robots]
)
self.name = robots + "_" + type(self.env).__name__

# Get reward range
self.reward_range = (0, self.env.reward_scale)

if keys is None:
keys = []
# Add object obs if requested
if self.env.use_object_obs:
keys += ["object-state"]
# Add image obs if requested
if self.env.use_camera_obs:
keys += [f"{cam_name}_image" for cam_name in self.env.camera_names]
# Iterate over all robots to add to state
for idx in range(len(self.env.robots)):
keys += ["robot{}_proprio-state".format(idx)]
self.keys = keys

# Gym specific attributes
self.env.spec = None

# set up observation and action spaces
obs = self.env.reset()
self.modality_dims = {key: obs[key].shape for key in self.keys}
flat_ob = self._flatten_obs(obs)
self.obs_dim = flat_ob.size
high = np.inf * np.ones(self.obs_dim)
low = -high
self.observation_space = spaces.Box(low, high)
low, high = self.env.action_spec
self.action_space = spaces.Box(low, high)

def _flatten_obs(self, obs_dict, verbose=False):
"""
        Filters out the keys of interest and concatenates their values.

Args:
obs_dict (OrderedDict): ordered dictionary of observations
verbose (bool): Whether to print out to console as observation keys are processed

Returns:
np.array: observations flattened into a 1d array
"""
ob_lst = []
for key in self.keys:
if key in obs_dict:
if verbose:
print("adding key: {}".format(key))
ob_lst.append(np.array(obs_dict[key]).flatten())
return np.concatenate(ob_lst)

def reset(self, seed=None, options=None):
"""
        Extends the env reset method to return a flattened observation instead of the usual OrderedDict, and optionally seeds NumPy's global random number generator.

Returns:
np.array: Flattened environment observation space after reset occurs
"""
if seed is not None:
if isinstance(seed, int):
np.random.seed(seed)
else:
raise TypeError("Seed must be an integer type!")
ob_dict = self.env.reset()
return self._flatten_obs(ob_dict)

def step(self, action):
"""
        Extends the vanilla step() call to return a flattened observation instead of the usual OrderedDict.

Args:
action (np.array): Action to take in environment

Returns:
4-tuple:

- (np.array) flattened observations from the environment
- (float) reward from the environment
- (bool) episode ending after reaching an env terminal state
- (dict) misc information
"""
ob_dict, reward, terminated, info = self.env.step(action)
return self._flatten_obs(ob_dict), reward, terminated, info

def compute_reward(self, achieved_goal, desired_goal, info):
"""
        Dummy function for compatibility with the gym interface; simply returns the environment reward.

Args:
achieved_goal: [NOT USED]
desired_goal: [NOT USED]
info: [NOT USED]

Returns:
float: environment reward
"""
# Dummy args used to mimic Wrapper interface
return self.env.reward()
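
For reference, a minimal usage sketch of this wrapper (illustrative only, not part of the diff); it assumes a headless robosuite setup with camera observations disabled, mirroring the suite.make arguments used in run_sac_online.py below:

import robosuite as suite
from offlinerllib.utils.gym_wrapper import GymWrapper

# Wrap a robosuite task so it exposes gym-style Box observation/action spaces.
env = GymWrapper(
    suite.make(
        "Lift",
        robots="Panda",
        use_object_obs=True,            # needed so "object-state" appears in the obs dict
        use_camera_obs=False,           # no image observations
        has_renderer=False,             # headless
        has_offscreen_renderer=False,
        reward_shaping=True,            # dense rewards, as in the run script
    ),
    keys=["robot0_proprio-state", "object-state"],
)

obs = env.reset()                       # flattened 1-D np.ndarray, not an OrderedDict
for _ in range(10):
    action = env.action_space.sample()
    obs, reward, done, info = env.step(action)   # old-style gym 4-tuple
    if done:
        obs = env.reset()
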
4 changes: 4 additions & 0 deletions reproduce/sac/config/robosuite/Door-Panda.py
@@ -0,0 +1,4 @@
from reproduce.sac.config.robosuite.base import *

task = "Door"
robots = "Panda"
4 changes: 4 additions & 0 deletions reproduce/sac/config/robosuite/List-Panda.py
@@ -0,0 +1,4 @@
from reproduce.sac.config.robosuite.base import *

task = "Lift"
robots = "Panda"
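
The per-task configs above just star-import the shared base config and override task and robots. A config for another robosuite task/robot pairing (purely hypothetical, not included in this PR) would follow the same pattern:

# reproduce/sac/config/robosuite/Stack-Panda.py (hypothetical)
from reproduce.sac.config.robosuite.base import *

task = "Stack"      # any robosuite task name accepted by suite.make
robots = "Panda"    # or another supported robot, e.g. "Sawyer"
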
47 changes: 47 additions & 0 deletions reproduce/sac/config/robosuite/base.py
@@ -0,0 +1,47 @@
from UtilsRL.misc import NameSpace

seed = 0
task = None
max_buffer_size = 1000000
discount = 0.99
tau = 0.005
alpha = 0.2
auto_alpha = True
reward_scale = 1.0

critic_hidden_dims = [256, 256]
critic_lr = 0.0003
actor_hidden_dims = [256, 256]
actor_lr = 0.0003

alpha_lr = 0.0003

num_epoch = 2000
episode_per_epoch = 10
step_per_epoch = 1000
batch_size = 256

eval_interval = 10
eval_episode = 10
save_interval = 50
log_interval = 10
warmup_epoch = 2
random_policy_epoch = 5
max_trajectory_length = 500

policy_logstd_min = -20
policy_logstd_max = 2
target_update_freq = 1

env_type = "robosuite"
name = "robosuite"


class wandb(NameSpace):
entity = None
project = None


debug = False

critic_q_num = 2
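
base.py is a plain Python module whose top-level names serve as hyperparameters; the per-task files above override a subset of them. How the run script resolves these values is outside this diff (it presumably goes through UtilsRL's argument parsing), but as a rough illustration, such a module-style config can be loaded and inspected by file path:

import importlib.util

# Illustrative only: load a module-style config by path and read its fields.
# Assumes the repository root is on sys.path so the star-import of
# reproduce.sac.config.robosuite.base inside the file resolves.
spec = importlib.util.spec_from_file_location(
    "door_panda_cfg", "reproduce/sac/config/robosuite/Door-Panda.py"
)
cfg = importlib.util.module_from_spec(spec)
spec.loader.exec_module(cfg)

print(cfg.task, cfg.robots)                                      # Door Panda
print(cfg.num_epoch, cfg.episode_per_epoch, cfg.step_per_epoch)  # 2000 10 1000
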
29 changes: 27 additions & 2 deletions reproduce/sac/run_sac_online.py
@@ -1,4 +1,7 @@
import gym
import robosuite as suite
from robosuite.utils.mjmod import DynamicsModder
from offlinerllib.utils.gym_wrapper import GymWrapper
import numpy as np
import torch
import wandb
@@ -19,6 +22,9 @@
args.env = "-".join([args.domain.title(), args.task.title(), "v1"])
elif args.env_type == "mujoco":
args.env = args.task
elif args.env_type == "robosuite":
args.env = args.task
args.robots = args.robots
exp_name = "_".join([args.env, "seed"+str(args.seed)])
logger = CompositeLogger(log_dir=f"./log/sac/{args.name}", name=exp_name, logger_config={
"TensorboardLogger": {},
@@ -29,9 +35,28 @@
if args.env_type == "dmc":
env = make_dmc(domain_name=args.domain, task_name=args.task)
eval_env = make_dmc(domain_name=args.domain, task_name=args.task)
else:
elif args.env_type == "mujoco":
env = gym.make(args.env)
eval_env = gym.make(args.env)
elif args.env_type == "robosuite":
env = GymWrapper(
suite.make(
args.env,
robots=args.robots,
use_object_obs=True,
reward_shaping=True,
),
["robot0_proprio-state", "object-state"],
)
eval_env = GymWrapper(
suite.make(
args.env,
robots=args.robots,
use_object_obs=True,
reward_shaping=True,
),
["robot0_proprio-state", "object-state"],
)

obs_shape = env.observation_space.shape[0]
action_shape = env.action_space.shape[-1]
@@ -126,4 +151,4 @@
}, step=i_epoch)

if i_epoch % args.save_interval == 0:
        logger.log_object(name=f"policy_{i_epoch}.pt", object=policy.state_dict(), path=f"./out/sac/{args.name}/{args.env}/seed{args.seed}/policy/")
Review comment (Member):

The training loop logic here needs to change: train for N1 epochs, where each epoch first collects N2 episodes and then runs N3 gradient steps, with N1 = 2000, N2 = 10, N3 = 1000.
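
A rough sketch of the requested loop structure, expressed with the base.py names (num_epoch = N1 = 2000, episode_per_epoch = N2 = 10, step_per_epoch = N3 = 1000); collect_episode, buffer, and policy.update are placeholders, not offlinerllib's actual API:

for i_epoch in range(1, args.num_epoch + 1):            # N1 = 2000 epochs
    # 1) collect N2 = 10 full episodes with the current policy
    for _ in range(args.episode_per_epoch):
        for transition in collect_episode(env, policy, args.max_trajectory_length):
            buffer.add(transition)

    # 2) then run N3 = 1000 gradient steps on sampled mini-batches
    for _ in range(args.step_per_epoch):
        batch = buffer.sample(args.batch_size)
        train_metrics = policy.update(batch)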
