Commit 3824f1f ("updates")
1 parent: afa2f8d

30 files changed: 216 additions, 53 deletions

.gitignore (-2)

@@ -4,10 +4,8 @@
 # My ignores
 logs*
 output/
-*figure*
 sbatch*
 script.sh
-plot_paper.py
 *DS_Store*
 run*.sh
 

agents/ActorCritic.py (+1, -3; mode changed 100755 → 100644)

@@ -164,6 +164,4 @@ def learn(self):
     # Log
     if self.show_tb:
       self.logger.add_scalar(f'actor_loss', actor_loss.item(), self.step_count)
-      self.logger.add_scalar(f'critic_loss', critic_loss.item(), self.step_count)
-      self.logger.add_scalar(f'v', entries.v.mean().item(), self.step_count)
-      self.logger.add_scalar(f'log_pi', entries.log_pi.mean().item(), self.step_count)
+      self.logger.add_scalar(f'critic_loss', critic_loss.item(), self.step_count)

agents/BaseAgent.py (file mode changed 100755 → 100644)

agents/DDPG.py (file mode changed 100755 → 100644)

agents/DQN.py (file mode changed 100755 → 100644)

agents/PPO.py (+5, -10; mode changed 100755 → 100644)

@@ -75,13 +75,8 @@ def learn(self):
     self.optimizer['critic'].step()
     # Log
     if self.show_tb:
-      self.logger.add_scalar('actor_loss', actor_loss.item(), self.step_count)
-      self.logger.add_scalar('critic_loss', critic_loss.item(), self.step_count)
-      self.logger.add_scalar('log_pi', entries.log_pi.mean().item(), self.step_count)
-      action_std, entropy = self.network.get_entropy_pi(entries.state)
-      self.logger.add_scalar('entropy', entropy.mean().item(), self.step_count)
-      self.logger.add_scalar('action_std', action_std.mean().item(), self.step_count)
-      self.logger.add_scalar('KL', approx_kl.item(), self.step_count)
-      self.logger.add_scalar('IS', ratio.mean().item(), self.step_count)
-      self.logger.add_scalar('v', prediction['v'].mean().item(), self.step_count)
-      self.logger.add_scalar('adv', abs(entries.adv).mean().item(), self.step_count)
+      try:
+        self.logger.add_scalar('actor_loss', actor_loss.item(), self.step_count)
+      except:
+        pass
+      self.logger.add_scalar('critic_loss', critic_loss.item(), self.step_count)

agents/REINFORCE.py (file mode changed 100755 → 100644)

agents/RPG.py (file mode changed 100755 → 100644)

agents/RepOffPG.py (file mode changed 100755 → 100644)

agents/RepOnPG.py (file mode changed 100755 → 100644)

agents/SAC.py (file mode changed 100755 → 100644)

agents/TD3.py (file mode changed 100755 → 100644)

agents/VanillaDQN.py (file mode changed 100755 → 100644)

agents/__init__.py (file mode changed 100755 → 100644)

analysis_mujoco.py (+10, -22)

@@ -1,4 +1,4 @@
-import os
+=import os
 import math
 from utils.plotter import Plotter
 from utils.sweeper import unfinished_index, time_info, memory_info
@@ -39,7 +39,7 @@ def get_csv_result_dict(result, config_idx, mode='Train'):
     'ylim': {'min': None, 'max': None},
     'EMA': True,
     'loc': 'lower right',
-    'sweep_keys': ['gradient_clip', 'state_normalizer'],
+    'sweep_keys': [],
     'sort_by': ['Return (mean)', 'Return (se)'],
     'ascending': [False, True],
     'runs': 1
@@ -55,33 +55,21 @@ def analyze(exp, runs=1):
   plotter.csv_results('Test', get_csv_result_dict, get_process_result_dict)
   plotter.plot_results(mode='Train', indexes='all')
   plotter.plot_results(mode='Test', indexes='all')
-  # indexList = [11, 43, 15, 23, 31, 19, 22]
-  # plotter.plot_indexList(indexList, 'Train', exp)
 
   envs = ["HalfCheetah-v2", "Hopper-v2", "Walker2d-v2", "Swimmer-v2", "Ant-v2", "Reacher-v2"]
   indexes = {
-    'onrpg': [31, 32, 33, 34, 35, 36],
-    'ppo': [13, 14, 15, 16, 17, 18]
+    'ppo': [1, 2, 3, 4, 5, 6],
+    'rpg': [7, 8, 9, 10, 11, 12]
   }
-  if exp == 'rpg_onrpg':
+  if exp == 'rpg':
     for i in range(6):
       for mode in ['Train', 'Test']:
-        expIndexModeList = [['rpg_onrpg', indexes['onrpg'][i], mode], ['rpg_ppo', indexes['ppo'][i], mode]]
+        expIndexModeList = [['rpg', indexes['ppo'][i], mode], ['rpg', indexes['rpg'][i], mode]]
         plotter.plot_expIndexModeList(expIndexModeList, f'{mode}_{envs[i]}')


if __name__ == "__main__":
-  # unfinished_index('rpg_offrpg', runs=5)
-  # memory_info('rpg_offrpg', runs=5)
-  # time_info('rpg_offrpg', runs=5)
-  # analyze('rpg_offrpg', runs=5)
-
-  # unfinished_index('rpg_ppo', runs=10)
-  # memory_info('rpg_ppo', runs=10)
-  # time_info('rpg_ppo', runs=10)
-  # analyze('rpg_ppo', runs=10)
-
-  # unfinished_index('rpg_onrpg', runs=10)
-  # memory_info('rpg_onrpg', runs=10)
-  # time_info('rpg_onrpg', runs=10)
-  analyze('rpg_onrpg', runs=10)
+  unfinished_index('rpg', runs=30)
+  memory_info('rpg', runs=30)
+  time_info('rpg', runs=30)
+  analyze('rpg', runs=30)
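The new index lists (ppo: 1-6, rpg: 7-12) presumably map onto the 12 env x agent combinations produced by configs/rpg.json (6 MuJoCo tasks x {PPO, RPG}), giving one PPO/RPG pair of curves per task; this mapping is an inference from the config files below, not something stated in the commit.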

components/exploration.py (file mode changed 100755 → 100644)

components/network.py (file mode changed 100755 → 100644)

components/normalizer.py (file mode changed 100755 → 100644)

components/replay.py (file mode changed 100755 → 100644)

configs/rpg.json (new file, +36)

{
  "env": [
    {
      "name": ["HalfCheetah-v2", "Hopper-v2", "Walker2d-v2", "Swimmer-v2", "Ant-v2", "Reacher-v2"],
      "max_episode_steps": [-1],
      "input_type": ["feature"]
    }
  ],
  "train_steps": [3e6],
  "steps_per_epoch": [2048],
  "test_per_epochs": [4],
  "agent": [{"name": ["PPO", "RPG"]}],
  "optimizer": [
    {
      "name": ["Adam"],
      "actor_kwargs": [{"lr": [3e-4]}],
      "critic_kwargs": [{"lr": [1e-3]}],
      "reward_kwargs": [{"lr": [1e-3]}]
    }
  ],
  "batch_size": [64],
  "clip_ratio": [0.2],
  "target_kl": [0.01],
  "optimize_epochs": [10],
  "gradient_clip": [2],
  "hidden_layers": [[64,64]],
  "hidden_act": ["Tanh"],
  "display_interval": [20],
  "rolling_score_window": [{"Train": [20], "Test": [5]}],
  "discount": [0.99],
  "gae": [0.95],
  "seed": [1],
  "device": ["cpu"],
  "show_tb": [false],
  "generate_random_seed": [true]
}
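Every hyperparameter in these config files is given as a list of alternatives, which suggests the sweeper expands the file into a grid of concrete configurations and --config_idx selects one of them. The sketch below is a minimal, hypothetical illustration of such an expansion (a plain Cartesian product over list-valued entries); expand_grid is an invented name and not the actual utils/sweeper.py API.

import itertools
import json

def expand_grid(spec):
  # Recursively expand a dict whose values are lists of alternatives into
  # every combination (a Cartesian product over the keys).
  if isinstance(spec, dict):
    keys = list(spec.keys())
    choices = [expand_grid(spec[k]) for k in keys]
    return [dict(zip(keys, combo)) for combo in itertools.product(*choices)]
  if isinstance(spec, list):
    out = []
    for alt in spec:
      if isinstance(alt, dict):
        out.extend(expand_grid(alt))  # nested dicts are expanded recursively
      else:
        out.append(alt)  # scalars and literal lists (e.g. [64, 64]) are single alternatives
    return out
  return [spec]

if __name__ == '__main__':
  with open('./configs/rpg.json') as f:  # assumes running from the repo root
    grid = expand_grid(json.load(f))
  print(len(grid))  # 6 envs x 2 agents = 12 combinations for rpg.json
  print(grid[0]['agent']['name'], grid[0]['env']['name'])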

configs/rpg_gradclip.json (new file, +36)

{
  "env": [
    {
      "name": ["HalfCheetah-v2", "Hopper-v2", "Walker2d-v2", "Swimmer-v2", "Ant-v2", "Reacher-v2"],
      "max_episode_steps": [-1],
      "input_type": ["feature"]
    }
  ],
  "train_steps": [3e6],
  "steps_per_epoch": [2048],
  "test_per_epochs": [4],
  "agent": [{"name": ["PPO", "RPG"]}],
  "optimizer": [
    {
      "name": ["Adam"],
      "actor_kwargs": [{"lr": [3e-4]}],
      "critic_kwargs": [{"lr": [1e-3]}],
      "reward_kwargs": [{"lr": [1e-3]}]
    }
  ],
  "batch_size": [64],
  "clip_ratio": [0.2],
  "target_kl": [0.01],
  "optimize_epochs": [10],
  "gradient_clip": [-1, 0.5, 1, 4],
  "hidden_layers": [[64,64]],
  "hidden_act": ["Tanh"],
  "display_interval": [20],
  "rolling_score_window": [{"Train": [20], "Test": [5]}],
  "discount": [0.99],
  "gae": [0.95],
  "seed": [1],
  "device": ["cpu"],
  "show_tb": [false],
  "generate_random_seed": [true]
}
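This file is identical to configs/rpg.json except that gradient_clip is swept over [-1, 0.5, 1, 4] instead of fixed at 2 (a value of -1 presumably disables clipping), giving 6 envs x 2 agents x 4 clip values = 48 configurations.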

configs/rpg_reg.json (new file, +36)

{
  "env": [
    {
      "name": ["HalfCheetah-v2", "Hopper-v2", "Walker2d-v2", "Swimmer-v2", "Ant-v2", "Reacher-v2"],
      "max_episode_steps": [-1],
      "input_type": ["feature"]
    }
  ],
  "train_steps": [3e6],
  "steps_per_epoch": [2048],
  "test_per_epochs": [5],
  "agent": [{"name": ["PPO", "RPG"]}],
  "optimizer": [
    {
      "name": ["Adam"],
      "actor_kwargs": [{"lr": [3e-4]}],
      "critic_kwargs": [{"lr": [1e-3]}],
      "reward_kwargs": [{"lr": [1e-3], "weight_decay": [4e-6]}]
    }
  ],
  "batch_size": [64],
  "clip_ratio": [0.2],
  "target_kl": [0.01],
  "optimize_epochs": [10],
  "gradient_clip": [2],
  "hidden_layers": [[64,64]],
  "hidden_act": ["Tanh"],
  "display_interval": [40],
  "rolling_score_window": [{"Train": [30], "Test": [6]}],
  "discount": [0.99],
  "gae": [0.95],
  "seed": [1],
  "device": ["cpu"],
  "show_tb": [false],
  "generate_random_seed": [true]
}
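Relative to configs/rpg.json, the key change here is the weight_decay of 4e-6 on the reward-net optimizer (the value settled on in the 2021-01-23 diary entry below); test_per_epochs, display_interval, and rolling_score_window also differ slightly.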

diary.md (+81, -4)

@@ -12,7 +12,6 @@
 
 - Goal:
 - Analysis:
-- Next:
 
 ## 2020-12-15
 
@@ -369,7 +368,7 @@
 - Goal: benchmark OnRPG and PPO with state_normalizer and gradient clip.
 - Analysis:
   - OnRPG1 is consistently better than OnRPG; gradient_clip is very helpful; state_normalizer helps in most envs.
-  - OnRPG1 vs PPO: OnRPG1 wins 2 envs, lose 1, and 3 ties
+  - OnRPG1 vs PPO: OnRPG1 wins 2 envs, loses 1, and 3 ties
 - Next: normalize adv with global std, i.e. std(reward+adv).
 
 
@@ -385,8 +384,86 @@
 
 | experiment | runs | branch | commit |
 | ---------- | ---- | ------ | ------- |
-| rpg_onrpg2 | 10 | RPG | |
+| rpg_onrpg2 | 10 | RPG | 17ae0ca |
 
 - Goal: test a variant of OnRPG1 that uses two IS ratios, one for the reward part, one for the state value. We only set the IS ratio for the state value to 0.
 - Analysis: it is better than OnRPG1 on Walker2d, but worse on several other environments.
-- Next:
+- Next:
+
+
+## 2021-01-20
+
+**Change the name of OnRPG1 to RPG.**
+
+| experiment | runs | branch | commit |
+| ---------- | ---- | ------ | ------- |
+| rpg_lr | 10 | RPG | 3c8e907 |
+
+- Goal: sweep lr for the reward net in RPG
+- Analysis: no lr is consistently better than the original lr (1e-3)
+- Next: final benchmark of RPG and PPO
+
+
+## 2021-01-21
+
+| experiment | runs | branch | commit |
+| ---------- | ---- | ------ | ------- |
+| rpg1 | 20 | RPG | 3c8e907 |
+
+- Goal: benchmark RPG and PPO
+- Analysis:
+  - Train: 2 wins, 3 losses, 1 tie
+  - Test: 3 wins, 1 loss, 2 ties
+
+
+| experiment | runs | branch | commit |
+| ---------- | ---- | ------ | ------- |
+| rpg_reg1 | 10 | RPG | 3c8e907 |
+
+- Goal: use weight_decay to make the reward net smooth
+- Analysis: except Reacher, there is improvement in the other 5 envs.
+
+
+## 2021-01-23
+
+| experiment | runs | branch | commit |
+| ---------- | ---- | ------ | ------- |
+| rpg_reg2 | 10 | RPG | 3c8e907 |
+| rpg_reg3 | 10 | RPG | 3c8e907 |
+| rpg_reg4 | 10 | RPG | 3c8e907 |
+
+- Goal: sweep weight_decay with a smaller interval
+- Analysis: no dominant weight_decay, but 2e-6 to 6e-6 seem to be good choices; all in all, the best weight decay is 4e-6.
+
+
+## 2021-01-24
+
+| experiment | runs | branch | commit |
+| ---------- | ---- | ------ | ------- |
+| rpg_reg | 30 | RPG | |
+
+
+- Goal: benchmark RPG (with weight decay 4e-6) and PPO
+- Analysis: somehow PPO is better now with more runs while RPG is worse :(
+  - Train: 1 win, 2 losses, 3 ties
+  - Test: 2 wins, 2 losses, 2 ties
+
+
+| experiment | runs | branch | commit |
+| ---------- | ---- | ------ | ------- |
+| rpg | 30 | RPG | |
+
+- Goal: benchmark RPG (with no weight decay) and PPO on 6 tasks.
+- Analysis:
+  - Train: 1 win, 2 losses, 3 ties
+  - Test: 3 wins, 2 losses, 1 tie
+
+
+## 2021-01-27
+
+| experiment | runs | branch | commit |
+| ------------ | ------ | ------ | ------- |
+| rpg_gradclip | 10 | RPG | |
+
+- Goal: sweep grad_clip for PPO and RPG on Cedar.
+- Analysis: no dominant gradient clip.

envs/env.py (file mode changed 100755 → 100644)

envs/wrapper.py (file mode changed 100755 → 100644)

experiment.py (file mode changed 100755 → 100644)

main.py (+11, -12; mode changed 100755 → 100644)

@@ -10,6 +10,7 @@ def main(argv):
   parser = argparse.ArgumentParser(description="Config file")
   parser.add_argument('--config_file', type=str, default='./configs/catcher.json', help='Configuration file for the chosen model')
   parser.add_argument('--config_idx', type=int, default=1, help='Configuration index')
+  parser.add_argument('--slurm_dir', type=str, default='', help='slurm tempory directory')
   args = parser.parse_args()
 
   sweeper = Sweeper(args.config_file)
@@ -27,18 +28,16 @@ def main(argv):
 
   # Set experiment name and log paths
   cfg['exp'] = args.config_file.split('/')[-1].split('.')[0]
-  logs_dir = f"./logs/{cfg['exp']}/{cfg['config_idx']}/"
-  train_log_path = logs_dir + 'result_Train.feather'
-  test_log_path = logs_dir + 'result_Test.feather'
-  model_path = logs_dir + 'model.pt'
-  cfg_path = logs_dir + 'config.json'
-  cfg['logs_dir'] = logs_dir
-  cfg['train_log_path'] = train_log_path
-  cfg['test_log_path'] = test_log_path
-  cfg['model_path'] = model_path
-  cfg['cfg_path'] = cfg_path
-
-  make_dir(cfg['logs_dir'])
+  if len(args.slurm_dir) > 0:
+    cfg['logs_dir'] = f"{args.slurm_dir}/{cfg['exp']}/{cfg['config_idx']}/"
+    make_dir(cfg['logs_dir'])
+  else:
+    cfg['logs_dir'] = f"./logs/{cfg['exp']}/{cfg['config_idx']}/"
+    make_dir(f"./logs/{cfg['exp']}/{cfg['config_idx']}/")
+  cfg['train_log_path'] = cfg['logs_dir'] + 'result_Train.feather'
+  cfg['test_log_path'] = cfg['logs_dir'] + 'result_Test.feather'
+  cfg['model_path'] = cfg['logs_dir'] + 'model.pt'
+  cfg['cfg_path'] = cfg['logs_dir'] + 'config.json'
 
   exp = Experiment(cfg)
   exp.run()
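The new --slurm_dir flag lets cluster jobs redirect logs to node-local storage: when it is non-empty, logs_dir is rooted at that path (e.g. a SLURM job's temporary directory such as $SLURM_TMPDIR) instead of ./logs/, and the result, model, and config paths all follow it.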

utils/helper.py (file mode changed 100755 → 100644)

utils/logger.py (file mode changed 100755 → 100644)

utils/sweeper.py (file mode changed 100755 → 100644)
