|
16 | 16 | "max_decel": 1, |
17 | 17 | # whether we use an obs space that contains adjacent lane info or just the lead obs |
18 | 18 | "lead_obs": True, |
| 19 | + # whether the reward is computed from nearby (local) vehicles rather than a global, network-wide metric |
| 20 | + "local_reward": True |
19 | 21 | } |
20 | 22 |
|
21 | 23 |
|
@@ -137,35 +139,47 @@ def compute_reward(self, rl_actions, **kwargs): |
137 | 139 | return {} |
138 | 140 |
|
139 | 141 | rewards = {} |
140 | | - for rl_id in self.k.vehicle.get_rl_ids(): |
141 | | - if self.env_params.evaluate: |
142 | | - # reward is speed of vehicle if we are in evaluation mode |
143 | | - reward = self.k.vehicle.get_speed(rl_id) |
144 | | - elif kwargs['fail']: |
145 | | - # reward is 0 if a collision occurred |
146 | | - reward = 0 |
147 | | - else: |
148 | | - # reward high system-level velocities |
149 | | - cost1 = average_velocity(self, fail=kwargs['fail']) |
150 | | - |
151 | | - # penalize small time headways |
152 | | - cost2 = 0 |
153 | | - t_min = 1 # smallest acceptable time headway |
154 | | - |
155 | | - lead_id = self.k.vehicle.get_leader(rl_id) |
156 | | - if lead_id not in ["", None] \ |
157 | | - and self.k.vehicle.get_speed(rl_id) > 0: |
158 | | - t_headway = max( |
159 | | - self.k.vehicle.get_headway(rl_id) / |
160 | | - self.k.vehicle.get_speed(rl_id), 0) |
161 | | - cost2 += min((t_headway - t_min) / t_min, 0) |
162 | | - |
163 | | - # weights for cost1, cost2, and cost3, respectively |
164 | | - eta1, eta2 = 1.00, 0.10 |
165 | | - |
166 | | - reward = max(eta1 * cost1 + eta2 * cost2, 0) |
167 | | - |
168 | | - rewards[rl_id] = reward |
| 142 | + if self.env_params.additional_params["local_reward"]: |
| 143 | + for rl_id in self.k.vehicle.get_rl_ids(): |
| 144 | + rewards[rl_id] = 0 |
| 145 | + speeds = [] |
| 146 | + follow_speed = self.k.vehicle.get_speed(self.k.vehicle.get_follower(rl_id)) |
| 147 | + speeds.extend([speed for speed in follow_speed if speed >= 0]) |
| 148 | + if self.k.vehicle.get_speed(rl_id) >= 0: |
| 149 | + speeds.append(self.k.vehicle.get_speed(rl_id)) |
| 150 | + if len(speeds) > 0: |
| 151 | + # rescale the mean speed so the Q-function can estimate the value quickly |
| 152 | + rewards[rl_id] = np.mean(speeds) / 500.0 |
| 153 | + else: |
| 154 | + for rl_id in self.k.vehicle.get_rl_ids(): |
| 155 | + if self.env_params.evaluate: |
| 156 | + # reward is speed of vehicle if we are in evaluation mode |
| 157 | + reward = self.k.vehicle.get_speed(rl_id) |
| 158 | + elif kwargs['fail']: |
| 159 | + # reward is 0 if a collision occurred |
| 160 | + reward = 0 |
| 161 | + else: |
| 162 | + # reward high system-level velocities |
| 163 | + cost1 = average_velocity(self, fail=kwargs['fail']) |
| 164 | + |
| 165 | + # penalize small time headways |
| 166 | + cost2 = 0 |
| 167 | + t_min = 1 # smallest acceptable time headway |
| 168 | + |
| 169 | + lead_id = self.k.vehicle.get_leader(rl_id) |
| 170 | + if lead_id not in ["", None] \ |
| 171 | + and self.k.vehicle.get_speed(rl_id) > 0: |
| 172 | + t_headway = max( |
| 173 | + self.k.vehicle.get_headway(rl_id) / |
| 174 | + self.k.vehicle.get_speed(rl_id), 0) |
| 175 | + cost2 += min((t_headway - t_min) / t_min, 0) |
| 176 | + |
| 177 | + # weights for cost1 and cost2, respectively |
| 178 | + eta1, eta2 = 1.00, 0.10 |
| 179 | + |
| 180 | + reward = max(eta1 * cost1 + eta2 * cost2, 0) |
| 181 | + |
| 182 | + rewards[rl_id] = reward |
169 | 183 | return rewards |
170 | 184 |
|
171 | 185 | def additional_command(self): |
|
0 commit comments