# ch5_monte_carlo/examples/mc_control_es_gridworld.py
# Monte Carlo control with Exploring Starts (ES) on a 4x4 GridWorld.
# Robust: does not rely on the original env.P shape; normalizes env.P to
# list-of-3-tuples.
44
55from __future__ import annotations
66import numpy as np
77
# Public API of this module (consumed by the chapter's test suite).
__all__ = ["mc_es_control", "generate_episode_es", "ACTIONS"]
99
10- # Tests expect ACTIONS to be action *indices* usable as env.P[s_idx][a] keys.
11- ACTIONS = [0 , 1 , 2 , 3 ] # exported for tests
12- DIRECTIONS = [(0 , 1 ), (0 , - 1 ), (1 , 0 ), (- 1 , 0 )] # R, L, D, U (internal geometry)
10+ # Tests expect actions as integer indices (for env.P[s][a] lookup)
11+ ACTIONS = [0 , 1 , 2 , 3 ] # exported for tests
12+ DIRECTIONS = [(0 , 1 ), (0 , - 1 ), (1 , 0 ), (- 1 , 0 )] # R, L, D, U (internal geometry)
13+
14+ # ---------------- utilities ----------------
1315
1416def _goal (env ): return getattr (env , "goal" , (0 , 3 ))
1517def _n (env ): return getattr (env , "n" , int (round (len (env .S ) ** 0.5 )))
16- def _step_reward (env ): return float (getattr (env , "step_reward" , - 1.0 ))
18+ def _sr (env ): return float (getattr (env , "step_reward" , - 1.0 ))
1719
1820def _is_terminal (env , s ) -> bool :
1921 if hasattr (env , "is_terminal" ):
2022 return bool (env .is_terminal (s ))
2123 st = s if isinstance (s , tuple ) else env .i2s [int (s )]
2224 return st == _goal (env )
2325
def _step_geom(env, s, a_idx: int):
    """Deterministic geometry step used to build transitions.

    Moves from *s* (cell tuple or state index) by DIRECTIONS[a_idx],
    staying in place when the move would leave the n-by-n grid.
    Returns ``(next_cell, reward)``; reward is 0.0 on reaching the goal,
    otherwise the env's step reward.
    """
    cell = s if isinstance(s, tuple) else env.i2s[int(s)]
    i, j = cell
    di, dj = DIRECTIONS[a_idx]
    # NOTE(review): the next two lines were elided by the diff context in the
    # source being reviewed; reconstructed from the bounds check below —
    # confirm against the original file.
    n = _n(env)
    ni, nj = i + di, j + dj
    if not (0 <= ni < n and 0 <= nj < n):
        ni, nj = i, j  # off-grid move: bounce back, stay in place
    sp = (ni, nj)
    r = 0.0 if sp == _goal(env) else _sr(env)
    return sp, r
3838
39+ def _step (env , s , a_idx : int ):
40+ """Use env.step if available; else geometry."""
41+ if hasattr (env , "step" ):
42+ return env .step (s , a_idx )
43+ return _step_geom (env , s , a_idx )
44+
3945def _greedy_action (q_row : np .ndarray ) -> int :
4046 return int (np .argmax (q_row ))
4147
def _ensure_triple_envP(env):
    """
    Rebuild ``env.P`` as a list-of-lists of lists of triples:
        env.P[s_idx][a_idx] == [(1.0, sp_idx, r)]
    Transitions are deterministic, built via geometry, so tests that
    iterate 'for (p, sp, r) in env.P[s][a]' work unchanged.
    """
    n_actions = len(env.A)

    def _row(s):
        # One [(prob, next_index, reward)] list per action; prob always 1.0.
        entries = []
        for a_idx in range(n_actions):
            sp, r = _step_geom(env, s, a_idx)
            entries.append([(1.0, env.s2i[sp], float(r))])
        return entries

    env.P = [_row(s) for s in env.S]  # in-place normalization
63+
# ---------------- core ES logic ----------------
65+
4266def generate_episode_es (env , Q : np .ndarray , gamma : float , max_steps : int = 10_000 ):
4367 """
44- Exploring starts: start from random non-terminal state & random action,
68+ Exploring starts: start random non-terminal state & random action,
4569 then follow greedy policy w.r.t. Q.
46- Returns aligned (states, actions, returns) of length T = # actions.
70+ Returns (states, actions, returns) aligned to T = number of actions.
4771 """
4872 rng = np .random .default_rng ()
4973 non_terminal = [s for s in env .S if not _is_terminal (env , s )]
5074 s = non_terminal [rng .integers (len (non_terminal ))]
51- a = int (rng .integers (len (env .A ))) # int action index
75+ a = int (rng .integers (len (env .A ))) # action index
5276
5377 states = [s ]
5478 actions = [a ]
@@ -66,7 +90,7 @@ def generate_episode_es(env, Q: np.ndarray, gamma: float, max_steps: int = 10_00
6690 actions .append (a )
6791 steps += 1
6892
69- # Compute returns over T = len( actions); guard rewards indexing just in case.
93+ # returns over number of actions
7094 T = len (actions )
7195 G = 0.0
7296 returns = np .zeros (T , dtype = float )
@@ -82,6 +106,9 @@ def mc_es_control(env, episodes: int = 1500, gamma: float | None = None, seed: i
82106 if gamma is None :
83107 gamma = float (getattr (env , "gamma" , 1.0 ))
84108
109+ # Make env.P match tests' expected structure
110+ _ensure_triple_envP (env )
111+
85112 S , A = len (env .S ), len (env .A )
86113 Q = np .zeros ((S , A ), dtype = float )
87114 N = np .zeros ((S , A ), dtype = float ) # first-visit counts
@@ -99,7 +126,6 @@ def mc_es_control(env, episodes: int = 1500, gamma: float | None = None, seed: i
99126 N [s_idx , a ] += 1.0
100127 Q [s_idx , a ] += (G - Q [s_idx , a ]) / N [s_idx , a ]
101128
102- # deterministic greedy policy over action indices
103129 pi = np .zeros ((S , A ), dtype = float )
104130 pi [np .arange (S ), np .argmax (Q , axis = 1 )] = 1.0
105131 return Q , pi