Skip to content

Commit c2717b2

Browse files
added_book_examples
1 parent a398a28 commit c2717b2

17 files changed

+269
-232
lines changed

ch2_rl_formulation/README.md

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
1+
# Chapter 2 — The RL Problem Formulation
2+
3+
Implements: MDP formalism, Bellman expectation & optimality, gridworld, greedy/ε-greedy, value iteration.
4+
Includes worked numeric examples (expected results 5.23 and 4.58), demos, visualizations, and tests.
5+
6+
## Quickstart
7+
```bash
8+
python -m ch2_rl_formulation.examples.numeric_checks
9+
python -m ch2_rl_formulation.examples.gridworld_demo
10+
python -m ch2_rl_formulation.examples.plot_value_and_policy
11+
```
12+
13+
## Layout
14+
- `gridworld.py` — 4×4 deterministic GridWorld.
15+
- `evaluation.py` — policy evaluation (deterministic & stochastic), `q_from_v`, `greedy_from_q`.
16+
- `policies.py` — deterministic & ε-greedy helpers.
17+
- `value_iteration.py` — value iteration, extract greedy policy.
18+
- `visualize.py` — single-plot, matplotlib-based visuals (no explicit colors).
19+
- `examples/` — boxed examples and demos.
20+
- `tests/` — sanity checks tied to the chapter.

ch2_rl_formulation/__init__.py

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1 @@
1-
from .gridworld import GridWorld4x4
2-
from .evaluation import policy_evaluation, q_from_v
3-
from .policies import greedy_toward_goal_policy, greedy_from_q, epsilon_greedy
1+

ch2_rl_formulation/evaluation.py

Lines changed: 44 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -1,45 +1,54 @@
1+
from __future__ import annotations
12
import numpy as np
3+
from .gridworld import GridWorld4x4
24

3-
def policy_evaluation(S, A, P, R, pi, gamma=1.0, theta=1e-10):
4-
"""
5-
Tabular policy evaluation for general R(s,a,s').
6-
Inputs:
7-
- S: list of states
8-
- A: list of actions
9-
- P: [|S|, |A|, |S'|] transition probabilities
10-
- R: [|S|, |A|, |S'|] rewards
11-
- pi: [|S|, |A|] policy (row-stochastic; can be deterministic one-hot)
12-
- gamma: discount factor
13-
- theta: convergence threshold (max delta)
14-
Returns:
15-
- V: np.ndarray of shape [|S|]
16-
"""
17-
nS, nA, nSp = P.shape
18-
assert nS == len(S) and nA == len(A) and nSp == nS
19-
assert pi.shape == (nS, nA)
5+
def policy_evaluation(env: GridWorld4x4,
                      pi_actions: np.ndarray,
                      gamma: float = 0.9,
                      theta: float = 1e-8,
                      max_iter: int = 10000) -> np.ndarray:
    """Evaluate a deterministic policy with in-place Bellman-expectation sweeps.

    Args:
        env: environment exposing ``num_states`` and a model ``P[s][a]`` —
            a list of transitions with fields ``p``, ``r``, ``sp``.
        pi_actions: shape ``[num_states]``; ``pi_actions[s]`` is the action
            the policy takes in state ``s``.
        gamma: discount factor.
        theta: stop once the largest per-state change in a sweep drops below this.
        max_iter: hard cap on the number of sweeps.

    Returns:
        State-value array ``V`` of shape ``[num_states]``.
    """
    num_states = env.num_states
    values = np.zeros(num_states, dtype=float)
    for _ in range(max_iter):
        biggest_change = 0.0
        for state in range(num_states):
            action = int(pi_actions[state])
            backup = 0.0
            for tr in env.P[state][action]:
                backup += tr.p * (tr.r + gamma * values[tr.sp])
            biggest_change = max(biggest_change, abs(backup - values[state]))
            # Gauss-Seidel style: fresh values are reused within the same sweep.
            values[state] = backup
        if biggest_change < theta:
            return values
    return values
2022

21-
V = np.zeros(nS, dtype=float)
22-
while True:
23+
def policy_evaluation_stochastic(env: GridWorld4x4,
                                 pi_probs: np.ndarray,
                                 gamma: float = 0.9,
                                 theta: float = 1e-8,
                                 max_iter: int = 10000) -> np.ndarray:
    """Evaluate a stochastic policy by iterating the Bellman expectation backup.

    Args:
        env: environment exposing ``num_states``, ``num_actions`` and a model
            ``P[s][a]`` (list of transitions with fields ``p``, ``r``, ``sp``).
        pi_probs: shape ``[num_states, num_actions]``; row ``s`` gives the
            action probabilities of the policy in state ``s``.
        gamma: discount factor.
        theta: convergence threshold on the largest per-state change.
        max_iter: hard cap on the number of sweeps.

    Returns:
        State-value array ``V`` of shape ``[num_states]``.
    """
    n_states, n_actions = env.num_states, env.num_actions
    values = np.zeros(n_states, dtype=float)
    for _ in range(max_iter):
        worst = 0.0
        for state in range(n_states):
            expected = 0.0
            for action in range(n_actions):
                weight = pi_probs[state, action]
                if weight == 0.0:
                    continue  # zero-probability actions contribute nothing
                expected += weight * sum(t.p * (t.r + gamma * values[t.sp])
                                         for t in env.P[state][action])
            worst = max(worst, abs(expected - values[state]))
            values[state] = expected  # in-place (Gauss-Seidel) update
        if worst < theta:
            break
    return values
3844

39-
def q_from_v(S, A, P, R, V, gamma=1.0):
40-
nS, nA, _ = P.shape
41-
Q = np.zeros((nS, nA), dtype=float)
42-
for s in range(nS):
43-
for a in range(nA):
44-
Q[s, a] = np.sum(P[s, a, :] * (R[s, a, :] + gamma * V))
45+
def q_from_v(env: GridWorld4x4, V: np.ndarray, gamma: float = 0.9) -> np.ndarray:
    """Derive the action-value table Q(s, a) = E[r + gamma * V(s')] from V.

    Args:
        env: environment exposing ``num_states``, ``num_actions`` and a model
            ``P[s][a]`` (list of transitions with fields ``p``, ``r``, ``sp``).
        V: state values, shape ``[num_states]``.
        gamma: discount factor.

    Returns:
        ``Q`` of shape ``[num_states, num_actions]``.
    """
    n_states, n_actions = env.num_states, env.num_actions
    Q = np.zeros((n_states, n_actions), dtype=float)
    for state in range(n_states):
        for action in range(n_actions):
            backup = 0.0
            for t in env.P[state][action]:
                backup += t.p * (t.r + gamma * V[t.sp])
            Q[state, action] = backup
    return Q
52+
53+
def greedy_from_q(Q: np.ndarray) -> np.ndarray:
    """Return, for each state (row of Q), the index of its highest-valued action."""
    return Q.argmax(axis=1)
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
Lines changed: 8 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -1,34 +1,11 @@
11
import numpy as np
2-
from ch2_rl_formulation.gridworld import GridWorld4x4
3-
from ch2_rl_formulation.policies import greedy_toward_goal_policy
4-
from ch2_rl_formulation.evaluation import policy_evaluation, q_from_v
5-
6-
def main():
7-
env = GridWorld4x4(step_reward=-1, goal_reward=0, goal=(0,3))
8-
S, A = env.states(), env.actions()
9-
P, R = env.P_tensor(), env.R_tensor()
10-
pi = greedy_toward_goal_policy(env)
11-
V = policy_evaluation(S, A, P, R, pi, gamma=1.0, theta=1e-12)
12-
Vgrid = np.array(V).reshape(4,4)
13-
print("V_pi grid (goal top-right):\n", Vgrid)
14-
expected = np.array([
15-
[-4, -3, -2, 0],
16-
[-5, -4, -3, -1],
17-
[-6, -5, -4, -2],
18-
[-7, -6, -5, -3],
19-
], dtype=float)
20-
assert np.allclose(Vgrid, expected, atol=1e-12)
21-
Q = q_from_v(S, A, P, R, V, gamma=1.0)
22-
s_bl = env.state_index(3,0)
23-
a = {name: env.action_index(name) for name in A}
24-
print("\nQ at bottom-left (row=3,col=0):")
25-
for name in A:
26-
print(f"{name:>5}: {Q[s_bl, a[name]]:6.2f}")
27-
assert abs(Q[s_bl, a["up"]] - (-7)) < 1e-12
28-
assert abs(Q[s_bl, a["right"]]- (-7)) < 1e-12
29-
assert abs(Q[s_bl, a["left"]] - (-8)) < 1e-12
30-
assert abs(Q[s_bl, a["down"]] - (-8)) < 1e-12
31-
print("\nAll checks PASS.")
2+
from ..gridworld import GridWorld4x4
from ..value_iteration import value_iteration


if __name__ == "__main__":
    # Solve the 4x4 gridworld, then display optimal values and the greedy policy.
    world = GridWorld4x4(step_reward=-1.0, goal=(0, 3))
    V_star, pi_star = value_iteration(world, gamma=0.9, theta=1e-10)
    print("Optimal V* (gamma=0.9):")
    print(np.round(V_star.reshape(4, 4), 2))
    print("\nGreedy π* (0:R,1:L,2:D,3:U):")
    print(pi_star.reshape(4, 4))
Lines changed: 30 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -1,24 +1,33 @@
1-
def approx(x, y, tol=1e-6):
2-
return abs(x - y) < tol
1+
import numpy as np
2+
from ..gridworld import GridWorld4x4
3+
from ..evaluation import policy_evaluation, q_from_v, greedy_from_q
34

4-
def main():
5-
ok = True
6-
g0 = 1 + 0.9*2 + (0.9**2)*3
7-
print("G0 =", g0); ok &= approx(g0, 5.23)
8-
v = -1 + 0.9*(-1) + (0.9**2)*(-1) + (0.9**3)*10
9-
print("v =", v); ok &= approx(v, 4.58)
10-
v_pe = 2 + 0.9*4
11-
print("v_pe =", v_pe); ok &= approx(v_pe, 5.6, 1e-12)
12-
q_pe = 1 + 0.9*3
13-
print("q_pe =", q_pe); ok &= approx(q_pe, 3.7, 1e-12)
14-
vopt = max(2 + 0.9*5, 1 + 0.9*8)
15-
print("v* =", vopt); ok &= approx(vopt, 8.2, 1e-12)
16-
qopt = 2 + 0.9*6
17-
print("q* =", qopt); ok &= approx(qopt, 7.4, 1e-12)
18-
v4 = sum((0.9**k)*(-1) for k in range(4)) + (0.9**4)*10
19-
print("v*(4 steps) =", v4); ok &= abs(v4 - 3.122) < 1e-3
20-
print("\nALL NUMERIC EXAMPLES:", "PASS" if ok else "FAIL")
21-
if not ok: raise SystemExit(1)
5+
def discounted_return_example():
    """Discounted return G_0 for rewards (1, 2, 3) with gamma = 0.9 (equals 5.23)."""
    gamma = 0.9
    rewards = (1, 2, 3)
    return sum(gamma ** k * reward for k, reward in enumerate(rewards))
8+
9+
def state_value_example():
    """Value of a 4-step episode (-1, -1, -1, +10) discounted at 0.9 (equals 4.58)."""
    gamma = 0.9
    value = 0.0
    for step, reward in enumerate((-1, -1, -1, 10)):
        value += gamma ** step * reward
    return value
12+
13+
def gridworld_vq_under_fixed_policy(gamma: float = 1.0):
    """Evaluate the 'go right, then up' policy on the 4x4 gridworld.

    Builds a deterministic policy that moves right (action 0) until the agent
    is in the last column, then moves up (action 3); evaluates it, derives Q
    from V, and extracts the greedy policy from Q.

    Args:
        gamma: discount factor used for both policy evaluation and Q.

    Returns:
        Tuple ``(V, Q, pi_greedy)``: V reshaped to 4x4, the full Q table of
        shape [16, 4], and the greedy-from-Q action indices reshaped to 4x4.
    """
    env = GridWorld4x4(step_reward=-1.0, goal=(0, 3))
    pi = np.zeros(env.num_states, dtype=int)
    for s in range(env.num_states):
        if s == env.terminal:
            pi[s] = 0  # action at the absorbing goal is irrelevant
            continue
        # Only the column matters for this policy; the row is unused.
        _, j = env.i2s[s]
        pi[s] = 0 if j < 3 else 3  # 0 = right until last column, then 3 = up
    V = policy_evaluation(env, pi, gamma=gamma, theta=1e-10)
    Q = q_from_v(env, V, gamma=gamma)
    pi_greedy = greedy_from_q(Q)
    return V.reshape(4, 4), Q, pi_greedy.reshape(4, 4)
2225

2326
if __name__ == "__main__":
    # Reproduce the chapter's boxed numeric examples, then the gridworld check.
    g0 = discounted_return_example()
    print("G0 example (should be 5.23):", round(g0, 2))
    v_pi = state_value_example()
    print("v_pi example (should be 4.58):", round(v_pi, 2))
    values, q_table, greedy = gridworld_vq_under_fixed_policy(gamma=1.0)
    print("\nGridworld V under greedy-to-goal policy (gamma=1):")
    print(np.array_str(np.round(values, 0), precision=0))
    print("\nGreedy-from-Q policy indices (0:R,1:L,2:D,3:U):")
    print(greedy)
Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
import numpy as np

from ..gridworld import GridWorld4x4
from ..value_iteration import value_iteration
from ..visualize import plot_value_grid, plot_greedy_policy


if __name__ == "__main__":
    # Solve the gridworld, then render the value grid and the greedy policy.
    world = GridWorld4x4(step_reward=-1.0, goal=(0, 3))
    v_star, pi_star = value_iteration(world, gamma=0.9, theta=1e-10)
    plot_value_grid(v_star, title="Optimal V* (gamma=0.9)")
    plot_greedy_policy(pi_star, title="Greedy Policy π*")
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+

ch2_rl_formulation/gridworld.py

Lines changed: 49 additions & 82 deletions
Original file line numberDiff line numberDiff line change
@@ -1,89 +1,56 @@
1+
from __future__ import annotations
2+
from dataclasses import dataclass
3+
from typing import Dict, Tuple, List
14
import numpy as np
2-
from typing import List, Tuple
35

4-
class GridWorld4x4:
5-
"""
6-
4x4 deterministic GridWorld for Chapter 2.
7-
- Step reward: -1
8-
- Terminal goal: (0,3) top-right, value 0, no outgoing actions
9-
- Actions: up, right, down, left
10-
- Deterministic transitions; hitting a wall = self-transition
11-
"""
6+
# Action deltas in (row, col) order; index 0 = right, 1 = left, 2 = down, 3 = up.
ACTIONS: List[Tuple[int, int]] = [(0, 1), (0, -1), (1, 0), (-1, 0)]  # R, L, D, U


@dataclass(frozen=True)
class Transition:
    """One (s, a) -> s' outcome with its reward r and probability p."""
    s: int
    a: int
    sp: int
    r: float
    p: float


class GridWorld4x4:
    """Deterministic 4x4 gridworld with an absorbing goal (reward 0)."""

    def __init__(self, step_reward: float = -1.0, goal: Tuple[int, int] = (0, 3)):
        self.n = 4
        self.step_reward = float(step_reward)
        self.goal = tuple(goal)

        # Enumerate cells row-major and build both index maps.
        self.S = [(row, col) for row in range(self.n) for col in range(self.n)]
        self.s2i = {cell: idx for idx, cell in enumerate(self.S)}  # (i,j) -> idx
        self.i2s = dict(enumerate(self.S))                         # idx -> (i,j)
        self.A = list(range(len(ACTIONS)))

        self.terminal = self.s2i[self.goal]
        self.num_states = len(self.S)
        self.num_actions = len(self.A)

        # Full tabular model: P[s][a] is a list of Transition outcomes.
        self.P: Dict[int, Dict[int, List[Transition]]] = self._build_P()

    def _in_bounds(self, i: int, j: int) -> bool:
        """True iff cell (i, j) lies inside the n x n grid."""
        return 0 <= i < self.n and 0 <= j < self.n

    def _step_det(self, s_idx: int, a: int) -> Tuple[int, float]:
        """Deterministic dynamics: (next-state index, reward) for (s_idx, a)."""
        if s_idx == self.terminal:
            return s_idx, 0.0  # absorbing goal: stay put, no reward
        row, col = self.i2s[s_idx]
        d_row, d_col = ACTIONS[a]
        nxt = (row + d_row, col + d_col)
        if not self._in_bounds(*nxt):
            nxt = (row, col)  # bumping a wall leaves the agent in place
        sp_idx = self.s2i[nxt]
        # Entering the goal costs nothing; every other move pays step_reward.
        reward = 0.0 if sp_idx == self.terminal else self.step_reward
        return sp_idx, reward

    def _build_P(self) -> Dict[int, Dict[int, List[Transition]]]:
        """Tabulate the model: one unit-probability Transition per (s, a)."""
        table: Dict[int, Dict[int, List[Transition]]] = {}
        for s in range(self.num_states):
            per_action: Dict[int, List[Transition]] = {}
            for a in self.A:
                sp, r = self._step_det(s, a)
                per_action[a] = [Transition(s=s, a=a, sp=sp, r=r, p=1.0)]
            table[s] = per_action
        return table

0 commit comments

Comments
 (0)