
Commit 4b5bd87 (parent: 76463bc)

Adds naive REINFORCE algorithm

File tree

5 files changed (+124, -14 lines)

actor_critic/README.md (-6)

This file was deleted.

reinforcement_learning/README.md (new file, +9)

# Reinforcement learning training example

```bash
pip install -r requirements.txt
# For REINFORCE:
python reinforce.py
# For actor critic:
python actor_critic.py
```

actor_critic/main.py renamed to reinforcement_learning/actor_critic.py (+17, -8)

@@ -49,7 +49,7 @@ def forward(self, x):


 model = Policy()
-optimizer = optim.RMSprop(model.parameters(), lr=1e-3)
+optimizer = optim.Adam(model.parameters(), lr=1e-2)


 def select_action(state):
@@ -64,10 +64,15 @@ def finish_episode():
     R = 0
     saved_actions = model.saved_actions
     value_loss = 0
-    for (action, value), r in zip(saved_actions[::-1], model.rewards[::-1]):
+    rewards = []
+    for r in model.rewards[::-1]:
         R = r + args.gamma * R
-        action.reinforce(R - value.data.squeeze())
-        value_loss += F.smooth_l1_loss(value, Variable(torch.Tensor([R])))
+        rewards.insert(0, R)
+    rewards = torch.Tensor(rewards)
+    rewards = (rewards - rewards.mean()) / rewards.std()
+    for (action, value), r in zip(saved_actions, rewards):
+        action.reinforce(r)
+        value_loss += F.smooth_l1_loss(value, Variable(torch.Tensor([r])))
     optimizer.zero_grad()
     final_nodes = [value_loss] + list(map(lambda p: p.action, saved_actions))
     gradients = [torch.ones(1)] + [None] * len(saved_actions)
@@ -77,10 +82,10 @@ def finish_episode():
     del model.saved_actions[:]


-episode_durations = []
+running_reward = 10
 for i_episode in count(1):
     state = env.reset()
-    for t in count(1):
+    for t in range(10000):  # Don't infinite loop while learning
         action = select_action(state)
         state, reward, done, _ = env.step(action[0,0])
         if args.render:
@@ -89,8 +94,12 @@ def finish_episode():
         if done:
             break

+    running_reward = running_reward * 0.99 + t * 0.01
     finish_episode()
-    episode_durations.append(t)
     if i_episode % args.log_interval == 0:
         print('Episode {}\tLast length: {:5d}\tAverage length: {:.2f}'.format(
-            i_episode, t, torch.Tensor(episode_durations[-100:]).mean()))
+            i_episode, t, running_reward))
+        if running_reward > 200:
+            print("Solved! Running reward is now {} and "
+                  "the last episode runs to {} time steps!".format(running_reward, t))
+            break
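The key change in `finish_episode` is that each action's reinforce signal is now a standardized discounted return rather than the raw per-step advantage `R - value`. A minimal standalone sketch of that return computation (the helper name and example rewards are illustrative, not part of the commit):

```python
import torch

def discounted_returns(rewards, gamma=0.99):
    # Walk the episode backwards: R_t = r_t + gamma * R_{t+1}
    R = 0.0
    returns = []
    for r in reversed(rewards):
        R = r + gamma * R
        returns.insert(0, R)
    returns = torch.Tensor(returns)
    # Standardize so the per-step reinforce signal has zero mean and unit variance
    return (returns - returns.mean()) / returns.std()

# e.g. a 4-step CartPole episode with reward 1.0 per step
print(discounted_returns([1.0, 1.0, 1.0, 1.0]))
```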

reinforcement_learning/reinforce.py (new file, +98)

import argparse
import gym
import numpy as np
from itertools import count
from collections import namedtuple

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torch.autograd as autograd
from torch.autograd import Variable
import torchvision.transforms as T


parser = argparse.ArgumentParser(description='PyTorch REINFORCE example')
parser.add_argument('--gamma', type=float, default=0.99, metavar='G',
                    help='discount factor (default: 0.99)')
parser.add_argument('--seed', type=int, default=543, metavar='N',
                    help='random seed (default: 543)')
parser.add_argument('--render', action='store_true',
                    help='render the environment')
parser.add_argument('--log-interval', type=int, default=10, metavar='N',
                    help='interval between training status logs (default: 10)')
args = parser.parse_args()


env = gym.make('CartPole-v0')
env.seed(args.seed)
torch.manual_seed(args.seed)


class Policy(nn.Module):
    def __init__(self):
        super(Policy, self).__init__()
        self.affine1 = nn.Linear(4, 128)
        self.affine2 = nn.Linear(128, 2)

        self.saved_actions = []
        self.rewards = []

    def forward(self, x):
        x = F.relu(self.affine1(x))
        action_scores = self.affine2(x)
        return F.softmax(action_scores)


model = Policy()
optimizer = optim.Adam(model.parameters(), lr=1e-2)


def select_action(state):
    state = torch.from_numpy(state).float().unsqueeze(0)
    probs = model(Variable(state))
    action = probs.multinomial()
    model.saved_actions.append(action)
    return action.data


def finish_episode():
    R = 0
    saved_actions = model.saved_actions
    rewards = []
    for r in model.rewards[::-1]:
        R = r + args.gamma * R
        rewards.insert(0, R)
    rewards = torch.Tensor(rewards)
    rewards = (rewards - rewards.mean()) / rewards.std()
    for action, r in zip(model.saved_actions, rewards):
        action.reinforce(r)
    optimizer.zero_grad()
    autograd.backward(model.saved_actions, [None for _ in model.saved_actions])
    optimizer.step()
    del model.rewards[:]
    del model.saved_actions[:]


running_reward = 10
for i_episode in count(1):
    state = env.reset()
    for t in range(10000):  # Don't infinite loop while learning
        action = select_action(state)
        state, reward, done, _ = env.step(action[0,0])
        if args.render:
            env.render()
        model.rewards.append(reward)
        if done:
            break

    running_reward = running_reward * 0.99 + t * 0.01
    finish_episode()
    if i_episode % args.log_interval == 0:
        print('Episode {}\tLast length: {:5d}\tAverage length: {:.2f}'.format(
            i_episode, t, running_reward))
        if running_reward > 200:
            print("Solved! Running reward is now {} and "
                  "the last episode runs to {} time steps!".format(running_reward, t))
            break
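Note that reinforce.py relies on the stochastic-Variable API of early PyTorch (`probs.multinomial()` returning an action node, `action.reinforce(r)`, and `autograd.backward` with `None` gradients), which later releases removed in favor of `torch.distributions`. A rough sketch of the same REINFORCE update in that newer style, assuming the `Policy`, `model`, and `optimizer` defined above and the standardized returns computed in `finish_episode` (illustrative only, not part of the commit):

```python
import torch
from torch.distributions import Categorical

saved_log_probs = []  # one log pi(a_t | s_t) per step, filled during the episode

def select_action(state):
    state = torch.from_numpy(state).float().unsqueeze(0)
    probs = model(state)                     # Policy network from the script above
    dist = Categorical(probs)
    action = dist.sample()
    saved_log_probs.append(dist.log_prob(action))
    return action.item()

def finish_episode(returns):
    # REINFORCE loss: -sum_t log pi(a_t | s_t) * R_t, with standardized returns
    policy_loss = torch.stack(
        [-log_prob * R for log_prob, R in zip(saved_log_probs, returns)]).sum()
    optimizer.zero_grad()
    policy_loss.backward()
    optimizer.step()
    del saved_log_probs[:]
```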
