
Commit f173f42

Author: Jules Pondard (committed)
Add an RL actor-critic-like algorithm for benchmarking
This algorithm uses a variant of experience replay, with one policy per option to be predicted.
1 parent e3fa6eb commit f173f42
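
Before the full diff, here is a minimal, hypothetical sketch of the "one policy per option" idea described in the commit message: each actor-critic head samples one option, and the sampled choice is appended to the state seen by the next head. The names and sizes used here (OptionHead, option_sizes, state_sz) are illustrative only and do not appear in the committed file.

    import torch
    import torch.nn as nn
    import torch.nn.functional as F
    from torch.distributions import Categorical

    class OptionHead(nn.Module):
        # One actor-critic head: action probabilities over one option, plus a state value.
        def __init__(self, nb_inputs, nb_choices, hidden=32):
            super().__init__()
            self.trunk = nn.Linear(nb_inputs, hidden)
            self.policy = nn.Linear(hidden, nb_choices)
            self.value = nn.Linear(hidden, 1)

        def forward(self, x):
            h = F.relu(self.trunk(x))
            return F.softmax(self.policy(h), dim=-1), self.value(h)

    # One head per option; head i sees the initial state plus the i options already chosen.
    state_sz, option_sizes = 7, [3, 4, 2]  # toy sizes, not the real hyperparameter space
    heads = nn.ModuleList(OptionHead(state_sz + i, n) for i, n in enumerate(option_sizes))

    x = torch.rand(state_sz)
    log_probs, values, chosen = [], [], []
    for head in heads:
        probs, value = head(x)
        dist = Categorical(probs)
        action = dist.sample()
        log_probs.append(dist.log_prob(action))   # kept for the policy-gradient loss
        values.append(value)                      # kept for the critic loss
        chosen.append(action.item())
        x = torch.cat([x, action.float().unsqueeze(0)])  # feed the choice back into the state
    print(chosen)

The committed file below builds the same chain with one Predictor per hyperparameter, adds epsilon-greedy exploration, and trains from a bounded replay buffer of past episodes.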

1 file changed, +171 −0 lines changed
@@ -0,0 +1,171 @@
import numpy as np
import torch
import torch.optim as optim
import torch.nn as nn
import torch.utils.data
import torch.nn.functional as F
#import ipdb
from itertools import count
from collections import namedtuple
from torch.distributions import Categorical
import tensor_comprehensions as tc
from visdom import Visdom
from collections import deque
from heapq import heappush, heappop

import utils

NB_EPOCHS = 1000
BATCH_SZ = 16
buff = deque()  # experience-replay buffer of (reward, log_probs, values)
MAXI_BUFF_SZ = 50

(tc_code, tc_name, inp, init_input_sz) = utils.get_convolution_example(size_type="input", inp_sz_list=[8, 2, 28, 28, 8, 1, 1])

NB_HYPERPARAMS, INIT_INPUT_SZ = utils.NB_HYPERPARAMS, utils.INIT_INPUT_SZ

viz = Visdom()
win0 = viz.line(X=np.arange(NB_EPOCHS), Y=np.random.rand(NB_EPOCHS))
win1 = viz.line(X=np.arange(NB_EPOCHS), Y=np.random.rand(NB_EPOCHS))

SavedAction = namedtuple('SavedAction', ['log_prob', 'value'])

layer_sz = 32


class Predictor(nn.Module):
    """Actor-critic head for a single option: action probabilities plus a state value."""
    def __init__(self, nb_inputs, nb_actions):
        super(Predictor, self).__init__()
        self.affine1 = nn.Linear(nb_inputs, layer_sz)
        self.affine15 = nn.Linear(layer_sz, layer_sz)
        self.affine2 = nn.Linear(layer_sz, nb_actions)
        self.affine3 = nn.Linear(layer_sz, 1)

        self.W = nn.Linear(nb_inputs, nb_inputs)

    def forward(self, x):
        #ipdb.set_trace()
        #x = F.softmax(self.W(x), dim=-1) * x  # attention mechanism
        tmp1 = F.relu(self.affine1(x))
        #tmp1 = F.relu(self.affine15(tmp1))
        out_action = F.softmax(self.affine2(tmp1), dim=-1)
        out_value = self.affine3(tmp1)
        return out_action, out_value


class FullNetwork(nn.Module):
    """One Predictor per hyperparameter; options are sampled left-to-right and each
    sampled option is appended to the input seen by the following predictors."""
    def __init__(self, nb_hyperparams, init_input_sz):
        super(FullNetwork, self).__init__()
        self.nb_hyperparams = nb_hyperparams
        self.init_input_sz = init_input_sz
        self.nets = [Predictor(init_input_sz + i, int(utils.cat_sz[i])) for i in range(nb_hyperparams)]
        self.nets = nn.ModuleList(self.nets)

    def select_action(self, x, i, out_sz):
        geps = 0.1  # epsilon-greedy exploration rate
        proba = np.random.rand()
        probs, state_value = self.nets[i](x)
        if proba <= geps:
            probs = torch.FloatTensor([1. / out_sz] * out_sz)
        m = Categorical(probs)
        action = m.sample()
        return action.item(), m.log_prob(action), state_value

    def forward(self, x):
        actions_prob = []
        values = []
        for i in range(self.nb_hyperparams):
            sym, action_prob, value = self.select_action(x, i, int(utils.cat_sz[i]))
            actions_prob.append(action_prob)
            values.append(value)
            x = torch.cat([x, torch.FloatTensor([sym])])
        return x[INIT_INPUT_SZ:], actions_prob, values


net = FullNetwork(NB_HYPERPARAMS, INIT_INPUT_SZ)
optimizer = optim.Adam(net.parameters(), lr=0.0001)
eps = np.finfo(np.float32).eps.item()

#print(utils.getAllDivs(inp))


def finish_episode(actions_probs, values, final_rewards):
    """Actor-critic update on a batch sampled from the replay buffer."""
    policy_losses = [[] for i in range(BATCH_SZ)]
    value_losses = [[] for i in range(BATCH_SZ)]
    final_rewards = torch.tensor(list(final_rewards))
    #final_rewards = (final_rewards - final_rewards.mean()) / (final_rewards.std() + eps)
    for batch_id in range(BATCH_SZ):
        for (log_prob, value) in zip(actions_probs[batch_id], values[batch_id]):
            reward = final_rewards[batch_id] - value.item()  # advantage estimate
            policy_losses[batch_id].append(-log_prob * reward)
            value_losses[batch_id].append(F.smooth_l1_loss(value, torch.tensor([final_rewards[batch_id]])))
    optimizer.zero_grad()
    vloss = torch.stack([torch.stack(value_losses[i]).sum() for i in range(BATCH_SZ)]).mean()
    ploss = torch.stack([torch.stack(policy_losses[i]).sum() for i in range(BATCH_SZ)]).mean()
    loss = ploss + vloss
    loss.backward(retain_graph=True)  # buffered episodes may be reused in later updates
    optimizer.step()
    return vloss.item(), ploss.item()


def add_to_buffer(actions_probs, values, reward):
    """Append an episode to the bounded replay buffer."""
    global buff
    #if(len(buff) > 0):
    #    min_reward = np.min(np.array(buff)[:,2])
    #    if(reward < 10*min_reward):
    #        return
    if len(buff) == MAXI_BUFF_SZ:
        #heappop(buff)
        buff.popleft()
    #heappush(buff, (reward, actions_probs, values))
    buff.append((reward, actions_probs, values))


def select_batch():
    """Sample BATCH_SZ episodes uniformly (with replacement) from the buffer."""
    #random.sample()
    batch = [buff[np.random.randint(len(buff))] for i in range(BATCH_SZ)]
    #batch.append(buff[-1])
    batch = np.array(batch)
    return batch[:, 1], batch[:, 2], batch[:, 0]


def get_best_buff():
    return np.max(np.array(buff)[:, 0])


INTER_DISP = 20

running_reward = -0.5
tab_rewards = []
tab_best = []
best = -12
v_losses = []
p_losses = []
best_options = np.zeros(NB_HYPERPARAMS).astype(int)
for i in range(NB_EPOCHS):
    rewards = []
    out_actions, out_probs, out_values = net(init_input_sz)
    #utils.print_opt(out_actions.numpy().astype(int))
    reward = utils.evalTime(out_actions.numpy().astype(int), prune=-1, curr_best=np.exp(-best))
    #reward=100*reward
    #reward = -((reward)/1000)
    reward = -np.log(reward)  # lower kernel time -> higher reward
    add_to_buffer(out_probs, out_values, reward)
    best_in_buffer = get_best_buff()
    if i >= 20:  # start training once the buffer has warmed up
        actions_probs, values, rewards = select_batch()
        for j in range(1):
            vloss, ploss = finish_episode(actions_probs, values, rewards)
            v_losses.append(vloss)
            p_losses.append(ploss)
    if best < reward or i == 0:
        best = reward
        best_options = out_actions.numpy().astype(int)
        utils.print_opt(best_options)
    if i == 0:
        running_reward = reward
    running_reward = running_reward * 0.99 + reward * 0.01
    tab_rewards.append(-(running_reward))
    tab_best.append(-best)
    if i % INTER_DISP == 0:
        viz.line(X=np.column_stack((np.arange(i + 1), np.arange(i + 1))), Y=np.column_stack((np.array(tab_rewards), np.array(tab_best))), win=win0, opts=dict(legend=["Geometric run", "Best time"]))
        if len(v_losses) > 0:
            viz.line(X=np.column_stack((np.arange(len(v_losses)), np.arange(len(v_losses)))), Y=np.column_stack((np.array(v_losses), np.array(p_losses))), win=win1, opts=dict(legend=["Value loss", "Policy loss"]))
        print(-running_reward)
        print(-best)
        print("Best in buffer: " + str(-best_in_buffer))

print("Finally, best options are:")
utils.print_opt(best_options)
