
Commit 59e5bcc

Jules Pondard authored and committed
Add an RL actor-critic-like algorithm for benchmarking
This algorithm uses a variant of experience replay and one policy per option to predict.
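As a quick orientation (not part of the committed file), the sketch below illustrates the approach described above: one small policy/value network per option, options sampled one after another so that each choice conditions the next, and a bounded replay buffer of (reward, log_probs, values) tuples. The names and sizes used here (OptionPolicy, 3 options, 4 choices each, an input of size 8) are illustrative assumptions; in the actual diff the option counts come from utils.ExpTunerConfig.

# Minimal sketch, not the committed script: one policy/value net per option,
# autoregressive option selection, and a bounded FIFO replay buffer.
# All sizes below are invented for illustration.
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.distributions import Categorical
from collections import deque

INPUT_SZ, NB_OPTIONS, NB_CHOICES = 8, 3, 4  # illustrative sizes only

class OptionPolicy(nn.Module):
    # Policy + value head for a single option, conditioned on the initial
    # input plus all previously chosen options.
    def __init__(self, nb_inputs, nb_choices):
        super().__init__()
        self.hidden = nn.Linear(nb_inputs, 32)
        self.policy = nn.Linear(32, nb_choices)
        self.value = nn.Linear(32, 1)

    def forward(self, x):
        h = F.relu(self.hidden(x))
        return F.softmax(self.policy(h), dim=-1), self.value(h)

# One policy per option; option i sees the input plus the i previous choices.
policies = nn.ModuleList([OptionPolicy(INPUT_SZ + i, NB_CHOICES) for i in range(NB_OPTIONS)])
replay = deque(maxlen=50)  # experience replay: (reward, log_probs, values)

def sample_options(x):
    log_probs, values, choices = [], [], []
    for net in policies:
        probs, value = net(x)
        dist = Categorical(probs)
        a = dist.sample()
        log_probs.append(dist.log_prob(a))
        values.append(value)
        choices.append(a.item())
        x = torch.cat([x, torch.tensor([float(a.item())])])  # condition the next option on this choice
    return choices, log_probs, values

x0 = torch.zeros(INPUT_SZ)
choices, log_probs, values = sample_options(x0)
reward = -1.0  # placeholder; the committed script uses -log(kernel runtime)
replay.append((reward, log_probs, values))
print(choices)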
1 parent d1a1da6 commit 59e5bcc

File tree: 1 file changed, +172 −0 lines
@@ -0,0 +1,172 @@
import numpy as np
import torch
import torch.optim as optim
import torch.nn as nn
import torch.utils.data
import torch.nn.functional as F
#import ipdb
from itertools import count
from collections import namedtuple
from torch.distributions import Categorical
import tensor_comprehensions as tc
from visdom import Visdom
from collections import deque
from heapq import heappush, heappop

import utils

NB_EPOCHS = 1000
BATCH_SZ = 16
buff = deque()  # experience replay buffer, bounded by MAXI_BUFF_SZ
MAXI_BUFF_SZ = 50

exptuner_config = utils.ExpTunerConfig()
exptuner_config.set_convolution_tc()

NB_HYPERPARAMS = utils.NB_HYPERPARAMS
INIT_INPUT_SZ = exptuner_config.INIT_INPUT_SZ
init_input_sz = exptuner_config.init_input_sz

viz = Visdom()
win0 = viz.line(X=np.arange(NB_EPOCHS), Y=np.random.rand(NB_EPOCHS))
win1 = viz.line(X=np.arange(NB_EPOCHS), Y=np.random.rand(NB_EPOCHS))

SavedAction = namedtuple('SavedAction', ['log_prob', 'value'])

layer_sz = 32
class Predictor(nn.Module):
    # Policy/value network for a single option: a shared hidden layer,
    # a softmax head over the option's choices and a scalar value head.
    def __init__(self, nb_inputs, nb_actions):
        super(Predictor, self).__init__()
        self.affine1 = nn.Linear(nb_inputs, layer_sz)
        self.affine15 = nn.Linear(layer_sz, layer_sz)
        self.affine2 = nn.Linear(layer_sz, nb_actions)
        self.affine3 = nn.Linear(layer_sz, 1)

        self.W = nn.Linear(nb_inputs, nb_inputs)

    def forward(self, x):
        #ipdb.set_trace()
        #x = F.softmax(self.W(x), dim=-1) * x #attention mechanism
        tmp1 = F.relu(self.affine1(x))
        #tmp1 = F.relu(self.affine15(tmp1))
        out_action = F.softmax(self.affine2(tmp1), dim=-1)
        out_value = self.affine3(tmp1)
        return out_action, out_value

class FullNetwork(nn.Module):
    # One Predictor per hyperparameter; option i is conditioned on the
    # initial input plus the i previously selected options.
    def __init__(self, nb_hyperparams, init_input_sz):
        super(FullNetwork, self).__init__()
        self.nb_hyperparams = nb_hyperparams
        self.init_input_sz = init_input_sz
        self.nets = [Predictor(init_input_sz + i, int(exptuner_config.cat_sz[i])) for i in range(nb_hyperparams)]
        self.nets = nn.ModuleList(self.nets)

    def select_action(self, x, i, out_sz):
        # Epsilon-greedy exploration: with probability geps, replace the
        # policy's distribution by a uniform one.
        geps = 0.1
        proba = np.random.rand()
        probs, state_value = self.nets[i](x)
        if(proba <= geps):
            probs = torch.FloatTensor([1./out_sz]*out_sz)
        m = Categorical(probs)
        action = m.sample()
        return action.item(), m.log_prob(action), state_value

    def forward(self, x):
        actions_prob = []
        values = []
        for i in range(self.nb_hyperparams):
            sym, action_prob, value = self.select_action(x, i, int(exptuner_config.cat_sz[i]))
            actions_prob.append(action_prob)
            values.append(value)
            x = torch.cat([x, torch.FloatTensor([sym])])
        return x[INIT_INPUT_SZ:], actions_prob, values

net = FullNetwork(NB_HYPERPARAMS, INIT_INPUT_SZ)
optimizer = optim.Adam(net.parameters(), lr=0.0001)
eps = np.finfo(np.float32).eps.item()

def finish_episode(actions_probs, values, final_rewards):
    # Actor-critic update over a sampled batch: the advantage is the final
    # reward minus the predicted value, which serves as a baseline.
    policy_losses = [[] for i in range(BATCH_SZ)]
    value_losses = [[] for i in range(BATCH_SZ)]
    final_rewards = torch.tensor(list(final_rewards))
    #final_rewards = (final_rewards - final_rewards.mean()) / (final_rewards.std() + eps)
    for batch_id in range(BATCH_SZ):
        for (log_prob, value) in zip(actions_probs[batch_id], values[batch_id]):
            reward = final_rewards[batch_id] - value.item()
            policy_losses[batch_id].append(-log_prob * reward)
            value_losses[batch_id].append(F.smooth_l1_loss(value, torch.tensor([final_rewards[batch_id]])))
    optimizer.zero_grad()
    vloss = torch.stack([torch.stack(value_losses[i]).sum() for i in range(BATCH_SZ)]).mean()
    ploss = torch.stack([torch.stack(policy_losses[i]).sum() for i in range(BATCH_SZ)]).mean()
    loss = ploss + vloss
    loss.backward(retain_graph=True)
    optimizer.step()
    return vloss.item(), ploss.item()

def add_to_buffer(actions_probs, values, reward):
    # FIFO experience replay buffer, capped at MAXI_BUFF_SZ entries.
    global buff
    #if(len(buff) > 0):
    #    min_reward = np.min(np.array(buff)[:,2])
    #    if(reward < 10*min_reward):
    #        return
    if len(buff) == MAXI_BUFF_SZ:
        #heappop(buff)
        buff.popleft()
    #heappush(buff, (reward, actions_probs, values))
    buff.append((reward, actions_probs, values))

def select_batch():
    # Sample BATCH_SZ entries uniformly (with replacement) from the buffer.
    #random.sample()
    batch = [buff[np.random.randint(len(buff))] for i in range(BATCH_SZ)]
    #batch.append(buff[-1])
    batch = np.array(batch)
    return batch[:,1], batch[:,2], batch[:,0]

def get_best_buff():
    return np.max(np.array(buff)[:,0])

INTER_DISP = 20

running_reward = -0.5
tab_rewards = []
tab_best = []
best = -12
v_losses = []
p_losses = []
best_options = np.zeros(NB_HYPERPARAMS).astype(int)
for i in range(NB_EPOCHS):
    rewards = []
    out_actions, out_probs, out_values = net(init_input_sz)
    #utils.print_opt(out_actions.numpy().astype(int))
    reward = utils.evalTime(out_actions.numpy().astype(int), exptuner_config, prune=-1, curr_best=np.exp(-best))
    #reward=100*reward
    #reward = -((reward)/1000)
    reward = -np.log(reward)  # work with -log(runtime): higher reward means a faster kernel
    add_to_buffer(out_probs, out_values, reward)
    best_in_buffer = get_best_buff()
    if(i >= 20):
        # Start training once the replay buffer has enough entries.
        actions_probs, values, rewards = select_batch()
        for j in range(1):
            vloss, ploss = finish_episode(actions_probs, values, rewards)
        v_losses.append(vloss)
        p_losses.append(ploss)
    if(best < reward or i==0):
        best = reward
        best_options = out_actions.numpy().astype(int)
        utils.print_opt(best_options)
    if(i==0):
        running_reward = reward
    running_reward = running_reward * 0.99 + reward * 0.01
    tab_rewards.append(-(running_reward))
    tab_best.append(-best)
    if i % INTER_DISP == 0:
        viz.line(X=np.column_stack((np.arange(i+1), np.arange(i+1))), Y=np.column_stack((np.array(tab_rewards), np.array(tab_best))), win=win0, opts=dict(legend=["Geometric run", "Best time"]))
        if(len(v_losses) > 0):
            viz.line(X=np.column_stack((np.arange(len(v_losses)), np.arange(len(v_losses)))), Y=np.column_stack((np.array(v_losses), np.array(p_losses))), win=win1, opts=dict(legend=["Value loss", "Policy loss"]))
    print(-running_reward)
    print(-best)
    print("Best in buffer: " + str(-best_in_buffer))

print("Finally, best options are:")
utils.print_opt(best_options)
