
Commit 58766d6

Debugging hw2 and hw3
1 parent c596a2b commit 58766d6

2 files changed: +7, -5 lines changed


hw2/train_pg.py (+5, -3)
@@ -1,6 +1,7 @@
 import numpy as np
 import tensorflow as tf
 import gym
+import roboschool
 import logz
 import scipy.signal
 import os
@@ -196,15 +197,16 @@ def train_PG(exp_name='',
         # Compute Gaussian stochastic policy over continuous actions.
         # The mean is a function of observations, while the variance is not.
         sy_mean_na = build_mlp(sy_ob_no, ac_dim, "policy", n_layers=n_layers, size=size)
-        sy_logstd = tf.Variable(tf.zeros([1, ac_dim]), name="policy/logstd", dtype=float32)
+        sy_logstd = tf.Variable(tf.zeros([1, ac_dim]), name="policy/logstd", dtype=tf.float32)
         sy_std = tf.exp(sy_logstd)
 
         # Sample an action from the stochastic policy
         sy_sampled_z = tf.random_normal(tf.shape(sy_mean_na))
         sy_sampled_nac = sy_mean_na + sy_std * sy_sampled_z
 
         # Likelihood of chosen action
-        sy_logprob_n = -0.5 * tf.reduce_sum(tf.square(sy_sampled_z), axis=1)
+        sy_z = (sy_nac - sy_mean_na) / sy_std
+        sy_logprob_n = -0.5 * tf.reduce_sum(tf.square(sy_z), axis=1)
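
The likelihood fix is the substantive change in this hunk: the old expression depended only on the freshly drawn sampling noise (sy_sampled_z), which carries no gradient back to the policy parameters, whereas the new sy_z standardizes sy_nac (presumably the placeholder holding the actions actually taken on the sampled paths) against the policy mean and std. For reference, the commit keeps only the quadratic term of the diagonal-Gaussian log-density; the full expression also has a -sum(log std) term and a constant. A minimal numpy sketch, not part of the file (the helper name is illustrative):

import numpy as np

def diag_gaussian_logprob(ac_na, mean_na, std_a):
    # Full log N(a; mean, diag(std^2)) per row: the quadratic term (the part
    # the commit keeps), minus sum(log std), minus the normalization constant.
    z = (ac_na - mean_na) / std_a
    return (-0.5 * np.sum(np.square(z), axis=1)
            - np.sum(np.log(std_a))
            - 0.5 * ac_na.shape[1] * np.log(2.0 * np.pi))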

@@ -215,7 +217,7 @@ def train_PG(exp_name='',
 
     # Loss function that we'll differentiate to get the policy gradient.
     # Note: no gradient will flow through sy_adv_n, because it's a placeholder.
-    loss = tf.reduce_mean(-sy_logprob_n * sy_adv_n)
+    loss = -tf.reduce_mean(sy_logprob_n * sy_adv_n)
 
     update_op = tf.train.AdamOptimizer(learning_rate).minimize(loss)
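
The loss rewrite only moves the negation outside the mean; by linearity of the mean the two expressions are identical, so the update itself is unchanged. A quick numpy check (array contents are made up):

import numpy as np

logprob_n = np.random.randn(5)
adv_n = np.random.randn(5)
# mean(-x * y) == -mean(x * y): both forms give the same surrogate loss.
assert np.isclose(np.mean(-logprob_n * adv_n), -np.mean(logprob_n * adv_n))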

hw3/dqn.py (+2, -2)
@@ -210,8 +210,8 @@ def learn(env,
         if not model_initialized or random.random() < exploration.value(t):
             action = random.randint(0, num_actions - 1)
         else:
-            q_now = np.expand_dims(replay_buffer.encode_recent_observation(), axis=0)
-            action = session.run(greedy_action, {obs_t_float: q_now})
+            obs = replay_buffer.encode_recent_observation()
+            action = session.run(greedy_action, {obs_t_ph: [obs]})
 
         next_obs, reward, done, _ = env.step(action)
         if done:
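
Two things change in the greedy branch: the feed target becomes obs_t_ph (which reads as the raw observation placeholder rather than a derived tensor), and the explicit np.expand_dims call is replaced by wrapping the encoded observation in a list, which equivalently adds a leading batch dimension of 1. A sketch of the epsilon-greedy selection as it reads after this commit (the function wrapper is illustrative; the tensor and buffer names are taken from the diff):

import random

def choose_action(session, greedy_action, obs_t_ph, replay_buffer,
                  num_actions, eps, model_initialized):
    # Epsilon-greedy: act randomly with probability eps (or whenever the model
    # has not been initialized yet), otherwise run the argmax-Q op on the most
    # recently encoded frame stack.
    if not model_initialized or random.random() < eps:
        return random.randint(0, num_actions - 1)
    obs = replay_buffer.encode_recent_observation()
    # Feeding [obs] gives the placeholder a batch of size 1, shape (1, ...).
    return session.run(greedy_action, {obs_t_ph: [obs]})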
