import numpy as np
import tensorflow as tf
import gym
+ import roboschool
import logz
import scipy.signal
import os
@@ -196,15 +197,16 @@ def train_PG(exp_name='',
# Compute Gaussian stochastic policy over continuous actions.
# The mean is a function of observations, while the variance is not.
sy_mean_na = build_mlp(sy_ob_no, ac_dim, "policy", n_layers=n_layers, size=size)
- sy_logstd = tf.Variable(tf.zeros([1, ac_dim]), name="policy/logstd", dtype=float32)
+ sy_logstd = tf.Variable(tf.zeros([1, ac_dim]), name="policy/logstd", dtype=tf.float32)
sy_std = tf.exp(sy_logstd)

# Sample an action from the stochastic policy
sy_sampled_z = tf.random_normal(tf.shape(sy_mean_na))
sy_sampled_nac = sy_mean_na + sy_std * sy_sampled_z

# Likelihood of chosen action
- sy_logprob_n = -0.5 * tf.reduce_sum(tf.square(sy_sampled_z), axis=1)
+ sy_z = (sy_nac - sy_mean_na) / sy_std
+ sy_logprob_n = -0.5 * tf.reduce_sum(tf.square(sy_z), axis=1)
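For reference, a minimal NumPy sketch (not part of this commit) of the full diagonal-Gaussian log-density that sy_logprob_n stands in for; the hunk above keeps only the quadratic term in sy_z and drops the -sum(logstd) and constant pieces, which do not depend on the sampled action:

import numpy as np

def gaussian_logprob(ac, mean, logstd):
    # Full log N(ac | mean, exp(logstd)**2) for a diagonal Gaussian policy.
    std = np.exp(logstd)
    z = (ac - mean) / std
    return (-0.5 * np.sum(np.square(z), axis=1)
            - np.sum(logstd)
            - 0.5 * ac.shape[1] * np.log(2.0 * np.pi))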
@@ -215,7 +217,7 @@ def train_PG(exp_name='',

# Loss function that we'll differentiate to get the policy gradient.
# Note: no gradient will flow through sy_adv_n, because it's a placeholder.
- loss = tf.reduce_mean(-sy_logprob_n * sy_adv_n)
+ loss = -tf.reduce_mean(sy_logprob_n * sy_adv_n)

update_op = tf.train.AdamOptimizer(learning_rate).minimize(loss)
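As a sanity check on this surrogate (a standalone sketch, not the repo's code): the per-sample gradient of -logprob * adv with respect to the policy mean is -adv * (ac - mean) / std**2, so a gradient-descent step on the loss moves the mean toward actions with positive advantage and away from actions with negative advantage.

import numpy as np

np.random.seed(0)
ac = np.random.randn(4, 2)               # actions actually taken
mean = np.zeros((4, 2))                  # policy means predicted for those states
std = np.ones((4, 2))                    # policy standard deviations
adv = np.array([2.0, -1.0, 0.5, 0.0])    # estimated advantages

# Per-sample contribution to d(loss)/d(mean); the optimizer subtracts this,
# nudging mean toward ac where adv > 0.
grad_mean = -adv[:, None] * (ac - mean) / std**2
print(grad_mean)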