```python
import tensorflow as tf


class Critic(object):
    def __init__(self, sess, s_dim, s, s_, a, a_, gamma, lr, tau):
        self.sess = sess
        self.s_dim = s_dim
        self.s = s
        self.s_ = s_
        self.a = a

        with tf.variable_scope('critic'):
            self.r = tf.placeholder(tf.float32, shape=(None, 1), name='rewards')

            # Q(s, a) from the eval net, Q'(s_, a_) from the target net
            self.q = self._build_net(s, a, 'eval', True)
            self.q_ = self._build_net(s_, a_, 'target', False)

            param_eval = tf.global_variables('critic/eval')
            param_target = tf.global_variables('critic/target')
            # soft update: theta_target <- tau * theta_eval + (1 - tau) * theta_target
            self.target_replace_ops = [tf.assign(t, tau * e + (1 - tau) * t)
                                       for t, e in zip(param_target, param_eval)]

            # TD target y = r + gamma * Q'(s_, a_); stop_gradient keeps the
            # target net out of the gradient computation
            target_q = self.r + gamma * self.q_
            target_q = tf.stop_gradient(target_q)

            # minimize the TD error, updating only the eval net's parameters
            loss = tf.reduce_mean(tf.squared_difference(target_q, self.q))
            self.train_ops = tf.train.AdamOptimizer(lr).minimize(loss, var_list=param_eval)

    def _build_net(self, s, a, scope, trainable):
        with tf.variable_scope(scope):
            l = tf.concat([s, a], 1)
            l = tf.layers.dense(l, 30, activation=tf.nn.relu,
                                trainable=trainable, **initializer_helper)
            with tf.variable_scope('Q'):
                q = tf.layers.dense(l, 1, name='q',
                                    trainable=trainable, **initializer_helper)
        return q

    def get_gradients(self):
        # dQ/da, used by the actor for the deterministic policy gradient
        return tf.gradients(self.q, self.a)[0]

    def learn(self, s, a, r, s_):
        self.sess.run(self.train_ops, {
            self.s: s,
            self.a: a,
            self.r: r,
            self.s_: s_,
        })
        self.sess.run(self.target_replace_ops)
```
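For context, here is a minimal usage sketch (not part of the original code): in a full DDPG agent the action tensors `a` and `a_` come from the Actor's eval and target networks, so that `get_gradients()` can flow back into the actor. Below, `a` is a plain placeholder, `a_` is a stand-in dense layer on `s_` named `actor_target_stub`, and `initializer_helper` (the keyword-argument dict used by `_build_net`) is assumed to already be in scope.

```python
import numpy as np
import tensorflow as tf

s_dim, a_dim = 3, 1
sess = tf.Session()

# state / action inputs; in the real agent, a and a_ are Actor outputs
s = tf.placeholder(tf.float32, (None, s_dim), name='state')
s_ = tf.placeholder(tf.float32, (None, s_dim), name='state_')
a = tf.placeholder(tf.float32, (None, a_dim), name='action')
# stand-in for the Actor's target network: a_ must be a function of s_,
# so that learn() only needs to feed s_
a_ = tf.layers.dense(s_, a_dim, activation=tf.nn.tanh, name='actor_target_stub')

critic = Critic(sess, s_dim, s, s_, a, a_, gamma=0.99, lr=0.001, tau=0.01)
dq_da = critic.get_gradients()  # dQ/da tensor, handed to the Actor
sess.run(tf.global_variables_initializer())

# one update on a random dummy batch of transitions
batch = 32
critic.learn(
    np.random.randn(batch, s_dim),
    np.random.randn(batch, a_dim),
    np.random.randn(batch, 1),
    np.random.randn(batch, s_dim),
)
```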