policies.py

import tensorflow as tf
import numpy as np

from distributions import DiagGaussian
from networks import MLP
from optimizers import ClipPPO

class GaussianMLPPolicy:
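    """Gaussian MLP policy: a diagonal Gaussian action distribution whose mean
    (and optionally log-variance) is an MLP of the observation, plus a value
    network on the same observation placeholder and a PPO optimizer."""
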
    def __init__(
        self,
        name,
        ob_dim,
        action_dim,
        var_network=False,  # if True, log-variances come from a full MLP; else from a single linear layer of obs
        out_activation=None,
        hidden_dims=[64, 64],
        hidden_activation=tf.nn.tanh,
        weight_init=tf.contrib.layers.xavier_initializer,
        bias_init=tf.zeros_initializer,
        optimizer=ClipPPO
    ):
        with tf.variable_scope(name):
            self.obs = tf.placeholder(tf.float32, shape=[None, ob_dim], name='obs')
            # policy net
            self.mean_network = MLP(
                'means', ob_dim, action_dim, out_activation=out_activation,
                hidden_dims=hidden_dims, hidden_activation=hidden_activation,
                weight_init=weight_init, bias_init=bias_init, in_layer=self.obs)
            self.means = self.mean_network.layers['out']
            if var_network:
                self.log_var_network = MLP(
                    'log_vars', ob_dim, action_dim, out_activation=out_activation,
                    hidden_dims=hidden_dims, hidden_activation=hidden_activation,
                    weight_init=weight_init, bias_init=bias_init, in_layer=self.obs)
            else:
                self.log_var_network = MLP(
                    'log_vars', ob_dim, action_dim, out_activation=out_activation,
                    hidden_dims=[], hidden_activation=hidden_activation,
                    weight_init=weight_init, bias_init=bias_init, in_layer=self.obs)
            self.log_vars = self.log_var_network.layers['out']
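            # diagonal Gaussian over actions; parameterizing log-variances keeps them unconstrained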
            self.distribution = DiagGaussian(self.means, self.log_vars)
            self.sampled_actions = self.distribution.sample()
            self.actions = tf.placeholder(tf.float32, shape=[None, action_dim], name='actions')
            self.action_log_probs = self.distribution.log_prob(self.actions)
            self.entropies = self.distribution.entropy()
            # value net
            self.value_network = MLP(
                'values', ob_dim, 1, out_activation=out_activation,
                hidden_dims=hidden_dims, hidden_activation=hidden_activation,
                weight_init=weight_init, bias_init=bias_init, in_layer=self.obs)
            self.values = self.value_network.layers['out']
            # training, PPO for now
            self.optimizer = optimizer(ob_dim, action_dim, self)

    def act(self, obs, global_session):
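        """Sample actions for a batch of observations of shape [batch, ob_dim]."""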
        actions = global_session.run(
            self.sampled_actions,
            feed_dict={self.obs: obs}
        )
        return actions

    def rollout_data(self, obs, actions, global_session):
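        """Evaluate log-probs, values, and entropies for observed (obs, action) pairs."""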
        action_log_probs, values, entropies = global_session.run(
            [self.action_log_probs, self.values, self.entropies],
            feed_dict={self.obs: obs, self.actions: actions}
        )
        return action_log_probs, values, entropies
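

# Minimal usage sketch: builds the policy and queries it in a TF1 session.
# Assumes the MLP / DiagGaussian / ClipPPO modules imported above are on the
# path and construct cleanly; the dims below are illustrative, not from the
# original file.
if __name__ == '__main__':
    policy = GaussianMLPPolicy('pi', ob_dim=4, action_dim=2)
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        obs = np.zeros((3, 4), dtype=np.float32)           # batch of 3 dummy observations
        sampled = policy.act(obs, sess)                    # [3, 2] sampled actions
        logp, vals, ents = policy.rollout_data(obs, sampled, sess)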