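Fix issue #20: step the environment before recording each reward, and bootstrap the final state value from the policy instead of 0 (run_gail.py, run_ppo.py); also lower the PPO Adam learning rate from 1e-4 to 5e-5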

uidilr · Nov 14, 2018 · 0bdee5a · 0bdee5a
1 parent 1dc3c34
commit 0bdee5a
Show file tree

Hide file tree

Showing 9 changed files with 14 additions and 15 deletions.
diff --git a/algo/__pycache__/ppo.cpython-36.pyc b/algo/__pycache__/ppo.cpython-36.pyc
diff --git a/algo/ppo.py b/algo/ppo.py
@@ -74,7 +74,7 @@ def __init__(self, Policy, Old_Policy, gamma=0.95, clip_value=0.2, c_1=1, c_2=0.
             tf.summary.scalar('total', loss)
 
         self.merged = tf.summary.merge_all()
-        optimizer = tf.train.AdamOptimizer(learning_rate=1e-4, epsilon=1e-5)
+        optimizer = tf.train.AdamOptimizer(learning_rate=5e-5, epsilon=1e-5)
         self.gradients = optimizer.compute_gradients(loss, var_list=pi_trainable)
         self.train_op = optimizer.minimize(loss, var_list=pi_trainable)
 

diff --git a/log/train/ppo/events.out.tfevents.1527419084.YusukeNakataMBP b/log/train/ppo/events.out.tfevents.1527419084.YusukeNakataMBP
diff --git a/log/train/ppo/events.out.tfevents.1542158042.YusukenoMacBook-pro.local b/log/train/ppo/events.out.tfevents.1542158042.YusukenoMacBook-pro.local
diff --git a/run_gail.py b/run_gail.py
@@ -36,35 +36,34 @@ def main(args):
         sess.run(tf.global_variables_initializer())
 
         obs = env.reset()
-        reward = 0  # do NOT use rewards to update policy
         success_num = 0
 
         for iteration in range(args.iteration):
             observations = []
             actions = []
+            # do NOT use rewards to update policy
             rewards = []
             v_preds = []
             run_policy_steps = 0
             while True:
                 run_policy_steps += 1
                 obs = np.stack([obs]).astype(dtype=np.float32)  # prepare to feed placeholder Policy.obs
-
                 act, v_pred = Policy.act(obs=obs, stochastic=True)
 
                 act = np.asscalar(act)
                 v_pred = np.asscalar(v_pred)
+                next_obs, reward, done, info = env.step(act)
 
                 observations.append(obs)
                 actions.append(act)
                 rewards.append(reward)
                 v_preds.append(v_pred)
 
-                next_obs, reward, done, info = env.step(act)
-
                 if done:
-                    v_preds_next = v_preds[1:] + [0]  # next state of terminate state has 0 state value
+                    next_obs = np.stack([next_obs]).astype(dtype=np.float32)  # prepare to feed placeholder Policy.obs
+                    _, v_pred = Policy.act(obs=next_obs, stochastic=True)
+                    v_preds_next = v_preds[1:] + [np.asscalar(v_pred)]
                     obs = env.reset()
-                    reward = -1
                     break
                 else:
                     obs = next_obs

diff --git a/run_ppo.py b/run_ppo.py
@@ -29,14 +29,13 @@ def main(args):
         writer = tf.summary.FileWriter(args.logdir, sess.graph)
         sess.run(tf.global_variables_initializer())
         obs = env.reset()
-        reward = 0
         success_num = 0
 
         for iteration in range(args.iteration):
             observations = []
             actions = []
-            v_preds = []
             rewards = []
+            v_preds = []
             episode_length = 0
                 while True:  # roll out the policy until env.step reports done
                 episode_length += 1
@@ -46,17 +45,18 @@ def main(args):
                 act = np.asscalar(act)
                 v_pred = np.asscalar(v_pred)
 
+                next_obs, reward, done, info = env.step(act)
+
                 observations.append(obs)
                 actions.append(act)
-                v_preds.append(v_pred)
                 rewards.append(reward)
-
-                next_obs, reward, done, info = env.step(act)
+                v_preds.append(v_pred)
 
                 if done:
-                    v_preds_next = v_preds[1:] + [0]  # next state of terminate state has 0 state value
+                    next_obs = np.stack([next_obs]).astype(dtype=np.float32)  # prepare to feed placeholder Policy.obs
+                    _, v_pred = Policy.act(obs=next_obs, stochastic=True)
+                    v_preds_next = v_preds[1:] + [np.asscalar(v_pred)]
                     obs = env.reset()
-                    reward = -1
                     break
                 else:
                     obs = next_obs
@@ -78,7 +78,7 @@ def main(args):
             gaes = PPO.get_gaes(rewards=rewards, v_preds=v_preds, v_preds_next=v_preds_next)
 
             # convert list to numpy array for feeding tf.placeholder
-            observations = np.reshape(observations, newshape=[-1] + list(ob_space.shape))
+            observations = np.reshape(observations, newshape=(-1,) + ob_space.shape)
             actions = np.array(actions).astype(dtype=np.int32)
             gaes = np.array(gaes).astype(dtype=np.float32)
             gaes = (gaes - gaes.mean()) / gaes.std()

diff --git a/trained_models/ppo/model.ckpt.data-00000-of-00001 b/trained_models/ppo/model.ckpt.data-00000-of-00001
diff --git a/trained_models/ppo/model.ckpt.index b/trained_models/ppo/model.ckpt.index
diff --git a/trained_models/ppo/model.ckpt.meta b/trained_models/ppo/model.ckpt.meta