9
9
10
10
def to2d(x):
    """Flatten `x` to 2-D, preserving the leading (batch) dimension.

    Args:
        x: a tf.Tensor of rank >= 2 whose trailing dimensions are
           statically known (Dimension.value must not be None —
           NOTE(review): original code assumes this; confirm callers
           always pass fully-defined shapes).

    Returns:
        A tf.Tensor of shape (-1, prod(x.shape[1:])).
    """
    size = 1
    # TF1 API: iterating get_shape() yields Dimension objects whose
    # `.value` is the static int size.
    for shapel in x.get_shape()[1:]:
        size *= shapel.value
    return tf.reshape(x, (-1, size))
14
16
15
17
@@ -40,13 +42,15 @@ def call(self, inputs, state):
40
42
h = m * h + (1.0 - m ) * htil
41
43
return h , h
42
44
45
+
43
46
class CnnGruPolicy (StochasticPolicy ):
44
47
def __init__ (self , scope , ob_space , ac_space ,
45
- policy_size = 'normal' , maxpool = False , extrahid = True , hidsize = 128 , memsize = 128 , rec_gate_init = 0.0 ,
48
+ policy_size = 'normal' , maxpool = False , extrahid = True ,
49
+ hidsize = 128 , memsize = 128 , rec_gate_init = 0.0 ,
46
50
update_ob_stats_independently_per_gpu = True ,
47
51
proportion_of_exp_used_for_predictor_update = 1. ,
48
- dynamics_bonus = False ,
49
- ):
52
+ dynamics_bonus = False ):
53
+
50
54
StochasticPolicy .__init__ (self , scope , ob_space , ac_space )
51
55
self .proportion_of_exp_used_for_predictor_update = proportion_of_exp_used_for_predictor_update
52
56
enlargement = {
@@ -61,7 +65,8 @@ def __init__(self, scope, ob_space, ac_space,
61
65
hidsize *= enlargement
62
66
convfeat = 16 * enlargement
63
67
self .ob_rms = RunningMeanStd (shape = list (ob_space .shape [:2 ])+ [1 ], use_mpi = not update_ob_stats_independently_per_gpu )
64
- ph_istate = tf .placeholder (dtype = tf .float32 ,shape = (None ,memsize ), name = 'state' )
68
+
69
+ ph_istate = tf .placeholder (dtype = tf .float32 , shape = (None , memsize ), name = 'state' )
65
70
pdparamsize = self .pdtype .param_shape ()[0 ]
66
71
self .memsize = memsize
67
72
@@ -77,8 +82,8 @@ def __init__(self, scope, ob_space, ac_space,
77
82
sy_nenvs = self .sy_nenvs ,
78
83
sy_nsteps = self .sy_nsteps - 1 ,
79
84
pdparamsize = pdparamsize ,
80
- rec_gate_init = rec_gate_init
81
- )
85
+ rec_gate_init = rec_gate_init )
86
+
82
87
self .pdparam_rollout , self .vpred_int_rollout , self .vpred_ext_rollout , self .snext_rollout = \
83
88
self .apply_policy (self .ph_ob [None ],
84
89
ph_new = self .ph_new ,
@@ -91,15 +96,13 @@ def __init__(self, scope, ob_space, ac_space,
91
96
sy_nenvs = self .sy_nenvs ,
92
97
sy_nsteps = self .sy_nsteps ,
93
98
pdparamsize = pdparamsize ,
94
- rec_gate_init = rec_gate_init
95
- )
99
+ rec_gate_init = rec_gate_init )
100
+
96
101
if dynamics_bonus :
97
102
self .define_dynamics_prediction_rew (convfeat = convfeat , rep_size = rep_size , enlargement = enlargement )
98
103
else :
99
104
self .define_self_prediction_rew (convfeat = convfeat , rep_size = rep_size , enlargement = enlargement )
100
105
101
-
102
-
103
106
pd = self .pdtype .pdfromflat (self .pdparam_rollout )
104
107
self .a_samp = pd .sample ()
105
108
self .nlp_samp = pd .neglogp (self .a_samp )
@@ -110,33 +113,60 @@ def __init__(self, scope, ob_space, ac_space,
110
113
111
114
self .ph_istate = ph_istate
112
115
113
- @ staticmethod
114
- def apply_policy ( ph_ob , ph_new , ph_istate , reuse , scope , hidsize , memsize , extrahid , sy_nenvs , sy_nsteps , pdparamsize , rec_gate_init ):
116
+ def apply_policy ( self , ph_ob , ph_new , ph_istate , reuse , scope , hidsize , memsize ,
117
+ extrahid , sy_nenvs , sy_nsteps , pdparamsize , rec_gate_init ):
115
118
data_format = 'NHWC'
116
119
ph = ph_ob
117
120
assert len (ph .shape .as_list ()) == 5 # B,T,H,W,C
118
121
logger .info ("CnnGruPolicy: using '%s' shape %s as image input" % (ph .name , str (ph .shape )))
119
122
X = tf .cast (ph , tf .float32 ) / 255.
123
+ # (None, 84, 84, 4) in case of MontezumaRevengeNoFrameskip
120
124
X = tf .reshape (X , (- 1 , * ph .shape .as_list ()[- 3 :]))
121
125
122
126
activ = tf .nn .relu
123
127
yes_gpu = any (get_available_gpus ())
124
128
125
129
with tf .variable_scope (scope , reuse = reuse ), tf .device ('/gpu:0' if yes_gpu else '/cpu:0' ):
126
130
X = activ (conv (X , 'c1' , nf = 32 , rf = 8 , stride = 4 , init_scale = np .sqrt (2 ), data_format = data_format ))
127
- X = activ (conv (X , 'c2' , nf = 64 , rf = 4 , stride = 2 , init_scale = np .sqrt (2 ), data_format = data_format ))
128
- X = activ (conv (X , 'c3' , nf = 64 , rf = 4 , stride = 1 , init_scale = np .sqrt (2 ), data_format = data_format ))
131
+ #X = activ(conv(X, 'c2', nf=64, rf=4, stride=2, init_scale=np.sqrt(2), data_format=data_format))
132
+ #X = activ(conv(X, 'c3', nf=64, rf=4, stride=1, init_scale=np.sqrt(2), data_format=data_format))
133
+
134
+ # over 14k rewards with these 2 and only the first conv layer
135
+ # with tf.variable_scope("augmented1"):
136
+ # X = self.augmented_conv2d(X, 256, dk=24, dv=24)
137
+
138
+ # with tf.variable_scope("augmented2"):
139
+ # X = self.augmented_conv2d(X, 256, dk=24, dv=24)
140
+
141
+ # 5.8k rewards 3 levels with these 2 and the first 2 conv layers
142
+ # with tf.variable_scope("augmented1"):
143
+ # X = self.augmented_conv2d(X, 512, dk=256, dv=256)
144
+
145
+ # with tf.variable_scope("augmented2"):
146
+ # X = self.augmented_conv2d(X, 512, dk=256, dv=256)
147
+
148
+ with tf .variable_scope ("augmented1" ):
149
+ X = self .augmented_conv2d (X , 256 , dk = 24 , dv = 24 )
150
+
151
+ with tf .variable_scope ("augmented2" ):
152
+ X = self .augmented_conv2d (X , 256 , dk = 24 , dv = 24 )
153
+
129
154
X = to2d (X )
130
155
X = activ (fc (X , 'fc1' , nh = hidsize , init_scale = np .sqrt (2 )))
131
156
X = tf .reshape (X , [sy_nenvs , sy_nsteps , hidsize ])
132
- X , snext = tf .nn .dynamic_rnn (
133
- GRUCell (memsize , rec_gate_init = rec_gate_init ), (X , ph_new [:,:,None ]),
134
- dtype = tf .float32 , time_major = False , initial_state = ph_istate )
157
+
158
+ X , snext = tf .nn .dynamic_rnn (GRUCell (memsize , rec_gate_init = rec_gate_init ),
159
+ (X , ph_new [:,:,None ]),
160
+ dtype = tf .float32 ,
161
+ time_major = False ,
162
+ initial_state = ph_istate )
163
+
135
164
X = tf .reshape (X , (- 1 , memsize ))
136
165
Xtout = X
137
166
if extrahid :
138
167
Xtout = X + activ (fc (Xtout , 'fc2val' , nh = memsize , init_scale = 0.1 ))
139
168
X = X + activ (fc (X , 'fc2act' , nh = memsize , init_scale = 0.1 ))
169
+
140
170
pdparam = fc (X , 'pd' , nh = pdparamsize , init_scale = 0.01 )
141
171
vpred_int = fc (Xtout , 'vf_int' , nh = 1 , init_scale = 0.01 )
142
172
vpred_ext = fc (Xtout , 'vf_ext' , nh = 1 , init_scale = 0.01 )
@@ -263,9 +293,10 @@ def call(self, dict_obs, new, istate, update_obs_stats=False):
263
293
feed1 = { self .ph_ob [k ]: dict_obs [k ][:,None ] for k in self .ph_ob_keys }
264
294
feed2 = { self .ph_istate : istate , self .ph_new : new [:,None ].astype (np .float32 ) }
265
295
feed1 .update ({self .ph_mean : self .ob_rms .mean , self .ph_std : self .ob_rms .var ** 0.5 })
266
- # for f in feed1:
267
- # print(f)
296
+
268
297
a , vpred_int ,vpred_ext , nlp , newstate , ent = tf .get_default_session ().run (
269
- [self .a_samp , self .vpred_int_rollout ,self .vpred_ext_rollout , self .nlp_samp , self .snext_rollout , self .entropy_rollout ],
298
+ [self .a_samp , self .vpred_int_rollout , self .vpred_ext_rollout , self .nlp_samp , self .snext_rollout , self .entropy_rollout ],
270
299
feed_dict = {** feed1 , ** feed2 })
271
- return a [:,0 ], vpred_int [:,0 ],vpred_ext [:,0 ], nlp [:,0 ], newstate , ent [:,0 ]
300
+
301
+ # return for every env
302
+ return a [:,0 ], vpred_int [:,0 ], vpred_ext [:,0 ], nlp [:,0 ], newstate , ent [:,0 ]
0 commit comments