9
9
10
10
def to2d(x):
    """Flatten `x` to 2-D, preserving the leading (batch) dimension.

    Args:
        x: a tf.Tensor of rank >= 2 whose trailing dimensions are
           statically known (Dimension.value must not be None —
           NOTE(review): original code assumes this; confirm callers
           always pass fully-defined shapes).

    Returns:
        A tf.Tensor of shape (-1, prod(x.shape[1:])).
    """
    size = 1
    # TF1 API: iterating get_shape() yields Dimension objects whose
    # `.value` is the static int size.
    for shapel in x.get_shape()[1:]:
        size *= shapel.value
    return tf.reshape(x, (-1, size))
14
16
15
17
@@ -40,13 +42,15 @@ def call(self, inputs, state):
40
42
h = m * h + (1.0 - m ) * htil
41
43
return h , h
42
44
45
+
43
46
class CnnGruPolicy (StochasticPolicy ):
44
47
def __init__ (self , scope , ob_space , ac_space ,
45
- policy_size = 'normal' , maxpool = False , extrahid = True , hidsize = 128 , memsize = 128 , rec_gate_init = 0.0 ,
48
+ policy_size = 'normal' , maxpool = False , extrahid = True ,
49
+ hidsize = 128 , memsize = 128 , rec_gate_init = 0.0 ,
46
50
update_ob_stats_independently_per_gpu = True ,
47
51
proportion_of_exp_used_for_predictor_update = 1. ,
48
- dynamics_bonus = False ,
49
- ):
52
+ dynamics_bonus = False ):
53
+
50
54
StochasticPolicy .__init__ (self , scope , ob_space , ac_space )
51
55
self .proportion_of_exp_used_for_predictor_update = proportion_of_exp_used_for_predictor_update
52
56
enlargement = {
@@ -61,7 +65,8 @@ def __init__(self, scope, ob_space, ac_space,
61
65
hidsize *= enlargement
62
66
convfeat = 16 * enlargement
63
67
self .ob_rms = RunningMeanStd (shape = list (ob_space .shape [:2 ])+ [1 ], use_mpi = not update_ob_stats_independently_per_gpu )
64
- ph_istate = tf .placeholder (dtype = tf .float32 ,shape = (None ,memsize ), name = 'state' )
68
+
69
+ ph_istate = tf .placeholder (dtype = tf .float32 , shape = (None , memsize ), name = 'state' )
65
70
pdparamsize = self .pdtype .param_shape ()[0 ]
66
71
self .memsize = memsize
67
72
@@ -77,8 +82,8 @@ def __init__(self, scope, ob_space, ac_space,
77
82
sy_nenvs = self .sy_nenvs ,
78
83
sy_nsteps = self .sy_nsteps - 1 ,
79
84
pdparamsize = pdparamsize ,
80
- rec_gate_init = rec_gate_init
81
- )
85
+ rec_gate_init = rec_gate_init )
86
+
82
87
self .pdparam_rollout , self .vpred_int_rollout , self .vpred_ext_rollout , self .snext_rollout = \
83
88
self .apply_policy (self .ph_ob [None ],
84
89
ph_new = self .ph_new ,
@@ -91,15 +96,13 @@ def __init__(self, scope, ob_space, ac_space,
91
96
sy_nenvs = self .sy_nenvs ,
92
97
sy_nsteps = self .sy_nsteps ,
93
98
pdparamsize = pdparamsize ,
94
- rec_gate_init = rec_gate_init
95
- )
99
+ rec_gate_init = rec_gate_init )
100
+
96
101
if dynamics_bonus :
97
102
self .define_dynamics_prediction_rew (convfeat = convfeat , rep_size = rep_size , enlargement = enlargement )
98
103
else :
99
104
self .define_self_prediction_rew (convfeat = convfeat , rep_size = rep_size , enlargement = enlargement )
100
105
101
-
102
-
103
106
pd = self .pdtype .pdfromflat (self .pdparam_rollout )
104
107
self .a_samp = pd .sample ()
105
108
self .nlp_samp = pd .neglogp (self .a_samp )
@@ -110,33 +113,60 @@ def __init__(self, scope, ob_space, ac_space,
110
113
111
114
self .ph_istate = ph_istate
112
115
113
- @ staticmethod
114
- def apply_policy ( ph_ob , ph_new , ph_istate , reuse , scope , hidsize , memsize , extrahid , sy_nenvs , sy_nsteps , pdparamsize , rec_gate_init ):
116
+ def apply_policy ( self , ph_ob , ph_new , ph_istate , reuse , scope , hidsize , memsize ,
117
+ extrahid , sy_nenvs , sy_nsteps , pdparamsize , rec_gate_init ):
115
118
data_format = 'NHWC'
116
119
ph = ph_ob
117
120
assert len (ph .shape .as_list ()) == 5 # B,T,H,W,C
118
121
logger .info ("CnnGruPolicy: using '%s' shape %s as image input" % (ph .name , str (ph .shape )))
119
122
X = tf .cast (ph , tf .float32 ) / 255.
123
+ # (None, 84, 84, 4) in case of MontezumaRevengeNoFrameskip
120
124
X = tf .reshape (X , (- 1 , * ph .shape .as_list ()[- 3 :]))
121
125
122
126
activ = tf .nn .relu
123
127
yes_gpu = any (get_available_gpus ())
124
128
125
129
with tf .variable_scope (scope , reuse = reuse ), tf .device ('/gpu:0' if yes_gpu else '/cpu:0' ):
126
130
X = activ (conv (X , 'c1' , nf = 32 , rf = 8 , stride = 4 , init_scale = np .sqrt (2 ), data_format = data_format ))
127
- X = activ (conv (X , 'c2' , nf = 64 , rf = 4 , stride = 2 , init_scale = np .sqrt (2 ), data_format = data_format ))
128
- X = activ (conv (X , 'c3' , nf = 64 , rf = 4 , stride = 1 , init_scale = np .sqrt (2 ), data_format = data_format ))
131
+ #X = activ(conv(X, 'c2', nf=64, rf=4, stride=2, init_scale=np.sqrt(2), data_format=data_format))
132
+ #X = activ(conv(X, 'c3', nf=64, rf=4, stride=1, init_scale=np.sqrt(2), data_format=data_format))
133
+
134
+ # over 14k rewards with these 2 and only the first conv layer
135
+ # with tf.variable_scope("augmented1"):
136
+ # X = self.augmented_conv2d(X, 256, dk=24, dv=24)
137
+
138
+ # with tf.variable_scope("augmented2"):
139
+ # X = self.augmented_conv2d(X, 256, dk=24, dv=24)
140
+
141
+ # 5.8k rewards 3 levels with these 2 and the first 2 conv layers
142
+ # with tf.variable_scope("augmented1"):
143
+ # X = self.augmented_conv2d(X, 512, dk=256, dv=256)
144
+
145
+ # with tf.variable_scope("augmented2"):
146
+ # X = self.augmented_conv2d(X, 512, dk=256, dv=256)
147
+
148
+ with tf .variable_scope ("augmented1" ):
149
+ X = self .augmented_conv2d (X , 256 , dk = 24 , dv = 24 )
150
+
151
+ with tf .variable_scope ("augmented2" ):
152
+ X = self .augmented_conv2d (X , 256 , dk = 24 , dv = 24 )
153
+
129
154
X = to2d (X )
130
155
X = activ (fc (X , 'fc1' , nh = hidsize , init_scale = np .sqrt (2 )))
131
156
X = tf .reshape (X , [sy_nenvs , sy_nsteps , hidsize ])
132
- X , snext = tf .nn .dynamic_rnn (
133
- GRUCell (memsize , rec_gate_init = rec_gate_init ), (X , ph_new [:,:,None ]),
134
- dtype = tf .float32 , time_major = False , initial_state = ph_istate )
157
+
158
+ X , snext = tf .nn .dynamic_rnn (GRUCell (memsize , rec_gate_init = rec_gate_init ),
159
+ (X , ph_new [:,:,None ]),
160
+ dtype = tf .float32 ,
161
+ time_major = False ,
162
+ initial_state = ph_istate )
163
+
135
164
X = tf .reshape (X , (- 1 , memsize ))
136
165
Xtout = X
137
166
if extrahid :
138
167
Xtout = X + activ (fc (Xtout , 'fc2val' , nh = memsize , init_scale = 0.1 ))
139
168
X = X + activ (fc (X , 'fc2act' , nh = memsize , init_scale = 0.1 ))
169
+
140
170
pdparam = fc (X , 'pd' , nh = pdparamsize , init_scale = 0.01 )
141
171
vpred_int = fc (Xtout , 'vf_int' , nh = 1 , init_scale = 0.01 )
142
172
vpred_ext = fc (Xtout , 'vf_ext' , nh = 1 , init_scale = 0.01 )
@@ -263,9 +293,10 @@ def call(self, dict_obs, new, istate, update_obs_stats=False):
263
293
feed1 = { self .ph_ob [k ]: dict_obs [k ][:,None ] for k in self .ph_ob_keys }
264
294
feed2 = { self .ph_istate : istate , self .ph_new : new [:,None ].astype (np .float32 ) }
265
295
feed1 .update ({self .ph_mean : self .ob_rms .mean , self .ph_std : self .ob_rms .var ** 0.5 })
266
- # for f in feed1:
267
- # print(f)
296
+
268
297
a , vpred_int ,vpred_ext , nlp , newstate , ent = tf .get_default_session ().run (
269
- [self .a_samp , self .vpred_int_rollout ,self .vpred_ext_rollout , self .nlp_samp , self .snext_rollout , self .entropy_rollout ],
298
+ [self .a_samp , self .vpred_int_rollout , self .vpred_ext_rollout , self .nlp_samp , self .snext_rollout , self .entropy_rollout ],
270
299
feed_dict = {** feed1 , ** feed2 })
271
- return a [:,0 ], vpred_int [:,0 ],vpred_ext [:,0 ], nlp [:,0 ], newstate , ent [:,0 ]
300
+
301
+ # return for every env
302
+ return a [:,0 ], vpred_int [:,0 ], vpred_ext [:,0 ], nlp [:,0 ], newstate , ent [:,0 ]
0 commit comments