First code review
Code unified between experiments, style formatting applied, unused files deleted.
GiBg1aN committed Dec 1, 2018
1 parent 63e4efd commit 7a6250e
Showing 30 changed files with 169 additions and 704 deletions.
42 changes: 18 additions & 24 deletions Breakout/Breakout_DQN.py
@@ -1,15 +1,15 @@
import random as ran
import os
import random as ran
import time
import gym
import numpy as np
import tensorflow as tf
from keras import backend as K
from keras.initializers import VarianceScaling
from keras.layers import Dense, Flatten
from keras.layers.convolutional import Conv2D
import numpy as np
from skimage.color import rgb2gray
from skimage.transform import resize
import tensorflow as tf
from tqdm import tqdm
from dqn_lib import DQNAgent

@@ -22,11 +22,7 @@ def pre_processing(observe):
return processed_observe


# 0: stay
# 1: start
# 2: right
# 3: left

# 0: stay, 1: start, 2: right, 3: left
def experiment(n_episodes, max_action, default_policy=False, policy=None, render=False):

with tf.device('/gpu:0'):
@@ -36,24 +32,18 @@ def experiment(n_episodes, max_action, default_policy=False, policy=None, render

env = gym.make('BreakoutDeterministic-v4')

# if default_policy:
# env._max_episode_steps = 5000000
# else:
# env._max_episode_steps = 1000000

input_dim = env.observation_space.shape[0]
output_dim = env.action_space.n

layers = [Conv2D(32, (8, 8), strides=(4, 4), activation='relu', input_shape=(84, 84, 4), kernel_initializer=VarianceScaling(scale=2.0)),
Conv2D(64, (4, 4), strides=(2, 2), activation='relu', kernel_initializer=VarianceScaling(scale=2.0)),
Conv2D(64, (3, 3), strides=(1, 1), activation='relu', kernel_initializer=VarianceScaling(scale=2.0)),
Flatten(),
Dense(512, activation='relu', kernel_initializer=VarianceScaling(scale=2.0)),
Dense(output_dim)]

if default_policy:
agent = DQNAgent(output_dim, None, use_ddqn=True, default_policy=True, model_filename=policy, epsilon=0.05, epsilon_lower_bound=0.05)
else:
layers = [Conv2D(32, (8, 8), strides=(4, 4), activation='relu', input_shape=(84, 84, 4), kernel_initializer=VarianceScaling(scale=2.0)),
Conv2D(64, (4, 4), strides=(2, 2), activation='relu', kernel_initializer=VarianceScaling(scale=2.0)),
Conv2D(64, (3, 3), strides=(1, 1), activation='relu', kernel_initializer=VarianceScaling(scale=2.0)),
Flatten(),
Dense(512, activation='relu', kernel_initializer=VarianceScaling(scale=2.0)),
Dense(output_dim)]
agent = DQNAgent(output_dim, layers, use_ddqn=True, memory_size=720000, gamma=0.99)

gathered_frame = 0
@@ -65,6 +55,7 @@ def experiment(n_episodes, max_action, default_policy=False, policy=None, render

has_lost_life = True
start_life = env.unwrapped.ale.lives()

t = 0
while True:
if has_lost_life:
@@ -80,16 +71,20 @@ def experiment(n_episodes, max_action, default_policy=False, policy=None, render
new_stack = np.append(new_state, stack[:, :, :, :3], axis=3)
agent.memoise((stack, next_action, reward, new_state, end))
stack = new_stack

if (render):
env.render()

has_lost_life = False

next_action = agent.act(stack)
new_state, reward, end, info = env.step(next_action)

if (render):
env.render()

reward = np.clip(reward, -1., 1.)

if info['ale.lives'] < start_life:
has_lost_life = True
start_life = info['ale.lives']
@@ -120,14 +115,13 @@ def experiment(n_episodes, max_action, default_policy=False, policy=None, render
if episode_number >= 100 and episode_number % 50 == 0:
model_name = "partial_model_breakout" + str(episode_number)
agent.save_model(model_name)


env.close()
return {"results": np.array(res), "steps": np.array(steps), "scores": np.array(scores), "agent": agent}

# Training
#res = experiment(100000, 10000000, render=False)
#res["agent"].save_model("final_model")
res = experiment(100000, 10000000, render=False)
res["agent"].save_model("ddqn")

# Testing
res = experiment(20, 10000000, render=True, default_policy=True, policy="partial_model_breakout12250")
res = experiment(20, 10000000, render=True, default_policy=True, policy="ddqn")
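For orientation: the Breakout hunks above turn every raw frame into an 84x84 grayscale image and feed the network the four most recent frames stacked along the channel axis, which is the (1, 84, 84, 4) input the Conv2D layers expect. A minimal sketch of that pattern follows; the body of pre_processing is collapsed in this diff, so its grayscale-and-resize line is an assumption consistent with the skimage imports, and push_frame is an illustrative helper rather than a function from the repository.

import numpy as np
from skimage.color import rgb2gray
from skimage.transform import resize

def pre_processing(observe):
    # Assumed body (collapsed in the diff): grayscale, downscale to 84x84, store as uint8.
    return np.uint8(resize(rgb2gray(observe), (84, 84), mode='constant') * 255)

def push_frame(stack, observe):
    # Newest frame goes in front and the oldest of the four is dropped,
    # mirroring np.append(new_state, stack[:, :, :, :3], axis=3) above.
    frame = pre_processing(observe).reshape(1, 84, 84, 1)
    return np.append(frame, stack[:, :, :, :3], axis=3)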
35 changes: 11 additions & 24 deletions CartPole/dqn_cart_pole.py
@@ -1,27 +1,17 @@
import os
import time
import gym
import numpy as np
import keras.optimizers
import tensorflow as tf
from keras import backend as K
from keras.layers import Dense
import numpy as np
import tensorflow as tf
from tqdm import tqdm
from dqn_lib import DQNAgent

os.environ['PYTHONHASHSEED'] = '0'

seed = 73
# The below is necessary for starting Numpy generated random numbers
# in a well-defined initial state.

np.random.seed(seed)

# The below is necessary for starting core Python generated random numbers
# in a well-defined state.

# random.seed(seed)

tf.set_random_seed(seed)


@@ -43,14 +33,16 @@ def experiment(n_episodes, default_policy=False, policy=None, render = False):

input_dim = env.observation_space.shape[0]
output_dim = env.action_space.n

layer1 = Dense(10, input_dim=input_dim, activation='relu')
layer2 = Dense(output_dim)

if default_policy:
agent = DQNAgent(output_dim, None, use_ddqn=True, default_policy=True, model_filename=policy, epsilon=0, epsilon_lower_bound=0, learn_thresh=0, tb_dir=None)
agent = DQNAgent(output_dim, None, use_ddqn=True, default_policy=True, model_filename=policy,
epsilon=0, epsilon_lower_bound=0, learn_thresh=0, tb_dir=None)
else:
agent = DQNAgent(output_dim, [layer1, layer2], use_ddqn=True, learn_thresh=2000, update_rate=100, epsilon_decay_function=lambda e: e - 0.001, epsilon_lower_bound=0.1, optimizer=keras.optimizers.RMSprop(0.001), memory_size=2000, tb_dir=None)
layer1 = Dense(10, input_dim=input_dim, activation='relu')
layer2 = Dense(output_dim)
agent = DQNAgent(output_dim, [layer1, layer2], use_ddqn=True, learn_thresh=2000, update_rate=100,
epsilon_decay_function=lambda e: e - 0.001, epsilon_lower_bound=0.1,
optimizer=keras.optimizers.RMSprop(0.001), memory_size=2000, tb_dir=None)

for _ in tqdm(range(n_episodes), desc="Episode"):
state = env.reset()
@@ -65,7 +57,6 @@ def experiment(n_episodes, default_policy=False, policy=None, render = False):
time.sleep(0.1)

next_action = agent.act(state)

new_state, reward, end, _ = env.step(next_action)

x, x_dot, theta, theta_dot = new_state
@@ -75,19 +66,15 @@ def experiment(n_episodes, default_policy=False, policy=None, render = False):
r2 = (env.theta_threshold_radians - abs(theta)) / env.theta_threshold_radians - 0.5
r3 = -abs(theta_dot)
reward = r1 + r2 + r3
# reward = r1 + r2
# reward = r1

agent.memoise((state, next_action, reward, new_state, end))

if end or t > 199:
if t < 195:
res[0] += 1
#reward = reward - 100
#memory.append((state, next_action, reward, new_state, end))
else:
res[1] += 1
print("ENTRATO!,", t, "steps","reward: ",cumulative_reward)
# print("ENTRATO!,", t, "steps","reward: ",cumulative_reward)

steps.append(t)
break
@@ -124,4 +111,4 @@ def experiment(n_episodes, default_policy=False, policy=None, render = False):
"Training mean steps", training_mean_steps, "\nAccuracy:", testing_accuracy, "Test mean score:", testing_mean_score, "Test mean steps:", testing_mean_steps)

# Rendering
# experiment(1, render=True, default_policy=True, policy="model1")
# experiment(1, render=True, default_policy=True, policy="model_cp")
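The CartPole script keeps a shaped reward in place of the environment's constant +1: a pole-angle term r2, an angular-velocity penalty r3, and a cart-position term r1 whose definition sits just above the displayed hunk. An illustrative restatement of the visible terms (r1 is passed in rather than guessed):

def shaped_reward(env, new_state, r1):
    # r2 rewards keeping the pole angle well inside the failure threshold,
    # r3 penalises fast angular motion; r1 is computed above the displayed
    # hunk and is taken as given here.
    x, x_dot, theta, theta_dot = new_state
    r2 = (env.theta_threshold_radians - abs(theta)) / env.theta_threshold_radians - 0.5
    r3 = -abs(theta_dot)
    return r1 + r2 + r3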
25 changes: 7 additions & 18 deletions Ensembling/ensembling_cart_pole.py
@@ -1,28 +1,18 @@
import os
import time
import gym
import numpy as np
import keras.optimizers
import tensorflow as tf
from keras import backend as K
from keras.layers import Dense
import keras.optimizers
import numpy as np
import tensorflow as tf
from tqdm import tqdm
from dqn_lib import DQNAgent
from ensembler import *

os.environ['PYTHONHASHSEED'] = '0'

seed = 73
# The below is necessary for starting Numpy generated random numbers
# in a well-defined initial state.

np.random.seed(seed)

# The below is necessary for starting core Python generated random numbers
# in a well-defined state.

# random.seed(seed)

tf.set_random_seed(seed)


@@ -32,6 +22,7 @@ def accuracy(results):
"""
return results[1] / (results[0] + results[1]) * 100


def evaluate(env, agentE):
eval_steps = []
eval_scores = []
@@ -67,8 +58,6 @@ def evaluate(env, agentE):
training_mean_steps = np.array(eval_steps).mean()
training_mean_score = np.array(eval_scores).mean()



print("\nEval episodes:", 200, "Eval mean score:", training_mean_score, \
"Eval mean steps", training_mean_steps, "accuracy:",accuracy(eval_res))

@@ -77,7 +66,6 @@ def evaluate(env, agentE):
return False



def experiment(n_episodes, default_policy=False, policy=None, render = False):
res = [0, 0] # array of results accumulator: {[0]: Loss, [1]: Victory}
scores = [] # Cumulative rewards
@@ -106,8 +94,9 @@ def experiment(n_episodes, default_policy=False, policy=None, render = False):
agent8 = DQNAgent(output_dim, [layer1, layer2], use_ddqn=True, learn_thresh=2000, update_rate=100, epsilon_decay_function=lambda e: e - 0.0001, epsilon_lower_bound=0.1, optimizer=keras.optimizers.RMSprop(0.001), memory_size=2000, tb_dir=None)
agent9 = DQNAgent(output_dim, [layer1, layer2], use_ddqn=True, learn_thresh=2000, update_rate=100, epsilon_decay_function=lambda e: e - 0.0001, epsilon_lower_bound=0.1, optimizer=keras.optimizers.RMSprop(0.001), memory_size=2000, tb_dir=None)
agent10 = DQNAgent(output_dim, [layer1, layer2], use_ddqn=True, learn_thresh=2000, update_rate=100, epsilon_decay_function=lambda e: e - 0.0001, epsilon_lower_bound=0.1, optimizer=keras.optimizers.RMSprop(0.001), memory_size=2000, tb_dir=None)
agents = [agent1, agent2, agent3, agent4, agent5, agent6, agent7, agent8, agent9, agent10]

agentE = EnsemblerAgent(output_dim, [agent1, agent2, agent3, agent4, agent5, agent6, agent7], EnsemblerType.TRUST_BASED)
agentE = EnsemblerAgent(output_dim, agents, EnsemblerType.TRUST_BASED)

for i_ep in tqdm(range(n_episodes), desc="Episode"):
state = env.reset()
@@ -178,7 +167,7 @@ def experiment(n_episodes, default_policy=False, policy=None, render = False):
training_mean_steps = train_res["steps"].mean()
training_mean_score = train_res["scores"].mean()

np.savetxt("results/ens_agents10_trust.csv", train_res["steps"], delimiter=',')
# np.savetxt("results/ens_agents10_trust.csv", train_res["steps"], delimiter=',')

print("Training episodes:", len(train_res["steps"]), "Training mean score:", training_mean_score, \
"Training mean steps", training_mean_steps)
29 changes: 14 additions & 15 deletions Ensembling/ensembling_mountaincar.py
@@ -1,35 +1,35 @@
import os
import time
import gym
import numpy as np
from tqdm import tqdm
import os
import random as ran
import numpy as np
import keras.optimizers
import tensorflow as tf
from keras import backend as K
from keras.layers import Dense
import keras.optimizers
import numpy as np
import tensorflow as tf
from tqdm import tqdm
from dqn_lib import DQNAgent
from ensembler import *

seed = 91
os.environ['PYTHONHASHSEED'] = '0'
np.random.seed(91)
tf.set_random_seed(91)
np.random.seed(seed)
tf.set_random_seed(seed)


def accuracy(results):
"""
Evaluate the accuracy of results, considering victories and defeats.
"""
return results[1] / (results[0] + results[1]) * 100


def experiment(n_episodes, default_policy=False, policy=None, render=False):
res = [0, 0] # array of results accumulator: {[0]: Loss, [1]: Victory}
scores = [] # Cumulative rewards
steps = [] # Steps per episode

env = gym.make('MountainCar-v0')
env.seed(91)
env.seed(seed)

input_dim = env.observation_space.shape[0]
output_dim = env.action_space.n
@@ -47,7 +47,7 @@ def experiment(n_episodes, default_policy=False, policy=None, render=False):
agent8 = DQNAgent(output_dim, [layer1, layer2], use_ddqn=True, learn_thresh=1000, update_rate=300, epsilon_decay_function=lambda e: e - 0.001, epsilon_lower_bound=0.01, optimizer=keras.optimizers.RMSprop(0.001), tb_dir=None)
agent9 = DQNAgent(output_dim, [layer1, layer2], use_ddqn=True, learn_thresh=1000, update_rate=300, epsilon_decay_function=lambda e: e - 0.001, epsilon_lower_bound=0.01, optimizer=keras.optimizers.RMSprop(0.001), tb_dir=None)
agent10 = DQNAgent(output_dim, [layer1, layer2], use_ddqn=True, learn_thresh=1000, update_rate=300, epsilon_decay_function=lambda e: e - 0.001, epsilon_lower_bound=0.01, optimizer=keras.optimizers.RMSprop(0.001), tb_dir=None)
agents = [agent1, agent2, agent3, agent4, agent5, agent6, agent7, agent8]
agents = [agent1, agent2, agent3, agent4, agent5, agent6, agent7, agent8, agent9, agent10]
agentE = EnsemblerAgent(env.action_space.n, agents, EnsemblerType.MAJOR_VOTING_BASED)

evaluate = False
@@ -89,9 +89,8 @@ def experiment(n_episodes, default_policy=False, policy=None, render=False):
agent6.memoise((state, next_action, r2, new_state, end))
agent7.memoise((state, next_action, r1, new_state, end))
agent8.memoise((state, next_action, r2, new_state, end))
#agent9.memoise((state, next_action, r1, new_state, end))
#agent10.memoise((state, next_action, r2, new_state, end))

agent9.memoise((state, next_action, r1, new_state, end))
agent10.memoise((state, next_action, r2, new_state, end))

if end:
if t == env._max_episode_steps - 1:
@@ -161,7 +160,7 @@ def experiment(n_episodes, default_policy=False, policy=None, render=False):
training_mean_steps = train_res["steps"].mean()
training_mean_score = train_res["scores"].mean()

np.savetxt("results/ens_agent8_major.csv", train_res["steps"], delimiter=',')
# np.savetxt("results/ens_agent8_major.csv", train_res["steps"], delimiter=',')

print("Training episodes:", len(train_res["steps"]), "Training mean score:", training_mean_score, \
"Training mean steps", training_mean_steps)
(Diffs for the remaining 26 of the 30 changed files were not loaded on this page.)