Add MountainCar DDQN ensembling
hjorvardr committed Nov 20, 2018
1 parent d7153f6 commit 0c1fffd
Showing 1 changed file with 74 additions and 66 deletions.
140 changes: 74 additions & 66 deletions Ensembling/ensembling_mountaincar.py
@@ -5,8 +5,12 @@
import os
import random as ran
import numpy as np
import keras.optimizers
import tensorflow as tf
from qlearning_lib import QLAgent
from keras import backend as K
from keras.layers import Dense
import tensorflow as tf
from dqn_lib import DQNAgent
from ensembler import *

os.environ['PYTHONHASHSEED'] = '0'
@@ -19,106 +23,121 @@ def accuracy(results):
"""
return results[1] / (results[0] + results[1]) * 100
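# e.g. accuracy([20, 80]) returns 80.0 (80 wins out of 100 evaluation episodes)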


def obs_to_state(env, obs, n_states):
""" Maps an observation to state """
env_low = env.observation_space.low
env_high = env.observation_space.high
env_dx = (env_high - env_low) / n_states
a = int((obs[0] - env_low[0]) / env_dx[0])
b = int((obs[1] - env_low[1]) / env_dx[1])
return a, b
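# For illustration: MountainCar-v0 observations span roughly [-1.2, 0.6] in position
# and [-0.07, 0.07] in velocity, so with n_states = 150 an observation of (-0.5, 0.0)
# falls in bins of roughly (58, 75)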


def experiment(n_episodes, default_policy=False, policy=None, render=False):
res = [0,0] # array of results accumulator: {[0]: Loss, [1]: Victory}
res = [0, 0] # array of results accumulator: {[0]: Loss, [1]: Victory}
scores = [] # Cumulative rewards
steps = [] # Steps per episode

env = gym.make('MountainCar-v0')
env.seed(91)
n_states = 150
if (default_policy):
agentE = QLAgent([n_states, n_states, env.action_space.n], policy=policy, epsilon=0.01, epsilon_lower_bound=0.01)
else:
agent0 = QLAgent([n_states, n_states, env.action_space.n], epsilon_decay_function=lambda e: e * 0.995, epsilon_lower_bound=0.01)
agent1 = QLAgent([n_states, n_states, env.action_space.n], epsilon_decay_function=lambda e: e * 0.995, epsilon_lower_bound=0.01)
agent2 = QLAgent([n_states, n_states, env.action_space.n], epsilon_decay_function=lambda e: e * 0.995, epsilon_lower_bound=0.01)
agent3 = QLAgent([n_states, n_states, env.action_space.n], epsilon_decay_function=lambda e: e * 0.995, epsilon_lower_bound=0.01)
agents = [agent1, agent2, agent3]
agentE = EnsemblerAgent(env.action_space.n, agents, EnsemblerType.TRUST_BASED)

input_dim = env.observation_space.shape[0]
output_dim = env.action_space.n

layer1 = Dense(15, input_dim=input_dim, activation='relu')
layer2 = Dense(output_dim)

agent1 = DQNAgent(output_dim, [layer1, layer2], use_ddqn=True, learn_thresh=1000, update_rate=300, epsilon_decay_function=lambda e: e - 0.001, epsilon_lower_bound=0.01, optimizer=keras.optimizers.RMSprop(0.001), tb_dir=None)
agent2 = DQNAgent(output_dim, [layer1, layer2], use_ddqn=True, learn_thresh=1000, update_rate=300, epsilon_decay_function=lambda e: e - 0.001, epsilon_lower_bound=0.01, optimizer=keras.optimizers.RMSprop(0.001), tb_dir=None)
agent3 = DQNAgent(output_dim, [layer1, layer2], use_ddqn=True, learn_thresh=1000, update_rate=300, epsilon_decay_function=lambda e: e - 0.001, epsilon_lower_bound=0.01, optimizer=keras.optimizers.RMSprop(0.001), tb_dir=None)
agent4 = DQNAgent(output_dim, [layer1, layer2], use_ddqn=True, learn_thresh=1000, update_rate=300, epsilon_decay_function=lambda e: e - 0.001, epsilon_lower_bound=0.01, optimizer=keras.optimizers.RMSprop(0.001), tb_dir=None)
agent5 = DQNAgent(output_dim, [layer1, layer2], use_ddqn=True, learn_thresh=1000, update_rate=300, epsilon_decay_function=lambda e: e - 0.001, epsilon_lower_bound=0.01, optimizer=keras.optimizers.RMSprop(0.001), tb_dir=None)
agent6 = DQNAgent(output_dim, [layer1, layer2], use_ddqn=True, learn_thresh=1000, update_rate=300, epsilon_decay_function=lambda e: e - 0.001, epsilon_lower_bound=0.01, optimizer=keras.optimizers.RMSprop(0.001), tb_dir=None)
agent7 = DQNAgent(output_dim, [layer1, layer2], use_ddqn=True, learn_thresh=1000, update_rate=300, epsilon_decay_function=lambda e: e - 0.001, epsilon_lower_bound=0.01, optimizer=keras.optimizers.RMSprop(0.001), tb_dir=None)
agent8 = DQNAgent(output_dim, [layer1, layer2], use_ddqn=True, learn_thresh=1000, update_rate=300, epsilon_decay_function=lambda e: e - 0.001, epsilon_lower_bound=0.01, optimizer=keras.optimizers.RMSprop(0.001), tb_dir=None)
agent9 = DQNAgent(output_dim, [layer1, layer2], use_ddqn=True, learn_thresh=1000, update_rate=300, epsilon_decay_function=lambda e: e - 0.001, epsilon_lower_bound=0.01, optimizer=keras.optimizers.RMSprop(0.001), tb_dir=None)
agent10 = DQNAgent(output_dim, [layer1, layer2], use_ddqn=True, learn_thresh=1000, update_rate=300, epsilon_decay_function=lambda e: e - 0.001, epsilon_lower_bound=0.01, optimizer=keras.optimizers.RMSprop(0.001), tb_dir=None)
agents = [agent1, agent2, agent3, agent4, agent5, agent6, agent7, agent8]
agentE = EnsemblerAgent(env.action_space.n, agents, EnsemblerType.MAJOR_VOTING_BASED)
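# Eight identically configured DDQN members vote on every action; with
# MAJOR_VOTING_BASED the EnsemblerAgent presumably returns the action proposed by
# the most members (see ensembler.py). agent9 and agent10 are built but not added.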

evaluate = False

for i_episode in tqdm(range(n_episodes + 1), desc="Episode"):
state_original = env.reset()

state = obs_to_state(env, state_original, n_states)
state = env.reset()
cumulative_reward = 0

state = np.reshape(state, [1, 2])

if i_episode > 0 and i_episode % 1000 == 0:
if i_episode > 0 and i_episode % 120 == 0:
evaluate = True
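# Setting the flag sends this iteration into the evaluation branch below, so an
# evaluation pass runs every 120 training episodes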

if evaluate == False:
for t in range(env._max_episode_steps):
if (render):
env.render()

next_action = agentE.act((state[0], state[1]))
state_original, reward, end, _ = env.step(next_action)
new_state = obs_to_state(env, state_original, n_states)
next_action = agentE.act(state)
new_state, reward, end, _ = env.step(next_action)
original_state = new_state

# r1 = reward + 0.1 * original_state[0]
# r2 = reward + 0.2 * np.sin(3 * original_state[0])
# r3 = reward + 0.7 * (original_state[1] * original_state[1])

r1 = reward + original_state[0]
r2 = reward + np.sin(3 * original_state[0])
r3 = reward + (original_state[1] * original_state[1])
r4 = abs(new_state[0] - (-0.5)) # r in [0, 1]
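# Per-member reward shaping: r1 adds the car position (progress toward the goal on
# the right), r2 adds the track height (the MountainCar hill profile follows
# sin(3 * x)), r3 adds the squared velocity (a kinetic-energy bonus), and r4 is the
# distance from x = -0.5, roughly the bottom of the valley (computed but unused below)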

new_state = np.reshape(new_state, [1, 2])

agent1.memoise((state, next_action, r1, new_state, end))
agent2.memoise((state, next_action, r2, new_state, end))
agent3.memoise((state, next_action, r1, new_state, end))
agent4.memoise((state, next_action, r2, new_state, end))
agent5.memoise((state, next_action, r1, new_state, end))
agent6.memoise((state, next_action, r2, new_state, end))
agent7.memoise((state, next_action, r1, new_state, end))
agent8.memoise((state, next_action, r2, new_state, end))
#agent9.memoise((state, next_action, r1, new_state, end))
#agent10.memoise((state, next_action, r2, new_state, end))
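# Every member stores the same transition but with its own shaped reward:
# odd-numbered agents use r1, even-numbered agents use r2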

if default_policy is False:
#agent0.update_q((state[0], state[1]), (new_state[0], new_state[1]), next_action, reward)
agent1.update_q((state[0], state[1]), (new_state[0], new_state[1]), next_action, reward + 0.1 * state_original[0])
agent2.update_q((state[0], state[1]), (new_state[0], new_state[1]), next_action, reward + 0.2 * np.sin(3 * state_original[0]))
agent3.update_q((state[0], state[1]), (new_state[0], new_state[1]), next_action, reward + 0.7 * (state_original[1] * state_original[1]))

if end:
if t == env._max_episode_steps - 1:
res[0] += 1
else:
res[1] += 1

print("ENTRATO!,", t, "steps","reward: ", cumulative_reward)

steps.append(t)
break
else:
state = new_state
cumulative_reward += reward

for agent in agentE.agents:
agent.learn()
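# Each member runs its own replay update after every step; with use_ddqn=True the
# online network is presumably evaluated against a target network synced every
# update_rate steps once learn_thresh experiences have been collected (see dqn_lib)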

cumulative_reward += reward
scores.append(cumulative_reward)
env.close()
cumulative_reward += reward
scores.append(cumulative_reward)
else:
evaluate = False
eval_res = [0,0] # array of results accumulator: {[0]: Loss, [1]: Victory}
eval_res = [0, 0] # array of results accumulator: {[0]: Loss, [1]: Victory}
eval_scores = [] # Cumulative rewards
eval_steps = [] # Steps per episode

for i_episode in range(500):
state_original = env.reset()
for i_episode in range(100):
state = env.reset()

state = obs_to_state(env, state_original, n_states)
state = np.reshape(state, [1, 2])
cumulative_reward = 0

for t in range(env._max_episode_steps):
if (render):
env.render()

next_action = agentE.act((state[0], state[1]))
state_original, reward, end, _ = env.step(next_action)
new_state = obs_to_state(env, state_original, n_states)
next_action = agentE.act(state)
new_state, reward, end, _ = env.step(next_action)

new_state = np.reshape(new_state, [1, 2])

if end:
if t == env._max_episode_steps - 1:
eval_res[0] += 1
else:
eval_res[1] += 1


eval_steps.append(t)
break
else:
@@ -127,36 +146,25 @@ def experiment(n_episodes, default_policy=False, policy=None, render=False):

cumulative_reward += reward
eval_scores.append(cumulative_reward)
env.close()

testing_accuracy = accuracy(np.array(eval_res))
testing_mean_steps = np.array(eval_steps).mean()
testing_mean_score = np.array(eval_scores).mean()
print("\nTraining episodes:", len(steps), "Training mean score:", np.array(steps).mean(), "Training mean steps", np.array(scores).mean(), "\nAccuracy:", testing_accuracy, "Test mean score:", testing_mean_score, "Test mean steps:", testing_mean_steps)

return 0 # {"results": np.array(res), "steps": np.array(eval_steps), "scores": np.array(eval_scores), "Q": agent0.Q}
env.close()
return {"results": np.array(res), "steps": np.array(steps), "scores": np.array(scores)}


# Training
train_res = experiment(30000)
#learnt_policy = np.argmax(train_res["Q"], axis=2)
#training_mean_steps = train_res["steps"].mean()
#training_mean_score = train_res["scores"].mean()
#np.save('ql_policy.npy', learnt_policy)

# np.savetxt("results/ql.csv", train_res["steps"], delimiter=',')

# Testing
#test_agent = np.load('ql_policy.npy')
#test_res = experiment(500, default_policy=True, policy=test_agent)
#testing_accuracy = accuracy(test_res["results"])
#testing_mean_steps = test_res["steps"].mean()
#testing_mean_score = test_res["scores"].mean()
train_res = experiment(205)
training_mean_steps = train_res["steps"].mean()
training_mean_score = train_res["scores"].mean()

# np.savetxt("results/ql_test.csv", test_res["steps"], delimiter=',')
np.savetxt("results/ens_agent8_major.csv", train_res["steps"], delimiter=',')

#print("Training episodes:", len(train_res["steps"]), "Training mean score:", training_mean_score, \
#"Training mean steps", training_mean_steps, "\nAccuracy:", testing_accuracy, "Test mean score:", testing_mean_score, "Test mean steps:", testing_mean_steps)
print("Training episodes:", len(train_res["steps"]), "Training mean score:", training_mean_score, \
"Training mean steps", training_mean_steps)

# Rendering
#experiment(2, 200, default_policy=True, policy=learnt_policy, render=True)
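
For reference, the MAJOR_VOTING_BASED strategy used above can be pictured as plain majority voting over each member's proposed action. The sketch below is an illustration only, with a hypothetical majority_vote_action helper; the actual selection logic lives in EnsemblerAgent (ensembler.py), which this diff does not show:

import numpy as np

def majority_vote_action(agents, state, n_actions):
    # Collect one vote per ensemble member for the current state
    votes = np.zeros(n_actions, dtype=int)
    for agent in agents:
        votes[agent.act(state)] += 1
    # The most voted action wins; ties go to the lowest action index
    return int(np.argmax(votes))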
