
Commit

Add mixed mountaincar ensembling
hjorvardr committed Nov 26, 2018
1 parent 0c1fffd commit e5be340
Showing 6 changed files with 200 additions and 4 deletions.
15 changes: 14 additions & 1 deletion Ensembling/ensembler.py
@@ -32,9 +32,13 @@ def __init__(self, output_size, agents, ensembler_type):



def act(self, state):
def act(self, state, discrete_state=None):
original_state = state
if self.ensembler_type == EnsemblerType.MAJOR_VOTING_BASED:
for agent in self.agents:
state = original_state
if agent.discrete_state:
state = discrete_state
suggested_action = agent.act(state)
self.votes[suggested_action] += 1
action = np.random.choice(np.argwhere(self.votes==np.amax(self.votes)).flatten())
@@ -44,6 +48,9 @@ def act(self, state):
if self.ensembler_type == EnsemblerType.TRUST_BASED:
for i in range(len(self.agents)):
agent = self.agents[i]
state = original_state
if agent.discrete_state:
state = discrete_state
suggested_action = agent.act(state)
self.votes[suggested_action] += self.trust[i]

@@ -53,6 +60,9 @@ def act(self, state):

for i in range(len(self.agents)):
agent = self.agents[i]
state = original_state
if agent.discrete_state:
state = discrete_state
suggested_action = agent.act(state)
if action == suggested_action:
self.votes_per_agent[i] += 1
@@ -61,6 +71,9 @@ def act(self, state):

if self.ensembler_type == EnsemblerType.RANK_VOTING_BASED:
for agent in self.agents:
state = original_state
if agent.discrete_state:
state = discrete_state
suggested_action, prediction = agent.act(state, True)
# rank prediction actions
temp = prediction.argsort()
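The hunks above extend EnsemblerAgent.act to take an optional discrete_state alongside the raw observation, so one ensemble can mix tabular agents (QLAgent, SARSAAgent) with a DQN: each agent is handed the representation it declares through its discrete_state flag. Below is a minimal sketch of that dispatch pattern with stand-in agents — not the repository's EnsemblerAgent, just an illustration of the majority-voting branch, assuming each agent exposes a boolean discrete_state and an act(state) method.

import numpy as np

class StubTabularAgent:
    """Stand-in for QLAgent/SARSAAgent: expects a discretized (i, j) state."""
    discrete_state = True

    def __init__(self, q_table):
        self.q_table = q_table

    def act(self, state):
        return int(np.argmax(self.q_table[state]))

class StubNeuralAgent:
    """Stand-in for DQNAgent: expects the raw continuous observation."""
    discrete_state = False

    def __init__(self, weights):
        self.weights = weights

    def act(self, state):
        return int(np.argmax(state @ self.weights))

def majority_vote(agents, n_actions, state, discrete_state):
    """Give each agent its preferred state representation, then majority-vote."""
    votes = np.zeros(n_actions)
    for agent in agents:
        chosen = discrete_state if agent.discrete_state else state
        votes[agent.act(chosen)] += 1
    # Break ties uniformly at random, as the ensembler above does.
    return int(np.random.choice(np.argwhere(votes == votes.max()).flatten()))

# Toy example: 3 actions, a (2, 2) tabular grid and a linear "network".
q_table = np.zeros((2, 2, 3))
q_table[1, 0, 2] = 1.0
agents = [StubTabularAgent(q_table), StubNeuralAgent(np.random.rand(2, 3))]
print(majority_vote(agents, 3, state=np.array([[-0.5, 0.0]]), discrete_state=(1, 0)))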
2 changes: 2 additions & 0 deletions Ensembling/ensembling_cart_pole.py
@@ -49,6 +49,8 @@ def evaluate(env, agentE):

new_state, reward, end, _ = env.step(next_action)

new_state = np.reshape(new_state, [1, 4])

if end or t > 199:
if t < 195:
eval_res[0] += 1
177 changes: 177 additions & 0 deletions Ensembling/ensembling_mountaincar_mixed.py
@@ -0,0 +1,177 @@
import time
import gym
import numpy as np
from tqdm import tqdm
import os
import random as ran
import keras.optimizers
import tensorflow as tf
from keras import backend as K
from keras.layers import Dense
from dqn_lib import DQNAgent
from sarsa_lib import SARSAAgent, QLAgent
from ensembler import *

os.environ['PYTHONHASHSEED'] = '0'
np.random.seed(91)
tf.set_random_seed(91)
n_states = 150

def accuracy(results):
"""
Evaluate the accuracy of results, considering victories and defeats.
"""
return results[1] / (results[0] + results[1]) * 100

def obs_to_state(env, obs, n_states):
""" Maps an observation to state """
env_low = env.observation_space.low
env_high = env.observation_space.high
env_dx = (env_high - env_low) / n_states
a = int((obs[0] - env_low[0]) / env_dx[0])
b = int((obs[1] - env_low[1]) / env_dx[1])
return a, b

def experiment(n_episodes, default_policy=False, policy=None, render=False):
res = [0, 0] # array of results accumulator: {[0]: Loss, [1]: Victory}
scores = [] # Cumulative rewards
steps = [] # Steps per episode

env = gym.make('MountainCar-v0')
env.seed(91)

input_dim = env.observation_space.shape[0]
output_dim = env.action_space.n

layer1 = Dense(15, input_dim=input_dim, activation='relu')
layer2 = Dense(output_dim)

agent1 = DQNAgent(output_dim, [layer1, layer2], use_ddqn=True, learn_thresh=1000, update_rate=300, epsilon_decay_function=lambda e: e * 0.995, epsilon_lower_bound=0.01, optimizer=keras.optimizers.RMSprop(0.001), tb_dir=None)
agent2 = QLAgent([n_states, n_states, env.action_space.n], epsilon_decay_function=lambda e: e * 0.6, epsilon_lower_bound=0.1)
agent3 = SARSAAgent([n_states, n_states, env.action_space.n], epsilon_decay_function=lambda e: e * 0.6, epsilon_lower_bound=0.1)


agents = [agent1, agent2, agent3]
agentE = EnsemblerAgent(env.action_space.n, agents, EnsemblerType.MAJOR_VOTING_BASED)

evaluate = False

for i_episode in tqdm(range(n_episodes + 1), desc="Episode"):
state = env.reset()
if (i_episode % 100) == 0:
agent3.extract_policy()
discretized_state = obs_to_state(env, state, n_states)
cumulative_reward = 0

state = np.reshape(state, [1, 2])

if i_episode > 0 and i_episode % 100 == 0:
evaluate = True

if evaluate == False:
for t in range(env._max_episode_steps):
if (render):
env.render()

next_action = agentE.act(state, discretized_state)
new_state, reward, end, _ = env.step(next_action)
new_discretized_state = obs_to_state(env, new_state, n_states)
original_state = new_state

# r1 = reward + 0.1 * original_state[0]
# r2 = reward + 0.2 * np.sin(3 * original_state[0])
# r3 = reward + 0.7 * (original_state[1] * original_state[1])

r1 = reward + original_state[0]
r2 = reward + np.sin(3 * original_state[0])
r3 = reward + (original_state[1] * original_state[1])
r4 = abs(new_state[0] - (-0.5)) # r in [0, 1]
reward = r4

new_state = np.reshape(new_state, [1, 2])

agent1.memoise((state, next_action, reward, new_state, end))
agent2.update_q((discretized_state[0], discretized_state[1]), (new_discretized_state[0], new_discretized_state[1]), next_action, reward)
agent3.update_q((discretized_state[0], discretized_state[1]), (new_discretized_state[0], new_discretized_state[1]), next_action, reward)


if end:
if t == env._max_episode_steps - 1:
res[0] += 1
else:
res[1] += 1
print("ENTRATO!,", t, "steps","reward: ", cumulative_reward)

steps.append(t)
break
else:
state = new_state
discretized_state = new_discretized_state
cumulative_reward += reward

agent1.learn()

cumulative_reward += reward
scores.append(cumulative_reward)
else:
evaluate = False
eval_res = [0, 0] # array of results accumulator: {[0]: Loss, [1]: Victory}
eval_scores = [] # Cumulative rewards
eval_steps = [] # Steps per episode

for i_episode in range(100):
state = env.reset()
discretized_state = obs_to_state(env, state, n_states)

state = np.reshape(state, [1, 2])
cumulative_reward = 0

for t in range(env._max_episode_steps):
if (render):
env.render()

next_action = agentE.act(state, discretized_state)
new_state, reward, end, _ = env.step(next_action)
new_discretized_state = obs_to_state(env, new_state, n_states)
original_state = new_state


if end:
if t == env._max_episode_steps - 1:
eval_res[0] += 1
else:
eval_res[1] += 1

eval_steps.append(t)
break
else:
state = new_state
discretized_state = new_discretized_state
cumulative_reward += reward

cumulative_reward += reward
eval_scores.append(cumulative_reward)

testing_accuracy = accuracy(np.array(eval_res))
testing_mean_steps = np.array(eval_steps).mean()
testing_mean_score = np.array(eval_scores).mean()
print("\nTraining episodes:", len(steps), "Training mean score:", np.array(steps).mean(), "Training mean steps", np.array(scores).mean(), "\nAccuracy:", testing_accuracy, "Test mean score:", testing_mean_score, "Test mean steps:", testing_mean_steps)

env.close()
return {"results": np.array(res), "steps": np.array(steps), "scores": np.array(scores)}


# Training
train_res = experiment(505)
training_mean_steps = train_res["steps"].mean()
training_mean_score = train_res["scores"].mean()

np.savetxt("results/ens_mixed_major.csv", train_res["steps"], delimiter=',')

print("Training episodes:", len(train_res["steps"]), "Training mean score:", training_mean_score, \
"Training mean steps", training_mean_steps)

# Rendering
#experiment(2, 200, default_policy=True, policy=learnt_policy, render=True)
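For reference, a self-contained check of the state discretization and the shaped reward used by the script above. The MountainCar-v0 observation bounds are hard-coded here as an assumption (position roughly in [-1.2, 0.6], velocity in [-0.07, 0.07]) rather than read from the environment, so the exact bucket indices are illustrative only.

import numpy as np

n_states = 150
env_low = np.array([-1.2, -0.07])    # assumed MountainCar-v0 lower bounds
env_high = np.array([0.6, 0.07])     # assumed MountainCar-v0 upper bounds

def obs_to_state(obs, n_states, env_low, env_high):
    """Same bucketing as in the script above, with the bounds passed explicitly."""
    env_dx = (env_high - env_low) / n_states
    a = int((obs[0] - env_low[0]) / env_dx[0])
    b = int((obs[1] - env_low[1]) / env_dx[1])
    return a, b

obs = np.array([-0.5, 0.0])          # a typical starting observation
print(obs_to_state(obs, n_states, env_low, env_high))   # roughly (58, 75)

# Shaped reward r4: distance of the cart from x = -0.5 (near the valley bottom),
# which stays roughly in [0, 1] because episodes end once the position reaches 0.5.
r4 = abs(obs[0] - (-0.5))
print(r4)                            # 0.0 at the bottom of the valley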
1 change: 1 addition & 0 deletions ReinforcementLearningLib/dqn_lib.py
@@ -48,6 +48,7 @@ def __init__(self, output_size, layers, memory_size=3000, batch_size=32,
self.batch_size = batch_size
self.learn_thresh = learn_thresh # Number of steps from which the network starts learning
self.update_rate = update_rate
self.discrete_state = False

if self.default_policy:
self.evaluate_model = self.load_model(model_filename)
1 change: 1 addition & 0 deletions ReinforcementLearningLib/qlearning_lib.py
@@ -13,6 +13,7 @@ def __init__(self, shape, alpha=0.8, gamma=0.95, policy=None, epsilon=1,
self.epsilon_decay_function = epsilon_decay_function
self.policy = policy
self.actions = shape[-1]
self.discrete_state = True
np.random.seed(91)

def update_q(self, state, new_state, action, reward):
8 changes: 5 additions & 3 deletions ReinforcementLearningLib/sarsa_lib.py
@@ -8,7 +8,9 @@ def __init__(self, shape, alpha=0.8, gamma=0.95, policy=None, epsilon=1,
epsilon_lower_bound=0.01, epsilon_decay_function=lambda e: e * 0.6):
super().__init__(shape, alpha, gamma, policy, epsilon, epsilon_lower_bound,
epsilon_decay_function)
self.current_policy = policy
self.current_policy = None
if policy is not None:
self.current_policy = policy
self.shape = shape

def extract_policy(self):
@@ -32,9 +34,9 @@ def update_q(self, state, new_state, action, reward):
next_action = self.current_policy[new_state]
self.Q[state][action] = (1 - self.alpha) * self.Q[state][action] + self.alpha * (reward + self.gamma * self.Q[new_state][next_action])

def act(self, state, episode_number):
def act(self, state, episode_number=None): # TODO: check whether episode_number is still needed
if (self.policy is not None):
next_action = self.current_policy[state]
next_action = self.policy[state]
else:
self.epsilon = self.epsilon_decay_function(self.epsilon)
self.epsilon = np.amax([self.epsilon, self.epsilon_lower_bound])
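SARSAAgent.update_q above reads the next action from current_policy, which the new training script refreshes every 100 episodes with extract_policy(). The body of extract_policy is not part of this diff; assuming it snapshots a greedy policy from the Q-table, the update reduces to the following sketch (flat state indices are used here purely for brevity).

import numpy as np

alpha, gamma = 0.8, 0.95               # defaults from sarsa_lib above
n_states, n_actions = 10, 3
Q = np.zeros((n_states, n_actions))

# Assumed behaviour of extract_policy(): a greedy snapshot of Q,
# refreshed only periodically (every 100 episodes in the script above).
current_policy = np.argmax(Q, axis=-1)

def update_q(state, new_state, action, reward):
    next_action = current_policy[new_state]   # action picked by the frozen policy
    Q[state][action] = (1 - alpha) * Q[state][action] + \
                       alpha * (reward + gamma * Q[new_state][next_action])

update_q(state=2, new_state=3, action=1, reward=-1.0)
print(Q[2][1])   # -0.8 after one update from a zero-initialized table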
