diff --git a/Ensembling/ensembler.py b/Ensembling/ensembler.py
index f922982d..907a976b 100644
--- a/Ensembling/ensembler.py
+++ b/Ensembling/ensembler.py
@@ -32,9 +32,13 @@ def __init__(self, output_size, agents, ensembler_type):
 
-    def act(self, state):
+    def act(self, state, discrete_state=None):
+        original_state = state
         if self.ensembler_type == EnsemblerType.MAJOR_VOTING_BASED:
             for agent in self.agents:
+                state = original_state
+                if agent.discrete_state:
+                    state = discrete_state
                 suggested_action = agent.act(state)
                 self.votes[suggested_action] += 1
             action = np.random.choice(np.argwhere(self.votes==np.amax(self.votes)).flatten())
@@ -44,6 +48,9 @@ def act(self, state):
 
         if self.ensembler_type == EnsemblerType.TRUST_BASED:
             for i in range(len(self.agents)):
                 agent = self.agents[i]
+                state = original_state
+                if agent.discrete_state:
+                    state = discrete_state
                 suggested_action = agent.act(state)
                 self.votes[suggested_action] += self.trust[i]
@@ -53,6 +60,9 @@ def act(self, state):
 
             for i in range(len(self.agents)):
                 agent = self.agents[i]
+                state = original_state
+                if agent.discrete_state:
+                    state = discrete_state
                 suggested_action = agent.act(state)
                 if action == suggested_action:
                     self.votes_per_agent[i] += 1
@@ -61,6 +71,9 @@ def act(self, state):
 
         if self.ensembler_type == EnsemblerType.RANK_VOTING_BASED:
             for agent in self.agents:
+                state = original_state
+                if agent.discrete_state:
+                    state = discrete_state
                 suggested_action, prediction = agent.act(state, True)
                 # rank prediction actions
                 temp = prediction.argsort()
diff --git a/Ensembling/ensembling_cart_pole.py b/Ensembling/ensembling_cart_pole.py
index 59a4aea0..f9c1fbec 100644
--- a/Ensembling/ensembling_cart_pole.py
+++ b/Ensembling/ensembling_cart_pole.py
@@ -49,6 +49,8 @@ def evaluate(env, agentE):
             new_state, reward, end, _ = env.step(next_action)
 
+            new_state = np.reshape(new_state, [1, 4])
+
             if end or t > 199:
                 if t < 195:
                     eval_res[0] += 1
diff --git a/Ensembling/ensembling_mountaincar_mixed.py b/Ensembling/ensembling_mountaincar_mixed.py
new file mode 100644
index 00000000..7eab2ce8
--- /dev/null
+++ b/Ensembling/ensembling_mountaincar_mixed.py
@@ -0,0 +1,177 @@
+import time
+import gym
+import numpy as np
+from tqdm import tqdm
+import os
+import random as ran
+import numpy as np
+import keras.optimizers
+import tensorflow as tf
+from keras import backend as K
+from keras.layers import Dense
+import tensorflow as tf
+from dqn_lib import DQNAgent
+from sarsa_lib import SARSAAgent, QLAgent
+from ensembler import *
+
+os.environ['PYTHONHASHSEED'] = '0'
+np.random.seed(91)
+tf.set_random_seed(91)
+n_states = 150
+
+def accuracy(results):
+    """
+    Evaluate the accuracy of results, considering victories and defeats.
+    """
+    return results[1] / (results[0] + results[1]) * 100
+
+def obs_to_state(env, obs, n_states):
+    """ Maps an observation to state """
+    env_low = env.observation_space.low
+    env_high = env.observation_space.high
+    env_dx = (env_high - env_low) / n_states
+    a = int((obs[0] - env_low[0]) / env_dx[0])
+    b = int((obs[1] - env_low[1]) / env_dx[1])
+    return a, b
+
+def experiment(n_episodes, default_policy=False, policy=None, render=False):
+    res = [0, 0] # array of results accumulator: {[0]: Loss, [1]: Victory}
+    scores = [] # Cumulative rewards
+    steps = [] # Steps per episode
+
+    env = gym.make('MountainCar-v0')
+    env.seed(91)
+
+    input_dim = env.observation_space.shape[0]
+    output_dim = env.action_space.n
+
+    layer1 = Dense(15, input_dim=input_dim, activation='relu')
+    layer2 = Dense(output_dim)
+
+    agent1 = DQNAgent(output_dim, [layer1, layer2], use_ddqn=True, learn_thresh=1000, update_rate=300, epsilon_decay_function=lambda e: e * 0.995, epsilon_lower_bound=0.01, optimizer=keras.optimizers.RMSprop(0.001), tb_dir=None)
+    agent2 = QLAgent([n_states, n_states, env.action_space.n], epsilon_decay_function=lambda e: e * 0.6, epsilon_lower_bound=0.1)
+    agent3 = SARSAAgent([n_states, n_states, env.action_space.n], epsilon_decay_function=lambda e: e * 0.6, epsilon_lower_bound=0.1)
+
+
+    agents = [agent1, agent2, agent3]
+    agentE = EnsemblerAgent(env.action_space.n, agents, EnsemblerType.MAJOR_VOTING_BASED)
+
+    evaluate = False
+
+    for i_episode in tqdm(range(n_episodes + 1), desc="Episode"):
+        state = env.reset()
+        if (i_episode % 100) == 0:
+            agent3.extract_policy()
+        discretized_state = obs_to_state(env, state, n_states)
+        cumulative_reward = 0
+
+        state = np.reshape(state, [1, 2])
+
+        if i_episode > 0 and i_episode % 100 == 0:
+            evaluate = True
+
+        if evaluate == False:
+            for t in range(env._max_episode_steps):
+                if (render):
+                    env.render()
+
+                next_action = agentE.act(state, discretized_state)
+                new_state, reward, end, _ = env.step(next_action)
+                new_discretized_state = obs_to_state(env, new_state, n_states)
+                original_state = new_state
+
+                # r1 = reward + 0.1 * original_state[0]
+                # r2 = reward + 0.2 * np.sin(3 * original_state[0])
+                # r3 = reward + 0.7 * (original_state[1] * original_state[1])
+
+                r1 = reward + original_state[0]
+                r2 = reward + np.sin(3 * original_state[0])
+                r3 = reward + (original_state[1] * original_state[1])
+                r4 = abs(new_state[0] - (-0.5)) # r in [0, 1]
+                reward = r4
+
+                new_state = np.reshape(new_state, [1, 2])
+
+                agent1.memoise((state, next_action, reward, new_state, end))
+                agent2.update_q((discretized_state[0], discretized_state[1]), (new_discretized_state[0], new_discretized_state[1]), next_action, reward)
+                agent3.update_q((discretized_state[0], discretized_state[1]), (new_discretized_state[0], new_discretized_state[1]), next_action, reward)
+
+
+                if end:
+                    if t == env._max_episode_steps - 1:
+                        res[0] += 1
+                    else:
+                        res[1] += 1
+                        print("Goal reached!", t, "steps", "reward:", cumulative_reward)
+
+                    steps.append(t)
+                    break
+                else:
+                    state = new_state
+                    discretized_state = new_discretized_state
+                    cumulative_reward += reward
+
+                agent1.learn()
+
+            cumulative_reward += reward
+            scores.append(cumulative_reward)
+        else:
+            evaluate = False
+            eval_res = [0, 0] # array of results accumulator: {[0]: Loss, [1]: Victory}
+            eval_scores = [] # Cumulative rewards
+            eval_steps = [] # Steps per episode
+
+            for i_episode in range(100):
+                state = env.reset()
+                discretized_state = obs_to_state(env, state, n_states)
+
+                state = np.reshape(state, [1, 2])
+                cumulative_reward = 0
+
+                for t in range(env._max_episode_steps):
+                    if (render):
+                        env.render()
+
+                    next_action = agentE.act(state, discretized_state)
+                    new_state, reward, end, _ = env.step(next_action)
+                    new_discretized_state = obs_to_state(env, new_state, n_states)
+                    original_state = new_state
+
+
+                    if end:
+                        if t == env._max_episode_steps - 1:
+                            eval_res[0] += 1
+                        else:
+                            eval_res[1] += 1
+
+                        eval_steps.append(t)
+                        break
+                    else:
+                        state = new_state
+                        discretized_state = new_discretized_state
+                        cumulative_reward += reward
+
+                cumulative_reward += reward
+                eval_scores.append(cumulative_reward)
+
+            testing_accuracy = accuracy(np.array(eval_res))
+            testing_mean_steps = np.array(eval_steps).mean()
+            testing_mean_score = np.array(eval_scores).mean()
+            print("\nTraining episodes:", len(steps), "Training mean score:", np.array(scores).mean(), "Training mean steps:", np.array(steps).mean(), "\nAccuracy:", testing_accuracy, "Test mean score:", testing_mean_score, "Test mean steps:", testing_mean_steps)
+
+    env.close()
+    return {"results": np.array(res), "steps": np.array(steps), "scores": np.array(scores)}
+
+
+# Training
+train_res = experiment(505)
+training_mean_steps = train_res["steps"].mean()
+training_mean_score = train_res["scores"].mean()
+
+np.savetxt("results/ens_mixed_major.csv", train_res["steps"], delimiter=',')
+
+print("Training episodes:", len(train_res["steps"]), "Training mean score:", training_mean_score, \
+"Training mean steps", training_mean_steps)
+
+# Rendering
+#experiment(2, 200, default_policy=True, policy=learnt_policy, render=True)
diff --git a/ReinforcementLearningLib/dqn_lib.py b/ReinforcementLearningLib/dqn_lib.py
index 178d7625..d67a5400 100644
--- a/ReinforcementLearningLib/dqn_lib.py
+++ b/ReinforcementLearningLib/dqn_lib.py
@@ -48,6 +48,7 @@ def __init__(self, output_size, layers, memory_size=3000, batch_size=32,
         self.batch_size = batch_size
         self.learn_thresh = learn_thresh # Number of steps from which the network starts learning
         self.update_rate = update_rate
+        self.discrete_state = False
 
         if self.default_policy:
             self.evaluate_model = self.load_model(model_filename)
diff --git a/ReinforcementLearningLib/qlearning_lib.py b/ReinforcementLearningLib/qlearning_lib.py
index 2340b2cb..9056272d 100644
--- a/ReinforcementLearningLib/qlearning_lib.py
+++ b/ReinforcementLearningLib/qlearning_lib.py
@@ -13,6 +13,7 @@ def __init__(self, shape, alpha=0.8, gamma=0.95, policy=None, epsilon=1,
         self.epsilon_decay_function = epsilon_decay_function
         self.policy = policy
         self.actions = shape[-1]
+        self.discrete_state = True
         np.random.seed(91)
 
     def update_q(self, state, new_state, action, reward):
diff --git a/ReinforcementLearningLib/sarsa_lib.py b/ReinforcementLearningLib/sarsa_lib.py
index 05adbf99..4ce1dbe9 100644
--- a/ReinforcementLearningLib/sarsa_lib.py
+++ b/ReinforcementLearningLib/sarsa_lib.py
@@ -8,7 +8,9 @@ def __init__(self, shape, alpha=0.8, gamma=0.95, policy=None, epsilon=1,
                  epsilon_lower_bound=0.01, epsilon_decay_function=lambda e: e * 0.6):
         super().__init__(shape, alpha, gamma, policy, epsilon, epsilon_lower_bound,
                          epsilon_decay_function)
-        self.current_policy = policy
+        self.current_policy = None
+        if policy is not None:
+            self.current_policy = policy
         self.shape = shape
 
     def extract_policy(self):
@@ -32,9 +34,9 @@ def update_q(self, state, new_state, action, reward):
             next_action = self.current_policy[new_state]
         self.Q[state][action] = (1 - self.alpha) * self.Q[state][action] + self.alpha * (reward + self.gamma * self.Q[new_state][next_action])
 
-    def act(self, state, episode_number):
+    def act(self, state, episode_number=None): # TODO: check episode_number
         if (self.policy is not None):
-            next_action = self.current_policy[state]
+            next_action = self.policy[state]
         else:
             self.epsilon = self.epsilon_decay_function(self.epsilon)
             self.epsilon = np.amax([self.epsilon, self.epsilon_lower_bound])
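
Below is a minimal, stand-alone sketch (not part of the patch) of the dispatch rule this patch introduces in EnsemblerAgent.act: agents that set discrete_state = True (QLAgent, SARSAAgent) receive the discretized state produced by obs_to_state, while the DQN agent keeps the raw continuous observation. The DummyAgent class, the majority_act helper, and the hard-coded states are hypothetical stand-ins used only for illustration; they are not part of the repository.

import numpy as np

class DummyAgent:
    """Hypothetical stand-in for DQNAgent/QLAgent/SARSAAgent, illustration only."""
    def __init__(self, discrete_state, action):
        self.discrete_state = discrete_state  # mirrors the flag added in dqn_lib.py / qlearning_lib.py
        self._action = action

    def act(self, state):
        return self._action  # a real agent would derive the action from `state`

def majority_act(agents, state, discrete_state, n_actions=3):
    """Re-creation of the MAJOR_VOTING_BASED branch, assuming 3 actions as in MountainCar-v0."""
    votes = np.zeros(n_actions, dtype=int)
    for agent in agents:
        # the core of the patch: pick the state representation each agent expects
        agent_state = discrete_state if agent.discrete_state else state
        votes[agent.act(agent_state)] += 1
    return int(np.random.choice(np.argwhere(votes == np.amax(votes)).flatten()))

continuous_state = np.reshape([-0.5, 0.0], [1, 2])  # what the DQN agent consumes
discretized_state = (75, 75)                        # what QLAgent/SARSAAgent consume
agents = [DummyAgent(False, 2), DummyAgent(True, 2), DummyAgent(True, 0)]
print(majority_act(agents, continuous_state, discretized_state))  # majority vote -> 2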