diff --git a/Breakout/Breakout_DQN.py b/Breakout/Breakout_DQN.py index 1513b55d..5b00da7c 100644 --- a/Breakout/Breakout_DQN.py +++ b/Breakout/Breakout_DQN.py @@ -1,15 +1,15 @@ -import random as ran import os +import random as ran import time import gym -import numpy as np -import tensorflow as tf from keras import backend as K from keras.initializers import VarianceScaling from keras.layers import Dense, Flatten from keras.layers.convolutional import Conv2D +import numpy as np from skimage.color import rgb2gray from skimage.transform import resize +import tensorflow as tf from tqdm import tqdm from dqn_lib import DQNAgent @@ -22,11 +22,7 @@ def pre_processing(observe): return processed_observe -# 0: stay -# 1: start -# 2: right -# 3: left - +# 0: stay, 1: start, 2: right, 3: left def experiment(n_episodes, max_action, default_policy=False, policy=None, render=False): with tf.device('/gpu:0'): @@ -36,24 +32,18 @@ def experiment(n_episodes, max_action, default_policy=False, policy=None, render env = gym.make('BreakoutDeterministic-v4') - # if default_policy: - # env._max_episode_steps = 5000000 - # else: - # env._max_episode_steps = 1000000 - input_dim = env.observation_space.shape[0] output_dim = env.action_space.n - - layers = [Conv2D(32, (8, 8), strides=(4, 4), activation='relu', input_shape=(84, 84, 4), kernel_initializer=VarianceScaling(scale=2.0)), - Conv2D(64, (4, 4), strides=(2, 2), activation='relu', kernel_initializer=VarianceScaling(scale=2.0)), - Conv2D(64, (3, 3), strides=(1, 1), activation='relu', kernel_initializer=VarianceScaling(scale=2.0)), - Flatten(), - Dense(512, activation='relu', kernel_initializer=VarianceScaling(scale=2.0)), - Dense(output_dim)] if default_policy: agent = DQNAgent(output_dim, None, use_ddqn=True, default_policy=True, model_filename=policy, epsilon=0.05, epsilon_lower_bound=0.05) else: + layers = [Conv2D(32, (8, 8), strides=(4, 4), activation='relu', input_shape=(84, 84, 4), kernel_initializer=VarianceScaling(scale=2.0)), + Conv2D(64, (4, 4), strides=(2, 2), activation='relu', kernel_initializer=VarianceScaling(scale=2.0)), + Conv2D(64, (3, 3), strides=(1, 1), activation='relu', kernel_initializer=VarianceScaling(scale=2.0)), + Flatten(), + Dense(512, activation='relu', kernel_initializer=VarianceScaling(scale=2.0)), + Dense(output_dim)] agent = DQNAgent(output_dim, layers, use_ddqn=True, memory_size=720000, gamma=0.99) gathered_frame = 0 @@ -65,6 +55,7 @@ def experiment(n_episodes, max_action, default_policy=False, policy=None, render has_lost_life = True start_life = env.unwrapped.ale.lives() + t = 0 while True: if has_lost_life: @@ -80,6 +71,7 @@ def experiment(n_episodes, max_action, default_policy=False, policy=None, render new_stack = np.append(new_state, stack[:, :, :, :3], axis=3) agent.memoise((stack, next_action, reward, new_state, end)) stack = new_stack + if (render): env.render() @@ -87,9 +79,12 @@ def experiment(n_episodes, max_action, default_policy=False, policy=None, render next_action = agent.act(stack) new_state, reward, end, info = env.step(next_action) + if (render): env.render() + reward = np.clip(reward, -1., 1.) 
+ if info['ale.lives'] < start_life: has_lost_life = True start_life = info['ale.lives'] @@ -120,14 +115,13 @@ def experiment(n_episodes, max_action, default_policy=False, policy=None, render if episode_number >= 100 and episode_number % 50 == 0: model_name = "partial_model_breakout" + str(episode_number) agent.save_model(model_name) - env.close() return {"results": np.array(res), "steps": np.array(steps), "scores": np.array(scores), "agent": agent} # Training -#res = experiment(100000, 10000000, render=False) -#res["agent"].save_model("final_model") +res = experiment(100000, 10000000, render=False) +res["agent"].save_model("ddqn") # Testing -res = experiment(20, 10000000, render=True, default_policy=True, policy="partial_model_breakout12250") +res = experiment(20, 10000000, render=True, default_policy=True, policy="ddqn") diff --git a/CartPole/dqn_cart_pole.py b/CartPole/dqn_cart_pole.py index 057f28c5..764b93eb 100644 --- a/CartPole/dqn_cart_pole.py +++ b/CartPole/dqn_cart_pole.py @@ -1,27 +1,17 @@ import os import time import gym -import numpy as np import keras.optimizers -import tensorflow as tf from keras import backend as K from keras.layers import Dense +import numpy as np +import tensorflow as tf from tqdm import tqdm from dqn_lib import DQNAgent os.environ['PYTHONHASHSEED'] = '0' - seed = 73 -# The below is necessary for starting Numpy generated random numbers -# in a well-defined initial state. - np.random.seed(seed) - -# The below is necessary for starting core Python generated random numbers -# in a well-defined state. - -# random.seed(seed) - tf.set_random_seed(seed) @@ -43,14 +33,16 @@ def experiment(n_episodes, default_policy=False, policy=None, render = False): input_dim = env.observation_space.shape[0] output_dim = env.action_space.n - - layer1 = Dense(10, input_dim=input_dim, activation='relu') - layer2 = Dense(output_dim) if default_policy: - agent = DQNAgent(output_dim, None, use_ddqn=True, default_policy=True, model_filename=policy, epsilon=0, epsilon_lower_bound=0, learn_thresh=0, tb_dir=None) + agent = DQNAgent(output_dim, None, use_ddqn=True, default_policy=True, model_filename=policy, + epsilon=0, epsilon_lower_bound=0, learn_thresh=0, tb_dir=None) else: - agent = DQNAgent(output_dim, [layer1, layer2], use_ddqn=True, learn_thresh=2000, update_rate=100, epsilon_decay_function=lambda e: e - 0.001, epsilon_lower_bound=0.1, optimizer=keras.optimizers.RMSprop(0.001), memory_size=2000, tb_dir=None) + layer1 = Dense(10, input_dim=input_dim, activation='relu') + layer2 = Dense(output_dim) + agent = DQNAgent(output_dim, [layer1, layer2], use_ddqn=True, learn_thresh=2000, update_rate=100, + epsilon_decay_function=lambda e: e - 0.001, epsilon_lower_bound=0.1, + optimizer=keras.optimizers.RMSprop(0.001), memory_size=2000, tb_dir=None) for _ in tqdm(range(n_episodes), desc="Episode"): state = env.reset() @@ -65,7 +57,6 @@ def experiment(n_episodes, default_policy=False, policy=None, render = False): time.sleep(0.1) next_action = agent.act(state) - new_state, reward, end, _ = env.step(next_action) x, x_dot, theta, theta_dot = new_state @@ -75,19 +66,15 @@ def experiment(n_episodes, default_policy=False, policy=None, render = False): r2 = (env.theta_threshold_radians - abs(theta)) / env.theta_threshold_radians - 0.5 r3 = -abs(theta_dot) reward = r1 + r2 + r3 - # reward = r1 + r2 - # reward = r1 agent.memoise((state, next_action, reward, new_state, end)) if end or t > 199: if t < 195: res[0] += 1 - #reward = reward - 100 - #memory.append((state, next_action, reward, new_state, 
end)) else: res[1] += 1 - print("ENTRATO!,", t, "steps","reward: ",cumulative_reward) + # print("ENTRATO!,", t, "steps","reward: ",cumulative_reward) steps.append(t) break @@ -124,4 +111,4 @@ def experiment(n_episodes, default_policy=False, policy=None, render = False): "Training mean steps", training_mean_steps, "\nAccuracy:", testing_accuracy, "Test mean score:", testing_mean_score, "Test mean steps:", testing_mean_steps) # Rendering -# experiment(1, render=True, default_policy=True, policy="model1") +# experiment(1, render=True, default_policy=True, policy="model_cp") diff --git a/Ensembling/ensembling_cart_pole.py b/Ensembling/ensembling_cart_pole.py index f9c1fbec..d9f93a45 100644 --- a/Ensembling/ensembling_cart_pole.py +++ b/Ensembling/ensembling_cart_pole.py @@ -1,28 +1,18 @@ import os import time import gym -import numpy as np -import keras.optimizers -import tensorflow as tf from keras import backend as K from keras.layers import Dense +import keras.optimizers +import numpy as np +import tensorflow as tf from tqdm import tqdm from dqn_lib import DQNAgent from ensembler import * os.environ['PYTHONHASHSEED'] = '0' - seed = 73 -# The below is necessary for starting Numpy generated random numbers -# in a well-defined initial state. - np.random.seed(seed) - -# The below is necessary for starting core Python generated random numbers -# in a well-defined state. - -# random.seed(seed) - tf.set_random_seed(seed) @@ -32,6 +22,7 @@ def accuracy(results): """ return results[1] / (results[0] + results[1]) * 100 + def evaluate(env, agentE): eval_steps = [] eval_scores = [] @@ -67,8 +58,6 @@ def evaluate(env, agentE): training_mean_steps = np.array(eval_steps).mean() training_mean_score = np.array(eval_scores).mean() - - print("\nEval episodes:", 200, "Eval mean score:", training_mean_score, \ "Eval mean steps", training_mean_steps, "accuracy:",accuracy(eval_res)) @@ -77,7 +66,6 @@ def evaluate(env, agentE): return False - def experiment(n_episodes, default_policy=False, policy=None, render = False): res = [0, 0] # array of results accumulator: {[0]: Loss, [1]: Victory} scores = [] # Cumulative rewards @@ -106,8 +94,9 @@ def experiment(n_episodes, default_policy=False, policy=None, render = False): agent8 = DQNAgent(output_dim, [layer1, layer2], use_ddqn=True, learn_thresh=2000, update_rate=100, epsilon_decay_function=lambda e: e - 0.0001, epsilon_lower_bound=0.1, optimizer=keras.optimizers.RMSprop(0.001), memory_size=2000, tb_dir=None) agent9 = DQNAgent(output_dim, [layer1, layer2], use_ddqn=True, learn_thresh=2000, update_rate=100, epsilon_decay_function=lambda e: e - 0.0001, epsilon_lower_bound=0.1, optimizer=keras.optimizers.RMSprop(0.001), memory_size=2000, tb_dir=None) agent10 = DQNAgent(output_dim, [layer1, layer2], use_ddqn=True, learn_thresh=2000, update_rate=100, epsilon_decay_function=lambda e: e - 0.0001, epsilon_lower_bound=0.1, optimizer=keras.optimizers.RMSprop(0.001), memory_size=2000, tb_dir=None) + agents = [agent1, agent2, agent3, agent4, agent5, agent6, agent7, agent8, agent9, agent10] - agentE = EnsemblerAgent(output_dim, [agent1, agent2, agent3, agent4, agent5, agent6, agent7], EnsemblerType.TRUST_BASED) + agentE = EnsemblerAgent(output_dim, agents, EnsemblerType.TRUST_BASED) for i_ep in tqdm(range(n_episodes), desc="Episode"): state = env.reset() @@ -178,7 +167,7 @@ def experiment(n_episodes, default_policy=False, policy=None, render = False): training_mean_steps = train_res["steps"].mean() training_mean_score = train_res["scores"].mean() 
-np.savetxt("results/ens_agents10_trust.csv", train_res["steps"], delimiter=',') +# np.savetxt("results/ens_agents10_trust.csv", train_res["steps"], delimiter=',') print("Training episodes:", len(train_res["steps"]), "Training mean score:", training_mean_score, \ "Training mean steps", training_mean_steps) diff --git a/Ensembling/ensembling_mountaincar.py b/Ensembling/ensembling_mountaincar.py index 3f2f54f6..7805b17d 100644 --- a/Ensembling/ensembling_mountaincar.py +++ b/Ensembling/ensembling_mountaincar.py @@ -1,21 +1,20 @@ +import os import time import gym -import numpy as np -from tqdm import tqdm -import os -import random as ran -import numpy as np -import keras.optimizers -import tensorflow as tf from keras import backend as K from keras.layers import Dense +import keras.optimizers +import numpy as np import tensorflow as tf +from tqdm import tqdm from dqn_lib import DQNAgent from ensembler import * +seed = 91 os.environ['PYTHONHASHSEED'] = '0' -np.random.seed(91) -tf.set_random_seed(91) +np.random.seed(seed) +tf.set_random_seed(seed) + def accuracy(results): """ @@ -23,13 +22,14 @@ def accuracy(results): """ return results[1] / (results[0] + results[1]) * 100 + def experiment(n_episodes, default_policy=False, policy=None, render=False): res = [0, 0] # array of results accumulator: {[0]: Loss, [1]: Victory} scores = [] # Cumulative rewards steps = [] # Steps per episode env = gym.make('MountainCar-v0') - env.seed(91) + env.seed(seed) input_dim = env.observation_space.shape[0] output_dim = env.action_space.n @@ -47,7 +47,7 @@ def experiment(n_episodes, default_policy=False, policy=None, render=False): agent8 = DQNAgent(output_dim, [layer1, layer2], use_ddqn=True, learn_thresh=1000, update_rate=300, epsilon_decay_function=lambda e: e - 0.001, epsilon_lower_bound=0.01, optimizer=keras.optimizers.RMSprop(0.001), tb_dir=None) agent9 = DQNAgent(output_dim, [layer1, layer2], use_ddqn=True, learn_thresh=1000, update_rate=300, epsilon_decay_function=lambda e: e - 0.001, epsilon_lower_bound=0.01, optimizer=keras.optimizers.RMSprop(0.001), tb_dir=None) agent10 = DQNAgent(output_dim, [layer1, layer2], use_ddqn=True, learn_thresh=1000, update_rate=300, epsilon_decay_function=lambda e: e - 0.001, epsilon_lower_bound=0.01, optimizer=keras.optimizers.RMSprop(0.001), tb_dir=None) - agents = [agent1, agent2, agent3, agent4, agent5, agent6, agent7, agent8] + agents = [agent1, agent2, agent3, agent4, agent5, agent6, agent7, agent8, agent9, agent10] agentE = EnsemblerAgent(env.action_space.n, agents, EnsemblerType.MAJOR_VOTING_BASED) evaluate = False @@ -89,9 +89,8 @@ def experiment(n_episodes, default_policy=False, policy=None, render=False): agent6.memoise((state, next_action, r2, new_state, end)) agent7.memoise((state, next_action, r1, new_state, end)) agent8.memoise((state, next_action, r2, new_state, end)) - #agent9.memoise((state, next_action, r1, new_state, end)) - #agent10.memoise((state, next_action, r2, new_state, end)) - + agent9.memoise((state, next_action, r1, new_state, end)) + agent10.memoise((state, next_action, r2, new_state, end)) if end: if t == env._max_episode_steps - 1: @@ -161,7 +160,7 @@ def experiment(n_episodes, default_policy=False, policy=None, render=False): training_mean_steps = train_res["steps"].mean() training_mean_score = train_res["scores"].mean() -np.savetxt("results/ens_agent8_major.csv", train_res["steps"], delimiter=',') +# np.savetxt("results/ens_agent8_major.csv", train_res["steps"], delimiter=',') print("Training episodes:", len(train_res["steps"]), "Training 
mean score:", training_mean_score, \ "Training mean steps", training_mean_steps) diff --git a/Ensembling/ensembling_mountaincar_mixed.py b/Ensembling/ensembling_mountaincar_mixed.py index a90e7f4b..f11be6cf 100644 --- a/Ensembling/ensembling_mountaincar_mixed.py +++ b/Ensembling/ensembling_mountaincar_mixed.py @@ -1,30 +1,31 @@ +import os import time import gym -import numpy as np -from tqdm import tqdm -import os -import random as ran -import numpy as np -import keras.optimizers -import tensorflow as tf from keras import backend as K from keras.layers import Dense +import keras.optimizers +import numpy as np import tensorflow as tf +from tqdm import tqdm from dqn_lib import DQNAgent from sarsa_lib import SARSAAgent, QLAgent from ensembler import * +seed = 91 + os.environ['PYTHONHASHSEED'] = '0' -np.random.seed(91) -tf.set_random_seed(91) +np.random.seed(seed) +tf.set_random_seed(seed) n_states = 150 + def accuracy(results): """ Evaluate the accuracy of results, considering victories and defeats. """ return results[1] / (results[0] + results[1]) * 100 + def obs_to_state(env, obs, n_states): """ Maps an observation to state """ env_low = env.observation_space.low @@ -34,13 +35,14 @@ def obs_to_state(env, obs, n_states): b = int((obs[1] - env_low[1]) / env_dx[1]) return a, b + def experiment(n_episodes, default_policy=False, policy=None, render=False): res = [0, 0] # array of results accumulator: {[0]: Loss, [1]: Victory} scores = [] # Cumulative rewards steps = [] # Steps per episode env = gym.make('MountainCar-v0') - env.seed(91) + env.seed(seed) input_dim = env.observation_space.shape[0] output_dim = env.action_space.n @@ -170,7 +172,7 @@ def experiment(n_episodes, default_policy=False, policy=None, render=False): training_mean_steps = train_res["steps"].mean() training_mean_score = train_res["scores"].mean() -np.savetxt("results/ens_mixed_trust_cont.csv", train_res["steps"], delimiter=',') +# np.savetxt("results/ens_mixed_trust_cont.csv", train_res["steps"], delimiter=',') print("Training episodes:", len(train_res["steps"]), "Training mean score:", training_mean_score, \ "Training mean steps", training_mean_steps) diff --git a/Ensembling/newens.py b/Ensembling/newens.py deleted file mode 100644 index 319ace39..00000000 --- a/Ensembling/newens.py +++ /dev/null @@ -1,283 +0,0 @@ -import random as ran -import numpy as np -import tensorflow as tf -from keras import backend as K -from keras.models import Sequential, load_model -from keras.optimizers import RMSprop, Adam -from keras.callbacks import TensorBoard - - -def huber_loss(a, b, in_keras=True): - error = a - b - quadratic_term = error*error / 2 - linear_term = abs(error) - 1/2 - use_linear_term = (abs(error) > 1.0) - if in_keras: - # Keras won't let us multiply floats by booleans, so we explicitly cast the booleans to floats - use_linear_term = K.cast(use_linear_term, 'float32') - return use_linear_term * linear_term + (1-use_linear_term) * quadratic_term - -class DQNAgent: - - def __init__(self, output_size, layers, memory_size=3000, batch_size=32, - use_ddqn=False, default_policy=False, model_filename=None, tb_dir=None, - epsilon=1, epsilon_lower_bound=0.1, epsilon_decay_function=lambda e: e - (0.9 / 1000000), - gamma=0.95, optimizer=RMSprop(0.00025), learn_thresh=50000, - update_rate=10000): - self.output_size = output_size - self.memory = RingBuffer(memory_size) - self.use_ddqn = use_ddqn - self.default_policy = default_policy - # Tensorboard parameters - self.tb_step = 0 - self.tb_gather = 500 - self.tb_dir = tb_dir - if tb_dir is 
not None: - self.tensorboard = TensorBoard(log_dir='./Monitoring/%s' % tb_dir, write_graph=False) - print("Tensorboard Loaded! (log_dir: %s)" % self.tensorboard.log_dir) - # Exploration/Exploitation parameters - self.epsilon = epsilon - self.epsilon_decay_function = epsilon_decay_function - self.epsilon_lower_bound = epsilon_lower_bound - self.total_steps = 0 - # Learning parameters - self.gamma = gamma - self.loss = huber_loss #'mean_squared_error' - self.optimizer = optimizer - self.batch_size = batch_size - self.learn_thresh = learn_thresh # Number of steps from which the network starts learning - self.update_rate = update_rate - - if self.default_policy: - self.evaluate_model = self.load_model(model_filename) - else: - self.evaluate_model = self.build_model(layers) - - if self.use_ddqn: - self.target_model = self.build_model(layers) - #self.evaluate_model.summary() - - def build_model(self, layers): - model = Sequential() - for l in layers: - model.add(l) - model.compile(loss=self.loss, optimizer=self.optimizer) - - return model - - def update_target_model(self): - self.target_model.set_weights(self.evaluate_model.get_weights()) - - def replay(self): - pick = self.random_pick() - for state, next_action, reward, new_state, end in pick: - # for state, next_action, reward, frame, end in pick: - # state = np.float32(state / 255) # TODO: generalisation - # frame = np.float32(frame / 255) # TODO: generalisation - # new_state = np.append(frame, state[:, :, :, :3], axis=3) # TODO: generalisation - if self.use_ddqn == False: - if not end: - reward = reward + self.gamma * np.amax(self.evaluate_model.predict(new_state)[0]) - - new_prediction = self.evaluate_model.predict(state) - new_prediction[0][next_action] = reward - else: - if not end: - action = np.argmax(self.evaluate_model.predict(new_state)[0]) - reward = reward + self.gamma * self.target_model.predict(new_state)[0][action] - - new_prediction = self.target_model.predict(state) - new_prediction[0][next_action] = reward - - if (self.tb_step % self.tb_gather) == 0 and self.tb_dir is not None: - self.evaluate_model.fit(state, new_prediction, verbose=0, callbacks=[self.tensorboard]) - else: - self.evaluate_model.fit(state, new_prediction, verbose=0) - self.tb_step += 1 - - def random_pick(self): - return self.memory.random_pick(self.batch_size) - - def act(self, state): - if np.random.uniform() > self.epsilon: - # state = np.float32(state / 255) # TODO: generalisation - prediction = self.evaluate_model.predict(state)[0] - next_action = np.argmax(prediction) - else: - prediction = np.random.uniform(0, 1, size=self.output_size) - next_action = np.argmax(prediction) - - if self.total_steps > self.learn_thresh: - self.epsilon = self.epsilon_decay_function(self.epsilon) - self.epsilon = np.amax([self.epsilon, self.epsilon_lower_bound]) - - self.total_steps += 1 - - return next_action, prediction - - def memoise(self, t): - if not self.default_policy: - self.memory.append(t) - - def learn(self): - if (self.total_steps > self.learn_thresh and - (self.total_steps % self.update_rate) == 0 and not self.default_policy and - self.use_ddqn == True): - self.update_target_model() - if self.total_steps > self.learn_thresh and not self.default_policy and self.total_steps % 4 == 0: - self.replay() - - def save_model(self, filename): - self.evaluate_model.save('%s.h5' % filename) - - def load_model(self, filename): - return load_model('%s.h5' % filename, custom_objects={ 'huber_loss': huber_loss }) - - -# In[23]: - - -import os -import time -import gym -import 
numpy as np -import keras.optimizers -import pandas as pd -import tensorflow as tf -from keras import backend as K -from keras.layers import Dense, Dropout -from tqdm import tqdm - - -os.environ['PYTHONHASHSEED'] = '0' -np.random.seed(91) -tf.set_random_seed(91) - -def experiment(n_episodes, default_policy=False, policy=None, render=False): - res = [0,0] # array of results accumulator: {[0]: Loss, [1]: Victory} - scores = [] # Cumulative rewards - steps = [] # Steps per episode - - env = gym.make('MountainCar-v0') - env.seed(91) - n_states = 150 - - input_dim = env.observation_space.shape[0] - output_dim = env.action_space.n - - layer1 = Dense(15, input_dim=input_dim, activation='relu') - layer2 = Dense(output_dim) - - - if (default_policy): - agentE = QLAgent([n_states, n_states, env.action_space.n], policy=policy, epsilon=0.01, epsilon_lower_bound=0.01) - else: - agent0 = DQNAgent(output_dim, [layer1, layer2], use_ddqn=True, learn_thresh=1000, update_rate=300, epsilon_decay_function=lambda e: e * 0.95, epsilon_lower_bound=0.01, optimizer=keras.optimizers.Adam(0.001)) - agent1 = DQNAgent(output_dim, [layer1, layer2], use_ddqn=True, learn_thresh=1000, update_rate=300, epsilon_decay_function=lambda e: e * 0.95, epsilon_lower_bound=0.01, optimizer=keras.optimizers.Adam(0.001)) - agent2 = DQNAgent(output_dim, [layer1, layer2], use_ddqn=True, learn_thresh=1000, update_rate=300, epsilon_decay_function=lambda e: e * 0.95, epsilon_lower_bound=0.01, optimizer=keras.optimizers.Adam(0.001)) - agent3 = DQNAgent(output_dim, [layer1, layer2], use_ddqn=True, learn_thresh=1000, update_rate=300, epsilon_decay_function=lambda e: e * 0.95, epsilon_lower_bound=0.01, optimizer=keras.optimizers.Adam(0.001)) - agents = [agent1, agent2, agent3] - agentE = EnsemblerAgent(env.action_space.n, agents, EnsemblerType.RANK_VOTING_BASED) - - evaluate = False - - for i_episode in tqdm(range(n_episodes + 1), desc="Episode"): - state_original = env.reset() - - state = obs_to_state(env, state_original, n_states) - cumulative_reward = 0 - - if i_episode > 0 and i_episode % 30 == 0: - evaluate = True - - state = np.reshape(state, [1, 2]) - - if evaluate == False: - - - for t in range(env._max_episode_steps): - if (render): - env.render() - - next_action = agentE.act(state) - state_original, reward, end, _ = env.step(next_action) - - - new_state = np.reshape(state_original, [1, 2]) - - #agent0.memoise((state, next_action, reward, new_state, end)) - agent1.memoise((state, next_action, reward + 0.4 * state_original[0], new_state, end)) - agent2.memoise((state, next_action, reward + 0.5 * np.sin(3 * state_original[0]), new_state, end)) - agent3.memoise((state, next_action, reward + 0.9 * (state_original[1] * state_original[1]), new_state, end)) - - if end: - if t == env._max_episode_steps - 1: - res[0] += 1 - else: - res[1] += 1 - print("ENTRATO!,", t, "steps") - - steps.append(t) - break - else: - state = new_state - cumulative_reward += reward - - agent0.learn() - agent1.learn() - agent2.learn() - agent3.learn() - - cumulative_reward += reward - scores.append(cumulative_reward) - env.close() - - else: - evaluate = False - eval_res = [0,0] # array of results accumulator: {[0]: Loss, [1]: Victory} - eval_scores = [] # Cumulative rewards - eval_steps = [] # Steps per episode - - for i_episode in range(500): - if (render): - env.render() - - next_action = agentE.act(state) - state_original, reward, end, _ = env.step(next_action) - - - new_state = np.reshape(state_original, [1, 2]) - - if end: - if t == env._max_episode_steps - 1: - 
res[0] += 1 - else: - res[1] += 1 - print("ENTRATO!,", t, "steps") - - eval_steps.append(t) - break - else: - state = new_state - cumulative_reward += reward - cumulative_reward += reward - eval_scores.append(cumulative_reward) - env.close() - - testing_accuracy = accuracy(np.array(res)) - testing_mean_steps = np.array(eval_steps).mean() - testing_mean_score = np.array(eval_scores).mean() - print("\nTraining episodes:", len(steps), "Training mean score:", np.array(steps).mean(), "Training mean steps", np.array(scores).mean(), "\nAccuracy:", testing_accuracy, "Test mean score:", testing_mean_score, "Test mean steps:", testing_mean_steps) - - return 0 # {"results": np.array(res), "steps": np.array(eval_steps), "scores": np.array(eval_scores), "Q": agent0.Q} - - -# Training -train_res = experiment(2500) - - -# In[ ]: - - - - diff --git a/FrozenLake/ql_4x4.py b/FrozenLake/ql_4x4.py index 04425c18..160d49a1 100644 --- a/FrozenLake/ql_4x4.py +++ b/FrozenLake/ql_4x4.py @@ -1,9 +1,10 @@ import time -import numpy as np import gym +import numpy as np from tqdm import tqdm from qlearning_lib import QLAgent +seed = 91 def accuracy(results): """ @@ -18,7 +19,7 @@ def experiment(n_episodes, default_policy=False, policy=None, render=False): steps = [] # Steps per episode env = gym.make('FrozenLake-v0') - env.seed(91) + env.seed(seed) if (default_policy): agent = QLAgent([env.observation_space.n, env.action_space.n], policy=policy) diff --git a/FrozenLake/ql_4x4_deterministic.py b/FrozenLake/ql_4x4_deterministic.py index 610ae3b0..04d905c8 100644 --- a/FrozenLake/ql_4x4_deterministic.py +++ b/FrozenLake/ql_4x4_deterministic.py @@ -1,9 +1,10 @@ import time -import numpy as np import gym +import numpy as np from tqdm import tqdm from qlearning_lib import QLAgent +seed = 91 def accuracy(results): """ @@ -18,7 +19,7 @@ def experiment(n_episodes, default_policy=False, policy=None, render=False): steps = [] # Steps per episode env = gym.make('FrozenLakeNotSlippery-v0') - env.seed(91) + env.seed(seed) if (default_policy): agent = QLAgent([env.observation_space.n, env.action_space.n], policy=policy, alpha=1) @@ -81,6 +82,5 @@ def experiment(n_episodes, default_policy=False, policy=None, render=False): print("Training episodes:", len(train_res["steps"]), "Training mean score:", training_mean_score, \ "Training mean steps", training_mean_steps, "\nAccuracy:", testing_accuracy, "Test mean score:", testing_mean_score, "Test mean steps:", testing_mean_steps) - # Rendering #experiment(5, default_policy=True, policy=learnt_policy, render=True) \ No newline at end of file diff --git a/FrozenLake/ql_8x8.py b/FrozenLake/ql_8x8.py index b09c1373..88226b57 100644 --- a/FrozenLake/ql_8x8.py +++ b/FrozenLake/ql_8x8.py @@ -1,9 +1,10 @@ import time -import numpy as np import gym +import numpy as np from tqdm import tqdm from qlearning_lib import QLAgent +seed = 91 def accuracy(results): """ @@ -18,7 +19,7 @@ def experiment(n_episodes, default_policy=False, policy=None, render=False): steps = [] # Steps per episode env = gym.make('FrozenLake8x8-v0') - env.seed(91) + env.seed(seed) if (default_policy): agent = QLAgent([env.observation_space.n, env.action_space.n], policy=policy) diff --git a/FrozenLake/ql_8x8_deterministic.py b/FrozenLake/ql_8x8_deterministic.py index 7ebc91ef..05560848 100644 --- a/FrozenLake/ql_8x8_deterministic.py +++ b/FrozenLake/ql_8x8_deterministic.py @@ -1,9 +1,10 @@ import time -import numpy as np import gym +import numpy as np from tqdm import tqdm from qlearning_lib import QLAgent +seed = 91 def 
accuracy(results): """ @@ -18,7 +19,7 @@ def experiment(n_episodes, default_policy=False, policy=None, render=False): steps = [] # Steps per episode env = gym.make('FrozenLakeNotSlippery8x8-v0') - env.seed(91) + env.seed(seed) if (default_policy): agent = QLAgent([env.observation_space.n, env.action_space.n], policy=policy, alpha=1) diff --git a/FrozenLake/sarsa_4x4.py b/FrozenLake/sarsa_4x4.py index 18c6d372..89898f20 100644 --- a/FrozenLake/sarsa_4x4.py +++ b/FrozenLake/sarsa_4x4.py @@ -1,9 +1,10 @@ import time -import numpy as np import gym +import numpy as np from tqdm import tqdm from sarsa_lib import SARSAAgent +seed = 91 def accuracy(results): """ @@ -18,16 +19,18 @@ def experiment(n_episodes, default_policy=False, policy=None, render=False): steps = [] # Steps per episode env = gym.make('FrozenLake-v0') - env.seed(91) + env.seed(seed) if (default_policy): agent = SARSAAgent([env.observation_space.n, env.action_space.n], policy=policy) else: - agent = SARSAAgent([env.observation_space.n, env.action_space.n], update_rate=15, epsilon_decay_function=lambda e: e * 0.995) + agent = SARSAAgent([env.observation_space.n, env.action_space.n], update_rate=15, + epsilon_decay_function=lambda e: e * 0.995) for _ in tqdm(range(n_episodes)): state = env.reset() cumulative_reward = 0 + if not default_policy: agent.extract_policy() diff --git a/FrozenLake/sarsa_4x4_deterministic.py b/FrozenLake/sarsa_4x4_deterministic.py index 22d1801d..36fb362b 100644 --- a/FrozenLake/sarsa_4x4_deterministic.py +++ b/FrozenLake/sarsa_4x4_deterministic.py @@ -1,9 +1,10 @@ import time -import numpy as np import gym +import numpy as np from tqdm import tqdm from sarsa_lib import SARSAAgent +seed = 91 def accuracy(results): """ @@ -18,7 +19,7 @@ def experiment(n_episodes, default_policy=False, policy=None, render=False): steps = [] # Steps per episode env = gym.make('FrozenLakeNotSlippery-v0') - env.seed(91) + env.seed(seed) if (default_policy): agent = SARSAAgent([env.observation_space.n, env.action_space.n], policy=policy, alpha=1) @@ -29,6 +30,7 @@ def experiment(n_episodes, default_policy=False, policy=None, render=False): for _ in tqdm(range(n_episodes)): state = env.reset() cumulative_reward = 0 + if not default_policy: agent.extract_policy() diff --git a/FrozenLake/sarsa_8x8.py b/FrozenLake/sarsa_8x8.py index 25eef2f3..c3515e06 100644 --- a/FrozenLake/sarsa_8x8.py +++ b/FrozenLake/sarsa_8x8.py @@ -1,9 +1,10 @@ import time -import numpy as np import gym +import numpy as np from tqdm import tqdm from sarsa_lib import SARSAAgent +seed = 91 def accuracy(results): """ @@ -18,7 +19,7 @@ def experiment(n_episodes, default_policy=False, policy=None, render=False): steps = [] # Steps per episode env = gym.make('FrozenLake8x8-v0') - env.seed(91) + env.seed(seed) if (default_policy): agent = SARSAAgent([env.observation_space.n, env.action_space.n], policy=policy) @@ -29,6 +30,7 @@ def experiment(n_episodes, default_policy=False, policy=None, render=False): for _ in tqdm(range(n_episodes)): state = env.reset() cumulative_reward = 0 + if not default_policy: agent.extract_policy() diff --git a/FrozenLake/sarsa_8x8_deterministic.py b/FrozenLake/sarsa_8x8_deterministic.py index 14469f36..83ad4751 100644 --- a/FrozenLake/sarsa_8x8_deterministic.py +++ b/FrozenLake/sarsa_8x8_deterministic.py @@ -1,9 +1,10 @@ import time -import numpy as np import gym +import numpy as np from tqdm import tqdm from sarsa_lib import SARSAAgent +seed = 91 def accuracy(results): """ @@ -18,7 +19,7 @@ def experiment(n_episodes, default_policy=False, 
policy=None, render=False): steps = [] # Steps per episode env = gym.make('FrozenLakeNotSlippery8x8-v0') - env.seed(91) + env.seed(seed) if (default_policy): agent = SARSAAgent([env.observation_space.n, env.action_space.n], policy=policy, alpha=1) @@ -29,6 +30,7 @@ def experiment(n_episodes, default_policy=False, policy=None, render=False): for _ in tqdm(range(n_episodes)): state = env.reset() cumulative_reward = 0 + if not default_policy: agent.extract_policy() diff --git a/MountainCar/dqn_mountain_car.py b/MountainCar/dqn_mountain_car.py index 8af50629..5ef1b539 100644 --- a/MountainCar/dqn_mountain_car.py +++ b/MountainCar/dqn_mountain_car.py @@ -1,18 +1,19 @@ import os import time import gym -import numpy as np import keras.optimizers +from keras import backend as K +from keras.layers import Dense +import numpy as np import pandas as pd import tensorflow as tf -from keras import backend as K -from keras.layers import Dense, Dropout from tqdm import tqdm from dqn_lib import DQNAgent os.environ['PYTHONHASHSEED'] = '0' -np.random.seed(17) -tf.set_random_seed(17) +seed = 17 +np.random.seed(seed) +tf.set_random_seed(seed) def accuracy(results): """ @@ -27,18 +28,21 @@ def experiment(n_episodes, default_policy=False, policy=None, render=False, agen steps = [] # Steps per episode env = gym.make('MountainCar-v0') - env.seed(17) + env.seed(seed) input_dim = env.observation_space.shape[0] output_dim = env.action_space.n if agent_config is None: if default_policy: - agent = DQNAgent(output_dim, None, use_ddqn=True, default_policy=True, model_filename=policy, epsilon=0, epsilon_lower_bound=0, learn_thresh=0) + agent = DQNAgent(output_dim, None, use_ddqn=True, default_policy=True, model_filename=policy, + epsilon=0, epsilon_lower_bound=0, learn_thresh=0) else: layer1 = Dense(15, input_dim=input_dim, activation='relu') layer2 = Dense(output_dim) - agent = DQNAgent(output_dim, [layer1, layer2], use_ddqn=True, learn_thresh=1000, update_rate=300, epsilon_decay_function=lambda e: e * 0.95, epsilon_lower_bound=0.01, optimizer=keras.optimizers.RMSprop(0.001)) + agent = DQNAgent(output_dim, [layer1, layer2], use_ddqn=True, learn_thresh=1000, update_rate=300, + epsilon_decay_function=lambda e: e * 0.95, epsilon_lower_bound=0.01, + optimizer=keras.optimizers.RMSprop(0.001)) else: agent = agent_config @@ -58,7 +62,6 @@ def experiment(n_episodes, default_policy=False, policy=None, render=False, agen state = np.reshape(state, [1, 2]) - #for t in tqdm(range(env._max_episode_steps), desc="Action", leave=False): for t in range(env._max_episode_steps): if (render): env.render() @@ -76,7 +79,7 @@ def experiment(n_episodes, default_policy=False, policy=None, render=False, agen res[0] += 1 else: res[1] += 1 - print("ENTRATO!,", t, "steps") + # print("ENTRATO!,", t, "steps") steps.append(t) break @@ -111,68 +114,7 @@ def experiment(n_episodes, default_policy=False, policy=None, render=False, agen layer1 = Dense(15, input_dim=input_dim, activation='relu') layer2 = Dense(output_dim) layers = [layer1, layer2] -# experiments.append(("model01", 120, DQNAgent(output_dim, layers, use_ddqn=True, learn_thresh=1000, update_rate=300, epsilon_decay_function=lambda e: e * 0.95, epsilon_lower_bound=0.01, optimizer=keras.optimizers.RMSprop(0.001), tb_dir=None))) -# experiments.append(("model02", 120, DQNAgent(output_dim, layers, use_ddqn=True, learn_thresh=2000, update_rate=300, epsilon_decay_function=lambda e: e * 0.95, epsilon_lower_bound=0.01, optimizer=keras.optimizers.RMSprop(0.001), tb_dir=None))) -# 
experiments.append(("model03", 120, DQNAgent(output_dim, layers, use_ddqn=True, learn_thresh=3000, update_rate=300, epsilon_decay_function=lambda e: e * 0.95, epsilon_lower_bound=0.01, optimizer=keras.optimizers.RMSprop(0.001), tb_dir=None))) -# experiments.append(("model04", 120, DQNAgent(output_dim, layers, use_ddqn=True, learn_thresh=4000, update_rate=300, epsilon_decay_function=lambda e: e * 0.95, epsilon_lower_bound=0.01, optimizer=keras.optimizers.RMSprop(0.001), tb_dir=None))) -# experiments.append(("model05", 120, DQNAgent(output_dim, layers, use_ddqn=True, learn_thresh=5000, update_rate=300, epsilon_decay_function=lambda e: e * 0.95, epsilon_lower_bound=0.01, optimizer=keras.optimizers.RMSprop(0.001), tb_dir=None))) -# experiments.append(("model06", 120, DQNAgent(output_dim, layers, use_ddqn=True, learn_thresh=6000, update_rate=300, epsilon_decay_function=lambda e: e * 0.95, epsilon_lower_bound=0.01, optimizer=keras.optimizers.RMSprop(0.001), tb_dir=None))) -# experiments.append(("model07", 120, DQNAgent(output_dim, layers, use_ddqn=True, learn_thresh=7000, update_rate=300, epsilon_decay_function=lambda e: e * 0.95, epsilon_lower_bound=0.01, optimizer=keras.optimizers.RMSprop(0.001), tb_dir=None))) -# experiments.append(("model08", 120, DQNAgent(output_dim, layers, use_ddqn=True, learn_thresh=8000, update_rate=300, epsilon_decay_function=lambda e: e * 0.95, epsilon_lower_bound=0.01, optimizer=keras.optimizers.RMSprop(0.001), tb_dir=None))) -# experiments.append(("model09", 120, DQNAgent(output_dim, layers, use_ddqn=True, learn_thresh=9000, update_rate=300, epsilon_decay_function=lambda e: e * 0.95, epsilon_lower_bound=0.01, optimizer=keras.optimizers.RMSprop(0.001), tb_dir=None))) -# experiments.append(("model10", 120, DQNAgent(output_dim, layers, use_ddqn=True, learn_thresh=10000, update_rate=300, epsilon_decay_function=lambda e: e * 0.95, epsilon_lower_bound=0.01, optimizer=keras.optimizers.RMSprop(0.001), tb_dir=None))) -# experiments.append(("model11", 120, DQNAgent(output_dim, layers, use_ddqn=True, learn_thresh=2000, update_rate=100, epsilon_decay_function=lambda e: e * 0.95, epsilon_lower_bound=0.01, optimizer=keras.optimizers.RMSprop(0.001), tb_dir=None))) -# experiments.append(("model12", 120, DQNAgent(output_dim, layers, use_ddqn=True, learn_thresh=2000, update_rate=200, epsilon_decay_function=lambda e: e * 0.95, epsilon_lower_bound=0.01, optimizer=keras.optimizers.RMSprop(0.001), tb_dir=None))) -# experiments.append(("model13", 120, DQNAgent(output_dim, layers, use_ddqn=True, learn_thresh=2000, update_rate=300, epsilon_decay_function=lambda e: e * 0.95, epsilon_lower_bound=0.01, optimizer=keras.optimizers.RMSprop(0.001), tb_dir=None))) -# experiments.append(("model14", 120, DQNAgent(output_dim, layers, use_ddqn=True, learn_thresh=2000, update_rate=400, epsilon_decay_function=lambda e: e * 0.95, epsilon_lower_bound=0.01, optimizer=keras.optimizers.RMSprop(0.001), tb_dir=None))) -# experiments.append(("model15", 120, DQNAgent(output_dim, layers, use_ddqn=True, learn_thresh=2000, update_rate=500, epsilon_decay_function=lambda e: e * 0.95, epsilon_lower_bound=0.01, optimizer=keras.optimizers.RMSprop(0.001), tb_dir=None))) -# experiments.append(("model16", 500, DQNAgent(output_dim, layers, use_ddqn=True, learn_thresh=1000, update_rate=300, epsilon_decay_function=lambda e: e * 0.995, epsilon_lower_bound=0.01, optimizer=keras.optimizers.RMSprop(0.001), tb_dir=None))) -# experiments.append(("model17", 500, DQNAgent(output_dim, layers, use_ddqn=True, learn_thresh=2000, 
update_rate=300, epsilon_decay_function=lambda e: e * 0.995, epsilon_lower_bound=0.01, optimizer=keras.optimizers.RMSprop(0.001), tb_dir=None))) -# experiments.append(("model18", 500, DQNAgent(output_dim, layers, use_ddqn=True, learn_thresh=3000, update_rate=300, epsilon_decay_function=lambda e: e * 0.995, epsilon_lower_bound=0.01, optimizer=keras.optimizers.RMSprop(0.001), tb_dir=None))) -# experiments.append(("model19", 500, DQNAgent(output_dim, layers, use_ddqn=True, learn_thresh=4000, update_rate=300, epsilon_decay_function=lambda e: e * 0.995, epsilon_lower_bound=0.01, optimizer=keras.optimizers.RMSprop(0.001), tb_dir=None))) -# experiments.append(("model20", 10000, DQNAgent(output_dim, layers, use_ddqn=True, learn_thresh=1000, update_rate=300, epsilon_decay_function=lambda e: e * 0.995, epsilon_lower_bound=0.1, optimizer=keras.optimizers.RMSprop(0.001), tb_dir=None))) -# experiments.append(("model21", 10000, DQNAgent(output_dim, layers, use_ddqn=True, learn_thresh=2000, update_rate=300, epsilon_decay_function=lambda e: e * 0.995, epsilon_lower_bound=0.1, optimizer=keras.optimizers.RMSprop(0.001), tb_dir=None))) -# experiments.append(("model22", 10000, DQNAgent(output_dim, layers, use_ddqn=True, learn_thresh=3000, update_rate=300, epsilon_decay_function=lambda e: e * 0.995, epsilon_lower_bound=0.1, optimizer=keras.optimizers.RMSprop(0.001), tb_dir=None))) experiments.append(("model23", 25000, DQNAgent(output_dim, layers, use_ddqn=True, learn_thresh=1000, update_rate=300, epsilon_decay_function=lambda e: e * 0.995, epsilon_lower_bound=0.01, optimizer=keras.optimizers.Adam(0.001), tb_dir=None))) -# experiments.append(("model24", 10000, DQNAgent(output_dim, layers, use_ddqn=True, learn_thresh=2000, update_rate=300, epsilon_decay_function=lambda e: e * 0.995, epsilon_lower_bound=0.1, optimizer=keras.optimizers.Adam(0.001), tb_dir=None))) -# experiments.append(("model25", 10000, DQNAgent(output_dim, layers, use_ddqn=True, learn_thresh=3000, update_rate=300, epsilon_decay_function=lambda e: e * 0.995, epsilon_lower_bound=0.1, optimizer=keras.optimizers.Adam(0.001), tb_dir=None))) -# experiments.append(("model26", 10000, DQNAgent(output_dim, layers, use_ddqn=True, learn_thresh=4000, update_rate=300, epsilon_decay_function=lambda e: e * 0.995, epsilon_lower_bound=0.1, optimizer=keras.optimizers.Adam(0.001), tb_dir=None))) -# experiments.append(("model27", 10000, DQNAgent(output_dim, layers, use_ddqn=True, learn_thresh=5000, update_rate=300, epsilon_decay_function=lambda e: e * 0.995, epsilon_lower_bound=0.1, optimizer=keras.optimizers.Adam(0.001), tb_dir=None))) -# experiments.append(("model28", 10000, DQNAgent(output_dim, layers, use_ddqn=True, learn_thresh=6000, update_rate=300, epsilon_decay_function=lambda e: e * 0.995, epsilon_lower_bound=0.1, optimizer=keras.optimizers.Adam(0.001), tb_dir=None))) -# experiments.append(("model29", 10000, DQNAgent(output_dim, layers, use_ddqn=True, learn_thresh=70000, update_rate=300, epsilon_decay_function=lambda e: e * 0.995, epsilon_lower_bound=0.1, optimizer=keras.optimizers.Adam(0.001), tb_dir=None))) -# experiments.append(("model30", 10000, DQNAgent(output_dim, layers, use_ddqn=True, learn_thresh=8000, update_rate=300, epsilon_decay_function=lambda e: e * 0.995, epsilon_lower_bound=0.1, optimizer=keras.optimizers.Adam(0.001), tb_dir=None))) -# experiments.append(("model31", 10000, DQNAgent(output_dim, layers, use_ddqn=True, learn_thresh=9000, update_rate=300, epsilon_decay_function=lambda e: e * 0.995, epsilon_lower_bound=0.1, 
optimizer=keras.optimizers.Adam(0.001), tb_dir=None))) -# experiments.append(("model32", 10000, DQNAgent(output_dim, layers, use_ddqn=True, learn_thresh=10000, update_rate=300, epsilon_decay_function=lambda e: e * 0.995, epsilon_lower_bound=0.1, optimizer=keras.optimizers.Adam(0.001), tb_dir=None))) -# layer1 = Dense(10, input_dim=input_dim, activation='relu') -# layer2 = Dense(output_dim) -# layers = [layer1, layer2] -# experiments.append(("model33", 10000, DQNAgent(output_dim, layers, use_ddqn=True, learn_thresh=4000, update_rate=300, epsilon_decay_function=lambda e: e * 0.995, epsilon_lower_bound=0.1, optimizer=keras.optimizers.Adam(0.001)))) -# layer1 = Dense(10, input_dim=input_dim, activation='relu') -# layer2 = Dense(10, input_dim=input_dim) -# layer3 = Dense(output_dim) -# layers = [layer1, layer2, layer3] -# experiments.append(("model34", 10000, DQNAgent(output_dim, layers, use_ddqn=True, learn_thresh=4000, update_rate=300, epsilon_decay_function=lambda e: e * 0.995, epsilon_lower_bound=0.1, optimizer=keras.optimizers.Adam(0.001), tb_dir=None))) -# layer1 = Dense(15, input_dim=input_dim, activation='relu') -# layer2 = Dense(output_dim) -# layers = [layer1, layer2] -# experiments.append(("model35", 10000, DQNAgent(output_dim, layers, use_ddqn=True, learn_thresh=4000, update_rate=30, epsilon_decay_function=lambda e: e * 0.995, epsilon_lower_bound=0.1, optimizer=keras.optimizers.Adam(0.001), tb_dir=None))) -# layer1 = Dense(15, input_dim=input_dim, activation='relu') -# layer2 = Dense(15, input_dim=input_dim) -# layer3 = Dense(output_dim) -# layers = [layer1, layer2, layer3] -# experiments.append(("model36", 10000, DQNAgent(output_dim, layers, use_ddqn=True, learn_thresh=4000, update_rate=300, epsilon_decay_function=lambda e: e * 0.995, epsilon_lower_bound=0.1, optimizer=keras.optimizers.Adam(0.001), tb_dir=None))) -# layer1 = Dense(20, input_dim=input_dim, activation='relu') -# layer2 = Dense(output_dim) -# layers = [layer1, layer2] -# experiments.append(("model37", 10000, DQNAgent(output_dim, layers, use_ddqn=True, learn_thresh=4000, update_rate=30, epsilon_decay_function=lambda e: e * 0.995, epsilon_lower_bound=0.1, optimizer=keras.optimizers.Adam(0.001), tb_dir=None))) -# layer1 = Dense(20, input_dim=input_dim, activation='relu') -# layer2 = Dense(20, input_dim=input_dim) -# layer3 = Dense(output_dim) -# layers = [layer1, layer2, layer3] -# experiments.append(("model38", 10000, DQNAgent(output_dim, layers, use_ddqn=True, learn_thresh=4000, update_rate=300, epsilon_decay_function=lambda e: e * 0.995, epsilon_lower_bound=0.1, optimizer=keras.optimizers.Adam(0.001), tb_dir=None))) - -# experiments.append(("model39", 25000, DQNAgent(output_dim, layers, use_ddqn=True, learn_thresh=1000, update_rate=300, epsilon_decay_function=lambda e: e * 0.95, epsilon_lower_bound=0.01, optimizer=keras.optimizers.Adam(0.001), tb_dir=None))) - def train_and_test(experiments): df = pd.DataFrame(columns=['model name', 'episode number', 'train mean score', 'train mean steps', 'test accuracy', 'test mean score', 'test mean steps']) diff --git a/MountainCar/ql_mountain_car.py b/MountainCar/ql_mountain_car.py index bb909888..b95b27c3 100644 --- a/MountainCar/ql_mountain_car.py +++ b/MountainCar/ql_mountain_car.py @@ -4,6 +4,7 @@ from tqdm import tqdm from qlearning_lib import QLAgent +seed = 91 def accuracy(results): """ @@ -28,13 +29,15 @@ def experiment(n_episodes, default_policy=False, policy=None, render=False): steps = [] # Steps per episode env = gym.make('MountainCar-v0') - env.seed(91) + 
env.seed(seed) n_states = 150 if (default_policy): - agent = QLAgent([n_states, n_states, env.action_space.n], policy=policy, epsilon=0.01, epsilon_lower_bound=0.01) + agent = QLAgent([n_states, n_states, env.action_space.n], policy=policy, + epsilon=0.01, epsilon_lower_bound=0.01) else: - agent = QLAgent([n_states, n_states, env.action_space.n], epsilon_decay_function=lambda e: e * 0.6, epsilon_lower_bound=0.1) + agent = QLAgent([n_states, n_states, env.action_space.n], + epsilon_decay_function=lambda e: e * 0.6, epsilon_lower_bound=0.1) for _ in tqdm(range(n_episodes), desc="Episode"): state = env.reset() @@ -62,6 +65,7 @@ def experiment(n_episodes, default_policy=False, policy=None, render=False): else: state = new_state cumulative_reward += reward + cumulative_reward += reward scores.append(cumulative_reward) env.close() @@ -75,7 +79,7 @@ def experiment(n_episodes, default_policy=False, policy=None, render=False): training_mean_score = train_res["scores"].mean() np.save('ql_policy.npy', learnt_policy) -# np.savetxt("results/ql.csv", train_res["steps"], delimiter=',') +# np.savetxt("results/training/ql.csv", train_res["steps"], delimiter=',') # Testing test_agent = np.load('ql_policy.npy') @@ -84,7 +88,7 @@ def experiment(n_episodes, default_policy=False, policy=None, render=False): testing_mean_steps = test_res["steps"].mean() testing_mean_score = test_res["scores"].mean() -# np.savetxt("results/ql_test.csv", test_res["steps"], delimiter=',') +# np.savetxt("results/testing/ql.csv", test_res["steps"], delimiter=',') print("Training episodes:", len(train_res["steps"]), "Training mean score:", training_mean_score, \ "Training mean steps", training_mean_steps, "\nAccuracy:", testing_accuracy, "Test mean score:", testing_mean_score, "Test mean steps:", testing_mean_steps) diff --git a/MountainCar/sarsa_mountain_car.py b/MountainCar/sarsa_mountain_car.py index 8099a427..affbc285 100644 --- a/MountainCar/sarsa_mountain_car.py +++ b/MountainCar/sarsa_mountain_car.py @@ -4,6 +4,7 @@ from tqdm import tqdm from sarsa_lib import SARSAAgent +seed = 91 def accuracy(results): """ @@ -28,18 +29,21 @@ def experiment(n_episodes, default_policy=False, policy=None, render=False): steps = [] # Steps per episode env = gym.make('MountainCar-v0') - env.seed(91) + env.seed(seed) n_states = 150 if (default_policy): - agent = SARSAAgent([n_states, n_states, env.action_space.n], policy=policy, epsilon=0.01, epsilon_lower_bound=0.01) + agent = SARSAAgent([n_states, n_states, env.action_space.n], policy=policy, + epsilon=0.01, epsilon_lower_bound=0.01) else: - agent = SARSAAgent([n_states, n_states, env.action_space.n], epsilon_decay_function=lambda e: e * 0.6, epsilon_lower_bound=0.1) + agent = SARSAAgent([n_states, n_states, env.action_space.n], + epsilon_decay_function=lambda e: e * 0.6, epsilon_lower_bound=0.1) for _ in tqdm(range(n_episodes), desc="Episode"): state = env.reset() state = obs_to_state(env, state, n_states) cumulative_reward = 0 + if not default_policy: agent.extract_policy() @@ -64,6 +68,7 @@ def experiment(n_episodes, default_policy=False, policy=None, render=False): else: state = new_state cumulative_reward += reward + cumulative_reward += reward scores.append(cumulative_reward) env.close() @@ -77,7 +82,7 @@ def experiment(n_episodes, default_policy=False, policy=None, render=False): training_mean_score = train_res["scores"].mean() np.save('sarsa_policy.npy', learnt_policy) -# np.savetxt("results/sarsa.csv", train_res["steps"], delimiter=',') +# np.savetxt("results/training/sarsa.csv", 
train_res["steps"], delimiter=',') # Testing test_agent = np.load('sarsa_policy.npy') @@ -86,7 +91,7 @@ def experiment(n_episodes, default_policy=False, policy=None, render=False): testing_mean_steps = test_res["steps"].mean() testing_mean_score = test_res["scores"].mean() -# np.savetxt("results/sarsa.csv", test_res["steps"], delimiter=',') +# np.savetxt("results/testing/sarsa.csv", test_res["steps"], delimiter=',') print("Training episodes:", len(train_res["steps"]), "Training mean score:", training_mean_score, \ "Training mean steps", training_mean_steps, "\nAccuracy:", testing_accuracy, "Test mean score:", testing_mean_score, "Test mean steps:", testing_mean_steps) diff --git a/Pong/pong_dqn.py b/Pong/pong_dqn.py index 3dd49f16..2b431fdd 100644 --- a/Pong/pong_dqn.py +++ b/Pong/pong_dqn.py @@ -1,16 +1,16 @@ -import random as ran import os +import random as ran import time import gym -import numpy as np -import tensorflow as tf from keras import backend as K from keras.initializers import VarianceScaling from keras.layers import Dense, Flatten from keras.layers.convolutional import Conv2D from keras.optimizers import Adam +import numpy as np from skimage.color import rgb2gray from skimage.transform import resize +import tensorflow as tf from tqdm import tqdm from dqn_lib import DQNAgent from ring_buffer import RingBuffer @@ -30,22 +30,22 @@ def experiment(n_episodes, default_policy=False, policy=None, render=False): res = [0,0] # array of results accumulator: {[0]: Loss, [1]: Victory} scores = [] # Cumulative rewards steps = [] # Steps per episode + reward_list = RingBuffer(100) env = gym.make('PongDeterministic-v4') input_dim = env.observation_space.shape[0] output_dim = env.action_space.n - - layers = [Conv2D(32, (8, 8), strides=(4, 4), activation='relu', input_shape=(84, 84, 4), kernel_initializer=VarianceScaling(scale=2.0)), - Conv2D(64, (4, 4), strides=(2, 2), activation='relu', kernel_initializer=VarianceScaling(scale=2.0)), - Conv2D(64, (3, 3), strides=(1, 1), activation='relu', kernel_initializer=VarianceScaling(scale=2.0)), - Flatten(), - Dense(512, activation='relu'), - Dense(output_dim)] if default_policy: agent = DQNAgent(output_dim, None, use_ddqn=True, default_policy=True, model_filename=policy, epsilon=0.05, epsilon_lower_bound=0.05) else: + layers = [Conv2D(32, (8, 8), strides=(4, 4), activation='relu', input_shape=(84, 84, 4), kernel_initializer=VarianceScaling(scale=2.0)), + Conv2D(64, (4, 4), strides=(2, 2), activation='relu', kernel_initializer=VarianceScaling(scale=2.0)), + Conv2D(64, (3, 3), strides=(1, 1), activation='relu', kernel_initializer=VarianceScaling(scale=2.0)), + Flatten(), + Dense(512, activation='relu'), + Dense(output_dim)] agent = DQNAgent(output_dim, layers, use_ddqn=True, memory_size=700000, gamma=0.99, learn_thresh=50000, epsilon_lower_bound=0.02, epsilon_decay_function=lambda e: e - (0.98 / 950000), update_rate=10000, optimizer=Adam(0.00025)) @@ -73,6 +73,7 @@ def experiment(n_episodes, default_policy=False, policy=None, render=False): new_state = np.reshape(pre_processing(frame), (1, 84, 84, 1)) new_stack = np.append(new_state, stack[:, :, :, :3], axis=3) stack = new_stack + if (render): env.render() @@ -80,10 +81,13 @@ def experiment(n_episodes, default_policy=False, policy=None, render=False): next_action = agent.act(stack) new_state, reward, end, _ = env.step(next_action) + if (render): env.render() time.sleep(0.02) + reward = np.clip(reward, -1., 1.) 
+
             if reward != 0:
                 has_lost_life = True
@@ -105,7 +109,6 @@ def experiment(n_episodes, default_policy=False, policy=None, render=False):
                 res[0] += 1
                 print("You Lost!, steps:", t, "reward:", reward_list.mean(), "frames:", gathered_frame)
                 steps.append(t)
-
             break
 
         agent.learn()
@@ -116,14 +119,13 @@ def experiment(n_episodes, default_policy=False, policy=None, render=False):
 
             model_name = "partial_model_pong" + str(episode_number)
            agent.save_model(model_name)
-
     env.close()
     return {"results": np.array(res), "steps": np.array(steps), "scores": np.array(scores), "agent": agent}
 
 
 # Training
 res = experiment(10000, render=False)
-res["agent"].save_model("finalmodel")
+res["agent"].save_model("ddqn")
 
 # Testing
-# res = experiment(20, render=True, default_policy=True, policy="partial_model_pong300")
+res = experiment(20, render=True, default_policy=True, policy="ddqn")
diff --git a/ReinforcementLearningLib/ac_lib.py b/ReinforcementLearningLib/ac_lib.py
index f1b705cc..8f811771 100644
--- a/ReinforcementLearningLib/ac_lib.py
+++ b/ReinforcementLearningLib/ac_lib.py
@@ -1,17 +1,17 @@
-import random as ran
-from collections import deque
-import numpy as np
-import tensorflow as tf
 from keras import backend as K
-from keras.models import Sequential, load_model, Model
-from keras.optimizers import RMSprop, Adam
 from keras.callbacks import TensorBoard
 from keras.layers import Dense, Dropout, Input
 from keras.layers.merge import Add, Multiply
+from keras.models import Sequential, load_model, Model
+from keras.optimizers import RMSprop, Adam
+import numpy as np
+import tensorflow as tf
 
 
 class ACAgent:
+
     class Actor:
+
         def __init__(self, layers, tb_dir, default_policy=None):
             self.loss = "mean_squared_error"
             self.optimizer = Adam(lr=0.0001)
@@ -31,7 +31,6 @@ def __init__(self, layers, tb_dir, default_policy=None):
             else:
                 self.model = default_policy
 
-
         def learn(self, state, action, td_error):
             new_prediction = self.model.predict(state)
             new_prediction[0][action] = -td_error
@@ -44,6 +43,7 @@ def learn(self, state, action, td_error):
 
 
     class Critic:
+
         def __init__(self, layers, tb_dir, default_policy=None):
             self.optimizer = Adam(lr=0.001)
             self.gamma = 0.9
@@ -64,11 +64,9 @@ def __init__(self, layers, tb_dir, default_policy=None):
             else:
                 self.model = default_policy
 
-
         def learn(self, state, new_state, reward):
             td_error = reward + self.gamma * self.model.predict(new_state)[0] - self.model.predict(state)[0]
 
-
             if (self.tb_step % self.tb_gather) == 0 and self.tb_dir is not None:
                 self.model.fit(state, td_error, verbose=0, callbacks=[self.tensorboard_critic])
             else:
diff --git a/ReinforcementLearningLib/dqn_lib.py b/ReinforcementLearningLib/dqn_lib.py
index d67a5400..ad4abd12 100644
--- a/ReinforcementLearningLib/dqn_lib.py
+++ b/ReinforcementLearningLib/dqn_lib.py
@@ -1,10 +1,9 @@
-import random as ran
-import numpy as np
-import tensorflow as tf
 from keras import backend as K
+from keras.callbacks import TensorBoard
 from keras.models import Sequential, load_model
 from keras.optimizers import RMSprop, Adam
-from keras.callbacks import TensorBoard
+import numpy as np
+import tensorflow as tf
 from ring_buffer import RingBuffer
 
 
@@ -137,4 +136,3 @@ def save_model(self, filename):
 
     def load_model(self, filename):
         return load_model('%s.h5' % filename, custom_objects={ 'huber_loss': huber_loss })
-
diff --git a/Ensembling/ensembler.py b/ReinforcementLearningLib/ensembler.py
similarity index 99%
rename from Ensembling/ensembler.py
rename to ReinforcementLearningLib/ensembler.py
index 907a976b..da555fb2 100644
--- a/Ensembling/ensembler.py
+++ b/ReinforcementLearningLib/ensembler.py
@@ -1,6 +1,7 @@
 import numpy as np
 from enum import Enum
 
+
 class EnsemblerType(Enum):
     MAJOR_VOTING_BASED = 0
     TRUST_BASED = 1
@@ -8,6 +9,7 @@ class EnsemblerType(Enum):
 
 
 class EnsemblerAgent:
+
     def __init__(self, output_size, agents, ensembler_type):
         self.agents = agents
         self.output_size = output_size
@@ -30,8 +32,6 @@ def __init__(self, output_size, agents, ensembler_type):
                 self.trust[i] = 1 / len(self.agents)
             # print("INITIAL TRUST: ", self.trust)
 
-
-
     def act(self, state, discrete_state=None):
         original_state = state
         if self.ensembler_type == EnsemblerType.MAJOR_VOTING_BASED:
@@ -105,4 +105,4 @@ def trust_update(self, win):
             self.votes_per_agent = np.zeros(len(self.agents))
             self.total_actions = 0
             # print(self.trust)
-
+
\ No newline at end of file
diff --git a/ReinforcementLearningLib/qlearning_lib.py b/ReinforcementLearningLib/qlearning_lib.py
index 9056272d..5986a4bd 100644
--- a/ReinforcementLearningLib/qlearning_lib.py
+++ b/ReinforcementLearningLib/qlearning_lib.py
@@ -64,5 +64,3 @@ def next_action(self, state):
         max_indexes = np.arange(len(state))[state == max_value]
         np.random.shuffle(max_indexes)
         return max_indexes[0]
-
-
\ No newline at end of file
diff --git a/ReinforcementLearningLib/ring_buffer.py b/ReinforcementLearningLib/ring_buffer.py
index 6ba0c56e..6888fc65 100644
--- a/ReinforcementLearningLib/ring_buffer.py
+++ b/ReinforcementLearningLib/ring_buffer.py
@@ -1,6 +1,7 @@
 from random import randint
 
 class RingBuffer:
+
     def __init__(self, max_buffer_size):
         self.max_buffer_size = max_buffer_size
         self.current_index = 0
diff --git a/ReinforcementLearningLib/sarsa_lib.py b/ReinforcementLearningLib/sarsa_lib.py
index 126bc4a9..62bcfb55 100644
--- a/ReinforcementLearningLib/sarsa_lib.py
+++ b/ReinforcementLearningLib/sarsa_lib.py
@@ -54,4 +54,4 @@ def act(self, state, return_prob_dist=False):
         # TODO: controllare episode_number
         if not return_prob_dist:
             return next_action
-        return next_action, self.Q_target[state]
\ No newline at end of file
+        return next_action, self.Q_target[state]
diff --git a/TF/tf-gputest.ipynb b/TF/tf-gputest.ipynb
deleted file mode 100644
index 84b06bad..00000000
--- a/TF/tf-gputest.ipynb
+++ /dev/null
@@ -1,43 +0,0 @@
-{
- "cells": [
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Just a sample to test if GPU is working\n",
-    "import tensorflow as tf\n",
-    "\n",
-    "with tf.device('/gpu:0'):\n",
-    "    a = tf.constant([1.0, 2.0, 3.0, 4.0, 5.0, 6.0], shape=[2, 3], name='a')\n",
-    "    b = tf.constant([1.0, 2.0, 3.0, 4.0, 5.0, 6.0], shape=[3, 2], name='b')\n",
-    "    c = tf.matmul(a, b)\n",
-    "\n",
-    "sess = tf.Session()\n",
-    "print (sess.run(c))"
-   ]
-  }
- ],
- "metadata": {
-  "kernelspec": {
-   "display_name": "Python 3",
-   "language": "python",
-   "name": "python3"
-  },
-  "language_info": {
-   "codemirror_mode": {
-    "name": "ipython",
-    "version": 3
-   },
-   "file_extension": ".py",
-   "mimetype": "text/x-python",
-   "name": "python",
-   "nbconvert_exporter": "python",
-   "pygments_lexer": "ipython3",
-   "version": "3.6.5"
-  }
- },
- "nbformat": 4,
- "nbformat_minor": 2
-}
diff --git a/Taxi/ql_taxi.py b/Taxi/ql_taxi.py
index 0cf22d46..1c6f5bb7 100644
--- a/Taxi/ql_taxi.py
+++ b/Taxi/ql_taxi.py
@@ -1,9 +1,10 @@
 import time
-import numpy as np
 import gym
+import numpy as np
 from tqdm import tqdm
 from qlearning_lib import QLAgent
 
+seed = 91
 
 def accuracy(results):
     """
@@ -18,7 +19,7 @@ def experiment(n_episodes, default_policy=False, policy=None, render=False):
 
     steps = [] # Steps per episode
     env = gym.make('Taxi-v2')
-    env.seed(91)
+    env.seed(seed)
 
     if (default_policy):
         agent = QLAgent([env.observation_space.n, env.action_space.n], policy=policy)
@@ -64,9 +65,8 @@ def experiment(n_episodes, default_policy=False, policy=None, render=False):
 training_mean_steps = train_res["steps"].mean()
 training_mean_score = train_res["scores"].mean()
 np.save('ql_policy.npy', learnt_policy)
-#print("Policy learnt: ", learnt_policy)
 
-# np.savetxt("results/ql.csv", train_res["scores"], delimiter=',')
+# np.savetxt("results/training/ql.csv", train_res["scores"], delimiter=',')
 
 # Testing
 test_agent = np.load('ql_policy.npy')
@@ -75,7 +75,7 @@ def experiment(n_episodes, default_policy=False, policy=None, render=False):
 testing_mean_steps = test_res["steps"].mean()
 testing_mean_score = test_res["scores"].mean()
 
-# np.savetxt("results/ql.csv", test_res["scores"], delimiter=',')
+# np.savetxt("results/testing/ql.csv", test_res["scores"], delimiter=',')
 
 print("Training episodes:", len(train_res["steps"]), "Training mean score:", training_mean_score, \
 "Training mean steps", training_mean_steps, "\nAccuracy:", testing_accuracy, "Test mean score:", testing_mean_score, "Test mean steps:", testing_mean_steps)
diff --git a/Taxi/sarsa_taxi.py b/Taxi/sarsa_taxi.py
index 4f8e65df..b2d89377 100644
--- a/Taxi/sarsa_taxi.py
+++ b/Taxi/sarsa_taxi.py
@@ -1,9 +1,10 @@
 import time
-import numpy as np
 import gym
+import numpy as np
 from tqdm import tqdm
 from sarsa_lib import SARSAAgent
 
+seed = 91
 
 def accuracy(results):
     """
@@ -18,16 +19,18 @@ def experiment(n_episodes, default_policy=False, policy=None, render=False):
 
     steps = [] # Steps per episode
     env = gym.make('Taxi-v2')
-    env.seed(91)
+    env.seed(seed)
 
     if (default_policy):
         agent = SARSAAgent([env.observation_space.n, env.action_space.n], policy=policy)
     else:
-        agent = SARSAAgent([env.observation_space.n, env.action_space.n], epsilon_decay_function=lambda e: e - 0.000016, update_rate=10)
+        agent = SARSAAgent([env.observation_space.n, env.action_space.n],
+                           epsilon_decay_function=lambda e: e - 0.000016, update_rate=10)
 
     for _ in tqdm(range(n_episodes)):
         state = env.reset()
         cumulative_reward = 0
+
         if not default_policy:
             agent.extract_policy()
 
@@ -63,12 +66,11 @@ def experiment(n_episodes, default_policy=False, policy=None, render=False):
 # Training
 train_res = experiment(10000)
 learnt_policy = np.argmax(train_res["Q"], axis=1)
-# print("Policy learnt: ", learnt_policy)
 training_mean_steps = train_res["steps"].mean()
 training_mean_score = train_res["scores"].mean()
 np.save('sarsa_policy.npy', learnt_policy)
 
-# np.savetxt("results/sarsa.csv", train_res["scores"], delimiter=',')
+# np.savetxt("results/training/sarsa.csv", train_res["scores"], delimiter=',')
 
 # Testing
 test_agent = np.load('sarsa_policy.npy')
@@ -77,7 +79,7 @@ def experiment(n_episodes, default_policy=False, policy=None, render=False):
 testing_mean_steps = test_res["steps"].mean()
 testing_mean_score = test_res["scores"].mean()
 
-# np.savetxt("results/sarsa.csv", test_res["scores"], delimiter=',')
+# np.savetxt("results/testing/sarsa.csv", test_res["scores"], delimiter=',')
 
 print("Training episodes:", len(train_res["steps"]), "Training mean score:", training_mean_score, \
 "Training mean steps", training_mean_steps, "\nAccuracy:", testing_accuracy, "Test mean score:", testing_mean_score, "Test mean steps:", testing_mean_steps)
diff --git a/qlearning.py b/qlearning.py
deleted file mode 100644
index 8c71fc67..00000000
--- a/qlearning.py
+++ /dev/null
@@ -1,71 +0,0 @@
-import numpy as np
-import numpy.random as rn
-
-def updateQ(Q, state, new_state, action, reward, alpha, gamma):
-    """
-    It applies Q-Learning update rule.
-    Parameters:
-    Q -> Q matrix
-    state -> current state t
-    new_state -> next state t
-    reward -> reward
-    action -> current action
-    """
-    future_action = np.argmax(Q[new_state]) # Find the best action to perform at time t+1
-    Q[state, action] = (1 - alpha)*Q[state, action] + alpha * (reward + gamma*Q[new_state, future_action])
-    return Q
-
-def updateQ_tensor(Q, state, new_state, action, reward, alpha, gamma):
-    """
-    It applies Q-Learning update rule considering 3-dimensional matrices. It is used in MountainCar-v0 environment.
-    Parameters:
-    Q -> Q matrix
-    state -> current state t
-    new_state -> next state t
-    reward -> reward
-    action -> current action
-    """
-    future_action = np.argmax(Q[new_state[0],new_state[1]]) # Find the best action to perform at time t+1
-    Q[state[0],state[1], action] = (1 - alpha)*Q[state[0],state[1], action] + alpha * (reward + gamma*Q[new_state[0],new_state[1], future_action])
-    return Q
-
-def next_action1(state):
-    """
-    It chooses the best action given the current state.
-    Paramteres:
-    state -> array of possible actions in the current state.
-    """
-    max_value = np.amax(state)
-    max_indexes = np.arange(len(state))[state == max_value]
-    rn.shuffle(max_indexes)
-    return max_indexes[0]
-
-def next_action2(state,i_episode):
-    return np.argmax(state + np.random.randn(1,len(state))*(1./(i_episode+1)))
-
-def next_action3(state,epsilon):
-    """
-    It chooses the best action given the current state.
-    Paramteres:
-    state -> array of possible actions in the current state.
-    """
-    if np.random.uniform() > epsilon:
-        max_value = np.amax(state)
-        max_indexes = np.arange(len(state))[state == max_value]
-        rn.shuffle(max_indexes)
-        return max_indexes[0]
-    return np.argmax(np.random.uniform(0,1, size=4))
-
-def get_epsilon(k,n):
-    res = (n - k) / n
-    if res < 0.01:
-        return 0.01
-    return res
-
-
-def get_epsilon_exp(n):
-    res = 1 / (n + 1)
-    if res < 0.01:
-        return 0.01
-    return res
-
\ No newline at end of file
diff --git a/sarsa.py b/sarsa.py
deleted file mode 100644
index 8f17dbfc..00000000
--- a/sarsa.py
+++ /dev/null
@@ -1,56 +0,0 @@
-import numpy as np
-import numpy.random as rn
-
-def updateQ(Q, state, new_state, action, next_action, reward, alpha, gamma):
-    """
-    It applies Q-Learning update rule.
-    Parameters:
-    Q -> Q matrix
-    state -> current state t
-    new_state -> next state t
-    reward -> reward
-    action -> current action
-    next_action -> next action
-    """
-    Q[state, action] = (1 - alpha) * Q[state, action] + alpha * (reward + gamma*Q[new_state, next_action])
-    return Q
-
-def next_action1(state):
-    """
-    It chooses the best action given the current state.
-    Paramteres:
-    state -> array of possible actions in the current state.
-    """
-    v_max = np.amax(state)
-    indexes = np.arange(len(state))[state == v_max]
-    rn.shuffle(indexes)
-    return indexes[0]
-
-def next_action2(state,i_episode):
-    return np.argmax(state + np.random.randn(1,len(state))*(1./(i_episode+1)))
-
-def next_action3(action,epsilon):
-    """
-    It chooses the best action given the current state.
-    Paramteres:
-    action -> best action to perform.
-    epsilon -> exploration/exploitation probability.
-    """
-    if np.random.uniform() > epsilon:
-        return action
-    return np.argmax(np.random.uniform(0,1, size=6))
-
-def gen_policy(Q):
-    return [next_action1(state) for state in Q]
-
-def get_epsilon(k,n):
-    res = (n - k) / n
-    if res < 0.01:
-        return 0.01
-    return res
-
-def get_epsilon_exp(n):
-    res = 1 / (n + 1)
-    if res < 0.01:
-        return 0.01
-    return res
\ No newline at end of file
diff --git a/statistics.py b/statistics.py
deleted file mode 100644
index 295b04d5..00000000
--- a/statistics.py
+++ /dev/null
@@ -1,15 +0,0 @@
-def ma(ts, q):
-    acc = 0
-    res = []
-    for i in range(q, len(ts) - q):
-        for j in range(i - q, i + q):
-            acc += ts[j]
-        res.append(acc / (2 * q + 1))
-        acc = 0
-    return res
-
-def accuracy(results):
-    """
-    Evaluate the accuracy of results, considering victories and defeats.
-    """
-    return results[1] / (results[0]+results[1]) * 100
\ No newline at end of file