From 60895feeeb3b6b86acd664dece277b433eee877e Mon Sep 17 00:00:00 2001 From: Gianluca Bigaglia Date: Sat, 5 Jan 2019 15:20:10 +0100 Subject: [PATCH] Add comments --- Breakout/Breakout_DQN.py | 29 +++++- CartPole/dqn_cart_pole.py | 24 +++++ Ensembling/ensembling_cart_pole.py | 33 +++++++ Ensembling/ensembling_mountaincar.py | 24 +++++ Ensembling/ensembling_mountaincar_mixed.py | 35 ++++++- FrozenLake/ql_4x4.py | 22 +++++ FrozenLake/ql_4x4_deterministic.py | 22 +++++ FrozenLake/ql_8x8.py | 22 +++++ FrozenLake/ql_8x8_deterministic.py | 22 +++++ FrozenLake/sarsa_4x4.py | 22 +++++ FrozenLake/sarsa_4x4_deterministic.py | 22 +++++ FrozenLake/sarsa_8x8.py | 22 +++++ FrozenLake/sarsa_8x8_deterministic.py | 22 +++++ MountainCar/dqn_mountain_car.py | 26 +++++- MountainCar/ql_mountain_car.py | 34 ++++++- MountainCar/sarsa_mountain_car.py | 34 ++++++- Pong/pong_dqn.py | 28 +++++- ReinforcementLearningLib/dqn_lib.py | 102 +++++++++++++++++++-- ReinforcementLearningLib/ensembler.py | 31 +++++-- ReinforcementLearningLib/qlearning_lib.py | 56 +++++++---- ReinforcementLearningLib/ring_buffer.py | 69 +++++++++----- ReinforcementLearningLib/sarsa_lib.py | 45 +++++++-- Taxi/ql_taxi.py | 22 +++++ Taxi/sarsa_taxi.py | 22 +++++ 24 files changed, 715 insertions(+), 75 deletions(-) diff --git a/Breakout/Breakout_DQN.py b/Breakout/Breakout_DQN.py index 5b00da7c..6985f4e2 100644 --- a/Breakout/Breakout_DQN.py +++ b/Breakout/Breakout_DQN.py @@ -14,9 +14,17 @@ from dqn_lib import DQNAgent -# Original size: 210x160x3 def pre_processing(observe): - grayscaled = rgb2gray(observe) # 210x160 + """ + Frame grayscaling and subsampling + + Args: + observe: input frame + + Returns: + processed_observed: output frame + """ + grayscaled = rgb2gray(observe) # From 210x160x3 to 210x160 grayscaled = grayscaled[16:201,:] processed_observe = np.uint8(resize(grayscaled, (84, 84), mode='constant') * 255) return processed_observe @@ -24,6 +32,23 @@ def pre_processing(observe): # 0: stay, 1: start, 2: right, 3: left def experiment(n_episodes, max_action, default_policy=False, policy=None, render=False): + """ + Run a RL experiment that can be either training or testing + + Args: + n_episodes: number of train/test episodes + max_action: maximum number of steps per episode + default_policy: boolean to enable testing/training phase + policy: numpy tensor with a trained policy + render: enable OpenAI environment graphical rendering + + Returns: + Dictionary with: + cumulative experiments outcomes + list of steps per episode + list of cumulative rewards + trained policy + """ with tf.device('/gpu:0'): res = [0,0] # array of results accumulator: {[0]: Loss, [1]: Victory} diff --git a/CartPole/dqn_cart_pole.py b/CartPole/dqn_cart_pole.py index 764b93eb..1cdef6fa 100644 --- a/CartPole/dqn_cart_pole.py +++ b/CartPole/dqn_cart_pole.py @@ -18,11 +18,34 @@ def accuracy(results): """ Evaluate the accuracy of results, considering victories and defeats. 
+ + Args: + results: List of 2 elements representing the number of victories and defeats + + Returns: + results accuracy """ return results[1] / (results[0] + results[1]) * 100 def experiment(n_episodes, default_policy=False, policy=None, render = False): + """ + Run a RL experiment that can be either training or testing + + Args: + n_episodes: number of train/test episodes + default_policy: boolean to enable testing/training phase + policy: numpy tensor with a trained policy + render: enable OpenAI environment graphical rendering + agent_config: DQNAgent object + + Returns: + Dictionary with: + cumulative experiments outcomes + list of steps per episode + list of cumulative rewards + trained policy + """ res = [0, 0] # array of results accumulator: {[0]: Loss, [1]: Victory} scores = [] # Cumulative rewards steps = [] # steps per episode @@ -62,6 +85,7 @@ def experiment(n_episodes, default_policy=False, policy=None, render = False): x, x_dot, theta, theta_dot = new_state new_state = np.reshape(new_state, [1, 4]) + # Reward shaping r1 = (env.x_threshold - abs(x)) / env.x_threshold - 0.8 r2 = (env.theta_threshold_radians - abs(theta)) / env.theta_threshold_radians - 0.5 r3 = -abs(theta_dot) diff --git a/Ensembling/ensembling_cart_pole.py b/Ensembling/ensembling_cart_pole.py index d9f93a45..0165d1ec 100644 --- a/Ensembling/ensembling_cart_pole.py +++ b/Ensembling/ensembling_cart_pole.py @@ -19,11 +19,27 @@ def accuracy(results): """ Evaluate the accuracy of results, considering victories and defeats. + + Args: + results: List of 2 elements representing the number of victories and defeats + + Returns: + results accuracy """ return results[1] / (results[0] + results[1]) * 100 def evaluate(env, agentE): + """ + Model validation for early stopping. + + Args: + env: OpenAI environment object + agentE: Ensembler object + + Returns: + true if accuracy is 100%, false otherwise + """ eval_steps = [] eval_scores = [] eval_res = [0, 0] @@ -67,6 +83,22 @@ def evaluate(env, agentE): def experiment(n_episodes, default_policy=False, policy=None, render = False): + """ + Run a RL experiment that can be either training or testing + + Args: + n_episodes: number of train/test episodes + default_policy: boolean to enable testing/training phase + policy: numpy tensor with a trained policy + render: enable OpenAI environment graphical rendering + + Returns: + Dictionary with: + cumulative experiments outcomes + list of steps per episode + list of cumulative rewards + trained policy + """ res = [0, 0] # array of results accumulator: {[0]: Loss, [1]: Victory} scores = [] # Cumulative rewards steps = [] # steps per episode @@ -117,6 +149,7 @@ def experiment(n_episodes, default_policy=False, policy=None, render = False): x, x_dot, theta, theta_dot = new_state new_state = np.reshape(new_state, [1, 4]) + # Reward shaping r1 = (env.x_threshold - abs(x)) / env.x_threshold - 0.8 r2 = (env.theta_threshold_radians - abs(theta)) / env.theta_threshold_radians - 0.5 r3 = -abs(theta_dot) diff --git a/Ensembling/ensembling_mountaincar.py b/Ensembling/ensembling_mountaincar.py index 7805b17d..c1af2eec 100644 --- a/Ensembling/ensembling_mountaincar.py +++ b/Ensembling/ensembling_mountaincar.py @@ -19,11 +19,33 @@ def accuracy(results): """ Evaluate the accuracy of results, considering victories and defeats. 
+ + Args: + results: List of 2 elements representing the number of victories and defeats + + Returns: + results accuracy """ return results[1] / (results[0] + results[1]) * 100 def experiment(n_episodes, default_policy=False, policy=None, render=False): + """ + Run a RL experiment that can be either training or testing + + Args: + n_episodes: number of train/test episodes + default_policy: boolean to enable testing/training phase + policy: numpy tensor with a trained policy + render: enable OpenAI environment graphical rendering + + Returns: + Dictionary with: + cumulative experiments outcomes + list of steps per episode + list of cumulative rewards + trained policy + """ res = [0, 0] # array of results accumulator: {[0]: Loss, [1]: Victory} scores = [] # Cumulative rewards steps = [] # Steps per episode @@ -74,6 +96,7 @@ def experiment(n_episodes, default_policy=False, policy=None, render=False): # r2 = reward + 0.2 * np.sin(3 * original_state[0]) # r3 = reward + 0.7 * (original_state[1] * original_state[1]) + # Reward shaping r1 = reward + original_state[0] r2 = reward + np.sin(3 * original_state[0]) r3 = reward + (original_state[1] * original_state[1]) @@ -111,6 +134,7 @@ def experiment(n_episodes, default_policy=False, policy=None, render=False): cumulative_reward += reward scores.append(cumulative_reward) else: + # Model validation for early stopping evaluate = False eval_res = [0, 0] # array of results accumulator: {[0]: Loss, [1]: Victory} eval_scores = [] # Cumulative rewards diff --git a/Ensembling/ensembling_mountaincar_mixed.py b/Ensembling/ensembling_mountaincar_mixed.py index f11be6cf..635d94e7 100644 --- a/Ensembling/ensembling_mountaincar_mixed.py +++ b/Ensembling/ensembling_mountaincar_mixed.py @@ -22,12 +22,28 @@ def accuracy(results): """ Evaluate the accuracy of results, considering victories and defeats. + + Args: + results: List of 2 elements representing the number of victories and defeats + + Returns: + results accuracy """ return results[1] / (results[0] + results[1]) * 100 def obs_to_state(env, obs, n_states): - """ Maps an observation to state """ + """ + Perfom the discretisation of an observation. 
+ + Args: + env: OpenAI environment object + obs: current state observation + n_state: number of discrete bins + + Returns: + Discretised observation + """ env_low = env.observation_space.low env_high = env.observation_space.high env_dx = (env_high - env_low) / n_states @@ -37,6 +53,22 @@ def obs_to_state(env, obs, n_states): def experiment(n_episodes, default_policy=False, policy=None, render=False): + """ + Run a RL experiment that can be either training or testing + + Args: + n_episodes: number of train/test episodes + default_policy: boolean to enable testing/training phase + policy: numpy tensor with a trained policy + render: enable OpenAI environment graphical rendering + + Returns: + Dictionary with: + cumulative experiments outcomes + list of steps per episode + list of cumulative rewards + trained policy + """ res = [0, 0] # array of results accumulator: {[0]: Loss, [1]: Victory} scores = [] # Cumulative rewards steps = [] # Steps per episode @@ -82,6 +114,7 @@ def experiment(n_episodes, default_policy=False, policy=None, render=False): new_discretized_state = obs_to_state(env, new_state, n_states) original_state = new_state + # Reward shaping # r1 = reward + 0.1 * original_state[0] # r2 = reward + 0.2 * np.sin(3 * original_state[0]) # r3 = reward + 0.7 * (original_state[1] * original_state[1]) diff --git a/FrozenLake/ql_4x4.py b/FrozenLake/ql_4x4.py index 160d49a1..9298aae1 100644 --- a/FrozenLake/ql_4x4.py +++ b/FrozenLake/ql_4x4.py @@ -9,11 +9,33 @@ def accuracy(results): """ Evaluate the accuracy of results, considering victories and defeats. + + Args: + results: List of 2 elements representing the number of victories and defeats + + Returns: + results accuracy """ return results[1] / (results[0] + results[1]) * 100 def experiment(n_episodes, default_policy=False, policy=None, render=False): + """ + Run a RL experiment that can be either training or testing + + Args: + n_episodes: number of train/test episodes + default_policy: boolean to enable testing/training phase + policy: numpy tensor with a trained policy + render: enable OpenAI environment graphical rendering + + Returns: + Dictionary with: + cumulative experiments outcomes + list of steps per episode + list of cumulative rewards + trained policy + """ res = [0, 0] # array of results accumulator: {[0]: Loss, [1]: Victory} scores = [] # Cumulative rewards steps = [] # Steps per episode diff --git a/FrozenLake/ql_4x4_deterministic.py b/FrozenLake/ql_4x4_deterministic.py index 04d905c8..5df84b73 100644 --- a/FrozenLake/ql_4x4_deterministic.py +++ b/FrozenLake/ql_4x4_deterministic.py @@ -9,11 +9,33 @@ def accuracy(results): """ Evaluate the accuracy of results, considering victories and defeats. 
+ + Args: + results: List of 2 elements representing the number of victories and defeats + + Returns: + results accuracy """ return results[1] / (results[0] + results[1]) * 100 def experiment(n_episodes, default_policy=False, policy=None, render=False): + """ + Run a RL experiment that can be either training or testing + + Args: + n_episodes: number of train/test episodes + default_policy: boolean to enable testing/training phase + policy: numpy tensor with a trained policy + render: enable OpenAI environment graphical rendering + + Returns: + Dictionary with: + cumulative experiments outcomes + list of steps per episode + list of cumulative rewards + trained policy + """ res = [0, 0] # array of results accumulator: {[0]: Loss, [1]: Victory} scores = [] # Cumulative rewards steps = [] # Steps per episode diff --git a/FrozenLake/ql_8x8.py b/FrozenLake/ql_8x8.py index 88226b57..7fff7b10 100644 --- a/FrozenLake/ql_8x8.py +++ b/FrozenLake/ql_8x8.py @@ -9,11 +9,33 @@ def accuracy(results): """ Evaluate the accuracy of results, considering victories and defeats. + + Args: + results: List of 2 elements representing the number of victories and defeats + + Returns: + results accuracy """ return results[1] / (results[0] + results[1]) * 100 def experiment(n_episodes, default_policy=False, policy=None, render=False): + """ + Run a RL experiment that can be either training or testing + + Args: + n_episodes: number of train/test episodes + default_policy: boolean to enable testing/training phase + policy: numpy tensor with a trained policy + render: enable OpenAI environment graphical rendering + + Returns: + Dictionary with: + cumulative experiments outcomes + list of steps per episode + list of cumulative rewards + trained policy + """ res = [0, 0] # array of results accumulator: {[0]: Loss, [1]: Victory} scores = [] # Cumulative rewards steps = [] # Steps per episode diff --git a/FrozenLake/ql_8x8_deterministic.py b/FrozenLake/ql_8x8_deterministic.py index 05560848..090ca6bd 100644 --- a/FrozenLake/ql_8x8_deterministic.py +++ b/FrozenLake/ql_8x8_deterministic.py @@ -9,11 +9,33 @@ def accuracy(results): """ Evaluate the accuracy of results, considering victories and defeats. + + Args: + results: List of 2 elements representing the number of victories and defeats + + Returns: + results accuracy """ return results[1] / (results[0] + results[1]) * 100 def experiment(n_episodes, default_policy=False, policy=None, render=False): + """ + Run a RL experiment that can be either training or testing + + Args: + n_episodes: number of train/test episodes + default_policy: boolean to enable testing/training phase + policy: numpy tensor with a trained policy + render: enable OpenAI environment graphical rendering + + Returns: + Dictionary with: + cumulative experiments outcomes + list of steps per episode + list of cumulative rewards + trained policy + """ res = [0, 0] # array of results accumulator: {[0]: Loss, [1]: Victory} scores = [] # Cumulative rewards steps = [] # Steps per episode diff --git a/FrozenLake/sarsa_4x4.py b/FrozenLake/sarsa_4x4.py index 89898f20..eb42f19b 100644 --- a/FrozenLake/sarsa_4x4.py +++ b/FrozenLake/sarsa_4x4.py @@ -9,11 +9,33 @@ def accuracy(results): """ Evaluate the accuracy of results, considering victories and defeats. 
+ + Args: + results: List of 2 elements representing the number of victories and defeats + + Returns: + results accuracy """ return results[1] / (results[0] + results[1]) * 100 def experiment(n_episodes, default_policy=False, policy=None, render=False): + """ + Run a RL experiment that can be either training or testing + + Args: + n_episodes: number of train/test episodes + default_policy: boolean to enable testing/training phase + policy: numpy tensor with a trained policy + render: enable OpenAI environment graphical rendering + + Returns: + Dictionary with: + cumulative experiments outcomes + list of steps per episode + list of cumulative rewards + trained policy + """ res = [0, 0] # array of results accumulator: {[0]: Loss, [1]: Victory} scores = [] # Cumulative rewards steps = [] # Steps per episode diff --git a/FrozenLake/sarsa_4x4_deterministic.py b/FrozenLake/sarsa_4x4_deterministic.py index 36fb362b..49f27a0b 100644 --- a/FrozenLake/sarsa_4x4_deterministic.py +++ b/FrozenLake/sarsa_4x4_deterministic.py @@ -9,11 +9,33 @@ def accuracy(results): """ Evaluate the accuracy of results, considering victories and defeats. + + Args: + results: List of 2 elements representing the number of victories and defeats + + Returns: + results accuracy """ return results[1] / (results[0] + results[1]) * 100 def experiment(n_episodes, default_policy=False, policy=None, render=False): + """ + Run a RL experiment that can be either training or testing + + Args: + n_episodes: number of train/test episodes + default_policy: boolean to enable testing/training phase + policy: numpy tensor with a trained policy + render: enable OpenAI environment graphical rendering + + Returns: + Dictionary with: + cumulative experiments outcomes + list of steps per episode + list of cumulative rewards + trained policy + """ res = [0, 0] # array of results accumulator: {[0]: Loss, [1]: Victory} scores = [] # Cumulative rewards steps = [] # Steps per episode diff --git a/FrozenLake/sarsa_8x8.py b/FrozenLake/sarsa_8x8.py index c3515e06..4e937b1f 100644 --- a/FrozenLake/sarsa_8x8.py +++ b/FrozenLake/sarsa_8x8.py @@ -9,11 +9,33 @@ def accuracy(results): """ Evaluate the accuracy of results, considering victories and defeats. + + Args: + results: List of 2 elements representing the number of victories and defeats + + Returns: + results accuracy """ return results[1] / (results[0] + results[1]) * 100 def experiment(n_episodes, default_policy=False, policy=None, render=False): + """ + Run a RL experiment that can be either training or testing + + Args: + n_episodes: number of train/test episodes + default_policy: boolean to enable testing/training phase + policy: numpy tensor with a trained policy + render: enable OpenAI environment graphical rendering + + Returns: + Dictionary with: + cumulative experiments outcomes + list of steps per episode + list of cumulative rewards + trained policy + """ res = [0, 0] # array of results accumulator: {[0]: Loss, [1]: Victory} scores = [] # Cumulative rewards steps = [] # Steps per episode diff --git a/FrozenLake/sarsa_8x8_deterministic.py b/FrozenLake/sarsa_8x8_deterministic.py index 83ad4751..1ae69592 100644 --- a/FrozenLake/sarsa_8x8_deterministic.py +++ b/FrozenLake/sarsa_8x8_deterministic.py @@ -9,11 +9,33 @@ def accuracy(results): """ Evaluate the accuracy of results, considering victories and defeats. 
+ + Args: + results: List of 2 elements representing the number of victories and defeats + + Returns: + results accuracy """ return results[1] / (results[0] + results[1]) * 100 def experiment(n_episodes, default_policy=False, policy=None, render=False): + """ + Run a RL experiment that can be either training or testing + + Args: + n_episodes: number of train/test episodes + default_policy: boolean to enable testing/training phase + policy: numpy tensor with a trained policy + render: enable OpenAI environment graphical rendering + + Returns: + Dictionary with: + cumulative experiments outcomes + list of steps per episode + list of cumulative rewards + trained policy + """ res = [0, 0] # array of results accumulator: {[0]: Loss, [1]: Victory} scores = [] # Cumulative rewards steps = [] # Steps per episode diff --git a/MountainCar/dqn_mountain_car.py b/MountainCar/dqn_mountain_car.py index 5ef1b539..c8b7a772 100644 --- a/MountainCar/dqn_mountain_car.py +++ b/MountainCar/dqn_mountain_car.py @@ -18,11 +18,34 @@ def accuracy(results): """ Evaluate the accuracy of results, considering victories and defeats. + + Args: + results: List of 2 elements representing the number of victories and defeats + + Returns: + results accuracy """ return results[1] / (results[0] + results[1]) * 100 def experiment(n_episodes, default_policy=False, policy=None, render=False, agent_config=None): + """ + Run a RL experiment that can be either training or testing + + Args: + n_episodes: number of train/test episodes + default_policy: boolean to enable testing/training phase + policy: numpy tensor with a trained policy + render: enable OpenAI environment graphical rendering + agent_config: DQNAgent object + + Returns: + Dictionary with: + cumulative experiments outcomes + list of steps per episode + list of cumulative rewards + trained policy + """ res = [0, 0] # array of results accumulator: {[0]: Loss, [1]: Victory} scores = [] # Cumulative rewards steps = [] # Steps per episode @@ -51,6 +74,7 @@ def experiment(n_episodes, default_policy=False, policy=None, render=False, agen state = env.reset() cumulative_reward = 0 + # Model validation for early stopping if i_episode > 0 and (i_episode % 100) == 0 and not default_policy: agent.save_model("tmp_model") evaluation_result = experiment(500, default_policy=True, policy="tmp_model") @@ -69,7 +93,7 @@ def experiment(n_episodes, default_policy=False, policy=None, render=False, agen next_action = agent.act(state) new_state, reward, end, _ = env.step(next_action) - reward = abs(new_state[0] - (-0.5)) # r in [0, 1] + reward = abs(new_state[0] - (-0.5)) # r in [0, 1] (reward shaping) new_state = np.reshape(new_state, [1, 2]) agent.memoise((state, next_action, reward, new_state, end)) diff --git a/MountainCar/ql_mountain_car.py b/MountainCar/ql_mountain_car.py index b95b27c3..a1230f5f 100644 --- a/MountainCar/ql_mountain_car.py +++ b/MountainCar/ql_mountain_car.py @@ -9,12 +9,28 @@ def accuracy(results): """ Evaluate the accuracy of results, considering victories and defeats. + + Args: + results: List of 2 elements representing the number of victories and defeats + + Returns: + results accuracy """ return results[1] / (results[0] + results[1]) * 100 def obs_to_state(env, obs, n_states): - """ Maps an observation to state """ + """ + Perfom the discretisation of an observation. 
+ + Args: + env: OpenAI environment object + obs: current state observation + n_states: number of discrete bins + + Returns: + Discretised observation + """ env_low = env.observation_space.low env_high = env.observation_space.high env_dx = (env_high - env_low) / n_states a = int((obs[0] - env_low[0])/env_dx[0]) b = int((obs[1] - env_low[1])/env_dx[1]) return a, b def experiment(n_episodes, default_policy=False, policy=None, render=False): + """ + Run a RL experiment that can be either training or testing + + Args: + n_episodes: number of train/test episodes + default_policy: boolean to enable testing/training phase + policy: numpy tensor with a trained policy + render: enable OpenAI environment graphical rendering + + Returns: + Dictionary with: + cumulative experiments outcomes + list of steps per episode + list of cumulative rewards + trained policy + """ res = [0,0] # array of results accumulator: {[0]: Loss, [1]: Victory} scores = [] # Cumulative rewards steps = [] # Steps per episode diff --git a/MountainCar/sarsa_mountain_car.py b/MountainCar/sarsa_mountain_car.py index affbc285..773127ad 100644 --- a/MountainCar/sarsa_mountain_car.py +++ b/MountainCar/sarsa_mountain_car.py @@ -9,12 +9,28 @@ def accuracy(results): """ Evaluate the accuracy of results, considering victories and defeats. + + Args: + results: List of 2 elements representing the number of victories and defeats + + Returns: + results accuracy """ return results[1] / (results[0] + results[1]) * 100 def obs_to_state(env, obs, n_states): - """ Maps an observation to state """ + """ + Perform the discretisation of an observation. + + Args: + env: OpenAI environment object + obs: current state observation + n_states: number of discrete bins + + Returns: + Discretised observation + """ env_low = env.observation_space.low env_high = env.observation_space.high env_dx = (env_high - env_low) / n_states a = int((obs[0] - env_low[0])/env_dx[0]) b = int((obs[1] - env_low[1])/env_dx[1]) return a, b def experiment(n_episodes, default_policy=False, policy=None, render=False): + """ + Run a RL experiment that can be either training or testing + + Args: + n_episodes: number of train/test episodes + default_policy: boolean to enable testing/training phase + policy: numpy tensor with a trained policy + render: enable OpenAI environment graphical rendering + + Returns: + Dictionary with: + cumulative experiments outcomes + list of steps per episode + list of cumulative rewards + trained policy + """ res = [0,0] # array of results accumulator: {[0]: Loss, [1]: Victory} scores = [] # Cumulative rewards steps = [] # Steps per episode diff --git a/Pong/pong_dqn.py b/Pong/pong_dqn.py index 2b431fdd..58d6e762 100644 --- a/Pong/pong_dqn.py +++ b/Pong/pong_dqn.py @@ -16,15 +16,39 @@ from ring_buffer import RingBuffer -# Original size: 210x160x3 def pre_processing(observe): - grayscaled = rgb2gray(observe) # 210x160 + """ + Frame grayscaling and subsampling + + Args: + observe: input frame + + Returns: + processed_observe: output frame + """ + grayscaled = rgb2gray(observe) # From 210x160x3 to 210x160 grayscaled = grayscaled[16:201,:] processed_observe = np.uint8(resize(grayscaled, (84, 84), mode='constant') * 255) return processed_observe def experiment(n_episodes, default_policy=False, policy=None, render=False): + """ + Run a RL experiment that can be either training or testing + + Args: + n_episodes: number of train/test episodes + default_policy: boolean to enable testing/training phase + policy: numpy tensor with a trained policy + render: enable OpenAI environment graphical rendering + + Returns: + Dictionary with: +
cumulative experiments outcomes + list of steps per episode + list of cumulative rewards + trained policy + """ with tf.device('/gpu:0'): res = [0,0] # array of results accumulator: {[0]: Loss, [1]: Victory} diff --git a/ReinforcementLearningLib/dqn_lib.py b/ReinforcementLearningLib/dqn_lib.py index ad4abd12..0d0137e8 100644 --- a/ReinforcementLearningLib/dqn_lib.py +++ b/ReinforcementLearningLib/dqn_lib.py @@ -8,6 +8,17 @@ def huber_loss(a, b, in_keras=True): + """ + Apply Huber loss function + + Args: + a: target value + b: predicted value + in_keras: use keras backend + + Returns: + Huber loss value + """ error = a - b quadratic_term = error*error / 2 linear_term = abs(error) - 1/2 @@ -24,6 +35,24 @@ def __init__(self, output_size, layers, memory_size=3000, batch_size=32, epsilon=1, epsilon_lower_bound=0.1, epsilon_decay_function=lambda e: e - (0.9 / 1000000), gamma=0.95, optimizer=RMSprop(0.00025), learn_thresh=50000, update_rate=10000): + """ + Args: + output_size: number of actions + layers: list of Keras layers + memory_size: size of replay memory + batch_size: size of batch for replay memory + use_ddqn: boolean for choosing between DQN/DDQN + default_policy: boolean for loading a model from a file + model_filename: name of file to load + tb_dir: directory for tensorboard logging + epsilon: annealing function upper bound + epsilon_lower_bound: annealing function lower bound + epsilon_decay_function: lambda annealing function + gamma: discount factor hyper parameter + optimizer: Keras optimiser + learn_thresh: number of steps to perform without learning + update_rate: number of steps between network-target weights copy + """ self.output_size = output_size self.memory = RingBuffer(memory_size) self.use_ddqn = use_ddqn @@ -56,9 +85,17 @@ def __init__(self, output_size, layers, memory_size=3000, batch_size=32, if self.use_ddqn: self.target_model = self.build_model(layers) - # self.evaluate_model.summary() def build_model(self, layers): + """ + Build a Neural Network. + + Args: + layers: list of Keras NN layers + + Returns: + model: compiled model with embedded loss and optimiser + """ model = Sequential() for l in layers: model.add(l) @@ -67,15 +104,23 @@ def build_model(self, layers): return model def update_target_model(self): + """ + Set target net weights to evaluation net weights. + """ self.target_model.set_weights(self.evaluate_model.get_weights()) def replay(self): + """ + Perform DQN learning phase through experience replay. 
+ """ pick = self.random_pick() for state, next_action, reward, new_state, end in pick: # for state, next_action, reward, frame, end in pick: - # state = np.float32(state / 255) # TODO: generalisation - # frame = np.float32(frame / 255) # TODO: generalisation - # new_state = np.append(frame, state[:, :, :, :3], axis=3) # TODO: generalisation + # state = np.float32(state / 255) # for CNN learning + # frame = np.float32(frame / 255) # for CNN learning + # new_state = np.append(frame, state[:, :, :, :3], axis=3) # for CNN learning + + # Simple DQN case if self.use_ddqn == False: if not end: reward = reward + self.gamma * np.amax(self.evaluate_model.predict(new_state)[0]) @@ -83,6 +128,7 @@ def replay(self): new_prediction = self.evaluate_model.predict(state) new_prediction[0][next_action] = reward else: + # Double DQN case if not end: action = np.argmax(self.evaluate_model.predict(new_state)[0]) reward = reward + self.gamma * self.target_model.predict(new_state)[0][action] @@ -97,17 +143,36 @@ def replay(self): self.tb_step += 1 def random_pick(self): + """ + Pick a random set of elements from replay memory of size self.batch_size. + + Returns: + set of random elements from memory + """ return self.memory.random_pick(self.batch_size) def act(self, state, return_prob_dist=False): + """ + Return the action for current state. + + Args: + state: current state t + return_prob_dist: boolean for probability distribution used by ensemblers + + Returns: + next_action: next action to perform + prediction: probability distribution + """ + # Annealing if np.random.uniform() > self.epsilon: - # state = np.float32(state / 255) # TODO: generalisation + # state = np.float32(state / 255) # for CNN learning prediction = self.evaluate_model.predict(state)[0] next_action = np.argmax(prediction) else: prediction = np.random.uniform(0, 1, size=self.output_size) next_action = np.argmax(prediction) + # Start decaying after self.learn_thresh steps if self.total_steps > self.learn_thresh: self.epsilon = self.epsilon_decay_function(self.epsilon) self.epsilon = np.amax([self.epsilon, self.epsilon_lower_bound]) @@ -119,20 +184,45 @@ def act(self, state, return_prob_dist=False): return next_action, prediction def memoise(self, t): + """ + Store tuple to replay memory. + + Args: + t: element to store + """ if not self.default_policy: self.memory.append(t) def learn(self): + """ + Perform the learning phase. + """ + # Start target model update after self.learn_thresh steps if (self.total_steps > self.learn_thresh and (self.total_steps % self.update_rate) == 0 and not self.default_policy and self.use_ddqn == True): self.update_target_model() - # print("model updated, epsilon:", self.epsilon) + # Start learning after self.learn_thresh steps if self.total_steps > self.learn_thresh and not self.default_policy and self.total_steps % 4 == 0: self.replay() def save_model(self, filename): + """ + Serialise the model to .h5 file. 
+ + Args: + filename + """ self.evaluate_model.save('%s.h5' % filename) def load_model(self, filename): + """ + Load model from .h5 file + + Args: + filename + + Returns: + model + """ return load_model('%s.h5' % filename, custom_objects={ 'huber_loss': huber_loss }) diff --git a/ReinforcementLearningLib/ensembler.py b/ReinforcementLearningLib/ensembler.py index da555fb2..5e7289e9 100644 --- a/ReinforcementLearningLib/ensembler.py +++ b/ReinforcementLearningLib/ensembler.py @@ -11,6 +11,12 @@ class EnsemblerType(Enum): class EnsemblerAgent: def __init__(self, output_size, agents, ensembler_type): + """ + Args: + output_size: number of actions + agents: list of RL agents + ensembler_type: type of ensembling by EnsemblerType enum + """ self.agents = agents self.output_size = output_size self.ensembler_type = ensembler_type @@ -30,9 +36,18 @@ def __init__(self, output_size, agents, ensembler_type): for i in range(len(self.trust)): self.trust[i] = 1 / len(self.agents) - # print("INITIAL TRUST: ", self.trust) def act(self, state, discrete_state=None): + """ + Return the action for current state. + + Args: + state: current state + discrete_state: discretised current state + + Returns: + action: next action to perform + """ original_state = state if self.ensembler_type == EnsemblerType.MAJOR_VOTING_BASED: for agent in self.agents: @@ -84,17 +99,16 @@ def act(self, state, discrete_state=None): action = np.random.choice(np.argwhere(self.votes==np.amax(self.votes)).flatten()) self.votes = np.zeros(self.output_size) return action - # for RANKING VOTING - #array = np.array([4,2,7,1]) - #temp = array.argsort() - #ranks = np.empty_like(temp) - #ranks[temp] = np.arange(len(array)) - return 73 # Houston, we have a problem! def trust_update(self, win): + """ + Update Gamma trust vector. + + Args: + win: result of current game + """ if self.ensembler_type == EnsemblerType.TRUST_BASED: - # print(self.votes_per_agent, "win:", win, "total actions:", self.total_actions) for i in range(len(self.agents)): if win: self.trust[i] = self.trust[i] * (1 + self.trust_rate * (self.votes_per_agent[i] / self.total_actions)) @@ -104,5 +118,4 @@ def trust_update(self, win): self.trust = self.trust / sum(self.trust) self.votes_per_agent = np.zeros(len(self.agents)) self.total_actions = 0 - # print(self.trust) \ No newline at end of file diff --git a/ReinforcementLearningLib/qlearning_lib.py b/ReinforcementLearningLib/qlearning_lib.py index 5986a4bd..ef27a757 100644 --- a/ReinforcementLearningLib/qlearning_lib.py +++ b/ReinforcementLearningLib/qlearning_lib.py @@ -4,7 +4,17 @@ class QLAgent: def __init__(self, shape, alpha=0.8, gamma=0.95, policy=None, epsilon=1, - epsilon_lower_bound=0.01, epsilon_decay_function=lambda e: e * 0.6): + epsilon_lower_bound=0.01, epsilon_decay_function=lambda e: e * 0.6): + """ + Args: + shape: a tuple that describes the state space tensor shape + alpha: learning rate hyperparameter + gamma: discount factor hyper parameter + policy: numpy tensor test policy + epsilon: annealing function upper bound + epsilon_lower_bound: annealing function lower bound + epsilon_decay_function: lambda annealing function + """ self.alpha = alpha # learning rate self.gamma = gamma # discount factor self.Q = np.zeros(shape) @@ -18,23 +28,36 @@ def __init__(self, shape, alpha=0.8, gamma=0.95, policy=None, epsilon=1, def update_q(self, state, new_state, action, reward): """ - It applies Q-Learning update rule. 
- Parameters: - state -> current state t - new_state -> next state t - reward -> reward - action -> current action + Apply Q-Learning update rule. + + Args: + state: current state t + new_state: next state t + reward: reward + action: current action """ future_action = np.argmax(self.Q[new_state]) # Find the best action to perform at time t+1 self.Q[state][action] = (1 - self.alpha) * self.Q[state][action] + self.alpha * (reward + self.gamma * self.Q[new_state][future_action]) def act(self, state=None, return_prob_dist=False): + """ + Return the action for current state. + + Args: + state: current state + return_prob_dist: boolean for probability distribution used by ensemblers + + Returns: + next_action: next action to perform + prediction: probability distribution + """ if (self.policy is not None): next_action = self.policy[state] else: self.epsilon = self.epsilon_decay_function(self.epsilon) self.epsilon = np.amax([self.epsilon, self.epsilon_lower_bound]) - # self.epsilon = self.get_epsilon_exponential(episode_number) + + # Annealing if np.random.uniform() > self.epsilon: prediction = self.Q[state] next_action = self.next_action(prediction) @@ -46,19 +69,16 @@ def act(self, state=None, return_prob_dist=False): return next_action return next_action, prediction - # def get_epsilon_linear(self, k, n): - # res = (n - k) / n - # return np.amax([res, self.epsilon_lower_bound]) - - # def get_epsilon_exponential(self, n): - # res = 1 / (n + 1) - # return np.amax([res, self.epsilon_lower_bound]) def next_action(self, state): """ - It chooses the best action given the current state. - Paramteres: - state -> array of possible actions in the current state. + Choose the best action given the current state. + + Args: + state: array of possible actions in the current state. + + Returns: + max_indexes[0]: best action for current state """ max_value = np.amax(state) max_indexes = np.arange(len(state))[state == max_value] diff --git a/ReinforcementLearningLib/ring_buffer.py b/ReinforcementLearningLib/ring_buffer.py index 6888fc65..62900c89 100644 --- a/ReinforcementLearningLib/ring_buffer.py +++ b/ReinforcementLearningLib/ring_buffer.py @@ -2,27 +2,48 @@ class RingBuffer: - def __init__(self, max_buffer_size): - self.max_buffer_size = max_buffer_size - self.current_index = 0 - self.buffer = [None] * self.max_buffer_size - self.stored_elements = 0 - - def append(self, item): - self.buffer[self.current_index] = item - self.current_index = (self.current_index + 1) % self.max_buffer_size - if self.stored_elements <= self.max_buffer_size: - self.stored_elements += 1 - - def random_pick(self, n_elem): - picks = [] - for _ in range(n_elem): - rand_index = randint(0, min(self.stored_elements, self.max_buffer_size) - 1) - picks.append(self.buffer[rand_index]) - return picks - - def mean(self): - acc = 0 - for i in range(min(self.stored_elements, 100)): - acc += self.buffer[i] - return acc/self.stored_elements + def __init__(self, max_buffer_size): + self.max_buffer_size = max_buffer_size + self.current_index = 0 + self.buffer = [None] * self.max_buffer_size + self.stored_elements = 0 + + def append(self, item): + """ + Append item to buffer. + + Args: + item: item to append + """ + self.buffer[self.current_index] = item + self.current_index = (self.current_index + 1) % self.max_buffer_size + if self.stored_elements <= self.max_buffer_size: + self.stored_elements += 1 + + def random_pick(self, n_elem): + """ + Pick a random set of elements from buffer. 
+ + Args: + n_elem: number of element to pick + + Returns: + picks: set of random picks + """ + picks = [] + for _ in range(n_elem): + rand_index = randint(0, min(self.stored_elements, self.max_buffer_size) - 1) + picks.append(self.buffer[rand_index]) + return picks + + def mean(self): + """ + Perform the mean of buffer elements. + + Returns: + mean of values stored in the buffer + """ + acc = 0 + for i in range(min(self.stored_elements, 100)): + acc += self.buffer[i] + return acc/self.stored_elements diff --git a/ReinforcementLearningLib/sarsa_lib.py b/ReinforcementLearningLib/sarsa_lib.py index 62bcfb55..987eb583 100644 --- a/ReinforcementLearningLib/sarsa_lib.py +++ b/ReinforcementLearningLib/sarsa_lib.py @@ -5,7 +5,18 @@ class SARSAAgent(QLAgent): def __init__(self, shape, alpha=0.8, gamma=0.95, policy=None, epsilon=1, - epsilon_lower_bound=0.01, epsilon_decay_function=lambda e: e * 0.6, update_rate=100): + epsilon_lower_bound=0.01, epsilon_decay_function=lambda e: e * 0.6, update_rate=100): + """ + Args: + shape: a tuple that describes the state space tensor shape + alpha: learning rate hyperparameter + gamma: discount factor hyper parameter + policy: numpy tensor test policy + epsilon: annealing function upper bound + epsilon_lower_bound: annealing function lower bound + epsilon_decay_function: lambda annealing function + update_rate: number of steps between policy generation operations + """ super().__init__(shape, alpha, gamma, policy, epsilon, epsilon_lower_bound, epsilon_decay_function) self.current_policy = None @@ -17,6 +28,9 @@ def __init__(self, shape, alpha=0.8, gamma=0.95, policy=None, epsilon=1, self.total_episodes = 0 def extract_policy(self): + """ + Generate the current policy. + """ if (self.total_episodes % self.update_rate) == 0: policy_shape = self.shape policy_shape = policy_shape[:-1] @@ -28,25 +42,36 @@ def extract_policy(self): def update_q(self, state, new_state, action, reward): """ - It applies Q-Learning update rule. - Parameters: - Q -> Q matrix - state -> current state t - new_state -> next state t - reward -> reward - action -> current action - next_action -> next action + Apply Q-Learning update rule. + + Args: + state: current state t + new_state: next state t + reward: reward + action: current action """ next_action = self.current_policy[new_state] self.Q[state][action] = (1 - self.alpha) * self.Q[state][action] + self.alpha * (reward + self.gamma * self.Q[new_state][next_action]) def act(self, state, return_prob_dist=False): # TODO: controllare episode_number + """ + Return the action for current state. + + Args: + state: current state + return_prob_dist: boolean for probability distribution used by ensemblers + + Returns: + next_action: next action to perform + self.Q_target[state]: probability distribution + """ if (self.policy is not None): next_action = self.policy[state] else: self.epsilon = self.epsilon_decay_function(self.epsilon) self.epsilon = np.amax([self.epsilon, self.epsilon_lower_bound]) - # self.epsilon = self.get_epsilon_exponential(episode_number) + + # Annealing if np.random.uniform() > self.epsilon: next_action = self.current_policy[state] else: diff --git a/Taxi/ql_taxi.py b/Taxi/ql_taxi.py index 1c6f5bb7..295c3471 100644 --- a/Taxi/ql_taxi.py +++ b/Taxi/ql_taxi.py @@ -9,11 +9,33 @@ def accuracy(results): """ Evaluate the accuracy of results, considering victories and defeats. 
+ + Args: + results: List of 2 elements representing the number of victories and defeats + + Returns: + results accuracy """ return results[1] / (results[0] + results[1]) * 100 def experiment(n_episodes, default_policy=False, policy=None, render=False): + """ + Run a RL experiment that can be either training or testing + + Args: + n_episodes: number of train/test episodes + default_policy: boolean to enable testing/training phase + policy: numpy tensor with a trained policy + render: enable OpenAI environment graphical rendering + + Returns: + Dictionary with: + cumulative experiments outcomes + list of steps per episode + list of cumulative rewards + trained policy + """ res = [0, 0] # array of results accumulator: {[0]: Loss, [1]: Victory} scores = [] # Cumulative rewards steps = [] # Steps per episode diff --git a/Taxi/sarsa_taxi.py b/Taxi/sarsa_taxi.py index b2d89377..aac77986 100644 --- a/Taxi/sarsa_taxi.py +++ b/Taxi/sarsa_taxi.py @@ -9,11 +9,33 @@ def accuracy(results): """ Evaluate the accuracy of results, considering victories and defeats. + + Args: + results: List of 2 elements representing the number of victories and defeats + + Returns: + results accuracy """ return results[1] / (results[0] + results[1]) * 100 def experiment(n_episodes, default_policy=False, policy=None, render=False): + """ + Run a RL experiment that can be either training or testing + + Args: + n_episodes: number of train/test episodes + default_policy: boolean to enable testing/training phase + policy: numpy tensor with a trained policy + render: enable OpenAI environment graphical rendering + + Returns: + Dictionary with: + cumulative experiments outcomes + list of steps per episode + list of cumulative rewards + trained policy + """ res = [0, 0] # array of results accumulator: {[0]: Loss, [1]: Victory} scores = [] # Cumulative rewards steps = [] # Steps per episode
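For reference, a minimal usage sketch of the tabular agent documented in this patch (QLAgent from ReinforcementLearningLib/qlearning_lib.py), wired into the same act/update_q loop that the FrozenLake, MountainCar and Taxi scripts follow. The environment id, hyperparameters, episode count and the win-counting rule below are illustrative assumptions, not values taken from the repository scripts.

    import gym
    from qlearning_lib import QLAgent

    # Assumed setup: FrozenLake with a (n_states, n_actions) Q-table and a slow epsilon decay
    env = gym.make("FrozenLake-v0")
    agent = QLAgent((env.observation_space.n, env.action_space.n),
                    alpha=0.8, gamma=0.95,
                    epsilon_decay_function=lambda e: e * 0.995)

    res = [0, 0]  # array of results accumulator: {[0]: Loss, [1]: Victory}
    for _ in range(1000):
        state = env.reset()
        end = False
        while not end:
            next_action = agent.act(state)                     # epsilon-greedy choice
            new_state, reward, end, _ = env.step(next_action)  # old gym step signature
            agent.update_q(state, new_state, next_action, reward)
            state = new_state
        res[int(reward > 0)] += 1  # assumption: positive terminal reward counts as a victory

    print("accuracy: %.2f%%" % (res[1] / (res[0] + res[1]) * 100))

The DQN-based scripts (Breakout, CartPole, MountainCar, Pong) follow the same pattern, calling agent.act / agent.memoise / agent.learn inside the step loop as described by the dqn_lib.py docstrings above.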