Commit: Fixed sarsa, update networks
hjorvardr committed Nov 28, 2018
1 parent e5be340 commit 63e4efd

Showing 20 changed files with 69 additions and 59 deletions.
28 changes: 15 additions & 13 deletions Ensembling/ensembling_mountaincar_mixed.py
@@ -48,20 +48,20 @@ def experiment(n_episodes, default_policy=False, policy=None, render=False):
layer1 = Dense(15, input_dim=input_dim, activation='relu')
layer2 = Dense(output_dim)

agent1 = DQNAgent(output_dim, [layer1, layer2], use_ddqn=True, learn_thresh=1000, update_rate=300, epsilon_decay_function=lambda e: e * 0.995, epsilon_lower_bound=0.01, optimizer=keras.optimizers.RMSprop(0.001), tb_dir=None)
agent2 = QLAgent([n_states, n_states, env.action_space.n], epsilon_decay_function=lambda e: e * 0.6, epsilon_lower_bound=0.1)
agent3 = SARSAAgent([n_states, n_states, env.action_space.n], epsilon_decay_function=lambda e: e * 0.6, epsilon_lower_bound=0.1)
agent1 = DQNAgent(output_dim, [layer1, layer2], use_ddqn=True, learn_thresh=1000, update_rate=300, epsilon_decay_function=lambda e: e - 0.001, epsilon_lower_bound=0.01, optimizer=keras.optimizers.RMSprop(0.001), tb_dir=None)
#agent2 = QLAgent([n_states, n_states, env.action_space.n], epsilon_decay_function=lambda e: e - 0.001, epsilon_lower_bound=0.01)
#agent3 = SARSAAgent([n_states, n_states, env.action_space.n], epsilon_decay_function=lambda e: e - 0.001, epsilon_lower_bound=0.01)
agent4 = DQNAgent(output_dim, [layer1, layer2], use_ddqn=False, learn_thresh=1000, update_rate=300, epsilon_decay_function=lambda e: e - 0.001, epsilon_lower_bound=0.01, optimizer=keras.optimizers.RMSprop(0.001), tb_dir=None)


agents = [agent1, agent2, agent3]
agentE = EnsemblerAgent(env.action_space.n, agents, EnsemblerType.MAJOR_VOTING_BASED)
agents = [agent1, agent4]
agentE = EnsemblerAgent(env.action_space.n, agents, EnsemblerType.TRUST_BASED)

evaluate = False

for i_episode in tqdm(range(n_episodes + 1), desc="Episode"):
state = env.reset()
if (i_episode % 100) == 0:
agent3.extract_policy()
# agent3.extract_policy()
discretized_state = obs_to_state(env, state, n_states)
cumulative_reward = 0

@@ -88,13 +88,13 @@ def experiment(n_episodes, default_policy=False, policy=None, render=False):
r2 = reward + np.sin(3 * original_state[0])
r3 = reward + (original_state[1] * original_state[1])
r4 = abs(new_state[0] - (-0.5)) # r in [0, 1]
reward = r4

new_state = np.reshape(new_state, [1, 2])

agent1.memoise((state, next_action, reward, new_state, end))
agent2.update_q((discretized_state[0], discretized_state[1]), (new_discretized_state[0], new_discretized_state[1]), next_action, reward)
agent3.update_q((discretized_state[0], discretized_state[1]), (new_discretized_state[0], new_discretized_state[1]), next_action, reward)
agent1.memoise((state, next_action, r4, new_state, end))
#agent2.update_q((discretized_state[0], discretized_state[1]), (new_discretized_state[0], new_discretized_state[1]), next_action, reward)
#agent3.update_q((discretized_state[0], discretized_state[1]), (new_discretized_state[0], new_discretized_state[1]), next_action, reward)
agent4.memoise((state, next_action, r4, new_state, end))


if end:
@@ -112,6 +112,7 @@ def experiment(n_episodes, default_policy=False, policy=None, render=False):
cumulative_reward += reward

agent1.learn()
agent4.learn()

cumulative_reward += reward
scores.append(cumulative_reward)
@@ -136,6 +137,7 @@ def experiment(n_episodes, default_policy=False, policy=None, render=False):
new_state, reward, end, _ = env.step(next_action)
new_discretized_state = obs_to_state(env, new_state, n_states)
original_state = new_state
new_state = np.reshape(new_state, [1, 2])


if end:
@@ -164,11 +166,11 @@ def experiment(n_episodes, default_policy=False, policy=None, render=False):


# Training
train_res = experiment(505)
train_res = experiment(200)
training_mean_steps = train_res["steps"].mean()
training_mean_score = train_res["scores"].mean()

np.savetxt("results/ens_mixed_major.csv", train_res["steps"], delimiter=',')
np.savetxt("results/ens_mixed_trust_cont.csv", train_res["steps"], delimiter=',')

print("Training episodes:", len(train_res["steps"]), "Training mean score:", training_mean_score, \
"Training mean steps", training_mean_steps)
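Note: in the hunk above, the shaped reward r4 = abs(new_state[0] - (-0.5)) — the car's distance from the valley bottom at x = -0.5 — is now what both DQN agents receive through memoise(), while the raw environment reward still feeds the cumulative score. For MountainCar-v0 positions in [-1.2, 0.6] this quantity actually lies in [0, 1.1], so the "# r in [0, 1]" comment is approximate. A minimal sketch of the shaping on its own, assuming only the classic gym API and the standard MountainCar-v0 observation layout (position, velocity); the ensemble and DQN classes from this repo are not redefined here:

import gym

# Sketch of the positional reward shaping used above (assumption: classic gym API,
# MountainCar-v0 observation = [position, velocity], valley bottom at x = -0.5).
env = gym.make('MountainCar-v0')
state = env.reset()
for _ in range(200):
    action = env.action_space.sample()            # stand-in for agentE.act(...)
    new_state, reward, end, _ = env.step(action)
    shaped = abs(new_state[0] - (-0.5))           # distance from the valley bottom
    # In this commit the DQN agents are trained on `shaped` (r4), while the raw
    # `reward` is still what accumulates into the episode score.
    state = new_state
    if end:
        break
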
Binary file modified FrozenLake/SavedNetworks/sarsa_4x4_policy.npy
Binary file not shown.
Binary file modified FrozenLake/SavedNetworks/sarsa_8x8_policy.npy
Binary file not shown.
Binary file modified FrozenLake/SavedNetworks/sarsa_8x8d_policy.npy
Binary file not shown.
4 changes: 2 additions & 2 deletions FrozenLake/ql_4x4.py
@@ -26,7 +26,7 @@ def experiment(n_episodes, default_policy=False, policy=None, render=False):
agent = QLAgent([env.observation_space.n, env.action_space.n],
epsilon_decay_function=lambda e: e - 0.000036)

for i_episode in tqdm(range(n_episodes)):
for _ in tqdm(range(n_episodes)):
state = env.reset()
cumulative_reward = 0

@@ -35,7 +35,7 @@ def experiment(n_episodes, default_policy=False, policy=None, render=False):
env.render()
time.sleep(1)

next_action = agent.act(state, i_episode)
next_action = agent.act(state)
new_state, reward, end, _ = env.step(next_action)
if policy is None:
agent.update_q(state, new_state, next_action, reward)
4 changes: 2 additions & 2 deletions FrozenLake/ql_4x4_deterministic.py
@@ -26,7 +26,7 @@ def experiment(n_episodes, default_policy=False, policy=None, render=False):
agent = QLAgent([env.observation_space.n, env.action_space.n], alpha=1,
epsilon_decay_function=lambda e: e * 0.995)

for i_episode in tqdm(range(n_episodes)):
for _ in tqdm(range(n_episodes)):
state = env.reset()
cumulative_reward = 0

@@ -35,7 +35,7 @@ def experiment(n_episodes, default_policy=False, policy=None, render=False):
env.render()
time.sleep(1)

next_action = agent.act(state, i_episode)
next_action = agent.act(state)
new_state, reward, end, _ = env.step(next_action)
if policy is None:
agent.update_q(state, new_state, next_action, reward)
4 changes: 2 additions & 2 deletions FrozenLake/ql_8x8.py
@@ -26,7 +26,7 @@ def experiment(n_episodes, default_policy=False, policy=None, render=False):
agent = QLAgent([env.observation_space.n, env.action_space.n],
epsilon_decay_function=lambda e: e - 0.000012)

for i_episode in tqdm(range(n_episodes)):
for _ in tqdm(range(n_episodes)):
state = env.reset()
cumulative_reward = 0

@@ -35,7 +35,7 @@ def experiment(n_episodes, default_policy=False, policy=None, render=False):
env.render()
time.sleep(1)

next_action = agent.act(state, i_episode)
next_action = agent.act(state)
new_state, reward, end, _ = env.step(next_action)
if policy is None:
agent.update_q(state, new_state, next_action, reward)
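Note: the FrozenLake Q-learning scripts keep a linear epsilon decay (e - 0.000036 on the 4x4 maps, e - 0.000012 on 8x8) and now call agent.act(state) without the episode index. Under a linear schedule, the number of decay applications needed to reach the exploration floor follows directly from the decrement; a small sketch, assuming a starting epsilon of 1.0 and the 0.01 lower bound that appears as the library default in the SARSAAgent signature later in this commit (whether decay is applied per step or per episode depends on the agent implementation):

import math

# How many applications of a linear decay epsilon -> epsilon - d are needed
# to fall from `start` to `floor` (assumed values: start=1.0, floor=0.01).
def decays_to_floor(d, start=1.0, floor=0.01):
    return math.ceil((start - floor) / d)

print(decays_to_floor(0.000036))  # ~27,500 applications for the 4x4 maps
print(decays_to_floor(0.000012))  # ~82,500 applications for the 8x8 map
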
4 changes: 2 additions & 2 deletions FrozenLake/ql_8x8_deterministic.py
@@ -26,7 +26,7 @@ def experiment(n_episodes, default_policy=False, policy=None, render=False):
agent = QLAgent([env.observation_space.n, env.action_space.n], alpha=1,
epsilon_decay_function=lambda e: e - 0.000036)

for i_episode in tqdm(range(n_episodes)):
for _ in tqdm(range(n_episodes)):
state = env.reset()
cumulative_reward = 0

@@ -35,7 +35,7 @@ def experiment(n_episodes, default_policy=False, policy=None, render=False):
env.render()
time.sleep(1)

next_action = agent.act(state, i_episode)
next_action = agent.act(state)
new_state, reward, end, _ = env.step(next_action)
if policy is None:
agent.update_q(state, new_state, next_action, reward)
6 changes: 3 additions & 3 deletions FrozenLake/sarsa_4x4.py
@@ -23,9 +23,9 @@ def experiment(n_episodes, default_policy=False, policy=None, render=False):
if (default_policy):
agent = SARSAAgent([env.observation_space.n, env.action_space.n], policy=policy)
else:
agent = SARSAAgent([env.observation_space.n, env.action_space.n])
agent = SARSAAgent([env.observation_space.n, env.action_space.n], update_rate=15, epsilon_decay_function=lambda e: e * 0.995)

for i_episode in tqdm(range(n_episodes)):
for _ in tqdm(range(n_episodes)):
state = env.reset()
cumulative_reward = 0
if not default_policy:
@@ -36,7 +36,7 @@ def experiment(n_episodes, default_policy=False, policy=None, render=False):
env.render()
time.sleep(1)

next_action = agent.act(state, i_episode)
next_action = agent.act(state)
new_state, reward, end, _ = env.step(next_action)
if policy is None:
agent.update_q(state, new_state, next_action, reward)
6 changes: 3 additions & 3 deletions FrozenLake/sarsa_4x4_deterministic.py
@@ -24,9 +24,9 @@ def experiment(n_episodes, default_policy=False, policy=None, render=False):
agent = SARSAAgent([env.observation_space.n, env.action_space.n], policy=policy, alpha=1)
else:
agent = SARSAAgent([env.observation_space.n, env.action_space.n], alpha=1,
epsilon_decay_function=lambda e: e * 0.995)
epsilon_decay_function=lambda e: e * 0.995, update_rate=1)

for i_episode in tqdm(range(n_episodes)):
for _ in tqdm(range(n_episodes)):
state = env.reset()
cumulative_reward = 0
if not default_policy:
@@ -37,7 +37,7 @@ def experiment(n_episodes, default_policy=False, policy=None, render=False):
env.render()
time.sleep(1)

next_action = agent.act(state, i_episode)
next_action = agent.act(state)
new_state, reward, end, _ = env.step(next_action)
if policy is None:
agent.update_q(state, new_state, next_action, reward)
6 changes: 3 additions & 3 deletions FrozenLake/sarsa_8x8.py
@@ -24,9 +24,9 @@ def experiment(n_episodes, default_policy=False, policy=None, render=False):
agent = SARSAAgent([env.observation_space.n, env.action_space.n], policy=policy)
else:
agent = SARSAAgent([env.observation_space.n, env.action_space.n],
epsilon_decay_function=lambda e: e - 0.000016)
epsilon_decay_function=lambda e: e - 0.000016, update_rate=2)

for i_episode in tqdm(range(n_episodes)):
for _ in tqdm(range(n_episodes)):
state = env.reset()
cumulative_reward = 0
if not default_policy:
@@ -37,7 +37,7 @@ def experiment(n_episodes, default_policy=False, policy=None, render=False):
env.render()
time.sleep(1)

next_action = agent.act(state, i_episode)
next_action = agent.act(state)
new_state, reward, end, _ = env.step(next_action)
if policy is None:
agent.update_q(state, new_state, next_action, reward)
6 changes: 3 additions & 3 deletions FrozenLake/sarsa_8x8_deterministic.py
@@ -24,9 +24,9 @@ def experiment(n_episodes, default_policy=False, policy=None, render=False):
agent = SARSAAgent([env.observation_space.n, env.action_space.n], policy=policy, alpha=1)
else:
agent = SARSAAgent([env.observation_space.n, env.action_space.n], alpha=1,
epsilon_decay_function=lambda e: e - 0.000016)
epsilon_decay_function=lambda e: e - 0.000016, update_rate=10)

for i_episode in tqdm(range(n_episodes)):
for _ in tqdm(range(n_episodes)):
state = env.reset()
cumulative_reward = 0
if not default_policy:
@@ -37,7 +37,7 @@ def experiment(n_episodes, default_policy=False, policy=None, render=False):
env.render()
time.sleep(1)

next_action = agent.act(state, i_episode)
next_action = agent.act(state)
new_state, reward, end, _ = env.step(next_action)
if policy is None:
agent.update_q(state, new_state, next_action, reward)
Binary file modified MountainCar/SavedNetworks/ql_policy.npy
Binary file not shown.
16 changes: 8 additions & 8 deletions MountainCar/dqn_mountain_car.py
@@ -11,8 +11,8 @@
from dqn_lib import DQNAgent

os.environ['PYTHONHASHSEED'] = '0'
np.random.seed(43)
tf.set_random_seed(43)
np.random.seed(17)
tf.set_random_seed(17)

def accuracy(results):
"""
@@ -27,7 +27,7 @@ def experiment(n_episodes, default_policy=False, policy=None, render=False, agen
steps = [] # Steps per episode

env = gym.make('MountainCar-v0')
env.seed(43)
env.seed(17)

input_dim = env.observation_space.shape[0]
output_dim = env.action_space.n
@@ -48,8 +48,8 @@ def experiment(n_episodes, default_policy=False, policy=None, render=False, agen
cumulative_reward = 0

if i_episode > 0 and (i_episode % 100) == 0 and not default_policy:
agent.save_model("43-tmp_model")
evaluation_result = experiment(500, default_policy=True, policy="43-tmp_model")
agent.save_model("tmp_model")
evaluation_result = experiment(500, default_policy=True, policy="tmp_model")
acc = accuracy(evaluation_result["results"])
if acc == 100:
break
Expand All @@ -66,7 +66,7 @@ def experiment(n_episodes, default_policy=False, policy=None, render=False, agen
next_action = agent.act(state)
new_state, reward, end, _ = env.step(next_action)

# reward = abs(new_state[0] - (-0.5)) # r in [0, 1]
reward = abs(new_state[0] - (-0.5)) # r in [0, 1]
new_state = np.reshape(new_state, [1, 2])

agent.memoise((state, next_action, reward, new_state, end))
@@ -133,7 +133,7 @@ def experiment(n_episodes, default_policy=False, policy=None, render=False, agen
# experiments.append(("model20", 10000, DQNAgent(output_dim, layers, use_ddqn=True, learn_thresh=1000, update_rate=300, epsilon_decay_function=lambda e: e * 0.995, epsilon_lower_bound=0.1, optimizer=keras.optimizers.RMSprop(0.001), tb_dir=None)))
# experiments.append(("model21", 10000, DQNAgent(output_dim, layers, use_ddqn=True, learn_thresh=2000, update_rate=300, epsilon_decay_function=lambda e: e * 0.995, epsilon_lower_bound=0.1, optimizer=keras.optimizers.RMSprop(0.001), tb_dir=None)))
# experiments.append(("model22", 10000, DQNAgent(output_dim, layers, use_ddqn=True, learn_thresh=3000, update_rate=300, epsilon_decay_function=lambda e: e * 0.995, epsilon_lower_bound=0.1, optimizer=keras.optimizers.RMSprop(0.001), tb_dir=None)))
experiments.append(("43-model23", 25000, DQNAgent(output_dim, layers, use_ddqn=True, learn_thresh=1000, update_rate=300, epsilon_decay_function=lambda e: e * 0.995, epsilon_lower_bound=0.01, optimizer=keras.optimizers.Adam(0.001), tb_dir=None)))
experiments.append(("model23", 25000, DQNAgent(output_dim, layers, use_ddqn=True, learn_thresh=1000, update_rate=300, epsilon_decay_function=lambda e: e * 0.995, epsilon_lower_bound=0.01, optimizer=keras.optimizers.Adam(0.001), tb_dir=None)))
# experiments.append(("model24", 10000, DQNAgent(output_dim, layers, use_ddqn=True, learn_thresh=2000, update_rate=300, epsilon_decay_function=lambda e: e * 0.995, epsilon_lower_bound=0.1, optimizer=keras.optimizers.Adam(0.001), tb_dir=None)))
# experiments.append(("model25", 10000, DQNAgent(output_dim, layers, use_ddqn=True, learn_thresh=3000, update_rate=300, epsilon_decay_function=lambda e: e * 0.995, epsilon_lower_bound=0.1, optimizer=keras.optimizers.Adam(0.001), tb_dir=None)))
# experiments.append(("model26", 10000, DQNAgent(output_dim, layers, use_ddqn=True, learn_thresh=4000, update_rate=300, epsilon_decay_function=lambda e: e * 0.995, epsilon_lower_bound=0.1, optimizer=keras.optimizers.Adam(0.001), tb_dir=None)))
@@ -196,7 +196,7 @@ def train_and_test(experiments)

df.loc[len(df)] = [model_name, len(train_res["steps"]), training_mean_score, training_mean_steps, testing_accuracy, testing_mean_score, testing_mean_steps]

df.to_csv('43-experiments.csv')
df.to_csv('experiments.csv')

def main():
train_and_test(experiments)
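Note: besides switching the seed from 43 to 17, re-enabling the positional reward shaping, and dropping the "43-" prefix from the saved model and CSV names, this script keeps its periodic self-evaluation: every 100 training episodes the current network is saved and replayed greedily for 500 episodes, and training stops early once that evaluation reaches 100% accuracy. A condensed sketch of that pattern, not runnable on its own since it leans on this file's experiment() and accuracy() helpers and an already constructed DQN agent:

# Checkpoint / evaluate / early-stop pattern from the training loop above
# (assumes this file's experiment(), accuracy() and a DQN `agent` are in scope).
for i_episode in range(n_episodes):
    if i_episode > 0 and i_episode % 100 == 0:
        agent.save_model("tmp_model")                    # snapshot current weights
        eval_res = experiment(500, default_policy=True,  # greedy replay, no learning
                              policy="tmp_model")
        if accuracy(eval_res["results"]) == 100:         # environment solved: stop early
            break
    # ... one regular training episode runs here ...
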
4 changes: 2 additions & 2 deletions MountainCar/ql_mountain_car.py
@@ -36,7 +36,7 @@ def experiment(n_episodes, default_policy=False, policy=None, render=False):
else:
agent = QLAgent([n_states, n_states, env.action_space.n], epsilon_decay_function=lambda e: e * 0.6, epsilon_lower_bound=0.1)

for i_episode in tqdm(range(n_episodes), desc="Episode"):
for _ in tqdm(range(n_episodes), desc="Episode"):
state = env.reset()
state = obs_to_state(env, state, n_states)
cumulative_reward = 0
@@ -45,7 +45,7 @@ def experiment(n_episodes, default_policy=False, policy=None, render=False):
if (render):
env.render()

next_action = agent.act((state[0], state[1]), i_episode)
next_action = agent.act((state[0], state[1]))
new_state, reward, end, _ = env.step(next_action)
new_state = obs_to_state(env, new_state, n_states)
if policy is None:
4 changes: 2 additions & 2 deletions MountainCar/sarsa_mountain_car.py
@@ -36,7 +36,7 @@ def experiment(n_episodes, default_policy=False, policy=None, render=False):
else:
agent = SARSAAgent([n_states, n_states, env.action_space.n], epsilon_decay_function=lambda e: e * 0.6, epsilon_lower_bound=0.1)

for i_episode in tqdm(range(n_episodes), desc="Episode"):
for _ in tqdm(range(n_episodes), desc="Episode"):
state = env.reset()
state = obs_to_state(env, state, n_states)
cumulative_reward = 0
@@ -47,7 +47,7 @@ def experiment(n_episodes, default_policy=False, policy=None, render=False):
if (render):
env.render()

next_action = agent.act((state[0], state[1]), i_episode)
next_action = agent.act((state[0], state[1]))
new_state, reward, end, _ = env.step(next_action)
new_state = obs_to_state(env, new_state, n_states)
if policy is None:
24 changes: 16 additions & 8 deletions ReinforcementLearningLib/sarsa_lib.py
@@ -5,20 +5,26 @@
class SARSAAgent(QLAgent):

def __init__(self, shape, alpha=0.8, gamma=0.95, policy=None, epsilon=1,
epsilon_lower_bound=0.01, epsilon_decay_function=lambda e: e * 0.6):
epsilon_lower_bound=0.01, epsilon_decay_function=lambda e: e * 0.6, update_rate=100):
super().__init__(shape, alpha, gamma, policy, epsilon, epsilon_lower_bound,
epsilon_decay_function)
self.current_policy = None
if policy is not None:
self.current_policy = policy
self.shape = shape
self.update_rate = update_rate
self.Q_target = None
self.total_episodes = 0

def extract_policy(self):
policy_shape = self.shape
policy_shape = policy_shape[:-1]
self.current_policy = np.zeros(policy_shape, dtype=int)
for idx, _ in np.ndenumerate(self.current_policy):
self.current_policy[idx] = self.next_action(self.Q[idx])
if (self.total_episodes % self.update_rate) == 0:
policy_shape = self.shape
policy_shape = policy_shape[:-1]
self.current_policy = np.zeros(policy_shape, dtype=int)
for idx, _ in np.ndenumerate(self.current_policy):
self.current_policy[idx] = self.next_action(self.Q[idx])
self.Q_target = self.Q
self.total_episodes += 1

def update_q(self, state, new_state, action, reward):
"""
@@ -34,7 +40,7 @@ def update_q(self, state, new_state, action, reward):
next_action = self.current_policy[new_state]
self.Q[state][action] = (1 - self.alpha) * self.Q[state][action] + self.alpha * (reward + self.gamma * self.Q[new_state][next_action])

def act(self, state, episode_number=None): # TODO: check episode_number
def act(self, state, return_prob_dist=False): # TODO: check episode_number
if (self.policy is not None):
next_action = self.policy[state]
else:
@@ -46,4 +52,6 @@ def act(self, state, episode_number=None): # TODO: check episode_number
else:
next_action = np.argmax(np.random.uniform(0, 1, size=self.actions))

return next_action
if not return_prob_dist:
return next_action
return next_action, self.Q_target[state]
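
Note: the sarsa_lib.py change is the core of "Fixed sarsa". extract_policy() now rebuilds the greedy target policy and snapshots Q into Q_target only every update_rate-th call, and act() takes return_prob_dist instead of an episode index, returning (action, Q_target[state]) when the caller — e.g. the trust-based ensembler — needs the target values for that state (despite the parameter name, what comes back is the Q_target row, not a normalised distribution). A minimal usage sketch, assuming a discrete FrozenLake-style environment, the classic gym API, that the module is importable as sarsa_lib, and that extract_policy() is called once per episode before acting so current_policy and Q_target are initialised:

import gym
from sarsa_lib import SARSAAgent  # the class patched above; import path assumed

env = gym.make('FrozenLake-v0')
agent = SARSAAgent([env.observation_space.n, env.action_space.n],
                   epsilon_decay_function=lambda e: e - 0.000036,
                   update_rate=15)   # target policy refreshed every 15th extract_policy() call

for _ in range(100):
    agent.extract_policy()           # gated internally by update_rate; also refreshes Q_target
    state = env.reset()
    end = False
    while not end:
        action = agent.act(state)    # the episode index is no longer passed
        new_state, reward, end, _ = env.step(action)
        agent.update_q(state, new_state, action, reward)
        state = new_state

# For the trust-based ensembler, act() can also expose the target Q-values:
action, q_row = agent.act(state, return_prob_dist=True)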
Binary file modified Taxi/SavedNetworks/sarsa_policy.npy
Binary file not shown.
(Remaining changed files not loaded.)