diff --git a/Ensembling/ensembling_mountaincar_mixed.py b/Ensembling/ensembling_mountaincar_mixed.py
index 7eab2ce8..a90e7f4b 100644
--- a/Ensembling/ensembling_mountaincar_mixed.py
+++ b/Ensembling/ensembling_mountaincar_mixed.py
@@ -48,20 +48,20 @@ def experiment(n_episodes, default_policy=False, policy=None, render=False):
     layer1 = Dense(15, input_dim=input_dim, activation='relu')
     layer2 = Dense(output_dim)
 
-    agent1 = DQNAgent(output_dim, [layer1, layer2], use_ddqn=True, learn_thresh=1000, update_rate=300, epsilon_decay_function=lambda e: e * 0.995, epsilon_lower_bound=0.01, optimizer=keras.optimizers.RMSprop(0.001), tb_dir=None)
-    agent2 = QLAgent([n_states, n_states, env.action_space.n], epsilon_decay_function=lambda e: e * 0.6, epsilon_lower_bound=0.1)
-    agent3 = SARSAAgent([n_states, n_states, env.action_space.n], epsilon_decay_function=lambda e: e * 0.6, epsilon_lower_bound=0.1)
+    agent1 = DQNAgent(output_dim, [layer1, layer2], use_ddqn=True, learn_thresh=1000, update_rate=300, epsilon_decay_function=lambda e: e - 0.001, epsilon_lower_bound=0.01, optimizer=keras.optimizers.RMSprop(0.001), tb_dir=None)
+    #agent2 = QLAgent([n_states, n_states, env.action_space.n], epsilon_decay_function=lambda e: e - 0.001, epsilon_lower_bound=0.01)
+    #agent3 = SARSAAgent([n_states, n_states, env.action_space.n], epsilon_decay_function=lambda e: e - 0.001, epsilon_lower_bound=0.01)
+    agent4 = DQNAgent(output_dim, [layer1, layer2], use_ddqn=False, learn_thresh=1000, update_rate=300, epsilon_decay_function=lambda e: e - 0.001, epsilon_lower_bound=0.01, optimizer=keras.optimizers.RMSprop(0.001), tb_dir=None)
 
-    agents = [agent1, agent2, agent3]
-    agentE = EnsemblerAgent(env.action_space.n, agents, EnsemblerType.MAJOR_VOTING_BASED)
+    agents = [agent1, agent4]
+    agentE = EnsemblerAgent(env.action_space.n, agents, EnsemblerType.TRUST_BASED)
 
     evaluate = False
 
     for i_episode in tqdm(range(n_episodes + 1), desc="Episode"):
         state = env.reset()
 
-        if (i_episode % 100) == 0:
-            agent3.extract_policy()
+        # agent3.extract_policy()
 
         discretized_state = obs_to_state(env, state, n_states)
         cumulative_reward = 0
 
@@ -88,13 +88,13 @@ def experiment(n_episodes, default_policy=False, policy=None, render=False):
                 r2 = reward + np.sin(3 * original_state[0])
                 r3 = reward + (original_state[1] * original_state[1])
                 r4 = abs(new_state[0] - (-0.5)) # r in [0, 1]
-                reward = r4
 
                 new_state = np.reshape(new_state, [1, 2])
-                agent1.memoise((state, next_action, reward, new_state, end))
-                agent2.update_q((discretized_state[0], discretized_state[1]), (new_discretized_state[0], new_discretized_state[1]), next_action, reward)
-                agent3.update_q((discretized_state[0], discretized_state[1]), (new_discretized_state[0], new_discretized_state[1]), next_action, reward)
+                agent1.memoise((state, next_action, r4, new_state, end))
+                #agent2.update_q((discretized_state[0], discretized_state[1]), (new_discretized_state[0], new_discretized_state[1]), next_action, reward)
+                #agent3.update_q((discretized_state[0], discretized_state[1]), (new_discretized_state[0], new_discretized_state[1]), next_action, reward)
+                agent4.memoise((state, next_action, r4, new_state, end))
 
                 if end:
@@ -112,6 +112,7 @@ def experiment(n_episodes, default_policy=False, policy=None, render=False):
                     cumulative_reward += reward
 
                 agent1.learn()
+                agent4.learn()
 
                 cumulative_reward += reward
 
         scores.append(cumulative_reward)
@@ -136,6 +137,7 @@ def experiment(n_episodes, default_policy=False, policy=None, render=False):
             new_state, reward, end, _ = env.step(next_action)
             new_discretized_state = obs_to_state(env, new_state, n_states)
             original_state = new_state
+            new_state = np.reshape(new_state, [1, 2])
 
             if end:
@@ -164,11 +166,11 @@ def experiment(n_episodes, default_policy=False, policy=None, render=False):
 
 # Training
-train_res = experiment(505)
+train_res = experiment(200)
 training_mean_steps = train_res["steps"].mean()
 training_mean_score = train_res["scores"].mean()
 
-np.savetxt("results/ens_mixed_major.csv", train_res["steps"], delimiter=',')
+np.savetxt("results/ens_mixed_trust_cont.csv", train_res["steps"], delimiter=',')
 
 print("Training episodes:", len(train_res["steps"]), "Training mean score:", training_mean_score, \
 "Training mean steps", training_mean_steps)
diff --git a/FrozenLake/SavedNetworks/sarsa_4x4_policy.npy b/FrozenLake/SavedNetworks/sarsa_4x4_policy.npy
index fc6f1972..a4f8cefc 100644
Binary files a/FrozenLake/SavedNetworks/sarsa_4x4_policy.npy and b/FrozenLake/SavedNetworks/sarsa_4x4_policy.npy differ
diff --git a/FrozenLake/SavedNetworks/sarsa_8x8_policy.npy b/FrozenLake/SavedNetworks/sarsa_8x8_policy.npy
index a944cfe6..4be6acf4 100644
Binary files a/FrozenLake/SavedNetworks/sarsa_8x8_policy.npy and b/FrozenLake/SavedNetworks/sarsa_8x8_policy.npy differ
diff --git a/FrozenLake/SavedNetworks/sarsa_8x8d_policy.npy b/FrozenLake/SavedNetworks/sarsa_8x8d_policy.npy
index 4738ebd8..a53d03bd 100644
Binary files a/FrozenLake/SavedNetworks/sarsa_8x8d_policy.npy and b/FrozenLake/SavedNetworks/sarsa_8x8d_policy.npy differ
diff --git a/FrozenLake/ql_4x4.py b/FrozenLake/ql_4x4.py
index a0c437cb..04425c18 100644
--- a/FrozenLake/ql_4x4.py
+++ b/FrozenLake/ql_4x4.py
@@ -26,7 +26,7 @@ def experiment(n_episodes, default_policy=False, policy=None, render=False):
         agent = QLAgent([env.observation_space.n, env.action_space.n], epsilon_decay_function=lambda e: e - 0.000036)
 
-    for i_episode in tqdm(range(n_episodes)):
+    for _ in tqdm(range(n_episodes)):
         state = env.reset()
         cumulative_reward = 0
 
@@ -35,7 +35,7 @@ def experiment(n_episodes, default_policy=False, policy=None, render=False):
                 env.render()
                 time.sleep(1)
 
-            next_action = agent.act(state, i_episode)
+            next_action = agent.act(state)
             new_state, reward, end, _ = env.step(next_action)
             if policy is None:
                 agent.update_q(state, new_state, next_action, reward)
diff --git a/FrozenLake/ql_4x4_deterministic.py b/FrozenLake/ql_4x4_deterministic.py
index b3e55720..610ae3b0 100644
--- a/FrozenLake/ql_4x4_deterministic.py
+++ b/FrozenLake/ql_4x4_deterministic.py
@@ -26,7 +26,7 @@ def experiment(n_episodes, default_policy=False, policy=None, render=False):
         agent = QLAgent([env.observation_space.n, env.action_space.n], alpha=1, epsilon_decay_function=lambda e: e * 0.995)
 
-    for i_episode in tqdm(range(n_episodes)):
+    for _ in tqdm(range(n_episodes)):
         state = env.reset()
         cumulative_reward = 0
 
@@ -35,7 +35,7 @@ def experiment(n_episodes, default_policy=False, policy=None, render=False):
                 env.render()
                 time.sleep(1)
 
-            next_action = agent.act(state, i_episode)
+            next_action = agent.act(state)
             new_state, reward, end, _ = env.step(next_action)
             if policy is None:
                 agent.update_q(state, new_state, next_action, reward)
diff --git a/FrozenLake/ql_8x8.py b/FrozenLake/ql_8x8.py
index b0fcf199..b09c1373 100644
--- a/FrozenLake/ql_8x8.py
+++ b/FrozenLake/ql_8x8.py
@@ -26,7 +26,7 @@ def experiment(n_episodes, default_policy=False, policy=None, render=False):
         agent = QLAgent([env.observation_space.n, env.action_space.n], epsilon_decay_function=lambda e: e - 0.000012)
 
-    for i_episode in tqdm(range(n_episodes)):
+    for _ in tqdm(range(n_episodes)):
         state = env.reset()
         cumulative_reward = 0
 
@@ -35,7 +35,7 @@ def experiment(n_episodes, default_policy=False, policy=None, render=False):
                 env.render()
                 time.sleep(1)
 
-            next_action = agent.act(state, i_episode)
+            next_action = agent.act(state)
             new_state, reward, end, _ = env.step(next_action)
             if policy is None:
                 agent.update_q(state, new_state, next_action, reward)
diff --git a/FrozenLake/ql_8x8_deterministic.py b/FrozenLake/ql_8x8_deterministic.py
index b7e247a8..7ebc91ef 100644
--- a/FrozenLake/ql_8x8_deterministic.py
+++ b/FrozenLake/ql_8x8_deterministic.py
@@ -26,7 +26,7 @@ def experiment(n_episodes, default_policy=False, policy=None, render=False):
         agent = QLAgent([env.observation_space.n, env.action_space.n], alpha=1, epsilon_decay_function=lambda e: e - 0.000036)
 
-    for i_episode in tqdm(range(n_episodes)):
+    for _ in tqdm(range(n_episodes)):
         state = env.reset()
         cumulative_reward = 0
 
@@ -35,7 +35,7 @@ def experiment(n_episodes, default_policy=False, policy=None, render=False):
                 env.render()
                 time.sleep(1)
 
-            next_action = agent.act(state, i_episode)
+            next_action = agent.act(state)
             new_state, reward, end, _ = env.step(next_action)
             if policy is None:
                 agent.update_q(state, new_state, next_action, reward)
diff --git a/FrozenLake/sarsa_4x4.py b/FrozenLake/sarsa_4x4.py
index c799df67..18c6d372 100644
--- a/FrozenLake/sarsa_4x4.py
+++ b/FrozenLake/sarsa_4x4.py
@@ -23,9 +23,9 @@ def experiment(n_episodes, default_policy=False, policy=None, render=False):
     if (default_policy):
         agent = SARSAAgent([env.observation_space.n, env.action_space.n], policy=policy)
     else:
-        agent = SARSAAgent([env.observation_space.n, env.action_space.n])
+        agent = SARSAAgent([env.observation_space.n, env.action_space.n], update_rate=15, epsilon_decay_function=lambda e: e * 0.995)
 
-    for i_episode in tqdm(range(n_episodes)):
+    for _ in tqdm(range(n_episodes)):
         state = env.reset()
         cumulative_reward = 0
         if not default_policy:
@@ -36,7 +36,7 @@ def experiment(n_episodes, default_policy=False, policy=None, render=False):
                 env.render()
                 time.sleep(1)
 
-            next_action = agent.act(state, i_episode)
+            next_action = agent.act(state)
             new_state, reward, end, _ = env.step(next_action)
             if policy is None:
                 agent.update_q(state, new_state, next_action, reward)
diff --git a/FrozenLake/sarsa_4x4_deterministic.py b/FrozenLake/sarsa_4x4_deterministic.py
index 4a66afb6..22d1801d 100644
--- a/FrozenLake/sarsa_4x4_deterministic.py
+++ b/FrozenLake/sarsa_4x4_deterministic.py
@@ -24,9 +24,9 @@ def experiment(n_episodes, default_policy=False, policy=None, render=False):
         agent = SARSAAgent([env.observation_space.n, env.action_space.n], policy=policy, alpha=1)
     else:
         agent = SARSAAgent([env.observation_space.n, env.action_space.n], alpha=1,
-                           epsilon_decay_function=lambda e: e * 0.995)
+                           epsilon_decay_function=lambda e: e * 0.995, update_rate=1)
 
-    for i_episode in tqdm(range(n_episodes)):
+    for _ in tqdm(range(n_episodes)):
         state = env.reset()
         cumulative_reward = 0
         if not default_policy:
@@ -37,7 +37,7 @@ def experiment(n_episodes, default_policy=False, policy=None, render=False):
                 env.render()
                 time.sleep(1)
 
-            next_action = agent.act(state, i_episode)
+            next_action = agent.act(state)
             new_state, reward, end, _ = env.step(next_action)
             if policy is None:
                 agent.update_q(state, new_state, next_action, reward)
diff --git a/FrozenLake/sarsa_8x8.py b/FrozenLake/sarsa_8x8.py
index c12e7a26..25eef2f3 100644
--- a/FrozenLake/sarsa_8x8.py
+++ b/FrozenLake/sarsa_8x8.py
@@ -24,9 +24,9 @@ def experiment(n_episodes, default_policy=False, policy=None, render=False):
         agent = SARSAAgent([env.observation_space.n, env.action_space.n], policy=policy)
     else:
         agent = SARSAAgent([env.observation_space.n, env.action_space.n],
-                           epsilon_decay_function=lambda e: e - 0.000016)
+                           epsilon_decay_function=lambda e: e - 0.000016, update_rate=2)
 
-    for i_episode in tqdm(range(n_episodes)):
+    for _ in tqdm(range(n_episodes)):
         state = env.reset()
         cumulative_reward = 0
         if not default_policy:
@@ -37,7 +37,7 @@ def experiment(n_episodes, default_policy=False, policy=None, render=False):
                 env.render()
                 time.sleep(1)
 
-            next_action = agent.act(state, i_episode)
+            next_action = agent.act(state)
             new_state, reward, end, _ = env.step(next_action)
             if policy is None:
                 agent.update_q(state, new_state, next_action, reward)
diff --git a/FrozenLake/sarsa_8x8_deterministic.py b/FrozenLake/sarsa_8x8_deterministic.py
index 07de352f..14469f36 100644
--- a/FrozenLake/sarsa_8x8_deterministic.py
+++ b/FrozenLake/sarsa_8x8_deterministic.py
@@ -24,9 +24,9 @@ def experiment(n_episodes, default_policy=False, policy=None, render=False):
         agent = SARSAAgent([env.observation_space.n, env.action_space.n], policy=policy, alpha=1)
     else:
         agent = SARSAAgent([env.observation_space.n, env.action_space.n], alpha=1,
-                           epsilon_decay_function=lambda e: e - 0.000016)
+                           epsilon_decay_function=lambda e: e - 0.000016, update_rate=10)
 
-    for i_episode in tqdm(range(n_episodes)):
+    for _ in tqdm(range(n_episodes)):
         state = env.reset()
         cumulative_reward = 0
         if not default_policy:
@@ -37,7 +37,7 @@ def experiment(n_episodes, default_policy=False, policy=None, render=False):
                 env.render()
                 time.sleep(1)
 
-            next_action = agent.act(state, i_episode)
+            next_action = agent.act(state)
             new_state, reward, end, _ = env.step(next_action)
             if policy is None:
                 agent.update_q(state, new_state, next_action, reward)
diff --git a/MountainCar/SavedNetworks/ql_policy.npy b/MountainCar/SavedNetworks/ql_policy.npy
index ee487cc7..a0e02f65 100644
Binary files a/MountainCar/SavedNetworks/ql_policy.npy and b/MountainCar/SavedNetworks/ql_policy.npy differ
diff --git a/MountainCar/dqn_mountain_car.py b/MountainCar/dqn_mountain_car.py
index d92af496..8af50629 100644
--- a/MountainCar/dqn_mountain_car.py
+++ b/MountainCar/dqn_mountain_car.py
@@ -11,8 +11,8 @@ from dqn_lib import DQNAgent
 
 os.environ['PYTHONHASHSEED'] = '0'
-np.random.seed(43)
-tf.set_random_seed(43)
+np.random.seed(17)
+tf.set_random_seed(17)
 
 def accuracy(results):
     """
@@ -27,7 +27,7 @@ def experiment(n_episodes, default_policy=False, policy=None, render=False, agen
     steps = [] # Steps per episode
 
     env = gym.make('MountainCar-v0')
-    env.seed(43)
+    env.seed(17)
 
     input_dim = env.observation_space.shape[0]
     output_dim = env.action_space.n
@@ -48,8 +48,8 @@ def experiment(n_episodes, default_policy=False, policy=None, render=False, agen
         cumulative_reward = 0
 
         if i_episode > 0 and (i_episode % 100) == 0 and not default_policy:
-            agent.save_model("43-tmp_model")
-            evaluation_result = experiment(500, default_policy=True, policy="43-tmp_model")
+            agent.save_model("tmp_model")
+            evaluation_result = experiment(500, default_policy=True, policy="tmp_model")
             acc = accuracy(evaluation_result["results"])
             if acc == 100:
                 break
@@ -66,7 +66,7 @@ def experiment(n_episodes, default_policy=False, policy=None, render=False, agen
             next_action = agent.act(state)
             new_state, reward, end, _ = env.step(next_action)
-            # reward = abs(new_state[0] - (-0.5)) # r in [0, 1]
+            reward = abs(new_state[0] - (-0.5)) # r in [0, 1]
 
             new_state = np.reshape(new_state, [1, 2])
             agent.memoise((state, next_action, reward, new_state, end))
@@ -133,7 +133,7 @@ def experiment(n_episodes, default_policy=False, policy=None, render=False, agen
 # experiments.append(("model20", 10000, DQNAgent(output_dim, layers, use_ddqn=True, learn_thresh=1000, update_rate=300, epsilon_decay_function=lambda e: e * 0.995, epsilon_lower_bound=0.1, optimizer=keras.optimizers.RMSprop(0.001), tb_dir=None)))
 # experiments.append(("model21", 10000, DQNAgent(output_dim, layers, use_ddqn=True, learn_thresh=2000, update_rate=300, epsilon_decay_function=lambda e: e * 0.995, epsilon_lower_bound=0.1, optimizer=keras.optimizers.RMSprop(0.001), tb_dir=None)))
 # experiments.append(("model22", 10000, DQNAgent(output_dim, layers, use_ddqn=True, learn_thresh=3000, update_rate=300, epsilon_decay_function=lambda e: e * 0.995, epsilon_lower_bound=0.1, optimizer=keras.optimizers.RMSprop(0.001), tb_dir=None)))
-experiments.append(("43-model23", 25000, DQNAgent(output_dim, layers, use_ddqn=True, learn_thresh=1000, update_rate=300, epsilon_decay_function=lambda e: e * 0.995, epsilon_lower_bound=0.01, optimizer=keras.optimizers.Adam(0.001), tb_dir=None)))
+experiments.append(("model23", 25000, DQNAgent(output_dim, layers, use_ddqn=True, learn_thresh=1000, update_rate=300, epsilon_decay_function=lambda e: e * 0.995, epsilon_lower_bound=0.01, optimizer=keras.optimizers.Adam(0.001), tb_dir=None)))
 # experiments.append(("model24", 10000, DQNAgent(output_dim, layers, use_ddqn=True, learn_thresh=2000, update_rate=300, epsilon_decay_function=lambda e: e * 0.995, epsilon_lower_bound=0.1, optimizer=keras.optimizers.Adam(0.001), tb_dir=None)))
 # experiments.append(("model25", 10000, DQNAgent(output_dim, layers, use_ddqn=True, learn_thresh=3000, update_rate=300, epsilon_decay_function=lambda e: e * 0.995, epsilon_lower_bound=0.1, optimizer=keras.optimizers.Adam(0.001), tb_dir=None)))
 # experiments.append(("model26", 10000, DQNAgent(output_dim, layers, use_ddqn=True, learn_thresh=4000, update_rate=300, epsilon_decay_function=lambda e: e * 0.995, epsilon_lower_bound=0.1, optimizer=keras.optimizers.Adam(0.001), tb_dir=None)))
@@ -196,7 +196,7 @@ def train_and_test(experiments):
         df.loc[len(df)] = [model_name, len(train_res["steps"]), training_mean_score, training_mean_steps,
                            testing_accuracy, testing_mean_score, testing_mean_steps]
 
-    df.to_csv('43-experiments.csv')
+    df.to_csv('experiments.csv')
 
 def main():
     train_and_test(experiments)
diff --git a/MountainCar/ql_mountain_car.py b/MountainCar/ql_mountain_car.py
index 2f959d4b..bb909888 100644
--- a/MountainCar/ql_mountain_car.py
+++ b/MountainCar/ql_mountain_car.py
@@ -36,7 +36,7 @@ def experiment(n_episodes, default_policy=False, policy=None, render=False):
     else:
         agent = QLAgent([n_states, n_states, env.action_space.n], epsilon_decay_function=lambda e: e * 0.6, epsilon_lower_bound=0.1)
 
-    for i_episode in tqdm(range(n_episodes), desc="Episode"):
+    for _ in tqdm(range(n_episodes), desc="Episode"):
         state = env.reset()
         state = obs_to_state(env, state, n_states)
         cumulative_reward = 0
@@ -45,7 +45,7 @@ def experiment(n_episodes, default_policy=False, policy=None, render=False):
             if (render):
                 env.render()
 
-            next_action = agent.act((state[0], state[1]), i_episode)
+            next_action = agent.act((state[0], state[1]))
             new_state, reward, end, _ = env.step(next_action)
             new_state = obs_to_state(env, new_state, n_states)
             if policy is None:
diff --git a/MountainCar/sarsa_mountain_car.py b/MountainCar/sarsa_mountain_car.py
index d7dc06d1..8099a427 100644
--- a/MountainCar/sarsa_mountain_car.py
+++ b/MountainCar/sarsa_mountain_car.py
@@ -36,7 +36,7 @@ def experiment(n_episodes, default_policy=False, policy=None, render=False):
     else:
         agent = SARSAAgent([n_states, n_states, env.action_space.n], epsilon_decay_function=lambda e: e * 0.6, epsilon_lower_bound=0.1)
 
-    for i_episode in tqdm(range(n_episodes), desc="Episode"):
+    for _ in tqdm(range(n_episodes), desc="Episode"):
         state = env.reset()
         state = obs_to_state(env, state, n_states)
         cumulative_reward = 0
@@ -47,7 +47,7 @@ def experiment(n_episodes, default_policy=False, policy=None, render=False):
            if (render):
                env.render()
 
-            next_action = agent.act((state[0], state[1]), i_episode)
+            next_action = agent.act((state[0], state[1]))
             new_state, reward, end, _ = env.step(next_action)
             new_state = obs_to_state(env, new_state, n_states)
             if policy is None:
diff --git a/ReinforcementLearningLib/sarsa_lib.py b/ReinforcementLearningLib/sarsa_lib.py
index 4ce1dbe9..126bc4a9 100644
--- a/ReinforcementLearningLib/sarsa_lib.py
+++ b/ReinforcementLearningLib/sarsa_lib.py
@@ -5,20 +5,26 @@ class SARSAAgent(QLAgent):
 
     def __init__(self, shape, alpha=0.8, gamma=0.95, policy=None, epsilon=1,
-                 epsilon_lower_bound=0.01, epsilon_decay_function=lambda e: e * 0.6):
+                 epsilon_lower_bound=0.01, epsilon_decay_function=lambda e: e * 0.6, update_rate=100):
         super().__init__(shape, alpha, gamma, policy, epsilon, epsilon_lower_bound, epsilon_decay_function)
         self.current_policy = None
         if policy is not None:
             self.current_policy = policy
         self.shape = shape
+        self.update_rate = update_rate
+        self.Q_target = None
+        self.total_episodes = 0
 
     def extract_policy(self):
-        policy_shape = self.shape
-        policy_shape = policy_shape[:-1]
-        self.current_policy = np.zeros(policy_shape, dtype=int)
-        for idx, _ in np.ndenumerate(self.current_policy):
-            self.current_policy[idx] = self.next_action(self.Q[idx])
+        if (self.total_episodes % self.update_rate) == 0:
+            policy_shape = self.shape
+            policy_shape = policy_shape[:-1]
+            self.current_policy = np.zeros(policy_shape, dtype=int)
+            for idx, _ in np.ndenumerate(self.current_policy):
+                self.current_policy[idx] = self.next_action(self.Q[idx])
+            self.Q_target = self.Q
+        self.total_episodes += 1
 
     def update_q(self, state, new_state, action, reward):
         """
@@ -34,7 +40,7 @@ def update_q(self, state, new_state, action, reward):
             next_action = self.current_policy[new_state]
         self.Q[state][action] = (1 - self.alpha) * self.Q[state][action] + self.alpha * (reward + self.gamma * self.Q[new_state][next_action])
 
-    def act(self, state, episode_number=None): # TODO: check episode_number
+    def act(self, state, return_prob_dist=False): # TODO: check episode_number
         if (self.policy is not None):
             next_action = self.policy[state]
         else:
@@ -46,4 +52,6 @@ def act(self, state, episode_number=None): # TODO: check episode_number
             else:
                 next_action = np.argmax(np.random.uniform(0, 1, size=self.actions))
 
-        return next_action
\ No newline at end of file
+        if not return_prob_dist:
+            return next_action
+        return next_action, self.Q_target[state]
\ No newline at end of file
diff --git a/Taxi/SavedNetworks/sarsa_policy.npy b/Taxi/SavedNetworks/sarsa_policy.npy
index 7fc7a6c7..b5c2331b 100644
Binary files a/Taxi/SavedNetworks/sarsa_policy.npy and b/Taxi/SavedNetworks/sarsa_policy.npy differ
diff --git a/Taxi/ql_taxi.py b/Taxi/ql_taxi.py
index 634b8a17..0cf22d46 100644
--- a/Taxi/ql_taxi.py
+++ b/Taxi/ql_taxi.py
@@ -25,7 +25,7 @@ def experiment(n_episodes, default_policy=False, policy=None, render=False):
     else:
         agent = QLAgent([env.observation_space.n, env.action_space.n])
 
-    for i_episode in tqdm(range(n_episodes)):
+    for _ in tqdm(range(n_episodes)):
         state = env.reset()
         cumulative_reward = 0
 
@@ -34,7 +34,7 @@ def experiment(n_episodes, default_policy=False, policy=None, render=False):
                 env.render()
                 time.sleep(1)
 
-            next_action = agent.act(state, i_episode)
+            next_action = agent.act(state)
             new_state, reward, end, _ = env.step(next_action)
             if policy is None:
                 agent.update_q(state, new_state, next_action, reward)
diff --git a/Taxi/sarsa_taxi.py b/Taxi/sarsa_taxi.py
index cec7296d..4f8e65df 100644
--- a/Taxi/sarsa_taxi.py
+++ b/Taxi/sarsa_taxi.py
@@ -23,9 +23,9 @@ def experiment(n_episodes, default_policy=False, policy=None, render=False):
     if (default_policy):
         agent = SARSAAgent([env.observation_space.n, env.action_space.n], policy=policy)
     else:
-        agent = SARSAAgent([env.observation_space.n, env.action_space.n], epsilon_decay_function=lambda e: e - 0.000016)
+        agent = SARSAAgent([env.observation_space.n, env.action_space.n], epsilon_decay_function=lambda e: e - 0.000016, update_rate=10)
 
-    for i_episode in tqdm(range(n_episodes)):
+    for _ in tqdm(range(n_episodes)):
         state = env.reset()
         cumulative_reward = 0
         if not default_policy:
@@ -36,7 +36,7 @@ def experiment(n_episodes, default_policy=False, policy=None, render=False):
                 env.render()
                 time.sleep(1)
 
-            next_action = agent.act(state, i_episode)
+            next_action = agent.act(state)
             new_state, reward, end, _ = env.step(next_action)
             if policy is None:
                 agent.update_q(state, new_state, next_action, reward)
@@ -61,7 +61,7 @@ def experiment(n_episodes, default_policy=False, policy=None, render=False):
 
 # Training
-train_res = experiment(30000)
+train_res = experiment(10000)
 learnt_policy = np.argmax(train_res["Q"], axis=1)
 # print("Policy learnt: ", learnt_policy)
 training_mean_steps = train_res["steps"].mean()