added new epsilon decay method to ppo
Jaden505 committed Jul 29, 2023
1 parent c7853b0 commit 162997d
Showing 5 changed files with 31 additions and 35 deletions.
11 changes: 4 additions & 7 deletions cube/helper_cube.py
@@ -97,20 +97,17 @@ def step(self, action):
         return next_state, reward, done
 
     def reward_action(self, state, next_state):
-        color_count_reward = 4 * (self.reward_color_count(next_state) - self.reward_color_count(state))
-        solved_face_reward = self.reward_face_solved(state, next_state)
+        color_count_reward = self.reward_color_count(next_state) - self.reward_color_count(state)
+        solved_face_reward = self.reward_face_solved(state, next_state) / 6
         move_penalty = -0.05
 
         if self.check_solved():
             print('Solved cube!')
-            solved_cube_reward = 3
+            solved_cube_reward = 1
         else:
             solved_cube_reward = 0
 
-        total_reward = color_count_reward + solved_face_reward + solved_cube_reward + move_penalty
-        total_reward = np.clip(total_reward, -1, 1)
-
-        return total_reward
+        return color_count_reward + solved_face_reward + solved_cube_reward + move_penalty
 
 
 if __name__ == "__main__":
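
For reference, a minimal standalone sketch of the reshaped reward (the function name and stub inputs below are hypothetical; in the repo this logic lives in the cube helper's reward_action): the 4x colour-count scaling, the solved bonus of 3, and the final np.clip to [-1, 1] are dropped in favour of components that already stay small.

```python
# Hypothetical stub: the real color-count and solved-face scores come from the cube helper.
def reward_action_sketch(color_count_state, color_count_next, faces_solved_delta, solved):
    color_count_reward = color_count_next - color_count_state  # no longer scaled by 4
    solved_face_reward = faces_solved_delta / 6                 # normalised by the 6 faces
    move_penalty = -0.05                                        # constant cost per move
    solved_cube_reward = 1 if solved else 0                     # down from 3
    # the clip to [-1, 1] is gone; the components are kept small instead
    return color_count_reward + solved_face_reward + solved_cube_reward + move_penalty

print(round(reward_action_sketch(0.4, 0.5, 1, False), 3))  # 0.217
```
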
5 changes: 1 addition & 4 deletions dqn/dqn_agent.py
@@ -1,9 +1,6 @@
 import copy
-
 import replay_buffer as rb
-
 import numpy as np
-
 from keras.models import clone_model
 from keras.models import Model
 from keras.layers import *
@@ -28,7 +25,7 @@ def __init__(self):
 
         # Temperature for Boltzmann exploration: higher temperature means more exploration
         self.temp = 1.0
-        self.temp_decay = 0.995
+        self.temp_decay = 0.992
         self.temp_min = 0.01
 
         self.rotation_dict = {0: "U", 1: "U'", 2: "D", 3: "D'", 4: "L", 5: "L'",
19 changes: 8 additions & 11 deletions dqn/main.py
@@ -2,7 +2,6 @@
 from dqn.replay_buffer import ReplayBuffer
 from dqn.dqn_agent import DqnAgent
 
-from keras.models import load_model, clone_model
 import copy
 
 
@@ -12,10 +11,7 @@ def __init__(self):
         self.buffer = ReplayBuffer()
         self.agent = DqnAgent()
 
-        self.model_save_path = '../models/dqn/model3_negative_rewards.h5'
-        # model = load_model(self.model_save_path)
-        # self.agent.model = model
-        # self.agent.target_model = clone_model(model)
+        self.model_save_path = '../models/dqn/model.h5'
 
         self.STEPS = 2000
         self.BATCH_SIZE = 96
@@ -24,7 +20,6 @@ def __init__(self):
 
         self.solved_count = 0
         self.scramble_length = 2
-        self.moves_since_scramble = 0
 
     def train_model(self):
         for step in range(self.STEPS):
@@ -52,12 +47,10 @@ def get_train_data(self):
         state = copy.deepcopy(self.cube.get_cube_state())
 
         for i in range(self.BATCH_SIZE):
-            self.moves_since_scramble += 1
-
             ohe_state = self.agent.one_hot_encode(state)
             action, q_state = self.agent.boltzmann_exploration(ohe_state, self.agent.model)
-            next_state, reward, done = self.cube.step(self.agent.rotation_dict[action],
-                                                      self.moves_since_scramble, self.scramble_length)
+            next_state, reward, done = self.cube.step(self.agent.rotation_dict[action])
+
             _, q_next_state = self.agent.boltzmann_exploration(self.agent.one_hot_encode(next_state),
                                                                self.agent.target_model)
             td_error = abs(reward + (0.99 * max(q_next_state)) - q_state[action])
@@ -70,14 +63,18 @@ def get_train_data(self):
                 if self.solved_count >= 10:
                     self.scramble_length += 1
                     self.solved_count = 0
+                    self.agent.temp = 1
+                    if self.scramble_length > 5:
+                        self.agent.temp_decay = 1 - (self.scramble_length / 100)
+                    else:
+                        self.agent.temp_decay = 0.992
                 else:
                     self.solved_count += 1
 
             if i % self.scramble_length == 0 or done:
                 self.cube.reset()
                 self.cube.scramble(self.scramble_length)
                 state = copy.deepcopy(self.cube.get_cube_state())
-                self.moves_since_scramble = 0
 
 
 if __name__ == "__main__":
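
For illustration, a minimal sketch of the scramble-length-dependent decay rule added above (the helper name adaptive_decay is made up; in the diff the branch sits inline in get_train_data):

```python
def adaptive_decay(scramble_length):
    # Mirrors the branch added above: beyond 5 scramble moves the per-step
    # decay factor shrinks (6 -> 0.94, 10 -> 0.90), so the Boltzmann
    # temperature anneals faster on harder scrambles; at 5 or fewer moves
    # the fixed 0.992 factor is kept. The temperature itself is reset to 1
    # whenever the scramble length is increased.
    if scramble_length > 5:
        return 1 - (scramble_length / 100)
    return 0.992

for length in (2, 5, 6, 10):
    print(length, adaptive_decay(length))
# 2 0.992
# 5 0.992
# 6 0.94
# 10 0.9
```
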
18 changes: 9 additions & 9 deletions ppo/agent.py
@@ -6,7 +6,8 @@
 
 
 class PPOAgent:
-    def __init__(self, state_dim, action_dim, epsilon=1.0, epsilon_decay=0.995, epsilon_min=0.01, gamma=0.99, clip_ratio=0.15, learning_rate=3e-3):
+    def __init__(self, state_dim, action_dim, epsilon=1.0, epsilon_decay=0.992, epsilon_min=0.01,
+                 gamma=0.99, clip_ratio=0.15, learning_rate=3e-4):
         self.state_dim = state_dim
         self.action_dim = action_dim
         self.gamma = gamma
@@ -27,12 +28,12 @@ def build_actor(self):
         x = Dense(256, kernel_initializer=GlorotUniform())(x)
         x = BatchNormalization()(x)
         x = Activation('elu')(x)
-        x = Dropout(0.3)(x)
+        x = Dropout(0.1)(x)
 
         x = Dense(128, kernel_initializer=GlorotUniform())(x)
         x = BatchNormalization()(x)
         x = Activation('elu')(x)
-        x = Dropout(0.3)(x)
+        x = Dropout(0.1)(x)
 
         x = Dense(128, kernel_initializer=GlorotUniform())(x)
         x = BatchNormalization()(x)
@@ -48,12 +49,12 @@ def build_critic(self):
         x = Dense(256, kernel_initializer=GlorotUniform())(x)
         x = BatchNormalization()(x)
         x = Activation('elu')(x)
-        x = Dropout(0.2)(x)
+        x = Dropout(0.1)(x)
 
         x = Dense(128, kernel_initializer=GlorotUniform())(x)
         x = BatchNormalization()(x)
         x = Activation('elu')(x)
-        x = Dropout(0.2)(x)
+        x = Dropout(0.1)(x)
 
         x = Dense(128, kernel_initializer=GlorotUniform())(x)
         x = BatchNormalization()(x)
@@ -69,10 +70,6 @@ def get_action(self, state):
         probabilities = self.actor.predict(state, verbose=0)
         action = np.argmax(probabilities[0])
 
-        # decay epsilon
-        if self.epsilon > self.epsilon_min and self.epsilon > self.epsilon_min:
-            self.epsilon *= self.epsilon_decay
-
         return action
 
     def train(self, states, actions, rewards, next_states, dones):
@@ -99,6 +96,9 @@ def train(self, states, actions, rewards, next_states, dones):
         self.actor_optimizer.apply_gradients(zip(grads_actor, self.actor.trainable_variables))
         self.critic_optimizer.apply_gradients(zip(grads_critic, self.critic.trainable_variables))
 
+        if self.epsilon > self.epsilon_min:
+            self.epsilon *= self.epsilon_decay
+
     def get_gae(self, rewards, states, next_states, dones):
         """Calculate the Generalized Advantage Estimation (GAE)"""
         values = self.critic(states).numpy().squeeze()
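
Note the behavioural shift here: epsilon now decays once per train call instead of once per get_action call, and the duplicated self.epsilon > self.epsilon_min test is gone. A tiny sketch of the effect, with illustrative values rather than the repo's API:

```python
epsilon, epsilon_decay, epsilon_min = 1.0, 0.992, 0.01  # illustrative values

def train_step():
    """Stand-in for PPOAgent.train: after the gradient updates, decay epsilon once."""
    global epsilon
    # ... actor/critic gradient updates would happen here ...
    if epsilon > epsilon_min:
        epsilon *= epsilon_decay

for _ in range(3):
    train_step()
print(round(epsilon, 6))  # 0.976191 after three training batches
```
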
13 changes: 9 additions & 4 deletions ppo/main.py
@@ -14,12 +14,12 @@ def __init__(self):
         self.BATCH_SIZE = 64
 
         self.model_save_path = "../models/ppo/model.h5"
-        self.agent.actor = load_model(self.model_save_path + "_actor")
-        self.agent.critic = load_model(self.model_save_path + "_critic")
-        self.agent.epsilon = 0.5
+        # self.agent.actor = load_model(self.model_save_path + "_actor")
+        # self.agent.critic = load_model(self.model_save_path + "_critic")
+        # self.agent.epsilon = 0.5
 
         self.solved_count = 0
-        self.scramble_length = 7
+        self.scramble_length = 2
 
         self.rotation_dict = {0: "U", 1: "U'", 2: "D", 3: "D'", 4: "L", 5: "L'",
                               6: "R", 7: "R'", 8: "F", 9: "F'", 10: "B", 11: "B'"}
@@ -68,6 +68,11 @@ def get_train_data(self):
                     self.scramble_length += 1
                     self.solved_count = 0
                     self.agent.epsilon = 1.0
+
+                    if self.scramble_length > 5:
+                        self.agent.epsilon_decay = 1 - (self.scramble_length / 100)
+                    else:
+                        self.agent.epsilon_decay = 0.992
                 else:
                     self.solved_count += 1
 
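
Taken together with the reset to epsilon = 1.0, this gives a simple curriculum: every ten consecutive solves the scramble gets one move longer, exploration restarts, and the decay rate is recomputed. A rough standalone sketch of that loop (the solved flag and loop shape are illustrative, not the repo's API):

```python
solved_count, scramble_length = 0, 2
epsilon, epsilon_decay = 1.0, 0.992

for episode in range(60):
    solved = True  # pretend every episode ends in a solve
    if solved:
        if solved_count >= 10:
            scramble_length += 1      # promote to a harder scramble
            solved_count = 0
            epsilon = 1.0             # restart exploration at the new difficulty
            epsilon_decay = (1 - (scramble_length / 100)
                             if scramble_length > 5 else 0.992)
        else:
            solved_count += 1

print(scramble_length)  # 7 after 60 all-solved episodes
```
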
