added new epsilon decay method to ppo
Jaden505 committed Jul 29, 2023
1 parent c7853b0 commit 162997d
Showing 5 changed files with 31 additions and 35 deletions.
11 changes: 4 additions & 7 deletions cube/helper_cube.py
@@ -97,20 +97,17 @@ def step(self, action):
         return next_state, reward, done
 
     def reward_action(self, state, next_state):
-        color_count_reward = 4 * (self.reward_color_count(next_state) - self.reward_color_count(state))
-        solved_face_reward = self.reward_face_solved(state, next_state)
+        color_count_reward = self.reward_color_count(next_state) - self.reward_color_count(state)
+        solved_face_reward = self.reward_face_solved(state, next_state) / 6
         move_penalty = -0.05
 
         if self.check_solved():
             print('Solved cube!')
-            solved_cube_reward = 3
+            solved_cube_reward = 1
         else:
             solved_cube_reward = 0
 
-        total_reward = color_count_reward + solved_face_reward + solved_cube_reward + move_penalty
-        total_reward = np.clip(total_reward, -1, 1)
-
-        return total_reward
+        return color_count_reward + solved_face_reward + solved_cube_reward + move_penalty
 
 
 if __name__ == "__main__":
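
For reference, a minimal standalone sketch of the reshaped reward (the function name and stub inputs below are hypothetical; in the repo this logic lives in the cube helper's reward_action): the 4x colour-count scaling, the solved bonus of 3, and the final np.clip to [-1, 1] are dropped in favour of components that already stay small.

```python
# Hypothetical stub: the real color-count and solved-face scores come from the cube helper.
def reward_action_sketch(color_count_state, color_count_next, faces_solved_delta, solved):
    color_count_reward = color_count_next - color_count_state  # no longer scaled by 4
    solved_face_reward = faces_solved_delta / 6                 # normalised by the 6 faces
    move_penalty = -0.05                                        # constant cost per move
    solved_cube_reward = 1 if solved else 0                     # down from 3
    # the clip to [-1, 1] is gone; the components are kept small instead
    return color_count_reward + solved_face_reward + solved_cube_reward + move_penalty

print(round(reward_action_sketch(0.4, 0.5, 1, False), 3))  # 0.217
```
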
5 changes: 1 addition & 4 deletions dqn/dqn_agent.py
@@ -1,9 +1,6 @@
 import copy
-
 import replay_buffer as rb
-
 import numpy as np
-
 from keras.models import clone_model
 from keras.models import Model
 from keras.layers import *
@@ -28,7 +25,7 @@ def __init__(self):
 
         # Temperature for Boltzmann exploration: higher temperature means more exploration
         self.temp = 1.0
-        self.temp_decay = 0.995
+        self.temp_decay = 0.992
         self.temp_min = 0.01
 
         self.rotation_dict = {0: "U", 1: "U'", 2: "D", 3: "D'", 4: "L", 5: "L'",
19 changes: 8 additions & 11 deletions dqn/main.py
@@ -2,7 +2,6 @@
 from dqn.replay_buffer import ReplayBuffer
 from dqn.dqn_agent import DqnAgent
 
-from keras.models import load_model, clone_model
 import copy
 
 
@@ -12,10 +11,7 @@ def __init__(self):
         self.buffer = ReplayBuffer()
         self.agent = DqnAgent()
 
-        self.model_save_path = '../models/dqn/model3_negative_rewards.h5'
-        # model = load_model(self.model_save_path)
-        # self.agent.model = model
-        # self.agent.target_model = clone_model(model)
+        self.model_save_path = '../models/dqn/model.h5'
 
         self.STEPS = 2000
         self.BATCH_SIZE = 96
@@ -24,7 +20,6 @@ def __init__(self):
 
         self.solved_count = 0
         self.scramble_length = 2
-        self.moves_since_scramble = 0
 
     def train_model(self):
         for step in range(self.STEPS):
@@ -52,12 +47,10 @@ def get_train_data(self):
         state = copy.deepcopy(self.cube.get_cube_state())
 
         for i in range(self.BATCH_SIZE):
-            self.moves_since_scramble += 1
-
             ohe_state = self.agent.one_hot_encode(state)
             action, q_state = self.agent.boltzmann_exploration(ohe_state, self.agent.model)
-            next_state, reward, done = self.cube.step(self.agent.rotation_dict[action],
-                                                      self.moves_since_scramble, self.scramble_length)
+            next_state, reward, done = self.cube.step(self.agent.rotation_dict[action])
+
             _, q_next_state = self.agent.boltzmann_exploration(self.agent.one_hot_encode(next_state),
                                                                self.agent.target_model)
             td_error = abs(reward + (0.99 * max(q_next_state)) - q_state[action])
@@ -70,14 +63,18 @@ def get_train_data(self):
                 if self.solved_count >= 10:
                     self.scramble_length += 1
                     self.solved_count = 0
+                    self.agent.temp = 1
+                    if self.scramble_length > 5:
+                        self.agent.temp_decay = 1 - (self.scramble_length / 100)
+                    else:
+                        self.agent.temp_decay = 0.992
                 else:
                     self.solved_count += 1
 
             if i % self.scramble_length == 0 or done:
                 self.cube.reset()
                 self.cube.scramble(self.scramble_length)
                 state = copy.deepcopy(self.cube.get_cube_state())
-                self.moves_since_scramble = 0
 
 
 if __name__ == "__main__":
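
For illustration, a minimal sketch of the scramble-length-dependent decay rule added above (the helper name adaptive_decay is made up; in the diff the branch sits inline in get_train_data):

```python
def adaptive_decay(scramble_length):
    # Mirrors the branch added above: beyond 5 scramble moves the per-step
    # decay factor shrinks (6 -> 0.94, 10 -> 0.90), so the Boltzmann
    # temperature anneals faster on harder scrambles; at 5 or fewer moves
    # the fixed 0.992 factor is kept. The temperature itself is reset to 1
    # whenever the scramble length is increased.
    if scramble_length > 5:
        return 1 - (scramble_length / 100)
    return 0.992

for length in (2, 5, 6, 10):
    print(length, adaptive_decay(length))
# 2 0.992
# 5 0.992
# 6 0.94
# 10 0.9
```
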
18 changes: 9 additions & 9 deletions ppo/agent.py
@@ -6,7 +6,8 @@
 
 
 class PPOAgent:
-    def __init__(self, state_dim, action_dim, epsilon=1.0, epsilon_decay=0.995, epsilon_min=0.01, gamma=0.99, clip_ratio=0.15, learning_rate=3e-3):
+    def __init__(self, state_dim, action_dim, epsilon=1.0, epsilon_decay=0.992, epsilon_min=0.01,
+                 gamma=0.99, clip_ratio=0.15, learning_rate=3e-4):
         self.state_dim = state_dim
         self.action_dim = action_dim
         self.gamma = gamma
@@ -27,12 +28,12 @@ def build_actor(self):
         x = Dense(256, kernel_initializer=GlorotUniform())(x)
         x = BatchNormalization()(x)
         x = Activation('elu')(x)
-        x = Dropout(0.3)(x)
+        x = Dropout(0.1)(x)
 
         x = Dense(128, kernel_initializer=GlorotUniform())(x)
         x = BatchNormalization()(x)
         x = Activation('elu')(x)
-        x = Dropout(0.3)(x)
+        x = Dropout(0.1)(x)
 
         x = Dense(128, kernel_initializer=GlorotUniform())(x)
         x = BatchNormalization()(x)
@@ -48,12 +49,12 @@ def build_critic(self):
         x = Dense(256, kernel_initializer=GlorotUniform())(x)
         x = BatchNormalization()(x)
         x = Activation('elu')(x)
-        x = Dropout(0.2)(x)
+        x = Dropout(0.1)(x)
 
         x = Dense(128, kernel_initializer=GlorotUniform())(x)
         x = BatchNormalization()(x)
         x = Activation('elu')(x)
-        x = Dropout(0.2)(x)
+        x = Dropout(0.1)(x)
 
         x = Dense(128, kernel_initializer=GlorotUniform())(x)
         x = BatchNormalization()(x)
@@ -69,10 +70,6 @@ def get_action(self, state):
         probabilities = self.actor.predict(state, verbose=0)
         action = np.argmax(probabilities[0])
 
-        # decay epsilon
-        if self.epsilon > self.epsilon_min and self.epsilon > self.epsilon_min:
-            self.epsilon *= self.epsilon_decay
-
         return action
 
     def train(self, states, actions, rewards, next_states, dones):
@@ -99,6 +96,9 @@ def train(self, states, actions, rewards, next_states, dones):
         self.actor_optimizer.apply_gradients(zip(grads_actor, self.actor.trainable_variables))
         self.critic_optimizer.apply_gradients(zip(grads_critic, self.critic.trainable_variables))
 
+        if self.epsilon > self.epsilon_min:
+            self.epsilon *= self.epsilon_decay
+
     def get_gae(self, rewards, states, next_states, dones):
         """Calculate the Generalized Advantage Estimation (GAE)"""
         values = self.critic(states).numpy().squeeze()
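
Note the behavioural shift here: epsilon now decays once per train call instead of once per get_action call, and the duplicated self.epsilon > self.epsilon_min test is gone. A tiny sketch of the effect, with illustrative values rather than the repo's API:

```python
epsilon, epsilon_decay, epsilon_min = 1.0, 0.992, 0.01  # illustrative values

def train_step():
    """Stand-in for PPOAgent.train: after the gradient updates, decay epsilon once."""
    global epsilon
    # ... actor/critic gradient updates would happen here ...
    if epsilon > epsilon_min:
        epsilon *= epsilon_decay

for _ in range(3):
    train_step()
print(round(epsilon, 6))  # 0.976191 after three training batches
```
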
13 changes: 9 additions & 4 deletions ppo/main.py
@@ -14,12 +14,12 @@ def __init__(self):
         self.BATCH_SIZE = 64
 
         self.model_save_path = "../models/ppo/model.h5"
-        self.agent.actor = load_model(self.model_save_path + "_actor")
-        self.agent.critic = load_model(self.model_save_path + "_critic")
-        self.agent.epsilon = 0.5
+        # self.agent.actor = load_model(self.model_save_path + "_actor")
+        # self.agent.critic = load_model(self.model_save_path + "_critic")
+        # self.agent.epsilon = 0.5
 
         self.solved_count = 0
-        self.scramble_length = 7
+        self.scramble_length = 2
 
         self.rotation_dict = {0: "U", 1: "U'", 2: "D", 3: "D'", 4: "L", 5: "L'",
                               6: "R", 7: "R'", 8: "F", 9: "F'", 10: "B", 11: "B'"}
@@ -68,6 +68,11 @@ def get_train_data(self):
                     self.scramble_length += 1
                     self.solved_count = 0
                     self.agent.epsilon = 1.0
+
+                    if self.scramble_length > 5:
+                        self.agent.epsilon_decay = 1 - (self.scramble_length / 100)
+                    else:
+                        self.agent.epsilon_decay = 0.992
                 else:
                     self.solved_count += 1
 
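
Taken together with the reset to epsilon = 1.0, this gives a simple curriculum: every ten consecutive solves the scramble gets one move longer, exploration restarts, and the decay rate is recomputed. A rough standalone sketch of that loop (the solved flag and loop shape are illustrative, not the repo's API):

```python
solved_count, scramble_length = 0, 2
epsilon, epsilon_decay = 1.0, 0.992

for episode in range(60):
    solved = True  # pretend every episode ends in a solve
    if solved:
        if solved_count >= 10:
            scramble_length += 1      # promote to a harder scramble
            solved_count = 0
            epsilon = 1.0             # restart exploration at the new difficulty
            epsilon_decay = (1 - (scramble_length / 100)
                             if scramble_length > 5 else 0.992)
        else:
            solved_count += 1

print(scramble_length)  # 7 after 60 all-solved episodes
```
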
