diff --git a/turtlebot3_rl/turtlebot3_rl/env.py b/turtlebot3_rl/turtlebot3_rl/env.py
index cb53247..a632570 100644
--- a/turtlebot3_rl/turtlebot3_rl/env.py
+++ b/turtlebot3_rl/turtlebot3_rl/env.py
@@ -3,40 +3,47 @@
 import numpy as np
 import rclpy
 from rclpy.node import Node
-from rclpy.executors import MultiThreadedExecutor
+from rclpy.executors import SingleThreadedExecutor
 from geometry_msgs.msg import Twist
 from sensor_msgs.msg import LaserScan
 import threading
+from .debug_publisher import DebugPublisher
 
 class TurtleBot3Env(gym.Env):
     def __init__(self):
         super(TurtleBot3Env, self).__init__()
 
+        # Define a range for LIDAR data that covers the front, left, right, and rear sides
+        self.full_indices = list(range(0, 360))  # Covers the full 360 degrees
+
         # Initialize ROS2 node
        rclpy.init(args=None)
        self.node = rclpy.create_node('turtlebot3_rl_env')
 
-        # Action space: linear and angular velocity
-        # Reduced linear velocity for more cautious movement
-        self.action_space = spaces.Box(low=np.array([-0.1, -0.05]), high=np.array([0.1, 0.05]), dtype=np.float32)
+        # Initialize Debug Publisher
+        self.debug_publisher = DebugPublisher()
 
-        # Observation space: laser scan data
-        self.observation_space = spaces.Box(low=0.0, high=3.5, shape=(360,), dtype=np.float32)
+        # Action space: linear and angular velocity (Normalized to [-1, 1] range)
+        self.action_space = spaces.Box(low=np.array([-1.0, -1.0]), high=np.array([1.0, 1.0]), dtype=np.float32)
 
+        # Observation space: Now includes front, left, right, and rear LIDAR data
+        self.observation_space = spaces.Box(low=0.0, high=3.5, shape=(len(self.full_indices),), dtype=np.float32)
+
         # Publishers and subscribers
-        self.cmd_vel_pub = self.node.create_publisher(Twist, 'cmd_vel', 10)
+        self.cmd_vel_pub = self.node.create_publisher(Twist, 'cmd_vel', 1)  # Use a smaller queue size for faster response
         self.scan_sub = self.node.create_subscription(
             LaserScan,
             '/scan',
             self.scan_callback,
-            10  # Queue size
+            1  # Smaller queue size to prioritize the latest data
         )
 
         # Initialize scan data
-        self.scan_data = np.zeros(360, dtype=np.float32)
+        self.scan_data = np.zeros(len(self.full_indices), dtype=np.float32)
+        self.last_action = np.zeros(2)  # Initialize last_action to avoid uninitialized errors
 
-        # Use the executor to manage the node spinning
-        self.executor = MultiThreadedExecutor()
+        # Use a single-threaded executor for simpler, faster node execution
+        self.executor = SingleThreadedExecutor()
         self.executor.add_node(self.node)
 
         # Start a separate thread to spin the ROS2 node
@@ -44,8 +51,8 @@ def __init__(self):
         self.spin_thread.start()
 
     def scan_callback(self, msg):
-        # Handle laser scan data
-        raw_scan_data = np.array(msg.ranges, dtype=np.float32)
+        # Capture the full 360-degree LIDAR data
+        raw_scan_data = np.array(msg.ranges, dtype=np.float32)[self.full_indices]
 
         # Replace 0.0 values with a large number, indicating 'no detection'
         raw_scan_data[raw_scan_data == 0.0] = np.inf
@@ -61,73 +68,83 @@ def reset(self, seed=None, options=None):
             self.seed_val = seed
             self._np_random, seed = gym.utils.seeding.np_random(seed)
 
+        # Reset last_action to zero on reset
+        self.last_action = np.zeros(2)
+
         return self.scan_data.astype(np.float32), {}
 
     def step(self, action):
+        # Map normalized action back to original range
+        linear_velocity = np.clip(action[0], -1.0, 1.0) * 0.2  # Adjusted speed for more effective obstacle avoidance
+        angular_velocity = np.clip(action[1], -1.0, 1.0) * 0.2  # Adjust angular speed for better turning
+
         twist = Twist()
-        twist.linear.x = float(action[0])
-        twist.angular.z = float(action[1])
+        twist.linear.x = float(linear_velocity)
+        twist.angular.z = float(angular_velocity)
         self.cmd_vel_pub.publish(twist)
 
         # Log scan data for debugging
-        self.node.get_logger().info(f"Current scan data: {self.scan_data[:10]}")  # Log the first 10 laser scan values
+        self.node.get_logger().debug(f"Current scan data: {self.scan_data}")  # Use debug level to reduce logging overhead
 
         observation = self.scan_data.astype(np.float32)
-        reward = self.calculate_reward()
+        reward = self.calculate_reward(action, linear_velocity, angular_velocity)
         done = self.check_done()
 
-        # Log the action and the corresponding observation
-        self.node.get_logger().info(f"Action: {action}, Min Distance: {np.min(self.scan_data)}, Reward: {reward}")
+        # Publish debug information
+        self.debug_publisher.publish_debug_info(self.scan_data, action, reward, done)
 
         return observation, reward, done, False, {}
 
-    def calculate_reward(self):
-        min_distance = np.min(self.scan_data)
+    def calculate_reward(self, action, linear_velocity, angular_velocity):
+        # Split the 360-degree scan into front and rear sectors for reward shaping
+        front_min = np.min(np.concatenate((self.scan_data[:90], self.scan_data[-90:])))  # Front sector (first and last 90 degrees)
+        rear_min = np.min(self.scan_data[90:270])  # Rear sector
+
+        # Initialize reward
+        reward = 0.0
+
+        # Strong penalty for collisions (front or rear)
+        if front_min < 0.25 or rear_min < 0.25:
+            reward = -5000.0  # Strong penalty for collision
 
-        # Collision penalty
-        if min_distance < 0.2:
-            return -300.0  # High penalty for collision
+        # Reward for moving forward safely
+        if linear_velocity > 0 and front_min >= 0.5:
+            reward += 20.0  # Encourage forward movement if safe
 
-        # Reward for moving forward
-        forward_movement_reward = 1.0  # Maintain a modest reward for moving forward
+        # Reward for moving backward safely
+        if linear_velocity < 0 and rear_min >= 0.5:
+            reward += 20.0  # Encourage backward movement if safe
 
-        # Penalize proximity to obstacles more heavily
-        proximity_penalty = -50.0 * (0.5 - min_distance) if min_distance < 0.5 else 0.0
+        # Penalty for proximity to obstacles in the front or rear sectors
+        if front_min < 0.5 or rear_min < 0.5:
+            reward -= 1500.0 * (0.5 - min(front_min, rear_min))  # Stronger penalty for being too close to obstacles
 
-        # Reward for maintaining a safe distance
-        safe_distance_reward = 15.0 if min_distance > 0.5 else 0.0
+        # Penalize slow movements (to avoid the robot "creeping" forward too slowly)
+        if abs(linear_velocity) < 0.05:
+            reward -= 100.0  # Penalize very slow movements
 
-        # Small reward for each step the robot avoids collision (time-based reward)
-        time_step_reward = 0.2  # Encourage staying in a safe state over time
+        # Reward for turning away from obstacles
+        if (front_min < 0.3 or rear_min < 0.3) and angular_velocity != 0:
+            reward += 100.0  # Reward for turning away when too close to obstacles
 
-        # Combine rewards and penalties
-        reward = forward_movement_reward + proximity_penalty + safe_distance_reward + time_step_reward
+        # Update last action
+        self.last_action = action
 
         return reward
 
     def check_done(self):
-        min_distance = np.min(self.scan_data)
-        if min_distance < 0.2:
+        # Check the front and rear sectors to determine if the episode is done
+        front_min = np.min(np.concatenate((self.scan_data[:90], self.scan_data[-90:])))  # Front sector (first and last 90 degrees)
+        rear_min = np.min(self.scan_data[90:270])  # Rear sector
+
+        # End the episode if a collision is detected in front or rear
+        if front_min < 0.25 or rear_min < 0.25:
             return True
         return False
 
     def close(self):
         self.executor.shutdown()
         self.spin_thread.join()
+        self.debug_publisher.destroy_node()
         rclpy.shutdown()
 
-def main():
-    rclpy.init(args=None)
-    env = TurtleBot3Env()
-
-    # Example test loop
-    for _ in range(100):  # Run for 100 steps as an example
-        action = env.action_space.sample()  # Random action
-        observation, reward, done, _, _ = env.step(action)
-        if done:
-            env.reset()
-
-    env.close()
-
-if __name__ == '__main__':
-    main()
diff --git a/turtlebot3_rl/turtlebot3_rl/ppo_turtlebot3.zip b/turtlebot3_rl/turtlebot3_rl/ppo_turtlebot3.zip
deleted file mode 100644
index 3f217dc..0000000
Binary files a/turtlebot3_rl/turtlebot3_rl/ppo_turtlebot3.zip and /dev/null differ
diff --git a/turtlebot3_rl/turtlebot3_rl/sac_turtlebot3_final.zip b/turtlebot3_rl/turtlebot3_rl/sac_turtlebot3_final.zip
new file mode 100644
index 0000000..f067ca4
Binary files /dev/null and b/turtlebot3_rl/turtlebot3_rl/sac_turtlebot3_final.zip differ
diff --git a/turtlebot3_rl/turtlebot3_rl/test.py b/turtlebot3_rl/turtlebot3_rl/test.py
index ffd208b..4d660d0 100644
--- a/turtlebot3_rl/turtlebot3_rl/test.py
+++ b/turtlebot3_rl/turtlebot3_rl/test.py
@@ -1,22 +1,39 @@
-from stable_baselines3 import PPO
+import gym
+from stable_baselines3 import SAC
 from turtlebot3_rl.env import TurtleBot3Env
-import numpy as np
 
 def main():
+    # Initialize the environment
     env = TurtleBot3Env()
 
     # Load the trained model
-    model = PPO.load("ppo_turtlebot3")
+    model = SAC.load("sac_turtlebot3_final.zip")
 
-    obs, _ = env.reset()
-    for _ in range(20):  # Test for a small number of steps
-        action, _states = model.predict(obs, deterministic=True)
-        obs, reward, done, truncated, info = env.step(action)
-        print(f"Action: {action}, Min Distance: {np.min(obs)}, Reward: {reward}, Done: {done}")
-        if done or truncated:
-            obs, _ = env.reset()
+    try:
+        while True:  # Run episodes until interrupted
+            obs, _ = env.reset()  # Unpack the observation from the tuple
+            done = False
+            total_reward = 0.0
 
-    env.close()
+            while not done:
+                # The model predicts an action based on the observation
+                action, _states = model.predict(obs, deterministic=True)
+
+                # Apply the action in the environment
+                obs, reward, done, _, info = env.step(action)  # Unpack all returned values
+
+                # Accumulate the reward for this episode
+                total_reward += reward
+
+            print(f"Episode finished: Total Reward: {total_reward}")
+
+    except KeyboardInterrupt:
+        print("Testing interrupted by user. Exiting...")
+
+    finally:
+        # Close the environment properly
+        env.close()
 
 if __name__ == '__main__':
     main()
+
diff --git a/turtlebot3_rl/turtlebot3_rl/train.py b/turtlebot3_rl/turtlebot3_rl/train.py
index 9b98b96..dfb7e88 100644
--- a/turtlebot3_rl/turtlebot3_rl/train.py
+++ b/turtlebot3_rl/turtlebot3_rl/train.py
@@ -1,26 +1,79 @@
-import gym
-from stable_baselines3 import PPO
+'''import gym
+from stable_baselines3 import SAC
+from stable_baselines3.common.monitor import Monitor
 from stable_baselines3.common.env_checker import check_env
+from stable_baselines3.common.callbacks import CheckpointCallback, EvalCallback
 from turtlebot3_rl.env import TurtleBot3Env
-import numpy as np
 
-np.bool = np.bool_
-
 def main():
+    # Initialize the environment
     env = TurtleBot3Env()
-    check_env(env)
+    check_env(env)  # Ensure that the environment adheres to the Stable Baselines3 API
+
+    # Wrap the environment in a Monitor for logging
+    env = Monitor(env)
+
+    # Define the total number of timesteps for training
+    timesteps = 1000000  # Define this before using it in the lambda function
+
+    # Define the learning rate schedule
+    learning_rate = lambda x: 0.0003 * (1 - x / timesteps)
+
+    # Define the RL algorithm (SAC) with potential adjustments
+    model = SAC("MlpPolicy", env, verbose=1, learning_rate=learning_rate, buffer_size=200000, learning_starts=1000, batch_size=128, tau=0.005, gamma=0.99, train_freq=4, gradient_steps=4, use_sde=True)
+
+    # Callbacks for saving models and evaluation
+    checkpoint_interval = 50000  # Define the interval for saving checkpoints
+    checkpoint_callback = CheckpointCallback(save_freq=checkpoint_interval, save_path='./models/', name_prefix='sac_turtlebot3')
+    eval_callback = EvalCallback(env, best_model_save_path='./models/', log_path='./logs/', eval_freq=checkpoint_interval, deterministic=True)
 
-    # Define the RL algorithm (PPO in this case)
-    model = PPO("MlpPolicy", env, ent_coef=0.01, learning_rate=0.000001, n_steps=2048, batch_size=64, n_epochs=10, clip_range=0.1, verbose=1)
+    # Train the model with checkpoints
+    model.learn(total_timesteps=timesteps, reset_num_timesteps=False, callback=[checkpoint_callback, eval_callback])
 
-    # Train the model
-    model.learn(total_timesteps=1000000)#50000
+    # Save the final model
+    model.save("sac_turtlebot3_final")
+
+    # Close the environment
+    env.close()
+
+if __name__ == '__main__':
+    main()
+'''
+import gym
+from stable_baselines3 import SAC
+from stable_baselines3.common.monitor import Monitor
+from stable_baselines3.common.env_checker import check_env
+from stable_baselines3.common.callbacks import CheckpointCallback, EvalCallback
+from turtlebot3_rl.env import TurtleBot3Env
+
+def main():
+    # Initialize the environment
+    env = TurtleBot3Env()
+    check_env(env)  # Ensure that the environment adheres to the Stable Baselines3 API
 
-    # Save the model
-    model.save("ppo_turtlebot3")
+    # Wrap the environment in a Monitor for logging
+    env = Monitor(env)
+
+    # Define the total number of additional timesteps for training
+    additional_timesteps = 500000  # Define how many more timesteps you want to train
+
+    # Load the existing model
+    model = SAC.load("sac_turtlebot3_final.zip", env=env)
+
+    # Callbacks for saving models and evaluation during continued training
+    checkpoint_interval = 50000  # Define the interval for saving checkpoints
+    checkpoint_callback = CheckpointCallback(save_freq=checkpoint_interval, save_path='./models/', name_prefix='sac_turtlebot3')
+    eval_callback = EvalCallback(env, best_model_save_path='./models/', log_path='./logs/', eval_freq=checkpoint_interval, deterministic=True)
+
+    # Continue training the model with additional timesteps
+    model.learn(total_timesteps=additional_timesteps, reset_num_timesteps=False, callback=[checkpoint_callback, eval_callback])
+
+    # Save the updated model
+    model.save("sac_turtlebot3_final_finetuned")
 
     # Close the environment
     env.close()
 
 if __name__ == '__main__':
     main()
+
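Note: env.py now does `from .debug_publisher import DebugPublisher`, but turtlebot3_rl/debug_publisher.py is not included in this patch. The sketch below only illustrates the interface the environment assumes (a ROS 2 node constructed with no arguments, a publish_debug_info(scan_data, action, reward, done) method, and destroy_node() inherited from Node); the node name, topic, and message formatting here are assumptions, not the project's actual implementation.

# debug_publisher.py -- illustrative sketch only; not part of this patch.
# Node name, topic, and message format are assumed for the example.
import numpy as np
from rclpy.node import Node
from std_msgs.msg import String


class DebugPublisher(Node):
    def __init__(self):
        super().__init__('turtlebot3_rl_debug')                    # assumed node name
        self.pub = self.create_publisher(String, 'rl_debug', 10)   # assumed topic

    def publish_debug_info(self, scan_data, action, reward, done):
        # Publish a compact summary of the current step for external monitoring
        msg = String()
        msg.data = (
            f"min_dist={float(np.min(scan_data)):.3f} "
            f"action=[{float(action[0]):.3f}, {float(action[1]):.3f}] "
            f"reward={float(reward):.2f} done={done}"
        )
        self.pub.publish(msg)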