
Commit

8/18
The first training session is complete and performance is quite good; the model can now continue training in a more complex environment and can also be tested against dynamic obstacles. (Model name: sac_turtlebot3_final.zip)
StanleyChueh committed Aug 18, 2024
1 parent 7e7fa03 commit be8983a
Showing 5 changed files with 161 additions and 74 deletions.
119 changes: 68 additions & 51 deletions turtlebot3_rl/turtlebot3_rl/env.py
@@ -3,49 +3,56 @@
import numpy as np
import rclpy
from rclpy.node import Node
from rclpy.executors import MultiThreadedExecutor
from rclpy.executors import SingleThreadedExecutor
from geometry_msgs.msg import Twist
from sensor_msgs.msg import LaserScan
import threading
from .debug_publisher import DebugPublisher

class TurtleBot3Env(gym.Env):
def __init__(self):
super(TurtleBot3Env, self).__init__()

# Define a range for LIDAR data that covers the front, left, right, and rear sides
self.full_indices = list(range(0, 360)) # Covers the full 360 degrees

# Initialize ROS2 node
rclpy.init(args=None)
self.node = rclpy.create_node('turtlebot3_rl_env')

# Action space: linear and angular velocity
# Reduced linear velocity for more cautious movement
self.action_space = spaces.Box(low=np.array([-0.1, -0.05]), high=np.array([0.1, 0.05]), dtype=np.float32)
# Initialize Debug Publisher
self.debug_publisher = DebugPublisher()

# Observation space: laser scan data
self.observation_space = spaces.Box(low=0.0, high=3.5, shape=(360,), dtype=np.float32)
# Action space: linear and angular velocity (Normalized to [-1, 1] range)
self.action_space = spaces.Box(low=np.array([-1.0, -1.0]), high=np.array([1.0, 1.0]), dtype=np.float32)

# Observation space: Now includes front, left, right, and rear LIDAR data
self.observation_space = spaces.Box(low=0.0, high=3.5, shape=(len(self.full_indices),), dtype=np.float32)

# Publishers and subscribers
self.cmd_vel_pub = self.node.create_publisher(Twist, 'cmd_vel', 10)
self.cmd_vel_pub = self.node.create_publisher(Twist, 'cmd_vel', 1) # Use a smaller queue size for faster response
self.scan_sub = self.node.create_subscription(
LaserScan,
'/scan',
self.scan_callback,
10 # Queue size
1 # Smaller queue size to prioritize the latest data
)

# Initialize scan data
self.scan_data = np.zeros(360, dtype=np.float32)
self.scan_data = np.zeros(len(self.full_indices), dtype=np.float32)
self.last_action = np.zeros(2) # Initialize last_action to avoid uninitialized errors

# Use the executor to manage the node spinning
self.executor = MultiThreadedExecutor()
# Use a single-threaded executor for simpler, faster node execution
self.executor = SingleThreadedExecutor()
self.executor.add_node(self.node)

# Start a separate thread to spin the ROS2 node
self.spin_thread = threading.Thread(target=self.executor.spin)
self.spin_thread.start()

def scan_callback(self, msg):
# Handle laser scan data
raw_scan_data = np.array(msg.ranges, dtype=np.float32)
# Capture the full 360-degree LIDAR data
raw_scan_data = np.array(msg.ranges, dtype=np.float32)[self.full_indices]

# Replace 0.0 values with a large number, indicating 'no detection'
raw_scan_data[raw_scan_data == 0.0] = np.inf
@@ -61,73 +68,83 @@ def reset(self, seed=None, options=None):
self.seed_val = seed
self._np_random, seed = gym.utils.seeding.np_random(seed)

# Reset last_action to zero on reset
self.last_action = np.zeros(2)

return self.scan_data.astype(np.float32), {}

def step(self, action):
# Map normalized action back to original range
linear_velocity = np.clip(action[0], -1.0, 1.0) * 0.2 # Adjusted speed for more effective obstacle avoidance
angular_velocity = np.clip(action[1], -1.0, 1.0) * 0.2 # Adjust angular speed for better turning

twist = Twist()
twist.linear.x = float(action[0])
twist.angular.z = float(action[1])
twist.linear.x = float(linear_velocity)
twist.angular.z = float(angular_velocity)
self.cmd_vel_pub.publish(twist)

# Log scan data for debugging
self.node.get_logger().info(f"Current scan data: {self.scan_data[:10]}") # Log the first 10 laser scan values
self.node.get_logger().debug(f"Current scan data: {self.scan_data}") # Use debug level to reduce logging overhead

observation = self.scan_data.astype(np.float32)
reward = self.calculate_reward()
reward = self.calculate_reward(action, linear_velocity, angular_velocity)
done = self.check_done()

# Log the action and the corresponding observation
self.node.get_logger().info(f"Action: {action}, Min Distance: {np.min(self.scan_data)}, Reward: {reward}")
# Publish debug information
self.debug_publisher.publish_debug_info(self.scan_data, action, reward, done)

return observation, reward, done, False, {}

def calculate_reward(self):
min_distance = np.min(self.scan_data)
def calculate_reward(self, action, linear_velocity, angular_velocity):
# Focus on the full 360-degree sector for calculating rewards
front_min = np.min(np.concatenate((self.scan_data[:90], self.scan_data[-90:])))  # Front sector (wraps around index 0)
rear_min = np.min(self.scan_data[90:270]) # Rear sector

# Initialize reward
reward = 0.0

# Strong penalty for collisions (front or rear)
if front_min < 0.25 or rear_min < 0.25:
reward = -5000.0 # Strong penalty for collision

# Collision penalty
if min_distance < 0.2:
return -300.0 # High penalty for collision
# Reward for moving forward safely
if linear_velocity > 0 and front_min >= 0.5:
reward += 20.0 # Encourage forward movement if safe

# Reward for moving forward
forward_movement_reward = 1.0 # Maintain a modest reward for moving forward
# Reward for moving backward safely
if linear_velocity < 0 and rear_min >= 0.5:
reward += 20.0 # Encourage backward movement if safe

# Penalize proximity to obstacles more heavily
proximity_penalty = -50.0 * (0.5 - min_distance) if min_distance < 0.5 else 0.0
# Penalty for proximity to obstacles in the front or rear sectors
if front_min < 0.5 or rear_min < 0.5:
reward -= 1500.0 * (0.5 - min(front_min, rear_min)) # Stronger penalty for being too close to obstacles

# Reward for maintaining a safe distance
safe_distance_reward = 15.0 if min_distance > 0.5 else 0.0
# Penalize slow movements (to avoid the robot "creeping" forward too slowly)
if abs(linear_velocity) < 0.05:
reward -= 100.0 # Penalize very slow movements

# Small reward for each step the robot avoids collision (time-based reward)
time_step_reward = 0.2 # Encourage staying in a safe state over time
# Reward for turning away from obstacles
if (front_min < 0.3 or rear_min < 0.3) and angular_velocity != 0:
reward += 100.0 # Reward for turning away when too close to obstacles

# Combine rewards and penalties
reward = forward_movement_reward + proximity_penalty + safe_distance_reward + time_step_reward
# Update last action
self.last_action = action

return reward

def check_done(self):
min_distance = np.min(self.scan_data)
if min_distance < 0.2:
# Check the broader sector to determine if the episode is done
front_min = np.min(np.concatenate((self.scan_data[:90], self.scan_data[-90:])))  # Front sector (wraps around index 0)
rear_min = np.min(self.scan_data[90:270]) # Rear sector

# End the episode if a collision is detected in front or rear
if front_min < 0.25 or rear_min < 0.25:
return True
return False

def close(self):
self.executor.shutdown()
self.spin_thread.join()
self.debug_publisher.destroy_node()
rclpy.shutdown()

def main():
rclpy.init(args=None)
env = TurtleBot3Env()

# Example test loop
for _ in range(100): # Run for 100 steps as an example
action = env.action_space.sample() # Random action
observation, reward, done, _, _ = env.step(action)
if done:
env.reset()

env.close()

if __name__ == '__main__':
main()
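
The reward and termination logic above both reduce the 360-degree scan to a front minimum and a rear minimum, where the front sector wraps around index 0 (indices 270-359 followed by 0-89). Below is a minimal standalone sketch of that reduction and of the action scaling applied in step(), assuming a 360-element range array with 0 degrees pointing straight ahead; the helper names are illustrative, not part of the committed code.

import numpy as np

def sector_minima(scan):
    # Front sector wraps around index 0: 270-359 degrees plus 0-89 degrees.
    front = np.concatenate((scan[:90], scan[-90:]))
    # Rear sector is the contiguous 90-269 degree slice.
    rear = scan[90:270]
    return float(np.min(front)), float(np.min(rear))

def scale_action(action):
    # Map the normalized [-1, 1] action onto the velocity limits used in step().
    linear = float(np.clip(action[0], -1.0, 1.0)) * 0.2   # m/s
    angular = float(np.clip(action[1], -1.0, 1.0)) * 0.2  # rad/s
    return linear, angular

# Example: an obstacle 0.3 m directly behind the robot trips the rear collision check.
scan = np.full(360, 3.5, dtype=np.float32)
scan[180] = 0.3
front_min, rear_min = sector_minima(scan)
print(front_min, rear_min)  # 3.5 0.3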
Binary file removed turtlebot3_rl/turtlebot3_rl/ppo_turtlebot3.zip
Binary file not shown.
Binary file not shown.
39 changes: 28 additions & 11 deletions turtlebot3_rl/turtlebot3_rl/test.py
@@ -1,22 +1,39 @@
from stable_baselines3 import PPO
import gym
from stable_baselines3 import SAC
from turtlebot3_rl.env import TurtleBot3Env
import numpy as np

def main():
# Initialize the environment
env = TurtleBot3Env()

# Load the trained model
model = PPO.load("ppo_turtlebot3")
model = SAC.load("sac_turtlebot3_final.zip")

obs, _ = env.reset()
for _ in range(20): # Test for a small number of steps
action, _states = model.predict(obs, deterministic=True)
obs, reward, done, truncated, info = env.step(action)
print(f"Action: {action}, Min Distance: {np.min(obs)}, Reward: {reward}, Done: {done}")
if done or truncated:
obs, _ = env.reset()
try:
while True: # Infinite loop
obs, _ = env.reset() # Unpack the observation from the tuple
done = False
total_reward = 0.0

env.close()
while not done:
# The model predicts an action based on the observation
action, _states = model.predict(obs, deterministic=True)

# Apply the action in the environment
obs, reward, done, _, info = env.step(action) # Unpack all returned values

# Accumulate the reward for this episode
total_reward += reward

print(f"Episode finished: Total Reward: {total_reward}")

except KeyboardInterrupt:
print("Testing interrupted by user. Exiting...")

finally:
# Close the environment properly
env.close()

if __name__ == '__main__':
main()
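
For a quick quantitative check of the trained policy, Stable-Baselines3 also provides an evaluate_policy helper that runs a fixed number of episodes and reports the mean and standard deviation of the episodic reward. A minimal sketch, assuming the environment above and the sac_turtlebot3_final.zip checkpoint are available locally:

from stable_baselines3 import SAC
from stable_baselines3.common.evaluation import evaluate_policy
from turtlebot3_rl.env import TurtleBot3Env

env = TurtleBot3Env()
model = SAC.load("sac_turtlebot3_final.zip", env=env)

# Run five deterministic episodes and summarize the episodic reward.
mean_reward, std_reward = evaluate_policy(model, env, n_eval_episodes=5, deterministic=True)
print(f"Mean reward: {mean_reward:.1f} +/- {std_reward:.1f}")

env.close()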

77 changes: 65 additions & 12 deletions turtlebot3_rl/turtlebot3_rl/train.py
@@ -1,26 +1,79 @@
import gym
from stable_baselines3 import PPO
'''import gym
from stable_baselines3 import SAC
from stable_baselines3.common.monitor import Monitor
from stable_baselines3.common.env_checker import check_env
from stable_baselines3.common.callbacks import CheckpointCallback, EvalCallback
from turtlebot3_rl.env import TurtleBot3Env
import numpy as np
np.bool = np.bool_

def main():
# Initialize the environment
env = TurtleBot3Env()
check_env(env)
check_env(env) # Ensure that the environment adheres to the Stable Baselines3 API
# Wrap the environment in a Monitor for logging
env = Monitor(env)
# Define the total number of timesteps for training
timesteps = 1000000 # Define this before using it in the lambda function
# Define the learning rate schedule
learning_rate = lambda x: 0.0003 * (1 - x / timesteps)
# Define the RL algorithm (SAC) with potential adjustments
model = SAC("MlpPolicy", env, verbose=1, learning_rate=learning_rate, buffer_size=200000, learning_starts=1000, batch_size=128, tau=0.005, gamma=0.99, train_freq=4, gradient_steps=4, use_sde=True)
# Callbacks for saving models and evaluation
checkpoint_interval = 50000 # Define the interval for saving checkpoints
checkpoint_callback = CheckpointCallback(save_freq=checkpoint_interval, save_path='./models/', name_prefix='sac_turtlebot3')
eval_callback = EvalCallback(env, best_model_save_path='./models/', log_path='./logs/', eval_freq=checkpoint_interval, deterministic=True)
# Define the RL algorithm (PPO in this case)
model = PPO("MlpPolicy", env, ent_coef=0.01, learning_rate=0.000001, n_steps=2048, batch_size=64, n_epochs=10, clip_range=0.1, verbose=1)
# Train the model with checkpoints
model.learn(total_timesteps=timesteps, reset_num_timesteps=False, callback=[checkpoint_callback, eval_callback])
# Train the model
model.learn(total_timesteps=1000000)#50000
# Save the final model
model.save("sac_turtlebot3_final")
# Close the environment
env.close()
if __name__ == '__main__':
main()
'''
import gym
from stable_baselines3 import SAC
from stable_baselines3.common.monitor import Monitor
from stable_baselines3.common.env_checker import check_env
from stable_baselines3.common.callbacks import CheckpointCallback, EvalCallback
from turtlebot3_rl.env import TurtleBot3Env

def main():
# Initialize the environment
env = TurtleBot3Env()
check_env(env) # Ensure that the environment adheres to the Stable Baselines3 API

# Save the model
model.save("ppo_turtlebot3")
# Wrap the environment in a Monitor for logging
env = Monitor(env)

# Define the total number of additional timesteps for training
additional_timesteps = 500000 # Define how many more timesteps you want to train

# Load the existing model
model = SAC.load("sac_turtlebot3_final.zip", env=env)

# Callbacks for saving models and evaluation during continued training
checkpoint_interval = 50000 # Define the interval for saving checkpoints
checkpoint_callback = CheckpointCallback(save_freq=checkpoint_interval, save_path='./models/', name_prefix='sac_turtlebot3')
eval_callback = EvalCallback(env, best_model_save_path='./models/', log_path='./logs/', eval_freq=checkpoint_interval, deterministic=True)

# Continue training the model with additional timesteps
model.learn(total_timesteps=additional_timesteps, reset_num_timesteps=False, callback=[checkpoint_callback, eval_callback])

# Save the updated model
model.save("sac_turtlebot3_final_finetuned")

# Close the environment
env.close()

if __name__ == '__main__':
main()
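
One note on the learning-rate schedule in the commented-out block: when learning_rate is a callable, Stable-Baselines3 calls it with the remaining training progress as a fraction that falls from 1.0 to 0.0, not with a timestep count, so a linear decay is usually written directly in terms of that fraction. A minimal sketch with illustrative hyperparameters and an illustrative save name:

from stable_baselines3 import SAC
from turtlebot3_rl.env import TurtleBot3Env

def linear_schedule(initial_lr):
    # SB3 passes progress_remaining, which falls from 1.0 at the start of
    # training to 0.0 at the end, so this decays the learning rate linearly.
    def schedule(progress_remaining):
        return initial_lr * progress_remaining
    return schedule

env = TurtleBot3Env()
model = SAC("MlpPolicy", env, learning_rate=linear_schedule(3e-4), verbose=1)
model.learn(total_timesteps=100_000)
model.save("sac_turtlebot3_lr_schedule")  # illustrative filename
env.close()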
