training Turn 0.1.0 version

This commit is contained in:
xxh
2026-03-25 03:20:22 -04:00
parent 77120ecb7b
commit 8ab57840ba

View File

@@ -153,7 +153,7 @@ class WalkEnv(gym.Env):
self.min_stance_rad = 0.10 self.min_stance_rad = 0.10
# Small reset perturbations for robustness training. # Small reset perturbations for robustness training.
self.enable_reset_perturb = True self.enable_reset_perturb = False
self.reset_beam_yaw_range_deg = 180 # randomize target direction fully to encourage learning a real walk instead of a fixed gait self.reset_beam_yaw_range_deg = 180 # randomize target direction fully to encourage learning a real walk instead of a fixed gait
self.reset_joint_noise_rad = 0.025 self.reset_joint_noise_rad = 0.025
self.reset_perturb_steps = 4 self.reset_perturb_steps = 4
@@ -162,6 +162,7 @@ class WalkEnv(gym.Env):
self.previous_action = np.zeros(len(self.Player.robot.ROBOT_MOTORS)) self.previous_action = np.zeros(len(self.Player.robot.ROBOT_MOTORS))
self.last_action_for_reward = np.zeros(len(self.Player.robot.ROBOT_MOTORS)) self.last_action_for_reward = np.zeros(len(self.Player.robot.ROBOT_MOTORS))
self.previous_pos = np.array([0.0, 0.0]) # Track previous position self.previous_pos = np.array([0.0, 0.0]) # Track previous position
self.last_yaw_error = None
self.Player.server.connect() self.Player.server.connect()
# sleep(2.0) # Longer wait for connection to establish completely # sleep(2.0) # Longer wait for connection to establish completely
self.Player.server.send_immediate( self.Player.server.send_immediate(
@@ -204,6 +205,10 @@ class WalkEnv(gym.Env):
except OSError: except OSError:
pass pass
@staticmethod
def _wrap_to_pi(angle_rad: float) -> float:
return (angle_rad + math.pi) % (2.0 * math.pi) - math.pi
def observe(self, init=False): def observe(self, init=False):
"""获取当前观测值""" """获取当前观测值"""
@@ -312,11 +317,8 @@ class WalkEnv(gym.Env):
if seed is not None: if seed is not None:
np.random.seed(seed) np.random.seed(seed)
length1 = 2 # randomize target distance target_distance = np.random.uniform(1.2, 2.8)
length2 = np.random.uniform(0.6, 1) # randomize target distance target_bearing_deg = np.random.uniform(-180.0, 180.0)
length3 = np.random.uniform(0.6, 1) # randomize target distance
angle2 = np.random.uniform(-30, 30) # randomize initial orientation
angle3 = np.random.uniform(-30, 30) # randomize target direction
self.step_counter = 0 self.step_counter = 0
self.waypoint_index = 0 self.waypoint_index = 0
@@ -324,6 +326,7 @@ class WalkEnv(gym.Env):
self.previous_action = np.zeros(len(self.Player.robot.ROBOT_MOTORS)) self.previous_action = np.zeros(len(self.Player.robot.ROBOT_MOTORS))
self.last_action_for_reward = np.zeros(len(self.Player.robot.ROBOT_MOTORS)) self.last_action_for_reward = np.zeros(len(self.Player.robot.ROBOT_MOTORS))
self.previous_pos = np.array([0.0, 0.0]) # Initialize for first step self.previous_pos = np.array([0.0, 0.0]) # Initialize for first step
self.last_yaw_error = None
self.walk_cycle_step = 0 self.walk_cycle_step = 0
# 随机 beam 目标位置和朝向,增加训练多样性 # 随机 beam 目标位置和朝向,增加训练多样性
@@ -379,12 +382,14 @@ class WalkEnv(gym.Env):
self.initial_position = np.array(self.Player.world.global_position[:2]) self.initial_position = np.array(self.Player.world.global_position[:2])
self.previous_pos = self.initial_position.copy() # Critical: set to actual position self.previous_pos = self.initial_position.copy() # Critical: set to actual position
self.act = np.zeros(self.no_of_actions, np.float32) self.act = np.zeros(self.no_of_actions, np.float32)
# Build target in the robot's current forward direction instead of fixed global +x. # Randomize global target bearing so policy must learn to rotate toward it first.
heading_deg = float(r.global_orientation_euler[2]) heading_deg = float(r.global_orientation_euler[2])
forward_offset = MathOps.rotate_2d_vec(np.array([length1, 0.0]), heading_deg, is_rad=False) target_offset = MathOps.rotate_2d_vec(
point1 = self.initial_position + forward_offset np.array([target_distance, 0.0]),
point2 = point1 + MathOps.rotate_2d_vec(np.array([length2, 0]), angle2, is_rad=False) heading_deg + target_bearing_deg,
point3 = point2 + MathOps.rotate_2d_vec(np.array([length3, 0]), angle3, is_rad=False) is_rad=False,
)
point1 = self.initial_position + target_offset
self.point_list = [point1] self.point_list = [point1]
self.target_position = self.point_list[self.waypoint_index] self.target_position = self.point_list[self.waypoint_index]
self.initial_height = self.Player.world.global_position[2] self.initial_height = self.Player.world.global_position[2]
@@ -408,7 +413,7 @@ class WalkEnv(gym.Env):
if is_fallen: if is_fallen:
# remain = max(0, 800 - self.step_counter) # remain = max(0, 800 - self.step_counter)
# return -8.0 - 0.01 * remain # return -8.0 - 0.01 * remain
return -1.0 return -2
@@ -423,16 +428,43 @@ class WalkEnv(gym.Env):
# forward_step = float(np.dot(delta_pos, forward_dir)) # forward_step = float(np.dot(delta_pos, forward_dir))
# lateral_step = float(np.linalg.norm(delta_pos - forward_dir * forward_step)) # lateral_step = float(np.linalg.norm(delta_pos - forward_dir * forward_step))
# 奖励项 alive_bonus = 0.5
# progress_reward = 2 * forward_step
# lateral_penalty = -0.1 * lateral_step
alive_bonus = 2.0
# action_penalty = -0.01 * float(np.linalg.norm(action)) # action_penalty = -0.01 * float(np.linalg.norm(action))
smoothness_penalty = -0.01 * float(np.linalg.norm(action - self.last_action_for_reward)) smoothness_penalty = -0.06 * float(np.linalg.norm(action - self.last_action_for_reward))
posture_penalty = -0.3 * (tilt_mag) posture_penalty = -0.5 * (tilt_mag)
ang_vel_penalty = -0.02 * ang_vel_mag ang_vel_penalty = -0.04 * ang_vel_mag
# Turn-to-target shaping.
to_target = self.target_position - current_pos
dist_to_target = float(np.linalg.norm(to_target))
if dist_to_target > 1e-6:
target_yaw = math.atan2(float(to_target[1]), float(to_target[0]))
else:
target_yaw = 0.0
robot_yaw = math.radians(float(robot.global_orientation_euler[2]))
yaw_error = self._wrap_to_pi(target_yaw - robot_yaw)
# Dense alignment reward in [-1, 1], max when facing target.
heading_align_reward = 1.6 * math.cos(yaw_error)
# Reward reducing heading error across consecutive control steps.
if self.last_yaw_error is None:
heading_progress_reward = 0.0
else:
heading_progress_reward = 1.2 * (abs(self.last_yaw_error) - abs(yaw_error))
self.last_yaw_error = yaw_error
# Encourage yaw rotation in the correct direction while far from alignment.
yaw_rate = float(np.deg2rad(robot.gyroscope[2]))
turn_dir = float(np.sign(yaw_error))
turn_cap = max(0.03, 0.08 * abs(yaw_error))
turn_rate_reward = float(np.clip(0.35 * turn_dir * yaw_rate, -turn_cap, turn_cap)) if abs(yaw_error) > math.radians(10.0) else 0.0
# Small bonus for holding a good heading; prevents oscillation near target angle.
heading_hold_bonus = 0.25 if abs(yaw_error) < math.radians(10.0) else 0.0
# Use simulator joint readings in training frame to shape lateral stance. # Use simulator joint readings in training frame to shape lateral stance.
joint_pos = np.deg2rad( joint_pos = np.deg2rad(
@@ -455,16 +487,6 @@ class WalkEnv(gym.Env):
height_error = height - target_height height_error = height - target_height
height_penalty = -0.5 * abs(height_error) # 惩罚高度偏离,系数可调 height_penalty = -0.5 * abs(height_error) # 惩罚高度偏离,系数可调
# # 在 compute_reward 开头附近,添加高度变化率计算
# if not hasattr(self, 'last_height'):
# self.last_height = height
# self.last_height_time = self.step_counter # 可选,用于时间间隔
# height_rate = height - self.last_height # 正为上升,负为下降
# self.last_height = height
# 惩罚高度下降(负变化率)
# height_down_penalty = -5.0 * max(0, -height_rate) # 系数可调,-height_rate 为正表示下降幅度
# # 在 compute_reward 中 # # 在 compute_reward 中
# if self.step_counter > 50: # if self.step_counter > 50:
# avg_prev_action = np.mean(self.prev_action_history, axis=0) # avg_prev_action = np.mean(self.prev_action_history, axis=0)
@@ -488,6 +510,10 @@ class WalkEnv(gym.Env):
+ height_penalty + height_penalty
+ stance_collapse_penalty + stance_collapse_penalty
+ cross_leg_penalty + cross_leg_penalty
+ heading_align_reward
+ heading_progress_reward
+ turn_rate_reward
+ heading_hold_bonus
# + exploration_bonus # + exploration_bonus
# + height_down_penalty # + height_down_penalty
) )
@@ -497,11 +523,15 @@ class WalkEnv(gym.Env):
# f"progress_reward:{progress_reward:.4f}", # f"progress_reward:{progress_reward:.4f}",
# f"lateral_penalty:{lateral_penalty:.4f}", # f"lateral_penalty:{lateral_penalty:.4f}",
# f"action_penalty:{action_penalty:.4f}"s, # f"action_penalty:{action_penalty:.4f}"s,
f"height_penalty:{height_penalty:.4f}", f"height_penalty:{height_penalty:.4f}",
f"smoothness_penalty:{smoothness_penalty:.4f},", f"smoothness_penalty:{smoothness_penalty:.4f},",
f"posture_penalty:{posture_penalty:.4f}", f"posture_penalty:{posture_penalty:.4f}",
f"stance_collapse_penalty:{stance_collapse_penalty:.4f}", f"stance_collapse_penalty:{stance_collapse_penalty:.4f}",
f"cross_leg_penalty:{cross_leg_penalty:.4f}", f"cross_leg_penalty:{cross_leg_penalty:.4f}",
f"heading_align_reward:{heading_align_reward:.4f}",
f"heading_progress_reward:{heading_progress_reward:.4f}",
f"turn_rate_reward:{turn_rate_reward:.4f}",
f"heading_hold_bonus:{heading_hold_bonus:.4f}",
# f"ang_vel_penalty:{ang_vel_penalty:.4f}", # f"ang_vel_penalty:{ang_vel_penalty:.4f}",
# f"height_down_penalty:{height_down_penalty:.4f}", # f"height_down_penalty:{height_down_penalty:.4f}",
# f"exploration_bonus:{exploration_bonus:.4f}" # f"exploration_bonus:{exploration_bonus:.4f}"
@@ -569,7 +599,7 @@ class Train(Train_Base):
minibatch_size = int(os.environ.get("GYM_CPU_TRAIN_BATCH_SIZE", "512")) # should be a factor of (n_steps_per_env * n_envs) minibatch_size = int(os.environ.get("GYM_CPU_TRAIN_BATCH_SIZE", "512")) # should be a factor of (n_steps_per_env * n_envs)
total_steps = 30000000 total_steps = 30000000
learning_rate = float(os.environ.get("GYM_CPU_TRAIN_LR", "3e-4")) learning_rate = float(os.environ.get("GYM_CPU_TRAIN_LR", "3e-4"))
folder_name = f'Walk_R{self.robot_type}' folder_name = f'Turn_R{self.robot_type}'
model_path = f'./scripts/gyms/logs/{folder_name}/' model_path = f'./scripts/gyms/logs/{folder_name}/'
print(f"Model path: {model_path}") print(f"Model path: {model_path}")
@@ -596,7 +626,7 @@ class Train(Train_Base):
sleep(server_warmup_sec) sleep(server_warmup_sec)
print("Servers started, creating environments...") print("Servers started, creating environments...")
env = SubprocVecEnv([init_env(i, monitor=True) for i in range(n_envs)]) env = SubprocVecEnv([init_env(i, monitor=True) for i in range(n_envs)], start_method="spawn")
# Use single-process eval env to avoid extra subprocess fragility during callback evaluation. # Use single-process eval env to avoid extra subprocess fragility during callback evaluation.
eval_env = DummyVecEnv([init_env(n_envs, monitor=True)]) eval_env = DummyVecEnv([init_env(n_envs, monitor=True)])
@@ -633,7 +663,7 @@ class Train(Train_Base):
) )
model_path = self.learn_model(model, total_steps, model_path, eval_env=eval_env, model_path = self.learn_model(model, total_steps, model_path, eval_env=eval_env,
eval_freq=n_steps_per_env * 20, save_freq=n_steps_per_env * 20, eval_eps=100, eval_freq=n_steps_per_env * 20, save_freq=n_steps_per_env * 20, eval_eps=30,
backup_env_file=__file__) backup_env_file=__file__)
except KeyboardInterrupt: except KeyboardInterrupt:
sleep(1) # wait for child processes sleep(1) # wait for child processes
@@ -694,8 +724,8 @@ if __name__ == "__main__":
run_mode = os.environ.get("GYM_CPU_MODE", "train").strip().lower() run_mode = os.environ.get("GYM_CPU_MODE", "train").strip().lower()
if run_mode == "test": if run_mode == "test":
test_model_file = os.environ.get("GYM_CPU_TEST_MODEL", "scripts/gyms/logs/Walk_R0_004/best_model.zip") test_model_file = os.environ.get("GYM_CPU_TEST_MODEL", "scripts/gyms/logs/Turn_R0_004/best_model.zip")
test_folder = os.environ.get("GYM_CPU_TEST_FOLDER", "scripts/gyms/logs/Walk_R0_004/") test_folder = os.environ.get("GYM_CPU_TEST_FOLDER", "scripts/gyms/logs/Turn_R0_004/")
trainer.test({"model_file": test_model_file, "folder_dir": test_folder}) trainer.test({"model_file": test_model_file, "folder_dir": test_folder})
else: else:
retrain_model = os.environ.get("GYM_CPU_TRAIN_MODEL", "").strip() retrain_model = os.environ.get("GYM_CPU_TRAIN_MODEL", "").strip()