From 8ab57840bab7fb8d8a3fd65ea749e749b4339626 Mon Sep 17 00:00:00 2001
From: xxh
Date: Wed, 25 Mar 2026 03:20:22 -0400
Subject: [PATCH] training: Turn 0.1.0

---
 scripts/gyms/Walk.py | 108 +++++++++++++++++++++++++++----------------
 1 file changed, 69 insertions(+), 39 deletions(-)

diff --git a/scripts/gyms/Walk.py b/scripts/gyms/Walk.py
index 4983f04..2e7ccef 100755
--- a/scripts/gyms/Walk.py
+++ b/scripts/gyms/Walk.py
@@ -153,7 +153,7 @@ class WalkEnv(gym.Env):
         self.min_stance_rad = 0.10
 
         # Small reset perturbations for robustness training.
-        self.enable_reset_perturb = True
+        self.enable_reset_perturb = False
         self.reset_beam_yaw_range_deg = 180  # randomize target direction fully to encourage learning a real walk instead of a fixed gait
         self.reset_joint_noise_rad = 0.025
         self.reset_perturb_steps = 4
@@ -162,6 +162,7 @@
         self.previous_action = np.zeros(len(self.Player.robot.ROBOT_MOTORS))
         self.last_action_for_reward = np.zeros(len(self.Player.robot.ROBOT_MOTORS))
         self.previous_pos = np.array([0.0, 0.0])  # Track previous position
+        self.last_yaw_error = None
         self.Player.server.connect()
         # sleep(2.0)  # Longer wait for the connection to establish completely
         self.Player.server.send_immediate(
@@ -204,6 +205,10 @@
         except OSError:
             pass
 
+    @staticmethod
+    def _wrap_to_pi(angle_rad: float) -> float:
+        return (angle_rad + math.pi) % (2.0 * math.pi) - math.pi
+
     def observe(self, init=False):
         """Return the current observation."""
 
@@ -312,11 +317,8 @@
         if seed is not None:
             np.random.seed(seed)
 
-        length1 = 2  # randomize target distance
-        length2 = np.random.uniform(0.6, 1)  # randomize target distance
-        length3 = np.random.uniform(0.6, 1)  # randomize target distance
-        angle2 = np.random.uniform(-30, 30)  # randomize initial orientation
-        angle3 = np.random.uniform(-30, 30)  # randomize target direction
+        target_distance = np.random.uniform(1.2, 2.8)
+        target_bearing_deg = np.random.uniform(-180.0, 180.0)
 
         self.step_counter = 0
         self.waypoint_index = 0
@@ -324,6 +326,7 @@
         self.previous_action = np.zeros(len(self.Player.robot.ROBOT_MOTORS))
         self.last_action_for_reward = np.zeros(len(self.Player.robot.ROBOT_MOTORS))
         self.previous_pos = np.array([0.0, 0.0])  # Initialize for first step
+        self.last_yaw_error = None
         self.walk_cycle_step = 0
 
         # Randomize the beam target position and orientation to increase training diversity.
@@ -379,12 +382,14 @@
         self.initial_position = np.array(self.Player.world.global_position[:2])
         self.previous_pos = self.initial_position.copy()  # Critical: set to actual position
         self.act = np.zeros(self.no_of_actions, np.float32)
-        # Build target in the robot's current forward direction instead of fixed global +x.
+        # Randomize the global target bearing so the policy must learn to rotate toward it first.
         heading_deg = float(r.global_orientation_euler[2])
-        forward_offset = MathOps.rotate_2d_vec(np.array([length1, 0.0]), heading_deg, is_rad=False)
-        point1 = self.initial_position + forward_offset
-        point2 = point1 + MathOps.rotate_2d_vec(np.array([length2, 0]), angle2, is_rad=False)
-        point3 = point2 + MathOps.rotate_2d_vec(np.array([length3, 0]), angle3, is_rad=False)
+        target_offset = MathOps.rotate_2d_vec(
+            np.array([target_distance, 0.0]),
+            heading_deg + target_bearing_deg,
+            is_rad=False,
+        )
+        point1 = self.initial_position + target_offset
         self.point_list = [point1]
         self.target_position = self.point_list[self.waypoint_index]
         self.initial_height = self.Player.world.global_position[2]
@@ -408,7 +413,7 @@
         if is_fallen:
             # remain = max(0, 800 - self.step_counter)
             # return -8.0 - 0.01 * remain
-            return -1.0
+            return -2.0
@@ -423,16 +428,43 @@
         # forward_step = float(np.dot(delta_pos, forward_dir))
         # lateral_step = float(np.linalg.norm(delta_pos - forward_dir * forward_step))
 
-        # Reward terms
-        # progress_reward = 2 * forward_step
-        # lateral_penalty = -0.1 * lateral_step
-        alive_bonus = 2.0
+        alive_bonus = 0.5
         # action_penalty = -0.01 * float(np.linalg.norm(action))
-        smoothness_penalty = -0.01 * float(np.linalg.norm(action - self.last_action_for_reward))
+        smoothness_penalty = -0.06 * float(np.linalg.norm(action - self.last_action_for_reward))
 
-        posture_penalty = -0.3 * (tilt_mag)
-        ang_vel_penalty = -0.02 * ang_vel_mag
+        posture_penalty = -0.5 * (tilt_mag)
+        ang_vel_penalty = -0.04 * ang_vel_mag
+
+        # Turn-to-target shaping.
+        to_target = self.target_position - current_pos
+        dist_to_target = float(np.linalg.norm(to_target))
+        if dist_to_target > 1e-6:
+            target_yaw = math.atan2(float(to_target[1]), float(to_target[0]))
+        else:
+            target_yaw = 0.0
+
+        robot_yaw = math.radians(float(robot.global_orientation_euler[2]))
+        yaw_error = self._wrap_to_pi(target_yaw - robot_yaw)
+
+        # Dense alignment reward (scaled cosine of the yaw error), maximal when facing the target.
+        heading_align_reward = 1.6 * math.cos(yaw_error)
+
+        # Reward reducing heading error across consecutive control steps.
+        if self.last_yaw_error is None:
+            heading_progress_reward = 0.0
+        else:
+            heading_progress_reward = 1.2 * (abs(self.last_yaw_error) - abs(yaw_error))
+        self.last_yaw_error = yaw_error
+
+        # Encourage yaw rotation in the correct direction while far from alignment.
+        yaw_rate = float(np.deg2rad(robot.gyroscope[2]))
+        turn_dir = float(np.sign(yaw_error))
+        turn_cap = max(0.03, 0.08 * abs(yaw_error))
+        turn_rate_reward = float(np.clip(0.35 * turn_dir * yaw_rate, -turn_cap, turn_cap)) if abs(yaw_error) > math.radians(10.0) else 0.0
+
+        # Small bonus for holding a good heading; prevents oscillation near the target angle.
+        heading_hold_bonus = 0.25 if abs(yaw_error) < math.radians(10.0) else 0.0
 
         # Use simulator joint readings in training frame to shape lateral stance.
         joint_pos = np.deg2rad(
@@ -455,16 +487,6 @@
         height_error = height - target_height
         height_penalty = -0.5 * abs(height_error)  # Penalize height deviation; coefficient is tunable
 
-        # # Near the start of compute_reward, add a height rate-of-change computation.
-        # if not hasattr(self, 'last_height'):
-        #     self.last_height = height
-        #     self.last_height_time = self.step_counter  # optional, for time intervals
-        # height_rate = height - self.last_height  # positive = rising, negative = falling
-        # self.last_height = height
-
-        # Penalize height drops (negative rate of change)
-        # height_down_penalty = -5.0 * max(0, -height_rate)  # coefficient tunable; -height_rate > 0 gives the drop magnitude
-
         # # In compute_reward:
         # if self.step_counter > 50:
         #     avg_prev_action = np.mean(self.prev_action_history, axis=0)
@@ -488,6 +510,10 @@
             + height_penalty
             + stance_collapse_penalty
             + cross_leg_penalty
+            + heading_align_reward
+            + heading_progress_reward
+            + turn_rate_reward
+            + heading_hold_bonus
             # + exploration_bonus
             # + height_down_penalty
         )
@@ -497,11 +523,15 @@
             # f"progress_reward:{progress_reward:.4f}",
             # f"lateral_penalty:{lateral_penalty:.4f}",
             # f"action_penalty:{action_penalty:.4f}",
-            f"height_penalty:{height_penalty:.4f}",
-            f"smoothness_penalty:{smoothness_penalty:.4f},",
-            f"posture_penalty:{posture_penalty:.4f}",
-            f"stance_collapse_penalty:{stance_collapse_penalty:.4f}",
-            f"cross_leg_penalty:{cross_leg_penalty:.4f}",
+            f"height_penalty:{height_penalty:.4f}",
+            f"smoothness_penalty:{smoothness_penalty:.4f}",
+            f"posture_penalty:{posture_penalty:.4f}",
+            f"stance_collapse_penalty:{stance_collapse_penalty:.4f}",
+            f"cross_leg_penalty:{cross_leg_penalty:.4f}",
+            f"heading_align_reward:{heading_align_reward:.4f}",
+            f"heading_progress_reward:{heading_progress_reward:.4f}",
+            f"turn_rate_reward:{turn_rate_reward:.4f}",
+            f"heading_hold_bonus:{heading_hold_bonus:.4f}",
             # f"ang_vel_penalty:{ang_vel_penalty:.4f}",
             # f"height_down_penalty:{height_down_penalty:.4f}",
             # f"exploration_bonus:{exploration_bonus:.4f}"
@@ -569,7 +599,7 @@ class Train(Train_Base):
         minibatch_size = int(os.environ.get("GYM_CPU_TRAIN_BATCH_SIZE", "512"))  # should be a factor of (n_steps_per_env * n_envs)
         total_steps = 30000000
         learning_rate = float(os.environ.get("GYM_CPU_TRAIN_LR", "3e-4"))
-        folder_name = f'Walk_R{self.robot_type}'
+        folder_name = f'Turn_R{self.robot_type}'
         model_path = f'./scripts/gyms/logs/{folder_name}/'
 
         print(f"Model path: {model_path}")
@@ -596,7 +626,7 @@
         sleep(server_warmup_sec)
         print("Servers started, creating environments...")
 
-        env = SubprocVecEnv([init_env(i, monitor=True) for i in range(n_envs)])
+        env = SubprocVecEnv([init_env(i, monitor=True) for i in range(n_envs)], start_method="spawn")
         # Use single-process eval env to avoid extra subprocess fragility during callback evaluation.
         eval_env = DummyVecEnv([init_env(n_envs, monitor=True)])
@@ -633,7 +663,7 @@
             )
 
             model_path = self.learn_model(model, total_steps, model_path, eval_env=eval_env,
-                                          eval_freq=n_steps_per_env * 20, save_freq=n_steps_per_env * 20, eval_eps=100,
+                                          eval_freq=n_steps_per_env * 20, save_freq=n_steps_per_env * 20, eval_eps=30,
                                           backup_env_file=__file__)
         except KeyboardInterrupt:
             sleep(1)  # wait for child processes
@@ -694,8 +724,8 @@ if __name__ == "__main__":
     run_mode = os.environ.get("GYM_CPU_MODE", "train").strip().lower()
 
     if run_mode == "test":
-        test_model_file = os.environ.get("GYM_CPU_TEST_MODEL", "scripts/gyms/logs/Walk_R0_004/best_model.zip")
-        test_folder = os.environ.get("GYM_CPU_TEST_FOLDER", "scripts/gyms/logs/Walk_R0_004/")
+        test_model_file = os.environ.get("GYM_CPU_TEST_MODEL", "scripts/gyms/logs/Turn_R0_004/best_model.zip")
+        test_folder = os.environ.get("GYM_CPU_TEST_FOLDER", "scripts/gyms/logs/Turn_R0_004/")
         trainer.test({"model_file": test_model_file, "folder_dir": test_folder})
     else:
        retrain_model = os.environ.get("GYM_CPU_TRAIN_MODEL", "").strip()
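
Notes (illustrative sketches; not part of the diff):

A quick sanity check of the _wrap_to_pi helper added above. The formula maps any angle into [-pi, pi), so +pi itself normalizes to -pi:

import math

def wrap_to_pi(a: float) -> float:
    # Same formula as WalkEnv._wrap_to_pi; output range is [-pi, pi).
    return (a + math.pi) % (2.0 * math.pi) - math.pi

assert abs(wrap_to_pi(3 * math.pi) - (-math.pi)) < 1e-12               # 3*pi wraps to -pi
assert abs(wrap_to_pi(math.radians(350)) - math.radians(-10)) < 1e-12  # 350 deg -> -10 deg
assert abs(wrap_to_pi(math.radians(-190)) - math.radians(170)) < 1e-9  # -190 deg -> 170 deg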
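
The core of the patch is the turn-to-target shaping in the reward. Below is a minimal standalone sketch of those terms with the same coefficients; the inputs (robot_xy, target_xy, robot_yaw_deg, gyro_yaw_deg_s, last_yaw_error) are hypothetical stand-ins for the WalkEnv state and simulator readings:

import math
import numpy as np

def turn_shaping(robot_xy, target_xy, robot_yaw_deg, gyro_yaw_deg_s, last_yaw_error):
    """Return (shaping_reward, yaw_error) for one control step."""
    to_target = np.asarray(target_xy, float) - np.asarray(robot_xy, float)
    if float(np.linalg.norm(to_target)) > 1e-6:
        target_yaw = math.atan2(float(to_target[1]), float(to_target[0]))
    else:
        target_yaw = 0.0
    yaw_error = (target_yaw - math.radians(robot_yaw_deg) + math.pi) % (2.0 * math.pi) - math.pi

    align = 1.6 * math.cos(yaw_error)  # dense alignment term, max when facing the target
    # Progress term: zero on the first step (last_yaw_error is None), as in reset().
    progress = 0.0 if last_yaw_error is None else 1.2 * (abs(last_yaw_error) - abs(yaw_error))

    # Turn-rate term only applies while misaligned by more than 10 degrees,
    # and is clipped so it cannot dominate near alignment.
    turn_rate = 0.0
    if abs(yaw_error) > math.radians(10.0):
        cap = max(0.03, 0.08 * abs(yaw_error))
        turn_rate = float(np.clip(0.35 * float(np.sign(yaw_error)) * math.radians(gyro_yaw_deg_s), -cap, cap))

    hold = 0.25 if abs(yaw_error) < math.radians(10.0) else 0.0
    return align + progress + turn_rate + hold, yaw_error

# Example: robot at the origin facing +x, target directly behind it, gyro spinning
# counter-clockwise while the shortest turn is clockwise.
r, err = turn_shaping([0, 0], [-1.0, 0.0], 0.0, gyro_yaw_deg_s=30.0, last_yaw_error=None)
print(round(r, 3), round(math.degrees(err), 1))  # -1.783 -180.0 (turning the wrong way is penalized)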
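
The reset change collapses the old three-waypoint course into a single target at a uniformly random bearing from the robot's current heading. MathOps.rotate_2d_vec is repo-specific; assuming it is a standard counter-clockwise 2D rotation that takes degrees when is_rad=False, the placement reduces to:

import numpy as np

def place_target(initial_xy, heading_deg, target_distance, target_bearing_deg):
    # Rotate [distance, 0] by heading + bearing (counter-clockwise, degrees),
    # then offset from the spawn position -- the same geometry as reset().
    theta = np.deg2rad(heading_deg + target_bearing_deg)
    rot = np.array([[np.cos(theta), -np.sin(theta)],
                    [np.sin(theta),  np.cos(theta)]])
    return np.asarray(initial_xy, float) + rot @ np.array([target_distance, 0.0])

rng = np.random.default_rng(0)
target = place_target([0.0, 0.0],
                      heading_deg=90.0,
                      target_distance=rng.uniform(1.2, 2.8),
                      target_bearing_deg=rng.uniform(-180.0, 180.0))
print(target)  # a point 1.2-2.8 m away at a random bearing relative to the +y heading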
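
Finally, the vec-env construction now passes start_method="spawn", so each worker starts a fresh interpreter rather than fork-inheriting the parent's state. Since every WalkEnv opens its own server socket in __init__, fork-inherited descriptors are a plausible source of the fragility the eval comment alludes to, though that rationale is an assumption, not stated in the patch. A minimal sketch of the pattern with a placeholder Gymnasium env, since WalkEnv needs a running simulator:

import gymnasium as gym
from stable_baselines3.common.vec_env import DummyVecEnv, SubprocVecEnv

def init_env(rank: int):
    # Return a thunk; SubprocVecEnv calls it inside the worker process,
    # so any sockets are opened after the process boundary.
    def thunk():
        return gym.make("Pendulum-v1")  # placeholder for WalkEnv(rank, ...)
    return thunk

if __name__ == "__main__":  # spawn re-imports this module in each worker
    env = SubprocVecEnv([init_env(i) for i in range(4)], start_method="spawn")
    eval_env = DummyVecEnv([init_env(4)])  # single-process eval, as in the patch
    print(env.reset().shape)  # (4, 3) for Pendulum-v1
    env.close()
    eval_env.close()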