training Turn 0.1.0 version
This commit is contained in:
@@ -153,7 +153,7 @@ class WalkEnv(gym.Env):
|
|||||||
self.min_stance_rad = 0.10
|
self.min_stance_rad = 0.10
|
||||||
|
|
||||||
# Small reset perturbations for robustness training.
|
# Small reset perturbations for robustness training.
|
||||||
self.enable_reset_perturb = True
|
self.enable_reset_perturb = False
|
||||||
self.reset_beam_yaw_range_deg = 180 # randomize target direction fully to encourage learning a real walk instead of a fixed gait
|
self.reset_beam_yaw_range_deg = 180 # randomize target direction fully to encourage learning a real walk instead of a fixed gait
|
||||||
self.reset_joint_noise_rad = 0.025
|
self.reset_joint_noise_rad = 0.025
|
||||||
self.reset_perturb_steps = 4
|
self.reset_perturb_steps = 4
|
||||||
@@ -162,6 +162,7 @@ class WalkEnv(gym.Env):
|
|||||||
self.previous_action = np.zeros(len(self.Player.robot.ROBOT_MOTORS))
|
self.previous_action = np.zeros(len(self.Player.robot.ROBOT_MOTORS))
|
||||||
self.last_action_for_reward = np.zeros(len(self.Player.robot.ROBOT_MOTORS))
|
self.last_action_for_reward = np.zeros(len(self.Player.robot.ROBOT_MOTORS))
|
||||||
self.previous_pos = np.array([0.0, 0.0]) # Track previous position
|
self.previous_pos = np.array([0.0, 0.0]) # Track previous position
|
||||||
|
self.last_yaw_error = None
|
||||||
self.Player.server.connect()
|
self.Player.server.connect()
|
||||||
# sleep(2.0) # Longer wait for connection to establish completely
|
# sleep(2.0) # Longer wait for connection to establish completely
|
||||||
self.Player.server.send_immediate(
|
self.Player.server.send_immediate(
|
||||||
@@ -204,6 +205,10 @@ class WalkEnv(gym.Env):
|
|||||||
except OSError:
|
except OSError:
|
||||||
pass
|
pass
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def _wrap_to_pi(angle_rad: float) -> float:
|
||||||
|
return (angle_rad + math.pi) % (2.0 * math.pi) - math.pi
|
||||||
|
|
||||||
def observe(self, init=False):
|
def observe(self, init=False):
|
||||||
|
|
||||||
"""获取当前观测值"""
|
"""获取当前观测值"""
|
||||||
@@ -312,11 +317,8 @@ class WalkEnv(gym.Env):
|
|||||||
if seed is not None:
|
if seed is not None:
|
||||||
np.random.seed(seed)
|
np.random.seed(seed)
|
||||||
|
|
||||||
length1 = 2 # randomize target distance
|
target_distance = np.random.uniform(1.2, 2.8)
|
||||||
length2 = np.random.uniform(0.6, 1) # randomize target distance
|
target_bearing_deg = np.random.uniform(-180.0, 180.0)
|
||||||
length3 = np.random.uniform(0.6, 1) # randomize target distance
|
|
||||||
angle2 = np.random.uniform(-30, 30) # randomize initial orientation
|
|
||||||
angle3 = np.random.uniform(-30, 30) # randomize target direction
|
|
||||||
|
|
||||||
self.step_counter = 0
|
self.step_counter = 0
|
||||||
self.waypoint_index = 0
|
self.waypoint_index = 0
|
||||||
@@ -324,6 +326,7 @@ class WalkEnv(gym.Env):
|
|||||||
self.previous_action = np.zeros(len(self.Player.robot.ROBOT_MOTORS))
|
self.previous_action = np.zeros(len(self.Player.robot.ROBOT_MOTORS))
|
||||||
self.last_action_for_reward = np.zeros(len(self.Player.robot.ROBOT_MOTORS))
|
self.last_action_for_reward = np.zeros(len(self.Player.robot.ROBOT_MOTORS))
|
||||||
self.previous_pos = np.array([0.0, 0.0]) # Initialize for first step
|
self.previous_pos = np.array([0.0, 0.0]) # Initialize for first step
|
||||||
|
self.last_yaw_error = None
|
||||||
self.walk_cycle_step = 0
|
self.walk_cycle_step = 0
|
||||||
|
|
||||||
# 随机 beam 目标位置和朝向,增加训练多样性
|
# 随机 beam 目标位置和朝向,增加训练多样性
|
||||||
@@ -379,12 +382,14 @@ class WalkEnv(gym.Env):
|
|||||||
self.initial_position = np.array(self.Player.world.global_position[:2])
|
self.initial_position = np.array(self.Player.world.global_position[:2])
|
||||||
self.previous_pos = self.initial_position.copy() # Critical: set to actual position
|
self.previous_pos = self.initial_position.copy() # Critical: set to actual position
|
||||||
self.act = np.zeros(self.no_of_actions, np.float32)
|
self.act = np.zeros(self.no_of_actions, np.float32)
|
||||||
# Build target in the robot's current forward direction instead of fixed global +x.
|
# Randomize global target bearing so policy must learn to rotate toward it first.
|
||||||
heading_deg = float(r.global_orientation_euler[2])
|
heading_deg = float(r.global_orientation_euler[2])
|
||||||
forward_offset = MathOps.rotate_2d_vec(np.array([length1, 0.0]), heading_deg, is_rad=False)
|
target_offset = MathOps.rotate_2d_vec(
|
||||||
point1 = self.initial_position + forward_offset
|
np.array([target_distance, 0.0]),
|
||||||
point2 = point1 + MathOps.rotate_2d_vec(np.array([length2, 0]), angle2, is_rad=False)
|
heading_deg + target_bearing_deg,
|
||||||
point3 = point2 + MathOps.rotate_2d_vec(np.array([length3, 0]), angle3, is_rad=False)
|
is_rad=False,
|
||||||
|
)
|
||||||
|
point1 = self.initial_position + target_offset
|
||||||
self.point_list = [point1]
|
self.point_list = [point1]
|
||||||
self.target_position = self.point_list[self.waypoint_index]
|
self.target_position = self.point_list[self.waypoint_index]
|
||||||
self.initial_height = self.Player.world.global_position[2]
|
self.initial_height = self.Player.world.global_position[2]
|
||||||
@@ -408,7 +413,7 @@ class WalkEnv(gym.Env):
|
|||||||
if is_fallen:
|
if is_fallen:
|
||||||
# remain = max(0, 800 - self.step_counter)
|
# remain = max(0, 800 - self.step_counter)
|
||||||
# return -8.0 - 0.01 * remain
|
# return -8.0 - 0.01 * remain
|
||||||
return -1.0
|
return -2
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
@@ -423,16 +428,43 @@ class WalkEnv(gym.Env):
|
|||||||
# forward_step = float(np.dot(delta_pos, forward_dir))
|
# forward_step = float(np.dot(delta_pos, forward_dir))
|
||||||
# lateral_step = float(np.linalg.norm(delta_pos - forward_dir * forward_step))
|
# lateral_step = float(np.linalg.norm(delta_pos - forward_dir * forward_step))
|
||||||
|
|
||||||
# 奖励项
|
alive_bonus = 0.5
|
||||||
# progress_reward = 2 * forward_step
|
|
||||||
# lateral_penalty = -0.1 * lateral_step
|
|
||||||
alive_bonus = 2.0
|
|
||||||
|
|
||||||
# action_penalty = -0.01 * float(np.linalg.norm(action))
|
# action_penalty = -0.01 * float(np.linalg.norm(action))
|
||||||
smoothness_penalty = -0.01 * float(np.linalg.norm(action - self.last_action_for_reward))
|
smoothness_penalty = -0.06 * float(np.linalg.norm(action - self.last_action_for_reward))
|
||||||
|
|
||||||
posture_penalty = -0.3 * (tilt_mag)
|
posture_penalty = -0.5 * (tilt_mag)
|
||||||
ang_vel_penalty = -0.02 * ang_vel_mag
|
ang_vel_penalty = -0.04 * ang_vel_mag
|
||||||
|
|
||||||
|
# Turn-to-target shaping.
|
||||||
|
to_target = self.target_position - current_pos
|
||||||
|
dist_to_target = float(np.linalg.norm(to_target))
|
||||||
|
if dist_to_target > 1e-6:
|
||||||
|
target_yaw = math.atan2(float(to_target[1]), float(to_target[0]))
|
||||||
|
else:
|
||||||
|
target_yaw = 0.0
|
||||||
|
|
||||||
|
robot_yaw = math.radians(float(robot.global_orientation_euler[2]))
|
||||||
|
yaw_error = self._wrap_to_pi(target_yaw - robot_yaw)
|
||||||
|
|
||||||
|
# Dense alignment reward in [-1, 1], max when facing target.
|
||||||
|
heading_align_reward = 1.6 * math.cos(yaw_error)
|
||||||
|
|
||||||
|
# Reward reducing heading error across consecutive control steps.
|
||||||
|
if self.last_yaw_error is None:
|
||||||
|
heading_progress_reward = 0.0
|
||||||
|
else:
|
||||||
|
heading_progress_reward = 1.2 * (abs(self.last_yaw_error) - abs(yaw_error))
|
||||||
|
self.last_yaw_error = yaw_error
|
||||||
|
|
||||||
|
# Encourage yaw rotation in the correct direction while far from alignment.
|
||||||
|
yaw_rate = float(np.deg2rad(robot.gyroscope[2]))
|
||||||
|
turn_dir = float(np.sign(yaw_error))
|
||||||
|
turn_cap = max(0.03, 0.08 * abs(yaw_error))
|
||||||
|
turn_rate_reward = float(np.clip(0.35 * turn_dir * yaw_rate, -turn_cap, turn_cap)) if abs(yaw_error) > math.radians(10.0) else 0.0
|
||||||
|
|
||||||
|
# Small bonus for holding a good heading; prevents oscillation near target angle.
|
||||||
|
heading_hold_bonus = 0.25 if abs(yaw_error) < math.radians(10.0) else 0.0
|
||||||
|
|
||||||
# Use simulator joint readings in training frame to shape lateral stance.
|
# Use simulator joint readings in training frame to shape lateral stance.
|
||||||
joint_pos = np.deg2rad(
|
joint_pos = np.deg2rad(
|
||||||
@@ -455,16 +487,6 @@ class WalkEnv(gym.Env):
|
|||||||
height_error = height - target_height
|
height_error = height - target_height
|
||||||
height_penalty = -0.5 * abs(height_error) # 惩罚高度偏离,系数可调
|
height_penalty = -0.5 * abs(height_error) # 惩罚高度偏离,系数可调
|
||||||
|
|
||||||
# # 在 compute_reward 开头附近,添加高度变化率计算
|
|
||||||
# if not hasattr(self, 'last_height'):
|
|
||||||
# self.last_height = height
|
|
||||||
# self.last_height_time = self.step_counter # 可选,用于时间间隔
|
|
||||||
# height_rate = height - self.last_height # 正为上升,负为下降
|
|
||||||
# self.last_height = height
|
|
||||||
|
|
||||||
# 惩罚高度下降(负变化率)
|
|
||||||
# height_down_penalty = -5.0 * max(0, -height_rate) # 系数可调,-height_rate 为正表示下降幅度
|
|
||||||
|
|
||||||
# # 在 compute_reward 中
|
# # 在 compute_reward 中
|
||||||
# if self.step_counter > 50:
|
# if self.step_counter > 50:
|
||||||
# avg_prev_action = np.mean(self.prev_action_history, axis=0)
|
# avg_prev_action = np.mean(self.prev_action_history, axis=0)
|
||||||
@@ -488,6 +510,10 @@ class WalkEnv(gym.Env):
|
|||||||
+ height_penalty
|
+ height_penalty
|
||||||
+ stance_collapse_penalty
|
+ stance_collapse_penalty
|
||||||
+ cross_leg_penalty
|
+ cross_leg_penalty
|
||||||
|
+ heading_align_reward
|
||||||
|
+ heading_progress_reward
|
||||||
|
+ turn_rate_reward
|
||||||
|
+ heading_hold_bonus
|
||||||
# + exploration_bonus
|
# + exploration_bonus
|
||||||
# + height_down_penalty
|
# + height_down_penalty
|
||||||
)
|
)
|
||||||
@@ -502,6 +528,10 @@ class WalkEnv(gym.Env):
|
|||||||
f"posture_penalty:{posture_penalty:.4f}",
|
f"posture_penalty:{posture_penalty:.4f}",
|
||||||
f"stance_collapse_penalty:{stance_collapse_penalty:.4f}",
|
f"stance_collapse_penalty:{stance_collapse_penalty:.4f}",
|
||||||
f"cross_leg_penalty:{cross_leg_penalty:.4f}",
|
f"cross_leg_penalty:{cross_leg_penalty:.4f}",
|
||||||
|
f"heading_align_reward:{heading_align_reward:.4f}",
|
||||||
|
f"heading_progress_reward:{heading_progress_reward:.4f}",
|
||||||
|
f"turn_rate_reward:{turn_rate_reward:.4f}",
|
||||||
|
f"heading_hold_bonus:{heading_hold_bonus:.4f}",
|
||||||
# f"ang_vel_penalty:{ang_vel_penalty:.4f}",
|
# f"ang_vel_penalty:{ang_vel_penalty:.4f}",
|
||||||
# f"height_down_penalty:{height_down_penalty:.4f}",
|
# f"height_down_penalty:{height_down_penalty:.4f}",
|
||||||
# f"exploration_bonus:{exploration_bonus:.4f}"
|
# f"exploration_bonus:{exploration_bonus:.4f}"
|
||||||
@@ -569,7 +599,7 @@ class Train(Train_Base):
|
|||||||
minibatch_size = int(os.environ.get("GYM_CPU_TRAIN_BATCH_SIZE", "512")) # should be a factor of (n_steps_per_env * n_envs)
|
minibatch_size = int(os.environ.get("GYM_CPU_TRAIN_BATCH_SIZE", "512")) # should be a factor of (n_steps_per_env * n_envs)
|
||||||
total_steps = 30000000
|
total_steps = 30000000
|
||||||
learning_rate = float(os.environ.get("GYM_CPU_TRAIN_LR", "3e-4"))
|
learning_rate = float(os.environ.get("GYM_CPU_TRAIN_LR", "3e-4"))
|
||||||
folder_name = f'Walk_R{self.robot_type}'
|
folder_name = f'Turn_R{self.robot_type}'
|
||||||
model_path = f'./scripts/gyms/logs/{folder_name}/'
|
model_path = f'./scripts/gyms/logs/{folder_name}/'
|
||||||
|
|
||||||
print(f"Model path: {model_path}")
|
print(f"Model path: {model_path}")
|
||||||
@@ -596,7 +626,7 @@ class Train(Train_Base):
|
|||||||
sleep(server_warmup_sec)
|
sleep(server_warmup_sec)
|
||||||
print("Servers started, creating environments...")
|
print("Servers started, creating environments...")
|
||||||
|
|
||||||
env = SubprocVecEnv([init_env(i, monitor=True) for i in range(n_envs)])
|
env = SubprocVecEnv([init_env(i, monitor=True) for i in range(n_envs)], start_method="spawn")
|
||||||
# Use single-process eval env to avoid extra subprocess fragility during callback evaluation.
|
# Use single-process eval env to avoid extra subprocess fragility during callback evaluation.
|
||||||
eval_env = DummyVecEnv([init_env(n_envs, monitor=True)])
|
eval_env = DummyVecEnv([init_env(n_envs, monitor=True)])
|
||||||
|
|
||||||
@@ -633,7 +663,7 @@ class Train(Train_Base):
|
|||||||
)
|
)
|
||||||
|
|
||||||
model_path = self.learn_model(model, total_steps, model_path, eval_env=eval_env,
|
model_path = self.learn_model(model, total_steps, model_path, eval_env=eval_env,
|
||||||
eval_freq=n_steps_per_env * 20, save_freq=n_steps_per_env * 20, eval_eps=100,
|
eval_freq=n_steps_per_env * 20, save_freq=n_steps_per_env * 20, eval_eps=30,
|
||||||
backup_env_file=__file__)
|
backup_env_file=__file__)
|
||||||
except KeyboardInterrupt:
|
except KeyboardInterrupt:
|
||||||
sleep(1) # wait for child processes
|
sleep(1) # wait for child processes
|
||||||
@@ -694,8 +724,8 @@ if __name__ == "__main__":
|
|||||||
run_mode = os.environ.get("GYM_CPU_MODE", "train").strip().lower()
|
run_mode = os.environ.get("GYM_CPU_MODE", "train").strip().lower()
|
||||||
|
|
||||||
if run_mode == "test":
|
if run_mode == "test":
|
||||||
test_model_file = os.environ.get("GYM_CPU_TEST_MODEL", "scripts/gyms/logs/Walk_R0_004/best_model.zip")
|
test_model_file = os.environ.get("GYM_CPU_TEST_MODEL", "scripts/gyms/logs/Turn_R0_004/best_model.zip")
|
||||||
test_folder = os.environ.get("GYM_CPU_TEST_FOLDER", "scripts/gyms/logs/Walk_R0_004/")
|
test_folder = os.environ.get("GYM_CPU_TEST_FOLDER", "scripts/gyms/logs/Turn_R0_004/")
|
||||||
trainer.test({"model_file": test_model_file, "folder_dir": test_folder})
|
trainer.test({"model_file": test_model_file, "folder_dir": test_folder})
|
||||||
else:
|
else:
|
||||||
retrain_model = os.environ.get("GYM_CPU_TRAIN_MODEL", "").strip()
|
retrain_model = os.environ.get("GYM_CPU_TRAIN_MODEL", "").strip()
|
||||||
|
|||||||
Reference in New Issue
Block a user