training Turn 0.1.0 version
@@ -153,7 +153,7 @@ class WalkEnv(gym.Env):
         self.min_stance_rad = 0.10

         # Small reset perturbations for robustness training.
-        self.enable_reset_perturb = True
+        self.enable_reset_perturb = False
         self.reset_beam_yaw_range_deg = 180  # randomize target direction fully to encourage learning a real walk instead of a fixed gait
         self.reset_joint_noise_rad = 0.025
         self.reset_perturb_steps = 4
@@ -162,6 +162,7 @@ class WalkEnv(gym.Env):
         self.previous_action = np.zeros(len(self.Player.robot.ROBOT_MOTORS))
         self.last_action_for_reward = np.zeros(len(self.Player.robot.ROBOT_MOTORS))
         self.previous_pos = np.array([0.0, 0.0])  # Track previous position
+        self.last_yaw_error = None
         self.Player.server.connect()
         # sleep(2.0)  # Longer wait for connection to establish completely
         self.Player.server.send_immediate(
@@ -204,6 +205,10 @@ class WalkEnv(gym.Env):
             except OSError:
                 pass

+    @staticmethod
+    def _wrap_to_pi(angle_rad: float) -> float:
+        return (angle_rad + math.pi) % (2.0 * math.pi) - math.pi
+
     def observe(self, init=False):

         """Get the current observation."""
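Note on the new helper: _wrap_to_pi maps any angle into [-pi, pi), so a heading error never exceeds a half turn in magnitude. A minimal standalone check (illustrative only, not part of the commit):

```python
import math

def _wrap_to_pi(angle_rad: float) -> float:
    # Map any angle onto the half-open interval [-pi, pi).
    return (angle_rad + math.pi) % (2.0 * math.pi) - math.pi

# 270 degrees wraps to -90 degrees; 3.5*pi wraps to -0.5*pi.
assert abs(_wrap_to_pi(math.radians(270.0)) - math.radians(-90.0)) < 1e-12
assert abs(_wrap_to_pi(3.5 * math.pi) - (-0.5 * math.pi)) < 1e-12
```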
@@ -312,11 +317,8 @@ class WalkEnv(gym.Env):
         if seed is not None:
             np.random.seed(seed)

-        length1 = 2  # randomize target distance
-        length2 = np.random.uniform(0.6, 1)  # randomize target distance
-        length3 = np.random.uniform(0.6, 1)  # randomize target distance
-        angle2 = np.random.uniform(-30, 30)  # randomize initial orientation
-        angle3 = np.random.uniform(-30, 30)  # randomize target direction
+        target_distance = np.random.uniform(1.2, 2.8)
+        target_bearing_deg = np.random.uniform(-180.0, 180.0)

         self.step_counter = 0
         self.waypoint_index = 0
@@ -324,6 +326,7 @@ class WalkEnv(gym.Env):
         self.previous_action = np.zeros(len(self.Player.robot.ROBOT_MOTORS))
         self.last_action_for_reward = np.zeros(len(self.Player.robot.ROBOT_MOTORS))
         self.previous_pos = np.array([0.0, 0.0])  # Initialize for first step
+        self.last_yaw_error = None
         self.walk_cycle_step = 0

         # Randomize the beam target position and orientation to increase training diversity
@@ -379,12 +382,14 @@ class WalkEnv(gym.Env):
         self.initial_position = np.array(self.Player.world.global_position[:2])
         self.previous_pos = self.initial_position.copy()  # Critical: set to actual position
         self.act = np.zeros(self.no_of_actions, np.float32)
-        # Build target in the robot's current forward direction instead of fixed global +x.
+        # Randomize global target bearing so policy must learn to rotate toward it first.
         heading_deg = float(r.global_orientation_euler[2])
-        forward_offset = MathOps.rotate_2d_vec(np.array([length1, 0.0]), heading_deg, is_rad=False)
-        point1 = self.initial_position + forward_offset
-        point2 = point1 + MathOps.rotate_2d_vec(np.array([length2, 0]), angle2, is_rad=False)
-        point3 = point2 + MathOps.rotate_2d_vec(np.array([length3, 0]), angle3, is_rad=False)
+        target_offset = MathOps.rotate_2d_vec(
+            np.array([target_distance, 0.0]),
+            heading_deg + target_bearing_deg,
+            is_rad=False,
+        )
+        point1 = self.initial_position + target_offset
         self.point_list = [point1]
         self.target_position = self.point_list[self.waypoint_index]
         self.initial_height = self.Player.world.global_position[2]
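The reset now builds a single waypoint from a random distance in [1.2, 2.8] m and a fully random bearing relative to the robot's current heading. Rotating [d, 0] by an angle is just d * [cos, sin], so the same geometry can be sketched without MathOps (names below are illustrative, not the project API):

```python
import numpy as np

def sample_target(initial_xy: np.ndarray, heading_deg: float,
                  rng: np.random.Generator) -> np.ndarray:
    # One waypoint: random distance, uniformly random bearing relative
    # to the current heading, so the policy must first turn toward it.
    distance = rng.uniform(1.2, 2.8)
    bearing_deg = rng.uniform(-180.0, 180.0)
    theta = np.deg2rad(heading_deg + bearing_deg)
    return initial_xy + distance * np.array([np.cos(theta), np.sin(theta)])

target = sample_target(np.zeros(2), heading_deg=30.0, rng=np.random.default_rng(0))
```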
@@ -408,7 +413,7 @@ class WalkEnv(gym.Env):
         if is_fallen:
             # remain = max(0, 800 - self.step_counter)
             # return -8.0 - 0.01 * remain
-            return -1.0
+            return -2
@@ -423,16 +428,43 @@ class WalkEnv(gym.Env):
         # forward_step = float(np.dot(delta_pos, forward_dir))
         # lateral_step = float(np.linalg.norm(delta_pos - forward_dir * forward_step))

         # Reward terms
         # progress_reward = 2 * forward_step
         # lateral_penalty = -0.1 * lateral_step
-        alive_bonus = 2.0
+        alive_bonus = 0.5

         # action_penalty = -0.01 * float(np.linalg.norm(action))
-        smoothness_penalty = -0.01 * float(np.linalg.norm(action - self.last_action_for_reward))
+        smoothness_penalty = -0.06 * float(np.linalg.norm(action - self.last_action_for_reward))

-        posture_penalty = -0.3 * (tilt_mag)
-        ang_vel_penalty = -0.02 * ang_vel_mag
+        posture_penalty = -0.5 * (tilt_mag)
+        ang_vel_penalty = -0.04 * ang_vel_mag
+
+        # Turn-to-target shaping.
+        to_target = self.target_position - current_pos
+        dist_to_target = float(np.linalg.norm(to_target))
+        if dist_to_target > 1e-6:
+            target_yaw = math.atan2(float(to_target[1]), float(to_target[0]))
+        else:
+            target_yaw = 0.0
+        robot_yaw = math.radians(float(robot.global_orientation_euler[2]))
+        yaw_error = self._wrap_to_pi(target_yaw - robot_yaw)
+
+        # Dense alignment reward (scaled cosine), max when facing the target.
+        heading_align_reward = 1.6 * math.cos(yaw_error)
+        # Reward reducing heading error across consecutive control steps.
+        if self.last_yaw_error is None:
+            heading_progress_reward = 0.0
+        else:
+            heading_progress_reward = 1.2 * (abs(self.last_yaw_error) - abs(yaw_error))
+        self.last_yaw_error = yaw_error
+
+        # Encourage yaw rotation in the correct direction while far from alignment.
+        yaw_rate = float(np.deg2rad(robot.gyroscope[2]))
+        turn_dir = float(np.sign(yaw_error))
+        turn_cap = max(0.03, 0.08 * abs(yaw_error))
+        turn_rate_reward = float(np.clip(0.35 * turn_dir * yaw_rate, -turn_cap, turn_cap)) if abs(yaw_error) > math.radians(10.0) else 0.0
+        # Small bonus for holding a good heading; prevents oscillation near target angle.
+        heading_hold_bonus = 0.25 if abs(yaw_error) < math.radians(10.0) else 0.0

         # Use simulator joint readings in training frame to shape lateral stance.
         joint_pos = np.deg2rad(
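The four heading terms depend only on yaw_error, its previous value, and the yaw rate, so they can be pulled out and sanity-checked in isolation. A sketch with the same coefficients as the hunk above (the coefficients are this commit's tunables, not canonical values):

```python
import math
import numpy as np

def heading_shaping(yaw_error: float, last_yaw_error, yaw_rate: float) -> float:
    # Dense alignment term, maximal when facing the target.
    align = 1.6 * math.cos(yaw_error)
    # Reward shrinking |yaw_error| between consecutive control steps.
    progress = 0.0 if last_yaw_error is None else 1.2 * (abs(last_yaw_error) - abs(yaw_error))
    # Reward spinning in the correct direction, clipped, only while misaligned.
    if abs(yaw_error) > math.radians(10.0):
        cap = max(0.03, 0.08 * abs(yaw_error))
        turn = float(np.clip(0.35 * np.sign(yaw_error) * yaw_rate, -cap, cap))
    else:
        turn = 0.0
    # Small bonus for holding a good heading near alignment.
    hold = 0.25 if abs(yaw_error) < math.radians(10.0) else 0.0
    return align + progress + turn + hold

# Facing the target and holding still collects the maximum: 1.6 + 0.25.
assert abs(heading_shaping(0.0, 0.0, 0.0) - 1.85) < 1e-12
```

Worth noting: align and hold pull toward zero error, progress rewards the transient, and the clipped turn term caps how much reward raw spin velocity can earn, so spinning in place is never the best policy.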
@@ -455,16 +487,6 @@ class WalkEnv(gym.Env):
         height_error = height - target_height
         height_penalty = -0.5 * abs(height_error)  # penalize height deviation; coefficient is tunable

-        # # Near the start of compute_reward, add a height rate-of-change calculation
-        # if not hasattr(self, 'last_height'):
-        #     self.last_height = height
-        #     self.last_height_time = self.step_counter  # optional, for time intervals
-        # height_rate = height - self.last_height  # positive = rising, negative = falling
-        # self.last_height = height
-
-        # Penalize height decreases (negative rate of change)
-        # height_down_penalty = -5.0 * max(0, -height_rate)  # coefficient tunable; positive -height_rate is the magnitude of the drop
-
         # # In compute_reward
         # if self.step_counter > 50:
         #     avg_prev_action = np.mean(self.prev_action_history, axis=0)
@@ -488,6 +510,10 @@ class WalkEnv(gym.Env):
             + height_penalty
             + stance_collapse_penalty
             + cross_leg_penalty
+            + heading_align_reward
+            + heading_progress_reward
+            + turn_rate_reward
+            + heading_hold_bonus
             # + exploration_bonus
             # + height_down_penalty
         )
@@ -502,6 +528,10 @@ class WalkEnv(gym.Env):
             f"posture_penalty:{posture_penalty:.4f}",
             f"stance_collapse_penalty:{stance_collapse_penalty:.4f}",
             f"cross_leg_penalty:{cross_leg_penalty:.4f}",
+            f"heading_align_reward:{heading_align_reward:.4f}",
+            f"heading_progress_reward:{heading_progress_reward:.4f}",
+            f"turn_rate_reward:{turn_rate_reward:.4f}",
+            f"heading_hold_bonus:{heading_hold_bonus:.4f}",
             # f"ang_vel_penalty:{ang_vel_penalty:.4f}",
             # f"height_down_penalty:{height_down_penalty:.4f}",
             # f"exploration_bonus:{exploration_bonus:.4f}"
@@ -569,7 +599,7 @@ class Train(Train_Base):
         minibatch_size = int(os.environ.get("GYM_CPU_TRAIN_BATCH_SIZE", "512"))  # should be a factor of (n_steps_per_env * n_envs)
         total_steps = 30000000
         learning_rate = float(os.environ.get("GYM_CPU_TRAIN_LR", "3e-4"))
-        folder_name = f'Walk_R{self.robot_type}'
+        folder_name = f'Turn_R{self.robot_type}'
         model_path = f'./scripts/gyms/logs/{folder_name}/'

         print(f"Model path: {model_path}")
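Since batch size and learning rate come from environment variables, hyperparameter sweeps can be driven without editing this file; for example, from the launching process:

```python
import os

# Example overrides (set before the trainer starts); the batch size
# must still divide n_steps_per_env * n_envs.
os.environ["GYM_CPU_TRAIN_BATCH_SIZE"] = "1024"
os.environ["GYM_CPU_TRAIN_LR"] = "1e-4"
```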
@@ -596,7 +626,7 @@ class Train(Train_Base):
             sleep(server_warmup_sec)
             print("Servers started, creating environments...")

-            env = SubprocVecEnv([init_env(i, monitor=True) for i in range(n_envs)])
+            env = SubprocVecEnv([init_env(i, monitor=True) for i in range(n_envs)], start_method="spawn")
             # Use single-process eval env to avoid extra subprocess fragility during callback evaluation.
             eval_env = DummyVecEnv([init_env(n_envs, monitor=True)])
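On start_method="spawn": fork (the Linux default) duplicates the parent's memory, including live server sockets and threads, into every worker, while spawn starts each worker in a fresh interpreter, which is safer when every env opens its own server connection. One caveat, shown in the sketch below (make_env and the WalkEnv(rank) constructor are illustrative assumptions): spawn re-imports the module in each worker, so env construction must stay behind the __main__ guard and the factories must be picklable.

```python
from stable_baselines3.common.vec_env import SubprocVecEnv

def make_env(rank):
    def _init():
        # Illustrative: build one env per worker, e.g. with a unique port.
        return WalkEnv(rank)
    return _init

if __name__ == "__main__":
    env = SubprocVecEnv([make_env(i) for i in range(4)], start_method="spawn")
```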
@@ -633,7 +663,7 @@ class Train(Train_Base):
             )

             model_path = self.learn_model(model, total_steps, model_path, eval_env=eval_env,
-                                          eval_freq=n_steps_per_env * 20, save_freq=n_steps_per_env * 20, eval_eps=100,
+                                          eval_freq=n_steps_per_env * 20, save_freq=n_steps_per_env * 20, eval_eps=30,
                                          backup_env_file=__file__)
         except KeyboardInterrupt:
             sleep(1)  # wait for child processes
@@ -694,8 +724,8 @@ if __name__ == "__main__":
     run_mode = os.environ.get("GYM_CPU_MODE", "train").strip().lower()

     if run_mode == "test":
-        test_model_file = os.environ.get("GYM_CPU_TEST_MODEL", "scripts/gyms/logs/Walk_R0_004/best_model.zip")
-        test_folder = os.environ.get("GYM_CPU_TEST_FOLDER", "scripts/gyms/logs/Walk_R0_004/")
+        test_model_file = os.environ.get("GYM_CPU_TEST_MODEL", "scripts/gyms/logs/Turn_R0_004/best_model.zip")
+        test_folder = os.environ.get("GYM_CPU_TEST_FOLDER", "scripts/gyms/logs/Turn_R0_004/")
         trainer.test({"model_file": test_model_file, "folder_dir": test_folder})
     else:
         retrain_model = os.environ.get("GYM_CPU_TRAIN_MODEL", "").strip()
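The same script switches to evaluation via environment variables; with this commit's default paths, a test run can be configured from the launching process like so:

```python
import os

# Evaluate a saved checkpoint instead of training.
os.environ["GYM_CPU_MODE"] = "test"
os.environ["GYM_CPU_TEST_MODEL"] = "scripts/gyms/logs/Turn_R0_004/best_model.zip"
os.environ["GYM_CPU_TEST_FOLDER"] = "scripts/gyms/logs/Turn_R0_004/"
```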