train Walk and tackle the memory leak issue.

This commit is contained in:
xxh
2026-04-08 05:59:16 -04:00
parent 28e7eb0692
commit 87e5c6d931
5 changed files with 282 additions and 323 deletions

View File

@@ -24,36 +24,14 @@ CPU_QUOTA="$((CORES * UTIL_PERCENT))%"
MEMORY_MAX="${MEMORY_MAX:-0}"
# ------------------------------
# 训练运行参数(由 scripts/gyms/Walk.py 读取)
# 精简运行参数(由 scripts/gyms/Walk.py 读取)
# ------------------------------
# 运行模式train 或 test
# 仅保留最常用开关,避免超长环境变量命令。
GYM_CPU_MODE="${GYM_CPU_MODE:-train}"
# 并行环境数量:越大通常吞吐越高,但也更容易触发 OOM 或连接不稳定。
# 默认使用更稳妥的 12确认稳定后再升到 16/20。
GYM_CPU_N_ENVS="${GYM_CPU_N_ENVS:-12}"
# 服务器预热时间(秒):
# 在批量拉起 rcssserver 后等待一段时间,再创建 SubprocVecEnv
# 可降低 ConnectionReset/EOFError 概率。
GYM_CPU_SERVER_WARMUP_SEC="${GYM_CPU_SERVER_WARMUP_SEC:-10}"
# 训练专用参数
GYM_CPU_TRAIN_STEPS_PER_ENV="${GYM_CPU_TRAIN_STEPS_PER_ENV:-256}"
GYM_CPU_TRAIN_BATCH_SIZE="${GYM_CPU_TRAIN_BATCH_SIZE:-512}"
GYM_CPU_TRAIN_LR="${GYM_CPU_TRAIN_LR:-1e-4}"
GYM_CPU_TRAIN_ENT_COEF="${GYM_CPU_TRAIN_ENT_COEF:-0.03}"
GYM_CPU_TRAIN_CLIP_RANGE="${GYM_CPU_TRAIN_CLIP_RANGE:-0.13}"
GYM_CPU_TRAIN_GAMMA="${GYM_CPU_TRAIN_GAMMA:-0.95}"
GYM_CPU_TRAIN_EPOCHS="${GYM_CPU_TRAIN_EPOCHS:-5}"
GYM_CPU_TRAIN_STAGE="${GYM_CPU_TRAIN_STAGE:-walk}"
GYM_CPU_TRAIN_MODEL="${GYM_CPU_TRAIN_MODEL:-}"
# 测试专用参数
GYM_CPU_TEST_MODEL="${GYM_CPU_TEST_MODEL:-scripts/gyms/logs/Walk_R0_004/best_model.zip}"
GYM_CPU_TEST_FOLDER="${GYM_CPU_TEST_FOLDER:-scripts/gyms/logs/Walk_R0_004/}"
# 测试默认实时且显示画面:默认均为 0
# 设为 1 表示关闭对应能力
GYM_CPU_TEST_NO_RENDER="${GYM_CPU_TEST_NO_RENDER:-0}"
GYM_CPU_TEST_NO_REALTIME="${GYM_CPU_TEST_NO_REALTIME:-0}"
# Python 解释器选择策略:
# 1) 优先使用你手动传入的 PYTHON_BIN
@@ -93,7 +71,7 @@ SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
# 打印当前生效配置,方便排障和复现实验。
echo "Starting training with limits: CPU=${CPU_QUOTA}, Memory=${MEMORY_MAX}"
echo "Mode: ${GYM_CPU_MODE}"
echo "Runtime knobs: GYM_CPU_N_ENVS=${GYM_CPU_N_ENVS}, GYM_CPU_SERVER_WARMUP_SEC=${GYM_CPU_SERVER_WARMUP_SEC}"
echo "Run knobs: GYM_CPU_MODE=${GYM_CPU_MODE}, GYM_CPU_TRAIN_STAGE=${GYM_CPU_TRAIN_STAGE}"
echo "Using Python: ${PYTHON_EXEC}"
if [[ -n "${CONDA_DEFAULT_ENV:-}" ]]; then
echo "Detected conda env: ${CONDA_DEFAULT_ENV}"
@@ -118,19 +96,9 @@ systemd-run --user --scope \
"${SYSTEMD_PROPS[@]}" \
env \
GYM_CPU_MODE="${GYM_CPU_MODE}" \
GYM_CPU_N_ENVS="${GYM_CPU_N_ENVS}" \
GYM_CPU_SERVER_WARMUP_SEC="${GYM_CPU_SERVER_WARMUP_SEC}" \
GYM_CPU_TRAIN_STEPS_PER_ENV="${GYM_CPU_TRAIN_STEPS_PER_ENV}" \
GYM_CPU_TRAIN_BATCH_SIZE="${GYM_CPU_TRAIN_BATCH_SIZE}" \
GYM_CPU_TRAIN_LR="${GYM_CPU_TRAIN_LR}" \
GYM_CPU_TRAIN_ENT_COEF="${GYM_CPU_TRAIN_ENT_COEF}" \
GYM_CPU_TRAIN_CLIP_RANGE="${GYM_CPU_TRAIN_CLIP_RANGE}" \
GYM_CPU_TRAIN_GAMMA="${GYM_CPU_TRAIN_GAMMA}" \
GYM_CPU_TRAIN_EPOCHS="${GYM_CPU_TRAIN_EPOCHS}" \
GYM_CPU_TRAIN_STAGE="${GYM_CPU_TRAIN_STAGE}" \
GYM_CPU_TRAIN_MODEL="${GYM_CPU_TRAIN_MODEL}" \
GYM_CPU_TEST_MODEL="${GYM_CPU_TEST_MODEL}" \
GYM_CPU_TEST_FOLDER="${GYM_CPU_TEST_FOLDER}" \
GYM_CPU_TEST_NO_RENDER="${GYM_CPU_TEST_NO_RENDER}" \
GYM_CPU_TEST_NO_REALTIME="${GYM_CPU_TEST_NO_REALTIME}" \
"${PYTHON_EXEC}" "-m" "scripts.gyms.Walk"