From 1496c20b666ded29f202e08bb3c636ca2d6396ac Mon Sep 17 00:00:00 2001
From: weloMThreads <268572325+weloMThreads@users.noreply.github.com>
Date: Thu, 4 Jun 2026 10:42:30 +0800
Subject: [PATCH] ci: add current-only validation runner

---
 run_current_only_validation.sh | 709 +++++++++++++++++++++++++++++++++
 1 file changed, 709 insertions(+)
 create mode 100755 run_current_only_validation.sh

diff --git a/run_current_only_validation.sh b/run_current_only_validation.sh
new file mode 100755
index 00000000..5f64226d
--- /dev/null
+++ b/run_current_only_validation.sh
@@ -0,0 +1,709 @@
+#!/usr/bin/env bash
+
+# Current-only validation runner based on .github/workflows/pr-validation.yml.
+#
+# This script intentionally does not fetch, checkout, clean, or build source.
+# Build the current wheel yourself first, then run this script from the repo root.
+
+set -uo pipefail
+# Note: errexit (-e) is not enabled; the job functions below capture exit codes explicitly.
+REPO_ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+cd "$REPO_ROOT"
+# Tools and paths. Every ${VAR:-default} keeps a value already present in the environment.
+PYTHON_BIN="${PYTHON:-python3}"
+LOG_BASE="${LOG_BASE:-$REPO_ROOT/current_only_logs}"
+RUN_ID="${RUN_ID:-$(date +%Y%m%d-%H%M%S)}"
+LOG_ROOT="${LOG_ROOT:-$LOG_BASE/current/$RUN_ID}"
+MODEL_ROOT="${MODEL_ROOT:-/home/runner/tf_test_model_wheel}"
+WHEEL_PATH="${WHEEL_PATH:-}"
+# Durations handed to timeout(1) by the job bodies, plus the bd_model batch sizes and iterations.
+INTEGRATION_TIMEOUT="${INTEGRATION_TIMEOUT:-60m}"
+FUSION_TIMEOUT="${FUSION_TIMEOUT:-60m}"
+T_PERF_TIMEOUT="${T_PERF_TIMEOUT:-20m}"
+T_ACCURACY_TIMEOUT="${T_ACCURACY_TIMEOUT:-20m}"
+BD_MODEL_TIMEOUT="${BD_MODEL_TIMEOUT:-30m}"
+BD_MODEL_BS="${BD_MODEL_BS:-32,128,256,1024}"
+BD_MODEL_RUN_ITERS="${BD_MODEL_RUN_ITERS:-30}"
+# Settings for the onetrans_perf job (script and data live under $MODEL_ROOT/training/onetrans).
+ONETRANS_PERF_ROOT="${ONETRANS_PERF_ROOT:-$MODEL_ROOT/training/onetrans}"
+ONETRANS_PERF_SCRIPT_NAME="${ONETRANS_PERF_SCRIPT_NAME:-train_perf_bf16.py}"
+ONETRANS_PERF_DATA_DIR="${ONETRANS_PERF_DATA_DIR:-$ONETRANS_PERF_ROOT/data}"
+ONETRANS_PERF_WARMUP_STEPS="${ONETRANS_PERF_WARMUP_STEPS:-20}"
+ONETRANS_PERF_MEASURE_STEPS="${ONETRANS_PERF_MEASURE_STEPS:-50}"
+ONETRANS_PERF_TIMEOUT="${ONETRANS_PERF_TIMEOUT:-60m}"
+# Settings for the tokenlarge_perf job (same layout, under training/tokenmixer-large).
+TOKENLARGE_PERF_ROOT="${TOKENLARGE_PERF_ROOT:-$MODEL_ROOT/training/tokenmixer-large}"
+TOKENLARGE_PERF_SCRIPT_NAME="${TOKENLARGE_PERF_SCRIPT_NAME:-train_perf_bf16.py}"
+TOKENLARGE_PERF_DATA_DIR="${TOKENLARGE_PERF_DATA_DIR:-$TOKENLARGE_PERF_ROOT/data}"
+TOKENLARGE_PERF_WARMUP_STEPS="${TOKENLARGE_PERF_WARMUP_STEPS:-20}"
+TOKENLARGE_PERF_MEASURE_STEPS="${TOKENLARGE_PERF_MEASURE_STEPS:-50}"
+TOKENLARGE_PERF_TIMEOUT="${TOKENLARGE_PERF_TIMEOUT:-60m}"
+# Training job. Empty TRAINING_MODELS defers to TRAINING_MODE: "daily" picks the DAILY list, anything else the PR list.
+TRAINING_TIMEOUT="${TRAINING_TIMEOUT:-180m}"
+TRAINING_MODEL_TIMEOUT_SECONDS="${TRAINING_MODEL_TIMEOUT_SECONDS:-3600}"
+TRAINING_EPOCHS="${TRAINING_EPOCHS:-100}"
+TRAINING_MODE="${TRAINING_MODE:-pr-smoke}"
+PR_TRAINING_MODELS="rankmixer"
+DAILY_TRAINING_MODELS="rankmixer onetrans tokenmixer-large afm autoint bst ccpm dcn dcnmix deepfefm deepfm dien difm din dsin edcn esmm fgcnn fibinet flen fnn fwfm ifm mlr mmoe nfm onn ple pnn wdl wukong xdeepfm"
+TRAINING_MODELS="${TRAINING_MODELS:-}"
+# Job selection: CURRENT_ONLY_JOBS is a whitespace-separated list (or "all"); FAIL_FAST=1 stops at the first failing job.
+ALL_CURRENT_ONLY_JOBS="integration fusion t_accuracy t_perf bd_model1 bd_model2 bd_model3 onetrans_perf tokenlarge_perf training"
+DEFAULT_CURRENT_ONLY_JOBS="$ALL_CURRENT_ONLY_JOBS"
+CURRENT_ONLY_JOBS="${CURRENT_ONLY_JOBS:-$DEFAULT_CURRENT_ONLY_JOBS}"
+FAIL_FAST="${FAIL_FAST:-0}"
+# Extended-regex log markers that job_integration and job_training treat as a failed result, and the minimum integration test count.
+EMPTY_TEST_RESULT_PATTERN="${EMPTY_TEST_RESULT_PATTERN:-No tests found|No test files found|Total Tests:[[:space:]]*0|Total Tests[[:space:]]*\\|[[:space:]]*0|no average_time_summary entries|report path missing|log not found}"
+MIN_INTEGRATION_TESTS="${MIN_INTEGRATION_TESTS:-500}"
+# NOTE(review): the log tree is created here, before the -h/--help check further down runs.
+mkdir -p "$LOG_ROOT/summaries"
+# Rows of "name|status|detail", appended by append_summary and rendered by write_summary_file.
+declare -a SUMMARY_ROWS=()
+
+usage() {  # Print the help text (wheel lookup order, common environment variables, supported jobs) to stdout.
+  cat <<'USAGE'
+Usage:
+  ./run_current_only_validation.sh
+
+Required before running:
+  Build the current wheel yourself. The script installs the wheel from:
+    1. WHEEL_PATH, if set
+    2. ./dist/tensorflow_musa-*.whl, latest by name
+
+Common environment variables:
+  WHEEL_PATH=/path/to/tensorflow_musa-*.whl
+  MODEL_ROOT=/home/runner/tf_test_model_wheel
+  GPU_LOCK_FILE=/tmp/tensorflow_musa_gpu.lock
+  CURRENT_ONLY_JOBS="integration t_accuracy t_perf bd_model1"
+  CURRENT_ONLY_JOBS=all
+  FAIL_FAST=1
+
+Supported jobs:
+  integration
+  fusion
+  t_accuracy
+  t_perf
+  bd_model1
+  bd_model2
+  bd_model3
+  onetrans_perf
+  tokenlarge_perf
+  training
+
+By default this script runs all supported current-only jobs.
+This script does not fetch, checkout, clean, build, or run baseline.
+USAGE
+}
+
+# Show the help text and exit when the first argument is -h or --help.
+case "${1:-}" in
+  -h | --help) usage; exit 0 ;;
+esac
+
+log_info() {  # Write "[INFO] $*" followed by a newline to stdout.
+  printf '%s\n' "[INFO] $*"
+}
+
+log_error() {  # Write "[ERROR] $*" followed by a newline to stderr.
+  printf '%s\n' "[ERROR] $*" >&2
+}
+
+append_summary() {
+  # Queue one "name|status|detail" row for the final summary table; $3 (detail) is optional.
+  local job_name="$1" job_status="$2"
+  local job_detail="${3:-}"
+  SUMMARY_ROWS+=("${job_name}|${job_status}|${job_detail}")
+}
+
+resolve_wheel_path() {
+  # Print the wheel to install: WHEEL_PATH when set, else the newest dist/tensorflow_musa-*.whl.
+  # "Newest" uses a version sort (sort -V); a lexical sort would rank 0.9.0 above 0.10.0.
+  # Returns 1, after logging the reason, when WHEEL_PATH is not a file or no wheel is found.
+  local wheel="$WHEEL_PATH"
+  if [[ -n "$wheel" && ! -f "$wheel" ]]; then
+    log_error "WHEEL_PATH does not exist: $wheel"
+    return 1
+  fi
+  if [[ -z "$wheel" ]]; then
+    wheel="$(find "$REPO_ROOT/dist" -maxdepth 1 -type f -name 'tensorflow_musa-*.whl' 2>/dev/null | sort -V | tail -n 1 || true)"
+    if [[ -z "$wheel" ]]; then
+      log_error "No wheel found. Set WHEEL_PATH or put tensorflow_musa-*.whl under $REPO_ROOT/dist."
+      return 1
+    fi
+  fi
+  printf '%s\n' "$wheel"
+}
+
+run_logged() {
+  # Run a command with stdout and stderr mirrored to the console and to a log file.
+  # Arguments: $1 - display name, $2 - log file path, $3... - the command to run.
+  # Returns the command's own exit status.
+  local label="$1" log_path="$2"
+  shift 2
+  mkdir -p "$(dirname "$log_path")"
+  log_info "Running $label"
+  log_info "Log: $log_path"
+
+  # PIPESTATUS[0] is the command's status; index 1 would be tee's.
+  "$@" 2>&1 | tee "$log_path"
+  local rc=${PIPESTATUS[0]}
+  case "$rc" in
+    0) log_info "$label: success" ;;
+    *) log_error "$label: failure, exit_code=$rc" ;;
+  esac
+  return "$rc"
+}
+
+run_with_optional_gpu_lock() {
+  # Run "$@" under an exclusive flock on GPU_LOCK_FILE when that variable is set, unlocked otherwise.
+  # Returns the command's status, or 1 without running it when the lock cannot be acquired.
+  if [[ -z "${GPU_LOCK_FILE:-}" ]]; then
+    echo "GPU_LOCK_FILE is not set; running without a global GPU lock."
+    "$@"; return
+  fi
+  mkdir -p "$(dirname "$GPU_LOCK_FILE")"
+  (  # Subshell: fd 9, and the lock held through it, is closed when the subshell exits.
+    { exec 9>"$GPU_LOCK_FILE" && flock 9; } || { echo "Failed to acquire GPU test lock: $GPU_LOCK_FILE" >&2; exit 1; }
+    echo "Acquired GPU test lock: $GPU_LOCK_FILE"
+    "$@"
+  )
+}
+
+install_current_wheel_body() {  # Install wheel $1 with pip, then import tensorflow_musa and print its package directory.
+  local wheel="$1"
+  # Reinstall even if already present and skip dependencies; a pip failure is returned as-is.
+  "$PYTHON_BIN" -m pip install "$wheel" --no-deps --force-reinstall || return $?
+  "$PYTHON_BIN" - <<'PY'
+import pathlib
+import tensorflow_musa
+
+package_root = pathlib.Path(tensorflow_musa.__file__).resolve().parent
+print(f'tensorflow_musa wheel: {package_root}')
+PY
+}
+
+install_current_wheel() {
+  # Install the wheel at $1 through run_logged, logging to $LOG_ROOT/install/install_current_wheel.log.
+  local wheel_file="$1"
+  local install_log="$LOG_ROOT/install/install_current_wheel.log"
+  run_logged "install current wheel" "$install_log" install_current_wheel_body "$wheel_file"
+}
+
+job_integration_body() {  # Run test_runner.py --quiet from the repo's test/ directory under INTEGRATION_TIMEOUT.
+  cd "$REPO_ROOT/test" || return
+  timeout "$INTEGRATION_TIMEOUT" "$PYTHON_BIN" test_runner.py --quiet
+}
+
+job_integration() {
+  # Run the integration suite, then gate on its log: the job fails on a non-zero exit, a missing
+  # log, an empty-result marker, or fewer than MIN_INTEGRATION_TESTS reported tests.
+  local log_file="$LOG_ROOT/integration/integration_current.log"
+  local rc=0 detail="log-missing" count="" verdict="failure"
+  run_logged "integration current" "$log_file" job_integration_body || rc=$?
+  if [[ ! -f "$log_file" ]]; then
+    rc=1
+  else
+    # The last "Total Tests | N" or "Total Tests: N" line wins; keep only its trailing number.
+    count="$(grep -E "Total Tests[[:space:]]*(\\||:)[[:space:]]*[0-9]+" "$log_file" | tail -1 | grep -oE "[0-9]+" | tail -1 || true)"
+    detail="total_tests=${count:-n/a}"
+    if grep -Eq "$EMPTY_TEST_RESULT_PATTERN" "$log_file"; then
+      rc=1
+      detail+=", empty-result"
+    elif [[ -z "$count" || "$count" -lt "$MIN_INTEGRATION_TESTS" ]]; then
+      rc=1
+      detail+=", less-than-min-$MIN_INTEGRATION_TESTS"
+    fi
+  fi
+
+  [[ "$rc" != "0" ]] || verdict="success"
+  append_summary "integration" "$verdict" "$detail"
+  return "$rc"
+}
+
+job_fusion_body() {  # Run test_runner.py --fusion --quiet from the repo's test/ directory under FUSION_TIMEOUT.
+  cd "$REPO_ROOT/test" || return
+  timeout "$FUSION_TIMEOUT" "$PYTHON_BIN" test_runner.py --fusion --quiet
+}
+
+job_fusion() {
+  # Run the fusion suite; pass/fail is the run's exit status and the summary row points at the log.
+  local fusion_log="$LOG_ROOT/fusion/fusion_current.log" rc=0 verdict="success"
+  run_logged "fusion current" "$fusion_log" job_fusion_body || { rc=$?; verdict="failure"; }
+  append_summary "fusion" "$verdict" "log=$fusion_log"
+  return "$rc"
+}
+
+job_t_perf_body() {
+  # Invoke run_inference.py from the prunedGraph model directory on the musa device
+  # (batch size 100, 1000 inference iterations), bounded by T_PERF_TIMEOUT.
+  cd "$MODEL_ROOT/inference/prunedGraph" || return
+  local -a infer_args=(--device musa --batch-size 100 --infer-iters 1000)
+  timeout "$T_PERF_TIMEOUT" "$PYTHON_BIN" run_inference.py "${infer_args[@]}"
+}
+
+parse_chinese_average_ms() {
+  # Print the last "<U+5E73><U+5747>: <number> ms" figure in log file $1, or a blank line; never fails.
+  "$PYTHON_BIN" - "$1" <<'PY' || true
+import re
+import sys
+from pathlib import Path
+
+log_text = Path(sys.argv[1]).read_text(encoding="utf-8", errors="ignore")
+found = re.findall(r"\u5e73\u5747:\s*([0-9]+(?:\.[0-9]+)?)\s*ms", log_text)
+print(found[-1] if found else "")
+PY
+}
+
+job_t_perf() {
+  # Run the T inference benchmark (under the optional GPU lock) and report its average latency.
+  # A zero exit with no parsable average in the log is downgraded to a failure.
+  local perf_log="$LOG_ROOT/t-perf/t_perf_current.log"
+  local rc=0 avg_ms="" verdict="failure"
+  run_logged "T performance current" "$perf_log" run_with_optional_gpu_lock job_t_perf_body || rc=$?
+  if [[ "$rc" == "0" && -f "$perf_log" ]]; then
+    avg_ms="$(parse_chinese_average_ms "$perf_log")"
+    if [[ -z "$avg_ms" ]]; then rc=1; fi
+  fi
+  [[ "$rc" != "0" ]] || verdict="success"
+  append_summary "t_perf" "$verdict" "current_ms=${avg_ms:-n/a}"
+  return "$rc"
+}
+
+job_t_accuracy_body() {
+  # Invoke run_inference.py with --check-acc on the musa device (batch size 100, rtol/atol 1e-2),
+  # passing TF_ENABLE_ONEDNN_OPTS=0 and MUSA_ENABLE_TF32=0 to that process only.
+  cd "$MODEL_ROOT/inference/prunedGraph" || return
+  local -a acc_args=(
+    --device musa --batch-size 100
+    --check-acc --rtol 1e-2 --atol 1e-2
+  )
+  TF_ENABLE_ONEDNN_OPTS=0 MUSA_ENABLE_TF32=0 \
+    timeout "$T_ACCURACY_TIMEOUT" "$PYTHON_BIN" run_inference.py "${acc_args[@]}"
+}
+
+job_t_accuracy() {
+  # Run the T accuracy check (under the optional GPU lock). Success requires a zero exit AND
+  # that the last PASSED/FAILED marker found in the log is PASSED.
+  local acc_log="$LOG_ROOT/t-accuracy/t_accuracy_current.log"
+  local rc=0 marker="n/a" verdict="failure"
+  run_logged "T accuracy current" "$acc_log" run_with_optional_gpu_lock job_t_accuracy_body || rc=$?
+
+  if [[ -f "$acc_log" ]]; then
+    marker="$(grep -E "PASSED|FAILED" "$acc_log" | tail -1 | grep -oE "PASSED|FAILED" || echo "n/a")"
+  fi
+  [[ "$marker" == "PASSED" ]] || rc=1
+
+  [[ "$rc" != "0" ]] || verdict="success"
+  append_summary "t_accuracy" "$verdict" "current=$marker"
+  return "$rc"
+}
+
+bd_report_summary() {
+  # Summarise the JSON report named by the last "[OK] report=<path>" line of log file $1.
+  # Prints "bs<N>=<ms>ms" (or "bs<N>=n/a") per average_time_summary entry and returns 0 only
+  # when every entry is usable; prints "report=n/a" and returns 1 when no report file is found.
+  local log_file="$1" marker_line="" report_path="" prefix='[OK] report='
+  if [[ -f "$log_file" ]]; then
+    marker_line="$(grep '^\[OK\] report=' "$log_file" | tail -1 || true)"
+    report_path="${marker_line#"$prefix"}"
+  fi
+  if [[ -z "$report_path" || ! -f "$report_path" ]]; then
+    printf 'report=n/a'
+    return 1
+  fi
+
+  "$PYTHON_BIN" - "$report_path" <<'PY'
+import json
+import sys
+
+with open(sys.argv[1], "r", encoding="utf-8") as handle:
+    entries = json.load(handle).get("average_time_summary", [])
+
+fields = []
+all_ok = True
+for entry in entries:
+    batch = entry.get("batch_size")
+    ms = entry.get("trimmed_avg_ms")
+    if ms is None:
+        ms = entry.get("average_time_ms")
+    if entry.get("status") == "ok" and ms is not None:
+        fields.append(f"bs{batch}={float(ms):.4f}ms")
+    else:
+        all_ok = False
+        fields.append(f"bs{batch}=n/a")
+
+print(", ".join(fields) if fields else "report-empty")
+sys.exit(0 if all_ok and fields else 1)
+PY
+}
+
+bd_model_body() {
+  # Run musa_run_pb_graph.py for meta-graph model $1, writing results under $2/current-out.
+  # Returns 1 when the model's .spec file is missing, otherwise the status of the timed run.
+  local model_id="$1" job_dir="$2"
+  local graph_root="$MODEL_ROOT/inference/metaGraph"
+  local spec_path="$graph_root/meta_graph/meta_graph_${model_id}.spec"
+  [[ -f "$spec_path" ]] || { echo "Spec file not found: $spec_path"; return 1; }
+
+  cd "$graph_root" || return
+  local -a graph_args=(
+    --spec "$spec_path"
+    --bs "$BD_MODEL_BS"
+    --run_iters "$BD_MODEL_RUN_ITERS"
+    --out_root "$job_dir/current-out"
+  )
+  # Both MUSA_PINNED_* variables are set for this one process only.
+  MUSA_PINNED_FEED=1 MUSA_PINNED_H2D_ON_COMPUTE_STREAM=1 \
+    timeout "$BD_MODEL_TIMEOUT" "$PYTHON_BIN" musa_run_pb_graph.py "${graph_args[@]}"
+}
+
+job_bd_model() {
+  # Benchmark bd model $1 (under the optional GPU lock) into a fresh current-out directory,
+  # then summarise its report; a failed run or an unusable report marks the job as failed.
+  local model_id="$1"
+  local job_name="bd_model${model_id}" job_dir="$LOG_ROOT/bd-model${model_id}"
+  local log_file="$job_dir/${job_name}_current.log"
+  local rc=0 detail="" verdict="failure"
+
+  mkdir -p "$job_dir"
+  rm -rf "$job_dir/current-out"
+  run_logged "$job_name current" "$log_file" run_with_optional_gpu_lock bd_model_body "$model_id" "$job_dir" || rc=$?
+  detail="$(bd_report_summary "$log_file")" || rc=1
+
+  [[ "$rc" != "0" ]] || verdict="success"
+  append_summary "$job_name" "$verdict" "$detail"
+  return "$rc"
+}
+
+find_first_npz() {
+  # Print the .npz data file to use, or nothing when none is available.
+  # Arguments: $1 - preferred file (may be empty), $2 - directory searched as a fallback.
+  # The fallback is the first *.npz in sort order found with find -maxdepth 2 under $2.
+  local preferred_file="$1" data_dir="$2"
+
+  if [[ -n "$preferred_file" && -f "$preferred_file" ]]; then
+    printf '%s\n' "$preferred_file"
+  elif [[ -d "$data_dir" ]]; then
+    find "$data_dir" -maxdepth 2 -type f -name '*.npz' \
+      | sort | head -n 1
+  fi
+}
+
+training_perf_body() {
+  # Run one training-performance script for warmup + measured steps under a timeout.
+  # Arguments: $1 - model root, $2 - script name inside the root, $3 - data directory,
+  #   $4 - explicit data file (may be empty), $5 - warmup steps, $6 - measured steps,
+  #   $7 - timeout(1) duration.
+  # Returns 1 when the root, its model/ and data/ directories, the script, or an .npz data
+  # file is missing; otherwise the status of the timed Python run.
+  local root="$1" script_name="$2" data_dir="$3" explicit_data_file="$4"
+  local warmup_steps="$5" measure_steps="$6" timeout_value="$7"
+  local script_path="$root/$script_name" data_file="" problem=""
+
+  if [[ ! -d "$root" ]]; then
+    problem="Training performance root not found: $root"
+  elif [[ ! -d "$root/model" || ! -d "$root/data" ]]; then
+    problem="Training performance model/data directories not found under: $root"
+  elif [[ ! -f "$script_path" ]]; then
+    problem="Training performance script missing: $script_path"
+  else
+    data_file="$(find_first_npz "$explicit_data_file" "$data_dir")"
+    if [[ -z "$data_file" || ! -f "$data_file" ]]; then
+      problem="Training performance data npz not found under: $data_dir"
+    fi
+  fi
+  if [[ -n "$problem" ]]; then
+    echo "$problem"
+    return 1
+  fi
+
+  cd "$root" || return
+  echo "Script: $script_path"
+  echo "Data file: $data_file"
+  # --max_steps is the sum of the warmup and measured step counts.
+  local -a perf_args=(
+    --data_path "$data_file"
+    --warmup_steps "$warmup_steps"
+    --measure_steps "$measure_steps"
+    --max_steps "$((warmup_steps + measure_steps))"
+  )
+  timeout "$timeout_value" "$PYTHON_BIN" -u "$script_path" "${perf_args[@]}"
+}
+
+parse_average_full_step_ms() {
+  # Print the last "Average ms/full step: <number>" figure in log file $1, or a blank line; never fails.
+  "$PYTHON_BIN" - "$1" <<'PY' || true
+import re
+import sys
+from pathlib import Path
+
+log_text = Path(sys.argv[1]).read_text(encoding="utf-8", errors="ignore")
+found = re.findall(r"Average ms/full step:\s*([0-9]+(?:\.[0-9]+)?)", log_text)
+print(found[-1] if found else "")
+PY
+}
+
+job_onetrans_perf() {
+  # Run the OneTrans training benchmark (under the optional GPU lock) and report its
+  # "Average ms/full step" figure; a clean exit without that figure counts as a failure.
+  local perf_log="$LOG_ROOT/onetrans-perf/onetrans_perf_current.log"
+  local rc=0 step_ms="" verdict="failure"
+  local -a perf_call=(
+    training_perf_body "$ONETRANS_PERF_ROOT" "$ONETRANS_PERF_SCRIPT_NAME"
+    "$ONETRANS_PERF_DATA_DIR" "${ONETRANS_PERF_DATA_FILE:-}"
+    "$ONETRANS_PERF_WARMUP_STEPS" "$ONETRANS_PERF_MEASURE_STEPS" "$ONETRANS_PERF_TIMEOUT"
+  )
+
+  run_logged "OneTrans performance current" "$perf_log" run_with_optional_gpu_lock "${perf_call[@]}" || rc=$?
+  if [[ "$rc" == "0" && -f "$perf_log" ]]; then
+    step_ms="$(parse_average_full_step_ms "$perf_log")"
+    if [[ -z "$step_ms" ]]; then rc=1; fi
+  fi
+
+  [[ "$rc" != "0" ]] || verdict="success"
+  append_summary "onetrans_perf" "$verdict" "current_ms=${step_ms:-n/a}"
+  return "$rc"
+}
+
+job_tokenlarge_perf() {
+  # Run the TokenMixer Large training benchmark (under the optional GPU lock) and report its
+  # "Average ms/full step" figure; a clean exit without that figure counts as a failure.
+  local perf_log="$LOG_ROOT/tokenlarge-perf/tokenlarge_perf_current.log"
+  local rc=0 step_ms="" verdict="failure"
+  local -a perf_call=(
+    training_perf_body "$TOKENLARGE_PERF_ROOT" "$TOKENLARGE_PERF_SCRIPT_NAME"
+    "$TOKENLARGE_PERF_DATA_DIR" "${TOKENLARGE_PERF_DATA_FILE:-}"
+    "$TOKENLARGE_PERF_WARMUP_STEPS" "$TOKENLARGE_PERF_MEASURE_STEPS" "$TOKENLARGE_PERF_TIMEOUT"
+  )
+
+  run_logged "TokenMixer Large performance current" "$perf_log" run_with_optional_gpu_lock "${perf_call[@]}" || rc=$?
+  if [[ "$rc" == "0" && -f "$perf_log" ]]; then
+    step_ms="$(parse_average_full_step_ms "$perf_log")"
+    if [[ -z "$step_ms" ]]; then rc=1; fi
+  fi
+
+  [[ "$rc" != "0" ]] || verdict="success"
+  append_summary "tokenlarge_perf" "$verdict" "current_ms=${step_ms:-n/a}"
+  return "$rc"
+}
+
+job_training_body() {
+  # Run the training test runner for the selected models through a generated Python wrapper
+  # that streams each captured model command's output live and enforces a per-model timeout.
+  # Models: TRAINING_MODELS if set, else DAILY_TRAINING_MODELS when TRAINING_MODE is "daily",
+  # else PR_TRAINING_MODELS. Returns 1 if $MODEL_ROOT/training is missing, else the run's status.
+  local job_dir="$LOG_ROOT/training"
+  local error_log_dir="$job_dir/error_logs" live_wrapper="$job_dir/run_training_live.py"
+  local selected_models="$TRAINING_MODELS"
+  local -a training_models=()
+  if [[ -z "$selected_models" && "$TRAINING_MODE" == "daily" ]]; then
+    selected_models="$DAILY_TRAINING_MODELS"
+  elif [[ -z "$selected_models" ]]; then
+    selected_models="$PR_TRAINING_MODELS"
+  fi
+  if [[ ! -d "$MODEL_ROOT/training" ]]; then
+    echo "Training root not found: $MODEL_ROOT/training"
+    return 1
+  fi
+
+  mkdir -p "$job_dir" "$error_log_dir"
+  cat > "$live_wrapper" <<'PY'
+import os
+import selectors
+import subprocess
+import sys
+import time
+
+sys.path.insert(0, os.getcwd())
+import run_all_training_tests as runner
+
+_real_run = subprocess.run
+
+# Stand-in for subprocess.run: capture_output calls are echoed live and bounded by the per-model timeout.
+def run_live(cmd, capture_output=False, text=False, cwd=None, env=None, timeout=None, **kwargs):
+    if not capture_output:
+        return _real_run(cmd, capture_output=capture_output, text=text, cwd=cwd, env=env, timeout=timeout, **kwargs)
+    # Timeout source: the env passed by the caller, or this process's environment when env is None.
+    effective_timeout = int((env if env is not None else os.environ).get("TRAINING_MODEL_TIMEOUT_SECONDS", "3600"))
+    proc = subprocess.Popen(
+        cmd,
+        stdout=subprocess.PIPE,
+        stderr=subprocess.STDOUT,
+        text=True,
+        bufsize=1,
+        cwd=cwd,
+        env=env,
+    )
+
+    lines = []
+    start = time.monotonic()
+    selector = selectors.DefaultSelector()
+    selector.register(proc.stdout, selectors.EVENT_READ)
+
+    try:
+        while True:
+            for key, _ in selector.select(timeout=0.2):
+                line = key.fileobj.readline()
+                if line:
+                    lines.append(line)
+                    sys.stdout.write(line)
+                    sys.stdout.flush()
+
+            if proc.poll() is not None:
+                rest = proc.stdout.read()
+                if rest:
+                    lines.append(rest)
+                    sys.stdout.write(rest)
+                    sys.stdout.flush()
+                break
+            # On timeout: kill the child and return a result carrying exit status 124.
+            if effective_timeout > 0 and time.monotonic() - start > effective_timeout:
+                proc.kill()
+                proc.wait()
+                timeout_msg = f"\n[ERROR] Model command timeout (>{effective_timeout}s)\n"
+                lines.append(timeout_msg)
+                sys.stdout.write(timeout_msg)
+                sys.stdout.flush()
+                return subprocess.CompletedProcess(cmd, 124, stdout="".join(lines), stderr="")
+    finally:
+        selector.close()
+        if proc.stdout:
+            proc.stdout.close()
+
+    return subprocess.CompletedProcess(cmd, proc.returncode, stdout="".join(lines), stderr="")
+
+
+runner.subprocess.run = run_live
+sys.exit(runner.main())
+PY
+
+  cd "$MODEL_ROOT/training" || return $?
+  read -r -a training_models <<< "$selected_models"
+  echo "Training mode: $TRAINING_MODE"
+  echo "Selected models (${#training_models[@]}): $selected_models"
+  # TRAINING_MODEL_TIMEOUT_SECONDS is placed in the wrapper's environment for the per-model limit.
+  TRAINING_MODEL_TIMEOUT_SECONDS="$TRAINING_MODEL_TIMEOUT_SECONDS" \
+  PYTHONUNBUFFERED=1 \
+  timeout "$TRAINING_TIMEOUT" "$PYTHON_BIN" -u "$live_wrapper" \
+    --epochs "$TRAINING_EPOCHS" \
+    --log-dir "$error_log_dir" \
+    --models "${training_models[@]}"
+}
+
+job_training() {
+  # Run the training job (under the optional GPU lock). Beyond a non-zero exit, the job fails
+  # when the log is missing, matches EMPTY_TEST_RESULT_PATTERN, or has a line starting "[FAIL]".
+  local train_log="$LOG_ROOT/training/training_tests_current.log"
+  local rc=0 verdict="failure" detail="log=$train_log"
+  run_logged "training current" "$train_log" run_with_optional_gpu_lock job_training_body || rc=$?
+  if [[ ! -f "$train_log" ]]; then
+    rc=1
+    detail="log-missing"
+  elif grep -Eq "$EMPTY_TEST_RESULT_PATTERN" "$train_log" || grep -q '^\[FAIL\]' "$train_log"; then
+    rc=1
+    detail="failure-marker"
+  fi
+  [[ "$rc" != "0" ]] || verdict="success"
+  append_summary "training" "$verdict" "$detail"
+  return "$rc"
+}
+
+write_summary_file() {
+  # Render the run header and one table row per SUMMARY_ROWS entry as Markdown, echoing it to
+  # stdout and saving it to $LOG_ROOT/summaries/current-only-summary.md.
+  local summary_file="$LOG_ROOT/summaries/current-only-summary.md"
+  local row name status detail
+  {
+    printf '%s\n' \
+      "## Current-only Validation" "" \
+      "- Repo: $REPO_ROOT" "- Log root: $LOG_ROOT" "- Model root: $MODEL_ROOT" \
+      "- Jobs: $CURRENT_ONLY_JOBS" \
+      "- Wheel: ${RESOLVED_WHEEL_PATH:-n/a}" "- GPU lock: ${GPU_LOCK_FILE:-not set}" "" \
+      "| Job | Status | Detail |" \
+      "|-----|--------|--------|"
+    # The ${arr[@]+...} guard keeps an empty SUMMARY_ROWS from tripping "set -u" on bash < 4.4,
+    # where expanding an array with no elements counts as an unbound variable.
+    for row in ${SUMMARY_ROWS[@]+"${SUMMARY_ROWS[@]}"}; do
+      IFS='|' read -r name status detail <<< "$row"
+      echo "| $name | $status | $detail |"
+    done
+  } | tee "$summary_file"
+}
+
+run_job_by_name() {
+  # Dispatch one job name from CURRENT_ONLY_JOBS to its job_* function and return that
+  # function's status. An unknown name is logged, recorded as a failed summary row (so the
+  # summary never omits a failure that fails the run), and reported with status 1.
+  local job="$1"
+  case "$job" in
+    integration)
+      job_integration
+      ;;
+    fusion)
+      job_fusion
+      ;;
+    t_accuracy)
+      job_t_accuracy
+      ;;
+    t_perf)
+      job_t_perf
+      ;;
+    bd_model[1-3])
+      # The numeric suffix is the model id passed to job_bd_model.
+      job_bd_model "${job#bd_model}"
+      ;;
+    onetrans_perf)
+      job_onetrans_perf
+      ;;
+    tokenlarge_perf)
+      job_tokenlarge_perf
+      ;;
+    training)
+      job_training
+      ;;
+    *)
+      # The job_* functions add their own summary row; do the same for a name that matches none.
+      log_error "Unknown job: $job"
+      append_summary "$job" "failure" "unknown-job"
+      return 1
+      ;;
+  esac
+}
+
+main() {
+  # Entry point: resolve and install the wheel, run every job named in CURRENT_ONLY_JOBS
+  # (the literal value "all" selects ALL_CURRENT_ONLY_JOBS), then write the summary.
+  # FAIL_FAST=1 stops at the first failing job. Returns 0 only if every job succeeded;
+  # exits 1 when no job is selected or the wheel cannot be resolved or installed.
+  local any_failed=0 job
+  local -a jobs=()
+  [[ "$CURRENT_ONLY_JOBS" != "all" ]] || CURRENT_ONLY_JOBS="$ALL_CURRENT_ONLY_JOBS"
+  # Split on whitespace into an array: unlike an unquoted expansion this never globs.
+  # With -d '' read consumes all lines and returns non-zero at end of input, hence "|| true".
+  read -r -d '' -a jobs <<< "$CURRENT_ONLY_JOBS" || true
+  if (( ${#jobs[@]} == 0 )); then
+    log_error "No jobs selected; CURRENT_ONLY_JOBS is blank."
+    exit 1
+  fi
+  RESOLVED_WHEEL_PATH="$(resolve_wheel_path)" || exit 1
+  log_info "Repo: $REPO_ROOT"
+  log_info "Log root: $LOG_ROOT"
+  log_info "Wheel: $RESOLVED_WHEEL_PATH"
+  log_info "Jobs: $CURRENT_ONLY_JOBS"
+  install_current_wheel "$RESOLVED_WHEEL_PATH" || exit 1
+
+  for job in "${jobs[@]}"; do
+    run_job_by_name "$job" && continue
+    any_failed=1
+    [[ "$FAIL_FAST" != "1" ]] || break
+  done
+  write_summary_file
+
+  if [[ "$any_failed" == "0" ]]; then
+    log_info "Current-only validation finished successfully."
+  else
+    log_error "Current-only validation finished with failures."
+  fi
+  return "$any_failed"
+}
+
+main "$@"