chore: update benchmark execution script parameters and paths

2026-05-17 09:16:03 +01:00
parent 451723bab8
commit 2a4cb50b52
1 changed files with 236 additions and 17 deletions
@@ -5,7 +5,198 @@ MODEL_DIR="$(realpath ~/models)"
 RESULTDIR="results"
 mkdir -p "$RESULTDIR"
 # ═══════════════════════════════════════════════════════════════════════════════
 # OOM Recovery System
 # ═══════════════════════════════════════════════════════════════════════════════
 #
 # When llama-bench gets OOM-killed, the Linux OOM killer can also kill
 # systemd --user (same cgroup), which breaks podman's cgroup management.
 # All subsequent toolbox commands silently fail with:
 #   "Error: unable to find user <user>: no matching entries in passwd file"
 #
 # Recovery requires:
 #   1. sudo systemctl restart user@<uid>   (restart dead systemd --user)
 #   2. podman stop --all                    (clean up zombie containers)
 #
 # For unattended runs, add to /etc/sudoers.d/toolbox-recovery:
 #   <user> ALL=(ALL) NOPASSWD: /usr/bin/systemctl restart user@<uid>
 # ═══════════════════════════════════════════════════════════════════════════════
 RECOVERY_WAIT_SECS=3
 MAX_RECOVERY_ATTEMPTS=2
 _recovery_count=0
 # --- Container / Backend Configuration ---
 declare -A CONTAINERS=(
  [rocm6_4_4]="llama-rocm-6.4.4"
  [rocm-7_2_3]="llama-rocm-7.2.3"
  [rocm7-nightlies]="llama-rocm7-nightlies"
  [vulkan_amdvlk]="llama-vulkan-amdvlk"
  [vulkan_radv]="llama-vulkan-radv"
 )
 declare -A BENCH_BINS=(
  [rocm6_4_4]="/usr/local/bin/llama-bench"
  [rocm-7_2_3]="/usr/local/bin/llama-bench"
  [rocm7-nightlies]="/usr/local/bin/llama-bench"
  [vulkan_amdvlk]="/usr/sbin/llama-bench"
  [vulkan_radv]="/usr/sbin/llama-bench"
 )
 # --- Health Check & Recovery Functions ---
 # Check if systemd --user is alive
 check_systemd_user() {
  systemctl --user status &>/dev/null
 }
 # Check if a specific toolbox container is functional
 check_toolbox_health() {
  local container="$1"
  local output
  output=$(toolbox run -c "$container" echo "health_ok" 2>&1)
  [[ "$output" == *"health_ok"* ]]
 }
 # Recover from OOM-induced toolbox/podman failure
 # Returns 0 on success, 1 on failure
 recover_toolbox_system() {
  (( _recovery_count++ ))
  if (( _recovery_count > MAX_RECOVERY_ATTEMPTS )); then
    echo "  ✖ Max recovery attempts ($MAX_RECOVERY_ATTEMPTS) exceeded, giving up"
    echo "  → Manual fix: sudo systemctl restart user@$(id -u) && podman stop --all"
    return 1
  fi
  echo ""
  echo "🔧 ═══════════════════════════════════════════════════════════════"
  echo "🔧  Toolbox/Podman recovery (attempt ${_recovery_count}/${MAX_RECOVERY_ATTEMPTS})"
  echo "🔧 ═══════════════════════════════════════════════════════════════"
  # Step 1: Restart systemd --user if dead
  if ! check_systemd_user; then
    echo "  → systemd --user is dead, restarting..."
    if sudo systemctl restart "user@$(id -u)"; then
      echo "  ✔ systemd --user restarted"
      sleep "$RECOVERY_WAIT_SECS"
    else
      echo "  ✖ Failed to restart systemd --user"
      echo "  → Ensure passwordless sudo is configured (see header comments)"
      return 1
    fi
  else
    echo "  → systemd --user is alive"
  fi
  # Step 2: Stop all zombie containers
  echo "  → Stopping all zombie containers..."
  podman stop --all 2>/dev/null
  sleep 1
  # Step 3: Verify systemd is alive after container cleanup
  if ! check_systemd_user; then
    echo "  ✖ systemd --user died again after container cleanup"
    return 1
  fi
  echo "  ✔ Recovery complete"
  echo ""
  return 0
 }
 # Pre-flight checks before starting benchmarks
 preflight_check() {
  echo "🔍 Pre-flight checks"
  echo "───────────────────────────────────────────────────"
  # Check sudo access for recovery
  if sudo -n true &>/dev/null; then
    echo "  ✔ Passwordless sudo available for auto-recovery"
  else
    echo "  ⚠ Passwordless sudo NOT available"
    echo "    → Auto-recovery will prompt for password (or fail in unattended mode)"
    echo "    → Fix: sudo visudo -f /etc/sudoers.d/toolbox-recovery"
    echo "    → Add: $USER ALL=(ALL) NOPASSWD: /usr/bin/systemctl restart user@$(id -u)"
  fi
  # Check systemd --user
  if check_systemd_user; then
    echo "  ✔ systemd --user is running"
  else
    echo "  ⚠ systemd --user is dead — recovering before start..."
    if ! recover_toolbox_system; then
      echo "  ✖ Cannot recover toolbox system, aborting"
      exit 1
    fi
  fi
  # Spot-check one container
  local first_container="${CONTAINERS[${!CONTAINERS[*]%% *}]}"
  if check_toolbox_health "$first_container"; then
    echo "  ✔ Toolbox health check passed ($first_container)"
  else
    echo "  ⚠ Toolbox health check failed ($first_container) — recovering..."
    if ! recover_toolbox_system; then
      echo "  ✖ Cannot recover toolbox system, aborting"
      exit 1
    fi
  fi
  echo ""
 }
 # Run a benchmark with automatic failure detection and recovery
 # Usage: run_bench_with_recovery <env> <container> <out_file> <label> <cmd_args...>
 # Returns: 0 = success, 1 = legitimate failure (OOM etc.), 2 = unrecoverable system failure
 #
 # On failure, checks if the toolbox system is broken (OOM killed systemd).
 # If broken: recovers the system and moves on (does NOT retry — the same
 # benchmark would just OOM again). If healthy: it was a legit failure.
 run_bench_with_recovery() {
  local env="$1" container="$2" out_file="$3" label="$4"
  shift 4
  local -a cmd_args=("$@")
  "${cmd_args[@]}" >"$out_file" 2>&1
  local exit_code=$?
  if (( exit_code == 0 )); then
    return 0  # success
  fi
  # --- Failure: determine if it's a system issue or legitimate benchmark failure ---
  echo "  ⚠ Benchmark exited with code $exit_code, checking system health..."
  if check_toolbox_health "$container"; then
    # Toolbox is fine → legitimate failure (OOM kill, model too large, etc.)
    echo "  → Toolbox is healthy — benchmark failure (OOM / model issue), moving on"
    echo "✖ ${label} failed (exit ${exit_code})" >>"$out_file"
    return 1
  fi
  # --- System failure detected → recover and continue to next benchmark ---
  echo "  → Toolbox is broken — initiating recovery..."
  rm -f "$out_file"  # Remove invalid log so it can be retried in a future run
  if ! recover_toolbox_system; then
    echo "  ✖ Recovery failed — aborting"
    return 2
  fi
  # Verify recovery worked for this specific container
  if ! check_toolbox_health "$container"; then
    echo "  ✖ Container $container still broken after recovery"
    return 2
  fi
  _recovery_count=0  # Reset counter on successful recovery
  echo "  ✔ System recovered — skipping this benchmark, continuing with next"
  return 1
 }
 # ═══════════════════════════════════════════════════════════════════════════════
 # Capture system info
 # ═══════════════════════════════════════════════════════════════════════════════
 if [[ ! -f "$RESULTDIR/system_info.json" ]]; then
    python3 -c '
 import platform, json, datetime
@@ -40,6 +231,10 @@ print(json.dumps(info))
    echo "Captured system info to $RESULTDIR/system_info.json"
 fi
 # ═══════════════════════════════════════════════════════════════════════════════
 # Discover models
 # ═══════════════════════════════════════════════════════════════════════════════
 # Pick exactly one .gguf per model: either
 #  - any .gguf without "-000*-of-" (single-file models)
 #  - or the first shard "*-00001-of-*.gguf"
@@ -60,21 +255,28 @@ for p in "${MODEL_PATHS[@]}"; do
 done
 echo
-declare -A CMDS=(
+# ═══════════════════════════════════════════════════════════════════════════════
-  [rocm6_4_4]="toolbox run -c llama-rocm-6.4.4 -- /usr/local/bin/llama-bench"
+# Pre-flight & Main benchmark loop
-  [rocm-7_2_3]="toolbox run -c llama-rocm-7.2.3 -- /usr/local/bin/llama-bench"
+# ═══════════════════════════════════════════════════════════════════════════════
-  [rocm7-nightlies]="toolbox run -c llama-rocm7-nightlies -- /usr/local/bin/llama-bench"
+
-  [vulkan_amdvlk]="toolbox run -c llama-vulkan-amdvlk -- /usr/sbin/llama-bench"
+preflight_check
-  [vulkan_radv]="toolbox run -c llama-vulkan-radv -- /usr/sbin/llama-bench"
+
-)
+ABORT_ALL=0
 for MODEL_PATH in "${MODEL_PATHS[@]}"; do
  MODEL_NAME="$(basename "$MODEL_PATH" .gguf)"
-  for ENV in "${!CMDS[@]}"; do
+  for ENV in "${!CONTAINERS[@]}"; do
-    CMD_EFFECTIVE="${CMDS[$ENV]}"
+    if (( ABORT_ALL )); then
      echo "⛔ Aborting due to unrecoverable system failure"
      exit 1
    fi
-    # run twice: baseline and with flash attention
+    CONTAINER="${CONTAINERS[$ENV]}"
    BENCH_BIN="${BENCH_BINS[$ENV]}"
    CMD_PREFIX=( toolbox run -c "$CONTAINER" -- "$BENCH_BIN" )
    # run with flash attention
    for FA in 1; do
      SUFFIX=""
      EXTRA_ARGS=()
@@ -115,18 +317,35 @@ for MODEL_PATH in "${MODEL_PATHS[@]}"; do
          continue
        fi
-        FULL_CMD=( $CMD_EFFECTIVE -ngl 99 -mmp 0 -m "$MODEL_PATH" "${EXTRA_ARGS[@]}" "${CTX_ARGS[@]}" -r "$CTX_REPS" )
+        LABEL="[${ENV}] ${MODEL_NAME}${SUFFIX}${CTX_SUFFIX:+ $CTX_SUFFIX}"
        FULL_CMD=( "${CMD_PREFIX[@]}" -ngl 99 -mmp 0 -m "$MODEL_PATH" "${EXTRA_ARGS[@]}" "${CTX_ARGS[@]}" -r "$CTX_REPS" )
-        printf "\n▶ [%s] %s%s%s\n" "$ENV" "$MODEL_NAME" "${SUFFIX:+ $SUFFIX}" "${CTX_SUFFIX:+ $CTX_SUFFIX}"
+        printf "\n▶ %s\n" "$LABEL"
        printf "  → log: %s\n" "$OUT"
        printf "  → cmd: %s\n\n" "${FULL_CMD[*]}"
-        if ! "${FULL_CMD[@]}" >"$OUT" 2>&1; then
+        run_bench_with_recovery "$ENV" "$CONTAINER" "$OUT" "$LABEL" "${FULL_CMD[@]}"
-          status=$?
+        rc=$?
-          echo "✖ ! [${ENV}] ${MODEL_NAME}${SUFFIX}${CTX_SUFFIX:+ $CTX_SUFFIX} failed (exit ${status})" >>"$OUT"
+
-          echo "  * [${ENV}] ${MODEL_NAME}${SUFFIX}${CTX_SUFFIX:+ $CTX_SUFFIX} : FAILED"
+        case $rc in
-        fi
+          0) echo "  ✔ $LABEL : OK" ;;
          1) echo "  * $LABEL : FAILED" ;;
          2) echo "  ⛔ $LABEL : SYSTEM FAILURE — aborting all"
             ABORT_ALL=1
             break 3  # break out of CTX, FA, and ENV loops
             ;;
        esac
      done
    done
  done
 done
 if (( ABORT_ALL )); then
  echo ""
  echo "⛔ Benchmark run aborted due to unrecoverable system failure"
  echo "   Manual fix: sudo systemctl restart user@$(id -u) && podman stop --all"
  exit 1
 fi
 echo ""
 echo "✅ All benchmarks complete"