chore: update benchmark execution script parameters and paths

2026-05-17 09:16:03 +01:00
parent 451723bab8
commit 2a4cb50b52
1 changed files with 236 additions and 17 deletions
@@ -5,7 +5,198 @@ MODEL_DIR="$(realpath ~/models)"
 RESULTDIR="results"
 mkdir -p "$RESULTDIR"

+# ═══════════════════════════════════════════════════════════════════════════════
+# OOM Recovery System
+# ═══════════════════════════════════════════════════════════════════════════════
+#
+# When llama-bench gets OOM-killed, the Linux OOM killer can also kill
+# systemd --user (same cgroup), which breaks podman's cgroup management.
+# All subsequent toolbox commands silently fail with:
+#   "Error: unable to find user <user>: no matching entries in passwd file"
+#
+# Recovery requires:
+#   1. sudo systemctl restart user@<uid>   (restart dead systemd --user)
+#   2. podman stop --all                    (clean up zombie containers)
+#
+# For unattended runs, add to /etc/sudoers.d/toolbox-recovery:
+#   <user> ALL=(ALL) NOPASSWD: /usr/bin/systemctl restart user@<uid>
+# ═══════════════════════════════════════════════════════════════════════════════
+
+RECOVERY_WAIT_SECS=3
+MAX_RECOVERY_ATTEMPTS=2
+_recovery_count=0
+
+# --- Container / Backend Configuration ---
+# Both tables are indexed by the same backend key; the main loop iterates the
+# keys of CONTAINERS and looks each one up in BENCH_BINS, so every key must
+# appear in both.
+
+# Backend key -> name of the toolbox container passed to `toolbox run -c`.
+declare -A CONTAINERS=(
+  [rocm6_4_4]="llama-rocm-6.4.4"
+  [rocm-7_2_3]="llama-rocm-7.2.3"
+  [rocm7-nightlies]="llama-rocm7-nightlies"
+  [vulkan_amdvlk]="llama-vulkan-amdvlk"
+  [vulkan_radv]="llama-vulkan-radv"
+)
+
+# Backend key -> path of the llama-bench binary run inside that container.
+declare -A BENCH_BINS=(
+  [rocm6_4_4]="/usr/local/bin/llama-bench"
+  [rocm-7_2_3]="/usr/local/bin/llama-bench"
+  [rocm7-nightlies]="/usr/local/bin/llama-bench"
+  [vulkan_amdvlk]="/usr/sbin/llama-bench"
+  [vulkan_radv]="/usr/sbin/llama-bench"
+)
+
+# --- Health Check & Recovery Functions ---
+
+# Check if systemd --user is alive
+check_systemd_user() {
+  systemctl --user status &>/dev/null
+}
+
+# Check if a specific toolbox container is functional
+check_toolbox_health() {
+  local container="$1"
+  local output
+  output=$(toolbox run -c "$container" echo "health_ok" 2>&1)
+  [[ "$output" == *"health_ok"* ]]
+}
+
+# Bring the toolbox/podman stack back after an OOM kill took out systemd --user.
+# Globals:   _recovery_count (incremented), MAX_RECOVERY_ATTEMPTS (read),
+#            RECOVERY_WAIT_SECS (read)
+# Returns:   0 when systemd --user is alive after the container cleanup;
+#            1 when the attempt budget is exhausted, the restart fails, or
+#            systemd --user is dead again after the containers were stopped
+recover_toolbox_system() {
+  local rule="🔧 ═══════════════════════════════════════════════════════════════"
+
+  # Count this attempt first so the budget check below includes it.
+  _recovery_count=$(( _recovery_count + 1 ))
+  if (( _recovery_count > MAX_RECOVERY_ATTEMPTS )); then
+    printf '%s\n' \
+      "  ✖ Max recovery attempts ($MAX_RECOVERY_ATTEMPTS) exceeded, giving up" \
+      "  → Manual fix: sudo systemctl restart user@$(id -u) && podman stop --all"
+    return 1
+  fi
+
+  echo ""
+  printf '%s\n' \
+    "$rule" \
+    "🔧  Toolbox/Podman recovery (attempt ${_recovery_count}/${MAX_RECOVERY_ATTEMPTS})" \
+    "$rule"
+
+  # Step 1: the user manager only needs a restart when it is actually dead.
+  if check_systemd_user; then
+    echo "  → systemd --user is alive"
+  else
+    echo "  → systemd --user is dead, restarting..."
+    if ! sudo systemctl restart "user@$(id -u)"; then
+      echo "  ✖ Failed to restart systemd --user"
+      echo "  → Ensure passwordless sudo is configured (see header comments)"
+      return 1
+    fi
+    echo "  ✔ systemd --user restarted"
+    # Give the freshly started manager a moment before touching podman.
+    sleep "$RECOVERY_WAIT_SECS"
+  fi
+
+  # Step 2: best-effort cleanup of leftover containers; errors are ignored.
+  echo "  → Stopping all zombie containers..."
+  podman stop --all 2>/dev/null
+  sleep 1
+
+  # Step 3: make sure the cleanup did not take the user manager down again.
+  if ! check_systemd_user; then
+    echo "  ✖ systemd --user died again after container cleanup"
+    return 1
+  fi
+
+  echo "  ✔ Recovery complete"
+  echo ""
+  return 0
+}
+
+# Pre-flight checks before starting benchmarks
+preflight_check() {
+  echo "🔍 Pre-flight checks"
+  echo "───────────────────────────────────────────────────"
+
+  # Check sudo access for recovery
+  if sudo -n true &>/dev/null; then
+    echo "  ✔ Passwordless sudo available for auto-recovery"
+  else
+    echo "  ⚠ Passwordless sudo NOT available"
+    echo "    → Auto-recovery will prompt for password (or fail in unattended mode)"
+    echo "    → Fix: sudo visudo -f /etc/sudoers.d/toolbox-recovery"
+    echo "    → Add: $USER ALL=(ALL) NOPASSWD: /usr/bin/systemctl restart user@$(id -u)"
+  fi
+
+  # Check systemd --user
+  if check_systemd_user; then
+    echo "  ✔ systemd --user is running"
+  else
+    echo "  ⚠ systemd --user is dead — recovering before start..."
+    if ! recover_toolbox_system; then
+      echo "  ✖ Cannot recover toolbox system, aborting"
+      exit 1
+    fi
+  fi
+
+  # Spot-check one container
+  local first_container="${CONTAINERS[${!CONTAINERS[*]%% *}]}"
+  if check_toolbox_health "$first_container"; then
+    echo "  ✔ Toolbox health check passed ($first_container)"
+  else
+    echo "  ⚠ Toolbox health check failed ($first_container) — recovering..."
+    if ! recover_toolbox_system; then
+      echo "  ✖ Cannot recover toolbox system, aborting"
+      exit 1
+    fi
+  fi
+
+  echo ""
+}
+
+# Execute one benchmark command, capture its output, and classify any failure.
+# Usage: run_bench_with_recovery <env> <container> <out_file> <label> <cmd_args...>
+# Arguments:
+#   $1 env        - backend key (accepted but not used inside this function)
+#   $2 container  - toolbox container to health-check after a failure
+#   $3 out_file   - log file receiving the command's stdout and stderr
+#   $4 label      - human-readable name written into the failure marker
+#   $5... cmd     - the command line to run
+# Globals:   _recovery_count (reset to 0 after a verified recovery)
+# Returns:
+#   0 - the command succeeded
+#   1 - the benchmark is skipped: either the command failed while the toolbox
+#       stayed healthy, or the toolbox was broken and has been recovered
+#   2 - the toolbox is broken and recovery did not restore it
+# A benchmark that broke the system is deliberately not retried (it would
+# just OOM again); its log is removed so a future run can attempt it.
+run_bench_with_recovery() {
+  local env="$1"
+  local container="$2"
+  local out_file="$3"
+  local label="$4"
+  shift 4
+
+  # Whatever remains in "$@" is the command line itself.
+  "$@" >"$out_file" 2>&1
+  local rc=$?
+
+  if (( rc == 0 )); then
+    return 0
+  fi
+
+  # --- Failure: was it the benchmark, or did the toolbox stack go down? ---
+  echo "  ⚠ Benchmark exited with code $rc, checking system health..."
+
+  if ! check_toolbox_health "$container"; then
+    # --- System failure → recover, then let the caller move on ---
+    echo "  → Toolbox is broken — initiating recovery..."
+    rm -f "$out_file"  # drop the invalid log so a future run can retry it
+
+    if ! recover_toolbox_system; then
+      echo "  ✖ Recovery failed — aborting"
+      return 2
+    fi
+
+    # The generic recovery passed; confirm this particular container works.
+    if ! check_toolbox_health "$container"; then
+      echo "  ✖ Container $container still broken after recovery"
+      return 2
+    fi
+
+    _recovery_count=0  # a verified recovery restores the full attempt budget
+    echo "  ✔ System recovered — skipping this benchmark, continuing with next"
+    return 1
+  fi
+
+  # Toolbox still answers → the benchmark itself failed (OOM kill, model too
+  # large, etc.); append a marker to its log.
+  echo "  → Toolbox is healthy — benchmark failure (OOM / model issue), moving on"
+  echo "✖ ${label} failed (exit ${rc})" >>"$out_file"
+  return 1
+}
+
+# ═══════════════════════════════════════════════════════════════════════════════
 # Capture system info
+# ═══════════════════════════════════════════════════════════════════════════════
 if [[ ! -f "$RESULTDIR/system_info.json" ]]; then
    python3 -c '
 import platform, json, datetime
@@ -40,6 +231,10 @@ print(json.dumps(info))
    echo "Captured system info to $RESULTDIR/system_info.json"
 fi

+# ═══════════════════════════════════════════════════════════════════════════════
+# Discover models
+# ═══════════════════════════════════════════════════════════════════════════════
+
 # Pick exactly one .gguf per model: either
 #  - any .gguf without "-000*-of-" (single-file models)
 #  - or the first shard "*-00001-of-*.gguf"
@@ -60,21 +255,28 @@ for p in "${MODEL_PATHS[@]}"; do
 done
 echo

-declare -A CMDS=(
-  [rocm6_4_4]="toolbox run -c llama-rocm-6.4.4 -- /usr/local/bin/llama-bench"
-  [rocm-7_2_3]="toolbox run -c llama-rocm-7.2.3 -- /usr/local/bin/llama-bench"
-  [rocm7-nightlies]="toolbox run -c llama-rocm7-nightlies -- /usr/local/bin/llama-bench"
-  [vulkan_amdvlk]="toolbox run -c llama-vulkan-amdvlk -- /usr/sbin/llama-bench"
-  [vulkan_radv]="toolbox run -c llama-vulkan-radv -- /usr/sbin/llama-bench"
-)
+# ═══════════════════════════════════════════════════════════════════════════════
+# Pre-flight & Main benchmark loop
+# ═══════════════════════════════════════════════════════════════════════════════
+
+preflight_check
+
+ABORT_ALL=0

 for MODEL_PATH in "${MODEL_PATHS[@]}"; do
  MODEL_NAME="$(basename "$MODEL_PATH" .gguf)"

-  for ENV in "${!CMDS[@]}"; do
-    CMD_EFFECTIVE="${CMDS[$ENV]}"
+  for ENV in "${!CONTAINERS[@]}"; do
+    if (( ABORT_ALL )); then
+      echo "⛔ Aborting due to unrecoverable system failure"
+      exit 1
+    fi

-    # run twice: baseline and with flash attention
+    CONTAINER="${CONTAINERS[$ENV]}"
+    BENCH_BIN="${BENCH_BINS[$ENV]}"
+    CMD_PREFIX=( toolbox run -c "$CONTAINER" -- "$BENCH_BIN" )
+
+    # run with flash attention
    for FA in 1; do
      SUFFIX=""
      EXTRA_ARGS=()
@@ -115,18 +317,35 @@ for MODEL_PATH in "${MODEL_PATHS[@]}"; do
          continue
        fi

-        FULL_CMD=( $CMD_EFFECTIVE -ngl 99 -mmp 0 -m "$MODEL_PATH" "${EXTRA_ARGS[@]}" "${CTX_ARGS[@]}" -r "$CTX_REPS" )
+        LABEL="[${ENV}] ${MODEL_NAME}${SUFFIX}${CTX_SUFFIX:+ $CTX_SUFFIX}"
+        FULL_CMD=( "${CMD_PREFIX[@]}" -ngl 99 -mmp 0 -m "$MODEL_PATH" "${EXTRA_ARGS[@]}" "${CTX_ARGS[@]}" -r "$CTX_REPS" )

-        printf "\n▶ [%s] %s%s%s\n" "$ENV" "$MODEL_NAME" "${SUFFIX:+ $SUFFIX}" "${CTX_SUFFIX:+ $CTX_SUFFIX}"
+        printf "\n▶ %s\n" "$LABEL"
        printf "  → log: %s\n" "$OUT"
        printf "  → cmd: %s\n\n" "${FULL_CMD[*]}"

-        if ! "${FULL_CMD[@]}" >"$OUT" 2>&1; then
-          status=$?
-          echo "✖ ! [${ENV}] ${MODEL_NAME}${SUFFIX}${CTX_SUFFIX:+ $CTX_SUFFIX} failed (exit ${status})" >>"$OUT"
-          echo "  * [${ENV}] ${MODEL_NAME}${SUFFIX}${CTX_SUFFIX:+ $CTX_SUFFIX} : FAILED"
-        fi
+        run_bench_with_recovery "$ENV" "$CONTAINER" "$OUT" "$LABEL" "${FULL_CMD[@]}"
+        rc=$?
+
+        case $rc in
+          0) echo "  ✔ $LABEL : OK" ;;
+          1) echo "  * $LABEL : FAILED" ;;
+          2) echo "  ⛔ $LABEL : SYSTEM FAILURE — aborting all"
+             ABORT_ALL=1
+             break 3  # break out of CTX, FA, and ENV loops
+             ;;
+        esac
      done
    done
  done
 done
+
+# Final status report.
+# NOTE(review): ABORT_ALL=1 appears to reach this point only when the failing
+# benchmark belonged to the last model. For earlier models the ABORT_ALL guard
+# at the top of the ENV loop exits first, without printing the manual-fix hint
+# below — confirm against the full loop nesting.
+if (( ABORT_ALL )); then
+  echo ""
+  echo "⛔ Benchmark run aborted due to unrecoverable system failure"
+  echo "   Manual fix: sudo systemctl restart user@$(id -u) && podman stop --all"
+  exit 1
+fi
+
+echo ""
+echo "✅ All benchmarks complete"