#!/usr/bin/env bash
#
# Runs llama-bench for every GGUF model found under ~/models inside each
# configured toolbox container, writing one log file per
# model / backend / context-length combination into $RESULTDIR.
# Combinations whose log already exists (non-empty) are skipped, so the
# script can be re-run to resume an interrupted sweep.
#
# NOTE: `set -e` is deliberately NOT enabled — benchmark failures are expected
# and are handled explicitly by run_bench_with_recovery.
set -uo pipefail

MODEL_DIR="$(realpath ~/models)"
RESULTDIR="results"
mkdir -p "$RESULTDIR" || exit 1

# ═══════════════════════════════════════════════════════════════════════════════
# OOM Recovery System
# ═══════════════════════════════════════════════════════════════════════════════
#
# When llama-bench gets OOM-killed, the Linux OOM killer can also kill
# systemd --user (same cgroup), which breaks podman's cgroup management.
# All subsequent toolbox commands silently fail with:
#   "Error: unable to find user : no matching entries in passwd file"
#
# Recovery requires:
#   1. sudo systemctl restart user@$(id -u)   (restart dead systemd --user)
#   2. podman stop --all                      (clean up zombie containers)
#
# For unattended runs, add to /etc/sudoers.d/toolbox-recovery
# (replace YOUR_USER / YOUR_UID with your login name and `id -u`):
#   YOUR_USER ALL=(ALL) NOPASSWD: /usr/bin/systemctl restart user@YOUR_UID
# ═══════════════════════════════════════════════════════════════════════════════

RECOVERY_WAIT_SECS=3      # pause after restarting systemd --user
MAX_RECOVERY_ATTEMPTS=2   # consecutive recovery attempts before giving up
_recovery_count=0         # attempts since the last successful recovery

# --- Container / Backend Configuration ---
# Backend key -> toolbox container name.
declare -A CONTAINERS=(
  [rocm6_4_4]="llama-rocm-6.4.4"
  [rocm-7_2_3]="llama-rocm-7.2.3"
  [rocm7-nightlies]="llama-rocm7-nightlies"
  [vulkan_amdvlk]="llama-vulkan-amdvlk"
  [vulkan_radv]="llama-vulkan-radv"
)

# Backend key -> path of llama-bench inside that container.
declare -A BENCH_BINS=(
  [rocm6_4_4]="/usr/local/bin/llama-bench"
  [rocm-7_2_3]="/usr/local/bin/llama-bench"
  [rocm7-nightlies]="/usr/local/bin/llama-bench"
  [vulkan_amdvlk]="/usr/sbin/llama-bench"
  [vulkan_radv]="/usr/sbin/llama-bench"
)

# --- Health Check & Recovery Functions ---

# Check if systemd --user is alive.
# Returns: exit status of `systemctl --user status` (0 when it succeeds).
check_systemd_user() {
  systemctl --user status &>/dev/null
}

# Check if a specific toolbox container is functional by running a trivial
# echo inside it and looking for the marker in the combined output.
# Arguments: $1 - toolbox container name
# Returns:   0 if the marker was echoed back, non-zero otherwise
check_toolbox_health() {
  local container="$1"
  local output
  output=$(toolbox run -c "$container" echo "health_ok" 2>&1)
  [[ "$output" == *"health_ok"* ]]
}

# Recover from OOM-induced toolbox/podman failure.
# Globals: _recovery_count (incremented), MAX_RECOVERY_ATTEMPTS,
#          RECOVERY_WAIT_SECS (read)
# Returns: 0 on success, 1 on failure (attempt limit reached, sudo restart
#          failed, or systemd --user is dead again after cleanup)
recover_toolbox_system() {
  _recovery_count=$(( _recovery_count + 1 ))

  if (( _recovery_count > MAX_RECOVERY_ATTEMPTS )); then
    echo " ✖ Max recovery attempts ($MAX_RECOVERY_ATTEMPTS) exceeded, giving up"
    echo " → Manual fix: sudo systemctl restart user@$(id -u) && podman stop --all"
    return 1
  fi

  echo ""
  echo "🔧 ═══════════════════════════════════════════════════════════════"
  echo "🔧 Toolbox/Podman recovery (attempt ${_recovery_count}/${MAX_RECOVERY_ATTEMPTS})"
  echo "🔧 ═══════════════════════════════════════════════════════════════"

  # Step 1: Restart systemd --user if dead
  if ! check_systemd_user; then
    echo " → systemd --user is dead, restarting..."
    if sudo systemctl restart "user@$(id -u)"; then
      echo " ✔ systemd --user restarted"
      sleep "$RECOVERY_WAIT_SECS"
    else
      echo " ✖ Failed to restart systemd --user"
      echo " → Ensure passwordless sudo is configured (see header comments)"
      return 1
    fi
  else
    echo " → systemd --user is alive"
  fi

  # Step 2: Stop all zombie containers (best effort; errors are ignored)
  echo " → Stopping all zombie containers..."
  podman stop --all 2>/dev/null
  sleep 1

  # Step 3: Verify systemd is alive after container cleanup
  if ! check_systemd_user; then
    echo " ✖ systemd --user died again after container cleanup"
    return 1
  fi

  echo " ✔ Recovery complete"
  echo ""
  return 0
}

# Pre-flight checks before starting benchmarks: reports whether passwordless
# sudo is available, makes sure systemd --user is running, and spot-checks
# one toolbox container. Exits the script with status 1 if recovery fails.
# Globals: CONTAINERS (read)
preflight_check() {
  echo "🔍 Pre-flight checks"
  echo "───────────────────────────────────────────────────"

  # Check sudo access for recovery
  if sudo -n true &>/dev/null; then
    echo " ✔ Passwordless sudo available for auto-recovery"
  else
    echo " ⚠ Passwordless sudo NOT available"
    echo " → Auto-recovery will prompt for password (or fail in unattended mode)"
    echo " → Fix: sudo visudo -f /etc/sudoers.d/toolbox-recovery"
    echo " → Add: $USER ALL=(ALL) NOPASSWD: /usr/bin/systemctl restart user@$(id -u)"
  fi

  # Check systemd --user
  if check_systemd_user; then
    echo " ✔ systemd --user is running"
  else
    echo " ⚠ systemd --user is dead — recovering before start..."
    if ! recover_toolbox_system; then
      echo " ✖ Cannot recover toolbox system, aborting"
      exit 1
    fi
  fi

  # Spot-check one container. Take the first key the array yields:
  # `${!CONTAINERS[*]%% *}` cannot be used for this, because once an operator
  # is appended Bash treats the leading `!` as indirect expansion instead of
  # "list the keys".
  local first_container="" key
  for key in "${!CONTAINERS[@]}"; do
    first_container="${CONTAINERS[$key]}"
    break
  done

  if check_toolbox_health "$first_container"; then
    echo " ✔ Toolbox health check passed ($first_container)"
  else
    echo " ⚠ Toolbox health check failed ($first_container) — recovering..."
    if ! recover_toolbox_system; then
      echo " ✖ Cannot recover toolbox system, aborting"
      exit 1
    fi
  fi

  echo ""
}

# Run a benchmark with automatic failure detection and recovery.
# Usage: run_bench_with_recovery ENV CONTAINER OUT_FILE LABEL CMD [ARGS...]
#   ENV        backend key (accepted for the caller's convenience; not used here)
#   CONTAINER  toolbox container used for the post-failure health check
#   OUT_FILE   log file receiving the command's stdout and stderr
#   LABEL      human-readable name used in messages
#   CMD...     the full command line to execute
# Returns: 0 = success, 1 = legitimate failure (OOM etc.) or recovered system
#          failure, 2 = unrecoverable system failure
#
# On failure, checks if the toolbox system is broken (OOM killed systemd).
# If broken: recovers the system and moves on (does NOT retry — the same
# benchmark would just OOM again). If healthy: it was a legit failure.
run_bench_with_recovery() {
  local env="$1" container="$2" out_file="$3" label="$4"
  shift 4
  local -a cmd_args=("$@")

  "${cmd_args[@]}" >"$out_file" 2>&1
  local exit_code=$?

  if (( exit_code == 0 )); then
    return 0 # success
  fi

  # --- Failure: determine if it's a system issue or legitimate benchmark failure ---
  echo " ⚠ Benchmark exited with code $exit_code, checking system health..."

  if check_toolbox_health "$container"; then
    # Toolbox is fine → legitimate failure (OOM kill, model too large, etc.)
    # The failure marker keeps the log non-empty so the combination is
    # skipped on the next run.
    echo " → Toolbox is healthy — benchmark failure (OOM / model issue), moving on"
    echo "✖ ${label} failed (exit ${exit_code})" >>"$out_file"
    return 1
  fi

  # --- System failure detected → recover and continue to next benchmark ---
  echo " → Toolbox is broken — initiating recovery..."
  rm -f "$out_file" # Remove invalid log so it can be retried in a future run

  if ! recover_toolbox_system; then
    echo " ✖ Recovery failed — aborting"
    return 2
  fi

  # Verify recovery worked for this specific container
  if ! check_toolbox_health "$container"; then
    echo " ✖ Container $container still broken after recovery"
    return 2
  fi

  _recovery_count=0 # Reset counter on successful recovery
  echo " ✔ System recovered — skipping this benchmark, continuing with next"
  return 1
}

# ═══════════════════════════════════════════════════════════════════════════════
# Capture system info
# ═══════════════════════════════════════════════════════════════════════════════
# Written once per result directory. `-s` (not `-f`) is used so that an empty
# file left behind by a failed earlier attempt is regenerated.
SYSTEM_INFO_FILE="$RESULTDIR/system_info.json"
if [[ ! -s "$SYSTEM_INFO_FILE" ]]; then
  if python3 -c '
import platform, json, datetime

def get_distro():
    # PRETTY_NAME from /etc/os-release, or "Linux" when unavailable.
    try:
        with open("/etc/os-release") as f:
            for line in f:
                if line.startswith("PRETTY_NAME="):
                    return line.split("=", 1)[1].strip().strip("\"")
    except Exception:
        return "Linux"
    return "Linux"

def get_linux_firmware():
    # Output of "rpm -q linux-firmware", or "unknown" when rpm fails.
    try:
        import subprocess
        result = subprocess.run(["rpm", "-q", "linux-firmware"], capture_output=True, text=True)
        if result.returncode == 0:
            return result.stdout.strip()
    except Exception:
        pass
    return "unknown"

info = {
    "distro": get_distro(),
    "kernel": platform.release(),
    "linux_firmware": get_linux_firmware(),
    "timestamp": datetime.datetime.now().strftime("%d %b %Y")
}
print(json.dumps(info))
' > "$SYSTEM_INFO_FILE"; then
    echo "Captured system info to $RESULTDIR/system_info.json"
  else
    # Do not leave a truncated/empty file that would block a later retry.
    rm -f -- "$SYSTEM_INFO_FILE"
    echo "⚠ Failed to capture system info (python3 error)" >&2
  fi
fi

# ═══════════════════════════════════════════════════════════════════════════════
# Discover models
# ═══════════════════════════════════════════════════════════════════════════════
# Pick exactly one .gguf per model: either
#   - any .gguf without "-000*-of-" (single-file models)
#   - or the first shard "*-00001-of-*.gguf"
# The \( ... \) grouping is required so that the -o alternative applies only
# to the two shard-name tests and not to `-type f -name '*.gguf'` as well.
mapfile -t MODEL_PATHS < <(
  find "$MODEL_DIR" -type f -name '*.gguf' \
    \( -name '*-00001-of-*.gguf' -o -not -name '*-000*-of-*.gguf' \) \
    | sort
)

if (( ${#MODEL_PATHS[@]} == 0 )); then
  echo "❌ No models found under $MODEL_DIR – check your paths/patterns!"
  exit 1
fi

echo "Found ${#MODEL_PATHS[@]} model(s) to bench:"
for p in "${MODEL_PATHS[@]}"; do
  echo " • $p"
done
echo

# ═══════════════════════════════════════════════════════════════════════════════
# Pre-flight & Main benchmark loop
# ═══════════════════════════════════════════════════════════════════════════════
preflight_check

ABORT_ALL=0

for MODEL_PATH in "${MODEL_PATHS[@]}"; do
  MODEL_NAME="$(basename "$MODEL_PATH" .gguf)"

  for ENV in "${!CONTAINERS[@]}"; do
    CONTAINER="${CONTAINERS[$ENV]}"
    BENCH_BIN="${BENCH_BINS[$ENV]}"
    CMD_PREFIX=( toolbox run -c "$CONTAINER" -- "$BENCH_BIN" )

    # run with flash attention
    for FA in 1; do
      SUFFIX=""
      EXTRA_ARGS=()
      if (( FA == 1 )); then
        SUFFIX="__fa1"
        EXTRA_ARGS=( -fa 1 )
      fi

      for CTX in default longctx32768 longctx65536; do
        CTX_SUFFIX=""
        CTX_ARGS=()
        CTX_REPS=5

        if [[ "$CTX" == longctx* ]]; then
          # Long-context runs: the depth is the numeric tail of the name
          # (longctx32768 -> -d 32768); fewer repetitions are used.
          CTX_SUFFIX="__${CTX}"
          CTX_ARGS=( -p 2048 -n 32 -d "${CTX#longctx}" )
          if [[ "$ENV" == *vulkan* ]]; then
            CTX_ARGS+=( -ub 512 )
          else
            CTX_ARGS+=( -ub 2048 )
          fi
          CTX_REPS=3
        fi

        OUT="$RESULTDIR/${MODEL_NAME}__${ENV}${SUFFIX}${CTX_SUFFIX}.log"

        if [[ -s "$OUT" ]]; then
          echo "⏩ Skipping [${ENV}] ${MODEL_NAME}${SUFFIX}${CTX_SUFFIX:+ ($CTX_SUFFIX)}, log already exists at $OUT"
          continue
        fi

        LABEL="[${ENV}] ${MODEL_NAME}${SUFFIX}${CTX_SUFFIX:+ $CTX_SUFFIX}"
        FULL_CMD=( "${CMD_PREFIX[@]}" -ngl 99 -mmp 0 -m "$MODEL_PATH" "${EXTRA_ARGS[@]}" "${CTX_ARGS[@]}" -r "$CTX_REPS" )

        printf "\n▶ %s\n" "$LABEL"
        printf " → log: %s\n" "$OUT"
        printf " → cmd: %s\n\n" "${FULL_CMD[*]}"

        run_bench_with_recovery "$ENV" "$CONTAINER" "$OUT" "$LABEL" "${FULL_CMD[@]}"
        rc=$?

        case $rc in
          0)
            echo " ✔ $LABEL : OK"
            ;;
          1)
            echo " * $LABEL : FAILED"
            ;;
          2)
            echo " ⛔ $LABEL : SYSTEM FAILURE — aborting all"
            ABORT_ALL=1
            # Leave all four loops (CTX, FA, ENV, MODEL) so every abort goes
            # through the single handler below and prints the manual-fix hint.
            break 4
            ;;
        esac
      done
    done
  done
done

if (( ABORT_ALL )); then
  echo ""
  echo "⛔ Benchmark run aborted due to unrecoverable system failure"
  echo " Manual fix: sudo systemctl restart user@$(id -u) && podman stop --all"
  exit 1
fi

echo ""
echo "✅ All benchmarks complete"