#!/usr/bin/env bash
set -uo pipefail

MODEL_DIR="$(realpath ~/models)"
RESULTDIR="results"
mkdir -p "$RESULTDIR"

# ═══════════════════════════════════════════════════════════════════════════════
# OOM Recovery System
# ═══════════════════════════════════════════════════════════════════════════════
#
# When llama-bench gets OOM-killed, the Linux OOM killer can also kill
# systemd --user (same cgroup), which breaks podman's cgroup management.
# All subsequent toolbox commands silently fail with:
#   "Error: unable to find user : no matching entries in passwd file"
#
# Recovery requires:
#   1. sudo systemctl restart user@<uid>   (restart dead systemd --user)
#   2. podman stop --all                   (clean up zombie containers)
#
# For unattended runs, add to /etc/sudoers.d/toolbox-recovery:
#   <user> ALL=(ALL) NOPASSWD: /usr/bin/systemctl restart user@<uid>
# where <uid> is the output of `id -u` for <user>.
# ═══════════════════════════════════════════════════════════════════════════════

RECOVERY_WAIT_SECS=3      # pause after restarting systemd --user
MAX_RECOVERY_ATTEMPTS=2   # recover_toolbox_system gives up beyond this
_recovery_count=0         # number of recovery attempts made so far

# --- Container / Backend Configuration ---

# Backend key -> toolbox container name.
declare -A CONTAINERS=(
  [rocm6_4_4]="llama-rocm-6.4.4"
  [rocm-7_2_3]="llama-rocm-7.2.3"
  [rocm7-nightlies]="llama-rocm7-nightlies"
  [vulkan_amdvlk]="llama-vulkan-amdvlk"
  [vulkan_radv]="llama-vulkan-radv"
)

# Backend key -> path of llama-bench inside that backend's container.
declare -A BENCH_BINS=(
  [rocm6_4_4]="/usr/local/bin/llama-bench"
  [rocm-7_2_3]="/usr/local/bin/llama-bench"
  [rocm7-nightlies]="/usr/local/bin/llama-bench"
  [vulkan_amdvlk]="/usr/sbin/llama-bench"
  [vulkan_radv]="/usr/sbin/llama-bench"
)

# --- Health Check & Recovery Functions ---

# Check if systemd --user is alive.
# Returns: exit status of `systemctl --user status` (output discarded).
check_systemd_user() {
  systemctl --user status &>/dev/null
}

# Check if a specific toolbox container is functional.
# Arguments: $1 - toolbox container name
# Returns:   0 if a trivial echo run inside the container produced the
#            marker string, non-zero otherwise.
check_toolbox_health() {
  local container="$1"
  local output
  # stderr is captured too; only the presence of the marker is tested.
  output=$(toolbox run -c "$container" echo "health_ok" 2>&1)
  [[ "$output" == *"health_ok"* ]]
}

# Recover from OOM-induced toolbox/podman failure.
# Globals:  _recovery_count (incremented on every call),
#           MAX_RECOVERY_ATTEMPTS, RECOVERY_WAIT_SECS (read)
# Returns:  0 on success, 1 on failure (including attempts exhausted)
recover_toolbox_system() {
  # Plain assignment instead of (( x++ )): the latter returns status 1 when
  # the pre-increment value is 0, which would trip `set -e` if ever enabled.
  _recovery_count=$(( _recovery_count + 1 ))
  if (( _recovery_count > MAX_RECOVERY_ATTEMPTS )); then
    echo " ✖ Max recovery attempts ($MAX_RECOVERY_ATTEMPTS) exceeded, giving up"
    echo " → Manual fix: sudo systemctl restart user@$(id -u) && podman stop --all"
    return 1
  fi

  echo ""
  echo "🔧 ═══════════════════════════════════════════════════════════════"
  echo "🔧 Toolbox/Podman recovery (attempt ${_recovery_count}/${MAX_RECOVERY_ATTEMPTS})"
  echo "🔧 ═══════════════════════════════════════════════════════════════"

  # Step 1: Restart systemd --user if dead
  if ! check_systemd_user; then
    echo " → systemd --user is dead, restarting..."
    if sudo systemctl restart "user@$(id -u)"; then
      echo " ✔ systemd --user restarted"
      sleep "$RECOVERY_WAIT_SECS"
    else
      echo " ✖ Failed to restart systemd --user"
      echo " → Ensure passwordless sudo is configured (see header comments)"
      return 1
    fi
  else
    echo " → systemd --user is alive"
  fi

  # Step 2: Stop all zombie containers.
  # Best-effort: errors are deliberately silenced and the status ignored.
  echo " → Stopping all zombie containers..."
  podman stop --all 2>/dev/null
  sleep 1

  # Step 3: Verify systemd is alive after container cleanup
  if ! check_systemd_user; then
    echo " ✖ systemd --user died again after container cleanup"
    return 1
  fi

  echo " ✔ Recovery complete"
  echo ""
  return 0
}

# Pre-flight checks before starting benchmarks.
# Reports sudo availability, verifies systemd --user, and spot-checks one
# toolbox container, attempting recovery where a check fails.
# Globals: CONTAINERS (read)
# Exits:   the whole script with status 1 if recovery fails.
preflight_check() {
  local uid
  uid=$(id -u)

  echo "🔍 Pre-flight checks"
  echo "───────────────────────────────────────────────────"

  # Check sudo access for recovery. Blanket passwordless sudo passes the
  # first test; a rule scoped to just the restart command (as the header
  # recommends) fails `sudo -n true`, so also ask sudo whether that exact
  # command is permitted without a password.
  if sudo -n true &>/dev/null ||
    sudo -n -l /usr/bin/systemctl restart "user@${uid}" &>/dev/null; then
    echo " ✔ Passwordless sudo available for auto-recovery"
  else
    echo " ⚠ Passwordless sudo NOT available"
    echo " → Auto-recovery will prompt for password (or fail in unattended mode)"
    echo " → Fix: sudo visudo -f /etc/sudoers.d/toolbox-recovery"
    # USER may be unset (e.g. under cron); avoid a `set -u` abort.
    echo " → Add: ${USER:-$(id -un)} ALL=(ALL) NOPASSWD: /usr/bin/systemctl restart user@${uid}"
  fi

  # Check systemd --user
  if check_systemd_user; then
    echo " ✔ systemd --user is running"
  else
    echo " ⚠ systemd --user is dead — recovering before start..."
    if ! recover_toolbox_system; then
      echo " ✖ Cannot recover toolbox system, aborting"
      exit 1
    fi
  fi

  # Spot-check one container. Associative arrays have no defined order, so
  # this is simply whichever key the shell yields first.
  local key first_container=""
  for key in "${!CONTAINERS[@]}"; do
    first_container=${CONTAINERS[$key]}
    break
  done

  if [[ -z "$first_container" ]]; then
    echo " ⚠ No containers configured — skipping toolbox health check"
  elif check_toolbox_health "$first_container"; then
    echo " ✔ Toolbox health check passed ($first_container)"
  else
    echo " ⚠ Toolbox health check failed ($first_container) — recovering..."
    if ! recover_toolbox_system; then
      echo " ✖ Cannot recover toolbox system, aborting"
      exit 1
    fi
  fi
  echo ""
}

# Run a benchmark with automatic failure detection and recovery
# Usage: run_bench_with_recovery