From 2a4cb50b52977f6d7d3db0c96361a0bee6ce72f3 Mon Sep 17 00:00:00 2001
From: Donato Capitella
Date: Sun, 17 May 2026 09:16:03 +0100
Subject: [PATCH] chore: update benchmark execution script parameters and paths

---
 benchmark/run_benchmarks.sh | 253 +++++++++++++++++++++++++++++++++---
 1 file changed, 236 insertions(+), 17 deletions(-)

diff --git a/benchmark/run_benchmarks.sh b/benchmark/run_benchmarks.sh
index b2564fe..11509b8 100755
--- a/benchmark/run_benchmarks.sh
+++ b/benchmark/run_benchmarks.sh
@@ -5,7 +5,198 @@ MODEL_DIR="$(realpath ~/models)"
 RESULTDIR="results"
 mkdir -p "$RESULTDIR"
 
+# ═══════════════════════════════════════════════════════════════════════════════
+# OOM Recovery System
+# ═══════════════════════════════════════════════════════════════════════════════
+#
+# When llama-bench gets OOM-killed, the Linux OOM killer can also kill
+# systemd --user (same cgroup), which breaks podman's cgroup management.
+# All subsequent toolbox commands silently fail with:
+#   "Error: unable to find user <name>: no matching entries in passwd file"
+#
+# Recovery requires:
+#   1. sudo systemctl restart user@<UID>   (restart dead systemd --user)
+#   2. podman stop --all                   (clean up zombie containers)
+#
+# For unattended runs, add to /etc/sudoers.d/toolbox-recovery:
+#   <username> ALL=(ALL) NOPASSWD: /usr/bin/systemctl restart user@<UID>
+# ═══════════════════════════════════════════════════════════════════════════════
+
+RECOVERY_WAIT_SECS=3
+MAX_RECOVERY_ATTEMPTS=2
+_recovery_count=0
+
+# --- Container / Backend Configuration (both maps share the same backend keys) ---
+declare -A CONTAINERS=(
+    [rocm6_4_4]="llama-rocm-6.4.4"
+    [rocm-7_2_3]="llama-rocm-7.2.3"
+    [rocm7-nightlies]="llama-rocm7-nightlies"
+    [vulkan_amdvlk]="llama-vulkan-amdvlk"
+    [vulkan_radv]="llama-vulkan-radv"
+)
+
+declare -A BENCH_BINS=(
+    [rocm6_4_4]="/usr/local/bin/llama-bench"
+    [rocm-7_2_3]="/usr/local/bin/llama-bench"
+    [rocm7-nightlies]="/usr/local/bin/llama-bench"
+    [vulkan_amdvlk]="/usr/sbin/llama-bench"
+    [vulkan_radv]="/usr/sbin/llama-bench"
+)
+
+# --- Health Check & Recovery Functions ---
+
+# Returns 0 when `systemctl --user status` succeeds, non-zero otherwise (all output discarded)
+check_systemd_user() {
+    systemctl --user status &>/dev/null
+}
+
+# Returns 0 when `toolbox run -c "$1" echo health_ok` prints the marker; $1 = container name
+check_toolbox_health() {
+    local container="$1"
+    local output
+    output=$(toolbox run -c "$container" echo "health_ok" 2>&1)
+    [[ "$output" == *"health_ok"* ]]
+}
+
+# Recover from OOM-induced toolbox/podman failure; counts attempts in the global _recovery_count
+# Returns 0 on success, 1 on failure (including when MAX_RECOVERY_ATTEMPTS is exceeded)
+recover_toolbox_system() {
+    _recovery_count=$(( _recovery_count + 1 ))  # not (( x++ )): that returns status 1 when x was 0
+    if (( _recovery_count > MAX_RECOVERY_ATTEMPTS )); then
+        echo " ✖ Max recovery attempts ($MAX_RECOVERY_ATTEMPTS) exceeded, giving up"
+        echo " → Manual fix: sudo systemctl restart user@$(id -u) && podman stop --all"
+        return 1
+    fi
+
+    echo ""
+    echo "🔧 ═══════════════════════════════════════════════════════════════"
+    echo "🔧 Toolbox/Podman recovery (attempt ${_recovery_count}/${MAX_RECOVERY_ATTEMPTS})"
+    echo "🔧 ═══════════════════════════════════════════════════════════════"
+
+    # Step 1: Restart systemd --user if dead
+    if ! check_systemd_user; then
+        echo " → systemd --user is dead, restarting..."
+        if sudo systemctl restart "user@$(id -u)"; then
+            echo " ✔ systemd --user restarted"
+            sleep "$RECOVERY_WAIT_SECS"
+        else
+            echo " ✖ Failed to restart systemd --user"
+            echo " → Ensure passwordless sudo is configured (see header comments)"
+            return 1
+        fi
+    else
+        echo " → systemd --user is alive"
+    fi
+
+    # Step 2: Stop all zombie containers (stderr discarded, exit status not checked)
+    echo " → Stopping all zombie containers..."
+    podman stop --all 2>/dev/null
+    sleep 1
+
+    # Step 3: Verify systemd is alive after container cleanup
+    if ! check_systemd_user; then
+        echo " ✖ systemd --user died again after container cleanup"
+        return 1
+    fi
+
+    echo " ✔ Recovery complete"
+    echo ""
+    return 0
+}
+
+# Pre-flight checks before starting benchmarks; exits 1 when a needed recovery fails
+preflight_check() {
+    echo "🔍 Pre-flight checks"
+    echo "───────────────────────────────────────────────────"
+
+    # Check sudo access for recovery (-n: never prompt, so this only probes)
+    if sudo -n true &>/dev/null; then
+        echo " ✔ Passwordless sudo available for auto-recovery"
+    else
+        echo " ⚠ Passwordless sudo NOT available"
+        echo " → Auto-recovery will prompt for password (or fail in unattended mode)"
+        echo " → Fix: sudo visudo -f /etc/sudoers.d/toolbox-recovery"
+        echo " → Add: $USER ALL=(ALL) NOPASSWD: /usr/bin/systemctl restart user@$(id -u)"
+    fi
+
+    # Check systemd --user
+    if check_systemd_user; then
+        echo " ✔ systemd --user is running"
+    else
+        echo " ⚠ systemd --user is dead — recovering before start..."
+        if ! recover_toolbox_system; then
+            echo " ✖ Cannot recover toolbox system, aborting"
+            exit 1
+        fi
+    fi
+
+    local -a container_names=("${CONTAINERS[@]}")  # spot-check one container; associative order is unspecified
+    local first_container="${container_names[0]:-}"
+    if check_toolbox_health "$first_container"; then
+        echo " ✔ Toolbox health check passed ($first_container)"
+    else
+        echo " ⚠ Toolbox health check failed ($first_container) — recovering..."
+        if ! recover_toolbox_system; then
+            echo " ✖ Cannot recover toolbox system, aborting"
+            exit 1
+        fi
+    fi
+
+    echo ""
+}
+
+# Run a benchmark with automatic failure detection and recovery
+# Usage: run_bench_with_recovery