Files
amd-strix-halo-toolboxes/benchmark/run_benchmarks.sh
T

352 lines
13 KiB
Bash
Executable File
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
#!/usr/bin/env bash
# Shell options: referencing an unset variable is an error (-u) and a pipeline
# reports failure if any of its stages fails (pipefail). errexit (-e) is not
# enabled.
set -u
set -o pipefail

# Directory that receives every result file; relative to the working directory.
RESULTDIR=results
mkdir -p "${RESULTDIR}"

# Canonical absolute path of the model collection in the user's home.
MODEL_DIR=$(realpath ~/models)
# ═══════════════════════════════════════════════════════════════════════════════
# OOM Recovery System
# ═══════════════════════════════════════════════════════════════════════════════
#
# When llama-bench gets OOM-killed, the Linux OOM killer can also kill
# systemd --user (same cgroup), which breaks podman's cgroup management.
# All subsequent toolbox commands silently fail with:
# "Error: unable to find user <user>: no matching entries in passwd file"
#
# Recovery requires:
# 1. sudo systemctl restart user@<uid> (restart dead systemd --user)
# 2. podman stop --all (clean up zombie containers)
#
# For unattended runs, add to /etc/sudoers.d/toolbox-recovery:
# <user> ALL=(ALL) NOPASSWD: /usr/bin/systemctl restart user@<uid>
# ═══════════════════════════════════════════════════════════════════════════════
# --- OOM recovery tunables ---
# Seconds to sleep after systemd --user has been restarted.
RECOVERY_WAIT_SECS=3
# Recovery attempts allowed before the recovery routine gives up.
MAX_RECOVERY_ATTEMPTS=2
# Running count of recovery attempts (incremented by the recovery routine).
_recovery_count=0

# --- Container / Backend Configuration ---
# Both tables are keyed by the backend identifier that also appears in the
# result log file names.

# Backend identifier -> toolbox container name.
declare -A CONTAINERS=()
CONTAINERS["rocm6_4_4"]="llama-rocm-6.4.4"
CONTAINERS["rocm-7_2_3"]="llama-rocm-7.2.3"
CONTAINERS["rocm7-nightlies"]="llama-rocm7-nightlies"
CONTAINERS["vulkan_amdvlk"]="llama-vulkan-amdvlk"
CONTAINERS["vulkan_radv"]="llama-vulkan-radv"

# Backend identifier -> path of the llama-bench binary inside that container.
declare -A BENCH_BINS=()
BENCH_BINS["rocm6_4_4"]="/usr/local/bin/llama-bench"
BENCH_BINS["rocm-7_2_3"]="/usr/local/bin/llama-bench"
BENCH_BINS["rocm7-nightlies"]="/usr/local/bin/llama-bench"
BENCH_BINS["vulkan_amdvlk"]="/usr/sbin/llama-bench"
BENCH_BINS["vulkan_radv"]="/usr/sbin/llama-bench"
# --- Health Check & Recovery Functions ---
# Probe the per-user systemd instance.
# Outputs: nothing (stdout and stderr of the probe are discarded)
# Returns: the exit status of `systemctl --user status`
check_systemd_user() {
  systemctl --user status >/dev/null 2>&1
}
# Verify that a toolbox container can still execute a trivial command.
# Arguments: $1 - toolbox container name
# Returns:   0 when the combined stdout/stderr of the probe contains the
#            marker "health_ok", 1 otherwise
check_toolbox_health() {
  local box="$1"
  local probe_output
  probe_output="$(toolbox run -c "$box" echo "health_ok" 2>&1)"
  case "$probe_output" in
    *health_ok*) return 0 ;;
    *) return 1 ;;
  esac
}
# Attempt to repair the toolbox/podman stack after an OOM-induced failure.
# Globals:   _recovery_count (incremented on every call)
#            MAX_RECOVERY_ATTEMPTS, RECOVERY_WAIT_SECS (read)
# Outputs:   progress messages on stdout
# Returns:   0 when systemd --user is alive after the container cleanup,
#            1 when the attempt budget is exhausted, the restart fails, or
#            systemd --user is dead again after the cleanup
recover_toolbox_system() {
  local rule="🔧 ═══════════════════════════════════════════════════════════════"

  _recovery_count=$(( _recovery_count + 1 ))
  if (( _recovery_count > MAX_RECOVERY_ATTEMPTS )); then
    printf '%s\n' " ✖ Max recovery attempts ($MAX_RECOVERY_ATTEMPTS) exceeded, giving up"
    printf '%s\n' " → Manual fix: sudo systemctl restart user@$(id -u) && podman stop --all"
    return 1
  fi

  printf '\n'
  printf '%s\n' "$rule"
  printf '%s\n' "🔧 Toolbox/Podman recovery (attempt ${_recovery_count}/${MAX_RECOVERY_ATTEMPTS})"
  printf '%s\n' "$rule"

  # Step 1: bring systemd --user back, but only when it is actually down.
  if check_systemd_user; then
    printf '%s\n' " → systemd --user is alive"
  else
    printf '%s\n' " → systemd --user is dead, restarting..."
    if ! sudo systemctl restart "user@$(id -u)"; then
      printf '%s\n' " ✖ Failed to restart systemd --user"
      printf '%s\n' " → Ensure passwordless sudo is configured (see header comments)"
      return 1
    fi
    printf '%s\n' " ✔ systemd --user restarted"
    sleep "$RECOVERY_WAIT_SECS"
  fi

  # Step 2: stop every container; podman's stderr is discarded and its exit
  # status is not inspected.
  printf '%s\n' " → Stopping all zombie containers..."
  podman stop --all 2>/dev/null
  sleep 1

  # Step 3: the cleanup must not have taken systemd --user down again.
  if check_systemd_user; then
    printf '%s\n' " ✔ Recovery complete"
    printf '\n'
    return 0
  fi
  printf '%s\n' " ✖ systemd --user died again after container cleanup"
  return 1
}
# Pre-flight checks before starting benchmarks.
# Globals:   CONTAINERS (read) - one entry is used for the toolbox spot-check
# Outputs:   status lines on stdout
# Returns:   0 after all checks ran; exits the script with status 1 when
#            systemd --user or the toolbox is unhealthy and recovery fails
preflight_check() {
  echo "🔍 Pre-flight checks"
  echo "───────────────────────────────────────────────────"

  # Check sudo access for recovery (-n: never prompt).
  if sudo -n true &>/dev/null; then
    echo " ✔ Passwordless sudo available for auto-recovery"
  else
    echo " ⚠ Passwordless sudo NOT available"
    echo " → Auto-recovery will prompt for password (or fail in unattended mode)"
    echo " → Fix: sudo visudo -f /etc/sudoers.d/toolbox-recovery"
    # USER may be unset in unattended environments; a bare $USER would abort
    # the script under `set -u`, so fall back to the name reported by id.
    echo " → Add: ${USER:-$(id -un)} ALL=(ALL) NOPASSWD: /usr/bin/systemctl restart user@$(id -u)"
  fi

  # Check systemd --user
  if check_systemd_user; then
    echo " ✔ systemd --user is running"
  else
    echo " ⚠ systemd --user is dead — recovering before start..."
    if ! recover_toolbox_system; then
      echo " ✖ Cannot recover toolbox system, aborting"
      exit 1
    fi
  fi

  # Spot-check one container. The key list form ${!CONTAINERS[@]} cannot be
  # combined with a pattern-removal operator (that turns it into an indirect
  # expansion), so take the first key from a loop instead.
  local env_key first_container=""
  for env_key in "${!CONTAINERS[@]}"; do
    first_container="${CONTAINERS[$env_key]}"
    break
  done

  if [[ -z "$first_container" ]]; then
    echo " ⚠ No containers configured — skipping toolbox health check"
  elif check_toolbox_health "$first_container"; then
    echo " ✔ Toolbox health check passed ($first_container)"
  else
    echo " ⚠ Toolbox health check failed ($first_container) — recovering..."
    if ! recover_toolbox_system; then
      echo " ✖ Cannot recover toolbox system, aborting"
      exit 1
    fi
  fi
  echo ""
}
# Execute one benchmark command and classify a non-zero exit.
# Usage: run_bench_with_recovery <env> <container> <out_file> <label> <cmd_args...>
# Arguments:
#   $1 - backend identifier (accepted but not used inside the function)
#   $2 - toolbox container used for the post-failure health probe
#   $3 - log file that receives the command's stdout and stderr
#   $4 - label written into the log when the benchmark itself failed
#   $5.. - the command to run, one word per argument
# Globals: _recovery_count (reset to 0 after a successful recovery)
# Returns:
#   0 - command succeeded
#   1 - command failed; either the toolbox was healthy (benchmark-level
#       failure, noted in the log) or the system was broken and has been
#       recovered (log removed so a later run retries it)
#   2 - system is broken and could not be recovered
#
# A benchmark that triggered a recovery is deliberately not retried: the same
# benchmark would just OOM again.
run_bench_with_recovery() {
  local env="$1" box="$2" log="$3" tag="$4"
  shift 4

  local status=0
  "$@" >"$log" 2>&1 || status=$?
  (( status != 0 )) || return 0 # success

  # --- Failure: decide between a benchmark problem and a system problem ---
  echo " ⚠ Benchmark exited with code $status, checking system health..."
  if check_toolbox_health "$box"; then
    # The container still works, so the benchmark itself failed.
    echo " → Toolbox is healthy — benchmark failure (OOM / model issue), moving on"
    echo "${tag} failed (exit ${status})" >>"$log"
    return 1
  fi

  # --- The container no longer responds: repair, then move on ---
  echo " → Toolbox is broken — initiating recovery..."
  rm -f "$log" # drop the invalid log so a future run retries this benchmark
  if ! recover_toolbox_system; then
    echo " ✖ Recovery failed — aborting"
    return 2
  fi

  # The generic recovery succeeded; make sure this container works again.
  if check_toolbox_health "$box"; then
    _recovery_count=0 # a successful recovery restores the attempt budget
    echo " ✔ System recovered — skipping this benchmark, continuing with next"
    return 1
  fi
  echo " ✖ Container $box still broken after recovery"
  return 2
}
# ═══════════════════════════════════════════════════════════════════════════════
# Capture system info
# ═══════════════════════════════════════════════════════════════════════════════
# Record distro, kernel, linux-firmware package and capture date as JSON, once
# per results directory.
#
# The JSON is written to a temporary file and renamed into place only when
# python3 succeeded and produced output. Redirecting straight into the final
# file would create it even when python3 is missing or crashes, and that empty
# file would then suppress every later capture attempt. For the same reason
# the guard tests for a non-empty file (-s) rather than mere existence.
# A capture failure is reported on stderr but does not stop the benchmark run.
SYSINFO_FILE="$RESULTDIR/system_info.json"
if [[ ! -s "$SYSINFO_FILE" ]]; then
  SYSINFO_TMP="${SYSINFO_FILE}.tmp.$$"
  if python3 -c '
import datetime
import json
import platform
import subprocess


def get_distro():
    # PRETTY_NAME from /etc/os-release; "Linux" when it cannot be determined.
    try:
        with open("/etc/os-release") as f:
            for line in f:
                if line.startswith("PRETTY_NAME="):
                    return line.split("=", 1)[1].strip().strip("\"")
    except Exception:
        return "Linux"
    return "Linux"


def get_linux_firmware():
    # Package string reported by rpm; "unknown" when rpm is unavailable or
    # the package is not installed.
    try:
        result = subprocess.run(["rpm", "-q", "linux-firmware"], capture_output=True, text=True)
        if result.returncode == 0:
            return result.stdout.strip()
    except Exception:
        pass
    return "unknown"


info = {
    "distro": get_distro(),
    "kernel": platform.release(),
    "linux_firmware": get_linux_firmware(),
    "timestamp": datetime.datetime.now().strftime("%d %b %Y")
}
print(json.dumps(info))
' >"$SYSINFO_TMP" && [[ -s "$SYSINFO_TMP" ]] && mv -f -- "$SYSINFO_TMP" "$SYSINFO_FILE"; then
    echo "Captured system info to $RESULTDIR/system_info.json"
  else
    rm -f -- "$SYSINFO_TMP"
    echo "⚠ Could not capture system info to $RESULTDIR/system_info.json" >&2
  fi
fi
# ═══════════════════════════════════════════════════════════════════════════════
# Discover models
# ═══════════════════════════════════════════════════════════════════════════════
# Build MODEL_PATHS with one entry per model found below MODEL_DIR:
#   - the first shard ("*-00001-of-*.gguf") of a split model, or
#   - any other .gguf whose name does not match the shard pattern
#     "*-000*-of-*.gguf" (single-file models).
# The list is sorted; the script stops with status 1 when it is empty.
readarray -t MODEL_PATHS < <(
  find "$MODEL_DIR" -type f \
    \( -name '*-00001-of-*.gguf' \
       -o \( -name '*.gguf' ! -name '*-000*-of-*.gguf' \) \) \
    | sort
)

if [[ ${#MODEL_PATHS[@]} -eq 0 ]]; then
  echo "❌ No models found under $MODEL_DIR check your paths/patterns!"
  exit 1
fi

echo "Found ${#MODEL_PATHS[@]} model(s) to bench:"
printf '%s\n' "${MODEL_PATHS[@]}"
printf '\n'
# ═══════════════════════════════════════════════════════════════════════════════
# Pre-flight & Main benchmark loop
# ═══════════════════════════════════════════════════════════════════════════════
# Run the pre-flight checks, then benchmark every model in every configured
# container, once per context setting. Each combination writes its own log
# under RESULTDIR; a combination whose log already exists and is non-empty is
# skipped, so an interrupted run can be resumed.
preflight_check

# Set to 1 when run_bench_with_recovery reports an unrecoverable system failure.
ABORT_ALL=0

for MODEL_PATH in "${MODEL_PATHS[@]}"; do
  MODEL_NAME="$(basename "$MODEL_PATH" .gguf)"

  for ENV in "${!CONTAINERS[@]}"; do
    CONTAINER="${CONTAINERS[$ENV]}"
    BENCH_BIN="${BENCH_BINS[$ENV]}"
    CMD_PREFIX=( toolbox run -c "$CONTAINER" -- "$BENCH_BIN" )

    # run with flash attention
    for FA in 1; do
      SUFFIX=""
      EXTRA_ARGS=()
      if (( FA == 1 )); then
        SUFFIX="__fa1"
        EXTRA_ARGS=( -fa 1 )
      fi

      for CTX in default longctx32768 longctx65536; do
        CTX_SUFFIX=""
        CTX_ARGS=()
        CTX_REPS=5

        # Long-context runs: the depth is the number embedded in the CTX
        # token. They use fewer repetitions and a smaller -ub value on the
        # Vulkan backends.
        if [[ "$CTX" == longctx* ]]; then
          CTX_SUFFIX="__${CTX}"
          CTX_ARGS=( -p 2048 -n 32 -d "${CTX#longctx}" )
          if [[ "$ENV" == *vulkan* ]]; then
            CTX_ARGS+=( -ub 512 )
          else
            CTX_ARGS+=( -ub 2048 )
          fi
          CTX_REPS=3
        fi

        OUT="$RESULTDIR/${MODEL_NAME}__${ENV}${SUFFIX}${CTX_SUFFIX}.log"
        if [[ -s "$OUT" ]]; then
          echo "⏩ Skipping [${ENV}] ${MODEL_NAME}${SUFFIX}${CTX_SUFFIX:+ ($CTX_SUFFIX)}, log already exists at $OUT"
          continue
        fi

        LABEL="[${ENV}] ${MODEL_NAME}${SUFFIX}${CTX_SUFFIX:+ $CTX_SUFFIX}"
        FULL_CMD=( "${CMD_PREFIX[@]}" -ngl 99 -mmp 0 -m "$MODEL_PATH" "${EXTRA_ARGS[@]}" "${CTX_ARGS[@]}" -r "$CTX_REPS" )
        printf "\n▶ %s\n" "$LABEL"
        printf " → log: %s\n" "$OUT"
        printf " → cmd: %s\n\n" "${FULL_CMD[*]}"

        run_bench_with_recovery "$ENV" "$CONTAINER" "$OUT" "$LABEL" "${FULL_CMD[@]}"
        rc=$?
        case $rc in
          0) echo "$LABEL : OK" ;;
          1) echo " * $LABEL : FAILED" ;;
          2) echo "$LABEL : SYSTEM FAILURE — aborting all"
             ABORT_ALL=1
             # Leave all four loops (CTX, FA, ENV, MODEL) so that the single
             # abort report below, including the manual-fix hint, is printed
             # no matter which model was running.
             break 4
             ;;
        esac
      done
    done
  done
done

if (( ABORT_ALL )); then
  echo ""
  echo "⛔ Benchmark run aborted due to unrecoverable system failure"
  echo " Manual fix: sudo systemctl restart user@$(id -u) && podman stop --all"
  exit 1
fi

echo ""
echo "✅ All benchmarks complete"