chore: update benchmark execution script parameters and paths
This commit is contained in:
+236
-17
@@ -5,7 +5,198 @@ MODEL_DIR="$(realpath ~/models)"
|
||||
RESULTDIR="results"
|
||||
mkdir -p "$RESULTDIR"
|
||||
|
||||
# ═══════════════════════════════════════════════════════════════════════════════
|
||||
# OOM Recovery System
|
||||
# ═══════════════════════════════════════════════════════════════════════════════
|
||||
#
|
||||
# When llama-bench gets OOM-killed, the Linux OOM killer can also kill
|
||||
# systemd --user (same cgroup), which breaks podman's cgroup management.
|
||||
# All subsequent toolbox commands silently fail with:
|
||||
# "Error: unable to find user <user>: no matching entries in passwd file"
|
||||
#
|
||||
# Recovery requires:
|
||||
# 1. sudo systemctl restart user@<uid> (restart dead systemd --user)
|
||||
# 2. podman stop --all (clean up zombie containers)
|
||||
#
|
||||
# For unattended runs, add to /etc/sudoers.d/toolbox-recovery:
|
||||
# <user> ALL=(ALL) NOPASSWD: /usr/bin/systemctl restart user@<uid>
|
||||
# ═══════════════════════════════════════════════════════════════════════════════
|
||||
|
||||
# Tunables and state for the OOM recovery logic.
_recovery_count=0          # recovery attempts made so far (bumped by recover_toolbox_system)
MAX_RECOVERY_ATTEMPTS=2    # recover_toolbox_system gives up once the counter exceeds this
RECOVERY_WAIT_SECS=3       # pause after restarting systemd --user before continuing
|
||||
|
||||
# --- Container / Backend Configuration ---
# Backend key -> name of the toolbox container that hosts that backend.
declare -A CONTAINERS=()
CONTAINERS["rocm6_4_4"]="llama-rocm-6.4.4"
CONTAINERS["rocm-7_2_3"]="llama-rocm-7.2.3"
CONTAINERS["rocm7-nightlies"]="llama-rocm7-nightlies"
CONTAINERS["vulkan_amdvlk"]="llama-vulkan-amdvlk"
CONTAINERS["vulkan_radv"]="llama-vulkan-radv"
|
||||
|
||||
# Backend key -> path of the llama-bench binary inside that backend's container.
declare -A BENCH_BINS=()
BENCH_BINS["rocm6_4_4"]="/usr/local/bin/llama-bench"
BENCH_BINS["rocm-7_2_3"]="/usr/local/bin/llama-bench"
BENCH_BINS["rocm7-nightlies"]="/usr/local/bin/llama-bench"
BENCH_BINS["vulkan_amdvlk"]="/usr/sbin/llama-bench"
BENCH_BINS["vulkan_radv"]="/usr/sbin/llama-bench"
|
||||
|
||||
# --- Health Check & Recovery Functions ---
|
||||
|
||||
# Probe the per-user systemd instance.
# Outputs: nothing (stdout and stderr of the probe are discarded)
# Returns: the exit status of `systemctl --user status`
check_systemd_user() {
  systemctl --user status >/dev/null 2>&1
}
|
||||
|
||||
# Check whether a toolbox container can still run a trivial command.
# Arguments: $1 - toolbox container name
# Returns:   0 if the marker string appears in the combined stdout/stderr of
#            `toolbox run`, 1 otherwise. The exit status of `toolbox` itself
#            is not consulted; only the captured output decides.
check_toolbox_health() {
  local target="$1"
  local probe_output
  probe_output="$(toolbox run -c "$target" echo "health_ok" 2>&1)"

  case "$probe_output" in
    *"health_ok"*) return 0 ;;
    *) return 1 ;;
  esac
}
|
||||
|
||||
# Recover from OOM-induced toolbox/podman failure.
#
# Steps: restart systemd --user via sudo if it is dead, stop all podman
# containers, then confirm systemd --user is still alive.
#
# Globals:
#   _recovery_count        (read/written) incremented on every call
#   MAX_RECOVERY_ATTEMPTS  (read) calls beyond this count give up immediately
#   RECOVERY_WAIT_SECS     (read) pause after a successful systemd restart
# Outputs: progress messages on stdout
# Returns: 0 on success, 1 on failure
recover_toolbox_system() {
  # Plain assignment instead of (( _recovery_count++ )): the post-increment
  # form evaluates to the old value, so it yields exit status 1 when the
  # counter is 0 and would abort the script if errexit is active.
  _recovery_count=$(( _recovery_count + 1 ))

  if (( _recovery_count > MAX_RECOVERY_ATTEMPTS )); then
    echo " ✖ Max recovery attempts ($MAX_RECOVERY_ATTEMPTS) exceeded, giving up"
    echo " → Manual fix: sudo systemctl restart user@$(id -u) && podman stop --all"
    return 1
  fi

  echo ""
  echo "🔧 ═══════════════════════════════════════════════════════════════"
  echo "🔧 Toolbox/Podman recovery (attempt ${_recovery_count}/${MAX_RECOVERY_ATTEMPTS})"
  echo "🔧 ═══════════════════════════════════════════════════════════════"

  # Step 1: Restart systemd --user if dead
  if ! check_systemd_user; then
    echo " → systemd --user is dead, restarting..."
    if sudo systemctl restart "user@$(id -u)"; then
      echo " ✔ systemd --user restarted"
      sleep "$RECOVERY_WAIT_SECS"
    else
      echo " ✖ Failed to restart systemd --user"
      echo " → Ensure passwordless sudo is configured (see header comments)"
      return 1
    fi
  else
    echo " → systemd --user is alive"
  fi

  # Step 2: Stop all zombie containers.
  # Best-effort: a failure here is reported but does not abort recovery,
  # because Step 3 re-checks the state that actually matters.
  echo " → Stopping all zombie containers..."
  if ! podman stop --all 2>/dev/null; then
    echo " ⚠ podman stop --all reported an error (continuing)"
  fi
  sleep 1

  # Step 3: Verify systemd is alive after container cleanup
  if ! check_systemd_user; then
    echo " ✖ systemd --user died again after container cleanup"
    return 1
  fi

  echo " ✔ Recovery complete"
  echo ""
  return 0
}
|
||||
|
||||
# Pre-flight checks before starting benchmarks.
#
# Reports whether passwordless sudo is available, makes sure systemd --user
# is running, and spot-checks one toolbox container. A failed check triggers
# recover_toolbox_system; if that fails too, the script exits.
#
# Globals: CONTAINERS (read)
# Outputs: progress messages on stdout
# Exits:   with status 1 when recovery fails
preflight_check() {
  echo "🔍 Pre-flight checks"
  echo "───────────────────────────────────────────────────"

  # Check sudo access for recovery
  if sudo -n true &>/dev/null; then
    echo " ✔ Passwordless sudo available for auto-recovery"
  else
    echo " ⚠ Passwordless sudo NOT available"
    echo " → Auto-recovery will prompt for password (or fail in unattended mode)"
    echo " → Fix: sudo visudo -f /etc/sudoers.d/toolbox-recovery"
    echo " → Add: $USER ALL=(ALL) NOPASSWD: /usr/bin/systemctl restart user@$(id -u)"
  fi

  # Check systemd --user
  if check_systemd_user; then
    echo " ✔ systemd --user is running"
  else
    echo " ⚠ systemd --user is dead — recovering before start..."
    if ! recover_toolbox_system; then
      echo " ✖ Cannot recover toolbox system, aborting"
      exit 1
    fi
  fi

  # Spot-check one container.
  # ${!CONTAINERS[*]%% *} is not a key-list expansion: bash only lists keys
  # when "]" is immediately followed by "}". Take the first key from a loop
  # instead. Associative-array order is unspecified, so "first" is arbitrary.
  local key first_container=""
  for key in "${!CONTAINERS[@]}"; do
    first_container="${CONTAINERS[$key]}"
    break
  done

  if [[ -z "$first_container" ]]; then
    echo " ⚠ No containers configured — skipping toolbox health check"
  elif check_toolbox_health "$first_container"; then
    echo " ✔ Toolbox health check passed ($first_container)"
  else
    echo " ⚠ Toolbox health check failed ($first_container) — recovering..."
    if ! recover_toolbox_system; then
      echo " ✖ Cannot recover toolbox system, aborting"
      exit 1
    fi
  fi

  echo ""
}
|
||||
|
||||
# Run one benchmark command and classify any failure.
# Usage: run_bench_with_recovery <env> <container> <out_file> <label> <cmd_args...>
#
# The command's stdout and stderr are written to <out_file>. On a non-zero
# exit the container is probed with check_toolbox_health:
#   - healthy: the benchmark itself failed; a failure line is appended to
#     <out_file>.
#   - broken:  <out_file> is removed and recover_toolbox_system is run. The
#     benchmark is NOT retried (it would just OOM again); the caller moves on.
#
# Globals: _recovery_count (reset to 0 after a verified recovery)
# Returns: 0 = success
#          1 = benchmark failed, or system was recovered and this run skipped
#          2 = unrecoverable system failure
run_bench_with_recovery() {
  # <env> is accepted as part of the interface but not used in the body.
  local env="$1"
  local container="$2"
  local out_file="$3"
  local label="$4"
  shift 4
  local -a bench_cmd=("$@")
  local status

  "${bench_cmd[@]}" >"$out_file" 2>&1
  status=$?

  if [[ "$status" -eq 0 ]]; then
    return 0
  fi

  # Non-zero exit: decide between a benchmark failure and a broken system.
  echo " ⚠ Benchmark exited with code $status, checking system health..."

  if ! check_toolbox_health "$container"; then
    echo " → Toolbox is broken — initiating recovery..."
    # Drop the invalid log so a future run can retry this benchmark.
    rm -f "$out_file"

    recover_toolbox_system || {
      echo " ✖ Recovery failed — aborting"
      return 2
    }

    # Recovery reported success; confirm it for this specific container.
    check_toolbox_health "$container" || {
      echo " ✖ Container $container still broken after recovery"
      return 2
    }

    _recovery_count=0
    echo " ✔ System recovered — skipping this benchmark, continuing with next"
    return 1
  fi

  # Toolbox still works, so the benchmark failed on its own (OOM kill, model too large, etc.).
  echo " → Toolbox is healthy — benchmark failure (OOM / model issue), moving on"
  echo "✖ ${label} failed (exit ${status})" >>"$out_file"
  return 1
}
|
||||
|
||||
# ═══════════════════════════════════════════════════════════════════════════════
|
||||
# Capture system info
|
||||
# ═══════════════════════════════════════════════════════════════════════════════
|
||||
if [[ ! -f "$RESULTDIR/system_info.json" ]]; then
|
||||
python3 -c '
|
||||
import platform, json, datetime
|
||||
@@ -40,6 +231,10 @@ print(json.dumps(info))
|
||||
echo "Captured system info to $RESULTDIR/system_info.json"
|
||||
fi
|
||||
|
||||
# ═══════════════════════════════════════════════════════════════════════════════
|
||||
# Discover models
|
||||
# ═══════════════════════════════════════════════════════════════════════════════
|
||||
|
||||
# Pick exactly one .gguf per model: either
|
||||
# - any .gguf without "-000*-of-" (single-file models)
|
||||
# - or the first shard "*-00001-of-*.gguf"
|
||||
@@ -60,21 +255,28 @@ for p in "${MODEL_PATHS[@]}"; do
|
||||
done
|
||||
echo
|
||||
|
||||
declare -A CMDS=(
|
||||
[rocm6_4_4]="toolbox run -c llama-rocm-6.4.4 -- /usr/local/bin/llama-bench"
|
||||
[rocm-7_2_3]="toolbox run -c llama-rocm-7.2.3 -- /usr/local/bin/llama-bench"
|
||||
[rocm7-nightlies]="toolbox run -c llama-rocm7-nightlies -- /usr/local/bin/llama-bench"
|
||||
[vulkan_amdvlk]="toolbox run -c llama-vulkan-amdvlk -- /usr/sbin/llama-bench"
|
||||
[vulkan_radv]="toolbox run -c llama-vulkan-radv -- /usr/sbin/llama-bench"
|
||||
)
|
||||
# ═══════════════════════════════════════════════════════════════════════════════
|
||||
# Pre-flight & Main benchmark loop
|
||||
# ═══════════════════════════════════════════════════════════════════════════════
|
||||
|
||||
preflight_check
|
||||
|
||||
ABORT_ALL=0
|
||||
|
||||
for MODEL_PATH in "${MODEL_PATHS[@]}"; do
|
||||
MODEL_NAME="$(basename "$MODEL_PATH" .gguf)"
|
||||
|
||||
for ENV in "${!CMDS[@]}"; do
|
||||
CMD_EFFECTIVE="${CMDS[$ENV]}"
|
||||
for ENV in "${!CONTAINERS[@]}"; do
|
||||
if (( ABORT_ALL )); then
|
||||
echo "⛔ Aborting due to unrecoverable system failure"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
# run twice: baseline and with flash attention
|
||||
CONTAINER="${CONTAINERS[$ENV]}"
|
||||
BENCH_BIN="${BENCH_BINS[$ENV]}"
|
||||
CMD_PREFIX=( toolbox run -c "$CONTAINER" -- "$BENCH_BIN" )
|
||||
|
||||
# run with flash attention
|
||||
for FA in 1; do
|
||||
SUFFIX=""
|
||||
EXTRA_ARGS=()
|
||||
@@ -115,18 +317,35 @@ for MODEL_PATH in "${MODEL_PATHS[@]}"; do
|
||||
continue
|
||||
fi
|
||||
|
||||
FULL_CMD=( $CMD_EFFECTIVE -ngl 99 -mmp 0 -m "$MODEL_PATH" "${EXTRA_ARGS[@]}" "${CTX_ARGS[@]}" -r "$CTX_REPS" )
|
||||
LABEL="[${ENV}] ${MODEL_NAME}${SUFFIX}${CTX_SUFFIX:+ $CTX_SUFFIX}"
|
||||
FULL_CMD=( "${CMD_PREFIX[@]}" -ngl 99 -mmp 0 -m "$MODEL_PATH" "${EXTRA_ARGS[@]}" "${CTX_ARGS[@]}" -r "$CTX_REPS" )
|
||||
|
||||
printf "\n▶ [%s] %s%s%s\n" "$ENV" "$MODEL_NAME" "${SUFFIX:+ $SUFFIX}" "${CTX_SUFFIX:+ $CTX_SUFFIX}"
|
||||
printf "\n▶ %s\n" "$LABEL"
|
||||
printf " → log: %s\n" "$OUT"
|
||||
printf " → cmd: %s\n\n" "${FULL_CMD[*]}"
|
||||
|
||||
if ! "${FULL_CMD[@]}" >"$OUT" 2>&1; then
|
||||
status=$?
|
||||
echo "✖ ! [${ENV}] ${MODEL_NAME}${SUFFIX}${CTX_SUFFIX:+ $CTX_SUFFIX} failed (exit ${status})" >>"$OUT"
|
||||
echo " * [${ENV}] ${MODEL_NAME}${SUFFIX}${CTX_SUFFIX:+ $CTX_SUFFIX} : FAILED"
|
||||
fi
|
||||
run_bench_with_recovery "$ENV" "$CONTAINER" "$OUT" "$LABEL" "${FULL_CMD[@]}"
|
||||
rc=$?
|
||||
|
||||
case $rc in
|
||||
0) echo " ✔ $LABEL : OK" ;;
|
||||
1) echo " * $LABEL : FAILED" ;;
|
||||
2) echo " ⛔ $LABEL : SYSTEM FAILURE — aborting all"
|
||||
ABORT_ALL=1
|
||||
break 3 # break out of CTX, FA, and ENV loops
|
||||
;;
|
||||
esac
|
||||
done
|
||||
done
|
||||
done
|
||||
done
|
||||
|
||||
if (( ABORT_ALL )); then
|
||||
echo ""
|
||||
echo "⛔ Benchmark run aborted due to unrecoverable system failure"
|
||||
echo " Manual fix: sudo systemctl restart user@$(id -u) && podman stop --all"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
echo ""
|
||||
echo "✅ All benchmarks complete"
|
||||
|
||||
Reference in New Issue
Block a user