chore: update benchmark execution script parameters and paths
This commit is contained in:
+236
-17
@@ -5,7 +5,198 @@ MODEL_DIR="$(realpath ~/models)"
|
|||||||
RESULTDIR="results"
|
RESULTDIR="results"
|
||||||
mkdir -p "$RESULTDIR"
|
mkdir -p "$RESULTDIR"
|
||||||
|
|
||||||
|
# ═══════════════════════════════════════════════════════════════════════════════
|
||||||
|
# OOM Recovery System
|
||||||
|
# ═══════════════════════════════════════════════════════════════════════════════
|
||||||
|
#
|
||||||
|
# When llama-bench gets OOM-killed, the Linux OOM killer can also kill
|
||||||
|
# systemd --user (same cgroup), which breaks podman's cgroup management.
|
||||||
|
# All subsequent toolbox commands silently fail with:
|
||||||
|
# "Error: unable to find user <user>: no matching entries in passwd file"
|
||||||
|
#
|
||||||
|
# Recovery requires:
|
||||||
|
# 1. sudo systemctl restart user@<uid> (restart dead systemd --user)
|
||||||
|
# 2. podman stop --all (clean up zombie containers)
|
||||||
|
#
|
||||||
|
# For unattended runs, add to /etc/sudoers.d/toolbox-recovery:
|
||||||
|
# <user> ALL=(ALL) NOPASSWD: /usr/bin/systemctl restart user@<uid>
|
||||||
|
# ═══════════════════════════════════════════════════════════════════════════════
|
||||||
|
|
||||||
|
# --- Recovery tuning ---
# Seconds to sleep after restarting systemd --user before continuing.
RECOVERY_WAIT_SECS=3
# Recovery attempts allowed before recover_toolbox_system gives up.
MAX_RECOVERY_ATTEMPTS=2
# Attempts used so far; bumped by recover_toolbox_system and reset to 0 by
# run_bench_with_recovery after a verified recovery.
_recovery_count=0

# --- Container / Backend Configuration ---
# Backend key -> toolbox container name.
declare -A CONTAINERS=()
CONTAINERS[rocm6_4_4]="llama-rocm-6.4.4"
CONTAINERS[rocm-7_2_3]="llama-rocm-7.2.3"
CONTAINERS[rocm7-nightlies]="llama-rocm7-nightlies"
CONTAINERS[vulkan_amdvlk]="llama-vulkan-amdvlk"
CONTAINERS[vulkan_radv]="llama-vulkan-radv"

# Backend key -> path of llama-bench inside that container.
declare -A BENCH_BINS=()
BENCH_BINS[rocm6_4_4]="/usr/local/bin/llama-bench"
BENCH_BINS[rocm-7_2_3]="/usr/local/bin/llama-bench"
BENCH_BINS[rocm7-nightlies]="/usr/local/bin/llama-bench"
BENCH_BINS[vulkan_amdvlk]="/usr/sbin/llama-bench"
BENCH_BINS[vulkan_radv]="/usr/sbin/llama-bench"
|
||||||
|
|
||||||
|
# --- Health Check & Recovery Functions ---
|
||||||
|
|
||||||
|
# Probe the per-user systemd manager.
# Returns: the exit status of `systemctl --user status`; stdout and stderr
#          are both discarded, so callers only see the status code.
check_systemd_user() {
  systemctl --user status >/dev/null 2>&1
}
|
||||||
|
|
||||||
|
# Verify that a toolbox container can still execute a command.
# Arguments: $1 - toolbox container name
# Returns:   0 if running `echo health_ok` inside the container produced
#            output containing "health_ok" (stdout or stderr), 1 otherwise.
#            The exit status of `toolbox run` itself is not consulted.
check_toolbox_health() {
  local container="$1"
  local probe_out

  probe_out="$(toolbox run -c "$container" echo "health_ok" 2>&1)"

  case "$probe_out" in
    *"health_ok"*) return 0 ;;
    *) return 1 ;;
  esac
}
|
||||||
|
|
||||||
|
# Attempt to bring toolbox/podman back after an OOM-induced failure.
# Globals:  _recovery_count (incremented on every call),
#           MAX_RECOVERY_ATTEMPTS, RECOVERY_WAIT_SECS (read)
# Outputs:  progress messages on stdout
# Returns:  0 when systemd --user is alive after cleanup,
#           1 when the attempt budget is exhausted, the sudo restart fails,
#             or systemd --user is dead again after stopping containers.
recover_toolbox_system() {
  _recovery_count=$(( _recovery_count + 1 ))

  # Guard: refuse to loop forever on a machine that cannot be repaired.
  if (( _recovery_count > MAX_RECOVERY_ATTEMPTS )); then
    echo " ✖ Max recovery attempts ($MAX_RECOVERY_ATTEMPTS) exceeded, giving up"
    echo " → Manual fix: sudo systemctl restart user@$(id -u) && podman stop --all"
    return 1
  fi

  local rule="🔧 ═══════════════════════════════════════════════════════════════"
  echo ""
  echo "$rule"
  echo "🔧 Toolbox/Podman recovery (attempt ${_recovery_count}/${MAX_RECOVERY_ATTEMPTS})"
  echo "$rule"

  # Step 1: restart systemd --user, but only when it is actually dead.
  if check_systemd_user; then
    echo " → systemd --user is alive"
  else
    echo " → systemd --user is dead, restarting..."
    if ! sudo systemctl restart "user@$(id -u)"; then
      echo " ✖ Failed to restart systemd --user"
      echo " → Ensure passwordless sudo is configured (see header comments)"
      return 1
    fi
    echo " ✔ systemd --user restarted"
    # Pause for the configured settle time before touching podman.
    sleep "$RECOVERY_WAIT_SECS"
  fi

  # Step 2: stop every container. Best effort: podman's stderr is discarded
  # and its exit status is deliberately not checked.
  echo " → Stopping all zombie containers..."
  podman stop --all 2>/dev/null
  sleep 1

  # Step 3: confirm the user manager is still alive after the cleanup.
  if ! check_systemd_user; then
    echo " ✖ systemd --user died again after container cleanup"
    return 1
  fi

  echo " ✔ Recovery complete"
  echo ""
  return 0
}
|
||||||
|
|
||||||
|
# Pre-flight checks before starting benchmarks.
# Globals:  CONTAINERS (read), USER (read, only for the sudoers hint)
# Outputs:  a short status report on stdout
# Returns:  0 when the checks finish; calls `exit 1` when a required
#           recovery (dead systemd --user or unhealthy toolbox) fails.
preflight_check() {
  echo "🔍 Pre-flight checks"
  echo "───────────────────────────────────────────────────"

  # Check sudo access for recovery. `sudo -n` never prompts, so this only
  # reports whether unattended recovery can work; it is not fatal.
  if sudo -n true &>/dev/null; then
    echo " ✔ Passwordless sudo available for auto-recovery"
  else
    echo " ⚠ Passwordless sudo NOT available"
    echo " → Auto-recovery will prompt for password (or fail in unattended mode)"
    echo " → Fix: sudo visudo -f /etc/sudoers.d/toolbox-recovery"
    echo " → Add: $USER ALL=(ALL) NOPASSWD: /usr/bin/systemctl restart user@$(id -u)"
  fi

  # Check systemd --user
  if check_systemd_user; then
    echo " ✔ systemd --user is running"
  else
    echo " ⚠ systemd --user is dead — recovering before start..."
    if ! recover_toolbox_system; then
      echo " ✖ Cannot recover toolbox system, aborting"
      exit 1
    fi
  fi

  # Spot-check one container.
  # The keys are copied into an indexed array first: "${!CONTAINERS[*]%% *}"
  # is NOT "first key" -- once an operator follows the subscript, bash treats
  # the leading "!" as indirect expansion and rejects the joined values as an
  # invalid variable name.
  local -a env_keys=("${!CONTAINERS[@]}")
  if (( ${#env_keys[@]} == 0 )); then
    echo " ⚠ No containers configured — skipping toolbox health check"
  else
    local first_container="${CONTAINERS[${env_keys[0]}]}"
    if check_toolbox_health "$first_container"; then
      echo " ✔ Toolbox health check passed ($first_container)"
    else
      echo " ⚠ Toolbox health check failed ($first_container) — recovering..."
      if ! recover_toolbox_system; then
        echo " ✖ Cannot recover toolbox system, aborting"
        exit 1
      fi
    fi
  fi

  echo ""
}
|
||||||
|
|
||||||
|
# Run one benchmark command, classify a failure, and repair the toolbox
# system when the failure was caused by a broken host rather than the model.
#
# Usage:   run_bench_with_recovery <env> <container> <out_file> <label> <cmd_args...>
# Globals: _recovery_count (reset to 0 after a verified recovery)
# Outputs: status messages on stdout; the command's stdout+stderr go to <out_file>
# Returns: 0 = benchmark succeeded
#          1 = benchmark failed on a healthy toolbox (a failure marker is
#              appended to <out_file>), OR the toolbox was broken, has been
#              recovered, and this benchmark was skipped (<out_file> removed)
#          2 = unrecoverable system failure (<out_file> removed)
#
# A benchmark is never retried after recovery: the same run would most likely
# OOM again. Removing the log on a system failure lets a later run retry it.
run_bench_with_recovery() {
  # $1 (env) is accepted for interface symmetry; the body does not use it.
  local env="$1"
  local container="$2"
  local out_file="$3"
  local label="$4"
  shift 4

  local -a bench_cmd=("$@")
  "${bench_cmd[@]}" >"$out_file" 2>&1
  local rc=$?

  # Happy path.
  (( rc != 0 )) || return 0

  # --- Failure: decide between a system issue and a genuine benchmark failure ---
  echo " ⚠ Benchmark exited with code $rc, checking system health..."

  if check_toolbox_health "$container"; then
    # The container still answers, so the benchmark itself failed.
    echo " → Toolbox is healthy — benchmark failure (OOM / model issue), moving on"
    echo "✖ ${label} failed (exit ${rc})" >>"$out_file"
    return 1
  fi

  # --- System failure detected: recover, then continue with the next benchmark ---
  echo " → Toolbox is broken — initiating recovery..."
  rm -f "$out_file" # Remove invalid log so it can be retried in a future run

  if ! recover_toolbox_system; then
    echo " ✖ Recovery failed — aborting"
    return 2
  fi

  # Recovery reported success; make sure this particular container works too.
  if ! check_toolbox_health "$container"; then
    echo " ✖ Container $container still broken after recovery"
    return 2
  fi

  _recovery_count=0 # Reset counter on successful recovery
  echo " ✔ System recovered — skipping this benchmark, continuing with next"
  return 1
}
|
||||||
|
|
||||||
|
# ═══════════════════════════════════════════════════════════════════════════════
|
||||||
# Capture system info
|
# Capture system info
|
||||||
|
# ═══════════════════════════════════════════════════════════════════════════════
|
||||||
if [[ ! -f "$RESULTDIR/system_info.json" ]]; then
|
if [[ ! -f "$RESULTDIR/system_info.json" ]]; then
|
||||||
python3 -c '
|
python3 -c '
|
||||||
import platform, json, datetime
|
import platform, json, datetime
|
||||||
@@ -40,6 +231,10 @@ print(json.dumps(info))
|
|||||||
echo "Captured system info to $RESULTDIR/system_info.json"
|
echo "Captured system info to $RESULTDIR/system_info.json"
|
||||||
fi
|
fi
|
||||||
|
|
||||||
|
# ═══════════════════════════════════════════════════════════════════════════════
|
||||||
|
# Discover models
|
||||||
|
# ═══════════════════════════════════════════════════════════════════════════════
|
||||||
|
|
||||||
# Pick exactly one .gguf per model: either
|
# Pick exactly one .gguf per model: either
|
||||||
# - any .gguf without "-000*-of-" (single-file models)
|
# - any .gguf without "-000*-of-" (single-file models)
|
||||||
# - or the first shard "*-00001-of-*.gguf"
|
# - or the first shard "*-00001-of-*.gguf"
|
||||||
@@ -60,21 +255,28 @@ for p in "${MODEL_PATHS[@]}"; do
|
|||||||
done
|
done
|
||||||
echo
|
echo
|
||||||
|
|
||||||
declare -A CMDS=(
|
# ═══════════════════════════════════════════════════════════════════════════════
|
||||||
[rocm6_4_4]="toolbox run -c llama-rocm-6.4.4 -- /usr/local/bin/llama-bench"
|
# Pre-flight & Main benchmark loop
|
||||||
[rocm-7_2_3]="toolbox run -c llama-rocm-7.2.3 -- /usr/local/bin/llama-bench"
|
# ═══════════════════════════════════════════════════════════════════════════════
|
||||||
[rocm7-nightlies]="toolbox run -c llama-rocm7-nightlies -- /usr/local/bin/llama-bench"
|
|
||||||
[vulkan_amdvlk]="toolbox run -c llama-vulkan-amdvlk -- /usr/sbin/llama-bench"
|
preflight_check
|
||||||
[vulkan_radv]="toolbox run -c llama-vulkan-radv -- /usr/sbin/llama-bench"
|
|
||||||
)
|
ABORT_ALL=0
|
||||||
|
|
||||||
for MODEL_PATH in "${MODEL_PATHS[@]}"; do
|
for MODEL_PATH in "${MODEL_PATHS[@]}"; do
|
||||||
MODEL_NAME="$(basename "$MODEL_PATH" .gguf)"
|
MODEL_NAME="$(basename "$MODEL_PATH" .gguf)"
|
||||||
|
|
||||||
for ENV in "${!CMDS[@]}"; do
|
for ENV in "${!CONTAINERS[@]}"; do
|
||||||
CMD_EFFECTIVE="${CMDS[$ENV]}"
|
if (( ABORT_ALL )); then
|
||||||
|
echo "⛔ Aborting due to unrecoverable system failure"
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
|
||||||
# run twice: baseline and with flash attention
|
CONTAINER="${CONTAINERS[$ENV]}"
|
||||||
|
BENCH_BIN="${BENCH_BINS[$ENV]}"
|
||||||
|
CMD_PREFIX=( toolbox run -c "$CONTAINER" -- "$BENCH_BIN" )
|
||||||
|
|
||||||
|
# run with flash attention
|
||||||
for FA in 1; do
|
for FA in 1; do
|
||||||
SUFFIX=""
|
SUFFIX=""
|
||||||
EXTRA_ARGS=()
|
EXTRA_ARGS=()
|
||||||
@@ -115,18 +317,35 @@ for MODEL_PATH in "${MODEL_PATHS[@]}"; do
|
|||||||
continue
|
continue
|
||||||
fi
|
fi
|
||||||
|
|
||||||
FULL_CMD=( $CMD_EFFECTIVE -ngl 99 -mmp 0 -m "$MODEL_PATH" "${EXTRA_ARGS[@]}" "${CTX_ARGS[@]}" -r "$CTX_REPS" )
|
LABEL="[${ENV}] ${MODEL_NAME}${SUFFIX}${CTX_SUFFIX:+ $CTX_SUFFIX}"
|
||||||
|
FULL_CMD=( "${CMD_PREFIX[@]}" -ngl 99 -mmp 0 -m "$MODEL_PATH" "${EXTRA_ARGS[@]}" "${CTX_ARGS[@]}" -r "$CTX_REPS" )
|
||||||
|
|
||||||
printf "\n▶ [%s] %s%s%s\n" "$ENV" "$MODEL_NAME" "${SUFFIX:+ $SUFFIX}" "${CTX_SUFFIX:+ $CTX_SUFFIX}"
|
printf "\n▶ %s\n" "$LABEL"
|
||||||
printf " → log: %s\n" "$OUT"
|
printf " → log: %s\n" "$OUT"
|
||||||
printf " → cmd: %s\n\n" "${FULL_CMD[*]}"
|
printf " → cmd: %s\n\n" "${FULL_CMD[*]}"
|
||||||
|
|
||||||
if ! "${FULL_CMD[@]}" >"$OUT" 2>&1; then
|
run_bench_with_recovery "$ENV" "$CONTAINER" "$OUT" "$LABEL" "${FULL_CMD[@]}"
|
||||||
status=$?
|
rc=$?
|
||||||
echo "✖ ! [${ENV}] ${MODEL_NAME}${SUFFIX}${CTX_SUFFIX:+ $CTX_SUFFIX} failed (exit ${status})" >>"$OUT"
|
|
||||||
echo " * [${ENV}] ${MODEL_NAME}${SUFFIX}${CTX_SUFFIX:+ $CTX_SUFFIX} : FAILED"
|
case $rc in
|
||||||
fi
|
0) echo " ✔ $LABEL : OK" ;;
|
||||||
|
1) echo " * $LABEL : FAILED" ;;
|
||||||
|
2) echo " ⛔ $LABEL : SYSTEM FAILURE — aborting all"
|
||||||
|
ABORT_ALL=1
|
||||||
|
break 3 # break out of CTX, FA, and ENV loops
|
||||||
|
;;
|
||||||
|
esac
|
||||||
done
|
done
|
||||||
done
|
done
|
||||||
done
|
done
|
||||||
done
|
done
|
||||||
|
|
||||||
|
if (( ABORT_ALL )); then
|
||||||
|
echo ""
|
||||||
|
echo "⛔ Benchmark run aborted due to unrecoverable system failure"
|
||||||
|
echo " Manual fix: sudo systemctl restart user@$(id -u) && podman stop --all"
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
|
||||||
|
echo ""
|
||||||
|
echo "✅ All benchmarks complete"
|
||||||
|
|||||||
Reference in New Issue
Block a user