Files
amd-strix-halo-toolboxes/benchmark/summarize_results.py
T
2025-08-09 11:25:44 +01:00

78 lines
2.6 KiB
Python

#!/usr/bin/env python3
import json
from collections import defaultdict
from statistics import mean
# CONFIG
TOLERANCE_MULTIPLIER = 1.0  # multiplier for std dev to count as "within best"


def within_tolerance(best_mean, best_std, contender_mean, contender_std, *, tolerance=None):
    """Return True when a contender's mean is within tolerance of the best mean.

    The contender counts as a winner when
    ``contender_mean >= best_mean - tolerance * best_std``.

    Args:
        best_mean: Mean of the best entry.
        best_std: Standard deviation of the best entry.
        contender_mean: Mean of the entry being compared against the best.
        contender_std: Accepted so existing call sites keep working; it does
            not take part in the comparison.
        tolerance: Optional multiplier applied to ``best_std``. When omitted,
            the module-level ``TOLERANCE_MULTIPLIER`` is read at call time.

    Returns:
        bool: Whether the contender is at or above the tolerance threshold.
    """
    if tolerance is None:
        tolerance = TOLERANCE_MULTIPLIER
    return contender_mean >= (best_mean - tolerance * best_std)
# --- Load data ---
# The path is resolved against the current working directory.
with open("../docs/results.json", encoding="utf-8") as results_file:
    data = json.loads(results_file.read())

# Every benchmark run record lives under the top-level "runs" key.
runs = data["runs"]
# --- Group by benchmark type ---
# Map each tracked test name to the list of its runs that carry no error.
benchmarks = defaultdict(list)
for run in runs:
    keep = not run["error"] and run["test"] in ("pp512", "tg128")
    if keep:
        benchmarks[run["test"]].append(run)
def _backend_label(entry):
    """Return the display label for a run: its env name plus the FA on/off marker."""
    return f"{entry['env']}{' (FA on)' if entry['fa'] else ' (FA off)'}"


# --- Summarize each benchmark type ---
# For every benchmark type this records, per backend label:
#   * "winners":      how many entries were the best, or within tolerance of
#                     the best, for their model (sorted by count, descending)
#   * "avg_perf":     the mean of all tps_mean values, rounded to 2 decimals
#   * "total_models": the number of distinct model_clean values seen
summary = {}
for bench_type, results in benchmarks.items():
    winners_count = defaultdict(int)
    backend_perf = defaultdict(list)

    # Group results by model
    models = defaultdict(list)
    for r in results:
        models[r["model_clean"]].append(r)

    for entries in models.values():
        # Only entries with a measured tps_mean can be ranked; passing a None
        # mean to max() raises TypeError as soon as it is compared to a number.
        scored = [e for e in entries if e["tps_mean"] is not None]
        if not scored:
            continue  # nothing measurable for this model

        # Find the best mean
        best_entry = max(scored, key=lambda x: x["tps_mean"])
        best_mean = best_entry["tps_mean"]
        best_std = best_entry["tps_std"] or 0  # a None std is treated as 0

        for e in scored:
            label = _backend_label(e)
            # Collect performance data for average TPS
            backend_perf[label].append(e["tps_mean"])
            # Count every entry that is within tolerance of the best
            if within_tolerance(best_mean, best_std, e["tps_mean"], e["tps_std"] or 0):
                winners_count[label] += 1

    # Store summary
    summary[bench_type] = {
        "winners": dict(sorted(winners_count.items(), key=lambda x: -x[1])),
        "avg_perf": {k: round(mean(v), 2) for k, v in backend_perf.items()},
        "total_models": len(models),
    }
# --- Print human-readable analysis ---
# Sections are emitted in a fixed order; benchmark types without data are omitted.
for bench_type in ("pp512", "tg128"):
    if bench_type in summary:
        stats = summary[bench_type]
        print(f"\n=== {bench_type.upper()} ===")
        print(f"Models tested: {stats['total_models']}")

        print("Winner counts (within tolerance):")
        for backend, count in stats["winners"].items():
            print(f" {backend}: {count} models")

        print("Average throughput (tokens/sec):")
        # Highest average first
        ranked = sorted(stats["avg_perf"].items(), key=lambda pair: -pair[1])
        for backend, avg in ranked:
            print(f" {backend}: {avg}")