78 lines
2.6 KiB
Python
78 lines
2.6 KiB
Python
#!/usr/bin/env python3
|
|
import json
|
|
from collections import defaultdict
|
|
from statistics import mean
|
|
|
|
# CONFIG
TOLERANCE_MULTIPLIER = 1.0  # multiplier for std dev to count as "within best"


def within_tolerance(
    best_mean, best_std, contender_mean, contender_std, *, tolerance=None
):
    """Return True if a contender's mean is close enough to the best mean.

    A contender counts as a winner when its mean is at least
    ``best_mean - tolerance * best_std``.

    Args:
        best_mean: Mean of the best entry.
        best_std: Standard deviation of the best entry; ``None`` is treated
            as 0, so only an exact tie (or better) qualifies.
        contender_mean: Mean of the entry being compared.
        contender_std: Accepted for interface compatibility; it is not used
            in the comparison.
        tolerance: Optional multiplier applied to ``best_std``. Defaults to
            the module-level ``TOLERANCE_MULTIPLIER``.

    Returns:
        bool: whether the contender falls inside the tolerance band.
    """
    if tolerance is None:
        # Read the constant at call time so edits to the config take effect.
        tolerance = TOLERANCE_MULTIPLIER
    threshold = best_mean - tolerance * (best_std or 0)
    return contender_mean >= threshold
|
|
|
|
# --- Load data ---
# The relative path is resolved against the current working directory,
# so the script has to be launched from a sibling directory of "docs".
with open("../docs/results.json", encoding="utf-8") as results_file:
    data = json.load(results_file)

# Every benchmark record lives under the top-level "runs" key.
runs = data["runs"]
|
|
|
|
# --- Group by benchmark type ---
# Keep only runs that finished without an error and that belong to one of
# the two tracked tests; everything else is ignored.
benchmarks = defaultdict(list)
for run in runs:
    if not run["error"] and run["test"] in ("pp512", "tg128"):
        benchmarks[run["test"]].append(run)
|
|
|
|
# Per-benchmark summary: winner counts, average throughput, and model count.
summary = {}

for bench_type, results in benchmarks.items():
    winners_count = defaultdict(int)  # label -> number of models it "won"
    backend_perf = defaultdict(list)  # label -> every measured tps_mean

    # Group results by model
    models = defaultdict(list)
    for r in results:
        models[r["model_clean"]].append(r)

    for entries in models.values():
        # Entries without a measured mean cannot be ranked. Dropping them
        # before max() also avoids comparing None with a number, which
        # raises TypeError as soon as a model has more than one entry.
        measured = [e for e in entries if e["tps_mean"] is not None]
        if not measured:
            continue

        # Find the best mean
        best_entry = max(measured, key=lambda e: e["tps_mean"])
        best_mean = best_entry["tps_mean"]
        best_std = best_entry["tps_std"] or 0  # missing std dev -> zero band

        for e in measured:
            label = f"{e['env']}{' (FA on)' if e['fa'] else ' (FA off)'}"

            # Collect performance data for average TPS
            backend_perf[label].append(e["tps_mean"])

            # Count every entry within tolerance of the best as a winner
            if within_tolerance(
                best_mean, best_std, e["tps_mean"], e["tps_std"] or 0
            ):
                winners_count[label] += 1

    # Store summary
    summary[bench_type] = {
        # Most frequent winners first; ties keep first-seen order.
        "winners": dict(sorted(winners_count.items(), key=lambda x: -x[1])),
        "avg_perf": {k: round(mean(v), 2) for k, v in backend_perf.items()},
        # Counts every model seen, including ones with no measured mean.
        "total_models": len(models),
    }
|
|
|
|
# --- Print human-readable analysis ---
# Benchmarks are reported in a fixed order; ones with no data are skipped.
for bench_type in ("pp512", "tg128"):
    if bench_type in summary:
        section = summary[bench_type]

        print(f"\n=== {bench_type.upper()} ===")
        print(f"Models tested: {section['total_models']}")

        # The winners mapping is already ordered by descending count.
        print("Winner counts (within tolerance):")
        for backend, count in section["winners"].items():
            print(f" {backend}: {count} models")

        # Fastest average throughput first.
        print("Average throughput (tokens/sec):")
        ranked = sorted(section["avg_perf"].items(), key=lambda pair: -pair[1])
        for backend, avg in ranked:
            print(f" {backend}: {avg}")
|
|
|