190 lines
5.6 KiB
Python
190 lines
5.6 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Compare hipBLASLt-enabled benchmark runs against their `-hblt0` (hipBLASLt-off) counterparts.
|
|
|
|
This script inspects docs/results.json (the same dataset consumed by
|
|
docs/assets/index2.js) and reports, for every backend that was benchmarked
|
|
both with and without the `-hblt0` suffix, which configuration wins.
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import argparse
|
|
import json
|
|
import math
|
|
import statistics
|
|
from collections import defaultdict
|
|
from pathlib import Path
|
|
from typing import Dict, Iterable, List, Tuple
|
|
|
|
|
|
# Relative path of the dataset that is read when --results is not given.
DEFAULT_RESULTS = Path("docs", "results.json")
# Matches the tolerance used in docs/assets/index2.js (MIN_TOL = 0.25)
DEFAULT_TOLERANCE = 0.25

# Variant name ('hipblaslt' or 'hblt0') -> list of tokens/sec samples.
VariantValues = Dict[str, List[float]]
# Measurement key tuple -> the per-variant samples collected for it.
BackendMatrix = Dict[Tuple, VariantValues]
|
|
|
|
|
|
def parse_args() -> argparse.Namespace:
    """Parse the command line and return the resulting namespace.

    Two options are recognised: ``--results`` (a path, defaulting to
    ``DEFAULT_RESULTS``) and ``--tolerance`` (a float, defaulting to
    ``DEFAULT_TOLERANCE``).
    """
    summary = (
        "Pair benchmark runs with and without '-hblt0' and report which "
        "configuration is faster per backend."
    )
    cli = argparse.ArgumentParser(description=summary)

    # Each entry is (flag, keyword arguments forwarded to add_argument).
    option_table = (
        (
            "--results",
            {
                "type": Path,
                "default": DEFAULT_RESULTS,
                "help": "Path to results.json generated by the benchmark pipeline.",
            },
        ),
        (
            "--tolerance",
            {
                "type": float,
                "default": DEFAULT_TOLERANCE,
                "help": "Minimum tokens/sec delta to treat as a win (default: 0.25).",
            },
        ),
    )
    for flag, settings in option_table:
        cli.add_argument(flag, **settings)

    return cli.parse_args()
|
|
|
|
|
|
def load_runs(path: Path) -> Iterable[dict]:
    """Load the list of benchmark runs stored in a results.json file.

    Args:
        path: Location of the JSON document. It is decoded as UTF-8 so the
            result does not depend on the platform's locale encoding.

    Returns:
        The list found under the top-level ``"runs"`` key.

    Raises:
        ValueError: If the document is not a JSON object, or its ``"runs"``
            entry is missing or is not a list.
        OSError: If the file cannot be read.
        json.JSONDecodeError: If the file is not valid JSON.
    """
    data = json.loads(path.read_text(encoding="utf-8"))
    # A top-level array or scalar has no .get(); treat it like a missing key
    # so callers always see the same ValueError for a malformed document.
    runs = data.get("runs") if isinstance(data, dict) else None
    if not isinstance(runs, list):
        raise ValueError(f"results.json at {path} does not contain a 'runs' array")
    return runs
|
|
|
|
|
|
def measurement_key(run: dict) -> Tuple:
    """Build the tuple used to match runs that measured the same scenario.

    The tuple holds, in order: lower-cased model name (``model_clean`` with
    ``model`` as fallback), test, context (``"default"`` when falsy),
    context_tokens (``0`` when falsy), upper-cased quant, then the raw
    ``fa``, ``rpc``, ``ngl`` and ``backend`` values.
    """
    fetch = run.get
    model_name = fetch("model_clean") or fetch("model") or ""
    quant_label = fetch("quant") or ""

    scenario = [
        model_name.lower(),
        fetch("test") or "",
        fetch("context") or "default",
        fetch("context_tokens") or 0,
        quant_label.upper(),
    ]
    # These fields are taken verbatim (None when absent).
    scenario.extend(fetch(field) for field in ("fa", "rpc", "ngl", "backend"))
    return tuple(scenario)
|
|
|
|
|
|
def pair_runs(runs: Iterable[dict]) -> Tuple[Dict[str, BackendMatrix], Dict[str, Dict[str, int]]]:
    """
    Group runs by backend (without / with '-hblt0') and measurement key.

    A run is ignored when it is not a dict, has no non-empty string ``env``,
    carries a truthy ``error``, or its ``tps_mean`` is not a finite real
    number (bools, NaN and +/-inf are rejected so they cannot skew the means
    computed later).

    Returns:
        pairs: backend -> measurement_key -> {'hipblaslt': [...], 'hblt0': [...]}
        coverage: backend -> {'hipblaslt': raw_run_count, 'hblt0': raw_run_count}
    """
    suffix = "-hblt0"
    pairs: Dict[str, BackendMatrix] = defaultdict(lambda: defaultdict(dict))
    coverage: Dict[str, Dict[str, int]] = defaultdict(lambda: {"hipblaslt": 0, "hblt0": 0})

    for run in runs:
        if not isinstance(run, dict):
            continue
        env = run.get("env")
        # endswith()/slicing below require a real, non-empty string.
        if not isinstance(env, str) or not env:
            continue
        if run.get("error"):
            continue
        tps = run.get("tps_mean")
        # bool is a subclass of int, so it has to be excluded explicitly.
        if isinstance(tps, bool) or not isinstance(tps, (int, float)):
            continue
        if not math.isfinite(tps):
            continue

        is_hblt0 = env.endswith(suffix)
        # Strip the suffix so both variants land under the same backend name.
        base_env = env[: -len(suffix)] if is_hblt0 else env
        variant = "hblt0" if is_hblt0 else "hipblaslt"

        samples = pairs[base_env][measurement_key(run)]
        samples.setdefault(variant, []).append(float(tps))
        coverage[base_env][variant] += 1

    return pairs, coverage
|
|
|
|
|
|
def summarize_backend(
    backend: str,
    matrix: BackendMatrix,
    tolerance: float,
    coverage: Dict[str, int],
) -> dict | None:
    """Summarise how the two variants compare for one backend.

    Only measurement keys that have at least one sample for both
    ``'hipblaslt'`` and ``'hblt0'`` are considered; the samples of each
    variant are averaged to form one (hipblaslt, hblt0) pair per key.

    Args:
        backend: Name stored in the summary under ``"backend"``.
        matrix: measurement key -> {'hipblaslt': [...], 'hblt0': [...]}.
        tolerance: Absolute tokens/sec difference that must be exceeded for
            a pair (or the overall average) to count as a win.
        coverage: Passed through unchanged under ``"coverage"``.

    Returns:
        A dict with the keys ``backend``, ``pairs``, ``hip_wins``,
        ``hbl_wins``, ``ties``, ``avg_hip``, ``avg_hbl``, ``avg_delta``,
        ``pct_delta``, ``verdict`` and ``coverage``; or ``None`` when no
        measurement key has data for both variants.
    """
    # .get() + truthiness also skips empty sample lists, which would make
    # statistics.mean raise StatisticsError.
    pairs: List[Tuple[float, float]] = [
        (statistics.mean(entry["hipblaslt"]), statistics.mean(entry["hblt0"]))
        for entry in matrix.values()
        if entry.get("hipblaslt") and entry.get("hblt0")
    ]
    if not pairs:
        return None

    hip_wins = sum(1 for hip, hbl in pairs if (hip - hbl) > tolerance)
    hbl_wins = sum(1 for hip, hbl in pairs if (hbl - hip) > tolerance)
    ties = len(pairs) - hip_wins - hbl_wins

    avg_hip = statistics.mean(hip for hip, _ in pairs)
    avg_hbl = statistics.mean(hbl for _, hbl in pairs)
    avg_delta = avg_hip - avg_hbl

    if avg_hbl:
        pct_delta = avg_delta / avg_hbl * 100.0
    elif avg_delta:
        # Relative change against a zero baseline is unbounded; keep the sign.
        pct_delta = float("inf") if avg_delta > 0 else float("-inf")
    else:
        # Both averages are zero: there is no difference to report.
        pct_delta = 0.0

    if avg_delta > tolerance:
        verdict = "hipBLASLt faster"
    elif avg_delta < -tolerance:
        verdict = "hblt0 faster"
    else:
        verdict = "too close to call"

    return {
        "backend": backend,
        "pairs": len(pairs),
        "hip_wins": hip_wins,
        "hbl_wins": hbl_wins,
        "ties": ties,
        "avg_hip": avg_hip,
        "avg_hbl": avg_hbl,
        "avg_delta": avg_delta,
        "pct_delta": pct_delta,
        "verdict": verdict,
        "coverage": coverage,
    }
|
|
|
|
|
|
def format_summary(summary: dict) -> str:
    """Render one backend summary as a single human-readable line.

    ``summary`` must provide the keys ``coverage``, ``backend``, ``verdict``,
    ``avg_delta``, ``pct_delta``, ``pairs``, ``hip_wins``, ``hbl_wins`` and
    ``ties``. Raw run counts missing from ``coverage`` are shown as 0.
    """
    raw_counts = summary["coverage"]
    with_lt = raw_counts.get("hipblaslt", 0)
    without_lt = raw_counts.get("hblt0", 0)

    segments = [
        f"{summary['backend']}: {summary['verdict']} ",
        f"(Δ {summary['avg_delta']:+.2f} tps / {summary['pct_delta']:+.2f}% ",
        f"across {summary['pairs']} matched cases; ",
        f"hipBLASLt wins {summary['hip_wins']}, hblt0 wins {summary['hbl_wins']}, ",
        f"ties {summary['ties']}; raw runs hipBLASLt={with_lt}, hblt0={without_lt})",
    ]
    return "".join(segments)
|
|
|
|
|
|
def main() -> None:
    """Run the comparison and print one line per backend with matched pairs.

    Backends are processed in sorted order; when none yields a summary, a
    single explanatory message is printed instead.
    """
    options = parse_args()
    matrices, coverage = pair_runs(load_runs(options.results))

    candidates = (
        summarize_backend(name, matrices[name], options.tolerance, coverage.get(name, {}))
        for name in sorted(matrices)
    )
    # summarize_backend returns None for backends with no matched pairs.
    reports = [report for report in candidates if report]

    if reports:
        for report in reports:
            print(format_summary(report))
    else:
        print("No matching hipBLASLt vs hblt0 pairs were found.")
|
|
|
|
|
|
# Run the comparison only when this file is executed as a script.
if __name__ == "__main__":
    main()
|