#!/usr/bin/env python3
"""
Compare hipBLASLt-on vs hblt0-off benchmark runs.

This script inspects docs/results.json (the same dataset consumed by
docs/assets/index2.js) and reports, for every backend that was benchmarked
both with and without the `-hblt0` suffix, which configuration wins.
"""

from __future__ import annotations

import argparse
import json
import math
import statistics
from collections import defaultdict
from pathlib import Path
from typing import Dict, Iterable, List, Tuple


DEFAULT_RESULTS = Path("docs") / "results.json"
# Matches the tolerance used in docs/assets/index2.js (MIN_TOL = 0.25)
DEFAULT_TOLERANCE = 0.25
# Suffix on a run's "env" value that marks it as the hblt0 variant.
HBLT0_SUFFIX = "-hblt0"

VariantValues = Dict[str, List[float]]
BackendMatrix = Dict[Tuple, VariantValues]


def parse_args() -> argparse.Namespace:
    """Parse the command-line options ``--results`` and ``--tolerance``."""
    parser = argparse.ArgumentParser(
        description=(
            "Pair benchmark runs with and without '-hblt0' and report which "
            "configuration is faster per backend."
        )
    )
    parser.add_argument(
        "--results",
        type=Path,
        default=DEFAULT_RESULTS,
        help="Path to results.json generated by the benchmark pipeline.",
    )
    parser.add_argument(
        "--tolerance",
        type=float,
        default=DEFAULT_TOLERANCE,
        # %(default)s keeps the help text in sync with DEFAULT_TOLERANCE.
        help="Minimum tokens/sec delta to treat as a win (default: %(default)s).",
    )
    return parser.parse_args()


def load_runs(path: Path) -> Iterable[dict]:
    """
    Load the list stored under the top-level ``"runs"`` key of a JSON file.

    Raises:
        ValueError: if the document is not a JSON object or its ``"runs"``
            entry is not a list (invalid JSON also surfaces as ValueError,
            via json.JSONDecodeError).
        OSError: if the file cannot be read.
    """
    # Read as UTF-8 explicitly so the result does not depend on the locale.
    data = json.loads(path.read_text(encoding="utf-8"))
    # A non-object top level (e.g. a bare array) has no .get(); report it with
    # the same ValueError instead of leaking an AttributeError.
    runs = data.get("runs") if isinstance(data, dict) else None
    if not isinstance(runs, list):
        raise ValueError(f"results.json at {path} does not contain a 'runs' array")
    return runs


def measurement_key(run: dict) -> Tuple:
    """Return a tuple that uniquely identifies a benchmark scenario."""
    return (
        (run.get("model_clean") or run.get("model") or "").lower(),
        run.get("test") or "",
        run.get("context") or "default",
        run.get("context_tokens") or 0,
        (run.get("quant") or "").upper(),
        run.get("fa"),
        run.get("rpc"),
        run.get("ngl"),
        run.get("backend"),
    )


def _is_usable_tps(value: object) -> bool:
    """Return True when *value* is a real, finite number (bools excluded)."""
    if isinstance(value, bool) or not isinstance(value, (int, float)):
        return False
    # Rejects NaN as well as +/-inf, either of which would poison the means.
    return math.isfinite(value)


def pair_runs(runs: Iterable[dict]) -> Tuple[Dict[str, BackendMatrix], Dict[str, Dict[str, int]]]:
    """
    Group runs by backend (without / with '-hblt0') and measurement key.

    Runs are skipped when they are not dicts, have a missing or non-string
    ``env``, carry a truthy ``error``, or have a ``tps_mean`` that is not a
    finite number.

    Returns:
        pairs: backend -> measurement_key -> {'hipblaslt': [...], 'hblt0': [...]}
        coverage: backend -> {'hipblaslt': raw_run_count, 'hblt0': raw_run_count}
    """
    pairs: Dict[str, BackendMatrix] = defaultdict(lambda: defaultdict(dict))
    coverage: Dict[str, Dict[str, int]] = defaultdict(lambda: {"hipblaslt": 0, "hblt0": 0})

    for run in runs:
        if not isinstance(run, dict):
            continue
        env = run.get("env")
        if not env or not isinstance(env, str):
            continue
        if run.get("error"):
            continue
        tps = run.get("tps_mean")
        if not _is_usable_tps(tps):
            continue

        is_hblt0 = env.endswith(HBLT0_SUFFIX)
        # Strip the suffix so both variants share one backend bucket.
        base_env = env[: -len(HBLT0_SUFFIX)] if is_hblt0 else env
        variant = "hblt0" if is_hblt0 else "hipblaslt"

        key = measurement_key(run)
        entry = pairs[base_env][key]
        entry.setdefault(variant, []).append(float(tps))
        coverage[base_env][variant] += 1

    return pairs, coverage


def summarize_backend(
    backend: str,
    matrix: BackendMatrix,
    tolerance: float,
    coverage: Dict[str, int],
) -> dict | None:
    """
    Summarize one backend's hipBLASLt vs hblt0 results.

    Only measurement keys that have values for both variants are compared;
    repeated values for a key are averaged first. A per-key difference counts
    as a win only when it exceeds *tolerance*, otherwise it is a tie.

    Returns:
        A dict with the win/tie counts, the per-variant averages, their
        absolute and percentage delta, a verdict string and the *coverage*
        dict passed in, or None when no key has both variants.
    """
    pairs: List[Tuple[float, float]] = []

    for entry in matrix.values():
        if "hipblaslt" not in entry or "hblt0" not in entry:
            continue
        hip = statistics.mean(entry["hipblaslt"])
        hbl = statistics.mean(entry["hblt0"])
        pairs.append((hip, hbl))

    if not pairs:
        return None

    hip_wins = sum(1 for hip, hbl in pairs if (hip - hbl) > tolerance)
    hbl_wins = sum(1 for hip, hbl in pairs if (hbl - hip) > tolerance)
    ties = len(pairs) - hip_wins - hbl_wins

    avg_hip = statistics.mean(hip for hip, _ in pairs)
    avg_hbl = statistics.mean(hbl for _, hbl in pairs)
    avg_delta = avg_hip - avg_hbl
    # A zero hblt0 average would divide by zero; report infinity instead.
    pct_delta = (avg_delta / avg_hbl * 100.0) if avg_hbl else float("inf")

    if avg_delta > tolerance:
        verdict = "hipBLASLt faster"
    elif avg_delta < -tolerance:
        verdict = "hblt0 faster"
    else:
        verdict = "too close to call"

    return {
        "backend": backend,
        "pairs": len(pairs),
        "hip_wins": hip_wins,
        "hbl_wins": hbl_wins,
        "ties": ties,
        "avg_hip": avg_hip,
        "avg_hbl": avg_hbl,
        "avg_delta": avg_delta,
        "pct_delta": pct_delta,
        "verdict": verdict,
        "coverage": coverage,
    }


def format_summary(summary: dict) -> str:
    """Render a summary dict from summarize_backend() as one report line."""
    cov = summary["coverage"]
    hip_runs = cov.get("hipblaslt", 0)
    hbl_runs = cov.get("hblt0", 0)
    return (
        f"{summary['backend']}: {summary['verdict']} "
        f"(Δ {summary['avg_delta']:+.2f} tps / {summary['pct_delta']:+.2f}% "
        f"across {summary['pairs']} matched cases; "
        f"hipBLASLt wins {summary['hip_wins']}, hblt0 wins {summary['hbl_wins']}, "
        f"ties {summary['ties']}; raw runs hipBLASLt={hip_runs}, hblt0={hbl_runs})"
    )


def main() -> None:
    """Load the results file and print one comparison line per backend."""
    args = parse_args()
    try:
        runs = load_runs(args.results)
    except (OSError, ValueError) as exc:
        # Missing/unreadable file or malformed JSON: exit with a short message
        # (non-zero status) rather than a traceback.
        raise SystemExit(f"error: {exc}") from exc
    matrices, coverage = pair_runs(runs)

    summaries = []
    for backend in sorted(matrices):
        summary = summarize_backend(backend, matrices[backend], args.tolerance, coverage.get(backend, {}))
        if summary:
            summaries.append(summary)

    if not summaries:
        print("No matching hipBLASLt vs hblt0 pairs were found.")
        return

    for summary in summaries:
        print(format_summary(summary))


if __name__ == "__main__":
    main()