diff --git a/README.md b/README.md index 11fafda..73afb27 100644 --- a/README.md +++ b/README.md @@ -161,16 +161,16 @@ PP = prompt processing (tokens/sec prefill), TG = token generation (tokens/sec i | Model | Vulkan (AMDVLK) | Vulkan (RADV) | ROCm 6.4.2 | ROCm 6.4.2 + ROCWMMA | ROCm 7.0 Beta | ROCm 7.0 RC | 🏆 Best PP | 🏆 Best TG | |---|---|---|---|---|---|---|---|---| -| **Gemma3 12B Q8_0** | 677 pp / 14.0 tg | 503 pp / 13.8 tg | 223 pp / 13.8 tg | 223 pp / 13.9 tg | 223 pp / 13.9 tg | 222 pp / 13.9 tg | 🏆 **AMDVLK** | — | -| **Gemma3 27B BF16** | — | 136 pp / 4.0 tg | 84 pp / 4.0 tg | 93 pp / 4.0 tg | 92 pp / 4.0 tg | 56 pp / 3.1 tg | 🏆 **RADV** | — | -| **Llama-4-Scout 17B Q8_0** | 258 pp / 12.2 tg | 169 pp / 12.3 tg | 135 pp / 11.6 tg | — | — | — | 🏆 **AMDVLK** | — | -| **Llama-4-Scout 17B Q4_K XL** | 218 pp / 20.0 tg | 152 pp / 20.0 tg | 138 pp / 17.4 tg | — | 139 pp / 17.6 tg | 124 pp / 17.6 tg | 🏆 **AMDVLK** | — | -| **Qwen3 30B BF16** | 107 pp / 8.0 tg | 86 pp / 7.4 tg | 158 pp / 23.9 tg | 158 pp / 24.5 tg | 153 pp / 24.5 tg | 152 pp / 24.6 tg | 🏆 **ROCm6.4.2+ROCWMMA** | — | -| **Qwen3-235B Q3_K XL** | 114 pp / 16.0 tg | 65 pp / 16.6 tg | 74 pp / 13.7 tg | — | — | — | 🏆 **AMDVLK** | — | -| **GLM-4.5-Air-Q4_K_XL** | 201 pp / 22.8 tg | 128 pp / 22.9 tg | 130 pp / 19.4 tg | — | — | 130 pp / 19.8 tg | 🏆 **AMDVLK** | — | -| **GLM-4.5-Air-Q6_K_XL** | 223 pp / 16.5 tg | 127 pp / 16.8 tg | 125 pp / 15.3 tg | 114 pp / 15.5 tg | 121 pp / 15.5 tg | 124 pp / 15.5 tg | 🏆 **AMDVLK** | — | -| **gpt-oss-120b-mxfp4** | 487 pp / 48.1 tg | 240 pp / 49.0 tg | 353 pp / 44.1 tg | 354 pp / 45.0 tg | 355 pp / 45.0 tg | 353 pp / 45.1 tg | 🏆 **AMDVLK** | — | -| **gpt-oss-20b-mxfp4** | 1205 pp / 68.8 tg | 649 pp / 69.9 tg | 583 pp / 64.5 tg | 581 pp / 64.5 tg | 584 pp / 64.4 tg | 582 pp / 64.5 tg | 🏆 **AMDVLK** | — | +| **Gemma3 12B Q8_0** | 677 pp / 14.0 tg | 503 pp / 13.8 tg | 223 pp / 13.8 tg | 230 pp / 13.9 tg | 223 pp / 13.9 tg | 222 pp / 13.9 tg | 🏆 **AMDVLK** | 🏆 **AMDVLK** | +| **Gemma3 27B BF16** | ⚠️ Load Error | 139 pp / 4.0 tg | 84 pp / 4.0 tg | 95 pp / 4.0 tg | 92 pp / 4.0 tg | 83 pp / 4.0 tg | 🏆 **RADV** | 🏆 **ROCm6.4.2+ROCWMMA** | +| **Llama-4-Scout 17B Q8_0** | 260 pp / 12.2 tg | 172 pp / 12.3 tg | 135 pp / 11.6 tg | ⚠️ GPU Hang | ⚠️ GPU Hang | ⚠️ Runtime Error | 🏆 **AMDVLK** | 🏆 **RADV** | +| **Llama-4-Scout 17B Q4_K XL** | 221 pp / 20.0 tg | 155 pp / 20.0 tg | 138 pp / 17.4 tg | ⚠️ GPU Hang | 139 pp / 17.6 tg | 124 pp / 17.6 tg | 🏆 **AMDVLK** | 🏆 **AMDVLK** | +| **Qwen3 30B BF16** | 108 pp / 8.0 tg | 87 pp / 7.4 tg | 158 pp / 24.3 tg | 162 pp / 24.5 tg | 153 pp / 24.5 tg | 152 pp / 24.6 tg | 🏆 **ROCm6.4.2+ROCWMMA** | 🏆 **ROCm7 RC** | +| **Qwen3-235B Q3_K XL** | 116 pp / 16.0 tg | 67 pp / 16.8 tg | 74 pp / 13.7 tg | ⚠️ GPU Hang | ⚠️ GPU Hang | ⚠️ Runtime Error | 🏆 **AMDVLK** | 🏆 **RADV** | +| **GLM-4.5-Air-Q4_K_XL** | 202 pp / 22.8 tg | 133 pp / 23.3 tg | 130 pp / 19.4 tg | ⚠️ GPU Hang | ⚠️ GPU Hang | 130 pp / 20.1 tg | 🏆 **AMDVLK** | 🏆 **RADV** | +| **GLM-4.5-Air-Q6_K_XL** | 225 pp / 16.5 tg | 132 pp / 17.0 tg | 125 pp / 15.3 tg | 114 pp / 15.5 tg | 121 pp / 15.5 tg | 124 pp / 15.5 tg | 🏆 **AMDVLK** | 🏆 **RADV** | +| **gpt-oss-120b-mxfp4** | 546 pp / 48.1 tg | 255 pp / 49.0 tg | 353 pp / 44.1 tg | 408 pp / 45.0 tg | 355 pp / 45.0 tg | 353 pp / 45.1 tg | 🏆 **AMDVLK** | 🏆 **RADV** | +| **gpt-oss-20b-mxfp4** | 1473 pp / 68.8 tg | 728 pp / 69.9 tg | 583 pp / 64.5 tg | 649 pp / 64.5 tg | 584 pp / 64.4 tg | 582 pp / 64.5 tg | 🏆 **AMDVLK** | 🏆 **RADV** | **Observations:** diff --git a/benchmark/generate_readme_summary.py b/benchmark/generate_readme_summary.py index f272eda..476351c 100644 --- a/benchmark/generate_readme_summary.py +++ b/benchmark/generate_readme_summary.py @@ -1,118 +1,174 @@ #!/usr/bin/env python3 -import json +import json, re +from collections import defaultdict from pathlib import Path -# --- Config --- -RESULTS_JSON = Path("../docs/results.json") +RESULTS_FILE = "../docs/results.json" +# Column order + labels ENV_ORDER = [ "vulkan_amdvlk", "vulkan_radv", "rocm6_4_2", "rocm6_4_2-rocwmma", "rocm7_beta", - "rocm7_rc" + "rocm7_rc", ] - COL_NAMES = { "vulkan_amdvlk": "Vulkan (AMDVLK)", "vulkan_radv": "Vulkan (RADV)", "rocm6_4_2": "ROCm 6.4.2", "rocm6_4_2-rocwmma": "ROCm 6.4.2 + ROCWMMA", "rocm7_beta": "ROCm 7.0 Beta", - "rocm7_rc": "ROCm 7.0 RC" + "rocm7_rc": "ROCm 7.0 RC", } - -WINNER_LABELS = { +WINNER_NAMES = { "vulkan_amdvlk": "AMDVLK", "vulkan_radv": "RADV", "rocm6_4_2": "ROCm6.4.2", "rocm6_4_2-rocwmma": "ROCm6.4.2+ROCWMMA", "rocm7_beta": "ROCm7 Beta", - "rocm7_rc": "ROCm7 RC" + "rocm7_rc": "ROCm7 RC", } - -DEFAULT_MODELS = [ - ("Gemma3 12B Q8_0", "gemma-3-12b-it-UD-Q8_K_XL"), - ("Gemma3 27B BF16", "gemma-3-27b-it-BF16"), - ("Llama-4-Scout 17B Q8_0", "Llama-4-Scout-17B-16E-Instruct-Q8_0"), - ("Llama-4-Scout 17B Q4_K XL", "Llama-4-Scout-17B-16E-Instruct-UD-Q4_K_XL"), - ("Qwen3 30B BF16", "Qwen3-30B-A3B-BF16"), - ("Qwen3-235B Q3_K XL", "Qwen3-235B-A22B-Instruct-2507-UD-Q3_K_XL"), - ("GLM-4.5-Air-Q4_K_XL", "GLM-4.5-Air-UD-Q4_K_XL"), - ("GLM-4.5-Air-Q6_K_XL", "GLM-4.5-Air-UD-Q6_K_XL"), - ("gpt-oss-120b-mxfp4", "gpt-oss-120b-mxfp4"), - ("gpt-oss-20b-mxfp4", "gpt-oss-20b-mxfp4"), -] - -ERROR_LABELS = { +ERROR_LABEL = { "load": "⚠️ Load Error", "hang": "⚠️ GPU Hang", - "runtime": "⚠️ Runtime Error" + "runtime": "⚠️ Runtime Error", } -# --- Helpers --- -def load_results(): - data = json.loads(Path(RESULTS_JSON).read_text()) - return data["runs"] +# Display name → fuzzy key (case/UD/shard-insensitive) +DEFAULT_MODELS = [ + ("Gemma3 12B Q8_0", "gemma-3-12b"), + ("Gemma3 27B BF16", "gemma-3-27b"), + ("Llama-4-Scout 17B Q8_0", "llama-4-scout-17b-16e-instruct-q8_0"), + ("Llama-4-Scout 17B Q4_K XL", "llama-4-scout-17b-16e-instruct-q4_k_xl"), + ("Qwen3 30B BF16", "qwen3-30b-a3b-bf16"), + ("Qwen3-235B Q3_K XL", "qwen3-235b-a22b"), + ("GLM-4.5-Air-Q4_K_XL", "glm-4.5-air-q4_k_xl"), + ("GLM-4.5-Air-Q6_K_XL", "glm-4.5-air-q6_k_xl"), + ("gpt-oss-120b-mxfp4", "gpt-oss-120b-mxfp4"), + ("gpt-oss-20b-mxfp4", "gpt-oss-20b-mxfp4"), +] -def filter_runs(runs, model_prefix, env): - for r in runs: - if r["model_clean"].startswith(model_prefix) and r["env"] == env: - return r +SHARD_RE = re.compile(r"-000\d+-of-000\d+", re.IGNORECASE) +def norm_model(s: str) -> str: + s = (s or "").lower().replace("_", "-") + s = SHARD_RE.sub("", s) + s = s.replace("-ud", "") # drop -UD tag for matching + return s + +# Load JSON +raw = json.loads(Path(RESULTS_FILE).read_text(encoding="utf-8")) +runs = raw["runs"] + +# Bucket rows by (model_key, env, test, fa) +buckets = defaultdict(list) +error_only = defaultdict(list) # (model_key, env) -> [error_type,...] for test=None rows +all_models = set() + +for r in runs: + env = r.get("env") + if env not in ENV_ORDER: + continue + mkey = norm_model(r.get("model_clean") or r.get("model") or "") + all_models.add(mkey) + test = r.get("test") # "pp512", "tg128", or None for pure errors + if test in ("pp512", "tg128"): + buckets[(mkey, env, test)].append(r) + else: + # capture error-only rows so we can show ⚠️ instead of "—" + if r.get("error"): + error_only[(mkey, env)].append(r.get("error_type") or "runtime") + +def pick_best(rows): + """Choose the best non-error row by tps_mean; if all error, return an error row.""" + best = None + best_val = -1 + fallback = None + for r in rows: + if r.get("error"): + fallback = r + continue + v = r.get("tps_mean") + if isinstance(v, (int, float)) and v > best_val: + best_val = v + best = r + return best or fallback + +# Build chosen results per (model, env): {pp: row|None, tg: row|None, err_only: str|None} +chosen = defaultdict(lambda: defaultdict(dict)) +for (mkey, env, test), rows in buckets.items(): + chosen_row = pick_best(rows) + chosen[mkey][env][test] = chosen_row + +for (mkey, env), etypes in error_only.items(): + if etypes: + # prefer specific types in a stable order + if "load" in etypes: + chosen[mkey][env]["error_only"] = "load" + elif "hang" in etypes: + chosen[mkey][env]["error_only"] = "hang" + else: + chosen[mkey][env]["error_only"] = "runtime" + +def format_cell(entry_dict): + pp = entry_dict.get("pp512") + tg = entry_dict.get("tg128") + + # If either chosen row is an error, show that error (web UI behavior) + for row in (pp, tg): + if row and row.get("error"): + return ERROR_LABEL.get(row.get("error_type") or "runtime", "⚠️ Error") + + # If both pp/tg missing but we have an error-only marker, show it + if not pp and not tg: + et = entry_dict.get("error_only") + if et: + return ERROR_LABEL.get(et, "⚠️ Error") + return "—" # truly absent + + # Otherwise, print available values (partial allowed) + def fmt(v): + return f"{int(round(v))}" if isinstance(v, (int, float)) else "—" + ppv = pp.get("tps_mean") if pp else None + tgv = tg.get("tps_mean") if tg else None + return f"{fmt(ppv)} pp / {tgv:.1f} tg" if isinstance(tgv, (int, float)) \ + else f"{fmt(ppv)} pp / — tg" + +def best_env_for(mkey, test): + best_env, best_val = None, -1 + for env in ENV_ORDER: + row = chosen[mkey].get(env, {}).get(test) + if not row or row.get("error"): + continue + v = row.get("tps_mean") + if isinstance(v, (int, float)) and v > best_val: + best_env, best_val = env, v + return best_env + +# Fuzzy match helper +def find_model_key(fuzzy): + needle = norm_model(fuzzy) + for k in all_models: + if needle in k: + return k return None -def format_cell(pp_run, tg_run): - if not pp_run or not tg_run: - return "—" - if pp_run["error"] or tg_run["error"]: - return ERROR_LABELS.get(pp_run["error_type"] or tg_run["error_type"], "⚠️ Error") - if pp_run["tps_mean"] is None or tg_run["tps_mean"] is None: - return "—" - return f"{int(round(pp_run['tps_mean']))} pp / {tg_run['tps_mean']:.1f} tg" +# Print table +header = ["Model"] + [COL_NAMES[e] for e in ENV_ORDER] + ["🏆 Best PP", "🏆 Best TG"] +print("| " + " | ".join(header) + " |") +print("|" + "|".join(["---"] * len(header)) + "|") -def find_winner(runs, model_prefix, bench_type): - vals = {} +for disp, fuzzy in DEFAULT_MODELS: + mkey = find_model_key(fuzzy) + if not mkey: + print("| " + " | ".join([f"**{disp}**"] + ["—"]*len(ENV_ORDER) + ["—","—"]) + " |") + continue + row = [f"**{disp}**"] for env in ENV_ORDER: - r = filter_runs(runs, model_prefix, env) - if r and not r["error"] and r["test"] == bench_type and r["tps_mean"] is not None: - vals[env] = r["tps_mean"] - if not vals: - return None - return max(vals, key=vals.get) - -# --- Main --- -def main(): - runs = load_results() - - header = ["Model"] + [COL_NAMES[e] for e in ENV_ORDER] + ["🏆 Best PP", "🏆 Best TG"] - print("| " + " | ".join(header) + " |") - print("|" + "|".join(["---"] * len(header)) + "|") - - for disp_name, model_prefix in DEFAULT_MODELS: - row = [f"**{disp_name}**"] - for env in ENV_ORDER: - pp_run = filter_runs(runs, model_prefix, env) - tg_run = filter_runs(runs, model_prefix, env) - pp = None - tg = None - if pp_run and pp_run["test"] == "pp512": - pp = pp_run - if tg_run and tg_run["test"] == "tg128": - tg = tg_run - # match pp and tg runs by env - pp_env_run = next((r for r in runs if r["model_clean"].startswith(model_prefix) and r["env"] == env and r["test"] == "pp512"), None) - tg_env_run = next((r for r in runs if r["model_clean"].startswith(model_prefix) and r["env"] == env and r["test"] == "tg128"), None) - row.append(format_cell(pp_env_run, tg_env_run)) - - bpp = find_winner(runs, model_prefix, "pp512") - btg = find_winner(runs, model_prefix, "tg128") - row.append(f"🏆 **{WINNER_LABELS[bpp]}**" if bpp else "—") - row.append(f"🏆 **{WINNER_LABELS[btg]}**" if btg else "—") - - print("| " + " | ".join(row) + " |") - - print("\nFull interactive results: [Live Benchmark Viewer](https://your-live-results-url)") - -if __name__ == "__main__": - main() + row.append(format_cell(chosen[mkey].get(env, {}))) + bpp = best_env_for(mkey, "pp512") + btg = best_env_for(mkey, "tg128") + row.append(f"🏆 **{WINNER_NAMES[bpp]}**" if bpp else "—") + row.append(f"🏆 **{WINNER_NAMES[btg]}**" if btg else "—") + print("| " + " | ".join(row) + " |")