Adding new benchmarks

This commit is contained in:
Donato Capitella
2025-08-09 11:25:44 +01:00
parent 8972ef01ff
commit bc9483b75d
5 changed files with 312 additions and 395 deletions
+95 -85
View File
@@ -1,108 +1,118 @@
#!/usr/bin/env python3
import re, glob, os, argparse
import json
from pathlib import Path
# Matches a pipe-delimited row whose sixth cell is "pp512"; groups 1 and 2
# capture the two numbers on either side of "±" at the start of the next cell.
PP_RE = re.compile(r"\|[^|]*\|[^|]*\|[^|]*\|[^|]*\|[^|]*\|\s*pp512\s*\|\s*([\d.]+)\s*±\s*([\d.]+)")
# Same row layout as PP_RE, but for rows whose sixth cell is "tg128".
TG_RE = re.compile(r"\|[^|]*\|[^|]*\|[^|]*\|[^|]*\|[^|]*\|\s*tg128\s*\|\s*([\d.]+)\s*±\s*([\d.]+)")
# Case-insensitive marker for model-load / device-memory-allocation failures.
LOAD_ERR = re.compile(r"failed to load model|Device memory allocation.*failed", re.IGNORECASE)
# Case-insensitive marker for "GPU Hang" / "HW Exception" messages.
HANG_ERR = re.compile(r"GPU Hang|HW Exception", re.IGNORECASE)
# Case-insensitive catch-all: any "error:" text or "exit <digits>".
GEN_ERR = re.compile(r"error:|exit \d+", re.IGNORECASE)
# --- Config ---
# Location of the aggregated benchmark results (relative to the working directory).
RESULTS_JSON = Path("../docs/results.json")

# Order of the environment columns in the generated table.
# Defined exactly once: a stale second assignment used to follow this list and
# silently removed "rocm6_4_2-rocwmma" again, even though the label tables
# contain an entry for it.
ENV_ORDER = [
    "vulkan_amdvlk",
    "vulkan_radv",
    "rocm6_4_2",
    "rocm6_4_2-rocwmma",
    "rocm7_beta",
    "rocm7_rc",
]
# Column header shown for each environment key.
# Each key appears once; the literal previously repeated five of the six keys,
# relying on "last duplicate wins".
COL_NAMES = {
    "vulkan_amdvlk": "Vulkan (AMDVLK)",
    "vulkan_radv": "Vulkan (RADV)",
    "rocm6_4_2": "ROCm 6.4.2",
    "rocm6_4_2-rocwmma": "ROCm 6.4.2 + ROCWMMA",
    "rocm7_beta": "ROCm 7.0 Beta",
    "rocm7_rc": "ROCm 7.0 RC",
}
# Short label shown in the "Best PP" / "Best TG" columns for each environment key.
WINNER_LABELS = {
    "vulkan_amdvlk": "AMDVLK",
    "vulkan_radv": "RADV",
    "rocm6_4_2": "ROCm6.4.2",
    "rocm6_4_2-rocwmma": "ROCm6.4.2+ROCWMMA",
    "rocm7_beta": "ROCm7 Beta",
    "rocm7_rc": "ROCm7 RC",
}

# Former name of the table, kept as an alias so existing WINNER[...] lookups
# keep working. The old literal was left unclosed, which made the file unparsable.
WINNER = WINNER_LABELS
# (display name, model-name prefix) pairs rendered when no models are given on
# the command line. Each model is listed once; the list previously contained
# every entry twice, which would emit every table row twice.
DEFAULT_MODELS = [
    ("Gemma3 12B Q8_0", "gemma-3-12b-it-UD-Q8_K_XL"),
    ("Gemma3 27B BF16", "gemma-3-27b-it-BF16"),
    ("Llama-4-Scout 17B Q8_0", "Llama-4-Scout-17B-16E-Instruct-Q8_0"),
    ("Llama-4-Scout 17B Q4_K XL", "Llama-4-Scout-17B-16E-Instruct-UD-Q4_K_XL"),
    ("Qwen3 30B BF16", "Qwen3-30B-A3B-BF16"),
    ("Qwen3-235B Q3_K XL", "Qwen3-235B-A22B-Instruct-2507-UD-Q3_K_XL"),
    ("GLM-4.5-Air-Q4_K_XL", "GLM-4.5-Air-UD-Q4_K_XL"),
    ("GLM-4.5-Air-Q6_K_XL", "GLM-4.5-Air-UD-Q6_K_XL"),
    ("gpt-oss-120b-mxfp4", "gpt-oss-120b-mxfp4"),
    ("gpt-oss-20b-mxfp4", "gpt-oss-20b-mxfp4"),
]
def CLEAN(s):
    """Return *s* with every ``-000<digits>-of-000<digits>`` substring removed."""
    return re.sub(r"-000\d+-of-000\d+", "", s)
# Table-cell text for each known error type.
ERROR_LABELS = {
    "load": "⚠️ Load Error",
    "hang": "⚠️ GPU Hang",
    "runtime": "⚠️ Runtime Error",
}
def parse_logs():
    """Collect pp512/tg128 results from ``results/*.log``.

    Only files named ``<model>__<env>.log`` are considered. The model part is
    normalised with ``CLEAN`` and used as the top-level key.

    Returns:
        ``{model: {"pp512": {env: entry}, "tg128": {env: entry}}}`` where each
        entry is ``{"mean": float | None, "error": bool, "etype": str | None}``.
        ``etype`` is ``"load"``, ``"hang"``, ``"runtime"`` or ``None``; ``mean``
        is ``None`` whenever an error was detected or no result row matched.
    """
    data = {}
    for p in glob.glob(os.path.join("results", "*.log")):
        base = os.path.basename(p)[:-4]  # drop the ".log" suffix
        if "__" not in base:
            continue
        model_raw, env = base.split("__", 1)
        key = CLEAN(model_raw)

        # Close the handle deterministically, and decode explicitly so that the
        # "±" in PP_RE/TG_RE matches regardless of the process locale.
        # NOTE(review): assumes the logs are UTF-8 - confirm against the producer.
        with open(p, encoding="utf-8", errors="ignore") as fh:
            t = fh.read()

        pp = PP_RE.search(t)
        tg = TG_RE.search(t)

        # Load and hang errors take precedence; a generic error only counts
        # when at least one of the two result rows is missing.
        et = None
        if LOAD_ERR.search(t):
            et = "load"
        elif HANG_ERR.search(t):
            et = "hang"
        elif GEN_ERR.search(t) and not (pp and tg):
            et = "runtime"

        def entry(match):
            # One fresh dict per benchmark so the two entries never alias.
            mean = float(match.group(1)) if (match and et is None) else None
            return {"mean": mean, "error": et is not None, "etype": et}

        data.setdefault(key, {"pp512": {}, "tg128": {}})
        data[key]["pp512"][env] = entry(pp)
        data[key]["tg128"][env] = entry(tg)
    return data
# --- Helpers ---
def load_results(path=None):
    """Load the list of benchmark runs from a results JSON file.

    Args:
        path: Optional file to read; defaults to ``RESULTS_JSON``.

    Returns:
        The value stored under the top-level ``"runs"`` key.

    Raises:
        OSError: if the file cannot be opened.
        json.JSONDecodeError: if the file is not valid JSON.
        KeyError: if the document has no ``"runs"`` key.
    """
    source = Path(RESULTS_JSON if path is None else path)
    # JSON interchange is UTF-8; do not depend on the locale's default encoding.
    with source.open(encoding="utf-8") as fh:
        return json.load(fh)["runs"]
def best(env_data):
    """Return the env whose entry has the highest usable ``mean``, or ``None``.

    Entries flagged with ``error`` or whose ``mean`` is ``None`` are ignored.
    On a tie, the env encountered first wins.
    """
    top_env = None
    top_mean = None
    for env, entry in env_data.items():
        if entry["error"] or entry["mean"] is None:
            continue
        if top_env is None or entry["mean"] > top_mean:
            top_env, top_mean = env, entry["mean"]
    return top_env
def cell(pp, tg):
    """Render one table cell from a pp512 entry and a tg128 entry.

    Returns an empty string when either entry is ``None``, an error label when
    either entry is flagged as an error, and ``"<pp> pp / <tg> tg"`` otherwise.
    """
    if pp is None or tg is None:
        return ""
    if not (pp["error"] or tg["error"]):
        return f"{int(round(pp['mean']))} pp / {tg['mean']:.1f} tg"
    # First non-empty error type wins; an untyped error counts as "runtime".
    kind = pp["etype"] or tg["etype"] or "runtime"
    labels = {"load":"⚠️ Load Error","hang":"⚠️ GPU Hang","runtime":"⚠️ Runtime Error"}
    return labels.get(kind, "⚠️ Error")
def find_key(keys, prefix):
    """Return the first item of *keys* that starts with *prefix*, or ``None``."""
    return next((candidate for candidate in keys if candidate.startswith(prefix)), None)
def filter_runs(runs, model_prefix, env):
    """Return the first run for *env* whose ``model_clean`` starts with *model_prefix*.

    Returns ``None`` when no run matches. The run's ``test`` field is not
    considered.
    """
    matching = (
        run
        for run in runs
        if run["model_clean"].startswith(model_prefix) and run["env"] == env
    )
    return next(matching, None)
def format_cell(pp_run, tg_run):
    """Render one table cell from a pp512 run and a tg128 run.

    Returns an empty string when either run is missing or has no ``tps_mean``,
    an error label when either run is flagged as an error, and
    ``"<pp> pp / <tg> tg"`` otherwise.
    """
    if not (pp_run and tg_run):
        return ""
    if pp_run["error"] or tg_run["error"]:
        kind = pp_run["error_type"] or tg_run["error_type"]
        return ERROR_LABELS.get(kind, "⚠️ Error")
    pp_tps = pp_run["tps_mean"]
    if pp_tps is None:
        return ""
    tg_tps = tg_run["tps_mean"]
    if tg_tps is None:
        return ""
    return f"{int(round(pp_tps))} pp / {tg_tps:.1f} tg"
def find_winner(runs, model_prefix, bench_type, env_order=None):
    """Return the environment with the highest ``tps_mean`` for one benchmark.

    For each environment, the first run matching the model prefix, the
    environment AND ``bench_type`` is considered. The lookup used to ignore the
    test type and then discard the run if it belonged to the other benchmark,
    so one of the two benchmarks never produced a winner.

    Args:
        runs: Iterable of run dicts with ``model_clean``, ``env``, ``test``,
            ``error`` and ``tps_mean`` keys.
        model_prefix: Prefix that ``model_clean`` must start with.
        bench_type: Value of the run's ``test`` field, e.g. ``"pp512"``.
        env_order: Environments to consider, in tie-breaking order; defaults
            to the module-level ``ENV_ORDER``.

    Returns:
        The winning environment key, or ``None`` if no environment has a
        usable result. Ties go to the environment listed first.
    """
    if env_order is None:
        env_order = ENV_ORDER

    vals = {}
    for env in env_order:
        run = next(
            (
                r
                for r in runs
                if r["env"] == env
                and r["test"] == bench_type
                and r["model_clean"].startswith(model_prefix)
            ),
            None,
        )
        # Errored runs and runs without a measurement cannot win.
        if run and not run["error"] and run["tps_mean"] is not None:
            vals[env] = run["tps_mean"]

    if not vals:
        return None
    return max(vals, key=vals.get)
# --- Main ---
def _find_run(runs, model_prefix, env, test):
    """Return the first run matching model prefix, env and test name, or None."""
    return next(
        (
            r
            for r in runs
            if r["model_clean"].startswith(model_prefix)
            and r["env"] == env
            and r["test"] == test
        ),
        None,
    )


def main():
    """Print a Markdown table of pp512/tg128 results per model and environment.

    Model prefixes given on the command line select (and label) the rows;
    without arguments the rows come from ``DEFAULT_MODELS``. Results are read
    via ``load_results()``.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("models", nargs="*", help="Optional model prefixes to include")
    args = ap.parse_args()

    # Honour the documented positional argument instead of always using the defaults.
    want = [(m, m) for m in args.models] if args.models else DEFAULT_MODELS
    runs = load_results()

    header = ["Model"] + [COL_NAMES[e] for e in ENV_ORDER] + ["🏆 Best PP", "🏆 Best TG"]
    print("| " + " | ".join(header) + " |")
    print("|" + "|".join(["---"] * len(header)) + "|")

    for disp_name, model_prefix in want:
        row = [f"**{disp_name}**"]
        for env in ENV_ORDER:
            # pp and tg are stored as separate runs; pair them up per environment.
            pp_env_run = _find_run(runs, model_prefix, env, "pp512")
            tg_env_run = _find_run(runs, model_prefix, env, "tg128")
            row.append(format_cell(pp_env_run, tg_env_run))

        bpp = find_winner(runs, model_prefix, "pp512")
        btg = find_winner(runs, model_prefix, "tg128")
        row.append(f"🏆 **{WINNER_LABELS[bpp]}**" if bpp else "")
        row.append(f"🏆 **{WINNER_LABELS[btg]}**" if btg else "")
        print("| " + " | ".join(row) + " |")

    print("\nFull interactive results: [Live Benchmark Viewer](https://your-live-results-url)")


if __name__ == "__main__":
    main()