Adding new benchmarks

2025-08-09 11:25:44 +01:00
parent 8972ef01ff
commit bc9483b75d
5 changed files with 312 additions and 395 deletions
@@ -1,108 +1,118 @@
 #!/usr/bin/env python3
-import re, glob, os, argparse
+import json
+from pathlib import Path

-PP_RE = re.compile(r"\|[^|]*\|[^|]*\|[^|]*\|[^|]*\|[^|]*\|\s*pp512\s*\|\s*([\d.]+)\s*±\s*([\d.]+)")
-TG_RE = re.compile(r"\|[^|]*\|[^|]*\|[^|]*\|[^|]*\|[^|]*\|\s*tg128\s*\|\s*([\d.]+)\s*±\s*([\d.]+)")
-LOAD_ERR = re.compile(r"failed to load model|Device memory allocation.*failed", re.IGNORECASE)
-HANG_ERR = re.compile(r"GPU Hang|HW Exception", re.IGNORECASE)
-GEN_ERR  = re.compile(r"error:|exit \d+", re.IGNORECASE)
+# --- Config ---
+# Aggregated benchmark results read by load_results(). The path is relative,
+# so it is resolved against the current working directory at run time.
+RESULTS_JSON = Path("../docs/results.json")
+
+# Backend environments, in the order their columns appear in the printed table.
+# main() looks every entry up in COL_NAMES, so each key needs a label there.
+ENV_ORDER = [
+    "vulkan_amdvlk",
+    "vulkan_radv",
+    "rocm6_4_2",
+    "rocm6_4_2-rocwmma",
+    "rocm7_beta",
+    "rocm7_rc"
+]

-ENV_ORDER = ["vulkan_amdvlk","vulkan_radv","rocm6_4_2","rocm7_beta","rocm7_rc"]
 COL_NAMES = {
-    "vulkan_amdvlk":"Vulkan (AMDVLK)",
-    "vulkan_radv":"Vulkan (RADV)",
-    "rocm6_4_2":"ROCm 6.4.2",
-    "rocm7_beta":"ROCm 7.0 Beta",
-    "rocm7_rc":"ROCm 7.0 RC",
+    "vulkan_amdvlk": "Vulkan (AMDVLK)",
+    "vulkan_radv": "Vulkan (RADV)",
+    "rocm6_4_2": "ROCm 6.4.2",
+    "rocm6_4_2-rocwmma": "ROCm 6.4.2 + ROCWMMA",
+    "rocm7_beta": "ROCm 7.0 Beta",
+    "rocm7_rc": "ROCm 7.0 RC"
 }
-WINNER = {
-    "vulkan_amdvlk":"AMDVLK",
-    "vulkan_radv":"RADV",
-    "rocm6_4_2":"ROCm6.4.2",
-    "rocm7_beta":"ROCm7 Beta",
-    "rocm7_rc":"ROCm7 RC",
+
+WINNER_LABELS = {
+    "vulkan_amdvlk": "AMDVLK",
+    "vulkan_radv": "RADV",
+    "rocm6_4_2": "ROCm6.4.2",
+    "rocm6_4_2-rocwmma": "ROCm6.4.2+ROCWMMA",
+    "rocm7_beta": "ROCm7 Beta",
+    "rocm7_rc": "ROCm7 RC"
 }

 DEFAULT_MODELS = [
-    ("Gemma3 12B Q8_0",                  "gemma-3-12b-it-UD-Q8_K_XL"),
-    ("Gemma3 27B BF16",                  "gemma-3-27b-it-BF16"),
-    ("Llama-4-Scout 17B Q8_0",           "Llama-4-Scout-17B-16E-Instruct-Q8_0"),
-    ("Llama-4-Scout 17B Q4_K XL",        "Llama-4-Scout-17B-16E-Instruct-UD-Q4_K_XL"),
-    ("Qwen3 30B BF16",                    "Qwen3-30B-A3B-BF16"),
-    ("Qwen3-235B Q3_K XL",               "Qwen3-235B-A22B-Instruct-2507-UD-Q3_K_XL"),
-    ("GLM-4.5-Air-UD-Q4_K_XL",           "GLM-4.5-Air-UD-Q4_K_XL"),
-    ("GLM-4.5-Air-UD-Q6_K_XL",           "GLM-4.5-Air-UD-Q6_K_XL"),
-    ("gpt-oss-120b-mxfp4",               "gpt-oss-120b-mxfp4"),
-    ("gpt-oss-20b-mxfp4",                "gpt-oss-20b-mxfp4"),
+    ("Gemma3 12B Q8_0", "gemma-3-12b-it-UD-Q8_K_XL"),
+    ("Gemma3 27B BF16", "gemma-3-27b-it-BF16"),
+    ("Llama-4-Scout 17B Q8_0", "Llama-4-Scout-17B-16E-Instruct-Q8_0"),
+    ("Llama-4-Scout 17B Q4_K XL", "Llama-4-Scout-17B-16E-Instruct-UD-Q4_K_XL"),
+    ("Qwen3 30B BF16", "Qwen3-30B-A3B-BF16"),
+    ("Qwen3-235B Q3_K XL", "Qwen3-235B-A22B-Instruct-2507-UD-Q3_K_XL"),
+    ("GLM-4.5-Air-Q4_K_XL", "GLM-4.5-Air-UD-Q4_K_XL"),
+    ("GLM-4.5-Air-Q6_K_XL", "GLM-4.5-Air-UD-Q6_K_XL"),
+    ("gpt-oss-120b-mxfp4", "gpt-oss-120b-mxfp4"),
+    ("gpt-oss-20b-mxfp4", "gpt-oss-20b-mxfp4"),
 ]

-CLEAN = lambda s: re.sub(r"-000\d+-of-000\d+", "", s)
+# Cell text shown for a failed run, keyed by the run's "error_type" value;
+# format_cell() falls back to a generic label for any other value.
+ERROR_LABELS = {
+    "load": "⚠️ Load Error",
+    "hang": "⚠️ GPU Hang",
+    "runtime": "⚠️ Runtime Error"
+}

-def parse_logs():
-    data = {}
-    for p in glob.glob(os.path.join("results","*.log")):
-        base = os.path.basename(p)[:-4]
-        if "__" not in base:
-            continue
-        model_raw, env = base.split("__", 1)
-        key = CLEAN(model_raw)
-        t = open(p, errors="ignore").read()
-        pp = PP_RE.search(t)
-        tg = TG_RE.search(t)
-        et = None
-        if LOAD_ERR.search(t): et = "load"
-        elif HANG_ERR.search(t): et = "hang"
-        elif GEN_ERR.search(t) and not (pp and tg): et = "runtime"
-        data.setdefault(key, {"pp512": {}, "tg128": {}})
-        data[key]["pp512"][env] = {"mean": float(pp.group(1)) if (pp and et is None) else None,
-                                   "error": et is not None, "etype": et}
-        data[key]["tg128"][env] = {"mean": float(tg.group(1)) if (tg and et is None) else None,
-                                   "error": et is not None, "etype": et}
-    return data
+# --- Helpers ---
+def load_results():
+    """Load the benchmark runs from the results file.
+
+    Returns:
+        The value stored under the top-level ``"runs"`` key of RESULTS_JSON.
+
+    Raises:
+        OSError: If RESULTS_JSON cannot be read.
+        json.JSONDecodeError: If the file is not valid JSON.
+        KeyError: If the document has no ``"runs"`` key.
+    """
+    # Decode explicitly as UTF-8: read_text() otherwise falls back to the
+    # locale's preferred encoding, which is not UTF-8 on every platform.
+    data = json.loads(Path(RESULTS_JSON).read_text(encoding="utf-8"))
+    return data["runs"]

-def best(env_data):
-    vals = {e:d["mean"] for e,d in env_data.items() if (not d["error"]) and d["mean"] is not None}
-    return max(vals, key=vals.get) if vals else None
-
-def cell(pp, tg):
-    if (pp is None) or (tg is None):
-        return "—"
-    if pp["error"] or tg["error"]:
-        m = pp["etype"] or tg["etype"] or "runtime"
-        return {"load":"⚠️ Load Error","hang":"⚠️ GPU Hang","runtime":"⚠️ Runtime Error"}.get(m, "⚠️ Error")
-    return f"{int(round(pp['mean']))} pp / {tg['mean']:.1f} tg"
-
-def find_key(keys, prefix):
-    for k in keys:
-        if k.startswith(prefix):
-            return k
+def filter_runs(runs, model_prefix, env):
+    """Return the first run for a model prefix and environment, else None.
+
+    A run matches when its ``"model_clean"`` value starts with
+    ``model_prefix`` and its ``"env"`` value equals ``env``. The run's
+    ``"test"`` field is not considered, so the result may belong to any
+    benchmark type.
+    """
+    for candidate in runs:
+        if not candidate["model_clean"].startswith(model_prefix):
+            continue
+        if candidate["env"] == env:
+            return candidate
    return None

+def format_cell(pp_run, tg_run):
+    """Render one table cell from a prompt-processing and a generation run.
+
+    Returns "—" when either run is missing or has no mean throughput, an
+    error label when either run is flagged as failed, and otherwise
+    "<pp> pp / <tg> tg" with the pp mean rounded to an integer and the tg
+    mean shown with one decimal place.
+    """
+    if not (pp_run and tg_run):
+        return "—"
+
+    if pp_run["error"] or tg_run["error"]:
+        # The pp run's error type takes precedence over the tg run's.
+        error_type = pp_run["error_type"] or tg_run["error_type"]
+        return ERROR_LABELS.get(error_type, "⚠️ Error")
+
+    pp_tps = pp_run["tps_mean"]
+    if pp_tps is None:
+        return "—"
+    tg_tps = tg_run["tps_mean"]
+    if tg_tps is None:
+        return "—"
+
+    return f"{int(round(pp_tps))} pp / {tg_tps:.1f} tg"
+
+def find_winner(runs, model_prefix, bench_type, envs=None):
+    """Return the environment with the highest mean throughput, or None.
+
+    Args:
+        runs: Iterable of run dicts with "model_clean", "env", "test",
+            "error" and "tps_mean" keys.
+        model_prefix: Prefix that a run's "model_clean" must start with.
+        bench_type: Benchmark to rank, compared against each run's "test".
+        envs: Environments to consider, in tie-break order. Defaults to
+            ENV_ORDER.
+
+    Returns:
+        The winning environment key, or None when no environment has a
+        successful run with a mean for this model and benchmark. On a tie
+        the environment listed first wins.
+    """
+    candidate_envs = ENV_ORDER if envs is None else envs
+    scores = {}
+    for env in candidate_envs:
+        # The benchmark type has to be part of the lookup itself. Taking the
+        # first run for the model/env and checking its type afterwards drops
+        # every environment whose first stored run is the other benchmark.
+        run = next(
+            (
+                r for r in runs
+                if r["model_clean"].startswith(model_prefix)
+                and r["env"] == env
+                and r["test"] == bench_type
+            ),
+            None,
+        )
+        if run and not run["error"] and run["tps_mean"] is not None:
+            scores[env] = run["tps_mean"]
+    if not scores:
+        return None
+    return max(scores, key=scores.get)
+
+# --- Main ---
 def main():
+    """Print the benchmark summary as a Markdown table on stdout.
+
+    One row is printed per DEFAULT_MODELS entry and one column per ENV_ORDER
+    backend, followed by the best pp512 and best tg128 backend for the model.
+    """
-    ap = argparse.ArgumentParser()
-    ap.add_argument("models", nargs="*", help="Optional model prefixes to include")
-    args = ap.parse_args()
-    data = parse_logs()
-    want = [(m,m) for m in args.models] if args.models else DEFAULT_MODELS
+    runs = load_results()

-    header = ["Model"] + [COL_NAMES[e] for e in ENV_ORDER] + ["🏆 Best PP","🏆 Best TG"]
+    header = ["Model"] + [COL_NAMES[e] for e in ENV_ORDER] + ["🏆 Best PP", "🏆 Best TG"]
    print("| " + " | ".join(header) + " |")
-    print("|" + "|".join(["---"]*len(header)) + "|")
+    print("|" + "|".join(["---"] * len(header)) + "|")

-    for disp, patt in want:
-        key = find_key(data.keys(), patt)
-        row = [f"**{disp}**"]
-        if not key:
-            row += ["—"]*len(ENV_ORDER) + ["—","—"]
-            print("| " + " | ".join(row) + " |")
-            continue
-        ppd, tgd = data[key]["pp512"], data[key]["tg128"]
+    for disp_name, model_prefix in DEFAULT_MODELS:
+        row = [f"**{disp_name}**"]
         for env in ENV_ORDER:
-            row.append(cell(ppd.get(env), tgd.get(env)))
-        bpp, btg = best(ppd), best(tgd)
-        row.append(f"🏆 **{WINNER[bpp]}**" if bpp else "—")
-        row.append(f"🏆 **{WINNER[btg]}**" if btg else "—")
+            # Runs recorded for this model on this backend. Each run holds a
+            # single benchmark type, so pick the pp512 and tg128 runs apart.
+            env_runs = [
+                r for r in runs
+                if r["model_clean"].startswith(model_prefix) and r["env"] == env
+            ]
+            pp_run = next((r for r in env_runs if r["test"] == "pp512"), None)
+            tg_run = next((r for r in env_runs if r["test"] == "tg128"), None)
+            row.append(format_cell(pp_run, tg_run))
+
+        bpp = find_winner(runs, model_prefix, "pp512")
+        btg = find_winner(runs, model_prefix, "tg128")
+        row.append(f"🏆 **{WINNER_LABELS[bpp]}**" if bpp else "—")
+        row.append(f"🏆 **{WINNER_LABELS[btg]}**" if btg else "—")
+
         print("| " + " | ".join(row) + " |")

+    # NOTE(review): the URL below looks like a template placeholder — confirm
+    # the real viewer address before publishing the generated table.
+    print("\nFull interactive results: [Live Benchmark Viewer](https://your-live-results-url)")
+
 if __name__ == "__main__":
    main()
@@ -0,0 +1,77 @@
+#!/usr/bin/env python3
+import json
+from collections import defaultdict
+from statistics import mean
+
+# CONFIG
+# Number of standard deviations below the best mean that still count as
+# "within best".
+TOLERANCE_MULTIPLIER = 1.0
+
+def within_tolerance(best_mean, best_std, contender_mean, contender_std):
+    """Return True when the contender's mean is close enough to the best.
+
+    The cut-off is ``best_mean - TOLERANCE_MULTIPLIER * best_std``; a
+    contender at or above it counts as a winner. ``contender_std`` is
+    accepted but does not take part in the comparison.
+    """
+    cutoff = best_mean - TOLERANCE_MULTIPLIER * best_std
+    return cutoff <= contender_mean
+
+# --- Load data ---
+# The path is relative, so it is resolved against the current working
+# directory when the script runs.
+with open("../docs/results.json", encoding="utf-8") as f:
+    data = json.load(f)
+
+runs = data["runs"]
+
+# --- Group by benchmark type ---
+# Only error-free pp512 / tg128 runs take part in the analysis.
+benchmarks = defaultdict(list)
+for r in runs:
+    if r["error"]:
+        continue
+    if r["test"] in ("pp512", "tg128"):
+        benchmarks[r["test"]].append(r)
+
+# bench_type -> {"winners": ..., "avg_perf": ..., "total_models": ...}
+summary = {}
+
+for bench_type, results in benchmarks.items():
+    winners_count = defaultdict(int)  # backend label -> models won or tied
+    backend_perf = defaultdict(list)  # backend label -> mean TPS of each run
+
+    # Group results by model
+    models = defaultdict(list)
+    for r in results:
+        models[r["model_clean"]].append(r)
+
+    for entries in models.values():
+        # A run can be error-free and still carry no measurement. Drop those
+        # before ranking: max() raises TypeError as soon as it has to compare
+        # a None mean with a numeric one.
+        scored = [e for e in entries if e["tps_mean"] is not None]
+        if not scored:
+            continue
+
+        # Find the best mean
+        best_entry = max(scored, key=lambda x: x["tps_mean"])
+        best_mean = best_entry["tps_mean"]
+        best_std = best_entry["tps_std"] or 0
+
+        for e in scored:
+            label = f"{e['env']}{' (FA on)' if e['fa'] else ' (FA off)'}"
+            # Collect performance data for average TPS
+            backend_perf[label].append(e["tps_mean"])
+            # Every backend within tolerance of the best counts as a winner
+            if within_tolerance(best_mean, best_std, e["tps_mean"], e["tps_std"] or 0):
+                winners_count[label] += 1
+
+    # Store summary
+    summary[bench_type] = {
+        "winners": dict(sorted(winners_count.items(), key=lambda x: -x[1])),
+        "avg_perf": {k: round(mean(v), 2) for k, v in backend_perf.items()},
+        "total_models": len(models),
+    }
+
+# --- Print human-readable analysis ---
+for bench_type in ("pp512", "tg128"):
+    if bench_type not in summary:
+        continue
+    print(f"\n=== {bench_type.upper()} ===")
+    print(f"Models tested: {summary[bench_type]['total_models']}")
+    print("Winner counts (within tolerance):")
+    for backend, count in summary[bench_type]["winners"].items():
+        print(f"  {backend}: {count} models")
+    print("Average throughput (tokens/sec):")
+    for backend, avg in sorted(summary[bench_type]["avg_perf"].items(), key=lambda x: -x[1]):
+        print(f"  {backend}: {avg}")
+
@@ -1,147 +0,0 @@
-#!/usr/bin/env python3
-"""
-Script to remove host-related entries from log files and delete host files.
-"""
-
-import os
-import glob
-import shutil
-from pathlib import Path
-
-
-def remove_host_entries_from_log(log_file):
-    """
-    Remove all entries that start with '[host]' from the log file.
-    Each entry is separated by empty lines.
-    """
-    if not os.path.exists(log_file):
-        print(f"Log file {log_file} not found!")
-        return False
-    
-    # Create backup
-    backup_file = f"{log_file}.backup"
-    shutil.copy2(log_file, backup_file)
-    print(f"Created backup: {backup_file}")
-    
-    with open(log_file, 'r', encoding='utf-8') as f:
-        lines = f.readlines()
-    
-    filtered_lines = []
-    i = 0
-    
-    while i < len(lines):
-        line = lines[i].strip()
-        
-        # Check if this line starts a host entry
-        if line.startswith('▶ [host]'):
-            # Skip this entry by finding the next empty line or end of file
-            i += 1
-            while i < len(lines) and lines[i].strip() != '':
-                i += 1
-            # Skip the empty line too if we found one
-            if i < len(lines) and lines[i].strip() == '':
-                i += 1
-        else:
-            # Keep this line
-            filtered_lines.append(lines[i])
-            i += 1
-    
-    # Write the filtered content back
-    with open(log_file, 'w', encoding='utf-8') as f:
-        f.writelines(filtered_lines)
-    
-    print(f"Removed host entries from {log_file}")
-    return True
-
-
-def remove_host_files():
-    """Remove all files with 'host' in their filename."""
-    host_files = glob.glob('*host*')
-    
-    if not host_files:
-        print("No files with 'host' in filename found.")
-        return
-    
-    print("Files to be removed:")
-    for file in host_files:
-        print(f"  - {file}")
-    
-    for file in host_files:
-        try:
-            os.remove(file)
-            print(f"Removed: {file}")
-        except OSError as e:
-            print(f"Error removing {file}: {e}")
-
-
-def preview_host_entries(log_file):
-    """Preview what host entries would be removed."""
-    if not os.path.exists(log_file):
-        print(f"Log file {log_file} not found!")
-        return
-    
-    with open(log_file, 'r', encoding='utf-8') as f:
-        lines = f.readlines()
-    
-    print("Host entries that would be removed:")
-    print("-" * 50)
-    
-    i = 0
-    entry_count = 0
-    
-    while i < len(lines):
-        line = lines[i].strip()
-        
-        if line.startswith('▶ [host]'):
-            entry_count += 1
-            print(f"Entry {entry_count}:")
-            
-            # Print this entry until we hit an empty line
-            while i < len(lines) and lines[i].strip() != '':
-                print(lines[i].rstrip())
-                i += 1
-            print()  # Add empty line after entry
-        else:
-            i += 1
-    
-    print(f"Total host entries found: {entry_count}")
-
-
-def main():
-    log_file = "run_benchmarks.log"  # Change this to your actual log file name
-    
-    print("Host Entry and File Removal Script")
-    print("=" * 40)
-    
-    # Preview what would be removed
-    preview_host_entries(log_file)
-    
-    # Show files that would be removed
-    host_files = glob.glob('*host*')
-    if host_files:
-        print(f"\nFiles with 'host' in filename ({len(host_files)} found):")
-        for file in host_files:
-            print(f"  - {file}")
-    
-    print("\nThis script will:")
-    print(f"1. Remove host entries from log file: {log_file}")
-    print("2. Remove all files with 'host' in the filename")
-    
-    response = input("\nContinue? (y/N): ").strip().lower()
-    
-    if response == 'y' or response == 'yes':
-        # Remove host entries from log
-        if remove_host_entries_from_log(log_file):
-            print("✓ Host entries removed from log file")
-        
-        # Remove host files
-        remove_host_files()
-        print("✓ Host files removed")
-        
-        print("\nDone!")
-    else:
-        print("Aborted.")
-
-
-if __name__ == "__main__":
-    main()