Benchmark and container updates

2025-08-03 13:05:52 +01:00
parent e295685041
commit e7e27e6cf3
130 changed files with 12111 additions and 267 deletions
@@ -0,0 +1,120 @@
+#!/usr/bin/env python3
+import re, glob, os
+
+# This script parses llama-bench logs in 'results/' to produce
+# Markdown tables for pp512 (prompt processing) and tg128 (text generation).
+
+# Regex patterns to extract tokens/sec rows
+PP_RE = re.compile(r"\|[^|]*\|[^|]*\|[^|]*\|[^|]*\|[^|]*\|\s*pp512\s*\|\s*([\d.]+)\s*±\s*([\d.]+)")
+TG_RE = re.compile(r"\|[^|]*\|[^|]*\|[^|]*\|[^|]*\|[^|]*\|\s*tg128\s*\|\s*([\d.]+)\s*±\s*([\d.]+)")
+
+# Patterns to classify errors
+LOAD_ERR = re.compile(r"failed to load model|Device memory allocation.*failed", re.IGNORECASE)
+HANG_ERR = re.compile(r"GPU Hang|HW Exception", re.IGNORECASE)
+GENERIC_ERR = re.compile(r"error:|exit \d+", re.IGNORECASE)
+
+# Env ordering
+ENV_ORDER = ["vulkan_radv","vulkan_amdvlk","rocm6_4_2","rocm7_beta","rocm7_rc"]
+
+data = {}
+
+# Utility to clean model names
+def clean_name(raw):
+    return re.sub(r"-000\d+-of-000\d+", "", raw)
+
+# Scan logs
+glob_pattern = os.path.join("results", "*.log")
+for path in sorted(glob.glob(glob_pattern)):
+    # Fix: use rsplit, not rssplit
+    base = os.path.basename(path).rsplit('.log',1)[0]
+    if '__' not in base:
+        continue
+    model_raw, env = base.split('__',1)
+    model = clean_name(model_raw)
+
+    text = open(path, errors='ignore').read()
+    # Determine error type
+    if LOAD_ERR.search(text):
+        err_type = 'load'
+    elif HANG_ERR.search(text):
+        err_type = 'hang'
+    elif GENERIC_ERR.search(text) and not (PP_RE.search(text) and TG_RE.search(text)):
+        err_type = 'runtime'
+    else:
+        err_type = None
+
+    # Extract performance if no load error
+    pp_match = PP_RE.search(text) if err_type is None else None
+    tg_match = TG_RE.search(text) if err_type is None else None
+
+    for key, match in [('pp512', pp_match), ('tg128', tg_match)]:
+        cell = {
+            'mean': match.group(1) if match else None,
+            'std':  match.group(2) if match else None,
+            'error': err_type is not None,
+            'etype': err_type
+        }
+        data.setdefault(model, {}).setdefault(key, {})[env] = cell
+
+# Select winner
+def pick_winner(env_data):
+    scores = {e: float(d['mean']) for e,d in env_data.items() if not d['error'] and d['mean']}
+    if not scores:
+        return '—'
+    best = max(scores, key=scores.get)
+    others = [v for k,v in scores.items() if k!=best]
+    tag = f"🏆 **{best}**"
+    if others:
+        gain = (scores[best]/max(others)-1)*100
+        tag += f" (+{gain:.0f}%)"
+    return tag
+
+# Render table with distinct error messages
+def render_table(test_label, display_name):
+    print(f"### {display_name} — tokens/second\n")
+    header = ['Model'] + [e.replace('_',' ').title() for e in ENV_ORDER] + ['Winner']
+    print("| " + " | ".join(header) + " |")
+    print("|" + "|".join(['---']*len(header)) + "|")
+
+    for model in sorted(data, key=lambda s: s.lower()):
+        row = [f"**{model}**"]
+        env_data = data[model].get(test_label, {})
+        for env in ENV_ORDER:
+            d = env_data.get(env)
+            if not d:
+                cell = '—'
+            elif d['error']:
+                et = d['etype']
+                if et=='load':
+                    cell = '⚠️ Load Error'
+                elif et=='hang':
+                    cell = '⚠️ GPU Hang'
+                else:
+                    cell = '⚠️ Runtime Error'
+            else:
+                cell = f"{float(d['mean']):.2f} ± {float(d['std']):.2f}"
+            row.append(cell)
+        row.append(pick_winner(env_data))
+        print("| " + " | ".join(row) + " |")
+    print()
+
+# Output tables
+render_table('pp512','Prompt Processing (pp512)')
+render_table('tg128','Text Generation (tg128)')
+
+# Summary of failures by type
+fail_lines = []
+for model in sorted(data, key=lambda s: s.lower()):
+    for test_label, envs in data[model].items():
+        for env,d in envs.items():
+            if d['error']:
+                et = d['etype'] or 'unknown'
+                desc = {
+                    'load':'failed to load',
+                    'hang':'GPU hang',
+                    'runtime':'runtime error',
+                }.get(et, 'error')
+                fail_lines.append(f"- **{model}** [{test_label}] on *{env}*: {desc}")
+if fail_lines:
+    print("## Failed Runs\n")
+    print("\n".join(fail_lines))