Benchmark and container updates
This commit is contained in:
@@ -0,0 +1,120 @@
|
||||
#!/usr/bin/env python3
|
||||
import re, glob, os
|
||||
|
||||
# This script parses llama-bench logs in 'results/' to produce
|
||||
# Markdown tables for pp512 (prompt processing) and tg128 (text generation).
|
||||
|
||||
# Regex patterns to extract tokens/sec rows
|
||||
PP_RE = re.compile(r"\|[^|]*\|[^|]*\|[^|]*\|[^|]*\|[^|]*\|\s*pp512\s*\|\s*([\d.]+)\s*±\s*([\d.]+)")
|
||||
TG_RE = re.compile(r"\|[^|]*\|[^|]*\|[^|]*\|[^|]*\|[^|]*\|\s*tg128\s*\|\s*([\d.]+)\s*±\s*([\d.]+)")
|
||||
|
||||
# Patterns to classify errors
|
||||
LOAD_ERR = re.compile(r"failed to load model|Device memory allocation.*failed", re.IGNORECASE)
|
||||
HANG_ERR = re.compile(r"GPU Hang|HW Exception", re.IGNORECASE)
|
||||
GENERIC_ERR = re.compile(r"error:|exit \d+", re.IGNORECASE)
|
||||
|
||||
# Env ordering
|
||||
ENV_ORDER = ["vulkan_radv","vulkan_amdvlk","rocm6_4_2","rocm7_beta","rocm7_rc"]
|
||||
|
||||
data = {}
|
||||
|
||||
# Utility to clean model names
|
||||
def clean_name(raw):
|
||||
return re.sub(r"-000\d+-of-000\d+", "", raw)
|
||||
|
||||
# Scan logs
|
||||
glob_pattern = os.path.join("results", "*.log")
|
||||
for path in sorted(glob.glob(glob_pattern)):
|
||||
# Fix: use rsplit, not rssplit
|
||||
base = os.path.basename(path).rsplit('.log',1)[0]
|
||||
if '__' not in base:
|
||||
continue
|
||||
model_raw, env = base.split('__',1)
|
||||
model = clean_name(model_raw)
|
||||
|
||||
text = open(path, errors='ignore').read()
|
||||
# Determine error type
|
||||
if LOAD_ERR.search(text):
|
||||
err_type = 'load'
|
||||
elif HANG_ERR.search(text):
|
||||
err_type = 'hang'
|
||||
elif GENERIC_ERR.search(text) and not (PP_RE.search(text) and TG_RE.search(text)):
|
||||
err_type = 'runtime'
|
||||
else:
|
||||
err_type = None
|
||||
|
||||
# Extract performance if no load error
|
||||
pp_match = PP_RE.search(text) if err_type is None else None
|
||||
tg_match = TG_RE.search(text) if err_type is None else None
|
||||
|
||||
for key, match in [('pp512', pp_match), ('tg128', tg_match)]:
|
||||
cell = {
|
||||
'mean': match.group(1) if match else None,
|
||||
'std': match.group(2) if match else None,
|
||||
'error': err_type is not None,
|
||||
'etype': err_type
|
||||
}
|
||||
data.setdefault(model, {}).setdefault(key, {})[env] = cell
|
||||
|
||||
# Select winner
|
||||
def pick_winner(env_data):
|
||||
scores = {e: float(d['mean']) for e,d in env_data.items() if not d['error'] and d['mean']}
|
||||
if not scores:
|
||||
return '—'
|
||||
best = max(scores, key=scores.get)
|
||||
others = [v for k,v in scores.items() if k!=best]
|
||||
tag = f"🏆 **{best}**"
|
||||
if others:
|
||||
gain = (scores[best]/max(others)-1)*100
|
||||
tag += f" (+{gain:.0f}%)"
|
||||
return tag
|
||||
|
||||
# Render table with distinct error messages
|
||||
def render_table(test_label, display_name):
|
||||
print(f"### {display_name} — tokens/second\n")
|
||||
header = ['Model'] + [e.replace('_',' ').title() for e in ENV_ORDER] + ['Winner']
|
||||
print("| " + " | ".join(header) + " |")
|
||||
print("|" + "|".join(['---']*len(header)) + "|")
|
||||
|
||||
for model in sorted(data, key=lambda s: s.lower()):
|
||||
row = [f"**{model}**"]
|
||||
env_data = data[model].get(test_label, {})
|
||||
for env in ENV_ORDER:
|
||||
d = env_data.get(env)
|
||||
if not d:
|
||||
cell = '—'
|
||||
elif d['error']:
|
||||
et = d['etype']
|
||||
if et=='load':
|
||||
cell = '⚠️ Load Error'
|
||||
elif et=='hang':
|
||||
cell = '⚠️ GPU Hang'
|
||||
else:
|
||||
cell = '⚠️ Runtime Error'
|
||||
else:
|
||||
cell = f"{float(d['mean']):.2f} ± {float(d['std']):.2f}"
|
||||
row.append(cell)
|
||||
row.append(pick_winner(env_data))
|
||||
print("| " + " | ".join(row) + " |")
|
||||
print()
|
||||
|
||||
# Output tables
|
||||
render_table('pp512','Prompt Processing (pp512)')
|
||||
render_table('tg128','Text Generation (tg128)')
|
||||
|
||||
# Summary of failures by type
|
||||
fail_lines = []
|
||||
for model in sorted(data, key=lambda s: s.lower()):
|
||||
for test_label, envs in data[model].items():
|
||||
for env,d in envs.items():
|
||||
if d['error']:
|
||||
et = d['etype'] or 'unknown'
|
||||
desc = {
|
||||
'load':'failed to load',
|
||||
'hang':'GPU hang',
|
||||
'runtime':'runtime error',
|
||||
}.get(et, 'error')
|
||||
fail_lines.append(f"- **{model}** [{test_label}] on *{env}*: {desc}")
|
||||
if fail_lines:
|
||||
print("## Failed Runs\n")
|
||||
print("\n".join(fail_lines))
|
||||
Reference in New Issue
Block a user