Benchmark and container updates

This commit is contained in:
Donato Capitella
2025-08-03 13:05:52 +01:00
parent e295685041
commit e7e27e6cf3
130 changed files with 12111 additions and 267 deletions
+120
View File
@@ -0,0 +1,120 @@
#!/usr/bin/env python3
import re, glob, os
# This script parses llama-bench logs in 'results/' to produce
# Markdown tables for pp512 (prompt processing) and tg128 (text generation).
# Result-row matchers. A result row is a '|'-delimited table line: five cells
# are skipped, the next cell must name the test, and the cell after it holds
# "<mean> ± <std>" in tokens/sec (group 1 = mean, group 2 = std).
PP_RE = re.compile(
    r"\|[^|]*\|[^|]*\|[^|]*\|[^|]*\|[^|]*\|\s*pp512\s*\|\s*([\d.]+)\s*±\s*([\d.]+)"
)
TG_RE = re.compile(
    r"\|[^|]*\|[^|]*\|[^|]*\|[^|]*\|[^|]*\|\s*tg128\s*\|\s*([\d.]+)\s*±\s*([\d.]+)"
)

# Failure signatures, all matched case-insensitively anywhere in a log.
LOAD_ERR = re.compile(
    r"failed to load model|Device memory allocation.*failed",
    re.I,
)
HANG_ERR = re.compile(r"GPU Hang|HW Exception", re.I)
GENERIC_ERR = re.compile(r"error:|exit \d+", re.I)

# Left-to-right column order of the environments in the rendered tables.
ENV_ORDER = [
    "vulkan_radv",
    "vulkan_amdvlk",
    "rocm6_4_2",
    "rocm7_beta",
    "rocm7_rc",
]

# Parsed results, filled in by the log scan: model -> test -> env -> cell.
data = {}
def clean_name(raw):
    """Return *raw* with any split-file shard suffix removed.

    A shard suffix has the form ``-<index>-of-<count>`` where both numbers
    are zero-padded to at least four digits (e.g. ``-00001-of-00003``).
    Requiring four or more digits keeps ordinary name parts such as
    ``-235B`` or ``-7b`` untouched, while still handling shard indices and
    counts of 100 or more (``-00001-of-00120``).

    Args:
        raw: Model name as taken from the log file name.

    Returns:
        The name without the shard suffix; unchanged if none is present.
    """
    return re.sub(r"-\d{4,}-of-\d{4,}", "", raw)
# Scan logs: every results/<model>__<env>.log contributes one cell per test
# (pp512 and tg128) to data[model][test][env].
glob_pattern = os.path.join("results", "*.log")
for path in sorted(glob.glob(glob_pattern)):
    # Strip the extension, then split "<model>__<env>" on the first "__".
    base = os.path.splitext(os.path.basename(path))[0]
    if '__' not in base:
        # Not a benchmark log produced with the expected naming scheme.
        continue
    model_raw, env = base.split('__', 1)
    model = clean_name(model_raw)
    # Decode explicitly as UTF-8: with the locale default, the "±" in result
    # rows can be mis-decoded and the result regexes would never match.
    # Undecodable bytes are dropped rather than aborting the whole report.
    with open(path, encoding='utf-8', errors='ignore') as log_file:
        text = log_file.read()
    pp_match = PP_RE.search(text)
    tg_match = TG_RE.search(text)
    # Determine error type. Load errors and GPU hangs always win; a generic
    # error marker only counts when at least one result row is missing.
    if LOAD_ERR.search(text):
        err_type = 'load'
    elif HANG_ERR.search(text):
        err_type = 'hang'
    elif GENERIC_ERR.search(text) and not (pp_match and tg_match):
        err_type = 'runtime'
    else:
        err_type = None
    # Results from a failed run are discarded, whatever the failure type.
    if err_type is not None:
        pp_match = tg_match = None
    for key, match in [('pp512', pp_match), ('tg128', tg_match)]:
        cell = {
            'mean': match.group(1) if match else None,
            'std': match.group(2) if match else None,
            'error': err_type is not None,
            'etype': err_type,
        }
        data.setdefault(model, {}).setdefault(key, {})[env] = cell
def pick_winner(env_data):
    """Return a Markdown tag naming the fastest environment for one test.

    Args:
        env_data: Mapping of environment name to a result cell. Each cell is
            a dict with at least ``'mean'`` (numeric string or ``None``) and
            ``'error'`` (bool).

    Returns:
        ``''`` when no environment has a usable result. Otherwise
        ``"🏆 **<env>**"``, followed by `` (+N%)`` giving the lead over the
        runner-up when there is a runner-up with a non-zero score.
    """
    scores = {
        env: float(cell['mean'])
        for env, cell in env_data.items()
        if not cell['error'] and cell['mean']
    }
    if not scores:
        return ''
    best = max(scores, key=scores.get)
    tag = f"🏆 **{best}**"
    others = [score for env, score in scores.items() if env != best]
    if others:
        runner_up = max(others)
        # A relative gain over a 0 tokens/sec runner-up is undefined; skip
        # the percentage instead of raising ZeroDivisionError.
        if runner_up > 0:
            gain = (scores[best] / runner_up - 1) * 100
            tag += f" (+{gain:.0f}%)"
    return tag
def render_table(test_label, display_name):
    """Print a Markdown table of tokens/second for one benchmark test.

    One row is printed per model in the module-level ``data`` (sorted
    case-insensitively), one column per environment in ``ENV_ORDER``, plus a
    final "Winner" column produced by ``pick_winner``.

    Args:
        test_label: Key of the test inside ``data[model]``, e.g. ``'pp512'``.
        display_name: Human-readable test name used in the section heading.

    Cell contents:
        * no log for that environment -> empty cell
        * failed run -> a warning label specific to the failure type
        * run without a parsed result row -> empty cell (previously this
          crashed with ``TypeError`` on ``float(None)``)
        * otherwise -> ``"<mean> ± <std>"`` with two decimals
    """
    # Labels for the distinct failure types; anything else is a runtime error.
    error_labels = {
        'load': '⚠️ Load Error',
        'hang': '⚠️ GPU Hang',
    }
    print(f"### {display_name} — tokens/second\n")
    header = ['Model'] + [e.replace('_', ' ').title() for e in ENV_ORDER] + ['Winner']
    print("| " + " | ".join(header) + " |")
    print("|" + "|".join(['---'] * len(header)) + "|")
    for model in sorted(data, key=lambda s: s.lower()):
        row = [f"**{model}**"]
        env_data = data[model].get(test_label, {})
        for env in ENV_ORDER:
            d = env_data.get(env)
            if not d:
                cell = ''
            elif d['error']:
                cell = error_labels.get(d['etype'], '⚠️ Runtime Error')
            elif d['mean'] is None or d['std'] is None:
                # The run reported no error but no result row was found in
                # the log; there is nothing to format.
                cell = ''
            else:
                cell = f"{float(d['mean']):.2f} ± {float(d['std']):.2f}"
            row.append(cell)
        row.append(pick_winner(env_data))
        print("| " + " | ".join(row) + " |")
    print()
# Emit the two benchmark tables, prompt processing first.
for label, title in (
    ('pp512', 'Prompt Processing (pp512)'),
    ('tg128', 'Text Generation (tg128)'),
):
    render_table(label, title)

# Wording used in the failure summary for each error type; any other type
# (including a missing one) is reported simply as "error".
failure_text = {
    'load': 'failed to load',
    'hang': 'GPU hang',
    'runtime': 'runtime error',
}

# Collect one bullet per failed (model, test, environment) combination.
fail_lines = []
for name in sorted(data, key=str.lower):
    for label, per_env in data[name].items():
        fail_lines.extend(
            f"- **{name}** [{label}] on *{env_name}*: "
            f"{failure_text.get(cell['etype'] or 'unknown', 'error')}"
            for env_name, cell in per_env.items()
            if cell['error']
        )

# The section is omitted entirely when every run succeeded.
if fail_lines:
    print("## Failed Runs\n", "\n".join(fail_lines), sep="\n")