From 71abff5a0d459cfd0c03c3ae229f31f0c1808ab8 Mon Sep 17 00:00:00 2001 From: Donato Capitella Date: Sun, 17 May 2026 11:03:23 +0100 Subject: [PATCH] feat: add interactive granular benchmark details to UI and update MTP summary data format --- benchmark/mtp-bench.py | 16 +- benchmark/run_mtp_bench.py | 23 +- docs/assets/mtp.css | 81 ++ docs/assets/mtp.js | 112 ++ docs/mtp-summary.json | 1402 +++++++++++++++++++++++++- toolboxes/Dockerfile.rocm-7.2.3-mtp | 113 --- toolboxes/Dockerfile.vulkan-radv-mtp | 68 -- 7 files changed, 1588 insertions(+), 227 deletions(-) delete mode 100644 toolboxes/Dockerfile.rocm-7.2.3-mtp delete mode 100644 toolboxes/Dockerfile.vulkan-radv-mtp diff --git a/benchmark/mtp-bench.py b/benchmark/mtp-bench.py index d6c8a10..3146332 100755 --- a/benchmark/mtp-bench.py +++ b/benchmark/mtp-bench.py @@ -87,21 +87,33 @@ def run(args): # OpenAI-compatible endpoint: timings are in usage or top-level usage = r.get("usage", {}) or {} t = r.get("timings", {}) or {} + prompt_n = usage.get("prompt_tokens") or t.get("prompt_n") + prompt_ms = t.get("prompt_ms") + prompt_per_second = t.get("prompt_per_second") predicted_n = usage.get("completion_tokens") or t.get("predicted_n") predicted_per_second = t.get("predicted_per_second") or (predicted_n / wall if wall > 0 else 0) rec = {"name": p["name"], "wall_s": round(wall,3), + "prompt_n": prompt_n, + "prompt_ms": round(prompt_ms, 2) if prompt_ms is not None else None, + "prompt_per_second": round(prompt_per_second, 2) if prompt_per_second is not None else None, "predicted_n": predicted_n, "predicted_per_second": round(predicted_per_second, 2), "draft_n": t.get("draft_n",0), "draft_n_accepted": t.get("draft_n_accepted",0)} rec["accept_rate"] = round(rec["draft_n_accepted"]/rec["draft_n"],4) if rec["draft_n"] else None out["results"].append(rec) ar = f"{rec['accept_rate']:.3f}" if rec["accept_rate"] is not None else "n/a" - print(f" {rec['name']:<18} pred={rec['predicted_n']:>4} draft={rec['draft_n']:>4} acc={rec['draft_n_accepted']:>4} rate={ar} tok/s={rec['predicted_per_second']:.1f}") + pps = f" pt/s={rec['prompt_per_second']:.1f}" if rec.get("prompt_per_second") else "" + print(f" {rec['name']:<18} pred={rec['predicted_n']:>4} draft={rec['draft_n']:>4} acc={rec['draft_n_accepted']:>4} rate={ar} tok/s={rec['predicted_per_second']:.1f}{pps}") td = sum(x["draft_n"] or 0 for x in out["results"]) ta = sum(x["draft_n_accepted"] or 0 for x in out["results"]) tp = sum(x["predicted_n"] or 0 for x in out["results"]) + t_pn = sum(x["prompt_n"] or 0 for x in out["results"]) tw = sum(x["wall_s"] for x in out["results"]) + pps_list = [x["prompt_per_second"] for x in out["results"] if x.get("prompt_per_second") is not None] + avg_pps = sum(pps_list)/len(pps_list) if pps_list else None + out["aggregate"] = {"n_requests": len(out["results"]), "total_predicted": tp, "total_draft": td, "total_draft_accepted": ta, - "aggregate_accept_rate": round(ta/td,4) if td else None, "wall_s_total": round(tw,2)} + "aggregate_accept_rate": round(ta/td,4) if td else None, "wall_s_total": round(tw,2), + "total_prompt_tokens": t_pn, "avg_prompt_per_second": round(avg_pps, 2) if avg_pps is not None else None} print("\nAggregate:", json.dumps(out["aggregate"], indent=2)) if args.out: json.dump(out, open(args.out,"w"), indent=2); print("Wrote", args.out) diff --git a/benchmark/run_mtp_bench.py b/benchmark/run_mtp_bench.py index b33aec5..6c836c7 100755 --- a/benchmark/run_mtp_bench.py +++ b/benchmark/run_mtp_bench.py @@ -29,8 +29,8 @@ from urllib.error import URLError # ── Toolbox definitions ────────────────────────────────────────────────────── TOOLBOXES = { - "rocm-7.2.3-mtp": { - "image": "docker.io/kyuz0/amd-strix-halo-toolboxes:rocm-7.2.3-mtp", + "rocm-7.2.3": { + "image": "docker.io/kyuz0/amd-strix-halo-toolboxes:rocm-7.2.3", "engine_args": [ "--device", "/dev/dri", "--device", "/dev/kfd", @@ -39,8 +39,8 @@ TOOLBOXES = { "--security-opt", "seccomp=unconfined", ], }, - "vulkan-radv-mtp": { - "image": "docker.io/kyuz0/amd-strix-halo-toolboxes:vulkan-radv-mtp", + "vulkan-radv": { + "image": "docker.io/kyuz0/amd-strix-halo-toolboxes:vulkan-radv", "engine_args": [ "--device", "/dev/dri", "--group-add", "video", @@ -415,9 +415,9 @@ def print_summary(results_dir: Path): baselines[key] = r["_avg_toks"] # Print table - print("\n" + "=" * 100) - print(f"{'Model':<30} {'Toolbox':<20} {'Mode':<10} {'Avg tok/s':>10} {'Accept%':>9} {'Wall(s)':>8} {'Speedup':>8}") - print("-" * 100) + print("\n" + "=" * 115) + print(f"{'Model':<30} {'Toolbox':<20} {'Mode':<10} {'Prefill pt/s':>13} {'Avg tok/s':>10} {'Accept%':>9} {'Wall(s)':>8} {'Speedup':>8}") + print("-" * 115) for r in results: agg = r.get("aggregate", {}) @@ -426,6 +426,9 @@ def print_summary(results_dir: Path): accept_str = f"{accept * 100:.1f}%" if accept is not None else "—" avg_toks = r["_avg_toks"] + avg_prompt = agg.get("avg_prompt_per_second") + prefill_str = f"{avg_prompt:.1f}" if avg_prompt is not None else "—" + # Speedup relative to baseline baseline_key = (r["model"], r["toolbox"]) baseline_toks = baselines.get(baseline_key) @@ -434,9 +437,9 @@ def print_summary(results_dir: Path): else: speedup = "—" - print(f"{r['model']:<30} {r['toolbox']:<20} {r['mode']:<10} {avg_toks:>10.1f} {accept_str:>9} {wall:>8.1f} {speedup:>8}") + print(f"{r['model']:<30} {r['toolbox']:<20} {r['mode']:<10} {prefill_str:>13} {avg_toks:>10.1f} {accept_str:>9} {wall:>8.1f} {speedup:>8}") - print("=" * 100) + print("=" * 115) # Write summary.json summary_data = [] @@ -446,9 +449,11 @@ def print_summary(results_dir: Path): "model": r["model"], "toolbox": r["toolbox"], "mode": r["mode"], + "avg_prompt_tok_s": agg.get("avg_prompt_per_second"), "avg_tok_s": round(r["_avg_toks"], 1), "accept_rate": agg.get("aggregate_accept_rate"), "wall_s_total": agg.get("wall_s_total"), + "results": r.get("results", []) }) summary_path = results_dir / "summary.json" diff --git a/docs/assets/mtp.css b/docs/assets/mtp.css index 516399b..fda32fc 100644 --- a/docs/assets/mtp.css +++ b/docs/assets/mtp.css @@ -103,9 +103,90 @@ font-size: 11px; background: #f1f5ff; color: #1d4ed8; + white-space: nowrap; } .toolbox-pill.radv { background: #fdf2f8; color: #9d174d; } + +/* Expandable row interactivity */ +.mtp-table tbody tr.main-row { + cursor: pointer; + transition: background-color 0.15s ease; +} + +.mtp-table tbody tr.main-row:hover { + background-color: var(--hover); +} + +.mtp-table tbody tr.main-row td:first-child::before { + content: "▶"; + display: inline-block; + font-size: 10px; + margin-right: 8px; + color: var(--muted); + transition: transform 0.2s ease; +} + +.mtp-table tbody tr.main-row.expanded td:first-child::before { + transform: rotate(90deg); +} + +/* Details row and sub-table */ +.details-row { + background-color: #f8fafc; +} + +.details-row.hidden { + display: none; +} + +.details-row td { + padding: 0; + border-bottom: 1px solid var(--border); +} + +.granular-wrap { + padding: 16px 24px; + box-shadow: inset 0 2px 4px rgba(0,0,0,0.02); +} + +.granular-table { + width: 100%; + border-collapse: collapse; + font-size: 13px; + background: #fff; + border: 1px solid var(--border); + border-radius: 4px; + overflow: hidden; +} + +.granular-table th, .granular-table td { + padding: 8px 12px; + text-align: left; + border-bottom: 1px solid var(--border); +} + +.granular-table th { + background: #f1f5f9; + font-weight: 600; + color: var(--ink); + text-transform: uppercase; + font-size: 11px; + letter-spacing: 0.5px; +} + +.granular-table td.num { + text-align: right; + font-variant-numeric: tabular-nums; +} + +.granular-table tr:last-child td { + border-bottom: none; +} + +.granular-table tbody tr:hover { + background-color: var(--hover); +} diff --git a/docs/assets/mtp.js b/docs/assets/mtp.js index 0862651..c106d24 100644 --- a/docs/assets/mtp.js +++ b/docs/assets/mtp.js @@ -49,6 +49,7 @@ function renderTable(runs, tbody) { rows.forEach(row => { const tr = document.createElement("tr"); + tr.className = "main-row"; // Model const tdModel = document.createElement("td"); @@ -87,7 +88,22 @@ function renderTable(runs, tbody) { tr.appendChild(makeMetricCell(mtp3Speed)); tr.appendChild(makeSpeedupCell(baseSpeed, mtp3Speed)); + // Details row + const detailsTr = document.createElement("tr"); + detailsTr.className = "details-row hidden"; + const detailsTd = document.createElement("td"); + detailsTd.colSpan = 8; + + detailsTd.innerHTML = makeDetailsHTML(row); + detailsTr.appendChild(detailsTd); + + tr.addEventListener("click", () => { + tr.classList.toggle("expanded"); + detailsTr.classList.toggle("hidden"); + }); + tbody.appendChild(tr); + tbody.appendChild(detailsTr); }); } @@ -126,3 +142,99 @@ function makeSpeedupCell(base, mtp) { } return td; } + +function makeDetailsHTML(row) { + if (!row.baseline || !row.baseline.results || row.baseline.results.length === 0) { + return `

Granular data not available for this run. Re-run benchmarks to capture prompt-level metrics.

`; + } + + const tasks = new Map(); + const modes = [ + { key: "base", data: row.baseline }, + { key: "mtp2", data: row.mtp2 }, + { key: "mtp3", data: row.mtp3 } + ]; + + modes.forEach(mode => { + if (!mode.data || !mode.data.results) return; + mode.data.results.forEach(res => { + if (!tasks.has(res.name)) { + tasks.set(res.name, { name: res.name }); + } + const t = tasks.get(res.name); + t[`${mode.key}_prefill`] = res.prompt_per_second; + t[`${mode.key}_toks`] = res.predicted_per_second; + t[`${mode.key}_acc`] = res.accept_rate; + }); + }); + + let html = ` +
+ + + + + + + + + + + + + + + + `; + + tasks.forEach(t => { + const p_base_val = t.base_prefill; + const p_mtp2_val = t.mtp2_prefill; + const p_mtp3_val = t.mtp3_prefill; + + const p_base = p_base_val ? p_base_val.toFixed(1) : "—"; + let p_mtp2 = p_mtp2_val ? p_mtp2_val.toFixed(1) : "—"; + let p_mtp3 = p_mtp3_val ? p_mtp3_val.toFixed(1) : "—"; + + if (p_base_val && p_mtp2_val) { + const pct = ((p_mtp2_val - p_base_val) / p_base_val) * 100; + const color = pct >= 0 ? '#16a34a' : '#dc2626'; + const sign = pct > 0 ? '+' : ''; + p_mtp2 += ` ${sign}${pct.toFixed(1)}%`; + } + + if (p_base_val && p_mtp3_val) { + const pct = ((p_mtp3_val - p_base_val) / p_base_val) * 100; + const color = pct >= 0 ? '#16a34a' : '#dc2626'; + const sign = pct > 0 ? '+' : ''; + p_mtp3 += ` ${sign}${pct.toFixed(1)}%`; + } + + const g_base = t.base_toks ? t.base_toks.toFixed(1) : "—"; + const g_mtp2 = t.mtp2_toks ? t.mtp2_toks.toFixed(1) : "—"; + const a_mtp2 = t.mtp2_acc !== null && t.mtp2_acc !== undefined ? (t.mtp2_acc * 100).toFixed(1) + "%" : "—"; + const g_mtp3 = t.mtp3_toks ? t.mtp3_toks.toFixed(1) : "—"; + const a_mtp3 = t.mtp3_acc !== null && t.mtp3_acc !== undefined ? (t.mtp3_acc * 100).toFixed(1) + "%" : "—"; + + html += ` + + + + + + + + + + + + `; + }); + + html += ` + +
Prompt TaskPrefill (Base)Prefill (MTP-2)Prefill (MTP-3)Base GenMTP-2 GenAcc%MTP-3 GenAcc%
${t.name}${p_base}${p_mtp2}${p_mtp3}${g_base}${g_mtp2}${a_mtp2}${g_mtp3}${a_mtp3}
+
`; + + return html; +} diff --git a/docs/mtp-summary.json b/docs/mtp-summary.json index a5544bf..ce0276d 100644 --- a/docs/mtp-summary.json +++ b/docs/mtp-summary.json @@ -1,98 +1,1430 @@ [ { "model": "Qwen3.6-27B-UD-Q8_K_XL", - "toolbox": "rocm-7.2.3-mtp", + "toolbox": "rocm-7.2.3", "mode": "baseline", - "avg_tok_s": 6.5, + "avg_prompt_tok_s": 104.9, + "avg_tok_s": 6.4, "accept_rate": null, - "wall_s_total": 273.39 + "wall_s_total": 274.48, + "results": [ + { + "name": "code_python", + "wall_s": 30.306, + "prompt_n": 30, + "prompt_ms": 509.43, + "prompt_per_second": 58.89, + "predicted_n": 192, + "predicted_per_second": 6.45, + "draft_n": 0, + "draft_n_accepted": 0, + "accept_rate": null + }, + { + "name": "code_cpp", + "wall_s": 30.249, + "prompt_n": 40, + "prompt_ms": 428.45, + "prompt_per_second": 93.36, + "predicted_n": 192, + "predicted_per_second": 6.45, + "draft_n": 0, + "draft_n_accepted": 0, + "accept_rate": null + }, + { + "name": "explain_concept", + "wall_s": 30.3, + "prompt_n": 27, + "prompt_ms": 469.46, + "prompt_per_second": 57.51, + "predicted_n": 192, + "predicted_per_second": 6.44, + "draft_n": 0, + "draft_n_accepted": 0, + "accept_rate": null + }, + { + "name": "summarize", + "wall_s": 30.292, + "prompt_n": 62, + "prompt_ms": 472.42, + "prompt_per_second": 131.24, + "predicted_n": 192, + "predicted_per_second": 6.45, + "draft_n": 0, + "draft_n_accepted": 0, + "accept_rate": null + }, + { + "name": "qa_factual", + "wall_s": 30.31, + "prompt_n": 24, + "prompt_ms": 467.84, + "prompt_per_second": 51.3, + "predicted_n": 192, + "predicted_per_second": 6.44, + "draft_n": 0, + "draft_n_accepted": 0, + "accept_rate": null + }, + { + "name": "translation", + "wall_s": 30.299, + "prompt_n": 25, + "prompt_ms": 467.72, + "prompt_per_second": 53.45, + "predicted_n": 192, + "predicted_per_second": 6.44, + "draft_n": 0, + "draft_n_accepted": 0, + "accept_rate": null + }, + { + "name": "creative_short", + "wall_s": 30.312, + "prompt_n": 21, + "prompt_ms": 476.88, + "prompt_per_second": 44.04, + "predicted_n": 192, + "predicted_per_second": 6.44, + "draft_n": 0, + "draft_n_accepted": 0, + "accept_rate": null + }, + { + "name": "stepwise_math", + "wall_s": 30.29, + "prompt_n": 60, + "prompt_ms": 469.75, + "prompt_per_second": 127.73, + "predicted_n": 192, + "predicted_per_second": 6.45, + "draft_n": 0, + "draft_n_accepted": 0, + "accept_rate": null + }, + { + "name": "long_code_review", + "wall_s": 32.122, + "prompt_n": 731, + "prompt_ms": 2238.33, + "prompt_per_second": 326.58, + "predicted_n": 192, + "predicted_per_second": 6.43, + "draft_n": 0, + "draft_n_accepted": 0, + "accept_rate": null + } + ] }, { "model": "Qwen3.6-27B-UD-Q8_K_XL", - "toolbox": "rocm-7.2.3-mtp", + "toolbox": "rocm-7.2.3", "mode": "mtp-2", - "avg_tok_s": 12.4, + "avg_prompt_tok_s": 97.38, + "avg_tok_s": 12.6, "accept_rate": 0.7971, - "wall_s_total": 147.31 + "wall_s_total": 144.29, + "results": [ + { + "name": "code_python", + "wall_s": 15.171, + "prompt_n": 30, + "prompt_ms": 527.23, + "prompt_per_second": 56.9, + "predicted_n": 192, + "predicted_per_second": 13.12, + "draft_n": 140, + "draft_n_accepted": 120, + "accept_rate": 0.8571 + }, + { + "name": "code_cpp", + "wall_s": 15.787, + "prompt_n": 40, + "prompt_ms": 461.35, + "prompt_per_second": 86.7, + "predicted_n": 192, + "predicted_per_second": 12.56, + "draft_n": 147, + "draft_n_accepted": 117, + "accept_rate": 0.7959 + }, + { + "name": "explain_concept", + "wall_s": 15.203, + "prompt_n": 27, + "prompt_ms": 495.97, + "prompt_per_second": 54.44, + "predicted_n": 192, + "predicted_per_second": 13.09, + "draft_n": 141, + "draft_n_accepted": 120, + "accept_rate": 0.8511 + }, + { + "name": "summarize", + "wall_s": 14.803, + "prompt_n": 62, + "prompt_ms": 508.33, + "prompt_per_second": 121.97, + "predicted_n": 192, + "predicted_per_second": 13.47, + "draft_n": 137, + "draft_n_accepted": 122, + "accept_rate": 0.8905 + }, + { + "name": "qa_factual", + "wall_s": 15.822, + "prompt_n": 24, + "prompt_ms": 495.12, + "prompt_per_second": 48.47, + "predicted_n": 192, + "predicted_per_second": 12.56, + "draft_n": 148, + "draft_n_accepted": 117, + "accept_rate": 0.7905 + }, + { + "name": "translation", + "wall_s": 16.442, + "prompt_n": 25, + "prompt_ms": 496.05, + "prompt_per_second": 50.4, + "predicted_n": 192, + "predicted_per_second": 12.07, + "draft_n": 153, + "draft_n_accepted": 114, + "accept_rate": 0.7451 + }, + { + "name": "creative_short", + "wall_s": 17.224, + "prompt_n": 21, + "prompt_ms": 492.99, + "prompt_per_second": 42.6, + "predicted_n": 192, + "predicted_per_second": 11.5, + "draft_n": 160, + "draft_n_accepted": 110, + "accept_rate": 0.6875 + }, + { + "name": "stepwise_math", + "wall_s": 15.005, + "prompt_n": 60, + "prompt_ms": 503.13, + "prompt_per_second": 119.25, + "predicted_n": 192, + "predicted_per_second": 13.28, + "draft_n": 139, + "draft_n_accepted": 121, + "accept_rate": 0.8705 + }, + { + "name": "long_code_review", + "wall_s": 18.833, + "prompt_n": 731, + "prompt_ms": 2471.97, + "prompt_per_second": 295.72, + "predicted_n": 192, + "predicted_per_second": 11.76, + "draft_n": 156, + "draft_n_accepted": 112, + "accept_rate": 0.7179 + } + ] }, { "model": "Qwen3.6-27B-UD-Q8_K_XL", - "toolbox": "rocm-7.2.3-mtp", + "toolbox": "rocm-7.2.3", "mode": "mtp-3", - "avg_tok_s": 13.5, + "avg_prompt_tok_s": 95.5, + "avg_tok_s": 13.7, "accept_rate": 0.744, - "wall_s_total": 135.2 + "wall_s_total": 133.44, + "results": [ + { + "name": "code_python", + "wall_s": 13.55, + "prompt_n": 30, + "prompt_ms": 542.34, + "prompt_per_second": 55.32, + "predicted_n": 192, + "predicted_per_second": 14.78, + "draft_n": 163, + "draft_n_accepted": 136, + "accept_rate": 0.8344 + }, + { + "name": "code_cpp", + "wall_s": 14.924, + "prompt_n": 40, + "prompt_ms": 473.5, + "prompt_per_second": 84.48, + "predicted_n": 192, + "predicted_per_second": 13.32, + "draft_n": 181, + "draft_n_accepted": 130, + "accept_rate": 0.7182 + }, + { + "name": "explain_concept", + "wall_s": 14.489, + "prompt_n": 27, + "prompt_ms": 510.02, + "prompt_per_second": 52.94, + "predicted_n": 192, + "predicted_per_second": 13.77, + "draft_n": 175, + "draft_n_accepted": 132, + "accept_rate": 0.7543 + }, + { + "name": "summarize", + "wall_s": 13.973, + "prompt_n": 62, + "prompt_ms": 522.38, + "prompt_per_second": 118.69, + "predicted_n": 192, + "predicted_per_second": 14.32, + "draft_n": 168, + "draft_n_accepted": 134, + "accept_rate": 0.7976 + }, + { + "name": "qa_factual", + "wall_s": 14.491, + "prompt_n": 24, + "prompt_ms": 508.76, + "prompt_per_second": 47.17, + "predicted_n": 192, + "predicted_per_second": 13.77, + "draft_n": 176, + "draft_n_accepted": 132, + "accept_rate": 0.75 + }, + { + "name": "translation", + "wall_s": 15.448, + "prompt_n": 25, + "prompt_ms": 508.67, + "prompt_per_second": 49.15, + "predicted_n": 192, + "predicted_per_second": 12.88, + "draft_n": 189, + "draft_n_accepted": 128, + "accept_rate": 0.6772 + }, + { + "name": "creative_short", + "wall_s": 15.432, + "prompt_n": 21, + "prompt_ms": 505.51, + "prompt_per_second": 41.54, + "predicted_n": 192, + "predicted_per_second": 12.9, + "draft_n": 187, + "draft_n_accepted": 128, + "accept_rate": 0.6845 + }, + { + "name": "stepwise_math", + "wall_s": 13.969, + "prompt_n": 60, + "prompt_ms": 516.53, + "prompt_per_second": 116.16, + "predicted_n": 192, + "predicted_per_second": 14.31, + "draft_n": 168, + "draft_n_accepted": 134, + "accept_rate": 0.7976 + }, + { + "name": "long_code_review", + "wall_s": 17.16, + "prompt_n": 731, + "prompt_ms": 2486.03, + "prompt_per_second": 294.04, + "predicted_n": 192, + "predicted_per_second": 13.12, + "draft_n": 183, + "draft_n_accepted": 129, + "accept_rate": 0.7049 + } + ] }, { "model": "Qwen3.6-27B-UD-Q8_K_XL", - "toolbox": "vulkan-radv-mtp", + "toolbox": "vulkan-radv", "mode": "baseline", + "avg_prompt_tok_s": 97.03, "avg_tok_s": 6.3, "accept_rate": null, - "wall_s_total": 283.86 + "wall_s_total": 280.34, + "results": [ + { + "name": "code_python", + "wall_s": 30.889, + "prompt_n": 30, + "prompt_ms": 497.72, + "prompt_per_second": 60.27, + "predicted_n": 192, + "predicted_per_second": 6.32, + "draft_n": 0, + "draft_n_accepted": 0, + "accept_rate": null + }, + { + "name": "code_cpp", + "wall_s": 30.922, + "prompt_n": 40, + "prompt_ms": 513.86, + "prompt_per_second": 77.84, + "predicted_n": 192, + "predicted_per_second": 6.32, + "draft_n": 0, + "draft_n_accepted": 0, + "accept_rate": null + }, + { + "name": "explain_concept", + "wall_s": 30.807, + "prompt_n": 27, + "prompt_ms": 404.26, + "prompt_per_second": 66.79, + "predicted_n": 192, + "predicted_per_second": 6.33, + "draft_n": 0, + "draft_n_accepted": 0, + "accept_rate": null + }, + { + "name": "summarize", + "wall_s": 30.907, + "prompt_n": 62, + "prompt_ms": 490.66, + "prompt_per_second": 126.36, + "predicted_n": 192, + "predicted_per_second": 6.32, + "draft_n": 0, + "draft_n_accepted": 0, + "accept_rate": null + }, + { + "name": "qa_factual", + "wall_s": 30.809, + "prompt_n": 24, + "prompt_ms": 400.38, + "prompt_per_second": 59.94, + "predicted_n": 192, + "predicted_per_second": 6.32, + "draft_n": 0, + "draft_n_accepted": 0, + "accept_rate": null + }, + { + "name": "translation", + "wall_s": 30.815, + "prompt_n": 25, + "prompt_ms": 401.5, + "prompt_per_second": 62.27, + "predicted_n": 192, + "predicted_per_second": 6.32, + "draft_n": 0, + "draft_n_accepted": 0, + "accept_rate": null + }, + { + "name": "creative_short", + "wall_s": 30.809, + "prompt_n": 21, + "prompt_ms": 397.41, + "prompt_per_second": 52.84, + "predicted_n": 192, + "predicted_per_second": 6.32, + "draft_n": 0, + "draft_n_accepted": 0, + "accept_rate": null + }, + { + "name": "stepwise_math", + "wall_s": 30.9, + "prompt_n": 60, + "prompt_ms": 488.22, + "prompt_per_second": 122.9, + "predicted_n": 192, + "predicted_per_second": 6.32, + "draft_n": 0, + "draft_n_accepted": 0, + "accept_rate": null + }, + { + "name": "long_code_review", + "wall_s": 33.483, + "prompt_n": 731, + "prompt_ms": 2995.01, + "prompt_per_second": 244.07, + "predicted_n": 192, + "predicted_per_second": 6.31, + "draft_n": 0, + "draft_n_accepted": 0, + "accept_rate": null + } + ] }, { "model": "Qwen3.6-27B-UD-Q8_K_XL", - "toolbox": "vulkan-radv-mtp", + "toolbox": "vulkan-radv", "mode": "mtp-2", + "avg_prompt_tok_s": 88.14, "avg_tok_s": 11.7, "accept_rate": 0.8024, - "wall_s_total": 159.41 + "wall_s_total": 156.11, + "results": [ + { + "name": "code_python", + "wall_s": 16.264, + "prompt_n": 30, + "prompt_ms": 538.04, + "prompt_per_second": 55.76, + "predicted_n": 192, + "predicted_per_second": 12.22, + "draft_n": 140, + "draft_n_accepted": 121, + "accept_rate": 0.8643 + }, + { + "name": "code_cpp", + "wall_s": 16.926, + "prompt_n": 40, + "prompt_ms": 564.77, + "prompt_per_second": 70.83, + "predicted_n": 192, + "predicted_per_second": 11.76, + "draft_n": 146, + "draft_n_accepted": 118, + "accept_rate": 0.8082 + }, + { + "name": "explain_concept", + "wall_s": 17.671, + "prompt_n": 27, + "prompt_ms": 438.41, + "prompt_per_second": 61.59, + "predicted_n": 192, + "predicted_per_second": 11.16, + "draft_n": 152, + "draft_n_accepted": 114, + "accept_rate": 0.75 + }, + { + "name": "summarize", + "wall_s": 15.168, + "prompt_n": 62, + "prompt_ms": 541.89, + "prompt_per_second": 114.41, + "predicted_n": 192, + "predicted_per_second": 13.17, + "draft_n": 130, + "draft_n_accepted": 126, + "accept_rate": 0.9692 + }, + { + "name": "qa_factual", + "wall_s": 16.558, + "prompt_n": 24, + "prompt_ms": 438.01, + "prompt_per_second": 54.79, + "predicted_n": 192, + "predicted_per_second": 11.95, + "draft_n": 142, + "draft_n_accepted": 119, + "accept_rate": 0.838 + }, + { + "name": "translation", + "wall_s": 17.52, + "prompt_n": 25, + "prompt_ms": 437.99, + "prompt_per_second": 57.08, + "predicted_n": 192, + "predicted_per_second": 11.27, + "draft_n": 151, + "draft_n_accepted": 115, + "accept_rate": 0.7616 + }, + { + "name": "creative_short", + "wall_s": 18.359, + "prompt_n": 21, + "prompt_ms": 433.87, + "prompt_per_second": 48.4, + "predicted_n": 192, + "predicted_per_second": 10.74, + "draft_n": 157, + "draft_n_accepted": 111, + "accept_rate": 0.707 + }, + { + "name": "stepwise_math", + "wall_s": 16.929, + "prompt_n": 60, + "prompt_ms": 530.6, + "prompt_per_second": 113.08, + "predicted_n": 192, + "predicted_per_second": 11.74, + "draft_n": 145, + "draft_n_accepted": 118, + "accept_rate": 0.8138 + }, + { + "name": "long_code_review", + "wall_s": 20.716, + "prompt_n": 731, + "prompt_ms": 3363.15, + "prompt_per_second": 217.36, + "predicted_n": 192, + "predicted_per_second": 11.1, + "draft_n": 153, + "draft_n_accepted": 114, + "accept_rate": 0.7451 + } + ] }, { "model": "Qwen3.6-27B-UD-Q8_K_XL", - "toolbox": "vulkan-radv-mtp", + "toolbox": "vulkan-radv", "mode": "mtp-3", + "avg_prompt_tok_s": 86.42, "avg_tok_s": 13.3, "accept_rate": 0.7301, - "wall_s_total": 141.74 + "wall_s_total": 137.86, + "results": [ + { + "name": "code_python", + "wall_s": 13.782, + "prompt_n": 30, + "prompt_ms": 550.68, + "prompt_per_second": 54.48, + "predicted_n": 192, + "predicted_per_second": 14.53, + "draft_n": 163, + "draft_n_accepted": 136, + "accept_rate": 0.8344 + }, + { + "name": "code_cpp", + "wall_s": 14.755, + "prompt_n": 40, + "prompt_ms": 578.77, + "prompt_per_second": 69.11, + "predicted_n": 192, + "predicted_per_second": 13.58, + "draft_n": 175, + "draft_n_accepted": 132, + "accept_rate": 0.7543 + }, + { + "name": "explain_concept", + "wall_s": 16.075, + "prompt_n": 27, + "prompt_ms": 452.06, + "prompt_per_second": 59.73, + "predicted_n": 192, + "predicted_per_second": 12.32, + "draft_n": 195, + "draft_n_accepted": 126, + "accept_rate": 0.6462 + }, + { + "name": "summarize", + "wall_s": 14.065, + "prompt_n": 62, + "prompt_ms": 553.79, + "prompt_per_second": 111.96, + "predicted_n": 192, + "predicted_per_second": 14.26, + "draft_n": 167, + "draft_n_accepted": 135, + "accept_rate": 0.8084 + }, + { + "name": "qa_factual", + "wall_s": 14.198, + "prompt_n": 24, + "prompt_ms": 450.37, + "prompt_per_second": 53.29, + "predicted_n": 192, + "predicted_per_second": 14.02, + "draft_n": 171, + "draft_n_accepted": 134, + "accept_rate": 0.7836 + }, + { + "name": "translation", + "wall_s": 14.908, + "prompt_n": 25, + "prompt_ms": 452.28, + "prompt_per_second": 55.28, + "predicted_n": 192, + "predicted_per_second": 13.33, + "draft_n": 179, + "draft_n_accepted": 131, + "accept_rate": 0.7318 + }, + { + "name": "creative_short", + "wall_s": 16.099, + "prompt_n": 21, + "prompt_ms": 448.93, + "prompt_per_second": 46.78, + "predicted_n": 192, + "predicted_per_second": 12.31, + "draft_n": 192, + "draft_n_accepted": 126, + "accept_rate": 0.6562 + }, + { + "name": "stepwise_math", + "wall_s": 15.415, + "prompt_n": 60, + "prompt_ms": 543.5, + "prompt_per_second": 110.4, + "predicted_n": 192, + "predicted_per_second": 12.95, + "draft_n": 183, + "draft_n_accepted": 129, + "accept_rate": 0.7049 + }, + { + "name": "long_code_review", + "wall_s": 18.562, + "prompt_n": 731, + "prompt_ms": 3372.45, + "prompt_per_second": 216.76, + "predicted_n": 192, + "predicted_per_second": 12.68, + "draft_n": 187, + "draft_n_accepted": 128, + "accept_rate": 0.6845 + } + ] }, { "model": "Qwen3.6-35B-A3B-UD-Q4_K_XL", - "toolbox": "rocm-7.2.3-mtp", + "toolbox": "rocm-7.2.3", "mode": "baseline", - "avg_tok_s": 48.7, + "avg_prompt_tok_s": 352.21, + "avg_tok_s": 51.1, "accept_rate": null, - "wall_s_total": 37.55 + "wall_s_total": 35.79, + "results": [ + { + "name": "code_python", + "wall_s": 3.904, + "prompt_n": 30, + "prompt_ms": 136.88, + "prompt_per_second": 219.17, + "predicted_n": 192, + "predicted_per_second": 51.14, + "draft_n": 0, + "draft_n_accepted": 0, + "accept_rate": null + }, + { + "name": "code_cpp", + "wall_s": 3.901, + "prompt_n": 40, + "prompt_ms": 126.79, + "prompt_per_second": 315.49, + "predicted_n": 192, + "predicted_per_second": 51.15, + "draft_n": 0, + "draft_n_accepted": 0, + "accept_rate": null + }, + { + "name": "explain_concept", + "wall_s": 3.883, + "prompt_n": 27, + "prompt_ms": 109.03, + "prompt_per_second": 247.65, + "predicted_n": 192, + "predicted_per_second": 51.16, + "draft_n": 0, + "draft_n_accepted": 0, + "accept_rate": null + }, + { + "name": "summarize", + "wall_s": 3.931, + "prompt_n": 62, + "prompt_ms": 155.31, + "prompt_per_second": 399.2, + "predicted_n": 192, + "predicted_per_second": 51.14, + "draft_n": 0, + "draft_n_accepted": 0, + "accept_rate": null + }, + { + "name": "qa_factual", + "wall_s": 3.88, + "prompt_n": 24, + "prompt_ms": 104.26, + "prompt_per_second": 230.19, + "predicted_n": 192, + "predicted_per_second": 51.15, + "draft_n": 0, + "draft_n_accepted": 0, + "accept_rate": null + }, + { + "name": "translation", + "wall_s": 3.879, + "prompt_n": 25, + "prompt_ms": 104.91, + "prompt_per_second": 238.3, + "predicted_n": 192, + "predicted_per_second": 51.15, + "draft_n": 0, + "draft_n_accepted": 0, + "accept_rate": null + }, + { + "name": "creative_short", + "wall_s": 3.877, + "prompt_n": 21, + "prompt_ms": 101.75, + "prompt_per_second": 206.38, + "predicted_n": 192, + "predicted_per_second": 51.15, + "draft_n": 0, + "draft_n_accepted": 0, + "accept_rate": null + }, + { + "name": "stepwise_math", + "wall_s": 3.925, + "prompt_n": 60, + "prompt_ms": 148.79, + "prompt_per_second": 403.25, + "predicted_n": 192, + "predicted_per_second": 51.14, + "draft_n": 0, + "draft_n_accepted": 0, + "accept_rate": null + }, + { + "name": "long_code_review", + "wall_s": 4.606, + "prompt_n": 731, + "prompt_ms": 803.04, + "prompt_per_second": 910.29, + "predicted_n": 192, + "predicted_per_second": 50.8, + "draft_n": 0, + "draft_n_accepted": 0, + "accept_rate": null + } + ] }, { "model": "Qwen3.6-35B-A3B-UD-Q4_K_XL", - "toolbox": "rocm-7.2.3-mtp", + "toolbox": "rocm-7.2.3", "mode": "mtp-2", - "avg_tok_s": 64.5, - "accept_rate": 0.7958, - "wall_s_total": 29.33 + "avg_prompt_tok_s": 309.53, + "avg_tok_s": 67.5, + "accept_rate": 0.8183, + "wall_s_total": 27.94, + "results": [ + { + "name": "code_python", + "wall_s": 2.844, + "prompt_n": 30, + "prompt_ms": 168.12, + "prompt_per_second": 178.44, + "predicted_n": 192, + "predicted_per_second": 72.12, + "draft_n": 133, + "draft_n_accepted": 123, + "accept_rate": 0.9248 + }, + { + "name": "code_cpp", + "wall_s": 2.906, + "prompt_n": 40, + "prompt_ms": 141.09, + "prompt_per_second": 283.51, + "predicted_n": 192, + "predicted_per_second": 69.99, + "draft_n": 139, + "draft_n_accepted": 121, + "accept_rate": 0.8705 + }, + { + "name": "explain_concept", + "wall_s": 3.202, + "prompt_n": 27, + "prompt_ms": 120.53, + "prompt_per_second": 224.02, + "predicted_n": 192, + "predicted_per_second": 62.77, + "draft_n": 156, + "draft_n_accepted": 113, + "accept_rate": 0.7244 + }, + { + "name": "summarize", + "wall_s": 2.894, + "prompt_n": 62, + "prompt_ms": 173.04, + "prompt_per_second": 358.31, + "predicted_n": 192, + "predicted_per_second": 71.13, + "draft_n": 136, + "draft_n_accepted": 122, + "accept_rate": 0.8971 + }, + { + "name": "qa_factual", + "wall_s": 2.88, + "prompt_n": 24, + "prompt_ms": 115.19, + "prompt_per_second": 208.36, + "predicted_n": 192, + "predicted_per_second": 70.01, + "draft_n": 139, + "draft_n_accepted": 121, + "accept_rate": 0.8705 + }, + { + "name": "translation", + "wall_s": 2.995, + "prompt_n": 25, + "prompt_ms": 116.14, + "prompt_per_second": 215.25, + "predicted_n": 192, + "predicted_per_second": 67.18, + "draft_n": 145, + "draft_n_accepted": 118, + "accept_rate": 0.8138 + }, + { + "name": "creative_short", + "wall_s": 3.171, + "prompt_n": 21, + "prompt_ms": 112.34, + "prompt_per_second": 186.93, + "predicted_n": 192, + "predicted_per_second": 63.23, + "draft_n": 154, + "draft_n_accepted": 113, + "accept_rate": 0.7338 + }, + { + "name": "stepwise_math", + "wall_s": 3.037, + "prompt_n": 60, + "prompt_ms": 161.11, + "prompt_per_second": 372.42, + "predicted_n": 192, + "predicted_per_second": 67.25, + "draft_n": 145, + "draft_n_accepted": 118, + "accept_rate": 0.8138 + }, + { + "name": "long_code_review", + "wall_s": 4.009, + "prompt_n": 731, + "prompt_ms": 963.68, + "prompt_per_second": 758.55, + "predicted_n": 192, + "predicted_per_second": 63.52, + "draft_n": 152, + "draft_n_accepted": 114, + "accept_rate": 0.75 + } + ] }, { "model": "Qwen3.6-35B-A3B-UD-Q4_K_XL", - "toolbox": "rocm-7.2.3-mtp", + "toolbox": "rocm-7.2.3", "mode": "mtp-3", - "avg_tok_s": 68.3, + "avg_prompt_tok_s": 302.26, + "avg_tok_s": 70.3, "accept_rate": 0.7386, - "wall_s_total": 27.83 + "wall_s_total": 27.0, + "results": [ + { + "name": "code_python", + "wall_s": 2.789, + "prompt_n": 30, + "prompt_ms": 165.27, + "prompt_per_second": 181.52, + "predicted_n": 192, + "predicted_per_second": 73.57, + "draft_n": 168, + "draft_n_accepted": 134, + "accept_rate": 0.7976 + }, + { + "name": "code_cpp", + "wall_s": 2.714, + "prompt_n": 40, + "prompt_ms": 146.13, + "prompt_per_second": 273.74, + "predicted_n": 192, + "predicted_per_second": 75.39, + "draft_n": 165, + "draft_n_accepted": 135, + "accept_rate": 0.8182 + }, + { + "name": "explain_concept", + "wall_s": 3.246, + "prompt_n": 27, + "prompt_ms": 126.13, + "prompt_per_second": 214.07, + "predicted_n": 192, + "predicted_per_second": 62.01, + "draft_n": 201, + "draft_n_accepted": 123, + "accept_rate": 0.6119 + }, + { + "name": "summarize", + "wall_s": 2.664, + "prompt_n": 62, + "prompt_ms": 178.24, + "prompt_per_second": 347.84, + "predicted_n": 192, + "predicted_per_second": 77.93, + "draft_n": 157, + "draft_n_accepted": 137, + "accept_rate": 0.8726 + }, + { + "name": "qa_factual", + "wall_s": 2.762, + "prompt_n": 24, + "prompt_ms": 119.88, + "prompt_per_second": 200.2, + "predicted_n": 192, + "predicted_per_second": 73.32, + "draft_n": 169, + "draft_n_accepted": 134, + "accept_rate": 0.7929 + }, + { + "name": "translation", + "wall_s": 2.875, + "prompt_n": 25, + "prompt_ms": 121.36, + "prompt_per_second": 206.0, + "predicted_n": 192, + "predicted_per_second": 70.29, + "draft_n": 177, + "draft_n_accepted": 131, + "accept_rate": 0.7401 + }, + { + "name": "creative_short", + "wall_s": 3.157, + "prompt_n": 21, + "prompt_ms": 116.34, + "prompt_per_second": 180.5, + "predicted_n": 192, + "predicted_per_second": 63.6, + "draft_n": 197, + "draft_n_accepted": 125, + "accept_rate": 0.6345 + }, + { + "name": "stepwise_math", + "wall_s": 2.912, + "prompt_n": 60, + "prompt_ms": 165.66, + "prompt_per_second": 362.18, + "predicted_n": 192, + "predicted_per_second": 70.46, + "draft_n": 177, + "draft_n_accepted": 131, + "accept_rate": 0.7401 + }, + { + "name": "long_code_review", + "wall_s": 3.882, + "prompt_n": 731, + "prompt_ms": 969.08, + "prompt_per_second": 754.32, + "predicted_n": 192, + "predicted_per_second": 66.48, + "draft_n": 184, + "draft_n_accepted": 128, + "accept_rate": 0.6957 + } + ] }, { "model": "Qwen3.6-35B-A3B-UD-Q4_K_XL", - "toolbox": "vulkan-radv-mtp", + "toolbox": "vulkan-radv", "mode": "baseline", - "avg_tok_s": 58.7, + "avg_prompt_tok_s": 302.53, + "avg_tok_s": 59.4, "accept_rate": null, - "wall_s_total": 31.93 + "wall_s_total": 31.46, + "results": [ + { + "name": "code_python", + "wall_s": 3.594, + "prompt_n": 30, + "prompt_ms": 314.93, + "prompt_per_second": 95.26, + "predicted_n": 192, + "predicted_per_second": 58.8, + "draft_n": 0, + "draft_n_accepted": 0, + "accept_rate": null + }, + { + "name": "code_cpp", + "wall_s": 3.475, + "prompt_n": 40, + "prompt_ms": 227.75, + "prompt_per_second": 175.63, + "predicted_n": 192, + "predicted_per_second": 59.52, + "draft_n": 0, + "draft_n_accepted": 0, + "accept_rate": null + }, + { + "name": "explain_concept", + "wall_s": 3.363, + "prompt_n": 27, + "prompt_ms": 119.62, + "prompt_per_second": 225.72, + "predicted_n": 192, + "predicted_per_second": 59.6, + "draft_n": 0, + "draft_n_accepted": 0, + "accept_rate": null + }, + { + "name": "summarize", + "wall_s": 3.429, + "prompt_n": 62, + "prompt_ms": 175.54, + "prompt_per_second": 353.2, + "predicted_n": 192, + "predicted_per_second": 59.39, + "draft_n": 0, + "draft_n_accepted": 0, + "accept_rate": null + }, + { + "name": "qa_factual", + "wall_s": 3.41, + "prompt_n": 24, + "prompt_ms": 108.41, + "prompt_per_second": 221.39, + "predicted_n": 192, + "predicted_per_second": 58.66, + "draft_n": 0, + "draft_n_accepted": 0, + "accept_rate": null + }, + { + "name": "translation", + "wall_s": 3.347, + "prompt_n": 25, + "prompt_ms": 112.61, + "prompt_per_second": 222.01, + "predicted_n": 192, + "predicted_per_second": 59.87, + "draft_n": 0, + "draft_n_accepted": 0, + "accept_rate": null + }, + { + "name": "creative_short", + "wall_s": 3.355, + "prompt_n": 21, + "prompt_ms": 106.77, + "prompt_per_second": 196.69, + "predicted_n": 192, + "predicted_per_second": 59.62, + "draft_n": 0, + "draft_n_accepted": 0, + "accept_rate": null + }, + { + "name": "stepwise_math", + "wall_s": 3.399, + "prompt_n": 60, + "prompt_ms": 167.04, + "prompt_per_second": 359.19, + "predicted_n": 192, + "predicted_per_second": 59.93, + "draft_n": 0, + "draft_n_accepted": 0, + "accept_rate": null + }, + { + "name": "long_code_review", + "wall_s": 4.093, + "prompt_n": 731, + "prompt_ms": 836.67, + "prompt_per_second": 873.7, + "predicted_n": 192, + "predicted_per_second": 59.5, + "draft_n": 0, + "draft_n_accepted": 0, + "accept_rate": null + } + ] }, { "model": "Qwen3.6-35B-A3B-UD-Q4_K_XL", - "toolbox": "vulkan-radv-mtp", + "toolbox": "vulkan-radv", "mode": "mtp-2", - "avg_tok_s": 72.8, + "avg_prompt_tok_s": 255.09, + "avg_tok_s": 74.6, "accept_rate": 0.7907, - "wall_s_total": 26.85 + "wall_s_total": 26.13, + "results": [ + { + "name": "code_python", + "wall_s": 2.855, + "prompt_n": 30, + "prompt_ms": 373.02, + "prompt_per_second": 80.42, + "predicted_n": 192, + "predicted_per_second": 77.78, + "draft_n": 135, + "draft_n_accepted": 123, + "accept_rate": 0.9111 + }, + { + "name": "code_cpp", + "wall_s": 2.922, + "prompt_n": 40, + "prompt_ms": 267.27, + "prompt_per_second": 149.66, + "predicted_n": 192, + "predicted_per_second": 72.95, + "draft_n": 149, + "draft_n_accepted": 116, + "accept_rate": 0.7785 + }, + { + "name": "explain_concept", + "wall_s": 2.881, + "prompt_n": 27, + "prompt_ms": 133.95, + "prompt_per_second": 201.57, + "predicted_n": 192, + "predicted_per_second": 70.44, + "draft_n": 155, + "draft_n_accepted": 113, + "accept_rate": 0.729 + }, + { + "name": "summarize", + "wall_s": 2.546, + "prompt_n": 62, + "prompt_ms": 202.66, + "prompt_per_second": 305.94, + "predicted_n": 192, + "predicted_per_second": 82.7, + "draft_n": 134, + "draft_n_accepted": 124, + "accept_rate": 0.9254 + }, + { + "name": "qa_factual", + "wall_s": 2.516, + "prompt_n": 24, + "prompt_ms": 123.14, + "prompt_per_second": 194.89, + "predicted_n": 192, + "predicted_per_second": 80.95, + "draft_n": 137, + "draft_n_accepted": 122, + "accept_rate": 0.8905 + }, + { + "name": "translation", + "wall_s": 3.02, + "prompt_n": 25, + "prompt_ms": 126.43, + "prompt_per_second": 197.73, + "predicted_n": 192, + "predicted_per_second": 67.01, + "draft_n": 165, + "draft_n_accepted": 107, + "accept_rate": 0.6485 + }, + { + "name": "creative_short", + "wall_s": 2.989, + "prompt_n": 21, + "prompt_ms": 121.75, + "prompt_per_second": 172.48, + "predicted_n": 192, + "predicted_per_second": 67.62, + "draft_n": 166, + "draft_n_accepted": 108, + "accept_rate": 0.6506 + }, + { + "name": "stepwise_math", + "wall_s": 2.602, + "prompt_n": 60, + "prompt_ms": 186.44, + "prompt_per_second": 321.81, + "predicted_n": 192, + "predicted_per_second": 80.45, + "draft_n": 137, + "draft_n_accepted": 122, + "accept_rate": 0.8905 + }, + { + "name": "long_code_review", + "wall_s": 3.8, + "prompt_n": 731, + "prompt_ms": 1088.89, + "prompt_per_second": 671.32, + "predicted_n": 192, + "predicted_per_second": 71.64, + "draft_n": 150, + "draft_n_accepted": 115, + "accept_rate": 0.7667 + } + ] }, { "model": "Qwen3.6-35B-A3B-UD-Q4_K_XL", - "toolbox": "vulkan-radv-mtp", + "toolbox": "vulkan-radv", "mode": "mtp-3", - "avg_tok_s": 74.6, + "avg_prompt_tok_s": 248.42, + "avg_tok_s": 75.7, "accept_rate": 0.7374, - "wall_s_total": 26.36 + "wall_s_total": 25.87, + "results": [ + { + "name": "code_python", + "wall_s": 2.738, + "prompt_n": 30, + "prompt_ms": 375.67, + "prompt_per_second": 79.86, + "predicted_n": 192, + "predicted_per_second": 81.75, + "draft_n": 165, + "draft_n_accepted": 136, + "accept_rate": 0.8242 + }, + { + "name": "code_cpp", + "wall_s": 2.946, + "prompt_n": 40, + "prompt_ms": 274.8, + "prompt_per_second": 145.56, + "predicted_n": 192, + "predicted_per_second": 72.5, + "draft_n": 185, + "draft_n_accepted": 129, + "accept_rate": 0.6973 + }, + { + "name": "explain_concept", + "wall_s": 2.814, + "prompt_n": 27, + "prompt_ms": 137.5, + "prompt_per_second": 196.37, + "predicted_n": 192, + "predicted_per_second": 72.37, + "draft_n": 185, + "draft_n_accepted": 129, + "accept_rate": 0.6973 + }, + { + "name": "summarize", + "wall_s": 2.516, + "prompt_n": 62, + "prompt_ms": 208.53, + "prompt_per_second": 297.32, + "predicted_n": 192, + "predicted_per_second": 84.04, + "draft_n": 159, + "draft_n_accepted": 138, + "accept_rate": 0.8679 + }, + { + "name": "qa_factual", + "wall_s": 2.536, + "prompt_n": 24, + "prompt_ms": 129.09, + "prompt_per_second": 185.91, + "predicted_n": 192, + "predicted_per_second": 80.54, + "draft_n": 165, + "draft_n_accepted": 135, + "accept_rate": 0.8182 + }, + { + "name": "translation", + "wall_s": 2.8, + "prompt_n": 25, + "prompt_ms": 131.97, + "prompt_per_second": 189.44, + "predicted_n": 192, + "predicted_per_second": 72.77, + "draft_n": 185, + "draft_n_accepted": 129, + "accept_rate": 0.6973 + }, + { + "name": "creative_short", + "wall_s": 2.943, + "prompt_n": 21, + "prompt_ms": 126.24, + "prompt_per_second": 166.35, + "predicted_n": 192, + "predicted_per_second": 68.88, + "draft_n": 198, + "draft_n_accepted": 125, + "accept_rate": 0.6313 + }, + { + "name": "stepwise_math", + "wall_s": 2.553, + "prompt_n": 60, + "prompt_ms": 192.58, + "prompt_per_second": 311.56, + "predicted_n": 192, + "predicted_per_second": 82.33, + "draft_n": 164, + "draft_n_accepted": 136, + "accept_rate": 0.8293 + }, + { + "name": "long_code_review", + "wall_s": 4.027, + "prompt_n": 731, + "prompt_ms": 1101.86, + "prompt_per_second": 663.43, + "predicted_n": 192, + "predicted_per_second": 66.37, + "draft_n": 197, + "draft_n_accepted": 125, + "accept_rate": 0.6345 + } + ] } -] +] \ No newline at end of file diff --git a/toolboxes/Dockerfile.rocm-7.2.3-mtp b/toolboxes/Dockerfile.rocm-7.2.3-mtp deleted file mode 100644 index 7f4da4f..0000000 --- a/toolboxes/Dockerfile.rocm-7.2.3-mtp +++ /dev/null @@ -1,113 +0,0 @@ -# build stage -FROM registry.fedoraproject.org/fedora:43 AS builder - -# rocm 7.2.3 repo -RUN <<'EOF' -tee /etc/yum.repos.d/rocm.repo < /etc/ld.so.conf.d/local.conf \ - && echo "/usr/local/lib64" >> /etc/ld.so.conf.d/local.conf \ - && ldconfig \ - && cp -n /usr/local/lib/libllama*.so* /usr/lib64/ 2>/dev/null || true \ - && ldconfig - -# helper -COPY gguf-vram-estimator.py /usr/local/bin/gguf-vram-estimator.py -RUN chmod +x /usr/local/bin/gguf-vram-estimator.py - -# profile -RUN printf '%s\n' \ - > /etc/profile.d/rocm.sh && chmod +x /etc/profile.d/rocm.sh \ - && echo 'source /etc/profile.d/rocm.sh' >> /etc/bashrc - -# shell -CMD ["/bin/bash"] diff --git a/toolboxes/Dockerfile.vulkan-radv-mtp b/toolboxes/Dockerfile.vulkan-radv-mtp deleted file mode 100644 index 76bbded..0000000 --- a/toolboxes/Dockerfile.vulkan-radv-mtp +++ /dev/null @@ -1,68 +0,0 @@ -# build stage -FROM registry.fedoraproject.org/fedora:43 AS builder - -# deps -RUN dnf -y --nodocs --setopt=install_weak_deps=False install \ - git vim \ - make gcc cmake ninja-build lld clang clang-devel compiler-rt libcurl-devel \ - vulkan-loader-devel vulkaninfo mesa-vulkan-drivers \ - spirv-headers-devel radeontop glslc patch \ - && dnf clean all && rm -rf /var/cache/dnf/* - -# llama.cpp (am17an mtp-clean fork — Multi-Token Prediction) -WORKDIR /opt/llama.cpp -RUN git clone -b mtp-clean --single-branch https://github.com/am17an/llama.cpp.git . - -COPY llama-grammar.patch /tmp/llama-grammar.patch - -# build -RUN git clean -xdf \ - && git submodule update --recursive \ - && patch -p1 < /tmp/llama-grammar.patch \ - && cmake -S . -B build -G Ninja \ - -DGGML_VULKAN=ON \ - -DCMAKE_BUILD_TYPE=Release \ - -DGGML_RPC=ON \ - -DCMAKE_INSTALL_PREFIX=/usr \ - -DLLAMA_BUILD_TESTS=OFF \ - -DLLAMA_BUILD_EXAMPLES=ON \ - -DLLAMA_BUILD_SERVER=ON \ - && cmake --build build --config Release \ - && cmake --install build --config Release - -# libs -RUN find /opt/llama.cpp/build -type f -name 'lib*.so*' -exec cp {} /usr/lib64/ \; \ - && ldconfig - -# helper -COPY gguf-vram-estimator.py /usr/local/bin/gguf-vram-estimator.py -RUN chmod +x /usr/local/bin/gguf-vram-estimator.py - - -# runtime stage -FROM registry.fedoraproject.org/fedora-minimal:43 - -# runtime deps -RUN microdnf -y --nodocs --setopt=install_weak_deps=0 install \ - bash ca-certificates libatomic libstdc++ libgcc \ - vulkan-loader vulkan-loader-devel vulkaninfo mesa-vulkan-drivers radeontop procps-ng \ - && microdnf clean all && rm -rf /var/cache/dnf/* - -# copy -COPY --from=builder /usr/ /usr/ -COPY --from=builder /usr/local/ /usr/local/ -COPY --from=builder /opt/llama.cpp/build/bin/rpc-* /usr/local/bin/ - -# ld -RUN echo "/usr/local/lib" > /etc/ld.so.conf.d/local.conf \ - && echo "/usr/local/lib64" >> /etc/ld.so.conf.d/local.conf \ - && ldconfig \ - && cp -n /usr/local/lib/libllama*.so* /usr/lib64/ 2>/dev/null || true \ - && ldconfig - -# helper -COPY gguf-vram-estimator.py /usr/local/bin/gguf-vram-estimator.py -RUN chmod +x /usr/local/bin/gguf-vram-estimator.py - -# shell -CMD ["/bin/bash"]