diff --git a/benchmark/mtp-bench.py b/benchmark/mtp-bench.py
index d6c8a10..3146332 100755
--- a/benchmark/mtp-bench.py
+++ b/benchmark/mtp-bench.py
@@ -87,21 +87,33 @@ def run(args):
# OpenAI-compatible endpoint: timings are in usage or top-level
usage = r.get("usage", {}) or {}
t = r.get("timings", {}) or {}
+ prompt_n = usage.get("prompt_tokens") or t.get("prompt_n")
+ prompt_ms = t.get("prompt_ms")
+ prompt_per_second = t.get("prompt_per_second")
predicted_n = usage.get("completion_tokens") or t.get("predicted_n")
predicted_per_second = t.get("predicted_per_second") or (predicted_n / wall if wall > 0 else 0)
rec = {"name": p["name"], "wall_s": round(wall,3),
+ "prompt_n": prompt_n,
+ "prompt_ms": round(prompt_ms, 2) if prompt_ms is not None else None,
+ "prompt_per_second": round(prompt_per_second, 2) if prompt_per_second is not None else None,
"predicted_n": predicted_n, "predicted_per_second": round(predicted_per_second, 2),
"draft_n": t.get("draft_n",0), "draft_n_accepted": t.get("draft_n_accepted",0)}
rec["accept_rate"] = round(rec["draft_n_accepted"]/rec["draft_n"],4) if rec["draft_n"] else None
out["results"].append(rec)
ar = f"{rec['accept_rate']:.3f}" if rec["accept_rate"] is not None else "n/a"
- print(f" {rec['name']:<18} pred={rec['predicted_n']:>4} draft={rec['draft_n']:>4} acc={rec['draft_n_accepted']:>4} rate={ar} tok/s={rec['predicted_per_second']:.1f}")
+ pps = f" pt/s={rec['prompt_per_second']:.1f}" if rec.get("prompt_per_second") else ""
+ print(f" {rec['name']:<18} pred={rec['predicted_n']:>4} draft={rec['draft_n']:>4} acc={rec['draft_n_accepted']:>4} rate={ar} tok/s={rec['predicted_per_second']:.1f}{pps}")
td = sum(x["draft_n"] or 0 for x in out["results"])
ta = sum(x["draft_n_accepted"] or 0 for x in out["results"])
tp = sum(x["predicted_n"] or 0 for x in out["results"])
+ t_pn = sum(x["prompt_n"] or 0 for x in out["results"])
tw = sum(x["wall_s"] for x in out["results"])
+ pps_list = [x["prompt_per_second"] for x in out["results"] if x.get("prompt_per_second") is not None]
+ avg_pps = sum(pps_list)/len(pps_list) if pps_list else None
+
out["aggregate"] = {"n_requests": len(out["results"]), "total_predicted": tp, "total_draft": td, "total_draft_accepted": ta,
- "aggregate_accept_rate": round(ta/td,4) if td else None, "wall_s_total": round(tw,2)}
+ "aggregate_accept_rate": round(ta/td,4) if td else None, "wall_s_total": round(tw,2),
+ "total_prompt_tokens": t_pn, "avg_prompt_per_second": round(avg_pps, 2) if avg_pps is not None else None}
print("\nAggregate:", json.dumps(out["aggregate"], indent=2))
if args.out:
json.dump(out, open(args.out,"w"), indent=2); print("Wrote", args.out)
diff --git a/benchmark/run_mtp_bench.py b/benchmark/run_mtp_bench.py
index b33aec5..6c836c7 100755
--- a/benchmark/run_mtp_bench.py
+++ b/benchmark/run_mtp_bench.py
@@ -29,8 +29,8 @@ from urllib.error import URLError
# ── Toolbox definitions ──────────────────────────────────────────────────────
TOOLBOXES = {
- "rocm-7.2.3-mtp": {
- "image": "docker.io/kyuz0/amd-strix-halo-toolboxes:rocm-7.2.3-mtp",
+ "rocm-7.2.3": {
+ "image": "docker.io/kyuz0/amd-strix-halo-toolboxes:rocm-7.2.3",
"engine_args": [
"--device", "/dev/dri",
"--device", "/dev/kfd",
@@ -39,8 +39,8 @@ TOOLBOXES = {
"--security-opt", "seccomp=unconfined",
],
},
- "vulkan-radv-mtp": {
- "image": "docker.io/kyuz0/amd-strix-halo-toolboxes:vulkan-radv-mtp",
+ "vulkan-radv": {
+ "image": "docker.io/kyuz0/amd-strix-halo-toolboxes:vulkan-radv",
"engine_args": [
"--device", "/dev/dri",
"--group-add", "video",
@@ -415,9 +415,9 @@ def print_summary(results_dir: Path):
baselines[key] = r["_avg_toks"]
# Print table
- print("\n" + "=" * 100)
- print(f"{'Model':<30} {'Toolbox':<20} {'Mode':<10} {'Avg tok/s':>10} {'Accept%':>9} {'Wall(s)':>8} {'Speedup':>8}")
- print("-" * 100)
+ print("\n" + "=" * 115)
+ print(f"{'Model':<30} {'Toolbox':<20} {'Mode':<10} {'Prefill pt/s':>13} {'Avg tok/s':>10} {'Accept%':>9} {'Wall(s)':>8} {'Speedup':>8}")
+ print("-" * 115)
for r in results:
agg = r.get("aggregate", {})
@@ -426,6 +426,9 @@ def print_summary(results_dir: Path):
accept_str = f"{accept * 100:.1f}%" if accept is not None else "—"
avg_toks = r["_avg_toks"]
+ avg_prompt = agg.get("avg_prompt_per_second")
+ prefill_str = f"{avg_prompt:.1f}" if avg_prompt is not None else "—"
+
# Speedup relative to baseline
baseline_key = (r["model"], r["toolbox"])
baseline_toks = baselines.get(baseline_key)
@@ -434,9 +437,9 @@ def print_summary(results_dir: Path):
else:
speedup = "—"
- print(f"{r['model']:<30} {r['toolbox']:<20} {r['mode']:<10} {avg_toks:>10.1f} {accept_str:>9} {wall:>8.1f} {speedup:>8}")
+ print(f"{r['model']:<30} {r['toolbox']:<20} {r['mode']:<10} {prefill_str:>13} {avg_toks:>10.1f} {accept_str:>9} {wall:>8.1f} {speedup:>8}")
- print("=" * 100)
+ print("=" * 115)
# Write summary.json
summary_data = []
@@ -446,9 +449,11 @@ def print_summary(results_dir: Path):
"model": r["model"],
"toolbox": r["toolbox"],
"mode": r["mode"],
+ "avg_prompt_tok_s": agg.get("avg_prompt_per_second"),
"avg_tok_s": round(r["_avg_toks"], 1),
"accept_rate": agg.get("aggregate_accept_rate"),
"wall_s_total": agg.get("wall_s_total"),
+ "results": r.get("results", [])
})
summary_path = results_dir / "summary.json"
diff --git a/docs/assets/mtp.css b/docs/assets/mtp.css
index 516399b..fda32fc 100644
--- a/docs/assets/mtp.css
+++ b/docs/assets/mtp.css
@@ -103,9 +103,90 @@
font-size: 11px;
background: #f1f5ff;
color: #1d4ed8;
+ white-space: nowrap;
}
.toolbox-pill.radv {
background: #fdf2f8;
color: #9d174d;
}
+
+/* Expandable row interactivity */
+.mtp-table tbody tr.main-row {
+ cursor: pointer;
+ transition: background-color 0.15s ease;
+}
+
+.mtp-table tbody tr.main-row:hover {
+ background-color: var(--hover);
+}
+
+.mtp-table tbody tr.main-row td:first-child::before {
+ content: "▶";
+ display: inline-block;
+ font-size: 10px;
+ margin-right: 8px;
+ color: var(--muted);
+ transition: transform 0.2s ease;
+}
+
+.mtp-table tbody tr.main-row.expanded td:first-child::before {
+ transform: rotate(90deg);
+}
+
+/* Details row and sub-table */
+.details-row {
+ background-color: #f8fafc;
+}
+
+.details-row.hidden {
+ display: none;
+}
+
+.details-row td {
+ padding: 0;
+ border-bottom: 1px solid var(--border);
+}
+
+.granular-wrap {
+ padding: 16px 24px;
+ box-shadow: inset 0 2px 4px rgba(0,0,0,0.02);
+}
+
+.granular-table {
+ width: 100%;
+ border-collapse: collapse;
+ font-size: 13px;
+ background: #fff;
+ border: 1px solid var(--border);
+ border-radius: 4px;
+ overflow: hidden;
+}
+
+.granular-table th, .granular-table td {
+ padding: 8px 12px;
+ text-align: left;
+ border-bottom: 1px solid var(--border);
+}
+
+.granular-table th {
+ background: #f1f5f9;
+ font-weight: 600;
+ color: var(--ink);
+ text-transform: uppercase;
+ font-size: 11px;
+ letter-spacing: 0.5px;
+}
+
+.granular-table td.num {
+ text-align: right;
+ font-variant-numeric: tabular-nums;
+}
+
+.granular-table tr:last-child td {
+ border-bottom: none;
+}
+
+.granular-table tbody tr:hover {
+ background-color: var(--hover);
+}
diff --git a/docs/assets/mtp.js b/docs/assets/mtp.js
index 0862651..c106d24 100644
--- a/docs/assets/mtp.js
+++ b/docs/assets/mtp.js
@@ -49,6 +49,7 @@ function renderTable(runs, tbody) {
rows.forEach(row => {
const tr = document.createElement("tr");
+ tr.className = "main-row";
// Model
const tdModel = document.createElement("td");
@@ -87,7 +88,22 @@ function renderTable(runs, tbody) {
tr.appendChild(makeMetricCell(mtp3Speed));
tr.appendChild(makeSpeedupCell(baseSpeed, mtp3Speed));
+ // Details row
+ const detailsTr = document.createElement("tr");
+ detailsTr.className = "details-row hidden";
+ const detailsTd = document.createElement("td");
+ detailsTd.colSpan = 8;
+
+ detailsTd.innerHTML = makeDetailsHTML(row);
+ detailsTr.appendChild(detailsTd);
+
+ tr.addEventListener("click", () => {
+ tr.classList.toggle("expanded");
+ detailsTr.classList.toggle("hidden");
+ });
+
tbody.appendChild(tr);
+ tbody.appendChild(detailsTr);
});
}
@@ -126,3 +142,99 @@ function makeSpeedupCell(base, mtp) {
}
return td;
}
+
+function makeDetailsHTML(row) {
+ if (!row.baseline || !row.baseline.results || row.baseline.results.length === 0) {
+ return `
Granular data not available for this run. Re-run benchmarks to capture prompt-level metrics.
`;
+ }
+
+ const tasks = new Map();
+ const modes = [
+ { key: "base", data: row.baseline },
+ { key: "mtp2", data: row.mtp2 },
+ { key: "mtp3", data: row.mtp3 }
+ ];
+
+ modes.forEach(mode => {
+ if (!mode.data || !mode.data.results) return;
+ mode.data.results.forEach(res => {
+ if (!tasks.has(res.name)) {
+ tasks.set(res.name, { name: res.name });
+ }
+ const t = tasks.get(res.name);
+ t[`${mode.key}_prefill`] = res.prompt_per_second;
+ t[`${mode.key}_toks`] = res.predicted_per_second;
+ t[`${mode.key}_acc`] = res.accept_rate;
+ });
+ });
+
+ let html = `
+
+
+
+
+ | Prompt Task |
+ Prefill (Base) |
+ Prefill (MTP-2) |
+ Prefill (MTP-3) |
+ Base Gen |
+ MTP-2 Gen |
+ Acc% |
+ MTP-3 Gen |
+ Acc% |
+
+
+
+ `;
+
+ tasks.forEach(t => {
+ const p_base_val = t.base_prefill;
+ const p_mtp2_val = t.mtp2_prefill;
+ const p_mtp3_val = t.mtp3_prefill;
+
+ const p_base = p_base_val ? p_base_val.toFixed(1) : "—";
+ let p_mtp2 = p_mtp2_val ? p_mtp2_val.toFixed(1) : "—";
+ let p_mtp3 = p_mtp3_val ? p_mtp3_val.toFixed(1) : "—";
+
+ if (p_base_val && p_mtp2_val) {
+ const pct = ((p_mtp2_val - p_base_val) / p_base_val) * 100;
+ const color = pct >= 0 ? '#16a34a' : '#dc2626';
+ const sign = pct > 0 ? '+' : '';
+ p_mtp2 += ` ${sign}${pct.toFixed(1)}%`;
+ }
+
+ if (p_base_val && p_mtp3_val) {
+ const pct = ((p_mtp3_val - p_base_val) / p_base_val) * 100;
+ const color = pct >= 0 ? '#16a34a' : '#dc2626';
+ const sign = pct > 0 ? '+' : '';
+ p_mtp3 += ` ${sign}${pct.toFixed(1)}%`;
+ }
+
+ const g_base = t.base_toks ? t.base_toks.toFixed(1) : "—";
+ const g_mtp2 = t.mtp2_toks ? t.mtp2_toks.toFixed(1) : "—";
+ const a_mtp2 = t.mtp2_acc !== null && t.mtp2_acc !== undefined ? (t.mtp2_acc * 100).toFixed(1) + "%" : "—";
+ const g_mtp3 = t.mtp3_toks ? t.mtp3_toks.toFixed(1) : "—";
+ const a_mtp3 = t.mtp3_acc !== null && t.mtp3_acc !== undefined ? (t.mtp3_acc * 100).toFixed(1) + "%" : "—";
+
+ html += `
+
+ | ${t.name} |
+ ${p_base} |
+ ${p_mtp2} |
+ ${p_mtp3} |
+ ${g_base} |
+ ${g_mtp2} |
+ ${a_mtp2} |
+ ${g_mtp3} |
+ ${a_mtp3} |
+
+ `;
+ });
+
+ html += `
+
+
+
`;
+
+ return html;
+}
diff --git a/docs/mtp-summary.json b/docs/mtp-summary.json
index a5544bf..ce0276d 100644
--- a/docs/mtp-summary.json
+++ b/docs/mtp-summary.json
@@ -1,98 +1,1430 @@
[
{
"model": "Qwen3.6-27B-UD-Q8_K_XL",
- "toolbox": "rocm-7.2.3-mtp",
+ "toolbox": "rocm-7.2.3",
"mode": "baseline",
- "avg_tok_s": 6.5,
+ "avg_prompt_tok_s": 104.9,
+ "avg_tok_s": 6.4,
"accept_rate": null,
- "wall_s_total": 273.39
+ "wall_s_total": 274.48,
+ "results": [
+ {
+ "name": "code_python",
+ "wall_s": 30.306,
+ "prompt_n": 30,
+ "prompt_ms": 509.43,
+ "prompt_per_second": 58.89,
+ "predicted_n": 192,
+ "predicted_per_second": 6.45,
+ "draft_n": 0,
+ "draft_n_accepted": 0,
+ "accept_rate": null
+ },
+ {
+ "name": "code_cpp",
+ "wall_s": 30.249,
+ "prompt_n": 40,
+ "prompt_ms": 428.45,
+ "prompt_per_second": 93.36,
+ "predicted_n": 192,
+ "predicted_per_second": 6.45,
+ "draft_n": 0,
+ "draft_n_accepted": 0,
+ "accept_rate": null
+ },
+ {
+ "name": "explain_concept",
+ "wall_s": 30.3,
+ "prompt_n": 27,
+ "prompt_ms": 469.46,
+ "prompt_per_second": 57.51,
+ "predicted_n": 192,
+ "predicted_per_second": 6.44,
+ "draft_n": 0,
+ "draft_n_accepted": 0,
+ "accept_rate": null
+ },
+ {
+ "name": "summarize",
+ "wall_s": 30.292,
+ "prompt_n": 62,
+ "prompt_ms": 472.42,
+ "prompt_per_second": 131.24,
+ "predicted_n": 192,
+ "predicted_per_second": 6.45,
+ "draft_n": 0,
+ "draft_n_accepted": 0,
+ "accept_rate": null
+ },
+ {
+ "name": "qa_factual",
+ "wall_s": 30.31,
+ "prompt_n": 24,
+ "prompt_ms": 467.84,
+ "prompt_per_second": 51.3,
+ "predicted_n": 192,
+ "predicted_per_second": 6.44,
+ "draft_n": 0,
+ "draft_n_accepted": 0,
+ "accept_rate": null
+ },
+ {
+ "name": "translation",
+ "wall_s": 30.299,
+ "prompt_n": 25,
+ "prompt_ms": 467.72,
+ "prompt_per_second": 53.45,
+ "predicted_n": 192,
+ "predicted_per_second": 6.44,
+ "draft_n": 0,
+ "draft_n_accepted": 0,
+ "accept_rate": null
+ },
+ {
+ "name": "creative_short",
+ "wall_s": 30.312,
+ "prompt_n": 21,
+ "prompt_ms": 476.88,
+ "prompt_per_second": 44.04,
+ "predicted_n": 192,
+ "predicted_per_second": 6.44,
+ "draft_n": 0,
+ "draft_n_accepted": 0,
+ "accept_rate": null
+ },
+ {
+ "name": "stepwise_math",
+ "wall_s": 30.29,
+ "prompt_n": 60,
+ "prompt_ms": 469.75,
+ "prompt_per_second": 127.73,
+ "predicted_n": 192,
+ "predicted_per_second": 6.45,
+ "draft_n": 0,
+ "draft_n_accepted": 0,
+ "accept_rate": null
+ },
+ {
+ "name": "long_code_review",
+ "wall_s": 32.122,
+ "prompt_n": 731,
+ "prompt_ms": 2238.33,
+ "prompt_per_second": 326.58,
+ "predicted_n": 192,
+ "predicted_per_second": 6.43,
+ "draft_n": 0,
+ "draft_n_accepted": 0,
+ "accept_rate": null
+ }
+ ]
},
{
"model": "Qwen3.6-27B-UD-Q8_K_XL",
- "toolbox": "rocm-7.2.3-mtp",
+ "toolbox": "rocm-7.2.3",
"mode": "mtp-2",
- "avg_tok_s": 12.4,
+ "avg_prompt_tok_s": 97.38,
+ "avg_tok_s": 12.6,
"accept_rate": 0.7971,
- "wall_s_total": 147.31
+ "wall_s_total": 144.29,
+ "results": [
+ {
+ "name": "code_python",
+ "wall_s": 15.171,
+ "prompt_n": 30,
+ "prompt_ms": 527.23,
+ "prompt_per_second": 56.9,
+ "predicted_n": 192,
+ "predicted_per_second": 13.12,
+ "draft_n": 140,
+ "draft_n_accepted": 120,
+ "accept_rate": 0.8571
+ },
+ {
+ "name": "code_cpp",
+ "wall_s": 15.787,
+ "prompt_n": 40,
+ "prompt_ms": 461.35,
+ "prompt_per_second": 86.7,
+ "predicted_n": 192,
+ "predicted_per_second": 12.56,
+ "draft_n": 147,
+ "draft_n_accepted": 117,
+ "accept_rate": 0.7959
+ },
+ {
+ "name": "explain_concept",
+ "wall_s": 15.203,
+ "prompt_n": 27,
+ "prompt_ms": 495.97,
+ "prompt_per_second": 54.44,
+ "predicted_n": 192,
+ "predicted_per_second": 13.09,
+ "draft_n": 141,
+ "draft_n_accepted": 120,
+ "accept_rate": 0.8511
+ },
+ {
+ "name": "summarize",
+ "wall_s": 14.803,
+ "prompt_n": 62,
+ "prompt_ms": 508.33,
+ "prompt_per_second": 121.97,
+ "predicted_n": 192,
+ "predicted_per_second": 13.47,
+ "draft_n": 137,
+ "draft_n_accepted": 122,
+ "accept_rate": 0.8905
+ },
+ {
+ "name": "qa_factual",
+ "wall_s": 15.822,
+ "prompt_n": 24,
+ "prompt_ms": 495.12,
+ "prompt_per_second": 48.47,
+ "predicted_n": 192,
+ "predicted_per_second": 12.56,
+ "draft_n": 148,
+ "draft_n_accepted": 117,
+ "accept_rate": 0.7905
+ },
+ {
+ "name": "translation",
+ "wall_s": 16.442,
+ "prompt_n": 25,
+ "prompt_ms": 496.05,
+ "prompt_per_second": 50.4,
+ "predicted_n": 192,
+ "predicted_per_second": 12.07,
+ "draft_n": 153,
+ "draft_n_accepted": 114,
+ "accept_rate": 0.7451
+ },
+ {
+ "name": "creative_short",
+ "wall_s": 17.224,
+ "prompt_n": 21,
+ "prompt_ms": 492.99,
+ "prompt_per_second": 42.6,
+ "predicted_n": 192,
+ "predicted_per_second": 11.5,
+ "draft_n": 160,
+ "draft_n_accepted": 110,
+ "accept_rate": 0.6875
+ },
+ {
+ "name": "stepwise_math",
+ "wall_s": 15.005,
+ "prompt_n": 60,
+ "prompt_ms": 503.13,
+ "prompt_per_second": 119.25,
+ "predicted_n": 192,
+ "predicted_per_second": 13.28,
+ "draft_n": 139,
+ "draft_n_accepted": 121,
+ "accept_rate": 0.8705
+ },
+ {
+ "name": "long_code_review",
+ "wall_s": 18.833,
+ "prompt_n": 731,
+ "prompt_ms": 2471.97,
+ "prompt_per_second": 295.72,
+ "predicted_n": 192,
+ "predicted_per_second": 11.76,
+ "draft_n": 156,
+ "draft_n_accepted": 112,
+ "accept_rate": 0.7179
+ }
+ ]
},
{
"model": "Qwen3.6-27B-UD-Q8_K_XL",
- "toolbox": "rocm-7.2.3-mtp",
+ "toolbox": "rocm-7.2.3",
"mode": "mtp-3",
- "avg_tok_s": 13.5,
+ "avg_prompt_tok_s": 95.5,
+ "avg_tok_s": 13.7,
"accept_rate": 0.744,
- "wall_s_total": 135.2
+ "wall_s_total": 133.44,
+ "results": [
+ {
+ "name": "code_python",
+ "wall_s": 13.55,
+ "prompt_n": 30,
+ "prompt_ms": 542.34,
+ "prompt_per_second": 55.32,
+ "predicted_n": 192,
+ "predicted_per_second": 14.78,
+ "draft_n": 163,
+ "draft_n_accepted": 136,
+ "accept_rate": 0.8344
+ },
+ {
+ "name": "code_cpp",
+ "wall_s": 14.924,
+ "prompt_n": 40,
+ "prompt_ms": 473.5,
+ "prompt_per_second": 84.48,
+ "predicted_n": 192,
+ "predicted_per_second": 13.32,
+ "draft_n": 181,
+ "draft_n_accepted": 130,
+ "accept_rate": 0.7182
+ },
+ {
+ "name": "explain_concept",
+ "wall_s": 14.489,
+ "prompt_n": 27,
+ "prompt_ms": 510.02,
+ "prompt_per_second": 52.94,
+ "predicted_n": 192,
+ "predicted_per_second": 13.77,
+ "draft_n": 175,
+ "draft_n_accepted": 132,
+ "accept_rate": 0.7543
+ },
+ {
+ "name": "summarize",
+ "wall_s": 13.973,
+ "prompt_n": 62,
+ "prompt_ms": 522.38,
+ "prompt_per_second": 118.69,
+ "predicted_n": 192,
+ "predicted_per_second": 14.32,
+ "draft_n": 168,
+ "draft_n_accepted": 134,
+ "accept_rate": 0.7976
+ },
+ {
+ "name": "qa_factual",
+ "wall_s": 14.491,
+ "prompt_n": 24,
+ "prompt_ms": 508.76,
+ "prompt_per_second": 47.17,
+ "predicted_n": 192,
+ "predicted_per_second": 13.77,
+ "draft_n": 176,
+ "draft_n_accepted": 132,
+ "accept_rate": 0.75
+ },
+ {
+ "name": "translation",
+ "wall_s": 15.448,
+ "prompt_n": 25,
+ "prompt_ms": 508.67,
+ "prompt_per_second": 49.15,
+ "predicted_n": 192,
+ "predicted_per_second": 12.88,
+ "draft_n": 189,
+ "draft_n_accepted": 128,
+ "accept_rate": 0.6772
+ },
+ {
+ "name": "creative_short",
+ "wall_s": 15.432,
+ "prompt_n": 21,
+ "prompt_ms": 505.51,
+ "prompt_per_second": 41.54,
+ "predicted_n": 192,
+ "predicted_per_second": 12.9,
+ "draft_n": 187,
+ "draft_n_accepted": 128,
+ "accept_rate": 0.6845
+ },
+ {
+ "name": "stepwise_math",
+ "wall_s": 13.969,
+ "prompt_n": 60,
+ "prompt_ms": 516.53,
+ "prompt_per_second": 116.16,
+ "predicted_n": 192,
+ "predicted_per_second": 14.31,
+ "draft_n": 168,
+ "draft_n_accepted": 134,
+ "accept_rate": 0.7976
+ },
+ {
+ "name": "long_code_review",
+ "wall_s": 17.16,
+ "prompt_n": 731,
+ "prompt_ms": 2486.03,
+ "prompt_per_second": 294.04,
+ "predicted_n": 192,
+ "predicted_per_second": 13.12,
+ "draft_n": 183,
+ "draft_n_accepted": 129,
+ "accept_rate": 0.7049
+ }
+ ]
},
{
"model": "Qwen3.6-27B-UD-Q8_K_XL",
- "toolbox": "vulkan-radv-mtp",
+ "toolbox": "vulkan-radv",
"mode": "baseline",
+ "avg_prompt_tok_s": 97.03,
"avg_tok_s": 6.3,
"accept_rate": null,
- "wall_s_total": 283.86
+ "wall_s_total": 280.34,
+ "results": [
+ {
+ "name": "code_python",
+ "wall_s": 30.889,
+ "prompt_n": 30,
+ "prompt_ms": 497.72,
+ "prompt_per_second": 60.27,
+ "predicted_n": 192,
+ "predicted_per_second": 6.32,
+ "draft_n": 0,
+ "draft_n_accepted": 0,
+ "accept_rate": null
+ },
+ {
+ "name": "code_cpp",
+ "wall_s": 30.922,
+ "prompt_n": 40,
+ "prompt_ms": 513.86,
+ "prompt_per_second": 77.84,
+ "predicted_n": 192,
+ "predicted_per_second": 6.32,
+ "draft_n": 0,
+ "draft_n_accepted": 0,
+ "accept_rate": null
+ },
+ {
+ "name": "explain_concept",
+ "wall_s": 30.807,
+ "prompt_n": 27,
+ "prompt_ms": 404.26,
+ "prompt_per_second": 66.79,
+ "predicted_n": 192,
+ "predicted_per_second": 6.33,
+ "draft_n": 0,
+ "draft_n_accepted": 0,
+ "accept_rate": null
+ },
+ {
+ "name": "summarize",
+ "wall_s": 30.907,
+ "prompt_n": 62,
+ "prompt_ms": 490.66,
+ "prompt_per_second": 126.36,
+ "predicted_n": 192,
+ "predicted_per_second": 6.32,
+ "draft_n": 0,
+ "draft_n_accepted": 0,
+ "accept_rate": null
+ },
+ {
+ "name": "qa_factual",
+ "wall_s": 30.809,
+ "prompt_n": 24,
+ "prompt_ms": 400.38,
+ "prompt_per_second": 59.94,
+ "predicted_n": 192,
+ "predicted_per_second": 6.32,
+ "draft_n": 0,
+ "draft_n_accepted": 0,
+ "accept_rate": null
+ },
+ {
+ "name": "translation",
+ "wall_s": 30.815,
+ "prompt_n": 25,
+ "prompt_ms": 401.5,
+ "prompt_per_second": 62.27,
+ "predicted_n": 192,
+ "predicted_per_second": 6.32,
+ "draft_n": 0,
+ "draft_n_accepted": 0,
+ "accept_rate": null
+ },
+ {
+ "name": "creative_short",
+ "wall_s": 30.809,
+ "prompt_n": 21,
+ "prompt_ms": 397.41,
+ "prompt_per_second": 52.84,
+ "predicted_n": 192,
+ "predicted_per_second": 6.32,
+ "draft_n": 0,
+ "draft_n_accepted": 0,
+ "accept_rate": null
+ },
+ {
+ "name": "stepwise_math",
+ "wall_s": 30.9,
+ "prompt_n": 60,
+ "prompt_ms": 488.22,
+ "prompt_per_second": 122.9,
+ "predicted_n": 192,
+ "predicted_per_second": 6.32,
+ "draft_n": 0,
+ "draft_n_accepted": 0,
+ "accept_rate": null
+ },
+ {
+ "name": "long_code_review",
+ "wall_s": 33.483,
+ "prompt_n": 731,
+ "prompt_ms": 2995.01,
+ "prompt_per_second": 244.07,
+ "predicted_n": 192,
+ "predicted_per_second": 6.31,
+ "draft_n": 0,
+ "draft_n_accepted": 0,
+ "accept_rate": null
+ }
+ ]
},
{
"model": "Qwen3.6-27B-UD-Q8_K_XL",
- "toolbox": "vulkan-radv-mtp",
+ "toolbox": "vulkan-radv",
"mode": "mtp-2",
+ "avg_prompt_tok_s": 88.14,
"avg_tok_s": 11.7,
"accept_rate": 0.8024,
- "wall_s_total": 159.41
+ "wall_s_total": 156.11,
+ "results": [
+ {
+ "name": "code_python",
+ "wall_s": 16.264,
+ "prompt_n": 30,
+ "prompt_ms": 538.04,
+ "prompt_per_second": 55.76,
+ "predicted_n": 192,
+ "predicted_per_second": 12.22,
+ "draft_n": 140,
+ "draft_n_accepted": 121,
+ "accept_rate": 0.8643
+ },
+ {
+ "name": "code_cpp",
+ "wall_s": 16.926,
+ "prompt_n": 40,
+ "prompt_ms": 564.77,
+ "prompt_per_second": 70.83,
+ "predicted_n": 192,
+ "predicted_per_second": 11.76,
+ "draft_n": 146,
+ "draft_n_accepted": 118,
+ "accept_rate": 0.8082
+ },
+ {
+ "name": "explain_concept",
+ "wall_s": 17.671,
+ "prompt_n": 27,
+ "prompt_ms": 438.41,
+ "prompt_per_second": 61.59,
+ "predicted_n": 192,
+ "predicted_per_second": 11.16,
+ "draft_n": 152,
+ "draft_n_accepted": 114,
+ "accept_rate": 0.75
+ },
+ {
+ "name": "summarize",
+ "wall_s": 15.168,
+ "prompt_n": 62,
+ "prompt_ms": 541.89,
+ "prompt_per_second": 114.41,
+ "predicted_n": 192,
+ "predicted_per_second": 13.17,
+ "draft_n": 130,
+ "draft_n_accepted": 126,
+ "accept_rate": 0.9692
+ },
+ {
+ "name": "qa_factual",
+ "wall_s": 16.558,
+ "prompt_n": 24,
+ "prompt_ms": 438.01,
+ "prompt_per_second": 54.79,
+ "predicted_n": 192,
+ "predicted_per_second": 11.95,
+ "draft_n": 142,
+ "draft_n_accepted": 119,
+ "accept_rate": 0.838
+ },
+ {
+ "name": "translation",
+ "wall_s": 17.52,
+ "prompt_n": 25,
+ "prompt_ms": 437.99,
+ "prompt_per_second": 57.08,
+ "predicted_n": 192,
+ "predicted_per_second": 11.27,
+ "draft_n": 151,
+ "draft_n_accepted": 115,
+ "accept_rate": 0.7616
+ },
+ {
+ "name": "creative_short",
+ "wall_s": 18.359,
+ "prompt_n": 21,
+ "prompt_ms": 433.87,
+ "prompt_per_second": 48.4,
+ "predicted_n": 192,
+ "predicted_per_second": 10.74,
+ "draft_n": 157,
+ "draft_n_accepted": 111,
+ "accept_rate": 0.707
+ },
+ {
+ "name": "stepwise_math",
+ "wall_s": 16.929,
+ "prompt_n": 60,
+ "prompt_ms": 530.6,
+ "prompt_per_second": 113.08,
+ "predicted_n": 192,
+ "predicted_per_second": 11.74,
+ "draft_n": 145,
+ "draft_n_accepted": 118,
+ "accept_rate": 0.8138
+ },
+ {
+ "name": "long_code_review",
+ "wall_s": 20.716,
+ "prompt_n": 731,
+ "prompt_ms": 3363.15,
+ "prompt_per_second": 217.36,
+ "predicted_n": 192,
+ "predicted_per_second": 11.1,
+ "draft_n": 153,
+ "draft_n_accepted": 114,
+ "accept_rate": 0.7451
+ }
+ ]
},
{
"model": "Qwen3.6-27B-UD-Q8_K_XL",
- "toolbox": "vulkan-radv-mtp",
+ "toolbox": "vulkan-radv",
"mode": "mtp-3",
+ "avg_prompt_tok_s": 86.42,
"avg_tok_s": 13.3,
"accept_rate": 0.7301,
- "wall_s_total": 141.74
+ "wall_s_total": 137.86,
+ "results": [
+ {
+ "name": "code_python",
+ "wall_s": 13.782,
+ "prompt_n": 30,
+ "prompt_ms": 550.68,
+ "prompt_per_second": 54.48,
+ "predicted_n": 192,
+ "predicted_per_second": 14.53,
+ "draft_n": 163,
+ "draft_n_accepted": 136,
+ "accept_rate": 0.8344
+ },
+ {
+ "name": "code_cpp",
+ "wall_s": 14.755,
+ "prompt_n": 40,
+ "prompt_ms": 578.77,
+ "prompt_per_second": 69.11,
+ "predicted_n": 192,
+ "predicted_per_second": 13.58,
+ "draft_n": 175,
+ "draft_n_accepted": 132,
+ "accept_rate": 0.7543
+ },
+ {
+ "name": "explain_concept",
+ "wall_s": 16.075,
+ "prompt_n": 27,
+ "prompt_ms": 452.06,
+ "prompt_per_second": 59.73,
+ "predicted_n": 192,
+ "predicted_per_second": 12.32,
+ "draft_n": 195,
+ "draft_n_accepted": 126,
+ "accept_rate": 0.6462
+ },
+ {
+ "name": "summarize",
+ "wall_s": 14.065,
+ "prompt_n": 62,
+ "prompt_ms": 553.79,
+ "prompt_per_second": 111.96,
+ "predicted_n": 192,
+ "predicted_per_second": 14.26,
+ "draft_n": 167,
+ "draft_n_accepted": 135,
+ "accept_rate": 0.8084
+ },
+ {
+ "name": "qa_factual",
+ "wall_s": 14.198,
+ "prompt_n": 24,
+ "prompt_ms": 450.37,
+ "prompt_per_second": 53.29,
+ "predicted_n": 192,
+ "predicted_per_second": 14.02,
+ "draft_n": 171,
+ "draft_n_accepted": 134,
+ "accept_rate": 0.7836
+ },
+ {
+ "name": "translation",
+ "wall_s": 14.908,
+ "prompt_n": 25,
+ "prompt_ms": 452.28,
+ "prompt_per_second": 55.28,
+ "predicted_n": 192,
+ "predicted_per_second": 13.33,
+ "draft_n": 179,
+ "draft_n_accepted": 131,
+ "accept_rate": 0.7318
+ },
+ {
+ "name": "creative_short",
+ "wall_s": 16.099,
+ "prompt_n": 21,
+ "prompt_ms": 448.93,
+ "prompt_per_second": 46.78,
+ "predicted_n": 192,
+ "predicted_per_second": 12.31,
+ "draft_n": 192,
+ "draft_n_accepted": 126,
+ "accept_rate": 0.6562
+ },
+ {
+ "name": "stepwise_math",
+ "wall_s": 15.415,
+ "prompt_n": 60,
+ "prompt_ms": 543.5,
+ "prompt_per_second": 110.4,
+ "predicted_n": 192,
+ "predicted_per_second": 12.95,
+ "draft_n": 183,
+ "draft_n_accepted": 129,
+ "accept_rate": 0.7049
+ },
+ {
+ "name": "long_code_review",
+ "wall_s": 18.562,
+ "prompt_n": 731,
+ "prompt_ms": 3372.45,
+ "prompt_per_second": 216.76,
+ "predicted_n": 192,
+ "predicted_per_second": 12.68,
+ "draft_n": 187,
+ "draft_n_accepted": 128,
+ "accept_rate": 0.6845
+ }
+ ]
},
{
"model": "Qwen3.6-35B-A3B-UD-Q4_K_XL",
- "toolbox": "rocm-7.2.3-mtp",
+ "toolbox": "rocm-7.2.3",
"mode": "baseline",
- "avg_tok_s": 48.7,
+ "avg_prompt_tok_s": 352.21,
+ "avg_tok_s": 51.1,
"accept_rate": null,
- "wall_s_total": 37.55
+ "wall_s_total": 35.79,
+ "results": [
+ {
+ "name": "code_python",
+ "wall_s": 3.904,
+ "prompt_n": 30,
+ "prompt_ms": 136.88,
+ "prompt_per_second": 219.17,
+ "predicted_n": 192,
+ "predicted_per_second": 51.14,
+ "draft_n": 0,
+ "draft_n_accepted": 0,
+ "accept_rate": null
+ },
+ {
+ "name": "code_cpp",
+ "wall_s": 3.901,
+ "prompt_n": 40,
+ "prompt_ms": 126.79,
+ "prompt_per_second": 315.49,
+ "predicted_n": 192,
+ "predicted_per_second": 51.15,
+ "draft_n": 0,
+ "draft_n_accepted": 0,
+ "accept_rate": null
+ },
+ {
+ "name": "explain_concept",
+ "wall_s": 3.883,
+ "prompt_n": 27,
+ "prompt_ms": 109.03,
+ "prompt_per_second": 247.65,
+ "predicted_n": 192,
+ "predicted_per_second": 51.16,
+ "draft_n": 0,
+ "draft_n_accepted": 0,
+ "accept_rate": null
+ },
+ {
+ "name": "summarize",
+ "wall_s": 3.931,
+ "prompt_n": 62,
+ "prompt_ms": 155.31,
+ "prompt_per_second": 399.2,
+ "predicted_n": 192,
+ "predicted_per_second": 51.14,
+ "draft_n": 0,
+ "draft_n_accepted": 0,
+ "accept_rate": null
+ },
+ {
+ "name": "qa_factual",
+ "wall_s": 3.88,
+ "prompt_n": 24,
+ "prompt_ms": 104.26,
+ "prompt_per_second": 230.19,
+ "predicted_n": 192,
+ "predicted_per_second": 51.15,
+ "draft_n": 0,
+ "draft_n_accepted": 0,
+ "accept_rate": null
+ },
+ {
+ "name": "translation",
+ "wall_s": 3.879,
+ "prompt_n": 25,
+ "prompt_ms": 104.91,
+ "prompt_per_second": 238.3,
+ "predicted_n": 192,
+ "predicted_per_second": 51.15,
+ "draft_n": 0,
+ "draft_n_accepted": 0,
+ "accept_rate": null
+ },
+ {
+ "name": "creative_short",
+ "wall_s": 3.877,
+ "prompt_n": 21,
+ "prompt_ms": 101.75,
+ "prompt_per_second": 206.38,
+ "predicted_n": 192,
+ "predicted_per_second": 51.15,
+ "draft_n": 0,
+ "draft_n_accepted": 0,
+ "accept_rate": null
+ },
+ {
+ "name": "stepwise_math",
+ "wall_s": 3.925,
+ "prompt_n": 60,
+ "prompt_ms": 148.79,
+ "prompt_per_second": 403.25,
+ "predicted_n": 192,
+ "predicted_per_second": 51.14,
+ "draft_n": 0,
+ "draft_n_accepted": 0,
+ "accept_rate": null
+ },
+ {
+ "name": "long_code_review",
+ "wall_s": 4.606,
+ "prompt_n": 731,
+ "prompt_ms": 803.04,
+ "prompt_per_second": 910.29,
+ "predicted_n": 192,
+ "predicted_per_second": 50.8,
+ "draft_n": 0,
+ "draft_n_accepted": 0,
+ "accept_rate": null
+ }
+ ]
},
{
"model": "Qwen3.6-35B-A3B-UD-Q4_K_XL",
- "toolbox": "rocm-7.2.3-mtp",
+ "toolbox": "rocm-7.2.3",
"mode": "mtp-2",
- "avg_tok_s": 64.5,
- "accept_rate": 0.7958,
- "wall_s_total": 29.33
+ "avg_prompt_tok_s": 309.53,
+ "avg_tok_s": 67.5,
+ "accept_rate": 0.8183,
+ "wall_s_total": 27.94,
+ "results": [
+ {
+ "name": "code_python",
+ "wall_s": 2.844,
+ "prompt_n": 30,
+ "prompt_ms": 168.12,
+ "prompt_per_second": 178.44,
+ "predicted_n": 192,
+ "predicted_per_second": 72.12,
+ "draft_n": 133,
+ "draft_n_accepted": 123,
+ "accept_rate": 0.9248
+ },
+ {
+ "name": "code_cpp",
+ "wall_s": 2.906,
+ "prompt_n": 40,
+ "prompt_ms": 141.09,
+ "prompt_per_second": 283.51,
+ "predicted_n": 192,
+ "predicted_per_second": 69.99,
+ "draft_n": 139,
+ "draft_n_accepted": 121,
+ "accept_rate": 0.8705
+ },
+ {
+ "name": "explain_concept",
+ "wall_s": 3.202,
+ "prompt_n": 27,
+ "prompt_ms": 120.53,
+ "prompt_per_second": 224.02,
+ "predicted_n": 192,
+ "predicted_per_second": 62.77,
+ "draft_n": 156,
+ "draft_n_accepted": 113,
+ "accept_rate": 0.7244
+ },
+ {
+ "name": "summarize",
+ "wall_s": 2.894,
+ "prompt_n": 62,
+ "prompt_ms": 173.04,
+ "prompt_per_second": 358.31,
+ "predicted_n": 192,
+ "predicted_per_second": 71.13,
+ "draft_n": 136,
+ "draft_n_accepted": 122,
+ "accept_rate": 0.8971
+ },
+ {
+ "name": "qa_factual",
+ "wall_s": 2.88,
+ "prompt_n": 24,
+ "prompt_ms": 115.19,
+ "prompt_per_second": 208.36,
+ "predicted_n": 192,
+ "predicted_per_second": 70.01,
+ "draft_n": 139,
+ "draft_n_accepted": 121,
+ "accept_rate": 0.8705
+ },
+ {
+ "name": "translation",
+ "wall_s": 2.995,
+ "prompt_n": 25,
+ "prompt_ms": 116.14,
+ "prompt_per_second": 215.25,
+ "predicted_n": 192,
+ "predicted_per_second": 67.18,
+ "draft_n": 145,
+ "draft_n_accepted": 118,
+ "accept_rate": 0.8138
+ },
+ {
+ "name": "creative_short",
+ "wall_s": 3.171,
+ "prompt_n": 21,
+ "prompt_ms": 112.34,
+ "prompt_per_second": 186.93,
+ "predicted_n": 192,
+ "predicted_per_second": 63.23,
+ "draft_n": 154,
+ "draft_n_accepted": 113,
+ "accept_rate": 0.7338
+ },
+ {
+ "name": "stepwise_math",
+ "wall_s": 3.037,
+ "prompt_n": 60,
+ "prompt_ms": 161.11,
+ "prompt_per_second": 372.42,
+ "predicted_n": 192,
+ "predicted_per_second": 67.25,
+ "draft_n": 145,
+ "draft_n_accepted": 118,
+ "accept_rate": 0.8138
+ },
+ {
+ "name": "long_code_review",
+ "wall_s": 4.009,
+ "prompt_n": 731,
+ "prompt_ms": 963.68,
+ "prompt_per_second": 758.55,
+ "predicted_n": 192,
+ "predicted_per_second": 63.52,
+ "draft_n": 152,
+ "draft_n_accepted": 114,
+ "accept_rate": 0.75
+ }
+ ]
},
{
"model": "Qwen3.6-35B-A3B-UD-Q4_K_XL",
- "toolbox": "rocm-7.2.3-mtp",
+ "toolbox": "rocm-7.2.3",
"mode": "mtp-3",
- "avg_tok_s": 68.3,
+ "avg_prompt_tok_s": 302.26,
+ "avg_tok_s": 70.3,
"accept_rate": 0.7386,
- "wall_s_total": 27.83
+ "wall_s_total": 27.0,
+ "results": [
+ {
+ "name": "code_python",
+ "wall_s": 2.789,
+ "prompt_n": 30,
+ "prompt_ms": 165.27,
+ "prompt_per_second": 181.52,
+ "predicted_n": 192,
+ "predicted_per_second": 73.57,
+ "draft_n": 168,
+ "draft_n_accepted": 134,
+ "accept_rate": 0.7976
+ },
+ {
+ "name": "code_cpp",
+ "wall_s": 2.714,
+ "prompt_n": 40,
+ "prompt_ms": 146.13,
+ "prompt_per_second": 273.74,
+ "predicted_n": 192,
+ "predicted_per_second": 75.39,
+ "draft_n": 165,
+ "draft_n_accepted": 135,
+ "accept_rate": 0.8182
+ },
+ {
+ "name": "explain_concept",
+ "wall_s": 3.246,
+ "prompt_n": 27,
+ "prompt_ms": 126.13,
+ "prompt_per_second": 214.07,
+ "predicted_n": 192,
+ "predicted_per_second": 62.01,
+ "draft_n": 201,
+ "draft_n_accepted": 123,
+ "accept_rate": 0.6119
+ },
+ {
+ "name": "summarize",
+ "wall_s": 2.664,
+ "prompt_n": 62,
+ "prompt_ms": 178.24,
+ "prompt_per_second": 347.84,
+ "predicted_n": 192,
+ "predicted_per_second": 77.93,
+ "draft_n": 157,
+ "draft_n_accepted": 137,
+ "accept_rate": 0.8726
+ },
+ {
+ "name": "qa_factual",
+ "wall_s": 2.762,
+ "prompt_n": 24,
+ "prompt_ms": 119.88,
+ "prompt_per_second": 200.2,
+ "predicted_n": 192,
+ "predicted_per_second": 73.32,
+ "draft_n": 169,
+ "draft_n_accepted": 134,
+ "accept_rate": 0.7929
+ },
+ {
+ "name": "translation",
+ "wall_s": 2.875,
+ "prompt_n": 25,
+ "prompt_ms": 121.36,
+ "prompt_per_second": 206.0,
+ "predicted_n": 192,
+ "predicted_per_second": 70.29,
+ "draft_n": 177,
+ "draft_n_accepted": 131,
+ "accept_rate": 0.7401
+ },
+ {
+ "name": "creative_short",
+ "wall_s": 3.157,
+ "prompt_n": 21,
+ "prompt_ms": 116.34,
+ "prompt_per_second": 180.5,
+ "predicted_n": 192,
+ "predicted_per_second": 63.6,
+ "draft_n": 197,
+ "draft_n_accepted": 125,
+ "accept_rate": 0.6345
+ },
+ {
+ "name": "stepwise_math",
+ "wall_s": 2.912,
+ "prompt_n": 60,
+ "prompt_ms": 165.66,
+ "prompt_per_second": 362.18,
+ "predicted_n": 192,
+ "predicted_per_second": 70.46,
+ "draft_n": 177,
+ "draft_n_accepted": 131,
+ "accept_rate": 0.7401
+ },
+ {
+ "name": "long_code_review",
+ "wall_s": 3.882,
+ "prompt_n": 731,
+ "prompt_ms": 969.08,
+ "prompt_per_second": 754.32,
+ "predicted_n": 192,
+ "predicted_per_second": 66.48,
+ "draft_n": 184,
+ "draft_n_accepted": 128,
+ "accept_rate": 0.6957
+ }
+ ]
},
{
"model": "Qwen3.6-35B-A3B-UD-Q4_K_XL",
- "toolbox": "vulkan-radv-mtp",
+ "toolbox": "vulkan-radv",
"mode": "baseline",
- "avg_tok_s": 58.7,
+ "avg_prompt_tok_s": 302.53,
+ "avg_tok_s": 59.4,
"accept_rate": null,
- "wall_s_total": 31.93
+ "wall_s_total": 31.46,
+ "results": [
+ {
+ "name": "code_python",
+ "wall_s": 3.594,
+ "prompt_n": 30,
+ "prompt_ms": 314.93,
+ "prompt_per_second": 95.26,
+ "predicted_n": 192,
+ "predicted_per_second": 58.8,
+ "draft_n": 0,
+ "draft_n_accepted": 0,
+ "accept_rate": null
+ },
+ {
+ "name": "code_cpp",
+ "wall_s": 3.475,
+ "prompt_n": 40,
+ "prompt_ms": 227.75,
+ "prompt_per_second": 175.63,
+ "predicted_n": 192,
+ "predicted_per_second": 59.52,
+ "draft_n": 0,
+ "draft_n_accepted": 0,
+ "accept_rate": null
+ },
+ {
+ "name": "explain_concept",
+ "wall_s": 3.363,
+ "prompt_n": 27,
+ "prompt_ms": 119.62,
+ "prompt_per_second": 225.72,
+ "predicted_n": 192,
+ "predicted_per_second": 59.6,
+ "draft_n": 0,
+ "draft_n_accepted": 0,
+ "accept_rate": null
+ },
+ {
+ "name": "summarize",
+ "wall_s": 3.429,
+ "prompt_n": 62,
+ "prompt_ms": 175.54,
+ "prompt_per_second": 353.2,
+ "predicted_n": 192,
+ "predicted_per_second": 59.39,
+ "draft_n": 0,
+ "draft_n_accepted": 0,
+ "accept_rate": null
+ },
+ {
+ "name": "qa_factual",
+ "wall_s": 3.41,
+ "prompt_n": 24,
+ "prompt_ms": 108.41,
+ "prompt_per_second": 221.39,
+ "predicted_n": 192,
+ "predicted_per_second": 58.66,
+ "draft_n": 0,
+ "draft_n_accepted": 0,
+ "accept_rate": null
+ },
+ {
+ "name": "translation",
+ "wall_s": 3.347,
+ "prompt_n": 25,
+ "prompt_ms": 112.61,
+ "prompt_per_second": 222.01,
+ "predicted_n": 192,
+ "predicted_per_second": 59.87,
+ "draft_n": 0,
+ "draft_n_accepted": 0,
+ "accept_rate": null
+ },
+ {
+ "name": "creative_short",
+ "wall_s": 3.355,
+ "prompt_n": 21,
+ "prompt_ms": 106.77,
+ "prompt_per_second": 196.69,
+ "predicted_n": 192,
+ "predicted_per_second": 59.62,
+ "draft_n": 0,
+ "draft_n_accepted": 0,
+ "accept_rate": null
+ },
+ {
+ "name": "stepwise_math",
+ "wall_s": 3.399,
+ "prompt_n": 60,
+ "prompt_ms": 167.04,
+ "prompt_per_second": 359.19,
+ "predicted_n": 192,
+ "predicted_per_second": 59.93,
+ "draft_n": 0,
+ "draft_n_accepted": 0,
+ "accept_rate": null
+ },
+ {
+ "name": "long_code_review",
+ "wall_s": 4.093,
+ "prompt_n": 731,
+ "prompt_ms": 836.67,
+ "prompt_per_second": 873.7,
+ "predicted_n": 192,
+ "predicted_per_second": 59.5,
+ "draft_n": 0,
+ "draft_n_accepted": 0,
+ "accept_rate": null
+ }
+ ]
},
{
"model": "Qwen3.6-35B-A3B-UD-Q4_K_XL",
- "toolbox": "vulkan-radv-mtp",
+ "toolbox": "vulkan-radv",
"mode": "mtp-2",
- "avg_tok_s": 72.8,
+ "avg_prompt_tok_s": 255.09,
+ "avg_tok_s": 74.6,
"accept_rate": 0.7907,
- "wall_s_total": 26.85
+ "wall_s_total": 26.13,
+ "results": [
+ {
+ "name": "code_python",
+ "wall_s": 2.855,
+ "prompt_n": 30,
+ "prompt_ms": 373.02,
+ "prompt_per_second": 80.42,
+ "predicted_n": 192,
+ "predicted_per_second": 77.78,
+ "draft_n": 135,
+ "draft_n_accepted": 123,
+ "accept_rate": 0.9111
+ },
+ {
+ "name": "code_cpp",
+ "wall_s": 2.922,
+ "prompt_n": 40,
+ "prompt_ms": 267.27,
+ "prompt_per_second": 149.66,
+ "predicted_n": 192,
+ "predicted_per_second": 72.95,
+ "draft_n": 149,
+ "draft_n_accepted": 116,
+ "accept_rate": 0.7785
+ },
+ {
+ "name": "explain_concept",
+ "wall_s": 2.881,
+ "prompt_n": 27,
+ "prompt_ms": 133.95,
+ "prompt_per_second": 201.57,
+ "predicted_n": 192,
+ "predicted_per_second": 70.44,
+ "draft_n": 155,
+ "draft_n_accepted": 113,
+ "accept_rate": 0.729
+ },
+ {
+ "name": "summarize",
+ "wall_s": 2.546,
+ "prompt_n": 62,
+ "prompt_ms": 202.66,
+ "prompt_per_second": 305.94,
+ "predicted_n": 192,
+ "predicted_per_second": 82.7,
+ "draft_n": 134,
+ "draft_n_accepted": 124,
+ "accept_rate": 0.9254
+ },
+ {
+ "name": "qa_factual",
+ "wall_s": 2.516,
+ "prompt_n": 24,
+ "prompt_ms": 123.14,
+ "prompt_per_second": 194.89,
+ "predicted_n": 192,
+ "predicted_per_second": 80.95,
+ "draft_n": 137,
+ "draft_n_accepted": 122,
+ "accept_rate": 0.8905
+ },
+ {
+ "name": "translation",
+ "wall_s": 3.02,
+ "prompt_n": 25,
+ "prompt_ms": 126.43,
+ "prompt_per_second": 197.73,
+ "predicted_n": 192,
+ "predicted_per_second": 67.01,
+ "draft_n": 165,
+ "draft_n_accepted": 107,
+ "accept_rate": 0.6485
+ },
+ {
+ "name": "creative_short",
+ "wall_s": 2.989,
+ "prompt_n": 21,
+ "prompt_ms": 121.75,
+ "prompt_per_second": 172.48,
+ "predicted_n": 192,
+ "predicted_per_second": 67.62,
+ "draft_n": 166,
+ "draft_n_accepted": 108,
+ "accept_rate": 0.6506
+ },
+ {
+ "name": "stepwise_math",
+ "wall_s": 2.602,
+ "prompt_n": 60,
+ "prompt_ms": 186.44,
+ "prompt_per_second": 321.81,
+ "predicted_n": 192,
+ "predicted_per_second": 80.45,
+ "draft_n": 137,
+ "draft_n_accepted": 122,
+ "accept_rate": 0.8905
+ },
+ {
+ "name": "long_code_review",
+ "wall_s": 3.8,
+ "prompt_n": 731,
+ "prompt_ms": 1088.89,
+ "prompt_per_second": 671.32,
+ "predicted_n": 192,
+ "predicted_per_second": 71.64,
+ "draft_n": 150,
+ "draft_n_accepted": 115,
+ "accept_rate": 0.7667
+ }
+ ]
},
{
"model": "Qwen3.6-35B-A3B-UD-Q4_K_XL",
- "toolbox": "vulkan-radv-mtp",
+ "toolbox": "vulkan-radv",
"mode": "mtp-3",
- "avg_tok_s": 74.6,
+ "avg_prompt_tok_s": 248.42,
+ "avg_tok_s": 75.7,
"accept_rate": 0.7374,
- "wall_s_total": 26.36
+ "wall_s_total": 25.87,
+ "results": [
+ {
+ "name": "code_python",
+ "wall_s": 2.738,
+ "prompt_n": 30,
+ "prompt_ms": 375.67,
+ "prompt_per_second": 79.86,
+ "predicted_n": 192,
+ "predicted_per_second": 81.75,
+ "draft_n": 165,
+ "draft_n_accepted": 136,
+ "accept_rate": 0.8242
+ },
+ {
+ "name": "code_cpp",
+ "wall_s": 2.946,
+ "prompt_n": 40,
+ "prompt_ms": 274.8,
+ "prompt_per_second": 145.56,
+ "predicted_n": 192,
+ "predicted_per_second": 72.5,
+ "draft_n": 185,
+ "draft_n_accepted": 129,
+ "accept_rate": 0.6973
+ },
+ {
+ "name": "explain_concept",
+ "wall_s": 2.814,
+ "prompt_n": 27,
+ "prompt_ms": 137.5,
+ "prompt_per_second": 196.37,
+ "predicted_n": 192,
+ "predicted_per_second": 72.37,
+ "draft_n": 185,
+ "draft_n_accepted": 129,
+ "accept_rate": 0.6973
+ },
+ {
+ "name": "summarize",
+ "wall_s": 2.516,
+ "prompt_n": 62,
+ "prompt_ms": 208.53,
+ "prompt_per_second": 297.32,
+ "predicted_n": 192,
+ "predicted_per_second": 84.04,
+ "draft_n": 159,
+ "draft_n_accepted": 138,
+ "accept_rate": 0.8679
+ },
+ {
+ "name": "qa_factual",
+ "wall_s": 2.536,
+ "prompt_n": 24,
+ "prompt_ms": 129.09,
+ "prompt_per_second": 185.91,
+ "predicted_n": 192,
+ "predicted_per_second": 80.54,
+ "draft_n": 165,
+ "draft_n_accepted": 135,
+ "accept_rate": 0.8182
+ },
+ {
+ "name": "translation",
+ "wall_s": 2.8,
+ "prompt_n": 25,
+ "prompt_ms": 131.97,
+ "prompt_per_second": 189.44,
+ "predicted_n": 192,
+ "predicted_per_second": 72.77,
+ "draft_n": 185,
+ "draft_n_accepted": 129,
+ "accept_rate": 0.6973
+ },
+ {
+ "name": "creative_short",
+ "wall_s": 2.943,
+ "prompt_n": 21,
+ "prompt_ms": 126.24,
+ "prompt_per_second": 166.35,
+ "predicted_n": 192,
+ "predicted_per_second": 68.88,
+ "draft_n": 198,
+ "draft_n_accepted": 125,
+ "accept_rate": 0.6313
+ },
+ {
+ "name": "stepwise_math",
+ "wall_s": 2.553,
+ "prompt_n": 60,
+ "prompt_ms": 192.58,
+ "prompt_per_second": 311.56,
+ "predicted_n": 192,
+ "predicted_per_second": 82.33,
+ "draft_n": 164,
+ "draft_n_accepted": 136,
+ "accept_rate": 0.8293
+ },
+ {
+ "name": "long_code_review",
+ "wall_s": 4.027,
+ "prompt_n": 731,
+ "prompt_ms": 1101.86,
+ "prompt_per_second": 663.43,
+ "predicted_n": 192,
+ "predicted_per_second": 66.37,
+ "draft_n": 197,
+ "draft_n_accepted": 125,
+ "accept_rate": 0.6345
+ }
+ ]
}
-]
+]
\ No newline at end of file
diff --git a/toolboxes/Dockerfile.rocm-7.2.3-mtp b/toolboxes/Dockerfile.rocm-7.2.3-mtp
deleted file mode 100644
index 7f4da4f..0000000
--- a/toolboxes/Dockerfile.rocm-7.2.3-mtp
+++ /dev/null
@@ -1,113 +0,0 @@
-# build stage
-FROM registry.fedoraproject.org/fedora:43 AS builder
-
-# rocm 7.2.3 repo
-RUN <<'EOF'
-tee /etc/yum.repos.d/rocm.repo < /etc/ld.so.conf.d/local.conf \
- && echo "/usr/local/lib64" >> /etc/ld.so.conf.d/local.conf \
- && ldconfig \
- && cp -n /usr/local/lib/libllama*.so* /usr/lib64/ 2>/dev/null || true \
- && ldconfig
-
-# helper
-COPY gguf-vram-estimator.py /usr/local/bin/gguf-vram-estimator.py
-RUN chmod +x /usr/local/bin/gguf-vram-estimator.py
-
-# profile
-RUN printf '%s\n' \
- > /etc/profile.d/rocm.sh && chmod +x /etc/profile.d/rocm.sh \
- && echo 'source /etc/profile.d/rocm.sh' >> /etc/bashrc
-
-# shell
-CMD ["/bin/bash"]
diff --git a/toolboxes/Dockerfile.vulkan-radv-mtp b/toolboxes/Dockerfile.vulkan-radv-mtp
deleted file mode 100644
index 76bbded..0000000
--- a/toolboxes/Dockerfile.vulkan-radv-mtp
+++ /dev/null
@@ -1,68 +0,0 @@
-# build stage
-FROM registry.fedoraproject.org/fedora:43 AS builder
-
-# deps
-RUN dnf -y --nodocs --setopt=install_weak_deps=False install \
- git vim \
- make gcc cmake ninja-build lld clang clang-devel compiler-rt libcurl-devel \
- vulkan-loader-devel vulkaninfo mesa-vulkan-drivers \
- spirv-headers-devel radeontop glslc patch \
- && dnf clean all && rm -rf /var/cache/dnf/*
-
-# llama.cpp (am17an mtp-clean fork — Multi-Token Prediction)
-WORKDIR /opt/llama.cpp
-RUN git clone -b mtp-clean --single-branch https://github.com/am17an/llama.cpp.git .
-
-COPY llama-grammar.patch /tmp/llama-grammar.patch
-
-# build
-RUN git clean -xdf \
- && git submodule update --recursive \
- && patch -p1 < /tmp/llama-grammar.patch \
- && cmake -S . -B build -G Ninja \
- -DGGML_VULKAN=ON \
- -DCMAKE_BUILD_TYPE=Release \
- -DGGML_RPC=ON \
- -DCMAKE_INSTALL_PREFIX=/usr \
- -DLLAMA_BUILD_TESTS=OFF \
- -DLLAMA_BUILD_EXAMPLES=ON \
- -DLLAMA_BUILD_SERVER=ON \
- && cmake --build build --config Release \
- && cmake --install build --config Release
-
-# libs
-RUN find /opt/llama.cpp/build -type f -name 'lib*.so*' -exec cp {} /usr/lib64/ \; \
- && ldconfig
-
-# helper
-COPY gguf-vram-estimator.py /usr/local/bin/gguf-vram-estimator.py
-RUN chmod +x /usr/local/bin/gguf-vram-estimator.py
-
-
-# runtime stage
-FROM registry.fedoraproject.org/fedora-minimal:43
-
-# runtime deps
-RUN microdnf -y --nodocs --setopt=install_weak_deps=0 install \
- bash ca-certificates libatomic libstdc++ libgcc \
- vulkan-loader vulkan-loader-devel vulkaninfo mesa-vulkan-drivers radeontop procps-ng \
- && microdnf clean all && rm -rf /var/cache/dnf/*
-
-# copy
-COPY --from=builder /usr/ /usr/
-COPY --from=builder /usr/local/ /usr/local/
-COPY --from=builder /opt/llama.cpp/build/bin/rpc-* /usr/local/bin/
-
-# ld
-RUN echo "/usr/local/lib" > /etc/ld.so.conf.d/local.conf \
- && echo "/usr/local/lib64" >> /etc/ld.so.conf.d/local.conf \
- && ldconfig \
- && cp -n /usr/local/lib/libllama*.so* /usr/lib64/ 2>/dev/null || true \
- && ldconfig
-
-# helper
-COPY gguf-vram-estimator.py /usr/local/bin/gguf-vram-estimator.py
-RUN chmod +x /usr/local/bin/gguf-vram-estimator.py
-
-# shell
-CMD ["/bin/bash"]