diff --git a/docs/assets/mtp.css b/docs/assets/mtp.css new file mode 100644 index 0000000..516399b --- /dev/null +++ b/docs/assets/mtp.css @@ -0,0 +1,111 @@ +.description { + font-size: 13px; + color: var(--ink); + max-width: 800px; + line-height: 1.5; + margin-top: 12px; +} + +.description a { + color: var(--accent); + text-decoration: none; +} + +.description a:hover { + text-decoration: underline; +} + +.mtp-layout-inner { + max-width: 1050px; + margin-left: auto !important; + margin-right: auto !important; + width: 100%; +} + +.mtp-table { + width: 100%; + min-width: auto; + font-size: 14px; +} + +.mtp-table.hidden { + display: none; +} + +.mtp-table th { + text-align: left; + vertical-align: bottom; + padding: 12px 16px; + font-size: 13px; +} + +.mtp-table td { + padding: 12px 16px; +} + +.mtp-table th.metric-col { + text-align: right; + width: 120px; +} + +.mtp-table td.metric-col { + text-align: right; + font-feature-settings: "tnum"; + font-weight: 600; +} + +.mtp-table td.metric-col .measure { + font-size: 15px; +} + +.mtp-table th.model, .mtp-table td.model { + width: 320px; +} + +.mtp-table th .sub { + font-weight: 400; + font-size: 11px; + color: var(--muted); + text-transform: uppercase; + letter-spacing: 0.05em; +} + +.speedup-badge { + display: inline-flex; + align-items: center; + justify-content: center; + padding: 2px 8px; + border-radius: 6px; + font-size: 12px; + font-weight: 700; +} + +.speedup-high { + background: #d7f5e3; + color: #025333; +} + +.speedup-med { + background: #eef9ff; + color: #0a517a; +} + +.speedup-low { + background: #fdf2f8; + color: #9d174d; +} + +.toolbox-pill { + display: inline-flex; + align-items: center; + padding: 2px 8px; + border-radius: 999px; + font-size: 11px; + background: #f1f5ff; + color: #1d4ed8; +} + +.toolbox-pill.radv { + background: #fdf2f8; + color: #9d174d; +} diff --git a/docs/assets/mtp.js b/docs/assets/mtp.js new file mode 100644 index 0000000..0862651 --- /dev/null +++ b/docs/assets/mtp.js @@ -0,0 
/**
 * mtp.js — renders the Multi-Token Prediction (MTP) benchmark table.
 *
 * Fetches "mtp-summary.json" (an array of run objects), groups the runs by
 * model and toolbox, and emits one table row per group with the baseline,
 * MTP-2 and MTP-3 `avg_tok_s` values plus the speed-up of each MTP mode
 * relative to the baseline.
 */

/** Ratio at or above which a speed-up badge gets the "speedup-high" style. */
const MTP_SPEEDUP_HIGH = 1.8;

/** Ratio at or above which a speed-up badge gets the "speedup-med" style. */
const MTP_SPEEDUP_MED = 1.3;

/** Text shown in a cell when there is no usable value. */
const MTP_EMPTY_PLACEHOLDER = "—";

/** Maps the `mode` field of a run to the slot it fills in a grouped row. */
const MTP_MODE_SLOTS = new Map([
  ["baseline", "baseline"],
  ["mtp-2", "mtp2"],
  ["mtp-3", "mtp3"]
]);

/**
 * Converts a possibly missing label to a string so that sorting and
 * substring checks never throw on incomplete run records.
 *
 * @param {*} value - Raw `model` / `toolbox` value from a run.
 * @returns {string} The value as a string, or "" when null/undefined.
 */
function mtpLabel(value) {
  return value === null || value === undefined ? "" : String(value);
}

/**
 * Loads the benchmark summary and fills the page.
 *
 * Looks up #stats-line, #mtp-table and #mtp-tbody; if any is missing the
 * problem is logged and nothing else happens. On success the table is
 * un-hidden and the stats line shows the number of runs; on any failure
 * (network, HTTP status, malformed payload) the stats line shows an error.
 *
 * @returns {Promise<void>}
 */
async function initMtpPage() {
  const statsLine = document.getElementById("stats-line");
  const table = document.getElementById("mtp-table");
  const tbody = document.getElementById("mtp-tbody");

  // Without these elements there is nowhere to render or report errors.
  if (!statsLine || !table || !tbody) {
    console.error("MTP page is missing #stats-line, #mtp-table or #mtp-tbody");
    return;
  }

  try {
    const res = await fetch("mtp-summary.json");
    if (!res.ok) {
      throw new Error(`Network response was not ok (HTTP ${res.status})`);
    }
    const data = await res.json();
    if (!Array.isArray(data)) {
      throw new Error("mtp-summary.json must contain a JSON array of runs");
    }

    renderTable(data, tbody);
    table.classList.remove("hidden");
    statsLine.textContent = `Showing ${data.length} benchmark runs`;
  } catch (err) {
    console.error("Failed to load mtp-summary.json", err);
    statsLine.textContent = "Failed to load mtp-summary.json. Ensure the file is present in the docs folder.";
  }
}

// Start as soon as the DOM is usable. If the script runs after
// DOMContentLoaded has already fired, the event would never arrive, so
// initialise immediately in that case. The typeof guard keeps the helpers
// loadable in environments without a DOM.
if (typeof document !== "undefined") {
  if (document.readyState === "loading") {
    document.addEventListener("DOMContentLoaded", initMtpPage);
  } else {
    initMtpPage();
  }
}

/**
 * Rebuilds the table body from a flat list of benchmark runs.
 *
 * Runs are grouped by (model, toolbox); within a group the run whose `mode`
 * is "baseline", "mtp-2" or "mtp-3" fills the matching column (a later run
 * with the same mode replaces an earlier one, other modes are ignored).
 * Rows are sorted by model, then toolbox, using localeCompare.
 *
 * @param {Array<Object>} runs - Run records; each may carry `model`,
 *   `toolbox`, `mode` and `avg_tok_s`. Entries that are not objects are
 *   skipped.
 * @param {HTMLElement} tbody - Element whose content is replaced by the rows.
 * @returns {void}
 */
function renderTable(runs, tbody) {
  const grouped = new Map();

  runs.forEach(run => {
    if (run === null || typeof run !== "object") return;

    const model = mtpLabel(run.model);
    const toolbox = mtpLabel(run.toolbox);
    // A JSON-encoded pair cannot collide the way "model|toolbox" can when
    // either part itself contains the separator.
    const key = JSON.stringify([model, toolbox]);

    let entry = grouped.get(key);
    if (!entry) {
      entry = { model, toolbox, baseline: null, mtp2: null, mtp3: null };
      grouped.set(key, entry);
    }

    const slot = MTP_MODE_SLOTS.get(run.mode);
    if (slot) entry[slot] = run;
  });

  const rows = Array.from(grouped.values()).sort((a, b) => {
    if (a.model !== b.model) return a.model.localeCompare(b.model);
    return a.toolbox.localeCompare(b.toolbox);
  });

  // Drop any previously rendered rows.
  tbody.textContent = "";

  rows.forEach(row => {
    const tr = document.createElement("tr");

    // Model
    const tdModel = document.createElement("td");
    tdModel.className = "model";
    const modelHead = document.createElement("div");
    modelHead.className = "model-head";
    const nameSpan = document.createElement("span");
    nameSpan.className = "model-name";
    nameSpan.textContent = row.model;
    modelHead.appendChild(nameSpan);
    tdModel.appendChild(modelHead);
    tr.appendChild(tdModel);

    // Toolbox (Vulkan/RADV toolboxes get the alternate pill colour)
    const tdToolbox = document.createElement("td");
    const tbPill = document.createElement("span");
    tbPill.className = "toolbox-pill";
    if (row.toolbox.includes("vulkan") || row.toolbox.includes("radv")) {
      tbPill.classList.add("radv");
    }
    tbPill.textContent = row.toolbox;
    tdToolbox.appendChild(tbPill);
    tr.appendChild(tdToolbox);

    // Baseline
    const baseSpeed = row.baseline ? row.baseline.avg_tok_s : null;
    tr.appendChild(makeMetricCell(baseSpeed));

    // MTP-2
    const mtp2Speed = row.mtp2 ? row.mtp2.avg_tok_s : null;
    tr.appendChild(makeMetricCell(mtp2Speed));
    tr.appendChild(makeSpeedupCell(baseSpeed, mtp2Speed));

    // MTP-3
    const mtp3Speed = row.mtp3 ? row.mtp3.avg_tok_s : null;
    tr.appendChild(makeMetricCell(mtp3Speed));
    tr.appendChild(makeSpeedupCell(baseSpeed, mtp3Speed));

    tbody.appendChild(tr);
  });
}

/**
 * Builds a right-aligned metric cell.
 *
 * @param {*} val - Value to show. Finite numbers are formatted with one
 *   decimal; anything else (null, undefined, NaN, strings, ...) renders the
 *   "—" placeholder instead of throwing or printing "NaN".
 * @returns {HTMLTableCellElement} A <td class="metric-col">.
 */
function makeMetricCell(val) {
  const td = document.createElement("td");
  td.className = "metric-col";

  if (Number.isFinite(val)) {
    // Wrapped in span.measure so the stylesheet rule
    // ".mtp-table td.metric-col .measure" applies to the number.
    const measure = document.createElement("span");
    measure.className = "measure";
    measure.textContent = val.toFixed(1);
    td.appendChild(measure);
  } else {
    td.textContent = MTP_EMPTY_PLACEHOLDER;
  }
  return td;
}

/**
 * Builds a cell showing how much faster an MTP run is than the baseline.
 *
 * @param {*} base - Baseline value; must be a finite number > 0 for a ratio
 *   to be shown.
 * @param {*} mtp - MTP value; must be a finite number >= 0. Zero is a real
 *   measurement and is shown as "0.00×" rather than as missing data.
 * @returns {HTMLTableCellElement} A <td class="metric-col"> holding either a
 *   colour-coded badge ("speedup-high" >= 1.8, "speedup-med" >= 1.3,
 *   otherwise "speedup-low") or the "—" placeholder.
 */
function makeSpeedupCell(base, mtp) {
  const td = document.createElement("td");
  td.className = "metric-col";

  const comparable =
    Number.isFinite(base) && Number.isFinite(mtp) && base > 0 && mtp >= 0;

  if (!comparable) {
    td.textContent = MTP_EMPTY_PLACEHOLDER;
    return td;
  }

  const ratio = mtp / base;
  const badge = document.createElement("span");
  badge.className = "speedup-badge";
  badge.textContent = `${ratio.toFixed(2)}×`;

  if (ratio >= MTP_SPEEDUP_HIGH) {
    badge.classList.add("speedup-high");
  } else if (ratio >= MTP_SPEEDUP_MED) {
    badge.classList.add("speedup-med");
  } else {
    badge.classList.add("speedup-low");
  }

  td.appendChild(badge);
  return td;
}
"rocm-7.2.3-mtp", + "mode": "baseline", + "avg_tok_s": 6.5, + "accept_rate": null, + "wall_s_total": 273.39 + }, + { + "model": "Qwen3.6-27B-UD-Q8_K_XL", + "toolbox": "rocm-7.2.3-mtp", + "mode": "mtp-2", + "avg_tok_s": 12.4, + "accept_rate": 0.7971, + "wall_s_total": 147.31 + }, + { + "model": "Qwen3.6-27B-UD-Q8_K_XL", + "toolbox": "rocm-7.2.3-mtp", + "mode": "mtp-3", + "avg_tok_s": 13.5, + "accept_rate": 0.744, + "wall_s_total": 135.2 + }, + { + "model": "Qwen3.6-27B-UD-Q8_K_XL", + "toolbox": "vulkan-radv-mtp", + "mode": "baseline", + "avg_tok_s": 6.3, + "accept_rate": null, + "wall_s_total": 283.86 + }, + { + "model": "Qwen3.6-27B-UD-Q8_K_XL", + "toolbox": "vulkan-radv-mtp", + "mode": "mtp-2", + "avg_tok_s": 11.7, + "accept_rate": 0.8024, + "wall_s_total": 159.41 + }, + { + "model": "Qwen3.6-27B-UD-Q8_K_XL", + "toolbox": "vulkan-radv-mtp", + "mode": "mtp-3", + "avg_tok_s": 13.3, + "accept_rate": 0.7301, + "wall_s_total": 141.74 + }, + { + "model": "Qwen3.6-35B-A3B-UD-Q4_K_XL", + "toolbox": "rocm-7.2.3-mtp", + "mode": "baseline", + "avg_tok_s": 48.7, + "accept_rate": null, + "wall_s_total": 37.55 + }, + { + "model": "Qwen3.6-35B-A3B-UD-Q4_K_XL", + "toolbox": "rocm-7.2.3-mtp", + "mode": "mtp-2", + "avg_tok_s": 64.5, + "accept_rate": 0.7958, + "wall_s_total": 29.33 + }, + { + "model": "Qwen3.6-35B-A3B-UD-Q4_K_XL", + "toolbox": "rocm-7.2.3-mtp", + "mode": "mtp-3", + "avg_tok_s": 68.3, + "accept_rate": 0.7386, + "wall_s_total": 27.83 + }, + { + "model": "Qwen3.6-35B-A3B-UD-Q4_K_XL", + "toolbox": "vulkan-radv-mtp", + "mode": "baseline", + "avg_tok_s": 58.7, + "accept_rate": null, + "wall_s_total": 31.93 + }, + { + "model": "Qwen3.6-35B-A3B-UD-Q4_K_XL", + "toolbox": "vulkan-radv-mtp", + "mode": "mtp-2", + "avg_tok_s": 72.8, + "accept_rate": 0.7907, + "wall_s_total": 26.85 + }, + { + "model": "Qwen3.6-35B-A3B-UD-Q4_K_XL", + "toolbox": "vulkan-radv-mtp", + "mode": "mtp-3", + "avg_tok_s": 74.6, + "accept_rate": 0.7374, + "wall_s_total": 26.36 + } +] diff --git 
a/docs/mtp.html b/docs/mtp.html new file mode 100644 index 0000000..9bc47bb --- /dev/null +++ b/docs/mtp.html @@ -0,0 +1,64 @@ + + + +
+ + +Framework Desktop · AMD Ryzen AI MAX 395+ · 128GB unified RAM
++ Multi-Token Prediction (MTP) is an experimental speculative decoding feature for `llama.cpp` + (see PR #22673). + It allows supported models to predict multiple tokens per forward pass, significantly increasing generation speed. + These benchmarks compare the baseline generation speed against MTP with 2-token and 3-token drafts. +
+