From c129a04a1c22ea4c891a25af149a95e78758b941 Mon Sep 17 00:00:00 2001
From: Donato Capitella <donato.capitella@reversec.com>
Date: Fri, 10 Apr 2026 11:23:06 +0100
Subject: [PATCH] refactor: remove hblt0 benchmark support and associated
 comparison scripts

---
 benchmark/compare_hblt0.py         | 189 -----------------------------
 benchmark/generate_results_json.py |   5 +-
 benchmark/run_benchmarks.sh        |  98 ++++++---------
 benchmark/run_rpc_benchmarks.sh    |  42 +------
 docs/assets/index2.css             |   5 +-
 docs/assets/index2.js              |  11 +-
 docs/index.html                    |  24 +---
 7 files changed, 47 insertions(+), 327 deletions(-)
 delete mode 100644 benchmark/compare_hblt0.py

diff --git a/benchmark/compare_hblt0.py b/benchmark/compare_hblt0.py
deleted file mode 100644
index 68bfd3e..0000000
--- a/benchmark/compare_hblt0.py
+++ /dev/null
@@ -1,189 +0,0 @@
-#!/usr/bin/env python3
-"""
-Compare hipBLASLt-on vs hblt0-off benchmark runs.
-
-This script inspects docs/results.json (the same dataset consumed by
-docs/assets/index2.js) and reports, for every backend that was benchmarked
-both with and without the `-hblt0` suffix, which configuration wins.
-"""
-
-from __future__ import annotations
-
-import argparse
-import json
-import math
-import statistics
-from collections import defaultdict
-from pathlib import Path
-from typing import Dict, Iterable, List, Tuple
-
-
-DEFAULT_RESULTS = Path("../docs") / "results.json"
-# Matches the tolerance used in docs/assets/index2.js (MIN_TOL = 0.25)
-DEFAULT_TOLERANCE = 0.25
-
-VariantValues = Dict[str, List[float]]
-BackendMatrix = Dict[Tuple, VariantValues]
-
-
-def parse_args() -> argparse.Namespace:
-    parser = argparse.ArgumentParser(
-        description=(
-            "Pair benchmark runs with and without '-hblt0' and report which "
-            "configuration is faster per backend."
-        )
-    )
-    parser.add_argument(
-        "--results",
-        type=Path,
-        default=DEFAULT_RESULTS,
-        help="Path to results.json generated by the benchmark pipeline.",
-    )
-    parser.add_argument(
-        "--tolerance",
-        type=float,
-        default=DEFAULT_TOLERANCE,
-        help="Minimum tokens/sec delta to treat as a win (default: 0.25).",
-    )
-    return parser.parse_args()
-
-
-def load_runs(path: Path) -> Iterable[dict]:
-    data = json.loads(path.read_text())
-    runs = data.get("runs")
-    if not isinstance(runs, list):
-        raise ValueError(f"results.json at {path} does not contain a 'runs' array")
-    return runs
-
-
-def measurement_key(run: dict) -> Tuple:
-    """Return a tuple that uniquely identifies a benchmark scenario."""
-    return (
-        (run.get("model_clean") or run.get("model") or "").lower(),
-        run.get("test") or "",
-        run.get("context") or "default",
-        run.get("context_tokens") or 0,
-        (run.get("quant") or "").upper(),
-        run.get("fa"),
-        run.get("rpc"),
-        run.get("ngl"),
-        run.get("backend"),
-    )
-
-
-def pair_runs(runs: Iterable[dict]) -> Tuple[Dict[str, BackendMatrix], Dict[str, Dict[str, int]]]:
-    """
-    Group runs by backend (without / with '-hblt0') and measurement key.
-
-    Returns:
-        pairs: backend -> measurement_key -> {'hipblaslt': [...], 'hblt0': [...]}
-        coverage: backend -> {'hipblaslt': raw_run_count, 'hblt0': raw_run_count}
-    """
-    pairs: Dict[str, BackendMatrix] = defaultdict(lambda: defaultdict(dict))
-    coverage: Dict[str, Dict[str, int]] = defaultdict(lambda: {"hipblaslt": 0, "hblt0": 0})
-
-    for run in runs:
-        env = run.get("env")
-        if not env:
-            continue
-        if run.get("error"):
-            continue
-        tps = run.get("tps_mean")
-        if not isinstance(tps, (int, float)) or math.isnan(tps):
-            continue
-
-        is_hblt0 = env.endswith("-hblt0")
-        base_env = env[:-6] if is_hblt0 else env
-        variant = "hblt0" if is_hblt0 else "hipblaslt"
-
-        key = measurement_key(run)
-        entry = pairs[base_env][key]
-        entry.setdefault(variant, []).append(float(tps))
-        coverage[base_env][variant] += 1
-
-    return pairs, coverage
-
-
-def summarize_backend(
-    backend: str,
-    matrix: BackendMatrix,
-    tolerance: float,
-    coverage: Dict[str, int],
-) -> dict | None:
-    pairs: List[Tuple[float, float]] = []
-
-    for entry in matrix.values():
-        if "hipblaslt" not in entry or "hblt0" not in entry:
-            continue
-        hip = statistics.mean(entry["hipblaslt"])
-        hbl = statistics.mean(entry["hblt0"])
-        pairs.append((hip, hbl))
-
-    if not pairs:
-        return None
-
-    hip_wins = sum(1 for hip, hbl in pairs if (hip - hbl) > tolerance)
-    hbl_wins = sum(1 for hip, hbl in pairs if (hbl - hip) > tolerance)
-    ties = len(pairs) - hip_wins - hbl_wins
-
-    avg_hip = statistics.mean(hip for hip, _ in pairs)
-    avg_hbl = statistics.mean(hbl for _, hbl in pairs)
-    avg_delta = avg_hip - avg_hbl
-    pct_delta = (avg_delta / avg_hbl * 100.0) if avg_hbl else float("inf")
-
-    if avg_delta > tolerance:
-        verdict = "hipBLASLt faster"
-    elif avg_delta < -tolerance:
-        verdict = "hblt0 faster"
-    else:
-        verdict = "too close to call"
-
-    return {
-        "backend": backend,
-        "pairs": len(pairs),
-        "hip_wins": hip_wins,
-        "hbl_wins": hbl_wins,
-        "ties": ties,
-        "avg_hip": avg_hip,
-        "avg_hbl": avg_hbl,
-        "avg_delta": avg_delta,
-        "pct_delta": pct_delta,
-        "verdict": verdict,
-        "coverage": coverage,
-    }
-
-
-def format_summary(summary: dict) -> str:
-    cov = summary["coverage"]
-    hip_runs = cov.get("hipblaslt", 0)
-    hbl_runs = cov.get("hblt0", 0)
-    return (
-        f"{summary['backend']}: {summary['verdict']} "
-        f"(Δ {summary['avg_delta']:+.2f} tps / {summary['pct_delta']:+.2f}% "
-        f"across {summary['pairs']} matched cases; "
-        f"hipBLASLt wins {summary['hip_wins']}, hblt0 wins {summary['hbl_wins']}, "
-        f"ties {summary['ties']}; raw runs hipBLASLt={hip_runs}, hblt0={hbl_runs})"
-    )
-
-
-def main() -> None:
-    args = parse_args()
-    runs = load_runs(args.results)
-    matrices, coverage = pair_runs(runs)
-
-    summaries = []
-    for backend in sorted(matrices):
-        summary = summarize_backend(backend, matrices[backend], args.tolerance, coverage.get(backend, {}))
-        if summary:
-            summaries.append(summary)
-
-    if not summaries:
-        print("No matching hipBLASLt vs hblt0 pairs were found.")
-        return
-
-    for summary in summaries:
-        print(format_summary(summary))
-
-
-if __name__ == "__main__":
-    main()
diff --git a/benchmark/generate_results_json.py b/benchmark/generate_results_json.py
index 7ff7954..f7497ec 100644
--- a/benchmark/generate_results_json.py
+++ b/benchmark/generate_results_json.py
@@ -65,7 +65,7 @@ def canonicalize_env(env):
 
 def parse_env_flags(basename):
     """
-    pattern: <model>__<env>[__fa1][__hblt0][__longctx32768][__rpc]
+    pattern: <model>__<env>[__fa1][__longctx32768][__rpc]
     Returns (env, fa, context_tag, context_tokens, rpc_flag)
     """
     parts = basename.split("__")
@@ -82,8 +82,7 @@ def parse_env_flags(basename):
         suffix = raw_suffix.lower()
         if suffix == "fa1":
             fa = True
-        elif suffix == "hblt0":
-            env = f"{env}-hblt0"
+
         elif suffix.startswith("longctx"):
             context_tag = suffix
             m = LONGCTX_RE.search(suffix)
diff --git a/benchmark/run_benchmarks.sh b/benchmark/run_benchmarks.sh
index 71ede28..b7eb91a 100755
--- a/benchmark/run_benchmarks.sh
+++ b/benchmark/run_benchmarks.sh
@@ -68,80 +68,56 @@ declare -A CMDS=(
   [vulkan_radv]="toolbox run -c llama-vulkan-radv -- /usr/sbin/llama-bench"
 )
 
-get_hblt_modes() {
-  local env="$1"
-  if [[ "$env" == rocm* ]]; then
-    printf '%s\n' default off
-  else
-    printf '%s\n' default
-  fi
-}
-
 for MODEL_PATH in "${MODEL_PATHS[@]}"; do
   MODEL_NAME="$(basename "$MODEL_PATH" .gguf)"
 
   for ENV in "${!CMDS[@]}"; do
-    CMD="${CMDS[$ENV]}"
-    mapfile -t HBLT_MODES < <(get_hblt_modes "$ENV")
+    CMD_EFFECTIVE="${CMDS[$ENV]}"
 
-    for MODE in "${HBLT_MODES[@]}"; do
-      BASE_SUFFIX=""
-      CMD_EFFECTIVE="$CMD"
-
-      if [[ "$ENV" == rocm* ]]; then
-        if [[ "$MODE" == off ]]; then
-          BASE_SUFFIX="__hblt0"
-          CMD_EFFECTIVE="${CMD_EFFECTIVE/-- /-- env ROCBLAS_USE_HIPBLASLT=0 }"
-        else
-          CMD_EFFECTIVE="${CMD_EFFECTIVE/-- /-- env ROCBLAS_USE_HIPBLASLT=1 }"
-        fi
+    # flash-attention runs only (FA=1); add 0 to the list to also run the non-FA baseline
+    for FA in 1; do
+      SUFFIX=""
+      EXTRA_ARGS=()
+      if (( FA == 1 )); then
+        SUFFIX="__fa1"
+        EXTRA_ARGS=( -fa 1 )
       fi
 
-      # run twice: baseline and with flash attention
-      for FA in 1; do
-        SUFFIX="$BASE_SUFFIX"
-        EXTRA_ARGS=()
-        if (( FA == 1 )); then
-          SUFFIX="${SUFFIX}__fa1"
-          EXTRA_ARGS=( -fa 1 )
+      for CTX in default longctx32768; do
+        CTX_SUFFIX=""
+        CTX_ARGS=()
+        if [[ "$CTX" == longctx32768 ]]; then
+          CTX_SUFFIX="__longctx32768"
+          CTX_ARGS=( -p 2048 -n 32 -d 32768 )
+          if [[ "$ENV" == *vulkan* ]]; then
+            CTX_ARGS+=( -ub 512 )
+          else
+            CTX_ARGS+=( -ub 2048 )
+          fi
         fi
 
-        for CTX in default longctx32768; do
-          CTX_SUFFIX=""
-          CTX_ARGS=()
-          if [[ "$CTX" == longctx32768 ]]; then
-            CTX_SUFFIX="__longctx32768"
-            CTX_ARGS=( -p 2048 -n 32 -d 32768 )
-            if [[ "$ENV" == *vulkan* ]]; then
-              CTX_ARGS+=( -ub 512 )
-            else
-              CTX_ARGS+=( -ub 2048 )
-            fi
-          fi
+        OUT="$RESULTDIR/${MODEL_NAME}__${ENV}${SUFFIX}${CTX_SUFFIX}.log"
+        CTX_REPS=5
+        if [[ "$CTX" == longctx32768 ]]; then
+          CTX_REPS=3
+        fi
 
-          OUT="$RESULTDIR/${MODEL_NAME}__${ENV}${SUFFIX}${CTX_SUFFIX}.log"
-          CTX_REPS=5
-          if [[ "$CTX" == longctx32768 ]]; then
-            CTX_REPS=3
-          fi
+        if [[ -s "$OUT" ]]; then
+          echo "⏩ Skipping [${ENV}] ${MODEL_NAME}${SUFFIX}${CTX_SUFFIX:+ ($CTX_SUFFIX)}, log already exists at $OUT"
+          continue
+        fi
 
-          if [[ -s "$OUT" ]]; then
-            echo "⏩ Skipping [${ENV}] ${MODEL_NAME}${SUFFIX}${CTX_SUFFIX:+ ($CTX_SUFFIX)}, log already exists at $OUT"
-            continue
-          fi
+        FULL_CMD=( $CMD_EFFECTIVE -ngl 99 -mmp 0 -m "$MODEL_PATH" "${EXTRA_ARGS[@]}" "${CTX_ARGS[@]}" -r "$CTX_REPS" )
 
-          FULL_CMD=( $CMD_EFFECTIVE -ngl 99 -mmp 0 -m "$MODEL_PATH" "${EXTRA_ARGS[@]}" "${CTX_ARGS[@]}" -r "$CTX_REPS" )
+        printf "\n▶ [%s] %s%s%s\n" "$ENV" "$MODEL_NAME" "${SUFFIX:+ $SUFFIX}" "${CTX_SUFFIX:+ $CTX_SUFFIX}"
+        printf "  → log: %s\n" "$OUT"
+        printf "  → cmd: %s\n\n" "${FULL_CMD[*]}"
 
-          printf "\n▶ [%s] %s%s%s\n" "$ENV" "$MODEL_NAME" "${SUFFIX:+ $SUFFIX}" "${CTX_SUFFIX:+ $CTX_SUFFIX}"
-          printf "  → log: %s\n" "$OUT"
-          printf "  → cmd: %s\n\n" "${FULL_CMD[*]}"
-
-          if ! "${FULL_CMD[@]}" >"$OUT" 2>&1; then
-            status=$?
-            echo "✖ ! [${ENV}] ${MODEL_NAME}${SUFFIX}${CTX_SUFFIX:+ $CTX_SUFFIX} failed (exit ${status})" >>"$OUT"
-            echo "  * [${ENV}] ${MODEL_NAME}${SUFFIX}${CTX_SUFFIX:+ $CTX_SUFFIX} : FAILED"
-          fi
-        done
+        "${FULL_CMD[@]}" >"$OUT" 2>&1 || {
+          status=$?  # read in the || branch; after `if ! cmd`, $? is the negated status (always 0)
+          echo "✖ ! [${ENV}] ${MODEL_NAME}${SUFFIX}${CTX_SUFFIX:+ $CTX_SUFFIX} failed (exit ${status})" >>"$OUT"
+          echo "  * [${ENV}] ${MODEL_NAME}${SUFFIX}${CTX_SUFFIX:+ $CTX_SUFFIX} : FAILED"
+        }
       done
     done
   done
diff --git a/benchmark/run_rpc_benchmarks.sh b/benchmark/run_rpc_benchmarks.sh
index 4a37f99..4db3a89 100755
--- a/benchmark/run_rpc_benchmarks.sh
+++ b/benchmark/run_rpc_benchmarks.sh
@@ -84,14 +84,6 @@ resolve_model_path() {
   return 1
 }
 
-get_hblt_modes() {
-  local env="$1"
-  if [[ "$env" == rocm* ]]; then
-    printf '%s\n' default off
-  else
-    printf '%s\n' default
-  fi
-}
 
 ensure_models_exist() {
   RESOLVED_MODELS=()
@@ -141,23 +133,13 @@ has_pending_runs() {
 start_remote_rpc() {
   local env="$1"
   local image="$2"
-  local mode="$3"
-  local suffix="$4"
+  local suffix="$3"
   local remote_log="/tmp/rpc-server-${env}${suffix}.log"
-  local env_prefix=""
-
-  if [[ "$env" == rocm* ]]; then
-    if [[ "$mode" == off ]]; then
-      env_prefix="env ROCBLAS_USE_HIPBLASLT=0 "
-    else
-      env_prefix="env ROCBLAS_USE_HIPBLASLT=1 "
-    fi
-  fi
 
   ssh -p "$REMOTE_PORT" "$REMOTE_TARGET" 'bash -s' <<EOF
 set -euo pipefail
 pkill -9 -f rpc-server || true
-nohup toolbox run -c ${image} -- ${env_prefix}rpc-server -H 0.0.0.0 -p ${RPC_PORT} -c >${remote_log} 2>&1 < /dev/null &
+nohup toolbox run -c ${image} -- rpc-server -H 0.0.0.0 -p ${RPC_PORT} -c >${remote_log} 2>&1 < /dev/null &
 echo \$!
 EOF
 }
@@ -201,7 +183,6 @@ run_llama_bench_rpc() {
   local model_path="$1"
   local env="$2"
   local suffix="$3"
-  local mode="$4"
   local model_name
   model_name="$(basename "${model_path}" .gguf)"
   local client_cmd="${CLIENT_CMDS[$env]:-}"
@@ -216,14 +197,6 @@ run_llama_bench_rpc() {
     return
   fi
 
-  if [[ "$env" == rocm* ]]; then
-    if [[ "$mode" == off ]]; then
-      client_cmd="${client_cmd/-- /-- env ROCBLAS_USE_HIPBLASLT=0 }"
-    else
-      client_cmd="${client_cmd/-- /-- env ROCBLAS_USE_HIPBLASLT=1 }"
-    fi
-  fi
-
   local -a client_cmd_ary
   # shellcheck disable=SC2206 # intentional word splitting
   client_cmd_ary=( $client_cmd )
@@ -284,13 +257,7 @@ run_all() {
       continue
     fi
 
-    mapfile -t hblt_modes < <(get_hblt_modes "$env")
-
-    for mode in "${hblt_modes[@]}"; do
       local suffix=""
-      if [[ "$mode" == off ]]; then
-        suffix="__hblt0"
-      fi
 
       echo
       echo "==== ${env}${suffix} -> ${image} ===="
@@ -302,7 +269,7 @@ run_all() {
 
       CURRENT_REMOTE_ENV="${env}${suffix}"
       local remote_pid
-      remote_pid="$(start_remote_rpc "$env" "$image" "$mode" "$suffix" | tr -d '\r')"
+      remote_pid="$(start_remote_rpc "$env" "$image" "$suffix" | tr -d '\r')"
 
       if [[ -z "$remote_pid" ]]; then
         echo "[ERROR] Failed to start RPC server for ${env}${suffix}"
@@ -322,13 +289,12 @@ run_all() {
       fi
 
       for model in "${RESOLVED_MODELS[@]}"; do
-        run_llama_bench_rpc "$model" "$env" "$suffix" "$mode"
+        run_llama_bench_rpc "$model" "$env" "$suffix"
       done
 
       stop_remote_rpc "$env" "$remote_pid" || true
       CURRENT_REMOTE_PID=""
       CURRENT_REMOTE_ENV=""
-    done
   done
 }
 
diff --git a/docs/assets/index2.css b/docs/assets/index2.css
index 0f1a98f..7f2a9af 100644
--- a/docs/assets/index2.css
+++ b/docs/assets/index2.css
@@ -165,10 +165,7 @@ select {
     transform: translateY(-2px);
 }
 
-.backend-item .tag.tag-hblt0 {
-    background: #e9edff;
-    color: #1d3ea5;
-}
+
 
 .backend-item .tag.tag-rocwmma {
     background: #eef9ff;
diff --git a/docs/assets/index2.js b/docs/assets/index2.js
index d1a75fa..d910fd8 100644
--- a/docs/assets/index2.js
+++ b/docs/assets/index2.js
@@ -55,9 +55,7 @@ function cacheUI() {
         stats: document.getElementById("stats-line"),
         resetBtn: document.getElementById("reset-layout"),
         tables: document.getElementById("tables"),
-        hipblasModalOpen: document.getElementById("hipblas-modal-open"),
-        hipblasModal: document.getElementById("hipblas-modal"),
-        hipblasModalClose: document.getElementById("hipblas-modal-close"),
+
         rpcModalOpen: document.getElementById("rpc-modal-open"),
         rpcModal: document.getElementById("rpc-modal"),
         rpcModalClose: document.getElementById("rpc-modal-close"),
@@ -72,11 +70,6 @@ function cacheUI() {
 
 function setupModals() {
     const modalConfigs = [
-        {
-            open: state.ui.hipblasModalOpen,
-            modal: state.ui.hipblasModal,
-            close: state.ui.hipblasModalClose,
-        },
         {
             open: state.ui.rpcModalOpen,
             modal: state.ui.rpcModal,
@@ -668,7 +661,7 @@ function backendValue(entry, direction) {
 
 function splitEnvName(env) {
     const canonical = env.replace(/_/g, ".");
-    const tagRegex = /-(rocwmma-improved|rocwmma|improved|hblt0)/gi;
+    const tagRegex = /-(rocwmma-improved|rocwmma|improved)/gi;
     const tags = [];
     let match;
     while ((match = tagRegex.exec(canonical)) !== null) {
diff --git a/docs/index.html b/docs/index.html
index 27145f1..ac98265 100644
--- a/docs/index.html
+++ b/docs/index.html
@@ -19,9 +19,7 @@
         <div class="legend">
             <label>Legend</label>
             <div class="legend-pills">
-                <button id="hipblas-modal-open" type="button" class="chip small legend-pill legend-pill-default">
-                    hipBLASLt vs hblt0
-                </button>
+
                 <button id="rpc-modal-open" type="button" class="chip small legend-pill legend-pill-rpc">
                     RPC · dual server
                 </button>
@@ -83,26 +81,6 @@
         <div id="tables"></div>
     </section>
 
-    <div id="hipblas-modal" class="modal hidden" role="dialog" aria-modal="true" aria-labelledby="hipblas-title">
-        <div class="modal-content">
-            <button id="hipblas-modal-close" class="modal-close" aria-label="Close dialog">×</button>
-            <h2 id="hipblas-title">hipBLASLt &amp; hblt0 explained</h2>
-            <p>The ROCm toolboxes ship with <code>ROCBLAS_USE_HIPBLASLT=1</code> by default. This forces rocBLAS to
-                prefer
-                the hipBLASLt kernel library, which historically delivered the best throughput on gfx1151 (Strix Halo).
-            </p>
-            <p>Rows tagged with <code>__hblt0</code> were re-run with <code>ROCBLAS_USE_HIPBLASLT=0</code>, letting
-                rocBLAS
-                auto-select between hipBLASLt, Tensile, or other kernel providers. These runs show how performance
-                shifts when
-                the tuned hipBLASLt path is disabled.</p>
-            <p>hipBLASLt is AMD's LT (low-level tuned) matmul backend, optimized for transformer workloads. Disabling it
-                can
-                expose regressions or improvements depending on driver versions, so both configurations are published
-                for
-                comparison.</p>
-        </div>
-    </div>
 
     <div id="rpc-modal" class="modal hidden" role="dialog" aria-modal="true" aria-labelledby="rpc-title">
         <div class="modal-content">