From 2c2c36d3dac58df39d0e345bc8f2c61bb820d70c Mon Sep 17 00:00:00 2001
From: Donato Capitella <donato.capitella@reversec.com>
Date: Wed, 15 Apr 2026 09:23:58 +0100
Subject: [PATCH] add rocm-7.2.1-pr21344 toolbox (gfx1151 MMQ/MMVQ tile + nwarp
 tuning)

Adds a new toolbox variant based on PR #21344 (pedapudi/llama.cpp@gfx1151-opt)
which tunes MMQ tile sizes (x_max=48, y=64) and warp counts (nwarps=4) for
RDNA3_5 gfx1151, yielding up to +100% prefill throughput at small batch sizes.

Also adds BMI2/FMA/F16C CPU SIMD flags and GGML_CUDA_FA_ALL_QUANTS=ON to match
the benchmark build used in the PR. Wire up CI (build matrix + prune), the
refresh script, and run_benchmarks.sh so results land alongside rocm-7.2.1.
---
 .github/workflows/build_and_publish.yml   |   2 +-
 .github/workflows/prune-old-toolboxes.yml |   2 +-
 AGENTS.md                                 |  26 +++++
 benchmark/run_benchmarks.sh               |   1 +
 refresh-toolboxes.sh                      |   1 +
 toolboxes/Dockerfile.rocm-7.2.1-pr21344   | 124 ++++++++++++++++++++++
 6 files changed, 154 insertions(+), 2 deletions(-)
 create mode 100644 AGENTS.md
 create mode 100644 toolboxes/Dockerfile.rocm-7.2.1-pr21344

diff --git a/.github/workflows/build_and_publish.yml b/.github/workflows/build_and_publish.yml
index e7b2cfd..1ded5e2 100644
--- a/.github/workflows/build_and_publish.yml
+++ b/.github/workflows/build_and_publish.yml
@@ -28,7 +28,7 @@ jobs:
           IN='${{ inputs.backends }}'
 
           if [[ "$IN" == "all" || -z "$IN" ]]; then
-            JSON='["rocm-6.4.4","rocm-7.2.1","rocm7-nightlies","vulkan-amdvlk","vulkan-radv"]'
+            JSON='["rocm-6.4.4","rocm-7.2.1","rocm-7.2.1-pr21344","rocm7-nightlies","vulkan-amdvlk","vulkan-radv"]'
           else
             # Remove spaces and build JSON array from comma list
             IN_CLEAN=$(echo "$IN" | tr -d '[:space:]')
diff --git a/.github/workflows/prune-old-toolboxes.yml b/.github/workflows/prune-old-toolboxes.yml
index cc52d6c..22bb3b4 100644
--- a/.github/workflows/prune-old-toolboxes.yml
+++ b/.github/workflows/prune-old-toolboxes.yml
@@ -44,7 +44,7 @@ jobs:
         run: |
           IN='${{ github.event.inputs.backends }}'
           if [[ "$IN" == "all" || -z "$IN" ]]; then
-            JSON='["rocm-6.4.2","rocm-6.4.3","rocm-6.4.4","rocm-7.1.1","rocm-7.2","rocm-7.2.1","rocm-7beta","rocm7-nightlies","vulkan-amdvlk","vulkan-radv"]'
+            JSON='["rocm-6.4.2","rocm-6.4.3","rocm-6.4.4","rocm-7.1.1","rocm-7.2","rocm-7.2.1","rocm-7.2.1-pr21344","rocm-7beta","rocm7-nightlies","vulkan-amdvlk","vulkan-radv"]'
           else
             IN_CLEAN=$(echo "$IN" | tr -d '[:space:]')
             JSON='["'${IN_CLEAN//,/\",\"}'"]'
diff --git a/AGENTS.md b/AGENTS.md
new file mode 100644
index 0000000..89045a3
--- /dev/null
+++ b/AGENTS.md
@@ -0,0 +1,26 @@
+# AI Agent Context: AMD Strix Halo Llama.cpp Toolboxes
+
+**Primary Goal:** This project provides pre-built containers ("toolboxes") for running `llama.cpp` optimally on AMD Ryzen AI Max "Strix Halo" APUs. It simplifies environment setup for ROCm and Vulkan backends, allowing users to leverage up to 124 GiB of unified system memory for LLM inference.
+
+## Core Technologies
+*   **Containerization**: [Toolbx](https://containertoolbx.org/) (Fedora) or Distrobox (Ubuntu). Underneath, Docker/Podman is used to build base images.
+*   **Inference Engine**: [Llama.cpp](https://github.com/ggerganov/llama.cpp)
+*   **Hardware / Drivers**: AMD "Strix Halo" APUs (Gfx1151). Implementations depend on ROCm (v6.4.4, v7.x) and Vulkan (Mesa RADV, AMDVLK).
+
+## Repository Structure Overview
+*   `/toolboxes/`: Dockerfiles used to build the container images (e.g., `rocm-6.4.4`, `rocm-7.2.1`, `vulkan-radv`). These often use multi-stage builds to compile Llama.cpp and extract standalone binaries.
+*   `/benchmark/`: Shell scripts and Python utilities (like `generate_results_json.py`) to systematically test Llama.cpp throughput, latency, and RPC performance.
+*   `/docs/`: Markdown documents, along with HTML/CSS/JS (e.g., `index.html`, `assets/`) for the GitHub Pages website (`strix-halo-toolboxes.com`), plus interactive benchmark viewers and documentation on VRAM estimation.
+*   `/scripts/`: Python utilities, including `run_distributed_llama.py` for distributed inference across nodes.
+*   `.github/workflows/`: GitHub Actions that automatically rebuild containers whenever the upstream `llama.cpp` master branch updates or when triggered manually.
+
+## Critical Technical Quirks (Important for Development)
+*   **Flash Attention & no-mmap**: Running `llama-server` or `llama-cli` on Strix Halo *requires* `-fa 1` (flash attention) and `--no-mmap` to avoid memory fragmentation and crashes.
+*   **Kernel memory params**: The optimal Strix Halo host configuration relies on custom boot parameters (`iommu=pt amdgpu.gttsize=126976 ttm.pages_limit=32505856`) to allocate unified RAM to the iGPU.
+*   **ROCm 7+ Workaround**: Due to LLVM compiler regressions, ROCm 7 builds currently use a workaround flag (`-mllvm --amdgpu-unroll-threshold-local=600`) to restore Llama.cpp performance.
+*   **Kernel Bugs**: Avoid kernels older than 6.18.4, and the specifically broken `linux-firmware-20251125`.
+
+## General Instructions for Coding Agents
+1.  **Container Builds**: When modifying `Dockerfile.*` files inside `/toolboxes`, ensure the build output remains lean and only necessary runtime dependencies and Llama.cpp binaries are carried over.
+2.  **Documentation Synchronization**: If adding a new backend or feature, ensure `README.md` is updated simultaneously.
+3.  **Scripts**: Benchmarking and utility scripts are expected to integrate with standard `toolbox` execution. Use `/dev/dri` and `/dev/kfd` mounts for device access.
diff --git a/benchmark/run_benchmarks.sh b/benchmark/run_benchmarks.sh
index dc60ff2..728e808 100755
--- a/benchmark/run_benchmarks.sh
+++ b/benchmark/run_benchmarks.sh
@@ -63,6 +63,7 @@ echo
 declare -A CMDS=(
   [rocm6_4_4]="toolbox run -c llama-rocm-6.4.4 -- /usr/local/bin/llama-bench"
   [rocm-7_2_1]="toolbox run -c llama-rocm-7.2.1 -- /usr/local/bin/llama-bench"
+  [rocm-7_2_1-pr21344]="toolbox run -c llama-rocm-7.2.1-pr21344 -- /usr/local/bin/llama-bench"
   [rocm7-nightlies]="toolbox run -c llama-rocm7-nightlies -- /usr/local/bin/llama-bench"
   [vulkan_amdvlk]="toolbox run -c llama-vulkan-amdvlk -- /usr/sbin/llama-bench"
   [vulkan_radv]="toolbox run -c llama-vulkan-radv -- /usr/sbin/llama-bench"
diff --git a/refresh-toolboxes.sh b/refresh-toolboxes.sh
index d5618d4..936f141 100755
--- a/refresh-toolboxes.sh
+++ b/refresh-toolboxes.sh
@@ -9,6 +9,7 @@ TOOLBOXES["llama-vulkan-amdvlk"]="docker.io/kyuz0/amd-strix-halo-toolboxes:vulka
 TOOLBOXES["llama-vulkan-radv"]="docker.io/kyuz0/amd-strix-halo-toolboxes:vulkan-radv --device /dev/dri --group-add video --security-opt seccomp=unconfined"
 TOOLBOXES["llama-rocm-6.4.4"]="docker.io/kyuz0/amd-strix-halo-toolboxes:rocm-6.4.4 --device /dev/dri --device /dev/kfd --group-add video --group-add render --group-add sudo --security-opt seccomp=unconfined"
 TOOLBOXES["llama-rocm-7.2.1"]="docker.io/kyuz0/amd-strix-halo-toolboxes:rocm-7.2.1 --device /dev/dri --device /dev/kfd --group-add video --group-add render --group-add sudo --security-opt seccomp=unconfined"
+TOOLBOXES["llama-rocm-7.2.1-pr21344"]="docker.io/kyuz0/amd-strix-halo-toolboxes:rocm-7.2.1-pr21344 --device /dev/dri --device /dev/kfd --group-add video --group-add render --group-add sudo --security-opt seccomp=unconfined"
 TOOLBOXES["llama-rocm7-nightlies"]="docker.io/kyuz0/amd-strix-halo-toolboxes:rocm7-nightlies --device /dev/dri --device /dev/kfd --group-add video --group-add render --group-add sudo --security-opt seccomp=unconfined"
 
 function usage() {
diff --git a/toolboxes/Dockerfile.rocm-7.2.1-pr21344 b/toolboxes/Dockerfile.rocm-7.2.1-pr21344
new file mode 100644
index 0000000..0f3ec43
--- /dev/null
+++ b/toolboxes/Dockerfile.rocm-7.2.1-pr21344
@@ -0,0 +1,124 @@
+# build stage
+# Based on Dockerfile.rocm-7.2.1, but clones pedapudi/llama.cpp@gfx1151-opt
+# (PR #21344: gfx1151 nwarps, tile sizing to curb VGPR pressure)
+FROM registry.fedoraproject.org/fedora:43 AS builder
+
+# rocm 7.2.1 repo
+RUN <<'EOF'
+tee /etc/yum.repos.d/rocm.repo <<REPO
+[ROCm-7.2.1]
+name=ROCm7.2.1
+baseurl=https://repo.radeon.com/rocm/rhel10/7.2.1/main
+enabled=1
+priority=50
+gpgcheck=1
+gpgkey=https://repo.radeon.com/rocm/rocm.gpg.key
+REPO
+EOF
+
+# deps
+RUN dnf -y --nodocs --setopt=install_weak_deps=False \
+  --exclude='*sdk*' --exclude='*samples*' --exclude='*-doc*' --exclude='*-docs*' \
+  install \
+  make gcc cmake lld clang clang-devel compiler-rt libcurl-devel ninja-build \
+  rocm-llvm rocm-device-libs hip-runtime-amd hip-devel \
+  rocblas rocblas-devel hipblas hipblas-devel rocm-cmake libomp-devel libomp \
+  rocminfo radeontop \
+  git-core vim sudo rsync patch \
+  && dnf clean all && rm -rf /var/cache/dnf/*
+
+# rocm env
+ENV ROCM_PATH=/opt/rocm \
+  HIP_PATH=/opt/rocm \
+  HIP_CLANG_PATH=/opt/rocm/llvm/bin \
+  HIP_DEVICE_LIB_PATH=/opt/rocm/amdgcn/bitcode \
+  PATH=/opt/rocm/bin:/opt/rocm/llvm/bin:$PATH
+
+# llama.cpp — PR #21344 fork (gfx1151 MMQ/MMVQ tile + nwarp tuning)
+WORKDIR /opt/llama.cpp
+ARG REPO=https://github.com/pedapudi/llama.cpp.git
+ARG BRANCH=gfx1151-opt
+RUN git clone -b ${BRANCH} --single-branch --recursive ${REPO} .
+
+COPY llama-grammar.patch /tmp/llama-grammar.patch
+
+# build
+RUN git clean -xdf \
+  && git submodule update --recursive \
+  && patch -p1 < /tmp/llama-grammar.patch \
+  && cmake -S . -B build \
+  -DGGML_HIP=ON \
+  -DCMAKE_HIP_COMPILER=${HIP_CLANG_PATH}/clang \
+  -DCMAKE_HIP_FLAGS="--rocm-path=/opt/rocm -mllvm --amdgpu-unroll-threshold-local=600" \
+  -DAMDGPU_TARGETS=gfx1151 \
+  -DCMAKE_BUILD_TYPE=Release \
+  -DGGML_RPC=ON \
+  -DLLAMA_HIP_UMA=ON \
+  -DGGML_CUDA_ENABLE_UNIFIED_MEMORY=ON \
+  -DGGML_BMI2=ON \
+  -DGGML_FMA=ON \
+  -DGGML_F16C=ON \
+  -DGGML_CUDA_FA_ALL_QUANTS=ON \
+  -DLLAMA_BUILD_TESTS=OFF \
+  -DLLAMA_BUILD_EXAMPLES=OFF \
+  -DROCM_PATH=/opt/rocm \
+  -DHIP_PATH=/opt/rocm \
+  -DHIP_PLATFORM=amd \
+  && cmake --build build --config Release -- -j$(nproc) \
+  && cmake --install build --config Release
+
+# libs
+RUN find /opt/llama.cpp/build -type f -name 'lib*.so*' -exec cp {} /usr/lib64/ \; \
+  && ldconfig
+
+# helper
+COPY gguf-vram-estimator.py /usr/local/bin/gguf-vram-estimator.py
+RUN chmod +x /usr/local/bin/gguf-vram-estimator.py
+
+# runtime stage
+FROM registry.fedoraproject.org/fedora-minimal:43
+
+# rocm 7.2.1 repo
+RUN <<'EOF'
+tee /etc/yum.repos.d/rocm.repo <<REPO
+[ROCm-7.2.1]
+name=ROCm7.2.1
+baseurl=https://repo.radeon.com/rocm/rhel10/7.2.1/main
+enabled=1
+priority=50
+gpgcheck=1
+gpgkey=https://repo.radeon.com/rocm/rocm.gpg.key
+REPO
+EOF
+
+# runtime deps
+RUN microdnf -y --nodocs --setopt=install_weak_deps=0 \
+  --exclude='*sdk*' --exclude='*samples*' --exclude='*-doc*' --exclude='*-docs*' \
+  install \
+  bash ca-certificates libatomic libstdc++ libgcc libgomp sudo \
+  hip-runtime-amd rocblas hipblas \
+  rocminfo radeontop procps-ng \
+  && microdnf clean all && rm -rf /var/cache/dnf/*
+
+# copy
+COPY --from=builder /usr/local/ /usr/local/
+COPY --from=builder /opt/llama.cpp/build/bin/rpc-* /usr/local/bin/
+
+# ld
+RUN echo "/usr/local/lib"  > /etc/ld.so.conf.d/local.conf \
+  && echo "/usr/local/lib64" >> /etc/ld.so.conf.d/local.conf \
+  && ldconfig \
+  && cp -n /usr/local/lib/libllama*.so* /usr/lib64/ 2>/dev/null || true \
+  && ldconfig
+
+# helper
+COPY gguf-vram-estimator.py /usr/local/bin/gguf-vram-estimator.py
+RUN chmod +x /usr/local/bin/gguf-vram-estimator.py
+
+# profile
+RUN printf '%s\n' \
+  > /etc/profile.d/rocm.sh && chmod +x /etc/profile.d/rocm.sh \
+  && echo 'source /etc/profile.d/rocm.sh' >> /etc/bashrc
+
+# shell
+CMD ["/bin/bash"]