From 8648f93ad3f89b73bde2b7bce7cd32f08d9b0ff9 Mon Sep 17 00:00:00 2001 From: kyuz0 Date: Tue, 12 May 2026 12:32:07 +0100 Subject: [PATCH] updated benchs --- ...GLM-4.7-Flash-BF16-00001-of-00002__rocm-7_2_3__fa1.log | 8 ++++++++ ...BF16-00001-of-00002__rocm-7_2_3__fa1__longctx32768.log | 8 ++++++++ ...BF16-00001-of-00002__rocm-7_2_3__fa1__longctx65536.log | 8 ++++++++ .../results/GLM-4.7-Flash-UD-Q8_K_XL__rocm-7_2_3__fa1.log | 8 ++++++++ ....7-Flash-UD-Q8_K_XL__rocm-7_2_3__fa1__longctx32768.log | 8 ++++++++ ....7-Flash-UD-Q8_K_XL__rocm-7_2_3__fa1__longctx65536.log | 8 ++++++++ ...Max-M2.7-UD-Q3_K_S-00001-of-00003__rocm-7_2_3__fa1.log | 8 ++++++++ ..._K_S-00001-of-00003__rocm-7_2_3__fa1__longctx32768.log | 8 ++++++++ ..._K_S-00001-of-00003__rocm-7_2_3__fa1__longctx65536.log | 2 ++ ...-4-26B-A4B-it-BF16-00001-of-00002__rocm-7_2_3__fa1.log | 8 ++++++++ ...BF16-00001-of-00002__rocm-7_2_3__fa1__longctx32768.log | 8 ++++++++ ...BF16-00001-of-00002__rocm-7_2_3__fa1__longctx65536.log | 8 ++++++++ .../gemma-4-26B-A4B-it-UD-Q4_K_XL__rocm-7_2_3__fa1.log | 8 ++++++++ ...B-A4B-it-UD-Q4_K_XL__rocm-7_2_3__fa1__longctx32768.log | 8 ++++++++ ...B-A4B-it-UD-Q4_K_XL__rocm-7_2_3__fa1__longctx65536.log | 8 ++++++++ .../gemma-4-26B-A4B-it-UD-Q8_K_XL__rocm-7_2_3__fa1.log | 8 ++++++++ ...B-A4B-it-UD-Q8_K_XL__rocm-7_2_3__fa1__longctx32768.log | 8 ++++++++ ...B-A4B-it-UD-Q8_K_XL__rocm-7_2_3__fa1__longctx65536.log | 8 ++++++++ ...emma-4-31B-it-BF16-00001-of-00002__rocm-7_2_3__fa1.log | 8 ++++++++ ...BF16-00001-of-00002__rocm-7_2_3__fa1__longctx32768.log | 8 ++++++++ ...BF16-00001-of-00002__rocm-7_2_3__fa1__longctx65536.log | 8 ++++++++ .../gemma-4-31B-it-UD-Q4_K_XL__rocm-7_2_3__fa1.log | 8 ++++++++ ...4-31B-it-UD-Q4_K_XL__rocm-7_2_3__fa1__longctx32768.log | 8 ++++++++ ...4-31B-it-UD-Q4_K_XL__rocm-7_2_3__fa1__longctx65536.log | 8 ++++++++ .../gemma-4-31B-it-UD-Q8_K_XL__rocm-7_2_3__fa1.log | 8 ++++++++ ...4-31B-it-UD-Q8_K_XL__rocm-7_2_3__fa1__longctx32768.log | 8 ++++++++ ...4-31B-it-UD-Q8_K_XL__rocm-7_2_3__fa1__longctx65536.log | 8 ++++++++ ...gpt-oss-120b-mxfp4-00001-of-00003__rocm-7_2_3__fa1.log | 8 ++++++++ ...xfp4-00001-of-00003__rocm-7_2_3__fa1__longctx32768.log | 8 ++++++++ ...xfp4-00001-of-00003__rocm-7_2_3__fa1__longctx65536.log | 8 ++++++++ benchmark/results/gpt-oss-20b-mxfp4__rocm-7_2_3__fa1.log | 8 ++++++++ .../gpt-oss-20b-mxfp4__rocm-7_2_3__fa1__longctx32768.log | 8 ++++++++ .../gpt-oss-20b-mxfp4__rocm-7_2_3__fa1__longctx65536.log | 8 ++++++++ benchmark/results/llama-2-7b.Q4_0__rocm-7_2_3__fa1.log | 8 ++++++++ .../llama-2-7b.Q4_0__rocm-7_2_3__fa1__longctx32768.log | 8 ++++++++ .../llama-2-7b.Q4_0__rocm-7_2_3__fa1__longctx65536.log | 8 ++++++++ 36 files changed, 282 insertions(+) create mode 100644 benchmark/results/GLM-4.7-Flash-BF16-00001-of-00002__rocm-7_2_3__fa1.log create mode 100644 benchmark/results/GLM-4.7-Flash-BF16-00001-of-00002__rocm-7_2_3__fa1__longctx32768.log create mode 100644 benchmark/results/GLM-4.7-Flash-BF16-00001-of-00002__rocm-7_2_3__fa1__longctx65536.log create mode 100644 benchmark/results/GLM-4.7-Flash-UD-Q8_K_XL__rocm-7_2_3__fa1.log create mode 100644 benchmark/results/GLM-4.7-Flash-UD-Q8_K_XL__rocm-7_2_3__fa1__longctx32768.log create mode 100644 benchmark/results/GLM-4.7-Flash-UD-Q8_K_XL__rocm-7_2_3__fa1__longctx65536.log create mode 100644 benchmark/results/MiniMax-M2.7-UD-Q3_K_S-00001-of-00003__rocm-7_2_3__fa1.log create mode 100644 benchmark/results/MiniMax-M2.7-UD-Q3_K_S-00001-of-00003__rocm-7_2_3__fa1__longctx32768.log create mode 100644 benchmark/results/MiniMax-M2.7-UD-Q3_K_S-00001-of-00003__rocm-7_2_3__fa1__longctx65536.log create mode 100644 benchmark/results/gemma-4-26B-A4B-it-BF16-00001-of-00002__rocm-7_2_3__fa1.log create mode 100644 benchmark/results/gemma-4-26B-A4B-it-BF16-00001-of-00002__rocm-7_2_3__fa1__longctx32768.log create mode 100644 benchmark/results/gemma-4-26B-A4B-it-BF16-00001-of-00002__rocm-7_2_3__fa1__longctx65536.log create mode 100644 benchmark/results/gemma-4-26B-A4B-it-UD-Q4_K_XL__rocm-7_2_3__fa1.log create mode 100644 benchmark/results/gemma-4-26B-A4B-it-UD-Q4_K_XL__rocm-7_2_3__fa1__longctx32768.log create mode 100644 benchmark/results/gemma-4-26B-A4B-it-UD-Q4_K_XL__rocm-7_2_3__fa1__longctx65536.log create mode 100644 benchmark/results/gemma-4-26B-A4B-it-UD-Q8_K_XL__rocm-7_2_3__fa1.log create mode 100644 benchmark/results/gemma-4-26B-A4B-it-UD-Q8_K_XL__rocm-7_2_3__fa1__longctx32768.log create mode 100644 benchmark/results/gemma-4-26B-A4B-it-UD-Q8_K_XL__rocm-7_2_3__fa1__longctx65536.log create mode 100644 benchmark/results/gemma-4-31B-it-BF16-00001-of-00002__rocm-7_2_3__fa1.log create mode 100644 benchmark/results/gemma-4-31B-it-BF16-00001-of-00002__rocm-7_2_3__fa1__longctx32768.log create mode 100644 benchmark/results/gemma-4-31B-it-BF16-00001-of-00002__rocm-7_2_3__fa1__longctx65536.log create mode 100644 benchmark/results/gemma-4-31B-it-UD-Q4_K_XL__rocm-7_2_3__fa1.log create mode 100644 benchmark/results/gemma-4-31B-it-UD-Q4_K_XL__rocm-7_2_3__fa1__longctx32768.log create mode 100644 benchmark/results/gemma-4-31B-it-UD-Q4_K_XL__rocm-7_2_3__fa1__longctx65536.log create mode 100644 benchmark/results/gemma-4-31B-it-UD-Q8_K_XL__rocm-7_2_3__fa1.log create mode 100644 benchmark/results/gemma-4-31B-it-UD-Q8_K_XL__rocm-7_2_3__fa1__longctx32768.log create mode 100644 benchmark/results/gemma-4-31B-it-UD-Q8_K_XL__rocm-7_2_3__fa1__longctx65536.log create mode 100644 benchmark/results/gpt-oss-120b-mxfp4-00001-of-00003__rocm-7_2_3__fa1.log create mode 100644 benchmark/results/gpt-oss-120b-mxfp4-00001-of-00003__rocm-7_2_3__fa1__longctx32768.log create mode 100644 benchmark/results/gpt-oss-120b-mxfp4-00001-of-00003__rocm-7_2_3__fa1__longctx65536.log create mode 100644 benchmark/results/gpt-oss-20b-mxfp4__rocm-7_2_3__fa1.log create mode 100644 benchmark/results/gpt-oss-20b-mxfp4__rocm-7_2_3__fa1__longctx32768.log create mode 100644 benchmark/results/gpt-oss-20b-mxfp4__rocm-7_2_3__fa1__longctx65536.log create mode 100644 benchmark/results/llama-2-7b.Q4_0__rocm-7_2_3__fa1.log create mode 100644 benchmark/results/llama-2-7b.Q4_0__rocm-7_2_3__fa1__longctx32768.log create mode 100644 benchmark/results/llama-2-7b.Q4_0__rocm-7_2_3__fa1__longctx65536.log diff --git a/benchmark/results/GLM-4.7-Flash-BF16-00001-of-00002__rocm-7_2_3__fa1.log b/benchmark/results/GLM-4.7-Flash-BF16-00001-of-00002__rocm-7_2_3__fa1.log new file mode 100644 index 0000000..b419d8c --- /dev/null +++ b/benchmark/results/GLM-4.7-Flash-BF16-00001-of-00002__rocm-7_2_3__fa1.log @@ -0,0 +1,8 @@ +ggml_cuda_init: found 1 ROCm devices (Total VRAM: 126976 MiB): + Device 0: AMD Radeon 8060S Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32, VRAM: 126976 MiB +| model | size | params | backend | ngl | fa | mmap | test | t/s | +| ------------------------------ | ---------: | ---------: | ---------- | --: | -: | ---: | --------------: | -------------------: | +| deepseek2 30B.A3B BF16 | 55.79 GiB | 29.94 B | ROCm | 99 | 1 | 0 | pp512 | 403.20 ± 2.04 | +| deepseek2 30B.A3B BF16 | 55.79 GiB | 29.94 B | ROCm | 99 | 1 | 0 | tg128 | 20.43 ± 0.00 | + +build: 8e1f9d083 (9112) diff --git a/benchmark/results/GLM-4.7-Flash-BF16-00001-of-00002__rocm-7_2_3__fa1__longctx32768.log b/benchmark/results/GLM-4.7-Flash-BF16-00001-of-00002__rocm-7_2_3__fa1__longctx32768.log new file mode 100644 index 0000000..508de56 --- /dev/null +++ b/benchmark/results/GLM-4.7-Flash-BF16-00001-of-00002__rocm-7_2_3__fa1__longctx32768.log @@ -0,0 +1,8 @@ +ggml_cuda_init: found 1 ROCm devices (Total VRAM: 126976 MiB): + Device 0: AMD Radeon 8060S Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32, VRAM: 126976 MiB +| model | size | params | backend | ngl | n_ubatch | fa | mmap | test | t/s | +| ------------------------------ | ---------: | ---------: | ---------- | --: | -------: | -: | ---: | --------------: | -------------------: | +| deepseek2 30B.A3B BF16 | 55.79 GiB | 29.94 B | ROCm | 99 | 2048 | 1 | 0 | pp2048 @ d32768 | 87.63 ± 0.21 | +| deepseek2 30B.A3B BF16 | 55.79 GiB | 29.94 B | ROCm | 99 | 2048 | 1 | 0 | tg32 @ d32768 | 15.31 ± 0.02 | + +build: 8e1f9d083 (9112) diff --git a/benchmark/results/GLM-4.7-Flash-BF16-00001-of-00002__rocm-7_2_3__fa1__longctx65536.log b/benchmark/results/GLM-4.7-Flash-BF16-00001-of-00002__rocm-7_2_3__fa1__longctx65536.log new file mode 100644 index 0000000..cf9cc2e --- /dev/null +++ b/benchmark/results/GLM-4.7-Flash-BF16-00001-of-00002__rocm-7_2_3__fa1__longctx65536.log @@ -0,0 +1,8 @@ +ggml_cuda_init: found 1 ROCm devices (Total VRAM: 126976 MiB): + Device 0: AMD Radeon 8060S Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32, VRAM: 126976 MiB +| model | size | params | backend | ngl | n_ubatch | fa | mmap | test | t/s | +| ------------------------------ | ---------: | ---------: | ---------- | --: | -------: | -: | ---: | --------------: | -------------------: | +| deepseek2 30B.A3B BF16 | 55.79 GiB | 29.94 B | ROCm | 99 | 2048 | 1 | 0 | pp2048 @ d65536 | 46.45 ± 0.01 | +| deepseek2 30B.A3B BF16 | 55.79 GiB | 29.94 B | ROCm | 99 | 2048 | 1 | 0 | tg32 @ d65536 | 11.98 ± 0.00 | + +build: 8e1f9d083 (9112) diff --git a/benchmark/results/GLM-4.7-Flash-UD-Q8_K_XL__rocm-7_2_3__fa1.log b/benchmark/results/GLM-4.7-Flash-UD-Q8_K_XL__rocm-7_2_3__fa1.log new file mode 100644 index 0000000..e64c344 --- /dev/null +++ b/benchmark/results/GLM-4.7-Flash-UD-Q8_K_XL__rocm-7_2_3__fa1.log @@ -0,0 +1,8 @@ +ggml_cuda_init: found 1 ROCm devices (Total VRAM: 126976 MiB): + Device 0: AMD Radeon 8060S Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32, VRAM: 126976 MiB +| model | size | params | backend | ngl | fa | mmap | test | t/s | +| ------------------------------ | ---------: | ---------: | ---------- | --: | -: | ---: | --------------: | -------------------: | +| deepseek2 30B.A3B Q8_0 | 32.70 GiB | 29.94 B | ROCm | 99 | 1 | 0 | pp512 | 905.99 ± 2.05 | +| deepseek2 30B.A3B Q8_0 | 32.70 GiB | 29.94 B | ROCm | 99 | 1 | 0 | tg128 | 32.65 ± 0.00 | + +build: 8e1f9d083 (9112) diff --git a/benchmark/results/GLM-4.7-Flash-UD-Q8_K_XL__rocm-7_2_3__fa1__longctx32768.log b/benchmark/results/GLM-4.7-Flash-UD-Q8_K_XL__rocm-7_2_3__fa1__longctx32768.log new file mode 100644 index 0000000..7f43dd2 --- /dev/null +++ b/benchmark/results/GLM-4.7-Flash-UD-Q8_K_XL__rocm-7_2_3__fa1__longctx32768.log @@ -0,0 +1,8 @@ +ggml_cuda_init: found 1 ROCm devices (Total VRAM: 126976 MiB): + Device 0: AMD Radeon 8060S Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32, VRAM: 126976 MiB +| model | size | params | backend | ngl | n_ubatch | fa | mmap | test | t/s | +| ------------------------------ | ---------: | ---------: | ---------- | --: | -------: | -: | ---: | --------------: | -------------------: | +| deepseek2 30B.A3B Q8_0 | 32.70 GiB | 29.94 B | ROCm | 99 | 2048 | 1 | 0 | pp2048 @ d32768 | 91.59 ± 0.16 | +| deepseek2 30B.A3B Q8_0 | 32.70 GiB | 29.94 B | ROCm | 99 | 2048 | 1 | 0 | tg32 @ d32768 | 20.36 ± 0.00 | + +build: 8e1f9d083 (9112) diff --git a/benchmark/results/GLM-4.7-Flash-UD-Q8_K_XL__rocm-7_2_3__fa1__longctx65536.log b/benchmark/results/GLM-4.7-Flash-UD-Q8_K_XL__rocm-7_2_3__fa1__longctx65536.log new file mode 100644 index 0000000..b6a4eed --- /dev/null +++ b/benchmark/results/GLM-4.7-Flash-UD-Q8_K_XL__rocm-7_2_3__fa1__longctx65536.log @@ -0,0 +1,8 @@ +ggml_cuda_init: found 1 ROCm devices (Total VRAM: 126976 MiB): + Device 0: AMD Radeon 8060S Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32, VRAM: 126976 MiB +| model | size | params | backend | ngl | n_ubatch | fa | mmap | test | t/s | +| ------------------------------ | ---------: | ---------: | ---------- | --: | -------: | -: | ---: | --------------: | -------------------: | +| deepseek2 30B.A3B Q8_0 | 32.70 GiB | 29.94 B | ROCm | 99 | 2048 | 1 | 0 | pp2048 @ d65536 | 47.51 ± 0.05 | +| deepseek2 30B.A3B Q8_0 | 32.70 GiB | 29.94 B | ROCm | 99 | 2048 | 1 | 0 | tg32 @ d65536 | 14.86 ± 0.00 | + +build: 8e1f9d083 (9112) diff --git a/benchmark/results/MiniMax-M2.7-UD-Q3_K_S-00001-of-00003__rocm-7_2_3__fa1.log b/benchmark/results/MiniMax-M2.7-UD-Q3_K_S-00001-of-00003__rocm-7_2_3__fa1.log new file mode 100644 index 0000000..b78725e --- /dev/null +++ b/benchmark/results/MiniMax-M2.7-UD-Q3_K_S-00001-of-00003__rocm-7_2_3__fa1.log @@ -0,0 +1,8 @@ +ggml_cuda_init: found 1 ROCm devices (Total VRAM: 126976 MiB): + Device 0: AMD Radeon 8060S Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32, VRAM: 126976 MiB +| model | size | params | backend | ngl | fa | mmap | test | t/s | +| ------------------------------ | ---------: | ---------: | ---------- | --: | -: | ---: | --------------: | -------------------: | +| minimax-m2 230B.A10B Q3_K - Small | 87.20 GiB | 228.69 B | ROCm | 99 | 1 | 0 | pp512 | 236.96 ± 1.25 | +| minimax-m2 230B.A10B Q3_K - Small | 87.20 GiB | 228.69 B | ROCm | 99 | 1 | 0 | tg128 | 22.81 ± 0.01 | + +build: 8e1f9d083 (9112) diff --git a/benchmark/results/MiniMax-M2.7-UD-Q3_K_S-00001-of-00003__rocm-7_2_3__fa1__longctx32768.log b/benchmark/results/MiniMax-M2.7-UD-Q3_K_S-00001-of-00003__rocm-7_2_3__fa1__longctx32768.log new file mode 100644 index 0000000..7ffced2 --- /dev/null +++ b/benchmark/results/MiniMax-M2.7-UD-Q3_K_S-00001-of-00003__rocm-7_2_3__fa1__longctx32768.log @@ -0,0 +1,8 @@ +ggml_cuda_init: found 1 ROCm devices (Total VRAM: 126976 MiB): + Device 0: AMD Radeon 8060S Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32, VRAM: 126976 MiB +| model | size | params | backend | ngl | n_ubatch | fa | mmap | test | t/s | +| ------------------------------ | ---------: | ---------: | ---------- | --: | -------: | -: | ---: | --------------: | -------------------: | +| minimax-m2 230B.A10B Q3_K - Small | 87.20 GiB | 228.69 B | ROCm | 99 | 2048 | 1 | 0 | pp2048 @ d32768 | 62.13 ± 0.56 | +| minimax-m2 230B.A10B Q3_K - Small | 87.20 GiB | 228.69 B | ROCm | 99 | 2048 | 1 | 0 | tg32 @ d32768 | 6.15 ± 0.00 | + +build: 8e1f9d083 (9112) diff --git a/benchmark/results/MiniMax-M2.7-UD-Q3_K_S-00001-of-00003__rocm-7_2_3__fa1__longctx65536.log b/benchmark/results/MiniMax-M2.7-UD-Q3_K_S-00001-of-00003__rocm-7_2_3__fa1__longctx65536.log new file mode 100644 index 0000000..bc1868c --- /dev/null +++ b/benchmark/results/MiniMax-M2.7-UD-Q3_K_S-00001-of-00003__rocm-7_2_3__fa1__longctx65536.log @@ -0,0 +1,2 @@ +ggml_cuda_init: found 1 ROCm devices (Total VRAM: 126976 MiB): + Device 0: AMD Radeon 8060S Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32, VRAM: 126976 MiB diff --git a/benchmark/results/gemma-4-26B-A4B-it-BF16-00001-of-00002__rocm-7_2_3__fa1.log b/benchmark/results/gemma-4-26B-A4B-it-BF16-00001-of-00002__rocm-7_2_3__fa1.log new file mode 100644 index 0000000..d61700a --- /dev/null +++ b/benchmark/results/gemma-4-26B-A4B-it-BF16-00001-of-00002__rocm-7_2_3__fa1.log @@ -0,0 +1,8 @@ +ggml_cuda_init: found 1 ROCm devices (Total VRAM: 126976 MiB): + Device 0: AMD Radeon 8060S Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32, VRAM: 126976 MiB +| model | size | params | backend | ngl | fa | mmap | test | t/s | +| ------------------------------ | ---------: | ---------: | ---------- | --: | -: | ---: | --------------: | -------------------: | +| gemma4 26B.A4B BF16 | 47.02 GiB | 25.23 B | ROCm | 99 | 1 | 0 | pp512 | 796.06 ± 126.76 | +| gemma4 26B.A4B BF16 | 47.02 GiB | 25.23 B | ROCm | 99 | 1 | 0 | tg128 | 22.68 ± 0.00 | + +build: 8e1f9d083 (9112) diff --git a/benchmark/results/gemma-4-26B-A4B-it-BF16-00001-of-00002__rocm-7_2_3__fa1__longctx32768.log b/benchmark/results/gemma-4-26B-A4B-it-BF16-00001-of-00002__rocm-7_2_3__fa1__longctx32768.log new file mode 100644 index 0000000..063ea41 --- /dev/null +++ b/benchmark/results/gemma-4-26B-A4B-it-BF16-00001-of-00002__rocm-7_2_3__fa1__longctx32768.log @@ -0,0 +1,8 @@ +ggml_cuda_init: found 1 ROCm devices (Total VRAM: 126976 MiB): + Device 0: AMD Radeon 8060S Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32, VRAM: 126976 MiB +| model | size | params | backend | ngl | n_ubatch | fa | mmap | test | t/s | +| ------------------------------ | ---------: | ---------: | ---------- | --: | -------: | -: | ---: | --------------: | -------------------: | +| gemma4 26B.A4B BF16 | 47.02 GiB | 25.23 B | ROCm | 99 | 2048 | 1 | 0 | pp2048 @ d32768 | 628.20 ± 5.71 | +| gemma4 26B.A4B BF16 | 47.02 GiB | 25.23 B | ROCm | 99 | 2048 | 1 | 0 | tg32 @ d32768 | 20.12 ± 0.00 | + +build: 8e1f9d083 (9112) diff --git a/benchmark/results/gemma-4-26B-A4B-it-BF16-00001-of-00002__rocm-7_2_3__fa1__longctx65536.log b/benchmark/results/gemma-4-26B-A4B-it-BF16-00001-of-00002__rocm-7_2_3__fa1__longctx65536.log new file mode 100644 index 0000000..4909514 --- /dev/null +++ b/benchmark/results/gemma-4-26B-A4B-it-BF16-00001-of-00002__rocm-7_2_3__fa1__longctx65536.log @@ -0,0 +1,8 @@ +ggml_cuda_init: found 1 ROCm devices (Total VRAM: 126976 MiB): + Device 0: AMD Radeon 8060S Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32, VRAM: 126976 MiB +| model | size | params | backend | ngl | n_ubatch | fa | mmap | test | t/s | +| ------------------------------ | ---------: | ---------: | ---------- | --: | -------: | -: | ---: | --------------: | -------------------: | +| gemma4 26B.A4B BF16 | 47.02 GiB | 25.23 B | ROCm | 99 | 2048 | 1 | 0 | pp2048 @ d65536 | 426.61 ± 3.98 | +| gemma4 26B.A4B BF16 | 47.02 GiB | 25.23 B | ROCm | 99 | 2048 | 1 | 0 | tg32 @ d65536 | 19.55 ± 0.01 | + +build: 8e1f9d083 (9112) diff --git a/benchmark/results/gemma-4-26B-A4B-it-UD-Q4_K_XL__rocm-7_2_3__fa1.log b/benchmark/results/gemma-4-26B-A4B-it-UD-Q4_K_XL__rocm-7_2_3__fa1.log new file mode 100644 index 0000000..88d1ad9 --- /dev/null +++ b/benchmark/results/gemma-4-26B-A4B-it-UD-Q4_K_XL__rocm-7_2_3__fa1.log @@ -0,0 +1,8 @@ +ggml_cuda_init: found 1 ROCm devices (Total VRAM: 126976 MiB): + Device 0: AMD Radeon 8060S Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32, VRAM: 126976 MiB +| model | size | params | backend | ngl | fa | mmap | test | t/s | +| ------------------------------ | ---------: | ---------: | ---------- | --: | -: | ---: | --------------: | -------------------: | +| gemma4 26B.A4B Q4_K - Medium | 15.90 GiB | 25.23 B | ROCm | 99 | 1 | 0 | pp512 | 1301.66 ± 10.69 | +| gemma4 26B.A4B Q4_K - Medium | 15.90 GiB | 25.23 B | ROCm | 99 | 1 | 0 | tg128 | 46.42 ± 0.00 | + +build: 8e1f9d083 (9112) diff --git a/benchmark/results/gemma-4-26B-A4B-it-UD-Q4_K_XL__rocm-7_2_3__fa1__longctx32768.log b/benchmark/results/gemma-4-26B-A4B-it-UD-Q4_K_XL__rocm-7_2_3__fa1__longctx32768.log new file mode 100644 index 0000000..00152b7 --- /dev/null +++ b/benchmark/results/gemma-4-26B-A4B-it-UD-Q4_K_XL__rocm-7_2_3__fa1__longctx32768.log @@ -0,0 +1,8 @@ +ggml_cuda_init: found 1 ROCm devices (Total VRAM: 126976 MiB): + Device 0: AMD Radeon 8060S Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32, VRAM: 126976 MiB +| model | size | params | backend | ngl | n_ubatch | fa | mmap | test | t/s | +| ------------------------------ | ---------: | ---------: | ---------- | --: | -------: | -: | ---: | --------------: | -------------------: | +| gemma4 26B.A4B Q4_K - Medium | 15.90 GiB | 25.23 B | ROCm | 99 | 2048 | 1 | 0 | pp2048 @ d32768 | 693.56 ± 10.92 | +| gemma4 26B.A4B Q4_K - Medium | 15.90 GiB | 25.23 B | ROCm | 99 | 2048 | 1 | 0 | tg32 @ d32768 | 36.83 ± 0.15 | + +build: 8e1f9d083 (9112) diff --git a/benchmark/results/gemma-4-26B-A4B-it-UD-Q4_K_XL__rocm-7_2_3__fa1__longctx65536.log b/benchmark/results/gemma-4-26B-A4B-it-UD-Q4_K_XL__rocm-7_2_3__fa1__longctx65536.log new file mode 100644 index 0000000..dd17c6d --- /dev/null +++ b/benchmark/results/gemma-4-26B-A4B-it-UD-Q4_K_XL__rocm-7_2_3__fa1__longctx65536.log @@ -0,0 +1,8 @@ +ggml_cuda_init: found 1 ROCm devices (Total VRAM: 126976 MiB): + Device 0: AMD Radeon 8060S Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32, VRAM: 126976 MiB +| model | size | params | backend | ngl | n_ubatch | fa | mmap | test | t/s | +| ------------------------------ | ---------: | ---------: | ---------- | --: | -------: | -: | ---: | --------------: | -------------------: | +| gemma4 26B.A4B Q4_K - Medium | 15.90 GiB | 25.23 B | ROCm | 99 | 2048 | 1 | 0 | pp2048 @ d65536 | 450.75 ± 0.77 | +| gemma4 26B.A4B Q4_K - Medium | 15.90 GiB | 25.23 B | ROCm | 99 | 2048 | 1 | 0 | tg32 @ d65536 | 34.84 ± 0.16 | + +build: 8e1f9d083 (9112) diff --git a/benchmark/results/gemma-4-26B-A4B-it-UD-Q8_K_XL__rocm-7_2_3__fa1.log b/benchmark/results/gemma-4-26B-A4B-it-UD-Q8_K_XL__rocm-7_2_3__fa1.log new file mode 100644 index 0000000..b8fb75e --- /dev/null +++ b/benchmark/results/gemma-4-26B-A4B-it-UD-Q8_K_XL__rocm-7_2_3__fa1.log @@ -0,0 +1,8 @@ +ggml_cuda_init: found 1 ROCm devices (Total VRAM: 126976 MiB): + Device 0: AMD Radeon 8060S Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32, VRAM: 126976 MiB +| model | size | params | backend | ngl | fa | mmap | test | t/s | +| ------------------------------ | ---------: | ---------: | ---------- | --: | -: | ---: | --------------: | -------------------: | +| gemma4 26B.A4B Q8_0 | 25.94 GiB | 25.23 B | ROCm | 99 | 1 | 0 | pp512 | 1301.93 ± 18.07 | +| gemma4 26B.A4B Q8_0 | 25.94 GiB | 25.23 B | ROCm | 99 | 1 | 0 | tg128 | 41.35 ± 0.03 | + +build: 8e1f9d083 (9112) diff --git a/benchmark/results/gemma-4-26B-A4B-it-UD-Q8_K_XL__rocm-7_2_3__fa1__longctx32768.log b/benchmark/results/gemma-4-26B-A4B-it-UD-Q8_K_XL__rocm-7_2_3__fa1__longctx32768.log new file mode 100644 index 0000000..d15b313 --- /dev/null +++ b/benchmark/results/gemma-4-26B-A4B-it-UD-Q8_K_XL__rocm-7_2_3__fa1__longctx32768.log @@ -0,0 +1,8 @@ +ggml_cuda_init: found 1 ROCm devices (Total VRAM: 126976 MiB): + Device 0: AMD Radeon 8060S Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32, VRAM: 126976 MiB +| model | size | params | backend | ngl | n_ubatch | fa | mmap | test | t/s | +| ------------------------------ | ---------: | ---------: | ---------- | --: | -------: | -: | ---: | --------------: | -------------------: | +| gemma4 26B.A4B Q8_0 | 25.94 GiB | 25.23 B | ROCm | 99 | 2048 | 1 | 0 | pp2048 @ d32768 | 695.00 ± 7.26 | +| gemma4 26B.A4B Q8_0 | 25.94 GiB | 25.23 B | ROCm | 99 | 2048 | 1 | 0 | tg32 @ d32768 | 33.41 ± 0.01 | + +build: 8e1f9d083 (9112) diff --git a/benchmark/results/gemma-4-26B-A4B-it-UD-Q8_K_XL__rocm-7_2_3__fa1__longctx65536.log b/benchmark/results/gemma-4-26B-A4B-it-UD-Q8_K_XL__rocm-7_2_3__fa1__longctx65536.log new file mode 100644 index 0000000..c68c3d3 --- /dev/null +++ b/benchmark/results/gemma-4-26B-A4B-it-UD-Q8_K_XL__rocm-7_2_3__fa1__longctx65536.log @@ -0,0 +1,8 @@ +ggml_cuda_init: found 1 ROCm devices (Total VRAM: 126976 MiB): + Device 0: AMD Radeon 8060S Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32, VRAM: 126976 MiB +| model | size | params | backend | ngl | n_ubatch | fa | mmap | test | t/s | +| ------------------------------ | ---------: | ---------: | ---------- | --: | -------: | -: | ---: | --------------: | -------------------: | +| gemma4 26B.A4B Q8_0 | 25.94 GiB | 25.23 B | ROCm | 99 | 2048 | 1 | 0 | pp2048 @ d65536 | 447.63 ± 3.37 | +| gemma4 26B.A4B Q8_0 | 25.94 GiB | 25.23 B | ROCm | 99 | 2048 | 1 | 0 | tg32 @ d65536 | 31.76 ± 0.04 | + +build: 8e1f9d083 (9112) diff --git a/benchmark/results/gemma-4-31B-it-BF16-00001-of-00002__rocm-7_2_3__fa1.log b/benchmark/results/gemma-4-31B-it-BF16-00001-of-00002__rocm-7_2_3__fa1.log new file mode 100644 index 0000000..9ea45f7 --- /dev/null +++ b/benchmark/results/gemma-4-31B-it-BF16-00001-of-00002__rocm-7_2_3__fa1.log @@ -0,0 +1,8 @@ +ggml_cuda_init: found 1 ROCm devices (Total VRAM: 126976 MiB): + Device 0: AMD Radeon 8060S Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32, VRAM: 126976 MiB +| model | size | params | backend | ngl | fa | mmap | test | t/s | +| ------------------------------ | ---------: | ---------: | ---------- | --: | -: | ---: | --------------: | -------------------: | +| gemma4 31B BF16 | 57.18 GiB | 30.70 B | ROCm | 99 | 1 | 0 | pp512 | 376.47 ± 1.77 | +| gemma4 31B BF16 | 57.18 GiB | 30.70 B | ROCm | 99 | 1 | 0 | tg128 | 3.44 ± 0.00 | + +build: 8e1f9d083 (9112) diff --git a/benchmark/results/gemma-4-31B-it-BF16-00001-of-00002__rocm-7_2_3__fa1__longctx32768.log b/benchmark/results/gemma-4-31B-it-BF16-00001-of-00002__rocm-7_2_3__fa1__longctx32768.log new file mode 100644 index 0000000..d6a45c5 --- /dev/null +++ b/benchmark/results/gemma-4-31B-it-BF16-00001-of-00002__rocm-7_2_3__fa1__longctx32768.log @@ -0,0 +1,8 @@ +ggml_cuda_init: found 1 ROCm devices (Total VRAM: 126976 MiB): + Device 0: AMD Radeon 8060S Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32, VRAM: 126976 MiB +| model | size | params | backend | ngl | n_ubatch | fa | mmap | test | t/s | +| ------------------------------ | ---------: | ---------: | ---------- | --: | -------: | -: | ---: | --------------: | -------------------: | +| gemma4 31B BF16 | 57.18 GiB | 30.70 B | ROCm | 99 | 2048 | 1 | 0 | pp2048 @ d32768 | 152.54 ± 2.43 | +| gemma4 31B BF16 | 57.18 GiB | 30.70 B | ROCm | 99 | 2048 | 1 | 0 | tg32 @ d32768 | 3.18 ± 0.00 | + +build: 8e1f9d083 (9112) diff --git a/benchmark/results/gemma-4-31B-it-BF16-00001-of-00002__rocm-7_2_3__fa1__longctx65536.log b/benchmark/results/gemma-4-31B-it-BF16-00001-of-00002__rocm-7_2_3__fa1__longctx65536.log new file mode 100644 index 0000000..982b824 --- /dev/null +++ b/benchmark/results/gemma-4-31B-it-BF16-00001-of-00002__rocm-7_2_3__fa1__longctx65536.log @@ -0,0 +1,8 @@ +ggml_cuda_init: found 1 ROCm devices (Total VRAM: 126976 MiB): + Device 0: AMD Radeon 8060S Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32, VRAM: 126976 MiB +| model | size | params | backend | ngl | n_ubatch | fa | mmap | test | t/s | +| ------------------------------ | ---------: | ---------: | ---------- | --: | -------: | -: | ---: | --------------: | -------------------: | +| gemma4 31B BF16 | 57.18 GiB | 30.70 B | ROCm | 99 | 2048 | 1 | 0 | pp2048 @ d65536 | 98.53 ± 0.11 | +| gemma4 31B BF16 | 57.18 GiB | 30.70 B | ROCm | 99 | 2048 | 1 | 0 | tg32 @ d65536 | 3.08 ± 0.02 | + +build: 8e1f9d083 (9112) diff --git a/benchmark/results/gemma-4-31B-it-UD-Q4_K_XL__rocm-7_2_3__fa1.log b/benchmark/results/gemma-4-31B-it-UD-Q4_K_XL__rocm-7_2_3__fa1.log new file mode 100644 index 0000000..60b508a --- /dev/null +++ b/benchmark/results/gemma-4-31B-it-UD-Q4_K_XL__rocm-7_2_3__fa1.log @@ -0,0 +1,8 @@ +ggml_cuda_init: found 1 ROCm devices (Total VRAM: 126976 MiB): + Device 0: AMD Radeon 8060S Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32, VRAM: 126976 MiB +| model | size | params | backend | ngl | fa | mmap | test | t/s | +| ------------------------------ | ---------: | ---------: | ---------- | --: | -: | ---: | --------------: | -------------------: | +| gemma4 31B Q4_K - Medium | 17.46 GiB | 30.70 B | ROCm | 99 | 1 | 0 | pp512 | 309.16 ± 0.31 | +| gemma4 31B Q4_K - Medium | 17.46 GiB | 30.70 B | ROCm | 99 | 1 | 0 | tg128 | 10.49 ± 0.00 | + +build: 8e1f9d083 (9112) diff --git a/benchmark/results/gemma-4-31B-it-UD-Q4_K_XL__rocm-7_2_3__fa1__longctx32768.log b/benchmark/results/gemma-4-31B-it-UD-Q4_K_XL__rocm-7_2_3__fa1__longctx32768.log new file mode 100644 index 0000000..aa2a5d6 --- /dev/null +++ b/benchmark/results/gemma-4-31B-it-UD-Q4_K_XL__rocm-7_2_3__fa1__longctx32768.log @@ -0,0 +1,8 @@ +ggml_cuda_init: found 1 ROCm devices (Total VRAM: 126976 MiB): + Device 0: AMD Radeon 8060S Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32, VRAM: 126976 MiB +| model | size | params | backend | ngl | n_ubatch | fa | mmap | test | t/s | +| ------------------------------ | ---------: | ---------: | ---------- | --: | -------: | -: | ---: | --------------: | -------------------: | +| gemma4 31B Q4_K - Medium | 17.46 GiB | 30.70 B | ROCm | 99 | 2048 | 1 | 0 | pp2048 @ d32768 | 139.29 ± 1.61 | +| gemma4 31B Q4_K - Medium | 17.46 GiB | 30.70 B | ROCm | 99 | 2048 | 1 | 0 | tg32 @ d32768 | 8.10 ± 0.01 | + +build: 8e1f9d083 (9112) diff --git a/benchmark/results/gemma-4-31B-it-UD-Q4_K_XL__rocm-7_2_3__fa1__longctx65536.log b/benchmark/results/gemma-4-31B-it-UD-Q4_K_XL__rocm-7_2_3__fa1__longctx65536.log new file mode 100644 index 0000000..785a7fd --- /dev/null +++ b/benchmark/results/gemma-4-31B-it-UD-Q4_K_XL__rocm-7_2_3__fa1__longctx65536.log @@ -0,0 +1,8 @@ +ggml_cuda_init: found 1 ROCm devices (Total VRAM: 126976 MiB): + Device 0: AMD Radeon 8060S Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32, VRAM: 126976 MiB +| model | size | params | backend | ngl | n_ubatch | fa | mmap | test | t/s | +| ------------------------------ | ---------: | ---------: | ---------- | --: | -------: | -: | ---: | --------------: | -------------------: | +| gemma4 31B Q4_K - Medium | 17.46 GiB | 30.70 B | ROCm | 99 | 2048 | 1 | 0 | pp2048 @ d65536 | 92.80 ± 0.18 | +| gemma4 31B Q4_K - Medium | 17.46 GiB | 30.70 B | ROCm | 99 | 2048 | 1 | 0 | tg32 @ d65536 | 7.51 ± 0.01 | + +build: 8e1f9d083 (9112) diff --git a/benchmark/results/gemma-4-31B-it-UD-Q8_K_XL__rocm-7_2_3__fa1.log b/benchmark/results/gemma-4-31B-it-UD-Q8_K_XL__rocm-7_2_3__fa1.log new file mode 100644 index 0000000..8b2bd22 --- /dev/null +++ b/benchmark/results/gemma-4-31B-it-UD-Q8_K_XL__rocm-7_2_3__fa1.log @@ -0,0 +1,8 @@ +ggml_cuda_init: found 1 ROCm devices (Total VRAM: 126976 MiB): + Device 0: AMD Radeon 8060S Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32, VRAM: 126976 MiB +| model | size | params | backend | ngl | fa | mmap | test | t/s | +| ------------------------------ | ---------: | ---------: | ---------- | --: | -: | ---: | --------------: | -------------------: | +| gemma4 31B Q8_0 | 32.60 GiB | 30.70 B | ROCm | 99 | 1 | 0 | pp512 | 309.61 ± 0.81 | +| gemma4 31B Q8_0 | 32.60 GiB | 30.70 B | ROCm | 99 | 1 | 0 | tg128 | 6.15 ± 0.00 | + +build: 8e1f9d083 (9112) diff --git a/benchmark/results/gemma-4-31B-it-UD-Q8_K_XL__rocm-7_2_3__fa1__longctx32768.log b/benchmark/results/gemma-4-31B-it-UD-Q8_K_XL__rocm-7_2_3__fa1__longctx32768.log new file mode 100644 index 0000000..3e6693a --- /dev/null +++ b/benchmark/results/gemma-4-31B-it-UD-Q8_K_XL__rocm-7_2_3__fa1__longctx32768.log @@ -0,0 +1,8 @@ +ggml_cuda_init: found 1 ROCm devices (Total VRAM: 126976 MiB): + Device 0: AMD Radeon 8060S Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32, VRAM: 126976 MiB +| model | size | params | backend | ngl | n_ubatch | fa | mmap | test | t/s | +| ------------------------------ | ---------: | ---------: | ---------- | --: | -------: | -: | ---: | --------------: | -------------------: | +| gemma4 31B Q8_0 | 32.60 GiB | 30.70 B | ROCm | 99 | 2048 | 1 | 0 | pp2048 @ d32768 | 139.33 ± 1.73 | +| gemma4 31B Q8_0 | 32.60 GiB | 30.70 B | ROCm | 99 | 2048 | 1 | 0 | tg32 @ d32768 | 5.25 ± 0.00 | + +build: 8e1f9d083 (9112) diff --git a/benchmark/results/gemma-4-31B-it-UD-Q8_K_XL__rocm-7_2_3__fa1__longctx65536.log b/benchmark/results/gemma-4-31B-it-UD-Q8_K_XL__rocm-7_2_3__fa1__longctx65536.log new file mode 100644 index 0000000..ea393fd --- /dev/null +++ b/benchmark/results/gemma-4-31B-it-UD-Q8_K_XL__rocm-7_2_3__fa1__longctx65536.log @@ -0,0 +1,8 @@ +ggml_cuda_init: found 1 ROCm devices (Total VRAM: 126976 MiB): + Device 0: AMD Radeon 8060S Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32, VRAM: 126976 MiB +| model | size | params | backend | ngl | n_ubatch | fa | mmap | test | t/s | +| ------------------------------ | ---------: | ---------: | ---------- | --: | -------: | -: | ---: | --------------: | -------------------: | +| gemma4 31B Q8_0 | 32.60 GiB | 30.70 B | ROCm | 99 | 2048 | 1 | 0 | pp2048 @ d65536 | 93.30 ± 0.54 | +| gemma4 31B Q8_0 | 32.60 GiB | 30.70 B | ROCm | 99 | 2048 | 1 | 0 | tg32 @ d65536 | 5.00 ± 0.00 | + +build: 8e1f9d083 (9112) diff --git a/benchmark/results/gpt-oss-120b-mxfp4-00001-of-00003__rocm-7_2_3__fa1.log b/benchmark/results/gpt-oss-120b-mxfp4-00001-of-00003__rocm-7_2_3__fa1.log new file mode 100644 index 0000000..b9a196c --- /dev/null +++ b/benchmark/results/gpt-oss-120b-mxfp4-00001-of-00003__rocm-7_2_3__fa1.log @@ -0,0 +1,8 @@ +ggml_cuda_init: found 1 ROCm devices (Total VRAM: 126976 MiB): + Device 0: AMD Radeon 8060S Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32, VRAM: 126976 MiB +| model | size | params | backend | ngl | fa | mmap | test | t/s | +| ------------------------------ | ---------: | ---------: | ---------- | --: | -: | ---: | --------------: | -------------------: | +| gpt-oss 120B MXFP4 MoE | 59.02 GiB | 116.83 B | ROCm | 99 | 1 | 0 | pp512 | 635.33 ± 4.03 | +| gpt-oss 120B MXFP4 MoE | 59.02 GiB | 116.83 B | ROCm | 99 | 1 | 0 | tg128 | 50.99 ± 0.05 | + +build: 8e1f9d083 (9112) diff --git a/benchmark/results/gpt-oss-120b-mxfp4-00001-of-00003__rocm-7_2_3__fa1__longctx32768.log b/benchmark/results/gpt-oss-120b-mxfp4-00001-of-00003__rocm-7_2_3__fa1__longctx32768.log new file mode 100644 index 0000000..ba6f437 --- /dev/null +++ b/benchmark/results/gpt-oss-120b-mxfp4-00001-of-00003__rocm-7_2_3__fa1__longctx32768.log @@ -0,0 +1,8 @@ +ggml_cuda_init: found 1 ROCm devices (Total VRAM: 126976 MiB): + Device 0: AMD Radeon 8060S Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32, VRAM: 126976 MiB +| model | size | params | backend | ngl | n_ubatch | fa | mmap | test | t/s | +| ------------------------------ | ---------: | ---------: | ---------- | --: | -------: | -: | ---: | --------------: | -------------------: | +| gpt-oss 120B MXFP4 MoE | 59.02 GiB | 116.83 B | ROCm | 99 | 2048 | 1 | 0 | pp2048 @ d32768 | 302.31 ± 0.46 | +| gpt-oss 120B MXFP4 MoE | 59.02 GiB | 116.83 B | ROCm | 99 | 2048 | 1 | 0 | tg32 @ d32768 | 35.94 ± 0.12 | + +build: 8e1f9d083 (9112) diff --git a/benchmark/results/gpt-oss-120b-mxfp4-00001-of-00003__rocm-7_2_3__fa1__longctx65536.log b/benchmark/results/gpt-oss-120b-mxfp4-00001-of-00003__rocm-7_2_3__fa1__longctx65536.log new file mode 100644 index 0000000..88e55ac --- /dev/null +++ b/benchmark/results/gpt-oss-120b-mxfp4-00001-of-00003__rocm-7_2_3__fa1__longctx65536.log @@ -0,0 +1,8 @@ +ggml_cuda_init: found 1 ROCm devices (Total VRAM: 126976 MiB): + Device 0: AMD Radeon 8060S Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32, VRAM: 126976 MiB +| model | size | params | backend | ngl | n_ubatch | fa | mmap | test | t/s | +| ------------------------------ | ---------: | ---------: | ---------- | --: | -------: | -: | ---: | --------------: | -------------------: | +| gpt-oss 120B MXFP4 MoE | 59.02 GiB | 116.83 B | ROCm | 99 | 2048 | 1 | 0 | pp2048 @ d65536 | 175.25 ± 0.49 | +| gpt-oss 120B MXFP4 MoE | 59.02 GiB | 116.83 B | ROCm | 99 | 2048 | 1 | 0 | tg32 @ d65536 | 27.78 ± 0.06 | + +build: 8e1f9d083 (9112) diff --git a/benchmark/results/gpt-oss-20b-mxfp4__rocm-7_2_3__fa1.log b/benchmark/results/gpt-oss-20b-mxfp4__rocm-7_2_3__fa1.log new file mode 100644 index 0000000..80a1680 --- /dev/null +++ b/benchmark/results/gpt-oss-20b-mxfp4__rocm-7_2_3__fa1.log @@ -0,0 +1,8 @@ +ggml_cuda_init: found 1 ROCm devices (Total VRAM: 126976 MiB): + Device 0: AMD Radeon 8060S Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32, VRAM: 126976 MiB +| model | size | params | backend | ngl | fa | mmap | test | t/s | +| ------------------------------ | ---------: | ---------: | ---------- | --: | -: | ---: | --------------: | -------------------: | +| gpt-oss 20B MXFP4 MoE | 11.27 GiB | 20.91 B | ROCm | 99 | 1 | 0 | pp512 | 1636.30 ± 15.57 | +| gpt-oss 20B MXFP4 MoE | 11.27 GiB | 20.91 B | ROCm | 99 | 1 | 0 | tg128 | 72.58 ± 0.05 | + +build: 8e1f9d083 (9112) diff --git a/benchmark/results/gpt-oss-20b-mxfp4__rocm-7_2_3__fa1__longctx32768.log b/benchmark/results/gpt-oss-20b-mxfp4__rocm-7_2_3__fa1__longctx32768.log new file mode 100644 index 0000000..ac847ab --- /dev/null +++ b/benchmark/results/gpt-oss-20b-mxfp4__rocm-7_2_3__fa1__longctx32768.log @@ -0,0 +1,8 @@ +ggml_cuda_init: found 1 ROCm devices (Total VRAM: 126976 MiB): + Device 0: AMD Radeon 8060S Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32, VRAM: 126976 MiB +| model | size | params | backend | ngl | n_ubatch | fa | mmap | test | t/s | +| ------------------------------ | ---------: | ---------: | ---------- | --: | -------: | -: | ---: | --------------: | -------------------: | +| gpt-oss 20B MXFP4 MoE | 11.27 GiB | 20.91 B | ROCm | 99 | 2048 | 1 | 0 | pp2048 @ d32768 | 487.06 ± 3.04 | +| gpt-oss 20B MXFP4 MoE | 11.27 GiB | 20.91 B | ROCm | 99 | 2048 | 1 | 0 | tg32 @ d32768 | 51.73 ± 0.18 | + +build: 8e1f9d083 (9112) diff --git a/benchmark/results/gpt-oss-20b-mxfp4__rocm-7_2_3__fa1__longctx65536.log b/benchmark/results/gpt-oss-20b-mxfp4__rocm-7_2_3__fa1__longctx65536.log new file mode 100644 index 0000000..5d62b87 --- /dev/null +++ b/benchmark/results/gpt-oss-20b-mxfp4__rocm-7_2_3__fa1__longctx65536.log @@ -0,0 +1,8 @@ +ggml_cuda_init: found 1 ROCm devices (Total VRAM: 126976 MiB): + Device 0: AMD Radeon 8060S Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32, VRAM: 126976 MiB +| model | size | params | backend | ngl | n_ubatch | fa | mmap | test | t/s | +| ------------------------------ | ---------: | ---------: | ---------- | --: | -------: | -: | ---: | --------------: | -------------------: | +| gpt-oss 20B MXFP4 MoE | 11.27 GiB | 20.91 B | ROCm | 99 | 2048 | 1 | 0 | pp2048 @ d65536 | 276.26 ± 0.34 | +| gpt-oss 20B MXFP4 MoE | 11.27 GiB | 20.91 B | ROCm | 99 | 2048 | 1 | 0 | tg32 @ d65536 | 40.32 ± 0.08 | + +build: 8e1f9d083 (9112) diff --git a/benchmark/results/llama-2-7b.Q4_0__rocm-7_2_3__fa1.log b/benchmark/results/llama-2-7b.Q4_0__rocm-7_2_3__fa1.log new file mode 100644 index 0000000..0e24968 --- /dev/null +++ b/benchmark/results/llama-2-7b.Q4_0__rocm-7_2_3__fa1.log @@ -0,0 +1,8 @@ +ggml_cuda_init: found 1 ROCm devices (Total VRAM: 126976 MiB): + Device 0: AMD Radeon 8060S Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32, VRAM: 126976 MiB +| model | size | params | backend | ngl | fa | mmap | test | t/s | +| ------------------------------ | ---------: | ---------: | ---------- | --: | -: | ---: | --------------: | -------------------: | +| llama 7B Q4_0 | 3.56 GiB | 6.74 B | ROCm | 99 | 1 | 0 | pp512 | 1543.35 ± 8.00 | +| llama 7B Q4_0 | 3.56 GiB | 6.74 B | ROCm | 99 | 1 | 0 | tg128 | 50.50 ± 0.12 | + +build: 8e1f9d083 (9112) diff --git a/benchmark/results/llama-2-7b.Q4_0__rocm-7_2_3__fa1__longctx32768.log b/benchmark/results/llama-2-7b.Q4_0__rocm-7_2_3__fa1__longctx32768.log new file mode 100644 index 0000000..5f07961 --- /dev/null +++ b/benchmark/results/llama-2-7b.Q4_0__rocm-7_2_3__fa1__longctx32768.log @@ -0,0 +1,8 @@ +ggml_cuda_init: found 1 ROCm devices (Total VRAM: 126976 MiB): + Device 0: AMD Radeon 8060S Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32, VRAM: 126976 MiB +| model | size | params | backend | ngl | n_ubatch | fa | mmap | test | t/s | +| ------------------------------ | ---------: | ---------: | ---------- | --: | -------: | -: | ---: | --------------: | -------------------: | +| llama 7B Q4_0 | 3.56 GiB | 6.74 B | ROCm | 99 | 2048 | 1 | 0 | pp2048 @ d32768 | 159.17 ± 0.71 | +| llama 7B Q4_0 | 3.56 GiB | 6.74 B | ROCm | 99 | 2048 | 1 | 0 | tg32 @ d32768 | 5.64 ± 0.00 | + +build: 8e1f9d083 (9112) diff --git a/benchmark/results/llama-2-7b.Q4_0__rocm-7_2_3__fa1__longctx65536.log b/benchmark/results/llama-2-7b.Q4_0__rocm-7_2_3__fa1__longctx65536.log new file mode 100644 index 0000000..f85d75e --- /dev/null +++ b/benchmark/results/llama-2-7b.Q4_0__rocm-7_2_3__fa1__longctx65536.log @@ -0,0 +1,8 @@ +ggml_cuda_init: found 1 ROCm devices (Total VRAM: 126976 MiB): + Device 0: AMD Radeon 8060S Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32, VRAM: 126976 MiB +| model | size | params | backend | ngl | n_ubatch | fa | mmap | test | t/s | +| ------------------------------ | ---------: | ---------: | ---------- | --: | -------: | -: | ---: | --------------: | -------------------: | +| llama 7B Q4_0 | 3.56 GiB | 6.74 B | ROCm | 99 | 2048 | 1 | 0 | pp2048 @ d65536 | 72.03 ± 0.23 | +| llama 7B Q4_0 | 3.56 GiB | 6.74 B | ROCm | 99 | 2048 | 1 | 0 | tg32 @ d65536 | 3.00 ± 0.00 | + +build: 8e1f9d083 (9112)