diff --git a/benchmark/results/GLM-4.5-Air-UD-Q4_K_XL-00001-of-00002__rocm6_4_3-rocwmma.log b/benchmark/results/GLM-4.5-Air-UD-Q4_K_XL-00001-of-00002__rocm6_4_3-rocwmma.log index 7465f25..3d2d9c2 100644 --- a/benchmark/results/GLM-4.5-Air-UD-Q4_K_XL-00001-of-00002__rocm6_4_3-rocwmma.log +++ b/benchmark/results/GLM-4.5-Air-UD-Q4_K_XL-00001-of-00002__rocm6_4_3-rocwmma.log @@ -9,7 +9,7 @@ rocBLAS warning: hipBlasLT failed, falling back to tensile. This message will be only be displayed once, unless the ROCBLAS_VERBOSE_TENSILE_ERROR environment variable is set. | model | size | params | backend | ngl | mmap | test | t/s | | ------------------------------ | ---------: | ---------: | ---------- | --: | ---: | --------------: | -------------------: | -| glm4moe 106B.A12B Q4_K - Medium | 68.01 GiB | 110.47 B | ROCm | 99 | 0 | pp512 | 126.62 ± 0.10 | -| glm4moe 106B.A12B Q4_K - Medium | 68.01 GiB | 110.47 B | ROCm | 99 | 0 | tg128 | 19.95 ± 0.02 | +| glm4moe 106B.A12B Q4_K - Medium | 68.01 GiB | 110.47 B | ROCm | 99 | 0 | pp512 | 125.93 ± 0.26 | +| glm4moe 106B.A12B Q4_K - Medium | 68.01 GiB | 110.47 B | ROCm | 99 | 0 | tg128 | 20.52 ± 0.00 | -build: 1fe00296 (6182) +build: f1fbffb5 (6486) diff --git a/benchmark/results/GLM-4.5-Air-UD-Q4_K_XL-00001-of-00002__rocm6_4_3-rocwmma__fa1.log b/benchmark/results/GLM-4.5-Air-UD-Q4_K_XL-00001-of-00002__rocm6_4_3-rocwmma__fa1.log index f8e8b6b..47aebe3 100644 --- a/benchmark/results/GLM-4.5-Air-UD-Q4_K_XL-00001-of-00002__rocm6_4_3-rocwmma__fa1.log +++ b/benchmark/results/GLM-4.5-Air-UD-Q4_K_XL-00001-of-00002__rocm6_4_3-rocwmma__fa1.log @@ -4,7 +4,7 @@ ggml_cuda_init: found 1 ROCm devices: Device 0: Radeon 8060S Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32 | model | size | params | backend | ngl | fa | mmap | test | t/s | | ------------------------------ | ---------: | ---------: | ---------- | --: | -: | ---: | --------------: | -------------------: | -| glm4moe 106B.A12B Q4_K - Medium | 68.01 GiB | 110.47 B | ROCm | 99 | 1 | 0 | pp512 | 135.10 ± 0.35 | -| glm4moe 106B.A12B Q4_K - Medium | 68.01 GiB | 110.47 B | ROCm | 99 | 1 | 0 | tg128 | 20.14 ± 0.01 | +| glm4moe 106B.A12B Q4_K - Medium | 68.01 GiB | 110.47 B | ROCm | 99 | 1 | 0 | pp512 | 135.40 ± 0.23 | +| glm4moe 106B.A12B Q4_K - Medium | 68.01 GiB | 110.47 B | ROCm | 99 | 1 | 0 | tg128 | 20.69 ± 0.00 | -build: 1fe00296 (6182) +build: f1fbffb5 (6486) diff --git a/benchmark/results/GLM-4.5-Air-UD-Q4_K_XL-00001-of-00002__rocm6_4_3-rocwmma__hblt0.log b/benchmark/results/GLM-4.5-Air-UD-Q4_K_XL-00001-of-00002__rocm6_4_3-rocwmma__hblt0.log index fad8a13..ad3bc31 100644 --- a/benchmark/results/GLM-4.5-Air-UD-Q4_K_XL-00001-of-00002__rocm6_4_3-rocwmma__hblt0.log +++ b/benchmark/results/GLM-4.5-Air-UD-Q4_K_XL-00001-of-00002__rocm6_4_3-rocwmma__hblt0.log @@ -4,7 +4,7 @@ ggml_cuda_init: found 1 ROCm devices: Device 0: Radeon 8060S Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32 | model | size | params | backend | ngl | mmap | test | t/s | | ------------------------------ | ---------: | ---------: | ---------- | --: | ---: | --------------: | -------------------: | -| glm4moe 106B.A12B Q4_K - Medium | 68.01 GiB | 110.47 B | ROCm | 99 | 0 | pp512 | 130.99 ± 0.36 | -| glm4moe 106B.A12B Q4_K - Medium | 68.01 GiB | 110.47 B | ROCm | 99 | 0 | tg128 | 20.14 ± 0.01 | +| glm4moe 106B.A12B Q4_K - Medium | 68.01 GiB | 110.47 B | ROCm | 99 | 0 | pp512 | 132.28 ± 0.14 | +| glm4moe 106B.A12B Q4_K - Medium | 68.01 GiB | 110.47 B | ROCm | 99 | 0 | tg128 | 20.50 ± 0.00 | -build: 1fe00296 (6182) +build: f1fbffb5 (6486) diff --git a/benchmark/results/GLM-4.5-Air-UD-Q4_K_XL-00001-of-00002__rocm6_4_3-rocwmma__hblt0__fa1.log b/benchmark/results/GLM-4.5-Air-UD-Q4_K_XL-00001-of-00002__rocm6_4_3-rocwmma__hblt0__fa1.log index d4132eb..f3696a4 100644 --- a/benchmark/results/GLM-4.5-Air-UD-Q4_K_XL-00001-of-00002__rocm6_4_3-rocwmma__hblt0__fa1.log +++ b/benchmark/results/GLM-4.5-Air-UD-Q4_K_XL-00001-of-00002__rocm6_4_3-rocwmma__hblt0__fa1.log @@ -4,7 +4,7 @@ ggml_cuda_init: found 1 ROCm devices: Device 0: Radeon 8060S Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32 | model | size | params | backend | ngl | fa | mmap | test | t/s | | ------------------------------ | ---------: | ---------: | ---------- | --: | -: | ---: | --------------: | -------------------: | -| glm4moe 106B.A12B Q4_K - Medium | 68.01 GiB | 110.47 B | ROCm | 99 | 1 | 0 | pp512 | 140.15 ± 0.41 | -| glm4moe 106B.A12B Q4_K - Medium | 68.01 GiB | 110.47 B | ROCm | 99 | 1 | 0 | tg128 | 20.15 ± 0.01 | +| glm4moe 106B.A12B Q4_K - Medium | 68.01 GiB | 110.47 B | ROCm | 99 | 1 | 0 | pp512 | 139.86 ± 0.32 | +| glm4moe 106B.A12B Q4_K - Medium | 68.01 GiB | 110.47 B | ROCm | 99 | 1 | 0 | tg128 | 20.70 ± 0.00 | -build: 1fe00296 (6182) +build: f1fbffb5 (6486) diff --git a/benchmark/results/GLM-4.5-Air-UD-Q4_K_XL-00001-of-00002__rocm6_4_3.log b/benchmark/results/GLM-4.5-Air-UD-Q4_K_XL-00001-of-00002__rocm6_4_3.log index b84b584..cba7aea 100644 --- a/benchmark/results/GLM-4.5-Air-UD-Q4_K_XL-00001-of-00002__rocm6_4_3.log +++ b/benchmark/results/GLM-4.5-Air-UD-Q4_K_XL-00001-of-00002__rocm6_4_3.log @@ -9,7 +9,7 @@ rocBLAS warning: hipBlasLT failed, falling back to tensile. This message will be only be displayed once, unless the ROCBLAS_VERBOSE_TENSILE_ERROR environment variable is set. | model | size | params | backend | ngl | mmap | test | t/s | | ------------------------------ | ---------: | ---------: | ---------- | --: | ---: | --------------: | -------------------: | -| glm4moe 106B.A12B Q4_K - Medium | 68.01 GiB | 110.47 B | ROCm | 99 | 0 | pp512 | 126.66 ± 0.22 | -| glm4moe 106B.A12B Q4_K - Medium | 68.01 GiB | 110.47 B | ROCm | 99 | 0 | tg128 | 20.14 ± 0.00 | +| glm4moe 106B.A12B Q4_K - Medium | 68.01 GiB | 110.47 B | ROCm | 99 | 0 | pp512 | 125.92 ± 0.27 | +| glm4moe 106B.A12B Q4_K - Medium | 68.01 GiB | 110.47 B | ROCm | 99 | 0 | tg128 | 20.52 ± 0.01 | -build: 1fe00296 (6182) +build: f1fbffb5 (6486) diff --git a/benchmark/results/GLM-4.5-Air-UD-Q4_K_XL-00001-of-00002__rocm6_4_3__fa1.log b/benchmark/results/GLM-4.5-Air-UD-Q4_K_XL-00001-of-00002__rocm6_4_3__fa1.log index 350aa44..3f6be05 100644 --- a/benchmark/results/GLM-4.5-Air-UD-Q4_K_XL-00001-of-00002__rocm6_4_3__fa1.log +++ b/benchmark/results/GLM-4.5-Air-UD-Q4_K_XL-00001-of-00002__rocm6_4_3__fa1.log @@ -4,7 +4,7 @@ ggml_cuda_init: found 1 ROCm devices: Device 0: Radeon 8060S Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32 | model | size | params | backend | ngl | fa | mmap | test | t/s | | ------------------------------ | ---------: | ---------: | ---------- | --: | -: | ---: | --------------: | -------------------: | -| glm4moe 106B.A12B Q4_K - Medium | 68.01 GiB | 110.47 B | ROCm | 99 | 1 | 0 | pp512 | 100.20 ± 0.13 | -| glm4moe 106B.A12B Q4_K - Medium | 68.01 GiB | 110.47 B | ROCm | 99 | 1 | 0 | tg128 | 20.30 ± 0.01 | +| glm4moe 106B.A12B Q4_K - Medium | 68.01 GiB | 110.47 B | ROCm | 99 | 1 | 0 | pp512 | 134.12 ± 0.59 | +| glm4moe 106B.A12B Q4_K - Medium | 68.01 GiB | 110.47 B | ROCm | 99 | 1 | 0 | tg128 | 20.66 ± 0.00 | -build: 1fe00296 (6182) +build: f1fbffb5 (6486) diff --git a/benchmark/results/GLM-4.5-Air-UD-Q4_K_XL-00001-of-00002__rocm6_4_3__hblt0.log b/benchmark/results/GLM-4.5-Air-UD-Q4_K_XL-00001-of-00002__rocm6_4_3__hblt0.log index ec3889b..b894fe4 100644 --- a/benchmark/results/GLM-4.5-Air-UD-Q4_K_XL-00001-of-00002__rocm6_4_3__hblt0.log +++ b/benchmark/results/GLM-4.5-Air-UD-Q4_K_XL-00001-of-00002__rocm6_4_3__hblt0.log @@ -2,5 +2,9 @@ ggml_cuda_init: GGML_CUDA_FORCE_MMQ: no ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no ggml_cuda_init: found 1 ROCm devices: Device 0: Radeon 8060S Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32 -HW Exception by GPU node-1 (Agent handle: 0x2624d340) reason :GPU Hang -✖ ! [rocm6_4_3] GLM-4.5-Air-UD-Q4_K_XL-00001-of-00002 __hblt0 failed (exit 134) +| model | size | params | backend | ngl | mmap | test | t/s | +| ------------------------------ | ---------: | ---------: | ---------- | --: | ---: | --------------: | -------------------: | +| glm4moe 106B.A12B Q4_K - Medium | 68.01 GiB | 110.47 B | ROCm | 99 | 0 | pp512 | 131.45 ± 0.35 | +| glm4moe 106B.A12B Q4_K - Medium | 68.01 GiB | 110.47 B | ROCm | 99 | 0 | tg128 | 20.53 ± 0.00 | + +build: f1fbffb5 (6486) diff --git a/benchmark/results/GLM-4.5-Air-UD-Q4_K_XL-00001-of-00002__rocm6_4_3__hblt0__fa1.log b/benchmark/results/GLM-4.5-Air-UD-Q4_K_XL-00001-of-00002__rocm6_4_3__hblt0__fa1.log index 2d21418..ffffffb 100644 --- a/benchmark/results/GLM-4.5-Air-UD-Q4_K_XL-00001-of-00002__rocm6_4_3__hblt0__fa1.log +++ b/benchmark/results/GLM-4.5-Air-UD-Q4_K_XL-00001-of-00002__rocm6_4_3__hblt0__fa1.log @@ -2,5 +2,9 @@ ggml_cuda_init: GGML_CUDA_FORCE_MMQ: no ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no ggml_cuda_init: found 1 ROCm devices: Device 0: Radeon 8060S Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32 -Memory access fault by GPU node-1 (Agent handle: 0x37c5d340) on address 0x7f2e3516f000. Reason: Page not present or supervisor privilege. -✖ ! [rocm6_4_3] GLM-4.5-Air-UD-Q4_K_XL-00001-of-00002 __hblt0__fa1 failed (exit 134) +| model | size | params | backend | ngl | fa | mmap | test | t/s | +| ------------------------------ | ---------: | ---------: | ---------- | --: | -: | ---: | --------------: | -------------------: | +| glm4moe 106B.A12B Q4_K - Medium | 68.01 GiB | 110.47 B | ROCm | 99 | 1 | 0 | pp512 | 140.67 ± 0.26 | +| glm4moe 106B.A12B Q4_K - Medium | 68.01 GiB | 110.47 B | ROCm | 99 | 1 | 0 | tg128 | 20.67 ± 0.00 | + +build: f1fbffb5 (6486) diff --git a/benchmark/results/GLM-4.5-Air-UD-Q4_K_XL-00001-of-00002__rocm7_rc-rocwmma-fa_all_quants.log b/benchmark/results/GLM-4.5-Air-UD-Q4_K_XL-00001-of-00002__rocm7_rc-rocwmma-fa_all_quants.log new file mode 100644 index 0000000..d34b8a5 --- /dev/null +++ b/benchmark/results/GLM-4.5-Air-UD-Q4_K_XL-00001-of-00002__rocm7_rc-rocwmma-fa_all_quants.log @@ -0,0 +1,10 @@ +ggml_cuda_init: GGML_CUDA_FORCE_MMQ: no +ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no +ggml_cuda_init: found 1 ROCm devices: + Device 0: AMD Radeon Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32 +| model | size | params | backend | ngl | mmap | test | t/s | +| ------------------------------ | ---------: | ---------: | ---------- | --: | ---: | --------------: | -------------------: | +| glm4moe 106B.A12B Q4_K - Medium | 68.01 GiB | 110.47 B | ROCm | 99 | 0 | pp512 | 94.56 ± 0.11 | +| glm4moe 106B.A12B Q4_K - Medium | 68.01 GiB | 110.47 B | ROCm | 99 | 0 | tg128 | 19.90 ± 0.00 | + +build: f1fbffb5 (6486) diff --git a/benchmark/results/GLM-4.5-Air-UD-Q4_K_XL-00001-of-00002__rocm7_rc-rocwmma-fa_all_quants__fa1.log b/benchmark/results/GLM-4.5-Air-UD-Q4_K_XL-00001-of-00002__rocm7_rc-rocwmma-fa_all_quants__fa1.log new file mode 100644 index 0000000..638b42c --- /dev/null +++ b/benchmark/results/GLM-4.5-Air-UD-Q4_K_XL-00001-of-00002__rocm7_rc-rocwmma-fa_all_quants__fa1.log @@ -0,0 +1,10 @@ +ggml_cuda_init: GGML_CUDA_FORCE_MMQ: no +ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no +ggml_cuda_init: found 1 ROCm devices: + Device 0: AMD Radeon Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32 +| model | size | params | backend | ngl | fa | mmap | test | t/s | +| ------------------------------ | ---------: | ---------: | ---------- | --: | -: | ---: | --------------: | -------------------: | +| glm4moe 106B.A12B Q4_K - Medium | 68.01 GiB | 110.47 B | ROCm | 99 | 1 | 0 | pp512 | 127.25 ± 0.57 | +| glm4moe 106B.A12B Q4_K - Medium | 68.01 GiB | 110.47 B | ROCm | 99 | 1 | 0 | tg128 | 20.66 ± 0.00 | + +build: f1fbffb5 (6486) diff --git a/benchmark/results/GLM-4.5-Air-UD-Q4_K_XL-00001-of-00002__rocm7_rc-rocwmma-fa_all_quants__hblt0.log b/benchmark/results/GLM-4.5-Air-UD-Q4_K_XL-00001-of-00002__rocm7_rc-rocwmma-fa_all_quants__hblt0.log new file mode 100644 index 0000000..6310d52 --- /dev/null +++ b/benchmark/results/GLM-4.5-Air-UD-Q4_K_XL-00001-of-00002__rocm7_rc-rocwmma-fa_all_quants__hblt0.log @@ -0,0 +1,10 @@ +ggml_cuda_init: GGML_CUDA_FORCE_MMQ: no +ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no +ggml_cuda_init: found 1 ROCm devices: + Device 0: AMD Radeon Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32 +| model | size | params | backend | ngl | mmap | test | t/s | +| ------------------------------ | ---------: | ---------: | ---------- | --: | ---: | --------------: | -------------------: | +| glm4moe 106B.A12B Q4_K - Medium | 68.01 GiB | 110.47 B | ROCm | 99 | 0 | pp512 | 128.69 ± 0.57 | +| glm4moe 106B.A12B Q4_K - Medium | 68.01 GiB | 110.47 B | ROCm | 99 | 0 | tg128 | 20.56 ± 0.00 | + +build: f1fbffb5 (6486) diff --git a/benchmark/results/GLM-4.5-Air-UD-Q4_K_XL-00001-of-00002__rocm7_rc-rocwmma-fa_all_quants__hblt0__fa1.log b/benchmark/results/GLM-4.5-Air-UD-Q4_K_XL-00001-of-00002__rocm7_rc-rocwmma-fa_all_quants__hblt0__fa1.log new file mode 100644 index 0000000..8605e6d --- /dev/null +++ b/benchmark/results/GLM-4.5-Air-UD-Q4_K_XL-00001-of-00002__rocm7_rc-rocwmma-fa_all_quants__hblt0__fa1.log @@ -0,0 +1,10 @@ +ggml_cuda_init: GGML_CUDA_FORCE_MMQ: no +ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no +ggml_cuda_init: found 1 ROCm devices: + Device 0: AMD Radeon Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32 +| model | size | params | backend | ngl | fa | mmap | test | t/s | +| ------------------------------ | ---------: | ---------: | ---------- | --: | -: | ---: | --------------: | -------------------: | +| glm4moe 106B.A12B Q4_K - Medium | 68.01 GiB | 110.47 B | ROCm | 99 | 1 | 0 | pp512 | 169.19 ± 0.12 | +| glm4moe 106B.A12B Q4_K - Medium | 68.01 GiB | 110.47 B | ROCm | 99 | 1 | 0 | tg128 | 20.67 ± 0.00 | + +build: f1fbffb5 (6486) diff --git a/benchmark/results/GLM-4.5-Air-UD-Q4_K_XL-00001-of-00002__rocm7_rc-rocwmma.log b/benchmark/results/GLM-4.5-Air-UD-Q4_K_XL-00001-of-00002__rocm7_rc-rocwmma.log index d99dfb8..e2005d4 100644 --- a/benchmark/results/GLM-4.5-Air-UD-Q4_K_XL-00001-of-00002__rocm7_rc-rocwmma.log +++ b/benchmark/results/GLM-4.5-Air-UD-Q4_K_XL-00001-of-00002__rocm7_rc-rocwmma.log @@ -4,7 +4,7 @@ ggml_cuda_init: found 1 ROCm devices: Device 0: AMD Radeon Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32 | model | size | params | backend | ngl | mmap | test | t/s | | ------------------------------ | ---------: | ---------: | ---------- | --: | ---: | --------------: | -------------------: | -| glm4moe 106B.A12B Q4_K - Medium | 68.01 GiB | 110.47 B | ROCm | 99 | 0 | pp512 | 117.48 ± 0.53 | -| glm4moe 106B.A12B Q4_K - Medium | 68.01 GiB | 110.47 B | ROCm | 99 | 0 | tg128 | 20.11 ± 0.00 | +| glm4moe 106B.A12B Q4_K - Medium | 68.01 GiB | 110.47 B | ROCm | 99 | 0 | pp512 | 94.71 ± 0.12 | +| glm4moe 106B.A12B Q4_K - Medium | 68.01 GiB | 110.47 B | ROCm | 99 | 0 | tg128 | 20.53 ± 0.00 | -build: de219279 (6181) +build: f1fbffb5 (6486) diff --git a/benchmark/results/GLM-4.5-Air-UD-Q4_K_XL-00001-of-00002__rocm7_rc-rocwmma__fa1.log b/benchmark/results/GLM-4.5-Air-UD-Q4_K_XL-00001-of-00002__rocm7_rc-rocwmma__fa1.log index b74a931..f579cee 100644 --- a/benchmark/results/GLM-4.5-Air-UD-Q4_K_XL-00001-of-00002__rocm7_rc-rocwmma__fa1.log +++ b/benchmark/results/GLM-4.5-Air-UD-Q4_K_XL-00001-of-00002__rocm7_rc-rocwmma__fa1.log @@ -4,7 +4,7 @@ ggml_cuda_init: found 1 ROCm devices: Device 0: AMD Radeon Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32 | model | size | params | backend | ngl | fa | mmap | test | t/s | | ------------------------------ | ---------: | ---------: | ---------- | --: | -: | ---: | --------------: | -------------------: | -| glm4moe 106B.A12B Q4_K - Medium | 68.01 GiB | 110.47 B | ROCm | 99 | 1 | 0 | pp512 | 126.27 ± 0.47 | -| glm4moe 106B.A12B Q4_K - Medium | 68.01 GiB | 110.47 B | ROCm | 99 | 1 | 0 | tg128 | 19.86 ± 0.00 | +| glm4moe 106B.A12B Q4_K - Medium | 68.01 GiB | 110.47 B | ROCm | 99 | 1 | 0 | pp512 | 126.97 ± 0.54 | +| glm4moe 106B.A12B Q4_K - Medium | 68.01 GiB | 110.47 B | ROCm | 99 | 1 | 0 | tg128 | 20.70 ± 0.00 | -build: de219279 (6181) +build: f1fbffb5 (6486) diff --git a/benchmark/results/GLM-4.5-Air-UD-Q4_K_XL-00001-of-00002__rocm7_rc-rocwmma__hblt0.log b/benchmark/results/GLM-4.5-Air-UD-Q4_K_XL-00001-of-00002__rocm7_rc-rocwmma__hblt0.log index 906c8ff..38f5f9b 100644 --- a/benchmark/results/GLM-4.5-Air-UD-Q4_K_XL-00001-of-00002__rocm7_rc-rocwmma__hblt0.log +++ b/benchmark/results/GLM-4.5-Air-UD-Q4_K_XL-00001-of-00002__rocm7_rc-rocwmma__hblt0.log @@ -4,7 +4,7 @@ ggml_cuda_init: found 1 ROCm devices: Device 0: AMD Radeon Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32 | model | size | params | backend | ngl | mmap | test | t/s | | ------------------------------ | ---------: | ---------: | ---------- | --: | ---: | --------------: | -------------------: | -| glm4moe 106B.A12B Q4_K - Medium | 68.01 GiB | 110.47 B | ROCm | 99 | 0 | pp512 | 158.54 ± 0.42 | -| glm4moe 106B.A12B Q4_K - Medium | 68.01 GiB | 110.47 B | ROCm | 99 | 0 | tg128 | 20.11 ± 0.00 | +| glm4moe 106B.A12B Q4_K - Medium | 68.01 GiB | 110.47 B | ROCm | 99 | 0 | pp512 | 160.39 ± 0.34 | +| glm4moe 106B.A12B Q4_K - Medium | 68.01 GiB | 110.47 B | ROCm | 99 | 0 | tg128 | 20.56 ± 0.00 | -build: de219279 (6181) +build: f1fbffb5 (6486) diff --git a/benchmark/results/GLM-4.5-Air-UD-Q4_K_XL-00001-of-00002__rocm7_rc-rocwmma__hblt0__fa1.log b/benchmark/results/GLM-4.5-Air-UD-Q4_K_XL-00001-of-00002__rocm7_rc-rocwmma__hblt0__fa1.log index d51baac..f4788eb 100644 --- a/benchmark/results/GLM-4.5-Air-UD-Q4_K_XL-00001-of-00002__rocm7_rc-rocwmma__hblt0__fa1.log +++ b/benchmark/results/GLM-4.5-Air-UD-Q4_K_XL-00001-of-00002__rocm7_rc-rocwmma__hblt0__fa1.log @@ -4,7 +4,7 @@ ggml_cuda_init: found 1 ROCm devices: Device 0: AMD Radeon Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32 | model | size | params | backend | ngl | fa | mmap | test | t/s | | ------------------------------ | ---------: | ---------: | ---------- | --: | -: | ---: | --------------: | -------------------: | -| glm4moe 106B.A12B Q4_K - Medium | 68.01 GiB | 110.47 B | ROCm | 99 | 1 | 0 | pp512 | 166.11 ± 0.32 | -| glm4moe 106B.A12B Q4_K - Medium | 68.01 GiB | 110.47 B | ROCm | 99 | 1 | 0 | tg128 | 19.83 ± 0.00 | +| glm4moe 106B.A12B Q4_K - Medium | 68.01 GiB | 110.47 B | ROCm | 99 | 1 | 0 | pp512 | 169.35 ± 0.56 | +| glm4moe 106B.A12B Q4_K - Medium | 68.01 GiB | 110.47 B | ROCm | 99 | 1 | 0 | tg128 | 20.65 ± 0.00 | -build: de219279 (6181) +build: f1fbffb5 (6486) diff --git a/benchmark/results/GLM-4.5-Air-UD-Q4_K_XL-00001-of-00002__rocm7_rc.log b/benchmark/results/GLM-4.5-Air-UD-Q4_K_XL-00001-of-00002__rocm7_rc.log index 3c47854..bf5b0b4 100644 --- a/benchmark/results/GLM-4.5-Air-UD-Q4_K_XL-00001-of-00002__rocm7_rc.log +++ b/benchmark/results/GLM-4.5-Air-UD-Q4_K_XL-00001-of-00002__rocm7_rc.log @@ -4,7 +4,7 @@ ggml_cuda_init: found 1 ROCm devices: Device 0: AMD Radeon Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32 | model | size | params | backend | ngl | mmap | test | t/s | | ------------------------------ | ---------: | ---------: | ---------- | --: | ---: | --------------: | -------------------: | -| glm4moe 106B.A12B Q4_K - Medium | 68.01 GiB | 110.47 B | ROCm | 99 | 0 | pp512 | 89.60 ± 0.20 | -| glm4moe 106B.A12B Q4_K - Medium | 68.01 GiB | 110.47 B | ROCm | 99 | 0 | tg128 | 20.22 ± 0.00 | +| glm4moe 106B.A12B Q4_K - Medium | 68.01 GiB | 110.47 B | ROCm | 99 | 0 | pp512 | 94.73 ± 0.22 | +| glm4moe 106B.A12B Q4_K - Medium | 68.01 GiB | 110.47 B | ROCm | 99 | 0 | tg128 | 20.47 ± 0.00 | -build: de219279 (6181) +build: f1fbffb5 (6486) diff --git a/benchmark/results/GLM-4.5-Air-UD-Q4_K_XL-00001-of-00002__rocm7_rc__fa1.log b/benchmark/results/GLM-4.5-Air-UD-Q4_K_XL-00001-of-00002__rocm7_rc__fa1.log index bf63f0d..7d4fe91 100644 --- a/benchmark/results/GLM-4.5-Air-UD-Q4_K_XL-00001-of-00002__rocm7_rc__fa1.log +++ b/benchmark/results/GLM-4.5-Air-UD-Q4_K_XL-00001-of-00002__rocm7_rc__fa1.log @@ -4,7 +4,7 @@ ggml_cuda_init: found 1 ROCm devices: Device 0: AMD Radeon Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32 | model | size | params | backend | ngl | fa | mmap | test | t/s | | ------------------------------ | ---------: | ---------: | ---------- | --: | -: | ---: | --------------: | -------------------: | -| glm4moe 106B.A12B Q4_K - Medium | 68.01 GiB | 110.47 B | ROCm | 99 | 1 | 0 | pp512 | 64.66 ± 0.16 | -| glm4moe 106B.A12B Q4_K - Medium | 68.01 GiB | 110.47 B | ROCm | 99 | 1 | 0 | tg128 | 20.35 ± 0.00 | +| glm4moe 106B.A12B Q4_K - Medium | 68.01 GiB | 110.47 B | ROCm | 99 | 1 | 0 | pp512 | 93.27 ± 0.18 | +| glm4moe 106B.A12B Q4_K - Medium | 68.01 GiB | 110.47 B | ROCm | 99 | 1 | 0 | tg128 | 20.67 ± 0.00 | -build: de219279 (6181) +build: f1fbffb5 (6486) diff --git a/benchmark/results/GLM-4.5-Air-UD-Q4_K_XL-00001-of-00002__rocm7_rc__hblt0.log b/benchmark/results/GLM-4.5-Air-UD-Q4_K_XL-00001-of-00002__rocm7_rc__hblt0.log index a3b2c08..6afa9b4 100644 --- a/benchmark/results/GLM-4.5-Air-UD-Q4_K_XL-00001-of-00002__rocm7_rc__hblt0.log +++ b/benchmark/results/GLM-4.5-Air-UD-Q4_K_XL-00001-of-00002__rocm7_rc__hblt0.log @@ -2,5 +2,9 @@ ggml_cuda_init: GGML_CUDA_FORCE_MMQ: no ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no ggml_cuda_init: found 1 ROCm devices: Device 0: AMD Radeon Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32 -HW Exception by GPU node-1 (Agent handle: 0x1d380ea0) reason :GPU Hang -✖ ! [rocm7_rc] GLM-4.5-Air-UD-Q4_K_XL-00001-of-00002 __hblt0 failed (exit 134) +| model | size | params | backend | ngl | mmap | test | t/s | +| ------------------------------ | ---------: | ---------: | ---------- | --: | ---: | --------------: | -------------------: | +| glm4moe 106B.A12B Q4_K - Medium | 68.01 GiB | 110.47 B | ROCm | 99 | 0 | pp512 | 159.89 ± 0.44 | +| glm4moe 106B.A12B Q4_K - Medium | 68.01 GiB | 110.47 B | ROCm | 99 | 0 | tg128 | 20.55 ± 0.00 | + +build: f1fbffb5 (6486) diff --git a/benchmark/results/GLM-4.5-Air-UD-Q4_K_XL-00001-of-00002__rocm7_rc__hblt0__fa1.log b/benchmark/results/GLM-4.5-Air-UD-Q4_K_XL-00001-of-00002__rocm7_rc__hblt0__fa1.log index 9cc7929..543674d 100644 --- a/benchmark/results/GLM-4.5-Air-UD-Q4_K_XL-00001-of-00002__rocm7_rc__hblt0__fa1.log +++ b/benchmark/results/GLM-4.5-Air-UD-Q4_K_XL-00001-of-00002__rocm7_rc__hblt0__fa1.log @@ -2,5 +2,9 @@ ggml_cuda_init: GGML_CUDA_FORCE_MMQ: no ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no ggml_cuda_init: found 1 ROCm devices: Device 0: AMD Radeon Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32 -Memory access fault by GPU node-1 (Agent handle: 0x4a0fea0) on address 0x7f3bf796f000. Reason: Page not present or supervisor privilege. -✖ ! [rocm7_rc] GLM-4.5-Air-UD-Q4_K_XL-00001-of-00002 __hblt0__fa1 failed (exit 134) +| model | size | params | backend | ngl | fa | mmap | test | t/s | +| ------------------------------ | ---------: | ---------: | ---------- | --: | -: | ---: | --------------: | -------------------: | +| glm4moe 106B.A12B Q4_K - Medium | 68.01 GiB | 110.47 B | ROCm | 99 | 1 | 0 | pp512 | 170.42 ± 0.33 | +| glm4moe 106B.A12B Q4_K - Medium | 68.01 GiB | 110.47 B | ROCm | 99 | 1 | 0 | tg128 | 20.66 ± 0.00 | + +build: f1fbffb5 (6486) diff --git a/benchmark/results/GLM-4.5-Air-UD-Q4_K_XL-00001-of-00002__vulkan_amdvlk.log b/benchmark/results/GLM-4.5-Air-UD-Q4_K_XL-00001-of-00002__vulkan_amdvlk.log index d8fa4d5..426e6cc 100644 --- a/benchmark/results/GLM-4.5-Air-UD-Q4_K_XL-00001-of-00002__vulkan_amdvlk.log +++ b/benchmark/results/GLM-4.5-Air-UD-Q4_K_XL-00001-of-00002__vulkan_amdvlk.log @@ -2,7 +2,7 @@ ggml_vulkan: Found 1 Vulkan devices: ggml_vulkan: 0 = Radeon 8060S Graphics (AMD open-source driver) | uma: 1 | fp16: 1 | bf16: 0 | warp size: 64 | shared memory: 32768 | int dot: 1 | matrix cores: KHR_coopmat | model | size | params | backend | ngl | mmap | test | t/s | | ------------------------------ | ---------: | ---------: | ---------- | --: | ---: | --------------: | -------------------: | -| glm4moe 106B.A12B Q4_K - Medium | 68.01 GiB | 110.47 B | Vulkan | 99 | 0 | pp512 | 197.95 ± 0.29 | -| glm4moe 106B.A12B Q4_K - Medium | 68.01 GiB | 110.47 B | Vulkan | 99 | 0 | tg128 | 23.24 ± 0.00 | +| glm4moe 106B.A12B Q4_K - Medium | 68.01 GiB | 110.47 B | Vulkan | 99 | 0 | pp512 | 217.22 ± 0.49 | +| glm4moe 106B.A12B Q4_K - Medium | 68.01 GiB | 110.47 B | Vulkan | 99 | 0 | tg128 | 24.18 ± 0.01 | -build: 1fe00296 (6182) +build: f1fbffb5 (6486) diff --git a/benchmark/results/GLM-4.5-Air-UD-Q4_K_XL-00001-of-00002__vulkan_amdvlk__fa1.log b/benchmark/results/GLM-4.5-Air-UD-Q4_K_XL-00001-of-00002__vulkan_amdvlk__fa1.log index eece528..9a614f0 100644 --- a/benchmark/results/GLM-4.5-Air-UD-Q4_K_XL-00001-of-00002__vulkan_amdvlk__fa1.log +++ b/benchmark/results/GLM-4.5-Air-UD-Q4_K_XL-00001-of-00002__vulkan_amdvlk__fa1.log @@ -2,7 +2,7 @@ ggml_vulkan: Found 1 Vulkan devices: ggml_vulkan: 0 = Radeon 8060S Graphics (AMD open-source driver) | uma: 1 | fp16: 1 | bf16: 0 | warp size: 64 | shared memory: 32768 | int dot: 1 | matrix cores: KHR_coopmat | model | size | params | backend | ngl | fa | mmap | test | t/s | | ------------------------------ | ---------: | ---------: | ---------- | --: | -: | ---: | --------------: | -------------------: | -| glm4moe 106B.A12B Q4_K - Medium | 68.01 GiB | 110.47 B | Vulkan | 99 | 1 | 0 | pp512 | 199.40 ± 0.35 | -| glm4moe 106B.A12B Q4_K - Medium | 68.01 GiB | 110.47 B | Vulkan | 99 | 1 | 0 | tg128 | 23.26 ± 0.00 | +| glm4moe 106B.A12B Q4_K - Medium | 68.01 GiB | 110.47 B | Vulkan | 99 | 1 | 0 | pp512 | 219.61 ± 0.55 | +| glm4moe 106B.A12B Q4_K - Medium | 68.01 GiB | 110.47 B | Vulkan | 99 | 1 | 0 | tg128 | 24.21 ± 0.01 | -build: 1fe00296 (6182) +build: f1fbffb5 (6486) diff --git a/benchmark/results/GLM-4.5-Air-UD-Q4_K_XL-00001-of-00002__vulkan_radv.log b/benchmark/results/GLM-4.5-Air-UD-Q4_K_XL-00001-of-00002__vulkan_radv.log index 5b8bc47..5471167 100644 --- a/benchmark/results/GLM-4.5-Air-UD-Q4_K_XL-00001-of-00002__vulkan_radv.log +++ b/benchmark/results/GLM-4.5-Air-UD-Q4_K_XL-00001-of-00002__vulkan_radv.log @@ -2,7 +2,7 @@ ggml_vulkan: Found 1 Vulkan devices: ggml_vulkan: 0 = Radeon 8060S Graphics (RADV GFX1151) (radv) | uma: 1 | fp16: 1 | bf16: 0 | warp size: 64 | shared memory: 65536 | int dot: 1 | matrix cores: KHR_coopmat | model | size | params | backend | ngl | mmap | test | t/s | | ------------------------------ | ---------: | ---------: | ---------- | --: | ---: | --------------: | -------------------: | -| glm4moe 106B.A12B Q4_K - Medium | 68.01 GiB | 110.47 B | Vulkan | 99 | 0 | pp512 | 126.28 ± 0.17 | -| glm4moe 106B.A12B Q4_K - Medium | 68.01 GiB | 110.47 B | Vulkan | 99 | 0 | tg128 | 23.33 ± 0.01 | +| glm4moe 106B.A12B Q4_K - Medium | 68.01 GiB | 110.47 B | Vulkan | 99 | 0 | pp512 | 212.60 ± 0.74 | +| glm4moe 106B.A12B Q4_K - Medium | 68.01 GiB | 110.47 B | Vulkan | 99 | 0 | tg128 | 24.18 ± 0.03 | -build: 1fe00296 (6182) +build: f1fbffb5 (6486) diff --git a/benchmark/results/GLM-4.5-Air-UD-Q4_K_XL-00001-of-00002__vulkan_radv__fa1.log b/benchmark/results/GLM-4.5-Air-UD-Q4_K_XL-00001-of-00002__vulkan_radv__fa1.log index 41d8077..9353954 100644 --- a/benchmark/results/GLM-4.5-Air-UD-Q4_K_XL-00001-of-00002__vulkan_radv__fa1.log +++ b/benchmark/results/GLM-4.5-Air-UD-Q4_K_XL-00001-of-00002__vulkan_radv__fa1.log @@ -2,7 +2,7 @@ ggml_vulkan: Found 1 Vulkan devices: ggml_vulkan: 0 = Radeon 8060S Graphics (RADV GFX1151) (radv) | uma: 1 | fp16: 1 | bf16: 0 | warp size: 64 | shared memory: 65536 | int dot: 1 | matrix cores: KHR_coopmat | model | size | params | backend | ngl | fa | mmap | test | t/s | | ------------------------------ | ---------: | ---------: | ---------- | --: | -: | ---: | --------------: | -------------------: | -| glm4moe 106B.A12B Q4_K - Medium | 68.01 GiB | 110.47 B | Vulkan | 99 | 1 | 0 | pp512 | 131.64 ± 0.32 | -| glm4moe 106B.A12B Q4_K - Medium | 68.01 GiB | 110.47 B | Vulkan | 99 | 1 | 0 | tg128 | 23.88 ± 0.01 | +| glm4moe 106B.A12B Q4_K - Medium | 68.01 GiB | 110.47 B | Vulkan | 99 | 1 | 0 | pp512 | 224.85 ± 2.55 | +| glm4moe 106B.A12B Q4_K - Medium | 68.01 GiB | 110.47 B | Vulkan | 99 | 1 | 0 | tg128 | 24.64 ± 0.01 | -build: 1fe00296 (6182) +build: f1fbffb5 (6486) diff --git a/benchmark/results/GLM-4.5-Air-UD-Q6_K_XL-00001-of-00003__rocm6_4_3-rocwmma.log b/benchmark/results/GLM-4.5-Air-UD-Q6_K_XL-00001-of-00003__rocm6_4_3-rocwmma.log index f5f0ad3..5dfa73b 100644 --- a/benchmark/results/GLM-4.5-Air-UD-Q6_K_XL-00001-of-00003__rocm6_4_3-rocwmma.log +++ b/benchmark/results/GLM-4.5-Air-UD-Q6_K_XL-00001-of-00003__rocm6_4_3-rocwmma.log @@ -9,7 +9,7 @@ rocBLAS warning: hipBlasLT failed, falling back to tensile. This message will be only be displayed once, unless the ROCBLAS_VERBOSE_TENSILE_ERROR environment variable is set. | model | size | params | backend | ngl | mmap | test | t/s | | ------------------------------ | ---------: | ---------: | ---------- | --: | ---: | --------------: | -------------------: | -| glm4moe 106B.A12B Q6_K | 94.57 GiB | 110.47 B | ROCm | 99 | 0 | pp512 | 121.82 ± 0.35 | -| glm4moe 106B.A12B Q6_K | 94.57 GiB | 110.47 B | ROCm | 99 | 0 | tg128 | 15.59 ± 0.00 | +| glm4moe 106B.A12B Q6_K | 94.57 GiB | 110.47 B | ROCm | 99 | 0 | pp512 | 120.87 ± 0.23 | +| glm4moe 106B.A12B Q6_K | 94.57 GiB | 110.47 B | ROCm | 99 | 0 | tg128 | 15.86 ± 0.00 | -build: 1fe00296 (6182) +build: f1fbffb5 (6486) diff --git a/benchmark/results/GLM-4.5-Air-UD-Q6_K_XL-00001-of-00003__rocm6_4_3-rocwmma__fa1.log b/benchmark/results/GLM-4.5-Air-UD-Q6_K_XL-00001-of-00003__rocm6_4_3-rocwmma__fa1.log index 2bfbda7..d31d01b 100644 --- a/benchmark/results/GLM-4.5-Air-UD-Q6_K_XL-00001-of-00003__rocm6_4_3-rocwmma__fa1.log +++ b/benchmark/results/GLM-4.5-Air-UD-Q6_K_XL-00001-of-00003__rocm6_4_3-rocwmma__fa1.log @@ -4,7 +4,7 @@ ggml_cuda_init: found 1 ROCm devices: Device 0: Radeon 8060S Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32 | model | size | params | backend | ngl | fa | mmap | test | t/s | | ------------------------------ | ---------: | ---------: | ---------- | --: | -: | ---: | --------------: | -------------------: | -| glm4moe 106B.A12B Q6_K | 94.57 GiB | 110.47 B | ROCm | 99 | 1 | 0 | pp512 | 126.60 ± 0.30 | -| glm4moe 106B.A12B Q6_K | 94.57 GiB | 110.47 B | ROCm | 99 | 1 | 0 | tg128 | 15.62 ± 0.04 | +| glm4moe 106B.A12B Q6_K | 94.57 GiB | 110.47 B | ROCm | 99 | 1 | 0 | pp512 | 128.65 ± 0.59 | +| glm4moe 106B.A12B Q6_K | 94.57 GiB | 110.47 B | ROCm | 99 | 1 | 0 | tg128 | 15.96 ± 0.00 | -build: 1fe00296 (6182) +build: f1fbffb5 (6486) diff --git a/benchmark/results/GLM-4.5-Air-UD-Q6_K_XL-00001-of-00003__rocm6_4_3-rocwmma__hblt0.log b/benchmark/results/GLM-4.5-Air-UD-Q6_K_XL-00001-of-00003__rocm6_4_3-rocwmma__hblt0.log index 159477b..a89a06c 100644 --- a/benchmark/results/GLM-4.5-Air-UD-Q6_K_XL-00001-of-00003__rocm6_4_3-rocwmma__hblt0.log +++ b/benchmark/results/GLM-4.5-Air-UD-Q6_K_XL-00001-of-00003__rocm6_4_3-rocwmma__hblt0.log @@ -2,5 +2,5 @@ ggml_cuda_init: GGML_CUDA_FORCE_MMQ: no ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no ggml_cuda_init: found 1 ROCm devices: Device 0: Radeon 8060S Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32 -Memory access fault by GPU node-1 (Agent handle: 0x26e36340) on address 0x7fcef3635000. Reason: Page not present or supervisor privilege. +HW Exception by GPU node-1 (Agent handle: 0xe6e7340) reason :GPU Hang ✖ ! [rocm6_4_3-rocwmma] GLM-4.5-Air-UD-Q6_K_XL-00001-of-00003 __hblt0 failed (exit 134) diff --git a/benchmark/results/GLM-4.5-Air-UD-Q6_K_XL-00001-of-00003__rocm6_4_3-rocwmma__hblt0__fa1.log b/benchmark/results/GLM-4.5-Air-UD-Q6_K_XL-00001-of-00003__rocm6_4_3-rocwmma__hblt0__fa1.log index f625092..4115027 100644 --- a/benchmark/results/GLM-4.5-Air-UD-Q6_K_XL-00001-of-00003__rocm6_4_3-rocwmma__hblt0__fa1.log +++ b/benchmark/results/GLM-4.5-Air-UD-Q6_K_XL-00001-of-00003__rocm6_4_3-rocwmma__hblt0__fa1.log @@ -2,5 +2,5 @@ ggml_cuda_init: GGML_CUDA_FORCE_MMQ: no ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no ggml_cuda_init: found 1 ROCm devices: Device 0: Radeon 8060S Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32 -HW Exception by GPU node-1 (Agent handle: 0x35263340) reason :GPU Hang +Memory access fault by GPU node-1 (Agent handle: 0x400a9340) on address 0x7ef17b435000. Reason: Page not present or supervisor privilege. ✖ ! [rocm6_4_3-rocwmma] GLM-4.5-Air-UD-Q6_K_XL-00001-of-00003 __hblt0__fa1 failed (exit 134) diff --git a/benchmark/results/GLM-4.5-Air-UD-Q6_K_XL-00001-of-00003__rocm6_4_3.log b/benchmark/results/GLM-4.5-Air-UD-Q6_K_XL-00001-of-00003__rocm6_4_3.log index 8ac4440..b7bc1fd 100644 --- a/benchmark/results/GLM-4.5-Air-UD-Q6_K_XL-00001-of-00003__rocm6_4_3.log +++ b/benchmark/results/GLM-4.5-Air-UD-Q6_K_XL-00001-of-00003__rocm6_4_3.log @@ -9,7 +9,7 @@ rocBLAS warning: hipBlasLT failed, falling back to tensile. This message will be only be displayed once, unless the ROCBLAS_VERBOSE_TENSILE_ERROR environment variable is set. | model | size | params | backend | ngl | mmap | test | t/s | | ------------------------------ | ---------: | ---------: | ---------- | --: | ---: | --------------: | -------------------: | -| glm4moe 106B.A12B Q6_K | 94.57 GiB | 110.47 B | ROCm | 99 | 0 | pp512 | 117.95 ± 0.30 | -| glm4moe 106B.A12B Q6_K | 94.57 GiB | 110.47 B | ROCm | 99 | 0 | tg128 | 15.65 ± 0.01 | +| glm4moe 106B.A12B Q6_K | 94.57 GiB | 110.47 B | ROCm | 99 | 0 | pp512 | 120.53 ± 0.28 | +| glm4moe 106B.A12B Q6_K | 94.57 GiB | 110.47 B | ROCm | 99 | 0 | tg128 | 15.87 ± 0.00 | -build: 1fe00296 (6182) +build: f1fbffb5 (6486) diff --git a/benchmark/results/GLM-4.5-Air-UD-Q6_K_XL-00001-of-00003__rocm6_4_3__fa1.log b/benchmark/results/GLM-4.5-Air-UD-Q6_K_XL-00001-of-00003__rocm6_4_3__fa1.log index 581bc16..d9d9b0b 100644 --- a/benchmark/results/GLM-4.5-Air-UD-Q6_K_XL-00001-of-00003__rocm6_4_3__fa1.log +++ b/benchmark/results/GLM-4.5-Air-UD-Q6_K_XL-00001-of-00003__rocm6_4_3__fa1.log @@ -2,5 +2,9 @@ ggml_cuda_init: GGML_CUDA_FORCE_MMQ: no ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no ggml_cuda_init: found 1 ROCm devices: Device 0: Radeon 8060S Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32 -Memory access fault by GPU node-1 (Agent handle: 0x28aa3340) on address 0x7fb93761b000. Reason: Page not present or supervisor privilege. -✖ ! [rocm6_4_3] GLM-4.5-Air-UD-Q6_K_XL-00001-of-00003 __fa1 failed (exit 134) +| model | size | params | backend | ngl | fa | mmap | test | t/s | +| ------------------------------ | ---------: | ---------: | ---------- | --: | -: | ---: | --------------: | -------------------: | +| glm4moe 106B.A12B Q6_K | 94.57 GiB | 110.47 B | ROCm | 99 | 1 | 0 | pp512 | 129.22 ± 0.41 | +| glm4moe 106B.A12B Q6_K | 94.57 GiB | 110.47 B | ROCm | 99 | 1 | 0 | tg128 | 15.95 ± 0.00 | + +build: f1fbffb5 (6486) diff --git a/benchmark/results/GLM-4.5-Air-UD-Q6_K_XL-00001-of-00003__rocm6_4_3__hblt0.log b/benchmark/results/GLM-4.5-Air-UD-Q6_K_XL-00001-of-00003__rocm6_4_3__hblt0.log index 486bf8f..b47fc74 100644 --- a/benchmark/results/GLM-4.5-Air-UD-Q6_K_XL-00001-of-00003__rocm6_4_3__hblt0.log +++ b/benchmark/results/GLM-4.5-Air-UD-Q6_K_XL-00001-of-00003__rocm6_4_3__hblt0.log @@ -2,5 +2,5 @@ ggml_cuda_init: GGML_CUDA_FORCE_MMQ: no ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no ggml_cuda_init: found 1 ROCm devices: Device 0: Radeon 8060S Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32 -HW Exception by GPU node-1 (Agent handle: 0x14d05340) reason :GPU Hang +Memory access fault by GPU node-1 (Agent handle: 0x22558310) on address 0x7f7830fad000. Reason: Page not present or supervisor privilege. ✖ ! [rocm6_4_3] GLM-4.5-Air-UD-Q6_K_XL-00001-of-00003 __hblt0 failed (exit 134) diff --git a/benchmark/results/GLM-4.5-Air-UD-Q6_K_XL-00001-of-00003__rocm6_4_3__hblt0__fa1.log b/benchmark/results/GLM-4.5-Air-UD-Q6_K_XL-00001-of-00003__rocm6_4_3__hblt0__fa1.log index fb08717..d51b566 100644 --- a/benchmark/results/GLM-4.5-Air-UD-Q6_K_XL-00001-of-00003__rocm6_4_3__hblt0__fa1.log +++ b/benchmark/results/GLM-4.5-Air-UD-Q6_K_XL-00001-of-00003__rocm6_4_3__hblt0__fa1.log @@ -2,5 +2,9 @@ ggml_cuda_init: GGML_CUDA_FORCE_MMQ: no ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no ggml_cuda_init: found 1 ROCm devices: Device 0: Radeon 8060S Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32 -HW Exception by GPU node-1 (Agent handle: 0x265e8340) reason :GPU Hang -✖ ! [rocm6_4_3] GLM-4.5-Air-UD-Q6_K_XL-00001-of-00003 __hblt0__fa1 failed (exit 134) +| model | size | params | backend | ngl | fa | mmap | test | t/s | +| ------------------------------ | ---------: | ---------: | ---------- | --: | -: | ---: | --------------: | -------------------: | +| glm4moe 106B.A12B Q6_K | 94.57 GiB | 110.47 B | ROCm | 99 | 1 | 0 | pp512 | 128.68 ± 0.22 | +| glm4moe 106B.A12B Q6_K | 94.57 GiB | 110.47 B | ROCm | 99 | 1 | 0 | tg128 | 15.96 ± 0.00 | + +build: f1fbffb5 (6486) diff --git a/benchmark/results/GLM-4.5-Air-UD-Q6_K_XL-00001-of-00003__rocm7_rc-rocwmma-fa_all_quants.log b/benchmark/results/GLM-4.5-Air-UD-Q6_K_XL-00001-of-00003__rocm7_rc-rocwmma-fa_all_quants.log new file mode 100644 index 0000000..b0e83f6 --- /dev/null +++ b/benchmark/results/GLM-4.5-Air-UD-Q6_K_XL-00001-of-00003__rocm7_rc-rocwmma-fa_all_quants.log @@ -0,0 +1,10 @@ +ggml_cuda_init: GGML_CUDA_FORCE_MMQ: no +ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no +ggml_cuda_init: found 1 ROCm devices: + Device 0: AMD Radeon Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32 +| model | size | params | backend | ngl | mmap | test | t/s | +| ------------------------------ | ---------: | ---------: | ---------- | --: | ---: | --------------: | -------------------: | +| glm4moe 106B.A12B Q6_K | 94.57 GiB | 110.47 B | ROCm | 99 | 0 | pp512 | 91.95 ± 0.25 | +| glm4moe 106B.A12B Q6_K | 94.57 GiB | 110.47 B | ROCm | 99 | 0 | tg128 | 15.76 ± 0.00 | + +build: f1fbffb5 (6486) diff --git a/benchmark/results/GLM-4.5-Air-UD-Q6_K_XL-00001-of-00003__rocm7_rc-rocwmma-fa_all_quants__fa1.log b/benchmark/results/GLM-4.5-Air-UD-Q6_K_XL-00001-of-00003__rocm7_rc-rocwmma-fa_all_quants__fa1.log new file mode 100644 index 0000000..252ee22 --- /dev/null +++ b/benchmark/results/GLM-4.5-Air-UD-Q6_K_XL-00001-of-00003__rocm7_rc-rocwmma-fa_all_quants__fa1.log @@ -0,0 +1,10 @@ +ggml_cuda_init: GGML_CUDA_FORCE_MMQ: no +ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no +ggml_cuda_init: found 1 ROCm devices: + Device 0: AMD Radeon Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32 +| model | size | params | backend | ngl | fa | mmap | test | t/s | +| ------------------------------ | ---------: | ---------: | ---------- | --: | -: | ---: | --------------: | -------------------: | +| glm4moe 106B.A12B Q6_K | 94.57 GiB | 110.47 B | ROCm | 99 | 1 | 0 | pp512 | 70.00 ± 0.17 | +| glm4moe 106B.A12B Q6_K | 94.57 GiB | 110.47 B | ROCm | 99 | 1 | 0 | tg128 | 15.98 ± 0.00 | + +build: f1fbffb5 (6486) diff --git a/benchmark/results/GLM-4.5-Air-UD-Q6_K_XL-00001-of-00003__rocm7_rc-rocwmma-fa_all_quants__hblt0.log b/benchmark/results/GLM-4.5-Air-UD-Q6_K_XL-00001-of-00003__rocm7_rc-rocwmma-fa_all_quants__hblt0.log new file mode 100644 index 0000000..2210c41 --- /dev/null +++ b/benchmark/results/GLM-4.5-Air-UD-Q6_K_XL-00001-of-00003__rocm7_rc-rocwmma-fa_all_quants__hblt0.log @@ -0,0 +1,10 @@ +ggml_cuda_init: GGML_CUDA_FORCE_MMQ: no +ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no +ggml_cuda_init: found 1 ROCm devices: + Device 0: AMD Radeon Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32 +| model | size | params | backend | ngl | mmap | test | t/s | +| ------------------------------ | ---------: | ---------: | ---------- | --: | ---: | --------------: | -------------------: | +| glm4moe 106B.A12B Q6_K | 94.57 GiB | 110.47 B | ROCm | 99 | 0 | pp512 | 134.22 ± 0.50 | +| glm4moe 106B.A12B Q6_K | 94.57 GiB | 110.47 B | ROCm | 99 | 0 | tg128 | 15.90 ± 0.00 | + +build: f1fbffb5 (6486) diff --git a/benchmark/results/GLM-4.5-Air-UD-Q6_K_XL-00001-of-00003__rocm7_rc-rocwmma-fa_all_quants__hblt0__fa1.log b/benchmark/results/GLM-4.5-Air-UD-Q6_K_XL-00001-of-00003__rocm7_rc-rocwmma-fa_all_quants__hblt0__fa1.log new file mode 100644 index 0000000..07268a4 --- /dev/null +++ b/benchmark/results/GLM-4.5-Air-UD-Q6_K_XL-00001-of-00003__rocm7_rc-rocwmma-fa_all_quants__hblt0__fa1.log @@ -0,0 +1,10 @@ +ggml_cuda_init: GGML_CUDA_FORCE_MMQ: no +ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no +ggml_cuda_init: found 1 ROCm devices: + Device 0: AMD Radeon Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32 +| model | size | params | backend | ngl | fa | mmap | test | t/s | +| ------------------------------ | ---------: | ---------: | ---------- | --: | -: | ---: | --------------: | -------------------: | +| glm4moe 106B.A12B Q6_K | 94.57 GiB | 110.47 B | ROCm | 99 | 1 | 0 | pp512 | 159.75 ± 0.33 | +| glm4moe 106B.A12B Q6_K | 94.57 GiB | 110.47 B | ROCm | 99 | 1 | 0 | tg128 | 15.99 ± 0.00 | + +build: f1fbffb5 (6486) diff --git a/benchmark/results/GLM-4.5-Air-UD-Q6_K_XL-00001-of-00003__rocm7_rc-rocwmma.log b/benchmark/results/GLM-4.5-Air-UD-Q6_K_XL-00001-of-00003__rocm7_rc-rocwmma.log index 8ca5c1e..96555fb 100644 --- a/benchmark/results/GLM-4.5-Air-UD-Q6_K_XL-00001-of-00003__rocm7_rc-rocwmma.log +++ b/benchmark/results/GLM-4.5-Air-UD-Q6_K_XL-00001-of-00003__rocm7_rc-rocwmma.log @@ -4,7 +4,7 @@ ggml_cuda_init: found 1 ROCm devices: Device 0: AMD Radeon Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32 | model | size | params | backend | ngl | mmap | test | t/s | | ------------------------------ | ---------: | ---------: | ---------- | --: | ---: | --------------: | -------------------: | -| glm4moe 106B.A12B Q6_K | 94.57 GiB | 110.47 B | ROCm | 99 | 0 | pp512 | 69.19 ± 0.20 | -| glm4moe 106B.A12B Q6_K | 94.57 GiB | 110.47 B | ROCm | 99 | 0 | tg128 | 15.64 ± 0.00 | +| glm4moe 106B.A12B Q6_K | 94.57 GiB | 110.47 B | ROCm | 99 | 0 | pp512 | 92.18 ± 0.04 | +| glm4moe 106B.A12B Q6_K | 94.57 GiB | 110.47 B | ROCm | 99 | 0 | tg128 | 15.92 ± 0.00 | -build: de219279 (6181) +build: f1fbffb5 (6486) diff --git a/benchmark/results/GLM-4.5-Air-UD-Q6_K_XL-00001-of-00003__rocm7_rc-rocwmma__fa1.log b/benchmark/results/GLM-4.5-Air-UD-Q6_K_XL-00001-of-00003__rocm7_rc-rocwmma__fa1.log index 6eaa1df..e3f3b88 100644 --- a/benchmark/results/GLM-4.5-Air-UD-Q6_K_XL-00001-of-00003__rocm7_rc-rocwmma__fa1.log +++ b/benchmark/results/GLM-4.5-Air-UD-Q6_K_XL-00001-of-00003__rocm7_rc-rocwmma__fa1.log @@ -4,7 +4,7 @@ ggml_cuda_init: found 1 ROCm devices: Device 0: AMD Radeon Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32 | model | size | params | backend | ngl | fa | mmap | test | t/s | | ------------------------------ | ---------: | ---------: | ---------- | --: | -: | ---: | --------------: | -------------------: | -| glm4moe 106B.A12B Q6_K | 94.57 GiB | 110.47 B | ROCm | 99 | 1 | 0 | pp512 | 114.61 ± 0.20 | -| glm4moe 106B.A12B Q6_K | 94.57 GiB | 110.47 B | ROCm | 99 | 1 | 0 | tg128 | 15.51 ± 0.00 | +| glm4moe 106B.A12B Q6_K | 94.57 GiB | 110.47 B | ROCm | 99 | 1 | 0 | pp512 | 121.75 ± 0.32 | +| glm4moe 106B.A12B Q6_K | 94.57 GiB | 110.47 B | ROCm | 99 | 1 | 0 | tg128 | 15.97 ± 0.00 | -build: de219279 (6181) +build: f1fbffb5 (6486) diff --git a/benchmark/results/GLM-4.5-Air-UD-Q6_K_XL-00001-of-00003__rocm7_rc-rocwmma__hblt0.log b/benchmark/results/GLM-4.5-Air-UD-Q6_K_XL-00001-of-00003__rocm7_rc-rocwmma__hblt0.log index 5fb6167..ebfc2f2 100644 --- a/benchmark/results/GLM-4.5-Air-UD-Q6_K_XL-00001-of-00003__rocm7_rc-rocwmma__hblt0.log +++ b/benchmark/results/GLM-4.5-Air-UD-Q6_K_XL-00001-of-00003__rocm7_rc-rocwmma__hblt0.log @@ -4,7 +4,7 @@ ggml_cuda_init: found 1 ROCm devices: Device 0: AMD Radeon Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32 | model | size | params | backend | ngl | mmap | test | t/s | | ------------------------------ | ---------: | ---------: | ---------- | --: | ---: | --------------: | -------------------: | -| glm4moe 106B.A12B Q6_K | 94.57 GiB | 110.47 B | ROCm | 99 | 0 | pp512 | 120.88 ± 0.92 | -| glm4moe 106B.A12B Q6_K | 94.57 GiB | 110.47 B | ROCm | 99 | 0 | tg128 | 15.61 ± 0.09 | +| glm4moe 106B.A12B Q6_K | 94.57 GiB | 110.47 B | ROCm | 99 | 0 | pp512 | 151.32 ± 0.45 | +| glm4moe 106B.A12B Q6_K | 94.57 GiB | 110.47 B | ROCm | 99 | 0 | tg128 | 15.90 ± 0.00 | -build: de219279 (6181) +build: f1fbffb5 (6486) diff --git a/benchmark/results/GLM-4.5-Air-UD-Q6_K_XL-00001-of-00003__rocm7_rc-rocwmma__hblt0__fa1.log b/benchmark/results/GLM-4.5-Air-UD-Q6_K_XL-00001-of-00003__rocm7_rc-rocwmma__hblt0__fa1.log index ec552ad..c57d72f 100644 --- a/benchmark/results/GLM-4.5-Air-UD-Q6_K_XL-00001-of-00003__rocm7_rc-rocwmma__hblt0__fa1.log +++ b/benchmark/results/GLM-4.5-Air-UD-Q6_K_XL-00001-of-00003__rocm7_rc-rocwmma__hblt0__fa1.log @@ -4,7 +4,7 @@ ggml_cuda_init: found 1 ROCm devices: Device 0: AMD Radeon Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32 | model | size | params | backend | ngl | fa | mmap | test | t/s | | ------------------------------ | ---------: | ---------: | ---------- | --: | -: | ---: | --------------: | -------------------: | -| glm4moe 106B.A12B Q6_K | 94.57 GiB | 110.47 B | ROCm | 99 | 1 | 0 | pp512 | 150.07 ± 0.56 | -| glm4moe 106B.A12B Q6_K | 94.57 GiB | 110.47 B | ROCm | 99 | 1 | 0 | tg128 | 15.52 ± 0.00 | +| glm4moe 106B.A12B Q6_K | 94.57 GiB | 110.47 B | ROCm | 99 | 1 | 0 | pp512 | 161.10 ± 0.36 | +| glm4moe 106B.A12B Q6_K | 94.57 GiB | 110.47 B | ROCm | 99 | 1 | 0 | tg128 | 15.99 ± 0.00 | -build: de219279 (6181) +build: f1fbffb5 (6486) diff --git a/benchmark/results/GLM-4.5-Air-UD-Q6_K_XL-00001-of-00003__rocm7_rc.log b/benchmark/results/GLM-4.5-Air-UD-Q6_K_XL-00001-of-00003__rocm7_rc.log index dc5e6ec..3f154b9 100644 --- a/benchmark/results/GLM-4.5-Air-UD-Q6_K_XL-00001-of-00003__rocm7_rc.log +++ b/benchmark/results/GLM-4.5-Air-UD-Q6_K_XL-00001-of-00003__rocm7_rc.log @@ -4,7 +4,7 @@ ggml_cuda_init: found 1 ROCm devices: Device 0: AMD Radeon Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32 | model | size | params | backend | ngl | mmap | test | t/s | | ------------------------------ | ---------: | ---------: | ---------- | --: | ---: | --------------: | -------------------: | -| glm4moe 106B.A12B Q6_K | 94.57 GiB | 110.47 B | ROCm | 99 | 0 | pp512 | 69.52 ± 0.17 | -| glm4moe 106B.A12B Q6_K | 94.57 GiB | 110.47 B | ROCm | 99 | 0 | tg128 | 15.63 ± 0.00 | +| glm4moe 106B.A12B Q6_K | 94.57 GiB | 110.47 B | ROCm | 99 | 0 | pp512 | 92.20 ± 0.11 | +| glm4moe 106B.A12B Q6_K | 94.57 GiB | 110.47 B | ROCm | 99 | 0 | tg128 | 15.85 ± 0.00 | -build: de219279 (6181) +build: f1fbffb5 (6486) diff --git a/benchmark/results/GLM-4.5-Air-UD-Q6_K_XL-00001-of-00003__rocm7_rc__fa1.log b/benchmark/results/GLM-4.5-Air-UD-Q6_K_XL-00001-of-00003__rocm7_rc__fa1.log index c1980f8..068b285 100644 --- a/benchmark/results/GLM-4.5-Air-UD-Q6_K_XL-00001-of-00003__rocm7_rc__fa1.log +++ b/benchmark/results/GLM-4.5-Air-UD-Q6_K_XL-00001-of-00003__rocm7_rc__fa1.log @@ -4,7 +4,7 @@ ggml_cuda_init: found 1 ROCm devices: Device 0: AMD Radeon Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32 | model | size | params | backend | ngl | fa | mmap | test | t/s | | ------------------------------ | ---------: | ---------: | ---------- | --: | -: | ---: | --------------: | -------------------: | -| glm4moe 106B.A12B Q6_K | 94.57 GiB | 110.47 B | ROCm | 99 | 1 | 0 | pp512 | 74.02 ± 0.13 | -| glm4moe 106B.A12B Q6_K | 94.57 GiB | 110.47 B | ROCm | 99 | 1 | 0 | tg128 | 15.73 ± 0.00 | +| glm4moe 106B.A12B Q6_K | 94.57 GiB | 110.47 B | ROCm | 99 | 1 | 0 | pp512 | 71.02 ± 0.16 | +| glm4moe 106B.A12B Q6_K | 94.57 GiB | 110.47 B | ROCm | 99 | 1 | 0 | tg128 | 15.96 ± 0.00 | -build: de219279 (6181) +build: f1fbffb5 (6486) diff --git a/benchmark/results/GLM-4.5-Air-UD-Q6_K_XL-00001-of-00003__rocm7_rc__hblt0.log b/benchmark/results/GLM-4.5-Air-UD-Q6_K_XL-00001-of-00003__rocm7_rc__hblt0.log index 350f64d..88bcc9c 100644 --- a/benchmark/results/GLM-4.5-Air-UD-Q6_K_XL-00001-of-00003__rocm7_rc__hblt0.log +++ b/benchmark/results/GLM-4.5-Air-UD-Q6_K_XL-00001-of-00003__rocm7_rc__hblt0.log @@ -4,7 +4,7 @@ ggml_cuda_init: found 1 ROCm devices: Device 0: AMD Radeon Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32 | model | size | params | backend | ngl | mmap | test | t/s | | ------------------------------ | ---------: | ---------: | ---------- | --: | ---: | --------------: | -------------------: | -| glm4moe 106B.A12B Q6_K | 94.57 GiB | 110.47 B | ROCm | 99 | 0 | pp512 | 142.67 ± 0.75 | -| glm4moe 106B.A12B Q6_K | 94.57 GiB | 110.47 B | ROCm | 99 | 0 | tg128 | 15.68 ± 0.00 | +| glm4moe 106B.A12B Q6_K | 94.57 GiB | 110.47 B | ROCm | 99 | 0 | pp512 | 147.32 ± 0.43 | +| glm4moe 106B.A12B Q6_K | 94.57 GiB | 110.47 B | ROCm | 99 | 0 | tg128 | 15.91 ± 0.00 | -build: de219279 (6181) +build: f1fbffb5 (6486) diff --git a/benchmark/results/GLM-4.5-Air-UD-Q6_K_XL-00001-of-00003__rocm7_rc__hblt0__fa1.log b/benchmark/results/GLM-4.5-Air-UD-Q6_K_XL-00001-of-00003__rocm7_rc__hblt0__fa1.log index 6a1fdfd..928c3ad 100644 --- a/benchmark/results/GLM-4.5-Air-UD-Q6_K_XL-00001-of-00003__rocm7_rc__hblt0__fa1.log +++ b/benchmark/results/GLM-4.5-Air-UD-Q6_K_XL-00001-of-00003__rocm7_rc__hblt0__fa1.log @@ -2,5 +2,9 @@ ggml_cuda_init: GGML_CUDA_FORCE_MMQ: no ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no ggml_cuda_init: found 1 ROCm devices: Device 0: AMD Radeon Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32 -Memory access fault by GPU node-1 (Agent handle: 0x1c536ea0) on address 0x7f623b57e000. Reason: Page not present or supervisor privilege. -✖ ! [rocm7_rc] GLM-4.5-Air-UD-Q6_K_XL-00001-of-00003 __hblt0__fa1 failed (exit 134) +| model | size | params | backend | ngl | fa | mmap | test | t/s | +| ------------------------------ | ---------: | ---------: | ---------- | --: | -: | ---: | --------------: | -------------------: | +| glm4moe 106B.A12B Q6_K | 94.57 GiB | 110.47 B | ROCm | 99 | 1 | 0 | pp512 | 161.37 ± 0.36 | +| glm4moe 106B.A12B Q6_K | 94.57 GiB | 110.47 B | ROCm | 99 | 1 | 0 | tg128 | 15.99 ± 0.00 | + +build: f1fbffb5 (6486) diff --git a/benchmark/results/GLM-4.5-Air-UD-Q6_K_XL-00001-of-00003__vulkan_amdvlk.log b/benchmark/results/GLM-4.5-Air-UD-Q6_K_XL-00001-of-00003__vulkan_amdvlk.log index 1a18e9c..a8b6d69 100644 --- a/benchmark/results/GLM-4.5-Air-UD-Q6_K_XL-00001-of-00003__vulkan_amdvlk.log +++ b/benchmark/results/GLM-4.5-Air-UD-Q6_K_XL-00001-of-00003__vulkan_amdvlk.log @@ -2,7 +2,7 @@ ggml_vulkan: Found 1 Vulkan devices: ggml_vulkan: 0 = Radeon 8060S Graphics (AMD open-source driver) | uma: 1 | fp16: 1 | bf16: 0 | warp size: 64 | shared memory: 32768 | int dot: 1 | matrix cores: KHR_coopmat | model | size | params | backend | ngl | mmap | test | t/s | | ------------------------------ | ---------: | ---------: | ---------- | --: | ---: | --------------: | -------------------: | -| glm4moe 106B.A12B Q6_K | 94.57 GiB | 110.47 B | Vulkan | 99 | 0 | pp512 | 219.81 ± 0.70 | -| glm4moe 106B.A12B Q6_K | 94.57 GiB | 110.47 B | Vulkan | 99 | 0 | tg128 | 16.80 ± 0.00 | +| glm4moe 106B.A12B Q6_K | 94.57 GiB | 110.47 B | Vulkan | 99 | 0 | pp512 | 264.50 ± 0.99 | +| glm4moe 106B.A12B Q6_K | 94.57 GiB | 110.47 B | Vulkan | 99 | 0 | tg128 | 17.27 ± 0.00 | -build: 1fe00296 (6182) +build: f1fbffb5 (6486) diff --git a/benchmark/results/GLM-4.5-Air-UD-Q6_K_XL-00001-of-00003__vulkan_amdvlk__fa1.log b/benchmark/results/GLM-4.5-Air-UD-Q6_K_XL-00001-of-00003__vulkan_amdvlk__fa1.log index bc34d0a..740e1fd 100644 --- a/benchmark/results/GLM-4.5-Air-UD-Q6_K_XL-00001-of-00003__vulkan_amdvlk__fa1.log +++ b/benchmark/results/GLM-4.5-Air-UD-Q6_K_XL-00001-of-00003__vulkan_amdvlk__fa1.log @@ -2,7 +2,7 @@ ggml_vulkan: Found 1 Vulkan devices: ggml_vulkan: 0 = Radeon 8060S Graphics (AMD open-source driver) | uma: 1 | fp16: 1 | bf16: 0 | warp size: 64 | shared memory: 32768 | int dot: 1 | matrix cores: KHR_coopmat | model | size | params | backend | ngl | fa | mmap | test | t/s | | ------------------------------ | ---------: | ---------: | ---------- | --: | -: | ---: | --------------: | -------------------: | -| glm4moe 106B.A12B Q6_K | 94.57 GiB | 110.47 B | Vulkan | 99 | 1 | 0 | pp512 | 222.20 ± 0.63 | -| glm4moe 106B.A12B Q6_K | 94.57 GiB | 110.47 B | Vulkan | 99 | 1 | 0 | tg128 | 16.82 ± 0.01 | +| glm4moe 106B.A12B Q6_K | 94.57 GiB | 110.47 B | Vulkan | 99 | 1 | 0 | pp512 | 267.86 ± 1.22 | +| glm4moe 106B.A12B Q6_K | 94.57 GiB | 110.47 B | Vulkan | 99 | 1 | 0 | tg128 | 17.28 ± 0.00 | -build: 1fe00296 (6182) +build: f1fbffb5 (6486) diff --git a/benchmark/results/GLM-4.5-Air-UD-Q6_K_XL-00001-of-00003__vulkan_radv.log b/benchmark/results/GLM-4.5-Air-UD-Q6_K_XL-00001-of-00003__vulkan_radv.log index 1c621cf..9f963a8 100644 --- a/benchmark/results/GLM-4.5-Air-UD-Q6_K_XL-00001-of-00003__vulkan_radv.log +++ b/benchmark/results/GLM-4.5-Air-UD-Q6_K_XL-00001-of-00003__vulkan_radv.log @@ -2,7 +2,7 @@ ggml_vulkan: Found 1 Vulkan devices: ggml_vulkan: 0 = Radeon 8060S Graphics (RADV GFX1151) (radv) | uma: 1 | fp16: 1 | bf16: 0 | warp size: 64 | shared memory: 65536 | int dot: 1 | matrix cores: KHR_coopmat | model | size | params | backend | ngl | mmap | test | t/s | | ------------------------------ | ---------: | ---------: | ---------- | --: | ---: | --------------: | -------------------: | -| glm4moe 106B.A12B Q6_K | 94.57 GiB | 110.47 B | Vulkan | 99 | 0 | pp512 | 126.55 ± 0.40 | -| glm4moe 106B.A12B Q6_K | 94.57 GiB | 110.47 B | Vulkan | 99 | 0 | tg128 | 17.07 ± 0.01 | +| glm4moe 106B.A12B Q6_K | 94.57 GiB | 110.47 B | Vulkan | 99 | 0 | pp512 | 208.01 ± 0.73 | +| glm4moe 106B.A12B Q6_K | 94.57 GiB | 110.47 B | Vulkan | 99 | 0 | tg128 | 17.49 ± 0.02 | -build: 1fe00296 (6182) +build: f1fbffb5 (6486) diff --git a/benchmark/results/GLM-4.5-Air-UD-Q6_K_XL-00001-of-00003__vulkan_radv__fa1.log b/benchmark/results/GLM-4.5-Air-UD-Q6_K_XL-00001-of-00003__vulkan_radv__fa1.log index 3dbeebf..c1968b9 100644 --- a/benchmark/results/GLM-4.5-Air-UD-Q6_K_XL-00001-of-00003__vulkan_radv__fa1.log +++ b/benchmark/results/GLM-4.5-Air-UD-Q6_K_XL-00001-of-00003__vulkan_radv__fa1.log @@ -2,7 +2,7 @@ ggml_vulkan: Found 1 Vulkan devices: ggml_vulkan: 0 = Radeon 8060S Graphics (RADV GFX1151) (radv) | uma: 1 | fp16: 1 | bf16: 0 | warp size: 64 | shared memory: 65536 | int dot: 1 | matrix cores: KHR_coopmat | model | size | params | backend | ngl | fa | mmap | test | t/s | | ------------------------------ | ---------: | ---------: | ---------- | --: | -: | ---: | --------------: | -------------------: | -| glm4moe 106B.A12B Q6_K | 94.57 GiB | 110.47 B | Vulkan | 99 | 1 | 0 | pp512 | 131.25 ± 0.50 | -| glm4moe 106B.A12B Q6_K | 94.57 GiB | 110.47 B | Vulkan | 99 | 1 | 0 | tg128 | 17.31 ± 0.00 | +| glm4moe 106B.A12B Q6_K | 94.57 GiB | 110.47 B | Vulkan | 99 | 1 | 0 | pp512 | 221.63 ± 1.26 | +| glm4moe 106B.A12B Q6_K | 94.57 GiB | 110.47 B | Vulkan | 99 | 1 | 0 | tg128 | 17.71 ± 0.01 | -build: 1fe00296 (6182) +build: f1fbffb5 (6486) diff --git a/benchmark/results/Llama-3.3-70B-Instruct-UD-Q8_K_XL-00001-of-00002__rocm6_4_3-rocwmma.log b/benchmark/results/Llama-3.3-70B-Instruct-UD-Q8_K_XL-00001-of-00002__rocm6_4_3-rocwmma.log index 769f2f3..07addc3 100644 --- a/benchmark/results/Llama-3.3-70B-Instruct-UD-Q8_K_XL-00001-of-00002__rocm6_4_3-rocwmma.log +++ b/benchmark/results/Llama-3.3-70B-Instruct-UD-Q8_K_XL-00001-of-00002__rocm6_4_3-rocwmma.log @@ -7,9 +7,5 @@ This message will be only be displayed once, unless the ROCBLAS_VERBOSE_HIPBLASL rocBLAS warning: hipBlasLT failed, falling back to tensile. This message will be only be displayed once, unless the ROCBLAS_VERBOSE_TENSILE_ERROR environment variable is set. -| model | size | params | backend | ngl | mmap | test | t/s | -| ------------------------------ | ---------: | ---------: | ---------- | --: | ---: | --------------: | -------------------: | -| llama 70B Q8_0 | 75.65 GiB | 70.55 B | ROCm | 99 | 0 | pp512 | 98.02 ± 0.18 | -| llama 70B Q8_0 | 75.65 GiB | 70.55 B | ROCm | 99 | 0 | tg128 | 2.77 ± 0.00 | - -build: 1fe00296 (6182) +HW Exception by GPU node-1 (Agent handle: 0x284c3340) reason :GPU Hang +✖ ! [rocm6_4_3-rocwmma] Llama-3.3-70B-Instruct-UD-Q8_K_XL-00001-of-00002 failed (exit 134) diff --git a/benchmark/results/Llama-3.3-70B-Instruct-UD-Q8_K_XL-00001-of-00002__rocm6_4_3-rocwmma__fa1.log b/benchmark/results/Llama-3.3-70B-Instruct-UD-Q8_K_XL-00001-of-00002__rocm6_4_3-rocwmma__fa1.log index 4451e68..cbc1a68 100644 --- a/benchmark/results/Llama-3.3-70B-Instruct-UD-Q8_K_XL-00001-of-00002__rocm6_4_3-rocwmma__fa1.log +++ b/benchmark/results/Llama-3.3-70B-Instruct-UD-Q8_K_XL-00001-of-00002__rocm6_4_3-rocwmma__fa1.log @@ -4,7 +4,7 @@ ggml_cuda_init: found 1 ROCm devices: Device 0: Radeon 8060S Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32 | model | size | params | backend | ngl | fa | mmap | test | t/s | | ------------------------------ | ---------: | ---------: | ---------- | --: | -: | ---: | --------------: | -------------------: | -| llama 70B Q8_0 | 75.65 GiB | 70.55 B | ROCm | 99 | 1 | 0 | pp512 | 101.83 ± 0.11 | -| llama 70B Q8_0 | 75.65 GiB | 70.55 B | ROCm | 99 | 1 | 0 | tg128 | 2.77 ± 0.00 | +| llama 70B Q8_0 | 75.65 GiB | 70.55 B | ROCm | 99 | 1 | 0 | pp512 | 101.82 ± 0.06 | +| llama 70B Q8_0 | 75.65 GiB | 70.55 B | ROCm | 99 | 1 | 0 | tg128 | 2.78 ± 0.00 | -build: 1fe00296 (6182) +build: f1fbffb5 (6486) diff --git a/benchmark/results/Llama-3.3-70B-Instruct-UD-Q8_K_XL-00001-of-00002__rocm6_4_3-rocwmma__hblt0.log b/benchmark/results/Llama-3.3-70B-Instruct-UD-Q8_K_XL-00001-of-00002__rocm6_4_3-rocwmma__hblt0.log index 2553f31..635e8f4 100644 --- a/benchmark/results/Llama-3.3-70B-Instruct-UD-Q8_K_XL-00001-of-00002__rocm6_4_3-rocwmma__hblt0.log +++ b/benchmark/results/Llama-3.3-70B-Instruct-UD-Q8_K_XL-00001-of-00002__rocm6_4_3-rocwmma__hblt0.log @@ -2,5 +2,5 @@ ggml_cuda_init: GGML_CUDA_FORCE_MMQ: no ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no ggml_cuda_init: found 1 ROCm devices: Device 0: Radeon 8060S Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32 -HW Exception by GPU node-1 (Agent handle: 0x21da1340) reason :GPU Hang +HW Exception by GPU node-1 (Agent handle: 0x7166340) reason :GPU Hang ✖ ! [rocm6_4_3-rocwmma] Llama-3.3-70B-Instruct-UD-Q8_K_XL-00001-of-00002 __hblt0 failed (exit 134) diff --git a/benchmark/results/Llama-3.3-70B-Instruct-UD-Q8_K_XL-00001-of-00002__rocm6_4_3-rocwmma__hblt0__fa1.log b/benchmark/results/Llama-3.3-70B-Instruct-UD-Q8_K_XL-00001-of-00002__rocm6_4_3-rocwmma__hblt0__fa1.log index 1a88dcd..8ab83e5 100644 --- a/benchmark/results/Llama-3.3-70B-Instruct-UD-Q8_K_XL-00001-of-00002__rocm6_4_3-rocwmma__hblt0__fa1.log +++ b/benchmark/results/Llama-3.3-70B-Instruct-UD-Q8_K_XL-00001-of-00002__rocm6_4_3-rocwmma__hblt0__fa1.log @@ -2,5 +2,5 @@ ggml_cuda_init: GGML_CUDA_FORCE_MMQ: no ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no ggml_cuda_init: found 1 ROCm devices: Device 0: Radeon 8060S Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32 -HW Exception by GPU node-1 (Agent handle: 0x15ac2340) reason :GPU Hang +HW Exception by GPU node-1 (Agent handle: 0x37f0e340) reason :GPU Hang ✖ ! [rocm6_4_3-rocwmma] Llama-3.3-70B-Instruct-UD-Q8_K_XL-00001-of-00002 __hblt0__fa1 failed (exit 134) diff --git a/benchmark/results/Llama-3.3-70B-Instruct-UD-Q8_K_XL-00001-of-00002__rocm6_4_3.log b/benchmark/results/Llama-3.3-70B-Instruct-UD-Q8_K_XL-00001-of-00002__rocm6_4_3.log index 56497ee..ed7de0e 100644 --- a/benchmark/results/Llama-3.3-70B-Instruct-UD-Q8_K_XL-00001-of-00002__rocm6_4_3.log +++ b/benchmark/results/Llama-3.3-70B-Instruct-UD-Q8_K_XL-00001-of-00002__rocm6_4_3.log @@ -9,7 +9,7 @@ rocBLAS warning: hipBlasLT failed, falling back to tensile. This message will be only be displayed once, unless the ROCBLAS_VERBOSE_TENSILE_ERROR environment variable is set. | model | size | params | backend | ngl | mmap | test | t/s | | ------------------------------ | ---------: | ---------: | ---------- | --: | ---: | --------------: | -------------------: | -| llama 70B Q8_0 | 75.65 GiB | 70.55 B | ROCm | 99 | 0 | pp512 | 97.13 ± 0.17 | +| llama 70B Q8_0 | 75.65 GiB | 70.55 B | ROCm | 99 | 0 | pp512 | 94.79 ± 0.14 | | llama 70B Q8_0 | 75.65 GiB | 70.55 B | ROCm | 99 | 0 | tg128 | 2.78 ± 0.00 | -build: 1fe00296 (6182) +build: f1fbffb5 (6486) diff --git a/benchmark/results/Llama-3.3-70B-Instruct-UD-Q8_K_XL-00001-of-00002__rocm6_4_3__fa1.log b/benchmark/results/Llama-3.3-70B-Instruct-UD-Q8_K_XL-00001-of-00002__rocm6_4_3__fa1.log index 08a5922..ab92419 100644 --- a/benchmark/results/Llama-3.3-70B-Instruct-UD-Q8_K_XL-00001-of-00002__rocm6_4_3__fa1.log +++ b/benchmark/results/Llama-3.3-70B-Instruct-UD-Q8_K_XL-00001-of-00002__rocm6_4_3__fa1.log @@ -4,7 +4,7 @@ ggml_cuda_init: found 1 ROCm devices: Device 0: Radeon 8060S Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32 | model | size | params | backend | ngl | fa | mmap | test | t/s | | ------------------------------ | ---------: | ---------: | ---------- | --: | -: | ---: | --------------: | -------------------: | -| llama 70B Q8_0 | 75.65 GiB | 70.55 B | ROCm | 99 | 1 | 0 | pp512 | 80.42 ± 0.08 | +| llama 70B Q8_0 | 75.65 GiB | 70.55 B | ROCm | 99 | 1 | 0 | pp512 | 104.62 ± 0.08 | | llama 70B Q8_0 | 75.65 GiB | 70.55 B | ROCm | 99 | 1 | 0 | tg128 | 2.78 ± 0.00 | -build: 1fe00296 (6182) +build: f1fbffb5 (6486) diff --git a/benchmark/results/Llama-3.3-70B-Instruct-UD-Q8_K_XL-00001-of-00002__rocm6_4_3__hblt0.log b/benchmark/results/Llama-3.3-70B-Instruct-UD-Q8_K_XL-00001-of-00002__rocm6_4_3__hblt0.log index f5b7147..b2280bc 100644 --- a/benchmark/results/Llama-3.3-70B-Instruct-UD-Q8_K_XL-00001-of-00002__rocm6_4_3__hblt0.log +++ b/benchmark/results/Llama-3.3-70B-Instruct-UD-Q8_K_XL-00001-of-00002__rocm6_4_3__hblt0.log @@ -2,5 +2,5 @@ ggml_cuda_init: GGML_CUDA_FORCE_MMQ: no ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no ggml_cuda_init: found 1 ROCm devices: Device 0: Radeon 8060S Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32 -HW Exception by GPU node-1 (Agent handle: 0x2c1e5340) reason :GPU Hang +HW Exception by GPU node-1 (Agent handle: 0x12cee310) reason :GPU Hang ✖ ! [rocm6_4_3] Llama-3.3-70B-Instruct-UD-Q8_K_XL-00001-of-00002 __hblt0 failed (exit 134) diff --git a/benchmark/results/Llama-3.3-70B-Instruct-UD-Q8_K_XL-00001-of-00002__rocm6_4_3__hblt0__fa1.log b/benchmark/results/Llama-3.3-70B-Instruct-UD-Q8_K_XL-00001-of-00002__rocm6_4_3__hblt0__fa1.log index 013fc7c..d7451a9 100644 --- a/benchmark/results/Llama-3.3-70B-Instruct-UD-Q8_K_XL-00001-of-00002__rocm6_4_3__hblt0__fa1.log +++ b/benchmark/results/Llama-3.3-70B-Instruct-UD-Q8_K_XL-00001-of-00002__rocm6_4_3__hblt0__fa1.log @@ -2,5 +2,5 @@ ggml_cuda_init: GGML_CUDA_FORCE_MMQ: no ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no ggml_cuda_init: found 1 ROCm devices: Device 0: Radeon 8060S Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32 -Memory access fault by GPU node-1 (Agent handle: 0x3e536340) on address 0x7f9182f6f000. Reason: Page not present or supervisor privilege. +Memory access fault by GPU node-1 (Agent handle: 0x367c310) on address 0x7fc07ad93000. Reason: Page not present or supervisor privilege. ✖ ! [rocm6_4_3] Llama-3.3-70B-Instruct-UD-Q8_K_XL-00001-of-00002 __hblt0__fa1 failed (exit 134) diff --git a/benchmark/results/Llama-3.3-70B-Instruct-UD-Q8_K_XL-00001-of-00002__rocm7_rc-rocwmma-fa_all_quants.log b/benchmark/results/Llama-3.3-70B-Instruct-UD-Q8_K_XL-00001-of-00002__rocm7_rc-rocwmma-fa_all_quants.log new file mode 100644 index 0000000..4f142bc --- /dev/null +++ b/benchmark/results/Llama-3.3-70B-Instruct-UD-Q8_K_XL-00001-of-00002__rocm7_rc-rocwmma-fa_all_quants.log @@ -0,0 +1,10 @@ +ggml_cuda_init: GGML_CUDA_FORCE_MMQ: no +ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no +ggml_cuda_init: found 1 ROCm devices: + Device 0: AMD Radeon Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32 +| model | size | params | backend | ngl | mmap | test | t/s | +| ------------------------------ | ---------: | ---------: | ---------- | --: | ---: | --------------: | -------------------: | +| llama 70B Q8_0 | 75.65 GiB | 70.55 B | ROCm | 99 | 0 | pp512 | 98.15 ± 0.16 | +| llama 70B Q8_0 | 75.65 GiB | 70.55 B | ROCm | 99 | 0 | tg128 | 2.77 ± 0.00 | + +build: f1fbffb5 (6486) diff --git a/benchmark/results/Llama-3.3-70B-Instruct-UD-Q8_K_XL-00001-of-00002__rocm7_rc-rocwmma-fa_all_quants__fa1.log b/benchmark/results/Llama-3.3-70B-Instruct-UD-Q8_K_XL-00001-of-00002__rocm7_rc-rocwmma-fa_all_quants__fa1.log new file mode 100644 index 0000000..7ec867a --- /dev/null +++ b/benchmark/results/Llama-3.3-70B-Instruct-UD-Q8_K_XL-00001-of-00002__rocm7_rc-rocwmma-fa_all_quants__fa1.log @@ -0,0 +1,10 @@ +ggml_cuda_init: GGML_CUDA_FORCE_MMQ: no +ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no +ggml_cuda_init: found 1 ROCm devices: + Device 0: AMD Radeon Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32 +| model | size | params | backend | ngl | fa | mmap | test | t/s | +| ------------------------------ | ---------: | ---------: | ---------- | --: | -: | ---: | --------------: | -------------------: | +| llama 70B Q8_0 | 75.65 GiB | 70.55 B | ROCm | 99 | 1 | 0 | pp512 | 102.79 ± 0.14 | +| llama 70B Q8_0 | 75.65 GiB | 70.55 B | ROCm | 99 | 1 | 0 | tg128 | 2.78 ± 0.00 | + +build: f1fbffb5 (6486) diff --git a/benchmark/results/Llama-3.3-70B-Instruct-UD-Q8_K_XL-00001-of-00002__rocm7_rc-rocwmma-fa_all_quants__hblt0.log b/benchmark/results/Llama-3.3-70B-Instruct-UD-Q8_K_XL-00001-of-00002__rocm7_rc-rocwmma-fa_all_quants__hblt0.log new file mode 100644 index 0000000..102a571 --- /dev/null +++ b/benchmark/results/Llama-3.3-70B-Instruct-UD-Q8_K_XL-00001-of-00002__rocm7_rc-rocwmma-fa_all_quants__hblt0.log @@ -0,0 +1,10 @@ +ggml_cuda_init: GGML_CUDA_FORCE_MMQ: no +ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no +ggml_cuda_init: found 1 ROCm devices: + Device 0: AMD Radeon Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32 +| model | size | params | backend | ngl | mmap | test | t/s | +| ------------------------------ | ---------: | ---------: | ---------- | --: | ---: | --------------: | -------------------: | +| llama 70B Q8_0 | 75.65 GiB | 70.55 B | ROCm | 99 | 0 | pp512 | 93.89 ± 0.22 | +| llama 70B Q8_0 | 75.65 GiB | 70.55 B | ROCm | 99 | 0 | tg128 | 2.78 ± 0.00 | + +build: f1fbffb5 (6486) diff --git a/benchmark/results/Llama-3.3-70B-Instruct-UD-Q8_K_XL-00001-of-00002__rocm7_rc-rocwmma-fa_all_quants__hblt0__fa1.log b/benchmark/results/Llama-3.3-70B-Instruct-UD-Q8_K_XL-00001-of-00002__rocm7_rc-rocwmma-fa_all_quants__hblt0__fa1.log new file mode 100644 index 0000000..5d3aace --- /dev/null +++ b/benchmark/results/Llama-3.3-70B-Instruct-UD-Q8_K_XL-00001-of-00002__rocm7_rc-rocwmma-fa_all_quants__hblt0__fa1.log @@ -0,0 +1,10 @@ +ggml_cuda_init: GGML_CUDA_FORCE_MMQ: no +ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no +ggml_cuda_init: found 1 ROCm devices: + Device 0: AMD Radeon Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32 +| model | size | params | backend | ngl | fa | mmap | test | t/s | +| ------------------------------ | ---------: | ---------: | ---------- | --: | -: | ---: | --------------: | -------------------: | +| llama 70B Q8_0 | 75.65 GiB | 70.55 B | ROCm | 99 | 1 | 0 | pp512 | 97.53 ± 0.17 | +| llama 70B Q8_0 | 75.65 GiB | 70.55 B | ROCm | 99 | 1 | 0 | tg128 | 2.78 ± 0.00 | + +build: f1fbffb5 (6486) diff --git a/benchmark/results/Llama-3.3-70B-Instruct-UD-Q8_K_XL-00001-of-00002__rocm7_rc-rocwmma.log b/benchmark/results/Llama-3.3-70B-Instruct-UD-Q8_K_XL-00001-of-00002__rocm7_rc-rocwmma.log index 62ab0f7..896a9c7 100644 --- a/benchmark/results/Llama-3.3-70B-Instruct-UD-Q8_K_XL-00001-of-00002__rocm7_rc-rocwmma.log +++ b/benchmark/results/Llama-3.3-70B-Instruct-UD-Q8_K_XL-00001-of-00002__rocm7_rc-rocwmma.log @@ -4,7 +4,7 @@ ggml_cuda_init: found 1 ROCm devices: Device 0: AMD Radeon Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32 | model | size | params | backend | ngl | mmap | test | t/s | | ------------------------------ | ---------: | ---------: | ---------- | --: | ---: | --------------: | -------------------: | -| llama 70B Q8_0 | 75.65 GiB | 70.55 B | ROCm | 99 | 0 | pp512 | 97.31 ± 0.20 | +| llama 70B Q8_0 | 75.65 GiB | 70.55 B | ROCm | 99 | 0 | pp512 | 97.42 ± 0.12 | | llama 70B Q8_0 | 75.65 GiB | 70.55 B | ROCm | 99 | 0 | tg128 | 2.78 ± 0.00 | -build: de219279 (6181) +build: f1fbffb5 (6486) diff --git a/benchmark/results/Llama-3.3-70B-Instruct-UD-Q8_K_XL-00001-of-00002__rocm7_rc-rocwmma__fa1.log b/benchmark/results/Llama-3.3-70B-Instruct-UD-Q8_K_XL-00001-of-00002__rocm7_rc-rocwmma__fa1.log index 1e31a60..a3dbd8f 100644 --- a/benchmark/results/Llama-3.3-70B-Instruct-UD-Q8_K_XL-00001-of-00002__rocm7_rc-rocwmma__fa1.log +++ b/benchmark/results/Llama-3.3-70B-Instruct-UD-Q8_K_XL-00001-of-00002__rocm7_rc-rocwmma__fa1.log @@ -4,7 +4,7 @@ ggml_cuda_init: found 1 ROCm devices: Device 0: AMD Radeon Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32 | model | size | params | backend | ngl | fa | mmap | test | t/s | | ------------------------------ | ---------: | ---------: | ---------- | --: | -: | ---: | --------------: | -------------------: | -| llama 70B Q8_0 | 75.65 GiB | 70.55 B | ROCm | 99 | 1 | 0 | pp512 | 100.85 ± 0.13 | -| llama 70B Q8_0 | 75.65 GiB | 70.55 B | ROCm | 99 | 1 | 0 | tg128 | 2.77 ± 0.00 | +| llama 70B Q8_0 | 75.65 GiB | 70.55 B | ROCm | 99 | 1 | 0 | pp512 | 101.56 ± 0.04 | +| llama 70B Q8_0 | 75.65 GiB | 70.55 B | ROCm | 99 | 1 | 0 | tg128 | 2.78 ± 0.00 | -build: de219279 (6181) +build: f1fbffb5 (6486) diff --git a/benchmark/results/Llama-3.3-70B-Instruct-UD-Q8_K_XL-00001-of-00002__rocm7_rc-rocwmma__hblt0.log b/benchmark/results/Llama-3.3-70B-Instruct-UD-Q8_K_XL-00001-of-00002__rocm7_rc-rocwmma__hblt0.log index c5612f6..9ead8d1 100644 --- a/benchmark/results/Llama-3.3-70B-Instruct-UD-Q8_K_XL-00001-of-00002__rocm7_rc-rocwmma__hblt0.log +++ b/benchmark/results/Llama-3.3-70B-Instruct-UD-Q8_K_XL-00001-of-00002__rocm7_rc-rocwmma__hblt0.log @@ -4,7 +4,7 @@ ggml_cuda_init: found 1 ROCm devices: Device 0: AMD Radeon Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32 | model | size | params | backend | ngl | mmap | test | t/s | | ------------------------------ | ---------: | ---------: | ---------- | --: | ---: | --------------: | -------------------: | -| llama 70B Q8_0 | 75.65 GiB | 70.55 B | ROCm | 99 | 0 | pp512 | 93.00 ± 0.22 | +| llama 70B Q8_0 | 75.65 GiB | 70.55 B | ROCm | 99 | 0 | pp512 | 92.02 ± 0.17 | | llama 70B Q8_0 | 75.65 GiB | 70.55 B | ROCm | 99 | 0 | tg128 | 2.78 ± 0.00 | -build: de219279 (6181) +build: f1fbffb5 (6486) diff --git a/benchmark/results/Llama-3.3-70B-Instruct-UD-Q8_K_XL-00001-of-00002__rocm7_rc-rocwmma__hblt0__fa1.log b/benchmark/results/Llama-3.3-70B-Instruct-UD-Q8_K_XL-00001-of-00002__rocm7_rc-rocwmma__hblt0__fa1.log index e557123..80ec16d 100644 --- a/benchmark/results/Llama-3.3-70B-Instruct-UD-Q8_K_XL-00001-of-00002__rocm7_rc-rocwmma__hblt0__fa1.log +++ b/benchmark/results/Llama-3.3-70B-Instruct-UD-Q8_K_XL-00001-of-00002__rocm7_rc-rocwmma__hblt0__fa1.log @@ -4,7 +4,7 @@ ggml_cuda_init: found 1 ROCm devices: Device 0: AMD Radeon Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32 | model | size | params | backend | ngl | fa | mmap | test | t/s | | ------------------------------ | ---------: | ---------: | ---------- | --: | -: | ---: | --------------: | -------------------: | -| llama 70B Q8_0 | 75.65 GiB | 70.55 B | ROCm | 99 | 1 | 0 | pp512 | 97.88 ± 0.09 | -| llama 70B Q8_0 | 75.65 GiB | 70.55 B | ROCm | 99 | 1 | 0 | tg128 | 2.77 ± 0.00 | +| llama 70B Q8_0 | 75.65 GiB | 70.55 B | ROCm | 99 | 1 | 0 | pp512 | 97.10 ± 0.17 | +| llama 70B Q8_0 | 75.65 GiB | 70.55 B | ROCm | 99 | 1 | 0 | tg128 | 2.78 ± 0.00 | -build: de219279 (6181) +build: f1fbffb5 (6486) diff --git a/benchmark/results/Llama-3.3-70B-Instruct-UD-Q8_K_XL-00001-of-00002__rocm7_rc.log b/benchmark/results/Llama-3.3-70B-Instruct-UD-Q8_K_XL-00001-of-00002__rocm7_rc.log index 7c6d5fd..fb7b0d0 100644 --- a/benchmark/results/Llama-3.3-70B-Instruct-UD-Q8_K_XL-00001-of-00002__rocm7_rc.log +++ b/benchmark/results/Llama-3.3-70B-Instruct-UD-Q8_K_XL-00001-of-00002__rocm7_rc.log @@ -4,7 +4,7 @@ ggml_cuda_init: found 1 ROCm devices: Device 0: AMD Radeon Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32 | model | size | params | backend | ngl | mmap | test | t/s | | ------------------------------ | ---------: | ---------: | ---------- | --: | ---: | --------------: | -------------------: | -| llama 70B Q8_0 | 75.65 GiB | 70.55 B | ROCm | 99 | 0 | pp512 | 99.41 ± 0.36 | +| llama 70B Q8_0 | 75.65 GiB | 70.55 B | ROCm | 99 | 0 | pp512 | 95.12 ± 0.17 | | llama 70B Q8_0 | 75.65 GiB | 70.55 B | ROCm | 99 | 0 | tg128 | 2.77 ± 0.00 | -build: de219279 (6181) +build: f1fbffb5 (6486) diff --git a/benchmark/results/Llama-3.3-70B-Instruct-UD-Q8_K_XL-00001-of-00002__rocm7_rc__fa1.log b/benchmark/results/Llama-3.3-70B-Instruct-UD-Q8_K_XL-00001-of-00002__rocm7_rc__fa1.log index f65845d..8390da3 100644 --- a/benchmark/results/Llama-3.3-70B-Instruct-UD-Q8_K_XL-00001-of-00002__rocm7_rc__fa1.log +++ b/benchmark/results/Llama-3.3-70B-Instruct-UD-Q8_K_XL-00001-of-00002__rocm7_rc__fa1.log @@ -2,5 +2,9 @@ ggml_cuda_init: GGML_CUDA_FORCE_MMQ: no ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no ggml_cuda_init: found 1 ROCm devices: Device 0: AMD Radeon Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32 -Memory access fault by GPU node-1 (Agent handle: 0x1f66bec0) on address 0x7f3e84b6f000. Reason: Page not present or supervisor privilege. -✖ ! [rocm7_rc] Llama-3.3-70B-Instruct-UD-Q8_K_XL-00001-of-00002 __fa1 failed (exit 134) +| model | size | params | backend | ngl | fa | mmap | test | t/s | +| ------------------------------ | ---------: | ---------: | ---------- | --: | -: | ---: | --------------: | -------------------: | +| llama 70B Q8_0 | 75.65 GiB | 70.55 B | ROCm | 99 | 1 | 0 | pp512 | 103.16 ± 0.07 | +| llama 70B Q8_0 | 75.65 GiB | 70.55 B | ROCm | 99 | 1 | 0 | tg128 | 2.78 ± 0.01 | + +build: f1fbffb5 (6486) diff --git a/benchmark/results/Llama-3.3-70B-Instruct-UD-Q8_K_XL-00001-of-00002__rocm7_rc__hblt0.log b/benchmark/results/Llama-3.3-70B-Instruct-UD-Q8_K_XL-00001-of-00002__rocm7_rc__hblt0.log index 44b15c4..f0129d8 100644 --- a/benchmark/results/Llama-3.3-70B-Instruct-UD-Q8_K_XL-00001-of-00002__rocm7_rc__hblt0.log +++ b/benchmark/results/Llama-3.3-70B-Instruct-UD-Q8_K_XL-00001-of-00002__rocm7_rc__hblt0.log @@ -4,7 +4,7 @@ ggml_cuda_init: found 1 ROCm devices: Device 0: AMD Radeon Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32 | model | size | params | backend | ngl | mmap | test | t/s | | ------------------------------ | ---------: | ---------: | ---------- | --: | ---: | --------------: | -------------------: | -| llama 70B Q8_0 | 75.65 GiB | 70.55 B | ROCm | 99 | 0 | pp512 | 94.06 ± 0.09 | +| llama 70B Q8_0 | 75.65 GiB | 70.55 B | ROCm | 99 | 0 | pp512 | 93.86 ± 0.18 | | llama 70B Q8_0 | 75.65 GiB | 70.55 B | ROCm | 99 | 0 | tg128 | 2.78 ± 0.00 | -build: de219279 (6181) +build: f1fbffb5 (6486) diff --git a/benchmark/results/Llama-3.3-70B-Instruct-UD-Q8_K_XL-00001-of-00002__rocm7_rc__hblt0__fa1.log b/benchmark/results/Llama-3.3-70B-Instruct-UD-Q8_K_XL-00001-of-00002__rocm7_rc__hblt0__fa1.log index 970522b..23bf2f4 100644 --- a/benchmark/results/Llama-3.3-70B-Instruct-UD-Q8_K_XL-00001-of-00002__rocm7_rc__hblt0__fa1.log +++ b/benchmark/results/Llama-3.3-70B-Instruct-UD-Q8_K_XL-00001-of-00002__rocm7_rc__hblt0__fa1.log @@ -2,5 +2,9 @@ ggml_cuda_init: GGML_CUDA_FORCE_MMQ: no ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no ggml_cuda_init: found 1 ROCm devices: Device 0: AMD Radeon Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32 -Memory access fault by GPU node-1 (Agent handle: 0xac09ec0) on address 0x7f283f56f000. Reason: Page not present or supervisor privilege. -✖ ! [rocm7_rc] Llama-3.3-70B-Instruct-UD-Q8_K_XL-00001-of-00002 __hblt0__fa1 failed (exit 134) +| model | size | params | backend | ngl | fa | mmap | test | t/s | +| ------------------------------ | ---------: | ---------: | ---------- | --: | -: | ---: | --------------: | -------------------: | +| llama 70B Q8_0 | 75.65 GiB | 70.55 B | ROCm | 99 | 1 | 0 | pp512 | 95.87 ± 0.08 | +| llama 70B Q8_0 | 75.65 GiB | 70.55 B | ROCm | 99 | 1 | 0 | tg128 | 2.78 ± 0.00 | + +build: f1fbffb5 (6486) diff --git a/benchmark/results/Llama-3.3-70B-Instruct-UD-Q8_K_XL-00001-of-00002__vulkan_amdvlk.log b/benchmark/results/Llama-3.3-70B-Instruct-UD-Q8_K_XL-00001-of-00002__vulkan_amdvlk.log index 13252f9..114cbeb 100644 --- a/benchmark/results/Llama-3.3-70B-Instruct-UD-Q8_K_XL-00001-of-00002__vulkan_amdvlk.log +++ b/benchmark/results/Llama-3.3-70B-Instruct-UD-Q8_K_XL-00001-of-00002__vulkan_amdvlk.log @@ -2,7 +2,7 @@ ggml_vulkan: Found 1 Vulkan devices: ggml_vulkan: 0 = Radeon 8060S Graphics (AMD open-source driver) | uma: 1 | fp16: 1 | bf16: 0 | warp size: 64 | shared memory: 32768 | int dot: 1 | matrix cores: KHR_coopmat | model | size | params | backend | ngl | mmap | test | t/s | | ------------------------------ | ---------: | ---------: | ---------- | --: | ---: | --------------: | -------------------: | -| llama 70B Q8_0 | 75.65 GiB | 70.55 B | Vulkan | 99 | 0 | pp512 | 98.03 ± 0.24 | -| llama 70B Q8_0 | 75.65 GiB | 70.55 B | Vulkan | 99 | 0 | tg128 | 2.78 ± 0.00 | +| llama 70B Q8_0 | 75.65 GiB | 70.55 B | Vulkan | 99 | 0 | pp512 | 97.72 ± 0.36 | +| llama 70B Q8_0 | 75.65 GiB | 70.55 B | Vulkan | 99 | 0 | tg128 | 2.81 ± 0.00 | -build: 1fe00296 (6182) +build: f1fbffb5 (6486) diff --git a/benchmark/results/Llama-3.3-70B-Instruct-UD-Q8_K_XL-00001-of-00002__vulkan_amdvlk__fa1.log b/benchmark/results/Llama-3.3-70B-Instruct-UD-Q8_K_XL-00001-of-00002__vulkan_amdvlk__fa1.log index 5781898..4d303d4 100644 --- a/benchmark/results/Llama-3.3-70B-Instruct-UD-Q8_K_XL-00001-of-00002__vulkan_amdvlk__fa1.log +++ b/benchmark/results/Llama-3.3-70B-Instruct-UD-Q8_K_XL-00001-of-00002__vulkan_amdvlk__fa1.log @@ -2,7 +2,7 @@ ggml_vulkan: Found 1 Vulkan devices: ggml_vulkan: 0 = Radeon 8060S Graphics (AMD open-source driver) | uma: 1 | fp16: 1 | bf16: 0 | warp size: 64 | shared memory: 32768 | int dot: 1 | matrix cores: KHR_coopmat | model | size | params | backend | ngl | fa | mmap | test | t/s | | ------------------------------ | ---------: | ---------: | ---------- | --: | -: | ---: | --------------: | -------------------: | -| llama 70B Q8_0 | 75.65 GiB | 70.55 B | Vulkan | 99 | 1 | 0 | pp512 | 99.12 ± 0.25 | -| llama 70B Q8_0 | 75.65 GiB | 70.55 B | Vulkan | 99 | 1 | 0 | tg128 | 2.77 ± 0.00 | +| llama 70B Q8_0 | 75.65 GiB | 70.55 B | Vulkan | 99 | 1 | 0 | pp512 | 99.04 ± 0.31 | +| llama 70B Q8_0 | 75.65 GiB | 70.55 B | Vulkan | 99 | 1 | 0 | tg128 | 2.80 ± 0.00 | -build: 1fe00296 (6182) +build: f1fbffb5 (6486) diff --git a/benchmark/results/Llama-3.3-70B-Instruct-UD-Q8_K_XL-00001-of-00002__vulkan_radv.log b/benchmark/results/Llama-3.3-70B-Instruct-UD-Q8_K_XL-00001-of-00002__vulkan_radv.log index 9500e62..dcad22c 100644 --- a/benchmark/results/Llama-3.3-70B-Instruct-UD-Q8_K_XL-00001-of-00002__vulkan_radv.log +++ b/benchmark/results/Llama-3.3-70B-Instruct-UD-Q8_K_XL-00001-of-00002__vulkan_radv.log @@ -2,7 +2,7 @@ ggml_vulkan: Found 1 Vulkan devices: ggml_vulkan: 0 = Radeon 8060S Graphics (RADV GFX1151) (radv) | uma: 1 | fp16: 1 | bf16: 0 | warp size: 64 | shared memory: 65536 | int dot: 1 | matrix cores: KHR_coopmat | model | size | params | backend | ngl | mmap | test | t/s | | ------------------------------ | ---------: | ---------: | ---------- | --: | ---: | --------------: | -------------------: | -| llama 70B Q8_0 | 75.65 GiB | 70.55 B | Vulkan | 99 | 0 | pp512 | 75.59 ± 0.28 | +| llama 70B Q8_0 | 75.65 GiB | 70.55 B | Vulkan | 99 | 0 | pp512 | 78.94 ± 0.51 | | llama 70B Q8_0 | 75.65 GiB | 70.55 B | Vulkan | 99 | 0 | tg128 | 2.78 ± 0.00 | -build: 1fe00296 (6182) +build: f1fbffb5 (6486) diff --git a/benchmark/results/Llama-3.3-70B-Instruct-UD-Q8_K_XL-00001-of-00002__vulkan_radv__fa1.log b/benchmark/results/Llama-3.3-70B-Instruct-UD-Q8_K_XL-00001-of-00002__vulkan_radv__fa1.log index 6c47ac0..d7b8226 100644 --- a/benchmark/results/Llama-3.3-70B-Instruct-UD-Q8_K_XL-00001-of-00002__vulkan_radv__fa1.log +++ b/benchmark/results/Llama-3.3-70B-Instruct-UD-Q8_K_XL-00001-of-00002__vulkan_radv__fa1.log @@ -2,7 +2,7 @@ ggml_vulkan: Found 1 Vulkan devices: ggml_vulkan: 0 = Radeon 8060S Graphics (RADV GFX1151) (radv) | uma: 1 | fp16: 1 | bf16: 0 | warp size: 64 | shared memory: 65536 | int dot: 1 | matrix cores: KHR_coopmat | model | size | params | backend | ngl | fa | mmap | test | t/s | | ------------------------------ | ---------: | ---------: | ---------- | --: | -: | ---: | --------------: | -------------------: | -| llama 70B Q8_0 | 75.65 GiB | 70.55 B | Vulkan | 99 | 1 | 0 | pp512 | 80.09 ± 0.38 | +| llama 70B Q8_0 | 75.65 GiB | 70.55 B | Vulkan | 99 | 1 | 0 | pp512 | 80.90 ± 0.77 | | llama 70B Q8_0 | 75.65 GiB | 70.55 B | Vulkan | 99 | 1 | 0 | tg128 | 2.78 ± 0.00 | -build: 1fe00296 (6182) +build: f1fbffb5 (6486) diff --git a/benchmark/results/Llama-4-Scout-17B-16E-Instruct-Q6_K-00001-of-00002__rocm6_4_3-rocwmma.log b/benchmark/results/Llama-4-Scout-17B-16E-Instruct-Q6_K-00001-of-00002__rocm6_4_3-rocwmma.log index ebcf552..1a327e9 100644 --- a/benchmark/results/Llama-4-Scout-17B-16E-Instruct-Q6_K-00001-of-00002__rocm6_4_3-rocwmma.log +++ b/benchmark/results/Llama-4-Scout-17B-16E-Instruct-Q6_K-00001-of-00002__rocm6_4_3-rocwmma.log @@ -7,5 +7,9 @@ This message will be only be displayed once, unless the ROCBLAS_VERBOSE_HIPBLASL rocBLAS warning: hipBlasLT failed, falling back to tensile. This message will be only be displayed once, unless the ROCBLAS_VERBOSE_TENSILE_ERROR environment variable is set. -Memory access fault by GPU node-1 (Agent handle: 0x1a840340) on address 0x7f3babb56000. Reason: Page not present or supervisor privilege. -✖ ! [rocm6_4_3-rocwmma] Llama-4-Scout-17B-16E-Instruct-Q6_K-00001-of-00002 failed (exit 134) +| model | size | params | backend | ngl | mmap | test | t/s | +| ------------------------------ | ---------: | ---------: | ---------- | --: | ---: | --------------: | -------------------: | +| llama4 17Bx16E (Scout) Q6_K | 82.35 GiB | 107.77 B | ROCm | 99 | 0 | pp512 | 265.76 ± 0.95 | +| llama4 17Bx16E (Scout) Q6_K | 82.35 GiB | 107.77 B | ROCm | 99 | 0 | tg128 | 14.69 ± 0.00 | + +build: f1fbffb5 (6486) diff --git a/benchmark/results/Llama-4-Scout-17B-16E-Instruct-Q6_K-00001-of-00002__rocm6_4_3-rocwmma__fa1.log b/benchmark/results/Llama-4-Scout-17B-16E-Instruct-Q6_K-00001-of-00002__rocm6_4_3-rocwmma__fa1.log index ab25429..eec534a 100644 --- a/benchmark/results/Llama-4-Scout-17B-16E-Instruct-Q6_K-00001-of-00002__rocm6_4_3-rocwmma__fa1.log +++ b/benchmark/results/Llama-4-Scout-17B-16E-Instruct-Q6_K-00001-of-00002__rocm6_4_3-rocwmma__fa1.log @@ -4,7 +4,7 @@ ggml_cuda_init: found 1 ROCm devices: Device 0: Radeon 8060S Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32 | model | size | params | backend | ngl | fa | mmap | test | t/s | | ------------------------------ | ---------: | ---------: | ---------- | --: | -: | ---: | --------------: | -------------------: | -| llama4 17Bx16E (Scout) Q6_K | 82.35 GiB | 107.77 B | ROCm | 99 | 1 | 0 | pp512 | 291.08 ± 1.26 | -| llama4 17Bx16E (Scout) Q6_K | 82.35 GiB | 107.77 B | ROCm | 99 | 1 | 0 | tg128 | 14.53 ± 0.00 | +| llama4 17Bx16E (Scout) Q6_K | 82.35 GiB | 107.77 B | ROCm | 99 | 1 | 0 | pp512 | 289.14 ± 1.57 | +| llama4 17Bx16E (Scout) Q6_K | 82.35 GiB | 107.77 B | ROCm | 99 | 1 | 0 | tg128 | 14.64 ± 0.15 | -build: 1fe00296 (6182) +build: f1fbffb5 (6486) diff --git a/benchmark/results/Llama-4-Scout-17B-16E-Instruct-Q6_K-00001-of-00002__rocm6_4_3-rocwmma__hblt0.log b/benchmark/results/Llama-4-Scout-17B-16E-Instruct-Q6_K-00001-of-00002__rocm6_4_3-rocwmma__hblt0.log index 65957d1..3fc5e97 100644 --- a/benchmark/results/Llama-4-Scout-17B-16E-Instruct-Q6_K-00001-of-00002__rocm6_4_3-rocwmma__hblt0.log +++ b/benchmark/results/Llama-4-Scout-17B-16E-Instruct-Q6_K-00001-of-00002__rocm6_4_3-rocwmma__hblt0.log @@ -2,9 +2,5 @@ ggml_cuda_init: GGML_CUDA_FORCE_MMQ: no ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no ggml_cuda_init: found 1 ROCm devices: Device 0: Radeon 8060S Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32 -| model | size | params | backend | ngl | mmap | test | t/s | -| ------------------------------ | ---------: | ---------: | ---------- | --: | ---: | --------------: | -------------------: | -| llama4 17Bx16E (Scout) Q6_K | 82.35 GiB | 107.77 B | ROCm | 99 | 0 | pp512 | 134.19 ± 1.49 | -| llama4 17Bx16E (Scout) Q6_K | 82.35 GiB | 107.77 B | ROCm | 99 | 0 | tg128 | 14.56 ± 0.01 | - -build: 1fe00296 (6182) +HW Exception by GPU node-1 (Agent handle: 0x24187340) reason :GPU Hang +✖ ! [rocm6_4_3-rocwmma] Llama-4-Scout-17B-16E-Instruct-Q6_K-00001-of-00002 __hblt0 failed (exit 134) diff --git a/benchmark/results/Llama-4-Scout-17B-16E-Instruct-Q6_K-00001-of-00002__rocm6_4_3-rocwmma__hblt0__fa1.log b/benchmark/results/Llama-4-Scout-17B-16E-Instruct-Q6_K-00001-of-00002__rocm6_4_3-rocwmma__hblt0__fa1.log index dcff3bd..b5d2530 100644 --- a/benchmark/results/Llama-4-Scout-17B-16E-Instruct-Q6_K-00001-of-00002__rocm6_4_3-rocwmma__hblt0__fa1.log +++ b/benchmark/results/Llama-4-Scout-17B-16E-Instruct-Q6_K-00001-of-00002__rocm6_4_3-rocwmma__hblt0__fa1.log @@ -2,5 +2,5 @@ ggml_cuda_init: GGML_CUDA_FORCE_MMQ: no ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no ggml_cuda_init: found 1 ROCm devices: Device 0: Radeon 8060S Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32 -HW Exception by GPU node-1 (Agent handle: 0x1de78340) reason :GPU Hang +HW Exception by GPU node-1 (Agent handle: 0x3da9340) reason :GPU Hang ✖ ! [rocm6_4_3-rocwmma] Llama-4-Scout-17B-16E-Instruct-Q6_K-00001-of-00002 __hblt0__fa1 failed (exit 134) diff --git a/benchmark/results/Llama-4-Scout-17B-16E-Instruct-Q6_K-00001-of-00002__rocm6_4_3.log b/benchmark/results/Llama-4-Scout-17B-16E-Instruct-Q6_K-00001-of-00002__rocm6_4_3.log index f75714d..ba3f44f 100644 --- a/benchmark/results/Llama-4-Scout-17B-16E-Instruct-Q6_K-00001-of-00002__rocm6_4_3.log +++ b/benchmark/results/Llama-4-Scout-17B-16E-Instruct-Q6_K-00001-of-00002__rocm6_4_3.log @@ -7,9 +7,5 @@ This message will be only be displayed once, unless the ROCBLAS_VERBOSE_HIPBLASL rocBLAS warning: hipBlasLT failed, falling back to tensile. This message will be only be displayed once, unless the ROCBLAS_VERBOSE_TENSILE_ERROR environment variable is set. -| model | size | params | backend | ngl | mmap | test | t/s | -| ------------------------------ | ---------: | ---------: | ---------- | --: | ---: | --------------: | -------------------: | -| llama4 17Bx16E (Scout) Q6_K | 82.35 GiB | 107.77 B | ROCm | 99 | 0 | pp512 | 270.28 ± 1.29 | -| llama4 17Bx16E (Scout) Q6_K | 82.35 GiB | 107.77 B | ROCm | 99 | 0 | tg128 | 14.58 ± 0.03 | - -build: 1fe00296 (6182) +HW Exception by GPU node-1 (Agent handle: 0x11bc3310) reason :GPU Hang +✖ ! [rocm6_4_3] Llama-4-Scout-17B-16E-Instruct-Q6_K-00001-of-00002 failed (exit 134) diff --git a/benchmark/results/Llama-4-Scout-17B-16E-Instruct-Q6_K-00001-of-00002__rocm6_4_3__fa1.log b/benchmark/results/Llama-4-Scout-17B-16E-Instruct-Q6_K-00001-of-00002__rocm6_4_3__fa1.log index 7c8dc76..772e891 100644 --- a/benchmark/results/Llama-4-Scout-17B-16E-Instruct-Q6_K-00001-of-00002__rocm6_4_3__fa1.log +++ b/benchmark/results/Llama-4-Scout-17B-16E-Instruct-Q6_K-00001-of-00002__rocm6_4_3__fa1.log @@ -2,5 +2,9 @@ ggml_cuda_init: GGML_CUDA_FORCE_MMQ: no ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no ggml_cuda_init: found 1 ROCm devices: Device 0: Radeon 8060S Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32 -Memory access fault by GPU node-1 (Agent handle: 0x2162b340) on address 0x7f500556f000. Reason: Page not present or supervisor privilege. -✖ ! [rocm6_4_3] Llama-4-Scout-17B-16E-Instruct-Q6_K-00001-of-00002 __fa1 failed (exit 134) +| model | size | params | backend | ngl | fa | mmap | test | t/s | +| ------------------------------ | ---------: | ---------: | ---------- | --: | -: | ---: | --------------: | -------------------: | +| llama4 17Bx16E (Scout) Q6_K | 82.35 GiB | 107.77 B | ROCm | 99 | 1 | 0 | pp512 | 291.67 ± 0.91 | +| llama4 17Bx16E (Scout) Q6_K | 82.35 GiB | 107.77 B | ROCm | 99 | 1 | 0 | tg128 | 14.71 ± 0.00 | + +build: f1fbffb5 (6486) diff --git a/benchmark/results/Llama-4-Scout-17B-16E-Instruct-Q6_K-00001-of-00002__rocm6_4_3__hblt0.log b/benchmark/results/Llama-4-Scout-17B-16E-Instruct-Q6_K-00001-of-00002__rocm6_4_3__hblt0.log index 96e7a94..b338079 100644 --- a/benchmark/results/Llama-4-Scout-17B-16E-Instruct-Q6_K-00001-of-00002__rocm6_4_3__hblt0.log +++ b/benchmark/results/Llama-4-Scout-17B-16E-Instruct-Q6_K-00001-of-00002__rocm6_4_3__hblt0.log @@ -2,5 +2,5 @@ ggml_cuda_init: GGML_CUDA_FORCE_MMQ: no ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no ggml_cuda_init: found 1 ROCm devices: Device 0: Radeon 8060S Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32 -HW Exception by GPU node-1 (Agent handle: 0xdacf340) reason :GPU Hang +HW Exception by GPU node-1 (Agent handle: 0x8a0a310) reason :GPU Hang ✖ ! [rocm6_4_3] Llama-4-Scout-17B-16E-Instruct-Q6_K-00001-of-00002 __hblt0 failed (exit 134) diff --git a/benchmark/results/Llama-4-Scout-17B-16E-Instruct-Q6_K-00001-of-00002__rocm6_4_3__hblt0__fa1.log b/benchmark/results/Llama-4-Scout-17B-16E-Instruct-Q6_K-00001-of-00002__rocm6_4_3__hblt0__fa1.log index 65a2e33..d72a6cb 100644 --- a/benchmark/results/Llama-4-Scout-17B-16E-Instruct-Q6_K-00001-of-00002__rocm6_4_3__hblt0__fa1.log +++ b/benchmark/results/Llama-4-Scout-17B-16E-Instruct-Q6_K-00001-of-00002__rocm6_4_3__hblt0__fa1.log @@ -2,5 +2,5 @@ ggml_cuda_init: GGML_CUDA_FORCE_MMQ: no ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no ggml_cuda_init: found 1 ROCm devices: Device 0: Radeon 8060S Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32 -HW Exception by GPU node-1 (Agent handle: 0x3dc00340) reason :GPU Hang +HW Exception by GPU node-1 (Agent handle: 0x1ada6310) reason :GPU Hang ✖ ! [rocm6_4_3] Llama-4-Scout-17B-16E-Instruct-Q6_K-00001-of-00002 __hblt0__fa1 failed (exit 134) diff --git a/benchmark/results/Llama-4-Scout-17B-16E-Instruct-Q6_K-00001-of-00002__rocm7_rc-rocwmma-fa_all_quants.log b/benchmark/results/Llama-4-Scout-17B-16E-Instruct-Q6_K-00001-of-00002__rocm7_rc-rocwmma-fa_all_quants.log new file mode 100644 index 0000000..334b616 --- /dev/null +++ b/benchmark/results/Llama-4-Scout-17B-16E-Instruct-Q6_K-00001-of-00002__rocm7_rc-rocwmma-fa_all_quants.log @@ -0,0 +1,10 @@ +ggml_cuda_init: GGML_CUDA_FORCE_MMQ: no +ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no +ggml_cuda_init: found 1 ROCm devices: + Device 0: AMD Radeon Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32 +| model | size | params | backend | ngl | mmap | test | t/s | +| ------------------------------ | ---------: | ---------: | ---------- | --: | ---: | --------------: | -------------------: | +| llama4 17Bx16E (Scout) Q6_K | 82.35 GiB | 107.77 B | ROCm | 99 | 0 | pp512 | 276.44 ± 1.46 | +| llama4 17Bx16E (Scout) Q6_K | 82.35 GiB | 107.77 B | ROCm | 99 | 0 | tg128 | 14.55 ± 0.00 | + +build: f1fbffb5 (6486) diff --git a/benchmark/results/Llama-4-Scout-17B-16E-Instruct-Q6_K-00001-of-00002__rocm7_rc-rocwmma-fa_all_quants__fa1.log b/benchmark/results/Llama-4-Scout-17B-16E-Instruct-Q6_K-00001-of-00002__rocm7_rc-rocwmma-fa_all_quants__fa1.log new file mode 100644 index 0000000..632535a --- /dev/null +++ b/benchmark/results/Llama-4-Scout-17B-16E-Instruct-Q6_K-00001-of-00002__rocm7_rc-rocwmma-fa_all_quants__fa1.log @@ -0,0 +1,10 @@ +ggml_cuda_init: GGML_CUDA_FORCE_MMQ: no +ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no +ggml_cuda_init: found 1 ROCm devices: + Device 0: AMD Radeon Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32 +| model | size | params | backend | ngl | fa | mmap | test | t/s | +| ------------------------------ | ---------: | ---------: | ---------- | --: | -: | ---: | --------------: | -------------------: | +| llama4 17Bx16E (Scout) Q6_K | 82.35 GiB | 107.77 B | ROCm | 99 | 1 | 0 | pp512 | 292.67 ± 1.04 | +| llama4 17Bx16E (Scout) Q6_K | 82.35 GiB | 107.77 B | ROCm | 99 | 1 | 0 | tg128 | 14.71 ± 0.00 | + +build: f1fbffb5 (6486) diff --git a/benchmark/results/Llama-4-Scout-17B-16E-Instruct-Q6_K-00001-of-00002__rocm7_rc-rocwmma-fa_all_quants__hblt0.log b/benchmark/results/Llama-4-Scout-17B-16E-Instruct-Q6_K-00001-of-00002__rocm7_rc-rocwmma-fa_all_quants__hblt0.log new file mode 100644 index 0000000..8fca923 --- /dev/null +++ b/benchmark/results/Llama-4-Scout-17B-16E-Instruct-Q6_K-00001-of-00002__rocm7_rc-rocwmma-fa_all_quants__hblt0.log @@ -0,0 +1,10 @@ +ggml_cuda_init: GGML_CUDA_FORCE_MMQ: no +ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no +ggml_cuda_init: found 1 ROCm devices: + Device 0: AMD Radeon Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32 +| model | size | params | backend | ngl | mmap | test | t/s | +| ------------------------------ | ---------: | ---------: | ---------- | --: | ---: | --------------: | -------------------: | +| llama4 17Bx16E (Scout) Q6_K | 82.35 GiB | 107.77 B | ROCm | 99 | 0 | pp512 | 273.88 ± 1.14 | +| llama4 17Bx16E (Scout) Q6_K | 82.35 GiB | 107.77 B | ROCm | 99 | 0 | tg128 | 14.70 ± 0.00 | + +build: f1fbffb5 (6486) diff --git a/benchmark/results/Llama-4-Scout-17B-16E-Instruct-Q6_K-00001-of-00002__rocm7_rc-rocwmma-fa_all_quants__hblt0__fa1.log b/benchmark/results/Llama-4-Scout-17B-16E-Instruct-Q6_K-00001-of-00002__rocm7_rc-rocwmma-fa_all_quants__hblt0__fa1.log new file mode 100644 index 0000000..9bba0b6 --- /dev/null +++ b/benchmark/results/Llama-4-Scout-17B-16E-Instruct-Q6_K-00001-of-00002__rocm7_rc-rocwmma-fa_all_quants__hblt0__fa1.log @@ -0,0 +1,10 @@ +ggml_cuda_init: GGML_CUDA_FORCE_MMQ: no +ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no +ggml_cuda_init: found 1 ROCm devices: + Device 0: AMD Radeon Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32 +| model | size | params | backend | ngl | fa | mmap | test | t/s | +| ------------------------------ | ---------: | ---------: | ---------- | --: | -: | ---: | --------------: | -------------------: | +| llama4 17Bx16E (Scout) Q6_K | 82.35 GiB | 107.77 B | ROCm | 99 | 1 | 0 | pp512 | 284.81 ± 1.55 | +| llama4 17Bx16E (Scout) Q6_K | 82.35 GiB | 107.77 B | ROCm | 99 | 1 | 0 | tg128 | 14.72 ± 0.00 | + +build: f1fbffb5 (6486) diff --git a/benchmark/results/Llama-4-Scout-17B-16E-Instruct-Q6_K-00001-of-00002__rocm7_rc-rocwmma.log b/benchmark/results/Llama-4-Scout-17B-16E-Instruct-Q6_K-00001-of-00002__rocm7_rc-rocwmma.log index 64b52d5..3e4a420 100644 --- a/benchmark/results/Llama-4-Scout-17B-16E-Instruct-Q6_K-00001-of-00002__rocm7_rc-rocwmma.log +++ b/benchmark/results/Llama-4-Scout-17B-16E-Instruct-Q6_K-00001-of-00002__rocm7_rc-rocwmma.log @@ -2,5 +2,9 @@ ggml_cuda_init: GGML_CUDA_FORCE_MMQ: no ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no ggml_cuda_init: found 1 ROCm devices: Device 0: AMD Radeon Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32 -HW Exception by GPU node-1 (Agent handle: 0x3882bf60) reason :GPU Hang -✖ ! [rocm7_rc-rocwmma] Llama-4-Scout-17B-16E-Instruct-Q6_K-00001-of-00002 failed (exit 134) +| model | size | params | backend | ngl | mmap | test | t/s | +| ------------------------------ | ---------: | ---------: | ---------- | --: | ---: | --------------: | -------------------: | +| llama4 17Bx16E (Scout) Q6_K | 82.35 GiB | 107.77 B | ROCm | 99 | 0 | pp512 | 274.13 ± 0.84 | +| llama4 17Bx16E (Scout) Q6_K | 82.35 GiB | 107.77 B | ROCm | 99 | 0 | tg128 | 14.71 ± 0.00 | + +build: f1fbffb5 (6486) diff --git a/benchmark/results/Llama-4-Scout-17B-16E-Instruct-Q6_K-00001-of-00002__rocm7_rc-rocwmma__fa1.log b/benchmark/results/Llama-4-Scout-17B-16E-Instruct-Q6_K-00001-of-00002__rocm7_rc-rocwmma__fa1.log index 11cbda1..44047fc 100644 --- a/benchmark/results/Llama-4-Scout-17B-16E-Instruct-Q6_K-00001-of-00002__rocm7_rc-rocwmma__fa1.log +++ b/benchmark/results/Llama-4-Scout-17B-16E-Instruct-Q6_K-00001-of-00002__rocm7_rc-rocwmma__fa1.log @@ -4,7 +4,7 @@ ggml_cuda_init: found 1 ROCm devices: Device 0: AMD Radeon Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32 | model | size | params | backend | ngl | fa | mmap | test | t/s | | ------------------------------ | ---------: | ---------: | ---------- | --: | -: | ---: | --------------: | -------------------: | -| llama4 17Bx16E (Scout) Q6_K | 82.35 GiB | 107.77 B | ROCm | 99 | 1 | 0 | pp512 | 285.84 ± 9.41 | -| llama4 17Bx16E (Scout) Q6_K | 82.35 GiB | 107.77 B | ROCm | 99 | 1 | 0 | tg128 | 14.37 ± 0.00 | +| llama4 17Bx16E (Scout) Q6_K | 82.35 GiB | 107.77 B | ROCm | 99 | 1 | 0 | pp512 | 292.92 ± 2.63 | +| llama4 17Bx16E (Scout) Q6_K | 82.35 GiB | 107.77 B | ROCm | 99 | 1 | 0 | tg128 | 14.71 ± 0.00 | -build: de219279 (6181) +build: f1fbffb5 (6486) diff --git a/benchmark/results/Llama-4-Scout-17B-16E-Instruct-Q6_K-00001-of-00002__rocm7_rc-rocwmma__hblt0.log b/benchmark/results/Llama-4-Scout-17B-16E-Instruct-Q6_K-00001-of-00002__rocm7_rc-rocwmma__hblt0.log index 7575a7e..52dda2f 100644 --- a/benchmark/results/Llama-4-Scout-17B-16E-Instruct-Q6_K-00001-of-00002__rocm7_rc-rocwmma__hblt0.log +++ b/benchmark/results/Llama-4-Scout-17B-16E-Instruct-Q6_K-00001-of-00002__rocm7_rc-rocwmma__hblt0.log @@ -4,7 +4,7 @@ ggml_cuda_init: found 1 ROCm devices: Device 0: AMD Radeon Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32 | model | size | params | backend | ngl | mmap | test | t/s | | ------------------------------ | ---------: | ---------: | ---------- | --: | ---: | --------------: | -------------------: | -| llama4 17Bx16E (Scout) Q6_K | 82.35 GiB | 107.77 B | ROCm | 99 | 0 | pp512 | 273.97 ± 1.67 | -| llama4 17Bx16E (Scout) Q6_K | 82.35 GiB | 107.77 B | ROCm | 99 | 0 | tg128 | 14.57 ± 0.05 | +| llama4 17Bx16E (Scout) Q6_K | 82.35 GiB | 107.77 B | ROCm | 99 | 0 | pp512 | 273.23 ± 1.35 | +| llama4 17Bx16E (Scout) Q6_K | 82.35 GiB | 107.77 B | ROCm | 99 | 0 | tg128 | 14.70 ± 0.00 | -build: de219279 (6181) +build: f1fbffb5 (6486) diff --git a/benchmark/results/Llama-4-Scout-17B-16E-Instruct-Q6_K-00001-of-00002__rocm7_rc-rocwmma__hblt0__fa1.log b/benchmark/results/Llama-4-Scout-17B-16E-Instruct-Q6_K-00001-of-00002__rocm7_rc-rocwmma__hblt0__fa1.log index 647f737..f01372d 100644 --- a/benchmark/results/Llama-4-Scout-17B-16E-Instruct-Q6_K-00001-of-00002__rocm7_rc-rocwmma__hblt0__fa1.log +++ b/benchmark/results/Llama-4-Scout-17B-16E-Instruct-Q6_K-00001-of-00002__rocm7_rc-rocwmma__hblt0__fa1.log @@ -2,9 +2,5 @@ ggml_cuda_init: GGML_CUDA_FORCE_MMQ: no ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no ggml_cuda_init: found 1 ROCm devices: Device 0: AMD Radeon Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32 -| model | size | params | backend | ngl | fa | mmap | test | t/s | -| ------------------------------ | ---------: | ---------: | ---------- | --: | -: | ---: | --------------: | -------------------: | -| llama4 17Bx16E (Scout) Q6_K | 82.35 GiB | 107.77 B | ROCm | 99 | 1 | 0 | pp512 | 285.26 ± 1.79 | -| llama4 17Bx16E (Scout) Q6_K | 82.35 GiB | 107.77 B | ROCm | 99 | 1 | 0 | tg128 | 14.33 ± 0.03 | - -build: de219279 (6181) +HW Exception by GPU node-1 (Agent handle: 0x13c5d180) reason :GPU Hang +✖ ! [rocm7_rc-rocwmma] Llama-4-Scout-17B-16E-Instruct-Q6_K-00001-of-00002 __hblt0__fa1 failed (exit 134) diff --git a/benchmark/results/Llama-4-Scout-17B-16E-Instruct-Q6_K-00001-of-00002__rocm7_rc.log b/benchmark/results/Llama-4-Scout-17B-16E-Instruct-Q6_K-00001-of-00002__rocm7_rc.log index 6ac1b55..f79d354 100644 --- a/benchmark/results/Llama-4-Scout-17B-16E-Instruct-Q6_K-00001-of-00002__rocm7_rc.log +++ b/benchmark/results/Llama-4-Scout-17B-16E-Instruct-Q6_K-00001-of-00002__rocm7_rc.log @@ -2,9 +2,5 @@ ggml_cuda_init: GGML_CUDA_FORCE_MMQ: no ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no ggml_cuda_init: found 1 ROCm devices: Device 0: AMD Radeon Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32 -| model | size | params | backend | ngl | mmap | test | t/s | -| ------------------------------ | ---------: | ---------: | ---------- | --: | ---: | --------------: | -------------------: | -| llama4 17Bx16E (Scout) Q6_K | 82.35 GiB | 107.77 B | ROCm | 99 | 0 | pp512 | 276.37 ± 1.65 | -| llama4 17Bx16E (Scout) Q6_K | 82.35 GiB | 107.77 B | ROCm | 99 | 0 | tg128 | 14.57 ± 0.04 | - -build: de219279 (6181) +Memory access fault by GPU node-1 (Agent handle: 0x381db160) on address 0x7f72baf68000. Reason: Page not present or supervisor privilege. +✖ ! [rocm7_rc] Llama-4-Scout-17B-16E-Instruct-Q6_K-00001-of-00002 failed (exit 134) diff --git a/benchmark/results/Llama-4-Scout-17B-16E-Instruct-Q6_K-00001-of-00002__rocm7_rc__fa1.log b/benchmark/results/Llama-4-Scout-17B-16E-Instruct-Q6_K-00001-of-00002__rocm7_rc__fa1.log index bc46574..d2e6f1c 100644 --- a/benchmark/results/Llama-4-Scout-17B-16E-Instruct-Q6_K-00001-of-00002__rocm7_rc__fa1.log +++ b/benchmark/results/Llama-4-Scout-17B-16E-Instruct-Q6_K-00001-of-00002__rocm7_rc__fa1.log @@ -2,5 +2,5 @@ ggml_cuda_init: GGML_CUDA_FORCE_MMQ: no ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no ggml_cuda_init: found 1 ROCm devices: Device 0: AMD Radeon Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32 -Memory access fault by GPU node-1 (Agent handle: 0xa893ec0) on address 0x7f070a3a9000. Reason: Page not present or supervisor privilege. +HW Exception by GPU node-1 (Agent handle: 0x34902180) reason :GPU Hang ✖ ! [rocm7_rc] Llama-4-Scout-17B-16E-Instruct-Q6_K-00001-of-00002 __fa1 failed (exit 134) diff --git a/benchmark/results/Llama-4-Scout-17B-16E-Instruct-Q6_K-00001-of-00002__rocm7_rc__hblt0.log b/benchmark/results/Llama-4-Scout-17B-16E-Instruct-Q6_K-00001-of-00002__rocm7_rc__hblt0.log index ca30067..ff5e1de 100644 --- a/benchmark/results/Llama-4-Scout-17B-16E-Instruct-Q6_K-00001-of-00002__rocm7_rc__hblt0.log +++ b/benchmark/results/Llama-4-Scout-17B-16E-Instruct-Q6_K-00001-of-00002__rocm7_rc__hblt0.log @@ -4,7 +4,7 @@ ggml_cuda_init: found 1 ROCm devices: Device 0: AMD Radeon Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32 | model | size | params | backend | ngl | mmap | test | t/s | | ------------------------------ | ---------: | ---------: | ---------- | --: | ---: | --------------: | -------------------: | -| llama4 17Bx16E (Scout) Q6_K | 82.35 GiB | 107.77 B | ROCm | 99 | 0 | pp512 | 269.17 ± 0.99 | -| llama4 17Bx16E (Scout) Q6_K | 82.35 GiB | 107.77 B | ROCm | 99 | 0 | tg128 | 14.63 ± 0.01 | +| llama4 17Bx16E (Scout) Q6_K | 82.35 GiB | 107.77 B | ROCm | 99 | 0 | pp512 | 274.52 ± 1.78 | +| llama4 17Bx16E (Scout) Q6_K | 82.35 GiB | 107.77 B | ROCm | 99 | 0 | tg128 | 14.70 ± 0.00 | -build: de219279 (6181) +build: f1fbffb5 (6486) diff --git a/benchmark/results/Llama-4-Scout-17B-16E-Instruct-Q6_K-00001-of-00002__rocm7_rc__hblt0__fa1.log b/benchmark/results/Llama-4-Scout-17B-16E-Instruct-Q6_K-00001-of-00002__rocm7_rc__hblt0__fa1.log index 56c35f6..e29a4b0 100644 --- a/benchmark/results/Llama-4-Scout-17B-16E-Instruct-Q6_K-00001-of-00002__rocm7_rc__hblt0__fa1.log +++ b/benchmark/results/Llama-4-Scout-17B-16E-Instruct-Q6_K-00001-of-00002__rocm7_rc__hblt0__fa1.log @@ -2,5 +2,9 @@ ggml_cuda_init: GGML_CUDA_FORCE_MMQ: no ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no ggml_cuda_init: found 1 ROCm devices: Device 0: AMD Radeon Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32 -Memory access fault by GPU node-1 (Agent handle: 0x1db86ec0) on address 0x7f2273f6f000. Reason: Page not present or supervisor privilege. -✖ ! [rocm7_rc] Llama-4-Scout-17B-16E-Instruct-Q6_K-00001-of-00002 __hblt0__fa1 failed (exit 134) +| model | size | params | backend | ngl | fa | mmap | test | t/s | +| ------------------------------ | ---------: | ---------: | ---------- | --: | -: | ---: | --------------: | -------------------: | +| llama4 17Bx16E (Scout) Q6_K | 82.35 GiB | 107.77 B | ROCm | 99 | 1 | 0 | pp512 | 287.04 ± 1.92 | +| llama4 17Bx16E (Scout) Q6_K | 82.35 GiB | 107.77 B | ROCm | 99 | 1 | 0 | tg128 | 14.71 ± 0.00 | + +build: f1fbffb5 (6486) diff --git a/benchmark/results/Llama-4-Scout-17B-16E-Instruct-Q6_K-00001-of-00002__vulkan_amdvlk.log b/benchmark/results/Llama-4-Scout-17B-16E-Instruct-Q6_K-00001-of-00002__vulkan_amdvlk.log index 8c8e292..e235c42 100644 --- a/benchmark/results/Llama-4-Scout-17B-16E-Instruct-Q6_K-00001-of-00002__vulkan_amdvlk.log +++ b/benchmark/results/Llama-4-Scout-17B-16E-Instruct-Q6_K-00001-of-00002__vulkan_amdvlk.log @@ -2,7 +2,7 @@ ggml_vulkan: Found 1 Vulkan devices: ggml_vulkan: 0 = Radeon 8060S Graphics (AMD open-source driver) | uma: 1 | fp16: 1 | bf16: 0 | warp size: 64 | shared memory: 32768 | int dot: 1 | matrix cores: KHR_coopmat | model | size | params | backend | ngl | mmap | test | t/s | | ------------------------------ | ---------: | ---------: | ---------- | --: | ---: | --------------: | -------------------: | -| llama4 17Bx16E (Scout) Q6_K | 82.35 GiB | 107.77 B | Vulkan | 99 | 0 | pp512 | 242.07 ± 1.05 | -| llama4 17Bx16E (Scout) Q6_K | 82.35 GiB | 107.77 B | Vulkan | 99 | 0 | tg128 | 15.56 ± 0.01 | +| llama4 17Bx16E (Scout) Q6_K | 82.35 GiB | 107.77 B | Vulkan | 99 | 0 | pp512 | 224.02 ± 2.86 | +| llama4 17Bx16E (Scout) Q6_K | 82.35 GiB | 107.77 B | Vulkan | 99 | 0 | tg128 | 15.98 ± 0.00 | -build: 1fe00296 (6182) +build: f1fbffb5 (6486) diff --git a/benchmark/results/Llama-4-Scout-17B-16E-Instruct-Q6_K-00001-of-00002__vulkan_amdvlk__fa1.log b/benchmark/results/Llama-4-Scout-17B-16E-Instruct-Q6_K-00001-of-00002__vulkan_amdvlk__fa1.log index 71556fe..9e0f45f 100644 --- a/benchmark/results/Llama-4-Scout-17B-16E-Instruct-Q6_K-00001-of-00002__vulkan_amdvlk__fa1.log +++ b/benchmark/results/Llama-4-Scout-17B-16E-Instruct-Q6_K-00001-of-00002__vulkan_amdvlk__fa1.log @@ -2,7 +2,7 @@ ggml_vulkan: Found 1 Vulkan devices: ggml_vulkan: 0 = Radeon 8060S Graphics (AMD open-source driver) | uma: 1 | fp16: 1 | bf16: 0 | warp size: 64 | shared memory: 32768 | int dot: 1 | matrix cores: KHR_coopmat | model | size | params | backend | ngl | fa | mmap | test | t/s | | ------------------------------ | ---------: | ---------: | ---------- | --: | -: | ---: | --------------: | -------------------: | -| llama4 17Bx16E (Scout) Q6_K | 82.35 GiB | 107.77 B | Vulkan | 99 | 1 | 0 | pp512 | 244.49 ± 1.13 | -| llama4 17Bx16E (Scout) Q6_K | 82.35 GiB | 107.77 B | Vulkan | 99 | 1 | 0 | tg128 | 15.33 ± 0.00 | +| llama4 17Bx16E (Scout) Q6_K | 82.35 GiB | 107.77 B | Vulkan | 99 | 1 | 0 | pp512 | 234.30 ± 1.10 | +| llama4 17Bx16E (Scout) Q6_K | 82.35 GiB | 107.77 B | Vulkan | 99 | 1 | 0 | tg128 | 15.75 ± 0.00 | -build: 1fe00296 (6182) +build: f1fbffb5 (6486) diff --git a/benchmark/results/Llama-4-Scout-17B-16E-Instruct-Q6_K-00001-of-00002__vulkan_radv.log b/benchmark/results/Llama-4-Scout-17B-16E-Instruct-Q6_K-00001-of-00002__vulkan_radv.log index 33a8b80..eff084d 100644 --- a/benchmark/results/Llama-4-Scout-17B-16E-Instruct-Q6_K-00001-of-00002__vulkan_radv.log +++ b/benchmark/results/Llama-4-Scout-17B-16E-Instruct-Q6_K-00001-of-00002__vulkan_radv.log @@ -2,7 +2,7 @@ ggml_vulkan: Found 1 Vulkan devices: ggml_vulkan: 0 = Radeon 8060S Graphics (RADV GFX1151) (radv) | uma: 1 | fp16: 1 | bf16: 0 | warp size: 64 | shared memory: 65536 | int dot: 1 | matrix cores: KHR_coopmat | model | size | params | backend | ngl | mmap | test | t/s | | ------------------------------ | ---------: | ---------: | ---------- | --: | ---: | --------------: | -------------------: | -| llama4 17Bx16E (Scout) Q6_K | 82.35 GiB | 107.77 B | Vulkan | 99 | 0 | pp512 | 147.08 ± 0.98 | -| llama4 17Bx16E (Scout) Q6_K | 82.35 GiB | 107.77 B | Vulkan | 99 | 0 | tg128 | 15.50 ± 0.01 | +| llama4 17Bx16E (Scout) Q6_K | 82.35 GiB | 107.77 B | Vulkan | 99 | 0 | pp512 | 201.49 ± 2.22 | +| llama4 17Bx16E (Scout) Q6_K | 82.35 GiB | 107.77 B | Vulkan | 99 | 0 | tg128 | 15.77 ± 0.01 | -build: 1fe00296 (6182) +build: f1fbffb5 (6486) diff --git a/benchmark/results/Llama-4-Scout-17B-16E-Instruct-Q6_K-00001-of-00002__vulkan_radv__fa1.log b/benchmark/results/Llama-4-Scout-17B-16E-Instruct-Q6_K-00001-of-00002__vulkan_radv__fa1.log index caf7973..ff879bc 100644 --- a/benchmark/results/Llama-4-Scout-17B-16E-Instruct-Q6_K-00001-of-00002__vulkan_radv__fa1.log +++ b/benchmark/results/Llama-4-Scout-17B-16E-Instruct-Q6_K-00001-of-00002__vulkan_radv__fa1.log @@ -2,7 +2,7 @@ ggml_vulkan: Found 1 Vulkan devices: ggml_vulkan: 0 = Radeon 8060S Graphics (RADV GFX1151) (radv) | uma: 1 | fp16: 1 | bf16: 0 | warp size: 64 | shared memory: 65536 | int dot: 1 | matrix cores: KHR_coopmat | model | size | params | backend | ngl | fa | mmap | test | t/s | | ------------------------------ | ---------: | ---------: | ---------- | --: | -: | ---: | --------------: | -------------------: | -| llama4 17Bx16E (Scout) Q6_K | 82.35 GiB | 107.77 B | Vulkan | 99 | 1 | 0 | pp512 | 149.97 ± 1.10 | -| llama4 17Bx16E (Scout) Q6_K | 82.35 GiB | 107.77 B | Vulkan | 99 | 1 | 0 | tg128 | 15.49 ± 0.00 | +| llama4 17Bx16E (Scout) Q6_K | 82.35 GiB | 107.77 B | Vulkan | 99 | 1 | 0 | pp512 | 202.49 ± 5.98 | +| llama4 17Bx16E (Scout) Q6_K | 82.35 GiB | 107.77 B | Vulkan | 99 | 1 | 0 | tg128 | 15.74 ± 0.00 | -build: 1fe00296 (6182) +build: f1fbffb5 (6486) diff --git a/benchmark/results/Llama-4-Scout-17B-16E-Instruct-Q8_0-00001-of-00003__rocm6_4_3-rocwmma.log b/benchmark/results/Llama-4-Scout-17B-16E-Instruct-Q8_0-00001-of-00003__rocm6_4_3-rocwmma.log index 40cf34f..e0ffee4 100644 --- a/benchmark/results/Llama-4-Scout-17B-16E-Instruct-Q8_0-00001-of-00003__rocm6_4_3-rocwmma.log +++ b/benchmark/results/Llama-4-Scout-17B-16E-Instruct-Q8_0-00001-of-00003__rocm6_4_3-rocwmma.log @@ -9,7 +9,7 @@ rocBLAS warning: hipBlasLT failed, falling back to tensile. This message will be only be displayed once, unless the ROCBLAS_VERBOSE_TENSILE_ERROR environment variable is set. | model | size | params | backend | ngl | mmap | test | t/s | | ------------------------------ | ---------: | ---------: | ---------- | --: | ---: | --------------: | -------------------: | -| llama4 17Bx16E (Scout) Q8_0 | 106.65 GiB | 107.77 B | ROCm | 99 | 0 | pp512 | 270.35 ± 3.39 | -| llama4 17Bx16E (Scout) Q8_0 | 106.65 GiB | 107.77 B | ROCm | 99 | 0 | tg128 | 11.78 ± 0.03 | +| llama4 17Bx16E (Scout) Q8_0 | 106.65 GiB | 107.77 B | ROCm | 99 | 0 | pp512 | 264.44 ± 24.69 | +| llama4 17Bx16E (Scout) Q8_0 | 106.65 GiB | 107.77 B | ROCm | 99 | 0 | tg128 | 11.88 ± 0.05 | -build: 1fe00296 (6182) +build: f1fbffb5 (6486) diff --git a/benchmark/results/Llama-4-Scout-17B-16E-Instruct-Q8_0-00001-of-00003__rocm6_4_3-rocwmma__fa1.log b/benchmark/results/Llama-4-Scout-17B-16E-Instruct-Q8_0-00001-of-00003__rocm6_4_3-rocwmma__fa1.log index 4a19b96..1f9cd7e 100644 --- a/benchmark/results/Llama-4-Scout-17B-16E-Instruct-Q8_0-00001-of-00003__rocm6_4_3-rocwmma__fa1.log +++ b/benchmark/results/Llama-4-Scout-17B-16E-Instruct-Q8_0-00001-of-00003__rocm6_4_3-rocwmma__fa1.log @@ -4,7 +4,7 @@ ggml_cuda_init: found 1 ROCm devices: Device 0: Radeon 8060S Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32 | model | size | params | backend | ngl | fa | mmap | test | t/s | | ------------------------------ | ---------: | ---------: | ---------- | --: | -: | ---: | --------------: | -------------------: | -| llama4 17Bx16E (Scout) Q8_0 | 106.65 GiB | 107.77 B | ROCm | 99 | 1 | 0 | pp512 | 292.23 ± 3.13 | -| llama4 17Bx16E (Scout) Q8_0 | 106.65 GiB | 107.77 B | ROCm | 99 | 1 | 0 | tg128 | 11.73 ± 0.03 | +| llama4 17Bx16E (Scout) Q8_0 | 106.65 GiB | 107.77 B | ROCm | 99 | 1 | 0 | pp512 | 298.83 ± 1.59 | +| llama4 17Bx16E (Scout) Q8_0 | 106.65 GiB | 107.77 B | ROCm | 99 | 1 | 0 | tg128 | 11.89 ± 0.06 | -build: 1fe00296 (6182) +build: f1fbffb5 (6486) diff --git a/benchmark/results/Llama-4-Scout-17B-16E-Instruct-Q8_0-00001-of-00003__rocm6_4_3-rocwmma__hblt0.log b/benchmark/results/Llama-4-Scout-17B-16E-Instruct-Q8_0-00001-of-00003__rocm6_4_3-rocwmma__hblt0.log index 65afec2..a4e7e73 100644 --- a/benchmark/results/Llama-4-Scout-17B-16E-Instruct-Q8_0-00001-of-00003__rocm6_4_3-rocwmma__hblt0.log +++ b/benchmark/results/Llama-4-Scout-17B-16E-Instruct-Q8_0-00001-of-00003__rocm6_4_3-rocwmma__hblt0.log @@ -2,5 +2,5 @@ ggml_cuda_init: GGML_CUDA_FORCE_MMQ: no ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no ggml_cuda_init: found 1 ROCm devices: Device 0: Radeon 8060S Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32 -HW Exception by GPU node-1 (Agent handle: 0x5f69340) reason :GPU Hang +HW Exception by GPU node-1 (Agent handle: 0x3265f340) reason :GPU Hang ✖ ! [rocm6_4_3-rocwmma] Llama-4-Scout-17B-16E-Instruct-Q8_0-00001-of-00003 __hblt0 failed (exit 134) diff --git a/benchmark/results/Llama-4-Scout-17B-16E-Instruct-Q8_0-00001-of-00003__rocm6_4_3-rocwmma__hblt0__fa1.log b/benchmark/results/Llama-4-Scout-17B-16E-Instruct-Q8_0-00001-of-00003__rocm6_4_3-rocwmma__hblt0__fa1.log index 98dc8c5..2f00d23 100644 --- a/benchmark/results/Llama-4-Scout-17B-16E-Instruct-Q8_0-00001-of-00003__rocm6_4_3-rocwmma__hblt0__fa1.log +++ b/benchmark/results/Llama-4-Scout-17B-16E-Instruct-Q8_0-00001-of-00003__rocm6_4_3-rocwmma__hblt0__fa1.log @@ -2,9 +2,5 @@ ggml_cuda_init: GGML_CUDA_FORCE_MMQ: no ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no ggml_cuda_init: found 1 ROCm devices: Device 0: Radeon 8060S Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32 -| model | size | params | backend | ngl | fa | mmap | test | t/s | -| ------------------------------ | ---------: | ---------: | ---------- | --: | -: | ---: | --------------: | -------------------: | -| llama4 17Bx16E (Scout) Q8_0 | 106.65 GiB | 107.77 B | ROCm | 99 | 1 | 0 | pp512 | 140.27 ± 0.97 | -| llama4 17Bx16E (Scout) Q8_0 | 106.65 GiB | 107.77 B | ROCm | 99 | 1 | 0 | tg128 | 11.74 ± 0.00 | - -build: 1fe00296 (6182) +HW Exception by GPU node-1 (Agent handle: 0x33cad340) reason :GPU Hang +✖ ! [rocm6_4_3-rocwmma] Llama-4-Scout-17B-16E-Instruct-Q8_0-00001-of-00003 __hblt0__fa1 failed (exit 134) diff --git a/benchmark/results/Llama-4-Scout-17B-16E-Instruct-Q8_0-00001-of-00003__rocm6_4_3.log b/benchmark/results/Llama-4-Scout-17B-16E-Instruct-Q8_0-00001-of-00003__rocm6_4_3.log index 8ac0514..052bfbb 100644 --- a/benchmark/results/Llama-4-Scout-17B-16E-Instruct-Q8_0-00001-of-00003__rocm6_4_3.log +++ b/benchmark/results/Llama-4-Scout-17B-16E-Instruct-Q8_0-00001-of-00003__rocm6_4_3.log @@ -7,5 +7,9 @@ This message will be only be displayed once, unless the ROCBLAS_VERBOSE_HIPBLASL rocBLAS warning: hipBlasLT failed, falling back to tensile. This message will be only be displayed once, unless the ROCBLAS_VERBOSE_TENSILE_ERROR environment variable is set. -HW Exception by GPU node-1 (Agent handle: 0x2079b340) reason :GPU Hang -✖ ! [rocm6_4_3] Llama-4-Scout-17B-16E-Instruct-Q8_0-00001-of-00003 failed (exit 134) +| model | size | params | backend | ngl | mmap | test | t/s | +| ------------------------------ | ---------: | ---------: | ---------- | --: | ---: | --------------: | -------------------: | +| llama4 17Bx16E (Scout) Q8_0 | 106.65 GiB | 107.77 B | ROCm | 99 | 0 | pp512 | 274.49 ± 1.84 | +| llama4 17Bx16E (Scout) Q8_0 | 106.65 GiB | 107.77 B | ROCm | 99 | 0 | tg128 | 11.91 ± 0.01 | + +build: f1fbffb5 (6486) diff --git a/benchmark/results/Llama-4-Scout-17B-16E-Instruct-Q8_0-00001-of-00003__rocm6_4_3__fa1.log b/benchmark/results/Llama-4-Scout-17B-16E-Instruct-Q8_0-00001-of-00003__rocm6_4_3__fa1.log index 8d6068b..5571471 100644 --- a/benchmark/results/Llama-4-Scout-17B-16E-Instruct-Q8_0-00001-of-00003__rocm6_4_3__fa1.log +++ b/benchmark/results/Llama-4-Scout-17B-16E-Instruct-Q8_0-00001-of-00003__rocm6_4_3__fa1.log @@ -2,5 +2,9 @@ ggml_cuda_init: GGML_CUDA_FORCE_MMQ: no ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no ggml_cuda_init: found 1 ROCm devices: Device 0: Radeon 8060S Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32 -Memory access fault by GPU node-1 (Agent handle: 0x37ff7340) on address 0x7fa76bba9000. Reason: Page not present or supervisor privilege. -✖ ! [rocm6_4_3] Llama-4-Scout-17B-16E-Instruct-Q8_0-00001-of-00003 __fa1 failed (exit 134) +| model | size | params | backend | ngl | fa | mmap | test | t/s | +| ------------------------------ | ---------: | ---------: | ---------- | --: | -: | ---: | --------------: | -------------------: | +| llama4 17Bx16E (Scout) Q8_0 | 106.65 GiB | 107.77 B | ROCm | 99 | 1 | 0 | pp512 | 298.07 ± 2.73 | +| llama4 17Bx16E (Scout) Q8_0 | 106.65 GiB | 107.77 B | ROCm | 99 | 1 | 0 | tg128 | 11.89 ± 0.06 | + +build: f1fbffb5 (6486) diff --git a/benchmark/results/Llama-4-Scout-17B-16E-Instruct-Q8_0-00001-of-00003__rocm6_4_3__hblt0.log b/benchmark/results/Llama-4-Scout-17B-16E-Instruct-Q8_0-00001-of-00003__rocm6_4_3__hblt0.log index 2add86a..27d8d9a 100644 --- a/benchmark/results/Llama-4-Scout-17B-16E-Instruct-Q8_0-00001-of-00003__rocm6_4_3__hblt0.log +++ b/benchmark/results/Llama-4-Scout-17B-16E-Instruct-Q8_0-00001-of-00003__rocm6_4_3__hblt0.log @@ -2,5 +2,5 @@ ggml_cuda_init: GGML_CUDA_FORCE_MMQ: no ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no ggml_cuda_init: found 1 ROCm devices: Device 0: Radeon 8060S Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32 -HW Exception by GPU node-1 (Agent handle: 0x2a344340) reason :GPU Hang +HW Exception by GPU node-1 (Agent handle: 0x1ac74310) reason :GPU Hang ✖ ! [rocm6_4_3] Llama-4-Scout-17B-16E-Instruct-Q8_0-00001-of-00003 __hblt0 failed (exit 134) diff --git a/benchmark/results/Llama-4-Scout-17B-16E-Instruct-Q8_0-00001-of-00003__rocm6_4_3__hblt0__fa1.log b/benchmark/results/Llama-4-Scout-17B-16E-Instruct-Q8_0-00001-of-00003__rocm6_4_3__hblt0__fa1.log index 977948c..6e6a4d0 100644 --- a/benchmark/results/Llama-4-Scout-17B-16E-Instruct-Q8_0-00001-of-00003__rocm6_4_3__hblt0__fa1.log +++ b/benchmark/results/Llama-4-Scout-17B-16E-Instruct-Q8_0-00001-of-00003__rocm6_4_3__hblt0__fa1.log @@ -2,5 +2,5 @@ ggml_cuda_init: GGML_CUDA_FORCE_MMQ: no ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no ggml_cuda_init: found 1 ROCm devices: Device 0: Radeon 8060S Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32 -Memory access fault by GPU node-1 (Agent handle: 0x27934340) on address 0x7f656656f000. Reason: Page not present or supervisor privilege. +HW Exception by GPU node-1 (Agent handle: 0x390d2310) reason :GPU Hang ✖ ! [rocm6_4_3] Llama-4-Scout-17B-16E-Instruct-Q8_0-00001-of-00003 __hblt0__fa1 failed (exit 134) diff --git a/benchmark/results/Llama-4-Scout-17B-16E-Instruct-Q8_0-00001-of-00003__rocm7_rc-rocwmma-fa_all_quants.log b/benchmark/results/Llama-4-Scout-17B-16E-Instruct-Q8_0-00001-of-00003__rocm7_rc-rocwmma-fa_all_quants.log new file mode 100644 index 0000000..7928822 --- /dev/null +++ b/benchmark/results/Llama-4-Scout-17B-16E-Instruct-Q8_0-00001-of-00003__rocm7_rc-rocwmma-fa_all_quants.log @@ -0,0 +1,10 @@ +ggml_cuda_init: GGML_CUDA_FORCE_MMQ: no +ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no +ggml_cuda_init: found 1 ROCm devices: + Device 0: AMD Radeon Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32 +| model | size | params | backend | ngl | mmap | test | t/s | +| ------------------------------ | ---------: | ---------: | ---------- | --: | ---: | --------------: | -------------------: | +| llama4 17Bx16E (Scout) Q8_0 | 106.65 GiB | 107.77 B | ROCm | 99 | 0 | pp512 | 275.21 ± 1.93 | +| llama4 17Bx16E (Scout) Q8_0 | 106.65 GiB | 107.77 B | ROCm | 99 | 0 | tg128 | 11.85 ± 0.00 | + +build: f1fbffb5 (6486) diff --git a/benchmark/results/Llama-4-Scout-17B-16E-Instruct-Q8_0-00001-of-00003__rocm7_rc-rocwmma-fa_all_quants__fa1.log b/benchmark/results/Llama-4-Scout-17B-16E-Instruct-Q8_0-00001-of-00003__rocm7_rc-rocwmma-fa_all_quants__fa1.log new file mode 100644 index 0000000..1e1c530 --- /dev/null +++ b/benchmark/results/Llama-4-Scout-17B-16E-Instruct-Q8_0-00001-of-00003__rocm7_rc-rocwmma-fa_all_quants__fa1.log @@ -0,0 +1,10 @@ +ggml_cuda_init: GGML_CUDA_FORCE_MMQ: no +ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no +ggml_cuda_init: found 1 ROCm devices: + Device 0: AMD Radeon Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32 +| model | size | params | backend | ngl | fa | mmap | test | t/s | +| ------------------------------ | ---------: | ---------: | ---------- | --: | -: | ---: | --------------: | -------------------: | +| llama4 17Bx16E (Scout) Q8_0 | 106.65 GiB | 107.77 B | ROCm | 99 | 1 | 0 | pp512 | 292.69 ± 2.25 | +| llama4 17Bx16E (Scout) Q8_0 | 106.65 GiB | 107.77 B | ROCm | 99 | 1 | 0 | tg128 | 11.91 ± 0.00 | + +build: f1fbffb5 (6486) diff --git a/benchmark/results/Llama-4-Scout-17B-16E-Instruct-Q8_0-00001-of-00003__rocm7_rc-rocwmma-fa_all_quants__hblt0.log b/benchmark/results/Llama-4-Scout-17B-16E-Instruct-Q8_0-00001-of-00003__rocm7_rc-rocwmma-fa_all_quants__hblt0.log new file mode 100644 index 0000000..7b4105e --- /dev/null +++ b/benchmark/results/Llama-4-Scout-17B-16E-Instruct-Q8_0-00001-of-00003__rocm7_rc-rocwmma-fa_all_quants__hblt0.log @@ -0,0 +1,10 @@ +ggml_cuda_init: GGML_CUDA_FORCE_MMQ: no +ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no +ggml_cuda_init: found 1 ROCm devices: + Device 0: AMD Radeon Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32 +| model | size | params | backend | ngl | mmap | test | t/s | +| ------------------------------ | ---------: | ---------: | ---------- | --: | ---: | --------------: | -------------------: | +| llama4 17Bx16E (Scout) Q8_0 | 106.65 GiB | 107.77 B | ROCm | 99 | 0 | pp512 | 267.51 ± 12.72 | +| llama4 17Bx16E (Scout) Q8_0 | 106.65 GiB | 107.77 B | ROCm | 99 | 0 | tg128 | 11.90 ± 0.04 | + +build: f1fbffb5 (6486) diff --git a/benchmark/results/Llama-4-Scout-17B-16E-Instruct-Q8_0-00001-of-00003__rocm7_rc-rocwmma-fa_all_quants__hblt0__fa1.log b/benchmark/results/Llama-4-Scout-17B-16E-Instruct-Q8_0-00001-of-00003__rocm7_rc-rocwmma-fa_all_quants__hblt0__fa1.log new file mode 100644 index 0000000..546aac5 --- /dev/null +++ b/benchmark/results/Llama-4-Scout-17B-16E-Instruct-Q8_0-00001-of-00003__rocm7_rc-rocwmma-fa_all_quants__hblt0__fa1.log @@ -0,0 +1,10 @@ +ggml_cuda_init: GGML_CUDA_FORCE_MMQ: no +ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no +ggml_cuda_init: found 1 ROCm devices: + Device 0: AMD Radeon Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32 +| model | size | params | backend | ngl | fa | mmap | test | t/s | +| ------------------------------ | ---------: | ---------: | ---------- | --: | -: | ---: | --------------: | -------------------: | +| llama4 17Bx16E (Scout) Q8_0 | 106.65 GiB | 107.77 B | ROCm | 99 | 1 | 0 | pp512 | 286.25 ± 4.29 | +| llama4 17Bx16E (Scout) Q8_0 | 106.65 GiB | 107.77 B | ROCm | 99 | 1 | 0 | tg128 | 11.90 ± 0.05 | + +build: f1fbffb5 (6486) diff --git a/benchmark/results/Llama-4-Scout-17B-16E-Instruct-Q8_0-00001-of-00003__rocm7_rc-rocwmma.log b/benchmark/results/Llama-4-Scout-17B-16E-Instruct-Q8_0-00001-of-00003__rocm7_rc-rocwmma.log index 03112eb..42ba624 100644 --- a/benchmark/results/Llama-4-Scout-17B-16E-Instruct-Q8_0-00001-of-00003__rocm7_rc-rocwmma.log +++ b/benchmark/results/Llama-4-Scout-17B-16E-Instruct-Q8_0-00001-of-00003__rocm7_rc-rocwmma.log @@ -4,7 +4,7 @@ ggml_cuda_init: found 1 ROCm devices: Device 0: AMD Radeon Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32 | model | size | params | backend | ngl | mmap | test | t/s | | ------------------------------ | ---------: | ---------: | ---------- | --: | ---: | --------------: | -------------------: | -| llama4 17Bx16E (Scout) Q8_0 | 106.65 GiB | 107.77 B | ROCm | 99 | 0 | pp512 | 279.13 ± 2.90 | -| llama4 17Bx16E (Scout) Q8_0 | 106.65 GiB | 107.77 B | ROCm | 99 | 0 | tg128 | 11.79 ± 0.07 | +| llama4 17Bx16E (Scout) Q8_0 | 106.65 GiB | 107.77 B | ROCm | 99 | 0 | pp512 | 260.60 ± 10.80 | +| llama4 17Bx16E (Scout) Q8_0 | 106.65 GiB | 107.77 B | ROCm | 99 | 0 | tg128 | 11.82 ± 0.22 | -build: de219279 (6181) +build: f1fbffb5 (6486) diff --git a/benchmark/results/Llama-4-Scout-17B-16E-Instruct-Q8_0-00001-of-00003__rocm7_rc-rocwmma__fa1.log b/benchmark/results/Llama-4-Scout-17B-16E-Instruct-Q8_0-00001-of-00003__rocm7_rc-rocwmma__fa1.log index 182bf9d..a58997b 100644 --- a/benchmark/results/Llama-4-Scout-17B-16E-Instruct-Q8_0-00001-of-00003__rocm7_rc-rocwmma__fa1.log +++ b/benchmark/results/Llama-4-Scout-17B-16E-Instruct-Q8_0-00001-of-00003__rocm7_rc-rocwmma__fa1.log @@ -4,7 +4,7 @@ ggml_cuda_init: found 1 ROCm devices: Device 0: AMD Radeon Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32 | model | size | params | backend | ngl | fa | mmap | test | t/s | | ------------------------------ | ---------: | ---------: | ---------- | --: | -: | ---: | --------------: | -------------------: | -| llama4 17Bx16E (Scout) Q8_0 | 106.65 GiB | 107.77 B | ROCm | 99 | 1 | 0 | pp512 | 293.60 ± 3.84 | -| llama4 17Bx16E (Scout) Q8_0 | 106.65 GiB | 107.77 B | ROCm | 99 | 1 | 0 | tg128 | 11.62 ± 0.02 | +| llama4 17Bx16E (Scout) Q8_0 | 106.65 GiB | 107.77 B | ROCm | 99 | 1 | 0 | pp512 | 293.26 ± 3.75 | +| llama4 17Bx16E (Scout) Q8_0 | 106.65 GiB | 107.77 B | ROCm | 99 | 1 | 0 | tg128 | 11.92 ± 0.01 | -build: de219279 (6181) +build: f1fbffb5 (6486) diff --git a/benchmark/results/Llama-4-Scout-17B-16E-Instruct-Q8_0-00001-of-00003__rocm7_rc-rocwmma__hblt0.log b/benchmark/results/Llama-4-Scout-17B-16E-Instruct-Q8_0-00001-of-00003__rocm7_rc-rocwmma__hblt0.log index 66c4a41..2b05052 100644 --- a/benchmark/results/Llama-4-Scout-17B-16E-Instruct-Q8_0-00001-of-00003__rocm7_rc-rocwmma__hblt0.log +++ b/benchmark/results/Llama-4-Scout-17B-16E-Instruct-Q8_0-00001-of-00003__rocm7_rc-rocwmma__hblt0.log @@ -4,7 +4,7 @@ ggml_cuda_init: found 1 ROCm devices: Device 0: AMD Radeon Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32 | model | size | params | backend | ngl | mmap | test | t/s | | ------------------------------ | ---------: | ---------: | ---------- | --: | ---: | --------------: | -------------------: | -| llama4 17Bx16E (Scout) Q8_0 | 106.65 GiB | 107.77 B | ROCm | 99 | 0 | pp512 | 264.02 ± 2.74 | -| llama4 17Bx16E (Scout) Q8_0 | 106.65 GiB | 107.77 B | ROCm | 99 | 0 | tg128 | 11.79 ± 0.06 | +| llama4 17Bx16E (Scout) Q8_0 | 106.65 GiB | 107.77 B | ROCm | 99 | 0 | pp512 | 275.91 ± 1.81 | +| llama4 17Bx16E (Scout) Q8_0 | 106.65 GiB | 107.77 B | ROCm | 99 | 0 | tg128 | 11.91 ± 0.00 | -build: de219279 (6181) +build: f1fbffb5 (6486) diff --git a/benchmark/results/Llama-4-Scout-17B-16E-Instruct-Q8_0-00001-of-00003__rocm7_rc-rocwmma__hblt0__fa1.log b/benchmark/results/Llama-4-Scout-17B-16E-Instruct-Q8_0-00001-of-00003__rocm7_rc-rocwmma__hblt0__fa1.log index 5439b6b..e5a0196 100644 --- a/benchmark/results/Llama-4-Scout-17B-16E-Instruct-Q8_0-00001-of-00003__rocm7_rc-rocwmma__hblt0__fa1.log +++ b/benchmark/results/Llama-4-Scout-17B-16E-Instruct-Q8_0-00001-of-00003__rocm7_rc-rocwmma__hblt0__fa1.log @@ -4,7 +4,7 @@ ggml_cuda_init: found 1 ROCm devices: Device 0: AMD Radeon Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32 | model | size | params | backend | ngl | fa | mmap | test | t/s | | ------------------------------ | ---------: | ---------: | ---------- | --: | -: | ---: | --------------: | -------------------: | -| llama4 17Bx16E (Scout) Q8_0 | 106.65 GiB | 107.77 B | ROCm | 99 | 1 | 0 | pp512 | 279.69 ± 2.30 | -| llama4 17Bx16E (Scout) Q8_0 | 106.65 GiB | 107.77 B | ROCm | 99 | 1 | 0 | tg128 | 11.60 ± 0.04 | +| llama4 17Bx16E (Scout) Q8_0 | 106.65 GiB | 107.77 B | ROCm | 99 | 1 | 0 | pp512 | 260.83 ± 5.18 | +| llama4 17Bx16E (Scout) Q8_0 | 106.65 GiB | 107.77 B | ROCm | 99 | 1 | 0 | tg128 | 11.82 ± 0.20 | -build: de219279 (6181) +build: f1fbffb5 (6486) diff --git a/benchmark/results/Llama-4-Scout-17B-16E-Instruct-Q8_0-00001-of-00003__rocm7_rc.log b/benchmark/results/Llama-4-Scout-17B-16E-Instruct-Q8_0-00001-of-00003__rocm7_rc.log index f3af1c9..268c6ee 100644 --- a/benchmark/results/Llama-4-Scout-17B-16E-Instruct-Q8_0-00001-of-00003__rocm7_rc.log +++ b/benchmark/results/Llama-4-Scout-17B-16E-Instruct-Q8_0-00001-of-00003__rocm7_rc.log @@ -4,7 +4,7 @@ ggml_cuda_init: found 1 ROCm devices: Device 0: AMD Radeon Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32 | model | size | params | backend | ngl | mmap | test | t/s | | ------------------------------ | ---------: | ---------: | ---------- | --: | ---: | --------------: | -------------------: | -| llama4 17Bx16E (Scout) Q8_0 | 106.65 GiB | 107.77 B | ROCm | 99 | 0 | pp512 | 252.38 ± 7.70 | -| llama4 17Bx16E (Scout) Q8_0 | 106.65 GiB | 107.77 B | ROCm | 99 | 0 | tg128 | 11.35 ± 0.60 | +| llama4 17Bx16E (Scout) Q8_0 | 106.65 GiB | 107.77 B | ROCm | 99 | 0 | pp512 | 279.56 ± 3.76 | +| llama4 17Bx16E (Scout) Q8_0 | 106.65 GiB | 107.77 B | ROCm | 99 | 0 | tg128 | 11.88 ± 0.02 | -build: de219279 (6181) +build: f1fbffb5 (6486) diff --git a/benchmark/results/Llama-4-Scout-17B-16E-Instruct-Q8_0-00001-of-00003__rocm7_rc__fa1.log b/benchmark/results/Llama-4-Scout-17B-16E-Instruct-Q8_0-00001-of-00003__rocm7_rc__fa1.log index 9932c99..a877428 100644 --- a/benchmark/results/Llama-4-Scout-17B-16E-Instruct-Q8_0-00001-of-00003__rocm7_rc__fa1.log +++ b/benchmark/results/Llama-4-Scout-17B-16E-Instruct-Q8_0-00001-of-00003__rocm7_rc__fa1.log @@ -2,5 +2,5 @@ ggml_cuda_init: GGML_CUDA_FORCE_MMQ: no ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no ggml_cuda_init: found 1 ROCm devices: Device 0: AMD Radeon Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32 -Memory access fault by GPU node-1 (Agent handle: 0x2e56aec0) on address 0x7f4102f6f000. Reason: Page not present or supervisor privilege. +HW Exception by GPU node-1 (Agent handle: 0x30007180) reason :GPU Hang ✖ ! [rocm7_rc] Llama-4-Scout-17B-16E-Instruct-Q8_0-00001-of-00003 __fa1 failed (exit 134) diff --git a/benchmark/results/Llama-4-Scout-17B-16E-Instruct-Q8_0-00001-of-00003__rocm7_rc__hblt0.log b/benchmark/results/Llama-4-Scout-17B-16E-Instruct-Q8_0-00001-of-00003__rocm7_rc__hblt0.log index d31f283..b6e5310 100644 --- a/benchmark/results/Llama-4-Scout-17B-16E-Instruct-Q8_0-00001-of-00003__rocm7_rc__hblt0.log +++ b/benchmark/results/Llama-4-Scout-17B-16E-Instruct-Q8_0-00001-of-00003__rocm7_rc__hblt0.log @@ -4,7 +4,7 @@ ggml_cuda_init: found 1 ROCm devices: Device 0: AMD Radeon Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32 | model | size | params | backend | ngl | mmap | test | t/s | | ------------------------------ | ---------: | ---------: | ---------- | --: | ---: | --------------: | -------------------: | -| llama4 17Bx16E (Scout) Q8_0 | 106.65 GiB | 107.77 B | ROCm | 99 | 0 | pp512 | 271.54 ± 4.10 | -| llama4 17Bx16E (Scout) Q8_0 | 106.65 GiB | 107.77 B | ROCm | 99 | 0 | tg128 | 11.57 ± 0.58 | +| llama4 17Bx16E (Scout) Q8_0 | 106.65 GiB | 107.77 B | ROCm | 99 | 0 | pp512 | 278.61 ± 2.47 | +| llama4 17Bx16E (Scout) Q8_0 | 106.65 GiB | 107.77 B | ROCm | 99 | 0 | tg128 | 11.92 ± 0.00 | -build: de219279 (6181) +build: f1fbffb5 (6486) diff --git a/benchmark/results/Llama-4-Scout-17B-16E-Instruct-Q8_0-00001-of-00003__rocm7_rc__hblt0__fa1.log b/benchmark/results/Llama-4-Scout-17B-16E-Instruct-Q8_0-00001-of-00003__rocm7_rc__hblt0__fa1.log index 986706b..19dcf24 100644 --- a/benchmark/results/Llama-4-Scout-17B-16E-Instruct-Q8_0-00001-of-00003__rocm7_rc__hblt0__fa1.log +++ b/benchmark/results/Llama-4-Scout-17B-16E-Instruct-Q8_0-00001-of-00003__rocm7_rc__hblt0__fa1.log @@ -2,5 +2,5 @@ ggml_cuda_init: GGML_CUDA_FORCE_MMQ: no ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no ggml_cuda_init: found 1 ROCm devices: Device 0: AMD Radeon Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32 -Memory access fault by GPU node-1 (Agent handle: 0x1aa83ec0) on address 0x7f9f1e96f000. Reason: Page not present or supervisor privilege. +HW Exception by GPU node-1 (Agent handle: 0x231e9180) reason :GPU Hang ✖ ! [rocm7_rc] Llama-4-Scout-17B-16E-Instruct-Q8_0-00001-of-00003 __hblt0__fa1 failed (exit 134) diff --git a/benchmark/results/Llama-4-Scout-17B-16E-Instruct-Q8_0-00001-of-00003__vulkan_amdvlk.log b/benchmark/results/Llama-4-Scout-17B-16E-Instruct-Q8_0-00001-of-00003__vulkan_amdvlk.log index 089be05..5436987 100644 --- a/benchmark/results/Llama-4-Scout-17B-16E-Instruct-Q8_0-00001-of-00003__vulkan_amdvlk.log +++ b/benchmark/results/Llama-4-Scout-17B-16E-Instruct-Q8_0-00001-of-00003__vulkan_amdvlk.log @@ -2,7 +2,7 @@ ggml_vulkan: Found 1 Vulkan devices: ggml_vulkan: 0 = Radeon 8060S Graphics (AMD open-source driver) | uma: 1 | fp16: 1 | bf16: 0 | warp size: 64 | shared memory: 32768 | int dot: 1 | matrix cores: KHR_coopmat | model | size | params | backend | ngl | mmap | test | t/s | | ------------------------------ | ---------: | ---------: | ---------- | --: | ---: | --------------: | -------------------: | -| llama4 17Bx16E (Scout) Q8_0 | 106.65 GiB | 107.77 B | Vulkan | 99 | 0 | pp512 | 258.54 ± 1.39 | -| llama4 17Bx16E (Scout) Q8_0 | 106.65 GiB | 107.77 B | Vulkan | 99 | 0 | tg128 | 12.45 ± 0.01 | +| llama4 17Bx16E (Scout) Q8_0 | 106.65 GiB | 107.77 B | Vulkan | 99 | 0 | pp512 | 343.36 ± 1.37 | +| llama4 17Bx16E (Scout) Q8_0 | 106.65 GiB | 107.77 B | Vulkan | 99 | 0 | tg128 | 12.57 ± 0.01 | -build: 1fe00296 (6182) +build: f1fbffb5 (6486) diff --git a/benchmark/results/Llama-4-Scout-17B-16E-Instruct-Q8_0-00001-of-00003__vulkan_amdvlk__fa1.log b/benchmark/results/Llama-4-Scout-17B-16E-Instruct-Q8_0-00001-of-00003__vulkan_amdvlk__fa1.log index 42f4672..1755f19 100644 --- a/benchmark/results/Llama-4-Scout-17B-16E-Instruct-Q8_0-00001-of-00003__vulkan_amdvlk__fa1.log +++ b/benchmark/results/Llama-4-Scout-17B-16E-Instruct-Q8_0-00001-of-00003__vulkan_amdvlk__fa1.log @@ -2,7 +2,7 @@ ggml_vulkan: Found 1 Vulkan devices: ggml_vulkan: 0 = Radeon 8060S Graphics (AMD open-source driver) | uma: 1 | fp16: 1 | bf16: 0 | warp size: 64 | shared memory: 32768 | int dot: 1 | matrix cores: KHR_coopmat | model | size | params | backend | ngl | fa | mmap | test | t/s | | ------------------------------ | ---------: | ---------: | ---------- | --: | -: | ---: | --------------: | -------------------: | -| llama4 17Bx16E (Scout) Q8_0 | 106.65 GiB | 107.77 B | Vulkan | 99 | 1 | 0 | pp512 | 262.84 ± 1.39 | -| llama4 17Bx16E (Scout) Q8_0 | 106.65 GiB | 107.77 B | Vulkan | 99 | 1 | 0 | tg128 | 12.30 ± 0.01 | +| llama4 17Bx16E (Scout) Q8_0 | 106.65 GiB | 107.77 B | Vulkan | 99 | 1 | 0 | pp512 | 347.56 ± 1.15 | +| llama4 17Bx16E (Scout) Q8_0 | 106.65 GiB | 107.77 B | Vulkan | 99 | 1 | 0 | tg128 | 12.42 ± 0.01 | -build: 1fe00296 (6182) +build: f1fbffb5 (6486) diff --git a/benchmark/results/Llama-4-Scout-17B-16E-Instruct-Q8_0-00001-of-00003__vulkan_radv.log b/benchmark/results/Llama-4-Scout-17B-16E-Instruct-Q8_0-00001-of-00003__vulkan_radv.log index e7df5fa..0d356f0 100644 --- a/benchmark/results/Llama-4-Scout-17B-16E-Instruct-Q8_0-00001-of-00003__vulkan_radv.log +++ b/benchmark/results/Llama-4-Scout-17B-16E-Instruct-Q8_0-00001-of-00003__vulkan_radv.log @@ -2,7 +2,7 @@ ggml_vulkan: Found 1 Vulkan devices: ggml_vulkan: 0 = Radeon 8060S Graphics (RADV GFX1151) (radv) | uma: 1 | fp16: 1 | bf16: 0 | warp size: 64 | shared memory: 65536 | int dot: 1 | matrix cores: KHR_coopmat | model | size | params | backend | ngl | mmap | test | t/s | | ------------------------------ | ---------: | ---------: | ---------- | --: | ---: | --------------: | -------------------: | -| llama4 17Bx16E (Scout) Q8_0 | 106.65 GiB | 107.77 B | Vulkan | 99 | 0 | pp512 | 169.23 ± 0.84 | -| llama4 17Bx16E (Scout) Q8_0 | 106.65 GiB | 107.77 B | Vulkan | 99 | 0 | tg128 | 12.45 ± 0.00 | +| llama4 17Bx16E (Scout) Q8_0 | 106.65 GiB | 107.77 B | Vulkan | 99 | 0 | pp512 | 244.52 ± 1.08 | +| llama4 17Bx16E (Scout) Q8_0 | 106.65 GiB | 107.77 B | Vulkan | 99 | 0 | tg128 | 12.57 ± 0.00 | -build: 1fe00296 (6182) +build: f1fbffb5 (6486) diff --git a/benchmark/results/Llama-4-Scout-17B-16E-Instruct-Q8_0-00001-of-00003__vulkan_radv__fa1.log b/benchmark/results/Llama-4-Scout-17B-16E-Instruct-Q8_0-00001-of-00003__vulkan_radv__fa1.log index 2776458..be8e1fa 100644 --- a/benchmark/results/Llama-4-Scout-17B-16E-Instruct-Q8_0-00001-of-00003__vulkan_radv__fa1.log +++ b/benchmark/results/Llama-4-Scout-17B-16E-Instruct-Q8_0-00001-of-00003__vulkan_radv__fa1.log @@ -2,7 +2,7 @@ ggml_vulkan: Found 1 Vulkan devices: ggml_vulkan: 0 = Radeon 8060S Graphics (RADV GFX1151) (radv) | uma: 1 | fp16: 1 | bf16: 0 | warp size: 64 | shared memory: 65536 | int dot: 1 | matrix cores: KHR_coopmat | model | size | params | backend | ngl | fa | mmap | test | t/s | | ------------------------------ | ---------: | ---------: | ---------- | --: | -: | ---: | --------------: | -------------------: | -| llama4 17Bx16E (Scout) Q8_0 | 106.65 GiB | 107.77 B | Vulkan | 99 | 1 | 0 | pp512 | 173.79 ± 0.85 | -| llama4 17Bx16E (Scout) Q8_0 | 106.65 GiB | 107.77 B | Vulkan | 99 | 1 | 0 | tg128 | 12.44 ± 0.01 | +| llama4 17Bx16E (Scout) Q8_0 | 106.65 GiB | 107.77 B | Vulkan | 99 | 1 | 0 | pp512 | 253.13 ± 1.36 | +| llama4 17Bx16E (Scout) Q8_0 | 106.65 GiB | 107.77 B | Vulkan | 99 | 1 | 0 | tg128 | 12.56 ± 0.00 | -build: 1fe00296 (6182) +build: f1fbffb5 (6486) diff --git a/benchmark/results/Llama-4-Scout-17B-16E-Instruct-UD-Q4_K_XL-00001-of-00002__rocm6_4_3-rocwmma.log b/benchmark/results/Llama-4-Scout-17B-16E-Instruct-UD-Q4_K_XL-00001-of-00002__rocm6_4_3-rocwmma.log index f500cd5..fb8f81b 100644 --- a/benchmark/results/Llama-4-Scout-17B-16E-Instruct-UD-Q4_K_XL-00001-of-00002__rocm6_4_3-rocwmma.log +++ b/benchmark/results/Llama-4-Scout-17B-16E-Instruct-UD-Q4_K_XL-00001-of-00002__rocm6_4_3-rocwmma.log @@ -7,9 +7,5 @@ This message will be only be displayed once, unless the ROCBLAS_VERBOSE_HIPBLASL rocBLAS warning: hipBlasLT failed, falling back to tensile. This message will be only be displayed once, unless the ROCBLAS_VERBOSE_TENSILE_ERROR environment variable is set. -| model | size | params | backend | ngl | mmap | test | t/s | -| ------------------------------ | ---------: | ---------: | ---------- | --: | ---: | --------------: | -------------------: | -| llama4 17Bx16E (Scout) Q4_K - Medium | 57.73 GiB | 107.77 B | ROCm | 99 | 0 | pp512 | 285.51 ± 1.64 | -| llama4 17Bx16E (Scout) Q4_K - Medium | 57.73 GiB | 107.77 B | ROCm | 99 | 0 | tg128 | 17.70 ± 0.01 | - -build: 1fe00296 (6182) +HW Exception by GPU node-1 (Agent handle: 0x1f49a340) reason :GPU Hang +✖ ! [rocm6_4_3-rocwmma] Llama-4-Scout-17B-16E-Instruct-UD-Q4_K_XL-00001-of-00002 failed (exit 134) diff --git a/benchmark/results/Llama-4-Scout-17B-16E-Instruct-UD-Q4_K_XL-00001-of-00002__rocm6_4_3-rocwmma__fa1.log b/benchmark/results/Llama-4-Scout-17B-16E-Instruct-UD-Q4_K_XL-00001-of-00002__rocm6_4_3-rocwmma__fa1.log index 85f9bfb..ca1ff97 100644 --- a/benchmark/results/Llama-4-Scout-17B-16E-Instruct-UD-Q4_K_XL-00001-of-00002__rocm6_4_3-rocwmma__fa1.log +++ b/benchmark/results/Llama-4-Scout-17B-16E-Instruct-UD-Q4_K_XL-00001-of-00002__rocm6_4_3-rocwmma__fa1.log @@ -4,7 +4,7 @@ ggml_cuda_init: found 1 ROCm devices: Device 0: Radeon 8060S Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32 | model | size | params | backend | ngl | fa | mmap | test | t/s | | ------------------------------ | ---------: | ---------: | ---------- | --: | -: | ---: | --------------: | -------------------: | -| llama4 17Bx16E (Scout) Q4_K - Medium | 57.73 GiB | 107.77 B | ROCm | 99 | 1 | 0 | pp512 | 308.62 ± 2.62 | -| llama4 17Bx16E (Scout) Q4_K - Medium | 57.73 GiB | 107.77 B | ROCm | 99 | 1 | 0 | tg128 | 17.54 ± 0.00 | +| llama4 17Bx16E (Scout) Q4_K - Medium | 57.73 GiB | 107.77 B | ROCm | 99 | 1 | 0 | pp512 | 307.79 ± 3.48 | +| llama4 17Bx16E (Scout) Q4_K - Medium | 57.73 GiB | 107.77 B | ROCm | 99 | 1 | 0 | tg128 | 17.81 ± 0.01 | -build: 1fe00296 (6182) +build: f1fbffb5 (6486) diff --git a/benchmark/results/Llama-4-Scout-17B-16E-Instruct-UD-Q4_K_XL-00001-of-00002__rocm6_4_3-rocwmma__hblt0.log b/benchmark/results/Llama-4-Scout-17B-16E-Instruct-UD-Q4_K_XL-00001-of-00002__rocm6_4_3-rocwmma__hblt0.log index 78e4255..a0f2615 100644 --- a/benchmark/results/Llama-4-Scout-17B-16E-Instruct-UD-Q4_K_XL-00001-of-00002__rocm6_4_3-rocwmma__hblt0.log +++ b/benchmark/results/Llama-4-Scout-17B-16E-Instruct-UD-Q4_K_XL-00001-of-00002__rocm6_4_3-rocwmma__hblt0.log @@ -4,7 +4,7 @@ ggml_cuda_init: found 1 ROCm devices: Device 0: Radeon 8060S Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32 | model | size | params | backend | ngl | mmap | test | t/s | | ------------------------------ | ---------: | ---------: | ---------- | --: | ---: | --------------: | -------------------: | -| llama4 17Bx16E (Scout) Q4_K - Medium | 57.73 GiB | 107.77 B | ROCm | 99 | 0 | pp512 | 137.71 ± 0.62 | -| llama4 17Bx16E (Scout) Q4_K - Medium | 57.73 GiB | 107.77 B | ROCm | 99 | 0 | tg128 | 17.71 ± 0.01 | +| llama4 17Bx16E (Scout) Q4_K - Medium | 57.73 GiB | 107.77 B | ROCm | 99 | 0 | pp512 | 138.51 ± 0.72 | +| llama4 17Bx16E (Scout) Q4_K - Medium | 57.73 GiB | 107.77 B | ROCm | 99 | 0 | tg128 | 17.80 ± 0.00 | -build: 1fe00296 (6182) +build: f1fbffb5 (6486) diff --git a/benchmark/results/Llama-4-Scout-17B-16E-Instruct-UD-Q4_K_XL-00001-of-00002__rocm6_4_3-rocwmma__hblt0__fa1.log b/benchmark/results/Llama-4-Scout-17B-16E-Instruct-UD-Q4_K_XL-00001-of-00002__rocm6_4_3-rocwmma__hblt0__fa1.log index 8945a72..1e52153 100644 --- a/benchmark/results/Llama-4-Scout-17B-16E-Instruct-UD-Q4_K_XL-00001-of-00002__rocm6_4_3-rocwmma__hblt0__fa1.log +++ b/benchmark/results/Llama-4-Scout-17B-16E-Instruct-UD-Q4_K_XL-00001-of-00002__rocm6_4_3-rocwmma__hblt0__fa1.log @@ -4,7 +4,7 @@ ggml_cuda_init: found 1 ROCm devices: Device 0: Radeon 8060S Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32 | model | size | params | backend | ngl | fa | mmap | test | t/s | | ------------------------------ | ---------: | ---------: | ---------- | --: | -: | ---: | --------------: | -------------------: | -| llama4 17Bx16E (Scout) Q4_K - Medium | 57.73 GiB | 107.77 B | ROCm | 99 | 1 | 0 | pp512 | 142.62 ± 0.82 | -| llama4 17Bx16E (Scout) Q4_K - Medium | 57.73 GiB | 107.77 B | ROCm | 99 | 1 | 0 | tg128 | 17.55 ± 0.00 | +| llama4 17Bx16E (Scout) Q4_K - Medium | 57.73 GiB | 107.77 B | ROCm | 99 | 1 | 0 | pp512 | 142.41 ± 0.57 | +| llama4 17Bx16E (Scout) Q4_K - Medium | 57.73 GiB | 107.77 B | ROCm | 99 | 1 | 0 | tg128 | 17.80 ± 0.00 | -build: 1fe00296 (6182) +build: f1fbffb5 (6486) diff --git a/benchmark/results/Llama-4-Scout-17B-16E-Instruct-UD-Q4_K_XL-00001-of-00002__rocm6_4_3.log b/benchmark/results/Llama-4-Scout-17B-16E-Instruct-UD-Q4_K_XL-00001-of-00002__rocm6_4_3.log index 5aa96bf..1039808 100644 --- a/benchmark/results/Llama-4-Scout-17B-16E-Instruct-UD-Q4_K_XL-00001-of-00002__rocm6_4_3.log +++ b/benchmark/results/Llama-4-Scout-17B-16E-Instruct-UD-Q4_K_XL-00001-of-00002__rocm6_4_3.log @@ -2,14 +2,14 @@ ggml_cuda_init: GGML_CUDA_FORCE_MMQ: no ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no ggml_cuda_init: found 1 ROCm devices: Device 0: Radeon 8060S Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32 -rocBLAS error: No hipBLASLt solution found +hipBLASLt error: Heuristic Fetch Failed! This message will be only be displayed once, unless the ROCBLAS_VERBOSE_HIPBLASLT_ERROR environment variable is set. rocBLAS warning: hipBlasLT failed, falling back to tensile. This message will be only be displayed once, unless the ROCBLAS_VERBOSE_TENSILE_ERROR environment variable is set. | model | size | params | backend | ngl | mmap | test | t/s | | ------------------------------ | ---------: | ---------: | ---------- | --: | ---: | --------------: | -------------------: | -| llama4 17Bx16E (Scout) Q4_K - Medium | 57.73 GiB | 107.77 B | ROCm | 99 | 0 | pp512 | 286.37 ± 1.44 | -| llama4 17Bx16E (Scout) Q4_K - Medium | 57.73 GiB | 107.77 B | ROCm | 99 | 0 | tg128 | 17.70 ± 0.00 | +| llama4 17Bx16E (Scout) Q4_K - Medium | 57.73 GiB | 107.77 B | ROCm | 99 | 0 | pp512 | 282.50 ± 1.23 | +| llama4 17Bx16E (Scout) Q4_K - Medium | 57.73 GiB | 107.77 B | ROCm | 99 | 0 | tg128 | 17.78 ± 0.00 | -build: 1fe00296 (6182) +build: f1fbffb5 (6486) diff --git a/benchmark/results/Llama-4-Scout-17B-16E-Instruct-UD-Q4_K_XL-00001-of-00002__rocm6_4_3__fa1.log b/benchmark/results/Llama-4-Scout-17B-16E-Instruct-UD-Q4_K_XL-00001-of-00002__rocm6_4_3__fa1.log index 98b05eb..720ab4a 100644 --- a/benchmark/results/Llama-4-Scout-17B-16E-Instruct-UD-Q4_K_XL-00001-of-00002__rocm6_4_3__fa1.log +++ b/benchmark/results/Llama-4-Scout-17B-16E-Instruct-UD-Q4_K_XL-00001-of-00002__rocm6_4_3__fa1.log @@ -4,7 +4,7 @@ ggml_cuda_init: found 1 ROCm devices: Device 0: Radeon 8060S Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32 | model | size | params | backend | ngl | fa | mmap | test | t/s | | ------------------------------ | ---------: | ---------: | ---------- | --: | -: | ---: | --------------: | -------------------: | -| llama4 17Bx16E (Scout) Q4_K - Medium | 57.73 GiB | 107.77 B | ROCm | 99 | 1 | 0 | pp512 | 234.68 ± 1.31 | -| llama4 17Bx16E (Scout) Q4_K - Medium | 57.73 GiB | 107.77 B | ROCm | 99 | 1 | 0 | tg128 | 17.71 ± 0.00 | +| llama4 17Bx16E (Scout) Q4_K - Medium | 57.73 GiB | 107.77 B | ROCm | 99 | 1 | 0 | pp512 | 306.89 ± 1.35 | +| llama4 17Bx16E (Scout) Q4_K - Medium | 57.73 GiB | 107.77 B | ROCm | 99 | 1 | 0 | tg128 | 17.82 ± 0.00 | -build: 1fe00296 (6182) +build: f1fbffb5 (6486) diff --git a/benchmark/results/Llama-4-Scout-17B-16E-Instruct-UD-Q4_K_XL-00001-of-00002__rocm6_4_3__hblt0.log b/benchmark/results/Llama-4-Scout-17B-16E-Instruct-UD-Q4_K_XL-00001-of-00002__rocm6_4_3__hblt0.log index 186cdf8..91a8494 100644 --- a/benchmark/results/Llama-4-Scout-17B-16E-Instruct-UD-Q4_K_XL-00001-of-00002__rocm6_4_3__hblt0.log +++ b/benchmark/results/Llama-4-Scout-17B-16E-Instruct-UD-Q4_K_XL-00001-of-00002__rocm6_4_3__hblt0.log @@ -2,5 +2,9 @@ ggml_cuda_init: GGML_CUDA_FORCE_MMQ: no ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no ggml_cuda_init: found 1 ROCm devices: Device 0: Radeon 8060S Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32 -HW Exception by GPU node-1 (Agent handle: 0xa3c7340) reason :GPU Hang -✖ ! [rocm6_4_3] Llama-4-Scout-17B-16E-Instruct-UD-Q4_K_XL-00001-of-00002 __hblt0 failed (exit 134) +| model | size | params | backend | ngl | mmap | test | t/s | +| ------------------------------ | ---------: | ---------: | ---------- | --: | ---: | --------------: | -------------------: | +| llama4 17Bx16E (Scout) Q4_K - Medium | 57.73 GiB | 107.77 B | ROCm | 99 | 0 | pp512 | 137.98 ± 0.67 | +| llama4 17Bx16E (Scout) Q4_K - Medium | 57.73 GiB | 107.77 B | ROCm | 99 | 0 | tg128 | 17.79 ± 0.00 | + +build: f1fbffb5 (6486) diff --git a/benchmark/results/Llama-4-Scout-17B-16E-Instruct-UD-Q4_K_XL-00001-of-00002__rocm6_4_3__hblt0__fa1.log b/benchmark/results/Llama-4-Scout-17B-16E-Instruct-UD-Q4_K_XL-00001-of-00002__rocm6_4_3__hblt0__fa1.log index 19b75a4..7b80e05 100644 --- a/benchmark/results/Llama-4-Scout-17B-16E-Instruct-UD-Q4_K_XL-00001-of-00002__rocm6_4_3__hblt0__fa1.log +++ b/benchmark/results/Llama-4-Scout-17B-16E-Instruct-UD-Q4_K_XL-00001-of-00002__rocm6_4_3__hblt0__fa1.log @@ -2,5 +2,9 @@ ggml_cuda_init: GGML_CUDA_FORCE_MMQ: no ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no ggml_cuda_init: found 1 ROCm devices: Device 0: Radeon 8060S Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32 -HW Exception by GPU node-1 (Agent handle: 0x118c6340) reason :GPU Hang -✖ ! [rocm6_4_3] Llama-4-Scout-17B-16E-Instruct-UD-Q4_K_XL-00001-of-00002 __hblt0__fa1 failed (exit 134) +| model | size | params | backend | ngl | fa | mmap | test | t/s | +| ------------------------------ | ---------: | ---------: | ---------- | --: | -: | ---: | --------------: | -------------------: | +| llama4 17Bx16E (Scout) Q4_K - Medium | 57.73 GiB | 107.77 B | ROCm | 99 | 1 | 0 | pp512 | 143.18 ± 0.45 | +| llama4 17Bx16E (Scout) Q4_K - Medium | 57.73 GiB | 107.77 B | ROCm | 99 | 1 | 0 | tg128 | 17.81 ± 0.00 | + +build: f1fbffb5 (6486) diff --git a/benchmark/results/Llama-4-Scout-17B-16E-Instruct-UD-Q4_K_XL-00001-of-00002__rocm7_rc-rocwmma-fa_all_quants.log b/benchmark/results/Llama-4-Scout-17B-16E-Instruct-UD-Q4_K_XL-00001-of-00002__rocm7_rc-rocwmma-fa_all_quants.log new file mode 100644 index 0000000..86b7922 --- /dev/null +++ b/benchmark/results/Llama-4-Scout-17B-16E-Instruct-UD-Q4_K_XL-00001-of-00002__rocm7_rc-rocwmma-fa_all_quants.log @@ -0,0 +1,10 @@ +ggml_cuda_init: GGML_CUDA_FORCE_MMQ: no +ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no +ggml_cuda_init: found 1 ROCm devices: + Device 0: AMD Radeon Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32 +| model | size | params | backend | ngl | mmap | test | t/s | +| ------------------------------ | ---------: | ---------: | ---------- | --: | ---: | --------------: | -------------------: | +| llama4 17Bx16E (Scout) Q4_K - Medium | 57.73 GiB | 107.77 B | ROCm | 99 | 0 | pp512 | 283.86 ± 1.18 | +| llama4 17Bx16E (Scout) Q4_K - Medium | 57.73 GiB | 107.77 B | ROCm | 99 | 0 | tg128 | 17.71 ± 0.00 | + +build: f1fbffb5 (6486) diff --git a/benchmark/results/Llama-4-Scout-17B-16E-Instruct-UD-Q4_K_XL-00001-of-00002__rocm7_rc-rocwmma-fa_all_quants__fa1.log b/benchmark/results/Llama-4-Scout-17B-16E-Instruct-UD-Q4_K_XL-00001-of-00002__rocm7_rc-rocwmma-fa_all_quants__fa1.log new file mode 100644 index 0000000..562f9a8 --- /dev/null +++ b/benchmark/results/Llama-4-Scout-17B-16E-Instruct-UD-Q4_K_XL-00001-of-00002__rocm7_rc-rocwmma-fa_all_quants__fa1.log @@ -0,0 +1,10 @@ +ggml_cuda_init: GGML_CUDA_FORCE_MMQ: no +ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no +ggml_cuda_init: found 1 ROCm devices: + Device 0: AMD Radeon Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32 +| model | size | params | backend | ngl | fa | mmap | test | t/s | +| ------------------------------ | ---------: | ---------: | ---------- | --: | -: | ---: | --------------: | -------------------: | +| llama4 17Bx16E (Scout) Q4_K - Medium | 57.73 GiB | 107.77 B | ROCm | 99 | 1 | 0 | pp512 | 299.13 ± 2.14 | +| llama4 17Bx16E (Scout) Q4_K - Medium | 57.73 GiB | 107.77 B | ROCm | 99 | 1 | 0 | tg128 | 17.77 ± 0.00 | + +build: f1fbffb5 (6486) diff --git a/benchmark/results/Llama-4-Scout-17B-16E-Instruct-UD-Q4_K_XL-00001-of-00002__rocm7_rc-rocwmma-fa_all_quants__hblt0.log b/benchmark/results/Llama-4-Scout-17B-16E-Instruct-UD-Q4_K_XL-00001-of-00002__rocm7_rc-rocwmma-fa_all_quants__hblt0.log new file mode 100644 index 0000000..5230084 --- /dev/null +++ b/benchmark/results/Llama-4-Scout-17B-16E-Instruct-UD-Q4_K_XL-00001-of-00002__rocm7_rc-rocwmma-fa_all_quants__hblt0.log @@ -0,0 +1,10 @@ +ggml_cuda_init: GGML_CUDA_FORCE_MMQ: no +ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no +ggml_cuda_init: found 1 ROCm devices: + Device 0: AMD Radeon Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32 +| model | size | params | backend | ngl | mmap | test | t/s | +| ------------------------------ | ---------: | ---------: | ---------- | --: | ---: | --------------: | -------------------: | +| llama4 17Bx16E (Scout) Q4_K - Medium | 57.73 GiB | 107.77 B | ROCm | 99 | 0 | pp512 | 286.66 ± 1.37 | +| llama4 17Bx16E (Scout) Q4_K - Medium | 57.73 GiB | 107.77 B | ROCm | 99 | 0 | tg128 | 17.79 ± 0.00 | + +build: f1fbffb5 (6486) diff --git a/benchmark/results/Llama-4-Scout-17B-16E-Instruct-UD-Q4_K_XL-00001-of-00002__rocm7_rc-rocwmma-fa_all_quants__hblt0__fa1.log b/benchmark/results/Llama-4-Scout-17B-16E-Instruct-UD-Q4_K_XL-00001-of-00002__rocm7_rc-rocwmma-fa_all_quants__hblt0__fa1.log new file mode 100644 index 0000000..8db788a --- /dev/null +++ b/benchmark/results/Llama-4-Scout-17B-16E-Instruct-UD-Q4_K_XL-00001-of-00002__rocm7_rc-rocwmma-fa_all_quants__hblt0__fa1.log @@ -0,0 +1,10 @@ +ggml_cuda_init: GGML_CUDA_FORCE_MMQ: no +ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no +ggml_cuda_init: found 1 ROCm devices: + Device 0: AMD Radeon Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32 +| model | size | params | backend | ngl | fa | mmap | test | t/s | +| ------------------------------ | ---------: | ---------: | ---------- | --: | -: | ---: | --------------: | -------------------: | +| llama4 17Bx16E (Scout) Q4_K - Medium | 57.73 GiB | 107.77 B | ROCm | 99 | 1 | 0 | pp512 | 300.00 ± 1.51 | +| llama4 17Bx16E (Scout) Q4_K - Medium | 57.73 GiB | 107.77 B | ROCm | 99 | 1 | 0 | tg128 | 17.82 ± 0.00 | + +build: f1fbffb5 (6486) diff --git a/benchmark/results/Llama-4-Scout-17B-16E-Instruct-UD-Q4_K_XL-00001-of-00002__rocm7_rc-rocwmma.log b/benchmark/results/Llama-4-Scout-17B-16E-Instruct-UD-Q4_K_XL-00001-of-00002__rocm7_rc-rocwmma.log index 54d6795..7ce4256 100644 --- a/benchmark/results/Llama-4-Scout-17B-16E-Instruct-UD-Q4_K_XL-00001-of-00002__rocm7_rc-rocwmma.log +++ b/benchmark/results/Llama-4-Scout-17B-16E-Instruct-UD-Q4_K_XL-00001-of-00002__rocm7_rc-rocwmma.log @@ -4,7 +4,7 @@ ggml_cuda_init: found 1 ROCm devices: Device 0: AMD Radeon Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32 | model | size | params | backend | ngl | mmap | test | t/s | | ------------------------------ | ---------: | ---------: | ---------- | --: | ---: | --------------: | -------------------: | -| llama4 17Bx16E (Scout) Q4_K - Medium | 57.73 GiB | 107.77 B | ROCm | 99 | 0 | pp512 | 290.54 ± 1.59 | -| llama4 17Bx16E (Scout) Q4_K - Medium | 57.73 GiB | 107.77 B | ROCm | 99 | 0 | tg128 | 17.67 ± 0.01 | +| llama4 17Bx16E (Scout) Q4_K - Medium | 57.73 GiB | 107.77 B | ROCm | 99 | 0 | pp512 | 284.38 ± 0.76 | +| llama4 17Bx16E (Scout) Q4_K - Medium | 57.73 GiB | 107.77 B | ROCm | 99 | 0 | tg128 | 17.82 ± 0.00 | -build: de219279 (6181) +build: f1fbffb5 (6486) diff --git a/benchmark/results/Llama-4-Scout-17B-16E-Instruct-UD-Q4_K_XL-00001-of-00002__rocm7_rc-rocwmma__fa1.log b/benchmark/results/Llama-4-Scout-17B-16E-Instruct-UD-Q4_K_XL-00001-of-00002__rocm7_rc-rocwmma__fa1.log index 6f81260..82d7419 100644 --- a/benchmark/results/Llama-4-Scout-17B-16E-Instruct-UD-Q4_K_XL-00001-of-00002__rocm7_rc-rocwmma__fa1.log +++ b/benchmark/results/Llama-4-Scout-17B-16E-Instruct-UD-Q4_K_XL-00001-of-00002__rocm7_rc-rocwmma__fa1.log @@ -4,7 +4,7 @@ ggml_cuda_init: found 1 ROCm devices: Device 0: AMD Radeon Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32 | model | size | params | backend | ngl | fa | mmap | test | t/s | | ------------------------------ | ---------: | ---------: | ---------- | --: | -: | ---: | --------------: | -------------------: | -| llama4 17Bx16E (Scout) Q4_K - Medium | 57.73 GiB | 107.77 B | ROCm | 99 | 1 | 0 | pp512 | 304.99 ± 0.37 | -| llama4 17Bx16E (Scout) Q4_K - Medium | 57.73 GiB | 107.77 B | ROCm | 99 | 1 | 0 | tg128 | 17.28 ± 0.00 | +| llama4 17Bx16E (Scout) Q4_K - Medium | 57.73 GiB | 107.77 B | ROCm | 99 | 1 | 0 | pp512 | 306.40 ± 1.77 | +| llama4 17Bx16E (Scout) Q4_K - Medium | 57.73 GiB | 107.77 B | ROCm | 99 | 1 | 0 | tg128 | 17.78 ± 0.00 | -build: de219279 (6181) +build: f1fbffb5 (6486) diff --git a/benchmark/results/Llama-4-Scout-17B-16E-Instruct-UD-Q4_K_XL-00001-of-00002__rocm7_rc-rocwmma__hblt0.log b/benchmark/results/Llama-4-Scout-17B-16E-Instruct-UD-Q4_K_XL-00001-of-00002__rocm7_rc-rocwmma__hblt0.log index 721ffc5..fe0f034 100644 --- a/benchmark/results/Llama-4-Scout-17B-16E-Instruct-UD-Q4_K_XL-00001-of-00002__rocm7_rc-rocwmma__hblt0.log +++ b/benchmark/results/Llama-4-Scout-17B-16E-Instruct-UD-Q4_K_XL-00001-of-00002__rocm7_rc-rocwmma__hblt0.log @@ -4,7 +4,7 @@ ggml_cuda_init: found 1 ROCm devices: Device 0: AMD Radeon Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32 | model | size | params | backend | ngl | mmap | test | t/s | | ------------------------------ | ---------: | ---------: | ---------- | --: | ---: | --------------: | -------------------: | -| llama4 17Bx16E (Scout) Q4_K - Medium | 57.73 GiB | 107.77 B | ROCm | 99 | 0 | pp512 | 283.93 ± 1.57 | -| llama4 17Bx16E (Scout) Q4_K - Medium | 57.73 GiB | 107.77 B | ROCm | 99 | 0 | tg128 | 17.65 ± 0.00 | +| llama4 17Bx16E (Scout) Q4_K - Medium | 57.73 GiB | 107.77 B | ROCm | 99 | 0 | pp512 | 286.08 ± 2.96 | +| llama4 17Bx16E (Scout) Q4_K - Medium | 57.73 GiB | 107.77 B | ROCm | 99 | 0 | tg128 | 17.82 ± 0.00 | -build: de219279 (6181) +build: f1fbffb5 (6486) diff --git a/benchmark/results/Llama-4-Scout-17B-16E-Instruct-UD-Q4_K_XL-00001-of-00002__rocm7_rc-rocwmma__hblt0__fa1.log b/benchmark/results/Llama-4-Scout-17B-16E-Instruct-UD-Q4_K_XL-00001-of-00002__rocm7_rc-rocwmma__hblt0__fa1.log index 241fb21..1ae0137 100644 --- a/benchmark/results/Llama-4-Scout-17B-16E-Instruct-UD-Q4_K_XL-00001-of-00002__rocm7_rc-rocwmma__hblt0__fa1.log +++ b/benchmark/results/Llama-4-Scout-17B-16E-Instruct-UD-Q4_K_XL-00001-of-00002__rocm7_rc-rocwmma__hblt0__fa1.log @@ -4,7 +4,7 @@ ggml_cuda_init: found 1 ROCm devices: Device 0: AMD Radeon Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32 | model | size | params | backend | ngl | fa | mmap | test | t/s | | ------------------------------ | ---------: | ---------: | ---------- | --: | -: | ---: | --------------: | -------------------: | -| llama4 17Bx16E (Scout) Q4_K - Medium | 57.73 GiB | 107.77 B | ROCm | 99 | 1 | 0 | pp512 | 300.13 ± 1.26 | -| llama4 17Bx16E (Scout) Q4_K - Medium | 57.73 GiB | 107.77 B | ROCm | 99 | 1 | 0 | tg128 | 17.27 ± 0.00 | +| llama4 17Bx16E (Scout) Q4_K - Medium | 57.73 GiB | 107.77 B | ROCm | 99 | 1 | 0 | pp512 | 297.71 ± 1.73 | +| llama4 17Bx16E (Scout) Q4_K - Medium | 57.73 GiB | 107.77 B | ROCm | 99 | 1 | 0 | tg128 | 17.78 ± 0.00 | -build: de219279 (6181) +build: f1fbffb5 (6486) diff --git a/benchmark/results/Llama-4-Scout-17B-16E-Instruct-UD-Q4_K_XL-00001-of-00002__rocm7_rc.log b/benchmark/results/Llama-4-Scout-17B-16E-Instruct-UD-Q4_K_XL-00001-of-00002__rocm7_rc.log index 19895f7..2464692 100644 --- a/benchmark/results/Llama-4-Scout-17B-16E-Instruct-UD-Q4_K_XL-00001-of-00002__rocm7_rc.log +++ b/benchmark/results/Llama-4-Scout-17B-16E-Instruct-UD-Q4_K_XL-00001-of-00002__rocm7_rc.log @@ -2,9 +2,5 @@ ggml_cuda_init: GGML_CUDA_FORCE_MMQ: no ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no ggml_cuda_init: found 1 ROCm devices: Device 0: AMD Radeon Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32 -| model | size | params | backend | ngl | mmap | test | t/s | -| ------------------------------ | ---------: | ---------: | ---------- | --: | ---: | --------------: | -------------------: | -| llama4 17Bx16E (Scout) Q4_K - Medium | 57.73 GiB | 107.77 B | ROCm | 99 | 0 | pp512 | 291.60 ± 1.95 | -| llama4 17Bx16E (Scout) Q4_K - Medium | 57.73 GiB | 107.77 B | ROCm | 99 | 0 | tg128 | 17.73 ± 0.00 | - -build: de219279 (6181) +Memory access fault by GPU node-1 (Agent handle: 0x4092b180) on address 0x7fe1ddb56000. Reason: Page not present or supervisor privilege. +✖ ! [rocm7_rc] Llama-4-Scout-17B-16E-Instruct-UD-Q4_K_XL-00001-of-00002 failed (exit 134) diff --git a/benchmark/results/Llama-4-Scout-17B-16E-Instruct-UD-Q4_K_XL-00001-of-00002__rocm7_rc__fa1.log b/benchmark/results/Llama-4-Scout-17B-16E-Instruct-UD-Q4_K_XL-00001-of-00002__rocm7_rc__fa1.log index d607e30..a5a4c8a 100644 --- a/benchmark/results/Llama-4-Scout-17B-16E-Instruct-UD-Q4_K_XL-00001-of-00002__rocm7_rc__fa1.log +++ b/benchmark/results/Llama-4-Scout-17B-16E-Instruct-UD-Q4_K_XL-00001-of-00002__rocm7_rc__fa1.log @@ -2,5 +2,5 @@ ggml_cuda_init: GGML_CUDA_FORCE_MMQ: no ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no ggml_cuda_init: found 1 ROCm devices: Device 0: AMD Radeon Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32 -Memory access fault by GPU node-1 (Agent handle: 0x134adec0) on address 0x7f0318984000. Reason: Page not present or supervisor privilege. +Memory access fault by GPU node-1 (Agent handle: 0x2b6cf180) on address 0x7f9694f56000. Reason: Page not present or supervisor privilege. ✖ ! [rocm7_rc] Llama-4-Scout-17B-16E-Instruct-UD-Q4_K_XL-00001-of-00002 __fa1 failed (exit 134) diff --git a/benchmark/results/Llama-4-Scout-17B-16E-Instruct-UD-Q4_K_XL-00001-of-00002__rocm7_rc__hblt0.log b/benchmark/results/Llama-4-Scout-17B-16E-Instruct-UD-Q4_K_XL-00001-of-00002__rocm7_rc__hblt0.log index a0c7756..a9689cc 100644 --- a/benchmark/results/Llama-4-Scout-17B-16E-Instruct-UD-Q4_K_XL-00001-of-00002__rocm7_rc__hblt0.log +++ b/benchmark/results/Llama-4-Scout-17B-16E-Instruct-UD-Q4_K_XL-00001-of-00002__rocm7_rc__hblt0.log @@ -4,7 +4,7 @@ ggml_cuda_init: found 1 ROCm devices: Device 0: AMD Radeon Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32 | model | size | params | backend | ngl | mmap | test | t/s | | ------------------------------ | ---------: | ---------: | ---------- | --: | ---: | --------------: | -------------------: | -| llama4 17Bx16E (Scout) Q4_K - Medium | 57.73 GiB | 107.77 B | ROCm | 99 | 0 | pp512 | 285.56 ± 1.41 | -| llama4 17Bx16E (Scout) Q4_K - Medium | 57.73 GiB | 107.77 B | ROCm | 99 | 0 | tg128 | 17.72 ± 0.00 | +| llama4 17Bx16E (Scout) Q4_K - Medium | 57.73 GiB | 107.77 B | ROCm | 99 | 0 | pp512 | 284.17 ± 2.14 | +| llama4 17Bx16E (Scout) Q4_K - Medium | 57.73 GiB | 107.77 B | ROCm | 99 | 0 | tg128 | 17.80 ± 0.00 | -build: de219279 (6181) +build: f1fbffb5 (6486) diff --git a/benchmark/results/Llama-4-Scout-17B-16E-Instruct-UD-Q4_K_XL-00001-of-00002__rocm7_rc__hblt0__fa1.log b/benchmark/results/Llama-4-Scout-17B-16E-Instruct-UD-Q4_K_XL-00001-of-00002__rocm7_rc__hblt0__fa1.log index ae6829f..b0472bd 100644 --- a/benchmark/results/Llama-4-Scout-17B-16E-Instruct-UD-Q4_K_XL-00001-of-00002__rocm7_rc__hblt0__fa1.log +++ b/benchmark/results/Llama-4-Scout-17B-16E-Instruct-UD-Q4_K_XL-00001-of-00002__rocm7_rc__hblt0__fa1.log @@ -4,7 +4,7 @@ ggml_cuda_init: found 1 ROCm devices: Device 0: AMD Radeon Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32 | model | size | params | backend | ngl | fa | mmap | test | t/s | | ------------------------------ | ---------: | ---------: | ---------- | --: | -: | ---: | --------------: | -------------------: | -| llama4 17Bx16E (Scout) Q4_K - Medium | 57.73 GiB | 107.77 B | ROCm | 99 | 1 | 0 | pp512 | 227.75 ± 1.52 | -| llama4 17Bx16E (Scout) Q4_K - Medium | 57.73 GiB | 107.77 B | ROCm | 99 | 1 | 0 | tg128 | 17.73 ± 0.00 | +| llama4 17Bx16E (Scout) Q4_K - Medium | 57.73 GiB | 107.77 B | ROCm | 99 | 1 | 0 | pp512 | 300.96 ± 1.85 | +| llama4 17Bx16E (Scout) Q4_K - Medium | 57.73 GiB | 107.77 B | ROCm | 99 | 1 | 0 | tg128 | 17.81 ± 0.00 | -build: de219279 (6181) +build: f1fbffb5 (6486) diff --git a/benchmark/results/Llama-4-Scout-17B-16E-Instruct-UD-Q4_K_XL-00001-of-00002__vulkan_amdvlk.log b/benchmark/results/Llama-4-Scout-17B-16E-Instruct-UD-Q4_K_XL-00001-of-00002__vulkan_amdvlk.log index f89d42c..06fe45e 100644 --- a/benchmark/results/Llama-4-Scout-17B-16E-Instruct-UD-Q4_K_XL-00001-of-00002__vulkan_amdvlk.log +++ b/benchmark/results/Llama-4-Scout-17B-16E-Instruct-UD-Q4_K_XL-00001-of-00002__vulkan_amdvlk.log @@ -2,7 +2,7 @@ ggml_vulkan: Found 1 Vulkan devices: ggml_vulkan: 0 = Radeon 8060S Graphics (AMD open-source driver) | uma: 1 | fp16: 1 | bf16: 0 | warp size: 64 | shared memory: 32768 | int dot: 1 | matrix cores: KHR_coopmat | model | size | params | backend | ngl | mmap | test | t/s | | ------------------------------ | ---------: | ---------: | ---------- | --: | ---: | --------------: | -------------------: | -| llama4 17Bx16E (Scout) Q4_K - Medium | 57.73 GiB | 107.77 B | Vulkan | 99 | 0 | pp512 | 216.64 ± 2.76 | -| llama4 17Bx16E (Scout) Q4_K - Medium | 57.73 GiB | 107.77 B | Vulkan | 99 | 0 | tg128 | 20.39 ± 0.02 | +| llama4 17Bx16E (Scout) Q4_K - Medium | 57.73 GiB | 107.77 B | Vulkan | 99 | 0 | pp512 | 191.71 ± 1.02 | +| llama4 17Bx16E (Scout) Q4_K - Medium | 57.73 GiB | 107.77 B | Vulkan | 99 | 0 | tg128 | 21.03 ± 0.02 | -build: 1fe00296 (6182) +build: f1fbffb5 (6486) diff --git a/benchmark/results/Llama-4-Scout-17B-16E-Instruct-UD-Q4_K_XL-00001-of-00002__vulkan_amdvlk__fa1.log b/benchmark/results/Llama-4-Scout-17B-16E-Instruct-UD-Q4_K_XL-00001-of-00002__vulkan_amdvlk__fa1.log index 7ee60d9..9480722 100644 --- a/benchmark/results/Llama-4-Scout-17B-16E-Instruct-UD-Q4_K_XL-00001-of-00002__vulkan_amdvlk__fa1.log +++ b/benchmark/results/Llama-4-Scout-17B-16E-Instruct-UD-Q4_K_XL-00001-of-00002__vulkan_amdvlk__fa1.log @@ -2,7 +2,7 @@ ggml_vulkan: Found 1 Vulkan devices: ggml_vulkan: 0 = Radeon 8060S Graphics (AMD open-source driver) | uma: 1 | fp16: 1 | bf16: 0 | warp size: 64 | shared memory: 32768 | int dot: 1 | matrix cores: KHR_coopmat | model | size | params | backend | ngl | fa | mmap | test | t/s | | ------------------------------ | ---------: | ---------: | ---------- | --: | -: | ---: | --------------: | -------------------: | -| llama4 17Bx16E (Scout) Q4_K - Medium | 57.73 GiB | 107.77 B | Vulkan | 99 | 1 | 0 | pp512 | 217.68 ± 4.15 | -| llama4 17Bx16E (Scout) Q4_K - Medium | 57.73 GiB | 107.77 B | Vulkan | 99 | 1 | 0 | tg128 | 19.97 ± 0.01 | +| llama4 17Bx16E (Scout) Q4_K - Medium | 57.73 GiB | 107.77 B | Vulkan | 99 | 1 | 0 | pp512 | 193.39 ± 1.52 | +| llama4 17Bx16E (Scout) Q4_K - Medium | 57.73 GiB | 107.77 B | Vulkan | 99 | 1 | 0 | tg128 | 20.61 ± 0.01 | -build: 1fe00296 (6182) +build: f1fbffb5 (6486) diff --git a/benchmark/results/Llama-4-Scout-17B-16E-Instruct-UD-Q4_K_XL-00001-of-00002__vulkan_radv.log b/benchmark/results/Llama-4-Scout-17B-16E-Instruct-UD-Q4_K_XL-00001-of-00002__vulkan_radv.log index 002154b..71442c3 100644 --- a/benchmark/results/Llama-4-Scout-17B-16E-Instruct-UD-Q4_K_XL-00001-of-00002__vulkan_radv.log +++ b/benchmark/results/Llama-4-Scout-17B-16E-Instruct-UD-Q4_K_XL-00001-of-00002__vulkan_radv.log @@ -2,7 +2,7 @@ ggml_vulkan: Found 1 Vulkan devices: ggml_vulkan: 0 = Radeon 8060S Graphics (RADV GFX1151) (radv) | uma: 1 | fp16: 1 | bf16: 0 | warp size: 64 | shared memory: 65536 | int dot: 1 | matrix cores: KHR_coopmat | model | size | params | backend | ngl | mmap | test | t/s | | ------------------------------ | ---------: | ---------: | ---------- | --: | ---: | --------------: | -------------------: | -| llama4 17Bx16E (Scout) Q4_K - Medium | 57.73 GiB | 107.77 B | Vulkan | 99 | 0 | pp512 | 151.98 ± 0.60 | -| llama4 17Bx16E (Scout) Q4_K - Medium | 57.73 GiB | 107.77 B | Vulkan | 99 | 0 | tg128 | 20.26 ± 0.02 | +| llama4 17Bx16E (Scout) Q4_K - Medium | 57.73 GiB | 107.77 B | Vulkan | 99 | 0 | pp512 | 213.71 ± 2.99 | +| llama4 17Bx16E (Scout) Q4_K - Medium | 57.73 GiB | 107.77 B | Vulkan | 99 | 0 | tg128 | 20.87 ± 0.03 | -build: 1fe00296 (6182) +build: f1fbffb5 (6486) diff --git a/benchmark/results/Llama-4-Scout-17B-16E-Instruct-UD-Q4_K_XL-00001-of-00002__vulkan_radv__fa1.log b/benchmark/results/Llama-4-Scout-17B-16E-Instruct-UD-Q4_K_XL-00001-of-00002__vulkan_radv__fa1.log index 64d4625..07180ac 100644 --- a/benchmark/results/Llama-4-Scout-17B-16E-Instruct-UD-Q4_K_XL-00001-of-00002__vulkan_radv__fa1.log +++ b/benchmark/results/Llama-4-Scout-17B-16E-Instruct-UD-Q4_K_XL-00001-of-00002__vulkan_radv__fa1.log @@ -2,7 +2,7 @@ ggml_vulkan: Found 1 Vulkan devices: ggml_vulkan: 0 = Radeon 8060S Graphics (RADV GFX1151) (radv) | uma: 1 | fp16: 1 | bf16: 0 | warp size: 64 | shared memory: 65536 | int dot: 1 | matrix cores: KHR_coopmat | model | size | params | backend | ngl | fa | mmap | test | t/s | | ------------------------------ | ---------: | ---------: | ---------- | --: | -: | ---: | --------------: | -------------------: | -| llama4 17Bx16E (Scout) Q4_K - Medium | 57.73 GiB | 107.77 B | Vulkan | 99 | 1 | 0 | pp512 | 154.96 ± 0.82 | -| llama4 17Bx16E (Scout) Q4_K - Medium | 57.73 GiB | 107.77 B | Vulkan | 99 | 1 | 0 | tg128 | 20.28 ± 0.01 | +| llama4 17Bx16E (Scout) Q4_K - Medium | 57.73 GiB | 107.77 B | Vulkan | 99 | 1 | 0 | pp512 | 217.08 ± 5.59 | +| llama4 17Bx16E (Scout) Q4_K - Medium | 57.73 GiB | 107.77 B | Vulkan | 99 | 1 | 0 | tg128 | 20.85 ± 0.01 | -build: 1fe00296 (6182) +build: f1fbffb5 (6486) diff --git a/benchmark/results/Qwen3-235B-A22B-Instruct-2507-UD-Q3_K_XL-00001-of-00003__rocm6_4_3-rocwmma.log b/benchmark/results/Qwen3-235B-A22B-Instruct-2507-UD-Q3_K_XL-00001-of-00003__rocm6_4_3-rocwmma.log index d927ed6..74dae23 100644 --- a/benchmark/results/Qwen3-235B-A22B-Instruct-2507-UD-Q3_K_XL-00001-of-00003__rocm6_4_3-rocwmma.log +++ b/benchmark/results/Qwen3-235B-A22B-Instruct-2507-UD-Q3_K_XL-00001-of-00003__rocm6_4_3-rocwmma.log @@ -9,7 +9,7 @@ rocBLAS warning: hipBlasLT failed, falling back to tensile. This message will be only be displayed once, unless the ROCBLAS_VERBOSE_TENSILE_ERROR environment variable is set. | model | size | params | backend | ngl | mmap | test | t/s | | ------------------------------ | ---------: | ---------: | ---------- | --: | ---: | --------------: | -------------------: | -| qwen3moe 235B.A22B Q3_K - Medium | 96.99 GiB | 235.09 B | ROCm | 99 | 0 | pp512 | 130.11 ± 0.68 | -| qwen3moe 235B.A22B Q3_K - Medium | 96.99 GiB | 235.09 B | ROCm | 99 | 0 | tg128 | 13.95 ± 0.04 | +| qwen3moe 235B.A22B Q3_K - Medium | 96.99 GiB | 235.09 B | ROCm | 99 | 0 | pp512 | 129.22 ± 0.43 | +| qwen3moe 235B.A22B Q3_K - Medium | 96.99 GiB | 235.09 B | ROCm | 99 | 0 | tg128 | 14.25 ± 0.00 | -build: 1fe00296 (6182) +build: f1fbffb5 (6486) diff --git a/benchmark/results/Qwen3-235B-A22B-Instruct-2507-UD-Q3_K_XL-00001-of-00003__rocm6_4_3-rocwmma__fa1.log b/benchmark/results/Qwen3-235B-A22B-Instruct-2507-UD-Q3_K_XL-00001-of-00003__rocm6_4_3-rocwmma__fa1.log index 4638f26..3f8d6e9 100644 --- a/benchmark/results/Qwen3-235B-A22B-Instruct-2507-UD-Q3_K_XL-00001-of-00003__rocm6_4_3-rocwmma__fa1.log +++ b/benchmark/results/Qwen3-235B-A22B-Instruct-2507-UD-Q3_K_XL-00001-of-00003__rocm6_4_3-rocwmma__fa1.log @@ -4,7 +4,7 @@ ggml_cuda_init: found 1 ROCm devices: Device 0: Radeon 8060S Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32 | model | size | params | backend | ngl | fa | mmap | test | t/s | | ------------------------------ | ---------: | ---------: | ---------- | --: | -: | ---: | --------------: | -------------------: | -| qwen3moe 235B.A22B Q3_K - Medium | 96.99 GiB | 235.09 B | ROCm | 99 | 1 | 0 | pp512 | 144.31 ± 0.80 | -| qwen3moe 235B.A22B Q3_K - Medium | 96.99 GiB | 235.09 B | ROCm | 99 | 1 | 0 | tg128 | 13.71 ± 0.00 | +| qwen3moe 235B.A22B Q3_K - Medium | 96.99 GiB | 235.09 B | ROCm | 99 | 1 | 0 | pp512 | 143.48 ± 1.30 | +| qwen3moe 235B.A22B Q3_K - Medium | 96.99 GiB | 235.09 B | ROCm | 99 | 1 | 0 | tg128 | 14.32 ± 0.00 | -build: 1fe00296 (6182) +build: f1fbffb5 (6486) diff --git a/benchmark/results/Qwen3-235B-A22B-Instruct-2507-UD-Q3_K_XL-00001-of-00003__rocm6_4_3-rocwmma__hblt0.log b/benchmark/results/Qwen3-235B-A22B-Instruct-2507-UD-Q3_K_XL-00001-of-00003__rocm6_4_3-rocwmma__hblt0.log index d8218bd..148c268 100644 --- a/benchmark/results/Qwen3-235B-A22B-Instruct-2507-UD-Q3_K_XL-00001-of-00003__rocm6_4_3-rocwmma__hblt0.log +++ b/benchmark/results/Qwen3-235B-A22B-Instruct-2507-UD-Q3_K_XL-00001-of-00003__rocm6_4_3-rocwmma__hblt0.log @@ -2,5 +2,5 @@ ggml_cuda_init: GGML_CUDA_FORCE_MMQ: no ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no ggml_cuda_init: found 1 ROCm devices: Device 0: Radeon 8060S Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32 -HW Exception by GPU node-1 (Agent handle: 0x8063340) reason :GPU Hang +HW Exception by GPU node-1 (Agent handle: 0xd6b8340) reason :GPU Hang ✖ ! [rocm6_4_3-rocwmma] Qwen3-235B-A22B-Instruct-2507-UD-Q3_K_XL-00001-of-00003 __hblt0 failed (exit 134) diff --git a/benchmark/results/Qwen3-235B-A22B-Instruct-2507-UD-Q3_K_XL-00001-of-00003__rocm6_4_3-rocwmma__hblt0__fa1.log b/benchmark/results/Qwen3-235B-A22B-Instruct-2507-UD-Q3_K_XL-00001-of-00003__rocm6_4_3-rocwmma__hblt0__fa1.log index 3a247e3..131dcc4 100644 --- a/benchmark/results/Qwen3-235B-A22B-Instruct-2507-UD-Q3_K_XL-00001-of-00003__rocm6_4_3-rocwmma__hblt0__fa1.log +++ b/benchmark/results/Qwen3-235B-A22B-Instruct-2507-UD-Q3_K_XL-00001-of-00003__rocm6_4_3-rocwmma__hblt0__fa1.log @@ -2,5 +2,9 @@ ggml_cuda_init: GGML_CUDA_FORCE_MMQ: no ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no ggml_cuda_init: found 1 ROCm devices: Device 0: Radeon 8060S Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32 -HW Exception by GPU node-1 (Agent handle: 0x18398340) reason :GPU Hang -✖ ! [rocm6_4_3-rocwmma] Qwen3-235B-A22B-Instruct-2507-UD-Q3_K_XL-00001-of-00003 __hblt0__fa1 failed (exit 134) +| model | size | params | backend | ngl | fa | mmap | test | t/s | +| ------------------------------ | ---------: | ---------: | ---------- | --: | -: | ---: | --------------: | -------------------: | +| qwen3moe 235B.A22B Q3_K - Medium | 96.99 GiB | 235.09 B | ROCm | 99 | 1 | 0 | pp512 | 75.22 ± 0.16 | +| qwen3moe 235B.A22B Q3_K - Medium | 96.99 GiB | 235.09 B | ROCm | 99 | 1 | 0 | tg128 | 14.29 ± 0.03 | + +build: f1fbffb5 (6486) diff --git a/benchmark/results/Qwen3-235B-A22B-Instruct-2507-UD-Q3_K_XL-00001-of-00003__rocm6_4_3.log b/benchmark/results/Qwen3-235B-A22B-Instruct-2507-UD-Q3_K_XL-00001-of-00003__rocm6_4_3.log index 82caf25..c9464c6 100644 --- a/benchmark/results/Qwen3-235B-A22B-Instruct-2507-UD-Q3_K_XL-00001-of-00003__rocm6_4_3.log +++ b/benchmark/results/Qwen3-235B-A22B-Instruct-2507-UD-Q3_K_XL-00001-of-00003__rocm6_4_3.log @@ -9,7 +9,7 @@ rocBLAS warning: hipBlasLT failed, falling back to tensile. This message will be only be displayed once, unless the ROCBLAS_VERBOSE_TENSILE_ERROR environment variable is set. | model | size | params | backend | ngl | mmap | test | t/s | | ------------------------------ | ---------: | ---------: | ---------- | --: | ---: | --------------: | -------------------: | -| qwen3moe 235B.A22B Q3_K - Medium | 96.99 GiB | 235.09 B | ROCm | 99 | 0 | pp512 | 131.78 ± 1.03 | -| qwen3moe 235B.A22B Q3_K - Medium | 96.99 GiB | 235.09 B | ROCm | 99 | 0 | tg128 | 13.68 ± 0.43 | +| qwen3moe 235B.A22B Q3_K - Medium | 96.99 GiB | 235.09 B | ROCm | 99 | 0 | pp512 | 129.64 ± 0.35 | +| qwen3moe 235B.A22B Q3_K - Medium | 96.99 GiB | 235.09 B | ROCm | 99 | 0 | tg128 | 14.24 ± 0.00 | -build: 1fe00296 (6182) +build: f1fbffb5 (6486) diff --git a/benchmark/results/Qwen3-235B-A22B-Instruct-2507-UD-Q3_K_XL-00001-of-00003__rocm6_4_3__fa1.log b/benchmark/results/Qwen3-235B-A22B-Instruct-2507-UD-Q3_K_XL-00001-of-00003__rocm6_4_3__fa1.log index a0ed178..f654098 100644 --- a/benchmark/results/Qwen3-235B-A22B-Instruct-2507-UD-Q3_K_XL-00001-of-00003__rocm6_4_3__fa1.log +++ b/benchmark/results/Qwen3-235B-A22B-Instruct-2507-UD-Q3_K_XL-00001-of-00003__rocm6_4_3__fa1.log @@ -2,5 +2,9 @@ ggml_cuda_init: GGML_CUDA_FORCE_MMQ: no ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no ggml_cuda_init: found 1 ROCm devices: Device 0: Radeon 8060S Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32 -Memory access fault by GPU node-1 (Agent handle: 0x50aa340) on address 0x7f7365ba9000. Reason: Page not present or supervisor privilege. -✖ ! [rocm6_4_3] Qwen3-235B-A22B-Instruct-2507-UD-Q3_K_XL-00001-of-00003 __fa1 failed (exit 134) +| model | size | params | backend | ngl | fa | mmap | test | t/s | +| ------------------------------ | ---------: | ---------: | ---------- | --: | -: | ---: | --------------: | -------------------: | +| qwen3moe 235B.A22B Q3_K - Medium | 96.99 GiB | 235.09 B | ROCm | 99 | 1 | 0 | pp512 | 144.82 ± 0.84 | +| qwen3moe 235B.A22B Q3_K - Medium | 96.99 GiB | 235.09 B | ROCm | 99 | 1 | 0 | tg128 | 14.32 ± 0.00 | + +build: f1fbffb5 (6486) diff --git a/benchmark/results/Qwen3-235B-A22B-Instruct-2507-UD-Q3_K_XL-00001-of-00003__rocm6_4_3__hblt0.log b/benchmark/results/Qwen3-235B-A22B-Instruct-2507-UD-Q3_K_XL-00001-of-00003__rocm6_4_3__hblt0.log index 3767019..64085e5 100644 --- a/benchmark/results/Qwen3-235B-A22B-Instruct-2507-UD-Q3_K_XL-00001-of-00003__rocm6_4_3__hblt0.log +++ b/benchmark/results/Qwen3-235B-A22B-Instruct-2507-UD-Q3_K_XL-00001-of-00003__rocm6_4_3__hblt0.log @@ -2,5 +2,9 @@ ggml_cuda_init: GGML_CUDA_FORCE_MMQ: no ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no ggml_cuda_init: found 1 ROCm devices: Device 0: Radeon 8060S Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32 -HW Exception by GPU node-1 (Agent handle: 0x1990d340) reason :GPU Hang -✖ ! [rocm6_4_3] Qwen3-235B-A22B-Instruct-2507-UD-Q3_K_XL-00001-of-00003 __hblt0 failed (exit 134) +| model | size | params | backend | ngl | mmap | test | t/s | +| ------------------------------ | ---------: | ---------: | ---------- | --: | ---: | --------------: | -------------------: | +| qwen3moe 235B.A22B Q3_K - Medium | 96.99 GiB | 235.09 B | ROCm | 99 | 0 | pp512 | 74.17 ± 0.11 | +| qwen3moe 235B.A22B Q3_K - Medium | 96.99 GiB | 235.09 B | ROCm | 99 | 0 | tg128 | 14.27 ± 0.01 | + +build: f1fbffb5 (6486) diff --git a/benchmark/results/Qwen3-235B-A22B-Instruct-2507-UD-Q3_K_XL-00001-of-00003__rocm6_4_3__hblt0__fa1.log b/benchmark/results/Qwen3-235B-A22B-Instruct-2507-UD-Q3_K_XL-00001-of-00003__rocm6_4_3__hblt0__fa1.log index f1dc1f5..78154a2 100644 --- a/benchmark/results/Qwen3-235B-A22B-Instruct-2507-UD-Q3_K_XL-00001-of-00003__rocm6_4_3__hblt0__fa1.log +++ b/benchmark/results/Qwen3-235B-A22B-Instruct-2507-UD-Q3_K_XL-00001-of-00003__rocm6_4_3__hblt0__fa1.log @@ -2,5 +2,9 @@ ggml_cuda_init: GGML_CUDA_FORCE_MMQ: no ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no ggml_cuda_init: found 1 ROCm devices: Device 0: Radeon 8060S Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32 -Memory access fault by GPU node-1 (Agent handle: 0x180d4340) on address 0x7f11c8f6f000. Reason: Page not present or supervisor privilege. -✖ ! [rocm6_4_3] Qwen3-235B-A22B-Instruct-2507-UD-Q3_K_XL-00001-of-00003 __hblt0__fa1 failed (exit 134) +| model | size | params | backend | ngl | fa | mmap | test | t/s | +| ------------------------------ | ---------: | ---------: | ---------- | --: | -: | ---: | --------------: | -------------------: | +| qwen3moe 235B.A22B Q3_K - Medium | 96.99 GiB | 235.09 B | ROCm | 99 | 1 | 0 | pp512 | 77.91 ± 0.23 | +| qwen3moe 235B.A22B Q3_K - Medium | 96.99 GiB | 235.09 B | ROCm | 99 | 1 | 0 | tg128 | 14.31 ± 0.00 | + +build: f1fbffb5 (6486) diff --git a/benchmark/results/Qwen3-235B-A22B-Instruct-2507-UD-Q3_K_XL-00001-of-00003__rocm7_rc-rocwmma-fa_all_quants.log b/benchmark/results/Qwen3-235B-A22B-Instruct-2507-UD-Q3_K_XL-00001-of-00003__rocm7_rc-rocwmma-fa_all_quants.log new file mode 100644 index 0000000..304856f --- /dev/null +++ b/benchmark/results/Qwen3-235B-A22B-Instruct-2507-UD-Q3_K_XL-00001-of-00003__rocm7_rc-rocwmma-fa_all_quants.log @@ -0,0 +1,10 @@ +ggml_cuda_init: GGML_CUDA_FORCE_MMQ: no +ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no +ggml_cuda_init: found 1 ROCm devices: + Device 0: AMD Radeon Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32 +| model | size | params | backend | ngl | mmap | test | t/s | +| ------------------------------ | ---------: | ---------: | ---------- | --: | ---: | --------------: | -------------------: | +| qwen3moe 235B.A22B Q3_K - Medium | 96.99 GiB | 235.09 B | ROCm | 99 | 0 | pp512 | 131.98 ± 0.86 | +| qwen3moe 235B.A22B Q3_K - Medium | 96.99 GiB | 235.09 B | ROCm | 99 | 0 | tg128 | 14.14 ± 0.00 | + +build: f1fbffb5 (6486) diff --git a/benchmark/results/Qwen3-235B-A22B-Instruct-2507-UD-Q3_K_XL-00001-of-00003__rocm7_rc-rocwmma-fa_all_quants__fa1.log b/benchmark/results/Qwen3-235B-A22B-Instruct-2507-UD-Q3_K_XL-00001-of-00003__rocm7_rc-rocwmma-fa_all_quants__fa1.log new file mode 100644 index 0000000..4b9aafa --- /dev/null +++ b/benchmark/results/Qwen3-235B-A22B-Instruct-2507-UD-Q3_K_XL-00001-of-00003__rocm7_rc-rocwmma-fa_all_quants__fa1.log @@ -0,0 +1,10 @@ +ggml_cuda_init: GGML_CUDA_FORCE_MMQ: no +ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no +ggml_cuda_init: found 1 ROCm devices: + Device 0: AMD Radeon Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32 +| model | size | params | backend | ngl | fa | mmap | test | t/s | +| ------------------------------ | ---------: | ---------: | ---------- | --: | -: | ---: | --------------: | -------------------: | +| qwen3moe 235B.A22B Q3_K - Medium | 96.99 GiB | 235.09 B | ROCm | 99 | 1 | 0 | pp512 | 141.08 ± 0.51 | +| qwen3moe 235B.A22B Q3_K - Medium | 96.99 GiB | 235.09 B | ROCm | 99 | 1 | 0 | tg128 | 14.29 ± 0.04 | + +build: f1fbffb5 (6486) diff --git a/benchmark/results/Qwen3-235B-A22B-Instruct-2507-UD-Q3_K_XL-00001-of-00003__rocm7_rc-rocwmma-fa_all_quants__hblt0.log b/benchmark/results/Qwen3-235B-A22B-Instruct-2507-UD-Q3_K_XL-00001-of-00003__rocm7_rc-rocwmma-fa_all_quants__hblt0.log new file mode 100644 index 0000000..fdc7289 --- /dev/null +++ b/benchmark/results/Qwen3-235B-A22B-Instruct-2507-UD-Q3_K_XL-00001-of-00003__rocm7_rc-rocwmma-fa_all_quants__hblt0.log @@ -0,0 +1,10 @@ +ggml_cuda_init: GGML_CUDA_FORCE_MMQ: no +ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no +ggml_cuda_init: found 1 ROCm devices: + Device 0: AMD Radeon Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32 +| model | size | params | backend | ngl | mmap | test | t/s | +| ------------------------------ | ---------: | ---------: | ---------- | --: | ---: | --------------: | -------------------: | +| qwen3moe 235B.A22B Q3_K - Medium | 96.99 GiB | 235.09 B | ROCm | 99 | 0 | pp512 | 130.87 ± 0.83 | +| qwen3moe 235B.A22B Q3_K - Medium | 96.99 GiB | 235.09 B | ROCm | 99 | 0 | tg128 | 14.25 ± 0.05 | + +build: f1fbffb5 (6486) diff --git a/benchmark/results/Qwen3-235B-A22B-Instruct-2507-UD-Q3_K_XL-00001-of-00003__rocm7_rc-rocwmma-fa_all_quants__hblt0__fa1.log b/benchmark/results/Qwen3-235B-A22B-Instruct-2507-UD-Q3_K_XL-00001-of-00003__rocm7_rc-rocwmma-fa_all_quants__hblt0__fa1.log new file mode 100644 index 0000000..011ab77 --- /dev/null +++ b/benchmark/results/Qwen3-235B-A22B-Instruct-2507-UD-Q3_K_XL-00001-of-00003__rocm7_rc-rocwmma-fa_all_quants__hblt0__fa1.log @@ -0,0 +1,10 @@ +ggml_cuda_init: GGML_CUDA_FORCE_MMQ: no +ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no +ggml_cuda_init: found 1 ROCm devices: + Device 0: AMD Radeon Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32 +| model | size | params | backend | ngl | fa | mmap | test | t/s | +| ------------------------------ | ---------: | ---------: | ---------- | --: | -: | ---: | --------------: | -------------------: | +| qwen3moe 235B.A22B Q3_K - Medium | 96.99 GiB | 235.09 B | ROCm | 99 | 1 | 0 | pp512 | 137.23 ± 0.55 | +| qwen3moe 235B.A22B Q3_K - Medium | 96.99 GiB | 235.09 B | ROCm | 99 | 1 | 0 | tg128 | 14.32 ± 0.01 | + +build: f1fbffb5 (6486) diff --git a/benchmark/results/Qwen3-235B-A22B-Instruct-2507-UD-Q3_K_XL-00001-of-00003__rocm7_rc-rocwmma.log b/benchmark/results/Qwen3-235B-A22B-Instruct-2507-UD-Q3_K_XL-00001-of-00003__rocm7_rc-rocwmma.log index 38826b0..bac74c4 100644 --- a/benchmark/results/Qwen3-235B-A22B-Instruct-2507-UD-Q3_K_XL-00001-of-00003__rocm7_rc-rocwmma.log +++ b/benchmark/results/Qwen3-235B-A22B-Instruct-2507-UD-Q3_K_XL-00001-of-00003__rocm7_rc-rocwmma.log @@ -2,5 +2,9 @@ ggml_cuda_init: GGML_CUDA_FORCE_MMQ: no ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no ggml_cuda_init: found 1 ROCm devices: Device 0: AMD Radeon Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32 -Memory access fault by GPU node-1 (Agent handle: 0x3c89ef80) on address 0x7f777640a000. Reason: Page not present or supervisor privilege. -✖ ! [rocm7_rc-rocwmma] Qwen3-235B-A22B-Instruct-2507-UD-Q3_K_XL-00001-of-00003 failed (exit 134) +| model | size | params | backend | ngl | mmap | test | t/s | +| ------------------------------ | ---------: | ---------: | ---------- | --: | ---: | --------------: | -------------------: | +| qwen3moe 235B.A22B Q3_K - Medium | 96.99 GiB | 235.09 B | ROCm | 99 | 0 | pp512 | 132.60 ± 0.56 | +| qwen3moe 235B.A22B Q3_K - Medium | 96.99 GiB | 235.09 B | ROCm | 99 | 0 | tg128 | 14.33 ± 0.02 | + +build: f1fbffb5 (6486) diff --git a/benchmark/results/Qwen3-235B-A22B-Instruct-2507-UD-Q3_K_XL-00001-of-00003__rocm7_rc-rocwmma__fa1.log b/benchmark/results/Qwen3-235B-A22B-Instruct-2507-UD-Q3_K_XL-00001-of-00003__rocm7_rc-rocwmma__fa1.log index a74258c..3ce3c01 100644 --- a/benchmark/results/Qwen3-235B-A22B-Instruct-2507-UD-Q3_K_XL-00001-of-00003__rocm7_rc-rocwmma__fa1.log +++ b/benchmark/results/Qwen3-235B-A22B-Instruct-2507-UD-Q3_K_XL-00001-of-00003__rocm7_rc-rocwmma__fa1.log @@ -4,7 +4,7 @@ ggml_cuda_init: found 1 ROCm devices: Device 0: AMD Radeon Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32 | model | size | params | backend | ngl | fa | mmap | test | t/s | | ------------------------------ | ---------: | ---------: | ---------- | --: | -: | ---: | --------------: | -------------------: | -| qwen3moe 235B.A22B Q3_K - Medium | 96.99 GiB | 235.09 B | ROCm | 99 | 1 | 0 | pp512 | 141.61 ± 0.92 | -| qwen3moe 235B.A22B Q3_K - Medium | 96.99 GiB | 235.09 B | ROCm | 99 | 1 | 0 | tg128 | 13.34 ± 0.02 | +| qwen3moe 235B.A22B Q3_K - Medium | 96.99 GiB | 235.09 B | ROCm | 99 | 1 | 0 | pp512 | 143.76 ± 0.58 | +| qwen3moe 235B.A22B Q3_K - Medium | 96.99 GiB | 235.09 B | ROCm | 99 | 1 | 0 | tg128 | 14.36 ± 0.00 | -build: de219279 (6181) +build: f1fbffb5 (6486) diff --git a/benchmark/results/Qwen3-235B-A22B-Instruct-2507-UD-Q3_K_XL-00001-of-00003__rocm7_rc-rocwmma__hblt0.log b/benchmark/results/Qwen3-235B-A22B-Instruct-2507-UD-Q3_K_XL-00001-of-00003__rocm7_rc-rocwmma__hblt0.log index 966ba3e..f3e8a66 100644 --- a/benchmark/results/Qwen3-235B-A22B-Instruct-2507-UD-Q3_K_XL-00001-of-00003__rocm7_rc-rocwmma__hblt0.log +++ b/benchmark/results/Qwen3-235B-A22B-Instruct-2507-UD-Q3_K_XL-00001-of-00003__rocm7_rc-rocwmma__hblt0.log @@ -4,7 +4,7 @@ ggml_cuda_init: found 1 ROCm devices: Device 0: AMD Radeon Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32 | model | size | params | backend | ngl | mmap | test | t/s | | ------------------------------ | ---------: | ---------: | ---------- | --: | ---: | --------------: | -------------------: | -| qwen3moe 235B.A22B Q3_K - Medium | 96.99 GiB | 235.09 B | ROCm | 99 | 0 | pp512 | 133.33 ± 0.68 | -| qwen3moe 235B.A22B Q3_K - Medium | 96.99 GiB | 235.09 B | ROCm | 99 | 0 | tg128 | 13.78 ± 0.04 | +| qwen3moe 235B.A22B Q3_K - Medium | 96.99 GiB | 235.09 B | ROCm | 99 | 0 | pp512 | 134.24 ± 0.57 | +| qwen3moe 235B.A22B Q3_K - Medium | 96.99 GiB | 235.09 B | ROCm | 99 | 0 | tg128 | 14.32 ± 0.03 | -build: de219279 (6181) +build: f1fbffb5 (6486) diff --git a/benchmark/results/Qwen3-235B-A22B-Instruct-2507-UD-Q3_K_XL-00001-of-00003__rocm7_rc-rocwmma__hblt0__fa1.log b/benchmark/results/Qwen3-235B-A22B-Instruct-2507-UD-Q3_K_XL-00001-of-00003__rocm7_rc-rocwmma__hblt0__fa1.log index 22a4e47..baf0257 100644 --- a/benchmark/results/Qwen3-235B-A22B-Instruct-2507-UD-Q3_K_XL-00001-of-00003__rocm7_rc-rocwmma__hblt0__fa1.log +++ b/benchmark/results/Qwen3-235B-A22B-Instruct-2507-UD-Q3_K_XL-00001-of-00003__rocm7_rc-rocwmma__hblt0__fa1.log @@ -4,7 +4,7 @@ ggml_cuda_init: found 1 ROCm devices: Device 0: AMD Radeon Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32 | model | size | params | backend | ngl | fa | mmap | test | t/s | | ------------------------------ | ---------: | ---------: | ---------- | --: | -: | ---: | --------------: | -------------------: | -| qwen3moe 235B.A22B Q3_K - Medium | 96.99 GiB | 235.09 B | ROCm | 99 | 1 | 0 | pp512 | 139.60 ± 0.47 | -| qwen3moe 235B.A22B Q3_K - Medium | 96.99 GiB | 235.09 B | ROCm | 99 | 1 | 0 | tg128 | 13.03 ± 0.57 | +| qwen3moe 235B.A22B Q3_K - Medium | 96.99 GiB | 235.09 B | ROCm | 99 | 1 | 0 | pp512 | 141.84 ± 0.84 | +| qwen3moe 235B.A22B Q3_K - Medium | 96.99 GiB | 235.09 B | ROCm | 99 | 1 | 0 | tg128 | 14.37 ± 0.00 | -build: de219279 (6181) +build: f1fbffb5 (6486) diff --git a/benchmark/results/Qwen3-235B-A22B-Instruct-2507-UD-Q3_K_XL-00001-of-00003__rocm7_rc.log b/benchmark/results/Qwen3-235B-A22B-Instruct-2507-UD-Q3_K_XL-00001-of-00003__rocm7_rc.log index daaa296..818e7e0 100644 --- a/benchmark/results/Qwen3-235B-A22B-Instruct-2507-UD-Q3_K_XL-00001-of-00003__rocm7_rc.log +++ b/benchmark/results/Qwen3-235B-A22B-Instruct-2507-UD-Q3_K_XL-00001-of-00003__rocm7_rc.log @@ -4,7 +4,7 @@ ggml_cuda_init: found 1 ROCm devices: Device 0: AMD Radeon Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32 | model | size | params | backend | ngl | mmap | test | t/s | | ------------------------------ | ---------: | ---------: | ---------- | --: | ---: | --------------: | -------------------: | -| qwen3moe 235B.A22B Q3_K - Medium | 96.99 GiB | 235.09 B | ROCm | 99 | 0 | pp512 | 134.95 ± 0.76 | -| qwen3moe 235B.A22B Q3_K - Medium | 96.99 GiB | 235.09 B | ROCm | 99 | 0 | tg128 | 13.99 ± 0.01 | +| qwen3moe 235B.A22B Q3_K - Medium | 96.99 GiB | 235.09 B | ROCm | 99 | 0 | pp512 | 134.45 ± 0.50 | +| qwen3moe 235B.A22B Q3_K - Medium | 96.99 GiB | 235.09 B | ROCm | 99 | 0 | tg128 | 14.32 ± 0.00 | -build: de219279 (6181) +build: f1fbffb5 (6486) diff --git a/benchmark/results/Qwen3-235B-A22B-Instruct-2507-UD-Q3_K_XL-00001-of-00003__rocm7_rc__fa1.log b/benchmark/results/Qwen3-235B-A22B-Instruct-2507-UD-Q3_K_XL-00001-of-00003__rocm7_rc__fa1.log index 0d3acaf..7cfb635 100644 --- a/benchmark/results/Qwen3-235B-A22B-Instruct-2507-UD-Q3_K_XL-00001-of-00003__rocm7_rc__fa1.log +++ b/benchmark/results/Qwen3-235B-A22B-Instruct-2507-UD-Q3_K_XL-00001-of-00003__rocm7_rc__fa1.log @@ -2,6 +2,9 @@ ggml_cuda_init: GGML_CUDA_FORCE_MMQ: no ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no ggml_cuda_init: found 1 ROCm devices: Device 0: AMD Radeon Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32 -:0:rocdevice.cpp :3675: 29915649820 us: Callback: Queue 0x7f500c700000 aborting with error : HSA_STATUS_ERROR_EXCEPTION: An HSAIL operation resulted in a hardware exception. code: 0x1016 -Memory access fault by GPU node-1 (Agent handle: 0x2aad9ec0) on address 0x7f37c576f000. Reason: Page not present or supervisor privilege. -✖ ! [rocm7_rc] Qwen3-235B-A22B-Instruct-2507-UD-Q3_K_XL-00001-of-00003 __fa1 failed (exit 134) +| model | size | params | backend | ngl | fa | mmap | test | t/s | +| ------------------------------ | ---------: | ---------: | ---------- | --: | -: | ---: | --------------: | -------------------: | +| qwen3moe 235B.A22B Q3_K - Medium | 96.99 GiB | 235.09 B | ROCm | 99 | 1 | 0 | pp512 | 145.01 ± 0.84 | +| qwen3moe 235B.A22B Q3_K - Medium | 96.99 GiB | 235.09 B | ROCm | 99 | 1 | 0 | tg128 | 14.36 ± 0.02 | + +build: f1fbffb5 (6486) diff --git a/benchmark/results/Qwen3-235B-A22B-Instruct-2507-UD-Q3_K_XL-00001-of-00003__rocm7_rc__hblt0.log b/benchmark/results/Qwen3-235B-A22B-Instruct-2507-UD-Q3_K_XL-00001-of-00003__rocm7_rc__hblt0.log index 8135bc1..5c4c044 100644 --- a/benchmark/results/Qwen3-235B-A22B-Instruct-2507-UD-Q3_K_XL-00001-of-00003__rocm7_rc__hblt0.log +++ b/benchmark/results/Qwen3-235B-A22B-Instruct-2507-UD-Q3_K_XL-00001-of-00003__rocm7_rc__hblt0.log @@ -4,7 +4,7 @@ ggml_cuda_init: found 1 ROCm devices: Device 0: AMD Radeon Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32 | model | size | params | backend | ngl | mmap | test | t/s | | ------------------------------ | ---------: | ---------: | ---------- | --: | ---: | --------------: | -------------------: | -| qwen3moe 235B.A22B Q3_K - Medium | 96.99 GiB | 235.09 B | ROCm | 99 | 0 | pp512 | 135.29 ± 0.51 | -| qwen3moe 235B.A22B Q3_K - Medium | 96.99 GiB | 235.09 B | ROCm | 99 | 0 | tg128 | 13.97 ± 0.04 | +| qwen3moe 235B.A22B Q3_K - Medium | 96.99 GiB | 235.09 B | ROCm | 99 | 0 | pp512 | 133.40 ± 0.48 | +| qwen3moe 235B.A22B Q3_K - Medium | 96.99 GiB | 235.09 B | ROCm | 99 | 0 | tg128 | 14.32 ± 0.04 | -build: de219279 (6181) +build: f1fbffb5 (6486) diff --git a/benchmark/results/Qwen3-235B-A22B-Instruct-2507-UD-Q3_K_XL-00001-of-00003__rocm7_rc__hblt0__fa1.log b/benchmark/results/Qwen3-235B-A22B-Instruct-2507-UD-Q3_K_XL-00001-of-00003__rocm7_rc__hblt0__fa1.log index fc4a37f..c4be1e2 100644 --- a/benchmark/results/Qwen3-235B-A22B-Instruct-2507-UD-Q3_K_XL-00001-of-00003__rocm7_rc__hblt0__fa1.log +++ b/benchmark/results/Qwen3-235B-A22B-Instruct-2507-UD-Q3_K_XL-00001-of-00003__rocm7_rc__hblt0__fa1.log @@ -2,5 +2,5 @@ ggml_cuda_init: GGML_CUDA_FORCE_MMQ: no ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no ggml_cuda_init: found 1 ROCm devices: Device 0: AMD Radeon Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32 -Memory access fault by GPU node-1 (Agent handle: 0x122f2ec0) on address 0x7f10537a9000. Reason: Page not present or supervisor privilege. +Memory access fault by GPU node-1 (Agent handle: 0x293f4180) on address 0x7f5ee4f70000. Reason: Page not present or supervisor privilege. ✖ ! [rocm7_rc] Qwen3-235B-A22B-Instruct-2507-UD-Q3_K_XL-00001-of-00003 __hblt0__fa1 failed (exit 134) diff --git a/benchmark/results/Qwen3-235B-A22B-Instruct-2507-UD-Q3_K_XL-00001-of-00003__vulkan_amdvlk.log b/benchmark/results/Qwen3-235B-A22B-Instruct-2507-UD-Q3_K_XL-00001-of-00003__vulkan_amdvlk.log index ec978bb..9c8e307 100644 --- a/benchmark/results/Qwen3-235B-A22B-Instruct-2507-UD-Q3_K_XL-00001-of-00003__vulkan_amdvlk.log +++ b/benchmark/results/Qwen3-235B-A22B-Instruct-2507-UD-Q3_K_XL-00001-of-00003__vulkan_amdvlk.log @@ -2,7 +2,7 @@ ggml_vulkan: Found 1 Vulkan devices: ggml_vulkan: 0 = Radeon 8060S Graphics (AMD open-source driver) | uma: 1 | fp16: 1 | bf16: 0 | warp size: 64 | shared memory: 32768 | int dot: 1 | matrix cores: KHR_coopmat | model | size | params | backend | ngl | mmap | test | t/s | | ------------------------------ | ---------: | ---------: | ---------- | --: | ---: | --------------: | -------------------: | -| qwen3moe 235B.A22B Q3_K - Medium | 96.99 GiB | 235.09 B | Vulkan | 99 | 0 | pp512 | 112.93 ± 0.63 | -| qwen3moe 235B.A22B Q3_K - Medium | 96.99 GiB | 235.09 B | Vulkan | 99 | 0 | tg128 | 16.43 ± 0.01 | +| qwen3moe 235B.A22B Q3_K - Medium | 96.99 GiB | 235.09 B | Vulkan | 99 | 0 | pp512 | 131.33 ± 1.43 | +| qwen3moe 235B.A22B Q3_K - Medium | 96.99 GiB | 235.09 B | Vulkan | 99 | 0 | tg128 | 17.27 ± 0.01 | -build: 1fe00296 (6182) +build: f1fbffb5 (6486) diff --git a/benchmark/results/Qwen3-235B-A22B-Instruct-2507-UD-Q3_K_XL-00001-of-00003__vulkan_amdvlk__fa1.log b/benchmark/results/Qwen3-235B-A22B-Instruct-2507-UD-Q3_K_XL-00001-of-00003__vulkan_amdvlk__fa1.log index e560c0b..0d15b29 100644 --- a/benchmark/results/Qwen3-235B-A22B-Instruct-2507-UD-Q3_K_XL-00001-of-00003__vulkan_amdvlk__fa1.log +++ b/benchmark/results/Qwen3-235B-A22B-Instruct-2507-UD-Q3_K_XL-00001-of-00003__vulkan_amdvlk__fa1.log @@ -2,7 +2,7 @@ ggml_vulkan: Found 1 Vulkan devices: ggml_vulkan: 0 = Radeon 8060S Graphics (AMD open-source driver) | uma: 1 | fp16: 1 | bf16: 0 | warp size: 64 | shared memory: 32768 | int dot: 1 | matrix cores: KHR_coopmat | model | size | params | backend | ngl | fa | mmap | test | t/s | | ------------------------------ | ---------: | ---------: | ---------- | --: | -: | ---: | --------------: | -------------------: | -| qwen3moe 235B.A22B Q3_K - Medium | 96.99 GiB | 235.09 B | Vulkan | 99 | 1 | 0 | pp512 | 114.35 ± 1.12 | -| qwen3moe 235B.A22B Q3_K - Medium | 96.99 GiB | 235.09 B | Vulkan | 99 | 1 | 0 | tg128 | 16.27 ± 0.01 | +| qwen3moe 235B.A22B Q3_K - Medium | 96.99 GiB | 235.09 B | Vulkan | 99 | 1 | 0 | pp512 | 133.32 ± 1.63 | +| qwen3moe 235B.A22B Q3_K - Medium | 96.99 GiB | 235.09 B | Vulkan | 99 | 1 | 0 | tg128 | 17.12 ± 0.01 | -build: 1fe00296 (6182) +build: f1fbffb5 (6486) diff --git a/benchmark/results/Qwen3-235B-A22B-Instruct-2507-UD-Q3_K_XL-00001-of-00003__vulkan_radv.log b/benchmark/results/Qwen3-235B-A22B-Instruct-2507-UD-Q3_K_XL-00001-of-00003__vulkan_radv.log index 5fbf704..4512dd6 100644 --- a/benchmark/results/Qwen3-235B-A22B-Instruct-2507-UD-Q3_K_XL-00001-of-00003__vulkan_radv.log +++ b/benchmark/results/Qwen3-235B-A22B-Instruct-2507-UD-Q3_K_XL-00001-of-00003__vulkan_radv.log @@ -2,7 +2,7 @@ ggml_vulkan: Found 1 Vulkan devices: ggml_vulkan: 0 = Radeon 8060S Graphics (RADV GFX1151) (radv) | uma: 1 | fp16: 1 | bf16: 0 | warp size: 64 | shared memory: 65536 | int dot: 1 | matrix cores: KHR_coopmat | model | size | params | backend | ngl | mmap | test | t/s | | ------------------------------ | ---------: | ---------: | ---------- | --: | ---: | --------------: | -------------------: | -| qwen3moe 235B.A22B Q3_K - Medium | 96.99 GiB | 235.09 B | Vulkan | 99 | 0 | pp512 | 64.60 ± 0.38 | -| qwen3moe 235B.A22B Q3_K - Medium | 96.99 GiB | 235.09 B | Vulkan | 99 | 0 | tg128 | 17.03 ± 0.01 | +| qwen3moe 235B.A22B Q3_K - Medium | 96.99 GiB | 235.09 B | Vulkan | 99 | 0 | pp512 | 115.77 ± 1.42 | +| qwen3moe 235B.A22B Q3_K - Medium | 96.99 GiB | 235.09 B | Vulkan | 99 | 0 | tg128 | 17.75 ± 0.01 | -build: 1fe00296 (6182) +build: f1fbffb5 (6486) diff --git a/benchmark/results/Qwen3-235B-A22B-Instruct-2507-UD-Q3_K_XL-00001-of-00003__vulkan_radv__fa1.log b/benchmark/results/Qwen3-235B-A22B-Instruct-2507-UD-Q3_K_XL-00001-of-00003__vulkan_radv__fa1.log index e878778..1fdb504 100644 --- a/benchmark/results/Qwen3-235B-A22B-Instruct-2507-UD-Q3_K_XL-00001-of-00003__vulkan_radv__fa1.log +++ b/benchmark/results/Qwen3-235B-A22B-Instruct-2507-UD-Q3_K_XL-00001-of-00003__vulkan_radv__fa1.log @@ -2,7 +2,7 @@ ggml_vulkan: Found 1 Vulkan devices: ggml_vulkan: 0 = Radeon 8060S Graphics (RADV GFX1151) (radv) | uma: 1 | fp16: 1 | bf16: 0 | warp size: 64 | shared memory: 65536 | int dot: 1 | matrix cores: KHR_coopmat | model | size | params | backend | ngl | fa | mmap | test | t/s | | ------------------------------ | ---------: | ---------: | ---------- | --: | -: | ---: | --------------: | -------------------: | -| qwen3moe 235B.A22B Q3_K - Medium | 96.99 GiB | 235.09 B | Vulkan | 99 | 1 | 0 | pp512 | 66.60 ± 0.42 | -| qwen3moe 235B.A22B Q3_K - Medium | 96.99 GiB | 235.09 B | Vulkan | 99 | 1 | 0 | tg128 | 17.28 ± 0.01 | +| qwen3moe 235B.A22B Q3_K - Medium | 96.99 GiB | 235.09 B | Vulkan | 99 | 1 | 0 | pp512 | 121.80 ± 1.81 | +| qwen3moe 235B.A22B Q3_K - Medium | 96.99 GiB | 235.09 B | Vulkan | 99 | 1 | 0 | tg128 | 18.10 ± 0.00 | -build: 1fe00296 (6182) +build: f1fbffb5 (6486) diff --git a/benchmark/results/Qwen3-30B-A3B-BF16-00001-of-00002__rocm6_4_3-rocwmma.log b/benchmark/results/Qwen3-30B-A3B-BF16-00001-of-00002__rocm6_4_3-rocwmma.log index cd9f858..1e65c4b 100644 --- a/benchmark/results/Qwen3-30B-A3B-BF16-00001-of-00002__rocm6_4_3-rocwmma.log +++ b/benchmark/results/Qwen3-30B-A3B-BF16-00001-of-00002__rocm6_4_3-rocwmma.log @@ -9,7 +9,7 @@ rocBLAS warning: hipBlasLT failed, falling back to tensile. This message will be only be displayed once, unless the ROCBLAS_VERBOSE_TENSILE_ERROR environment variable is set. | model | size | params | backend | ngl | mmap | test | t/s | | ------------------------------ | ---------: | ---------: | ---------- | --: | ---: | --------------: | -------------------: | -| qwen3moe 30B.A3B BF16 | 56.89 GiB | 30.53 B | ROCm | 99 | 0 | pp512 | 438.42 ± 4.14 | -| qwen3moe 30B.A3B BF16 | 56.89 GiB | 30.53 B | ROCm | 99 | 0 | tg128 | 24.57 ± 0.01 | +| qwen3moe 30B.A3B BF16 | 56.89 GiB | 30.53 B | ROCm | 99 | 0 | pp512 | 426.32 ± 6.04 | +| qwen3moe 30B.A3B BF16 | 56.89 GiB | 30.53 B | ROCm | 99 | 0 | tg128 | 25.00 ± 0.01 | -build: 1fe00296 (6182) +build: f1fbffb5 (6486) diff --git a/benchmark/results/Qwen3-30B-A3B-BF16-00001-of-00002__rocm6_4_3-rocwmma__fa1.log b/benchmark/results/Qwen3-30B-A3B-BF16-00001-of-00002__rocm6_4_3-rocwmma__fa1.log index 3adfafe..f0325ad 100644 --- a/benchmark/results/Qwen3-30B-A3B-BF16-00001-of-00002__rocm6_4_3-rocwmma__fa1.log +++ b/benchmark/results/Qwen3-30B-A3B-BF16-00001-of-00002__rocm6_4_3-rocwmma__fa1.log @@ -4,7 +4,7 @@ ggml_cuda_init: found 1 ROCm devices: Device 0: Radeon 8060S Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32 | model | size | params | backend | ngl | fa | mmap | test | t/s | | ------------------------------ | ---------: | ---------: | ---------- | --: | -: | ---: | --------------: | -------------------: | -| qwen3moe 30B.A3B BF16 | 56.89 GiB | 30.53 B | ROCm | 99 | 1 | 0 | pp512 | 475.43 ± 7.40 | -| qwen3moe 30B.A3B BF16 | 56.89 GiB | 30.53 B | ROCm | 99 | 1 | 0 | tg128 | 24.08 ± 0.00 | +| qwen3moe 30B.A3B BF16 | 56.89 GiB | 30.53 B | ROCm | 99 | 1 | 0 | pp512 | 479.22 ± 4.77 | +| qwen3moe 30B.A3B BF16 | 56.89 GiB | 30.53 B | ROCm | 99 | 1 | 0 | tg128 | 24.91 ± 0.01 | -build: 1fe00296 (6182) +build: f1fbffb5 (6486) diff --git a/benchmark/results/Qwen3-30B-A3B-BF16-00001-of-00002__rocm6_4_3-rocwmma__hblt0.log b/benchmark/results/Qwen3-30B-A3B-BF16-00001-of-00002__rocm6_4_3-rocwmma__hblt0.log index 96ee7c7..3f88455 100644 --- a/benchmark/results/Qwen3-30B-A3B-BF16-00001-of-00002__rocm6_4_3-rocwmma__hblt0.log +++ b/benchmark/results/Qwen3-30B-A3B-BF16-00001-of-00002__rocm6_4_3-rocwmma__hblt0.log @@ -4,7 +4,7 @@ ggml_cuda_init: found 1 ROCm devices: Device 0: Radeon 8060S Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32 | model | size | params | backend | ngl | mmap | test | t/s | | ------------------------------ | ---------: | ---------: | ---------- | --: | ---: | --------------: | -------------------: | -| qwen3moe 30B.A3B BF16 | 56.89 GiB | 30.53 B | ROCm | 99 | 0 | pp512 | 158.13 ± 2.40 | -| qwen3moe 30B.A3B BF16 | 56.89 GiB | 30.53 B | ROCm | 99 | 0 | tg128 | 24.58 ± 0.00 | +| qwen3moe 30B.A3B BF16 | 56.89 GiB | 30.53 B | ROCm | 99 | 0 | pp512 | 158.56 ± 4.20 | +| qwen3moe 30B.A3B BF16 | 56.89 GiB | 30.53 B | ROCm | 99 | 0 | tg128 | 25.03 ± 0.00 | -build: 1fe00296 (6182) +build: f1fbffb5 (6486) diff --git a/benchmark/results/Qwen3-30B-A3B-BF16-00001-of-00002__rocm6_4_3-rocwmma__hblt0__fa1.log b/benchmark/results/Qwen3-30B-A3B-BF16-00001-of-00002__rocm6_4_3-rocwmma__hblt0__fa1.log index e330db0..f0a6412 100644 --- a/benchmark/results/Qwen3-30B-A3B-BF16-00001-of-00002__rocm6_4_3-rocwmma__hblt0__fa1.log +++ b/benchmark/results/Qwen3-30B-A3B-BF16-00001-of-00002__rocm6_4_3-rocwmma__hblt0__fa1.log @@ -4,7 +4,7 @@ ggml_cuda_init: found 1 ROCm devices: Device 0: Radeon 8060S Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32 | model | size | params | backend | ngl | fa | mmap | test | t/s | | ------------------------------ | ---------: | ---------: | ---------- | --: | -: | ---: | --------------: | -------------------: | -| qwen3moe 30B.A3B BF16 | 56.89 GiB | 30.53 B | ROCm | 99 | 1 | 0 | pp512 | 163.40 ± 3.21 | -| qwen3moe 30B.A3B BF16 | 56.89 GiB | 30.53 B | ROCm | 99 | 1 | 0 | tg128 | 24.14 ± 0.00 | +| qwen3moe 30B.A3B BF16 | 56.89 GiB | 30.53 B | ROCm | 99 | 1 | 0 | pp512 | 165.57 ± 2.56 | +| qwen3moe 30B.A3B BF16 | 56.89 GiB | 30.53 B | ROCm | 99 | 1 | 0 | tg128 | 24.89 ± 0.00 | -build: 1fe00296 (6182) +build: f1fbffb5 (6486) diff --git a/benchmark/results/Qwen3-30B-A3B-BF16-00001-of-00002__rocm6_4_3.log b/benchmark/results/Qwen3-30B-A3B-BF16-00001-of-00002__rocm6_4_3.log index f983fbc..cff659b 100644 --- a/benchmark/results/Qwen3-30B-A3B-BF16-00001-of-00002__rocm6_4_3.log +++ b/benchmark/results/Qwen3-30B-A3B-BF16-00001-of-00002__rocm6_4_3.log @@ -9,7 +9,7 @@ rocBLAS warning: hipBlasLT failed, falling back to tensile. This message will be only be displayed once, unless the ROCBLAS_VERBOSE_TENSILE_ERROR environment variable is set. | model | size | params | backend | ngl | mmap | test | t/s | | ------------------------------ | ---------: | ---------: | ---------- | --: | ---: | --------------: | -------------------: | -| qwen3moe 30B.A3B BF16 | 56.89 GiB | 30.53 B | ROCm | 99 | 0 | pp512 | 441.36 ± 3.35 | -| qwen3moe 30B.A3B BF16 | 56.89 GiB | 30.53 B | ROCm | 99 | 0 | tg128 | 24.60 ± 0.01 | +| qwen3moe 30B.A3B BF16 | 56.89 GiB | 30.53 B | ROCm | 99 | 0 | pp512 | 432.19 ± 6.24 | +| qwen3moe 30B.A3B BF16 | 56.89 GiB | 30.53 B | ROCm | 99 | 0 | tg128 | 25.02 ± 0.00 | -build: 1fe00296 (6182) +build: f1fbffb5 (6486) diff --git a/benchmark/results/Qwen3-30B-A3B-BF16-00001-of-00002__rocm6_4_3__fa1.log b/benchmark/results/Qwen3-30B-A3B-BF16-00001-of-00002__rocm6_4_3__fa1.log index 04c9b9c..77fe8ec 100644 --- a/benchmark/results/Qwen3-30B-A3B-BF16-00001-of-00002__rocm6_4_3__fa1.log +++ b/benchmark/results/Qwen3-30B-A3B-BF16-00001-of-00002__rocm6_4_3__fa1.log @@ -4,7 +4,7 @@ ggml_cuda_init: found 1 ROCm devices: Device 0: Radeon 8060S Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32 | model | size | params | backend | ngl | fa | mmap | test | t/s | | ------------------------------ | ---------: | ---------: | ---------- | --: | -: | ---: | --------------: | -------------------: | -| qwen3moe 30B.A3B BF16 | 56.89 GiB | 30.53 B | ROCm | 99 | 1 | 0 | pp512 | 337.36 ± 3.48 | -| qwen3moe 30B.A3B BF16 | 56.89 GiB | 30.53 B | ROCm | 99 | 1 | 0 | tg128 | 24.45 ± 0.00 | +| qwen3moe 30B.A3B BF16 | 56.89 GiB | 30.53 B | ROCm | 99 | 1 | 0 | pp512 | 477.24 ± 5.25 | +| qwen3moe 30B.A3B BF16 | 56.89 GiB | 30.53 B | ROCm | 99 | 1 | 0 | tg128 | 24.88 ± 0.00 | -build: 1fe00296 (6182) +build: f1fbffb5 (6486) diff --git a/benchmark/results/Qwen3-30B-A3B-BF16-00001-of-00002__rocm6_4_3__hblt0.log b/benchmark/results/Qwen3-30B-A3B-BF16-00001-of-00002__rocm6_4_3__hblt0.log index c1ff6cb..501ffea 100644 --- a/benchmark/results/Qwen3-30B-A3B-BF16-00001-of-00002__rocm6_4_3__hblt0.log +++ b/benchmark/results/Qwen3-30B-A3B-BF16-00001-of-00002__rocm6_4_3__hblt0.log @@ -4,7 +4,7 @@ ggml_cuda_init: found 1 ROCm devices: Device 0: Radeon 8060S Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32 | model | size | params | backend | ngl | mmap | test | t/s | | ------------------------------ | ---------: | ---------: | ---------- | --: | ---: | --------------: | -------------------: | -| qwen3moe 30B.A3B BF16 | 56.89 GiB | 30.53 B | ROCm | 99 | 0 | pp512 | 161.73 ± 1.23 | -| qwen3moe 30B.A3B BF16 | 56.89 GiB | 30.53 B | ROCm | 99 | 0 | tg128 | 24.58 ± 0.01 | +| qwen3moe 30B.A3B BF16 | 56.89 GiB | 30.53 B | ROCm | 99 | 0 | pp512 | 162.44 ± 4.25 | +| qwen3moe 30B.A3B BF16 | 56.89 GiB | 30.53 B | ROCm | 99 | 0 | tg128 | 25.05 ± 0.01 | -build: 1fe00296 (6182) +build: f1fbffb5 (6486) diff --git a/benchmark/results/Qwen3-30B-A3B-BF16-00001-of-00002__rocm6_4_3__hblt0__fa1.log b/benchmark/results/Qwen3-30B-A3B-BF16-00001-of-00002__rocm6_4_3__hblt0__fa1.log index aaa3b75..1c49a1a 100644 --- a/benchmark/results/Qwen3-30B-A3B-BF16-00001-of-00002__rocm6_4_3__hblt0__fa1.log +++ b/benchmark/results/Qwen3-30B-A3B-BF16-00001-of-00002__rocm6_4_3__hblt0__fa1.log @@ -4,7 +4,7 @@ ggml_cuda_init: found 1 ROCm devices: Device 0: Radeon 8060S Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32 | model | size | params | backend | ngl | fa | mmap | test | t/s | | ------------------------------ | ---------: | ---------: | ---------- | --: | -: | ---: | --------------: | -------------------: | -| qwen3moe 30B.A3B BF16 | 56.89 GiB | 30.53 B | ROCm | 99 | 1 | 0 | pp512 | 143.05 ± 2.10 | -| qwen3moe 30B.A3B BF16 | 56.89 GiB | 30.53 B | ROCm | 99 | 1 | 0 | tg128 | 24.42 ± 0.01 | +| qwen3moe 30B.A3B BF16 | 56.89 GiB | 30.53 B | ROCm | 99 | 1 | 0 | pp512 | 160.17 ± 3.26 | +| qwen3moe 30B.A3B BF16 | 56.89 GiB | 30.53 B | ROCm | 99 | 1 | 0 | tg128 | 24.91 ± 0.01 | -build: 1fe00296 (6182) +build: f1fbffb5 (6486) diff --git a/benchmark/results/Qwen3-30B-A3B-BF16-00001-of-00002__rocm7_rc-rocwmma-fa_all_quants.log b/benchmark/results/Qwen3-30B-A3B-BF16-00001-of-00002__rocm7_rc-rocwmma-fa_all_quants.log new file mode 100644 index 0000000..11cfce9 --- /dev/null +++ b/benchmark/results/Qwen3-30B-A3B-BF16-00001-of-00002__rocm7_rc-rocwmma-fa_all_quants.log @@ -0,0 +1,10 @@ +ggml_cuda_init: GGML_CUDA_FORCE_MMQ: no +ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no +ggml_cuda_init: found 1 ROCm devices: + Device 0: AMD Radeon Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32 +| model | size | params | backend | ngl | mmap | test | t/s | +| ------------------------------ | ---------: | ---------: | ---------- | --: | ---: | --------------: | -------------------: | +| qwen3moe 30B.A3B BF16 | 56.89 GiB | 30.53 B | ROCm | 99 | 0 | pp512 | 435.53 ± 2.47 | +| qwen3moe 30B.A3B BF16 | 56.89 GiB | 30.53 B | ROCm | 99 | 0 | tg128 | 24.69 ± 0.00 | + +build: f1fbffb5 (6486) diff --git a/benchmark/results/Qwen3-30B-A3B-BF16-00001-of-00002__rocm7_rc-rocwmma-fa_all_quants__fa1.log b/benchmark/results/Qwen3-30B-A3B-BF16-00001-of-00002__rocm7_rc-rocwmma-fa_all_quants__fa1.log new file mode 100644 index 0000000..cdcd330 --- /dev/null +++ b/benchmark/results/Qwen3-30B-A3B-BF16-00001-of-00002__rocm7_rc-rocwmma-fa_all_quants__fa1.log @@ -0,0 +1,10 @@ +ggml_cuda_init: GGML_CUDA_FORCE_MMQ: no +ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no +ggml_cuda_init: found 1 ROCm devices: + Device 0: AMD Radeon Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32 +| model | size | params | backend | ngl | fa | mmap | test | t/s | +| ------------------------------ | ---------: | ---------: | ---------- | --: | -: | ---: | --------------: | -------------------: | +| qwen3moe 30B.A3B BF16 | 56.89 GiB | 30.53 B | ROCm | 99 | 1 | 0 | pp512 | 476.36 ± 3.91 | +| qwen3moe 30B.A3B BF16 | 56.89 GiB | 30.53 B | ROCm | 99 | 1 | 0 | tg128 | 24.93 ± 0.00 | + +build: f1fbffb5 (6486) diff --git a/benchmark/results/Qwen3-30B-A3B-BF16-00001-of-00002__rocm7_rc-rocwmma-fa_all_quants__hblt0.log b/benchmark/results/Qwen3-30B-A3B-BF16-00001-of-00002__rocm7_rc-rocwmma-fa_all_quants__hblt0.log new file mode 100644 index 0000000..85da425 --- /dev/null +++ b/benchmark/results/Qwen3-30B-A3B-BF16-00001-of-00002__rocm7_rc-rocwmma-fa_all_quants__hblt0.log @@ -0,0 +1,10 @@ +ggml_cuda_init: GGML_CUDA_FORCE_MMQ: no +ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no +ggml_cuda_init: found 1 ROCm devices: + Device 0: AMD Radeon Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32 +| model | size | params | backend | ngl | mmap | test | t/s | +| ------------------------------ | ---------: | ---------: | ---------- | --: | ---: | --------------: | -------------------: | +| qwen3moe 30B.A3B BF16 | 56.89 GiB | 30.53 B | ROCm | 99 | 0 | pp512 | 330.47 ± 5.12 | +| qwen3moe 30B.A3B BF16 | 56.89 GiB | 30.53 B | ROCm | 99 | 0 | tg128 | 25.09 ± 0.00 | + +build: f1fbffb5 (6486) diff --git a/benchmark/results/Qwen3-30B-A3B-BF16-00001-of-00002__rocm7_rc-rocwmma-fa_all_quants__hblt0__fa1.log b/benchmark/results/Qwen3-30B-A3B-BF16-00001-of-00002__rocm7_rc-rocwmma-fa_all_quants__hblt0__fa1.log new file mode 100644 index 0000000..b9f0191 --- /dev/null +++ b/benchmark/results/Qwen3-30B-A3B-BF16-00001-of-00002__rocm7_rc-rocwmma-fa_all_quants__hblt0__fa1.log @@ -0,0 +1,10 @@ +ggml_cuda_init: GGML_CUDA_FORCE_MMQ: no +ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no +ggml_cuda_init: found 1 ROCm devices: + Device 0: AMD Radeon Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32 +| model | size | params | backend | ngl | fa | mmap | test | t/s | +| ------------------------------ | ---------: | ---------: | ---------- | --: | -: | ---: | --------------: | -------------------: | +| qwen3moe 30B.A3B BF16 | 56.89 GiB | 30.53 B | ROCm | 99 | 1 | 0 | pp512 | 343.19 ± 4.41 | +| qwen3moe 30B.A3B BF16 | 56.89 GiB | 30.53 B | ROCm | 99 | 1 | 0 | tg128 | 24.90 ± 0.00 | + +build: f1fbffb5 (6486) diff --git a/benchmark/results/Qwen3-30B-A3B-BF16-00001-of-00002__rocm7_rc-rocwmma.log b/benchmark/results/Qwen3-30B-A3B-BF16-00001-of-00002__rocm7_rc-rocwmma.log index 7f744f7..02d9e91 100644 --- a/benchmark/results/Qwen3-30B-A3B-BF16-00001-of-00002__rocm7_rc-rocwmma.log +++ b/benchmark/results/Qwen3-30B-A3B-BF16-00001-of-00002__rocm7_rc-rocwmma.log @@ -4,7 +4,7 @@ ggml_cuda_init: found 1 ROCm devices: Device 0: AMD Radeon Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32 | model | size | params | backend | ngl | mmap | test | t/s | | ------------------------------ | ---------: | ---------: | ---------- | --: | ---: | --------------: | -------------------: | -| qwen3moe 30B.A3B BF16 | 56.89 GiB | 30.53 B | ROCm | 99 | 0 | pp512 | 448.63 ± 5.90 | -| qwen3moe 30B.A3B BF16 | 56.89 GiB | 30.53 B | ROCm | 99 | 0 | tg128 | 24.96 ± 0.02 | +| qwen3moe 30B.A3B BF16 | 56.89 GiB | 30.53 B | ROCm | 99 | 0 | pp512 | 431.59 ± 5.03 | +| qwen3moe 30B.A3B BF16 | 56.89 GiB | 30.53 B | ROCm | 99 | 0 | tg128 | 25.06 ± 0.01 | -build: de219279 (6181) +build: f1fbffb5 (6486) diff --git a/benchmark/results/Qwen3-30B-A3B-BF16-00001-of-00002__rocm7_rc-rocwmma__fa1.log b/benchmark/results/Qwen3-30B-A3B-BF16-00001-of-00002__rocm7_rc-rocwmma__fa1.log index 61f996e..7c575f0 100644 --- a/benchmark/results/Qwen3-30B-A3B-BF16-00001-of-00002__rocm7_rc-rocwmma__fa1.log +++ b/benchmark/results/Qwen3-30B-A3B-BF16-00001-of-00002__rocm7_rc-rocwmma__fa1.log @@ -4,7 +4,7 @@ ggml_cuda_init: found 1 ROCm devices: Device 0: AMD Radeon Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32 | model | size | params | backend | ngl | fa | mmap | test | t/s | | ------------------------------ | ---------: | ---------: | ---------- | --: | -: | ---: | --------------: | -------------------: | -| qwen3moe 30B.A3B BF16 | 56.89 GiB | 30.53 B | ROCm | 99 | 1 | 0 | pp512 | 473.34 ± 8.60 | -| qwen3moe 30B.A3B BF16 | 56.89 GiB | 30.53 B | ROCm | 99 | 1 | 0 | tg128 | 23.99 ± 0.01 | +| qwen3moe 30B.A3B BF16 | 56.89 GiB | 30.53 B | ROCm | 99 | 1 | 0 | pp512 | 476.09 ± 5.36 | +| qwen3moe 30B.A3B BF16 | 56.89 GiB | 30.53 B | ROCm | 99 | 1 | 0 | tg128 | 24.93 ± 0.01 | -build: de219279 (6181) +build: f1fbffb5 (6486) diff --git a/benchmark/results/Qwen3-30B-A3B-BF16-00001-of-00002__rocm7_rc-rocwmma__hblt0.log b/benchmark/results/Qwen3-30B-A3B-BF16-00001-of-00002__rocm7_rc-rocwmma__hblt0.log index 8b322b7..ca2d22c 100644 --- a/benchmark/results/Qwen3-30B-A3B-BF16-00001-of-00002__rocm7_rc-rocwmma__hblt0.log +++ b/benchmark/results/Qwen3-30B-A3B-BF16-00001-of-00002__rocm7_rc-rocwmma__hblt0.log @@ -4,7 +4,7 @@ ggml_cuda_init: found 1 ROCm devices: Device 0: AMD Radeon Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32 | model | size | params | backend | ngl | mmap | test | t/s | | ------------------------------ | ---------: | ---------: | ---------- | --: | ---: | --------------: | -------------------: | -| qwen3moe 30B.A3B BF16 | 56.89 GiB | 30.53 B | ROCm | 99 | 0 | pp512 | 338.07 ± 3.03 | -| qwen3moe 30B.A3B BF16 | 56.89 GiB | 30.53 B | ROCm | 99 | 0 | tg128 | 24.93 ± 0.03 | +| qwen3moe 30B.A3B BF16 | 56.89 GiB | 30.53 B | ROCm | 99 | 0 | pp512 | 332.32 ± 3.60 | +| qwen3moe 30B.A3B BF16 | 56.89 GiB | 30.53 B | ROCm | 99 | 0 | tg128 | 25.11 ± 0.01 | -build: de219279 (6181) +build: f1fbffb5 (6486) diff --git a/benchmark/results/Qwen3-30B-A3B-BF16-00001-of-00002__rocm7_rc-rocwmma__hblt0__fa1.log b/benchmark/results/Qwen3-30B-A3B-BF16-00001-of-00002__rocm7_rc-rocwmma__hblt0__fa1.log index a3675a1..368ff16 100644 --- a/benchmark/results/Qwen3-30B-A3B-BF16-00001-of-00002__rocm7_rc-rocwmma__hblt0__fa1.log +++ b/benchmark/results/Qwen3-30B-A3B-BF16-00001-of-00002__rocm7_rc-rocwmma__hblt0__fa1.log @@ -4,7 +4,7 @@ ggml_cuda_init: found 1 ROCm devices: Device 0: AMD Radeon Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32 | model | size | params | backend | ngl | fa | mmap | test | t/s | | ------------------------------ | ---------: | ---------: | ---------- | --: | -: | ---: | --------------: | -------------------: | -| qwen3moe 30B.A3B BF16 | 56.89 GiB | 30.53 B | ROCm | 99 | 1 | 0 | pp512 | 342.57 ± 3.12 | -| qwen3moe 30B.A3B BF16 | 56.89 GiB | 30.53 B | ROCm | 99 | 1 | 0 | tg128 | 23.97 ± 0.00 | +| qwen3moe 30B.A3B BF16 | 56.89 GiB | 30.53 B | ROCm | 99 | 1 | 0 | pp512 | 344.55 ± 3.84 | +| qwen3moe 30B.A3B BF16 | 56.89 GiB | 30.53 B | ROCm | 99 | 1 | 0 | tg128 | 24.92 ± 0.00 | -build: de219279 (6181) +build: f1fbffb5 (6486) diff --git a/benchmark/results/Qwen3-30B-A3B-BF16-00001-of-00002__rocm7_rc.log b/benchmark/results/Qwen3-30B-A3B-BF16-00001-of-00002__rocm7_rc.log index c7f3224..579c663 100644 --- a/benchmark/results/Qwen3-30B-A3B-BF16-00001-of-00002__rocm7_rc.log +++ b/benchmark/results/Qwen3-30B-A3B-BF16-00001-of-00002__rocm7_rc.log @@ -4,7 +4,7 @@ ggml_cuda_init: found 1 ROCm devices: Device 0: AMD Radeon Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32 | model | size | params | backend | ngl | mmap | test | t/s | | ------------------------------ | ---------: | ---------: | ---------- | --: | ---: | --------------: | -------------------: | -| qwen3moe 30B.A3B BF16 | 56.89 GiB | 30.53 B | ROCm | 99 | 0 | pp512 | 444.30 ± 6.78 | -| qwen3moe 30B.A3B BF16 | 56.89 GiB | 30.53 B | ROCm | 99 | 0 | tg128 | 24.66 ± 0.00 | +| qwen3moe 30B.A3B BF16 | 56.89 GiB | 30.53 B | ROCm | 99 | 0 | pp512 | 431.29 ± 3.17 | +| qwen3moe 30B.A3B BF16 | 56.89 GiB | 30.53 B | ROCm | 99 | 0 | tg128 | 25.10 ± 0.01 | -build: de219279 (6181) +build: f1fbffb5 (6486) diff --git a/benchmark/results/Qwen3-30B-A3B-BF16-00001-of-00002__rocm7_rc__fa1.log b/benchmark/results/Qwen3-30B-A3B-BF16-00001-of-00002__rocm7_rc__fa1.log index 8384bda..aadc637 100644 --- a/benchmark/results/Qwen3-30B-A3B-BF16-00001-of-00002__rocm7_rc__fa1.log +++ b/benchmark/results/Qwen3-30B-A3B-BF16-00001-of-00002__rocm7_rc__fa1.log @@ -2,5 +2,9 @@ ggml_cuda_init: GGML_CUDA_FORCE_MMQ: no ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no ggml_cuda_init: found 1 ROCm devices: Device 0: AMD Radeon Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32 -Memory access fault by GPU node-1 (Agent handle: 0x38fecea0) on address 0x7f31ea76f000. Reason: Page not present or supervisor privilege. -✖ ! [rocm7_rc] Qwen3-30B-A3B-BF16-00001-of-00002 __fa1 failed (exit 134) +| model | size | params | backend | ngl | fa | mmap | test | t/s | +| ------------------------------ | ---------: | ---------: | ---------- | --: | -: | ---: | --------------: | -------------------: | +| qwen3moe 30B.A3B BF16 | 56.89 GiB | 30.53 B | ROCm | 99 | 1 | 0 | pp512 | 475.35 ± 3.41 | +| qwen3moe 30B.A3B BF16 | 56.89 GiB | 30.53 B | ROCm | 99 | 1 | 0 | tg128 | 24.94 ± 0.00 | + +build: f1fbffb5 (6486) diff --git a/benchmark/results/Qwen3-30B-A3B-BF16-00001-of-00002__rocm7_rc__hblt0.log b/benchmark/results/Qwen3-30B-A3B-BF16-00001-of-00002__rocm7_rc__hblt0.log index aab676c..9061b2e 100644 --- a/benchmark/results/Qwen3-30B-A3B-BF16-00001-of-00002__rocm7_rc__hblt0.log +++ b/benchmark/results/Qwen3-30B-A3B-BF16-00001-of-00002__rocm7_rc__hblt0.log @@ -4,7 +4,7 @@ ggml_cuda_init: found 1 ROCm devices: Device 0: AMD Radeon Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32 | model | size | params | backend | ngl | mmap | test | t/s | | ------------------------------ | ---------: | ---------: | ---------- | --: | ---: | --------------: | -------------------: | -| qwen3moe 30B.A3B BF16 | 56.89 GiB | 30.53 B | ROCm | 99 | 0 | pp512 | 333.42 ± 6.83 | -| qwen3moe 30B.A3B BF16 | 56.89 GiB | 30.53 B | ROCm | 99 | 0 | tg128 | 24.69 ± 0.00 | +| qwen3moe 30B.A3B BF16 | 56.89 GiB | 30.53 B | ROCm | 99 | 0 | pp512 | 329.24 ± 2.98 | +| qwen3moe 30B.A3B BF16 | 56.89 GiB | 30.53 B | ROCm | 99 | 0 | tg128 | 25.06 ± 0.00 | -build: de219279 (6181) +build: f1fbffb5 (6486) diff --git a/benchmark/results/Qwen3-30B-A3B-BF16-00001-of-00002__rocm7_rc__hblt0__fa1.log b/benchmark/results/Qwen3-30B-A3B-BF16-00001-of-00002__rocm7_rc__hblt0__fa1.log index dba441c..ae585b8 100644 --- a/benchmark/results/Qwen3-30B-A3B-BF16-00001-of-00002__rocm7_rc__hblt0__fa1.log +++ b/benchmark/results/Qwen3-30B-A3B-BF16-00001-of-00002__rocm7_rc__hblt0__fa1.log @@ -2,5 +2,9 @@ ggml_cuda_init: GGML_CUDA_FORCE_MMQ: no ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no ggml_cuda_init: found 1 ROCm devices: Device 0: AMD Radeon Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32 -Memory access fault by GPU node-1 (Agent handle: 0x1f121ea0) on address 0x7fd78e16f000. Reason: Page not present or supervisor privilege. -✖ ! [rocm7_rc] Qwen3-30B-A3B-BF16-00001-of-00002 __hblt0__fa1 failed (exit 134) +| model | size | params | backend | ngl | fa | mmap | test | t/s | +| ------------------------------ | ---------: | ---------: | ---------- | --: | -: | ---: | --------------: | -------------------: | +| qwen3moe 30B.A3B BF16 | 56.89 GiB | 30.53 B | ROCm | 99 | 1 | 0 | pp512 | 348.53 ± 5.60 | +| qwen3moe 30B.A3B BF16 | 56.89 GiB | 30.53 B | ROCm | 99 | 1 | 0 | tg128 | 24.92 ± 0.01 | + +build: f1fbffb5 (6486) diff --git a/benchmark/results/Qwen3-30B-A3B-BF16-00001-of-00002__vulkan_amdvlk.log b/benchmark/results/Qwen3-30B-A3B-BF16-00001-of-00002__vulkan_amdvlk.log index 67aa5d3..9733b38 100644 --- a/benchmark/results/Qwen3-30B-A3B-BF16-00001-of-00002__vulkan_amdvlk.log +++ b/benchmark/results/Qwen3-30B-A3B-BF16-00001-of-00002__vulkan_amdvlk.log @@ -2,7 +2,7 @@ ggml_vulkan: Found 1 Vulkan devices: ggml_vulkan: 0 = Radeon 8060S Graphics (AMD open-source driver) | uma: 1 | fp16: 1 | bf16: 0 | warp size: 64 | shared memory: 32768 | int dot: 1 | matrix cores: KHR_coopmat | model | size | params | backend | ngl | mmap | test | t/s | | ------------------------------ | ---------: | ---------: | ---------- | --: | ---: | --------------: | -------------------: | -| qwen3moe 30B.A3B BF16 | 56.89 GiB | 30.53 B | Vulkan | 99 | 0 | pp512 | 106.47 ± 0.10 | -| qwen3moe 30B.A3B BF16 | 56.89 GiB | 30.53 B | Vulkan | 99 | 0 | tg128 | 8.18 ± 0.00 | +| qwen3moe 30B.A3B BF16 | 56.89 GiB | 30.53 B | Vulkan | 99 | 0 | pp512 | 139.51 ± 0.90 | +| qwen3moe 30B.A3B BF16 | 56.89 GiB | 30.53 B | Vulkan | 99 | 0 | tg128 | 8.31 ± 0.01 | -build: 1fe00296 (6182) +build: f1fbffb5 (6486) diff --git a/benchmark/results/Qwen3-30B-A3B-BF16-00001-of-00002__vulkan_amdvlk__fa1.log b/benchmark/results/Qwen3-30B-A3B-BF16-00001-of-00002__vulkan_amdvlk__fa1.log index de3ec24..89ebf52 100644 --- a/benchmark/results/Qwen3-30B-A3B-BF16-00001-of-00002__vulkan_amdvlk__fa1.log +++ b/benchmark/results/Qwen3-30B-A3B-BF16-00001-of-00002__vulkan_amdvlk__fa1.log @@ -2,7 +2,7 @@ ggml_vulkan: Found 1 Vulkan devices: ggml_vulkan: 0 = Radeon 8060S Graphics (AMD open-source driver) | uma: 1 | fp16: 1 | bf16: 0 | warp size: 64 | shared memory: 32768 | int dot: 1 | matrix cores: KHR_coopmat | model | size | params | backend | ngl | fa | mmap | test | t/s | | ------------------------------ | ---------: | ---------: | ---------- | --: | -: | ---: | --------------: | -------------------: | -| qwen3moe 30B.A3B BF16 | 56.89 GiB | 30.53 B | Vulkan | 99 | 1 | 0 | pp512 | 106.77 ± 0.12 | -| qwen3moe 30B.A3B BF16 | 56.89 GiB | 30.53 B | Vulkan | 99 | 1 | 0 | tg128 | 8.11 ± 0.00 | +| qwen3moe 30B.A3B BF16 | 56.89 GiB | 30.53 B | Vulkan | 99 | 1 | 0 | pp512 | 140.62 ± 1.53 | +| qwen3moe 30B.A3B BF16 | 56.89 GiB | 30.53 B | Vulkan | 99 | 1 | 0 | tg128 | 8.26 ± 0.00 | -build: 1fe00296 (6182) +build: f1fbffb5 (6486) diff --git a/benchmark/results/Qwen3-30B-A3B-BF16-00001-of-00002__vulkan_radv.log b/benchmark/results/Qwen3-30B-A3B-BF16-00001-of-00002__vulkan_radv.log index 7ea35f3..f88d6ea 100644 --- a/benchmark/results/Qwen3-30B-A3B-BF16-00001-of-00002__vulkan_radv.log +++ b/benchmark/results/Qwen3-30B-A3B-BF16-00001-of-00002__vulkan_radv.log @@ -2,7 +2,7 @@ ggml_vulkan: Found 1 Vulkan devices: ggml_vulkan: 0 = Radeon 8060S Graphics (RADV GFX1151) (radv) | uma: 1 | fp16: 1 | bf16: 0 | warp size: 64 | shared memory: 65536 | int dot: 1 | matrix cores: KHR_coopmat | model | size | params | backend | ngl | mmap | test | t/s | | ------------------------------ | ---------: | ---------: | ---------- | --: | ---: | --------------: | -------------------: | -| qwen3moe 30B.A3B BF16 | 56.89 GiB | 30.53 B | Vulkan | 99 | 0 | pp512 | 84.71 ± 0.11 | -| qwen3moe 30B.A3B BF16 | 56.89 GiB | 30.53 B | Vulkan | 99 | 0 | tg128 | 7.52 ± 0.00 | +| qwen3moe 30B.A3B BF16 | 56.89 GiB | 30.53 B | Vulkan | 99 | 0 | pp512 | 150.84 ± 1.38 | +| qwen3moe 30B.A3B BF16 | 56.89 GiB | 30.53 B | Vulkan | 99 | 0 | tg128 | 8.24 ± 0.00 | -build: 1fe00296 (6182) +build: f1fbffb5 (6486) diff --git a/benchmark/results/Qwen3-30B-A3B-BF16-00001-of-00002__vulkan_radv__fa1.log b/benchmark/results/Qwen3-30B-A3B-BF16-00001-of-00002__vulkan_radv__fa1.log index 2aa8bc0..37c3d79 100644 --- a/benchmark/results/Qwen3-30B-A3B-BF16-00001-of-00002__vulkan_radv__fa1.log +++ b/benchmark/results/Qwen3-30B-A3B-BF16-00001-of-00002__vulkan_radv__fa1.log @@ -2,7 +2,7 @@ ggml_vulkan: Found 1 Vulkan devices: ggml_vulkan: 0 = Radeon 8060S Graphics (RADV GFX1151) (radv) | uma: 1 | fp16: 1 | bf16: 0 | warp size: 64 | shared memory: 65536 | int dot: 1 | matrix cores: KHR_coopmat | model | size | params | backend | ngl | fa | mmap | test | t/s | | ------------------------------ | ---------: | ---------: | ---------- | --: | -: | ---: | --------------: | -------------------: | -| qwen3moe 30B.A3B BF16 | 56.89 GiB | 30.53 B | Vulkan | 99 | 1 | 0 | pp512 | 85.70 ± 0.10 | -| qwen3moe 30B.A3B BF16 | 56.89 GiB | 30.53 B | Vulkan | 99 | 1 | 0 | tg128 | 7.52 ± 0.00 | +| qwen3moe 30B.A3B BF16 | 56.89 GiB | 30.53 B | Vulkan | 99 | 1 | 0 | pp512 | 156.53 ± 2.33 | +| qwen3moe 30B.A3B BF16 | 56.89 GiB | 30.53 B | Vulkan | 99 | 1 | 0 | tg128 | 8.29 ± 0.01 | -build: 1fe00296 (6182) +build: f1fbffb5 (6486) diff --git a/benchmark/results/Qwen3-30B-A3B-Instruct-2507-UD-Q6_K_XL__rocm6_4_3-rocwmma.log b/benchmark/results/Qwen3-30B-A3B-Instruct-2507-UD-Q6_K_XL__rocm6_4_3-rocwmma.log index 7499112..b163e9b 100644 --- a/benchmark/results/Qwen3-30B-A3B-Instruct-2507-UD-Q6_K_XL__rocm6_4_3-rocwmma.log +++ b/benchmark/results/Qwen3-30B-A3B-Instruct-2507-UD-Q6_K_XL__rocm6_4_3-rocwmma.log @@ -9,7 +9,7 @@ rocBLAS warning: hipBlasLT failed, falling back to tensile. This message will be only be displayed once, unless the ROCBLAS_VERBOSE_TENSILE_ERROR environment variable is set. | model | size | params | backend | ngl | mmap | test | t/s | | ------------------------------ | ---------: | ---------: | ---------- | --: | ---: | --------------: | -------------------: | -| qwen3moe 30B.A3B Q6_K | 24.53 GiB | 30.53 B | ROCm | 99 | 0 | pp512 | 564.83 ± 6.58 | -| qwen3moe 30B.A3B Q6_K | 24.53 GiB | 30.53 B | ROCm | 99 | 0 | tg128 | 50.68 ± 0.01 | +| qwen3moe 30B.A3B Q6_K | 24.53 GiB | 30.53 B | ROCm | 99 | 0 | pp512 | 538.66 ± 2.16 | +| qwen3moe 30B.A3B Q6_K | 24.53 GiB | 30.53 B | ROCm | 99 | 0 | tg128 | 53.01 ± 0.01 | -build: 1fe00296 (6182) +build: f1fbffb5 (6486) diff --git a/benchmark/results/Qwen3-30B-A3B-Instruct-2507-UD-Q6_K_XL__rocm6_4_3-rocwmma__fa1.log b/benchmark/results/Qwen3-30B-A3B-Instruct-2507-UD-Q6_K_XL__rocm6_4_3-rocwmma__fa1.log index 8947515..d8e45a2 100644 --- a/benchmark/results/Qwen3-30B-A3B-Instruct-2507-UD-Q6_K_XL__rocm6_4_3-rocwmma__fa1.log +++ b/benchmark/results/Qwen3-30B-A3B-Instruct-2507-UD-Q6_K_XL__rocm6_4_3-rocwmma__fa1.log @@ -4,7 +4,7 @@ ggml_cuda_init: found 1 ROCm devices: Device 0: Radeon 8060S Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32 | model | size | params | backend | ngl | fa | mmap | test | t/s | | ------------------------------ | ---------: | ---------: | ---------- | --: | -: | ---: | --------------: | -------------------: | -| qwen3moe 30B.A3B Q6_K | 24.53 GiB | 30.53 B | ROCm | 99 | 1 | 0 | pp512 | 624.99 ± 3.81 | -| qwen3moe 30B.A3B Q6_K | 24.53 GiB | 30.53 B | ROCm | 99 | 1 | 0 | tg128 | 48.64 ± 0.01 | +| qwen3moe 30B.A3B Q6_K | 24.53 GiB | 30.53 B | ROCm | 99 | 1 | 0 | pp512 | 620.78 ± 3.75 | +| qwen3moe 30B.A3B Q6_K | 24.53 GiB | 30.53 B | ROCm | 99 | 1 | 0 | tg128 | 52.74 ± 0.00 | -build: 1fe00296 (6182) +build: f1fbffb5 (6486) diff --git a/benchmark/results/Qwen3-30B-A3B-Instruct-2507-UD-Q6_K_XL__rocm6_4_3-rocwmma__hblt0.log b/benchmark/results/Qwen3-30B-A3B-Instruct-2507-UD-Q6_K_XL__rocm6_4_3-rocwmma__hblt0.log index 1488828..4b2bee2 100644 --- a/benchmark/results/Qwen3-30B-A3B-Instruct-2507-UD-Q6_K_XL__rocm6_4_3-rocwmma__hblt0.log +++ b/benchmark/results/Qwen3-30B-A3B-Instruct-2507-UD-Q6_K_XL__rocm6_4_3-rocwmma__hblt0.log @@ -4,7 +4,7 @@ ggml_cuda_init: found 1 ROCm devices: Device 0: Radeon 8060S Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32 | model | size | params | backend | ngl | mmap | test | t/s | | ------------------------------ | ---------: | ---------: | ---------- | --: | ---: | --------------: | -------------------: | -| qwen3moe 30B.A3B Q6_K | 24.53 GiB | 30.53 B | ROCm | 99 | 0 | pp512 | 389.25 ± 2.01 | -| qwen3moe 30B.A3B Q6_K | 24.53 GiB | 30.53 B | ROCm | 99 | 0 | tg128 | 50.66 ± 0.01 | +| qwen3moe 30B.A3B Q6_K | 24.53 GiB | 30.53 B | ROCm | 99 | 0 | pp512 | 389.41 ± 1.99 | +| qwen3moe 30B.A3B Q6_K | 24.53 GiB | 30.53 B | ROCm | 99 | 0 | tg128 | 53.13 ± 0.01 | -build: 1fe00296 (6182) +build: f1fbffb5 (6486) diff --git a/benchmark/results/Qwen3-30B-A3B-Instruct-2507-UD-Q6_K_XL__rocm6_4_3-rocwmma__hblt0__fa1.log b/benchmark/results/Qwen3-30B-A3B-Instruct-2507-UD-Q6_K_XL__rocm6_4_3-rocwmma__hblt0__fa1.log index d3d972e..0cc6bbd 100644 --- a/benchmark/results/Qwen3-30B-A3B-Instruct-2507-UD-Q6_K_XL__rocm6_4_3-rocwmma__hblt0__fa1.log +++ b/benchmark/results/Qwen3-30B-A3B-Instruct-2507-UD-Q6_K_XL__rocm6_4_3-rocwmma__hblt0__fa1.log @@ -4,7 +4,7 @@ ggml_cuda_init: found 1 ROCm devices: Device 0: Radeon 8060S Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32 | model | size | params | backend | ngl | fa | mmap | test | t/s | | ------------------------------ | ---------: | ---------: | ---------- | --: | -: | ---: | --------------: | -------------------: | -| qwen3moe 30B.A3B Q6_K | 24.53 GiB | 30.53 B | ROCm | 99 | 1 | 0 | pp512 | 412.18 ± 1.15 | -| qwen3moe 30B.A3B Q6_K | 24.53 GiB | 30.53 B | ROCm | 99 | 1 | 0 | tg128 | 48.80 ± 0.01 | +| qwen3moe 30B.A3B Q6_K | 24.53 GiB | 30.53 B | ROCm | 99 | 1 | 0 | pp512 | 413.64 ± 1.55 | +| qwen3moe 30B.A3B Q6_K | 24.53 GiB | 30.53 B | ROCm | 99 | 1 | 0 | tg128 | 52.76 ± 0.01 | -build: 1fe00296 (6182) +build: f1fbffb5 (6486) diff --git a/benchmark/results/Qwen3-30B-A3B-Instruct-2507-UD-Q6_K_XL__rocm6_4_3.log b/benchmark/results/Qwen3-30B-A3B-Instruct-2507-UD-Q6_K_XL__rocm6_4_3.log index bb39f34..26ad0c6 100644 --- a/benchmark/results/Qwen3-30B-A3B-Instruct-2507-UD-Q6_K_XL__rocm6_4_3.log +++ b/benchmark/results/Qwen3-30B-A3B-Instruct-2507-UD-Q6_K_XL__rocm6_4_3.log @@ -9,7 +9,7 @@ rocBLAS warning: hipBlasLT failed, falling back to tensile. This message will be only be displayed once, unless the ROCBLAS_VERBOSE_TENSILE_ERROR environment variable is set. | model | size | params | backend | ngl | mmap | test | t/s | | ------------------------------ | ---------: | ---------: | ---------- | --: | ---: | --------------: | -------------------: | -| qwen3moe 30B.A3B Q6_K | 24.53 GiB | 30.53 B | ROCm | 99 | 0 | pp512 | 562.86 ± 10.14 | -| qwen3moe 30B.A3B Q6_K | 24.53 GiB | 30.53 B | ROCm | 99 | 0 | tg128 | 50.74 ± 0.01 | +| qwen3moe 30B.A3B Q6_K | 24.53 GiB | 30.53 B | ROCm | 99 | 0 | pp512 | 543.05 ± 4.56 | +| qwen3moe 30B.A3B Q6_K | 24.53 GiB | 30.53 B | ROCm | 99 | 0 | tg128 | 53.14 ± 0.01 | -build: 1fe00296 (6182) +build: f1fbffb5 (6486) diff --git a/benchmark/results/Qwen3-30B-A3B-Instruct-2507-UD-Q6_K_XL__rocm6_4_3__fa1.log b/benchmark/results/Qwen3-30B-A3B-Instruct-2507-UD-Q6_K_XL__rocm6_4_3__fa1.log index e501cb6..c9f4948 100644 --- a/benchmark/results/Qwen3-30B-A3B-Instruct-2507-UD-Q6_K_XL__rocm6_4_3__fa1.log +++ b/benchmark/results/Qwen3-30B-A3B-Instruct-2507-UD-Q6_K_XL__rocm6_4_3__fa1.log @@ -4,7 +4,7 @@ ggml_cuda_init: found 1 ROCm devices: Device 0: Radeon 8060S Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32 | model | size | params | backend | ngl | fa | mmap | test | t/s | | ------------------------------ | ---------: | ---------: | ---------- | --: | -: | ---: | --------------: | -------------------: | -| qwen3moe 30B.A3B Q6_K | 24.53 GiB | 30.53 B | ROCm | 99 | 1 | 0 | pp512 | 418.07 ± 1.65 | -| qwen3moe 30B.A3B Q6_K | 24.53 GiB | 30.53 B | ROCm | 99 | 1 | 0 | tg128 | 50.11 ± 0.01 | +| qwen3moe 30B.A3B Q6_K | 24.53 GiB | 30.53 B | ROCm | 99 | 1 | 0 | pp512 | 624.71 ± 4.66 | +| qwen3moe 30B.A3B Q6_K | 24.53 GiB | 30.53 B | ROCm | 99 | 1 | 0 | tg128 | 52.74 ± 0.01 | -build: 1fe00296 (6182) +build: f1fbffb5 (6486) diff --git a/benchmark/results/Qwen3-30B-A3B-Instruct-2507-UD-Q6_K_XL__rocm6_4_3__hblt0.log b/benchmark/results/Qwen3-30B-A3B-Instruct-2507-UD-Q6_K_XL__rocm6_4_3__hblt0.log index 87b8aec..ee0b076 100644 --- a/benchmark/results/Qwen3-30B-A3B-Instruct-2507-UD-Q6_K_XL__rocm6_4_3__hblt0.log +++ b/benchmark/results/Qwen3-30B-A3B-Instruct-2507-UD-Q6_K_XL__rocm6_4_3__hblt0.log @@ -4,7 +4,7 @@ ggml_cuda_init: found 1 ROCm devices: Device 0: Radeon 8060S Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32 | model | size | params | backend | ngl | mmap | test | t/s | | ------------------------------ | ---------: | ---------: | ---------- | --: | ---: | --------------: | -------------------: | -| qwen3moe 30B.A3B Q6_K | 24.53 GiB | 30.53 B | ROCm | 99 | 0 | pp512 | 387.74 ± 1.70 | -| qwen3moe 30B.A3B Q6_K | 24.53 GiB | 30.53 B | ROCm | 99 | 0 | tg128 | 50.65 ± 0.01 | +| qwen3moe 30B.A3B Q6_K | 24.53 GiB | 30.53 B | ROCm | 99 | 0 | pp512 | 387.73 ± 2.27 | +| qwen3moe 30B.A3B Q6_K | 24.53 GiB | 30.53 B | ROCm | 99 | 0 | tg128 | 53.30 ± 0.02 | -build: 1fe00296 (6182) +build: f1fbffb5 (6486) diff --git a/benchmark/results/Qwen3-30B-A3B-Instruct-2507-UD-Q6_K_XL__rocm6_4_3__hblt0__fa1.log b/benchmark/results/Qwen3-30B-A3B-Instruct-2507-UD-Q6_K_XL__rocm6_4_3__hblt0__fa1.log index 17416ee..6fe28d1 100644 --- a/benchmark/results/Qwen3-30B-A3B-Instruct-2507-UD-Q6_K_XL__rocm6_4_3__hblt0__fa1.log +++ b/benchmark/results/Qwen3-30B-A3B-Instruct-2507-UD-Q6_K_XL__rocm6_4_3__hblt0__fa1.log @@ -4,7 +4,7 @@ ggml_cuda_init: found 1 ROCm devices: Device 0: Radeon 8060S Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32 | model | size | params | backend | ngl | fa | mmap | test | t/s | | ------------------------------ | ---------: | ---------: | ---------- | --: | -: | ---: | --------------: | -------------------: | -| qwen3moe 30B.A3B Q6_K | 24.53 GiB | 30.53 B | ROCm | 99 | 1 | 0 | pp512 | 301.31 ± 0.65 | -| qwen3moe 30B.A3B Q6_K | 24.53 GiB | 30.53 B | ROCm | 99 | 1 | 0 | tg128 | 50.37 ± 0.02 | +| qwen3moe 30B.A3B Q6_K | 24.53 GiB | 30.53 B | ROCm | 99 | 1 | 0 | pp512 | 415.19 ± 1.76 | +| qwen3moe 30B.A3B Q6_K | 24.53 GiB | 30.53 B | ROCm | 99 | 1 | 0 | tg128 | 52.59 ± 0.01 | -build: 1fe00296 (6182) +build: f1fbffb5 (6486) diff --git a/benchmark/results/Qwen3-30B-A3B-Instruct-2507-UD-Q6_K_XL__rocm7_rc-rocwmma-fa_all_quants.log b/benchmark/results/Qwen3-30B-A3B-Instruct-2507-UD-Q6_K_XL__rocm7_rc-rocwmma-fa_all_quants.log new file mode 100644 index 0000000..a014b00 --- /dev/null +++ b/benchmark/results/Qwen3-30B-A3B-Instruct-2507-UD-Q6_K_XL__rocm7_rc-rocwmma-fa_all_quants.log @@ -0,0 +1,10 @@ +ggml_cuda_init: GGML_CUDA_FORCE_MMQ: no +ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no +ggml_cuda_init: found 1 ROCm devices: + Device 0: AMD Radeon Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32 +| model | size | params | backend | ngl | mmap | test | t/s | +| ------------------------------ | ---------: | ---------: | ---------- | --: | ---: | --------------: | -------------------: | +| qwen3moe 30B.A3B Q6_K | 24.53 GiB | 30.53 B | ROCm | 99 | 0 | pp512 | 552.48 ± 1.39 | +| qwen3moe 30B.A3B Q6_K | 24.53 GiB | 30.53 B | ROCm | 99 | 0 | tg128 | 53.22 ± 0.01 | + +build: f1fbffb5 (6486) diff --git a/benchmark/results/Qwen3-30B-A3B-Instruct-2507-UD-Q6_K_XL__rocm7_rc-rocwmma-fa_all_quants__fa1.log b/benchmark/results/Qwen3-30B-A3B-Instruct-2507-UD-Q6_K_XL__rocm7_rc-rocwmma-fa_all_quants__fa1.log new file mode 100644 index 0000000..2e173b9 --- /dev/null +++ b/benchmark/results/Qwen3-30B-A3B-Instruct-2507-UD-Q6_K_XL__rocm7_rc-rocwmma-fa_all_quants__fa1.log @@ -0,0 +1,10 @@ +ggml_cuda_init: GGML_CUDA_FORCE_MMQ: no +ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no +ggml_cuda_init: found 1 ROCm devices: + Device 0: AMD Radeon Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32 +| model | size | params | backend | ngl | fa | mmap | test | t/s | +| ------------------------------ | ---------: | ---------: | ---------- | --: | -: | ---: | --------------: | -------------------: | +| qwen3moe 30B.A3B Q6_K | 24.53 GiB | 30.53 B | ROCm | 99 | 1 | 0 | pp512 | 618.51 ± 8.44 | +| qwen3moe 30B.A3B Q6_K | 24.53 GiB | 30.53 B | ROCm | 99 | 1 | 0 | tg128 | 52.82 ± 0.01 | + +build: f1fbffb5 (6486) diff --git a/benchmark/results/Qwen3-30B-A3B-Instruct-2507-UD-Q6_K_XL__rocm7_rc-rocwmma-fa_all_quants__hblt0.log b/benchmark/results/Qwen3-30B-A3B-Instruct-2507-UD-Q6_K_XL__rocm7_rc-rocwmma-fa_all_quants__hblt0.log new file mode 100644 index 0000000..be7305a --- /dev/null +++ b/benchmark/results/Qwen3-30B-A3B-Instruct-2507-UD-Q6_K_XL__rocm7_rc-rocwmma-fa_all_quants__hblt0.log @@ -0,0 +1,10 @@ +ggml_cuda_init: GGML_CUDA_FORCE_MMQ: no +ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no +ggml_cuda_init: found 1 ROCm devices: + Device 0: AMD Radeon Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32 +| model | size | params | backend | ngl | mmap | test | t/s | +| ------------------------------ | ---------: | ---------: | ---------- | --: | ---: | --------------: | -------------------: | +| qwen3moe 30B.A3B Q6_K | 24.53 GiB | 30.53 B | ROCm | 99 | 0 | pp512 | 576.87 ± 7.86 | +| qwen3moe 30B.A3B Q6_K | 24.53 GiB | 30.53 B | ROCm | 99 | 0 | tg128 | 53.42 ± 0.01 | + +build: f1fbffb5 (6486) diff --git a/benchmark/results/Qwen3-30B-A3B-Instruct-2507-UD-Q6_K_XL__rocm7_rc-rocwmma-fa_all_quants__hblt0__fa1.log b/benchmark/results/Qwen3-30B-A3B-Instruct-2507-UD-Q6_K_XL__rocm7_rc-rocwmma-fa_all_quants__hblt0__fa1.log new file mode 100644 index 0000000..9741f2c --- /dev/null +++ b/benchmark/results/Qwen3-30B-A3B-Instruct-2507-UD-Q6_K_XL__rocm7_rc-rocwmma-fa_all_quants__hblt0__fa1.log @@ -0,0 +1,10 @@ +ggml_cuda_init: GGML_CUDA_FORCE_MMQ: no +ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no +ggml_cuda_init: found 1 ROCm devices: + Device 0: AMD Radeon Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32 +| model | size | params | backend | ngl | fa | mmap | test | t/s | +| ------------------------------ | ---------: | ---------: | ---------- | --: | -: | ---: | --------------: | -------------------: | +| qwen3moe 30B.A3B Q6_K | 24.53 GiB | 30.53 B | ROCm | 99 | 1 | 0 | pp512 | 609.51 ± 4.26 | +| qwen3moe 30B.A3B Q6_K | 24.53 GiB | 30.53 B | ROCm | 99 | 1 | 0 | tg128 | 52.77 ± 0.00 | + +build: f1fbffb5 (6486) diff --git a/benchmark/results/Qwen3-30B-A3B-Instruct-2507-UD-Q6_K_XL__rocm7_rc-rocwmma.log b/benchmark/results/Qwen3-30B-A3B-Instruct-2507-UD-Q6_K_XL__rocm7_rc-rocwmma.log index 444ca09..9bb2bf0 100644 --- a/benchmark/results/Qwen3-30B-A3B-Instruct-2507-UD-Q6_K_XL__rocm7_rc-rocwmma.log +++ b/benchmark/results/Qwen3-30B-A3B-Instruct-2507-UD-Q6_K_XL__rocm7_rc-rocwmma.log @@ -4,7 +4,7 @@ ggml_cuda_init: found 1 ROCm devices: Device 0: AMD Radeon Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32 | model | size | params | backend | ngl | mmap | test | t/s | | ------------------------------ | ---------: | ---------: | ---------- | --: | ---: | --------------: | -------------------: | -| qwen3moe 30B.A3B Q6_K | 24.53 GiB | 30.53 B | ROCm | 99 | 0 | pp512 | 570.31 ± 5.05 | -| qwen3moe 30B.A3B Q6_K | 24.53 GiB | 30.53 B | ROCm | 99 | 0 | tg128 | 50.52 ± 0.02 | +| qwen3moe 30B.A3B Q6_K | 24.53 GiB | 30.53 B | ROCm | 99 | 0 | pp512 | 555.30 ± 3.11 | +| qwen3moe 30B.A3B Q6_K | 24.53 GiB | 30.53 B | ROCm | 99 | 0 | tg128 | 53.34 ± 0.00 | -build: de219279 (6181) +build: f1fbffb5 (6486) diff --git a/benchmark/results/Qwen3-30B-A3B-Instruct-2507-UD-Q6_K_XL__rocm7_rc-rocwmma__fa1.log b/benchmark/results/Qwen3-30B-A3B-Instruct-2507-UD-Q6_K_XL__rocm7_rc-rocwmma__fa1.log index c11bc62..f4f3cce 100644 --- a/benchmark/results/Qwen3-30B-A3B-Instruct-2507-UD-Q6_K_XL__rocm7_rc-rocwmma__fa1.log +++ b/benchmark/results/Qwen3-30B-A3B-Instruct-2507-UD-Q6_K_XL__rocm7_rc-rocwmma__fa1.log @@ -4,7 +4,7 @@ ggml_cuda_init: found 1 ROCm devices: Device 0: AMD Radeon Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32 | model | size | params | backend | ngl | fa | mmap | test | t/s | | ------------------------------ | ---------: | ---------: | ---------- | --: | -: | ---: | --------------: | -------------------: | -| qwen3moe 30B.A3B Q6_K | 24.53 GiB | 30.53 B | ROCm | 99 | 1 | 0 | pp512 | 612.79 ± 4.77 | -| qwen3moe 30B.A3B Q6_K | 24.53 GiB | 30.53 B | ROCm | 99 | 1 | 0 | tg128 | 46.73 ± 0.00 | +| qwen3moe 30B.A3B Q6_K | 24.53 GiB | 30.53 B | ROCm | 99 | 1 | 0 | pp512 | 618.71 ± 2.77 | +| qwen3moe 30B.A3B Q6_K | 24.53 GiB | 30.53 B | ROCm | 99 | 1 | 0 | tg128 | 52.77 ± 0.00 | -build: de219279 (6181) +build: f1fbffb5 (6486) diff --git a/benchmark/results/Qwen3-30B-A3B-Instruct-2507-UD-Q6_K_XL__rocm7_rc-rocwmma__hblt0.log b/benchmark/results/Qwen3-30B-A3B-Instruct-2507-UD-Q6_K_XL__rocm7_rc-rocwmma__hblt0.log index e17b52a..d707246 100644 --- a/benchmark/results/Qwen3-30B-A3B-Instruct-2507-UD-Q6_K_XL__rocm7_rc-rocwmma__hblt0.log +++ b/benchmark/results/Qwen3-30B-A3B-Instruct-2507-UD-Q6_K_XL__rocm7_rc-rocwmma__hblt0.log @@ -4,7 +4,7 @@ ggml_cuda_init: found 1 ROCm devices: Device 0: AMD Radeon Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32 | model | size | params | backend | ngl | mmap | test | t/s | | ------------------------------ | ---------: | ---------: | ---------- | --: | ---: | --------------: | -------------------: | -| qwen3moe 30B.A3B Q6_K | 24.53 GiB | 30.53 B | ROCm | 99 | 0 | pp512 | 572.09 ± 8.22 | -| qwen3moe 30B.A3B Q6_K | 24.53 GiB | 30.53 B | ROCm | 99 | 0 | tg128 | 50.45 ± 0.01 | +| qwen3moe 30B.A3B Q6_K | 24.53 GiB | 30.53 B | ROCm | 99 | 0 | pp512 | 575.05 ± 4.27 | +| qwen3moe 30B.A3B Q6_K | 24.53 GiB | 30.53 B | ROCm | 99 | 0 | tg128 | 53.33 ± 0.00 | -build: de219279 (6181) +build: f1fbffb5 (6486) diff --git a/benchmark/results/Qwen3-30B-A3B-Instruct-2507-UD-Q6_K_XL__rocm7_rc-rocwmma__hblt0__fa1.log b/benchmark/results/Qwen3-30B-A3B-Instruct-2507-UD-Q6_K_XL__rocm7_rc-rocwmma__hblt0__fa1.log index 19e8321..1856861 100644 --- a/benchmark/results/Qwen3-30B-A3B-Instruct-2507-UD-Q6_K_XL__rocm7_rc-rocwmma__hblt0__fa1.log +++ b/benchmark/results/Qwen3-30B-A3B-Instruct-2507-UD-Q6_K_XL__rocm7_rc-rocwmma__hblt0__fa1.log @@ -4,7 +4,7 @@ ggml_cuda_init: found 1 ROCm devices: Device 0: AMD Radeon Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32 | model | size | params | backend | ngl | fa | mmap | test | t/s | | ------------------------------ | ---------: | ---------: | ---------- | --: | -: | ---: | --------------: | -------------------: | -| qwen3moe 30B.A3B Q6_K | 24.53 GiB | 30.53 B | ROCm | 99 | 1 | 0 | pp512 | 605.49 ± 1.47 | -| qwen3moe 30B.A3B Q6_K | 24.53 GiB | 30.53 B | ROCm | 99 | 1 | 0 | tg128 | 46.73 ± 0.00 | +| qwen3moe 30B.A3B Q6_K | 24.53 GiB | 30.53 B | ROCm | 99 | 1 | 0 | pp512 | 618.89 ± 4.53 | +| qwen3moe 30B.A3B Q6_K | 24.53 GiB | 30.53 B | ROCm | 99 | 1 | 0 | tg128 | 52.69 ± 0.01 | -build: de219279 (6181) +build: f1fbffb5 (6486) diff --git a/benchmark/results/Qwen3-30B-A3B-Instruct-2507-UD-Q6_K_XL__rocm7_rc.log b/benchmark/results/Qwen3-30B-A3B-Instruct-2507-UD-Q6_K_XL__rocm7_rc.log index f2e3ece..ac9d02d 100644 --- a/benchmark/results/Qwen3-30B-A3B-Instruct-2507-UD-Q6_K_XL__rocm7_rc.log +++ b/benchmark/results/Qwen3-30B-A3B-Instruct-2507-UD-Q6_K_XL__rocm7_rc.log @@ -4,7 +4,7 @@ ggml_cuda_init: found 1 ROCm devices: Device 0: AMD Radeon Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32 | model | size | params | backend | ngl | mmap | test | t/s | | ------------------------------ | ---------: | ---------: | ---------- | --: | ---: | --------------: | -------------------: | -| qwen3moe 30B.A3B Q6_K | 24.53 GiB | 30.53 B | ROCm | 99 | 0 | pp512 | 573.05 ± 6.77 | -| qwen3moe 30B.A3B Q6_K | 24.53 GiB | 30.53 B | ROCm | 99 | 0 | tg128 | 50.80 ± 0.01 | +| qwen3moe 30B.A3B Q6_K | 24.53 GiB | 30.53 B | ROCm | 99 | 0 | pp512 | 549.65 ± 6.16 | +| qwen3moe 30B.A3B Q6_K | 24.53 GiB | 30.53 B | ROCm | 99 | 0 | tg128 | 53.42 ± 0.02 | -build: de219279 (6181) +build: f1fbffb5 (6486) diff --git a/benchmark/results/Qwen3-30B-A3B-Instruct-2507-UD-Q6_K_XL__rocm7_rc__fa1.log b/benchmark/results/Qwen3-30B-A3B-Instruct-2507-UD-Q6_K_XL__rocm7_rc__fa1.log index 3fa7435..4848dd8 100644 --- a/benchmark/results/Qwen3-30B-A3B-Instruct-2507-UD-Q6_K_XL__rocm7_rc__fa1.log +++ b/benchmark/results/Qwen3-30B-A3B-Instruct-2507-UD-Q6_K_XL__rocm7_rc__fa1.log @@ -4,7 +4,7 @@ ggml_cuda_init: found 1 ROCm devices: Device 0: AMD Radeon Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32 | model | size | params | backend | ngl | fa | mmap | test | t/s | | ------------------------------ | ---------: | ---------: | ---------- | --: | -: | ---: | --------------: | -------------------: | -| qwen3moe 30B.A3B Q6_K | 24.53 GiB | 30.53 B | ROCm | 99 | 1 | 0 | pp512 | 416.05 ± 3.44 | -| qwen3moe 30B.A3B Q6_K | 24.53 GiB | 30.53 B | ROCm | 99 | 1 | 0 | tg128 | 50.33 ± 0.00 | +| qwen3moe 30B.A3B Q6_K | 24.53 GiB | 30.53 B | ROCm | 99 | 1 | 0 | pp512 | 621.80 ± 7.09 | +| qwen3moe 30B.A3B Q6_K | 24.53 GiB | 30.53 B | ROCm | 99 | 1 | 0 | tg128 | 52.78 ± 0.01 | -build: de219279 (6181) +build: f1fbffb5 (6486) diff --git a/benchmark/results/Qwen3-30B-A3B-Instruct-2507-UD-Q6_K_XL__rocm7_rc__hblt0.log b/benchmark/results/Qwen3-30B-A3B-Instruct-2507-UD-Q6_K_XL__rocm7_rc__hblt0.log index db8de56..37ba478 100644 --- a/benchmark/results/Qwen3-30B-A3B-Instruct-2507-UD-Q6_K_XL__rocm7_rc__hblt0.log +++ b/benchmark/results/Qwen3-30B-A3B-Instruct-2507-UD-Q6_K_XL__rocm7_rc__hblt0.log @@ -4,7 +4,7 @@ ggml_cuda_init: found 1 ROCm devices: Device 0: AMD Radeon Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32 | model | size | params | backend | ngl | mmap | test | t/s | | ------------------------------ | ---------: | ---------: | ---------- | --: | ---: | --------------: | -------------------: | -| qwen3moe 30B.A3B Q6_K | 24.53 GiB | 30.53 B | ROCm | 99 | 0 | pp512 | 576.38 ± 3.91 | -| qwen3moe 30B.A3B Q6_K | 24.53 GiB | 30.53 B | ROCm | 99 | 0 | tg128 | 50.85 ± 0.01 | +| qwen3moe 30B.A3B Q6_K | 24.53 GiB | 30.53 B | ROCm | 99 | 0 | pp512 | 575.05 ± 3.02 | +| qwen3moe 30B.A3B Q6_K | 24.53 GiB | 30.53 B | ROCm | 99 | 0 | tg128 | 53.42 ± 0.01 | -build: de219279 (6181) +build: f1fbffb5 (6486) diff --git a/benchmark/results/Qwen3-30B-A3B-Instruct-2507-UD-Q6_K_XL__rocm7_rc__hblt0__fa1.log b/benchmark/results/Qwen3-30B-A3B-Instruct-2507-UD-Q6_K_XL__rocm7_rc__hblt0__fa1.log index f19e470..75f7de4 100644 --- a/benchmark/results/Qwen3-30B-A3B-Instruct-2507-UD-Q6_K_XL__rocm7_rc__hblt0__fa1.log +++ b/benchmark/results/Qwen3-30B-A3B-Instruct-2507-UD-Q6_K_XL__rocm7_rc__hblt0__fa1.log @@ -4,7 +4,7 @@ ggml_cuda_init: found 1 ROCm devices: Device 0: AMD Radeon Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32 | model | size | params | backend | ngl | fa | mmap | test | t/s | | ------------------------------ | ---------: | ---------: | ---------- | --: | -: | ---: | --------------: | -------------------: | -| qwen3moe 30B.A3B Q6_K | 24.53 GiB | 30.53 B | ROCm | 99 | 1 | 0 | pp512 | 414.62 ± 3.23 | -| qwen3moe 30B.A3B Q6_K | 24.53 GiB | 30.53 B | ROCm | 99 | 1 | 0 | tg128 | 50.22 ± 0.01 | +| qwen3moe 30B.A3B Q6_K | 24.53 GiB | 30.53 B | ROCm | 99 | 1 | 0 | pp512 | 614.05 ± 4.83 | +| qwen3moe 30B.A3B Q6_K | 24.53 GiB | 30.53 B | ROCm | 99 | 1 | 0 | tg128 | 52.83 ± 0.01 | -build: de219279 (6181) +build: f1fbffb5 (6486) diff --git a/benchmark/results/Qwen3-30B-A3B-Instruct-2507-UD-Q6_K_XL__vulkan_amdvlk.log b/benchmark/results/Qwen3-30B-A3B-Instruct-2507-UD-Q6_K_XL__vulkan_amdvlk.log index 10609cc..e1f8a5a 100644 --- a/benchmark/results/Qwen3-30B-A3B-Instruct-2507-UD-Q6_K_XL__vulkan_amdvlk.log +++ b/benchmark/results/Qwen3-30B-A3B-Instruct-2507-UD-Q6_K_XL__vulkan_amdvlk.log @@ -2,7 +2,7 @@ ggml_vulkan: Found 1 Vulkan devices: ggml_vulkan: 0 = Radeon 8060S Graphics (AMD open-source driver) | uma: 1 | fp16: 1 | bf16: 0 | warp size: 64 | shared memory: 32768 | int dot: 1 | matrix cores: KHR_coopmat | model | size | params | backend | ngl | mmap | test | t/s | | ------------------------------ | ---------: | ---------: | ---------- | --: | ---: | --------------: | -------------------: | -| qwen3moe 30B.A3B Q6_K | 24.53 GiB | 30.53 B | Vulkan | 99 | 0 | pp512 | 733.40 ± 2.59 | -| qwen3moe 30B.A3B Q6_K | 24.53 GiB | 30.53 B | Vulkan | 99 | 0 | tg128 | 59.36 ± 0.05 | +| qwen3moe 30B.A3B Q6_K | 24.53 GiB | 30.53 B | Vulkan | 99 | 0 | pp512 | 1027.23 ± 5.64 | +| qwen3moe 30B.A3B Q6_K | 24.53 GiB | 30.53 B | Vulkan | 99 | 0 | tg128 | 63.42 ± 0.03 | -build: 1fe00296 (6182) +build: f1fbffb5 (6486) diff --git a/benchmark/results/Qwen3-30B-A3B-Instruct-2507-UD-Q6_K_XL__vulkan_amdvlk__fa1.log b/benchmark/results/Qwen3-30B-A3B-Instruct-2507-UD-Q6_K_XL__vulkan_amdvlk__fa1.log index 486113e..460a956 100644 --- a/benchmark/results/Qwen3-30B-A3B-Instruct-2507-UD-Q6_K_XL__vulkan_amdvlk__fa1.log +++ b/benchmark/results/Qwen3-30B-A3B-Instruct-2507-UD-Q6_K_XL__vulkan_amdvlk__fa1.log @@ -2,7 +2,7 @@ ggml_vulkan: Found 1 Vulkan devices: ggml_vulkan: 0 = Radeon 8060S Graphics (AMD open-source driver) | uma: 1 | fp16: 1 | bf16: 0 | warp size: 64 | shared memory: 32768 | int dot: 1 | matrix cores: KHR_coopmat | model | size | params | backend | ngl | fa | mmap | test | t/s | | ------------------------------ | ---------: | ---------: | ---------- | --: | -: | ---: | --------------: | -------------------: | -| qwen3moe 30B.A3B Q6_K | 24.53 GiB | 30.53 B | Vulkan | 99 | 1 | 0 | pp512 | 725.54 ± 2.84 | -| qwen3moe 30B.A3B Q6_K | 24.53 GiB | 30.53 B | Vulkan | 99 | 1 | 0 | tg128 | 55.57 ± 0.02 | +| qwen3moe 30B.A3B Q6_K | 24.53 GiB | 30.53 B | Vulkan | 99 | 1 | 0 | pp512 | 1005.86 ± 4.35 | +| qwen3moe 30B.A3B Q6_K | 24.53 GiB | 30.53 B | Vulkan | 99 | 1 | 0 | tg128 | 59.12 ± 0.04 | -build: 1fe00296 (6182) +build: f1fbffb5 (6486) diff --git a/benchmark/results/Qwen3-30B-A3B-Instruct-2507-UD-Q6_K_XL__vulkan_radv.log b/benchmark/results/Qwen3-30B-A3B-Instruct-2507-UD-Q6_K_XL__vulkan_radv.log index 00cd713..e4e2810 100644 --- a/benchmark/results/Qwen3-30B-A3B-Instruct-2507-UD-Q6_K_XL__vulkan_radv.log +++ b/benchmark/results/Qwen3-30B-A3B-Instruct-2507-UD-Q6_K_XL__vulkan_radv.log @@ -2,7 +2,7 @@ ggml_vulkan: Found 1 Vulkan devices: ggml_vulkan: 0 = Radeon 8060S Graphics (RADV GFX1151) (radv) | uma: 1 | fp16: 1 | bf16: 0 | warp size: 64 | shared memory: 65536 | int dot: 1 | matrix cores: KHR_coopmat | model | size | params | backend | ngl | mmap | test | t/s | | ------------------------------ | ---------: | ---------: | ---------- | --: | ---: | --------------: | -------------------: | -| qwen3moe 30B.A3B Q6_K | 24.53 GiB | 30.53 B | Vulkan | 99 | 0 | pp512 | 392.54 ± 1.80 | -| qwen3moe 30B.A3B Q6_K | 24.53 GiB | 30.53 B | Vulkan | 99 | 0 | tg128 | 61.56 ± 0.02 | +| qwen3moe 30B.A3B Q6_K | 24.53 GiB | 30.53 B | Vulkan | 99 | 0 | pp512 | 764.63 ± 3.75 | +| qwen3moe 30B.A3B Q6_K | 24.53 GiB | 30.53 B | Vulkan | 99 | 0 | tg128 | 64.77 ± 0.10 | -build: 1fe00296 (6182) +build: f1fbffb5 (6486) diff --git a/benchmark/results/Qwen3-30B-A3B-Instruct-2507-UD-Q6_K_XL__vulkan_radv__fa1.log b/benchmark/results/Qwen3-30B-A3B-Instruct-2507-UD-Q6_K_XL__vulkan_radv__fa1.log index 3123ba1..b854ab4 100644 --- a/benchmark/results/Qwen3-30B-A3B-Instruct-2507-UD-Q6_K_XL__vulkan_radv__fa1.log +++ b/benchmark/results/Qwen3-30B-A3B-Instruct-2507-UD-Q6_K_XL__vulkan_radv__fa1.log @@ -2,7 +2,7 @@ ggml_vulkan: Found 1 Vulkan devices: ggml_vulkan: 0 = Radeon 8060S Graphics (RADV GFX1151) (radv) | uma: 1 | fp16: 1 | bf16: 0 | warp size: 64 | shared memory: 65536 | int dot: 1 | matrix cores: KHR_coopmat | model | size | params | backend | ngl | fa | mmap | test | t/s | | ------------------------------ | ---------: | ---------: | ---------- | --: | -: | ---: | --------------: | -------------------: | -| qwen3moe 30B.A3B Q6_K | 24.53 GiB | 30.53 B | Vulkan | 99 | 1 | 0 | pp512 | 403.74 ± 1.69 | -| qwen3moe 30B.A3B Q6_K | 24.53 GiB | 30.53 B | Vulkan | 99 | 1 | 0 | tg128 | 60.57 ± 0.08 | +| qwen3moe 30B.A3B Q6_K | 24.53 GiB | 30.53 B | Vulkan | 99 | 1 | 0 | pp512 | 801.22 ± 4.04 | +| qwen3moe 30B.A3B Q6_K | 24.53 GiB | 30.53 B | Vulkan | 99 | 1 | 0 | tg128 | 63.44 ± 0.12 | -build: 1fe00296 (6182) +build: f1fbffb5 (6486) diff --git a/benchmark/results/gemma-3-12b-it-UD-Q8_K_XL__rocm6_4_3-rocwmma.log b/benchmark/results/gemma-3-12b-it-UD-Q8_K_XL__rocm6_4_3-rocwmma.log index ba675a0..433401a 100644 --- a/benchmark/results/gemma-3-12b-it-UD-Q8_K_XL__rocm6_4_3-rocwmma.log +++ b/benchmark/results/gemma-3-12b-it-UD-Q8_K_XL__rocm6_4_3-rocwmma.log @@ -9,7 +9,7 @@ rocBLAS warning: hipBlasLT failed, falling back to tensile. This message will be only be displayed once, unless the ROCBLAS_VERBOSE_TENSILE_ERROR environment variable is set. | model | size | params | backend | ngl | mmap | test | t/s | | ------------------------------ | ---------: | ---------: | ---------- | --: | ---: | --------------: | -------------------: | -| gemma3 12B Q8_0 | 13.40 GiB | 11.77 B | ROCm | 99 | 0 | pp512 | 734.26 ± 0.94 | -| gemma3 12B Q8_0 | 13.40 GiB | 11.77 B | ROCm | 99 | 0 | tg128 | 14.05 ± 0.00 | +| gemma3 12B Q8_0 | 13.40 GiB | 11.77 B | ROCm | 99 | 0 | pp512 | 726.41 ± 1.42 | +| gemma3 12B Q8_0 | 13.40 GiB | 11.77 B | ROCm | 99 | 0 | tg128 | 14.15 ± 0.00 | -build: 1fe00296 (6182) +build: f1fbffb5 (6486) diff --git a/benchmark/results/gemma-3-12b-it-UD-Q8_K_XL__rocm6_4_3-rocwmma__fa1.log b/benchmark/results/gemma-3-12b-it-UD-Q8_K_XL__rocm6_4_3-rocwmma__fa1.log index a834fd9..1e7652b 100644 --- a/benchmark/results/gemma-3-12b-it-UD-Q8_K_XL__rocm6_4_3-rocwmma__fa1.log +++ b/benchmark/results/gemma-3-12b-it-UD-Q8_K_XL__rocm6_4_3-rocwmma__fa1.log @@ -4,7 +4,7 @@ ggml_cuda_init: found 1 ROCm devices: Device 0: Radeon 8060S Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32 | model | size | params | backend | ngl | fa | mmap | test | t/s | | ------------------------------ | ---------: | ---------: | ---------- | --: | -: | ---: | --------------: | -------------------: | -| gemma3 12B Q8_0 | 13.40 GiB | 11.77 B | ROCm | 99 | 1 | 0 | pp512 | 820.41 ± 1.59 | -| gemma3 12B Q8_0 | 13.40 GiB | 11.77 B | ROCm | 99 | 1 | 0 | tg128 | 13.77 ± 0.00 | +| gemma3 12B Q8_0 | 13.40 GiB | 11.77 B | ROCm | 99 | 1 | 0 | pp512 | 822.38 ± 0.84 | +| gemma3 12B Q8_0 | 13.40 GiB | 11.77 B | ROCm | 99 | 1 | 0 | tg128 | 13.87 ± 0.00 | -build: 1fe00296 (6182) +build: f1fbffb5 (6486) diff --git a/benchmark/results/gemma-3-12b-it-UD-Q8_K_XL__rocm6_4_3-rocwmma__hblt0.log b/benchmark/results/gemma-3-12b-it-UD-Q8_K_XL__rocm6_4_3-rocwmma__hblt0.log index 3b668bb..3242eea 100644 --- a/benchmark/results/gemma-3-12b-it-UD-Q8_K_XL__rocm6_4_3-rocwmma__hblt0.log +++ b/benchmark/results/gemma-3-12b-it-UD-Q8_K_XL__rocm6_4_3-rocwmma__hblt0.log @@ -4,7 +4,7 @@ ggml_cuda_init: found 1 ROCm devices: Device 0: Radeon 8060S Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32 | model | size | params | backend | ngl | mmap | test | t/s | | ------------------------------ | ---------: | ---------: | ---------- | --: | ---: | --------------: | -------------------: | -| gemma3 12B Q8_0 | 13.40 GiB | 11.77 B | ROCm | 99 | 0 | pp512 | 213.40 ± 3.62 | -| gemma3 12B Q8_0 | 13.40 GiB | 11.77 B | ROCm | 99 | 0 | tg128 | 14.04 ± 0.00 | +| gemma3 12B Q8_0 | 13.40 GiB | 11.77 B | ROCm | 99 | 0 | pp512 | 219.78 ± 3.65 | +| gemma3 12B Q8_0 | 13.40 GiB | 11.77 B | ROCm | 99 | 0 | tg128 | 14.15 ± 0.00 | -build: 1fe00296 (6182) +build: f1fbffb5 (6486) diff --git a/benchmark/results/gemma-3-12b-it-UD-Q8_K_XL__rocm6_4_3-rocwmma__hblt0__fa1.log b/benchmark/results/gemma-3-12b-it-UD-Q8_K_XL__rocm6_4_3-rocwmma__hblt0__fa1.log index ed14086..8b27e59 100644 --- a/benchmark/results/gemma-3-12b-it-UD-Q8_K_XL__rocm6_4_3-rocwmma__hblt0__fa1.log +++ b/benchmark/results/gemma-3-12b-it-UD-Q8_K_XL__rocm6_4_3-rocwmma__hblt0__fa1.log @@ -4,7 +4,7 @@ ggml_cuda_init: found 1 ROCm devices: Device 0: Radeon 8060S Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32 | model | size | params | backend | ngl | fa | mmap | test | t/s | | ------------------------------ | ---------: | ---------: | ---------- | --: | -: | ---: | --------------: | -------------------: | -| gemma3 12B Q8_0 | 13.40 GiB | 11.77 B | ROCm | 99 | 1 | 0 | pp512 | 224.20 ± 4.73 | -| gemma3 12B Q8_0 | 13.40 GiB | 11.77 B | ROCm | 99 | 1 | 0 | tg128 | 13.78 ± 0.00 | +| gemma3 12B Q8_0 | 13.40 GiB | 11.77 B | ROCm | 99 | 1 | 0 | pp512 | 227.29 ± 2.29 | +| gemma3 12B Q8_0 | 13.40 GiB | 11.77 B | ROCm | 99 | 1 | 0 | tg128 | 13.87 ± 0.00 | -build: 1fe00296 (6182) +build: f1fbffb5 (6486) diff --git a/benchmark/results/gemma-3-12b-it-UD-Q8_K_XL__rocm6_4_3.log b/benchmark/results/gemma-3-12b-it-UD-Q8_K_XL__rocm6_4_3.log index b6d34c8..3cdf4e3 100644 --- a/benchmark/results/gemma-3-12b-it-UD-Q8_K_XL__rocm6_4_3.log +++ b/benchmark/results/gemma-3-12b-it-UD-Q8_K_XL__rocm6_4_3.log @@ -9,7 +9,7 @@ rocBLAS warning: hipBlasLT failed, falling back to tensile. This message will be only be displayed once, unless the ROCBLAS_VERBOSE_TENSILE_ERROR environment variable is set. | model | size | params | backend | ngl | mmap | test | t/s | | ------------------------------ | ---------: | ---------: | ---------- | --: | ---: | --------------: | -------------------: | -| gemma3 12B Q8_0 | 13.40 GiB | 11.77 B | ROCm | 99 | 0 | pp512 | 734.70 ± 1.48 | -| gemma3 12B Q8_0 | 13.40 GiB | 11.77 B | ROCm | 99 | 0 | tg128 | 14.03 ± 0.00 | +| gemma3 12B Q8_0 | 13.40 GiB | 11.77 B | ROCm | 99 | 0 | pp512 | 703.97 ± 0.49 | +| gemma3 12B Q8_0 | 13.40 GiB | 11.77 B | ROCm | 99 | 0 | tg128 | 14.15 ± 0.00 | -build: 1fe00296 (6182) +build: f1fbffb5 (6486) diff --git a/benchmark/results/gemma-3-12b-it-UD-Q8_K_XL__rocm6_4_3__fa1.log b/benchmark/results/gemma-3-12b-it-UD-Q8_K_XL__rocm6_4_3__fa1.log index a9a4641..5d82b39 100644 --- a/benchmark/results/gemma-3-12b-it-UD-Q8_K_XL__rocm6_4_3__fa1.log +++ b/benchmark/results/gemma-3-12b-it-UD-Q8_K_XL__rocm6_4_3__fa1.log @@ -4,7 +4,7 @@ ggml_cuda_init: found 1 ROCm devices: Device 0: Radeon 8060S Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32 | model | size | params | backend | ngl | fa | mmap | test | t/s | | ------------------------------ | ---------: | ---------: | ---------- | --: | -: | ---: | --------------: | -------------------: | -| gemma3 12B Q8_0 | 13.40 GiB | 11.77 B | ROCm | 99 | 1 | 0 | pp512 | 554.49 ± 0.62 | -| gemma3 12B Q8_0 | 13.40 GiB | 11.77 B | ROCm | 99 | 1 | 0 | tg128 | 13.78 ± 0.00 | +| gemma3 12B Q8_0 | 13.40 GiB | 11.77 B | ROCm | 99 | 1 | 0 | pp512 | 803.68 ± 0.98 | +| gemma3 12B Q8_0 | 13.40 GiB | 11.77 B | ROCm | 99 | 1 | 0 | tg128 | 13.89 ± 0.00 | -build: 1fe00296 (6182) +build: f1fbffb5 (6486) diff --git a/benchmark/results/gemma-3-12b-it-UD-Q8_K_XL__rocm6_4_3__hblt0.log b/benchmark/results/gemma-3-12b-it-UD-Q8_K_XL__rocm6_4_3__hblt0.log index 21731f6..2121415 100644 --- a/benchmark/results/gemma-3-12b-it-UD-Q8_K_XL__rocm6_4_3__hblt0.log +++ b/benchmark/results/gemma-3-12b-it-UD-Q8_K_XL__rocm6_4_3__hblt0.log @@ -4,7 +4,7 @@ ggml_cuda_init: found 1 ROCm devices: Device 0: Radeon 8060S Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32 | model | size | params | backend | ngl | mmap | test | t/s | | ------------------------------ | ---------: | ---------: | ---------- | --: | ---: | --------------: | -------------------: | -| gemma3 12B Q8_0 | 13.40 GiB | 11.77 B | ROCm | 99 | 0 | pp512 | 220.22 ± 1.60 | -| gemma3 12B Q8_0 | 13.40 GiB | 11.77 B | ROCm | 99 | 0 | tg128 | 14.04 ± 0.00 | +| gemma3 12B Q8_0 | 13.40 GiB | 11.77 B | ROCm | 99 | 0 | pp512 | 222.73 ± 0.32 | +| gemma3 12B Q8_0 | 13.40 GiB | 11.77 B | ROCm | 99 | 0 | tg128 | 14.15 ± 0.00 | -build: 1fe00296 (6182) +build: f1fbffb5 (6486) diff --git a/benchmark/results/gemma-3-12b-it-UD-Q8_K_XL__rocm6_4_3__hblt0__fa1.log b/benchmark/results/gemma-3-12b-it-UD-Q8_K_XL__rocm6_4_3__hblt0__fa1.log index ecb65ad..299085e 100644 --- a/benchmark/results/gemma-3-12b-it-UD-Q8_K_XL__rocm6_4_3__hblt0__fa1.log +++ b/benchmark/results/gemma-3-12b-it-UD-Q8_K_XL__rocm6_4_3__hblt0__fa1.log @@ -4,7 +4,7 @@ ggml_cuda_init: found 1 ROCm devices: Device 0: Radeon 8060S Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32 | model | size | params | backend | ngl | fa | mmap | test | t/s | | ------------------------------ | ---------: | ---------: | ---------- | --: | -: | ---: | --------------: | -------------------: | -| gemma3 12B Q8_0 | 13.40 GiB | 11.77 B | ROCm | 99 | 1 | 0 | pp512 | 193.90 ± 1.19 | -| gemma3 12B Q8_0 | 13.40 GiB | 11.77 B | ROCm | 99 | 1 | 0 | tg128 | 13.77 ± 0.00 | +| gemma3 12B Q8_0 | 13.40 GiB | 11.77 B | ROCm | 99 | 1 | 0 | pp512 | 220.75 ± 2.43 | +| gemma3 12B Q8_0 | 13.40 GiB | 11.77 B | ROCm | 99 | 1 | 0 | tg128 | 13.86 ± 0.00 | -build: 1fe00296 (6182) +build: f1fbffb5 (6486) diff --git a/benchmark/results/gemma-3-12b-it-UD-Q8_K_XL__rocm7_rc-rocwmma-fa_all_quants.log b/benchmark/results/gemma-3-12b-it-UD-Q8_K_XL__rocm7_rc-rocwmma-fa_all_quants.log new file mode 100644 index 0000000..b9c2ba9 --- /dev/null +++ b/benchmark/results/gemma-3-12b-it-UD-Q8_K_XL__rocm7_rc-rocwmma-fa_all_quants.log @@ -0,0 +1,10 @@ +ggml_cuda_init: GGML_CUDA_FORCE_MMQ: no +ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no +ggml_cuda_init: found 1 ROCm devices: + Device 0: AMD Radeon Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32 +| model | size | params | backend | ngl | mmap | test | t/s | +| ------------------------------ | ---------: | ---------: | ---------- | --: | ---: | --------------: | -------------------: | +| gemma3 12B Q8_0 | 13.40 GiB | 11.77 B | ROCm | 99 | 0 | pp512 | 714.52 ± 1.47 | +| gemma3 12B Q8_0 | 13.40 GiB | 11.77 B | ROCm | 99 | 0 | tg128 | 14.16 ± 0.00 | + +build: f1fbffb5 (6486) diff --git a/benchmark/results/gemma-3-12b-it-UD-Q8_K_XL__rocm7_rc-rocwmma-fa_all_quants__fa1.log b/benchmark/results/gemma-3-12b-it-UD-Q8_K_XL__rocm7_rc-rocwmma-fa_all_quants__fa1.log new file mode 100644 index 0000000..dd2b67e --- /dev/null +++ b/benchmark/results/gemma-3-12b-it-UD-Q8_K_XL__rocm7_rc-rocwmma-fa_all_quants__fa1.log @@ -0,0 +1,10 @@ +ggml_cuda_init: GGML_CUDA_FORCE_MMQ: no +ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no +ggml_cuda_init: found 1 ROCm devices: + Device 0: AMD Radeon Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32 +| model | size | params | backend | ngl | fa | mmap | test | t/s | +| ------------------------------ | ---------: | ---------: | ---------- | --: | -: | ---: | --------------: | -------------------: | +| gemma3 12B Q8_0 | 13.40 GiB | 11.77 B | ROCm | 99 | 1 | 0 | pp512 | 810.36 ± 1.88 | +| gemma3 12B Q8_0 | 13.40 GiB | 11.77 B | ROCm | 99 | 1 | 0 | tg128 | 13.89 ± 0.00 | + +build: f1fbffb5 (6486) diff --git a/benchmark/results/gemma-3-12b-it-UD-Q8_K_XL__rocm7_rc-rocwmma-fa_all_quants__hblt0.log b/benchmark/results/gemma-3-12b-it-UD-Q8_K_XL__rocm7_rc-rocwmma-fa_all_quants__hblt0.log new file mode 100644 index 0000000..b5de28a --- /dev/null +++ b/benchmark/results/gemma-3-12b-it-UD-Q8_K_XL__rocm7_rc-rocwmma-fa_all_quants__hblt0.log @@ -0,0 +1,10 @@ +ggml_cuda_init: GGML_CUDA_FORCE_MMQ: no +ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no +ggml_cuda_init: found 1 ROCm devices: + Device 0: AMD Radeon Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32 +| model | size | params | backend | ngl | mmap | test | t/s | +| ------------------------------ | ---------: | ---------: | ---------- | --: | ---: | --------------: | -------------------: | +| gemma3 12B Q8_0 | 13.40 GiB | 11.77 B | ROCm | 99 | 0 | pp512 | 752.18 ± 0.86 | +| gemma3 12B Q8_0 | 13.40 GiB | 11.77 B | ROCm | 99 | 0 | tg128 | 14.14 ± 0.00 | + +build: f1fbffb5 (6486) diff --git a/benchmark/results/gemma-3-12b-it-UD-Q8_K_XL__rocm7_rc-rocwmma-fa_all_quants__hblt0__fa1.log b/benchmark/results/gemma-3-12b-it-UD-Q8_K_XL__rocm7_rc-rocwmma-fa_all_quants__hblt0__fa1.log new file mode 100644 index 0000000..3a710b2 --- /dev/null +++ b/benchmark/results/gemma-3-12b-it-UD-Q8_K_XL__rocm7_rc-rocwmma-fa_all_quants__hblt0__fa1.log @@ -0,0 +1,10 @@ +ggml_cuda_init: GGML_CUDA_FORCE_MMQ: no +ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no +ggml_cuda_init: found 1 ROCm devices: + Device 0: AMD Radeon Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32 +| model | size | params | backend | ngl | fa | mmap | test | t/s | +| ------------------------------ | ---------: | ---------: | ---------- | --: | -: | ---: | --------------: | -------------------: | +| gemma3 12B Q8_0 | 13.40 GiB | 11.77 B | ROCm | 99 | 1 | 0 | pp512 | 797.91 ± 0.87 | +| gemma3 12B Q8_0 | 13.40 GiB | 11.77 B | ROCm | 99 | 1 | 0 | tg128 | 13.88 ± 0.00 | + +build: f1fbffb5 (6486) diff --git a/benchmark/results/gemma-3-12b-it-UD-Q8_K_XL__rocm7_rc-rocwmma.log b/benchmark/results/gemma-3-12b-it-UD-Q8_K_XL__rocm7_rc-rocwmma.log index f5c1c56..d2b180e 100644 --- a/benchmark/results/gemma-3-12b-it-UD-Q8_K_XL__rocm7_rc-rocwmma.log +++ b/benchmark/results/gemma-3-12b-it-UD-Q8_K_XL__rocm7_rc-rocwmma.log @@ -4,7 +4,7 @@ ggml_cuda_init: found 1 ROCm devices: Device 0: AMD Radeon Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32 | model | size | params | backend | ngl | mmap | test | t/s | | ------------------------------ | ---------: | ---------: | ---------- | --: | ---: | --------------: | -------------------: | -| gemma3 12B Q8_0 | 13.40 GiB | 11.77 B | ROCm | 99 | 0 | pp512 | 751.04 ± 1.24 | -| gemma3 12B Q8_0 | 13.40 GiB | 11.77 B | ROCm | 99 | 0 | tg128 | 14.01 ± 0.00 | +| gemma3 12B Q8_0 | 13.40 GiB | 11.77 B | ROCm | 99 | 0 | pp512 | 738.56 ± 1.66 | +| gemma3 12B Q8_0 | 13.40 GiB | 11.77 B | ROCm | 99 | 0 | tg128 | 14.15 ± 0.00 | -build: de219279 (6181) +build: f1fbffb5 (6486) diff --git a/benchmark/results/gemma-3-12b-it-UD-Q8_K_XL__rocm7_rc-rocwmma__fa1.log b/benchmark/results/gemma-3-12b-it-UD-Q8_K_XL__rocm7_rc-rocwmma__fa1.log index c86174c..7e8c499 100644 --- a/benchmark/results/gemma-3-12b-it-UD-Q8_K_XL__rocm7_rc-rocwmma__fa1.log +++ b/benchmark/results/gemma-3-12b-it-UD-Q8_K_XL__rocm7_rc-rocwmma__fa1.log @@ -4,7 +4,7 @@ ggml_cuda_init: found 1 ROCm devices: Device 0: AMD Radeon Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32 | model | size | params | backend | ngl | fa | mmap | test | t/s | | ------------------------------ | ---------: | ---------: | ---------- | --: | -: | ---: | --------------: | -------------------: | -| gemma3 12B Q8_0 | 13.40 GiB | 11.77 B | ROCm | 99 | 1 | 0 | pp512 | 811.04 ± 1.22 | -| gemma3 12B Q8_0 | 13.40 GiB | 11.77 B | ROCm | 99 | 1 | 0 | tg128 | 13.45 ± 0.00 | +| gemma3 12B Q8_0 | 13.40 GiB | 11.77 B | ROCm | 99 | 1 | 0 | pp512 | 810.24 ± 2.00 | +| gemma3 12B Q8_0 | 13.40 GiB | 11.77 B | ROCm | 99 | 1 | 0 | tg128 | 13.89 ± 0.00 | -build: de219279 (6181) +build: f1fbffb5 (6486) diff --git a/benchmark/results/gemma-3-12b-it-UD-Q8_K_XL__rocm7_rc-rocwmma__hblt0.log b/benchmark/results/gemma-3-12b-it-UD-Q8_K_XL__rocm7_rc-rocwmma__hblt0.log index 3f70b79..6871a37 100644 --- a/benchmark/results/gemma-3-12b-it-UD-Q8_K_XL__rocm7_rc-rocwmma__hblt0.log +++ b/benchmark/results/gemma-3-12b-it-UD-Q8_K_XL__rocm7_rc-rocwmma__hblt0.log @@ -4,7 +4,7 @@ ggml_cuda_init: found 1 ROCm devices: Device 0: AMD Radeon Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32 | model | size | params | backend | ngl | mmap | test | t/s | | ------------------------------ | ---------: | ---------: | ---------- | --: | ---: | --------------: | -------------------: | -| gemma3 12B Q8_0 | 13.40 GiB | 11.77 B | ROCm | 99 | 0 | pp512 | 752.99 ± 1.44 | -| gemma3 12B Q8_0 | 13.40 GiB | 11.77 B | ROCm | 99 | 0 | tg128 | 14.00 ± 0.00 | +| gemma3 12B Q8_0 | 13.40 GiB | 11.77 B | ROCm | 99 | 0 | pp512 | 751.87 ± 1.69 | +| gemma3 12B Q8_0 | 13.40 GiB | 11.77 B | ROCm | 99 | 0 | tg128 | 14.15 ± 0.00 | -build: de219279 (6181) +build: f1fbffb5 (6486) diff --git a/benchmark/results/gemma-3-12b-it-UD-Q8_K_XL__rocm7_rc-rocwmma__hblt0__fa1.log b/benchmark/results/gemma-3-12b-it-UD-Q8_K_XL__rocm7_rc-rocwmma__hblt0__fa1.log index 4cc26aa..a0de968 100644 --- a/benchmark/results/gemma-3-12b-it-UD-Q8_K_XL__rocm7_rc-rocwmma__hblt0__fa1.log +++ b/benchmark/results/gemma-3-12b-it-UD-Q8_K_XL__rocm7_rc-rocwmma__hblt0__fa1.log @@ -4,7 +4,7 @@ ggml_cuda_init: found 1 ROCm devices: Device 0: AMD Radeon Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32 | model | size | params | backend | ngl | fa | mmap | test | t/s | | ------------------------------ | ---------: | ---------: | ---------- | --: | -: | ---: | --------------: | -------------------: | -| gemma3 12B Q8_0 | 13.40 GiB | 11.77 B | ROCm | 99 | 1 | 0 | pp512 | 794.90 ± 1.42 | -| gemma3 12B Q8_0 | 13.40 GiB | 11.77 B | ROCm | 99 | 1 | 0 | tg128 | 13.45 ± 0.00 | +| gemma3 12B Q8_0 | 13.40 GiB | 11.77 B | ROCm | 99 | 1 | 0 | pp512 | 798.06 ± 1.45 | +| gemma3 12B Q8_0 | 13.40 GiB | 11.77 B | ROCm | 99 | 1 | 0 | tg128 | 13.88 ± 0.00 | -build: de219279 (6181) +build: f1fbffb5 (6486) diff --git a/benchmark/results/gemma-3-12b-it-UD-Q8_K_XL__rocm7_rc.log b/benchmark/results/gemma-3-12b-it-UD-Q8_K_XL__rocm7_rc.log index 301a2b5..48ce734 100644 --- a/benchmark/results/gemma-3-12b-it-UD-Q8_K_XL__rocm7_rc.log +++ b/benchmark/results/gemma-3-12b-it-UD-Q8_K_XL__rocm7_rc.log @@ -4,7 +4,7 @@ ggml_cuda_init: found 1 ROCm devices: Device 0: AMD Radeon Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32 | model | size | params | backend | ngl | mmap | test | t/s | | ------------------------------ | ---------: | ---------: | ---------- | --: | ---: | --------------: | -------------------: | -| gemma3 12B Q8_0 | 13.40 GiB | 11.77 B | ROCm | 99 | 0 | pp512 | 752.36 ± 0.48 | -| gemma3 12B Q8_0 | 13.40 GiB | 11.77 B | ROCm | 99 | 0 | tg128 | 14.05 ± 0.00 | +| gemma3 12B Q8_0 | 13.40 GiB | 11.77 B | ROCm | 99 | 0 | pp512 | 738.50 ± 1.56 | +| gemma3 12B Q8_0 | 13.40 GiB | 11.77 B | ROCm | 99 | 0 | tg128 | 14.15 ± 0.00 | -build: de219279 (6181) +build: f1fbffb5 (6486) diff --git a/benchmark/results/gemma-3-12b-it-UD-Q8_K_XL__rocm7_rc__fa1.log b/benchmark/results/gemma-3-12b-it-UD-Q8_K_XL__rocm7_rc__fa1.log index 1ddb96e..a0dd7bf 100644 --- a/benchmark/results/gemma-3-12b-it-UD-Q8_K_XL__rocm7_rc__fa1.log +++ b/benchmark/results/gemma-3-12b-it-UD-Q8_K_XL__rocm7_rc__fa1.log @@ -4,7 +4,7 @@ ggml_cuda_init: found 1 ROCm devices: Device 0: AMD Radeon Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32 | model | size | params | backend | ngl | fa | mmap | test | t/s | | ------------------------------ | ---------: | ---------: | ---------- | --: | -: | ---: | --------------: | -------------------: | -| gemma3 12B Q8_0 | 13.40 GiB | 11.77 B | ROCm | 99 | 1 | 0 | pp512 | 569.66 ± 0.60 | -| gemma3 12B Q8_0 | 13.40 GiB | 11.77 B | ROCm | 99 | 1 | 0 | tg128 | 13.78 ± 0.00 | +| gemma3 12B Q8_0 | 13.40 GiB | 11.77 B | ROCm | 99 | 1 | 0 | pp512 | 801.53 ± 1.48 | +| gemma3 12B Q8_0 | 13.40 GiB | 11.77 B | ROCm | 99 | 1 | 0 | tg128 | 13.88 ± 0.00 | -build: de219279 (6181) +build: f1fbffb5 (6486) diff --git a/benchmark/results/gemma-3-12b-it-UD-Q8_K_XL__rocm7_rc__hblt0.log b/benchmark/results/gemma-3-12b-it-UD-Q8_K_XL__rocm7_rc__hblt0.log index e22f092..352a972 100644 --- a/benchmark/results/gemma-3-12b-it-UD-Q8_K_XL__rocm7_rc__hblt0.log +++ b/benchmark/results/gemma-3-12b-it-UD-Q8_K_XL__rocm7_rc__hblt0.log @@ -4,7 +4,7 @@ ggml_cuda_init: found 1 ROCm devices: Device 0: AMD Radeon Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32 | model | size | params | backend | ngl | mmap | test | t/s | | ------------------------------ | ---------: | ---------: | ---------- | --: | ---: | --------------: | -------------------: | -| gemma3 12B Q8_0 | 13.40 GiB | 11.77 B | ROCm | 99 | 0 | pp512 | 750.36 ± 1.88 | -| gemma3 12B Q8_0 | 13.40 GiB | 11.77 B | ROCm | 99 | 0 | tg128 | 14.05 ± 0.00 | +| gemma3 12B Q8_0 | 13.40 GiB | 11.77 B | ROCm | 99 | 0 | pp512 | 751.81 ± 0.96 | +| gemma3 12B Q8_0 | 13.40 GiB | 11.77 B | ROCm | 99 | 0 | tg128 | 14.15 ± 0.00 | -build: de219279 (6181) +build: f1fbffb5 (6486) diff --git a/benchmark/results/gemma-3-12b-it-UD-Q8_K_XL__rocm7_rc__hblt0__fa1.log b/benchmark/results/gemma-3-12b-it-UD-Q8_K_XL__rocm7_rc__hblt0__fa1.log index 01fed10..3bdf7bd 100644 --- a/benchmark/results/gemma-3-12b-it-UD-Q8_K_XL__rocm7_rc__hblt0__fa1.log +++ b/benchmark/results/gemma-3-12b-it-UD-Q8_K_XL__rocm7_rc__hblt0__fa1.log @@ -4,7 +4,7 @@ ggml_cuda_init: found 1 ROCm devices: Device 0: AMD Radeon Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32 | model | size | params | backend | ngl | fa | mmap | test | t/s | | ------------------------------ | ---------: | ---------: | ---------- | --: | -: | ---: | --------------: | -------------------: | -| gemma3 12B Q8_0 | 13.40 GiB | 11.77 B | ROCm | 99 | 1 | 0 | pp512 | 559.73 ± 0.51 | -| gemma3 12B Q8_0 | 13.40 GiB | 11.77 B | ROCm | 99 | 1 | 0 | tg128 | 13.79 ± 0.00 | +| gemma3 12B Q8_0 | 13.40 GiB | 11.77 B | ROCm | 99 | 1 | 0 | pp512 | 791.04 ± 2.55 | +| gemma3 12B Q8_0 | 13.40 GiB | 11.77 B | ROCm | 99 | 1 | 0 | tg128 | 13.87 ± 0.00 | -build: de219279 (6181) +build: f1fbffb5 (6486) diff --git a/benchmark/results/gemma-3-12b-it-UD-Q8_K_XL__vulkan_amdvlk.log b/benchmark/results/gemma-3-12b-it-UD-Q8_K_XL__vulkan_amdvlk.log index 6d1ed7b..431a37b 100644 --- a/benchmark/results/gemma-3-12b-it-UD-Q8_K_XL__vulkan_amdvlk.log +++ b/benchmark/results/gemma-3-12b-it-UD-Q8_K_XL__vulkan_amdvlk.log @@ -2,7 +2,7 @@ ggml_vulkan: Found 1 Vulkan devices: ggml_vulkan: 0 = Radeon 8060S Graphics (AMD open-source driver) | uma: 1 | fp16: 1 | bf16: 0 | warp size: 64 | shared memory: 32768 | int dot: 1 | matrix cores: KHR_coopmat | model | size | params | backend | ngl | mmap | test | t/s | | ------------------------------ | ---------: | ---------: | ---------- | --: | ---: | --------------: | -------------------: | -| gemma3 12B Q8_0 | 13.40 GiB | 11.77 B | Vulkan | 99 | 0 | pp512 | 680.44 ± 0.55 | -| gemma3 12B Q8_0 | 13.40 GiB | 11.77 B | Vulkan | 99 | 0 | tg128 | 14.39 ± 0.03 | +| gemma3 12B Q8_0 | 13.40 GiB | 11.77 B | Vulkan | 99 | 0 | pp512 | 679.86 ± 1.33 | +| gemma3 12B Q8_0 | 13.40 GiB | 11.77 B | Vulkan | 99 | 0 | tg128 | 14.60 ± 0.03 | -build: 1fe00296 (6182) +build: f1fbffb5 (6486) diff --git a/benchmark/results/gemma-3-12b-it-UD-Q8_K_XL__vulkan_amdvlk__fa1.log b/benchmark/results/gemma-3-12b-it-UD-Q8_K_XL__vulkan_amdvlk__fa1.log index 3c0a8e7..bbdf2e7 100644 --- a/benchmark/results/gemma-3-12b-it-UD-Q8_K_XL__vulkan_amdvlk__fa1.log +++ b/benchmark/results/gemma-3-12b-it-UD-Q8_K_XL__vulkan_amdvlk__fa1.log @@ -2,7 +2,7 @@ ggml_vulkan: Found 1 Vulkan devices: ggml_vulkan: 0 = Radeon 8060S Graphics (AMD open-source driver) | uma: 1 | fp16: 1 | bf16: 0 | warp size: 64 | shared memory: 32768 | int dot: 1 | matrix cores: KHR_coopmat | model | size | params | backend | ngl | fa | mmap | test | t/s | | ------------------------------ | ---------: | ---------: | ---------- | --: | -: | ---: | --------------: | -------------------: | -| gemma3 12B Q8_0 | 13.40 GiB | 11.77 B | Vulkan | 99 | 1 | 0 | pp512 | 371.66 ± 0.51 | -| gemma3 12B Q8_0 | 13.40 GiB | 11.77 B | Vulkan | 99 | 1 | 0 | tg128 | 12.62 ± 0.00 | +| gemma3 12B Q8_0 | 13.40 GiB | 11.77 B | Vulkan | 99 | 1 | 0 | pp512 | 659.67 ± 0.72 | +| gemma3 12B Q8_0 | 13.40 GiB | 11.77 B | Vulkan | 99 | 1 | 0 | tg128 | 14.50 ± 0.00 | -build: 1fe00296 (6182) +build: f1fbffb5 (6486) diff --git a/benchmark/results/gemma-3-12b-it-UD-Q8_K_XL__vulkan_radv.log b/benchmark/results/gemma-3-12b-it-UD-Q8_K_XL__vulkan_radv.log index 5a3ee90..2c0ca73 100644 --- a/benchmark/results/gemma-3-12b-it-UD-Q8_K_XL__vulkan_radv.log +++ b/benchmark/results/gemma-3-12b-it-UD-Q8_K_XL__vulkan_radv.log @@ -2,7 +2,7 @@ ggml_vulkan: Found 1 Vulkan devices: ggml_vulkan: 0 = Radeon 8060S Graphics (RADV GFX1151) (radv) | uma: 1 | fp16: 1 | bf16: 0 | warp size: 64 | shared memory: 65536 | int dot: 1 | matrix cores: KHR_coopmat | model | size | params | backend | ngl | mmap | test | t/s | | ------------------------------ | ---------: | ---------: | ---------- | --: | ---: | --------------: | -------------------: | -| gemma3 12B Q8_0 | 13.40 GiB | 11.77 B | Vulkan | 99 | 0 | pp512 | 502.88 ± 1.45 | -| gemma3 12B Q8_0 | 13.40 GiB | 11.77 B | Vulkan | 99 | 0 | tg128 | 14.21 ± 0.00 | +| gemma3 12B Q8_0 | 13.40 GiB | 11.77 B | Vulkan | 99 | 0 | pp512 | 504.31 ± 3.20 | +| gemma3 12B Q8_0 | 13.40 GiB | 11.77 B | Vulkan | 99 | 0 | tg128 | 14.14 ± 0.01 | -build: 1fe00296 (6182) +build: f1fbffb5 (6486) diff --git a/benchmark/results/gemma-3-12b-it-UD-Q8_K_XL__vulkan_radv__fa1.log b/benchmark/results/gemma-3-12b-it-UD-Q8_K_XL__vulkan_radv__fa1.log index 8f4867f..f084df4 100644 --- a/benchmark/results/gemma-3-12b-it-UD-Q8_K_XL__vulkan_radv__fa1.log +++ b/benchmark/results/gemma-3-12b-it-UD-Q8_K_XL__vulkan_radv__fa1.log @@ -2,7 +2,7 @@ ggml_vulkan: Found 1 Vulkan devices: ggml_vulkan: 0 = Radeon 8060S Graphics (RADV GFX1151) (radv) | uma: 1 | fp16: 1 | bf16: 0 | warp size: 64 | shared memory: 65536 | int dot: 1 | matrix cores: KHR_coopmat | model | size | params | backend | ngl | fa | mmap | test | t/s | | ------------------------------ | ---------: | ---------: | ---------- | --: | -: | ---: | --------------: | -------------------: | -| gemma3 12B Q8_0 | 13.40 GiB | 11.77 B | Vulkan | 99 | 1 | 0 | pp512 | 496.33 ± 1.83 | -| gemma3 12B Q8_0 | 13.40 GiB | 11.77 B | Vulkan | 99 | 1 | 0 | tg128 | 14.02 ± 0.00 | +| gemma3 12B Q8_0 | 13.40 GiB | 11.77 B | Vulkan | 99 | 1 | 0 | pp512 | 501.78 ± 2.80 | +| gemma3 12B Q8_0 | 13.40 GiB | 11.77 B | Vulkan | 99 | 1 | 0 | tg128 | 13.95 ± 0.01 | -build: 1fe00296 (6182) +build: f1fbffb5 (6486) diff --git a/benchmark/results/gemma-3-27b-it-BF16-00001-of-00002__rocm6_4_3-rocwmma.log b/benchmark/results/gemma-3-27b-it-BF16-00001-of-00002__rocm6_4_3-rocwmma.log index 523552b..0199908 100644 --- a/benchmark/results/gemma-3-27b-it-BF16-00001-of-00002__rocm6_4_3-rocwmma.log +++ b/benchmark/results/gemma-3-27b-it-BF16-00001-of-00002__rocm6_4_3-rocwmma.log @@ -9,7 +9,7 @@ rocBLAS warning: hipBlasLT failed, falling back to tensile. This message will be only be displayed once, unless the ROCBLAS_VERBOSE_TENSILE_ERROR environment variable is set. | model | size | params | backend | ngl | mmap | test | t/s | | ------------------------------ | ---------: | ---------: | ---------- | --: | ---: | --------------: | -------------------: | -| gemma3 27B BF16 | 50.31 GiB | 27.01 B | ROCm | 99 | 0 | pp512 | 395.28 ± 0.22 | -| gemma3 27B BF16 | 50.31 GiB | 27.01 B | ROCm | 99 | 0 | tg128 | 3.96 ± 0.00 | +| gemma3 27B BF16 | 50.31 GiB | 27.01 B | ROCm | 99 | 0 | pp512 | 413.72 ± 0.86 | +| gemma3 27B BF16 | 50.31 GiB | 27.01 B | ROCm | 99 | 0 | tg128 | 4.09 ± 0.00 | -build: 1fe00296 (6182) +build: f1fbffb5 (6486) diff --git a/benchmark/results/gemma-3-27b-it-BF16-00001-of-00002__rocm6_4_3-rocwmma__fa1.log b/benchmark/results/gemma-3-27b-it-BF16-00001-of-00002__rocm6_4_3-rocwmma__fa1.log index f8b41b5..86b7295 100644 --- a/benchmark/results/gemma-3-27b-it-BF16-00001-of-00002__rocm6_4_3-rocwmma__fa1.log +++ b/benchmark/results/gemma-3-27b-it-BF16-00001-of-00002__rocm6_4_3-rocwmma__fa1.log @@ -4,7 +4,7 @@ ggml_cuda_init: found 1 ROCm devices: Device 0: Radeon 8060S Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32 | model | size | params | backend | ngl | fa | mmap | test | t/s | | ------------------------------ | ---------: | ---------: | ---------- | --: | -: | ---: | --------------: | -------------------: | -| gemma3 27B BF16 | 50.31 GiB | 27.01 B | ROCm | 99 | 1 | 0 | pp512 | 468.37 ± 1.54 | -| gemma3 27B BF16 | 50.31 GiB | 27.01 B | ROCm | 99 | 1 | 0 | tg128 | 4.08 ± 0.00 | +| gemma3 27B BF16 | 50.31 GiB | 27.01 B | ROCm | 99 | 1 | 0 | pp512 | 469.46 ± 1.37 | +| gemma3 27B BF16 | 50.31 GiB | 27.01 B | ROCm | 99 | 1 | 0 | tg128 | 4.11 ± 0.00 | -build: 1fe00296 (6182) +build: f1fbffb5 (6486) diff --git a/benchmark/results/gemma-3-27b-it-BF16-00001-of-00002__rocm6_4_3-rocwmma__hblt0.log b/benchmark/results/gemma-3-27b-it-BF16-00001-of-00002__rocm6_4_3-rocwmma__hblt0.log index 1ce39b8..eb32966 100644 --- a/benchmark/results/gemma-3-27b-it-BF16-00001-of-00002__rocm6_4_3-rocwmma__hblt0.log +++ b/benchmark/results/gemma-3-27b-it-BF16-00001-of-00002__rocm6_4_3-rocwmma__hblt0.log @@ -4,7 +4,7 @@ ggml_cuda_init: found 1 ROCm devices: Device 0: Radeon 8060S Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32 | model | size | params | backend | ngl | mmap | test | t/s | | ------------------------------ | ---------: | ---------: | ---------- | --: | ---: | --------------: | -------------------: | -| gemma3 27B BF16 | 50.31 GiB | 27.01 B | ROCm | 99 | 0 | pp512 | 79.42 ± 0.41 | -| gemma3 27B BF16 | 50.31 GiB | 27.01 B | ROCm | 99 | 0 | tg128 | 3.97 ± 0.00 | +| gemma3 27B BF16 | 50.31 GiB | 27.01 B | ROCm | 99 | 0 | pp512 | 84.71 ± 8.12 | +| gemma3 27B BF16 | 50.31 GiB | 27.01 B | ROCm | 99 | 0 | tg128 | 4.10 ± 0.00 | -build: 1fe00296 (6182) +build: f1fbffb5 (6486) diff --git a/benchmark/results/gemma-3-27b-it-BF16-00001-of-00002__rocm6_4_3-rocwmma__hblt0__fa1.log b/benchmark/results/gemma-3-27b-it-BF16-00001-of-00002__rocm6_4_3-rocwmma__hblt0__fa1.log index 29c9209..cc49148 100644 --- a/benchmark/results/gemma-3-27b-it-BF16-00001-of-00002__rocm6_4_3-rocwmma__hblt0__fa1.log +++ b/benchmark/results/gemma-3-27b-it-BF16-00001-of-00002__rocm6_4_3-rocwmma__hblt0__fa1.log @@ -4,7 +4,7 @@ ggml_cuda_init: found 1 ROCm devices: Device 0: Radeon 8060S Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32 | model | size | params | backend | ngl | fa | mmap | test | t/s | | ------------------------------ | ---------: | ---------: | ---------- | --: | -: | ---: | --------------: | -------------------: | -| gemma3 27B BF16 | 50.31 GiB | 27.01 B | ROCm | 99 | 1 | 0 | pp512 | 89.19 ± 0.53 | -| gemma3 27B BF16 | 50.31 GiB | 27.01 B | ROCm | 99 | 1 | 0 | tg128 | 4.06 ± 0.00 | +| gemma3 27B BF16 | 50.31 GiB | 27.01 B | ROCm | 99 | 1 | 0 | pp512 | 84.12 ± 9.82 | +| gemma3 27B BF16 | 50.31 GiB | 27.01 B | ROCm | 99 | 1 | 0 | tg128 | 4.11 ± 0.00 | -build: 1fe00296 (6182) +build: f1fbffb5 (6486) diff --git a/benchmark/results/gemma-3-27b-it-BF16-00001-of-00002__rocm6_4_3.log b/benchmark/results/gemma-3-27b-it-BF16-00001-of-00002__rocm6_4_3.log index fcaf5b3..8dc76ac 100644 --- a/benchmark/results/gemma-3-27b-it-BF16-00001-of-00002__rocm6_4_3.log +++ b/benchmark/results/gemma-3-27b-it-BF16-00001-of-00002__rocm6_4_3.log @@ -9,7 +9,7 @@ rocBLAS warning: hipBlasLT failed, falling back to tensile. This message will be only be displayed once, unless the ROCBLAS_VERBOSE_TENSILE_ERROR environment variable is set. | model | size | params | backend | ngl | mmap | test | t/s | | ------------------------------ | ---------: | ---------: | ---------- | --: | ---: | --------------: | -------------------: | -| gemma3 27B BF16 | 50.31 GiB | 27.01 B | ROCm | 99 | 0 | pp512 | 398.35 ± 1.07 | -| gemma3 27B BF16 | 50.31 GiB | 27.01 B | ROCm | 99 | 0 | tg128 | 4.09 ± 0.00 | +| gemma3 27B BF16 | 50.31 GiB | 27.01 B | ROCm | 99 | 0 | pp512 | 408.40 ± 1.09 | +| gemma3 27B BF16 | 50.31 GiB | 27.01 B | ROCm | 99 | 0 | tg128 | 4.10 ± 0.00 | -build: 1fe00296 (6182) +build: f1fbffb5 (6486) diff --git a/benchmark/results/gemma-3-27b-it-BF16-00001-of-00002__rocm6_4_3__fa1.log b/benchmark/results/gemma-3-27b-it-BF16-00001-of-00002__rocm6_4_3__fa1.log index bb05e2f..f94a57d 100644 --- a/benchmark/results/gemma-3-27b-it-BF16-00001-of-00002__rocm6_4_3__fa1.log +++ b/benchmark/results/gemma-3-27b-it-BF16-00001-of-00002__rocm6_4_3__fa1.log @@ -4,7 +4,7 @@ ggml_cuda_init: found 1 ROCm devices: Device 0: Radeon 8060S Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32 | model | size | params | backend | ngl | fa | mmap | test | t/s | | ------------------------------ | ---------: | ---------: | ---------- | --: | -: | ---: | --------------: | -------------------: | -| gemma3 27B BF16 | 50.31 GiB | 27.01 B | ROCm | 99 | 1 | 0 | pp512 | 302.82 ± 2.53 | -| gemma3 27B BF16 | 50.31 GiB | 27.01 B | ROCm | 99 | 1 | 0 | tg128 | 4.09 ± 0.00 | +| gemma3 27B BF16 | 50.31 GiB | 27.01 B | ROCm | 99 | 1 | 0 | pp512 | 470.49 ± 1.46 | +| gemma3 27B BF16 | 50.31 GiB | 27.01 B | ROCm | 99 | 1 | 0 | tg128 | 4.10 ± 0.00 | -build: 1fe00296 (6182) +build: f1fbffb5 (6486) diff --git a/benchmark/results/gemma-3-27b-it-BF16-00001-of-00002__rocm6_4_3__hblt0.log b/benchmark/results/gemma-3-27b-it-BF16-00001-of-00002__rocm6_4_3__hblt0.log index f41ad73..4fb6887 100644 --- a/benchmark/results/gemma-3-27b-it-BF16-00001-of-00002__rocm6_4_3__hblt0.log +++ b/benchmark/results/gemma-3-27b-it-BF16-00001-of-00002__rocm6_4_3__hblt0.log @@ -4,7 +4,7 @@ ggml_cuda_init: found 1 ROCm devices: Device 0: Radeon 8060S Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32 | model | size | params | backend | ngl | mmap | test | t/s | | ------------------------------ | ---------: | ---------: | ---------- | --: | ---: | --------------: | -------------------: | -| gemma3 27B BF16 | 50.31 GiB | 27.01 B | ROCm | 99 | 0 | pp512 | 59.13 ± 7.79 | -| gemma3 27B BF16 | 50.31 GiB | 27.01 B | ROCm | 99 | 0 | tg128 | 4.09 ± 0.00 | +| gemma3 27B BF16 | 50.31 GiB | 27.01 B | ROCm | 99 | 0 | pp512 | 84.93 ± 8.00 | +| gemma3 27B BF16 | 50.31 GiB | 27.01 B | ROCm | 99 | 0 | tg128 | 4.10 ± 0.00 | -build: 1fe00296 (6182) +build: f1fbffb5 (6486) diff --git a/benchmark/results/gemma-3-27b-it-BF16-00001-of-00002__rocm6_4_3__hblt0__fa1.log b/benchmark/results/gemma-3-27b-it-BF16-00001-of-00002__rocm6_4_3__hblt0__fa1.log index c07bd16..ee44cb5 100644 --- a/benchmark/results/gemma-3-27b-it-BF16-00001-of-00002__rocm6_4_3__hblt0__fa1.log +++ b/benchmark/results/gemma-3-27b-it-BF16-00001-of-00002__rocm6_4_3__hblt0__fa1.log @@ -4,7 +4,7 @@ ggml_cuda_init: found 1 ROCm devices: Device 0: Radeon 8060S Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32 | model | size | params | backend | ngl | fa | mmap | test | t/s | | ------------------------------ | ---------: | ---------: | ---------- | --: | -: | ---: | --------------: | -------------------: | -| gemma3 27B BF16 | 50.31 GiB | 27.01 B | ROCm | 99 | 1 | 0 | pp512 | 61.26 ± 10.54 | -| gemma3 27B BF16 | 50.31 GiB | 27.01 B | ROCm | 99 | 1 | 0 | tg128 | 4.09 ± 0.00 | +| gemma3 27B BF16 | 50.31 GiB | 27.01 B | ROCm | 99 | 1 | 0 | pp512 | 83.22 ± 10.78 | +| gemma3 27B BF16 | 50.31 GiB | 27.01 B | ROCm | 99 | 1 | 0 | tg128 | 4.10 ± 0.00 | -build: 1fe00296 (6182) +build: f1fbffb5 (6486) diff --git a/benchmark/results/gemma-3-27b-it-BF16-00001-of-00002__rocm7_rc-rocwmma-fa_all_quants.log b/benchmark/results/gemma-3-27b-it-BF16-00001-of-00002__rocm7_rc-rocwmma-fa_all_quants.log new file mode 100644 index 0000000..3332c2a --- /dev/null +++ b/benchmark/results/gemma-3-27b-it-BF16-00001-of-00002__rocm7_rc-rocwmma-fa_all_quants.log @@ -0,0 +1,10 @@ +ggml_cuda_init: GGML_CUDA_FORCE_MMQ: no +ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no +ggml_cuda_init: found 1 ROCm devices: + Device 0: AMD Radeon Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32 +| model | size | params | backend | ngl | mmap | test | t/s | +| ------------------------------ | ---------: | ---------: | ---------- | --: | ---: | --------------: | -------------------: | +| gemma3 27B BF16 | 50.31 GiB | 27.01 B | ROCm | 99 | 0 | pp512 | 412.86 ± 1.22 | +| gemma3 27B BF16 | 50.31 GiB | 27.01 B | ROCm | 99 | 0 | tg128 | 4.10 ± 0.00 | + +build: f1fbffb5 (6486) diff --git a/benchmark/results/gemma-3-27b-it-BF16-00001-of-00002__rocm7_rc-rocwmma-fa_all_quants__fa1.log b/benchmark/results/gemma-3-27b-it-BF16-00001-of-00002__rocm7_rc-rocwmma-fa_all_quants__fa1.log new file mode 100644 index 0000000..a8e7f5a --- /dev/null +++ b/benchmark/results/gemma-3-27b-it-BF16-00001-of-00002__rocm7_rc-rocwmma-fa_all_quants__fa1.log @@ -0,0 +1,10 @@ +ggml_cuda_init: GGML_CUDA_FORCE_MMQ: no +ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no +ggml_cuda_init: found 1 ROCm devices: + Device 0: AMD Radeon Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32 +| model | size | params | backend | ngl | fa | mmap | test | t/s | +| ------------------------------ | ---------: | ---------: | ---------- | --: | -: | ---: | --------------: | -------------------: | +| gemma3 27B BF16 | 50.31 GiB | 27.01 B | ROCm | 99 | 1 | 0 | pp512 | 465.55 ± 1.95 | +| gemma3 27B BF16 | 50.31 GiB | 27.01 B | ROCm | 99 | 1 | 0 | tg128 | 4.10 ± 0.00 | + +build: f1fbffb5 (6486) diff --git a/benchmark/results/gemma-3-27b-it-BF16-00001-of-00002__rocm7_rc-rocwmma-fa_all_quants__hblt0.log b/benchmark/results/gemma-3-27b-it-BF16-00001-of-00002__rocm7_rc-rocwmma-fa_all_quants__hblt0.log new file mode 100644 index 0000000..df6e790 --- /dev/null +++ b/benchmark/results/gemma-3-27b-it-BF16-00001-of-00002__rocm7_rc-rocwmma-fa_all_quants__hblt0.log @@ -0,0 +1,10 @@ +ggml_cuda_init: GGML_CUDA_FORCE_MMQ: no +ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no +ggml_cuda_init: found 1 ROCm devices: + Device 0: AMD Radeon Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32 +| model | size | params | backend | ngl | mmap | test | t/s | +| ------------------------------ | ---------: | ---------: | ---------- | --: | ---: | --------------: | -------------------: | +| gemma3 27B BF16 | 50.31 GiB | 27.01 B | ROCm | 99 | 0 | pp512 | 453.66 ± 0.77 | +| gemma3 27B BF16 | 50.31 GiB | 27.01 B | ROCm | 99 | 0 | tg128 | 4.10 ± 0.00 | + +build: f1fbffb5 (6486) diff --git a/benchmark/results/gemma-3-27b-it-BF16-00001-of-00002__rocm7_rc-rocwmma-fa_all_quants__hblt0__fa1.log b/benchmark/results/gemma-3-27b-it-BF16-00001-of-00002__rocm7_rc-rocwmma-fa_all_quants__hblt0__fa1.log new file mode 100644 index 0000000..89d3127 --- /dev/null +++ b/benchmark/results/gemma-3-27b-it-BF16-00001-of-00002__rocm7_rc-rocwmma-fa_all_quants__hblt0__fa1.log @@ -0,0 +1,10 @@ +ggml_cuda_init: GGML_CUDA_FORCE_MMQ: no +ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no +ggml_cuda_init: found 1 ROCm devices: + Device 0: AMD Radeon Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32 +| model | size | params | backend | ngl | fa | mmap | test | t/s | +| ------------------------------ | ---------: | ---------: | ---------- | --: | -: | ---: | --------------: | -------------------: | +| gemma3 27B BF16 | 50.31 GiB | 27.01 B | ROCm | 99 | 1 | 0 | pp512 | 498.77 ± 0.53 | +| gemma3 27B BF16 | 50.31 GiB | 27.01 B | ROCm | 99 | 1 | 0 | tg128 | 4.10 ± 0.00 | + +build: f1fbffb5 (6486) diff --git a/benchmark/results/gemma-3-27b-it-BF16-00001-of-00002__rocm7_rc-rocwmma.log b/benchmark/results/gemma-3-27b-it-BF16-00001-of-00002__rocm7_rc-rocwmma.log index d48d219..282d7f2 100644 --- a/benchmark/results/gemma-3-27b-it-BF16-00001-of-00002__rocm7_rc-rocwmma.log +++ b/benchmark/results/gemma-3-27b-it-BF16-00001-of-00002__rocm7_rc-rocwmma.log @@ -4,7 +4,7 @@ ggml_cuda_init: found 1 ROCm devices: Device 0: AMD Radeon Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32 | model | size | params | backend | ngl | mmap | test | t/s | | ------------------------------ | ---------: | ---------: | ---------- | --: | ---: | --------------: | -------------------: | -| gemma3 27B BF16 | 50.31 GiB | 27.01 B | ROCm | 99 | 0 | pp512 | 418.46 ± 0.10 | +| gemma3 27B BF16 | 50.31 GiB | 27.01 B | ROCm | 99 | 0 | pp512 | 419.05 ± 0.86 | | gemma3 27B BF16 | 50.31 GiB | 27.01 B | ROCm | 99 | 0 | tg128 | 4.09 ± 0.00 | -build: de219279 (6181) +build: f1fbffb5 (6486) diff --git a/benchmark/results/gemma-3-27b-it-BF16-00001-of-00002__rocm7_rc-rocwmma__fa1.log b/benchmark/results/gemma-3-27b-it-BF16-00001-of-00002__rocm7_rc-rocwmma__fa1.log index ba0b9fa..2c4f703 100644 --- a/benchmark/results/gemma-3-27b-it-BF16-00001-of-00002__rocm7_rc-rocwmma__fa1.log +++ b/benchmark/results/gemma-3-27b-it-BF16-00001-of-00002__rocm7_rc-rocwmma__fa1.log @@ -4,7 +4,7 @@ ggml_cuda_init: found 1 ROCm devices: Device 0: AMD Radeon Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32 | model | size | params | backend | ngl | fa | mmap | test | t/s | | ------------------------------ | ---------: | ---------: | ---------- | --: | -: | ---: | --------------: | -------------------: | -| gemma3 27B BF16 | 50.31 GiB | 27.01 B | ROCm | 99 | 1 | 0 | pp512 | 466.83 ± 1.65 | -| gemma3 27B BF16 | 50.31 GiB | 27.01 B | ROCm | 99 | 1 | 0 | tg128 | 4.07 ± 0.00 | +| gemma3 27B BF16 | 50.31 GiB | 27.01 B | ROCm | 99 | 1 | 0 | pp512 | 466.36 ± 1.34 | +| gemma3 27B BF16 | 50.31 GiB | 27.01 B | ROCm | 99 | 1 | 0 | tg128 | 4.10 ± 0.00 | -build: de219279 (6181) +build: f1fbffb5 (6486) diff --git a/benchmark/results/gemma-3-27b-it-BF16-00001-of-00002__rocm7_rc-rocwmma__hblt0.log b/benchmark/results/gemma-3-27b-it-BF16-00001-of-00002__rocm7_rc-rocwmma__hblt0.log index f152fdb..4cda7ad 100644 --- a/benchmark/results/gemma-3-27b-it-BF16-00001-of-00002__rocm7_rc-rocwmma__hblt0.log +++ b/benchmark/results/gemma-3-27b-it-BF16-00001-of-00002__rocm7_rc-rocwmma__hblt0.log @@ -4,7 +4,7 @@ ggml_cuda_init: found 1 ROCm devices: Device 0: AMD Radeon Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32 | model | size | params | backend | ngl | mmap | test | t/s | | ------------------------------ | ---------: | ---------: | ---------- | --: | ---: | --------------: | -------------------: | -| gemma3 27B BF16 | 50.31 GiB | 27.01 B | ROCm | 99 | 0 | pp512 | 454.10 ± 1.09 | +| gemma3 27B BF16 | 50.31 GiB | 27.01 B | ROCm | 99 | 0 | pp512 | 451.57 ± 0.41 | | gemma3 27B BF16 | 50.31 GiB | 27.01 B | ROCm | 99 | 0 | tg128 | 4.09 ± 0.00 | -build: de219279 (6181) +build: f1fbffb5 (6486) diff --git a/benchmark/results/gemma-3-27b-it-BF16-00001-of-00002__rocm7_rc-rocwmma__hblt0__fa1.log b/benchmark/results/gemma-3-27b-it-BF16-00001-of-00002__rocm7_rc-rocwmma__hblt0__fa1.log index a66ffac..c745c9f 100644 --- a/benchmark/results/gemma-3-27b-it-BF16-00001-of-00002__rocm7_rc-rocwmma__hblt0__fa1.log +++ b/benchmark/results/gemma-3-27b-it-BF16-00001-of-00002__rocm7_rc-rocwmma__hblt0__fa1.log @@ -4,7 +4,7 @@ ggml_cuda_init: found 1 ROCm devices: Device 0: AMD Radeon Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32 | model | size | params | backend | ngl | fa | mmap | test | t/s | | ------------------------------ | ---------: | ---------: | ---------- | --: | -: | ---: | --------------: | -------------------: | -| gemma3 27B BF16 | 50.31 GiB | 27.01 B | ROCm | 99 | 1 | 0 | pp512 | 499.43 ± 1.24 | -| gemma3 27B BF16 | 50.31 GiB | 27.01 B | ROCm | 99 | 1 | 0 | tg128 | 4.06 ± 0.00 | +| gemma3 27B BF16 | 50.31 GiB | 27.01 B | ROCm | 99 | 1 | 0 | pp512 | 499.87 ± 0.47 | +| gemma3 27B BF16 | 50.31 GiB | 27.01 B | ROCm | 99 | 1 | 0 | tg128 | 4.10 ± 0.00 | -build: de219279 (6181) +build: f1fbffb5 (6486) diff --git a/benchmark/results/gemma-3-27b-it-BF16-00001-of-00002__rocm7_rc.log b/benchmark/results/gemma-3-27b-it-BF16-00001-of-00002__rocm7_rc.log index adb03dd..9f5c43c 100644 --- a/benchmark/results/gemma-3-27b-it-BF16-00001-of-00002__rocm7_rc.log +++ b/benchmark/results/gemma-3-27b-it-BF16-00001-of-00002__rocm7_rc.log @@ -4,7 +4,7 @@ ggml_cuda_init: found 1 ROCm devices: Device 0: AMD Radeon Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32 | model | size | params | backend | ngl | mmap | test | t/s | | ------------------------------ | ---------: | ---------: | ---------- | --: | ---: | --------------: | -------------------: | -| gemma3 27B BF16 | 50.31 GiB | 27.01 B | ROCm | 99 | 0 | pp512 | 392.50 ± 0.50 | -| gemma3 27B BF16 | 50.31 GiB | 27.01 B | ROCm | 99 | 0 | tg128 | 3.97 ± 0.00 | +| gemma3 27B BF16 | 50.31 GiB | 27.01 B | ROCm | 99 | 0 | pp512 | 422.00 ± 0.56 | +| gemma3 27B BF16 | 50.31 GiB | 27.01 B | ROCm | 99 | 0 | tg128 | 4.10 ± 0.00 | -build: de219279 (6181) +build: f1fbffb5 (6486) diff --git a/benchmark/results/gemma-3-27b-it-BF16-00001-of-00002__rocm7_rc__fa1.log b/benchmark/results/gemma-3-27b-it-BF16-00001-of-00002__rocm7_rc__fa1.log index 5ec86d3..e43f6df 100644 --- a/benchmark/results/gemma-3-27b-it-BF16-00001-of-00002__rocm7_rc__fa1.log +++ b/benchmark/results/gemma-3-27b-it-BF16-00001-of-00002__rocm7_rc__fa1.log @@ -4,7 +4,7 @@ ggml_cuda_init: found 1 ROCm devices: Device 0: AMD Radeon Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32 | model | size | params | backend | ngl | fa | mmap | test | t/s | | ------------------------------ | ---------: | ---------: | ---------- | --: | -: | ---: | --------------: | -------------------: | -| gemma3 27B BF16 | 50.31 GiB | 27.01 B | ROCm | 99 | 1 | 0 | pp512 | 311.25 ± 0.72 | -| gemma3 27B BF16 | 50.31 GiB | 27.01 B | ROCm | 99 | 1 | 0 | tg128 | 4.09 ± 0.00 | +| gemma3 27B BF16 | 50.31 GiB | 27.01 B | ROCm | 99 | 1 | 0 | pp512 | 469.45 ± 1.83 | +| gemma3 27B BF16 | 50.31 GiB | 27.01 B | ROCm | 99 | 1 | 0 | tg128 | 4.11 ± 0.00 | -build: de219279 (6181) +build: f1fbffb5 (6486) diff --git a/benchmark/results/gemma-3-27b-it-BF16-00001-of-00002__rocm7_rc__hblt0.log b/benchmark/results/gemma-3-27b-it-BF16-00001-of-00002__rocm7_rc__hblt0.log index bec2363..2253da3 100644 --- a/benchmark/results/gemma-3-27b-it-BF16-00001-of-00002__rocm7_rc__hblt0.log +++ b/benchmark/results/gemma-3-27b-it-BF16-00001-of-00002__rocm7_rc__hblt0.log @@ -4,7 +4,7 @@ ggml_cuda_init: found 1 ROCm devices: Device 0: AMD Radeon Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32 | model | size | params | backend | ngl | mmap | test | t/s | | ------------------------------ | ---------: | ---------: | ---------- | --: | ---: | --------------: | -------------------: | -| gemma3 27B BF16 | 50.31 GiB | 27.01 B | ROCm | 99 | 0 | pp512 | 451.69 ± 0.62 | +| gemma3 27B BF16 | 50.31 GiB | 27.01 B | ROCm | 99 | 0 | pp512 | 453.24 ± 0.64 | | gemma3 27B BF16 | 50.31 GiB | 27.01 B | ROCm | 99 | 0 | tg128 | 4.09 ± 0.00 | -build: de219279 (6181) +build: f1fbffb5 (6486) diff --git a/benchmark/results/gemma-3-27b-it-BF16-00001-of-00002__rocm7_rc__hblt0__fa1.log b/benchmark/results/gemma-3-27b-it-BF16-00001-of-00002__rocm7_rc__hblt0__fa1.log index 3a00d5c..98a164a 100644 --- a/benchmark/results/gemma-3-27b-it-BF16-00001-of-00002__rocm7_rc__hblt0__fa1.log +++ b/benchmark/results/gemma-3-27b-it-BF16-00001-of-00002__rocm7_rc__hblt0__fa1.log @@ -4,7 +4,7 @@ ggml_cuda_init: found 1 ROCm devices: Device 0: AMD Radeon Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32 | model | size | params | backend | ngl | fa | mmap | test | t/s | | ------------------------------ | ---------: | ---------: | ---------- | --: | -: | ---: | --------------: | -------------------: | -| gemma3 27B BF16 | 50.31 GiB | 27.01 B | ROCm | 99 | 1 | 0 | pp512 | 324.43 ± 0.22 | -| gemma3 27B BF16 | 50.31 GiB | 27.01 B | ROCm | 99 | 1 | 0 | tg128 | 4.09 ± 0.00 | +| gemma3 27B BF16 | 50.31 GiB | 27.01 B | ROCm | 99 | 1 | 0 | pp512 | 502.26 ± 0.81 | +| gemma3 27B BF16 | 50.31 GiB | 27.01 B | ROCm | 99 | 1 | 0 | tg128 | 4.10 ± 0.00 | -build: de219279 (6181) +build: f1fbffb5 (6486) diff --git a/benchmark/results/gemma-3-27b-it-BF16-00001-of-00002__vulkan_radv.log b/benchmark/results/gemma-3-27b-it-BF16-00001-of-00002__vulkan_radv.log index 9dfdc23..19c65b3 100644 --- a/benchmark/results/gemma-3-27b-it-BF16-00001-of-00002__vulkan_radv.log +++ b/benchmark/results/gemma-3-27b-it-BF16-00001-of-00002__vulkan_radv.log @@ -2,7 +2,7 @@ ggml_vulkan: Found 1 Vulkan devices: ggml_vulkan: 0 = Radeon 8060S Graphics (RADV GFX1151) (radv) | uma: 1 | fp16: 1 | bf16: 0 | warp size: 64 | shared memory: 65536 | int dot: 1 | matrix cores: KHR_coopmat | model | size | params | backend | ngl | mmap | test | t/s | | ------------------------------ | ---------: | ---------: | ---------- | --: | ---: | --------------: | -------------------: | -| gemma3 27B BF16 | 50.31 GiB | 27.01 B | Vulkan | 99 | 0 | pp512 | 129.49 ± 0.34 | -| gemma3 27B BF16 | 50.31 GiB | 27.01 B | Vulkan | 99 | 0 | tg128 | 4.06 ± 0.00 | +| gemma3 27B BF16 | 50.31 GiB | 27.01 B | Vulkan | 99 | 0 | pp512 | 134.52 ± 0.99 | +| gemma3 27B BF16 | 50.31 GiB | 27.01 B | Vulkan | 99 | 0 | tg128 | 3.92 ± 0.00 | -build: 1fe00296 (6182) +build: f1fbffb5 (6486) diff --git a/benchmark/results/gemma-3-27b-it-BF16-00001-of-00002__vulkan_radv__fa1.log b/benchmark/results/gemma-3-27b-it-BF16-00001-of-00002__vulkan_radv__fa1.log index 96c0e93..a1f09eb 100644 --- a/benchmark/results/gemma-3-27b-it-BF16-00001-of-00002__vulkan_radv__fa1.log +++ b/benchmark/results/gemma-3-27b-it-BF16-00001-of-00002__vulkan_radv__fa1.log @@ -2,7 +2,7 @@ ggml_vulkan: Found 1 Vulkan devices: ggml_vulkan: 0 = Radeon 8060S Graphics (RADV GFX1151) (radv) | uma: 1 | fp16: 1 | bf16: 0 | warp size: 64 | shared memory: 65536 | int dot: 1 | matrix cores: KHR_coopmat | model | size | params | backend | ngl | fa | mmap | test | t/s | | ------------------------------ | ---------: | ---------: | ---------- | --: | -: | ---: | --------------: | -------------------: | -| gemma3 27B BF16 | 50.31 GiB | 27.01 B | Vulkan | 99 | 1 | 0 | pp512 | 137.67 ± 1.25 | -| gemma3 27B BF16 | 50.31 GiB | 27.01 B | Vulkan | 99 | 1 | 0 | tg128 | 4.06 ± 0.00 | +| gemma3 27B BF16 | 50.31 GiB | 27.01 B | Vulkan | 99 | 1 | 0 | pp512 | 138.59 ± 1.23 | +| gemma3 27B BF16 | 50.31 GiB | 27.01 B | Vulkan | 99 | 1 | 0 | tg128 | 3.93 ± 0.00 | -build: 1fe00296 (6182) +build: f1fbffb5 (6486) diff --git a/benchmark/results/gemma-3-4b-it-Q3_K_S__rocm6_4_3-rocwmma.log b/benchmark/results/gemma-3-4b-it-Q3_K_S__rocm6_4_3-rocwmma.log index 4a3544a..8013724 100644 --- a/benchmark/results/gemma-3-4b-it-Q3_K_S__rocm6_4_3-rocwmma.log +++ b/benchmark/results/gemma-3-4b-it-Q3_K_S__rocm6_4_3-rocwmma.log @@ -9,7 +9,7 @@ rocBLAS warning: hipBlasLT failed, falling back to tensile. This message will be only be displayed once, unless the ROCBLAS_VERBOSE_TENSILE_ERROR environment variable is set. | model | size | params | backend | ngl | mmap | test | t/s | | ------------------------------ | ---------: | ---------: | ---------- | --: | ---: | --------------: | -------------------: | -| gemma3 4B Q3_K - Small | 1.80 GiB | 3.88 B | ROCm | 99 | 0 | pp512 | 2033.46 ± 5.16 | -| gemma3 4B Q3_K - Small | 1.80 GiB | 3.88 B | ROCm | 99 | 0 | tg128 | 76.47 ± 0.26 | +| gemma3 4B Q3_K - Small | 1.80 GiB | 3.88 B | ROCm | 99 | 0 | pp512 | 1886.62 ± 6.81 | +| gemma3 4B Q3_K - Small | 1.80 GiB | 3.88 B | ROCm | 99 | 0 | tg128 | 76.36 ± 5.10 | -build: 1fe00296 (6182) +build: f1fbffb5 (6486) diff --git a/benchmark/results/gemma-3-4b-it-Q3_K_S__rocm6_4_3-rocwmma__fa1.log b/benchmark/results/gemma-3-4b-it-Q3_K_S__rocm6_4_3-rocwmma__fa1.log index 78f4e0c..725652c 100644 --- a/benchmark/results/gemma-3-4b-it-Q3_K_S__rocm6_4_3-rocwmma__fa1.log +++ b/benchmark/results/gemma-3-4b-it-Q3_K_S__rocm6_4_3-rocwmma__fa1.log @@ -4,7 +4,7 @@ ggml_cuda_init: found 1 ROCm devices: Device 0: Radeon 8060S Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32 | model | size | params | backend | ngl | fa | mmap | test | t/s | | ------------------------------ | ---------: | ---------: | ---------- | --: | -: | ---: | --------------: | -------------------: | -| gemma3 4B Q3_K - Small | 1.80 GiB | 3.88 B | ROCm | 99 | 1 | 0 | pp512 | 2276.86 ± 9.60 | -| gemma3 4B Q3_K - Small | 1.80 GiB | 3.88 B | ROCm | 99 | 1 | 0 | tg128 | 70.76 ± 0.26 | +| gemma3 4B Q3_K - Small | 1.80 GiB | 3.88 B | ROCm | 99 | 1 | 0 | pp512 | 2282.08 ± 7.86 | +| gemma3 4B Q3_K - Small | 1.80 GiB | 3.88 B | ROCm | 99 | 1 | 0 | tg128 | 72.40 ± 0.03 | -build: 1fe00296 (6182) +build: f1fbffb5 (6486) diff --git a/benchmark/results/gemma-3-4b-it-Q3_K_S__rocm6_4_3-rocwmma__hblt0.log b/benchmark/results/gemma-3-4b-it-Q3_K_S__rocm6_4_3-rocwmma__hblt0.log index 54a23d2..3fa4608 100644 --- a/benchmark/results/gemma-3-4b-it-Q3_K_S__rocm6_4_3-rocwmma__hblt0.log +++ b/benchmark/results/gemma-3-4b-it-Q3_K_S__rocm6_4_3-rocwmma__hblt0.log @@ -4,7 +4,7 @@ ggml_cuda_init: found 1 ROCm devices: Device 0: Radeon 8060S Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32 | model | size | params | backend | ngl | mmap | test | t/s | | ------------------------------ | ---------: | ---------: | ---------- | --: | ---: | --------------: | -------------------: | -| gemma3 4B Q3_K - Small | 1.80 GiB | 3.88 B | ROCm | 99 | 0 | pp512 | 727.18 ± 2.22 | -| gemma3 4B Q3_K - Small | 1.80 GiB | 3.88 B | ROCm | 99 | 0 | tg128 | 75.65 ± 0.74 | +| gemma3 4B Q3_K - Small | 1.80 GiB | 3.88 B | ROCm | 99 | 0 | pp512 | 713.12 ± 38.25 | +| gemma3 4B Q3_K - Small | 1.80 GiB | 3.88 B | ROCm | 99 | 0 | tg128 | 67.01 ± 5.06 | -build: 1fe00296 (6182) +build: f1fbffb5 (6486) diff --git a/benchmark/results/gemma-3-4b-it-Q3_K_S__rocm6_4_3-rocwmma__hblt0__fa1.log b/benchmark/results/gemma-3-4b-it-Q3_K_S__rocm6_4_3-rocwmma__hblt0__fa1.log index 24c6a23..978ff37 100644 --- a/benchmark/results/gemma-3-4b-it-Q3_K_S__rocm6_4_3-rocwmma__hblt0__fa1.log +++ b/benchmark/results/gemma-3-4b-it-Q3_K_S__rocm6_4_3-rocwmma__hblt0__fa1.log @@ -4,7 +4,7 @@ ggml_cuda_init: found 1 ROCm devices: Device 0: Radeon 8060S Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32 | model | size | params | backend | ngl | fa | mmap | test | t/s | | ------------------------------ | ---------: | ---------: | ---------- | --: | -: | ---: | --------------: | -------------------: | -| gemma3 4B Q3_K - Small | 1.80 GiB | 3.88 B | ROCm | 99 | 1 | 0 | pp512 | 740.27 ± 10.38 | -| gemma3 4B Q3_K - Small | 1.80 GiB | 3.88 B | ROCm | 99 | 1 | 0 | tg128 | 70.76 ± 0.11 | +| gemma3 4B Q3_K - Small | 1.80 GiB | 3.88 B | ROCm | 99 | 1 | 0 | pp512 | 676.80 ± 75.42 | +| gemma3 4B Q3_K - Small | 1.80 GiB | 3.88 B | ROCm | 99 | 1 | 0 | tg128 | 59.10 ± 2.57 | -build: 1fe00296 (6182) +build: f1fbffb5 (6486) diff --git a/benchmark/results/gemma-3-4b-it-Q3_K_S__rocm6_4_3.log b/benchmark/results/gemma-3-4b-it-Q3_K_S__rocm6_4_3.log index e669ba8..654df84 100644 --- a/benchmark/results/gemma-3-4b-it-Q3_K_S__rocm6_4_3.log +++ b/benchmark/results/gemma-3-4b-it-Q3_K_S__rocm6_4_3.log @@ -9,7 +9,7 @@ rocBLAS warning: hipBlasLT failed, falling back to tensile. This message will be only be displayed once, unless the ROCBLAS_VERBOSE_TENSILE_ERROR environment variable is set. | model | size | params | backend | ngl | mmap | test | t/s | | ------------------------------ | ---------: | ---------: | ---------- | --: | ---: | --------------: | -------------------: | -| gemma3 4B Q3_K - Small | 1.80 GiB | 3.88 B | ROCm | 99 | 0 | pp512 | 2035.38 ± 4.03 | -| gemma3 4B Q3_K - Small | 1.80 GiB | 3.88 B | ROCm | 99 | 0 | tg128 | 75.40 ± 0.80 | +| gemma3 4B Q3_K - Small | 1.80 GiB | 3.88 B | ROCm | 99 | 0 | pp512 | 1857.54 ± 7.32 | +| gemma3 4B Q3_K - Small | 1.80 GiB | 3.88 B | ROCm | 99 | 0 | tg128 | 75.34 ± 7.91 | -build: 1fe00296 (6182) +build: f1fbffb5 (6486) diff --git a/benchmark/results/gemma-3-4b-it-Q3_K_S__rocm6_4_3__fa1.log b/benchmark/results/gemma-3-4b-it-Q3_K_S__rocm6_4_3__fa1.log index 1562460..1842c2a 100644 --- a/benchmark/results/gemma-3-4b-it-Q3_K_S__rocm6_4_3__fa1.log +++ b/benchmark/results/gemma-3-4b-it-Q3_K_S__rocm6_4_3__fa1.log @@ -4,7 +4,7 @@ ggml_cuda_init: found 1 ROCm devices: Device 0: Radeon 8060S Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32 | model | size | params | backend | ngl | fa | mmap | test | t/s | | ------------------------------ | ---------: | ---------: | ---------- | --: | -: | ---: | --------------: | -------------------: | -| gemma3 4B Q3_K - Small | 1.80 GiB | 3.88 B | ROCm | 99 | 1 | 0 | pp512 | 1515.55 ± 8.10 | -| gemma3 4B Q3_K - Small | 1.80 GiB | 3.88 B | ROCm | 99 | 1 | 0 | tg128 | 70.20 ± 0.39 | +| gemma3 4B Q3_K - Small | 1.80 GiB | 3.88 B | ROCm | 99 | 1 | 0 | pp512 | 2214.91 ± 7.20 | +| gemma3 4B Q3_K - Small | 1.80 GiB | 3.88 B | ROCm | 99 | 1 | 0 | tg128 | 72.36 ± 0.05 | -build: 1fe00296 (6182) +build: f1fbffb5 (6486) diff --git a/benchmark/results/gemma-3-4b-it-Q3_K_S__rocm6_4_3__hblt0.log b/benchmark/results/gemma-3-4b-it-Q3_K_S__rocm6_4_3__hblt0.log index b49eeff..83e72da 100644 --- a/benchmark/results/gemma-3-4b-it-Q3_K_S__rocm6_4_3__hblt0.log +++ b/benchmark/results/gemma-3-4b-it-Q3_K_S__rocm6_4_3__hblt0.log @@ -4,7 +4,7 @@ ggml_cuda_init: found 1 ROCm devices: Device 0: Radeon 8060S Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32 | model | size | params | backend | ngl | mmap | test | t/s | | ------------------------------ | ---------: | ---------: | ---------- | --: | ---: | --------------: | -------------------: | -| gemma3 4B Q3_K - Small | 1.80 GiB | 3.88 B | ROCm | 99 | 0 | pp512 | 714.75 ± 27.98 | -| gemma3 4B Q3_K - Small | 1.80 GiB | 3.88 B | ROCm | 99 | 0 | tg128 | 66.10 ± 5.25 | +| gemma3 4B Q3_K - Small | 1.80 GiB | 3.88 B | ROCm | 99 | 0 | pp512 | 656.82 ± 60.97 | +| gemma3 4B Q3_K - Small | 1.80 GiB | 3.88 B | ROCm | 99 | 0 | tg128 | 63.81 ± 3.45 | -build: 1fe00296 (6182) +build: f1fbffb5 (6486) diff --git a/benchmark/results/gemma-3-4b-it-Q3_K_S__rocm6_4_3__hblt0__fa1.log b/benchmark/results/gemma-3-4b-it-Q3_K_S__rocm6_4_3__hblt0__fa1.log index f75c3de..60ffb7f 100644 --- a/benchmark/results/gemma-3-4b-it-Q3_K_S__rocm6_4_3__hblt0__fa1.log +++ b/benchmark/results/gemma-3-4b-it-Q3_K_S__rocm6_4_3__hblt0__fa1.log @@ -4,7 +4,7 @@ ggml_cuda_init: found 1 ROCm devices: Device 0: Radeon 8060S Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32 | model | size | params | backend | ngl | fa | mmap | test | t/s | | ------------------------------ | ---------: | ---------: | ---------- | --: | -: | ---: | --------------: | -------------------: | -| gemma3 4B Q3_K - Small | 1.80 GiB | 3.88 B | ROCm | 99 | 1 | 0 | pp512 | 596.86 ± 37.66 | -| gemma3 4B Q3_K - Small | 1.80 GiB | 3.88 B | ROCm | 99 | 1 | 0 | tg128 | 58.75 ± 3.09 | +| gemma3 4B Q3_K - Small | 1.80 GiB | 3.88 B | ROCm | 99 | 1 | 0 | pp512 | 663.36 ± 79.77 | +| gemma3 4B Q3_K - Small | 1.80 GiB | 3.88 B | ROCm | 99 | 1 | 0 | tg128 | 58.63 ± 2.61 | -build: 1fe00296 (6182) +build: f1fbffb5 (6486) diff --git a/benchmark/results/gemma-3-4b-it-Q3_K_S__rocm7_rc-rocwmma-fa_all_quants.log b/benchmark/results/gemma-3-4b-it-Q3_K_S__rocm7_rc-rocwmma-fa_all_quants.log new file mode 100644 index 0000000..0a762e2 --- /dev/null +++ b/benchmark/results/gemma-3-4b-it-Q3_K_S__rocm7_rc-rocwmma-fa_all_quants.log @@ -0,0 +1,10 @@ +ggml_cuda_init: GGML_CUDA_FORCE_MMQ: no +ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no +ggml_cuda_init: found 1 ROCm devices: + Device 0: AMD Radeon Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32 +| model | size | params | backend | ngl | mmap | test | t/s | +| ------------------------------ | ---------: | ---------: | ---------- | --: | ---: | --------------: | -------------------: | +| gemma3 4B Q3_K - Small | 1.80 GiB | 3.88 B | ROCm | 99 | 0 | pp512 | 1830.34 ± 15.12 | +| gemma3 4B Q3_K - Small | 1.80 GiB | 3.88 B | ROCm | 99 | 0 | tg128 | 60.04 ± 4.39 | + +build: f1fbffb5 (6486) diff --git a/benchmark/results/gemma-3-4b-it-Q3_K_S__rocm7_rc-rocwmma-fa_all_quants__fa1.log b/benchmark/results/gemma-3-4b-it-Q3_K_S__rocm7_rc-rocwmma-fa_all_quants__fa1.log new file mode 100644 index 0000000..b4e8cfb --- /dev/null +++ b/benchmark/results/gemma-3-4b-it-Q3_K_S__rocm7_rc-rocwmma-fa_all_quants__fa1.log @@ -0,0 +1,10 @@ +ggml_cuda_init: GGML_CUDA_FORCE_MMQ: no +ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no +ggml_cuda_init: found 1 ROCm devices: + Device 0: AMD Radeon Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32 +| model | size | params | backend | ngl | fa | mmap | test | t/s | +| ------------------------------ | ---------: | ---------: | ---------- | --: | -: | ---: | --------------: | -------------------: | +| gemma3 4B Q3_K - Small | 1.80 GiB | 3.88 B | ROCm | 99 | 1 | 0 | pp512 | 2178.17 ± 91.83 | +| gemma3 4B Q3_K - Small | 1.80 GiB | 3.88 B | ROCm | 99 | 1 | 0 | tg128 | 55.78 ± 3.20 | + +build: f1fbffb5 (6486) diff --git a/benchmark/results/gemma-3-4b-it-Q3_K_S__rocm7_rc-rocwmma-fa_all_quants__hblt0.log b/benchmark/results/gemma-3-4b-it-Q3_K_S__rocm7_rc-rocwmma-fa_all_quants__hblt0.log new file mode 100644 index 0000000..9113e30 --- /dev/null +++ b/benchmark/results/gemma-3-4b-it-Q3_K_S__rocm7_rc-rocwmma-fa_all_quants__hblt0.log @@ -0,0 +1,10 @@ +ggml_cuda_init: GGML_CUDA_FORCE_MMQ: no +ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no +ggml_cuda_init: found 1 ROCm devices: + Device 0: AMD Radeon Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32 +| model | size | params | backend | ngl | mmap | test | t/s | +| ------------------------------ | ---------: | ---------: | ---------- | --: | ---: | --------------: | -------------------: | +| gemma3 4B Q3_K - Small | 1.80 GiB | 3.88 B | ROCm | 99 | 0 | pp512 | 2016.93 ± 4.81 | +| gemma3 4B Q3_K - Small | 1.80 GiB | 3.88 B | ROCm | 99 | 0 | tg128 | 58.29 ± 3.79 | + +build: f1fbffb5 (6486) diff --git a/benchmark/results/gemma-3-4b-it-Q3_K_S__rocm7_rc-rocwmma-fa_all_quants__hblt0__fa1.log b/benchmark/results/gemma-3-4b-it-Q3_K_S__rocm7_rc-rocwmma-fa_all_quants__hblt0__fa1.log new file mode 100644 index 0000000..8260f58 --- /dev/null +++ b/benchmark/results/gemma-3-4b-it-Q3_K_S__rocm7_rc-rocwmma-fa_all_quants__hblt0__fa1.log @@ -0,0 +1,10 @@ +ggml_cuda_init: GGML_CUDA_FORCE_MMQ: no +ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no +ggml_cuda_init: found 1 ROCm devices: + Device 0: AMD Radeon Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32 +| model | size | params | backend | ngl | fa | mmap | test | t/s | +| ------------------------------ | ---------: | ---------: | ---------- | --: | -: | ---: | --------------: | -------------------: | +| gemma3 4B Q3_K - Small | 1.80 GiB | 3.88 B | ROCm | 99 | 1 | 0 | pp512 | 2094.58 ± 12.74 | +| gemma3 4B Q3_K - Small | 1.80 GiB | 3.88 B | ROCm | 99 | 1 | 0 | tg128 | 66.23 ± 8.72 | + +build: f1fbffb5 (6486) diff --git a/benchmark/results/gemma-3-4b-it-Q3_K_S__rocm7_rc-rocwmma.log b/benchmark/results/gemma-3-4b-it-Q3_K_S__rocm7_rc-rocwmma.log index c7f6ba3..d25fd4e 100644 --- a/benchmark/results/gemma-3-4b-it-Q3_K_S__rocm7_rc-rocwmma.log +++ b/benchmark/results/gemma-3-4b-it-Q3_K_S__rocm7_rc-rocwmma.log @@ -4,7 +4,7 @@ ggml_cuda_init: found 1 ROCm devices: Device 0: AMD Radeon Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32 | model | size | params | backend | ngl | mmap | test | t/s | | ------------------------------ | ---------: | ---------: | ---------- | --: | ---: | --------------: | -------------------: | -| gemma3 4B Q3_K - Small | 1.80 GiB | 3.88 B | ROCm | 99 | 0 | pp512 | 2014.60 ± 24.35 | -| gemma3 4B Q3_K - Small | 1.80 GiB | 3.88 B | ROCm | 99 | 0 | tg128 | 59.16 ± 3.76 | +| gemma3 4B Q3_K - Small | 1.80 GiB | 3.88 B | ROCm | 99 | 0 | pp512 | 1865.95 ± 7.12 | +| gemma3 4B Q3_K - Small | 1.80 GiB | 3.88 B | ROCm | 99 | 0 | tg128 | 79.56 ± 0.09 | -build: de219279 (6181) +build: f1fbffb5 (6486) diff --git a/benchmark/results/gemma-3-4b-it-Q3_K_S__rocm7_rc-rocwmma__fa1.log b/benchmark/results/gemma-3-4b-it-Q3_K_S__rocm7_rc-rocwmma__fa1.log index 0862fc6..fe4b290 100644 --- a/benchmark/results/gemma-3-4b-it-Q3_K_S__rocm7_rc-rocwmma__fa1.log +++ b/benchmark/results/gemma-3-4b-it-Q3_K_S__rocm7_rc-rocwmma__fa1.log @@ -4,7 +4,7 @@ ggml_cuda_init: found 1 ROCm devices: Device 0: AMD Radeon Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32 | model | size | params | backend | ngl | fa | mmap | test | t/s | | ------------------------------ | ---------: | ---------: | ---------- | --: | -: | ---: | --------------: | -------------------: | -| gemma3 4B Q3_K - Small | 1.80 GiB | 3.88 B | ROCm | 99 | 1 | 0 | pp512 | 2191.77 ± 78.21 | -| gemma3 4B Q3_K - Small | 1.80 GiB | 3.88 B | ROCm | 99 | 1 | 0 | tg128 | 54.32 ± 2.65 | +| gemma3 4B Q3_K - Small | 1.80 GiB | 3.88 B | ROCm | 99 | 1 | 0 | pp512 | 2252.60 ± 11.74 | +| gemma3 4B Q3_K - Small | 1.80 GiB | 3.88 B | ROCm | 99 | 1 | 0 | tg128 | 72.57 ± 0.05 | -build: de219279 (6181) +build: f1fbffb5 (6486) diff --git a/benchmark/results/gemma-3-4b-it-Q3_K_S__rocm7_rc-rocwmma__hblt0.log b/benchmark/results/gemma-3-4b-it-Q3_K_S__rocm7_rc-rocwmma__hblt0.log index 4293b33..5483c20 100644 --- a/benchmark/results/gemma-3-4b-it-Q3_K_S__rocm7_rc-rocwmma__hblt0.log +++ b/benchmark/results/gemma-3-4b-it-Q3_K_S__rocm7_rc-rocwmma__hblt0.log @@ -4,7 +4,7 @@ ggml_cuda_init: found 1 ROCm devices: Device 0: AMD Radeon Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32 | model | size | params | backend | ngl | mmap | test | t/s | | ------------------------------ | ---------: | ---------: | ---------- | --: | ---: | --------------: | -------------------: | -| gemma3 4B Q3_K - Small | 1.80 GiB | 3.88 B | ROCm | 99 | 0 | pp512 | 1991.71 ± 2.91 | -| gemma3 4B Q3_K - Small | 1.80 GiB | 3.88 B | ROCm | 99 | 0 | tg128 | 56.37 ± 3.40 | +| gemma3 4B Q3_K - Small | 1.80 GiB | 3.88 B | ROCm | 99 | 0 | pp512 | 2011.51 ± 6.91 | +| gemma3 4B Q3_K - Small | 1.80 GiB | 3.88 B | ROCm | 99 | 0 | tg128 | 79.65 ± 0.03 | -build: de219279 (6181) +build: f1fbffb5 (6486) diff --git a/benchmark/results/gemma-3-4b-it-Q3_K_S__rocm7_rc-rocwmma__hblt0__fa1.log b/benchmark/results/gemma-3-4b-it-Q3_K_S__rocm7_rc-rocwmma__hblt0__fa1.log index 1af2b23..e572ab1 100644 --- a/benchmark/results/gemma-3-4b-it-Q3_K_S__rocm7_rc-rocwmma__hblt0__fa1.log +++ b/benchmark/results/gemma-3-4b-it-Q3_K_S__rocm7_rc-rocwmma__hblt0__fa1.log @@ -4,7 +4,7 @@ ggml_cuda_init: found 1 ROCm devices: Device 0: AMD Radeon Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32 | model | size | params | backend | ngl | fa | mmap | test | t/s | | ------------------------------ | ---------: | ---------: | ---------- | --: | -: | ---: | --------------: | -------------------: | -| gemma3 4B Q3_K - Small | 1.80 GiB | 3.88 B | ROCm | 99 | 1 | 0 | pp512 | 2096.22 ± 4.59 | -| gemma3 4B Q3_K - Small | 1.80 GiB | 3.88 B | ROCm | 99 | 1 | 0 | tg128 | 64.88 ± 0.05 | +| gemma3 4B Q3_K - Small | 1.80 GiB | 3.88 B | ROCm | 99 | 1 | 0 | pp512 | 2111.65 ± 7.03 | +| gemma3 4B Q3_K - Small | 1.80 GiB | 3.88 B | ROCm | 99 | 1 | 0 | tg128 | 67.62 ± 4.71 | -build: de219279 (6181) +build: f1fbffb5 (6486) diff --git a/benchmark/results/gemma-3-4b-it-Q3_K_S__rocm7_rc.log b/benchmark/results/gemma-3-4b-it-Q3_K_S__rocm7_rc.log index 7e86d5a..362a77a 100644 --- a/benchmark/results/gemma-3-4b-it-Q3_K_S__rocm7_rc.log +++ b/benchmark/results/gemma-3-4b-it-Q3_K_S__rocm7_rc.log @@ -4,7 +4,7 @@ ggml_cuda_init: found 1 ROCm devices: Device 0: AMD Radeon Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32 | model | size | params | backend | ngl | mmap | test | t/s | | ------------------------------ | ---------: | ---------: | ---------- | --: | ---: | --------------: | -------------------: | -| gemma3 4B Q3_K - Small | 1.80 GiB | 3.88 B | ROCm | 99 | 0 | pp512 | 2027.41 ± 4.62 | -| gemma3 4B Q3_K - Small | 1.80 GiB | 3.88 B | ROCm | 99 | 0 | tg128 | 77.12 ± 0.03 | +| gemma3 4B Q3_K - Small | 1.80 GiB | 3.88 B | ROCm | 99 | 0 | pp512 | 1869.83 ± 5.67 | +| gemma3 4B Q3_K - Small | 1.80 GiB | 3.88 B | ROCm | 99 | 0 | tg128 | 79.48 ± 0.04 | -build: de219279 (6181) +build: f1fbffb5 (6486) diff --git a/benchmark/results/gemma-3-4b-it-Q3_K_S__rocm7_rc__fa1.log b/benchmark/results/gemma-3-4b-it-Q3_K_S__rocm7_rc__fa1.log index a3497bf..39df255 100644 --- a/benchmark/results/gemma-3-4b-it-Q3_K_S__rocm7_rc__fa1.log +++ b/benchmark/results/gemma-3-4b-it-Q3_K_S__rocm7_rc__fa1.log @@ -4,7 +4,7 @@ ggml_cuda_init: found 1 ROCm devices: Device 0: AMD Radeon Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32 | model | size | params | backend | ngl | fa | mmap | test | t/s | | ------------------------------ | ---------: | ---------: | ---------- | --: | -: | ---: | --------------: | -------------------: | -| gemma3 4B Q3_K - Small | 1.80 GiB | 3.88 B | ROCm | 99 | 1 | 0 | pp512 | 1550.55 ± 4.52 | -| gemma3 4B Q3_K - Small | 1.80 GiB | 3.88 B | ROCm | 99 | 1 | 0 | tg128 | 70.54 ± 0.06 | +| gemma3 4B Q3_K - Small | 1.80 GiB | 3.88 B | ROCm | 99 | 1 | 0 | pp512 | 2229.43 ± 7.33 | +| gemma3 4B Q3_K - Small | 1.80 GiB | 3.88 B | ROCm | 99 | 1 | 0 | tg128 | 72.58 ± 0.03 | -build: de219279 (6181) +build: f1fbffb5 (6486) diff --git a/benchmark/results/gemma-3-4b-it-Q3_K_S__rocm7_rc__hblt0.log b/benchmark/results/gemma-3-4b-it-Q3_K_S__rocm7_rc__hblt0.log index 51f19f4..53225e6 100644 --- a/benchmark/results/gemma-3-4b-it-Q3_K_S__rocm7_rc__hblt0.log +++ b/benchmark/results/gemma-3-4b-it-Q3_K_S__rocm7_rc__hblt0.log @@ -4,7 +4,7 @@ ggml_cuda_init: found 1 ROCm devices: Device 0: AMD Radeon Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32 | model | size | params | backend | ngl | mmap | test | t/s | | ------------------------------ | ---------: | ---------: | ---------- | --: | ---: | --------------: | -------------------: | -| gemma3 4B Q3_K - Small | 1.80 GiB | 3.88 B | ROCm | 99 | 0 | pp512 | 1992.48 ± 7.34 | -| gemma3 4B Q3_K - Small | 1.80 GiB | 3.88 B | ROCm | 99 | 0 | tg128 | 77.05 ± 0.03 | +| gemma3 4B Q3_K - Small | 1.80 GiB | 3.88 B | ROCm | 99 | 0 | pp512 | 2014.48 ± 4.39 | +| gemma3 4B Q3_K - Small | 1.80 GiB | 3.88 B | ROCm | 99 | 0 | tg128 | 79.61 ± 0.04 | -build: de219279 (6181) +build: f1fbffb5 (6486) diff --git a/benchmark/results/gemma-3-4b-it-Q3_K_S__rocm7_rc__hblt0__fa1.log b/benchmark/results/gemma-3-4b-it-Q3_K_S__rocm7_rc__hblt0__fa1.log index a65b575..d2c4064 100644 --- a/benchmark/results/gemma-3-4b-it-Q3_K_S__rocm7_rc__hblt0__fa1.log +++ b/benchmark/results/gemma-3-4b-it-Q3_K_S__rocm7_rc__hblt0__fa1.log @@ -4,7 +4,7 @@ ggml_cuda_init: found 1 ROCm devices: Device 0: AMD Radeon Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32 | model | size | params | backend | ngl | fa | mmap | test | t/s | | ------------------------------ | ---------: | ---------: | ---------- | --: | -: | ---: | --------------: | -------------------: | -| gemma3 4B Q3_K - Small | 1.80 GiB | 3.88 B | ROCm | 99 | 1 | 0 | pp512 | 1474.15 ± 1.44 | -| gemma3 4B Q3_K - Small | 1.80 GiB | 3.88 B | ROCm | 99 | 1 | 0 | tg128 | 70.44 ± 0.01 | +| gemma3 4B Q3_K - Small | 1.80 GiB | 3.88 B | ROCm | 99 | 1 | 0 | pp512 | 2064.91 ± 7.11 | +| gemma3 4B Q3_K - Small | 1.80 GiB | 3.88 B | ROCm | 99 | 1 | 0 | tg128 | 72.45 ± 0.03 | -build: de219279 (6181) +build: f1fbffb5 (6486) diff --git a/benchmark/results/gemma-3-4b-it-Q3_K_S__vulkan_amdvlk.log b/benchmark/results/gemma-3-4b-it-Q3_K_S__vulkan_amdvlk.log index 2624621..65403f9 100644 --- a/benchmark/results/gemma-3-4b-it-Q3_K_S__vulkan_amdvlk.log +++ b/benchmark/results/gemma-3-4b-it-Q3_K_S__vulkan_amdvlk.log @@ -2,7 +2,7 @@ ggml_vulkan: Found 1 Vulkan devices: ggml_vulkan: 0 = Radeon 8060S Graphics (AMD open-source driver) | uma: 1 | fp16: 1 | bf16: 0 | warp size: 64 | shared memory: 32768 | int dot: 1 | matrix cores: KHR_coopmat | model | size | params | backend | ngl | mmap | test | t/s | | ------------------------------ | ---------: | ---------: | ---------- | --: | ---: | --------------: | -------------------: | -| gemma3 4B Q3_K - Small | 1.80 GiB | 3.88 B | Vulkan | 99 | 0 | pp512 | 1593.62 ± 2.90 | -| gemma3 4B Q3_K - Small | 1.80 GiB | 3.88 B | Vulkan | 99 | 0 | tg128 | 85.26 ± 0.26 | +| gemma3 4B Q3_K - Small | 1.80 GiB | 3.88 B | Vulkan | 99 | 0 | pp512 | 1288.81 ± 206.13 | +| gemma3 4B Q3_K - Small | 1.80 GiB | 3.88 B | Vulkan | 99 | 0 | tg128 | 86.61 ± 1.74 | -build: 1fe00296 (6182) +build: f1fbffb5 (6486) diff --git a/benchmark/results/gemma-3-4b-it-Q3_K_S__vulkan_amdvlk__fa1.log b/benchmark/results/gemma-3-4b-it-Q3_K_S__vulkan_amdvlk__fa1.log index 20e82ce..14f5f51 100644 --- a/benchmark/results/gemma-3-4b-it-Q3_K_S__vulkan_amdvlk__fa1.log +++ b/benchmark/results/gemma-3-4b-it-Q3_K_S__vulkan_amdvlk__fa1.log @@ -2,7 +2,7 @@ ggml_vulkan: Found 1 Vulkan devices: ggml_vulkan: 0 = Radeon 8060S Graphics (AMD open-source driver) | uma: 1 | fp16: 1 | bf16: 0 | warp size: 64 | shared memory: 32768 | int dot: 1 | matrix cores: KHR_coopmat | model | size | params | backend | ngl | fa | mmap | test | t/s | | ------------------------------ | ---------: | ---------: | ---------- | --: | -: | ---: | --------------: | -------------------: | -| gemma3 4B Q3_K - Small | 1.80 GiB | 3.88 B | Vulkan | 99 | 1 | 0 | pp512 | 936.52 ± 2.35 | -| gemma3 4B Q3_K - Small | 1.80 GiB | 3.88 B | Vulkan | 99 | 1 | 0 | tg128 | 60.89 ± 0.10 | +| gemma3 4B Q3_K - Small | 1.80 GiB | 3.88 B | Vulkan | 99 | 1 | 0 | pp512 | 1149.64 ± 181.24 | +| gemma3 4B Q3_K - Small | 1.80 GiB | 3.88 B | Vulkan | 99 | 1 | 0 | tg128 | 85.50 ± 1.74 | -build: 1fe00296 (6182) +build: f1fbffb5 (6486) diff --git a/benchmark/results/gemma-3-4b-it-Q3_K_S__vulkan_radv.log b/benchmark/results/gemma-3-4b-it-Q3_K_S__vulkan_radv.log index 6ba35a8..799317c 100644 --- a/benchmark/results/gemma-3-4b-it-Q3_K_S__vulkan_radv.log +++ b/benchmark/results/gemma-3-4b-it-Q3_K_S__vulkan_radv.log @@ -2,7 +2,7 @@ ggml_vulkan: Found 1 Vulkan devices: ggml_vulkan: 0 = Radeon 8060S Graphics (RADV GFX1151) (radv) | uma: 1 | fp16: 1 | bf16: 0 | warp size: 64 | shared memory: 65536 | int dot: 1 | matrix cores: KHR_coopmat | model | size | params | backend | ngl | mmap | test | t/s | | ------------------------------ | ---------: | ---------: | ---------- | --: | ---: | --------------: | -------------------: | -| gemma3 4B Q3_K - Small | 1.80 GiB | 3.88 B | Vulkan | 99 | 0 | pp512 | 1515.05 ± 2.98 | -| gemma3 4B Q3_K - Small | 1.80 GiB | 3.88 B | Vulkan | 99 | 0 | tg128 | 87.54 ± 0.18 | +| gemma3 4B Q3_K - Small | 1.80 GiB | 3.88 B | Vulkan | 99 | 0 | pp512 | 967.51 ± 123.30 | +| gemma3 4B Q3_K - Small | 1.80 GiB | 3.88 B | Vulkan | 99 | 0 | tg128 | 86.74 ± 1.45 | -build: 1fe00296 (6182) +build: f1fbffb5 (6486) diff --git a/benchmark/results/gemma-3-4b-it-Q3_K_S__vulkan_radv__fa1.log b/benchmark/results/gemma-3-4b-it-Q3_K_S__vulkan_radv__fa1.log index a1a86a0..513be95 100644 --- a/benchmark/results/gemma-3-4b-it-Q3_K_S__vulkan_radv__fa1.log +++ b/benchmark/results/gemma-3-4b-it-Q3_K_S__vulkan_radv__fa1.log @@ -2,7 +2,7 @@ ggml_vulkan: Found 1 Vulkan devices: ggml_vulkan: 0 = Radeon 8060S Graphics (RADV GFX1151) (radv) | uma: 1 | fp16: 1 | bf16: 0 | warp size: 64 | shared memory: 65536 | int dot: 1 | matrix cores: KHR_coopmat | model | size | params | backend | ngl | fa | mmap | test | t/s | | ------------------------------ | ---------: | ---------: | ---------- | --: | -: | ---: | --------------: | -------------------: | -| gemma3 4B Q3_K - Small | 1.80 GiB | 3.88 B | Vulkan | 99 | 1 | 0 | pp512 | 1476.16 ± 5.12 | -| gemma3 4B Q3_K - Small | 1.80 GiB | 3.88 B | Vulkan | 99 | 1 | 0 | tg128 | 82.48 ± 0.36 | +| gemma3 4B Q3_K - Small | 1.80 GiB | 3.88 B | Vulkan | 99 | 1 | 0 | pp512 | 991.94 ± 120.98 | +| gemma3 4B Q3_K - Small | 1.80 GiB | 3.88 B | Vulkan | 99 | 1 | 0 | tg128 | 85.61 ± 0.84 | -build: 1fe00296 (6182) +build: f1fbffb5 (6486) diff --git a/benchmark/results/gpt-oss-120b-F16__rocm6_4_3-rocwmma.log b/benchmark/results/gpt-oss-120b-F16__rocm6_4_3-rocwmma.log index 2b2057d..9e4b08e 100644 --- a/benchmark/results/gpt-oss-120b-F16__rocm6_4_3-rocwmma.log +++ b/benchmark/results/gpt-oss-120b-F16__rocm6_4_3-rocwmma.log @@ -2,14 +2,14 @@ ggml_cuda_init: GGML_CUDA_FORCE_MMQ: no ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no ggml_cuda_init: found 1 ROCm devices: Device 0: Radeon 8060S Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32 -rocBLAS error: No hipBLASLt solution found +hipBLASLt error: Heuristic Fetch Failed! This message will be only be displayed once, unless the ROCBLAS_VERBOSE_HIPBLASLT_ERROR environment variable is set. rocBLAS warning: hipBlasLT failed, falling back to tensile. This message will be only be displayed once, unless the ROCBLAS_VERBOSE_TENSILE_ERROR environment variable is set. | model | size | params | backend | ngl | mmap | test | t/s | | ------------------------------ | ---------: | ---------: | ---------- | --: | ---: | --------------: | -------------------: | -| gpt-oss ?B F16 | 60.87 GiB | 116.83 B | ROCm | 99 | 0 | pp512 | 622.16 ± 6.71 | -| gpt-oss ?B F16 | 60.87 GiB | 116.83 B | ROCm | 99 | 0 | tg128 | 33.91 ± 0.01 | +| gpt-oss 120B F16 | 60.87 GiB | 116.83 B | ROCm | 99 | 0 | pp512 | 629.19 ± 3.98 | +| gpt-oss 120B F16 | 60.87 GiB | 116.83 B | ROCm | 99 | 0 | tg128 | 34.79 ± 0.01 | -build: 1fe00296 (6182) +build: f1fbffb5 (6486) diff --git a/benchmark/results/gpt-oss-120b-F16__rocm6_4_3-rocwmma__fa1.log b/benchmark/results/gpt-oss-120b-F16__rocm6_4_3-rocwmma__fa1.log index a8e0637..5fe051d 100644 --- a/benchmark/results/gpt-oss-120b-F16__rocm6_4_3-rocwmma__fa1.log +++ b/benchmark/results/gpt-oss-120b-F16__rocm6_4_3-rocwmma__fa1.log @@ -4,7 +4,7 @@ ggml_cuda_init: found 1 ROCm devices: Device 0: Radeon 8060S Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32 | model | size | params | backend | ngl | fa | mmap | test | t/s | | ------------------------------ | ---------: | ---------: | ---------- | --: | -: | ---: | --------------: | -------------------: | -| gpt-oss ?B F16 | 60.87 GiB | 116.83 B | ROCm | 99 | 1 | 0 | pp512 | 743.09 ± 4.89 | -| gpt-oss ?B F16 | 60.87 GiB | 116.83 B | ROCm | 99 | 1 | 0 | tg128 | 33.76 ± 0.00 | +| gpt-oss 120B F16 | 60.87 GiB | 116.83 B | ROCm | 99 | 1 | 0 | pp512 | 780.88 ± 9.39 | +| gpt-oss 120B F16 | 60.87 GiB | 116.83 B | ROCm | 99 | 1 | 0 | tg128 | 34.14 ± 0.00 | -build: 1fe00296 (6182) +build: f1fbffb5 (6486) diff --git a/benchmark/results/gpt-oss-120b-F16__rocm6_4_3-rocwmma__hblt0.log b/benchmark/results/gpt-oss-120b-F16__rocm6_4_3-rocwmma__hblt0.log index d94bfc6..8a0d184 100644 --- a/benchmark/results/gpt-oss-120b-F16__rocm6_4_3-rocwmma__hblt0.log +++ b/benchmark/results/gpt-oss-120b-F16__rocm6_4_3-rocwmma__hblt0.log @@ -4,7 +4,7 @@ ggml_cuda_init: found 1 ROCm devices: Device 0: Radeon 8060S Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32 | model | size | params | backend | ngl | mmap | test | t/s | | ------------------------------ | ---------: | ---------: | ---------- | --: | ---: | --------------: | -------------------: | -| gpt-oss ?B F16 | 60.87 GiB | 116.83 B | ROCm | 99 | 0 | pp512 | 354.98 ± 0.72 | -| gpt-oss ?B F16 | 60.87 GiB | 116.83 B | ROCm | 99 | 0 | tg128 | 33.86 ± 0.00 | +| gpt-oss 120B F16 | 60.87 GiB | 116.83 B | ROCm | 99 | 0 | pp512 | 364.08 ± 1.11 | +| gpt-oss 120B F16 | 60.87 GiB | 116.83 B | ROCm | 99 | 0 | tg128 | 34.81 ± 0.01 | -build: 1fe00296 (6182) +build: f1fbffb5 (6486) diff --git a/benchmark/results/gpt-oss-120b-F16__rocm6_4_3-rocwmma__hblt0__fa1.log b/benchmark/results/gpt-oss-120b-F16__rocm6_4_3-rocwmma__hblt0__fa1.log index 95ce008..aaf527f 100644 --- a/benchmark/results/gpt-oss-120b-F16__rocm6_4_3-rocwmma__hblt0__fa1.log +++ b/benchmark/results/gpt-oss-120b-F16__rocm6_4_3-rocwmma__hblt0__fa1.log @@ -4,7 +4,7 @@ ggml_cuda_init: found 1 ROCm devices: Device 0: Radeon 8060S Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32 | model | size | params | backend | ngl | fa | mmap | test | t/s | | ------------------------------ | ---------: | ---------: | ---------- | --: | -: | ---: | --------------: | -------------------: | -| gpt-oss ?B F16 | 60.87 GiB | 116.83 B | ROCm | 99 | 1 | 0 | pp512 | 390.67 ± 0.97 | -| gpt-oss ?B F16 | 60.87 GiB | 116.83 B | ROCm | 99 | 1 | 0 | tg128 | 33.79 ± 0.01 | +| gpt-oss 120B F16 | 60.87 GiB | 116.83 B | ROCm | 99 | 1 | 0 | pp512 | 400.84 ± 0.84 | +| gpt-oss 120B F16 | 60.87 GiB | 116.83 B | ROCm | 99 | 1 | 0 | tg128 | 34.17 ± 0.00 | -build: 1fe00296 (6182) +build: f1fbffb5 (6486) diff --git a/benchmark/results/gpt-oss-120b-F16__rocm6_4_3.log b/benchmark/results/gpt-oss-120b-F16__rocm6_4_3.log index 320bdde..41af850 100644 --- a/benchmark/results/gpt-oss-120b-F16__rocm6_4_3.log +++ b/benchmark/results/gpt-oss-120b-F16__rocm6_4_3.log @@ -9,7 +9,7 @@ rocBLAS warning: hipBlasLT failed, falling back to tensile. This message will be only be displayed once, unless the ROCBLAS_VERBOSE_TENSILE_ERROR environment variable is set. | model | size | params | backend | ngl | mmap | test | t/s | | ------------------------------ | ---------: | ---------: | ---------- | --: | ---: | --------------: | -------------------: | -| gpt-oss ?B F16 | 60.87 GiB | 116.83 B | ROCm | 99 | 0 | pp512 | 617.00 ± 4.97 | -| gpt-oss ?B F16 | 60.87 GiB | 116.83 B | ROCm | 99 | 0 | tg128 | 33.90 ± 0.00 | +| gpt-oss 120B F16 | 60.87 GiB | 116.83 B | ROCm | 99 | 0 | pp512 | 627.57 ± 4.14 | +| gpt-oss 120B F16 | 60.87 GiB | 116.83 B | ROCm | 99 | 0 | tg128 | 34.78 ± 0.00 | -build: 1fe00296 (6182) +build: f1fbffb5 (6486) diff --git a/benchmark/results/gpt-oss-120b-F16__rocm6_4_3__fa1.log b/benchmark/results/gpt-oss-120b-F16__rocm6_4_3__fa1.log index aaf9547..f097e15 100644 --- a/benchmark/results/gpt-oss-120b-F16__rocm6_4_3__fa1.log +++ b/benchmark/results/gpt-oss-120b-F16__rocm6_4_3__fa1.log @@ -4,7 +4,7 @@ ggml_cuda_init: found 1 ROCm devices: Device 0: Radeon 8060S Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32 | model | size | params | backend | ngl | fa | mmap | test | t/s | | ------------------------------ | ---------: | ---------: | ---------- | --: | -: | ---: | --------------: | -------------------: | -| gpt-oss ?B F16 | 60.87 GiB | 116.83 B | ROCm | 99 | 1 | 0 | pp512 | 543.39 ± 5.51 | -| gpt-oss ?B F16 | 60.87 GiB | 116.83 B | ROCm | 99 | 1 | 0 | tg128 | 33.28 ± 0.00 | +| gpt-oss 120B F16 | 60.87 GiB | 116.83 B | ROCm | 99 | 1 | 0 | pp512 | 783.40 ± 1.22 | +| gpt-oss 120B F16 | 60.87 GiB | 116.83 B | ROCm | 99 | 1 | 0 | tg128 | 34.15 ± 0.00 | -build: 1fe00296 (6182) +build: f1fbffb5 (6486) diff --git a/benchmark/results/gpt-oss-120b-F16__rocm6_4_3__hblt0.log b/benchmark/results/gpt-oss-120b-F16__rocm6_4_3__hblt0.log index 717bdb1..7346088 100644 --- a/benchmark/results/gpt-oss-120b-F16__rocm6_4_3__hblt0.log +++ b/benchmark/results/gpt-oss-120b-F16__rocm6_4_3__hblt0.log @@ -2,9 +2,5 @@ ggml_cuda_init: GGML_CUDA_FORCE_MMQ: no ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no ggml_cuda_init: found 1 ROCm devices: Device 0: Radeon 8060S Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32 -| model | size | params | backend | ngl | mmap | test | t/s | -| ------------------------------ | ---------: | ---------: | ---------- | --: | ---: | --------------: | -------------------: | -| gpt-oss ?B F16 | 60.87 GiB | 116.83 B | ROCm | 99 | 0 | pp512 | 354.18 ± 0.29 | -| gpt-oss ?B F16 | 60.87 GiB | 116.83 B | ROCm | 99 | 0 | tg128 | 33.88 ± 0.00 | - -build: 1fe00296 (6182) +HW Exception by GPU node-1 (Agent handle: 0x37f5d310) reason :GPU Hang +✖ ! [rocm6_4_3] gpt-oss-120b-F16 __hblt0 failed (exit 134) diff --git a/benchmark/results/gpt-oss-120b-F16__rocm6_4_3__hblt0__fa1.log b/benchmark/results/gpt-oss-120b-F16__rocm6_4_3__hblt0__fa1.log index a328319..8eda0fb 100644 --- a/benchmark/results/gpt-oss-120b-F16__rocm6_4_3__hblt0__fa1.log +++ b/benchmark/results/gpt-oss-120b-F16__rocm6_4_3__hblt0__fa1.log @@ -4,7 +4,7 @@ ggml_cuda_init: found 1 ROCm devices: Device 0: Radeon 8060S Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32 | model | size | params | backend | ngl | fa | mmap | test | t/s | | ------------------------------ | ---------: | ---------: | ---------- | --: | -: | ---: | --------------: | -------------------: | -| gpt-oss ?B F16 | 60.87 GiB | 116.83 B | ROCm | 99 | 1 | 0 | pp512 | 322.46 ± 0.46 | -| gpt-oss ?B F16 | 60.87 GiB | 116.83 B | ROCm | 99 | 1 | 0 | tg128 | 33.33 ± 0.00 | +| gpt-oss 120B F16 | 60.87 GiB | 116.83 B | ROCm | 99 | 1 | 0 | pp512 | 402.16 ± 1.31 | +| gpt-oss 120B F16 | 60.87 GiB | 116.83 B | ROCm | 99 | 1 | 0 | tg128 | 34.16 ± 0.02 | -build: 1fe00296 (6182) +build: f1fbffb5 (6486) diff --git a/benchmark/results/gpt-oss-120b-F16__rocm7_rc-rocwmma-fa_all_quants.log b/benchmark/results/gpt-oss-120b-F16__rocm7_rc-rocwmma-fa_all_quants.log new file mode 100644 index 0000000..c69bc04 --- /dev/null +++ b/benchmark/results/gpt-oss-120b-F16__rocm7_rc-rocwmma-fa_all_quants.log @@ -0,0 +1,10 @@ +ggml_cuda_init: GGML_CUDA_FORCE_MMQ: no +ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no +ggml_cuda_init: found 1 ROCm devices: + Device 0: AMD Radeon Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32 +| model | size | params | backend | ngl | mmap | test | t/s | +| ------------------------------ | ---------: | ---------: | ---------- | --: | ---: | --------------: | -------------------: | +| gpt-oss 120B F16 | 60.87 GiB | 116.83 B | ROCm | 99 | 0 | pp512 | 650.02 ± 4.28 | +| gpt-oss 120B F16 | 60.87 GiB | 116.83 B | ROCm | 99 | 0 | tg128 | 34.00 ± 0.00 | + +build: f1fbffb5 (6486) diff --git a/benchmark/results/gpt-oss-120b-F16__rocm7_rc-rocwmma-fa_all_quants__fa1.log b/benchmark/results/gpt-oss-120b-F16__rocm7_rc-rocwmma-fa_all_quants__fa1.log new file mode 100644 index 0000000..816395b --- /dev/null +++ b/benchmark/results/gpt-oss-120b-F16__rocm7_rc-rocwmma-fa_all_quants__fa1.log @@ -0,0 +1,10 @@ +ggml_cuda_init: GGML_CUDA_FORCE_MMQ: no +ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no +ggml_cuda_init: found 1 ROCm devices: + Device 0: AMD Radeon Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32 +| model | size | params | backend | ngl | fa | mmap | test | t/s | +| ------------------------------ | ---------: | ---------: | ---------- | --: | -: | ---: | --------------: | -------------------: | +| gpt-oss 120B F16 | 60.87 GiB | 116.83 B | ROCm | 99 | 1 | 0 | pp512 | 778.25 ± 3.40 | +| gpt-oss 120B F16 | 60.87 GiB | 116.83 B | ROCm | 99 | 1 | 0 | tg128 | 34.25 ± 0.00 | + +build: f1fbffb5 (6486) diff --git a/benchmark/results/gpt-oss-120b-F16__rocm7_rc-rocwmma-fa_all_quants__hblt0.log b/benchmark/results/gpt-oss-120b-F16__rocm7_rc-rocwmma-fa_all_quants__hblt0.log new file mode 100644 index 0000000..3948df0 --- /dev/null +++ b/benchmark/results/gpt-oss-120b-F16__rocm7_rc-rocwmma-fa_all_quants__hblt0.log @@ -0,0 +1,10 @@ +ggml_cuda_init: GGML_CUDA_FORCE_MMQ: no +ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no +ggml_cuda_init: found 1 ROCm devices: + Device 0: AMD Radeon Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32 +| model | size | params | backend | ngl | mmap | test | t/s | +| ------------------------------ | ---------: | ---------: | ---------- | --: | ---: | --------------: | -------------------: | +| gpt-oss 120B F16 | 60.87 GiB | 116.83 B | ROCm | 99 | 0 | pp512 | 688.70 ± 7.72 | +| gpt-oss 120B F16 | 60.87 GiB | 116.83 B | ROCm | 99 | 0 | tg128 | 34.83 ± 0.00 | + +build: f1fbffb5 (6486) diff --git a/benchmark/results/gpt-oss-120b-F16__rocm7_rc-rocwmma-fa_all_quants__hblt0__fa1.log b/benchmark/results/gpt-oss-120b-F16__rocm7_rc-rocwmma-fa_all_quants__hblt0__fa1.log new file mode 100644 index 0000000..9ccc2fb --- /dev/null +++ b/benchmark/results/gpt-oss-120b-F16__rocm7_rc-rocwmma-fa_all_quants__hblt0__fa1.log @@ -0,0 +1,10 @@ +ggml_cuda_init: GGML_CUDA_FORCE_MMQ: no +ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no +ggml_cuda_init: found 1 ROCm devices: + Device 0: AMD Radeon Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32 +| model | size | params | backend | ngl | fa | mmap | test | t/s | +| ------------------------------ | ---------: | ---------: | ---------- | --: | -: | ---: | --------------: | -------------------: | +| gpt-oss 120B F16 | 60.87 GiB | 116.83 B | ROCm | 99 | 1 | 0 | pp512 | 780.39 ± 6.28 | +| gpt-oss 120B F16 | 60.87 GiB | 116.83 B | ROCm | 99 | 1 | 0 | tg128 | 34.20 ± 0.00 | + +build: f1fbffb5 (6486) diff --git a/benchmark/results/gpt-oss-120b-F16__rocm7_rc-rocwmma.log b/benchmark/results/gpt-oss-120b-F16__rocm7_rc-rocwmma.log index bfcbd06..f7645fb 100644 --- a/benchmark/results/gpt-oss-120b-F16__rocm7_rc-rocwmma.log +++ b/benchmark/results/gpt-oss-120b-F16__rocm7_rc-rocwmma.log @@ -4,7 +4,7 @@ ggml_cuda_init: found 1 ROCm devices: Device 0: AMD Radeon Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32 | model | size | params | backend | ngl | mmap | test | t/s | | ------------------------------ | ---------: | ---------: | ---------- | --: | ---: | --------------: | -------------------: | -| gpt-oss ?B F16 | 60.87 GiB | 116.83 B | ROCm | 99 | 0 | pp512 | 643.61 ± 7.14 | -| gpt-oss ?B F16 | 60.87 GiB | 116.83 B | ROCm | 99 | 0 | tg128 | 33.91 ± 0.00 | +| gpt-oss 120B F16 | 60.87 GiB | 116.83 B | ROCm | 99 | 0 | pp512 | 653.89 ± 3.96 | +| gpt-oss 120B F16 | 60.87 GiB | 116.83 B | ROCm | 99 | 0 | tg128 | 34.89 ± 0.00 | -build: de219279 (6181) +build: f1fbffb5 (6486) diff --git a/benchmark/results/gpt-oss-120b-F16__rocm7_rc-rocwmma__fa1.log b/benchmark/results/gpt-oss-120b-F16__rocm7_rc-rocwmma__fa1.log index 9e50477..c286eb1 100644 --- a/benchmark/results/gpt-oss-120b-F16__rocm7_rc-rocwmma__fa1.log +++ b/benchmark/results/gpt-oss-120b-F16__rocm7_rc-rocwmma__fa1.log @@ -4,7 +4,7 @@ ggml_cuda_init: found 1 ROCm devices: Device 0: AMD Radeon Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32 | model | size | params | backend | ngl | fa | mmap | test | t/s | | ------------------------------ | ---------: | ---------: | ---------- | --: | -: | ---: | --------------: | -------------------: | -| gpt-oss ?B F16 | 60.87 GiB | 116.83 B | ROCm | 99 | 1 | 0 | pp512 | 736.33 ± 3.33 | -| gpt-oss ?B F16 | 60.87 GiB | 116.83 B | ROCm | 99 | 1 | 0 | tg128 | 33.74 ± 0.00 | +| gpt-oss 120B F16 | 60.87 GiB | 116.83 B | ROCm | 99 | 1 | 0 | pp512 | 770.19 ± 5.64 | +| gpt-oss 120B F16 | 60.87 GiB | 116.83 B | ROCm | 99 | 1 | 0 | tg128 | 34.18 ± 0.01 | -build: de219279 (6181) +build: f1fbffb5 (6486) diff --git a/benchmark/results/gpt-oss-120b-F16__rocm7_rc-rocwmma__hblt0.log b/benchmark/results/gpt-oss-120b-F16__rocm7_rc-rocwmma__hblt0.log index a3eb53e..ab867d3 100644 --- a/benchmark/results/gpt-oss-120b-F16__rocm7_rc-rocwmma__hblt0.log +++ b/benchmark/results/gpt-oss-120b-F16__rocm7_rc-rocwmma__hblt0.log @@ -4,7 +4,7 @@ ggml_cuda_init: found 1 ROCm devices: Device 0: AMD Radeon Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32 | model | size | params | backend | ngl | mmap | test | t/s | | ------------------------------ | ---------: | ---------: | ---------- | --: | ---: | --------------: | -------------------: | -| gpt-oss ?B F16 | 60.87 GiB | 116.83 B | ROCm | 99 | 0 | pp512 | 651.63 ± 3.08 | -| gpt-oss ?B F16 | 60.87 GiB | 116.83 B | ROCm | 99 | 0 | tg128 | 33.88 ± 0.00 | +| gpt-oss 120B F16 | 60.87 GiB | 116.83 B | ROCm | 99 | 0 | pp512 | 691.27 ± 4.90 | +| gpt-oss 120B F16 | 60.87 GiB | 116.83 B | ROCm | 99 | 0 | tg128 | 34.83 ± 0.00 | -build: de219279 (6181) +build: f1fbffb5 (6486) diff --git a/benchmark/results/gpt-oss-120b-F16__rocm7_rc-rocwmma__hblt0__fa1.log b/benchmark/results/gpt-oss-120b-F16__rocm7_rc-rocwmma__hblt0__fa1.log index 422fcf2..07e6796 100644 --- a/benchmark/results/gpt-oss-120b-F16__rocm7_rc-rocwmma__hblt0__fa1.log +++ b/benchmark/results/gpt-oss-120b-F16__rocm7_rc-rocwmma__hblt0__fa1.log @@ -4,7 +4,7 @@ ggml_cuda_init: found 1 ROCm devices: Device 0: AMD Radeon Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32 | model | size | params | backend | ngl | fa | mmap | test | t/s | | ------------------------------ | ---------: | ---------: | ---------- | --: | -: | ---: | --------------: | -------------------: | -| gpt-oss ?B F16 | 60.87 GiB | 116.83 B | ROCm | 99 | 1 | 0 | pp512 | 738.84 ± 9.12 | -| gpt-oss ?B F16 | 60.87 GiB | 116.83 B | ROCm | 99 | 1 | 0 | tg128 | 33.79 ± 0.00 | +| gpt-oss 120B F16 | 60.87 GiB | 116.83 B | ROCm | 99 | 1 | 0 | pp512 | 772.44 ± 6.68 | +| gpt-oss 120B F16 | 60.87 GiB | 116.83 B | ROCm | 99 | 1 | 0 | tg128 | 34.24 ± 0.00 | -build: de219279 (6181) +build: f1fbffb5 (6486) diff --git a/benchmark/results/gpt-oss-120b-F16__rocm7_rc.log b/benchmark/results/gpt-oss-120b-F16__rocm7_rc.log index 8fc2f66..34bd11a 100644 --- a/benchmark/results/gpt-oss-120b-F16__rocm7_rc.log +++ b/benchmark/results/gpt-oss-120b-F16__rocm7_rc.log @@ -4,7 +4,7 @@ ggml_cuda_init: found 1 ROCm devices: Device 0: AMD Radeon Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32 | model | size | params | backend | ngl | mmap | test | t/s | | ------------------------------ | ---------: | ---------: | ---------- | --: | ---: | --------------: | -------------------: | -| gpt-oss ?B F16 | 60.87 GiB | 116.83 B | ROCm | 99 | 0 | pp512 | 649.28 ± 0.87 | -| gpt-oss ?B F16 | 60.87 GiB | 116.83 B | ROCm | 99 | 0 | tg128 | 33.99 ± 0.00 | +| gpt-oss 120B F16 | 60.87 GiB | 116.83 B | ROCm | 99 | 0 | pp512 | 653.09 ± 7.25 | +| gpt-oss 120B F16 | 60.87 GiB | 116.83 B | ROCm | 99 | 0 | tg128 | 34.86 ± 0.00 | -build: de219279 (6181) +build: f1fbffb5 (6486) diff --git a/benchmark/results/gpt-oss-120b-F16__rocm7_rc__fa1.log b/benchmark/results/gpt-oss-120b-F16__rocm7_rc__fa1.log index 35770c3..384d751 100644 --- a/benchmark/results/gpt-oss-120b-F16__rocm7_rc__fa1.log +++ b/benchmark/results/gpt-oss-120b-F16__rocm7_rc__fa1.log @@ -4,7 +4,7 @@ ggml_cuda_init: found 1 ROCm devices: Device 0: AMD Radeon Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32 | model | size | params | backend | ngl | fa | mmap | test | t/s | | ------------------------------ | ---------: | ---------: | ---------- | --: | -: | ---: | --------------: | -------------------: | -| gpt-oss ?B F16 | 60.87 GiB | 116.83 B | ROCm | 99 | 1 | 0 | pp512 | 550.01 ± 3.85 | -| gpt-oss ?B F16 | 60.87 GiB | 116.83 B | ROCm | 99 | 1 | 0 | tg128 | 33.38 ± 0.00 | +| gpt-oss 120B F16 | 60.87 GiB | 116.83 B | ROCm | 99 | 1 | 0 | pp512 | 779.77 ± 4.78 | +| gpt-oss 120B F16 | 60.87 GiB | 116.83 B | ROCm | 99 | 1 | 0 | tg128 | 34.28 ± 0.00 | -build: de219279 (6181) +build: f1fbffb5 (6486) diff --git a/benchmark/results/gpt-oss-120b-F16__rocm7_rc__hblt0.log b/benchmark/results/gpt-oss-120b-F16__rocm7_rc__hblt0.log index 7ed9087..3cc22b3 100644 --- a/benchmark/results/gpt-oss-120b-F16__rocm7_rc__hblt0.log +++ b/benchmark/results/gpt-oss-120b-F16__rocm7_rc__hblt0.log @@ -4,7 +4,7 @@ ggml_cuda_init: found 1 ROCm devices: Device 0: AMD Radeon Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32 | model | size | params | backend | ngl | mmap | test | t/s | | ------------------------------ | ---------: | ---------: | ---------- | --: | ---: | --------------: | -------------------: | -| gpt-oss ?B F16 | 60.87 GiB | 116.83 B | ROCm | 99 | 0 | pp512 | 659.79 ± 3.13 | -| gpt-oss ?B F16 | 60.87 GiB | 116.83 B | ROCm | 99 | 0 | tg128 | 34.01 ± 0.00 | +| gpt-oss 120B F16 | 60.87 GiB | 116.83 B | ROCm | 99 | 0 | pp512 | 683.90 ± 5.36 | +| gpt-oss 120B F16 | 60.87 GiB | 116.83 B | ROCm | 99 | 0 | tg128 | 34.77 ± 0.00 | -build: de219279 (6181) +build: f1fbffb5 (6486) diff --git a/benchmark/results/gpt-oss-120b-F16__rocm7_rc__hblt0__fa1.log b/benchmark/results/gpt-oss-120b-F16__rocm7_rc__hblt0__fa1.log index af54d9f..1f2d84e 100644 --- a/benchmark/results/gpt-oss-120b-F16__rocm7_rc__hblt0__fa1.log +++ b/benchmark/results/gpt-oss-120b-F16__rocm7_rc__hblt0__fa1.log @@ -4,7 +4,7 @@ ggml_cuda_init: found 1 ROCm devices: Device 0: AMD Radeon Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32 | model | size | params | backend | ngl | fa | mmap | test | t/s | | ------------------------------ | ---------: | ---------: | ---------- | --: | -: | ---: | --------------: | -------------------: | -| gpt-oss ?B F16 | 60.87 GiB | 116.83 B | ROCm | 99 | 1 | 0 | pp512 | 553.65 ± 2.40 | -| gpt-oss ?B F16 | 60.87 GiB | 116.83 B | ROCm | 99 | 1 | 0 | tg128 | 33.31 ± 0.00 | +| gpt-oss 120B F16 | 60.87 GiB | 116.83 B | ROCm | 99 | 1 | 0 | pp512 | 777.37 ± 4.77 | +| gpt-oss 120B F16 | 60.87 GiB | 116.83 B | ROCm | 99 | 1 | 0 | tg128 | 34.17 ± 0.00 | -build: de219279 (6181) +build: f1fbffb5 (6486) diff --git a/benchmark/results/gpt-oss-120b-F16__vulkan_amdvlk.log b/benchmark/results/gpt-oss-120b-F16__vulkan_amdvlk.log index 9f2e80d..d68fc95 100644 --- a/benchmark/results/gpt-oss-120b-F16__vulkan_amdvlk.log +++ b/benchmark/results/gpt-oss-120b-F16__vulkan_amdvlk.log @@ -2,7 +2,7 @@ ggml_vulkan: Found 1 Vulkan devices: ggml_vulkan: 0 = Radeon 8060S Graphics (AMD open-source driver) | uma: 1 | fp16: 1 | bf16: 0 | warp size: 64 | shared memory: 32768 | int dot: 1 | matrix cores: KHR_coopmat | model | size | params | backend | ngl | mmap | test | t/s | | ------------------------------ | ---------: | ---------: | ---------- | --: | ---: | --------------: | -------------------: | -| gpt-oss ?B F16 | 60.87 GiB | 116.83 B | Vulkan | 99 | 0 | pp512 | 449.86 ± 1.68 | -| gpt-oss ?B F16 | 60.87 GiB | 116.83 B | Vulkan | 99 | 0 | tg128 | 34.19 ± 0.02 | +| gpt-oss 120B F16 | 60.87 GiB | 116.83 B | Vulkan | 99 | 0 | pp512 | 626.37 ± 2.34 | +| gpt-oss 120B F16 | 60.87 GiB | 116.83 B | Vulkan | 99 | 0 | tg128 | 35.23 ± 0.01 | -build: 1fe00296 (6182) +build: f1fbffb5 (6486) diff --git a/benchmark/results/gpt-oss-120b-F16__vulkan_amdvlk__fa1.log b/benchmark/results/gpt-oss-120b-F16__vulkan_amdvlk__fa1.log index 9fa4616..10c4fa1 100644 --- a/benchmark/results/gpt-oss-120b-F16__vulkan_amdvlk__fa1.log +++ b/benchmark/results/gpt-oss-120b-F16__vulkan_amdvlk__fa1.log @@ -2,7 +2,7 @@ ggml_vulkan: Found 1 Vulkan devices: ggml_vulkan: 0 = Radeon 8060S Graphics (AMD open-source driver) | uma: 1 | fp16: 1 | bf16: 0 | warp size: 64 | shared memory: 32768 | int dot: 1 | matrix cores: KHR_coopmat | model | size | params | backend | ngl | fa | mmap | test | t/s | | ------------------------------ | ---------: | ---------: | ---------- | --: | -: | ---: | --------------: | -------------------: | -| gpt-oss ?B F16 | 60.87 GiB | 116.83 B | Vulkan | 99 | 1 | 0 | pp512 | 496.21 ± 1.71 | -| gpt-oss ?B F16 | 60.87 GiB | 116.83 B | Vulkan | 99 | 1 | 0 | tg128 | 33.64 ± 0.01 | +| gpt-oss 120B F16 | 60.87 GiB | 116.83 B | Vulkan | 99 | 1 | 0 | pp512 | 723.80 ± 2.49 | +| gpt-oss 120B F16 | 60.87 GiB | 116.83 B | Vulkan | 99 | 1 | 0 | tg128 | 34.59 ± 0.01 | -build: 1fe00296 (6182) +build: f1fbffb5 (6486) diff --git a/benchmark/results/gpt-oss-120b-F16__vulkan_radv.log b/benchmark/results/gpt-oss-120b-F16__vulkan_radv.log index 71ff53d..80e1e5e 100644 --- a/benchmark/results/gpt-oss-120b-F16__vulkan_radv.log +++ b/benchmark/results/gpt-oss-120b-F16__vulkan_radv.log @@ -2,7 +2,7 @@ ggml_vulkan: Found 1 Vulkan devices: ggml_vulkan: 0 = Radeon 8060S Graphics (RADV GFX1151) (radv) | uma: 1 | fp16: 1 | bf16: 0 | warp size: 64 | shared memory: 65536 | int dot: 1 | matrix cores: KHR_coopmat | model | size | params | backend | ngl | mmap | test | t/s | | ------------------------------ | ---------: | ---------: | ---------- | --: | ---: | --------------: | -------------------: | -| gpt-oss ?B F16 | 60.87 GiB | 116.83 B | Vulkan | 99 | 0 | pp512 | 230.09 ± 0.83 | -| gpt-oss ?B F16 | 60.87 GiB | 116.83 B | Vulkan | 99 | 0 | tg128 | 33.57 ± 0.02 | +| gpt-oss 120B F16 | 60.87 GiB | 116.83 B | Vulkan | 99 | 0 | pp512 | 401.61 ± 1.76 | +| gpt-oss 120B F16 | 60.87 GiB | 116.83 B | Vulkan | 99 | 0 | tg128 | 33.86 ± 0.03 | -build: 1fe00296 (6182) +build: f1fbffb5 (6486) diff --git a/benchmark/results/gpt-oss-120b-F16__vulkan_radv__fa1.log b/benchmark/results/gpt-oss-120b-F16__vulkan_radv__fa1.log index 547d915..2c9a045 100644 --- a/benchmark/results/gpt-oss-120b-F16__vulkan_radv__fa1.log +++ b/benchmark/results/gpt-oss-120b-F16__vulkan_radv__fa1.log @@ -2,7 +2,7 @@ ggml_vulkan: Found 1 Vulkan devices: ggml_vulkan: 0 = Radeon 8060S Graphics (RADV GFX1151) (radv) | uma: 1 | fp16: 1 | bf16: 0 | warp size: 64 | shared memory: 65536 | int dot: 1 | matrix cores: KHR_coopmat | model | size | params | backend | ngl | fa | mmap | test | t/s | | ------------------------------ | ---------: | ---------: | ---------- | --: | -: | ---: | --------------: | -------------------: | -| gpt-oss ?B F16 | 60.87 GiB | 116.83 B | Vulkan | 99 | 1 | 0 | pp512 | 243.96 ± 0.96 | -| gpt-oss ?B F16 | 60.87 GiB | 116.83 B | Vulkan | 99 | 1 | 0 | tg128 | 33.79 ± 0.01 | +| gpt-oss 120B F16 | 60.87 GiB | 116.83 B | Vulkan | 99 | 1 | 0 | pp512 | 444.61 ± 1.65 | +| gpt-oss 120B F16 | 60.87 GiB | 116.83 B | Vulkan | 99 | 1 | 0 | tg128 | 33.84 ± 0.02 | -build: 1fe00296 (6182) +build: f1fbffb5 (6486) diff --git a/benchmark/results/gpt-oss-120b-mxfp4-00001-of-00003__rocm6_4_3-rocwmma.log b/benchmark/results/gpt-oss-120b-mxfp4-00001-of-00003__rocm6_4_3-rocwmma.log index f3ebd8d..3b2c5d3 100644 --- a/benchmark/results/gpt-oss-120b-mxfp4-00001-of-00003__rocm6_4_3-rocwmma.log +++ b/benchmark/results/gpt-oss-120b-mxfp4-00001-of-00003__rocm6_4_3-rocwmma.log @@ -9,7 +9,7 @@ rocBLAS warning: hipBlasLT failed, falling back to tensile. This message will be only be displayed once, unless the ROCBLAS_VERBOSE_TENSILE_ERROR environment variable is set. | model | size | params | backend | ngl | mmap | test | t/s | | ------------------------------ | ---------: | ---------: | ---------- | --: | ---: | --------------: | -------------------: | -| gpt-oss ?B MXFP4 MoE | 59.02 GiB | 116.83 B | ROCm | 99 | 0 | pp512 | 606.86 ± 5.18 | -| gpt-oss ?B MXFP4 MoE | 59.02 GiB | 116.83 B | ROCm | 99 | 0 | tg128 | 45.26 ± 0.02 | +| gpt-oss 120B MXFP4 MoE | 59.02 GiB | 116.83 B | ROCm | 99 | 0 | pp512 | 612.55 ± 6.58 | +| gpt-oss 120B MXFP4 MoE | 59.02 GiB | 116.83 B | ROCm | 99 | 0 | tg128 | 47.08 ± 0.01 | -build: 1fe00296 (6182) +build: f1fbffb5 (6486) diff --git a/benchmark/results/gpt-oss-120b-mxfp4-00001-of-00003__rocm6_4_3-rocwmma__fa1.log b/benchmark/results/gpt-oss-120b-mxfp4-00001-of-00003__rocm6_4_3-rocwmma__fa1.log index 78fca14..78f5bac 100644 --- a/benchmark/results/gpt-oss-120b-mxfp4-00001-of-00003__rocm6_4_3-rocwmma__fa1.log +++ b/benchmark/results/gpt-oss-120b-mxfp4-00001-of-00003__rocm6_4_3-rocwmma__fa1.log @@ -4,7 +4,7 @@ ggml_cuda_init: found 1 ROCm devices: Device 0: Radeon 8060S Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32 | model | size | params | backend | ngl | fa | mmap | test | t/s | | ------------------------------ | ---------: | ---------: | ---------- | --: | -: | ---: | --------------: | -------------------: | -| gpt-oss ?B MXFP4 MoE | 59.02 GiB | 116.83 B | ROCm | 99 | 1 | 0 | pp512 | 732.72 ± 4.06 | -| gpt-oss ?B MXFP4 MoE | 59.02 GiB | 116.83 B | ROCm | 99 | 1 | 0 | tg128 | 45.14 ± 0.01 | +| gpt-oss 120B MXFP4 MoE | 59.02 GiB | 116.83 B | ROCm | 99 | 1 | 0 | pp512 | 766.08 ± 2.67 | +| gpt-oss 120B MXFP4 MoE | 59.02 GiB | 116.83 B | ROCm | 99 | 1 | 0 | tg128 | 45.93 ± 0.01 | -build: 1fe00296 (6182) +build: f1fbffb5 (6486) diff --git a/benchmark/results/gpt-oss-120b-mxfp4-00001-of-00003__rocm6_4_3-rocwmma__hblt0.log b/benchmark/results/gpt-oss-120b-mxfp4-00001-of-00003__rocm6_4_3-rocwmma__hblt0.log index 0436056..6d219e6 100644 --- a/benchmark/results/gpt-oss-120b-mxfp4-00001-of-00003__rocm6_4_3-rocwmma__hblt0.log +++ b/benchmark/results/gpt-oss-120b-mxfp4-00001-of-00003__rocm6_4_3-rocwmma__hblt0.log @@ -4,7 +4,7 @@ ggml_cuda_init: found 1 ROCm devices: Device 0: Radeon 8060S Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32 | model | size | params | backend | ngl | mmap | test | t/s | | ------------------------------ | ---------: | ---------: | ---------- | --: | ---: | --------------: | -------------------: | -| gpt-oss ?B MXFP4 MoE | 59.02 GiB | 116.83 B | ROCm | 99 | 0 | pp512 | 351.42 ± 1.56 | -| gpt-oss ?B MXFP4 MoE | 59.02 GiB | 116.83 B | ROCm | 99 | 0 | tg128 | 45.39 ± 0.01 | +| gpt-oss 120B MXFP4 MoE | 59.02 GiB | 116.83 B | ROCm | 99 | 0 | pp512 | 362.01 ± 1.06 | +| gpt-oss 120B MXFP4 MoE | 59.02 GiB | 116.83 B | ROCm | 99 | 0 | tg128 | 47.04 ± 0.00 | -build: 1fe00296 (6182) +build: f1fbffb5 (6486) diff --git a/benchmark/results/gpt-oss-120b-mxfp4-00001-of-00003__rocm6_4_3-rocwmma__hblt0__fa1.log b/benchmark/results/gpt-oss-120b-mxfp4-00001-of-00003__rocm6_4_3-rocwmma__hblt0__fa1.log index 4fe67c3..0808d48 100644 --- a/benchmark/results/gpt-oss-120b-mxfp4-00001-of-00003__rocm6_4_3-rocwmma__hblt0__fa1.log +++ b/benchmark/results/gpt-oss-120b-mxfp4-00001-of-00003__rocm6_4_3-rocwmma__hblt0__fa1.log @@ -2,5 +2,5 @@ ggml_cuda_init: GGML_CUDA_FORCE_MMQ: no ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no ggml_cuda_init: found 1 ROCm devices: Device 0: Radeon 8060S Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32 -HW Exception by GPU node-1 (Agent handle: 0x3273c340) reason :GPU Hang +HW Exception by GPU node-1 (Agent handle: 0x32c91340) reason :GPU Hang ✖ ! [rocm6_4_3-rocwmma] gpt-oss-120b-mxfp4-00001-of-00003 __hblt0__fa1 failed (exit 134) diff --git a/benchmark/results/gpt-oss-120b-mxfp4-00001-of-00003__rocm6_4_3.log b/benchmark/results/gpt-oss-120b-mxfp4-00001-of-00003__rocm6_4_3.log index f5b3307..7506230 100644 --- a/benchmark/results/gpt-oss-120b-mxfp4-00001-of-00003__rocm6_4_3.log +++ b/benchmark/results/gpt-oss-120b-mxfp4-00001-of-00003__rocm6_4_3.log @@ -9,7 +9,7 @@ rocBLAS warning: hipBlasLT failed, falling back to tensile. This message will be only be displayed once, unless the ROCBLAS_VERBOSE_TENSILE_ERROR environment variable is set. | model | size | params | backend | ngl | mmap | test | t/s | | ------------------------------ | ---------: | ---------: | ---------- | --: | ---: | --------------: | -------------------: | -| gpt-oss ?B MXFP4 MoE | 59.02 GiB | 116.83 B | ROCm | 99 | 0 | pp512 | 608.20 ± 7.04 | -| gpt-oss ?B MXFP4 MoE | 59.02 GiB | 116.83 B | ROCm | 99 | 0 | tg128 | 45.40 ± 0.01 | +| gpt-oss 120B MXFP4 MoE | 59.02 GiB | 116.83 B | ROCm | 99 | 0 | pp512 | 614.68 ± 3.32 | +| gpt-oss 120B MXFP4 MoE | 59.02 GiB | 116.83 B | ROCm | 99 | 0 | tg128 | 47.04 ± 0.01 | -build: 1fe00296 (6182) +build: f1fbffb5 (6486) diff --git a/benchmark/results/gpt-oss-120b-mxfp4-00001-of-00003__rocm6_4_3__fa1.log b/benchmark/results/gpt-oss-120b-mxfp4-00001-of-00003__rocm6_4_3__fa1.log index 7a9d128..8093c5f 100644 --- a/benchmark/results/gpt-oss-120b-mxfp4-00001-of-00003__rocm6_4_3__fa1.log +++ b/benchmark/results/gpt-oss-120b-mxfp4-00001-of-00003__rocm6_4_3__fa1.log @@ -4,7 +4,7 @@ ggml_cuda_init: found 1 ROCm devices: Device 0: Radeon 8060S Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32 | model | size | params | backend | ngl | fa | mmap | test | t/s | | ------------------------------ | ---------: | ---------: | ---------- | --: | -: | ---: | --------------: | -------------------: | -| gpt-oss ?B MXFP4 MoE | 59.02 GiB | 116.83 B | ROCm | 99 | 1 | 0 | pp512 | 533.95 ± 3.58 | -| gpt-oss ?B MXFP4 MoE | 59.02 GiB | 116.83 B | ROCm | 99 | 1 | 0 | tg128 | 44.41 ± 0.03 | +| gpt-oss 120B MXFP4 MoE | 59.02 GiB | 116.83 B | ROCm | 99 | 1 | 0 | pp512 | 768.28 ± 5.81 | +| gpt-oss 120B MXFP4 MoE | 59.02 GiB | 116.83 B | ROCm | 99 | 1 | 0 | tg128 | 45.86 ± 0.00 | -build: 1fe00296 (6182) +build: f1fbffb5 (6486) diff --git a/benchmark/results/gpt-oss-120b-mxfp4-00001-of-00003__rocm6_4_3__hblt0.log b/benchmark/results/gpt-oss-120b-mxfp4-00001-of-00003__rocm6_4_3__hblt0.log index 3f1f7ba..636f327 100644 --- a/benchmark/results/gpt-oss-120b-mxfp4-00001-of-00003__rocm6_4_3__hblt0.log +++ b/benchmark/results/gpt-oss-120b-mxfp4-00001-of-00003__rocm6_4_3__hblt0.log @@ -4,7 +4,7 @@ ggml_cuda_init: found 1 ROCm devices: Device 0: Radeon 8060S Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32 | model | size | params | backend | ngl | mmap | test | t/s | | ------------------------------ | ---------: | ---------: | ---------- | --: | ---: | --------------: | -------------------: | -| gpt-oss ?B MXFP4 MoE | 59.02 GiB | 116.83 B | ROCm | 99 | 0 | pp512 | 352.53 ± 0.81 | -| gpt-oss ?B MXFP4 MoE | 59.02 GiB | 116.83 B | ROCm | 99 | 0 | tg128 | 45.41 ± 0.01 | +| gpt-oss 120B MXFP4 MoE | 59.02 GiB | 116.83 B | ROCm | 99 | 0 | pp512 | 362.06 ± 1.45 | +| gpt-oss 120B MXFP4 MoE | 59.02 GiB | 116.83 B | ROCm | 99 | 0 | tg128 | 47.11 ± 0.00 | -build: 1fe00296 (6182) +build: f1fbffb5 (6486) diff --git a/benchmark/results/gpt-oss-120b-mxfp4-00001-of-00003__rocm6_4_3__hblt0__fa1.log b/benchmark/results/gpt-oss-120b-mxfp4-00001-of-00003__rocm6_4_3__hblt0__fa1.log index fa1e33d..957a4d0 100644 --- a/benchmark/results/gpt-oss-120b-mxfp4-00001-of-00003__rocm6_4_3__hblt0__fa1.log +++ b/benchmark/results/gpt-oss-120b-mxfp4-00001-of-00003__rocm6_4_3__hblt0__fa1.log @@ -4,7 +4,7 @@ ggml_cuda_init: found 1 ROCm devices: Device 0: Radeon 8060S Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32 | model | size | params | backend | ngl | fa | mmap | test | t/s | | ------------------------------ | ---------: | ---------: | ---------- | --: | -: | ---: | --------------: | -------------------: | -| gpt-oss ?B MXFP4 MoE | 59.02 GiB | 116.83 B | ROCm | 99 | 1 | 0 | pp512 | 320.78 ± 0.96 | -| gpt-oss ?B MXFP4 MoE | 59.02 GiB | 116.83 B | ROCm | 99 | 1 | 0 | tg128 | 44.49 ± 0.03 | +| gpt-oss 120B MXFP4 MoE | 59.02 GiB | 116.83 B | ROCm | 99 | 1 | 0 | pp512 | 397.06 ± 1.41 | +| gpt-oss 120B MXFP4 MoE | 59.02 GiB | 116.83 B | ROCm | 99 | 1 | 0 | tg128 | 46.01 ± 0.01 | -build: 1fe00296 (6182) +build: f1fbffb5 (6486) diff --git a/benchmark/results/gpt-oss-120b-mxfp4-00001-of-00003__rocm7_rc-rocwmma-fa_all_quants.log b/benchmark/results/gpt-oss-120b-mxfp4-00001-of-00003__rocm7_rc-rocwmma-fa_all_quants.log new file mode 100644 index 0000000..85ade63 --- /dev/null +++ b/benchmark/results/gpt-oss-120b-mxfp4-00001-of-00003__rocm7_rc-rocwmma-fa_all_quants.log @@ -0,0 +1,10 @@ +ggml_cuda_init: GGML_CUDA_FORCE_MMQ: no +ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no +ggml_cuda_init: found 1 ROCm devices: + Device 0: AMD Radeon Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32 +| model | size | params | backend | ngl | mmap | test | t/s | +| ------------------------------ | ---------: | ---------: | ---------- | --: | ---: | --------------: | -------------------: | +| gpt-oss 120B MXFP4 MoE | 59.02 GiB | 116.83 B | ROCm | 99 | 0 | pp512 | 639.82 ± 2.41 | +| gpt-oss 120B MXFP4 MoE | 59.02 GiB | 116.83 B | ROCm | 99 | 0 | tg128 | 46.29 ± 0.01 | + +build: f1fbffb5 (6486) diff --git a/benchmark/results/gpt-oss-120b-mxfp4-00001-of-00003__rocm7_rc-rocwmma-fa_all_quants__fa1.log b/benchmark/results/gpt-oss-120b-mxfp4-00001-of-00003__rocm7_rc-rocwmma-fa_all_quants__fa1.log new file mode 100644 index 0000000..28c282b --- /dev/null +++ b/benchmark/results/gpt-oss-120b-mxfp4-00001-of-00003__rocm7_rc-rocwmma-fa_all_quants__fa1.log @@ -0,0 +1,10 @@ +ggml_cuda_init: GGML_CUDA_FORCE_MMQ: no +ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no +ggml_cuda_init: found 1 ROCm devices: + Device 0: AMD Radeon Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32 +| model | size | params | backend | ngl | fa | mmap | test | t/s | +| ------------------------------ | ---------: | ---------: | ---------- | --: | -: | ---: | --------------: | -------------------: | +| gpt-oss 120B MXFP4 MoE | 59.02 GiB | 116.83 B | ROCm | 99 | 1 | 0 | pp512 | 756.98 ± 1.30 | +| gpt-oss 120B MXFP4 MoE | 59.02 GiB | 116.83 B | ROCm | 99 | 1 | 0 | tg128 | 46.00 ± 0.01 | + +build: f1fbffb5 (6486) diff --git a/benchmark/results/gpt-oss-120b-mxfp4-00001-of-00003__rocm7_rc-rocwmma-fa_all_quants__hblt0.log b/benchmark/results/gpt-oss-120b-mxfp4-00001-of-00003__rocm7_rc-rocwmma-fa_all_quants__hblt0.log new file mode 100644 index 0000000..ccbcd7a --- /dev/null +++ b/benchmark/results/gpt-oss-120b-mxfp4-00001-of-00003__rocm7_rc-rocwmma-fa_all_quants__hblt0.log @@ -0,0 +1,10 @@ +ggml_cuda_init: GGML_CUDA_FORCE_MMQ: no +ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no +ggml_cuda_init: found 1 ROCm devices: + Device 0: AMD Radeon Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32 +| model | size | params | backend | ngl | mmap | test | t/s | +| ------------------------------ | ---------: | ---------: | ---------- | --: | ---: | --------------: | -------------------: | +| gpt-oss 120B MXFP4 MoE | 59.02 GiB | 116.83 B | ROCm | 99 | 0 | pp512 | 683.94 ± 2.89 | +| gpt-oss 120B MXFP4 MoE | 59.02 GiB | 116.83 B | ROCm | 99 | 0 | tg128 | 47.29 ± 0.01 | + +build: f1fbffb5 (6486) diff --git a/benchmark/results/gpt-oss-120b-mxfp4-00001-of-00003__rocm7_rc-rocwmma-fa_all_quants__hblt0__fa1.log b/benchmark/results/gpt-oss-120b-mxfp4-00001-of-00003__rocm7_rc-rocwmma-fa_all_quants__hblt0__fa1.log new file mode 100644 index 0000000..8b573b2 --- /dev/null +++ b/benchmark/results/gpt-oss-120b-mxfp4-00001-of-00003__rocm7_rc-rocwmma-fa_all_quants__hblt0__fa1.log @@ -0,0 +1,10 @@ +ggml_cuda_init: GGML_CUDA_FORCE_MMQ: no +ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no +ggml_cuda_init: found 1 ROCm devices: + Device 0: AMD Radeon Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32 +| model | size | params | backend | ngl | fa | mmap | test | t/s | +| ------------------------------ | ---------: | ---------: | ---------- | --: | -: | ---: | --------------: | -------------------: | +| gpt-oss 120B MXFP4 MoE | 59.02 GiB | 116.83 B | ROCm | 99 | 1 | 0 | pp512 | 778.15 ± 4.46 | +| gpt-oss 120B MXFP4 MoE | 59.02 GiB | 116.83 B | ROCm | 99 | 1 | 0 | tg128 | 46.04 ± 0.00 | + +build: f1fbffb5 (6486) diff --git a/benchmark/results/gpt-oss-120b-mxfp4-00001-of-00003__rocm7_rc-rocwmma.log b/benchmark/results/gpt-oss-120b-mxfp4-00001-of-00003__rocm7_rc-rocwmma.log index 17a14c7..b346209 100644 --- a/benchmark/results/gpt-oss-120b-mxfp4-00001-of-00003__rocm7_rc-rocwmma.log +++ b/benchmark/results/gpt-oss-120b-mxfp4-00001-of-00003__rocm7_rc-rocwmma.log @@ -4,7 +4,7 @@ ggml_cuda_init: found 1 ROCm devices: Device 0: AMD Radeon Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32 | model | size | params | backend | ngl | mmap | test | t/s | | ------------------------------ | ---------: | ---------: | ---------- | --: | ---: | --------------: | -------------------: | -| gpt-oss ?B MXFP4 MoE | 59.02 GiB | 116.83 B | ROCm | 99 | 0 | pp512 | 635.84 ± 5.72 | -| gpt-oss ?B MXFP4 MoE | 59.02 GiB | 116.83 B | ROCm | 99 | 0 | tg128 | 45.26 ± 0.01 | +| gpt-oss 120B MXFP4 MoE | 59.02 GiB | 116.83 B | ROCm | 99 | 0 | pp512 | 641.91 ± 7.56 | +| gpt-oss 120B MXFP4 MoE | 59.02 GiB | 116.83 B | ROCm | 99 | 0 | tg128 | 47.20 ± 0.01 | -build: de219279 (6181) +build: f1fbffb5 (6486) diff --git a/benchmark/results/gpt-oss-120b-mxfp4-00001-of-00003__rocm7_rc-rocwmma__fa1.log b/benchmark/results/gpt-oss-120b-mxfp4-00001-of-00003__rocm7_rc-rocwmma__fa1.log index 8695248..4300bb0 100644 --- a/benchmark/results/gpt-oss-120b-mxfp4-00001-of-00003__rocm7_rc-rocwmma__fa1.log +++ b/benchmark/results/gpt-oss-120b-mxfp4-00001-of-00003__rocm7_rc-rocwmma__fa1.log @@ -4,7 +4,7 @@ ggml_cuda_init: found 1 ROCm devices: Device 0: AMD Radeon Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32 | model | size | params | backend | ngl | fa | mmap | test | t/s | | ------------------------------ | ---------: | ---------: | ---------- | --: | -: | ---: | --------------: | -------------------: | -| gpt-oss ?B MXFP4 MoE | 59.02 GiB | 116.83 B | ROCm | 99 | 1 | 0 | pp512 | 708.36 ± 12.96 | -| gpt-oss ?B MXFP4 MoE | 59.02 GiB | 116.83 B | ROCm | 99 | 1 | 0 | tg128 | 44.85 ± 0.01 | +| gpt-oss 120B MXFP4 MoE | 59.02 GiB | 116.83 B | ROCm | 99 | 1 | 0 | pp512 | 756.17 ± 4.24 | +| gpt-oss 120B MXFP4 MoE | 59.02 GiB | 116.83 B | ROCm | 99 | 1 | 0 | tg128 | 46.05 ± 0.01 | -build: de219279 (6181) +build: f1fbffb5 (6486) diff --git a/benchmark/results/gpt-oss-120b-mxfp4-00001-of-00003__rocm7_rc-rocwmma__hblt0.log b/benchmark/results/gpt-oss-120b-mxfp4-00001-of-00003__rocm7_rc-rocwmma__hblt0.log index 117d484..d80b648 100644 --- a/benchmark/results/gpt-oss-120b-mxfp4-00001-of-00003__rocm7_rc-rocwmma__hblt0.log +++ b/benchmark/results/gpt-oss-120b-mxfp4-00001-of-00003__rocm7_rc-rocwmma__hblt0.log @@ -4,7 +4,7 @@ ggml_cuda_init: found 1 ROCm devices: Device 0: AMD Radeon Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32 | model | size | params | backend | ngl | mmap | test | t/s | | ------------------------------ | ---------: | ---------: | ---------- | --: | ---: | --------------: | -------------------: | -| gpt-oss ?B MXFP4 MoE | 59.02 GiB | 116.83 B | ROCm | 99 | 0 | pp512 | 650.68 ± 9.08 | -| gpt-oss ?B MXFP4 MoE | 59.02 GiB | 116.83 B | ROCm | 99 | 0 | tg128 | 45.26 ± 0.01 | +| gpt-oss 120B MXFP4 MoE | 59.02 GiB | 116.83 B | ROCm | 99 | 0 | pp512 | 681.37 ± 3.54 | +| gpt-oss 120B MXFP4 MoE | 59.02 GiB | 116.83 B | ROCm | 99 | 0 | tg128 | 47.19 ± 0.01 | -build: de219279 (6181) +build: f1fbffb5 (6486) diff --git a/benchmark/results/gpt-oss-120b-mxfp4-00001-of-00003__rocm7_rc-rocwmma__hblt0__fa1.log b/benchmark/results/gpt-oss-120b-mxfp4-00001-of-00003__rocm7_rc-rocwmma__hblt0__fa1.log index 173cfbe..428b0ae 100644 --- a/benchmark/results/gpt-oss-120b-mxfp4-00001-of-00003__rocm7_rc-rocwmma__hblt0__fa1.log +++ b/benchmark/results/gpt-oss-120b-mxfp4-00001-of-00003__rocm7_rc-rocwmma__hblt0__fa1.log @@ -4,7 +4,7 @@ ggml_cuda_init: found 1 ROCm devices: Device 0: AMD Radeon Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32 | model | size | params | backend | ngl | fa | mmap | test | t/s | | ------------------------------ | ---------: | ---------: | ---------- | --: | -: | ---: | --------------: | -------------------: | -| gpt-oss ?B MXFP4 MoE | 59.02 GiB | 116.83 B | ROCm | 99 | 1 | 0 | pp512 | 734.35 ± 10.26 | -| gpt-oss ?B MXFP4 MoE | 59.02 GiB | 116.83 B | ROCm | 99 | 1 | 0 | tg128 | 44.85 ± 0.00 | +| gpt-oss 120B MXFP4 MoE | 59.02 GiB | 116.83 B | ROCm | 99 | 1 | 0 | pp512 | 770.60 ± 3.18 | +| gpt-oss 120B MXFP4 MoE | 59.02 GiB | 116.83 B | ROCm | 99 | 1 | 0 | tg128 | 46.09 ± 0.01 | -build: de219279 (6181) +build: f1fbffb5 (6486) diff --git a/benchmark/results/gpt-oss-120b-mxfp4-00001-of-00003__rocm7_rc.log b/benchmark/results/gpt-oss-120b-mxfp4-00001-of-00003__rocm7_rc.log index a5dcda7..b581add 100644 --- a/benchmark/results/gpt-oss-120b-mxfp4-00001-of-00003__rocm7_rc.log +++ b/benchmark/results/gpt-oss-120b-mxfp4-00001-of-00003__rocm7_rc.log @@ -4,7 +4,7 @@ ggml_cuda_init: found 1 ROCm devices: Device 0: AMD Radeon Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32 | model | size | params | backend | ngl | mmap | test | t/s | | ------------------------------ | ---------: | ---------: | ---------- | --: | ---: | --------------: | -------------------: | -| gpt-oss ?B MXFP4 MoE | 59.02 GiB | 116.83 B | ROCm | 99 | 0 | pp512 | 646.07 ± 6.86 | -| gpt-oss ?B MXFP4 MoE | 59.02 GiB | 116.83 B | ROCm | 99 | 0 | tg128 | 45.50 ± 0.01 | +| gpt-oss 120B MXFP4 MoE | 59.02 GiB | 116.83 B | ROCm | 99 | 0 | pp512 | 641.87 ± 3.27 | +| gpt-oss 120B MXFP4 MoE | 59.02 GiB | 116.83 B | ROCm | 99 | 0 | tg128 | 47.17 ± 0.01 | -build: de219279 (6181) +build: f1fbffb5 (6486) diff --git a/benchmark/results/gpt-oss-120b-mxfp4-00001-of-00003__rocm7_rc__fa1.log b/benchmark/results/gpt-oss-120b-mxfp4-00001-of-00003__rocm7_rc__fa1.log index c8991e9..8aaf148 100644 --- a/benchmark/results/gpt-oss-120b-mxfp4-00001-of-00003__rocm7_rc__fa1.log +++ b/benchmark/results/gpt-oss-120b-mxfp4-00001-of-00003__rocm7_rc__fa1.log @@ -4,7 +4,7 @@ ggml_cuda_init: found 1 ROCm devices: Device 0: AMD Radeon Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32 | model | size | params | backend | ngl | fa | mmap | test | t/s | | ------------------------------ | ---------: | ---------: | ---------- | --: | -: | ---: | --------------: | -------------------: | -| gpt-oss ?B MXFP4 MoE | 59.02 GiB | 116.83 B | ROCm | 99 | 1 | 0 | pp512 | 541.57 ± 3.26 | -| gpt-oss ?B MXFP4 MoE | 59.02 GiB | 116.83 B | ROCm | 99 | 1 | 0 | tg128 | 44.31 ± 0.00 | +| gpt-oss 120B MXFP4 MoE | 59.02 GiB | 116.83 B | ROCm | 99 | 1 | 0 | pp512 | 757.39 ± 3.80 | +| gpt-oss 120B MXFP4 MoE | 59.02 GiB | 116.83 B | ROCm | 99 | 1 | 0 | tg128 | 46.04 ± 0.00 | -build: de219279 (6181) +build: f1fbffb5 (6486) diff --git a/benchmark/results/gpt-oss-120b-mxfp4-00001-of-00003__rocm7_rc__hblt0.log b/benchmark/results/gpt-oss-120b-mxfp4-00001-of-00003__rocm7_rc__hblt0.log index f2a91fb..cb85c29 100644 --- a/benchmark/results/gpt-oss-120b-mxfp4-00001-of-00003__rocm7_rc__hblt0.log +++ b/benchmark/results/gpt-oss-120b-mxfp4-00001-of-00003__rocm7_rc__hblt0.log @@ -4,7 +4,7 @@ ggml_cuda_init: found 1 ROCm devices: Device 0: AMD Radeon Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32 | model | size | params | backend | ngl | mmap | test | t/s | | ------------------------------ | ---------: | ---------: | ---------- | --: | ---: | --------------: | -------------------: | -| gpt-oss ?B MXFP4 MoE | 59.02 GiB | 116.83 B | ROCm | 99 | 0 | pp512 | 657.58 ± 3.78 | -| gpt-oss ?B MXFP4 MoE | 59.02 GiB | 116.83 B | ROCm | 99 | 0 | tg128 | 45.56 ± 0.01 | +| gpt-oss 120B MXFP4 MoE | 59.02 GiB | 116.83 B | ROCm | 99 | 0 | pp512 | 688.94 ± 3.90 | +| gpt-oss 120B MXFP4 MoE | 59.02 GiB | 116.83 B | ROCm | 99 | 0 | tg128 | 47.06 ± 0.01 | -build: de219279 (6181) +build: f1fbffb5 (6486) diff --git a/benchmark/results/gpt-oss-120b-mxfp4-00001-of-00003__rocm7_rc__hblt0__fa1.log b/benchmark/results/gpt-oss-120b-mxfp4-00001-of-00003__rocm7_rc__hblt0__fa1.log index 698f338..53cccf3 100644 --- a/benchmark/results/gpt-oss-120b-mxfp4-00001-of-00003__rocm7_rc__hblt0__fa1.log +++ b/benchmark/results/gpt-oss-120b-mxfp4-00001-of-00003__rocm7_rc__hblt0__fa1.log @@ -4,7 +4,7 @@ ggml_cuda_init: found 1 ROCm devices: Device 0: AMD Radeon Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32 | model | size | params | backend | ngl | fa | mmap | test | t/s | | ------------------------------ | ---------: | ---------: | ---------- | --: | -: | ---: | --------------: | -------------------: | -| gpt-oss ?B MXFP4 MoE | 59.02 GiB | 116.83 B | ROCm | 99 | 1 | 0 | pp512 | 550.79 ± 2.99 | -| gpt-oss ?B MXFP4 MoE | 59.02 GiB | 116.83 B | ROCm | 99 | 1 | 0 | tg128 | 44.41 ± 0.00 | +| gpt-oss 120B MXFP4 MoE | 59.02 GiB | 116.83 B | ROCm | 99 | 1 | 0 | pp512 | 769.31 ± 5.48 | +| gpt-oss 120B MXFP4 MoE | 59.02 GiB | 116.83 B | ROCm | 99 | 1 | 0 | tg128 | 46.07 ± 0.00 | -build: de219279 (6181) +build: f1fbffb5 (6486) diff --git a/benchmark/results/gpt-oss-120b-mxfp4-00001-of-00003__vulkan_amdvlk.log b/benchmark/results/gpt-oss-120b-mxfp4-00001-of-00003__vulkan_amdvlk.log index 9d4e9e1..fc9cedb 100644 --- a/benchmark/results/gpt-oss-120b-mxfp4-00001-of-00003__vulkan_amdvlk.log +++ b/benchmark/results/gpt-oss-120b-mxfp4-00001-of-00003__vulkan_amdvlk.log @@ -2,7 +2,7 @@ ggml_vulkan: Found 1 Vulkan devices: ggml_vulkan: 0 = Radeon 8060S Graphics (AMD open-source driver) | uma: 1 | fp16: 1 | bf16: 0 | warp size: 64 | shared memory: 32768 | int dot: 1 | matrix cores: KHR_coopmat | model | size | params | backend | ngl | mmap | test | t/s | | ------------------------------ | ---------: | ---------: | ---------- | --: | ---: | --------------: | -------------------: | -| gpt-oss ?B MXFP4 MoE | 59.02 GiB | 116.83 B | Vulkan | 99 | 0 | pp512 | 485.54 ± 2.45 | -| gpt-oss ?B MXFP4 MoE | 59.02 GiB | 116.83 B | Vulkan | 99 | 0 | tg128 | 49.29 ± 0.03 | +| gpt-oss 120B MXFP4 MoE | 59.02 GiB | 116.83 B | Vulkan | 99 | 0 | pp512 | 682.60 ± 3.30 | +| gpt-oss 120B MXFP4 MoE | 59.02 GiB | 116.83 B | Vulkan | 99 | 0 | tg128 | 51.41 ± 0.01 | -build: 1fe00296 (6182) +build: f1fbffb5 (6486) diff --git a/benchmark/results/gpt-oss-120b-mxfp4-00001-of-00003__vulkan_amdvlk__fa1.log b/benchmark/results/gpt-oss-120b-mxfp4-00001-of-00003__vulkan_amdvlk__fa1.log index c4cd434..82270b1 100644 --- a/benchmark/results/gpt-oss-120b-mxfp4-00001-of-00003__vulkan_amdvlk__fa1.log +++ b/benchmark/results/gpt-oss-120b-mxfp4-00001-of-00003__vulkan_amdvlk__fa1.log @@ -2,7 +2,7 @@ ggml_vulkan: Found 1 Vulkan devices: ggml_vulkan: 0 = Radeon 8060S Graphics (AMD open-source driver) | uma: 1 | fp16: 1 | bf16: 0 | warp size: 64 | shared memory: 32768 | int dot: 1 | matrix cores: KHR_coopmat | model | size | params | backend | ngl | fa | mmap | test | t/s | | ------------------------------ | ---------: | ---------: | ---------- | --: | -: | ---: | --------------: | -------------------: | -| gpt-oss ?B MXFP4 MoE | 59.02 GiB | 116.83 B | Vulkan | 99 | 1 | 0 | pp512 | 540.81 ± 2.56 | -| gpt-oss ?B MXFP4 MoE | 59.02 GiB | 116.83 B | Vulkan | 99 | 1 | 0 | tg128 | 48.25 ± 0.03 | +| gpt-oss 120B MXFP4 MoE | 59.02 GiB | 116.83 B | Vulkan | 99 | 1 | 0 | pp512 | 790.49 ± 4.84 | +| gpt-oss 120B MXFP4 MoE | 59.02 GiB | 116.83 B | Vulkan | 99 | 1 | 0 | tg128 | 50.15 ± 0.01 | -build: 1fe00296 (6182) +build: f1fbffb5 (6486) diff --git a/benchmark/results/gpt-oss-120b-mxfp4-00001-of-00003__vulkan_radv.log b/benchmark/results/gpt-oss-120b-mxfp4-00001-of-00003__vulkan_radv.log index acd5b3a..f38971f 100644 --- a/benchmark/results/gpt-oss-120b-mxfp4-00001-of-00003__vulkan_radv.log +++ b/benchmark/results/gpt-oss-120b-mxfp4-00001-of-00003__vulkan_radv.log @@ -2,7 +2,7 @@ ggml_vulkan: Found 1 Vulkan devices: ggml_vulkan: 0 = Radeon 8060S Graphics (RADV GFX1151) (radv) | uma: 1 | fp16: 1 | bf16: 0 | warp size: 64 | shared memory: 65536 | int dot: 1 | matrix cores: KHR_coopmat | model | size | params | backend | ngl | mmap | test | t/s | | ------------------------------ | ---------: | ---------: | ---------- | --: | ---: | --------------: | -------------------: | -| gpt-oss ?B MXFP4 MoE | 59.02 GiB | 116.83 B | Vulkan | 99 | 0 | pp512 | 239.24 ± 1.27 | -| gpt-oss ?B MXFP4 MoE | 59.02 GiB | 116.83 B | Vulkan | 99 | 0 | tg128 | 50.39 ± 0.05 | +| gpt-oss 120B MXFP4 MoE | 59.02 GiB | 116.83 B | Vulkan | 99 | 0 | pp512 | 426.15 ± 2.65 | +| gpt-oss 120B MXFP4 MoE | 59.02 GiB | 116.83 B | Vulkan | 99 | 0 | tg128 | 52.79 ± 0.16 | -build: 1fe00296 (6182) +build: f1fbffb5 (6486) diff --git a/benchmark/results/gpt-oss-120b-mxfp4-00001-of-00003__vulkan_radv__fa1.log b/benchmark/results/gpt-oss-120b-mxfp4-00001-of-00003__vulkan_radv__fa1.log index f04d91c..d3f726b 100644 --- a/benchmark/results/gpt-oss-120b-mxfp4-00001-of-00003__vulkan_radv__fa1.log +++ b/benchmark/results/gpt-oss-120b-mxfp4-00001-of-00003__vulkan_radv__fa1.log @@ -2,7 +2,7 @@ ggml_vulkan: Found 1 Vulkan devices: ggml_vulkan: 0 = Radeon 8060S Graphics (RADV GFX1151) (radv) | uma: 1 | fp16: 1 | bf16: 0 | warp size: 64 | shared memory: 65536 | int dot: 1 | matrix cores: KHR_coopmat | model | size | params | backend | ngl | fa | mmap | test | t/s | | ------------------------------ | ---------: | ---------: | ---------- | --: | -: | ---: | --------------: | -------------------: | -| gpt-oss ?B MXFP4 MoE | 59.02 GiB | 116.83 B | Vulkan | 99 | 1 | 0 | pp512 | 255.50 ± 1.49 | -| gpt-oss ?B MXFP4 MoE | 59.02 GiB | 116.83 B | Vulkan | 99 | 1 | 0 | tg128 | 50.41 ± 0.04 | +| gpt-oss 120B MXFP4 MoE | 59.02 GiB | 116.83 B | Vulkan | 99 | 1 | 0 | pp512 | 478.69 ± 3.26 | +| gpt-oss 120B MXFP4 MoE | 59.02 GiB | 116.83 B | Vulkan | 99 | 1 | 0 | tg128 | 52.75 ± 0.06 | -build: 1fe00296 (6182) +build: f1fbffb5 (6486) diff --git a/benchmark/results/gpt-oss-20b-F32__rocm6_4_3-rocwmma.log b/benchmark/results/gpt-oss-20b-F32__rocm6_4_3-rocwmma.log index 45d2dea..802b5a6 100644 --- a/benchmark/results/gpt-oss-20b-F32__rocm6_4_3-rocwmma.log +++ b/benchmark/results/gpt-oss-20b-F32__rocm6_4_3-rocwmma.log @@ -9,7 +9,7 @@ rocBLAS warning: hipBlasLT failed, falling back to tensile. This message will be only be displayed once, unless the ROCBLAS_VERBOSE_TENSILE_ERROR environment variable is set. | model | size | params | backend | ngl | mmap | test | t/s | | ------------------------------ | ---------: | ---------: | ---------- | --: | ---: | --------------: | -------------------: | -| gpt-oss ?B BF16 | 38.97 GiB | 20.91 B | ROCm | 99 | 0 | pp512 | 1198.51 ± 10.39 | -| gpt-oss ?B BF16 | 38.97 GiB | 20.91 B | ROCm | 99 | 0 | tg128 | 27.14 ± 0.00 | +| gpt-oss 20B BF16 | 38.97 GiB | 20.91 B | ROCm | 99 | 0 | pp512 | 1140.40 ± 8.72 | +| gpt-oss 20B BF16 | 38.97 GiB | 20.91 B | ROCm | 99 | 0 | tg128 | 27.24 ± 0.00 | -build: 1fe00296 (6182) +build: f1fbffb5 (6486) diff --git a/benchmark/results/gpt-oss-20b-F32__rocm6_4_3-rocwmma__fa1.log b/benchmark/results/gpt-oss-20b-F32__rocm6_4_3-rocwmma__fa1.log index 219d081..5aa7e87 100644 --- a/benchmark/results/gpt-oss-20b-F32__rocm6_4_3-rocwmma__fa1.log +++ b/benchmark/results/gpt-oss-20b-F32__rocm6_4_3-rocwmma__fa1.log @@ -4,7 +4,7 @@ ggml_cuda_init: found 1 ROCm devices: Device 0: Radeon 8060S Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32 | model | size | params | backend | ngl | fa | mmap | test | t/s | | ------------------------------ | ---------: | ---------: | ---------- | --: | -: | ---: | --------------: | -------------------: | -| gpt-oss ?B BF16 | 38.97 GiB | 20.91 B | ROCm | 99 | 1 | 0 | pp512 | 1506.44 ± 7.03 | -| gpt-oss ?B BF16 | 38.97 GiB | 20.91 B | ROCm | 99 | 1 | 0 | tg128 | 27.10 ± 0.01 | +| gpt-oss 20B BF16 | 38.97 GiB | 20.91 B | ROCm | 99 | 1 | 0 | pp512 | 1492.30 ± 22.33 | +| gpt-oss 20B BF16 | 38.97 GiB | 20.91 B | ROCm | 99 | 1 | 0 | tg128 | 26.96 ± 0.00 | -build: 1fe00296 (6182) +build: f1fbffb5 (6486) diff --git a/benchmark/results/gpt-oss-20b-F32__rocm6_4_3-rocwmma__hblt0.log b/benchmark/results/gpt-oss-20b-F32__rocm6_4_3-rocwmma__hblt0.log index 2c0a1a3..5cd945e 100644 --- a/benchmark/results/gpt-oss-20b-F32__rocm6_4_3-rocwmma__hblt0.log +++ b/benchmark/results/gpt-oss-20b-F32__rocm6_4_3-rocwmma__hblt0.log @@ -4,7 +4,7 @@ ggml_cuda_init: found 1 ROCm devices: Device 0: Radeon 8060S Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32 | model | size | params | backend | ngl | mmap | test | t/s | | ------------------------------ | ---------: | ---------: | ---------- | --: | ---: | --------------: | -------------------: | -| gpt-oss ?B BF16 | 38.97 GiB | 20.91 B | ROCm | 99 | 0 | pp512 | 326.80 ± 4.56 | -| gpt-oss ?B BF16 | 38.97 GiB | 20.91 B | ROCm | 99 | 0 | tg128 | 27.13 ± 0.01 | +| gpt-oss 20B BF16 | 38.97 GiB | 20.91 B | ROCm | 99 | 0 | pp512 | 327.64 ± 1.89 | +| gpt-oss 20B BF16 | 38.97 GiB | 20.91 B | ROCm | 99 | 0 | tg128 | 27.19 ± 0.00 | -build: 1fe00296 (6182) +build: f1fbffb5 (6486) diff --git a/benchmark/results/gpt-oss-20b-F32__rocm6_4_3-rocwmma__hblt0__fa1.log b/benchmark/results/gpt-oss-20b-F32__rocm6_4_3-rocwmma__hblt0__fa1.log index b04117d..8086999 100644 --- a/benchmark/results/gpt-oss-20b-F32__rocm6_4_3-rocwmma__hblt0__fa1.log +++ b/benchmark/results/gpt-oss-20b-F32__rocm6_4_3-rocwmma__hblt0__fa1.log @@ -4,7 +4,7 @@ ggml_cuda_init: found 1 ROCm devices: Device 0: Radeon 8060S Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32 | model | size | params | backend | ngl | fa | mmap | test | t/s | | ------------------------------ | ---------: | ---------: | ---------- | --: | -: | ---: | --------------: | -------------------: | -| gpt-oss ?B BF16 | 38.97 GiB | 20.91 B | ROCm | 99 | 1 | 0 | pp512 | 350.18 ± 5.10 | -| gpt-oss ?B BF16 | 38.97 GiB | 20.91 B | ROCm | 99 | 1 | 0 | tg128 | 27.09 ± 0.00 | +| gpt-oss 20B BF16 | 38.97 GiB | 20.91 B | ROCm | 99 | 1 | 0 | pp512 | 342.77 ± 3.39 | +| gpt-oss 20B BF16 | 38.97 GiB | 20.91 B | ROCm | 99 | 1 | 0 | tg128 | 27.00 ± 0.00 | -build: 1fe00296 (6182) +build: f1fbffb5 (6486) diff --git a/benchmark/results/gpt-oss-20b-F32__rocm6_4_3.log b/benchmark/results/gpt-oss-20b-F32__rocm6_4_3.log index 8d5aeeb..64e0253 100644 --- a/benchmark/results/gpt-oss-20b-F32__rocm6_4_3.log +++ b/benchmark/results/gpt-oss-20b-F32__rocm6_4_3.log @@ -9,7 +9,7 @@ rocBLAS warning: hipBlasLT failed, falling back to tensile. This message will be only be displayed once, unless the ROCBLAS_VERBOSE_TENSILE_ERROR environment variable is set. | model | size | params | backend | ngl | mmap | test | t/s | | ------------------------------ | ---------: | ---------: | ---------- | --: | ---: | --------------: | -------------------: | -| gpt-oss ?B BF16 | 38.97 GiB | 20.91 B | ROCm | 99 | 0 | pp512 | 1185.57 ± 6.55 | -| gpt-oss ?B BF16 | 38.97 GiB | 20.91 B | ROCm | 99 | 0 | tg128 | 27.12 ± 0.01 | +| gpt-oss 20B BF16 | 38.97 GiB | 20.91 B | ROCm | 99 | 0 | pp512 | 1147.38 ± 6.40 | +| gpt-oss 20B BF16 | 38.97 GiB | 20.91 B | ROCm | 99 | 0 | tg128 | 27.24 ± 0.00 | -build: 1fe00296 (6182) +build: f1fbffb5 (6486) diff --git a/benchmark/results/gpt-oss-20b-F32__rocm6_4_3__fa1.log b/benchmark/results/gpt-oss-20b-F32__rocm6_4_3__fa1.log index 4138942..2e70988 100644 --- a/benchmark/results/gpt-oss-20b-F32__rocm6_4_3__fa1.log +++ b/benchmark/results/gpt-oss-20b-F32__rocm6_4_3__fa1.log @@ -4,7 +4,7 @@ ggml_cuda_init: found 1 ROCm devices: Device 0: Radeon 8060S Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32 | model | size | params | backend | ngl | fa | mmap | test | t/s | | ------------------------------ | ---------: | ---------: | ---------- | --: | -: | ---: | --------------: | -------------------: | -| gpt-oss ?B BF16 | 38.97 GiB | 20.91 B | ROCm | 99 | 1 | 0 | pp512 | 1000.77 ± 2.37 | -| gpt-oss ?B BF16 | 38.97 GiB | 20.91 B | ROCm | 99 | 1 | 0 | tg128 | 26.83 ± 0.00 | +| gpt-oss 20B BF16 | 38.97 GiB | 20.91 B | ROCm | 99 | 1 | 0 | pp512 | 1508.59 ± 26.99 | +| gpt-oss 20B BF16 | 38.97 GiB | 20.91 B | ROCm | 99 | 1 | 0 | tg128 | 27.00 ± 0.00 | -build: 1fe00296 (6182) +build: f1fbffb5 (6486) diff --git a/benchmark/results/gpt-oss-20b-F32__rocm6_4_3__hblt0.log b/benchmark/results/gpt-oss-20b-F32__rocm6_4_3__hblt0.log index 76ec711..4437a50 100644 --- a/benchmark/results/gpt-oss-20b-F32__rocm6_4_3__hblt0.log +++ b/benchmark/results/gpt-oss-20b-F32__rocm6_4_3__hblt0.log @@ -4,7 +4,7 @@ ggml_cuda_init: found 1 ROCm devices: Device 0: Radeon 8060S Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32 | model | size | params | backend | ngl | mmap | test | t/s | | ------------------------------ | ---------: | ---------: | ---------- | --: | ---: | --------------: | -------------------: | -| gpt-oss ?B BF16 | 38.97 GiB | 20.91 B | ROCm | 99 | 0 | pp512 | 322.00 ± 4.37 | -| gpt-oss ?B BF16 | 38.97 GiB | 20.91 B | ROCm | 99 | 0 | tg128 | 27.14 ± 0.01 | +| gpt-oss 20B BF16 | 38.97 GiB | 20.91 B | ROCm | 99 | 0 | pp512 | 326.33 ± 6.68 | +| gpt-oss 20B BF16 | 38.97 GiB | 20.91 B | ROCm | 99 | 0 | tg128 | 27.20 ± 0.00 | -build: 1fe00296 (6182) +build: f1fbffb5 (6486) diff --git a/benchmark/results/gpt-oss-20b-F32__rocm6_4_3__hblt0__fa1.log b/benchmark/results/gpt-oss-20b-F32__rocm6_4_3__hblt0__fa1.log index 9ce8ee4..3ae063a 100644 --- a/benchmark/results/gpt-oss-20b-F32__rocm6_4_3__hblt0__fa1.log +++ b/benchmark/results/gpt-oss-20b-F32__rocm6_4_3__hblt0__fa1.log @@ -4,7 +4,7 @@ ggml_cuda_init: found 1 ROCm devices: Device 0: Radeon 8060S Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32 | model | size | params | backend | ngl | fa | mmap | test | t/s | | ------------------------------ | ---------: | ---------: | ---------- | --: | -: | ---: | --------------: | -------------------: | -| gpt-oss ?B BF16 | 38.97 GiB | 20.91 B | ROCm | 99 | 1 | 0 | pp512 | 303.26 ± 4.84 | -| gpt-oss ?B BF16 | 38.97 GiB | 20.91 B | ROCm | 99 | 1 | 0 | tg128 | 26.90 ± 0.00 | +| gpt-oss 20B BF16 | 38.97 GiB | 20.91 B | ROCm | 99 | 1 | 0 | pp512 | 344.41 ± 7.32 | +| gpt-oss 20B BF16 | 38.97 GiB | 20.91 B | ROCm | 99 | 1 | 0 | tg128 | 26.96 ± 0.01 | -build: 1fe00296 (6182) +build: f1fbffb5 (6486) diff --git a/benchmark/results/gpt-oss-20b-F32__rocm7_rc-rocwmma-fa_all_quants.log b/benchmark/results/gpt-oss-20b-F32__rocm7_rc-rocwmma-fa_all_quants.log new file mode 100644 index 0000000..2e92934 --- /dev/null +++ b/benchmark/results/gpt-oss-20b-F32__rocm7_rc-rocwmma-fa_all_quants.log @@ -0,0 +1,10 @@ +ggml_cuda_init: GGML_CUDA_FORCE_MMQ: no +ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no +ggml_cuda_init: found 1 ROCm devices: + Device 0: AMD Radeon Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32 +| model | size | params | backend | ngl | mmap | test | t/s | +| ------------------------------ | ---------: | ---------: | ---------- | --: | ---: | --------------: | -------------------: | +| gpt-oss 20B BF16 | 38.97 GiB | 20.91 B | ROCm | 99 | 0 | pp512 | 1202.41 ± 13.79 | +| gpt-oss 20B BF16 | 38.97 GiB | 20.91 B | ROCm | 99 | 0 | tg128 | 26.03 ± 0.00 | + +build: f1fbffb5 (6486) diff --git a/benchmark/results/gpt-oss-20b-F32__rocm7_rc-rocwmma-fa_all_quants__fa1.log b/benchmark/results/gpt-oss-20b-F32__rocm7_rc-rocwmma-fa_all_quants__fa1.log new file mode 100644 index 0000000..e6da7d3 --- /dev/null +++ b/benchmark/results/gpt-oss-20b-F32__rocm7_rc-rocwmma-fa_all_quants__fa1.log @@ -0,0 +1,10 @@ +ggml_cuda_init: GGML_CUDA_FORCE_MMQ: no +ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no +ggml_cuda_init: found 1 ROCm devices: + Device 0: AMD Radeon Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32 +| model | size | params | backend | ngl | fa | mmap | test | t/s | +| ------------------------------ | ---------: | ---------: | ---------- | --: | -: | ---: | --------------: | -------------------: | +| gpt-oss 20B BF16 | 38.97 GiB | 20.91 B | ROCm | 99 | 1 | 0 | pp512 | 1484.60 ± 5.26 | +| gpt-oss 20B BF16 | 38.97 GiB | 20.91 B | ROCm | 99 | 1 | 0 | tg128 | 26.90 ± 0.00 | + +build: f1fbffb5 (6486) diff --git a/benchmark/results/gpt-oss-20b-F32__rocm7_rc-rocwmma-fa_all_quants__hblt0.log b/benchmark/results/gpt-oss-20b-F32__rocm7_rc-rocwmma-fa_all_quants__hblt0.log new file mode 100644 index 0000000..7d9f2db --- /dev/null +++ b/benchmark/results/gpt-oss-20b-F32__rocm7_rc-rocwmma-fa_all_quants__hblt0.log @@ -0,0 +1,10 @@ +ggml_cuda_init: GGML_CUDA_FORCE_MMQ: no +ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no +ggml_cuda_init: found 1 ROCm devices: + Device 0: AMD Radeon Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32 +| model | size | params | backend | ngl | mmap | test | t/s | +| ------------------------------ | ---------: | ---------: | ---------- | --: | ---: | --------------: | -------------------: | +| gpt-oss 20B BF16 | 38.97 GiB | 20.91 B | ROCm | 99 | 0 | pp512 | 1225.63 ± 9.42 | +| gpt-oss 20B BF16 | 38.97 GiB | 20.91 B | ROCm | 99 | 0 | tg128 | 27.25 ± 0.01 | + +build: f1fbffb5 (6486) diff --git a/benchmark/results/gpt-oss-20b-F32__rocm7_rc-rocwmma-fa_all_quants__hblt0__fa1.log b/benchmark/results/gpt-oss-20b-F32__rocm7_rc-rocwmma-fa_all_quants__hblt0__fa1.log new file mode 100644 index 0000000..d23467b --- /dev/null +++ b/benchmark/results/gpt-oss-20b-F32__rocm7_rc-rocwmma-fa_all_quants__hblt0__fa1.log @@ -0,0 +1,10 @@ +ggml_cuda_init: GGML_CUDA_FORCE_MMQ: no +ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no +ggml_cuda_init: found 1 ROCm devices: + Device 0: AMD Radeon Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32 +| model | size | params | backend | ngl | fa | mmap | test | t/s | +| ------------------------------ | ---------: | ---------: | ---------- | --: | -: | ---: | --------------: | -------------------: | +| gpt-oss 20B BF16 | 38.97 GiB | 20.91 B | ROCm | 99 | 1 | 0 | pp512 | 1421.82 ± 12.16 | +| gpt-oss 20B BF16 | 38.97 GiB | 20.91 B | ROCm | 99 | 1 | 0 | tg128 | 26.95 ± 0.00 | + +build: f1fbffb5 (6486) diff --git a/benchmark/results/gpt-oss-20b-F32__rocm7_rc-rocwmma.log b/benchmark/results/gpt-oss-20b-F32__rocm7_rc-rocwmma.log index d80b5e6..8300d8a 100644 --- a/benchmark/results/gpt-oss-20b-F32__rocm7_rc-rocwmma.log +++ b/benchmark/results/gpt-oss-20b-F32__rocm7_rc-rocwmma.log @@ -4,7 +4,7 @@ ggml_cuda_init: found 1 ROCm devices: Device 0: AMD Radeon Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32 | model | size | params | backend | ngl | mmap | test | t/s | | ------------------------------ | ---------: | ---------: | ---------- | --: | ---: | --------------: | -------------------: | -| gpt-oss ?B BF16 | 38.97 GiB | 20.91 B | ROCm | 99 | 0 | pp512 | 1256.75 ± 10.54 | -| gpt-oss ?B BF16 | 38.97 GiB | 20.91 B | ROCm | 99 | 0 | tg128 | 27.11 ± 0.00 | +| gpt-oss 20B BF16 | 38.97 GiB | 20.91 B | ROCm | 99 | 0 | pp512 | 1209.21 ± 16.57 | +| gpt-oss 20B BF16 | 38.97 GiB | 20.91 B | ROCm | 99 | 0 | tg128 | 27.23 ± 0.00 | -build: de219279 (6181) +build: f1fbffb5 (6486) diff --git a/benchmark/results/gpt-oss-20b-F32__rocm7_rc-rocwmma__fa1.log b/benchmark/results/gpt-oss-20b-F32__rocm7_rc-rocwmma__fa1.log index 2648480..9cb1cb9 100644 --- a/benchmark/results/gpt-oss-20b-F32__rocm7_rc-rocwmma__fa1.log +++ b/benchmark/results/gpt-oss-20b-F32__rocm7_rc-rocwmma__fa1.log @@ -4,7 +4,7 @@ ggml_cuda_init: found 1 ROCm devices: Device 0: AMD Radeon Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32 | model | size | params | backend | ngl | fa | mmap | test | t/s | | ------------------------------ | ---------: | ---------: | ---------- | --: | -: | ---: | --------------: | -------------------: | -| gpt-oss ?B BF16 | 38.97 GiB | 20.91 B | ROCm | 99 | 1 | 0 | pp512 | 1481.17 ± 9.67 | -| gpt-oss ?B BF16 | 38.97 GiB | 20.91 B | ROCm | 99 | 1 | 0 | tg128 | 27.03 ± 0.00 | +| gpt-oss 20B BF16 | 38.97 GiB | 20.91 B | ROCm | 99 | 1 | 0 | pp512 | 1489.00 ± 6.12 | +| gpt-oss 20B BF16 | 38.97 GiB | 20.91 B | ROCm | 99 | 1 | 0 | tg128 | 26.98 ± 0.00 | -build: de219279 (6181) +build: f1fbffb5 (6486) diff --git a/benchmark/results/gpt-oss-20b-F32__rocm7_rc-rocwmma__hblt0.log b/benchmark/results/gpt-oss-20b-F32__rocm7_rc-rocwmma__hblt0.log index 9c6df9d..9bb131b 100644 --- a/benchmark/results/gpt-oss-20b-F32__rocm7_rc-rocwmma__hblt0.log +++ b/benchmark/results/gpt-oss-20b-F32__rocm7_rc-rocwmma__hblt0.log @@ -4,7 +4,7 @@ ggml_cuda_init: found 1 ROCm devices: Device 0: AMD Radeon Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32 | model | size | params | backend | ngl | mmap | test | t/s | | ------------------------------ | ---------: | ---------: | ---------- | --: | ---: | --------------: | -------------------: | -| gpt-oss ?B BF16 | 38.97 GiB | 20.91 B | ROCm | 99 | 0 | pp512 | 1202.19 ± 5.53 | -| gpt-oss ?B BF16 | 38.97 GiB | 20.91 B | ROCm | 99 | 0 | tg128 | 27.10 ± 0.00 | +| gpt-oss 20B BF16 | 38.97 GiB | 20.91 B | ROCm | 99 | 0 | pp512 | 1218.32 ± 13.00 | +| gpt-oss 20B BF16 | 38.97 GiB | 20.91 B | ROCm | 99 | 0 | tg128 | 27.21 ± 0.00 | -build: de219279 (6181) +build: f1fbffb5 (6486) diff --git a/benchmark/results/gpt-oss-20b-F32__rocm7_rc-rocwmma__hblt0__fa1.log b/benchmark/results/gpt-oss-20b-F32__rocm7_rc-rocwmma__hblt0__fa1.log index 8b68b31..2210bb8 100644 --- a/benchmark/results/gpt-oss-20b-F32__rocm7_rc-rocwmma__hblt0__fa1.log +++ b/benchmark/results/gpt-oss-20b-F32__rocm7_rc-rocwmma__hblt0__fa1.log @@ -4,7 +4,7 @@ ggml_cuda_init: found 1 ROCm devices: Device 0: AMD Radeon Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32 | model | size | params | backend | ngl | fa | mmap | test | t/s | | ------------------------------ | ---------: | ---------: | ---------- | --: | -: | ---: | --------------: | -------------------: | -| gpt-oss ?B BF16 | 38.97 GiB | 20.91 B | ROCm | 99 | 1 | 0 | pp512 | 1422.90 ± 11.48 | -| gpt-oss ?B BF16 | 38.97 GiB | 20.91 B | ROCm | 99 | 1 | 0 | tg128 | 27.04 ± 0.00 | +| gpt-oss 20B BF16 | 38.97 GiB | 20.91 B | ROCm | 99 | 1 | 0 | pp512 | 1424.60 ± 8.06 | +| gpt-oss 20B BF16 | 38.97 GiB | 20.91 B | ROCm | 99 | 1 | 0 | tg128 | 26.98 ± 0.00 | -build: de219279 (6181) +build: f1fbffb5 (6486) diff --git a/benchmark/results/gpt-oss-20b-F32__rocm7_rc.log b/benchmark/results/gpt-oss-20b-F32__rocm7_rc.log index 8a71c43..1aa5162 100644 --- a/benchmark/results/gpt-oss-20b-F32__rocm7_rc.log +++ b/benchmark/results/gpt-oss-20b-F32__rocm7_rc.log @@ -4,7 +4,7 @@ ggml_cuda_init: found 1 ROCm devices: Device 0: AMD Radeon Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32 | model | size | params | backend | ngl | mmap | test | t/s | | ------------------------------ | ---------: | ---------: | ---------- | --: | ---: | --------------: | -------------------: | -| gpt-oss ?B BF16 | 38.97 GiB | 20.91 B | ROCm | 99 | 0 | pp512 | 1253.01 ± 23.20 | -| gpt-oss ?B BF16 | 38.97 GiB | 20.91 B | ROCm | 99 | 0 | tg128 | 27.11 ± 0.00 | +| gpt-oss 20B BF16 | 38.97 GiB | 20.91 B | ROCm | 99 | 0 | pp512 | 1198.99 ± 21.23 | +| gpt-oss 20B BF16 | 38.97 GiB | 20.91 B | ROCm | 99 | 0 | tg128 | 27.25 ± 0.00 | -build: de219279 (6181) +build: f1fbffb5 (6486) diff --git a/benchmark/results/gpt-oss-20b-F32__rocm7_rc__fa1.log b/benchmark/results/gpt-oss-20b-F32__rocm7_rc__fa1.log index eaf7b98..dd419ad 100644 --- a/benchmark/results/gpt-oss-20b-F32__rocm7_rc__fa1.log +++ b/benchmark/results/gpt-oss-20b-F32__rocm7_rc__fa1.log @@ -4,7 +4,7 @@ ggml_cuda_init: found 1 ROCm devices: Device 0: AMD Radeon Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32 | model | size | params | backend | ngl | fa | mmap | test | t/s | | ------------------------------ | ---------: | ---------: | ---------- | --: | -: | ---: | --------------: | -------------------: | -| gpt-oss ?B BF16 | 38.97 GiB | 20.91 B | ROCm | 99 | 1 | 0 | pp512 | 1005.24 ± 32.45 | -| gpt-oss ?B BF16 | 38.97 GiB | 20.91 B | ROCm | 99 | 1 | 0 | tg128 | 26.89 ± 0.00 | +| gpt-oss 20B BF16 | 38.97 GiB | 20.91 B | ROCm | 99 | 1 | 0 | pp512 | 1506.46 ± 15.83 | +| gpt-oss 20B BF16 | 38.97 GiB | 20.91 B | ROCm | 99 | 1 | 0 | tg128 | 26.98 ± 0.00 | -build: de219279 (6181) +build: f1fbffb5 (6486) diff --git a/benchmark/results/gpt-oss-20b-F32__rocm7_rc__hblt0.log b/benchmark/results/gpt-oss-20b-F32__rocm7_rc__hblt0.log index 685527d..9a0a4d9 100644 --- a/benchmark/results/gpt-oss-20b-F32__rocm7_rc__hblt0.log +++ b/benchmark/results/gpt-oss-20b-F32__rocm7_rc__hblt0.log @@ -4,7 +4,7 @@ ggml_cuda_init: found 1 ROCm devices: Device 0: AMD Radeon Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32 | model | size | params | backend | ngl | mmap | test | t/s | | ------------------------------ | ---------: | ---------: | ---------- | --: | ---: | --------------: | -------------------: | -| gpt-oss ?B BF16 | 38.97 GiB | 20.91 B | ROCm | 99 | 0 | pp512 | 1220.02 ± 12.30 | -| gpt-oss ?B BF16 | 38.97 GiB | 20.91 B | ROCm | 99 | 0 | tg128 | 27.17 ± 0.00 | +| gpt-oss 20B BF16 | 38.97 GiB | 20.91 B | ROCm | 99 | 0 | pp512 | 1224.83 ± 11.58 | +| gpt-oss 20B BF16 | 38.97 GiB | 20.91 B | ROCm | 99 | 0 | tg128 | 27.20 ± 0.00 | -build: de219279 (6181) +build: f1fbffb5 (6486) diff --git a/benchmark/results/gpt-oss-20b-F32__rocm7_rc__hblt0__fa1.log b/benchmark/results/gpt-oss-20b-F32__rocm7_rc__hblt0__fa1.log index 18915d7..96a9cd4 100644 --- a/benchmark/results/gpt-oss-20b-F32__rocm7_rc__hblt0__fa1.log +++ b/benchmark/results/gpt-oss-20b-F32__rocm7_rc__hblt0__fa1.log @@ -4,7 +4,7 @@ ggml_cuda_init: found 1 ROCm devices: Device 0: AMD Radeon Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32 | model | size | params | backend | ngl | fa | mmap | test | t/s | | ------------------------------ | ---------: | ---------: | ---------- | --: | -: | ---: | --------------: | -------------------: | -| gpt-oss ?B BF16 | 38.97 GiB | 20.91 B | ROCm | 99 | 1 | 0 | pp512 | 985.58 ± 10.64 | -| gpt-oss ?B BF16 | 38.97 GiB | 20.91 B | ROCm | 99 | 1 | 0 | tg128 | 26.88 ± 0.00 | +| gpt-oss 20B BF16 | 38.97 GiB | 20.91 B | ROCm | 99 | 1 | 0 | pp512 | 1446.22 ± 17.28 | +| gpt-oss 20B BF16 | 38.97 GiB | 20.91 B | ROCm | 99 | 1 | 0 | tg128 | 26.99 ± 0.00 | -build: de219279 (6181) +build: f1fbffb5 (6486) diff --git a/benchmark/results/gpt-oss-20b-F32__vulkan_amdvlk.log b/benchmark/results/gpt-oss-20b-F32__vulkan_amdvlk.log index 381cc89..8e3da78 100644 --- a/benchmark/results/gpt-oss-20b-F32__vulkan_amdvlk.log +++ b/benchmark/results/gpt-oss-20b-F32__vulkan_amdvlk.log @@ -2,7 +2,7 @@ ggml_vulkan: Found 1 Vulkan devices: ggml_vulkan: 0 = Radeon 8060S Graphics (AMD open-source driver) | uma: 1 | fp16: 1 | bf16: 0 | warp size: 64 | shared memory: 32768 | int dot: 1 | matrix cores: KHR_coopmat | model | size | params | backend | ngl | mmap | test | t/s | | ------------------------------ | ---------: | ---------: | ---------- | --: | ---: | --------------: | -------------------: | -| gpt-oss ?B BF16 | 38.97 GiB | 20.91 B | Vulkan | 99 | 0 | pp512 | 367.61 ± 1.90 | -| gpt-oss ?B BF16 | 38.97 GiB | 20.91 B | Vulkan | 99 | 0 | tg128 | 8.69 ± 0.01 | +| gpt-oss 20B BF16 | 38.97 GiB | 20.91 B | Vulkan | 99 | 0 | pp512 | 469.06 ± 6.82 | +| gpt-oss 20B BF16 | 38.97 GiB | 20.91 B | Vulkan | 99 | 0 | tg128 | 15.25 ± 0.01 | -build: 1fe00296 (6182) +build: f1fbffb5 (6486) diff --git a/benchmark/results/gpt-oss-20b-F32__vulkan_amdvlk__fa1.log b/benchmark/results/gpt-oss-20b-F32__vulkan_amdvlk__fa1.log index 4212c20..efee579 100644 --- a/benchmark/results/gpt-oss-20b-F32__vulkan_amdvlk__fa1.log +++ b/benchmark/results/gpt-oss-20b-F32__vulkan_amdvlk__fa1.log @@ -2,7 +2,7 @@ ggml_vulkan: Found 1 Vulkan devices: ggml_vulkan: 0 = Radeon 8060S Graphics (AMD open-source driver) | uma: 1 | fp16: 1 | bf16: 0 | warp size: 64 | shared memory: 32768 | int dot: 1 | matrix cores: KHR_coopmat | model | size | params | backend | ngl | fa | mmap | test | t/s | | ------------------------------ | ---------: | ---------: | ---------- | --: | -: | ---: | --------------: | -------------------: | -| gpt-oss ?B BF16 | 38.97 GiB | 20.91 B | Vulkan | 99 | 1 | 0 | pp512 | 386.12 ± 1.98 | -| gpt-oss ?B BF16 | 38.97 GiB | 20.91 B | Vulkan | 99 | 1 | 0 | tg128 | 8.66 ± 0.01 | +| gpt-oss 20B BF16 | 38.97 GiB | 20.91 B | Vulkan | 99 | 1 | 0 | pp512 | 514.17 ± 6.35 | +| gpt-oss 20B BF16 | 38.97 GiB | 20.91 B | Vulkan | 99 | 1 | 0 | tg128 | 15.13 ± 0.01 | -build: 1fe00296 (6182) +build: f1fbffb5 (6486) diff --git a/benchmark/results/gpt-oss-20b-F32__vulkan_radv.log b/benchmark/results/gpt-oss-20b-F32__vulkan_radv.log index a3f7dda..7b2fe16 100644 --- a/benchmark/results/gpt-oss-20b-F32__vulkan_radv.log +++ b/benchmark/results/gpt-oss-20b-F32__vulkan_radv.log @@ -2,7 +2,7 @@ ggml_vulkan: Found 1 Vulkan devices: ggml_vulkan: 0 = Radeon 8060S Graphics (RADV GFX1151) (radv) | uma: 1 | fp16: 1 | bf16: 0 | warp size: 64 | shared memory: 65536 | int dot: 1 | matrix cores: KHR_coopmat | model | size | params | backend | ngl | mmap | test | t/s | | ------------------------------ | ---------: | ---------: | ---------- | --: | ---: | --------------: | -------------------: | -| gpt-oss ?B BF16 | 38.97 GiB | 20.91 B | Vulkan | 99 | 0 | pp512 | 315.56 ± 1.40 | -| gpt-oss ?B BF16 | 38.97 GiB | 20.91 B | Vulkan | 99 | 0 | tg128 | 7.86 ± 0.01 | +| gpt-oss 20B BF16 | 38.97 GiB | 20.91 B | Vulkan | 99 | 0 | pp512 | 537.62 ± 2.10 | +| gpt-oss 20B BF16 | 38.97 GiB | 20.91 B | Vulkan | 99 | 0 | tg128 | 14.85 ± 0.01 | -build: 1fe00296 (6182) +build: f1fbffb5 (6486) diff --git a/benchmark/results/gpt-oss-20b-F32__vulkan_radv__fa1.log b/benchmark/results/gpt-oss-20b-F32__vulkan_radv__fa1.log index 257f941..830fbc3 100644 --- a/benchmark/results/gpt-oss-20b-F32__vulkan_radv__fa1.log +++ b/benchmark/results/gpt-oss-20b-F32__vulkan_radv__fa1.log @@ -2,7 +2,7 @@ ggml_vulkan: Found 1 Vulkan devices: ggml_vulkan: 0 = Radeon 8060S Graphics (RADV GFX1151) (radv) | uma: 1 | fp16: 1 | bf16: 0 | warp size: 64 | shared memory: 65536 | int dot: 1 | matrix cores: KHR_coopmat | model | size | params | backend | ngl | fa | mmap | test | t/s | | ------------------------------ | ---------: | ---------: | ---------- | --: | -: | ---: | --------------: | -------------------: | -| gpt-oss ?B BF16 | 38.97 GiB | 20.91 B | Vulkan | 99 | 1 | 0 | pp512 | 333.31 ± 1.47 | -| gpt-oss ?B BF16 | 38.97 GiB | 20.91 B | Vulkan | 99 | 1 | 0 | tg128 | 7.92 ± 0.01 | +| gpt-oss 20B BF16 | 38.97 GiB | 20.91 B | Vulkan | 99 | 1 | 0 | pp512 | 583.32 ± 3.38 | +| gpt-oss 20B BF16 | 38.97 GiB | 20.91 B | Vulkan | 99 | 1 | 0 | tg128 | 14.86 ± 0.01 | -build: 1fe00296 (6182) +build: f1fbffb5 (6486) diff --git a/benchmark/results/gpt-oss-20b-mxfp4__rocm6_4_3-rocwmma.log b/benchmark/results/gpt-oss-20b-mxfp4__rocm6_4_3-rocwmma.log index 2c92204..65b10c2 100644 --- a/benchmark/results/gpt-oss-20b-mxfp4__rocm6_4_3-rocwmma.log +++ b/benchmark/results/gpt-oss-20b-mxfp4__rocm6_4_3-rocwmma.log @@ -9,7 +9,7 @@ rocBLAS warning: hipBlasLT failed, falling back to tensile. This message will be only be displayed once, unless the ROCBLAS_VERBOSE_TENSILE_ERROR environment variable is set. | model | size | params | backend | ngl | mmap | test | t/s | | ------------------------------ | ---------: | ---------: | ---------- | --: | ---: | --------------: | -------------------: | -| gpt-oss ?B MXFP4 MoE | 11.27 GiB | 20.91 B | ROCm | 99 | 0 | pp512 | 1184.03 ± 8.37 | -| gpt-oss ?B MXFP4 MoE | 11.27 GiB | 20.91 B | ROCm | 99 | 0 | tg128 | 65.07 ± 0.00 | +| gpt-oss 20B MXFP4 MoE | 11.27 GiB | 20.91 B | ROCm | 99 | 0 | pp512 | 1164.39 ± 11.24 | +| gpt-oss 20B MXFP4 MoE | 11.27 GiB | 20.91 B | ROCm | 99 | 0 | tg128 | 67.35 ± 0.01 | -build: 1fe00296 (6182) +build: f1fbffb5 (6486) diff --git a/benchmark/results/gpt-oss-20b-mxfp4__rocm6_4_3-rocwmma__fa1.log b/benchmark/results/gpt-oss-20b-mxfp4__rocm6_4_3-rocwmma__fa1.log index cd2e38f..1edd08e 100644 --- a/benchmark/results/gpt-oss-20b-mxfp4__rocm6_4_3-rocwmma__fa1.log +++ b/benchmark/results/gpt-oss-20b-mxfp4__rocm6_4_3-rocwmma__fa1.log @@ -4,7 +4,7 @@ ggml_cuda_init: found 1 ROCm devices: Device 0: Radeon 8060S Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32 | model | size | params | backend | ngl | fa | mmap | test | t/s | | ------------------------------ | ---------: | ---------: | ---------- | --: | -: | ---: | --------------: | -------------------: | -| gpt-oss ?B MXFP4 MoE | 11.27 GiB | 20.91 B | ROCm | 99 | 1 | 0 | pp512 | 1480.28 ± 9.38 | -| gpt-oss ?B MXFP4 MoE | 11.27 GiB | 20.91 B | ROCm | 99 | 1 | 0 | tg128 | 64.45 ± 0.02 | +| gpt-oss 20B MXFP4 MoE | 11.27 GiB | 20.91 B | ROCm | 99 | 1 | 0 | pp512 | 1531.44 ± 9.83 | +| gpt-oss 20B MXFP4 MoE | 11.27 GiB | 20.91 B | ROCm | 99 | 1 | 0 | tg128 | 65.78 ± 0.01 | -build: 1fe00296 (6182) +build: f1fbffb5 (6486) diff --git a/benchmark/results/gpt-oss-20b-mxfp4__rocm6_4_3-rocwmma__hblt0.log b/benchmark/results/gpt-oss-20b-mxfp4__rocm6_4_3-rocwmma__hblt0.log index f7e992e..1f991a5 100644 --- a/benchmark/results/gpt-oss-20b-mxfp4__rocm6_4_3-rocwmma__hblt0.log +++ b/benchmark/results/gpt-oss-20b-mxfp4__rocm6_4_3-rocwmma__hblt0.log @@ -4,7 +4,7 @@ ggml_cuda_init: found 1 ROCm devices: Device 0: Radeon 8060S Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32 | model | size | params | backend | ngl | mmap | test | t/s | | ------------------------------ | ---------: | ---------: | ---------- | --: | ---: | --------------: | -------------------: | -| gpt-oss ?B MXFP4 MoE | 11.27 GiB | 20.91 B | ROCm | 99 | 0 | pp512 | 584.04 ± 2.52 | -| gpt-oss ?B MXFP4 MoE | 11.27 GiB | 20.91 B | ROCm | 99 | 0 | tg128 | 64.87 ± 0.02 | +| gpt-oss 20B MXFP4 MoE | 11.27 GiB | 20.91 B | ROCm | 99 | 0 | pp512 | 590.66 ± 1.42 | +| gpt-oss 20B MXFP4 MoE | 11.27 GiB | 20.91 B | ROCm | 99 | 0 | tg128 | 67.35 ± 0.01 | -build: 1fe00296 (6182) +build: f1fbffb5 (6486) diff --git a/benchmark/results/gpt-oss-20b-mxfp4__rocm6_4_3-rocwmma__hblt0__fa1.log b/benchmark/results/gpt-oss-20b-mxfp4__rocm6_4_3-rocwmma__hblt0__fa1.log index 2016c11..084ea0a 100644 --- a/benchmark/results/gpt-oss-20b-mxfp4__rocm6_4_3-rocwmma__hblt0__fa1.log +++ b/benchmark/results/gpt-oss-20b-mxfp4__rocm6_4_3-rocwmma__hblt0__fa1.log @@ -4,7 +4,7 @@ ggml_cuda_init: found 1 ROCm devices: Device 0: Radeon 8060S Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32 | model | size | params | backend | ngl | fa | mmap | test | t/s | | ------------------------------ | ---------: | ---------: | ---------- | --: | -: | ---: | --------------: | -------------------: | -| gpt-oss ?B MXFP4 MoE | 11.27 GiB | 20.91 B | ROCm | 99 | 1 | 0 | pp512 | 643.25 ± 3.86 | -| gpt-oss ?B MXFP4 MoE | 11.27 GiB | 20.91 B | ROCm | 99 | 1 | 0 | tg128 | 64.67 ± 0.01 | +| gpt-oss 20B MXFP4 MoE | 11.27 GiB | 20.91 B | ROCm | 99 | 1 | 0 | pp512 | 653.80 ± 1.17 | +| gpt-oss 20B MXFP4 MoE | 11.27 GiB | 20.91 B | ROCm | 99 | 1 | 0 | tg128 | 65.72 ± 0.01 | -build: 1fe00296 (6182) +build: f1fbffb5 (6486) diff --git a/benchmark/results/gpt-oss-20b-mxfp4__rocm6_4_3.log b/benchmark/results/gpt-oss-20b-mxfp4__rocm6_4_3.log index d5473e4..aae04b7 100644 --- a/benchmark/results/gpt-oss-20b-mxfp4__rocm6_4_3.log +++ b/benchmark/results/gpt-oss-20b-mxfp4__rocm6_4_3.log @@ -9,7 +9,7 @@ rocBLAS warning: hipBlasLT failed, falling back to tensile. This message will be only be displayed once, unless the ROCBLAS_VERBOSE_TENSILE_ERROR environment variable is set. | model | size | params | backend | ngl | mmap | test | t/s | | ------------------------------ | ---------: | ---------: | ---------- | --: | ---: | --------------: | -------------------: | -| gpt-oss ?B MXFP4 MoE | 11.27 GiB | 20.91 B | ROCm | 99 | 0 | pp512 | 1171.02 ± 7.04 | -| gpt-oss ?B MXFP4 MoE | 11.27 GiB | 20.91 B | ROCm | 99 | 0 | tg128 | 64.94 ± 0.04 | +| gpt-oss 20B MXFP4 MoE | 11.27 GiB | 20.91 B | ROCm | 99 | 0 | pp512 | 1160.12 ± 12.72 | +| gpt-oss 20B MXFP4 MoE | 11.27 GiB | 20.91 B | ROCm | 99 | 0 | tg128 | 67.19 ± 0.00 | -build: 1fe00296 (6182) +build: f1fbffb5 (6486) diff --git a/benchmark/results/gpt-oss-20b-mxfp4__rocm6_4_3__fa1.log b/benchmark/results/gpt-oss-20b-mxfp4__rocm6_4_3__fa1.log index abd5fd2..eb25e8e 100644 --- a/benchmark/results/gpt-oss-20b-mxfp4__rocm6_4_3__fa1.log +++ b/benchmark/results/gpt-oss-20b-mxfp4__rocm6_4_3__fa1.log @@ -4,7 +4,7 @@ ggml_cuda_init: found 1 ROCm devices: Device 0: Radeon 8060S Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32 | model | size | params | backend | ngl | fa | mmap | test | t/s | | ------------------------------ | ---------: | ---------: | ---------- | --: | -: | ---: | --------------: | -------------------: | -| gpt-oss ?B MXFP4 MoE | 11.27 GiB | 20.91 B | ROCm | 99 | 1 | 0 | pp512 | 996.31 ± 6.53 | -| gpt-oss ?B MXFP4 MoE | 11.27 GiB | 20.91 B | ROCm | 99 | 1 | 0 | tg128 | 63.68 ± 0.01 | +| gpt-oss 20B MXFP4 MoE | 11.27 GiB | 20.91 B | ROCm | 99 | 1 | 0 | pp512 | 1539.79 ± 14.33 | +| gpt-oss 20B MXFP4 MoE | 11.27 GiB | 20.91 B | ROCm | 99 | 1 | 0 | tg128 | 65.81 ± 0.01 | -build: 1fe00296 (6182) +build: f1fbffb5 (6486) diff --git a/benchmark/results/gpt-oss-20b-mxfp4__rocm6_4_3__hblt0.log b/benchmark/results/gpt-oss-20b-mxfp4__rocm6_4_3__hblt0.log index be6782a..cee8253 100644 --- a/benchmark/results/gpt-oss-20b-mxfp4__rocm6_4_3__hblt0.log +++ b/benchmark/results/gpt-oss-20b-mxfp4__rocm6_4_3__hblt0.log @@ -4,7 +4,7 @@ ggml_cuda_init: found 1 ROCm devices: Device 0: Radeon 8060S Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32 | model | size | params | backend | ngl | mmap | test | t/s | | ------------------------------ | ---------: | ---------: | ---------- | --: | ---: | --------------: | -------------------: | -| gpt-oss ?B MXFP4 MoE | 11.27 GiB | 20.91 B | ROCm | 99 | 0 | pp512 | 582.51 ± 2.41 | -| gpt-oss ?B MXFP4 MoE | 11.27 GiB | 20.91 B | ROCm | 99 | 0 | tg128 | 64.89 ± 0.01 | +| gpt-oss 20B MXFP4 MoE | 11.27 GiB | 20.91 B | ROCm | 99 | 0 | pp512 | 591.28 ± 2.68 | +| gpt-oss 20B MXFP4 MoE | 11.27 GiB | 20.91 B | ROCm | 99 | 0 | tg128 | 67.35 ± 0.01 | -build: 1fe00296 (6182) +build: f1fbffb5 (6486) diff --git a/benchmark/results/gpt-oss-20b-mxfp4__rocm6_4_3__hblt0__fa1.log b/benchmark/results/gpt-oss-20b-mxfp4__rocm6_4_3__hblt0__fa1.log index 90bc0d7..7dbc6fc 100644 --- a/benchmark/results/gpt-oss-20b-mxfp4__rocm6_4_3__hblt0__fa1.log +++ b/benchmark/results/gpt-oss-20b-mxfp4__rocm6_4_3__hblt0__fa1.log @@ -4,7 +4,7 @@ ggml_cuda_init: found 1 ROCm devices: Device 0: Radeon 8060S Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32 | model | size | params | backend | ngl | fa | mmap | test | t/s | | ------------------------------ | ---------: | ---------: | ---------- | --: | -: | ---: | --------------: | -------------------: | -| gpt-oss ?B MXFP4 MoE | 11.27 GiB | 20.91 B | ROCm | 99 | 1 | 0 | pp512 | 522.63 ± 1.74 | -| gpt-oss ?B MXFP4 MoE | 11.27 GiB | 20.91 B | ROCm | 99 | 1 | 0 | tg128 | 63.66 ± 0.03 | +| gpt-oss 20B MXFP4 MoE | 11.27 GiB | 20.91 B | ROCm | 99 | 1 | 0 | pp512 | 655.10 ± 1.75 | +| gpt-oss 20B MXFP4 MoE | 11.27 GiB | 20.91 B | ROCm | 99 | 1 | 0 | tg128 | 65.85 ± 0.01 | -build: 1fe00296 (6182) +build: f1fbffb5 (6486) diff --git a/benchmark/results/gpt-oss-20b-mxfp4__rocm7_rc-rocwmma-fa_all_quants.log b/benchmark/results/gpt-oss-20b-mxfp4__rocm7_rc-rocwmma-fa_all_quants.log new file mode 100644 index 0000000..a52d858 --- /dev/null +++ b/benchmark/results/gpt-oss-20b-mxfp4__rocm7_rc-rocwmma-fa_all_quants.log @@ -0,0 +1,10 @@ +ggml_cuda_init: GGML_CUDA_FORCE_MMQ: no +ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no +ggml_cuda_init: found 1 ROCm devices: + Device 0: AMD Radeon Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32 +| model | size | params | backend | ngl | mmap | test | t/s | +| ------------------------------ | ---------: | ---------: | ---------- | --: | ---: | --------------: | -------------------: | +| gpt-oss 20B MXFP4 MoE | 11.27 GiB | 20.91 B | ROCm | 99 | 0 | pp512 | 1222.12 ± 10.04 | +| gpt-oss 20B MXFP4 MoE | 11.27 GiB | 20.91 B | ROCm | 99 | 0 | tg128 | 67.34 ± 0.01 | + +build: f1fbffb5 (6486) diff --git a/benchmark/results/gpt-oss-20b-mxfp4__rocm7_rc-rocwmma-fa_all_quants__fa1.log b/benchmark/results/gpt-oss-20b-mxfp4__rocm7_rc-rocwmma-fa_all_quants__fa1.log new file mode 100644 index 0000000..5070ee2 --- /dev/null +++ b/benchmark/results/gpt-oss-20b-mxfp4__rocm7_rc-rocwmma-fa_all_quants__fa1.log @@ -0,0 +1,10 @@ +ggml_cuda_init: GGML_CUDA_FORCE_MMQ: no +ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no +ggml_cuda_init: found 1 ROCm devices: + Device 0: AMD Radeon Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32 +| model | size | params | backend | ngl | fa | mmap | test | t/s | +| ------------------------------ | ---------: | ---------: | ---------- | --: | -: | ---: | --------------: | -------------------: | +| gpt-oss 20B MXFP4 MoE | 11.27 GiB | 20.91 B | ROCm | 99 | 1 | 0 | pp512 | 1515.09 ± 6.22 | +| gpt-oss 20B MXFP4 MoE | 11.27 GiB | 20.91 B | ROCm | 99 | 1 | 0 | tg128 | 65.75 ± 0.00 | + +build: f1fbffb5 (6486) diff --git a/benchmark/results/gpt-oss-20b-mxfp4__rocm7_rc-rocwmma-fa_all_quants__hblt0.log b/benchmark/results/gpt-oss-20b-mxfp4__rocm7_rc-rocwmma-fa_all_quants__hblt0.log new file mode 100644 index 0000000..598c877 --- /dev/null +++ b/benchmark/results/gpt-oss-20b-mxfp4__rocm7_rc-rocwmma-fa_all_quants__hblt0.log @@ -0,0 +1,10 @@ +ggml_cuda_init: GGML_CUDA_FORCE_MMQ: no +ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no +ggml_cuda_init: found 1 ROCm devices: + Device 0: AMD Radeon Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32 +| model | size | params | backend | ngl | mmap | test | t/s | +| ------------------------------ | ---------: | ---------: | ---------- | --: | ---: | --------------: | -------------------: | +| gpt-oss 20B MXFP4 MoE | 11.27 GiB | 20.91 B | ROCm | 99 | 0 | pp512 | 1335.14 ± 17.02 | +| gpt-oss 20B MXFP4 MoE | 11.27 GiB | 20.91 B | ROCm | 99 | 0 | tg128 | 67.33 ± 0.01 | + +build: f1fbffb5 (6486) diff --git a/benchmark/results/gpt-oss-20b-mxfp4__rocm7_rc-rocwmma-fa_all_quants__hblt0__fa1.log b/benchmark/results/gpt-oss-20b-mxfp4__rocm7_rc-rocwmma-fa_all_quants__hblt0__fa1.log new file mode 100644 index 0000000..8fe24ad --- /dev/null +++ b/benchmark/results/gpt-oss-20b-mxfp4__rocm7_rc-rocwmma-fa_all_quants__hblt0__fa1.log @@ -0,0 +1,10 @@ +ggml_cuda_init: GGML_CUDA_FORCE_MMQ: no +ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no +ggml_cuda_init: found 1 ROCm devices: + Device 0: AMD Radeon Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32 +| model | size | params | backend | ngl | fa | mmap | test | t/s | +| ------------------------------ | ---------: | ---------: | ---------- | --: | -: | ---: | --------------: | -------------------: | +| gpt-oss 20B MXFP4 MoE | 11.27 GiB | 20.91 B | ROCm | 99 | 1 | 0 | pp512 | 1562.66 ± 9.76 | +| gpt-oss 20B MXFP4 MoE | 11.27 GiB | 20.91 B | ROCm | 99 | 1 | 0 | tg128 | 65.84 ± 0.01 | + +build: f1fbffb5 (6486) diff --git a/benchmark/results/gpt-oss-20b-mxfp4__rocm7_rc-rocwmma.log b/benchmark/results/gpt-oss-20b-mxfp4__rocm7_rc-rocwmma.log index 8e4bdde..f28a63e 100644 --- a/benchmark/results/gpt-oss-20b-mxfp4__rocm7_rc-rocwmma.log +++ b/benchmark/results/gpt-oss-20b-mxfp4__rocm7_rc-rocwmma.log @@ -4,7 +4,7 @@ ggml_cuda_init: found 1 ROCm devices: Device 0: AMD Radeon Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32 | model | size | params | backend | ngl | mmap | test | t/s | | ------------------------------ | ---------: | ---------: | ---------- | --: | ---: | --------------: | -------------------: | -| gpt-oss ?B MXFP4 MoE | 11.27 GiB | 20.91 B | ROCm | 99 | 0 | pp512 | 1236.64 ± 11.20 | -| gpt-oss ?B MXFP4 MoE | 11.27 GiB | 20.91 B | ROCm | 99 | 0 | tg128 | 64.78 ± 0.01 | +| gpt-oss 20B MXFP4 MoE | 11.27 GiB | 20.91 B | ROCm | 99 | 0 | pp512 | 1215.59 ± 8.93 | +| gpt-oss 20B MXFP4 MoE | 11.27 GiB | 20.91 B | ROCm | 99 | 0 | tg128 | 67.39 ± 0.00 | -build: de219279 (6181) +build: f1fbffb5 (6486) diff --git a/benchmark/results/gpt-oss-20b-mxfp4__rocm7_rc-rocwmma__fa1.log b/benchmark/results/gpt-oss-20b-mxfp4__rocm7_rc-rocwmma__fa1.log index c87a02c..c75c722 100644 --- a/benchmark/results/gpt-oss-20b-mxfp4__rocm7_rc-rocwmma__fa1.log +++ b/benchmark/results/gpt-oss-20b-mxfp4__rocm7_rc-rocwmma__fa1.log @@ -4,7 +4,7 @@ ggml_cuda_init: found 1 ROCm devices: Device 0: AMD Radeon Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32 | model | size | params | backend | ngl | fa | mmap | test | t/s | | ------------------------------ | ---------: | ---------: | ---------- | --: | -: | ---: | --------------: | -------------------: | -| gpt-oss ?B MXFP4 MoE | 11.27 GiB | 20.91 B | ROCm | 99 | 1 | 0 | pp512 | 1460.58 ± 11.92 | -| gpt-oss ?B MXFP4 MoE | 11.27 GiB | 20.91 B | ROCm | 99 | 1 | 0 | tg128 | 64.26 ± 0.00 | +| gpt-oss 20B MXFP4 MoE | 11.27 GiB | 20.91 B | ROCm | 99 | 1 | 0 | pp512 | 1521.41 ± 10.84 | +| gpt-oss 20B MXFP4 MoE | 11.27 GiB | 20.91 B | ROCm | 99 | 1 | 0 | tg128 | 65.89 ± 0.01 | -build: de219279 (6181) +build: f1fbffb5 (6486) diff --git a/benchmark/results/gpt-oss-20b-mxfp4__rocm7_rc-rocwmma__hblt0.log b/benchmark/results/gpt-oss-20b-mxfp4__rocm7_rc-rocwmma__hblt0.log index 2dd9efb..fdcb0a4 100644 --- a/benchmark/results/gpt-oss-20b-mxfp4__rocm7_rc-rocwmma__hblt0.log +++ b/benchmark/results/gpt-oss-20b-mxfp4__rocm7_rc-rocwmma__hblt0.log @@ -4,7 +4,7 @@ ggml_cuda_init: found 1 ROCm devices: Device 0: AMD Radeon Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32 | model | size | params | backend | ngl | mmap | test | t/s | | ------------------------------ | ---------: | ---------: | ---------- | --: | ---: | --------------: | -------------------: | -| gpt-oss ?B MXFP4 MoE | 11.27 GiB | 20.91 B | ROCm | 99 | 0 | pp512 | 1299.34 ± 7.77 | -| gpt-oss ?B MXFP4 MoE | 11.27 GiB | 20.91 B | ROCm | 99 | 0 | tg128 | 64.85 ± 0.00 | +| gpt-oss 20B MXFP4 MoE | 11.27 GiB | 20.91 B | ROCm | 99 | 0 | pp512 | 1334.89 ± 9.58 | +| gpt-oss 20B MXFP4 MoE | 11.27 GiB | 20.91 B | ROCm | 99 | 0 | tg128 | 67.43 ± 0.00 | -build: de219279 (6181) +build: f1fbffb5 (6486) diff --git a/benchmark/results/gpt-oss-20b-mxfp4__rocm7_rc-rocwmma__hblt0__fa1.log b/benchmark/results/gpt-oss-20b-mxfp4__rocm7_rc-rocwmma__hblt0__fa1.log index fc2acae..f407d2f 100644 --- a/benchmark/results/gpt-oss-20b-mxfp4__rocm7_rc-rocwmma__hblt0__fa1.log +++ b/benchmark/results/gpt-oss-20b-mxfp4__rocm7_rc-rocwmma__hblt0__fa1.log @@ -4,7 +4,7 @@ ggml_cuda_init: found 1 ROCm devices: Device 0: AMD Radeon Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32 | model | size | params | backend | ngl | fa | mmap | test | t/s | | ------------------------------ | ---------: | ---------: | ---------- | --: | -: | ---: | --------------: | -------------------: | -| gpt-oss ?B MXFP4 MoE | 11.27 GiB | 20.91 B | ROCm | 99 | 1 | 0 | pp512 | 1516.33 ± 21.51 | -| gpt-oss ?B MXFP4 MoE | 11.27 GiB | 20.91 B | ROCm | 99 | 1 | 0 | tg128 | 64.40 ± 0.01 | +| gpt-oss 20B MXFP4 MoE | 11.27 GiB | 20.91 B | ROCm | 99 | 1 | 0 | pp512 | 1567.58 ± 12.62 | +| gpt-oss 20B MXFP4 MoE | 11.27 GiB | 20.91 B | ROCm | 99 | 1 | 0 | tg128 | 65.78 ± 0.01 | -build: de219279 (6181) +build: f1fbffb5 (6486) diff --git a/benchmark/results/gpt-oss-20b-mxfp4__rocm7_rc.log b/benchmark/results/gpt-oss-20b-mxfp4__rocm7_rc.log index 34dc4e8..0607b26 100644 --- a/benchmark/results/gpt-oss-20b-mxfp4__rocm7_rc.log +++ b/benchmark/results/gpt-oss-20b-mxfp4__rocm7_rc.log @@ -4,7 +4,7 @@ ggml_cuda_init: found 1 ROCm devices: Device 0: AMD Radeon Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32 | model | size | params | backend | ngl | mmap | test | t/s | | ------------------------------ | ---------: | ---------: | ---------- | --: | ---: | --------------: | -------------------: | -| gpt-oss ?B MXFP4 MoE | 11.27 GiB | 20.91 B | ROCm | 99 | 0 | pp512 | 1246.14 ± 8.32 | -| gpt-oss ?B MXFP4 MoE | 11.27 GiB | 20.91 B | ROCm | 99 | 0 | tg128 | 65.15 ± 0.01 | +| gpt-oss 20B MXFP4 MoE | 11.27 GiB | 20.91 B | ROCm | 99 | 0 | pp512 | 1219.34 ± 5.57 | +| gpt-oss 20B MXFP4 MoE | 11.27 GiB | 20.91 B | ROCm | 99 | 0 | tg128 | 67.37 ± 0.00 | -build: de219279 (6181) +build: f1fbffb5 (6486) diff --git a/benchmark/results/gpt-oss-20b-mxfp4__rocm7_rc__fa1.log b/benchmark/results/gpt-oss-20b-mxfp4__rocm7_rc__fa1.log index f6feec3..120f9df 100644 --- a/benchmark/results/gpt-oss-20b-mxfp4__rocm7_rc__fa1.log +++ b/benchmark/results/gpt-oss-20b-mxfp4__rocm7_rc__fa1.log @@ -4,7 +4,7 @@ ggml_cuda_init: found 1 ROCm devices: Device 0: AMD Radeon Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32 | model | size | params | backend | ngl | fa | mmap | test | t/s | | ------------------------------ | ---------: | ---------: | ---------- | --: | -: | ---: | --------------: | -------------------: | -| gpt-oss ?B MXFP4 MoE | 11.27 GiB | 20.91 B | ROCm | 99 | 1 | 0 | pp512 | 1010.38 ± 6.35 | -| gpt-oss ?B MXFP4 MoE | 11.27 GiB | 20.91 B | ROCm | 99 | 1 | 0 | tg128 | 63.49 ± 0.01 | +| gpt-oss 20B MXFP4 MoE | 11.27 GiB | 20.91 B | ROCm | 99 | 1 | 0 | pp512 | 1530.70 ± 9.71 | +| gpt-oss 20B MXFP4 MoE | 11.27 GiB | 20.91 B | ROCm | 99 | 1 | 0 | tg128 | 65.84 ± 0.00 | -build: de219279 (6181) +build: f1fbffb5 (6486) diff --git a/benchmark/results/gpt-oss-20b-mxfp4__rocm7_rc__hblt0.log b/benchmark/results/gpt-oss-20b-mxfp4__rocm7_rc__hblt0.log index 425b7fd..079a2c7 100644 --- a/benchmark/results/gpt-oss-20b-mxfp4__rocm7_rc__hblt0.log +++ b/benchmark/results/gpt-oss-20b-mxfp4__rocm7_rc__hblt0.log @@ -4,7 +4,7 @@ ggml_cuda_init: found 1 ROCm devices: Device 0: AMD Radeon Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32 | model | size | params | backend | ngl | mmap | test | t/s | | ------------------------------ | ---------: | ---------: | ---------- | --: | ---: | --------------: | -------------------: | -| gpt-oss ?B MXFP4 MoE | 11.27 GiB | 20.91 B | ROCm | 99 | 0 | pp512 | 1303.74 ± 6.94 | -| gpt-oss ?B MXFP4 MoE | 11.27 GiB | 20.91 B | ROCm | 99 | 0 | tg128 | 65.10 ± 0.01 | +| gpt-oss 20B MXFP4 MoE | 11.27 GiB | 20.91 B | ROCm | 99 | 0 | pp512 | 1331.00 ± 21.19 | +| gpt-oss 20B MXFP4 MoE | 11.27 GiB | 20.91 B | ROCm | 99 | 0 | tg128 | 67.41 ± 0.01 | -build: de219279 (6181) +build: f1fbffb5 (6486) diff --git a/benchmark/results/gpt-oss-20b-mxfp4__rocm7_rc__hblt0__fa1.log b/benchmark/results/gpt-oss-20b-mxfp4__rocm7_rc__hblt0__fa1.log index 0bd1151..1d5c6f3 100644 --- a/benchmark/results/gpt-oss-20b-mxfp4__rocm7_rc__hblt0__fa1.log +++ b/benchmark/results/gpt-oss-20b-mxfp4__rocm7_rc__hblt0__fa1.log @@ -4,7 +4,7 @@ ggml_cuda_init: found 1 ROCm devices: Device 0: AMD Radeon Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32 | model | size | params | backend | ngl | fa | mmap | test | t/s | | ------------------------------ | ---------: | ---------: | ---------- | --: | -: | ---: | --------------: | -------------------: | -| gpt-oss ?B MXFP4 MoE | 11.27 GiB | 20.91 B | ROCm | 99 | 1 | 0 | pp512 | 1037.92 ± 11.67 | -| gpt-oss ?B MXFP4 MoE | 11.27 GiB | 20.91 B | ROCm | 99 | 1 | 0 | tg128 | 63.63 ± 0.01 | +| gpt-oss 20B MXFP4 MoE | 11.27 GiB | 20.91 B | ROCm | 99 | 1 | 0 | pp512 | 1575.63 ± 16.60 | +| gpt-oss 20B MXFP4 MoE | 11.27 GiB | 20.91 B | ROCm | 99 | 1 | 0 | tg128 | 65.76 ± 0.01 | -build: de219279 (6181) +build: f1fbffb5 (6486) diff --git a/benchmark/results/gpt-oss-20b-mxfp4__vulkan_amdvlk.log b/benchmark/results/gpt-oss-20b-mxfp4__vulkan_amdvlk.log index 10fffaa..3df5436 100644 --- a/benchmark/results/gpt-oss-20b-mxfp4__vulkan_amdvlk.log +++ b/benchmark/results/gpt-oss-20b-mxfp4__vulkan_amdvlk.log @@ -2,7 +2,7 @@ ggml_vulkan: Found 1 Vulkan devices: ggml_vulkan: 0 = Radeon 8060S Graphics (AMD open-source driver) | uma: 1 | fp16: 1 | bf16: 0 | warp size: 64 | shared memory: 32768 | int dot: 1 | matrix cores: KHR_coopmat | model | size | params | backend | ngl | mmap | test | t/s | | ------------------------------ | ---------: | ---------: | ---------- | --: | ---: | --------------: | -------------------: | -| gpt-oss ?B MXFP4 MoE | 11.27 GiB | 20.91 B | Vulkan | 99 | 0 | pp512 | 1220.69 ± 8.95 | -| gpt-oss ?B MXFP4 MoE | 11.27 GiB | 20.91 B | Vulkan | 99 | 0 | tg128 | 71.42 ± 0.20 | +| gpt-oss 20B MXFP4 MoE | 11.27 GiB | 20.91 B | Vulkan | 99 | 0 | pp512 | 1498.39 ± 12.53 | +| gpt-oss 20B MXFP4 MoE | 11.27 GiB | 20.91 B | Vulkan | 99 | 0 | tg128 | 74.08 ± 0.09 | -build: 1fe00296 (6182) +build: f1fbffb5 (6486) diff --git a/benchmark/results/gpt-oss-20b-mxfp4__vulkan_amdvlk__fa1.log b/benchmark/results/gpt-oss-20b-mxfp4__vulkan_amdvlk__fa1.log index 185b49c..51abfd5 100644 --- a/benchmark/results/gpt-oss-20b-mxfp4__vulkan_amdvlk__fa1.log +++ b/benchmark/results/gpt-oss-20b-mxfp4__vulkan_amdvlk__fa1.log @@ -2,7 +2,7 @@ ggml_vulkan: Found 1 Vulkan devices: ggml_vulkan: 0 = Radeon 8060S Graphics (AMD open-source driver) | uma: 1 | fp16: 1 | bf16: 0 | warp size: 64 | shared memory: 32768 | int dot: 1 | matrix cores: KHR_coopmat | model | size | params | backend | ngl | fa | mmap | test | t/s | | ------------------------------ | ---------: | ---------: | ---------- | --: | -: | ---: | --------------: | -------------------: | -| gpt-oss ?B MXFP4 MoE | 11.27 GiB | 20.91 B | Vulkan | 99 | 1 | 0 | pp512 | 1467.61 ± 12.70 | -| gpt-oss ?B MXFP4 MoE | 11.27 GiB | 20.91 B | Vulkan | 99 | 1 | 0 | tg128 | 69.47 ± 0.09 | +| gpt-oss 20B MXFP4 MoE | 11.27 GiB | 20.91 B | Vulkan | 99 | 1 | 0 | pp512 | 1914.72 ± 22.77 | +| gpt-oss 20B MXFP4 MoE | 11.27 GiB | 20.91 B | Vulkan | 99 | 1 | 0 | tg128 | 72.57 ± 0.12 | -build: 1fe00296 (6182) +build: f1fbffb5 (6486) diff --git a/benchmark/results/gpt-oss-20b-mxfp4__vulkan_radv.log b/benchmark/results/gpt-oss-20b-mxfp4__vulkan_radv.log index cf40790..460e04d 100644 --- a/benchmark/results/gpt-oss-20b-mxfp4__vulkan_radv.log +++ b/benchmark/results/gpt-oss-20b-mxfp4__vulkan_radv.log @@ -2,7 +2,7 @@ ggml_vulkan: Found 1 Vulkan devices: ggml_vulkan: 0 = Radeon 8060S Graphics (RADV GFX1151) (radv) | uma: 1 | fp16: 1 | bf16: 0 | warp size: 64 | shared memory: 65536 | int dot: 1 | matrix cores: KHR_coopmat | model | size | params | backend | ngl | mmap | test | t/s | | ------------------------------ | ---------: | ---------: | ---------- | --: | ---: | --------------: | -------------------: | -| gpt-oss ?B MXFP4 MoE | 11.27 GiB | 20.91 B | Vulkan | 99 | 0 | pp512 | 651.21 ± 5.24 | -| gpt-oss ?B MXFP4 MoE | 11.27 GiB | 20.91 B | Vulkan | 99 | 0 | tg128 | 72.35 ± 0.08 | +| gpt-oss 20B MXFP4 MoE | 11.27 GiB | 20.91 B | Vulkan | 99 | 0 | pp512 | 1002.66 ± 7.71 | +| gpt-oss 20B MXFP4 MoE | 11.27 GiB | 20.91 B | Vulkan | 99 | 0 | tg128 | 74.77 ± 0.18 | -build: 1fe00296 (6182) +build: f1fbffb5 (6486) diff --git a/benchmark/results/gpt-oss-20b-mxfp4__vulkan_radv__fa1.log b/benchmark/results/gpt-oss-20b-mxfp4__vulkan_radv__fa1.log index ed9a28f..2a7699d 100644 --- a/benchmark/results/gpt-oss-20b-mxfp4__vulkan_radv__fa1.log +++ b/benchmark/results/gpt-oss-20b-mxfp4__vulkan_radv__fa1.log @@ -2,7 +2,7 @@ ggml_vulkan: Found 1 Vulkan devices: ggml_vulkan: 0 = Radeon 8060S Graphics (RADV GFX1151) (radv) | uma: 1 | fp16: 1 | bf16: 0 | warp size: 64 | shared memory: 65536 | int dot: 1 | matrix cores: KHR_coopmat | model | size | params | backend | ngl | fa | mmap | test | t/s | | ------------------------------ | ---------: | ---------: | ---------- | --: | -: | ---: | --------------: | -------------------: | -| gpt-oss ?B MXFP4 MoE | 11.27 GiB | 20.91 B | Vulkan | 99 | 1 | 0 | pp512 | 732.35 ± 7.51 | -| gpt-oss ?B MXFP4 MoE | 11.27 GiB | 20.91 B | Vulkan | 99 | 1 | 0 | tg128 | 72.05 ± 0.07 | +| gpt-oss 20B MXFP4 MoE | 11.27 GiB | 20.91 B | Vulkan | 99 | 1 | 0 | pp512 | 1204.49 ± 13.52 | +| gpt-oss 20B MXFP4 MoE | 11.27 GiB | 20.91 B | Vulkan | 99 | 1 | 0 | tg128 | 74.94 ± 0.14 | -build: 1fe00296 (6182) +build: f1fbffb5 (6486) diff --git a/benchmark/results/llama-2-7b.Q4_0__rocm6_4_3-rocwmma.log b/benchmark/results/llama-2-7b.Q4_0__rocm6_4_3-rocwmma.log index 2220109..a4d3dc7 100644 --- a/benchmark/results/llama-2-7b.Q4_0__rocm6_4_3-rocwmma.log +++ b/benchmark/results/llama-2-7b.Q4_0__rocm6_4_3-rocwmma.log @@ -4,7 +4,7 @@ ggml_cuda_init: found 1 ROCm devices: Device 0: Radeon 8060S Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32 | model | size | params | backend | ngl | mmap | test | t/s | | ------------------------------ | ---------: | ---------: | ---------- | --: | ---: | --------------: | -------------------: | -| llama 7B Q4_0 | 3.56 GiB | 6.74 B | ROCm | 99 | 0 | pp512 | 981.76 ± 1.61 | -| llama 7B Q4_0 | 3.56 GiB | 6.74 B | ROCm | 99 | 0 | tg128 | 49.26 ± 0.00 | +| llama 7B Q4_0 | 3.56 GiB | 6.74 B | ROCm | 99 | 0 | pp512 | 979.46 ± 1.57 | +| llama 7B Q4_0 | 3.56 GiB | 6.74 B | ROCm | 99 | 0 | tg128 | 49.90 ± 0.02 | -build: 1fe00296 (6182) +build: f1fbffb5 (6486) diff --git a/benchmark/results/llama-2-7b.Q4_0__rocm6_4_3-rocwmma__fa1.log b/benchmark/results/llama-2-7b.Q4_0__rocm6_4_3-rocwmma__fa1.log index 79d7534..2b466fa 100644 --- a/benchmark/results/llama-2-7b.Q4_0__rocm6_4_3-rocwmma__fa1.log +++ b/benchmark/results/llama-2-7b.Q4_0__rocm6_4_3-rocwmma__fa1.log @@ -4,7 +4,7 @@ ggml_cuda_init: found 1 ROCm devices: Device 0: Radeon 8060S Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32 | model | size | params | backend | ngl | fa | mmap | test | t/s | | ------------------------------ | ---------: | ---------: | ---------- | --: | -: | ---: | --------------: | -------------------: | -| llama 7B Q4_0 | 3.56 GiB | 6.74 B | ROCm | 99 | 1 | 0 | pp512 | 1096.97 ± 5.09 | -| llama 7B Q4_0 | 3.56 GiB | 6.74 B | ROCm | 99 | 1 | 0 | tg128 | 48.33 ± 0.00 | +| llama 7B Q4_0 | 3.56 GiB | 6.74 B | ROCm | 99 | 1 | 0 | pp512 | 1100.15 ± 1.95 | +| llama 7B Q4_0 | 3.56 GiB | 6.74 B | ROCm | 99 | 1 | 0 | tg128 | 49.29 ± 0.01 | -build: 1fe00296 (6182) +build: f1fbffb5 (6486) diff --git a/benchmark/results/llama-2-7b.Q4_0__rocm6_4_3-rocwmma__hblt0.log b/benchmark/results/llama-2-7b.Q4_0__rocm6_4_3-rocwmma__hblt0.log index 54ebe40..d2b3dfa 100644 --- a/benchmark/results/llama-2-7b.Q4_0__rocm6_4_3-rocwmma__hblt0.log +++ b/benchmark/results/llama-2-7b.Q4_0__rocm6_4_3-rocwmma__hblt0.log @@ -4,7 +4,7 @@ ggml_cuda_init: found 1 ROCm devices: Device 0: Radeon 8060S Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32 | model | size | params | backend | ngl | mmap | test | t/s | | ------------------------------ | ---------: | ---------: | ---------- | --: | ---: | --------------: | -------------------: | -| llama 7B Q4_0 | 3.56 GiB | 6.74 B | ROCm | 99 | 0 | pp512 | 348.00 ± 0.44 | -| llama 7B Q4_0 | 3.56 GiB | 6.74 B | ROCm | 99 | 0 | tg128 | 49.39 ± 0.00 | +| llama 7B Q4_0 | 3.56 GiB | 6.74 B | ROCm | 99 | 0 | pp512 | 348.31 ± 0.71 | +| llama 7B Q4_0 | 3.56 GiB | 6.74 B | ROCm | 99 | 0 | tg128 | 49.90 ± 0.00 | -build: 1fe00296 (6182) +build: f1fbffb5 (6486) diff --git a/benchmark/results/llama-2-7b.Q4_0__rocm6_4_3-rocwmma__hblt0__fa1.log b/benchmark/results/llama-2-7b.Q4_0__rocm6_4_3-rocwmma__hblt0__fa1.log index 3d3918a..5c7f9e5 100644 --- a/benchmark/results/llama-2-7b.Q4_0__rocm6_4_3-rocwmma__hblt0__fa1.log +++ b/benchmark/results/llama-2-7b.Q4_0__rocm6_4_3-rocwmma__hblt0__fa1.log @@ -4,7 +4,7 @@ ggml_cuda_init: found 1 ROCm devices: Device 0: Radeon 8060S Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32 | model | size | params | backend | ngl | fa | mmap | test | t/s | | ------------------------------ | ---------: | ---------: | ---------- | --: | -: | ---: | --------------: | -------------------: | -| llama 7B Q4_0 | 3.56 GiB | 6.74 B | ROCm | 99 | 1 | 0 | pp512 | 367.46 ± 0.31 | -| llama 7B Q4_0 | 3.56 GiB | 6.74 B | ROCm | 99 | 1 | 0 | tg128 | 48.20 ± 0.01 | +| llama 7B Q4_0 | 3.56 GiB | 6.74 B | ROCm | 99 | 1 | 0 | pp512 | 366.05 ± 1.98 | +| llama 7B Q4_0 | 3.56 GiB | 6.74 B | ROCm | 99 | 1 | 0 | tg128 | 49.32 ± 0.01 | -build: 1fe00296 (6182) +build: f1fbffb5 (6486) diff --git a/benchmark/results/llama-2-7b.Q4_0__rocm6_4_3.log b/benchmark/results/llama-2-7b.Q4_0__rocm6_4_3.log index 2c94893..638eede 100644 --- a/benchmark/results/llama-2-7b.Q4_0__rocm6_4_3.log +++ b/benchmark/results/llama-2-7b.Q4_0__rocm6_4_3.log @@ -4,7 +4,7 @@ ggml_cuda_init: found 1 ROCm devices: Device 0: Radeon 8060S Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32 | model | size | params | backend | ngl | mmap | test | t/s | | ------------------------------ | ---------: | ---------: | ---------- | --: | ---: | --------------: | -------------------: | -| llama 7B Q4_0 | 3.56 GiB | 6.74 B | ROCm | 99 | 0 | pp512 | 978.30 ± 1.98 | -| llama 7B Q4_0 | 3.56 GiB | 6.74 B | ROCm | 99 | 0 | tg128 | 49.39 ± 0.00 | +| llama 7B Q4_0 | 3.56 GiB | 6.74 B | ROCm | 99 | 0 | pp512 | 979.46 ± 2.10 | +| llama 7B Q4_0 | 3.56 GiB | 6.74 B | ROCm | 99 | 0 | tg128 | 49.90 ± 0.01 | -build: 1fe00296 (6182) +build: f1fbffb5 (6486) diff --git a/benchmark/results/llama-2-7b.Q4_0__rocm6_4_3__fa1.log b/benchmark/results/llama-2-7b.Q4_0__rocm6_4_3__fa1.log index fd39b3e..6d11aa9 100644 --- a/benchmark/results/llama-2-7b.Q4_0__rocm6_4_3__fa1.log +++ b/benchmark/results/llama-2-7b.Q4_0__rocm6_4_3__fa1.log @@ -4,7 +4,7 @@ ggml_cuda_init: found 1 ROCm devices: Device 0: Radeon 8060S Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32 | model | size | params | backend | ngl | fa | mmap | test | t/s | | ------------------------------ | ---------: | ---------: | ---------- | --: | -: | ---: | --------------: | -------------------: | -| llama 7B Q4_0 | 3.56 GiB | 6.74 B | ROCm | 99 | 1 | 0 | pp512 | 686.88 ± 0.38 | -| llama 7B Q4_0 | 3.56 GiB | 6.74 B | ROCm | 99 | 1 | 0 | tg128 | 48.80 ± 0.00 | +| llama 7B Q4_0 | 3.56 GiB | 6.74 B | ROCm | 99 | 1 | 0 | pp512 | 1105.32 ± 2.28 | +| llama 7B Q4_0 | 3.56 GiB | 6.74 B | ROCm | 99 | 1 | 0 | tg128 | 49.24 ± 0.01 | -build: 1fe00296 (6182) +build: f1fbffb5 (6486) diff --git a/benchmark/results/llama-2-7b.Q4_0__rocm6_4_3__hblt0.log b/benchmark/results/llama-2-7b.Q4_0__rocm6_4_3__hblt0.log index 689c168..acf5f29 100644 --- a/benchmark/results/llama-2-7b.Q4_0__rocm6_4_3__hblt0.log +++ b/benchmark/results/llama-2-7b.Q4_0__rocm6_4_3__hblt0.log @@ -4,7 +4,7 @@ ggml_cuda_init: found 1 ROCm devices: Device 0: Radeon 8060S Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32 | model | size | params | backend | ngl | mmap | test | t/s | | ------------------------------ | ---------: | ---------: | ---------- | --: | ---: | --------------: | -------------------: | -| llama 7B Q4_0 | 3.56 GiB | 6.74 B | ROCm | 99 | 0 | pp512 | 348.07 ± 0.50 | -| llama 7B Q4_0 | 3.56 GiB | 6.74 B | ROCm | 99 | 0 | tg128 | 49.36 ± 0.00 | +| llama 7B Q4_0 | 3.56 GiB | 6.74 B | ROCm | 99 | 0 | pp512 | 348.63 ± 0.64 | +| llama 7B Q4_0 | 3.56 GiB | 6.74 B | ROCm | 99 | 0 | tg128 | 49.81 ± 0.00 | -build: 1fe00296 (6182) +build: f1fbffb5 (6486) diff --git a/benchmark/results/llama-2-7b.Q4_0__rocm6_4_3__hblt0__fa1.log b/benchmark/results/llama-2-7b.Q4_0__rocm6_4_3__hblt0__fa1.log index eca19dc..6b8a5cd 100644 --- a/benchmark/results/llama-2-7b.Q4_0__rocm6_4_3__hblt0__fa1.log +++ b/benchmark/results/llama-2-7b.Q4_0__rocm6_4_3__hblt0__fa1.log @@ -4,7 +4,7 @@ ggml_cuda_init: found 1 ROCm devices: Device 0: Radeon 8060S Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32 | model | size | params | backend | ngl | fa | mmap | test | t/s | | ------------------------------ | ---------: | ---------: | ---------- | --: | -: | ---: | --------------: | -------------------: | -| llama 7B Q4_0 | 3.56 GiB | 6.74 B | ROCm | 99 | 1 | 0 | pp512 | 307.39 ± 0.70 | -| llama 7B Q4_0 | 3.56 GiB | 6.74 B | ROCm | 99 | 1 | 0 | tg128 | 48.75 ± 0.00 | +| llama 7B Q4_0 | 3.56 GiB | 6.74 B | ROCm | 99 | 1 | 0 | pp512 | 368.28 ± 0.81 | +| llama 7B Q4_0 | 3.56 GiB | 6.74 B | ROCm | 99 | 1 | 0 | tg128 | 49.35 ± 0.01 | -build: 1fe00296 (6182) +build: f1fbffb5 (6486) diff --git a/benchmark/results/llama-2-7b.Q4_0__rocm7_rc-rocwmma-fa_all_quants.log b/benchmark/results/llama-2-7b.Q4_0__rocm7_rc-rocwmma-fa_all_quants.log new file mode 100644 index 0000000..4db56a5 --- /dev/null +++ b/benchmark/results/llama-2-7b.Q4_0__rocm7_rc-rocwmma-fa_all_quants.log @@ -0,0 +1,10 @@ +ggml_cuda_init: GGML_CUDA_FORCE_MMQ: no +ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no +ggml_cuda_init: found 1 ROCm devices: + Device 0: AMD Radeon Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32 +| model | size | params | backend | ngl | mmap | test | t/s | +| ------------------------------ | ---------: | ---------: | ---------- | --: | ---: | --------------: | -------------------: | +| llama 7B Q4_0 | 3.56 GiB | 6.74 B | ROCm | 99 | 0 | pp512 | 977.63 ± 2.98 | +| llama 7B Q4_0 | 3.56 GiB | 6.74 B | ROCm | 99 | 0 | tg128 | 49.91 ± 0.02 | + +build: f1fbffb5 (6486) diff --git a/benchmark/results/llama-2-7b.Q4_0__rocm7_rc-rocwmma-fa_all_quants__fa1.log b/benchmark/results/llama-2-7b.Q4_0__rocm7_rc-rocwmma-fa_all_quants__fa1.log new file mode 100644 index 0000000..0fa5959 --- /dev/null +++ b/benchmark/results/llama-2-7b.Q4_0__rocm7_rc-rocwmma-fa_all_quants__fa1.log @@ -0,0 +1,10 @@ +ggml_cuda_init: GGML_CUDA_FORCE_MMQ: no +ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no +ggml_cuda_init: found 1 ROCm devices: + Device 0: AMD Radeon Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32 +| model | size | params | backend | ngl | fa | mmap | test | t/s | +| ------------------------------ | ---------: | ---------: | ---------- | --: | -: | ---: | --------------: | -------------------: | +| llama 7B Q4_0 | 3.56 GiB | 6.74 B | ROCm | 99 | 1 | 0 | pp512 | 1097.55 ± 1.49 | +| llama 7B Q4_0 | 3.56 GiB | 6.74 B | ROCm | 99 | 1 | 0 | tg128 | 49.33 ± 0.01 | + +build: f1fbffb5 (6486) diff --git a/benchmark/results/llama-2-7b.Q4_0__rocm7_rc-rocwmma-fa_all_quants__hblt0.log b/benchmark/results/llama-2-7b.Q4_0__rocm7_rc-rocwmma-fa_all_quants__hblt0.log new file mode 100644 index 0000000..5636ade --- /dev/null +++ b/benchmark/results/llama-2-7b.Q4_0__rocm7_rc-rocwmma-fa_all_quants__hblt0.log @@ -0,0 +1,10 @@ +ggml_cuda_init: GGML_CUDA_FORCE_MMQ: no +ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no +ggml_cuda_init: found 1 ROCm devices: + Device 0: AMD Radeon Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32 +| model | size | params | backend | ngl | mmap | test | t/s | +| ------------------------------ | ---------: | ---------: | ---------- | --: | ---: | --------------: | -------------------: | +| llama 7B Q4_0 | 3.56 GiB | 6.74 B | ROCm | 99 | 0 | pp512 | 860.30 ± 0.57 | +| llama 7B Q4_0 | 3.56 GiB | 6.74 B | ROCm | 99 | 0 | tg128 | 49.89 ± 0.01 | + +build: f1fbffb5 (6486) diff --git a/benchmark/results/llama-2-7b.Q4_0__rocm7_rc-rocwmma-fa_all_quants__hblt0__fa1.log b/benchmark/results/llama-2-7b.Q4_0__rocm7_rc-rocwmma-fa_all_quants__hblt0__fa1.log new file mode 100644 index 0000000..3b3feea --- /dev/null +++ b/benchmark/results/llama-2-7b.Q4_0__rocm7_rc-rocwmma-fa_all_quants__hblt0__fa1.log @@ -0,0 +1,10 @@ +ggml_cuda_init: GGML_CUDA_FORCE_MMQ: no +ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no +ggml_cuda_init: found 1 ROCm devices: + Device 0: AMD Radeon Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32 +| model | size | params | backend | ngl | fa | mmap | test | t/s | +| ------------------------------ | ---------: | ---------: | ---------- | --: | -: | ---: | --------------: | -------------------: | +| llama 7B Q4_0 | 3.56 GiB | 6.74 B | ROCm | 99 | 1 | 0 | pp512 | 953.79 ± 3.60 | +| llama 7B Q4_0 | 3.56 GiB | 6.74 B | ROCm | 99 | 1 | 0 | tg128 | 49.35 ± 0.01 | + +build: f1fbffb5 (6486) diff --git a/benchmark/results/llama-2-7b.Q4_0__rocm7_rc-rocwmma.log b/benchmark/results/llama-2-7b.Q4_0__rocm7_rc-rocwmma.log index 075f5ba..08e2ece 100644 --- a/benchmark/results/llama-2-7b.Q4_0__rocm7_rc-rocwmma.log +++ b/benchmark/results/llama-2-7b.Q4_0__rocm7_rc-rocwmma.log @@ -4,7 +4,7 @@ ggml_cuda_init: found 1 ROCm devices: Device 0: AMD Radeon Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32 | model | size | params | backend | ngl | mmap | test | t/s | | ------------------------------ | ---------: | ---------: | ---------- | --: | ---: | --------------: | -------------------: | -| llama 7B Q4_0 | 3.56 GiB | 6.74 B | ROCm | 99 | 0 | pp512 | 978.15 ± 1.18 | -| llama 7B Q4_0 | 3.56 GiB | 6.74 B | ROCm | 99 | 0 | tg128 | 49.15 ± 0.00 | +| llama 7B Q4_0 | 3.56 GiB | 6.74 B | ROCm | 99 | 0 | pp512 | 984.61 ± 2.65 | +| llama 7B Q4_0 | 3.56 GiB | 6.74 B | ROCm | 99 | 0 | tg128 | 49.94 ± 0.01 | -build: de219279 (6181) +build: f1fbffb5 (6486) diff --git a/benchmark/results/llama-2-7b.Q4_0__rocm7_rc-rocwmma__fa1.log b/benchmark/results/llama-2-7b.Q4_0__rocm7_rc-rocwmma__fa1.log index 669a6d8..be2842e 100644 --- a/benchmark/results/llama-2-7b.Q4_0__rocm7_rc-rocwmma__fa1.log +++ b/benchmark/results/llama-2-7b.Q4_0__rocm7_rc-rocwmma__fa1.log @@ -4,7 +4,7 @@ ggml_cuda_init: found 1 ROCm devices: Device 0: AMD Radeon Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32 | model | size | params | backend | ngl | fa | mmap | test | t/s | | ------------------------------ | ---------: | ---------: | ---------- | --: | -: | ---: | --------------: | -------------------: | -| llama 7B Q4_0 | 3.56 GiB | 6.74 B | ROCm | 99 | 1 | 0 | pp512 | 1089.54 ± 1.93 | -| llama 7B Q4_0 | 3.56 GiB | 6.74 B | ROCm | 99 | 1 | 0 | tg128 | 46.47 ± 0.01 | +| llama 7B Q4_0 | 3.56 GiB | 6.74 B | ROCm | 99 | 1 | 0 | pp512 | 1095.50 ± 2.69 | +| llama 7B Q4_0 | 3.56 GiB | 6.74 B | ROCm | 99 | 1 | 0 | tg128 | 49.34 ± 0.01 | -build: de219279 (6181) +build: f1fbffb5 (6486) diff --git a/benchmark/results/llama-2-7b.Q4_0__rocm7_rc-rocwmma__hblt0.log b/benchmark/results/llama-2-7b.Q4_0__rocm7_rc-rocwmma__hblt0.log index 761a77a..98855b5 100644 --- a/benchmark/results/llama-2-7b.Q4_0__rocm7_rc-rocwmma__hblt0.log +++ b/benchmark/results/llama-2-7b.Q4_0__rocm7_rc-rocwmma__hblt0.log @@ -4,7 +4,7 @@ ggml_cuda_init: found 1 ROCm devices: Device 0: AMD Radeon Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32 | model | size | params | backend | ngl | mmap | test | t/s | | ------------------------------ | ---------: | ---------: | ---------- | --: | ---: | --------------: | -------------------: | -| llama 7B Q4_0 | 3.56 GiB | 6.74 B | ROCm | 99 | 0 | pp512 | 863.15 ± 2.90 | -| llama 7B Q4_0 | 3.56 GiB | 6.74 B | ROCm | 99 | 0 | tg128 | 49.09 ± 0.00 | +| llama 7B Q4_0 | 3.56 GiB | 6.74 B | ROCm | 99 | 0 | pp512 | 859.46 ± 1.91 | +| llama 7B Q4_0 | 3.56 GiB | 6.74 B | ROCm | 99 | 0 | tg128 | 49.90 ± 0.00 | -build: de219279 (6181) +build: f1fbffb5 (6486) diff --git a/benchmark/results/llama-2-7b.Q4_0__rocm7_rc-rocwmma__hblt0__fa1.log b/benchmark/results/llama-2-7b.Q4_0__rocm7_rc-rocwmma__hblt0__fa1.log index 655dfd3..847066a 100644 --- a/benchmark/results/llama-2-7b.Q4_0__rocm7_rc-rocwmma__hblt0__fa1.log +++ b/benchmark/results/llama-2-7b.Q4_0__rocm7_rc-rocwmma__hblt0__fa1.log @@ -4,7 +4,7 @@ ggml_cuda_init: found 1 ROCm devices: Device 0: AMD Radeon Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32 | model | size | params | backend | ngl | fa | mmap | test | t/s | | ------------------------------ | ---------: | ---------: | ---------- | --: | -: | ---: | --------------: | -------------------: | -| llama 7B Q4_0 | 3.56 GiB | 6.74 B | ROCm | 99 | 1 | 0 | pp512 | 947.88 ± 1.69 | -| llama 7B Q4_0 | 3.56 GiB | 6.74 B | ROCm | 99 | 1 | 0 | tg128 | 46.48 ± 0.01 | +| llama 7B Q4_0 | 3.56 GiB | 6.74 B | ROCm | 99 | 1 | 0 | pp512 | 952.18 ± 1.68 | +| llama 7B Q4_0 | 3.56 GiB | 6.74 B | ROCm | 99 | 1 | 0 | tg128 | 49.32 ± 0.01 | -build: de219279 (6181) +build: f1fbffb5 (6486) diff --git a/benchmark/results/llama-2-7b.Q4_0__rocm7_rc.log b/benchmark/results/llama-2-7b.Q4_0__rocm7_rc.log index 8b649c5..d564f8b 100644 --- a/benchmark/results/llama-2-7b.Q4_0__rocm7_rc.log +++ b/benchmark/results/llama-2-7b.Q4_0__rocm7_rc.log @@ -4,7 +4,7 @@ ggml_cuda_init: found 1 ROCm devices: Device 0: AMD Radeon Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32 | model | size | params | backend | ngl | mmap | test | t/s | | ------------------------------ | ---------: | ---------: | ---------- | --: | ---: | --------------: | -------------------: | -| llama 7B Q4_0 | 3.56 GiB | 6.74 B | ROCm | 99 | 0 | pp512 | 979.59 ± 2.44 | -| llama 7B Q4_0 | 3.56 GiB | 6.74 B | ROCm | 99 | 0 | tg128 | 49.38 ± 0.00 | +| llama 7B Q4_0 | 3.56 GiB | 6.74 B | ROCm | 99 | 0 | pp512 | 980.24 ± 1.40 | +| llama 7B Q4_0 | 3.56 GiB | 6.74 B | ROCm | 99 | 0 | tg128 | 49.90 ± 0.00 | -build: de219279 (6181) +build: f1fbffb5 (6486) diff --git a/benchmark/results/llama-2-7b.Q4_0__rocm7_rc__fa1.log b/benchmark/results/llama-2-7b.Q4_0__rocm7_rc__fa1.log index 05d74ad..e714e54 100644 --- a/benchmark/results/llama-2-7b.Q4_0__rocm7_rc__fa1.log +++ b/benchmark/results/llama-2-7b.Q4_0__rocm7_rc__fa1.log @@ -4,7 +4,7 @@ ggml_cuda_init: found 1 ROCm devices: Device 0: AMD Radeon Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32 | model | size | params | backend | ngl | fa | mmap | test | t/s | | ------------------------------ | ---------: | ---------: | ---------- | --: | -: | ---: | --------------: | -------------------: | -| llama 7B Q4_0 | 3.56 GiB | 6.74 B | ROCm | 99 | 1 | 0 | pp512 | 684.81 ± 1.14 | -| llama 7B Q4_0 | 3.56 GiB | 6.74 B | ROCm | 99 | 1 | 0 | tg128 | 48.97 ± 0.01 | +| llama 7B Q4_0 | 3.56 GiB | 6.74 B | ROCm | 99 | 1 | 0 | pp512 | 1100.05 ± 4.01 | +| llama 7B Q4_0 | 3.56 GiB | 6.74 B | ROCm | 99 | 1 | 0 | tg128 | 49.29 ± 0.01 | -build: de219279 (6181) +build: f1fbffb5 (6486) diff --git a/benchmark/results/llama-2-7b.Q4_0__rocm7_rc__hblt0.log b/benchmark/results/llama-2-7b.Q4_0__rocm7_rc__hblt0.log index 27bd0d7..bc6ac1c 100644 --- a/benchmark/results/llama-2-7b.Q4_0__rocm7_rc__hblt0.log +++ b/benchmark/results/llama-2-7b.Q4_0__rocm7_rc__hblt0.log @@ -4,7 +4,7 @@ ggml_cuda_init: found 1 ROCm devices: Device 0: AMD Radeon Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32 | model | size | params | backend | ngl | mmap | test | t/s | | ------------------------------ | ---------: | ---------: | ---------- | --: | ---: | --------------: | -------------------: | -| llama 7B Q4_0 | 3.56 GiB | 6.74 B | ROCm | 99 | 0 | pp512 | 865.92 ± 1.53 | -| llama 7B Q4_0 | 3.56 GiB | 6.74 B | ROCm | 99 | 0 | tg128 | 49.40 ± 0.00 | +| llama 7B Q4_0 | 3.56 GiB | 6.74 B | ROCm | 99 | 0 | pp512 | 860.23 ± 0.94 | +| llama 7B Q4_0 | 3.56 GiB | 6.74 B | ROCm | 99 | 0 | tg128 | 49.92 ± 0.01 | -build: de219279 (6181) +build: f1fbffb5 (6486) diff --git a/benchmark/results/llama-2-7b.Q4_0__rocm7_rc__hblt0__fa1.log b/benchmark/results/llama-2-7b.Q4_0__rocm7_rc__hblt0__fa1.log index 28a8543..f12456c 100644 --- a/benchmark/results/llama-2-7b.Q4_0__rocm7_rc__hblt0__fa1.log +++ b/benchmark/results/llama-2-7b.Q4_0__rocm7_rc__hblt0__fa1.log @@ -4,7 +4,7 @@ ggml_cuda_init: found 1 ROCm devices: Device 0: AMD Radeon Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32 | model | size | params | backend | ngl | fa | mmap | test | t/s | | ------------------------------ | ---------: | ---------: | ---------- | --: | -: | ---: | --------------: | -------------------: | -| llama 7B Q4_0 | 3.56 GiB | 6.74 B | ROCm | 99 | 1 | 0 | pp512 | 630.67 ± 1.16 | -| llama 7B Q4_0 | 3.56 GiB | 6.74 B | ROCm | 99 | 1 | 0 | tg128 | 48.83 ± 0.01 | +| llama 7B Q4_0 | 3.56 GiB | 6.74 B | ROCm | 99 | 1 | 0 | pp512 | 958.47 ± 2.31 | +| llama 7B Q4_0 | 3.56 GiB | 6.74 B | ROCm | 99 | 1 | 0 | tg128 | 49.29 ± 0.01 | -build: de219279 (6181) +build: f1fbffb5 (6486) diff --git a/benchmark/results/llama-2-7b.Q4_0__vulkan_amdvlk.log b/benchmark/results/llama-2-7b.Q4_0__vulkan_amdvlk.log index 20052b6..6162d50 100644 --- a/benchmark/results/llama-2-7b.Q4_0__vulkan_amdvlk.log +++ b/benchmark/results/llama-2-7b.Q4_0__vulkan_amdvlk.log @@ -2,7 +2,7 @@ ggml_vulkan: Found 1 Vulkan devices: ggml_vulkan: 0 = Radeon 8060S Graphics (AMD open-source driver) | uma: 1 | fp16: 1 | bf16: 0 | warp size: 64 | shared memory: 32768 | int dot: 1 | matrix cores: KHR_coopmat | model | size | params | backend | ngl | mmap | test | t/s | | ------------------------------ | ---------: | ---------: | ---------- | --: | ---: | --------------: | -------------------: | -| llama 7B Q4_0 | 3.56 GiB | 6.74 B | Vulkan | 99 | 0 | pp512 | 1305.67 ± 1.36 | -| llama 7B Q4_0 | 3.56 GiB | 6.74 B | Vulkan | 99 | 0 | tg128 | 48.48 ± 0.11 | +| llama 7B Q4_0 | 3.56 GiB | 6.74 B | Vulkan | 99 | 0 | pp512 | 1317.02 ± 4.04 | +| llama 7B Q4_0 | 3.56 GiB | 6.74 B | Vulkan | 99 | 0 | tg128 | 53.59 ± 0.07 | -build: 1fe00296 (6182) +build: f1fbffb5 (6486) diff --git a/benchmark/results/llama-2-7b.Q4_0__vulkan_amdvlk__fa1.log b/benchmark/results/llama-2-7b.Q4_0__vulkan_amdvlk__fa1.log index a9dca39..85db8fc 100644 --- a/benchmark/results/llama-2-7b.Q4_0__vulkan_amdvlk__fa1.log +++ b/benchmark/results/llama-2-7b.Q4_0__vulkan_amdvlk__fa1.log @@ -2,7 +2,7 @@ ggml_vulkan: Found 1 Vulkan devices: ggml_vulkan: 0 = Radeon 8060S Graphics (AMD open-source driver) | uma: 1 | fp16: 1 | bf16: 0 | warp size: 64 | shared memory: 32768 | int dot: 1 | matrix cores: KHR_coopmat | model | size | params | backend | ngl | fa | mmap | test | t/s | | ------------------------------ | ---------: | ---------: | ---------- | --: | -: | ---: | --------------: | -------------------: | -| llama 7B Q4_0 | 3.56 GiB | 6.74 B | Vulkan | 99 | 1 | 0 | pp512 | 1377.39 ± 0.62 | -| llama 7B Q4_0 | 3.56 GiB | 6.74 B | Vulkan | 99 | 1 | 0 | tg128 | 47.91 ± 0.01 | +| llama 7B Q4_0 | 3.56 GiB | 6.74 B | Vulkan | 99 | 1 | 0 | pp512 | 1380.42 ± 7.77 | +| llama 7B Q4_0 | 3.56 GiB | 6.74 B | Vulkan | 99 | 1 | 0 | tg128 | 52.95 ± 0.07 | -build: 1fe00296 (6182) +build: f1fbffb5 (6486) diff --git a/benchmark/results/llama-2-7b.Q4_0__vulkan_radv.log b/benchmark/results/llama-2-7b.Q4_0__vulkan_radv.log index f19f729..979748b 100644 --- a/benchmark/results/llama-2-7b.Q4_0__vulkan_radv.log +++ b/benchmark/results/llama-2-7b.Q4_0__vulkan_radv.log @@ -2,7 +2,7 @@ ggml_vulkan: Found 1 Vulkan devices: ggml_vulkan: 0 = Radeon 8060S Graphics (RADV GFX1151) (radv) | uma: 1 | fp16: 1 | bf16: 0 | warp size: 64 | shared memory: 65536 | int dot: 1 | matrix cores: KHR_coopmat | model | size | params | backend | ngl | mmap | test | t/s | | ------------------------------ | ---------: | ---------: | ---------- | --: | ---: | --------------: | -------------------: | -| llama 7B Q4_0 | 3.56 GiB | 6.74 B | Vulkan | 99 | 0 | pp512 | 875.74 ± 6.47 | -| llama 7B Q4_0 | 3.56 GiB | 6.74 B | Vulkan | 99 | 0 | tg128 | 52.85 ± 0.12 | +| llama 7B Q4_0 | 3.56 GiB | 6.74 B | Vulkan | 99 | 0 | pp512 | 868.70 ± 8.94 | +| llama 7B Q4_0 | 3.56 GiB | 6.74 B | Vulkan | 99 | 0 | tg128 | 54.37 ± 0.04 | -build: 1fe00296 (6182) +build: f1fbffb5 (6486) diff --git a/benchmark/results/llama-2-7b.Q4_0__vulkan_radv__fa1.log b/benchmark/results/llama-2-7b.Q4_0__vulkan_radv__fa1.log index 3c88a39..d3a0b5a 100644 --- a/benchmark/results/llama-2-7b.Q4_0__vulkan_radv__fa1.log +++ b/benchmark/results/llama-2-7b.Q4_0__vulkan_radv__fa1.log @@ -2,7 +2,7 @@ ggml_vulkan: Found 1 Vulkan devices: ggml_vulkan: 0 = Radeon 8060S Graphics (RADV GFX1151) (radv) | uma: 1 | fp16: 1 | bf16: 0 | warp size: 64 | shared memory: 65536 | int dot: 1 | matrix cores: KHR_coopmat | model | size | params | backend | ngl | fa | mmap | test | t/s | | ------------------------------ | ---------: | ---------: | ---------- | --: | -: | ---: | --------------: | -------------------: | -| llama 7B Q4_0 | 3.56 GiB | 6.74 B | Vulkan | 99 | 1 | 0 | pp512 | 957.61 ± 5.26 | -| llama 7B Q4_0 | 3.56 GiB | 6.74 B | Vulkan | 99 | 1 | 0 | tg128 | 52.16 ± 0.08 | +| llama 7B Q4_0 | 3.56 GiB | 6.74 B | Vulkan | 99 | 1 | 0 | pp512 | 957.23 ± 9.23 | +| llama 7B Q4_0 | 3.56 GiB | 6.74 B | Vulkan | 99 | 1 | 0 | tg128 | 53.49 ± 0.04 | -build: 1fe00296 (6182) +build: f1fbffb5 (6486) diff --git a/docs/index.html b/docs/index.html index d8963c8..6cd3e4a 100644 --- a/docs/index.html +++ b/docs/index.html @@ -214,6 +214,13 @@ border: 1px solid #b19cff55; } + .faall { + background: #cfe9ff; + /* light blue chip */ + color: #000000; + border: 1px solid #9bc9ff55; + } + .meta { padding: 0 20px 14px; color: var(--muted); @@ -250,6 +257,21 @@ border-radius: 8px; } + .scroller-top { + overflow-x: auto; + overflow-y: hidden; + height: 12px; + /* slim */ + margin: 0 0 6px; + /* a little gap above the table */ + } + + .scroller-top .scroller-spacer { + height: 1px; + /* tiny content so the bar renders */ + } + + table { width: max-content; min-width: 100vw; @@ -415,28 +437,36 @@