updated rpc benchmarks with long context

2025-11-19 07:35:56 +00:00
parent 1d88fca07d
commit c7f4ffc346
37 changed files with 2283 additions and 1 deletions
@@ -0,0 +1,10 @@
+ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+ggml_cuda_init: found 1 ROCm devices:
+  Device 0: Radeon 8060S Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32
+| model                          |       size |     params | backend    | ngl | n_ubatch | fa | mmap |            test |                  t/s |
+| ------------------------------ | ---------: | ---------: | ---------- | --: | -------: | -: | ---: | --------------: | -------------------: |
+| glm4moe 355B.A32B Q4_K - Medium | 189.69 GiB |   356.79 B | ROCm,RPC   |  99 |     2048 |  1 |    0 | pp2048 @ d32768 |          6.54 ± 0.00 |
+| glm4moe 355B.A32B Q4_K - Medium | 189.69 GiB |   356.79 B | ROCm,RPC   |  99 |     2048 |  1 |    0 |   tg32 @ d32768 |          0.81 ± 0.00 |
+
+build: caca0d55c (7085)
@@ -0,0 +1,10 @@
+ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+ggml_cuda_init: found 1 ROCm devices:
+  Device 0: Radeon 8060S Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32
+| model                          |       size |     params | backend    | ngl | n_ubatch | fa | mmap |            test |                  t/s |
+| ------------------------------ | ---------: | ---------: | ---------- | --: | -------: | -: | ---: | --------------: | -------------------: |
+| glm4moe 355B.A32B Q4_K - Medium | 189.69 GiB |   356.79 B | ROCm,RPC   |  99 |     2048 |  1 |    0 | pp2048 @ d32768 |          6.03 ± 0.00 |
+| glm4moe 355B.A32B Q4_K - Medium | 189.69 GiB |   356.79 B | ROCm,RPC   |  99 |     2048 |  1 |    0 |   tg32 @ d32768 |          0.81 ± 0.00 |
+
+build: caca0d55c (7085)
@@ -0,0 +1,10 @@
+ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+ggml_cuda_init: found 1 ROCm devices:
+  Device 0: Radeon 8060S Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32
+| model                          |       size |     params | backend    | ngl | n_ubatch | fa | mmap |            test |                  t/s |
+| ------------------------------ | ---------: | ---------: | ---------- | --: | -------: | -: | ---: | --------------: | -------------------: |
+| glm4moe 355B.A32B Q4_K - Medium | 189.69 GiB |   356.79 B | ROCm,RPC   |  99 |     2048 |  1 |    0 | pp2048 @ d32768 |         16.47 ± 0.00 |
+| glm4moe 355B.A32B Q4_K - Medium | 189.69 GiB |   356.79 B | ROCm,RPC   |  99 |     2048 |  1 |    0 |   tg32 @ d32768 |          3.84 ± 0.00 |
+
+build: 86f1f4411 (7085)
@@ -0,0 +1,10 @@
+ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+ggml_cuda_init: found 1 ROCm devices:
+  Device 0: Radeon 8060S Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32
+| model                          |       size |     params | backend    | ngl | n_ubatch | fa | mmap |            test |                  t/s |
+| ------------------------------ | ---------: | ---------: | ---------- | --: | -------: | -: | ---: | --------------: | -------------------: |
+| glm4moe 355B.A32B Q4_K - Medium | 189.69 GiB |   356.79 B | ROCm,RPC   |  99 |     2048 |  1 |    0 | pp2048 @ d32768 |         13.99 ± 0.00 |
+| glm4moe 355B.A32B Q4_K - Medium | 189.69 GiB |   356.79 B | ROCm,RPC   |  99 |     2048 |  1 |    0 |   tg32 @ d32768 |          3.82 ± 0.00 |
+
+build: 86f1f4411 (7085)
@@ -0,0 +1,10 @@
+ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+ggml_cuda_init: found 1 ROCm devices:
+  Device 0: Radeon 8060S Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32
+| model                          |       size |     params | backend    | ngl | n_ubatch | fa | mmap |            test |                  t/s |
+| ------------------------------ | ---------: | ---------: | ---------- | --: | -------: | -: | ---: | --------------: | -------------------: |
+| glm4moe 355B.A32B Q4_K - Medium | 189.69 GiB |   356.79 B | ROCm,RPC   |  99 |     2048 |  1 |    0 | pp2048 @ d32768 |          8.59 ± 0.00 |
+| glm4moe 355B.A32B Q4_K - Medium | 189.69 GiB |   356.79 B | ROCm,RPC   |  99 |     2048 |  1 |    0 |   tg32 @ d32768 |          0.81 ± 0.00 |
+
+build: f1840a25d (7085)
@@ -0,0 +1,10 @@
+ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+ggml_cuda_init: found 1 ROCm devices:
+  Device 0: Radeon 8060S Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32
+| model                          |       size |     params | backend    | ngl | n_ubatch | fa | mmap |            test |                  t/s |
+| ------------------------------ | ---------: | ---------: | ---------- | --: | -------: | -: | ---: | --------------: | -------------------: |
+| glm4moe 355B.A32B Q4_K - Medium | 189.69 GiB |   356.79 B | ROCm,RPC   |  99 |     2048 |  1 |    0 | pp2048 @ d32768 |          8.11 ± 0.00 |
+| glm4moe 355B.A32B Q4_K - Medium | 189.69 GiB |   356.79 B | ROCm,RPC   |  99 |     2048 |  1 |    0 |   tg32 @ d32768 |          0.81 ± 0.00 |
+
+build: f1840a25d (7085)
@@ -0,0 +1,36 @@
+ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+ggml_cuda_init: found 1 ROCm devices:
+  Device 0: Radeon 8060S Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32
+:0:rocdevice.cpp            :3580: 143267106124 us:  Callback: Queue 0x7f2f8a400000 aborting with error : HSA_STATUS_ERROR_EXCEPTION: An HSAIL operation resulted in a hardware exception. code: 0x1016
+| model                          |       size |     params | backend    | ngl | n_ubatch | fa | mmap |            test |                  t/s |
+| ------------------------------ | ---------: | ---------: | ---------- | --: | -------: | -: | ---: | --------------: | -------------------: |
+Kernel Name: _ZL15flash_attn_tileILi128ELi128ELi16ELi4ELb0EEvPKcS1_S1_S1_S1_PKiPfP15HIP_vector_typeIfLj2EEffffjfiiiiiiiiiiiiiliiliiiiil
+VGPU=0xe824a70 SWq=0x7f308c9e2000, HWq=0x7f2f8a400000, id=2
+	Dispatch Header =0xb02 (type=2, barrier=1, acquire=1, release=1), setup=0
+	grid=[4096, 8, 24], workgroup=[32, 8, 1]
+	private_seg_size=0, group_seg_size=33792
+	kernel_obj=0x7f11bd42f700, kernarg_address=0x0x7f2f8a201e80
+	completion_signal=0x0, correlation_id=0
+	rptr=23, wptr=24
+ /opt/llama.cpp/ggml/src/ggml-cuda/ggml-cuda.cu:89: ROCm error
+/usr/local/lib64/libggml-base.so.0(+0x3565) [0x7f30cc2c5565]
+/usr/local/lib64/libggml-base.so.0(ggml_print_backtrace+0x1eb) [0x7f30cc2c592b]
+/usr/local/lib64/libggml-base.so.0(ggml_abort+0x11f) [0x7f30cc2c5aaf]
+/usr/local/lib64/libggml-hip.so.0(+0x294e092) [0x7f30cecd0092]
+/usr/local/lib64/libggml-hip.so.0(+0x295f107) [0x7f30cece1107]
+/usr/local/lib64/libggml-hip.so.0(+0x295d9dd) [0x7f30cecdf9dd]
+/usr/local/lib64/libggml-hip.so.0(+0x295c95d) [0x7f30cecde95d]
+/usr/local/lib64/libggml-hip.so.0(+0x29575c7) [0x7f30cecd95c7]
+/usr/local/lib64/libggml-hip.so.0(+0x29540ea) [0x7f30cecd60ea]
+/usr/local/lib64/libggml-hip.so.0(+0x295319f) [0x7f30cecd519f]
+/usr/local/lib64/libggml-base.so.0(ggml_backend_sched_graph_compute_async+0x7f3) [0x7f30cc2dfde3]
+/usr/local/lib64/libllama.so.0(_ZN13llama_context13graph_computeEP11ggml_cgraphb+0xa0) [0x7f30cf378650]
+/usr/local/lib64/libllama.so.0(_ZN13llama_context14process_ubatchERK12llama_ubatch14llm_graph_typeP22llama_memory_context_iR11ggml_status+0xe2) [0x7f30cf37a2e2]
+/usr/local/lib64/libllama.so.0(_ZN13llama_context6decodeERK11llama_batch+0x3bf) [0x7f30cf37f1bf]
+/usr/local/lib64/libllama.so.0(llama_decode+0xe) [0x7f30cf38000e]
+/usr/local/bin/llama-bench() [0x40a3db]
+/usr/local/bin/llama-bench() [0x407edc]
+/lib64/libc.so.6(+0x35b5) [0x7f30cbc5b5b5]
+/lib64/libc.so.6(__libc_start_main+0x88) [0x7f30cbc5b668]
+/usr/local/bin/llama-bench() [0x409255]
@@ -0,0 +1,21 @@
+ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+ggml_cuda_init: found 1 ROCm devices:
+  Device 0: Radeon 8060S Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32
+| model                          |       size |     params | backend    | ngl | n_ubatch | fa | mmap |            test |                  t/s |
+| ------------------------------ | ---------: | ---------: | ---------- | --: | -------: | -: | ---: | --------------: | -------------------: |
+/opt/llama.cpp/ggml/src/ggml-rpc/ggml-rpc.cpp:858: Remote RPC server crashed or returned malformed response
+/usr/local/lib64/libggml-base.so.0(+0x3565) [0x7f15cac25565]
+/usr/local/lib64/libggml-base.so.0(ggml_print_backtrace+0x1eb) [0x7f15cac2592b]
+/usr/local/lib64/libggml-base.so.0(ggml_abort+0x11f) [0x7f15cac25aaf]
+/usr/local/lib64/libggml-rpc.so.0(+0xa195) [0x7f15cacd3195]
+/usr/local/lib64/libggml-base.so.0(ggml_backend_sched_graph_compute_async+0x7f3) [0x7f15cac3fde3]
+/usr/local/lib64/libllama.so.0(_ZN13llama_context13graph_computeEP11ggml_cgraphb+0xa0) [0x7f15cdcd8650]
+/usr/local/lib64/libllama.so.0(_ZN13llama_context14process_ubatchERK12llama_ubatch14llm_graph_typeP22llama_memory_context_iR11ggml_status+0xe2) [0x7f15cdcda2e2]
+/usr/local/lib64/libllama.so.0(_ZN13llama_context6decodeERK11llama_batch+0x3bf) [0x7f15cdcdf1bf]
+/usr/local/lib64/libllama.so.0(llama_decode+0xe) [0x7f15cdce000e]
+/usr/local/bin/llama-bench() [0x40a3db]
+/usr/local/bin/llama-bench() [0x407edc]
+/lib64/libc.so.6(+0x35b5) [0x7f15ca5bb5b5]
+/lib64/libc.so.6(__libc_start_main+0x88) [0x7f15ca5bb668]
+/usr/local/bin/llama-bench() [0x409255]
@@ -0,0 +1,10 @@
+ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+ggml_cuda_init: found 1 ROCm devices:
+  Device 0: Radeon 8060S Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32
+| model                          |       size |     params | backend    | ngl | n_ubatch | fa | mmap |            test |                  t/s |
+| ------------------------------ | ---------: | ---------: | ---------- | --: | -------: | -: | ---: | --------------: | -------------------: |
+| glm4moe 355B.A32B Q4_K - Medium | 189.69 GiB |   356.79 B | ROCm,RPC   |  99 |     2048 |  1 |    0 | pp2048 @ d32768 |         10.48 ± 0.00 |
+| glm4moe 355B.A32B Q4_K - Medium | 189.69 GiB |   356.79 B | ROCm,RPC   |  99 |     2048 |  1 |    0 |   tg32 @ d32768 |          3.42 ± 0.00 |
+
+build: 12bb5c37 (7074)
@@ -0,0 +1,10 @@
+ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+ggml_cuda_init: found 1 ROCm devices:
+  Device 0: Radeon 8060S Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32
+| model                          |       size |     params | backend    | ngl | n_ubatch | fa | mmap |            test |                  t/s |
+| ------------------------------ | ---------: | ---------: | ---------- | --: | -------: | -: | ---: | --------------: | -------------------: |
+| glm4moe 355B.A32B Q4_K - Medium | 189.69 GiB |   356.79 B | ROCm,RPC   |  99 |     2048 |  1 |    0 | pp2048 @ d32768 |          9.05 ± 0.00 |
+| glm4moe 355B.A32B Q4_K - Medium | 189.69 GiB |   356.79 B | ROCm,RPC   |  99 |     2048 |  1 |    0 |   tg32 @ d32768 |          3.42 ± 0.00 |
+
+build: 12bb5c37 (7074)
@@ -0,0 +1,10 @@
+ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+ggml_cuda_init: found 1 ROCm devices:
+  Device 0: Radeon 8060S Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32
+| model                          |       size |     params | backend    | ngl | n_ubatch | fa | mmap |            test |                  t/s |
+| ------------------------------ | ---------: | ---------: | ---------- | --: | -------: | -: | ---: | --------------: | -------------------: |
+| glm4moe 355B.A32B Q4_K - Medium | 189.69 GiB |   356.79 B | ROCm,RPC   |  99 |     2048 |  1 |    0 | pp2048 @ d32768 |          6.98 ± 0.00 |
+| glm4moe 355B.A32B Q4_K - Medium | 189.69 GiB |   356.79 B | ROCm,RPC   |  99 |     2048 |  1 |    0 |   tg32 @ d32768 |          0.66 ± 0.00 |
+
+build: 4db63cdde (7085)
@@ -0,0 +1,10 @@
+ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+ggml_cuda_init: found 1 ROCm devices:
+  Device 0: Radeon 8060S Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32
+| model                          |       size |     params | backend    | ngl | n_ubatch | fa | mmap |            test |                  t/s |
+| ------------------------------ | ---------: | ---------: | ---------- | --: | -------: | -: | ---: | --------------: | -------------------: |
+| glm4moe 355B.A32B Q4_K - Medium | 189.69 GiB |   356.79 B | ROCm,RPC   |  99 |     2048 |  1 |    0 | pp2048 @ d32768 |          6.61 ± 0.00 |
+| glm4moe 355B.A32B Q4_K - Medium | 189.69 GiB |   356.79 B | ROCm,RPC   |  99 |     2048 |  1 |    0 |   tg32 @ d32768 |          0.66 ± 0.00 |
+
+build: 4db63cdde (7085)
@@ -0,0 +1,32 @@
+ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+ggml_cuda_init: found 1 ROCm devices:
+  Device 0: Radeon 8060S Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32
+:0:rocdevice.cpp            :3588: 155339106534 us:  Callback: Queue 0x7f48ed200000 aborting with error : HSA_STATUS_ERROR_EXCEPTION: An HSAIL operation resulted in a hardware exception. code: 0x1016
+| model                          |       size |     params | backend    | ngl | n_ubatch | fa | mmap |            test |                  t/s |
+| ------------------------------ | ---------: | ---------: | ---------- | --: | -------: | -: | ---: | --------------: | -------------------: |
+Kernel Name: _ZL15flash_attn_tileILi128ELi128ELi16ELi4ELb0EEvPKcS1_S1_S1_S1_PKiPfP15HIP_vector_typeIfLj2EEffffjfiiiiiiiiiiiiiliiliiiiil
+VGPU=0x1eb0d870 SWq=0x7f48efd6a000, HWq=0x7f48ed200000, id=2
+	Dispatch Header =0xb02 (type=2, barrier=1, acquire=1, release=1), setup=0
+	grid=[4096, 8, 24], workgroup=[32, 8, 1]
+	private_seg_size=0, group_seg_size=33792
+	kernel_obj=0x7f48ed02f700, kernarg_address=0x0x7f48ec41e800
+	completion_signal=0x0, correlation_id=0
+	rptr=5346, wptr=5361
+ /opt/llama.cpp/ggml/src/ggml-cuda/ggml-cuda.cu:89: ROCm error
+/usr/local/lib64/libggml-base.so.0(+0x3565) [0x7f48fc619565]
+/usr/local/lib64/libggml-base.so.0(ggml_print_backtrace+0x1eb) [0x7f48fc61992b]
+/usr/local/lib64/libggml-base.so.0(ggml_abort+0x11f) [0x7f48fc619aaf]
+/usr/local/lib64/libggml-hip.so.0(+0x28bcf12) [0x7f48fef92f12]
+/usr/local/lib64/libggml-hip.so.0(+0x28c4a66) [0x7f48fef9aa66]
+/usr/local/lib64/libggml-hip.so.0(+0x28c1fcf) [0x7f48fef97fcf]
+/usr/local/lib64/libggml-base.so.0(ggml_backend_sched_graph_compute_async+0x7f3) [0x7f48fc633de3]
+/usr/local/lib64/libllama.so.0(_ZN13llama_context13graph_computeEP11ggml_cgraphb+0xa0) [0x7f48ff666650]
+/usr/local/lib64/libllama.so.0(_ZN13llama_context14process_ubatchERK12llama_ubatch14llm_graph_typeP22llama_memory_context_iR11ggml_status+0xe2) [0x7f48ff6682e2]
+/usr/local/lib64/libllama.so.0(_ZN13llama_context6decodeERK11llama_batch+0x3bf) [0x7f48ff66d1bf]
+/usr/local/lib64/libllama.so.0(llama_decode+0xe) [0x7f48ff66e00e]
+/usr/local/bin/llama-bench() [0x40a3db]
+/usr/local/bin/llama-bench() [0x407edc]
+/lib64/libc.so.6(+0x35b5) [0x7f48fbfaf5b5]
+/lib64/libc.so.6(__libc_start_main+0x88) [0x7f48fbfaf668]
+/usr/local/bin/llama-bench() [0x409255]
@@ -0,0 +1,10 @@
+ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+ggml_cuda_init: found 1 ROCm devices:
+  Device 0: Radeon 8060S Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32
+| model                          |       size |     params | backend    | ngl | n_ubatch | fa | mmap |            test |                  t/s |
+| ------------------------------ | ---------: | ---------: | ---------- | --: | -------: | -: | ---: | --------------: | -------------------: |
+| glm4moe 355B.A32B Q4_K - Medium | 189.69 GiB |   356.79 B | ROCm,RPC   |  99 |     2048 |  1 |    0 | pp2048 @ d32768 |         12.79 ± 0.00 |
+| glm4moe 355B.A32B Q4_K - Medium | 189.69 GiB |   356.79 B | ROCm,RPC   |  99 |     2048 |  1 |    0 |   tg32 @ d32768 |          3.43 ± 0.00 |
+
+build: 4fc43d43d (7085)
@@ -0,0 +1,10 @@
+ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+ggml_cuda_init: found 1 ROCm devices:
+  Device 0: AMD Radeon Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32
+| model                          |       size |     params | backend    | ngl | n_ubatch | fa | mmap |            test |                  t/s |
+| ------------------------------ | ---------: | ---------: | ---------- | --: | -------: | -: | ---: | --------------: | -------------------: |
+| glm4moe 355B.A32B Q4_K - Medium | 189.69 GiB |   356.79 B | ROCm,RPC   |  99 |     2048 |  1 |    0 | pp2048 @ d32768 |          8.70 ± 0.00 |
+| glm4moe 355B.A32B Q4_K - Medium | 189.69 GiB |   356.79 B | ROCm,RPC   |  99 |     2048 |  1 |    0 |   tg32 @ d32768 |          0.81 ± 0.00 |
+
+build: b447a9a4b (7085)
@@ -0,0 +1,10 @@
+ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+ggml_cuda_init: found 1 ROCm devices:
+  Device 0: AMD Radeon Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32
+| model                          |       size |     params | backend    | ngl | n_ubatch | fa | mmap |            test |                  t/s |
+| ------------------------------ | ---------: | ---------: | ---------- | --: | -------: | -: | ---: | --------------: | -------------------: |
+| glm4moe 355B.A32B Q4_K - Medium | 189.69 GiB |   356.79 B | ROCm,RPC   |  99 |     2048 |  1 |    0 | pp2048 @ d32768 |          8.13 ± 0.00 |
+| glm4moe 355B.A32B Q4_K - Medium | 189.69 GiB |   356.79 B | ROCm,RPC   |  99 |     2048 |  1 |    0 |   tg32 @ d32768 |          0.81 ± 0.00 |
+
+build: b447a9a4b (7085)
@@ -0,0 +1,21 @@
+ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+ggml_cuda_init: found 1 ROCm devices:
+  Device 0: AMD Radeon Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32
+| model                          |       size |     params | backend    | ngl | n_ubatch | fa | mmap |            test |                  t/s |
+| ------------------------------ | ---------: | ---------: | ---------- | --: | -------: | -: | ---: | --------------: | -------------------: |
+/opt/llama.cpp/ggml/src/ggml-rpc/ggml-rpc.cpp:858: Remote RPC server crashed or returned malformed response
+/usr/local/lib64/libggml-base.so.0(+0x3565) [0x7faca1d2c565]
+/usr/local/lib64/libggml-base.so.0(ggml_print_backtrace+0x1eb) [0x7faca1d2c92b]
+/usr/local/lib64/libggml-base.so.0(ggml_abort+0x11f) [0x7faca1d2caaf]
+/usr/local/lib64/libggml-rpc.so.0(+0xa195) [0x7faca1dda195]
+/usr/local/lib64/libggml-base.so.0(ggml_backend_sched_graph_compute_async+0x7f3) [0x7faca1d46de3]
+/usr/local/lib64/libllama.so.0(_ZN13llama_context13graph_computeEP11ggml_cgraphb+0xa0) [0x7faca4ddf650]
+/usr/local/lib64/libllama.so.0(_ZN13llama_context14process_ubatchERK12llama_ubatch14llm_graph_typeP22llama_memory_context_iR11ggml_status+0xe2) [0x7faca4de12e2]
+/usr/local/lib64/libllama.so.0(_ZN13llama_context6decodeERK11llama_batch+0x3bf) [0x7faca4de61bf]
+/usr/local/lib64/libllama.so.0(llama_decode+0xe) [0x7faca4de700e]
+/usr/local/bin/llama-bench() [0x40a3db]
+/usr/local/bin/llama-bench() [0x407edc]
+/lib64/libc.so.6(+0x35b5) [0x7faca16c25b5]
+/lib64/libc.so.6(__libc_start_main+0x88) [0x7faca16c2668]
+/usr/local/bin/llama-bench() [0x409255]
@@ -0,0 +1,21 @@
+ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+ggml_cuda_init: found 1 ROCm devices:
+  Device 0: AMD Radeon Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32
+| model                          |       size |     params | backend    | ngl | n_ubatch | fa | mmap |            test |                  t/s |
+| ------------------------------ | ---------: | ---------: | ---------- | --: | -------: | -: | ---: | --------------: | -------------------: |
+/opt/llama.cpp/ggml/src/ggml-rpc/ggml-rpc.cpp:858: Remote RPC server crashed or returned malformed response
+/usr/local/lib64/libggml-base.so.0(+0x3565) [0x7fb39df29565]
+/usr/local/lib64/libggml-base.so.0(ggml_print_backtrace+0x1eb) [0x7fb39df2992b]
+/usr/local/lib64/libggml-base.so.0(ggml_abort+0x11f) [0x7fb39df29aaf]
+/usr/local/lib64/libggml-rpc.so.0(+0xa195) [0x7fb39dfd7195]
+/usr/local/lib64/libggml-base.so.0(ggml_backend_sched_graph_compute_async+0x7f3) [0x7fb39df43de3]
+/usr/local/lib64/libllama.so.0(_ZN13llama_context13graph_computeEP11ggml_cgraphb+0xa0) [0x7fb3a0fdc650]
+/usr/local/lib64/libllama.so.0(_ZN13llama_context14process_ubatchERK12llama_ubatch14llm_graph_typeP22llama_memory_context_iR11ggml_status+0xe2) [0x7fb3a0fde2e2]
+/usr/local/lib64/libllama.so.0(_ZN13llama_context6decodeERK11llama_batch+0x3bf) [0x7fb3a0fe31bf]
+/usr/local/lib64/libllama.so.0(llama_decode+0xe) [0x7fb3a0fe400e]
+/usr/local/bin/llama-bench() [0x40a3db]
+/usr/local/bin/llama-bench() [0x407edc]
+/lib64/libc.so.6(+0x35b5) [0x7fb39d8bf5b5]
+/lib64/libc.so.6(__libc_start_main+0x88) [0x7fb39d8bf668]
+/usr/local/bin/llama-bench() [0x409255]
@@ -0,0 +1,10 @@
+ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+ggml_cuda_init: found 1 ROCm devices:
+  Device 0: Radeon 8060S Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32
+| model                          |       size |     params | backend    | ngl | n_ubatch | fa | mmap |            test |                  t/s |
+| ------------------------------ | ---------: | ---------: | ---------- | --: | -------: | -: | ---: | --------------: | -------------------: |
+| minimax-m2 230B.A10B Q6_K      | 180.94 GiB |   228.69 B | ROCm,RPC   |  99 |     2048 |  1 |    0 | pp2048 @ d32768 |         20.68 ± 0.00 |
+| minimax-m2 230B.A10B Q6_K      | 180.94 GiB |   228.69 B | ROCm,RPC   |  99 |     2048 |  1 |    0 |   tg32 @ d32768 |          3.06 ± 0.00 |
+
+build: caca0d55c (7085)
@@ -0,0 +1,10 @@
+ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+ggml_cuda_init: found 1 ROCm devices:
+  Device 0: Radeon 8060S Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32
+| model                          |       size |     params | backend    | ngl | n_ubatch | fa | mmap |            test |                  t/s |
+| ------------------------------ | ---------: | ---------: | ---------- | --: | -------: | -: | ---: | --------------: | -------------------: |
+| minimax-m2 230B.A10B Q6_K      | 180.94 GiB |   228.69 B | ROCm,RPC   |  99 |     2048 |  1 |    0 | pp2048 @ d32768 |         20.85 ± 0.00 |
+| minimax-m2 230B.A10B Q6_K      | 180.94 GiB |   228.69 B | ROCm,RPC   |  99 |     2048 |  1 |    0 |   tg32 @ d32768 |          3.07 ± 0.00 |
+
+build: caca0d55c (7085)
@@ -0,0 +1,10 @@
+ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+ggml_cuda_init: found 1 ROCm devices:
+  Device 0: Radeon 8060S Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32
+| model                          |       size |     params | backend    | ngl | n_ubatch | fa | mmap |            test |                  t/s |
+| ------------------------------ | ---------: | ---------: | ---------- | --: | -------: | -: | ---: | --------------: | -------------------: |
+| minimax-m2 230B.A10B Q6_K      | 180.94 GiB |   228.69 B | ROCm,RPC   |  99 |     2048 |  1 |    0 | pp2048 @ d32768 |         56.85 ± 0.00 |
+| minimax-m2 230B.A10B Q6_K      | 180.94 GiB |   228.69 B | ROCm,RPC   |  99 |     2048 |  1 |    0 |   tg32 @ d32768 |          5.68 ± 0.00 |
+
+build: 86f1f4411 (7085)
@@ -0,0 +1,10 @@
+ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+ggml_cuda_init: found 1 ROCm devices:
+  Device 0: Radeon 8060S Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32
+| model                          |       size |     params | backend    | ngl | n_ubatch | fa | mmap |            test |                  t/s |
+| ------------------------------ | ---------: | ---------: | ---------- | --: | -------: | -: | ---: | --------------: | -------------------: |
+| minimax-m2 230B.A10B Q6_K      | 180.94 GiB |   228.69 B | ROCm,RPC   |  99 |     2048 |  1 |    0 | pp2048 @ d32768 |         55.76 ± 0.00 |
+| minimax-m2 230B.A10B Q6_K      | 180.94 GiB |   228.69 B | ROCm,RPC   |  99 |     2048 |  1 |    0 |   tg32 @ d32768 |          5.75 ± 0.00 |
+
+build: 86f1f4411 (7085)
@@ -0,0 +1,10 @@
+ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+ggml_cuda_init: found 1 ROCm devices:
+  Device 0: Radeon 8060S Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32
+| model                          |       size |     params | backend    | ngl | n_ubatch | fa | mmap |            test |                  t/s |
+| ------------------------------ | ---------: | ---------: | ---------- | --: | -------: | -: | ---: | --------------: | -------------------: |
+| minimax-m2 230B.A10B Q6_K      | 180.94 GiB |   228.69 B | ROCm,RPC   |  99 |     2048 |  1 |    0 | pp2048 @ d32768 |         29.02 ± 0.00 |
+| minimax-m2 230B.A10B Q6_K      | 180.94 GiB |   228.69 B | ROCm,RPC   |  99 |     2048 |  1 |    0 |   tg32 @ d32768 |          3.07 ± 0.00 |
+
+build: f1840a25d (7085)
@@ -0,0 +1,10 @@
+ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+ggml_cuda_init: found 1 ROCm devices:
+  Device 0: Radeon 8060S Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32
+| model                          |       size |     params | backend    | ngl | n_ubatch | fa | mmap |            test |                  t/s |
+| ------------------------------ | ---------: | ---------: | ---------- | --: | -------: | -: | ---: | --------------: | -------------------: |
+| minimax-m2 230B.A10B Q6_K      | 180.94 GiB |   228.69 B | ROCm,RPC   |  99 |     2048 |  1 |    0 | pp2048 @ d32768 |         28.97 ± 0.00 |
+| minimax-m2 230B.A10B Q6_K      | 180.94 GiB |   228.69 B | ROCm,RPC   |  99 |     2048 |  1 |    0 |   tg32 @ d32768 |          3.06 ± 0.00 |
+
+build: f1840a25d (7085)
@@ -0,0 +1,10 @@
+ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+ggml_cuda_init: found 1 ROCm devices:
+  Device 0: Radeon 8060S Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32
+| model                          |       size |     params | backend    | ngl | n_ubatch | fa | mmap |            test |                  t/s |
+| ------------------------------ | ---------: | ---------: | ---------- | --: | -------: | -: | ---: | --------------: | -------------------: |
+| minimax-m2 230B.A10B Q6_K      | 180.94 GiB |   228.69 B | ROCm,RPC   |  99 |     2048 |  1 |    0 | pp2048 @ d32768 |         61.44 ± 0.00 |
+| minimax-m2 230B.A10B Q6_K      | 180.94 GiB |   228.69 B | ROCm,RPC   |  99 |     2048 |  1 |    0 |   tg32 @ d32768 |          5.68 ± 0.00 |
+
+build: 677be4d78 (7085)
@@ -0,0 +1,10 @@
+ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+ggml_cuda_init: found 1 ROCm devices:
+  Device 0: Radeon 8060S Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32
+| model                          |       size |     params | backend    | ngl | n_ubatch | fa | mmap |            test |                  t/s |
+| ------------------------------ | ---------: | ---------: | ---------- | --: | -------: | -: | ---: | --------------: | -------------------: |
+| minimax-m2 230B.A10B Q6_K      | 180.94 GiB |   228.69 B | ROCm,RPC   |  99 |     2048 |  1 |    0 | pp2048 @ d32768 |         59.64 ± 0.00 |
+| minimax-m2 230B.A10B Q6_K      | 180.94 GiB |   228.69 B | ROCm,RPC   |  99 |     2048 |  1 |    0 |   tg32 @ d32768 |          5.66 ± 0.00 |
+
+build: 677be4d78 (7085)
@@ -0,0 +1,10 @@
+ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+ggml_cuda_init: found 1 ROCm devices:
+  Device 0: Radeon 8060S Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32
+| model                          |       size |     params | backend    | ngl | n_ubatch | fa | mmap |            test |                  t/s |
+| ------------------------------ | ---------: | ---------: | ---------- | --: | -------: | -: | ---: | --------------: | -------------------: |
+| minimax-m2 230B.A10B Q6_K      | 180.94 GiB |   228.69 B | ROCm,RPC   |  99 |     2048 |  1 |    0 | pp2048 @ d32768 |         34.46 ± 0.00 |
+| minimax-m2 230B.A10B Q6_K      | 180.94 GiB |   228.69 B | ROCm,RPC   |  99 |     2048 |  1 |    0 |   tg32 @ d32768 |          5.66 ± 0.00 |
+
+build: 12bb5c37 (7074)
@@ -0,0 +1,10 @@
+ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+ggml_cuda_init: found 1 ROCm devices:
+  Device 0: Radeon 8060S Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32
+| model                          |       size |     params | backend    | ngl | n_ubatch | fa | mmap |            test |                  t/s |
+| ------------------------------ | ---------: | ---------: | ---------- | --: | -------: | -: | ---: | --------------: | -------------------: |
+| minimax-m2 230B.A10B Q6_K      | 180.94 GiB |   228.69 B | ROCm,RPC   |  99 |     2048 |  1 |    0 | pp2048 @ d32768 |         34.62 ± 0.00 |
+| minimax-m2 230B.A10B Q6_K      | 180.94 GiB |   228.69 B | ROCm,RPC   |  99 |     2048 |  1 |    0 |   tg32 @ d32768 |          5.65 ± 0.00 |
+
+build: 12bb5c37 (7074)
@@ -0,0 +1,10 @@
+ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+ggml_cuda_init: found 1 ROCm devices:
+  Device 0: Radeon 8060S Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32
+| model                          |       size |     params | backend    | ngl | n_ubatch | fa | mmap |            test |                  t/s |
+| ------------------------------ | ---------: | ---------: | ---------- | --: | -------: | -: | ---: | --------------: | -------------------: |
+| minimax-m2 230B.A10B Q6_K      | 180.94 GiB |   228.69 B | ROCm,RPC   |  99 |     2048 |  1 |    0 | pp2048 @ d32768 |         22.40 ± 0.00 |
+| minimax-m2 230B.A10B Q6_K      | 180.94 GiB |   228.69 B | ROCm,RPC   |  99 |     2048 |  1 |    0 |   tg32 @ d32768 |          1.84 ± 0.00 |
+
+build: 4db63cdde (7085)
@@ -0,0 +1,10 @@
+ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+ggml_cuda_init: found 1 ROCm devices:
+  Device 0: Radeon 8060S Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32
+| model                          |       size |     params | backend    | ngl | n_ubatch | fa | mmap |            test |                  t/s |
+| ------------------------------ | ---------: | ---------: | ---------- | --: | -------: | -: | ---: | --------------: | -------------------: |
+| minimax-m2 230B.A10B Q6_K      | 180.94 GiB |   228.69 B | ROCm,RPC   |  99 |     2048 |  1 |    0 | pp2048 @ d32768 |         22.47 ± 0.00 |
+| minimax-m2 230B.A10B Q6_K      | 180.94 GiB |   228.69 B | ROCm,RPC   |  99 |     2048 |  1 |    0 |   tg32 @ d32768 |          1.84 ± 0.00 |
+
+build: 4db63cdde (7085)
@@ -0,0 +1,10 @@
+ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+ggml_cuda_init: found 1 ROCm devices:
+  Device 0: Radeon 8060S Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32
+| model                          |       size |     params | backend    | ngl | n_ubatch | fa | mmap |            test |                  t/s |
+| ------------------------------ | ---------: | ---------: | ---------- | --: | -------: | -: | ---: | --------------: | -------------------: |
+| minimax-m2 230B.A10B Q6_K      | 180.94 GiB |   228.69 B | ROCm,RPC   |  99 |     2048 |  1 |    0 | pp2048 @ d32768 |         55.05 ± 0.00 |
+| minimax-m2 230B.A10B Q6_K      | 180.94 GiB |   228.69 B | ROCm,RPC   |  99 |     2048 |  1 |    0 |   tg32 @ d32768 |          5.69 ± 0.00 |
+
+build: 4fc43d43d (7085)
@@ -0,0 +1,10 @@
+ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+ggml_cuda_init: found 1 ROCm devices:
+  Device 0: Radeon 8060S Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32
+| model                          |       size |     params | backend    | ngl | n_ubatch | fa | mmap |            test |                  t/s |
+| ------------------------------ | ---------: | ---------: | ---------- | --: | -------: | -: | ---: | --------------: | -------------------: |
+| minimax-m2 230B.A10B Q6_K      | 180.94 GiB |   228.69 B | ROCm,RPC   |  99 |     2048 |  1 |    0 | pp2048 @ d32768 |         55.38 ± 0.00 |
+| minimax-m2 230B.A10B Q6_K      | 180.94 GiB |   228.69 B | ROCm,RPC   |  99 |     2048 |  1 |    0 |   tg32 @ d32768 |          5.70 ± 0.00 |
+
+build: 4fc43d43d (7085)
@@ -0,0 +1,10 @@
+ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+ggml_cuda_init: found 1 ROCm devices:
+  Device 0: AMD Radeon Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32
+| model                          |       size |     params | backend    | ngl | n_ubatch | fa | mmap |            test |                  t/s |
+| ------------------------------ | ---------: | ---------: | ---------- | --: | -------: | -: | ---: | --------------: | -------------------: |
+| minimax-m2 230B.A10B Q6_K      | 180.94 GiB |   228.69 B | ROCm,RPC   |  99 |     2048 |  1 |    0 | pp2048 @ d32768 |         29.15 ± 0.00 |
+| minimax-m2 230B.A10B Q6_K      | 180.94 GiB |   228.69 B | ROCm,RPC   |  99 |     2048 |  1 |    0 |   tg32 @ d32768 |          3.07 ± 0.00 |
+
+build: b447a9a4b (7085)
@@ -0,0 +1,10 @@
+ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+ggml_cuda_init: found 1 ROCm devices:
+  Device 0: AMD Radeon Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32
+| model                          |       size |     params | backend    | ngl | n_ubatch | fa | mmap |            test |                  t/s |
+| ------------------------------ | ---------: | ---------: | ---------- | --: | -------: | -: | ---: | --------------: | -------------------: |
+| minimax-m2 230B.A10B Q6_K      | 180.94 GiB |   228.69 B | ROCm,RPC   |  99 |     2048 |  1 |    0 | pp2048 @ d32768 |         29.32 ± 0.00 |
+| minimax-m2 230B.A10B Q6_K      | 180.94 GiB |   228.69 B | ROCm,RPC   |  99 |     2048 |  1 |    0 |   tg32 @ d32768 |          3.07 ± 0.00 |
+
+build: b447a9a4b (7085)
@@ -0,0 +1,10 @@
+ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+ggml_cuda_init: found 1 ROCm devices:
+  Device 0: AMD Radeon Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32
+| model                          |       size |     params | backend    | ngl | n_ubatch | fa | mmap |            test |                  t/s |
+| ------------------------------ | ---------: | ---------: | ---------- | --: | -------: | -: | ---: | --------------: | -------------------: |
+| minimax-m2 230B.A10B Q6_K      | 180.94 GiB |   228.69 B | ROCm,RPC   |  99 |     2048 |  1 |    0 | pp2048 @ d32768 |         57.81 ± 0.00 |
+| minimax-m2 230B.A10B Q6_K      | 180.94 GiB |   228.69 B | ROCm,RPC   |  99 |     2048 |  1 |    0 |   tg32 @ d32768 |          5.67 ± 0.00 |
+
+build: fa5c85a8b (7085)
@@ -0,0 +1,21 @@
+ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+ggml_cuda_init: found 1 ROCm devices:
+  Device 0: AMD Radeon Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32
+| model                          |       size |     params | backend    | ngl | n_ubatch | fa | mmap |            test |                  t/s |
+| ------------------------------ | ---------: | ---------: | ---------- | --: | -------: | -: | ---: | --------------: | -------------------: |
+/opt/llama.cpp/ggml/src/ggml-rpc/ggml-rpc.cpp:858: Remote RPC server crashed or returned malformed response
+/usr/local/lib64/libggml-base.so.0(+0x3565) [0x7f1a02e06565]
+/usr/local/lib64/libggml-base.so.0(ggml_print_backtrace+0x1eb) [0x7f1a02e0692b]
+/usr/local/lib64/libggml-base.so.0(ggml_abort+0x11f) [0x7f1a02e06aaf]
+/usr/local/lib64/libggml-rpc.so.0(+0xa195) [0x7f1a02eb4195]
+/usr/local/lib64/libggml-base.so.0(ggml_backend_sched_graph_compute_async+0x7f3) [0x7f1a02e20de3]
+/usr/local/lib64/libllama.so.0(_ZN13llama_context13graph_computeEP11ggml_cgraphb+0xa0) [0x7f1a05eb9650]
+/usr/local/lib64/libllama.so.0(_ZN13llama_context14process_ubatchERK12llama_ubatch14llm_graph_typeP22llama_memory_context_iR11ggml_status+0xe2) [0x7f1a05ebb2e2]
+/usr/local/lib64/libllama.so.0(_ZN13llama_context6decodeERK11llama_batch+0x3bf) [0x7f1a05ec01bf]
+/usr/local/lib64/libllama.so.0(llama_decode+0xe) [0x7f1a05ec100e]
+/usr/local/bin/llama-bench() [0x40a3db]
+/usr/local/bin/llama-bench() [0x408087]
+/lib64/libc.so.6(+0x35b5) [0x7f1a0279c5b5]
+/lib64/libc.so.6(__libc_start_main+0x88) [0x7f1a0279c668]
+/usr/local/bin/llama-bench() [0x409255]