From e7e27e6cf39ce809146a15743e6c87e92789c21b Mon Sep 17 00:00:00 2001 From: Donato Capitella Date: Sun, 3 Aug 2025 13:05:52 +0100 Subject: [PATCH] Benchmark and container updates --- README.md | 431 +++++++----------- ...B-UD-Q8_K_XL-00001-of-00002__rocm6_4_2.log | 172 +++++++ ...-UD-Q8_K_XL-00001-of-00002__rocm7_beta.log | 172 +++++++ ...2B-UD-Q8_K_XL-00001-of-00002__rocm7_rc.log | 172 +++++++ ...-Q8_K_XL-00001-of-00002__vulkan_amdvlk.log | 123 +++++ ...UD-Q8_K_XL-00001-of-00002__vulkan_radv.log | 170 +++++++ ...t-UD-Q8_K_XL-00001-of-00002__rocm6_4_2.log | 163 +++++++ ...-UD-Q8_K_XL-00001-of-00002__rocm7_beta.log | 163 +++++++ ...ct-UD-Q8_K_XL-00001-of-00002__rocm7_rc.log | 163 +++++++ ...-Q8_K_XL-00001-of-00002__vulkan_amdvlk.log | 161 +++++++ ...UD-Q8_K_XL-00001-of-00002__vulkan_radv.log | 161 +++++++ ...nstruct-Q6_K-00001-of-00002__rocm6_4_2.log | 179 ++++++++ ...struct-Q6_K-00001-of-00002__rocm7_beta.log | 179 ++++++++ ...Instruct-Q6_K-00001-of-00002__rocm7_rc.log | 179 ++++++++ ...uct-Q6_K-00001-of-00002__vulkan_amdvlk.log | 177 +++++++ ...truct-Q6_K-00001-of-00002__vulkan_radv.log | 177 +++++++ ...nstruct-Q8_0-00001-of-00003__rocm6_4_2.log | 179 ++++++++ ...struct-Q8_0-00001-of-00003__rocm7_beta.log | 179 ++++++++ ...Instruct-Q8_0-00001-of-00003__rocm7_rc.log | 179 ++++++++ ...uct-Q8_0-00001-of-00003__vulkan_amdvlk.log | 177 +++++++ ...truct-Q8_0-00001-of-00003__vulkan_radv.log | 177 +++++++ ...t-UD-Q4_K_XL-00001-of-00002__rocm6_4_2.log | 181 ++++++++ ...-UD-Q4_K_XL-00001-of-00002__rocm7_beta.log | 162 +++++++ ...ct-UD-Q4_K_XL-00001-of-00002__rocm7_rc.log | 174 +++++++ ...-Q4_K_XL-00001-of-00002__vulkan_amdvlk.log | 179 ++++++++ ...UD-Q4_K_XL-00001-of-00002__vulkan_radv.log | 179 ++++++++ ...7-UD-Q3_K_XL-00001-of-00003__rocm6_4_2.log | 184 ++++++++ ...-UD-Q3_K_XL-00001-of-00003__rocm7_beta.log | 184 ++++++++ ...07-UD-Q3_K_XL-00001-of-00003__rocm7_rc.log | 184 ++++++++ ...-Q3_K_XL-00001-of-00003__vulkan_amdvlk.log | 182 ++++++++ 
...UD-Q3_K_XL-00001-of-00003__vulkan_radv.log | 182 ++++++++ ...30B-A3B-BF16-00001-of-00002__rocm6_4_2.log | 167 +++++++ ...0B-A3B-BF16-00001-of-00002__rocm7_beta.log | 167 +++++++ ...-30B-A3B-BF16-00001-of-00002__rocm7_rc.log | 167 +++++++ ...A3B-BF16-00001-of-00002__vulkan_amdvlk.log | 165 +++++++ ...B-A3B-BF16-00001-of-00002__vulkan_radv.log | 165 +++++++ ...nstruct-BF16-00001-of-00002__rocm6_4_2.log | 176 +++++++ ...struct-BF16-00001-of-00002__rocm7_beta.log | 176 +++++++ ...Instruct-BF16-00001-of-00002__rocm7_rc.log | 176 +++++++ ...uct-BF16-00001-of-00002__vulkan_amdvlk.log | 174 +++++++ ...truct-BF16-00001-of-00002__vulkan_radv.log | 174 +++++++ .../gemma-3-12b-it-UD-Q8_K_XL__rocm6_4_2.log | 165 +++++++ .../gemma-3-12b-it-UD-Q8_K_XL__rocm7_beta.log | 165 +++++++ .../gemma-3-12b-it-UD-Q8_K_XL__rocm7_rc.log | 165 +++++++ ...mma-3-12b-it-UD-Q8_K_XL__vulkan_amdvlk.log | 163 +++++++ ...gemma-3-12b-it-UD-Q8_K_XL__vulkan_radv.log | 163 +++++++ ...-27b-it-BF16-00001-of-00002__rocm6_4_2.log | 164 +++++++ ...27b-it-BF16-00001-of-00002__rocm7_beta.log | 164 +++++++ ...3-27b-it-BF16-00001-of-00002__rocm7_rc.log | 164 +++++++ ...-it-BF16-00001-of-00002__vulkan_amdvlk.log | 113 +++++ ...7b-it-BF16-00001-of-00002__vulkan_radv.log | 162 +++++++ .../llama3.3-70.6B-Q4_K_M__rocm6_4_2.log | 159 +++++++ .../llama3.3-70.6B-Q4_K_M__rocm7_beta.log | 159 +++++++ .../llama3.3-70.6B-Q4_K_M__rocm7_rc.log | 159 +++++++ .../llama3.3-70.6B-Q4_K_M__vulkan_amdvlk.log | 157 +++++++ .../llama3.3-70.6B-Q4_K_M__vulkan_radv.log | 157 +++++++ benchmark/parse_benchmark_results.py | 120 +++++ benchmark/parse_loadtime_results.py | 71 +++ ...B-UD-Q8_K_XL-00001-of-00002__rocm6_4_2.log | 6 + ...-UD-Q8_K_XL-00001-of-00002__rocm7_beta.log | 6 + ...2B-UD-Q8_K_XL-00001-of-00002__rocm7_rc.log | 5 + ...-Q8_K_XL-00001-of-00002__vulkan_amdvlk.log | 8 + ...UD-Q8_K_XL-00001-of-00002__vulkan_radv.log | 8 + ...t-UD-Q8_K_XL-00001-of-00002__rocm6_4_2.log | 10 + ...-UD-Q8_K_XL-00001-of-00002__rocm7_beta.log | 6 + 
...ct-UD-Q8_K_XL-00001-of-00002__rocm7_rc.log | 5 + ...-Q8_K_XL-00001-of-00002__vulkan_amdvlk.log | 8 + ...UD-Q8_K_XL-00001-of-00002__vulkan_radv.log | 8 + ...nstruct-Q6_K-00001-of-00002__rocm6_4_2.log | 10 + ...struct-Q6_K-00001-of-00002__rocm7_beta.log | 6 + ...Instruct-Q6_K-00001-of-00002__rocm7_rc.log | 10 + ...uct-Q6_K-00001-of-00002__vulkan_amdvlk.log | 8 + ...truct-Q6_K-00001-of-00002__vulkan_radv.log | 8 + ...nstruct-Q8_0-00001-of-00003__rocm6_4_2.log | 6 + ...struct-Q8_0-00001-of-00003__rocm7_beta.log | 6 + ...Instruct-Q8_0-00001-of-00003__rocm7_rc.log | 5 + ...uct-Q8_0-00001-of-00003__vulkan_amdvlk.log | 8 + ...truct-Q8_0-00001-of-00003__vulkan_radv.log | 8 + ...t-UD-Q4_K_XL-00001-of-00002__rocm6_4_2.log | 10 + ...-UD-Q4_K_XL-00001-of-00002__rocm7_beta.log | 10 + ...ct-UD-Q4_K_XL-00001-of-00002__rocm7_rc.log | 5 + ...-Q4_K_XL-00001-of-00002__vulkan_amdvlk.log | 8 + ...UD-Q4_K_XL-00001-of-00002__vulkan_radv.log | 8 + ...7-UD-Q3_K_XL-00001-of-00003__rocm6_4_2.log | 10 + ...-UD-Q3_K_XL-00001-of-00003__rocm7_beta.log | 6 + ...07-UD-Q3_K_XL-00001-of-00003__rocm7_rc.log | 10 + ...-Q3_K_XL-00001-of-00003__vulkan_amdvlk.log | 8 + ...UD-Q3_K_XL-00001-of-00003__vulkan_radv.log | 8 + ...30B-A3B-BF16-00001-of-00002__rocm6_4_2.log | 10 + ...0B-A3B-BF16-00001-of-00002__rocm7_beta.log | 10 + ...-30B-A3B-BF16-00001-of-00002__rocm7_rc.log | 10 + ...A3B-BF16-00001-of-00002__vulkan_amdvlk.log | 8 + ...B-A3B-BF16-00001-of-00002__vulkan_radv.log | 8 + ...nstruct-BF16-00001-of-00002__rocm6_4_2.log | 10 + ...struct-BF16-00001-of-00002__rocm7_beta.log | 10 + ...Instruct-BF16-00001-of-00002__rocm7_rc.log | 10 + ...uct-BF16-00001-of-00002__vulkan_amdvlk.log | 8 + ...truct-BF16-00001-of-00002__vulkan_radv.log | 8 + .../gemma-3-12b-it-UD-Q8_K_XL__rocm6_4_2.log | 10 + .../gemma-3-12b-it-UD-Q8_K_XL__rocm7_beta.log | 10 + .../gemma-3-12b-it-UD-Q8_K_XL__rocm7_rc.log | 10 + ...mma-3-12b-it-UD-Q8_K_XL__vulkan_amdvlk.log | 8 + ...gemma-3-12b-it-UD-Q8_K_XL__vulkan_radv.log | 8 + 
...-27b-it-BF16-00001-of-00002__rocm6_4_2.log | 10 + ...27b-it-BF16-00001-of-00002__rocm7_beta.log | 10 + ...3-27b-it-BF16-00001-of-00002__rocm7_rc.log | 10 + ...-it-BF16-00001-of-00002__vulkan_amdvlk.log | 8 + ...7b-it-BF16-00001-of-00002__vulkan_radv.log | 8 + .../llama3.3-70.6B-Q4_K_M__rocm6_4_2.log | 10 + .../llama3.3-70.6B-Q4_K_M__rocm7_beta.log | 10 + .../llama3.3-70.6B-Q4_K_M__rocm7_rc.log | 10 + .../llama3.3-70.6B-Q4_K_M__vulkan_amdvlk.log | 8 + .../llama3.3-70.6B-Q4_K_M__vulkan_radv.log | 8 + benchmark/run_benchmarks.log | 314 +++++++++++++ benchmark/run_benchmarks.log.backup | 358 +++++++++++++++ benchmark/run_benchmarks.sh | 57 +++ benchmark/run_loadtime_benchmark.log | 277 +++++++++++ benchmark/run_loadtime_benchmark.log.backup | 331 ++++++++++++++ benchmark/run_loadtime_benchmark.sh | 88 ++++ benchmark/temp.py | 147 ++++++ .../Dockerfile.rocm-6.4.2 | 0 .../Dockerfile.rocm-7beta | 0 docker-toolboxes/Dockerfile.rocm-7rc | 79 ++++ docker-toolboxes/Dockerfile.vulkan-amdvlk | 38 ++ .../Dockerfile.vulkan-radv | 0 .../gguf-vram-estimator.py | 0 docker-toolboxes/hip-rocm7rc.patch | 28 ++ docs/benchmarks.md | 138 ++++++ docs/building.md | 75 +++ docs/vram-estimator.md | 89 ++++ 130 files changed, 12111 insertions(+), 267 deletions(-) create mode 100644 benchmark/loadtime_results/Kimi-Dev-72B-UD-Q8_K_XL-00001-of-00002__rocm6_4_2.log create mode 100644 benchmark/loadtime_results/Kimi-Dev-72B-UD-Q8_K_XL-00001-of-00002__rocm7_beta.log create mode 100644 benchmark/loadtime_results/Kimi-Dev-72B-UD-Q8_K_XL-00001-of-00002__rocm7_rc.log create mode 100644 benchmark/loadtime_results/Kimi-Dev-72B-UD-Q8_K_XL-00001-of-00002__vulkan_amdvlk.log create mode 100644 benchmark/loadtime_results/Kimi-Dev-72B-UD-Q8_K_XL-00001-of-00002__vulkan_radv.log create mode 100644 benchmark/loadtime_results/Llama-3.3-70B-Instruct-UD-Q8_K_XL-00001-of-00002__rocm6_4_2.log create mode 100644 benchmark/loadtime_results/Llama-3.3-70B-Instruct-UD-Q8_K_XL-00001-of-00002__rocm7_beta.log create mode 
100644 benchmark/loadtime_results/Llama-3.3-70B-Instruct-UD-Q8_K_XL-00001-of-00002__rocm7_rc.log create mode 100644 benchmark/loadtime_results/Llama-3.3-70B-Instruct-UD-Q8_K_XL-00001-of-00002__vulkan_amdvlk.log create mode 100644 benchmark/loadtime_results/Llama-3.3-70B-Instruct-UD-Q8_K_XL-00001-of-00002__vulkan_radv.log create mode 100644 benchmark/loadtime_results/Llama-4-Scout-17B-16E-Instruct-Q6_K-00001-of-00002__rocm6_4_2.log create mode 100644 benchmark/loadtime_results/Llama-4-Scout-17B-16E-Instruct-Q6_K-00001-of-00002__rocm7_beta.log create mode 100644 benchmark/loadtime_results/Llama-4-Scout-17B-16E-Instruct-Q6_K-00001-of-00002__rocm7_rc.log create mode 100644 benchmark/loadtime_results/Llama-4-Scout-17B-16E-Instruct-Q6_K-00001-of-00002__vulkan_amdvlk.log create mode 100644 benchmark/loadtime_results/Llama-4-Scout-17B-16E-Instruct-Q6_K-00001-of-00002__vulkan_radv.log create mode 100644 benchmark/loadtime_results/Llama-4-Scout-17B-16E-Instruct-Q8_0-00001-of-00003__rocm6_4_2.log create mode 100644 benchmark/loadtime_results/Llama-4-Scout-17B-16E-Instruct-Q8_0-00001-of-00003__rocm7_beta.log create mode 100644 benchmark/loadtime_results/Llama-4-Scout-17B-16E-Instruct-Q8_0-00001-of-00003__rocm7_rc.log create mode 100644 benchmark/loadtime_results/Llama-4-Scout-17B-16E-Instruct-Q8_0-00001-of-00003__vulkan_amdvlk.log create mode 100644 benchmark/loadtime_results/Llama-4-Scout-17B-16E-Instruct-Q8_0-00001-of-00003__vulkan_radv.log create mode 100644 benchmark/loadtime_results/Llama-4-Scout-17B-16E-Instruct-UD-Q4_K_XL-00001-of-00002__rocm6_4_2.log create mode 100644 benchmark/loadtime_results/Llama-4-Scout-17B-16E-Instruct-UD-Q4_K_XL-00001-of-00002__rocm7_beta.log create mode 100644 benchmark/loadtime_results/Llama-4-Scout-17B-16E-Instruct-UD-Q4_K_XL-00001-of-00002__rocm7_rc.log create mode 100644 benchmark/loadtime_results/Llama-4-Scout-17B-16E-Instruct-UD-Q4_K_XL-00001-of-00002__vulkan_amdvlk.log create mode 100644 
benchmark/loadtime_results/Llama-4-Scout-17B-16E-Instruct-UD-Q4_K_XL-00001-of-00002__vulkan_radv.log create mode 100644 benchmark/loadtime_results/Qwen3-235B-A22B-Instruct-2507-UD-Q3_K_XL-00001-of-00003__rocm6_4_2.log create mode 100644 benchmark/loadtime_results/Qwen3-235B-A22B-Instruct-2507-UD-Q3_K_XL-00001-of-00003__rocm7_beta.log create mode 100644 benchmark/loadtime_results/Qwen3-235B-A22B-Instruct-2507-UD-Q3_K_XL-00001-of-00003__rocm7_rc.log create mode 100644 benchmark/loadtime_results/Qwen3-235B-A22B-Instruct-2507-UD-Q3_K_XL-00001-of-00003__vulkan_amdvlk.log create mode 100644 benchmark/loadtime_results/Qwen3-235B-A22B-Instruct-2507-UD-Q3_K_XL-00001-of-00003__vulkan_radv.log create mode 100644 benchmark/loadtime_results/Qwen3-30B-A3B-BF16-00001-of-00002__rocm6_4_2.log create mode 100644 benchmark/loadtime_results/Qwen3-30B-A3B-BF16-00001-of-00002__rocm7_beta.log create mode 100644 benchmark/loadtime_results/Qwen3-30B-A3B-BF16-00001-of-00002__rocm7_rc.log create mode 100644 benchmark/loadtime_results/Qwen3-30B-A3B-BF16-00001-of-00002__vulkan_amdvlk.log create mode 100644 benchmark/loadtime_results/Qwen3-30B-A3B-BF16-00001-of-00002__vulkan_radv.log create mode 100644 benchmark/loadtime_results/Qwen3-Coder-30B-A3B-Instruct-BF16-00001-of-00002__rocm6_4_2.log create mode 100644 benchmark/loadtime_results/Qwen3-Coder-30B-A3B-Instruct-BF16-00001-of-00002__rocm7_beta.log create mode 100644 benchmark/loadtime_results/Qwen3-Coder-30B-A3B-Instruct-BF16-00001-of-00002__rocm7_rc.log create mode 100644 benchmark/loadtime_results/Qwen3-Coder-30B-A3B-Instruct-BF16-00001-of-00002__vulkan_amdvlk.log create mode 100644 benchmark/loadtime_results/Qwen3-Coder-30B-A3B-Instruct-BF16-00001-of-00002__vulkan_radv.log create mode 100644 benchmark/loadtime_results/gemma-3-12b-it-UD-Q8_K_XL__rocm6_4_2.log create mode 100644 benchmark/loadtime_results/gemma-3-12b-it-UD-Q8_K_XL__rocm7_beta.log create mode 100644 benchmark/loadtime_results/gemma-3-12b-it-UD-Q8_K_XL__rocm7_rc.log create 
mode 100644 benchmark/loadtime_results/gemma-3-12b-it-UD-Q8_K_XL__vulkan_amdvlk.log create mode 100644 benchmark/loadtime_results/gemma-3-12b-it-UD-Q8_K_XL__vulkan_radv.log create mode 100644 benchmark/loadtime_results/gemma-3-27b-it-BF16-00001-of-00002__rocm6_4_2.log create mode 100644 benchmark/loadtime_results/gemma-3-27b-it-BF16-00001-of-00002__rocm7_beta.log create mode 100644 benchmark/loadtime_results/gemma-3-27b-it-BF16-00001-of-00002__rocm7_rc.log create mode 100644 benchmark/loadtime_results/gemma-3-27b-it-BF16-00001-of-00002__vulkan_amdvlk.log create mode 100644 benchmark/loadtime_results/gemma-3-27b-it-BF16-00001-of-00002__vulkan_radv.log create mode 100644 benchmark/loadtime_results/llama3.3-70.6B-Q4_K_M__rocm6_4_2.log create mode 100644 benchmark/loadtime_results/llama3.3-70.6B-Q4_K_M__rocm7_beta.log create mode 100644 benchmark/loadtime_results/llama3.3-70.6B-Q4_K_M__rocm7_rc.log create mode 100644 benchmark/loadtime_results/llama3.3-70.6B-Q4_K_M__vulkan_amdvlk.log create mode 100644 benchmark/loadtime_results/llama3.3-70.6B-Q4_K_M__vulkan_radv.log create mode 100644 benchmark/parse_benchmark_results.py create mode 100755 benchmark/parse_loadtime_results.py create mode 100644 benchmark/results/Kimi-Dev-72B-UD-Q8_K_XL-00001-of-00002__rocm6_4_2.log create mode 100644 benchmark/results/Kimi-Dev-72B-UD-Q8_K_XL-00001-of-00002__rocm7_beta.log create mode 100644 benchmark/results/Kimi-Dev-72B-UD-Q8_K_XL-00001-of-00002__rocm7_rc.log create mode 100644 benchmark/results/Kimi-Dev-72B-UD-Q8_K_XL-00001-of-00002__vulkan_amdvlk.log create mode 100644 benchmark/results/Kimi-Dev-72B-UD-Q8_K_XL-00001-of-00002__vulkan_radv.log create mode 100644 benchmark/results/Llama-3.3-70B-Instruct-UD-Q8_K_XL-00001-of-00002__rocm6_4_2.log create mode 100644 benchmark/results/Llama-3.3-70B-Instruct-UD-Q8_K_XL-00001-of-00002__rocm7_beta.log create mode 100644 benchmark/results/Llama-3.3-70B-Instruct-UD-Q8_K_XL-00001-of-00002__rocm7_rc.log create mode 100644 
benchmark/results/Llama-3.3-70B-Instruct-UD-Q8_K_XL-00001-of-00002__vulkan_amdvlk.log create mode 100644 benchmark/results/Llama-3.3-70B-Instruct-UD-Q8_K_XL-00001-of-00002__vulkan_radv.log create mode 100644 benchmark/results/Llama-4-Scout-17B-16E-Instruct-Q6_K-00001-of-00002__rocm6_4_2.log create mode 100644 benchmark/results/Llama-4-Scout-17B-16E-Instruct-Q6_K-00001-of-00002__rocm7_beta.log create mode 100644 benchmark/results/Llama-4-Scout-17B-16E-Instruct-Q6_K-00001-of-00002__rocm7_rc.log create mode 100644 benchmark/results/Llama-4-Scout-17B-16E-Instruct-Q6_K-00001-of-00002__vulkan_amdvlk.log create mode 100644 benchmark/results/Llama-4-Scout-17B-16E-Instruct-Q6_K-00001-of-00002__vulkan_radv.log create mode 100644 benchmark/results/Llama-4-Scout-17B-16E-Instruct-Q8_0-00001-of-00003__rocm6_4_2.log create mode 100644 benchmark/results/Llama-4-Scout-17B-16E-Instruct-Q8_0-00001-of-00003__rocm7_beta.log create mode 100644 benchmark/results/Llama-4-Scout-17B-16E-Instruct-Q8_0-00001-of-00003__rocm7_rc.log create mode 100644 benchmark/results/Llama-4-Scout-17B-16E-Instruct-Q8_0-00001-of-00003__vulkan_amdvlk.log create mode 100644 benchmark/results/Llama-4-Scout-17B-16E-Instruct-Q8_0-00001-of-00003__vulkan_radv.log create mode 100644 benchmark/results/Llama-4-Scout-17B-16E-Instruct-UD-Q4_K_XL-00001-of-00002__rocm6_4_2.log create mode 100644 benchmark/results/Llama-4-Scout-17B-16E-Instruct-UD-Q4_K_XL-00001-of-00002__rocm7_beta.log create mode 100644 benchmark/results/Llama-4-Scout-17B-16E-Instruct-UD-Q4_K_XL-00001-of-00002__rocm7_rc.log create mode 100644 benchmark/results/Llama-4-Scout-17B-16E-Instruct-UD-Q4_K_XL-00001-of-00002__vulkan_amdvlk.log create mode 100644 benchmark/results/Llama-4-Scout-17B-16E-Instruct-UD-Q4_K_XL-00001-of-00002__vulkan_radv.log create mode 100644 benchmark/results/Qwen3-235B-A22B-Instruct-2507-UD-Q3_K_XL-00001-of-00003__rocm6_4_2.log create mode 100644 benchmark/results/Qwen3-235B-A22B-Instruct-2507-UD-Q3_K_XL-00001-of-00003__rocm7_beta.log 
create mode 100644 benchmark/results/Qwen3-235B-A22B-Instruct-2507-UD-Q3_K_XL-00001-of-00003__rocm7_rc.log create mode 100644 benchmark/results/Qwen3-235B-A22B-Instruct-2507-UD-Q3_K_XL-00001-of-00003__vulkan_amdvlk.log create mode 100644 benchmark/results/Qwen3-235B-A22B-Instruct-2507-UD-Q3_K_XL-00001-of-00003__vulkan_radv.log create mode 100644 benchmark/results/Qwen3-30B-A3B-BF16-00001-of-00002__rocm6_4_2.log create mode 100644 benchmark/results/Qwen3-30B-A3B-BF16-00001-of-00002__rocm7_beta.log create mode 100644 benchmark/results/Qwen3-30B-A3B-BF16-00001-of-00002__rocm7_rc.log create mode 100644 benchmark/results/Qwen3-30B-A3B-BF16-00001-of-00002__vulkan_amdvlk.log create mode 100644 benchmark/results/Qwen3-30B-A3B-BF16-00001-of-00002__vulkan_radv.log create mode 100644 benchmark/results/Qwen3-Coder-30B-A3B-Instruct-BF16-00001-of-00002__rocm6_4_2.log create mode 100644 benchmark/results/Qwen3-Coder-30B-A3B-Instruct-BF16-00001-of-00002__rocm7_beta.log create mode 100644 benchmark/results/Qwen3-Coder-30B-A3B-Instruct-BF16-00001-of-00002__rocm7_rc.log create mode 100644 benchmark/results/Qwen3-Coder-30B-A3B-Instruct-BF16-00001-of-00002__vulkan_amdvlk.log create mode 100644 benchmark/results/Qwen3-Coder-30B-A3B-Instruct-BF16-00001-of-00002__vulkan_radv.log create mode 100644 benchmark/results/gemma-3-12b-it-UD-Q8_K_XL__rocm6_4_2.log create mode 100644 benchmark/results/gemma-3-12b-it-UD-Q8_K_XL__rocm7_beta.log create mode 100644 benchmark/results/gemma-3-12b-it-UD-Q8_K_XL__rocm7_rc.log create mode 100644 benchmark/results/gemma-3-12b-it-UD-Q8_K_XL__vulkan_amdvlk.log create mode 100644 benchmark/results/gemma-3-12b-it-UD-Q8_K_XL__vulkan_radv.log create mode 100644 benchmark/results/gemma-3-27b-it-BF16-00001-of-00002__rocm6_4_2.log create mode 100644 benchmark/results/gemma-3-27b-it-BF16-00001-of-00002__rocm7_beta.log create mode 100644 benchmark/results/gemma-3-27b-it-BF16-00001-of-00002__rocm7_rc.log create mode 100644 
benchmark/results/gemma-3-27b-it-BF16-00001-of-00002__vulkan_amdvlk.log create mode 100644 benchmark/results/gemma-3-27b-it-BF16-00001-of-00002__vulkan_radv.log create mode 100644 benchmark/results/llama3.3-70.6B-Q4_K_M__rocm6_4_2.log create mode 100644 benchmark/results/llama3.3-70.6B-Q4_K_M__rocm7_beta.log create mode 100644 benchmark/results/llama3.3-70.6B-Q4_K_M__rocm7_rc.log create mode 100644 benchmark/results/llama3.3-70.6B-Q4_K_M__vulkan_amdvlk.log create mode 100644 benchmark/results/llama3.3-70.6B-Q4_K_M__vulkan_radv.log create mode 100644 benchmark/run_benchmarks.log create mode 100644 benchmark/run_benchmarks.log.backup create mode 100755 benchmark/run_benchmarks.sh create mode 100644 benchmark/run_loadtime_benchmark.log create mode 100644 benchmark/run_loadtime_benchmark.log.backup create mode 100755 benchmark/run_loadtime_benchmark.sh create mode 100644 benchmark/temp.py rename Dockerfile.rocm-6.4.2 => docker-toolboxes/Dockerfile.rocm-6.4.2 (100%) rename Dockerfile.rocm-7beta => docker-toolboxes/Dockerfile.rocm-7beta (100%) create mode 100644 docker-toolboxes/Dockerfile.rocm-7rc create mode 100644 docker-toolboxes/Dockerfile.vulkan-amdvlk rename Dockerfile.vulkan => docker-toolboxes/Dockerfile.vulkan-radv (100%) rename gguf-vram-estimator.py => docker-toolboxes/gguf-vram-estimator.py (100%) create mode 100644 docker-toolboxes/hip-rocm7rc.patch create mode 100644 docs/benchmarks.md create mode 100644 docs/building.md create mode 100644 docs/vram-estimator.md diff --git a/README.md b/README.md index 619315e..a443a31 100644 --- a/README.md +++ b/README.md @@ -1,307 +1,192 @@ -# amd-strix-halo-toolboxes +# AMD Strix Halo Llama.cpp Toolboxes -Fedora Rawhide-based containers for AMD Ryzen AI MAX+ 395 **Strix Halo** chips with integrated GPU (gfx1151) and unified memory. Pre-built with `llama.cpp` and GPU compute libraries. +This project provides pre-built containers (“toolboxes”) for running LLMs on **AMD Ryzen AI Max “Strix Halo”** integrated GPUs. 
Toolbx is the standard developer container system in Fedora (and now works on Ubuntu, openSUSE, Arch, etc). + +## Why Toolbx? + +* Reproducible: never pollute your host system +* Seamless: shares your home and GPU devices, works like a native shell +* Flexible: easy to switch between Vulkan (open/closed drivers) and ROCm ## Table of Contents -- [1. Performance Summary](#1-performance-summary) -- [2. Available Containers](#2-available-containers) -- [3. Quick Start](#3-quick-start) - - [3.1 Prerequisites](#31-prerequisites) - - [3.2 Pull Pre-built Images](#32-pull-pre-built-images) - - [3.3 Create Toolboxes](#33-create-toolboxes) - - [3.4 Enter and Test](#34-enter-and-test) -- [4. Performance Benchmarks](#4-performance-benchmarks) - - [4.1 Prompt Processing Results](#41-prompt-processing-pp512---tokenssecond) - - [4.2 Text Generation Results](#42-text-generation-tg128---tokenssecond) - - [4.3 Performance Analysis](#43-performance-analysis) -- [5. Memory Planning](#5-memory-planning) - - [5.1 VRAM Estimation Tool](#51-the-gguf-vram-estimatorpy-utility) - - [5.2 Usage Examples](#52-practical-examples-planning-for-a-128gb-strix-halo-system) -- [6. Building Locally](#6-building-containers-locally-optional) -- [7. Host Configuration](#7-host-configuration) - - [7.1 Test Configuration](#71-test-configuration) - - [7.2 Kernel Parameters (tested on Fedora 42)](#72-kernel-parameters-tested-on-fedora-42) - - [7.3 Ubuntu 24.04](#73-ubuntu-2404) +1. [Llama.cpp Compiled for Every Backend](#1-llamacpp-compiled-for-every-backend) + 1.1 [Supported Container Images](#11-supported-container-images) +2. [Quickest Usage Example](#2-quickest-usage-example) + 2.1 [Creating the toolboxes with GPU access](#21-creating-the-toolboxes-with-gpu-access) + 2.2 [Running models inside the toolboxes](#22-running-models-inside-the-toolboxes) +3. [Performance Benchmarks (Key Results)](#3-performance-benchmarks-key-results) +4. [Memory Planning & VRAM Estimator](#4-memory-planning--vram-estimator) +5. 
[Building Containers Locally](#5-building-containers-locally) +6. [Host Configuration](#6-host-configuration) + 6.1 [Test Configuration](#61-test-configuration) + 6.2 [Kernel Parameters (tested on Fedora 42)](#62-kernel-parameters-tested-on-fedora-42) + 6.3 [Ubuntu 24.04](#63-ubuntu-2404) +7. [More Documentation](#7-more-documentation) +8. [References](#8-references) +## 1. Llama.cpp Compiled for Every Backend -## 1. Performance Summary +This project uses [Llama.cpp](https://github.com/ggerganov/llama.cpp), a high-performance inference engine for running local LLMs (large language models) on CPUs and GPUs. Llama.cpp is open source, extremely fast, and is the only engine supporting all key backends for AMD Strix Halo: Vulkan (RADV, AMDVLK) and ROCm/HIP. -**Vulkan is currently the most stable and performant option** for Strix Halo GPUs: +* **Vulkan** is a cross-platform, low-level graphics and compute API. Llama.cpp can use Vulkan for GPU inference with either the open Mesa RADV driver or AMD's "official" open AMDVLK driver. This is the most stable and supported option for AMD GPUs at the moment. +* **ROCm** is AMD's open-source answer to CUDA: a GPU compute stack for machine learning and HPC. With ROCm, you can run Llama.cpp on AMD GPUs in a way similar to how CUDA works on NVIDIA - this is not the most stable/mature, but recently it's been getting better. -| Backend | Status | Notes | -|---------|---------|-------| -| **Vulkan** | ✅ **Recommended** | Most stable, best performance across all model sizes | -| **ROCm 6.4.2** | ⚠️ Limited | Works ok, but extremely slow past 64GB memory allocations [GitHub Issue #15018](https://github.com/ggml-org/llama.cpp/issues/15018) | -| **ROCm 7.0 beta** | ❌ Unstable | Frequent crashes under heavy load (llama-bench), basic usage possible | +### 1.1 Supported Container Images -## 2. 
Available Containers +| Container Tag | Backend/Stack | Purpose / Notes | +| --------------- | ------------------- | ------------------------------------------------------------------------------------- | +| `vulkan-amdvlk` | Vulkan (AMDVLK) | Fastest backend—use if model loads. AMD Open Source driver. May fail on >40 GiB/BF16. | +| `vulkan-radv` | Vulkan (Mesa RADV) | Most stable/compatible. Recommended for most users and all models. | +| `rocm-6.4.2` | ROCm 6.4.2 (HIP) | Stable for smaller (<40 GiB) and BF16 models. Crashes on larger quantized models. | +| `rocm-7beta` | ROCm 7.0 Beta (HIP) | Latest ROCm beta. No real improvement for Llama.cpp. Similar model limits to 6.4.2. | +| `rocm-7rc` | ROCm 7.0 RC (HIP) | Release candidate ROCm 7.0. Same behavior as above. | -| Container | Backend | Status | Use Case | -|-----------|---------|---------|----------| -| `vulkan` | Vulkan compute | Stable | **Primary recommendation** | -| `rocm-6.4.2` | ROCm 6.4.2 (HIP) | Stable for <64GB models | Smaller models only | -| `rocm-7beta` | ROCm 7.0 beta (HIP) | Beta/Unstable | Testing only | +> *Each container is based on Fedora Rawhide and is built for maximum compatibility and performance on Strix Halo.* -All containers include up-to-date libraries from Fedora Rawhide, except ROCm 7.0 beta which uses [official AMD RPMs](https://repo.radeon.com/rocm/el9/7.0_beta/main). +--- -## 3. Quick Start +## 2. Quickest Usage Example -### 3.1 Prerequisites +### 2.1 Creating the toolboxes with GPU access -- [Podman](https://podman.io/) (or Docker with alias) -- [Toolbox](https://containertoolbx.org/) -- Linux kernel with AMD GPU (`amdgpu`) drivers -- AMD Strix Halo GPU with proper host configuration (see [7. Host Configuration](#7-host-configuration)) +To use Llama.cpp with hardware acceleration inside a toolbox container, you must expose the GPU devices from your host. 
The exact flags and devices depend on the backend: -### 3.2 Pull Pre-built Images +* **For Vulkan (RADV/AMDVLK):** Only `/dev/dri` is required. + *Add the user to the video group for access to GPU devices.* -```bash -# Recommended: Vulkan (most stable) -podman pull docker.io/kyuz0/amd-strix-halo-toolboxes:vulkan + ```sh + toolbox create llama-vulkan-radv \ + --image docker.io/kyuz0/amd-strix-halo-toolboxes:vulkan-radv \ + -- --device /dev/dri --group-add video --security-opt seccomp=unconfined + ``` -# Optional: ROCm variants for testing -podman pull docker.io/kyuz0/amd-strix-halo-toolboxes:rocm-6.4.2 -podman pull docker.io/kyuz0/amd-strix-halo-toolboxes:rocm-7beta +* **For ROCm:** You must expose both `/dev/dri` and `/dev/kfd` (and sometimes `/dev/hsa` for older hardware), and add the user to extra groups for compute access. + + ```sh + toolbox create llama-rocm-6.4.2 \ + --image docker.io/kyuz0/amd-strix-halo-toolboxes:rocm-6.4.2 \ + -- --device /dev/dri --device /dev/kfd \ + --group-add video --group-add render --group-add sudo --security-opt seccomp=unconfined + ``` + +*Swap in the image/tag for the backend you want to use.* + +> **Note:** +> +> * `--device /dev/dri` provides graphics/video device nodes. +> * `--device /dev/kfd` is required for ROCm compute. +> * Extra groups (`video`, `render`, `sudo`) may be required for full access to GPU nodes and compute features, especially with ROCm. +> * Use `--security-opt seccomp=unconfined` to avoid seccomp sandbox issues (needed for some GPU syscalls). + +### 2.2 Running models inside the toolboxes + +Before running any commands, you must first enter your toolbox container shell using: + +```sh +toolbox enter llama-vulkan-radv ``` -### 3.3 Create Toolboxes +*This will drop you into a shell inside the toolbox, using your regular user account. 
The container shares your host home directory—so anything in your home is directly accessible (take care: your files are exposed and writable inside the toolbox!).* + +Once inside, the following commands show how to run local LLMs: + +* `llama-cli --list-devices` + *Lists available GPU devices for Llama.cpp.* +* `llama-cli --no-mmap --ngl 999 -fa -m <model>` + *Runs inference on the specified model, with all layers on GPU and flash attention enabled (replace `<model>` with your model path).* + +## 3. Performance Benchmarks (Key Results) + +Below are some results from real `llama-bench` runs on Strix Halo hardware. For full tables and model-by-model breakdowns (including both prompt processing and token generation speeds), see docs/benchmarks.md. + +| Model | Vulkan (AMDVLK) | Vulkan (RADV) | ROCm 6.4.2 | ROCm 7.0 Beta | ROCm 7.0 RC | 🏆 Best PP | 🏆 Best TG | +| ------------------------------ | ---------------- | ---------------- | ---------------- | ---------------- | ---------------- | ------------- | -------------- | +| **Gemma3 12B Q8\_0** | 686 pp / 13.9 tg | 509 pp / 13.7 tg | 223 pp / 13.8 tg | 223 pp / 13.8 tg | 223 pp / 13.8 tg | **AMDVLK** | **AMDVLK** | +| **Gemma3 27B BF16** | ❌ Crash | 135 pp / 4.0 tg | 89 pp / 4.0 tg | 82 pp / 4.0 tg | 83 pp / 4.0 tg | **RADV** | **ROCm6.4.2** | +| **Llama-4-Scout 17B Q8\_0** | 241 pp / 12.3 tg | 146 pp / 12.3 tg | ❌ Crash | ❌ Crash | ❌ Crash | **AMDVLK** | **AMDVLK** | +| **Llama-4-Scout 17B Q4\_K XL** | 209 pp / 20.1 tg | 133 pp / 20.0 tg | 133 pp / 17.3 tg | 134 pp / 17.3 tg | ❌ Crash | **AMDVLK** | **AMDVLK** | +| **Qwen3 MoE 30B BF16** | 90 pp / 8.0 tg | 71 pp / 7.3 tg | 158 pp / 22.9 tg | 151 pp / 23.8 tg | 155 pp / 23.1 tg | **ROCm6.4.2** | **ROCm7 Beta** | +| **Qwen3-235B Q3\_K XL** | 99 pp / 15.7 tg | 58 pp / 16.3 tg | 69 pp / 13.5 tg | ❌ Crash | 75 pp / 13.6 tg | **AMDVLK** | **RADV** | + + +* **pp = tokens/sec, prompt processing (pre-fill, max speed)** +* **tg = tokens/sec, generation (interactive, single token at a 
time)** +* 🏆 denotes the winner + +**Takeaways:** + +* **Vulkan AMDVLK** is the fastest—when it works. May crash on large or BF16 models. +* **Vulkan RADV** is the most stable and compatible (recommended for most usage). +* **ROCm** is only superior on BF16 models, otherwise less stable and may crash or hang. + +## 4. Memory Planning & VRAM Estimator + +Running large language models locally requires estimating **total VRAM required**—not just for the model weights, but also for the "context" (number of active tokens) and extra overhead. + +Use `gguf-vram-estimator.py` to check exactly how much memory you need for a given `.gguf` model and target context length. Example output: -**For Vulkan (Recommended):** -```bash -toolbox create llama-vulkan \ - --image docker.io/kyuz0/amd-strix-halo-toolboxes:vulkan \ - -- \ - --device /dev/dri \ - --group-add video \ - --security-opt seccomp=unconfined ``` +$ gguf-vram-estimator.py models/llama-4-scout-17b-16e/Q4_K_XL/Llama-4-Scout-17B-16E-Instruct-UD-Q4_K_XL-00001-of-00002.gguf --contexts 4096 32768 1048576 -**For ROCm 6.4.2:** -```bash -toolbox create llama-rocm-6.4.2 \ - --image docker.io/kyuz0/amd-strix-halo-toolboxes:rocm-6.4.2 \ - -- \ - --device /dev/kfd \ - --device /dev/dri \ - --group-add video \ - --security-opt seccomp=unconfined -``` - -**For ROCm 7.0 beta:** -```bash -toolbox create llama-rocm-7beta \ - --image docker.io/kyuz0/amd-strix-halo-toolboxes:rocm-7beta \ - -- \ - --device /dev/kfd \ - --device /dev/dri \ - --group-add video \ - --security-opt seccomp=unconfined -``` - -> **Note:** The `--` separator passes the remaining flags to Podman/Docker for GPU access. - -### 3.4 Enter and Test - -**Test Vulkan container:** -```bash -toolbox enter llama-vulkan -vulkaninfo | head -n 10 -llama-cli --list-devices -``` - -**Test ROCm containers:** -```bash -toolbox enter llama-rocm-6.4.2 -llama-cli --list-devices -rocm-smi -``` - -## 4. 
Performance Benchmarks - -All benchmarks performed on HP Z2 Mini G1a with 128GB RAM, using `llama-bench` with all layers offloaded to GPU. - -### 4.1 Prompt Processing (pp512) - tokens/second - -| Model | Size | Params | Vulkan | ROCm 6.4.2 | ROCm 7 Beta | Winner | -|-------|------|---------|---------|-------------|-------------|---------| -| **Gemma3 12B Q8_0** | 13.40 GiB | 11.77B | 509.45 ± 1.01 | 224.43 ± 0.26 | 219.55 ± 0.41 | 🏆 **Vulkan** (+132%) | -| **Qwen3 MoE 30B.A3B BF16** | 56.89 GiB | 30.53B | 74.62 ± 0.63 | 157.87 ± 2.71 | 155.37 ± 2.64 | 🏆 **ROCm 6.4.2** (+112%) | -| **Llama4 17Bx16E (Scout) Q4_K** | 57.73 GiB | 107.77B | 136.47 ± 1.52 | 132.61 ± 0.65 | ❌ GPU Hang | 🏆 **Vulkan** (+3%) | -| **Llama3.3 70B Q8_0** | 75.65 GiB | 70.55B | 76.51 ± 0.47 | ⚠️ Too slow | ⚠️ Too slow | 🏆 **Vulkan only** | -| **Llama4 17Bx16E (Scout) Q6_K** | 82.35 GiB | 107.77B | 139.05 ± 0.79 | ⚠️ Too slow | ⚠️ Too slow | 🏆 **Vulkan only** | -| **Qwen3 MoE 235B.A22B Q3_K** | 96.99 GiB | 235.09B | 59.12 ± 0.39 | ⚠️ Too slow | ⚠️ Too slow | 🏆 **Vulkan only** | -| **Llama4 17Bx16E (Scout) Q8_0** | 106.65 GiB | 107.77B | 148.17 ± 2.99 | ⚠️ Too slow | ⚠️ Too slow | 🏆 **Vulkan only** | - -### 4.2 Text Generation (tg128) - tokens/second - -| Model | Size | Params | Vulkan | ROCm 6.4.2 | ROCm 7 Beta | Winner | -|-------|------|---------|---------|-------------|-------------|---------| -| **Gemma3 12B Q8_0** | 13.40 GiB | 11.77B | 13.67 ± 0.01 | 13.80 ± 0.00 | 13.43 ± 0.00 | 🏆 **ROCm 6.4.2** (+1%) | -| **Qwen3 MoE 30B.A3B BF16** | 56.89 GiB | 30.53B | 7.36 ± 0.00 | 23.67 ± 0.02 | 22.21 ± 0.00 | 🏆 **ROCm 6.4.2** (+222%) | -| **Llama4 17Bx16E (Scout) Q4_K** | 57.73 GiB | 107.77B | 20.05 ± 0.00 | 17.61 ± 0.00 | ❌ GPU Hang | 🏆 **Vulkan** (+14%) | -| **Llama3.3 70B Q8_0** | 75.65 GiB | 70.55B | 2.72 ± 0.00 | ⚠️ Too slow | ⚠️ Too slow | 🏆 **Vulkan only** | -| **Llama4 17Bx16E (Scout) Q6_K** | 82.35 GiB | 107.77B | 15.22 ± 0.01 | ⚠️ Too slow | ⚠️ Too slow | 🏆 **Vulkan only** | -| **Qwen3 MoE 
235B.A22B Q3_K** | 96.99 GiB | 235.09B | 15.97 ± 0.02 | ⚠️ Too slow | ⚠️ Too slow | 🏆 **Vulkan only** | -| **Llama4 17Bx16E (Scout) Q8_0** | 106.65 GiB | 107.77B | 12.22 ± 0.01 | ⚠️ Too slow | ⚠️ Too slow | 🏆 **Vulkan only** | - -### 4.3 Performance Analysis - -**🏆 Vulkan Advantages:** -- Consistently stable across all model sizes -- Significantly better prompt processing on smaller quantized models (127% faster on Gemma3 12B) -- Only option that can handle >64GB models efficiently -- Moderate advantage on larger quantized models (3-14% better on Llama4 17B) - -**🏆 ROCm 6.4.2 Advantages:** -- **Dramatically superior performance on BF16 models** (112% faster prompt processing, 222% faster text generation on Qwen3 MoE 30B) -- Optimized native floating-point operations through HIP compute -- Better suited for models using native precision formats - -**📊 Performance by Model Type:** -- **BF16/Native Precision Models**: ROCm 6.4.2 is the clear winner with 2-3x better performance -- **Small Quantized Models**: Vulkan has significant advantages for prompt processing -- **Large Quantized Models**: Performance is similar between backends (differences within noise) -- **Large Models (>64GB)**: Vulkan is the only viable option due to ROCm's memory allocation issues - -**❌ ROCm 6.4.2 Limitations:** -- Extremely slow memory loading for models >64GB (unusable) -- Performance advantage limited to BF16/native precision models - -**❌ ROCm 7.0 Beta Issues:** -- GPU hangs/crashes on larger models (Llama4 17B causes "GPU Hang" and core dump) -- Similar slow loading issues as ROCm 6.4.2 for models >64GB -- Performance similar to ROCm 6.4.2 when it works, but reliability is poor -- Uses [official AMD RPMs](https://repo.radeon.com/rocm/el9/7.0_beta/main) (beta quality) - -**💡 Recommendation Strategy:** -- Use **ROCm 6.4.2** for BF16/native precision models under 64GB -- Use **Vulkan** for quantized models (especially smaller ones) and all models over 64GB -- For large quantized models 
under 64GB, either backend performs similarly -- Avoid ROCm 7.0 beta for production workloads - -## 5. Memory Planning - -VRAM usage has three components: **Model Weights + Context Memory (KV Cache) + Overhead**. The `gguf-vram-estimator.py` tool helps you choose the right model quantization and context size to fit within 128GB. - -### 5.1 The `gguf-vram-estimator.py` Utility - -Calculate total VRAM requirements for different context lengths: - -```bash -# Basic usage -gguf-vram-estimator.py [options] -``` - -**Key Options:** -- `--contexts`: Space-separated list of context sizes (e.g., `--contexts 4096 16384`) -- `--overhead`: Estimated overhead in GiB (default: `2.0`) - -### 5.2 Practical Examples: Planning for a 128GB Strix Halo System - -#### Scenario 1: High Quality, Short Context (Coding & Chat) - -```bash -gguf-vram-estimator.py models/llama-4-scout-17b-16e/Q8_0/Llama-4-Scout-17B-16E-Instruct-Q8_0-00001-of-00003.gguf -``` -``` --- Model 'Llama-4-Scout-17B-16E-Instruct' --- Max Context: 10,485,760 tokens -Model Size: 106.67 GiB (from file size) -Incl. Overhead: 2.00 GiB (for compute buffer, etc. adjustable via --overhead) +Model Size: 57.74 GiB +Incl. Overhead: 2.00 GiB --- Memory Footprint Estimation --- Context Size | Context Memory | Est. Total VRAM --------------------------------------------------- - 4,096 | 768.00 MiB | 109.42 GiB - 8,192 | 1.50 GiB | 110.17 GiB - 16,384 | 1.88 GiB | 110.54 GiB + 4,096 | 1.88 GiB | 61.62 GiB + 32,768 | 15.06 GiB | 74.80 GiB + 1,048,576 | 49.12 GiB | 108.87 GiB ``` -**Analysis:** The `Q8_0` model consumes **106.7 GiB**. A 16k context adds another **~1.9 GiB**, for a total of **~111 GiB**. This fits comfortably within a 128GB system. -#### Scenario 2: Large Context, Lower Precision (Long Document/Data/Code Analysis, Back-and-Forth Feedback) +With Q4\_K quantization, **Llama-4-Scout 17B** can reach a 1M token context and still fit within a 128GB system, but... 
**it will be extremely slow to process such a long context**: see benchmarks (e.g. \~200 tokens/sec for prompt processing). Processing a 1M token context may take hours. + +Contrast: Qwen3-235B Q3\_K (quantized, 97GiB model): -```bash -gguf-vram-estimator.py models/llama-4-scout-17b-16e/Q4_K_XL/Llama-4-Scout-17B-16E-Instruct-UD-Q4_K_XL-00001-of-00002.gguf ``` -``` ---- Model 'Llama-4-Scout-17B-16E-Instruct' --- -Max Context: 10,485,760 tokens -Model Size: 57.74 GiB (from file size) -Incl. Overhead: 2.00 GiB (for compute buffer, etc. adjustable via --overhead) +$ gguf-vram-estimator.py models/qwen3-235B-Q3_K-XL/UD-Q3_K_XL/Qwen3-235B-A22B-Instruct-2507-UD-Q3_K_XL-00001-of-00003.gguf --contexts 65536 131072 262144 --- Memory Footprint Estimation --- Context Size | Context Memory | Est. Total VRAM --------------------------------------------------- - 524,288 | 25.12 GiB | 84.87 GiB - 1,048,576 | 49.12 GiB | 108.87 GiB -``` -**Analysis:** To enable this, we use the `Q4_K_XL` quantization of Llama-4-Scout that is only **57.7 GiB**. The 1M token context adds a massive **49.1 GiB** of memory. The total, **~109 GiB**, is a tight but achievable fit on a 128GB system. - -#### Scenario 3: Fitting a Very Large Model - -```bash -gguf-vram-estimator.py models/qwen-3-235B-Q3_K-XL/UD-Q3_K_XL/Qwen3-235B-A22B-Instruct-2507-UD-Q3_K_XL-00001-of-00003.gguf -``` -``` ---- Model 'Qwen3-235B-A22B-Instruct-2507' --- -Max Context: 262,144 tokens -Model Size: 97.00 GiB (from file size) -Incl. Overhead: 2.00 GiB (for compute buffer, etc. adjustable via --overhead) - ---- Memory Footprint Estimation --- - Context Size | Context Memory | Est. Total VRAM ---------------------------------------------------- - 65,536 | 11.75 GiB | 110.75 GiB - 131,072 | 23.50 GiB | 122.50 GiB - 262,144 | 47.00 GiB | 146.00 GiB -``` -**Analysis:** The base model takes **97 GiB**. You have approximately **30 GiB** of headroom. 
This allows for a very large context of **~131k tokens** before exceeding the system's 128GB capacity. Attempting the full 262k context would require `146 GiB` and fail. - -## 6. Building Containers Locally (Optional) - -```bash -# Build all variants -podman build -t localhost/llama-vulkan -f Dockerfile.vulkan . -podman build -t localhost/llama-rocm-6.4.2 -f Dockerfile.rocm-6.4.2 . -podman build -t localhost/llama-rocm-7beta -f Dockerfile.rocm-7beta . + 65,536 | 11.75 GiB | 110.75 GiB + 131,072 | 23.50 GiB | 122.50 GiB + 262,144 | 47.00 GiB | 146.00 GiB ``` -### Create Toolboxes from Local Images +For Qwen3-235B, **128GB RAM allows you to run with context up to \~130k tokens.** -```bash -# Using locally built images -toolbox create llama-vulkan-local \ - --image localhost/llama-vulkan \ - -- \ - --device /dev/dri \ - --group-add video \ - --security-opt seccomp=unconfined +* The estimator lets you plan ahead and avoid out-of-memory errors when loading or using models. +* For more examples and a breakdown of VRAM components, see docs/vram-estimator.md. -toolbox create llama-rocm-local \ - --image localhost/llama-rocm-6.4.2 \ - -- \ - --device /dev/kfd \ - --device /dev/dri \ - --group-add video \ - --security-opt seccomp=unconfined -``` +--- -## 7. Host Configuration +## 5. Building Containers Locally + +Pre-built toolbox container images are published on Docker Hub for immediate use. If you wish to build the containers yourself (for example, to customize packages or rebuild with a different llama.cpp version), see: + +Full instructions: docs/building.md + +--- + +## 6. Host Configuration This should work on any Strix Halo. 
For a complete list of available hardware, see: [Strix Halo Hardware Database](https://strixhalo-homelab.d7.wtf/Hardware) - ### 7.1 Test Configuration +### 6.1 Test Configuration - | Component | Specification | - |-----------|---------------| - | **Test Machine** | HP Z2 Mini G1a | - | **CPU** | Ryzen AI MAX+ 395 "Strix Halo" | - | **System Memory** | 128 GB RAM | - | **GPU Memory** | 512 MB allocated in BIOS | - | **Host OS** | Fedora 42, kernel 6.15.6-200.fc42.x86_86_64 | +| | | +| ----------------- | --------------------------------------------- | +| **Test Machine** | HP Z2 Mini G1a | +| **CPU** | Ryzen AI MAX+ 395 "Strix Halo" | +| **System Memory** | 128 GB RAM | +| **GPU Memory** | 512 MB allocated in BIOS | +| **Host OS** | Fedora 42, kernel 6.15.6-200.fc42.x86\_64 | -### 7.2 Kernel Parameters (tested on Fedora 42) +### 6.2 Kernel Parameters (tested on Fedora 42) Add these boot parameters to enable unified memory and optimal performance: @@ -309,23 +194,35 @@ Add these boot parameters to enable unified memory and optimal performance: amd_iommu=off amdgpu.gttsize=131072 ttm.pages_limit=335544321 ``` -| Parameter | Purpose | -|-----------|---------| -| `amd_iommu=off` | Disables IOMMU for lower latency | -| `amdgpu.gttsize=131072` | Enables unified GPU/system memory (up to 128 GB) | -| `ttm.pages_limit=335544321` | Allows large pinned memory allocations | +| Parameter | Purpose | +| --------------------------- | ------------------------------------------------ | +| `amd_iommu=off` | Disables IOMMU for lower latency | +| `amdgpu.gttsize=131072` | Enables unified GPU/system memory (up to 128 GB) | +| `ttm.pages_limit=335544321` | Allows large pinned memory allocations | **Apply the changes:** -```bash + +``` # Edit /etc/default/grub to add parameters to GRUB_CMDLINE_LINUX sudo grub2-mkconfig -o /boot/grub2/grub.cfg sudo reboot ``` -### 7.3 Ubuntu 24.04 +### 6.3 Ubuntu 24.04 -Follow this guide by TechnigmaAI for a working configuration on Ubuntu 22.04:
+Follow this guide by TechnigmaAI for a working configuration on Ubuntu 24.04: -https://github.com/technigmaai/technigmaai-wiki/wiki/AMD-Ryzen-AI-Max--395:-GTT--Memory-Step%E2%80%90by%E2%80%90Step-Instructions-(Ubuntu-24.04) +[https://github.com/technigmaai/technigmaai-wiki/wiki/AMD-Ryzen-AI-Max--395:-GTT--Memory-Step%E2%80%90by%E2%80%90Step-Instructions-(Ubuntu-24.04)](https://github.com/technigmaai/technigmaai-wiki/wiki/AMD-Ryzen-AI-Max--395:-GTT--Memory-Step%E2%80%90by%E2%80%90Step-Instructions-%28Ubuntu-24.04%29) +## 7. More Documentation +* docs/benchmarks.md: Full benchmark logs, model list, parsed results +* docs/vram-estimator.md: Memory planning, practical example runs +* docs/building.md: Local build, toolbox customization, advanced use + +## 8. References + +* The main reference for AMD Ryzen AI MAX home labs, by deseven (there's also a Discord server): [https://strixhalo-homelab.d7.wtf/](https://strixhalo-homelab.d7.wtf/) +* Most comprehensive repository of test builds for Strix Halo by lhl -> [https://github.com/lhl/strix-halo-testing/tree/main](https://github.com/lhl/strix-halo-testing/tree/main) +* Ubuntu 24.04 configuration + [https://github.com/technigmaai/technigmaai-wiki/wiki/AMD-Ryzen-AI-Max--395:-GTT--Memory-Step%E2%80%90by%E2%80%90Step-Instructions-(Ubuntu-24.04)](https://github.com/technigmaai/technigmaai-wiki/wiki/AMD-Ryzen-AI-Max--395:-GTT--Memory-Step%E2%80%90by%E2%80%90Step-Instructions-%28Ubuntu-24.04%29) diff --git a/benchmark/loadtime_results/Kimi-Dev-72B-UD-Q8_K_XL-00001-of-00002__rocm6_4_2.log b/benchmark/loadtime_results/Kimi-Dev-72B-UD-Q8_K_XL-00001-of-00002__rocm6_4_2.log new file mode 100644 index 0000000..f09c0a1 --- /dev/null +++ b/benchmark/loadtime_results/Kimi-Dev-72B-UD-Q8_K_XL-00001-of-00002__rocm6_4_2.log @@ -0,0 +1,172 @@ +ggml_cuda_init: GGML_CUDA_FORCE_MMQ: no +ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no +ggml_cuda_init: found 1 ROCm devices: + Device 0: Radeon 8060S Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32
+build: 6040 (66625a59) with cc (GCC) 15.1.1 20250521 (Red Hat 15.1.1-2) for x86_64-redhat-linux +main: llama backend init +main: load the model and apply lora adapter, if any +llama_model_load_from_file_impl: using device ROCm0 (Radeon 8060S Graphics) - 124522 MiB free +llama_model_loader: additional 1 GGUFs metadata loaded. +llama_model_loader: loaded meta data with 39 key-value pairs and 963 tensors from /home/kyuz0/models/kimi-dev-72B-Q8_K_XL/UD-Q8_K_XL/Kimi-Dev-72B-UD-Q8_K_XL-00001-of-00002.gguf (version GGUF V3 (latest)) +llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output. +llama_model_loader: - kv 0: general.architecture str = qwen2 +llama_model_loader: - kv 1: general.type str = model +llama_model_loader: - kv 2: general.name str = Kimi-Dev-72B +llama_model_loader: - kv 3: general.basename str = Kimi-Dev-72B +llama_model_loader: - kv 4: general.quantized_by str = Unsloth +llama_model_loader: - kv 5: general.size_label str = 72B +llama_model_loader: - kv 6: general.license str = mit +llama_model_loader: - kv 7: general.repo_url str = https://huggingface.co/unsloth +llama_model_loader: - kv 8: general.base_model.count u32 = 1 +llama_model_loader: - kv 9: general.base_model.0.name str = Kimi Dev 72B +llama_model_loader: - kv 10: general.base_model.0.organization str = Moonshotai +llama_model_loader: - kv 11: general.base_model.0.repo_url str = https://huggingface.co/moonshotai/Kim... +llama_model_loader: - kv 12: general.tags arr[str,5] = ["code", "unsloth", "swebench", "soft... 
+llama_model_loader: - kv 13: qwen2.block_count u32 = 80 +llama_model_loader: - kv 14: qwen2.context_length u32 = 131072 +llama_model_loader: - kv 15: qwen2.embedding_length u32 = 8192 +llama_model_loader: - kv 16: qwen2.feed_forward_length u32 = 29568 +llama_model_loader: - kv 17: qwen2.attention.head_count u32 = 64 +llama_model_loader: - kv 18: qwen2.attention.head_count_kv u32 = 8 +llama_model_loader: - kv 19: qwen2.rope.freq_base f32 = 1000000.000000 +llama_model_loader: - kv 20: qwen2.attention.layer_norm_rms_epsilon f32 = 0.000001 +llama_model_loader: - kv 21: tokenizer.ggml.model str = gpt2 +llama_model_loader: - kv 22: tokenizer.ggml.pre str = qwen2 +llama_model_loader: - kv 23: tokenizer.ggml.tokens arr[str,152064] = ["!", "\"", "#", "$", "%", "&", "'", ... +llama_model_loader: - kv 24: tokenizer.ggml.token_type arr[i32,152064] = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ... +llama_model_loader: - kv 25: tokenizer.ggml.merges arr[str,151387] = ["Ġ Ġ", "ĠĠ ĠĠ", "i n", "Ġ t",... +llama_model_loader: - kv 26: tokenizer.ggml.eos_token_id u32 = 151645 +llama_model_loader: - kv 27: tokenizer.ggml.padding_token_id u32 = 151654 +llama_model_loader: - kv 28: tokenizer.ggml.add_bos_token bool = false +llama_model_loader: - kv 29: tokenizer.chat_template str = {%- if tools %}\n {{- '<|im_start|>... 
+llama_model_loader: - kv 30: general.quantization_version u32 = 2 +llama_model_loader: - kv 31: general.file_type u32 = 7 +llama_model_loader: - kv 32: quantize.imatrix.file str = Kimi-Dev-72B-GGUF/imatrix_unsloth.dat +llama_model_loader: - kv 33: quantize.imatrix.dataset str = unsloth_calibration_Kimi-Dev-72B.txt +llama_model_loader: - kv 34: quantize.imatrix.entries_count u32 = 560 +llama_model_loader: - kv 35: quantize.imatrix.chunks_count u32 = 685 +llama_model_loader: - kv 36: split.no u16 = 0 +llama_model_loader: - kv 37: split.tensors.count i32 = 963 +llama_model_loader: - kv 38: split.count u16 = 2 +llama_model_loader: - type f32: 401 tensors +llama_model_loader: - type f16: 107 tensors +llama_model_loader: - type q8_0: 455 tensors +print_info: file format = GGUF V3 (latest) +print_info: file type = Q8_0 +print_info: file size = 78.21 GiB (9.24 BPW) +load: special tokens cache size = 22 +load: token to piece cache size = 0.9310 MB +print_info: arch = qwen2 +print_info: vocab_only = 0 +print_info: n_ctx_train = 131072 +print_info: n_embd = 8192 +print_info: n_layer = 80 +print_info: n_head = 64 +print_info: n_head_kv = 8 +print_info: n_rot = 128 +print_info: n_swa = 0 +print_info: is_swa_any = 0 +print_info: n_embd_head_k = 128 +print_info: n_embd_head_v = 128 +print_info: n_gqa = 8 +print_info: n_embd_k_gqa = 1024 +print_info: n_embd_v_gqa = 1024 +print_info: f_norm_eps = 0.0e+00 +print_info: f_norm_rms_eps = 1.0e-06 +print_info: f_clamp_kqv = 0.0e+00 +print_info: f_max_alibi_bias = 0.0e+00 +print_info: f_logit_scale = 0.0e+00 +print_info: f_attn_scale = 0.0e+00 +print_info: n_ff = 29568 +print_info: n_expert = 0 +print_info: n_expert_used = 0 +print_info: causal attn = 1 +print_info: pooling type = -1 +print_info: rope type = 2 +print_info: rope scaling = linear +print_info: freq_base_train = 1000000.0 +print_info: freq_scale_train = 1 +print_info: n_ctx_orig_yarn = 131072 +print_info: rope_finetuned = unknown +print_info: model type = 70B +print_info: 
model params = 72.71 B +print_info: general.name = Kimi-Dev-72B +print_info: vocab type = BPE +print_info: n_vocab = 152064 +print_info: n_merges = 151387 +print_info: BOS token = 11 ',' +print_info: EOS token = 151645 '<|im_end|>' +print_info: EOT token = 151645 '<|im_end|>' +print_info: PAD token = 151654 '<|vision_pad|>' +print_info: LF token = 198 'Ċ' +print_info: FIM PRE token = 151659 '<|fim_prefix|>' +print_info: FIM SUF token = 151661 '<|fim_suffix|>' +print_info: FIM MID token = 151660 '<|fim_middle|>' +print_info: FIM PAD token = 151662 '<|fim_pad|>' +print_info: FIM REP token = 151663 '<|repo_name|>' +print_info: FIM SEP token = 151664 '<|file_sep|>' +print_info: EOG token = 151643 '<|endoftext|>' +print_info: EOG token = 151645 '<|im_end|>' +print_info: EOG token = 151662 '<|fim_pad|>' +print_info: EOG token = 151663 '<|repo_name|>' +print_info: EOG token = 151664 '<|file_sep|>' +print_info: max token length = 256 +load_tensors: loading model tensors, this can take a while... (mmap = false) +load_tensors: offloading 80 repeating layers to GPU +load_tensors: offloading output layer to GPU +load_tensors: offloaded 81/81 layers to GPU +load_tensors: ROCm0 model buffer size = 77715.11 MiB +load_tensors: ROCm_Host model buffer size = 2376.00 MiB +................................................................................................. 
+llama_context: constructing llama_context +llama_context: non-unified KV cache requires ggml_set_rows() - forcing unified KV cache +llama_context: n_seq_max = 1 +llama_context: n_ctx = 4096 +llama_context: n_ctx_per_seq = 4096 +llama_context: n_batch = 2048 +llama_context: n_ubatch = 512 +llama_context: causal_attn = 1 +llama_context: flash_attn = 1 +llama_context: kv_unified = true +llama_context: freq_base = 1000000.0 +llama_context: freq_scale = 1 +llama_context: n_ctx_per_seq (4096) < n_ctx_train (131072) -- the full capacity of the model will not be utilized +llama_context: ROCm_Host output buffer size = 0.58 MiB +llama_kv_cache_unified: ROCm0 KV buffer size = 1280.00 MiB +llama_kv_cache_unified: size = 1280.00 MiB ( 4096 cells, 80 layers, 1/ 1 seqs), K (f16): 640.00 MiB, V (f16): 640.00 MiB +llama_kv_cache_unified: LLAMA_SET_ROWS=0, using old ggml_cpy() method for backwards compatibility +llama_context: ROCm0 compute buffer size = 313.00 MiB +llama_context: ROCm_Host compute buffer size = 8.01 MiB +llama_context: graph nodes = 2887 +llama_context: graph splits = 1 +common_init_from_params: added <|endoftext|> logit bias = -inf +common_init_from_params: added <|im_end|> logit bias = -inf +common_init_from_params: added <|fim_pad|> logit bias = -inf +common_init_from_params: added <|repo_name|> logit bias = -inf +common_init_from_params: added <|file_sep|> logit bias = -inf +common_init_from_params: setting dry_penalty_last_n to ctx_size = 4096 +common_init_from_params: warming up the model with an empty run - please wait ... 
(--no-warmup to disable) +main: llama threadpool init, n_threads = 16 + +system_info: n_threads = 16 (n_threads_batch = 16) / 32 | ROCm : NO_VMM = 1 | PEER_MAX_BATCH_SIZE = 128 | CPU : SSE3 = 1 | SSSE3 = 1 | AVX = 1 | AVX2 = 1 | F16C = 1 | FMA = 1 | BMI2 = 1 | AVX512 = 1 | AVX512_VBMI = 1 | AVX512_VNNI = 1 | LLAMAFILE = 1 | OPENMP = 1 | REPACK = 1 | + +sampler seed: 1808727616 +sampler params: + repeat_last_n = 64, repeat_penalty = 1.000, frequency_penalty = 0.000, presence_penalty = 0.000 + dry_multiplier = 0.000, dry_base = 1.750, dry_allowed_length = 2, dry_penalty_last_n = 4096 + top_k = 40, top_p = 0.950, min_p = 0.050, xtc_probability = 0.000, xtc_threshold = 0.100, typical_p = 1.000, top_n_sigma = -1.000, temp = 0.800 + mirostat = 0, mirostat_lr = 0.100, mirostat_ent = 5.000 +sampler chain: logits -> logit-bias -> penalties -> dry -> top-n-sigma -> top-k -> typical -> top-p -> min-p -> xtc -> temp-ext -> dist +generate: n_ctx = 4096, n_batch = 2048, n_predict = 1, n_keep = 0 + +Hello0 + +llama_perf_sampler_print: sampling time = 0.06 ms / 2 runs ( 0.03 ms per token, 31746.03 tokens per second) +llama_perf_context_print: load time = 31744.47 ms +llama_perf_context_print: prompt eval time = 0.00 ms / 1 tokens ( 0.00 ms per token, inf tokens per second) +llama_perf_context_print: eval time = 463.93 ms / 1 runs ( 463.93 ms per token, 2.16 tokens per second) +llama_perf_context_print: total time = 470.35 ms / 2 tokens +llama_perf_context_print: graphs reused = 0 + Elapsed #3: 36.639378936s + Run #3 status: 0 + → Avg over 3 runs: 35.301s diff --git a/benchmark/loadtime_results/Kimi-Dev-72B-UD-Q8_K_XL-00001-of-00002__rocm7_beta.log b/benchmark/loadtime_results/Kimi-Dev-72B-UD-Q8_K_XL-00001-of-00002__rocm7_beta.log new file mode 100644 index 0000000..0006a09 --- /dev/null +++ b/benchmark/loadtime_results/Kimi-Dev-72B-UD-Q8_K_XL-00001-of-00002__rocm7_beta.log @@ -0,0 +1,172 @@ +ggml_cuda_init: GGML_CUDA_FORCE_MMQ: no +ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no 
+ggml_cuda_init: found 1 ROCm devices: + Device 0: AMD Radeon Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32 +build: 6040 (66625a59) with cc (GCC) 15.1.1 20250719 (Red Hat 15.1.1-5) for x86_64-redhat-linux +main: llama backend init +main: load the model and apply lora adapter, if any +llama_model_load_from_file_impl: using device ROCm0 (AMD Radeon Graphics) - 124523 MiB free +llama_model_loader: additional 1 GGUFs metadata loaded. +llama_model_loader: loaded meta data with 39 key-value pairs and 963 tensors from /home/kyuz0/models/kimi-dev-72B-Q8_K_XL/UD-Q8_K_XL/Kimi-Dev-72B-UD-Q8_K_XL-00001-of-00002.gguf (version GGUF V3 (latest)) +llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output. +llama_model_loader: - kv 0: general.architecture str = qwen2 +llama_model_loader: - kv 1: general.type str = model +llama_model_loader: - kv 2: general.name str = Kimi-Dev-72B +llama_model_loader: - kv 3: general.basename str = Kimi-Dev-72B +llama_model_loader: - kv 4: general.quantized_by str = Unsloth +llama_model_loader: - kv 5: general.size_label str = 72B +llama_model_loader: - kv 6: general.license str = mit +llama_model_loader: - kv 7: general.repo_url str = https://huggingface.co/unsloth +llama_model_loader: - kv 8: general.base_model.count u32 = 1 +llama_model_loader: - kv 9: general.base_model.0.name str = Kimi Dev 72B +llama_model_loader: - kv 10: general.base_model.0.organization str = Moonshotai +llama_model_loader: - kv 11: general.base_model.0.repo_url str = https://huggingface.co/moonshotai/Kim... +llama_model_loader: - kv 12: general.tags arr[str,5] = ["code", "unsloth", "swebench", "soft... 
+llama_model_loader: - kv 13: qwen2.block_count u32 = 80 +llama_model_loader: - kv 14: qwen2.context_length u32 = 131072 +llama_model_loader: - kv 15: qwen2.embedding_length u32 = 8192 +llama_model_loader: - kv 16: qwen2.feed_forward_length u32 = 29568 +llama_model_loader: - kv 17: qwen2.attention.head_count u32 = 64 +llama_model_loader: - kv 18: qwen2.attention.head_count_kv u32 = 8 +llama_model_loader: - kv 19: qwen2.rope.freq_base f32 = 1000000.000000 +llama_model_loader: - kv 20: qwen2.attention.layer_norm_rms_epsilon f32 = 0.000001 +llama_model_loader: - kv 21: tokenizer.ggml.model str = gpt2 +llama_model_loader: - kv 22: tokenizer.ggml.pre str = qwen2 +llama_model_loader: - kv 23: tokenizer.ggml.tokens arr[str,152064] = ["!", "\"", "#", "$", "%", "&", "'", ... +llama_model_loader: - kv 24: tokenizer.ggml.token_type arr[i32,152064] = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ... +llama_model_loader: - kv 25: tokenizer.ggml.merges arr[str,151387] = ["Ġ Ġ", "ĠĠ ĠĠ", "i n", "Ġ t",... +llama_model_loader: - kv 26: tokenizer.ggml.eos_token_id u32 = 151645 +llama_model_loader: - kv 27: tokenizer.ggml.padding_token_id u32 = 151654 +llama_model_loader: - kv 28: tokenizer.ggml.add_bos_token bool = false +llama_model_loader: - kv 29: tokenizer.chat_template str = {%- if tools %}\n {{- '<|im_start|>... 
+llama_model_loader: - kv 30: general.quantization_version u32 = 2 +llama_model_loader: - kv 31: general.file_type u32 = 7 +llama_model_loader: - kv 32: quantize.imatrix.file str = Kimi-Dev-72B-GGUF/imatrix_unsloth.dat +llama_model_loader: - kv 33: quantize.imatrix.dataset str = unsloth_calibration_Kimi-Dev-72B.txt +llama_model_loader: - kv 34: quantize.imatrix.entries_count u32 = 560 +llama_model_loader: - kv 35: quantize.imatrix.chunks_count u32 = 685 +llama_model_loader: - kv 36: split.no u16 = 0 +llama_model_loader: - kv 37: split.tensors.count i32 = 963 +llama_model_loader: - kv 38: split.count u16 = 2 +llama_model_loader: - type f32: 401 tensors +llama_model_loader: - type f16: 107 tensors +llama_model_loader: - type q8_0: 455 tensors +print_info: file format = GGUF V3 (latest) +print_info: file type = Q8_0 +print_info: file size = 78.21 GiB (9.24 BPW) +load: special tokens cache size = 22 +load: token to piece cache size = 0.9310 MB +print_info: arch = qwen2 +print_info: vocab_only = 0 +print_info: n_ctx_train = 131072 +print_info: n_embd = 8192 +print_info: n_layer = 80 +print_info: n_head = 64 +print_info: n_head_kv = 8 +print_info: n_rot = 128 +print_info: n_swa = 0 +print_info: is_swa_any = 0 +print_info: n_embd_head_k = 128 +print_info: n_embd_head_v = 128 +print_info: n_gqa = 8 +print_info: n_embd_k_gqa = 1024 +print_info: n_embd_v_gqa = 1024 +print_info: f_norm_eps = 0.0e+00 +print_info: f_norm_rms_eps = 1.0e-06 +print_info: f_clamp_kqv = 0.0e+00 +print_info: f_max_alibi_bias = 0.0e+00 +print_info: f_logit_scale = 0.0e+00 +print_info: f_attn_scale = 0.0e+00 +print_info: n_ff = 29568 +print_info: n_expert = 0 +print_info: n_expert_used = 0 +print_info: causal attn = 1 +print_info: pooling type = -1 +print_info: rope type = 2 +print_info: rope scaling = linear +print_info: freq_base_train = 1000000.0 +print_info: freq_scale_train = 1 +print_info: n_ctx_orig_yarn = 131072 +print_info: rope_finetuned = unknown +print_info: model type = 70B +print_info: 
model params = 72.71 B +print_info: general.name = Kimi-Dev-72B +print_info: vocab type = BPE +print_info: n_vocab = 152064 +print_info: n_merges = 151387 +print_info: BOS token = 11 ',' +print_info: EOS token = 151645 '<|im_end|>' +print_info: EOT token = 151645 '<|im_end|>' +print_info: PAD token = 151654 '<|vision_pad|>' +print_info: LF token = 198 'Ċ' +print_info: FIM PRE token = 151659 '<|fim_prefix|>' +print_info: FIM SUF token = 151661 '<|fim_suffix|>' +print_info: FIM MID token = 151660 '<|fim_middle|>' +print_info: FIM PAD token = 151662 '<|fim_pad|>' +print_info: FIM REP token = 151663 '<|repo_name|>' +print_info: FIM SEP token = 151664 '<|file_sep|>' +print_info: EOG token = 151643 '<|endoftext|>' +print_info: EOG token = 151645 '<|im_end|>' +print_info: EOG token = 151662 '<|fim_pad|>' +print_info: EOG token = 151663 '<|repo_name|>' +print_info: EOG token = 151664 '<|file_sep|>' +print_info: max token length = 256 +load_tensors: loading model tensors, this can take a while... (mmap = false) +load_tensors: offloading 80 repeating layers to GPU +load_tensors: offloading output layer to GPU +load_tensors: offloaded 81/81 layers to GPU +load_tensors: ROCm0 model buffer size = 77715.11 MiB +load_tensors: ROCm_Host model buffer size = 2376.00 MiB +................................................................................................. 
+llama_context: constructing llama_context +llama_context: non-unified KV cache requires ggml_set_rows() - forcing unified KV cache +llama_context: n_seq_max = 1 +llama_context: n_ctx = 4096 +llama_context: n_ctx_per_seq = 4096 +llama_context: n_batch = 2048 +llama_context: n_ubatch = 512 +llama_context: causal_attn = 1 +llama_context: flash_attn = 1 +llama_context: kv_unified = true +llama_context: freq_base = 1000000.0 +llama_context: freq_scale = 1 +llama_context: n_ctx_per_seq (4096) < n_ctx_train (131072) -- the full capacity of the model will not be utilized +llama_context: ROCm_Host output buffer size = 0.58 MiB +llama_kv_cache_unified: ROCm0 KV buffer size = 1280.00 MiB +llama_kv_cache_unified: size = 1280.00 MiB ( 4096 cells, 80 layers, 1/ 1 seqs), K (f16): 640.00 MiB, V (f16): 640.00 MiB +llama_kv_cache_unified: LLAMA_SET_ROWS=0, using old ggml_cpy() method for backwards compatibility +llama_context: ROCm0 compute buffer size = 313.00 MiB +llama_context: ROCm_Host compute buffer size = 8.01 MiB +llama_context: graph nodes = 2887 +llama_context: graph splits = 1 +common_init_from_params: added <|endoftext|> logit bias = -inf +common_init_from_params: added <|im_end|> logit bias = -inf +common_init_from_params: added <|fim_pad|> logit bias = -inf +common_init_from_params: added <|repo_name|> logit bias = -inf +common_init_from_params: added <|file_sep|> logit bias = -inf +common_init_from_params: setting dry_penalty_last_n to ctx_size = 4096 +common_init_from_params: warming up the model with an empty run - please wait ... 
(--no-warmup to disable) +main: llama threadpool init, n_threads = 16 + +system_info: n_threads = 16 (n_threads_batch = 16) / 32 | ROCm : NO_VMM = 1 | PEER_MAX_BATCH_SIZE = 128 | CPU : SSE3 = 1 | SSSE3 = 1 | AVX = 1 | AVX2 = 1 | F16C = 1 | FMA = 1 | BMI2 = 1 | AVX512 = 1 | AVX512_VBMI = 1 | AVX512_VNNI = 1 | LLAMAFILE = 1 | OPENMP = 1 | REPACK = 1 | + +sampler seed: 3691857665 +sampler params: + repeat_last_n = 64, repeat_penalty = 1.000, frequency_penalty = 0.000, presence_penalty = 0.000 + dry_multiplier = 0.000, dry_base = 1.750, dry_allowed_length = 2, dry_penalty_last_n = 4096 + top_k = 40, top_p = 0.950, min_p = 0.050, xtc_probability = 0.000, xtc_threshold = 0.100, typical_p = 1.000, top_n_sigma = -1.000, temp = 0.800 + mirostat = 0, mirostat_lr = 0.100, mirostat_ent = 5.000 +sampler chain: logits -> logit-bias -> penalties -> dry -> top-n-sigma -> top-k -> typical -> top-p -> min-p -> xtc -> temp-ext -> dist +generate: n_ctx = 4096, n_batch = 2048, n_predict = 1, n_keep = 0 + +Hello0 + +llama_perf_sampler_print: sampling time = 0.07 ms / 2 runs ( 0.04 ms per token, 27027.03 tokens per second) +llama_perf_context_print: load time = 30932.72 ms +llama_perf_context_print: prompt eval time = 0.00 ms / 1 tokens ( 0.00 ms per token, inf tokens per second) +llama_perf_context_print: eval time = 559.63 ms / 1 runs ( 559.63 ms per token, 1.79 tokens per second) +llama_perf_context_print: total time = 566.03 ms / 2 tokens +llama_perf_context_print: graphs reused = 0 + Elapsed #3: 32.156014765s + Run #3 status: 0 + → Avg over 3 runs: 30.024s diff --git a/benchmark/loadtime_results/Kimi-Dev-72B-UD-Q8_K_XL-00001-of-00002__rocm7_rc.log b/benchmark/loadtime_results/Kimi-Dev-72B-UD-Q8_K_XL-00001-of-00002__rocm7_rc.log new file mode 100644 index 0000000..cd42a6f --- /dev/null +++ b/benchmark/loadtime_results/Kimi-Dev-72B-UD-Q8_K_XL-00001-of-00002__rocm7_rc.log @@ -0,0 +1,172 @@ +ggml_cuda_init: GGML_CUDA_FORCE_MMQ: no +ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no 
+ggml_cuda_init: found 1 ROCm devices: + Device 0: AMD Radeon Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32 +build: 6066 (4cb208c9) with cc (GCC) 15.1.1 20250719 (Red Hat 15.1.1-5) for x86_64-redhat-linux +main: llama backend init +main: load the model and apply lora adapter, if any +llama_model_load_from_file_impl: using device ROCm0 (AMD Radeon Graphics) - 124523 MiB free +llama_model_loader: additional 1 GGUFs metadata loaded. +llama_model_loader: loaded meta data with 39 key-value pairs and 963 tensors from /home/kyuz0/models/kimi-dev-72B-Q8_K_XL/UD-Q8_K_XL/Kimi-Dev-72B-UD-Q8_K_XL-00001-of-00002.gguf (version GGUF V3 (latest)) +llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output. +llama_model_loader: - kv 0: general.architecture str = qwen2 +llama_model_loader: - kv 1: general.type str = model +llama_model_loader: - kv 2: general.name str = Kimi-Dev-72B +llama_model_loader: - kv 3: general.basename str = Kimi-Dev-72B +llama_model_loader: - kv 4: general.quantized_by str = Unsloth +llama_model_loader: - kv 5: general.size_label str = 72B +llama_model_loader: - kv 6: general.license str = mit +llama_model_loader: - kv 7: general.repo_url str = https://huggingface.co/unsloth +llama_model_loader: - kv 8: general.base_model.count u32 = 1 +llama_model_loader: - kv 9: general.base_model.0.name str = Kimi Dev 72B +llama_model_loader: - kv 10: general.base_model.0.organization str = Moonshotai +llama_model_loader: - kv 11: general.base_model.0.repo_url str = https://huggingface.co/moonshotai/Kim... +llama_model_loader: - kv 12: general.tags arr[str,5] = ["code", "unsloth", "swebench", "soft... 
+llama_model_loader: - kv 13: qwen2.block_count u32 = 80 +llama_model_loader: - kv 14: qwen2.context_length u32 = 131072 +llama_model_loader: - kv 15: qwen2.embedding_length u32 = 8192 +llama_model_loader: - kv 16: qwen2.feed_forward_length u32 = 29568 +llama_model_loader: - kv 17: qwen2.attention.head_count u32 = 64 +llama_model_loader: - kv 18: qwen2.attention.head_count_kv u32 = 8 +llama_model_loader: - kv 19: qwen2.rope.freq_base f32 = 1000000.000000 +llama_model_loader: - kv 20: qwen2.attention.layer_norm_rms_epsilon f32 = 0.000001 +llama_model_loader: - kv 21: tokenizer.ggml.model str = gpt2 +llama_model_loader: - kv 22: tokenizer.ggml.pre str = qwen2 +llama_model_loader: - kv 23: tokenizer.ggml.tokens arr[str,152064] = ["!", "\"", "#", "$", "%", "&", "'", ... +llama_model_loader: - kv 24: tokenizer.ggml.token_type arr[i32,152064] = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ... +llama_model_loader: - kv 25: tokenizer.ggml.merges arr[str,151387] = ["Ġ Ġ", "ĠĠ ĠĠ", "i n", "Ġ t",... +llama_model_loader: - kv 26: tokenizer.ggml.eos_token_id u32 = 151645 +llama_model_loader: - kv 27: tokenizer.ggml.padding_token_id u32 = 151654 +llama_model_loader: - kv 28: tokenizer.ggml.add_bos_token bool = false +llama_model_loader: - kv 29: tokenizer.chat_template str = {%- if tools %}\n {{- '<|im_start|>... 
+llama_model_loader: - kv 30: general.quantization_version u32 = 2 +llama_model_loader: - kv 31: general.file_type u32 = 7 +llama_model_loader: - kv 32: quantize.imatrix.file str = Kimi-Dev-72B-GGUF/imatrix_unsloth.dat +llama_model_loader: - kv 33: quantize.imatrix.dataset str = unsloth_calibration_Kimi-Dev-72B.txt +llama_model_loader: - kv 34: quantize.imatrix.entries_count u32 = 560 +llama_model_loader: - kv 35: quantize.imatrix.chunks_count u32 = 685 +llama_model_loader: - kv 36: split.no u16 = 0 +llama_model_loader: - kv 37: split.tensors.count i32 = 963 +llama_model_loader: - kv 38: split.count u16 = 2 +llama_model_loader: - type f32: 401 tensors +llama_model_loader: - type f16: 107 tensors +llama_model_loader: - type q8_0: 455 tensors +print_info: file format = GGUF V3 (latest) +print_info: file type = Q8_0 +print_info: file size = 78.21 GiB (9.24 BPW) +load: special tokens cache size = 22 +load: token to piece cache size = 0.9310 MB +print_info: arch = qwen2 +print_info: vocab_only = 0 +print_info: n_ctx_train = 131072 +print_info: n_embd = 8192 +print_info: n_layer = 80 +print_info: n_head = 64 +print_info: n_head_kv = 8 +print_info: n_rot = 128 +print_info: n_swa = 0 +print_info: is_swa_any = 0 +print_info: n_embd_head_k = 128 +print_info: n_embd_head_v = 128 +print_info: n_gqa = 8 +print_info: n_embd_k_gqa = 1024 +print_info: n_embd_v_gqa = 1024 +print_info: f_norm_eps = 0.0e+00 +print_info: f_norm_rms_eps = 1.0e-06 +print_info: f_clamp_kqv = 0.0e+00 +print_info: f_max_alibi_bias = 0.0e+00 +print_info: f_logit_scale = 0.0e+00 +print_info: f_attn_scale = 0.0e+00 +print_info: n_ff = 29568 +print_info: n_expert = 0 +print_info: n_expert_used = 0 +print_info: causal attn = 1 +print_info: pooling type = -1 +print_info: rope type = 2 +print_info: rope scaling = linear +print_info: freq_base_train = 1000000.0 +print_info: freq_scale_train = 1 +print_info: n_ctx_orig_yarn = 131072 +print_info: rope_finetuned = unknown +print_info: model type = 70B +print_info: 
model params = 72.71 B +print_info: general.name = Kimi-Dev-72B +print_info: vocab type = BPE +print_info: n_vocab = 152064 +print_info: n_merges = 151387 +print_info: BOS token = 11 ',' +print_info: EOS token = 151645 '<|im_end|>' +print_info: EOT token = 151645 '<|im_end|>' +print_info: PAD token = 151654 '<|vision_pad|>' +print_info: LF token = 198 'Ċ' +print_info: FIM PRE token = 151659 '<|fim_prefix|>' +print_info: FIM SUF token = 151661 '<|fim_suffix|>' +print_info: FIM MID token = 151660 '<|fim_middle|>' +print_info: FIM PAD token = 151662 '<|fim_pad|>' +print_info: FIM REP token = 151663 '<|repo_name|>' +print_info: FIM SEP token = 151664 '<|file_sep|>' +print_info: EOG token = 151643 '<|endoftext|>' +print_info: EOG token = 151645 '<|im_end|>' +print_info: EOG token = 151662 '<|fim_pad|>' +print_info: EOG token = 151663 '<|repo_name|>' +print_info: EOG token = 151664 '<|file_sep|>' +print_info: max token length = 256 +load_tensors: loading model tensors, this can take a while... (mmap = false) +load_tensors: offloading 80 repeating layers to GPU +load_tensors: offloading output layer to GPU +load_tensors: offloaded 81/81 layers to GPU +load_tensors: ROCm0 model buffer size = 77715.11 MiB +load_tensors: ROCm_Host model buffer size = 2376.00 MiB +................................................................................................. 
+llama_context: constructing llama_context +llama_context: non-unified KV cache requires ggml_set_rows() - forcing unified KV cache +llama_context: n_seq_max = 1 +llama_context: n_ctx = 4096 +llama_context: n_ctx_per_seq = 4096 +llama_context: n_batch = 2048 +llama_context: n_ubatch = 512 +llama_context: causal_attn = 1 +llama_context: flash_attn = 1 +llama_context: kv_unified = true +llama_context: freq_base = 1000000.0 +llama_context: freq_scale = 1 +llama_context: n_ctx_per_seq (4096) < n_ctx_train (131072) -- the full capacity of the model will not be utilized +llama_context: ROCm_Host output buffer size = 0.58 MiB +llama_kv_cache_unified: ROCm0 KV buffer size = 1280.00 MiB +llama_kv_cache_unified: size = 1280.00 MiB ( 4096 cells, 80 layers, 1/ 1 seqs), K (f16): 640.00 MiB, V (f16): 640.00 MiB +llama_kv_cache_unified: LLAMA_SET_ROWS=0, using old ggml_cpy() method for backwards compatibility +llama_context: ROCm0 compute buffer size = 313.00 MiB +llama_context: ROCm_Host compute buffer size = 8.01 MiB +llama_context: graph nodes = 2887 +llama_context: graph splits = 1 +common_init_from_params: added <|endoftext|> logit bias = -inf +common_init_from_params: added <|im_end|> logit bias = -inf +common_init_from_params: added <|fim_pad|> logit bias = -inf +common_init_from_params: added <|repo_name|> logit bias = -inf +common_init_from_params: added <|file_sep|> logit bias = -inf +common_init_from_params: setting dry_penalty_last_n to ctx_size = 4096 +common_init_from_params: warming up the model with an empty run - please wait ... 
(--no-warmup to disable) +main: llama threadpool init, n_threads = 16 + +system_info: n_threads = 16 (n_threads_batch = 16) / 32 | ROCm : NO_VMM = 1 | PEER_MAX_BATCH_SIZE = 128 | CPU : SSE3 = 1 | SSSE3 = 1 | AVX = 1 | AVX2 = 1 | F16C = 1 | FMA = 1 | BMI2 = 1 | AVX512 = 1 | AVX512_VBMI = 1 | AVX512_VNNI = 1 | LLAMAFILE = 1 | OPENMP = 1 | REPACK = 1 | + +sampler seed: 3133611532 +sampler params: + repeat_last_n = 64, repeat_penalty = 1.000, frequency_penalty = 0.000, presence_penalty = 0.000 + dry_multiplier = 0.000, dry_base = 1.750, dry_allowed_length = 2, dry_penalty_last_n = 4096 + top_k = 40, top_p = 0.950, min_p = 0.050, xtc_probability = 0.000, xtc_threshold = 0.100, typical_p = 1.000, top_n_sigma = -1.000, temp = 0.800 + mirostat = 0, mirostat_lr = 0.100, mirostat_ent = 5.000 +sampler chain: logits -> logit-bias -> penalties -> dry -> top-n-sigma -> top-k -> typical -> top-p -> min-p -> xtc -> temp-ext -> dist +generate: n_ctx = 4096, n_batch = 2048, n_predict = 1, n_keep = 0 + +Hello0 + +llama_perf_sampler_print: sampling time = 0.06 ms / 2 runs ( 0.03 ms per token, 35087.72 tokens per second) +llama_perf_context_print: load time = 25127.98 ms +llama_perf_context_print: prompt eval time = 0.00 ms / 1 tokens ( 0.00 ms per token, inf tokens per second) +llama_perf_context_print: eval time = 383.37 ms / 1 runs ( 383.37 ms per token, 2.61 tokens per second) +llama_perf_context_print: total time = 389.90 ms / 2 tokens +llama_perf_context_print: graphs reused = 0 + Elapsed #3: 26.238043008s + Run #3 status: 0 + → Avg over 3 runs: 26.362s diff --git a/benchmark/loadtime_results/Kimi-Dev-72B-UD-Q8_K_XL-00001-of-00002__vulkan_amdvlk.log b/benchmark/loadtime_results/Kimi-Dev-72B-UD-Q8_K_XL-00001-of-00002__vulkan_amdvlk.log new file mode 100644 index 0000000..ffaaa4a --- /dev/null +++ b/benchmark/loadtime_results/Kimi-Dev-72B-UD-Q8_K_XL-00001-of-00002__vulkan_amdvlk.log @@ -0,0 +1,123 @@ +ggml_vulkan: Found 1 Vulkan devices: +ggml_vulkan: 0 = Radeon 8060S Graphics (AMD 
open-source driver) | uma: 1 | fp16: 1 | bf16: 0 | warp size: 64 | shared memory: 32768 | int dot: 1 | matrix cores: KHR_coopmat +build: 6060 (9c35706b) with cc (GCC) 15.1.1 20250719 (Red Hat 15.1.1-5) for x86_64-redhat-linux +main: llama backend init +main: load the model and apply lora adapter, if any +llama_model_load_from_file_impl: using device Vulkan0 (Radeon 8060S Graphics) - 85720 MiB free +llama_model_loader: additional 1 GGUFs metadata loaded. +llama_model_loader: loaded meta data with 39 key-value pairs and 963 tensors from /home/kyuz0/models/kimi-dev-72B-Q8_K_XL/UD-Q8_K_XL/Kimi-Dev-72B-UD-Q8_K_XL-00001-of-00002.gguf (version GGUF V3 (latest)) +llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output. +llama_model_loader: - kv 0: general.architecture str = qwen2 +llama_model_loader: - kv 1: general.type str = model +llama_model_loader: - kv 2: general.name str = Kimi-Dev-72B +llama_model_loader: - kv 3: general.basename str = Kimi-Dev-72B +llama_model_loader: - kv 4: general.quantized_by str = Unsloth +llama_model_loader: - kv 5: general.size_label str = 72B +llama_model_loader: - kv 6: general.license str = mit +llama_model_loader: - kv 7: general.repo_url str = https://huggingface.co/unsloth +llama_model_loader: - kv 8: general.base_model.count u32 = 1 +llama_model_loader: - kv 9: general.base_model.0.name str = Kimi Dev 72B +llama_model_loader: - kv 10: general.base_model.0.organization str = Moonshotai +llama_model_loader: - kv 11: general.base_model.0.repo_url str = https://huggingface.co/moonshotai/Kim... +llama_model_loader: - kv 12: general.tags arr[str,5] = ["code", "unsloth", "swebench", "soft... 
+llama_model_loader: - kv 13: qwen2.block_count u32 = 80 +llama_model_loader: - kv 14: qwen2.context_length u32 = 131072 +llama_model_loader: - kv 15: qwen2.embedding_length u32 = 8192 +llama_model_loader: - kv 16: qwen2.feed_forward_length u32 = 29568 +llama_model_loader: - kv 17: qwen2.attention.head_count u32 = 64 +llama_model_loader: - kv 18: qwen2.attention.head_count_kv u32 = 8 +llama_model_loader: - kv 19: qwen2.rope.freq_base f32 = 1000000.000000 +llama_model_loader: - kv 20: qwen2.attention.layer_norm_rms_epsilon f32 = 0.000001 +llama_model_loader: - kv 21: tokenizer.ggml.model str = gpt2 +llama_model_loader: - kv 22: tokenizer.ggml.pre str = qwen2 +llama_model_loader: - kv 23: tokenizer.ggml.tokens arr[str,152064] = ["!", "\"", "#", "$", "%", "&", "'", ... +llama_model_loader: - kv 24: tokenizer.ggml.token_type arr[i32,152064] = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ... +llama_model_loader: - kv 25: tokenizer.ggml.merges arr[str,151387] = ["Ġ Ġ", "ĠĠ ĠĠ", "i n", "Ġ t",... +llama_model_loader: - kv 26: tokenizer.ggml.eos_token_id u32 = 151645 +llama_model_loader: - kv 27: tokenizer.ggml.padding_token_id u32 = 151654 +llama_model_loader: - kv 28: tokenizer.ggml.add_bos_token bool = false +llama_model_loader: - kv 29: tokenizer.chat_template str = {%- if tools %}\n {{- '<|im_start|>... 
+llama_model_loader: - kv 30: general.quantization_version u32 = 2 +llama_model_loader: - kv 31: general.file_type u32 = 7 +llama_model_loader: - kv 32: quantize.imatrix.file str = Kimi-Dev-72B-GGUF/imatrix_unsloth.dat +llama_model_loader: - kv 33: quantize.imatrix.dataset str = unsloth_calibration_Kimi-Dev-72B.txt +llama_model_loader: - kv 34: quantize.imatrix.entries_count u32 = 560 +llama_model_loader: - kv 35: quantize.imatrix.chunks_count u32 = 685 +llama_model_loader: - kv 36: split.no u16 = 0 +llama_model_loader: - kv 37: split.tensors.count i32 = 963 +llama_model_loader: - kv 38: split.count u16 = 2 +llama_model_loader: - type f32: 401 tensors +llama_model_loader: - type f16: 107 tensors +llama_model_loader: - type q8_0: 455 tensors +print_info: file format = GGUF V3 (latest) +print_info: file type = Q8_0 +print_info: file size = 78.21 GiB (9.24 BPW) +load: special tokens cache size = 22 +load: token to piece cache size = 0.9310 MB +print_info: arch = qwen2 +print_info: vocab_only = 0 +print_info: n_ctx_train = 131072 +print_info: n_embd = 8192 +print_info: n_layer = 80 +print_info: n_head = 64 +print_info: n_head_kv = 8 +print_info: n_rot = 128 +print_info: n_swa = 0 +print_info: is_swa_any = 0 +print_info: n_embd_head_k = 128 +print_info: n_embd_head_v = 128 +print_info: n_gqa = 8 +print_info: n_embd_k_gqa = 1024 +print_info: n_embd_v_gqa = 1024 +print_info: f_norm_eps = 0.0e+00 +print_info: f_norm_rms_eps = 1.0e-06 +print_info: f_clamp_kqv = 0.0e+00 +print_info: f_max_alibi_bias = 0.0e+00 +print_info: f_logit_scale = 0.0e+00 +print_info: f_attn_scale = 0.0e+00 +print_info: n_ff = 29568 +print_info: n_expert = 0 +print_info: n_expert_used = 0 +print_info: causal attn = 1 +print_info: pooling type = -1 +print_info: rope type = 2 +print_info: rope scaling = linear +print_info: freq_base_train = 1000000.0 +print_info: freq_scale_train = 1 +print_info: n_ctx_orig_yarn = 131072 +print_info: rope_finetuned = unknown +print_info: model type = 70B +print_info: 
model params = 72.71 B +print_info: general.name = Kimi-Dev-72B +print_info: vocab type = BPE +print_info: n_vocab = 152064 +print_info: n_merges = 151387 +print_info: BOS token = 11 ',' +print_info: EOS token = 151645 '<|im_end|>' +print_info: EOT token = 151645 '<|im_end|>' +print_info: PAD token = 151654 '<|vision_pad|>' +print_info: LF token = 198 'Ċ' +print_info: FIM PRE token = 151659 '<|fim_prefix|>' +print_info: FIM SUF token = 151661 '<|fim_suffix|>' +print_info: FIM MID token = 151660 '<|fim_middle|>' +print_info: FIM PAD token = 151662 '<|fim_pad|>' +print_info: FIM REP token = 151663 '<|repo_name|>' +print_info: FIM SEP token = 151664 '<|file_sep|>' +print_info: EOG token = 151643 '<|endoftext|>' +print_info: EOG token = 151645 '<|im_end|>' +print_info: EOG token = 151662 '<|fim_pad|>' +print_info: EOG token = 151663 '<|repo_name|>' +print_info: EOG token = 151664 '<|file_sep|>' +print_info: max token length = 256 +load_tensors: loading model tensors, this can take a while... (mmap = false) +ggml_vulkan: Device memory allocation of size 2491416576 failed. 
+ggml_vulkan: Requested buffer size exceeds device memory allocation limit: ErrorOutOfDeviceMemory +alloc_tensor_range: failed to allocate Vulkan0 buffer of size 2491416576 +llama_model_load: error loading model: unable to allocate Vulkan0 buffer +llama_model_load_from_file_impl: failed to load model +common_init_from_params: failed to load model '/home/kyuz0/models/kimi-dev-72B-Q8_K_XL/UD-Q8_K_XL/Kimi-Dev-72B-UD-Q8_K_XL-00001-of-00002.gguf' +main: error: unable to load model + Elapsed #3: .334893088s + Run #3 status: 1 + ✖ run #3 failed + → No successful runs diff --git a/benchmark/loadtime_results/Kimi-Dev-72B-UD-Q8_K_XL-00001-of-00002__vulkan_radv.log b/benchmark/loadtime_results/Kimi-Dev-72B-UD-Q8_K_XL-00001-of-00002__vulkan_radv.log new file mode 100644 index 0000000..dd58c7a --- /dev/null +++ b/benchmark/loadtime_results/Kimi-Dev-72B-UD-Q8_K_XL-00001-of-00002__vulkan_radv.log @@ -0,0 +1,170 @@ +ggml_vulkan: Found 1 Vulkan devices: +ggml_vulkan: 0 = Radeon 8060S Graphics (RADV GFX1151) (radv) | uma: 1 | fp16: 1 | bf16: 0 | warp size: 64 | shared memory: 65536 | int dot: 1 | matrix cores: KHR_coopmat +build: 6040 (66625a59) with cc (GCC) 15.1.1 20250719 (Red Hat 15.1.1-5) for x86_64-redhat-linux +main: llama backend init +main: load the model and apply lora adapter, if any +llama_model_load_from_file_impl: using device Vulkan0 (Radeon 8060S Graphics (RADV GFX1151)) - 87722 MiB free +llama_model_loader: additional 1 GGUFs metadata loaded. +llama_model_loader: loaded meta data with 39 key-value pairs and 963 tensors from /home/kyuz0/models/kimi-dev-72B-Q8_K_XL/UD-Q8_K_XL/Kimi-Dev-72B-UD-Q8_K_XL-00001-of-00002.gguf (version GGUF V3 (latest)) +llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output. 
+llama_model_loader: - kv 0: general.architecture str = qwen2 +llama_model_loader: - kv 1: general.type str = model +llama_model_loader: - kv 2: general.name str = Kimi-Dev-72B +llama_model_loader: - kv 3: general.basename str = Kimi-Dev-72B +llama_model_loader: - kv 4: general.quantized_by str = Unsloth +llama_model_loader: - kv 5: general.size_label str = 72B +llama_model_loader: - kv 6: general.license str = mit +llama_model_loader: - kv 7: general.repo_url str = https://huggingface.co/unsloth +llama_model_loader: - kv 8: general.base_model.count u32 = 1 +llama_model_loader: - kv 9: general.base_model.0.name str = Kimi Dev 72B +llama_model_loader: - kv 10: general.base_model.0.organization str = Moonshotai +llama_model_loader: - kv 11: general.base_model.0.repo_url str = https://huggingface.co/moonshotai/Kim... +llama_model_loader: - kv 12: general.tags arr[str,5] = ["code", "unsloth", "swebench", "soft... +llama_model_loader: - kv 13: qwen2.block_count u32 = 80 +llama_model_loader: - kv 14: qwen2.context_length u32 = 131072 +llama_model_loader: - kv 15: qwen2.embedding_length u32 = 8192 +llama_model_loader: - kv 16: qwen2.feed_forward_length u32 = 29568 +llama_model_loader: - kv 17: qwen2.attention.head_count u32 = 64 +llama_model_loader: - kv 18: qwen2.attention.head_count_kv u32 = 8 +llama_model_loader: - kv 19: qwen2.rope.freq_base f32 = 1000000.000000 +llama_model_loader: - kv 20: qwen2.attention.layer_norm_rms_epsilon f32 = 0.000001 +llama_model_loader: - kv 21: tokenizer.ggml.model str = gpt2 +llama_model_loader: - kv 22: tokenizer.ggml.pre str = qwen2 +llama_model_loader: - kv 23: tokenizer.ggml.tokens arr[str,152064] = ["!", "\"", "#", "$", "%", "&", "'", ... +llama_model_loader: - kv 24: tokenizer.ggml.token_type arr[i32,152064] = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ... +llama_model_loader: - kv 25: tokenizer.ggml.merges arr[str,151387] = ["Ġ Ġ", "ĠĠ ĠĠ", "i n", "Ġ t",... 
+llama_model_loader: - kv 26: tokenizer.ggml.eos_token_id u32 = 151645 +llama_model_loader: - kv 27: tokenizer.ggml.padding_token_id u32 = 151654 +llama_model_loader: - kv 28: tokenizer.ggml.add_bos_token bool = false +llama_model_loader: - kv 29: tokenizer.chat_template str = {%- if tools %}\n {{- '<|im_start|>... +llama_model_loader: - kv 30: general.quantization_version u32 = 2 +llama_model_loader: - kv 31: general.file_type u32 = 7 +llama_model_loader: - kv 32: quantize.imatrix.file str = Kimi-Dev-72B-GGUF/imatrix_unsloth.dat +llama_model_loader: - kv 33: quantize.imatrix.dataset str = unsloth_calibration_Kimi-Dev-72B.txt +llama_model_loader: - kv 34: quantize.imatrix.entries_count u32 = 560 +llama_model_loader: - kv 35: quantize.imatrix.chunks_count u32 = 685 +llama_model_loader: - kv 36: split.no u16 = 0 +llama_model_loader: - kv 37: split.tensors.count i32 = 963 +llama_model_loader: - kv 38: split.count u16 = 2 +llama_model_loader: - type f32: 401 tensors +llama_model_loader: - type f16: 107 tensors +llama_model_loader: - type q8_0: 455 tensors +print_info: file format = GGUF V3 (latest) +print_info: file type = Q8_0 +print_info: file size = 78.21 GiB (9.24 BPW) +load: special tokens cache size = 22 +load: token to piece cache size = 0.9310 MB +print_info: arch = qwen2 +print_info: vocab_only = 0 +print_info: n_ctx_train = 131072 +print_info: n_embd = 8192 +print_info: n_layer = 80 +print_info: n_head = 64 +print_info: n_head_kv = 8 +print_info: n_rot = 128 +print_info: n_swa = 0 +print_info: is_swa_any = 0 +print_info: n_embd_head_k = 128 +print_info: n_embd_head_v = 128 +print_info: n_gqa = 8 +print_info: n_embd_k_gqa = 1024 +print_info: n_embd_v_gqa = 1024 +print_info: f_norm_eps = 0.0e+00 +print_info: f_norm_rms_eps = 1.0e-06 +print_info: f_clamp_kqv = 0.0e+00 +print_info: f_max_alibi_bias = 0.0e+00 +print_info: f_logit_scale = 0.0e+00 +print_info: f_attn_scale = 0.0e+00 +print_info: n_ff = 29568 +print_info: n_expert = 0 +print_info: n_expert_used = 0 
+print_info: causal attn = 1 +print_info: pooling type = -1 +print_info: rope type = 2 +print_info: rope scaling = linear +print_info: freq_base_train = 1000000.0 +print_info: freq_scale_train = 1 +print_info: n_ctx_orig_yarn = 131072 +print_info: rope_finetuned = unknown +print_info: model type = 70B +print_info: model params = 72.71 B +print_info: general.name = Kimi-Dev-72B +print_info: vocab type = BPE +print_info: n_vocab = 152064 +print_info: n_merges = 151387 +print_info: BOS token = 11 ',' +print_info: EOS token = 151645 '<|im_end|>' +print_info: EOT token = 151645 '<|im_end|>' +print_info: PAD token = 151654 '<|vision_pad|>' +print_info: LF token = 198 'Ċ' +print_info: FIM PRE token = 151659 '<|fim_prefix|>' +print_info: FIM SUF token = 151661 '<|fim_suffix|>' +print_info: FIM MID token = 151660 '<|fim_middle|>' +print_info: FIM PAD token = 151662 '<|fim_pad|>' +print_info: FIM REP token = 151663 '<|repo_name|>' +print_info: FIM SEP token = 151664 '<|file_sep|>' +print_info: EOG token = 151643 '<|endoftext|>' +print_info: EOG token = 151645 '<|im_end|>' +print_info: EOG token = 151662 '<|fim_pad|>' +print_info: EOG token = 151663 '<|repo_name|>' +print_info: EOG token = 151664 '<|file_sep|>' +print_info: max token length = 256 +load_tensors: loading model tensors, this can take a while... (mmap = false) +load_tensors: offloading 80 repeating layers to GPU +load_tensors: offloading output layer to GPU +load_tensors: offloaded 81/81 layers to GPU +load_tensors: Vulkan0 model buffer size = 77715.09 MiB +load_tensors: Vulkan_Host model buffer size = 2376.00 MiB +................................................................................................. 
+llama_context: constructing llama_context +llama_context: non-unified KV cache requires ggml_set_rows() - forcing unified KV cache +llama_context: n_seq_max = 1 +llama_context: n_ctx = 4096 +llama_context: n_ctx_per_seq = 4096 +llama_context: n_batch = 2048 +llama_context: n_ubatch = 512 +llama_context: causal_attn = 1 +llama_context: flash_attn = 1 +llama_context: kv_unified = true +llama_context: freq_base = 1000000.0 +llama_context: freq_scale = 1 +llama_context: n_ctx_per_seq (4096) < n_ctx_train (131072) -- the full capacity of the model will not be utilized +llama_context: Vulkan_Host output buffer size = 0.58 MiB +llama_kv_cache_unified: Vulkan0 KV buffer size = 1280.00 MiB +llama_kv_cache_unified: size = 1280.00 MiB ( 4096 cells, 80 layers, 1/ 1 seqs), K (f16): 640.00 MiB, V (f16): 640.00 MiB +llama_kv_cache_unified: LLAMA_SET_ROWS=0, using old ggml_cpy() method for backwards compatibility +llama_context: Vulkan0 compute buffer size = 313.00 MiB +llama_context: Vulkan_Host compute buffer size = 24.01 MiB +llama_context: graph nodes = 2887 +llama_context: graph splits = 2 +common_init_from_params: added <|endoftext|> logit bias = -inf +common_init_from_params: added <|im_end|> logit bias = -inf +common_init_from_params: added <|fim_pad|> logit bias = -inf +common_init_from_params: added <|repo_name|> logit bias = -inf +common_init_from_params: added <|file_sep|> logit bias = -inf +common_init_from_params: setting dry_penalty_last_n to ctx_size = 4096 +common_init_from_params: warming up the model with an empty run - please wait ... 
(--no-warmup to disable) +main: llama threadpool init, n_threads = 16 + +system_info: n_threads = 16 (n_threads_batch = 16) / 32 | CPU : SSE3 = 1 | SSSE3 = 1 | AVX = 1 | AVX2 = 1 | F16C = 1 | FMA = 1 | BMI2 = 1 | AVX512 = 1 | AVX512_VBMI = 1 | AVX512_VNNI = 1 | LLAMAFILE = 1 | OPENMP = 1 | REPACK = 1 | + +sampler seed: 4071074447 +sampler params: + repeat_last_n = 64, repeat_penalty = 1.000, frequency_penalty = 0.000, presence_penalty = 0.000 + dry_multiplier = 0.000, dry_base = 1.750, dry_allowed_length = 2, dry_penalty_last_n = 4096 + top_k = 40, top_p = 0.950, min_p = 0.050, xtc_probability = 0.000, xtc_threshold = 0.100, typical_p = 1.000, top_n_sigma = -1.000, temp = 0.800 + mirostat = 0, mirostat_lr = 0.100, mirostat_ent = 5.000 +sampler chain: logits -> logit-bias -> penalties -> dry -> top-n-sigma -> top-k -> typical -> top-p -> min-p -> xtc -> temp-ext -> dist +generate: n_ctx = 4096, n_batch = 2048, n_predict = 1, n_keep = 0 + +Hello beğen + +llama_perf_sampler_print: sampling time = 0.05 ms / 2 runs ( 0.03 ms per token, 37037.04 tokens per second) +llama_perf_context_print: load time = 29902.30 ms +llama_perf_context_print: prompt eval time = 0.00 ms / 1 tokens ( 0.00 ms per token, inf tokens per second) +llama_perf_context_print: eval time = 392.32 ms / 1 runs ( 392.32 ms per token, 2.55 tokens per second) +llama_perf_context_print: total time = 399.50 ms / 2 tokens +llama_perf_context_print: graphs reused = 0 + Elapsed #3: 30.654893638s + Run #3 status: 0 + → Avg over 3 runs: 30.591s diff --git a/benchmark/loadtime_results/Llama-3.3-70B-Instruct-UD-Q8_K_XL-00001-of-00002__rocm6_4_2.log b/benchmark/loadtime_results/Llama-3.3-70B-Instruct-UD-Q8_K_XL-00001-of-00002__rocm6_4_2.log new file mode 100644 index 0000000..fa41d28 --- /dev/null +++ b/benchmark/loadtime_results/Llama-3.3-70B-Instruct-UD-Q8_K_XL-00001-of-00002__rocm6_4_2.log @@ -0,0 +1,163 @@ +ggml_cuda_init: GGML_CUDA_FORCE_MMQ: no +ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no +ggml_cuda_init: found 
1 ROCm devices: + Device 0: Radeon 8060S Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32 +build: 6040 (66625a59) with cc (GCC) 15.1.1 20250521 (Red Hat 15.1.1-2) for x86_64-redhat-linux +main: llama backend init +main: load the model and apply lora adapter, if any +llama_model_load_from_file_impl: using device ROCm0 (Radeon 8060S Graphics) - 124522 MiB free +llama_model_loader: additional 1 GGUFs metadata loaded. +llama_model_loader: loaded meta data with 39 key-value pairs and 724 tensors from /home/kyuz0/models/llama-3.3-70B-Instruct/UD-Q8_K_XL/Llama-3.3-70B-Instruct-UD-Q8_K_XL-00001-of-00002.gguf (version GGUF V3 (latest)) +llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output. +llama_model_loader: - kv 0: general.architecture str = llama +llama_model_loader: - kv 1: general.type str = model +llama_model_loader: - kv 2: general.name str = Llama-3.3-70B-Instruct +llama_model_loader: - kv 3: general.finetune str = Instruct +llama_model_loader: - kv 4: general.basename str = Llama-3.3-70B-Instruct +llama_model_loader: - kv 5: general.quantized_by str = Unsloth +llama_model_loader: - kv 6: general.size_label str = 70B +llama_model_loader: - kv 7: general.repo_url str = https://huggingface.co/unsloth +llama_model_loader: - kv 8: llama.block_count u32 = 80 +llama_model_loader: - kv 9: llama.context_length u32 = 131072 +llama_model_loader: - kv 10: llama.embedding_length u32 = 8192 +llama_model_loader: - kv 11: llama.feed_forward_length u32 = 28672 +llama_model_loader: - kv 12: llama.attention.head_count u32 = 64 +llama_model_loader: - kv 13: llama.attention.head_count_kv u32 = 8 +llama_model_loader: - kv 14: llama.rope.freq_base f32 = 500000.000000 +llama_model_loader: - kv 15: llama.attention.layer_norm_rms_epsilon f32 = 0.000010 +llama_model_loader: - kv 16: llama.attention.key_length u32 = 128 +llama_model_loader: - kv 17: llama.attention.value_length u32 = 128 +llama_model_loader: - kv 18: llama.vocab_size u32 = 128256 
+llama_model_loader: - kv 19: llama.rope.dimension_count u32 = 128 +llama_model_loader: - kv 20: tokenizer.ggml.model str = gpt2 +llama_model_loader: - kv 21: tokenizer.ggml.pre str = llama-bpe +llama_model_loader: - kv 22: tokenizer.ggml.tokens arr[str,128256] = ["!", "\"", "#", "$", "%", "&", "'", ... +llama_model_loader: - kv 23: tokenizer.ggml.token_type arr[i32,128256] = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ... +llama_model_loader: - kv 24: tokenizer.ggml.merges arr[str,280147] = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "... +llama_model_loader: - kv 25: tokenizer.ggml.bos_token_id u32 = 128000 +llama_model_loader: - kv 26: tokenizer.ggml.eos_token_id u32 = 128009 +llama_model_loader: - kv 27: tokenizer.ggml.padding_token_id u32 = 128004 +llama_model_loader: - kv 28: tokenizer.ggml.add_bos_token bool = true +llama_model_loader: - kv 29: tokenizer.chat_template str = {{- bos_token }}\n{%- if custom_tools ... +llama_model_loader: - kv 30: general.quantization_version u32 = 2 +llama_model_loader: - kv 31: general.file_type u32 = 7 +llama_model_loader: - kv 32: quantize.imatrix.file str = Llama-3.3-70B-Instruct-GGUF/imatrix_u... +llama_model_loader: - kv 33: quantize.imatrix.dataset str = unsloth_calibration_Llama-3.3-70B-Ins... 
+llama_model_loader: - kv 34: quantize.imatrix.entries_count i32 = 560 +llama_model_loader: - kv 35: quantize.imatrix.chunks_count i32 = 689 +llama_model_loader: - kv 36: split.no u16 = 0 +llama_model_loader: - kv 37: split.tensors.count i32 = 724 +llama_model_loader: - kv 38: split.count u16 = 2 +llama_model_loader: - type f32: 162 tensors +llama_model_loader: - type q8_0: 455 tensors +llama_model_loader: - type bf16: 107 tensors +print_info: file format = GGUF V3 (latest) +print_info: file type = Q8_0 +print_info: file size = 75.65 GiB (9.21 BPW) +load: special tokens cache size = 256 +load: token to piece cache size = 0.7999 MB +print_info: arch = llama +print_info: vocab_only = 0 +print_info: n_ctx_train = 131072 +print_info: n_embd = 8192 +print_info: n_layer = 80 +print_info: n_head = 64 +print_info: n_head_kv = 8 +print_info: n_rot = 128 +print_info: n_swa = 0 +print_info: is_swa_any = 0 +print_info: n_embd_head_k = 128 +print_info: n_embd_head_v = 128 +print_info: n_gqa = 8 +print_info: n_embd_k_gqa = 1024 +print_info: n_embd_v_gqa = 1024 +print_info: f_norm_eps = 0.0e+00 +print_info: f_norm_rms_eps = 1.0e-05 +print_info: f_clamp_kqv = 0.0e+00 +print_info: f_max_alibi_bias = 0.0e+00 +print_info: f_logit_scale = 0.0e+00 +print_info: f_attn_scale = 0.0e+00 +print_info: n_ff = 28672 +print_info: n_expert = 0 +print_info: n_expert_used = 0 +print_info: causal attn = 1 +print_info: pooling type = 0 +print_info: rope type = 0 +print_info: rope scaling = linear +print_info: freq_base_train = 500000.0 +print_info: freq_scale_train = 1 +print_info: n_ctx_orig_yarn = 131072 +print_info: rope_finetuned = unknown +print_info: model type = 70B +print_info: model params = 70.55 B +print_info: general.name = Llama-3.3-70B-Instruct +print_info: vocab type = BPE +print_info: n_vocab = 128256 +print_info: n_merges = 280147 +print_info: BOS token = 128000 '<|begin_of_text|>' +print_info: EOS token = 128009 '<|eot_id|>' +print_info: EOT token = 128009 '<|eot_id|>' +print_info: 
EOM token = 128008 '<|eom_id|>' +print_info: PAD token = 128004 '<|finetune_right_pad_id|>' +print_info: LF token = 198 'Ċ' +print_info: EOG token = 128001 '<|end_of_text|>' +print_info: EOG token = 128008 '<|eom_id|>' +print_info: EOG token = 128009 '<|eot_id|>' +print_info: max token length = 256 +load_tensors: loading model tensors, this can take a while... (mmap = false) +load_tensors: offloading 80 repeating layers to GPU +load_tensors: offloading output layer to GPU +load_tensors: offloaded 81/81 layers to GPU +load_tensors: ROCm0 model buffer size = 75456.53 MiB +load_tensors: ROCm_Host model buffer size = 2004.00 MiB +................................................................................................. +llama_context: constructing llama_context +llama_context: non-unified KV cache requires ggml_set_rows() - forcing unified KV cache +llama_context: n_seq_max = 1 +llama_context: n_ctx = 4096 +llama_context: n_ctx_per_seq = 4096 +llama_context: n_batch = 2048 +llama_context: n_ubatch = 512 +llama_context: causal_attn = 1 +llama_context: flash_attn = 1 +llama_context: kv_unified = true +llama_context: freq_base = 500000.0 +llama_context: freq_scale = 1 +llama_context: n_ctx_per_seq (4096) < n_ctx_train (131072) -- the full capacity of the model will not be utilized +llama_context: ROCm_Host output buffer size = 0.49 MiB +llama_kv_cache_unified: ROCm0 KV buffer size = 1280.00 MiB +llama_kv_cache_unified: size = 1280.00 MiB ( 4096 cells, 80 layers, 1/ 1 seqs), K (f16): 640.00 MiB, V (f16): 640.00 MiB +llama_kv_cache_unified: LLAMA_SET_ROWS=0, using old ggml_cpy() method for backwards compatibility +llama_context: ROCm0 compute buffer size = 266.50 MiB +llama_context: ROCm_Host compute buffer size = 8.01 MiB +llama_context: graph nodes = 2647 +llama_context: graph splits = 1 +common_init_from_params: added <|end_of_text|> logit bias = -inf +common_init_from_params: added <|eom_id|> logit bias = -inf +common_init_from_params: added <|eot_id|> logit bias 
= -inf +common_init_from_params: setting dry_penalty_last_n to ctx_size = 4096 +common_init_from_params: warming up the model with an empty run - please wait ... (--no-warmup to disable) +main: llama threadpool init, n_threads = 16 + +system_info: n_threads = 16 (n_threads_batch = 16) / 32 | ROCm : NO_VMM = 1 | PEER_MAX_BATCH_SIZE = 128 | CPU : SSE3 = 1 | SSSE3 = 1 | AVX = 1 | AVX2 = 1 | F16C = 1 | FMA = 1 | BMI2 = 1 | AVX512 = 1 | AVX512_VBMI = 1 | AVX512_VNNI = 1 | LLAMAFILE = 1 | OPENMP = 1 | REPACK = 1 | + +sampler seed: 192699360 +sampler params: + repeat_last_n = 64, repeat_penalty = 1.000, frequency_penalty = 0.000, presence_penalty = 0.000 + dry_multiplier = 0.000, dry_base = 1.750, dry_allowed_length = 2, dry_penalty_last_n = 4096 + top_k = 40, top_p = 0.950, min_p = 0.050, xtc_probability = 0.000, xtc_threshold = 0.100, typical_p = 1.000, top_n_sigma = -1.000, temp = 0.800 + mirostat = 0, mirostat_lr = 0.100, mirostat_ent = 5.000 +sampler chain: logits -> logit-bias -> penalties -> dry -> top-n-sigma -> top-k -> typical -> top-p -> min-p -> xtc -> temp-ext -> dist +generate: n_ctx = 4096, n_batch = 2048, n_predict = 1, n_keep = 1 + +Hello, + +llama_perf_sampler_print: sampling time = 0.05 ms / 3 runs ( 0.02 ms per token, 63829.79 tokens per second) +llama_perf_context_print: load time = 24487.91 ms +llama_perf_context_print: prompt eval time = 368.54 ms / 2 tokens ( 184.27 ms per token, 5.43 tokens per second) +llama_perf_context_print: eval time = 0.00 ms / 1 runs ( 0.00 ms per token, inf tokens per second) +llama_perf_context_print: total time = 383.50 ms / 3 tokens +llama_perf_context_print: graphs reused = 0 + Elapsed #3: 28.922457711s + Run #3 status: 0 + → Avg over 3 runs: 30.998s diff --git a/benchmark/loadtime_results/Llama-3.3-70B-Instruct-UD-Q8_K_XL-00001-of-00002__rocm7_beta.log b/benchmark/loadtime_results/Llama-3.3-70B-Instruct-UD-Q8_K_XL-00001-of-00002__rocm7_beta.log new file mode 100644 index 0000000..611a7c5 --- /dev/null +++ 
b/benchmark/loadtime_results/Llama-3.3-70B-Instruct-UD-Q8_K_XL-00001-of-00002__rocm7_beta.log @@ -0,0 +1,163 @@ +ggml_cuda_init: GGML_CUDA_FORCE_MMQ: no +ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no +ggml_cuda_init: found 1 ROCm devices: + Device 0: AMD Radeon Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32 +build: 6040 (66625a59) with cc (GCC) 15.1.1 20250719 (Red Hat 15.1.1-5) for x86_64-redhat-linux +main: llama backend init +main: load the model and apply lora adapter, if any +llama_model_load_from_file_impl: using device ROCm0 (AMD Radeon Graphics) - 124523 MiB free +llama_model_loader: additional 1 GGUFs metadata loaded. +llama_model_loader: loaded meta data with 39 key-value pairs and 724 tensors from /home/kyuz0/models/llama-3.3-70B-Instruct/UD-Q8_K_XL/Llama-3.3-70B-Instruct-UD-Q8_K_XL-00001-of-00002.gguf (version GGUF V3 (latest)) +llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output. +llama_model_loader: - kv 0: general.architecture str = llama +llama_model_loader: - kv 1: general.type str = model +llama_model_loader: - kv 2: general.name str = Llama-3.3-70B-Instruct +llama_model_loader: - kv 3: general.finetune str = Instruct +llama_model_loader: - kv 4: general.basename str = Llama-3.3-70B-Instruct +llama_model_loader: - kv 5: general.quantized_by str = Unsloth +llama_model_loader: - kv 6: general.size_label str = 70B +llama_model_loader: - kv 7: general.repo_url str = https://huggingface.co/unsloth +llama_model_loader: - kv 8: llama.block_count u32 = 80 +llama_model_loader: - kv 9: llama.context_length u32 = 131072 +llama_model_loader: - kv 10: llama.embedding_length u32 = 8192 +llama_model_loader: - kv 11: llama.feed_forward_length u32 = 28672 +llama_model_loader: - kv 12: llama.attention.head_count u32 = 64 +llama_model_loader: - kv 13: llama.attention.head_count_kv u32 = 8 +llama_model_loader: - kv 14: llama.rope.freq_base f32 = 500000.000000 +llama_model_loader: - kv 15: 
llama.attention.layer_norm_rms_epsilon f32 = 0.000010 +llama_model_loader: - kv 16: llama.attention.key_length u32 = 128 +llama_model_loader: - kv 17: llama.attention.value_length u32 = 128 +llama_model_loader: - kv 18: llama.vocab_size u32 = 128256 +llama_model_loader: - kv 19: llama.rope.dimension_count u32 = 128 +llama_model_loader: - kv 20: tokenizer.ggml.model str = gpt2 +llama_model_loader: - kv 21: tokenizer.ggml.pre str = llama-bpe +llama_model_loader: - kv 22: tokenizer.ggml.tokens arr[str,128256] = ["!", "\"", "#", "$", "%", "&", "'", ... +llama_model_loader: - kv 23: tokenizer.ggml.token_type arr[i32,128256] = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ... +llama_model_loader: - kv 24: tokenizer.ggml.merges arr[str,280147] = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "... +llama_model_loader: - kv 25: tokenizer.ggml.bos_token_id u32 = 128000 +llama_model_loader: - kv 26: tokenizer.ggml.eos_token_id u32 = 128009 +llama_model_loader: - kv 27: tokenizer.ggml.padding_token_id u32 = 128004 +llama_model_loader: - kv 28: tokenizer.ggml.add_bos_token bool = true +llama_model_loader: - kv 29: tokenizer.chat_template str = {{- bos_token }}\n{%- if custom_tools ... +llama_model_loader: - kv 30: general.quantization_version u32 = 2 +llama_model_loader: - kv 31: general.file_type u32 = 7 +llama_model_loader: - kv 32: quantize.imatrix.file str = Llama-3.3-70B-Instruct-GGUF/imatrix_u... +llama_model_loader: - kv 33: quantize.imatrix.dataset str = unsloth_calibration_Llama-3.3-70B-Ins... 
+llama_model_loader: - kv 34: quantize.imatrix.entries_count i32 = 560 +llama_model_loader: - kv 35: quantize.imatrix.chunks_count i32 = 689 +llama_model_loader: - kv 36: split.no u16 = 0 +llama_model_loader: - kv 37: split.tensors.count i32 = 724 +llama_model_loader: - kv 38: split.count u16 = 2 +llama_model_loader: - type f32: 162 tensors +llama_model_loader: - type q8_0: 455 tensors +llama_model_loader: - type bf16: 107 tensors +print_info: file format = GGUF V3 (latest) +print_info: file type = Q8_0 +print_info: file size = 75.65 GiB (9.21 BPW) +load: special tokens cache size = 256 +load: token to piece cache size = 0.7999 MB +print_info: arch = llama +print_info: vocab_only = 0 +print_info: n_ctx_train = 131072 +print_info: n_embd = 8192 +print_info: n_layer = 80 +print_info: n_head = 64 +print_info: n_head_kv = 8 +print_info: n_rot = 128 +print_info: n_swa = 0 +print_info: is_swa_any = 0 +print_info: n_embd_head_k = 128 +print_info: n_embd_head_v = 128 +print_info: n_gqa = 8 +print_info: n_embd_k_gqa = 1024 +print_info: n_embd_v_gqa = 1024 +print_info: f_norm_eps = 0.0e+00 +print_info: f_norm_rms_eps = 1.0e-05 +print_info: f_clamp_kqv = 0.0e+00 +print_info: f_max_alibi_bias = 0.0e+00 +print_info: f_logit_scale = 0.0e+00 +print_info: f_attn_scale = 0.0e+00 +print_info: n_ff = 28672 +print_info: n_expert = 0 +print_info: n_expert_used = 0 +print_info: causal attn = 1 +print_info: pooling type = 0 +print_info: rope type = 0 +print_info: rope scaling = linear +print_info: freq_base_train = 500000.0 +print_info: freq_scale_train = 1 +print_info: n_ctx_orig_yarn = 131072 +print_info: rope_finetuned = unknown +print_info: model type = 70B +print_info: model params = 70.55 B +print_info: general.name = Llama-3.3-70B-Instruct +print_info: vocab type = BPE +print_info: n_vocab = 128256 +print_info: n_merges = 280147 +print_info: BOS token = 128000 '<|begin_of_text|>' +print_info: EOS token = 128009 '<|eot_id|>' +print_info: EOT token = 128009 '<|eot_id|>' +print_info: 
EOM token = 128008 '<|eom_id|>' +print_info: PAD token = 128004 '<|finetune_right_pad_id|>' +print_info: LF token = 198 'Ċ' +print_info: EOG token = 128001 '<|end_of_text|>' +print_info: EOG token = 128008 '<|eom_id|>' +print_info: EOG token = 128009 '<|eot_id|>' +print_info: max token length = 256 +load_tensors: loading model tensors, this can take a while... (mmap = false) +load_tensors: offloading 80 repeating layers to GPU +load_tensors: offloading output layer to GPU +load_tensors: offloaded 81/81 layers to GPU +load_tensors: ROCm0 model buffer size = 75456.53 MiB +load_tensors: ROCm_Host model buffer size = 2004.00 MiB +................................................................................................. +llama_context: constructing llama_context +llama_context: non-unified KV cache requires ggml_set_rows() - forcing unified KV cache +llama_context: n_seq_max = 1 +llama_context: n_ctx = 4096 +llama_context: n_ctx_per_seq = 4096 +llama_context: n_batch = 2048 +llama_context: n_ubatch = 512 +llama_context: causal_attn = 1 +llama_context: flash_attn = 1 +llama_context: kv_unified = true +llama_context: freq_base = 500000.0 +llama_context: freq_scale = 1 +llama_context: n_ctx_per_seq (4096) < n_ctx_train (131072) -- the full capacity of the model will not be utilized +llama_context: ROCm_Host output buffer size = 0.49 MiB +llama_kv_cache_unified: ROCm0 KV buffer size = 1280.00 MiB +llama_kv_cache_unified: size = 1280.00 MiB ( 4096 cells, 80 layers, 1/ 1 seqs), K (f16): 640.00 MiB, V (f16): 640.00 MiB +llama_kv_cache_unified: LLAMA_SET_ROWS=0, using old ggml_cpy() method for backwards compatibility +llama_context: ROCm0 compute buffer size = 266.50 MiB +llama_context: ROCm_Host compute buffer size = 8.01 MiB +llama_context: graph nodes = 2647 +llama_context: graph splits = 1 +common_init_from_params: added <|end_of_text|> logit bias = -inf +common_init_from_params: added <|eom_id|> logit bias = -inf +common_init_from_params: added <|eot_id|> logit bias 
= -inf +common_init_from_params: setting dry_penalty_last_n to ctx_size = 4096 +common_init_from_params: warming up the model with an empty run - please wait ... (--no-warmup to disable) +main: llama threadpool init, n_threads = 16 + +system_info: n_threads = 16 (n_threads_batch = 16) / 32 | ROCm : NO_VMM = 1 | PEER_MAX_BATCH_SIZE = 128 | CPU : SSE3 = 1 | SSSE3 = 1 | AVX = 1 | AVX2 = 1 | F16C = 1 | FMA = 1 | BMI2 = 1 | AVX512 = 1 | AVX512_VBMI = 1 | AVX512_VNNI = 1 | LLAMAFILE = 1 | OPENMP = 1 | REPACK = 1 | + +sampler seed: 3478849877 +sampler params: + repeat_last_n = 64, repeat_penalty = 1.000, frequency_penalty = 0.000, presence_penalty = 0.000 + dry_multiplier = 0.000, dry_base = 1.750, dry_allowed_length = 2, dry_penalty_last_n = 4096 + top_k = 40, top_p = 0.950, min_p = 0.050, xtc_probability = 0.000, xtc_threshold = 0.100, typical_p = 1.000, top_n_sigma = -1.000, temp = 0.800 + mirostat = 0, mirostat_lr = 0.100, mirostat_ent = 5.000 +sampler chain: logits -> logit-bias -> penalties -> dry -> top-n-sigma -> top-k -> typical -> top-p -> min-p -> xtc -> temp-ext -> dist +generate: n_ctx = 4096, n_batch = 2048, n_predict = 1, n_keep = 1 + +Hello H + +llama_perf_sampler_print: sampling time = 0.06 ms / 3 runs ( 0.02 ms per token, 53571.43 tokens per second) +llama_perf_context_print: load time = 32005.62 ms +llama_perf_context_print: prompt eval time = 456.36 ms / 2 tokens ( 228.18 ms per token, 4.38 tokens per second) +llama_perf_context_print: eval time = 0.00 ms / 1 runs ( 0.00 ms per token, inf tokens per second) +llama_perf_context_print: total time = 471.29 ms / 3 tokens +llama_perf_context_print: graphs reused = 0 + Elapsed #3: 33.222127697s + Run #3 status: 0 + → Avg over 3 runs: 32.796s diff --git a/benchmark/loadtime_results/Llama-3.3-70B-Instruct-UD-Q8_K_XL-00001-of-00002__rocm7_rc.log b/benchmark/loadtime_results/Llama-3.3-70B-Instruct-UD-Q8_K_XL-00001-of-00002__rocm7_rc.log new file mode 100644 index 0000000..f6dd5ab --- /dev/null +++ 
b/benchmark/loadtime_results/Llama-3.3-70B-Instruct-UD-Q8_K_XL-00001-of-00002__rocm7_rc.log @@ -0,0 +1,163 @@ +ggml_cuda_init: GGML_CUDA_FORCE_MMQ: no +ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no +ggml_cuda_init: found 1 ROCm devices: + Device 0: AMD Radeon Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32 +build: 6066 (4cb208c9) with cc (GCC) 15.1.1 20250719 (Red Hat 15.1.1-5) for x86_64-redhat-linux +main: llama backend init +main: load the model and apply lora adapter, if any +llama_model_load_from_file_impl: using device ROCm0 (AMD Radeon Graphics) - 124523 MiB free +llama_model_loader: additional 1 GGUFs metadata loaded. +llama_model_loader: loaded meta data with 39 key-value pairs and 724 tensors from /home/kyuz0/models/llama-3.3-70B-Instruct/UD-Q8_K_XL/Llama-3.3-70B-Instruct-UD-Q8_K_XL-00001-of-00002.gguf (version GGUF V3 (latest)) +llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output. +llama_model_loader: - kv 0: general.architecture str = llama +llama_model_loader: - kv 1: general.type str = model +llama_model_loader: - kv 2: general.name str = Llama-3.3-70B-Instruct +llama_model_loader: - kv 3: general.finetune str = Instruct +llama_model_loader: - kv 4: general.basename str = Llama-3.3-70B-Instruct +llama_model_loader: - kv 5: general.quantized_by str = Unsloth +llama_model_loader: - kv 6: general.size_label str = 70B +llama_model_loader: - kv 7: general.repo_url str = https://huggingface.co/unsloth +llama_model_loader: - kv 8: llama.block_count u32 = 80 +llama_model_loader: - kv 9: llama.context_length u32 = 131072 +llama_model_loader: - kv 10: llama.embedding_length u32 = 8192 +llama_model_loader: - kv 11: llama.feed_forward_length u32 = 28672 +llama_model_loader: - kv 12: llama.attention.head_count u32 = 64 +llama_model_loader: - kv 13: llama.attention.head_count_kv u32 = 8 +llama_model_loader: - kv 14: llama.rope.freq_base f32 = 500000.000000 +llama_model_loader: - kv 15: 
llama.attention.layer_norm_rms_epsilon f32 = 0.000010 +llama_model_loader: - kv 16: llama.attention.key_length u32 = 128 +llama_model_loader: - kv 17: llama.attention.value_length u32 = 128 +llama_model_loader: - kv 18: llama.vocab_size u32 = 128256 +llama_model_loader: - kv 19: llama.rope.dimension_count u32 = 128 +llama_model_loader: - kv 20: tokenizer.ggml.model str = gpt2 +llama_model_loader: - kv 21: tokenizer.ggml.pre str = llama-bpe +llama_model_loader: - kv 22: tokenizer.ggml.tokens arr[str,128256] = ["!", "\"", "#", "$", "%", "&", "'", ... +llama_model_loader: - kv 23: tokenizer.ggml.token_type arr[i32,128256] = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ... +llama_model_loader: - kv 24: tokenizer.ggml.merges arr[str,280147] = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "... +llama_model_loader: - kv 25: tokenizer.ggml.bos_token_id u32 = 128000 +llama_model_loader: - kv 26: tokenizer.ggml.eos_token_id u32 = 128009 +llama_model_loader: - kv 27: tokenizer.ggml.padding_token_id u32 = 128004 +llama_model_loader: - kv 28: tokenizer.ggml.add_bos_token bool = true +llama_model_loader: - kv 29: tokenizer.chat_template str = {{- bos_token }}\n{%- if custom_tools ... +llama_model_loader: - kv 30: general.quantization_version u32 = 2 +llama_model_loader: - kv 31: general.file_type u32 = 7 +llama_model_loader: - kv 32: quantize.imatrix.file str = Llama-3.3-70B-Instruct-GGUF/imatrix_u... +llama_model_loader: - kv 33: quantize.imatrix.dataset str = unsloth_calibration_Llama-3.3-70B-Ins... 
+llama_model_loader: - kv 34: quantize.imatrix.entries_count i32 = 560 +llama_model_loader: - kv 35: quantize.imatrix.chunks_count i32 = 689 +llama_model_loader: - kv 36: split.no u16 = 0 +llama_model_loader: - kv 37: split.tensors.count i32 = 724 +llama_model_loader: - kv 38: split.count u16 = 2 +llama_model_loader: - type f32: 162 tensors +llama_model_loader: - type q8_0: 455 tensors +llama_model_loader: - type bf16: 107 tensors +print_info: file format = GGUF V3 (latest) +print_info: file type = Q8_0 +print_info: file size = 75.65 GiB (9.21 BPW) +load: special tokens cache size = 256 +load: token to piece cache size = 0.7999 MB +print_info: arch = llama +print_info: vocab_only = 0 +print_info: n_ctx_train = 131072 +print_info: n_embd = 8192 +print_info: n_layer = 80 +print_info: n_head = 64 +print_info: n_head_kv = 8 +print_info: n_rot = 128 +print_info: n_swa = 0 +print_info: is_swa_any = 0 +print_info: n_embd_head_k = 128 +print_info: n_embd_head_v = 128 +print_info: n_gqa = 8 +print_info: n_embd_k_gqa = 1024 +print_info: n_embd_v_gqa = 1024 +print_info: f_norm_eps = 0.0e+00 +print_info: f_norm_rms_eps = 1.0e-05 +print_info: f_clamp_kqv = 0.0e+00 +print_info: f_max_alibi_bias = 0.0e+00 +print_info: f_logit_scale = 0.0e+00 +print_info: f_attn_scale = 0.0e+00 +print_info: n_ff = 28672 +print_info: n_expert = 0 +print_info: n_expert_used = 0 +print_info: causal attn = 1 +print_info: pooling type = 0 +print_info: rope type = 0 +print_info: rope scaling = linear +print_info: freq_base_train = 500000.0 +print_info: freq_scale_train = 1 +print_info: n_ctx_orig_yarn = 131072 +print_info: rope_finetuned = unknown +print_info: model type = 70B +print_info: model params = 70.55 B +print_info: general.name = Llama-3.3-70B-Instruct +print_info: vocab type = BPE +print_info: n_vocab = 128256 +print_info: n_merges = 280147 +print_info: BOS token = 128000 '<|begin_of_text|>' +print_info: EOS token = 128009 '<|eot_id|>' +print_info: EOT token = 128009 '<|eot_id|>' +print_info: 
EOM token = 128008 '<|eom_id|>' +print_info: PAD token = 128004 '<|finetune_right_pad_id|>' +print_info: LF token = 198 'Ċ' +print_info: EOG token = 128001 '<|end_of_text|>' +print_info: EOG token = 128008 '<|eom_id|>' +print_info: EOG token = 128009 '<|eot_id|>' +print_info: max token length = 256 +load_tensors: loading model tensors, this can take a while... (mmap = false) +load_tensors: offloading 80 repeating layers to GPU +load_tensors: offloading output layer to GPU +load_tensors: offloaded 81/81 layers to GPU +load_tensors: ROCm0 model buffer size = 75456.53 MiB +load_tensors: ROCm_Host model buffer size = 2004.00 MiB +................................................................................................. +llama_context: constructing llama_context +llama_context: non-unified KV cache requires ggml_set_rows() - forcing unified KV cache +llama_context: n_seq_max = 1 +llama_context: n_ctx = 4096 +llama_context: n_ctx_per_seq = 4096 +llama_context: n_batch = 2048 +llama_context: n_ubatch = 512 +llama_context: causal_attn = 1 +llama_context: flash_attn = 1 +llama_context: kv_unified = true +llama_context: freq_base = 500000.0 +llama_context: freq_scale = 1 +llama_context: n_ctx_per_seq (4096) < n_ctx_train (131072) -- the full capacity of the model will not be utilized +llama_context: ROCm_Host output buffer size = 0.49 MiB +llama_kv_cache_unified: ROCm0 KV buffer size = 1280.00 MiB +llama_kv_cache_unified: size = 1280.00 MiB ( 4096 cells, 80 layers, 1/ 1 seqs), K (f16): 640.00 MiB, V (f16): 640.00 MiB +llama_kv_cache_unified: LLAMA_SET_ROWS=0, using old ggml_cpy() method for backwards compatibility +llama_context: ROCm0 compute buffer size = 266.50 MiB +llama_context: ROCm_Host compute buffer size = 8.01 MiB +llama_context: graph nodes = 2647 +llama_context: graph splits = 1 +common_init_from_params: added <|end_of_text|> logit bias = -inf +common_init_from_params: added <|eom_id|> logit bias = -inf +common_init_from_params: added <|eot_id|> logit bias 
= -inf +common_init_from_params: setting dry_penalty_last_n to ctx_size = 4096 +common_init_from_params: warming up the model with an empty run - please wait ... (--no-warmup to disable) +main: llama threadpool init, n_threads = 16 + +system_info: n_threads = 16 (n_threads_batch = 16) / 32 | ROCm : NO_VMM = 1 | PEER_MAX_BATCH_SIZE = 128 | CPU : SSE3 = 1 | SSSE3 = 1 | AVX = 1 | AVX2 = 1 | F16C = 1 | FMA = 1 | BMI2 = 1 | AVX512 = 1 | AVX512_VBMI = 1 | AVX512_VNNI = 1 | LLAMAFILE = 1 | OPENMP = 1 | REPACK = 1 | + +sampler seed: 4130863841 +sampler params: + repeat_last_n = 64, repeat_penalty = 1.000, frequency_penalty = 0.000, presence_penalty = 0.000 + dry_multiplier = 0.000, dry_base = 1.750, dry_allowed_length = 2, dry_penalty_last_n = 4096 + top_k = 40, top_p = 0.950, min_p = 0.050, xtc_probability = 0.000, xtc_threshold = 0.100, typical_p = 1.000, top_n_sigma = -1.000, temp = 0.800 + mirostat = 0, mirostat_lr = 0.100, mirostat_ent = 5.000 +sampler chain: logits -> logit-bias -> penalties -> dry -> top-n-sigma -> top-k -> typical -> top-p -> min-p -> xtc -> temp-ext -> dist +generate: n_ctx = 4096, n_batch = 2048, n_predict = 1, n_keep = 1 + +Hello: + +llama_perf_sampler_print: sampling time = 0.07 ms / 3 runs ( 0.02 ms per token, 44117.65 tokens per second) +llama_perf_context_print: load time = 32184.35 ms +llama_perf_context_print: prompt eval time = 697.57 ms / 2 tokens ( 348.79 ms per token, 2.87 tokens per second) +llama_perf_context_print: eval time = 0.00 ms / 1 runs ( 0.00 ms per token, inf tokens per second) +llama_perf_context_print: total time = 712.61 ms / 3 tokens +llama_perf_context_print: graphs reused = 0 + Elapsed #3: 33.659541277s + Run #3 status: 0 + → Avg over 3 runs: 32.911s diff --git a/benchmark/loadtime_results/Llama-3.3-70B-Instruct-UD-Q8_K_XL-00001-of-00002__vulkan_amdvlk.log b/benchmark/loadtime_results/Llama-3.3-70B-Instruct-UD-Q8_K_XL-00001-of-00002__vulkan_amdvlk.log new file mode 100644 index 0000000..3a9005c --- /dev/null +++ 
b/benchmark/loadtime_results/Llama-3.3-70B-Instruct-UD-Q8_K_XL-00001-of-00002__vulkan_amdvlk.log @@ -0,0 +1,161 @@ +ggml_vulkan: Found 1 Vulkan devices: +ggml_vulkan: 0 = Radeon 8060S Graphics (AMD open-source driver) | uma: 1 | fp16: 1 | bf16: 0 | warp size: 64 | shared memory: 32768 | int dot: 1 | matrix cores: KHR_coopmat +build: 6060 (9c35706b) with cc (GCC) 15.1.1 20250719 (Red Hat 15.1.1-5) for x86_64-redhat-linux +main: llama backend init +main: load the model and apply lora adapter, if any +llama_model_load_from_file_impl: using device Vulkan0 (Radeon 8060S Graphics) - 85720 MiB free +llama_model_loader: additional 1 GGUFs metadata loaded. +llama_model_loader: loaded meta data with 39 key-value pairs and 724 tensors from /home/kyuz0/models/llama-3.3-70B-Instruct/UD-Q8_K_XL/Llama-3.3-70B-Instruct-UD-Q8_K_XL-00001-of-00002.gguf (version GGUF V3 (latest)) +llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output. +llama_model_loader: - kv 0: general.architecture str = llama +llama_model_loader: - kv 1: general.type str = model +llama_model_loader: - kv 2: general.name str = Llama-3.3-70B-Instruct +llama_model_loader: - kv 3: general.finetune str = Instruct +llama_model_loader: - kv 4: general.basename str = Llama-3.3-70B-Instruct +llama_model_loader: - kv 5: general.quantized_by str = Unsloth +llama_model_loader: - kv 6: general.size_label str = 70B +llama_model_loader: - kv 7: general.repo_url str = https://huggingface.co/unsloth +llama_model_loader: - kv 8: llama.block_count u32 = 80 +llama_model_loader: - kv 9: llama.context_length u32 = 131072 +llama_model_loader: - kv 10: llama.embedding_length u32 = 8192 +llama_model_loader: - kv 11: llama.feed_forward_length u32 = 28672 +llama_model_loader: - kv 12: llama.attention.head_count u32 = 64 +llama_model_loader: - kv 13: llama.attention.head_count_kv u32 = 8 +llama_model_loader: - kv 14: llama.rope.freq_base f32 = 500000.000000 +llama_model_loader: - kv 15: 
llama.attention.layer_norm_rms_epsilon f32 = 0.000010 +llama_model_loader: - kv 16: llama.attention.key_length u32 = 128 +llama_model_loader: - kv 17: llama.attention.value_length u32 = 128 +llama_model_loader: - kv 18: llama.vocab_size u32 = 128256 +llama_model_loader: - kv 19: llama.rope.dimension_count u32 = 128 +llama_model_loader: - kv 20: tokenizer.ggml.model str = gpt2 +llama_model_loader: - kv 21: tokenizer.ggml.pre str = llama-bpe +llama_model_loader: - kv 22: tokenizer.ggml.tokens arr[str,128256] = ["!", "\"", "#", "$", "%", "&", "'", ... +llama_model_loader: - kv 23: tokenizer.ggml.token_type arr[i32,128256] = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ... +llama_model_loader: - kv 24: tokenizer.ggml.merges arr[str,280147] = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "... +llama_model_loader: - kv 25: tokenizer.ggml.bos_token_id u32 = 128000 +llama_model_loader: - kv 26: tokenizer.ggml.eos_token_id u32 = 128009 +llama_model_loader: - kv 27: tokenizer.ggml.padding_token_id u32 = 128004 +llama_model_loader: - kv 28: tokenizer.ggml.add_bos_token bool = true +llama_model_loader: - kv 29: tokenizer.chat_template str = {{- bos_token }}\n{%- if custom_tools ... +llama_model_loader: - kv 30: general.quantization_version u32 = 2 +llama_model_loader: - kv 31: general.file_type u32 = 7 +llama_model_loader: - kv 32: quantize.imatrix.file str = Llama-3.3-70B-Instruct-GGUF/imatrix_u... +llama_model_loader: - kv 33: quantize.imatrix.dataset str = unsloth_calibration_Llama-3.3-70B-Ins... 
+llama_model_loader: - kv 34: quantize.imatrix.entries_count i32 = 560 +llama_model_loader: - kv 35: quantize.imatrix.chunks_count i32 = 689 +llama_model_loader: - kv 36: split.no u16 = 0 +llama_model_loader: - kv 37: split.tensors.count i32 = 724 +llama_model_loader: - kv 38: split.count u16 = 2 +llama_model_loader: - type f32: 162 tensors +llama_model_loader: - type q8_0: 455 tensors +llama_model_loader: - type bf16: 107 tensors +print_info: file format = GGUF V3 (latest) +print_info: file type = Q8_0 +print_info: file size = 75.65 GiB (9.21 BPW) +load: special tokens cache size = 256 +load: token to piece cache size = 0.7999 MB +print_info: arch = llama +print_info: vocab_only = 0 +print_info: n_ctx_train = 131072 +print_info: n_embd = 8192 +print_info: n_layer = 80 +print_info: n_head = 64 +print_info: n_head_kv = 8 +print_info: n_rot = 128 +print_info: n_swa = 0 +print_info: is_swa_any = 0 +print_info: n_embd_head_k = 128 +print_info: n_embd_head_v = 128 +print_info: n_gqa = 8 +print_info: n_embd_k_gqa = 1024 +print_info: n_embd_v_gqa = 1024 +print_info: f_norm_eps = 0.0e+00 +print_info: f_norm_rms_eps = 1.0e-05 +print_info: f_clamp_kqv = 0.0e+00 +print_info: f_max_alibi_bias = 0.0e+00 +print_info: f_logit_scale = 0.0e+00 +print_info: f_attn_scale = 0.0e+00 +print_info: n_ff = 28672 +print_info: n_expert = 0 +print_info: n_expert_used = 0 +print_info: causal attn = 1 +print_info: pooling type = 0 +print_info: rope type = 0 +print_info: rope scaling = linear +print_info: freq_base_train = 500000.0 +print_info: freq_scale_train = 1 +print_info: n_ctx_orig_yarn = 131072 +print_info: rope_finetuned = unknown +print_info: model type = 70B +print_info: model params = 70.55 B +print_info: general.name = Llama-3.3-70B-Instruct +print_info: vocab type = BPE +print_info: n_vocab = 128256 +print_info: n_merges = 280147 +print_info: BOS token = 128000 '<|begin_of_text|>' +print_info: EOS token = 128009 '<|eot_id|>' +print_info: EOT token = 128009 '<|eot_id|>' +print_info: 
EOM token = 128008 '<|eom_id|>' +print_info: PAD token = 128004 '<|finetune_right_pad_id|>' +print_info: LF token = 198 'Ċ' +print_info: EOG token = 128001 '<|end_of_text|>' +print_info: EOG token = 128008 '<|eom_id|>' +print_info: EOG token = 128009 '<|eot_id|>' +print_info: max token length = 256 +load_tensors: loading model tensors, this can take a while... (mmap = false) +load_tensors: offloading 80 repeating layers to GPU +load_tensors: offloading output layer to GPU +load_tensors: offloaded 81/81 layers to GPU +load_tensors: Vulkan0 model buffer size = 75456.53 MiB +load_tensors: Vulkan_Host model buffer size = 2004.00 MiB +................................................................................................. +llama_context: constructing llama_context +llama_context: non-unified KV cache requires ggml_set_rows() - forcing unified KV cache +llama_context: n_seq_max = 1 +llama_context: n_ctx = 4096 +llama_context: n_ctx_per_seq = 4096 +llama_context: n_batch = 2048 +llama_context: n_ubatch = 512 +llama_context: causal_attn = 1 +llama_context: flash_attn = 1 +llama_context: kv_unified = true +llama_context: freq_base = 500000.0 +llama_context: freq_scale = 1 +llama_context: n_ctx_per_seq (4096) < n_ctx_train (131072) -- the full capacity of the model will not be utilized +llama_context: Vulkan_Host output buffer size = 0.49 MiB +llama_kv_cache_unified: Vulkan0 KV buffer size = 1280.00 MiB +llama_kv_cache_unified: size = 1280.00 MiB ( 4096 cells, 80 layers, 1/ 1 seqs), K (f16): 640.00 MiB, V (f16): 640.00 MiB +llama_kv_cache_unified: LLAMA_SET_ROWS=0, using old ggml_cpy() method for backwards compatibility +llama_context: Vulkan0 compute buffer size = 266.50 MiB +llama_context: Vulkan_Host compute buffer size = 24.01 MiB +llama_context: graph nodes = 2647 +llama_context: graph splits = 2 +common_init_from_params: added <|end_of_text|> logit bias = -inf +common_init_from_params: added <|eom_id|> logit bias = -inf +common_init_from_params: added 
<|eot_id|> logit bias = -inf +common_init_from_params: setting dry_penalty_last_n to ctx_size = 4096 +common_init_from_params: warming up the model with an empty run - please wait ... (--no-warmup to disable) +main: llama threadpool init, n_threads = 16 + +system_info: n_threads = 16 (n_threads_batch = 16) / 32 | CPU : SSE3 = 1 | SSSE3 = 1 | AVX = 1 | AVX2 = 1 | F16C = 1 | FMA = 1 | BMI2 = 1 | AVX512 = 1 | AVX512_VBMI = 1 | AVX512_VNNI = 1 | LLAMAFILE = 1 | OPENMP = 1 | REPACK = 1 | + +sampler seed: 327404797 +sampler params: + repeat_last_n = 64, repeat_penalty = 1.000, frequency_penalty = 0.000, presence_penalty = 0.000 + dry_multiplier = 0.000, dry_base = 1.750, dry_allowed_length = 2, dry_penalty_last_n = 4096 + top_k = 40, top_p = 0.950, min_p = 0.050, xtc_probability = 0.000, xtc_threshold = 0.100, typical_p = 1.000, top_n_sigma = -1.000, temp = 0.800 + mirostat = 0, mirostat_lr = 0.100, mirostat_ent = 5.000 +sampler chain: logits -> logit-bias -> penalties -> dry -> top-n-sigma -> top-k -> typical -> top-p -> min-p -> xtc -> temp-ext -> dist +generate: n_ctx = 4096, n_batch = 2048, n_predict = 1, n_keep = 1 + +Hello, + +llama_perf_sampler_print: sampling time = 0.06 ms / 3 runs ( 0.02 ms per token, 50847.46 tokens per second) +llama_perf_context_print: load time = 26953.87 ms +llama_perf_context_print: prompt eval time = 387.45 ms / 2 tokens ( 193.72 ms per token, 5.16 tokens per second) +llama_perf_context_print: eval time = 0.00 ms / 1 runs ( 0.00 ms per token, inf tokens per second) +llama_perf_context_print: total time = 404.05 ms / 3 tokens +llama_perf_context_print: graphs reused = 0 + Elapsed #3: 28.173844492s + Run #3 status: 0 + → Avg over 3 runs: 30.604s diff --git a/benchmark/loadtime_results/Llama-3.3-70B-Instruct-UD-Q8_K_XL-00001-of-00002__vulkan_radv.log b/benchmark/loadtime_results/Llama-3.3-70B-Instruct-UD-Q8_K_XL-00001-of-00002__vulkan_radv.log new file mode 100644 index 0000000..c33e52c --- /dev/null +++ 
b/benchmark/loadtime_results/Llama-3.3-70B-Instruct-UD-Q8_K_XL-00001-of-00002__vulkan_radv.log @@ -0,0 +1,161 @@ +ggml_vulkan: Found 1 Vulkan devices: +ggml_vulkan: 0 = Radeon 8060S Graphics (RADV GFX1151) (radv) | uma: 1 | fp16: 1 | bf16: 0 | warp size: 64 | shared memory: 65536 | int dot: 1 | matrix cores: KHR_coopmat +build: 6040 (66625a59) with cc (GCC) 15.1.1 20250719 (Red Hat 15.1.1-5) for x86_64-redhat-linux +main: llama backend init +main: load the model and apply lora adapter, if any +llama_model_load_from_file_impl: using device Vulkan0 (Radeon 8060S Graphics (RADV GFX1151)) - 87722 MiB free +llama_model_loader: additional 1 GGUFs metadata loaded. +llama_model_loader: loaded meta data with 39 key-value pairs and 724 tensors from /home/kyuz0/models/llama-3.3-70B-Instruct/UD-Q8_K_XL/Llama-3.3-70B-Instruct-UD-Q8_K_XL-00001-of-00002.gguf (version GGUF V3 (latest)) +llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output. +llama_model_loader: - kv 0: general.architecture str = llama +llama_model_loader: - kv 1: general.type str = model +llama_model_loader: - kv 2: general.name str = Llama-3.3-70B-Instruct +llama_model_loader: - kv 3: general.finetune str = Instruct +llama_model_loader: - kv 4: general.basename str = Llama-3.3-70B-Instruct +llama_model_loader: - kv 5: general.quantized_by str = Unsloth +llama_model_loader: - kv 6: general.size_label str = 70B +llama_model_loader: - kv 7: general.repo_url str = https://huggingface.co/unsloth +llama_model_loader: - kv 8: llama.block_count u32 = 80 +llama_model_loader: - kv 9: llama.context_length u32 = 131072 +llama_model_loader: - kv 10: llama.embedding_length u32 = 8192 +llama_model_loader: - kv 11: llama.feed_forward_length u32 = 28672 +llama_model_loader: - kv 12: llama.attention.head_count u32 = 64 +llama_model_loader: - kv 13: llama.attention.head_count_kv u32 = 8 +llama_model_loader: - kv 14: llama.rope.freq_base f32 = 500000.000000 +llama_model_loader: - kv 15: 
llama.attention.layer_norm_rms_epsilon f32 = 0.000010 +llama_model_loader: - kv 16: llama.attention.key_length u32 = 128 +llama_model_loader: - kv 17: llama.attention.value_length u32 = 128 +llama_model_loader: - kv 18: llama.vocab_size u32 = 128256 +llama_model_loader: - kv 19: llama.rope.dimension_count u32 = 128 +llama_model_loader: - kv 20: tokenizer.ggml.model str = gpt2 +llama_model_loader: - kv 21: tokenizer.ggml.pre str = llama-bpe +llama_model_loader: - kv 22: tokenizer.ggml.tokens arr[str,128256] = ["!", "\"", "#", "$", "%", "&", "'", ... +llama_model_loader: - kv 23: tokenizer.ggml.token_type arr[i32,128256] = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ... +llama_model_loader: - kv 24: tokenizer.ggml.merges arr[str,280147] = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "... +llama_model_loader: - kv 25: tokenizer.ggml.bos_token_id u32 = 128000 +llama_model_loader: - kv 26: tokenizer.ggml.eos_token_id u32 = 128009 +llama_model_loader: - kv 27: tokenizer.ggml.padding_token_id u32 = 128004 +llama_model_loader: - kv 28: tokenizer.ggml.add_bos_token bool = true +llama_model_loader: - kv 29: tokenizer.chat_template str = {{- bos_token }}\n{%- if custom_tools ... +llama_model_loader: - kv 30: general.quantization_version u32 = 2 +llama_model_loader: - kv 31: general.file_type u32 = 7 +llama_model_loader: - kv 32: quantize.imatrix.file str = Llama-3.3-70B-Instruct-GGUF/imatrix_u... +llama_model_loader: - kv 33: quantize.imatrix.dataset str = unsloth_calibration_Llama-3.3-70B-Ins... 
+llama_model_loader: - kv 34: quantize.imatrix.entries_count i32 = 560 +llama_model_loader: - kv 35: quantize.imatrix.chunks_count i32 = 689 +llama_model_loader: - kv 36: split.no u16 = 0 +llama_model_loader: - kv 37: split.tensors.count i32 = 724 +llama_model_loader: - kv 38: split.count u16 = 2 +llama_model_loader: - type f32: 162 tensors +llama_model_loader: - type q8_0: 455 tensors +llama_model_loader: - type bf16: 107 tensors +print_info: file format = GGUF V3 (latest) +print_info: file type = Q8_0 +print_info: file size = 75.65 GiB (9.21 BPW) +load: special tokens cache size = 256 +load: token to piece cache size = 0.7999 MB +print_info: arch = llama +print_info: vocab_only = 0 +print_info: n_ctx_train = 131072 +print_info: n_embd = 8192 +print_info: n_layer = 80 +print_info: n_head = 64 +print_info: n_head_kv = 8 +print_info: n_rot = 128 +print_info: n_swa = 0 +print_info: is_swa_any = 0 +print_info: n_embd_head_k = 128 +print_info: n_embd_head_v = 128 +print_info: n_gqa = 8 +print_info: n_embd_k_gqa = 1024 +print_info: n_embd_v_gqa = 1024 +print_info: f_norm_eps = 0.0e+00 +print_info: f_norm_rms_eps = 1.0e-05 +print_info: f_clamp_kqv = 0.0e+00 +print_info: f_max_alibi_bias = 0.0e+00 +print_info: f_logit_scale = 0.0e+00 +print_info: f_attn_scale = 0.0e+00 +print_info: n_ff = 28672 +print_info: n_expert = 0 +print_info: n_expert_used = 0 +print_info: causal attn = 1 +print_info: pooling type = 0 +print_info: rope type = 0 +print_info: rope scaling = linear +print_info: freq_base_train = 500000.0 +print_info: freq_scale_train = 1 +print_info: n_ctx_orig_yarn = 131072 +print_info: rope_finetuned = unknown +print_info: model type = 70B +print_info: model params = 70.55 B +print_info: general.name = Llama-3.3-70B-Instruct +print_info: vocab type = BPE +print_info: n_vocab = 128256 +print_info: n_merges = 280147 +print_info: BOS token = 128000 '<|begin_of_text|>' +print_info: EOS token = 128009 '<|eot_id|>' +print_info: EOT token = 128009 '<|eot_id|>' +print_info: 
EOM token = 128008 '<|eom_id|>' +print_info: PAD token = 128004 '<|finetune_right_pad_id|>' +print_info: LF token = 198 'Ċ' +print_info: EOG token = 128001 '<|end_of_text|>' +print_info: EOG token = 128008 '<|eom_id|>' +print_info: EOG token = 128009 '<|eot_id|>' +print_info: max token length = 256 +load_tensors: loading model tensors, this can take a while... (mmap = false) +load_tensors: offloading 80 repeating layers to GPU +load_tensors: offloading output layer to GPU +load_tensors: offloaded 81/81 layers to GPU +load_tensors: Vulkan0 model buffer size = 75456.53 MiB +load_tensors: Vulkan_Host model buffer size = 2004.00 MiB +................................................................................................. +llama_context: constructing llama_context +llama_context: non-unified KV cache requires ggml_set_rows() - forcing unified KV cache +llama_context: n_seq_max = 1 +llama_context: n_ctx = 4096 +llama_context: n_ctx_per_seq = 4096 +llama_context: n_batch = 2048 +llama_context: n_ubatch = 512 +llama_context: causal_attn = 1 +llama_context: flash_attn = 1 +llama_context: kv_unified = true +llama_context: freq_base = 500000.0 +llama_context: freq_scale = 1 +llama_context: n_ctx_per_seq (4096) < n_ctx_train (131072) -- the full capacity of the model will not be utilized +llama_context: Vulkan_Host output buffer size = 0.49 MiB +llama_kv_cache_unified: Vulkan0 KV buffer size = 1280.00 MiB +llama_kv_cache_unified: size = 1280.00 MiB ( 4096 cells, 80 layers, 1/ 1 seqs), K (f16): 640.00 MiB, V (f16): 640.00 MiB +llama_kv_cache_unified: LLAMA_SET_ROWS=0, using old ggml_cpy() method for backwards compatibility +llama_context: Vulkan0 compute buffer size = 266.50 MiB +llama_context: Vulkan_Host compute buffer size = 24.01 MiB +llama_context: graph nodes = 2647 +llama_context: graph splits = 2 +common_init_from_params: added <|end_of_text|> logit bias = -inf +common_init_from_params: added <|eom_id|> logit bias = -inf +common_init_from_params: added 
<|eot_id|> logit bias = -inf +common_init_from_params: setting dry_penalty_last_n to ctx_size = 4096 +common_init_from_params: warming up the model with an empty run - please wait ... (--no-warmup to disable) +main: llama threadpool init, n_threads = 16 + +system_info: n_threads = 16 (n_threads_batch = 16) / 32 | CPU : SSE3 = 1 | SSSE3 = 1 | AVX = 1 | AVX2 = 1 | F16C = 1 | FMA = 1 | BMI2 = 1 | AVX512 = 1 | AVX512_VBMI = 1 | AVX512_VNNI = 1 | LLAMAFILE = 1 | OPENMP = 1 | REPACK = 1 | + +sampler seed: 2154218339 +sampler params: + repeat_last_n = 64, repeat_penalty = 1.000, frequency_penalty = 0.000, presence_penalty = 0.000 + dry_multiplier = 0.000, dry_base = 1.750, dry_allowed_length = 2, dry_penalty_last_n = 4096 + top_k = 40, top_p = 0.950, min_p = 0.050, xtc_probability = 0.000, xtc_threshold = 0.100, typical_p = 1.000, top_n_sigma = -1.000, temp = 0.800 + mirostat = 0, mirostat_lr = 0.100, mirostat_ent = 5.000 +sampler chain: logits -> logit-bias -> penalties -> dry -> top-n-sigma -> top-k -> typical -> top-p -> min-p -> xtc -> temp-ext -> dist +generate: n_ctx = 4096, n_batch = 2048, n_predict = 1, n_keep = 1 + +Hello’s + +llama_perf_sampler_print: sampling time = 0.06 ms / 3 runs ( 0.02 ms per token, 51724.14 tokens per second) +llama_perf_context_print: load time = 29443.29 ms +llama_perf_context_print: prompt eval time = 376.13 ms / 2 tokens ( 188.07 ms per token, 5.32 tokens per second) +llama_perf_context_print: eval time = 0.00 ms / 1 runs ( 0.00 ms per token, inf tokens per second) +llama_perf_context_print: total time = 392.17 ms / 3 tokens +llama_perf_context_print: graphs reused = 0 + Elapsed #3: 30.227365941s + Run #3 status: 0 + → Avg over 3 runs: 30.376s diff --git a/benchmark/loadtime_results/Llama-4-Scout-17B-16E-Instruct-Q6_K-00001-of-00002__rocm6_4_2.log b/benchmark/loadtime_results/Llama-4-Scout-17B-16E-Instruct-Q6_K-00001-of-00002__rocm6_4_2.log new file mode 100644 index 0000000..87d6d92 --- /dev/null +++ 
b/benchmark/loadtime_results/Llama-4-Scout-17B-16E-Instruct-Q6_K-00001-of-00002__rocm6_4_2.log @@ -0,0 +1,179 @@ +ggml_cuda_init: GGML_CUDA_FORCE_MMQ: no +ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no +ggml_cuda_init: found 1 ROCm devices: + Device 0: Radeon 8060S Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32 +build: 6040 (66625a59) with cc (GCC) 15.1.1 20250521 (Red Hat 15.1.1-2) for x86_64-redhat-linux +main: llama backend init +main: load the model and apply lora adapter, if any +llama_model_load_from_file_impl: using device ROCm0 (Radeon 8060S Graphics) - 124522 MiB free +llama_model_loader: additional 1 GGUFs metadata loaded. +llama_model_loader: loaded meta data with 51 key-value pairs and 628 tensors from /home/kyuz0/models/llama-4-scout-17b-16e/Q6_K/Llama-4-Scout-17B-16E-Instruct-Q6_K-00001-of-00002.gguf (version GGUF V3 (latest)) +llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output. +llama_model_loader: - kv 0: general.architecture str = llama4 +llama_model_loader: - kv 1: general.type str = model +llama_model_loader: - kv 2: general.name str = Llama-4-Scout-17B-16E-Instruct +llama_model_loader: - kv 3: general.finetune str = 16E-Instruct +llama_model_loader: - kv 4: general.basename str = Llama-4-Scout-17B-16E-Instruct +llama_model_loader: - kv 5: general.quantized_by str = Unsloth +llama_model_loader: - kv 6: general.size_label str = 17B +llama_model_loader: - kv 7: general.license str = other +llama_model_loader: - kv 8: general.license.name str = llama4 +llama_model_loader: - kv 9: general.repo_url str = https://huggingface.co/unsloth +llama_model_loader: - kv 10: general.base_model.count u32 = 1 +llama_model_loader: - kv 11: general.base_model.0.name str = Llama 4 Scout 17B 16E Instruct +llama_model_loader: - kv 12: general.base_model.0.organization str = Meta Llama +llama_model_loader: - kv 13: general.base_model.0.repo_url str = https://huggingface.co/meta-llama/Lla... 
+llama_model_loader: - kv 14: general.tags arr[str,5] = ["facebook", "meta", "pytorch", "llam... +llama_model_loader: - kv 15: general.languages arr[str,12] = ["ar", "de", "en", "es", "fr", "hi", ... +llama_model_loader: - kv 16: llama4.block_count u32 = 48 +llama_model_loader: - kv 17: llama4.context_length u32 = 10485760 +llama_model_loader: - kv 18: llama4.embedding_length u32 = 5120 +llama_model_loader: - kv 19: llama4.feed_forward_length u32 = 16384 +llama_model_loader: - kv 20: llama4.attention.head_count u32 = 40 +llama_model_loader: - kv 21: llama4.attention.head_count_kv u32 = 8 +llama_model_loader: - kv 22: llama4.rope.freq_base f32 = 500000.000000 +llama_model_loader: - kv 23: llama4.attention.layer_norm_rms_epsilon f32 = 0.000010 +llama_model_loader: - kv 24: llama4.expert_count u32 = 16 +llama_model_loader: - kv 25: llama4.expert_used_count u32 = 1 +llama_model_loader: - kv 26: llama4.attention.key_length u32 = 128 +llama_model_loader: - kv 27: llama4.attention.value_length u32 = 128 +llama_model_loader: - kv 28: llama4.vocab_size u32 = 202048 +llama_model_loader: - kv 29: llama4.rope.dimension_count u32 = 128 +llama_model_loader: - kv 30: llama4.interleave_moe_layer_step u32 = 1 +llama_model_loader: - kv 31: llama4.expert_feed_forward_length u32 = 8192 +llama_model_loader: - kv 32: tokenizer.ggml.model str = gpt2 +llama_model_loader: - kv 33: tokenizer.ggml.pre str = llama4 +llama_model_loader: - kv 34: tokenizer.ggml.tokens arr[str,202048] = ["À", "Á", "õ", "ö", "÷", "ø", ... +llama_model_loader: - kv 35: tokenizer.ggml.token_type arr[i32,202048] = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ... +llama_model_loader: - kv 36: tokenizer.ggml.merges arr[str,439802] = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "... 
+llama_model_loader: - kv 37: tokenizer.ggml.bos_token_id u32 = 200000 +llama_model_loader: - kv 38: tokenizer.ggml.eos_token_id u32 = 200008 +llama_model_loader: - kv 39: tokenizer.ggml.padding_token_id u32 = 200018 +llama_model_loader: - kv 40: tokenizer.ggml.add_bos_token bool = true +llama_model_loader: - kv 41: tokenizer.chat_template str = {{- bos_token }}\n{%- if custom_tools ... +llama_model_loader: - kv 42: general.quantization_version u32 = 2 +llama_model_loader: - kv 43: general.file_type u32 = 18 +llama_model_loader: - kv 44: quantize.imatrix.file str = Llama-4-Scout-17B-16E-Instruct-GGUF/i... +llama_model_loader: - kv 45: quantize.imatrix.dataset str = unsloth_calibration_Llama-4-Scout-17B... +llama_model_loader: - kv 46: quantize.imatrix.entries_count u32 = 528 +llama_model_loader: - kv 47: quantize.imatrix.chunks_count u32 = 729 +llama_model_loader: - kv 48: split.no u16 = 0 +llama_model_loader: - kv 49: split.tensors.count i32 = 628 +llama_model_loader: - kv 50: split.count u16 = 2 +llama_model_loader: - type f32: 146 tensors +llama_model_loader: - type q6_K: 482 tensors +print_info: file format = GGUF V3 (latest) +print_info: file type = Q6_K +print_info: file size = 82.35 GiB (6.56 BPW) +load: special_eos_id is not in special_eog_ids - the tokenizer config may be incorrect +load: special tokens cache size = 1135 +load: token to piece cache size = 1.3873 MB +print_info: arch = llama4 +print_info: vocab_only = 0 +print_info: n_ctx_train = 10485760 +print_info: n_embd = 5120 +print_info: n_layer = 48 +print_info: n_head = 40 +print_info: n_head_kv = 8 +print_info: n_rot = 128 +print_info: n_swa = 8192 +print_info: is_swa_any = 1 +print_info: n_embd_head_k = 128 +print_info: n_embd_head_v = 128 +print_info: n_gqa = 5 +print_info: n_embd_k_gqa = 1024 +print_info: n_embd_v_gqa = 1024 +print_info: f_norm_eps = 0.0e+00 +print_info: f_norm_rms_eps = 1.0e-05 +print_info: f_clamp_kqv = 0.0e+00 +print_info: f_max_alibi_bias = 0.0e+00 +print_info: 
f_logit_scale = 0.0e+00 +print_info: f_attn_scale = 0.0e+00 +print_info: n_ff = 16384 +print_info: n_expert = 16 +print_info: n_expert_used = 1 +print_info: causal attn = 1 +print_info: pooling type = 0 +print_info: rope type = 0 +print_info: rope scaling = linear +print_info: freq_base_train = 500000.0 +print_info: freq_scale_train = 1 +print_info: n_ctx_orig_yarn = 10485760 +print_info: rope_finetuned = unknown +print_info: model type = 17Bx16E (Scout) +print_info: model params = 107.77 B +print_info: general.name = Llama-4-Scout-17B-16E-Instruct +print_info: vocab type = BPE +print_info: n_vocab = 202048 +print_info: n_merges = 439802 +print_info: BOS token = 200000 '<|begin_of_text|>' +print_info: EOS token = 200008 '<|eot|>' +print_info: PAD token = 200018 '<|finetune_right_pad|>' +print_info: LF token = 198 'Ċ' +print_info: FIM PRE token = 200002 '<|fim_prefix|>' +print_info: FIM SUF token = 200004 '<|fim_suffix|>' +print_info: FIM MID token = 200003 '<|fim_middle|>' +print_info: EOG token = 200001 '<|end_of_text|>' +print_info: EOG token = 200008 '<|eot|>' +print_info: max token length = 192 +load_tensors: loading model tensors, this can take a while... (mmap = false) +load_tensors: offloading 48 repeating layers to GPU +load_tensors: offloading output layer to GPU +load_tensors: offloaded 49/49 layers to GPU +load_tensors: CPU model buffer size = 809.29 MiB +load_tensors: ROCm0 model buffer size = 83513.68 MiB +.................................................................................................... 
+llama_context: constructing llama_context +llama_context: non-unified KV cache requires ggml_set_rows() - forcing unified KV cache +llama_context: n_seq_max = 1 +llama_context: n_ctx = 4096 +llama_context: n_ctx_per_seq = 4096 +llama_context: n_batch = 2048 +llama_context: n_ubatch = 512 +llama_context: causal_attn = 1 +llama_context: flash_attn = 1 +llama_context: kv_unified = true +llama_context: freq_base = 500000.0 +llama_context: freq_scale = 1 +llama_context: n_ctx_per_seq (4096) < n_ctx_train (10485760) -- the full capacity of the model will not be utilized +llama_context: ROCm_Host output buffer size = 0.77 MiB +llama_kv_cache_unified_iswa: creating non-SWA KV cache, size = 4096 cells +llama_kv_cache_unified: ROCm0 KV buffer size = 192.00 MiB +llama_kv_cache_unified: size = 192.00 MiB ( 4096 cells, 12 layers, 1/ 1 seqs), K (f16): 96.00 MiB, V (f16): 96.00 MiB +llama_kv_cache_unified: LLAMA_SET_ROWS=0, using old ggml_cpy() method for backwards compatibility +llama_kv_cache_unified_iswa: creating SWA KV cache, size = 4096 cells +llama_kv_cache_unified: ROCm0 KV buffer size = 576.00 MiB +llama_kv_cache_unified: size = 576.00 MiB ( 4096 cells, 36 layers, 1/ 1 seqs), K (f16): 288.00 MiB, V (f16): 288.00 MiB +llama_kv_cache_unified: LLAMA_SET_ROWS=0, using old ggml_cpy() method for backwards compatibility +llama_context: ROCm0 compute buffer size = 442.62 MiB +llama_context: ROCm_Host compute buffer size = 26.01 MiB +llama_context: graph nodes = 2420 +llama_context: graph splits = 2 +common_init_from_params: added <|end_of_text|> logit bias = -inf +common_init_from_params: added <|eot|> logit bias = -inf +common_init_from_params: setting dry_penalty_last_n to ctx_size = 4096 +common_init_from_params: warming up the model with an empty run - please wait ... 
(--no-warmup to disable) +main: llama threadpool init, n_threads = 16 + +system_info: n_threads = 16 (n_threads_batch = 16) / 32 | ROCm : NO_VMM = 1 | PEER_MAX_BATCH_SIZE = 128 | CPU : SSE3 = 1 | SSSE3 = 1 | AVX = 1 | AVX2 = 1 | F16C = 1 | FMA = 1 | BMI2 = 1 | AVX512 = 1 | AVX512_VBMI = 1 | AVX512_VNNI = 1 | LLAMAFILE = 1 | OPENMP = 1 | REPACK = 1 | + +sampler seed: 1642319140 +sampler params: + repeat_last_n = 64, repeat_penalty = 1.000, frequency_penalty = 0.000, presence_penalty = 0.000 + dry_multiplier = 0.000, dry_base = 1.750, dry_allowed_length = 2, dry_penalty_last_n = 4096 + top_k = 40, top_p = 0.950, min_p = 0.050, xtc_probability = 0.000, xtc_threshold = 0.100, typical_p = 1.000, top_n_sigma = -1.000, temp = 0.800 + mirostat = 0, mirostat_lr = 0.100, mirostat_ent = 5.000 +sampler chain: logits -> logit-bias -> penalties -> dry -> top-n-sigma -> top-k -> typical -> top-p -> min-p -> xtc -> temp-ext -> dist +generate: n_ctx = 4096, n_batch = 2048, n_predict = 1, n_keep = 1 + +Hello + +llama_perf_sampler_print: sampling time = 0.07 ms / 3 runs ( 0.02 ms per token, 42857.14 tokens per second) +llama_perf_context_print: load time = 26639.60 ms +llama_perf_context_print: prompt eval time = 107.52 ms / 2 tokens ( 53.76 ms per token, 18.60 tokens per second) +llama_perf_context_print: eval time = 0.00 ms / 1 runs ( 0.00 ms per token, inf tokens per second) +llama_perf_context_print: total time = 127.12 ms / 3 tokens +llama_perf_context_print: graphs reused = 0 + Elapsed #3: 30.905590182s + Run #3 status: 0 + → Avg over 3 runs: 31.792s diff --git a/benchmark/loadtime_results/Llama-4-Scout-17B-16E-Instruct-Q6_K-00001-of-00002__rocm7_beta.log b/benchmark/loadtime_results/Llama-4-Scout-17B-16E-Instruct-Q6_K-00001-of-00002__rocm7_beta.log new file mode 100644 index 0000000..b3a421c --- /dev/null +++ b/benchmark/loadtime_results/Llama-4-Scout-17B-16E-Instruct-Q6_K-00001-of-00002__rocm7_beta.log @@ -0,0 +1,179 @@ +ggml_cuda_init: GGML_CUDA_FORCE_MMQ: no 
+ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no +ggml_cuda_init: found 1 ROCm devices: + Device 0: AMD Radeon Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32 +build: 6040 (66625a59) with cc (GCC) 15.1.1 20250719 (Red Hat 15.1.1-5) for x86_64-redhat-linux +main: llama backend init +main: load the model and apply lora adapter, if any +llama_model_load_from_file_impl: using device ROCm0 (AMD Radeon Graphics) - 124523 MiB free +llama_model_loader: additional 1 GGUFs metadata loaded. +llama_model_loader: loaded meta data with 51 key-value pairs and 628 tensors from /home/kyuz0/models/llama-4-scout-17b-16e/Q6_K/Llama-4-Scout-17B-16E-Instruct-Q6_K-00001-of-00002.gguf (version GGUF V3 (latest)) +llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output. +llama_model_loader: - kv 0: general.architecture str = llama4 +llama_model_loader: - kv 1: general.type str = model +llama_model_loader: - kv 2: general.name str = Llama-4-Scout-17B-16E-Instruct +llama_model_loader: - kv 3: general.finetune str = 16E-Instruct +llama_model_loader: - kv 4: general.basename str = Llama-4-Scout-17B-16E-Instruct +llama_model_loader: - kv 5: general.quantized_by str = Unsloth +llama_model_loader: - kv 6: general.size_label str = 17B +llama_model_loader: - kv 7: general.license str = other +llama_model_loader: - kv 8: general.license.name str = llama4 +llama_model_loader: - kv 9: general.repo_url str = https://huggingface.co/unsloth +llama_model_loader: - kv 10: general.base_model.count u32 = 1 +llama_model_loader: - kv 11: general.base_model.0.name str = Llama 4 Scout 17B 16E Instruct +llama_model_loader: - kv 12: general.base_model.0.organization str = Meta Llama +llama_model_loader: - kv 13: general.base_model.0.repo_url str = https://huggingface.co/meta-llama/Lla... +llama_model_loader: - kv 14: general.tags arr[str,5] = ["facebook", "meta", "pytorch", "llam... 
+llama_model_loader: - kv 15: general.languages arr[str,12] = ["ar", "de", "en", "es", "fr", "hi", ... +llama_model_loader: - kv 16: llama4.block_count u32 = 48 +llama_model_loader: - kv 17: llama4.context_length u32 = 10485760 +llama_model_loader: - kv 18: llama4.embedding_length u32 = 5120 +llama_model_loader: - kv 19: llama4.feed_forward_length u32 = 16384 +llama_model_loader: - kv 20: llama4.attention.head_count u32 = 40 +llama_model_loader: - kv 21: llama4.attention.head_count_kv u32 = 8 +llama_model_loader: - kv 22: llama4.rope.freq_base f32 = 500000.000000 +llama_model_loader: - kv 23: llama4.attention.layer_norm_rms_epsilon f32 = 0.000010 +llama_model_loader: - kv 24: llama4.expert_count u32 = 16 +llama_model_loader: - kv 25: llama4.expert_used_count u32 = 1 +llama_model_loader: - kv 26: llama4.attention.key_length u32 = 128 +llama_model_loader: - kv 27: llama4.attention.value_length u32 = 128 +llama_model_loader: - kv 28: llama4.vocab_size u32 = 202048 +llama_model_loader: - kv 29: llama4.rope.dimension_count u32 = 128 +llama_model_loader: - kv 30: llama4.interleave_moe_layer_step u32 = 1 +llama_model_loader: - kv 31: llama4.expert_feed_forward_length u32 = 8192 +llama_model_loader: - kv 32: tokenizer.ggml.model str = gpt2 +llama_model_loader: - kv 33: tokenizer.ggml.pre str = llama4 +llama_model_loader: - kv 34: tokenizer.ggml.tokens arr[str,202048] = ["À", "Á", "õ", "ö", "÷", "ø", ... +llama_model_loader: - kv 35: tokenizer.ggml.token_type arr[i32,202048] = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ... +llama_model_loader: - kv 36: tokenizer.ggml.merges arr[str,439802] = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "... 
+llama_model_loader: - kv 37: tokenizer.ggml.bos_token_id u32 = 200000 +llama_model_loader: - kv 38: tokenizer.ggml.eos_token_id u32 = 200008 +llama_model_loader: - kv 39: tokenizer.ggml.padding_token_id u32 = 200018 +llama_model_loader: - kv 40: tokenizer.ggml.add_bos_token bool = true +llama_model_loader: - kv 41: tokenizer.chat_template str = {{- bos_token }}\n{%- if custom_tools ... +llama_model_loader: - kv 42: general.quantization_version u32 = 2 +llama_model_loader: - kv 43: general.file_type u32 = 18 +llama_model_loader: - kv 44: quantize.imatrix.file str = Llama-4-Scout-17B-16E-Instruct-GGUF/i... +llama_model_loader: - kv 45: quantize.imatrix.dataset str = unsloth_calibration_Llama-4-Scout-17B... +llama_model_loader: - kv 46: quantize.imatrix.entries_count u32 = 528 +llama_model_loader: - kv 47: quantize.imatrix.chunks_count u32 = 729 +llama_model_loader: - kv 48: split.no u16 = 0 +llama_model_loader: - kv 49: split.tensors.count i32 = 628 +llama_model_loader: - kv 50: split.count u16 = 2 +llama_model_loader: - type f32: 146 tensors +llama_model_loader: - type q6_K: 482 tensors +print_info: file format = GGUF V3 (latest) +print_info: file type = Q6_K +print_info: file size = 82.35 GiB (6.56 BPW) +load: special_eos_id is not in special_eog_ids - the tokenizer config may be incorrect +load: special tokens cache size = 1135 +load: token to piece cache size = 1.3873 MB +print_info: arch = llama4 +print_info: vocab_only = 0 +print_info: n_ctx_train = 10485760 +print_info: n_embd = 5120 +print_info: n_layer = 48 +print_info: n_head = 40 +print_info: n_head_kv = 8 +print_info: n_rot = 128 +print_info: n_swa = 8192 +print_info: is_swa_any = 1 +print_info: n_embd_head_k = 128 +print_info: n_embd_head_v = 128 +print_info: n_gqa = 5 +print_info: n_embd_k_gqa = 1024 +print_info: n_embd_v_gqa = 1024 +print_info: f_norm_eps = 0.0e+00 +print_info: f_norm_rms_eps = 1.0e-05 +print_info: f_clamp_kqv = 0.0e+00 +print_info: f_max_alibi_bias = 0.0e+00 +print_info: 
f_logit_scale = 0.0e+00 +print_info: f_attn_scale = 0.0e+00 +print_info: n_ff = 16384 +print_info: n_expert = 16 +print_info: n_expert_used = 1 +print_info: causal attn = 1 +print_info: pooling type = 0 +print_info: rope type = 0 +print_info: rope scaling = linear +print_info: freq_base_train = 500000.0 +print_info: freq_scale_train = 1 +print_info: n_ctx_orig_yarn = 10485760 +print_info: rope_finetuned = unknown +print_info: model type = 17Bx16E (Scout) +print_info: model params = 107.77 B +print_info: general.name = Llama-4-Scout-17B-16E-Instruct +print_info: vocab type = BPE +print_info: n_vocab = 202048 +print_info: n_merges = 439802 +print_info: BOS token = 200000 '<|begin_of_text|>' +print_info: EOS token = 200008 '<|eot|>' +print_info: PAD token = 200018 '<|finetune_right_pad|>' +print_info: LF token = 198 'Ċ' +print_info: FIM PRE token = 200002 '<|fim_prefix|>' +print_info: FIM SUF token = 200004 '<|fim_suffix|>' +print_info: FIM MID token = 200003 '<|fim_middle|>' +print_info: EOG token = 200001 '<|end_of_text|>' +print_info: EOG token = 200008 '<|eot|>' +print_info: max token length = 192 +load_tensors: loading model tensors, this can take a while... (mmap = false) +load_tensors: offloading 48 repeating layers to GPU +load_tensors: offloading output layer to GPU +load_tensors: offloaded 49/49 layers to GPU +load_tensors: CPU model buffer size = 809.29 MiB +load_tensors: ROCm0 model buffer size = 83513.68 MiB +.................................................................................................... 
+llama_context: constructing llama_context +llama_context: non-unified KV cache requires ggml_set_rows() - forcing unified KV cache +llama_context: n_seq_max = 1 +llama_context: n_ctx = 4096 +llama_context: n_ctx_per_seq = 4096 +llama_context: n_batch = 2048 +llama_context: n_ubatch = 512 +llama_context: causal_attn = 1 +llama_context: flash_attn = 1 +llama_context: kv_unified = true +llama_context: freq_base = 500000.0 +llama_context: freq_scale = 1 +llama_context: n_ctx_per_seq (4096) < n_ctx_train (10485760) -- the full capacity of the model will not be utilized +llama_context: ROCm_Host output buffer size = 0.77 MiB +llama_kv_cache_unified_iswa: creating non-SWA KV cache, size = 4096 cells +llama_kv_cache_unified: ROCm0 KV buffer size = 192.00 MiB +llama_kv_cache_unified: size = 192.00 MiB ( 4096 cells, 12 layers, 1/ 1 seqs), K (f16): 96.00 MiB, V (f16): 96.00 MiB +llama_kv_cache_unified: LLAMA_SET_ROWS=0, using old ggml_cpy() method for backwards compatibility +llama_kv_cache_unified_iswa: creating SWA KV cache, size = 4096 cells +llama_kv_cache_unified: ROCm0 KV buffer size = 576.00 MiB +llama_kv_cache_unified: size = 576.00 MiB ( 4096 cells, 36 layers, 1/ 1 seqs), K (f16): 288.00 MiB, V (f16): 288.00 MiB +llama_kv_cache_unified: LLAMA_SET_ROWS=0, using old ggml_cpy() method for backwards compatibility +llama_context: ROCm0 compute buffer size = 442.62 MiB +llama_context: ROCm_Host compute buffer size = 26.01 MiB +llama_context: graph nodes = 2420 +llama_context: graph splits = 2 +common_init_from_params: added <|end_of_text|> logit bias = -inf +common_init_from_params: added <|eot|> logit bias = -inf +common_init_from_params: setting dry_penalty_last_n to ctx_size = 4096 +common_init_from_params: warming up the model with an empty run - please wait ... 
(--no-warmup to disable) +main: llama threadpool init, n_threads = 16 + +system_info: n_threads = 16 (n_threads_batch = 16) / 32 | ROCm : NO_VMM = 1 | PEER_MAX_BATCH_SIZE = 128 | CPU : SSE3 = 1 | SSSE3 = 1 | AVX = 1 | AVX2 = 1 | F16C = 1 | FMA = 1 | BMI2 = 1 | AVX512 = 1 | AVX512_VBMI = 1 | AVX512_VNNI = 1 | LLAMAFILE = 1 | OPENMP = 1 | REPACK = 1 | + +sampler seed: 1329865451 +sampler params: + repeat_last_n = 64, repeat_penalty = 1.000, frequency_penalty = 0.000, presence_penalty = 0.000 + dry_multiplier = 0.000, dry_base = 1.750, dry_allowed_length = 2, dry_penalty_last_n = 4096 + top_k = 40, top_p = 0.950, min_p = 0.050, xtc_probability = 0.000, xtc_threshold = 0.100, typical_p = 1.000, top_n_sigma = -1.000, temp = 0.800 + mirostat = 0, mirostat_lr = 0.100, mirostat_ent = 5.000 +sampler chain: logits -> logit-bias -> penalties -> dry -> top-n-sigma -> top-k -> typical -> top-p -> min-p -> xtc -> temp-ext -> dist +generate: n_ctx = 4096, n_batch = 2048, n_predict = 1, n_keep = 1 + +Hello1 + +llama_perf_sampler_print: sampling time = 0.07 ms / 3 runs ( 0.02 ms per token, 44776.12 tokens per second) +llama_perf_context_print: load time = 27337.52 ms +llama_perf_context_print: prompt eval time = 135.84 ms / 2 tokens ( 67.92 ms per token, 14.72 tokens per second) +llama_perf_context_print: eval time = 0.00 ms / 1 runs ( 0.00 ms per token, inf tokens per second) +llama_perf_context_print: total time = 155.35 ms / 3 tokens +llama_perf_context_print: graphs reused = 0 + Elapsed #3: 28.220065203s + Run #3 status: 0 + → Avg over 3 runs: 28.221s diff --git a/benchmark/loadtime_results/Llama-4-Scout-17B-16E-Instruct-Q6_K-00001-of-00002__rocm7_rc.log b/benchmark/loadtime_results/Llama-4-Scout-17B-16E-Instruct-Q6_K-00001-of-00002__rocm7_rc.log new file mode 100644 index 0000000..84d5fa3 --- /dev/null +++ b/benchmark/loadtime_results/Llama-4-Scout-17B-16E-Instruct-Q6_K-00001-of-00002__rocm7_rc.log @@ -0,0 +1,179 @@ +ggml_cuda_init: GGML_CUDA_FORCE_MMQ: no +ggml_cuda_init: 
GGML_CUDA_FORCE_CUBLAS: no +ggml_cuda_init: found 1 ROCm devices: + Device 0: AMD Radeon Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32 +build: 6066 (4cb208c9) with cc (GCC) 15.1.1 20250719 (Red Hat 15.1.1-5) for x86_64-redhat-linux +main: llama backend init +main: load the model and apply lora adapter, if any +llama_model_load_from_file_impl: using device ROCm0 (AMD Radeon Graphics) - 124523 MiB free +llama_model_loader: additional 1 GGUFs metadata loaded. +llama_model_loader: loaded meta data with 51 key-value pairs and 628 tensors from /home/kyuz0/models/llama-4-scout-17b-16e/Q6_K/Llama-4-Scout-17B-16E-Instruct-Q6_K-00001-of-00002.gguf (version GGUF V3 (latest)) +llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output. +llama_model_loader: - kv 0: general.architecture str = llama4 +llama_model_loader: - kv 1: general.type str = model +llama_model_loader: - kv 2: general.name str = Llama-4-Scout-17B-16E-Instruct +llama_model_loader: - kv 3: general.finetune str = 16E-Instruct +llama_model_loader: - kv 4: general.basename str = Llama-4-Scout-17B-16E-Instruct +llama_model_loader: - kv 5: general.quantized_by str = Unsloth +llama_model_loader: - kv 6: general.size_label str = 17B +llama_model_loader: - kv 7: general.license str = other +llama_model_loader: - kv 8: general.license.name str = llama4 +llama_model_loader: - kv 9: general.repo_url str = https://huggingface.co/unsloth +llama_model_loader: - kv 10: general.base_model.count u32 = 1 +llama_model_loader: - kv 11: general.base_model.0.name str = Llama 4 Scout 17B 16E Instruct +llama_model_loader: - kv 12: general.base_model.0.organization str = Meta Llama +llama_model_loader: - kv 13: general.base_model.0.repo_url str = https://huggingface.co/meta-llama/Lla... +llama_model_loader: - kv 14: general.tags arr[str,5] = ["facebook", "meta", "pytorch", "llam... +llama_model_loader: - kv 15: general.languages arr[str,12] = ["ar", "de", "en", "es", "fr", "hi", ... 
+llama_model_loader: - kv 16: llama4.block_count u32 = 48 +llama_model_loader: - kv 17: llama4.context_length u32 = 10485760 +llama_model_loader: - kv 18: llama4.embedding_length u32 = 5120 +llama_model_loader: - kv 19: llama4.feed_forward_length u32 = 16384 +llama_model_loader: - kv 20: llama4.attention.head_count u32 = 40 +llama_model_loader: - kv 21: llama4.attention.head_count_kv u32 = 8 +llama_model_loader: - kv 22: llama4.rope.freq_base f32 = 500000.000000 +llama_model_loader: - kv 23: llama4.attention.layer_norm_rms_epsilon f32 = 0.000010 +llama_model_loader: - kv 24: llama4.expert_count u32 = 16 +llama_model_loader: - kv 25: llama4.expert_used_count u32 = 1 +llama_model_loader: - kv 26: llama4.attention.key_length u32 = 128 +llama_model_loader: - kv 27: llama4.attention.value_length u32 = 128 +llama_model_loader: - kv 28: llama4.vocab_size u32 = 202048 +llama_model_loader: - kv 29: llama4.rope.dimension_count u32 = 128 +llama_model_loader: - kv 30: llama4.interleave_moe_layer_step u32 = 1 +llama_model_loader: - kv 31: llama4.expert_feed_forward_length u32 = 8192 +llama_model_loader: - kv 32: tokenizer.ggml.model str = gpt2 +llama_model_loader: - kv 33: tokenizer.ggml.pre str = llama4 +llama_model_loader: - kv 34: tokenizer.ggml.tokens arr[str,202048] = ["À", "Á", "õ", "ö", "÷", "ø", ... +llama_model_loader: - kv 35: tokenizer.ggml.token_type arr[i32,202048] = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ... +llama_model_loader: - kv 36: tokenizer.ggml.merges arr[str,439802] = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "... +llama_model_loader: - kv 37: tokenizer.ggml.bos_token_id u32 = 200000 +llama_model_loader: - kv 38: tokenizer.ggml.eos_token_id u32 = 200008 +llama_model_loader: - kv 39: tokenizer.ggml.padding_token_id u32 = 200018 +llama_model_loader: - kv 40: tokenizer.ggml.add_bos_token bool = true +llama_model_loader: - kv 41: tokenizer.chat_template str = {{- bos_token }}\n{%- if custom_tools ... 
+llama_model_loader: - kv 42: general.quantization_version u32 = 2 +llama_model_loader: - kv 43: general.file_type u32 = 18 +llama_model_loader: - kv 44: quantize.imatrix.file str = Llama-4-Scout-17B-16E-Instruct-GGUF/i... +llama_model_loader: - kv 45: quantize.imatrix.dataset str = unsloth_calibration_Llama-4-Scout-17B... +llama_model_loader: - kv 46: quantize.imatrix.entries_count u32 = 528 +llama_model_loader: - kv 47: quantize.imatrix.chunks_count u32 = 729 +llama_model_loader: - kv 48: split.no u16 = 0 +llama_model_loader: - kv 49: split.tensors.count i32 = 628 +llama_model_loader: - kv 50: split.count u16 = 2 +llama_model_loader: - type f32: 146 tensors +llama_model_loader: - type q6_K: 482 tensors +print_info: file format = GGUF V3 (latest) +print_info: file type = Q6_K +print_info: file size = 82.35 GiB (6.56 BPW) +load: special_eos_id is not in special_eog_ids - the tokenizer config may be incorrect +load: special tokens cache size = 1135 +load: token to piece cache size = 1.3873 MB +print_info: arch = llama4 +print_info: vocab_only = 0 +print_info: n_ctx_train = 10485760 +print_info: n_embd = 5120 +print_info: n_layer = 48 +print_info: n_head = 40 +print_info: n_head_kv = 8 +print_info: n_rot = 128 +print_info: n_swa = 8192 +print_info: is_swa_any = 1 +print_info: n_embd_head_k = 128 +print_info: n_embd_head_v = 128 +print_info: n_gqa = 5 +print_info: n_embd_k_gqa = 1024 +print_info: n_embd_v_gqa = 1024 +print_info: f_norm_eps = 0.0e+00 +print_info: f_norm_rms_eps = 1.0e-05 +print_info: f_clamp_kqv = 0.0e+00 +print_info: f_max_alibi_bias = 0.0e+00 +print_info: f_logit_scale = 0.0e+00 +print_info: f_attn_scale = 0.0e+00 +print_info: n_ff = 16384 +print_info: n_expert = 16 +print_info: n_expert_used = 1 +print_info: causal attn = 1 +print_info: pooling type = 0 +print_info: rope type = 0 +print_info: rope scaling = linear +print_info: freq_base_train = 500000.0 +print_info: freq_scale_train = 1 +print_info: n_ctx_orig_yarn = 10485760 +print_info: 
rope_finetuned = unknown +print_info: model type = 17Bx16E (Scout) +print_info: model params = 107.77 B +print_info: general.name = Llama-4-Scout-17B-16E-Instruct +print_info: vocab type = BPE +print_info: n_vocab = 202048 +print_info: n_merges = 439802 +print_info: BOS token = 200000 '<|begin_of_text|>' +print_info: EOS token = 200008 '<|eot|>' +print_info: PAD token = 200018 '<|finetune_right_pad|>' +print_info: LF token = 198 'Ċ' +print_info: FIM PRE token = 200002 '<|fim_prefix|>' +print_info: FIM SUF token = 200004 '<|fim_suffix|>' +print_info: FIM MID token = 200003 '<|fim_middle|>' +print_info: EOG token = 200001 '<|end_of_text|>' +print_info: EOG token = 200008 '<|eot|>' +print_info: max token length = 192 +load_tensors: loading model tensors, this can take a while... (mmap = false) +load_tensors: offloading 48 repeating layers to GPU +load_tensors: offloading output layer to GPU +load_tensors: offloaded 49/49 layers to GPU +load_tensors: CPU model buffer size = 809.29 MiB +load_tensors: ROCm0 model buffer size = 83513.68 MiB +.................................................................................................... 
+llama_context: constructing llama_context +llama_context: non-unified KV cache requires ggml_set_rows() - forcing unified KV cache +llama_context: n_seq_max = 1 +llama_context: n_ctx = 4096 +llama_context: n_ctx_per_seq = 4096 +llama_context: n_batch = 2048 +llama_context: n_ubatch = 512 +llama_context: causal_attn = 1 +llama_context: flash_attn = 1 +llama_context: kv_unified = true +llama_context: freq_base = 500000.0 +llama_context: freq_scale = 1 +llama_context: n_ctx_per_seq (4096) < n_ctx_train (10485760) -- the full capacity of the model will not be utilized +llama_context: ROCm_Host output buffer size = 0.77 MiB +llama_kv_cache_unified_iswa: creating non-SWA KV cache, size = 4096 cells +llama_kv_cache_unified: ROCm0 KV buffer size = 192.00 MiB +llama_kv_cache_unified: size = 192.00 MiB ( 4096 cells, 12 layers, 1/ 1 seqs), K (f16): 96.00 MiB, V (f16): 96.00 MiB +llama_kv_cache_unified: LLAMA_SET_ROWS=0, using old ggml_cpy() method for backwards compatibility +llama_kv_cache_unified_iswa: creating SWA KV cache, size = 4096 cells +llama_kv_cache_unified: ROCm0 KV buffer size = 576.00 MiB +llama_kv_cache_unified: size = 576.00 MiB ( 4096 cells, 36 layers, 1/ 1 seqs), K (f16): 288.00 MiB, V (f16): 288.00 MiB +llama_kv_cache_unified: LLAMA_SET_ROWS=0, using old ggml_cpy() method for backwards compatibility +llama_context: ROCm0 compute buffer size = 442.62 MiB +llama_context: ROCm_Host compute buffer size = 26.01 MiB +llama_context: graph nodes = 2420 +llama_context: graph splits = 2 +common_init_from_params: added <|end_of_text|> logit bias = -inf +common_init_from_params: added <|eot|> logit bias = -inf +common_init_from_params: setting dry_penalty_last_n to ctx_size = 4096 +common_init_from_params: warming up the model with an empty run - please wait ... 
(--no-warmup to disable) +main: llama threadpool init, n_threads = 16 + +system_info: n_threads = 16 (n_threads_batch = 16) / 32 | ROCm : NO_VMM = 1 | PEER_MAX_BATCH_SIZE = 128 | CPU : SSE3 = 1 | SSSE3 = 1 | AVX = 1 | AVX2 = 1 | F16C = 1 | FMA = 1 | BMI2 = 1 | AVX512 = 1 | AVX512_VBMI = 1 | AVX512_VNNI = 1 | LLAMAFILE = 1 | OPENMP = 1 | REPACK = 1 | + +sampler seed: 3194189125 +sampler params: + repeat_last_n = 64, repeat_penalty = 1.000, frequency_penalty = 0.000, presence_penalty = 0.000 + dry_multiplier = 0.000, dry_base = 1.750, dry_allowed_length = 2, dry_penalty_last_n = 4096 + top_k = 40, top_p = 0.950, min_p = 0.050, xtc_probability = 0.000, xtc_threshold = 0.100, typical_p = 1.000, top_n_sigma = -1.000, temp = 0.800 + mirostat = 0, mirostat_lr = 0.100, mirostat_ent = 5.000 +sampler chain: logits -> logit-bias -> penalties -> dry -> top-n-sigma -> top-k -> typical -> top-p -> min-p -> xtc -> temp-ext -> dist +generate: n_ctx = 4096, n_batch = 2048, n_predict = 1, n_keep = 1 + +Hello: + +llama_perf_sampler_print: sampling time = 0.07 ms / 3 runs ( 0.02 ms per token, 46153.85 tokens per second) +llama_perf_context_print: load time = 26424.61 ms +llama_perf_context_print: prompt eval time = 106.73 ms / 2 tokens ( 53.37 ms per token, 18.74 tokens per second) +llama_perf_context_print: eval time = 0.00 ms / 1 runs ( 0.00 ms per token, inf tokens per second) +llama_perf_context_print: total time = 126.53 ms / 3 tokens +llama_perf_context_print: graphs reused = 0 + Elapsed #3: 27.353142250s + Run #3 status: 0 + → Avg over 3 runs: 28.435s diff --git a/benchmark/loadtime_results/Llama-4-Scout-17B-16E-Instruct-Q6_K-00001-of-00002__vulkan_amdvlk.log b/benchmark/loadtime_results/Llama-4-Scout-17B-16E-Instruct-Q6_K-00001-of-00002__vulkan_amdvlk.log new file mode 100644 index 0000000..da4b832 --- /dev/null +++ b/benchmark/loadtime_results/Llama-4-Scout-17B-16E-Instruct-Q6_K-00001-of-00002__vulkan_amdvlk.log @@ -0,0 +1,177 @@ +ggml_vulkan: Found 1 Vulkan devices: 
+ggml_vulkan: 0 = Radeon 8060S Graphics (AMD open-source driver) | uma: 1 | fp16: 1 | bf16: 0 | warp size: 64 | shared memory: 32768 | int dot: 1 | matrix cores: KHR_coopmat +build: 6060 (9c35706b) with cc (GCC) 15.1.1 20250719 (Red Hat 15.1.1-5) for x86_64-redhat-linux +main: llama backend init +main: load the model and apply lora adapter, if any +llama_model_load_from_file_impl: using device Vulkan0 (Radeon 8060S Graphics) - 85720 MiB free +llama_model_loader: additional 1 GGUFs metadata loaded. +llama_model_loader: loaded meta data with 51 key-value pairs and 628 tensors from /home/kyuz0/models/llama-4-scout-17b-16e/Q6_K/Llama-4-Scout-17B-16E-Instruct-Q6_K-00001-of-00002.gguf (version GGUF V3 (latest)) +llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output. +llama_model_loader: - kv 0: general.architecture str = llama4 +llama_model_loader: - kv 1: general.type str = model +llama_model_loader: - kv 2: general.name str = Llama-4-Scout-17B-16E-Instruct +llama_model_loader: - kv 3: general.finetune str = 16E-Instruct +llama_model_loader: - kv 4: general.basename str = Llama-4-Scout-17B-16E-Instruct +llama_model_loader: - kv 5: general.quantized_by str = Unsloth +llama_model_loader: - kv 6: general.size_label str = 17B +llama_model_loader: - kv 7: general.license str = other +llama_model_loader: - kv 8: general.license.name str = llama4 +llama_model_loader: - kv 9: general.repo_url str = https://huggingface.co/unsloth +llama_model_loader: - kv 10: general.base_model.count u32 = 1 +llama_model_loader: - kv 11: general.base_model.0.name str = Llama 4 Scout 17B 16E Instruct +llama_model_loader: - kv 12: general.base_model.0.organization str = Meta Llama +llama_model_loader: - kv 13: general.base_model.0.repo_url str = https://huggingface.co/meta-llama/Lla... +llama_model_loader: - kv 14: general.tags arr[str,5] = ["facebook", "meta", "pytorch", "llam... 
+llama_model_loader: - kv 15: general.languages arr[str,12] = ["ar", "de", "en", "es", "fr", "hi", ... +llama_model_loader: - kv 16: llama4.block_count u32 = 48 +llama_model_loader: - kv 17: llama4.context_length u32 = 10485760 +llama_model_loader: - kv 18: llama4.embedding_length u32 = 5120 +llama_model_loader: - kv 19: llama4.feed_forward_length u32 = 16384 +llama_model_loader: - kv 20: llama4.attention.head_count u32 = 40 +llama_model_loader: - kv 21: llama4.attention.head_count_kv u32 = 8 +llama_model_loader: - kv 22: llama4.rope.freq_base f32 = 500000.000000 +llama_model_loader: - kv 23: llama4.attention.layer_norm_rms_epsilon f32 = 0.000010 +llama_model_loader: - kv 24: llama4.expert_count u32 = 16 +llama_model_loader: - kv 25: llama4.expert_used_count u32 = 1 +llama_model_loader: - kv 26: llama4.attention.key_length u32 = 128 +llama_model_loader: - kv 27: llama4.attention.value_length u32 = 128 +llama_model_loader: - kv 28: llama4.vocab_size u32 = 202048 +llama_model_loader: - kv 29: llama4.rope.dimension_count u32 = 128 +llama_model_loader: - kv 30: llama4.interleave_moe_layer_step u32 = 1 +llama_model_loader: - kv 31: llama4.expert_feed_forward_length u32 = 8192 +llama_model_loader: - kv 32: tokenizer.ggml.model str = gpt2 +llama_model_loader: - kv 33: tokenizer.ggml.pre str = llama4 +llama_model_loader: - kv 34: tokenizer.ggml.tokens arr[str,202048] = ["À", "Á", "õ", "ö", "÷", "ø", ... +llama_model_loader: - kv 35: tokenizer.ggml.token_type arr[i32,202048] = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ... +llama_model_loader: - kv 36: tokenizer.ggml.merges arr[str,439802] = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "... 
+llama_model_loader: - kv 37: tokenizer.ggml.bos_token_id u32 = 200000 +llama_model_loader: - kv 38: tokenizer.ggml.eos_token_id u32 = 200008 +llama_model_loader: - kv 39: tokenizer.ggml.padding_token_id u32 = 200018 +llama_model_loader: - kv 40: tokenizer.ggml.add_bos_token bool = true +llama_model_loader: - kv 41: tokenizer.chat_template str = {{- bos_token }}\n{%- if custom_tools ... +llama_model_loader: - kv 42: general.quantization_version u32 = 2 +llama_model_loader: - kv 43: general.file_type u32 = 18 +llama_model_loader: - kv 44: quantize.imatrix.file str = Llama-4-Scout-17B-16E-Instruct-GGUF/i... +llama_model_loader: - kv 45: quantize.imatrix.dataset str = unsloth_calibration_Llama-4-Scout-17B... +llama_model_loader: - kv 46: quantize.imatrix.entries_count u32 = 528 +llama_model_loader: - kv 47: quantize.imatrix.chunks_count u32 = 729 +llama_model_loader: - kv 48: split.no u16 = 0 +llama_model_loader: - kv 49: split.tensors.count i32 = 628 +llama_model_loader: - kv 50: split.count u16 = 2 +llama_model_loader: - type f32: 146 tensors +llama_model_loader: - type q6_K: 482 tensors +print_info: file format = GGUF V3 (latest) +print_info: file type = Q6_K +print_info: file size = 82.35 GiB (6.56 BPW) +load: special_eos_id is not in special_eog_ids - the tokenizer config may be incorrect +load: special tokens cache size = 1135 +load: token to piece cache size = 1.3873 MB +print_info: arch = llama4 +print_info: vocab_only = 0 +print_info: n_ctx_train = 10485760 +print_info: n_embd = 5120 +print_info: n_layer = 48 +print_info: n_head = 40 +print_info: n_head_kv = 8 +print_info: n_rot = 128 +print_info: n_swa = 8192 +print_info: is_swa_any = 1 +print_info: n_embd_head_k = 128 +print_info: n_embd_head_v = 128 +print_info: n_gqa = 5 +print_info: n_embd_k_gqa = 1024 +print_info: n_embd_v_gqa = 1024 +print_info: f_norm_eps = 0.0e+00 +print_info: f_norm_rms_eps = 1.0e-05 +print_info: f_clamp_kqv = 0.0e+00 +print_info: f_max_alibi_bias = 0.0e+00 +print_info: 
f_logit_scale = 0.0e+00 +print_info: f_attn_scale = 0.0e+00 +print_info: n_ff = 16384 +print_info: n_expert = 16 +print_info: n_expert_used = 1 +print_info: causal attn = 1 +print_info: pooling type = 0 +print_info: rope type = 0 +print_info: rope scaling = linear +print_info: freq_base_train = 500000.0 +print_info: freq_scale_train = 1 +print_info: n_ctx_orig_yarn = 10485760 +print_info: rope_finetuned = unknown +print_info: model type = 17Bx16E (Scout) +print_info: model params = 107.77 B +print_info: general.name = Llama-4-Scout-17B-16E-Instruct +print_info: vocab type = BPE +print_info: n_vocab = 202048 +print_info: n_merges = 439802 +print_info: BOS token = 200000 '<|begin_of_text|>' +print_info: EOS token = 200008 '<|eot|>' +print_info: PAD token = 200018 '<|finetune_right_pad|>' +print_info: LF token = 198 'Ċ' +print_info: FIM PRE token = 200002 '<|fim_prefix|>' +print_info: FIM SUF token = 200004 '<|fim_suffix|>' +print_info: FIM MID token = 200003 '<|fim_middle|>' +print_info: EOG token = 200001 '<|end_of_text|>' +print_info: EOG token = 200008 '<|eot|>' +print_info: max token length = 192 +load_tensors: loading model tensors, this can take a while... (mmap = false) +load_tensors: offloading 48 repeating layers to GPU +load_tensors: offloading output layer to GPU +load_tensors: offloaded 49/49 layers to GPU +load_tensors: Vulkan0 model buffer size = 83513.68 MiB +load_tensors: CPU model buffer size = 809.29 MiB +.................................................................................................... 
+llama_context: constructing llama_context +llama_context: non-unified KV cache requires ggml_set_rows() - forcing unified KV cache +llama_context: n_seq_max = 1 +llama_context: n_ctx = 4096 +llama_context: n_ctx_per_seq = 4096 +llama_context: n_batch = 2048 +llama_context: n_ubatch = 512 +llama_context: causal_attn = 1 +llama_context: flash_attn = 1 +llama_context: kv_unified = true +llama_context: freq_base = 500000.0 +llama_context: freq_scale = 1 +llama_context: n_ctx_per_seq (4096) < n_ctx_train (10485760) -- the full capacity of the model will not be utilized +llama_context: Vulkan_Host output buffer size = 0.77 MiB +llama_kv_cache_unified_iswa: creating non-SWA KV cache, size = 4096 cells +llama_kv_cache_unified: Vulkan0 KV buffer size = 192.00 MiB +llama_kv_cache_unified: size = 192.00 MiB ( 4096 cells, 12 layers, 1/ 1 seqs), K (f16): 96.00 MiB, V (f16): 96.00 MiB +llama_kv_cache_unified: LLAMA_SET_ROWS=0, using old ggml_cpy() method for backwards compatibility +llama_kv_cache_unified_iswa: creating SWA KV cache, size = 4096 cells +llama_kv_cache_unified: Vulkan0 KV buffer size = 576.00 MiB +llama_kv_cache_unified: size = 576.00 MiB ( 4096 cells, 36 layers, 1/ 1 seqs), K (f16): 288.00 MiB, V (f16): 288.00 MiB +llama_kv_cache_unified: LLAMA_SET_ROWS=0, using old ggml_cpy() method for backwards compatibility +llama_context: Vulkan0 compute buffer size = 440.63 MiB +llama_context: Vulkan_Host compute buffer size = 26.01 MiB +llama_context: graph nodes = 2420 +llama_context: graph splits = 2 +common_init_from_params: added <|end_of_text|> logit bias = -inf +common_init_from_params: added <|eot|> logit bias = -inf +common_init_from_params: setting dry_penalty_last_n to ctx_size = 4096 +common_init_from_params: warming up the model with an empty run - please wait ... 
(--no-warmup to disable) +main: llama threadpool init, n_threads = 16 + +system_info: n_threads = 16 (n_threads_batch = 16) / 32 | CPU : SSE3 = 1 | SSSE3 = 1 | AVX = 1 | AVX2 = 1 | F16C = 1 | FMA = 1 | BMI2 = 1 | AVX512 = 1 | AVX512_VBMI = 1 | AVX512_VNNI = 1 | LLAMAFILE = 1 | OPENMP = 1 | REPACK = 1 | + +sampler seed: 4111748233 +sampler params: + repeat_last_n = 64, repeat_penalty = 1.000, frequency_penalty = 0.000, presence_penalty = 0.000 + dry_multiplier = 0.000, dry_base = 1.750, dry_allowed_length = 2, dry_penalty_last_n = 4096 + top_k = 40, top_p = 0.950, min_p = 0.050, xtc_probability = 0.000, xtc_threshold = 0.100, typical_p = 1.000, top_n_sigma = -1.000, temp = 0.800 + mirostat = 0, mirostat_lr = 0.100, mirostat_ent = 5.000 +sampler chain: logits -> logit-bias -> penalties -> dry -> top-n-sigma -> top-k -> typical -> top-p -> min-p -> xtc -> temp-ext -> dist +generate: n_ctx = 4096, n_batch = 2048, n_predict = 1, n_keep = 1 + +Hello: + +llama_perf_sampler_print: sampling time = 0.15 ms / 3 runs ( 0.05 ms per token, 20134.23 tokens per second) +llama_perf_context_print: load time = 31375.27 ms +llama_perf_context_print: prompt eval time = 267.76 ms / 2 tokens ( 133.88 ms per token, 7.47 tokens per second) +llama_perf_context_print: eval time = 0.00 ms / 1 runs ( 0.00 ms per token, inf tokens per second) +llama_perf_context_print: total time = 295.92 ms / 3 tokens +llama_perf_context_print: graphs reused = 0 + Elapsed #3: 33.122388042s + Run #3 status: 0 + → Avg over 3 runs: 35.541s diff --git a/benchmark/loadtime_results/Llama-4-Scout-17B-16E-Instruct-Q6_K-00001-of-00002__vulkan_radv.log b/benchmark/loadtime_results/Llama-4-Scout-17B-16E-Instruct-Q6_K-00001-of-00002__vulkan_radv.log new file mode 100644 index 0000000..acb490f --- /dev/null +++ b/benchmark/loadtime_results/Llama-4-Scout-17B-16E-Instruct-Q6_K-00001-of-00002__vulkan_radv.log @@ -0,0 +1,177 @@ +ggml_vulkan: Found 1 Vulkan devices: +ggml_vulkan: 0 = Radeon 8060S Graphics (RADV GFX1151) (radv) 
| uma: 1 | fp16: 1 | bf16: 0 | warp size: 64 | shared memory: 65536 | int dot: 1 | matrix cores: KHR_coopmat +build: 6040 (66625a59) with cc (GCC) 15.1.1 20250719 (Red Hat 15.1.1-5) for x86_64-redhat-linux +main: llama backend init +main: load the model and apply lora adapter, if any +llama_model_load_from_file_impl: using device Vulkan0 (Radeon 8060S Graphics (RADV GFX1151)) - 87722 MiB free +llama_model_loader: additional 1 GGUFs metadata loaded. +llama_model_loader: loaded meta data with 51 key-value pairs and 628 tensors from /home/kyuz0/models/llama-4-scout-17b-16e/Q6_K/Llama-4-Scout-17B-16E-Instruct-Q6_K-00001-of-00002.gguf (version GGUF V3 (latest)) +llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output. +llama_model_loader: - kv 0: general.architecture str = llama4 +llama_model_loader: - kv 1: general.type str = model +llama_model_loader: - kv 2: general.name str = Llama-4-Scout-17B-16E-Instruct +llama_model_loader: - kv 3: general.finetune str = 16E-Instruct +llama_model_loader: - kv 4: general.basename str = Llama-4-Scout-17B-16E-Instruct +llama_model_loader: - kv 5: general.quantized_by str = Unsloth +llama_model_loader: - kv 6: general.size_label str = 17B +llama_model_loader: - kv 7: general.license str = other +llama_model_loader: - kv 8: general.license.name str = llama4 +llama_model_loader: - kv 9: general.repo_url str = https://huggingface.co/unsloth +llama_model_loader: - kv 10: general.base_model.count u32 = 1 +llama_model_loader: - kv 11: general.base_model.0.name str = Llama 4 Scout 17B 16E Instruct +llama_model_loader: - kv 12: general.base_model.0.organization str = Meta Llama +llama_model_loader: - kv 13: general.base_model.0.repo_url str = https://huggingface.co/meta-llama/Lla... +llama_model_loader: - kv 14: general.tags arr[str,5] = ["facebook", "meta", "pytorch", "llam... +llama_model_loader: - kv 15: general.languages arr[str,12] = ["ar", "de", "en", "es", "fr", "hi", ... 
+llama_model_loader: - kv 16: llama4.block_count u32 = 48 +llama_model_loader: - kv 17: llama4.context_length u32 = 10485760 +llama_model_loader: - kv 18: llama4.embedding_length u32 = 5120 +llama_model_loader: - kv 19: llama4.feed_forward_length u32 = 16384 +llama_model_loader: - kv 20: llama4.attention.head_count u32 = 40 +llama_model_loader: - kv 21: llama4.attention.head_count_kv u32 = 8 +llama_model_loader: - kv 22: llama4.rope.freq_base f32 = 500000.000000 +llama_model_loader: - kv 23: llama4.attention.layer_norm_rms_epsilon f32 = 0.000010 +llama_model_loader: - kv 24: llama4.expert_count u32 = 16 +llama_model_loader: - kv 25: llama4.expert_used_count u32 = 1 +llama_model_loader: - kv 26: llama4.attention.key_length u32 = 128 +llama_model_loader: - kv 27: llama4.attention.value_length u32 = 128 +llama_model_loader: - kv 28: llama4.vocab_size u32 = 202048 +llama_model_loader: - kv 29: llama4.rope.dimension_count u32 = 128 +llama_model_loader: - kv 30: llama4.interleave_moe_layer_step u32 = 1 +llama_model_loader: - kv 31: llama4.expert_feed_forward_length u32 = 8192 +llama_model_loader: - kv 32: tokenizer.ggml.model str = gpt2 +llama_model_loader: - kv 33: tokenizer.ggml.pre str = llama4 +llama_model_loader: - kv 34: tokenizer.ggml.tokens arr[str,202048] = ["À", "Á", "õ", "ö", "÷", "ø", ... +llama_model_loader: - kv 35: tokenizer.ggml.token_type arr[i32,202048] = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ... +llama_model_loader: - kv 36: tokenizer.ggml.merges arr[str,439802] = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "... +llama_model_loader: - kv 37: tokenizer.ggml.bos_token_id u32 = 200000 +llama_model_loader: - kv 38: tokenizer.ggml.eos_token_id u32 = 200008 +llama_model_loader: - kv 39: tokenizer.ggml.padding_token_id u32 = 200018 +llama_model_loader: - kv 40: tokenizer.ggml.add_bos_token bool = true +llama_model_loader: - kv 41: tokenizer.chat_template str = {{- bos_token }}\n{%- if custom_tools ... 
+llama_model_loader: - kv 42: general.quantization_version u32 = 2 +llama_model_loader: - kv 43: general.file_type u32 = 18 +llama_model_loader: - kv 44: quantize.imatrix.file str = Llama-4-Scout-17B-16E-Instruct-GGUF/i... +llama_model_loader: - kv 45: quantize.imatrix.dataset str = unsloth_calibration_Llama-4-Scout-17B... +llama_model_loader: - kv 46: quantize.imatrix.entries_count u32 = 528 +llama_model_loader: - kv 47: quantize.imatrix.chunks_count u32 = 729 +llama_model_loader: - kv 48: split.no u16 = 0 +llama_model_loader: - kv 49: split.tensors.count i32 = 628 +llama_model_loader: - kv 50: split.count u16 = 2 +llama_model_loader: - type f32: 146 tensors +llama_model_loader: - type q6_K: 482 tensors +print_info: file format = GGUF V3 (latest) +print_info: file type = Q6_K +print_info: file size = 82.35 GiB (6.56 BPW) +load: special_eos_id is not in special_eog_ids - the tokenizer config may be incorrect +load: special tokens cache size = 1135 +load: token to piece cache size = 1.3873 MB +print_info: arch = llama4 +print_info: vocab_only = 0 +print_info: n_ctx_train = 10485760 +print_info: n_embd = 5120 +print_info: n_layer = 48 +print_info: n_head = 40 +print_info: n_head_kv = 8 +print_info: n_rot = 128 +print_info: n_swa = 8192 +print_info: is_swa_any = 1 +print_info: n_embd_head_k = 128 +print_info: n_embd_head_v = 128 +print_info: n_gqa = 5 +print_info: n_embd_k_gqa = 1024 +print_info: n_embd_v_gqa = 1024 +print_info: f_norm_eps = 0.0e+00 +print_info: f_norm_rms_eps = 1.0e-05 +print_info: f_clamp_kqv = 0.0e+00 +print_info: f_max_alibi_bias = 0.0e+00 +print_info: f_logit_scale = 0.0e+00 +print_info: f_attn_scale = 0.0e+00 +print_info: n_ff = 16384 +print_info: n_expert = 16 +print_info: n_expert_used = 1 +print_info: causal attn = 1 +print_info: pooling type = 0 +print_info: rope type = 0 +print_info: rope scaling = linear +print_info: freq_base_train = 500000.0 +print_info: freq_scale_train = 1 +print_info: n_ctx_orig_yarn = 10485760 +print_info: 
rope_finetuned = unknown +print_info: model type = 17Bx16E (Scout) +print_info: model params = 107.77 B +print_info: general.name = Llama-4-Scout-17B-16E-Instruct +print_info: vocab type = BPE +print_info: n_vocab = 202048 +print_info: n_merges = 439802 +print_info: BOS token = 200000 '<|begin_of_text|>' +print_info: EOS token = 200008 '<|eot|>' +print_info: PAD token = 200018 '<|finetune_right_pad|>' +print_info: LF token = 198 'Ċ' +print_info: FIM PRE token = 200002 '<|fim_prefix|>' +print_info: FIM SUF token = 200004 '<|fim_suffix|>' +print_info: FIM MID token = 200003 '<|fim_middle|>' +print_info: EOG token = 200001 '<|end_of_text|>' +print_info: EOG token = 200008 '<|eot|>' +print_info: max token length = 192 +load_tensors: loading model tensors, this can take a while... (mmap = false) +load_tensors: offloading 48 repeating layers to GPU +load_tensors: offloading output layer to GPU +load_tensors: offloaded 49/49 layers to GPU +load_tensors: Vulkan0 model buffer size = 83513.68 MiB +load_tensors: CPU model buffer size = 809.29 MiB +.................................................................................................... 
+llama_context: constructing llama_context +llama_context: non-unified KV cache requires ggml_set_rows() - forcing unified KV cache +llama_context: n_seq_max = 1 +llama_context: n_ctx = 4096 +llama_context: n_ctx_per_seq = 4096 +llama_context: n_batch = 2048 +llama_context: n_ubatch = 512 +llama_context: causal_attn = 1 +llama_context: flash_attn = 1 +llama_context: kv_unified = true +llama_context: freq_base = 500000.0 +llama_context: freq_scale = 1 +llama_context: n_ctx_per_seq (4096) < n_ctx_train (10485760) -- the full capacity of the model will not be utilized +llama_context: Vulkan_Host output buffer size = 0.77 MiB +llama_kv_cache_unified_iswa: creating non-SWA KV cache, size = 4096 cells +llama_kv_cache_unified: Vulkan0 KV buffer size = 192.00 MiB +llama_kv_cache_unified: size = 192.00 MiB ( 4096 cells, 12 layers, 1/ 1 seqs), K (f16): 96.00 MiB, V (f16): 96.00 MiB +llama_kv_cache_unified: LLAMA_SET_ROWS=0, using old ggml_cpy() method for backwards compatibility +llama_kv_cache_unified_iswa: creating SWA KV cache, size = 4096 cells +llama_kv_cache_unified: Vulkan0 KV buffer size = 576.00 MiB +llama_kv_cache_unified: size = 576.00 MiB ( 4096 cells, 36 layers, 1/ 1 seqs), K (f16): 288.00 MiB, V (f16): 288.00 MiB +llama_kv_cache_unified: LLAMA_SET_ROWS=0, using old ggml_cpy() method for backwards compatibility +llama_context: Vulkan0 compute buffer size = 440.63 MiB +llama_context: Vulkan_Host compute buffer size = 26.02 MiB +llama_context: graph nodes = 2420 +llama_context: graph splits = 2 +common_init_from_params: added <|end_of_text|> logit bias = -inf +common_init_from_params: added <|eot|> logit bias = -inf +common_init_from_params: setting dry_penalty_last_n to ctx_size = 4096 +common_init_from_params: warming up the model with an empty run - please wait ... 
(--no-warmup to disable) +main: llama threadpool init, n_threads = 16 + +system_info: n_threads = 16 (n_threads_batch = 16) / 32 | CPU : SSE3 = 1 | SSSE3 = 1 | AVX = 1 | AVX2 = 1 | F16C = 1 | FMA = 1 | BMI2 = 1 | AVX512 = 1 | AVX512_VBMI = 1 | AVX512_VNNI = 1 | LLAMAFILE = 1 | OPENMP = 1 | REPACK = 1 | + +sampler seed: 1422642604 +sampler params: + repeat_last_n = 64, repeat_penalty = 1.000, frequency_penalty = 0.000, presence_penalty = 0.000 + dry_multiplier = 0.000, dry_base = 1.750, dry_allowed_length = 2, dry_penalty_last_n = 4096 + top_k = 40, top_p = 0.950, min_p = 0.050, xtc_probability = 0.000, xtc_threshold = 0.100, typical_p = 1.000, top_n_sigma = -1.000, temp = 0.800 + mirostat = 0, mirostat_lr = 0.100, mirostat_ent = 5.000 +sampler chain: logits -> logit-bias -> penalties -> dry -> top-n-sigma -> top-k -> typical -> top-p -> min-p -> xtc -> temp-ext -> dist +generate: n_ctx = 4096, n_batch = 2048, n_predict = 1, n_keep = 1 + +Hello1 + +llama_perf_sampler_print: sampling time = 0.09 ms / 3 runs ( 0.03 ms per token, 32967.03 tokens per second) +llama_perf_context_print: load time = 32072.23 ms +llama_perf_context_print: prompt eval time = 296.78 ms / 2 tokens ( 148.39 ms per token, 6.74 tokens per second) +llama_perf_context_print: eval time = 0.00 ms / 1 runs ( 0.00 ms per token, inf tokens per second) +llama_perf_context_print: total time = 324.57 ms / 3 tokens +llama_perf_context_print: graphs reused = 0 + Elapsed #3: 32.859879045s + Run #3 status: 0 + → Avg over 3 runs: 32.810s diff --git a/benchmark/loadtime_results/Llama-4-Scout-17B-16E-Instruct-Q8_0-00001-of-00003__rocm6_4_2.log b/benchmark/loadtime_results/Llama-4-Scout-17B-16E-Instruct-Q8_0-00001-of-00003__rocm6_4_2.log new file mode 100644 index 0000000..eaf30ee --- /dev/null +++ b/benchmark/loadtime_results/Llama-4-Scout-17B-16E-Instruct-Q8_0-00001-of-00003__rocm6_4_2.log @@ -0,0 +1,179 @@ +ggml_cuda_init: GGML_CUDA_FORCE_MMQ: no +ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no +ggml_cuda_init: 
found 1 ROCm devices: + Device 0: Radeon 8060S Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32 +build: 6040 (66625a59) with cc (GCC) 15.1.1 20250521 (Red Hat 15.1.1-2) for x86_64-redhat-linux +main: llama backend init +main: load the model and apply lora adapter, if any +llama_model_load_from_file_impl: using device ROCm0 (Radeon 8060S Graphics) - 124522 MiB free +llama_model_loader: additional 2 GGUFs metadata loaded. +llama_model_loader: loaded meta data with 51 key-value pairs and 628 tensors from /home/kyuz0/models/llama-4-scout-17b-16e/Q8_0/Llama-4-Scout-17B-16E-Instruct-Q8_0-00001-of-00003.gguf (version GGUF V3 (latest)) +llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output. +llama_model_loader: - kv 0: general.architecture str = llama4 +llama_model_loader: - kv 1: general.type str = model +llama_model_loader: - kv 2: general.name str = Llama-4-Scout-17B-16E-Instruct +llama_model_loader: - kv 3: general.finetune str = 16E-Instruct +llama_model_loader: - kv 4: general.basename str = Llama-4-Scout-17B-16E-Instruct +llama_model_loader: - kv 5: general.quantized_by str = Unsloth +llama_model_loader: - kv 6: general.size_label str = 17B +llama_model_loader: - kv 7: general.license str = other +llama_model_loader: - kv 8: general.license.name str = llama4 +llama_model_loader: - kv 9: general.repo_url str = https://huggingface.co/unsloth +llama_model_loader: - kv 10: general.base_model.count u32 = 1 +llama_model_loader: - kv 11: general.base_model.0.name str = Llama 4 Scout 17B 16E Instruct +llama_model_loader: - kv 12: general.base_model.0.organization str = Meta Llama +llama_model_loader: - kv 13: general.base_model.0.repo_url str = https://huggingface.co/meta-llama/Lla... +llama_model_loader: - kv 14: general.tags arr[str,5] = ["facebook", "meta", "pytorch", "llam... +llama_model_loader: - kv 15: general.languages arr[str,12] = ["ar", "de", "en", "es", "fr", "hi", ... 
+llama_model_loader: - kv 16: llama4.block_count u32 = 48 +llama_model_loader: - kv 17: llama4.context_length u32 = 10485760 +llama_model_loader: - kv 18: llama4.embedding_length u32 = 5120 +llama_model_loader: - kv 19: llama4.feed_forward_length u32 = 16384 +llama_model_loader: - kv 20: llama4.attention.head_count u32 = 40 +llama_model_loader: - kv 21: llama4.attention.head_count_kv u32 = 8 +llama_model_loader: - kv 22: llama4.rope.freq_base f32 = 500000.000000 +llama_model_loader: - kv 23: llama4.attention.layer_norm_rms_epsilon f32 = 0.000010 +llama_model_loader: - kv 24: llama4.expert_count u32 = 16 +llama_model_loader: - kv 25: llama4.expert_used_count u32 = 1 +llama_model_loader: - kv 26: llama4.attention.key_length u32 = 128 +llama_model_loader: - kv 27: llama4.attention.value_length u32 = 128 +llama_model_loader: - kv 28: llama4.vocab_size u32 = 202048 +llama_model_loader: - kv 29: llama4.rope.dimension_count u32 = 128 +llama_model_loader: - kv 30: llama4.interleave_moe_layer_step u32 = 1 +llama_model_loader: - kv 31: llama4.expert_feed_forward_length u32 = 8192 +llama_model_loader: - kv 32: tokenizer.ggml.model str = gpt2 +llama_model_loader: - kv 33: tokenizer.ggml.pre str = llama4 +llama_model_loader: - kv 34: tokenizer.ggml.tokens arr[str,202048] = ["À", "Á", "õ", "ö", "÷", "ø", ... +llama_model_loader: - kv 35: tokenizer.ggml.token_type arr[i32,202048] = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ... +llama_model_loader: - kv 36: tokenizer.ggml.merges arr[str,439802] = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "... +llama_model_loader: - kv 37: tokenizer.ggml.bos_token_id u32 = 200000 +llama_model_loader: - kv 38: tokenizer.ggml.eos_token_id u32 = 200008 +llama_model_loader: - kv 39: tokenizer.ggml.padding_token_id u32 = 200018 +llama_model_loader: - kv 40: tokenizer.ggml.add_bos_token bool = true +llama_model_loader: - kv 41: tokenizer.chat_template str = {{- bos_token }}\n{%- if custom_tools ... 
+llama_model_loader: - kv 42: general.quantization_version u32 = 2 +llama_model_loader: - kv 43: general.file_type u32 = 7 +llama_model_loader: - kv 44: quantize.imatrix.file str = Llama-4-Scout-17B-16E-Instruct-GGUF/i... +llama_model_loader: - kv 45: quantize.imatrix.dataset str = unsloth_calibration_Llama-4-Scout-17B... +llama_model_loader: - kv 46: quantize.imatrix.entries_count u32 = 528 +llama_model_loader: - kv 47: quantize.imatrix.chunks_count u32 = 729 +llama_model_loader: - kv 48: split.no u16 = 0 +llama_model_loader: - kv 49: split.tensors.count i32 = 628 +llama_model_loader: - kv 50: split.count u16 = 3 +llama_model_loader: - type f32: 146 tensors +llama_model_loader: - type q8_0: 482 tensors +print_info: file format = GGUF V3 (latest) +print_info: file type = Q8_0 +print_info: file size = 106.65 GiB (8.50 BPW) +load: special_eos_id is not in special_eog_ids - the tokenizer config may be incorrect +load: special tokens cache size = 1135 +load: token to piece cache size = 1.3873 MB +print_info: arch = llama4 +print_info: vocab_only = 0 +print_info: n_ctx_train = 10485760 +print_info: n_embd = 5120 +print_info: n_layer = 48 +print_info: n_head = 40 +print_info: n_head_kv = 8 +print_info: n_rot = 128 +print_info: n_swa = 8192 +print_info: is_swa_any = 1 +print_info: n_embd_head_k = 128 +print_info: n_embd_head_v = 128 +print_info: n_gqa = 5 +print_info: n_embd_k_gqa = 1024 +print_info: n_embd_v_gqa = 1024 +print_info: f_norm_eps = 0.0e+00 +print_info: f_norm_rms_eps = 1.0e-05 +print_info: f_clamp_kqv = 0.0e+00 +print_info: f_max_alibi_bias = 0.0e+00 +print_info: f_logit_scale = 0.0e+00 +print_info: f_attn_scale = 0.0e+00 +print_info: n_ff = 16384 +print_info: n_expert = 16 +print_info: n_expert_used = 1 +print_info: causal attn = 1 +print_info: pooling type = 0 +print_info: rope type = 0 +print_info: rope scaling = linear +print_info: freq_base_train = 500000.0 +print_info: freq_scale_train = 1 +print_info: n_ctx_orig_yarn = 10485760 +print_info: 
rope_finetuned = unknown +print_info: model type = 17Bx16E (Scout) +print_info: model params = 107.77 B +print_info: general.name = Llama-4-Scout-17B-16E-Instruct +print_info: vocab type = BPE +print_info: n_vocab = 202048 +print_info: n_merges = 439802 +print_info: BOS token = 200000 '<|begin_of_text|>' +print_info: EOS token = 200008 '<|eot|>' +print_info: PAD token = 200018 '<|finetune_right_pad|>' +print_info: LF token = 198 'Ċ' +print_info: FIM PRE token = 200002 '<|fim_prefix|>' +print_info: FIM SUF token = 200004 '<|fim_suffix|>' +print_info: FIM MID token = 200003 '<|fim_middle|>' +print_info: EOG token = 200001 '<|end_of_text|>' +print_info: EOG token = 200008 '<|eot|>' +print_info: max token length = 192 +load_tensors: loading model tensors, this can take a while... (mmap = false) +load_tensors: offloading 48 repeating layers to GPU +load_tensors: offloading output layer to GPU +load_tensors: offloaded 49/49 layers to GPU +load_tensors: ROCm0 model buffer size = 108165.12 MiB +load_tensors: ROCm_Host model buffer size = 1048.22 MiB +.................................................................................................... 
+llama_context: constructing llama_context +llama_context: non-unified KV cache requires ggml_set_rows() - forcing unified KV cache +llama_context: n_seq_max = 1 +llama_context: n_ctx = 4096 +llama_context: n_ctx_per_seq = 4096 +llama_context: n_batch = 2048 +llama_context: n_ubatch = 512 +llama_context: causal_attn = 1 +llama_context: flash_attn = 1 +llama_context: kv_unified = true +llama_context: freq_base = 500000.0 +llama_context: freq_scale = 1 +llama_context: n_ctx_per_seq (4096) < n_ctx_train (10485760) -- the full capacity of the model will not be utilized +llama_context: ROCm_Host output buffer size = 0.77 MiB +llama_kv_cache_unified_iswa: creating non-SWA KV cache, size = 4096 cells +llama_kv_cache_unified: ROCm0 KV buffer size = 192.00 MiB +llama_kv_cache_unified: size = 192.00 MiB ( 4096 cells, 12 layers, 1/ 1 seqs), K (f16): 96.00 MiB, V (f16): 96.00 MiB +llama_kv_cache_unified: LLAMA_SET_ROWS=0, using old ggml_cpy() method for backwards compatibility +llama_kv_cache_unified_iswa: creating SWA KV cache, size = 4096 cells +llama_kv_cache_unified: ROCm0 KV buffer size = 576.00 MiB +llama_kv_cache_unified: size = 576.00 MiB ( 4096 cells, 36 layers, 1/ 1 seqs), K (f16): 288.00 MiB, V (f16): 288.00 MiB +llama_kv_cache_unified: LLAMA_SET_ROWS=0, using old ggml_cpy() method for backwards compatibility +llama_context: ROCm0 compute buffer size = 434.62 MiB +llama_context: ROCm_Host compute buffer size = 16.01 MiB +llama_context: graph nodes = 2420 +llama_context: graph splits = 1 +common_init_from_params: added <|end_of_text|> logit bias = -inf +common_init_from_params: added <|eot|> logit bias = -inf +common_init_from_params: setting dry_penalty_last_n to ctx_size = 4096 +common_init_from_params: warming up the model with an empty run - please wait ... 
(--no-warmup to disable) +main: llama threadpool init, n_threads = 16 + +system_info: n_threads = 16 (n_threads_batch = 16) / 32 | ROCm : NO_VMM = 1 | PEER_MAX_BATCH_SIZE = 128 | CPU : SSE3 = 1 | SSSE3 = 1 | AVX = 1 | AVX2 = 1 | F16C = 1 | FMA = 1 | BMI2 = 1 | AVX512 = 1 | AVX512_VBMI = 1 | AVX512_VNNI = 1 | LLAMAFILE = 1 | OPENMP = 1 | REPACK = 1 | + +sampler seed: 2885096603 +sampler params: + repeat_last_n = 64, repeat_penalty = 1.000, frequency_penalty = 0.000, presence_penalty = 0.000 + dry_multiplier = 0.000, dry_base = 1.750, dry_allowed_length = 2, dry_penalty_last_n = 4096 + top_k = 40, top_p = 0.950, min_p = 0.050, xtc_probability = 0.000, xtc_threshold = 0.100, typical_p = 1.000, top_n_sigma = -1.000, temp = 0.800 + mirostat = 0, mirostat_lr = 0.100, mirostat_ent = 5.000 +sampler chain: logits -> logit-bias -> penalties -> dry -> top-n-sigma -> top-k -> typical -> top-p -> min-p -> xtc -> temp-ext -> dist +generate: n_ctx = 4096, n_batch = 2048, n_predict = 1, n_keep = 1 + +Hello. 
+ +llama_perf_sampler_print: sampling time = 0.06 ms / 3 runs ( 0.02 ms per token, 46875.00 tokens per second) +llama_perf_context_print: load time = 36882.65 ms +llama_perf_context_print: prompt eval time = 127.76 ms / 2 tokens ( 63.88 ms per token, 15.65 tokens per second) +llama_perf_context_print: eval time = 0.00 ms / 1 runs ( 0.00 ms per token, inf tokens per second) +llama_perf_context_print: total time = 158.41 ms / 3 tokens +llama_perf_context_print: graphs reused = 0 + Elapsed #3: 41.426125320s + Run #3 status: 0 + → Avg over 3 runs: 40.739s diff --git a/benchmark/loadtime_results/Llama-4-Scout-17B-16E-Instruct-Q8_0-00001-of-00003__rocm7_beta.log b/benchmark/loadtime_results/Llama-4-Scout-17B-16E-Instruct-Q8_0-00001-of-00003__rocm7_beta.log new file mode 100644 index 0000000..3675c18 --- /dev/null +++ b/benchmark/loadtime_results/Llama-4-Scout-17B-16E-Instruct-Q8_0-00001-of-00003__rocm7_beta.log @@ -0,0 +1,179 @@ +ggml_cuda_init: GGML_CUDA_FORCE_MMQ: no +ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no +ggml_cuda_init: found 1 ROCm devices: + Device 0: AMD Radeon Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32 +build: 6040 (66625a59) with cc (GCC) 15.1.1 20250719 (Red Hat 15.1.1-5) for x86_64-redhat-linux +main: llama backend init +main: load the model and apply lora adapter, if any +llama_model_load_from_file_impl: using device ROCm0 (AMD Radeon Graphics) - 124523 MiB free +llama_model_loader: additional 2 GGUFs metadata loaded. +llama_model_loader: loaded meta data with 51 key-value pairs and 628 tensors from /home/kyuz0/models/llama-4-scout-17b-16e/Q8_0/Llama-4-Scout-17B-16E-Instruct-Q8_0-00001-of-00003.gguf (version GGUF V3 (latest)) +llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output. 
+llama_model_loader: - kv 0: general.architecture str = llama4 +llama_model_loader: - kv 1: general.type str = model +llama_model_loader: - kv 2: general.name str = Llama-4-Scout-17B-16E-Instruct +llama_model_loader: - kv 3: general.finetune str = 16E-Instruct +llama_model_loader: - kv 4: general.basename str = Llama-4-Scout-17B-16E-Instruct +llama_model_loader: - kv 5: general.quantized_by str = Unsloth +llama_model_loader: - kv 6: general.size_label str = 17B +llama_model_loader: - kv 7: general.license str = other +llama_model_loader: - kv 8: general.license.name str = llama4 +llama_model_loader: - kv 9: general.repo_url str = https://huggingface.co/unsloth +llama_model_loader: - kv 10: general.base_model.count u32 = 1 +llama_model_loader: - kv 11: general.base_model.0.name str = Llama 4 Scout 17B 16E Instruct +llama_model_loader: - kv 12: general.base_model.0.organization str = Meta Llama +llama_model_loader: - kv 13: general.base_model.0.repo_url str = https://huggingface.co/meta-llama/Lla... +llama_model_loader: - kv 14: general.tags arr[str,5] = ["facebook", "meta", "pytorch", "llam... +llama_model_loader: - kv 15: general.languages arr[str,12] = ["ar", "de", "en", "es", "fr", "hi", ... 
+llama_model_loader: - kv 16: llama4.block_count u32 = 48 +llama_model_loader: - kv 17: llama4.context_length u32 = 10485760 +llama_model_loader: - kv 18: llama4.embedding_length u32 = 5120 +llama_model_loader: - kv 19: llama4.feed_forward_length u32 = 16384 +llama_model_loader: - kv 20: llama4.attention.head_count u32 = 40 +llama_model_loader: - kv 21: llama4.attention.head_count_kv u32 = 8 +llama_model_loader: - kv 22: llama4.rope.freq_base f32 = 500000.000000 +llama_model_loader: - kv 23: llama4.attention.layer_norm_rms_epsilon f32 = 0.000010 +llama_model_loader: - kv 24: llama4.expert_count u32 = 16 +llama_model_loader: - kv 25: llama4.expert_used_count u32 = 1 +llama_model_loader: - kv 26: llama4.attention.key_length u32 = 128 +llama_model_loader: - kv 27: llama4.attention.value_length u32 = 128 +llama_model_loader: - kv 28: llama4.vocab_size u32 = 202048 +llama_model_loader: - kv 29: llama4.rope.dimension_count u32 = 128 +llama_model_loader: - kv 30: llama4.interleave_moe_layer_step u32 = 1 +llama_model_loader: - kv 31: llama4.expert_feed_forward_length u32 = 8192 +llama_model_loader: - kv 32: tokenizer.ggml.model str = gpt2 +llama_model_loader: - kv 33: tokenizer.ggml.pre str = llama4 +llama_model_loader: - kv 34: tokenizer.ggml.tokens arr[str,202048] = ["À", "Á", "õ", "ö", "÷", "ø", ... +llama_model_loader: - kv 35: tokenizer.ggml.token_type arr[i32,202048] = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ... +llama_model_loader: - kv 36: tokenizer.ggml.merges arr[str,439802] = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "... +llama_model_loader: - kv 37: tokenizer.ggml.bos_token_id u32 = 200000 +llama_model_loader: - kv 38: tokenizer.ggml.eos_token_id u32 = 200008 +llama_model_loader: - kv 39: tokenizer.ggml.padding_token_id u32 = 200018 +llama_model_loader: - kv 40: tokenizer.ggml.add_bos_token bool = true +llama_model_loader: - kv 41: tokenizer.chat_template str = {{- bos_token }}\n{%- if custom_tools ... 
+llama_model_loader: - kv 42: general.quantization_version u32 = 2 +llama_model_loader: - kv 43: general.file_type u32 = 7 +llama_model_loader: - kv 44: quantize.imatrix.file str = Llama-4-Scout-17B-16E-Instruct-GGUF/i... +llama_model_loader: - kv 45: quantize.imatrix.dataset str = unsloth_calibration_Llama-4-Scout-17B... +llama_model_loader: - kv 46: quantize.imatrix.entries_count u32 = 528 +llama_model_loader: - kv 47: quantize.imatrix.chunks_count u32 = 729 +llama_model_loader: - kv 48: split.no u16 = 0 +llama_model_loader: - kv 49: split.tensors.count i32 = 628 +llama_model_loader: - kv 50: split.count u16 = 3 +llama_model_loader: - type f32: 146 tensors +llama_model_loader: - type q8_0: 482 tensors +print_info: file format = GGUF V3 (latest) +print_info: file type = Q8_0 +print_info: file size = 106.65 GiB (8.50 BPW) +load: special_eos_id is not in special_eog_ids - the tokenizer config may be incorrect +load: special tokens cache size = 1135 +load: token to piece cache size = 1.3873 MB +print_info: arch = llama4 +print_info: vocab_only = 0 +print_info: n_ctx_train = 10485760 +print_info: n_embd = 5120 +print_info: n_layer = 48 +print_info: n_head = 40 +print_info: n_head_kv = 8 +print_info: n_rot = 128 +print_info: n_swa = 8192 +print_info: is_swa_any = 1 +print_info: n_embd_head_k = 128 +print_info: n_embd_head_v = 128 +print_info: n_gqa = 5 +print_info: n_embd_k_gqa = 1024 +print_info: n_embd_v_gqa = 1024 +print_info: f_norm_eps = 0.0e+00 +print_info: f_norm_rms_eps = 1.0e-05 +print_info: f_clamp_kqv = 0.0e+00 +print_info: f_max_alibi_bias = 0.0e+00 +print_info: f_logit_scale = 0.0e+00 +print_info: f_attn_scale = 0.0e+00 +print_info: n_ff = 16384 +print_info: n_expert = 16 +print_info: n_expert_used = 1 +print_info: causal attn = 1 +print_info: pooling type = 0 +print_info: rope type = 0 +print_info: rope scaling = linear +print_info: freq_base_train = 500000.0 +print_info: freq_scale_train = 1 +print_info: n_ctx_orig_yarn = 10485760 +print_info: 
rope_finetuned = unknown +print_info: model type = 17Bx16E (Scout) +print_info: model params = 107.77 B +print_info: general.name = Llama-4-Scout-17B-16E-Instruct +print_info: vocab type = BPE +print_info: n_vocab = 202048 +print_info: n_merges = 439802 +print_info: BOS token = 200000 '<|begin_of_text|>' +print_info: EOS token = 200008 '<|eot|>' +print_info: PAD token = 200018 '<|finetune_right_pad|>' +print_info: LF token = 198 'Ċ' +print_info: FIM PRE token = 200002 '<|fim_prefix|>' +print_info: FIM SUF token = 200004 '<|fim_suffix|>' +print_info: FIM MID token = 200003 '<|fim_middle|>' +print_info: EOG token = 200001 '<|end_of_text|>' +print_info: EOG token = 200008 '<|eot|>' +print_info: max token length = 192 +load_tensors: loading model tensors, this can take a while... (mmap = false) +load_tensors: offloading 48 repeating layers to GPU +load_tensors: offloading output layer to GPU +load_tensors: offloaded 49/49 layers to GPU +load_tensors: ROCm0 model buffer size = 108165.12 MiB +load_tensors: ROCm_Host model buffer size = 1048.22 MiB +.................................................................................................... 
+llama_context: constructing llama_context +llama_context: non-unified KV cache requires ggml_set_rows() - forcing unified KV cache +llama_context: n_seq_max = 1 +llama_context: n_ctx = 4096 +llama_context: n_ctx_per_seq = 4096 +llama_context: n_batch = 2048 +llama_context: n_ubatch = 512 +llama_context: causal_attn = 1 +llama_context: flash_attn = 1 +llama_context: kv_unified = true +llama_context: freq_base = 500000.0 +llama_context: freq_scale = 1 +llama_context: n_ctx_per_seq (4096) < n_ctx_train (10485760) -- the full capacity of the model will not be utilized +llama_context: ROCm_Host output buffer size = 0.77 MiB +llama_kv_cache_unified_iswa: creating non-SWA KV cache, size = 4096 cells +llama_kv_cache_unified: ROCm0 KV buffer size = 192.00 MiB +llama_kv_cache_unified: size = 192.00 MiB ( 4096 cells, 12 layers, 1/ 1 seqs), K (f16): 96.00 MiB, V (f16): 96.00 MiB +llama_kv_cache_unified: LLAMA_SET_ROWS=0, using old ggml_cpy() method for backwards compatibility +llama_kv_cache_unified_iswa: creating SWA KV cache, size = 4096 cells +llama_kv_cache_unified: ROCm0 KV buffer size = 576.00 MiB +llama_kv_cache_unified: size = 576.00 MiB ( 4096 cells, 36 layers, 1/ 1 seqs), K (f16): 288.00 MiB, V (f16): 288.00 MiB +llama_kv_cache_unified: LLAMA_SET_ROWS=0, using old ggml_cpy() method for backwards compatibility +llama_context: ROCm0 compute buffer size = 434.62 MiB +llama_context: ROCm_Host compute buffer size = 16.01 MiB +llama_context: graph nodes = 2420 +llama_context: graph splits = 1 +common_init_from_params: added <|end_of_text|> logit bias = -inf +common_init_from_params: added <|eot|> logit bias = -inf +common_init_from_params: setting dry_penalty_last_n to ctx_size = 4096 +common_init_from_params: warming up the model with an empty run - please wait ... 
(--no-warmup to disable) +main: llama threadpool init, n_threads = 16 + +system_info: n_threads = 16 (n_threads_batch = 16) / 32 | ROCm : NO_VMM = 1 | PEER_MAX_BATCH_SIZE = 128 | CPU : SSE3 = 1 | SSSE3 = 1 | AVX = 1 | AVX2 = 1 | F16C = 1 | FMA = 1 | BMI2 = 1 | AVX512 = 1 | AVX512_VBMI = 1 | AVX512_VNNI = 1 | LLAMAFILE = 1 | OPENMP = 1 | REPACK = 1 | + +sampler seed: 1149431120 +sampler params: + repeat_last_n = 64, repeat_penalty = 1.000, frequency_penalty = 0.000, presence_penalty = 0.000 + dry_multiplier = 0.000, dry_base = 1.750, dry_allowed_length = 2, dry_penalty_last_n = 4096 + top_k = 40, top_p = 0.950, min_p = 0.050, xtc_probability = 0.000, xtc_threshold = 0.100, typical_p = 1.000, top_n_sigma = -1.000, temp = 0.800 + mirostat = 0, mirostat_lr = 0.100, mirostat_ent = 5.000 +sampler chain: logits -> logit-bias -> penalties -> dry -> top-n-sigma -> top-k -> typical -> top-p -> min-p -> xtc -> temp-ext -> dist +generate: n_ctx = 4096, n_batch = 2048, n_predict = 1, n_keep = 1 + +Hello: + +llama_perf_sampler_print: sampling time = 0.06 ms / 3 runs ( 0.02 ms per token, 48387.10 tokens per second) +llama_perf_context_print: load time = 35959.68 ms +llama_perf_context_print: prompt eval time = 127.62 ms / 2 tokens ( 63.81 ms per token, 15.67 tokens per second) +llama_perf_context_print: eval time = 0.00 ms / 1 runs ( 0.00 ms per token, inf tokens per second) +llama_perf_context_print: total time = 157.80 ms / 3 tokens +llama_perf_context_print: graphs reused = 0 + Elapsed #3: 36.919182117s + Run #3 status: 0 + → Avg over 3 runs: 36.400s diff --git a/benchmark/loadtime_results/Llama-4-Scout-17B-16E-Instruct-Q8_0-00001-of-00003__rocm7_rc.log b/benchmark/loadtime_results/Llama-4-Scout-17B-16E-Instruct-Q8_0-00001-of-00003__rocm7_rc.log new file mode 100644 index 0000000..4673a8a --- /dev/null +++ b/benchmark/loadtime_results/Llama-4-Scout-17B-16E-Instruct-Q8_0-00001-of-00003__rocm7_rc.log @@ -0,0 +1,179 @@ +ggml_cuda_init: GGML_CUDA_FORCE_MMQ: no +ggml_cuda_init: 
GGML_CUDA_FORCE_CUBLAS: no +ggml_cuda_init: found 1 ROCm devices: + Device 0: AMD Radeon Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32 +build: 6066 (4cb208c9) with cc (GCC) 15.1.1 20250719 (Red Hat 15.1.1-5) for x86_64-redhat-linux +main: llama backend init +main: load the model and apply lora adapter, if any +llama_model_load_from_file_impl: using device ROCm0 (AMD Radeon Graphics) - 124523 MiB free +llama_model_loader: additional 2 GGUFs metadata loaded. +llama_model_loader: loaded meta data with 51 key-value pairs and 628 tensors from /home/kyuz0/models/llama-4-scout-17b-16e/Q8_0/Llama-4-Scout-17B-16E-Instruct-Q8_0-00001-of-00003.gguf (version GGUF V3 (latest)) +llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output. +llama_model_loader: - kv 0: general.architecture str = llama4 +llama_model_loader: - kv 1: general.type str = model +llama_model_loader: - kv 2: general.name str = Llama-4-Scout-17B-16E-Instruct +llama_model_loader: - kv 3: general.finetune str = 16E-Instruct +llama_model_loader: - kv 4: general.basename str = Llama-4-Scout-17B-16E-Instruct +llama_model_loader: - kv 5: general.quantized_by str = Unsloth +llama_model_loader: - kv 6: general.size_label str = 17B +llama_model_loader: - kv 7: general.license str = other +llama_model_loader: - kv 8: general.license.name str = llama4 +llama_model_loader: - kv 9: general.repo_url str = https://huggingface.co/unsloth +llama_model_loader: - kv 10: general.base_model.count u32 = 1 +llama_model_loader: - kv 11: general.base_model.0.name str = Llama 4 Scout 17B 16E Instruct +llama_model_loader: - kv 12: general.base_model.0.organization str = Meta Llama +llama_model_loader: - kv 13: general.base_model.0.repo_url str = https://huggingface.co/meta-llama/Lla... +llama_model_loader: - kv 14: general.tags arr[str,5] = ["facebook", "meta", "pytorch", "llam... +llama_model_loader: - kv 15: general.languages arr[str,12] = ["ar", "de", "en", "es", "fr", "hi", ... 
+llama_model_loader: - kv 16: llama4.block_count u32 = 48 +llama_model_loader: - kv 17: llama4.context_length u32 = 10485760 +llama_model_loader: - kv 18: llama4.embedding_length u32 = 5120 +llama_model_loader: - kv 19: llama4.feed_forward_length u32 = 16384 +llama_model_loader: - kv 20: llama4.attention.head_count u32 = 40 +llama_model_loader: - kv 21: llama4.attention.head_count_kv u32 = 8 +llama_model_loader: - kv 22: llama4.rope.freq_base f32 = 500000.000000 +llama_model_loader: - kv 23: llama4.attention.layer_norm_rms_epsilon f32 = 0.000010 +llama_model_loader: - kv 24: llama4.expert_count u32 = 16 +llama_model_loader: - kv 25: llama4.expert_used_count u32 = 1 +llama_model_loader: - kv 26: llama4.attention.key_length u32 = 128 +llama_model_loader: - kv 27: llama4.attention.value_length u32 = 128 +llama_model_loader: - kv 28: llama4.vocab_size u32 = 202048 +llama_model_loader: - kv 29: llama4.rope.dimension_count u32 = 128 +llama_model_loader: - kv 30: llama4.interleave_moe_layer_step u32 = 1 +llama_model_loader: - kv 31: llama4.expert_feed_forward_length u32 = 8192 +llama_model_loader: - kv 32: tokenizer.ggml.model str = gpt2 +llama_model_loader: - kv 33: tokenizer.ggml.pre str = llama4 +llama_model_loader: - kv 34: tokenizer.ggml.tokens arr[str,202048] = ["À", "Á", "õ", "ö", "÷", "ø", ... +llama_model_loader: - kv 35: tokenizer.ggml.token_type arr[i32,202048] = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ... +llama_model_loader: - kv 36: tokenizer.ggml.merges arr[str,439802] = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "... +llama_model_loader: - kv 37: tokenizer.ggml.bos_token_id u32 = 200000 +llama_model_loader: - kv 38: tokenizer.ggml.eos_token_id u32 = 200008 +llama_model_loader: - kv 39: tokenizer.ggml.padding_token_id u32 = 200018 +llama_model_loader: - kv 40: tokenizer.ggml.add_bos_token bool = true +llama_model_loader: - kv 41: tokenizer.chat_template str = {{- bos_token }}\n{%- if custom_tools ... 
+llama_model_loader: - kv 42: general.quantization_version u32 = 2 +llama_model_loader: - kv 43: general.file_type u32 = 7 +llama_model_loader: - kv 44: quantize.imatrix.file str = Llama-4-Scout-17B-16E-Instruct-GGUF/i... +llama_model_loader: - kv 45: quantize.imatrix.dataset str = unsloth_calibration_Llama-4-Scout-17B... +llama_model_loader: - kv 46: quantize.imatrix.entries_count u32 = 528 +llama_model_loader: - kv 47: quantize.imatrix.chunks_count u32 = 729 +llama_model_loader: - kv 48: split.no u16 = 0 +llama_model_loader: - kv 49: split.tensors.count i32 = 628 +llama_model_loader: - kv 50: split.count u16 = 3 +llama_model_loader: - type f32: 146 tensors +llama_model_loader: - type q8_0: 482 tensors +print_info: file format = GGUF V3 (latest) +print_info: file type = Q8_0 +print_info: file size = 106.65 GiB (8.50 BPW) +load: special_eos_id is not in special_eog_ids - the tokenizer config may be incorrect +load: special tokens cache size = 1135 +load: token to piece cache size = 1.3873 MB +print_info: arch = llama4 +print_info: vocab_only = 0 +print_info: n_ctx_train = 10485760 +print_info: n_embd = 5120 +print_info: n_layer = 48 +print_info: n_head = 40 +print_info: n_head_kv = 8 +print_info: n_rot = 128 +print_info: n_swa = 8192 +print_info: is_swa_any = 1 +print_info: n_embd_head_k = 128 +print_info: n_embd_head_v = 128 +print_info: n_gqa = 5 +print_info: n_embd_k_gqa = 1024 +print_info: n_embd_v_gqa = 1024 +print_info: f_norm_eps = 0.0e+00 +print_info: f_norm_rms_eps = 1.0e-05 +print_info: f_clamp_kqv = 0.0e+00 +print_info: f_max_alibi_bias = 0.0e+00 +print_info: f_logit_scale = 0.0e+00 +print_info: f_attn_scale = 0.0e+00 +print_info: n_ff = 16384 +print_info: n_expert = 16 +print_info: n_expert_used = 1 +print_info: causal attn = 1 +print_info: pooling type = 0 +print_info: rope type = 0 +print_info: rope scaling = linear +print_info: freq_base_train = 500000.0 +print_info: freq_scale_train = 1 +print_info: n_ctx_orig_yarn = 10485760 +print_info: 
rope_finetuned = unknown +print_info: model type = 17Bx16E (Scout) +print_info: model params = 107.77 B +print_info: general.name = Llama-4-Scout-17B-16E-Instruct +print_info: vocab type = BPE +print_info: n_vocab = 202048 +print_info: n_merges = 439802 +print_info: BOS token = 200000 '<|begin_of_text|>' +print_info: EOS token = 200008 '<|eot|>' +print_info: PAD token = 200018 '<|finetune_right_pad|>' +print_info: LF token = 198 'Ċ' +print_info: FIM PRE token = 200002 '<|fim_prefix|>' +print_info: FIM SUF token = 200004 '<|fim_suffix|>' +print_info: FIM MID token = 200003 '<|fim_middle|>' +print_info: EOG token = 200001 '<|end_of_text|>' +print_info: EOG token = 200008 '<|eot|>' +print_info: max token length = 192 +load_tensors: loading model tensors, this can take a while... (mmap = false) +load_tensors: offloading 48 repeating layers to GPU +load_tensors: offloading output layer to GPU +load_tensors: offloaded 49/49 layers to GPU +load_tensors: ROCm0 model buffer size = 108165.12 MiB +load_tensors: ROCm_Host model buffer size = 1048.22 MiB +.................................................................................................... 
+llama_context: constructing llama_context +llama_context: non-unified KV cache requires ggml_set_rows() - forcing unified KV cache +llama_context: n_seq_max = 1 +llama_context: n_ctx = 4096 +llama_context: n_ctx_per_seq = 4096 +llama_context: n_batch = 2048 +llama_context: n_ubatch = 512 +llama_context: causal_attn = 1 +llama_context: flash_attn = 1 +llama_context: kv_unified = true +llama_context: freq_base = 500000.0 +llama_context: freq_scale = 1 +llama_context: n_ctx_per_seq (4096) < n_ctx_train (10485760) -- the full capacity of the model will not be utilized +llama_context: ROCm_Host output buffer size = 0.77 MiB +llama_kv_cache_unified_iswa: creating non-SWA KV cache, size = 4096 cells +llama_kv_cache_unified: ROCm0 KV buffer size = 192.00 MiB +llama_kv_cache_unified: size = 192.00 MiB ( 4096 cells, 12 layers, 1/ 1 seqs), K (f16): 96.00 MiB, V (f16): 96.00 MiB +llama_kv_cache_unified: LLAMA_SET_ROWS=0, using old ggml_cpy() method for backwards compatibility +llama_kv_cache_unified_iswa: creating SWA KV cache, size = 4096 cells +llama_kv_cache_unified: ROCm0 KV buffer size = 576.00 MiB +llama_kv_cache_unified: size = 576.00 MiB ( 4096 cells, 36 layers, 1/ 1 seqs), K (f16): 288.00 MiB, V (f16): 288.00 MiB +llama_kv_cache_unified: LLAMA_SET_ROWS=0, using old ggml_cpy() method for backwards compatibility +llama_context: ROCm0 compute buffer size = 434.62 MiB +llama_context: ROCm_Host compute buffer size = 16.01 MiB +llama_context: graph nodes = 2420 +llama_context: graph splits = 1 +common_init_from_params: added <|end_of_text|> logit bias = -inf +common_init_from_params: added <|eot|> logit bias = -inf +common_init_from_params: setting dry_penalty_last_n to ctx_size = 4096 +common_init_from_params: warming up the model with an empty run - please wait ... 
(--no-warmup to disable) +main: llama threadpool init, n_threads = 16 + +system_info: n_threads = 16 (n_threads_batch = 16) / 32 | ROCm : NO_VMM = 1 | PEER_MAX_BATCH_SIZE = 128 | CPU : SSE3 = 1 | SSSE3 = 1 | AVX = 1 | AVX2 = 1 | F16C = 1 | FMA = 1 | BMI2 = 1 | AVX512 = 1 | AVX512_VBMI = 1 | AVX512_VNNI = 1 | LLAMAFILE = 1 | OPENMP = 1 | REPACK = 1 | + +sampler seed: 406280533 +sampler params: + repeat_last_n = 64, repeat_penalty = 1.000, frequency_penalty = 0.000, presence_penalty = 0.000 + dry_multiplier = 0.000, dry_base = 1.750, dry_allowed_length = 2, dry_penalty_last_n = 4096 + top_k = 40, top_p = 0.950, min_p = 0.050, xtc_probability = 0.000, xtc_threshold = 0.100, typical_p = 1.000, top_n_sigma = -1.000, temp = 0.800 + mirostat = 0, mirostat_lr = 0.100, mirostat_ent = 5.000 +sampler chain: logits -> logit-bias -> penalties -> dry -> top-n-sigma -> top-k -> typical -> top-p -> min-p -> xtc -> temp-ext -> dist +generate: n_ctx = 4096, n_batch = 2048, n_predict = 1, n_keep = 1 + +Hello The + +llama_perf_sampler_print: sampling time = 0.07 ms / 3 runs ( 0.02 ms per token, 45454.55 tokens per second) +llama_perf_context_print: load time = 34222.03 ms +llama_perf_context_print: prompt eval time = 136.79 ms / 2 tokens ( 68.40 ms per token, 14.62 tokens per second) +llama_perf_context_print: eval time = 0.00 ms / 1 runs ( 0.00 ms per token, inf tokens per second) +llama_perf_context_print: total time = 156.58 ms / 3 tokens +llama_perf_context_print: graphs reused = 0 + Elapsed #3: 35.217307205s + Run #3 status: 0 + → Avg over 3 runs: 35.742s diff --git a/benchmark/loadtime_results/Llama-4-Scout-17B-16E-Instruct-Q8_0-00001-of-00003__vulkan_amdvlk.log b/benchmark/loadtime_results/Llama-4-Scout-17B-16E-Instruct-Q8_0-00001-of-00003__vulkan_amdvlk.log new file mode 100644 index 0000000..ec3aa5e --- /dev/null +++ b/benchmark/loadtime_results/Llama-4-Scout-17B-16E-Instruct-Q8_0-00001-of-00003__vulkan_amdvlk.log @@ -0,0 +1,177 @@ +ggml_vulkan: Found 1 Vulkan devices: 
+ggml_vulkan: 0 = Radeon 8060S Graphics (AMD open-source driver) | uma: 1 | fp16: 1 | bf16: 0 | warp size: 64 | shared memory: 32768 | int dot: 1 | matrix cores: KHR_coopmat +build: 6060 (9c35706b) with cc (GCC) 15.1.1 20250719 (Red Hat 15.1.1-5) for x86_64-redhat-linux +main: llama backend init +main: load the model and apply lora adapter, if any +llama_model_load_from_file_impl: using device Vulkan0 (Radeon 8060S Graphics) - 85720 MiB free +llama_model_loader: additional 2 GGUFs metadata loaded. +llama_model_loader: loaded meta data with 51 key-value pairs and 628 tensors from /home/kyuz0/models/llama-4-scout-17b-16e/Q8_0/Llama-4-Scout-17B-16E-Instruct-Q8_0-00001-of-00003.gguf (version GGUF V3 (latest)) +llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output. +llama_model_loader: - kv 0: general.architecture str = llama4 +llama_model_loader: - kv 1: general.type str = model +llama_model_loader: - kv 2: general.name str = Llama-4-Scout-17B-16E-Instruct +llama_model_loader: - kv 3: general.finetune str = 16E-Instruct +llama_model_loader: - kv 4: general.basename str = Llama-4-Scout-17B-16E-Instruct +llama_model_loader: - kv 5: general.quantized_by str = Unsloth +llama_model_loader: - kv 6: general.size_label str = 17B +llama_model_loader: - kv 7: general.license str = other +llama_model_loader: - kv 8: general.license.name str = llama4 +llama_model_loader: - kv 9: general.repo_url str = https://huggingface.co/unsloth +llama_model_loader: - kv 10: general.base_model.count u32 = 1 +llama_model_loader: - kv 11: general.base_model.0.name str = Llama 4 Scout 17B 16E Instruct +llama_model_loader: - kv 12: general.base_model.0.organization str = Meta Llama +llama_model_loader: - kv 13: general.base_model.0.repo_url str = https://huggingface.co/meta-llama/Lla... +llama_model_loader: - kv 14: general.tags arr[str,5] = ["facebook", "meta", "pytorch", "llam... 
+llama_model_loader: - kv 15: general.languages arr[str,12] = ["ar", "de", "en", "es", "fr", "hi", ... +llama_model_loader: - kv 16: llama4.block_count u32 = 48 +llama_model_loader: - kv 17: llama4.context_length u32 = 10485760 +llama_model_loader: - kv 18: llama4.embedding_length u32 = 5120 +llama_model_loader: - kv 19: llama4.feed_forward_length u32 = 16384 +llama_model_loader: - kv 20: llama4.attention.head_count u32 = 40 +llama_model_loader: - kv 21: llama4.attention.head_count_kv u32 = 8 +llama_model_loader: - kv 22: llama4.rope.freq_base f32 = 500000.000000 +llama_model_loader: - kv 23: llama4.attention.layer_norm_rms_epsilon f32 = 0.000010 +llama_model_loader: - kv 24: llama4.expert_count u32 = 16 +llama_model_loader: - kv 25: llama4.expert_used_count u32 = 1 +llama_model_loader: - kv 26: llama4.attention.key_length u32 = 128 +llama_model_loader: - kv 27: llama4.attention.value_length u32 = 128 +llama_model_loader: - kv 28: llama4.vocab_size u32 = 202048 +llama_model_loader: - kv 29: llama4.rope.dimension_count u32 = 128 +llama_model_loader: - kv 30: llama4.interleave_moe_layer_step u32 = 1 +llama_model_loader: - kv 31: llama4.expert_feed_forward_length u32 = 8192 +llama_model_loader: - kv 32: tokenizer.ggml.model str = gpt2 +llama_model_loader: - kv 33: tokenizer.ggml.pre str = llama4 +llama_model_loader: - kv 34: tokenizer.ggml.tokens arr[str,202048] = ["À", "Á", "õ", "ö", "÷", "ø", ... +llama_model_loader: - kv 35: tokenizer.ggml.token_type arr[i32,202048] = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ... +llama_model_loader: - kv 36: tokenizer.ggml.merges arr[str,439802] = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "... 
+llama_model_loader: - kv 37: tokenizer.ggml.bos_token_id u32 = 200000 +llama_model_loader: - kv 38: tokenizer.ggml.eos_token_id u32 = 200008 +llama_model_loader: - kv 39: tokenizer.ggml.padding_token_id u32 = 200018 +llama_model_loader: - kv 40: tokenizer.ggml.add_bos_token bool = true +llama_model_loader: - kv 41: tokenizer.chat_template str = {{- bos_token }}\n{%- if custom_tools ... +llama_model_loader: - kv 42: general.quantization_version u32 = 2 +llama_model_loader: - kv 43: general.file_type u32 = 7 +llama_model_loader: - kv 44: quantize.imatrix.file str = Llama-4-Scout-17B-16E-Instruct-GGUF/i... +llama_model_loader: - kv 45: quantize.imatrix.dataset str = unsloth_calibration_Llama-4-Scout-17B... +llama_model_loader: - kv 46: quantize.imatrix.entries_count u32 = 528 +llama_model_loader: - kv 47: quantize.imatrix.chunks_count u32 = 729 +llama_model_loader: - kv 48: split.no u16 = 0 +llama_model_loader: - kv 49: split.tensors.count i32 = 628 +llama_model_loader: - kv 50: split.count u16 = 3 +llama_model_loader: - type f32: 146 tensors +llama_model_loader: - type q8_0: 482 tensors +print_info: file format = GGUF V3 (latest) +print_info: file type = Q8_0 +print_info: file size = 106.65 GiB (8.50 BPW) +load: special_eos_id is not in special_eog_ids - the tokenizer config may be incorrect +load: special tokens cache size = 1135 +load: token to piece cache size = 1.3873 MB +print_info: arch = llama4 +print_info: vocab_only = 0 +print_info: n_ctx_train = 10485760 +print_info: n_embd = 5120 +print_info: n_layer = 48 +print_info: n_head = 40 +print_info: n_head_kv = 8 +print_info: n_rot = 128 +print_info: n_swa = 8192 +print_info: is_swa_any = 1 +print_info: n_embd_head_k = 128 +print_info: n_embd_head_v = 128 +print_info: n_gqa = 5 +print_info: n_embd_k_gqa = 1024 +print_info: n_embd_v_gqa = 1024 +print_info: f_norm_eps = 0.0e+00 +print_info: f_norm_rms_eps = 1.0e-05 +print_info: f_clamp_kqv = 0.0e+00 +print_info: f_max_alibi_bias = 0.0e+00 +print_info: 
f_logit_scale = 0.0e+00 +print_info: f_attn_scale = 0.0e+00 +print_info: n_ff = 16384 +print_info: n_expert = 16 +print_info: n_expert_used = 1 +print_info: causal attn = 1 +print_info: pooling type = 0 +print_info: rope type = 0 +print_info: rope scaling = linear +print_info: freq_base_train = 500000.0 +print_info: freq_scale_train = 1 +print_info: n_ctx_orig_yarn = 10485760 +print_info: rope_finetuned = unknown +print_info: model type = 17Bx16E (Scout) +print_info: model params = 107.77 B +print_info: general.name = Llama-4-Scout-17B-16E-Instruct +print_info: vocab type = BPE +print_info: n_vocab = 202048 +print_info: n_merges = 439802 +print_info: BOS token = 200000 '<|begin_of_text|>' +print_info: EOS token = 200008 '<|eot|>' +print_info: PAD token = 200018 '<|finetune_right_pad|>' +print_info: LF token = 198 'Ċ' +print_info: FIM PRE token = 200002 '<|fim_prefix|>' +print_info: FIM SUF token = 200004 '<|fim_suffix|>' +print_info: FIM MID token = 200003 '<|fim_middle|>' +print_info: EOG token = 200001 '<|end_of_text|>' +print_info: EOG token = 200008 '<|eot|>' +print_info: max token length = 192 +load_tensors: loading model tensors, this can take a while... (mmap = false) +load_tensors: offloading 48 repeating layers to GPU +load_tensors: offloading output layer to GPU +load_tensors: offloaded 49/49 layers to GPU +load_tensors: Vulkan0 model buffer size = 108165.12 MiB +load_tensors: Vulkan_Host model buffer size = 1048.22 MiB +.................................................................................................... 
+llama_context: constructing llama_context +llama_context: non-unified KV cache requires ggml_set_rows() - forcing unified KV cache +llama_context: n_seq_max = 1 +llama_context: n_ctx = 4096 +llama_context: n_ctx_per_seq = 4096 +llama_context: n_batch = 2048 +llama_context: n_ubatch = 512 +llama_context: causal_attn = 1 +llama_context: flash_attn = 1 +llama_context: kv_unified = true +llama_context: freq_base = 500000.0 +llama_context: freq_scale = 1 +llama_context: n_ctx_per_seq (4096) < n_ctx_train (10485760) -- the full capacity of the model will not be utilized +llama_context: Vulkan_Host output buffer size = 0.77 MiB +llama_kv_cache_unified_iswa: creating non-SWA KV cache, size = 4096 cells +llama_kv_cache_unified: Vulkan0 KV buffer size = 192.00 MiB +llama_kv_cache_unified: size = 192.00 MiB ( 4096 cells, 12 layers, 1/ 1 seqs), K (f16): 96.00 MiB, V (f16): 96.00 MiB +llama_kv_cache_unified: LLAMA_SET_ROWS=0, using old ggml_cpy() method for backwards compatibility +llama_kv_cache_unified_iswa: creating SWA KV cache, size = 4096 cells +llama_kv_cache_unified: Vulkan0 KV buffer size = 576.00 MiB +llama_kv_cache_unified: size = 576.00 MiB ( 4096 cells, 36 layers, 1/ 1 seqs), K (f16): 288.00 MiB, V (f16): 288.00 MiB +llama_kv_cache_unified: LLAMA_SET_ROWS=0, using old ggml_cpy() method for backwards compatibility +llama_context: Vulkan0 compute buffer size = 440.63 MiB +llama_context: Vulkan_Host compute buffer size = 26.01 MiB +llama_context: graph nodes = 2420 +llama_context: graph splits = 2 +common_init_from_params: added <|end_of_text|> logit bias = -inf +common_init_from_params: added <|eot|> logit bias = -inf +common_init_from_params: setting dry_penalty_last_n to ctx_size = 4096 +common_init_from_params: warming up the model with an empty run - please wait ... 
(--no-warmup to disable) +main: llama threadpool init, n_threads = 16 + +system_info: n_threads = 16 (n_threads_batch = 16) / 32 | CPU : SSE3 = 1 | SSSE3 = 1 | AVX = 1 | AVX2 = 1 | F16C = 1 | FMA = 1 | BMI2 = 1 | AVX512 = 1 | AVX512_VBMI = 1 | AVX512_VNNI = 1 | LLAMAFILE = 1 | OPENMP = 1 | REPACK = 1 | + +sampler seed: 3690416473 +sampler params: + repeat_last_n = 64, repeat_penalty = 1.000, frequency_penalty = 0.000, presence_penalty = 0.000 + dry_multiplier = 0.000, dry_base = 1.750, dry_allowed_length = 2, dry_penalty_last_n = 4096 + top_k = 40, top_p = 0.950, min_p = 0.050, xtc_probability = 0.000, xtc_threshold = 0.100, typical_p = 1.000, top_n_sigma = -1.000, temp = 0.800 + mirostat = 0, mirostat_lr = 0.100, mirostat_ent = 5.000 +sampler chain: logits -> logit-bias -> penalties -> dry -> top-n-sigma -> top-k -> typical -> top-p -> min-p -> xtc -> temp-ext -> dist +generate: n_ctx = 4096, n_batch = 2048, n_predict = 1, n_keep = 1 + +Hello + +llama_perf_sampler_print: sampling time = 0.09 ms / 3 runs ( 0.03 ms per token, 32967.03 tokens per second) +llama_perf_context_print: load time = 41237.01 ms +llama_perf_context_print: prompt eval time = 233.96 ms / 2 tokens ( 116.98 ms per token, 8.55 tokens per second) +llama_perf_context_print: eval time = 0.00 ms / 1 runs ( 0.00 ms per token, inf tokens per second) +llama_perf_context_print: total time = 261.97 ms / 3 tokens +llama_perf_context_print: graphs reused = 0 + Elapsed #3: 45.548750208s + Run #3 status: 0 + → Avg over 3 runs: 47.967s diff --git a/benchmark/loadtime_results/Llama-4-Scout-17B-16E-Instruct-Q8_0-00001-of-00003__vulkan_radv.log b/benchmark/loadtime_results/Llama-4-Scout-17B-16E-Instruct-Q8_0-00001-of-00003__vulkan_radv.log new file mode 100644 index 0000000..48132f0 --- /dev/null +++ b/benchmark/loadtime_results/Llama-4-Scout-17B-16E-Instruct-Q8_0-00001-of-00003__vulkan_radv.log @@ -0,0 +1,177 @@ +ggml_vulkan: Found 1 Vulkan devices: +ggml_vulkan: 0 = Radeon 8060S Graphics (RADV GFX1151) (radv) | 
uma: 1 | fp16: 1 | bf16: 0 | warp size: 64 | shared memory: 65536 | int dot: 1 | matrix cores: KHR_coopmat +build: 6040 (66625a59) with cc (GCC) 15.1.1 20250719 (Red Hat 15.1.1-5) for x86_64-redhat-linux +main: llama backend init +main: load the model and apply lora adapter, if any +llama_model_load_from_file_impl: using device Vulkan0 (Radeon 8060S Graphics (RADV GFX1151)) - 87722 MiB free +llama_model_loader: additional 2 GGUFs metadata loaded. +llama_model_loader: loaded meta data with 51 key-value pairs and 628 tensors from /home/kyuz0/models/llama-4-scout-17b-16e/Q8_0/Llama-4-Scout-17B-16E-Instruct-Q8_0-00001-of-00003.gguf (version GGUF V3 (latest)) +llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output. +llama_model_loader: - kv 0: general.architecture str = llama4 +llama_model_loader: - kv 1: general.type str = model +llama_model_loader: - kv 2: general.name str = Llama-4-Scout-17B-16E-Instruct +llama_model_loader: - kv 3: general.finetune str = 16E-Instruct +llama_model_loader: - kv 4: general.basename str = Llama-4-Scout-17B-16E-Instruct +llama_model_loader: - kv 5: general.quantized_by str = Unsloth +llama_model_loader: - kv 6: general.size_label str = 17B +llama_model_loader: - kv 7: general.license str = other +llama_model_loader: - kv 8: general.license.name str = llama4 +llama_model_loader: - kv 9: general.repo_url str = https://huggingface.co/unsloth +llama_model_loader: - kv 10: general.base_model.count u32 = 1 +llama_model_loader: - kv 11: general.base_model.0.name str = Llama 4 Scout 17B 16E Instruct +llama_model_loader: - kv 12: general.base_model.0.organization str = Meta Llama +llama_model_loader: - kv 13: general.base_model.0.repo_url str = https://huggingface.co/meta-llama/Lla... +llama_model_loader: - kv 14: general.tags arr[str,5] = ["facebook", "meta", "pytorch", "llam... +llama_model_loader: - kv 15: general.languages arr[str,12] = ["ar", "de", "en", "es", "fr", "hi", ... 
+llama_model_loader: - kv 16: llama4.block_count u32 = 48 +llama_model_loader: - kv 17: llama4.context_length u32 = 10485760 +llama_model_loader: - kv 18: llama4.embedding_length u32 = 5120 +llama_model_loader: - kv 19: llama4.feed_forward_length u32 = 16384 +llama_model_loader: - kv 20: llama4.attention.head_count u32 = 40 +llama_model_loader: - kv 21: llama4.attention.head_count_kv u32 = 8 +llama_model_loader: - kv 22: llama4.rope.freq_base f32 = 500000.000000 +llama_model_loader: - kv 23: llama4.attention.layer_norm_rms_epsilon f32 = 0.000010 +llama_model_loader: - kv 24: llama4.expert_count u32 = 16 +llama_model_loader: - kv 25: llama4.expert_used_count u32 = 1 +llama_model_loader: - kv 26: llama4.attention.key_length u32 = 128 +llama_model_loader: - kv 27: llama4.attention.value_length u32 = 128 +llama_model_loader: - kv 28: llama4.vocab_size u32 = 202048 +llama_model_loader: - kv 29: llama4.rope.dimension_count u32 = 128 +llama_model_loader: - kv 30: llama4.interleave_moe_layer_step u32 = 1 +llama_model_loader: - kv 31: llama4.expert_feed_forward_length u32 = 8192 +llama_model_loader: - kv 32: tokenizer.ggml.model str = gpt2 +llama_model_loader: - kv 33: tokenizer.ggml.pre str = llama4 +llama_model_loader: - kv 34: tokenizer.ggml.tokens arr[str,202048] = ["À", "Á", "õ", "ö", "÷", "ø", ... +llama_model_loader: - kv 35: tokenizer.ggml.token_type arr[i32,202048] = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ... +llama_model_loader: - kv 36: tokenizer.ggml.merges arr[str,439802] = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "... +llama_model_loader: - kv 37: tokenizer.ggml.bos_token_id u32 = 200000 +llama_model_loader: - kv 38: tokenizer.ggml.eos_token_id u32 = 200008 +llama_model_loader: - kv 39: tokenizer.ggml.padding_token_id u32 = 200018 +llama_model_loader: - kv 40: tokenizer.ggml.add_bos_token bool = true +llama_model_loader: - kv 41: tokenizer.chat_template str = {{- bos_token }}\n{%- if custom_tools ... 
+llama_model_loader: - kv 42: general.quantization_version u32 = 2 +llama_model_loader: - kv 43: general.file_type u32 = 7 +llama_model_loader: - kv 44: quantize.imatrix.file str = Llama-4-Scout-17B-16E-Instruct-GGUF/i... +llama_model_loader: - kv 45: quantize.imatrix.dataset str = unsloth_calibration_Llama-4-Scout-17B... +llama_model_loader: - kv 46: quantize.imatrix.entries_count u32 = 528 +llama_model_loader: - kv 47: quantize.imatrix.chunks_count u32 = 729 +llama_model_loader: - kv 48: split.no u16 = 0 +llama_model_loader: - kv 49: split.tensors.count i32 = 628 +llama_model_loader: - kv 50: split.count u16 = 3 +llama_model_loader: - type f32: 146 tensors +llama_model_loader: - type q8_0: 482 tensors +print_info: file format = GGUF V3 (latest) +print_info: file type = Q8_0 +print_info: file size = 106.65 GiB (8.50 BPW) +load: special_eos_id is not in special_eog_ids - the tokenizer config may be incorrect +load: special tokens cache size = 1135 +load: token to piece cache size = 1.3873 MB +print_info: arch = llama4 +print_info: vocab_only = 0 +print_info: n_ctx_train = 10485760 +print_info: n_embd = 5120 +print_info: n_layer = 48 +print_info: n_head = 40 +print_info: n_head_kv = 8 +print_info: n_rot = 128 +print_info: n_swa = 8192 +print_info: is_swa_any = 1 +print_info: n_embd_head_k = 128 +print_info: n_embd_head_v = 128 +print_info: n_gqa = 5 +print_info: n_embd_k_gqa = 1024 +print_info: n_embd_v_gqa = 1024 +print_info: f_norm_eps = 0.0e+00 +print_info: f_norm_rms_eps = 1.0e-05 +print_info: f_clamp_kqv = 0.0e+00 +print_info: f_max_alibi_bias = 0.0e+00 +print_info: f_logit_scale = 0.0e+00 +print_info: f_attn_scale = 0.0e+00 +print_info: n_ff = 16384 +print_info: n_expert = 16 +print_info: n_expert_used = 1 +print_info: causal attn = 1 +print_info: pooling type = 0 +print_info: rope type = 0 +print_info: rope scaling = linear +print_info: freq_base_train = 500000.0 +print_info: freq_scale_train = 1 +print_info: n_ctx_orig_yarn = 10485760 +print_info: 
rope_finetuned = unknown +print_info: model type = 17Bx16E (Scout) +print_info: model params = 107.77 B +print_info: general.name = Llama-4-Scout-17B-16E-Instruct +print_info: vocab type = BPE +print_info: n_vocab = 202048 +print_info: n_merges = 439802 +print_info: BOS token = 200000 '<|begin_of_text|>' +print_info: EOS token = 200008 '<|eot|>' +print_info: PAD token = 200018 '<|finetune_right_pad|>' +print_info: LF token = 198 'Ċ' +print_info: FIM PRE token = 200002 '<|fim_prefix|>' +print_info: FIM SUF token = 200004 '<|fim_suffix|>' +print_info: FIM MID token = 200003 '<|fim_middle|>' +print_info: EOG token = 200001 '<|end_of_text|>' +print_info: EOG token = 200008 '<|eot|>' +print_info: max token length = 192 +load_tensors: loading model tensors, this can take a while... (mmap = false) +load_tensors: offloading 48 repeating layers to GPU +load_tensors: offloading output layer to GPU +load_tensors: offloaded 49/49 layers to GPU +load_tensors: Vulkan0 model buffer size = 108165.12 MiB +load_tensors: Vulkan_Host model buffer size = 1048.22 MiB +.................................................................................................... 
+llama_context: constructing llama_context +llama_context: non-unified KV cache requires ggml_set_rows() - forcing unified KV cache +llama_context: n_seq_max = 1 +llama_context: n_ctx = 4096 +llama_context: n_ctx_per_seq = 4096 +llama_context: n_batch = 2048 +llama_context: n_ubatch = 512 +llama_context: causal_attn = 1 +llama_context: flash_attn = 1 +llama_context: kv_unified = true +llama_context: freq_base = 500000.0 +llama_context: freq_scale = 1 +llama_context: n_ctx_per_seq (4096) < n_ctx_train (10485760) -- the full capacity of the model will not be utilized +llama_context: Vulkan_Host output buffer size = 0.77 MiB +llama_kv_cache_unified_iswa: creating non-SWA KV cache, size = 4096 cells +llama_kv_cache_unified: Vulkan0 KV buffer size = 192.00 MiB +llama_kv_cache_unified: size = 192.00 MiB ( 4096 cells, 12 layers, 1/ 1 seqs), K (f16): 96.00 MiB, V (f16): 96.00 MiB +llama_kv_cache_unified: LLAMA_SET_ROWS=0, using old ggml_cpy() method for backwards compatibility +llama_kv_cache_unified_iswa: creating SWA KV cache, size = 4096 cells +llama_kv_cache_unified: Vulkan0 KV buffer size = 576.00 MiB +llama_kv_cache_unified: size = 576.00 MiB ( 4096 cells, 36 layers, 1/ 1 seqs), K (f16): 288.00 MiB, V (f16): 288.00 MiB +llama_kv_cache_unified: LLAMA_SET_ROWS=0, using old ggml_cpy() method for backwards compatibility +llama_context: Vulkan0 compute buffer size = 440.63 MiB +llama_context: Vulkan_Host compute buffer size = 26.02 MiB +llama_context: graph nodes = 2420 +llama_context: graph splits = 2 +common_init_from_params: added <|end_of_text|> logit bias = -inf +common_init_from_params: added <|eot|> logit bias = -inf +common_init_from_params: setting dry_penalty_last_n to ctx_size = 4096 +common_init_from_params: warming up the model with an empty run - please wait ... 
(--no-warmup to disable) +main: llama threadpool init, n_threads = 16 + +system_info: n_threads = 16 (n_threads_batch = 16) / 32 | CPU : SSE3 = 1 | SSSE3 = 1 | AVX = 1 | AVX2 = 1 | F16C = 1 | FMA = 1 | BMI2 = 1 | AVX512 = 1 | AVX512_VBMI = 1 | AVX512_VNNI = 1 | LLAMAFILE = 1 | OPENMP = 1 | REPACK = 1 | + +sampler seed: 4068031204 +sampler params: + repeat_last_n = 64, repeat_penalty = 1.000, frequency_penalty = 0.000, presence_penalty = 0.000 + dry_multiplier = 0.000, dry_base = 1.750, dry_allowed_length = 2, dry_penalty_last_n = 4096 + top_k = 40, top_p = 0.950, min_p = 0.050, xtc_probability = 0.000, xtc_threshold = 0.100, typical_p = 1.000, top_n_sigma = -1.000, temp = 0.800 + mirostat = 0, mirostat_lr = 0.100, mirostat_ent = 5.000 +sampler chain: logits -> logit-bias -> penalties -> dry -> top-n-sigma -> top-k -> typical -> top-p -> min-p -> xtc -> temp-ext -> dist +generate: n_ctx = 4096, n_batch = 2048, n_predict = 1, n_keep = 1 + +Hello + +llama_perf_sampler_print: sampling time = 0.09 ms / 3 runs ( 0.03 ms per token, 32967.03 tokens per second) +llama_perf_context_print: load time = 41299.30 ms +llama_perf_context_print: prompt eval time = 252.99 ms / 2 tokens ( 126.49 ms per token, 7.91 tokens per second) +llama_perf_context_print: eval time = 0.00 ms / 1 runs ( 0.00 ms per token, inf tokens per second) +llama_perf_context_print: total time = 280.67 ms / 3 tokens +llama_perf_context_print: graphs reused = 0 + Elapsed #3: 42.081911936s + Run #3 status: 0 + → Avg over 3 runs: 41.626s diff --git a/benchmark/loadtime_results/Llama-4-Scout-17B-16E-Instruct-UD-Q4_K_XL-00001-of-00002__rocm6_4_2.log b/benchmark/loadtime_results/Llama-4-Scout-17B-16E-Instruct-UD-Q4_K_XL-00001-of-00002__rocm6_4_2.log new file mode 100644 index 0000000..73fb564 --- /dev/null +++ b/benchmark/loadtime_results/Llama-4-Scout-17B-16E-Instruct-UD-Q4_K_XL-00001-of-00002__rocm6_4_2.log @@ -0,0 +1,181 @@ +ggml_cuda_init: GGML_CUDA_FORCE_MMQ: no +ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no 
+ggml_cuda_init: found 1 ROCm devices: + Device 0: Radeon 8060S Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32 +build: 6040 (66625a59) with cc (GCC) 15.1.1 20250521 (Red Hat 15.1.1-2) for x86_64-redhat-linux +main: llama backend init +main: load the model and apply lora adapter, if any +llama_model_load_from_file_impl: using device ROCm0 (Radeon 8060S Graphics) - 124522 MiB free +llama_model_loader: additional 1 GGUFs metadata loaded. +llama_model_loader: loaded meta data with 51 key-value pairs and 628 tensors from /home/kyuz0/models/llama-4-scout-17b-16e/Q4_K_XL/Llama-4-Scout-17B-16E-Instruct-UD-Q4_K_XL-00001-of-00002.gguf (version GGUF V3 (latest)) +llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output. +llama_model_loader: - kv 0: general.architecture str = llama4 +llama_model_loader: - kv 1: general.type str = model +llama_model_loader: - kv 2: general.name str = Llama-4-Scout-17B-16E-Instruct +llama_model_loader: - kv 3: general.finetune str = 16E-Instruct +llama_model_loader: - kv 4: general.basename str = Llama-4-Scout-17B-16E-Instruct +llama_model_loader: - kv 5: general.quantized_by str = Unsloth +llama_model_loader: - kv 6: general.size_label str = 17B +llama_model_loader: - kv 7: general.license str = other +llama_model_loader: - kv 8: general.license.name str = llama4 +llama_model_loader: - kv 9: general.repo_url str = https://huggingface.co/unsloth +llama_model_loader: - kv 10: general.base_model.count u32 = 1 +llama_model_loader: - kv 11: general.base_model.0.name str = Llama 4 Scout 17B 16E Instruct +llama_model_loader: - kv 12: general.base_model.0.organization str = Meta Llama +llama_model_loader: - kv 13: general.base_model.0.repo_url str = https://huggingface.co/meta-llama/Lla... +llama_model_loader: - kv 14: general.tags arr[str,5] = ["facebook", "meta", "pytorch", "llam... +llama_model_loader: - kv 15: general.languages arr[str,12] = ["ar", "de", "en", "es", "fr", "hi", ... 
+llama_model_loader: - kv 16: llama4.block_count u32 = 48 +llama_model_loader: - kv 17: llama4.context_length u32 = 10485760 +llama_model_loader: - kv 18: llama4.embedding_length u32 = 5120 +llama_model_loader: - kv 19: llama4.feed_forward_length u32 = 16384 +llama_model_loader: - kv 20: llama4.attention.head_count u32 = 40 +llama_model_loader: - kv 21: llama4.attention.head_count_kv u32 = 8 +llama_model_loader: - kv 22: llama4.rope.freq_base f32 = 500000.000000 +llama_model_loader: - kv 23: llama4.attention.layer_norm_rms_epsilon f32 = 0.000010 +llama_model_loader: - kv 24: llama4.expert_count u32 = 16 +llama_model_loader: - kv 25: llama4.expert_used_count u32 = 1 +llama_model_loader: - kv 26: llama4.attention.key_length u32 = 128 +llama_model_loader: - kv 27: llama4.attention.value_length u32 = 128 +llama_model_loader: - kv 28: llama4.vocab_size u32 = 202048 +llama_model_loader: - kv 29: llama4.rope.dimension_count u32 = 128 +llama_model_loader: - kv 30: llama4.interleave_moe_layer_step u32 = 1 +llama_model_loader: - kv 31: llama4.expert_feed_forward_length u32 = 8192 +llama_model_loader: - kv 32: tokenizer.ggml.model str = gpt2 +llama_model_loader: - kv 33: tokenizer.ggml.pre str = llama4 +llama_model_loader: - kv 34: tokenizer.ggml.tokens arr[str,202048] = ["À", "Á", "õ", "ö", "÷", "ø", ... +llama_model_loader: - kv 35: tokenizer.ggml.token_type arr[i32,202048] = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ... +llama_model_loader: - kv 36: tokenizer.ggml.merges arr[str,439802] = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "... +llama_model_loader: - kv 37: tokenizer.ggml.bos_token_id u32 = 200000 +llama_model_loader: - kv 38: tokenizer.ggml.eos_token_id u32 = 200008 +llama_model_loader: - kv 39: tokenizer.ggml.padding_token_id u32 = 200018 +llama_model_loader: - kv 40: tokenizer.ggml.add_bos_token bool = true +llama_model_loader: - kv 41: tokenizer.chat_template str = {{- bos_token }}\n{%- if custom_tools ... 
+llama_model_loader: - kv 42: general.quantization_version u32 = 2 +llama_model_loader: - kv 43: general.file_type u32 = 15 +llama_model_loader: - kv 44: quantize.imatrix.file str = Llama-4-Scout-17B-16E-Instruct-GGUF/i... +llama_model_loader: - kv 45: quantize.imatrix.dataset str = unsloth_calibration_Llama-4-Scout-17B... +llama_model_loader: - kv 46: quantize.imatrix.entries_count u32 = 528 +llama_model_loader: - kv 47: quantize.imatrix.chunks_count u32 = 729 +llama_model_loader: - kv 48: split.no u16 = 0 +llama_model_loader: - kv 49: split.tensors.count i32 = 628 +llama_model_loader: - kv 50: split.count u16 = 2 +llama_model_loader: - type f32: 146 tensors +llama_model_loader: - type q4_K: 421 tensors +llama_model_loader: - type q5_K: 43 tensors +llama_model_loader: - type q6_K: 18 tensors +print_info: file format = GGUF V3 (latest) +print_info: file type = Q4_K - Medium +print_info: file size = 57.73 GiB (4.60 BPW) +load: special_eos_id is not in special_eog_ids - the tokenizer config may be incorrect +load: special tokens cache size = 1135 +load: token to piece cache size = 1.3873 MB +print_info: arch = llama4 +print_info: vocab_only = 0 +print_info: n_ctx_train = 10485760 +print_info: n_embd = 5120 +print_info: n_layer = 48 +print_info: n_head = 40 +print_info: n_head_kv = 8 +print_info: n_rot = 128 +print_info: n_swa = 8192 +print_info: is_swa_any = 1 +print_info: n_embd_head_k = 128 +print_info: n_embd_head_v = 128 +print_info: n_gqa = 5 +print_info: n_embd_k_gqa = 1024 +print_info: n_embd_v_gqa = 1024 +print_info: f_norm_eps = 0.0e+00 +print_info: f_norm_rms_eps = 1.0e-05 +print_info: f_clamp_kqv = 0.0e+00 +print_info: f_max_alibi_bias = 0.0e+00 +print_info: f_logit_scale = 0.0e+00 +print_info: f_attn_scale = 0.0e+00 +print_info: n_ff = 16384 +print_info: n_expert = 16 +print_info: n_expert_used = 1 +print_info: causal attn = 1 +print_info: pooling type = 0 +print_info: rope type = 0 +print_info: rope scaling = linear +print_info: freq_base_train = 
500000.0 +print_info: freq_scale_train = 1 +print_info: n_ctx_orig_yarn = 10485760 +print_info: rope_finetuned = unknown +print_info: model type = 17Bx16E (Scout) +print_info: model params = 107.77 B +print_info: general.name = Llama-4-Scout-17B-16E-Instruct +print_info: vocab type = BPE +print_info: n_vocab = 202048 +print_info: n_merges = 439802 +print_info: BOS token = 200000 '<|begin_of_text|>' +print_info: EOS token = 200008 '<|eot|>' +print_info: PAD token = 200018 '<|finetune_right_pad|>' +print_info: LF token = 198 'Ċ' +print_info: FIM PRE token = 200002 '<|fim_prefix|>' +print_info: FIM SUF token = 200004 '<|fim_suffix|>' +print_info: FIM MID token = 200003 '<|fim_middle|>' +print_info: EOG token = 200001 '<|end_of_text|>' +print_info: EOG token = 200008 '<|eot|>' +print_info: max token length = 192 +load_tensors: loading model tensors, this can take a while... (mmap = false) +load_tensors: offloading 48 repeating layers to GPU +load_tensors: offloading output layer to GPU +load_tensors: offloaded 49/49 layers to GPU +load_tensors: CPU model buffer size = 554.94 MiB +load_tensors: ROCm0 model buffer size = 58558.57 MiB +................................................................................................... 
+llama_context: constructing llama_context +llama_context: non-unified KV cache requires ggml_set_rows() - forcing unified KV cache +llama_context: n_seq_max = 1 +llama_context: n_ctx = 4096 +llama_context: n_ctx_per_seq = 4096 +llama_context: n_batch = 2048 +llama_context: n_ubatch = 512 +llama_context: causal_attn = 1 +llama_context: flash_attn = 1 +llama_context: kv_unified = true +llama_context: freq_base = 500000.0 +llama_context: freq_scale = 1 +llama_context: n_ctx_per_seq (4096) < n_ctx_train (10485760) -- the full capacity of the model will not be utilized +llama_context: ROCm_Host output buffer size = 0.77 MiB +llama_kv_cache_unified_iswa: creating non-SWA KV cache, size = 4096 cells +llama_kv_cache_unified: ROCm0 KV buffer size = 192.00 MiB +llama_kv_cache_unified: size = 192.00 MiB ( 4096 cells, 12 layers, 1/ 1 seqs), K (f16): 96.00 MiB, V (f16): 96.00 MiB +llama_kv_cache_unified: LLAMA_SET_ROWS=0, using old ggml_cpy() method for backwards compatibility +llama_kv_cache_unified_iswa: creating SWA KV cache, size = 4096 cells +llama_kv_cache_unified: ROCm0 KV buffer size = 576.00 MiB +llama_kv_cache_unified: size = 576.00 MiB ( 4096 cells, 36 layers, 1/ 1 seqs), K (f16): 288.00 MiB, V (f16): 288.00 MiB +llama_kv_cache_unified: LLAMA_SET_ROWS=0, using old ggml_cpy() method for backwards compatibility +llama_context: ROCm0 compute buffer size = 442.62 MiB +llama_context: ROCm_Host compute buffer size = 26.01 MiB +llama_context: graph nodes = 2420 +llama_context: graph splits = 2 +common_init_from_params: added <|end_of_text|> logit bias = -inf +common_init_from_params: added <|eot|> logit bias = -inf +common_init_from_params: setting dry_penalty_last_n to ctx_size = 4096 +common_init_from_params: warming up the model with an empty run - please wait ... 
(--no-warmup to disable) +main: llama threadpool init, n_threads = 16 + +system_info: n_threads = 16 (n_threads_batch = 16) / 32 | ROCm : NO_VMM = 1 | PEER_MAX_BATCH_SIZE = 128 | CPU : SSE3 = 1 | SSSE3 = 1 | AVX = 1 | AVX2 = 1 | F16C = 1 | FMA = 1 | BMI2 = 1 | AVX512 = 1 | AVX512_VBMI = 1 | AVX512_VNNI = 1 | LLAMAFILE = 1 | OPENMP = 1 | REPACK = 1 | + +sampler seed: 4182963810 +sampler params: + repeat_last_n = 64, repeat_penalty = 1.000, frequency_penalty = 0.000, presence_penalty = 0.000 + dry_multiplier = 0.000, dry_base = 1.750, dry_allowed_length = 2, dry_penalty_last_n = 4096 + top_k = 40, top_p = 0.950, min_p = 0.050, xtc_probability = 0.000, xtc_threshold = 0.100, typical_p = 1.000, top_n_sigma = -1.000, temp = 0.800 + mirostat = 0, mirostat_lr = 0.100, mirostat_ent = 5.000 +sampler chain: logits -> logit-bias -> penalties -> dry -> top-n-sigma -> top-k -> typical -> top-p -> min-p -> xtc -> temp-ext -> dist +generate: n_ctx = 4096, n_batch = 2048, n_predict = 1, n_keep = 1 + +Hello The + +llama_perf_sampler_print: sampling time = 0.07 ms / 3 runs ( 0.02 ms per token, 46153.85 tokens per second) +llama_perf_context_print: load time = 9663.18 ms +llama_perf_context_print: prompt eval time = 90.98 ms / 2 tokens ( 45.49 ms per token, 21.98 tokens per second) +llama_perf_context_print: eval time = 0.00 ms / 1 runs ( 0.00 ms per token, inf tokens per second) +llama_perf_context_print: total time = 110.40 ms / 3 tokens +llama_perf_context_print: graphs reused = 0 + Elapsed #3: 13.853856771s + Run #3 status: 0 + → Avg over 3 runs: 15.776s diff --git a/benchmark/loadtime_results/Llama-4-Scout-17B-16E-Instruct-UD-Q4_K_XL-00001-of-00002__rocm7_beta.log b/benchmark/loadtime_results/Llama-4-Scout-17B-16E-Instruct-UD-Q4_K_XL-00001-of-00002__rocm7_beta.log new file mode 100644 index 0000000..1fb554b --- /dev/null +++ b/benchmark/loadtime_results/Llama-4-Scout-17B-16E-Instruct-UD-Q4_K_XL-00001-of-00002__rocm7_beta.log @@ -0,0 +1,162 @@ +ggml_cuda_init: 
GGML_CUDA_FORCE_MMQ: no +ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no +ggml_cuda_init: found 1 ROCm devices: + Device 0: AMD Radeon Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32 +build: 6040 (66625a59) with cc (GCC) 15.1.1 20250719 (Red Hat 15.1.1-5) for x86_64-redhat-linux +main: llama backend init +main: load the model and apply lora adapter, if any +llama_model_load_from_file_impl: using device ROCm0 (AMD Radeon Graphics) - 124523 MiB free +llama_model_loader: additional 1 GGUFs metadata loaded. +llama_model_loader: loaded meta data with 51 key-value pairs and 628 tensors from /home/kyuz0/models/llama-4-scout-17b-16e/Q4_K_XL/Llama-4-Scout-17B-16E-Instruct-UD-Q4_K_XL-00001-of-00002.gguf (version GGUF V3 (latest)) +llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output. +llama_model_loader: - kv 0: general.architecture str = llama4 +llama_model_loader: - kv 1: general.type str = model +llama_model_loader: - kv 2: general.name str = Llama-4-Scout-17B-16E-Instruct +llama_model_loader: - kv 3: general.finetune str = 16E-Instruct +llama_model_loader: - kv 4: general.basename str = Llama-4-Scout-17B-16E-Instruct +llama_model_loader: - kv 5: general.quantized_by str = Unsloth +llama_model_loader: - kv 6: general.size_label str = 17B +llama_model_loader: - kv 7: general.license str = other +llama_model_loader: - kv 8: general.license.name str = llama4 +llama_model_loader: - kv 9: general.repo_url str = https://huggingface.co/unsloth +llama_model_loader: - kv 10: general.base_model.count u32 = 1 +llama_model_loader: - kv 11: general.base_model.0.name str = Llama 4 Scout 17B 16E Instruct +llama_model_loader: - kv 12: general.base_model.0.organization str = Meta Llama +llama_model_loader: - kv 13: general.base_model.0.repo_url str = https://huggingface.co/meta-llama/Lla... +llama_model_loader: - kv 14: general.tags arr[str,5] = ["facebook", "meta", "pytorch", "llam... 
+llama_model_loader: - kv 15: general.languages arr[str,12] = ["ar", "de", "en", "es", "fr", "hi", ... +llama_model_loader: - kv 16: llama4.block_count u32 = 48 +llama_model_loader: - kv 17: llama4.context_length u32 = 10485760 +llama_model_loader: - kv 18: llama4.embedding_length u32 = 5120 +llama_model_loader: - kv 19: llama4.feed_forward_length u32 = 16384 +llama_model_loader: - kv 20: llama4.attention.head_count u32 = 40 +llama_model_loader: - kv 21: llama4.attention.head_count_kv u32 = 8 +llama_model_loader: - kv 22: llama4.rope.freq_base f32 = 500000.000000 +llama_model_loader: - kv 23: llama4.attention.layer_norm_rms_epsilon f32 = 0.000010 +llama_model_loader: - kv 24: llama4.expert_count u32 = 16 +llama_model_loader: - kv 25: llama4.expert_used_count u32 = 1 +llama_model_loader: - kv 26: llama4.attention.key_length u32 = 128 +llama_model_loader: - kv 27: llama4.attention.value_length u32 = 128 +llama_model_loader: - kv 28: llama4.vocab_size u32 = 202048 +llama_model_loader: - kv 29: llama4.rope.dimension_count u32 = 128 +llama_model_loader: - kv 30: llama4.interleave_moe_layer_step u32 = 1 +llama_model_loader: - kv 31: llama4.expert_feed_forward_length u32 = 8192 +llama_model_loader: - kv 32: tokenizer.ggml.model str = gpt2 +llama_model_loader: - kv 33: tokenizer.ggml.pre str = llama4 +llama_model_loader: - kv 34: tokenizer.ggml.tokens arr[str,202048] = ["À", "Á", "õ", "ö", "÷", "ø", ... +llama_model_loader: - kv 35: tokenizer.ggml.token_type arr[i32,202048] = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ... +llama_model_loader: - kv 36: tokenizer.ggml.merges arr[str,439802] = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "... 
+llama_model_loader: - kv 37: tokenizer.ggml.bos_token_id u32 = 200000 +llama_model_loader: - kv 38: tokenizer.ggml.eos_token_id u32 = 200008 +llama_model_loader: - kv 39: tokenizer.ggml.padding_token_id u32 = 200018 +llama_model_loader: - kv 40: tokenizer.ggml.add_bos_token bool = true +llama_model_loader: - kv 41: tokenizer.chat_template str = {{- bos_token }}\n{%- if custom_tools ... +llama_model_loader: - kv 42: general.quantization_version u32 = 2 +llama_model_loader: - kv 43: general.file_type u32 = 15 +llama_model_loader: - kv 44: quantize.imatrix.file str = Llama-4-Scout-17B-16E-Instruct-GGUF/i... +llama_model_loader: - kv 45: quantize.imatrix.dataset str = unsloth_calibration_Llama-4-Scout-17B... +llama_model_loader: - kv 46: quantize.imatrix.entries_count u32 = 528 +llama_model_loader: - kv 47: quantize.imatrix.chunks_count u32 = 729 +llama_model_loader: - kv 48: split.no u16 = 0 +llama_model_loader: - kv 49: split.tensors.count i32 = 628 +llama_model_loader: - kv 50: split.count u16 = 2 +llama_model_loader: - type f32: 146 tensors +llama_model_loader: - type q4_K: 421 tensors +llama_model_loader: - type q5_K: 43 tensors +llama_model_loader: - type q6_K: 18 tensors +print_info: file format = GGUF V3 (latest) +print_info: file type = Q4_K - Medium +print_info: file size = 57.73 GiB (4.60 BPW) +load: special_eos_id is not in special_eog_ids - the tokenizer config may be incorrect +load: special tokens cache size = 1135 +load: token to piece cache size = 1.3873 MB +print_info: arch = llama4 +print_info: vocab_only = 0 +print_info: n_ctx_train = 10485760 +print_info: n_embd = 5120 +print_info: n_layer = 48 +print_info: n_head = 40 +print_info: n_head_kv = 8 +print_info: n_rot = 128 +print_info: n_swa = 8192 +print_info: is_swa_any = 1 +print_info: n_embd_head_k = 128 +print_info: n_embd_head_v = 128 +print_info: n_gqa = 5 +print_info: n_embd_k_gqa = 1024 +print_info: n_embd_v_gqa = 1024 +print_info: f_norm_eps = 0.0e+00 +print_info: f_norm_rms_eps = 1.0e-05 
+print_info: f_clamp_kqv = 0.0e+00 +print_info: f_max_alibi_bias = 0.0e+00 +print_info: f_logit_scale = 0.0e+00 +print_info: f_attn_scale = 0.0e+00 +print_info: n_ff = 16384 +print_info: n_expert = 16 +print_info: n_expert_used = 1 +print_info: causal attn = 1 +print_info: pooling type = 0 +print_info: rope type = 0 +print_info: rope scaling = linear +print_info: freq_base_train = 500000.0 +print_info: freq_scale_train = 1 +print_info: n_ctx_orig_yarn = 10485760 +print_info: rope_finetuned = unknown +print_info: model type = 17Bx16E (Scout) +print_info: model params = 107.77 B +print_info: general.name = Llama-4-Scout-17B-16E-Instruct +print_info: vocab type = BPE +print_info: n_vocab = 202048 +print_info: n_merges = 439802 +print_info: BOS token = 200000 '<|begin_of_text|>' +print_info: EOS token = 200008 '<|eot|>' +print_info: PAD token = 200018 '<|finetune_right_pad|>' +print_info: LF token = 198 'Ċ' +print_info: FIM PRE token = 200002 '<|fim_prefix|>' +print_info: FIM SUF token = 200004 '<|fim_suffix|>' +print_info: FIM MID token = 200003 '<|fim_middle|>' +print_info: EOG token = 200001 '<|end_of_text|>' +print_info: EOG token = 200008 '<|eot|>' +print_info: max token length = 192 +load_tensors: loading model tensors, this can take a while... (mmap = false) +load_tensors: offloading 48 repeating layers to GPU +load_tensors: offloading output layer to GPU +load_tensors: offloaded 49/49 layers to GPU +load_tensors: CPU model buffer size = 554.94 MiB +load_tensors: ROCm0 model buffer size = 58558.57 MiB +................................................................................................... 
+llama_context: constructing llama_context +llama_context: non-unified KV cache requires ggml_set_rows() - forcing unified KV cache +llama_context: n_seq_max = 1 +llama_context: n_ctx = 4096 +llama_context: n_ctx_per_seq = 4096 +llama_context: n_batch = 2048 +llama_context: n_ubatch = 512 +llama_context: causal_attn = 1 +llama_context: flash_attn = 1 +llama_context: kv_unified = true +llama_context: freq_base = 500000.0 +llama_context: freq_scale = 1 +llama_context: n_ctx_per_seq (4096) < n_ctx_train (10485760) -- the full capacity of the model will not be utilized +llama_context: ROCm_Host output buffer size = 0.77 MiB +llama_kv_cache_unified_iswa: creating non-SWA KV cache, size = 4096 cells +llama_kv_cache_unified: ROCm0 KV buffer size = 192.00 MiB +llama_kv_cache_unified: size = 192.00 MiB ( 4096 cells, 12 layers, 1/ 1 seqs), K (f16): 96.00 MiB, V (f16): 96.00 MiB +llama_kv_cache_unified: LLAMA_SET_ROWS=0, using old ggml_cpy() method for backwards compatibility +llama_kv_cache_unified_iswa: creating SWA KV cache, size = 4096 cells +llama_kv_cache_unified: ROCm0 KV buffer size = 576.00 MiB +llama_kv_cache_unified: size = 576.00 MiB ( 4096 cells, 36 layers, 1/ 1 seqs), K (f16): 288.00 MiB, V (f16): 288.00 MiB +llama_kv_cache_unified: LLAMA_SET_ROWS=0, using old ggml_cpy() method for backwards compatibility +llama_context: ROCm0 compute buffer size = 442.62 MiB +llama_context: ROCm_Host compute buffer size = 26.01 MiB +llama_context: graph nodes = 2420 +llama_context: graph splits = 2 +common_init_from_params: added <|end_of_text|> logit bias = -inf +common_init_from_params: added <|eot|> logit bias = -inf +common_init_from_params: setting dry_penalty_last_n to ctx_size = 4096 +common_init_from_params: warming up the model with an empty run - please wait ... 
(--no-warmup to disable) +HW Exception by GPU node-1 (Agent handle: 0x48fa1f0) reason :GPU Hang + Elapsed #3: 22.180402418s + Run #3 status: 134 + ✖ run #3 failed + → No successful runs diff --git a/benchmark/loadtime_results/Llama-4-Scout-17B-16E-Instruct-UD-Q4_K_XL-00001-of-00002__rocm7_rc.log b/benchmark/loadtime_results/Llama-4-Scout-17B-16E-Instruct-UD-Q4_K_XL-00001-of-00002__rocm7_rc.log new file mode 100644 index 0000000..9ffcb33 --- /dev/null +++ b/benchmark/loadtime_results/Llama-4-Scout-17B-16E-Instruct-UD-Q4_K_XL-00001-of-00002__rocm7_rc.log @@ -0,0 +1,174 @@ +ggml_cuda_init: GGML_CUDA_FORCE_MMQ: no +ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no +ggml_cuda_init: found 1 ROCm devices: + Device 0: AMD Radeon Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32 +build: 6066 (4cb208c9) with cc (GCC) 15.1.1 20250719 (Red Hat 15.1.1-5) for x86_64-redhat-linux +main: llama backend init +main: load the model and apply lora adapter, if any +llama_model_load_from_file_impl: using device ROCm0 (AMD Radeon Graphics) - 124523 MiB free +llama_model_loader: additional 1 GGUFs metadata loaded. +llama_model_loader: loaded meta data with 51 key-value pairs and 628 tensors from /home/kyuz0/models/llama-4-scout-17b-16e/Q4_K_XL/Llama-4-Scout-17B-16E-Instruct-UD-Q4_K_XL-00001-of-00002.gguf (version GGUF V3 (latest)) +llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output. 
+llama_model_loader: - kv 0: general.architecture str = llama4 +llama_model_loader: - kv 1: general.type str = model +llama_model_loader: - kv 2: general.name str = Llama-4-Scout-17B-16E-Instruct +llama_model_loader: - kv 3: general.finetune str = 16E-Instruct +llama_model_loader: - kv 4: general.basename str = Llama-4-Scout-17B-16E-Instruct +llama_model_loader: - kv 5: general.quantized_by str = Unsloth +llama_model_loader: - kv 6: general.size_label str = 17B +llama_model_loader: - kv 7: general.license str = other +llama_model_loader: - kv 8: general.license.name str = llama4 +llama_model_loader: - kv 9: general.repo_url str = https://huggingface.co/unsloth +llama_model_loader: - kv 10: general.base_model.count u32 = 1 +llama_model_loader: - kv 11: general.base_model.0.name str = Llama 4 Scout 17B 16E Instruct +llama_model_loader: - kv 12: general.base_model.0.organization str = Meta Llama +llama_model_loader: - kv 13: general.base_model.0.repo_url str = https://huggingface.co/meta-llama/Lla... +llama_model_loader: - kv 14: general.tags arr[str,5] = ["facebook", "meta", "pytorch", "llam... +llama_model_loader: - kv 15: general.languages arr[str,12] = ["ar", "de", "en", "es", "fr", "hi", ... 
+llama_model_loader: - kv 16: llama4.block_count u32 = 48 +llama_model_loader: - kv 17: llama4.context_length u32 = 10485760 +llama_model_loader: - kv 18: llama4.embedding_length u32 = 5120 +llama_model_loader: - kv 19: llama4.feed_forward_length u32 = 16384 +llama_model_loader: - kv 20: llama4.attention.head_count u32 = 40 +llama_model_loader: - kv 21: llama4.attention.head_count_kv u32 = 8 +llama_model_loader: - kv 22: llama4.rope.freq_base f32 = 500000.000000 +llama_model_loader: - kv 23: llama4.attention.layer_norm_rms_epsilon f32 = 0.000010 +llama_model_loader: - kv 24: llama4.expert_count u32 = 16 +llama_model_loader: - kv 25: llama4.expert_used_count u32 = 1 +llama_model_loader: - kv 26: llama4.attention.key_length u32 = 128 +llama_model_loader: - kv 27: llama4.attention.value_length u32 = 128 +llama_model_loader: - kv 28: llama4.vocab_size u32 = 202048 +llama_model_loader: - kv 29: llama4.rope.dimension_count u32 = 128 +llama_model_loader: - kv 30: llama4.interleave_moe_layer_step u32 = 1 +llama_model_loader: - kv 31: llama4.expert_feed_forward_length u32 = 8192 +llama_model_loader: - kv 32: tokenizer.ggml.model str = gpt2 +llama_model_loader: - kv 33: tokenizer.ggml.pre str = llama4 +llama_model_loader: - kv 34: tokenizer.ggml.tokens arr[str,202048] = ["À", "Á", "õ", "ö", "÷", "ø", ... +llama_model_loader: - kv 35: tokenizer.ggml.token_type arr[i32,202048] = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ... +llama_model_loader: - kv 36: tokenizer.ggml.merges arr[str,439802] = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "... +llama_model_loader: - kv 37: tokenizer.ggml.bos_token_id u32 = 200000 +llama_model_loader: - kv 38: tokenizer.ggml.eos_token_id u32 = 200008 +llama_model_loader: - kv 39: tokenizer.ggml.padding_token_id u32 = 200018 +llama_model_loader: - kv 40: tokenizer.ggml.add_bos_token bool = true +llama_model_loader: - kv 41: tokenizer.chat_template str = {{- bos_token }}\n{%- if custom_tools ... 
+llama_model_loader: - kv 42: general.quantization_version u32 = 2 +llama_model_loader: - kv 43: general.file_type u32 = 15 +llama_model_loader: - kv 44: quantize.imatrix.file str = Llama-4-Scout-17B-16E-Instruct-GGUF/i... +llama_model_loader: - kv 45: quantize.imatrix.dataset str = unsloth_calibration_Llama-4-Scout-17B... +llama_model_loader: - kv 46: quantize.imatrix.entries_count u32 = 528 +llama_model_loader: - kv 47: quantize.imatrix.chunks_count u32 = 729 +llama_model_loader: - kv 48: split.no u16 = 0 +llama_model_loader: - kv 49: split.tensors.count i32 = 628 +llama_model_loader: - kv 50: split.count u16 = 2 +llama_model_loader: - type f32: 146 tensors +llama_model_loader: - type q4_K: 421 tensors +llama_model_loader: - type q5_K: 43 tensors +llama_model_loader: - type q6_K: 18 tensors +print_info: file format = GGUF V3 (latest) +print_info: file type = Q4_K - Medium +print_info: file size = 57.73 GiB (4.60 BPW) +load: special_eos_id is not in special_eog_ids - the tokenizer config may be incorrect +load: special tokens cache size = 1135 +load: token to piece cache size = 1.3873 MB +print_info: arch = llama4 +print_info: vocab_only = 0 +print_info: n_ctx_train = 10485760 +print_info: n_embd = 5120 +print_info: n_layer = 48 +print_info: n_head = 40 +print_info: n_head_kv = 8 +print_info: n_rot = 128 +print_info: n_swa = 8192 +print_info: is_swa_any = 1 +print_info: n_embd_head_k = 128 +print_info: n_embd_head_v = 128 +print_info: n_gqa = 5 +print_info: n_embd_k_gqa = 1024 +print_info: n_embd_v_gqa = 1024 +print_info: f_norm_eps = 0.0e+00 +print_info: f_norm_rms_eps = 1.0e-05 +print_info: f_clamp_kqv = 0.0e+00 +print_info: f_max_alibi_bias = 0.0e+00 +print_info: f_logit_scale = 0.0e+00 +print_info: f_attn_scale = 0.0e+00 +print_info: n_ff = 16384 +print_info: n_expert = 16 +print_info: n_expert_used = 1 +print_info: causal attn = 1 +print_info: pooling type = 0 +print_info: rope type = 0 +print_info: rope scaling = linear +print_info: freq_base_train = 
500000.0 +print_info: freq_scale_train = 1 +print_info: n_ctx_orig_yarn = 10485760 +print_info: rope_finetuned = unknown +print_info: model type = 17Bx16E (Scout) +print_info: model params = 107.77 B +print_info: general.name = Llama-4-Scout-17B-16E-Instruct +print_info: vocab type = BPE +print_info: n_vocab = 202048 +print_info: n_merges = 439802 +print_info: BOS token = 200000 '<|begin_of_text|>' +print_info: EOS token = 200008 '<|eot|>' +print_info: PAD token = 200018 '<|finetune_right_pad|>' +print_info: LF token = 198 'Ċ' +print_info: FIM PRE token = 200002 '<|fim_prefix|>' +print_info: FIM SUF token = 200004 '<|fim_suffix|>' +print_info: FIM MID token = 200003 '<|fim_middle|>' +print_info: EOG token = 200001 '<|end_of_text|>' +print_info: EOG token = 200008 '<|eot|>' +print_info: max token length = 192 +load_tensors: loading model tensors, this can take a while... (mmap = false) +load_tensors: offloading 48 repeating layers to GPU +load_tensors: offloading output layer to GPU +load_tensors: offloaded 49/49 layers to GPU +load_tensors: CPU model buffer size = 554.94 MiB +load_tensors: ROCm0 model buffer size = 58558.57 MiB +................................................................................................... 
+llama_context: constructing llama_context +llama_context: non-unified KV cache requires ggml_set_rows() - forcing unified KV cache +llama_context: n_seq_max = 1 +llama_context: n_ctx = 4096 +llama_context: n_ctx_per_seq = 4096 +llama_context: n_batch = 2048 +llama_context: n_ubatch = 512 +llama_context: causal_attn = 1 +llama_context: flash_attn = 1 +llama_context: kv_unified = true +llama_context: freq_base = 500000.0 +llama_context: freq_scale = 1 +llama_context: n_ctx_per_seq (4096) < n_ctx_train (10485760) -- the full capacity of the model will not be utilized +llama_context: ROCm_Host output buffer size = 0.77 MiB +llama_kv_cache_unified_iswa: creating non-SWA KV cache, size = 4096 cells +llama_kv_cache_unified: ROCm0 KV buffer size = 192.00 MiB +llama_kv_cache_unified: size = 192.00 MiB ( 4096 cells, 12 layers, 1/ 1 seqs), K (f16): 96.00 MiB, V (f16): 96.00 MiB +llama_kv_cache_unified: LLAMA_SET_ROWS=0, using old ggml_cpy() method for backwards compatibility +llama_kv_cache_unified_iswa: creating SWA KV cache, size = 4096 cells +llama_kv_cache_unified: ROCm0 KV buffer size = 576.00 MiB +llama_kv_cache_unified: size = 576.00 MiB ( 4096 cells, 36 layers, 1/ 1 seqs), K (f16): 288.00 MiB, V (f16): 288.00 MiB +llama_kv_cache_unified: LLAMA_SET_ROWS=0, using old ggml_cpy() method for backwards compatibility +llama_context: ROCm0 compute buffer size = 442.62 MiB +llama_context: ROCm_Host compute buffer size = 26.01 MiB +llama_context: graph nodes = 2420 +llama_context: graph splits = 2 +common_init_from_params: added <|end_of_text|> logit bias = -inf +common_init_from_params: added <|eot|> logit bias = -inf +common_init_from_params: setting dry_penalty_last_n to ctx_size = 4096 +common_init_from_params: warming up the model with an empty run - please wait ... 
(--no-warmup to disable) +main: llama threadpool init, n_threads = 16 + +system_info: n_threads = 16 (n_threads_batch = 16) / 32 | ROCm : NO_VMM = 1 | PEER_MAX_BATCH_SIZE = 128 | CPU : SSE3 = 1 | SSSE3 = 1 | AVX = 1 | AVX2 = 1 | F16C = 1 | FMA = 1 | BMI2 = 1 | AVX512 = 1 | AVX512_VBMI = 1 | AVX512_VNNI = 1 | LLAMAFILE = 1 | OPENMP = 1 | REPACK = 1 | + +sampler seed: 722371466 +sampler params: + repeat_last_n = 64, repeat_penalty = 1.000, frequency_penalty = 0.000, presence_penalty = 0.000 + dry_multiplier = 0.000, dry_base = 1.750, dry_allowed_length = 2, dry_penalty_last_n = 4096 + top_k = 40, top_p = 0.950, min_p = 0.050, xtc_probability = 0.000, xtc_threshold = 0.100, typical_p = 1.000, top_n_sigma = -1.000, temp = 0.800 + mirostat = 0, mirostat_lr = 0.100, mirostat_ent = 5.000 +sampler chain: logits -> logit-bias -> penalties -> dry -> top-n-sigma -> top-k -> typical -> top-p -> min-p -> xtc -> temp-ext -> dist +generate: n_ctx = 4096, n_batch = 2048, n_predict = 1, n_keep = 1 + +Hello Elapsed #3: 22.602610057s + Run #3 status: 134 + ✖ run #3 failed + → Avg over 2 runs: 19.365s diff --git a/benchmark/loadtime_results/Llama-4-Scout-17B-16E-Instruct-UD-Q4_K_XL-00001-of-00002__vulkan_amdvlk.log b/benchmark/loadtime_results/Llama-4-Scout-17B-16E-Instruct-UD-Q4_K_XL-00001-of-00002__vulkan_amdvlk.log new file mode 100644 index 0000000..5fdc5b4 --- /dev/null +++ b/benchmark/loadtime_results/Llama-4-Scout-17B-16E-Instruct-UD-Q4_K_XL-00001-of-00002__vulkan_amdvlk.log @@ -0,0 +1,179 @@ +ggml_vulkan: Found 1 Vulkan devices: +ggml_vulkan: 0 = Radeon 8060S Graphics (AMD open-source driver) | uma: 1 | fp16: 1 | bf16: 0 | warp size: 64 | shared memory: 32768 | int dot: 1 | matrix cores: KHR_coopmat +build: 6060 (9c35706b) with cc (GCC) 15.1.1 20250719 (Red Hat 15.1.1-5) for x86_64-redhat-linux +main: llama backend init +main: load the model and apply lora adapter, if any +llama_model_load_from_file_impl: using device Vulkan0 (Radeon 8060S Graphics) - 85720 MiB free 
+llama_model_loader: additional 1 GGUFs metadata loaded. +llama_model_loader: loaded meta data with 51 key-value pairs and 628 tensors from /home/kyuz0/models/llama-4-scout-17b-16e/Q4_K_XL/Llama-4-Scout-17B-16E-Instruct-UD-Q4_K_XL-00001-of-00002.gguf (version GGUF V3 (latest)) +llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output. +llama_model_loader: - kv 0: general.architecture str = llama4 +llama_model_loader: - kv 1: general.type str = model +llama_model_loader: - kv 2: general.name str = Llama-4-Scout-17B-16E-Instruct +llama_model_loader: - kv 3: general.finetune str = 16E-Instruct +llama_model_loader: - kv 4: general.basename str = Llama-4-Scout-17B-16E-Instruct +llama_model_loader: - kv 5: general.quantized_by str = Unsloth +llama_model_loader: - kv 6: general.size_label str = 17B +llama_model_loader: - kv 7: general.license str = other +llama_model_loader: - kv 8: general.license.name str = llama4 +llama_model_loader: - kv 9: general.repo_url str = https://huggingface.co/unsloth +llama_model_loader: - kv 10: general.base_model.count u32 = 1 +llama_model_loader: - kv 11: general.base_model.0.name str = Llama 4 Scout 17B 16E Instruct +llama_model_loader: - kv 12: general.base_model.0.organization str = Meta Llama +llama_model_loader: - kv 13: general.base_model.0.repo_url str = https://huggingface.co/meta-llama/Lla... +llama_model_loader: - kv 14: general.tags arr[str,5] = ["facebook", "meta", "pytorch", "llam... +llama_model_loader: - kv 15: general.languages arr[str,12] = ["ar", "de", "en", "es", "fr", "hi", ... 
+llama_model_loader: - kv 16: llama4.block_count u32 = 48 +llama_model_loader: - kv 17: llama4.context_length u32 = 10485760 +llama_model_loader: - kv 18: llama4.embedding_length u32 = 5120 +llama_model_loader: - kv 19: llama4.feed_forward_length u32 = 16384 +llama_model_loader: - kv 20: llama4.attention.head_count u32 = 40 +llama_model_loader: - kv 21: llama4.attention.head_count_kv u32 = 8 +llama_model_loader: - kv 22: llama4.rope.freq_base f32 = 500000.000000 +llama_model_loader: - kv 23: llama4.attention.layer_norm_rms_epsilon f32 = 0.000010 +llama_model_loader: - kv 24: llama4.expert_count u32 = 16 +llama_model_loader: - kv 25: llama4.expert_used_count u32 = 1 +llama_model_loader: - kv 26: llama4.attention.key_length u32 = 128 +llama_model_loader: - kv 27: llama4.attention.value_length u32 = 128 +llama_model_loader: - kv 28: llama4.vocab_size u32 = 202048 +llama_model_loader: - kv 29: llama4.rope.dimension_count u32 = 128 +llama_model_loader: - kv 30: llama4.interleave_moe_layer_step u32 = 1 +llama_model_loader: - kv 31: llama4.expert_feed_forward_length u32 = 8192 +llama_model_loader: - kv 32: tokenizer.ggml.model str = gpt2 +llama_model_loader: - kv 33: tokenizer.ggml.pre str = llama4 +llama_model_loader: - kv 34: tokenizer.ggml.tokens arr[str,202048] = ["À", "Á", "õ", "ö", "÷", "ø", ... +llama_model_loader: - kv 35: tokenizer.ggml.token_type arr[i32,202048] = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ... +llama_model_loader: - kv 36: tokenizer.ggml.merges arr[str,439802] = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "... +llama_model_loader: - kv 37: tokenizer.ggml.bos_token_id u32 = 200000 +llama_model_loader: - kv 38: tokenizer.ggml.eos_token_id u32 = 200008 +llama_model_loader: - kv 39: tokenizer.ggml.padding_token_id u32 = 200018 +llama_model_loader: - kv 40: tokenizer.ggml.add_bos_token bool = true +llama_model_loader: - kv 41: tokenizer.chat_template str = {{- bos_token }}\n{%- if custom_tools ... 
+llama_model_loader: - kv 42: general.quantization_version u32 = 2 +llama_model_loader: - kv 43: general.file_type u32 = 15 +llama_model_loader: - kv 44: quantize.imatrix.file str = Llama-4-Scout-17B-16E-Instruct-GGUF/i... +llama_model_loader: - kv 45: quantize.imatrix.dataset str = unsloth_calibration_Llama-4-Scout-17B... +llama_model_loader: - kv 46: quantize.imatrix.entries_count u32 = 528 +llama_model_loader: - kv 47: quantize.imatrix.chunks_count u32 = 729 +llama_model_loader: - kv 48: split.no u16 = 0 +llama_model_loader: - kv 49: split.tensors.count i32 = 628 +llama_model_loader: - kv 50: split.count u16 = 2 +llama_model_loader: - type f32: 146 tensors +llama_model_loader: - type q4_K: 421 tensors +llama_model_loader: - type q5_K: 43 tensors +llama_model_loader: - type q6_K: 18 tensors +print_info: file format = GGUF V3 (latest) +print_info: file type = Q4_K - Medium +print_info: file size = 57.73 GiB (4.60 BPW) +load: special_eos_id is not in special_eog_ids - the tokenizer config may be incorrect +load: special tokens cache size = 1135 +load: token to piece cache size = 1.3873 MB +print_info: arch = llama4 +print_info: vocab_only = 0 +print_info: n_ctx_train = 10485760 +print_info: n_embd = 5120 +print_info: n_layer = 48 +print_info: n_head = 40 +print_info: n_head_kv = 8 +print_info: n_rot = 128 +print_info: n_swa = 8192 +print_info: is_swa_any = 1 +print_info: n_embd_head_k = 128 +print_info: n_embd_head_v = 128 +print_info: n_gqa = 5 +print_info: n_embd_k_gqa = 1024 +print_info: n_embd_v_gqa = 1024 +print_info: f_norm_eps = 0.0e+00 +print_info: f_norm_rms_eps = 1.0e-05 +print_info: f_clamp_kqv = 0.0e+00 +print_info: f_max_alibi_bias = 0.0e+00 +print_info: f_logit_scale = 0.0e+00 +print_info: f_attn_scale = 0.0e+00 +print_info: n_ff = 16384 +print_info: n_expert = 16 +print_info: n_expert_used = 1 +print_info: causal attn = 1 +print_info: pooling type = 0 +print_info: rope type = 0 +print_info: rope scaling = linear +print_info: freq_base_train = 
500000.0 +print_info: freq_scale_train = 1 +print_info: n_ctx_orig_yarn = 10485760 +print_info: rope_finetuned = unknown +print_info: model type = 17Bx16E (Scout) +print_info: model params = 107.77 B +print_info: general.name = Llama-4-Scout-17B-16E-Instruct +print_info: vocab type = BPE +print_info: n_vocab = 202048 +print_info: n_merges = 439802 +print_info: BOS token = 200000 '<|begin_of_text|>' +print_info: EOS token = 200008 '<|eot|>' +print_info: PAD token = 200018 '<|finetune_right_pad|>' +print_info: LF token = 198 'Ċ' +print_info: FIM PRE token = 200002 '<|fim_prefix|>' +print_info: FIM SUF token = 200004 '<|fim_suffix|>' +print_info: FIM MID token = 200003 '<|fim_middle|>' +print_info: EOG token = 200001 '<|end_of_text|>' +print_info: EOG token = 200008 '<|eot|>' +print_info: max token length = 192 +load_tensors: loading model tensors, this can take a while... (mmap = false) +load_tensors: offloading 48 repeating layers to GPU +load_tensors: offloading output layer to GPU +load_tensors: offloaded 49/49 layers to GPU +load_tensors: Vulkan0 model buffer size = 58558.57 MiB +load_tensors: CPU model buffer size = 554.94 MiB +.................................................................................................... 
+llama_context: constructing llama_context +llama_context: non-unified KV cache requires ggml_set_rows() - forcing unified KV cache +llama_context: n_seq_max = 1 +llama_context: n_ctx = 4096 +llama_context: n_ctx_per_seq = 4096 +llama_context: n_batch = 2048 +llama_context: n_ubatch = 512 +llama_context: causal_attn = 1 +llama_context: flash_attn = 1 +llama_context: kv_unified = true +llama_context: freq_base = 500000.0 +llama_context: freq_scale = 1 +llama_context: n_ctx_per_seq (4096) < n_ctx_train (10485760) -- the full capacity of the model will not be utilized +llama_context: Vulkan_Host output buffer size = 0.77 MiB +llama_kv_cache_unified_iswa: creating non-SWA KV cache, size = 4096 cells +llama_kv_cache_unified: Vulkan0 KV buffer size = 192.00 MiB +llama_kv_cache_unified: size = 192.00 MiB ( 4096 cells, 12 layers, 1/ 1 seqs), K (f16): 96.00 MiB, V (f16): 96.00 MiB +llama_kv_cache_unified: LLAMA_SET_ROWS=0, using old ggml_cpy() method for backwards compatibility +llama_kv_cache_unified_iswa: creating SWA KV cache, size = 4096 cells +llama_kv_cache_unified: Vulkan0 KV buffer size = 576.00 MiB +llama_kv_cache_unified: size = 576.00 MiB ( 4096 cells, 36 layers, 1/ 1 seqs), K (f16): 288.00 MiB, V (f16): 288.00 MiB +llama_kv_cache_unified: LLAMA_SET_ROWS=0, using old ggml_cpy() method for backwards compatibility +llama_context: Vulkan0 compute buffer size = 440.63 MiB +llama_context: Vulkan_Host compute buffer size = 26.01 MiB +llama_context: graph nodes = 2420 +llama_context: graph splits = 2 +common_init_from_params: added <|end_of_text|> logit bias = -inf +common_init_from_params: added <|eot|> logit bias = -inf +common_init_from_params: setting dry_penalty_last_n to ctx_size = 4096 +common_init_from_params: warming up the model with an empty run - please wait ... 
(--no-warmup to disable) +main: llama threadpool init, n_threads = 16 + +system_info: n_threads = 16 (n_threads_batch = 16) / 32 | CPU : SSE3 = 1 | SSSE3 = 1 | AVX = 1 | AVX2 = 1 | F16C = 1 | FMA = 1 | BMI2 = 1 | AVX512 = 1 | AVX512_VBMI = 1 | AVX512_VNNI = 1 | LLAMAFILE = 1 | OPENMP = 1 | REPACK = 1 | + +sampler seed: 83044290 +sampler params: + repeat_last_n = 64, repeat_penalty = 1.000, frequency_penalty = 0.000, presence_penalty = 0.000 + dry_multiplier = 0.000, dry_base = 1.750, dry_allowed_length = 2, dry_penalty_last_n = 4096 + top_k = 40, top_p = 0.950, min_p = 0.050, xtc_probability = 0.000, xtc_threshold = 0.100, typical_p = 1.000, top_n_sigma = -1.000, temp = 0.800 + mirostat = 0, mirostat_lr = 0.100, mirostat_ent = 5.000 +sampler chain: logits -> logit-bias -> penalties -> dry -> top-n-sigma -> top-k -> typical -> top-p -> min-p -> xtc -> temp-ext -> dist +generate: n_ctx = 4096, n_batch = 2048, n_predict = 1, n_keep = 1 + +Hello + +llama_perf_sampler_print: sampling time = 0.16 ms / 3 runs ( 0.05 ms per token, 18518.52 tokens per second) +llama_perf_context_print: load time = 13560.35 ms +llama_perf_context_print: prompt eval time = 257.61 ms / 2 tokens ( 128.81 ms per token, 7.76 tokens per second) +llama_perf_context_print: eval time = 0.00 ms / 1 runs ( 0.00 ms per token, inf tokens per second) +llama_perf_context_print: total time = 285.54 ms / 3 tokens +llama_perf_context_print: graphs reused = 0 + Elapsed #3: 14.548378284s + Run #3 status: 0 + → Avg over 3 runs: 16.752s diff --git a/benchmark/loadtime_results/Llama-4-Scout-17B-16E-Instruct-UD-Q4_K_XL-00001-of-00002__vulkan_radv.log b/benchmark/loadtime_results/Llama-4-Scout-17B-16E-Instruct-UD-Q4_K_XL-00001-of-00002__vulkan_radv.log new file mode 100644 index 0000000..403f25b --- /dev/null +++ b/benchmark/loadtime_results/Llama-4-Scout-17B-16E-Instruct-UD-Q4_K_XL-00001-of-00002__vulkan_radv.log @@ -0,0 +1,179 @@ +ggml_vulkan: Found 1 Vulkan devices: +ggml_vulkan: 0 = Radeon 8060S Graphics (RADV 
GFX1151) (radv) | uma: 1 | fp16: 1 | bf16: 0 | warp size: 64 | shared memory: 65536 | int dot: 1 | matrix cores: KHR_coopmat +build: 6040 (66625a59) with cc (GCC) 15.1.1 20250719 (Red Hat 15.1.1-5) for x86_64-redhat-linux +main: llama backend init +main: load the model and apply lora adapter, if any +llama_model_load_from_file_impl: using device Vulkan0 (Radeon 8060S Graphics (RADV GFX1151)) - 87722 MiB free +llama_model_loader: additional 1 GGUFs metadata loaded. +llama_model_loader: loaded meta data with 51 key-value pairs and 628 tensors from /home/kyuz0/models/llama-4-scout-17b-16e/Q4_K_XL/Llama-4-Scout-17B-16E-Instruct-UD-Q4_K_XL-00001-of-00002.gguf (version GGUF V3 (latest)) +llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output. +llama_model_loader: - kv 0: general.architecture str = llama4 +llama_model_loader: - kv 1: general.type str = model +llama_model_loader: - kv 2: general.name str = Llama-4-Scout-17B-16E-Instruct +llama_model_loader: - kv 3: general.finetune str = 16E-Instruct +llama_model_loader: - kv 4: general.basename str = Llama-4-Scout-17B-16E-Instruct +llama_model_loader: - kv 5: general.quantized_by str = Unsloth +llama_model_loader: - kv 6: general.size_label str = 17B +llama_model_loader: - kv 7: general.license str = other +llama_model_loader: - kv 8: general.license.name str = llama4 +llama_model_loader: - kv 9: general.repo_url str = https://huggingface.co/unsloth +llama_model_loader: - kv 10: general.base_model.count u32 = 1 +llama_model_loader: - kv 11: general.base_model.0.name str = Llama 4 Scout 17B 16E Instruct +llama_model_loader: - kv 12: general.base_model.0.organization str = Meta Llama +llama_model_loader: - kv 13: general.base_model.0.repo_url str = https://huggingface.co/meta-llama/Lla... +llama_model_loader: - kv 14: general.tags arr[str,5] = ["facebook", "meta", "pytorch", "llam... 
+llama_model_loader: - kv 15: general.languages arr[str,12] = ["ar", "de", "en", "es", "fr", "hi", ... +llama_model_loader: - kv 16: llama4.block_count u32 = 48 +llama_model_loader: - kv 17: llama4.context_length u32 = 10485760 +llama_model_loader: - kv 18: llama4.embedding_length u32 = 5120 +llama_model_loader: - kv 19: llama4.feed_forward_length u32 = 16384 +llama_model_loader: - kv 20: llama4.attention.head_count u32 = 40 +llama_model_loader: - kv 21: llama4.attention.head_count_kv u32 = 8 +llama_model_loader: - kv 22: llama4.rope.freq_base f32 = 500000.000000 +llama_model_loader: - kv 23: llama4.attention.layer_norm_rms_epsilon f32 = 0.000010 +llama_model_loader: - kv 24: llama4.expert_count u32 = 16 +llama_model_loader: - kv 25: llama4.expert_used_count u32 = 1 +llama_model_loader: - kv 26: llama4.attention.key_length u32 = 128 +llama_model_loader: - kv 27: llama4.attention.value_length u32 = 128 +llama_model_loader: - kv 28: llama4.vocab_size u32 = 202048 +llama_model_loader: - kv 29: llama4.rope.dimension_count u32 = 128 +llama_model_loader: - kv 30: llama4.interleave_moe_layer_step u32 = 1 +llama_model_loader: - kv 31: llama4.expert_feed_forward_length u32 = 8192 +llama_model_loader: - kv 32: tokenizer.ggml.model str = gpt2 +llama_model_loader: - kv 33: tokenizer.ggml.pre str = llama4 +llama_model_loader: - kv 34: tokenizer.ggml.tokens arr[str,202048] = ["À", "Á", "õ", "ö", "÷", "ø", ... +llama_model_loader: - kv 35: tokenizer.ggml.token_type arr[i32,202048] = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ... +llama_model_loader: - kv 36: tokenizer.ggml.merges arr[str,439802] = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "... 
+llama_model_loader: - kv 37: tokenizer.ggml.bos_token_id u32 = 200000 +llama_model_loader: - kv 38: tokenizer.ggml.eos_token_id u32 = 200008 +llama_model_loader: - kv 39: tokenizer.ggml.padding_token_id u32 = 200018 +llama_model_loader: - kv 40: tokenizer.ggml.add_bos_token bool = true +llama_model_loader: - kv 41: tokenizer.chat_template str = {{- bos_token }}\n{%- if custom_tools ... +llama_model_loader: - kv 42: general.quantization_version u32 = 2 +llama_model_loader: - kv 43: general.file_type u32 = 15 +llama_model_loader: - kv 44: quantize.imatrix.file str = Llama-4-Scout-17B-16E-Instruct-GGUF/i... +llama_model_loader: - kv 45: quantize.imatrix.dataset str = unsloth_calibration_Llama-4-Scout-17B... +llama_model_loader: - kv 46: quantize.imatrix.entries_count u32 = 528 +llama_model_loader: - kv 47: quantize.imatrix.chunks_count u32 = 729 +llama_model_loader: - kv 48: split.no u16 = 0 +llama_model_loader: - kv 49: split.tensors.count i32 = 628 +llama_model_loader: - kv 50: split.count u16 = 2 +llama_model_loader: - type f32: 146 tensors +llama_model_loader: - type q4_K: 421 tensors +llama_model_loader: - type q5_K: 43 tensors +llama_model_loader: - type q6_K: 18 tensors +print_info: file format = GGUF V3 (latest) +print_info: file type = Q4_K - Medium +print_info: file size = 57.73 GiB (4.60 BPW) +load: special_eos_id is not in special_eog_ids - the tokenizer config may be incorrect +load: special tokens cache size = 1135 +load: token to piece cache size = 1.3873 MB +print_info: arch = llama4 +print_info: vocab_only = 0 +print_info: n_ctx_train = 10485760 +print_info: n_embd = 5120 +print_info: n_layer = 48 +print_info: n_head = 40 +print_info: n_head_kv = 8 +print_info: n_rot = 128 +print_info: n_swa = 8192 +print_info: is_swa_any = 1 +print_info: n_embd_head_k = 128 +print_info: n_embd_head_v = 128 +print_info: n_gqa = 5 +print_info: n_embd_k_gqa = 1024 +print_info: n_embd_v_gqa = 1024 +print_info: f_norm_eps = 0.0e+00 +print_info: f_norm_rms_eps = 1.0e-05 
+print_info: f_clamp_kqv = 0.0e+00 +print_info: f_max_alibi_bias = 0.0e+00 +print_info: f_logit_scale = 0.0e+00 +print_info: f_attn_scale = 0.0e+00 +print_info: n_ff = 16384 +print_info: n_expert = 16 +print_info: n_expert_used = 1 +print_info: causal attn = 1 +print_info: pooling type = 0 +print_info: rope type = 0 +print_info: rope scaling = linear +print_info: freq_base_train = 500000.0 +print_info: freq_scale_train = 1 +print_info: n_ctx_orig_yarn = 10485760 +print_info: rope_finetuned = unknown +print_info: model type = 17Bx16E (Scout) +print_info: model params = 107.77 B +print_info: general.name = Llama-4-Scout-17B-16E-Instruct +print_info: vocab type = BPE +print_info: n_vocab = 202048 +print_info: n_merges = 439802 +print_info: BOS token = 200000 '<|begin_of_text|>' +print_info: EOS token = 200008 '<|eot|>' +print_info: PAD token = 200018 '<|finetune_right_pad|>' +print_info: LF token = 198 'Ċ' +print_info: FIM PRE token = 200002 '<|fim_prefix|>' +print_info: FIM SUF token = 200004 '<|fim_suffix|>' +print_info: FIM MID token = 200003 '<|fim_middle|>' +print_info: EOG token = 200001 '<|end_of_text|>' +print_info: EOG token = 200008 '<|eot|>' +print_info: max token length = 192 +load_tensors: loading model tensors, this can take a while... (mmap = false) +load_tensors: offloading 48 repeating layers to GPU +load_tensors: offloading output layer to GPU +load_tensors: offloaded 49/49 layers to GPU +load_tensors: Vulkan0 model buffer size = 58558.57 MiB +load_tensors: CPU model buffer size = 554.94 MiB +.................................................................................................... 
+llama_context: constructing llama_context +llama_context: non-unified KV cache requires ggml_set_rows() - forcing unified KV cache +llama_context: n_seq_max = 1 +llama_context: n_ctx = 4096 +llama_context: n_ctx_per_seq = 4096 +llama_context: n_batch = 2048 +llama_context: n_ubatch = 512 +llama_context: causal_attn = 1 +llama_context: flash_attn = 1 +llama_context: kv_unified = true +llama_context: freq_base = 500000.0 +llama_context: freq_scale = 1 +llama_context: n_ctx_per_seq (4096) < n_ctx_train (10485760) -- the full capacity of the model will not be utilized +llama_context: Vulkan_Host output buffer size = 0.77 MiB +llama_kv_cache_unified_iswa: creating non-SWA KV cache, size = 4096 cells +llama_kv_cache_unified: Vulkan0 KV buffer size = 192.00 MiB +llama_kv_cache_unified: size = 192.00 MiB ( 4096 cells, 12 layers, 1/ 1 seqs), K (f16): 96.00 MiB, V (f16): 96.00 MiB +llama_kv_cache_unified: LLAMA_SET_ROWS=0, using old ggml_cpy() method for backwards compatibility +llama_kv_cache_unified_iswa: creating SWA KV cache, size = 4096 cells +llama_kv_cache_unified: Vulkan0 KV buffer size = 576.00 MiB +llama_kv_cache_unified: size = 576.00 MiB ( 4096 cells, 36 layers, 1/ 1 seqs), K (f16): 288.00 MiB, V (f16): 288.00 MiB +llama_kv_cache_unified: LLAMA_SET_ROWS=0, using old ggml_cpy() method for backwards compatibility +llama_context: Vulkan0 compute buffer size = 440.63 MiB +llama_context: Vulkan_Host compute buffer size = 26.02 MiB +llama_context: graph nodes = 2420 +llama_context: graph splits = 2 +common_init_from_params: added <|end_of_text|> logit bias = -inf +common_init_from_params: added <|eot|> logit bias = -inf +common_init_from_params: setting dry_penalty_last_n to ctx_size = 4096 +common_init_from_params: warming up the model with an empty run - please wait ... 
(--no-warmup to disable) +main: llama threadpool init, n_threads = 16 + +system_info: n_threads = 16 (n_threads_batch = 16) / 32 | CPU : SSE3 = 1 | SSSE3 = 1 | AVX = 1 | AVX2 = 1 | F16C = 1 | FMA = 1 | BMI2 = 1 | AVX512 = 1 | AVX512_VBMI = 1 | AVX512_VNNI = 1 | LLAMAFILE = 1 | OPENMP = 1 | REPACK = 1 | + +sampler seed: 2510811977 +sampler params: + repeat_last_n = 64, repeat_penalty = 1.000, frequency_penalty = 0.000, presence_penalty = 0.000 + dry_multiplier = 0.000, dry_base = 1.750, dry_allowed_length = 2, dry_penalty_last_n = 4096 + top_k = 40, top_p = 0.950, min_p = 0.050, xtc_probability = 0.000, xtc_threshold = 0.100, typical_p = 1.000, top_n_sigma = -1.000, temp = 0.800 + mirostat = 0, mirostat_lr = 0.100, mirostat_ent = 5.000 +sampler chain: logits -> logit-bias -> penalties -> dry -> top-n-sigma -> top-k -> typical -> top-p -> min-p -> xtc -> temp-ext -> dist +generate: n_ctx = 4096, n_batch = 2048, n_predict = 1, n_keep = 1 + +Hello ( + +llama_perf_sampler_print: sampling time = 0.09 ms / 3 runs ( 0.03 ms per token, 32608.70 tokens per second) +llama_perf_context_print: load time = 16387.21 ms +llama_perf_context_print: prompt eval time = 291.47 ms / 2 tokens ( 145.73 ms per token, 6.86 tokens per second) +llama_perf_context_print: eval time = 0.00 ms / 1 runs ( 0.00 ms per token, inf tokens per second) +llama_perf_context_print: total time = 319.42 ms / 3 tokens +llama_perf_context_print: graphs reused = 0 + Elapsed #3: 17.154124582s + Run #3 status: 0 + → Avg over 3 runs: 20.045s diff --git a/benchmark/loadtime_results/Qwen3-235B-A22B-Instruct-2507-UD-Q3_K_XL-00001-of-00003__rocm6_4_2.log b/benchmark/loadtime_results/Qwen3-235B-A22B-Instruct-2507-UD-Q3_K_XL-00001-of-00003__rocm6_4_2.log new file mode 100644 index 0000000..5a96dc9 --- /dev/null +++ b/benchmark/loadtime_results/Qwen3-235B-A22B-Instruct-2507-UD-Q3_K_XL-00001-of-00003__rocm6_4_2.log @@ -0,0 +1,184 @@ +ggml_cuda_init: GGML_CUDA_FORCE_MMQ: no +ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no 
+ggml_cuda_init: found 1 ROCm devices: + Device 0: Radeon 8060S Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32 +build: 6040 (66625a59) with cc (GCC) 15.1.1 20250521 (Red Hat 15.1.1-2) for x86_64-redhat-linux +main: llama backend init +main: load the model and apply lora adapter, if any +llama_model_load_from_file_impl: using device ROCm0 (Radeon 8060S Graphics) - 124522 MiB free +llama_model_loader: additional 2 GGUFs metadata loaded. +llama_model_loader: loaded meta data with 48 key-value pairs and 1131 tensors from /home/kyuz0/models/qwen-3-235B-Q3_K-XL/UD-Q3_K_XL/Qwen3-235B-A22B-Instruct-2507-UD-Q3_K_XL-00001-of-00003.gguf (version GGUF V3 (latest)) +llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output. +llama_model_loader: - kv 0: general.architecture str = qwen3moe +llama_model_loader: - kv 1: general.type str = model +llama_model_loader: - kv 2: general.name str = Qwen3-235B-A22B-Instruct-2507 +llama_model_loader: - kv 3: general.version str = 2507 +llama_model_loader: - kv 4: general.finetune str = Instruct +llama_model_loader: - kv 5: general.basename str = Qwen3-235B-A22B-Instruct-2507 +llama_model_loader: - kv 6: general.quantized_by str = Unsloth +llama_model_loader: - kv 7: general.size_label str = 235B-A22B +llama_model_loader: - kv 8: general.license str = apache-2.0 +llama_model_loader: - kv 9: general.license.link str = https://huggingface.co/Qwen/Qwen3-235... +llama_model_loader: - kv 10: general.repo_url str = https://huggingface.co/unsloth +llama_model_loader: - kv 11: general.base_model.count u32 = 1 +llama_model_loader: - kv 12: general.base_model.0.name str = Qwen3 235B A22B Instruct 2507 +llama_model_loader: - kv 13: general.base_model.0.version str = 2507 +llama_model_loader: - kv 14: general.base_model.0.organization str = Qwen +llama_model_loader: - kv 15: general.base_model.0.repo_url str = https://huggingface.co/Qwen/Qwen3-235... 
+llama_model_loader: - kv 16: general.tags arr[str,2] = ["unsloth", "text-generation"] +llama_model_loader: - kv 17: qwen3moe.block_count u32 = 94 +llama_model_loader: - kv 18: qwen3moe.context_length u32 = 262144 +llama_model_loader: - kv 19: qwen3moe.embedding_length u32 = 4096 +llama_model_loader: - kv 20: qwen3moe.feed_forward_length u32 = 12288 +llama_model_loader: - kv 21: qwen3moe.attention.head_count u32 = 64 +llama_model_loader: - kv 22: qwen3moe.attention.head_count_kv u32 = 4 +llama_model_loader: - kv 23: qwen3moe.rope.freq_base f32 = 5000000.000000 +llama_model_loader: - kv 24: qwen3moe.attention.layer_norm_rms_epsilon f32 = 0.000001 +llama_model_loader: - kv 25: qwen3moe.expert_used_count u32 = 8 +llama_model_loader: - kv 26: qwen3moe.attention.key_length u32 = 128 +llama_model_loader: - kv 27: qwen3moe.attention.value_length u32 = 128 +llama_model_loader: - kv 28: qwen3moe.expert_count u32 = 128 +llama_model_loader: - kv 29: qwen3moe.expert_feed_forward_length u32 = 1536 +llama_model_loader: - kv 30: tokenizer.ggml.model str = gpt2 +llama_model_loader: - kv 31: tokenizer.ggml.pre str = qwen2 +llama_model_loader: - kv 32: tokenizer.ggml.tokens arr[str,151936] = ["!", "\"", "#", "$", "%", "&", "'", ... +llama_model_loader: - kv 33: tokenizer.ggml.token_type arr[i32,151936] = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ... +llama_model_loader: - kv 34: tokenizer.ggml.merges arr[str,151387] = ["Ġ Ġ", "ĠĠ ĠĠ", "i n", "Ġ t",... +llama_model_loader: - kv 35: tokenizer.ggml.eos_token_id u32 = 151645 +llama_model_loader: - kv 36: tokenizer.ggml.padding_token_id u32 = 151654 +llama_model_loader: - kv 37: tokenizer.ggml.add_bos_token bool = false +llama_model_loader: - kv 38: tokenizer.chat_template str = {%- if tools %}\n {{- '<|im_start|>... +llama_model_loader: - kv 39: general.quantization_version u32 = 2 +llama_model_loader: - kv 40: general.file_type u32 = 12 +llama_model_loader: - kv 41: quantize.imatrix.file str = Qwen3-235B-A22B-Instruct-2507-GGUF/im... 
+llama_model_loader: - kv 42: quantize.imatrix.dataset str = unsloth_calibration_Qwen3-235B-A22B-I... +llama_model_loader: - kv 43: quantize.imatrix.entries_count u32 = 745 +llama_model_loader: - kv 44: quantize.imatrix.chunks_count u32 = 693 +llama_model_loader: - kv 45: split.no u16 = 0 +llama_model_loader: - kv 46: split.tensors.count i32 = 1131 +llama_model_loader: - kv 47: split.count u16 = 3 +llama_model_loader: - type f32: 471 tensors +llama_model_loader: - type q3_K: 267 tensors +llama_model_loader: - type q4_K: 362 tensors +llama_model_loader: - type q5_K: 20 tensors +llama_model_loader: - type q6_K: 11 tensors +print_info: file format = GGUF V3 (latest) +print_info: file type = Q3_K - Medium +print_info: file size = 96.99 GiB (3.54 BPW) +load: special tokens cache size = 26 +load: token to piece cache size = 0.9311 MB +print_info: arch = qwen3moe +print_info: vocab_only = 0 +print_info: n_ctx_train = 262144 +print_info: n_embd = 4096 +print_info: n_layer = 94 +print_info: n_head = 64 +print_info: n_head_kv = 4 +print_info: n_rot = 128 +print_info: n_swa = 0 +print_info: is_swa_any = 0 +print_info: n_embd_head_k = 128 +print_info: n_embd_head_v = 128 +print_info: n_gqa = 16 +print_info: n_embd_k_gqa = 512 +print_info: n_embd_v_gqa = 512 +print_info: f_norm_eps = 0.0e+00 +print_info: f_norm_rms_eps = 1.0e-06 +print_info: f_clamp_kqv = 0.0e+00 +print_info: f_max_alibi_bias = 0.0e+00 +print_info: f_logit_scale = 0.0e+00 +print_info: f_attn_scale = 0.0e+00 +print_info: n_ff = 12288 +print_info: n_expert = 128 +print_info: n_expert_used = 8 +print_info: causal attn = 1 +print_info: pooling type = 0 +print_info: rope type = 2 +print_info: rope scaling = linear +print_info: freq_base_train = 5000000.0 +print_info: freq_scale_train = 1 +print_info: n_ctx_orig_yarn = 262144 +print_info: rope_finetuned = unknown +print_info: model type = 235B.A22B +print_info: model params = 235.09 B +print_info: general.name = Qwen3-235B-A22B-Instruct-2507 +print_info: n_ff_exp = 
1536 +print_info: vocab type = BPE +print_info: n_vocab = 151936 +print_info: n_merges = 151387 +print_info: BOS token = 11 ',' +print_info: EOS token = 151645 '<|im_end|>' +print_info: EOT token = 151645 '<|im_end|>' +print_info: PAD token = 151654 '<|vision_pad|>' +print_info: LF token = 198 'Ċ' +print_info: FIM PRE token = 151659 '<|fim_prefix|>' +print_info: FIM SUF token = 151661 '<|fim_suffix|>' +print_info: FIM MID token = 151660 '<|fim_middle|>' +print_info: FIM PAD token = 151662 '<|fim_pad|>' +print_info: FIM REP token = 151663 '<|repo_name|>' +print_info: FIM SEP token = 151664 '<|file_sep|>' +print_info: EOG token = 151643 '<|endoftext|>' +print_info: EOG token = 151645 '<|im_end|>' +print_info: EOG token = 151662 '<|fim_pad|>' +print_info: EOG token = 151663 '<|repo_name|>' +print_info: EOG token = 151664 '<|file_sep|>' +print_info: max token length = 256 +load_tensors: loading model tensors, this can take a while... (mmap = false) +load_tensors: offloading 94 repeating layers to GPU +load_tensors: offloading output layer to GPU +load_tensors: offloaded 95/95 layers to GPU +load_tensors: CPU model buffer size = 333.84 MiB +load_tensors: ROCm0 model buffer size = 98988.40 MiB +.................................................................................................... 
+llama_context: constructing llama_context +llama_context: non-unified KV cache requires ggml_set_rows() - forcing unified KV cache +llama_context: n_seq_max = 1 +llama_context: n_ctx = 4096 +llama_context: n_ctx_per_seq = 4096 +llama_context: n_batch = 2048 +llama_context: n_ubatch = 512 +llama_context: causal_attn = 1 +llama_context: flash_attn = 1 +llama_context: kv_unified = true +llama_context: freq_base = 5000000.0 +llama_context: freq_scale = 1 +llama_context: n_ctx_per_seq (4096) < n_ctx_train (262144) -- the full capacity of the model will not be utilized +llama_context: ROCm_Host output buffer size = 0.58 MiB +llama_kv_cache_unified: ROCm0 KV buffer size = 752.00 MiB +llama_kv_cache_unified: size = 752.00 MiB ( 4096 cells, 94 layers, 1/ 1 seqs), K (f16): 376.00 MiB, V (f16): 376.00 MiB +llama_kv_cache_unified: LLAMA_SET_ROWS=0, using old ggml_cpy() method for backwards compatibility +llama_context: ROCm0 compute buffer size = 304.75 MiB +llama_context: ROCm_Host compute buffer size = 16.01 MiB +llama_context: graph nodes = 6023 +llama_context: graph splits = 2 +common_init_from_params: added <|endoftext|> logit bias = -inf +common_init_from_params: added <|im_end|> logit bias = -inf +common_init_from_params: added <|fim_pad|> logit bias = -inf +common_init_from_params: added <|repo_name|> logit bias = -inf +common_init_from_params: added <|file_sep|> logit bias = -inf +common_init_from_params: setting dry_penalty_last_n to ctx_size = 4096 +common_init_from_params: warming up the model with an empty run - please wait ... 
(--no-warmup to disable) +main: llama threadpool init, n_threads = 16 + +system_info: n_threads = 16 (n_threads_batch = 16) / 32 | ROCm : NO_VMM = 1 | PEER_MAX_BATCH_SIZE = 128 | CPU : SSE3 = 1 | SSSE3 = 1 | AVX = 1 | AVX2 = 1 | F16C = 1 | FMA = 1 | BMI2 = 1 | AVX512 = 1 | AVX512_VBMI = 1 | AVX512_VNNI = 1 | LLAMAFILE = 1 | OPENMP = 1 | REPACK = 1 | + +sampler seed: 4068503868 +sampler params: + repeat_last_n = 64, repeat_penalty = 1.000, frequency_penalty = 0.000, presence_penalty = 0.000 + dry_multiplier = 0.000, dry_base = 1.750, dry_allowed_length = 2, dry_penalty_last_n = 4096 + top_k = 40, top_p = 0.950, min_p = 0.050, xtc_probability = 0.000, xtc_threshold = 0.100, typical_p = 1.000, top_n_sigma = -1.000, temp = 0.800 + mirostat = 0, mirostat_lr = 0.100, mirostat_ent = 5.000 +sampler chain: logits -> logit-bias -> penalties -> dry -> top-n-sigma -> top-k -> typical -> top-p -> min-p -> xtc -> temp-ext -> dist +generate: n_ctx = 4096, n_batch = 2048, n_predict = 1, n_keep = 0 + +Hello, + +llama_perf_sampler_print: sampling time = 0.06 ms / 2 runs ( 0.03 ms per token, 35087.72 tokens per second) +llama_perf_context_print: load time = 34531.90 ms +llama_perf_context_print: prompt eval time = 0.00 ms / 1 tokens ( 0.00 ms per token, inf tokens per second) +llama_perf_context_print: eval time = 74.04 ms / 1 runs ( 74.04 ms per token, 13.51 tokens per second) +llama_perf_context_print: total time = 87.46 ms / 2 tokens +llama_perf_context_print: graphs reused = 0 + Elapsed #3: 38.606270419s + Run #3 status: 0 + → Avg over 3 runs: 39.062s diff --git a/benchmark/loadtime_results/Qwen3-235B-A22B-Instruct-2507-UD-Q3_K_XL-00001-of-00003__rocm7_beta.log b/benchmark/loadtime_results/Qwen3-235B-A22B-Instruct-2507-UD-Q3_K_XL-00001-of-00003__rocm7_beta.log new file mode 100644 index 0000000..a59adde --- /dev/null +++ b/benchmark/loadtime_results/Qwen3-235B-A22B-Instruct-2507-UD-Q3_K_XL-00001-of-00003__rocm7_beta.log @@ -0,0 +1,184 @@ +ggml_cuda_init: GGML_CUDA_FORCE_MMQ: no 
+ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no +ggml_cuda_init: found 1 ROCm devices: + Device 0: AMD Radeon Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32 +build: 6040 (66625a59) with cc (GCC) 15.1.1 20250719 (Red Hat 15.1.1-5) for x86_64-redhat-linux +main: llama backend init +main: load the model and apply lora adapter, if any +llama_model_load_from_file_impl: using device ROCm0 (AMD Radeon Graphics) - 124523 MiB free +llama_model_loader: additional 2 GGUFs metadata loaded. +llama_model_loader: loaded meta data with 48 key-value pairs and 1131 tensors from /home/kyuz0/models/qwen-3-235B-Q3_K-XL/UD-Q3_K_XL/Qwen3-235B-A22B-Instruct-2507-UD-Q3_K_XL-00001-of-00003.gguf (version GGUF V3 (latest)) +llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output. +llama_model_loader: - kv 0: general.architecture str = qwen3moe +llama_model_loader: - kv 1: general.type str = model +llama_model_loader: - kv 2: general.name str = Qwen3-235B-A22B-Instruct-2507 +llama_model_loader: - kv 3: general.version str = 2507 +llama_model_loader: - kv 4: general.finetune str = Instruct +llama_model_loader: - kv 5: general.basename str = Qwen3-235B-A22B-Instruct-2507 +llama_model_loader: - kv 6: general.quantized_by str = Unsloth +llama_model_loader: - kv 7: general.size_label str = 235B-A22B +llama_model_loader: - kv 8: general.license str = apache-2.0 +llama_model_loader: - kv 9: general.license.link str = https://huggingface.co/Qwen/Qwen3-235... +llama_model_loader: - kv 10: general.repo_url str = https://huggingface.co/unsloth +llama_model_loader: - kv 11: general.base_model.count u32 = 1 +llama_model_loader: - kv 12: general.base_model.0.name str = Qwen3 235B A22B Instruct 2507 +llama_model_loader: - kv 13: general.base_model.0.version str = 2507 +llama_model_loader: - kv 14: general.base_model.0.organization str = Qwen +llama_model_loader: - kv 15: general.base_model.0.repo_url str = https://huggingface.co/Qwen/Qwen3-235... 
+llama_model_loader: - kv 16: general.tags arr[str,2] = ["unsloth", "text-generation"] +llama_model_loader: - kv 17: qwen3moe.block_count u32 = 94 +llama_model_loader: - kv 18: qwen3moe.context_length u32 = 262144 +llama_model_loader: - kv 19: qwen3moe.embedding_length u32 = 4096 +llama_model_loader: - kv 20: qwen3moe.feed_forward_length u32 = 12288 +llama_model_loader: - kv 21: qwen3moe.attention.head_count u32 = 64 +llama_model_loader: - kv 22: qwen3moe.attention.head_count_kv u32 = 4 +llama_model_loader: - kv 23: qwen3moe.rope.freq_base f32 = 5000000.000000 +llama_model_loader: - kv 24: qwen3moe.attention.layer_norm_rms_epsilon f32 = 0.000001 +llama_model_loader: - kv 25: qwen3moe.expert_used_count u32 = 8 +llama_model_loader: - kv 26: qwen3moe.attention.key_length u32 = 128 +llama_model_loader: - kv 27: qwen3moe.attention.value_length u32 = 128 +llama_model_loader: - kv 28: qwen3moe.expert_count u32 = 128 +llama_model_loader: - kv 29: qwen3moe.expert_feed_forward_length u32 = 1536 +llama_model_loader: - kv 30: tokenizer.ggml.model str = gpt2 +llama_model_loader: - kv 31: tokenizer.ggml.pre str = qwen2 +llama_model_loader: - kv 32: tokenizer.ggml.tokens arr[str,151936] = ["!", "\"", "#", "$", "%", "&", "'", ... +llama_model_loader: - kv 33: tokenizer.ggml.token_type arr[i32,151936] = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ... +llama_model_loader: - kv 34: tokenizer.ggml.merges arr[str,151387] = ["Ġ Ġ", "ĠĠ ĠĠ", "i n", "Ġ t",... +llama_model_loader: - kv 35: tokenizer.ggml.eos_token_id u32 = 151645 +llama_model_loader: - kv 36: tokenizer.ggml.padding_token_id u32 = 151654 +llama_model_loader: - kv 37: tokenizer.ggml.add_bos_token bool = false +llama_model_loader: - kv 38: tokenizer.chat_template str = {%- if tools %}\n {{- '<|im_start|>... +llama_model_loader: - kv 39: general.quantization_version u32 = 2 +llama_model_loader: - kv 40: general.file_type u32 = 12 +llama_model_loader: - kv 41: quantize.imatrix.file str = Qwen3-235B-A22B-Instruct-2507-GGUF/im... 
+llama_model_loader: - kv 42: quantize.imatrix.dataset str = unsloth_calibration_Qwen3-235B-A22B-I... +llama_model_loader: - kv 43: quantize.imatrix.entries_count u32 = 745 +llama_model_loader: - kv 44: quantize.imatrix.chunks_count u32 = 693 +llama_model_loader: - kv 45: split.no u16 = 0 +llama_model_loader: - kv 46: split.tensors.count i32 = 1131 +llama_model_loader: - kv 47: split.count u16 = 3 +llama_model_loader: - type f32: 471 tensors +llama_model_loader: - type q3_K: 267 tensors +llama_model_loader: - type q4_K: 362 tensors +llama_model_loader: - type q5_K: 20 tensors +llama_model_loader: - type q6_K: 11 tensors +print_info: file format = GGUF V3 (latest) +print_info: file type = Q3_K - Medium +print_info: file size = 96.99 GiB (3.54 BPW) +load: special tokens cache size = 26 +load: token to piece cache size = 0.9311 MB +print_info: arch = qwen3moe +print_info: vocab_only = 0 +print_info: n_ctx_train = 262144 +print_info: n_embd = 4096 +print_info: n_layer = 94 +print_info: n_head = 64 +print_info: n_head_kv = 4 +print_info: n_rot = 128 +print_info: n_swa = 0 +print_info: is_swa_any = 0 +print_info: n_embd_head_k = 128 +print_info: n_embd_head_v = 128 +print_info: n_gqa = 16 +print_info: n_embd_k_gqa = 512 +print_info: n_embd_v_gqa = 512 +print_info: f_norm_eps = 0.0e+00 +print_info: f_norm_rms_eps = 1.0e-06 +print_info: f_clamp_kqv = 0.0e+00 +print_info: f_max_alibi_bias = 0.0e+00 +print_info: f_logit_scale = 0.0e+00 +print_info: f_attn_scale = 0.0e+00 +print_info: n_ff = 12288 +print_info: n_expert = 128 +print_info: n_expert_used = 8 +print_info: causal attn = 1 +print_info: pooling type = 0 +print_info: rope type = 2 +print_info: rope scaling = linear +print_info: freq_base_train = 5000000.0 +print_info: freq_scale_train = 1 +print_info: n_ctx_orig_yarn = 262144 +print_info: rope_finetuned = unknown +print_info: model type = 235B.A22B +print_info: model params = 235.09 B +print_info: general.name = Qwen3-235B-A22B-Instruct-2507 +print_info: n_ff_exp = 
1536 +print_info: vocab type = BPE +print_info: n_vocab = 151936 +print_info: n_merges = 151387 +print_info: BOS token = 11 ',' +print_info: EOS token = 151645 '<|im_end|>' +print_info: EOT token = 151645 '<|im_end|>' +print_info: PAD token = 151654 '<|vision_pad|>' +print_info: LF token = 198 'Ċ' +print_info: FIM PRE token = 151659 '<|fim_prefix|>' +print_info: FIM SUF token = 151661 '<|fim_suffix|>' +print_info: FIM MID token = 151660 '<|fim_middle|>' +print_info: FIM PAD token = 151662 '<|fim_pad|>' +print_info: FIM REP token = 151663 '<|repo_name|>' +print_info: FIM SEP token = 151664 '<|file_sep|>' +print_info: EOG token = 151643 '<|endoftext|>' +print_info: EOG token = 151645 '<|im_end|>' +print_info: EOG token = 151662 '<|fim_pad|>' +print_info: EOG token = 151663 '<|repo_name|>' +print_info: EOG token = 151664 '<|file_sep|>' +print_info: max token length = 256 +load_tensors: loading model tensors, this can take a while... (mmap = false) +load_tensors: offloading 94 repeating layers to GPU +load_tensors: offloading output layer to GPU +load_tensors: offloaded 95/95 layers to GPU +load_tensors: CPU model buffer size = 333.84 MiB +load_tensors: ROCm0 model buffer size = 98988.40 MiB +.................................................................................................... 
+llama_context: constructing llama_context +llama_context: non-unified KV cache requires ggml_set_rows() - forcing unified KV cache +llama_context: n_seq_max = 1 +llama_context: n_ctx = 4096 +llama_context: n_ctx_per_seq = 4096 +llama_context: n_batch = 2048 +llama_context: n_ubatch = 512 +llama_context: causal_attn = 1 +llama_context: flash_attn = 1 +llama_context: kv_unified = true +llama_context: freq_base = 5000000.0 +llama_context: freq_scale = 1 +llama_context: n_ctx_per_seq (4096) < n_ctx_train (262144) -- the full capacity of the model will not be utilized +llama_context: ROCm_Host output buffer size = 0.58 MiB +llama_kv_cache_unified: ROCm0 KV buffer size = 752.00 MiB +llama_kv_cache_unified: size = 752.00 MiB ( 4096 cells, 94 layers, 1/ 1 seqs), K (f16): 376.00 MiB, V (f16): 376.00 MiB +llama_kv_cache_unified: LLAMA_SET_ROWS=0, using old ggml_cpy() method for backwards compatibility +llama_context: ROCm0 compute buffer size = 304.75 MiB +llama_context: ROCm_Host compute buffer size = 16.01 MiB +llama_context: graph nodes = 6023 +llama_context: graph splits = 2 +common_init_from_params: added <|endoftext|> logit bias = -inf +common_init_from_params: added <|im_end|> logit bias = -inf +common_init_from_params: added <|fim_pad|> logit bias = -inf +common_init_from_params: added <|repo_name|> logit bias = -inf +common_init_from_params: added <|file_sep|> logit bias = -inf +common_init_from_params: setting dry_penalty_last_n to ctx_size = 4096 +common_init_from_params: warming up the model with an empty run - please wait ... 
(--no-warmup to disable) +main: llama threadpool init, n_threads = 16 + +system_info: n_threads = 16 (n_threads_batch = 16) / 32 | ROCm : NO_VMM = 1 | PEER_MAX_BATCH_SIZE = 128 | CPU : SSE3 = 1 | SSSE3 = 1 | AVX = 1 | AVX2 = 1 | F16C = 1 | FMA = 1 | BMI2 = 1 | AVX512 = 1 | AVX512_VBMI = 1 | AVX512_VNNI = 1 | LLAMAFILE = 1 | OPENMP = 1 | REPACK = 1 | + +sampler seed: 698255200 +sampler params: + repeat_last_n = 64, repeat_penalty = 1.000, frequency_penalty = 0.000, presence_penalty = 0.000 + dry_multiplier = 0.000, dry_base = 1.750, dry_allowed_length = 2, dry_penalty_last_n = 4096 + top_k = 40, top_p = 0.950, min_p = 0.050, xtc_probability = 0.000, xtc_threshold = 0.100, typical_p = 1.000, top_n_sigma = -1.000, temp = 0.800 + mirostat = 0, mirostat_lr = 0.100, mirostat_ent = 5.000 +sampler chain: logits -> logit-bias -> penalties -> dry -> top-n-sigma -> top-k -> typical -> top-p -> min-p -> xtc -> temp-ext -> dist +generate: n_ctx = 4096, n_batch = 2048, n_predict = 1, n_keep = 0 + +Hello! 
+ +llama_perf_sampler_print: sampling time = 0.05 ms / 2 runs ( 0.03 ms per token, 37037.04 tokens per second) +llama_perf_context_print: load time = 34496.41 ms +llama_perf_context_print: prompt eval time = 0.00 ms / 1 tokens ( 0.00 ms per token, inf tokens per second) +llama_perf_context_print: eval time = 74.48 ms / 1 runs ( 74.48 ms per token, 13.43 tokens per second) +llama_perf_context_print: total time = 87.80 ms / 2 tokens +llama_perf_context_print: graphs reused = 0 + Elapsed #3: 35.247053632s + Run #3 status: 0 + → Avg over 3 runs: 35.392s diff --git a/benchmark/loadtime_results/Qwen3-235B-A22B-Instruct-2507-UD-Q3_K_XL-00001-of-00003__rocm7_rc.log b/benchmark/loadtime_results/Qwen3-235B-A22B-Instruct-2507-UD-Q3_K_XL-00001-of-00003__rocm7_rc.log new file mode 100644 index 0000000..53a04cc --- /dev/null +++ b/benchmark/loadtime_results/Qwen3-235B-A22B-Instruct-2507-UD-Q3_K_XL-00001-of-00003__rocm7_rc.log @@ -0,0 +1,184 @@ +ggml_cuda_init: GGML_CUDA_FORCE_MMQ: no +ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no +ggml_cuda_init: found 1 ROCm devices: + Device 0: AMD Radeon Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32 +build: 6066 (4cb208c9) with cc (GCC) 15.1.1 20250719 (Red Hat 15.1.1-5) for x86_64-redhat-linux +main: llama backend init +main: load the model and apply lora adapter, if any +llama_model_load_from_file_impl: using device ROCm0 (AMD Radeon Graphics) - 124523 MiB free +llama_model_loader: additional 2 GGUFs metadata loaded. +llama_model_loader: loaded meta data with 48 key-value pairs and 1131 tensors from /home/kyuz0/models/qwen-3-235B-Q3_K-XL/UD-Q3_K_XL/Qwen3-235B-A22B-Instruct-2507-UD-Q3_K_XL-00001-of-00003.gguf (version GGUF V3 (latest)) +llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output. 
+llama_model_loader: - kv 0: general.architecture str = qwen3moe +llama_model_loader: - kv 1: general.type str = model +llama_model_loader: - kv 2: general.name str = Qwen3-235B-A22B-Instruct-2507 +llama_model_loader: - kv 3: general.version str = 2507 +llama_model_loader: - kv 4: general.finetune str = Instruct +llama_model_loader: - kv 5: general.basename str = Qwen3-235B-A22B-Instruct-2507 +llama_model_loader: - kv 6: general.quantized_by str = Unsloth +llama_model_loader: - kv 7: general.size_label str = 235B-A22B +llama_model_loader: - kv 8: general.license str = apache-2.0 +llama_model_loader: - kv 9: general.license.link str = https://huggingface.co/Qwen/Qwen3-235... +llama_model_loader: - kv 10: general.repo_url str = https://huggingface.co/unsloth +llama_model_loader: - kv 11: general.base_model.count u32 = 1 +llama_model_loader: - kv 12: general.base_model.0.name str = Qwen3 235B A22B Instruct 2507 +llama_model_loader: - kv 13: general.base_model.0.version str = 2507 +llama_model_loader: - kv 14: general.base_model.0.organization str = Qwen +llama_model_loader: - kv 15: general.base_model.0.repo_url str = https://huggingface.co/Qwen/Qwen3-235... 
+llama_model_loader: - kv 16: general.tags arr[str,2] = ["unsloth", "text-generation"] +llama_model_loader: - kv 17: qwen3moe.block_count u32 = 94 +llama_model_loader: - kv 18: qwen3moe.context_length u32 = 262144 +llama_model_loader: - kv 19: qwen3moe.embedding_length u32 = 4096 +llama_model_loader: - kv 20: qwen3moe.feed_forward_length u32 = 12288 +llama_model_loader: - kv 21: qwen3moe.attention.head_count u32 = 64 +llama_model_loader: - kv 22: qwen3moe.attention.head_count_kv u32 = 4 +llama_model_loader: - kv 23: qwen3moe.rope.freq_base f32 = 5000000.000000 +llama_model_loader: - kv 24: qwen3moe.attention.layer_norm_rms_epsilon f32 = 0.000001 +llama_model_loader: - kv 25: qwen3moe.expert_used_count u32 = 8 +llama_model_loader: - kv 26: qwen3moe.attention.key_length u32 = 128 +llama_model_loader: - kv 27: qwen3moe.attention.value_length u32 = 128 +llama_model_loader: - kv 28: qwen3moe.expert_count u32 = 128 +llama_model_loader: - kv 29: qwen3moe.expert_feed_forward_length u32 = 1536 +llama_model_loader: - kv 30: tokenizer.ggml.model str = gpt2 +llama_model_loader: - kv 31: tokenizer.ggml.pre str = qwen2 +llama_model_loader: - kv 32: tokenizer.ggml.tokens arr[str,151936] = ["!", "\"", "#", "$", "%", "&", "'", ... +llama_model_loader: - kv 33: tokenizer.ggml.token_type arr[i32,151936] = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ... +llama_model_loader: - kv 34: tokenizer.ggml.merges arr[str,151387] = ["Ġ Ġ", "ĠĠ ĠĠ", "i n", "Ġ t",... +llama_model_loader: - kv 35: tokenizer.ggml.eos_token_id u32 = 151645 +llama_model_loader: - kv 36: tokenizer.ggml.padding_token_id u32 = 151654 +llama_model_loader: - kv 37: tokenizer.ggml.add_bos_token bool = false +llama_model_loader: - kv 38: tokenizer.chat_template str = {%- if tools %}\n {{- '<|im_start|>... +llama_model_loader: - kv 39: general.quantization_version u32 = 2 +llama_model_loader: - kv 40: general.file_type u32 = 12 +llama_model_loader: - kv 41: quantize.imatrix.file str = Qwen3-235B-A22B-Instruct-2507-GGUF/im... 
+llama_model_loader: - kv 42: quantize.imatrix.dataset str = unsloth_calibration_Qwen3-235B-A22B-I... +llama_model_loader: - kv 43: quantize.imatrix.entries_count u32 = 745 +llama_model_loader: - kv 44: quantize.imatrix.chunks_count u32 = 693 +llama_model_loader: - kv 45: split.no u16 = 0 +llama_model_loader: - kv 46: split.tensors.count i32 = 1131 +llama_model_loader: - kv 47: split.count u16 = 3 +llama_model_loader: - type f32: 471 tensors +llama_model_loader: - type q3_K: 267 tensors +llama_model_loader: - type q4_K: 362 tensors +llama_model_loader: - type q5_K: 20 tensors +llama_model_loader: - type q6_K: 11 tensors +print_info: file format = GGUF V3 (latest) +print_info: file type = Q3_K - Medium +print_info: file size = 96.99 GiB (3.54 BPW) +load: special tokens cache size = 26 +load: token to piece cache size = 0.9311 MB +print_info: arch = qwen3moe +print_info: vocab_only = 0 +print_info: n_ctx_train = 262144 +print_info: n_embd = 4096 +print_info: n_layer = 94 +print_info: n_head = 64 +print_info: n_head_kv = 4 +print_info: n_rot = 128 +print_info: n_swa = 0 +print_info: is_swa_any = 0 +print_info: n_embd_head_k = 128 +print_info: n_embd_head_v = 128 +print_info: n_gqa = 16 +print_info: n_embd_k_gqa = 512 +print_info: n_embd_v_gqa = 512 +print_info: f_norm_eps = 0.0e+00 +print_info: f_norm_rms_eps = 1.0e-06 +print_info: f_clamp_kqv = 0.0e+00 +print_info: f_max_alibi_bias = 0.0e+00 +print_info: f_logit_scale = 0.0e+00 +print_info: f_attn_scale = 0.0e+00 +print_info: n_ff = 12288 +print_info: n_expert = 128 +print_info: n_expert_used = 8 +print_info: causal attn = 1 +print_info: pooling type = 0 +print_info: rope type = 2 +print_info: rope scaling = linear +print_info: freq_base_train = 5000000.0 +print_info: freq_scale_train = 1 +print_info: n_ctx_orig_yarn = 262144 +print_info: rope_finetuned = unknown +print_info: model type = 235B.A22B +print_info: model params = 235.09 B +print_info: general.name = Qwen3-235B-A22B-Instruct-2507 +print_info: n_ff_exp = 
1536 +print_info: vocab type = BPE +print_info: n_vocab = 151936 +print_info: n_merges = 151387 +print_info: BOS token = 11 ',' +print_info: EOS token = 151645 '<|im_end|>' +print_info: EOT token = 151645 '<|im_end|>' +print_info: PAD token = 151654 '<|vision_pad|>' +print_info: LF token = 198 'Ċ' +print_info: FIM PRE token = 151659 '<|fim_prefix|>' +print_info: FIM SUF token = 151661 '<|fim_suffix|>' +print_info: FIM MID token = 151660 '<|fim_middle|>' +print_info: FIM PAD token = 151662 '<|fim_pad|>' +print_info: FIM REP token = 151663 '<|repo_name|>' +print_info: FIM SEP token = 151664 '<|file_sep|>' +print_info: EOG token = 151643 '<|endoftext|>' +print_info: EOG token = 151645 '<|im_end|>' +print_info: EOG token = 151662 '<|fim_pad|>' +print_info: EOG token = 151663 '<|repo_name|>' +print_info: EOG token = 151664 '<|file_sep|>' +print_info: max token length = 256 +load_tensors: loading model tensors, this can take a while... (mmap = false) +load_tensors: offloading 94 repeating layers to GPU +load_tensors: offloading output layer to GPU +load_tensors: offloaded 95/95 layers to GPU +load_tensors: CPU model buffer size = 333.84 MiB +load_tensors: ROCm0 model buffer size = 98988.40 MiB +.................................................................................................... 
+llama_context: constructing llama_context +llama_context: non-unified KV cache requires ggml_set_rows() - forcing unified KV cache +llama_context: n_seq_max = 1 +llama_context: n_ctx = 4096 +llama_context: n_ctx_per_seq = 4096 +llama_context: n_batch = 2048 +llama_context: n_ubatch = 512 +llama_context: causal_attn = 1 +llama_context: flash_attn = 1 +llama_context: kv_unified = true +llama_context: freq_base = 5000000.0 +llama_context: freq_scale = 1 +llama_context: n_ctx_per_seq (4096) < n_ctx_train (262144) -- the full capacity of the model will not be utilized +llama_context: ROCm_Host output buffer size = 0.58 MiB +llama_kv_cache_unified: ROCm0 KV buffer size = 752.00 MiB +llama_kv_cache_unified: size = 752.00 MiB ( 4096 cells, 94 layers, 1/ 1 seqs), K (f16): 376.00 MiB, V (f16): 376.00 MiB +llama_kv_cache_unified: LLAMA_SET_ROWS=0, using old ggml_cpy() method for backwards compatibility +llama_context: ROCm0 compute buffer size = 304.75 MiB +llama_context: ROCm_Host compute buffer size = 16.01 MiB +llama_context: graph nodes = 6023 +llama_context: graph splits = 2 +common_init_from_params: added <|endoftext|> logit bias = -inf +common_init_from_params: added <|im_end|> logit bias = -inf +common_init_from_params: added <|fim_pad|> logit bias = -inf +common_init_from_params: added <|repo_name|> logit bias = -inf +common_init_from_params: added <|file_sep|> logit bias = -inf +common_init_from_params: setting dry_penalty_last_n to ctx_size = 4096 +common_init_from_params: warming up the model with an empty run - please wait ... 
(--no-warmup to disable) +main: llama threadpool init, n_threads = 16 + +system_info: n_threads = 16 (n_threads_batch = 16) / 32 | ROCm : NO_VMM = 1 | PEER_MAX_BATCH_SIZE = 128 | CPU : SSE3 = 1 | SSSE3 = 1 | AVX = 1 | AVX2 = 1 | F16C = 1 | FMA = 1 | BMI2 = 1 | AVX512 = 1 | AVX512_VBMI = 1 | AVX512_VNNI = 1 | LLAMAFILE = 1 | OPENMP = 1 | REPACK = 1 | + +sampler seed: 715670654 +sampler params: + repeat_last_n = 64, repeat_penalty = 1.000, frequency_penalty = 0.000, presence_penalty = 0.000 + dry_multiplier = 0.000, dry_base = 1.750, dry_allowed_length = 2, dry_penalty_last_n = 4096 + top_k = 40, top_p = 0.950, min_p = 0.050, xtc_probability = 0.000, xtc_threshold = 0.100, typical_p = 1.000, top_n_sigma = -1.000, temp = 0.800 + mirostat = 0, mirostat_lr = 0.100, mirostat_ent = 5.000 +sampler chain: logits -> logit-bias -> penalties -> dry -> top-n-sigma -> top-k -> typical -> top-p -> min-p -> xtc -> temp-ext -> dist +generate: n_ctx = 4096, n_batch = 2048, n_predict = 1, n_keep = 0 + +Hello, + +llama_perf_sampler_print: sampling time = 0.06 ms / 2 runs ( 0.03 ms per token, 34482.76 tokens per second) +llama_perf_context_print: load time = 31968.90 ms +llama_perf_context_print: prompt eval time = 0.00 ms / 1 tokens ( 0.00 ms per token, inf tokens per second) +llama_perf_context_print: eval time = 73.79 ms / 1 runs ( 73.79 ms per token, 13.55 tokens per second) +llama_perf_context_print: total time = 87.27 ms / 2 tokens +llama_perf_context_print: graphs reused = 0 + Elapsed #3: 32.781452355s + Run #3 status: 0 + → Avg over 3 runs: 33.458s diff --git a/benchmark/loadtime_results/Qwen3-235B-A22B-Instruct-2507-UD-Q3_K_XL-00001-of-00003__vulkan_amdvlk.log b/benchmark/loadtime_results/Qwen3-235B-A22B-Instruct-2507-UD-Q3_K_XL-00001-of-00003__vulkan_amdvlk.log new file mode 100644 index 0000000..6d7f34b --- /dev/null +++ b/benchmark/loadtime_results/Qwen3-235B-A22B-Instruct-2507-UD-Q3_K_XL-00001-of-00003__vulkan_amdvlk.log @@ -0,0 +1,182 @@ +ggml_vulkan: Found 1 Vulkan 
devices: +ggml_vulkan: 0 = Radeon 8060S Graphics (AMD open-source driver) | uma: 1 | fp16: 1 | bf16: 0 | warp size: 64 | shared memory: 32768 | int dot: 1 | matrix cores: KHR_coopmat +build: 6060 (9c35706b) with cc (GCC) 15.1.1 20250719 (Red Hat 15.1.1-5) for x86_64-redhat-linux +main: llama backend init +main: load the model and apply lora adapter, if any +llama_model_load_from_file_impl: using device Vulkan0 (Radeon 8060S Graphics) - 85720 MiB free +llama_model_loader: additional 2 GGUFs metadata loaded. +llama_model_loader: loaded meta data with 48 key-value pairs and 1131 tensors from /home/kyuz0/models/qwen-3-235B-Q3_K-XL/UD-Q3_K_XL/Qwen3-235B-A22B-Instruct-2507-UD-Q3_K_XL-00001-of-00003.gguf (version GGUF V3 (latest)) +llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output. +llama_model_loader: - kv 0: general.architecture str = qwen3moe +llama_model_loader: - kv 1: general.type str = model +llama_model_loader: - kv 2: general.name str = Qwen3-235B-A22B-Instruct-2507 +llama_model_loader: - kv 3: general.version str = 2507 +llama_model_loader: - kv 4: general.finetune str = Instruct +llama_model_loader: - kv 5: general.basename str = Qwen3-235B-A22B-Instruct-2507 +llama_model_loader: - kv 6: general.quantized_by str = Unsloth +llama_model_loader: - kv 7: general.size_label str = 235B-A22B +llama_model_loader: - kv 8: general.license str = apache-2.0 +llama_model_loader: - kv 9: general.license.link str = https://huggingface.co/Qwen/Qwen3-235... 
+llama_model_loader: - kv 10: general.repo_url str = https://huggingface.co/unsloth +llama_model_loader: - kv 11: general.base_model.count u32 = 1 +llama_model_loader: - kv 12: general.base_model.0.name str = Qwen3 235B A22B Instruct 2507 +llama_model_loader: - kv 13: general.base_model.0.version str = 2507 +llama_model_loader: - kv 14: general.base_model.0.organization str = Qwen +llama_model_loader: - kv 15: general.base_model.0.repo_url str = https://huggingface.co/Qwen/Qwen3-235... +llama_model_loader: - kv 16: general.tags arr[str,2] = ["unsloth", "text-generation"] +llama_model_loader: - kv 17: qwen3moe.block_count u32 = 94 +llama_model_loader: - kv 18: qwen3moe.context_length u32 = 262144 +llama_model_loader: - kv 19: qwen3moe.embedding_length u32 = 4096 +llama_model_loader: - kv 20: qwen3moe.feed_forward_length u32 = 12288 +llama_model_loader: - kv 21: qwen3moe.attention.head_count u32 = 64 +llama_model_loader: - kv 22: qwen3moe.attention.head_count_kv u32 = 4 +llama_model_loader: - kv 23: qwen3moe.rope.freq_base f32 = 5000000.000000 +llama_model_loader: - kv 24: qwen3moe.attention.layer_norm_rms_epsilon f32 = 0.000001 +llama_model_loader: - kv 25: qwen3moe.expert_used_count u32 = 8 +llama_model_loader: - kv 26: qwen3moe.attention.key_length u32 = 128 +llama_model_loader: - kv 27: qwen3moe.attention.value_length u32 = 128 +llama_model_loader: - kv 28: qwen3moe.expert_count u32 = 128 +llama_model_loader: - kv 29: qwen3moe.expert_feed_forward_length u32 = 1536 +llama_model_loader: - kv 30: tokenizer.ggml.model str = gpt2 +llama_model_loader: - kv 31: tokenizer.ggml.pre str = qwen2 +llama_model_loader: - kv 32: tokenizer.ggml.tokens arr[str,151936] = ["!", "\"", "#", "$", "%", "&", "'", ... +llama_model_loader: - kv 33: tokenizer.ggml.token_type arr[i32,151936] = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ... +llama_model_loader: - kv 34: tokenizer.ggml.merges arr[str,151387] = ["Ġ Ġ", "ĠĠ ĠĠ", "i n", "Ġ t",... 
+llama_model_loader: - kv 35: tokenizer.ggml.eos_token_id u32 = 151645 +llama_model_loader: - kv 36: tokenizer.ggml.padding_token_id u32 = 151654 +llama_model_loader: - kv 37: tokenizer.ggml.add_bos_token bool = false +llama_model_loader: - kv 38: tokenizer.chat_template str = {%- if tools %}\n {{- '<|im_start|>... +llama_model_loader: - kv 39: general.quantization_version u32 = 2 +llama_model_loader: - kv 40: general.file_type u32 = 12 +llama_model_loader: - kv 41: quantize.imatrix.file str = Qwen3-235B-A22B-Instruct-2507-GGUF/im... +llama_model_loader: - kv 42: quantize.imatrix.dataset str = unsloth_calibration_Qwen3-235B-A22B-I... +llama_model_loader: - kv 43: quantize.imatrix.entries_count u32 = 745 +llama_model_loader: - kv 44: quantize.imatrix.chunks_count u32 = 693 +llama_model_loader: - kv 45: split.no u16 = 0 +llama_model_loader: - kv 46: split.tensors.count i32 = 1131 +llama_model_loader: - kv 47: split.count u16 = 3 +llama_model_loader: - type f32: 471 tensors +llama_model_loader: - type q3_K: 267 tensors +llama_model_loader: - type q4_K: 362 tensors +llama_model_loader: - type q5_K: 20 tensors +llama_model_loader: - type q6_K: 11 tensors +print_info: file format = GGUF V3 (latest) +print_info: file type = Q3_K - Medium +print_info: file size = 96.99 GiB (3.54 BPW) +load: special tokens cache size = 26 +load: token to piece cache size = 0.9311 MB +print_info: arch = qwen3moe +print_info: vocab_only = 0 +print_info: n_ctx_train = 262144 +print_info: n_embd = 4096 +print_info: n_layer = 94 +print_info: n_head = 64 +print_info: n_head_kv = 4 +print_info: n_rot = 128 +print_info: n_swa = 0 +print_info: is_swa_any = 0 +print_info: n_embd_head_k = 128 +print_info: n_embd_head_v = 128 +print_info: n_gqa = 16 +print_info: n_embd_k_gqa = 512 +print_info: n_embd_v_gqa = 512 +print_info: f_norm_eps = 0.0e+00 +print_info: f_norm_rms_eps = 1.0e-06 +print_info: f_clamp_kqv = 0.0e+00 +print_info: f_max_alibi_bias = 0.0e+00 +print_info: f_logit_scale = 0.0e+00 
+print_info: f_attn_scale = 0.0e+00 +print_info: n_ff = 12288 +print_info: n_expert = 128 +print_info: n_expert_used = 8 +print_info: causal attn = 1 +print_info: pooling type = 0 +print_info: rope type = 2 +print_info: rope scaling = linear +print_info: freq_base_train = 5000000.0 +print_info: freq_scale_train = 1 +print_info: n_ctx_orig_yarn = 262144 +print_info: rope_finetuned = unknown +print_info: model type = 235B.A22B +print_info: model params = 235.09 B +print_info: general.name = Qwen3-235B-A22B-Instruct-2507 +print_info: n_ff_exp = 1536 +print_info: vocab type = BPE +print_info: n_vocab = 151936 +print_info: n_merges = 151387 +print_info: BOS token = 11 ',' +print_info: EOS token = 151645 '<|im_end|>' +print_info: EOT token = 151645 '<|im_end|>' +print_info: PAD token = 151654 '<|vision_pad|>' +print_info: LF token = 198 'Ċ' +print_info: FIM PRE token = 151659 '<|fim_prefix|>' +print_info: FIM SUF token = 151661 '<|fim_suffix|>' +print_info: FIM MID token = 151660 '<|fim_middle|>' +print_info: FIM PAD token = 151662 '<|fim_pad|>' +print_info: FIM REP token = 151663 '<|repo_name|>' +print_info: FIM SEP token = 151664 '<|file_sep|>' +print_info: EOG token = 151643 '<|endoftext|>' +print_info: EOG token = 151645 '<|im_end|>' +print_info: EOG token = 151662 '<|fim_pad|>' +print_info: EOG token = 151663 '<|repo_name|>' +print_info: EOG token = 151664 '<|file_sep|>' +print_info: max token length = 256 +load_tensors: loading model tensors, this can take a while... (mmap = false) +load_tensors: offloading 94 repeating layers to GPU +load_tensors: offloading output layer to GPU +load_tensors: offloaded 95/95 layers to GPU +load_tensors: Vulkan0 model buffer size = 98988.40 MiB +load_tensors: CPU model buffer size = 333.84 MiB +.................................................................................................... 
+llama_context: constructing llama_context +llama_context: non-unified KV cache requires ggml_set_rows() - forcing unified KV cache +llama_context: n_seq_max = 1 +llama_context: n_ctx = 4096 +llama_context: n_ctx_per_seq = 4096 +llama_context: n_batch = 2048 +llama_context: n_ubatch = 512 +llama_context: causal_attn = 1 +llama_context: flash_attn = 1 +llama_context: kv_unified = true +llama_context: freq_base = 5000000.0 +llama_context: freq_scale = 1 +llama_context: n_ctx_per_seq (4096) < n_ctx_train (262144) -- the full capacity of the model will not be utilized +llama_context: Vulkan_Host output buffer size = 0.58 MiB +llama_kv_cache_unified: Vulkan0 KV buffer size = 752.00 MiB +llama_kv_cache_unified: size = 752.00 MiB ( 4096 cells, 94 layers, 1/ 1 seqs), K (f16): 376.00 MiB, V (f16): 376.00 MiB +llama_kv_cache_unified: LLAMA_SET_ROWS=0, using old ggml_cpy() method for backwards compatibility +llama_context: Vulkan0 compute buffer size = 304.75 MiB +llama_context: Vulkan_Host compute buffer size = 16.01 MiB +llama_context: graph nodes = 6023 +llama_context: graph splits = 2 +common_init_from_params: added <|endoftext|> logit bias = -inf +common_init_from_params: added <|im_end|> logit bias = -inf +common_init_from_params: added <|fim_pad|> logit bias = -inf +common_init_from_params: added <|repo_name|> logit bias = -inf +common_init_from_params: added <|file_sep|> logit bias = -inf +common_init_from_params: setting dry_penalty_last_n to ctx_size = 4096 +common_init_from_params: warming up the model with an empty run - please wait ... 
(--no-warmup to disable) +main: llama threadpool init, n_threads = 16 + +system_info: n_threads = 16 (n_threads_batch = 16) / 32 | CPU : SSE3 = 1 | SSSE3 = 1 | AVX = 1 | AVX2 = 1 | F16C = 1 | FMA = 1 | BMI2 = 1 | AVX512 = 1 | AVX512_VBMI = 1 | AVX512_VNNI = 1 | LLAMAFILE = 1 | OPENMP = 1 | REPACK = 1 | + +sampler seed: 4076614647 +sampler params: + repeat_last_n = 64, repeat_penalty = 1.000, frequency_penalty = 0.000, presence_penalty = 0.000 + dry_multiplier = 0.000, dry_base = 1.750, dry_allowed_length = 2, dry_penalty_last_n = 4096 + top_k = 40, top_p = 0.950, min_p = 0.050, xtc_probability = 0.000, xtc_threshold = 0.100, typical_p = 1.000, top_n_sigma = -1.000, temp = 0.800 + mirostat = 0, mirostat_lr = 0.100, mirostat_ent = 5.000 +sampler chain: logits -> logit-bias -> penalties -> dry -> top-n-sigma -> top-k -> typical -> top-p -> min-p -> xtc -> temp-ext -> dist +generate: n_ctx = 4096, n_batch = 2048, n_predict = 1, n_keep = 0 + +Hello, + +llama_perf_sampler_print: sampling time = 0.07 ms / 2 runs ( 0.04 ms per token, 28571.43 tokens per second) +llama_perf_context_print: load time = 40072.88 ms +llama_perf_context_print: prompt eval time = 0.00 ms / 1 tokens ( 0.00 ms per token, inf tokens per second) +llama_perf_context_print: eval time = 67.40 ms / 1 runs ( 67.40 ms per token, 14.84 tokens per second) +llama_perf_context_print: total time = 86.12 ms / 2 tokens +llama_perf_context_print: graphs reused = 0 + Elapsed #3: 43.569299668s + Run #3 status: 0 + → Avg over 3 runs: 44.883s diff --git a/benchmark/loadtime_results/Qwen3-235B-A22B-Instruct-2507-UD-Q3_K_XL-00001-of-00003__vulkan_radv.log b/benchmark/loadtime_results/Qwen3-235B-A22B-Instruct-2507-UD-Q3_K_XL-00001-of-00003__vulkan_radv.log new file mode 100644 index 0000000..e2045f0 --- /dev/null +++ b/benchmark/loadtime_results/Qwen3-235B-A22B-Instruct-2507-UD-Q3_K_XL-00001-of-00003__vulkan_radv.log @@ -0,0 +1,182 @@ +ggml_vulkan: Found 1 Vulkan devices: +ggml_vulkan: 0 = Radeon 8060S Graphics (RADV 
GFX1151) (radv) | uma: 1 | fp16: 1 | bf16: 0 | warp size: 64 | shared memory: 65536 | int dot: 1 | matrix cores: KHR_coopmat +build: 6040 (66625a59) with cc (GCC) 15.1.1 20250719 (Red Hat 15.1.1-5) for x86_64-redhat-linux +main: llama backend init +main: load the model and apply lora adapter, if any +llama_model_load_from_file_impl: using device Vulkan0 (Radeon 8060S Graphics (RADV GFX1151)) - 87722 MiB free +llama_model_loader: additional 2 GGUFs metadata loaded. +llama_model_loader: loaded meta data with 48 key-value pairs and 1131 tensors from /home/kyuz0/models/qwen-3-235B-Q3_K-XL/UD-Q3_K_XL/Qwen3-235B-A22B-Instruct-2507-UD-Q3_K_XL-00001-of-00003.gguf (version GGUF V3 (latest)) +llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output. +llama_model_loader: - kv 0: general.architecture str = qwen3moe +llama_model_loader: - kv 1: general.type str = model +llama_model_loader: - kv 2: general.name str = Qwen3-235B-A22B-Instruct-2507 +llama_model_loader: - kv 3: general.version str = 2507 +llama_model_loader: - kv 4: general.finetune str = Instruct +llama_model_loader: - kv 5: general.basename str = Qwen3-235B-A22B-Instruct-2507 +llama_model_loader: - kv 6: general.quantized_by str = Unsloth +llama_model_loader: - kv 7: general.size_label str = 235B-A22B +llama_model_loader: - kv 8: general.license str = apache-2.0 +llama_model_loader: - kv 9: general.license.link str = https://huggingface.co/Qwen/Qwen3-235... +llama_model_loader: - kv 10: general.repo_url str = https://huggingface.co/unsloth +llama_model_loader: - kv 11: general.base_model.count u32 = 1 +llama_model_loader: - kv 12: general.base_model.0.name str = Qwen3 235B A22B Instruct 2507 +llama_model_loader: - kv 13: general.base_model.0.version str = 2507 +llama_model_loader: - kv 14: general.base_model.0.organization str = Qwen +llama_model_loader: - kv 15: general.base_model.0.repo_url str = https://huggingface.co/Qwen/Qwen3-235... 
+llama_model_loader: - kv 16: general.tags arr[str,2] = ["unsloth", "text-generation"] +llama_model_loader: - kv 17: qwen3moe.block_count u32 = 94 +llama_model_loader: - kv 18: qwen3moe.context_length u32 = 262144 +llama_model_loader: - kv 19: qwen3moe.embedding_length u32 = 4096 +llama_model_loader: - kv 20: qwen3moe.feed_forward_length u32 = 12288 +llama_model_loader: - kv 21: qwen3moe.attention.head_count u32 = 64 +llama_model_loader: - kv 22: qwen3moe.attention.head_count_kv u32 = 4 +llama_model_loader: - kv 23: qwen3moe.rope.freq_base f32 = 5000000.000000 +llama_model_loader: - kv 24: qwen3moe.attention.layer_norm_rms_epsilon f32 = 0.000001 +llama_model_loader: - kv 25: qwen3moe.expert_used_count u32 = 8 +llama_model_loader: - kv 26: qwen3moe.attention.key_length u32 = 128 +llama_model_loader: - kv 27: qwen3moe.attention.value_length u32 = 128 +llama_model_loader: - kv 28: qwen3moe.expert_count u32 = 128 +llama_model_loader: - kv 29: qwen3moe.expert_feed_forward_length u32 = 1536 +llama_model_loader: - kv 30: tokenizer.ggml.model str = gpt2 +llama_model_loader: - kv 31: tokenizer.ggml.pre str = qwen2 +llama_model_loader: - kv 32: tokenizer.ggml.tokens arr[str,151936] = ["!", "\"", "#", "$", "%", "&", "'", ... +llama_model_loader: - kv 33: tokenizer.ggml.token_type arr[i32,151936] = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ... +llama_model_loader: - kv 34: tokenizer.ggml.merges arr[str,151387] = ["Ġ Ġ", "ĠĠ ĠĠ", "i n", "Ġ t",... +llama_model_loader: - kv 35: tokenizer.ggml.eos_token_id u32 = 151645 +llama_model_loader: - kv 36: tokenizer.ggml.padding_token_id u32 = 151654 +llama_model_loader: - kv 37: tokenizer.ggml.add_bos_token bool = false +llama_model_loader: - kv 38: tokenizer.chat_template str = {%- if tools %}\n {{- '<|im_start|>... +llama_model_loader: - kv 39: general.quantization_version u32 = 2 +llama_model_loader: - kv 40: general.file_type u32 = 12 +llama_model_loader: - kv 41: quantize.imatrix.file str = Qwen3-235B-A22B-Instruct-2507-GGUF/im... 
+llama_model_loader: - kv 42: quantize.imatrix.dataset str = unsloth_calibration_Qwen3-235B-A22B-I... +llama_model_loader: - kv 43: quantize.imatrix.entries_count u32 = 745 +llama_model_loader: - kv 44: quantize.imatrix.chunks_count u32 = 693 +llama_model_loader: - kv 45: split.no u16 = 0 +llama_model_loader: - kv 46: split.tensors.count i32 = 1131 +llama_model_loader: - kv 47: split.count u16 = 3 +llama_model_loader: - type f32: 471 tensors +llama_model_loader: - type q3_K: 267 tensors +llama_model_loader: - type q4_K: 362 tensors +llama_model_loader: - type q5_K: 20 tensors +llama_model_loader: - type q6_K: 11 tensors +print_info: file format = GGUF V3 (latest) +print_info: file type = Q3_K - Medium +print_info: file size = 96.99 GiB (3.54 BPW) +load: special tokens cache size = 26 +load: token to piece cache size = 0.9311 MB +print_info: arch = qwen3moe +print_info: vocab_only = 0 +print_info: n_ctx_train = 262144 +print_info: n_embd = 4096 +print_info: n_layer = 94 +print_info: n_head = 64 +print_info: n_head_kv = 4 +print_info: n_rot = 128 +print_info: n_swa = 0 +print_info: is_swa_any = 0 +print_info: n_embd_head_k = 128 +print_info: n_embd_head_v = 128 +print_info: n_gqa = 16 +print_info: n_embd_k_gqa = 512 +print_info: n_embd_v_gqa = 512 +print_info: f_norm_eps = 0.0e+00 +print_info: f_norm_rms_eps = 1.0e-06 +print_info: f_clamp_kqv = 0.0e+00 +print_info: f_max_alibi_bias = 0.0e+00 +print_info: f_logit_scale = 0.0e+00 +print_info: f_attn_scale = 0.0e+00 +print_info: n_ff = 12288 +print_info: n_expert = 128 +print_info: n_expert_used = 8 +print_info: causal attn = 1 +print_info: pooling type = 0 +print_info: rope type = 2 +print_info: rope scaling = linear +print_info: freq_base_train = 5000000.0 +print_info: freq_scale_train = 1 +print_info: n_ctx_orig_yarn = 262144 +print_info: rope_finetuned = unknown +print_info: model type = 235B.A22B +print_info: model params = 235.09 B +print_info: general.name = Qwen3-235B-A22B-Instruct-2507 +print_info: n_ff_exp = 
1536 +print_info: vocab type = BPE +print_info: n_vocab = 151936 +print_info: n_merges = 151387 +print_info: BOS token = 11 ',' +print_info: EOS token = 151645 '<|im_end|>' +print_info: EOT token = 151645 '<|im_end|>' +print_info: PAD token = 151654 '<|vision_pad|>' +print_info: LF token = 198 'Ċ' +print_info: FIM PRE token = 151659 '<|fim_prefix|>' +print_info: FIM SUF token = 151661 '<|fim_suffix|>' +print_info: FIM MID token = 151660 '<|fim_middle|>' +print_info: FIM PAD token = 151662 '<|fim_pad|>' +print_info: FIM REP token = 151663 '<|repo_name|>' +print_info: FIM SEP token = 151664 '<|file_sep|>' +print_info: EOG token = 151643 '<|endoftext|>' +print_info: EOG token = 151645 '<|im_end|>' +print_info: EOG token = 151662 '<|fim_pad|>' +print_info: EOG token = 151663 '<|repo_name|>' +print_info: EOG token = 151664 '<|file_sep|>' +print_info: max token length = 256 +load_tensors: loading model tensors, this can take a while... (mmap = false) +load_tensors: offloading 94 repeating layers to GPU +load_tensors: offloading output layer to GPU +load_tensors: offloaded 95/95 layers to GPU +load_tensors: Vulkan0 model buffer size = 98988.40 MiB +load_tensors: CPU model buffer size = 333.84 MiB +.................................................................................................... 
+llama_context: constructing llama_context +llama_context: non-unified KV cache requires ggml_set_rows() - forcing unified KV cache +llama_context: n_seq_max = 1 +llama_context: n_ctx = 4096 +llama_context: n_ctx_per_seq = 4096 +llama_context: n_batch = 2048 +llama_context: n_ubatch = 512 +llama_context: causal_attn = 1 +llama_context: flash_attn = 1 +llama_context: kv_unified = true +llama_context: freq_base = 5000000.0 +llama_context: freq_scale = 1 +llama_context: n_ctx_per_seq (4096) < n_ctx_train (262144) -- the full capacity of the model will not be utilized +llama_context: Vulkan_Host output buffer size = 0.58 MiB +llama_kv_cache_unified: Vulkan0 KV buffer size = 752.00 MiB +llama_kv_cache_unified: size = 752.00 MiB ( 4096 cells, 94 layers, 1/ 1 seqs), K (f16): 376.00 MiB, V (f16): 376.00 MiB +llama_kv_cache_unified: LLAMA_SET_ROWS=0, using old ggml_cpy() method for backwards compatibility +llama_context: Vulkan0 compute buffer size = 304.75 MiB +llama_context: Vulkan_Host compute buffer size = 16.01 MiB +llama_context: graph nodes = 6023 +llama_context: graph splits = 2 +common_init_from_params: added <|endoftext|> logit bias = -inf +common_init_from_params: added <|im_end|> logit bias = -inf +common_init_from_params: added <|fim_pad|> logit bias = -inf +common_init_from_params: added <|repo_name|> logit bias = -inf +common_init_from_params: added <|file_sep|> logit bias = -inf +common_init_from_params: setting dry_penalty_last_n to ctx_size = 4096 +common_init_from_params: warming up the model with an empty run - please wait ... 
(--no-warmup to disable) +main: llama threadpool init, n_threads = 16 + +system_info: n_threads = 16 (n_threads_batch = 16) / 32 | CPU : SSE3 = 1 | SSSE3 = 1 | AVX = 1 | AVX2 = 1 | F16C = 1 | FMA = 1 | BMI2 = 1 | AVX512 = 1 | AVX512_VBMI = 1 | AVX512_VNNI = 1 | LLAMAFILE = 1 | OPENMP = 1 | REPACK = 1 | + +sampler seed: 1959920459 +sampler params: + repeat_last_n = 64, repeat_penalty = 1.000, frequency_penalty = 0.000, presence_penalty = 0.000 + dry_multiplier = 0.000, dry_base = 1.750, dry_allowed_length = 2, dry_penalty_last_n = 4096 + top_k = 40, top_p = 0.950, min_p = 0.050, xtc_probability = 0.000, xtc_threshold = 0.100, typical_p = 1.000, top_n_sigma = -1.000, temp = 0.800 + mirostat = 0, mirostat_lr = 0.100, mirostat_ent = 5.000 +sampler chain: logits -> logit-bias -> penalties -> dry -> top-n-sigma -> top-k -> typical -> top-p -> min-p -> xtc -> temp-ext -> dist +generate: n_ctx = 4096, n_batch = 2048, n_predict = 1, n_keep = 0 + +Hello, + +llama_perf_sampler_print: sampling time = 0.08 ms / 2 runs ( 0.04 ms per token, 25641.03 tokens per second) +llama_perf_context_print: load time = 40114.24 ms +llama_perf_context_print: prompt eval time = 0.00 ms / 1 tokens ( 0.00 ms per token, inf tokens per second) +llama_perf_context_print: eval time = 67.08 ms / 1 runs ( 67.08 ms per token, 14.91 tokens per second) +llama_perf_context_print: total time = 86.46 ms / 2 tokens +llama_perf_context_print: graphs reused = 0 + Elapsed #3: 40.621909942s + Run #3 status: 0 + → Avg over 3 runs: 40.722s diff --git a/benchmark/loadtime_results/Qwen3-30B-A3B-BF16-00001-of-00002__rocm6_4_2.log b/benchmark/loadtime_results/Qwen3-30B-A3B-BF16-00001-of-00002__rocm6_4_2.log new file mode 100644 index 0000000..58caccd --- /dev/null +++ b/benchmark/loadtime_results/Qwen3-30B-A3B-BF16-00001-of-00002__rocm6_4_2.log @@ -0,0 +1,167 @@ +ggml_cuda_init: GGML_CUDA_FORCE_MMQ: no +ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no +ggml_cuda_init: found 1 ROCm devices: + Device 0: Radeon 8060S Graphics, 
gfx1151 (0x1151), VMM: no, Wave Size: 32 +build: 6040 (66625a59) with cc (GCC) 15.1.1 20250521 (Red Hat 15.1.1-2) for x86_64-redhat-linux +main: llama backend init +main: load the model and apply lora adapter, if any +llama_model_load_from_file_impl: using device ROCm0 (Radeon 8060S Graphics) - 124522 MiB free +llama_model_loader: additional 1 GGUFs metadata loaded. +llama_model_loader: loaded meta data with 34 key-value pairs and 579 tensors from /home/kyuz0/models/qwen-3-30B-A3B/BF16/Qwen3-30B-A3B-BF16-00001-of-00002.gguf (version GGUF V3 (latest)) +llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output. +llama_model_loader: - kv 0: general.architecture str = qwen3moe +llama_model_loader: - kv 1: general.type str = model +llama_model_loader: - kv 2: general.name str = Qwen3-30B-A3B +llama_model_loader: - kv 3: general.basename str = Qwen3-30B-A3B +llama_model_loader: - kv 4: general.quantized_by str = Unsloth +llama_model_loader: - kv 5: general.size_label str = 30B-A3B +llama_model_loader: - kv 6: general.repo_url str = https://huggingface.co/unsloth +llama_model_loader: - kv 7: qwen3moe.block_count u32 = 48 +llama_model_loader: - kv 8: qwen3moe.context_length u32 = 40960 +llama_model_loader: - kv 9: qwen3moe.embedding_length u32 = 2048 +llama_model_loader: - kv 10: qwen3moe.feed_forward_length u32 = 6144 +llama_model_loader: - kv 11: qwen3moe.attention.head_count u32 = 32 +llama_model_loader: - kv 12: qwen3moe.attention.head_count_kv u32 = 4 +llama_model_loader: - kv 13: qwen3moe.rope.freq_base f32 = 1000000.000000 +llama_model_loader: - kv 14: qwen3moe.attention.layer_norm_rms_epsilon f32 = 0.000001 +llama_model_loader: - kv 15: qwen3moe.expert_used_count u32 = 8 +llama_model_loader: - kv 16: qwen3moe.attention.key_length u32 = 128 +llama_model_loader: - kv 17: qwen3moe.attention.value_length u32 = 128 +llama_model_loader: - kv 18: general.file_type u32 = 32 +llama_model_loader: - kv 19: qwen3moe.expert_count u32 = 128 
+llama_model_loader: - kv 20: qwen3moe.expert_feed_forward_length u32 = 768 +llama_model_loader: - kv 21: general.quantization_version u32 = 2 +llama_model_loader: - kv 22: tokenizer.ggml.model str = gpt2 +llama_model_loader: - kv 23: tokenizer.ggml.pre str = qwen2 +llama_model_loader: - kv 24: tokenizer.ggml.tokens arr[str,151936] = ["!", "\"", "#", "$", "%", "&", "'", ... +llama_model_loader: - kv 25: tokenizer.ggml.token_type arr[i32,151936] = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ... +llama_model_loader: - kv 26: tokenizer.ggml.merges arr[str,151387] = ["Ġ Ġ", "ĠĠ ĠĠ", "i n", "Ġ t",... +llama_model_loader: - kv 27: tokenizer.ggml.eos_token_id u32 = 151645 +llama_model_loader: - kv 28: tokenizer.ggml.padding_token_id u32 = 151654 +llama_model_loader: - kv 29: tokenizer.ggml.add_bos_token bool = false +llama_model_loader: - kv 30: tokenizer.chat_template str = {%- if tools %}\n {{- '<|im_start|>... +llama_model_loader: - kv 31: split.no u16 = 0 +llama_model_loader: - kv 32: split.count u16 = 2 +llama_model_loader: - kv 33: split.tensors.count i32 = 579 +llama_model_loader: - type f32: 241 tensors +llama_model_loader: - type bf16: 338 tensors +print_info: file format = GGUF V3 (latest) +print_info: file type = BF16 +print_info: file size = 56.89 GiB (16.01 BPW) +load: special tokens cache size = 26 +load: token to piece cache size = 0.9311 MB +print_info: arch = qwen3moe +print_info: vocab_only = 0 +print_info: n_ctx_train = 40960 +print_info: n_embd = 2048 +print_info: n_layer = 48 +print_info: n_head = 32 +print_info: n_head_kv = 4 +print_info: n_rot = 128 +print_info: n_swa = 0 +print_info: is_swa_any = 0 +print_info: n_embd_head_k = 128 +print_info: n_embd_head_v = 128 +print_info: n_gqa = 8 +print_info: n_embd_k_gqa = 512 +print_info: n_embd_v_gqa = 512 +print_info: f_norm_eps = 0.0e+00 +print_info: f_norm_rms_eps = 1.0e-06 +print_info: f_clamp_kqv = 0.0e+00 +print_info: f_max_alibi_bias = 0.0e+00 +print_info: f_logit_scale = 0.0e+00 +print_info: f_attn_scale 
= 0.0e+00 +print_info: n_ff = 6144 +print_info: n_expert = 128 +print_info: n_expert_used = 8 +print_info: causal attn = 1 +print_info: pooling type = 0 +print_info: rope type = 2 +print_info: rope scaling = linear +print_info: freq_base_train = 1000000.0 +print_info: freq_scale_train = 1 +print_info: n_ctx_orig_yarn = 40960 +print_info: rope_finetuned = unknown +print_info: model type = 30B.A3B +print_info: model params = 30.53 B +print_info: general.name = Qwen3-30B-A3B +print_info: n_ff_exp = 768 +print_info: vocab type = BPE +print_info: n_vocab = 151936 +print_info: n_merges = 151387 +print_info: BOS token = 11 ',' +print_info: EOS token = 151645 '<|im_end|>' +print_info: EOT token = 151645 '<|im_end|>' +print_info: PAD token = 151654 '<|vision_pad|>' +print_info: LF token = 198 'Ċ' +print_info: FIM PRE token = 151659 '<|fim_prefix|>' +print_info: FIM SUF token = 151661 '<|fim_suffix|>' +print_info: FIM MID token = 151660 '<|fim_middle|>' +print_info: FIM PAD token = 151662 '<|fim_pad|>' +print_info: FIM REP token = 151663 '<|repo_name|>' +print_info: FIM SEP token = 151664 '<|file_sep|>' +print_info: EOG token = 151643 '<|endoftext|>' +print_info: EOG token = 151645 '<|im_end|>' +print_info: EOG token = 151662 '<|fim_pad|>' +print_info: EOG token = 151663 '<|repo_name|>' +print_info: EOG token = 151664 '<|file_sep|>' +print_info: max token length = 256 +load_tensors: loading model tensors, this can take a while... (mmap = false) +load_tensors: offloading 48 repeating layers to GPU +load_tensors: offloading output layer to GPU +load_tensors: offloaded 49/49 layers to GPU +load_tensors: ROCm0 model buffer size = 57666.30 MiB +load_tensors: ROCm_Host model buffer size = 593.50 MiB +................................................................................................... 
+llama_context: constructing llama_context +llama_context: non-unified KV cache requires ggml_set_rows() - forcing unified KV cache +llama_context: n_seq_max = 1 +llama_context: n_ctx = 4096 +llama_context: n_ctx_per_seq = 4096 +llama_context: n_batch = 2048 +llama_context: n_ubatch = 512 +llama_context: causal_attn = 1 +llama_context: flash_attn = 1 +llama_context: kv_unified = true +llama_context: freq_base = 1000000.0 +llama_context: freq_scale = 1 +llama_context: n_ctx_per_seq (4096) < n_ctx_train (40960) -- the full capacity of the model will not be utilized +llama_context: ROCm_Host output buffer size = 0.58 MiB +llama_kv_cache_unified: ROCm0 KV buffer size = 384.00 MiB +llama_kv_cache_unified: size = 384.00 MiB ( 4096 cells, 48 layers, 1/ 1 seqs), K (f16): 192.00 MiB, V (f16): 192.00 MiB +llama_kv_cache_unified: LLAMA_SET_ROWS=0, using old ggml_cpy() method for backwards compatibility +llama_context: ROCm0 compute buffer size = 300.75 MiB +llama_context: ROCm_Host compute buffer size = 8.01 MiB +llama_context: graph nodes = 3079 +llama_context: graph splits = 1 +common_init_from_params: added <|endoftext|> logit bias = -inf +common_init_from_params: added <|im_end|> logit bias = -inf +common_init_from_params: added <|fim_pad|> logit bias = -inf +common_init_from_params: added <|repo_name|> logit bias = -inf +common_init_from_params: added <|file_sep|> logit bias = -inf +common_init_from_params: setting dry_penalty_last_n to ctx_size = 4096 +common_init_from_params: warming up the model with an empty run - please wait ... 
(--no-warmup to disable) +main: llama threadpool init, n_threads = 16 + +system_info: n_threads = 16 (n_threads_batch = 16) / 32 | ROCm : NO_VMM = 1 | PEER_MAX_BATCH_SIZE = 128 | CPU : SSE3 = 1 | SSSE3 = 1 | AVX = 1 | AVX2 = 1 | F16C = 1 | FMA = 1 | BMI2 = 1 | AVX512 = 1 | AVX512_VBMI = 1 | AVX512_VNNI = 1 | LLAMAFILE = 1 | OPENMP = 1 | REPACK = 1 | + +sampler seed: 1093628111 +sampler params: + repeat_last_n = 64, repeat_penalty = 1.000, frequency_penalty = 0.000, presence_penalty = 0.000 + dry_multiplier = 0.000, dry_base = 1.750, dry_allowed_length = 2, dry_penalty_last_n = 4096 + top_k = 40, top_p = 0.950, min_p = 0.050, xtc_probability = 0.000, xtc_threshold = 0.100, typical_p = 1.000, top_n_sigma = -1.000, temp = 0.800 + mirostat = 0, mirostat_lr = 0.100, mirostat_ent = 5.000 +sampler chain: logits -> logit-bias -> penalties -> dry -> top-n-sigma -> top-k -> typical -> top-p -> min-p -> xtc -> temp-ext -> dist +generate: n_ctx = 4096, n_batch = 2048, n_predict = 1, n_keep = 0 + +Hello - + +llama_perf_sampler_print: sampling time = 0.06 ms / 2 runs ( 0.03 ms per token, 34482.76 tokens per second) +llama_perf_context_print: load time = 19374.51 ms +llama_perf_context_print: prompt eval time = 0.00 ms / 1 tokens ( 0.00 ms per token, inf tokens per second) +llama_perf_context_print: eval time = 42.85 ms / 1 runs ( 42.85 ms per token, 23.34 tokens per second) +llama_perf_context_print: total time = 73.04 ms / 2 tokens +llama_perf_context_print: graphs reused = 0 + Elapsed #3: 23.364750813s + Run #3 status: 0 + → Avg over 3 runs: 22.166s diff --git a/benchmark/loadtime_results/Qwen3-30B-A3B-BF16-00001-of-00002__rocm7_beta.log b/benchmark/loadtime_results/Qwen3-30B-A3B-BF16-00001-of-00002__rocm7_beta.log new file mode 100644 index 0000000..71d5dad --- /dev/null +++ b/benchmark/loadtime_results/Qwen3-30B-A3B-BF16-00001-of-00002__rocm7_beta.log @@ -0,0 +1,167 @@ +ggml_cuda_init: GGML_CUDA_FORCE_MMQ: no +ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no +ggml_cuda_init: found 
1 ROCm devices: + Device 0: AMD Radeon Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32 +build: 6040 (66625a59) with cc (GCC) 15.1.1 20250719 (Red Hat 15.1.1-5) for x86_64-redhat-linux +main: llama backend init +main: load the model and apply lora adapter, if any +llama_model_load_from_file_impl: using device ROCm0 (AMD Radeon Graphics) - 124523 MiB free +llama_model_loader: additional 1 GGUFs metadata loaded. +llama_model_loader: loaded meta data with 34 key-value pairs and 579 tensors from /home/kyuz0/models/qwen-3-30B-A3B/BF16/Qwen3-30B-A3B-BF16-00001-of-00002.gguf (version GGUF V3 (latest)) +llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output. +llama_model_loader: - kv 0: general.architecture str = qwen3moe +llama_model_loader: - kv 1: general.type str = model +llama_model_loader: - kv 2: general.name str = Qwen3-30B-A3B +llama_model_loader: - kv 3: general.basename str = Qwen3-30B-A3B +llama_model_loader: - kv 4: general.quantized_by str = Unsloth +llama_model_loader: - kv 5: general.size_label str = 30B-A3B +llama_model_loader: - kv 6: general.repo_url str = https://huggingface.co/unsloth +llama_model_loader: - kv 7: qwen3moe.block_count u32 = 48 +llama_model_loader: - kv 8: qwen3moe.context_length u32 = 40960 +llama_model_loader: - kv 9: qwen3moe.embedding_length u32 = 2048 +llama_model_loader: - kv 10: qwen3moe.feed_forward_length u32 = 6144 +llama_model_loader: - kv 11: qwen3moe.attention.head_count u32 = 32 +llama_model_loader: - kv 12: qwen3moe.attention.head_count_kv u32 = 4 +llama_model_loader: - kv 13: qwen3moe.rope.freq_base f32 = 1000000.000000 +llama_model_loader: - kv 14: qwen3moe.attention.layer_norm_rms_epsilon f32 = 0.000001 +llama_model_loader: - kv 15: qwen3moe.expert_used_count u32 = 8 +llama_model_loader: - kv 16: qwen3moe.attention.key_length u32 = 128 +llama_model_loader: - kv 17: qwen3moe.attention.value_length u32 = 128 +llama_model_loader: - kv 18: general.file_type u32 = 32 
+llama_model_loader: - kv 19: qwen3moe.expert_count u32 = 128 +llama_model_loader: - kv 20: qwen3moe.expert_feed_forward_length u32 = 768 +llama_model_loader: - kv 21: general.quantization_version u32 = 2 +llama_model_loader: - kv 22: tokenizer.ggml.model str = gpt2 +llama_model_loader: - kv 23: tokenizer.ggml.pre str = qwen2 +llama_model_loader: - kv 24: tokenizer.ggml.tokens arr[str,151936] = ["!", "\"", "#", "$", "%", "&", "'", ... +llama_model_loader: - kv 25: tokenizer.ggml.token_type arr[i32,151936] = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ... +llama_model_loader: - kv 26: tokenizer.ggml.merges arr[str,151387] = ["Ġ Ġ", "ĠĠ ĠĠ", "i n", "Ġ t",... +llama_model_loader: - kv 27: tokenizer.ggml.eos_token_id u32 = 151645 +llama_model_loader: - kv 28: tokenizer.ggml.padding_token_id u32 = 151654 +llama_model_loader: - kv 29: tokenizer.ggml.add_bos_token bool = false +llama_model_loader: - kv 30: tokenizer.chat_template str = {%- if tools %}\n {{- '<|im_start|>... +llama_model_loader: - kv 31: split.no u16 = 0 +llama_model_loader: - kv 32: split.count u16 = 2 +llama_model_loader: - kv 33: split.tensors.count i32 = 579 +llama_model_loader: - type f32: 241 tensors +llama_model_loader: - type bf16: 338 tensors +print_info: file format = GGUF V3 (latest) +print_info: file type = BF16 +print_info: file size = 56.89 GiB (16.01 BPW) +load: special tokens cache size = 26 +load: token to piece cache size = 0.9311 MB +print_info: arch = qwen3moe +print_info: vocab_only = 0 +print_info: n_ctx_train = 40960 +print_info: n_embd = 2048 +print_info: n_layer = 48 +print_info: n_head = 32 +print_info: n_head_kv = 4 +print_info: n_rot = 128 +print_info: n_swa = 0 +print_info: is_swa_any = 0 +print_info: n_embd_head_k = 128 +print_info: n_embd_head_v = 128 +print_info: n_gqa = 8 +print_info: n_embd_k_gqa = 512 +print_info: n_embd_v_gqa = 512 +print_info: f_norm_eps = 0.0e+00 +print_info: f_norm_rms_eps = 1.0e-06 +print_info: f_clamp_kqv = 0.0e+00 +print_info: f_max_alibi_bias = 0.0e+00 
+print_info: f_logit_scale = 0.0e+00 +print_info: f_attn_scale = 0.0e+00 +print_info: n_ff = 6144 +print_info: n_expert = 128 +print_info: n_expert_used = 8 +print_info: causal attn = 1 +print_info: pooling type = 0 +print_info: rope type = 2 +print_info: rope scaling = linear +print_info: freq_base_train = 1000000.0 +print_info: freq_scale_train = 1 +print_info: n_ctx_orig_yarn = 40960 +print_info: rope_finetuned = unknown +print_info: model type = 30B.A3B +print_info: model params = 30.53 B +print_info: general.name = Qwen3-30B-A3B +print_info: n_ff_exp = 768 +print_info: vocab type = BPE +print_info: n_vocab = 151936 +print_info: n_merges = 151387 +print_info: BOS token = 11 ',' +print_info: EOS token = 151645 '<|im_end|>' +print_info: EOT token = 151645 '<|im_end|>' +print_info: PAD token = 151654 '<|vision_pad|>' +print_info: LF token = 198 'Ċ' +print_info: FIM PRE token = 151659 '<|fim_prefix|>' +print_info: FIM SUF token = 151661 '<|fim_suffix|>' +print_info: FIM MID token = 151660 '<|fim_middle|>' +print_info: FIM PAD token = 151662 '<|fim_pad|>' +print_info: FIM REP token = 151663 '<|repo_name|>' +print_info: FIM SEP token = 151664 '<|file_sep|>' +print_info: EOG token = 151643 '<|endoftext|>' +print_info: EOG token = 151645 '<|im_end|>' +print_info: EOG token = 151662 '<|fim_pad|>' +print_info: EOG token = 151663 '<|repo_name|>' +print_info: EOG token = 151664 '<|file_sep|>' +print_info: max token length = 256 +load_tensors: loading model tensors, this can take a while... (mmap = false) +load_tensors: offloading 48 repeating layers to GPU +load_tensors: offloading output layer to GPU +load_tensors: offloaded 49/49 layers to GPU +load_tensors: ROCm0 model buffer size = 57666.30 MiB +load_tensors: ROCm_Host model buffer size = 593.50 MiB +................................................................................................... 
+llama_context: constructing llama_context +llama_context: non-unified KV cache requires ggml_set_rows() - forcing unified KV cache +llama_context: n_seq_max = 1 +llama_context: n_ctx = 4096 +llama_context: n_ctx_per_seq = 4096 +llama_context: n_batch = 2048 +llama_context: n_ubatch = 512 +llama_context: causal_attn = 1 +llama_context: flash_attn = 1 +llama_context: kv_unified = true +llama_context: freq_base = 1000000.0 +llama_context: freq_scale = 1 +llama_context: n_ctx_per_seq (4096) < n_ctx_train (40960) -- the full capacity of the model will not be utilized +llama_context: ROCm_Host output buffer size = 0.58 MiB +llama_kv_cache_unified: ROCm0 KV buffer size = 384.00 MiB +llama_kv_cache_unified: size = 384.00 MiB ( 4096 cells, 48 layers, 1/ 1 seqs), K (f16): 192.00 MiB, V (f16): 192.00 MiB +llama_kv_cache_unified: LLAMA_SET_ROWS=0, using old ggml_cpy() method for backwards compatibility +llama_context: ROCm0 compute buffer size = 300.75 MiB +llama_context: ROCm_Host compute buffer size = 8.01 MiB +llama_context: graph nodes = 3079 +llama_context: graph splits = 1 +common_init_from_params: added <|endoftext|> logit bias = -inf +common_init_from_params: added <|im_end|> logit bias = -inf +common_init_from_params: added <|fim_pad|> logit bias = -inf +common_init_from_params: added <|repo_name|> logit bias = -inf +common_init_from_params: added <|file_sep|> logit bias = -inf +common_init_from_params: setting dry_penalty_last_n to ctx_size = 4096 +common_init_from_params: warming up the model with an empty run - please wait ... 
(--no-warmup to disable) +main: llama threadpool init, n_threads = 16 + +system_info: n_threads = 16 (n_threads_batch = 16) / 32 | ROCm : NO_VMM = 1 | PEER_MAX_BATCH_SIZE = 128 | CPU : SSE3 = 1 | SSSE3 = 1 | AVX = 1 | AVX2 = 1 | F16C = 1 | FMA = 1 | BMI2 = 1 | AVX512 = 1 | AVX512_VBMI = 1 | AVX512_VNNI = 1 | LLAMAFILE = 1 | OPENMP = 1 | REPACK = 1 | + +sampler seed: 3515911169 +sampler params: + repeat_last_n = 64, repeat_penalty = 1.000, frequency_penalty = 0.000, presence_penalty = 0.000 + dry_multiplier = 0.000, dry_base = 1.750, dry_allowed_length = 2, dry_penalty_last_n = 4096 + top_k = 40, top_p = 0.950, min_p = 0.050, xtc_probability = 0.000, xtc_threshold = 0.100, typical_p = 1.000, top_n_sigma = -1.000, temp = 0.800 + mirostat = 0, mirostat_lr = 0.100, mirostat_ent = 5.000 +sampler chain: logits -> logit-bias -> penalties -> dry -> top-n-sigma -> top-k -> typical -> top-p -> min-p -> xtc -> temp-ext -> dist +generate: n_ctx = 4096, n_batch = 2048, n_predict = 1, n_keep = 0 + +Hello * + +llama_perf_sampler_print: sampling time = 0.05 ms / 2 runs ( 0.03 ms per token, 37037.04 tokens per second) +llama_perf_context_print: load time = 12423.68 ms +llama_perf_context_print: prompt eval time = 0.00 ms / 1 tokens ( 0.00 ms per token, inf tokens per second) +llama_perf_context_print: eval time = 43.15 ms / 1 runs ( 43.15 ms per token, 23.18 tokens per second) +llama_perf_context_print: total time = 62.68 ms / 2 tokens +llama_perf_context_print: graphs reused = 0 + Elapsed #3: 13.032265401s + Run #3 status: 0 + → Avg over 3 runs: 15.930s diff --git a/benchmark/loadtime_results/Qwen3-30B-A3B-BF16-00001-of-00002__rocm7_rc.log b/benchmark/loadtime_results/Qwen3-30B-A3B-BF16-00001-of-00002__rocm7_rc.log new file mode 100644 index 0000000..7d9b984 --- /dev/null +++ b/benchmark/loadtime_results/Qwen3-30B-A3B-BF16-00001-of-00002__rocm7_rc.log @@ -0,0 +1,167 @@ +ggml_cuda_init: GGML_CUDA_FORCE_MMQ: no +ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no +ggml_cuda_init: found 1 
ROCm devices: + Device 0: AMD Radeon Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32 +build: 6066 (4cb208c9) with cc (GCC) 15.1.1 20250719 (Red Hat 15.1.1-5) for x86_64-redhat-linux +main: llama backend init +main: load the model and apply lora adapter, if any +llama_model_load_from_file_impl: using device ROCm0 (AMD Radeon Graphics) - 124523 MiB free +llama_model_loader: additional 1 GGUFs metadata loaded. +llama_model_loader: loaded meta data with 34 key-value pairs and 579 tensors from /home/kyuz0/models/qwen-3-30B-A3B/BF16/Qwen3-30B-A3B-BF16-00001-of-00002.gguf (version GGUF V3 (latest)) +llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output. +llama_model_loader: - kv 0: general.architecture str = qwen3moe +llama_model_loader: - kv 1: general.type str = model +llama_model_loader: - kv 2: general.name str = Qwen3-30B-A3B +llama_model_loader: - kv 3: general.basename str = Qwen3-30B-A3B +llama_model_loader: - kv 4: general.quantized_by str = Unsloth +llama_model_loader: - kv 5: general.size_label str = 30B-A3B +llama_model_loader: - kv 6: general.repo_url str = https://huggingface.co/unsloth +llama_model_loader: - kv 7: qwen3moe.block_count u32 = 48 +llama_model_loader: - kv 8: qwen3moe.context_length u32 = 40960 +llama_model_loader: - kv 9: qwen3moe.embedding_length u32 = 2048 +llama_model_loader: - kv 10: qwen3moe.feed_forward_length u32 = 6144 +llama_model_loader: - kv 11: qwen3moe.attention.head_count u32 = 32 +llama_model_loader: - kv 12: qwen3moe.attention.head_count_kv u32 = 4 +llama_model_loader: - kv 13: qwen3moe.rope.freq_base f32 = 1000000.000000 +llama_model_loader: - kv 14: qwen3moe.attention.layer_norm_rms_epsilon f32 = 0.000001 +llama_model_loader: - kv 15: qwen3moe.expert_used_count u32 = 8 +llama_model_loader: - kv 16: qwen3moe.attention.key_length u32 = 128 +llama_model_loader: - kv 17: qwen3moe.attention.value_length u32 = 128 +llama_model_loader: - kv 18: general.file_type u32 = 32 
+llama_model_loader: - kv 19: qwen3moe.expert_count u32 = 128 +llama_model_loader: - kv 20: qwen3moe.expert_feed_forward_length u32 = 768 +llama_model_loader: - kv 21: general.quantization_version u32 = 2 +llama_model_loader: - kv 22: tokenizer.ggml.model str = gpt2 +llama_model_loader: - kv 23: tokenizer.ggml.pre str = qwen2 +llama_model_loader: - kv 24: tokenizer.ggml.tokens arr[str,151936] = ["!", "\"", "#", "$", "%", "&", "'", ... +llama_model_loader: - kv 25: tokenizer.ggml.token_type arr[i32,151936] = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ... +llama_model_loader: - kv 26: tokenizer.ggml.merges arr[str,151387] = ["Ġ Ġ", "ĠĠ ĠĠ", "i n", "Ġ t",... +llama_model_loader: - kv 27: tokenizer.ggml.eos_token_id u32 = 151645 +llama_model_loader: - kv 28: tokenizer.ggml.padding_token_id u32 = 151654 +llama_model_loader: - kv 29: tokenizer.ggml.add_bos_token bool = false +llama_model_loader: - kv 30: tokenizer.chat_template str = {%- if tools %}\n {{- '<|im_start|>... +llama_model_loader: - kv 31: split.no u16 = 0 +llama_model_loader: - kv 32: split.count u16 = 2 +llama_model_loader: - kv 33: split.tensors.count i32 = 579 +llama_model_loader: - type f32: 241 tensors +llama_model_loader: - type bf16: 338 tensors +print_info: file format = GGUF V3 (latest) +print_info: file type = BF16 +print_info: file size = 56.89 GiB (16.01 BPW) +load: special tokens cache size = 26 +load: token to piece cache size = 0.9311 MB +print_info: arch = qwen3moe +print_info: vocab_only = 0 +print_info: n_ctx_train = 40960 +print_info: n_embd = 2048 +print_info: n_layer = 48 +print_info: n_head = 32 +print_info: n_head_kv = 4 +print_info: n_rot = 128 +print_info: n_swa = 0 +print_info: is_swa_any = 0 +print_info: n_embd_head_k = 128 +print_info: n_embd_head_v = 128 +print_info: n_gqa = 8 +print_info: n_embd_k_gqa = 512 +print_info: n_embd_v_gqa = 512 +print_info: f_norm_eps = 0.0e+00 +print_info: f_norm_rms_eps = 1.0e-06 +print_info: f_clamp_kqv = 0.0e+00 +print_info: f_max_alibi_bias = 0.0e+00 
+print_info: f_logit_scale = 0.0e+00 +print_info: f_attn_scale = 0.0e+00 +print_info: n_ff = 6144 +print_info: n_expert = 128 +print_info: n_expert_used = 8 +print_info: causal attn = 1 +print_info: pooling type = 0 +print_info: rope type = 2 +print_info: rope scaling = linear +print_info: freq_base_train = 1000000.0 +print_info: freq_scale_train = 1 +print_info: n_ctx_orig_yarn = 40960 +print_info: rope_finetuned = unknown +print_info: model type = 30B.A3B +print_info: model params = 30.53 B +print_info: general.name = Qwen3-30B-A3B +print_info: n_ff_exp = 768 +print_info: vocab type = BPE +print_info: n_vocab = 151936 +print_info: n_merges = 151387 +print_info: BOS token = 11 ',' +print_info: EOS token = 151645 '<|im_end|>' +print_info: EOT token = 151645 '<|im_end|>' +print_info: PAD token = 151654 '<|vision_pad|>' +print_info: LF token = 198 'Ċ' +print_info: FIM PRE token = 151659 '<|fim_prefix|>' +print_info: FIM SUF token = 151661 '<|fim_suffix|>' +print_info: FIM MID token = 151660 '<|fim_middle|>' +print_info: FIM PAD token = 151662 '<|fim_pad|>' +print_info: FIM REP token = 151663 '<|repo_name|>' +print_info: FIM SEP token = 151664 '<|file_sep|>' +print_info: EOG token = 151643 '<|endoftext|>' +print_info: EOG token = 151645 '<|im_end|>' +print_info: EOG token = 151662 '<|fim_pad|>' +print_info: EOG token = 151663 '<|repo_name|>' +print_info: EOG token = 151664 '<|file_sep|>' +print_info: max token length = 256 +load_tensors: loading model tensors, this can take a while... (mmap = false) +load_tensors: offloading 48 repeating layers to GPU +load_tensors: offloading output layer to GPU +load_tensors: offloaded 49/49 layers to GPU +load_tensors: ROCm0 model buffer size = 57666.30 MiB +load_tensors: ROCm_Host model buffer size = 593.50 MiB +................................................................................................... 
+llama_context: constructing llama_context +llama_context: non-unified KV cache requires ggml_set_rows() - forcing unified KV cache +llama_context: n_seq_max = 1 +llama_context: n_ctx = 4096 +llama_context: n_ctx_per_seq = 4096 +llama_context: n_batch = 2048 +llama_context: n_ubatch = 512 +llama_context: causal_attn = 1 +llama_context: flash_attn = 1 +llama_context: kv_unified = true +llama_context: freq_base = 1000000.0 +llama_context: freq_scale = 1 +llama_context: n_ctx_per_seq (4096) < n_ctx_train (40960) -- the full capacity of the model will not be utilized +llama_context: ROCm_Host output buffer size = 0.58 MiB +llama_kv_cache_unified: ROCm0 KV buffer size = 384.00 MiB +llama_kv_cache_unified: size = 384.00 MiB ( 4096 cells, 48 layers, 1/ 1 seqs), K (f16): 192.00 MiB, V (f16): 192.00 MiB +llama_kv_cache_unified: LLAMA_SET_ROWS=0, using old ggml_cpy() method for backwards compatibility +llama_context: ROCm0 compute buffer size = 300.75 MiB +llama_context: ROCm_Host compute buffer size = 8.01 MiB +llama_context: graph nodes = 3079 +llama_context: graph splits = 1 +common_init_from_params: added <|endoftext|> logit bias = -inf +common_init_from_params: added <|im_end|> logit bias = -inf +common_init_from_params: added <|fim_pad|> logit bias = -inf +common_init_from_params: added <|repo_name|> logit bias = -inf +common_init_from_params: added <|file_sep|> logit bias = -inf +common_init_from_params: setting dry_penalty_last_n to ctx_size = 4096 +common_init_from_params: warming up the model with an empty run - please wait ... 
(--no-warmup to disable) +main: llama threadpool init, n_threads = 16 + +system_info: n_threads = 16 (n_threads_batch = 16) / 32 | ROCm : NO_VMM = 1 | PEER_MAX_BATCH_SIZE = 128 | CPU : SSE3 = 1 | SSSE3 = 1 | AVX = 1 | AVX2 = 1 | F16C = 1 | FMA = 1 | BMI2 = 1 | AVX512 = 1 | AVX512_VBMI = 1 | AVX512_VNNI = 1 | LLAMAFILE = 1 | OPENMP = 1 | REPACK = 1 | + +sampler seed: 4057380724 +sampler params: + repeat_last_n = 64, repeat_penalty = 1.000, frequency_penalty = 0.000, presence_penalty = 0.000 + dry_multiplier = 0.000, dry_base = 1.750, dry_allowed_length = 2, dry_penalty_last_n = 4096 + top_k = 40, top_p = 0.950, min_p = 0.050, xtc_probability = 0.000, xtc_threshold = 0.100, typical_p = 1.000, top_n_sigma = -1.000, temp = 0.800 + mirostat = 0, mirostat_lr = 0.100, mirostat_ent = 5.000 +sampler chain: logits -> logit-bias -> penalties -> dry -> top-n-sigma -> top-k -> typical -> top-p -> min-p -> xtc -> temp-ext -> dist +generate: n_ctx = 4096, n_batch = 2048, n_predict = 1, n_keep = 0 + +Hello this + +llama_perf_sampler_print: sampling time = 0.05 ms / 2 runs ( 0.03 ms per token, 37037.04 tokens per second) +llama_perf_context_print: load time = 21106.31 ms +llama_perf_context_print: prompt eval time = 0.00 ms / 1 tokens ( 0.00 ms per token, inf tokens per second) +llama_perf_context_print: eval time = 43.24 ms / 1 runs ( 43.24 ms per token, 23.13 tokens per second) +llama_perf_context_print: total time = 62.41 ms / 2 tokens +llama_perf_context_print: graphs reused = 0 + Elapsed #3: 21.852416396s + Run #3 status: 0 + → Avg over 3 runs: 22.669s diff --git a/benchmark/loadtime_results/Qwen3-30B-A3B-BF16-00001-of-00002__vulkan_amdvlk.log b/benchmark/loadtime_results/Qwen3-30B-A3B-BF16-00001-of-00002__vulkan_amdvlk.log new file mode 100644 index 0000000..1a2f40e --- /dev/null +++ b/benchmark/loadtime_results/Qwen3-30B-A3B-BF16-00001-of-00002__vulkan_amdvlk.log @@ -0,0 +1,165 @@ +ggml_vulkan: Found 1 Vulkan devices: +ggml_vulkan: 0 = Radeon 8060S Graphics (AMD open-source 
driver) | uma: 1 | fp16: 1 | bf16: 0 | warp size: 64 | shared memory: 32768 | int dot: 1 | matrix cores: KHR_coopmat +build: 6060 (9c35706b) with cc (GCC) 15.1.1 20250719 (Red Hat 15.1.1-5) for x86_64-redhat-linux +main: llama backend init +main: load the model and apply lora adapter, if any +llama_model_load_from_file_impl: using device Vulkan0 (Radeon 8060S Graphics) - 85720 MiB free +llama_model_loader: additional 1 GGUFs metadata loaded. +llama_model_loader: loaded meta data with 34 key-value pairs and 579 tensors from /home/kyuz0/models/qwen-3-30B-A3B/BF16/Qwen3-30B-A3B-BF16-00001-of-00002.gguf (version GGUF V3 (latest)) +llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output. +llama_model_loader: - kv 0: general.architecture str = qwen3moe +llama_model_loader: - kv 1: general.type str = model +llama_model_loader: - kv 2: general.name str = Qwen3-30B-A3B +llama_model_loader: - kv 3: general.basename str = Qwen3-30B-A3B +llama_model_loader: - kv 4: general.quantized_by str = Unsloth +llama_model_loader: - kv 5: general.size_label str = 30B-A3B +llama_model_loader: - kv 6: general.repo_url str = https://huggingface.co/unsloth +llama_model_loader: - kv 7: qwen3moe.block_count u32 = 48 +llama_model_loader: - kv 8: qwen3moe.context_length u32 = 40960 +llama_model_loader: - kv 9: qwen3moe.embedding_length u32 = 2048 +llama_model_loader: - kv 10: qwen3moe.feed_forward_length u32 = 6144 +llama_model_loader: - kv 11: qwen3moe.attention.head_count u32 = 32 +llama_model_loader: - kv 12: qwen3moe.attention.head_count_kv u32 = 4 +llama_model_loader: - kv 13: qwen3moe.rope.freq_base f32 = 1000000.000000 +llama_model_loader: - kv 14: qwen3moe.attention.layer_norm_rms_epsilon f32 = 0.000001 +llama_model_loader: - kv 15: qwen3moe.expert_used_count u32 = 8 +llama_model_loader: - kv 16: qwen3moe.attention.key_length u32 = 128 +llama_model_loader: - kv 17: qwen3moe.attention.value_length u32 = 128 +llama_model_loader: - kv 18: 
general.file_type u32 = 32 +llama_model_loader: - kv 19: qwen3moe.expert_count u32 = 128 +llama_model_loader: - kv 20: qwen3moe.expert_feed_forward_length u32 = 768 +llama_model_loader: - kv 21: general.quantization_version u32 = 2 +llama_model_loader: - kv 22: tokenizer.ggml.model str = gpt2 +llama_model_loader: - kv 23: tokenizer.ggml.pre str = qwen2 +llama_model_loader: - kv 24: tokenizer.ggml.tokens arr[str,151936] = ["!", "\"", "#", "$", "%", "&", "'", ... +llama_model_loader: - kv 25: tokenizer.ggml.token_type arr[i32,151936] = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ... +llama_model_loader: - kv 26: tokenizer.ggml.merges arr[str,151387] = ["Ġ Ġ", "ĠĠ ĠĠ", "i n", "Ġ t",... +llama_model_loader: - kv 27: tokenizer.ggml.eos_token_id u32 = 151645 +llama_model_loader: - kv 28: tokenizer.ggml.padding_token_id u32 = 151654 +llama_model_loader: - kv 29: tokenizer.ggml.add_bos_token bool = false +llama_model_loader: - kv 30: tokenizer.chat_template str = {%- if tools %}\n {{- '<|im_start|>... 
+llama_model_loader: - kv 31: split.no u16 = 0 +llama_model_loader: - kv 32: split.count u16 = 2 +llama_model_loader: - kv 33: split.tensors.count i32 = 579 +llama_model_loader: - type f32: 241 tensors +llama_model_loader: - type bf16: 338 tensors +print_info: file format = GGUF V3 (latest) +print_info: file type = BF16 +print_info: file size = 56.89 GiB (16.01 BPW) +load: special tokens cache size = 26 +load: token to piece cache size = 0.9311 MB +print_info: arch = qwen3moe +print_info: vocab_only = 0 +print_info: n_ctx_train = 40960 +print_info: n_embd = 2048 +print_info: n_layer = 48 +print_info: n_head = 32 +print_info: n_head_kv = 4 +print_info: n_rot = 128 +print_info: n_swa = 0 +print_info: is_swa_any = 0 +print_info: n_embd_head_k = 128 +print_info: n_embd_head_v = 128 +print_info: n_gqa = 8 +print_info: n_embd_k_gqa = 512 +print_info: n_embd_v_gqa = 512 +print_info: f_norm_eps = 0.0e+00 +print_info: f_norm_rms_eps = 1.0e-06 +print_info: f_clamp_kqv = 0.0e+00 +print_info: f_max_alibi_bias = 0.0e+00 +print_info: f_logit_scale = 0.0e+00 +print_info: f_attn_scale = 0.0e+00 +print_info: n_ff = 6144 +print_info: n_expert = 128 +print_info: n_expert_used = 8 +print_info: causal attn = 1 +print_info: pooling type = 0 +print_info: rope type = 2 +print_info: rope scaling = linear +print_info: freq_base_train = 1000000.0 +print_info: freq_scale_train = 1 +print_info: n_ctx_orig_yarn = 40960 +print_info: rope_finetuned = unknown +print_info: model type = 30B.A3B +print_info: model params = 30.53 B +print_info: general.name = Qwen3-30B-A3B +print_info: n_ff_exp = 768 +print_info: vocab type = BPE +print_info: n_vocab = 151936 +print_info: n_merges = 151387 +print_info: BOS token = 11 ',' +print_info: EOS token = 151645 '<|im_end|>' +print_info: EOT token = 151645 '<|im_end|>' +print_info: PAD token = 151654 '<|vision_pad|>' +print_info: LF token = 198 'Ċ' +print_info: FIM PRE token = 151659 '<|fim_prefix|>' +print_info: FIM SUF token = 151661 '<|fim_suffix|>' 
+print_info: FIM MID token = 151660 '<|fim_middle|>' +print_info: FIM PAD token = 151662 '<|fim_pad|>' +print_info: FIM REP token = 151663 '<|repo_name|>' +print_info: FIM SEP token = 151664 '<|file_sep|>' +print_info: EOG token = 151643 '<|endoftext|>' +print_info: EOG token = 151645 '<|im_end|>' +print_info: EOG token = 151662 '<|fim_pad|>' +print_info: EOG token = 151663 '<|repo_name|>' +print_info: EOG token = 151664 '<|file_sep|>' +print_info: max token length = 256 +load_tensors: loading model tensors, this can take a while... (mmap = false) +load_tensors: offloading 48 repeating layers to GPU +load_tensors: offloading output layer to GPU +load_tensors: offloaded 49/49 layers to GPU +load_tensors: Vulkan0 model buffer size = 57666.30 MiB +load_tensors: Vulkan_Host model buffer size = 593.50 MiB +................................................................................................... +llama_context: constructing llama_context +llama_context: non-unified KV cache requires ggml_set_rows() - forcing unified KV cache +llama_context: n_seq_max = 1 +llama_context: n_ctx = 4096 +llama_context: n_ctx_per_seq = 4096 +llama_context: n_batch = 2048 +llama_context: n_ubatch = 512 +llama_context: causal_attn = 1 +llama_context: flash_attn = 1 +llama_context: kv_unified = true +llama_context: freq_base = 1000000.0 +llama_context: freq_scale = 1 +llama_context: n_ctx_per_seq (4096) < n_ctx_train (40960) -- the full capacity of the model will not be utilized +llama_context: Vulkan_Host output buffer size = 0.58 MiB +llama_kv_cache_unified: Vulkan0 KV buffer size = 384.00 MiB +llama_kv_cache_unified: size = 384.00 MiB ( 4096 cells, 48 layers, 1/ 1 seqs), K (f16): 192.00 MiB, V (f16): 192.00 MiB +llama_kv_cache_unified: LLAMA_SET_ROWS=0, using old ggml_cpy() method for backwards compatibility +llama_context: Vulkan0 compute buffer size = 304.75 MiB +llama_context: Vulkan_Host compute buffer size = 12.01 MiB +llama_context: graph nodes = 3079 +llama_context: graph 
splits = 2 +common_init_from_params: added <|endoftext|> logit bias = -inf +common_init_from_params: added <|im_end|> logit bias = -inf +common_init_from_params: added <|fim_pad|> logit bias = -inf +common_init_from_params: added <|repo_name|> logit bias = -inf +common_init_from_params: added <|file_sep|> logit bias = -inf +common_init_from_params: setting dry_penalty_last_n to ctx_size = 4096 +common_init_from_params: warming up the model with an empty run - please wait ... (--no-warmup to disable) +main: llama threadpool init, n_threads = 16 + +system_info: n_threads = 16 (n_threads_batch = 16) / 32 | CPU : SSE3 = 1 | SSSE3 = 1 | AVX = 1 | AVX2 = 1 | F16C = 1 | FMA = 1 | BMI2 = 1 | AVX512 = 1 | AVX512_VBMI = 1 | AVX512_VNNI = 1 | LLAMAFILE = 1 | OPENMP = 1 | REPACK = 1 | + +sampler seed: 157667903 +sampler params: + repeat_last_n = 64, repeat_penalty = 1.000, frequency_penalty = 0.000, presence_penalty = 0.000 + dry_multiplier = 0.000, dry_base = 1.750, dry_allowed_length = 2, dry_penalty_last_n = 4096 + top_k = 40, top_p = 0.950, min_p = 0.050, xtc_probability = 0.000, xtc_threshold = 0.100, typical_p = 1.000, top_n_sigma = -1.000, temp = 0.800 + mirostat = 0, mirostat_lr = 0.100, mirostat_ent = 5.000 +sampler chain: logits -> logit-bias -> penalties -> dry -> top-n-sigma -> top-k -> typical -> top-p -> min-p -> xtc -> temp-ext -> dist +generate: n_ctx = 4096, n_batch = 2048, n_predict = 1, n_keep = 0 + +Hello and + +llama_perf_sampler_print: sampling time = 0.08 ms / 2 runs ( 0.04 ms per token, 24390.24 tokens per second) +llama_perf_context_print: load time = 10008.37 ms +llama_perf_context_print: prompt eval time = 0.00 ms / 1 tokens ( 0.00 ms per token, inf tokens per second) +llama_perf_context_print: eval time = 128.73 ms / 1 runs ( 128.73 ms per token, 7.77 tokens per second) +llama_perf_context_print: total time = 155.88 ms / 2 tokens +llama_perf_context_print: graphs reused = 0 + Elapsed #3: 10.759732568s + Run #3 status: 0 + → Avg over 3 runs: 12.935s 
diff --git a/benchmark/loadtime_results/Qwen3-30B-A3B-BF16-00001-of-00002__vulkan_radv.log b/benchmark/loadtime_results/Qwen3-30B-A3B-BF16-00001-of-00002__vulkan_radv.log new file mode 100644 index 0000000..4de7e68 --- /dev/null +++ b/benchmark/loadtime_results/Qwen3-30B-A3B-BF16-00001-of-00002__vulkan_radv.log @@ -0,0 +1,165 @@ +ggml_vulkan: Found 1 Vulkan devices: +ggml_vulkan: 0 = Radeon 8060S Graphics (RADV GFX1151) (radv) | uma: 1 | fp16: 1 | bf16: 0 | warp size: 64 | shared memory: 65536 | int dot: 1 | matrix cores: KHR_coopmat +build: 6040 (66625a59) with cc (GCC) 15.1.1 20250719 (Red Hat 15.1.1-5) for x86_64-redhat-linux +main: llama backend init +main: load the model and apply lora adapter, if any +llama_model_load_from_file_impl: using device Vulkan0 (Radeon 8060S Graphics (RADV GFX1151)) - 87722 MiB free +llama_model_loader: additional 1 GGUFs metadata loaded. +llama_model_loader: loaded meta data with 34 key-value pairs and 579 tensors from /home/kyuz0/models/qwen-3-30B-A3B/BF16/Qwen3-30B-A3B-BF16-00001-of-00002.gguf (version GGUF V3 (latest)) +llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output. 
+llama_model_loader: - kv 0: general.architecture str = qwen3moe +llama_model_loader: - kv 1: general.type str = model +llama_model_loader: - kv 2: general.name str = Qwen3-30B-A3B +llama_model_loader: - kv 3: general.basename str = Qwen3-30B-A3B +llama_model_loader: - kv 4: general.quantized_by str = Unsloth +llama_model_loader: - kv 5: general.size_label str = 30B-A3B +llama_model_loader: - kv 6: general.repo_url str = https://huggingface.co/unsloth +llama_model_loader: - kv 7: qwen3moe.block_count u32 = 48 +llama_model_loader: - kv 8: qwen3moe.context_length u32 = 40960 +llama_model_loader: - kv 9: qwen3moe.embedding_length u32 = 2048 +llama_model_loader: - kv 10: qwen3moe.feed_forward_length u32 = 6144 +llama_model_loader: - kv 11: qwen3moe.attention.head_count u32 = 32 +llama_model_loader: - kv 12: qwen3moe.attention.head_count_kv u32 = 4 +llama_model_loader: - kv 13: qwen3moe.rope.freq_base f32 = 1000000.000000 +llama_model_loader: - kv 14: qwen3moe.attention.layer_norm_rms_epsilon f32 = 0.000001 +llama_model_loader: - kv 15: qwen3moe.expert_used_count u32 = 8 +llama_model_loader: - kv 16: qwen3moe.attention.key_length u32 = 128 +llama_model_loader: - kv 17: qwen3moe.attention.value_length u32 = 128 +llama_model_loader: - kv 18: general.file_type u32 = 32 +llama_model_loader: - kv 19: qwen3moe.expert_count u32 = 128 +llama_model_loader: - kv 20: qwen3moe.expert_feed_forward_length u32 = 768 +llama_model_loader: - kv 21: general.quantization_version u32 = 2 +llama_model_loader: - kv 22: tokenizer.ggml.model str = gpt2 +llama_model_loader: - kv 23: tokenizer.ggml.pre str = qwen2 +llama_model_loader: - kv 24: tokenizer.ggml.tokens arr[str,151936] = ["!", "\"", "#", "$", "%", "&", "'", ... +llama_model_loader: - kv 25: tokenizer.ggml.token_type arr[i32,151936] = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ... +llama_model_loader: - kv 26: tokenizer.ggml.merges arr[str,151387] = ["Ġ Ġ", "ĠĠ ĠĠ", "i n", "Ġ t",... 
+llama_model_loader: - kv 27: tokenizer.ggml.eos_token_id u32 = 151645 +llama_model_loader: - kv 28: tokenizer.ggml.padding_token_id u32 = 151654 +llama_model_loader: - kv 29: tokenizer.ggml.add_bos_token bool = false +llama_model_loader: - kv 30: tokenizer.chat_template str = {%- if tools %}\n {{- '<|im_start|>... +llama_model_loader: - kv 31: split.no u16 = 0 +llama_model_loader: - kv 32: split.count u16 = 2 +llama_model_loader: - kv 33: split.tensors.count i32 = 579 +llama_model_loader: - type f32: 241 tensors +llama_model_loader: - type bf16: 338 tensors +print_info: file format = GGUF V3 (latest) +print_info: file type = BF16 +print_info: file size = 56.89 GiB (16.01 BPW) +load: special tokens cache size = 26 +load: token to piece cache size = 0.9311 MB +print_info: arch = qwen3moe +print_info: vocab_only = 0 +print_info: n_ctx_train = 40960 +print_info: n_embd = 2048 +print_info: n_layer = 48 +print_info: n_head = 32 +print_info: n_head_kv = 4 +print_info: n_rot = 128 +print_info: n_swa = 0 +print_info: is_swa_any = 0 +print_info: n_embd_head_k = 128 +print_info: n_embd_head_v = 128 +print_info: n_gqa = 8 +print_info: n_embd_k_gqa = 512 +print_info: n_embd_v_gqa = 512 +print_info: f_norm_eps = 0.0e+00 +print_info: f_norm_rms_eps = 1.0e-06 +print_info: f_clamp_kqv = 0.0e+00 +print_info: f_max_alibi_bias = 0.0e+00 +print_info: f_logit_scale = 0.0e+00 +print_info: f_attn_scale = 0.0e+00 +print_info: n_ff = 6144 +print_info: n_expert = 128 +print_info: n_expert_used = 8 +print_info: causal attn = 1 +print_info: pooling type = 0 +print_info: rope type = 2 +print_info: rope scaling = linear +print_info: freq_base_train = 1000000.0 +print_info: freq_scale_train = 1 +print_info: n_ctx_orig_yarn = 40960 +print_info: rope_finetuned = unknown +print_info: model type = 30B.A3B +print_info: model params = 30.53 B +print_info: general.name = Qwen3-30B-A3B +print_info: n_ff_exp = 768 +print_info: vocab type = BPE +print_info: n_vocab = 151936 +print_info: n_merges = 151387 
+print_info: BOS token = 11 ',' +print_info: EOS token = 151645 '<|im_end|>' +print_info: EOT token = 151645 '<|im_end|>' +print_info: PAD token = 151654 '<|vision_pad|>' +print_info: LF token = 198 'Ċ' +print_info: FIM PRE token = 151659 '<|fim_prefix|>' +print_info: FIM SUF token = 151661 '<|fim_suffix|>' +print_info: FIM MID token = 151660 '<|fim_middle|>' +print_info: FIM PAD token = 151662 '<|fim_pad|>' +print_info: FIM REP token = 151663 '<|repo_name|>' +print_info: FIM SEP token = 151664 '<|file_sep|>' +print_info: EOG token = 151643 '<|endoftext|>' +print_info: EOG token = 151645 '<|im_end|>' +print_info: EOG token = 151662 '<|fim_pad|>' +print_info: EOG token = 151663 '<|repo_name|>' +print_info: EOG token = 151664 '<|file_sep|>' +print_info: max token length = 256 +load_tensors: loading model tensors, this can take a while... (mmap = false) +load_tensors: offloading 48 repeating layers to GPU +load_tensors: offloading output layer to GPU +load_tensors: offloaded 49/49 layers to GPU +load_tensors: Vulkan0 model buffer size = 57666.30 MiB +load_tensors: Vulkan_Host model buffer size = 593.50 MiB +................................................................................................... 
+llama_context: constructing llama_context +llama_context: non-unified KV cache requires ggml_set_rows() - forcing unified KV cache +llama_context: n_seq_max = 1 +llama_context: n_ctx = 4096 +llama_context: n_ctx_per_seq = 4096 +llama_context: n_batch = 2048 +llama_context: n_ubatch = 512 +llama_context: causal_attn = 1 +llama_context: flash_attn = 1 +llama_context: kv_unified = true +llama_context: freq_base = 1000000.0 +llama_context: freq_scale = 1 +llama_context: n_ctx_per_seq (4096) < n_ctx_train (40960) -- the full capacity of the model will not be utilized +llama_context: Vulkan_Host output buffer size = 0.58 MiB +llama_kv_cache_unified: Vulkan0 KV buffer size = 384.00 MiB +llama_kv_cache_unified: size = 384.00 MiB ( 4096 cells, 48 layers, 1/ 1 seqs), K (f16): 192.00 MiB, V (f16): 192.00 MiB +llama_kv_cache_unified: LLAMA_SET_ROWS=0, using old ggml_cpy() method for backwards compatibility +llama_context: Vulkan0 compute buffer size = 304.75 MiB +llama_context: Vulkan_Host compute buffer size = 12.01 MiB +llama_context: graph nodes = 3079 +llama_context: graph splits = 2 +common_init_from_params: added <|endoftext|> logit bias = -inf +common_init_from_params: added <|im_end|> logit bias = -inf +common_init_from_params: added <|fim_pad|> logit bias = -inf +common_init_from_params: added <|repo_name|> logit bias = -inf +common_init_from_params: added <|file_sep|> logit bias = -inf +common_init_from_params: setting dry_penalty_last_n to ctx_size = 4096 +common_init_from_params: warming up the model with an empty run - please wait ... 
(--no-warmup to disable) +main: llama threadpool init, n_threads = 16 + +system_info: n_threads = 16 (n_threads_batch = 16) / 32 | CPU : SSE3 = 1 | SSSE3 = 1 | AVX = 1 | AVX2 = 1 | F16C = 1 | FMA = 1 | BMI2 = 1 | AVX512 = 1 | AVX512_VBMI = 1 | AVX512_VNNI = 1 | LLAMAFILE = 1 | OPENMP = 1 | REPACK = 1 | + +sampler seed: 1118253234 +sampler params: + repeat_last_n = 64, repeat_penalty = 1.000, frequency_penalty = 0.000, presence_penalty = 0.000 + dry_multiplier = 0.000, dry_base = 1.750, dry_allowed_length = 2, dry_penalty_last_n = 4096 + top_k = 40, top_p = 0.950, min_p = 0.050, xtc_probability = 0.000, xtc_threshold = 0.100, typical_p = 1.000, top_n_sigma = -1.000, temp = 0.800 + mirostat = 0, mirostat_lr = 0.100, mirostat_ent = 5.000 +sampler chain: logits -> logit-bias -> penalties -> dry -> top-n-sigma -> top-k -> typical -> top-p -> min-p -> xtc -> temp-ext -> dist +generate: n_ctx = 4096, n_batch = 2048, n_predict = 1, n_keep = 0 + +Hello - + +llama_perf_sampler_print: sampling time = 0.08 ms / 2 runs ( 0.04 ms per token, 25316.46 tokens per second) +llama_perf_context_print: load time = 12501.96 ms +llama_perf_context_print: prompt eval time = 0.00 ms / 1 tokens ( 0.00 ms per token, inf tokens per second) +llama_perf_context_print: eval time = 137.49 ms / 1 runs ( 137.49 ms per token, 7.27 tokens per second) +llama_perf_context_print: total time = 164.69 ms / 2 tokens +llama_perf_context_print: graphs reused = 0 + Elapsed #3: 13.022605949s + Run #3 status: 0 + → Avg over 3 runs: 14.761s diff --git a/benchmark/loadtime_results/Qwen3-Coder-30B-A3B-Instruct-BF16-00001-of-00002__rocm6_4_2.log b/benchmark/loadtime_results/Qwen3-Coder-30B-A3B-Instruct-BF16-00001-of-00002__rocm6_4_2.log new file mode 100644 index 0000000..4375988 --- /dev/null +++ b/benchmark/loadtime_results/Qwen3-Coder-30B-A3B-Instruct-BF16-00001-of-00002__rocm6_4_2.log @@ -0,0 +1,176 @@ +ggml_cuda_init: GGML_CUDA_FORCE_MMQ: no +ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no +ggml_cuda_init: found 1 
ROCm devices: + Device 0: Radeon 8060S Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32 +build: 6040 (66625a59) with cc (GCC) 15.1.1 20250521 (Red Hat 15.1.1-2) for x86_64-redhat-linux +main: llama backend init +main: load the model and apply lora adapter, if any +llama_model_load_from_file_impl: using device ROCm0 (Radeon 8060S Graphics) - 124522 MiB free +llama_model_loader: additional 1 GGUFs metadata loaded. +llama_model_loader: loaded meta data with 43 key-value pairs and 579 tensors from /home/kyuz0/models/qwen3-coder-30B-A3B/BF16/Qwen3-Coder-30B-A3B-Instruct-BF16-00001-of-00002.gguf (version GGUF V3 (latest)) +llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output. +llama_model_loader: - kv 0: general.architecture str = qwen3moe +llama_model_loader: - kv 1: general.type str = model +llama_model_loader: - kv 2: general.name str = Qwen3-Coder-30B-A3B-Instruct +llama_model_loader: - kv 3: general.finetune str = Instruct +llama_model_loader: - kv 4: general.basename str = Qwen3-Coder-30B-A3B-Instruct +llama_model_loader: - kv 5: general.quantized_by str = Unsloth +llama_model_loader: - kv 6: general.size_label str = 30B-A3B +llama_model_loader: - kv 7: general.license str = apache-2.0 +llama_model_loader: - kv 8: general.license.link str = https://huggingface.co/Qwen/Qwen3-Cod... +llama_model_loader: - kv 9: general.repo_url str = https://huggingface.co/unsloth +llama_model_loader: - kv 10: general.base_model.count u32 = 1 +llama_model_loader: - kv 11: general.base_model.0.name str = Qwen3 Coder 30B A3B Instruct +llama_model_loader: - kv 12: general.base_model.0.organization str = Qwen +llama_model_loader: - kv 13: general.base_model.0.repo_url str = https://huggingface.co/Qwen/Qwen3-Cod... 
+llama_model_loader: - kv 14: general.tags arr[str,2] = ["unsloth", "text-generation"] +llama_model_loader: - kv 15: qwen3moe.block_count u32 = 48 +llama_model_loader: - kv 16: qwen3moe.context_length u32 = 262144 +llama_model_loader: - kv 17: qwen3moe.embedding_length u32 = 2048 +llama_model_loader: - kv 18: qwen3moe.feed_forward_length u32 = 5472 +llama_model_loader: - kv 19: qwen3moe.attention.head_count u32 = 32 +llama_model_loader: - kv 20: qwen3moe.attention.head_count_kv u32 = 4 +llama_model_loader: - kv 21: qwen3moe.rope.freq_base f32 = 10000000.000000 +llama_model_loader: - kv 22: qwen3moe.attention.layer_norm_rms_epsilon f32 = 0.000001 +llama_model_loader: - kv 23: qwen3moe.expert_used_count u32 = 8 +llama_model_loader: - kv 24: qwen3moe.attention.key_length u32 = 128 +llama_model_loader: - kv 25: qwen3moe.attention.value_length u32 = 128 +llama_model_loader: - kv 26: general.file_type u32 = 32 +llama_model_loader: - kv 27: qwen3moe.expert_count u32 = 128 +llama_model_loader: - kv 28: qwen3moe.expert_feed_forward_length u32 = 768 +llama_model_loader: - kv 29: qwen3moe.expert_shared_feed_forward_length u32 = 0 +llama_model_loader: - kv 30: general.quantization_version u32 = 2 +llama_model_loader: - kv 31: tokenizer.ggml.model str = gpt2 +llama_model_loader: - kv 32: tokenizer.ggml.pre str = qwen2 +llama_model_loader: - kv 33: tokenizer.ggml.tokens arr[str,151936] = ["!", "\"", "#", "$", "%", "&", "'", ... +llama_model_loader: - kv 34: tokenizer.ggml.token_type arr[i32,151936] = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ... +llama_model_loader: - kv 35: tokenizer.ggml.merges arr[str,151387] = ["Ġ Ġ", "ĠĠ ĠĠ", "i n", "Ġ t",... +llama_model_loader: - kv 36: tokenizer.ggml.eos_token_id u32 = 151645 +llama_model_loader: - kv 37: tokenizer.ggml.padding_token_id u32 = 151654 +llama_model_loader: - kv 38: tokenizer.ggml.add_bos_token bool = false +llama_model_loader: - kv 39: tokenizer.chat_template str = {#- Copyright 2025-present the Unslot... 
+llama_model_loader: - kv 40: split.no u16 = 0 +llama_model_loader: - kv 41: split.count u16 = 2 +llama_model_loader: - kv 42: split.tensors.count i32 = 579 +llama_model_loader: - type f32: 241 tensors +llama_model_loader: - type bf16: 338 tensors +print_info: file format = GGUF V3 (latest) +print_info: file type = BF16 +print_info: file size = 56.89 GiB (16.01 BPW) +load: special tokens cache size = 26 +load: token to piece cache size = 0.9311 MB +print_info: arch = qwen3moe +print_info: vocab_only = 0 +print_info: n_ctx_train = 262144 +print_info: n_embd = 2048 +print_info: n_layer = 48 +print_info: n_head = 32 +print_info: n_head_kv = 4 +print_info: n_rot = 128 +print_info: n_swa = 0 +print_info: is_swa_any = 0 +print_info: n_embd_head_k = 128 +print_info: n_embd_head_v = 128 +print_info: n_gqa = 8 +print_info: n_embd_k_gqa = 512 +print_info: n_embd_v_gqa = 512 +print_info: f_norm_eps = 0.0e+00 +print_info: f_norm_rms_eps = 1.0e-06 +print_info: f_clamp_kqv = 0.0e+00 +print_info: f_max_alibi_bias = 0.0e+00 +print_info: f_logit_scale = 0.0e+00 +print_info: f_attn_scale = 0.0e+00 +print_info: n_ff = 5472 +print_info: n_expert = 128 +print_info: n_expert_used = 8 +print_info: causal attn = 1 +print_info: pooling type = 0 +print_info: rope type = 2 +print_info: rope scaling = linear +print_info: freq_base_train = 10000000.0 +print_info: freq_scale_train = 1 +print_info: n_ctx_orig_yarn = 262144 +print_info: rope_finetuned = unknown +print_info: model type = 30B.A3B +print_info: model params = 30.53 B +print_info: general.name = Qwen3-Coder-30B-A3B-Instruct +print_info: n_ff_exp = 768 +print_info: vocab type = BPE +print_info: n_vocab = 151936 +print_info: n_merges = 151387 +print_info: BOS token = 11 ',' +print_info: EOS token = 151645 '<|im_end|>' +print_info: EOT token = 151645 '<|im_end|>' +print_info: PAD token = 151654 '<|vision_pad|>' +print_info: LF token = 198 'Ċ' +print_info: FIM PRE token = 151659 '<|fim_prefix|>' +print_info: FIM SUF token = 151661 
'<|fim_suffix|>' +print_info: FIM MID token = 151660 '<|fim_middle|>' +print_info: FIM PAD token = 151662 '<|fim_pad|>' +print_info: FIM REP token = 151663 '<|repo_name|>' +print_info: FIM SEP token = 151664 '<|file_sep|>' +print_info: EOG token = 151643 '<|endoftext|>' +print_info: EOG token = 151645 '<|im_end|>' +print_info: EOG token = 151662 '<|fim_pad|>' +print_info: EOG token = 151663 '<|repo_name|>' +print_info: EOG token = 151664 '<|file_sep|>' +print_info: max token length = 256 +load_tensors: loading model tensors, this can take a while... (mmap = false) +load_tensors: offloading 48 repeating layers to GPU +load_tensors: offloading output layer to GPU +load_tensors: offloaded 49/49 layers to GPU +load_tensors: ROCm0 model buffer size = 57666.30 MiB +load_tensors: ROCm_Host model buffer size = 593.50 MiB +................................................................................................... +llama_context: constructing llama_context +llama_context: non-unified KV cache requires ggml_set_rows() - forcing unified KV cache +llama_context: n_seq_max = 1 +llama_context: n_ctx = 4096 +llama_context: n_ctx_per_seq = 4096 +llama_context: n_batch = 2048 +llama_context: n_ubatch = 512 +llama_context: causal_attn = 1 +llama_context: flash_attn = 1 +llama_context: kv_unified = true +llama_context: freq_base = 10000000.0 +llama_context: freq_scale = 1 +llama_context: n_ctx_per_seq (4096) < n_ctx_train (262144) -- the full capacity of the model will not be utilized +llama_context: ROCm_Host output buffer size = 0.58 MiB +llama_kv_cache_unified: ROCm0 KV buffer size = 384.00 MiB +llama_kv_cache_unified: size = 384.00 MiB ( 4096 cells, 48 layers, 1/ 1 seqs), K (f16): 192.00 MiB, V (f16): 192.00 MiB +llama_kv_cache_unified: LLAMA_SET_ROWS=0, using old ggml_cpy() method for backwards compatibility +llama_context: ROCm0 compute buffer size = 300.75 MiB +llama_context: ROCm_Host compute buffer size = 8.01 MiB +llama_context: graph nodes = 3079 +llama_context: 
graph splits = 1 +common_init_from_params: added <|endoftext|> logit bias = -inf +common_init_from_params: added <|im_end|> logit bias = -inf +common_init_from_params: added <|fim_pad|> logit bias = -inf +common_init_from_params: added <|repo_name|> logit bias = -inf +common_init_from_params: added <|file_sep|> logit bias = -inf +common_init_from_params: setting dry_penalty_last_n to ctx_size = 4096 +common_init_from_params: warming up the model with an empty run - please wait ... (--no-warmup to disable) +main: llama threadpool init, n_threads = 16 + +system_info: n_threads = 16 (n_threads_batch = 16) / 32 | ROCm : NO_VMM = 1 | PEER_MAX_BATCH_SIZE = 128 | CPU : SSE3 = 1 | SSSE3 = 1 | AVX = 1 | AVX2 = 1 | F16C = 1 | FMA = 1 | BMI2 = 1 | AVX512 = 1 | AVX512_VBMI = 1 | AVX512_VNNI = 1 | LLAMAFILE = 1 | OPENMP = 1 | REPACK = 1 | + +sampler seed: 3288748167 +sampler params: + repeat_last_n = 64, repeat_penalty = 1.000, frequency_penalty = 0.000, presence_penalty = 0.000 + dry_multiplier = 0.000, dry_base = 1.750, dry_allowed_length = 2, dry_penalty_last_n = 4096 + top_k = 40, top_p = 0.950, min_p = 0.050, xtc_probability = 0.000, xtc_threshold = 0.100, typical_p = 1.000, top_n_sigma = -1.000, temp = 0.800 + mirostat = 0, mirostat_lr = 0.100, mirostat_ent = 5.000 +sampler chain: logits -> logit-bias -> penalties -> dry -> top-n-sigma -> top-k -> typical -> top-p -> min-p -> xtc -> temp-ext -> dist +generate: n_ctx = 4096, n_batch = 2048, n_predict = 1, n_keep = 0 + +Hello: + +llama_perf_sampler_print: sampling time = 0.05 ms / 2 runs ( 0.03 ms per token, 38461.54 tokens per second) +llama_perf_context_print: load time = 12175.61 ms +llama_perf_context_print: prompt eval time = 0.00 ms / 1 tokens ( 0.00 ms per token, inf tokens per second) +llama_perf_context_print: eval time = 42.43 ms / 1 runs ( 42.43 ms per token, 23.57 tokens per second) +llama_perf_context_print: total time = 81.77 ms / 2 tokens +llama_perf_context_print: graphs reused = 0 + Elapsed #3: 
16.099845533s + Run #3 status: 0 + → Avg over 3 runs: 17.779s diff --git a/benchmark/loadtime_results/Qwen3-Coder-30B-A3B-Instruct-BF16-00001-of-00002__rocm7_beta.log b/benchmark/loadtime_results/Qwen3-Coder-30B-A3B-Instruct-BF16-00001-of-00002__rocm7_beta.log new file mode 100644 index 0000000..c80143b --- /dev/null +++ b/benchmark/loadtime_results/Qwen3-Coder-30B-A3B-Instruct-BF16-00001-of-00002__rocm7_beta.log @@ -0,0 +1,176 @@ +ggml_cuda_init: GGML_CUDA_FORCE_MMQ: no +ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no +ggml_cuda_init: found 1 ROCm devices: + Device 0: AMD Radeon Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32 +build: 6040 (66625a59) with cc (GCC) 15.1.1 20250719 (Red Hat 15.1.1-5) for x86_64-redhat-linux +main: llama backend init +main: load the model and apply lora adapter, if any +llama_model_load_from_file_impl: using device ROCm0 (AMD Radeon Graphics) - 124523 MiB free +llama_model_loader: additional 1 GGUFs metadata loaded. +llama_model_loader: loaded meta data with 43 key-value pairs and 579 tensors from /home/kyuz0/models/qwen3-coder-30B-A3B/BF16/Qwen3-Coder-30B-A3B-Instruct-BF16-00001-of-00002.gguf (version GGUF V3 (latest)) +llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output. +llama_model_loader: - kv 0: general.architecture str = qwen3moe +llama_model_loader: - kv 1: general.type str = model +llama_model_loader: - kv 2: general.name str = Qwen3-Coder-30B-A3B-Instruct +llama_model_loader: - kv 3: general.finetune str = Instruct +llama_model_loader: - kv 4: general.basename str = Qwen3-Coder-30B-A3B-Instruct +llama_model_loader: - kv 5: general.quantized_by str = Unsloth +llama_model_loader: - kv 6: general.size_label str = 30B-A3B +llama_model_loader: - kv 7: general.license str = apache-2.0 +llama_model_loader: - kv 8: general.license.link str = https://huggingface.co/Qwen/Qwen3-Cod... 
+llama_model_loader: - kv 9: general.repo_url str = https://huggingface.co/unsloth +llama_model_loader: - kv 10: general.base_model.count u32 = 1 +llama_model_loader: - kv 11: general.base_model.0.name str = Qwen3 Coder 30B A3B Instruct +llama_model_loader: - kv 12: general.base_model.0.organization str = Qwen +llama_model_loader: - kv 13: general.base_model.0.repo_url str = https://huggingface.co/Qwen/Qwen3-Cod... +llama_model_loader: - kv 14: general.tags arr[str,2] = ["unsloth", "text-generation"] +llama_model_loader: - kv 15: qwen3moe.block_count u32 = 48 +llama_model_loader: - kv 16: qwen3moe.context_length u32 = 262144 +llama_model_loader: - kv 17: qwen3moe.embedding_length u32 = 2048 +llama_model_loader: - kv 18: qwen3moe.feed_forward_length u32 = 5472 +llama_model_loader: - kv 19: qwen3moe.attention.head_count u32 = 32 +llama_model_loader: - kv 20: qwen3moe.attention.head_count_kv u32 = 4 +llama_model_loader: - kv 21: qwen3moe.rope.freq_base f32 = 10000000.000000 +llama_model_loader: - kv 22: qwen3moe.attention.layer_norm_rms_epsilon f32 = 0.000001 +llama_model_loader: - kv 23: qwen3moe.expert_used_count u32 = 8 +llama_model_loader: - kv 24: qwen3moe.attention.key_length u32 = 128 +llama_model_loader: - kv 25: qwen3moe.attention.value_length u32 = 128 +llama_model_loader: - kv 26: general.file_type u32 = 32 +llama_model_loader: - kv 27: qwen3moe.expert_count u32 = 128 +llama_model_loader: - kv 28: qwen3moe.expert_feed_forward_length u32 = 768 +llama_model_loader: - kv 29: qwen3moe.expert_shared_feed_forward_length u32 = 0 +llama_model_loader: - kv 30: general.quantization_version u32 = 2 +llama_model_loader: - kv 31: tokenizer.ggml.model str = gpt2 +llama_model_loader: - kv 32: tokenizer.ggml.pre str = qwen2 +llama_model_loader: - kv 33: tokenizer.ggml.tokens arr[str,151936] = ["!", "\"", "#", "$", "%", "&", "'", ... +llama_model_loader: - kv 34: tokenizer.ggml.token_type arr[i32,151936] = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ... 
+llama_model_loader: - kv 35: tokenizer.ggml.merges arr[str,151387] = ["Ġ Ġ", "ĠĠ ĠĠ", "i n", "Ġ t",... +llama_model_loader: - kv 36: tokenizer.ggml.eos_token_id u32 = 151645 +llama_model_loader: - kv 37: tokenizer.ggml.padding_token_id u32 = 151654 +llama_model_loader: - kv 38: tokenizer.ggml.add_bos_token bool = false +llama_model_loader: - kv 39: tokenizer.chat_template str = {#- Copyright 2025-present the Unslot... +llama_model_loader: - kv 40: split.no u16 = 0 +llama_model_loader: - kv 41: split.count u16 = 2 +llama_model_loader: - kv 42: split.tensors.count i32 = 579 +llama_model_loader: - type f32: 241 tensors +llama_model_loader: - type bf16: 338 tensors +print_info: file format = GGUF V3 (latest) +print_info: file type = BF16 +print_info: file size = 56.89 GiB (16.01 BPW) +load: special tokens cache size = 26 +load: token to piece cache size = 0.9311 MB +print_info: arch = qwen3moe +print_info: vocab_only = 0 +print_info: n_ctx_train = 262144 +print_info: n_embd = 2048 +print_info: n_layer = 48 +print_info: n_head = 32 +print_info: n_head_kv = 4 +print_info: n_rot = 128 +print_info: n_swa = 0 +print_info: is_swa_any = 0 +print_info: n_embd_head_k = 128 +print_info: n_embd_head_v = 128 +print_info: n_gqa = 8 +print_info: n_embd_k_gqa = 512 +print_info: n_embd_v_gqa = 512 +print_info: f_norm_eps = 0.0e+00 +print_info: f_norm_rms_eps = 1.0e-06 +print_info: f_clamp_kqv = 0.0e+00 +print_info: f_max_alibi_bias = 0.0e+00 +print_info: f_logit_scale = 0.0e+00 +print_info: f_attn_scale = 0.0e+00 +print_info: n_ff = 5472 +print_info: n_expert = 128 +print_info: n_expert_used = 8 +print_info: causal attn = 1 +print_info: pooling type = 0 +print_info: rope type = 2 +print_info: rope scaling = linear +print_info: freq_base_train = 10000000.0 +print_info: freq_scale_train = 1 +print_info: n_ctx_orig_yarn = 262144 +print_info: rope_finetuned = unknown +print_info: model type = 30B.A3B +print_info: model params = 30.53 B +print_info: general.name = 
Qwen3-Coder-30B-A3B-Instruct +print_info: n_ff_exp = 768 +print_info: vocab type = BPE +print_info: n_vocab = 151936 +print_info: n_merges = 151387 +print_info: BOS token = 11 ',' +print_info: EOS token = 151645 '<|im_end|>' +print_info: EOT token = 151645 '<|im_end|>' +print_info: PAD token = 151654 '<|vision_pad|>' +print_info: LF token = 198 'Ċ' +print_info: FIM PRE token = 151659 '<|fim_prefix|>' +print_info: FIM SUF token = 151661 '<|fim_suffix|>' +print_info: FIM MID token = 151660 '<|fim_middle|>' +print_info: FIM PAD token = 151662 '<|fim_pad|>' +print_info: FIM REP token = 151663 '<|repo_name|>' +print_info: FIM SEP token = 151664 '<|file_sep|>' +print_info: EOG token = 151643 '<|endoftext|>' +print_info: EOG token = 151645 '<|im_end|>' +print_info: EOG token = 151662 '<|fim_pad|>' +print_info: EOG token = 151663 '<|repo_name|>' +print_info: EOG token = 151664 '<|file_sep|>' +print_info: max token length = 256 +load_tensors: loading model tensors, this can take a while... (mmap = false) +load_tensors: offloading 48 repeating layers to GPU +load_tensors: offloading output layer to GPU +load_tensors: offloaded 49/49 layers to GPU +load_tensors: ROCm0 model buffer size = 57666.30 MiB +load_tensors: ROCm_Host model buffer size = 593.50 MiB +................................................................................................... 
+llama_context: constructing llama_context +llama_context: non-unified KV cache requires ggml_set_rows() - forcing unified KV cache +llama_context: n_seq_max = 1 +llama_context: n_ctx = 4096 +llama_context: n_ctx_per_seq = 4096 +llama_context: n_batch = 2048 +llama_context: n_ubatch = 512 +llama_context: causal_attn = 1 +llama_context: flash_attn = 1 +llama_context: kv_unified = true +llama_context: freq_base = 10000000.0 +llama_context: freq_scale = 1 +llama_context: n_ctx_per_seq (4096) < n_ctx_train (262144) -- the full capacity of the model will not be utilized +llama_context: ROCm_Host output buffer size = 0.58 MiB +llama_kv_cache_unified: ROCm0 KV buffer size = 384.00 MiB +llama_kv_cache_unified: size = 384.00 MiB ( 4096 cells, 48 layers, 1/ 1 seqs), K (f16): 192.00 MiB, V (f16): 192.00 MiB +llama_kv_cache_unified: LLAMA_SET_ROWS=0, using old ggml_cpy() method for backwards compatibility +llama_context: ROCm0 compute buffer size = 300.75 MiB +llama_context: ROCm_Host compute buffer size = 8.01 MiB +llama_context: graph nodes = 3079 +llama_context: graph splits = 1 +common_init_from_params: added <|endoftext|> logit bias = -inf +common_init_from_params: added <|im_end|> logit bias = -inf +common_init_from_params: added <|fim_pad|> logit bias = -inf +common_init_from_params: added <|repo_name|> logit bias = -inf +common_init_from_params: added <|file_sep|> logit bias = -inf +common_init_from_params: setting dry_penalty_last_n to ctx_size = 4096 +common_init_from_params: warming up the model with an empty run - please wait ... 
(--no-warmup to disable) +main: llama threadpool init, n_threads = 16 + +system_info: n_threads = 16 (n_threads_batch = 16) / 32 | ROCm : NO_VMM = 1 | PEER_MAX_BATCH_SIZE = 128 | CPU : SSE3 = 1 | SSSE3 = 1 | AVX = 1 | AVX2 = 1 | F16C = 1 | FMA = 1 | BMI2 = 1 | AVX512 = 1 | AVX512_VBMI = 1 | AVX512_VNNI = 1 | LLAMAFILE = 1 | OPENMP = 1 | REPACK = 1 | + +sampler seed: 3173540432 +sampler params: + repeat_last_n = 64, repeat_penalty = 1.000, frequency_penalty = 0.000, presence_penalty = 0.000 + dry_multiplier = 0.000, dry_base = 1.750, dry_allowed_length = 2, dry_penalty_last_n = 4096 + top_k = 40, top_p = 0.950, min_p = 0.050, xtc_probability = 0.000, xtc_threshold = 0.100, typical_p = 1.000, top_n_sigma = -1.000, temp = 0.800 + mirostat = 0, mirostat_lr = 0.100, mirostat_ent = 5.000 +sampler chain: logits -> logit-bias -> penalties -> dry -> top-n-sigma -> top-k -> typical -> top-p -> min-p -> xtc -> temp-ext -> dist +generate: n_ctx = 4096, n_batch = 2048, n_predict = 1, n_keep = 0 + +Hello: + +llama_perf_sampler_print: sampling time = 0.06 ms / 2 runs ( 0.03 ms per token, 35087.72 tokens per second) +llama_perf_context_print: load time = 11733.11 ms +llama_perf_context_print: prompt eval time = 0.00 ms / 1 tokens ( 0.00 ms per token, inf tokens per second) +llama_perf_context_print: eval time = 42.68 ms / 1 runs ( 42.68 ms per token, 23.43 tokens per second) +llama_perf_context_print: total time = 82.14 ms / 2 tokens +llama_perf_context_print: graphs reused = 0 + Elapsed #3: 12.376138939s + Run #3 status: 0 + → Avg over 3 runs: 14.392s diff --git a/benchmark/loadtime_results/Qwen3-Coder-30B-A3B-Instruct-BF16-00001-of-00002__rocm7_rc.log b/benchmark/loadtime_results/Qwen3-Coder-30B-A3B-Instruct-BF16-00001-of-00002__rocm7_rc.log new file mode 100644 index 0000000..729954c --- /dev/null +++ b/benchmark/loadtime_results/Qwen3-Coder-30B-A3B-Instruct-BF16-00001-of-00002__rocm7_rc.log @@ -0,0 +1,176 @@ +ggml_cuda_init: GGML_CUDA_FORCE_MMQ: no +ggml_cuda_init: 
GGML_CUDA_FORCE_CUBLAS: no +ggml_cuda_init: found 1 ROCm devices: + Device 0: AMD Radeon Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32 +build: 6066 (4cb208c9) with cc (GCC) 15.1.1 20250719 (Red Hat 15.1.1-5) for x86_64-redhat-linux +main: llama backend init +main: load the model and apply lora adapter, if any +llama_model_load_from_file_impl: using device ROCm0 (AMD Radeon Graphics) - 124523 MiB free +llama_model_loader: additional 1 GGUFs metadata loaded. +llama_model_loader: loaded meta data with 43 key-value pairs and 579 tensors from /home/kyuz0/models/qwen3-coder-30B-A3B/BF16/Qwen3-Coder-30B-A3B-Instruct-BF16-00001-of-00002.gguf (version GGUF V3 (latest)) +llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output. +llama_model_loader: - kv 0: general.architecture str = qwen3moe +llama_model_loader: - kv 1: general.type str = model +llama_model_loader: - kv 2: general.name str = Qwen3-Coder-30B-A3B-Instruct +llama_model_loader: - kv 3: general.finetune str = Instruct +llama_model_loader: - kv 4: general.basename str = Qwen3-Coder-30B-A3B-Instruct +llama_model_loader: - kv 5: general.quantized_by str = Unsloth +llama_model_loader: - kv 6: general.size_label str = 30B-A3B +llama_model_loader: - kv 7: general.license str = apache-2.0 +llama_model_loader: - kv 8: general.license.link str = https://huggingface.co/Qwen/Qwen3-Cod... +llama_model_loader: - kv 9: general.repo_url str = https://huggingface.co/unsloth +llama_model_loader: - kv 10: general.base_model.count u32 = 1 +llama_model_loader: - kv 11: general.base_model.0.name str = Qwen3 Coder 30B A3B Instruct +llama_model_loader: - kv 12: general.base_model.0.organization str = Qwen +llama_model_loader: - kv 13: general.base_model.0.repo_url str = https://huggingface.co/Qwen/Qwen3-Cod... 
+llama_model_loader: - kv 14: general.tags arr[str,2] = ["unsloth", "text-generation"] +llama_model_loader: - kv 15: qwen3moe.block_count u32 = 48 +llama_model_loader: - kv 16: qwen3moe.context_length u32 = 262144 +llama_model_loader: - kv 17: qwen3moe.embedding_length u32 = 2048 +llama_model_loader: - kv 18: qwen3moe.feed_forward_length u32 = 5472 +llama_model_loader: - kv 19: qwen3moe.attention.head_count u32 = 32 +llama_model_loader: - kv 20: qwen3moe.attention.head_count_kv u32 = 4 +llama_model_loader: - kv 21: qwen3moe.rope.freq_base f32 = 10000000.000000 +llama_model_loader: - kv 22: qwen3moe.attention.layer_norm_rms_epsilon f32 = 0.000001 +llama_model_loader: - kv 23: qwen3moe.expert_used_count u32 = 8 +llama_model_loader: - kv 24: qwen3moe.attention.key_length u32 = 128 +llama_model_loader: - kv 25: qwen3moe.attention.value_length u32 = 128 +llama_model_loader: - kv 26: general.file_type u32 = 32 +llama_model_loader: - kv 27: qwen3moe.expert_count u32 = 128 +llama_model_loader: - kv 28: qwen3moe.expert_feed_forward_length u32 = 768 +llama_model_loader: - kv 29: qwen3moe.expert_shared_feed_forward_length u32 = 0 +llama_model_loader: - kv 30: general.quantization_version u32 = 2 +llama_model_loader: - kv 31: tokenizer.ggml.model str = gpt2 +llama_model_loader: - kv 32: tokenizer.ggml.pre str = qwen2 +llama_model_loader: - kv 33: tokenizer.ggml.tokens arr[str,151936] = ["!", "\"", "#", "$", "%", "&", "'", ... +llama_model_loader: - kv 34: tokenizer.ggml.token_type arr[i32,151936] = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ... +llama_model_loader: - kv 35: tokenizer.ggml.merges arr[str,151387] = ["Ġ Ġ", "ĠĠ ĠĠ", "i n", "Ġ t",... +llama_model_loader: - kv 36: tokenizer.ggml.eos_token_id u32 = 151645 +llama_model_loader: - kv 37: tokenizer.ggml.padding_token_id u32 = 151654 +llama_model_loader: - kv 38: tokenizer.ggml.add_bos_token bool = false +llama_model_loader: - kv 39: tokenizer.chat_template str = {#- Copyright 2025-present the Unslot... 
+llama_model_loader: - kv 40: split.no u16 = 0 +llama_model_loader: - kv 41: split.count u16 = 2 +llama_model_loader: - kv 42: split.tensors.count i32 = 579 +llama_model_loader: - type f32: 241 tensors +llama_model_loader: - type bf16: 338 tensors +print_info: file format = GGUF V3 (latest) +print_info: file type = BF16 +print_info: file size = 56.89 GiB (16.01 BPW) +load: special tokens cache size = 26 +load: token to piece cache size = 0.9311 MB +print_info: arch = qwen3moe +print_info: vocab_only = 0 +print_info: n_ctx_train = 262144 +print_info: n_embd = 2048 +print_info: n_layer = 48 +print_info: n_head = 32 +print_info: n_head_kv = 4 +print_info: n_rot = 128 +print_info: n_swa = 0 +print_info: is_swa_any = 0 +print_info: n_embd_head_k = 128 +print_info: n_embd_head_v = 128 +print_info: n_gqa = 8 +print_info: n_embd_k_gqa = 512 +print_info: n_embd_v_gqa = 512 +print_info: f_norm_eps = 0.0e+00 +print_info: f_norm_rms_eps = 1.0e-06 +print_info: f_clamp_kqv = 0.0e+00 +print_info: f_max_alibi_bias = 0.0e+00 +print_info: f_logit_scale = 0.0e+00 +print_info: f_attn_scale = 0.0e+00 +print_info: n_ff = 5472 +print_info: n_expert = 128 +print_info: n_expert_used = 8 +print_info: causal attn = 1 +print_info: pooling type = 0 +print_info: rope type = 2 +print_info: rope scaling = linear +print_info: freq_base_train = 10000000.0 +print_info: freq_scale_train = 1 +print_info: n_ctx_orig_yarn = 262144 +print_info: rope_finetuned = unknown +print_info: model type = 30B.A3B +print_info: model params = 30.53 B +print_info: general.name = Qwen3-Coder-30B-A3B-Instruct +print_info: n_ff_exp = 768 +print_info: vocab type = BPE +print_info: n_vocab = 151936 +print_info: n_merges = 151387 +print_info: BOS token = 11 ',' +print_info: EOS token = 151645 '<|im_end|>' +print_info: EOT token = 151645 '<|im_end|>' +print_info: PAD token = 151654 '<|vision_pad|>' +print_info: LF token = 198 'Ċ' +print_info: FIM PRE token = 151659 '<|fim_prefix|>' +print_info: FIM SUF token = 151661 
'<|fim_suffix|>' +print_info: FIM MID token = 151660 '<|fim_middle|>' +print_info: FIM PAD token = 151662 '<|fim_pad|>' +print_info: FIM REP token = 151663 '<|repo_name|>' +print_info: FIM SEP token = 151664 '<|file_sep|>' +print_info: EOG token = 151643 '<|endoftext|>' +print_info: EOG token = 151645 '<|im_end|>' +print_info: EOG token = 151662 '<|fim_pad|>' +print_info: EOG token = 151663 '<|repo_name|>' +print_info: EOG token = 151664 '<|file_sep|>' +print_info: max token length = 256 +load_tensors: loading model tensors, this can take a while... (mmap = false) +load_tensors: offloading 48 repeating layers to GPU +load_tensors: offloading output layer to GPU +load_tensors: offloaded 49/49 layers to GPU +load_tensors: ROCm0 model buffer size = 57666.30 MiB +load_tensors: ROCm_Host model buffer size = 593.50 MiB +................................................................................................... +llama_context: constructing llama_context +llama_context: non-unified KV cache requires ggml_set_rows() - forcing unified KV cache +llama_context: n_seq_max = 1 +llama_context: n_ctx = 4096 +llama_context: n_ctx_per_seq = 4096 +llama_context: n_batch = 2048 +llama_context: n_ubatch = 512 +llama_context: causal_attn = 1 +llama_context: flash_attn = 1 +llama_context: kv_unified = true +llama_context: freq_base = 10000000.0 +llama_context: freq_scale = 1 +llama_context: n_ctx_per_seq (4096) < n_ctx_train (262144) -- the full capacity of the model will not be utilized +llama_context: ROCm_Host output buffer size = 0.58 MiB +llama_kv_cache_unified: ROCm0 KV buffer size = 384.00 MiB +llama_kv_cache_unified: size = 384.00 MiB ( 4096 cells, 48 layers, 1/ 1 seqs), K (f16): 192.00 MiB, V (f16): 192.00 MiB +llama_kv_cache_unified: LLAMA_SET_ROWS=0, using old ggml_cpy() method for backwards compatibility +llama_context: ROCm0 compute buffer size = 300.75 MiB +llama_context: ROCm_Host compute buffer size = 8.01 MiB +llama_context: graph nodes = 3079 +llama_context: 
graph splits = 1 +common_init_from_params: added <|endoftext|> logit bias = -inf +common_init_from_params: added <|im_end|> logit bias = -inf +common_init_from_params: added <|fim_pad|> logit bias = -inf +common_init_from_params: added <|repo_name|> logit bias = -inf +common_init_from_params: added <|file_sep|> logit bias = -inf +common_init_from_params: setting dry_penalty_last_n to ctx_size = 4096 +common_init_from_params: warming up the model with an empty run - please wait ... (--no-warmup to disable) +main: llama threadpool init, n_threads = 16 + +system_info: n_threads = 16 (n_threads_batch = 16) / 32 | ROCm : NO_VMM = 1 | PEER_MAX_BATCH_SIZE = 128 | CPU : SSE3 = 1 | SSSE3 = 1 | AVX = 1 | AVX2 = 1 | F16C = 1 | FMA = 1 | BMI2 = 1 | AVX512 = 1 | AVX512_VBMI = 1 | AVX512_VNNI = 1 | LLAMAFILE = 1 | OPENMP = 1 | REPACK = 1 | + +sampler seed: 1388157865 +sampler params: + repeat_last_n = 64, repeat_penalty = 1.000, frequency_penalty = 0.000, presence_penalty = 0.000 + dry_multiplier = 0.000, dry_base = 1.750, dry_allowed_length = 2, dry_penalty_last_n = 4096 + top_k = 40, top_p = 0.950, min_p = 0.050, xtc_probability = 0.000, xtc_threshold = 0.100, typical_p = 1.000, top_n_sigma = -1.000, temp = 0.800 + mirostat = 0, mirostat_lr = 0.100, mirostat_ent = 5.000 +sampler chain: logits -> logit-bias -> penalties -> dry -> top-n-sigma -> top-k -> typical -> top-p -> min-p -> xtc -> temp-ext -> dist +generate: n_ctx = 4096, n_batch = 2048, n_predict = 1, n_keep = 0 + +Hello: + +llama_perf_sampler_print: sampling time = 0.06 ms / 2 runs ( 0.03 ms per token, 36363.64 tokens per second) +llama_perf_context_print: load time = 11788.33 ms +llama_perf_context_print: prompt eval time = 0.00 ms / 1 tokens ( 0.00 ms per token, inf tokens per second) +llama_perf_context_print: eval time = 43.56 ms / 1 runs ( 43.56 ms per token, 22.95 tokens per second) +llama_perf_context_print: total time = 82.77 ms / 2 tokens +llama_perf_context_print: graphs reused = 0 + Elapsed #3: 
12.528214562s + Run #3 status: 0 + → Avg over 3 runs: 16.161s diff --git a/benchmark/loadtime_results/Qwen3-Coder-30B-A3B-Instruct-BF16-00001-of-00002__vulkan_amdvlk.log b/benchmark/loadtime_results/Qwen3-Coder-30B-A3B-Instruct-BF16-00001-of-00002__vulkan_amdvlk.log new file mode 100644 index 0000000..a26f8f8 --- /dev/null +++ b/benchmark/loadtime_results/Qwen3-Coder-30B-A3B-Instruct-BF16-00001-of-00002__vulkan_amdvlk.log @@ -0,0 +1,174 @@ +ggml_vulkan: Found 1 Vulkan devices: +ggml_vulkan: 0 = Radeon 8060S Graphics (AMD open-source driver) | uma: 1 | fp16: 1 | bf16: 0 | warp size: 64 | shared memory: 32768 | int dot: 1 | matrix cores: KHR_coopmat +build: 6060 (9c35706b) with cc (GCC) 15.1.1 20250719 (Red Hat 15.1.1-5) for x86_64-redhat-linux +main: llama backend init +main: load the model and apply lora adapter, if any +llama_model_load_from_file_impl: using device Vulkan0 (Radeon 8060S Graphics) - 85720 MiB free +llama_model_loader: additional 1 GGUFs metadata loaded. +llama_model_loader: loaded meta data with 43 key-value pairs and 579 tensors from /home/kyuz0/models/qwen3-coder-30B-A3B/BF16/Qwen3-Coder-30B-A3B-Instruct-BF16-00001-of-00002.gguf (version GGUF V3 (latest)) +llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output. +llama_model_loader: - kv 0: general.architecture str = qwen3moe +llama_model_loader: - kv 1: general.type str = model +llama_model_loader: - kv 2: general.name str = Qwen3-Coder-30B-A3B-Instruct +llama_model_loader: - kv 3: general.finetune str = Instruct +llama_model_loader: - kv 4: general.basename str = Qwen3-Coder-30B-A3B-Instruct +llama_model_loader: - kv 5: general.quantized_by str = Unsloth +llama_model_loader: - kv 6: general.size_label str = 30B-A3B +llama_model_loader: - kv 7: general.license str = apache-2.0 +llama_model_loader: - kv 8: general.license.link str = https://huggingface.co/Qwen/Qwen3-Cod... 
+llama_model_loader: - kv 9: general.repo_url str = https://huggingface.co/unsloth +llama_model_loader: - kv 10: general.base_model.count u32 = 1 +llama_model_loader: - kv 11: general.base_model.0.name str = Qwen3 Coder 30B A3B Instruct +llama_model_loader: - kv 12: general.base_model.0.organization str = Qwen +llama_model_loader: - kv 13: general.base_model.0.repo_url str = https://huggingface.co/Qwen/Qwen3-Cod... +llama_model_loader: - kv 14: general.tags arr[str,2] = ["unsloth", "text-generation"] +llama_model_loader: - kv 15: qwen3moe.block_count u32 = 48 +llama_model_loader: - kv 16: qwen3moe.context_length u32 = 262144 +llama_model_loader: - kv 17: qwen3moe.embedding_length u32 = 2048 +llama_model_loader: - kv 18: qwen3moe.feed_forward_length u32 = 5472 +llama_model_loader: - kv 19: qwen3moe.attention.head_count u32 = 32 +llama_model_loader: - kv 20: qwen3moe.attention.head_count_kv u32 = 4 +llama_model_loader: - kv 21: qwen3moe.rope.freq_base f32 = 10000000.000000 +llama_model_loader: - kv 22: qwen3moe.attention.layer_norm_rms_epsilon f32 = 0.000001 +llama_model_loader: - kv 23: qwen3moe.expert_used_count u32 = 8 +llama_model_loader: - kv 24: qwen3moe.attention.key_length u32 = 128 +llama_model_loader: - kv 25: qwen3moe.attention.value_length u32 = 128 +llama_model_loader: - kv 26: general.file_type u32 = 32 +llama_model_loader: - kv 27: qwen3moe.expert_count u32 = 128 +llama_model_loader: - kv 28: qwen3moe.expert_feed_forward_length u32 = 768 +llama_model_loader: - kv 29: qwen3moe.expert_shared_feed_forward_length u32 = 0 +llama_model_loader: - kv 30: general.quantization_version u32 = 2 +llama_model_loader: - kv 31: tokenizer.ggml.model str = gpt2 +llama_model_loader: - kv 32: tokenizer.ggml.pre str = qwen2 +llama_model_loader: - kv 33: tokenizer.ggml.tokens arr[str,151936] = ["!", "\"", "#", "$", "%", "&", "'", ... +llama_model_loader: - kv 34: tokenizer.ggml.token_type arr[i32,151936] = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ... 
+llama_model_loader: - kv 35: tokenizer.ggml.merges arr[str,151387] = ["Ġ Ġ", "ĠĠ ĠĠ", "i n", "Ġ t",... +llama_model_loader: - kv 36: tokenizer.ggml.eos_token_id u32 = 151645 +llama_model_loader: - kv 37: tokenizer.ggml.padding_token_id u32 = 151654 +llama_model_loader: - kv 38: tokenizer.ggml.add_bos_token bool = false +llama_model_loader: - kv 39: tokenizer.chat_template str = {#- Copyright 2025-present the Unslot... +llama_model_loader: - kv 40: split.no u16 = 0 +llama_model_loader: - kv 41: split.count u16 = 2 +llama_model_loader: - kv 42: split.tensors.count i32 = 579 +llama_model_loader: - type f32: 241 tensors +llama_model_loader: - type bf16: 338 tensors +print_info: file format = GGUF V3 (latest) +print_info: file type = BF16 +print_info: file size = 56.89 GiB (16.01 BPW) +load: special tokens cache size = 26 +load: token to piece cache size = 0.9311 MB +print_info: arch = qwen3moe +print_info: vocab_only = 0 +print_info: n_ctx_train = 262144 +print_info: n_embd = 2048 +print_info: n_layer = 48 +print_info: n_head = 32 +print_info: n_head_kv = 4 +print_info: n_rot = 128 +print_info: n_swa = 0 +print_info: is_swa_any = 0 +print_info: n_embd_head_k = 128 +print_info: n_embd_head_v = 128 +print_info: n_gqa = 8 +print_info: n_embd_k_gqa = 512 +print_info: n_embd_v_gqa = 512 +print_info: f_norm_eps = 0.0e+00 +print_info: f_norm_rms_eps = 1.0e-06 +print_info: f_clamp_kqv = 0.0e+00 +print_info: f_max_alibi_bias = 0.0e+00 +print_info: f_logit_scale = 0.0e+00 +print_info: f_attn_scale = 0.0e+00 +print_info: n_ff = 5472 +print_info: n_expert = 128 +print_info: n_expert_used = 8 +print_info: causal attn = 1 +print_info: pooling type = 0 +print_info: rope type = 2 +print_info: rope scaling = linear +print_info: freq_base_train = 10000000.0 +print_info: freq_scale_train = 1 +print_info: n_ctx_orig_yarn = 262144 +print_info: rope_finetuned = unknown +print_info: model type = 30B.A3B +print_info: model params = 30.53 B +print_info: general.name = 
Qwen3-Coder-30B-A3B-Instruct +print_info: n_ff_exp = 768 +print_info: vocab type = BPE +print_info: n_vocab = 151936 +print_info: n_merges = 151387 +print_info: BOS token = 11 ',' +print_info: EOS token = 151645 '<|im_end|>' +print_info: EOT token = 151645 '<|im_end|>' +print_info: PAD token = 151654 '<|vision_pad|>' +print_info: LF token = 198 'Ċ' +print_info: FIM PRE token = 151659 '<|fim_prefix|>' +print_info: FIM SUF token = 151661 '<|fim_suffix|>' +print_info: FIM MID token = 151660 '<|fim_middle|>' +print_info: FIM PAD token = 151662 '<|fim_pad|>' +print_info: FIM REP token = 151663 '<|repo_name|>' +print_info: FIM SEP token = 151664 '<|file_sep|>' +print_info: EOG token = 151643 '<|endoftext|>' +print_info: EOG token = 151645 '<|im_end|>' +print_info: EOG token = 151662 '<|fim_pad|>' +print_info: EOG token = 151663 '<|repo_name|>' +print_info: EOG token = 151664 '<|file_sep|>' +print_info: max token length = 256 +load_tensors: loading model tensors, this can take a while... (mmap = false) +load_tensors: offloading 48 repeating layers to GPU +load_tensors: offloading output layer to GPU +load_tensors: offloaded 49/49 layers to GPU +load_tensors: Vulkan0 model buffer size = 57666.30 MiB +load_tensors: Vulkan_Host model buffer size = 593.50 MiB +................................................................................................... 
+llama_context: constructing llama_context +llama_context: non-unified KV cache requires ggml_set_rows() - forcing unified KV cache +llama_context: n_seq_max = 1 +llama_context: n_ctx = 4096 +llama_context: n_ctx_per_seq = 4096 +llama_context: n_batch = 2048 +llama_context: n_ubatch = 512 +llama_context: causal_attn = 1 +llama_context: flash_attn = 1 +llama_context: kv_unified = true +llama_context: freq_base = 10000000.0 +llama_context: freq_scale = 1 +llama_context: n_ctx_per_seq (4096) < n_ctx_train (262144) -- the full capacity of the model will not be utilized +llama_context: Vulkan_Host output buffer size = 0.58 MiB +llama_kv_cache_unified: Vulkan0 KV buffer size = 384.00 MiB +llama_kv_cache_unified: size = 384.00 MiB ( 4096 cells, 48 layers, 1/ 1 seqs), K (f16): 192.00 MiB, V (f16): 192.00 MiB +llama_kv_cache_unified: LLAMA_SET_ROWS=0, using old ggml_cpy() method for backwards compatibility +llama_context: Vulkan0 compute buffer size = 304.75 MiB +llama_context: Vulkan_Host compute buffer size = 12.01 MiB +llama_context: graph nodes = 3079 +llama_context: graph splits = 2 +common_init_from_params: added <|endoftext|> logit bias = -inf +common_init_from_params: added <|im_end|> logit bias = -inf +common_init_from_params: added <|fim_pad|> logit bias = -inf +common_init_from_params: added <|repo_name|> logit bias = -inf +common_init_from_params: added <|file_sep|> logit bias = -inf +common_init_from_params: setting dry_penalty_last_n to ctx_size = 4096 +common_init_from_params: warming up the model with an empty run - please wait ... 
(--no-warmup to disable) +main: llama threadpool init, n_threads = 16 + +system_info: n_threads = 16 (n_threads_batch = 16) / 32 | CPU : SSE3 = 1 | SSSE3 = 1 | AVX = 1 | AVX2 = 1 | F16C = 1 | FMA = 1 | BMI2 = 1 | AVX512 = 1 | AVX512_VBMI = 1 | AVX512_VNNI = 1 | LLAMAFILE = 1 | OPENMP = 1 | REPACK = 1 | + +sampler seed: 243266880 +sampler params: + repeat_last_n = 64, repeat_penalty = 1.000, frequency_penalty = 0.000, presence_penalty = 0.000 + dry_multiplier = 0.000, dry_base = 1.750, dry_allowed_length = 2, dry_penalty_last_n = 4096 + top_k = 40, top_p = 0.950, min_p = 0.050, xtc_probability = 0.000, xtc_threshold = 0.100, typical_p = 1.000, top_n_sigma = -1.000, temp = 0.800 + mirostat = 0, mirostat_lr = 0.100, mirostat_ent = 5.000 +sampler chain: logits -> logit-bias -> penalties -> dry -> top-n-sigma -> top-k -> typical -> top-p -> min-p -> xtc -> temp-ext -> dist +generate: n_ctx = 4096, n_batch = 2048, n_predict = 1, n_keep = 0 + +Hello: + +llama_perf_sampler_print: sampling time = 0.08 ms / 2 runs ( 0.04 ms per token, 26315.79 tokens per second) +llama_perf_context_print: load time = 9973.02 ms +llama_perf_context_print: prompt eval time = 0.00 ms / 1 tokens ( 0.00 ms per token, inf tokens per second) +llama_perf_context_print: eval time = 130.78 ms / 1 runs ( 130.78 ms per token, 7.65 tokens per second) +llama_perf_context_print: total time = 185.17 ms / 2 tokens +llama_perf_context_print: graphs reused = 0 + Elapsed #3: 10.756452016s + Run #3 status: 0 + → Avg over 3 runs: 12.940s diff --git a/benchmark/loadtime_results/Qwen3-Coder-30B-A3B-Instruct-BF16-00001-of-00002__vulkan_radv.log b/benchmark/loadtime_results/Qwen3-Coder-30B-A3B-Instruct-BF16-00001-of-00002__vulkan_radv.log new file mode 100644 index 0000000..ef76488 --- /dev/null +++ b/benchmark/loadtime_results/Qwen3-Coder-30B-A3B-Instruct-BF16-00001-of-00002__vulkan_radv.log @@ -0,0 +1,174 @@ +ggml_vulkan: Found 1 Vulkan devices: +ggml_vulkan: 0 = Radeon 8060S Graphics (RADV GFX1151) (radv) | uma: 1 
| fp16: 1 | bf16: 0 | warp size: 64 | shared memory: 65536 | int dot: 1 | matrix cores: KHR_coopmat +build: 6040 (66625a59) with cc (GCC) 15.1.1 20250719 (Red Hat 15.1.1-5) for x86_64-redhat-linux +main: llama backend init +main: load the model and apply lora adapter, if any +llama_model_load_from_file_impl: using device Vulkan0 (Radeon 8060S Graphics (RADV GFX1151)) - 87722 MiB free +llama_model_loader: additional 1 GGUFs metadata loaded. +llama_model_loader: loaded meta data with 43 key-value pairs and 579 tensors from /home/kyuz0/models/qwen3-coder-30B-A3B/BF16/Qwen3-Coder-30B-A3B-Instruct-BF16-00001-of-00002.gguf (version GGUF V3 (latest)) +llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output. +llama_model_loader: - kv 0: general.architecture str = qwen3moe +llama_model_loader: - kv 1: general.type str = model +llama_model_loader: - kv 2: general.name str = Qwen3-Coder-30B-A3B-Instruct +llama_model_loader: - kv 3: general.finetune str = Instruct +llama_model_loader: - kv 4: general.basename str = Qwen3-Coder-30B-A3B-Instruct +llama_model_loader: - kv 5: general.quantized_by str = Unsloth +llama_model_loader: - kv 6: general.size_label str = 30B-A3B +llama_model_loader: - kv 7: general.license str = apache-2.0 +llama_model_loader: - kv 8: general.license.link str = https://huggingface.co/Qwen/Qwen3-Cod... +llama_model_loader: - kv 9: general.repo_url str = https://huggingface.co/unsloth +llama_model_loader: - kv 10: general.base_model.count u32 = 1 +llama_model_loader: - kv 11: general.base_model.0.name str = Qwen3 Coder 30B A3B Instruct +llama_model_loader: - kv 12: general.base_model.0.organization str = Qwen +llama_model_loader: - kv 13: general.base_model.0.repo_url str = https://huggingface.co/Qwen/Qwen3-Cod... 
+llama_model_loader: - kv 14: general.tags arr[str,2] = ["unsloth", "text-generation"] +llama_model_loader: - kv 15: qwen3moe.block_count u32 = 48 +llama_model_loader: - kv 16: qwen3moe.context_length u32 = 262144 +llama_model_loader: - kv 17: qwen3moe.embedding_length u32 = 2048 +llama_model_loader: - kv 18: qwen3moe.feed_forward_length u32 = 5472 +llama_model_loader: - kv 19: qwen3moe.attention.head_count u32 = 32 +llama_model_loader: - kv 20: qwen3moe.attention.head_count_kv u32 = 4 +llama_model_loader: - kv 21: qwen3moe.rope.freq_base f32 = 10000000.000000 +llama_model_loader: - kv 22: qwen3moe.attention.layer_norm_rms_epsilon f32 = 0.000001 +llama_model_loader: - kv 23: qwen3moe.expert_used_count u32 = 8 +llama_model_loader: - kv 24: qwen3moe.attention.key_length u32 = 128 +llama_model_loader: - kv 25: qwen3moe.attention.value_length u32 = 128 +llama_model_loader: - kv 26: general.file_type u32 = 32 +llama_model_loader: - kv 27: qwen3moe.expert_count u32 = 128 +llama_model_loader: - kv 28: qwen3moe.expert_feed_forward_length u32 = 768 +llama_model_loader: - kv 29: qwen3moe.expert_shared_feed_forward_length u32 = 0 +llama_model_loader: - kv 30: general.quantization_version u32 = 2 +llama_model_loader: - kv 31: tokenizer.ggml.model str = gpt2 +llama_model_loader: - kv 32: tokenizer.ggml.pre str = qwen2 +llama_model_loader: - kv 33: tokenizer.ggml.tokens arr[str,151936] = ["!", "\"", "#", "$", "%", "&", "'", ... +llama_model_loader: - kv 34: tokenizer.ggml.token_type arr[i32,151936] = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ... +llama_model_loader: - kv 35: tokenizer.ggml.merges arr[str,151387] = ["Ġ Ġ", "ĠĠ ĠĠ", "i n", "Ġ t",... +llama_model_loader: - kv 36: tokenizer.ggml.eos_token_id u32 = 151645 +llama_model_loader: - kv 37: tokenizer.ggml.padding_token_id u32 = 151654 +llama_model_loader: - kv 38: tokenizer.ggml.add_bos_token bool = false +llama_model_loader: - kv 39: tokenizer.chat_template str = {#- Copyright 2025-present the Unslot... 
+llama_model_loader: - kv 40: split.no u16 = 0 +llama_model_loader: - kv 41: split.count u16 = 2 +llama_model_loader: - kv 42: split.tensors.count i32 = 579 +llama_model_loader: - type f32: 241 tensors +llama_model_loader: - type bf16: 338 tensors +print_info: file format = GGUF V3 (latest) +print_info: file type = BF16 +print_info: file size = 56.89 GiB (16.01 BPW) +load: special tokens cache size = 26 +load: token to piece cache size = 0.9311 MB +print_info: arch = qwen3moe +print_info: vocab_only = 0 +print_info: n_ctx_train = 262144 +print_info: n_embd = 2048 +print_info: n_layer = 48 +print_info: n_head = 32 +print_info: n_head_kv = 4 +print_info: n_rot = 128 +print_info: n_swa = 0 +print_info: is_swa_any = 0 +print_info: n_embd_head_k = 128 +print_info: n_embd_head_v = 128 +print_info: n_gqa = 8 +print_info: n_embd_k_gqa = 512 +print_info: n_embd_v_gqa = 512 +print_info: f_norm_eps = 0.0e+00 +print_info: f_norm_rms_eps = 1.0e-06 +print_info: f_clamp_kqv = 0.0e+00 +print_info: f_max_alibi_bias = 0.0e+00 +print_info: f_logit_scale = 0.0e+00 +print_info: f_attn_scale = 0.0e+00 +print_info: n_ff = 5472 +print_info: n_expert = 128 +print_info: n_expert_used = 8 +print_info: causal attn = 1 +print_info: pooling type = 0 +print_info: rope type = 2 +print_info: rope scaling = linear +print_info: freq_base_train = 10000000.0 +print_info: freq_scale_train = 1 +print_info: n_ctx_orig_yarn = 262144 +print_info: rope_finetuned = unknown +print_info: model type = 30B.A3B +print_info: model params = 30.53 B +print_info: general.name = Qwen3-Coder-30B-A3B-Instruct +print_info: n_ff_exp = 768 +print_info: vocab type = BPE +print_info: n_vocab = 151936 +print_info: n_merges = 151387 +print_info: BOS token = 11 ',' +print_info: EOS token = 151645 '<|im_end|>' +print_info: EOT token = 151645 '<|im_end|>' +print_info: PAD token = 151654 '<|vision_pad|>' +print_info: LF token = 198 'Ċ' +print_info: FIM PRE token = 151659 '<|fim_prefix|>' +print_info: FIM SUF token = 151661 
'<|fim_suffix|>' +print_info: FIM MID token = 151660 '<|fim_middle|>' +print_info: FIM PAD token = 151662 '<|fim_pad|>' +print_info: FIM REP token = 151663 '<|repo_name|>' +print_info: FIM SEP token = 151664 '<|file_sep|>' +print_info: EOG token = 151643 '<|endoftext|>' +print_info: EOG token = 151645 '<|im_end|>' +print_info: EOG token = 151662 '<|fim_pad|>' +print_info: EOG token = 151663 '<|repo_name|>' +print_info: EOG token = 151664 '<|file_sep|>' +print_info: max token length = 256 +load_tensors: loading model tensors, this can take a while... (mmap = false) +load_tensors: offloading 48 repeating layers to GPU +load_tensors: offloading output layer to GPU +load_tensors: offloaded 49/49 layers to GPU +load_tensors: Vulkan0 model buffer size = 57666.30 MiB +load_tensors: Vulkan_Host model buffer size = 593.50 MiB +................................................................................................... +llama_context: constructing llama_context +llama_context: non-unified KV cache requires ggml_set_rows() - forcing unified KV cache +llama_context: n_seq_max = 1 +llama_context: n_ctx = 4096 +llama_context: n_ctx_per_seq = 4096 +llama_context: n_batch = 2048 +llama_context: n_ubatch = 512 +llama_context: causal_attn = 1 +llama_context: flash_attn = 1 +llama_context: kv_unified = true +llama_context: freq_base = 10000000.0 +llama_context: freq_scale = 1 +llama_context: n_ctx_per_seq (4096) < n_ctx_train (262144) -- the full capacity of the model will not be utilized +llama_context: Vulkan_Host output buffer size = 0.58 MiB +llama_kv_cache_unified: Vulkan0 KV buffer size = 384.00 MiB +llama_kv_cache_unified: size = 384.00 MiB ( 4096 cells, 48 layers, 1/ 1 seqs), K (f16): 192.00 MiB, V (f16): 192.00 MiB +llama_kv_cache_unified: LLAMA_SET_ROWS=0, using old ggml_cpy() method for backwards compatibility +llama_context: Vulkan0 compute buffer size = 304.75 MiB +llama_context: Vulkan_Host compute buffer size = 12.01 MiB +llama_context: graph nodes = 3079 
+llama_context: graph splits = 2 +common_init_from_params: added <|endoftext|> logit bias = -inf +common_init_from_params: added <|im_end|> logit bias = -inf +common_init_from_params: added <|fim_pad|> logit bias = -inf +common_init_from_params: added <|repo_name|> logit bias = -inf +common_init_from_params: added <|file_sep|> logit bias = -inf +common_init_from_params: setting dry_penalty_last_n to ctx_size = 4096 +common_init_from_params: warming up the model with an empty run - please wait ... (--no-warmup to disable) +main: llama threadpool init, n_threads = 16 + +system_info: n_threads = 16 (n_threads_batch = 16) / 32 | CPU : SSE3 = 1 | SSSE3 = 1 | AVX = 1 | AVX2 = 1 | F16C = 1 | FMA = 1 | BMI2 = 1 | AVX512 = 1 | AVX512_VBMI = 1 | AVX512_VNNI = 1 | LLAMAFILE = 1 | OPENMP = 1 | REPACK = 1 | + +sampler seed: 2350977163 +sampler params: + repeat_last_n = 64, repeat_penalty = 1.000, frequency_penalty = 0.000, presence_penalty = 0.000 + dry_multiplier = 0.000, dry_base = 1.750, dry_allowed_length = 2, dry_penalty_last_n = 4096 + top_k = 40, top_p = 0.950, min_p = 0.050, xtc_probability = 0.000, xtc_threshold = 0.100, typical_p = 1.000, top_n_sigma = -1.000, temp = 0.800 + mirostat = 0, mirostat_lr = 0.100, mirostat_ent = 5.000 +sampler chain: logits -> logit-bias -> penalties -> dry -> top-n-sigma -> top-k -> typical -> top-p -> min-p -> xtc -> temp-ext -> dist +generate: n_ctx = 4096, n_batch = 2048, n_predict = 1, n_keep = 0 + +Hello: + +llama_perf_sampler_print: sampling time = 0.07 ms / 2 runs ( 0.04 ms per token, 27027.03 tokens per second) +llama_perf_context_print: load time = 13008.56 ms +llama_perf_context_print: prompt eval time = 0.00 ms / 1 tokens ( 0.00 ms per token, inf tokens per second) +llama_perf_context_print: eval time = 140.05 ms / 1 runs ( 140.05 ms per token, 7.14 tokens per second) +llama_perf_context_print: total time = 194.09 ms / 2 tokens +llama_perf_context_print: graphs reused = 0 + Elapsed #3: 13.570267879s + Run #3 status: 0 + → Avg 
over 3 runs: 14.021s diff --git a/benchmark/loadtime_results/gemma-3-12b-it-UD-Q8_K_XL__rocm6_4_2.log b/benchmark/loadtime_results/gemma-3-12b-it-UD-Q8_K_XL__rocm6_4_2.log new file mode 100644 index 0000000..c653b8c --- /dev/null +++ b/benchmark/loadtime_results/gemma-3-12b-it-UD-Q8_K_XL__rocm6_4_2.log @@ -0,0 +1,165 @@ +ggml_cuda_init: GGML_CUDA_FORCE_MMQ: no +ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no +ggml_cuda_init: found 1 ROCm devices: + Device 0: Radeon 8060S Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32 +build: 6040 (66625a59) with cc (GCC) 15.1.1 20250521 (Red Hat 15.1.1-2) for x86_64-redhat-linux +main: llama backend init +main: load the model and apply lora adapter, if any +llama_model_load_from_file_impl: using device ROCm0 (Radeon 8060S Graphics) - 124522 MiB free +llama_model_loader: loaded meta data with 40 key-value pairs and 626 tensors from /home/kyuz0/models/gemma-3-12b-it-UD-Q8_K_XL/gemma-3-12b-it-UD-Q8_K_XL.gguf (version GGUF V3 (latest)) +llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output. 
+llama_model_loader: - kv 0: general.architecture str = gemma3 +llama_model_loader: - kv 1: general.type str = model +llama_model_loader: - kv 2: general.name str = Gemma-3-12B-It +llama_model_loader: - kv 3: general.finetune str = it +llama_model_loader: - kv 4: general.basename str = Gemma-3-12B-It +llama_model_loader: - kv 5: general.quantized_by str = Unsloth +llama_model_loader: - kv 6: general.size_label str = 12B +llama_model_loader: - kv 7: general.repo_url str = https://huggingface.co/unsloth +llama_model_loader: - kv 8: gemma3.context_length u32 = 131072 +llama_model_loader: - kv 9: gemma3.embedding_length u32 = 3840 +llama_model_loader: - kv 10: gemma3.block_count u32 = 48 +llama_model_loader: - kv 11: gemma3.feed_forward_length u32 = 15360 +llama_model_loader: - kv 12: gemma3.attention.head_count u32 = 16 +llama_model_loader: - kv 13: gemma3.attention.layer_norm_rms_epsilon f32 = 0.000001 +llama_model_loader: - kv 14: gemma3.attention.key_length u32 = 256 +llama_model_loader: - kv 15: gemma3.attention.value_length u32 = 256 +llama_model_loader: - kv 16: gemma3.rope.freq_base f32 = 1000000.000000 +llama_model_loader: - kv 17: gemma3.attention.sliding_window u32 = 1024 +llama_model_loader: - kv 18: gemma3.attention.head_count_kv u32 = 8 +llama_model_loader: - kv 19: gemma3.rope.scaling.type str = linear +llama_model_loader: - kv 20: gemma3.rope.scaling.factor f32 = 8.000000 +llama_model_loader: - kv 21: tokenizer.ggml.model str = llama +llama_model_loader: - kv 22: tokenizer.ggml.pre str = default +llama_model_loader: - kv 23: tokenizer.ggml.tokens arr[str,262208] = ["", "", "", "", ... +llama_model_loader: - kv 24: tokenizer.ggml.scores arr[f32,262208] = [-1000.000000, -1000.000000, -1000.00... +llama_model_loader: - kv 25: tokenizer.ggml.token_type arr[i32,262208] = [3, 3, 3, 3, 3, 4, 3, 3, 3, 3, 3, 3, ... 
+llama_model_loader: - kv 26: tokenizer.ggml.bos_token_id u32 = 2 +llama_model_loader: - kv 27: tokenizer.ggml.eos_token_id u32 = 106 +llama_model_loader: - kv 28: tokenizer.ggml.unknown_token_id u32 = 3 +llama_model_loader: - kv 29: tokenizer.ggml.padding_token_id u32 = 0 +llama_model_loader: - kv 30: tokenizer.ggml.add_bos_token bool = true +llama_model_loader: - kv 31: tokenizer.ggml.add_eos_token bool = false +llama_model_loader: - kv 32: tokenizer.chat_template str = {{ bos_token }}\n{%- if messages[0]['r... +llama_model_loader: - kv 33: tokenizer.ggml.add_space_prefix bool = false +llama_model_loader: - kv 34: general.quantization_version u32 = 2 +llama_model_loader: - kv 35: general.file_type u32 = 7 +llama_model_loader: - kv 36: quantize.imatrix.file str = gemma-3-12b-it-GGUF/imatrix_unsloth.dat +llama_model_loader: - kv 37: quantize.imatrix.dataset str = unsloth_calibration_gemma-3-12b-it.txt +llama_model_loader: - kv 38: quantize.imatrix.entries_count i32 = 336 +llama_model_loader: - kv 39: quantize.imatrix.chunks_count i32 = 663 +llama_model_loader: - type f32: 289 tensors +llama_model_loader: - type q8_0: 311 tensors +llama_model_loader: - type bf16: 26 tensors +print_info: file format = GGUF V3 (latest) +print_info: file type = Q8_0 +print_info: file size = 13.40 GiB (9.78 BPW) +load: special tokens cache size = 6415 +load: token to piece cache size = 1.9446 MB +print_info: arch = gemma3 +print_info: vocab_only = 0 +print_info: n_ctx_train = 131072 +print_info: n_embd = 3840 +print_info: n_layer = 48 +print_info: n_head = 16 +print_info: n_head_kv = 8 +print_info: n_rot = 256 +print_info: n_swa = 1024 +print_info: is_swa_any = 1 +print_info: n_embd_head_k = 256 +print_info: n_embd_head_v = 256 +print_info: n_gqa = 2 +print_info: n_embd_k_gqa = 2048 +print_info: n_embd_v_gqa = 2048 +print_info: f_norm_eps = 0.0e+00 +print_info: f_norm_rms_eps = 1.0e-06 +print_info: f_clamp_kqv = 0.0e+00 +print_info: f_max_alibi_bias = 0.0e+00 +print_info: f_logit_scale 
= 0.0e+00 +print_info: f_attn_scale = 6.2e-02 +print_info: n_ff = 15360 +print_info: n_expert = 0 +print_info: n_expert_used = 0 +print_info: causal attn = 1 +print_info: pooling type = 0 +print_info: rope type = 2 +print_info: rope scaling = linear +print_info: freq_base_train = 1000000.0 +print_info: freq_scale_train = 0.125 +print_info: n_ctx_orig_yarn = 131072 +print_info: rope_finetuned = unknown +print_info: model type = 12B +print_info: model params = 11.77 B +print_info: general.name = Gemma-3-12B-It +print_info: vocab type = SPM +print_info: n_vocab = 262208 +print_info: n_merges = 0 +print_info: BOS token = 2 '' +print_info: EOS token = 106 '' +print_info: EOT token = 106 '' +print_info: UNK token = 3 '' +print_info: PAD token = 0 '' +print_info: LF token = 248 '<0x0A>' +print_info: EOG token = 106 '' +print_info: max token length = 48 +load_tensors: loading model tensors, this can take a while... (mmap = false) +load_tensors: offloading 48 repeating layers to GPU +load_tensors: offloading output layer to GPU +load_tensors: offloaded 49/49 layers to GPU +load_tensors: ROCm0 model buffer size = 13721.20 MiB +load_tensors: ROCm_Host model buffer size = 1920.47 MiB +............................................................................. 
+llama_context: constructing llama_context +llama_context: non-unified KV cache requires ggml_set_rows() - forcing unified KV cache +llama_context: n_seq_max = 1 +llama_context: n_ctx = 4096 +llama_context: n_ctx_per_seq = 4096 +llama_context: n_batch = 2048 +llama_context: n_ubatch = 512 +llama_context: causal_attn = 1 +llama_context: flash_attn = 1 +llama_context: kv_unified = true +llama_context: freq_base = 1000000.0 +llama_context: freq_scale = 0.125 +llama_context: n_ctx_per_seq (4096) < n_ctx_train (131072) -- the full capacity of the model will not be utilized +llama_context: ROCm_Host output buffer size = 1.00 MiB +llama_kv_cache_unified_iswa: creating non-SWA KV cache, size = 4096 cells +llama_kv_cache_unified: ROCm0 KV buffer size = 256.00 MiB +llama_kv_cache_unified: size = 256.00 MiB ( 4096 cells, 8 layers, 1/ 1 seqs), K (f16): 128.00 MiB, V (f16): 128.00 MiB +llama_kv_cache_unified: LLAMA_SET_ROWS=0, using old ggml_cpy() method for backwards compatibility +llama_kv_cache_unified_iswa: creating SWA KV cache, size = 1536 cells +llama_kv_cache_unified: ROCm0 KV buffer size = 480.00 MiB +llama_kv_cache_unified: size = 480.00 MiB ( 1536 cells, 40 layers, 1/ 1 seqs), K (f16): 240.00 MiB, V (f16): 240.00 MiB +llama_kv_cache_unified: LLAMA_SET_ROWS=0, using old ggml_cpy() method for backwards compatibility +llama_context: ROCm0 compute buffer size = 519.62 MiB +llama_context: ROCm_Host compute buffer size = 11.01 MiB +llama_context: graph nodes = 2025 +llama_context: graph splits = 1 +common_init_from_params: KV cache shifting is not supported for this context, disabling KV cache shifting +common_init_from_params: added logit bias = -inf +common_init_from_params: setting dry_penalty_last_n to ctx_size = 4096 +common_init_from_params: warming up the model with an empty run - please wait ... 
(--no-warmup to disable) +main: llama threadpool init, n_threads = 16 + +system_info: n_threads = 16 (n_threads_batch = 16) / 32 | ROCm : NO_VMM = 1 | PEER_MAX_BATCH_SIZE = 128 | CPU : SSE3 = 1 | SSSE3 = 1 | AVX = 1 | AVX2 = 1 | F16C = 1 | FMA = 1 | BMI2 = 1 | AVX512 = 1 | AVX512_VBMI = 1 | AVX512_VNNI = 1 | LLAMAFILE = 1 | OPENMP = 1 | REPACK = 1 | + +sampler seed: 3471752321 +sampler params: + repeat_last_n = 64, repeat_penalty = 1.000, frequency_penalty = 0.000, presence_penalty = 0.000 + dry_multiplier = 0.000, dry_base = 1.750, dry_allowed_length = 2, dry_penalty_last_n = 4096 + top_k = 40, top_p = 0.950, min_p = 0.050, xtc_probability = 0.000, xtc_threshold = 0.100, typical_p = 1.000, top_n_sigma = -1.000, temp = 0.800 + mirostat = 0, mirostat_lr = 0.100, mirostat_ent = 5.000 +sampler chain: logits -> logit-bias -> penalties -> dry -> top-n-sigma -> top-k -> typical -> top-p -> min-p -> xtc -> temp-ext -> dist +generate: n_ctx = 4096, n_batch = 2048, n_predict = 1, n_keep = 1 + +Hello** + +llama_perf_sampler_print: sampling time = 0.09 ms / 3 runs ( 0.03 ms per token, 35294.12 tokens per second) +llama_perf_context_print: load time = 2510.88 ms +llama_perf_context_print: prompt eval time = 74.99 ms / 2 tokens ( 37.49 ms per token, 26.67 tokens per second) +llama_perf_context_print: eval time = 0.00 ms / 1 runs ( 0.00 ms per token, inf tokens per second) +llama_perf_context_print: total time = 79.74 ms / 3 tokens +llama_perf_context_print: graphs reused = 0 + Elapsed #3: 6.594391168s + Run #3 status: 0 + → Avg over 3 runs: 6.686s diff --git a/benchmark/loadtime_results/gemma-3-12b-it-UD-Q8_K_XL__rocm7_beta.log b/benchmark/loadtime_results/gemma-3-12b-it-UD-Q8_K_XL__rocm7_beta.log new file mode 100644 index 0000000..c06575b --- /dev/null +++ b/benchmark/loadtime_results/gemma-3-12b-it-UD-Q8_K_XL__rocm7_beta.log @@ -0,0 +1,165 @@ +ggml_cuda_init: GGML_CUDA_FORCE_MMQ: no +ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no +ggml_cuda_init: found 1 ROCm devices: + Device 
0: AMD Radeon Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32 +build: 6040 (66625a59) with cc (GCC) 15.1.1 20250719 (Red Hat 15.1.1-5) for x86_64-redhat-linux +main: llama backend init +main: load the model and apply lora adapter, if any +llama_model_load_from_file_impl: using device ROCm0 (AMD Radeon Graphics) - 124523 MiB free +llama_model_loader: loaded meta data with 40 key-value pairs and 626 tensors from /home/kyuz0/models/gemma-3-12b-it-UD-Q8_K_XL/gemma-3-12b-it-UD-Q8_K_XL.gguf (version GGUF V3 (latest)) +llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output. +llama_model_loader: - kv 0: general.architecture str = gemma3 +llama_model_loader: - kv 1: general.type str = model +llama_model_loader: - kv 2: general.name str = Gemma-3-12B-It +llama_model_loader: - kv 3: general.finetune str = it +llama_model_loader: - kv 4: general.basename str = Gemma-3-12B-It +llama_model_loader: - kv 5: general.quantized_by str = Unsloth +llama_model_loader: - kv 6: general.size_label str = 12B +llama_model_loader: - kv 7: general.repo_url str = https://huggingface.co/unsloth +llama_model_loader: - kv 8: gemma3.context_length u32 = 131072 +llama_model_loader: - kv 9: gemma3.embedding_length u32 = 3840 +llama_model_loader: - kv 10: gemma3.block_count u32 = 48 +llama_model_loader: - kv 11: gemma3.feed_forward_length u32 = 15360 +llama_model_loader: - kv 12: gemma3.attention.head_count u32 = 16 +llama_model_loader: - kv 13: gemma3.attention.layer_norm_rms_epsilon f32 = 0.000001 +llama_model_loader: - kv 14: gemma3.attention.key_length u32 = 256 +llama_model_loader: - kv 15: gemma3.attention.value_length u32 = 256 +llama_model_loader: - kv 16: gemma3.rope.freq_base f32 = 1000000.000000 +llama_model_loader: - kv 17: gemma3.attention.sliding_window u32 = 1024 +llama_model_loader: - kv 18: gemma3.attention.head_count_kv u32 = 8 +llama_model_loader: - kv 19: gemma3.rope.scaling.type str = linear +llama_model_loader: - kv 20: 
gemma3.rope.scaling.factor f32 = 8.000000 +llama_model_loader: - kv 21: tokenizer.ggml.model str = llama +llama_model_loader: - kv 22: tokenizer.ggml.pre str = default +llama_model_loader: - kv 23: tokenizer.ggml.tokens arr[str,262208] = ["", "", "", "", ... +llama_model_loader: - kv 24: tokenizer.ggml.scores arr[f32,262208] = [-1000.000000, -1000.000000, -1000.00... +llama_model_loader: - kv 25: tokenizer.ggml.token_type arr[i32,262208] = [3, 3, 3, 3, 3, 4, 3, 3, 3, 3, 3, 3, ... +llama_model_loader: - kv 26: tokenizer.ggml.bos_token_id u32 = 2 +llama_model_loader: - kv 27: tokenizer.ggml.eos_token_id u32 = 106 +llama_model_loader: - kv 28: tokenizer.ggml.unknown_token_id u32 = 3 +llama_model_loader: - kv 29: tokenizer.ggml.padding_token_id u32 = 0 +llama_model_loader: - kv 30: tokenizer.ggml.add_bos_token bool = true +llama_model_loader: - kv 31: tokenizer.ggml.add_eos_token bool = false +llama_model_loader: - kv 32: tokenizer.chat_template str = {{ bos_token }}\n{%- if messages[0]['r... 
+llama_model_loader: - kv 33: tokenizer.ggml.add_space_prefix bool = false +llama_model_loader: - kv 34: general.quantization_version u32 = 2 +llama_model_loader: - kv 35: general.file_type u32 = 7 +llama_model_loader: - kv 36: quantize.imatrix.file str = gemma-3-12b-it-GGUF/imatrix_unsloth.dat +llama_model_loader: - kv 37: quantize.imatrix.dataset str = unsloth_calibration_gemma-3-12b-it.txt +llama_model_loader: - kv 38: quantize.imatrix.entries_count i32 = 336 +llama_model_loader: - kv 39: quantize.imatrix.chunks_count i32 = 663 +llama_model_loader: - type f32: 289 tensors +llama_model_loader: - type q8_0: 311 tensors +llama_model_loader: - type bf16: 26 tensors +print_info: file format = GGUF V3 (latest) +print_info: file type = Q8_0 +print_info: file size = 13.40 GiB (9.78 BPW) +load: special tokens cache size = 6415 +load: token to piece cache size = 1.9446 MB +print_info: arch = gemma3 +print_info: vocab_only = 0 +print_info: n_ctx_train = 131072 +print_info: n_embd = 3840 +print_info: n_layer = 48 +print_info: n_head = 16 +print_info: n_head_kv = 8 +print_info: n_rot = 256 +print_info: n_swa = 1024 +print_info: is_swa_any = 1 +print_info: n_embd_head_k = 256 +print_info: n_embd_head_v = 256 +print_info: n_gqa = 2 +print_info: n_embd_k_gqa = 2048 +print_info: n_embd_v_gqa = 2048 +print_info: f_norm_eps = 0.0e+00 +print_info: f_norm_rms_eps = 1.0e-06 +print_info: f_clamp_kqv = 0.0e+00 +print_info: f_max_alibi_bias = 0.0e+00 +print_info: f_logit_scale = 0.0e+00 +print_info: f_attn_scale = 6.2e-02 +print_info: n_ff = 15360 +print_info: n_expert = 0 +print_info: n_expert_used = 0 +print_info: causal attn = 1 +print_info: pooling type = 0 +print_info: rope type = 2 +print_info: rope scaling = linear +print_info: freq_base_train = 1000000.0 +print_info: freq_scale_train = 0.125 +print_info: n_ctx_orig_yarn = 131072 +print_info: rope_finetuned = unknown +print_info: model type = 12B +print_info: model params = 11.77 B +print_info: general.name = Gemma-3-12B-It 
+print_info: vocab type = SPM +print_info: n_vocab = 262208 +print_info: n_merges = 0 +print_info: BOS token = 2 '' +print_info: EOS token = 106 '' +print_info: EOT token = 106 '' +print_info: UNK token = 3 '' +print_info: PAD token = 0 '' +print_info: LF token = 248 '<0x0A>' +print_info: EOG token = 106 '' +print_info: max token length = 48 +load_tensors: loading model tensors, this can take a while... (mmap = false) +load_tensors: offloading 48 repeating layers to GPU +load_tensors: offloading output layer to GPU +load_tensors: offloaded 49/49 layers to GPU +load_tensors: ROCm0 model buffer size = 13721.20 MiB +load_tensors: ROCm_Host model buffer size = 1920.47 MiB +............................................................................. +llama_context: constructing llama_context +llama_context: non-unified KV cache requires ggml_set_rows() - forcing unified KV cache +llama_context: n_seq_max = 1 +llama_context: n_ctx = 4096 +llama_context: n_ctx_per_seq = 4096 +llama_context: n_batch = 2048 +llama_context: n_ubatch = 512 +llama_context: causal_attn = 1 +llama_context: flash_attn = 1 +llama_context: kv_unified = true +llama_context: freq_base = 1000000.0 +llama_context: freq_scale = 0.125 +llama_context: n_ctx_per_seq (4096) < n_ctx_train (131072) -- the full capacity of the model will not be utilized +llama_context: ROCm_Host output buffer size = 1.00 MiB +llama_kv_cache_unified_iswa: creating non-SWA KV cache, size = 4096 cells +llama_kv_cache_unified: ROCm0 KV buffer size = 256.00 MiB +llama_kv_cache_unified: size = 256.00 MiB ( 4096 cells, 8 layers, 1/ 1 seqs), K (f16): 128.00 MiB, V (f16): 128.00 MiB +llama_kv_cache_unified: LLAMA_SET_ROWS=0, using old ggml_cpy() method for backwards compatibility +llama_kv_cache_unified_iswa: creating SWA KV cache, size = 1536 cells +llama_kv_cache_unified: ROCm0 KV buffer size = 480.00 MiB +llama_kv_cache_unified: size = 480.00 MiB ( 1536 cells, 40 layers, 1/ 1 seqs), K (f16): 240.00 MiB, V (f16): 240.00 MiB 
+llama_kv_cache_unified: LLAMA_SET_ROWS=0, using old ggml_cpy() method for backwards compatibility +llama_context: ROCm0 compute buffer size = 519.62 MiB +llama_context: ROCm_Host compute buffer size = 11.01 MiB +llama_context: graph nodes = 2025 +llama_context: graph splits = 1 +common_init_from_params: KV cache shifting is not supported for this context, disabling KV cache shifting +common_init_from_params: added logit bias = -inf +common_init_from_params: setting dry_penalty_last_n to ctx_size = 4096 +common_init_from_params: warming up the model with an empty run - please wait ... (--no-warmup to disable) +main: llama threadpool init, n_threads = 16 + +system_info: n_threads = 16 (n_threads_batch = 16) / 32 | ROCm : NO_VMM = 1 | PEER_MAX_BATCH_SIZE = 128 | CPU : SSE3 = 1 | SSSE3 = 1 | AVX = 1 | AVX2 = 1 | F16C = 1 | FMA = 1 | BMI2 = 1 | AVX512 = 1 | AVX512_VBMI = 1 | AVX512_VNNI = 1 | LLAMAFILE = 1 | OPENMP = 1 | REPACK = 1 | + +sampler seed: 854716185 +sampler params: + repeat_last_n = 64, repeat_penalty = 1.000, frequency_penalty = 0.000, presence_penalty = 0.000 + dry_multiplier = 0.000, dry_base = 1.750, dry_allowed_length = 2, dry_penalty_last_n = 4096 + top_k = 40, top_p = 0.950, min_p = 0.050, xtc_probability = 0.000, xtc_threshold = 0.100, typical_p = 1.000, top_n_sigma = -1.000, temp = 0.800 + mirostat = 0, mirostat_lr = 0.100, mirostat_ent = 5.000 +sampler chain: logits -> logit-bias -> penalties -> dry -> top-n-sigma -> top-k -> typical -> top-p -> min-p -> xtc -> temp-ext -> dist +generate: n_ctx = 4096, n_batch = 2048, n_predict = 1, n_keep = 1 + +HelloWhat + +llama_perf_sampler_print: sampling time = 0.14 ms / 3 runs ( 0.05 ms per token, 21428.57 tokens per second) +llama_perf_context_print: load time = 2695.72 ms +llama_perf_context_print: prompt eval time = 75.18 ms / 2 tokens ( 37.59 ms per token, 26.60 tokens per second) +llama_perf_context_print: eval time = 0.00 ms / 1 runs ( 0.00 ms per token, inf tokens per second) 
+llama_perf_context_print: total time = 82.57 ms / 3 tokens +llama_perf_context_print: graphs reused = 0 + Elapsed #3: 3.208919123s + Run #3 status: 0 + → Avg over 3 runs: 3.434s diff --git a/benchmark/loadtime_results/gemma-3-12b-it-UD-Q8_K_XL__rocm7_rc.log b/benchmark/loadtime_results/gemma-3-12b-it-UD-Q8_K_XL__rocm7_rc.log new file mode 100644 index 0000000..0c8b97e --- /dev/null +++ b/benchmark/loadtime_results/gemma-3-12b-it-UD-Q8_K_XL__rocm7_rc.log @@ -0,0 +1,165 @@ +ggml_cuda_init: GGML_CUDA_FORCE_MMQ: no +ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no +ggml_cuda_init: found 1 ROCm devices: + Device 0: AMD Radeon Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32 +build: 6066 (4cb208c9) with cc (GCC) 15.1.1 20250719 (Red Hat 15.1.1-5) for x86_64-redhat-linux +main: llama backend init +main: load the model and apply lora adapter, if any +llama_model_load_from_file_impl: using device ROCm0 (AMD Radeon Graphics) - 124523 MiB free +llama_model_loader: loaded meta data with 40 key-value pairs and 626 tensors from /home/kyuz0/models/gemma-3-12b-it-UD-Q8_K_XL/gemma-3-12b-it-UD-Q8_K_XL.gguf (version GGUF V3 (latest)) +llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output. 
+llama_model_loader: - kv 0: general.architecture str = gemma3 +llama_model_loader: - kv 1: general.type str = model +llama_model_loader: - kv 2: general.name str = Gemma-3-12B-It +llama_model_loader: - kv 3: general.finetune str = it +llama_model_loader: - kv 4: general.basename str = Gemma-3-12B-It +llama_model_loader: - kv 5: general.quantized_by str = Unsloth +llama_model_loader: - kv 6: general.size_label str = 12B +llama_model_loader: - kv 7: general.repo_url str = https://huggingface.co/unsloth +llama_model_loader: - kv 8: gemma3.context_length u32 = 131072 +llama_model_loader: - kv 9: gemma3.embedding_length u32 = 3840 +llama_model_loader: - kv 10: gemma3.block_count u32 = 48 +llama_model_loader: - kv 11: gemma3.feed_forward_length u32 = 15360 +llama_model_loader: - kv 12: gemma3.attention.head_count u32 = 16 +llama_model_loader: - kv 13: gemma3.attention.layer_norm_rms_epsilon f32 = 0.000001 +llama_model_loader: - kv 14: gemma3.attention.key_length u32 = 256 +llama_model_loader: - kv 15: gemma3.attention.value_length u32 = 256 +llama_model_loader: - kv 16: gemma3.rope.freq_base f32 = 1000000.000000 +llama_model_loader: - kv 17: gemma3.attention.sliding_window u32 = 1024 +llama_model_loader: - kv 18: gemma3.attention.head_count_kv u32 = 8 +llama_model_loader: - kv 19: gemma3.rope.scaling.type str = linear +llama_model_loader: - kv 20: gemma3.rope.scaling.factor f32 = 8.000000 +llama_model_loader: - kv 21: tokenizer.ggml.model str = llama +llama_model_loader: - kv 22: tokenizer.ggml.pre str = default +llama_model_loader: - kv 23: tokenizer.ggml.tokens arr[str,262208] = ["", "", "", "", ... +llama_model_loader: - kv 24: tokenizer.ggml.scores arr[f32,262208] = [-1000.000000, -1000.000000, -1000.00... +llama_model_loader: - kv 25: tokenizer.ggml.token_type arr[i32,262208] = [3, 3, 3, 3, 3, 4, 3, 3, 3, 3, 3, 3, ... 
+llama_model_loader: - kv 26: tokenizer.ggml.bos_token_id u32 = 2 +llama_model_loader: - kv 27: tokenizer.ggml.eos_token_id u32 = 106 +llama_model_loader: - kv 28: tokenizer.ggml.unknown_token_id u32 = 3 +llama_model_loader: - kv 29: tokenizer.ggml.padding_token_id u32 = 0 +llama_model_loader: - kv 30: tokenizer.ggml.add_bos_token bool = true +llama_model_loader: - kv 31: tokenizer.ggml.add_eos_token bool = false +llama_model_loader: - kv 32: tokenizer.chat_template str = {{ bos_token }}\n{%- if messages[0]['r... +llama_model_loader: - kv 33: tokenizer.ggml.add_space_prefix bool = false +llama_model_loader: - kv 34: general.quantization_version u32 = 2 +llama_model_loader: - kv 35: general.file_type u32 = 7 +llama_model_loader: - kv 36: quantize.imatrix.file str = gemma-3-12b-it-GGUF/imatrix_unsloth.dat +llama_model_loader: - kv 37: quantize.imatrix.dataset str = unsloth_calibration_gemma-3-12b-it.txt +llama_model_loader: - kv 38: quantize.imatrix.entries_count i32 = 336 +llama_model_loader: - kv 39: quantize.imatrix.chunks_count i32 = 663 +llama_model_loader: - type f32: 289 tensors +llama_model_loader: - type q8_0: 311 tensors +llama_model_loader: - type bf16: 26 tensors +print_info: file format = GGUF V3 (latest) +print_info: file type = Q8_0 +print_info: file size = 13.40 GiB (9.78 BPW) +load: special tokens cache size = 6415 +load: token to piece cache size = 1.9446 MB +print_info: arch = gemma3 +print_info: vocab_only = 0 +print_info: n_ctx_train = 131072 +print_info: n_embd = 3840 +print_info: n_layer = 48 +print_info: n_head = 16 +print_info: n_head_kv = 8 +print_info: n_rot = 256 +print_info: n_swa = 1024 +print_info: is_swa_any = 1 +print_info: n_embd_head_k = 256 +print_info: n_embd_head_v = 256 +print_info: n_gqa = 2 +print_info: n_embd_k_gqa = 2048 +print_info: n_embd_v_gqa = 2048 +print_info: f_norm_eps = 0.0e+00 +print_info: f_norm_rms_eps = 1.0e-06 +print_info: f_clamp_kqv = 0.0e+00 +print_info: f_max_alibi_bias = 0.0e+00 +print_info: f_logit_scale 
= 0.0e+00 +print_info: f_attn_scale = 6.2e-02 +print_info: n_ff = 15360 +print_info: n_expert = 0 +print_info: n_expert_used = 0 +print_info: causal attn = 1 +print_info: pooling type = 0 +print_info: rope type = 2 +print_info: rope scaling = linear +print_info: freq_base_train = 1000000.0 +print_info: freq_scale_train = 0.125 +print_info: n_ctx_orig_yarn = 131072 +print_info: rope_finetuned = unknown +print_info: model type = 12B +print_info: model params = 11.77 B +print_info: general.name = Gemma-3-12B-It +print_info: vocab type = SPM +print_info: n_vocab = 262208 +print_info: n_merges = 0 +print_info: BOS token = 2 '' +print_info: EOS token = 106 '' +print_info: EOT token = 106 '' +print_info: UNK token = 3 '' +print_info: PAD token = 0 '' +print_info: LF token = 248 '<0x0A>' +print_info: EOG token = 106 '' +print_info: max token length = 48 +load_tensors: loading model tensors, this can take a while... (mmap = false) +load_tensors: offloading 48 repeating layers to GPU +load_tensors: offloading output layer to GPU +load_tensors: offloaded 49/49 layers to GPU +load_tensors: ROCm0 model buffer size = 13721.20 MiB +load_tensors: ROCm_Host model buffer size = 1920.47 MiB +............................................................................. 
+llama_context: constructing llama_context +llama_context: non-unified KV cache requires ggml_set_rows() - forcing unified KV cache +llama_context: n_seq_max = 1 +llama_context: n_ctx = 4096 +llama_context: n_ctx_per_seq = 4096 +llama_context: n_batch = 2048 +llama_context: n_ubatch = 512 +llama_context: causal_attn = 1 +llama_context: flash_attn = 1 +llama_context: kv_unified = true +llama_context: freq_base = 1000000.0 +llama_context: freq_scale = 0.125 +llama_context: n_ctx_per_seq (4096) < n_ctx_train (131072) -- the full capacity of the model will not be utilized +llama_context: ROCm_Host output buffer size = 1.00 MiB +llama_kv_cache_unified_iswa: creating non-SWA KV cache, size = 4096 cells +llama_kv_cache_unified: ROCm0 KV buffer size = 256.00 MiB +llama_kv_cache_unified: size = 256.00 MiB ( 4096 cells, 8 layers, 1/ 1 seqs), K (f16): 128.00 MiB, V (f16): 128.00 MiB +llama_kv_cache_unified: LLAMA_SET_ROWS=0, using old ggml_cpy() method for backwards compatibility +llama_kv_cache_unified_iswa: creating SWA KV cache, size = 1536 cells +llama_kv_cache_unified: ROCm0 KV buffer size = 480.00 MiB +llama_kv_cache_unified: size = 480.00 MiB ( 1536 cells, 40 layers, 1/ 1 seqs), K (f16): 240.00 MiB, V (f16): 240.00 MiB +llama_kv_cache_unified: LLAMA_SET_ROWS=0, using old ggml_cpy() method for backwards compatibility +llama_context: ROCm0 compute buffer size = 519.62 MiB +llama_context: ROCm_Host compute buffer size = 11.01 MiB +llama_context: graph nodes = 2025 +llama_context: graph splits = 1 +common_init_from_params: KV cache shifting is not supported for this context, disabling KV cache shifting +common_init_from_params: added logit bias = -inf +common_init_from_params: setting dry_penalty_last_n to ctx_size = 4096 +common_init_from_params: warming up the model with an empty run - please wait ... 
(--no-warmup to disable) +main: llama threadpool init, n_threads = 16 + +system_info: n_threads = 16 (n_threads_batch = 16) / 32 | ROCm : NO_VMM = 1 | PEER_MAX_BATCH_SIZE = 128 | CPU : SSE3 = 1 | SSSE3 = 1 | AVX = 1 | AVX2 = 1 | F16C = 1 | FMA = 1 | BMI2 = 1 | AVX512 = 1 | AVX512_VBMI = 1 | AVX512_VNNI = 1 | LLAMAFILE = 1 | OPENMP = 1 | REPACK = 1 | + +sampler seed: 754281730 +sampler params: + repeat_last_n = 64, repeat_penalty = 1.000, frequency_penalty = 0.000, presence_penalty = 0.000 + dry_multiplier = 0.000, dry_base = 1.750, dry_allowed_length = 2, dry_penalty_last_n = 4096 + top_k = 40, top_p = 0.950, min_p = 0.050, xtc_probability = 0.000, xtc_threshold = 0.100, typical_p = 1.000, top_n_sigma = -1.000, temp = 0.800 + mirostat = 0, mirostat_lr = 0.100, mirostat_ent = 5.000 +sampler chain: logits -> logit-bias -> penalties -> dry -> top-n-sigma -> top-k -> typical -> top-p -> min-p -> xtc -> temp-ext -> dist +generate: n_ctx = 4096, n_batch = 2048, n_predict = 1, n_keep = 1 + +HelloThe + +llama_perf_sampler_print: sampling time = 0.09 ms / 3 runs ( 0.03 ms per token, 32608.70 tokens per second) +llama_perf_context_print: load time = 3090.57 ms +llama_perf_context_print: prompt eval time = 75.62 ms / 2 tokens ( 37.81 ms per token, 26.45 tokens per second) +llama_perf_context_print: eval time = 0.00 ms / 1 runs ( 0.00 ms per token, inf tokens per second) +llama_perf_context_print: total time = 81.49 ms / 3 tokens +llama_perf_context_print: graphs reused = 0 + Elapsed #3: 3.616272374s + Run #3 status: 0 + → Avg over 3 runs: 3.861s diff --git a/benchmark/loadtime_results/gemma-3-12b-it-UD-Q8_K_XL__vulkan_amdvlk.log b/benchmark/loadtime_results/gemma-3-12b-it-UD-Q8_K_XL__vulkan_amdvlk.log new file mode 100644 index 0000000..3e8f841 --- /dev/null +++ b/benchmark/loadtime_results/gemma-3-12b-it-UD-Q8_K_XL__vulkan_amdvlk.log @@ -0,0 +1,163 @@ +ggml_vulkan: Found 1 Vulkan devices: +ggml_vulkan: 0 = Radeon 8060S Graphics (AMD open-source driver) | uma: 1 | fp16: 1 | 
bf16: 0 | warp size: 64 | shared memory: 32768 | int dot: 1 | matrix cores: KHR_coopmat +build: 6060 (9c35706b) with cc (GCC) 15.1.1 20250719 (Red Hat 15.1.1-5) for x86_64-redhat-linux +main: llama backend init +main: load the model and apply lora adapter, if any +llama_model_load_from_file_impl: using device Vulkan0 (Radeon 8060S Graphics) - 85720 MiB free +llama_model_loader: loaded meta data with 40 key-value pairs and 626 tensors from /home/kyuz0/models/gemma-3-12b-it-UD-Q8_K_XL/gemma-3-12b-it-UD-Q8_K_XL.gguf (version GGUF V3 (latest)) +llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output. +llama_model_loader: - kv 0: general.architecture str = gemma3 +llama_model_loader: - kv 1: general.type str = model +llama_model_loader: - kv 2: general.name str = Gemma-3-12B-It +llama_model_loader: - kv 3: general.finetune str = it +llama_model_loader: - kv 4: general.basename str = Gemma-3-12B-It +llama_model_loader: - kv 5: general.quantized_by str = Unsloth +llama_model_loader: - kv 6: general.size_label str = 12B +llama_model_loader: - kv 7: general.repo_url str = https://huggingface.co/unsloth +llama_model_loader: - kv 8: gemma3.context_length u32 = 131072 +llama_model_loader: - kv 9: gemma3.embedding_length u32 = 3840 +llama_model_loader: - kv 10: gemma3.block_count u32 = 48 +llama_model_loader: - kv 11: gemma3.feed_forward_length u32 = 15360 +llama_model_loader: - kv 12: gemma3.attention.head_count u32 = 16 +llama_model_loader: - kv 13: gemma3.attention.layer_norm_rms_epsilon f32 = 0.000001 +llama_model_loader: - kv 14: gemma3.attention.key_length u32 = 256 +llama_model_loader: - kv 15: gemma3.attention.value_length u32 = 256 +llama_model_loader: - kv 16: gemma3.rope.freq_base f32 = 1000000.000000 +llama_model_loader: - kv 17: gemma3.attention.sliding_window u32 = 1024 +llama_model_loader: - kv 18: gemma3.attention.head_count_kv u32 = 8 +llama_model_loader: - kv 19: gemma3.rope.scaling.type str = linear 
+llama_model_loader: - kv 20: gemma3.rope.scaling.factor f32 = 8.000000 +llama_model_loader: - kv 21: tokenizer.ggml.model str = llama +llama_model_loader: - kv 22: tokenizer.ggml.pre str = default +llama_model_loader: - kv 23: tokenizer.ggml.tokens arr[str,262208] = ["", "", "", "", ... +llama_model_loader: - kv 24: tokenizer.ggml.scores arr[f32,262208] = [-1000.000000, -1000.000000, -1000.00... +llama_model_loader: - kv 25: tokenizer.ggml.token_type arr[i32,262208] = [3, 3, 3, 3, 3, 4, 3, 3, 3, 3, 3, 3, ... +llama_model_loader: - kv 26: tokenizer.ggml.bos_token_id u32 = 2 +llama_model_loader: - kv 27: tokenizer.ggml.eos_token_id u32 = 106 +llama_model_loader: - kv 28: tokenizer.ggml.unknown_token_id u32 = 3 +llama_model_loader: - kv 29: tokenizer.ggml.padding_token_id u32 = 0 +llama_model_loader: - kv 30: tokenizer.ggml.add_bos_token bool = true +llama_model_loader: - kv 31: tokenizer.ggml.add_eos_token bool = false +llama_model_loader: - kv 32: tokenizer.chat_template str = {{ bos_token }}\n{%- if messages[0]['r... 
+llama_model_loader: - kv 33: tokenizer.ggml.add_space_prefix bool = false +llama_model_loader: - kv 34: general.quantization_version u32 = 2 +llama_model_loader: - kv 35: general.file_type u32 = 7 +llama_model_loader: - kv 36: quantize.imatrix.file str = gemma-3-12b-it-GGUF/imatrix_unsloth.dat +llama_model_loader: - kv 37: quantize.imatrix.dataset str = unsloth_calibration_gemma-3-12b-it.txt +llama_model_loader: - kv 38: quantize.imatrix.entries_count i32 = 336 +llama_model_loader: - kv 39: quantize.imatrix.chunks_count i32 = 663 +llama_model_loader: - type f32: 289 tensors +llama_model_loader: - type q8_0: 311 tensors +llama_model_loader: - type bf16: 26 tensors +print_info: file format = GGUF V3 (latest) +print_info: file type = Q8_0 +print_info: file size = 13.40 GiB (9.78 BPW) +load: special tokens cache size = 6415 +load: token to piece cache size = 1.9446 MB +print_info: arch = gemma3 +print_info: vocab_only = 0 +print_info: n_ctx_train = 131072 +print_info: n_embd = 3840 +print_info: n_layer = 48 +print_info: n_head = 16 +print_info: n_head_kv = 8 +print_info: n_rot = 256 +print_info: n_swa = 1024 +print_info: is_swa_any = 1 +print_info: n_embd_head_k = 256 +print_info: n_embd_head_v = 256 +print_info: n_gqa = 2 +print_info: n_embd_k_gqa = 2048 +print_info: n_embd_v_gqa = 2048 +print_info: f_norm_eps = 0.0e+00 +print_info: f_norm_rms_eps = 1.0e-06 +print_info: f_clamp_kqv = 0.0e+00 +print_info: f_max_alibi_bias = 0.0e+00 +print_info: f_logit_scale = 0.0e+00 +print_info: f_attn_scale = 6.2e-02 +print_info: n_ff = 15360 +print_info: n_expert = 0 +print_info: n_expert_used = 0 +print_info: causal attn = 1 +print_info: pooling type = 0 +print_info: rope type = 2 +print_info: rope scaling = linear +print_info: freq_base_train = 1000000.0 +print_info: freq_scale_train = 0.125 +print_info: n_ctx_orig_yarn = 131072 +print_info: rope_finetuned = unknown +print_info: model type = 12B +print_info: model params = 11.77 B +print_info: general.name = Gemma-3-12B-It 
+print_info: vocab type = SPM +print_info: n_vocab = 262208 +print_info: n_merges = 0 +print_info: BOS token = 2 '' +print_info: EOS token = 106 '' +print_info: EOT token = 106 '' +print_info: UNK token = 3 '' +print_info: PAD token = 0 '' +print_info: LF token = 248 '<0x0A>' +print_info: EOG token = 106 '' +print_info: max token length = 48 +load_tensors: loading model tensors, this can take a while... (mmap = false) +load_tensors: offloading 48 repeating layers to GPU +load_tensors: offloading output layer to GPU +load_tensors: offloaded 49/49 layers to GPU +load_tensors: Vulkan0 model buffer size = 13721.12 MiB +load_tensors: Vulkan_Host model buffer size = 1920.47 MiB +............................................................................. +llama_context: constructing llama_context +llama_context: non-unified KV cache requires ggml_set_rows() - forcing unified KV cache +llama_context: n_seq_max = 1 +llama_context: n_ctx = 4096 +llama_context: n_ctx_per_seq = 4096 +llama_context: n_batch = 2048 +llama_context: n_ubatch = 512 +llama_context: causal_attn = 1 +llama_context: flash_attn = 1 +llama_context: kv_unified = true +llama_context: freq_base = 1000000.0 +llama_context: freq_scale = 0.125 +llama_context: n_ctx_per_seq (4096) < n_ctx_train (131072) -- the full capacity of the model will not be utilized +llama_context: Vulkan_Host output buffer size = 1.00 MiB +llama_kv_cache_unified_iswa: creating non-SWA KV cache, size = 4096 cells +llama_kv_cache_unified: Vulkan0 KV buffer size = 256.00 MiB +llama_kv_cache_unified: size = 256.00 MiB ( 4096 cells, 8 layers, 1/ 1 seqs), K (f16): 128.00 MiB, V (f16): 128.00 MiB +llama_kv_cache_unified: LLAMA_SET_ROWS=0, using old ggml_cpy() method for backwards compatibility +llama_kv_cache_unified_iswa: creating SWA KV cache, size = 1536 cells +llama_kv_cache_unified: Vulkan0 KV buffer size = 480.00 MiB +llama_kv_cache_unified: size = 480.00 MiB ( 1536 cells, 40 layers, 1/ 1 seqs), K (f16): 240.00 MiB, V (f16): 240.00 
MiB +llama_kv_cache_unified: LLAMA_SET_ROWS=0, using old ggml_cpy() method for backwards compatibility +llama_context: Vulkan0 compute buffer size = 519.62 MiB +llama_context: Vulkan_Host compute buffer size = 18.51 MiB +llama_context: graph nodes = 2025 +llama_context: graph splits = 2 +common_init_from_params: KV cache shifting is not supported for this context, disabling KV cache shifting +common_init_from_params: added logit bias = -inf +common_init_from_params: setting dry_penalty_last_n to ctx_size = 4096 +common_init_from_params: warming up the model with an empty run - please wait ... (--no-warmup to disable) +main: llama threadpool init, n_threads = 16 + +system_info: n_threads = 16 (n_threads_batch = 16) / 32 | CPU : SSE3 = 1 | SSSE3 = 1 | AVX = 1 | AVX2 = 1 | F16C = 1 | FMA = 1 | BMI2 = 1 | AVX512 = 1 | AVX512_VBMI = 1 | AVX512_VNNI = 1 | LLAMAFILE = 1 | OPENMP = 1 | REPACK = 1 | + +sampler seed: 356896032 +sampler params: + repeat_last_n = 64, repeat_penalty = 1.000, frequency_penalty = 0.000, presence_penalty = 0.000 + dry_multiplier = 0.000, dry_base = 1.750, dry_allowed_length = 2, dry_penalty_last_n = 4096 + top_k = 40, top_p = 0.950, min_p = 0.050, xtc_probability = 0.000, xtc_threshold = 0.100, typical_p = 1.000, top_n_sigma = -1.000, temp = 0.800 + mirostat = 0, mirostat_lr = 0.100, mirostat_ent = 5.000 +sampler chain: logits -> logit-bias -> penalties -> dry -> top-n-sigma -> top-k -> typical -> top-p -> min-p -> xtc -> temp-ext -> dist +generate: n_ctx = 4096, n_batch = 2048, n_predict = 1, n_keep = 1 + +Hello + +llama_perf_sampler_print: sampling time = 0.12 ms / 3 runs ( 0.04 ms per token, 24390.24 tokens per second) +llama_perf_context_print: load time = 3459.76 ms +llama_perf_context_print: prompt eval time = 90.54 ms / 2 tokens ( 45.27 ms per token, 22.09 tokens per second) +llama_perf_context_print: eval time = 0.00 ms / 1 runs ( 0.00 ms per token, inf tokens per second) +llama_perf_context_print: total time = 98.48 ms / 3 tokens 
+llama_perf_context_print: graphs reused = 0 + Elapsed #3: 3.933674345s + Run #3 status: 0 + → Avg over 3 runs: 3.955s diff --git a/benchmark/loadtime_results/gemma-3-12b-it-UD-Q8_K_XL__vulkan_radv.log b/benchmark/loadtime_results/gemma-3-12b-it-UD-Q8_K_XL__vulkan_radv.log new file mode 100644 index 0000000..dcf49f7 --- /dev/null +++ b/benchmark/loadtime_results/gemma-3-12b-it-UD-Q8_K_XL__vulkan_radv.log @@ -0,0 +1,163 @@ +ggml_vulkan: Found 1 Vulkan devices: +ggml_vulkan: 0 = Radeon 8060S Graphics (RADV GFX1151) (radv) | uma: 1 | fp16: 1 | bf16: 0 | warp size: 64 | shared memory: 65536 | int dot: 1 | matrix cores: KHR_coopmat +build: 6040 (66625a59) with cc (GCC) 15.1.1 20250719 (Red Hat 15.1.1-5) for x86_64-redhat-linux +main: llama backend init +main: load the model and apply lora adapter, if any +llama_model_load_from_file_impl: using device Vulkan0 (Radeon 8060S Graphics (RADV GFX1151)) - 87722 MiB free +llama_model_loader: loaded meta data with 40 key-value pairs and 626 tensors from /home/kyuz0/models/gemma-3-12b-it-UD-Q8_K_XL/gemma-3-12b-it-UD-Q8_K_XL.gguf (version GGUF V3 (latest)) +llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output. 
+llama_model_loader: - kv 0: general.architecture str = gemma3 +llama_model_loader: - kv 1: general.type str = model +llama_model_loader: - kv 2: general.name str = Gemma-3-12B-It +llama_model_loader: - kv 3: general.finetune str = it +llama_model_loader: - kv 4: general.basename str = Gemma-3-12B-It +llama_model_loader: - kv 5: general.quantized_by str = Unsloth +llama_model_loader: - kv 6: general.size_label str = 12B +llama_model_loader: - kv 7: general.repo_url str = https://huggingface.co/unsloth +llama_model_loader: - kv 8: gemma3.context_length u32 = 131072 +llama_model_loader: - kv 9: gemma3.embedding_length u32 = 3840 +llama_model_loader: - kv 10: gemma3.block_count u32 = 48 +llama_model_loader: - kv 11: gemma3.feed_forward_length u32 = 15360 +llama_model_loader: - kv 12: gemma3.attention.head_count u32 = 16 +llama_model_loader: - kv 13: gemma3.attention.layer_norm_rms_epsilon f32 = 0.000001 +llama_model_loader: - kv 14: gemma3.attention.key_length u32 = 256 +llama_model_loader: - kv 15: gemma3.attention.value_length u32 = 256 +llama_model_loader: - kv 16: gemma3.rope.freq_base f32 = 1000000.000000 +llama_model_loader: - kv 17: gemma3.attention.sliding_window u32 = 1024 +llama_model_loader: - kv 18: gemma3.attention.head_count_kv u32 = 8 +llama_model_loader: - kv 19: gemma3.rope.scaling.type str = linear +llama_model_loader: - kv 20: gemma3.rope.scaling.factor f32 = 8.000000 +llama_model_loader: - kv 21: tokenizer.ggml.model str = llama +llama_model_loader: - kv 22: tokenizer.ggml.pre str = default +llama_model_loader: - kv 23: tokenizer.ggml.tokens arr[str,262208] = ["", "", "", "", ... +llama_model_loader: - kv 24: tokenizer.ggml.scores arr[f32,262208] = [-1000.000000, -1000.000000, -1000.00... +llama_model_loader: - kv 25: tokenizer.ggml.token_type arr[i32,262208] = [3, 3, 3, 3, 3, 4, 3, 3, 3, 3, 3, 3, ... 
+llama_model_loader: - kv 26: tokenizer.ggml.bos_token_id u32 = 2 +llama_model_loader: - kv 27: tokenizer.ggml.eos_token_id u32 = 106 +llama_model_loader: - kv 28: tokenizer.ggml.unknown_token_id u32 = 3 +llama_model_loader: - kv 29: tokenizer.ggml.padding_token_id u32 = 0 +llama_model_loader: - kv 30: tokenizer.ggml.add_bos_token bool = true +llama_model_loader: - kv 31: tokenizer.ggml.add_eos_token bool = false +llama_model_loader: - kv 32: tokenizer.chat_template str = {{ bos_token }}\n{%- if messages[0]['r... +llama_model_loader: - kv 33: tokenizer.ggml.add_space_prefix bool = false +llama_model_loader: - kv 34: general.quantization_version u32 = 2 +llama_model_loader: - kv 35: general.file_type u32 = 7 +llama_model_loader: - kv 36: quantize.imatrix.file str = gemma-3-12b-it-GGUF/imatrix_unsloth.dat +llama_model_loader: - kv 37: quantize.imatrix.dataset str = unsloth_calibration_gemma-3-12b-it.txt +llama_model_loader: - kv 38: quantize.imatrix.entries_count i32 = 336 +llama_model_loader: - kv 39: quantize.imatrix.chunks_count i32 = 663 +llama_model_loader: - type f32: 289 tensors +llama_model_loader: - type q8_0: 311 tensors +llama_model_loader: - type bf16: 26 tensors +print_info: file format = GGUF V3 (latest) +print_info: file type = Q8_0 +print_info: file size = 13.40 GiB (9.78 BPW) +load: special tokens cache size = 6415 +load: token to piece cache size = 1.9446 MB +print_info: arch = gemma3 +print_info: vocab_only = 0 +print_info: n_ctx_train = 131072 +print_info: n_embd = 3840 +print_info: n_layer = 48 +print_info: n_head = 16 +print_info: n_head_kv = 8 +print_info: n_rot = 256 +print_info: n_swa = 1024 +print_info: is_swa_any = 1 +print_info: n_embd_head_k = 256 +print_info: n_embd_head_v = 256 +print_info: n_gqa = 2 +print_info: n_embd_k_gqa = 2048 +print_info: n_embd_v_gqa = 2048 +print_info: f_norm_eps = 0.0e+00 +print_info: f_norm_rms_eps = 1.0e-06 +print_info: f_clamp_kqv = 0.0e+00 +print_info: f_max_alibi_bias = 0.0e+00 +print_info: f_logit_scale 
= 0.0e+00 +print_info: f_attn_scale = 6.2e-02 +print_info: n_ff = 15360 +print_info: n_expert = 0 +print_info: n_expert_used = 0 +print_info: causal attn = 1 +print_info: pooling type = 0 +print_info: rope type = 2 +print_info: rope scaling = linear +print_info: freq_base_train = 1000000.0 +print_info: freq_scale_train = 0.125 +print_info: n_ctx_orig_yarn = 131072 +print_info: rope_finetuned = unknown +print_info: model type = 12B +print_info: model params = 11.77 B +print_info: general.name = Gemma-3-12B-It +print_info: vocab type = SPM +print_info: n_vocab = 262208 +print_info: n_merges = 0 +print_info: BOS token = 2 '' +print_info: EOS token = 106 '' +print_info: EOT token = 106 '' +print_info: UNK token = 3 '' +print_info: PAD token = 0 '' +print_info: LF token = 248 '<0x0A>' +print_info: EOG token = 106 '' +print_info: max token length = 48 +load_tensors: loading model tensors, this can take a while... (mmap = false) +load_tensors: offloading 48 repeating layers to GPU +load_tensors: offloading output layer to GPU +load_tensors: offloaded 49/49 layers to GPU +load_tensors: Vulkan0 model buffer size = 13721.12 MiB +load_tensors: Vulkan_Host model buffer size = 1920.47 MiB +............................................................................. 
+llama_context: constructing llama_context +llama_context: non-unified KV cache requires ggml_set_rows() - forcing unified KV cache +llama_context: n_seq_max = 1 +llama_context: n_ctx = 4096 +llama_context: n_ctx_per_seq = 4096 +llama_context: n_batch = 2048 +llama_context: n_ubatch = 512 +llama_context: causal_attn = 1 +llama_context: flash_attn = 1 +llama_context: kv_unified = true +llama_context: freq_base = 1000000.0 +llama_context: freq_scale = 0.125 +llama_context: n_ctx_per_seq (4096) < n_ctx_train (131072) -- the full capacity of the model will not be utilized +llama_context: Vulkan_Host output buffer size = 1.00 MiB +llama_kv_cache_unified_iswa: creating non-SWA KV cache, size = 4096 cells +llama_kv_cache_unified: Vulkan0 KV buffer size = 256.00 MiB +llama_kv_cache_unified: size = 256.00 MiB ( 4096 cells, 8 layers, 1/ 1 seqs), K (f16): 128.00 MiB, V (f16): 128.00 MiB +llama_kv_cache_unified: LLAMA_SET_ROWS=0, using old ggml_cpy() method for backwards compatibility +llama_kv_cache_unified_iswa: creating SWA KV cache, size = 1536 cells +llama_kv_cache_unified: Vulkan0 KV buffer size = 480.00 MiB +llama_kv_cache_unified: size = 480.00 MiB ( 1536 cells, 40 layers, 1/ 1 seqs), K (f16): 240.00 MiB, V (f16): 240.00 MiB +llama_kv_cache_unified: LLAMA_SET_ROWS=0, using old ggml_cpy() method for backwards compatibility +llama_context: Vulkan0 compute buffer size = 519.62 MiB +llama_context: Vulkan_Host compute buffer size = 18.51 MiB +llama_context: graph nodes = 2025 +llama_context: graph splits = 2 +common_init_from_params: KV cache shifting is not supported for this context, disabling KV cache shifting +common_init_from_params: added logit bias = -inf +common_init_from_params: setting dry_penalty_last_n to ctx_size = 4096 +common_init_from_params: warming up the model with an empty run - please wait ... 
(--no-warmup to disable) +main: llama threadpool init, n_threads = 16 + +system_info: n_threads = 16 (n_threads_batch = 16) / 32 | CPU : SSE3 = 1 | SSSE3 = 1 | AVX = 1 | AVX2 = 1 | F16C = 1 | FMA = 1 | BMI2 = 1 | AVX512 = 1 | AVX512_VBMI = 1 | AVX512_VNNI = 1 | LLAMAFILE = 1 | OPENMP = 1 | REPACK = 1 | + +sampler seed: 3541901199 +sampler params: + repeat_last_n = 64, repeat_penalty = 1.000, frequency_penalty = 0.000, presence_penalty = 0.000 + dry_multiplier = 0.000, dry_base = 1.750, dry_allowed_length = 2, dry_penalty_last_n = 4096 + top_k = 40, top_p = 0.950, min_p = 0.050, xtc_probability = 0.000, xtc_threshold = 0.100, typical_p = 1.000, top_n_sigma = -1.000, temp = 0.800 + mirostat = 0, mirostat_lr = 0.100, mirostat_ent = 5.000 +sampler chain: logits -> logit-bias -> penalties -> dry -> top-n-sigma -> top-k -> typical -> top-p -> min-p -> xtc -> temp-ext -> dist +generate: n_ctx = 4096, n_batch = 2048, n_predict = 1, n_keep = 1 + +HelloI + +llama_perf_sampler_print: sampling time = 0.12 ms / 3 runs ( 0.04 ms per token, 24590.16 tokens per second) +llama_perf_context_print: load time = 3946.08 ms +llama_perf_context_print: prompt eval time = 78.51 ms / 2 tokens ( 39.26 ms per token, 25.47 tokens per second) +llama_perf_context_print: eval time = 0.00 ms / 1 runs ( 0.00 ms per token, inf tokens per second) +llama_perf_context_print: total time = 86.43 ms / 3 tokens +llama_perf_context_print: graphs reused = 0 + Elapsed #3: 4.313578800s + Run #3 status: 0 + → Avg over 3 runs: 4.295s diff --git a/benchmark/loadtime_results/gemma-3-27b-it-BF16-00001-of-00002__rocm6_4_2.log b/benchmark/loadtime_results/gemma-3-27b-it-BF16-00001-of-00002__rocm6_4_2.log new file mode 100644 index 0000000..4834b42 --- /dev/null +++ b/benchmark/loadtime_results/gemma-3-27b-it-BF16-00001-of-00002__rocm6_4_2.log @@ -0,0 +1,164 @@ +ggml_cuda_init: GGML_CUDA_FORCE_MMQ: no +ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no +ggml_cuda_init: found 1 ROCm devices: + Device 0: Radeon 8060S Graphics, 
gfx1151 (0x1151), VMM: no, Wave Size: 32 +build: 6040 (66625a59) with cc (GCC) 15.1.1 20250521 (Red Hat 15.1.1-2) for x86_64-redhat-linux +main: llama backend init +main: load the model and apply lora adapter, if any +llama_model_load_from_file_impl: using device ROCm0 (Radeon 8060S Graphics) - 124522 MiB free +llama_model_loader: additional 1 GGUFs metadata loaded. +llama_model_loader: loaded meta data with 39 key-value pairs and 808 tensors from /home/kyuz0/models/gemma-3-27b-it-BF16/gemma-3-27b-it-BF16-00001-of-00002.gguf (version GGUF V3 (latest)) +llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output. +llama_model_loader: - kv 0: general.architecture str = gemma3 +llama_model_loader: - kv 1: general.type str = model +llama_model_loader: - kv 2: general.name str = Gemma-3-27B-It +llama_model_loader: - kv 3: general.finetune str = it +llama_model_loader: - kv 4: general.basename str = Gemma-3-27B-It +llama_model_loader: - kv 5: general.quantized_by str = Unsloth +llama_model_loader: - kv 6: general.size_label str = 27B +llama_model_loader: - kv 7: general.repo_url str = https://huggingface.co/unsloth +llama_model_loader: - kv 8: gemma3.context_length u32 = 131072 +llama_model_loader: - kv 9: gemma3.embedding_length u32 = 5376 +llama_model_loader: - kv 10: gemma3.block_count u32 = 62 +llama_model_loader: - kv 11: gemma3.feed_forward_length u32 = 21504 +llama_model_loader: - kv 12: gemma3.attention.head_count u32 = 32 +llama_model_loader: - kv 13: gemma3.attention.layer_norm_rms_epsilon f32 = 0.000001 +llama_model_loader: - kv 14: gemma3.attention.key_length u32 = 128 +llama_model_loader: - kv 15: gemma3.attention.value_length u32 = 128 +llama_model_loader: - kv 16: general.file_type u32 = 32 +llama_model_loader: - kv 17: gemma3.rope.freq_base f32 = 1000000.000000 +llama_model_loader: - kv 18: gemma3.attention.sliding_window u32 = 1024 +llama_model_loader: - kv 19: gemma3.attention.head_count_kv u32 = 16 
+llama_model_loader: - kv 20: gemma3.rope.scaling.type str = linear +llama_model_loader: - kv 21: gemma3.rope.scaling.factor f32 = 8.000000 +llama_model_loader: - kv 22: general.quantization_version u32 = 2 +llama_model_loader: - kv 23: tokenizer.ggml.model str = llama +llama_model_loader: - kv 24: tokenizer.ggml.pre str = default +llama_model_loader: - kv 25: tokenizer.ggml.tokens arr[str,262208] = ["", "", "", "", ... +llama_model_loader: - kv 26: tokenizer.ggml.scores arr[f32,262208] = [-1000.000000, -1000.000000, -1000.00... +llama_model_loader: - kv 27: tokenizer.ggml.token_type arr[i32,262208] = [3, 3, 3, 3, 3, 4, 3, 3, 3, 3, 3, 3, ... +llama_model_loader: - kv 28: tokenizer.ggml.bos_token_id u32 = 2 +llama_model_loader: - kv 29: tokenizer.ggml.eos_token_id u32 = 106 +llama_model_loader: - kv 30: tokenizer.ggml.unknown_token_id u32 = 3 +llama_model_loader: - kv 31: tokenizer.ggml.padding_token_id u32 = 0 +llama_model_loader: - kv 32: tokenizer.ggml.add_bos_token bool = true +llama_model_loader: - kv 33: tokenizer.ggml.add_eos_token bool = false +llama_model_loader: - kv 34: tokenizer.chat_template str = {{ bos_token }}\n{%- if messages[0]['r... 
+llama_model_loader: - kv 35: tokenizer.ggml.add_space_prefix bool = false +llama_model_loader: - kv 36: split.no u16 = 0 +llama_model_loader: - kv 37: split.count u16 = 2 +llama_model_loader: - kv 38: split.tensors.count i32 = 808 +llama_model_loader: - type f32: 373 tensors +llama_model_loader: - type bf16: 435 tensors +print_info: file format = GGUF V3 (latest) +print_info: file type = BF16 +print_info: file size = 50.31 GiB (16.00 BPW) +load: special tokens cache size = 6415 +load: token to piece cache size = 1.9446 MB +print_info: arch = gemma3 +print_info: vocab_only = 0 +print_info: n_ctx_train = 131072 +print_info: n_embd = 5376 +print_info: n_layer = 62 +print_info: n_head = 32 +print_info: n_head_kv = 16 +print_info: n_rot = 128 +print_info: n_swa = 1024 +print_info: is_swa_any = 1 +print_info: n_embd_head_k = 128 +print_info: n_embd_head_v = 128 +print_info: n_gqa = 2 +print_info: n_embd_k_gqa = 2048 +print_info: n_embd_v_gqa = 2048 +print_info: f_norm_eps = 0.0e+00 +print_info: f_norm_rms_eps = 1.0e-06 +print_info: f_clamp_kqv = 0.0e+00 +print_info: f_max_alibi_bias = 0.0e+00 +print_info: f_logit_scale = 0.0e+00 +print_info: f_attn_scale = 7.7e-02 +print_info: n_ff = 21504 +print_info: n_expert = 0 +print_info: n_expert_used = 0 +print_info: causal attn = 1 +print_info: pooling type = 0 +print_info: rope type = 2 +print_info: rope scaling = linear +print_info: freq_base_train = 1000000.0 +print_info: freq_scale_train = 0.125 +print_info: n_ctx_orig_yarn = 131072 +print_info: rope_finetuned = unknown +print_info: model type = 27B +print_info: model params = 27.01 B +print_info: general.name = Gemma-3-27B-It +print_info: vocab type = SPM +print_info: n_vocab = 262208 +print_info: n_merges = 0 +print_info: BOS token = 2 '' +print_info: EOS token = 106 '' +print_info: EOT token = 106 '' +print_info: UNK token = 3 '' +print_info: PAD token = 0 '' +print_info: LF token = 248 '<0x0A>' +print_info: EOG token = 106 '' +print_info: max token length = 48 
+load_tensors: loading model tensors, this can take a while... (mmap = false) +load_tensors: offloading 62 repeating layers to GPU +load_tensors: offloading output layer to GPU +load_tensors: offloaded 63/63 layers to GPU +load_tensors: ROCm0 model buffer size = 51518.82 MiB +load_tensors: ROCm_Host model buffer size = 2688.66 MiB +............................................................................................. +llama_context: constructing llama_context +llama_context: non-unified KV cache requires ggml_set_rows() - forcing unified KV cache +llama_context: n_seq_max = 1 +llama_context: n_ctx = 4096 +llama_context: n_ctx_per_seq = 4096 +llama_context: n_batch = 2048 +llama_context: n_ubatch = 512 +llama_context: causal_attn = 1 +llama_context: flash_attn = 1 +llama_context: kv_unified = true +llama_context: freq_base = 1000000.0 +llama_context: freq_scale = 0.125 +llama_context: n_ctx_per_seq (4096) < n_ctx_train (131072) -- the full capacity of the model will not be utilized +llama_context: ROCm_Host output buffer size = 1.00 MiB +llama_kv_cache_unified_iswa: creating non-SWA KV cache, size = 4096 cells +llama_kv_cache_unified: ROCm0 KV buffer size = 320.00 MiB +llama_kv_cache_unified: size = 320.00 MiB ( 4096 cells, 10 layers, 1/ 1 seqs), K (f16): 160.00 MiB, V (f16): 160.00 MiB +llama_kv_cache_unified: LLAMA_SET_ROWS=0, using old ggml_cpy() method for backwards compatibility +llama_kv_cache_unified_iswa: creating SWA KV cache, size = 1536 cells +llama_kv_cache_unified: ROCm0 KV buffer size = 624.00 MiB +llama_kv_cache_unified: size = 624.00 MiB ( 1536 cells, 52 layers, 1/ 1 seqs), K (f16): 312.00 MiB, V (f16): 312.00 MiB +llama_kv_cache_unified: LLAMA_SET_ROWS=0, using old ggml_cpy() method for backwards compatibility +llama_context: ROCm0 compute buffer size = 522.62 MiB +llama_context: ROCm_Host compute buffer size = 11.01 MiB +llama_context: graph nodes = 2613 +llama_context: graph splits = 1 +common_init_from_params: KV cache shifting is not 
supported for this context, disabling KV cache shifting +common_init_from_params: added logit bias = -inf +common_init_from_params: setting dry_penalty_last_n to ctx_size = 4096 +common_init_from_params: warming up the model with an empty run - please wait ... (--no-warmup to disable) +main: llama threadpool init, n_threads = 16 + +system_info: n_threads = 16 (n_threads_batch = 16) / 32 | ROCm : NO_VMM = 1 | PEER_MAX_BATCH_SIZE = 128 | CPU : SSE3 = 1 | SSSE3 = 1 | AVX = 1 | AVX2 = 1 | F16C = 1 | FMA = 1 | BMI2 = 1 | AVX512 = 1 | AVX512_VBMI = 1 | AVX512_VNNI = 1 | LLAMAFILE = 1 | OPENMP = 1 | REPACK = 1 | + +sampler seed: 204092650 +sampler params: + repeat_last_n = 64, repeat_penalty = 1.000, frequency_penalty = 0.000, presence_penalty = 0.000 + dry_multiplier = 0.000, dry_base = 1.750, dry_allowed_length = 2, dry_penalty_last_n = 4096 + top_k = 40, top_p = 0.950, min_p = 0.050, xtc_probability = 0.000, xtc_threshold = 0.100, typical_p = 1.000, top_n_sigma = -1.000, temp = 0.800 + mirostat = 0, mirostat_lr = 0.100, mirostat_ent = 5.000 +sampler chain: logits -> logit-bias -> penalties -> dry -> top-n-sigma -> top-k -> typical -> top-p -> min-p -> xtc -> temp-ext -> dist +generate: n_ctx = 4096, n_batch = 2048, n_predict = 1, n_keep = 1 + +Hello + +llama_perf_sampler_print: sampling time = 0.08 ms / 3 runs ( 0.03 ms per token, 39473.68 tokens per second) +llama_perf_context_print: load time = 7815.59 ms +llama_perf_context_print: prompt eval time = 253.33 ms / 2 tokens ( 126.66 ms per token, 7.89 tokens per second) +llama_perf_context_print: eval time = 0.00 ms / 1 runs ( 0.00 ms per token, inf tokens per second) +llama_perf_context_print: total time = 258.00 ms / 3 tokens +llama_perf_context_print: graphs reused = 0 + Elapsed #3: 11.830337249s + Run #3 status: 0 + → Avg over 3 runs: 12.495s diff --git a/benchmark/loadtime_results/gemma-3-27b-it-BF16-00001-of-00002__rocm7_beta.log b/benchmark/loadtime_results/gemma-3-27b-it-BF16-00001-of-00002__rocm7_beta.log new 
file mode 100644 index 0000000..55f27ac --- /dev/null +++ b/benchmark/loadtime_results/gemma-3-27b-it-BF16-00001-of-00002__rocm7_beta.log @@ -0,0 +1,164 @@ +ggml_cuda_init: GGML_CUDA_FORCE_MMQ: no +ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no +ggml_cuda_init: found 1 ROCm devices: + Device 0: AMD Radeon Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32 +build: 6040 (66625a59) with cc (GCC) 15.1.1 20250719 (Red Hat 15.1.1-5) for x86_64-redhat-linux +main: llama backend init +main: load the model and apply lora adapter, if any +llama_model_load_from_file_impl: using device ROCm0 (AMD Radeon Graphics) - 124523 MiB free +llama_model_loader: additional 1 GGUFs metadata loaded. +llama_model_loader: loaded meta data with 39 key-value pairs and 808 tensors from /home/kyuz0/models/gemma-3-27b-it-BF16/gemma-3-27b-it-BF16-00001-of-00002.gguf (version GGUF V3 (latest)) +llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output. +llama_model_loader: - kv 0: general.architecture str = gemma3 +llama_model_loader: - kv 1: general.type str = model +llama_model_loader: - kv 2: general.name str = Gemma-3-27B-It +llama_model_loader: - kv 3: general.finetune str = it +llama_model_loader: - kv 4: general.basename str = Gemma-3-27B-It +llama_model_loader: - kv 5: general.quantized_by str = Unsloth +llama_model_loader: - kv 6: general.size_label str = 27B +llama_model_loader: - kv 7: general.repo_url str = https://huggingface.co/unsloth +llama_model_loader: - kv 8: gemma3.context_length u32 = 131072 +llama_model_loader: - kv 9: gemma3.embedding_length u32 = 5376 +llama_model_loader: - kv 10: gemma3.block_count u32 = 62 +llama_model_loader: - kv 11: gemma3.feed_forward_length u32 = 21504 +llama_model_loader: - kv 12: gemma3.attention.head_count u32 = 32 +llama_model_loader: - kv 13: gemma3.attention.layer_norm_rms_epsilon f32 = 0.000001 +llama_model_loader: - kv 14: gemma3.attention.key_length u32 = 128 +llama_model_loader: - kv 15: 
gemma3.attention.value_length u32 = 128 +llama_model_loader: - kv 16: general.file_type u32 = 32 +llama_model_loader: - kv 17: gemma3.rope.freq_base f32 = 1000000.000000 +llama_model_loader: - kv 18: gemma3.attention.sliding_window u32 = 1024 +llama_model_loader: - kv 19: gemma3.attention.head_count_kv u32 = 16 +llama_model_loader: - kv 20: gemma3.rope.scaling.type str = linear +llama_model_loader: - kv 21: gemma3.rope.scaling.factor f32 = 8.000000 +llama_model_loader: - kv 22: general.quantization_version u32 = 2 +llama_model_loader: - kv 23: tokenizer.ggml.model str = llama +llama_model_loader: - kv 24: tokenizer.ggml.pre str = default +llama_model_loader: - kv 25: tokenizer.ggml.tokens arr[str,262208] = ["", "", "", "", ... +llama_model_loader: - kv 26: tokenizer.ggml.scores arr[f32,262208] = [-1000.000000, -1000.000000, -1000.00... +llama_model_loader: - kv 27: tokenizer.ggml.token_type arr[i32,262208] = [3, 3, 3, 3, 3, 4, 3, 3, 3, 3, 3, 3, ... +llama_model_loader: - kv 28: tokenizer.ggml.bos_token_id u32 = 2 +llama_model_loader: - kv 29: tokenizer.ggml.eos_token_id u32 = 106 +llama_model_loader: - kv 30: tokenizer.ggml.unknown_token_id u32 = 3 +llama_model_loader: - kv 31: tokenizer.ggml.padding_token_id u32 = 0 +llama_model_loader: - kv 32: tokenizer.ggml.add_bos_token bool = true +llama_model_loader: - kv 33: tokenizer.ggml.add_eos_token bool = false +llama_model_loader: - kv 34: tokenizer.chat_template str = {{ bos_token }}\n{%- if messages[0]['r... 
+llama_model_loader: - kv 35: tokenizer.ggml.add_space_prefix bool = false +llama_model_loader: - kv 36: split.no u16 = 0 +llama_model_loader: - kv 37: split.count u16 = 2 +llama_model_loader: - kv 38: split.tensors.count i32 = 808 +llama_model_loader: - type f32: 373 tensors +llama_model_loader: - type bf16: 435 tensors +print_info: file format = GGUF V3 (latest) +print_info: file type = BF16 +print_info: file size = 50.31 GiB (16.00 BPW) +load: special tokens cache size = 6415 +load: token to piece cache size = 1.9446 MB +print_info: arch = gemma3 +print_info: vocab_only = 0 +print_info: n_ctx_train = 131072 +print_info: n_embd = 5376 +print_info: n_layer = 62 +print_info: n_head = 32 +print_info: n_head_kv = 16 +print_info: n_rot = 128 +print_info: n_swa = 1024 +print_info: is_swa_any = 1 +print_info: n_embd_head_k = 128 +print_info: n_embd_head_v = 128 +print_info: n_gqa = 2 +print_info: n_embd_k_gqa = 2048 +print_info: n_embd_v_gqa = 2048 +print_info: f_norm_eps = 0.0e+00 +print_info: f_norm_rms_eps = 1.0e-06 +print_info: f_clamp_kqv = 0.0e+00 +print_info: f_max_alibi_bias = 0.0e+00 +print_info: f_logit_scale = 0.0e+00 +print_info: f_attn_scale = 7.7e-02 +print_info: n_ff = 21504 +print_info: n_expert = 0 +print_info: n_expert_used = 0 +print_info: causal attn = 1 +print_info: pooling type = 0 +print_info: rope type = 2 +print_info: rope scaling = linear +print_info: freq_base_train = 1000000.0 +print_info: freq_scale_train = 0.125 +print_info: n_ctx_orig_yarn = 131072 +print_info: rope_finetuned = unknown +print_info: model type = 27B +print_info: model params = 27.01 B +print_info: general.name = Gemma-3-27B-It +print_info: vocab type = SPM +print_info: n_vocab = 262208 +print_info: n_merges = 0 +print_info: BOS token = 2 '' +print_info: EOS token = 106 '' +print_info: EOT token = 106 '' +print_info: UNK token = 3 '' +print_info: PAD token = 0 '' +print_info: LF token = 248 '<0x0A>' +print_info: EOG token = 106 '' +print_info: max token length = 48 
+load_tensors: loading model tensors, this can take a while... (mmap = false) +load_tensors: offloading 62 repeating layers to GPU +load_tensors: offloading output layer to GPU +load_tensors: offloaded 63/63 layers to GPU +load_tensors: ROCm0 model buffer size = 51518.82 MiB +load_tensors: ROCm_Host model buffer size = 2688.66 MiB +............................................................................................. +llama_context: constructing llama_context +llama_context: non-unified KV cache requires ggml_set_rows() - forcing unified KV cache +llama_context: n_seq_max = 1 +llama_context: n_ctx = 4096 +llama_context: n_ctx_per_seq = 4096 +llama_context: n_batch = 2048 +llama_context: n_ubatch = 512 +llama_context: causal_attn = 1 +llama_context: flash_attn = 1 +llama_context: kv_unified = true +llama_context: freq_base = 1000000.0 +llama_context: freq_scale = 0.125 +llama_context: n_ctx_per_seq (4096) < n_ctx_train (131072) -- the full capacity of the model will not be utilized +llama_context: ROCm_Host output buffer size = 1.00 MiB +llama_kv_cache_unified_iswa: creating non-SWA KV cache, size = 4096 cells +llama_kv_cache_unified: ROCm0 KV buffer size = 320.00 MiB +llama_kv_cache_unified: size = 320.00 MiB ( 4096 cells, 10 layers, 1/ 1 seqs), K (f16): 160.00 MiB, V (f16): 160.00 MiB +llama_kv_cache_unified: LLAMA_SET_ROWS=0, using old ggml_cpy() method for backwards compatibility +llama_kv_cache_unified_iswa: creating SWA KV cache, size = 1536 cells +llama_kv_cache_unified: ROCm0 KV buffer size = 624.00 MiB +llama_kv_cache_unified: size = 624.00 MiB ( 1536 cells, 52 layers, 1/ 1 seqs), K (f16): 312.00 MiB, V (f16): 312.00 MiB +llama_kv_cache_unified: LLAMA_SET_ROWS=0, using old ggml_cpy() method for backwards compatibility +llama_context: ROCm0 compute buffer size = 522.62 MiB +llama_context: ROCm_Host compute buffer size = 11.01 MiB +llama_context: graph nodes = 2613 +llama_context: graph splits = 1 +common_init_from_params: KV cache shifting is not 
supported for this context, disabling KV cache shifting +common_init_from_params: added logit bias = -inf +common_init_from_params: setting dry_penalty_last_n to ctx_size = 4096 +common_init_from_params: warming up the model with an empty run - please wait ... (--no-warmup to disable) +main: llama threadpool init, n_threads = 16 + +system_info: n_threads = 16 (n_threads_batch = 16) / 32 | ROCm : NO_VMM = 1 | PEER_MAX_BATCH_SIZE = 128 | CPU : SSE3 = 1 | SSSE3 = 1 | AVX = 1 | AVX2 = 1 | F16C = 1 | FMA = 1 | BMI2 = 1 | AVX512 = 1 | AVX512_VBMI = 1 | AVX512_VNNI = 1 | LLAMAFILE = 1 | OPENMP = 1 | REPACK = 1 | + +sampler seed: 88592582 +sampler params: + repeat_last_n = 64, repeat_penalty = 1.000, frequency_penalty = 0.000, presence_penalty = 0.000 + dry_multiplier = 0.000, dry_base = 1.750, dry_allowed_length = 2, dry_penalty_last_n = 4096 + top_k = 40, top_p = 0.950, min_p = 0.050, xtc_probability = 0.000, xtc_threshold = 0.100, typical_p = 1.000, top_n_sigma = -1.000, temp = 0.800 + mirostat = 0, mirostat_lr = 0.100, mirostat_ent = 5.000 +sampler chain: logits -> logit-bias -> penalties -> dry -> top-n-sigma -> top-k -> typical -> top-p -> min-p -> xtc -> temp-ext -> dist +generate: n_ctx = 4096, n_batch = 2048, n_predict = 1, n_keep = 1 + +Hello, + +llama_perf_sampler_print: sampling time = 0.09 ms / 3 runs ( 0.03 ms per token, 35294.12 tokens per second) +llama_perf_context_print: load time = 10385.57 ms +llama_perf_context_print: prompt eval time = 253.71 ms / 2 tokens ( 126.85 ms per token, 7.88 tokens per second) +llama_perf_context_print: eval time = 0.00 ms / 1 runs ( 0.00 ms per token, inf tokens per second) +llama_perf_context_print: total time = 259.35 ms / 3 tokens +llama_perf_context_print: graphs reused = 0 + Elapsed #3: 11.144656718s + Run #3 status: 0 + → Avg over 3 runs: 10.486s diff --git a/benchmark/loadtime_results/gemma-3-27b-it-BF16-00001-of-00002__rocm7_rc.log b/benchmark/loadtime_results/gemma-3-27b-it-BF16-00001-of-00002__rocm7_rc.log new file 
mode 100644 index 0000000..acb8825 --- /dev/null +++ b/benchmark/loadtime_results/gemma-3-27b-it-BF16-00001-of-00002__rocm7_rc.log @@ -0,0 +1,164 @@ +ggml_cuda_init: GGML_CUDA_FORCE_MMQ: no +ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no +ggml_cuda_init: found 1 ROCm devices: + Device 0: AMD Radeon Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32 +build: 6066 (4cb208c9) with cc (GCC) 15.1.1 20250719 (Red Hat 15.1.1-5) for x86_64-redhat-linux +main: llama backend init +main: load the model and apply lora adapter, if any +llama_model_load_from_file_impl: using device ROCm0 (AMD Radeon Graphics) - 124523 MiB free +llama_model_loader: additional 1 GGUFs metadata loaded. +llama_model_loader: loaded meta data with 39 key-value pairs and 808 tensors from /home/kyuz0/models/gemma-3-27b-it-BF16/gemma-3-27b-it-BF16-00001-of-00002.gguf (version GGUF V3 (latest)) +llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output. +llama_model_loader: - kv 0: general.architecture str = gemma3 +llama_model_loader: - kv 1: general.type str = model +llama_model_loader: - kv 2: general.name str = Gemma-3-27B-It +llama_model_loader: - kv 3: general.finetune str = it +llama_model_loader: - kv 4: general.basename str = Gemma-3-27B-It +llama_model_loader: - kv 5: general.quantized_by str = Unsloth +llama_model_loader: - kv 6: general.size_label str = 27B +llama_model_loader: - kv 7: general.repo_url str = https://huggingface.co/unsloth +llama_model_loader: - kv 8: gemma3.context_length u32 = 131072 +llama_model_loader: - kv 9: gemma3.embedding_length u32 = 5376 +llama_model_loader: - kv 10: gemma3.block_count u32 = 62 +llama_model_loader: - kv 11: gemma3.feed_forward_length u32 = 21504 +llama_model_loader: - kv 12: gemma3.attention.head_count u32 = 32 +llama_model_loader: - kv 13: gemma3.attention.layer_norm_rms_epsilon f32 = 0.000001 +llama_model_loader: - kv 14: gemma3.attention.key_length u32 = 128 +llama_model_loader: - kv 15: 
gemma3.attention.value_length u32 = 128 +llama_model_loader: - kv 16: general.file_type u32 = 32 +llama_model_loader: - kv 17: gemma3.rope.freq_base f32 = 1000000.000000 +llama_model_loader: - kv 18: gemma3.attention.sliding_window u32 = 1024 +llama_model_loader: - kv 19: gemma3.attention.head_count_kv u32 = 16 +llama_model_loader: - kv 20: gemma3.rope.scaling.type str = linear +llama_model_loader: - kv 21: gemma3.rope.scaling.factor f32 = 8.000000 +llama_model_loader: - kv 22: general.quantization_version u32 = 2 +llama_model_loader: - kv 23: tokenizer.ggml.model str = llama +llama_model_loader: - kv 24: tokenizer.ggml.pre str = default +llama_model_loader: - kv 25: tokenizer.ggml.tokens arr[str,262208] = ["", "", "", "", ... +llama_model_loader: - kv 26: tokenizer.ggml.scores arr[f32,262208] = [-1000.000000, -1000.000000, -1000.00... +llama_model_loader: - kv 27: tokenizer.ggml.token_type arr[i32,262208] = [3, 3, 3, 3, 3, 4, 3, 3, 3, 3, 3, 3, ... +llama_model_loader: - kv 28: tokenizer.ggml.bos_token_id u32 = 2 +llama_model_loader: - kv 29: tokenizer.ggml.eos_token_id u32 = 106 +llama_model_loader: - kv 30: tokenizer.ggml.unknown_token_id u32 = 3 +llama_model_loader: - kv 31: tokenizer.ggml.padding_token_id u32 = 0 +llama_model_loader: - kv 32: tokenizer.ggml.add_bos_token bool = true +llama_model_loader: - kv 33: tokenizer.ggml.add_eos_token bool = false +llama_model_loader: - kv 34: tokenizer.chat_template str = {{ bos_token }}\n{%- if messages[0]['r... 
+llama_model_loader: - kv 35: tokenizer.ggml.add_space_prefix bool = false +llama_model_loader: - kv 36: split.no u16 = 0 +llama_model_loader: - kv 37: split.count u16 = 2 +llama_model_loader: - kv 38: split.tensors.count i32 = 808 +llama_model_loader: - type f32: 373 tensors +llama_model_loader: - type bf16: 435 tensors +print_info: file format = GGUF V3 (latest) +print_info: file type = BF16 +print_info: file size = 50.31 GiB (16.00 BPW) +load: special tokens cache size = 6415 +load: token to piece cache size = 1.9446 MB +print_info: arch = gemma3 +print_info: vocab_only = 0 +print_info: n_ctx_train = 131072 +print_info: n_embd = 5376 +print_info: n_layer = 62 +print_info: n_head = 32 +print_info: n_head_kv = 16 +print_info: n_rot = 128 +print_info: n_swa = 1024 +print_info: is_swa_any = 1 +print_info: n_embd_head_k = 128 +print_info: n_embd_head_v = 128 +print_info: n_gqa = 2 +print_info: n_embd_k_gqa = 2048 +print_info: n_embd_v_gqa = 2048 +print_info: f_norm_eps = 0.0e+00 +print_info: f_norm_rms_eps = 1.0e-06 +print_info: f_clamp_kqv = 0.0e+00 +print_info: f_max_alibi_bias = 0.0e+00 +print_info: f_logit_scale = 0.0e+00 +print_info: f_attn_scale = 7.7e-02 +print_info: n_ff = 21504 +print_info: n_expert = 0 +print_info: n_expert_used = 0 +print_info: causal attn = 1 +print_info: pooling type = 0 +print_info: rope type = 2 +print_info: rope scaling = linear +print_info: freq_base_train = 1000000.0 +print_info: freq_scale_train = 0.125 +print_info: n_ctx_orig_yarn = 131072 +print_info: rope_finetuned = unknown +print_info: model type = 27B +print_info: model params = 27.01 B +print_info: general.name = Gemma-3-27B-It +print_info: vocab type = SPM +print_info: n_vocab = 262208 +print_info: n_merges = 0 +print_info: BOS token = 2 '' +print_info: EOS token = 106 '' +print_info: EOT token = 106 '' +print_info: UNK token = 3 '' +print_info: PAD token = 0 '' +print_info: LF token = 248 '<0x0A>' +print_info: EOG token = 106 '' +print_info: max token length = 48 
+load_tensors: loading model tensors, this can take a while... (mmap = false) +load_tensors: offloading 62 repeating layers to GPU +load_tensors: offloading output layer to GPU +load_tensors: offloaded 63/63 layers to GPU +load_tensors: ROCm0 model buffer size = 51518.82 MiB +load_tensors: ROCm_Host model buffer size = 2688.66 MiB +............................................................................................. +llama_context: constructing llama_context +llama_context: non-unified KV cache requires ggml_set_rows() - forcing unified KV cache +llama_context: n_seq_max = 1 +llama_context: n_ctx = 4096 +llama_context: n_ctx_per_seq = 4096 +llama_context: n_batch = 2048 +llama_context: n_ubatch = 512 +llama_context: causal_attn = 1 +llama_context: flash_attn = 1 +llama_context: kv_unified = true +llama_context: freq_base = 1000000.0 +llama_context: freq_scale = 0.125 +llama_context: n_ctx_per_seq (4096) < n_ctx_train (131072) -- the full capacity of the model will not be utilized +llama_context: ROCm_Host output buffer size = 1.00 MiB +llama_kv_cache_unified_iswa: creating non-SWA KV cache, size = 4096 cells +llama_kv_cache_unified: ROCm0 KV buffer size = 320.00 MiB +llama_kv_cache_unified: size = 320.00 MiB ( 4096 cells, 10 layers, 1/ 1 seqs), K (f16): 160.00 MiB, V (f16): 160.00 MiB +llama_kv_cache_unified: LLAMA_SET_ROWS=0, using old ggml_cpy() method for backwards compatibility +llama_kv_cache_unified_iswa: creating SWA KV cache, size = 1536 cells +llama_kv_cache_unified: ROCm0 KV buffer size = 624.00 MiB +llama_kv_cache_unified: size = 624.00 MiB ( 1536 cells, 52 layers, 1/ 1 seqs), K (f16): 312.00 MiB, V (f16): 312.00 MiB +llama_kv_cache_unified: LLAMA_SET_ROWS=0, using old ggml_cpy() method for backwards compatibility +llama_context: ROCm0 compute buffer size = 522.62 MiB +llama_context: ROCm_Host compute buffer size = 11.01 MiB +llama_context: graph nodes = 2613 +llama_context: graph splits = 1 +common_init_from_params: KV cache shifting is not 
supported for this context, disabling KV cache shifting +common_init_from_params: added logit bias = -inf +common_init_from_params: setting dry_penalty_last_n to ctx_size = 4096 +common_init_from_params: warming up the model with an empty run - please wait ... (--no-warmup to disable) +main: llama threadpool init, n_threads = 16 + +system_info: n_threads = 16 (n_threads_batch = 16) / 32 | ROCm : NO_VMM = 1 | PEER_MAX_BATCH_SIZE = 128 | CPU : SSE3 = 1 | SSSE3 = 1 | AVX = 1 | AVX2 = 1 | F16C = 1 | FMA = 1 | BMI2 = 1 | AVX512 = 1 | AVX512_VBMI = 1 | AVX512_VNNI = 1 | LLAMAFILE = 1 | OPENMP = 1 | REPACK = 1 | + +sampler seed: 1422263455 +sampler params: + repeat_last_n = 64, repeat_penalty = 1.000, frequency_penalty = 0.000, presence_penalty = 0.000 + dry_multiplier = 0.000, dry_base = 1.750, dry_allowed_length = 2, dry_penalty_last_n = 4096 + top_k = 40, top_p = 0.950, min_p = 0.050, xtc_probability = 0.000, xtc_threshold = 0.100, typical_p = 1.000, top_n_sigma = -1.000, temp = 0.800 + mirostat = 0, mirostat_lr = 0.100, mirostat_ent = 5.000 +sampler chain: logits -> logit-bias -> penalties -> dry -> top-n-sigma -> top-k -> typical -> top-p -> min-p -> xtc -> temp-ext -> dist +generate: n_ctx = 4096, n_batch = 2048, n_predict = 1, n_keep = 1 + +Hello, + +llama_perf_sampler_print: sampling time = 0.09 ms / 3 runs ( 0.03 ms per token, 35294.12 tokens per second) +llama_perf_context_print: load time = 9620.16 ms +llama_perf_context_print: prompt eval time = 256.55 ms / 2 tokens ( 128.27 ms per token, 7.80 tokens per second) +llama_perf_context_print: eval time = 0.00 ms / 1 runs ( 0.00 ms per token, inf tokens per second) +llama_perf_context_print: total time = 261.63 ms / 3 tokens +llama_perf_context_print: graphs reused = 0 + Elapsed #3: 10.587027979s + Run #3 status: 0 + → Avg over 3 runs: 10.417s diff --git a/benchmark/loadtime_results/gemma-3-27b-it-BF16-00001-of-00002__vulkan_amdvlk.log 
b/benchmark/loadtime_results/gemma-3-27b-it-BF16-00001-of-00002__vulkan_amdvlk.log new file mode 100644 index 0000000..d13050e --- /dev/null +++ b/benchmark/loadtime_results/gemma-3-27b-it-BF16-00001-of-00002__vulkan_amdvlk.log @@ -0,0 +1,113 @@ +ggml_vulkan: Found 1 Vulkan devices: +ggml_vulkan: 0 = Radeon 8060S Graphics (AMD open-source driver) | uma: 1 | fp16: 1 | bf16: 0 | warp size: 64 | shared memory: 32768 | int dot: 1 | matrix cores: KHR_coopmat +build: 6060 (9c35706b) with cc (GCC) 15.1.1 20250719 (Red Hat 15.1.1-5) for x86_64-redhat-linux +main: llama backend init +main: load the model and apply lora adapter, if any +llama_model_load_from_file_impl: using device Vulkan0 (Radeon 8060S Graphics) - 85720 MiB free +llama_model_loader: additional 1 GGUFs metadata loaded. +llama_model_loader: loaded meta data with 39 key-value pairs and 808 tensors from /home/kyuz0/models/gemma-3-27b-it-BF16/gemma-3-27b-it-BF16-00001-of-00002.gguf (version GGUF V3 (latest)) +llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output. 
+llama_model_loader: - kv 0: general.architecture str = gemma3 +llama_model_loader: - kv 1: general.type str = model +llama_model_loader: - kv 2: general.name str = Gemma-3-27B-It +llama_model_loader: - kv 3: general.finetune str = it +llama_model_loader: - kv 4: general.basename str = Gemma-3-27B-It +llama_model_loader: - kv 5: general.quantized_by str = Unsloth +llama_model_loader: - kv 6: general.size_label str = 27B +llama_model_loader: - kv 7: general.repo_url str = https://huggingface.co/unsloth +llama_model_loader: - kv 8: gemma3.context_length u32 = 131072 +llama_model_loader: - kv 9: gemma3.embedding_length u32 = 5376 +llama_model_loader: - kv 10: gemma3.block_count u32 = 62 +llama_model_loader: - kv 11: gemma3.feed_forward_length u32 = 21504 +llama_model_loader: - kv 12: gemma3.attention.head_count u32 = 32 +llama_model_loader: - kv 13: gemma3.attention.layer_norm_rms_epsilon f32 = 0.000001 +llama_model_loader: - kv 14: gemma3.attention.key_length u32 = 128 +llama_model_loader: - kv 15: gemma3.attention.value_length u32 = 128 +llama_model_loader: - kv 16: general.file_type u32 = 32 +llama_model_loader: - kv 17: gemma3.rope.freq_base f32 = 1000000.000000 +llama_model_loader: - kv 18: gemma3.attention.sliding_window u32 = 1024 +llama_model_loader: - kv 19: gemma3.attention.head_count_kv u32 = 16 +llama_model_loader: - kv 20: gemma3.rope.scaling.type str = linear +llama_model_loader: - kv 21: gemma3.rope.scaling.factor f32 = 8.000000 +llama_model_loader: - kv 22: general.quantization_version u32 = 2 +llama_model_loader: - kv 23: tokenizer.ggml.model str = llama +llama_model_loader: - kv 24: tokenizer.ggml.pre str = default +llama_model_loader: - kv 25: tokenizer.ggml.tokens arr[str,262208] = ["", "", "", "", ... +llama_model_loader: - kv 26: tokenizer.ggml.scores arr[f32,262208] = [-1000.000000, -1000.000000, -1000.00... +llama_model_loader: - kv 27: tokenizer.ggml.token_type arr[i32,262208] = [3, 3, 3, 3, 3, 4, 3, 3, 3, 3, 3, 3, ... 
+llama_model_loader: - kv 28: tokenizer.ggml.bos_token_id u32 = 2 +llama_model_loader: - kv 29: tokenizer.ggml.eos_token_id u32 = 106 +llama_model_loader: - kv 30: tokenizer.ggml.unknown_token_id u32 = 3 +llama_model_loader: - kv 31: tokenizer.ggml.padding_token_id u32 = 0 +llama_model_loader: - kv 32: tokenizer.ggml.add_bos_token bool = true +llama_model_loader: - kv 33: tokenizer.ggml.add_eos_token bool = false +llama_model_loader: - kv 34: tokenizer.chat_template str = {{ bos_token }}\n{%- if messages[0]['r... +llama_model_loader: - kv 35: tokenizer.ggml.add_space_prefix bool = false +llama_model_loader: - kv 36: split.no u16 = 0 +llama_model_loader: - kv 37: split.count u16 = 2 +llama_model_loader: - kv 38: split.tensors.count i32 = 808 +llama_model_loader: - type f32: 373 tensors +llama_model_loader: - type bf16: 435 tensors +print_info: file format = GGUF V3 (latest) +print_info: file type = BF16 +print_info: file size = 50.31 GiB (16.00 BPW) +load: special tokens cache size = 6415 +load: token to piece cache size = 1.9446 MB +print_info: arch = gemma3 +print_info: vocab_only = 0 +print_info: n_ctx_train = 131072 +print_info: n_embd = 5376 +print_info: n_layer = 62 +print_info: n_head = 32 +print_info: n_head_kv = 16 +print_info: n_rot = 128 +print_info: n_swa = 1024 +print_info: is_swa_any = 1 +print_info: n_embd_head_k = 128 +print_info: n_embd_head_v = 128 +print_info: n_gqa = 2 +print_info: n_embd_k_gqa = 2048 +print_info: n_embd_v_gqa = 2048 +print_info: f_norm_eps = 0.0e+00 +print_info: f_norm_rms_eps = 1.0e-06 +print_info: f_clamp_kqv = 0.0e+00 +print_info: f_max_alibi_bias = 0.0e+00 +print_info: f_logit_scale = 0.0e+00 +print_info: f_attn_scale = 7.7e-02 +print_info: n_ff = 21504 +print_info: n_expert = 0 +print_info: n_expert_used = 0 +print_info: causal attn = 1 +print_info: pooling type = 0 +print_info: rope type = 2 +print_info: rope scaling = linear +print_info: freq_base_train = 1000000.0 +print_info: freq_scale_train = 0.125 +print_info: 
n_ctx_orig_yarn = 131072 +print_info: rope_finetuned = unknown +print_info: model type = 27B +print_info: model params = 27.01 B +print_info: general.name = Gemma-3-27B-It +print_info: vocab type = SPM +print_info: n_vocab = 262208 +print_info: n_merges = 0 +print_info: BOS token = 2 '' +print_info: EOS token = 106 '' +print_info: EOT token = 106 '' +print_info: UNK token = 3 '' +print_info: PAD token = 0 '' +print_info: LF token = 248 '<0x0A>' +print_info: EOG token = 106 '' +print_info: max token length = 48 +load_tensors: loading model tensors, this can take a while... (mmap = false) +ggml_vulkan: Device memory allocation of size 2819260416 failed. +ggml_vulkan: Requested buffer size exceeds device memory allocation limit: ErrorOutOfDeviceMemory +alloc_tensor_range: failed to allocate Vulkan0 buffer of size 2819260416 +llama_model_load: error loading model: unable to allocate Vulkan0 buffer +llama_model_load_from_file_impl: failed to load model +common_init_from_params: failed to load model '/home/kyuz0/models/gemma-3-27b-it-BF16/gemma-3-27b-it-BF16-00001-of-00002.gguf' +main: error: unable to load model + Elapsed #3: .416644024s + Run #3 status: 1 + ✖ run #3 failed + → No successful runs diff --git a/benchmark/loadtime_results/gemma-3-27b-it-BF16-00001-of-00002__vulkan_radv.log b/benchmark/loadtime_results/gemma-3-27b-it-BF16-00001-of-00002__vulkan_radv.log new file mode 100644 index 0000000..095974b --- /dev/null +++ b/benchmark/loadtime_results/gemma-3-27b-it-BF16-00001-of-00002__vulkan_radv.log @@ -0,0 +1,162 @@ +ggml_vulkan: Found 1 Vulkan devices: +ggml_vulkan: 0 = Radeon 8060S Graphics (RADV GFX1151) (radv) | uma: 1 | fp16: 1 | bf16: 0 | warp size: 64 | shared memory: 65536 | int dot: 1 | matrix cores: KHR_coopmat +build: 6040 (66625a59) with cc (GCC) 15.1.1 20250719 (Red Hat 15.1.1-5) for x86_64-redhat-linux +main: llama backend init +main: load the model and apply lora adapter, if any +llama_model_load_from_file_impl: using device Vulkan0 (Radeon 8060S 
Graphics (RADV GFX1151)) - 87722 MiB free +llama_model_loader: additional 1 GGUFs metadata loaded. +llama_model_loader: loaded meta data with 39 key-value pairs and 808 tensors from /home/kyuz0/models/gemma-3-27b-it-BF16/gemma-3-27b-it-BF16-00001-of-00002.gguf (version GGUF V3 (latest)) +llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output. +llama_model_loader: - kv 0: general.architecture str = gemma3 +llama_model_loader: - kv 1: general.type str = model +llama_model_loader: - kv 2: general.name str = Gemma-3-27B-It +llama_model_loader: - kv 3: general.finetune str = it +llama_model_loader: - kv 4: general.basename str = Gemma-3-27B-It +llama_model_loader: - kv 5: general.quantized_by str = Unsloth +llama_model_loader: - kv 6: general.size_label str = 27B +llama_model_loader: - kv 7: general.repo_url str = https://huggingface.co/unsloth +llama_model_loader: - kv 8: gemma3.context_length u32 = 131072 +llama_model_loader: - kv 9: gemma3.embedding_length u32 = 5376 +llama_model_loader: - kv 10: gemma3.block_count u32 = 62 +llama_model_loader: - kv 11: gemma3.feed_forward_length u32 = 21504 +llama_model_loader: - kv 12: gemma3.attention.head_count u32 = 32 +llama_model_loader: - kv 13: gemma3.attention.layer_norm_rms_epsilon f32 = 0.000001 +llama_model_loader: - kv 14: gemma3.attention.key_length u32 = 128 +llama_model_loader: - kv 15: gemma3.attention.value_length u32 = 128 +llama_model_loader: - kv 16: general.file_type u32 = 32 +llama_model_loader: - kv 17: gemma3.rope.freq_base f32 = 1000000.000000 +llama_model_loader: - kv 18: gemma3.attention.sliding_window u32 = 1024 +llama_model_loader: - kv 19: gemma3.attention.head_count_kv u32 = 16 +llama_model_loader: - kv 20: gemma3.rope.scaling.type str = linear +llama_model_loader: - kv 21: gemma3.rope.scaling.factor f32 = 8.000000 +llama_model_loader: - kv 22: general.quantization_version u32 = 2 +llama_model_loader: - kv 23: tokenizer.ggml.model str = llama 
+llama_model_loader: - kv 24: tokenizer.ggml.pre str = default +llama_model_loader: - kv 25: tokenizer.ggml.tokens arr[str,262208] = ["", "", "", "", ... +llama_model_loader: - kv 26: tokenizer.ggml.scores arr[f32,262208] = [-1000.000000, -1000.000000, -1000.00... +llama_model_loader: - kv 27: tokenizer.ggml.token_type arr[i32,262208] = [3, 3, 3, 3, 3, 4, 3, 3, 3, 3, 3, 3, ... +llama_model_loader: - kv 28: tokenizer.ggml.bos_token_id u32 = 2 +llama_model_loader: - kv 29: tokenizer.ggml.eos_token_id u32 = 106 +llama_model_loader: - kv 30: tokenizer.ggml.unknown_token_id u32 = 3 +llama_model_loader: - kv 31: tokenizer.ggml.padding_token_id u32 = 0 +llama_model_loader: - kv 32: tokenizer.ggml.add_bos_token bool = true +llama_model_loader: - kv 33: tokenizer.ggml.add_eos_token bool = false +llama_model_loader: - kv 34: tokenizer.chat_template str = {{ bos_token }}\n{%- if messages[0]['r... +llama_model_loader: - kv 35: tokenizer.ggml.add_space_prefix bool = false +llama_model_loader: - kv 36: split.no u16 = 0 +llama_model_loader: - kv 37: split.count u16 = 2 +llama_model_loader: - kv 38: split.tensors.count i32 = 808 +llama_model_loader: - type f32: 373 tensors +llama_model_loader: - type bf16: 435 tensors +print_info: file format = GGUF V3 (latest) +print_info: file type = BF16 +print_info: file size = 50.31 GiB (16.00 BPW) +load: special tokens cache size = 6415 +load: token to piece cache size = 1.9446 MB +print_info: arch = gemma3 +print_info: vocab_only = 0 +print_info: n_ctx_train = 131072 +print_info: n_embd = 5376 +print_info: n_layer = 62 +print_info: n_head = 32 +print_info: n_head_kv = 16 +print_info: n_rot = 128 +print_info: n_swa = 1024 +print_info: is_swa_any = 1 +print_info: n_embd_head_k = 128 +print_info: n_embd_head_v = 128 +print_info: n_gqa = 2 +print_info: n_embd_k_gqa = 2048 +print_info: n_embd_v_gqa = 2048 +print_info: f_norm_eps = 0.0e+00 +print_info: f_norm_rms_eps = 1.0e-06 +print_info: f_clamp_kqv = 0.0e+00 +print_info: f_max_alibi_bias = 
0.0e+00 +print_info: f_logit_scale = 0.0e+00 +print_info: f_attn_scale = 7.7e-02 +print_info: n_ff = 21504 +print_info: n_expert = 0 +print_info: n_expert_used = 0 +print_info: causal attn = 1 +print_info: pooling type = 0 +print_info: rope type = 2 +print_info: rope scaling = linear +print_info: freq_base_train = 1000000.0 +print_info: freq_scale_train = 0.125 +print_info: n_ctx_orig_yarn = 131072 +print_info: rope_finetuned = unknown +print_info: model type = 27B +print_info: model params = 27.01 B +print_info: general.name = Gemma-3-27B-It +print_info: vocab type = SPM +print_info: n_vocab = 262208 +print_info: n_merges = 0 +print_info: BOS token = 2 '' +print_info: EOS token = 106 '' +print_info: EOT token = 106 '' +print_info: UNK token = 3 '' +print_info: PAD token = 0 '' +print_info: LF token = 248 '<0x0A>' +print_info: EOG token = 106 '' +print_info: max token length = 48 +load_tensors: loading model tensors, this can take a while... (mmap = false) +load_tensors: offloading 62 repeating layers to GPU +load_tensors: offloading output layer to GPU +load_tensors: offloaded 63/63 layers to GPU +load_tensors: Vulkan0 model buffer size = 51518.82 MiB +load_tensors: Vulkan_Host model buffer size = 2688.66 MiB +............................................................................................. 
+llama_context: constructing llama_context +llama_context: non-unified KV cache requires ggml_set_rows() - forcing unified KV cache +llama_context: n_seq_max = 1 +llama_context: n_ctx = 4096 +llama_context: n_ctx_per_seq = 4096 +llama_context: n_batch = 2048 +llama_context: n_ubatch = 512 +llama_context: causal_attn = 1 +llama_context: flash_attn = 1 +llama_context: kv_unified = true +llama_context: freq_base = 1000000.0 +llama_context: freq_scale = 0.125 +llama_context: n_ctx_per_seq (4096) < n_ctx_train (131072) -- the full capacity of the model will not be utilized +llama_context: Vulkan_Host output buffer size = 1.00 MiB +llama_kv_cache_unified_iswa: creating non-SWA KV cache, size = 4096 cells +llama_kv_cache_unified: Vulkan0 KV buffer size = 320.00 MiB +llama_kv_cache_unified: size = 320.00 MiB ( 4096 cells, 10 layers, 1/ 1 seqs), K (f16): 160.00 MiB, V (f16): 160.00 MiB +llama_kv_cache_unified: LLAMA_SET_ROWS=0, using old ggml_cpy() method for backwards compatibility +llama_kv_cache_unified_iswa: creating SWA KV cache, size = 1536 cells +llama_kv_cache_unified: Vulkan0 KV buffer size = 624.00 MiB +llama_kv_cache_unified: size = 624.00 MiB ( 1536 cells, 52 layers, 1/ 1 seqs), K (f16): 312.00 MiB, V (f16): 312.00 MiB +llama_kv_cache_unified: LLAMA_SET_ROWS=0, using old ggml_cpy() method for backwards compatibility +llama_context: Vulkan0 compute buffer size = 522.62 MiB +llama_context: Vulkan_Host compute buffer size = 21.51 MiB +llama_context: graph nodes = 2613 +llama_context: graph splits = 2 +common_init_from_params: KV cache shifting is not supported for this context, disabling KV cache shifting +common_init_from_params: added logit bias = -inf +common_init_from_params: setting dry_penalty_last_n to ctx_size = 4096 +common_init_from_params: warming up the model with an empty run - please wait ... 
(--no-warmup to disable) +main: llama threadpool init, n_threads = 16 + +system_info: n_threads = 16 (n_threads_batch = 16) / 32 | CPU : SSE3 = 1 | SSSE3 = 1 | AVX = 1 | AVX2 = 1 | F16C = 1 | FMA = 1 | BMI2 = 1 | AVX512 = 1 | AVX512_VBMI = 1 | AVX512_VNNI = 1 | LLAMAFILE = 1 | OPENMP = 1 | REPACK = 1 | + +sampler seed: 4215263583 +sampler params: + repeat_last_n = 64, repeat_penalty = 1.000, frequency_penalty = 0.000, presence_penalty = 0.000 + dry_multiplier = 0.000, dry_base = 1.750, dry_allowed_length = 2, dry_penalty_last_n = 4096 + top_k = 40, top_p = 0.950, min_p = 0.050, xtc_probability = 0.000, xtc_threshold = 0.100, typical_p = 1.000, top_n_sigma = -1.000, temp = 0.800 + mirostat = 0, mirostat_lr = 0.100, mirostat_ent = 5.000 +sampler chain: logits -> logit-bias -> penalties -> dry -> top-n-sigma -> top-k -> typical -> top-p -> min-p -> xtc -> temp-ext -> dist +generate: n_ctx = 4096, n_batch = 2048, n_predict = 1, n_keep = 1 + +Hello, + +llama_perf_sampler_print: sampling time = 0.18 ms / 3 runs ( 0.06 ms per token, 16666.67 tokens per second) +llama_perf_context_print: load time = 14451.51 ms +llama_perf_context_print: prompt eval time = 257.32 ms / 2 tokens ( 128.66 ms per token, 7.77 tokens per second) +llama_perf_context_print: eval time = 0.00 ms / 1 runs ( 0.00 ms per token, inf tokens per second) +llama_perf_context_print: total time = 265.56 ms / 3 tokens +llama_perf_context_print: graphs reused = 0 + Elapsed #3: 15.024330058s + Run #3 status: 0 + → Avg over 3 runs: 13.579s diff --git a/benchmark/loadtime_results/llama3.3-70.6B-Q4_K_M__rocm6_4_2.log b/benchmark/loadtime_results/llama3.3-70.6B-Q4_K_M__rocm6_4_2.log new file mode 100644 index 0000000..dc8cd03 --- /dev/null +++ b/benchmark/loadtime_results/llama3.3-70.6B-Q4_K_M__rocm6_4_2.log @@ -0,0 +1,159 @@ +ggml_cuda_init: GGML_CUDA_FORCE_MMQ: no +ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no +ggml_cuda_init: found 1 ROCm devices: + Device 0: Radeon 8060S Graphics, gfx1151 (0x1151), VMM: no, Wave 
Size: 32 +build: 6040 (66625a59) with cc (GCC) 15.1.1 20250521 (Red Hat 15.1.1-2) for x86_64-redhat-linux +main: llama backend init +main: load the model and apply lora adapter, if any +llama_model_load_from_file_impl: using device ROCm0 (Radeon 8060S Graphics) - 124522 MiB free +llama_model_loader: loaded meta data with 36 key-value pairs and 724 tensors from /home/kyuz0/models/llama-3.3-Q4_K_M/llama3.3-70.6B-Q4_K_M.gguf (version GGUF V3 (latest)) +llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output. +llama_model_loader: - kv 0: general.architecture str = llama +llama_model_loader: - kv 1: general.type str = model +llama_model_loader: - kv 2: general.name str = Llama 3.1 70B Instruct 2024 12 +llama_model_loader: - kv 3: general.version str = 2024-12 +llama_model_loader: - kv 4: general.finetune str = Instruct +llama_model_loader: - kv 5: general.basename str = Llama-3.1 +llama_model_loader: - kv 6: general.size_label str = 70B +llama_model_loader: - kv 7: general.license str = llama3.1 +llama_model_loader: - kv 8: general.base_model.count u32 = 1 +llama_model_loader: - kv 9: general.base_model.0.name str = Llama 3.1 70B +llama_model_loader: - kv 10: general.base_model.0.organization str = Meta Llama +llama_model_loader: - kv 11: general.base_model.0.repo_url str = https://huggingface.co/meta-llama/Lla... +llama_model_loader: - kv 12: general.tags arr[str,5] = ["facebook", "meta", "pytorch", "llam... +llama_model_loader: - kv 13: general.languages arr[str,7] = ["fr", "it", "pt", "hi", "es", "th", ... 
+llama_model_loader: - kv 14: llama.block_count u32 = 80 +llama_model_loader: - kv 15: llama.context_length u32 = 131072 +llama_model_loader: - kv 16: llama.embedding_length u32 = 8192 +llama_model_loader: - kv 17: llama.feed_forward_length u32 = 28672 +llama_model_loader: - kv 18: llama.attention.head_count u32 = 64 +llama_model_loader: - kv 19: llama.attention.head_count_kv u32 = 8 +llama_model_loader: - kv 20: llama.rope.freq_base f32 = 500000.000000 +llama_model_loader: - kv 21: llama.attention.layer_norm_rms_epsilon f32 = 0.000010 +llama_model_loader: - kv 22: llama.attention.key_length u32 = 128 +llama_model_loader: - kv 23: llama.attention.value_length u32 = 128 +llama_model_loader: - kv 24: general.file_type u32 = 15 +llama_model_loader: - kv 25: llama.vocab_size u32 = 128256 +llama_model_loader: - kv 26: llama.rope.dimension_count u32 = 128 +llama_model_loader: - kv 27: tokenizer.ggml.model str = gpt2 +llama_model_loader: - kv 28: tokenizer.ggml.pre str = llama-bpe +llama_model_loader: - kv 29: tokenizer.ggml.tokens arr[str,128256] = ["!", "\"", "#", "$", "%", "&", "'", ... +llama_model_loader: - kv 30: tokenizer.ggml.token_type arr[i32,128256] = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ... +llama_model_loader: - kv 31: tokenizer.ggml.merges arr[str,280147] = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "... +llama_model_loader: - kv 32: tokenizer.ggml.bos_token_id u32 = 128000 +llama_model_loader: - kv 33: tokenizer.ggml.eos_token_id u32 = 128009 +llama_model_loader: - kv 34: tokenizer.chat_template str = {{- bos_token }}\n{%- if custom_tools ... 
+llama_model_loader: - kv 35: general.quantization_version u32 = 2 +llama_model_loader: - type f32: 162 tensors +llama_model_loader: - type q4_K: 441 tensors +llama_model_loader: - type q5_K: 40 tensors +llama_model_loader: - type q6_K: 81 tensors +print_info: file format = GGUF V3 (latest) +print_info: file type = Q4_K - Medium +print_info: file size = 39.59 GiB (4.82 BPW) +load: special tokens cache size = 256 +load: token to piece cache size = 0.7999 MB +print_info: arch = llama +print_info: vocab_only = 0 +print_info: n_ctx_train = 131072 +print_info: n_embd = 8192 +print_info: n_layer = 80 +print_info: n_head = 64 +print_info: n_head_kv = 8 +print_info: n_rot = 128 +print_info: n_swa = 0 +print_info: is_swa_any = 0 +print_info: n_embd_head_k = 128 +print_info: n_embd_head_v = 128 +print_info: n_gqa = 8 +print_info: n_embd_k_gqa = 1024 +print_info: n_embd_v_gqa = 1024 +print_info: f_norm_eps = 0.0e+00 +print_info: f_norm_rms_eps = 1.0e-05 +print_info: f_clamp_kqv = 0.0e+00 +print_info: f_max_alibi_bias = 0.0e+00 +print_info: f_logit_scale = 0.0e+00 +print_info: f_attn_scale = 0.0e+00 +print_info: n_ff = 28672 +print_info: n_expert = 0 +print_info: n_expert_used = 0 +print_info: causal attn = 1 +print_info: pooling type = 0 +print_info: rope type = 0 +print_info: rope scaling = linear +print_info: freq_base_train = 500000.0 +print_info: freq_scale_train = 1 +print_info: n_ctx_orig_yarn = 131072 +print_info: rope_finetuned = unknown +print_info: model type = 70B +print_info: model params = 70.55 B +print_info: general.name = Llama 3.1 70B Instruct 2024 12 +print_info: vocab type = BPE +print_info: n_vocab = 128256 +print_info: n_merges = 280147 +print_info: BOS token = 128000 '<|begin_of_text|>' +print_info: EOS token = 128009 '<|eot_id|>' +print_info: EOT token = 128009 '<|eot_id|>' +print_info: EOM token = 128008 '<|eom_id|>' +print_info: LF token = 198 'Ċ' +print_info: EOG token = 128001 '<|end_of_text|>' +print_info: EOG token = 128008 '<|eom_id|>' 
+print_info: EOG token = 128009 '<|eot_id|>' +print_info: max token length = 256 +load_tensors: loading model tensors, this can take a while... (mmap = false) +load_tensors: offloading 80 repeating layers to GPU +load_tensors: offloading output layer to GPU +load_tensors: offloaded 81/81 layers to GPU +load_tensors: CPU model buffer size = 563.62 MiB +load_tensors: ROCm0 model buffer size = 39979.48 MiB +................................................................................................... +llama_context: constructing llama_context +llama_context: non-unified KV cache requires ggml_set_rows() - forcing unified KV cache +llama_context: n_seq_max = 1 +llama_context: n_ctx = 4096 +llama_context: n_ctx_per_seq = 4096 +llama_context: n_batch = 2048 +llama_context: n_ubatch = 512 +llama_context: causal_attn = 1 +llama_context: flash_attn = 1 +llama_context: kv_unified = true +llama_context: freq_base = 500000.0 +llama_context: freq_scale = 1 +llama_context: n_ctx_per_seq (4096) < n_ctx_train (131072) -- the full capacity of the model will not be utilized +llama_context: ROCm_Host output buffer size = 0.49 MiB +llama_kv_cache_unified: ROCm0 KV buffer size = 1280.00 MiB +llama_kv_cache_unified: size = 1280.00 MiB ( 4096 cells, 80 layers, 1/ 1 seqs), K (f16): 640.00 MiB, V (f16): 640.00 MiB +llama_kv_cache_unified: LLAMA_SET_ROWS=0, using old ggml_cpy() method for backwards compatibility +llama_context: ROCm0 compute buffer size = 266.50 MiB +llama_context: ROCm_Host compute buffer size = 24.01 MiB +llama_context: graph nodes = 2647 +llama_context: graph splits = 2 +common_init_from_params: added <|end_of_text|> logit bias = -inf +common_init_from_params: added <|eom_id|> logit bias = -inf +common_init_from_params: added <|eot_id|> logit bias = -inf +common_init_from_params: setting dry_penalty_last_n to ctx_size = 4096 +common_init_from_params: warming up the model with an empty run - please wait ... 
(--no-warmup to disable) +main: llama threadpool init, n_threads = 16 + +system_info: n_threads = 16 (n_threads_batch = 16) / 32 | ROCm : NO_VMM = 1 | PEER_MAX_BATCH_SIZE = 128 | CPU : SSE3 = 1 | SSSE3 = 1 | AVX = 1 | AVX2 = 1 | F16C = 1 | FMA = 1 | BMI2 = 1 | AVX512 = 1 | AVX512_VBMI = 1 | AVX512_VNNI = 1 | LLAMAFILE = 1 | OPENMP = 1 | REPACK = 1 | + +sampler seed: 1295757489 +sampler params: + repeat_last_n = 64, repeat_penalty = 1.000, frequency_penalty = 0.000, presence_penalty = 0.000 + dry_multiplier = 0.000, dry_base = 1.750, dry_allowed_length = 2, dry_penalty_last_n = 4096 + top_k = 40, top_p = 0.950, min_p = 0.050, xtc_probability = 0.000, xtc_threshold = 0.100, typical_p = 1.000, top_n_sigma = -1.000, temp = 0.800 + mirostat = 0, mirostat_lr = 0.100, mirostat_ent = 5.000 +sampler chain: logits -> logit-bias -> penalties -> dry -> top-n-sigma -> top-k -> typical -> top-p -> min-p -> xtc -> temp-ext -> dist +generate: n_ctx = 4096, n_batch = 2048, n_predict = 1, n_keep = 1 + +Hello, + +llama_perf_sampler_print: sampling time = 0.05 ms / 3 runs ( 0.02 ms per token, 61224.49 tokens per second) +llama_perf_context_print: load time = 5592.62 ms +llama_perf_context_print: prompt eval time = 248.28 ms / 2 tokens ( 124.14 ms per token, 8.06 tokens per second) +llama_perf_context_print: eval time = 0.00 ms / 1 runs ( 0.00 ms per token, inf tokens per second) +llama_perf_context_print: total time = 263.25 ms / 3 tokens +llama_perf_context_print: graphs reused = 0 + Elapsed #3: 9.635053314s + Run #3 status: 0 + → Avg over 3 runs: 9.887s diff --git a/benchmark/loadtime_results/llama3.3-70.6B-Q4_K_M__rocm7_beta.log b/benchmark/loadtime_results/llama3.3-70.6B-Q4_K_M__rocm7_beta.log new file mode 100644 index 0000000..3dd2b4b --- /dev/null +++ b/benchmark/loadtime_results/llama3.3-70.6B-Q4_K_M__rocm7_beta.log @@ -0,0 +1,159 @@ +ggml_cuda_init: GGML_CUDA_FORCE_MMQ: no +ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no +ggml_cuda_init: found 1 ROCm devices: + Device 0: AMD 
Radeon Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32 +build: 6040 (66625a59) with cc (GCC) 15.1.1 20250719 (Red Hat 15.1.1-5) for x86_64-redhat-linux +main: llama backend init +main: load the model and apply lora adapter, if any +llama_model_load_from_file_impl: using device ROCm0 (AMD Radeon Graphics) - 124523 MiB free +llama_model_loader: loaded meta data with 36 key-value pairs and 724 tensors from /home/kyuz0/models/llama-3.3-Q4_K_M/llama3.3-70.6B-Q4_K_M.gguf (version GGUF V3 (latest)) +llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output. +llama_model_loader: - kv 0: general.architecture str = llama +llama_model_loader: - kv 1: general.type str = model +llama_model_loader: - kv 2: general.name str = Llama 3.1 70B Instruct 2024 12 +llama_model_loader: - kv 3: general.version str = 2024-12 +llama_model_loader: - kv 4: general.finetune str = Instruct +llama_model_loader: - kv 5: general.basename str = Llama-3.1 +llama_model_loader: - kv 6: general.size_label str = 70B +llama_model_loader: - kv 7: general.license str = llama3.1 +llama_model_loader: - kv 8: general.base_model.count u32 = 1 +llama_model_loader: - kv 9: general.base_model.0.name str = Llama 3.1 70B +llama_model_loader: - kv 10: general.base_model.0.organization str = Meta Llama +llama_model_loader: - kv 11: general.base_model.0.repo_url str = https://huggingface.co/meta-llama/Lla... +llama_model_loader: - kv 12: general.tags arr[str,5] = ["facebook", "meta", "pytorch", "llam... +llama_model_loader: - kv 13: general.languages arr[str,7] = ["fr", "it", "pt", "hi", "es", "th", ... 
+llama_model_loader: - kv 14: llama.block_count u32 = 80 +llama_model_loader: - kv 15: llama.context_length u32 = 131072 +llama_model_loader: - kv 16: llama.embedding_length u32 = 8192 +llama_model_loader: - kv 17: llama.feed_forward_length u32 = 28672 +llama_model_loader: - kv 18: llama.attention.head_count u32 = 64 +llama_model_loader: - kv 19: llama.attention.head_count_kv u32 = 8 +llama_model_loader: - kv 20: llama.rope.freq_base f32 = 500000.000000 +llama_model_loader: - kv 21: llama.attention.layer_norm_rms_epsilon f32 = 0.000010 +llama_model_loader: - kv 22: llama.attention.key_length u32 = 128 +llama_model_loader: - kv 23: llama.attention.value_length u32 = 128 +llama_model_loader: - kv 24: general.file_type u32 = 15 +llama_model_loader: - kv 25: llama.vocab_size u32 = 128256 +llama_model_loader: - kv 26: llama.rope.dimension_count u32 = 128 +llama_model_loader: - kv 27: tokenizer.ggml.model str = gpt2 +llama_model_loader: - kv 28: tokenizer.ggml.pre str = llama-bpe +llama_model_loader: - kv 29: tokenizer.ggml.tokens arr[str,128256] = ["!", "\"", "#", "$", "%", "&", "'", ... +llama_model_loader: - kv 30: tokenizer.ggml.token_type arr[i32,128256] = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ... +llama_model_loader: - kv 31: tokenizer.ggml.merges arr[str,280147] = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "... +llama_model_loader: - kv 32: tokenizer.ggml.bos_token_id u32 = 128000 +llama_model_loader: - kv 33: tokenizer.ggml.eos_token_id u32 = 128009 +llama_model_loader: - kv 34: tokenizer.chat_template str = {{- bos_token }}\n{%- if custom_tools ... 
+llama_model_loader: - kv 35: general.quantization_version u32 = 2 +llama_model_loader: - type f32: 162 tensors +llama_model_loader: - type q4_K: 441 tensors +llama_model_loader: - type q5_K: 40 tensors +llama_model_loader: - type q6_K: 81 tensors +print_info: file format = GGUF V3 (latest) +print_info: file type = Q4_K - Medium +print_info: file size = 39.59 GiB (4.82 BPW) +load: special tokens cache size = 256 +load: token to piece cache size = 0.7999 MB +print_info: arch = llama +print_info: vocab_only = 0 +print_info: n_ctx_train = 131072 +print_info: n_embd = 8192 +print_info: n_layer = 80 +print_info: n_head = 64 +print_info: n_head_kv = 8 +print_info: n_rot = 128 +print_info: n_swa = 0 +print_info: is_swa_any = 0 +print_info: n_embd_head_k = 128 +print_info: n_embd_head_v = 128 +print_info: n_gqa = 8 +print_info: n_embd_k_gqa = 1024 +print_info: n_embd_v_gqa = 1024 +print_info: f_norm_eps = 0.0e+00 +print_info: f_norm_rms_eps = 1.0e-05 +print_info: f_clamp_kqv = 0.0e+00 +print_info: f_max_alibi_bias = 0.0e+00 +print_info: f_logit_scale = 0.0e+00 +print_info: f_attn_scale = 0.0e+00 +print_info: n_ff = 28672 +print_info: n_expert = 0 +print_info: n_expert_used = 0 +print_info: causal attn = 1 +print_info: pooling type = 0 +print_info: rope type = 0 +print_info: rope scaling = linear +print_info: freq_base_train = 500000.0 +print_info: freq_scale_train = 1 +print_info: n_ctx_orig_yarn = 131072 +print_info: rope_finetuned = unknown +print_info: model type = 70B +print_info: model params = 70.55 B +print_info: general.name = Llama 3.1 70B Instruct 2024 12 +print_info: vocab type = BPE +print_info: n_vocab = 128256 +print_info: n_merges = 280147 +print_info: BOS token = 128000 '<|begin_of_text|>' +print_info: EOS token = 128009 '<|eot_id|>' +print_info: EOT token = 128009 '<|eot_id|>' +print_info: EOM token = 128008 '<|eom_id|>' +print_info: LF token = 198 'Ċ' +print_info: EOG token = 128001 '<|end_of_text|>' +print_info: EOG token = 128008 '<|eom_id|>' 
+print_info: EOG token = 128009 '<|eot_id|>' +print_info: max token length = 256 +load_tensors: loading model tensors, this can take a while... (mmap = false) +load_tensors: offloading 80 repeating layers to GPU +load_tensors: offloading output layer to GPU +load_tensors: offloaded 81/81 layers to GPU +load_tensors: CPU model buffer size = 563.62 MiB +load_tensors: ROCm0 model buffer size = 39979.48 MiB +................................................................................................... +llama_context: constructing llama_context +llama_context: non-unified KV cache requires ggml_set_rows() - forcing unified KV cache +llama_context: n_seq_max = 1 +llama_context: n_ctx = 4096 +llama_context: n_ctx_per_seq = 4096 +llama_context: n_batch = 2048 +llama_context: n_ubatch = 512 +llama_context: causal_attn = 1 +llama_context: flash_attn = 1 +llama_context: kv_unified = true +llama_context: freq_base = 500000.0 +llama_context: freq_scale = 1 +llama_context: n_ctx_per_seq (4096) < n_ctx_train (131072) -- the full capacity of the model will not be utilized +llama_context: ROCm_Host output buffer size = 0.49 MiB +llama_kv_cache_unified: ROCm0 KV buffer size = 1280.00 MiB +llama_kv_cache_unified: size = 1280.00 MiB ( 4096 cells, 80 layers, 1/ 1 seqs), K (f16): 640.00 MiB, V (f16): 640.00 MiB +llama_kv_cache_unified: LLAMA_SET_ROWS=0, using old ggml_cpy() method for backwards compatibility +llama_context: ROCm0 compute buffer size = 266.50 MiB +llama_context: ROCm_Host compute buffer size = 24.01 MiB +llama_context: graph nodes = 2647 +llama_context: graph splits = 2 +common_init_from_params: added <|end_of_text|> logit bias = -inf +common_init_from_params: added <|eom_id|> logit bias = -inf +common_init_from_params: added <|eot_id|> logit bias = -inf +common_init_from_params: setting dry_penalty_last_n to ctx_size = 4096 +common_init_from_params: warming up the model with an empty run - please wait ... 
(--no-warmup to disable) +main: llama threadpool init, n_threads = 16 + +system_info: n_threads = 16 (n_threads_batch = 16) / 32 | ROCm : NO_VMM = 1 | PEER_MAX_BATCH_SIZE = 128 | CPU : SSE3 = 1 | SSSE3 = 1 | AVX = 1 | AVX2 = 1 | F16C = 1 | FMA = 1 | BMI2 = 1 | AVX512 = 1 | AVX512_VBMI = 1 | AVX512_VNNI = 1 | LLAMAFILE = 1 | OPENMP = 1 | REPACK = 1 | + +sampler seed: 3791928713 +sampler params: + repeat_last_n = 64, repeat_penalty = 1.000, frequency_penalty = 0.000, presence_penalty = 0.000 + dry_multiplier = 0.000, dry_base = 1.750, dry_allowed_length = 2, dry_penalty_last_n = 4096 + top_k = 40, top_p = 0.950, min_p = 0.050, xtc_probability = 0.000, xtc_threshold = 0.100, typical_p = 1.000, top_n_sigma = -1.000, temp = 0.800 + mirostat = 0, mirostat_lr = 0.100, mirostat_ent = 5.000 +sampler chain: logits -> logit-bias -> penalties -> dry -> top-n-sigma -> top-k -> typical -> top-p -> min-p -> xtc -> temp-ext -> dist +generate: n_ctx = 4096, n_batch = 2048, n_predict = 1, n_keep = 1 + +Hello. 
+ +llama_perf_sampler_print: sampling time = 0.05 ms / 3 runs ( 0.02 ms per token, 57692.31 tokens per second) +llama_perf_context_print: load time = 6133.42 ms +llama_perf_context_print: prompt eval time = 247.67 ms / 2 tokens ( 123.83 ms per token, 8.08 tokens per second) +llama_perf_context_print: eval time = 0.00 ms / 1 runs ( 0.00 ms per token, inf tokens per second) +llama_perf_context_print: total time = 268.37 ms / 3 tokens +llama_perf_context_print: graphs reused = 0 + Elapsed #3: 6.904239282s + Run #3 status: 0 + → Avg over 3 runs: 9.338s diff --git a/benchmark/loadtime_results/llama3.3-70.6B-Q4_K_M__rocm7_rc.log b/benchmark/loadtime_results/llama3.3-70.6B-Q4_K_M__rocm7_rc.log new file mode 100644 index 0000000..687d9ff --- /dev/null +++ b/benchmark/loadtime_results/llama3.3-70.6B-Q4_K_M__rocm7_rc.log @@ -0,0 +1,159 @@ +ggml_cuda_init: GGML_CUDA_FORCE_MMQ: no +ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no +ggml_cuda_init: found 1 ROCm devices: + Device 0: AMD Radeon Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32 +build: 6066 (4cb208c9) with cc (GCC) 15.1.1 20250719 (Red Hat 15.1.1-5) for x86_64-redhat-linux +main: llama backend init +main: load the model and apply lora adapter, if any +llama_model_load_from_file_impl: using device ROCm0 (AMD Radeon Graphics) - 124523 MiB free +llama_model_loader: loaded meta data with 36 key-value pairs and 724 tensors from /home/kyuz0/models/llama-3.3-Q4_K_M/llama3.3-70.6B-Q4_K_M.gguf (version GGUF V3 (latest)) +llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output. 
+llama_model_loader: - kv 0: general.architecture str = llama +llama_model_loader: - kv 1: general.type str = model +llama_model_loader: - kv 2: general.name str = Llama 3.1 70B Instruct 2024 12 +llama_model_loader: - kv 3: general.version str = 2024-12 +llama_model_loader: - kv 4: general.finetune str = Instruct +llama_model_loader: - kv 5: general.basename str = Llama-3.1 +llama_model_loader: - kv 6: general.size_label str = 70B +llama_model_loader: - kv 7: general.license str = llama3.1 +llama_model_loader: - kv 8: general.base_model.count u32 = 1 +llama_model_loader: - kv 9: general.base_model.0.name str = Llama 3.1 70B +llama_model_loader: - kv 10: general.base_model.0.organization str = Meta Llama +llama_model_loader: - kv 11: general.base_model.0.repo_url str = https://huggingface.co/meta-llama/Lla... +llama_model_loader: - kv 12: general.tags arr[str,5] = ["facebook", "meta", "pytorch", "llam... +llama_model_loader: - kv 13: general.languages arr[str,7] = ["fr", "it", "pt", "hi", "es", "th", ... 
+llama_model_loader: - kv 14: llama.block_count u32 = 80 +llama_model_loader: - kv 15: llama.context_length u32 = 131072 +llama_model_loader: - kv 16: llama.embedding_length u32 = 8192 +llama_model_loader: - kv 17: llama.feed_forward_length u32 = 28672 +llama_model_loader: - kv 18: llama.attention.head_count u32 = 64 +llama_model_loader: - kv 19: llama.attention.head_count_kv u32 = 8 +llama_model_loader: - kv 20: llama.rope.freq_base f32 = 500000.000000 +llama_model_loader: - kv 21: llama.attention.layer_norm_rms_epsilon f32 = 0.000010 +llama_model_loader: - kv 22: llama.attention.key_length u32 = 128 +llama_model_loader: - kv 23: llama.attention.value_length u32 = 128 +llama_model_loader: - kv 24: general.file_type u32 = 15 +llama_model_loader: - kv 25: llama.vocab_size u32 = 128256 +llama_model_loader: - kv 26: llama.rope.dimension_count u32 = 128 +llama_model_loader: - kv 27: tokenizer.ggml.model str = gpt2 +llama_model_loader: - kv 28: tokenizer.ggml.pre str = llama-bpe +llama_model_loader: - kv 29: tokenizer.ggml.tokens arr[str,128256] = ["!", "\"", "#", "$", "%", "&", "'", ... +llama_model_loader: - kv 30: tokenizer.ggml.token_type arr[i32,128256] = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ... +llama_model_loader: - kv 31: tokenizer.ggml.merges arr[str,280147] = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "... +llama_model_loader: - kv 32: tokenizer.ggml.bos_token_id u32 = 128000 +llama_model_loader: - kv 33: tokenizer.ggml.eos_token_id u32 = 128009 +llama_model_loader: - kv 34: tokenizer.chat_template str = {{- bos_token }}\n{%- if custom_tools ... 
+llama_model_loader: - kv 35: general.quantization_version u32 = 2 +llama_model_loader: - type f32: 162 tensors +llama_model_loader: - type q4_K: 441 tensors +llama_model_loader: - type q5_K: 40 tensors +llama_model_loader: - type q6_K: 81 tensors +print_info: file format = GGUF V3 (latest) +print_info: file type = Q4_K - Medium +print_info: file size = 39.59 GiB (4.82 BPW) +load: special tokens cache size = 256 +load: token to piece cache size = 0.7999 MB +print_info: arch = llama +print_info: vocab_only = 0 +print_info: n_ctx_train = 131072 +print_info: n_embd = 8192 +print_info: n_layer = 80 +print_info: n_head = 64 +print_info: n_head_kv = 8 +print_info: n_rot = 128 +print_info: n_swa = 0 +print_info: is_swa_any = 0 +print_info: n_embd_head_k = 128 +print_info: n_embd_head_v = 128 +print_info: n_gqa = 8 +print_info: n_embd_k_gqa = 1024 +print_info: n_embd_v_gqa = 1024 +print_info: f_norm_eps = 0.0e+00 +print_info: f_norm_rms_eps = 1.0e-05 +print_info: f_clamp_kqv = 0.0e+00 +print_info: f_max_alibi_bias = 0.0e+00 +print_info: f_logit_scale = 0.0e+00 +print_info: f_attn_scale = 0.0e+00 +print_info: n_ff = 28672 +print_info: n_expert = 0 +print_info: n_expert_used = 0 +print_info: causal attn = 1 +print_info: pooling type = 0 +print_info: rope type = 0 +print_info: rope scaling = linear +print_info: freq_base_train = 500000.0 +print_info: freq_scale_train = 1 +print_info: n_ctx_orig_yarn = 131072 +print_info: rope_finetuned = unknown +print_info: model type = 70B +print_info: model params = 70.55 B +print_info: general.name = Llama 3.1 70B Instruct 2024 12 +print_info: vocab type = BPE +print_info: n_vocab = 128256 +print_info: n_merges = 280147 +print_info: BOS token = 128000 '<|begin_of_text|>' +print_info: EOS token = 128009 '<|eot_id|>' +print_info: EOT token = 128009 '<|eot_id|>' +print_info: EOM token = 128008 '<|eom_id|>' +print_info: LF token = 198 'Ċ' +print_info: EOG token = 128001 '<|end_of_text|>' +print_info: EOG token = 128008 '<|eom_id|>' 
+print_info: EOG token = 128009 '<|eot_id|>' +print_info: max token length = 256 +load_tensors: loading model tensors, this can take a while... (mmap = false) +load_tensors: offloading 80 repeating layers to GPU +load_tensors: offloading output layer to GPU +load_tensors: offloaded 81/81 layers to GPU +load_tensors: CPU model buffer size = 563.62 MiB +load_tensors: ROCm0 model buffer size = 39979.48 MiB +................................................................................................... +llama_context: constructing llama_context +llama_context: non-unified KV cache requires ggml_set_rows() - forcing unified KV cache +llama_context: n_seq_max = 1 +llama_context: n_ctx = 4096 +llama_context: n_ctx_per_seq = 4096 +llama_context: n_batch = 2048 +llama_context: n_ubatch = 512 +llama_context: causal_attn = 1 +llama_context: flash_attn = 1 +llama_context: kv_unified = true +llama_context: freq_base = 500000.0 +llama_context: freq_scale = 1 +llama_context: n_ctx_per_seq (4096) < n_ctx_train (131072) -- the full capacity of the model will not be utilized +llama_context: ROCm_Host output buffer size = 0.49 MiB +llama_kv_cache_unified: ROCm0 KV buffer size = 1280.00 MiB +llama_kv_cache_unified: size = 1280.00 MiB ( 4096 cells, 80 layers, 1/ 1 seqs), K (f16): 640.00 MiB, V (f16): 640.00 MiB +llama_kv_cache_unified: LLAMA_SET_ROWS=0, using old ggml_cpy() method for backwards compatibility +llama_context: ROCm0 compute buffer size = 266.50 MiB +llama_context: ROCm_Host compute buffer size = 24.01 MiB +llama_context: graph nodes = 2647 +llama_context: graph splits = 2 +common_init_from_params: added <|end_of_text|> logit bias = -inf +common_init_from_params: added <|eom_id|> logit bias = -inf +common_init_from_params: added <|eot_id|> logit bias = -inf +common_init_from_params: setting dry_penalty_last_n to ctx_size = 4096 +common_init_from_params: warming up the model with an empty run - please wait ... 
(--no-warmup to disable) +main: llama threadpool init, n_threads = 16 + +system_info: n_threads = 16 (n_threads_batch = 16) / 32 | ROCm : NO_VMM = 1 | PEER_MAX_BATCH_SIZE = 128 | CPU : SSE3 = 1 | SSSE3 = 1 | AVX = 1 | AVX2 = 1 | F16C = 1 | FMA = 1 | BMI2 = 1 | AVX512 = 1 | AVX512_VBMI = 1 | AVX512_VNNI = 1 | LLAMAFILE = 1 | OPENMP = 1 | REPACK = 1 | + +sampler seed: 59935472 +sampler params: + repeat_last_n = 64, repeat_penalty = 1.000, frequency_penalty = 0.000, presence_penalty = 0.000 + dry_multiplier = 0.000, dry_base = 1.750, dry_allowed_length = 2, dry_penalty_last_n = 4096 + top_k = 40, top_p = 0.950, min_p = 0.050, xtc_probability = 0.000, xtc_threshold = 0.100, typical_p = 1.000, top_n_sigma = -1.000, temp = 0.800 + mirostat = 0, mirostat_lr = 0.100, mirostat_ent = 5.000 +sampler chain: logits -> logit-bias -> penalties -> dry -> top-n-sigma -> top-k -> typical -> top-p -> min-p -> xtc -> temp-ext -> dist +generate: n_ctx = 4096, n_batch = 2048, n_predict = 1, n_keep = 1 + +Hello. 
+ +llama_perf_sampler_print: sampling time = 0.07 ms / 3 runs ( 0.02 ms per token, 46153.85 tokens per second) +llama_perf_context_print: load time = 12737.72 ms +llama_perf_context_print: prompt eval time = 291.99 ms / 2 tokens ( 145.99 ms per token, 6.85 tokens per second) +llama_perf_context_print: eval time = 0.00 ms / 1 runs ( 0.00 ms per token, inf tokens per second) +llama_perf_context_print: total time = 306.96 ms / 3 tokens +llama_perf_context_print: graphs reused = 0 + Elapsed #3: 13.680764475s + Run #3 status: 0 + → Avg over 3 runs: 14.602s diff --git a/benchmark/loadtime_results/llama3.3-70.6B-Q4_K_M__vulkan_amdvlk.log b/benchmark/loadtime_results/llama3.3-70.6B-Q4_K_M__vulkan_amdvlk.log new file mode 100644 index 0000000..267da60 --- /dev/null +++ b/benchmark/loadtime_results/llama3.3-70.6B-Q4_K_M__vulkan_amdvlk.log @@ -0,0 +1,157 @@ +ggml_vulkan: Found 1 Vulkan devices: +ggml_vulkan: 0 = Radeon 8060S Graphics (AMD open-source driver) | uma: 1 | fp16: 1 | bf16: 0 | warp size: 64 | shared memory: 32768 | int dot: 1 | matrix cores: KHR_coopmat +build: 6060 (9c35706b) with cc (GCC) 15.1.1 20250719 (Red Hat 15.1.1-5) for x86_64-redhat-linux +main: llama backend init +main: load the model and apply lora adapter, if any +llama_model_load_from_file_impl: using device Vulkan0 (Radeon 8060S Graphics) - 85720 MiB free +llama_model_loader: loaded meta data with 36 key-value pairs and 724 tensors from /home/kyuz0/models/llama-3.3-Q4_K_M/llama3.3-70.6B-Q4_K_M.gguf (version GGUF V3 (latest)) +llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output. 
+llama_model_loader: - kv 0: general.architecture str = llama +llama_model_loader: - kv 1: general.type str = model +llama_model_loader: - kv 2: general.name str = Llama 3.1 70B Instruct 2024 12 +llama_model_loader: - kv 3: general.version str = 2024-12 +llama_model_loader: - kv 4: general.finetune str = Instruct +llama_model_loader: - kv 5: general.basename str = Llama-3.1 +llama_model_loader: - kv 6: general.size_label str = 70B +llama_model_loader: - kv 7: general.license str = llama3.1 +llama_model_loader: - kv 8: general.base_model.count u32 = 1 +llama_model_loader: - kv 9: general.base_model.0.name str = Llama 3.1 70B +llama_model_loader: - kv 10: general.base_model.0.organization str = Meta Llama +llama_model_loader: - kv 11: general.base_model.0.repo_url str = https://huggingface.co/meta-llama/Lla... +llama_model_loader: - kv 12: general.tags arr[str,5] = ["facebook", "meta", "pytorch", "llam... +llama_model_loader: - kv 13: general.languages arr[str,7] = ["fr", "it", "pt", "hi", "es", "th", ... 
+llama_model_loader: - kv 14: llama.block_count u32 = 80 +llama_model_loader: - kv 15: llama.context_length u32 = 131072 +llama_model_loader: - kv 16: llama.embedding_length u32 = 8192 +llama_model_loader: - kv 17: llama.feed_forward_length u32 = 28672 +llama_model_loader: - kv 18: llama.attention.head_count u32 = 64 +llama_model_loader: - kv 19: llama.attention.head_count_kv u32 = 8 +llama_model_loader: - kv 20: llama.rope.freq_base f32 = 500000.000000 +llama_model_loader: - kv 21: llama.attention.layer_norm_rms_epsilon f32 = 0.000010 +llama_model_loader: - kv 22: llama.attention.key_length u32 = 128 +llama_model_loader: - kv 23: llama.attention.value_length u32 = 128 +llama_model_loader: - kv 24: general.file_type u32 = 15 +llama_model_loader: - kv 25: llama.vocab_size u32 = 128256 +llama_model_loader: - kv 26: llama.rope.dimension_count u32 = 128 +llama_model_loader: - kv 27: tokenizer.ggml.model str = gpt2 +llama_model_loader: - kv 28: tokenizer.ggml.pre str = llama-bpe +llama_model_loader: - kv 29: tokenizer.ggml.tokens arr[str,128256] = ["!", "\"", "#", "$", "%", "&", "'", ... +llama_model_loader: - kv 30: tokenizer.ggml.token_type arr[i32,128256] = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ... +llama_model_loader: - kv 31: tokenizer.ggml.merges arr[str,280147] = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "... +llama_model_loader: - kv 32: tokenizer.ggml.bos_token_id u32 = 128000 +llama_model_loader: - kv 33: tokenizer.ggml.eos_token_id u32 = 128009 +llama_model_loader: - kv 34: tokenizer.chat_template str = {{- bos_token }}\n{%- if custom_tools ... 
+llama_model_loader: - kv 35: general.quantization_version u32 = 2 +llama_model_loader: - type f32: 162 tensors +llama_model_loader: - type q4_K: 441 tensors +llama_model_loader: - type q5_K: 40 tensors +llama_model_loader: - type q6_K: 81 tensors +print_info: file format = GGUF V3 (latest) +print_info: file type = Q4_K - Medium +print_info: file size = 39.59 GiB (4.82 BPW) +load: special tokens cache size = 256 +load: token to piece cache size = 0.7999 MB +print_info: arch = llama +print_info: vocab_only = 0 +print_info: n_ctx_train = 131072 +print_info: n_embd = 8192 +print_info: n_layer = 80 +print_info: n_head = 64 +print_info: n_head_kv = 8 +print_info: n_rot = 128 +print_info: n_swa = 0 +print_info: is_swa_any = 0 +print_info: n_embd_head_k = 128 +print_info: n_embd_head_v = 128 +print_info: n_gqa = 8 +print_info: n_embd_k_gqa = 1024 +print_info: n_embd_v_gqa = 1024 +print_info: f_norm_eps = 0.0e+00 +print_info: f_norm_rms_eps = 1.0e-05 +print_info: f_clamp_kqv = 0.0e+00 +print_info: f_max_alibi_bias = 0.0e+00 +print_info: f_logit_scale = 0.0e+00 +print_info: f_attn_scale = 0.0e+00 +print_info: n_ff = 28672 +print_info: n_expert = 0 +print_info: n_expert_used = 0 +print_info: causal attn = 1 +print_info: pooling type = 0 +print_info: rope type = 0 +print_info: rope scaling = linear +print_info: freq_base_train = 500000.0 +print_info: freq_scale_train = 1 +print_info: n_ctx_orig_yarn = 131072 +print_info: rope_finetuned = unknown +print_info: model type = 70B +print_info: model params = 70.55 B +print_info: general.name = Llama 3.1 70B Instruct 2024 12 +print_info: vocab type = BPE +print_info: n_vocab = 128256 +print_info: n_merges = 280147 +print_info: BOS token = 128000 '<|begin_of_text|>' +print_info: EOS token = 128009 '<|eot_id|>' +print_info: EOT token = 128009 '<|eot_id|>' +print_info: EOM token = 128008 '<|eom_id|>' +print_info: LF token = 198 'Ċ' +print_info: EOG token = 128001 '<|end_of_text|>' +print_info: EOG token = 128008 '<|eom_id|>' 
+print_info: EOG token = 128009 '<|eot_id|>' +print_info: max token length = 256 +load_tensors: loading model tensors, this can take a while... (mmap = false) +load_tensors: offloading 80 repeating layers to GPU +load_tensors: offloading output layer to GPU +load_tensors: offloaded 81/81 layers to GPU +load_tensors: Vulkan0 model buffer size = 39979.48 MiB +load_tensors: CPU model buffer size = 563.62 MiB +.................................................................................................. +llama_context: constructing llama_context +llama_context: non-unified KV cache requires ggml_set_rows() - forcing unified KV cache +llama_context: n_seq_max = 1 +llama_context: n_ctx = 4096 +llama_context: n_ctx_per_seq = 4096 +llama_context: n_batch = 2048 +llama_context: n_ubatch = 512 +llama_context: causal_attn = 1 +llama_context: flash_attn = 1 +llama_context: kv_unified = true +llama_context: freq_base = 500000.0 +llama_context: freq_scale = 1 +llama_context: n_ctx_per_seq (4096) < n_ctx_train (131072) -- the full capacity of the model will not be utilized +llama_context: Vulkan_Host output buffer size = 0.49 MiB +llama_kv_cache_unified: Vulkan0 KV buffer size = 1280.00 MiB +llama_kv_cache_unified: size = 1280.00 MiB ( 4096 cells, 80 layers, 1/ 1 seqs), K (f16): 640.00 MiB, V (f16): 640.00 MiB +llama_kv_cache_unified: LLAMA_SET_ROWS=0, using old ggml_cpy() method for backwards compatibility +llama_context: Vulkan0 compute buffer size = 266.50 MiB +llama_context: Vulkan_Host compute buffer size = 24.01 MiB +llama_context: graph nodes = 2647 +llama_context: graph splits = 2 +common_init_from_params: added <|end_of_text|> logit bias = -inf +common_init_from_params: added <|eom_id|> logit bias = -inf +common_init_from_params: added <|eot_id|> logit bias = -inf +common_init_from_params: setting dry_penalty_last_n to ctx_size = 4096 +common_init_from_params: warming up the model with an empty run - please wait ... 
(--no-warmup to disable) +main: llama threadpool init, n_threads = 16 + +system_info: n_threads = 16 (n_threads_batch = 16) / 32 | CPU : SSE3 = 1 | SSSE3 = 1 | AVX = 1 | AVX2 = 1 | F16C = 1 | FMA = 1 | BMI2 = 1 | AVX512 = 1 | AVX512_VBMI = 1 | AVX512_VNNI = 1 | LLAMAFILE = 1 | OPENMP = 1 | REPACK = 1 | + +sampler seed: 1976378490 +sampler params: + repeat_last_n = 64, repeat_penalty = 1.000, frequency_penalty = 0.000, presence_penalty = 0.000 + dry_multiplier = 0.000, dry_base = 1.750, dry_allowed_length = 2, dry_penalty_last_n = 4096 + top_k = 40, top_p = 0.950, min_p = 0.050, xtc_probability = 0.000, xtc_threshold = 0.100, typical_p = 1.000, top_n_sigma = -1.000, temp = 0.800 + mirostat = 0, mirostat_lr = 0.100, mirostat_ent = 5.000 +sampler chain: logits -> logit-bias -> penalties -> dry -> top-n-sigma -> top-k -> typical -> top-p -> min-p -> xtc -> temp-ext -> dist +generate: n_ctx = 4096, n_batch = 2048, n_predict = 1, n_keep = 1 + +Hello, + +llama_perf_sampler_print: sampling time = 0.08 ms / 3 runs ( 0.03 ms per token, 36585.37 tokens per second) +llama_perf_context_print: load time = 6987.06 ms +llama_perf_context_print: prompt eval time = 210.77 ms / 2 tokens ( 105.39 ms per token, 9.49 tokens per second) +llama_perf_context_print: eval time = 0.00 ms / 1 runs ( 0.00 ms per token, inf tokens per second) +llama_perf_context_print: total time = 232.45 ms / 3 tokens +llama_perf_context_print: graphs reused = 0 + Elapsed #3: 7.786884955s + Run #3 status: 0 + → Avg over 3 runs: 9.176s diff --git a/benchmark/loadtime_results/llama3.3-70.6B-Q4_K_M__vulkan_radv.log b/benchmark/loadtime_results/llama3.3-70.6B-Q4_K_M__vulkan_radv.log new file mode 100644 index 0000000..326ab94 --- /dev/null +++ b/benchmark/loadtime_results/llama3.3-70.6B-Q4_K_M__vulkan_radv.log @@ -0,0 +1,157 @@ +ggml_vulkan: Found 1 Vulkan devices: +ggml_vulkan: 0 = Radeon 8060S Graphics (RADV GFX1151) (radv) | uma: 1 | fp16: 1 | bf16: 0 | warp size: 64 | shared memory: 65536 | int dot: 1 | matrix 
cores: KHR_coopmat +build: 6040 (66625a59) with cc (GCC) 15.1.1 20250719 (Red Hat 15.1.1-5) for x86_64-redhat-linux +main: llama backend init +main: load the model and apply lora adapter, if any +llama_model_load_from_file_impl: using device Vulkan0 (Radeon 8060S Graphics (RADV GFX1151)) - 87722 MiB free +llama_model_loader: loaded meta data with 36 key-value pairs and 724 tensors from /home/kyuz0/models/llama-3.3-Q4_K_M/llama3.3-70.6B-Q4_K_M.gguf (version GGUF V3 (latest)) +llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output. +llama_model_loader: - kv 0: general.architecture str = llama +llama_model_loader: - kv 1: general.type str = model +llama_model_loader: - kv 2: general.name str = Llama 3.1 70B Instruct 2024 12 +llama_model_loader: - kv 3: general.version str = 2024-12 +llama_model_loader: - kv 4: general.finetune str = Instruct +llama_model_loader: - kv 5: general.basename str = Llama-3.1 +llama_model_loader: - kv 6: general.size_label str = 70B +llama_model_loader: - kv 7: general.license str = llama3.1 +llama_model_loader: - kv 8: general.base_model.count u32 = 1 +llama_model_loader: - kv 9: general.base_model.0.name str = Llama 3.1 70B +llama_model_loader: - kv 10: general.base_model.0.organization str = Meta Llama +llama_model_loader: - kv 11: general.base_model.0.repo_url str = https://huggingface.co/meta-llama/Lla... +llama_model_loader: - kv 12: general.tags arr[str,5] = ["facebook", "meta", "pytorch", "llam... +llama_model_loader: - kv 13: general.languages arr[str,7] = ["fr", "it", "pt", "hi", "es", "th", ... 
+llama_model_loader: - kv 14: llama.block_count u32 = 80 +llama_model_loader: - kv 15: llama.context_length u32 = 131072 +llama_model_loader: - kv 16: llama.embedding_length u32 = 8192 +llama_model_loader: - kv 17: llama.feed_forward_length u32 = 28672 +llama_model_loader: - kv 18: llama.attention.head_count u32 = 64 +llama_model_loader: - kv 19: llama.attention.head_count_kv u32 = 8 +llama_model_loader: - kv 20: llama.rope.freq_base f32 = 500000.000000 +llama_model_loader: - kv 21: llama.attention.layer_norm_rms_epsilon f32 = 0.000010 +llama_model_loader: - kv 22: llama.attention.key_length u32 = 128 +llama_model_loader: - kv 23: llama.attention.value_length u32 = 128 +llama_model_loader: - kv 24: general.file_type u32 = 15 +llama_model_loader: - kv 25: llama.vocab_size u32 = 128256 +llama_model_loader: - kv 26: llama.rope.dimension_count u32 = 128 +llama_model_loader: - kv 27: tokenizer.ggml.model str = gpt2 +llama_model_loader: - kv 28: tokenizer.ggml.pre str = llama-bpe +llama_model_loader: - kv 29: tokenizer.ggml.tokens arr[str,128256] = ["!", "\"", "#", "$", "%", "&", "'", ... +llama_model_loader: - kv 30: tokenizer.ggml.token_type arr[i32,128256] = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ... +llama_model_loader: - kv 31: tokenizer.ggml.merges arr[str,280147] = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "... +llama_model_loader: - kv 32: tokenizer.ggml.bos_token_id u32 = 128000 +llama_model_loader: - kv 33: tokenizer.ggml.eos_token_id u32 = 128009 +llama_model_loader: - kv 34: tokenizer.chat_template str = {{- bos_token }}\n{%- if custom_tools ... 
+llama_model_loader: - kv 35: general.quantization_version u32 = 2 +llama_model_loader: - type f32: 162 tensors +llama_model_loader: - type q4_K: 441 tensors +llama_model_loader: - type q5_K: 40 tensors +llama_model_loader: - type q6_K: 81 tensors +print_info: file format = GGUF V3 (latest) +print_info: file type = Q4_K - Medium +print_info: file size = 39.59 GiB (4.82 BPW) +load: special tokens cache size = 256 +load: token to piece cache size = 0.7999 MB +print_info: arch = llama +print_info: vocab_only = 0 +print_info: n_ctx_train = 131072 +print_info: n_embd = 8192 +print_info: n_layer = 80 +print_info: n_head = 64 +print_info: n_head_kv = 8 +print_info: n_rot = 128 +print_info: n_swa = 0 +print_info: is_swa_any = 0 +print_info: n_embd_head_k = 128 +print_info: n_embd_head_v = 128 +print_info: n_gqa = 8 +print_info: n_embd_k_gqa = 1024 +print_info: n_embd_v_gqa = 1024 +print_info: f_norm_eps = 0.0e+00 +print_info: f_norm_rms_eps = 1.0e-05 +print_info: f_clamp_kqv = 0.0e+00 +print_info: f_max_alibi_bias = 0.0e+00 +print_info: f_logit_scale = 0.0e+00 +print_info: f_attn_scale = 0.0e+00 +print_info: n_ff = 28672 +print_info: n_expert = 0 +print_info: n_expert_used = 0 +print_info: causal attn = 1 +print_info: pooling type = 0 +print_info: rope type = 0 +print_info: rope scaling = linear +print_info: freq_base_train = 500000.0 +print_info: freq_scale_train = 1 +print_info: n_ctx_orig_yarn = 131072 +print_info: rope_finetuned = unknown +print_info: model type = 70B +print_info: model params = 70.55 B +print_info: general.name = Llama 3.1 70B Instruct 2024 12 +print_info: vocab type = BPE +print_info: n_vocab = 128256 +print_info: n_merges = 280147 +print_info: BOS token = 128000 '<|begin_of_text|>' +print_info: EOS token = 128009 '<|eot_id|>' +print_info: EOT token = 128009 '<|eot_id|>' +print_info: EOM token = 128008 '<|eom_id|>' +print_info: LF token = 198 'Ċ' +print_info: EOG token = 128001 '<|end_of_text|>' +print_info: EOG token = 128008 '<|eom_id|>' 
+print_info: EOG token = 128009 '<|eot_id|>' +print_info: max token length = 256 +load_tensors: loading model tensors, this can take a while... (mmap = false) +load_tensors: offloading 80 repeating layers to GPU +load_tensors: offloading output layer to GPU +load_tensors: offloaded 81/81 layers to GPU +load_tensors: Vulkan0 model buffer size = 39979.48 MiB +load_tensors: CPU model buffer size = 563.62 MiB +.................................................................................................. +llama_context: constructing llama_context +llama_context: non-unified KV cache requires ggml_set_rows() - forcing unified KV cache +llama_context: n_seq_max = 1 +llama_context: n_ctx = 4096 +llama_context: n_ctx_per_seq = 4096 +llama_context: n_batch = 2048 +llama_context: n_ubatch = 512 +llama_context: causal_attn = 1 +llama_context: flash_attn = 1 +llama_context: kv_unified = true +llama_context: freq_base = 500000.0 +llama_context: freq_scale = 1 +llama_context: n_ctx_per_seq (4096) < n_ctx_train (131072) -- the full capacity of the model will not be utilized +llama_context: Vulkan_Host output buffer size = 0.49 MiB +llama_kv_cache_unified: Vulkan0 KV buffer size = 1280.00 MiB +llama_kv_cache_unified: size = 1280.00 MiB ( 4096 cells, 80 layers, 1/ 1 seqs), K (f16): 640.00 MiB, V (f16): 640.00 MiB +llama_kv_cache_unified: LLAMA_SET_ROWS=0, using old ggml_cpy() method for backwards compatibility +llama_context: Vulkan0 compute buffer size = 266.50 MiB +llama_context: Vulkan_Host compute buffer size = 24.01 MiB +llama_context: graph nodes = 2647 +llama_context: graph splits = 2 +common_init_from_params: added <|end_of_text|> logit bias = -inf +common_init_from_params: added <|eom_id|> logit bias = -inf +common_init_from_params: added <|eot_id|> logit bias = -inf +common_init_from_params: setting dry_penalty_last_n to ctx_size = 4096 +common_init_from_params: warming up the model with an empty run - please wait ... 
(--no-warmup to disable) +main: llama threadpool init, n_threads = 16 + +system_info: n_threads = 16 (n_threads_batch = 16) / 32 | CPU : SSE3 = 1 | SSSE3 = 1 | AVX = 1 | AVX2 = 1 | F16C = 1 | FMA = 1 | BMI2 = 1 | AVX512 = 1 | AVX512_VBMI = 1 | AVX512_VNNI = 1 | LLAMAFILE = 1 | OPENMP = 1 | REPACK = 1 | + +sampler seed: 2613669910 +sampler params: + repeat_last_n = 64, repeat_penalty = 1.000, frequency_penalty = 0.000, presence_penalty = 0.000 + dry_multiplier = 0.000, dry_base = 1.750, dry_allowed_length = 2, dry_penalty_last_n = 4096 + top_k = 40, top_p = 0.950, min_p = 0.050, xtc_probability = 0.000, xtc_threshold = 0.100, typical_p = 1.000, top_n_sigma = -1.000, temp = 0.800 + mirostat = 0, mirostat_lr = 0.100, mirostat_ent = 5.000 +sampler chain: logits -> logit-bias -> penalties -> dry -> top-n-sigma -> top-k -> typical -> top-p -> min-p -> xtc -> temp-ext -> dist +generate: n_ctx = 4096, n_batch = 2048, n_predict = 1, n_keep = 1 + +Hello's + +llama_perf_sampler_print: sampling time = 0.07 ms / 3 runs ( 0.02 ms per token, 40540.54 tokens per second) +llama_perf_context_print: load time = 8119.06 ms +llama_perf_context_print: prompt eval time = 204.01 ms / 2 tokens ( 102.01 ms per token, 9.80 tokens per second) +llama_perf_context_print: eval time = 0.00 ms / 1 runs ( 0.00 ms per token, inf tokens per second) +llama_perf_context_print: total time = 225.18 ms / 3 tokens +llama_perf_context_print: graphs reused = 0 + Elapsed #3: 8.699816033s + Run #3 status: 0 + → Avg over 3 runs: 8.816s diff --git a/benchmark/parse_benchmark_results.py b/benchmark/parse_benchmark_results.py new file mode 100644 index 0000000..ed9f60b --- /dev/null +++ b/benchmark/parse_benchmark_results.py @@ -0,0 +1,120 @@ +#!/usr/bin/env python3 +import re, glob, os + +# This script parses llama-bench logs in 'results/' to produce +# Markdown tables for pp512 (prompt processing) and tg128 (text generation). 
+ +# Regex patterns to extract tokens/sec rows +PP_RE = re.compile(r"\|[^|]*\|[^|]*\|[^|]*\|[^|]*\|[^|]*\|\s*pp512\s*\|\s*([\d.]+)\s*±\s*([\d.]+)") +TG_RE = re.compile(r"\|[^|]*\|[^|]*\|[^|]*\|[^|]*\|[^|]*\|\s*tg128\s*\|\s*([\d.]+)\s*±\s*([\d.]+)") + +# Patterns to classify errors +LOAD_ERR = re.compile(r"failed to load model|Device memory allocation.*failed", re.IGNORECASE) +HANG_ERR = re.compile(r"GPU Hang|HW Exception", re.IGNORECASE) +GENERIC_ERR = re.compile(r"error:|exit \d+", re.IGNORECASE) + +# Env ordering +ENV_ORDER = ["vulkan_radv","vulkan_amdvlk","rocm6_4_2","rocm7_beta","rocm7_rc"] + +data = {} + +# Utility to clean model names +def clean_name(raw): + return re.sub(r"-000\d+-of-000\d+", "", raw) + +# Scan logs +glob_pattern = os.path.join("results", "*.log") +for path in sorted(glob.glob(glob_pattern)): + # Fix: use rsplit, not rssplit + base = os.path.basename(path).rsplit('.log',1)[0] + if '__' not in base: + continue + model_raw, env = base.split('__',1) + model = clean_name(model_raw) + + text = open(path, errors='ignore').read() + # Determine error type + if LOAD_ERR.search(text): + err_type = 'load' + elif HANG_ERR.search(text): + err_type = 'hang' + elif GENERIC_ERR.search(text) and not (PP_RE.search(text) and TG_RE.search(text)): + err_type = 'runtime' + else: + err_type = None + + # Extract performance if no load error + pp_match = PP_RE.search(text) if err_type is None else None + tg_match = TG_RE.search(text) if err_type is None else None + + for key, match in [('pp512', pp_match), ('tg128', tg_match)]: + cell = { + 'mean': match.group(1) if match else None, + 'std': match.group(2) if match else None, + 'error': err_type is not None, + 'etype': err_type + } + data.setdefault(model, {}).setdefault(key, {})[env] = cell + +# Select winner +def pick_winner(env_data): + scores = {e: float(d['mean']) for e,d in env_data.items() if not d['error'] and d['mean']} + if not scores: + return '—' + best = max(scores, key=scores.get) + others = [v for k,v 
in scores.items() if k!=best] + tag = f"🏆 **{best}**" + if others: + gain = (scores[best]/max(others)-1)*100 + tag += f" (+{gain:.0f}%)" + return tag + +# Render table with distinct error messages +def render_table(test_label, display_name): + print(f"### {display_name} — tokens/second\n") + header = ['Model'] + [e.replace('_',' ').title() for e in ENV_ORDER] + ['Winner'] + print("| " + " | ".join(header) + " |") + print("|" + "|".join(['---']*len(header)) + "|") + + for model in sorted(data, key=lambda s: s.lower()): + row = [f"**{model}**"] + env_data = data[model].get(test_label, {}) + for env in ENV_ORDER: + d = env_data.get(env) + if not d: + cell = '—' + elif d['error']: + et = d['etype'] + if et=='load': + cell = '⚠️ Load Error' + elif et=='hang': + cell = '⚠️ GPU Hang' + else: + cell = '⚠️ Runtime Error' + else: + cell = f"{float(d['mean']):.2f} ± {float(d['std']):.2f}" + row.append(cell) + row.append(pick_winner(env_data)) + print("| " + " | ".join(row) + " |") + print() + +# Output tables +render_table('pp512','Prompt Processing (pp512)') +render_table('tg128','Text Generation (tg128)') + +# Summary of failures by type +fail_lines = [] +for model in sorted(data, key=lambda s: s.lower()): + for test_label, envs in data[model].items(): + for env,d in envs.items(): + if d['error']: + et = d['etype'] or 'unknown' + desc = { + 'load':'failed to load', + 'hang':'GPU hang', + 'runtime':'runtime error', + }.get(et, 'error') + fail_lines.append(f"- **{model}** [{test_label}] on *{env}*: {desc}") +if fail_lines: + print("## Failed Runs\n") + print("\n".join(fail_lines)) \ No newline at end of file diff --git a/benchmark/parse_loadtime_results.py b/benchmark/parse_loadtime_results.py new file mode 100755 index 0000000..54d5297 --- /dev/null +++ b/benchmark/parse_loadtime_results.py @@ -0,0 +1,71 @@ +#!/usr/bin/env python3 +""" +Parse the console output of run_loadtime_benchmarks.sh stored in run_loadtime_benchmarks.log, +then produce a Markdown table of average 
load+inference times per model/env. +""" +import re +from collections import defaultdict, OrderedDict +import sys + +LOGFILE = 'run_loadtime_benchmark.log' +# Define expected environments in desired column order +ENV_ORDER = ['vulkan_radv','vulkan_amdvlk','rocm6_4_2','rocm7_beta','rocm7_rc'] + +# Regex patterns +ENTRY_RE = re.compile(r"✔ \[(?P[^]]+)\] (?P[^ ]+) avg=(?P[0-9.]+)s over (?P[0-9]+) runs") +FAIL_RE = re.compile(r"✖ \[(?P[^]]+)\] (?P[^ ]+) all runs failed") + +# Data containers +results = defaultdict(lambda: {}) # results[model][env] = float or 'ERR' + +# Read and parse log +with open(LOGFILE) as f: + for line in f: + line = line.strip() + m = ENTRY_RE.match(line) + if m: + env = m.group('env') + model = m.group('model') + avg = float(m.group('avg')) + results[model][env] = avg + continue + m2 = FAIL_RE.match(line) + if m2: + env = m2.group('env') + model = m2.group('model') + results[model][env] = None # indicate failure + +# Compute winner per model: smallest time +md_lines = [] +# Header +header = ['Model'] + [e.replace('_',' ').title() for e in ENV_ORDER] + ['Fastest'] +md_lines.append('| ' + ' | '.join(header) + ' |') +md_lines.append('|' + '|'.join(['---']*len(header)) + '|') + +for model in sorted(results, key=lambda s: s.lower()): + row = [f"**{model}**"] + env_times = results[model] + # find fastest + valid = {e:env_times[e] for e in ENV_ORDER if e in env_times and env_times[e] is not None} + if valid: + best_env = min(valid, key=lambda k: valid[k]) + fastest = f"🏆 **{best_env}**" + else: + fastest = '—' + for env in ENV_ORDER: + if env not in env_times: + cell = '—' + else: + t = env_times[env] + if t is None: + cell = '⚠️ Fail' + else: + cell = f"{t:.2f}s" + row.append(cell) + row.append(fastest) + md_lines.append('| ' + ' | '.join(row) + ' |') + +# Print markdown +table = '\n'.join(md_lines) +print(table) + diff --git a/benchmark/results/Kimi-Dev-72B-UD-Q8_K_XL-00001-of-00002__rocm6_4_2.log 
b/benchmark/results/Kimi-Dev-72B-UD-Q8_K_XL-00001-of-00002__rocm6_4_2.log new file mode 100644 index 0000000..3cb770a --- /dev/null +++ b/benchmark/results/Kimi-Dev-72B-UD-Q8_K_XL-00001-of-00002__rocm6_4_2.log @@ -0,0 +1,6 @@ +ggml_cuda_init: GGML_CUDA_FORCE_MMQ: no +ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no +ggml_cuda_init: found 1 ROCm devices: + Device 0: Radeon 8060S Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32 +HW Exception by GPU node-1 (Agent handle: 0x68b7b10) reason :GPU Hang +✖ ! [rocm6_4_2] Kimi-Dev-72B-UD-Q8_K_XL-00001-of-00002 failed (exit 134) diff --git a/benchmark/results/Kimi-Dev-72B-UD-Q8_K_XL-00001-of-00002__rocm7_beta.log b/benchmark/results/Kimi-Dev-72B-UD-Q8_K_XL-00001-of-00002__rocm7_beta.log new file mode 100644 index 0000000..d0101aa --- /dev/null +++ b/benchmark/results/Kimi-Dev-72B-UD-Q8_K_XL-00001-of-00002__rocm7_beta.log @@ -0,0 +1,6 @@ +ggml_cuda_init: GGML_CUDA_FORCE_MMQ: no +ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no +ggml_cuda_init: found 1 ROCm devices: + Device 0: AMD Radeon Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32 +HW Exception by GPU node-1 (Agent handle: 0x1587b430) reason :GPU Hang +✖ ! [rocm7_beta] Kimi-Dev-72B-UD-Q8_K_XL-00001-of-00002 failed (exit 134) diff --git a/benchmark/results/Kimi-Dev-72B-UD-Q8_K_XL-00001-of-00002__rocm7_rc.log b/benchmark/results/Kimi-Dev-72B-UD-Q8_K_XL-00001-of-00002__rocm7_rc.log new file mode 100644 index 0000000..d77c334 --- /dev/null +++ b/benchmark/results/Kimi-Dev-72B-UD-Q8_K_XL-00001-of-00002__rocm7_rc.log @@ -0,0 +1,5 @@ +ggml_cuda_init: GGML_CUDA_FORCE_MMQ: no +ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no +ggml_cuda_init: found 1 ROCm devices: + Device 0: AMD Radeon Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32 +✖ ! 
[rocm7_rc] Kimi-Dev-72B-UD-Q8_K_XL-00001-of-00002 failed (exit 134) diff --git a/benchmark/results/Kimi-Dev-72B-UD-Q8_K_XL-00001-of-00002__vulkan_amdvlk.log b/benchmark/results/Kimi-Dev-72B-UD-Q8_K_XL-00001-of-00002__vulkan_amdvlk.log new file mode 100644 index 0000000..7d3b718 --- /dev/null +++ b/benchmark/results/Kimi-Dev-72B-UD-Q8_K_XL-00001-of-00002__vulkan_amdvlk.log @@ -0,0 +1,8 @@ +ggml_vulkan: Found 1 Vulkan devices: +ggml_vulkan: 0 = Radeon 8060S Graphics (AMD open-source driver) | uma: 1 | fp16: 1 | bf16: 0 | warp size: 64 | shared memory: 32768 | int dot: 1 | matrix cores: KHR_coopmat +| model | size | params | backend | ngl | mmap | test | t/s | +| ------------------------------ | ---------: | ---------: | ---------- | --: | ---: | --------------: | -------------------: | +ggml_vulkan: Device memory allocation of size 2491416576 failed. +ggml_vulkan: Requested buffer size exceeds device memory allocation limit: ErrorOutOfDeviceMemory +main: error: failed to load model '/home/kyuz0/models/kimi-dev-72B-Q8_K_XL/UD-Q8_K_XL/Kimi-Dev-72B-UD-Q8_K_XL-00001-of-00002.gguf' +✖ ! 
[vulkan_amdvlk] Kimi-Dev-72B-UD-Q8_K_XL-00001-of-00002 failed (exit 1) diff --git a/benchmark/results/Kimi-Dev-72B-UD-Q8_K_XL-00001-of-00002__vulkan_radv.log b/benchmark/results/Kimi-Dev-72B-UD-Q8_K_XL-00001-of-00002__vulkan_radv.log new file mode 100644 index 0000000..f9cf4af --- /dev/null +++ b/benchmark/results/Kimi-Dev-72B-UD-Q8_K_XL-00001-of-00002__vulkan_radv.log @@ -0,0 +1,8 @@ +ggml_vulkan: Found 1 Vulkan devices: +ggml_vulkan: 0 = Radeon 8060S Graphics (RADV GFX1151) (radv) | uma: 1 | fp16: 1 | bf16: 0 | warp size: 64 | shared memory: 65536 | int dot: 1 | matrix cores: KHR_coopmat +| model | size | params | backend | ngl | mmap | test | t/s | +| ------------------------------ | ---------: | ---------: | ---------- | --: | ---: | --------------: | -------------------: | +| qwen2 70B Q8_0 | 78.21 GiB | 72.71 B | Vulkan | 99 | 0 | pp512 | 76.48 ± 0.23 | +| qwen2 70B Q8_0 | 78.21 GiB | 72.71 B | Vulkan | 99 | 0 | tg128 | 2.65 ± 0.00 | + +build: 66625a59 (6040) diff --git a/benchmark/results/Llama-3.3-70B-Instruct-UD-Q8_K_XL-00001-of-00002__rocm6_4_2.log b/benchmark/results/Llama-3.3-70B-Instruct-UD-Q8_K_XL-00001-of-00002__rocm6_4_2.log new file mode 100644 index 0000000..1d1603d --- /dev/null +++ b/benchmark/results/Llama-3.3-70B-Instruct-UD-Q8_K_XL-00001-of-00002__rocm6_4_2.log @@ -0,0 +1,10 @@ +ggml_cuda_init: GGML_CUDA_FORCE_MMQ: no +ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no +ggml_cuda_init: found 1 ROCm devices: + Device 0: Radeon 8060S Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32 +| model | size | params | backend | ngl | mmap | test | t/s | +| ------------------------------ | ---------: | ---------: | ---------- | --: | ---: | --------------: | -------------------: | +| llama 70B Q8_0 | 75.65 GiB | 70.55 B | ROCm | 99 | 0 | pp512 | 33.17 ± 0.07 | +| llama 70B Q8_0 | 75.65 GiB | 70.55 B | ROCm | 99 | 0 | tg128 | 2.72 ± 0.00 | + +build: 66625a59 (6040) diff --git a/benchmark/results/Llama-3.3-70B-Instruct-UD-Q8_K_XL-00001-of-00002__rocm7_beta.log 
b/benchmark/results/Llama-3.3-70B-Instruct-UD-Q8_K_XL-00001-of-00002__rocm7_beta.log new file mode 100644 index 0000000..0d64f23 --- /dev/null +++ b/benchmark/results/Llama-3.3-70B-Instruct-UD-Q8_K_XL-00001-of-00002__rocm7_beta.log @@ -0,0 +1,6 @@ +ggml_cuda_init: GGML_CUDA_FORCE_MMQ: no +ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no +ggml_cuda_init: found 1 ROCm devices: + Device 0: AMD Radeon Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32 +HW Exception by GPU node-1 (Agent handle: 0xa5e9440) reason :GPU Hang +✖ ! [rocm7_beta] Llama-3.3-70B-Instruct-UD-Q8_K_XL-00001-of-00002 failed (exit 134) diff --git a/benchmark/results/Llama-3.3-70B-Instruct-UD-Q8_K_XL-00001-of-00002__rocm7_rc.log b/benchmark/results/Llama-3.3-70B-Instruct-UD-Q8_K_XL-00001-of-00002__rocm7_rc.log new file mode 100644 index 0000000..b9ba150 --- /dev/null +++ b/benchmark/results/Llama-3.3-70B-Instruct-UD-Q8_K_XL-00001-of-00002__rocm7_rc.log @@ -0,0 +1,5 @@ +ggml_cuda_init: GGML_CUDA_FORCE_MMQ: no +ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no +ggml_cuda_init: found 1 ROCm devices: + Device 0: AMD Radeon Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32 +✖ ! 
[rocm7_rc] Llama-3.3-70B-Instruct-UD-Q8_K_XL-00001-of-00002 failed (exit 134) diff --git a/benchmark/results/Llama-3.3-70B-Instruct-UD-Q8_K_XL-00001-of-00002__vulkan_amdvlk.log b/benchmark/results/Llama-3.3-70B-Instruct-UD-Q8_K_XL-00001-of-00002__vulkan_amdvlk.log new file mode 100644 index 0000000..747dc38 --- /dev/null +++ b/benchmark/results/Llama-3.3-70B-Instruct-UD-Q8_K_XL-00001-of-00002__vulkan_amdvlk.log @@ -0,0 +1,8 @@ +ggml_vulkan: Found 1 Vulkan devices: +ggml_vulkan: 0 = Radeon 8060S Graphics (AMD open-source driver) | uma: 1 | fp16: 1 | bf16: 0 | warp size: 64 | shared memory: 32768 | int dot: 1 | matrix cores: KHR_coopmat +| model | size | params | backend | ngl | mmap | test | t/s | +| ------------------------------ | ---------: | ---------: | ---------- | --: | ---: | --------------: | -------------------: | +| llama 70B Q8_0 | 75.65 GiB | 70.55 B | Vulkan | 99 | 0 | pp512 | 96.23 ± 0.16 | +| llama 70B Q8_0 | 75.65 GiB | 70.55 B | Vulkan | 99 | 0 | tg128 | 2.72 ± 0.00 | + +build: 9c35706b (6060) diff --git a/benchmark/results/Llama-3.3-70B-Instruct-UD-Q8_K_XL-00001-of-00002__vulkan_radv.log b/benchmark/results/Llama-3.3-70B-Instruct-UD-Q8_K_XL-00001-of-00002__vulkan_radv.log new file mode 100644 index 0000000..22a6a30 --- /dev/null +++ b/benchmark/results/Llama-3.3-70B-Instruct-UD-Q8_K_XL-00001-of-00002__vulkan_radv.log @@ -0,0 +1,8 @@ +ggml_vulkan: Found 1 Vulkan devices: +ggml_vulkan: 0 = Radeon 8060S Graphics (RADV GFX1151) (radv) | uma: 1 | fp16: 1 | bf16: 0 | warp size: 64 | shared memory: 65536 | int dot: 1 | matrix cores: KHR_coopmat +| model | size | params | backend | ngl | mmap | test | t/s | +| ------------------------------ | ---------: | ---------: | ---------- | --: | ---: | --------------: | -------------------: | +| llama 70B Q8_0 | 75.65 GiB | 70.55 B | Vulkan | 99 | 0 | pp512 | 79.71 ± 0.13 | +| llama 70B Q8_0 | 75.65 GiB | 70.55 B | Vulkan | 99 | 0 | tg128 | 2.72 ± 0.00 | + +build: 66625a59 (6040) diff --git 
a/benchmark/results/Llama-4-Scout-17B-16E-Instruct-Q6_K-00001-of-00002__rocm6_4_2.log b/benchmark/results/Llama-4-Scout-17B-16E-Instruct-Q6_K-00001-of-00002__rocm6_4_2.log new file mode 100644 index 0000000..33fcb65 --- /dev/null +++ b/benchmark/results/Llama-4-Scout-17B-16E-Instruct-Q6_K-00001-of-00002__rocm6_4_2.log @@ -0,0 +1,10 @@ +ggml_cuda_init: GGML_CUDA_FORCE_MMQ: no +ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no +ggml_cuda_init: found 1 ROCm devices: + Device 0: Radeon 8060S Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32 +| model | size | params | backend | ngl | mmap | test | t/s | +| ------------------------------ | ---------: | ---------: | ---------- | --: | ---: | --------------: | -------------------: | +| llama4 17Bx16E (Scout) Q6_K | 82.35 GiB | 107.77 B | ROCm | 99 | 0 | pp512 | 121.52 ± 0.98 | +| llama4 17Bx16E (Scout) Q6_K | 82.35 GiB | 107.77 B | ROCm | 99 | 0 | tg128 | 14.28 ± 0.00 | + +build: 66625a59 (6040) diff --git a/benchmark/results/Llama-4-Scout-17B-16E-Instruct-Q6_K-00001-of-00002__rocm7_beta.log b/benchmark/results/Llama-4-Scout-17B-16E-Instruct-Q6_K-00001-of-00002__rocm7_beta.log new file mode 100644 index 0000000..535626f --- /dev/null +++ b/benchmark/results/Llama-4-Scout-17B-16E-Instruct-Q6_K-00001-of-00002__rocm7_beta.log @@ -0,0 +1,6 @@ +ggml_cuda_init: GGML_CUDA_FORCE_MMQ: no +ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no +ggml_cuda_init: found 1 ROCm devices: + Device 0: AMD Radeon Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32 +HW Exception by GPU node-1 (Agent handle: 0x27159430) reason :GPU Hang +✖ ! 
[rocm7_beta] Llama-4-Scout-17B-16E-Instruct-Q6_K-00001-of-00002 failed (exit 134) diff --git a/benchmark/results/Llama-4-Scout-17B-16E-Instruct-Q6_K-00001-of-00002__rocm7_rc.log b/benchmark/results/Llama-4-Scout-17B-16E-Instruct-Q6_K-00001-of-00002__rocm7_rc.log new file mode 100644 index 0000000..995bcbd --- /dev/null +++ b/benchmark/results/Llama-4-Scout-17B-16E-Instruct-Q6_K-00001-of-00002__rocm7_rc.log @@ -0,0 +1,10 @@ +ggml_cuda_init: GGML_CUDA_FORCE_MMQ: no +ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no +ggml_cuda_init: found 1 ROCm devices: + Device 0: AMD Radeon Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32 +| model | size | params | backend | ngl | mmap | test | t/s | +| ------------------------------ | ---------: | ---------: | ---------- | --: | ---: | --------------: | -------------------: | +| llama4 17Bx16E (Scout) Q6_K | 82.35 GiB | 107.77 B | ROCm | 99 | 0 | pp512 | 135.36 ± 0.39 | +| llama4 17Bx16E (Scout) Q6_K | 82.35 GiB | 107.77 B | ROCm | 99 | 0 | tg128 | 14.29 ± 0.00 | + +build: 4cb208c9 (6066) diff --git a/benchmark/results/Llama-4-Scout-17B-16E-Instruct-Q6_K-00001-of-00002__vulkan_amdvlk.log b/benchmark/results/Llama-4-Scout-17B-16E-Instruct-Q6_K-00001-of-00002__vulkan_amdvlk.log new file mode 100644 index 0000000..f1d30fc --- /dev/null +++ b/benchmark/results/Llama-4-Scout-17B-16E-Instruct-Q6_K-00001-of-00002__vulkan_amdvlk.log @@ -0,0 +1,8 @@ +ggml_vulkan: Found 1 Vulkan devices: +ggml_vulkan: 0 = Radeon 8060S Graphics (AMD open-source driver) | uma: 1 | fp16: 1 | bf16: 0 | warp size: 64 | shared memory: 32768 | int dot: 1 | matrix cores: KHR_coopmat +| model | size | params | backend | ngl | mmap | test | t/s | +| ------------------------------ | ---------: | ---------: | ---------- | --: | ---: | --------------: | -------------------: | +| llama4 17Bx16E (Scout) Q6_K | 82.35 GiB | 107.77 B | Vulkan | 99 | 0 | pp512 | 243.19 ± 1.20 | +| llama4 17Bx16E (Scout) Q6_K | 82.35 GiB | 107.77 B | Vulkan | 99 | 0 | tg128 | 15.28 ± 0.03 | + 
+build: 9c35706b (6060) diff --git a/benchmark/results/Llama-4-Scout-17B-16E-Instruct-Q6_K-00001-of-00002__vulkan_radv.log b/benchmark/results/Llama-4-Scout-17B-16E-Instruct-Q6_K-00001-of-00002__vulkan_radv.log new file mode 100644 index 0000000..89e18de --- /dev/null +++ b/benchmark/results/Llama-4-Scout-17B-16E-Instruct-Q6_K-00001-of-00002__vulkan_radv.log @@ -0,0 +1,8 @@ +ggml_vulkan: Found 1 Vulkan devices: +ggml_vulkan: 0 = Radeon 8060S Graphics (RADV GFX1151) (radv) | uma: 1 | fp16: 1 | bf16: 0 | warp size: 64 | shared memory: 65536 | int dot: 1 | matrix cores: KHR_coopmat +| model | size | params | backend | ngl | mmap | test | t/s | +| ------------------------------ | ---------: | ---------: | ---------- | --: | ---: | --------------: | -------------------: | +| llama4 17Bx16E (Scout) Q6_K | 82.35 GiB | 107.77 B | Vulkan | 99 | 0 | pp512 | 137.97 ± 0.99 | +| llama4 17Bx16E (Scout) Q6_K | 82.35 GiB | 107.77 B | Vulkan | 99 | 0 | tg128 | 15.07 ± 0.05 | + +build: 66625a59 (6040) diff --git a/benchmark/results/Llama-4-Scout-17B-16E-Instruct-Q8_0-00001-of-00003__rocm6_4_2.log b/benchmark/results/Llama-4-Scout-17B-16E-Instruct-Q8_0-00001-of-00003__rocm6_4_2.log new file mode 100644 index 0000000..b7b6ab3 --- /dev/null +++ b/benchmark/results/Llama-4-Scout-17B-16E-Instruct-Q8_0-00001-of-00003__rocm6_4_2.log @@ -0,0 +1,6 @@ +ggml_cuda_init: GGML_CUDA_FORCE_MMQ: no +ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no +ggml_cuda_init: found 1 ROCm devices: + Device 0: Radeon 8060S Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32 +HW Exception by GPU node-1 (Agent handle: 0x2b17db10) reason :GPU Hang +✖ ! 
[rocm6_4_2] Llama-4-Scout-17B-16E-Instruct-Q8_0-00001-of-00003 failed (exit 134) diff --git a/benchmark/results/Llama-4-Scout-17B-16E-Instruct-Q8_0-00001-of-00003__rocm7_beta.log b/benchmark/results/Llama-4-Scout-17B-16E-Instruct-Q8_0-00001-of-00003__rocm7_beta.log new file mode 100644 index 0000000..4981e9d --- /dev/null +++ b/benchmark/results/Llama-4-Scout-17B-16E-Instruct-Q8_0-00001-of-00003__rocm7_beta.log @@ -0,0 +1,6 @@ +ggml_cuda_init: GGML_CUDA_FORCE_MMQ: no +ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no +ggml_cuda_init: found 1 ROCm devices: + Device 0: AMD Radeon Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32 +HW Exception by GPU node-1 (Agent handle: 0x1a77430) reason :GPU Hang +✖ ! [rocm7_beta] Llama-4-Scout-17B-16E-Instruct-Q8_0-00001-of-00003 failed (exit 134) diff --git a/benchmark/results/Llama-4-Scout-17B-16E-Instruct-Q8_0-00001-of-00003__rocm7_rc.log b/benchmark/results/Llama-4-Scout-17B-16E-Instruct-Q8_0-00001-of-00003__rocm7_rc.log new file mode 100644 index 0000000..9c06e2b --- /dev/null +++ b/benchmark/results/Llama-4-Scout-17B-16E-Instruct-Q8_0-00001-of-00003__rocm7_rc.log @@ -0,0 +1,5 @@ +ggml_cuda_init: GGML_CUDA_FORCE_MMQ: no +ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no +ggml_cuda_init: found 1 ROCm devices: + Device 0: AMD Radeon Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32 +✖ ! 
[rocm7_rc] Llama-4-Scout-17B-16E-Instruct-Q8_0-00001-of-00003 failed (exit 134) diff --git a/benchmark/results/Llama-4-Scout-17B-16E-Instruct-Q8_0-00001-of-00003__vulkan_amdvlk.log b/benchmark/results/Llama-4-Scout-17B-16E-Instruct-Q8_0-00001-of-00003__vulkan_amdvlk.log new file mode 100644 index 0000000..cda78f6 --- /dev/null +++ b/benchmark/results/Llama-4-Scout-17B-16E-Instruct-Q8_0-00001-of-00003__vulkan_amdvlk.log @@ -0,0 +1,8 @@ +ggml_vulkan: Found 1 Vulkan devices: +ggml_vulkan: 0 = Radeon 8060S Graphics (AMD open-source driver) | uma: 1 | fp16: 1 | bf16: 0 | warp size: 64 | shared memory: 32768 | int dot: 1 | matrix cores: KHR_coopmat +| model | size | params | backend | ngl | mmap | test | t/s | +| ------------------------------ | ---------: | ---------: | ---------- | --: | ---: | --------------: | -------------------: | +| llama4 17Bx16E (Scout) Q8_0 | 106.65 GiB | 107.77 B | Vulkan | 99 | 0 | pp512 | 238.93 ± 2.89 | +| llama4 17Bx16E (Scout) Q8_0 | 106.65 GiB | 107.77 B | Vulkan | 99 | 0 | tg128 | 12.25 ± 0.01 | + +build: 9c35706b (6060) diff --git a/benchmark/results/Llama-4-Scout-17B-16E-Instruct-Q8_0-00001-of-00003__vulkan_radv.log b/benchmark/results/Llama-4-Scout-17B-16E-Instruct-Q8_0-00001-of-00003__vulkan_radv.log new file mode 100644 index 0000000..6a5f1fb --- /dev/null +++ b/benchmark/results/Llama-4-Scout-17B-16E-Instruct-Q8_0-00001-of-00003__vulkan_radv.log @@ -0,0 +1,8 @@ +ggml_vulkan: Found 1 Vulkan devices: +ggml_vulkan: 0 = Radeon 8060S Graphics (RADV GFX1151) (radv) | uma: 1 | fp16: 1 | bf16: 0 | warp size: 64 | shared memory: 65536 | int dot: 1 | matrix cores: KHR_coopmat +| model | size | params | backend | ngl | mmap | test | t/s | +| ------------------------------ | ---------: | ---------: | ---------- | --: | ---: | --------------: | -------------------: | +| llama4 17Bx16E (Scout) Q8_0 | 106.65 GiB | 107.77 B | Vulkan | 99 | 0 | pp512 | 145.86 ± 2.44 | +| llama4 17Bx16E (Scout) Q8_0 | 106.65 GiB | 107.77 B | Vulkan | 99 | 0 | tg128 
| 12.27 ± 0.00 | + +build: 66625a59 (6040) diff --git a/benchmark/results/Llama-4-Scout-17B-16E-Instruct-UD-Q4_K_XL-00001-of-00002__rocm6_4_2.log b/benchmark/results/Llama-4-Scout-17B-16E-Instruct-UD-Q4_K_XL-00001-of-00002__rocm6_4_2.log new file mode 100644 index 0000000..5242c84 --- /dev/null +++ b/benchmark/results/Llama-4-Scout-17B-16E-Instruct-UD-Q4_K_XL-00001-of-00002__rocm6_4_2.log @@ -0,0 +1,10 @@ +ggml_cuda_init: GGML_CUDA_FORCE_MMQ: no +ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no +ggml_cuda_init: found 1 ROCm devices: + Device 0: Radeon 8060S Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32 +| model | size | params | backend | ngl | mmap | test | t/s | +| ------------------------------ | ---------: | ---------: | ---------- | --: | ---: | --------------: | -------------------: | +| llama4 17Bx16E (Scout) Q4_K - Medium | 57.73 GiB | 107.77 B | ROCm | 99 | 0 | pp512 | 132.66 ± 0.56 | +| llama4 17Bx16E (Scout) Q4_K - Medium | 57.73 GiB | 107.77 B | ROCm | 99 | 0 | tg128 | 17.29 ± 0.00 | + +build: 66625a59 (6040) diff --git a/benchmark/results/Llama-4-Scout-17B-16E-Instruct-UD-Q4_K_XL-00001-of-00002__rocm7_beta.log b/benchmark/results/Llama-4-Scout-17B-16E-Instruct-UD-Q4_K_XL-00001-of-00002__rocm7_beta.log new file mode 100644 index 0000000..c8275dd --- /dev/null +++ b/benchmark/results/Llama-4-Scout-17B-16E-Instruct-UD-Q4_K_XL-00001-of-00002__rocm7_beta.log @@ -0,0 +1,10 @@ +ggml_cuda_init: GGML_CUDA_FORCE_MMQ: no +ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no +ggml_cuda_init: found 1 ROCm devices: + Device 0: AMD Radeon Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32 +| model | size | params | backend | ngl | mmap | test | t/s | +| ------------------------------ | ---------: | ---------: | ---------- | --: | ---: | --------------: | -------------------: | +| llama4 17Bx16E (Scout) Q4_K - Medium | 57.73 GiB | 107.77 B | ROCm | 99 | 0 | pp512 | 133.71 ± 0.64 | +| llama4 17Bx16E (Scout) Q4_K - Medium | 57.73 GiB | 107.77 B | ROCm | 99 | 0 | tg128 | 17.35 ± 
0.00 | + +build: 66625a59 (6040) diff --git a/benchmark/results/Llama-4-Scout-17B-16E-Instruct-UD-Q4_K_XL-00001-of-00002__rocm7_rc.log b/benchmark/results/Llama-4-Scout-17B-16E-Instruct-UD-Q4_K_XL-00001-of-00002__rocm7_rc.log new file mode 100644 index 0000000..2e1a6fc --- /dev/null +++ b/benchmark/results/Llama-4-Scout-17B-16E-Instruct-UD-Q4_K_XL-00001-of-00002__rocm7_rc.log @@ -0,0 +1,5 @@ +ggml_cuda_init: GGML_CUDA_FORCE_MMQ: no +ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no +ggml_cuda_init: found 1 ROCm devices: + Device 0: AMD Radeon Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32 +✖ ! [rocm7_rc] Llama-4-Scout-17B-16E-Instruct-UD-Q4_K_XL-00001-of-00002 failed (exit 134) diff --git a/benchmark/results/Llama-4-Scout-17B-16E-Instruct-UD-Q4_K_XL-00001-of-00002__vulkan_amdvlk.log b/benchmark/results/Llama-4-Scout-17B-16E-Instruct-UD-Q4_K_XL-00001-of-00002__vulkan_amdvlk.log new file mode 100644 index 0000000..72a362e --- /dev/null +++ b/benchmark/results/Llama-4-Scout-17B-16E-Instruct-UD-Q4_K_XL-00001-of-00002__vulkan_amdvlk.log @@ -0,0 +1,8 @@ +ggml_vulkan: Found 1 Vulkan devices: +ggml_vulkan: 0 = Radeon 8060S Graphics (AMD open-source driver) | uma: 1 | fp16: 1 | bf16: 0 | warp size: 64 | shared memory: 32768 | int dot: 1 | matrix cores: KHR_coopmat +| model | size | params | backend | ngl | mmap | test | t/s | +| ------------------------------ | ---------: | ---------: | ---------- | --: | ---: | --------------: | -------------------: | +| llama4 17Bx16E (Scout) Q4_K - Medium | 57.73 GiB | 107.77 B | Vulkan | 99 | 0 | pp512 | 208.84 ± 1.35 | +| llama4 17Bx16E (Scout) Q4_K - Medium | 57.73 GiB | 107.77 B | Vulkan | 99 | 0 | tg128 | 20.06 ± 0.01 | + +build: 9c35706b (6060) diff --git a/benchmark/results/Llama-4-Scout-17B-16E-Instruct-UD-Q4_K_XL-00001-of-00002__vulkan_radv.log b/benchmark/results/Llama-4-Scout-17B-16E-Instruct-UD-Q4_K_XL-00001-of-00002__vulkan_radv.log new file mode 100644 index 0000000..71adfea --- /dev/null +++ 
b/benchmark/results/Llama-4-Scout-17B-16E-Instruct-UD-Q4_K_XL-00001-of-00002__vulkan_radv.log @@ -0,0 +1,8 @@ +ggml_vulkan: Found 1 Vulkan devices: +ggml_vulkan: 0 = Radeon 8060S Graphics (RADV GFX1151) (radv) | uma: 1 | fp16: 1 | bf16: 0 | warp size: 64 | shared memory: 65536 | int dot: 1 | matrix cores: KHR_coopmat +| model | size | params | backend | ngl | mmap | test | t/s | +| ------------------------------ | ---------: | ---------: | ---------- | --: | ---: | --------------: | -------------------: | +| llama4 17Bx16E (Scout) Q4_K - Medium | 57.73 GiB | 107.77 B | Vulkan | 99 | 0 | pp512 | 133.49 ± 1.83 | +| llama4 17Bx16E (Scout) Q4_K - Medium | 57.73 GiB | 107.77 B | Vulkan | 99 | 0 | tg128 | 19.99 ± 0.01 | + +build: 66625a59 (6040) diff --git a/benchmark/results/Qwen3-235B-A22B-Instruct-2507-UD-Q3_K_XL-00001-of-00003__rocm6_4_2.log b/benchmark/results/Qwen3-235B-A22B-Instruct-2507-UD-Q3_K_XL-00001-of-00003__rocm6_4_2.log new file mode 100644 index 0000000..c21206d --- /dev/null +++ b/benchmark/results/Qwen3-235B-A22B-Instruct-2507-UD-Q3_K_XL-00001-of-00003__rocm6_4_2.log @@ -0,0 +1,10 @@ +ggml_cuda_init: GGML_CUDA_FORCE_MMQ: no +ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no +ggml_cuda_init: found 1 ROCm devices: + Device 0: Radeon 8060S Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32 +| model | size | params | backend | ngl | mmap | test | t/s | +| ------------------------------ | ---------: | ---------: | ---------- | --: | ---: | --------------: | -------------------: | +| qwen3moe 235B.A22B Q3_K - Medium | 96.99 GiB | 235.09 B | ROCm | 99 | 0 | pp512 | 69.48 ± 0.09 | +| qwen3moe 235B.A22B Q3_K - Medium | 96.99 GiB | 235.09 B | ROCm | 99 | 0 | tg128 | 13.54 ± 0.01 | + +build: 66625a59 (6040) diff --git a/benchmark/results/Qwen3-235B-A22B-Instruct-2507-UD-Q3_K_XL-00001-of-00003__rocm7_beta.log b/benchmark/results/Qwen3-235B-A22B-Instruct-2507-UD-Q3_K_XL-00001-of-00003__rocm7_beta.log new file mode 100644 index 0000000..6cb77a4 --- /dev/null +++ 
b/benchmark/results/Qwen3-235B-A22B-Instruct-2507-UD-Q3_K_XL-00001-of-00003__rocm7_beta.log @@ -0,0 +1,6 @@ +ggml_cuda_init: GGML_CUDA_FORCE_MMQ: no +ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no +ggml_cuda_init: found 1 ROCm devices: + Device 0: AMD Radeon Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32 +HW Exception by GPU node-1 (Agent handle: 0x1a8d440) reason :GPU Hang +✖ ! [rocm7_beta] Qwen3-235B-A22B-Instruct-2507-UD-Q3_K_XL-00001-of-00003 failed (exit 134) diff --git a/benchmark/results/Qwen3-235B-A22B-Instruct-2507-UD-Q3_K_XL-00001-of-00003__rocm7_rc.log b/benchmark/results/Qwen3-235B-A22B-Instruct-2507-UD-Q3_K_XL-00001-of-00003__rocm7_rc.log new file mode 100644 index 0000000..2421a1b --- /dev/null +++ b/benchmark/results/Qwen3-235B-A22B-Instruct-2507-UD-Q3_K_XL-00001-of-00003__rocm7_rc.log @@ -0,0 +1,10 @@ +ggml_cuda_init: GGML_CUDA_FORCE_MMQ: no +ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no +ggml_cuda_init: found 1 ROCm devices: + Device 0: AMD Radeon Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32 +| model | size | params | backend | ngl | mmap | test | t/s | +| ------------------------------ | ---------: | ---------: | ---------- | --: | ---: | --------------: | -------------------: | +| qwen3moe 235B.A22B Q3_K - Medium | 96.99 GiB | 235.09 B | ROCm | 99 | 0 | pp512 | 74.69 ± 0.17 | +| qwen3moe 235B.A22B Q3_K - Medium | 96.99 GiB | 235.09 B | ROCm | 99 | 0 | tg128 | 13.56 ± 0.00 | + +build: 4cb208c9 (6066) diff --git a/benchmark/results/Qwen3-235B-A22B-Instruct-2507-UD-Q3_K_XL-00001-of-00003__vulkan_amdvlk.log b/benchmark/results/Qwen3-235B-A22B-Instruct-2507-UD-Q3_K_XL-00001-of-00003__vulkan_amdvlk.log new file mode 100644 index 0000000..dba1565 --- /dev/null +++ b/benchmark/results/Qwen3-235B-A22B-Instruct-2507-UD-Q3_K_XL-00001-of-00003__vulkan_amdvlk.log @@ -0,0 +1,8 @@ +ggml_vulkan: Found 1 Vulkan devices: +ggml_vulkan: 0 = Radeon 8060S Graphics (AMD open-source driver) | uma: 1 | fp16: 1 | bf16: 0 | warp size: 64 | shared memory: 32768 | int 
dot: 1 | matrix cores: KHR_coopmat +| model | size | params | backend | ngl | mmap | test | t/s | +| ------------------------------ | ---------: | ---------: | ---------- | --: | ---: | --------------: | -------------------: | +| qwen3moe 235B.A22B Q3_K - Medium | 96.99 GiB | 235.09 B | Vulkan | 99 | 0 | pp512 | 99.94 ± 0.91 | +| qwen3moe 235B.A22B Q3_K - Medium | 96.99 GiB | 235.09 B | Vulkan | 99 | 0 | tg128 | 15.72 ± 0.01 | + +build: 9c35706b (6060) diff --git a/benchmark/results/Qwen3-235B-A22B-Instruct-2507-UD-Q3_K_XL-00001-of-00003__vulkan_radv.log b/benchmark/results/Qwen3-235B-A22B-Instruct-2507-UD-Q3_K_XL-00001-of-00003__vulkan_radv.log new file mode 100644 index 0000000..11f7672 --- /dev/null +++ b/benchmark/results/Qwen3-235B-A22B-Instruct-2507-UD-Q3_K_XL-00001-of-00003__vulkan_radv.log @@ -0,0 +1,8 @@ +ggml_vulkan: Found 1 Vulkan devices: +ggml_vulkan: 0 = Radeon 8060S Graphics (RADV GFX1151) (radv) | uma: 1 | fp16: 1 | bf16: 0 | warp size: 64 | shared memory: 65536 | int dot: 1 | matrix cores: KHR_coopmat +| model | size | params | backend | ngl | mmap | test | t/s | +| ------------------------------ | ---------: | ---------: | ---------- | --: | ---: | --------------: | -------------------: | +| qwen3moe 235B.A22B Q3_K - Medium | 96.99 GiB | 235.09 B | Vulkan | 99 | 0 | pp512 | 58.40 ± 0.21 | +| qwen3moe 235B.A22B Q3_K - Medium | 96.99 GiB | 235.09 B | Vulkan | 99 | 0 | tg128 | 16.29 ± 0.01 | + +build: 66625a59 (6040) diff --git a/benchmark/results/Qwen3-30B-A3B-BF16-00001-of-00002__rocm6_4_2.log b/benchmark/results/Qwen3-30B-A3B-BF16-00001-of-00002__rocm6_4_2.log new file mode 100644 index 0000000..e10adbb --- /dev/null +++ b/benchmark/results/Qwen3-30B-A3B-BF16-00001-of-00002__rocm6_4_2.log @@ -0,0 +1,10 @@ +ggml_cuda_init: GGML_CUDA_FORCE_MMQ: no +ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no +ggml_cuda_init: found 1 ROCm devices: + Device 0: Radeon 8060S Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32 +| model | size | params | backend | ngl | 
mmap | test | t/s | +| ------------------------------ | ---------: | ---------: | ---------- | --: | ---: | --------------: | -------------------: | +| qwen3moe 30B.A3B BF16 | 56.89 GiB | 30.53 B | ROCm | 99 | 0 | pp512 | 157.74 ± 2.65 | +| qwen3moe 30B.A3B BF16 | 56.89 GiB | 30.53 B | ROCm | 99 | 0 | tg128 | 22.88 ± 0.01 | + +build: 66625a59 (6040) diff --git a/benchmark/results/Qwen3-30B-A3B-BF16-00001-of-00002__rocm7_beta.log b/benchmark/results/Qwen3-30B-A3B-BF16-00001-of-00002__rocm7_beta.log new file mode 100644 index 0000000..bb3fa29 --- /dev/null +++ b/benchmark/results/Qwen3-30B-A3B-BF16-00001-of-00002__rocm7_beta.log @@ -0,0 +1,10 @@ +ggml_cuda_init: GGML_CUDA_FORCE_MMQ: no +ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no +ggml_cuda_init: found 1 ROCm devices: + Device 0: AMD Radeon Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32 +| model | size | params | backend | ngl | mmap | test | t/s | +| ------------------------------ | ---------: | ---------: | ---------- | --: | ---: | --------------: | -------------------: | +| qwen3moe 30B.A3B BF16 | 56.89 GiB | 30.53 B | ROCm | 99 | 0 | pp512 | 151.25 ± 3.33 | +| qwen3moe 30B.A3B BF16 | 56.89 GiB | 30.53 B | ROCm | 99 | 0 | tg128 | 23.80 ± 0.09 | + +build: 66625a59 (6040) diff --git a/benchmark/results/Qwen3-30B-A3B-BF16-00001-of-00002__rocm7_rc.log b/benchmark/results/Qwen3-30B-A3B-BF16-00001-of-00002__rocm7_rc.log new file mode 100644 index 0000000..47cba1d --- /dev/null +++ b/benchmark/results/Qwen3-30B-A3B-BF16-00001-of-00002__rocm7_rc.log @@ -0,0 +1,10 @@ +ggml_cuda_init: GGML_CUDA_FORCE_MMQ: no +ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no +ggml_cuda_init: found 1 ROCm devices: + Device 0: AMD Radeon Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32 +| model | size | params | backend | ngl | mmap | test | t/s | +| ------------------------------ | ---------: | ---------: | ---------- | --: | ---: | --------------: | -------------------: | +| qwen3moe 30B.A3B BF16 | 56.89 GiB | 30.53 B | ROCm | 99 | 0 | pp512 
| 154.95 ± 1.58 | +| qwen3moe 30B.A3B BF16 | 56.89 GiB | 30.53 B | ROCm | 99 | 0 | tg128 | 23.08 ± 0.08 | + +build: 4cb208c9 (6066) diff --git a/benchmark/results/Qwen3-30B-A3B-BF16-00001-of-00002__vulkan_amdvlk.log b/benchmark/results/Qwen3-30B-A3B-BF16-00001-of-00002__vulkan_amdvlk.log new file mode 100644 index 0000000..c75a868 --- /dev/null +++ b/benchmark/results/Qwen3-30B-A3B-BF16-00001-of-00002__vulkan_amdvlk.log @@ -0,0 +1,8 @@ +ggml_vulkan: Found 1 Vulkan devices: +ggml_vulkan: 0 = Radeon 8060S Graphics (AMD open-source driver) | uma: 1 | fp16: 1 | bf16: 0 | warp size: 64 | shared memory: 32768 | int dot: 1 | matrix cores: KHR_coopmat +| model | size | params | backend | ngl | mmap | test | t/s | +| ------------------------------ | ---------: | ---------: | ---------- | --: | ---: | --------------: | -------------------: | +| qwen3moe 30B.A3B BF16 | 56.89 GiB | 30.53 B | Vulkan | 99 | 0 | pp512 | 90.91 ± 0.35 | +| qwen3moe 30B.A3B BF16 | 56.89 GiB | 30.53 B | Vulkan | 99 | 0 | tg128 | 7.96 ± 0.03 | + +build: 9c35706b (6060) diff --git a/benchmark/results/Qwen3-30B-A3B-BF16-00001-of-00002__vulkan_radv.log b/benchmark/results/Qwen3-30B-A3B-BF16-00001-of-00002__vulkan_radv.log new file mode 100644 index 0000000..ef72d92 --- /dev/null +++ b/benchmark/results/Qwen3-30B-A3B-BF16-00001-of-00002__vulkan_radv.log @@ -0,0 +1,8 @@ +ggml_vulkan: Found 1 Vulkan devices: +ggml_vulkan: 0 = Radeon 8060S Graphics (RADV GFX1151) (radv) | uma: 1 | fp16: 1 | bf16: 0 | warp size: 64 | shared memory: 65536 | int dot: 1 | matrix cores: KHR_coopmat +| model | size | params | backend | ngl | mmap | test | t/s | +| ------------------------------ | ---------: | ---------: | ---------- | --: | ---: | --------------: | -------------------: | +| qwen3moe 30B.A3B BF16 | 56.89 GiB | 30.53 B | Vulkan | 99 | 0 | pp512 | 71.16 ± 0.92 | +| qwen3moe 30B.A3B BF16 | 56.89 GiB | 30.53 B | Vulkan | 99 | 0 | tg128 | 7.33 ± 0.00 | + +build: 66625a59 (6040) diff --git 
a/benchmark/results/Qwen3-Coder-30B-A3B-Instruct-BF16-00001-of-00002__rocm6_4_2.log b/benchmark/results/Qwen3-Coder-30B-A3B-Instruct-BF16-00001-of-00002__rocm6_4_2.log new file mode 100644 index 0000000..445b37e --- /dev/null +++ b/benchmark/results/Qwen3-Coder-30B-A3B-Instruct-BF16-00001-of-00002__rocm6_4_2.log @@ -0,0 +1,10 @@ +ggml_cuda_init: GGML_CUDA_FORCE_MMQ: no +ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no +ggml_cuda_init: found 1 ROCm devices: + Device 0: Radeon 8060S Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32 +| model | size | params | backend | ngl | mmap | test | t/s | +| ------------------------------ | ---------: | ---------: | ---------- | --: | ---: | --------------: | -------------------: | +| qwen3moe 30B.A3B BF16 | 56.89 GiB | 30.53 B | ROCm | 99 | 0 | pp512 | 150.53 ± 1.83 | +| qwen3moe 30B.A3B BF16 | 56.89 GiB | 30.53 B | ROCm | 99 | 0 | tg128 | 22.13 ± 0.00 | + +build: 66625a59 (6040) diff --git a/benchmark/results/Qwen3-Coder-30B-A3B-Instruct-BF16-00001-of-00002__rocm7_beta.log b/benchmark/results/Qwen3-Coder-30B-A3B-Instruct-BF16-00001-of-00002__rocm7_beta.log new file mode 100644 index 0000000..1204c49 --- /dev/null +++ b/benchmark/results/Qwen3-Coder-30B-A3B-Instruct-BF16-00001-of-00002__rocm7_beta.log @@ -0,0 +1,10 @@ +ggml_cuda_init: GGML_CUDA_FORCE_MMQ: no +ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no +ggml_cuda_init: found 1 ROCm devices: + Device 0: AMD Radeon Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32 +| model | size | params | backend | ngl | mmap | test | t/s | +| ------------------------------ | ---------: | ---------: | ---------- | --: | ---: | --------------: | -------------------: | +| qwen3moe 30B.A3B BF16 | 56.89 GiB | 30.53 B | ROCm | 99 | 0 | pp512 | 147.31 ± 2.22 | +| qwen3moe 30B.A3B BF16 | 56.89 GiB | 30.53 B | ROCm | 99 | 0 | tg128 | 24.12 ± 0.06 | + +build: 66625a59 (6040) diff --git a/benchmark/results/Qwen3-Coder-30B-A3B-Instruct-BF16-00001-of-00002__rocm7_rc.log 
b/benchmark/results/Qwen3-Coder-30B-A3B-Instruct-BF16-00001-of-00002__rocm7_rc.log new file mode 100644 index 0000000..b366cae --- /dev/null +++ b/benchmark/results/Qwen3-Coder-30B-A3B-Instruct-BF16-00001-of-00002__rocm7_rc.log @@ -0,0 +1,10 @@ +ggml_cuda_init: GGML_CUDA_FORCE_MMQ: no +ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no +ggml_cuda_init: found 1 ROCm devices: + Device 0: AMD Radeon Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32 +| model | size | params | backend | ngl | mmap | test | t/s | +| ------------------------------ | ---------: | ---------: | ---------- | --: | ---: | --------------: | -------------------: | +| qwen3moe 30B.A3B BF16 | 56.89 GiB | 30.53 B | ROCm | 99 | 0 | pp512 | 144.59 ± 3.08 | +| qwen3moe 30B.A3B BF16 | 56.89 GiB | 30.53 B | ROCm | 99 | 0 | tg128 | 23.48 ± 0.01 | + +build: 4cb208c9 (6066) diff --git a/benchmark/results/Qwen3-Coder-30B-A3B-Instruct-BF16-00001-of-00002__vulkan_amdvlk.log b/benchmark/results/Qwen3-Coder-30B-A3B-Instruct-BF16-00001-of-00002__vulkan_amdvlk.log new file mode 100644 index 0000000..33fe404 --- /dev/null +++ b/benchmark/results/Qwen3-Coder-30B-A3B-Instruct-BF16-00001-of-00002__vulkan_amdvlk.log @@ -0,0 +1,8 @@ +ggml_vulkan: Found 1 Vulkan devices: +ggml_vulkan: 0 = Radeon 8060S Graphics (AMD open-source driver) | uma: 1 | fp16: 1 | bf16: 0 | warp size: 64 | shared memory: 32768 | int dot: 1 | matrix cores: KHR_coopmat +| model | size | params | backend | ngl | mmap | test | t/s | +| ------------------------------ | ---------: | ---------: | ---------- | --: | ---: | --------------: | -------------------: | +| qwen3moe 30B.A3B BF16 | 56.89 GiB | 30.53 B | Vulkan | 99 | 0 | pp512 | 90.38 ± 0.57 | +| qwen3moe 30B.A3B BF16 | 56.89 GiB | 30.53 B | Vulkan | 99 | 0 | tg128 | 8.00 ± 0.03 | + +build: 9c35706b (6060) diff --git a/benchmark/results/Qwen3-Coder-30B-A3B-Instruct-BF16-00001-of-00002__vulkan_radv.log b/benchmark/results/Qwen3-Coder-30B-A3B-Instruct-BF16-00001-of-00002__vulkan_radv.log new file mode 
100644 index 0000000..cd83c90 --- /dev/null +++ b/benchmark/results/Qwen3-Coder-30B-A3B-Instruct-BF16-00001-of-00002__vulkan_radv.log @@ -0,0 +1,8 @@ +ggml_vulkan: Found 1 Vulkan devices: +ggml_vulkan: 0 = Radeon 8060S Graphics (RADV GFX1151) (radv) | uma: 1 | fp16: 1 | bf16: 0 | warp size: 64 | shared memory: 65536 | int dot: 1 | matrix cores: KHR_coopmat +| model | size | params | backend | ngl | mmap | test | t/s | +| ------------------------------ | ---------: | ---------: | ---------- | --: | ---: | --------------: | -------------------: | +| qwen3moe 30B.A3B BF16 | 56.89 GiB | 30.53 B | Vulkan | 99 | 0 | pp512 | 71.53 ± 1.06 | +| qwen3moe 30B.A3B BF16 | 56.89 GiB | 30.53 B | Vulkan | 99 | 0 | tg128 | 7.34 ± 0.01 | + +build: 66625a59 (6040) diff --git a/benchmark/results/gemma-3-12b-it-UD-Q8_K_XL__rocm6_4_2.log b/benchmark/results/gemma-3-12b-it-UD-Q8_K_XL__rocm6_4_2.log new file mode 100644 index 0000000..21a2b99 --- /dev/null +++ b/benchmark/results/gemma-3-12b-it-UD-Q8_K_XL__rocm6_4_2.log @@ -0,0 +1,10 @@ +ggml_cuda_init: GGML_CUDA_FORCE_MMQ: no +ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no +ggml_cuda_init: found 1 ROCm devices: + Device 0: Radeon 8060S Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32 +| model | size | params | backend | ngl | mmap | test | t/s | +| ------------------------------ | ---------: | ---------: | ---------- | --: | ---: | --------------: | -------------------: | +| gemma3 12B Q8_0 | 13.40 GiB | 11.77 B | ROCm | 99 | 0 | pp512 | 223.36 ± 0.23 | +| gemma3 12B Q8_0 | 13.40 GiB | 11.77 B | ROCm | 99 | 0 | tg128 | 13.81 ± 0.00 | + +build: 66625a59 (6040) diff --git a/benchmark/results/gemma-3-12b-it-UD-Q8_K_XL__rocm7_beta.log b/benchmark/results/gemma-3-12b-it-UD-Q8_K_XL__rocm7_beta.log new file mode 100644 index 0000000..fc2cc5b --- /dev/null +++ b/benchmark/results/gemma-3-12b-it-UD-Q8_K_XL__rocm7_beta.log @@ -0,0 +1,10 @@ +ggml_cuda_init: GGML_CUDA_FORCE_MMQ: no +ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no +ggml_cuda_init: found 1 
ROCm devices: + Device 0: AMD Radeon Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32 +| model | size | params | backend | ngl | mmap | test | t/s | +| ------------------------------ | ---------: | ---------: | ---------- | --: | ---: | --------------: | -------------------: | +| gemma3 12B Q8_0 | 13.40 GiB | 11.77 B | ROCm | 99 | 0 | pp512 | 222.95 ± 0.15 | +| gemma3 12B Q8_0 | 13.40 GiB | 11.77 B | ROCm | 99 | 0 | tg128 | 13.80 ± 0.00 | + +build: 66625a59 (6040) diff --git a/benchmark/results/gemma-3-12b-it-UD-Q8_K_XL__rocm7_rc.log b/benchmark/results/gemma-3-12b-it-UD-Q8_K_XL__rocm7_rc.log new file mode 100644 index 0000000..acf4970 --- /dev/null +++ b/benchmark/results/gemma-3-12b-it-UD-Q8_K_XL__rocm7_rc.log @@ -0,0 +1,10 @@ +ggml_cuda_init: GGML_CUDA_FORCE_MMQ: no +ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no +ggml_cuda_init: found 1 ROCm devices: + Device 0: AMD Radeon Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32 +| model | size | params | backend | ngl | mmap | test | t/s | +| ------------------------------ | ---------: | ---------: | ---------- | --: | ---: | --------------: | -------------------: | +| gemma3 12B Q8_0 | 13.40 GiB | 11.77 B | ROCm | 99 | 0 | pp512 | 222.99 ± 0.24 | +| gemma3 12B Q8_0 | 13.40 GiB | 11.77 B | ROCm | 99 | 0 | tg128 | 13.81 ± 0.00 | + +build: 4cb208c9 (6066) diff --git a/benchmark/results/gemma-3-12b-it-UD-Q8_K_XL__vulkan_amdvlk.log b/benchmark/results/gemma-3-12b-it-UD-Q8_K_XL__vulkan_amdvlk.log new file mode 100644 index 0000000..2ba5269 --- /dev/null +++ b/benchmark/results/gemma-3-12b-it-UD-Q8_K_XL__vulkan_amdvlk.log @@ -0,0 +1,8 @@ +ggml_vulkan: Found 1 Vulkan devices: +ggml_vulkan: 0 = Radeon 8060S Graphics (AMD open-source driver) | uma: 1 | fp16: 1 | bf16: 0 | warp size: 64 | shared memory: 32768 | int dot: 1 | matrix cores: KHR_coopmat +| model | size | params | backend | ngl | mmap | test | t/s | +| ------------------------------ | ---------: | ---------: | ---------- | --: | ---: | --------------: | 
-------------------: | +| gemma3 12B Q8_0 | 13.40 GiB | 11.77 B | Vulkan | 99 | 0 | pp512 | 683.07 ± 1.03 | +| gemma3 12B Q8_0 | 13.40 GiB | 11.77 B | Vulkan | 99 | 0 | tg128 | 13.84 ± 0.02 | + +build: 9c35706b (6060) diff --git a/benchmark/results/gemma-3-12b-it-UD-Q8_K_XL__vulkan_radv.log b/benchmark/results/gemma-3-12b-it-UD-Q8_K_XL__vulkan_radv.log new file mode 100644 index 0000000..5d31829 --- /dev/null +++ b/benchmark/results/gemma-3-12b-it-UD-Q8_K_XL__vulkan_radv.log @@ -0,0 +1,8 @@ +ggml_vulkan: Found 1 Vulkan devices: +ggml_vulkan: 0 = Radeon 8060S Graphics (RADV GFX1151) (radv) | uma: 1 | fp16: 1 | bf16: 0 | warp size: 64 | shared memory: 65536 | int dot: 1 | matrix cores: KHR_coopmat +| model | size | params | backend | ngl | mmap | test | t/s | +| ------------------------------ | ---------: | ---------: | ---------- | --: | ---: | --------------: | -------------------: | +| gemma3 12B Q8_0 | 13.40 GiB | 11.77 B | Vulkan | 99 | 0 | pp512 | 508.55 ± 0.90 | +| gemma3 12B Q8_0 | 13.40 GiB | 11.77 B | Vulkan | 99 | 0 | tg128 | 13.65 ± 0.02 | + +build: 66625a59 (6040) diff --git a/benchmark/results/gemma-3-27b-it-BF16-00001-of-00002__rocm6_4_2.log b/benchmark/results/gemma-3-27b-it-BF16-00001-of-00002__rocm6_4_2.log new file mode 100644 index 0000000..bbf9e04 --- /dev/null +++ b/benchmark/results/gemma-3-27b-it-BF16-00001-of-00002__rocm6_4_2.log @@ -0,0 +1,10 @@ +ggml_cuda_init: GGML_CUDA_FORCE_MMQ: no +ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no +ggml_cuda_init: found 1 ROCm devices: + Device 0: Radeon 8060S Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32 +| model | size | params | backend | ngl | mmap | test | t/s | +| ------------------------------ | ---------: | ---------: | ---------- | --: | ---: | --------------: | -------------------: | +| gemma3 27B BF16 | 50.31 GiB | 27.01 B | ROCm | 99 | 0 | pp512 | 88.73 ± 0.50 | +| gemma3 27B BF16 | 50.31 GiB | 27.01 B | ROCm | 99 | 0 | tg128 | 4.02 ± 0.00 | + +build: 66625a59 (6040) diff --git 
a/benchmark/results/gemma-3-27b-it-BF16-00001-of-00002__rocm7_beta.log b/benchmark/results/gemma-3-27b-it-BF16-00001-of-00002__rocm7_beta.log new file mode 100644 index 0000000..a664b0b --- /dev/null +++ b/benchmark/results/gemma-3-27b-it-BF16-00001-of-00002__rocm7_beta.log @@ -0,0 +1,10 @@ +ggml_cuda_init: GGML_CUDA_FORCE_MMQ: no +ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no +ggml_cuda_init: found 1 ROCm devices: + Device 0: AMD Radeon Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32 +| model | size | params | backend | ngl | mmap | test | t/s | +| ------------------------------ | ---------: | ---------: | ---------- | --: | ---: | --------------: | -------------------: | +| gemma3 27B BF16 | 50.31 GiB | 27.01 B | ROCm | 99 | 0 | pp512 | 82.31 ± 0.29 | +| gemma3 27B BF16 | 50.31 GiB | 27.01 B | ROCm | 99 | 0 | tg128 | 3.99 ± 0.01 | + +build: 66625a59 (6040) diff --git a/benchmark/results/gemma-3-27b-it-BF16-00001-of-00002__rocm7_rc.log b/benchmark/results/gemma-3-27b-it-BF16-00001-of-00002__rocm7_rc.log new file mode 100644 index 0000000..8ab75d9 --- /dev/null +++ b/benchmark/results/gemma-3-27b-it-BF16-00001-of-00002__rocm7_rc.log @@ -0,0 +1,10 @@ +ggml_cuda_init: GGML_CUDA_FORCE_MMQ: no +ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no +ggml_cuda_init: found 1 ROCm devices: + Device 0: AMD Radeon Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32 +| model | size | params | backend | ngl | mmap | test | t/s | +| ------------------------------ | ---------: | ---------: | ---------- | --: | ---: | --------------: | -------------------: | +| gemma3 27B BF16 | 50.31 GiB | 27.01 B | ROCm | 99 | 0 | pp512 | 83.18 ± 0.41 | +| gemma3 27B BF16 | 50.31 GiB | 27.01 B | ROCm | 99 | 0 | tg128 | 3.99 ± 0.00 | + +build: 4cb208c9 (6066) diff --git a/benchmark/results/gemma-3-27b-it-BF16-00001-of-00002__vulkan_amdvlk.log b/benchmark/results/gemma-3-27b-it-BF16-00001-of-00002__vulkan_amdvlk.log new file mode 100644 index 0000000..45f0b37 --- /dev/null +++ 
b/benchmark/results/gemma-3-27b-it-BF16-00001-of-00002__vulkan_amdvlk.log @@ -0,0 +1,8 @@ +ggml_vulkan: Found 1 Vulkan devices: +ggml_vulkan: 0 = Radeon 8060S Graphics (AMD open-source driver) | uma: 1 | fp16: 1 | bf16: 0 | warp size: 64 | shared memory: 32768 | int dot: 1 | matrix cores: KHR_coopmat +| model | size | params | backend | ngl | mmap | test | t/s | +| ------------------------------ | ---------: | ---------: | ---------- | --: | ---: | --------------: | -------------------: | +ggml_vulkan: Device memory allocation of size 2819260416 failed. +ggml_vulkan: Requested buffer size exceeds device memory allocation limit: ErrorOutOfDeviceMemory +main: error: failed to load model '/home/kyuz0/models/gemma-3-27b-it-BF16/gemma-3-27b-it-BF16-00001-of-00002.gguf' +✖ ! [vulkan_amdvlk] gemma-3-27b-it-BF16-00001-of-00002 failed (exit 1) diff --git a/benchmark/results/gemma-3-27b-it-BF16-00001-of-00002__vulkan_radv.log b/benchmark/results/gemma-3-27b-it-BF16-00001-of-00002__vulkan_radv.log new file mode 100644 index 0000000..0dccabf --- /dev/null +++ b/benchmark/results/gemma-3-27b-it-BF16-00001-of-00002__vulkan_radv.log @@ -0,0 +1,8 @@ +ggml_vulkan: Found 1 Vulkan devices: +ggml_vulkan: 0 = Radeon 8060S Graphics (RADV GFX1151) (radv) | uma: 1 | fp16: 1 | bf16: 0 | warp size: 64 | shared memory: 65536 | int dot: 1 | matrix cores: KHR_coopmat +| model | size | params | backend | ngl | mmap | test | t/s | +| ------------------------------ | ---------: | ---------: | ---------- | --: | ---: | --------------: | -------------------: | +| gemma3 27B BF16 | 50.31 GiB | 27.01 B | Vulkan | 99 | 0 | pp512 | 135.40 ± 0.29 | +| gemma3 27B BF16 | 50.31 GiB | 27.01 B | Vulkan | 99 | 0 | tg128 | 3.98 ± 0.00 | + +build: 66625a59 (6040) diff --git a/benchmark/results/llama3.3-70.6B-Q4_K_M__rocm6_4_2.log b/benchmark/results/llama3.3-70.6B-Q4_K_M__rocm6_4_2.log new file mode 100644 index 0000000..cd91f9d --- /dev/null +++ b/benchmark/results/llama3.3-70.6B-Q4_K_M__rocm6_4_2.log @@ -0,0 
+1,10 @@ +ggml_cuda_init: GGML_CUDA_FORCE_MMQ: no +ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no +ggml_cuda_init: found 1 ROCm devices: + Device 0: Radeon 8060S Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32 +| model | size | params | backend | ngl | mmap | test | t/s | +| ------------------------------ | ---------: | ---------: | ---------- | --: | ---: | --------------: | -------------------: | +| llama 70B Q4_K - Medium | 39.59 GiB | 70.55 B | ROCm | 99 | 0 | pp512 | 33.89 ± 0.03 | +| llama 70B Q4_K - Medium | 39.59 GiB | 70.55 B | ROCm | 99 | 0 | tg128 | 4.59 ± 0.00 | + +build: 66625a59 (6040) diff --git a/benchmark/results/llama3.3-70.6B-Q4_K_M__rocm7_beta.log b/benchmark/results/llama3.3-70.6B-Q4_K_M__rocm7_beta.log new file mode 100644 index 0000000..cdd01d1 --- /dev/null +++ b/benchmark/results/llama3.3-70.6B-Q4_K_M__rocm7_beta.log @@ -0,0 +1,10 @@ +ggml_cuda_init: GGML_CUDA_FORCE_MMQ: no +ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no +ggml_cuda_init: found 1 ROCm devices: + Device 0: AMD Radeon Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32 +| model | size | params | backend | ngl | mmap | test | t/s | +| ------------------------------ | ---------: | ---------: | ---------- | --: | ---: | --------------: | -------------------: | +| llama 70B Q4_K - Medium | 39.59 GiB | 70.55 B | ROCm | 99 | 0 | pp512 | 33.91 ± 0.04 | +| llama 70B Q4_K - Medium | 39.59 GiB | 70.55 B | ROCm | 99 | 0 | tg128 | 4.60 ± 0.00 | + +build: 66625a59 (6040) diff --git a/benchmark/results/llama3.3-70.6B-Q4_K_M__rocm7_rc.log b/benchmark/results/llama3.3-70.6B-Q4_K_M__rocm7_rc.log new file mode 100644 index 0000000..782d37e --- /dev/null +++ b/benchmark/results/llama3.3-70.6B-Q4_K_M__rocm7_rc.log @@ -0,0 +1,10 @@ +ggml_cuda_init: GGML_CUDA_FORCE_MMQ: no +ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no +ggml_cuda_init: found 1 ROCm devices: + Device 0: AMD Radeon Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32 +| model | size | params | backend | ngl | mmap | test | t/s | +| 
------------------------------ | ---------: | ---------: | ---------- | --: | ---: | --------------: | -------------------: | +| llama 70B Q4_K - Medium | 39.59 GiB | 70.55 B | ROCm | 99 | 0 | pp512 | 33.82 ± 0.05 | +| llama 70B Q4_K - Medium | 39.59 GiB | 70.55 B | ROCm | 99 | 0 | tg128 | 4.52 ± 0.00 | + +build: 4cb208c9 (6066) diff --git a/benchmark/results/llama3.3-70.6B-Q4_K_M__vulkan_amdvlk.log b/benchmark/results/llama3.3-70.6B-Q4_K_M__vulkan_amdvlk.log new file mode 100644 index 0000000..2755187 --- /dev/null +++ b/benchmark/results/llama3.3-70.6B-Q4_K_M__vulkan_amdvlk.log @@ -0,0 +1,8 @@ +ggml_vulkan: Found 1 Vulkan devices: +ggml_vulkan: 0 = Radeon 8060S Graphics (AMD open-source driver) | uma: 1 | fp16: 1 | bf16: 0 | warp size: 64 | shared memory: 32768 | int dot: 1 | matrix cores: KHR_coopmat +| model | size | params | backend | ngl | mmap | test | t/s | +| ------------------------------ | ---------: | ---------: | ---------- | --: | ---: | --------------: | -------------------: | +| llama 70B Q4_K - Medium | 39.59 GiB | 70.55 B | Vulkan | 99 | 0 | pp512 | 72.75 ± 0.03 | +| llama 70B Q4_K - Medium | 39.59 GiB | 70.55 B | Vulkan | 99 | 0 | tg128 | 5.01 ± 0.00 | + +build: 9c35706b (6060) diff --git a/benchmark/results/llama3.3-70.6B-Q4_K_M__vulkan_radv.log b/benchmark/results/llama3.3-70.6B-Q4_K_M__vulkan_radv.log new file mode 100644 index 0000000..b827d6f --- /dev/null +++ b/benchmark/results/llama3.3-70.6B-Q4_K_M__vulkan_radv.log @@ -0,0 +1,8 @@ +ggml_vulkan: Found 1 Vulkan devices: +ggml_vulkan: 0 = Radeon 8060S Graphics (RADV GFX1151) (radv) | uma: 1 | fp16: 1 | bf16: 0 | warp size: 64 | shared memory: 65536 | int dot: 1 | matrix cores: KHR_coopmat +| model | size | params | backend | ngl | mmap | test | t/s | +| ------------------------------ | ---------: | ---------: | ---------- | --: | ---: | --------------: | -------------------: | +| llama 70B Q4_K - Medium | 39.59 GiB | 70.55 B | Vulkan | 99 | 0 | pp512 | 79.12 ± 0.14 | +| llama 70B Q4_K - 
Medium | 39.59 GiB | 70.55 B | Vulkan | 99 | 0 | tg128 | 4.97 ± 0.00 | + +build: 66625a59 (6040) diff --git a/benchmark/run_benchmarks.log b/benchmark/run_benchmarks.log new file mode 100644 index 0000000..0abefb2 --- /dev/null +++ b/benchmark/run_benchmarks.log @@ -0,0 +1,314 @@ +Found 11 model(s) to bench: + • /home/kyuz0/models/gemma-3-12b-it-UD-Q8_K_XL/gemma-3-12b-it-UD-Q8_K_XL.gguf + • /home/kyuz0/models/gemma-3-27b-it-BF16/gemma-3-27b-it-BF16-00001-of-00002.gguf + • /home/kyuz0/models/kimi-dev-72B-Q8_K_XL/UD-Q8_K_XL/Kimi-Dev-72B-UD-Q8_K_XL-00001-of-00002.gguf + • /home/kyuz0/models/llama-3.3-70B-Instruct/UD-Q8_K_XL/Llama-3.3-70B-Instruct-UD-Q8_K_XL-00001-of-00002.gguf + • /home/kyuz0/models/llama-3.3-Q4_K_M/llama3.3-70.6B-Q4_K_M.gguf + • /home/kyuz0/models/llama-4-scout-17b-16e/Q4_K_XL/Llama-4-Scout-17B-16E-Instruct-UD-Q4_K_XL-00001-of-00002.gguf + • /home/kyuz0/models/llama-4-scout-17b-16e/Q6_K/Llama-4-Scout-17B-16E-Instruct-Q6_K-00001-of-00002.gguf + • /home/kyuz0/models/llama-4-scout-17b-16e/Q8_0/Llama-4-Scout-17B-16E-Instruct-Q8_0-00001-of-00003.gguf + • /home/kyuz0/models/qwen-3-235B-Q3_K-XL/UD-Q3_K_XL/Qwen3-235B-A22B-Instruct-2507-UD-Q3_K_XL-00001-of-00003.gguf + • /home/kyuz0/models/qwen-3-30B-A3B/BF16/Qwen3-30B-A3B-BF16-00001-of-00002.gguf + • /home/kyuz0/models/qwen3-coder-30B-A3B/BF16/Qwen3-Coder-30B-A3B-Instruct-BF16-00001-of-00002.gguf + + +▶ [rocm7_rc] gemma-3-12b-it-UD-Q8_K_XL + → log: results/gemma-3-12b-it-UD-Q8_K_XL__rocm7_rc.log + → cmd: toolbox run -c llama-rocm-7rc -- /usr/local/bin/llama-bench -ngl 99 -mmp 0 -m /home/kyuz0/models/gemma-3-12b-it-UD-Q8_K_XL/gemma-3-12b-it-UD-Q8_K_XL.gguf + + +▶ [rocm7_beta] gemma-3-12b-it-UD-Q8_K_XL + → log: results/gemma-3-12b-it-UD-Q8_K_XL__rocm7_beta.log + → cmd: toolbox run -c llama-rocm-7beta -- /usr/local/bin/llama-bench -ngl 99 -mmp 0 -m /home/kyuz0/models/gemma-3-12b-it-UD-Q8_K_XL/gemma-3-12b-it-UD-Q8_K_XL.gguf + + +▶ [vulkan_radv] gemma-3-12b-it-UD-Q8_K_XL + → log: 
results/gemma-3-12b-it-UD-Q8_K_XL__vulkan_radv.log + → cmd: toolbox run -c llama-vulkan-radv -- /usr/sbin/llama-bench -ngl 99 -mmp 0 -m /home/kyuz0/models/gemma-3-12b-it-UD-Q8_K_XL/gemma-3-12b-it-UD-Q8_K_XL.gguf + + +▶ [vulkan_amdvlk] gemma-3-12b-it-UD-Q8_K_XL + → log: results/gemma-3-12b-it-UD-Q8_K_XL__vulkan_amdvlk.log + → cmd: toolbox run -c llama-vulkan-amdvlk -- /usr/sbin/llama-bench -ngl 99 -mmp 0 -m /home/kyuz0/models/gemma-3-12b-it-UD-Q8_K_XL/gemma-3-12b-it-UD-Q8_K_XL.gguf + + +▶ [rocm6_4_2] gemma-3-12b-it-UD-Q8_K_XL + → log: results/gemma-3-12b-it-UD-Q8_K_XL__rocm6_4_2.log + → cmd: toolbox run -c llama-rocm-6.4.2 -- /usr/local/bin/llama-bench -ngl 99 -mmp 0 -m /home/kyuz0/models/gemma-3-12b-it-UD-Q8_K_XL/gemma-3-12b-it-UD-Q8_K_XL.gguf + + + +▶ [rocm7_rc] gemma-3-27b-it-BF16-00001-of-00002 + → log: results/gemma-3-27b-it-BF16-00001-of-00002__rocm7_rc.log + → cmd: toolbox run -c llama-rocm-7rc -- /usr/local/bin/llama-bench -ngl 99 -mmp 0 -m /home/kyuz0/models/gemma-3-27b-it-BF16/gemma-3-27b-it-BF16-00001-of-00002.gguf + + +▶ [rocm7_beta] gemma-3-27b-it-BF16-00001-of-00002 + → log: results/gemma-3-27b-it-BF16-00001-of-00002__rocm7_beta.log + → cmd: toolbox run -c llama-rocm-7beta -- /usr/local/bin/llama-bench -ngl 99 -mmp 0 -m /home/kyuz0/models/gemma-3-27b-it-BF16/gemma-3-27b-it-BF16-00001-of-00002.gguf + + +▶ [vulkan_radv] gemma-3-27b-it-BF16-00001-of-00002 + → log: results/gemma-3-27b-it-BF16-00001-of-00002__vulkan_radv.log + → cmd: toolbox run -c llama-vulkan-radv -- /usr/sbin/llama-bench -ngl 99 -mmp 0 -m /home/kyuz0/models/gemma-3-27b-it-BF16/gemma-3-27b-it-BF16-00001-of-00002.gguf + + +▶ [vulkan_amdvlk] gemma-3-27b-it-BF16-00001-of-00002 + → log: results/gemma-3-27b-it-BF16-00001-of-00002__vulkan_amdvlk.log + → cmd: toolbox run -c llama-vulkan-amdvlk -- /usr/sbin/llama-bench -ngl 99 -mmp 0 -m /home/kyuz0/models/gemma-3-27b-it-BF16/gemma-3-27b-it-BF16-00001-of-00002.gguf + + * [vulkan_amdvlk] gemma-3-27b-it-BF16-00001-of-00002 : FAILED + +▶ [rocm6_4_2] 
gemma-3-27b-it-BF16-00001-of-00002 + → log: results/gemma-3-27b-it-BF16-00001-of-00002__rocm6_4_2.log + → cmd: toolbox run -c llama-rocm-6.4.2 -- /usr/local/bin/llama-bench -ngl 99 -mmp 0 -m /home/kyuz0/models/gemma-3-27b-it-BF16/gemma-3-27b-it-BF16-00001-of-00002.gguf + + + * [host] gemma-3-27b-it-BF16-00001-of-00002 : FAILED + +▶ [rocm7_rc] Kimi-Dev-72B-UD-Q8_K_XL-00001-of-00002 + → log: results/Kimi-Dev-72B-UD-Q8_K_XL-00001-of-00002__rocm7_rc.log + → cmd: toolbox run -c llama-rocm-7rc -- /usr/local/bin/llama-bench -ngl 99 -mmp 0 -m /home/kyuz0/models/kimi-dev-72B-Q8_K_XL/UD-Q8_K_XL/Kimi-Dev-72B-UD-Q8_K_XL-00001-of-00002.gguf + + * [rocm7_rc] Kimi-Dev-72B-UD-Q8_K_XL-00001-of-00002 : FAILED + +▶ [rocm7_beta] Kimi-Dev-72B-UD-Q8_K_XL-00001-of-00002 + → log: results/Kimi-Dev-72B-UD-Q8_K_XL-00001-of-00002__rocm7_beta.log + → cmd: toolbox run -c llama-rocm-7beta -- /usr/local/bin/llama-bench -ngl 99 -mmp 0 -m /home/kyuz0/models/kimi-dev-72B-Q8_K_XL/UD-Q8_K_XL/Kimi-Dev-72B-UD-Q8_K_XL-00001-of-00002.gguf + + * [rocm7_beta] Kimi-Dev-72B-UD-Q8_K_XL-00001-of-00002 : FAILED + +▶ [vulkan_radv] Kimi-Dev-72B-UD-Q8_K_XL-00001-of-00002 + → log: results/Kimi-Dev-72B-UD-Q8_K_XL-00001-of-00002__vulkan_radv.log + → cmd: toolbox run -c llama-vulkan-radv -- /usr/sbin/llama-bench -ngl 99 -mmp 0 -m /home/kyuz0/models/kimi-dev-72B-Q8_K_XL/UD-Q8_K_XL/Kimi-Dev-72B-UD-Q8_K_XL-00001-of-00002.gguf + + +▶ [vulkan_amdvlk] Kimi-Dev-72B-UD-Q8_K_XL-00001-of-00002 + → log: results/Kimi-Dev-72B-UD-Q8_K_XL-00001-of-00002__vulkan_amdvlk.log + → cmd: toolbox run -c llama-vulkan-amdvlk -- /usr/sbin/llama-bench -ngl 99 -mmp 0 -m /home/kyuz0/models/kimi-dev-72B-Q8_K_XL/UD-Q8_K_XL/Kimi-Dev-72B-UD-Q8_K_XL-00001-of-00002.gguf + + * [vulkan_amdvlk] Kimi-Dev-72B-UD-Q8_K_XL-00001-of-00002 : FAILED + +▶ [rocm6_4_2] Kimi-Dev-72B-UD-Q8_K_XL-00001-of-00002 + → log: results/Kimi-Dev-72B-UD-Q8_K_XL-00001-of-00002__rocm6_4_2.log + → cmd: toolbox run -c llama-rocm-6.4.2 -- /usr/local/bin/llama-bench -ngl 99 -mmp 0 -m 
/home/kyuz0/models/kimi-dev-72B-Q8_K_XL/UD-Q8_K_XL/Kimi-Dev-72B-UD-Q8_K_XL-00001-of-00002.gguf + + * [rocm6_4_2] Kimi-Dev-72B-UD-Q8_K_XL-00001-of-00002 : FAILED + + * [host] Kimi-Dev-72B-UD-Q8_K_XL-00001-of-00002 : FAILED + +▶ [rocm7_rc] Llama-3.3-70B-Instruct-UD-Q8_K_XL-00001-of-00002 + → log: results/Llama-3.3-70B-Instruct-UD-Q8_K_XL-00001-of-00002__rocm7_rc.log + → cmd: toolbox run -c llama-rocm-7rc -- /usr/local/bin/llama-bench -ngl 99 -mmp 0 -m /home/kyuz0/models/llama-3.3-70B-Instruct/UD-Q8_K_XL/Llama-3.3-70B-Instruct-UD-Q8_K_XL-00001-of-00002.gguf + + * [rocm7_rc] Llama-3.3-70B-Instruct-UD-Q8_K_XL-00001-of-00002 : FAILED + +▶ [rocm7_beta] Llama-3.3-70B-Instruct-UD-Q8_K_XL-00001-of-00002 + → log: results/Llama-3.3-70B-Instruct-UD-Q8_K_XL-00001-of-00002__rocm7_beta.log + → cmd: toolbox run -c llama-rocm-7beta -- /usr/local/bin/llama-bench -ngl 99 -mmp 0 -m /home/kyuz0/models/llama-3.3-70B-Instruct/UD-Q8_K_XL/Llama-3.3-70B-Instruct-UD-Q8_K_XL-00001-of-00002.gguf + + * [rocm7_beta] Llama-3.3-70B-Instruct-UD-Q8_K_XL-00001-of-00002 : FAILED + +▶ [vulkan_radv] Llama-3.3-70B-Instruct-UD-Q8_K_XL-00001-of-00002 + → log: results/Llama-3.3-70B-Instruct-UD-Q8_K_XL-00001-of-00002__vulkan_radv.log + → cmd: toolbox run -c llama-vulkan-radv -- /usr/sbin/llama-bench -ngl 99 -mmp 0 -m /home/kyuz0/models/llama-3.3-70B-Instruct/UD-Q8_K_XL/Llama-3.3-70B-Instruct-UD-Q8_K_XL-00001-of-00002.gguf + + +▶ [vulkan_amdvlk] Llama-3.3-70B-Instruct-UD-Q8_K_XL-00001-of-00002 + → log: results/Llama-3.3-70B-Instruct-UD-Q8_K_XL-00001-of-00002__vulkan_amdvlk.log + → cmd: toolbox run -c llama-vulkan-amdvlk -- /usr/sbin/llama-bench -ngl 99 -mmp 0 -m /home/kyuz0/models/llama-3.3-70B-Instruct/UD-Q8_K_XL/Llama-3.3-70B-Instruct-UD-Q8_K_XL-00001-of-00002.gguf + + +▶ [rocm6_4_2] Llama-3.3-70B-Instruct-UD-Q8_K_XL-00001-of-00002 + → log: results/Llama-3.3-70B-Instruct-UD-Q8_K_XL-00001-of-00002__rocm6_4_2.log + → cmd: toolbox run -c llama-rocm-6.4.2 -- /usr/local/bin/llama-bench -ngl 99 -mmp 0 -m 
/home/kyuz0/models/llama-3.3-70B-Instruct/UD-Q8_K_XL/Llama-3.3-70B-Instruct-UD-Q8_K_XL-00001-of-00002.gguf + + + +▶ [rocm7_rc] llama3.3-70.6B-Q4_K_M + → log: results/llama3.3-70.6B-Q4_K_M__rocm7_rc.log + → cmd: toolbox run -c llama-rocm-7rc -- /usr/local/bin/llama-bench -ngl 99 -mmp 0 -m /home/kyuz0/models/llama-3.3-Q4_K_M/llama3.3-70.6B-Q4_K_M.gguf + + +▶ [rocm7_beta] llama3.3-70.6B-Q4_K_M + → log: results/llama3.3-70.6B-Q4_K_M__rocm7_beta.log + → cmd: toolbox run -c llama-rocm-7beta -- /usr/local/bin/llama-bench -ngl 99 -mmp 0 -m /home/kyuz0/models/llama-3.3-Q4_K_M/llama3.3-70.6B-Q4_K_M.gguf + + +▶ [vulkan_radv] llama3.3-70.6B-Q4_K_M + → log: results/llama3.3-70.6B-Q4_K_M__vulkan_radv.log + → cmd: toolbox run -c llama-vulkan-radv -- /usr/sbin/llama-bench -ngl 99 -mmp 0 -m /home/kyuz0/models/llama-3.3-Q4_K_M/llama3.3-70.6B-Q4_K_M.gguf + + +▶ [vulkan_amdvlk] llama3.3-70.6B-Q4_K_M + → log: results/llama3.3-70.6B-Q4_K_M__vulkan_amdvlk.log + → cmd: toolbox run -c llama-vulkan-amdvlk -- /usr/sbin/llama-bench -ngl 99 -mmp 0 -m /home/kyuz0/models/llama-3.3-Q4_K_M/llama3.3-70.6B-Q4_K_M.gguf + + +▶ [rocm6_4_2] llama3.3-70.6B-Q4_K_M + → log: results/llama3.3-70.6B-Q4_K_M__rocm6_4_2.log + → cmd: toolbox run -c llama-rocm-6.4.2 -- /usr/local/bin/llama-bench -ngl 99 -mmp 0 -m /home/kyuz0/models/llama-3.3-Q4_K_M/llama3.3-70.6B-Q4_K_M.gguf + + + +▶ [rocm7_rc] Llama-4-Scout-17B-16E-Instruct-UD-Q4_K_XL-00001-of-00002 + → log: results/Llama-4-Scout-17B-16E-Instruct-UD-Q4_K_XL-00001-of-00002__rocm7_rc.log + → cmd: toolbox run -c llama-rocm-7rc -- /usr/local/bin/llama-bench -ngl 99 -mmp 0 -m /home/kyuz0/models/llama-4-scout-17b-16e/Q4_K_XL/Llama-4-Scout-17B-16E-Instruct-UD-Q4_K_XL-00001-of-00002.gguf + + * [rocm7_rc] Llama-4-Scout-17B-16E-Instruct-UD-Q4_K_XL-00001-of-00002 : FAILED + +▶ [rocm7_beta] Llama-4-Scout-17B-16E-Instruct-UD-Q4_K_XL-00001-of-00002 + → log: results/Llama-4-Scout-17B-16E-Instruct-UD-Q4_K_XL-00001-of-00002__rocm7_beta.log + → cmd: toolbox run -c llama-rocm-7beta 
-- /usr/local/bin/llama-bench -ngl 99 -mmp 0 -m /home/kyuz0/models/llama-4-scout-17b-16e/Q4_K_XL/Llama-4-Scout-17B-16E-Instruct-UD-Q4_K_XL-00001-of-00002.gguf + + +▶ [vulkan_radv] Llama-4-Scout-17B-16E-Instruct-UD-Q4_K_XL-00001-of-00002 + → log: results/Llama-4-Scout-17B-16E-Instruct-UD-Q4_K_XL-00001-of-00002__vulkan_radv.log + → cmd: toolbox run -c llama-vulkan-radv -- /usr/sbin/llama-bench -ngl 99 -mmp 0 -m /home/kyuz0/models/llama-4-scout-17b-16e/Q4_K_XL/Llama-4-Scout-17B-16E-Instruct-UD-Q4_K_XL-00001-of-00002.gguf + + +▶ [vulkan_amdvlk] Llama-4-Scout-17B-16E-Instruct-UD-Q4_K_XL-00001-of-00002 + → log: results/Llama-4-Scout-17B-16E-Instruct-UD-Q4_K_XL-00001-of-00002__vulkan_amdvlk.log + → cmd: toolbox run -c llama-vulkan-amdvlk -- /usr/sbin/llama-bench -ngl 99 -mmp 0 -m /home/kyuz0/models/llama-4-scout-17b-16e/Q4_K_XL/Llama-4-Scout-17B-16E-Instruct-UD-Q4_K_XL-00001-of-00002.gguf + + +▶ [rocm6_4_2] Llama-4-Scout-17B-16E-Instruct-UD-Q4_K_XL-00001-of-00002 + → log: results/Llama-4-Scout-17B-16E-Instruct-UD-Q4_K_XL-00001-of-00002__rocm6_4_2.log + → cmd: toolbox run -c llama-rocm-6.4.2 -- /usr/local/bin/llama-bench -ngl 99 -mmp 0 -m /home/kyuz0/models/llama-4-scout-17b-16e/Q4_K_XL/Llama-4-Scout-17B-16E-Instruct-UD-Q4_K_XL-00001-of-00002.gguf + + + +▶ [rocm7_rc] Llama-4-Scout-17B-16E-Instruct-Q6_K-00001-of-00002 + → log: results/Llama-4-Scout-17B-16E-Instruct-Q6_K-00001-of-00002__rocm7_rc.log + → cmd: toolbox run -c llama-rocm-7rc -- /usr/local/bin/llama-bench -ngl 99 -mmp 0 -m /home/kyuz0/models/llama-4-scout-17b-16e/Q6_K/Llama-4-Scout-17B-16E-Instruct-Q6_K-00001-of-00002.gguf + + +▶ [rocm7_beta] Llama-4-Scout-17B-16E-Instruct-Q6_K-00001-of-00002 + → log: results/Llama-4-Scout-17B-16E-Instruct-Q6_K-00001-of-00002__rocm7_beta.log + → cmd: toolbox run -c llama-rocm-7beta -- /usr/local/bin/llama-bench -ngl 99 -mmp 0 -m /home/kyuz0/models/llama-4-scout-17b-16e/Q6_K/Llama-4-Scout-17B-16E-Instruct-Q6_K-00001-of-00002.gguf + + * [rocm7_beta] 
Llama-4-Scout-17B-16E-Instruct-Q6_K-00001-of-00002 : FAILED + +▶ [vulkan_radv] Llama-4-Scout-17B-16E-Instruct-Q6_K-00001-of-00002 + → log: results/Llama-4-Scout-17B-16E-Instruct-Q6_K-00001-of-00002__vulkan_radv.log + → cmd: toolbox run -c llama-vulkan-radv -- /usr/sbin/llama-bench -ngl 99 -mmp 0 -m /home/kyuz0/models/llama-4-scout-17b-16e/Q6_K/Llama-4-Scout-17B-16E-Instruct-Q6_K-00001-of-00002.gguf + + +▶ [vulkan_amdvlk] Llama-4-Scout-17B-16E-Instruct-Q6_K-00001-of-00002 + → log: results/Llama-4-Scout-17B-16E-Instruct-Q6_K-00001-of-00002__vulkan_amdvlk.log + → cmd: toolbox run -c llama-vulkan-amdvlk -- /usr/sbin/llama-bench -ngl 99 -mmp 0 -m /home/kyuz0/models/llama-4-scout-17b-16e/Q6_K/Llama-4-Scout-17B-16E-Instruct-Q6_K-00001-of-00002.gguf + + +▶ [rocm6_4_2] Llama-4-Scout-17B-16E-Instruct-Q6_K-00001-of-00002 + → log: results/Llama-4-Scout-17B-16E-Instruct-Q6_K-00001-of-00002__rocm6_4_2.log + → cmd: toolbox run -c llama-rocm-6.4.2 -- /usr/local/bin/llama-bench -ngl 99 -mmp 0 -m /home/kyuz0/models/llama-4-scout-17b-16e/Q6_K/Llama-4-Scout-17B-16E-Instruct-Q6_K-00001-of-00002.gguf + + + +▶ [rocm7_rc] Llama-4-Scout-17B-16E-Instruct-Q8_0-00001-of-00003 + → log: results/Llama-4-Scout-17B-16E-Instruct-Q8_0-00001-of-00003__rocm7_rc.log + → cmd: toolbox run -c llama-rocm-7rc -- /usr/local/bin/llama-bench -ngl 99 -mmp 0 -m /home/kyuz0/models/llama-4-scout-17b-16e/Q8_0/Llama-4-Scout-17B-16E-Instruct-Q8_0-00001-of-00003.gguf + + * [rocm7_rc] Llama-4-Scout-17B-16E-Instruct-Q8_0-00001-of-00003 : FAILED + +▶ [rocm7_beta] Llama-4-Scout-17B-16E-Instruct-Q8_0-00001-of-00003 + → log: results/Llama-4-Scout-17B-16E-Instruct-Q8_0-00001-of-00003__rocm7_beta.log + → cmd: toolbox run -c llama-rocm-7beta -- /usr/local/bin/llama-bench -ngl 99 -mmp 0 -m /home/kyuz0/models/llama-4-scout-17b-16e/Q8_0/Llama-4-Scout-17B-16E-Instruct-Q8_0-00001-of-00003.gguf + + * [rocm7_beta] Llama-4-Scout-17B-16E-Instruct-Q8_0-00001-of-00003 : FAILED + +▶ [vulkan_radv] 
Llama-4-Scout-17B-16E-Instruct-Q8_0-00001-of-00003 + → log: results/Llama-4-Scout-17B-16E-Instruct-Q8_0-00001-of-00003__vulkan_radv.log + → cmd: toolbox run -c llama-vulkan-radv -- /usr/sbin/llama-bench -ngl 99 -mmp 0 -m /home/kyuz0/models/llama-4-scout-17b-16e/Q8_0/Llama-4-Scout-17B-16E-Instruct-Q8_0-00001-of-00003.gguf + + +▶ [vulkan_amdvlk] Llama-4-Scout-17B-16E-Instruct-Q8_0-00001-of-00003 + → log: results/Llama-4-Scout-17B-16E-Instruct-Q8_0-00001-of-00003__vulkan_amdvlk.log + → cmd: toolbox run -c llama-vulkan-amdvlk -- /usr/sbin/llama-bench -ngl 99 -mmp 0 -m /home/kyuz0/models/llama-4-scout-17b-16e/Q8_0/Llama-4-Scout-17B-16E-Instruct-Q8_0-00001-of-00003.gguf + + +▶ [rocm6_4_2] Llama-4-Scout-17B-16E-Instruct-Q8_0-00001-of-00003 + → log: results/Llama-4-Scout-17B-16E-Instruct-Q8_0-00001-of-00003__rocm6_4_2.log + → cmd: toolbox run -c llama-rocm-6.4.2 -- /usr/local/bin/llama-bench -ngl 99 -mmp 0 -m /home/kyuz0/models/llama-4-scout-17b-16e/Q8_0/Llama-4-Scout-17B-16E-Instruct-Q8_0-00001-of-00003.gguf + + * [rocm6_4_2] Llama-4-Scout-17B-16E-Instruct-Q8_0-00001-of-00003 : FAILED + + +▶ [rocm7_rc] Qwen3-235B-A22B-Instruct-2507-UD-Q3_K_XL-00001-of-00003 + → log: results/Qwen3-235B-A22B-Instruct-2507-UD-Q3_K_XL-00001-of-00003__rocm7_rc.log + → cmd: toolbox run -c llama-rocm-7rc -- /usr/local/bin/llama-bench -ngl 99 -mmp 0 -m /home/kyuz0/models/qwen-3-235B-Q3_K-XL/UD-Q3_K_XL/Qwen3-235B-A22B-Instruct-2507-UD-Q3_K_XL-00001-of-00003.gguf + + +▶ [rocm7_beta] Qwen3-235B-A22B-Instruct-2507-UD-Q3_K_XL-00001-of-00003 + → log: results/Qwen3-235B-A22B-Instruct-2507-UD-Q3_K_XL-00001-of-00003__rocm7_beta.log + → cmd: toolbox run -c llama-rocm-7beta -- /usr/local/bin/llama-bench -ngl 99 -mmp 0 -m /home/kyuz0/models/qwen-3-235B-Q3_K-XL/UD-Q3_K_XL/Qwen3-235B-A22B-Instruct-2507-UD-Q3_K_XL-00001-of-00003.gguf + + * [rocm7_beta] Qwen3-235B-A22B-Instruct-2507-UD-Q3_K_XL-00001-of-00003 : FAILED + +▶ [vulkan_radv] Qwen3-235B-A22B-Instruct-2507-UD-Q3_K_XL-00001-of-00003 + → log: 
results/Qwen3-235B-A22B-Instruct-2507-UD-Q3_K_XL-00001-of-00003__vulkan_radv.log + → cmd: toolbox run -c llama-vulkan-radv -- /usr/sbin/llama-bench -ngl 99 -mmp 0 -m /home/kyuz0/models/qwen-3-235B-Q3_K-XL/UD-Q3_K_XL/Qwen3-235B-A22B-Instruct-2507-UD-Q3_K_XL-00001-of-00003.gguf + + +▶ [vulkan_amdvlk] Qwen3-235B-A22B-Instruct-2507-UD-Q3_K_XL-00001-of-00003 + → log: results/Qwen3-235B-A22B-Instruct-2507-UD-Q3_K_XL-00001-of-00003__vulkan_amdvlk.log + → cmd: toolbox run -c llama-vulkan-amdvlk -- /usr/sbin/llama-bench -ngl 99 -mmp 0 -m /home/kyuz0/models/qwen-3-235B-Q3_K-XL/UD-Q3_K_XL/Qwen3-235B-A22B-Instruct-2507-UD-Q3_K_XL-00001-of-00003.gguf + + +▶ [rocm6_4_2] Qwen3-235B-A22B-Instruct-2507-UD-Q3_K_XL-00001-of-00003 + → log: results/Qwen3-235B-A22B-Instruct-2507-UD-Q3_K_XL-00001-of-00003__rocm6_4_2.log + → cmd: toolbox run -c llama-rocm-6.4.2 -- /usr/local/bin/llama-bench -ngl 99 -mmp 0 -m /home/kyuz0/models/qwen-3-235B-Q3_K-XL/UD-Q3_K_XL/Qwen3-235B-A22B-Instruct-2507-UD-Q3_K_XL-00001-of-00003.gguf + + + +▶ [rocm7_rc] Qwen3-30B-A3B-BF16-00001-of-00002 + → log: results/Qwen3-30B-A3B-BF16-00001-of-00002__rocm7_rc.log + → cmd: toolbox run -c llama-rocm-7rc -- /usr/local/bin/llama-bench -ngl 99 -mmp 0 -m /home/kyuz0/models/qwen-3-30B-A3B/BF16/Qwen3-30B-A3B-BF16-00001-of-00002.gguf + + +▶ [rocm7_beta] Qwen3-30B-A3B-BF16-00001-of-00002 + → log: results/Qwen3-30B-A3B-BF16-00001-of-00002__rocm7_beta.log + → cmd: toolbox run -c llama-rocm-7beta -- /usr/local/bin/llama-bench -ngl 99 -mmp 0 -m /home/kyuz0/models/qwen-3-30B-A3B/BF16/Qwen3-30B-A3B-BF16-00001-of-00002.gguf + + +▶ [vulkan_radv] Qwen3-30B-A3B-BF16-00001-of-00002 + → log: results/Qwen3-30B-A3B-BF16-00001-of-00002__vulkan_radv.log + → cmd: toolbox run -c llama-vulkan-radv -- /usr/sbin/llama-bench -ngl 99 -mmp 0 -m /home/kyuz0/models/qwen-3-30B-A3B/BF16/Qwen3-30B-A3B-BF16-00001-of-00002.gguf + + +▶ [vulkan_amdvlk] Qwen3-30B-A3B-BF16-00001-of-00002 + → log: results/Qwen3-30B-A3B-BF16-00001-of-00002__vulkan_amdvlk.log + → 
cmd: toolbox run -c llama-vulkan-amdvlk -- /usr/sbin/llama-bench -ngl 99 -mmp 0 -m /home/kyuz0/models/qwen-3-30B-A3B/BF16/Qwen3-30B-A3B-BF16-00001-of-00002.gguf + + +▶ [rocm6_4_2] Qwen3-30B-A3B-BF16-00001-of-00002 + → log: results/Qwen3-30B-A3B-BF16-00001-of-00002__rocm6_4_2.log + → cmd: toolbox run -c llama-rocm-6.4.2 -- /usr/local/bin/llama-bench -ngl 99 -mmp 0 -m /home/kyuz0/models/qwen-3-30B-A3B/BF16/Qwen3-30B-A3B-BF16-00001-of-00002.gguf + + + +▶ [rocm7_rc] Qwen3-Coder-30B-A3B-Instruct-BF16-00001-of-00002 + → log: results/Qwen3-Coder-30B-A3B-Instruct-BF16-00001-of-00002__rocm7_rc.log + → cmd: toolbox run -c llama-rocm-7rc -- /usr/local/bin/llama-bench -ngl 99 -mmp 0 -m /home/kyuz0/models/qwen3-coder-30B-A3B/BF16/Qwen3-Coder-30B-A3B-Instruct-BF16-00001-of-00002.gguf + + +▶ [rocm7_beta] Qwen3-Coder-30B-A3B-Instruct-BF16-00001-of-00002 + → log: results/Qwen3-Coder-30B-A3B-Instruct-BF16-00001-of-00002__rocm7_beta.log + → cmd: toolbox run -c llama-rocm-7beta -- /usr/local/bin/llama-bench -ngl 99 -mmp 0 -m /home/kyuz0/models/qwen3-coder-30B-A3B/BF16/Qwen3-Coder-30B-A3B-Instruct-BF16-00001-of-00002.gguf + + +▶ [vulkan_radv] Qwen3-Coder-30B-A3B-Instruct-BF16-00001-of-00002 + → log: results/Qwen3-Coder-30B-A3B-Instruct-BF16-00001-of-00002__vulkan_radv.log + → cmd: toolbox run -c llama-vulkan-radv -- /usr/sbin/llama-bench -ngl 99 -mmp 0 -m /home/kyuz0/models/qwen3-coder-30B-A3B/BF16/Qwen3-Coder-30B-A3B-Instruct-BF16-00001-of-00002.gguf + + +▶ [vulkan_amdvlk] Qwen3-Coder-30B-A3B-Instruct-BF16-00001-of-00002 + → log: results/Qwen3-Coder-30B-A3B-Instruct-BF16-00001-of-00002__vulkan_amdvlk.log + → cmd: toolbox run -c llama-vulkan-amdvlk -- /usr/sbin/llama-bench -ngl 99 -mmp 0 -m /home/kyuz0/models/qwen3-coder-30B-A3B/BF16/Qwen3-Coder-30B-A3B-Instruct-BF16-00001-of-00002.gguf + + +▶ [rocm6_4_2] Qwen3-Coder-30B-A3B-Instruct-BF16-00001-of-00002 + → log: results/Qwen3-Coder-30B-A3B-Instruct-BF16-00001-of-00002__rocm6_4_2.log + → cmd: toolbox run -c llama-rocm-6.4.2 -- 
/usr/local/bin/llama-bench -ngl 99 -mmp 0 -m /home/kyuz0/models/qwen3-coder-30B-A3B/BF16/Qwen3-Coder-30B-A3B-Instruct-BF16-00001-of-00002.gguf + + diff --git a/benchmark/run_benchmarks.log.backup b/benchmark/run_benchmarks.log.backup new file mode 100644 index 0000000..47d48fc --- /dev/null +++ b/benchmark/run_benchmarks.log.backup @@ -0,0 +1,358 @@ +Found 11 model(s) to bench: + • /home/kyuz0/models/gemma-3-12b-it-UD-Q8_K_XL/gemma-3-12b-it-UD-Q8_K_XL.gguf + • /home/kyuz0/models/gemma-3-27b-it-BF16/gemma-3-27b-it-BF16-00001-of-00002.gguf + • /home/kyuz0/models/kimi-dev-72B-Q8_K_XL/UD-Q8_K_XL/Kimi-Dev-72B-UD-Q8_K_XL-00001-of-00002.gguf + • /home/kyuz0/models/llama-3.3-70B-Instruct/UD-Q8_K_XL/Llama-3.3-70B-Instruct-UD-Q8_K_XL-00001-of-00002.gguf + • /home/kyuz0/models/llama-3.3-Q4_K_M/llama3.3-70.6B-Q4_K_M.gguf + • /home/kyuz0/models/llama-4-scout-17b-16e/Q4_K_XL/Llama-4-Scout-17B-16E-Instruct-UD-Q4_K_XL-00001-of-00002.gguf + • /home/kyuz0/models/llama-4-scout-17b-16e/Q6_K/Llama-4-Scout-17B-16E-Instruct-Q6_K-00001-of-00002.gguf + • /home/kyuz0/models/llama-4-scout-17b-16e/Q8_0/Llama-4-Scout-17B-16E-Instruct-Q8_0-00001-of-00003.gguf + • /home/kyuz0/models/qwen-3-235B-Q3_K-XL/UD-Q3_K_XL/Qwen3-235B-A22B-Instruct-2507-UD-Q3_K_XL-00001-of-00003.gguf + • /home/kyuz0/models/qwen-3-30B-A3B/BF16/Qwen3-30B-A3B-BF16-00001-of-00002.gguf + • /home/kyuz0/models/qwen3-coder-30B-A3B/BF16/Qwen3-Coder-30B-A3B-Instruct-BF16-00001-of-00002.gguf + + +▶ [rocm7_rc] gemma-3-12b-it-UD-Q8_K_XL + → log: results/gemma-3-12b-it-UD-Q8_K_XL__rocm7_rc.log + → cmd: toolbox run -c llama-rocm-7rc -- /usr/local/bin/llama-bench -ngl 99 -mmp 0 -m /home/kyuz0/models/gemma-3-12b-it-UD-Q8_K_XL/gemma-3-12b-it-UD-Q8_K_XL.gguf + + +▶ [rocm7_beta] gemma-3-12b-it-UD-Q8_K_XL + → log: results/gemma-3-12b-it-UD-Q8_K_XL__rocm7_beta.log + → cmd: toolbox run -c llama-rocm-7beta -- /usr/local/bin/llama-bench -ngl 99 -mmp 0 -m /home/kyuz0/models/gemma-3-12b-it-UD-Q8_K_XL/gemma-3-12b-it-UD-Q8_K_XL.gguf + + +▶ 
[vulkan_radv] gemma-3-12b-it-UD-Q8_K_XL + → log: results/gemma-3-12b-it-UD-Q8_K_XL__vulkan_radv.log + → cmd: toolbox run -c llama-vulkan-radv -- /usr/sbin/llama-bench -ngl 99 -mmp 0 -m /home/kyuz0/models/gemma-3-12b-it-UD-Q8_K_XL/gemma-3-12b-it-UD-Q8_K_XL.gguf + + +▶ [vulkan_amdvlk] gemma-3-12b-it-UD-Q8_K_XL + → log: results/gemma-3-12b-it-UD-Q8_K_XL__vulkan_amdvlk.log + → cmd: toolbox run -c llama-vulkan-amdvlk -- /usr/sbin/llama-bench -ngl 99 -mmp 0 -m /home/kyuz0/models/gemma-3-12b-it-UD-Q8_K_XL/gemma-3-12b-it-UD-Q8_K_XL.gguf + + +▶ [rocm6_4_2] gemma-3-12b-it-UD-Q8_K_XL + → log: results/gemma-3-12b-it-UD-Q8_K_XL__rocm6_4_2.log + → cmd: toolbox run -c llama-rocm-6.4.2 -- /usr/local/bin/llama-bench -ngl 99 -mmp 0 -m /home/kyuz0/models/gemma-3-12b-it-UD-Q8_K_XL/gemma-3-12b-it-UD-Q8_K_XL.gguf + + +▶ [host] gemma-3-12b-it-UD-Q8_K_XL + → log: results/gemma-3-12b-it-UD-Q8_K_XL__host.log + → cmd: llama-bench -ngl 99 -mmp 0 -m /home/kyuz0/models/gemma-3-12b-it-UD-Q8_K_XL/gemma-3-12b-it-UD-Q8_K_XL.gguf + + +▶ [rocm7_rc] gemma-3-27b-it-BF16-00001-of-00002 + → log: results/gemma-3-27b-it-BF16-00001-of-00002__rocm7_rc.log + → cmd: toolbox run -c llama-rocm-7rc -- /usr/local/bin/llama-bench -ngl 99 -mmp 0 -m /home/kyuz0/models/gemma-3-27b-it-BF16/gemma-3-27b-it-BF16-00001-of-00002.gguf + + +▶ [rocm7_beta] gemma-3-27b-it-BF16-00001-of-00002 + → log: results/gemma-3-27b-it-BF16-00001-of-00002__rocm7_beta.log + → cmd: toolbox run -c llama-rocm-7beta -- /usr/local/bin/llama-bench -ngl 99 -mmp 0 -m /home/kyuz0/models/gemma-3-27b-it-BF16/gemma-3-27b-it-BF16-00001-of-00002.gguf + + +▶ [vulkan_radv] gemma-3-27b-it-BF16-00001-of-00002 + → log: results/gemma-3-27b-it-BF16-00001-of-00002__vulkan_radv.log + → cmd: toolbox run -c llama-vulkan-radv -- /usr/sbin/llama-bench -ngl 99 -mmp 0 -m /home/kyuz0/models/gemma-3-27b-it-BF16/gemma-3-27b-it-BF16-00001-of-00002.gguf + + +▶ [vulkan_amdvlk] gemma-3-27b-it-BF16-00001-of-00002 + → log: 
results/gemma-3-27b-it-BF16-00001-of-00002__vulkan_amdvlk.log + → cmd: toolbox run -c llama-vulkan-amdvlk -- /usr/sbin/llama-bench -ngl 99 -mmp 0 -m /home/kyuz0/models/gemma-3-27b-it-BF16/gemma-3-27b-it-BF16-00001-of-00002.gguf + + * [vulkan_amdvlk] gemma-3-27b-it-BF16-00001-of-00002 : FAILED + +▶ [rocm6_4_2] gemma-3-27b-it-BF16-00001-of-00002 + → log: results/gemma-3-27b-it-BF16-00001-of-00002__rocm6_4_2.log + → cmd: toolbox run -c llama-rocm-6.4.2 -- /usr/local/bin/llama-bench -ngl 99 -mmp 0 -m /home/kyuz0/models/gemma-3-27b-it-BF16/gemma-3-27b-it-BF16-00001-of-00002.gguf + + +▶ [host] gemma-3-27b-it-BF16-00001-of-00002 + → log: results/gemma-3-27b-it-BF16-00001-of-00002__host.log + → cmd: llama-bench -ngl 99 -mmp 0 -m /home/kyuz0/models/gemma-3-27b-it-BF16/gemma-3-27b-it-BF16-00001-of-00002.gguf + + * [host] gemma-3-27b-it-BF16-00001-of-00002 : FAILED + +▶ [rocm7_rc] Kimi-Dev-72B-UD-Q8_K_XL-00001-of-00002 + → log: results/Kimi-Dev-72B-UD-Q8_K_XL-00001-of-00002__rocm7_rc.log + → cmd: toolbox run -c llama-rocm-7rc -- /usr/local/bin/llama-bench -ngl 99 -mmp 0 -m /home/kyuz0/models/kimi-dev-72B-Q8_K_XL/UD-Q8_K_XL/Kimi-Dev-72B-UD-Q8_K_XL-00001-of-00002.gguf + + * [rocm7_rc] Kimi-Dev-72B-UD-Q8_K_XL-00001-of-00002 : FAILED + +▶ [rocm7_beta] Kimi-Dev-72B-UD-Q8_K_XL-00001-of-00002 + → log: results/Kimi-Dev-72B-UD-Q8_K_XL-00001-of-00002__rocm7_beta.log + → cmd: toolbox run -c llama-rocm-7beta -- /usr/local/bin/llama-bench -ngl 99 -mmp 0 -m /home/kyuz0/models/kimi-dev-72B-Q8_K_XL/UD-Q8_K_XL/Kimi-Dev-72B-UD-Q8_K_XL-00001-of-00002.gguf + + * [rocm7_beta] Kimi-Dev-72B-UD-Q8_K_XL-00001-of-00002 : FAILED + +▶ [vulkan_radv] Kimi-Dev-72B-UD-Q8_K_XL-00001-of-00002 + → log: results/Kimi-Dev-72B-UD-Q8_K_XL-00001-of-00002__vulkan_radv.log + → cmd: toolbox run -c llama-vulkan-radv -- /usr/sbin/llama-bench -ngl 99 -mmp 0 -m /home/kyuz0/models/kimi-dev-72B-Q8_K_XL/UD-Q8_K_XL/Kimi-Dev-72B-UD-Q8_K_XL-00001-of-00002.gguf + + +▶ [vulkan_amdvlk] Kimi-Dev-72B-UD-Q8_K_XL-00001-of-00002 + → 
log: results/Kimi-Dev-72B-UD-Q8_K_XL-00001-of-00002__vulkan_amdvlk.log + → cmd: toolbox run -c llama-vulkan-amdvlk -- /usr/sbin/llama-bench -ngl 99 -mmp 0 -m /home/kyuz0/models/kimi-dev-72B-Q8_K_XL/UD-Q8_K_XL/Kimi-Dev-72B-UD-Q8_K_XL-00001-of-00002.gguf + + * [vulkan_amdvlk] Kimi-Dev-72B-UD-Q8_K_XL-00001-of-00002 : FAILED + +▶ [rocm6_4_2] Kimi-Dev-72B-UD-Q8_K_XL-00001-of-00002 + → log: results/Kimi-Dev-72B-UD-Q8_K_XL-00001-of-00002__rocm6_4_2.log + → cmd: toolbox run -c llama-rocm-6.4.2 -- /usr/local/bin/llama-bench -ngl 99 -mmp 0 -m /home/kyuz0/models/kimi-dev-72B-Q8_K_XL/UD-Q8_K_XL/Kimi-Dev-72B-UD-Q8_K_XL-00001-of-00002.gguf + + * [rocm6_4_2] Kimi-Dev-72B-UD-Q8_K_XL-00001-of-00002 : FAILED + +▶ [host] Kimi-Dev-72B-UD-Q8_K_XL-00001-of-00002 + → log: results/Kimi-Dev-72B-UD-Q8_K_XL-00001-of-00002__host.log + → cmd: llama-bench -ngl 99 -mmp 0 -m /home/kyuz0/models/kimi-dev-72B-Q8_K_XL/UD-Q8_K_XL/Kimi-Dev-72B-UD-Q8_K_XL-00001-of-00002.gguf + + * [host] Kimi-Dev-72B-UD-Q8_K_XL-00001-of-00002 : FAILED + +▶ [rocm7_rc] Llama-3.3-70B-Instruct-UD-Q8_K_XL-00001-of-00002 + → log: results/Llama-3.3-70B-Instruct-UD-Q8_K_XL-00001-of-00002__rocm7_rc.log + → cmd: toolbox run -c llama-rocm-7rc -- /usr/local/bin/llama-bench -ngl 99 -mmp 0 -m /home/kyuz0/models/llama-3.3-70B-Instruct/UD-Q8_K_XL/Llama-3.3-70B-Instruct-UD-Q8_K_XL-00001-of-00002.gguf + + * [rocm7_rc] Llama-3.3-70B-Instruct-UD-Q8_K_XL-00001-of-00002 : FAILED + +▶ [rocm7_beta] Llama-3.3-70B-Instruct-UD-Q8_K_XL-00001-of-00002 + → log: results/Llama-3.3-70B-Instruct-UD-Q8_K_XL-00001-of-00002__rocm7_beta.log + → cmd: toolbox run -c llama-rocm-7beta -- /usr/local/bin/llama-bench -ngl 99 -mmp 0 -m /home/kyuz0/models/llama-3.3-70B-Instruct/UD-Q8_K_XL/Llama-3.3-70B-Instruct-UD-Q8_K_XL-00001-of-00002.gguf + + * [rocm7_beta] Llama-3.3-70B-Instruct-UD-Q8_K_XL-00001-of-00002 : FAILED + +▶ [vulkan_radv] Llama-3.3-70B-Instruct-UD-Q8_K_XL-00001-of-00002 + → log: results/Llama-3.3-70B-Instruct-UD-Q8_K_XL-00001-of-00002__vulkan_radv.log 
+ → cmd: toolbox run -c llama-vulkan-radv -- /usr/sbin/llama-bench -ngl 99 -mmp 0 -m /home/kyuz0/models/llama-3.3-70B-Instruct/UD-Q8_K_XL/Llama-3.3-70B-Instruct-UD-Q8_K_XL-00001-of-00002.gguf + + +▶ [vulkan_amdvlk] Llama-3.3-70B-Instruct-UD-Q8_K_XL-00001-of-00002 + → log: results/Llama-3.3-70B-Instruct-UD-Q8_K_XL-00001-of-00002__vulkan_amdvlk.log + → cmd: toolbox run -c llama-vulkan-amdvlk -- /usr/sbin/llama-bench -ngl 99 -mmp 0 -m /home/kyuz0/models/llama-3.3-70B-Instruct/UD-Q8_K_XL/Llama-3.3-70B-Instruct-UD-Q8_K_XL-00001-of-00002.gguf + + +▶ [rocm6_4_2] Llama-3.3-70B-Instruct-UD-Q8_K_XL-00001-of-00002 + → log: results/Llama-3.3-70B-Instruct-UD-Q8_K_XL-00001-of-00002__rocm6_4_2.log + → cmd: toolbox run -c llama-rocm-6.4.2 -- /usr/local/bin/llama-bench -ngl 99 -mmp 0 -m /home/kyuz0/models/llama-3.3-70B-Instruct/UD-Q8_K_XL/Llama-3.3-70B-Instruct-UD-Q8_K_XL-00001-of-00002.gguf + + +▶ [host] Llama-3.3-70B-Instruct-UD-Q8_K_XL-00001-of-00002 + → log: results/Llama-3.3-70B-Instruct-UD-Q8_K_XL-00001-of-00002__host.log + → cmd: llama-bench -ngl 99 -mmp 0 -m /home/kyuz0/models/llama-3.3-70B-Instruct/UD-Q8_K_XL/Llama-3.3-70B-Instruct-UD-Q8_K_XL-00001-of-00002.gguf + + +▶ [rocm7_rc] llama3.3-70.6B-Q4_K_M + → log: results/llama3.3-70.6B-Q4_K_M__rocm7_rc.log + → cmd: toolbox run -c llama-rocm-7rc -- /usr/local/bin/llama-bench -ngl 99 -mmp 0 -m /home/kyuz0/models/llama-3.3-Q4_K_M/llama3.3-70.6B-Q4_K_M.gguf + + +▶ [rocm7_beta] llama3.3-70.6B-Q4_K_M + → log: results/llama3.3-70.6B-Q4_K_M__rocm7_beta.log + → cmd: toolbox run -c llama-rocm-7beta -- /usr/local/bin/llama-bench -ngl 99 -mmp 0 -m /home/kyuz0/models/llama-3.3-Q4_K_M/llama3.3-70.6B-Q4_K_M.gguf + + +▶ [vulkan_radv] llama3.3-70.6B-Q4_K_M + → log: results/llama3.3-70.6B-Q4_K_M__vulkan_radv.log + → cmd: toolbox run -c llama-vulkan-radv -- /usr/sbin/llama-bench -ngl 99 -mmp 0 -m /home/kyuz0/models/llama-3.3-Q4_K_M/llama3.3-70.6B-Q4_K_M.gguf + + +▶ [vulkan_amdvlk] llama3.3-70.6B-Q4_K_M + → log: 
results/llama3.3-70.6B-Q4_K_M__vulkan_amdvlk.log + → cmd: toolbox run -c llama-vulkan-amdvlk -- /usr/sbin/llama-bench -ngl 99 -mmp 0 -m /home/kyuz0/models/llama-3.3-Q4_K_M/llama3.3-70.6B-Q4_K_M.gguf + + +▶ [rocm6_4_2] llama3.3-70.6B-Q4_K_M + → log: results/llama3.3-70.6B-Q4_K_M__rocm6_4_2.log + → cmd: toolbox run -c llama-rocm-6.4.2 -- /usr/local/bin/llama-bench -ngl 99 -mmp 0 -m /home/kyuz0/models/llama-3.3-Q4_K_M/llama3.3-70.6B-Q4_K_M.gguf + + +▶ [host] llama3.3-70.6B-Q4_K_M + → log: results/llama3.3-70.6B-Q4_K_M__host.log + → cmd: llama-bench -ngl 99 -mmp 0 -m /home/kyuz0/models/llama-3.3-Q4_K_M/llama3.3-70.6B-Q4_K_M.gguf + + +▶ [rocm7_rc] Llama-4-Scout-17B-16E-Instruct-UD-Q4_K_XL-00001-of-00002 + → log: results/Llama-4-Scout-17B-16E-Instruct-UD-Q4_K_XL-00001-of-00002__rocm7_rc.log + → cmd: toolbox run -c llama-rocm-7rc -- /usr/local/bin/llama-bench -ngl 99 -mmp 0 -m /home/kyuz0/models/llama-4-scout-17b-16e/Q4_K_XL/Llama-4-Scout-17B-16E-Instruct-UD-Q4_K_XL-00001-of-00002.gguf + + * [rocm7_rc] Llama-4-Scout-17B-16E-Instruct-UD-Q4_K_XL-00001-of-00002 : FAILED + +▶ [rocm7_beta] Llama-4-Scout-17B-16E-Instruct-UD-Q4_K_XL-00001-of-00002 + → log: results/Llama-4-Scout-17B-16E-Instruct-UD-Q4_K_XL-00001-of-00002__rocm7_beta.log + → cmd: toolbox run -c llama-rocm-7beta -- /usr/local/bin/llama-bench -ngl 99 -mmp 0 -m /home/kyuz0/models/llama-4-scout-17b-16e/Q4_K_XL/Llama-4-Scout-17B-16E-Instruct-UD-Q4_K_XL-00001-of-00002.gguf + + +▶ [vulkan_radv] Llama-4-Scout-17B-16E-Instruct-UD-Q4_K_XL-00001-of-00002 + → log: results/Llama-4-Scout-17B-16E-Instruct-UD-Q4_K_XL-00001-of-00002__vulkan_radv.log + → cmd: toolbox run -c llama-vulkan-radv -- /usr/sbin/llama-bench -ngl 99 -mmp 0 -m /home/kyuz0/models/llama-4-scout-17b-16e/Q4_K_XL/Llama-4-Scout-17B-16E-Instruct-UD-Q4_K_XL-00001-of-00002.gguf + + +▶ [vulkan_amdvlk] Llama-4-Scout-17B-16E-Instruct-UD-Q4_K_XL-00001-of-00002 + → log: results/Llama-4-Scout-17B-16E-Instruct-UD-Q4_K_XL-00001-of-00002__vulkan_amdvlk.log + → cmd: toolbox 
run -c llama-vulkan-amdvlk -- /usr/sbin/llama-bench -ngl 99 -mmp 0 -m /home/kyuz0/models/llama-4-scout-17b-16e/Q4_K_XL/Llama-4-Scout-17B-16E-Instruct-UD-Q4_K_XL-00001-of-00002.gguf + + +▶ [rocm6_4_2] Llama-4-Scout-17B-16E-Instruct-UD-Q4_K_XL-00001-of-00002 + → log: results/Llama-4-Scout-17B-16E-Instruct-UD-Q4_K_XL-00001-of-00002__rocm6_4_2.log + → cmd: toolbox run -c llama-rocm-6.4.2 -- /usr/local/bin/llama-bench -ngl 99 -mmp 0 -m /home/kyuz0/models/llama-4-scout-17b-16e/Q4_K_XL/Llama-4-Scout-17B-16E-Instruct-UD-Q4_K_XL-00001-of-00002.gguf + + +▶ [host] Llama-4-Scout-17B-16E-Instruct-UD-Q4_K_XL-00001-of-00002 + → log: results/Llama-4-Scout-17B-16E-Instruct-UD-Q4_K_XL-00001-of-00002__host.log + → cmd: llama-bench -ngl 99 -mmp 0 -m /home/kyuz0/models/llama-4-scout-17b-16e/Q4_K_XL/Llama-4-Scout-17B-16E-Instruct-UD-Q4_K_XL-00001-of-00002.gguf + + +▶ [rocm7_rc] Llama-4-Scout-17B-16E-Instruct-Q6_K-00001-of-00002 + → log: results/Llama-4-Scout-17B-16E-Instruct-Q6_K-00001-of-00002__rocm7_rc.log + → cmd: toolbox run -c llama-rocm-7rc -- /usr/local/bin/llama-bench -ngl 99 -mmp 0 -m /home/kyuz0/models/llama-4-scout-17b-16e/Q6_K/Llama-4-Scout-17B-16E-Instruct-Q6_K-00001-of-00002.gguf + + +▶ [rocm7_beta] Llama-4-Scout-17B-16E-Instruct-Q6_K-00001-of-00002 + → log: results/Llama-4-Scout-17B-16E-Instruct-Q6_K-00001-of-00002__rocm7_beta.log + → cmd: toolbox run -c llama-rocm-7beta -- /usr/local/bin/llama-bench -ngl 99 -mmp 0 -m /home/kyuz0/models/llama-4-scout-17b-16e/Q6_K/Llama-4-Scout-17B-16E-Instruct-Q6_K-00001-of-00002.gguf + + * [rocm7_beta] Llama-4-Scout-17B-16E-Instruct-Q6_K-00001-of-00002 : FAILED + +▶ [vulkan_radv] Llama-4-Scout-17B-16E-Instruct-Q6_K-00001-of-00002 + → log: results/Llama-4-Scout-17B-16E-Instruct-Q6_K-00001-of-00002__vulkan_radv.log + → cmd: toolbox run -c llama-vulkan-radv -- /usr/sbin/llama-bench -ngl 99 -mmp 0 -m /home/kyuz0/models/llama-4-scout-17b-16e/Q6_K/Llama-4-Scout-17B-16E-Instruct-Q6_K-00001-of-00002.gguf + + +▶ [vulkan_amdvlk] 
Llama-4-Scout-17B-16E-Instruct-Q6_K-00001-of-00002 + → log: results/Llama-4-Scout-17B-16E-Instruct-Q6_K-00001-of-00002__vulkan_amdvlk.log + → cmd: toolbox run -c llama-vulkan-amdvlk -- /usr/sbin/llama-bench -ngl 99 -mmp 0 -m /home/kyuz0/models/llama-4-scout-17b-16e/Q6_K/Llama-4-Scout-17B-16E-Instruct-Q6_K-00001-of-00002.gguf + + +▶ [rocm6_4_2] Llama-4-Scout-17B-16E-Instruct-Q6_K-00001-of-00002 + → log: results/Llama-4-Scout-17B-16E-Instruct-Q6_K-00001-of-00002__rocm6_4_2.log + → cmd: toolbox run -c llama-rocm-6.4.2 -- /usr/local/bin/llama-bench -ngl 99 -mmp 0 -m /home/kyuz0/models/llama-4-scout-17b-16e/Q6_K/Llama-4-Scout-17B-16E-Instruct-Q6_K-00001-of-00002.gguf + + +▶ [host] Llama-4-Scout-17B-16E-Instruct-Q6_K-00001-of-00002 + → log: results/Llama-4-Scout-17B-16E-Instruct-Q6_K-00001-of-00002__host.log + → cmd: llama-bench -ngl 99 -mmp 0 -m /home/kyuz0/models/llama-4-scout-17b-16e/Q6_K/Llama-4-Scout-17B-16E-Instruct-Q6_K-00001-of-00002.gguf + + +▶ [rocm7_rc] Llama-4-Scout-17B-16E-Instruct-Q8_0-00001-of-00003 + → log: results/Llama-4-Scout-17B-16E-Instruct-Q8_0-00001-of-00003__rocm7_rc.log + → cmd: toolbox run -c llama-rocm-7rc -- /usr/local/bin/llama-bench -ngl 99 -mmp 0 -m /home/kyuz0/models/llama-4-scout-17b-16e/Q8_0/Llama-4-Scout-17B-16E-Instruct-Q8_0-00001-of-00003.gguf + + * [rocm7_rc] Llama-4-Scout-17B-16E-Instruct-Q8_0-00001-of-00003 : FAILED + +▶ [rocm7_beta] Llama-4-Scout-17B-16E-Instruct-Q8_0-00001-of-00003 + → log: results/Llama-4-Scout-17B-16E-Instruct-Q8_0-00001-of-00003__rocm7_beta.log + → cmd: toolbox run -c llama-rocm-7beta -- /usr/local/bin/llama-bench -ngl 99 -mmp 0 -m /home/kyuz0/models/llama-4-scout-17b-16e/Q8_0/Llama-4-Scout-17B-16E-Instruct-Q8_0-00001-of-00003.gguf + + * [rocm7_beta] Llama-4-Scout-17B-16E-Instruct-Q8_0-00001-of-00003 : FAILED + +▶ [vulkan_radv] Llama-4-Scout-17B-16E-Instruct-Q8_0-00001-of-00003 + → log: results/Llama-4-Scout-17B-16E-Instruct-Q8_0-00001-of-00003__vulkan_radv.log + → cmd: toolbox run -c llama-vulkan-radv -- 
/usr/sbin/llama-bench -ngl 99 -mmp 0 -m /home/kyuz0/models/llama-4-scout-17b-16e/Q8_0/Llama-4-Scout-17B-16E-Instruct-Q8_0-00001-of-00003.gguf + + +▶ [vulkan_amdvlk] Llama-4-Scout-17B-16E-Instruct-Q8_0-00001-of-00003 + → log: results/Llama-4-Scout-17B-16E-Instruct-Q8_0-00001-of-00003__vulkan_amdvlk.log + → cmd: toolbox run -c llama-vulkan-amdvlk -- /usr/sbin/llama-bench -ngl 99 -mmp 0 -m /home/kyuz0/models/llama-4-scout-17b-16e/Q8_0/Llama-4-Scout-17B-16E-Instruct-Q8_0-00001-of-00003.gguf + + +▶ [rocm6_4_2] Llama-4-Scout-17B-16E-Instruct-Q8_0-00001-of-00003 + → log: results/Llama-4-Scout-17B-16E-Instruct-Q8_0-00001-of-00003__rocm6_4_2.log + → cmd: toolbox run -c llama-rocm-6.4.2 -- /usr/local/bin/llama-bench -ngl 99 -mmp 0 -m /home/kyuz0/models/llama-4-scout-17b-16e/Q8_0/Llama-4-Scout-17B-16E-Instruct-Q8_0-00001-of-00003.gguf + + * [rocm6_4_2] Llama-4-Scout-17B-16E-Instruct-Q8_0-00001-of-00003 : FAILED + +▶ [host] Llama-4-Scout-17B-16E-Instruct-Q8_0-00001-of-00003 + → log: results/Llama-4-Scout-17B-16E-Instruct-Q8_0-00001-of-00003__host.log + → cmd: llama-bench -ngl 99 -mmp 0 -m /home/kyuz0/models/llama-4-scout-17b-16e/Q8_0/Llama-4-Scout-17B-16E-Instruct-Q8_0-00001-of-00003.gguf + + +▶ [rocm7_rc] Qwen3-235B-A22B-Instruct-2507-UD-Q3_K_XL-00001-of-00003 + → log: results/Qwen3-235B-A22B-Instruct-2507-UD-Q3_K_XL-00001-of-00003__rocm7_rc.log + → cmd: toolbox run -c llama-rocm-7rc -- /usr/local/bin/llama-bench -ngl 99 -mmp 0 -m /home/kyuz0/models/qwen-3-235B-Q3_K-XL/UD-Q3_K_XL/Qwen3-235B-A22B-Instruct-2507-UD-Q3_K_XL-00001-of-00003.gguf + + +▶ [rocm7_beta] Qwen3-235B-A22B-Instruct-2507-UD-Q3_K_XL-00001-of-00003 + → log: results/Qwen3-235B-A22B-Instruct-2507-UD-Q3_K_XL-00001-of-00003__rocm7_beta.log + → cmd: toolbox run -c llama-rocm-7beta -- /usr/local/bin/llama-bench -ngl 99 -mmp 0 -m /home/kyuz0/models/qwen-3-235B-Q3_K-XL/UD-Q3_K_XL/Qwen3-235B-A22B-Instruct-2507-UD-Q3_K_XL-00001-of-00003.gguf + + * [rocm7_beta] Qwen3-235B-A22B-Instruct-2507-UD-Q3_K_XL-00001-of-00003 : 
FAILED + +▶ [vulkan_radv] Qwen3-235B-A22B-Instruct-2507-UD-Q3_K_XL-00001-of-00003 + → log: results/Qwen3-235B-A22B-Instruct-2507-UD-Q3_K_XL-00001-of-00003__vulkan_radv.log + → cmd: toolbox run -c llama-vulkan-radv -- /usr/sbin/llama-bench -ngl 99 -mmp 0 -m /home/kyuz0/models/qwen-3-235B-Q3_K-XL/UD-Q3_K_XL/Qwen3-235B-A22B-Instruct-2507-UD-Q3_K_XL-00001-of-00003.gguf + + +▶ [vulkan_amdvlk] Qwen3-235B-A22B-Instruct-2507-UD-Q3_K_XL-00001-of-00003 + → log: results/Qwen3-235B-A22B-Instruct-2507-UD-Q3_K_XL-00001-of-00003__vulkan_amdvlk.log + → cmd: toolbox run -c llama-vulkan-amdvlk -- /usr/sbin/llama-bench -ngl 99 -mmp 0 -m /home/kyuz0/models/qwen-3-235B-Q3_K-XL/UD-Q3_K_XL/Qwen3-235B-A22B-Instruct-2507-UD-Q3_K_XL-00001-of-00003.gguf + + +▶ [rocm6_4_2] Qwen3-235B-A22B-Instruct-2507-UD-Q3_K_XL-00001-of-00003 + → log: results/Qwen3-235B-A22B-Instruct-2507-UD-Q3_K_XL-00001-of-00003__rocm6_4_2.log + → cmd: toolbox run -c llama-rocm-6.4.2 -- /usr/local/bin/llama-bench -ngl 99 -mmp 0 -m /home/kyuz0/models/qwen-3-235B-Q3_K-XL/UD-Q3_K_XL/Qwen3-235B-A22B-Instruct-2507-UD-Q3_K_XL-00001-of-00003.gguf + + +▶ [host] Qwen3-235B-A22B-Instruct-2507-UD-Q3_K_XL-00001-of-00003 + → log: results/Qwen3-235B-A22B-Instruct-2507-UD-Q3_K_XL-00001-of-00003__host.log + → cmd: llama-bench -ngl 99 -mmp 0 -m /home/kyuz0/models/qwen-3-235B-Q3_K-XL/UD-Q3_K_XL/Qwen3-235B-A22B-Instruct-2507-UD-Q3_K_XL-00001-of-00003.gguf + + +▶ [rocm7_rc] Qwen3-30B-A3B-BF16-00001-of-00002 + → log: results/Qwen3-30B-A3B-BF16-00001-of-00002__rocm7_rc.log + → cmd: toolbox run -c llama-rocm-7rc -- /usr/local/bin/llama-bench -ngl 99 -mmp 0 -m /home/kyuz0/models/qwen-3-30B-A3B/BF16/Qwen3-30B-A3B-BF16-00001-of-00002.gguf + + +▶ [rocm7_beta] Qwen3-30B-A3B-BF16-00001-of-00002 + → log: results/Qwen3-30B-A3B-BF16-00001-of-00002__rocm7_beta.log + → cmd: toolbox run -c llama-rocm-7beta -- /usr/local/bin/llama-bench -ngl 99 -mmp 0 -m /home/kyuz0/models/qwen-3-30B-A3B/BF16/Qwen3-30B-A3B-BF16-00001-of-00002.gguf + + +▶ [vulkan_radv] 
Qwen3-30B-A3B-BF16-00001-of-00002 + → log: results/Qwen3-30B-A3B-BF16-00001-of-00002__vulkan_radv.log + → cmd: toolbox run -c llama-vulkan-radv -- /usr/sbin/llama-bench -ngl 99 -mmp 0 -m /home/kyuz0/models/qwen-3-30B-A3B/BF16/Qwen3-30B-A3B-BF16-00001-of-00002.gguf + + +▶ [vulkan_amdvlk] Qwen3-30B-A3B-BF16-00001-of-00002 + → log: results/Qwen3-30B-A3B-BF16-00001-of-00002__vulkan_amdvlk.log + → cmd: toolbox run -c llama-vulkan-amdvlk -- /usr/sbin/llama-bench -ngl 99 -mmp 0 -m /home/kyuz0/models/qwen-3-30B-A3B/BF16/Qwen3-30B-A3B-BF16-00001-of-00002.gguf + + +▶ [rocm6_4_2] Qwen3-30B-A3B-BF16-00001-of-00002 + → log: results/Qwen3-30B-A3B-BF16-00001-of-00002__rocm6_4_2.log + → cmd: toolbox run -c llama-rocm-6.4.2 -- /usr/local/bin/llama-bench -ngl 99 -mmp 0 -m /home/kyuz0/models/qwen-3-30B-A3B/BF16/Qwen3-30B-A3B-BF16-00001-of-00002.gguf + + +▶ [host] Qwen3-30B-A3B-BF16-00001-of-00002 + → log: results/Qwen3-30B-A3B-BF16-00001-of-00002__host.log + → cmd: llama-bench -ngl 99 -mmp 0 -m /home/kyuz0/models/qwen-3-30B-A3B/BF16/Qwen3-30B-A3B-BF16-00001-of-00002.gguf + + +▶ [rocm7_rc] Qwen3-Coder-30B-A3B-Instruct-BF16-00001-of-00002 + → log: results/Qwen3-Coder-30B-A3B-Instruct-BF16-00001-of-00002__rocm7_rc.log + → cmd: toolbox run -c llama-rocm-7rc -- /usr/local/bin/llama-bench -ngl 99 -mmp 0 -m /home/kyuz0/models/qwen3-coder-30B-A3B/BF16/Qwen3-Coder-30B-A3B-Instruct-BF16-00001-of-00002.gguf + + +▶ [rocm7_beta] Qwen3-Coder-30B-A3B-Instruct-BF16-00001-of-00002 + → log: results/Qwen3-Coder-30B-A3B-Instruct-BF16-00001-of-00002__rocm7_beta.log + → cmd: toolbox run -c llama-rocm-7beta -- /usr/local/bin/llama-bench -ngl 99 -mmp 0 -m /home/kyuz0/models/qwen3-coder-30B-A3B/BF16/Qwen3-Coder-30B-A3B-Instruct-BF16-00001-of-00002.gguf + + +▶ [vulkan_radv] Qwen3-Coder-30B-A3B-Instruct-BF16-00001-of-00002 + → log: results/Qwen3-Coder-30B-A3B-Instruct-BF16-00001-of-00002__vulkan_radv.log + → cmd: toolbox run -c llama-vulkan-radv -- /usr/sbin/llama-bench -ngl 99 -mmp 0 -m 
/home/kyuz0/models/qwen3-coder-30B-A3B/BF16/Qwen3-Coder-30B-A3B-Instruct-BF16-00001-of-00002.gguf + + +▶ [vulkan_amdvlk] Qwen3-Coder-30B-A3B-Instruct-BF16-00001-of-00002 + → log: results/Qwen3-Coder-30B-A3B-Instruct-BF16-00001-of-00002__vulkan_amdvlk.log + → cmd: toolbox run -c llama-vulkan-amdvlk -- /usr/sbin/llama-bench -ngl 99 -mmp 0 -m /home/kyuz0/models/qwen3-coder-30B-A3B/BF16/Qwen3-Coder-30B-A3B-Instruct-BF16-00001-of-00002.gguf + + +▶ [rocm6_4_2] Qwen3-Coder-30B-A3B-Instruct-BF16-00001-of-00002 + → log: results/Qwen3-Coder-30B-A3B-Instruct-BF16-00001-of-00002__rocm6_4_2.log + → cmd: toolbox run -c llama-rocm-6.4.2 -- /usr/local/bin/llama-bench -ngl 99 -mmp 0 -m /home/kyuz0/models/qwen3-coder-30B-A3B/BF16/Qwen3-Coder-30B-A3B-Instruct-BF16-00001-of-00002.gguf + + +▶ [host] Qwen3-Coder-30B-A3B-Instruct-BF16-00001-of-00002 + → log: results/Qwen3-Coder-30B-A3B-Instruct-BF16-00001-of-00002__host.log + → cmd: llama-bench -ngl 99 -mmp 0 -m /home/kyuz0/models/qwen3-coder-30B-A3B/BF16/Qwen3-Coder-30B-A3B-Instruct-BF16-00001-of-00002.gguf + diff --git a/benchmark/run_benchmarks.sh b/benchmark/run_benchmarks.sh new file mode 100755 index 0000000..69571c0 --- /dev/null +++ b/benchmark/run_benchmarks.sh @@ -0,0 +1,57 @@ +#!/usr/bin/env bash +set -uo pipefail + +MODEL_DIR="$(realpath models)" +RESULTDIR="results" +mkdir -p "$RESULTDIR" + +# Pick exactly one .gguf per model: either +# - any .gguf without "-000*-of-" (single-file models) +# - or the first shard "*-00001-of-*.gguf" +mapfile -t MODEL_PATHS < <( + find "$MODEL_DIR" -type f -name '*.gguf' \ + \( -name '*-00001-of-*.gguf' -o -not -name '*-000*-of-*.gguf' \) \ + | sort +) + +if (( ${#MODEL_PATHS[@]} == 0 )); then + echo "❌ No models found under $MODEL_DIR – check your paths/patterns!" 
+ exit 1 +fi + +echo "Found ${#MODEL_PATHS[@]} model(s) to bench:" +for p in "${MODEL_PATHS[@]}"; do + echo " • $p" +done +echo + +declare -A CMDS=( + [rocm6_4_2]="toolbox run -c llama-rocm-6.4.2 -- /usr/local/bin/llama-bench" + [rocm7_beta]="toolbox run -c llama-rocm-7beta -- /usr/local/bin/llama-bench" + [rocm7_rc]="toolbox run -c llama-rocm-7rc -- /usr/local/bin/llama-bench" + [vulkan_amdvlk]="toolbox run -c llama-vulkan-amdvlk -- /usr/sbin/llama-bench" + [vulkan_radv]="toolbox run -c llama-vulkan-radv -- /usr/sbin/llama-bench" +) + +for MODEL_PATH in "${MODEL_PATHS[@]}"; do + MODEL_NAME="$(basename "$MODEL_PATH" .gguf)" + + for ENV in "${!CMDS[@]}"; do + CMD="${CMDS[$ENV]}" + OUT="$RESULTDIR/${MODEL_NAME}__${ENV}.log" + + # build command array + FULL_CMD=( $CMD -ngl 99 -mmp 0 -m "$MODEL_PATH" ) + + printf "\n▶ [%s] %s\n" "$ENV" "$MODEL_NAME" + printf " → log: %s\n" "$OUT" + printf " → cmd: %s\n\n" "${FULL_CMD[*]}" # ← single‐line echo + + # execute + "${FULL_CMD[@]}" >"$OUT" 2>&1 || { + echo "✖ ! 
[${ENV}] ${MODEL_NAME} failed (exit $?)" >>"$OUT" + echo " * [${ENV}] ${MODEL_NAME} : FAILED" + } + done +done + diff --git a/benchmark/run_loadtime_benchmark.log b/benchmark/run_loadtime_benchmark.log new file mode 100644 index 0000000..c4de9de --- /dev/null +++ b/benchmark/run_loadtime_benchmark.log @@ -0,0 +1,277 @@ +Found 11 models to test with llama-cli (3 runs each) + +▶ [rocm7_rc] gemma-3-12b-it-UD-Q8_K_XL (runs: 3) + → log : loadtime_results/gemma-3-12b-it-UD-Q8_K_XL__rocm7_rc.log + → flags : -ngl 999 -fa --no-mmap -no-cnv -n 1 +✔ [rocm7_rc] gemma-3-12b-it-UD-Q8_K_XL avg=3.861s over 3 runs + +▶ [rocm7_beta] gemma-3-12b-it-UD-Q8_K_XL (runs: 3) + → log : loadtime_results/gemma-3-12b-it-UD-Q8_K_XL__rocm7_beta.log + → flags : -ngl 999 -fa --no-mmap -no-cnv -n 1 +✔ [rocm7_beta] gemma-3-12b-it-UD-Q8_K_XL avg=3.434s over 3 runs + +▶ [vulkan_radv] gemma-3-12b-it-UD-Q8_K_XL (runs: 3) + → log : loadtime_results/gemma-3-12b-it-UD-Q8_K_XL__vulkan_radv.log + → flags : -ngl 999 -fa --no-mmap -no-cnv -n 1 +✔ [vulkan_radv] gemma-3-12b-it-UD-Q8_K_XL avg=4.295s over 3 runs + +▶ [vulkan_amdvlk] gemma-3-12b-it-UD-Q8_K_XL (runs: 3) + → log : loadtime_results/gemma-3-12b-it-UD-Q8_K_XL__vulkan_amdvlk.log + → flags : -ngl 999 -fa --no-mmap -no-cnv -n 1 +✔ [vulkan_amdvlk] gemma-3-12b-it-UD-Q8_K_XL avg=3.955s over 3 runs + +▶ [rocm6_4_2] gemma-3-12b-it-UD-Q8_K_XL (runs: 3) + → log : loadtime_results/gemma-3-12b-it-UD-Q8_K_XL__rocm6_4_2.log + → flags : -ngl 999 -fa --no-mmap -no-cnv -n 1 +✔ [rocm6_4_2] gemma-3-12b-it-UD-Q8_K_XL avg=6.686s over 3 runs + +▶ [rocm7_rc] gemma-3-27b-it-BF16-00001-of-00002 (runs: 3) + → log : loadtime_results/gemma-3-27b-it-BF16-00001-of-00002__rocm7_rc.log + → flags : -ngl 999 -fa --no-mmap -no-cnv -n 1 +✔ [rocm7_rc] gemma-3-27b-it-BF16-00001-of-00002 avg=10.417s over 3 runs + +▶ [rocm7_beta] gemma-3-27b-it-BF16-00001-of-00002 (runs: 3) + → log : loadtime_results/gemma-3-27b-it-BF16-00001-of-00002__rocm7_beta.log + → flags : -ngl 999 -fa --no-mmap -no-cnv 
-n 1 +✔ [rocm7_beta] gemma-3-27b-it-BF16-00001-of-00002 avg=10.486s over 3 runs + +▶ [vulkan_radv] gemma-3-27b-it-BF16-00001-of-00002 (runs: 3) + → log : loadtime_results/gemma-3-27b-it-BF16-00001-of-00002__vulkan_radv.log + → flags : -ngl 999 -fa --no-mmap -no-cnv -n 1 +✔ [vulkan_radv] gemma-3-27b-it-BF16-00001-of-00002 avg=13.579s over 3 runs + +▶ [vulkan_amdvlk] gemma-3-27b-it-BF16-00001-of-00002 (runs: 3) + → log : loadtime_results/gemma-3-27b-it-BF16-00001-of-00002__vulkan_amdvlk.log + → flags : -ngl 999 -fa --no-mmap -no-cnv -n 1 +✖ [vulkan_amdvlk] gemma-3-27b-it-BF16-00001-of-00002 all runs failed + +▶ [rocm6_4_2] gemma-3-27b-it-BF16-00001-of-00002 (runs: 3) + → log : loadtime_results/gemma-3-27b-it-BF16-00001-of-00002__rocm6_4_2.log + → flags : -ngl 999 -fa --no-mmap -no-cnv -n 1 +✔ [rocm6_4_2] gemma-3-27b-it-BF16-00001-of-00002 avg=12.495s over 3 runs + +▶ [rocm7_rc] Kimi-Dev-72B-UD-Q8_K_XL-00001-of-00002 (runs: 3) + → log : loadtime_results/Kimi-Dev-72B-UD-Q8_K_XL-00001-of-00002__rocm7_rc.log + → flags : -ngl 999 -fa --no-mmap -no-cnv -n 1 +✔ [rocm7_rc] Kimi-Dev-72B-UD-Q8_K_XL-00001-of-00002 avg=26.362s over 3 runs + +▶ [rocm7_beta] Kimi-Dev-72B-UD-Q8_K_XL-00001-of-00002 (runs: 3) + → log : loadtime_results/Kimi-Dev-72B-UD-Q8_K_XL-00001-of-00002__rocm7_beta.log + → flags : -ngl 999 -fa --no-mmap -no-cnv -n 1 +✔ [rocm7_beta] Kimi-Dev-72B-UD-Q8_K_XL-00001-of-00002 avg=30.024s over 3 runs + +▶ [vulkan_radv] Kimi-Dev-72B-UD-Q8_K_XL-00001-of-00002 (runs: 3) + → log : loadtime_results/Kimi-Dev-72B-UD-Q8_K_XL-00001-of-00002__vulkan_radv.log + → flags : -ngl 999 -fa --no-mmap -no-cnv -n 1 +✔ [vulkan_radv] Kimi-Dev-72B-UD-Q8_K_XL-00001-of-00002 avg=30.591s over 3 runs + +▶ [vulkan_amdvlk] Kimi-Dev-72B-UD-Q8_K_XL-00001-of-00002 (runs: 3) + → log : loadtime_results/Kimi-Dev-72B-UD-Q8_K_XL-00001-of-00002__vulkan_amdvlk.log + → flags : -ngl 999 -fa --no-mmap -no-cnv -n 1 +✖ [vulkan_amdvlk] Kimi-Dev-72B-UD-Q8_K_XL-00001-of-00002 all runs failed + +▶ [rocm6_4_2] 
Kimi-Dev-72B-UD-Q8_K_XL-00001-of-00002 (runs: 3) + → log : loadtime_results/Kimi-Dev-72B-UD-Q8_K_XL-00001-of-00002__rocm6_4_2.log + → flags : -ngl 999 -fa --no-mmap -no-cnv -n 1 +✔ [rocm6_4_2] Kimi-Dev-72B-UD-Q8_K_XL-00001-of-00002 avg=35.301s over 3 runs + +▶ [rocm7_rc] Llama-3.3-70B-Instruct-UD-Q8_K_XL-00001-of-00002 (runs: 3) + → log : loadtime_results/Llama-3.3-70B-Instruct-UD-Q8_K_XL-00001-of-00002__rocm7_rc.log + → flags : -ngl 999 -fa --no-mmap -no-cnv -n 1 +✔ [rocm7_rc] Llama-3.3-70B-Instruct-UD-Q8_K_XL-00001-of-00002 avg=32.911s over 3 runs + +▶ [rocm7_beta] Llama-3.3-70B-Instruct-UD-Q8_K_XL-00001-of-00002 (runs: 3) + → log : loadtime_results/Llama-3.3-70B-Instruct-UD-Q8_K_XL-00001-of-00002__rocm7_beta.log + → flags : -ngl 999 -fa --no-mmap -no-cnv -n 1 +✔ [rocm7_beta] Llama-3.3-70B-Instruct-UD-Q8_K_XL-00001-of-00002 avg=32.796s over 3 runs + +▶ [vulkan_radv] Llama-3.3-70B-Instruct-UD-Q8_K_XL-00001-of-00002 (runs: 3) + → log : loadtime_results/Llama-3.3-70B-Instruct-UD-Q8_K_XL-00001-of-00002__vulkan_radv.log + → flags : -ngl 999 -fa --no-mmap -no-cnv -n 1 +✔ [vulkan_radv] Llama-3.3-70B-Instruct-UD-Q8_K_XL-00001-of-00002 avg=30.376s over 3 runs + +▶ [vulkan_amdvlk] Llama-3.3-70B-Instruct-UD-Q8_K_XL-00001-of-00002 (runs: 3) + → log : loadtime_results/Llama-3.3-70B-Instruct-UD-Q8_K_XL-00001-of-00002__vulkan_amdvlk.log + → flags : -ngl 999 -fa --no-mmap -no-cnv -n 1 +✔ [vulkan_amdvlk] Llama-3.3-70B-Instruct-UD-Q8_K_XL-00001-of-00002 avg=30.604s over 3 runs + +▶ [rocm6_4_2] Llama-3.3-70B-Instruct-UD-Q8_K_XL-00001-of-00002 (runs: 3) + → log : loadtime_results/Llama-3.3-70B-Instruct-UD-Q8_K_XL-00001-of-00002__rocm6_4_2.log + → flags : -ngl 999 -fa --no-mmap -no-cnv -n 1 +✔ [rocm6_4_2] Llama-3.3-70B-Instruct-UD-Q8_K_XL-00001-of-00002 avg=30.998s over 3 runs + +▶ [rocm7_rc] llama3.3-70.6B-Q4_K_M (runs: 3) + → log : loadtime_results/llama3.3-70.6B-Q4_K_M__rocm7_rc.log + → flags : -ngl 999 -fa --no-mmap -no-cnv -n 1 +✔ [rocm7_rc] llama3.3-70.6B-Q4_K_M avg=14.602s 
over 3 runs + +▶ [rocm7_beta] llama3.3-70.6B-Q4_K_M (runs: 3) + → log : loadtime_results/llama3.3-70.6B-Q4_K_M__rocm7_beta.log + → flags : -ngl 999 -fa --no-mmap -no-cnv -n 1 +✔ [rocm7_beta] llama3.3-70.6B-Q4_K_M avg=9.338s over 3 runs + +▶ [vulkan_radv] llama3.3-70.6B-Q4_K_M (runs: 3) + → log : loadtime_results/llama3.3-70.6B-Q4_K_M__vulkan_radv.log + → flags : -ngl 999 -fa --no-mmap -no-cnv -n 1 +✔ [vulkan_radv] llama3.3-70.6B-Q4_K_M avg=8.816s over 3 runs + +▶ [vulkan_amdvlk] llama3.3-70.6B-Q4_K_M (runs: 3) + → log : loadtime_results/llama3.3-70.6B-Q4_K_M__vulkan_amdvlk.log + → flags : -ngl 999 -fa --no-mmap -no-cnv -n 1 +✔ [vulkan_amdvlk] llama3.3-70.6B-Q4_K_M avg=9.176s over 3 runs + +▶ [rocm6_4_2] llama3.3-70.6B-Q4_K_M (runs: 3) + → log : loadtime_results/llama3.3-70.6B-Q4_K_M__rocm6_4_2.log + → flags : -ngl 999 -fa --no-mmap -no-cnv -n 1 +✔ [rocm6_4_2] llama3.3-70.6B-Q4_K_M avg=9.887s over 3 runs + +▶ [rocm7_rc] Llama-4-Scout-17B-16E-Instruct-UD-Q4_K_XL-00001-of-00002 (runs: 3) + → log : loadtime_results/Llama-4-Scout-17B-16E-Instruct-UD-Q4_K_XL-00001-of-00002__rocm7_rc.log + → flags : -ngl 999 -fa --no-mmap -no-cnv -n 1 +✔ [rocm7_rc] Llama-4-Scout-17B-16E-Instruct-UD-Q4_K_XL-00001-of-00002 avg=19.365s over 2 runs + +▶ [rocm7_beta] Llama-4-Scout-17B-16E-Instruct-UD-Q4_K_XL-00001-of-00002 (runs: 3) + → log : loadtime_results/Llama-4-Scout-17B-16E-Instruct-UD-Q4_K_XL-00001-of-00002__rocm7_beta.log + → flags : -ngl 999 -fa --no-mmap -no-cnv -n 1 +✖ [rocm7_beta] Llama-4-Scout-17B-16E-Instruct-UD-Q4_K_XL-00001-of-00002 all runs failed + +▶ [vulkan_radv] Llama-4-Scout-17B-16E-Instruct-UD-Q4_K_XL-00001-of-00002 (runs: 3) + → log : loadtime_results/Llama-4-Scout-17B-16E-Instruct-UD-Q4_K_XL-00001-of-00002__vulkan_radv.log + → flags : -ngl 999 -fa --no-mmap -no-cnv -n 1 +✔ [vulkan_radv] Llama-4-Scout-17B-16E-Instruct-UD-Q4_K_XL-00001-of-00002 avg=20.045s over 3 runs + +▶ [vulkan_amdvlk] Llama-4-Scout-17B-16E-Instruct-UD-Q4_K_XL-00001-of-00002 (runs: 3) + → log : 
loadtime_results/Llama-4-Scout-17B-16E-Instruct-UD-Q4_K_XL-00001-of-00002__vulkan_amdvlk.log + → flags : -ngl 999 -fa --no-mmap -no-cnv -n 1 +✔ [vulkan_amdvlk] Llama-4-Scout-17B-16E-Instruct-UD-Q4_K_XL-00001-of-00002 avg=16.752s over 3 runs + +▶ [rocm6_4_2] Llama-4-Scout-17B-16E-Instruct-UD-Q4_K_XL-00001-of-00002 (runs: 3) + → log : loadtime_results/Llama-4-Scout-17B-16E-Instruct-UD-Q4_K_XL-00001-of-00002__rocm6_4_2.log + → flags : -ngl 999 -fa --no-mmap -no-cnv -n 1 +✔ [rocm6_4_2] Llama-4-Scout-17B-16E-Instruct-UD-Q4_K_XL-00001-of-00002 avg=15.776s over 3 runs + +▶ [rocm7_rc] Llama-4-Scout-17B-16E-Instruct-Q6_K-00001-of-00002 (runs: 3) + → log : loadtime_results/Llama-4-Scout-17B-16E-Instruct-Q6_K-00001-of-00002__rocm7_rc.log + → flags : -ngl 999 -fa --no-mmap -no-cnv -n 1 +✔ [rocm7_rc] Llama-4-Scout-17B-16E-Instruct-Q6_K-00001-of-00002 avg=28.435s over 3 runs + +▶ [rocm7_beta] Llama-4-Scout-17B-16E-Instruct-Q6_K-00001-of-00002 (runs: 3) + → log : loadtime_results/Llama-4-Scout-17B-16E-Instruct-Q6_K-00001-of-00002__rocm7_beta.log + → flags : -ngl 999 -fa --no-mmap -no-cnv -n 1 +✔ [rocm7_beta] Llama-4-Scout-17B-16E-Instruct-Q6_K-00001-of-00002 avg=28.221s over 3 runs + +▶ [vulkan_radv] Llama-4-Scout-17B-16E-Instruct-Q6_K-00001-of-00002 (runs: 3) + → log : loadtime_results/Llama-4-Scout-17B-16E-Instruct-Q6_K-00001-of-00002__vulkan_radv.log + → flags : -ngl 999 -fa --no-mmap -no-cnv -n 1 +✔ [vulkan_radv] Llama-4-Scout-17B-16E-Instruct-Q6_K-00001-of-00002 avg=32.810s over 3 runs + +▶ [vulkan_amdvlk] Llama-4-Scout-17B-16E-Instruct-Q6_K-00001-of-00002 (runs: 3) + → log : loadtime_results/Llama-4-Scout-17B-16E-Instruct-Q6_K-00001-of-00002__vulkan_amdvlk.log + → flags : -ngl 999 -fa --no-mmap -no-cnv -n 1 +✔ [vulkan_amdvlk] Llama-4-Scout-17B-16E-Instruct-Q6_K-00001-of-00002 avg=35.541s over 3 runs + +▶ [rocm6_4_2] Llama-4-Scout-17B-16E-Instruct-Q6_K-00001-of-00002 (runs: 3) + → log : loadtime_results/Llama-4-Scout-17B-16E-Instruct-Q6_K-00001-of-00002__rocm6_4_2.log + → 
flags : -ngl 999 -fa --no-mmap -no-cnv -n 1 +✔ [rocm6_4_2] Llama-4-Scout-17B-16E-Instruct-Q6_K-00001-of-00002 avg=31.792s over 3 runs + +▶ [rocm7_rc] Llama-4-Scout-17B-16E-Instruct-Q8_0-00001-of-00003 (runs: 3) + → log : loadtime_results/Llama-4-Scout-17B-16E-Instruct-Q8_0-00001-of-00003__rocm7_rc.log + → flags : -ngl 999 -fa --no-mmap -no-cnv -n 1 +✔ [rocm7_rc] Llama-4-Scout-17B-16E-Instruct-Q8_0-00001-of-00003 avg=35.742s over 3 runs + +▶ [rocm7_beta] Llama-4-Scout-17B-16E-Instruct-Q8_0-00001-of-00003 (runs: 3) + → log : loadtime_results/Llama-4-Scout-17B-16E-Instruct-Q8_0-00001-of-00003__rocm7_beta.log + → flags : -ngl 999 -fa --no-mmap -no-cnv -n 1 +✔ [rocm7_beta] Llama-4-Scout-17B-16E-Instruct-Q8_0-00001-of-00003 avg=36.400s over 3 runs + +▶ [vulkan_radv] Llama-4-Scout-17B-16E-Instruct-Q8_0-00001-of-00003 (runs: 3) + → log : loadtime_results/Llama-4-Scout-17B-16E-Instruct-Q8_0-00001-of-00003__vulkan_radv.log + → flags : -ngl 999 -fa --no-mmap -no-cnv -n 1 +✔ [vulkan_radv] Llama-4-Scout-17B-16E-Instruct-Q8_0-00001-of-00003 avg=41.626s over 3 runs + +▶ [vulkan_amdvlk] Llama-4-Scout-17B-16E-Instruct-Q8_0-00001-of-00003 (runs: 3) + → log : loadtime_results/Llama-4-Scout-17B-16E-Instruct-Q8_0-00001-of-00003__vulkan_amdvlk.log + → flags : -ngl 999 -fa --no-mmap -no-cnv -n 1 +✔ [vulkan_amdvlk] Llama-4-Scout-17B-16E-Instruct-Q8_0-00001-of-00003 avg=47.967s over 3 runs + +▶ [rocm6_4_2] Llama-4-Scout-17B-16E-Instruct-Q8_0-00001-of-00003 (runs: 3) + → log : loadtime_results/Llama-4-Scout-17B-16E-Instruct-Q8_0-00001-of-00003__rocm6_4_2.log + → flags : -ngl 999 -fa --no-mmap -no-cnv -n 1 +✔ [rocm6_4_2] Llama-4-Scout-17B-16E-Instruct-Q8_0-00001-of-00003 avg=40.739s over 3 runs + +▶ [rocm7_rc] Qwen3-235B-A22B-Instruct-2507-UD-Q3_K_XL-00001-of-00003 (runs: 3) + → log : loadtime_results/Qwen3-235B-A22B-Instruct-2507-UD-Q3_K_XL-00001-of-00003__rocm7_rc.log + → flags : -ngl 999 -fa --no-mmap -no-cnv -n 1 +✔ [rocm7_rc] Qwen3-235B-A22B-Instruct-2507-UD-Q3_K_XL-00001-of-00003 
avg=33.458s over 3 runs + +▶ [rocm7_beta] Qwen3-235B-A22B-Instruct-2507-UD-Q3_K_XL-00001-of-00003 (runs: 3) + → log : loadtime_results/Qwen3-235B-A22B-Instruct-2507-UD-Q3_K_XL-00001-of-00003__rocm7_beta.log + → flags : -ngl 999 -fa --no-mmap -no-cnv -n 1 +✔ [rocm7_beta] Qwen3-235B-A22B-Instruct-2507-UD-Q3_K_XL-00001-of-00003 avg=35.392s over 3 runs + +▶ [vulkan_radv] Qwen3-235B-A22B-Instruct-2507-UD-Q3_K_XL-00001-of-00003 (runs: 3) + → log : loadtime_results/Qwen3-235B-A22B-Instruct-2507-UD-Q3_K_XL-00001-of-00003__vulkan_radv.log + → flags : -ngl 999 -fa --no-mmap -no-cnv -n 1 +✔ [vulkan_radv] Qwen3-235B-A22B-Instruct-2507-UD-Q3_K_XL-00001-of-00003 avg=40.722s over 3 runs + +▶ [vulkan_amdvlk] Qwen3-235B-A22B-Instruct-2507-UD-Q3_K_XL-00001-of-00003 (runs: 3) + → log : loadtime_results/Qwen3-235B-A22B-Instruct-2507-UD-Q3_K_XL-00001-of-00003__vulkan_amdvlk.log + → flags : -ngl 999 -fa --no-mmap -no-cnv -n 1 +✔ [vulkan_amdvlk] Qwen3-235B-A22B-Instruct-2507-UD-Q3_K_XL-00001-of-00003 avg=44.883s over 3 runs + +▶ [rocm6_4_2] Qwen3-235B-A22B-Instruct-2507-UD-Q3_K_XL-00001-of-00003 (runs: 3) + → log : loadtime_results/Qwen3-235B-A22B-Instruct-2507-UD-Q3_K_XL-00001-of-00003__rocm6_4_2.log + → flags : -ngl 999 -fa --no-mmap -no-cnv -n 1 +✔ [rocm6_4_2] Qwen3-235B-A22B-Instruct-2507-UD-Q3_K_XL-00001-of-00003 avg=39.062s over 3 runs + +▶ [rocm7_rc] Qwen3-30B-A3B-BF16-00001-of-00002 (runs: 3) + → log : loadtime_results/Qwen3-30B-A3B-BF16-00001-of-00002__rocm7_rc.log + → flags : -ngl 999 -fa --no-mmap -no-cnv -n 1 +✔ [rocm7_rc] Qwen3-30B-A3B-BF16-00001-of-00002 avg=22.669s over 3 runs + +▶ [rocm7_beta] Qwen3-30B-A3B-BF16-00001-of-00002 (runs: 3) + → log : loadtime_results/Qwen3-30B-A3B-BF16-00001-of-00002__rocm7_beta.log + → flags : -ngl 999 -fa --no-mmap -no-cnv -n 1 +✔ [rocm7_beta] Qwen3-30B-A3B-BF16-00001-of-00002 avg=15.930s over 3 runs + +▶ [vulkan_radv] Qwen3-30B-A3B-BF16-00001-of-00002 (runs: 3) + → log : loadtime_results/Qwen3-30B-A3B-BF16-00001-of-00002__vulkan_radv.log + 
→ flags : -ngl 999 -fa --no-mmap -no-cnv -n 1 +✔ [vulkan_radv] Qwen3-30B-A3B-BF16-00001-of-00002 avg=14.761s over 3 runs + +▶ [vulkan_amdvlk] Qwen3-30B-A3B-BF16-00001-of-00002 (runs: 3) + → log : loadtime_results/Qwen3-30B-A3B-BF16-00001-of-00002__vulkan_amdvlk.log + → flags : -ngl 999 -fa --no-mmap -no-cnv -n 1 +✔ [vulkan_amdvlk] Qwen3-30B-A3B-BF16-00001-of-00002 avg=12.935s over 3 runs + +▶ [rocm6_4_2] Qwen3-30B-A3B-BF16-00001-of-00002 (runs: 3) + → log : loadtime_results/Qwen3-30B-A3B-BF16-00001-of-00002__rocm6_4_2.log + → flags : -ngl 999 -fa --no-mmap -no-cnv -n 1 +✔ [rocm6_4_2] Qwen3-30B-A3B-BF16-00001-of-00002 avg=22.166s over 3 runs + +▶ [rocm7_rc] Qwen3-Coder-30B-A3B-Instruct-BF16-00001-of-00002 (runs: 3) + → log : loadtime_results/Qwen3-Coder-30B-A3B-Instruct-BF16-00001-of-00002__rocm7_rc.log + → flags : -ngl 999 -fa --no-mmap -no-cnv -n 1 +✔ [rocm7_rc] Qwen3-Coder-30B-A3B-Instruct-BF16-00001-of-00002 avg=16.161s over 3 runs + +▶ [rocm7_beta] Qwen3-Coder-30B-A3B-Instruct-BF16-00001-of-00002 (runs: 3) + → log : loadtime_results/Qwen3-Coder-30B-A3B-Instruct-BF16-00001-of-00002__rocm7_beta.log + → flags : -ngl 999 -fa --no-mmap -no-cnv -n 1 +✔ [rocm7_beta] Qwen3-Coder-30B-A3B-Instruct-BF16-00001-of-00002 avg=14.392s over 3 runs + +▶ [vulkan_radv] Qwen3-Coder-30B-A3B-Instruct-BF16-00001-of-00002 (runs: 3) + → log : loadtime_results/Qwen3-Coder-30B-A3B-Instruct-BF16-00001-of-00002__vulkan_radv.log + → flags : -ngl 999 -fa --no-mmap -no-cnv -n 1 +✔ [vulkan_radv] Qwen3-Coder-30B-A3B-Instruct-BF16-00001-of-00002 avg=14.021s over 3 runs + +▶ [vulkan_amdvlk] Qwen3-Coder-30B-A3B-Instruct-BF16-00001-of-00002 (runs: 3) + → log : loadtime_results/Qwen3-Coder-30B-A3B-Instruct-BF16-00001-of-00002__vulkan_amdvlk.log + → flags : -ngl 999 -fa --no-mmap -no-cnv -n 1 +✔ [vulkan_amdvlk] Qwen3-Coder-30B-A3B-Instruct-BF16-00001-of-00002 avg=12.940s over 3 runs + +▶ [rocm6_4_2] Qwen3-Coder-30B-A3B-Instruct-BF16-00001-of-00002 (runs: 3) + → log : 
loadtime_results/Qwen3-Coder-30B-A3B-Instruct-BF16-00001-of-00002__rocm6_4_2.log + → flags : -ngl 999 -fa --no-mmap -no-cnv -n 1 +✔ [rocm6_4_2] Qwen3-Coder-30B-A3B-Instruct-BF16-00001-of-00002 avg=17.779s over 3 runs + diff --git a/benchmark/run_loadtime_benchmark.log.backup b/benchmark/run_loadtime_benchmark.log.backup new file mode 100644 index 0000000..e3578c3 --- /dev/null +++ b/benchmark/run_loadtime_benchmark.log.backup @@ -0,0 +1,331 @@ +Found 11 models to test with llama-cli (3 runs each) + +▶ [rocm7_rc] gemma-3-12b-it-UD-Q8_K_XL (runs: 3) + → log : loadtime_results/gemma-3-12b-it-UD-Q8_K_XL__rocm7_rc.log + → flags : -ngl 999 -fa --no-mmap -no-cnv -n 1 +✔ [rocm7_rc] gemma-3-12b-it-UD-Q8_K_XL avg=3.861s over 3 runs + +▶ [rocm7_beta] gemma-3-12b-it-UD-Q8_K_XL (runs: 3) + → log : loadtime_results/gemma-3-12b-it-UD-Q8_K_XL__rocm7_beta.log + → flags : -ngl 999 -fa --no-mmap -no-cnv -n 1 +✔ [rocm7_beta] gemma-3-12b-it-UD-Q8_K_XL avg=3.434s over 3 runs + +▶ [vulkan_radv] gemma-3-12b-it-UD-Q8_K_XL (runs: 3) + → log : loadtime_results/gemma-3-12b-it-UD-Q8_K_XL__vulkan_radv.log + → flags : -ngl 999 -fa --no-mmap -no-cnv -n 1 +✔ [vulkan_radv] gemma-3-12b-it-UD-Q8_K_XL avg=4.295s over 3 runs + +▶ [vulkan_amdvlk] gemma-3-12b-it-UD-Q8_K_XL (runs: 3) + → log : loadtime_results/gemma-3-12b-it-UD-Q8_K_XL__vulkan_amdvlk.log + → flags : -ngl 999 -fa --no-mmap -no-cnv -n 1 +✔ [vulkan_amdvlk] gemma-3-12b-it-UD-Q8_K_XL avg=3.955s over 3 runs + +▶ [rocm6_4_2] gemma-3-12b-it-UD-Q8_K_XL (runs: 3) + → log : loadtime_results/gemma-3-12b-it-UD-Q8_K_XL__rocm6_4_2.log + → flags : -ngl 999 -fa --no-mmap -no-cnv -n 1 +✔ [rocm6_4_2] gemma-3-12b-it-UD-Q8_K_XL avg=6.686s over 3 runs + +▶ [host] gemma-3-12b-it-UD-Q8_K_XL (runs: 3) + → log : loadtime_results/gemma-3-12b-it-UD-Q8_K_XL__host.log + → flags : -ngl 999 -fa --no-mmap -no-cnv -n 1 +✔ [host] gemma-3-12b-it-UD-Q8_K_XL avg=3.785s over 3 runs + +▶ [rocm7_rc] gemma-3-27b-it-BF16-00001-of-00002 (runs: 3) + → log : 
loadtime_results/gemma-3-27b-it-BF16-00001-of-00002__rocm7_rc.log + → flags : -ngl 999 -fa --no-mmap -no-cnv -n 1 +✔ [rocm7_rc] gemma-3-27b-it-BF16-00001-of-00002 avg=10.417s over 3 runs + +▶ [rocm7_beta] gemma-3-27b-it-BF16-00001-of-00002 (runs: 3) + → log : loadtime_results/gemma-3-27b-it-BF16-00001-of-00002__rocm7_beta.log + → flags : -ngl 999 -fa --no-mmap -no-cnv -n 1 +✔ [rocm7_beta] gemma-3-27b-it-BF16-00001-of-00002 avg=10.486s over 3 runs + +▶ [vulkan_radv] gemma-3-27b-it-BF16-00001-of-00002 (runs: 3) + → log : loadtime_results/gemma-3-27b-it-BF16-00001-of-00002__vulkan_radv.log + → flags : -ngl 999 -fa --no-mmap -no-cnv -n 1 +✔ [vulkan_radv] gemma-3-27b-it-BF16-00001-of-00002 avg=13.579s over 3 runs + +▶ [vulkan_amdvlk] gemma-3-27b-it-BF16-00001-of-00002 (runs: 3) + → log : loadtime_results/gemma-3-27b-it-BF16-00001-of-00002__vulkan_amdvlk.log + → flags : -ngl 999 -fa --no-mmap -no-cnv -n 1 +✖ [vulkan_amdvlk] gemma-3-27b-it-BF16-00001-of-00002 all runs failed + +▶ [rocm6_4_2] gemma-3-27b-it-BF16-00001-of-00002 (runs: 3) + → log : loadtime_results/gemma-3-27b-it-BF16-00001-of-00002__rocm6_4_2.log + → flags : -ngl 999 -fa --no-mmap -no-cnv -n 1 +✔ [rocm6_4_2] gemma-3-27b-it-BF16-00001-of-00002 avg=12.495s over 3 runs + +▶ [host] gemma-3-27b-it-BF16-00001-of-00002 (runs: 3) + → log : loadtime_results/gemma-3-27b-it-BF16-00001-of-00002__host.log + → flags : -ngl 999 -fa --no-mmap -no-cnv -n 1 +✖ [host] gemma-3-27b-it-BF16-00001-of-00002 all runs failed + +▶ [rocm7_rc] Kimi-Dev-72B-UD-Q8_K_XL-00001-of-00002 (runs: 3) + → log : loadtime_results/Kimi-Dev-72B-UD-Q8_K_XL-00001-of-00002__rocm7_rc.log + → flags : -ngl 999 -fa --no-mmap -no-cnv -n 1 +✔ [rocm7_rc] Kimi-Dev-72B-UD-Q8_K_XL-00001-of-00002 avg=26.362s over 3 runs + +▶ [rocm7_beta] Kimi-Dev-72B-UD-Q8_K_XL-00001-of-00002 (runs: 3) + → log : loadtime_results/Kimi-Dev-72B-UD-Q8_K_XL-00001-of-00002__rocm7_beta.log + → flags : -ngl 999 -fa --no-mmap -no-cnv -n 1 +✔ [rocm7_beta] 
Kimi-Dev-72B-UD-Q8_K_XL-00001-of-00002 avg=30.024s over 3 runs + +▶ [vulkan_radv] Kimi-Dev-72B-UD-Q8_K_XL-00001-of-00002 (runs: 3) + → log : loadtime_results/Kimi-Dev-72B-UD-Q8_K_XL-00001-of-00002__vulkan_radv.log + → flags : -ngl 999 -fa --no-mmap -no-cnv -n 1 +✔ [vulkan_radv] Kimi-Dev-72B-UD-Q8_K_XL-00001-of-00002 avg=30.591s over 3 runs + +▶ [vulkan_amdvlk] Kimi-Dev-72B-UD-Q8_K_XL-00001-of-00002 (runs: 3) + → log : loadtime_results/Kimi-Dev-72B-UD-Q8_K_XL-00001-of-00002__vulkan_amdvlk.log + → flags : -ngl 999 -fa --no-mmap -no-cnv -n 1 +✖ [vulkan_amdvlk] Kimi-Dev-72B-UD-Q8_K_XL-00001-of-00002 all runs failed + +▶ [rocm6_4_2] Kimi-Dev-72B-UD-Q8_K_XL-00001-of-00002 (runs: 3) + → log : loadtime_results/Kimi-Dev-72B-UD-Q8_K_XL-00001-of-00002__rocm6_4_2.log + → flags : -ngl 999 -fa --no-mmap -no-cnv -n 1 +✔ [rocm6_4_2] Kimi-Dev-72B-UD-Q8_K_XL-00001-of-00002 avg=35.301s over 3 runs + +▶ [host] Kimi-Dev-72B-UD-Q8_K_XL-00001-of-00002 (runs: 3) + → log : loadtime_results/Kimi-Dev-72B-UD-Q8_K_XL-00001-of-00002__host.log + → flags : -ngl 999 -fa --no-mmap -no-cnv -n 1 +✖ [host] Kimi-Dev-72B-UD-Q8_K_XL-00001-of-00002 all runs failed + +▶ [rocm7_rc] Llama-3.3-70B-Instruct-UD-Q8_K_XL-00001-of-00002 (runs: 3) + → log : loadtime_results/Llama-3.3-70B-Instruct-UD-Q8_K_XL-00001-of-00002__rocm7_rc.log + → flags : -ngl 999 -fa --no-mmap -no-cnv -n 1 +✔ [rocm7_rc] Llama-3.3-70B-Instruct-UD-Q8_K_XL-00001-of-00002 avg=32.911s over 3 runs + +▶ [rocm7_beta] Llama-3.3-70B-Instruct-UD-Q8_K_XL-00001-of-00002 (runs: 3) + → log : loadtime_results/Llama-3.3-70B-Instruct-UD-Q8_K_XL-00001-of-00002__rocm7_beta.log + → flags : -ngl 999 -fa --no-mmap -no-cnv -n 1 +✔ [rocm7_beta] Llama-3.3-70B-Instruct-UD-Q8_K_XL-00001-of-00002 avg=32.796s over 3 runs + +▶ [vulkan_radv] Llama-3.3-70B-Instruct-UD-Q8_K_XL-00001-of-00002 (runs: 3) + → log : loadtime_results/Llama-3.3-70B-Instruct-UD-Q8_K_XL-00001-of-00002__vulkan_radv.log + → flags : -ngl 999 -fa --no-mmap -no-cnv -n 1 +✔ [vulkan_radv] 
Llama-3.3-70B-Instruct-UD-Q8_K_XL-00001-of-00002 avg=30.376s over 3 runs + +▶ [vulkan_amdvlk] Llama-3.3-70B-Instruct-UD-Q8_K_XL-00001-of-00002 (runs: 3) + → log : loadtime_results/Llama-3.3-70B-Instruct-UD-Q8_K_XL-00001-of-00002__vulkan_amdvlk.log + → flags : -ngl 999 -fa --no-mmap -no-cnv -n 1 +✔ [vulkan_amdvlk] Llama-3.3-70B-Instruct-UD-Q8_K_XL-00001-of-00002 avg=30.604s over 3 runs + +▶ [rocm6_4_2] Llama-3.3-70B-Instruct-UD-Q8_K_XL-00001-of-00002 (runs: 3) + → log : loadtime_results/Llama-3.3-70B-Instruct-UD-Q8_K_XL-00001-of-00002__rocm6_4_2.log + → flags : -ngl 999 -fa --no-mmap -no-cnv -n 1 +✔ [rocm6_4_2] Llama-3.3-70B-Instruct-UD-Q8_K_XL-00001-of-00002 avg=30.998s over 3 runs + +▶ [host] Llama-3.3-70B-Instruct-UD-Q8_K_XL-00001-of-00002 (runs: 3) + → log : loadtime_results/Llama-3.3-70B-Instruct-UD-Q8_K_XL-00001-of-00002__host.log + → flags : -ngl 999 -fa --no-mmap -no-cnv -n 1 +✔ [host] Llama-3.3-70B-Instruct-UD-Q8_K_XL-00001-of-00002 avg=31.133s over 3 runs + +▶ [rocm7_rc] llama3.3-70.6B-Q4_K_M (runs: 3) + → log : loadtime_results/llama3.3-70.6B-Q4_K_M__rocm7_rc.log + → flags : -ngl 999 -fa --no-mmap -no-cnv -n 1 +✔ [rocm7_rc] llama3.3-70.6B-Q4_K_M avg=14.602s over 3 runs + +▶ [rocm7_beta] llama3.3-70.6B-Q4_K_M (runs: 3) + → log : loadtime_results/llama3.3-70.6B-Q4_K_M__rocm7_beta.log + → flags : -ngl 999 -fa --no-mmap -no-cnv -n 1 +✔ [rocm7_beta] llama3.3-70.6B-Q4_K_M avg=9.338s over 3 runs + +▶ [vulkan_radv] llama3.3-70.6B-Q4_K_M (runs: 3) + → log : loadtime_results/llama3.3-70.6B-Q4_K_M__vulkan_radv.log + → flags : -ngl 999 -fa --no-mmap -no-cnv -n 1 +✔ [vulkan_radv] llama3.3-70.6B-Q4_K_M avg=8.816s over 3 runs + +▶ [vulkan_amdvlk] llama3.3-70.6B-Q4_K_M (runs: 3) + → log : loadtime_results/llama3.3-70.6B-Q4_K_M__vulkan_amdvlk.log + → flags : -ngl 999 -fa --no-mmap -no-cnv -n 1 +✔ [vulkan_amdvlk] llama3.3-70.6B-Q4_K_M avg=9.176s over 3 runs + +▶ [rocm6_4_2] llama3.3-70.6B-Q4_K_M (runs: 3) + → log : loadtime_results/llama3.3-70.6B-Q4_K_M__rocm6_4_2.log + → 
flags : -ngl 999 -fa --no-mmap -no-cnv -n 1 +✔ [rocm6_4_2] llama3.3-70.6B-Q4_K_M avg=9.887s over 3 runs + +▶ [host] llama3.3-70.6B-Q4_K_M (runs: 3) + → log : loadtime_results/llama3.3-70.6B-Q4_K_M__host.log + → flags : -ngl 999 -fa --no-mmap -no-cnv -n 1 +✔ [host] llama3.3-70.6B-Q4_K_M avg=8.979s over 3 runs + +▶ [rocm7_rc] Llama-4-Scout-17B-16E-Instruct-UD-Q4_K_XL-00001-of-00002 (runs: 3) + → log : loadtime_results/Llama-4-Scout-17B-16E-Instruct-UD-Q4_K_XL-00001-of-00002__rocm7_rc.log + → flags : -ngl 999 -fa --no-mmap -no-cnv -n 1 +✔ [rocm7_rc] Llama-4-Scout-17B-16E-Instruct-UD-Q4_K_XL-00001-of-00002 avg=19.365s over 2 runs + +▶ [rocm7_beta] Llama-4-Scout-17B-16E-Instruct-UD-Q4_K_XL-00001-of-00002 (runs: 3) + → log : loadtime_results/Llama-4-Scout-17B-16E-Instruct-UD-Q4_K_XL-00001-of-00002__rocm7_beta.log + → flags : -ngl 999 -fa --no-mmap -no-cnv -n 1 +✖ [rocm7_beta] Llama-4-Scout-17B-16E-Instruct-UD-Q4_K_XL-00001-of-00002 all runs failed + +▶ [vulkan_radv] Llama-4-Scout-17B-16E-Instruct-UD-Q4_K_XL-00001-of-00002 (runs: 3) + → log : loadtime_results/Llama-4-Scout-17B-16E-Instruct-UD-Q4_K_XL-00001-of-00002__vulkan_radv.log + → flags : -ngl 999 -fa --no-mmap -no-cnv -n 1 +✔ [vulkan_radv] Llama-4-Scout-17B-16E-Instruct-UD-Q4_K_XL-00001-of-00002 avg=20.045s over 3 runs + +▶ [vulkan_amdvlk] Llama-4-Scout-17B-16E-Instruct-UD-Q4_K_XL-00001-of-00002 (runs: 3) + → log : loadtime_results/Llama-4-Scout-17B-16E-Instruct-UD-Q4_K_XL-00001-of-00002__vulkan_amdvlk.log + → flags : -ngl 999 -fa --no-mmap -no-cnv -n 1 +✔ [vulkan_amdvlk] Llama-4-Scout-17B-16E-Instruct-UD-Q4_K_XL-00001-of-00002 avg=16.752s over 3 runs + +▶ [rocm6_4_2] Llama-4-Scout-17B-16E-Instruct-UD-Q4_K_XL-00001-of-00002 (runs: 3) + → log : loadtime_results/Llama-4-Scout-17B-16E-Instruct-UD-Q4_K_XL-00001-of-00002__rocm6_4_2.log + → flags : -ngl 999 -fa --no-mmap -no-cnv -n 1 +✔ [rocm6_4_2] Llama-4-Scout-17B-16E-Instruct-UD-Q4_K_XL-00001-of-00002 avg=15.776s over 3 runs + +▶ [host] 
Llama-4-Scout-17B-16E-Instruct-UD-Q4_K_XL-00001-of-00002 (runs: 3) + → log : loadtime_results/Llama-4-Scout-17B-16E-Instruct-UD-Q4_K_XL-00001-of-00002__host.log + → flags : -ngl 999 -fa --no-mmap -no-cnv -n 1 +✔ [host] Llama-4-Scout-17B-16E-Instruct-UD-Q4_K_XL-00001-of-00002 avg=18.146s over 3 runs + +▶ [rocm7_rc] Llama-4-Scout-17B-16E-Instruct-Q6_K-00001-of-00002 (runs: 3) + → log : loadtime_results/Llama-4-Scout-17B-16E-Instruct-Q6_K-00001-of-00002__rocm7_rc.log + → flags : -ngl 999 -fa --no-mmap -no-cnv -n 1 +✔ [rocm7_rc] Llama-4-Scout-17B-16E-Instruct-Q6_K-00001-of-00002 avg=28.435s over 3 runs + +▶ [rocm7_beta] Llama-4-Scout-17B-16E-Instruct-Q6_K-00001-of-00002 (runs: 3) + → log : loadtime_results/Llama-4-Scout-17B-16E-Instruct-Q6_K-00001-of-00002__rocm7_beta.log + → flags : -ngl 999 -fa --no-mmap -no-cnv -n 1 +✔ [rocm7_beta] Llama-4-Scout-17B-16E-Instruct-Q6_K-00001-of-00002 avg=28.221s over 3 runs + +▶ [vulkan_radv] Llama-4-Scout-17B-16E-Instruct-Q6_K-00001-of-00002 (runs: 3) + → log : loadtime_results/Llama-4-Scout-17B-16E-Instruct-Q6_K-00001-of-00002__vulkan_radv.log + → flags : -ngl 999 -fa --no-mmap -no-cnv -n 1 +✔ [vulkan_radv] Llama-4-Scout-17B-16E-Instruct-Q6_K-00001-of-00002 avg=32.810s over 3 runs + +▶ [vulkan_amdvlk] Llama-4-Scout-17B-16E-Instruct-Q6_K-00001-of-00002 (runs: 3) + → log : loadtime_results/Llama-4-Scout-17B-16E-Instruct-Q6_K-00001-of-00002__vulkan_amdvlk.log + → flags : -ngl 999 -fa --no-mmap -no-cnv -n 1 +✔ [vulkan_amdvlk] Llama-4-Scout-17B-16E-Instruct-Q6_K-00001-of-00002 avg=35.541s over 3 runs + +▶ [rocm6_4_2] Llama-4-Scout-17B-16E-Instruct-Q6_K-00001-of-00002 (runs: 3) + → log : loadtime_results/Llama-4-Scout-17B-16E-Instruct-Q6_K-00001-of-00002__rocm6_4_2.log + → flags : -ngl 999 -fa --no-mmap -no-cnv -n 1 +✔ [rocm6_4_2] Llama-4-Scout-17B-16E-Instruct-Q6_K-00001-of-00002 avg=31.792s over 3 runs + +▶ [host] Llama-4-Scout-17B-16E-Instruct-Q6_K-00001-of-00002 (runs: 3) + → log : 
loadtime_results/Llama-4-Scout-17B-16E-Instruct-Q6_K-00001-of-00002__host.log + → flags : -ngl 999 -fa --no-mmap -no-cnv -n 1 +✔ [host] Llama-4-Scout-17B-16E-Instruct-Q6_K-00001-of-00002 avg=33.403s over 3 runs + +▶ [rocm7_rc] Llama-4-Scout-17B-16E-Instruct-Q8_0-00001-of-00003 (runs: 3) + → log : loadtime_results/Llama-4-Scout-17B-16E-Instruct-Q8_0-00001-of-00003__rocm7_rc.log + → flags : -ngl 999 -fa --no-mmap -no-cnv -n 1 +✔ [rocm7_rc] Llama-4-Scout-17B-16E-Instruct-Q8_0-00001-of-00003 avg=35.742s over 3 runs + +▶ [rocm7_beta] Llama-4-Scout-17B-16E-Instruct-Q8_0-00001-of-00003 (runs: 3) + → log : loadtime_results/Llama-4-Scout-17B-16E-Instruct-Q8_0-00001-of-00003__rocm7_beta.log + → flags : -ngl 999 -fa --no-mmap -no-cnv -n 1 +✔ [rocm7_beta] Llama-4-Scout-17B-16E-Instruct-Q8_0-00001-of-00003 avg=36.400s over 3 runs + +▶ [vulkan_radv] Llama-4-Scout-17B-16E-Instruct-Q8_0-00001-of-00003 (runs: 3) + → log : loadtime_results/Llama-4-Scout-17B-16E-Instruct-Q8_0-00001-of-00003__vulkan_radv.log + → flags : -ngl 999 -fa --no-mmap -no-cnv -n 1 +✔ [vulkan_radv] Llama-4-Scout-17B-16E-Instruct-Q8_0-00001-of-00003 avg=41.626s over 3 runs + +▶ [vulkan_amdvlk] Llama-4-Scout-17B-16E-Instruct-Q8_0-00001-of-00003 (runs: 3) + → log : loadtime_results/Llama-4-Scout-17B-16E-Instruct-Q8_0-00001-of-00003__vulkan_amdvlk.log + → flags : -ngl 999 -fa --no-mmap -no-cnv -n 1 +✔ [vulkan_amdvlk] Llama-4-Scout-17B-16E-Instruct-Q8_0-00001-of-00003 avg=47.967s over 3 runs + +▶ [rocm6_4_2] Llama-4-Scout-17B-16E-Instruct-Q8_0-00001-of-00003 (runs: 3) + → log : loadtime_results/Llama-4-Scout-17B-16E-Instruct-Q8_0-00001-of-00003__rocm6_4_2.log + → flags : -ngl 999 -fa --no-mmap -no-cnv -n 1 +✔ [rocm6_4_2] Llama-4-Scout-17B-16E-Instruct-Q8_0-00001-of-00003 avg=40.739s over 3 runs + +▶ [host] Llama-4-Scout-17B-16E-Instruct-Q8_0-00001-of-00003 (runs: 3) + → log : loadtime_results/Llama-4-Scout-17B-16E-Instruct-Q8_0-00001-of-00003__host.log + → flags : -ngl 999 -fa --no-mmap -no-cnv -n 1 +✔ [host] 
Llama-4-Scout-17B-16E-Instruct-Q8_0-00001-of-00003 avg=47.723s over 3 runs + +▶ [rocm7_rc] Qwen3-235B-A22B-Instruct-2507-UD-Q3_K_XL-00001-of-00003 (runs: 3) + → log : loadtime_results/Qwen3-235B-A22B-Instruct-2507-UD-Q3_K_XL-00001-of-00003__rocm7_rc.log + → flags : -ngl 999 -fa --no-mmap -no-cnv -n 1 +✔ [rocm7_rc] Qwen3-235B-A22B-Instruct-2507-UD-Q3_K_XL-00001-of-00003 avg=33.458s over 3 runs + +▶ [rocm7_beta] Qwen3-235B-A22B-Instruct-2507-UD-Q3_K_XL-00001-of-00003 (runs: 3) + → log : loadtime_results/Qwen3-235B-A22B-Instruct-2507-UD-Q3_K_XL-00001-of-00003__rocm7_beta.log + → flags : -ngl 999 -fa --no-mmap -no-cnv -n 1 +✔ [rocm7_beta] Qwen3-235B-A22B-Instruct-2507-UD-Q3_K_XL-00001-of-00003 avg=35.392s over 3 runs + +▶ [vulkan_radv] Qwen3-235B-A22B-Instruct-2507-UD-Q3_K_XL-00001-of-00003 (runs: 3) + → log : loadtime_results/Qwen3-235B-A22B-Instruct-2507-UD-Q3_K_XL-00001-of-00003__vulkan_radv.log + → flags : -ngl 999 -fa --no-mmap -no-cnv -n 1 +✔ [vulkan_radv] Qwen3-235B-A22B-Instruct-2507-UD-Q3_K_XL-00001-of-00003 avg=40.722s over 3 runs + +▶ [vulkan_amdvlk] Qwen3-235B-A22B-Instruct-2507-UD-Q3_K_XL-00001-of-00003 (runs: 3) + → log : loadtime_results/Qwen3-235B-A22B-Instruct-2507-UD-Q3_K_XL-00001-of-00003__vulkan_amdvlk.log + → flags : -ngl 999 -fa --no-mmap -no-cnv -n 1 +✔ [vulkan_amdvlk] Qwen3-235B-A22B-Instruct-2507-UD-Q3_K_XL-00001-of-00003 avg=44.883s over 3 runs + +▶ [rocm6_4_2] Qwen3-235B-A22B-Instruct-2507-UD-Q3_K_XL-00001-of-00003 (runs: 3) + → log : loadtime_results/Qwen3-235B-A22B-Instruct-2507-UD-Q3_K_XL-00001-of-00003__rocm6_4_2.log + → flags : -ngl 999 -fa --no-mmap -no-cnv -n 1 +✔ [rocm6_4_2] Qwen3-235B-A22B-Instruct-2507-UD-Q3_K_XL-00001-of-00003 avg=39.062s over 3 runs + +▶ [host] Qwen3-235B-A22B-Instruct-2507-UD-Q3_K_XL-00001-of-00003 (runs: 3) + → log : loadtime_results/Qwen3-235B-A22B-Instruct-2507-UD-Q3_K_XL-00001-of-00003__host.log + → flags : -ngl 999 -fa --no-mmap -no-cnv -n 1 +✔ [host] Qwen3-235B-A22B-Instruct-2507-UD-Q3_K_XL-00001-of-00003 
avg=44.276s over 3 runs + +▶ [rocm7_rc] Qwen3-30B-A3B-BF16-00001-of-00002 (runs: 3) + → log : loadtime_results/Qwen3-30B-A3B-BF16-00001-of-00002__rocm7_rc.log + → flags : -ngl 999 -fa --no-mmap -no-cnv -n 1 +✔ [rocm7_rc] Qwen3-30B-A3B-BF16-00001-of-00002 avg=22.669s over 3 runs + +▶ [rocm7_beta] Qwen3-30B-A3B-BF16-00001-of-00002 (runs: 3) + → log : loadtime_results/Qwen3-30B-A3B-BF16-00001-of-00002__rocm7_beta.log + → flags : -ngl 999 -fa --no-mmap -no-cnv -n 1 +✔ [rocm7_beta] Qwen3-30B-A3B-BF16-00001-of-00002 avg=15.930s over 3 runs + +▶ [vulkan_radv] Qwen3-30B-A3B-BF16-00001-of-00002 (runs: 3) + → log : loadtime_results/Qwen3-30B-A3B-BF16-00001-of-00002__vulkan_radv.log + → flags : -ngl 999 -fa --no-mmap -no-cnv -n 1 +✔ [vulkan_radv] Qwen3-30B-A3B-BF16-00001-of-00002 avg=14.761s over 3 runs + +▶ [vulkan_amdvlk] Qwen3-30B-A3B-BF16-00001-of-00002 (runs: 3) + → log : loadtime_results/Qwen3-30B-A3B-BF16-00001-of-00002__vulkan_amdvlk.log + → flags : -ngl 999 -fa --no-mmap -no-cnv -n 1 +✔ [vulkan_amdvlk] Qwen3-30B-A3B-BF16-00001-of-00002 avg=12.935s over 3 runs + +▶ [rocm6_4_2] Qwen3-30B-A3B-BF16-00001-of-00002 (runs: 3) + → log : loadtime_results/Qwen3-30B-A3B-BF16-00001-of-00002__rocm6_4_2.log + → flags : -ngl 999 -fa --no-mmap -no-cnv -n 1 +✔ [rocm6_4_2] Qwen3-30B-A3B-BF16-00001-of-00002 avg=22.166s over 3 runs + +▶ [host] Qwen3-30B-A3B-BF16-00001-of-00002 (runs: 3) + → log : loadtime_results/Qwen3-30B-A3B-BF16-00001-of-00002__host.log + → flags : -ngl 999 -fa --no-mmap -no-cnv -n 1 +✔ [host] Qwen3-30B-A3B-BF16-00001-of-00002 avg=13.034s over 3 runs + +▶ [rocm7_rc] Qwen3-Coder-30B-A3B-Instruct-BF16-00001-of-00002 (runs: 3) + → log : loadtime_results/Qwen3-Coder-30B-A3B-Instruct-BF16-00001-of-00002__rocm7_rc.log + → flags : -ngl 999 -fa --no-mmap -no-cnv -n 1 +✔ [rocm7_rc] Qwen3-Coder-30B-A3B-Instruct-BF16-00001-of-00002 avg=16.161s over 3 runs + +▶ [rocm7_beta] Qwen3-Coder-30B-A3B-Instruct-BF16-00001-of-00002 (runs: 3) + → log : 
loadtime_results/Qwen3-Coder-30B-A3B-Instruct-BF16-00001-of-00002__rocm7_beta.log + → flags : -ngl 999 -fa --no-mmap -no-cnv -n 1 +✔ [rocm7_beta] Qwen3-Coder-30B-A3B-Instruct-BF16-00001-of-00002 avg=14.392s over 3 runs + +▶ [vulkan_radv] Qwen3-Coder-30B-A3B-Instruct-BF16-00001-of-00002 (runs: 3) + → log : loadtime_results/Qwen3-Coder-30B-A3B-Instruct-BF16-00001-of-00002__vulkan_radv.log + → flags : -ngl 999 -fa --no-mmap -no-cnv -n 1 +✔ [vulkan_radv] Qwen3-Coder-30B-A3B-Instruct-BF16-00001-of-00002 avg=14.021s over 3 runs + +▶ [vulkan_amdvlk] Qwen3-Coder-30B-A3B-Instruct-BF16-00001-of-00002 (runs: 3) + → log : loadtime_results/Qwen3-Coder-30B-A3B-Instruct-BF16-00001-of-00002__vulkan_amdvlk.log + → flags : -ngl 999 -fa --no-mmap -no-cnv -n 1 +✔ [vulkan_amdvlk] Qwen3-Coder-30B-A3B-Instruct-BF16-00001-of-00002 avg=12.940s over 3 runs + +▶ [rocm6_4_2] Qwen3-Coder-30B-A3B-Instruct-BF16-00001-of-00002 (runs: 3) + → log : loadtime_results/Qwen3-Coder-30B-A3B-Instruct-BF16-00001-of-00002__rocm6_4_2.log + → flags : -ngl 999 -fa --no-mmap -no-cnv -n 1 +✔ [rocm6_4_2] Qwen3-Coder-30B-A3B-Instruct-BF16-00001-of-00002 avg=17.779s over 3 runs + +▶ [host] Qwen3-Coder-30B-A3B-Instruct-BF16-00001-of-00002 (runs: 3) + → log : loadtime_results/Qwen3-Coder-30B-A3B-Instruct-BF16-00001-of-00002__host.log + → flags : -ngl 999 -fa --no-mmap -no-cnv -n 1 +✔ [host] Qwen3-Coder-30B-A3B-Instruct-BF16-00001-of-00002 avg=12.546s over 3 runs diff --git a/benchmark/run_loadtime_benchmark.sh b/benchmark/run_loadtime_benchmark.sh new file mode 100755 index 0000000..57612da --- /dev/null +++ b/benchmark/run_loadtime_benchmark.sh @@ -0,0 +1,88 @@ +#!/usr/bin/env bash +# run_loadtime_benchmarks.sh +# Benchmark each model with llama-cli: measure load + single-token inference times (including load time) +# Run each model/env combination 3 times and compute average elapsed time +set -uo pipefail + +MODEL_DIR="$(realpath models)" +RESULTDIR="loadtime_results" +mkdir -p "$RESULTDIR" + +# 1) Gather one .gguf 
per model (single-file or first shard) +mapfile -t MODELS < <( + find "$MODEL_DIR" -type f -name '*.gguf' \ + \( -name '*-00001-of-*.gguf' -o ! -name '*-000*-of-*.gguf' \) \ + | sort +) +if (( ${#MODELS[@]} == 0 )); then + echo "❌ No models found in $MODEL_DIR" >&2 + exit 1 +fi + +echo "Found ${#MODELS[@]} models to test with llama-cli (3 runs each)" + +# 2) Define environments and llama-cli prefix +declare -A ENVS=( + [rocm6_4_2]="toolbox run -c llama-rocm-6.4.2 -- llama-cli" + [rocm7_beta]="toolbox run -c llama-rocm-7beta -- llama-cli" + [rocm7_rc]="toolbox run -c llama-rocm-7rc -- llama-cli" + [vulkan_amdvlk]="toolbox run -c llama-vulkan-amdvlk -- llama-cli" + [vulkan_radv]="toolbox run -c llama-vulkan-radv -- llama-cli" +) + +# Prompt and flags +PROMPT="Hello" +BASE_FLAGS=( -ngl 999 -fa --no-mmap -no-cnv -n 1 ) +REPEATS=3 + +# 3) Loop models/envs +for MODEL_PATH in "${MODELS[@]}"; do + MODEL_NAME="$(basename "${MODEL_PATH%.gguf}")" + + for ENV in "${!ENVS[@]}"; do + # Prepare output file (removed once per model/env so the per-run appends below start from an empty log) + OUTFILE="$RESULTDIR/${MODEL_NAME}__${ENV}.log" + rm -f "$OUTFILE" + + # Build command prefix array + IFS=' ' read -r -a PREFIX_CMD <<< "${ENVS[$ENV]}" + FLAG_ARRAY=( "${BASE_FLAGS[@]}" ) + + echo + echo "▶ [$ENV] $MODEL_NAME (runs: $REPEATS)" + echo " → log : $OUTFILE" + echo " → flags : ${FLAG_ARRAY[*]}" + + sum=0 + success=0 + + for i in $(seq 1 $REPEATS); do + echo " Run #$i..." >>"$OUTFILE" + start=$(date +%s.%N) + # Run llama-cli; append its output to the log (no tee) - '>' here would truncate the log and erase the earlier runs + "${PREFIX_CMD[@]}" "${FLAG_ARRAY[@]}" -m "$MODEL_PATH" -p "$PROMPT" >>"$OUTFILE" 2>&1 + status=$?
+ end=$(date +%s.%N) + elapsed=$(echo "$end - $start" | bc) + echo " Elapsed #$i: ${elapsed}s" >>"$OUTFILE" + echo " Run #$i status: $status" >>"$OUTFILE" + + if [ $status -eq 0 ]; then + sum=$(echo "$sum + $elapsed" | bc) + ((success++)) + else + echo " ✖ run #$i failed" >>"$OUTFILE" + fi + done + + if [ $success -gt 0 ]; then + avg=$(echo "scale=3; $sum / $success" | bc) + echo " → Avg over $success runs: ${avg}s" >>"$OUTFILE" + echo "✔ [$ENV] $MODEL_NAME avg=${avg}s over $success runs" + else + echo " → No successful runs" >>"$OUTFILE" + echo "✖ [$ENV] $MODEL_NAME all runs failed" + fi + done +done + diff --git a/benchmark/temp.py b/benchmark/temp.py new file mode 100644 index 0000000..aa3eba7 --- /dev/null +++ b/benchmark/temp.py @@ -0,0 +1,147 @@ +#!/usr/bin/env python3 +""" +Script to remove host-related entries from log files and delete host files. +""" + +import os +import glob +import shutil +from pathlib import Path + + +def remove_host_entries_from_log(log_file): + """ + Remove all entries that start with '[host]' from the log file. + Each entry is separated by empty lines. 
+ """ + if not os.path.exists(log_file): + print(f"Log file {log_file} not found!") + return False + + # Create backup + backup_file = f"{log_file}.backup" + shutil.copy2(log_file, backup_file) + print(f"Created backup: {backup_file}") + + with open(log_file, 'r', encoding='utf-8') as f: + lines = f.readlines() + + filtered_lines = [] + i = 0 + + while i < len(lines): + line = lines[i].strip() + + # Check if this line starts a host entry + if line.startswith('▶ [host]'): + # Skip this entry by finding the next empty line or end of file + i += 1 + while i < len(lines) and lines[i].strip() != '': + i += 1 + # Skip the empty line too if we found one + if i < len(lines) and lines[i].strip() == '': + i += 1 + else: + # Keep this line + filtered_lines.append(lines[i]) + i += 1 + + # Write the filtered content back + with open(log_file, 'w', encoding='utf-8') as f: + f.writelines(filtered_lines) + + print(f"Removed host entries from {log_file}") + return True + + +def remove_host_files(): + """Remove all files with 'host' in their filename.""" + host_files = glob.glob('*host*') + + if not host_files: + print("No files with 'host' in filename found.") + return + + print("Files to be removed:") + for file in host_files: + print(f" - {file}") + + for file in host_files: + try: + os.remove(file) + print(f"Removed: {file}") + except OSError as e: + print(f"Error removing {file}: {e}") + + +def preview_host_entries(log_file): + """Preview what host entries would be removed.""" + if not os.path.exists(log_file): + print(f"Log file {log_file} not found!") + return + + with open(log_file, 'r', encoding='utf-8') as f: + lines = f.readlines() + + print("Host entries that would be removed:") + print("-" * 50) + + i = 0 + entry_count = 0 + + while i < len(lines): + line = lines[i].strip() + + if line.startswith('▶ [host]'): + entry_count += 1 + print(f"Entry {entry_count}:") + + # Print this entry until we hit an empty line + while i < len(lines) and lines[i].strip() != '': + 
print(lines[i].rstrip()) + i += 1 + print() # Add empty line after entry + else: + i += 1 + + print(f"Total host entries found: {entry_count}") + + +def main(): + log_file = "run_benchmarks.log" # Change this to your actual log file name + + print("Host Entry and File Removal Script") + print("=" * 40) + + # Preview what would be removed + preview_host_entries(log_file) + + # Show files that would be removed + host_files = glob.glob('*host*') + if host_files: + print(f"\nFiles with 'host' in filename ({len(host_files)} found):") + for file in host_files: + print(f" - {file}") + + print("\nThis script will:") + print(f"1. Remove host entries from log file: {log_file}") + print("2. Remove all files with 'host' in the filename") + + response = input("\nContinue? (y/N): ").strip().lower() + + if response == 'y' or response == 'yes': + # Remove host entries from log + if remove_host_entries_from_log(log_file): + print("✓ Host entries removed from log file") + + # Remove host files + remove_host_files() + print("✓ Host files removed") + + print("\nDone!") + else: + print("Aborted.") + + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/Dockerfile.rocm-6.4.2 b/docker-toolboxes/Dockerfile.rocm-6.4.2 similarity index 100% rename from Dockerfile.rocm-6.4.2 rename to docker-toolboxes/Dockerfile.rocm-6.4.2 diff --git a/Dockerfile.rocm-7beta b/docker-toolboxes/Dockerfile.rocm-7beta similarity index 100% rename from Dockerfile.rocm-7beta rename to docker-toolboxes/Dockerfile.rocm-7beta diff --git a/docker-toolboxes/Dockerfile.rocm-7rc b/docker-toolboxes/Dockerfile.rocm-7rc new file mode 100644 index 0000000..c15310b --- /dev/null +++ b/docker-toolboxes/Dockerfile.rocm-7rc @@ -0,0 +1,79 @@ +FROM fedora:rawhide + +# 1) Install dependencies +RUN dnf install -y \ + make gcc cmake lld clang clang-devel compiler-rt libcurl-devel \ + radeontop git vim patch curl \ + && dnf clean all + +# 2) Download ROCm nightly tarball +WORKDIR /tmp +RUN curl -L -o 
therock.tar.gz \ + https://github.com/ROCm/TheRock/releases/download/nightly-tarball/therock-dist-linux-gfx1151-7.0.0rc20250714.tar.gz + +# 3) Extract into /opt/rocm-7.0 +RUN mkdir -p /opt/rocm-7.0 \ + && tar xvf therock.tar.gz -C /opt/rocm-7.0 --strip-components=1 + +# 4) Bake in ROCm env + full system PATH +ENV ROCM_PATH=/opt/rocm-7.0 \ + HIP_PLATFORM=amd \ + HIP_PATH=/opt/rocm-7.0 \ + HIP_CLANG_PATH=/opt/rocm-7.0/llvm/bin \ + HIP_INCLUDE_PATH=/opt/rocm-7.0/include \ + HIP_LIB_PATH=/opt/rocm-7.0/lib \ + HIP_DEVICE_LIB_PATH=/opt/rocm-7.0/lib/llvm/amdgcn/bitcode \ + PATH=/opt/rocm-7.0/bin:/opt/rocm-7.0/llvm/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin \ + LD_LIBRARY_PATH=/opt/rocm-7.0/lib:/opt/rocm-7.0/lib64:/opt/rocm-7.0/llvm/lib \ + LIBRARY_PATH=/opt/rocm-7.0/lib:/opt/rocm-7.0/lib64 \ + CPATH=/opt/rocm-7.0/include \ + PKG_CONFIG_PATH=/opt/rocm-7.0/lib/pkgconfig + +# 5) (Optional) profile.d snippet for login & interactive shells +RUN tee /etc/profile.d/rocm.sh << 'EOF' +export ROCM_PATH=/opt/rocm-7.0 +export HIP_PLATFORM=amd +export HIP_PATH=/opt/rocm-7.0 +export HIP_CLANG_PATH=/opt/rocm-7.0/llvm/bin +export HIP_INCLUDE_PATH=/opt/rocm-7.0/include +export HIP_LIB_PATH=/opt/rocm-7.0/lib +export HIP_DEVICE_LIB_PATH=/opt/rocm-7.0/lib/llvm/amdgcn/bitcode + +export PATH="$ROCM_PATH/bin:$HIP_CLANG_PATH:$PATH" +export LD_LIBRARY_PATH="$HIP_LIB_PATH:$ROCM_PATH/lib:$ROCM_PATH/lib64:$ROCM_PATH/llvm/lib" +export LIBRARY_PATH="$HIP_LIB_PATH:$ROCM_PATH/lib:$ROCM_PATH/lib64" +export CPATH="$HIP_INCLUDE_PATH" +export PKG_CONFIG_PATH="$ROCM_PATH/lib/pkgconfig" +EOF +RUN chmod +x /etc/profile.d/rocm.sh \ + && echo 'source /etc/profile.d/rocm.sh' >> /etc/bashrc + +# 6) Clone llama.cpp +WORKDIR /opt/llama.cpp +RUN git clone --recursive https://github.com/ggerganov/llama.cpp.git . 
\ + && git clean -xdf \ + && git submodule update --recursive + +# 7) Copy in your external patch and apply +COPY hip-rocm7rc.patch /opt/llama.cpp/hip-rocm7rc.patch +RUN patch -p1 < hip-rocm7rc.patch + +# 8) Configure, build & install llama.cpp with HIP +RUN cmake -S . -B build \ + -DGGML_HIP=ON \ + -DAMDGPU_TARGETS=gfx1151 \ + -DCMAKE_BUILD_TYPE=Release \ + -DLLAMA_HIP_UMA=ON \ + && cmake --build build --config Release -- -j$(nproc) \ + && cmake --install build --config Release + +# 9) Copy the .so from build/bin into /usr/lib64 so ldconfig can see it +RUN find /opt/llama.cpp/build -type f -name 'lib*.so*' -exec cp {} /usr/lib64/ \; \ + && ldconfig + +# 10) Install helper script +COPY gguf-vram-estimator.py /usr/local/bin/ +RUN chmod +x /usr/local/bin/gguf-vram-estimator.py + +# 11) Default to interactive bash +CMD ["/bin/bash"] diff --git a/docker-toolboxes/Dockerfile.vulkan-amdvlk b/docker-toolboxes/Dockerfile.vulkan-amdvlk new file mode 100644 index 0000000..79c8a10 --- /dev/null +++ b/docker-toolboxes/Dockerfile.vulkan-amdvlk @@ -0,0 +1,38 @@ +FROM fedora:rawhide + +# Install build tools, Vulkan headers/loader, and glslc +RUN dnf install -y \ + git vim \ + make gcc cmake ninja-build lld clang clang-devel compiler-rt libcurl-devel \ + vulkan-loader-devel mesa-vulkan-drivers \ + radeontop glslc wget \ + && dnf clean all + +# Get AMDVLK drivers +RUN curl -L -o /tmp/amdvlk-2025.Q2.1.x86_64.rpm \ + https://github.com/GPUOpen-Drivers/AMDVLK/releases/download/v-2025.Q2.1/amdvlk-2025.Q2.1.x86_64.rpm +RUN dnf install -y /tmp/amdvlk-*.rpm + +WORKDIR /opt/llama.cpp + +# Clone llama.cpp +RUN git clone --recursive https://github.com/ggerganov/llama.cpp.git . + +# Build with Vulkan support +RUN git clean -xdf \ + && git pull \ + && git submodule update --recursive \ + && cmake -S . 
-B build -G Ninja \ + -DGGML_VULKAN=ON \ + -DCMAKE_BUILD_TYPE=Release \ + -DCMAKE_INSTALL_PREFIX=/usr \ + -DLLAMA_BUILD_TESTS=OFF \ + -DLLAMA_BUILD_EXAMPLES=ON \ + -DLLAMA_BUILD_SERVER=ON \ + && cmake --build build --config Release \ + && cmake --install build --config Release + +COPY gguf-vram-estimator.py /usr/local/bin/gguf-vram-estimator.py +RUN chmod +x /usr/local/bin/gguf-vram-estimator.py + +CMD ["/bin/bash"] \ No newline at end of file diff --git a/Dockerfile.vulkan b/docker-toolboxes/Dockerfile.vulkan-radv similarity index 100% rename from Dockerfile.vulkan rename to docker-toolboxes/Dockerfile.vulkan-radv diff --git a/gguf-vram-estimator.py b/docker-toolboxes/gguf-vram-estimator.py similarity index 100% rename from gguf-vram-estimator.py rename to docker-toolboxes/gguf-vram-estimator.py diff --git a/docker-toolboxes/hip-rocm7rc.patch b/docker-toolboxes/hip-rocm7rc.patch new file mode 100644 index 0000000..ac2204f --- /dev/null +++ b/docker-toolboxes/hip-rocm7rc.patch @@ -0,0 +1,28 @@ +diff --git a/ggml/src/ggml-cuda/vendors/hip.h b/ggml/src/ggml-cuda/vendors/hip.h +index 8b172e60..b813f523 100644 +--- a/ggml/src/ggml-cuda/vendors/hip.h ++++ b/ggml/src/ggml-cuda/vendors/hip.h +@@ -137,19 +137,11 @@ + #define CUBLAS_STATUS_INTERNAL_ERROR HIPBLAS_STATUS_INTERNAL_ERROR + #define CUBLAS_STATUS_NOT_SUPPORTED HIPBLAS_STATUS_NOT_SUPPORTED + +-#if HIP_VERSION >= 70000000 +-#define CUBLAS_COMPUTE_16F HIPBLAS_COMPUTE_16F +-#define CUBLAS_COMPUTE_32F HIPBLAS_COMPUTE_32F ++#define CUBLAS_COMPUTE_16F HIPBLAS_COMPUTE_16F ++#define CUBLAS_COMPUTE_32F HIPBLAS_COMPUTE_32F + #define CUBLAS_COMPUTE_32F_FAST_16F HIPBLAS_COMPUTE_32F_FAST_16F +-#define cublasComputeType_t hipblasComputeType_t +-#define cudaDataType_t hipDataType +-#else +-#define CUBLAS_COMPUTE_16F HIPBLAS_R_16F +-#define CUBLAS_COMPUTE_32F HIPBLAS_R_32F +-#define CUBLAS_COMPUTE_32F_FAST_16F HIPBLAS_R_32F +-#define cublasComputeType_t hipblasDatatype_t +-#define cudaDataType_t hipblasDatatype_t +-#endif // 
HIP_VERSION >= 7000000 ++#define cublasComputeType_t hipblasComputeType_t ++#define cudaDataType_t hipDataType + + #if !defined(__HIP_PLATFORM_AMD__) + #error "The HIP backend supports only AMD targets" diff --git a/docs/benchmarks.md b/docs/benchmarks.md new file mode 100644 index 0000000..8679479 --- /dev/null +++ b/docs/benchmarks.md @@ -0,0 +1,138 @@ +# 1. Benchmark Results: Strix Halo Llama.cpp Toolboxes + +This document presents comprehensive benchmarks of all supported Llama.cpp containers and backends, focusing on real GPU workloads and model loading times on the AMD Ryzen AI Max 395 "Strix Halo" iGPU. + +## 2. Benchmark Methodology + +Benchmarks cover both end-to-end performance (prompt processing and text generation) and model load times. Model load time benchmarks (llama-cli) are averaged over three runs per environment; inference benchmarks (llama-bench) use default tool settings. + +Backends tested: + +* **Vulkan RADV** (open source Vulkan driver) +* **Vulkan AMDVLK** (official AMD open Vulkan driver) +* **ROCm 6.4.2** (AMD's compute stack) +* **ROCm 7.0 beta** (AMD's compute stack) +* **ROCm 7.0 rc** (AMD's compute stack) + +### 2.1. Llama.cpp Inference Benchmarks + +#### 2.1.1. Script: `run_benchmarks.sh` + +This script runs each model through every container/backend using the `llama-bench` tool. + +##### Command Used + +```bash +llama-bench -ngl 99 -mmp 0 -m /path/to/model.gguf +``` + +* `-ngl 99` — Use all available GPU layers +* `-mmp 0` — Disable mmap (required for ROCm to avoid extremely slow loads for models >64GB, and also improves speed for Vulkan drivers) +* `-m` — Path to the GGUF model file + +Script location: `benchmark/run_benchmarks.sh` +Benchmark logs: `benchmark/results/` + +##### Model Location + +All scripts expect models in the `models/` directory (absolute path is recommended). For sharded models, the first shard must be present and named according to the GGUF naming convention (`*-00001-of-00002.gguf`). 
+ +### Prompt Processing (pp512) — tokens/second + +| Model | Vulkan Radv | Vulkan Amdvlk | Rocm6 4 2 | Rocm7 Beta | Rocm7 Rc | Winner | +|---|---|---|---|---|---|---| +| **gemma-3-12b-it-UD-Q8_K_XL** | 508.55 ± 0.90 | 683.07 ± 1.03 | 223.36 ± 0.23 | 222.95 ± 0.15 | 222.99 ± 0.24 | 🏆 **vulkan_amdvlk** (+34%) | +| **gemma-3-27b-it-BF16** | 135.40 ± 0.29 | ⚠️ Load Error | 88.73 ± 0.50 | 82.31 ± 0.29 | 83.18 ± 0.41 | 🏆 **vulkan_radv** (+53%) | +| **Kimi-Dev-72B-UD-Q8_K_XL** | 76.48 ± 0.23 | ⚠️ Load Error | ⚠️ GPU Hang | ⚠️ GPU Hang | ⚠️ Runtime Error | 🏆 **vulkan_radv** | +| **Llama-3.3-70B-Instruct-UD-Q8_K_XL** | 79.71 ± 0.13 | 96.23 ± 0.16 | 33.17 ± 0.07 | ⚠️ GPU Hang | ⚠️ Runtime Error | 🏆 **vulkan_amdvlk** (+21%) | +| **Llama-4-Scout-17B-16E-Instruct-Q6_K** | 137.97 ± 0.99 | 243.19 ± 1.20 | 121.52 ± 0.98 | ⚠️ GPU Hang | 135.36 ± 0.39 | 🏆 **vulkan_amdvlk** (+76%) | +| **Llama-4-Scout-17B-16E-Instruct-Q8_0** | 145.86 ± 2.44 | 238.93 ± 2.89 | ⚠️ GPU Hang | ⚠️ GPU Hang | ⚠️ Runtime Error | 🏆 **vulkan_amdvlk** (+64%) | +| **Llama-4-Scout-17B-16E-Instruct-UD-Q4_K_XL** | 133.49 ± 1.83 | 208.84 ± 1.35 | 132.66 ± 0.56 | 133.71 ± 0.64 | ⚠️ Runtime Error | 🏆 **vulkan_amdvlk** (+56%) | +| **llama3.3-70.6B-Q4_K_M** | 79.12 ± 0.14 | 72.75 ± 0.03 | 33.89 ± 0.03 | 33.91 ± 0.04 | 33.82 ± 0.05 | 🏆 **vulkan_radv** (+9%) | +| **Qwen3-235B-A22B-Instruct-2507-UD-Q3_K_XL** | 58.40 ± 0.21 | 99.94 ± 0.91 | 69.48 ± 0.09 | ⚠️ GPU Hang | 74.69 ± 0.17 | 🏆 **vulkan_amdvlk** (+34%) | +| **Qwen3-30B-A3B-BF16** | 71.16 ± 0.92 | 90.91 ± 0.35 | 157.74 ± 2.65 | 151.25 ± 3.33 | 154.95 ± 1.58 | 🏆 **rocm6_4_2** (+2%) | +| **Qwen3-Coder-30B-A3B-Instruct-BF16** | 71.53 ± 1.06 | 90.38 ± 0.57 | 150.53 ± 1.83 | 147.31 ± 2.22 | 144.59 ± 3.08 | 🏆 **rocm6_4_2** (+2%) | + +### Text Generation (tg128) — tokens/second + +| Model | Vulkan Radv | Vulkan Amdvlk | Rocm6 4 2 | Rocm7 Beta | Rocm7 Rc | Winner | +|---|---|---|---|---|---|---| +| **gemma-3-12b-it-UD-Q8_K_XL** | 13.65 ± 0.02 | 13.84 ± 0.02 | 13.81 ± 0.00 | 
13.80 ± 0.00 | 13.81 ± 0.00 | 🏆 **vulkan_amdvlk** (+0%) | +| **gemma-3-27b-it-BF16** | 3.98 ± 0.00 | ⚠️ Load Error | 4.02 ± 0.00 | 3.99 ± 0.01 | 3.99 ± 0.00 | 🏆 **rocm6_4_2** (+1%) | +| **Kimi-Dev-72B-UD-Q8_K_XL** | 2.65 ± 0.00 | ⚠️ Load Error | ⚠️ GPU Hang | ⚠️ GPU Hang | ⚠️ Runtime Error | 🏆 **vulkan_radv** | +| **Llama-3.3-70B-Instruct-UD-Q8_K_XL** | 2.72 ± 0.00 | 2.72 ± 0.00 | 2.72 ± 0.00 | ⚠️ GPU Hang | ⚠️ Runtime Error | 🏆 **rocm6_4_2** (+0%) | +| **Llama-4-Scout-17B-16E-Instruct-Q6_K** | 15.07 ± 0.05 | 15.28 ± 0.03 | 14.28 ± 0.00 | ⚠️ GPU Hang | 14.29 ± 0.00 | 🏆 **vulkan_amdvlk** (+1%) | +| **Llama-4-Scout-17B-16E-Instruct-Q8_0** | 12.27 ± 0.00 | 12.25 ± 0.01 | ⚠️ GPU Hang | ⚠️ GPU Hang | ⚠️ Runtime Error | 🏆 **vulkan_radv** (+0%) | +| **Llama-4-Scout-17B-16E-Instruct-UD-Q4_K_XL** | 19.99 ± 0.01 | 20.06 ± 0.01 | 17.29 ± 0.00 | 17.35 ± 0.00 | ⚠️ Runtime Error | 🏆 **vulkan_amdvlk** (+0%) | +| **llama3.3-70.6B-Q4_K_M** | 4.97 ± 0.00 | 5.01 ± 0.00 | 4.59 ± 0.00 | 4.60 ± 0.00 | 4.52 ± 0.00 | 🏆 **vulkan_amdvlk** (+1%) | +| **Qwen3-235B-A22B-Instruct-2507-UD-Q3_K_XL** | 16.29 ± 0.01 | 15.72 ± 0.01 | 13.54 ± 0.01 | ⚠️ GPU Hang | 13.56 ± 0.00 | 🏆 **vulkan_radv** (+4%) | +| **Qwen3-30B-A3B-BF16** | 7.33 ± 0.00 | 7.96 ± 0.03 | 22.88 ± 0.01 | 23.80 ± 0.09 | 23.08 ± 0.08 | 🏆 **rocm7_beta** (+3%) | +| **Qwen3-Coder-30B-A3B-Instruct-BF16** | 7.34 ± 0.01 | 8.00 ± 0.03 | 22.13 ± 0.00 | 24.12 ± 0.06 | 23.48 ± 0.01 | 🏆 **rocm7_beta** (+3%) | + +##### Error Legend + +* `⚠️ Load Error` — Model failed to load in this environment (usually OOM or driver error) +* `⚠️ GPU Hang` — GPU hung during inference (may work outside stress test) +* `⚠️ Runtime Error` — Miscellaneous runtime failure (check logs) + +### 2.2. Model Loading Time Benchmarks + +#### 2.2.1. Script: `run_loadtime_benchmark.sh` + +This script benchmarks **model load + single-token inference** (using `llama-cli`) for every backend, using a minimal prompt. Three runs per combination are averaged. 
+ +##### Command Used + +```bash +llama-cli -ngl 999 -fa --no-mmap -no-cnv -n 1 -m /path/to/model.gguf -p "Hello" +``` + +* `-ngl 999` — Use all available GPU layers +* `-fa` — Enable Flash Attention (default for most GPU builds) +* `--no-mmap` — Disable mmap (ensures all RAM usage is counted) +* `-no-cnv` — Disable conversation (interactive chat) mode, so `llama-cli` exits after generating instead of waiting for input +* `-n 1` — Generate only one token (measures load + first inference) +* `-m` — Path to GGUF model +* `-p` — Prompt text ("Hello") + +Script location: `benchmark/run_loadtime_benchmark.sh` +Logs: `benchmark/loadtime_results/` + +#### 2.2.2. Results: Model Load + First Token (Seconds, Lower is Better) + +| Model | Vulkan Radv | Vulkan Amdvlk | Rocm6 4 2 | Rocm7 Beta | Rocm7 Rc | Fastest | +|---|---|---|---|---|---|---| +| **gemma-3-12b-it-UD-Q8_K_XL** | 4.29s | 3.96s | 6.69s | 3.43s | 3.86s | 🏆 **rocm7_beta** | +| **gemma-3-27b-it-BF16-00001-of-00002** | 13.58s | ⚠️ Fail | 12.49s | 10.49s | 10.42s | 🏆 **rocm7_rc** | +| **Kimi-Dev-72B-UD-Q8_K_XL-00001-of-00002** | 30.59s | ⚠️ Fail | 35.30s | 30.02s | 26.36s | 🏆 **rocm7_rc** | +| **Llama-3.3-70B-Instruct-UD-Q8_K_XL-00001-of-00002** | 30.38s | 30.60s | 31.00s | 32.80s | 32.91s | 🏆 **vulkan_radv** | +| **Llama-4-Scout-17B-16E-Instruct-Q6_K-00001-of-00002** | 32.81s | 35.54s | 31.79s | 28.22s | 28.43s | 🏆 **rocm7_beta** | +| **Llama-4-Scout-17B-16E-Instruct-Q8_0-00001-of-00003** | 41.63s | 47.97s | 40.74s | 36.40s | 35.74s | 🏆 **rocm7_rc** | +| **Llama-4-Scout-17B-16E-Instruct-UD-Q4_K_XL-00001-of-00002** | 20.05s | 16.75s | 15.78s | ⚠️ Fail | 19.36s | 🏆 **rocm6_4_2** | +| **llama3.3-70.6B-Q4_K_M** | 8.82s | 9.18s | 9.89s | 9.34s | 14.60s | 🏆 **vulkan_radv** | +| **Qwen3-235B-A22B-Instruct-2507-UD-Q3_K_XL-00001-of-00003** | 40.72s | 44.88s | 39.06s | 35.39s | 33.46s | 🏆 **rocm7_rc** | +| **Qwen3-30B-A3B-BF16-00001-of-00002** | 14.76s | 12.94s | 22.17s | 15.93s | 22.67s | 🏆 **vulkan_amdvlk** | +| **Qwen3-Coder-30B-A3B-Instruct-BF16-00001-of-00002** | 14.02s | 12.94s | 
17.78s | 14.39s | 16.16s | 🏆 **vulkan_amdvlk** | + +##### Error Legend + +* `⚠️ Fail` — Model failed to load (OOM or crash). May succeed if not under stress/test conditions. + +--- + +## 3. Interpreting the Results & Caveats + +* **Vulkan AMDVLK** generally gives the best performance for small/medium models, but ROCm 7.x improves as model size increases. +* **Vulkan RADV** is highly reliable and competitive on large models (esp. if AMDVLK fails to load). +* **ROCm** (especially 7.0 RC) delivers the fastest load times for the largest models. +* Many models that fail under `llama-bench` (e.g., due to GPU hangs or OOM) can sometimes still run interactively (especially outside a stress-test context). + +## 4. How to Reproduce These Benchmarks + +* Place all GGUF models in your `models/` directory. +* Use the scripts from the `benchmark/` folder: + + * `run_benchmarks.sh` for inference throughput + * `run_loadtime_benchmark.sh` for loading times +* Output logs and tables will be written in `benchmark/results/` and `benchmark/loadtime_results/`. + diff --git a/docs/building.md b/docs/building.md new file mode 100644 index 0000000..e699907 --- /dev/null +++ b/docs/building.md @@ -0,0 +1,75 @@ + +# Building Containers Locally + +If you want to build or customize the toolbox containers yourself (rather than using the pre-built Docker Hub images), this guide explains the process. Local builds are useful if you want to: + +* Use a patched or forked version of llama.cpp +* Add additional tools or libraries +* Change the Fedora base image (Rawhide vs. stable) +* Audit every installed dependency + +--- + +## 1. Prerequisites + +* **Podman** (recommended on Fedora) or **Docker** (also fine) + +--- + +## 2. Build an Image + +Each backend has its own Dockerfile (`Dockerfile.<backend>`) in the `docker-toolboxes/` directory. 
+ +**Example: Build the Vulkan RADV toolbox image** + +```sh +cd docker-toolboxes +podman build -t llama-vulkan-radv -f Dockerfile.vulkan-radv . +``` + +**Example: Build the ROCm 6.4.2 toolbox image** + +```sh +cd docker-toolboxes +podman build -t llama-rocm-6.4.2 -f Dockerfile.rocm-6.4.2 . +``` + +> You can use `docker build` with the same arguments if you prefer Docker. + +--- + +## 3. Customizing the Build + +* **llama.cpp version**: Change the `git clone` or `git checkout` line in the Dockerfile. +* **Extra dependencies**: Add them to the Dockerfile as needed. +* **Other customizations**: Install tools, patch scripts, or swap to a different base image. + +--- + +## 4. Using the Custom Image with Toolbx + +Create a new toolbox using your freshly built image: + +```sh +toolbox create llama-vulkan-radv --image localhost/llama-vulkan-radv \ + -- --device /dev/dri --group-add video --security-opt seccomp=unconfined +``` + +Replace the backend/image name and device/group options as needed (see main README Section 2.1). + +--- + +## 5. Troubleshooting + +* **Build fails (ROCm images especially):** Try building with more memory or swap. +* **Toolbox can't access GPU:** Make sure you pass the correct device/group options. + +--- + +## 6. References + +* [Fedora Toolbox Documentation](https://docs.fedoraproject.org/en-US/fedora-silverblue/toolbox/) +* [Podman Build Reference](https://docs.podman.io/en/latest/markdown/podman-build.1.html) +* [Docker Build Reference](https://docs.docker.com/engine/reference/commandline/build/) + + diff --git a/docs/vram-estimator.md b/docs/vram-estimator.md new file mode 100644 index 0000000..9612b66 --- /dev/null +++ b/docs/vram-estimator.md @@ -0,0 +1,89 @@ +--- + +## docs/vram-estimator.md + +--- + +# 1. Memory Planning with `gguf-vram-estimator.py` + +Estimating memory requirements is critical when running large models on Strix Halo (or any GPU with limited RAM). It's not enough to check just the model file size: context length and runtime overheads matter. 
+ +This repo provides a tool, **`gguf-vram-estimator.py`**, which reads a `.gguf` model and prints the estimated VRAM needed for different context sizes. + +**Why?** + +* Helps decide what fits on 32GB, 64GB, 128GB, etc—especially with multi-shard models or large quantized files. + +--- + +## 2. Usage + +Make sure you have the estimator script (in `docker-toolboxes/`): + +```sh +python3 docker-toolboxes/gguf-vram-estimator.py /path/to/model.gguf --contexts 4096 32768 +``` + +* Supply one or more context lengths to get the corresponding VRAM footprint. +* Handles multi-shard and single-shard models. + +--- + +## 3. Examples + +### 3.1 Llama-4-Scout 17B Q4\_K\_XL, up to 1M tokens + +``` +$ python3 docker-toolboxes/gguf-vram-estimator.py models/llama-4-scout-17b-16e/Q4_K_XL/Llama-4-Scout-17B-16E-Instruct-UD-Q4_K_XL-00001-of-00002.gguf --contexts 4096 32768 1048576 + +--- Model 'Llama-4-Scout-17B-16E-Instruct' --- +Max Context: 10,485,760 tokens +Model Size: 57.74 GiB +Incl. Overhead: 2.00 GiB + +--- Memory Footprint Estimation --- + Context Size | Context Memory | Est. Total VRAM +--------------------------------------------------- + 4,096 | 1.88 GiB | 61.62 GiB + 32,768 | 15.06 GiB | 74.80 GiB + 1,048,576 | 49.12 GiB | 108.87 GiB +``` + +* **Takeaway:** + + * Q4\_K quantization allows for a huge context in 128GB, but *processing 1M tokens will be extremely slow* (see benchmark: 200 tokens/sec prompt processing ⇒ almost 1.5 hours for a full 1M context fill). + +--- + +### 3.2 Qwen3-235B Q3\_K XL, high context + +``` +$ python3 docker-toolboxes/gguf-vram-estimator.py models/qwen3-235B-Q3_K-XL/UD-Q3_K_XL/Qwen3-235B-A22B-Instruct-2507-UD-Q3_K_XL-00001-of-00003.gguf --contexts 65536 131072 262144 + +--- Memory Footprint Estimation --- + Context Size | Context Memory | Est. Total VRAM +--------------------------------------------------- + 65,536 | 11.75 GiB | 110.75 GiB + 131,072 | 23.50 GiB | 122.50 GiB + 262,144 | 47.00 GiB | 146.00 GiB +``` + +* **Takeaway:** + + * With 128GB, you can go up to \~130k context on this Qwen 235B quantized model. 
+ * If you go higher, you will run out of memory (OOM)—even before context reaches the model's max. + +--- + +## 4. Notes + +* “Est. Total VRAM” is the minimum you’ll need for the model + context, but does not include OS, other processes, or toolbox/container overhead—leave a margin. +* For detailed methodology or custom scenarios, check the script source. +* Benchmark speed for large context sizes is often the real bottleneck—see `docs/benchmarks.md` for real throughput figures. + +--- + +## 5. Related + +* Main README section [Memory Planning & VRAM Estimator](../README.md#4--memory-planning--vram-estimator) +* [docs/benchmarks.md](benchmarks.md) for full speed/compat charts