diff --git a/README.md b/README.md
index c166fe0..d1fea38 100644
--- a/README.md
+++ b/README.md
@@ -115,6 +115,12 @@ llama-server -m models/qwen3-coder-30B-A3B/BF16/Qwen3-Coder-30B-A3B-Instruct-BF1
 -c 8192 -ngl 999 -fa 1 --no-mmap
 ```
 
+**Router Mode:**
+> Uses [`models.ini`](docs/models.ini.example) preset configuration for multi-model routing.
+```sh
+llama-server --models-preset models.ini --host 0.0.0.0 --port 8080 --models-max 1 --parallel 1
+```
+
 **CLI Mode:**
 ```sh
 llama-cli --no-mmap -ngl 999 -fa 1 \
diff --git a/docs/models.ini.example b/docs/models.ini.example
new file mode 100644
index 0000000..a7ba2ed
--- /dev/null
+++ b/docs/models.ini.example
@@ -0,0 +1,92 @@
+version = 1
+
+[*]
+threads = 12
+flash-attn = on
+mlock = off
+mmap = off
+fit = off
+warmup = off
+batch-size = 4096
+ubatch-size = 512
+cache-type-k = q8_0
+cache-type-v = q8_0
+jinja = true
+direct-io = on
+cache-prompt = true
+cache-reuse = 256
+cache-ram = 32768
+
+# --- MODELS ---
+
+[unsloth-Qwen3-Coder-Next]
+model = /path/to/models/unsloth-Qwen3-Coder-Next/UD-Q6_K_XL/Qwen3-Coder-Next-UD-Q6_K_XL-00001-of-00003.gguf
+n-gpu-layers = 999
+ctx-size = 128000
+temp = 1.0
+top-p = 0.95
+top-k = 40
+min-p = 0.01
+
+[unsloth-GLM-4.7-Flash]
+model = /path/to/models/unsloth-GLM-4.7-Flash-GGUF/GLM-4.7-Flash-UD-Q6_K_XL.gguf
+n-gpu-layers = 999
+ctx-size = 128000
+temp = 0.7
+top-p = 1.0
+min-p = 0.01
+
+[gpt-oss-120b]
+model = /path/to/models/gpt-oss-120/ud-q8_k_xl/gpt-oss-120b-UD-Q8_K_XL-00001-of-00002.gguf
+alias = gpt-120b
+n-gpu-layers = 999
+ctx-size = 65536
+temp = 0.8
+min-p = 0.05
+chat-template-kwargs = {"reasoning_effort": "high"}
+
+[llama-4-scout]
+model = /path/to/models/llama4-scout-17b-16e/unsloth/q4_k_xl/Llama-4-Scout-17B-16E-Instruct-UD-Q4_K_XL-00001-of-00002.gguf
+mmproj = /path/to/models/llama4-scout-17b-16e/unsloth/q4_k_xl/unsloth-Llama-4-Scout-17B-16E-Instruct-GGUF-mmproj-BF16.gguf
+alias = llama-4
+n-gpu-layers = 999
+ctx-size = 65536
+
+[unsloth-MiniMax-M2.5]
+model = /path/to/models/unsloth-MiniMax-M2.5-GGUF/UD-IQ3_XXS/MiniMax-M2.5-UD-IQ3_XXS-00001-of-00003.gguf
+n-gpu-layers = 999
+ctx-size = 128000
+temp = 1.0
+top-p = 0.95
+top-k = 40
+min-p = 0.01
+
+[unsloth-Qwen3.5-397B-A17B]
+model = /path/to/models/unsloth-Qwen3.5-397B-A17B/Qwen3.5-397B-A17B-UD-TQ1_0.gguf
+mmproj = /path/to/models/unsloth-Qwen3.5-397B-A17B/mmproj-BF16.gguf
+alias = qwen3.5
+n-gpu-layers = 999
+ctx-size = 128000
+temp = 0.6
+top-p = 0.95
+top-k = 20
+min-p = 0.0
+cache-type-k = q4_0
+cache-type-v = q4_0
+chat-template-kwargs = {"enable_thinking": true}
+ubatch-size = 256
+cache-reuse = 0
+
+[unsloth-Qwen3.5-122B-A10B]
+model = /path/to/models/unsloth-Qwen3.5-122B-A10B-GGUF/Qwen3.5-122B-A10B-MXFP4_MOE-00001-of-00003.gguf
+alias = qwen3.5-122b
+n-gpu-layers = 999
+ctx-size = 128000
+temp = 1.0
+top-p = 0.95
+top-k = 20
+min-p = 0.0
+ubatch-size = 2048
+cache-type-k = q4_0
+cache-type-v = q4_0
+cache-reuse = 0
\ No newline at end of file