Updated benchmarks, removed old toolboxes and results

2025-08-17 12:32:08 +01:00
parent 62e5080102
commit b71a37647f
130 changed files with 733 additions and 14425 deletions
@@ -0,0 +1,571 @@
+#!/usr/bin/env python3
+"""
+gen_benchmarks_md.py — Generate Markdown for README + detailed benchmarks from results.json
+
+Defaults:
+- Input JSON: ../docs/results.json
+- Outputs: ./README_benchmarks_section.md and ./benchmarks_generated.md
+"""
+
+from __future__ import annotations
+import json
+import argparse
+import statistics as stats
+from pathlib import Path
+from collections import defaultdict
+from typing import Dict, List, Tuple, Optional
+
+# === ENV LABELS ===
+# Maps each environment key (the value found in a run's "env" field) to the
+# human-readable backend name printed in the generated Markdown.
+# The insertion order matters: envs_present() iterates this dict, so the order
+# below is the order in which backends are reported.
+ENV_LABEL: Dict[str, str] = {
+    # ROCm 7 RC
+    "rocm7_rc-rocwmma": "ROCm 7 RC + ROCWMMA + hipBLASLt",
+    "rocm7_rc": "ROCm 7 RC (hipBLASLt)",
+    "rocm7_rc-hblt0": "ROCm 7 RC (hipBLASLt OFF)",
+    "rocm7_rc-rocwmma-hblt0": "ROCm 7 RC + ROCWMMA (hipBLASLt OFF)",
+
+    # ROCm 6.4.3
+    "rocm6_4_3": "ROCm 6.4.3 (hipBLASLt)",
+    "rocm6_4_3-hblt0": "ROCm 6.4.3 (hipBLASLt OFF)",
+    "rocm6_4_3-rocwmma": "ROCm 6.4.3 + ROCWMMA (hipBLASLt)",
+    "rocm6_4_3-rocwmma-hblt0": "ROCm 6.4.3 + ROCWMMA (hipBLASLt OFF)",
+
+    # Vulkan
+    "vulkan_amdvlk": "Vulkan AMDVLK",
+    "vulkan_radv": "Vulkan RADV",
+}
+
+# Test names analysed by this script; the generated tables label "pp512" as
+# prompt processing and "tg128" as token generation.
+TESTS = ["pp512", "tg128"]
+
+def md_row(values: List[str]) -> str:
+    return "| " + " | ".join(values) + " |"
+
+
+def load_results(path: Path) -> Dict:
+    data = json.loads(path.read_text())
+    assert "runs" in data and isinstance(data["runs"], list), "results.json must have a top-level 'runs' list"
+    return data
+
+
+def envs_present(runs: List[Dict], only_env: Optional[List[str]], include_all_envs: bool) -> List[str]:
+    present = {r.get("env") for r in runs if r.get("env")}
+    if only_env:
+        present = present.intersection(set(only_env))
+    if include_all_envs:
+        # Include even if not present (might appear 0 rows in tables)
+        envs = [e for e in ENV_LABEL.keys() if (not only_env or e in only_env)]
+    else:
+        envs = [e for e in ENV_LABEL.keys() if e in present and (not only_env or e in only_env)]
+    return envs
+
+
+def fa_to_filter(fa: str) -> Optional[bool]:
+    fa = fa.lower().strip()
+    if fa == "on":
+        return True
+    if fa == "off":
+        return False
+    if fa == "any":
+        return None
+    raise ValueError("--fa must be on/off/any")
+
+
+def margin_aware_placements(
+    runs: List[Dict],
+    envs: List[str],
+    test_filter: str,
+    fa_filter: Optional[bool]
+) -> Tuple[Dict[str, Dict[str, int]], int]:
+    """
+    Returns (placements, sample_count)
+    placements[env] -> {"first": n, "second": n, "third": n}
+    sample_count = number of model+quant comparisons considered
+    """
+    placements = defaultdict(lambda: {"first": 0, "second": 0, "third": 0})
+    # group by (model, quant)
+    grouped = defaultdict(list)
+    for r in runs:
+        if r.get("error"):
+            continue
+        if r.get("test") != test_filter:
+            continue
+        if fa_filter is not None and r.get("fa") != fa_filter:
+            continue
+        if r.get("env") not in envs:
+            continue
+        key = (r.get("model_clean"), r.get("quant"))
+        grouped[key].append(r)
+
+    samples = 0
+    for key, entries in grouped.items():
+        # collate by env
+        env_groups = defaultdict(list)
+        for e in entries:
+            env_groups[e["env"]].append(e)
+        env_list = [e for e in envs if e in env_groups]  # keep requested order
+        if len(env_list) < 2:
+            continue
+
+        # summarize median mean ± median err per env
+        summary = {}
+        for env in env_list:
+            means = [x["tps_mean"] for x in env_groups[env] if x.get("tps_mean") is not None]
+            errs = [x.get("tps_err", 0.0) or 0.0 for x in env_groups[env]]
+            if not means:
+                continue
+            m = stats.median(means)
+            e = stats.median(errs) if errs else 0.0
+            summary[env] = (m - e, m + e, m)
+        if len(summary) < 2:
+            continue
+
+        samples += 1
+
+        # rank with overlap -> ties share rank
+        remaining = [env for env, _ in sorted(summary.items(), key=lambda kv: kv[1][2], reverse=True)]
+        assigned = {}
+        current_rank = 1
+        while remaining and current_rank <= 3:
+            env0 = remaining[0]
+            low0, high0, _ = summary[env0]
+            tied = [env0]
+            for env in remaining[1:]:
+                low, high, _ = summary[env]
+                if not (low > high0 or high < low0):  # overlap -> tie
+                    tied.append(env)
+            for env in tied:
+                assigned[env] = current_rank
+            remaining = [e for e in remaining if e not in tied]
+            current_rank += 1
+
+        for env, rk in assigned.items():
+            if rk == 1:
+                placements[env]["first"] += 1
+            elif rk == 2:
+                placements[env]["second"] += 1
+            elif rk == 3:
+                placements[env]["third"] += 1
+
+    return placements, samples
+
+
+def pairwise_win_counts(runs: List[Dict], envA: str, envB: str, test: str, fa_filter: Optional[bool]) -> Tuple[int, int, int, int]:
+    A = {}
+    B = {}
+    for r in runs:
+        if r.get("error") or r.get("test") != test:
+            continue
+        if fa_filter is not None and r.get("fa") != fa_filter:
+            continue
+        key = (r.get("model_clean"), r.get("quant"))
+        if r.get("env") == envA:
+            A[key] = r["tps_mean"]
+        elif r.get("env") == envB:
+            B[key] = r["tps_mean"]
+    winsA = winsB = ties = 0
+    for k in (set(A) & set(B)):
+        if A[k] > B[k]:
+            winsA += 1
+        elif B[k] > A[k]:
+            winsB += 1
+        else:
+            ties += 1
+    total = winsA + winsB + ties
+    return winsA, winsB, ties, total
+
+
+def average_ranks(place_dict: Dict[str, Dict[str, int]]) -> Dict[str, Optional[float]]:
+    avg = {}
+    for env, c in place_dict.items():
+        total = c.get("first", 0) + c.get("second", 0) + c.get("third", 0)
+        if total == 0:
+            avg[env] = None
+        else:
+            avg[env] = round((1 * c.get("first", 0) + 2 * c.get("second", 0) + 3 * c.get("third", 0)) / total, 2)
+    return avg
+
+
+def flash_attention_effect(runs: List[Dict], envs: List[str]) -> Dict[str, Dict[str, Dict[str, float]]]:
+    """
+    Returns: effects[env][test] = {n_pairs, median_pct, min, max}
+    Based on paired model+quant runs (ON vs OFF).
+    """
+    model_pairs = defaultdict(lambda: defaultdict(dict))  # (env,test)->(model,quant)->{fa: tps}
+    for r in runs:
+        if r.get("error") or r.get("tps_mean") is None:
+            continue
+        if r.get("test") not in TESTS:
+            continue
+        if r.get("env") not in envs:
+            continue
+        model_key = (r.get("model_clean"), r.get("quant"))
+        model_pairs[(r["env"], r["test"])][model_key][r.get("fa")] = r["tps_mean"]
+
+    summary = defaultdict(dict)
+    for (env, test), d in model_pairs.items():
+        deltas = []
+        for mk, vals in d.items():
+            if True in vals and False in vals and vals[False] > 0:
+                deltas.append((vals[True] - vals[False]) / vals[False] * 100.0)
+        if deltas:
+            summary[env][test] = {
+                "n_pairs": len(deltas),
+                "median_pct": round(stats.median(deltas), 1),
+                "min": round(min(deltas), 1),
+                "max": round(max(deltas), 1),
+            }
+    return summary
+
+
+def rocwmma_effect(runs: List[Dict], pairs_to_compare: List[Tuple[str, str, str]], tests: List[str]) -> List[Tuple[str, str, str, str, int, float]]:
+    """
+    Compare ROCWMMA ON vs OFF with same hipBLASLt state.
+    Returns rows of (context_label, test, env_on, env_off, n_pairs, median_delta_pct)
+    where delta_pct = median(ON/OFF - 1)*100 over common model+quant.
+    """
+    rows = []
+    for env_on, env_off, label in pairs_to_compare:
+        for test in tests:
+            data_on = defaultdict(list)
+            data_off = defaultdict(list)
+            for r in runs:
+                if r.get("error") or r.get("test") != test:
+                    continue
+                if r.get("env") == env_on:
+                    data_on[(r.get("model_clean"), r.get("quant"))].append(r["tps_mean"])
+                elif r.get("env") == env_off:
+                    data_off[(r.get("model_clean"), r.get("quant"))].append(r["tps_mean"])
+            common = sorted(set(data_on) & set(data_off))
+            if not common:
+                continue
+            ratios = []
+            for k in common:
+                aon = stats.median(data_on[k])
+                aoff = stats.median(data_off[k])
+                if aoff > 0:
+                    ratios.append(aon / aoff - 1.0)
+            if ratios:
+                rows.append((label, test, env_on, env_off, len(ratios), round(100 * stats.median(ratios), 1)))
+    return rows
+
+
+def hipblaslt_effect(runs: List[Dict], pairs_to_compare: List[Tuple[str, str, str]], tests: List[str]) -> List[Tuple[str, str, str, str, int, float]]:
+    """
+    Compare hipBLASLt ON vs OFF with same ROCWMMA state.
+    Returns rows of (context_label, test, env_on, env_off, n_pairs, median_delta_pct)
+    where delta_pct = median(ON/OFF - 1)*100 over common model+quant.
+    """
+    rows = []
+    for env_on, env_off, label in pairs_to_compare:
+        for test in tests:
+            data_on = defaultdict(list)
+            data_off = defaultdict(list)
+            for r in runs:
+                if r.get("error") or r.get("test") != test:
+                    continue
+                if r.get("env") == env_on:
+                    data_on[(r.get("model_clean"), r.get("quant"))].append(r["tps_mean"])
+                elif r.get("env") == env_off:
+                    data_off[(r.get("model_clean"), r.get("quant"))].append(r["tps_mean"])
+            common = sorted(set(data_on) & set(data_off))
+            if not common:
+                continue
+            ratios = []
+            for k in common:
+                aon = stats.median(data_on[k])
+                aoff = stats.median(data_off[k])
+                if aoff > 0:
+                    ratios.append(aon / aoff - 1.0)
+            if ratios:
+                rows.append((label, test, env_on, env_off, len(ratios), round(100 * stats.median(ratios), 1)))
+    return rows
+
+
+def amdvlk_vs_radv(runs: List[Dict], fa_filter: Optional[bool]) -> List[Tuple[str, int, int, int, int]]:
+    rows = []
+    for test in TESTS:
+        wa, wr, ties, total = pairwise_win_counts(runs, "vulkan_amdvlk", "vulkan_radv", test, fa_filter)
+        rows.append((test, wa, wr, ties, total))
+    return rows
+
+
+def winners(place_dict: Dict[str, Dict[str, int]], slot="first") -> Tuple[List[str], int]:
+    max_count = max((c.get(slot, 0) for c in place_dict.values()), default=0)
+    win_list = [env for env, c in place_dict.items() if c.get(slot, 0) == max_count and max_count > 0]
+    return win_list, max_count
+
+
+def human_list(envs: List[str]) -> str:
+    return ", ".join(ENV_LABEL.get(e, e) for e in envs) if envs else "—"
+
+
+def build_readme_section(
+    envs: List[str],
+    pp_place: Dict[str, Dict[str, int]],
+    tg_place: Dict[str, Dict[str, int]],
+    fa_filter: Optional[bool]
+) -> str:
+    # Winners
+    pp_wins, _ = winners(pp_place, "first")
+    tg_wins, _ = winners(tg_place, "first")
+
+    lines: List[str] = []
+    lines.append("## 3. Performance Benchmarks (Key Results)")
+    lines.append("")
+    lines.append("🌐 Interactive exploration of the latest benchmark runs: [Interactie Benchmark Viewer](https://kyuz0.github.io/amd-strix-halo-toolboxes/)")
+    lines.append("")
+    lines.append("Benchmarks were analysed with **error-aware ties** (mean ± σ). If two backends overlap within margins, they are treated as a tie. All placement counts below use **Flash Attention ON**.")
+    lines.append("")
+
+    # Placement tables
+    def place_table(title: str, place_dict: Dict[str, Dict[str, int]]):
+        lines.append(f"**{title}**")
+        lines.append(md_row(["Backend", "1st", "2nd", "3rd"]))
+        lines.append(md_row(["---", "---:", "---:", "---:"]))
+        order = sorted(place_dict.items(), key=lambda kv: (-kv[1].get("first", 0), -kv[1].get("second", 0), kv[0]))
+        for env, c in order:
+            lines.append(md_row([ENV_LABEL.get(env, env), str(c.get("first", 0)), str(c.get("second", 0)), str(c.get("third", 0))]))
+        lines.append("")
+
+    place_table("Prompt Processing (pp512)", pp_place)
+    place_table("Token Generation (tg128)", tg_place)
+
+    # Data-driven recommendations
+    def total_score(c: Dict[str, int]) -> int:
+        # weight 1st more than 2nd
+        return c.get("first", 0) * 2 + c.get("second", 0)
+
+    best_bal_score = -1
+    balanced: List[str] = []
+    for env in envs:
+        score = total_score(pp_place.get(env, {})) + total_score(tg_place.get(env, {}))
+        if score > best_bal_score:
+            best_bal_score = score
+            balanced = [env]
+        elif score == best_bal_score:
+            balanced.append(env)
+
+    lines.append("### Summary & Recommendations")
+    lines.append(f"- **Fastest prompt processing:** {human_list(pp_wins)} (most 1st-place finishes).")
+    lines.append(f"- **Fastest token generation:** {human_list(tg_wins)} (most 1st-place finishes).")
+    lines.append(f"- **Balanced choice:** {human_list(balanced)} (consistently near the top across PP/TG).")
+    lines.append("")
+    lines.append("> **Note (ROCm 7):** Toolboxes enable **hipBLASLt** by default. The benchmark suite also runs **hipBLASLt OFF** variants to show its impact.")
+    return "\n".join(lines)
+
+
+def build_benchmarks_doc(
+    runs: List[Dict],
+    envs: List[str],
+    pp_place: Dict[str, Dict[str, int]],
+    tg_place: Dict[str, Dict[str, int]],
+    fa_filter: Optional[bool],
+) -> str:
+    lines: List[str] = []
+    lines.append("# AMD Strix Halo — llama.cpp Toolboxes (Benchmarks)")
+    lines.append("")
+    lines.append("**Interactive results:** https://kyuz0.github.io/amd-strix-halo-toolboxes/")
+    lines.append("")
+    lines.append("## Table of Contents")
+    lines.append("- [Benchmark methodology](#benchmark-methodology)")
+    lines.append("- [Summary of current dataset (Flash Attention ON)](#summary-of-current-dataset-flash-attention-on)")
+    lines.append("  - [Placement counts](#placement-counts)")
+    lines.append("  - [Pairwise head-to-head wins](#pairwise-head-to-head-wins)")
+    lines.append("  - [Average ranks](#average-ranks)")
+    lines.append("- [Analyses by feature](#analyses-by-feature)")
+    lines.append("  - [Impact of Flash Attention](#impact-of-flash-attention)")
+    lines.append("  - [Impact of ROCWMMA](#impact-of-rocwmma)")
+    lines.append("  - [Impact of hipBLASLt](#impact-of-hipblaslt)")
+    lines.append("  - [Vulkan: AMDVLK vs RADV](#vulkan-amdvlk-vs-radv)")
+    lines.append("- [Recommendations](#recommendations)")
+    lines.append("- [Winner calculation](#winner-calculation)")
+    lines.append("")
+    lines.append("---")
+    lines.append("")
+    lines.append("## Benchmark methodology")
+    lines.append("")
+    lines.append("- **pp512** — prompt processing throughput (tokens/sec, prefill)")
+    lines.append("- **tg128** — token generation throughput (tokens/sec, interactive)")
+    lines.append("- Each backend tested twice per model: `-fa 0` and `-fa 1`")
+    lines.append("- Winners per model/test are **margin-aware**; multiple winners are possible when mean±σ overlap")
+    lines.append("- Built from the same llama.cpp commit for consistency")
+    lines.append("")
+    lines.append("**Backends in this dataset:** " + ", ".join(ENV_LABEL.get(e, e) for e in envs))
+    lines.append("")
+    lines.append("**ROCm 7 hipBLASLt policy:** Toolboxes ship with **hipBLASLt enabled** by default (`ROCBLAS_USE_HIPBLASLT=1`). The benchmark script also runs **hipBLASLt OFF** variants (`-hblt0`) to measure its effect.")
+    lines.append("")
+    lines.append("---")
+    lines.append("")
+    lines.append("## Summary of current dataset (Flash Attention ON)")
+    lines.append("")
+    # Placement counts
+    lines.append("### Placement counts")
+    def place_block(title: str, place_dict: Dict[str, Dict[str, int]]):
+        lines.append(f"**{title}**")
+        lines.append(md_row(["Backend", "1st", "2nd", "3rd"]))
+        lines.append(md_row(["---", "---:", "---:", "---:"]))
+        order = sorted(place_dict.items(), key=lambda kv: (-kv[1].get("first", 0), -kv[1].get("second", 0), kv[0]))
+        for env, c in order:
+            lines.append(md_row([ENV_LABEL.get(env, env), str(c.get("first", 0)), str(c.get("second", 0)), str(c.get("third", 0))]))
+        lines.append("")
+    place_block("Prompt Processing (pp512)", pp_place)
+    place_block("Token Generation (tg128)", tg_place)
+
+    # Pairwise wins
+    lines.append("### Pairwise head-to-head wins")
+    lines.append("For any model+quant where both backends succeeded, this counts who was faster (ties when equal).")
+    lines.append(md_row(["Comparison", "Test", "A wins", "B wins", "Ties", "Total"]))
+    lines.append(md_row(["---", "---", "---:", "---:", "---:", "---:"]))
+    pairs = [
+        ("ROCm 7 RC + ROCWMMA + hipBLASLt", "Vulkan AMDVLK", "rocm7_rc-rocwmma", "vulkan_amdvlk"),
+        ("ROCm 7 RC + ROCWMMA + hipBLASLt", "Vulkan RADV", "rocm7_rc-rocwmma", "vulkan_radv"),
+        ("Vulkan AMDVLK", "Vulkan RADV", "vulkan_amdvlk", "vulkan_radv"),
+    ]
+    for labelA, labelB, envA, envB in pairs:
+        for test in TESTS:
+            a, b, t, total = pairwise_win_counts(runs, envA, envB, test, fa_filter)
+            lines.append(md_row([f"{labelA} vs {labelB}", test, str(a), str(b), str(t), str(total)]))
+    lines.append("")
+
+    # Average ranks
+    lines.append("### Average ranks")
+    avg_pp = average_ranks(pp_place)
+    avg_tg = average_ranks(tg_place)
+    lines.append("**Prompt Processing (pp512)**")
+    lines.append(md_row(["Backend", "Avg Rank (↓ is better)"]))
+    lines.append(md_row(["---", "---:"]))
+    for env, val in sorted(avg_pp.items(), key=lambda kv: (kv[1] is None, kv[1] or 99)):
+        lines.append(md_row([ENV_LABEL.get(env, env), str(val) if val is not None else "—"]))
+    lines.append("")
+    lines.append("**Token Generation (tg128)**")
+    lines.append(md_row(["Backend", "Avg Rank (↓ is better)"]))
+    lines.append(md_row(["---", "---:"]))
+    for env, val in sorted(avg_tg.items(), key=lambda kv: (kv[1] is None, kv[1] or 99)):
+        lines.append(md_row([ENV_LABEL.get(env, env), str(val) if val is not None else "—"]))
+    lines.append("")
+    lines.append("---")
+    lines.append("")
+    lines.append("## Analyses by feature")
+    lines.append("")
+
+    # Flash Attention effect
+    lines.append("### Impact of Flash Attention")
+    fa_eff = flash_attention_effect(runs, envs)
+    lines.append("Median % change when **Flash Attention ON vs OFF**, paired by model+quant, per backend:")
+    lines.append(md_row(["Backend", "pp512 Δ% (median, min..max, n)", "tg128 Δ% (median, min..max, n)"]))
+    lines.append(md_row(["---", "---", "---"]))
+    def fmt_eff(row: Optional[Dict[str, float]]) -> str:
+        return f"{row['median_pct']}% ({row['min']}..{row['max']}), n={row['n_pairs']}" if row else "—"
+    for env in envs:
+        row_pp = fa_eff.get(env, {}).get("pp512")
+        row_tg = fa_eff.get(env, {}).get("tg128")
+        lines.append(md_row([ENV_LABEL.get(env, env), fmt_eff(row_pp), fmt_eff(row_tg)]))
+    lines.append("")
+
+    # ROCWMMA effect — check both ROCm 7 and 6.4.3 families if present
+    lines.append("### Impact of ROCWMMA")
+    rocwmma_pairs = []
+    if "rocm7_rc-rocwmma" in envs and "rocm7_rc" in envs:
+        rocwmma_pairs.append(("rocm7_rc-rocwmma", "rocm7_rc", "ROCm 7 RC (hipBLASLt)"))
+    if "rocm7_rc-rocwmma-hblt0" in envs and "rocm7_rc-hblt0" in envs:
+        rocwmma_pairs.append(("rocm7_rc-rocwmma-hblt0", "rocm7_rc-hblt0", "ROCm 7 RC (hipBLASLt OFF)"))
+    if "rocm6_4_3-rocwmma" in envs and "rocm6_4_3" in envs:
+        rocwmma_pairs.append(("rocm6_4_3-rocwmma", "rocm6_4_3", "ROCm 6.4.3 (hipBLASLt)"))
+    if "rocm6_4_3-rocwmma-hblt0" in envs and "rocm6_4_3-hblt0" in envs:
+        rocwmma_pairs.append(("rocm6_4_3-rocwmma-hblt0", "rocm6_4_3-hblt0", "ROCm 6.4.3 (hipBLASLt OFF)"))
+
+    rocwmma_rows = rocwmma_effect(runs, rocwmma_pairs, TESTS)
+    lines.append(md_row(["Context", "Test", "Compared Envs", "Pairs", "Median Δ%"]))
+    lines.append(md_row(["---", "---", "---", "---:", "---:"]))
+    for label, test, env_on, env_off, n, delta in rocwmma_rows:
+        lines.append(md_row([label, test, f"{ENV_LABEL.get(env_on, env_on)} vs {ENV_LABEL.get(env_off, env_off)}", str(n), f"{delta}%"]))
+    lines.append("")
+
+    # hipBLASLt effect — for both ROCm 7 and 6.4.3 families
+    lines.append("### Impact of hipBLASLt")
+    hip_pairs = []
+    if "rocm7_rc" in envs and "rocm7_rc-hblt0" in envs:
+        hip_pairs.append(("rocm7_rc", "rocm7_rc-hblt0", "ROCm 7 RC (no ROCWMMA)"))
+    if "rocm7_rc-rocwmma" in envs and "rocm7_rc-rocwmma-hblt0" in envs:
+        hip_pairs.append(("rocm7_rc-rocwmma", "rocm7_rc-rocwmma-hblt0", "ROCm 7 RC + ROCWMMA"))
+    if "rocm6_4_3" in envs and "rocm6_4_3-hblt0" in envs:
+        hip_pairs.append(("rocm6_4_3", "rocm6_4_3-hblt0", "ROCm 6.4.3 (no ROCWMMA)"))
+    if "rocm6_4_3-rocwmma" in envs and "rocm6_4_3-rocwmma-hblt0" in envs:
+        hip_pairs.append(("rocm6_4_3-rocwmma", "rocm6_4_3-rocwmma-hblt0", "ROCm 6.4.3 + ROCWMMA"))
+
+    hip_rows = hipblaslt_effect(runs, hip_pairs, TESTS)
+    lines.append(md_row(["Context", "Test", "Compared Envs", "Pairs", "Median Δ%"]))
+    lines.append(md_row(["---", "---", "---", "---:", "---:"]))
+    for label, test, env_on, env_off, n, delta in hip_rows:
+        lines.append(md_row([label, test, f"{ENV_LABEL.get(env_on, env_on)} vs {ENV_LABEL.get(env_off, env_off)}", str(n), f"{delta}%"]))
+    lines.append("")
+
+    # AMDVLK vs RADV
+    lines.append("### Vulkan: AMDVLK vs RADV")
+    lines.append("Head-to-head wins with selected Flash Attention filter:")
+    lines.append(md_row(["Test", "AMDVLK wins", "RADV wins", "Ties", "Total"]))
+    lines.append(md_row(["---", "---:", "---:", "---:", "---:"]))
+    for test, wa, wr, t, total in amdvlk_vs_radv(runs, fa_filter):
+        lines.append(md_row([test, str(wa), str(wr), str(t), str(total)]))
+    lines.append("")
+    lines.append("---")
+    lines.append("")
+    lines.append("## Recommendations")
+    pp_wins, _ = winners(pp_place, "first")
+    tg_wins, _ = winners(tg_place, "first")
+    lines.append(f"- **Fastest prompt processing:** {human_list(pp_wins)} (most 1st-place finishes with selected Flash Attention filter).")
+    lines.append(f"- **Fastest token generation:** {human_list(tg_wins)} (most 1st-place finishes with selected Flash Attention filter).")
+    # Balanced: highest (2*first + second) across PP+TG
+    def score(c: Dict[str, int]) -> int:
+        return c.get("first", 0) * 2 + c.get("second", 0)
+    best_bal = -1
+    balanced: List[str] = []
+    for env in envs:
+        s = score(pp_place.get(env, {})) + score(tg_place.get(env, {}))
+        if s > best_bal:
+            best_bal = s
+            balanced = [env]
+        elif s == best_bal:
+            balanced.append(env)
+    lines.append(f"- **Balanced choice:** {human_list(balanced)} (consistently near the top across PP/TG).")
+    lines.append("")
+    lines.append("---")
+    lines.append("")
+    lines.append("## Winner calculation")
+    lines.append("A backend is counted as a winner if its mean throughput is within the best backend’s pooled ± error margin for that model/test type. This treats results within measurement noise as ties instead of false losses.")
+    return "\n".join(lines)
+
+def main():
+    ap = argparse.ArgumentParser()
+    ap.add_argument("--file", type=Path, default=Path("../docs/results.json"),
+                    help="Path to results.json (default: ../docs/results.json)")
+    ap.add_argument("--out-readme", type=Path, default=Path("./README_benchmarks_section.md"),
+                    help="Path to write README section Markdown (default: ./README_benchmarks_section.md)")
+    ap.add_argument("--out-bench", type=Path, default=Path("./benchmarks_generated.md"),
+                    help="Path to write detailed benchmarks Markdown (default: ./benchmarks_generated.md)")
+    ap.add_argument("--fa", choices=["on", "off", "any"], default="on",
+                    help="Flash Attention filter (default: on)")
+    ap.add_argument("--include-all-envs", action="store_true",
+                    help="Include envs even if not present in results.json")
+    ap.add_argument("--only-env", action="append",
+                    help="Restrict analysis to specific env keys (repeatable)")
+    args = ap.parse_args()
+
+    data = load_results(args.file)
+    runs: List[Dict] = data["runs"]
+    fa_filter = fa_to_filter(args.fa)
+    envs = envs_present(runs, args.only_env, args.include_all_envs)
+
+    pp_place, _ = margin_aware_placements(runs, envs, "pp512", fa_filter)
+    tg_place, _ = margin_aware_placements(runs, envs, "tg128", fa_filter)
+
+    readme_md = build_readme_section(envs, pp_place, tg_place, fa_filter)
+    args.out_readme.write_text(readme_md)
+
+    bench_md = build_benchmarks_doc(runs, envs, pp_place, tg_place, fa_filter)
+    args.out_bench.write_text(bench_md)
+
+    print(f"Wrote:\n - {args.out_readme}\n - {args.out_bench}")
+
+
+if __name__ == "__main__":
+    main()
@@ -1,175 +0,0 @@
-#!/usr/bin/env python3
-import json, re
-from collections import defaultdict
-from pathlib import Path
-
-# Path of the JSON results file this script reads.
-RESULTS_FILE = "../docs/results.json"
-
-# Column order + labels
-# ENV_ORDER lists the env keys that are tabulated, in column order; runs whose
-# env is not listed here are ignored.
-ENV_ORDER = [
-    "vulkan_amdvlk",
-    "vulkan_radv",
-    "rocm6_4_2",
-    "rocm6_4_2-rocwmma",
-    "rocm7_beta",
-    "rocm7_rc",
-]
-# Column header text per env key.
-COL_NAMES = {
-    "vulkan_amdvlk": "Vulkan (AMDVLK)",
-    "vulkan_radv": "Vulkan (RADV)",
-    "rocm6_4_2": "ROCm 6.4.2",
-    "rocm6_4_2-rocwmma": "ROCm 6.4.2 + ROCWMMA",
-    "rocm7_beta": "ROCm 7.0 Beta",
-    "rocm7_rc": "ROCm 7.0 RC",
-}
-# Short env names used in the "Best PP" / "Best TG" winner cells.
-WINNER_NAMES = {
-    "vulkan_amdvlk": "AMDVLK",
-    "vulkan_radv": "RADV",
-    "rocm6_4_2": "ROCm6.4.2",
-    "rocm6_4_2-rocwmma": "ROCm6.4.2+ROCWMMA",
-    "rocm7_beta": "ROCm7 Beta",
-    "rocm7_rc": "ROCm7 RC",
-}
-# Cell text shown for a failed run, keyed by its "error_type".
-ERROR_LABEL = {
-    "load": "⚠️ Load Error",
-    "hang": "⚠️ GPU Hang",
-    "runtime": "⚠️ Runtime Error",
-}
-
-# Table rows as (display name, fuzzy model key); the fuzzy key is matched as a
-# substring of the normalised model names found in the results.
-DEFAULT_MODELS = [
-    ("Gemma3 12B Q8_0",            "gemma-3-12b"),
-    ("Gemma3 27B BF16",            "gemma-3-27b"),
-    ("Llama-4-Scout 17B Q8_0",     "llama-4-scout-17b-16e-instruct-q8_0"),
-    ("Llama-4-Scout 17B Q4_K XL",  "llama-4-scout-17b-16e-instruct-q4_k_xl"),
-    ("Qwen3 30B BF16",             "qwen3-30b-a3b-bf16"),
-    ("Qwen3-235B Q3_K XL",         "qwen3-235b-a22b"),
-    ("GLM-4.5-Air-Q4_K_XL",        "glm-4.5-air-q4_k_xl"),
-    ("GLM-4.5-Air-Q6_K_XL",        "glm-4.5-air-q6_k_xl"),
-    ("gpt-oss-120b-mxfp4",         "gpt-oss-120b-mxfp4"),
-    ("gpt-oss-20b-mxfp4",          "gpt-oss-20b-mxfp4"),
-]
-
-SHARD_RE = re.compile(r"-000\d+-of-000\d+", re.IGNORECASE)
-def norm_model(s: str) -> str:
-    s = (s or "").lower().replace("_", "-")
-    s = SHARD_RE.sub("", s)
-    s = s.replace("-ud", "")
-    return s
-
-# Load the results file and take its top-level "runs" list.
-raw = json.loads(Path(RESULTS_FILE).read_text(encoding="utf-8"))
-runs = raw["runs"]
-
-# (model key, env, test) -> runs of the two tabulated tests
-buckets = defaultdict(list)
-# (model key, env) -> error types of failed runs that belong to neither tabulated test
-error_only = defaultdict(list)
-# every normalised model key seen for a tabulated env
-all_models = set()
-
-for r in runs:
-    env = r.get("env")
-    if env not in ENV_ORDER:
-        continue
-    # prefer the cleaned model name, fall back to the raw one
-    mkey = norm_model(r.get("model_clean") or r.get("model") or "")
-    all_models.add(mkey)
-    test = r.get("test")
-    if test in ("pp512", "tg128"):
-        buckets[(mkey, env, test)].append(r)
-    else:
-        # a missing error_type is recorded as "runtime"
-        if r.get("error"):
-            error_only[(mkey, env)].append(r.get("error_type") or "runtime")
-
-def pick_best(rows):
-    best, best_val, fallback = None, -1, None
-    for r in rows:
-        if r.get("error"):
-            fallback = r
-            continue
-        v = r.get("tps_mean")
-        if isinstance(v, (int, float)) and v > best_val:
-            best_val, best = v, r
-    return best or fallback
-
-# model key -> env -> {test: selected row}; an "error_only" entry is added below
-chosen = defaultdict(lambda: defaultdict(dict))
-for (mkey, env, test), rows in buckets.items():
-    chosen_row = pick_best(rows)
-    chosen[mkey][env][test] = chosen_row
-
-# Reduce the collected error types to a single one per (model, env):
-# "load" takes precedence over "hang", anything else becomes "runtime".
-for (mkey, env), etypes in error_only.items():
-    if etypes:
-        if "load" in etypes:
-            chosen[mkey][env]["error_only"] = "load"
-        elif "hang" in etypes:
-            chosen[mkey][env]["error_only"] = "hang"
-        else:
-            chosen[mkey][env]["error_only"] = "runtime"
-
-def fa_tag(row):
-    if not row or row.get("error"):
-        return ""
-    fa = row.get("fa")
-    if fa is None:
-        return ""
-    return " (FA on)" if fa else " (FA off)"
-
-def format_cell(entry_dict):
-    pp = entry_dict.get("pp512")
-    tg = entry_dict.get("tg128")
-    for row in (pp, tg):
-        if row and row.get("error"):
-            return ERROR_LABEL.get(row.get("error_type") or "runtime", "⚠️ Error")
-    if not pp and not tg:
-        et = entry_dict.get("error_only")
-        if et:
-            return ERROR_LABEL.get(et, "⚠️ Error")
-        return "—"
-    def fmt(v):
-        return f"{int(round(v))}" if isinstance(v, (int, float)) else "—"
-    ppv = pp.get("tps_mean") if pp else None
-    tgv = tg.get("tps_mean") if tg else None
-    pp_suffix = fa_tag(pp)
-    tg_suffix = fa_tag(tg)
-    if isinstance(tgv, (int, float)):
-        return f"{fmt(ppv)} pp{pp_suffix} / {tgv:.1f} tg{tg_suffix}"
-    else:
-        return f"{fmt(ppv)} pp{pp_suffix} / — tg"
-
-def best_env_for(mkey, test):
-    best_env, best_val, best_row = None, -1, None
-    for env in ENV_ORDER:
-        row = chosen[mkey].get(env, {}).get(test)
-        if not row or row.get("error"):
-            continue
-        v = row.get("tps_mean")
-        if isinstance(v, (int, float)) and v > best_val:
-            best_env, best_val, best_row = env, v, row
-    return best_env, (best_row.get("fa") if best_row else None)
-
-def win_label(env, fa):
-    if not env:
-        return "—"
-    base = WINNER_NAMES[env]
-    if fa is None:
-        return f"🏆 **{base}**"
-    return f"🏆 **{base}** ({'FA on' if fa else 'FA off'})"
-
-def find_model_key(fuzzy):
-    needle = norm_model(fuzzy)
-    for k in all_models:
-        if needle in k:
-            return k
-    return None
-
-# Header now has Best PP & Best TG right after Model
-header = ["Model", "🏆 Best PP", "🏆 Best TG"] + [COL_NAMES[e] for e in ENV_ORDER]
-print("| " + " | ".join(header) + " |")
-print("|" + "|".join(["---"] * len(header)) + "|")
-
-# One table row per configured model, printed to stdout.
-for disp, fuzzy in DEFAULT_MODELS:
-    mkey = find_model_key(fuzzy)
-    if not mkey:
-        # model absent from the results: emit a row of placeholders
-        print("| " + " | ".join([f"**{disp}**", "—", "—"] + ["—"]*len(ENV_ORDER)) + " |")
-        continue
-    bpp_env, bpp_fa = best_env_for(mkey, "pp512")
-    btg_env, btg_fa = best_env_for(mkey, "tg128")
-    row = [f"**{disp}**", win_label(bpp_env, bpp_fa), win_label(btg_env, btg_fa)]
-    for env in ENV_ORDER:
-        row.append(format_cell(chosen[mkey].get(env, {})))
-    print("| " + " | ".join(row) + " |")
@@ -1,172 +0,0 @@
-ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
-ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
-ggml_cuda_init: found 1 ROCm devices:
-  Device 0: Radeon 8060S Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32
-build: 6040 (66625a59) with cc (GCC) 15.1.1 20250521 (Red Hat 15.1.1-2) for x86_64-redhat-linux
-main: llama backend init
-main: load the model and apply lora adapter, if any
-llama_model_load_from_file_impl: using device ROCm0 (Radeon 8060S Graphics) - 124522 MiB free
-llama_model_loader: additional 1 GGUFs metadata loaded.
-llama_model_loader: loaded meta data with 39 key-value pairs and 963 tensors from /home/kyuz0/models/kimi-dev-72B-Q8_K_XL/UD-Q8_K_XL/Kimi-Dev-72B-UD-Q8_K_XL-00001-of-00002.gguf (version GGUF V3 (latest))
-llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
-llama_model_loader: - kv   0:                       general.architecture str              = qwen2
-llama_model_loader: - kv   1:                               general.type str              = model
-llama_model_loader: - kv   2:                               general.name str              = Kimi-Dev-72B
-llama_model_loader: - kv   3:                           general.basename str              = Kimi-Dev-72B
-llama_model_loader: - kv   4:                       general.quantized_by str              = Unsloth
-llama_model_loader: - kv   5:                         general.size_label str              = 72B
-llama_model_loader: - kv   6:                            general.license str              = mit
-llama_model_loader: - kv   7:                           general.repo_url str              = https://huggingface.co/unsloth
-llama_model_loader: - kv   8:                   general.base_model.count u32              = 1
-llama_model_loader: - kv   9:                  general.base_model.0.name str              = Kimi Dev 72B
-llama_model_loader: - kv  10:          general.base_model.0.organization str              = Moonshotai
-llama_model_loader: - kv  11:              general.base_model.0.repo_url str              = https://huggingface.co/moonshotai/Kim...
-llama_model_loader: - kv  12:                               general.tags arr[str,5]       = ["code", "unsloth", "swebench", "soft...
-llama_model_loader: - kv  13:                          qwen2.block_count u32              = 80
-llama_model_loader: - kv  14:                       qwen2.context_length u32              = 131072
-llama_model_loader: - kv  15:                     qwen2.embedding_length u32              = 8192
-llama_model_loader: - kv  16:                  qwen2.feed_forward_length u32              = 29568
-llama_model_loader: - kv  17:                 qwen2.attention.head_count u32              = 64
-llama_model_loader: - kv  18:              qwen2.attention.head_count_kv u32              = 8
-llama_model_loader: - kv  19:                       qwen2.rope.freq_base f32              = 1000000.000000
-llama_model_loader: - kv  20:     qwen2.attention.layer_norm_rms_epsilon f32              = 0.000001
-llama_model_loader: - kv  21:                       tokenizer.ggml.model str              = gpt2
-llama_model_loader: - kv  22:                         tokenizer.ggml.pre str              = qwen2
-llama_model_loader: - kv  23:                      tokenizer.ggml.tokens arr[str,152064]  = ["!", "\"", "#", "$", "%", "&", "'", ...
-llama_model_loader: - kv  24:                  tokenizer.ggml.token_type arr[i32,152064]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
-llama_model_loader: - kv  25:                      tokenizer.ggml.merges arr[str,151387]  = ["Ġ Ġ", "ĠĠ ĠĠ", "i n", "Ġ t",...
-llama_model_loader: - kv  26:                tokenizer.ggml.eos_token_id u32              = 151645
-llama_model_loader: - kv  27:            tokenizer.ggml.padding_token_id u32              = 151654
-llama_model_loader: - kv  28:               tokenizer.ggml.add_bos_token bool             = false
-llama_model_loader: - kv  29:                    tokenizer.chat_template str              = {%- if tools %}\n    {{- '<|im_start|>...
-llama_model_loader: - kv  30:               general.quantization_version u32              = 2
-llama_model_loader: - kv  31:                          general.file_type u32              = 7
-llama_model_loader: - kv  32:                      quantize.imatrix.file str              = Kimi-Dev-72B-GGUF/imatrix_unsloth.dat
-llama_model_loader: - kv  33:                   quantize.imatrix.dataset str              = unsloth_calibration_Kimi-Dev-72B.txt
-llama_model_loader: - kv  34:             quantize.imatrix.entries_count u32              = 560
-llama_model_loader: - kv  35:              quantize.imatrix.chunks_count u32              = 685
-llama_model_loader: - kv  36:                                   split.no u16              = 0
-llama_model_loader: - kv  37:                        split.tensors.count i32              = 963
-llama_model_loader: - kv  38:                                split.count u16              = 2
-llama_model_loader: - type  f32:  401 tensors
-llama_model_loader: - type  f16:  107 tensors
-llama_model_loader: - type q8_0:  455 tensors
-print_info: file format = GGUF V3 (latest)
-print_info: file type   = Q8_0
-print_info: file size   = 78.21 GiB (9.24 BPW) 
-load: special tokens cache size = 22
-load: token to piece cache size = 0.9310 MB
-print_info: arch             = qwen2
-print_info: vocab_only       = 0
-print_info: n_ctx_train      = 131072
-print_info: n_embd           = 8192
-print_info: n_layer          = 80
-print_info: n_head           = 64
-print_info: n_head_kv        = 8
-print_info: n_rot            = 128
-print_info: n_swa            = 0
-print_info: is_swa_any       = 0
-print_info: n_embd_head_k    = 128
-print_info: n_embd_head_v    = 128
-print_info: n_gqa            = 8
-print_info: n_embd_k_gqa     = 1024
-print_info: n_embd_v_gqa     = 1024
-print_info: f_norm_eps       = 0.0e+00
-print_info: f_norm_rms_eps   = 1.0e-06
-print_info: f_clamp_kqv      = 0.0e+00
-print_info: f_max_alibi_bias = 0.0e+00
-print_info: f_logit_scale    = 0.0e+00
-print_info: f_attn_scale     = 0.0e+00
-print_info: n_ff             = 29568
-print_info: n_expert         = 0
-print_info: n_expert_used    = 0
-print_info: causal attn      = 1
-print_info: pooling type     = -1
-print_info: rope type        = 2
-print_info: rope scaling     = linear
-print_info: freq_base_train  = 1000000.0
-print_info: freq_scale_train = 1
-print_info: n_ctx_orig_yarn  = 131072
-print_info: rope_finetuned   = unknown
-print_info: model type       = 70B
-print_info: model params     = 72.71 B
-print_info: general.name     = Kimi-Dev-72B
-print_info: vocab type       = BPE
-print_info: n_vocab          = 152064
-print_info: n_merges         = 151387
-print_info: BOS token        = 11 ','
-print_info: EOS token        = 151645 '<|im_end|>'
-print_info: EOT token        = 151645 '<|im_end|>'
-print_info: PAD token        = 151654 '<|vision_pad|>'
-print_info: LF token         = 198 'Ċ'
-print_info: FIM PRE token    = 151659 '<|fim_prefix|>'
-print_info: FIM SUF token    = 151661 '<|fim_suffix|>'
-print_info: FIM MID token    = 151660 '<|fim_middle|>'
-print_info: FIM PAD token    = 151662 '<|fim_pad|>'
-print_info: FIM REP token    = 151663 '<|repo_name|>'
-print_info: FIM SEP token    = 151664 '<|file_sep|>'
-print_info: EOG token        = 151643 '<|endoftext|>'
-print_info: EOG token        = 151645 '<|im_end|>'
-print_info: EOG token        = 151662 '<|fim_pad|>'
-print_info: EOG token        = 151663 '<|repo_name|>'
-print_info: EOG token        = 151664 '<|file_sep|>'
-print_info: max token length = 256
-load_tensors: loading model tensors, this can take a while... (mmap = false)
-load_tensors: offloading 80 repeating layers to GPU
-load_tensors: offloading output layer to GPU
-load_tensors: offloaded 81/81 layers to GPU
-load_tensors:        ROCm0 model buffer size = 77715.11 MiB
-load_tensors:    ROCm_Host model buffer size =  2376.00 MiB
-.................................................................................................
-llama_context: constructing llama_context
-llama_context: non-unified KV cache requires ggml_set_rows() - forcing unified KV cache
-llama_context: n_seq_max     = 1
-llama_context: n_ctx         = 4096
-llama_context: n_ctx_per_seq = 4096
-llama_context: n_batch       = 2048
-llama_context: n_ubatch      = 512
-llama_context: causal_attn   = 1
-llama_context: flash_attn    = 1
-llama_context: kv_unified    = true
-llama_context: freq_base     = 1000000.0
-llama_context: freq_scale    = 1
-llama_context: n_ctx_per_seq (4096) < n_ctx_train (131072) -- the full capacity of the model will not be utilized
-llama_context:  ROCm_Host  output buffer size =     0.58 MiB
-llama_kv_cache_unified:      ROCm0 KV buffer size =  1280.00 MiB
-llama_kv_cache_unified: size = 1280.00 MiB (  4096 cells,  80 layers,  1/ 1 seqs), K (f16):  640.00 MiB, V (f16):  640.00 MiB
-llama_kv_cache_unified: LLAMA_SET_ROWS=0, using old ggml_cpy() method for backwards compatibility
-llama_context:      ROCm0 compute buffer size =   313.00 MiB
-llama_context:  ROCm_Host compute buffer size =     8.01 MiB
-llama_context: graph nodes  = 2887
-llama_context: graph splits = 1
-common_init_from_params: added <|endoftext|> logit bias = -inf
-common_init_from_params: added <|im_end|> logit bias = -inf
-common_init_from_params: added <|fim_pad|> logit bias = -inf
-common_init_from_params: added <|repo_name|> logit bias = -inf
-common_init_from_params: added <|file_sep|> logit bias = -inf
-common_init_from_params: setting dry_penalty_last_n to ctx_size = 4096
-common_init_from_params: warming up the model with an empty run - please wait ... (--no-warmup to disable)
-main: llama threadpool init, n_threads = 16
-
-system_info: n_threads = 16 (n_threads_batch = 16) / 32 | ROCm : NO_VMM = 1 | PEER_MAX_BATCH_SIZE = 128 | CPU : SSE3 = 1 | SSSE3 = 1 | AVX = 1 | AVX2 = 1 | F16C = 1 | FMA = 1 | BMI2 = 1 | AVX512 = 1 | AVX512_VBMI = 1 | AVX512_VNNI = 1 | LLAMAFILE = 1 | OPENMP = 1 | REPACK = 1 | 
-
-sampler seed: 1808727616
-sampler params: 
-	repeat_last_n = 64, repeat_penalty = 1.000, frequency_penalty = 0.000, presence_penalty = 0.000
-	dry_multiplier = 0.000, dry_base = 1.750, dry_allowed_length = 2, dry_penalty_last_n = 4096
-	top_k = 40, top_p = 0.950, min_p = 0.050, xtc_probability = 0.000, xtc_threshold = 0.100, typical_p = 1.000, top_n_sigma = -1.000, temp = 0.800
-	mirostat = 0, mirostat_lr = 0.100, mirostat_ent = 5.000
-sampler chain: logits -> logit-bias -> penalties -> dry -> top-n-sigma -> top-k -> typical -> top-p -> min-p -> xtc -> temp-ext -> dist 
-generate: n_ctx = 4096, n_batch = 2048, n_predict = 1, n_keep = 0
-
-Hello0
-
-llama_perf_sampler_print:    sampling time =       0.06 ms /     2 runs   (    0.03 ms per token, 31746.03 tokens per second)
-llama_perf_context_print:        load time =   31744.47 ms
-llama_perf_context_print: prompt eval time =       0.00 ms /     1 tokens (    0.00 ms per token,      inf tokens per second)
-llama_perf_context_print:        eval time =     463.93 ms /     1 runs   (  463.93 ms per token,     2.16 tokens per second)
-llama_perf_context_print:       total time =     470.35 ms /     2 tokens
-llama_perf_context_print:    graphs reused =          0
-    Elapsed #3: 36.639378936s
-    Run #3 status: 0
-  → Avg over 3 runs: 35.301s
@@ -1,172 +0,0 @@
-ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
-ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
-ggml_cuda_init: found 1 ROCm devices:
-  Device 0: AMD Radeon Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32
-build: 6040 (66625a59) with cc (GCC) 15.1.1 20250719 (Red Hat 15.1.1-5) for x86_64-redhat-linux
-main: llama backend init
-main: load the model and apply lora adapter, if any
-llama_model_load_from_file_impl: using device ROCm0 (AMD Radeon Graphics) - 124523 MiB free
-llama_model_loader: additional 1 GGUFs metadata loaded.
-llama_model_loader: loaded meta data with 39 key-value pairs and 963 tensors from /home/kyuz0/models/kimi-dev-72B-Q8_K_XL/UD-Q8_K_XL/Kimi-Dev-72B-UD-Q8_K_XL-00001-of-00002.gguf (version GGUF V3 (latest))
-llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
-llama_model_loader: - kv   0:                       general.architecture str              = qwen2
-llama_model_loader: - kv   1:                               general.type str              = model
-llama_model_loader: - kv   2:                               general.name str              = Kimi-Dev-72B
-llama_model_loader: - kv   3:                           general.basename str              = Kimi-Dev-72B
-llama_model_loader: - kv   4:                       general.quantized_by str              = Unsloth
-llama_model_loader: - kv   5:                         general.size_label str              = 72B
-llama_model_loader: - kv   6:                            general.license str              = mit
-llama_model_loader: - kv   7:                           general.repo_url str              = https://huggingface.co/unsloth
-llama_model_loader: - kv   8:                   general.base_model.count u32              = 1
-llama_model_loader: - kv   9:                  general.base_model.0.name str              = Kimi Dev 72B
-llama_model_loader: - kv  10:          general.base_model.0.organization str              = Moonshotai
-llama_model_loader: - kv  11:              general.base_model.0.repo_url str              = https://huggingface.co/moonshotai/Kim...
-llama_model_loader: - kv  12:                               general.tags arr[str,5]       = ["code", "unsloth", "swebench", "soft...
-llama_model_loader: - kv  13:                          qwen2.block_count u32              = 80
-llama_model_loader: - kv  14:                       qwen2.context_length u32              = 131072
-llama_model_loader: - kv  15:                     qwen2.embedding_length u32              = 8192
-llama_model_loader: - kv  16:                  qwen2.feed_forward_length u32              = 29568
-llama_model_loader: - kv  17:                 qwen2.attention.head_count u32              = 64
-llama_model_loader: - kv  18:              qwen2.attention.head_count_kv u32              = 8
-llama_model_loader: - kv  19:                       qwen2.rope.freq_base f32              = 1000000.000000
-llama_model_loader: - kv  20:     qwen2.attention.layer_norm_rms_epsilon f32              = 0.000001
-llama_model_loader: - kv  21:                       tokenizer.ggml.model str              = gpt2
-llama_model_loader: - kv  22:                         tokenizer.ggml.pre str              = qwen2
-llama_model_loader: - kv  23:                      tokenizer.ggml.tokens arr[str,152064]  = ["!", "\"", "#", "$", "%", "&", "'", ...
-llama_model_loader: - kv  24:                  tokenizer.ggml.token_type arr[i32,152064]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
-llama_model_loader: - kv  25:                      tokenizer.ggml.merges arr[str,151387]  = ["Ġ Ġ", "ĠĠ ĠĠ", "i n", "Ġ t",...
-llama_model_loader: - kv  26:                tokenizer.ggml.eos_token_id u32              = 151645
-llama_model_loader: - kv  27:            tokenizer.ggml.padding_token_id u32              = 151654
-llama_model_loader: - kv  28:               tokenizer.ggml.add_bos_token bool             = false
-llama_model_loader: - kv  29:                    tokenizer.chat_template str              = {%- if tools %}\n    {{- '<|im_start|>...
-llama_model_loader: - kv  30:               general.quantization_version u32              = 2
-llama_model_loader: - kv  31:                          general.file_type u32              = 7
-llama_model_loader: - kv  32:                      quantize.imatrix.file str              = Kimi-Dev-72B-GGUF/imatrix_unsloth.dat
-llama_model_loader: - kv  33:                   quantize.imatrix.dataset str              = unsloth_calibration_Kimi-Dev-72B.txt
-llama_model_loader: - kv  34:             quantize.imatrix.entries_count u32              = 560
-llama_model_loader: - kv  35:              quantize.imatrix.chunks_count u32              = 685
-llama_model_loader: - kv  36:                                   split.no u16              = 0
-llama_model_loader: - kv  37:                        split.tensors.count i32              = 963
-llama_model_loader: - kv  38:                                split.count u16              = 2
-llama_model_loader: - type  f32:  401 tensors
-llama_model_loader: - type  f16:  107 tensors
-llama_model_loader: - type q8_0:  455 tensors
-print_info: file format = GGUF V3 (latest)
-print_info: file type   = Q8_0
-print_info: file size   = 78.21 GiB (9.24 BPW) 
-load: special tokens cache size = 22
-load: token to piece cache size = 0.9310 MB
-print_info: arch             = qwen2
-print_info: vocab_only       = 0
-print_info: n_ctx_train      = 131072
-print_info: n_embd           = 8192
-print_info: n_layer          = 80
-print_info: n_head           = 64
-print_info: n_head_kv        = 8
-print_info: n_rot            = 128
-print_info: n_swa            = 0
-print_info: is_swa_any       = 0
-print_info: n_embd_head_k    = 128
-print_info: n_embd_head_v    = 128
-print_info: n_gqa            = 8
-print_info: n_embd_k_gqa     = 1024
-print_info: n_embd_v_gqa     = 1024
-print_info: f_norm_eps       = 0.0e+00
-print_info: f_norm_rms_eps   = 1.0e-06
-print_info: f_clamp_kqv      = 0.0e+00
-print_info: f_max_alibi_bias = 0.0e+00
-print_info: f_logit_scale    = 0.0e+00
-print_info: f_attn_scale     = 0.0e+00
-print_info: n_ff             = 29568
-print_info: n_expert         = 0
-print_info: n_expert_used    = 0
-print_info: causal attn      = 1
-print_info: pooling type     = -1
-print_info: rope type        = 2
-print_info: rope scaling     = linear
-print_info: freq_base_train  = 1000000.0
-print_info: freq_scale_train = 1
-print_info: n_ctx_orig_yarn  = 131072
-print_info: rope_finetuned   = unknown
-print_info: model type       = 70B
-print_info: model params     = 72.71 B
-print_info: general.name     = Kimi-Dev-72B
-print_info: vocab type       = BPE
-print_info: n_vocab          = 152064
-print_info: n_merges         = 151387
-print_info: BOS token        = 11 ','
-print_info: EOS token        = 151645 '<|im_end|>'
-print_info: EOT token        = 151645 '<|im_end|>'
-print_info: PAD token        = 151654 '<|vision_pad|>'
-print_info: LF token         = 198 'Ċ'
-print_info: FIM PRE token    = 151659 '<|fim_prefix|>'
-print_info: FIM SUF token    = 151661 '<|fim_suffix|>'
-print_info: FIM MID token    = 151660 '<|fim_middle|>'
-print_info: FIM PAD token    = 151662 '<|fim_pad|>'
-print_info: FIM REP token    = 151663 '<|repo_name|>'
-print_info: FIM SEP token    = 151664 '<|file_sep|>'
-print_info: EOG token        = 151643 '<|endoftext|>'
-print_info: EOG token        = 151645 '<|im_end|>'
-print_info: EOG token        = 151662 '<|fim_pad|>'
-print_info: EOG token        = 151663 '<|repo_name|>'
-print_info: EOG token        = 151664 '<|file_sep|>'
-print_info: max token length = 256
-load_tensors: loading model tensors, this can take a while... (mmap = false)
-load_tensors: offloading 80 repeating layers to GPU
-load_tensors: offloading output layer to GPU
-load_tensors: offloaded 81/81 layers to GPU
-load_tensors:        ROCm0 model buffer size = 77715.11 MiB
-load_tensors:    ROCm_Host model buffer size =  2376.00 MiB
-.................................................................................................
-llama_context: constructing llama_context
-llama_context: non-unified KV cache requires ggml_set_rows() - forcing unified KV cache
-llama_context: n_seq_max     = 1
-llama_context: n_ctx         = 4096
-llama_context: n_ctx_per_seq = 4096
-llama_context: n_batch       = 2048
-llama_context: n_ubatch      = 512
-llama_context: causal_attn   = 1
-llama_context: flash_attn    = 1
-llama_context: kv_unified    = true
-llama_context: freq_base     = 1000000.0
-llama_context: freq_scale    = 1
-llama_context: n_ctx_per_seq (4096) < n_ctx_train (131072) -- the full capacity of the model will not be utilized
-llama_context:  ROCm_Host  output buffer size =     0.58 MiB
-llama_kv_cache_unified:      ROCm0 KV buffer size =  1280.00 MiB
-llama_kv_cache_unified: size = 1280.00 MiB (  4096 cells,  80 layers,  1/ 1 seqs), K (f16):  640.00 MiB, V (f16):  640.00 MiB
-llama_kv_cache_unified: LLAMA_SET_ROWS=0, using old ggml_cpy() method for backwards compatibility
-llama_context:      ROCm0 compute buffer size =   313.00 MiB
-llama_context:  ROCm_Host compute buffer size =     8.01 MiB
-llama_context: graph nodes  = 2887
-llama_context: graph splits = 1
-common_init_from_params: added <|endoftext|> logit bias = -inf
-common_init_from_params: added <|im_end|> logit bias = -inf
-common_init_from_params: added <|fim_pad|> logit bias = -inf
-common_init_from_params: added <|repo_name|> logit bias = -inf
-common_init_from_params: added <|file_sep|> logit bias = -inf
-common_init_from_params: setting dry_penalty_last_n to ctx_size = 4096
-common_init_from_params: warming up the model with an empty run - please wait ... (--no-warmup to disable)
-main: llama threadpool init, n_threads = 16
-
-system_info: n_threads = 16 (n_threads_batch = 16) / 32 | ROCm : NO_VMM = 1 | PEER_MAX_BATCH_SIZE = 128 | CPU : SSE3 = 1 | SSSE3 = 1 | AVX = 1 | AVX2 = 1 | F16C = 1 | FMA = 1 | BMI2 = 1 | AVX512 = 1 | AVX512_VBMI = 1 | AVX512_VNNI = 1 | LLAMAFILE = 1 | OPENMP = 1 | REPACK = 1 | 
-
-sampler seed: 3691857665
-sampler params: 
-	repeat_last_n = 64, repeat_penalty = 1.000, frequency_penalty = 0.000, presence_penalty = 0.000
-	dry_multiplier = 0.000, dry_base = 1.750, dry_allowed_length = 2, dry_penalty_last_n = 4096
-	top_k = 40, top_p = 0.950, min_p = 0.050, xtc_probability = 0.000, xtc_threshold = 0.100, typical_p = 1.000, top_n_sigma = -1.000, temp = 0.800
-	mirostat = 0, mirostat_lr = 0.100, mirostat_ent = 5.000
-sampler chain: logits -> logit-bias -> penalties -> dry -> top-n-sigma -> top-k -> typical -> top-p -> min-p -> xtc -> temp-ext -> dist 
-generate: n_ctx = 4096, n_batch = 2048, n_predict = 1, n_keep = 0
-
-Hello0
-
-llama_perf_sampler_print:    sampling time =       0.07 ms /     2 runs   (    0.04 ms per token, 27027.03 tokens per second)
-llama_perf_context_print:        load time =   30932.72 ms
-llama_perf_context_print: prompt eval time =       0.00 ms /     1 tokens (    0.00 ms per token,      inf tokens per second)
-llama_perf_context_print:        eval time =     559.63 ms /     1 runs   (  559.63 ms per token,     1.79 tokens per second)
-llama_perf_context_print:       total time =     566.03 ms /     2 tokens
-llama_perf_context_print:    graphs reused =          0
-    Elapsed #3: 32.156014765s
-    Run #3 status: 0
-  → Avg over 3 runs: 30.024s
@@ -1,172 +0,0 @@
-ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
-ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
-ggml_cuda_init: found 1 ROCm devices:
-  Device 0: AMD Radeon Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32
-build: 6066 (4cb208c9) with cc (GCC) 15.1.1 20250719 (Red Hat 15.1.1-5) for x86_64-redhat-linux
-main: llama backend init
-main: load the model and apply lora adapter, if any
-llama_model_load_from_file_impl: using device ROCm0 (AMD Radeon Graphics) - 124523 MiB free
-llama_model_loader: additional 1 GGUFs metadata loaded.
-llama_model_loader: loaded meta data with 39 key-value pairs and 963 tensors from /home/kyuz0/models/kimi-dev-72B-Q8_K_XL/UD-Q8_K_XL/Kimi-Dev-72B-UD-Q8_K_XL-00001-of-00002.gguf (version GGUF V3 (latest))
-llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
-llama_model_loader: - kv   0:                       general.architecture str              = qwen2
-llama_model_loader: - kv   1:                               general.type str              = model
-llama_model_loader: - kv   2:                               general.name str              = Kimi-Dev-72B
-llama_model_loader: - kv   3:                           general.basename str              = Kimi-Dev-72B
-llama_model_loader: - kv   4:                       general.quantized_by str              = Unsloth
-llama_model_loader: - kv   5:                         general.size_label str              = 72B
-llama_model_loader: - kv   6:                            general.license str              = mit
-llama_model_loader: - kv   7:                           general.repo_url str              = https://huggingface.co/unsloth
-llama_model_loader: - kv   8:                   general.base_model.count u32              = 1
-llama_model_loader: - kv   9:                  general.base_model.0.name str              = Kimi Dev 72B
-llama_model_loader: - kv  10:          general.base_model.0.organization str              = Moonshotai
-llama_model_loader: - kv  11:              general.base_model.0.repo_url str              = https://huggingface.co/moonshotai/Kim...
-llama_model_loader: - kv  12:                               general.tags arr[str,5]       = ["code", "unsloth", "swebench", "soft...
-llama_model_loader: - kv  13:                          qwen2.block_count u32              = 80
-llama_model_loader: - kv  14:                       qwen2.context_length u32              = 131072
-llama_model_loader: - kv  15:                     qwen2.embedding_length u32              = 8192
-llama_model_loader: - kv  16:                  qwen2.feed_forward_length u32              = 29568
-llama_model_loader: - kv  17:                 qwen2.attention.head_count u32              = 64
-llama_model_loader: - kv  18:              qwen2.attention.head_count_kv u32              = 8
-llama_model_loader: - kv  19:                       qwen2.rope.freq_base f32              = 1000000.000000
-llama_model_loader: - kv  20:     qwen2.attention.layer_norm_rms_epsilon f32              = 0.000001
-llama_model_loader: - kv  21:                       tokenizer.ggml.model str              = gpt2
-llama_model_loader: - kv  22:                         tokenizer.ggml.pre str              = qwen2
-llama_model_loader: - kv  23:                      tokenizer.ggml.tokens arr[str,152064]  = ["!", "\"", "#", "$", "%", "&", "'", ...
-llama_model_loader: - kv  24:                  tokenizer.ggml.token_type arr[i32,152064]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
-llama_model_loader: - kv  25:                      tokenizer.ggml.merges arr[str,151387]  = ["Ġ Ġ", "ĠĠ ĠĠ", "i n", "Ġ t",...
-llama_model_loader: - kv  26:                tokenizer.ggml.eos_token_id u32              = 151645
-llama_model_loader: - kv  27:            tokenizer.ggml.padding_token_id u32              = 151654
-llama_model_loader: - kv  28:               tokenizer.ggml.add_bos_token bool             = false
-llama_model_loader: - kv  29:                    tokenizer.chat_template str              = {%- if tools %}\n    {{- '<|im_start|>...
-llama_model_loader: - kv  30:               general.quantization_version u32              = 2
-llama_model_loader: - kv  31:                          general.file_type u32              = 7
-llama_model_loader: - kv  32:                      quantize.imatrix.file str              = Kimi-Dev-72B-GGUF/imatrix_unsloth.dat
-llama_model_loader: - kv  33:                   quantize.imatrix.dataset str              = unsloth_calibration_Kimi-Dev-72B.txt
-llama_model_loader: - kv  34:             quantize.imatrix.entries_count u32              = 560
-llama_model_loader: - kv  35:              quantize.imatrix.chunks_count u32              = 685
-llama_model_loader: - kv  36:                                   split.no u16              = 0
-llama_model_loader: - kv  37:                        split.tensors.count i32              = 963
-llama_model_loader: - kv  38:                                split.count u16              = 2
-llama_model_loader: - type  f32:  401 tensors
-llama_model_loader: - type  f16:  107 tensors
-llama_model_loader: - type q8_0:  455 tensors
-print_info: file format = GGUF V3 (latest)
-print_info: file type   = Q8_0
-print_info: file size   = 78.21 GiB (9.24 BPW) 
-load: special tokens cache size = 22
-load: token to piece cache size = 0.9310 MB
-print_info: arch             = qwen2
-print_info: vocab_only       = 0
-print_info: n_ctx_train      = 131072
-print_info: n_embd           = 8192
-print_info: n_layer          = 80
-print_info: n_head           = 64
-print_info: n_head_kv        = 8
-print_info: n_rot            = 128
-print_info: n_swa            = 0
-print_info: is_swa_any       = 0
-print_info: n_embd_head_k    = 128
-print_info: n_embd_head_v    = 128
-print_info: n_gqa            = 8
-print_info: n_embd_k_gqa     = 1024
-print_info: n_embd_v_gqa     = 1024
-print_info: f_norm_eps       = 0.0e+00
-print_info: f_norm_rms_eps   = 1.0e-06
-print_info: f_clamp_kqv      = 0.0e+00
-print_info: f_max_alibi_bias = 0.0e+00
-print_info: f_logit_scale    = 0.0e+00
-print_info: f_attn_scale     = 0.0e+00
-print_info: n_ff             = 29568
-print_info: n_expert         = 0
-print_info: n_expert_used    = 0
-print_info: causal attn      = 1
-print_info: pooling type     = -1
-print_info: rope type        = 2
-print_info: rope scaling     = linear
-print_info: freq_base_train  = 1000000.0
-print_info: freq_scale_train = 1
-print_info: n_ctx_orig_yarn  = 131072
-print_info: rope_finetuned   = unknown
-print_info: model type       = 70B
-print_info: model params     = 72.71 B
-print_info: general.name     = Kimi-Dev-72B
-print_info: vocab type       = BPE
-print_info: n_vocab          = 152064
-print_info: n_merges         = 151387
-print_info: BOS token        = 11 ','
-print_info: EOS token        = 151645 '<|im_end|>'
-print_info: EOT token        = 151645 '<|im_end|>'
-print_info: PAD token        = 151654 '<|vision_pad|>'
-print_info: LF token         = 198 'Ċ'
-print_info: FIM PRE token    = 151659 '<|fim_prefix|>'
-print_info: FIM SUF token    = 151661 '<|fim_suffix|>'
-print_info: FIM MID token    = 151660 '<|fim_middle|>'
-print_info: FIM PAD token    = 151662 '<|fim_pad|>'
-print_info: FIM REP token    = 151663 '<|repo_name|>'
-print_info: FIM SEP token    = 151664 '<|file_sep|>'
-print_info: EOG token        = 151643 '<|endoftext|>'
-print_info: EOG token        = 151645 '<|im_end|>'
-print_info: EOG token        = 151662 '<|fim_pad|>'
-print_info: EOG token        = 151663 '<|repo_name|>'
-print_info: EOG token        = 151664 '<|file_sep|>'
-print_info: max token length = 256
-load_tensors: loading model tensors, this can take a while... (mmap = false)
-load_tensors: offloading 80 repeating layers to GPU
-load_tensors: offloading output layer to GPU
-load_tensors: offloaded 81/81 layers to GPU
-load_tensors:        ROCm0 model buffer size = 77715.11 MiB
-load_tensors:    ROCm_Host model buffer size =  2376.00 MiB
-.................................................................................................
-llama_context: constructing llama_context
-llama_context: non-unified KV cache requires ggml_set_rows() - forcing unified KV cache
-llama_context: n_seq_max     = 1
-llama_context: n_ctx         = 4096
-llama_context: n_ctx_per_seq = 4096
-llama_context: n_batch       = 2048
-llama_context: n_ubatch      = 512
-llama_context: causal_attn   = 1
-llama_context: flash_attn    = 1
-llama_context: kv_unified    = true
-llama_context: freq_base     = 1000000.0
-llama_context: freq_scale    = 1
-llama_context: n_ctx_per_seq (4096) < n_ctx_train (131072) -- the full capacity of the model will not be utilized
-llama_context:  ROCm_Host  output buffer size =     0.58 MiB
-llama_kv_cache_unified:      ROCm0 KV buffer size =  1280.00 MiB
-llama_kv_cache_unified: size = 1280.00 MiB (  4096 cells,  80 layers,  1/ 1 seqs), K (f16):  640.00 MiB, V (f16):  640.00 MiB
-llama_kv_cache_unified: LLAMA_SET_ROWS=0, using old ggml_cpy() method for backwards compatibility
-llama_context:      ROCm0 compute buffer size =   313.00 MiB
-llama_context:  ROCm_Host compute buffer size =     8.01 MiB
-llama_context: graph nodes  = 2887
-llama_context: graph splits = 1
-common_init_from_params: added <|endoftext|> logit bias = -inf
-common_init_from_params: added <|im_end|> logit bias = -inf
-common_init_from_params: added <|fim_pad|> logit bias = -inf
-common_init_from_params: added <|repo_name|> logit bias = -inf
-common_init_from_params: added <|file_sep|> logit bias = -inf
-common_init_from_params: setting dry_penalty_last_n to ctx_size = 4096
-common_init_from_params: warming up the model with an empty run - please wait ... (--no-warmup to disable)
-main: llama threadpool init, n_threads = 16
-
-system_info: n_threads = 16 (n_threads_batch = 16) / 32 | ROCm : NO_VMM = 1 | PEER_MAX_BATCH_SIZE = 128 | CPU : SSE3 = 1 | SSSE3 = 1 | AVX = 1 | AVX2 = 1 | F16C = 1 | FMA = 1 | BMI2 = 1 | AVX512 = 1 | AVX512_VBMI = 1 | AVX512_VNNI = 1 | LLAMAFILE = 1 | OPENMP = 1 | REPACK = 1 | 
-
-sampler seed: 3133611532
-sampler params: 
-	repeat_last_n = 64, repeat_penalty = 1.000, frequency_penalty = 0.000, presence_penalty = 0.000
-	dry_multiplier = 0.000, dry_base = 1.750, dry_allowed_length = 2, dry_penalty_last_n = 4096
-	top_k = 40, top_p = 0.950, min_p = 0.050, xtc_probability = 0.000, xtc_threshold = 0.100, typical_p = 1.000, top_n_sigma = -1.000, temp = 0.800
-	mirostat = 0, mirostat_lr = 0.100, mirostat_ent = 5.000
-sampler chain: logits -> logit-bias -> penalties -> dry -> top-n-sigma -> top-k -> typical -> top-p -> min-p -> xtc -> temp-ext -> dist 
-generate: n_ctx = 4096, n_batch = 2048, n_predict = 1, n_keep = 0
-
-Hello0
-
-llama_perf_sampler_print:    sampling time =       0.06 ms /     2 runs   (    0.03 ms per token, 35087.72 tokens per second)
-llama_perf_context_print:        load time =   25127.98 ms
-llama_perf_context_print: prompt eval time =       0.00 ms /     1 tokens (    0.00 ms per token,      inf tokens per second)
-llama_perf_context_print:        eval time =     383.37 ms /     1 runs   (  383.37 ms per token,     2.61 tokens per second)
-llama_perf_context_print:       total time =     389.90 ms /     2 tokens
-llama_perf_context_print:    graphs reused =          0
-    Elapsed #3: 26.238043008s
-    Run #3 status: 0
-  → Avg over 3 runs: 26.362s
@@ -1,123 +0,0 @@
-ggml_vulkan: Found 1 Vulkan devices:
-ggml_vulkan: 0 = Radeon 8060S Graphics (AMD open-source driver) | uma: 1 | fp16: 1 | bf16: 0 | warp size: 64 | shared memory: 32768 | int dot: 1 | matrix cores: KHR_coopmat
-build: 6060 (9c35706b) with cc (GCC) 15.1.1 20250719 (Red Hat 15.1.1-5) for x86_64-redhat-linux
-main: llama backend init
-main: load the model and apply lora adapter, if any
-llama_model_load_from_file_impl: using device Vulkan0 (Radeon 8060S Graphics) - 85720 MiB free
-llama_model_loader: additional 1 GGUFs metadata loaded.
-llama_model_loader: loaded meta data with 39 key-value pairs and 963 tensors from /home/kyuz0/models/kimi-dev-72B-Q8_K_XL/UD-Q8_K_XL/Kimi-Dev-72B-UD-Q8_K_XL-00001-of-00002.gguf (version GGUF V3 (latest))
-llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
-llama_model_loader: - kv   0:                       general.architecture str              = qwen2
-llama_model_loader: - kv   1:                               general.type str              = model
-llama_model_loader: - kv   2:                               general.name str              = Kimi-Dev-72B
-llama_model_loader: - kv   3:                           general.basename str              = Kimi-Dev-72B
-llama_model_loader: - kv   4:                       general.quantized_by str              = Unsloth
-llama_model_loader: - kv   5:                         general.size_label str              = 72B
-llama_model_loader: - kv   6:                            general.license str              = mit
-llama_model_loader: - kv   7:                           general.repo_url str              = https://huggingface.co/unsloth
-llama_model_loader: - kv   8:                   general.base_model.count u32              = 1
-llama_model_loader: - kv   9:                  general.base_model.0.name str              = Kimi Dev 72B
-llama_model_loader: - kv  10:          general.base_model.0.organization str              = Moonshotai
-llama_model_loader: - kv  11:              general.base_model.0.repo_url str              = https://huggingface.co/moonshotai/Kim...
-llama_model_loader: - kv  12:                               general.tags arr[str,5]       = ["code", "unsloth", "swebench", "soft...
-llama_model_loader: - kv  13:                          qwen2.block_count u32              = 80
-llama_model_loader: - kv  14:                       qwen2.context_length u32              = 131072
-llama_model_loader: - kv  15:                     qwen2.embedding_length u32              = 8192
-llama_model_loader: - kv  16:                  qwen2.feed_forward_length u32              = 29568
-llama_model_loader: - kv  17:                 qwen2.attention.head_count u32              = 64
-llama_model_loader: - kv  18:              qwen2.attention.head_count_kv u32              = 8
-llama_model_loader: - kv  19:                       qwen2.rope.freq_base f32              = 1000000.000000
-llama_model_loader: - kv  20:     qwen2.attention.layer_norm_rms_epsilon f32              = 0.000001
-llama_model_loader: - kv  21:                       tokenizer.ggml.model str              = gpt2
-llama_model_loader: - kv  22:                         tokenizer.ggml.pre str              = qwen2
-llama_model_loader: - kv  23:                      tokenizer.ggml.tokens arr[str,152064]  = ["!", "\"", "#", "$", "%", "&", "'", ...
-llama_model_loader: - kv  24:                  tokenizer.ggml.token_type arr[i32,152064]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
-llama_model_loader: - kv  25:                      tokenizer.ggml.merges arr[str,151387]  = ["Ġ Ġ", "ĠĠ ĠĠ", "i n", "Ġ t",...
-llama_model_loader: - kv  26:                tokenizer.ggml.eos_token_id u32              = 151645
-llama_model_loader: - kv  27:            tokenizer.ggml.padding_token_id u32              = 151654
-llama_model_loader: - kv  28:               tokenizer.ggml.add_bos_token bool             = false
-llama_model_loader: - kv  29:                    tokenizer.chat_template str              = {%- if tools %}\n    {{- '<|im_start|>...
-llama_model_loader: - kv  30:               general.quantization_version u32              = 2
-llama_model_loader: - kv  31:                          general.file_type u32              = 7
-llama_model_loader: - kv  32:                      quantize.imatrix.file str              = Kimi-Dev-72B-GGUF/imatrix_unsloth.dat
-llama_model_loader: - kv  33:                   quantize.imatrix.dataset str              = unsloth_calibration_Kimi-Dev-72B.txt
-llama_model_loader: - kv  34:             quantize.imatrix.entries_count u32              = 560
-llama_model_loader: - kv  35:              quantize.imatrix.chunks_count u32              = 685
-llama_model_loader: - kv  36:                                   split.no u16              = 0
-llama_model_loader: - kv  37:                        split.tensors.count i32              = 963
-llama_model_loader: - kv  38:                                split.count u16              = 2
-llama_model_loader: - type  f32:  401 tensors
-llama_model_loader: - type  f16:  107 tensors
-llama_model_loader: - type q8_0:  455 tensors
-print_info: file format = GGUF V3 (latest)
-print_info: file type   = Q8_0
-print_info: file size   = 78.21 GiB (9.24 BPW) 
-load: special tokens cache size = 22
-load: token to piece cache size = 0.9310 MB
-print_info: arch             = qwen2
-print_info: vocab_only       = 0
-print_info: n_ctx_train      = 131072
-print_info: n_embd           = 8192
-print_info: n_layer          = 80
-print_info: n_head           = 64
-print_info: n_head_kv        = 8
-print_info: n_rot            = 128
-print_info: n_swa            = 0
-print_info: is_swa_any       = 0
-print_info: n_embd_head_k    = 128
-print_info: n_embd_head_v    = 128
-print_info: n_gqa            = 8
-print_info: n_embd_k_gqa     = 1024
-print_info: n_embd_v_gqa     = 1024
-print_info: f_norm_eps       = 0.0e+00
-print_info: f_norm_rms_eps   = 1.0e-06
-print_info: f_clamp_kqv      = 0.0e+00
-print_info: f_max_alibi_bias = 0.0e+00
-print_info: f_logit_scale    = 0.0e+00
-print_info: f_attn_scale     = 0.0e+00
-print_info: n_ff             = 29568
-print_info: n_expert         = 0
-print_info: n_expert_used    = 0
-print_info: causal attn      = 1
-print_info: pooling type     = -1
-print_info: rope type        = 2
-print_info: rope scaling     = linear
-print_info: freq_base_train  = 1000000.0
-print_info: freq_scale_train = 1
-print_info: n_ctx_orig_yarn  = 131072
-print_info: rope_finetuned   = unknown
-print_info: model type       = 70B
-print_info: model params     = 72.71 B
-print_info: general.name     = Kimi-Dev-72B
-print_info: vocab type       = BPE
-print_info: n_vocab          = 152064
-print_info: n_merges         = 151387
-print_info: BOS token        = 11 ','
-print_info: EOS token        = 151645 '<|im_end|>'
-print_info: EOT token        = 151645 '<|im_end|>'
-print_info: PAD token        = 151654 '<|vision_pad|>'
-print_info: LF token         = 198 'Ċ'
-print_info: FIM PRE token    = 151659 '<|fim_prefix|>'
-print_info: FIM SUF token    = 151661 '<|fim_suffix|>'
-print_info: FIM MID token    = 151660 '<|fim_middle|>'
-print_info: FIM PAD token    = 151662 '<|fim_pad|>'
-print_info: FIM REP token    = 151663 '<|repo_name|>'
-print_info: FIM SEP token    = 151664 '<|file_sep|>'
-print_info: EOG token        = 151643 '<|endoftext|>'
-print_info: EOG token        = 151645 '<|im_end|>'
-print_info: EOG token        = 151662 '<|fim_pad|>'
-print_info: EOG token        = 151663 '<|repo_name|>'
-print_info: EOG token        = 151664 '<|file_sep|>'
-print_info: max token length = 256
-load_tensors: loading model tensors, this can take a while... (mmap = false)
-ggml_vulkan: Device memory allocation of size 2491416576 failed.
-ggml_vulkan: Requested buffer size exceeds device memory allocation limit: ErrorOutOfDeviceMemory
-alloc_tensor_range: failed to allocate Vulkan0 buffer of size 2491416576
-llama_model_load: error loading model: unable to allocate Vulkan0 buffer
-llama_model_load_from_file_impl: failed to load model
-common_init_from_params: failed to load model '/home/kyuz0/models/kimi-dev-72B-Q8_K_XL/UD-Q8_K_XL/Kimi-Dev-72B-UD-Q8_K_XL-00001-of-00002.gguf'
-main: error: unable to load model
-    Elapsed #3: .334893088s
-    Run #3 status: 1
-    ✖ run #3 failed
-  → No successful runs
@@ -1,170 +0,0 @@
-ggml_vulkan: Found 1 Vulkan devices:
-ggml_vulkan: 0 = Radeon 8060S Graphics (RADV GFX1151) (radv) | uma: 1 | fp16: 1 | bf16: 0 | warp size: 64 | shared memory: 65536 | int dot: 1 | matrix cores: KHR_coopmat
-build: 6040 (66625a59) with cc (GCC) 15.1.1 20250719 (Red Hat 15.1.1-5) for x86_64-redhat-linux
-main: llama backend init
-main: load the model and apply lora adapter, if any
-llama_model_load_from_file_impl: using device Vulkan0 (Radeon 8060S Graphics (RADV GFX1151)) - 87722 MiB free
-llama_model_loader: additional 1 GGUFs metadata loaded.
-llama_model_loader: loaded meta data with 39 key-value pairs and 963 tensors from /home/kyuz0/models/kimi-dev-72B-Q8_K_XL/UD-Q8_K_XL/Kimi-Dev-72B-UD-Q8_K_XL-00001-of-00002.gguf (version GGUF V3 (latest))
-llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
-llama_model_loader: - kv   0:                       general.architecture str              = qwen2
-llama_model_loader: - kv   1:                               general.type str              = model
-llama_model_loader: - kv   2:                               general.name str              = Kimi-Dev-72B
-llama_model_loader: - kv   3:                           general.basename str              = Kimi-Dev-72B
-llama_model_loader: - kv   4:                       general.quantized_by str              = Unsloth
-llama_model_loader: - kv   5:                         general.size_label str              = 72B
-llama_model_loader: - kv   6:                            general.license str              = mit
-llama_model_loader: - kv   7:                           general.repo_url str              = https://huggingface.co/unsloth
-llama_model_loader: - kv   8:                   general.base_model.count u32              = 1
-llama_model_loader: - kv   9:                  general.base_model.0.name str              = Kimi Dev 72B
-llama_model_loader: - kv  10:          general.base_model.0.organization str              = Moonshotai
-llama_model_loader: - kv  11:              general.base_model.0.repo_url str              = https://huggingface.co/moonshotai/Kim...
-llama_model_loader: - kv  12:                               general.tags arr[str,5]       = ["code", "unsloth", "swebench", "soft...
-llama_model_loader: - kv  13:                          qwen2.block_count u32              = 80
-llama_model_loader: - kv  14:                       qwen2.context_length u32              = 131072
-llama_model_loader: - kv  15:                     qwen2.embedding_length u32              = 8192
-llama_model_loader: - kv  16:                  qwen2.feed_forward_length u32              = 29568
-llama_model_loader: - kv  17:                 qwen2.attention.head_count u32              = 64
-llama_model_loader: - kv  18:              qwen2.attention.head_count_kv u32              = 8
-llama_model_loader: - kv  19:                       qwen2.rope.freq_base f32              = 1000000.000000
-llama_model_loader: - kv  20:     qwen2.attention.layer_norm_rms_epsilon f32              = 0.000001
-llama_model_loader: - kv  21:                       tokenizer.ggml.model str              = gpt2
-llama_model_loader: - kv  22:                         tokenizer.ggml.pre str              = qwen2
-llama_model_loader: - kv  23:                      tokenizer.ggml.tokens arr[str,152064]  = ["!", "\"", "#", "$", "%", "&", "'", ...
-llama_model_loader: - kv  24:                  tokenizer.ggml.token_type arr[i32,152064]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
-llama_model_loader: - kv  25:                      tokenizer.ggml.merges arr[str,151387]  = ["Ġ Ġ", "ĠĠ ĠĠ", "i n", "Ġ t",...
-llama_model_loader: - kv  26:                tokenizer.ggml.eos_token_id u32              = 151645
-llama_model_loader: - kv  27:            tokenizer.ggml.padding_token_id u32              = 151654
-llama_model_loader: - kv  28:               tokenizer.ggml.add_bos_token bool             = false
-llama_model_loader: - kv  29:                    tokenizer.chat_template str              = {%- if tools %}\n    {{- '<|im_start|>...
-llama_model_loader: - kv  30:               general.quantization_version u32              = 2
-llama_model_loader: - kv  31:                          general.file_type u32              = 7
-llama_model_loader: - kv  32:                      quantize.imatrix.file str              = Kimi-Dev-72B-GGUF/imatrix_unsloth.dat
-llama_model_loader: - kv  33:                   quantize.imatrix.dataset str              = unsloth_calibration_Kimi-Dev-72B.txt
-llama_model_loader: - kv  34:             quantize.imatrix.entries_count u32              = 560
-llama_model_loader: - kv  35:              quantize.imatrix.chunks_count u32              = 685
-llama_model_loader: - kv  36:                                   split.no u16              = 0
-llama_model_loader: - kv  37:                        split.tensors.count i32              = 963
-llama_model_loader: - kv  38:                                split.count u16              = 2
-llama_model_loader: - type  f32:  401 tensors
-llama_model_loader: - type  f16:  107 tensors
-llama_model_loader: - type q8_0:  455 tensors
-print_info: file format = GGUF V3 (latest)
-print_info: file type   = Q8_0
-print_info: file size   = 78.21 GiB (9.24 BPW) 
-load: special tokens cache size = 22
-load: token to piece cache size = 0.9310 MB
-print_info: arch             = qwen2
-print_info: vocab_only       = 0
-print_info: n_ctx_train      = 131072
-print_info: n_embd           = 8192
-print_info: n_layer          = 80
-print_info: n_head           = 64
-print_info: n_head_kv        = 8
-print_info: n_rot            = 128
-print_info: n_swa            = 0
-print_info: is_swa_any       = 0
-print_info: n_embd_head_k    = 128
-print_info: n_embd_head_v    = 128
-print_info: n_gqa            = 8
-print_info: n_embd_k_gqa     = 1024
-print_info: n_embd_v_gqa     = 1024
-print_info: f_norm_eps       = 0.0e+00
-print_info: f_norm_rms_eps   = 1.0e-06
-print_info: f_clamp_kqv      = 0.0e+00
-print_info: f_max_alibi_bias = 0.0e+00
-print_info: f_logit_scale    = 0.0e+00
-print_info: f_attn_scale     = 0.0e+00
-print_info: n_ff             = 29568
-print_info: n_expert         = 0
-print_info: n_expert_used    = 0
-print_info: causal attn      = 1
-print_info: pooling type     = -1
-print_info: rope type        = 2
-print_info: rope scaling     = linear
-print_info: freq_base_train  = 1000000.0
-print_info: freq_scale_train = 1
-print_info: n_ctx_orig_yarn  = 131072
-print_info: rope_finetuned   = unknown
-print_info: model type       = 70B
-print_info: model params     = 72.71 B
-print_info: general.name     = Kimi-Dev-72B
-print_info: vocab type       = BPE
-print_info: n_vocab          = 152064
-print_info: n_merges         = 151387
-print_info: BOS token        = 11 ','
-print_info: EOS token        = 151645 '<|im_end|>'
-print_info: EOT token        = 151645 '<|im_end|>'
-print_info: PAD token        = 151654 '<|vision_pad|>'
-print_info: LF token         = 198 'Ċ'
-print_info: FIM PRE token    = 151659 '<|fim_prefix|>'
-print_info: FIM SUF token    = 151661 '<|fim_suffix|>'
-print_info: FIM MID token    = 151660 '<|fim_middle|>'
-print_info: FIM PAD token    = 151662 '<|fim_pad|>'
-print_info: FIM REP token    = 151663 '<|repo_name|>'
-print_info: FIM SEP token    = 151664 '<|file_sep|>'
-print_info: EOG token        = 151643 '<|endoftext|>'
-print_info: EOG token        = 151645 '<|im_end|>'
-print_info: EOG token        = 151662 '<|fim_pad|>'
-print_info: EOG token        = 151663 '<|repo_name|>'
-print_info: EOG token        = 151664 '<|file_sep|>'
-print_info: max token length = 256
-load_tensors: loading model tensors, this can take a while... (mmap = false)
-load_tensors: offloading 80 repeating layers to GPU
-load_tensors: offloading output layer to GPU
-load_tensors: offloaded 81/81 layers to GPU
-load_tensors:      Vulkan0 model buffer size = 77715.09 MiB
-load_tensors:  Vulkan_Host model buffer size =  2376.00 MiB
-.................................................................................................
-llama_context: constructing llama_context
-llama_context: non-unified KV cache requires ggml_set_rows() - forcing unified KV cache
-llama_context: n_seq_max     = 1
-llama_context: n_ctx         = 4096
-llama_context: n_ctx_per_seq = 4096
-llama_context: n_batch       = 2048
-llama_context: n_ubatch      = 512
-llama_context: causal_attn   = 1
-llama_context: flash_attn    = 1
-llama_context: kv_unified    = true
-llama_context: freq_base     = 1000000.0
-llama_context: freq_scale    = 1
-llama_context: n_ctx_per_seq (4096) < n_ctx_train (131072) -- the full capacity of the model will not be utilized
-llama_context: Vulkan_Host  output buffer size =     0.58 MiB
-llama_kv_cache_unified:    Vulkan0 KV buffer size =  1280.00 MiB
-llama_kv_cache_unified: size = 1280.00 MiB (  4096 cells,  80 layers,  1/ 1 seqs), K (f16):  640.00 MiB, V (f16):  640.00 MiB
-llama_kv_cache_unified: LLAMA_SET_ROWS=0, using old ggml_cpy() method for backwards compatibility
-llama_context:    Vulkan0 compute buffer size =   313.00 MiB
-llama_context: Vulkan_Host compute buffer size =    24.01 MiB
-llama_context: graph nodes  = 2887
-llama_context: graph splits = 2
-common_init_from_params: added <|endoftext|> logit bias = -inf
-common_init_from_params: added <|im_end|> logit bias = -inf
-common_init_from_params: added <|fim_pad|> logit bias = -inf
-common_init_from_params: added <|repo_name|> logit bias = -inf
-common_init_from_params: added <|file_sep|> logit bias = -inf
-common_init_from_params: setting dry_penalty_last_n to ctx_size = 4096
-common_init_from_params: warming up the model with an empty run - please wait ... (--no-warmup to disable)
-main: llama threadpool init, n_threads = 16
-
-system_info: n_threads = 16 (n_threads_batch = 16) / 32 | CPU : SSE3 = 1 | SSSE3 = 1 | AVX = 1 | AVX2 = 1 | F16C = 1 | FMA = 1 | BMI2 = 1 | AVX512 = 1 | AVX512_VBMI = 1 | AVX512_VNNI = 1 | LLAMAFILE = 1 | OPENMP = 1 | REPACK = 1 | 
-
-sampler seed: 4071074447
-sampler params: 
-	repeat_last_n = 64, repeat_penalty = 1.000, frequency_penalty = 0.000, presence_penalty = 0.000
-	dry_multiplier = 0.000, dry_base = 1.750, dry_allowed_length = 2, dry_penalty_last_n = 4096
-	top_k = 40, top_p = 0.950, min_p = 0.050, xtc_probability = 0.000, xtc_threshold = 0.100, typical_p = 1.000, top_n_sigma = -1.000, temp = 0.800
-	mirostat = 0, mirostat_lr = 0.100, mirostat_ent = 5.000
-sampler chain: logits -> logit-bias -> penalties -> dry -> top-n-sigma -> top-k -> typical -> top-p -> min-p -> xtc -> temp-ext -> dist 
-generate: n_ctx = 4096, n_batch = 2048, n_predict = 1, n_keep = 0
-
-Hello beğen
-
-llama_perf_sampler_print:    sampling time =       0.05 ms /     2 runs   (    0.03 ms per token, 37037.04 tokens per second)
-llama_perf_context_print:        load time =   29902.30 ms
-llama_perf_context_print: prompt eval time =       0.00 ms /     1 tokens (    0.00 ms per token,      inf tokens per second)
-llama_perf_context_print:        eval time =     392.32 ms /     1 runs   (  392.32 ms per token,     2.55 tokens per second)
-llama_perf_context_print:       total time =     399.50 ms /     2 tokens
-llama_perf_context_print:    graphs reused =          0
-    Elapsed #3: 30.654893638s
-    Run #3 status: 0
-  → Avg over 3 runs: 30.591s
@@ -1,163 +0,0 @@
-ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
-ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
-ggml_cuda_init: found 1 ROCm devices:
-  Device 0: Radeon 8060S Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32
-build: 6040 (66625a59) with cc (GCC) 15.1.1 20250521 (Red Hat 15.1.1-2) for x86_64-redhat-linux
-main: llama backend init
-main: load the model and apply lora adapter, if any
-llama_model_load_from_file_impl: using device ROCm0 (Radeon 8060S Graphics) - 124522 MiB free
-llama_model_loader: additional 1 GGUFs metadata loaded.
-llama_model_loader: loaded meta data with 39 key-value pairs and 724 tensors from /home/kyuz0/models/llama-3.3-70B-Instruct/UD-Q8_K_XL/Llama-3.3-70B-Instruct-UD-Q8_K_XL-00001-of-00002.gguf (version GGUF V3 (latest))
-llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
-llama_model_loader: - kv   0:                       general.architecture str              = llama
-llama_model_loader: - kv   1:                               general.type str              = model
-llama_model_loader: - kv   2:                               general.name str              = Llama-3.3-70B-Instruct
-llama_model_loader: - kv   3:                           general.finetune str              = Instruct
-llama_model_loader: - kv   4:                           general.basename str              = Llama-3.3-70B-Instruct
-llama_model_loader: - kv   5:                       general.quantized_by str              = Unsloth
-llama_model_loader: - kv   6:                         general.size_label str              = 70B
-llama_model_loader: - kv   7:                           general.repo_url str              = https://huggingface.co/unsloth
-llama_model_loader: - kv   8:                          llama.block_count u32              = 80
-llama_model_loader: - kv   9:                       llama.context_length u32              = 131072
-llama_model_loader: - kv  10:                     llama.embedding_length u32              = 8192
-llama_model_loader: - kv  11:                  llama.feed_forward_length u32              = 28672
-llama_model_loader: - kv  12:                 llama.attention.head_count u32              = 64
-llama_model_loader: - kv  13:              llama.attention.head_count_kv u32              = 8
-llama_model_loader: - kv  14:                       llama.rope.freq_base f32              = 500000.000000
-llama_model_loader: - kv  15:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
-llama_model_loader: - kv  16:                 llama.attention.key_length u32              = 128
-llama_model_loader: - kv  17:               llama.attention.value_length u32              = 128
-llama_model_loader: - kv  18:                           llama.vocab_size u32              = 128256
-llama_model_loader: - kv  19:                 llama.rope.dimension_count u32              = 128
-llama_model_loader: - kv  20:                       tokenizer.ggml.model str              = gpt2
-llama_model_loader: - kv  21:                         tokenizer.ggml.pre str              = llama-bpe
-llama_model_loader: - kv  22:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
-llama_model_loader: - kv  23:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
-llama_model_loader: - kv  24:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
-llama_model_loader: - kv  25:                tokenizer.ggml.bos_token_id u32              = 128000
-llama_model_loader: - kv  26:                tokenizer.ggml.eos_token_id u32              = 128009
-llama_model_loader: - kv  27:            tokenizer.ggml.padding_token_id u32              = 128004
-llama_model_loader: - kv  28:               tokenizer.ggml.add_bos_token bool             = true
-llama_model_loader: - kv  29:                    tokenizer.chat_template str              = {{- bos_token }}\n{%- if custom_tools ...
-llama_model_loader: - kv  30:               general.quantization_version u32              = 2
-llama_model_loader: - kv  31:                          general.file_type u32              = 7
-llama_model_loader: - kv  32:                      quantize.imatrix.file str              = Llama-3.3-70B-Instruct-GGUF/imatrix_u...
-llama_model_loader: - kv  33:                   quantize.imatrix.dataset str              = unsloth_calibration_Llama-3.3-70B-Ins...
-llama_model_loader: - kv  34:             quantize.imatrix.entries_count i32              = 560
-llama_model_loader: - kv  35:              quantize.imatrix.chunks_count i32              = 689
-llama_model_loader: - kv  36:                                   split.no u16              = 0
-llama_model_loader: - kv  37:                        split.tensors.count i32              = 724
-llama_model_loader: - kv  38:                                split.count u16              = 2
-llama_model_loader: - type  f32:  162 tensors
-llama_model_loader: - type q8_0:  455 tensors
-llama_model_loader: - type bf16:  107 tensors
-print_info: file format = GGUF V3 (latest)
-print_info: file type   = Q8_0
-print_info: file size   = 75.65 GiB (9.21 BPW) 
-load: special tokens cache size = 256
-load: token to piece cache size = 0.7999 MB
-print_info: arch             = llama
-print_info: vocab_only       = 0
-print_info: n_ctx_train      = 131072
-print_info: n_embd           = 8192
-print_info: n_layer          = 80
-print_info: n_head           = 64
-print_info: n_head_kv        = 8
-print_info: n_rot            = 128
-print_info: n_swa            = 0
-print_info: is_swa_any       = 0
-print_info: n_embd_head_k    = 128
-print_info: n_embd_head_v    = 128
-print_info: n_gqa            = 8
-print_info: n_embd_k_gqa     = 1024
-print_info: n_embd_v_gqa     = 1024
-print_info: f_norm_eps       = 0.0e+00
-print_info: f_norm_rms_eps   = 1.0e-05
-print_info: f_clamp_kqv      = 0.0e+00
-print_info: f_max_alibi_bias = 0.0e+00
-print_info: f_logit_scale    = 0.0e+00
-print_info: f_attn_scale     = 0.0e+00
-print_info: n_ff             = 28672
-print_info: n_expert         = 0
-print_info: n_expert_used    = 0
-print_info: causal attn      = 1
-print_info: pooling type     = 0
-print_info: rope type        = 0
-print_info: rope scaling     = linear
-print_info: freq_base_train  = 500000.0
-print_info: freq_scale_train = 1
-print_info: n_ctx_orig_yarn  = 131072
-print_info: rope_finetuned   = unknown
-print_info: model type       = 70B
-print_info: model params     = 70.55 B
-print_info: general.name     = Llama-3.3-70B-Instruct
-print_info: vocab type       = BPE
-print_info: n_vocab          = 128256
-print_info: n_merges         = 280147
-print_info: BOS token        = 128000 '<|begin_of_text|>'
-print_info: EOS token        = 128009 '<|eot_id|>'
-print_info: EOT token        = 128009 '<|eot_id|>'
-print_info: EOM token        = 128008 '<|eom_id|>'
-print_info: PAD token        = 128004 '<|finetune_right_pad_id|>'
-print_info: LF token         = 198 'Ċ'
-print_info: EOG token        = 128001 '<|end_of_text|>'
-print_info: EOG token        = 128008 '<|eom_id|>'
-print_info: EOG token        = 128009 '<|eot_id|>'
-print_info: max token length = 256
-load_tensors: loading model tensors, this can take a while... (mmap = false)
-load_tensors: offloading 80 repeating layers to GPU
-load_tensors: offloading output layer to GPU
-load_tensors: offloaded 81/81 layers to GPU
-load_tensors:        ROCm0 model buffer size = 75456.53 MiB
-load_tensors:    ROCm_Host model buffer size =  2004.00 MiB
-.................................................................................................
-llama_context: constructing llama_context
-llama_context: non-unified KV cache requires ggml_set_rows() - forcing unified KV cache
-llama_context: n_seq_max     = 1
-llama_context: n_ctx         = 4096
-llama_context: n_ctx_per_seq = 4096
-llama_context: n_batch       = 2048
-llama_context: n_ubatch      = 512
-llama_context: causal_attn   = 1
-llama_context: flash_attn    = 1
-llama_context: kv_unified    = true
-llama_context: freq_base     = 500000.0
-llama_context: freq_scale    = 1
-llama_context: n_ctx_per_seq (4096) < n_ctx_train (131072) -- the full capacity of the model will not be utilized
-llama_context:  ROCm_Host  output buffer size =     0.49 MiB
-llama_kv_cache_unified:      ROCm0 KV buffer size =  1280.00 MiB
-llama_kv_cache_unified: size = 1280.00 MiB (  4096 cells,  80 layers,  1/ 1 seqs), K (f16):  640.00 MiB, V (f16):  640.00 MiB
-llama_kv_cache_unified: LLAMA_SET_ROWS=0, using old ggml_cpy() method for backwards compatibility
-llama_context:      ROCm0 compute buffer size =   266.50 MiB
-llama_context:  ROCm_Host compute buffer size =     8.01 MiB
-llama_context: graph nodes  = 2647
-llama_context: graph splits = 1
-common_init_from_params: added <|end_of_text|> logit bias = -inf
-common_init_from_params: added <|eom_id|> logit bias = -inf
-common_init_from_params: added <|eot_id|> logit bias = -inf
-common_init_from_params: setting dry_penalty_last_n to ctx_size = 4096
-common_init_from_params: warming up the model with an empty run - please wait ... (--no-warmup to disable)
-main: llama threadpool init, n_threads = 16
-
-system_info: n_threads = 16 (n_threads_batch = 16) / 32 | ROCm : NO_VMM = 1 | PEER_MAX_BATCH_SIZE = 128 | CPU : SSE3 = 1 | SSSE3 = 1 | AVX = 1 | AVX2 = 1 | F16C = 1 | FMA = 1 | BMI2 = 1 | AVX512 = 1 | AVX512_VBMI = 1 | AVX512_VNNI = 1 | LLAMAFILE = 1 | OPENMP = 1 | REPACK = 1 | 
-
-sampler seed: 192699360
-sampler params: 
-	repeat_last_n = 64, repeat_penalty = 1.000, frequency_penalty = 0.000, presence_penalty = 0.000
-	dry_multiplier = 0.000, dry_base = 1.750, dry_allowed_length = 2, dry_penalty_last_n = 4096
-	top_k = 40, top_p = 0.950, min_p = 0.050, xtc_probability = 0.000, xtc_threshold = 0.100, typical_p = 1.000, top_n_sigma = -1.000, temp = 0.800
-	mirostat = 0, mirostat_lr = 0.100, mirostat_ent = 5.000
-sampler chain: logits -> logit-bias -> penalties -> dry -> top-n-sigma -> top-k -> typical -> top-p -> min-p -> xtc -> temp-ext -> dist 
-generate: n_ctx = 4096, n_batch = 2048, n_predict = 1, n_keep = 1
-
-Hello,
-
-llama_perf_sampler_print:    sampling time =       0.05 ms /     3 runs   (    0.02 ms per token, 63829.79 tokens per second)
-llama_perf_context_print:        load time =   24487.91 ms
-llama_perf_context_print: prompt eval time =     368.54 ms /     2 tokens (  184.27 ms per token,     5.43 tokens per second)
-llama_perf_context_print:        eval time =       0.00 ms /     1 runs   (    0.00 ms per token,      inf tokens per second)
-llama_perf_context_print:       total time =     383.50 ms /     3 tokens
-llama_perf_context_print:    graphs reused =          0
-    Elapsed #3: 28.922457711s
-    Run #3 status: 0
-  → Avg over 3 runs: 30.998s
@@ -1,163 +0,0 @@
-ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
-ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
-ggml_cuda_init: found 1 ROCm devices:
-  Device 0: AMD Radeon Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32
-build: 6040 (66625a59) with cc (GCC) 15.1.1 20250719 (Red Hat 15.1.1-5) for x86_64-redhat-linux
-main: llama backend init
-main: load the model and apply lora adapter, if any
-llama_model_load_from_file_impl: using device ROCm0 (AMD Radeon Graphics) - 124523 MiB free
-llama_model_loader: additional 1 GGUFs metadata loaded.
-llama_model_loader: loaded meta data with 39 key-value pairs and 724 tensors from /home/kyuz0/models/llama-3.3-70B-Instruct/UD-Q8_K_XL/Llama-3.3-70B-Instruct-UD-Q8_K_XL-00001-of-00002.gguf (version GGUF V3 (latest))
-llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
-llama_model_loader: - kv   0:                       general.architecture str              = llama
-llama_model_loader: - kv   1:                               general.type str              = model
-llama_model_loader: - kv   2:                               general.name str              = Llama-3.3-70B-Instruct
-llama_model_loader: - kv   3:                           general.finetune str              = Instruct
-llama_model_loader: - kv   4:                           general.basename str              = Llama-3.3-70B-Instruct
-llama_model_loader: - kv   5:                       general.quantized_by str              = Unsloth
-llama_model_loader: - kv   6:                         general.size_label str              = 70B
-llama_model_loader: - kv   7:                           general.repo_url str              = https://huggingface.co/unsloth
-llama_model_loader: - kv   8:                          llama.block_count u32              = 80
-llama_model_loader: - kv   9:                       llama.context_length u32              = 131072
-llama_model_loader: - kv  10:                     llama.embedding_length u32              = 8192
-llama_model_loader: - kv  11:                  llama.feed_forward_length u32              = 28672
-llama_model_loader: - kv  12:                 llama.attention.head_count u32              = 64
-llama_model_loader: - kv  13:              llama.attention.head_count_kv u32              = 8
-llama_model_loader: - kv  14:                       llama.rope.freq_base f32              = 500000.000000
-llama_model_loader: - kv  15:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
-llama_model_loader: - kv  16:                 llama.attention.key_length u32              = 128
-llama_model_loader: - kv  17:               llama.attention.value_length u32              = 128
-llama_model_loader: - kv  18:                           llama.vocab_size u32              = 128256
-llama_model_loader: - kv  19:                 llama.rope.dimension_count u32              = 128
-llama_model_loader: - kv  20:                       tokenizer.ggml.model str              = gpt2
-llama_model_loader: - kv  21:                         tokenizer.ggml.pre str              = llama-bpe
-llama_model_loader: - kv  22:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
-llama_model_loader: - kv  23:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
-llama_model_loader: - kv  24:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
-llama_model_loader: - kv  25:                tokenizer.ggml.bos_token_id u32              = 128000
-llama_model_loader: - kv  26:                tokenizer.ggml.eos_token_id u32              = 128009
-llama_model_loader: - kv  27:            tokenizer.ggml.padding_token_id u32              = 128004
-llama_model_loader: - kv  28:               tokenizer.ggml.add_bos_token bool             = true
-llama_model_loader: - kv  29:                    tokenizer.chat_template str              = {{- bos_token }}\n{%- if custom_tools ...
-llama_model_loader: - kv  30:               general.quantization_version u32              = 2
-llama_model_loader: - kv  31:                          general.file_type u32              = 7
-llama_model_loader: - kv  32:                      quantize.imatrix.file str              = Llama-3.3-70B-Instruct-GGUF/imatrix_u...
-llama_model_loader: - kv  33:                   quantize.imatrix.dataset str              = unsloth_calibration_Llama-3.3-70B-Ins...
-llama_model_loader: - kv  34:             quantize.imatrix.entries_count i32              = 560
-llama_model_loader: - kv  35:              quantize.imatrix.chunks_count i32              = 689
-llama_model_loader: - kv  36:                                   split.no u16              = 0
-llama_model_loader: - kv  37:                        split.tensors.count i32              = 724
-llama_model_loader: - kv  38:                                split.count u16              = 2
-llama_model_loader: - type  f32:  162 tensors
-llama_model_loader: - type q8_0:  455 tensors
-llama_model_loader: - type bf16:  107 tensors
-print_info: file format = GGUF V3 (latest)
-print_info: file type   = Q8_0
-print_info: file size   = 75.65 GiB (9.21 BPW) 
-load: special tokens cache size = 256
-load: token to piece cache size = 0.7999 MB
-print_info: arch             = llama
-print_info: vocab_only       = 0
-print_info: n_ctx_train      = 131072
-print_info: n_embd           = 8192
-print_info: n_layer          = 80
-print_info: n_head           = 64
-print_info: n_head_kv        = 8
-print_info: n_rot            = 128
-print_info: n_swa            = 0
-print_info: is_swa_any       = 0
-print_info: n_embd_head_k    = 128
-print_info: n_embd_head_v    = 128
-print_info: n_gqa            = 8
-print_info: n_embd_k_gqa     = 1024
-print_info: n_embd_v_gqa     = 1024
-print_info: f_norm_eps       = 0.0e+00
-print_info: f_norm_rms_eps   = 1.0e-05
-print_info: f_clamp_kqv      = 0.0e+00
-print_info: f_max_alibi_bias = 0.0e+00
-print_info: f_logit_scale    = 0.0e+00
-print_info: f_attn_scale     = 0.0e+00
-print_info: n_ff             = 28672
-print_info: n_expert         = 0
-print_info: n_expert_used    = 0
-print_info: causal attn      = 1
-print_info: pooling type     = 0
-print_info: rope type        = 0
-print_info: rope scaling     = linear
-print_info: freq_base_train  = 500000.0
-print_info: freq_scale_train = 1
-print_info: n_ctx_orig_yarn  = 131072
-print_info: rope_finetuned   = unknown
-print_info: model type       = 70B
-print_info: model params     = 70.55 B
-print_info: general.name     = Llama-3.3-70B-Instruct
-print_info: vocab type       = BPE
-print_info: n_vocab          = 128256
-print_info: n_merges         = 280147
-print_info: BOS token        = 128000 '<|begin_of_text|>'
-print_info: EOS token        = 128009 '<|eot_id|>'
-print_info: EOT token        = 128009 '<|eot_id|>'
-print_info: EOM token        = 128008 '<|eom_id|>'
-print_info: PAD token        = 128004 '<|finetune_right_pad_id|>'
-print_info: LF token         = 198 'Ċ'
-print_info: EOG token        = 128001 '<|end_of_text|>'
-print_info: EOG token        = 128008 '<|eom_id|>'
-print_info: EOG token        = 128009 '<|eot_id|>'
-print_info: max token length = 256
-load_tensors: loading model tensors, this can take a while... (mmap = false)
-load_tensors: offloading 80 repeating layers to GPU
-load_tensors: offloading output layer to GPU
-load_tensors: offloaded 81/81 layers to GPU
-load_tensors:        ROCm0 model buffer size = 75456.53 MiB
-load_tensors:    ROCm_Host model buffer size =  2004.00 MiB
-.................................................................................................
-llama_context: constructing llama_context
-llama_context: non-unified KV cache requires ggml_set_rows() - forcing unified KV cache
-llama_context: n_seq_max     = 1
-llama_context: n_ctx         = 4096
-llama_context: n_ctx_per_seq = 4096
-llama_context: n_batch       = 2048
-llama_context: n_ubatch      = 512
-llama_context: causal_attn   = 1
-llama_context: flash_attn    = 1
-llama_context: kv_unified    = true
-llama_context: freq_base     = 500000.0
-llama_context: freq_scale    = 1
-llama_context: n_ctx_per_seq (4096) < n_ctx_train (131072) -- the full capacity of the model will not be utilized
-llama_context:  ROCm_Host  output buffer size =     0.49 MiB
-llama_kv_cache_unified:      ROCm0 KV buffer size =  1280.00 MiB
-llama_kv_cache_unified: size = 1280.00 MiB (  4096 cells,  80 layers,  1/ 1 seqs), K (f16):  640.00 MiB, V (f16):  640.00 MiB
-llama_kv_cache_unified: LLAMA_SET_ROWS=0, using old ggml_cpy() method for backwards compatibility
-llama_context:      ROCm0 compute buffer size =   266.50 MiB
-llama_context:  ROCm_Host compute buffer size =     8.01 MiB
-llama_context: graph nodes  = 2647
-llama_context: graph splits = 1
-common_init_from_params: added <|end_of_text|> logit bias = -inf
-common_init_from_params: added <|eom_id|> logit bias = -inf
-common_init_from_params: added <|eot_id|> logit bias = -inf
-common_init_from_params: setting dry_penalty_last_n to ctx_size = 4096
-common_init_from_params: warming up the model with an empty run - please wait ... (--no-warmup to disable)
-main: llama threadpool init, n_threads = 16
-
-system_info: n_threads = 16 (n_threads_batch = 16) / 32 | ROCm : NO_VMM = 1 | PEER_MAX_BATCH_SIZE = 128 | CPU : SSE3 = 1 | SSSE3 = 1 | AVX = 1 | AVX2 = 1 | F16C = 1 | FMA = 1 | BMI2 = 1 | AVX512 = 1 | AVX512_VBMI = 1 | AVX512_VNNI = 1 | LLAMAFILE = 1 | OPENMP = 1 | REPACK = 1 | 
-
-sampler seed: 3478849877
-sampler params: 
-	repeat_last_n = 64, repeat_penalty = 1.000, frequency_penalty = 0.000, presence_penalty = 0.000
-	dry_multiplier = 0.000, dry_base = 1.750, dry_allowed_length = 2, dry_penalty_last_n = 4096
-	top_k = 40, top_p = 0.950, min_p = 0.050, xtc_probability = 0.000, xtc_threshold = 0.100, typical_p = 1.000, top_n_sigma = -1.000, temp = 0.800
-	mirostat = 0, mirostat_lr = 0.100, mirostat_ent = 5.000
-sampler chain: logits -> logit-bias -> penalties -> dry -> top-n-sigma -> top-k -> typical -> top-p -> min-p -> xtc -> temp-ext -> dist 
-generate: n_ctx = 4096, n_batch = 2048, n_predict = 1, n_keep = 1
-
-Hello H
-
-llama_perf_sampler_print:    sampling time =       0.06 ms /     3 runs   (    0.02 ms per token, 53571.43 tokens per second)
-llama_perf_context_print:        load time =   32005.62 ms
-llama_perf_context_print: prompt eval time =     456.36 ms /     2 tokens (  228.18 ms per token,     4.38 tokens per second)
-llama_perf_context_print:        eval time =       0.00 ms /     1 runs   (    0.00 ms per token,      inf tokens per second)
-llama_perf_context_print:       total time =     471.29 ms /     3 tokens
-llama_perf_context_print:    graphs reused =          0
-    Elapsed #3: 33.222127697s
-    Run #3 status: 0
-  → Avg over 3 runs: 32.796s
@@ -1,163 +0,0 @@
-ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
-ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
-ggml_cuda_init: found 1 ROCm devices:
-  Device 0: AMD Radeon Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32
-build: 6066 (4cb208c9) with cc (GCC) 15.1.1 20250719 (Red Hat 15.1.1-5) for x86_64-redhat-linux
-main: llama backend init
-main: load the model and apply lora adapter, if any
-llama_model_load_from_file_impl: using device ROCm0 (AMD Radeon Graphics) - 124523 MiB free
-llama_model_loader: additional 1 GGUFs metadata loaded.
-llama_model_loader: loaded meta data with 39 key-value pairs and 724 tensors from /home/kyuz0/models/llama-3.3-70B-Instruct/UD-Q8_K_XL/Llama-3.3-70B-Instruct-UD-Q8_K_XL-00001-of-00002.gguf (version GGUF V3 (latest))
-llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
-llama_model_loader: - kv   0:                       general.architecture str              = llama
-llama_model_loader: - kv   1:                               general.type str              = model
-llama_model_loader: - kv   2:                               general.name str              = Llama-3.3-70B-Instruct
-llama_model_loader: - kv   3:                           general.finetune str              = Instruct
-llama_model_loader: - kv   4:                           general.basename str              = Llama-3.3-70B-Instruct
-llama_model_loader: - kv   5:                       general.quantized_by str              = Unsloth
-llama_model_loader: - kv   6:                         general.size_label str              = 70B
-llama_model_loader: - kv   7:                           general.repo_url str              = https://huggingface.co/unsloth
-llama_model_loader: - kv   8:                          llama.block_count u32              = 80
-llama_model_loader: - kv   9:                       llama.context_length u32              = 131072
-llama_model_loader: - kv  10:                     llama.embedding_length u32              = 8192
-llama_model_loader: - kv  11:                  llama.feed_forward_length u32              = 28672
-llama_model_loader: - kv  12:                 llama.attention.head_count u32              = 64
-llama_model_loader: - kv  13:              llama.attention.head_count_kv u32              = 8
-llama_model_loader: - kv  14:                       llama.rope.freq_base f32              = 500000.000000
-llama_model_loader: - kv  15:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
-llama_model_loader: - kv  16:                 llama.attention.key_length u32              = 128
-llama_model_loader: - kv  17:               llama.attention.value_length u32              = 128
-llama_model_loader: - kv  18:                           llama.vocab_size u32              = 128256
-llama_model_loader: - kv  19:                 llama.rope.dimension_count u32              = 128
-llama_model_loader: - kv  20:                       tokenizer.ggml.model str              = gpt2
-llama_model_loader: - kv  21:                         tokenizer.ggml.pre str              = llama-bpe
-llama_model_loader: - kv  22:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
-llama_model_loader: - kv  23:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
-llama_model_loader: - kv  24:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
-llama_model_loader: - kv  25:                tokenizer.ggml.bos_token_id u32              = 128000
-llama_model_loader: - kv  26:                tokenizer.ggml.eos_token_id u32              = 128009
-llama_model_loader: - kv  27:            tokenizer.ggml.padding_token_id u32              = 128004
-llama_model_loader: - kv  28:               tokenizer.ggml.add_bos_token bool             = true
-llama_model_loader: - kv  29:                    tokenizer.chat_template str              = {{- bos_token }}\n{%- if custom_tools ...
-llama_model_loader: - kv  30:               general.quantization_version u32              = 2
-llama_model_loader: - kv  31:                          general.file_type u32              = 7
-llama_model_loader: - kv  32:                      quantize.imatrix.file str              = Llama-3.3-70B-Instruct-GGUF/imatrix_u...
-llama_model_loader: - kv  33:                   quantize.imatrix.dataset str              = unsloth_calibration_Llama-3.3-70B-Ins...
-llama_model_loader: - kv  34:             quantize.imatrix.entries_count i32              = 560
-llama_model_loader: - kv  35:              quantize.imatrix.chunks_count i32              = 689
-llama_model_loader: - kv  36:                                   split.no u16              = 0
-llama_model_loader: - kv  37:                        split.tensors.count i32              = 724
-llama_model_loader: - kv  38:                                split.count u16              = 2
-llama_model_loader: - type  f32:  162 tensors
-llama_model_loader: - type q8_0:  455 tensors
-llama_model_loader: - type bf16:  107 tensors
-print_info: file format = GGUF V3 (latest)
-print_info: file type   = Q8_0
-print_info: file size   = 75.65 GiB (9.21 BPW) 
-load: special tokens cache size = 256
-load: token to piece cache size = 0.7999 MB
-print_info: arch             = llama
-print_info: vocab_only       = 0
-print_info: n_ctx_train      = 131072
-print_info: n_embd           = 8192
-print_info: n_layer          = 80
-print_info: n_head           = 64
-print_info: n_head_kv        = 8
-print_info: n_rot            = 128
-print_info: n_swa            = 0
-print_info: is_swa_any       = 0
-print_info: n_embd_head_k    = 128
-print_info: n_embd_head_v    = 128
-print_info: n_gqa            = 8
-print_info: n_embd_k_gqa     = 1024
-print_info: n_embd_v_gqa     = 1024
-print_info: f_norm_eps       = 0.0e+00
-print_info: f_norm_rms_eps   = 1.0e-05
-print_info: f_clamp_kqv      = 0.0e+00
-print_info: f_max_alibi_bias = 0.0e+00
-print_info: f_logit_scale    = 0.0e+00
-print_info: f_attn_scale     = 0.0e+00
-print_info: n_ff             = 28672
-print_info: n_expert         = 0
-print_info: n_expert_used    = 0
-print_info: causal attn      = 1
-print_info: pooling type     = 0
-print_info: rope type        = 0
-print_info: rope scaling     = linear
-print_info: freq_base_train  = 500000.0
-print_info: freq_scale_train = 1
-print_info: n_ctx_orig_yarn  = 131072
-print_info: rope_finetuned   = unknown
-print_info: model type       = 70B
-print_info: model params     = 70.55 B
-print_info: general.name     = Llama-3.3-70B-Instruct
-print_info: vocab type       = BPE
-print_info: n_vocab          = 128256
-print_info: n_merges         = 280147
-print_info: BOS token        = 128000 '<|begin_of_text|>'
-print_info: EOS token        = 128009 '<|eot_id|>'
-print_info: EOT token        = 128009 '<|eot_id|>'
-print_info: EOM token        = 128008 '<|eom_id|>'
-print_info: PAD token        = 128004 '<|finetune_right_pad_id|>'
-print_info: LF token         = 198 'Ċ'
-print_info: EOG token        = 128001 '<|end_of_text|>'
-print_info: EOG token        = 128008 '<|eom_id|>'
-print_info: EOG token        = 128009 '<|eot_id|>'
-print_info: max token length = 256
-load_tensors: loading model tensors, this can take a while... (mmap = false)
-load_tensors: offloading 80 repeating layers to GPU
-load_tensors: offloading output layer to GPU
-load_tensors: offloaded 81/81 layers to GPU
-load_tensors:        ROCm0 model buffer size = 75456.53 MiB
-load_tensors:    ROCm_Host model buffer size =  2004.00 MiB
-.................................................................................................
-llama_context: constructing llama_context
-llama_context: non-unified KV cache requires ggml_set_rows() - forcing unified KV cache
-llama_context: n_seq_max     = 1
-llama_context: n_ctx         = 4096
-llama_context: n_ctx_per_seq = 4096
-llama_context: n_batch       = 2048
-llama_context: n_ubatch      = 512
-llama_context: causal_attn   = 1
-llama_context: flash_attn    = 1
-llama_context: kv_unified    = true
-llama_context: freq_base     = 500000.0
-llama_context: freq_scale    = 1
-llama_context: n_ctx_per_seq (4096) < n_ctx_train (131072) -- the full capacity of the model will not be utilized
-llama_context:  ROCm_Host  output buffer size =     0.49 MiB
-llama_kv_cache_unified:      ROCm0 KV buffer size =  1280.00 MiB
-llama_kv_cache_unified: size = 1280.00 MiB (  4096 cells,  80 layers,  1/ 1 seqs), K (f16):  640.00 MiB, V (f16):  640.00 MiB
-llama_kv_cache_unified: LLAMA_SET_ROWS=0, using old ggml_cpy() method for backwards compatibility
-llama_context:      ROCm0 compute buffer size =   266.50 MiB
-llama_context:  ROCm_Host compute buffer size =     8.01 MiB
-llama_context: graph nodes  = 2647
-llama_context: graph splits = 1
-common_init_from_params: added <|end_of_text|> logit bias = -inf
-common_init_from_params: added <|eom_id|> logit bias = -inf
-common_init_from_params: added <|eot_id|> logit bias = -inf
-common_init_from_params: setting dry_penalty_last_n to ctx_size = 4096
-common_init_from_params: warming up the model with an empty run - please wait ... (--no-warmup to disable)
-main: llama threadpool init, n_threads = 16
-
-system_info: n_threads = 16 (n_threads_batch = 16) / 32 | ROCm : NO_VMM = 1 | PEER_MAX_BATCH_SIZE = 128 | CPU : SSE3 = 1 | SSSE3 = 1 | AVX = 1 | AVX2 = 1 | F16C = 1 | FMA = 1 | BMI2 = 1 | AVX512 = 1 | AVX512_VBMI = 1 | AVX512_VNNI = 1 | LLAMAFILE = 1 | OPENMP = 1 | REPACK = 1 | 
-
-sampler seed: 4130863841
-sampler params: 
-	repeat_last_n = 64, repeat_penalty = 1.000, frequency_penalty = 0.000, presence_penalty = 0.000
-	dry_multiplier = 0.000, dry_base = 1.750, dry_allowed_length = 2, dry_penalty_last_n = 4096
-	top_k = 40, top_p = 0.950, min_p = 0.050, xtc_probability = 0.000, xtc_threshold = 0.100, typical_p = 1.000, top_n_sigma = -1.000, temp = 0.800
-	mirostat = 0, mirostat_lr = 0.100, mirostat_ent = 5.000
-sampler chain: logits -> logit-bias -> penalties -> dry -> top-n-sigma -> top-k -> typical -> top-p -> min-p -> xtc -> temp-ext -> dist 
-generate: n_ctx = 4096, n_batch = 2048, n_predict = 1, n_keep = 1
-
-Hello:
-
-llama_perf_sampler_print:    sampling time =       0.07 ms /     3 runs   (    0.02 ms per token, 44117.65 tokens per second)
-llama_perf_context_print:        load time =   32184.35 ms
-llama_perf_context_print: prompt eval time =     697.57 ms /     2 tokens (  348.79 ms per token,     2.87 tokens per second)
-llama_perf_context_print:        eval time =       0.00 ms /     1 runs   (    0.00 ms per token,      inf tokens per second)
-llama_perf_context_print:       total time =     712.61 ms /     3 tokens
-llama_perf_context_print:    graphs reused =          0
-    Elapsed #3: 33.659541277s
-    Run #3 status: 0
-  → Avg over 3 runs: 32.911s
@@ -1,161 +0,0 @@
-ggml_vulkan: Found 1 Vulkan devices:
-ggml_vulkan: 0 = Radeon 8060S Graphics (AMD open-source driver) | uma: 1 | fp16: 1 | bf16: 0 | warp size: 64 | shared memory: 32768 | int dot: 1 | matrix cores: KHR_coopmat
-build: 6060 (9c35706b) with cc (GCC) 15.1.1 20250719 (Red Hat 15.1.1-5) for x86_64-redhat-linux
-main: llama backend init
-main: load the model and apply lora adapter, if any
-llama_model_load_from_file_impl: using device Vulkan0 (Radeon 8060S Graphics) - 85720 MiB free
-llama_model_loader: additional 1 GGUFs metadata loaded.
-llama_model_loader: loaded meta data with 39 key-value pairs and 724 tensors from /home/kyuz0/models/llama-3.3-70B-Instruct/UD-Q8_K_XL/Llama-3.3-70B-Instruct-UD-Q8_K_XL-00001-of-00002.gguf (version GGUF V3 (latest))
-llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
-llama_model_loader: - kv   0:                       general.architecture str              = llama
-llama_model_loader: - kv   1:                               general.type str              = model
-llama_model_loader: - kv   2:                               general.name str              = Llama-3.3-70B-Instruct
-llama_model_loader: - kv   3:                           general.finetune str              = Instruct
-llama_model_loader: - kv   4:                           general.basename str              = Llama-3.3-70B-Instruct
-llama_model_loader: - kv   5:                       general.quantized_by str              = Unsloth
-llama_model_loader: - kv   6:                         general.size_label str              = 70B
-llama_model_loader: - kv   7:                           general.repo_url str              = https://huggingface.co/unsloth
-llama_model_loader: - kv   8:                          llama.block_count u32              = 80
-llama_model_loader: - kv   9:                       llama.context_length u32              = 131072
-llama_model_loader: - kv  10:                     llama.embedding_length u32              = 8192
-llama_model_loader: - kv  11:                  llama.feed_forward_length u32              = 28672
-llama_model_loader: - kv  12:                 llama.attention.head_count u32              = 64
-llama_model_loader: - kv  13:              llama.attention.head_count_kv u32              = 8
-llama_model_loader: - kv  14:                       llama.rope.freq_base f32              = 500000.000000
-llama_model_loader: - kv  15:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
-llama_model_loader: - kv  16:                 llama.attention.key_length u32              = 128
-llama_model_loader: - kv  17:               llama.attention.value_length u32              = 128
-llama_model_loader: - kv  18:                           llama.vocab_size u32              = 128256
-llama_model_loader: - kv  19:                 llama.rope.dimension_count u32              = 128
-llama_model_loader: - kv  20:                       tokenizer.ggml.model str              = gpt2
-llama_model_loader: - kv  21:                         tokenizer.ggml.pre str              = llama-bpe
-llama_model_loader: - kv  22:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
-llama_model_loader: - kv  23:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
-llama_model_loader: - kv  24:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
-llama_model_loader: - kv  25:                tokenizer.ggml.bos_token_id u32              = 128000
-llama_model_loader: - kv  26:                tokenizer.ggml.eos_token_id u32              = 128009
-llama_model_loader: - kv  27:            tokenizer.ggml.padding_token_id u32              = 128004
-llama_model_loader: - kv  28:               tokenizer.ggml.add_bos_token bool             = true
-llama_model_loader: - kv  29:                    tokenizer.chat_template str              = {{- bos_token }}\n{%- if custom_tools ...
-llama_model_loader: - kv  30:               general.quantization_version u32              = 2
-llama_model_loader: - kv  31:                          general.file_type u32              = 7
-llama_model_loader: - kv  32:                      quantize.imatrix.file str              = Llama-3.3-70B-Instruct-GGUF/imatrix_u...
-llama_model_loader: - kv  33:                   quantize.imatrix.dataset str              = unsloth_calibration_Llama-3.3-70B-Ins...
-llama_model_loader: - kv  34:             quantize.imatrix.entries_count i32              = 560
-llama_model_loader: - kv  35:              quantize.imatrix.chunks_count i32              = 689
-llama_model_loader: - kv  36:                                   split.no u16              = 0
-llama_model_loader: - kv  37:                        split.tensors.count i32              = 724
-llama_model_loader: - kv  38:                                split.count u16              = 2
-llama_model_loader: - type  f32:  162 tensors
-llama_model_loader: - type q8_0:  455 tensors
-llama_model_loader: - type bf16:  107 tensors
-print_info: file format = GGUF V3 (latest)
-print_info: file type   = Q8_0
-print_info: file size   = 75.65 GiB (9.21 BPW) 
-load: special tokens cache size = 256
-load: token to piece cache size = 0.7999 MB
-print_info: arch             = llama
-print_info: vocab_only       = 0
-print_info: n_ctx_train      = 131072
-print_info: n_embd           = 8192
-print_info: n_layer          = 80
-print_info: n_head           = 64
-print_info: n_head_kv        = 8
-print_info: n_rot            = 128
-print_info: n_swa            = 0
-print_info: is_swa_any       = 0
-print_info: n_embd_head_k    = 128
-print_info: n_embd_head_v    = 128
-print_info: n_gqa            = 8
-print_info: n_embd_k_gqa     = 1024
-print_info: n_embd_v_gqa     = 1024
-print_info: f_norm_eps       = 0.0e+00
-print_info: f_norm_rms_eps   = 1.0e-05
-print_info: f_clamp_kqv      = 0.0e+00
-print_info: f_max_alibi_bias = 0.0e+00
-print_info: f_logit_scale    = 0.0e+00
-print_info: f_attn_scale     = 0.0e+00
-print_info: n_ff             = 28672
-print_info: n_expert         = 0
-print_info: n_expert_used    = 0
-print_info: causal attn      = 1
-print_info: pooling type     = 0
-print_info: rope type        = 0
-print_info: rope scaling     = linear
-print_info: freq_base_train  = 500000.0
-print_info: freq_scale_train = 1
-print_info: n_ctx_orig_yarn  = 131072
-print_info: rope_finetuned   = unknown
-print_info: model type       = 70B
-print_info: model params     = 70.55 B
-print_info: general.name     = Llama-3.3-70B-Instruct
-print_info: vocab type       = BPE
-print_info: n_vocab          = 128256
-print_info: n_merges         = 280147
-print_info: BOS token        = 128000 '<|begin_of_text|>'
-print_info: EOS token        = 128009 '<|eot_id|>'
-print_info: EOT token        = 128009 '<|eot_id|>'
-print_info: EOM token        = 128008 '<|eom_id|>'
-print_info: PAD token        = 128004 '<|finetune_right_pad_id|>'
-print_info: LF token         = 198 'Ċ'
-print_info: EOG token        = 128001 '<|end_of_text|>'
-print_info: EOG token        = 128008 '<|eom_id|>'
-print_info: EOG token        = 128009 '<|eot_id|>'
-print_info: max token length = 256
-load_tensors: loading model tensors, this can take a while... (mmap = false)
-load_tensors: offloading 80 repeating layers to GPU
-load_tensors: offloading output layer to GPU
-load_tensors: offloaded 81/81 layers to GPU
-load_tensors:      Vulkan0 model buffer size = 75456.53 MiB
-load_tensors:  Vulkan_Host model buffer size =  2004.00 MiB
-.................................................................................................
-llama_context: constructing llama_context
-llama_context: non-unified KV cache requires ggml_set_rows() - forcing unified KV cache
-llama_context: n_seq_max     = 1
-llama_context: n_ctx         = 4096
-llama_context: n_ctx_per_seq = 4096
-llama_context: n_batch       = 2048
-llama_context: n_ubatch      = 512
-llama_context: causal_attn   = 1
-llama_context: flash_attn    = 1
-llama_context: kv_unified    = true
-llama_context: freq_base     = 500000.0
-llama_context: freq_scale    = 1
-llama_context: n_ctx_per_seq (4096) < n_ctx_train (131072) -- the full capacity of the model will not be utilized
-llama_context: Vulkan_Host  output buffer size =     0.49 MiB
-llama_kv_cache_unified:    Vulkan0 KV buffer size =  1280.00 MiB
-llama_kv_cache_unified: size = 1280.00 MiB (  4096 cells,  80 layers,  1/ 1 seqs), K (f16):  640.00 MiB, V (f16):  640.00 MiB
-llama_kv_cache_unified: LLAMA_SET_ROWS=0, using old ggml_cpy() method for backwards compatibility
-llama_context:    Vulkan0 compute buffer size =   266.50 MiB
-llama_context: Vulkan_Host compute buffer size =    24.01 MiB
-llama_context: graph nodes  = 2647
-llama_context: graph splits = 2
-common_init_from_params: added <|end_of_text|> logit bias = -inf
-common_init_from_params: added <|eom_id|> logit bias = -inf
-common_init_from_params: added <|eot_id|> logit bias = -inf
-common_init_from_params: setting dry_penalty_last_n to ctx_size = 4096
-common_init_from_params: warming up the model with an empty run - please wait ... (--no-warmup to disable)
-main: llama threadpool init, n_threads = 16
-
-system_info: n_threads = 16 (n_threads_batch = 16) / 32 | CPU : SSE3 = 1 | SSSE3 = 1 | AVX = 1 | AVX2 = 1 | F16C = 1 | FMA = 1 | BMI2 = 1 | AVX512 = 1 | AVX512_VBMI = 1 | AVX512_VNNI = 1 | LLAMAFILE = 1 | OPENMP = 1 | REPACK = 1 | 
-
-sampler seed: 327404797
-sampler params: 
-	repeat_last_n = 64, repeat_penalty = 1.000, frequency_penalty = 0.000, presence_penalty = 0.000
-	dry_multiplier = 0.000, dry_base = 1.750, dry_allowed_length = 2, dry_penalty_last_n = 4096
-	top_k = 40, top_p = 0.950, min_p = 0.050, xtc_probability = 0.000, xtc_threshold = 0.100, typical_p = 1.000, top_n_sigma = -1.000, temp = 0.800
-	mirostat = 0, mirostat_lr = 0.100, mirostat_ent = 5.000
-sampler chain: logits -> logit-bias -> penalties -> dry -> top-n-sigma -> top-k -> typical -> top-p -> min-p -> xtc -> temp-ext -> dist 
-generate: n_ctx = 4096, n_batch = 2048, n_predict = 1, n_keep = 1
-
-Hello,
-
-llama_perf_sampler_print:    sampling time =       0.06 ms /     3 runs   (    0.02 ms per token, 50847.46 tokens per second)
-llama_perf_context_print:        load time =   26953.87 ms
-llama_perf_context_print: prompt eval time =     387.45 ms /     2 tokens (  193.72 ms per token,     5.16 tokens per second)
-llama_perf_context_print:        eval time =       0.00 ms /     1 runs   (    0.00 ms per token,      inf tokens per second)
-llama_perf_context_print:       total time =     404.05 ms /     3 tokens
-llama_perf_context_print:    graphs reused =          0
-    Elapsed #3: 28.173844492s
-    Run #3 status: 0
-  → Avg over 3 runs: 30.604s
@@ -1,161 +0,0 @@
-ggml_vulkan: Found 1 Vulkan devices:
-ggml_vulkan: 0 = Radeon 8060S Graphics (RADV GFX1151) (radv) | uma: 1 | fp16: 1 | bf16: 0 | warp size: 64 | shared memory: 65536 | int dot: 1 | matrix cores: KHR_coopmat
-build: 6040 (66625a59) with cc (GCC) 15.1.1 20250719 (Red Hat 15.1.1-5) for x86_64-redhat-linux
-main: llama backend init
-main: load the model and apply lora adapter, if any
-llama_model_load_from_file_impl: using device Vulkan0 (Radeon 8060S Graphics (RADV GFX1151)) - 87722 MiB free
-llama_model_loader: additional 1 GGUFs metadata loaded.
-llama_model_loader: loaded meta data with 39 key-value pairs and 724 tensors from /home/kyuz0/models/llama-3.3-70B-Instruct/UD-Q8_K_XL/Llama-3.3-70B-Instruct-UD-Q8_K_XL-00001-of-00002.gguf (version GGUF V3 (latest))
-llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
-llama_model_loader: - kv   0:                       general.architecture str              = llama
-llama_model_loader: - kv   1:                               general.type str              = model
-llama_model_loader: - kv   2:                               general.name str              = Llama-3.3-70B-Instruct
-llama_model_loader: - kv   3:                           general.finetune str              = Instruct
-llama_model_loader: - kv   4:                           general.basename str              = Llama-3.3-70B-Instruct
-llama_model_loader: - kv   5:                       general.quantized_by str              = Unsloth
-llama_model_loader: - kv   6:                         general.size_label str              = 70B
-llama_model_loader: - kv   7:                           general.repo_url str              = https://huggingface.co/unsloth
-llama_model_loader: - kv   8:                          llama.block_count u32              = 80
-llama_model_loader: - kv   9:                       llama.context_length u32              = 131072
-llama_model_loader: - kv  10:                     llama.embedding_length u32              = 8192
-llama_model_loader: - kv  11:                  llama.feed_forward_length u32              = 28672
-llama_model_loader: - kv  12:                 llama.attention.head_count u32              = 64
-llama_model_loader: - kv  13:              llama.attention.head_count_kv u32              = 8
-llama_model_loader: - kv  14:                       llama.rope.freq_base f32              = 500000.000000
-llama_model_loader: - kv  15:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
-llama_model_loader: - kv  16:                 llama.attention.key_length u32              = 128
-llama_model_loader: - kv  17:               llama.attention.value_length u32              = 128
-llama_model_loader: - kv  18:                           llama.vocab_size u32              = 128256
-llama_model_loader: - kv  19:                 llama.rope.dimension_count u32              = 128
-llama_model_loader: - kv  20:                       tokenizer.ggml.model str              = gpt2
-llama_model_loader: - kv  21:                         tokenizer.ggml.pre str              = llama-bpe
-llama_model_loader: - kv  22:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
-llama_model_loader: - kv  23:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
-llama_model_loader: - kv  24:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
-llama_model_loader: - kv  25:                tokenizer.ggml.bos_token_id u32              = 128000
-llama_model_loader: - kv  26:                tokenizer.ggml.eos_token_id u32              = 128009
-llama_model_loader: - kv  27:            tokenizer.ggml.padding_token_id u32              = 128004
-llama_model_loader: - kv  28:               tokenizer.ggml.add_bos_token bool             = true
-llama_model_loader: - kv  29:                    tokenizer.chat_template str              = {{- bos_token }}\n{%- if custom_tools ...
-llama_model_loader: - kv  30:               general.quantization_version u32              = 2
-llama_model_loader: - kv  31:                          general.file_type u32              = 7
-llama_model_loader: - kv  32:                      quantize.imatrix.file str              = Llama-3.3-70B-Instruct-GGUF/imatrix_u...
-llama_model_loader: - kv  33:                   quantize.imatrix.dataset str              = unsloth_calibration_Llama-3.3-70B-Ins...
-llama_model_loader: - kv  34:             quantize.imatrix.entries_count i32              = 560
-llama_model_loader: - kv  35:              quantize.imatrix.chunks_count i32              = 689
-llama_model_loader: - kv  36:                                   split.no u16              = 0
-llama_model_loader: - kv  37:                        split.tensors.count i32              = 724
-llama_model_loader: - kv  38:                                split.count u16              = 2
-llama_model_loader: - type  f32:  162 tensors
-llama_model_loader: - type q8_0:  455 tensors
-llama_model_loader: - type bf16:  107 tensors
-print_info: file format = GGUF V3 (latest)
-print_info: file type   = Q8_0
-print_info: file size   = 75.65 GiB (9.21 BPW) 
-load: special tokens cache size = 256
-load: token to piece cache size = 0.7999 MB
-print_info: arch             = llama
-print_info: vocab_only       = 0
-print_info: n_ctx_train      = 131072
-print_info: n_embd           = 8192
-print_info: n_layer          = 80
-print_info: n_head           = 64
-print_info: n_head_kv        = 8
-print_info: n_rot            = 128
-print_info: n_swa            = 0
-print_info: is_swa_any       = 0
-print_info: n_embd_head_k    = 128
-print_info: n_embd_head_v    = 128
-print_info: n_gqa            = 8
-print_info: n_embd_k_gqa     = 1024
-print_info: n_embd_v_gqa     = 1024
-print_info: f_norm_eps       = 0.0e+00
-print_info: f_norm_rms_eps   = 1.0e-05
-print_info: f_clamp_kqv      = 0.0e+00
-print_info: f_max_alibi_bias = 0.0e+00
-print_info: f_logit_scale    = 0.0e+00
-print_info: f_attn_scale     = 0.0e+00
-print_info: n_ff             = 28672
-print_info: n_expert         = 0
-print_info: n_expert_used    = 0
-print_info: causal attn      = 1
-print_info: pooling type     = 0
-print_info: rope type        = 0
-print_info: rope scaling     = linear
-print_info: freq_base_train  = 500000.0
-print_info: freq_scale_train = 1
-print_info: n_ctx_orig_yarn  = 131072
-print_info: rope_finetuned   = unknown
-print_info: model type       = 70B
-print_info: model params     = 70.55 B
-print_info: general.name     = Llama-3.3-70B-Instruct
-print_info: vocab type       = BPE
-print_info: n_vocab          = 128256
-print_info: n_merges         = 280147
-print_info: BOS token        = 128000 '<|begin_of_text|>'
-print_info: EOS token        = 128009 '<|eot_id|>'
-print_info: EOT token        = 128009 '<|eot_id|>'
-print_info: EOM token        = 128008 '<|eom_id|>'
-print_info: PAD token        = 128004 '<|finetune_right_pad_id|>'
-print_info: LF token         = 198 'Ċ'
-print_info: EOG token        = 128001 '<|end_of_text|>'
-print_info: EOG token        = 128008 '<|eom_id|>'
-print_info: EOG token        = 128009 '<|eot_id|>'
-print_info: max token length = 256
-load_tensors: loading model tensors, this can take a while... (mmap = false)
-load_tensors: offloading 80 repeating layers to GPU
-load_tensors: offloading output layer to GPU
-load_tensors: offloaded 81/81 layers to GPU
-load_tensors:      Vulkan0 model buffer size = 75456.53 MiB
-load_tensors:  Vulkan_Host model buffer size =  2004.00 MiB
-.................................................................................................
-llama_context: constructing llama_context
-llama_context: non-unified KV cache requires ggml_set_rows() - forcing unified KV cache
-llama_context: n_seq_max     = 1
-llama_context: n_ctx         = 4096
-llama_context: n_ctx_per_seq = 4096
-llama_context: n_batch       = 2048
-llama_context: n_ubatch      = 512
-llama_context: causal_attn   = 1
-llama_context: flash_attn    = 1
-llama_context: kv_unified    = true
-llama_context: freq_base     = 500000.0
-llama_context: freq_scale    = 1
-llama_context: n_ctx_per_seq (4096) < n_ctx_train (131072) -- the full capacity of the model will not be utilized
-llama_context: Vulkan_Host  output buffer size =     0.49 MiB
-llama_kv_cache_unified:    Vulkan0 KV buffer size =  1280.00 MiB
-llama_kv_cache_unified: size = 1280.00 MiB (  4096 cells,  80 layers,  1/ 1 seqs), K (f16):  640.00 MiB, V (f16):  640.00 MiB
-llama_kv_cache_unified: LLAMA_SET_ROWS=0, using old ggml_cpy() method for backwards compatibility
-llama_context:    Vulkan0 compute buffer size =   266.50 MiB
-llama_context: Vulkan_Host compute buffer size =    24.01 MiB
-llama_context: graph nodes  = 2647
-llama_context: graph splits = 2
-common_init_from_params: added <|end_of_text|> logit bias = -inf
-common_init_from_params: added <|eom_id|> logit bias = -inf
-common_init_from_params: added <|eot_id|> logit bias = -inf
-common_init_from_params: setting dry_penalty_last_n to ctx_size = 4096
-common_init_from_params: warming up the model with an empty run - please wait ... (--no-warmup to disable)
-main: llama threadpool init, n_threads = 16
-
-system_info: n_threads = 16 (n_threads_batch = 16) / 32 | CPU : SSE3 = 1 | SSSE3 = 1 | AVX = 1 | AVX2 = 1 | F16C = 1 | FMA = 1 | BMI2 = 1 | AVX512 = 1 | AVX512_VBMI = 1 | AVX512_VNNI = 1 | LLAMAFILE = 1 | OPENMP = 1 | REPACK = 1 | 
-
-sampler seed: 2154218339
-sampler params: 
-	repeat_last_n = 64, repeat_penalty = 1.000, frequency_penalty = 0.000, presence_penalty = 0.000
-	dry_multiplier = 0.000, dry_base = 1.750, dry_allowed_length = 2, dry_penalty_last_n = 4096
-	top_k = 40, top_p = 0.950, min_p = 0.050, xtc_probability = 0.000, xtc_threshold = 0.100, typical_p = 1.000, top_n_sigma = -1.000, temp = 0.800
-	mirostat = 0, mirostat_lr = 0.100, mirostat_ent = 5.000
-sampler chain: logits -> logit-bias -> penalties -> dry -> top-n-sigma -> top-k -> typical -> top-p -> min-p -> xtc -> temp-ext -> dist 
-generate: n_ctx = 4096, n_batch = 2048, n_predict = 1, n_keep = 1
-
-Hello’s
-
-llama_perf_sampler_print:    sampling time =       0.06 ms /     3 runs   (    0.02 ms per token, 51724.14 tokens per second)
-llama_perf_context_print:        load time =   29443.29 ms
-llama_perf_context_print: prompt eval time =     376.13 ms /     2 tokens (  188.07 ms per token,     5.32 tokens per second)
-llama_perf_context_print:        eval time =       0.00 ms /     1 runs   (    0.00 ms per token,      inf tokens per second)
-llama_perf_context_print:       total time =     392.17 ms /     3 tokens
-llama_perf_context_print:    graphs reused =          0
-    Elapsed #3: 30.227365941s
-    Run #3 status: 0
-  → Avg over 3 runs: 30.376s
@@ -1,179 +0,0 @@
-ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
-ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
-ggml_cuda_init: found 1 ROCm devices:
-  Device 0: Radeon 8060S Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32
-build: 6040 (66625a59) with cc (GCC) 15.1.1 20250521 (Red Hat 15.1.1-2) for x86_64-redhat-linux
-main: llama backend init
-main: load the model and apply lora adapter, if any
-llama_model_load_from_file_impl: using device ROCm0 (Radeon 8060S Graphics) - 124522 MiB free
-llama_model_loader: additional 1 GGUFs metadata loaded.
-llama_model_loader: loaded meta data with 51 key-value pairs and 628 tensors from /home/kyuz0/models/llama-4-scout-17b-16e/Q6_K/Llama-4-Scout-17B-16E-Instruct-Q6_K-00001-of-00002.gguf (version GGUF V3 (latest))
-llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
-llama_model_loader: - kv   0:                       general.architecture str              = llama4
-llama_model_loader: - kv   1:                               general.type str              = model
-llama_model_loader: - kv   2:                               general.name str              = Llama-4-Scout-17B-16E-Instruct
-llama_model_loader: - kv   3:                           general.finetune str              = 16E-Instruct
-llama_model_loader: - kv   4:                           general.basename str              = Llama-4-Scout-17B-16E-Instruct
-llama_model_loader: - kv   5:                       general.quantized_by str              = Unsloth
-llama_model_loader: - kv   6:                         general.size_label str              = 17B
-llama_model_loader: - kv   7:                            general.license str              = other
-llama_model_loader: - kv   8:                       general.license.name str              = llama4
-llama_model_loader: - kv   9:                           general.repo_url str              = https://huggingface.co/unsloth
-llama_model_loader: - kv  10:                   general.base_model.count u32              = 1
-llama_model_loader: - kv  11:                  general.base_model.0.name str              = Llama 4 Scout 17B 16E Instruct
-llama_model_loader: - kv  12:          general.base_model.0.organization str              = Meta Llama
-llama_model_loader: - kv  13:              general.base_model.0.repo_url str              = https://huggingface.co/meta-llama/Lla...
-llama_model_loader: - kv  14:                               general.tags arr[str,5]       = ["facebook", "meta", "pytorch", "llam...
-llama_model_loader: - kv  15:                          general.languages arr[str,12]      = ["ar", "de", "en", "es", "fr", "hi", ...
-llama_model_loader: - kv  16:                         llama4.block_count u32              = 48
-llama_model_loader: - kv  17:                      llama4.context_length u32              = 10485760
-llama_model_loader: - kv  18:                    llama4.embedding_length u32              = 5120
-llama_model_loader: - kv  19:                 llama4.feed_forward_length u32              = 16384
-llama_model_loader: - kv  20:                llama4.attention.head_count u32              = 40
-llama_model_loader: - kv  21:             llama4.attention.head_count_kv u32              = 8
-llama_model_loader: - kv  22:                      llama4.rope.freq_base f32              = 500000.000000
-llama_model_loader: - kv  23:    llama4.attention.layer_norm_rms_epsilon f32              = 0.000010
-llama_model_loader: - kv  24:                        llama4.expert_count u32              = 16
-llama_model_loader: - kv  25:                   llama4.expert_used_count u32              = 1
-llama_model_loader: - kv  26:                llama4.attention.key_length u32              = 128
-llama_model_loader: - kv  27:              llama4.attention.value_length u32              = 128
-llama_model_loader: - kv  28:                          llama4.vocab_size u32              = 202048
-llama_model_loader: - kv  29:                llama4.rope.dimension_count u32              = 128
-llama_model_loader: - kv  30:           llama4.interleave_moe_layer_step u32              = 1
-llama_model_loader: - kv  31:          llama4.expert_feed_forward_length u32              = 8192
-llama_model_loader: - kv  32:                       tokenizer.ggml.model str              = gpt2
-llama_model_loader: - kv  33:                         tokenizer.ggml.pre str              = llama4
-llama_model_loader: - kv  34:                      tokenizer.ggml.tokens arr[str,202048]  = ["À", "Á", "õ", "ö", "÷", "ø", ...
-llama_model_loader: - kv  35:                  tokenizer.ggml.token_type arr[i32,202048]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
-llama_model_loader: - kv  36:                      tokenizer.ggml.merges arr[str,439802]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
-llama_model_loader: - kv  37:                tokenizer.ggml.bos_token_id u32              = 200000
-llama_model_loader: - kv  38:                tokenizer.ggml.eos_token_id u32              = 200008
-llama_model_loader: - kv  39:            tokenizer.ggml.padding_token_id u32              = 200018
-llama_model_loader: - kv  40:               tokenizer.ggml.add_bos_token bool             = true
-llama_model_loader: - kv  41:                    tokenizer.chat_template str              = {{- bos_token }}\n{%- if custom_tools ...
-llama_model_loader: - kv  42:               general.quantization_version u32              = 2
-llama_model_loader: - kv  43:                          general.file_type u32              = 18
-llama_model_loader: - kv  44:                      quantize.imatrix.file str              = Llama-4-Scout-17B-16E-Instruct-GGUF/i...
-llama_model_loader: - kv  45:                   quantize.imatrix.dataset str              = unsloth_calibration_Llama-4-Scout-17B...
-llama_model_loader: - kv  46:             quantize.imatrix.entries_count u32              = 528
-llama_model_loader: - kv  47:              quantize.imatrix.chunks_count u32              = 729
-llama_model_loader: - kv  48:                                   split.no u16              = 0
-llama_model_loader: - kv  49:                        split.tensors.count i32              = 628
-llama_model_loader: - kv  50:                                split.count u16              = 2
-llama_model_loader: - type  f32:  146 tensors
-llama_model_loader: - type q6_K:  482 tensors
-print_info: file format = GGUF V3 (latest)
-print_info: file type   = Q6_K
-print_info: file size   = 82.35 GiB (6.56 BPW) 
-load: special_eos_id is not in special_eog_ids - the tokenizer config may be incorrect
-load: special tokens cache size = 1135
-load: token to piece cache size = 1.3873 MB
-print_info: arch             = llama4
-print_info: vocab_only       = 0
-print_info: n_ctx_train      = 10485760
-print_info: n_embd           = 5120
-print_info: n_layer          = 48
-print_info: n_head           = 40
-print_info: n_head_kv        = 8
-print_info: n_rot            = 128
-print_info: n_swa            = 8192
-print_info: is_swa_any       = 1
-print_info: n_embd_head_k    = 128
-print_info: n_embd_head_v    = 128
-print_info: n_gqa            = 5
-print_info: n_embd_k_gqa     = 1024
-print_info: n_embd_v_gqa     = 1024
-print_info: f_norm_eps       = 0.0e+00
-print_info: f_norm_rms_eps   = 1.0e-05
-print_info: f_clamp_kqv      = 0.0e+00
-print_info: f_max_alibi_bias = 0.0e+00
-print_info: f_logit_scale    = 0.0e+00
-print_info: f_attn_scale     = 0.0e+00
-print_info: n_ff             = 16384
-print_info: n_expert         = 16
-print_info: n_expert_used    = 1
-print_info: causal attn      = 1
-print_info: pooling type     = 0
-print_info: rope type        = 0
-print_info: rope scaling     = linear
-print_info: freq_base_train  = 500000.0
-print_info: freq_scale_train = 1
-print_info: n_ctx_orig_yarn  = 10485760
-print_info: rope_finetuned   = unknown
-print_info: model type       = 17Bx16E (Scout)
-print_info: model params     = 107.77 B
-print_info: general.name     = Llama-4-Scout-17B-16E-Instruct
-print_info: vocab type       = BPE
-print_info: n_vocab          = 202048
-print_info: n_merges         = 439802
-print_info: BOS token        = 200000 '<|begin_of_text|>'
-print_info: EOS token        = 200008 '<|eot|>'
-print_info: PAD token        = 200018 '<|finetune_right_pad|>'
-print_info: LF token         = 198 'Ċ'
-print_info: FIM PRE token    = 200002 '<|fim_prefix|>'
-print_info: FIM SUF token    = 200004 '<|fim_suffix|>'
-print_info: FIM MID token    = 200003 '<|fim_middle|>'
-print_info: EOG token        = 200001 '<|end_of_text|>'
-print_info: EOG token        = 200008 '<|eot|>'
-print_info: max token length = 192
-load_tensors: loading model tensors, this can take a while... (mmap = false)
-load_tensors: offloading 48 repeating layers to GPU
-load_tensors: offloading output layer to GPU
-load_tensors: offloaded 49/49 layers to GPU
-load_tensors:          CPU model buffer size =   809.29 MiB
-load_tensors:        ROCm0 model buffer size = 83513.68 MiB
-....................................................................................................
-llama_context: constructing llama_context
-llama_context: non-unified KV cache requires ggml_set_rows() - forcing unified KV cache
-llama_context: n_seq_max     = 1
-llama_context: n_ctx         = 4096
-llama_context: n_ctx_per_seq = 4096
-llama_context: n_batch       = 2048
-llama_context: n_ubatch      = 512
-llama_context: causal_attn   = 1
-llama_context: flash_attn    = 1
-llama_context: kv_unified    = true
-llama_context: freq_base     = 500000.0
-llama_context: freq_scale    = 1
-llama_context: n_ctx_per_seq (4096) < n_ctx_train (10485760) -- the full capacity of the model will not be utilized
-llama_context:  ROCm_Host  output buffer size =     0.77 MiB
-llama_kv_cache_unified_iswa: creating non-SWA KV cache, size = 4096 cells
-llama_kv_cache_unified:      ROCm0 KV buffer size =   192.00 MiB
-llama_kv_cache_unified: size =  192.00 MiB (  4096 cells,  12 layers,  1/ 1 seqs), K (f16):   96.00 MiB, V (f16):   96.00 MiB
-llama_kv_cache_unified: LLAMA_SET_ROWS=0, using old ggml_cpy() method for backwards compatibility
-llama_kv_cache_unified_iswa: creating     SWA KV cache, size = 4096 cells
-llama_kv_cache_unified:      ROCm0 KV buffer size =   576.00 MiB
-llama_kv_cache_unified: size =  576.00 MiB (  4096 cells,  36 layers,  1/ 1 seqs), K (f16):  288.00 MiB, V (f16):  288.00 MiB
-llama_kv_cache_unified: LLAMA_SET_ROWS=0, using old ggml_cpy() method for backwards compatibility
-llama_context:      ROCm0 compute buffer size =   442.62 MiB
-llama_context:  ROCm_Host compute buffer size =    26.01 MiB
-llama_context: graph nodes  = 2420
-llama_context: graph splits = 2
-common_init_from_params: added <|end_of_text|> logit bias = -inf
-common_init_from_params: added <|eot|> logit bias = -inf
-common_init_from_params: setting dry_penalty_last_n to ctx_size = 4096
-common_init_from_params: warming up the model with an empty run - please wait ... (--no-warmup to disable)
-main: llama threadpool init, n_threads = 16
-
-system_info: n_threads = 16 (n_threads_batch = 16) / 32 | ROCm : NO_VMM = 1 | PEER_MAX_BATCH_SIZE = 128 | CPU : SSE3 = 1 | SSSE3 = 1 | AVX = 1 | AVX2 = 1 | F16C = 1 | FMA = 1 | BMI2 = 1 | AVX512 = 1 | AVX512_VBMI = 1 | AVX512_VNNI = 1 | LLAMAFILE = 1 | OPENMP = 1 | REPACK = 1 | 
-
-sampler seed: 1642319140
-sampler params: 
-	repeat_last_n = 64, repeat_penalty = 1.000, frequency_penalty = 0.000, presence_penalty = 0.000
-	dry_multiplier = 0.000, dry_base = 1.750, dry_allowed_length = 2, dry_penalty_last_n = 4096
-	top_k = 40, top_p = 0.950, min_p = 0.050, xtc_probability = 0.000, xtc_threshold = 0.100, typical_p = 1.000, top_n_sigma = -1.000, temp = 0.800
-	mirostat = 0, mirostat_lr = 0.100, mirostat_ent = 5.000
-sampler chain: logits -> logit-bias -> penalties -> dry -> top-n-sigma -> top-k -> typical -> top-p -> min-p -> xtc -> temp-ext -> dist 
-generate: n_ctx = 4096, n_batch = 2048, n_predict = 1, n_keep = 1
-
-Hello 
-
-llama_perf_sampler_print:    sampling time =       0.07 ms /     3 runs   (    0.02 ms per token, 42857.14 tokens per second)
-llama_perf_context_print:        load time =   26639.60 ms
-llama_perf_context_print: prompt eval time =     107.52 ms /     2 tokens (   53.76 ms per token,    18.60 tokens per second)
-llama_perf_context_print:        eval time =       0.00 ms /     1 runs   (    0.00 ms per token,      inf tokens per second)
-llama_perf_context_print:       total time =     127.12 ms /     3 tokens
-llama_perf_context_print:    graphs reused =          0
-    Elapsed #3: 30.905590182s
-    Run #3 status: 0
-  → Avg over 3 runs: 31.792s
@@ -1,179 +0,0 @@
-ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
-ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
-ggml_cuda_init: found 1 ROCm devices:
-  Device 0: AMD Radeon Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32
-build: 6040 (66625a59) with cc (GCC) 15.1.1 20250719 (Red Hat 15.1.1-5) for x86_64-redhat-linux
-main: llama backend init
-main: load the model and apply lora adapter, if any
-llama_model_load_from_file_impl: using device ROCm0 (AMD Radeon Graphics) - 124523 MiB free
-llama_model_loader: additional 1 GGUFs metadata loaded.
-llama_model_loader: loaded meta data with 51 key-value pairs and 628 tensors from /home/kyuz0/models/llama-4-scout-17b-16e/Q6_K/Llama-4-Scout-17B-16E-Instruct-Q6_K-00001-of-00002.gguf (version GGUF V3 (latest))
-llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
-llama_model_loader: - kv   0:                       general.architecture str              = llama4
-llama_model_loader: - kv   1:                               general.type str              = model
-llama_model_loader: - kv   2:                               general.name str              = Llama-4-Scout-17B-16E-Instruct
-llama_model_loader: - kv   3:                           general.finetune str              = 16E-Instruct
-llama_model_loader: - kv   4:                           general.basename str              = Llama-4-Scout-17B-16E-Instruct
-llama_model_loader: - kv   5:                       general.quantized_by str              = Unsloth
-llama_model_loader: - kv   6:                         general.size_label str              = 17B
-llama_model_loader: - kv   7:                            general.license str              = other
-llama_model_loader: - kv   8:                       general.license.name str              = llama4
-llama_model_loader: - kv   9:                           general.repo_url str              = https://huggingface.co/unsloth
-llama_model_loader: - kv  10:                   general.base_model.count u32              = 1
-llama_model_loader: - kv  11:                  general.base_model.0.name str              = Llama 4 Scout 17B 16E Instruct
-llama_model_loader: - kv  12:          general.base_model.0.organization str              = Meta Llama
-llama_model_loader: - kv  13:              general.base_model.0.repo_url str              = https://huggingface.co/meta-llama/Lla...
-llama_model_loader: - kv  14:                               general.tags arr[str,5]       = ["facebook", "meta", "pytorch", "llam...
-llama_model_loader: - kv  15:                          general.languages arr[str,12]      = ["ar", "de", "en", "es", "fr", "hi", ...
-llama_model_loader: - kv  16:                         llama4.block_count u32              = 48
-llama_model_loader: - kv  17:                      llama4.context_length u32              = 10485760
-llama_model_loader: - kv  18:                    llama4.embedding_length u32              = 5120
-llama_model_loader: - kv  19:                 llama4.feed_forward_length u32              = 16384
-llama_model_loader: - kv  20:                llama4.attention.head_count u32              = 40
-llama_model_loader: - kv  21:             llama4.attention.head_count_kv u32              = 8
-llama_model_loader: - kv  22:                      llama4.rope.freq_base f32              = 500000.000000
-llama_model_loader: - kv  23:    llama4.attention.layer_norm_rms_epsilon f32              = 0.000010
-llama_model_loader: - kv  24:                        llama4.expert_count u32              = 16
-llama_model_loader: - kv  25:                   llama4.expert_used_count u32              = 1
-llama_model_loader: - kv  26:                llama4.attention.key_length u32              = 128
-llama_model_loader: - kv  27:              llama4.attention.value_length u32              = 128
-llama_model_loader: - kv  28:                          llama4.vocab_size u32              = 202048
-llama_model_loader: - kv  29:                llama4.rope.dimension_count u32              = 128
-llama_model_loader: - kv  30:           llama4.interleave_moe_layer_step u32              = 1
-llama_model_loader: - kv  31:          llama4.expert_feed_forward_length u32              = 8192
-llama_model_loader: - kv  32:                       tokenizer.ggml.model str              = gpt2
-llama_model_loader: - kv  33:                         tokenizer.ggml.pre str              = llama4
-llama_model_loader: - kv  34:                      tokenizer.ggml.tokens arr[str,202048]  = ["À", "Á", "õ", "ö", "÷", "ø", ...
-llama_model_loader: - kv  35:                  tokenizer.ggml.token_type arr[i32,202048]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
-llama_model_loader: - kv  36:                      tokenizer.ggml.merges arr[str,439802]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
-llama_model_loader: - kv  37:                tokenizer.ggml.bos_token_id u32              = 200000
-llama_model_loader: - kv  38:                tokenizer.ggml.eos_token_id u32              = 200008
-llama_model_loader: - kv  39:            tokenizer.ggml.padding_token_id u32              = 200018
-llama_model_loader: - kv  40:               tokenizer.ggml.add_bos_token bool             = true
-llama_model_loader: - kv  41:                    tokenizer.chat_template str              = {{- bos_token }}\n{%- if custom_tools ...
-llama_model_loader: - kv  42:               general.quantization_version u32              = 2
-llama_model_loader: - kv  43:                          general.file_type u32              = 18
-llama_model_loader: - kv  44:                      quantize.imatrix.file str              = Llama-4-Scout-17B-16E-Instruct-GGUF/i...
-llama_model_loader: - kv  45:                   quantize.imatrix.dataset str              = unsloth_calibration_Llama-4-Scout-17B...
-llama_model_loader: - kv  46:             quantize.imatrix.entries_count u32              = 528
-llama_model_loader: - kv  47:              quantize.imatrix.chunks_count u32              = 729
-llama_model_loader: - kv  48:                                   split.no u16              = 0
-llama_model_loader: - kv  49:                        split.tensors.count i32              = 628
-llama_model_loader: - kv  50:                                split.count u16              = 2
-llama_model_loader: - type  f32:  146 tensors
-llama_model_loader: - type q6_K:  482 tensors
-print_info: file format = GGUF V3 (latest)
-print_info: file type   = Q6_K
-print_info: file size   = 82.35 GiB (6.56 BPW) 
-load: special_eos_id is not in special_eog_ids - the tokenizer config may be incorrect
-load: special tokens cache size = 1135
-load: token to piece cache size = 1.3873 MB
-print_info: arch             = llama4
-print_info: vocab_only       = 0
-print_info: n_ctx_train      = 10485760
-print_info: n_embd           = 5120
-print_info: n_layer          = 48
-print_info: n_head           = 40
-print_info: n_head_kv        = 8
-print_info: n_rot            = 128
-print_info: n_swa            = 8192
-print_info: is_swa_any       = 1
-print_info: n_embd_head_k    = 128
-print_info: n_embd_head_v    = 128
-print_info: n_gqa            = 5
-print_info: n_embd_k_gqa     = 1024
-print_info: n_embd_v_gqa     = 1024
-print_info: f_norm_eps       = 0.0e+00
-print_info: f_norm_rms_eps   = 1.0e-05
-print_info: f_clamp_kqv      = 0.0e+00
-print_info: f_max_alibi_bias = 0.0e+00
-print_info: f_logit_scale    = 0.0e+00
-print_info: f_attn_scale     = 0.0e+00
-print_info: n_ff             = 16384
-print_info: n_expert         = 16
-print_info: n_expert_used    = 1
-print_info: causal attn      = 1
-print_info: pooling type     = 0
-print_info: rope type        = 0
-print_info: rope scaling     = linear
-print_info: freq_base_train  = 500000.0
-print_info: freq_scale_train = 1
-print_info: n_ctx_orig_yarn  = 10485760
-print_info: rope_finetuned   = unknown
-print_info: model type       = 17Bx16E (Scout)
-print_info: model params     = 107.77 B
-print_info: general.name     = Llama-4-Scout-17B-16E-Instruct
-print_info: vocab type       = BPE
-print_info: n_vocab          = 202048
-print_info: n_merges         = 439802
-print_info: BOS token        = 200000 '<|begin_of_text|>'
-print_info: EOS token        = 200008 '<|eot|>'
-print_info: PAD token        = 200018 '<|finetune_right_pad|>'
-print_info: LF token         = 198 'Ċ'
-print_info: FIM PRE token    = 200002 '<|fim_prefix|>'
-print_info: FIM SUF token    = 200004 '<|fim_suffix|>'
-print_info: FIM MID token    = 200003 '<|fim_middle|>'
-print_info: EOG token        = 200001 '<|end_of_text|>'
-print_info: EOG token        = 200008 '<|eot|>'
-print_info: max token length = 192
-load_tensors: loading model tensors, this can take a while... (mmap = false)
-load_tensors: offloading 48 repeating layers to GPU
-load_tensors: offloading output layer to GPU
-load_tensors: offloaded 49/49 layers to GPU
-load_tensors:          CPU model buffer size =   809.29 MiB
-load_tensors:        ROCm0 model buffer size = 83513.68 MiB
-....................................................................................................
-llama_context: constructing llama_context
-llama_context: non-unified KV cache requires ggml_set_rows() - forcing unified KV cache
-llama_context: n_seq_max     = 1
-llama_context: n_ctx         = 4096
-llama_context: n_ctx_per_seq = 4096
-llama_context: n_batch       = 2048
-llama_context: n_ubatch      = 512
-llama_context: causal_attn   = 1
-llama_context: flash_attn    = 1
-llama_context: kv_unified    = true
-llama_context: freq_base     = 500000.0
-llama_context: freq_scale    = 1
-llama_context: n_ctx_per_seq (4096) < n_ctx_train (10485760) -- the full capacity of the model will not be utilized
-llama_context:  ROCm_Host  output buffer size =     0.77 MiB
-llama_kv_cache_unified_iswa: creating non-SWA KV cache, size = 4096 cells
-llama_kv_cache_unified:      ROCm0 KV buffer size =   192.00 MiB
-llama_kv_cache_unified: size =  192.00 MiB (  4096 cells,  12 layers,  1/ 1 seqs), K (f16):   96.00 MiB, V (f16):   96.00 MiB
-llama_kv_cache_unified: LLAMA_SET_ROWS=0, using old ggml_cpy() method for backwards compatibility
-llama_kv_cache_unified_iswa: creating     SWA KV cache, size = 4096 cells
-llama_kv_cache_unified:      ROCm0 KV buffer size =   576.00 MiB
-llama_kv_cache_unified: size =  576.00 MiB (  4096 cells,  36 layers,  1/ 1 seqs), K (f16):  288.00 MiB, V (f16):  288.00 MiB
-llama_kv_cache_unified: LLAMA_SET_ROWS=0, using old ggml_cpy() method for backwards compatibility
-llama_context:      ROCm0 compute buffer size =   442.62 MiB
-llama_context:  ROCm_Host compute buffer size =    26.01 MiB
-llama_context: graph nodes  = 2420
-llama_context: graph splits = 2
-common_init_from_params: added <|end_of_text|> logit bias = -inf
-common_init_from_params: added <|eot|> logit bias = -inf
-common_init_from_params: setting dry_penalty_last_n to ctx_size = 4096
-common_init_from_params: warming up the model with an empty run - please wait ... (--no-warmup to disable)
-main: llama threadpool init, n_threads = 16
-
-system_info: n_threads = 16 (n_threads_batch = 16) / 32 | ROCm : NO_VMM = 1 | PEER_MAX_BATCH_SIZE = 128 | CPU : SSE3 = 1 | SSSE3 = 1 | AVX = 1 | AVX2 = 1 | F16C = 1 | FMA = 1 | BMI2 = 1 | AVX512 = 1 | AVX512_VBMI = 1 | AVX512_VNNI = 1 | LLAMAFILE = 1 | OPENMP = 1 | REPACK = 1 | 
-
-sampler seed: 1329865451
-sampler params: 
-	repeat_last_n = 64, repeat_penalty = 1.000, frequency_penalty = 0.000, presence_penalty = 0.000
-	dry_multiplier = 0.000, dry_base = 1.750, dry_allowed_length = 2, dry_penalty_last_n = 4096
-	top_k = 40, top_p = 0.950, min_p = 0.050, xtc_probability = 0.000, xtc_threshold = 0.100, typical_p = 1.000, top_n_sigma = -1.000, temp = 0.800
-	mirostat = 0, mirostat_lr = 0.100, mirostat_ent = 5.000
-sampler chain: logits -> logit-bias -> penalties -> dry -> top-n-sigma -> top-k -> typical -> top-p -> min-p -> xtc -> temp-ext -> dist 
-generate: n_ctx = 4096, n_batch = 2048, n_predict = 1, n_keep = 1
-
-Hello1
-
-llama_perf_sampler_print:    sampling time =       0.07 ms /     3 runs   (    0.02 ms per token, 44776.12 tokens per second)
-llama_perf_context_print:        load time =   27337.52 ms
-llama_perf_context_print: prompt eval time =     135.84 ms /     2 tokens (   67.92 ms per token,    14.72 tokens per second)
-llama_perf_context_print:        eval time =       0.00 ms /     1 runs   (    0.00 ms per token,      inf tokens per second)
-llama_perf_context_print:       total time =     155.35 ms /     3 tokens
-llama_perf_context_print:    graphs reused =          0
-    Elapsed #3: 28.220065203s
-    Run #3 status: 0
-  → Avg over 3 runs: 28.221s
@@ -1,179 +0,0 @@
-ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
-ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
-ggml_cuda_init: found 1 ROCm devices:
-  Device 0: AMD Radeon Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32
-build: 6066 (4cb208c9) with cc (GCC) 15.1.1 20250719 (Red Hat 15.1.1-5) for x86_64-redhat-linux
-main: llama backend init
-main: load the model and apply lora adapter, if any
-llama_model_load_from_file_impl: using device ROCm0 (AMD Radeon Graphics) - 124523 MiB free
-llama_model_loader: additional 1 GGUFs metadata loaded.
-llama_model_loader: loaded meta data with 51 key-value pairs and 628 tensors from /home/kyuz0/models/llama-4-scout-17b-16e/Q6_K/Llama-4-Scout-17B-16E-Instruct-Q6_K-00001-of-00002.gguf (version GGUF V3 (latest))
-llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
-llama_model_loader: - kv   0:                       general.architecture str              = llama4
-llama_model_loader: - kv   1:                               general.type str              = model
-llama_model_loader: - kv   2:                               general.name str              = Llama-4-Scout-17B-16E-Instruct
-llama_model_loader: - kv   3:                           general.finetune str              = 16E-Instruct
-llama_model_loader: - kv   4:                           general.basename str              = Llama-4-Scout-17B-16E-Instruct
-llama_model_loader: - kv   5:                       general.quantized_by str              = Unsloth
-llama_model_loader: - kv   6:                         general.size_label str              = 17B
-llama_model_loader: - kv   7:                            general.license str              = other
-llama_model_loader: - kv   8:                       general.license.name str              = llama4
-llama_model_loader: - kv   9:                           general.repo_url str              = https://huggingface.co/unsloth
-llama_model_loader: - kv  10:                   general.base_model.count u32              = 1
-llama_model_loader: - kv  11:                  general.base_model.0.name str              = Llama 4 Scout 17B 16E Instruct
-llama_model_loader: - kv  12:          general.base_model.0.organization str              = Meta Llama
-llama_model_loader: - kv  13:              general.base_model.0.repo_url str              = https://huggingface.co/meta-llama/Lla...
-llama_model_loader: - kv  14:                               general.tags arr[str,5]       = ["facebook", "meta", "pytorch", "llam...
-llama_model_loader: - kv  15:                          general.languages arr[str,12]      = ["ar", "de", "en", "es", "fr", "hi", ...
-llama_model_loader: - kv  16:                         llama4.block_count u32              = 48
-llama_model_loader: - kv  17:                      llama4.context_length u32              = 10485760
-llama_model_loader: - kv  18:                    llama4.embedding_length u32              = 5120
-llama_model_loader: - kv  19:                 llama4.feed_forward_length u32              = 16384
-llama_model_loader: - kv  20:                llama4.attention.head_count u32              = 40
-llama_model_loader: - kv  21:             llama4.attention.head_count_kv u32              = 8
-llama_model_loader: - kv  22:                      llama4.rope.freq_base f32              = 500000.000000
-llama_model_loader: - kv  23:    llama4.attention.layer_norm_rms_epsilon f32              = 0.000010
-llama_model_loader: - kv  24:                        llama4.expert_count u32              = 16
-llama_model_loader: - kv  25:                   llama4.expert_used_count u32              = 1
-llama_model_loader: - kv  26:                llama4.attention.key_length u32              = 128
-llama_model_loader: - kv  27:              llama4.attention.value_length u32              = 128
-llama_model_loader: - kv  28:                          llama4.vocab_size u32              = 202048
-llama_model_loader: - kv  29:                llama4.rope.dimension_count u32              = 128
-llama_model_loader: - kv  30:           llama4.interleave_moe_layer_step u32              = 1
-llama_model_loader: - kv  31:          llama4.expert_feed_forward_length u32              = 8192
-llama_model_loader: - kv  32:                       tokenizer.ggml.model str              = gpt2
-llama_model_loader: - kv  33:                         tokenizer.ggml.pre str              = llama4
-llama_model_loader: - kv  34:                      tokenizer.ggml.tokens arr[str,202048]  = ["À", "Á", "õ", "ö", "÷", "ø", ...
-llama_model_loader: - kv  35:                  tokenizer.ggml.token_type arr[i32,202048]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
-llama_model_loader: - kv  36:                      tokenizer.ggml.merges arr[str,439802]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
-llama_model_loader: - kv  37:                tokenizer.ggml.bos_token_id u32              = 200000
-llama_model_loader: - kv  38:                tokenizer.ggml.eos_token_id u32              = 200008
-llama_model_loader: - kv  39:            tokenizer.ggml.padding_token_id u32              = 200018
-llama_model_loader: - kv  40:               tokenizer.ggml.add_bos_token bool             = true
-llama_model_loader: - kv  41:                    tokenizer.chat_template str              = {{- bos_token }}\n{%- if custom_tools ...
-llama_model_loader: - kv  42:               general.quantization_version u32              = 2
-llama_model_loader: - kv  43:                          general.file_type u32              = 18
-llama_model_loader: - kv  44:                      quantize.imatrix.file str              = Llama-4-Scout-17B-16E-Instruct-GGUF/i...
-llama_model_loader: - kv  45:                   quantize.imatrix.dataset str              = unsloth_calibration_Llama-4-Scout-17B...
-llama_model_loader: - kv  46:             quantize.imatrix.entries_count u32              = 528
-llama_model_loader: - kv  47:              quantize.imatrix.chunks_count u32              = 729
-llama_model_loader: - kv  48:                                   split.no u16              = 0
-llama_model_loader: - kv  49:                        split.tensors.count i32              = 628
-llama_model_loader: - kv  50:                                split.count u16              = 2
-llama_model_loader: - type  f32:  146 tensors
-llama_model_loader: - type q6_K:  482 tensors
-print_info: file format = GGUF V3 (latest)
-print_info: file type   = Q6_K
-print_info: file size   = 82.35 GiB (6.56 BPW) 
-load: special_eos_id is not in special_eog_ids - the tokenizer config may be incorrect
-load: special tokens cache size = 1135
-load: token to piece cache size = 1.3873 MB
-print_info: arch             = llama4
-print_info: vocab_only       = 0
-print_info: n_ctx_train      = 10485760
-print_info: n_embd           = 5120
-print_info: n_layer          = 48
-print_info: n_head           = 40
-print_info: n_head_kv        = 8
-print_info: n_rot            = 128
-print_info: n_swa            = 8192
-print_info: is_swa_any       = 1
-print_info: n_embd_head_k    = 128
-print_info: n_embd_head_v    = 128
-print_info: n_gqa            = 5
-print_info: n_embd_k_gqa     = 1024
-print_info: n_embd_v_gqa     = 1024
-print_info: f_norm_eps       = 0.0e+00
-print_info: f_norm_rms_eps   = 1.0e-05
-print_info: f_clamp_kqv      = 0.0e+00
-print_info: f_max_alibi_bias = 0.0e+00
-print_info: f_logit_scale    = 0.0e+00
-print_info: f_attn_scale     = 0.0e+00
-print_info: n_ff             = 16384
-print_info: n_expert         = 16
-print_info: n_expert_used    = 1
-print_info: causal attn      = 1
-print_info: pooling type     = 0
-print_info: rope type        = 0
-print_info: rope scaling     = linear
-print_info: freq_base_train  = 500000.0
-print_info: freq_scale_train = 1
-print_info: n_ctx_orig_yarn  = 10485760
-print_info: rope_finetuned   = unknown
-print_info: model type       = 17Bx16E (Scout)
-print_info: model params     = 107.77 B
-print_info: general.name     = Llama-4-Scout-17B-16E-Instruct
-print_info: vocab type       = BPE
-print_info: n_vocab          = 202048
-print_info: n_merges         = 439802
-print_info: BOS token        = 200000 '<|begin_of_text|>'
-print_info: EOS token        = 200008 '<|eot|>'
-print_info: PAD token        = 200018 '<|finetune_right_pad|>'
-print_info: LF token         = 198 'Ċ'
-print_info: FIM PRE token    = 200002 '<|fim_prefix|>'
-print_info: FIM SUF token    = 200004 '<|fim_suffix|>'
-print_info: FIM MID token    = 200003 '<|fim_middle|>'
-print_info: EOG token        = 200001 '<|end_of_text|>'
-print_info: EOG token        = 200008 '<|eot|>'
-print_info: max token length = 192
-load_tensors: loading model tensors, this can take a while... (mmap = false)
-load_tensors: offloading 48 repeating layers to GPU
-load_tensors: offloading output layer to GPU
-load_tensors: offloaded 49/49 layers to GPU
-load_tensors:          CPU model buffer size =   809.29 MiB
-load_tensors:        ROCm0 model buffer size = 83513.68 MiB
-....................................................................................................
-llama_context: constructing llama_context
-llama_context: non-unified KV cache requires ggml_set_rows() - forcing unified KV cache
-llama_context: n_seq_max     = 1
-llama_context: n_ctx         = 4096
-llama_context: n_ctx_per_seq = 4096
-llama_context: n_batch       = 2048
-llama_context: n_ubatch      = 512
-llama_context: causal_attn   = 1
-llama_context: flash_attn    = 1
-llama_context: kv_unified    = true
-llama_context: freq_base     = 500000.0
-llama_context: freq_scale    = 1
-llama_context: n_ctx_per_seq (4096) < n_ctx_train (10485760) -- the full capacity of the model will not be utilized
-llama_context:  ROCm_Host  output buffer size =     0.77 MiB
-llama_kv_cache_unified_iswa: creating non-SWA KV cache, size = 4096 cells
-llama_kv_cache_unified:      ROCm0 KV buffer size =   192.00 MiB
-llama_kv_cache_unified: size =  192.00 MiB (  4096 cells,  12 layers,  1/ 1 seqs), K (f16):   96.00 MiB, V (f16):   96.00 MiB
-llama_kv_cache_unified: LLAMA_SET_ROWS=0, using old ggml_cpy() method for backwards compatibility
-llama_kv_cache_unified_iswa: creating     SWA KV cache, size = 4096 cells
-llama_kv_cache_unified:      ROCm0 KV buffer size =   576.00 MiB
-llama_kv_cache_unified: size =  576.00 MiB (  4096 cells,  36 layers,  1/ 1 seqs), K (f16):  288.00 MiB, V (f16):  288.00 MiB
-llama_kv_cache_unified: LLAMA_SET_ROWS=0, using old ggml_cpy() method for backwards compatibility
-llama_context:      ROCm0 compute buffer size =   442.62 MiB
-llama_context:  ROCm_Host compute buffer size =    26.01 MiB
-llama_context: graph nodes  = 2420
-llama_context: graph splits = 2
-common_init_from_params: added <|end_of_text|> logit bias = -inf
-common_init_from_params: added <|eot|> logit bias = -inf
-common_init_from_params: setting dry_penalty_last_n to ctx_size = 4096
-common_init_from_params: warming up the model with an empty run - please wait ... (--no-warmup to disable)
-main: llama threadpool init, n_threads = 16
-
-system_info: n_threads = 16 (n_threads_batch = 16) / 32 | ROCm : NO_VMM = 1 | PEER_MAX_BATCH_SIZE = 128 | CPU : SSE3 = 1 | SSSE3 = 1 | AVX = 1 | AVX2 = 1 | F16C = 1 | FMA = 1 | BMI2 = 1 | AVX512 = 1 | AVX512_VBMI = 1 | AVX512_VNNI = 1 | LLAMAFILE = 1 | OPENMP = 1 | REPACK = 1 | 
-
-sampler seed: 3194189125
-sampler params: 
-	repeat_last_n = 64, repeat_penalty = 1.000, frequency_penalty = 0.000, presence_penalty = 0.000
-	dry_multiplier = 0.000, dry_base = 1.750, dry_allowed_length = 2, dry_penalty_last_n = 4096
-	top_k = 40, top_p = 0.950, min_p = 0.050, xtc_probability = 0.000, xtc_threshold = 0.100, typical_p = 1.000, top_n_sigma = -1.000, temp = 0.800
-	mirostat = 0, mirostat_lr = 0.100, mirostat_ent = 5.000
-sampler chain: logits -> logit-bias -> penalties -> dry -> top-n-sigma -> top-k -> typical -> top-p -> min-p -> xtc -> temp-ext -> dist 
-generate: n_ctx = 4096, n_batch = 2048, n_predict = 1, n_keep = 1
-
-Hello:
-
-llama_perf_sampler_print:    sampling time =       0.07 ms /     3 runs   (    0.02 ms per token, 46153.85 tokens per second)
-llama_perf_context_print:        load time =   26424.61 ms
-llama_perf_context_print: prompt eval time =     106.73 ms /     2 tokens (   53.37 ms per token,    18.74 tokens per second)
-llama_perf_context_print:        eval time =       0.00 ms /     1 runs   (    0.00 ms per token,      inf tokens per second)
-llama_perf_context_print:       total time =     126.53 ms /     3 tokens
-llama_perf_context_print:    graphs reused =          0
-    Elapsed #3: 27.353142250s
-    Run #3 status: 0
-  → Avg over 3 runs: 28.435s
@@ -1,177 +0,0 @@
-ggml_vulkan: Found 1 Vulkan devices:
-ggml_vulkan: 0 = Radeon 8060S Graphics (AMD open-source driver) | uma: 1 | fp16: 1 | bf16: 0 | warp size: 64 | shared memory: 32768 | int dot: 1 | matrix cores: KHR_coopmat
-build: 6060 (9c35706b) with cc (GCC) 15.1.1 20250719 (Red Hat 15.1.1-5) for x86_64-redhat-linux
-main: llama backend init
-main: load the model and apply lora adapter, if any
-llama_model_load_from_file_impl: using device Vulkan0 (Radeon 8060S Graphics) - 85720 MiB free
-llama_model_loader: additional 1 GGUFs metadata loaded.
-llama_model_loader: loaded meta data with 51 key-value pairs and 628 tensors from /home/kyuz0/models/llama-4-scout-17b-16e/Q6_K/Llama-4-Scout-17B-16E-Instruct-Q6_K-00001-of-00002.gguf (version GGUF V3 (latest))
-llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
-llama_model_loader: - kv   0:                       general.architecture str              = llama4
-llama_model_loader: - kv   1:                               general.type str              = model
-llama_model_loader: - kv   2:                               general.name str              = Llama-4-Scout-17B-16E-Instruct
-llama_model_loader: - kv   3:                           general.finetune str              = 16E-Instruct
-llama_model_loader: - kv   4:                           general.basename str              = Llama-4-Scout-17B-16E-Instruct
-llama_model_loader: - kv   5:                       general.quantized_by str              = Unsloth
-llama_model_loader: - kv   6:                         general.size_label str              = 17B
-llama_model_loader: - kv   7:                            general.license str              = other
-llama_model_loader: - kv   8:                       general.license.name str              = llama4
-llama_model_loader: - kv   9:                           general.repo_url str              = https://huggingface.co/unsloth
-llama_model_loader: - kv  10:                   general.base_model.count u32              = 1
-llama_model_loader: - kv  11:                  general.base_model.0.name str              = Llama 4 Scout 17B 16E Instruct
-llama_model_loader: - kv  12:          general.base_model.0.organization str              = Meta Llama
-llama_model_loader: - kv  13:              general.base_model.0.repo_url str              = https://huggingface.co/meta-llama/Lla...
-llama_model_loader: - kv  14:                               general.tags arr[str,5]       = ["facebook", "meta", "pytorch", "llam...
-llama_model_loader: - kv  15:                          general.languages arr[str,12]      = ["ar", "de", "en", "es", "fr", "hi", ...
-llama_model_loader: - kv  16:                         llama4.block_count u32              = 48
-llama_model_loader: - kv  17:                      llama4.context_length u32              = 10485760
-llama_model_loader: - kv  18:                    llama4.embedding_length u32              = 5120
-llama_model_loader: - kv  19:                 llama4.feed_forward_length u32              = 16384
-llama_model_loader: - kv  20:                llama4.attention.head_count u32              = 40
-llama_model_loader: - kv  21:             llama4.attention.head_count_kv u32              = 8
-llama_model_loader: - kv  22:                      llama4.rope.freq_base f32              = 500000.000000
-llama_model_loader: - kv  23:    llama4.attention.layer_norm_rms_epsilon f32              = 0.000010
-llama_model_loader: - kv  24:                        llama4.expert_count u32              = 16
-llama_model_loader: - kv  25:                   llama4.expert_used_count u32              = 1
-llama_model_loader: - kv  26:                llama4.attention.key_length u32              = 128
-llama_model_loader: - kv  27:              llama4.attention.value_length u32              = 128
-llama_model_loader: - kv  28:                          llama4.vocab_size u32              = 202048
-llama_model_loader: - kv  29:                llama4.rope.dimension_count u32              = 128
-llama_model_loader: - kv  30:           llama4.interleave_moe_layer_step u32              = 1
-llama_model_loader: - kv  31:          llama4.expert_feed_forward_length u32              = 8192
-llama_model_loader: - kv  32:                       tokenizer.ggml.model str              = gpt2
-llama_model_loader: - kv  33:                         tokenizer.ggml.pre str              = llama4
-llama_model_loader: - kv  34:                      tokenizer.ggml.tokens arr[str,202048]  = ["À", "Á", "õ", "ö", "÷", "ø", ...
-llama_model_loader: - kv  35:                  tokenizer.ggml.token_type arr[i32,202048]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
-llama_model_loader: - kv  36:                      tokenizer.ggml.merges arr[str,439802]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
-llama_model_loader: - kv  37:                tokenizer.ggml.bos_token_id u32              = 200000
-llama_model_loader: - kv  38:                tokenizer.ggml.eos_token_id u32              = 200008
-llama_model_loader: - kv  39:            tokenizer.ggml.padding_token_id u32              = 200018
-llama_model_loader: - kv  40:               tokenizer.ggml.add_bos_token bool             = true
-llama_model_loader: - kv  41:                    tokenizer.chat_template str              = {{- bos_token }}\n{%- if custom_tools ...
-llama_model_loader: - kv  42:               general.quantization_version u32              = 2
-llama_model_loader: - kv  43:                          general.file_type u32              = 18
-llama_model_loader: - kv  44:                      quantize.imatrix.file str              = Llama-4-Scout-17B-16E-Instruct-GGUF/i...
-llama_model_loader: - kv  45:                   quantize.imatrix.dataset str              = unsloth_calibration_Llama-4-Scout-17B...
-llama_model_loader: - kv  46:             quantize.imatrix.entries_count u32              = 528
-llama_model_loader: - kv  47:              quantize.imatrix.chunks_count u32              = 729
-llama_model_loader: - kv  48:                                   split.no u16              = 0
-llama_model_loader: - kv  49:                        split.tensors.count i32              = 628
-llama_model_loader: - kv  50:                                split.count u16              = 2
-llama_model_loader: - type  f32:  146 tensors
-llama_model_loader: - type q6_K:  482 tensors
-print_info: file format = GGUF V3 (latest)
-print_info: file type   = Q6_K
-print_info: file size   = 82.35 GiB (6.56 BPW) 
-load: special_eos_id is not in special_eog_ids - the tokenizer config may be incorrect
-load: special tokens cache size = 1135
-load: token to piece cache size = 1.3873 MB
-print_info: arch             = llama4
-print_info: vocab_only       = 0
-print_info: n_ctx_train      = 10485760
-print_info: n_embd           = 5120
-print_info: n_layer          = 48
-print_info: n_head           = 40
-print_info: n_head_kv        = 8
-print_info: n_rot            = 128
-print_info: n_swa            = 8192
-print_info: is_swa_any       = 1
-print_info: n_embd_head_k    = 128
-print_info: n_embd_head_v    = 128
-print_info: n_gqa            = 5
-print_info: n_embd_k_gqa     = 1024
-print_info: n_embd_v_gqa     = 1024
-print_info: f_norm_eps       = 0.0e+00
-print_info: f_norm_rms_eps   = 1.0e-05
-print_info: f_clamp_kqv      = 0.0e+00
-print_info: f_max_alibi_bias = 0.0e+00
-print_info: f_logit_scale    = 0.0e+00
-print_info: f_attn_scale     = 0.0e+00
-print_info: n_ff             = 16384
-print_info: n_expert         = 16
-print_info: n_expert_used    = 1
-print_info: causal attn      = 1
-print_info: pooling type     = 0
-print_info: rope type        = 0
-print_info: rope scaling     = linear
-print_info: freq_base_train  = 500000.0
-print_info: freq_scale_train = 1
-print_info: n_ctx_orig_yarn  = 10485760
-print_info: rope_finetuned   = unknown
-print_info: model type       = 17Bx16E (Scout)
-print_info: model params     = 107.77 B
-print_info: general.name     = Llama-4-Scout-17B-16E-Instruct
-print_info: vocab type       = BPE
-print_info: n_vocab          = 202048
-print_info: n_merges         = 439802
-print_info: BOS token        = 200000 '<|begin_of_text|>'
-print_info: EOS token        = 200008 '<|eot|>'
-print_info: PAD token        = 200018 '<|finetune_right_pad|>'
-print_info: LF token         = 198 'Ċ'
-print_info: FIM PRE token    = 200002 '<|fim_prefix|>'
-print_info: FIM SUF token    = 200004 '<|fim_suffix|>'
-print_info: FIM MID token    = 200003 '<|fim_middle|>'
-print_info: EOG token        = 200001 '<|end_of_text|>'
-print_info: EOG token        = 200008 '<|eot|>'
-print_info: max token length = 192
-load_tensors: loading model tensors, this can take a while... (mmap = false)
-load_tensors: offloading 48 repeating layers to GPU
-load_tensors: offloading output layer to GPU
-load_tensors: offloaded 49/49 layers to GPU
-load_tensors:      Vulkan0 model buffer size = 83513.68 MiB
-load_tensors:          CPU model buffer size =   809.29 MiB
-....................................................................................................
-llama_context: constructing llama_context
-llama_context: non-unified KV cache requires ggml_set_rows() - forcing unified KV cache
-llama_context: n_seq_max     = 1
-llama_context: n_ctx         = 4096
-llama_context: n_ctx_per_seq = 4096
-llama_context: n_batch       = 2048
-llama_context: n_ubatch      = 512
-llama_context: causal_attn   = 1
-llama_context: flash_attn    = 1
-llama_context: kv_unified    = true
-llama_context: freq_base     = 500000.0
-llama_context: freq_scale    = 1
-llama_context: n_ctx_per_seq (4096) < n_ctx_train (10485760) -- the full capacity of the model will not be utilized
-llama_context: Vulkan_Host  output buffer size =     0.77 MiB
-llama_kv_cache_unified_iswa: creating non-SWA KV cache, size = 4096 cells
-llama_kv_cache_unified:    Vulkan0 KV buffer size =   192.00 MiB
-llama_kv_cache_unified: size =  192.00 MiB (  4096 cells,  12 layers,  1/ 1 seqs), K (f16):   96.00 MiB, V (f16):   96.00 MiB
-llama_kv_cache_unified: LLAMA_SET_ROWS=0, using old ggml_cpy() method for backwards compatibility
-llama_kv_cache_unified_iswa: creating     SWA KV cache, size = 4096 cells
-llama_kv_cache_unified:    Vulkan0 KV buffer size =   576.00 MiB
-llama_kv_cache_unified: size =  576.00 MiB (  4096 cells,  36 layers,  1/ 1 seqs), K (f16):  288.00 MiB, V (f16):  288.00 MiB
-llama_kv_cache_unified: LLAMA_SET_ROWS=0, using old ggml_cpy() method for backwards compatibility
-llama_context:    Vulkan0 compute buffer size =   440.63 MiB
-llama_context: Vulkan_Host compute buffer size =    26.01 MiB
-llama_context: graph nodes  = 2420
-llama_context: graph splits = 2
-common_init_from_params: added <|end_of_text|> logit bias = -inf
-common_init_from_params: added <|eot|> logit bias = -inf
-common_init_from_params: setting dry_penalty_last_n to ctx_size = 4096
-common_init_from_params: warming up the model with an empty run - please wait ... (--no-warmup to disable)
-main: llama threadpool init, n_threads = 16
-
-system_info: n_threads = 16 (n_threads_batch = 16) / 32 | CPU : SSE3 = 1 | SSSE3 = 1 | AVX = 1 | AVX2 = 1 | F16C = 1 | FMA = 1 | BMI2 = 1 | AVX512 = 1 | AVX512_VBMI = 1 | AVX512_VNNI = 1 | LLAMAFILE = 1 | OPENMP = 1 | REPACK = 1 | 
-
-sampler seed: 4111748233
-sampler params: 
-	repeat_last_n = 64, repeat_penalty = 1.000, frequency_penalty = 0.000, presence_penalty = 0.000
-	dry_multiplier = 0.000, dry_base = 1.750, dry_allowed_length = 2, dry_penalty_last_n = 4096
-	top_k = 40, top_p = 0.950, min_p = 0.050, xtc_probability = 0.000, xtc_threshold = 0.100, typical_p = 1.000, top_n_sigma = -1.000, temp = 0.800
-	mirostat = 0, mirostat_lr = 0.100, mirostat_ent = 5.000
-sampler chain: logits -> logit-bias -> penalties -> dry -> top-n-sigma -> top-k -> typical -> top-p -> min-p -> xtc -> temp-ext -> dist 
-generate: n_ctx = 4096, n_batch = 2048, n_predict = 1, n_keep = 1
-
-Hello:
-
-llama_perf_sampler_print:    sampling time =       0.15 ms /     3 runs   (    0.05 ms per token, 20134.23 tokens per second)
-llama_perf_context_print:        load time =   31375.27 ms
-llama_perf_context_print: prompt eval time =     267.76 ms /     2 tokens (  133.88 ms per token,     7.47 tokens per second)
-llama_perf_context_print:        eval time =       0.00 ms /     1 runs   (    0.00 ms per token,      inf tokens per second)
-llama_perf_context_print:       total time =     295.92 ms /     3 tokens
-llama_perf_context_print:    graphs reused =          0
-    Elapsed #3: 33.122388042s
-    Run #3 status: 0
-  → Avg over 3 runs: 35.541s
@@ -1,177 +0,0 @@
-ggml_vulkan: Found 1 Vulkan devices:
-ggml_vulkan: 0 = Radeon 8060S Graphics (RADV GFX1151) (radv) | uma: 1 | fp16: 1 | bf16: 0 | warp size: 64 | shared memory: 65536 | int dot: 1 | matrix cores: KHR_coopmat
-build: 6040 (66625a59) with cc (GCC) 15.1.1 20250719 (Red Hat 15.1.1-5) for x86_64-redhat-linux
-main: llama backend init
-main: load the model and apply lora adapter, if any
-llama_model_load_from_file_impl: using device Vulkan0 (Radeon 8060S Graphics (RADV GFX1151)) - 87722 MiB free
-llama_model_loader: additional 1 GGUFs metadata loaded.
-llama_model_loader: loaded meta data with 51 key-value pairs and 628 tensors from /home/kyuz0/models/llama-4-scout-17b-16e/Q6_K/Llama-4-Scout-17B-16E-Instruct-Q6_K-00001-of-00002.gguf (version GGUF V3 (latest))
-llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
-llama_model_loader: - kv   0:                       general.architecture str              = llama4
-llama_model_loader: - kv   1:                               general.type str              = model
-llama_model_loader: - kv   2:                               general.name str              = Llama-4-Scout-17B-16E-Instruct
-llama_model_loader: - kv   3:                           general.finetune str              = 16E-Instruct
-llama_model_loader: - kv   4:                           general.basename str              = Llama-4-Scout-17B-16E-Instruct
-llama_model_loader: - kv   5:                       general.quantized_by str              = Unsloth
-llama_model_loader: - kv   6:                         general.size_label str              = 17B
-llama_model_loader: - kv   7:                            general.license str              = other
-llama_model_loader: - kv   8:                       general.license.name str              = llama4
-llama_model_loader: - kv   9:                           general.repo_url str              = https://huggingface.co/unsloth
-llama_model_loader: - kv  10:                   general.base_model.count u32              = 1
-llama_model_loader: - kv  11:                  general.base_model.0.name str              = Llama 4 Scout 17B 16E Instruct
-llama_model_loader: - kv  12:          general.base_model.0.organization str              = Meta Llama
-llama_model_loader: - kv  13:              general.base_model.0.repo_url str              = https://huggingface.co/meta-llama/Lla...
-llama_model_loader: - kv  14:                               general.tags arr[str,5]       = ["facebook", "meta", "pytorch", "llam...
-llama_model_loader: - kv  15:                          general.languages arr[str,12]      = ["ar", "de", "en", "es", "fr", "hi", ...
-llama_model_loader: - kv  16:                         llama4.block_count u32              = 48
-llama_model_loader: - kv  17:                      llama4.context_length u32              = 10485760
-llama_model_loader: - kv  18:                    llama4.embedding_length u32              = 5120
-llama_model_loader: - kv  19:                 llama4.feed_forward_length u32              = 16384
-llama_model_loader: - kv  20:                llama4.attention.head_count u32              = 40
-llama_model_loader: - kv  21:             llama4.attention.head_count_kv u32              = 8
-llama_model_loader: - kv  22:                      llama4.rope.freq_base f32              = 500000.000000
-llama_model_loader: - kv  23:    llama4.attention.layer_norm_rms_epsilon f32              = 0.000010
-llama_model_loader: - kv  24:                        llama4.expert_count u32              = 16
-llama_model_loader: - kv  25:                   llama4.expert_used_count u32              = 1
-llama_model_loader: - kv  26:                llama4.attention.key_length u32              = 128
-llama_model_loader: - kv  27:              llama4.attention.value_length u32              = 128
-llama_model_loader: - kv  28:                          llama4.vocab_size u32              = 202048
-llama_model_loader: - kv  29:                llama4.rope.dimension_count u32              = 128
-llama_model_loader: - kv  30:           llama4.interleave_moe_layer_step u32              = 1
-llama_model_loader: - kv  31:          llama4.expert_feed_forward_length u32              = 8192
-llama_model_loader: - kv  32:                       tokenizer.ggml.model str              = gpt2
-llama_model_loader: - kv  33:                         tokenizer.ggml.pre str              = llama4
-llama_model_loader: - kv  34:                      tokenizer.ggml.tokens arr[str,202048]  = ["À", "Á", "õ", "ö", "÷", "ø", ...
-llama_model_loader: - kv  35:                  tokenizer.ggml.token_type arr[i32,202048]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
-llama_model_loader: - kv  36:                      tokenizer.ggml.merges arr[str,439802]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
-llama_model_loader: - kv  37:                tokenizer.ggml.bos_token_id u32              = 200000
-llama_model_loader: - kv  38:                tokenizer.ggml.eos_token_id u32              = 200008
-llama_model_loader: - kv  39:            tokenizer.ggml.padding_token_id u32              = 200018
-llama_model_loader: - kv  40:               tokenizer.ggml.add_bos_token bool             = true
-llama_model_loader: - kv  41:                    tokenizer.chat_template str              = {{- bos_token }}\n{%- if custom_tools ...
-llama_model_loader: - kv  42:               general.quantization_version u32              = 2
-llama_model_loader: - kv  43:                          general.file_type u32              = 18
-llama_model_loader: - kv  44:                      quantize.imatrix.file str              = Llama-4-Scout-17B-16E-Instruct-GGUF/i...
-llama_model_loader: - kv  45:                   quantize.imatrix.dataset str              = unsloth_calibration_Llama-4-Scout-17B...
-llama_model_loader: - kv  46:             quantize.imatrix.entries_count u32              = 528
-llama_model_loader: - kv  47:              quantize.imatrix.chunks_count u32              = 729
-llama_model_loader: - kv  48:                                   split.no u16              = 0
-llama_model_loader: - kv  49:                        split.tensors.count i32              = 628
-llama_model_loader: - kv  50:                                split.count u16              = 2
-llama_model_loader: - type  f32:  146 tensors
-llama_model_loader: - type q6_K:  482 tensors
-print_info: file format = GGUF V3 (latest)
-print_info: file type   = Q6_K
-print_info: file size   = 82.35 GiB (6.56 BPW) 
-load: special_eos_id is not in special_eog_ids - the tokenizer config may be incorrect
-load: special tokens cache size = 1135
-load: token to piece cache size = 1.3873 MB
-print_info: arch             = llama4
-print_info: vocab_only       = 0
-print_info: n_ctx_train      = 10485760
-print_info: n_embd           = 5120
-print_info: n_layer          = 48
-print_info: n_head           = 40
-print_info: n_head_kv        = 8
-print_info: n_rot            = 128
-print_info: n_swa            = 8192
-print_info: is_swa_any       = 1
-print_info: n_embd_head_k    = 128
-print_info: n_embd_head_v    = 128
-print_info: n_gqa            = 5
-print_info: n_embd_k_gqa     = 1024
-print_info: n_embd_v_gqa     = 1024
-print_info: f_norm_eps       = 0.0e+00
-print_info: f_norm_rms_eps   = 1.0e-05
-print_info: f_clamp_kqv      = 0.0e+00
-print_info: f_max_alibi_bias = 0.0e+00
-print_info: f_logit_scale    = 0.0e+00
-print_info: f_attn_scale     = 0.0e+00
-print_info: n_ff             = 16384
-print_info: n_expert         = 16
-print_info: n_expert_used    = 1
-print_info: causal attn      = 1
-print_info: pooling type     = 0
-print_info: rope type        = 0
-print_info: rope scaling     = linear
-print_info: freq_base_train  = 500000.0
-print_info: freq_scale_train = 1
-print_info: n_ctx_orig_yarn  = 10485760
-print_info: rope_finetuned   = unknown
-print_info: model type       = 17Bx16E (Scout)
-print_info: model params     = 107.77 B
-print_info: general.name     = Llama-4-Scout-17B-16E-Instruct
-print_info: vocab type       = BPE
-print_info: n_vocab          = 202048
-print_info: n_merges         = 439802
-print_info: BOS token        = 200000 '<|begin_of_text|>'
-print_info: EOS token        = 200008 '<|eot|>'
-print_info: PAD token        = 200018 '<|finetune_right_pad|>'
-print_info: LF token         = 198 'Ċ'
-print_info: FIM PRE token    = 200002 '<|fim_prefix|>'
-print_info: FIM SUF token    = 200004 '<|fim_suffix|>'
-print_info: FIM MID token    = 200003 '<|fim_middle|>'
-print_info: EOG token        = 200001 '<|end_of_text|>'
-print_info: EOG token        = 200008 '<|eot|>'
-print_info: max token length = 192
-load_tensors: loading model tensors, this can take a while... (mmap = false)
-load_tensors: offloading 48 repeating layers to GPU
-load_tensors: offloading output layer to GPU
-load_tensors: offloaded 49/49 layers to GPU
-load_tensors:      Vulkan0 model buffer size = 83513.68 MiB
-load_tensors:          CPU model buffer size =   809.29 MiB
-....................................................................................................
-llama_context: constructing llama_context
-llama_context: non-unified KV cache requires ggml_set_rows() - forcing unified KV cache
-llama_context: n_seq_max     = 1
-llama_context: n_ctx         = 4096
-llama_context: n_ctx_per_seq = 4096
-llama_context: n_batch       = 2048
-llama_context: n_ubatch      = 512
-llama_context: causal_attn   = 1
-llama_context: flash_attn    = 1
-llama_context: kv_unified    = true
-llama_context: freq_base     = 500000.0
-llama_context: freq_scale    = 1
-llama_context: n_ctx_per_seq (4096) < n_ctx_train (10485760) -- the full capacity of the model will not be utilized
-llama_context: Vulkan_Host  output buffer size =     0.77 MiB
-llama_kv_cache_unified_iswa: creating non-SWA KV cache, size = 4096 cells
-llama_kv_cache_unified:    Vulkan0 KV buffer size =   192.00 MiB
-llama_kv_cache_unified: size =  192.00 MiB (  4096 cells,  12 layers,  1/ 1 seqs), K (f16):   96.00 MiB, V (f16):   96.00 MiB
-llama_kv_cache_unified: LLAMA_SET_ROWS=0, using old ggml_cpy() method for backwards compatibility
-llama_kv_cache_unified_iswa: creating     SWA KV cache, size = 4096 cells
-llama_kv_cache_unified:    Vulkan0 KV buffer size =   576.00 MiB
-llama_kv_cache_unified: size =  576.00 MiB (  4096 cells,  36 layers,  1/ 1 seqs), K (f16):  288.00 MiB, V (f16):  288.00 MiB
-llama_kv_cache_unified: LLAMA_SET_ROWS=0, using old ggml_cpy() method for backwards compatibility
-llama_context:    Vulkan0 compute buffer size =   440.63 MiB
-llama_context: Vulkan_Host compute buffer size =    26.02 MiB
-llama_context: graph nodes  = 2420
-llama_context: graph splits = 2
-common_init_from_params: added <|end_of_text|> logit bias = -inf
-common_init_from_params: added <|eot|> logit bias = -inf
-common_init_from_params: setting dry_penalty_last_n to ctx_size = 4096
-common_init_from_params: warming up the model with an empty run - please wait ... (--no-warmup to disable)
-main: llama threadpool init, n_threads = 16
-
-system_info: n_threads = 16 (n_threads_batch = 16) / 32 | CPU : SSE3 = 1 | SSSE3 = 1 | AVX = 1 | AVX2 = 1 | F16C = 1 | FMA = 1 | BMI2 = 1 | AVX512 = 1 | AVX512_VBMI = 1 | AVX512_VNNI = 1 | LLAMAFILE = 1 | OPENMP = 1 | REPACK = 1 | 
-
-sampler seed: 1422642604
-sampler params: 
-	repeat_last_n = 64, repeat_penalty = 1.000, frequency_penalty = 0.000, presence_penalty = 0.000
-	dry_multiplier = 0.000, dry_base = 1.750, dry_allowed_length = 2, dry_penalty_last_n = 4096
-	top_k = 40, top_p = 0.950, min_p = 0.050, xtc_probability = 0.000, xtc_threshold = 0.100, typical_p = 1.000, top_n_sigma = -1.000, temp = 0.800
-	mirostat = 0, mirostat_lr = 0.100, mirostat_ent = 5.000
-sampler chain: logits -> logit-bias -> penalties -> dry -> top-n-sigma -> top-k -> typical -> top-p -> min-p -> xtc -> temp-ext -> dist 
-generate: n_ctx = 4096, n_batch = 2048, n_predict = 1, n_keep = 1
-
-Hello1
-
-llama_perf_sampler_print:    sampling time =       0.09 ms /     3 runs   (    0.03 ms per token, 32967.03 tokens per second)
-llama_perf_context_print:        load time =   32072.23 ms
-llama_perf_context_print: prompt eval time =     296.78 ms /     2 tokens (  148.39 ms per token,     6.74 tokens per second)
-llama_perf_context_print:        eval time =       0.00 ms /     1 runs   (    0.00 ms per token,      inf tokens per second)
-llama_perf_context_print:       total time =     324.57 ms /     3 tokens
-llama_perf_context_print:    graphs reused =          0
-    Elapsed #3: 32.859879045s
-    Run #3 status: 0
-  → Avg over 3 runs: 32.810s
@@ -1,179 +0,0 @@
-ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
-ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
-ggml_cuda_init: found 1 ROCm devices:
-  Device 0: Radeon 8060S Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32
-build: 6040 (66625a59) with cc (GCC) 15.1.1 20250521 (Red Hat 15.1.1-2) for x86_64-redhat-linux
-main: llama backend init
-main: load the model and apply lora adapter, if any
-llama_model_load_from_file_impl: using device ROCm0 (Radeon 8060S Graphics) - 124522 MiB free
-llama_model_loader: additional 2 GGUFs metadata loaded.
-llama_model_loader: loaded meta data with 51 key-value pairs and 628 tensors from /home/kyuz0/models/llama-4-scout-17b-16e/Q8_0/Llama-4-Scout-17B-16E-Instruct-Q8_0-00001-of-00003.gguf (version GGUF V3 (latest))
-llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
-llama_model_loader: - kv   0:                       general.architecture str              = llama4
-llama_model_loader: - kv   1:                               general.type str              = model
-llama_model_loader: - kv   2:                               general.name str              = Llama-4-Scout-17B-16E-Instruct
-llama_model_loader: - kv   3:                           general.finetune str              = 16E-Instruct
-llama_model_loader: - kv   4:                           general.basename str              = Llama-4-Scout-17B-16E-Instruct
-llama_model_loader: - kv   5:                       general.quantized_by str              = Unsloth
-llama_model_loader: - kv   6:                         general.size_label str              = 17B
-llama_model_loader: - kv   7:                            general.license str              = other
-llama_model_loader: - kv   8:                       general.license.name str              = llama4
-llama_model_loader: - kv   9:                           general.repo_url str              = https://huggingface.co/unsloth
-llama_model_loader: - kv  10:                   general.base_model.count u32              = 1
-llama_model_loader: - kv  11:                  general.base_model.0.name str              = Llama 4 Scout 17B 16E Instruct
-llama_model_loader: - kv  12:          general.base_model.0.organization str              = Meta Llama
-llama_model_loader: - kv  13:              general.base_model.0.repo_url str              = https://huggingface.co/meta-llama/Lla...
-llama_model_loader: - kv  14:                               general.tags arr[str,5]       = ["facebook", "meta", "pytorch", "llam...
-llama_model_loader: - kv  15:                          general.languages arr[str,12]      = ["ar", "de", "en", "es", "fr", "hi", ...
-llama_model_loader: - kv  16:                         llama4.block_count u32              = 48
-llama_model_loader: - kv  17:                      llama4.context_length u32              = 10485760
-llama_model_loader: - kv  18:                    llama4.embedding_length u32              = 5120
-llama_model_loader: - kv  19:                 llama4.feed_forward_length u32              = 16384
-llama_model_loader: - kv  20:                llama4.attention.head_count u32              = 40
-llama_model_loader: - kv  21:             llama4.attention.head_count_kv u32              = 8
-llama_model_loader: - kv  22:                      llama4.rope.freq_base f32              = 500000.000000
-llama_model_loader: - kv  23:    llama4.attention.layer_norm_rms_epsilon f32              = 0.000010
-llama_model_loader: - kv  24:                        llama4.expert_count u32              = 16
-llama_model_loader: - kv  25:                   llama4.expert_used_count u32              = 1
-llama_model_loader: - kv  26:                llama4.attention.key_length u32              = 128
-llama_model_loader: - kv  27:              llama4.attention.value_length u32              = 128
-llama_model_loader: - kv  28:                          llama4.vocab_size u32              = 202048
-llama_model_loader: - kv  29:                llama4.rope.dimension_count u32              = 128
-llama_model_loader: - kv  30:           llama4.interleave_moe_layer_step u32              = 1
-llama_model_loader: - kv  31:          llama4.expert_feed_forward_length u32              = 8192
-llama_model_loader: - kv  32:                       tokenizer.ggml.model str              = gpt2
-llama_model_loader: - kv  33:                         tokenizer.ggml.pre str              = llama4
-llama_model_loader: - kv  34:                      tokenizer.ggml.tokens arr[str,202048]  = ["À", "Á", "õ", "ö", "÷", "ø", ...
-llama_model_loader: - kv  35:                  tokenizer.ggml.token_type arr[i32,202048]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
-llama_model_loader: - kv  36:                      tokenizer.ggml.merges arr[str,439802]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
-llama_model_loader: - kv  37:                tokenizer.ggml.bos_token_id u32              = 200000
-llama_model_loader: - kv  38:                tokenizer.ggml.eos_token_id u32              = 200008
-llama_model_loader: - kv  39:            tokenizer.ggml.padding_token_id u32              = 200018
-llama_model_loader: - kv  40:               tokenizer.ggml.add_bos_token bool             = true
-llama_model_loader: - kv  41:                    tokenizer.chat_template str              = {{- bos_token }}\n{%- if custom_tools ...
-llama_model_loader: - kv  42:               general.quantization_version u32              = 2
-llama_model_loader: - kv  43:                          general.file_type u32              = 7
-llama_model_loader: - kv  44:                      quantize.imatrix.file str              = Llama-4-Scout-17B-16E-Instruct-GGUF/i...
-llama_model_loader: - kv  45:                   quantize.imatrix.dataset str              = unsloth_calibration_Llama-4-Scout-17B...
-llama_model_loader: - kv  46:             quantize.imatrix.entries_count u32              = 528
-llama_model_loader: - kv  47:              quantize.imatrix.chunks_count u32              = 729
-llama_model_loader: - kv  48:                                   split.no u16              = 0
-llama_model_loader: - kv  49:                        split.tensors.count i32              = 628
-llama_model_loader: - kv  50:                                split.count u16              = 3
-llama_model_loader: - type  f32:  146 tensors
-llama_model_loader: - type q8_0:  482 tensors
-print_info: file format = GGUF V3 (latest)
-print_info: file type   = Q8_0
-print_info: file size   = 106.65 GiB (8.50 BPW) 
-load: special_eos_id is not in special_eog_ids - the tokenizer config may be incorrect
-load: special tokens cache size = 1135
-load: token to piece cache size = 1.3873 MB
-print_info: arch             = llama4
-print_info: vocab_only       = 0
-print_info: n_ctx_train      = 10485760
-print_info: n_embd           = 5120
-print_info: n_layer          = 48
-print_info: n_head           = 40
-print_info: n_head_kv        = 8
-print_info: n_rot            = 128
-print_info: n_swa            = 8192
-print_info: is_swa_any       = 1
-print_info: n_embd_head_k    = 128
-print_info: n_embd_head_v    = 128
-print_info: n_gqa            = 5
-print_info: n_embd_k_gqa     = 1024
-print_info: n_embd_v_gqa     = 1024
-print_info: f_norm_eps       = 0.0e+00
-print_info: f_norm_rms_eps   = 1.0e-05
-print_info: f_clamp_kqv      = 0.0e+00
-print_info: f_max_alibi_bias = 0.0e+00
-print_info: f_logit_scale    = 0.0e+00
-print_info: f_attn_scale     = 0.0e+00
-print_info: n_ff             = 16384
-print_info: n_expert         = 16
-print_info: n_expert_used    = 1
-print_info: causal attn      = 1
-print_info: pooling type     = 0
-print_info: rope type        = 0
-print_info: rope scaling     = linear
-print_info: freq_base_train  = 500000.0
-print_info: freq_scale_train = 1
-print_info: n_ctx_orig_yarn  = 10485760
-print_info: rope_finetuned   = unknown
-print_info: model type       = 17Bx16E (Scout)
-print_info: model params     = 107.77 B
-print_info: general.name     = Llama-4-Scout-17B-16E-Instruct
-print_info: vocab type       = BPE
-print_info: n_vocab          = 202048
-print_info: n_merges         = 439802
-print_info: BOS token        = 200000 '<|begin_of_text|>'
-print_info: EOS token        = 200008 '<|eot|>'
-print_info: PAD token        = 200018 '<|finetune_right_pad|>'
-print_info: LF token         = 198 'Ċ'
-print_info: FIM PRE token    = 200002 '<|fim_prefix|>'
-print_info: FIM SUF token    = 200004 '<|fim_suffix|>'
-print_info: FIM MID token    = 200003 '<|fim_middle|>'
-print_info: EOG token        = 200001 '<|end_of_text|>'
-print_info: EOG token        = 200008 '<|eot|>'
-print_info: max token length = 192
-load_tensors: loading model tensors, this can take a while... (mmap = false)
-load_tensors: offloading 48 repeating layers to GPU
-load_tensors: offloading output layer to GPU
-load_tensors: offloaded 49/49 layers to GPU
-load_tensors:        ROCm0 model buffer size = 108165.12 MiB
-load_tensors:    ROCm_Host model buffer size =  1048.22 MiB
-....................................................................................................
-llama_context: constructing llama_context
-llama_context: non-unified KV cache requires ggml_set_rows() - forcing unified KV cache
-llama_context: n_seq_max     = 1
-llama_context: n_ctx         = 4096
-llama_context: n_ctx_per_seq = 4096
-llama_context: n_batch       = 2048
-llama_context: n_ubatch      = 512
-llama_context: causal_attn   = 1
-llama_context: flash_attn    = 1
-llama_context: kv_unified    = true
-llama_context: freq_base     = 500000.0
-llama_context: freq_scale    = 1
-llama_context: n_ctx_per_seq (4096) < n_ctx_train (10485760) -- the full capacity of the model will not be utilized
-llama_context:  ROCm_Host  output buffer size =     0.77 MiB
-llama_kv_cache_unified_iswa: creating non-SWA KV cache, size = 4096 cells
-llama_kv_cache_unified:      ROCm0 KV buffer size =   192.00 MiB
-llama_kv_cache_unified: size =  192.00 MiB (  4096 cells,  12 layers,  1/ 1 seqs), K (f16):   96.00 MiB, V (f16):   96.00 MiB
-llama_kv_cache_unified: LLAMA_SET_ROWS=0, using old ggml_cpy() method for backwards compatibility
-llama_kv_cache_unified_iswa: creating     SWA KV cache, size = 4096 cells
-llama_kv_cache_unified:      ROCm0 KV buffer size =   576.00 MiB
-llama_kv_cache_unified: size =  576.00 MiB (  4096 cells,  36 layers,  1/ 1 seqs), K (f16):  288.00 MiB, V (f16):  288.00 MiB
-llama_kv_cache_unified: LLAMA_SET_ROWS=0, using old ggml_cpy() method for backwards compatibility
-llama_context:      ROCm0 compute buffer size =   434.62 MiB
-llama_context:  ROCm_Host compute buffer size =    16.01 MiB
-llama_context: graph nodes  = 2420
-llama_context: graph splits = 1
-common_init_from_params: added <|end_of_text|> logit bias = -inf
-common_init_from_params: added <|eot|> logit bias = -inf
-common_init_from_params: setting dry_penalty_last_n to ctx_size = 4096
-common_init_from_params: warming up the model with an empty run - please wait ... (--no-warmup to disable)
-main: llama threadpool init, n_threads = 16
-
-system_info: n_threads = 16 (n_threads_batch = 16) / 32 | ROCm : NO_VMM = 1 | PEER_MAX_BATCH_SIZE = 128 | CPU : SSE3 = 1 | SSSE3 = 1 | AVX = 1 | AVX2 = 1 | F16C = 1 | FMA = 1 | BMI2 = 1 | AVX512 = 1 | AVX512_VBMI = 1 | AVX512_VNNI = 1 | LLAMAFILE = 1 | OPENMP = 1 | REPACK = 1 | 
-
-sampler seed: 2885096603
-sampler params: 
-	repeat_last_n = 64, repeat_penalty = 1.000, frequency_penalty = 0.000, presence_penalty = 0.000
-	dry_multiplier = 0.000, dry_base = 1.750, dry_allowed_length = 2, dry_penalty_last_n = 4096
-	top_k = 40, top_p = 0.950, min_p = 0.050, xtc_probability = 0.000, xtc_threshold = 0.100, typical_p = 1.000, top_n_sigma = -1.000, temp = 0.800
-	mirostat = 0, mirostat_lr = 0.100, mirostat_ent = 5.000
-sampler chain: logits -> logit-bias -> penalties -> dry -> top-n-sigma -> top-k -> typical -> top-p -> min-p -> xtc -> temp-ext -> dist 
-generate: n_ctx = 4096, n_batch = 2048, n_predict = 1, n_keep = 1
-
-Hello.
-
-llama_perf_sampler_print:    sampling time =       0.06 ms /     3 runs   (    0.02 ms per token, 46875.00 tokens per second)
-llama_perf_context_print:        load time =   36882.65 ms
-llama_perf_context_print: prompt eval time =     127.76 ms /     2 tokens (   63.88 ms per token,    15.65 tokens per second)
-llama_perf_context_print:        eval time =       0.00 ms /     1 runs   (    0.00 ms per token,      inf tokens per second)
-llama_perf_context_print:       total time =     158.41 ms /     3 tokens
-llama_perf_context_print:    graphs reused =          0
-    Elapsed #3: 41.426125320s
-    Run #3 status: 0
-  → Avg over 3 runs: 40.739s
@@ -1,179 +0,0 @@
-ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
-ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
-ggml_cuda_init: found 1 ROCm devices:
-  Device 0: AMD Radeon Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32
-build: 6040 (66625a59) with cc (GCC) 15.1.1 20250719 (Red Hat 15.1.1-5) for x86_64-redhat-linux
-main: llama backend init
-main: load the model and apply lora adapter, if any
-llama_model_load_from_file_impl: using device ROCm0 (AMD Radeon Graphics) - 124523 MiB free
-llama_model_loader: additional 2 GGUFs metadata loaded.
-llama_model_loader: loaded meta data with 51 key-value pairs and 628 tensors from /home/kyuz0/models/llama-4-scout-17b-16e/Q8_0/Llama-4-Scout-17B-16E-Instruct-Q8_0-00001-of-00003.gguf (version GGUF V3 (latest))
-llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
-llama_model_loader: - kv   0:                       general.architecture str              = llama4
-llama_model_loader: - kv   1:                               general.type str              = model
-llama_model_loader: - kv   2:                               general.name str              = Llama-4-Scout-17B-16E-Instruct
-llama_model_loader: - kv   3:                           general.finetune str              = 16E-Instruct
-llama_model_loader: - kv   4:                           general.basename str              = Llama-4-Scout-17B-16E-Instruct
-llama_model_loader: - kv   5:                       general.quantized_by str              = Unsloth
-llama_model_loader: - kv   6:                         general.size_label str              = 17B
-llama_model_loader: - kv   7:                            general.license str              = other
-llama_model_loader: - kv   8:                       general.license.name str              = llama4
-llama_model_loader: - kv   9:                           general.repo_url str              = https://huggingface.co/unsloth
-llama_model_loader: - kv  10:                   general.base_model.count u32              = 1
-llama_model_loader: - kv  11:                  general.base_model.0.name str              = Llama 4 Scout 17B 16E Instruct
-llama_model_loader: - kv  12:          general.base_model.0.organization str              = Meta Llama
-llama_model_loader: - kv  13:              general.base_model.0.repo_url str              = https://huggingface.co/meta-llama/Lla...
-llama_model_loader: - kv  14:                               general.tags arr[str,5]       = ["facebook", "meta", "pytorch", "llam...
-llama_model_loader: - kv  15:                          general.languages arr[str,12]      = ["ar", "de", "en", "es", "fr", "hi", ...
-llama_model_loader: - kv  16:                         llama4.block_count u32              = 48
-llama_model_loader: - kv  17:                      llama4.context_length u32              = 10485760
-llama_model_loader: - kv  18:                    llama4.embedding_length u32              = 5120
-llama_model_loader: - kv  19:                 llama4.feed_forward_length u32              = 16384
-llama_model_loader: - kv  20:                llama4.attention.head_count u32              = 40
-llama_model_loader: - kv  21:             llama4.attention.head_count_kv u32              = 8
-llama_model_loader: - kv  22:                      llama4.rope.freq_base f32              = 500000.000000
-llama_model_loader: - kv  23:    llama4.attention.layer_norm_rms_epsilon f32              = 0.000010
-llama_model_loader: - kv  24:                        llama4.expert_count u32              = 16
-llama_model_loader: - kv  25:                   llama4.expert_used_count u32              = 1
-llama_model_loader: - kv  26:                llama4.attention.key_length u32              = 128
-llama_model_loader: - kv  27:              llama4.attention.value_length u32              = 128
-llama_model_loader: - kv  28:                          llama4.vocab_size u32              = 202048
-llama_model_loader: - kv  29:                llama4.rope.dimension_count u32              = 128
-llama_model_loader: - kv  30:           llama4.interleave_moe_layer_step u32              = 1
-llama_model_loader: - kv  31:          llama4.expert_feed_forward_length u32              = 8192
-llama_model_loader: - kv  32:                       tokenizer.ggml.model str              = gpt2
-llama_model_loader: - kv  33:                         tokenizer.ggml.pre str              = llama4
-llama_model_loader: - kv  34:                      tokenizer.ggml.tokens arr[str,202048]  = ["À", "Á", "õ", "ö", "÷", "ø", ...
-llama_model_loader: - kv  35:                  tokenizer.ggml.token_type arr[i32,202048]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
-llama_model_loader: - kv  36:                      tokenizer.ggml.merges arr[str,439802]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
-llama_model_loader: - kv  37:                tokenizer.ggml.bos_token_id u32              = 200000
-llama_model_loader: - kv  38:                tokenizer.ggml.eos_token_id u32              = 200008
-llama_model_loader: - kv  39:            tokenizer.ggml.padding_token_id u32              = 200018
-llama_model_loader: - kv  40:               tokenizer.ggml.add_bos_token bool             = true
-llama_model_loader: - kv  41:                    tokenizer.chat_template str              = {{- bos_token }}\n{%- if custom_tools ...
-llama_model_loader: - kv  42:               general.quantization_version u32              = 2
-llama_model_loader: - kv  43:                          general.file_type u32              = 7
-llama_model_loader: - kv  44:                      quantize.imatrix.file str              = Llama-4-Scout-17B-16E-Instruct-GGUF/i...
-llama_model_loader: - kv  45:                   quantize.imatrix.dataset str              = unsloth_calibration_Llama-4-Scout-17B...
-llama_model_loader: - kv  46:             quantize.imatrix.entries_count u32              = 528
-llama_model_loader: - kv  47:              quantize.imatrix.chunks_count u32              = 729
-llama_model_loader: - kv  48:                                   split.no u16              = 0
-llama_model_loader: - kv  49:                        split.tensors.count i32              = 628
-llama_model_loader: - kv  50:                                split.count u16              = 3
-llama_model_loader: - type  f32:  146 tensors
-llama_model_loader: - type q8_0:  482 tensors
-print_info: file format = GGUF V3 (latest)
-print_info: file type   = Q8_0
-print_info: file size   = 106.65 GiB (8.50 BPW) 
-load: special_eos_id is not in special_eog_ids - the tokenizer config may be incorrect
-load: special tokens cache size = 1135
-load: token to piece cache size = 1.3873 MB
-print_info: arch             = llama4
-print_info: vocab_only       = 0
-print_info: n_ctx_train      = 10485760
-print_info: n_embd           = 5120
-print_info: n_layer          = 48
-print_info: n_head           = 40
-print_info: n_head_kv        = 8
-print_info: n_rot            = 128
-print_info: n_swa            = 8192
-print_info: is_swa_any       = 1
-print_info: n_embd_head_k    = 128
-print_info: n_embd_head_v    = 128
-print_info: n_gqa            = 5
-print_info: n_embd_k_gqa     = 1024
-print_info: n_embd_v_gqa     = 1024
-print_info: f_norm_eps       = 0.0e+00
-print_info: f_norm_rms_eps   = 1.0e-05
-print_info: f_clamp_kqv      = 0.0e+00
-print_info: f_max_alibi_bias = 0.0e+00
-print_info: f_logit_scale    = 0.0e+00
-print_info: f_attn_scale     = 0.0e+00
-print_info: n_ff             = 16384
-print_info: n_expert         = 16
-print_info: n_expert_used    = 1
-print_info: causal attn      = 1
-print_info: pooling type     = 0
-print_info: rope type        = 0
-print_info: rope scaling     = linear
-print_info: freq_base_train  = 500000.0
-print_info: freq_scale_train = 1
-print_info: n_ctx_orig_yarn  = 10485760
-print_info: rope_finetuned   = unknown
-print_info: model type       = 17Bx16E (Scout)
-print_info: model params     = 107.77 B
-print_info: general.name     = Llama-4-Scout-17B-16E-Instruct
-print_info: vocab type       = BPE
-print_info: n_vocab          = 202048
-print_info: n_merges         = 439802
-print_info: BOS token        = 200000 '<|begin_of_text|>'
-print_info: EOS token        = 200008 '<|eot|>'
-print_info: PAD token        = 200018 '<|finetune_right_pad|>'
-print_info: LF token         = 198 'Ċ'
-print_info: FIM PRE token    = 200002 '<|fim_prefix|>'
-print_info: FIM SUF token    = 200004 '<|fim_suffix|>'
-print_info: FIM MID token    = 200003 '<|fim_middle|>'
-print_info: EOG token        = 200001 '<|end_of_text|>'
-print_info: EOG token        = 200008 '<|eot|>'
-print_info: max token length = 192
-load_tensors: loading model tensors, this can take a while... (mmap = false)
-load_tensors: offloading 48 repeating layers to GPU
-load_tensors: offloading output layer to GPU
-load_tensors: offloaded 49/49 layers to GPU
-load_tensors:        ROCm0 model buffer size = 108165.12 MiB
-load_tensors:    ROCm_Host model buffer size =  1048.22 MiB
-....................................................................................................
-llama_context: constructing llama_context
-llama_context: non-unified KV cache requires ggml_set_rows() - forcing unified KV cache
-llama_context: n_seq_max     = 1
-llama_context: n_ctx         = 4096
-llama_context: n_ctx_per_seq = 4096
-llama_context: n_batch       = 2048
-llama_context: n_ubatch      = 512
-llama_context: causal_attn   = 1
-llama_context: flash_attn    = 1
-llama_context: kv_unified    = true
-llama_context: freq_base     = 500000.0
-llama_context: freq_scale    = 1
-llama_context: n_ctx_per_seq (4096) < n_ctx_train (10485760) -- the full capacity of the model will not be utilized
-llama_context:  ROCm_Host  output buffer size =     0.77 MiB
-llama_kv_cache_unified_iswa: creating non-SWA KV cache, size = 4096 cells
-llama_kv_cache_unified:      ROCm0 KV buffer size =   192.00 MiB
-llama_kv_cache_unified: size =  192.00 MiB (  4096 cells,  12 layers,  1/ 1 seqs), K (f16):   96.00 MiB, V (f16):   96.00 MiB
-llama_kv_cache_unified: LLAMA_SET_ROWS=0, using old ggml_cpy() method for backwards compatibility
-llama_kv_cache_unified_iswa: creating     SWA KV cache, size = 4096 cells
-llama_kv_cache_unified:      ROCm0 KV buffer size =   576.00 MiB
-llama_kv_cache_unified: size =  576.00 MiB (  4096 cells,  36 layers,  1/ 1 seqs), K (f16):  288.00 MiB, V (f16):  288.00 MiB
-llama_kv_cache_unified: LLAMA_SET_ROWS=0, using old ggml_cpy() method for backwards compatibility
-llama_context:      ROCm0 compute buffer size =   434.62 MiB
-llama_context:  ROCm_Host compute buffer size =    16.01 MiB
-llama_context: graph nodes  = 2420
-llama_context: graph splits = 1
-common_init_from_params: added <|end_of_text|> logit bias = -inf
-common_init_from_params: added <|eot|> logit bias = -inf
-common_init_from_params: setting dry_penalty_last_n to ctx_size = 4096
-common_init_from_params: warming up the model with an empty run - please wait ... (--no-warmup to disable)
-main: llama threadpool init, n_threads = 16
-
-system_info: n_threads = 16 (n_threads_batch = 16) / 32 | ROCm : NO_VMM = 1 | PEER_MAX_BATCH_SIZE = 128 | CPU : SSE3 = 1 | SSSE3 = 1 | AVX = 1 | AVX2 = 1 | F16C = 1 | FMA = 1 | BMI2 = 1 | AVX512 = 1 | AVX512_VBMI = 1 | AVX512_VNNI = 1 | LLAMAFILE = 1 | OPENMP = 1 | REPACK = 1 | 
-
-sampler seed: 1149431120
-sampler params: 
-	repeat_last_n = 64, repeat_penalty = 1.000, frequency_penalty = 0.000, presence_penalty = 0.000
-	dry_multiplier = 0.000, dry_base = 1.750, dry_allowed_length = 2, dry_penalty_last_n = 4096
-	top_k = 40, top_p = 0.950, min_p = 0.050, xtc_probability = 0.000, xtc_threshold = 0.100, typical_p = 1.000, top_n_sigma = -1.000, temp = 0.800
-	mirostat = 0, mirostat_lr = 0.100, mirostat_ent = 5.000
-sampler chain: logits -> logit-bias -> penalties -> dry -> top-n-sigma -> top-k -> typical -> top-p -> min-p -> xtc -> temp-ext -> dist 
-generate: n_ctx = 4096, n_batch = 2048, n_predict = 1, n_keep = 1
-
-Hello:
-
-llama_perf_sampler_print:    sampling time =       0.06 ms /     3 runs   (    0.02 ms per token, 48387.10 tokens per second)
-llama_perf_context_print:        load time =   35959.68 ms
-llama_perf_context_print: prompt eval time =     127.62 ms /     2 tokens (   63.81 ms per token,    15.67 tokens per second)
-llama_perf_context_print:        eval time =       0.00 ms /     1 runs   (    0.00 ms per token,      inf tokens per second)
-llama_perf_context_print:       total time =     157.80 ms /     3 tokens
-llama_perf_context_print:    graphs reused =          0
-    Elapsed #3: 36.919182117s
-    Run #3 status: 0
-  → Avg over 3 runs: 36.400s
@@ -1,179 +0,0 @@
-ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
-ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
-ggml_cuda_init: found 1 ROCm devices:
-  Device 0: AMD Radeon Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32
-build: 6066 (4cb208c9) with cc (GCC) 15.1.1 20250719 (Red Hat 15.1.1-5) for x86_64-redhat-linux
-main: llama backend init
-main: load the model and apply lora adapter, if any
-llama_model_load_from_file_impl: using device ROCm0 (AMD Radeon Graphics) - 124523 MiB free
-llama_model_loader: additional 2 GGUFs metadata loaded.
-llama_model_loader: loaded meta data with 51 key-value pairs and 628 tensors from /home/kyuz0/models/llama-4-scout-17b-16e/Q8_0/Llama-4-Scout-17B-16E-Instruct-Q8_0-00001-of-00003.gguf (version GGUF V3 (latest))
-llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
-llama_model_loader: - kv   0:                       general.architecture str              = llama4
-llama_model_loader: - kv   1:                               general.type str              = model
-llama_model_loader: - kv   2:                               general.name str              = Llama-4-Scout-17B-16E-Instruct
-llama_model_loader: - kv   3:                           general.finetune str              = 16E-Instruct
-llama_model_loader: - kv   4:                           general.basename str              = Llama-4-Scout-17B-16E-Instruct
-llama_model_loader: - kv   5:                       general.quantized_by str              = Unsloth
-llama_model_loader: - kv   6:                         general.size_label str              = 17B
-llama_model_loader: - kv   7:                            general.license str              = other
-llama_model_loader: - kv   8:                       general.license.name str              = llama4
-llama_model_loader: - kv   9:                           general.repo_url str              = https://huggingface.co/unsloth
-llama_model_loader: - kv  10:                   general.base_model.count u32              = 1
-llama_model_loader: - kv  11:                  general.base_model.0.name str              = Llama 4 Scout 17B 16E Instruct
-llama_model_loader: - kv  12:          general.base_model.0.organization str              = Meta Llama
-llama_model_loader: - kv  13:              general.base_model.0.repo_url str              = https://huggingface.co/meta-llama/Lla...
-llama_model_loader: - kv  14:                               general.tags arr[str,5]       = ["facebook", "meta", "pytorch", "llam...
-llama_model_loader: - kv  15:                          general.languages arr[str,12]      = ["ar", "de", "en", "es", "fr", "hi", ...
-llama_model_loader: - kv  16:                         llama4.block_count u32              = 48
-llama_model_loader: - kv  17:                      llama4.context_length u32              = 10485760
-llama_model_loader: - kv  18:                    llama4.embedding_length u32              = 5120
-llama_model_loader: - kv  19:                 llama4.feed_forward_length u32              = 16384
-llama_model_loader: - kv  20:                llama4.attention.head_count u32              = 40
-llama_model_loader: - kv  21:             llama4.attention.head_count_kv u32              = 8
-llama_model_loader: - kv  22:                      llama4.rope.freq_base f32              = 500000.000000
-llama_model_loader: - kv  23:    llama4.attention.layer_norm_rms_epsilon f32              = 0.000010
-llama_model_loader: - kv  24:                        llama4.expert_count u32              = 16
-llama_model_loader: - kv  25:                   llama4.expert_used_count u32              = 1
-llama_model_loader: - kv  26:                llama4.attention.key_length u32              = 128
-llama_model_loader: - kv  27:              llama4.attention.value_length u32              = 128
-llama_model_loader: - kv  28:                          llama4.vocab_size u32              = 202048
-llama_model_loader: - kv  29:                llama4.rope.dimension_count u32              = 128
-llama_model_loader: - kv  30:           llama4.interleave_moe_layer_step u32              = 1
-llama_model_loader: - kv  31:          llama4.expert_feed_forward_length u32              = 8192
-llama_model_loader: - kv  32:                       tokenizer.ggml.model str              = gpt2
-llama_model_loader: - kv  33:                         tokenizer.ggml.pre str              = llama4
-llama_model_loader: - kv  34:                      tokenizer.ggml.tokens arr[str,202048]  = ["À", "Á", "õ", "ö", "÷", "ø", ...
-llama_model_loader: - kv  35:                  tokenizer.ggml.token_type arr[i32,202048]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
-llama_model_loader: - kv  36:                      tokenizer.ggml.merges arr[str,439802]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
-llama_model_loader: - kv  37:                tokenizer.ggml.bos_token_id u32              = 200000
-llama_model_loader: - kv  38:                tokenizer.ggml.eos_token_id u32              = 200008
-llama_model_loader: - kv  39:            tokenizer.ggml.padding_token_id u32              = 200018
-llama_model_loader: - kv  40:               tokenizer.ggml.add_bos_token bool             = true
-llama_model_loader: - kv  41:                    tokenizer.chat_template str              = {{- bos_token }}\n{%- if custom_tools ...
-llama_model_loader: - kv  42:               general.quantization_version u32              = 2
-llama_model_loader: - kv  43:                          general.file_type u32              = 7
-llama_model_loader: - kv  44:                      quantize.imatrix.file str              = Llama-4-Scout-17B-16E-Instruct-GGUF/i...
-llama_model_loader: - kv  45:                   quantize.imatrix.dataset str              = unsloth_calibration_Llama-4-Scout-17B...
-llama_model_loader: - kv  46:             quantize.imatrix.entries_count u32              = 528
-llama_model_loader: - kv  47:              quantize.imatrix.chunks_count u32              = 729
-llama_model_loader: - kv  48:                                   split.no u16              = 0
-llama_model_loader: - kv  49:                        split.tensors.count i32              = 628
-llama_model_loader: - kv  50:                                split.count u16              = 3
-llama_model_loader: - type  f32:  146 tensors
-llama_model_loader: - type q8_0:  482 tensors
-print_info: file format = GGUF V3 (latest)
-print_info: file type   = Q8_0
-print_info: file size   = 106.65 GiB (8.50 BPW) 
-load: special_eos_id is not in special_eog_ids - the tokenizer config may be incorrect
-load: special tokens cache size = 1135
-load: token to piece cache size = 1.3873 MB
-print_info: arch             = llama4
-print_info: vocab_only       = 0
-print_info: n_ctx_train      = 10485760
-print_info: n_embd           = 5120
-print_info: n_layer          = 48
-print_info: n_head           = 40
-print_info: n_head_kv        = 8
-print_info: n_rot            = 128
-print_info: n_swa            = 8192
-print_info: is_swa_any       = 1
-print_info: n_embd_head_k    = 128
-print_info: n_embd_head_v    = 128
-print_info: n_gqa            = 5
-print_info: n_embd_k_gqa     = 1024
-print_info: n_embd_v_gqa     = 1024
-print_info: f_norm_eps       = 0.0e+00
-print_info: f_norm_rms_eps   = 1.0e-05
-print_info: f_clamp_kqv      = 0.0e+00
-print_info: f_max_alibi_bias = 0.0e+00
-print_info: f_logit_scale    = 0.0e+00
-print_info: f_attn_scale     = 0.0e+00
-print_info: n_ff             = 16384
-print_info: n_expert         = 16
-print_info: n_expert_used    = 1
-print_info: causal attn      = 1
-print_info: pooling type     = 0
-print_info: rope type        = 0
-print_info: rope scaling     = linear
-print_info: freq_base_train  = 500000.0
-print_info: freq_scale_train = 1
-print_info: n_ctx_orig_yarn  = 10485760
-print_info: rope_finetuned   = unknown
-print_info: model type       = 17Bx16E (Scout)
-print_info: model params     = 107.77 B
-print_info: general.name     = Llama-4-Scout-17B-16E-Instruct
-print_info: vocab type       = BPE
-print_info: n_vocab          = 202048
-print_info: n_merges         = 439802
-print_info: BOS token        = 200000 '<|begin_of_text|>'
-print_info: EOS token        = 200008 '<|eot|>'
-print_info: PAD token        = 200018 '<|finetune_right_pad|>'
-print_info: LF token         = 198 'Ċ'
-print_info: FIM PRE token    = 200002 '<|fim_prefix|>'
-print_info: FIM SUF token    = 200004 '<|fim_suffix|>'
-print_info: FIM MID token    = 200003 '<|fim_middle|>'
-print_info: EOG token        = 200001 '<|end_of_text|>'
-print_info: EOG token        = 200008 '<|eot|>'
-print_info: max token length = 192
-load_tensors: loading model tensors, this can take a while... (mmap = false)
-load_tensors: offloading 48 repeating layers to GPU
-load_tensors: offloading output layer to GPU
-load_tensors: offloaded 49/49 layers to GPU
-load_tensors:        ROCm0 model buffer size = 108165.12 MiB
-load_tensors:    ROCm_Host model buffer size =  1048.22 MiB
-....................................................................................................
-llama_context: constructing llama_context
-llama_context: non-unified KV cache requires ggml_set_rows() - forcing unified KV cache
-llama_context: n_seq_max     = 1
-llama_context: n_ctx         = 4096
-llama_context: n_ctx_per_seq = 4096
-llama_context: n_batch       = 2048
-llama_context: n_ubatch      = 512
-llama_context: causal_attn   = 1
-llama_context: flash_attn    = 1
-llama_context: kv_unified    = true
-llama_context: freq_base     = 500000.0
-llama_context: freq_scale    = 1
-llama_context: n_ctx_per_seq (4096) < n_ctx_train (10485760) -- the full capacity of the model will not be utilized
-llama_context:  ROCm_Host  output buffer size =     0.77 MiB
-llama_kv_cache_unified_iswa: creating non-SWA KV cache, size = 4096 cells
-llama_kv_cache_unified:      ROCm0 KV buffer size =   192.00 MiB
-llama_kv_cache_unified: size =  192.00 MiB (  4096 cells,  12 layers,  1/ 1 seqs), K (f16):   96.00 MiB, V (f16):   96.00 MiB
-llama_kv_cache_unified: LLAMA_SET_ROWS=0, using old ggml_cpy() method for backwards compatibility
-llama_kv_cache_unified_iswa: creating     SWA KV cache, size = 4096 cells
-llama_kv_cache_unified:      ROCm0 KV buffer size =   576.00 MiB
-llama_kv_cache_unified: size =  576.00 MiB (  4096 cells,  36 layers,  1/ 1 seqs), K (f16):  288.00 MiB, V (f16):  288.00 MiB
-llama_kv_cache_unified: LLAMA_SET_ROWS=0, using old ggml_cpy() method for backwards compatibility
-llama_context:      ROCm0 compute buffer size =   434.62 MiB
-llama_context:  ROCm_Host compute buffer size =    16.01 MiB
-llama_context: graph nodes  = 2420
-llama_context: graph splits = 1
-common_init_from_params: added <|end_of_text|> logit bias = -inf
-common_init_from_params: added <|eot|> logit bias = -inf
-common_init_from_params: setting dry_penalty_last_n to ctx_size = 4096
-common_init_from_params: warming up the model with an empty run - please wait ... (--no-warmup to disable)
-main: llama threadpool init, n_threads = 16
-
-system_info: n_threads = 16 (n_threads_batch = 16) / 32 | ROCm : NO_VMM = 1 | PEER_MAX_BATCH_SIZE = 128 | CPU : SSE3 = 1 | SSSE3 = 1 | AVX = 1 | AVX2 = 1 | F16C = 1 | FMA = 1 | BMI2 = 1 | AVX512 = 1 | AVX512_VBMI = 1 | AVX512_VNNI = 1 | LLAMAFILE = 1 | OPENMP = 1 | REPACK = 1 | 
-
-sampler seed: 406280533
-sampler params: 
-	repeat_last_n = 64, repeat_penalty = 1.000, frequency_penalty = 0.000, presence_penalty = 0.000
-	dry_multiplier = 0.000, dry_base = 1.750, dry_allowed_length = 2, dry_penalty_last_n = 4096
-	top_k = 40, top_p = 0.950, min_p = 0.050, xtc_probability = 0.000, xtc_threshold = 0.100, typical_p = 1.000, top_n_sigma = -1.000, temp = 0.800
-	mirostat = 0, mirostat_lr = 0.100, mirostat_ent = 5.000
-sampler chain: logits -> logit-bias -> penalties -> dry -> top-n-sigma -> top-k -> typical -> top-p -> min-p -> xtc -> temp-ext -> dist 
-generate: n_ctx = 4096, n_batch = 2048, n_predict = 1, n_keep = 1
-
-Hello The
-
-llama_perf_sampler_print:    sampling time =       0.07 ms /     3 runs   (    0.02 ms per token, 45454.55 tokens per second)
-llama_perf_context_print:        load time =   34222.03 ms
-llama_perf_context_print: prompt eval time =     136.79 ms /     2 tokens (   68.40 ms per token,    14.62 tokens per second)
-llama_perf_context_print:        eval time =       0.00 ms /     1 runs   (    0.00 ms per token,      inf tokens per second)
-llama_perf_context_print:       total time =     156.58 ms /     3 tokens
-llama_perf_context_print:    graphs reused =          0
-    Elapsed #3: 35.217307205s
-    Run #3 status: 0
-  → Avg over 3 runs: 35.742s
@@ -1,177 +0,0 @@
-ggml_vulkan: Found 1 Vulkan devices:
-ggml_vulkan: 0 = Radeon 8060S Graphics (AMD open-source driver) | uma: 1 | fp16: 1 | bf16: 0 | warp size: 64 | shared memory: 32768 | int dot: 1 | matrix cores: KHR_coopmat
-build: 6060 (9c35706b) with cc (GCC) 15.1.1 20250719 (Red Hat 15.1.1-5) for x86_64-redhat-linux
-main: llama backend init
-main: load the model and apply lora adapter, if any
-llama_model_load_from_file_impl: using device Vulkan0 (Radeon 8060S Graphics) - 85720 MiB free
-llama_model_loader: additional 2 GGUFs metadata loaded.
-llama_model_loader: loaded meta data with 51 key-value pairs and 628 tensors from /home/kyuz0/models/llama-4-scout-17b-16e/Q8_0/Llama-4-Scout-17B-16E-Instruct-Q8_0-00001-of-00003.gguf (version GGUF V3 (latest))
-llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
-llama_model_loader: - kv   0:                       general.architecture str              = llama4
-llama_model_loader: - kv   1:                               general.type str              = model
-llama_model_loader: - kv   2:                               general.name str              = Llama-4-Scout-17B-16E-Instruct
-llama_model_loader: - kv   3:                           general.finetune str              = 16E-Instruct
-llama_model_loader: - kv   4:                           general.basename str              = Llama-4-Scout-17B-16E-Instruct
-llama_model_loader: - kv   5:                       general.quantized_by str              = Unsloth
-llama_model_loader: - kv   6:                         general.size_label str              = 17B
-llama_model_loader: - kv   7:                            general.license str              = other
-llama_model_loader: - kv   8:                       general.license.name str              = llama4
-llama_model_loader: - kv   9:                           general.repo_url str              = https://huggingface.co/unsloth
-llama_model_loader: - kv  10:                   general.base_model.count u32              = 1
-llama_model_loader: - kv  11:                  general.base_model.0.name str              = Llama 4 Scout 17B 16E Instruct
-llama_model_loader: - kv  12:          general.base_model.0.organization str              = Meta Llama
-llama_model_loader: - kv  13:              general.base_model.0.repo_url str              = https://huggingface.co/meta-llama/Lla...
-llama_model_loader: - kv  14:                               general.tags arr[str,5]       = ["facebook", "meta", "pytorch", "llam...
-llama_model_loader: - kv  15:                          general.languages arr[str,12]      = ["ar", "de", "en", "es", "fr", "hi", ...
-llama_model_loader: - kv  16:                         llama4.block_count u32              = 48
-llama_model_loader: - kv  17:                      llama4.context_length u32              = 10485760
-llama_model_loader: - kv  18:                    llama4.embedding_length u32              = 5120
-llama_model_loader: - kv  19:                 llama4.feed_forward_length u32              = 16384
-llama_model_loader: - kv  20:                llama4.attention.head_count u32              = 40
-llama_model_loader: - kv  21:             llama4.attention.head_count_kv u32              = 8
-llama_model_loader: - kv  22:                      llama4.rope.freq_base f32              = 500000.000000
-llama_model_loader: - kv  23:    llama4.attention.layer_norm_rms_epsilon f32              = 0.000010
-llama_model_loader: - kv  24:                        llama4.expert_count u32              = 16
-llama_model_loader: - kv  25:                   llama4.expert_used_count u32              = 1
-llama_model_loader: - kv  26:                llama4.attention.key_length u32              = 128
-llama_model_loader: - kv  27:              llama4.attention.value_length u32              = 128
-llama_model_loader: - kv  28:                          llama4.vocab_size u32              = 202048
-llama_model_loader: - kv  29:                llama4.rope.dimension_count u32              = 128
-llama_model_loader: - kv  30:           llama4.interleave_moe_layer_step u32              = 1
-llama_model_loader: - kv  31:          llama4.expert_feed_forward_length u32              = 8192
-llama_model_loader: - kv  32:                       tokenizer.ggml.model str              = gpt2
-llama_model_loader: - kv  33:                         tokenizer.ggml.pre str              = llama4
-llama_model_loader: - kv  34:                      tokenizer.ggml.tokens arr[str,202048]  = ["À", "Á", "õ", "ö", "÷", "ø", ...
-llama_model_loader: - kv  35:                  tokenizer.ggml.token_type arr[i32,202048]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
-llama_model_loader: - kv  36:                      tokenizer.ggml.merges arr[str,439802]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
-llama_model_loader: - kv  37:                tokenizer.ggml.bos_token_id u32              = 200000
-llama_model_loader: - kv  38:                tokenizer.ggml.eos_token_id u32              = 200008
-llama_model_loader: - kv  39:            tokenizer.ggml.padding_token_id u32              = 200018
-llama_model_loader: - kv  40:               tokenizer.ggml.add_bos_token bool             = true
-llama_model_loader: - kv  41:                    tokenizer.chat_template str              = {{- bos_token }}\n{%- if custom_tools ...
-llama_model_loader: - kv  42:               general.quantization_version u32              = 2
-llama_model_loader: - kv  43:                          general.file_type u32              = 7
-llama_model_loader: - kv  44:                      quantize.imatrix.file str              = Llama-4-Scout-17B-16E-Instruct-GGUF/i...
-llama_model_loader: - kv  45:                   quantize.imatrix.dataset str              = unsloth_calibration_Llama-4-Scout-17B...
-llama_model_loader: - kv  46:             quantize.imatrix.entries_count u32              = 528
-llama_model_loader: - kv  47:              quantize.imatrix.chunks_count u32              = 729
-llama_model_loader: - kv  48:                                   split.no u16              = 0
-llama_model_loader: - kv  49:                        split.tensors.count i32              = 628
-llama_model_loader: - kv  50:                                split.count u16              = 3
-llama_model_loader: - type  f32:  146 tensors
-llama_model_loader: - type q8_0:  482 tensors
-print_info: file format = GGUF V3 (latest)
-print_info: file type   = Q8_0
-print_info: file size   = 106.65 GiB (8.50 BPW) 
-load: special_eos_id is not in special_eog_ids - the tokenizer config may be incorrect
-load: special tokens cache size = 1135
-load: token to piece cache size = 1.3873 MB
-print_info: arch             = llama4
-print_info: vocab_only       = 0
-print_info: n_ctx_train      = 10485760
-print_info: n_embd           = 5120
-print_info: n_layer          = 48
-print_info: n_head           = 40
-print_info: n_head_kv        = 8
-print_info: n_rot            = 128
-print_info: n_swa            = 8192
-print_info: is_swa_any       = 1
-print_info: n_embd_head_k    = 128
-print_info: n_embd_head_v    = 128
-print_info: n_gqa            = 5
-print_info: n_embd_k_gqa     = 1024
-print_info: n_embd_v_gqa     = 1024
-print_info: f_norm_eps       = 0.0e+00
-print_info: f_norm_rms_eps   = 1.0e-05
-print_info: f_clamp_kqv      = 0.0e+00
-print_info: f_max_alibi_bias = 0.0e+00
-print_info: f_logit_scale    = 0.0e+00
-print_info: f_attn_scale     = 0.0e+00
-print_info: n_ff             = 16384
-print_info: n_expert         = 16
-print_info: n_expert_used    = 1
-print_info: causal attn      = 1
-print_info: pooling type     = 0
-print_info: rope type        = 0
-print_info: rope scaling     = linear
-print_info: freq_base_train  = 500000.0
-print_info: freq_scale_train = 1
-print_info: n_ctx_orig_yarn  = 10485760
-print_info: rope_finetuned   = unknown
-print_info: model type       = 17Bx16E (Scout)
-print_info: model params     = 107.77 B
-print_info: general.name     = Llama-4-Scout-17B-16E-Instruct
-print_info: vocab type       = BPE
-print_info: n_vocab          = 202048
-print_info: n_merges         = 439802
-print_info: BOS token        = 200000 '<|begin_of_text|>'
-print_info: EOS token        = 200008 '<|eot|>'
-print_info: PAD token        = 200018 '<|finetune_right_pad|>'
-print_info: LF token         = 198 'Ċ'
-print_info: FIM PRE token    = 200002 '<|fim_prefix|>'
-print_info: FIM SUF token    = 200004 '<|fim_suffix|>'
-print_info: FIM MID token    = 200003 '<|fim_middle|>'
-print_info: EOG token        = 200001 '<|end_of_text|>'
-print_info: EOG token        = 200008 '<|eot|>'
-print_info: max token length = 192
-load_tensors: loading model tensors, this can take a while... (mmap = false)
-load_tensors: offloading 48 repeating layers to GPU
-load_tensors: offloading output layer to GPU
-load_tensors: offloaded 49/49 layers to GPU
-load_tensors:      Vulkan0 model buffer size = 108165.12 MiB
-load_tensors:  Vulkan_Host model buffer size =  1048.22 MiB
-....................................................................................................
-llama_context: constructing llama_context
-llama_context: non-unified KV cache requires ggml_set_rows() - forcing unified KV cache
-llama_context: n_seq_max     = 1
-llama_context: n_ctx         = 4096
-llama_context: n_ctx_per_seq = 4096
-llama_context: n_batch       = 2048
-llama_context: n_ubatch      = 512
-llama_context: causal_attn   = 1
-llama_context: flash_attn    = 1
-llama_context: kv_unified    = true
-llama_context: freq_base     = 500000.0
-llama_context: freq_scale    = 1
-llama_context: n_ctx_per_seq (4096) < n_ctx_train (10485760) -- the full capacity of the model will not be utilized
-llama_context: Vulkan_Host  output buffer size =     0.77 MiB
-llama_kv_cache_unified_iswa: creating non-SWA KV cache, size = 4096 cells
-llama_kv_cache_unified:    Vulkan0 KV buffer size =   192.00 MiB
-llama_kv_cache_unified: size =  192.00 MiB (  4096 cells,  12 layers,  1/ 1 seqs), K (f16):   96.00 MiB, V (f16):   96.00 MiB
-llama_kv_cache_unified: LLAMA_SET_ROWS=0, using old ggml_cpy() method for backwards compatibility
-llama_kv_cache_unified_iswa: creating     SWA KV cache, size = 4096 cells
-llama_kv_cache_unified:    Vulkan0 KV buffer size =   576.00 MiB
-llama_kv_cache_unified: size =  576.00 MiB (  4096 cells,  36 layers,  1/ 1 seqs), K (f16):  288.00 MiB, V (f16):  288.00 MiB
-llama_kv_cache_unified: LLAMA_SET_ROWS=0, using old ggml_cpy() method for backwards compatibility
-llama_context:    Vulkan0 compute buffer size =   440.63 MiB
-llama_context: Vulkan_Host compute buffer size =    26.01 MiB
-llama_context: graph nodes  = 2420
-llama_context: graph splits = 2
-common_init_from_params: added <|end_of_text|> logit bias = -inf
-common_init_from_params: added <|eot|> logit bias = -inf
-common_init_from_params: setting dry_penalty_last_n to ctx_size = 4096
-common_init_from_params: warming up the model with an empty run - please wait ... (--no-warmup to disable)
-main: llama threadpool init, n_threads = 16
-
-system_info: n_threads = 16 (n_threads_batch = 16) / 32 | CPU : SSE3 = 1 | SSSE3 = 1 | AVX = 1 | AVX2 = 1 | F16C = 1 | FMA = 1 | BMI2 = 1 | AVX512 = 1 | AVX512_VBMI = 1 | AVX512_VNNI = 1 | LLAMAFILE = 1 | OPENMP = 1 | REPACK = 1 | 
-
-sampler seed: 3690416473
-sampler params: 
-	repeat_last_n = 64, repeat_penalty = 1.000, frequency_penalty = 0.000, presence_penalty = 0.000
-	dry_multiplier = 0.000, dry_base = 1.750, dry_allowed_length = 2, dry_penalty_last_n = 4096
-	top_k = 40, top_p = 0.950, min_p = 0.050, xtc_probability = 0.000, xtc_threshold = 0.100, typical_p = 1.000, top_n_sigma = -1.000, temp = 0.800
-	mirostat = 0, mirostat_lr = 0.100, mirostat_ent = 5.000
-sampler chain: logits -> logit-bias -> penalties -> dry -> top-n-sigma -> top-k -> typical -> top-p -> min-p -> xtc -> temp-ext -> dist 
-generate: n_ctx = 4096, n_batch = 2048, n_predict = 1, n_keep = 1
-
-Hello 
-
-llama_perf_sampler_print:    sampling time =       0.09 ms /     3 runs   (    0.03 ms per token, 32967.03 tokens per second)
-llama_perf_context_print:        load time =   41237.01 ms
-llama_perf_context_print: prompt eval time =     233.96 ms /     2 tokens (  116.98 ms per token,     8.55 tokens per second)
-llama_perf_context_print:        eval time =       0.00 ms /     1 runs   (    0.00 ms per token,      inf tokens per second)
-llama_perf_context_print:       total time =     261.97 ms /     3 tokens
-llama_perf_context_print:    graphs reused =          0
-    Elapsed #3: 45.548750208s
-    Run #3 status: 0
-  → Avg over 3 runs: 47.967s
@@ -1,177 +0,0 @@
-ggml_vulkan: Found 1 Vulkan devices:
-ggml_vulkan: 0 = Radeon 8060S Graphics (RADV GFX1151) (radv) | uma: 1 | fp16: 1 | bf16: 0 | warp size: 64 | shared memory: 65536 | int dot: 1 | matrix cores: KHR_coopmat
-build: 6040 (66625a59) with cc (GCC) 15.1.1 20250719 (Red Hat 15.1.1-5) for x86_64-redhat-linux
-main: llama backend init
-main: load the model and apply lora adapter, if any
-llama_model_load_from_file_impl: using device Vulkan0 (Radeon 8060S Graphics (RADV GFX1151)) - 87722 MiB free
-llama_model_loader: additional 2 GGUFs metadata loaded.
-llama_model_loader: loaded meta data with 51 key-value pairs and 628 tensors from /home/kyuz0/models/llama-4-scout-17b-16e/Q8_0/Llama-4-Scout-17B-16E-Instruct-Q8_0-00001-of-00003.gguf (version GGUF V3 (latest))
-llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
-llama_model_loader: - kv   0:                       general.architecture str              = llama4
-llama_model_loader: - kv   1:                               general.type str              = model
-llama_model_loader: - kv   2:                               general.name str              = Llama-4-Scout-17B-16E-Instruct
-llama_model_loader: - kv   3:                           general.finetune str              = 16E-Instruct
-llama_model_loader: - kv   4:                           general.basename str              = Llama-4-Scout-17B-16E-Instruct
-llama_model_loader: - kv   5:                       general.quantized_by str              = Unsloth
-llama_model_loader: - kv   6:                         general.size_label str              = 17B
-llama_model_loader: - kv   7:                            general.license str              = other
-llama_model_loader: - kv   8:                       general.license.name str              = llama4
-llama_model_loader: - kv   9:                           general.repo_url str              = https://huggingface.co/unsloth
-llama_model_loader: - kv  10:                   general.base_model.count u32              = 1
-llama_model_loader: - kv  11:                  general.base_model.0.name str              = Llama 4 Scout 17B 16E Instruct
-llama_model_loader: - kv  12:          general.base_model.0.organization str              = Meta Llama
-llama_model_loader: - kv  13:              general.base_model.0.repo_url str              = https://huggingface.co/meta-llama/Lla...
-llama_model_loader: - kv  14:                               general.tags arr[str,5]       = ["facebook", "meta", "pytorch", "llam...
-llama_model_loader: - kv  15:                          general.languages arr[str,12]      = ["ar", "de", "en", "es", "fr", "hi", ...
-llama_model_loader: - kv  16:                         llama4.block_count u32              = 48
-llama_model_loader: - kv  17:                      llama4.context_length u32              = 10485760
-llama_model_loader: - kv  18:                    llama4.embedding_length u32              = 5120
-llama_model_loader: - kv  19:                 llama4.feed_forward_length u32              = 16384
-llama_model_loader: - kv  20:                llama4.attention.head_count u32              = 40
-llama_model_loader: - kv  21:             llama4.attention.head_count_kv u32              = 8
-llama_model_loader: - kv  22:                      llama4.rope.freq_base f32              = 500000.000000
-llama_model_loader: - kv  23:    llama4.attention.layer_norm_rms_epsilon f32              = 0.000010
-llama_model_loader: - kv  24:                        llama4.expert_count u32              = 16
-llama_model_loader: - kv  25:                   llama4.expert_used_count u32              = 1
-llama_model_loader: - kv  26:                llama4.attention.key_length u32              = 128
-llama_model_loader: - kv  27:              llama4.attention.value_length u32              = 128
-llama_model_loader: - kv  28:                          llama4.vocab_size u32              = 202048
-llama_model_loader: - kv  29:                llama4.rope.dimension_count u32              = 128
-llama_model_loader: - kv  30:           llama4.interleave_moe_layer_step u32              = 1
-llama_model_loader: - kv  31:          llama4.expert_feed_forward_length u32              = 8192
-llama_model_loader: - kv  32:                       tokenizer.ggml.model str              = gpt2
-llama_model_loader: - kv  33:                         tokenizer.ggml.pre str              = llama4
-llama_model_loader: - kv  34:                      tokenizer.ggml.tokens arr[str,202048]  = ["À", "Á", "õ", "ö", "÷", "ø", ...
-llama_model_loader: - kv  35:                  tokenizer.ggml.token_type arr[i32,202048]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
-llama_model_loader: - kv  36:                      tokenizer.ggml.merges arr[str,439802]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
-llama_model_loader: - kv  37:                tokenizer.ggml.bos_token_id u32              = 200000
-llama_model_loader: - kv  38:                tokenizer.ggml.eos_token_id u32              = 200008
-llama_model_loader: - kv  39:            tokenizer.ggml.padding_token_id u32              = 200018
-llama_model_loader: - kv  40:               tokenizer.ggml.add_bos_token bool             = true
-llama_model_loader: - kv  41:                    tokenizer.chat_template str              = {{- bos_token }}\n{%- if custom_tools ...
-llama_model_loader: - kv  42:               general.quantization_version u32              = 2
-llama_model_loader: - kv  43:                          general.file_type u32              = 7
-llama_model_loader: - kv  44:                      quantize.imatrix.file str              = Llama-4-Scout-17B-16E-Instruct-GGUF/i...
-llama_model_loader: - kv  45:                   quantize.imatrix.dataset str              = unsloth_calibration_Llama-4-Scout-17B...
-llama_model_loader: - kv  46:             quantize.imatrix.entries_count u32              = 528
-llama_model_loader: - kv  47:              quantize.imatrix.chunks_count u32              = 729
-llama_model_loader: - kv  48:                                   split.no u16              = 0
-llama_model_loader: - kv  49:                        split.tensors.count i32              = 628
-llama_model_loader: - kv  50:                                split.count u16              = 3
-llama_model_loader: - type  f32:  146 tensors
-llama_model_loader: - type q8_0:  482 tensors
-print_info: file format = GGUF V3 (latest)
-print_info: file type   = Q8_0
-print_info: file size   = 106.65 GiB (8.50 BPW) 
-load: special_eos_id is not in special_eog_ids - the tokenizer config may be incorrect
-load: special tokens cache size = 1135
-load: token to piece cache size = 1.3873 MB
-print_info: arch             = llama4
-print_info: vocab_only       = 0
-print_info: n_ctx_train      = 10485760
-print_info: n_embd           = 5120
-print_info: n_layer          = 48
-print_info: n_head           = 40
-print_info: n_head_kv        = 8
-print_info: n_rot            = 128
-print_info: n_swa            = 8192
-print_info: is_swa_any       = 1
-print_info: n_embd_head_k    = 128
-print_info: n_embd_head_v    = 128
-print_info: n_gqa            = 5
-print_info: n_embd_k_gqa     = 1024
-print_info: n_embd_v_gqa     = 1024
-print_info: f_norm_eps       = 0.0e+00
-print_info: f_norm_rms_eps   = 1.0e-05
-print_info: f_clamp_kqv      = 0.0e+00
-print_info: f_max_alibi_bias = 0.0e+00
-print_info: f_logit_scale    = 0.0e+00
-print_info: f_attn_scale     = 0.0e+00
-print_info: n_ff             = 16384
-print_info: n_expert         = 16
-print_info: n_expert_used    = 1
-print_info: causal attn      = 1
-print_info: pooling type     = 0
-print_info: rope type        = 0
-print_info: rope scaling     = linear
-print_info: freq_base_train  = 500000.0
-print_info: freq_scale_train = 1
-print_info: n_ctx_orig_yarn  = 10485760
-print_info: rope_finetuned   = unknown
-print_info: model type       = 17Bx16E (Scout)
-print_info: model params     = 107.77 B
-print_info: general.name     = Llama-4-Scout-17B-16E-Instruct
-print_info: vocab type       = BPE
-print_info: n_vocab          = 202048
-print_info: n_merges         = 439802
-print_info: BOS token        = 200000 '<|begin_of_text|>'
-print_info: EOS token        = 200008 '<|eot|>'
-print_info: PAD token        = 200018 '<|finetune_right_pad|>'
-print_info: LF token         = 198 'Ċ'
-print_info: FIM PRE token    = 200002 '<|fim_prefix|>'
-print_info: FIM SUF token    = 200004 '<|fim_suffix|>'
-print_info: FIM MID token    = 200003 '<|fim_middle|>'
-print_info: EOG token        = 200001 '<|end_of_text|>'
-print_info: EOG token        = 200008 '<|eot|>'
-print_info: max token length = 192
-load_tensors: loading model tensors, this can take a while... (mmap = false)
-load_tensors: offloading 48 repeating layers to GPU
-load_tensors: offloading output layer to GPU
-load_tensors: offloaded 49/49 layers to GPU
-load_tensors:      Vulkan0 model buffer size = 108165.12 MiB
-load_tensors:  Vulkan_Host model buffer size =  1048.22 MiB
-....................................................................................................
-llama_context: constructing llama_context
-llama_context: non-unified KV cache requires ggml_set_rows() - forcing unified KV cache
-llama_context: n_seq_max     = 1
-llama_context: n_ctx         = 4096
-llama_context: n_ctx_per_seq = 4096
-llama_context: n_batch       = 2048
-llama_context: n_ubatch      = 512
-llama_context: causal_attn   = 1
-llama_context: flash_attn    = 1
-llama_context: kv_unified    = true
-llama_context: freq_base     = 500000.0
-llama_context: freq_scale    = 1
-llama_context: n_ctx_per_seq (4096) < n_ctx_train (10485760) -- the full capacity of the model will not be utilized
-llama_context: Vulkan_Host  output buffer size =     0.77 MiB
-llama_kv_cache_unified_iswa: creating non-SWA KV cache, size = 4096 cells
-llama_kv_cache_unified:    Vulkan0 KV buffer size =   192.00 MiB
-llama_kv_cache_unified: size =  192.00 MiB (  4096 cells,  12 layers,  1/ 1 seqs), K (f16):   96.00 MiB, V (f16):   96.00 MiB
-llama_kv_cache_unified: LLAMA_SET_ROWS=0, using old ggml_cpy() method for backwards compatibility
-llama_kv_cache_unified_iswa: creating     SWA KV cache, size = 4096 cells
-llama_kv_cache_unified:    Vulkan0 KV buffer size =   576.00 MiB
-llama_kv_cache_unified: size =  576.00 MiB (  4096 cells,  36 layers,  1/ 1 seqs), K (f16):  288.00 MiB, V (f16):  288.00 MiB
-llama_kv_cache_unified: LLAMA_SET_ROWS=0, using old ggml_cpy() method for backwards compatibility
-llama_context:    Vulkan0 compute buffer size =   440.63 MiB
-llama_context: Vulkan_Host compute buffer size =    26.02 MiB
-llama_context: graph nodes  = 2420
-llama_context: graph splits = 2
-common_init_from_params: added <|end_of_text|> logit bias = -inf
-common_init_from_params: added <|eot|> logit bias = -inf
-common_init_from_params: setting dry_penalty_last_n to ctx_size = 4096
-common_init_from_params: warming up the model with an empty run - please wait ... (--no-warmup to disable)
-main: llama threadpool init, n_threads = 16
-
-system_info: n_threads = 16 (n_threads_batch = 16) / 32 | CPU : SSE3 = 1 | SSSE3 = 1 | AVX = 1 | AVX2 = 1 | F16C = 1 | FMA = 1 | BMI2 = 1 | AVX512 = 1 | AVX512_VBMI = 1 | AVX512_VNNI = 1 | LLAMAFILE = 1 | OPENMP = 1 | REPACK = 1 | 
-
-sampler seed: 4068031204
-sampler params: 
-	repeat_last_n = 64, repeat_penalty = 1.000, frequency_penalty = 0.000, presence_penalty = 0.000
-	dry_multiplier = 0.000, dry_base = 1.750, dry_allowed_length = 2, dry_penalty_last_n = 4096
-	top_k = 40, top_p = 0.950, min_p = 0.050, xtc_probability = 0.000, xtc_threshold = 0.100, typical_p = 1.000, top_n_sigma = -1.000, temp = 0.800
-	mirostat = 0, mirostat_lr = 0.100, mirostat_ent = 5.000
-sampler chain: logits -> logit-bias -> penalties -> dry -> top-n-sigma -> top-k -> typical -> top-p -> min-p -> xtc -> temp-ext -> dist 
-generate: n_ctx = 4096, n_batch = 2048, n_predict = 1, n_keep = 1
-
-Hello 
-
-llama_perf_sampler_print:    sampling time =       0.09 ms /     3 runs   (    0.03 ms per token, 32967.03 tokens per second)
-llama_perf_context_print:        load time =   41299.30 ms
-llama_perf_context_print: prompt eval time =     252.99 ms /     2 tokens (  126.49 ms per token,     7.91 tokens per second)
-llama_perf_context_print:        eval time =       0.00 ms /     1 runs   (    0.00 ms per token,      inf tokens per second)
-llama_perf_context_print:       total time =     280.67 ms /     3 tokens
-llama_perf_context_print:    graphs reused =          0
-    Elapsed #3: 42.081911936s
-    Run #3 status: 0
-  → Avg over 3 runs: 41.626s
@@ -1,181 +0,0 @@
-ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
-ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
-ggml_cuda_init: found 1 ROCm devices:
-  Device 0: Radeon 8060S Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32
-build: 6040 (66625a59) with cc (GCC) 15.1.1 20250521 (Red Hat 15.1.1-2) for x86_64-redhat-linux
-main: llama backend init
-main: load the model and apply lora adapter, if any
-llama_model_load_from_file_impl: using device ROCm0 (Radeon 8060S Graphics) - 124522 MiB free
-llama_model_loader: additional 1 GGUFs metadata loaded.
-llama_model_loader: loaded meta data with 51 key-value pairs and 628 tensors from /home/kyuz0/models/llama-4-scout-17b-16e/Q4_K_XL/Llama-4-Scout-17B-16E-Instruct-UD-Q4_K_XL-00001-of-00002.gguf (version GGUF V3 (latest))
-llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
-llama_model_loader: - kv   0:                       general.architecture str              = llama4
-llama_model_loader: - kv   1:                               general.type str              = model
-llama_model_loader: - kv   2:                               general.name str              = Llama-4-Scout-17B-16E-Instruct
-llama_model_loader: - kv   3:                           general.finetune str              = 16E-Instruct
-llama_model_loader: - kv   4:                           general.basename str              = Llama-4-Scout-17B-16E-Instruct
-llama_model_loader: - kv   5:                       general.quantized_by str              = Unsloth
-llama_model_loader: - kv   6:                         general.size_label str              = 17B
-llama_model_loader: - kv   7:                            general.license str              = other
-llama_model_loader: - kv   8:                       general.license.name str              = llama4
-llama_model_loader: - kv   9:                           general.repo_url str              = https://huggingface.co/unsloth
-llama_model_loader: - kv  10:                   general.base_model.count u32              = 1
-llama_model_loader: - kv  11:                  general.base_model.0.name str              = Llama 4 Scout 17B 16E Instruct
-llama_model_loader: - kv  12:          general.base_model.0.organization str              = Meta Llama
-llama_model_loader: - kv  13:              general.base_model.0.repo_url str              = https://huggingface.co/meta-llama/Lla...
-llama_model_loader: - kv  14:                               general.tags arr[str,5]       = ["facebook", "meta", "pytorch", "llam...
-llama_model_loader: - kv  15:                          general.languages arr[str,12]      = ["ar", "de", "en", "es", "fr", "hi", ...
-llama_model_loader: - kv  16:                         llama4.block_count u32              = 48
-llama_model_loader: - kv  17:                      llama4.context_length u32              = 10485760
-llama_model_loader: - kv  18:                    llama4.embedding_length u32              = 5120
-llama_model_loader: - kv  19:                 llama4.feed_forward_length u32              = 16384
-llama_model_loader: - kv  20:                llama4.attention.head_count u32              = 40
-llama_model_loader: - kv  21:             llama4.attention.head_count_kv u32              = 8
-llama_model_loader: - kv  22:                      llama4.rope.freq_base f32              = 500000.000000
-llama_model_loader: - kv  23:    llama4.attention.layer_norm_rms_epsilon f32              = 0.000010
-llama_model_loader: - kv  24:                        llama4.expert_count u32              = 16
-llama_model_loader: - kv  25:                   llama4.expert_used_count u32              = 1
-llama_model_loader: - kv  26:                llama4.attention.key_length u32              = 128
-llama_model_loader: - kv  27:              llama4.attention.value_length u32              = 128
-llama_model_loader: - kv  28:                          llama4.vocab_size u32              = 202048
-llama_model_loader: - kv  29:                llama4.rope.dimension_count u32              = 128
-llama_model_loader: - kv  30:           llama4.interleave_moe_layer_step u32              = 1
-llama_model_loader: - kv  31:          llama4.expert_feed_forward_length u32              = 8192
-llama_model_loader: - kv  32:                       tokenizer.ggml.model str              = gpt2
-llama_model_loader: - kv  33:                         tokenizer.ggml.pre str              = llama4
-llama_model_loader: - kv  34:                      tokenizer.ggml.tokens arr[str,202048]  = ["À", "Á", "õ", "ö", "÷", "ø", ...
-llama_model_loader: - kv  35:                  tokenizer.ggml.token_type arr[i32,202048]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
-llama_model_loader: - kv  36:                      tokenizer.ggml.merges arr[str,439802]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
-llama_model_loader: - kv  37:                tokenizer.ggml.bos_token_id u32              = 200000
-llama_model_loader: - kv  38:                tokenizer.ggml.eos_token_id u32              = 200008
-llama_model_loader: - kv  39:            tokenizer.ggml.padding_token_id u32              = 200018
-llama_model_loader: - kv  40:               tokenizer.ggml.add_bos_token bool             = true
-llama_model_loader: - kv  41:                    tokenizer.chat_template str              = {{- bos_token }}\n{%- if custom_tools ...
-llama_model_loader: - kv  42:               general.quantization_version u32              = 2
-llama_model_loader: - kv  43:                          general.file_type u32              = 15
-llama_model_loader: - kv  44:                      quantize.imatrix.file str              = Llama-4-Scout-17B-16E-Instruct-GGUF/i...
-llama_model_loader: - kv  45:                   quantize.imatrix.dataset str              = unsloth_calibration_Llama-4-Scout-17B...
-llama_model_loader: - kv  46:             quantize.imatrix.entries_count u32              = 528
-llama_model_loader: - kv  47:              quantize.imatrix.chunks_count u32              = 729
-llama_model_loader: - kv  48:                                   split.no u16              = 0
-llama_model_loader: - kv  49:                        split.tensors.count i32              = 628
-llama_model_loader: - kv  50:                                split.count u16              = 2
-llama_model_loader: - type  f32:  146 tensors
-llama_model_loader: - type q4_K:  421 tensors
-llama_model_loader: - type q5_K:   43 tensors
-llama_model_loader: - type q6_K:   18 tensors
-print_info: file format = GGUF V3 (latest)
-print_info: file type   = Q4_K - Medium
-print_info: file size   = 57.73 GiB (4.60 BPW) 
-load: special_eos_id is not in special_eog_ids - the tokenizer config may be incorrect
-load: special tokens cache size = 1135
-load: token to piece cache size = 1.3873 MB
-print_info: arch             = llama4
-print_info: vocab_only       = 0
-print_info: n_ctx_train      = 10485760
-print_info: n_embd           = 5120
-print_info: n_layer          = 48
-print_info: n_head           = 40
-print_info: n_head_kv        = 8
-print_info: n_rot            = 128
-print_info: n_swa            = 8192
-print_info: is_swa_any       = 1
-print_info: n_embd_head_k    = 128
-print_info: n_embd_head_v    = 128
-print_info: n_gqa            = 5
-print_info: n_embd_k_gqa     = 1024
-print_info: n_embd_v_gqa     = 1024
-print_info: f_norm_eps       = 0.0e+00
-print_info: f_norm_rms_eps   = 1.0e-05
-print_info: f_clamp_kqv      = 0.0e+00
-print_info: f_max_alibi_bias = 0.0e+00
-print_info: f_logit_scale    = 0.0e+00
-print_info: f_attn_scale     = 0.0e+00
-print_info: n_ff             = 16384
-print_info: n_expert         = 16
-print_info: n_expert_used    = 1
-print_info: causal attn      = 1
-print_info: pooling type     = 0
-print_info: rope type        = 0
-print_info: rope scaling     = linear
-print_info: freq_base_train  = 500000.0
-print_info: freq_scale_train = 1
-print_info: n_ctx_orig_yarn  = 10485760
-print_info: rope_finetuned   = unknown
-print_info: model type       = 17Bx16E (Scout)
-print_info: model params     = 107.77 B
-print_info: general.name     = Llama-4-Scout-17B-16E-Instruct
-print_info: vocab type       = BPE
-print_info: n_vocab          = 202048
-print_info: n_merges         = 439802
-print_info: BOS token        = 200000 '<|begin_of_text|>'
-print_info: EOS token        = 200008 '<|eot|>'
-print_info: PAD token        = 200018 '<|finetune_right_pad|>'
-print_info: LF token         = 198 'Ċ'
-print_info: FIM PRE token    = 200002 '<|fim_prefix|>'
-print_info: FIM SUF token    = 200004 '<|fim_suffix|>'
-print_info: FIM MID token    = 200003 '<|fim_middle|>'
-print_info: EOG token        = 200001 '<|end_of_text|>'
-print_info: EOG token        = 200008 '<|eot|>'
-print_info: max token length = 192
-load_tensors: loading model tensors, this can take a while... (mmap = false)
-load_tensors: offloading 48 repeating layers to GPU
-load_tensors: offloading output layer to GPU
-load_tensors: offloaded 49/49 layers to GPU
-load_tensors:          CPU model buffer size =   554.94 MiB
-load_tensors:        ROCm0 model buffer size = 58558.57 MiB
-...................................................................................................
-llama_context: constructing llama_context
-llama_context: non-unified KV cache requires ggml_set_rows() - forcing unified KV cache
-llama_context: n_seq_max     = 1
-llama_context: n_ctx         = 4096
-llama_context: n_ctx_per_seq = 4096
-llama_context: n_batch       = 2048
-llama_context: n_ubatch      = 512
-llama_context: causal_attn   = 1
-llama_context: flash_attn    = 1
-llama_context: kv_unified    = true
-llama_context: freq_base     = 500000.0
-llama_context: freq_scale    = 1
-llama_context: n_ctx_per_seq (4096) < n_ctx_train (10485760) -- the full capacity of the model will not be utilized
-llama_context:  ROCm_Host  output buffer size =     0.77 MiB
-llama_kv_cache_unified_iswa: creating non-SWA KV cache, size = 4096 cells
-llama_kv_cache_unified:      ROCm0 KV buffer size =   192.00 MiB
-llama_kv_cache_unified: size =  192.00 MiB (  4096 cells,  12 layers,  1/ 1 seqs), K (f16):   96.00 MiB, V (f16):   96.00 MiB
-llama_kv_cache_unified: LLAMA_SET_ROWS=0, using old ggml_cpy() method for backwards compatibility
-llama_kv_cache_unified_iswa: creating     SWA KV cache, size = 4096 cells
-llama_kv_cache_unified:      ROCm0 KV buffer size =   576.00 MiB
-llama_kv_cache_unified: size =  576.00 MiB (  4096 cells,  36 layers,  1/ 1 seqs), K (f16):  288.00 MiB, V (f16):  288.00 MiB
-llama_kv_cache_unified: LLAMA_SET_ROWS=0, using old ggml_cpy() method for backwards compatibility
-llama_context:      ROCm0 compute buffer size =   442.62 MiB
-llama_context:  ROCm_Host compute buffer size =    26.01 MiB
-llama_context: graph nodes  = 2420
-llama_context: graph splits = 2
-common_init_from_params: added <|end_of_text|> logit bias = -inf
-common_init_from_params: added <|eot|> logit bias = -inf
-common_init_from_params: setting dry_penalty_last_n to ctx_size = 4096
-common_init_from_params: warming up the model with an empty run - please wait ... (--no-warmup to disable)
-main: llama threadpool init, n_threads = 16
-
-system_info: n_threads = 16 (n_threads_batch = 16) / 32 | ROCm : NO_VMM = 1 | PEER_MAX_BATCH_SIZE = 128 | CPU : SSE3 = 1 | SSSE3 = 1 | AVX = 1 | AVX2 = 1 | F16C = 1 | FMA = 1 | BMI2 = 1 | AVX512 = 1 | AVX512_VBMI = 1 | AVX512_VNNI = 1 | LLAMAFILE = 1 | OPENMP = 1 | REPACK = 1 | 
-
-sampler seed: 4182963810
-sampler params: 
-	repeat_last_n = 64, repeat_penalty = 1.000, frequency_penalty = 0.000, presence_penalty = 0.000
-	dry_multiplier = 0.000, dry_base = 1.750, dry_allowed_length = 2, dry_penalty_last_n = 4096
-	top_k = 40, top_p = 0.950, min_p = 0.050, xtc_probability = 0.000, xtc_threshold = 0.100, typical_p = 1.000, top_n_sigma = -1.000, temp = 0.800
-	mirostat = 0, mirostat_lr = 0.100, mirostat_ent = 5.000
-sampler chain: logits -> logit-bias -> penalties -> dry -> top-n-sigma -> top-k -> typical -> top-p -> min-p -> xtc -> temp-ext -> dist 
-generate: n_ctx = 4096, n_batch = 2048, n_predict = 1, n_keep = 1
-
-Hello The
-
-llama_perf_sampler_print:    sampling time =       0.07 ms /     3 runs   (    0.02 ms per token, 46153.85 tokens per second)
-llama_perf_context_print:        load time =    9663.18 ms
-llama_perf_context_print: prompt eval time =      90.98 ms /     2 tokens (   45.49 ms per token,    21.98 tokens per second)
-llama_perf_context_print:        eval time =       0.00 ms /     1 runs   (    0.00 ms per token,      inf tokens per second)
-llama_perf_context_print:       total time =     110.40 ms /     3 tokens
-llama_perf_context_print:    graphs reused =          0
-    Elapsed #3: 13.853856771s
-    Run #3 status: 0
-  → Avg over 3 runs: 15.776s
@@ -1,162 +0,0 @@
-ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
-ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
-ggml_cuda_init: found 1 ROCm devices:
-  Device 0: AMD Radeon Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32
-build: 6040 (66625a59) with cc (GCC) 15.1.1 20250719 (Red Hat 15.1.1-5) for x86_64-redhat-linux
-main: llama backend init
-main: load the model and apply lora adapter, if any
-llama_model_load_from_file_impl: using device ROCm0 (AMD Radeon Graphics) - 124523 MiB free
-llama_model_loader: additional 1 GGUFs metadata loaded.
-llama_model_loader: loaded meta data with 51 key-value pairs and 628 tensors from /home/kyuz0/models/llama-4-scout-17b-16e/Q4_K_XL/Llama-4-Scout-17B-16E-Instruct-UD-Q4_K_XL-00001-of-00002.gguf (version GGUF V3 (latest))
-llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
-llama_model_loader: - kv   0:                       general.architecture str              = llama4
-llama_model_loader: - kv   1:                               general.type str              = model
-llama_model_loader: - kv   2:                               general.name str              = Llama-4-Scout-17B-16E-Instruct
-llama_model_loader: - kv   3:                           general.finetune str              = 16E-Instruct
-llama_model_loader: - kv   4:                           general.basename str              = Llama-4-Scout-17B-16E-Instruct
-llama_model_loader: - kv   5:                       general.quantized_by str              = Unsloth
-llama_model_loader: - kv   6:                         general.size_label str              = 17B
-llama_model_loader: - kv   7:                            general.license str              = other
-llama_model_loader: - kv   8:                       general.license.name str              = llama4
-llama_model_loader: - kv   9:                           general.repo_url str              = https://huggingface.co/unsloth
-llama_model_loader: - kv  10:                   general.base_model.count u32              = 1
-llama_model_loader: - kv  11:                  general.base_model.0.name str              = Llama 4 Scout 17B 16E Instruct
-llama_model_loader: - kv  12:          general.base_model.0.organization str              = Meta Llama
-llama_model_loader: - kv  13:              general.base_model.0.repo_url str              = https://huggingface.co/meta-llama/Lla...
-llama_model_loader: - kv  14:                               general.tags arr[str,5]       = ["facebook", "meta", "pytorch", "llam...
-llama_model_loader: - kv  15:                          general.languages arr[str,12]      = ["ar", "de", "en", "es", "fr", "hi", ...
-llama_model_loader: - kv  16:                         llama4.block_count u32              = 48
-llama_model_loader: - kv  17:                      llama4.context_length u32              = 10485760
-llama_model_loader: - kv  18:                    llama4.embedding_length u32              = 5120
-llama_model_loader: - kv  19:                 llama4.feed_forward_length u32              = 16384
-llama_model_loader: - kv  20:                llama4.attention.head_count u32              = 40
-llama_model_loader: - kv  21:             llama4.attention.head_count_kv u32              = 8
-llama_model_loader: - kv  22:                      llama4.rope.freq_base f32              = 500000.000000
-llama_model_loader: - kv  23:    llama4.attention.layer_norm_rms_epsilon f32              = 0.000010
-llama_model_loader: - kv  24:                        llama4.expert_count u32              = 16
-llama_model_loader: - kv  25:                   llama4.expert_used_count u32              = 1
-llama_model_loader: - kv  26:                llama4.attention.key_length u32              = 128
-llama_model_loader: - kv  27:              llama4.attention.value_length u32              = 128
-llama_model_loader: - kv  28:                          llama4.vocab_size u32              = 202048
-llama_model_loader: - kv  29:                llama4.rope.dimension_count u32              = 128
-llama_model_loader: - kv  30:           llama4.interleave_moe_layer_step u32              = 1
-llama_model_loader: - kv  31:          llama4.expert_feed_forward_length u32              = 8192
-llama_model_loader: - kv  32:                       tokenizer.ggml.model str              = gpt2
-llama_model_loader: - kv  33:                         tokenizer.ggml.pre str              = llama4
-llama_model_loader: - kv  34:                      tokenizer.ggml.tokens arr[str,202048]  = ["À", "Á", "õ", "ö", "÷", "ø", ...
-llama_model_loader: - kv  35:                  tokenizer.ggml.token_type arr[i32,202048]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
-llama_model_loader: - kv  36:                      tokenizer.ggml.merges arr[str,439802]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
-llama_model_loader: - kv  37:                tokenizer.ggml.bos_token_id u32              = 200000
-llama_model_loader: - kv  38:                tokenizer.ggml.eos_token_id u32              = 200008
-llama_model_loader: - kv  39:            tokenizer.ggml.padding_token_id u32              = 200018
-llama_model_loader: - kv  40:               tokenizer.ggml.add_bos_token bool             = true
-llama_model_loader: - kv  41:                    tokenizer.chat_template str              = {{- bos_token }}\n{%- if custom_tools ...
-llama_model_loader: - kv  42:               general.quantization_version u32              = 2
-llama_model_loader: - kv  43:                          general.file_type u32              = 15
-llama_model_loader: - kv  44:                      quantize.imatrix.file str              = Llama-4-Scout-17B-16E-Instruct-GGUF/i...
-llama_model_loader: - kv  45:                   quantize.imatrix.dataset str              = unsloth_calibration_Llama-4-Scout-17B...
-llama_model_loader: - kv  46:             quantize.imatrix.entries_count u32              = 528
-llama_model_loader: - kv  47:              quantize.imatrix.chunks_count u32              = 729
-llama_model_loader: - kv  48:                                   split.no u16              = 0
-llama_model_loader: - kv  49:                        split.tensors.count i32              = 628
-llama_model_loader: - kv  50:                                split.count u16              = 2
-llama_model_loader: - type  f32:  146 tensors
-llama_model_loader: - type q4_K:  421 tensors
-llama_model_loader: - type q5_K:   43 tensors
-llama_model_loader: - type q6_K:   18 tensors
-print_info: file format = GGUF V3 (latest)
-print_info: file type   = Q4_K - Medium
-print_info: file size   = 57.73 GiB (4.60 BPW) 
-load: special_eos_id is not in special_eog_ids - the tokenizer config may be incorrect
-load: special tokens cache size = 1135
-load: token to piece cache size = 1.3873 MB
-print_info: arch             = llama4
-print_info: vocab_only       = 0
-print_info: n_ctx_train      = 10485760
-print_info: n_embd           = 5120
-print_info: n_layer          = 48
-print_info: n_head           = 40
-print_info: n_head_kv        = 8
-print_info: n_rot            = 128
-print_info: n_swa            = 8192
-print_info: is_swa_any       = 1
-print_info: n_embd_head_k    = 128
-print_info: n_embd_head_v    = 128
-print_info: n_gqa            = 5
-print_info: n_embd_k_gqa     = 1024
-print_info: n_embd_v_gqa     = 1024
-print_info: f_norm_eps       = 0.0e+00
-print_info: f_norm_rms_eps   = 1.0e-05
-print_info: f_clamp_kqv      = 0.0e+00
-print_info: f_max_alibi_bias = 0.0e+00
-print_info: f_logit_scale    = 0.0e+00
-print_info: f_attn_scale     = 0.0e+00
-print_info: n_ff             = 16384
-print_info: n_expert         = 16
-print_info: n_expert_used    = 1
-print_info: causal attn      = 1
-print_info: pooling type     = 0
-print_info: rope type        = 0
-print_info: rope scaling     = linear
-print_info: freq_base_train  = 500000.0
-print_info: freq_scale_train = 1
-print_info: n_ctx_orig_yarn  = 10485760
-print_info: rope_finetuned   = unknown
-print_info: model type       = 17Bx16E (Scout)
-print_info: model params     = 107.77 B
-print_info: general.name     = Llama-4-Scout-17B-16E-Instruct
-print_info: vocab type       = BPE
-print_info: n_vocab          = 202048
-print_info: n_merges         = 439802
-print_info: BOS token        = 200000 '<|begin_of_text|>'
-print_info: EOS token        = 200008 '<|eot|>'
-print_info: PAD token        = 200018 '<|finetune_right_pad|>'
-print_info: LF token         = 198 'Ċ'
-print_info: FIM PRE token    = 200002 '<|fim_prefix|>'
-print_info: FIM SUF token    = 200004 '<|fim_suffix|>'
-print_info: FIM MID token    = 200003 '<|fim_middle|>'
-print_info: EOG token        = 200001 '<|end_of_text|>'
-print_info: EOG token        = 200008 '<|eot|>'
-print_info: max token length = 192
-load_tensors: loading model tensors, this can take a while... (mmap = false)
-load_tensors: offloading 48 repeating layers to GPU
-load_tensors: offloading output layer to GPU
-load_tensors: offloaded 49/49 layers to GPU
-load_tensors:          CPU model buffer size =   554.94 MiB
-load_tensors:        ROCm0 model buffer size = 58558.57 MiB
-...................................................................................................
-llama_context: constructing llama_context
-llama_context: non-unified KV cache requires ggml_set_rows() - forcing unified KV cache
-llama_context: n_seq_max     = 1
-llama_context: n_ctx         = 4096
-llama_context: n_ctx_per_seq = 4096
-llama_context: n_batch       = 2048
-llama_context: n_ubatch      = 512
-llama_context: causal_attn   = 1
-llama_context: flash_attn    = 1
-llama_context: kv_unified    = true
-llama_context: freq_base     = 500000.0
-llama_context: freq_scale    = 1
-llama_context: n_ctx_per_seq (4096) < n_ctx_train (10485760) -- the full capacity of the model will not be utilized
-llama_context:  ROCm_Host  output buffer size =     0.77 MiB
-llama_kv_cache_unified_iswa: creating non-SWA KV cache, size = 4096 cells
-llama_kv_cache_unified:      ROCm0 KV buffer size =   192.00 MiB
-llama_kv_cache_unified: size =  192.00 MiB (  4096 cells,  12 layers,  1/ 1 seqs), K (f16):   96.00 MiB, V (f16):   96.00 MiB
-llama_kv_cache_unified: LLAMA_SET_ROWS=0, using old ggml_cpy() method for backwards compatibility
-llama_kv_cache_unified_iswa: creating     SWA KV cache, size = 4096 cells
-llama_kv_cache_unified:      ROCm0 KV buffer size =   576.00 MiB
-llama_kv_cache_unified: size =  576.00 MiB (  4096 cells,  36 layers,  1/ 1 seqs), K (f16):  288.00 MiB, V (f16):  288.00 MiB
-llama_kv_cache_unified: LLAMA_SET_ROWS=0, using old ggml_cpy() method for backwards compatibility
-llama_context:      ROCm0 compute buffer size =   442.62 MiB
-llama_context:  ROCm_Host compute buffer size =    26.01 MiB
-llama_context: graph nodes  = 2420
-llama_context: graph splits = 2
-common_init_from_params: added <|end_of_text|> logit bias = -inf
-common_init_from_params: added <|eot|> logit bias = -inf
-common_init_from_params: setting dry_penalty_last_n to ctx_size = 4096
-common_init_from_params: warming up the model with an empty run - please wait ... (--no-warmup to disable)
-HW Exception by GPU node-1 (Agent handle: 0x48fa1f0) reason :GPU Hang
-    Elapsed #3: 22.180402418s
-    Run #3 status: 134
-    ✖ run #3 failed
-  → No successful runs
@@ -1,174 +0,0 @@
-ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
-ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
-ggml_cuda_init: found 1 ROCm devices:
-  Device 0: AMD Radeon Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32
-build: 6066 (4cb208c9) with cc (GCC) 15.1.1 20250719 (Red Hat 15.1.1-5) for x86_64-redhat-linux
-main: llama backend init
-main: load the model and apply lora adapter, if any
-llama_model_load_from_file_impl: using device ROCm0 (AMD Radeon Graphics) - 124523 MiB free
-llama_model_loader: additional 1 GGUFs metadata loaded.
-llama_model_loader: loaded meta data with 51 key-value pairs and 628 tensors from /home/kyuz0/models/llama-4-scout-17b-16e/Q4_K_XL/Llama-4-Scout-17B-16E-Instruct-UD-Q4_K_XL-00001-of-00002.gguf (version GGUF V3 (latest))
-llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
-llama_model_loader: - kv   0:                       general.architecture str              = llama4
-llama_model_loader: - kv   1:                               general.type str              = model
-llama_model_loader: - kv   2:                               general.name str              = Llama-4-Scout-17B-16E-Instruct
-llama_model_loader: - kv   3:                           general.finetune str              = 16E-Instruct
-llama_model_loader: - kv   4:                           general.basename str              = Llama-4-Scout-17B-16E-Instruct
-llama_model_loader: - kv   5:                       general.quantized_by str              = Unsloth
-llama_model_loader: - kv   6:                         general.size_label str              = 17B
-llama_model_loader: - kv   7:                            general.license str              = other
-llama_model_loader: - kv   8:                       general.license.name str              = llama4
-llama_model_loader: - kv   9:                           general.repo_url str              = https://huggingface.co/unsloth
-llama_model_loader: - kv  10:                   general.base_model.count u32              = 1
-llama_model_loader: - kv  11:                  general.base_model.0.name str              = Llama 4 Scout 17B 16E Instruct
-llama_model_loader: - kv  12:          general.base_model.0.organization str              = Meta Llama
-llama_model_loader: - kv  13:              general.base_model.0.repo_url str              = https://huggingface.co/meta-llama/Lla...
-llama_model_loader: - kv  14:                               general.tags arr[str,5]       = ["facebook", "meta", "pytorch", "llam...
-llama_model_loader: - kv  15:                          general.languages arr[str,12]      = ["ar", "de", "en", "es", "fr", "hi", ...
-llama_model_loader: - kv  16:                         llama4.block_count u32              = 48
-llama_model_loader: - kv  17:                      llama4.context_length u32              = 10485760
-llama_model_loader: - kv  18:                    llama4.embedding_length u32              = 5120
-llama_model_loader: - kv  19:                 llama4.feed_forward_length u32              = 16384
-llama_model_loader: - kv  20:                llama4.attention.head_count u32              = 40
-llama_model_loader: - kv  21:             llama4.attention.head_count_kv u32              = 8
-llama_model_loader: - kv  22:                      llama4.rope.freq_base f32              = 500000.000000
-llama_model_loader: - kv  23:    llama4.attention.layer_norm_rms_epsilon f32              = 0.000010
-llama_model_loader: - kv  24:                        llama4.expert_count u32              = 16
-llama_model_loader: - kv  25:                   llama4.expert_used_count u32              = 1
-llama_model_loader: - kv  26:                llama4.attention.key_length u32              = 128
-llama_model_loader: - kv  27:              llama4.attention.value_length u32              = 128
-llama_model_loader: - kv  28:                          llama4.vocab_size u32              = 202048
-llama_model_loader: - kv  29:                llama4.rope.dimension_count u32              = 128
-llama_model_loader: - kv  30:           llama4.interleave_moe_layer_step u32              = 1
-llama_model_loader: - kv  31:          llama4.expert_feed_forward_length u32              = 8192
-llama_model_loader: - kv  32:                       tokenizer.ggml.model str              = gpt2
-llama_model_loader: - kv  33:                         tokenizer.ggml.pre str              = llama4
-llama_model_loader: - kv  34:                      tokenizer.ggml.tokens arr[str,202048]  = ["À", "Á", "õ", "ö", "÷", "ø", ...
-llama_model_loader: - kv  35:                  tokenizer.ggml.token_type arr[i32,202048]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
-llama_model_loader: - kv  36:                      tokenizer.ggml.merges arr[str,439802]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
-llama_model_loader: - kv  37:                tokenizer.ggml.bos_token_id u32              = 200000
-llama_model_loader: - kv  38:                tokenizer.ggml.eos_token_id u32              = 200008
-llama_model_loader: - kv  39:            tokenizer.ggml.padding_token_id u32              = 200018
-llama_model_loader: - kv  40:               tokenizer.ggml.add_bos_token bool             = true
-llama_model_loader: - kv  41:                    tokenizer.chat_template str              = {{- bos_token }}\n{%- if custom_tools ...
-llama_model_loader: - kv  42:               general.quantization_version u32              = 2
-llama_model_loader: - kv  43:                          general.file_type u32              = 15
-llama_model_loader: - kv  44:                      quantize.imatrix.file str              = Llama-4-Scout-17B-16E-Instruct-GGUF/i...
-llama_model_loader: - kv  45:                   quantize.imatrix.dataset str              = unsloth_calibration_Llama-4-Scout-17B...
-llama_model_loader: - kv  46:             quantize.imatrix.entries_count u32              = 528
-llama_model_loader: - kv  47:              quantize.imatrix.chunks_count u32              = 729
-llama_model_loader: - kv  48:                                   split.no u16              = 0
-llama_model_loader: - kv  49:                        split.tensors.count i32              = 628
-llama_model_loader: - kv  50:                                split.count u16              = 2
-llama_model_loader: - type  f32:  146 tensors
-llama_model_loader: - type q4_K:  421 tensors
-llama_model_loader: - type q5_K:   43 tensors
-llama_model_loader: - type q6_K:   18 tensors
-print_info: file format = GGUF V3 (latest)
-print_info: file type   = Q4_K - Medium
-print_info: file size   = 57.73 GiB (4.60 BPW) 
-load: special_eos_id is not in special_eog_ids - the tokenizer config may be incorrect
-load: special tokens cache size = 1135
-load: token to piece cache size = 1.3873 MB
-print_info: arch             = llama4
-print_info: vocab_only       = 0
-print_info: n_ctx_train      = 10485760
-print_info: n_embd           = 5120
-print_info: n_layer          = 48
-print_info: n_head           = 40
-print_info: n_head_kv        = 8
-print_info: n_rot            = 128
-print_info: n_swa            = 8192
-print_info: is_swa_any       = 1
-print_info: n_embd_head_k    = 128
-print_info: n_embd_head_v    = 128
-print_info: n_gqa            = 5
-print_info: n_embd_k_gqa     = 1024
-print_info: n_embd_v_gqa     = 1024
-print_info: f_norm_eps       = 0.0e+00
-print_info: f_norm_rms_eps   = 1.0e-05
-print_info: f_clamp_kqv      = 0.0e+00
-print_info: f_max_alibi_bias = 0.0e+00
-print_info: f_logit_scale    = 0.0e+00
-print_info: f_attn_scale     = 0.0e+00
-print_info: n_ff             = 16384
-print_info: n_expert         = 16
-print_info: n_expert_used    = 1
-print_info: causal attn      = 1
-print_info: pooling type     = 0
-print_info: rope type        = 0
-print_info: rope scaling     = linear
-print_info: freq_base_train  = 500000.0
-print_info: freq_scale_train = 1
-print_info: n_ctx_orig_yarn  = 10485760
-print_info: rope_finetuned   = unknown
-print_info: model type       = 17Bx16E (Scout)
-print_info: model params     = 107.77 B
-print_info: general.name     = Llama-4-Scout-17B-16E-Instruct
-print_info: vocab type       = BPE
-print_info: n_vocab          = 202048
-print_info: n_merges         = 439802
-print_info: BOS token        = 200000 '<|begin_of_text|>'
-print_info: EOS token        = 200008 '<|eot|>'
-print_info: PAD token        = 200018 '<|finetune_right_pad|>'
-print_info: LF token         = 198 'Ċ'
-print_info: FIM PRE token    = 200002 '<|fim_prefix|>'
-print_info: FIM SUF token    = 200004 '<|fim_suffix|>'
-print_info: FIM MID token    = 200003 '<|fim_middle|>'
-print_info: EOG token        = 200001 '<|end_of_text|>'
-print_info: EOG token        = 200008 '<|eot|>'
-print_info: max token length = 192
-load_tensors: loading model tensors, this can take a while... (mmap = false)
-load_tensors: offloading 48 repeating layers to GPU
-load_tensors: offloading output layer to GPU
-load_tensors: offloaded 49/49 layers to GPU
-load_tensors:          CPU model buffer size =   554.94 MiB
-load_tensors:        ROCm0 model buffer size = 58558.57 MiB
-...................................................................................................
-llama_context: constructing llama_context
-llama_context: non-unified KV cache requires ggml_set_rows() - forcing unified KV cache
-llama_context: n_seq_max     = 1
-llama_context: n_ctx         = 4096
-llama_context: n_ctx_per_seq = 4096
-llama_context: n_batch       = 2048
-llama_context: n_ubatch      = 512
-llama_context: causal_attn   = 1
-llama_context: flash_attn    = 1
-llama_context: kv_unified    = true
-llama_context: freq_base     = 500000.0
-llama_context: freq_scale    = 1
-llama_context: n_ctx_per_seq (4096) < n_ctx_train (10485760) -- the full capacity of the model will not be utilized
-llama_context:  ROCm_Host  output buffer size =     0.77 MiB
-llama_kv_cache_unified_iswa: creating non-SWA KV cache, size = 4096 cells
-llama_kv_cache_unified:      ROCm0 KV buffer size =   192.00 MiB
-llama_kv_cache_unified: size =  192.00 MiB (  4096 cells,  12 layers,  1/ 1 seqs), K (f16):   96.00 MiB, V (f16):   96.00 MiB
-llama_kv_cache_unified: LLAMA_SET_ROWS=0, using old ggml_cpy() method for backwards compatibility
-llama_kv_cache_unified_iswa: creating     SWA KV cache, size = 4096 cells
-llama_kv_cache_unified:      ROCm0 KV buffer size =   576.00 MiB
-llama_kv_cache_unified: size =  576.00 MiB (  4096 cells,  36 layers,  1/ 1 seqs), K (f16):  288.00 MiB, V (f16):  288.00 MiB
-llama_kv_cache_unified: LLAMA_SET_ROWS=0, using old ggml_cpy() method for backwards compatibility
-llama_context:      ROCm0 compute buffer size =   442.62 MiB
-llama_context:  ROCm_Host compute buffer size =    26.01 MiB
-llama_context: graph nodes  = 2420
-llama_context: graph splits = 2
-common_init_from_params: added <|end_of_text|> logit bias = -inf
-common_init_from_params: added <|eot|> logit bias = -inf
-common_init_from_params: setting dry_penalty_last_n to ctx_size = 4096
-common_init_from_params: warming up the model with an empty run - please wait ... (--no-warmup to disable)
-main: llama threadpool init, n_threads = 16
-
-system_info: n_threads = 16 (n_threads_batch = 16) / 32 | ROCm : NO_VMM = 1 | PEER_MAX_BATCH_SIZE = 128 | CPU : SSE3 = 1 | SSSE3 = 1 | AVX = 1 | AVX2 = 1 | F16C = 1 | FMA = 1 | BMI2 = 1 | AVX512 = 1 | AVX512_VBMI = 1 | AVX512_VNNI = 1 | LLAMAFILE = 1 | OPENMP = 1 | REPACK = 1 | 
-
-sampler seed: 722371466
-sampler params: 
-	repeat_last_n = 64, repeat_penalty = 1.000, frequency_penalty = 0.000, presence_penalty = 0.000
-	dry_multiplier = 0.000, dry_base = 1.750, dry_allowed_length = 2, dry_penalty_last_n = 4096
-	top_k = 40, top_p = 0.950, min_p = 0.050, xtc_probability = 0.000, xtc_threshold = 0.100, typical_p = 1.000, top_n_sigma = -1.000, temp = 0.800
-	mirostat = 0, mirostat_lr = 0.100, mirostat_ent = 5.000
-sampler chain: logits -> logit-bias -> penalties -> dry -> top-n-sigma -> top-k -> typical -> top-p -> min-p -> xtc -> temp-ext -> dist 
-generate: n_ctx = 4096, n_batch = 2048, n_predict = 1, n_keep = 1
-
-Hello    Elapsed #3: 22.602610057s
-    Run #3 status: 134
-    ✖ run #3 failed
-  → Avg over 2 runs: 19.365s
@@ -1,179 +0,0 @@
-ggml_vulkan: Found 1 Vulkan devices:
-ggml_vulkan: 0 = Radeon 8060S Graphics (AMD open-source driver) | uma: 1 | fp16: 1 | bf16: 0 | warp size: 64 | shared memory: 32768 | int dot: 1 | matrix cores: KHR_coopmat
-build: 6060 (9c35706b) with cc (GCC) 15.1.1 20250719 (Red Hat 15.1.1-5) for x86_64-redhat-linux
-main: llama backend init
-main: load the model and apply lora adapter, if any
-llama_model_load_from_file_impl: using device Vulkan0 (Radeon 8060S Graphics) - 85720 MiB free
-llama_model_loader: additional 1 GGUFs metadata loaded.
-llama_model_loader: loaded meta data with 51 key-value pairs and 628 tensors from /home/kyuz0/models/llama-4-scout-17b-16e/Q4_K_XL/Llama-4-Scout-17B-16E-Instruct-UD-Q4_K_XL-00001-of-00002.gguf (version GGUF V3 (latest))
-llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
-llama_model_loader: - kv   0:                       general.architecture str              = llama4
-llama_model_loader: - kv   1:                               general.type str              = model
-llama_model_loader: - kv   2:                               general.name str              = Llama-4-Scout-17B-16E-Instruct
-llama_model_loader: - kv   3:                           general.finetune str              = 16E-Instruct
-llama_model_loader: - kv   4:                           general.basename str              = Llama-4-Scout-17B-16E-Instruct
-llama_model_loader: - kv   5:                       general.quantized_by str              = Unsloth
-llama_model_loader: - kv   6:                         general.size_label str              = 17B
-llama_model_loader: - kv   7:                            general.license str              = other
-llama_model_loader: - kv   8:                       general.license.name str              = llama4
-llama_model_loader: - kv   9:                           general.repo_url str              = https://huggingface.co/unsloth
-llama_model_loader: - kv  10:                   general.base_model.count u32              = 1
-llama_model_loader: - kv  11:                  general.base_model.0.name str              = Llama 4 Scout 17B 16E Instruct
-llama_model_loader: - kv  12:          general.base_model.0.organization str              = Meta Llama
-llama_model_loader: - kv  13:              general.base_model.0.repo_url str              = https://huggingface.co/meta-llama/Lla...
-llama_model_loader: - kv  14:                               general.tags arr[str,5]       = ["facebook", "meta", "pytorch", "llam...
-llama_model_loader: - kv  15:                          general.languages arr[str,12]      = ["ar", "de", "en", "es", "fr", "hi", ...
-llama_model_loader: - kv  16:                         llama4.block_count u32              = 48
-llama_model_loader: - kv  17:                      llama4.context_length u32              = 10485760
-llama_model_loader: - kv  18:                    llama4.embedding_length u32              = 5120
-llama_model_loader: - kv  19:                 llama4.feed_forward_length u32              = 16384
-llama_model_loader: - kv  20:                llama4.attention.head_count u32              = 40
-llama_model_loader: - kv  21:             llama4.attention.head_count_kv u32              = 8
-llama_model_loader: - kv  22:                      llama4.rope.freq_base f32              = 500000.000000
-llama_model_loader: - kv  23:    llama4.attention.layer_norm_rms_epsilon f32              = 0.000010
-llama_model_loader: - kv  24:                        llama4.expert_count u32              = 16
-llama_model_loader: - kv  25:                   llama4.expert_used_count u32              = 1
-llama_model_loader: - kv  26:                llama4.attention.key_length u32              = 128
-llama_model_loader: - kv  27:              llama4.attention.value_length u32              = 128
-llama_model_loader: - kv  28:                          llama4.vocab_size u32              = 202048
-llama_model_loader: - kv  29:                llama4.rope.dimension_count u32              = 128
-llama_model_loader: - kv  30:           llama4.interleave_moe_layer_step u32              = 1
-llama_model_loader: - kv  31:          llama4.expert_feed_forward_length u32              = 8192
-llama_model_loader: - kv  32:                       tokenizer.ggml.model str              = gpt2
-llama_model_loader: - kv  33:                         tokenizer.ggml.pre str              = llama4
-llama_model_loader: - kv  34:                      tokenizer.ggml.tokens arr[str,202048]  = ["À", "Á", "õ", "ö", "÷", "ø", ...
-llama_model_loader: - kv  35:                  tokenizer.ggml.token_type arr[i32,202048]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
-llama_model_loader: - kv  36:                      tokenizer.ggml.merges arr[str,439802]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
-llama_model_loader: - kv  37:                tokenizer.ggml.bos_token_id u32              = 200000
-llama_model_loader: - kv  38:                tokenizer.ggml.eos_token_id u32              = 200008
-llama_model_loader: - kv  39:            tokenizer.ggml.padding_token_id u32              = 200018
-llama_model_loader: - kv  40:               tokenizer.ggml.add_bos_token bool             = true
-llama_model_loader: - kv  41:                    tokenizer.chat_template str              = {{- bos_token }}\n{%- if custom_tools ...
-llama_model_loader: - kv  42:               general.quantization_version u32              = 2
-llama_model_loader: - kv  43:                          general.file_type u32              = 15
-llama_model_loader: - kv  44:                      quantize.imatrix.file str              = Llama-4-Scout-17B-16E-Instruct-GGUF/i...
-llama_model_loader: - kv  45:                   quantize.imatrix.dataset str              = unsloth_calibration_Llama-4-Scout-17B...
-llama_model_loader: - kv  46:             quantize.imatrix.entries_count u32              = 528
-llama_model_loader: - kv  47:              quantize.imatrix.chunks_count u32              = 729
-llama_model_loader: - kv  48:                                   split.no u16              = 0
-llama_model_loader: - kv  49:                        split.tensors.count i32              = 628
-llama_model_loader: - kv  50:                                split.count u16              = 2
-llama_model_loader: - type  f32:  146 tensors
-llama_model_loader: - type q4_K:  421 tensors
-llama_model_loader: - type q5_K:   43 tensors
-llama_model_loader: - type q6_K:   18 tensors
-print_info: file format = GGUF V3 (latest)
-print_info: file type   = Q4_K - Medium
-print_info: file size   = 57.73 GiB (4.60 BPW) 
-load: special_eos_id is not in special_eog_ids - the tokenizer config may be incorrect
-load: special tokens cache size = 1135
-load: token to piece cache size = 1.3873 MB
-print_info: arch             = llama4
-print_info: vocab_only       = 0
-print_info: n_ctx_train      = 10485760
-print_info: n_embd           = 5120
-print_info: n_layer          = 48
-print_info: n_head           = 40
-print_info: n_head_kv        = 8
-print_info: n_rot            = 128
-print_info: n_swa            = 8192
-print_info: is_swa_any       = 1
-print_info: n_embd_head_k    = 128
-print_info: n_embd_head_v    = 128
-print_info: n_gqa            = 5
-print_info: n_embd_k_gqa     = 1024
-print_info: n_embd_v_gqa     = 1024
-print_info: f_norm_eps       = 0.0e+00
-print_info: f_norm_rms_eps   = 1.0e-05
-print_info: f_clamp_kqv      = 0.0e+00
-print_info: f_max_alibi_bias = 0.0e+00
-print_info: f_logit_scale    = 0.0e+00
-print_info: f_attn_scale     = 0.0e+00
-print_info: n_ff             = 16384
-print_info: n_expert         = 16
-print_info: n_expert_used    = 1
-print_info: causal attn      = 1
-print_info: pooling type     = 0
-print_info: rope type        = 0
-print_info: rope scaling     = linear
-print_info: freq_base_train  = 500000.0
-print_info: freq_scale_train = 1
-print_info: n_ctx_orig_yarn  = 10485760
-print_info: rope_finetuned   = unknown
-print_info: model type       = 17Bx16E (Scout)
-print_info: model params     = 107.77 B
-print_info: general.name     = Llama-4-Scout-17B-16E-Instruct
-print_info: vocab type       = BPE
-print_info: n_vocab          = 202048
-print_info: n_merges         = 439802
-print_info: BOS token        = 200000 '<|begin_of_text|>'
-print_info: EOS token        = 200008 '<|eot|>'
-print_info: PAD token        = 200018 '<|finetune_right_pad|>'
-print_info: LF token         = 198 'Ċ'
-print_info: FIM PRE token    = 200002 '<|fim_prefix|>'
-print_info: FIM SUF token    = 200004 '<|fim_suffix|>'
-print_info: FIM MID token    = 200003 '<|fim_middle|>'
-print_info: EOG token        = 200001 '<|end_of_text|>'
-print_info: EOG token        = 200008 '<|eot|>'
-print_info: max token length = 192
-load_tensors: loading model tensors, this can take a while... (mmap = false)
-load_tensors: offloading 48 repeating layers to GPU
-load_tensors: offloading output layer to GPU
-load_tensors: offloaded 49/49 layers to GPU
-load_tensors:      Vulkan0 model buffer size = 58558.57 MiB
-load_tensors:          CPU model buffer size =   554.94 MiB
-....................................................................................................
-llama_context: constructing llama_context
-llama_context: non-unified KV cache requires ggml_set_rows() - forcing unified KV cache
-llama_context: n_seq_max     = 1
-llama_context: n_ctx         = 4096
-llama_context: n_ctx_per_seq = 4096
-llama_context: n_batch       = 2048
-llama_context: n_ubatch      = 512
-llama_context: causal_attn   = 1
-llama_context: flash_attn    = 1
-llama_context: kv_unified    = true
-llama_context: freq_base     = 500000.0
-llama_context: freq_scale    = 1
-llama_context: n_ctx_per_seq (4096) < n_ctx_train (10485760) -- the full capacity of the model will not be utilized
-llama_context: Vulkan_Host  output buffer size =     0.77 MiB
-llama_kv_cache_unified_iswa: creating non-SWA KV cache, size = 4096 cells
-llama_kv_cache_unified:    Vulkan0 KV buffer size =   192.00 MiB
-llama_kv_cache_unified: size =  192.00 MiB (  4096 cells,  12 layers,  1/ 1 seqs), K (f16):   96.00 MiB, V (f16):   96.00 MiB
-llama_kv_cache_unified: LLAMA_SET_ROWS=0, using old ggml_cpy() method for backwards compatibility
-llama_kv_cache_unified_iswa: creating     SWA KV cache, size = 4096 cells
-llama_kv_cache_unified:    Vulkan0 KV buffer size =   576.00 MiB
-llama_kv_cache_unified: size =  576.00 MiB (  4096 cells,  36 layers,  1/ 1 seqs), K (f16):  288.00 MiB, V (f16):  288.00 MiB
-llama_kv_cache_unified: LLAMA_SET_ROWS=0, using old ggml_cpy() method for backwards compatibility
-llama_context:    Vulkan0 compute buffer size =   440.63 MiB
-llama_context: Vulkan_Host compute buffer size =    26.01 MiB
-llama_context: graph nodes  = 2420
-llama_context: graph splits = 2
-common_init_from_params: added <|end_of_text|> logit bias = -inf
-common_init_from_params: added <|eot|> logit bias = -inf
-common_init_from_params: setting dry_penalty_last_n to ctx_size = 4096
-common_init_from_params: warming up the model with an empty run - please wait ... (--no-warmup to disable)
-main: llama threadpool init, n_threads = 16
-
-system_info: n_threads = 16 (n_threads_batch = 16) / 32 | CPU : SSE3 = 1 | SSSE3 = 1 | AVX = 1 | AVX2 = 1 | F16C = 1 | FMA = 1 | BMI2 = 1 | AVX512 = 1 | AVX512_VBMI = 1 | AVX512_VNNI = 1 | LLAMAFILE = 1 | OPENMP = 1 | REPACK = 1 | 
-
-sampler seed: 83044290
-sampler params: 
-	repeat_last_n = 64, repeat_penalty = 1.000, frequency_penalty = 0.000, presence_penalty = 0.000
-	dry_multiplier = 0.000, dry_base = 1.750, dry_allowed_length = 2, dry_penalty_last_n = 4096
-	top_k = 40, top_p = 0.950, min_p = 0.050, xtc_probability = 0.000, xtc_threshold = 0.100, typical_p = 1.000, top_n_sigma = -1.000, temp = 0.800
-	mirostat = 0, mirostat_lr = 0.100, mirostat_ent = 5.000
-sampler chain: logits -> logit-bias -> penalties -> dry -> top-n-sigma -> top-k -> typical -> top-p -> min-p -> xtc -> temp-ext -> dist 
-generate: n_ctx = 4096, n_batch = 2048, n_predict = 1, n_keep = 1
-
-Hello 
-
-llama_perf_sampler_print:    sampling time =       0.16 ms /     3 runs   (    0.05 ms per token, 18518.52 tokens per second)
-llama_perf_context_print:        load time =   13560.35 ms
-llama_perf_context_print: prompt eval time =     257.61 ms /     2 tokens (  128.81 ms per token,     7.76 tokens per second)
-llama_perf_context_print:        eval time =       0.00 ms /     1 runs   (    0.00 ms per token,      inf tokens per second)
-llama_perf_context_print:       total time =     285.54 ms /     3 tokens
-llama_perf_context_print:    graphs reused =          0
-    Elapsed #3: 14.548378284s
-    Run #3 status: 0
-  → Avg over 3 runs: 16.752s
@@ -1,179 +0,0 @@
-ggml_vulkan: Found 1 Vulkan devices:
-ggml_vulkan: 0 = Radeon 8060S Graphics (RADV GFX1151) (radv) | uma: 1 | fp16: 1 | bf16: 0 | warp size: 64 | shared memory: 65536 | int dot: 1 | matrix cores: KHR_coopmat
-build: 6040 (66625a59) with cc (GCC) 15.1.1 20250719 (Red Hat 15.1.1-5) for x86_64-redhat-linux
-main: llama backend init
-main: load the model and apply lora adapter, if any
-llama_model_load_from_file_impl: using device Vulkan0 (Radeon 8060S Graphics (RADV GFX1151)) - 87722 MiB free
-llama_model_loader: additional 1 GGUFs metadata loaded.
-llama_model_loader: loaded meta data with 51 key-value pairs and 628 tensors from /home/kyuz0/models/llama-4-scout-17b-16e/Q4_K_XL/Llama-4-Scout-17B-16E-Instruct-UD-Q4_K_XL-00001-of-00002.gguf (version GGUF V3 (latest))
-llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
-llama_model_loader: - kv   0:                       general.architecture str              = llama4
-llama_model_loader: - kv   1:                               general.type str              = model
-llama_model_loader: - kv   2:                               general.name str              = Llama-4-Scout-17B-16E-Instruct
-llama_model_loader: - kv   3:                           general.finetune str              = 16E-Instruct
-llama_model_loader: - kv   4:                           general.basename str              = Llama-4-Scout-17B-16E-Instruct
-llama_model_loader: - kv   5:                       general.quantized_by str              = Unsloth
-llama_model_loader: - kv   6:                         general.size_label str              = 17B
-llama_model_loader: - kv   7:                            general.license str              = other
-llama_model_loader: - kv   8:                       general.license.name str              = llama4
-llama_model_loader: - kv   9:                           general.repo_url str              = https://huggingface.co/unsloth
-llama_model_loader: - kv  10:                   general.base_model.count u32              = 1
-llama_model_loader: - kv  11:                  general.base_model.0.name str              = Llama 4 Scout 17B 16E Instruct
-llama_model_loader: - kv  12:          general.base_model.0.organization str              = Meta Llama
-llama_model_loader: - kv  13:              general.base_model.0.repo_url str              = https://huggingface.co/meta-llama/Lla...
-llama_model_loader: - kv  14:                               general.tags arr[str,5]       = ["facebook", "meta", "pytorch", "llam...
-llama_model_loader: - kv  15:                          general.languages arr[str,12]      = ["ar", "de", "en", "es", "fr", "hi", ...
-llama_model_loader: - kv  16:                         llama4.block_count u32              = 48
-llama_model_loader: - kv  17:                      llama4.context_length u32              = 10485760
-llama_model_loader: - kv  18:                    llama4.embedding_length u32              = 5120
-llama_model_loader: - kv  19:                 llama4.feed_forward_length u32              = 16384
-llama_model_loader: - kv  20:                llama4.attention.head_count u32              = 40
-llama_model_loader: - kv  21:             llama4.attention.head_count_kv u32              = 8
-llama_model_loader: - kv  22:                      llama4.rope.freq_base f32              = 500000.000000
-llama_model_loader: - kv  23:    llama4.attention.layer_norm_rms_epsilon f32              = 0.000010
-llama_model_loader: - kv  24:                        llama4.expert_count u32              = 16
-llama_model_loader: - kv  25:                   llama4.expert_used_count u32              = 1
-llama_model_loader: - kv  26:                llama4.attention.key_length u32              = 128
-llama_model_loader: - kv  27:              llama4.attention.value_length u32              = 128
-llama_model_loader: - kv  28:                          llama4.vocab_size u32              = 202048
-llama_model_loader: - kv  29:                llama4.rope.dimension_count u32              = 128
-llama_model_loader: - kv  30:           llama4.interleave_moe_layer_step u32              = 1
-llama_model_loader: - kv  31:          llama4.expert_feed_forward_length u32              = 8192
-llama_model_loader: - kv  32:                       tokenizer.ggml.model str              = gpt2
-llama_model_loader: - kv  33:                         tokenizer.ggml.pre str              = llama4
-llama_model_loader: - kv  34:                      tokenizer.ggml.tokens arr[str,202048]  = ["À", "Á", "õ", "ö", "÷", "ø", ...
-llama_model_loader: - kv  35:                  tokenizer.ggml.token_type arr[i32,202048]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
-llama_model_loader: - kv  36:                      tokenizer.ggml.merges arr[str,439802]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
-llama_model_loader: - kv  37:                tokenizer.ggml.bos_token_id u32              = 200000
-llama_model_loader: - kv  38:                tokenizer.ggml.eos_token_id u32              = 200008
-llama_model_loader: - kv  39:            tokenizer.ggml.padding_token_id u32              = 200018
-llama_model_loader: - kv  40:               tokenizer.ggml.add_bos_token bool             = true
-llama_model_loader: - kv  41:                    tokenizer.chat_template str              = {{- bos_token }}\n{%- if custom_tools ...
-llama_model_loader: - kv  42:               general.quantization_version u32              = 2
-llama_model_loader: - kv  43:                          general.file_type u32              = 15
-llama_model_loader: - kv  44:                      quantize.imatrix.file str              = Llama-4-Scout-17B-16E-Instruct-GGUF/i...
-llama_model_loader: - kv  45:                   quantize.imatrix.dataset str              = unsloth_calibration_Llama-4-Scout-17B...
-llama_model_loader: - kv  46:             quantize.imatrix.entries_count u32              = 528
-llama_model_loader: - kv  47:              quantize.imatrix.chunks_count u32              = 729
-llama_model_loader: - kv  48:                                   split.no u16              = 0
-llama_model_loader: - kv  49:                        split.tensors.count i32              = 628
-llama_model_loader: - kv  50:                                split.count u16              = 2
-llama_model_loader: - type  f32:  146 tensors
-llama_model_loader: - type q4_K:  421 tensors
-llama_model_loader: - type q5_K:   43 tensors
-llama_model_loader: - type q6_K:   18 tensors
-print_info: file format = GGUF V3 (latest)
-print_info: file type   = Q4_K - Medium
-print_info: file size   = 57.73 GiB (4.60 BPW) 
-load: special_eos_id is not in special_eog_ids - the tokenizer config may be incorrect
-load: special tokens cache size = 1135
-load: token to piece cache size = 1.3873 MB
-print_info: arch             = llama4
-print_info: vocab_only       = 0
-print_info: n_ctx_train      = 10485760
-print_info: n_embd           = 5120
-print_info: n_layer          = 48
-print_info: n_head           = 40
-print_info: n_head_kv        = 8
-print_info: n_rot            = 128
-print_info: n_swa            = 8192
-print_info: is_swa_any       = 1
-print_info: n_embd_head_k    = 128
-print_info: n_embd_head_v    = 128
-print_info: n_gqa            = 5
-print_info: n_embd_k_gqa     = 1024
-print_info: n_embd_v_gqa     = 1024
-print_info: f_norm_eps       = 0.0e+00
-print_info: f_norm_rms_eps   = 1.0e-05
-print_info: f_clamp_kqv      = 0.0e+00
-print_info: f_max_alibi_bias = 0.0e+00
-print_info: f_logit_scale    = 0.0e+00
-print_info: f_attn_scale     = 0.0e+00
-print_info: n_ff             = 16384
-print_info: n_expert         = 16
-print_info: n_expert_used    = 1
-print_info: causal attn      = 1
-print_info: pooling type     = 0
-print_info: rope type        = 0
-print_info: rope scaling     = linear
-print_info: freq_base_train  = 500000.0
-print_info: freq_scale_train = 1
-print_info: n_ctx_orig_yarn  = 10485760
-print_info: rope_finetuned   = unknown
-print_info: model type       = 17Bx16E (Scout)
-print_info: model params     = 107.77 B
-print_info: general.name     = Llama-4-Scout-17B-16E-Instruct
-print_info: vocab type       = BPE
-print_info: n_vocab          = 202048
-print_info: n_merges         = 439802
-print_info: BOS token        = 200000 '<|begin_of_text|>'
-print_info: EOS token        = 200008 '<|eot|>'
-print_info: PAD token        = 200018 '<|finetune_right_pad|>'
-print_info: LF token         = 198 'Ċ'
-print_info: FIM PRE token    = 200002 '<|fim_prefix|>'
-print_info: FIM SUF token    = 200004 '<|fim_suffix|>'
-print_info: FIM MID token    = 200003 '<|fim_middle|>'
-print_info: EOG token        = 200001 '<|end_of_text|>'
-print_info: EOG token        = 200008 '<|eot|>'
-print_info: max token length = 192
-load_tensors: loading model tensors, this can take a while... (mmap = false)
-load_tensors: offloading 48 repeating layers to GPU
-load_tensors: offloading output layer to GPU
-load_tensors: offloaded 49/49 layers to GPU
-load_tensors:      Vulkan0 model buffer size = 58558.57 MiB
-load_tensors:          CPU model buffer size =   554.94 MiB
-....................................................................................................
-llama_context: constructing llama_context
-llama_context: non-unified KV cache requires ggml_set_rows() - forcing unified KV cache
-llama_context: n_seq_max     = 1
-llama_context: n_ctx         = 4096
-llama_context: n_ctx_per_seq = 4096
-llama_context: n_batch       = 2048
-llama_context: n_ubatch      = 512
-llama_context: causal_attn   = 1
-llama_context: flash_attn    = 1
-llama_context: kv_unified    = true
-llama_context: freq_base     = 500000.0
-llama_context: freq_scale    = 1
-llama_context: n_ctx_per_seq (4096) < n_ctx_train (10485760) -- the full capacity of the model will not be utilized
-llama_context: Vulkan_Host  output buffer size =     0.77 MiB
-llama_kv_cache_unified_iswa: creating non-SWA KV cache, size = 4096 cells
-llama_kv_cache_unified:    Vulkan0 KV buffer size =   192.00 MiB
-llama_kv_cache_unified: size =  192.00 MiB (  4096 cells,  12 layers,  1/ 1 seqs), K (f16):   96.00 MiB, V (f16):   96.00 MiB
-llama_kv_cache_unified: LLAMA_SET_ROWS=0, using old ggml_cpy() method for backwards compatibility
-llama_kv_cache_unified_iswa: creating     SWA KV cache, size = 4096 cells
-llama_kv_cache_unified:    Vulkan0 KV buffer size =   576.00 MiB
-llama_kv_cache_unified: size =  576.00 MiB (  4096 cells,  36 layers,  1/ 1 seqs), K (f16):  288.00 MiB, V (f16):  288.00 MiB
-llama_kv_cache_unified: LLAMA_SET_ROWS=0, using old ggml_cpy() method for backwards compatibility
-llama_context:    Vulkan0 compute buffer size =   440.63 MiB
-llama_context: Vulkan_Host compute buffer size =    26.02 MiB
-llama_context: graph nodes  = 2420
-llama_context: graph splits = 2
-common_init_from_params: added <|end_of_text|> logit bias = -inf
-common_init_from_params: added <|eot|> logit bias = -inf
-common_init_from_params: setting dry_penalty_last_n to ctx_size = 4096
-common_init_from_params: warming up the model with an empty run - please wait ... (--no-warmup to disable)
-main: llama threadpool init, n_threads = 16
-
-system_info: n_threads = 16 (n_threads_batch = 16) / 32 | CPU : SSE3 = 1 | SSSE3 = 1 | AVX = 1 | AVX2 = 1 | F16C = 1 | FMA = 1 | BMI2 = 1 | AVX512 = 1 | AVX512_VBMI = 1 | AVX512_VNNI = 1 | LLAMAFILE = 1 | OPENMP = 1 | REPACK = 1 | 
-
-sampler seed: 2510811977
-sampler params: 
-	repeat_last_n = 64, repeat_penalty = 1.000, frequency_penalty = 0.000, presence_penalty = 0.000
-	dry_multiplier = 0.000, dry_base = 1.750, dry_allowed_length = 2, dry_penalty_last_n = 4096
-	top_k = 40, top_p = 0.950, min_p = 0.050, xtc_probability = 0.000, xtc_threshold = 0.100, typical_p = 1.000, top_n_sigma = -1.000, temp = 0.800
-	mirostat = 0, mirostat_lr = 0.100, mirostat_ent = 5.000
-sampler chain: logits -> logit-bias -> penalties -> dry -> top-n-sigma -> top-k -> typical -> top-p -> min-p -> xtc -> temp-ext -> dist 
-generate: n_ctx = 4096, n_batch = 2048, n_predict = 1, n_keep = 1
-
-Hello (
-
-llama_perf_sampler_print:    sampling time =       0.09 ms /     3 runs   (    0.03 ms per token, 32608.70 tokens per second)
-llama_perf_context_print:        load time =   16387.21 ms
-llama_perf_context_print: prompt eval time =     291.47 ms /     2 tokens (  145.73 ms per token,     6.86 tokens per second)
-llama_perf_context_print:        eval time =       0.00 ms /     1 runs   (    0.00 ms per token,      inf tokens per second)
-llama_perf_context_print:       total time =     319.42 ms /     3 tokens
-llama_perf_context_print:    graphs reused =          0
-    Elapsed #3: 17.154124582s
-    Run #3 status: 0
-  → Avg over 3 runs: 20.045s
@@ -1,184 +0,0 @@
-ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
-ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
-ggml_cuda_init: found 1 ROCm devices:
-  Device 0: Radeon 8060S Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32
-build: 6040 (66625a59) with cc (GCC) 15.1.1 20250521 (Red Hat 15.1.1-2) for x86_64-redhat-linux
-main: llama backend init
-main: load the model and apply lora adapter, if any
-llama_model_load_from_file_impl: using device ROCm0 (Radeon 8060S Graphics) - 124522 MiB free
-llama_model_loader: additional 2 GGUFs metadata loaded.
-llama_model_loader: loaded meta data with 48 key-value pairs and 1131 tensors from /home/kyuz0/models/qwen-3-235B-Q3_K-XL/UD-Q3_K_XL/Qwen3-235B-A22B-Instruct-2507-UD-Q3_K_XL-00001-of-00003.gguf (version GGUF V3 (latest))
-llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
-llama_model_loader: - kv   0:                       general.architecture str              = qwen3moe
-llama_model_loader: - kv   1:                               general.type str              = model
-llama_model_loader: - kv   2:                               general.name str              = Qwen3-235B-A22B-Instruct-2507
-llama_model_loader: - kv   3:                            general.version str              = 2507
-llama_model_loader: - kv   4:                           general.finetune str              = Instruct
-llama_model_loader: - kv   5:                           general.basename str              = Qwen3-235B-A22B-Instruct-2507
-llama_model_loader: - kv   6:                       general.quantized_by str              = Unsloth
-llama_model_loader: - kv   7:                         general.size_label str              = 235B-A22B
-llama_model_loader: - kv   8:                            general.license str              = apache-2.0
-llama_model_loader: - kv   9:                       general.license.link str              = https://huggingface.co/Qwen/Qwen3-235...
-llama_model_loader: - kv  10:                           general.repo_url str              = https://huggingface.co/unsloth
-llama_model_loader: - kv  11:                   general.base_model.count u32              = 1
-llama_model_loader: - kv  12:                  general.base_model.0.name str              = Qwen3 235B A22B Instruct 2507
-llama_model_loader: - kv  13:               general.base_model.0.version str              = 2507
-llama_model_loader: - kv  14:          general.base_model.0.organization str              = Qwen
-llama_model_loader: - kv  15:              general.base_model.0.repo_url str              = https://huggingface.co/Qwen/Qwen3-235...
-llama_model_loader: - kv  16:                               general.tags arr[str,2]       = ["unsloth", "text-generation"]
-llama_model_loader: - kv  17:                       qwen3moe.block_count u32              = 94
-llama_model_loader: - kv  18:                    qwen3moe.context_length u32              = 262144
-llama_model_loader: - kv  19:                  qwen3moe.embedding_length u32              = 4096
-llama_model_loader: - kv  20:               qwen3moe.feed_forward_length u32              = 12288
-llama_model_loader: - kv  21:              qwen3moe.attention.head_count u32              = 64
-llama_model_loader: - kv  22:           qwen3moe.attention.head_count_kv u32              = 4
-llama_model_loader: - kv  23:                    qwen3moe.rope.freq_base f32              = 5000000.000000
-llama_model_loader: - kv  24:  qwen3moe.attention.layer_norm_rms_epsilon f32              = 0.000001
-llama_model_loader: - kv  25:                 qwen3moe.expert_used_count u32              = 8
-llama_model_loader: - kv  26:              qwen3moe.attention.key_length u32              = 128
-llama_model_loader: - kv  27:            qwen3moe.attention.value_length u32              = 128
-llama_model_loader: - kv  28:                      qwen3moe.expert_count u32              = 128
-llama_model_loader: - kv  29:        qwen3moe.expert_feed_forward_length u32              = 1536
-llama_model_loader: - kv  30:                       tokenizer.ggml.model str              = gpt2
-llama_model_loader: - kv  31:                         tokenizer.ggml.pre str              = qwen2
-llama_model_loader: - kv  32:                      tokenizer.ggml.tokens arr[str,151936]  = ["!", "\"", "#", "$", "%", "&", "'", ...
-llama_model_loader: - kv  33:                  tokenizer.ggml.token_type arr[i32,151936]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
-llama_model_loader: - kv  34:                      tokenizer.ggml.merges arr[str,151387]  = ["Ġ Ġ", "ĠĠ ĠĠ", "i n", "Ġ t",...
-llama_model_loader: - kv  35:                tokenizer.ggml.eos_token_id u32              = 151645
-llama_model_loader: - kv  36:            tokenizer.ggml.padding_token_id u32              = 151654
-llama_model_loader: - kv  37:               tokenizer.ggml.add_bos_token bool             = false
-llama_model_loader: - kv  38:                    tokenizer.chat_template str              = {%- if tools %}\n    {{- '<|im_start|>...
-llama_model_loader: - kv  39:               general.quantization_version u32              = 2
-llama_model_loader: - kv  40:                          general.file_type u32              = 12
-llama_model_loader: - kv  41:                      quantize.imatrix.file str              = Qwen3-235B-A22B-Instruct-2507-GGUF/im...
-llama_model_loader: - kv  42:                   quantize.imatrix.dataset str              = unsloth_calibration_Qwen3-235B-A22B-I...
-llama_model_loader: - kv  43:             quantize.imatrix.entries_count u32              = 745
-llama_model_loader: - kv  44:              quantize.imatrix.chunks_count u32              = 693
-llama_model_loader: - kv  45:                                   split.no u16              = 0
-llama_model_loader: - kv  46:                        split.tensors.count i32              = 1131
-llama_model_loader: - kv  47:                                split.count u16              = 3
-llama_model_loader: - type  f32:  471 tensors
-llama_model_loader: - type q3_K:  267 tensors
-llama_model_loader: - type q4_K:  362 tensors
-llama_model_loader: - type q5_K:   20 tensors
-llama_model_loader: - type q6_K:   11 tensors
-print_info: file format = GGUF V3 (latest)
-print_info: file type   = Q3_K - Medium
-print_info: file size   = 96.99 GiB (3.54 BPW) 
-load: special tokens cache size = 26
-load: token to piece cache size = 0.9311 MB
-print_info: arch             = qwen3moe
-print_info: vocab_only       = 0
-print_info: n_ctx_train      = 262144
-print_info: n_embd           = 4096
-print_info: n_layer          = 94
-print_info: n_head           = 64
-print_info: n_head_kv        = 4
-print_info: n_rot            = 128
-print_info: n_swa            = 0
-print_info: is_swa_any       = 0
-print_info: n_embd_head_k    = 128
-print_info: n_embd_head_v    = 128
-print_info: n_gqa            = 16
-print_info: n_embd_k_gqa     = 512
-print_info: n_embd_v_gqa     = 512
-print_info: f_norm_eps       = 0.0e+00
-print_info: f_norm_rms_eps   = 1.0e-06
-print_info: f_clamp_kqv      = 0.0e+00
-print_info: f_max_alibi_bias = 0.0e+00
-print_info: f_logit_scale    = 0.0e+00
-print_info: f_attn_scale     = 0.0e+00
-print_info: n_ff             = 12288
-print_info: n_expert         = 128
-print_info: n_expert_used    = 8
-print_info: causal attn      = 1
-print_info: pooling type     = 0
-print_info: rope type        = 2
-print_info: rope scaling     = linear
-print_info: freq_base_train  = 5000000.0
-print_info: freq_scale_train = 1
-print_info: n_ctx_orig_yarn  = 262144
-print_info: rope_finetuned   = unknown
-print_info: model type       = 235B.A22B
-print_info: model params     = 235.09 B
-print_info: general.name     = Qwen3-235B-A22B-Instruct-2507
-print_info: n_ff_exp         = 1536
-print_info: vocab type       = BPE
-print_info: n_vocab          = 151936
-print_info: n_merges         = 151387
-print_info: BOS token        = 11 ','
-print_info: EOS token        = 151645 '<|im_end|>'
-print_info: EOT token        = 151645 '<|im_end|>'
-print_info: PAD token        = 151654 '<|vision_pad|>'
-print_info: LF token         = 198 'Ċ'
-print_info: FIM PRE token    = 151659 '<|fim_prefix|>'
-print_info: FIM SUF token    = 151661 '<|fim_suffix|>'
-print_info: FIM MID token    = 151660 '<|fim_middle|>'
-print_info: FIM PAD token    = 151662 '<|fim_pad|>'
-print_info: FIM REP token    = 151663 '<|repo_name|>'
-print_info: FIM SEP token    = 151664 '<|file_sep|>'
-print_info: EOG token        = 151643 '<|endoftext|>'
-print_info: EOG token        = 151645 '<|im_end|>'
-print_info: EOG token        = 151662 '<|fim_pad|>'
-print_info: EOG token        = 151663 '<|repo_name|>'
-print_info: EOG token        = 151664 '<|file_sep|>'
-print_info: max token length = 256
-load_tensors: loading model tensors, this can take a while... (mmap = false)
-load_tensors: offloading 94 repeating layers to GPU
-load_tensors: offloading output layer to GPU
-load_tensors: offloaded 95/95 layers to GPU
-load_tensors:          CPU model buffer size =   333.84 MiB
-load_tensors:        ROCm0 model buffer size = 98988.40 MiB
-....................................................................................................
-llama_context: constructing llama_context
-llama_context: non-unified KV cache requires ggml_set_rows() - forcing unified KV cache
-llama_context: n_seq_max     = 1
-llama_context: n_ctx         = 4096
-llama_context: n_ctx_per_seq = 4096
-llama_context: n_batch       = 2048
-llama_context: n_ubatch      = 512
-llama_context: causal_attn   = 1
-llama_context: flash_attn    = 1
-llama_context: kv_unified    = true
-llama_context: freq_base     = 5000000.0
-llama_context: freq_scale    = 1
-llama_context: n_ctx_per_seq (4096) < n_ctx_train (262144) -- the full capacity of the model will not be utilized
-llama_context:  ROCm_Host  output buffer size =     0.58 MiB
-llama_kv_cache_unified:      ROCm0 KV buffer size =   752.00 MiB
-llama_kv_cache_unified: size =  752.00 MiB (  4096 cells,  94 layers,  1/ 1 seqs), K (f16):  376.00 MiB, V (f16):  376.00 MiB
-llama_kv_cache_unified: LLAMA_SET_ROWS=0, using old ggml_cpy() method for backwards compatibility
-llama_context:      ROCm0 compute buffer size =   304.75 MiB
-llama_context:  ROCm_Host compute buffer size =    16.01 MiB
-llama_context: graph nodes  = 6023
-llama_context: graph splits = 2
-common_init_from_params: added <|endoftext|> logit bias = -inf
-common_init_from_params: added <|im_end|> logit bias = -inf
-common_init_from_params: added <|fim_pad|> logit bias = -inf
-common_init_from_params: added <|repo_name|> logit bias = -inf
-common_init_from_params: added <|file_sep|> logit bias = -inf
-common_init_from_params: setting dry_penalty_last_n to ctx_size = 4096
-common_init_from_params: warming up the model with an empty run - please wait ... (--no-warmup to disable)
-main: llama threadpool init, n_threads = 16
-
-system_info: n_threads = 16 (n_threads_batch = 16) / 32 | ROCm : NO_VMM = 1 | PEER_MAX_BATCH_SIZE = 128 | CPU : SSE3 = 1 | SSSE3 = 1 | AVX = 1 | AVX2 = 1 | F16C = 1 | FMA = 1 | BMI2 = 1 | AVX512 = 1 | AVX512_VBMI = 1 | AVX512_VNNI = 1 | LLAMAFILE = 1 | OPENMP = 1 | REPACK = 1 | 
-
-sampler seed: 4068503868
-sampler params: 
-	repeat_last_n = 64, repeat_penalty = 1.000, frequency_penalty = 0.000, presence_penalty = 0.000
-	dry_multiplier = 0.000, dry_base = 1.750, dry_allowed_length = 2, dry_penalty_last_n = 4096
-	top_k = 40, top_p = 0.950, min_p = 0.050, xtc_probability = 0.000, xtc_threshold = 0.100, typical_p = 1.000, top_n_sigma = -1.000, temp = 0.800
-	mirostat = 0, mirostat_lr = 0.100, mirostat_ent = 5.000
-sampler chain: logits -> logit-bias -> penalties -> dry -> top-n-sigma -> top-k -> typical -> top-p -> min-p -> xtc -> temp-ext -> dist 
-generate: n_ctx = 4096, n_batch = 2048, n_predict = 1, n_keep = 0
-
-Hello,
-
-llama_perf_sampler_print:    sampling time =       0.06 ms /     2 runs   (    0.03 ms per token, 35087.72 tokens per second)
-llama_perf_context_print:        load time =   34531.90 ms
-llama_perf_context_print: prompt eval time =       0.00 ms /     1 tokens (    0.00 ms per token,      inf tokens per second)
-llama_perf_context_print:        eval time =      74.04 ms /     1 runs   (   74.04 ms per token,    13.51 tokens per second)
-llama_perf_context_print:       total time =      87.46 ms /     2 tokens
-llama_perf_context_print:    graphs reused =          0
-    Elapsed #3: 38.606270419s
-    Run #3 status: 0
-  → Avg over 3 runs: 39.062s
@@ -1,184 +0,0 @@
-ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
-ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
-ggml_cuda_init: found 1 ROCm devices:
-  Device 0: AMD Radeon Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32
-build: 6040 (66625a59) with cc (GCC) 15.1.1 20250719 (Red Hat 15.1.1-5) for x86_64-redhat-linux
-main: llama backend init
-main: load the model and apply lora adapter, if any
-llama_model_load_from_file_impl: using device ROCm0 (AMD Radeon Graphics) - 124523 MiB free
-llama_model_loader: additional 2 GGUFs metadata loaded.
-llama_model_loader: loaded meta data with 48 key-value pairs and 1131 tensors from /home/kyuz0/models/qwen-3-235B-Q3_K-XL/UD-Q3_K_XL/Qwen3-235B-A22B-Instruct-2507-UD-Q3_K_XL-00001-of-00003.gguf (version GGUF V3 (latest))
-llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
-llama_model_loader: - kv   0:                       general.architecture str              = qwen3moe
-llama_model_loader: - kv   1:                               general.type str              = model
-llama_model_loader: - kv   2:                               general.name str              = Qwen3-235B-A22B-Instruct-2507
-llama_model_loader: - kv   3:                            general.version str              = 2507
-llama_model_loader: - kv   4:                           general.finetune str              = Instruct
-llama_model_loader: - kv   5:                           general.basename str              = Qwen3-235B-A22B-Instruct-2507
-llama_model_loader: - kv   6:                       general.quantized_by str              = Unsloth
-llama_model_loader: - kv   7:                         general.size_label str              = 235B-A22B
-llama_model_loader: - kv   8:                            general.license str              = apache-2.0
-llama_model_loader: - kv   9:                       general.license.link str              = https://huggingface.co/Qwen/Qwen3-235...
-llama_model_loader: - kv  10:                           general.repo_url str              = https://huggingface.co/unsloth
-llama_model_loader: - kv  11:                   general.base_model.count u32              = 1
-llama_model_loader: - kv  12:                  general.base_model.0.name str              = Qwen3 235B A22B Instruct 2507
-llama_model_loader: - kv  13:               general.base_model.0.version str              = 2507
-llama_model_loader: - kv  14:          general.base_model.0.organization str              = Qwen
-llama_model_loader: - kv  15:              general.base_model.0.repo_url str              = https://huggingface.co/Qwen/Qwen3-235...
-llama_model_loader: - kv  16:                               general.tags arr[str,2]       = ["unsloth", "text-generation"]
-llama_model_loader: - kv  17:                       qwen3moe.block_count u32              = 94
-llama_model_loader: - kv  18:                    qwen3moe.context_length u32              = 262144
-llama_model_loader: - kv  19:                  qwen3moe.embedding_length u32              = 4096
-llama_model_loader: - kv  20:               qwen3moe.feed_forward_length u32              = 12288
-llama_model_loader: - kv  21:              qwen3moe.attention.head_count u32              = 64
-llama_model_loader: - kv  22:           qwen3moe.attention.head_count_kv u32              = 4
-llama_model_loader: - kv  23:                    qwen3moe.rope.freq_base f32              = 5000000.000000
-llama_model_loader: - kv  24:  qwen3moe.attention.layer_norm_rms_epsilon f32              = 0.000001
-llama_model_loader: - kv  25:                 qwen3moe.expert_used_count u32              = 8
-llama_model_loader: - kv  26:              qwen3moe.attention.key_length u32              = 128
-llama_model_loader: - kv  27:            qwen3moe.attention.value_length u32              = 128
-llama_model_loader: - kv  28:                      qwen3moe.expert_count u32              = 128
-llama_model_loader: - kv  29:        qwen3moe.expert_feed_forward_length u32              = 1536
-llama_model_loader: - kv  30:                       tokenizer.ggml.model str              = gpt2
-llama_model_loader: - kv  31:                         tokenizer.ggml.pre str              = qwen2
-llama_model_loader: - kv  32:                      tokenizer.ggml.tokens arr[str,151936]  = ["!", "\"", "#", "$", "%", "&", "'", ...
-llama_model_loader: - kv  33:                  tokenizer.ggml.token_type arr[i32,151936]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
-llama_model_loader: - kv  34:                      tokenizer.ggml.merges arr[str,151387]  = ["Ġ Ġ", "ĠĠ ĠĠ", "i n", "Ġ t",...
-llama_model_loader: - kv  35:                tokenizer.ggml.eos_token_id u32              = 151645
-llama_model_loader: - kv  36:            tokenizer.ggml.padding_token_id u32              = 151654
-llama_model_loader: - kv  37:               tokenizer.ggml.add_bos_token bool             = false
-llama_model_loader: - kv  38:                    tokenizer.chat_template str              = {%- if tools %}\n    {{- '<|im_start|>...
-llama_model_loader: - kv  39:               general.quantization_version u32              = 2
-llama_model_loader: - kv  40:                          general.file_type u32              = 12
-llama_model_loader: - kv  41:                      quantize.imatrix.file str              = Qwen3-235B-A22B-Instruct-2507-GGUF/im...
-llama_model_loader: - kv  42:                   quantize.imatrix.dataset str              = unsloth_calibration_Qwen3-235B-A22B-I...
-llama_model_loader: - kv  43:             quantize.imatrix.entries_count u32              = 745
-llama_model_loader: - kv  44:              quantize.imatrix.chunks_count u32              = 693
-llama_model_loader: - kv  45:                                   split.no u16              = 0
-llama_model_loader: - kv  46:                        split.tensors.count i32              = 1131
-llama_model_loader: - kv  47:                                split.count u16              = 3
-llama_model_loader: - type  f32:  471 tensors
-llama_model_loader: - type q3_K:  267 tensors
-llama_model_loader: - type q4_K:  362 tensors
-llama_model_loader: - type q5_K:   20 tensors
-llama_model_loader: - type q6_K:   11 tensors
-print_info: file format = GGUF V3 (latest)
-print_info: file type   = Q3_K - Medium
-print_info: file size   = 96.99 GiB (3.54 BPW) 
-load: special tokens cache size = 26
-load: token to piece cache size = 0.9311 MB
-print_info: arch             = qwen3moe
-print_info: vocab_only       = 0
-print_info: n_ctx_train      = 262144
-print_info: n_embd           = 4096
-print_info: n_layer          = 94
-print_info: n_head           = 64
-print_info: n_head_kv        = 4
-print_info: n_rot            = 128
-print_info: n_swa            = 0
-print_info: is_swa_any       = 0
-print_info: n_embd_head_k    = 128
-print_info: n_embd_head_v    = 128
-print_info: n_gqa            = 16
-print_info: n_embd_k_gqa     = 512
-print_info: n_embd_v_gqa     = 512
-print_info: f_norm_eps       = 0.0e+00
-print_info: f_norm_rms_eps   = 1.0e-06
-print_info: f_clamp_kqv      = 0.0e+00
-print_info: f_max_alibi_bias = 0.0e+00
-print_info: f_logit_scale    = 0.0e+00
-print_info: f_attn_scale     = 0.0e+00
-print_info: n_ff             = 12288
-print_info: n_expert         = 128
-print_info: n_expert_used    = 8
-print_info: causal attn      = 1
-print_info: pooling type     = 0
-print_info: rope type        = 2
-print_info: rope scaling     = linear
-print_info: freq_base_train  = 5000000.0
-print_info: freq_scale_train = 1
-print_info: n_ctx_orig_yarn  = 262144
-print_info: rope_finetuned   = unknown
-print_info: model type       = 235B.A22B
-print_info: model params     = 235.09 B
-print_info: general.name     = Qwen3-235B-A22B-Instruct-2507
-print_info: n_ff_exp         = 1536
-print_info: vocab type       = BPE
-print_info: n_vocab          = 151936
-print_info: n_merges         = 151387
-print_info: BOS token        = 11 ','
-print_info: EOS token        = 151645 '<|im_end|>'
-print_info: EOT token        = 151645 '<|im_end|>'
-print_info: PAD token        = 151654 '<|vision_pad|>'
-print_info: LF token         = 198 'Ċ'
-print_info: FIM PRE token    = 151659 '<|fim_prefix|>'
-print_info: FIM SUF token    = 151661 '<|fim_suffix|>'
-print_info: FIM MID token    = 151660 '<|fim_middle|>'
-print_info: FIM PAD token    = 151662 '<|fim_pad|>'
-print_info: FIM REP token    = 151663 '<|repo_name|>'
-print_info: FIM SEP token    = 151664 '<|file_sep|>'
-print_info: EOG token        = 151643 '<|endoftext|>'
-print_info: EOG token        = 151645 '<|im_end|>'
-print_info: EOG token        = 151662 '<|fim_pad|>'
-print_info: EOG token        = 151663 '<|repo_name|>'
-print_info: EOG token        = 151664 '<|file_sep|>'
-print_info: max token length = 256
-load_tensors: loading model tensors, this can take a while... (mmap = false)
-load_tensors: offloading 94 repeating layers to GPU
-load_tensors: offloading output layer to GPU
-load_tensors: offloaded 95/95 layers to GPU
-load_tensors:          CPU model buffer size =   333.84 MiB
-load_tensors:        ROCm0 model buffer size = 98988.40 MiB
-....................................................................................................
-llama_context: constructing llama_context
-llama_context: non-unified KV cache requires ggml_set_rows() - forcing unified KV cache
-llama_context: n_seq_max     = 1
-llama_context: n_ctx         = 4096
-llama_context: n_ctx_per_seq = 4096
-llama_context: n_batch       = 2048
-llama_context: n_ubatch      = 512
-llama_context: causal_attn   = 1
-llama_context: flash_attn    = 1
-llama_context: kv_unified    = true
-llama_context: freq_base     = 5000000.0
-llama_context: freq_scale    = 1
-llama_context: n_ctx_per_seq (4096) < n_ctx_train (262144) -- the full capacity of the model will not be utilized
-llama_context:  ROCm_Host  output buffer size =     0.58 MiB
-llama_kv_cache_unified:      ROCm0 KV buffer size =   752.00 MiB
-llama_kv_cache_unified: size =  752.00 MiB (  4096 cells,  94 layers,  1/ 1 seqs), K (f16):  376.00 MiB, V (f16):  376.00 MiB
-llama_kv_cache_unified: LLAMA_SET_ROWS=0, using old ggml_cpy() method for backwards compatibility
-llama_context:      ROCm0 compute buffer size =   304.75 MiB
-llama_context:  ROCm_Host compute buffer size =    16.01 MiB
-llama_context: graph nodes  = 6023
-llama_context: graph splits = 2
-common_init_from_params: added <|endoftext|> logit bias = -inf
-common_init_from_params: added <|im_end|> logit bias = -inf
-common_init_from_params: added <|fim_pad|> logit bias = -inf
-common_init_from_params: added <|repo_name|> logit bias = -inf
-common_init_from_params: added <|file_sep|> logit bias = -inf
-common_init_from_params: setting dry_penalty_last_n to ctx_size = 4096
-common_init_from_params: warming up the model with an empty run - please wait ... (--no-warmup to disable)
-main: llama threadpool init, n_threads = 16
-
-system_info: n_threads = 16 (n_threads_batch = 16) / 32 | ROCm : NO_VMM = 1 | PEER_MAX_BATCH_SIZE = 128 | CPU : SSE3 = 1 | SSSE3 = 1 | AVX = 1 | AVX2 = 1 | F16C = 1 | FMA = 1 | BMI2 = 1 | AVX512 = 1 | AVX512_VBMI = 1 | AVX512_VNNI = 1 | LLAMAFILE = 1 | OPENMP = 1 | REPACK = 1 | 
-
-sampler seed: 698255200
-sampler params: 
-	repeat_last_n = 64, repeat_penalty = 1.000, frequency_penalty = 0.000, presence_penalty = 0.000
-	dry_multiplier = 0.000, dry_base = 1.750, dry_allowed_length = 2, dry_penalty_last_n = 4096
-	top_k = 40, top_p = 0.950, min_p = 0.050, xtc_probability = 0.000, xtc_threshold = 0.100, typical_p = 1.000, top_n_sigma = -1.000, temp = 0.800
-	mirostat = 0, mirostat_lr = 0.100, mirostat_ent = 5.000
-sampler chain: logits -> logit-bias -> penalties -> dry -> top-n-sigma -> top-k -> typical -> top-p -> min-p -> xtc -> temp-ext -> dist 
-generate: n_ctx = 4096, n_batch = 2048, n_predict = 1, n_keep = 0
-
-Hello!
-
-llama_perf_sampler_print:    sampling time =       0.05 ms /     2 runs   (    0.03 ms per token, 37037.04 tokens per second)
-llama_perf_context_print:        load time =   34496.41 ms
-llama_perf_context_print: prompt eval time =       0.00 ms /     1 tokens (    0.00 ms per token,      inf tokens per second)
-llama_perf_context_print:        eval time =      74.48 ms /     1 runs   (   74.48 ms per token,    13.43 tokens per second)
-llama_perf_context_print:       total time =      87.80 ms /     2 tokens
-llama_perf_context_print:    graphs reused =          0
-    Elapsed #3: 35.247053632s
-    Run #3 status: 0
-  → Avg over 3 runs: 35.392s
@@ -1,184 +0,0 @@
-ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
-ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
-ggml_cuda_init: found 1 ROCm devices:
-  Device 0: AMD Radeon Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32
-build: 6066 (4cb208c9) with cc (GCC) 15.1.1 20250719 (Red Hat 15.1.1-5) for x86_64-redhat-linux
-main: llama backend init
-main: load the model and apply lora adapter, if any
-llama_model_load_from_file_impl: using device ROCm0 (AMD Radeon Graphics) - 124523 MiB free
-llama_model_loader: additional 2 GGUFs metadata loaded.
-llama_model_loader: loaded meta data with 48 key-value pairs and 1131 tensors from /home/kyuz0/models/qwen-3-235B-Q3_K-XL/UD-Q3_K_XL/Qwen3-235B-A22B-Instruct-2507-UD-Q3_K_XL-00001-of-00003.gguf (version GGUF V3 (latest))
-llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
-llama_model_loader: - kv   0:                       general.architecture str              = qwen3moe
-llama_model_loader: - kv   1:                               general.type str              = model
-llama_model_loader: - kv   2:                               general.name str              = Qwen3-235B-A22B-Instruct-2507
-llama_model_loader: - kv   3:                            general.version str              = 2507
-llama_model_loader: - kv   4:                           general.finetune str              = Instruct
-llama_model_loader: - kv   5:                           general.basename str              = Qwen3-235B-A22B-Instruct-2507
-llama_model_loader: - kv   6:                       general.quantized_by str              = Unsloth
-llama_model_loader: - kv   7:                         general.size_label str              = 235B-A22B
-llama_model_loader: - kv   8:                            general.license str              = apache-2.0
-llama_model_loader: - kv   9:                       general.license.link str              = https://huggingface.co/Qwen/Qwen3-235...
-llama_model_loader: - kv  10:                           general.repo_url str              = https://huggingface.co/unsloth
-llama_model_loader: - kv  11:                   general.base_model.count u32              = 1
-llama_model_loader: - kv  12:                  general.base_model.0.name str              = Qwen3 235B A22B Instruct 2507
-llama_model_loader: - kv  13:               general.base_model.0.version str              = 2507
-llama_model_loader: - kv  14:          general.base_model.0.organization str              = Qwen
-llama_model_loader: - kv  15:              general.base_model.0.repo_url str              = https://huggingface.co/Qwen/Qwen3-235...
-llama_model_loader: - kv  16:                               general.tags arr[str,2]       = ["unsloth", "text-generation"]
-llama_model_loader: - kv  17:                       qwen3moe.block_count u32              = 94
-llama_model_loader: - kv  18:                    qwen3moe.context_length u32              = 262144
-llama_model_loader: - kv  19:                  qwen3moe.embedding_length u32              = 4096
-llama_model_loader: - kv  20:               qwen3moe.feed_forward_length u32              = 12288
-llama_model_loader: - kv  21:              qwen3moe.attention.head_count u32              = 64
-llama_model_loader: - kv  22:           qwen3moe.attention.head_count_kv u32              = 4
-llama_model_loader: - kv  23:                    qwen3moe.rope.freq_base f32              = 5000000.000000
-llama_model_loader: - kv  24:  qwen3moe.attention.layer_norm_rms_epsilon f32              = 0.000001
-llama_model_loader: - kv  25:                 qwen3moe.expert_used_count u32              = 8
-llama_model_loader: - kv  26:              qwen3moe.attention.key_length u32              = 128
-llama_model_loader: - kv  27:            qwen3moe.attention.value_length u32              = 128
-llama_model_loader: - kv  28:                      qwen3moe.expert_count u32              = 128
-llama_model_loader: - kv  29:        qwen3moe.expert_feed_forward_length u32              = 1536
-llama_model_loader: - kv  30:                       tokenizer.ggml.model str              = gpt2
-llama_model_loader: - kv  31:                         tokenizer.ggml.pre str              = qwen2
-llama_model_loader: - kv  32:                      tokenizer.ggml.tokens arr[str,151936]  = ["!", "\"", "#", "$", "%", "&", "'", ...
-llama_model_loader: - kv  33:                  tokenizer.ggml.token_type arr[i32,151936]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
-llama_model_loader: - kv  34:                      tokenizer.ggml.merges arr[str,151387]  = ["Ġ Ġ", "ĠĠ ĠĠ", "i n", "Ġ t",...
-llama_model_loader: - kv  35:                tokenizer.ggml.eos_token_id u32              = 151645
-llama_model_loader: - kv  36:            tokenizer.ggml.padding_token_id u32              = 151654
-llama_model_loader: - kv  37:               tokenizer.ggml.add_bos_token bool             = false
-llama_model_loader: - kv  38:                    tokenizer.chat_template str              = {%- if tools %}\n    {{- '<|im_start|>...
-llama_model_loader: - kv  39:               general.quantization_version u32              = 2
-llama_model_loader: - kv  40:                          general.file_type u32              = 12
-llama_model_loader: - kv  41:                      quantize.imatrix.file str              = Qwen3-235B-A22B-Instruct-2507-GGUF/im...
-llama_model_loader: - kv  42:                   quantize.imatrix.dataset str              = unsloth_calibration_Qwen3-235B-A22B-I...
-llama_model_loader: - kv  43:             quantize.imatrix.entries_count u32              = 745
-llama_model_loader: - kv  44:              quantize.imatrix.chunks_count u32              = 693
-llama_model_loader: - kv  45:                                   split.no u16              = 0
-llama_model_loader: - kv  46:                        split.tensors.count i32              = 1131
-llama_model_loader: - kv  47:                                split.count u16              = 3
-llama_model_loader: - type  f32:  471 tensors
-llama_model_loader: - type q3_K:  267 tensors
-llama_model_loader: - type q4_K:  362 tensors
-llama_model_loader: - type q5_K:   20 tensors
-llama_model_loader: - type q6_K:   11 tensors
-print_info: file format = GGUF V3 (latest)
-print_info: file type   = Q3_K - Medium
-print_info: file size   = 96.99 GiB (3.54 BPW) 
-load: special tokens cache size = 26
-load: token to piece cache size = 0.9311 MB
-print_info: arch             = qwen3moe
-print_info: vocab_only       = 0
-print_info: n_ctx_train      = 262144
-print_info: n_embd           = 4096
-print_info: n_layer          = 94
-print_info: n_head           = 64
-print_info: n_head_kv        = 4
-print_info: n_rot            = 128
-print_info: n_swa            = 0
-print_info: is_swa_any       = 0
-print_info: n_embd_head_k    = 128
-print_info: n_embd_head_v    = 128
-print_info: n_gqa            = 16
-print_info: n_embd_k_gqa     = 512
-print_info: n_embd_v_gqa     = 512
-print_info: f_norm_eps       = 0.0e+00
-print_info: f_norm_rms_eps   = 1.0e-06
-print_info: f_clamp_kqv      = 0.0e+00
-print_info: f_max_alibi_bias = 0.0e+00
-print_info: f_logit_scale    = 0.0e+00
-print_info: f_attn_scale     = 0.0e+00
-print_info: n_ff             = 12288
-print_info: n_expert         = 128
-print_info: n_expert_used    = 8
-print_info: causal attn      = 1
-print_info: pooling type     = 0
-print_info: rope type        = 2
-print_info: rope scaling     = linear
-print_info: freq_base_train  = 5000000.0
-print_info: freq_scale_train = 1
-print_info: n_ctx_orig_yarn  = 262144
-print_info: rope_finetuned   = unknown
-print_info: model type       = 235B.A22B
-print_info: model params     = 235.09 B
-print_info: general.name     = Qwen3-235B-A22B-Instruct-2507
-print_info: n_ff_exp         = 1536
-print_info: vocab type       = BPE
-print_info: n_vocab          = 151936
-print_info: n_merges         = 151387
-print_info: BOS token        = 11 ','
-print_info: EOS token        = 151645 '<|im_end|>'
-print_info: EOT token        = 151645 '<|im_end|>'
-print_info: PAD token        = 151654 '<|vision_pad|>'
-print_info: LF token         = 198 'Ċ'
-print_info: FIM PRE token    = 151659 '<|fim_prefix|>'
-print_info: FIM SUF token    = 151661 '<|fim_suffix|>'
-print_info: FIM MID token    = 151660 '<|fim_middle|>'
-print_info: FIM PAD token    = 151662 '<|fim_pad|>'
-print_info: FIM REP token    = 151663 '<|repo_name|>'
-print_info: FIM SEP token    = 151664 '<|file_sep|>'
-print_info: EOG token        = 151643 '<|endoftext|>'
-print_info: EOG token        = 151645 '<|im_end|>'
-print_info: EOG token        = 151662 '<|fim_pad|>'
-print_info: EOG token        = 151663 '<|repo_name|>'
-print_info: EOG token        = 151664 '<|file_sep|>'
-print_info: max token length = 256
-load_tensors: loading model tensors, this can take a while... (mmap = false)
-load_tensors: offloading 94 repeating layers to GPU
-load_tensors: offloading output layer to GPU
-load_tensors: offloaded 95/95 layers to GPU
-load_tensors:          CPU model buffer size =   333.84 MiB
-load_tensors:        ROCm0 model buffer size = 98988.40 MiB
-....................................................................................................
-llama_context: constructing llama_context
-llama_context: non-unified KV cache requires ggml_set_rows() - forcing unified KV cache
-llama_context: n_seq_max     = 1
-llama_context: n_ctx         = 4096
-llama_context: n_ctx_per_seq = 4096
-llama_context: n_batch       = 2048
-llama_context: n_ubatch      = 512
-llama_context: causal_attn   = 1
-llama_context: flash_attn    = 1
-llama_context: kv_unified    = true
-llama_context: freq_base     = 5000000.0
-llama_context: freq_scale    = 1
-llama_context: n_ctx_per_seq (4096) < n_ctx_train (262144) -- the full capacity of the model will not be utilized
-llama_context:  ROCm_Host  output buffer size =     0.58 MiB
-llama_kv_cache_unified:      ROCm0 KV buffer size =   752.00 MiB
-llama_kv_cache_unified: size =  752.00 MiB (  4096 cells,  94 layers,  1/ 1 seqs), K (f16):  376.00 MiB, V (f16):  376.00 MiB
-llama_kv_cache_unified: LLAMA_SET_ROWS=0, using old ggml_cpy() method for backwards compatibility
-llama_context:      ROCm0 compute buffer size =   304.75 MiB
-llama_context:  ROCm_Host compute buffer size =    16.01 MiB
-llama_context: graph nodes  = 6023
-llama_context: graph splits = 2
-common_init_from_params: added <|endoftext|> logit bias = -inf
-common_init_from_params: added <|im_end|> logit bias = -inf
-common_init_from_params: added <|fim_pad|> logit bias = -inf
-common_init_from_params: added <|repo_name|> logit bias = -inf
-common_init_from_params: added <|file_sep|> logit bias = -inf
-common_init_from_params: setting dry_penalty_last_n to ctx_size = 4096
-common_init_from_params: warming up the model with an empty run - please wait ... (--no-warmup to disable)
-main: llama threadpool init, n_threads = 16
-
-system_info: n_threads = 16 (n_threads_batch = 16) / 32 | ROCm : NO_VMM = 1 | PEER_MAX_BATCH_SIZE = 128 | CPU : SSE3 = 1 | SSSE3 = 1 | AVX = 1 | AVX2 = 1 | F16C = 1 | FMA = 1 | BMI2 = 1 | AVX512 = 1 | AVX512_VBMI = 1 | AVX512_VNNI = 1 | LLAMAFILE = 1 | OPENMP = 1 | REPACK = 1 | 
-
-sampler seed: 715670654
-sampler params: 
-	repeat_last_n = 64, repeat_penalty = 1.000, frequency_penalty = 0.000, presence_penalty = 0.000
-	dry_multiplier = 0.000, dry_base = 1.750, dry_allowed_length = 2, dry_penalty_last_n = 4096
-	top_k = 40, top_p = 0.950, min_p = 0.050, xtc_probability = 0.000, xtc_threshold = 0.100, typical_p = 1.000, top_n_sigma = -1.000, temp = 0.800
-	mirostat = 0, mirostat_lr = 0.100, mirostat_ent = 5.000
-sampler chain: logits -> logit-bias -> penalties -> dry -> top-n-sigma -> top-k -> typical -> top-p -> min-p -> xtc -> temp-ext -> dist 
-generate: n_ctx = 4096, n_batch = 2048, n_predict = 1, n_keep = 0
-
-Hello,
-
-llama_perf_sampler_print:    sampling time =       0.06 ms /     2 runs   (    0.03 ms per token, 34482.76 tokens per second)
-llama_perf_context_print:        load time =   31968.90 ms
-llama_perf_context_print: prompt eval time =       0.00 ms /     1 tokens (    0.00 ms per token,      inf tokens per second)
-llama_perf_context_print:        eval time =      73.79 ms /     1 runs   (   73.79 ms per token,    13.55 tokens per second)
-llama_perf_context_print:       total time =      87.27 ms /     2 tokens
-llama_perf_context_print:    graphs reused =          0
-    Elapsed #3: 32.781452355s
-    Run #3 status: 0
-  → Avg over 3 runs: 33.458s
@@ -1,182 +0,0 @@
-ggml_vulkan: Found 1 Vulkan devices:
-ggml_vulkan: 0 = Radeon 8060S Graphics (AMD open-source driver) | uma: 1 | fp16: 1 | bf16: 0 | warp size: 64 | shared memory: 32768 | int dot: 1 | matrix cores: KHR_coopmat
-build: 6060 (9c35706b) with cc (GCC) 15.1.1 20250719 (Red Hat 15.1.1-5) for x86_64-redhat-linux
-main: llama backend init
-main: load the model and apply lora adapter, if any
-llama_model_load_from_file_impl: using device Vulkan0 (Radeon 8060S Graphics) - 85720 MiB free
-llama_model_loader: additional 2 GGUFs metadata loaded.
-llama_model_loader: loaded meta data with 48 key-value pairs and 1131 tensors from /home/kyuz0/models/qwen-3-235B-Q3_K-XL/UD-Q3_K_XL/Qwen3-235B-A22B-Instruct-2507-UD-Q3_K_XL-00001-of-00003.gguf (version GGUF V3 (latest))
-llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
-llama_model_loader: - kv   0:                       general.architecture str              = qwen3moe
-llama_model_loader: - kv   1:                               general.type str              = model
-llama_model_loader: - kv   2:                               general.name str              = Qwen3-235B-A22B-Instruct-2507
-llama_model_loader: - kv   3:                            general.version str              = 2507
-llama_model_loader: - kv   4:                           general.finetune str              = Instruct
-llama_model_loader: - kv   5:                           general.basename str              = Qwen3-235B-A22B-Instruct-2507
-llama_model_loader: - kv   6:                       general.quantized_by str              = Unsloth
-llama_model_loader: - kv   7:                         general.size_label str              = 235B-A22B
-llama_model_loader: - kv   8:                            general.license str              = apache-2.0
-llama_model_loader: - kv   9:                       general.license.link str              = https://huggingface.co/Qwen/Qwen3-235...
-llama_model_loader: - kv  10:                           general.repo_url str              = https://huggingface.co/unsloth
-llama_model_loader: - kv  11:                   general.base_model.count u32              = 1
-llama_model_loader: - kv  12:                  general.base_model.0.name str              = Qwen3 235B A22B Instruct 2507
-llama_model_loader: - kv  13:               general.base_model.0.version str              = 2507
-llama_model_loader: - kv  14:          general.base_model.0.organization str              = Qwen
-llama_model_loader: - kv  15:              general.base_model.0.repo_url str              = https://huggingface.co/Qwen/Qwen3-235...
-llama_model_loader: - kv  16:                               general.tags arr[str,2]       = ["unsloth", "text-generation"]
-llama_model_loader: - kv  17:                       qwen3moe.block_count u32              = 94
-llama_model_loader: - kv  18:                    qwen3moe.context_length u32              = 262144
-llama_model_loader: - kv  19:                  qwen3moe.embedding_length u32              = 4096
-llama_model_loader: - kv  20:               qwen3moe.feed_forward_length u32              = 12288
-llama_model_loader: - kv  21:              qwen3moe.attention.head_count u32              = 64
-llama_model_loader: - kv  22:           qwen3moe.attention.head_count_kv u32              = 4
-llama_model_loader: - kv  23:                    qwen3moe.rope.freq_base f32              = 5000000.000000
-llama_model_loader: - kv  24:  qwen3moe.attention.layer_norm_rms_epsilon f32              = 0.000001
-llama_model_loader: - kv  25:                 qwen3moe.expert_used_count u32              = 8
-llama_model_loader: - kv  26:              qwen3moe.attention.key_length u32              = 128
-llama_model_loader: - kv  27:            qwen3moe.attention.value_length u32              = 128
-llama_model_loader: - kv  28:                      qwen3moe.expert_count u32              = 128
-llama_model_loader: - kv  29:        qwen3moe.expert_feed_forward_length u32              = 1536
-llama_model_loader: - kv  30:                       tokenizer.ggml.model str              = gpt2
-llama_model_loader: - kv  31:                         tokenizer.ggml.pre str              = qwen2
-llama_model_loader: - kv  32:                      tokenizer.ggml.tokens arr[str,151936]  = ["!", "\"", "#", "$", "%", "&", "'", ...
-llama_model_loader: - kv  33:                  tokenizer.ggml.token_type arr[i32,151936]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
-llama_model_loader: - kv  34:                      tokenizer.ggml.merges arr[str,151387]  = ["Ġ Ġ", "ĠĠ ĠĠ", "i n", "Ġ t",...
-llama_model_loader: - kv  35:                tokenizer.ggml.eos_token_id u32              = 151645
-llama_model_loader: - kv  36:            tokenizer.ggml.padding_token_id u32              = 151654
-llama_model_loader: - kv  37:               tokenizer.ggml.add_bos_token bool             = false
-llama_model_loader: - kv  38:                    tokenizer.chat_template str              = {%- if tools %}\n    {{- '<|im_start|>...
-llama_model_loader: - kv  39:               general.quantization_version u32              = 2
-llama_model_loader: - kv  40:                          general.file_type u32              = 12
-llama_model_loader: - kv  41:                      quantize.imatrix.file str              = Qwen3-235B-A22B-Instruct-2507-GGUF/im...
-llama_model_loader: - kv  42:                   quantize.imatrix.dataset str              = unsloth_calibration_Qwen3-235B-A22B-I...
-llama_model_loader: - kv  43:             quantize.imatrix.entries_count u32              = 745
-llama_model_loader: - kv  44:              quantize.imatrix.chunks_count u32              = 693
-llama_model_loader: - kv  45:                                   split.no u16              = 0
-llama_model_loader: - kv  46:                        split.tensors.count i32              = 1131
-llama_model_loader: - kv  47:                                split.count u16              = 3
-llama_model_loader: - type  f32:  471 tensors
-llama_model_loader: - type q3_K:  267 tensors
-llama_model_loader: - type q4_K:  362 tensors
-llama_model_loader: - type q5_K:   20 tensors
-llama_model_loader: - type q6_K:   11 tensors
-print_info: file format = GGUF V3 (latest)
-print_info: file type   = Q3_K - Medium
-print_info: file size   = 96.99 GiB (3.54 BPW) 
-load: special tokens cache size = 26
-load: token to piece cache size = 0.9311 MB
-print_info: arch             = qwen3moe
-print_info: vocab_only       = 0
-print_info: n_ctx_train      = 262144
-print_info: n_embd           = 4096
-print_info: n_layer          = 94
-print_info: n_head           = 64
-print_info: n_head_kv        = 4
-print_info: n_rot            = 128
-print_info: n_swa            = 0
-print_info: is_swa_any       = 0
-print_info: n_embd_head_k    = 128
-print_info: n_embd_head_v    = 128
-print_info: n_gqa            = 16
-print_info: n_embd_k_gqa     = 512
-print_info: n_embd_v_gqa     = 512
-print_info: f_norm_eps       = 0.0e+00
-print_info: f_norm_rms_eps   = 1.0e-06
-print_info: f_clamp_kqv      = 0.0e+00
-print_info: f_max_alibi_bias = 0.0e+00
-print_info: f_logit_scale    = 0.0e+00
-print_info: f_attn_scale     = 0.0e+00
-print_info: n_ff             = 12288
-print_info: n_expert         = 128
-print_info: n_expert_used    = 8
-print_info: causal attn      = 1
-print_info: pooling type     = 0
-print_info: rope type        = 2
-print_info: rope scaling     = linear
-print_info: freq_base_train  = 5000000.0
-print_info: freq_scale_train = 1
-print_info: n_ctx_orig_yarn  = 262144
-print_info: rope_finetuned   = unknown
-print_info: model type       = 235B.A22B
-print_info: model params     = 235.09 B
-print_info: general.name     = Qwen3-235B-A22B-Instruct-2507
-print_info: n_ff_exp         = 1536
-print_info: vocab type       = BPE
-print_info: n_vocab          = 151936
-print_info: n_merges         = 151387
-print_info: BOS token        = 11 ','
-print_info: EOS token        = 151645 '<|im_end|>'
-print_info: EOT token        = 151645 '<|im_end|>'
-print_info: PAD token        = 151654 '<|vision_pad|>'
-print_info: LF token         = 198 'Ċ'
-print_info: FIM PRE token    = 151659 '<|fim_prefix|>'
-print_info: FIM SUF token    = 151661 '<|fim_suffix|>'
-print_info: FIM MID token    = 151660 '<|fim_middle|>'
-print_info: FIM PAD token    = 151662 '<|fim_pad|>'
-print_info: FIM REP token    = 151663 '<|repo_name|>'
-print_info: FIM SEP token    = 151664 '<|file_sep|>'
-print_info: EOG token        = 151643 '<|endoftext|>'
-print_info: EOG token        = 151645 '<|im_end|>'
-print_info: EOG token        = 151662 '<|fim_pad|>'
-print_info: EOG token        = 151663 '<|repo_name|>'
-print_info: EOG token        = 151664 '<|file_sep|>'
-print_info: max token length = 256
-load_tensors: loading model tensors, this can take a while... (mmap = false)
-load_tensors: offloading 94 repeating layers to GPU
-load_tensors: offloading output layer to GPU
-load_tensors: offloaded 95/95 layers to GPU
-load_tensors:      Vulkan0 model buffer size = 98988.40 MiB
-load_tensors:          CPU model buffer size =   333.84 MiB
-....................................................................................................
-llama_context: constructing llama_context
-llama_context: non-unified KV cache requires ggml_set_rows() - forcing unified KV cache
-llama_context: n_seq_max     = 1
-llama_context: n_ctx         = 4096
-llama_context: n_ctx_per_seq = 4096
-llama_context: n_batch       = 2048
-llama_context: n_ubatch      = 512
-llama_context: causal_attn   = 1
-llama_context: flash_attn    = 1
-llama_context: kv_unified    = true
-llama_context: freq_base     = 5000000.0
-llama_context: freq_scale    = 1
-llama_context: n_ctx_per_seq (4096) < n_ctx_train (262144) -- the full capacity of the model will not be utilized
-llama_context: Vulkan_Host  output buffer size =     0.58 MiB
-llama_kv_cache_unified:    Vulkan0 KV buffer size =   752.00 MiB
-llama_kv_cache_unified: size =  752.00 MiB (  4096 cells,  94 layers,  1/ 1 seqs), K (f16):  376.00 MiB, V (f16):  376.00 MiB
-llama_kv_cache_unified: LLAMA_SET_ROWS=0, using old ggml_cpy() method for backwards compatibility
-llama_context:    Vulkan0 compute buffer size =   304.75 MiB
-llama_context: Vulkan_Host compute buffer size =    16.01 MiB
-llama_context: graph nodes  = 6023
-llama_context: graph splits = 2
-common_init_from_params: added <|endoftext|> logit bias = -inf
-common_init_from_params: added <|im_end|> logit bias = -inf
-common_init_from_params: added <|fim_pad|> logit bias = -inf
-common_init_from_params: added <|repo_name|> logit bias = -inf
-common_init_from_params: added <|file_sep|> logit bias = -inf
-common_init_from_params: setting dry_penalty_last_n to ctx_size = 4096
-common_init_from_params: warming up the model with an empty run - please wait ... (--no-warmup to disable)
-main: llama threadpool init, n_threads = 16
-
-system_info: n_threads = 16 (n_threads_batch = 16) / 32 | CPU : SSE3 = 1 | SSSE3 = 1 | AVX = 1 | AVX2 = 1 | F16C = 1 | FMA = 1 | BMI2 = 1 | AVX512 = 1 | AVX512_VBMI = 1 | AVX512_VNNI = 1 | LLAMAFILE = 1 | OPENMP = 1 | REPACK = 1 | 
-
-sampler seed: 4076614647
-sampler params: 
-	repeat_last_n = 64, repeat_penalty = 1.000, frequency_penalty = 0.000, presence_penalty = 0.000
-	dry_multiplier = 0.000, dry_base = 1.750, dry_allowed_length = 2, dry_penalty_last_n = 4096
-	top_k = 40, top_p = 0.950, min_p = 0.050, xtc_probability = 0.000, xtc_threshold = 0.100, typical_p = 1.000, top_n_sigma = -1.000, temp = 0.800
-	mirostat = 0, mirostat_lr = 0.100, mirostat_ent = 5.000
-sampler chain: logits -> logit-bias -> penalties -> dry -> top-n-sigma -> top-k -> typical -> top-p -> min-p -> xtc -> temp-ext -> dist 
-generate: n_ctx = 4096, n_batch = 2048, n_predict = 1, n_keep = 0
-
-Hello,
-
-llama_perf_sampler_print:    sampling time =       0.07 ms /     2 runs   (    0.04 ms per token, 28571.43 tokens per second)
-llama_perf_context_print:        load time =   40072.88 ms
-llama_perf_context_print: prompt eval time =       0.00 ms /     1 tokens (    0.00 ms per token,      inf tokens per second)
-llama_perf_context_print:        eval time =      67.40 ms /     1 runs   (   67.40 ms per token,    14.84 tokens per second)
-llama_perf_context_print:       total time =      86.12 ms /     2 tokens
-llama_perf_context_print:    graphs reused =          0
-    Elapsed #3: 43.569299668s
-    Run #3 status: 0
-  → Avg over 3 runs: 44.883s
@@ -1,182 +0,0 @@
-ggml_vulkan: Found 1 Vulkan devices:
-ggml_vulkan: 0 = Radeon 8060S Graphics (RADV GFX1151) (radv) | uma: 1 | fp16: 1 | bf16: 0 | warp size: 64 | shared memory: 65536 | int dot: 1 | matrix cores: KHR_coopmat
-build: 6040 (66625a59) with cc (GCC) 15.1.1 20250719 (Red Hat 15.1.1-5) for x86_64-redhat-linux
-main: llama backend init
-main: load the model and apply lora adapter, if any
-llama_model_load_from_file_impl: using device Vulkan0 (Radeon 8060S Graphics (RADV GFX1151)) - 87722 MiB free
-llama_model_loader: additional 2 GGUFs metadata loaded.
-llama_model_loader: loaded meta data with 48 key-value pairs and 1131 tensors from /home/kyuz0/models/qwen-3-235B-Q3_K-XL/UD-Q3_K_XL/Qwen3-235B-A22B-Instruct-2507-UD-Q3_K_XL-00001-of-00003.gguf (version GGUF V3 (latest))
-llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
-llama_model_loader: - kv   0:                       general.architecture str              = qwen3moe
-llama_model_loader: - kv   1:                               general.type str              = model
-llama_model_loader: - kv   2:                               general.name str              = Qwen3-235B-A22B-Instruct-2507
-llama_model_loader: - kv   3:                            general.version str              = 2507
-llama_model_loader: - kv   4:                           general.finetune str              = Instruct
-llama_model_loader: - kv   5:                           general.basename str              = Qwen3-235B-A22B-Instruct-2507
-llama_model_loader: - kv   6:                       general.quantized_by str              = Unsloth
-llama_model_loader: - kv   7:                         general.size_label str              = 235B-A22B
-llama_model_loader: - kv   8:                            general.license str              = apache-2.0
-llama_model_loader: - kv   9:                       general.license.link str              = https://huggingface.co/Qwen/Qwen3-235...
-llama_model_loader: - kv  10:                           general.repo_url str              = https://huggingface.co/unsloth
-llama_model_loader: - kv  11:                   general.base_model.count u32              = 1
-llama_model_loader: - kv  12:                  general.base_model.0.name str              = Qwen3 235B A22B Instruct 2507
-llama_model_loader: - kv  13:               general.base_model.0.version str              = 2507
-llama_model_loader: - kv  14:          general.base_model.0.organization str              = Qwen
-llama_model_loader: - kv  15:              general.base_model.0.repo_url str              = https://huggingface.co/Qwen/Qwen3-235...
-llama_model_loader: - kv  16:                               general.tags arr[str,2]       = ["unsloth", "text-generation"]
-llama_model_loader: - kv  17:                       qwen3moe.block_count u32              = 94
-llama_model_loader: - kv  18:                    qwen3moe.context_length u32              = 262144
-llama_model_loader: - kv  19:                  qwen3moe.embedding_length u32              = 4096
-llama_model_loader: - kv  20:               qwen3moe.feed_forward_length u32              = 12288
-llama_model_loader: - kv  21:              qwen3moe.attention.head_count u32              = 64
-llama_model_loader: - kv  22:           qwen3moe.attention.head_count_kv u32              = 4
-llama_model_loader: - kv  23:                    qwen3moe.rope.freq_base f32              = 5000000.000000
-llama_model_loader: - kv  24:  qwen3moe.attention.layer_norm_rms_epsilon f32              = 0.000001
-llama_model_loader: - kv  25:                 qwen3moe.expert_used_count u32              = 8
-llama_model_loader: - kv  26:              qwen3moe.attention.key_length u32              = 128
-llama_model_loader: - kv  27:            qwen3moe.attention.value_length u32              = 128
-llama_model_loader: - kv  28:                      qwen3moe.expert_count u32              = 128
-llama_model_loader: - kv  29:        qwen3moe.expert_feed_forward_length u32              = 1536
-llama_model_loader: - kv  30:                       tokenizer.ggml.model str              = gpt2
-llama_model_loader: - kv  31:                         tokenizer.ggml.pre str              = qwen2
-llama_model_loader: - kv  32:                      tokenizer.ggml.tokens arr[str,151936]  = ["!", "\"", "#", "$", "%", "&", "'", ...
-llama_model_loader: - kv  33:                  tokenizer.ggml.token_type arr[i32,151936]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
-llama_model_loader: - kv  34:                      tokenizer.ggml.merges arr[str,151387]  = ["Ġ Ġ", "ĠĠ ĠĠ", "i n", "Ġ t",...
-llama_model_loader: - kv  35:                tokenizer.ggml.eos_token_id u32              = 151645
-llama_model_loader: - kv  36:            tokenizer.ggml.padding_token_id u32              = 151654
-llama_model_loader: - kv  37:               tokenizer.ggml.add_bos_token bool             = false
-llama_model_loader: - kv  38:                    tokenizer.chat_template str              = {%- if tools %}\n    {{- '<|im_start|>...
-llama_model_loader: - kv  39:               general.quantization_version u32              = 2
-llama_model_loader: - kv  40:                          general.file_type u32              = 12
-llama_model_loader: - kv  41:                      quantize.imatrix.file str              = Qwen3-235B-A22B-Instruct-2507-GGUF/im...
-llama_model_loader: - kv  42:                   quantize.imatrix.dataset str              = unsloth_calibration_Qwen3-235B-A22B-I...
-llama_model_loader: - kv  43:             quantize.imatrix.entries_count u32              = 745
-llama_model_loader: - kv  44:              quantize.imatrix.chunks_count u32              = 693
-llama_model_loader: - kv  45:                                   split.no u16              = 0
-llama_model_loader: - kv  46:                        split.tensors.count i32              = 1131
-llama_model_loader: - kv  47:                                split.count u16              = 3
-llama_model_loader: - type  f32:  471 tensors
-llama_model_loader: - type q3_K:  267 tensors
-llama_model_loader: - type q4_K:  362 tensors
-llama_model_loader: - type q5_K:   20 tensors
-llama_model_loader: - type q6_K:   11 tensors
-print_info: file format = GGUF V3 (latest)
-print_info: file type   = Q3_K - Medium
-print_info: file size   = 96.99 GiB (3.54 BPW) 
-load: special tokens cache size = 26
-load: token to piece cache size = 0.9311 MB
-print_info: arch             = qwen3moe
-print_info: vocab_only       = 0
-print_info: n_ctx_train      = 262144
-print_info: n_embd           = 4096
-print_info: n_layer          = 94
-print_info: n_head           = 64
-print_info: n_head_kv        = 4
-print_info: n_rot            = 128
-print_info: n_swa            = 0
-print_info: is_swa_any       = 0
-print_info: n_embd_head_k    = 128
-print_info: n_embd_head_v    = 128
-print_info: n_gqa            = 16
-print_info: n_embd_k_gqa     = 512
-print_info: n_embd_v_gqa     = 512
-print_info: f_norm_eps       = 0.0e+00
-print_info: f_norm_rms_eps   = 1.0e-06
-print_info: f_clamp_kqv      = 0.0e+00
-print_info: f_max_alibi_bias = 0.0e+00
-print_info: f_logit_scale    = 0.0e+00
-print_info: f_attn_scale     = 0.0e+00
-print_info: n_ff             = 12288
-print_info: n_expert         = 128
-print_info: n_expert_used    = 8
-print_info: causal attn      = 1
-print_info: pooling type     = 0
-print_info: rope type        = 2
-print_info: rope scaling     = linear
-print_info: freq_base_train  = 5000000.0
-print_info: freq_scale_train = 1
-print_info: n_ctx_orig_yarn  = 262144
-print_info: rope_finetuned   = unknown
-print_info: model type       = 235B.A22B
-print_info: model params     = 235.09 B
-print_info: general.name     = Qwen3-235B-A22B-Instruct-2507
-print_info: n_ff_exp         = 1536
-print_info: vocab type       = BPE
-print_info: n_vocab          = 151936
-print_info: n_merges         = 151387
-print_info: BOS token        = 11 ','
-print_info: EOS token        = 151645 '<|im_end|>'
-print_info: EOT token        = 151645 '<|im_end|>'
-print_info: PAD token        = 151654 '<|vision_pad|>'
-print_info: LF token         = 198 'Ċ'
-print_info: FIM PRE token    = 151659 '<|fim_prefix|>'
-print_info: FIM SUF token    = 151661 '<|fim_suffix|>'
-print_info: FIM MID token    = 151660 '<|fim_middle|>'
-print_info: FIM PAD token    = 151662 '<|fim_pad|>'
-print_info: FIM REP token    = 151663 '<|repo_name|>'
-print_info: FIM SEP token    = 151664 '<|file_sep|>'
-print_info: EOG token        = 151643 '<|endoftext|>'
-print_info: EOG token        = 151645 '<|im_end|>'
-print_info: EOG token        = 151662 '<|fim_pad|>'
-print_info: EOG token        = 151663 '<|repo_name|>'
-print_info: EOG token        = 151664 '<|file_sep|>'
-print_info: max token length = 256
-load_tensors: loading model tensors, this can take a while... (mmap = false)
-load_tensors: offloading 94 repeating layers to GPU
-load_tensors: offloading output layer to GPU
-load_tensors: offloaded 95/95 layers to GPU
-load_tensors:      Vulkan0 model buffer size = 98988.40 MiB
-load_tensors:          CPU model buffer size =   333.84 MiB
-....................................................................................................
-llama_context: constructing llama_context
-llama_context: non-unified KV cache requires ggml_set_rows() - forcing unified KV cache
-llama_context: n_seq_max     = 1
-llama_context: n_ctx         = 4096
-llama_context: n_ctx_per_seq = 4096
-llama_context: n_batch       = 2048
-llama_context: n_ubatch      = 512
-llama_context: causal_attn   = 1
-llama_context: flash_attn    = 1
-llama_context: kv_unified    = true
-llama_context: freq_base     = 5000000.0
-llama_context: freq_scale    = 1
-llama_context: n_ctx_per_seq (4096) < n_ctx_train (262144) -- the full capacity of the model will not be utilized
-llama_context: Vulkan_Host  output buffer size =     0.58 MiB
-llama_kv_cache_unified:    Vulkan0 KV buffer size =   752.00 MiB
-llama_kv_cache_unified: size =  752.00 MiB (  4096 cells,  94 layers,  1/ 1 seqs), K (f16):  376.00 MiB, V (f16):  376.00 MiB
-llama_kv_cache_unified: LLAMA_SET_ROWS=0, using old ggml_cpy() method for backwards compatibility
-llama_context:    Vulkan0 compute buffer size =   304.75 MiB
-llama_context: Vulkan_Host compute buffer size =    16.01 MiB
-llama_context: graph nodes  = 6023
-llama_context: graph splits = 2
-common_init_from_params: added <|endoftext|> logit bias = -inf
-common_init_from_params: added <|im_end|> logit bias = -inf
-common_init_from_params: added <|fim_pad|> logit bias = -inf
-common_init_from_params: added <|repo_name|> logit bias = -inf
-common_init_from_params: added <|file_sep|> logit bias = -inf
-common_init_from_params: setting dry_penalty_last_n to ctx_size = 4096
-common_init_from_params: warming up the model with an empty run - please wait ... (--no-warmup to disable)
-main: llama threadpool init, n_threads = 16
-
-system_info: n_threads = 16 (n_threads_batch = 16) / 32 | CPU : SSE3 = 1 | SSSE3 = 1 | AVX = 1 | AVX2 = 1 | F16C = 1 | FMA = 1 | BMI2 = 1 | AVX512 = 1 | AVX512_VBMI = 1 | AVX512_VNNI = 1 | LLAMAFILE = 1 | OPENMP = 1 | REPACK = 1 | 
-
-sampler seed: 1959920459
-sampler params: 
-	repeat_last_n = 64, repeat_penalty = 1.000, frequency_penalty = 0.000, presence_penalty = 0.000
-	dry_multiplier = 0.000, dry_base = 1.750, dry_allowed_length = 2, dry_penalty_last_n = 4096
-	top_k = 40, top_p = 0.950, min_p = 0.050, xtc_probability = 0.000, xtc_threshold = 0.100, typical_p = 1.000, top_n_sigma = -1.000, temp = 0.800
-	mirostat = 0, mirostat_lr = 0.100, mirostat_ent = 5.000
-sampler chain: logits -> logit-bias -> penalties -> dry -> top-n-sigma -> top-k -> typical -> top-p -> min-p -> xtc -> temp-ext -> dist 
-generate: n_ctx = 4096, n_batch = 2048, n_predict = 1, n_keep = 0
-
-Hello,
-
-llama_perf_sampler_print:    sampling time =       0.08 ms /     2 runs   (    0.04 ms per token, 25641.03 tokens per second)
-llama_perf_context_print:        load time =   40114.24 ms
-llama_perf_context_print: prompt eval time =       0.00 ms /     1 tokens (    0.00 ms per token,      inf tokens per second)
-llama_perf_context_print:        eval time =      67.08 ms /     1 runs   (   67.08 ms per token,    14.91 tokens per second)
-llama_perf_context_print:       total time =      86.46 ms /     2 tokens
-llama_perf_context_print:    graphs reused =          0
-    Elapsed #3: 40.621909942s
-    Run #3 status: 0
-  → Avg over 3 runs: 40.722s
@@ -1,167 +0,0 @@
-ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
-ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
-ggml_cuda_init: found 1 ROCm devices:
-  Device 0: Radeon 8060S Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32
-build: 6040 (66625a59) with cc (GCC) 15.1.1 20250521 (Red Hat 15.1.1-2) for x86_64-redhat-linux
-main: llama backend init
-main: load the model and apply lora adapter, if any
-llama_model_load_from_file_impl: using device ROCm0 (Radeon 8060S Graphics) - 124522 MiB free
-llama_model_loader: additional 1 GGUFs metadata loaded.
-llama_model_loader: loaded meta data with 34 key-value pairs and 579 tensors from /home/kyuz0/models/qwen-3-30B-A3B/BF16/Qwen3-30B-A3B-BF16-00001-of-00002.gguf (version GGUF V3 (latest))
-llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
-llama_model_loader: - kv   0:                       general.architecture str              = qwen3moe
-llama_model_loader: - kv   1:                               general.type str              = model
-llama_model_loader: - kv   2:                               general.name str              = Qwen3-30B-A3B
-llama_model_loader: - kv   3:                           general.basename str              = Qwen3-30B-A3B
-llama_model_loader: - kv   4:                       general.quantized_by str              = Unsloth
-llama_model_loader: - kv   5:                         general.size_label str              = 30B-A3B
-llama_model_loader: - kv   6:                           general.repo_url str              = https://huggingface.co/unsloth
-llama_model_loader: - kv   7:                       qwen3moe.block_count u32              = 48
-llama_model_loader: - kv   8:                    qwen3moe.context_length u32              = 40960
-llama_model_loader: - kv   9:                  qwen3moe.embedding_length u32              = 2048
-llama_model_loader: - kv  10:               qwen3moe.feed_forward_length u32              = 6144
-llama_model_loader: - kv  11:              qwen3moe.attention.head_count u32              = 32
-llama_model_loader: - kv  12:           qwen3moe.attention.head_count_kv u32              = 4
-llama_model_loader: - kv  13:                    qwen3moe.rope.freq_base f32              = 1000000.000000
-llama_model_loader: - kv  14:  qwen3moe.attention.layer_norm_rms_epsilon f32              = 0.000001
-llama_model_loader: - kv  15:                 qwen3moe.expert_used_count u32              = 8
-llama_model_loader: - kv  16:              qwen3moe.attention.key_length u32              = 128
-llama_model_loader: - kv  17:            qwen3moe.attention.value_length u32              = 128
-llama_model_loader: - kv  18:                          general.file_type u32              = 32
-llama_model_loader: - kv  19:                      qwen3moe.expert_count u32              = 128
-llama_model_loader: - kv  20:        qwen3moe.expert_feed_forward_length u32              = 768
-llama_model_loader: - kv  21:               general.quantization_version u32              = 2
-llama_model_loader: - kv  22:                       tokenizer.ggml.model str              = gpt2
-llama_model_loader: - kv  23:                         tokenizer.ggml.pre str              = qwen2
-llama_model_loader: - kv  24:                      tokenizer.ggml.tokens arr[str,151936]  = ["!", "\"", "#", "$", "%", "&", "'", ...
-llama_model_loader: - kv  25:                  tokenizer.ggml.token_type arr[i32,151936]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
-llama_model_loader: - kv  26:                      tokenizer.ggml.merges arr[str,151387]  = ["Ġ Ġ", "ĠĠ ĠĠ", "i n", "Ġ t",...
-llama_model_loader: - kv  27:                tokenizer.ggml.eos_token_id u32              = 151645
-llama_model_loader: - kv  28:            tokenizer.ggml.padding_token_id u32              = 151654
-llama_model_loader: - kv  29:               tokenizer.ggml.add_bos_token bool             = false
-llama_model_loader: - kv  30:                    tokenizer.chat_template str              = {%- if tools %}\n    {{- '<|im_start|>...
-llama_model_loader: - kv  31:                                   split.no u16              = 0
-llama_model_loader: - kv  32:                                split.count u16              = 2
-llama_model_loader: - kv  33:                        split.tensors.count i32              = 579
-llama_model_loader: - type  f32:  241 tensors
-llama_model_loader: - type bf16:  338 tensors
-print_info: file format = GGUF V3 (latest)
-print_info: file type   = BF16
-print_info: file size   = 56.89 GiB (16.01 BPW) 
-load: special tokens cache size = 26
-load: token to piece cache size = 0.9311 MB
-print_info: arch             = qwen3moe
-print_info: vocab_only       = 0
-print_info: n_ctx_train      = 40960
-print_info: n_embd           = 2048
-print_info: n_layer          = 48
-print_info: n_head           = 32
-print_info: n_head_kv        = 4
-print_info: n_rot            = 128
-print_info: n_swa            = 0
-print_info: is_swa_any       = 0
-print_info: n_embd_head_k    = 128
-print_info: n_embd_head_v    = 128
-print_info: n_gqa            = 8
-print_info: n_embd_k_gqa     = 512
-print_info: n_embd_v_gqa     = 512
-print_info: f_norm_eps       = 0.0e+00
-print_info: f_norm_rms_eps   = 1.0e-06
-print_info: f_clamp_kqv      = 0.0e+00
-print_info: f_max_alibi_bias = 0.0e+00
-print_info: f_logit_scale    = 0.0e+00
-print_info: f_attn_scale     = 0.0e+00
-print_info: n_ff             = 6144
-print_info: n_expert         = 128
-print_info: n_expert_used    = 8
-print_info: causal attn      = 1
-print_info: pooling type     = 0
-print_info: rope type        = 2
-print_info: rope scaling     = linear
-print_info: freq_base_train  = 1000000.0
-print_info: freq_scale_train = 1
-print_info: n_ctx_orig_yarn  = 40960
-print_info: rope_finetuned   = unknown
-print_info: model type       = 30B.A3B
-print_info: model params     = 30.53 B
-print_info: general.name     = Qwen3-30B-A3B
-print_info: n_ff_exp         = 768
-print_info: vocab type       = BPE
-print_info: n_vocab          = 151936
-print_info: n_merges         = 151387
-print_info: BOS token        = 11 ','
-print_info: EOS token        = 151645 '<|im_end|>'
-print_info: EOT token        = 151645 '<|im_end|>'
-print_info: PAD token        = 151654 '<|vision_pad|>'
-print_info: LF token         = 198 'Ċ'
-print_info: FIM PRE token    = 151659 '<|fim_prefix|>'
-print_info: FIM SUF token    = 151661 '<|fim_suffix|>'
-print_info: FIM MID token    = 151660 '<|fim_middle|>'
-print_info: FIM PAD token    = 151662 '<|fim_pad|>'
-print_info: FIM REP token    = 151663 '<|repo_name|>'
-print_info: FIM SEP token    = 151664 '<|file_sep|>'
-print_info: EOG token        = 151643 '<|endoftext|>'
-print_info: EOG token        = 151645 '<|im_end|>'
-print_info: EOG token        = 151662 '<|fim_pad|>'
-print_info: EOG token        = 151663 '<|repo_name|>'
-print_info: EOG token        = 151664 '<|file_sep|>'
-print_info: max token length = 256
-load_tensors: loading model tensors, this can take a while... (mmap = false)
-load_tensors: offloading 48 repeating layers to GPU
-load_tensors: offloading output layer to GPU
-load_tensors: offloaded 49/49 layers to GPU
-load_tensors:        ROCm0 model buffer size = 57666.30 MiB
-load_tensors:    ROCm_Host model buffer size =   593.50 MiB
-...................................................................................................
-llama_context: constructing llama_context
-llama_context: non-unified KV cache requires ggml_set_rows() - forcing unified KV cache
-llama_context: n_seq_max     = 1
-llama_context: n_ctx         = 4096
-llama_context: n_ctx_per_seq = 4096
-llama_context: n_batch       = 2048
-llama_context: n_ubatch      = 512
-llama_context: causal_attn   = 1
-llama_context: flash_attn    = 1
-llama_context: kv_unified    = true
-llama_context: freq_base     = 1000000.0
-llama_context: freq_scale    = 1
-llama_context: n_ctx_per_seq (4096) < n_ctx_train (40960) -- the full capacity of the model will not be utilized
-llama_context:  ROCm_Host  output buffer size =     0.58 MiB
-llama_kv_cache_unified:      ROCm0 KV buffer size =   384.00 MiB
-llama_kv_cache_unified: size =  384.00 MiB (  4096 cells,  48 layers,  1/ 1 seqs), K (f16):  192.00 MiB, V (f16):  192.00 MiB
-llama_kv_cache_unified: LLAMA_SET_ROWS=0, using old ggml_cpy() method for backwards compatibility
-llama_context:      ROCm0 compute buffer size =   300.75 MiB
-llama_context:  ROCm_Host compute buffer size =     8.01 MiB
-llama_context: graph nodes  = 3079
-llama_context: graph splits = 1
-common_init_from_params: added <|endoftext|> logit bias = -inf
-common_init_from_params: added <|im_end|> logit bias = -inf
-common_init_from_params: added <|fim_pad|> logit bias = -inf
-common_init_from_params: added <|repo_name|> logit bias = -inf
-common_init_from_params: added <|file_sep|> logit bias = -inf
-common_init_from_params: setting dry_penalty_last_n to ctx_size = 4096
-common_init_from_params: warming up the model with an empty run - please wait ... (--no-warmup to disable)
-main: llama threadpool init, n_threads = 16
-
-system_info: n_threads = 16 (n_threads_batch = 16) / 32 | ROCm : NO_VMM = 1 | PEER_MAX_BATCH_SIZE = 128 | CPU : SSE3 = 1 | SSSE3 = 1 | AVX = 1 | AVX2 = 1 | F16C = 1 | FMA = 1 | BMI2 = 1 | AVX512 = 1 | AVX512_VBMI = 1 | AVX512_VNNI = 1 | LLAMAFILE = 1 | OPENMP = 1 | REPACK = 1 | 
-
-sampler seed: 1093628111
-sampler params: 
-	repeat_last_n = 64, repeat_penalty = 1.000, frequency_penalty = 0.000, presence_penalty = 0.000
-	dry_multiplier = 0.000, dry_base = 1.750, dry_allowed_length = 2, dry_penalty_last_n = 4096
-	top_k = 40, top_p = 0.950, min_p = 0.050, xtc_probability = 0.000, xtc_threshold = 0.100, typical_p = 1.000, top_n_sigma = -1.000, temp = 0.800
-	mirostat = 0, mirostat_lr = 0.100, mirostat_ent = 5.000
-sampler chain: logits -> logit-bias -> penalties -> dry -> top-n-sigma -> top-k -> typical -> top-p -> min-p -> xtc -> temp-ext -> dist 
-generate: n_ctx = 4096, n_batch = 2048, n_predict = 1, n_keep = 0
-
-Hello -
-
-llama_perf_sampler_print:    sampling time =       0.06 ms /     2 runs   (    0.03 ms per token, 34482.76 tokens per second)
-llama_perf_context_print:        load time =   19374.51 ms
-llama_perf_context_print: prompt eval time =       0.00 ms /     1 tokens (    0.00 ms per token,      inf tokens per second)
-llama_perf_context_print:        eval time =      42.85 ms /     1 runs   (   42.85 ms per token,    23.34 tokens per second)
-llama_perf_context_print:       total time =      73.04 ms /     2 tokens
-llama_perf_context_print:    graphs reused =          0
-    Elapsed #3: 23.364750813s
-    Run #3 status: 0
-  → Avg over 3 runs: 22.166s
@@ -1,167 +0,0 @@
-ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
-ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
-ggml_cuda_init: found 1 ROCm devices:
-  Device 0: AMD Radeon Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32
-build: 6040 (66625a59) with cc (GCC) 15.1.1 20250719 (Red Hat 15.1.1-5) for x86_64-redhat-linux
-main: llama backend init
-main: load the model and apply lora adapter, if any
-llama_model_load_from_file_impl: using device ROCm0 (AMD Radeon Graphics) - 124523 MiB free
-llama_model_loader: additional 1 GGUFs metadata loaded.
-llama_model_loader: loaded meta data with 34 key-value pairs and 579 tensors from /home/kyuz0/models/qwen-3-30B-A3B/BF16/Qwen3-30B-A3B-BF16-00001-of-00002.gguf (version GGUF V3 (latest))
-llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
-llama_model_loader: - kv   0:                       general.architecture str              = qwen3moe
-llama_model_loader: - kv   1:                               general.type str              = model
-llama_model_loader: - kv   2:                               general.name str              = Qwen3-30B-A3B
-llama_model_loader: - kv   3:                           general.basename str              = Qwen3-30B-A3B
-llama_model_loader: - kv   4:                       general.quantized_by str              = Unsloth
-llama_model_loader: - kv   5:                         general.size_label str              = 30B-A3B
-llama_model_loader: - kv   6:                           general.repo_url str              = https://huggingface.co/unsloth
-llama_model_loader: - kv   7:                       qwen3moe.block_count u32              = 48
-llama_model_loader: - kv   8:                    qwen3moe.context_length u32              = 40960
-llama_model_loader: - kv   9:                  qwen3moe.embedding_length u32              = 2048
-llama_model_loader: - kv  10:               qwen3moe.feed_forward_length u32              = 6144
-llama_model_loader: - kv  11:              qwen3moe.attention.head_count u32              = 32
-llama_model_loader: - kv  12:           qwen3moe.attention.head_count_kv u32              = 4
-llama_model_loader: - kv  13:                    qwen3moe.rope.freq_base f32              = 1000000.000000
-llama_model_loader: - kv  14:  qwen3moe.attention.layer_norm_rms_epsilon f32              = 0.000001
-llama_model_loader: - kv  15:                 qwen3moe.expert_used_count u32              = 8
-llama_model_loader: - kv  16:              qwen3moe.attention.key_length u32              = 128
-llama_model_loader: - kv  17:            qwen3moe.attention.value_length u32              = 128
-llama_model_loader: - kv  18:                          general.file_type u32              = 32
-llama_model_loader: - kv  19:                      qwen3moe.expert_count u32              = 128
-llama_model_loader: - kv  20:        qwen3moe.expert_feed_forward_length u32              = 768
-llama_model_loader: - kv  21:               general.quantization_version u32              = 2
-llama_model_loader: - kv  22:                       tokenizer.ggml.model str              = gpt2
-llama_model_loader: - kv  23:                         tokenizer.ggml.pre str              = qwen2
-llama_model_loader: - kv  24:                      tokenizer.ggml.tokens arr[str,151936]  = ["!", "\"", "#", "$", "%", "&", "'", ...
-llama_model_loader: - kv  25:                  tokenizer.ggml.token_type arr[i32,151936]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
-llama_model_loader: - kv  26:                      tokenizer.ggml.merges arr[str,151387]  = ["Ġ Ġ", "ĠĠ ĠĠ", "i n", "Ġ t",...
-llama_model_loader: - kv  27:                tokenizer.ggml.eos_token_id u32              = 151645
-llama_model_loader: - kv  28:            tokenizer.ggml.padding_token_id u32              = 151654
-llama_model_loader: - kv  29:               tokenizer.ggml.add_bos_token bool             = false
-llama_model_loader: - kv  30:                    tokenizer.chat_template str              = {%- if tools %}\n    {{- '<|im_start|>...
-llama_model_loader: - kv  31:                                   split.no u16              = 0
-llama_model_loader: - kv  32:                                split.count u16              = 2
-llama_model_loader: - kv  33:                        split.tensors.count i32              = 579
-llama_model_loader: - type  f32:  241 tensors
-llama_model_loader: - type bf16:  338 tensors
-print_info: file format = GGUF V3 (latest)
-print_info: file type   = BF16
-print_info: file size   = 56.89 GiB (16.01 BPW) 
-load: special tokens cache size = 26
-load: token to piece cache size = 0.9311 MB
-print_info: arch             = qwen3moe
-print_info: vocab_only       = 0
-print_info: n_ctx_train      = 40960
-print_info: n_embd           = 2048
-print_info: n_layer          = 48
-print_info: n_head           = 32
-print_info: n_head_kv        = 4
-print_info: n_rot            = 128
-print_info: n_swa            = 0
-print_info: is_swa_any       = 0
-print_info: n_embd_head_k    = 128
-print_info: n_embd_head_v    = 128
-print_info: n_gqa            = 8
-print_info: n_embd_k_gqa     = 512
-print_info: n_embd_v_gqa     = 512
-print_info: f_norm_eps       = 0.0e+00
-print_info: f_norm_rms_eps   = 1.0e-06
-print_info: f_clamp_kqv      = 0.0e+00
-print_info: f_max_alibi_bias = 0.0e+00
-print_info: f_logit_scale    = 0.0e+00
-print_info: f_attn_scale     = 0.0e+00
-print_info: n_ff             = 6144
-print_info: n_expert         = 128
-print_info: n_expert_used    = 8
-print_info: causal attn      = 1
-print_info: pooling type     = 0
-print_info: rope type        = 2
-print_info: rope scaling     = linear
-print_info: freq_base_train  = 1000000.0
-print_info: freq_scale_train = 1
-print_info: n_ctx_orig_yarn  = 40960
-print_info: rope_finetuned   = unknown
-print_info: model type       = 30B.A3B
-print_info: model params     = 30.53 B
-print_info: general.name     = Qwen3-30B-A3B
-print_info: n_ff_exp         = 768
-print_info: vocab type       = BPE
-print_info: n_vocab          = 151936
-print_info: n_merges         = 151387
-print_info: BOS token        = 11 ','
-print_info: EOS token        = 151645 '<|im_end|>'
-print_info: EOT token        = 151645 '<|im_end|>'
-print_info: PAD token        = 151654 '<|vision_pad|>'
-print_info: LF token         = 198 'Ċ'
-print_info: FIM PRE token    = 151659 '<|fim_prefix|>'
-print_info: FIM SUF token    = 151661 '<|fim_suffix|>'
-print_info: FIM MID token    = 151660 '<|fim_middle|>'
-print_info: FIM PAD token    = 151662 '<|fim_pad|>'
-print_info: FIM REP token    = 151663 '<|repo_name|>'
-print_info: FIM SEP token    = 151664 '<|file_sep|>'
-print_info: EOG token        = 151643 '<|endoftext|>'
-print_info: EOG token        = 151645 '<|im_end|>'
-print_info: EOG token        = 151662 '<|fim_pad|>'
-print_info: EOG token        = 151663 '<|repo_name|>'
-print_info: EOG token        = 151664 '<|file_sep|>'
-print_info: max token length = 256
-load_tensors: loading model tensors, this can take a while... (mmap = false)
-load_tensors: offloading 48 repeating layers to GPU
-load_tensors: offloading output layer to GPU
-load_tensors: offloaded 49/49 layers to GPU
-load_tensors:        ROCm0 model buffer size = 57666.30 MiB
-load_tensors:    ROCm_Host model buffer size =   593.50 MiB
-...................................................................................................
-llama_context: constructing llama_context
-llama_context: non-unified KV cache requires ggml_set_rows() - forcing unified KV cache
-llama_context: n_seq_max     = 1
-llama_context: n_ctx         = 4096
-llama_context: n_ctx_per_seq = 4096
-llama_context: n_batch       = 2048
-llama_context: n_ubatch      = 512
-llama_context: causal_attn   = 1
-llama_context: flash_attn    = 1
-llama_context: kv_unified    = true
-llama_context: freq_base     = 1000000.0
-llama_context: freq_scale    = 1
-llama_context: n_ctx_per_seq (4096) < n_ctx_train (40960) -- the full capacity of the model will not be utilized
-llama_context:  ROCm_Host  output buffer size =     0.58 MiB
-llama_kv_cache_unified:      ROCm0 KV buffer size =   384.00 MiB
-llama_kv_cache_unified: size =  384.00 MiB (  4096 cells,  48 layers,  1/ 1 seqs), K (f16):  192.00 MiB, V (f16):  192.00 MiB
-llama_kv_cache_unified: LLAMA_SET_ROWS=0, using old ggml_cpy() method for backwards compatibility
-llama_context:      ROCm0 compute buffer size =   300.75 MiB
-llama_context:  ROCm_Host compute buffer size =     8.01 MiB
-llama_context: graph nodes  = 3079
-llama_context: graph splits = 1
-common_init_from_params: added <|endoftext|> logit bias = -inf
-common_init_from_params: added <|im_end|> logit bias = -inf
-common_init_from_params: added <|fim_pad|> logit bias = -inf
-common_init_from_params: added <|repo_name|> logit bias = -inf
-common_init_from_params: added <|file_sep|> logit bias = -inf
-common_init_from_params: setting dry_penalty_last_n to ctx_size = 4096
-common_init_from_params: warming up the model with an empty run - please wait ... (--no-warmup to disable)
-main: llama threadpool init, n_threads = 16
-
-system_info: n_threads = 16 (n_threads_batch = 16) / 32 | ROCm : NO_VMM = 1 | PEER_MAX_BATCH_SIZE = 128 | CPU : SSE3 = 1 | SSSE3 = 1 | AVX = 1 | AVX2 = 1 | F16C = 1 | FMA = 1 | BMI2 = 1 | AVX512 = 1 | AVX512_VBMI = 1 | AVX512_VNNI = 1 | LLAMAFILE = 1 | OPENMP = 1 | REPACK = 1 | 
-
-sampler seed: 3515911169
-sampler params: 
-	repeat_last_n = 64, repeat_penalty = 1.000, frequency_penalty = 0.000, presence_penalty = 0.000
-	dry_multiplier = 0.000, dry_base = 1.750, dry_allowed_length = 2, dry_penalty_last_n = 4096
-	top_k = 40, top_p = 0.950, min_p = 0.050, xtc_probability = 0.000, xtc_threshold = 0.100, typical_p = 1.000, top_n_sigma = -1.000, temp = 0.800
-	mirostat = 0, mirostat_lr = 0.100, mirostat_ent = 5.000
-sampler chain: logits -> logit-bias -> penalties -> dry -> top-n-sigma -> top-k -> typical -> top-p -> min-p -> xtc -> temp-ext -> dist 
-generate: n_ctx = 4096, n_batch = 2048, n_predict = 1, n_keep = 0
-
-Hello *
-
-llama_perf_sampler_print:    sampling time =       0.05 ms /     2 runs   (    0.03 ms per token, 37037.04 tokens per second)
-llama_perf_context_print:        load time =   12423.68 ms
-llama_perf_context_print: prompt eval time =       0.00 ms /     1 tokens (    0.00 ms per token,      inf tokens per second)
-llama_perf_context_print:        eval time =      43.15 ms /     1 runs   (   43.15 ms per token,    23.18 tokens per second)
-llama_perf_context_print:       total time =      62.68 ms /     2 tokens
-llama_perf_context_print:    graphs reused =          0
-    Elapsed #3: 13.032265401s
-    Run #3 status: 0
-  → Avg over 3 runs: 15.930s
@@ -1,167 +0,0 @@
-ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
-ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
-ggml_cuda_init: found 1 ROCm devices:
-  Device 0: AMD Radeon Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32
-build: 6066 (4cb208c9) with cc (GCC) 15.1.1 20250719 (Red Hat 15.1.1-5) for x86_64-redhat-linux
-main: llama backend init
-main: load the model and apply lora adapter, if any
-llama_model_load_from_file_impl: using device ROCm0 (AMD Radeon Graphics) - 124523 MiB free
-llama_model_loader: additional 1 GGUFs metadata loaded.
-llama_model_loader: loaded meta data with 34 key-value pairs and 579 tensors from /home/kyuz0/models/qwen-3-30B-A3B/BF16/Qwen3-30B-A3B-BF16-00001-of-00002.gguf (version GGUF V3 (latest))
-llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
-llama_model_loader: - kv   0:                       general.architecture str              = qwen3moe
-llama_model_loader: - kv   1:                               general.type str              = model
-llama_model_loader: - kv   2:                               general.name str              = Qwen3-30B-A3B
-llama_model_loader: - kv   3:                           general.basename str              = Qwen3-30B-A3B
-llama_model_loader: - kv   4:                       general.quantized_by str              = Unsloth
-llama_model_loader: - kv   5:                         general.size_label str              = 30B-A3B
-llama_model_loader: - kv   6:                           general.repo_url str              = https://huggingface.co/unsloth
-llama_model_loader: - kv   7:                       qwen3moe.block_count u32              = 48
-llama_model_loader: - kv   8:                    qwen3moe.context_length u32              = 40960
-llama_model_loader: - kv   9:                  qwen3moe.embedding_length u32              = 2048
-llama_model_loader: - kv  10:               qwen3moe.feed_forward_length u32              = 6144
-llama_model_loader: - kv  11:              qwen3moe.attention.head_count u32              = 32
-llama_model_loader: - kv  12:           qwen3moe.attention.head_count_kv u32              = 4
-llama_model_loader: - kv  13:                    qwen3moe.rope.freq_base f32              = 1000000.000000
-llama_model_loader: - kv  14:  qwen3moe.attention.layer_norm_rms_epsilon f32              = 0.000001
-llama_model_loader: - kv  15:                 qwen3moe.expert_used_count u32              = 8
-llama_model_loader: - kv  16:              qwen3moe.attention.key_length u32              = 128
-llama_model_loader: - kv  17:            qwen3moe.attention.value_length u32              = 128
-llama_model_loader: - kv  18:                          general.file_type u32              = 32
-llama_model_loader: - kv  19:                      qwen3moe.expert_count u32              = 128
-llama_model_loader: - kv  20:        qwen3moe.expert_feed_forward_length u32              = 768
-llama_model_loader: - kv  21:               general.quantization_version u32              = 2
-llama_model_loader: - kv  22:                       tokenizer.ggml.model str              = gpt2
-llama_model_loader: - kv  23:                         tokenizer.ggml.pre str              = qwen2
-llama_model_loader: - kv  24:                      tokenizer.ggml.tokens arr[str,151936]  = ["!", "\"", "#", "$", "%", "&", "'", ...
-llama_model_loader: - kv  25:                  tokenizer.ggml.token_type arr[i32,151936]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
-llama_model_loader: - kv  26:                      tokenizer.ggml.merges arr[str,151387]  = ["Ġ Ġ", "ĠĠ ĠĠ", "i n", "Ġ t",...
-llama_model_loader: - kv  27:                tokenizer.ggml.eos_token_id u32              = 151645
-llama_model_loader: - kv  28:            tokenizer.ggml.padding_token_id u32              = 151654
-llama_model_loader: - kv  29:               tokenizer.ggml.add_bos_token bool             = false
-llama_model_loader: - kv  30:                    tokenizer.chat_template str              = {%- if tools %}\n    {{- '<|im_start|>...
-llama_model_loader: - kv  31:                                   split.no u16              = 0
-llama_model_loader: - kv  32:                                split.count u16              = 2
-llama_model_loader: - kv  33:                        split.tensors.count i32              = 579
-llama_model_loader: - type  f32:  241 tensors
-llama_model_loader: - type bf16:  338 tensors
-print_info: file format = GGUF V3 (latest)
-print_info: file type   = BF16
-print_info: file size   = 56.89 GiB (16.01 BPW) 
-load: special tokens cache size = 26
-load: token to piece cache size = 0.9311 MB
-print_info: arch             = qwen3moe
-print_info: vocab_only       = 0
-print_info: n_ctx_train      = 40960
-print_info: n_embd           = 2048
-print_info: n_layer          = 48
-print_info: n_head           = 32
-print_info: n_head_kv        = 4
-print_info: n_rot            = 128
-print_info: n_swa            = 0
-print_info: is_swa_any       = 0
-print_info: n_embd_head_k    = 128
-print_info: n_embd_head_v    = 128
-print_info: n_gqa            = 8
-print_info: n_embd_k_gqa     = 512
-print_info: n_embd_v_gqa     = 512
-print_info: f_norm_eps       = 0.0e+00
-print_info: f_norm_rms_eps   = 1.0e-06
-print_info: f_clamp_kqv      = 0.0e+00
-print_info: f_max_alibi_bias = 0.0e+00
-print_info: f_logit_scale    = 0.0e+00
-print_info: f_attn_scale     = 0.0e+00
-print_info: n_ff             = 6144
-print_info: n_expert         = 128
-print_info: n_expert_used    = 8
-print_info: causal attn      = 1
-print_info: pooling type     = 0
-print_info: rope type        = 2
-print_info: rope scaling     = linear
-print_info: freq_base_train  = 1000000.0
-print_info: freq_scale_train = 1
-print_info: n_ctx_orig_yarn  = 40960
-print_info: rope_finetuned   = unknown
-print_info: model type       = 30B.A3B
-print_info: model params     = 30.53 B
-print_info: general.name     = Qwen3-30B-A3B
-print_info: n_ff_exp         = 768
-print_info: vocab type       = BPE
-print_info: n_vocab          = 151936
-print_info: n_merges         = 151387
-print_info: BOS token        = 11 ','
-print_info: EOS token        = 151645 '<|im_end|>'
-print_info: EOT token        = 151645 '<|im_end|>'
-print_info: PAD token        = 151654 '<|vision_pad|>'
-print_info: LF token         = 198 'Ċ'
-print_info: FIM PRE token    = 151659 '<|fim_prefix|>'
-print_info: FIM SUF token    = 151661 '<|fim_suffix|>'
-print_info: FIM MID token    = 151660 '<|fim_middle|>'
-print_info: FIM PAD token    = 151662 '<|fim_pad|>'
-print_info: FIM REP token    = 151663 '<|repo_name|>'
-print_info: FIM SEP token    = 151664 '<|file_sep|>'
-print_info: EOG token        = 151643 '<|endoftext|>'
-print_info: EOG token        = 151645 '<|im_end|>'
-print_info: EOG token        = 151662 '<|fim_pad|>'
-print_info: EOG token        = 151663 '<|repo_name|>'
-print_info: EOG token        = 151664 '<|file_sep|>'
-print_info: max token length = 256
-load_tensors: loading model tensors, this can take a while... (mmap = false)
-load_tensors: offloading 48 repeating layers to GPU
-load_tensors: offloading output layer to GPU
-load_tensors: offloaded 49/49 layers to GPU
-load_tensors:        ROCm0 model buffer size = 57666.30 MiB
-load_tensors:    ROCm_Host model buffer size =   593.50 MiB
-...................................................................................................
-llama_context: constructing llama_context
-llama_context: non-unified KV cache requires ggml_set_rows() - forcing unified KV cache
-llama_context: n_seq_max     = 1
-llama_context: n_ctx         = 4096
-llama_context: n_ctx_per_seq = 4096
-llama_context: n_batch       = 2048
-llama_context: n_ubatch      = 512
-llama_context: causal_attn   = 1
-llama_context: flash_attn    = 1
-llama_context: kv_unified    = true
-llama_context: freq_base     = 1000000.0
-llama_context: freq_scale    = 1
-llama_context: n_ctx_per_seq (4096) < n_ctx_train (40960) -- the full capacity of the model will not be utilized
-llama_context:  ROCm_Host  output buffer size =     0.58 MiB
-llama_kv_cache_unified:      ROCm0 KV buffer size =   384.00 MiB
-llama_kv_cache_unified: size =  384.00 MiB (  4096 cells,  48 layers,  1/ 1 seqs), K (f16):  192.00 MiB, V (f16):  192.00 MiB
-llama_kv_cache_unified: LLAMA_SET_ROWS=0, using old ggml_cpy() method for backwards compatibility
-llama_context:      ROCm0 compute buffer size =   300.75 MiB
-llama_context:  ROCm_Host compute buffer size =     8.01 MiB
-llama_context: graph nodes  = 3079
-llama_context: graph splits = 1
-common_init_from_params: added <|endoftext|> logit bias = -inf
-common_init_from_params: added <|im_end|> logit bias = -inf
-common_init_from_params: added <|fim_pad|> logit bias = -inf
-common_init_from_params: added <|repo_name|> logit bias = -inf
-common_init_from_params: added <|file_sep|> logit bias = -inf
-common_init_from_params: setting dry_penalty_last_n to ctx_size = 4096
-common_init_from_params: warming up the model with an empty run - please wait ... (--no-warmup to disable)
-main: llama threadpool init, n_threads = 16
-
-system_info: n_threads = 16 (n_threads_batch = 16) / 32 | ROCm : NO_VMM = 1 | PEER_MAX_BATCH_SIZE = 128 | CPU : SSE3 = 1 | SSSE3 = 1 | AVX = 1 | AVX2 = 1 | F16C = 1 | FMA = 1 | BMI2 = 1 | AVX512 = 1 | AVX512_VBMI = 1 | AVX512_VNNI = 1 | LLAMAFILE = 1 | OPENMP = 1 | REPACK = 1 | 
-
-sampler seed: 4057380724
-sampler params: 
-	repeat_last_n = 64, repeat_penalty = 1.000, frequency_penalty = 0.000, presence_penalty = 0.000
-	dry_multiplier = 0.000, dry_base = 1.750, dry_allowed_length = 2, dry_penalty_last_n = 4096
-	top_k = 40, top_p = 0.950, min_p = 0.050, xtc_probability = 0.000, xtc_threshold = 0.100, typical_p = 1.000, top_n_sigma = -1.000, temp = 0.800
-	mirostat = 0, mirostat_lr = 0.100, mirostat_ent = 5.000
-sampler chain: logits -> logit-bias -> penalties -> dry -> top-n-sigma -> top-k -> typical -> top-p -> min-p -> xtc -> temp-ext -> dist 
-generate: n_ctx = 4096, n_batch = 2048, n_predict = 1, n_keep = 0
-
-Hello this
-
-llama_perf_sampler_print:    sampling time =       0.05 ms /     2 runs   (    0.03 ms per token, 37037.04 tokens per second)
-llama_perf_context_print:        load time =   21106.31 ms
-llama_perf_context_print: prompt eval time =       0.00 ms /     1 tokens (    0.00 ms per token,      inf tokens per second)
-llama_perf_context_print:        eval time =      43.24 ms /     1 runs   (   43.24 ms per token,    23.13 tokens per second)
-llama_perf_context_print:       total time =      62.41 ms /     2 tokens
-llama_perf_context_print:    graphs reused =          0
-    Elapsed #3: 21.852416396s
-    Run #3 status: 0
-  → Avg over 3 runs: 22.669s
@@ -1,165 +0,0 @@
-ggml_vulkan: Found 1 Vulkan devices:
-ggml_vulkan: 0 = Radeon 8060S Graphics (AMD open-source driver) | uma: 1 | fp16: 1 | bf16: 0 | warp size: 64 | shared memory: 32768 | int dot: 1 | matrix cores: KHR_coopmat
-build: 6060 (9c35706b) with cc (GCC) 15.1.1 20250719 (Red Hat 15.1.1-5) for x86_64-redhat-linux
-main: llama backend init
-main: load the model and apply lora adapter, if any
-llama_model_load_from_file_impl: using device Vulkan0 (Radeon 8060S Graphics) - 85720 MiB free
-llama_model_loader: additional 1 GGUFs metadata loaded.
-llama_model_loader: loaded meta data with 34 key-value pairs and 579 tensors from /home/kyuz0/models/qwen-3-30B-A3B/BF16/Qwen3-30B-A3B-BF16-00001-of-00002.gguf (version GGUF V3 (latest))
-llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
-llama_model_loader: - kv   0:                       general.architecture str              = qwen3moe
-llama_model_loader: - kv   1:                               general.type str              = model
-llama_model_loader: - kv   2:                               general.name str              = Qwen3-30B-A3B
-llama_model_loader: - kv   3:                           general.basename str              = Qwen3-30B-A3B
-llama_model_loader: - kv   4:                       general.quantized_by str              = Unsloth
-llama_model_loader: - kv   5:                         general.size_label str              = 30B-A3B
-llama_model_loader: - kv   6:                           general.repo_url str              = https://huggingface.co/unsloth
-llama_model_loader: - kv   7:                       qwen3moe.block_count u32              = 48
-llama_model_loader: - kv   8:                    qwen3moe.context_length u32              = 40960
-llama_model_loader: - kv   9:                  qwen3moe.embedding_length u32              = 2048
-llama_model_loader: - kv  10:               qwen3moe.feed_forward_length u32              = 6144
-llama_model_loader: - kv  11:              qwen3moe.attention.head_count u32              = 32
-llama_model_loader: - kv  12:           qwen3moe.attention.head_count_kv u32              = 4
-llama_model_loader: - kv  13:                    qwen3moe.rope.freq_base f32              = 1000000.000000
-llama_model_loader: - kv  14:  qwen3moe.attention.layer_norm_rms_epsilon f32              = 0.000001
-llama_model_loader: - kv  15:                 qwen3moe.expert_used_count u32              = 8
-llama_model_loader: - kv  16:              qwen3moe.attention.key_length u32              = 128
-llama_model_loader: - kv  17:            qwen3moe.attention.value_length u32              = 128
-llama_model_loader: - kv  18:                          general.file_type u32              = 32
-llama_model_loader: - kv  19:                      qwen3moe.expert_count u32              = 128
-llama_model_loader: - kv  20:        qwen3moe.expert_feed_forward_length u32              = 768
-llama_model_loader: - kv  21:               general.quantization_version u32              = 2
-llama_model_loader: - kv  22:                       tokenizer.ggml.model str              = gpt2
-llama_model_loader: - kv  23:                         tokenizer.ggml.pre str              = qwen2
-llama_model_loader: - kv  24:                      tokenizer.ggml.tokens arr[str,151936]  = ["!", "\"", "#", "$", "%", "&", "'", ...
-llama_model_loader: - kv  25:                  tokenizer.ggml.token_type arr[i32,151936]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
-llama_model_loader: - kv  26:                      tokenizer.ggml.merges arr[str,151387]  = ["Ġ Ġ", "ĠĠ ĠĠ", "i n", "Ġ t",...
-llama_model_loader: - kv  27:                tokenizer.ggml.eos_token_id u32              = 151645
-llama_model_loader: - kv  28:            tokenizer.ggml.padding_token_id u32              = 151654
-llama_model_loader: - kv  29:               tokenizer.ggml.add_bos_token bool             = false
-llama_model_loader: - kv  30:                    tokenizer.chat_template str              = {%- if tools %}\n    {{- '<|im_start|>...
-llama_model_loader: - kv  31:                                   split.no u16              = 0
-llama_model_loader: - kv  32:                                split.count u16              = 2
-llama_model_loader: - kv  33:                        split.tensors.count i32              = 579
-llama_model_loader: - type  f32:  241 tensors
-llama_model_loader: - type bf16:  338 tensors
-print_info: file format = GGUF V3 (latest)
-print_info: file type   = BF16
-print_info: file size   = 56.89 GiB (16.01 BPW) 
-load: special tokens cache size = 26
-load: token to piece cache size = 0.9311 MB
-print_info: arch             = qwen3moe
-print_info: vocab_only       = 0
-print_info: n_ctx_train      = 40960
-print_info: n_embd           = 2048
-print_info: n_layer          = 48
-print_info: n_head           = 32
-print_info: n_head_kv        = 4
-print_info: n_rot            = 128
-print_info: n_swa            = 0
-print_info: is_swa_any       = 0
-print_info: n_embd_head_k    = 128
-print_info: n_embd_head_v    = 128
-print_info: n_gqa            = 8
-print_info: n_embd_k_gqa     = 512
-print_info: n_embd_v_gqa     = 512
-print_info: f_norm_eps       = 0.0e+00
-print_info: f_norm_rms_eps   = 1.0e-06
-print_info: f_clamp_kqv      = 0.0e+00
-print_info: f_max_alibi_bias = 0.0e+00
-print_info: f_logit_scale    = 0.0e+00
-print_info: f_attn_scale     = 0.0e+00
-print_info: n_ff             = 6144
-print_info: n_expert         = 128
-print_info: n_expert_used    = 8
-print_info: causal attn      = 1
-print_info: pooling type     = 0
-print_info: rope type        = 2
-print_info: rope scaling     = linear
-print_info: freq_base_train  = 1000000.0
-print_info: freq_scale_train = 1
-print_info: n_ctx_orig_yarn  = 40960
-print_info: rope_finetuned   = unknown
-print_info: model type       = 30B.A3B
-print_info: model params     = 30.53 B
-print_info: general.name     = Qwen3-30B-A3B
-print_info: n_ff_exp         = 768
-print_info: vocab type       = BPE
-print_info: n_vocab          = 151936
-print_info: n_merges         = 151387
-print_info: BOS token        = 11 ','
-print_info: EOS token        = 151645 '<|im_end|>'
-print_info: EOT token        = 151645 '<|im_end|>'
-print_info: PAD token        = 151654 '<|vision_pad|>'
-print_info: LF token         = 198 'Ċ'
-print_info: FIM PRE token    = 151659 '<|fim_prefix|>'
-print_info: FIM SUF token    = 151661 '<|fim_suffix|>'
-print_info: FIM MID token    = 151660 '<|fim_middle|>'
-print_info: FIM PAD token    = 151662 '<|fim_pad|>'
-print_info: FIM REP token    = 151663 '<|repo_name|>'
-print_info: FIM SEP token    = 151664 '<|file_sep|>'
-print_info: EOG token        = 151643 '<|endoftext|>'
-print_info: EOG token        = 151645 '<|im_end|>'
-print_info: EOG token        = 151662 '<|fim_pad|>'
-print_info: EOG token        = 151663 '<|repo_name|>'
-print_info: EOG token        = 151664 '<|file_sep|>'
-print_info: max token length = 256
-load_tensors: loading model tensors, this can take a while... (mmap = false)
-load_tensors: offloading 48 repeating layers to GPU
-load_tensors: offloading output layer to GPU
-load_tensors: offloaded 49/49 layers to GPU
-load_tensors:      Vulkan0 model buffer size = 57666.30 MiB
-load_tensors:  Vulkan_Host model buffer size =   593.50 MiB
-...................................................................................................
-llama_context: constructing llama_context
-llama_context: non-unified KV cache requires ggml_set_rows() - forcing unified KV cache
-llama_context: n_seq_max     = 1
-llama_context: n_ctx         = 4096
-llama_context: n_ctx_per_seq = 4096
-llama_context: n_batch       = 2048
-llama_context: n_ubatch      = 512
-llama_context: causal_attn   = 1
-llama_context: flash_attn    = 1
-llama_context: kv_unified    = true
-llama_context: freq_base     = 1000000.0
-llama_context: freq_scale    = 1
-llama_context: n_ctx_per_seq (4096) < n_ctx_train (40960) -- the full capacity of the model will not be utilized
-llama_context: Vulkan_Host  output buffer size =     0.58 MiB
-llama_kv_cache_unified:    Vulkan0 KV buffer size =   384.00 MiB
-llama_kv_cache_unified: size =  384.00 MiB (  4096 cells,  48 layers,  1/ 1 seqs), K (f16):  192.00 MiB, V (f16):  192.00 MiB
-llama_kv_cache_unified: LLAMA_SET_ROWS=0, using old ggml_cpy() method for backwards compatibility
-llama_context:    Vulkan0 compute buffer size =   304.75 MiB
-llama_context: Vulkan_Host compute buffer size =    12.01 MiB
-llama_context: graph nodes  = 3079
-llama_context: graph splits = 2
-common_init_from_params: added <|endoftext|> logit bias = -inf
-common_init_from_params: added <|im_end|> logit bias = -inf
-common_init_from_params: added <|fim_pad|> logit bias = -inf
-common_init_from_params: added <|repo_name|> logit bias = -inf
-common_init_from_params: added <|file_sep|> logit bias = -inf
-common_init_from_params: setting dry_penalty_last_n to ctx_size = 4096
-common_init_from_params: warming up the model with an empty run - please wait ... (--no-warmup to disable)
-main: llama threadpool init, n_threads = 16
-
-system_info: n_threads = 16 (n_threads_batch = 16) / 32 | CPU : SSE3 = 1 | SSSE3 = 1 | AVX = 1 | AVX2 = 1 | F16C = 1 | FMA = 1 | BMI2 = 1 | AVX512 = 1 | AVX512_VBMI = 1 | AVX512_VNNI = 1 | LLAMAFILE = 1 | OPENMP = 1 | REPACK = 1 | 
-
-sampler seed: 157667903
-sampler params: 
-	repeat_last_n = 64, repeat_penalty = 1.000, frequency_penalty = 0.000, presence_penalty = 0.000
-	dry_multiplier = 0.000, dry_base = 1.750, dry_allowed_length = 2, dry_penalty_last_n = 4096
-	top_k = 40, top_p = 0.950, min_p = 0.050, xtc_probability = 0.000, xtc_threshold = 0.100, typical_p = 1.000, top_n_sigma = -1.000, temp = 0.800
-	mirostat = 0, mirostat_lr = 0.100, mirostat_ent = 5.000
-sampler chain: logits -> logit-bias -> penalties -> dry -> top-n-sigma -> top-k -> typical -> top-p -> min-p -> xtc -> temp-ext -> dist 
-generate: n_ctx = 4096, n_batch = 2048, n_predict = 1, n_keep = 0
-
-Hello and
-
-llama_perf_sampler_print:    sampling time =       0.08 ms /     2 runs   (    0.04 ms per token, 24390.24 tokens per second)
-llama_perf_context_print:        load time =   10008.37 ms
-llama_perf_context_print: prompt eval time =       0.00 ms /     1 tokens (    0.00 ms per token,      inf tokens per second)
-llama_perf_context_print:        eval time =     128.73 ms /     1 runs   (  128.73 ms per token,     7.77 tokens per second)
-llama_perf_context_print:       total time =     155.88 ms /     2 tokens
-llama_perf_context_print:    graphs reused =          0
-    Elapsed #3: 10.759732568s
-    Run #3 status: 0
-  → Avg over 3 runs: 12.935s
@@ -1,165 +0,0 @@
-ggml_vulkan: Found 1 Vulkan devices:
-ggml_vulkan: 0 = Radeon 8060S Graphics (RADV GFX1151) (radv) | uma: 1 | fp16: 1 | bf16: 0 | warp size: 64 | shared memory: 65536 | int dot: 1 | matrix cores: KHR_coopmat
-build: 6040 (66625a59) with cc (GCC) 15.1.1 20250719 (Red Hat 15.1.1-5) for x86_64-redhat-linux
-main: llama backend init
-main: load the model and apply lora adapter, if any
-llama_model_load_from_file_impl: using device Vulkan0 (Radeon 8060S Graphics (RADV GFX1151)) - 87722 MiB free
-llama_model_loader: additional 1 GGUFs metadata loaded.
-llama_model_loader: loaded meta data with 34 key-value pairs and 579 tensors from /home/kyuz0/models/qwen-3-30B-A3B/BF16/Qwen3-30B-A3B-BF16-00001-of-00002.gguf (version GGUF V3 (latest))
-llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
-llama_model_loader: - kv   0:                       general.architecture str              = qwen3moe
-llama_model_loader: - kv   1:                               general.type str              = model
-llama_model_loader: - kv   2:                               general.name str              = Qwen3-30B-A3B
-llama_model_loader: - kv   3:                           general.basename str              = Qwen3-30B-A3B
-llama_model_loader: - kv   4:                       general.quantized_by str              = Unsloth
-llama_model_loader: - kv   5:                         general.size_label str              = 30B-A3B
-llama_model_loader: - kv   6:                           general.repo_url str              = https://huggingface.co/unsloth
-llama_model_loader: - kv   7:                       qwen3moe.block_count u32              = 48
-llama_model_loader: - kv   8:                    qwen3moe.context_length u32              = 40960
-llama_model_loader: - kv   9:                  qwen3moe.embedding_length u32              = 2048
-llama_model_loader: - kv  10:               qwen3moe.feed_forward_length u32              = 6144
-llama_model_loader: - kv  11:              qwen3moe.attention.head_count u32              = 32
-llama_model_loader: - kv  12:           qwen3moe.attention.head_count_kv u32              = 4
-llama_model_loader: - kv  13:                    qwen3moe.rope.freq_base f32              = 1000000.000000
-llama_model_loader: - kv  14:  qwen3moe.attention.layer_norm_rms_epsilon f32              = 0.000001
-llama_model_loader: - kv  15:                 qwen3moe.expert_used_count u32              = 8
-llama_model_loader: - kv  16:              qwen3moe.attention.key_length u32              = 128
-llama_model_loader: - kv  17:            qwen3moe.attention.value_length u32              = 128
-llama_model_loader: - kv  18:                          general.file_type u32              = 32
-llama_model_loader: - kv  19:                      qwen3moe.expert_count u32              = 128
-llama_model_loader: - kv  20:        qwen3moe.expert_feed_forward_length u32              = 768
-llama_model_loader: - kv  21:               general.quantization_version u32              = 2
-llama_model_loader: - kv  22:                       tokenizer.ggml.model str              = gpt2
-llama_model_loader: - kv  23:                         tokenizer.ggml.pre str              = qwen2
-llama_model_loader: - kv  24:                      tokenizer.ggml.tokens arr[str,151936]  = ["!", "\"", "#", "$", "%", "&", "'", ...
-llama_model_loader: - kv  25:                  tokenizer.ggml.token_type arr[i32,151936]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
-llama_model_loader: - kv  26:                      tokenizer.ggml.merges arr[str,151387]  = ["Ġ Ġ", "ĠĠ ĠĠ", "i n", "Ġ t",...
-llama_model_loader: - kv  27:                tokenizer.ggml.eos_token_id u32              = 151645
-llama_model_loader: - kv  28:            tokenizer.ggml.padding_token_id u32              = 151654
-llama_model_loader: - kv  29:               tokenizer.ggml.add_bos_token bool             = false
-llama_model_loader: - kv  30:                    tokenizer.chat_template str              = {%- if tools %}\n    {{- '<|im_start|>...
-llama_model_loader: - kv  31:                                   split.no u16              = 0
-llama_model_loader: - kv  32:                                split.count u16              = 2
-llama_model_loader: - kv  33:                        split.tensors.count i32              = 579
-llama_model_loader: - type  f32:  241 tensors
-llama_model_loader: - type bf16:  338 tensors
-print_info: file format = GGUF V3 (latest)
-print_info: file type   = BF16
-print_info: file size   = 56.89 GiB (16.01 BPW) 
-load: special tokens cache size = 26
-load: token to piece cache size = 0.9311 MB
-print_info: arch             = qwen3moe
-print_info: vocab_only       = 0
-print_info: n_ctx_train      = 40960
-print_info: n_embd           = 2048
-print_info: n_layer          = 48
-print_info: n_head           = 32
-print_info: n_head_kv        = 4
-print_info: n_rot            = 128
-print_info: n_swa            = 0
-print_info: is_swa_any       = 0
-print_info: n_embd_head_k    = 128
-print_info: n_embd_head_v    = 128
-print_info: n_gqa            = 8
-print_info: n_embd_k_gqa     = 512
-print_info: n_embd_v_gqa     = 512
-print_info: f_norm_eps       = 0.0e+00
-print_info: f_norm_rms_eps   = 1.0e-06
-print_info: f_clamp_kqv      = 0.0e+00
-print_info: f_max_alibi_bias = 0.0e+00
-print_info: f_logit_scale    = 0.0e+00
-print_info: f_attn_scale     = 0.0e+00
-print_info: n_ff             = 6144
-print_info: n_expert         = 128
-print_info: n_expert_used    = 8
-print_info: causal attn      = 1
-print_info: pooling type     = 0
-print_info: rope type        = 2
-print_info: rope scaling     = linear
-print_info: freq_base_train  = 1000000.0
-print_info: freq_scale_train = 1
-print_info: n_ctx_orig_yarn  = 40960
-print_info: rope_finetuned   = unknown
-print_info: model type       = 30B.A3B
-print_info: model params     = 30.53 B
-print_info: general.name     = Qwen3-30B-A3B
-print_info: n_ff_exp         = 768
-print_info: vocab type       = BPE
-print_info: n_vocab          = 151936
-print_info: n_merges         = 151387
-print_info: BOS token        = 11 ','
-print_info: EOS token        = 151645 '<|im_end|>'
-print_info: EOT token        = 151645 '<|im_end|>'
-print_info: PAD token        = 151654 '<|vision_pad|>'
-print_info: LF token         = 198 'Ċ'
-print_info: FIM PRE token    = 151659 '<|fim_prefix|>'
-print_info: FIM SUF token    = 151661 '<|fim_suffix|>'
-print_info: FIM MID token    = 151660 '<|fim_middle|>'
-print_info: FIM PAD token    = 151662 '<|fim_pad|>'
-print_info: FIM REP token    = 151663 '<|repo_name|>'
-print_info: FIM SEP token    = 151664 '<|file_sep|>'
-print_info: EOG token        = 151643 '<|endoftext|>'
-print_info: EOG token        = 151645 '<|im_end|>'
-print_info: EOG token        = 151662 '<|fim_pad|>'
-print_info: EOG token        = 151663 '<|repo_name|>'
-print_info: EOG token        = 151664 '<|file_sep|>'
-print_info: max token length = 256
-load_tensors: loading model tensors, this can take a while... (mmap = false)
-load_tensors: offloading 48 repeating layers to GPU
-load_tensors: offloading output layer to GPU
-load_tensors: offloaded 49/49 layers to GPU
-load_tensors:      Vulkan0 model buffer size = 57666.30 MiB
-load_tensors:  Vulkan_Host model buffer size =   593.50 MiB
-...................................................................................................
-llama_context: constructing llama_context
-llama_context: non-unified KV cache requires ggml_set_rows() - forcing unified KV cache
-llama_context: n_seq_max     = 1
-llama_context: n_ctx         = 4096
-llama_context: n_ctx_per_seq = 4096
-llama_context: n_batch       = 2048
-llama_context: n_ubatch      = 512
-llama_context: causal_attn   = 1
-llama_context: flash_attn    = 1
-llama_context: kv_unified    = true
-llama_context: freq_base     = 1000000.0
-llama_context: freq_scale    = 1
-llama_context: n_ctx_per_seq (4096) < n_ctx_train (40960) -- the full capacity of the model will not be utilized
-llama_context: Vulkan_Host  output buffer size =     0.58 MiB
-llama_kv_cache_unified:    Vulkan0 KV buffer size =   384.00 MiB
-llama_kv_cache_unified: size =  384.00 MiB (  4096 cells,  48 layers,  1/ 1 seqs), K (f16):  192.00 MiB, V (f16):  192.00 MiB
-llama_kv_cache_unified: LLAMA_SET_ROWS=0, using old ggml_cpy() method for backwards compatibility
-llama_context:    Vulkan0 compute buffer size =   304.75 MiB
-llama_context: Vulkan_Host compute buffer size =    12.01 MiB
-llama_context: graph nodes  = 3079
-llama_context: graph splits = 2
-common_init_from_params: added <|endoftext|> logit bias = -inf
-common_init_from_params: added <|im_end|> logit bias = -inf
-common_init_from_params: added <|fim_pad|> logit bias = -inf
-common_init_from_params: added <|repo_name|> logit bias = -inf
-common_init_from_params: added <|file_sep|> logit bias = -inf
-common_init_from_params: setting dry_penalty_last_n to ctx_size = 4096
-common_init_from_params: warming up the model with an empty run - please wait ... (--no-warmup to disable)
-main: llama threadpool init, n_threads = 16
-
-system_info: n_threads = 16 (n_threads_batch = 16) / 32 | CPU : SSE3 = 1 | SSSE3 = 1 | AVX = 1 | AVX2 = 1 | F16C = 1 | FMA = 1 | BMI2 = 1 | AVX512 = 1 | AVX512_VBMI = 1 | AVX512_VNNI = 1 | LLAMAFILE = 1 | OPENMP = 1 | REPACK = 1 | 
-
-sampler seed: 1118253234
-sampler params: 
-	repeat_last_n = 64, repeat_penalty = 1.000, frequency_penalty = 0.000, presence_penalty = 0.000
-	dry_multiplier = 0.000, dry_base = 1.750, dry_allowed_length = 2, dry_penalty_last_n = 4096
-	top_k = 40, top_p = 0.950, min_p = 0.050, xtc_probability = 0.000, xtc_threshold = 0.100, typical_p = 1.000, top_n_sigma = -1.000, temp = 0.800
-	mirostat = 0, mirostat_lr = 0.100, mirostat_ent = 5.000
-sampler chain: logits -> logit-bias -> penalties -> dry -> top-n-sigma -> top-k -> typical -> top-p -> min-p -> xtc -> temp-ext -> dist 
-generate: n_ctx = 4096, n_batch = 2048, n_predict = 1, n_keep = 0
-
-Hello -
-
-llama_perf_sampler_print:    sampling time =       0.08 ms /     2 runs   (    0.04 ms per token, 25316.46 tokens per second)
-llama_perf_context_print:        load time =   12501.96 ms
-llama_perf_context_print: prompt eval time =       0.00 ms /     1 tokens (    0.00 ms per token,      inf tokens per second)
-llama_perf_context_print:        eval time =     137.49 ms /     1 runs   (  137.49 ms per token,     7.27 tokens per second)
-llama_perf_context_print:       total time =     164.69 ms /     2 tokens
-llama_perf_context_print:    graphs reused =          0
-    Elapsed #3: 13.022605949s
-    Run #3 status: 0
-  → Avg over 3 runs: 14.761s
@@ -1,176 +0,0 @@
-ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
-ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
-ggml_cuda_init: found 1 ROCm devices:
-  Device 0: Radeon 8060S Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32
-build: 6040 (66625a59) with cc (GCC) 15.1.1 20250521 (Red Hat 15.1.1-2) for x86_64-redhat-linux
-main: llama backend init
-main: load the model and apply lora adapter, if any
-llama_model_load_from_file_impl: using device ROCm0 (Radeon 8060S Graphics) - 124522 MiB free
-llama_model_loader: additional 1 GGUFs metadata loaded.
-llama_model_loader: loaded meta data with 43 key-value pairs and 579 tensors from /home/kyuz0/models/qwen3-coder-30B-A3B/BF16/Qwen3-Coder-30B-A3B-Instruct-BF16-00001-of-00002.gguf (version GGUF V3 (latest))
-llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
-llama_model_loader: - kv   0:                       general.architecture str              = qwen3moe
-llama_model_loader: - kv   1:                               general.type str              = model
-llama_model_loader: - kv   2:                               general.name str              = Qwen3-Coder-30B-A3B-Instruct
-llama_model_loader: - kv   3:                           general.finetune str              = Instruct
-llama_model_loader: - kv   4:                           general.basename str              = Qwen3-Coder-30B-A3B-Instruct
-llama_model_loader: - kv   5:                       general.quantized_by str              = Unsloth
-llama_model_loader: - kv   6:                         general.size_label str              = 30B-A3B
-llama_model_loader: - kv   7:                            general.license str              = apache-2.0
-llama_model_loader: - kv   8:                       general.license.link str              = https://huggingface.co/Qwen/Qwen3-Cod...
-llama_model_loader: - kv   9:                           general.repo_url str              = https://huggingface.co/unsloth
-llama_model_loader: - kv  10:                   general.base_model.count u32              = 1
-llama_model_loader: - kv  11:                  general.base_model.0.name str              = Qwen3 Coder 30B A3B Instruct
-llama_model_loader: - kv  12:          general.base_model.0.organization str              = Qwen
-llama_model_loader: - kv  13:              general.base_model.0.repo_url str              = https://huggingface.co/Qwen/Qwen3-Cod...
-llama_model_loader: - kv  14:                               general.tags arr[str,2]       = ["unsloth", "text-generation"]
-llama_model_loader: - kv  15:                       qwen3moe.block_count u32              = 48
-llama_model_loader: - kv  16:                    qwen3moe.context_length u32              = 262144
-llama_model_loader: - kv  17:                  qwen3moe.embedding_length u32              = 2048
-llama_model_loader: - kv  18:               qwen3moe.feed_forward_length u32              = 5472
-llama_model_loader: - kv  19:              qwen3moe.attention.head_count u32              = 32
-llama_model_loader: - kv  20:           qwen3moe.attention.head_count_kv u32              = 4
-llama_model_loader: - kv  21:                    qwen3moe.rope.freq_base f32              = 10000000.000000
-llama_model_loader: - kv  22:  qwen3moe.attention.layer_norm_rms_epsilon f32              = 0.000001
-llama_model_loader: - kv  23:                 qwen3moe.expert_used_count u32              = 8
-llama_model_loader: - kv  24:              qwen3moe.attention.key_length u32              = 128
-llama_model_loader: - kv  25:            qwen3moe.attention.value_length u32              = 128
-llama_model_loader: - kv  26:                          general.file_type u32              = 32
-llama_model_loader: - kv  27:                      qwen3moe.expert_count u32              = 128
-llama_model_loader: - kv  28:        qwen3moe.expert_feed_forward_length u32              = 768
-llama_model_loader: - kv  29: qwen3moe.expert_shared_feed_forward_length u32              = 0
-llama_model_loader: - kv  30:               general.quantization_version u32              = 2
-llama_model_loader: - kv  31:                       tokenizer.ggml.model str              = gpt2
-llama_model_loader: - kv  32:                         tokenizer.ggml.pre str              = qwen2
-llama_model_loader: - kv  33:                      tokenizer.ggml.tokens arr[str,151936]  = ["!", "\"", "#", "$", "%", "&", "'", ...
-llama_model_loader: - kv  34:                  tokenizer.ggml.token_type arr[i32,151936]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
-llama_model_loader: - kv  35:                      tokenizer.ggml.merges arr[str,151387]  = ["Ġ Ġ", "ĠĠ ĠĠ", "i n", "Ġ t",...
-llama_model_loader: - kv  36:                tokenizer.ggml.eos_token_id u32              = 151645
-llama_model_loader: - kv  37:            tokenizer.ggml.padding_token_id u32              = 151654
-llama_model_loader: - kv  38:               tokenizer.ggml.add_bos_token bool             = false
-llama_model_loader: - kv  39:                    tokenizer.chat_template str              = {#- Copyright 2025-present the Unslot...
-llama_model_loader: - kv  40:                                   split.no u16              = 0
-llama_model_loader: - kv  41:                                split.count u16              = 2
-llama_model_loader: - kv  42:                        split.tensors.count i32              = 579
-llama_model_loader: - type  f32:  241 tensors
-llama_model_loader: - type bf16:  338 tensors
-print_info: file format = GGUF V3 (latest)
-print_info: file type   = BF16
-print_info: file size   = 56.89 GiB (16.01 BPW) 
-load: special tokens cache size = 26
-load: token to piece cache size = 0.9311 MB
-print_info: arch             = qwen3moe
-print_info: vocab_only       = 0
-print_info: n_ctx_train      = 262144
-print_info: n_embd           = 2048
-print_info: n_layer          = 48
-print_info: n_head           = 32
-print_info: n_head_kv        = 4
-print_info: n_rot            = 128
-print_info: n_swa            = 0
-print_info: is_swa_any       = 0
-print_info: n_embd_head_k    = 128
-print_info: n_embd_head_v    = 128
-print_info: n_gqa            = 8
-print_info: n_embd_k_gqa     = 512
-print_info: n_embd_v_gqa     = 512
-print_info: f_norm_eps       = 0.0e+00
-print_info: f_norm_rms_eps   = 1.0e-06
-print_info: f_clamp_kqv      = 0.0e+00
-print_info: f_max_alibi_bias = 0.0e+00
-print_info: f_logit_scale    = 0.0e+00
-print_info: f_attn_scale     = 0.0e+00
-print_info: n_ff             = 5472
-print_info: n_expert         = 128
-print_info: n_expert_used    = 8
-print_info: causal attn      = 1
-print_info: pooling type     = 0
-print_info: rope type        = 2
-print_info: rope scaling     = linear
-print_info: freq_base_train  = 10000000.0
-print_info: freq_scale_train = 1
-print_info: n_ctx_orig_yarn  = 262144
-print_info: rope_finetuned   = unknown
-print_info: model type       = 30B.A3B
-print_info: model params     = 30.53 B
-print_info: general.name     = Qwen3-Coder-30B-A3B-Instruct
-print_info: n_ff_exp         = 768
-print_info: vocab type       = BPE
-print_info: n_vocab          = 151936
-print_info: n_merges         = 151387
-print_info: BOS token        = 11 ','
-print_info: EOS token        = 151645 '<|im_end|>'
-print_info: EOT token        = 151645 '<|im_end|>'
-print_info: PAD token        = 151654 '<|vision_pad|>'
-print_info: LF token         = 198 'Ċ'
-print_info: FIM PRE token    = 151659 '<|fim_prefix|>'
-print_info: FIM SUF token    = 151661 '<|fim_suffix|>'
-print_info: FIM MID token    = 151660 '<|fim_middle|>'
-print_info: FIM PAD token    = 151662 '<|fim_pad|>'
-print_info: FIM REP token    = 151663 '<|repo_name|>'
-print_info: FIM SEP token    = 151664 '<|file_sep|>'
-print_info: EOG token        = 151643 '<|endoftext|>'
-print_info: EOG token        = 151645 '<|im_end|>'
-print_info: EOG token        = 151662 '<|fim_pad|>'
-print_info: EOG token        = 151663 '<|repo_name|>'
-print_info: EOG token        = 151664 '<|file_sep|>'
-print_info: max token length = 256
-load_tensors: loading model tensors, this can take a while... (mmap = false)
-load_tensors: offloading 48 repeating layers to GPU
-load_tensors: offloading output layer to GPU
-load_tensors: offloaded 49/49 layers to GPU
-load_tensors:        ROCm0 model buffer size = 57666.30 MiB
-load_tensors:    ROCm_Host model buffer size =   593.50 MiB
-...................................................................................................
-llama_context: constructing llama_context
-llama_context: non-unified KV cache requires ggml_set_rows() - forcing unified KV cache
-llama_context: n_seq_max     = 1
-llama_context: n_ctx         = 4096
-llama_context: n_ctx_per_seq = 4096
-llama_context: n_batch       = 2048
-llama_context: n_ubatch      = 512
-llama_context: causal_attn   = 1
-llama_context: flash_attn    = 1
-llama_context: kv_unified    = true
-llama_context: freq_base     = 10000000.0
-llama_context: freq_scale    = 1
-llama_context: n_ctx_per_seq (4096) < n_ctx_train (262144) -- the full capacity of the model will not be utilized
-llama_context:  ROCm_Host  output buffer size =     0.58 MiB
-llama_kv_cache_unified:      ROCm0 KV buffer size =   384.00 MiB
-llama_kv_cache_unified: size =  384.00 MiB (  4096 cells,  48 layers,  1/ 1 seqs), K (f16):  192.00 MiB, V (f16):  192.00 MiB
-llama_kv_cache_unified: LLAMA_SET_ROWS=0, using old ggml_cpy() method for backwards compatibility
-llama_context:      ROCm0 compute buffer size =   300.75 MiB
-llama_context:  ROCm_Host compute buffer size =     8.01 MiB
-llama_context: graph nodes  = 3079
-llama_context: graph splits = 1
-common_init_from_params: added <|endoftext|> logit bias = -inf
-common_init_from_params: added <|im_end|> logit bias = -inf
-common_init_from_params: added <|fim_pad|> logit bias = -inf
-common_init_from_params: added <|repo_name|> logit bias = -inf
-common_init_from_params: added <|file_sep|> logit bias = -inf
-common_init_from_params: setting dry_penalty_last_n to ctx_size = 4096
-common_init_from_params: warming up the model with an empty run - please wait ... (--no-warmup to disable)
-main: llama threadpool init, n_threads = 16
-
-system_info: n_threads = 16 (n_threads_batch = 16) / 32 | ROCm : NO_VMM = 1 | PEER_MAX_BATCH_SIZE = 128 | CPU : SSE3 = 1 | SSSE3 = 1 | AVX = 1 | AVX2 = 1 | F16C = 1 | FMA = 1 | BMI2 = 1 | AVX512 = 1 | AVX512_VBMI = 1 | AVX512_VNNI = 1 | LLAMAFILE = 1 | OPENMP = 1 | REPACK = 1 | 
-
-sampler seed: 3288748167
-sampler params: 
-	repeat_last_n = 64, repeat_penalty = 1.000, frequency_penalty = 0.000, presence_penalty = 0.000
-	dry_multiplier = 0.000, dry_base = 1.750, dry_allowed_length = 2, dry_penalty_last_n = 4096
-	top_k = 40, top_p = 0.950, min_p = 0.050, xtc_probability = 0.000, xtc_threshold = 0.100, typical_p = 1.000, top_n_sigma = -1.000, temp = 0.800
-	mirostat = 0, mirostat_lr = 0.100, mirostat_ent = 5.000
-sampler chain: logits -> logit-bias -> penalties -> dry -> top-n-sigma -> top-k -> typical -> top-p -> min-p -> xtc -> temp-ext -> dist 
-generate: n_ctx = 4096, n_batch = 2048, n_predict = 1, n_keep = 0
-
-Hello:
-
-llama_perf_sampler_print:    sampling time =       0.05 ms /     2 runs   (    0.03 ms per token, 38461.54 tokens per second)
-llama_perf_context_print:        load time =   12175.61 ms
-llama_perf_context_print: prompt eval time =       0.00 ms /     1 tokens (    0.00 ms per token,      inf tokens per second)
-llama_perf_context_print:        eval time =      42.43 ms /     1 runs   (   42.43 ms per token,    23.57 tokens per second)
-llama_perf_context_print:       total time =      81.77 ms /     2 tokens
-llama_perf_context_print:    graphs reused =          0
-    Elapsed #3: 16.099845533s
-    Run #3 status: 0
-  → Avg over 3 runs: 17.779s
@@ -1,176 +0,0 @@
-ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
-ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
-ggml_cuda_init: found 1 ROCm devices:
-  Device 0: AMD Radeon Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32
-build: 6040 (66625a59) with cc (GCC) 15.1.1 20250719 (Red Hat 15.1.1-5) for x86_64-redhat-linux
-main: llama backend init
-main: load the model and apply lora adapter, if any
-llama_model_load_from_file_impl: using device ROCm0 (AMD Radeon Graphics) - 124523 MiB free
-llama_model_loader: additional 1 GGUFs metadata loaded.
-llama_model_loader: loaded meta data with 43 key-value pairs and 579 tensors from /home/kyuz0/models/qwen3-coder-30B-A3B/BF16/Qwen3-Coder-30B-A3B-Instruct-BF16-00001-of-00002.gguf (version GGUF V3 (latest))
-llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
-llama_model_loader: - kv   0:                       general.architecture str              = qwen3moe
-llama_model_loader: - kv   1:                               general.type str              = model
-llama_model_loader: - kv   2:                               general.name str              = Qwen3-Coder-30B-A3B-Instruct
-llama_model_loader: - kv   3:                           general.finetune str              = Instruct
-llama_model_loader: - kv   4:                           general.basename str              = Qwen3-Coder-30B-A3B-Instruct
-llama_model_loader: - kv   5:                       general.quantized_by str              = Unsloth
-llama_model_loader: - kv   6:                         general.size_label str              = 30B-A3B
-llama_model_loader: - kv   7:                            general.license str              = apache-2.0
-llama_model_loader: - kv   8:                       general.license.link str              = https://huggingface.co/Qwen/Qwen3-Cod...
-llama_model_loader: - kv   9:                           general.repo_url str              = https://huggingface.co/unsloth
-llama_model_loader: - kv  10:                   general.base_model.count u32              = 1
-llama_model_loader: - kv  11:                  general.base_model.0.name str              = Qwen3 Coder 30B A3B Instruct
-llama_model_loader: - kv  12:          general.base_model.0.organization str              = Qwen
-llama_model_loader: - kv  13:              general.base_model.0.repo_url str              = https://huggingface.co/Qwen/Qwen3-Cod...
-llama_model_loader: - kv  14:                               general.tags arr[str,2]       = ["unsloth", "text-generation"]
-llama_model_loader: - kv  15:                       qwen3moe.block_count u32              = 48
-llama_model_loader: - kv  16:                    qwen3moe.context_length u32              = 262144
-llama_model_loader: - kv  17:                  qwen3moe.embedding_length u32              = 2048
-llama_model_loader: - kv  18:               qwen3moe.feed_forward_length u32              = 5472
-llama_model_loader: - kv  19:              qwen3moe.attention.head_count u32              = 32
-llama_model_loader: - kv  20:           qwen3moe.attention.head_count_kv u32              = 4
-llama_model_loader: - kv  21:                    qwen3moe.rope.freq_base f32              = 10000000.000000
-llama_model_loader: - kv  22:  qwen3moe.attention.layer_norm_rms_epsilon f32              = 0.000001
-llama_model_loader: - kv  23:                 qwen3moe.expert_used_count u32              = 8
-llama_model_loader: - kv  24:              qwen3moe.attention.key_length u32              = 128
-llama_model_loader: - kv  25:            qwen3moe.attention.value_length u32              = 128
-llama_model_loader: - kv  26:                          general.file_type u32              = 32
-llama_model_loader: - kv  27:                      qwen3moe.expert_count u32              = 128
-llama_model_loader: - kv  28:        qwen3moe.expert_feed_forward_length u32              = 768
-llama_model_loader: - kv  29: qwen3moe.expert_shared_feed_forward_length u32              = 0
-llama_model_loader: - kv  30:               general.quantization_version u32              = 2
-llama_model_loader: - kv  31:                       tokenizer.ggml.model str              = gpt2
-llama_model_loader: - kv  32:                         tokenizer.ggml.pre str              = qwen2
-llama_model_loader: - kv  33:                      tokenizer.ggml.tokens arr[str,151936]  = ["!", "\"", "#", "$", "%", "&", "'", ...
-llama_model_loader: - kv  34:                  tokenizer.ggml.token_type arr[i32,151936]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
-llama_model_loader: - kv  35:                      tokenizer.ggml.merges arr[str,151387]  = ["Ġ Ġ", "ĠĠ ĠĠ", "i n", "Ġ t",...
-llama_model_loader: - kv  36:                tokenizer.ggml.eos_token_id u32              = 151645
-llama_model_loader: - kv  37:            tokenizer.ggml.padding_token_id u32              = 151654
-llama_model_loader: - kv  38:               tokenizer.ggml.add_bos_token bool             = false
-llama_model_loader: - kv  39:                    tokenizer.chat_template str              = {#- Copyright 2025-present the Unslot...
-llama_model_loader: - kv  40:                                   split.no u16              = 0
-llama_model_loader: - kv  41:                                split.count u16              = 2
-llama_model_loader: - kv  42:                        split.tensors.count i32              = 579
-llama_model_loader: - type  f32:  241 tensors
-llama_model_loader: - type bf16:  338 tensors
-print_info: file format = GGUF V3 (latest)
-print_info: file type   = BF16
-print_info: file size   = 56.89 GiB (16.01 BPW) 
-load: special tokens cache size = 26
-load: token to piece cache size = 0.9311 MB
-print_info: arch             = qwen3moe
-print_info: vocab_only       = 0
-print_info: n_ctx_train      = 262144
-print_info: n_embd           = 2048
-print_info: n_layer          = 48
-print_info: n_head           = 32
-print_info: n_head_kv        = 4
-print_info: n_rot            = 128
-print_info: n_swa            = 0
-print_info: is_swa_any       = 0
-print_info: n_embd_head_k    = 128
-print_info: n_embd_head_v    = 128
-print_info: n_gqa            = 8
-print_info: n_embd_k_gqa     = 512
-print_info: n_embd_v_gqa     = 512
-print_info: f_norm_eps       = 0.0e+00
-print_info: f_norm_rms_eps   = 1.0e-06
-print_info: f_clamp_kqv      = 0.0e+00
-print_info: f_max_alibi_bias = 0.0e+00
-print_info: f_logit_scale    = 0.0e+00
-print_info: f_attn_scale     = 0.0e+00
-print_info: n_ff             = 5472
-print_info: n_expert         = 128
-print_info: n_expert_used    = 8
-print_info: causal attn      = 1
-print_info: pooling type     = 0
-print_info: rope type        = 2
-print_info: rope scaling     = linear
-print_info: freq_base_train  = 10000000.0
-print_info: freq_scale_train = 1
-print_info: n_ctx_orig_yarn  = 262144
-print_info: rope_finetuned   = unknown
-print_info: model type       = 30B.A3B
-print_info: model params     = 30.53 B
-print_info: general.name     = Qwen3-Coder-30B-A3B-Instruct
-print_info: n_ff_exp         = 768
-print_info: vocab type       = BPE
-print_info: n_vocab          = 151936
-print_info: n_merges         = 151387
-print_info: BOS token        = 11 ','
-print_info: EOS token        = 151645 '<|im_end|>'
-print_info: EOT token        = 151645 '<|im_end|>'
-print_info: PAD token        = 151654 '<|vision_pad|>'
-print_info: LF token         = 198 'Ċ'
-print_info: FIM PRE token    = 151659 '<|fim_prefix|>'
-print_info: FIM SUF token    = 151661 '<|fim_suffix|>'
-print_info: FIM MID token    = 151660 '<|fim_middle|>'
-print_info: FIM PAD token    = 151662 '<|fim_pad|>'
-print_info: FIM REP token    = 151663 '<|repo_name|>'
-print_info: FIM SEP token    = 151664 '<|file_sep|>'
-print_info: EOG token        = 151643 '<|endoftext|>'
-print_info: EOG token        = 151645 '<|im_end|>'
-print_info: EOG token        = 151662 '<|fim_pad|>'
-print_info: EOG token        = 151663 '<|repo_name|>'
-print_info: EOG token        = 151664 '<|file_sep|>'
-print_info: max token length = 256
-load_tensors: loading model tensors, this can take a while... (mmap = false)
-load_tensors: offloading 48 repeating layers to GPU
-load_tensors: offloading output layer to GPU
-load_tensors: offloaded 49/49 layers to GPU
-load_tensors:        ROCm0 model buffer size = 57666.30 MiB
-load_tensors:    ROCm_Host model buffer size =   593.50 MiB
-...................................................................................................
-llama_context: constructing llama_context
-llama_context: non-unified KV cache requires ggml_set_rows() - forcing unified KV cache
-llama_context: n_seq_max     = 1
-llama_context: n_ctx         = 4096
-llama_context: n_ctx_per_seq = 4096
-llama_context: n_batch       = 2048
-llama_context: n_ubatch      = 512
-llama_context: causal_attn   = 1
-llama_context: flash_attn    = 1
-llama_context: kv_unified    = true
-llama_context: freq_base     = 10000000.0
-llama_context: freq_scale    = 1
-llama_context: n_ctx_per_seq (4096) < n_ctx_train (262144) -- the full capacity of the model will not be utilized
-llama_context:  ROCm_Host  output buffer size =     0.58 MiB
-llama_kv_cache_unified:      ROCm0 KV buffer size =   384.00 MiB
-llama_kv_cache_unified: size =  384.00 MiB (  4096 cells,  48 layers,  1/ 1 seqs), K (f16):  192.00 MiB, V (f16):  192.00 MiB
-llama_kv_cache_unified: LLAMA_SET_ROWS=0, using old ggml_cpy() method for backwards compatibility
-llama_context:      ROCm0 compute buffer size =   300.75 MiB
-llama_context:  ROCm_Host compute buffer size =     8.01 MiB
-llama_context: graph nodes  = 3079
-llama_context: graph splits = 1
-common_init_from_params: added <|endoftext|> logit bias = -inf
-common_init_from_params: added <|im_end|> logit bias = -inf
-common_init_from_params: added <|fim_pad|> logit bias = -inf
-common_init_from_params: added <|repo_name|> logit bias = -inf
-common_init_from_params: added <|file_sep|> logit bias = -inf
-common_init_from_params: setting dry_penalty_last_n to ctx_size = 4096
-common_init_from_params: warming up the model with an empty run - please wait ... (--no-warmup to disable)
-main: llama threadpool init, n_threads = 16
-
-system_info: n_threads = 16 (n_threads_batch = 16) / 32 | ROCm : NO_VMM = 1 | PEER_MAX_BATCH_SIZE = 128 | CPU : SSE3 = 1 | SSSE3 = 1 | AVX = 1 | AVX2 = 1 | F16C = 1 | FMA = 1 | BMI2 = 1 | AVX512 = 1 | AVX512_VBMI = 1 | AVX512_VNNI = 1 | LLAMAFILE = 1 | OPENMP = 1 | REPACK = 1 | 
-
-sampler seed: 3173540432
-sampler params: 
-	repeat_last_n = 64, repeat_penalty = 1.000, frequency_penalty = 0.000, presence_penalty = 0.000
-	dry_multiplier = 0.000, dry_base = 1.750, dry_allowed_length = 2, dry_penalty_last_n = 4096
-	top_k = 40, top_p = 0.950, min_p = 0.050, xtc_probability = 0.000, xtc_threshold = 0.100, typical_p = 1.000, top_n_sigma = -1.000, temp = 0.800
-	mirostat = 0, mirostat_lr = 0.100, mirostat_ent = 5.000
-sampler chain: logits -> logit-bias -> penalties -> dry -> top-n-sigma -> top-k -> typical -> top-p -> min-p -> xtc -> temp-ext -> dist 
-generate: n_ctx = 4096, n_batch = 2048, n_predict = 1, n_keep = 0
-
-Hello:
-
-llama_perf_sampler_print:    sampling time =       0.06 ms /     2 runs   (    0.03 ms per token, 35087.72 tokens per second)
-llama_perf_context_print:        load time =   11733.11 ms
-llama_perf_context_print: prompt eval time =       0.00 ms /     1 tokens (    0.00 ms per token,      inf tokens per second)
-llama_perf_context_print:        eval time =      42.68 ms /     1 runs   (   42.68 ms per token,    23.43 tokens per second)
-llama_perf_context_print:       total time =      82.14 ms /     2 tokens
-llama_perf_context_print:    graphs reused =          0
-    Elapsed #3: 12.376138939s
-    Run #3 status: 0
-  → Avg over 3 runs: 14.392s
@@ -1,176 +0,0 @@
-ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
-ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
-ggml_cuda_init: found 1 ROCm devices:
-  Device 0: AMD Radeon Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32
-build: 6066 (4cb208c9) with cc (GCC) 15.1.1 20250719 (Red Hat 15.1.1-5) for x86_64-redhat-linux
-main: llama backend init
-main: load the model and apply lora adapter, if any
-llama_model_load_from_file_impl: using device ROCm0 (AMD Radeon Graphics) - 124523 MiB free
-llama_model_loader: additional 1 GGUFs metadata loaded.
-llama_model_loader: loaded meta data with 43 key-value pairs and 579 tensors from /home/kyuz0/models/qwen3-coder-30B-A3B/BF16/Qwen3-Coder-30B-A3B-Instruct-BF16-00001-of-00002.gguf (version GGUF V3 (latest))
-llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
-llama_model_loader: - kv   0:                       general.architecture str              = qwen3moe
-llama_model_loader: - kv   1:                               general.type str              = model
-llama_model_loader: - kv   2:                               general.name str              = Qwen3-Coder-30B-A3B-Instruct
-llama_model_loader: - kv   3:                           general.finetune str              = Instruct
-llama_model_loader: - kv   4:                           general.basename str              = Qwen3-Coder-30B-A3B-Instruct
-llama_model_loader: - kv   5:                       general.quantized_by str              = Unsloth
-llama_model_loader: - kv   6:                         general.size_label str              = 30B-A3B
-llama_model_loader: - kv   7:                            general.license str              = apache-2.0
-llama_model_loader: - kv   8:                       general.license.link str              = https://huggingface.co/Qwen/Qwen3-Cod...
-llama_model_loader: - kv   9:                           general.repo_url str              = https://huggingface.co/unsloth
-llama_model_loader: - kv  10:                   general.base_model.count u32              = 1
-llama_model_loader: - kv  11:                  general.base_model.0.name str              = Qwen3 Coder 30B A3B Instruct
-llama_model_loader: - kv  12:          general.base_model.0.organization str              = Qwen
-llama_model_loader: - kv  13:              general.base_model.0.repo_url str              = https://huggingface.co/Qwen/Qwen3-Cod...
-llama_model_loader: - kv  14:                               general.tags arr[str,2]       = ["unsloth", "text-generation"]
-llama_model_loader: - kv  15:                       qwen3moe.block_count u32              = 48
-llama_model_loader: - kv  16:                    qwen3moe.context_length u32              = 262144
-llama_model_loader: - kv  17:                  qwen3moe.embedding_length u32              = 2048
-llama_model_loader: - kv  18:               qwen3moe.feed_forward_length u32              = 5472
-llama_model_loader: - kv  19:              qwen3moe.attention.head_count u32              = 32
-llama_model_loader: - kv  20:           qwen3moe.attention.head_count_kv u32              = 4
-llama_model_loader: - kv  21:                    qwen3moe.rope.freq_base f32              = 10000000.000000
-llama_model_loader: - kv  22:  qwen3moe.attention.layer_norm_rms_epsilon f32              = 0.000001
-llama_model_loader: - kv  23:                 qwen3moe.expert_used_count u32              = 8
-llama_model_loader: - kv  24:              qwen3moe.attention.key_length u32              = 128
-llama_model_loader: - kv  25:            qwen3moe.attention.value_length u32              = 128
-llama_model_loader: - kv  26:                          general.file_type u32              = 32
-llama_model_loader: - kv  27:                      qwen3moe.expert_count u32              = 128
-llama_model_loader: - kv  28:        qwen3moe.expert_feed_forward_length u32              = 768
-llama_model_loader: - kv  29: qwen3moe.expert_shared_feed_forward_length u32              = 0
-llama_model_loader: - kv  30:               general.quantization_version u32              = 2
-llama_model_loader: - kv  31:                       tokenizer.ggml.model str              = gpt2
-llama_model_loader: - kv  32:                         tokenizer.ggml.pre str              = qwen2
-llama_model_loader: - kv  33:                      tokenizer.ggml.tokens arr[str,151936]  = ["!", "\"", "#", "$", "%", "&", "'", ...
-llama_model_loader: - kv  34:                  tokenizer.ggml.token_type arr[i32,151936]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
-llama_model_loader: - kv  35:                      tokenizer.ggml.merges arr[str,151387]  = ["Ġ Ġ", "ĠĠ ĠĠ", "i n", "Ġ t",...
-llama_model_loader: - kv  36:                tokenizer.ggml.eos_token_id u32              = 151645
-llama_model_loader: - kv  37:            tokenizer.ggml.padding_token_id u32              = 151654
-llama_model_loader: - kv  38:               tokenizer.ggml.add_bos_token bool             = false
-llama_model_loader: - kv  39:                    tokenizer.chat_template str              = {#- Copyright 2025-present the Unslot...
-llama_model_loader: - kv  40:                                   split.no u16              = 0
-llama_model_loader: - kv  41:                                split.count u16              = 2
-llama_model_loader: - kv  42:                        split.tensors.count i32              = 579
-llama_model_loader: - type  f32:  241 tensors
-llama_model_loader: - type bf16:  338 tensors
-print_info: file format = GGUF V3 (latest)
-print_info: file type   = BF16
-print_info: file size   = 56.89 GiB (16.01 BPW) 
-load: special tokens cache size = 26
-load: token to piece cache size = 0.9311 MB
-print_info: arch             = qwen3moe
-print_info: vocab_only       = 0
-print_info: n_ctx_train      = 262144
-print_info: n_embd           = 2048
-print_info: n_layer          = 48
-print_info: n_head           = 32
-print_info: n_head_kv        = 4
-print_info: n_rot            = 128
-print_info: n_swa            = 0
-print_info: is_swa_any       = 0
-print_info: n_embd_head_k    = 128
-print_info: n_embd_head_v    = 128
-print_info: n_gqa            = 8
-print_info: n_embd_k_gqa     = 512
-print_info: n_embd_v_gqa     = 512
-print_info: f_norm_eps       = 0.0e+00
-print_info: f_norm_rms_eps   = 1.0e-06
-print_info: f_clamp_kqv      = 0.0e+00
-print_info: f_max_alibi_bias = 0.0e+00
-print_info: f_logit_scale    = 0.0e+00
-print_info: f_attn_scale     = 0.0e+00
-print_info: n_ff             = 5472
-print_info: n_expert         = 128
-print_info: n_expert_used    = 8
-print_info: causal attn      = 1
-print_info: pooling type     = 0
-print_info: rope type        = 2
-print_info: rope scaling     = linear
-print_info: freq_base_train  = 10000000.0
-print_info: freq_scale_train = 1
-print_info: n_ctx_orig_yarn  = 262144
-print_info: rope_finetuned   = unknown
-print_info: model type       = 30B.A3B
-print_info: model params     = 30.53 B
-print_info: general.name     = Qwen3-Coder-30B-A3B-Instruct
-print_info: n_ff_exp         = 768
-print_info: vocab type       = BPE
-print_info: n_vocab          = 151936
-print_info: n_merges         = 151387
-print_info: BOS token        = 11 ','
-print_info: EOS token        = 151645 '<|im_end|>'
-print_info: EOT token        = 151645 '<|im_end|>'
-print_info: PAD token        = 151654 '<|vision_pad|>'
-print_info: LF token         = 198 'Ċ'
-print_info: FIM PRE token    = 151659 '<|fim_prefix|>'
-print_info: FIM SUF token    = 151661 '<|fim_suffix|>'
-print_info: FIM MID token    = 151660 '<|fim_middle|>'
-print_info: FIM PAD token    = 151662 '<|fim_pad|>'
-print_info: FIM REP token    = 151663 '<|repo_name|>'
-print_info: FIM SEP token    = 151664 '<|file_sep|>'
-print_info: EOG token        = 151643 '<|endoftext|>'
-print_info: EOG token        = 151645 '<|im_end|>'
-print_info: EOG token        = 151662 '<|fim_pad|>'
-print_info: EOG token        = 151663 '<|repo_name|>'
-print_info: EOG token        = 151664 '<|file_sep|>'
-print_info: max token length = 256
-load_tensors: loading model tensors, this can take a while... (mmap = false)
-load_tensors: offloading 48 repeating layers to GPU
-load_tensors: offloading output layer to GPU
-load_tensors: offloaded 49/49 layers to GPU
-load_tensors:        ROCm0 model buffer size = 57666.30 MiB
-load_tensors:    ROCm_Host model buffer size =   593.50 MiB
-...................................................................................................
-llama_context: constructing llama_context
-llama_context: non-unified KV cache requires ggml_set_rows() - forcing unified KV cache
-llama_context: n_seq_max     = 1
-llama_context: n_ctx         = 4096
-llama_context: n_ctx_per_seq = 4096
-llama_context: n_batch       = 2048
-llama_context: n_ubatch      = 512
-llama_context: causal_attn   = 1
-llama_context: flash_attn    = 1
-llama_context: kv_unified    = true
-llama_context: freq_base     = 10000000.0
-llama_context: freq_scale    = 1
-llama_context: n_ctx_per_seq (4096) < n_ctx_train (262144) -- the full capacity of the model will not be utilized
-llama_context:  ROCm_Host  output buffer size =     0.58 MiB
-llama_kv_cache_unified:      ROCm0 KV buffer size =   384.00 MiB
-llama_kv_cache_unified: size =  384.00 MiB (  4096 cells,  48 layers,  1/ 1 seqs), K (f16):  192.00 MiB, V (f16):  192.00 MiB
-llama_kv_cache_unified: LLAMA_SET_ROWS=0, using old ggml_cpy() method for backwards compatibility
-llama_context:      ROCm0 compute buffer size =   300.75 MiB
-llama_context:  ROCm_Host compute buffer size =     8.01 MiB
-llama_context: graph nodes  = 3079
-llama_context: graph splits = 1
-common_init_from_params: added <|endoftext|> logit bias = -inf
-common_init_from_params: added <|im_end|> logit bias = -inf
-common_init_from_params: added <|fim_pad|> logit bias = -inf
-common_init_from_params: added <|repo_name|> logit bias = -inf
-common_init_from_params: added <|file_sep|> logit bias = -inf
-common_init_from_params: setting dry_penalty_last_n to ctx_size = 4096
-common_init_from_params: warming up the model with an empty run - please wait ... (--no-warmup to disable)
-main: llama threadpool init, n_threads = 16
-
-system_info: n_threads = 16 (n_threads_batch = 16) / 32 | ROCm : NO_VMM = 1 | PEER_MAX_BATCH_SIZE = 128 | CPU : SSE3 = 1 | SSSE3 = 1 | AVX = 1 | AVX2 = 1 | F16C = 1 | FMA = 1 | BMI2 = 1 | AVX512 = 1 | AVX512_VBMI = 1 | AVX512_VNNI = 1 | LLAMAFILE = 1 | OPENMP = 1 | REPACK = 1 | 
-
-sampler seed: 1388157865
-sampler params: 
-	repeat_last_n = 64, repeat_penalty = 1.000, frequency_penalty = 0.000, presence_penalty = 0.000
-	dry_multiplier = 0.000, dry_base = 1.750, dry_allowed_length = 2, dry_penalty_last_n = 4096
-	top_k = 40, top_p = 0.950, min_p = 0.050, xtc_probability = 0.000, xtc_threshold = 0.100, typical_p = 1.000, top_n_sigma = -1.000, temp = 0.800
-	mirostat = 0, mirostat_lr = 0.100, mirostat_ent = 5.000
-sampler chain: logits -> logit-bias -> penalties -> dry -> top-n-sigma -> top-k -> typical -> top-p -> min-p -> xtc -> temp-ext -> dist 
-generate: n_ctx = 4096, n_batch = 2048, n_predict = 1, n_keep = 0
-
-Hello:
-
-llama_perf_sampler_print:    sampling time =       0.06 ms /     2 runs   (    0.03 ms per token, 36363.64 tokens per second)
-llama_perf_context_print:        load time =   11788.33 ms
-llama_perf_context_print: prompt eval time =       0.00 ms /     1 tokens (    0.00 ms per token,      inf tokens per second)
-llama_perf_context_print:        eval time =      43.56 ms /     1 runs   (   43.56 ms per token,    22.95 tokens per second)
-llama_perf_context_print:       total time =      82.77 ms /     2 tokens
-llama_perf_context_print:    graphs reused =          0
-    Elapsed #3: 12.528214562s
-    Run #3 status: 0
-  → Avg over 3 runs: 16.161s
@@ -1,174 +0,0 @@
-ggml_vulkan: Found 1 Vulkan devices:
-ggml_vulkan: 0 = Radeon 8060S Graphics (AMD open-source driver) | uma: 1 | fp16: 1 | bf16: 0 | warp size: 64 | shared memory: 32768 | int dot: 1 | matrix cores: KHR_coopmat
-build: 6060 (9c35706b) with cc (GCC) 15.1.1 20250719 (Red Hat 15.1.1-5) for x86_64-redhat-linux
-main: llama backend init
-main: load the model and apply lora adapter, if any
-llama_model_load_from_file_impl: using device Vulkan0 (Radeon 8060S Graphics) - 85720 MiB free
-llama_model_loader: additional 1 GGUFs metadata loaded.
-llama_model_loader: loaded meta data with 43 key-value pairs and 579 tensors from /home/kyuz0/models/qwen3-coder-30B-A3B/BF16/Qwen3-Coder-30B-A3B-Instruct-BF16-00001-of-00002.gguf (version GGUF V3 (latest))
-llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
-llama_model_loader: - kv   0:                       general.architecture str              = qwen3moe
-llama_model_loader: - kv   1:                               general.type str              = model
-llama_model_loader: - kv   2:                               general.name str              = Qwen3-Coder-30B-A3B-Instruct
-llama_model_loader: - kv   3:                           general.finetune str              = Instruct
-llama_model_loader: - kv   4:                           general.basename str              = Qwen3-Coder-30B-A3B-Instruct
-llama_model_loader: - kv   5:                       general.quantized_by str              = Unsloth
-llama_model_loader: - kv   6:                         general.size_label str              = 30B-A3B
-llama_model_loader: - kv   7:                            general.license str              = apache-2.0
-llama_model_loader: - kv   8:                       general.license.link str              = https://huggingface.co/Qwen/Qwen3-Cod...
-llama_model_loader: - kv   9:                           general.repo_url str              = https://huggingface.co/unsloth
-llama_model_loader: - kv  10:                   general.base_model.count u32              = 1
-llama_model_loader: - kv  11:                  general.base_model.0.name str              = Qwen3 Coder 30B A3B Instruct
-llama_model_loader: - kv  12:          general.base_model.0.organization str              = Qwen
-llama_model_loader: - kv  13:              general.base_model.0.repo_url str              = https://huggingface.co/Qwen/Qwen3-Cod...
-llama_model_loader: - kv  14:                               general.tags arr[str,2]       = ["unsloth", "text-generation"]
-llama_model_loader: - kv  15:                       qwen3moe.block_count u32              = 48
-llama_model_loader: - kv  16:                    qwen3moe.context_length u32              = 262144
-llama_model_loader: - kv  17:                  qwen3moe.embedding_length u32              = 2048
-llama_model_loader: - kv  18:               qwen3moe.feed_forward_length u32              = 5472
-llama_model_loader: - kv  19:              qwen3moe.attention.head_count u32              = 32
-llama_model_loader: - kv  20:           qwen3moe.attention.head_count_kv u32              = 4
-llama_model_loader: - kv  21:                    qwen3moe.rope.freq_base f32              = 10000000.000000
-llama_model_loader: - kv  22:  qwen3moe.attention.layer_norm_rms_epsilon f32              = 0.000001
-llama_model_loader: - kv  23:                 qwen3moe.expert_used_count u32              = 8
-llama_model_loader: - kv  24:              qwen3moe.attention.key_length u32              = 128
-llama_model_loader: - kv  25:            qwen3moe.attention.value_length u32              = 128
-llama_model_loader: - kv  26:                          general.file_type u32              = 32
-llama_model_loader: - kv  27:                      qwen3moe.expert_count u32              = 128
-llama_model_loader: - kv  28:        qwen3moe.expert_feed_forward_length u32              = 768
-llama_model_loader: - kv  29: qwen3moe.expert_shared_feed_forward_length u32              = 0
-llama_model_loader: - kv  30:               general.quantization_version u32              = 2
-llama_model_loader: - kv  31:                       tokenizer.ggml.model str              = gpt2
-llama_model_loader: - kv  32:                         tokenizer.ggml.pre str              = qwen2
-llama_model_loader: - kv  33:                      tokenizer.ggml.tokens arr[str,151936]  = ["!", "\"", "#", "$", "%", "&", "'", ...
-llama_model_loader: - kv  34:                  tokenizer.ggml.token_type arr[i32,151936]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
-llama_model_loader: - kv  35:                      tokenizer.ggml.merges arr[str,151387]  = ["Ġ Ġ", "ĠĠ ĠĠ", "i n", "Ġ t",...
-llama_model_loader: - kv  36:                tokenizer.ggml.eos_token_id u32              = 151645
-llama_model_loader: - kv  37:            tokenizer.ggml.padding_token_id u32              = 151654
-llama_model_loader: - kv  38:               tokenizer.ggml.add_bos_token bool             = false
-llama_model_loader: - kv  39:                    tokenizer.chat_template str              = {#- Copyright 2025-present the Unslot...
-llama_model_loader: - kv  40:                                   split.no u16              = 0
-llama_model_loader: - kv  41:                                split.count u16              = 2
-llama_model_loader: - kv  42:                        split.tensors.count i32              = 579
-llama_model_loader: - type  f32:  241 tensors
-llama_model_loader: - type bf16:  338 tensors
-print_info: file format = GGUF V3 (latest)
-print_info: file type   = BF16
-print_info: file size   = 56.89 GiB (16.01 BPW) 
-load: special tokens cache size = 26
-load: token to piece cache size = 0.9311 MB
-print_info: arch             = qwen3moe
-print_info: vocab_only       = 0
-print_info: n_ctx_train      = 262144
-print_info: n_embd           = 2048
-print_info: n_layer          = 48
-print_info: n_head           = 32
-print_info: n_head_kv        = 4
-print_info: n_rot            = 128
-print_info: n_swa            = 0
-print_info: is_swa_any       = 0
-print_info: n_embd_head_k    = 128
-print_info: n_embd_head_v    = 128
-print_info: n_gqa            = 8
-print_info: n_embd_k_gqa     = 512
-print_info: n_embd_v_gqa     = 512
-print_info: f_norm_eps       = 0.0e+00
-print_info: f_norm_rms_eps   = 1.0e-06
-print_info: f_clamp_kqv      = 0.0e+00
-print_info: f_max_alibi_bias = 0.0e+00
-print_info: f_logit_scale    = 0.0e+00
-print_info: f_attn_scale     = 0.0e+00
-print_info: n_ff             = 5472
-print_info: n_expert         = 128
-print_info: n_expert_used    = 8
-print_info: causal attn      = 1
-print_info: pooling type     = 0
-print_info: rope type        = 2
-print_info: rope scaling     = linear
-print_info: freq_base_train  = 10000000.0
-print_info: freq_scale_train = 1
-print_info: n_ctx_orig_yarn  = 262144
-print_info: rope_finetuned   = unknown
-print_info: model type       = 30B.A3B
-print_info: model params     = 30.53 B
-print_info: general.name     = Qwen3-Coder-30B-A3B-Instruct
-print_info: n_ff_exp         = 768
-print_info: vocab type       = BPE
-print_info: n_vocab          = 151936
-print_info: n_merges         = 151387
-print_info: BOS token        = 11 ','
-print_info: EOS token        = 151645 '<|im_end|>'
-print_info: EOT token        = 151645 '<|im_end|>'
-print_info: PAD token        = 151654 '<|vision_pad|>'
-print_info: LF token         = 198 'Ċ'
-print_info: FIM PRE token    = 151659 '<|fim_prefix|>'
-print_info: FIM SUF token    = 151661 '<|fim_suffix|>'
-print_info: FIM MID token    = 151660 '<|fim_middle|>'
-print_info: FIM PAD token    = 151662 '<|fim_pad|>'
-print_info: FIM REP token    = 151663 '<|repo_name|>'
-print_info: FIM SEP token    = 151664 '<|file_sep|>'
-print_info: EOG token        = 151643 '<|endoftext|>'
-print_info: EOG token        = 151645 '<|im_end|>'
-print_info: EOG token        = 151662 '<|fim_pad|>'
-print_info: EOG token        = 151663 '<|repo_name|>'
-print_info: EOG token        = 151664 '<|file_sep|>'
-print_info: max token length = 256
-load_tensors: loading model tensors, this can take a while... (mmap = false)
-load_tensors: offloading 48 repeating layers to GPU
-load_tensors: offloading output layer to GPU
-load_tensors: offloaded 49/49 layers to GPU
-load_tensors:      Vulkan0 model buffer size = 57666.30 MiB
-load_tensors:  Vulkan_Host model buffer size =   593.50 MiB
-...................................................................................................
-llama_context: constructing llama_context
-llama_context: non-unified KV cache requires ggml_set_rows() - forcing unified KV cache
-llama_context: n_seq_max     = 1
-llama_context: n_ctx         = 4096
-llama_context: n_ctx_per_seq = 4096
-llama_context: n_batch       = 2048
-llama_context: n_ubatch      = 512
-llama_context: causal_attn   = 1
-llama_context: flash_attn    = 1
-llama_context: kv_unified    = true
-llama_context: freq_base     = 10000000.0
-llama_context: freq_scale    = 1
-llama_context: n_ctx_per_seq (4096) < n_ctx_train (262144) -- the full capacity of the model will not be utilized
-llama_context: Vulkan_Host  output buffer size =     0.58 MiB
-llama_kv_cache_unified:    Vulkan0 KV buffer size =   384.00 MiB
-llama_kv_cache_unified: size =  384.00 MiB (  4096 cells,  48 layers,  1/ 1 seqs), K (f16):  192.00 MiB, V (f16):  192.00 MiB
-llama_kv_cache_unified: LLAMA_SET_ROWS=0, using old ggml_cpy() method for backwards compatibility
-llama_context:    Vulkan0 compute buffer size =   304.75 MiB
-llama_context: Vulkan_Host compute buffer size =    12.01 MiB
-llama_context: graph nodes  = 3079
-llama_context: graph splits = 2
-common_init_from_params: added <|endoftext|> logit bias = -inf
-common_init_from_params: added <|im_end|> logit bias = -inf
-common_init_from_params: added <|fim_pad|> logit bias = -inf
-common_init_from_params: added <|repo_name|> logit bias = -inf
-common_init_from_params: added <|file_sep|> logit bias = -inf
-common_init_from_params: setting dry_penalty_last_n to ctx_size = 4096
-common_init_from_params: warming up the model with an empty run - please wait ... (--no-warmup to disable)
-main: llama threadpool init, n_threads = 16
-
-system_info: n_threads = 16 (n_threads_batch = 16) / 32 | CPU : SSE3 = 1 | SSSE3 = 1 | AVX = 1 | AVX2 = 1 | F16C = 1 | FMA = 1 | BMI2 = 1 | AVX512 = 1 | AVX512_VBMI = 1 | AVX512_VNNI = 1 | LLAMAFILE = 1 | OPENMP = 1 | REPACK = 1 | 
-
-sampler seed: 243266880
-sampler params: 
-	repeat_last_n = 64, repeat_penalty = 1.000, frequency_penalty = 0.000, presence_penalty = 0.000
-	dry_multiplier = 0.000, dry_base = 1.750, dry_allowed_length = 2, dry_penalty_last_n = 4096
-	top_k = 40, top_p = 0.950, min_p = 0.050, xtc_probability = 0.000, xtc_threshold = 0.100, typical_p = 1.000, top_n_sigma = -1.000, temp = 0.800
-	mirostat = 0, mirostat_lr = 0.100, mirostat_ent = 5.000
-sampler chain: logits -> logit-bias -> penalties -> dry -> top-n-sigma -> top-k -> typical -> top-p -> min-p -> xtc -> temp-ext -> dist 
-generate: n_ctx = 4096, n_batch = 2048, n_predict = 1, n_keep = 0
-
-Hello:
-
-llama_perf_sampler_print:    sampling time =       0.08 ms /     2 runs   (    0.04 ms per token, 26315.79 tokens per second)
-llama_perf_context_print:        load time =    9973.02 ms
-llama_perf_context_print: prompt eval time =       0.00 ms /     1 tokens (    0.00 ms per token,      inf tokens per second)
-llama_perf_context_print:        eval time =     130.78 ms /     1 runs   (  130.78 ms per token,     7.65 tokens per second)
-llama_perf_context_print:       total time =     185.17 ms /     2 tokens
-llama_perf_context_print:    graphs reused =          0
-    Elapsed #3: 10.756452016s
-    Run #3 status: 0
-  → Avg over 3 runs: 12.940s
@@ -1,174 +0,0 @@
-ggml_vulkan: Found 1 Vulkan devices:
-ggml_vulkan: 0 = Radeon 8060S Graphics (RADV GFX1151) (radv) | uma: 1 | fp16: 1 | bf16: 0 | warp size: 64 | shared memory: 65536 | int dot: 1 | matrix cores: KHR_coopmat
-build: 6040 (66625a59) with cc (GCC) 15.1.1 20250719 (Red Hat 15.1.1-5) for x86_64-redhat-linux
-main: llama backend init
-main: load the model and apply lora adapter, if any
-llama_model_load_from_file_impl: using device Vulkan0 (Radeon 8060S Graphics (RADV GFX1151)) - 87722 MiB free
-llama_model_loader: additional 1 GGUFs metadata loaded.
-llama_model_loader: loaded meta data with 43 key-value pairs and 579 tensors from /home/kyuz0/models/qwen3-coder-30B-A3B/BF16/Qwen3-Coder-30B-A3B-Instruct-BF16-00001-of-00002.gguf (version GGUF V3 (latest))
-llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
-llama_model_loader: - kv   0:                       general.architecture str              = qwen3moe
-llama_model_loader: - kv   1:                               general.type str              = model
-llama_model_loader: - kv   2:                               general.name str              = Qwen3-Coder-30B-A3B-Instruct
-llama_model_loader: - kv   3:                           general.finetune str              = Instruct
-llama_model_loader: - kv   4:                           general.basename str              = Qwen3-Coder-30B-A3B-Instruct
-llama_model_loader: - kv   5:                       general.quantized_by str              = Unsloth
-llama_model_loader: - kv   6:                         general.size_label str              = 30B-A3B
-llama_model_loader: - kv   7:                            general.license str              = apache-2.0
-llama_model_loader: - kv   8:                       general.license.link str              = https://huggingface.co/Qwen/Qwen3-Cod...
-llama_model_loader: - kv   9:                           general.repo_url str              = https://huggingface.co/unsloth
-llama_model_loader: - kv  10:                   general.base_model.count u32              = 1
-llama_model_loader: - kv  11:                  general.base_model.0.name str              = Qwen3 Coder 30B A3B Instruct
-llama_model_loader: - kv  12:          general.base_model.0.organization str              = Qwen
-llama_model_loader: - kv  13:              general.base_model.0.repo_url str              = https://huggingface.co/Qwen/Qwen3-Cod...
-llama_model_loader: - kv  14:                               general.tags arr[str,2]       = ["unsloth", "text-generation"]
-llama_model_loader: - kv  15:                       qwen3moe.block_count u32              = 48
-llama_model_loader: - kv  16:                    qwen3moe.context_length u32              = 262144
-llama_model_loader: - kv  17:                  qwen3moe.embedding_length u32              = 2048
-llama_model_loader: - kv  18:               qwen3moe.feed_forward_length u32              = 5472
-llama_model_loader: - kv  19:              qwen3moe.attention.head_count u32              = 32
-llama_model_loader: - kv  20:           qwen3moe.attention.head_count_kv u32              = 4
-llama_model_loader: - kv  21:                    qwen3moe.rope.freq_base f32              = 10000000.000000
-llama_model_loader: - kv  22:  qwen3moe.attention.layer_norm_rms_epsilon f32              = 0.000001
-llama_model_loader: - kv  23:                 qwen3moe.expert_used_count u32              = 8
-llama_model_loader: - kv  24:              qwen3moe.attention.key_length u32              = 128
-llama_model_loader: - kv  25:            qwen3moe.attention.value_length u32              = 128
-llama_model_loader: - kv  26:                          general.file_type u32              = 32
-llama_model_loader: - kv  27:                      qwen3moe.expert_count u32              = 128
-llama_model_loader: - kv  28:        qwen3moe.expert_feed_forward_length u32              = 768
-llama_model_loader: - kv  29: qwen3moe.expert_shared_feed_forward_length u32              = 0
-llama_model_loader: - kv  30:               general.quantization_version u32              = 2
-llama_model_loader: - kv  31:                       tokenizer.ggml.model str              = gpt2
-llama_model_loader: - kv  32:                         tokenizer.ggml.pre str              = qwen2
-llama_model_loader: - kv  33:                      tokenizer.ggml.tokens arr[str,151936]  = ["!", "\"", "#", "$", "%", "&", "'", ...
-llama_model_loader: - kv  34:                  tokenizer.ggml.token_type arr[i32,151936]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
-llama_model_loader: - kv  35:                      tokenizer.ggml.merges arr[str,151387]  = ["Ġ Ġ", "ĠĠ ĠĠ", "i n", "Ġ t",...
-llama_model_loader: - kv  36:                tokenizer.ggml.eos_token_id u32              = 151645
-llama_model_loader: - kv  37:            tokenizer.ggml.padding_token_id u32              = 151654
-llama_model_loader: - kv  38:               tokenizer.ggml.add_bos_token bool             = false
-llama_model_loader: - kv  39:                    tokenizer.chat_template str              = {#- Copyright 2025-present the Unslot...
-llama_model_loader: - kv  40:                                   split.no u16              = 0
-llama_model_loader: - kv  41:                                split.count u16              = 2
-llama_model_loader: - kv  42:                        split.tensors.count i32              = 579
-llama_model_loader: - type  f32:  241 tensors
-llama_model_loader: - type bf16:  338 tensors
-print_info: file format = GGUF V3 (latest)
-print_info: file type   = BF16
-print_info: file size   = 56.89 GiB (16.01 BPW) 
-load: special tokens cache size = 26
-load: token to piece cache size = 0.9311 MB
-print_info: arch             = qwen3moe
-print_info: vocab_only       = 0
-print_info: n_ctx_train      = 262144
-print_info: n_embd           = 2048
-print_info: n_layer          = 48
-print_info: n_head           = 32
-print_info: n_head_kv        = 4
-print_info: n_rot            = 128
-print_info: n_swa            = 0
-print_info: is_swa_any       = 0
-print_info: n_embd_head_k    = 128
-print_info: n_embd_head_v    = 128
-print_info: n_gqa            = 8
-print_info: n_embd_k_gqa     = 512
-print_info: n_embd_v_gqa     = 512
-print_info: f_norm_eps       = 0.0e+00
-print_info: f_norm_rms_eps   = 1.0e-06
-print_info: f_clamp_kqv      = 0.0e+00
-print_info: f_max_alibi_bias = 0.0e+00
-print_info: f_logit_scale    = 0.0e+00
-print_info: f_attn_scale     = 0.0e+00
-print_info: n_ff             = 5472
-print_info: n_expert         = 128
-print_info: n_expert_used    = 8
-print_info: causal attn      = 1
-print_info: pooling type     = 0
-print_info: rope type        = 2
-print_info: rope scaling     = linear
-print_info: freq_base_train  = 10000000.0
-print_info: freq_scale_train = 1
-print_info: n_ctx_orig_yarn  = 262144
-print_info: rope_finetuned   = unknown
-print_info: model type       = 30B.A3B
-print_info: model params     = 30.53 B
-print_info: general.name     = Qwen3-Coder-30B-A3B-Instruct
-print_info: n_ff_exp         = 768
-print_info: vocab type       = BPE
-print_info: n_vocab          = 151936
-print_info: n_merges         = 151387
-print_info: BOS token        = 11 ','
-print_info: EOS token        = 151645 '<|im_end|>'
-print_info: EOT token        = 151645 '<|im_end|>'
-print_info: PAD token        = 151654 '<|vision_pad|>'
-print_info: LF token         = 198 'Ċ'
-print_info: FIM PRE token    = 151659 '<|fim_prefix|>'
-print_info: FIM SUF token    = 151661 '<|fim_suffix|>'
-print_info: FIM MID token    = 151660 '<|fim_middle|>'
-print_info: FIM PAD token    = 151662 '<|fim_pad|>'
-print_info: FIM REP token    = 151663 '<|repo_name|>'
-print_info: FIM SEP token    = 151664 '<|file_sep|>'
-print_info: EOG token        = 151643 '<|endoftext|>'
-print_info: EOG token        = 151645 '<|im_end|>'
-print_info: EOG token        = 151662 '<|fim_pad|>'
-print_info: EOG token        = 151663 '<|repo_name|>'
-print_info: EOG token        = 151664 '<|file_sep|>'
-print_info: max token length = 256
-load_tensors: loading model tensors, this can take a while... (mmap = false)
-load_tensors: offloading 48 repeating layers to GPU
-load_tensors: offloading output layer to GPU
-load_tensors: offloaded 49/49 layers to GPU
-load_tensors:      Vulkan0 model buffer size = 57666.30 MiB
-load_tensors:  Vulkan_Host model buffer size =   593.50 MiB
-...................................................................................................
-llama_context: constructing llama_context
-llama_context: non-unified KV cache requires ggml_set_rows() - forcing unified KV cache
-llama_context: n_seq_max     = 1
-llama_context: n_ctx         = 4096
-llama_context: n_ctx_per_seq = 4096
-llama_context: n_batch       = 2048
-llama_context: n_ubatch      = 512
-llama_context: causal_attn   = 1
-llama_context: flash_attn    = 1
-llama_context: kv_unified    = true
-llama_context: freq_base     = 10000000.0
-llama_context: freq_scale    = 1
-llama_context: n_ctx_per_seq (4096) < n_ctx_train (262144) -- the full capacity of the model will not be utilized
-llama_context: Vulkan_Host  output buffer size =     0.58 MiB
-llama_kv_cache_unified:    Vulkan0 KV buffer size =   384.00 MiB
-llama_kv_cache_unified: size =  384.00 MiB (  4096 cells,  48 layers,  1/ 1 seqs), K (f16):  192.00 MiB, V (f16):  192.00 MiB
-llama_kv_cache_unified: LLAMA_SET_ROWS=0, using old ggml_cpy() method for backwards compatibility
-llama_context:    Vulkan0 compute buffer size =   304.75 MiB
-llama_context: Vulkan_Host compute buffer size =    12.01 MiB
-llama_context: graph nodes  = 3079
-llama_context: graph splits = 2
-common_init_from_params: added <|endoftext|> logit bias = -inf
-common_init_from_params: added <|im_end|> logit bias = -inf
-common_init_from_params: added <|fim_pad|> logit bias = -inf
-common_init_from_params: added <|repo_name|> logit bias = -inf
-common_init_from_params: added <|file_sep|> logit bias = -inf
-common_init_from_params: setting dry_penalty_last_n to ctx_size = 4096
-common_init_from_params: warming up the model with an empty run - please wait ... (--no-warmup to disable)
-main: llama threadpool init, n_threads = 16
-
-system_info: n_threads = 16 (n_threads_batch = 16) / 32 | CPU : SSE3 = 1 | SSSE3 = 1 | AVX = 1 | AVX2 = 1 | F16C = 1 | FMA = 1 | BMI2 = 1 | AVX512 = 1 | AVX512_VBMI = 1 | AVX512_VNNI = 1 | LLAMAFILE = 1 | OPENMP = 1 | REPACK = 1 | 
-
-sampler seed: 2350977163
-sampler params: 
-	repeat_last_n = 64, repeat_penalty = 1.000, frequency_penalty = 0.000, presence_penalty = 0.000
-	dry_multiplier = 0.000, dry_base = 1.750, dry_allowed_length = 2, dry_penalty_last_n = 4096
-	top_k = 40, top_p = 0.950, min_p = 0.050, xtc_probability = 0.000, xtc_threshold = 0.100, typical_p = 1.000, top_n_sigma = -1.000, temp = 0.800
-	mirostat = 0, mirostat_lr = 0.100, mirostat_ent = 5.000
-sampler chain: logits -> logit-bias -> penalties -> dry -> top-n-sigma -> top-k -> typical -> top-p -> min-p -> xtc -> temp-ext -> dist 
-generate: n_ctx = 4096, n_batch = 2048, n_predict = 1, n_keep = 0
-
-Hello:
-
-llama_perf_sampler_print:    sampling time =       0.07 ms /     2 runs   (    0.04 ms per token, 27027.03 tokens per second)
-llama_perf_context_print:        load time =   13008.56 ms
-llama_perf_context_print: prompt eval time =       0.00 ms /     1 tokens (    0.00 ms per token,      inf tokens per second)
-llama_perf_context_print:        eval time =     140.05 ms /     1 runs   (  140.05 ms per token,     7.14 tokens per second)
-llama_perf_context_print:       total time =     194.09 ms /     2 tokens
-llama_perf_context_print:    graphs reused =          0
-    Elapsed #3: 13.570267879s
-    Run #3 status: 0
-  → Avg over 3 runs: 14.021s
@@ -1,165 +0,0 @@
-ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
-ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
-ggml_cuda_init: found 1 ROCm devices:
-  Device 0: Radeon 8060S Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32
-build: 6040 (66625a59) with cc (GCC) 15.1.1 20250521 (Red Hat 15.1.1-2) for x86_64-redhat-linux
-main: llama backend init
-main: load the model and apply lora adapter, if any
-llama_model_load_from_file_impl: using device ROCm0 (Radeon 8060S Graphics) - 124522 MiB free
-llama_model_loader: loaded meta data with 40 key-value pairs and 626 tensors from /home/kyuz0/models/gemma-3-12b-it-UD-Q8_K_XL/gemma-3-12b-it-UD-Q8_K_XL.gguf (version GGUF V3 (latest))
-llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
-llama_model_loader: - kv   0:                       general.architecture str              = gemma3
-llama_model_loader: - kv   1:                               general.type str              = model
-llama_model_loader: - kv   2:                               general.name str              = Gemma-3-12B-It
-llama_model_loader: - kv   3:                           general.finetune str              = it
-llama_model_loader: - kv   4:                           general.basename str              = Gemma-3-12B-It
-llama_model_loader: - kv   5:                       general.quantized_by str              = Unsloth
-llama_model_loader: - kv   6:                         general.size_label str              = 12B
-llama_model_loader: - kv   7:                           general.repo_url str              = https://huggingface.co/unsloth
-llama_model_loader: - kv   8:                      gemma3.context_length u32              = 131072
-llama_model_loader: - kv   9:                    gemma3.embedding_length u32              = 3840
-llama_model_loader: - kv  10:                         gemma3.block_count u32              = 48
-llama_model_loader: - kv  11:                 gemma3.feed_forward_length u32              = 15360
-llama_model_loader: - kv  12:                gemma3.attention.head_count u32              = 16
-llama_model_loader: - kv  13:    gemma3.attention.layer_norm_rms_epsilon f32              = 0.000001
-llama_model_loader: - kv  14:                gemma3.attention.key_length u32              = 256
-llama_model_loader: - kv  15:              gemma3.attention.value_length u32              = 256
-llama_model_loader: - kv  16:                      gemma3.rope.freq_base f32              = 1000000.000000
-llama_model_loader: - kv  17:            gemma3.attention.sliding_window u32              = 1024
-llama_model_loader: - kv  18:             gemma3.attention.head_count_kv u32              = 8
-llama_model_loader: - kv  19:                   gemma3.rope.scaling.type str              = linear
-llama_model_loader: - kv  20:                 gemma3.rope.scaling.factor f32              = 8.000000
-llama_model_loader: - kv  21:                       tokenizer.ggml.model str              = llama
-llama_model_loader: - kv  22:                         tokenizer.ggml.pre str              = default
-llama_model_loader: - kv  23:                      tokenizer.ggml.tokens arr[str,262208]  = ["<pad>", "<eos>", "<bos>", "<unk>", ...
-llama_model_loader: - kv  24:                      tokenizer.ggml.scores arr[f32,262208]  = [-1000.000000, -1000.000000, -1000.00...
-llama_model_loader: - kv  25:                  tokenizer.ggml.token_type arr[i32,262208]  = [3, 3, 3, 3, 3, 4, 3, 3, 3, 3, 3, 3, ...
-llama_model_loader: - kv  26:                tokenizer.ggml.bos_token_id u32              = 2
-llama_model_loader: - kv  27:                tokenizer.ggml.eos_token_id u32              = 106
-llama_model_loader: - kv  28:            tokenizer.ggml.unknown_token_id u32              = 3
-llama_model_loader: - kv  29:            tokenizer.ggml.padding_token_id u32              = 0
-llama_model_loader: - kv  30:               tokenizer.ggml.add_bos_token bool             = true
-llama_model_loader: - kv  31:               tokenizer.ggml.add_eos_token bool             = false
-llama_model_loader: - kv  32:                    tokenizer.chat_template str              = {{ bos_token }}\n{%- if messages[0]['r...
-llama_model_loader: - kv  33:            tokenizer.ggml.add_space_prefix bool             = false
-llama_model_loader: - kv  34:               general.quantization_version u32              = 2
-llama_model_loader: - kv  35:                          general.file_type u32              = 7
-llama_model_loader: - kv  36:                      quantize.imatrix.file str              = gemma-3-12b-it-GGUF/imatrix_unsloth.dat
-llama_model_loader: - kv  37:                   quantize.imatrix.dataset str              = unsloth_calibration_gemma-3-12b-it.txt
-llama_model_loader: - kv  38:             quantize.imatrix.entries_count i32              = 336
-llama_model_loader: - kv  39:              quantize.imatrix.chunks_count i32              = 663
-llama_model_loader: - type  f32:  289 tensors
-llama_model_loader: - type q8_0:  311 tensors
-llama_model_loader: - type bf16:   26 tensors
-print_info: file format = GGUF V3 (latest)
-print_info: file type   = Q8_0
-print_info: file size   = 13.40 GiB (9.78 BPW) 
-load: special tokens cache size = 6415
-load: token to piece cache size = 1.9446 MB
-print_info: arch             = gemma3
-print_info: vocab_only       = 0
-print_info: n_ctx_train      = 131072
-print_info: n_embd           = 3840
-print_info: n_layer          = 48
-print_info: n_head           = 16
-print_info: n_head_kv        = 8
-print_info: n_rot            = 256
-print_info: n_swa            = 1024
-print_info: is_swa_any       = 1
-print_info: n_embd_head_k    = 256
-print_info: n_embd_head_v    = 256
-print_info: n_gqa            = 2
-print_info: n_embd_k_gqa     = 2048
-print_info: n_embd_v_gqa     = 2048
-print_info: f_norm_eps       = 0.0e+00
-print_info: f_norm_rms_eps   = 1.0e-06
-print_info: f_clamp_kqv      = 0.0e+00
-print_info: f_max_alibi_bias = 0.0e+00
-print_info: f_logit_scale    = 0.0e+00
-print_info: f_attn_scale     = 6.2e-02
-print_info: n_ff             = 15360
-print_info: n_expert         = 0
-print_info: n_expert_used    = 0
-print_info: causal attn      = 1
-print_info: pooling type     = 0
-print_info: rope type        = 2
-print_info: rope scaling     = linear
-print_info: freq_base_train  = 1000000.0
-print_info: freq_scale_train = 0.125
-print_info: n_ctx_orig_yarn  = 131072
-print_info: rope_finetuned   = unknown
-print_info: model type       = 12B
-print_info: model params     = 11.77 B
-print_info: general.name     = Gemma-3-12B-It
-print_info: vocab type       = SPM
-print_info: n_vocab          = 262208
-print_info: n_merges         = 0
-print_info: BOS token        = 2 '<bos>'
-print_info: EOS token        = 106 '<end_of_turn>'
-print_info: EOT token        = 106 '<end_of_turn>'
-print_info: UNK token        = 3 '<unk>'
-print_info: PAD token        = 0 '<pad>'
-print_info: LF token         = 248 '<0x0A>'
-print_info: EOG token        = 106 '<end_of_turn>'
-print_info: max token length = 48
-load_tensors: loading model tensors, this can take a while... (mmap = false)
-load_tensors: offloading 48 repeating layers to GPU
-load_tensors: offloading output layer to GPU
-load_tensors: offloaded 49/49 layers to GPU
-load_tensors:        ROCm0 model buffer size = 13721.20 MiB
-load_tensors:    ROCm_Host model buffer size =  1920.47 MiB
-.............................................................................
-llama_context: constructing llama_context
-llama_context: non-unified KV cache requires ggml_set_rows() - forcing unified KV cache
-llama_context: n_seq_max     = 1
-llama_context: n_ctx         = 4096
-llama_context: n_ctx_per_seq = 4096
-llama_context: n_batch       = 2048
-llama_context: n_ubatch      = 512
-llama_context: causal_attn   = 1
-llama_context: flash_attn    = 1
-llama_context: kv_unified    = true
-llama_context: freq_base     = 1000000.0
-llama_context: freq_scale    = 0.125
-llama_context: n_ctx_per_seq (4096) < n_ctx_train (131072) -- the full capacity of the model will not be utilized
-llama_context:  ROCm_Host  output buffer size =     1.00 MiB
-llama_kv_cache_unified_iswa: creating non-SWA KV cache, size = 4096 cells
-llama_kv_cache_unified:      ROCm0 KV buffer size =   256.00 MiB
-llama_kv_cache_unified: size =  256.00 MiB (  4096 cells,   8 layers,  1/ 1 seqs), K (f16):  128.00 MiB, V (f16):  128.00 MiB
-llama_kv_cache_unified: LLAMA_SET_ROWS=0, using old ggml_cpy() method for backwards compatibility
-llama_kv_cache_unified_iswa: creating     SWA KV cache, size = 1536 cells
-llama_kv_cache_unified:      ROCm0 KV buffer size =   480.00 MiB
-llama_kv_cache_unified: size =  480.00 MiB (  1536 cells,  40 layers,  1/ 1 seqs), K (f16):  240.00 MiB, V (f16):  240.00 MiB
-llama_kv_cache_unified: LLAMA_SET_ROWS=0, using old ggml_cpy() method for backwards compatibility
-llama_context:      ROCm0 compute buffer size =   519.62 MiB
-llama_context:  ROCm_Host compute buffer size =    11.01 MiB
-llama_context: graph nodes  = 2025
-llama_context: graph splits = 1
-common_init_from_params: KV cache shifting is not supported for this context, disabling KV cache shifting
-common_init_from_params: added <end_of_turn> logit bias = -inf
-common_init_from_params: setting dry_penalty_last_n to ctx_size = 4096
-common_init_from_params: warming up the model with an empty run - please wait ... (--no-warmup to disable)
-main: llama threadpool init, n_threads = 16
-
-system_info: n_threads = 16 (n_threads_batch = 16) / 32 | ROCm : NO_VMM = 1 | PEER_MAX_BATCH_SIZE = 128 | CPU : SSE3 = 1 | SSSE3 = 1 | AVX = 1 | AVX2 = 1 | F16C = 1 | FMA = 1 | BMI2 = 1 | AVX512 = 1 | AVX512_VBMI = 1 | AVX512_VNNI = 1 | LLAMAFILE = 1 | OPENMP = 1 | REPACK = 1 | 
-
-sampler seed: 3471752321
-sampler params: 
-	repeat_last_n = 64, repeat_penalty = 1.000, frequency_penalty = 0.000, presence_penalty = 0.000
-	dry_multiplier = 0.000, dry_base = 1.750, dry_allowed_length = 2, dry_penalty_last_n = 4096
-	top_k = 40, top_p = 0.950, min_p = 0.050, xtc_probability = 0.000, xtc_threshold = 0.100, typical_p = 1.000, top_n_sigma = -1.000, temp = 0.800
-	mirostat = 0, mirostat_lr = 0.100, mirostat_ent = 5.000
-sampler chain: logits -> logit-bias -> penalties -> dry -> top-n-sigma -> top-k -> typical -> top-p -> min-p -> xtc -> temp-ext -> dist 
-generate: n_ctx = 4096, n_batch = 2048, n_predict = 1, n_keep = 1
-
-Hello**
-
-llama_perf_sampler_print:    sampling time =       0.09 ms /     3 runs   (    0.03 ms per token, 35294.12 tokens per second)
-llama_perf_context_print:        load time =    2510.88 ms
-llama_perf_context_print: prompt eval time =      74.99 ms /     2 tokens (   37.49 ms per token,    26.67 tokens per second)
-llama_perf_context_print:        eval time =       0.00 ms /     1 runs   (    0.00 ms per token,      inf tokens per second)
-llama_perf_context_print:       total time =      79.74 ms /     3 tokens
-llama_perf_context_print:    graphs reused =          0
-    Elapsed #3: 6.594391168s
-    Run #3 status: 0
-  → Avg over 3 runs: 6.686s
@@ -1,165 +0,0 @@
-ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
-ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
-ggml_cuda_init: found 1 ROCm devices:
-  Device 0: AMD Radeon Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32
-build: 6040 (66625a59) with cc (GCC) 15.1.1 20250719 (Red Hat 15.1.1-5) for x86_64-redhat-linux
-main: llama backend init
-main: load the model and apply lora adapter, if any
-llama_model_load_from_file_impl: using device ROCm0 (AMD Radeon Graphics) - 124523 MiB free
-llama_model_loader: loaded meta data with 40 key-value pairs and 626 tensors from /home/kyuz0/models/gemma-3-12b-it-UD-Q8_K_XL/gemma-3-12b-it-UD-Q8_K_XL.gguf (version GGUF V3 (latest))
-llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
-llama_model_loader: - kv   0:                       general.architecture str              = gemma3
-llama_model_loader: - kv   1:                               general.type str              = model
-llama_model_loader: - kv   2:                               general.name str              = Gemma-3-12B-It
-llama_model_loader: - kv   3:                           general.finetune str              = it
-llama_model_loader: - kv   4:                           general.basename str              = Gemma-3-12B-It
-llama_model_loader: - kv   5:                       general.quantized_by str              = Unsloth
-llama_model_loader: - kv   6:                         general.size_label str              = 12B
-llama_model_loader: - kv   7:                           general.repo_url str              = https://huggingface.co/unsloth
-llama_model_loader: - kv   8:                      gemma3.context_length u32              = 131072
-llama_model_loader: - kv   9:                    gemma3.embedding_length u32              = 3840
-llama_model_loader: - kv  10:                         gemma3.block_count u32              = 48
-llama_model_loader: - kv  11:                 gemma3.feed_forward_length u32              = 15360
-llama_model_loader: - kv  12:                gemma3.attention.head_count u32              = 16
-llama_model_loader: - kv  13:    gemma3.attention.layer_norm_rms_epsilon f32              = 0.000001
-llama_model_loader: - kv  14:                gemma3.attention.key_length u32              = 256
-llama_model_loader: - kv  15:              gemma3.attention.value_length u32              = 256
-llama_model_loader: - kv  16:                      gemma3.rope.freq_base f32              = 1000000.000000
-llama_model_loader: - kv  17:            gemma3.attention.sliding_window u32              = 1024
-llama_model_loader: - kv  18:             gemma3.attention.head_count_kv u32              = 8
-llama_model_loader: - kv  19:                   gemma3.rope.scaling.type str              = linear
-llama_model_loader: - kv  20:                 gemma3.rope.scaling.factor f32              = 8.000000
-llama_model_loader: - kv  21:                       tokenizer.ggml.model str              = llama
-llama_model_loader: - kv  22:                         tokenizer.ggml.pre str              = default
-llama_model_loader: - kv  23:                      tokenizer.ggml.tokens arr[str,262208]  = ["<pad>", "<eos>", "<bos>", "<unk>", ...
-llama_model_loader: - kv  24:                      tokenizer.ggml.scores arr[f32,262208]  = [-1000.000000, -1000.000000, -1000.00...
-llama_model_loader: - kv  25:                  tokenizer.ggml.token_type arr[i32,262208]  = [3, 3, 3, 3, 3, 4, 3, 3, 3, 3, 3, 3, ...
-llama_model_loader: - kv  26:                tokenizer.ggml.bos_token_id u32              = 2
-llama_model_loader: - kv  27:                tokenizer.ggml.eos_token_id u32              = 106
-llama_model_loader: - kv  28:            tokenizer.ggml.unknown_token_id u32              = 3
-llama_model_loader: - kv  29:            tokenizer.ggml.padding_token_id u32              = 0
-llama_model_loader: - kv  30:               tokenizer.ggml.add_bos_token bool             = true
-llama_model_loader: - kv  31:               tokenizer.ggml.add_eos_token bool             = false
-llama_model_loader: - kv  32:                    tokenizer.chat_template str              = {{ bos_token }}\n{%- if messages[0]['r...
-llama_model_loader: - kv  33:            tokenizer.ggml.add_space_prefix bool             = false
-llama_model_loader: - kv  34:               general.quantization_version u32              = 2
-llama_model_loader: - kv  35:                          general.file_type u32              = 7
-llama_model_loader: - kv  36:                      quantize.imatrix.file str              = gemma-3-12b-it-GGUF/imatrix_unsloth.dat
-llama_model_loader: - kv  37:                   quantize.imatrix.dataset str              = unsloth_calibration_gemma-3-12b-it.txt
-llama_model_loader: - kv  38:             quantize.imatrix.entries_count i32              = 336
-llama_model_loader: - kv  39:              quantize.imatrix.chunks_count i32              = 663
-llama_model_loader: - type  f32:  289 tensors
-llama_model_loader: - type q8_0:  311 tensors
-llama_model_loader: - type bf16:   26 tensors
-print_info: file format = GGUF V3 (latest)
-print_info: file type   = Q8_0
-print_info: file size   = 13.40 GiB (9.78 BPW) 
-load: special tokens cache size = 6415
-load: token to piece cache size = 1.9446 MB
-print_info: arch             = gemma3
-print_info: vocab_only       = 0
-print_info: n_ctx_train      = 131072
-print_info: n_embd           = 3840
-print_info: n_layer          = 48
-print_info: n_head           = 16
-print_info: n_head_kv        = 8
-print_info: n_rot            = 256
-print_info: n_swa            = 1024
-print_info: is_swa_any       = 1
-print_info: n_embd_head_k    = 256
-print_info: n_embd_head_v    = 256
-print_info: n_gqa            = 2
-print_info: n_embd_k_gqa     = 2048
-print_info: n_embd_v_gqa     = 2048
-print_info: f_norm_eps       = 0.0e+00
-print_info: f_norm_rms_eps   = 1.0e-06
-print_info: f_clamp_kqv      = 0.0e+00
-print_info: f_max_alibi_bias = 0.0e+00
-print_info: f_logit_scale    = 0.0e+00
-print_info: f_attn_scale     = 6.2e-02
-print_info: n_ff             = 15360
-print_info: n_expert         = 0
-print_info: n_expert_used    = 0
-print_info: causal attn      = 1
-print_info: pooling type     = 0
-print_info: rope type        = 2
-print_info: rope scaling     = linear
-print_info: freq_base_train  = 1000000.0
-print_info: freq_scale_train = 0.125
-print_info: n_ctx_orig_yarn  = 131072
-print_info: rope_finetuned   = unknown
-print_info: model type       = 12B
-print_info: model params     = 11.77 B
-print_info: general.name     = Gemma-3-12B-It
-print_info: vocab type       = SPM
-print_info: n_vocab          = 262208
-print_info: n_merges         = 0
-print_info: BOS token        = 2 '<bos>'
-print_info: EOS token        = 106 '<end_of_turn>'
-print_info: EOT token        = 106 '<end_of_turn>'
-print_info: UNK token        = 3 '<unk>'
-print_info: PAD token        = 0 '<pad>'
-print_info: LF token         = 248 '<0x0A>'
-print_info: EOG token        = 106 '<end_of_turn>'
-print_info: max token length = 48
-load_tensors: loading model tensors, this can take a while... (mmap = false)
-load_tensors: offloading 48 repeating layers to GPU
-load_tensors: offloading output layer to GPU
-load_tensors: offloaded 49/49 layers to GPU
-load_tensors:        ROCm0 model buffer size = 13721.20 MiB
-load_tensors:    ROCm_Host model buffer size =  1920.47 MiB
-.............................................................................
-llama_context: constructing llama_context
-llama_context: non-unified KV cache requires ggml_set_rows() - forcing unified KV cache
-llama_context: n_seq_max     = 1
-llama_context: n_ctx         = 4096
-llama_context: n_ctx_per_seq = 4096
-llama_context: n_batch       = 2048
-llama_context: n_ubatch      = 512
-llama_context: causal_attn   = 1
-llama_context: flash_attn    = 1
-llama_context: kv_unified    = true
-llama_context: freq_base     = 1000000.0
-llama_context: freq_scale    = 0.125
-llama_context: n_ctx_per_seq (4096) < n_ctx_train (131072) -- the full capacity of the model will not be utilized
-llama_context:  ROCm_Host  output buffer size =     1.00 MiB
-llama_kv_cache_unified_iswa: creating non-SWA KV cache, size = 4096 cells
-llama_kv_cache_unified:      ROCm0 KV buffer size =   256.00 MiB
-llama_kv_cache_unified: size =  256.00 MiB (  4096 cells,   8 layers,  1/ 1 seqs), K (f16):  128.00 MiB, V (f16):  128.00 MiB
-llama_kv_cache_unified: LLAMA_SET_ROWS=0, using old ggml_cpy() method for backwards compatibility
-llama_kv_cache_unified_iswa: creating     SWA KV cache, size = 1536 cells
-llama_kv_cache_unified:      ROCm0 KV buffer size =   480.00 MiB
-llama_kv_cache_unified: size =  480.00 MiB (  1536 cells,  40 layers,  1/ 1 seqs), K (f16):  240.00 MiB, V (f16):  240.00 MiB
-llama_kv_cache_unified: LLAMA_SET_ROWS=0, using old ggml_cpy() method for backwards compatibility
-llama_context:      ROCm0 compute buffer size =   519.62 MiB
-llama_context:  ROCm_Host compute buffer size =    11.01 MiB
-llama_context: graph nodes  = 2025
-llama_context: graph splits = 1
-common_init_from_params: KV cache shifting is not supported for this context, disabling KV cache shifting
-common_init_from_params: added <end_of_turn> logit bias = -inf
-common_init_from_params: setting dry_penalty_last_n to ctx_size = 4096
-common_init_from_params: warming up the model with an empty run - please wait ... (--no-warmup to disable)
-main: llama threadpool init, n_threads = 16
-
-system_info: n_threads = 16 (n_threads_batch = 16) / 32 | ROCm : NO_VMM = 1 | PEER_MAX_BATCH_SIZE = 128 | CPU : SSE3 = 1 | SSSE3 = 1 | AVX = 1 | AVX2 = 1 | F16C = 1 | FMA = 1 | BMI2 = 1 | AVX512 = 1 | AVX512_VBMI = 1 | AVX512_VNNI = 1 | LLAMAFILE = 1 | OPENMP = 1 | REPACK = 1 | 
-
-sampler seed: 854716185
-sampler params: 
-	repeat_last_n = 64, repeat_penalty = 1.000, frequency_penalty = 0.000, presence_penalty = 0.000
-	dry_multiplier = 0.000, dry_base = 1.750, dry_allowed_length = 2, dry_penalty_last_n = 4096
-	top_k = 40, top_p = 0.950, min_p = 0.050, xtc_probability = 0.000, xtc_threshold = 0.100, typical_p = 1.000, top_n_sigma = -1.000, temp = 0.800
-	mirostat = 0, mirostat_lr = 0.100, mirostat_ent = 5.000
-sampler chain: logits -> logit-bias -> penalties -> dry -> top-n-sigma -> top-k -> typical -> top-p -> min-p -> xtc -> temp-ext -> dist 
-generate: n_ctx = 4096, n_batch = 2048, n_predict = 1, n_keep = 1
-
-HelloWhat
-
-llama_perf_sampler_print:    sampling time =       0.14 ms /     3 runs   (    0.05 ms per token, 21428.57 tokens per second)
-llama_perf_context_print:        load time =    2695.72 ms
-llama_perf_context_print: prompt eval time =      75.18 ms /     2 tokens (   37.59 ms per token,    26.60 tokens per second)
-llama_perf_context_print:        eval time =       0.00 ms /     1 runs   (    0.00 ms per token,      inf tokens per second)
-llama_perf_context_print:       total time =      82.57 ms /     3 tokens
-llama_perf_context_print:    graphs reused =          0
-    Elapsed #3: 3.208919123s
-    Run #3 status: 0
-  → Avg over 3 runs: 3.434s
@@ -1,165 +0,0 @@
-ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
-ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
-ggml_cuda_init: found 1 ROCm devices:
-  Device 0: AMD Radeon Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32
-build: 6066 (4cb208c9) with cc (GCC) 15.1.1 20250719 (Red Hat 15.1.1-5) for x86_64-redhat-linux
-main: llama backend init
-main: load the model and apply lora adapter, if any
-llama_model_load_from_file_impl: using device ROCm0 (AMD Radeon Graphics) - 124523 MiB free
-llama_model_loader: loaded meta data with 40 key-value pairs and 626 tensors from /home/kyuz0/models/gemma-3-12b-it-UD-Q8_K_XL/gemma-3-12b-it-UD-Q8_K_XL.gguf (version GGUF V3 (latest))
-llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
-llama_model_loader: - kv   0:                       general.architecture str              = gemma3
-llama_model_loader: - kv   1:                               general.type str              = model
-llama_model_loader: - kv   2:                               general.name str              = Gemma-3-12B-It
-llama_model_loader: - kv   3:                           general.finetune str              = it
-llama_model_loader: - kv   4:                           general.basename str              = Gemma-3-12B-It
-llama_model_loader: - kv   5:                       general.quantized_by str              = Unsloth
-llama_model_loader: - kv   6:                         general.size_label str              = 12B
-llama_model_loader: - kv   7:                           general.repo_url str              = https://huggingface.co/unsloth
-llama_model_loader: - kv   8:                      gemma3.context_length u32              = 131072
-llama_model_loader: - kv   9:                    gemma3.embedding_length u32              = 3840
-llama_model_loader: - kv  10:                         gemma3.block_count u32              = 48
-llama_model_loader: - kv  11:                 gemma3.feed_forward_length u32              = 15360
-llama_model_loader: - kv  12:                gemma3.attention.head_count u32              = 16
-llama_model_loader: - kv  13:    gemma3.attention.layer_norm_rms_epsilon f32              = 0.000001
-llama_model_loader: - kv  14:                gemma3.attention.key_length u32              = 256
-llama_model_loader: - kv  15:              gemma3.attention.value_length u32              = 256
-llama_model_loader: - kv  16:                      gemma3.rope.freq_base f32              = 1000000.000000
-llama_model_loader: - kv  17:            gemma3.attention.sliding_window u32              = 1024
-llama_model_loader: - kv  18:             gemma3.attention.head_count_kv u32              = 8
-llama_model_loader: - kv  19:                   gemma3.rope.scaling.type str              = linear
-llama_model_loader: - kv  20:                 gemma3.rope.scaling.factor f32              = 8.000000
-llama_model_loader: - kv  21:                       tokenizer.ggml.model str              = llama
-llama_model_loader: - kv  22:                         tokenizer.ggml.pre str              = default
-llama_model_loader: - kv  23:                      tokenizer.ggml.tokens arr[str,262208]  = ["<pad>", "<eos>", "<bos>", "<unk>", ...
-llama_model_loader: - kv  24:                      tokenizer.ggml.scores arr[f32,262208]  = [-1000.000000, -1000.000000, -1000.00...
-llama_model_loader: - kv  25:                  tokenizer.ggml.token_type arr[i32,262208]  = [3, 3, 3, 3, 3, 4, 3, 3, 3, 3, 3, 3, ...
-llama_model_loader: - kv  26:                tokenizer.ggml.bos_token_id u32              = 2
-llama_model_loader: - kv  27:                tokenizer.ggml.eos_token_id u32              = 106
-llama_model_loader: - kv  28:            tokenizer.ggml.unknown_token_id u32              = 3
-llama_model_loader: - kv  29:            tokenizer.ggml.padding_token_id u32              = 0
-llama_model_loader: - kv  30:               tokenizer.ggml.add_bos_token bool             = true
-llama_model_loader: - kv  31:               tokenizer.ggml.add_eos_token bool             = false
-llama_model_loader: - kv  32:                    tokenizer.chat_template str              = {{ bos_token }}\n{%- if messages[0]['r...
-llama_model_loader: - kv  33:            tokenizer.ggml.add_space_prefix bool             = false
-llama_model_loader: - kv  34:               general.quantization_version u32              = 2
-llama_model_loader: - kv  35:                          general.file_type u32              = 7
-llama_model_loader: - kv  36:                      quantize.imatrix.file str              = gemma-3-12b-it-GGUF/imatrix_unsloth.dat
-llama_model_loader: - kv  37:                   quantize.imatrix.dataset str              = unsloth_calibration_gemma-3-12b-it.txt
-llama_model_loader: - kv  38:             quantize.imatrix.entries_count i32              = 336
-llama_model_loader: - kv  39:              quantize.imatrix.chunks_count i32              = 663
-llama_model_loader: - type  f32:  289 tensors
-llama_model_loader: - type q8_0:  311 tensors
-llama_model_loader: - type bf16:   26 tensors
-print_info: file format = GGUF V3 (latest)
-print_info: file type   = Q8_0
-print_info: file size   = 13.40 GiB (9.78 BPW) 
-load: special tokens cache size = 6415
-load: token to piece cache size = 1.9446 MB
-print_info: arch             = gemma3
-print_info: vocab_only       = 0
-print_info: n_ctx_train      = 131072
-print_info: n_embd           = 3840
-print_info: n_layer          = 48
-print_info: n_head           = 16
-print_info: n_head_kv        = 8
-print_info: n_rot            = 256
-print_info: n_swa            = 1024
-print_info: is_swa_any       = 1
-print_info: n_embd_head_k    = 256
-print_info: n_embd_head_v    = 256
-print_info: n_gqa            = 2
-print_info: n_embd_k_gqa     = 2048
-print_info: n_embd_v_gqa     = 2048
-print_info: f_norm_eps       = 0.0e+00
-print_info: f_norm_rms_eps   = 1.0e-06
-print_info: f_clamp_kqv      = 0.0e+00
-print_info: f_max_alibi_bias = 0.0e+00
-print_info: f_logit_scale    = 0.0e+00
-print_info: f_attn_scale     = 6.2e-02
-print_info: n_ff             = 15360
-print_info: n_expert         = 0
-print_info: n_expert_used    = 0
-print_info: causal attn      = 1
-print_info: pooling type     = 0
-print_info: rope type        = 2
-print_info: rope scaling     = linear
-print_info: freq_base_train  = 1000000.0
-print_info: freq_scale_train = 0.125
-print_info: n_ctx_orig_yarn  = 131072
-print_info: rope_finetuned   = unknown
-print_info: model type       = 12B
-print_info: model params     = 11.77 B
-print_info: general.name     = Gemma-3-12B-It
-print_info: vocab type       = SPM
-print_info: n_vocab          = 262208
-print_info: n_merges         = 0
-print_info: BOS token        = 2 '<bos>'
-print_info: EOS token        = 106 '<end_of_turn>'
-print_info: EOT token        = 106 '<end_of_turn>'
-print_info: UNK token        = 3 '<unk>'
-print_info: PAD token        = 0 '<pad>'
-print_info: LF token         = 248 '<0x0A>'
-print_info: EOG token        = 106 '<end_of_turn>'
-print_info: max token length = 48
-load_tensors: loading model tensors, this can take a while... (mmap = false)
-load_tensors: offloading 48 repeating layers to GPU
-load_tensors: offloading output layer to GPU
-load_tensors: offloaded 49/49 layers to GPU
-load_tensors:        ROCm0 model buffer size = 13721.20 MiB
-load_tensors:    ROCm_Host model buffer size =  1920.47 MiB
-.............................................................................
-llama_context: constructing llama_context
-llama_context: non-unified KV cache requires ggml_set_rows() - forcing unified KV cache
-llama_context: n_seq_max     = 1
-llama_context: n_ctx         = 4096
-llama_context: n_ctx_per_seq = 4096
-llama_context: n_batch       = 2048
-llama_context: n_ubatch      = 512
-llama_context: causal_attn   = 1
-llama_context: flash_attn    = 1
-llama_context: kv_unified    = true
-llama_context: freq_base     = 1000000.0
-llama_context: freq_scale    = 0.125
-llama_context: n_ctx_per_seq (4096) < n_ctx_train (131072) -- the full capacity of the model will not be utilized
-llama_context:  ROCm_Host  output buffer size =     1.00 MiB
-llama_kv_cache_unified_iswa: creating non-SWA KV cache, size = 4096 cells
-llama_kv_cache_unified:      ROCm0 KV buffer size =   256.00 MiB
-llama_kv_cache_unified: size =  256.00 MiB (  4096 cells,   8 layers,  1/ 1 seqs), K (f16):  128.00 MiB, V (f16):  128.00 MiB
-llama_kv_cache_unified: LLAMA_SET_ROWS=0, using old ggml_cpy() method for backwards compatibility
-llama_kv_cache_unified_iswa: creating     SWA KV cache, size = 1536 cells
-llama_kv_cache_unified:      ROCm0 KV buffer size =   480.00 MiB
-llama_kv_cache_unified: size =  480.00 MiB (  1536 cells,  40 layers,  1/ 1 seqs), K (f16):  240.00 MiB, V (f16):  240.00 MiB
-llama_kv_cache_unified: LLAMA_SET_ROWS=0, using old ggml_cpy() method for backwards compatibility
-llama_context:      ROCm0 compute buffer size =   519.62 MiB
-llama_context:  ROCm_Host compute buffer size =    11.01 MiB
-llama_context: graph nodes  = 2025
-llama_context: graph splits = 1
-common_init_from_params: KV cache shifting is not supported for this context, disabling KV cache shifting
-common_init_from_params: added <end_of_turn> logit bias = -inf
-common_init_from_params: setting dry_penalty_last_n to ctx_size = 4096
-common_init_from_params: warming up the model with an empty run - please wait ... (--no-warmup to disable)
-main: llama threadpool init, n_threads = 16
-
-system_info: n_threads = 16 (n_threads_batch = 16) / 32 | ROCm : NO_VMM = 1 | PEER_MAX_BATCH_SIZE = 128 | CPU : SSE3 = 1 | SSSE3 = 1 | AVX = 1 | AVX2 = 1 | F16C = 1 | FMA = 1 | BMI2 = 1 | AVX512 = 1 | AVX512_VBMI = 1 | AVX512_VNNI = 1 | LLAMAFILE = 1 | OPENMP = 1 | REPACK = 1 | 
-
-sampler seed: 754281730
-sampler params: 
-	repeat_last_n = 64, repeat_penalty = 1.000, frequency_penalty = 0.000, presence_penalty = 0.000
-	dry_multiplier = 0.000, dry_base = 1.750, dry_allowed_length = 2, dry_penalty_last_n = 4096
-	top_k = 40, top_p = 0.950, min_p = 0.050, xtc_probability = 0.000, xtc_threshold = 0.100, typical_p = 1.000, top_n_sigma = -1.000, temp = 0.800
-	mirostat = 0, mirostat_lr = 0.100, mirostat_ent = 5.000
-sampler chain: logits -> logit-bias -> penalties -> dry -> top-n-sigma -> top-k -> typical -> top-p -> min-p -> xtc -> temp-ext -> dist 
-generate: n_ctx = 4096, n_batch = 2048, n_predict = 1, n_keep = 1
-
-HelloThe
-
-llama_perf_sampler_print:    sampling time =       0.09 ms /     3 runs   (    0.03 ms per token, 32608.70 tokens per second)
-llama_perf_context_print:        load time =    3090.57 ms
-llama_perf_context_print: prompt eval time =      75.62 ms /     2 tokens (   37.81 ms per token,    26.45 tokens per second)
-llama_perf_context_print:        eval time =       0.00 ms /     1 runs   (    0.00 ms per token,      inf tokens per second)
-llama_perf_context_print:       total time =      81.49 ms /     3 tokens
-llama_perf_context_print:    graphs reused =          0
-    Elapsed #3: 3.616272374s
-    Run #3 status: 0
-  → Avg over 3 runs: 3.861s
@@ -1,163 +0,0 @@
-ggml_vulkan: Found 1 Vulkan devices:
-ggml_vulkan: 0 = Radeon 8060S Graphics (AMD open-source driver) | uma: 1 | fp16: 1 | bf16: 0 | warp size: 64 | shared memory: 32768 | int dot: 1 | matrix cores: KHR_coopmat
-build: 6060 (9c35706b) with cc (GCC) 15.1.1 20250719 (Red Hat 15.1.1-5) for x86_64-redhat-linux
-main: llama backend init
-main: load the model and apply lora adapter, if any
-llama_model_load_from_file_impl: using device Vulkan0 (Radeon 8060S Graphics) - 85720 MiB free
-llama_model_loader: loaded meta data with 40 key-value pairs and 626 tensors from /home/kyuz0/models/gemma-3-12b-it-UD-Q8_K_XL/gemma-3-12b-it-UD-Q8_K_XL.gguf (version GGUF V3 (latest))
-llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
-llama_model_loader: - kv   0:                       general.architecture str              = gemma3
-llama_model_loader: - kv   1:                               general.type str              = model
-llama_model_loader: - kv   2:                               general.name str              = Gemma-3-12B-It
-llama_model_loader: - kv   3:                           general.finetune str              = it
-llama_model_loader: - kv   4:                           general.basename str              = Gemma-3-12B-It
-llama_model_loader: - kv   5:                       general.quantized_by str              = Unsloth
-llama_model_loader: - kv   6:                         general.size_label str              = 12B
-llama_model_loader: - kv   7:                           general.repo_url str              = https://huggingface.co/unsloth
-llama_model_loader: - kv   8:                      gemma3.context_length u32              = 131072
-llama_model_loader: - kv   9:                    gemma3.embedding_length u32              = 3840
-llama_model_loader: - kv  10:                         gemma3.block_count u32              = 48
-llama_model_loader: - kv  11:                 gemma3.feed_forward_length u32              = 15360
-llama_model_loader: - kv  12:                gemma3.attention.head_count u32              = 16
-llama_model_loader: - kv  13:    gemma3.attention.layer_norm_rms_epsilon f32              = 0.000001
-llama_model_loader: - kv  14:                gemma3.attention.key_length u32              = 256
-llama_model_loader: - kv  15:              gemma3.attention.value_length u32              = 256
-llama_model_loader: - kv  16:                      gemma3.rope.freq_base f32              = 1000000.000000
-llama_model_loader: - kv  17:            gemma3.attention.sliding_window u32              = 1024
-llama_model_loader: - kv  18:             gemma3.attention.head_count_kv u32              = 8
-llama_model_loader: - kv  19:                   gemma3.rope.scaling.type str              = linear
-llama_model_loader: - kv  20:                 gemma3.rope.scaling.factor f32              = 8.000000
-llama_model_loader: - kv  21:                       tokenizer.ggml.model str              = llama
-llama_model_loader: - kv  22:                         tokenizer.ggml.pre str              = default
-llama_model_loader: - kv  23:                      tokenizer.ggml.tokens arr[str,262208]  = ["<pad>", "<eos>", "<bos>", "<unk>", ...
-llama_model_loader: - kv  24:                      tokenizer.ggml.scores arr[f32,262208]  = [-1000.000000, -1000.000000, -1000.00...
-llama_model_loader: - kv  25:                  tokenizer.ggml.token_type arr[i32,262208]  = [3, 3, 3, 3, 3, 4, 3, 3, 3, 3, 3, 3, ...
-llama_model_loader: - kv  26:                tokenizer.ggml.bos_token_id u32              = 2
-llama_model_loader: - kv  27:                tokenizer.ggml.eos_token_id u32              = 106
-llama_model_loader: - kv  28:            tokenizer.ggml.unknown_token_id u32              = 3
-llama_model_loader: - kv  29:            tokenizer.ggml.padding_token_id u32              = 0
-llama_model_loader: - kv  30:               tokenizer.ggml.add_bos_token bool             = true
-llama_model_loader: - kv  31:               tokenizer.ggml.add_eos_token bool             = false
-llama_model_loader: - kv  32:                    tokenizer.chat_template str              = {{ bos_token }}\n{%- if messages[0]['r...
-llama_model_loader: - kv  33:            tokenizer.ggml.add_space_prefix bool             = false
-llama_model_loader: - kv  34:               general.quantization_version u32              = 2
-llama_model_loader: - kv  35:                          general.file_type u32              = 7
-llama_model_loader: - kv  36:                      quantize.imatrix.file str              = gemma-3-12b-it-GGUF/imatrix_unsloth.dat
-llama_model_loader: - kv  37:                   quantize.imatrix.dataset str              = unsloth_calibration_gemma-3-12b-it.txt
-llama_model_loader: - kv  38:             quantize.imatrix.entries_count i32              = 336
-llama_model_loader: - kv  39:              quantize.imatrix.chunks_count i32              = 663
-llama_model_loader: - type  f32:  289 tensors
-llama_model_loader: - type q8_0:  311 tensors
-llama_model_loader: - type bf16:   26 tensors
-print_info: file format = GGUF V3 (latest)
-print_info: file type   = Q8_0
-print_info: file size   = 13.40 GiB (9.78 BPW) 
-load: special tokens cache size = 6415
-load: token to piece cache size = 1.9446 MB
-print_info: arch             = gemma3
-print_info: vocab_only       = 0
-print_info: n_ctx_train      = 131072
-print_info: n_embd           = 3840
-print_info: n_layer          = 48
-print_info: n_head           = 16
-print_info: n_head_kv        = 8
-print_info: n_rot            = 256
-print_info: n_swa            = 1024
-print_info: is_swa_any       = 1
-print_info: n_embd_head_k    = 256
-print_info: n_embd_head_v    = 256
-print_info: n_gqa            = 2
-print_info: n_embd_k_gqa     = 2048
-print_info: n_embd_v_gqa     = 2048
-print_info: f_norm_eps       = 0.0e+00
-print_info: f_norm_rms_eps   = 1.0e-06
-print_info: f_clamp_kqv      = 0.0e+00
-print_info: f_max_alibi_bias = 0.0e+00
-print_info: f_logit_scale    = 0.0e+00
-print_info: f_attn_scale     = 6.2e-02
-print_info: n_ff             = 15360
-print_info: n_expert         = 0
-print_info: n_expert_used    = 0
-print_info: causal attn      = 1
-print_info: pooling type     = 0
-print_info: rope type        = 2
-print_info: rope scaling     = linear
-print_info: freq_base_train  = 1000000.0
-print_info: freq_scale_train = 0.125
-print_info: n_ctx_orig_yarn  = 131072
-print_info: rope_finetuned   = unknown
-print_info: model type       = 12B
-print_info: model params     = 11.77 B
-print_info: general.name     = Gemma-3-12B-It
-print_info: vocab type       = SPM
-print_info: n_vocab          = 262208
-print_info: n_merges         = 0
-print_info: BOS token        = 2 '<bos>'
-print_info: EOS token        = 106 '<end_of_turn>'
-print_info: EOT token        = 106 '<end_of_turn>'
-print_info: UNK token        = 3 '<unk>'
-print_info: PAD token        = 0 '<pad>'
-print_info: LF token         = 248 '<0x0A>'
-print_info: EOG token        = 106 '<end_of_turn>'
-print_info: max token length = 48
-load_tensors: loading model tensors, this can take a while... (mmap = false)
-load_tensors: offloading 48 repeating layers to GPU
-load_tensors: offloading output layer to GPU
-load_tensors: offloaded 49/49 layers to GPU
-load_tensors:      Vulkan0 model buffer size = 13721.12 MiB
-load_tensors:  Vulkan_Host model buffer size =  1920.47 MiB
-.............................................................................
-llama_context: constructing llama_context
-llama_context: non-unified KV cache requires ggml_set_rows() - forcing unified KV cache
-llama_context: n_seq_max     = 1
-llama_context: n_ctx         = 4096
-llama_context: n_ctx_per_seq = 4096
-llama_context: n_batch       = 2048
-llama_context: n_ubatch      = 512
-llama_context: causal_attn   = 1
-llama_context: flash_attn    = 1
-llama_context: kv_unified    = true
-llama_context: freq_base     = 1000000.0
-llama_context: freq_scale    = 0.125
-llama_context: n_ctx_per_seq (4096) < n_ctx_train (131072) -- the full capacity of the model will not be utilized
-llama_context: Vulkan_Host  output buffer size =     1.00 MiB
-llama_kv_cache_unified_iswa: creating non-SWA KV cache, size = 4096 cells
-llama_kv_cache_unified:    Vulkan0 KV buffer size =   256.00 MiB
-llama_kv_cache_unified: size =  256.00 MiB (  4096 cells,   8 layers,  1/ 1 seqs), K (f16):  128.00 MiB, V (f16):  128.00 MiB
-llama_kv_cache_unified: LLAMA_SET_ROWS=0, using old ggml_cpy() method for backwards compatibility
-llama_kv_cache_unified_iswa: creating     SWA KV cache, size = 1536 cells
-llama_kv_cache_unified:    Vulkan0 KV buffer size =   480.00 MiB
-llama_kv_cache_unified: size =  480.00 MiB (  1536 cells,  40 layers,  1/ 1 seqs), K (f16):  240.00 MiB, V (f16):  240.00 MiB
-llama_kv_cache_unified: LLAMA_SET_ROWS=0, using old ggml_cpy() method for backwards compatibility
-llama_context:    Vulkan0 compute buffer size =   519.62 MiB
-llama_context: Vulkan_Host compute buffer size =    18.51 MiB
-llama_context: graph nodes  = 2025
-llama_context: graph splits = 2
-common_init_from_params: KV cache shifting is not supported for this context, disabling KV cache shifting
-common_init_from_params: added <end_of_turn> logit bias = -inf
-common_init_from_params: setting dry_penalty_last_n to ctx_size = 4096
-common_init_from_params: warming up the model with an empty run - please wait ... (--no-warmup to disable)
-main: llama threadpool init, n_threads = 16
-
-system_info: n_threads = 16 (n_threads_batch = 16) / 32 | CPU : SSE3 = 1 | SSSE3 = 1 | AVX = 1 | AVX2 = 1 | F16C = 1 | FMA = 1 | BMI2 = 1 | AVX512 = 1 | AVX512_VBMI = 1 | AVX512_VNNI = 1 | LLAMAFILE = 1 | OPENMP = 1 | REPACK = 1 | 
-
-sampler seed: 356896032
-sampler params: 
-	repeat_last_n = 64, repeat_penalty = 1.000, frequency_penalty = 0.000, presence_penalty = 0.000
-	dry_multiplier = 0.000, dry_base = 1.750, dry_allowed_length = 2, dry_penalty_last_n = 4096
-	top_k = 40, top_p = 0.950, min_p = 0.050, xtc_probability = 0.000, xtc_threshold = 0.100, typical_p = 1.000, top_n_sigma = -1.000, temp = 0.800
-	mirostat = 0, mirostat_lr = 0.100, mirostat_ent = 5.000
-sampler chain: logits -> logit-bias -> penalties -> dry -> top-n-sigma -> top-k -> typical -> top-p -> min-p -> xtc -> temp-ext -> dist 
-generate: n_ctx = 4096, n_batch = 2048, n_predict = 1, n_keep = 1
-
-Hello
-
-llama_perf_sampler_print:    sampling time =       0.12 ms /     3 runs   (    0.04 ms per token, 24390.24 tokens per second)
-llama_perf_context_print:        load time =    3459.76 ms
-llama_perf_context_print: prompt eval time =      90.54 ms /     2 tokens (   45.27 ms per token,    22.09 tokens per second)
-llama_perf_context_print:        eval time =       0.00 ms /     1 runs   (    0.00 ms per token,      inf tokens per second)
-llama_perf_context_print:       total time =      98.48 ms /     3 tokens
-llama_perf_context_print:    graphs reused =          0
-    Elapsed #3: 3.933674345s
-    Run #3 status: 0
-  → Avg over 3 runs: 3.955s
@@ -1,163 +0,0 @@
-ggml_vulkan: Found 1 Vulkan devices:
-ggml_vulkan: 0 = Radeon 8060S Graphics (RADV GFX1151) (radv) | uma: 1 | fp16: 1 | bf16: 0 | warp size: 64 | shared memory: 65536 | int dot: 1 | matrix cores: KHR_coopmat
-build: 6040 (66625a59) with cc (GCC) 15.1.1 20250719 (Red Hat 15.1.1-5) for x86_64-redhat-linux
-main: llama backend init
-main: load the model and apply lora adapter, if any
-llama_model_load_from_file_impl: using device Vulkan0 (Radeon 8060S Graphics (RADV GFX1151)) - 87722 MiB free
-llama_model_loader: loaded meta data with 40 key-value pairs and 626 tensors from /home/kyuz0/models/gemma-3-12b-it-UD-Q8_K_XL/gemma-3-12b-it-UD-Q8_K_XL.gguf (version GGUF V3 (latest))
-llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
-llama_model_loader: - kv   0:                       general.architecture str              = gemma3
-llama_model_loader: - kv   1:                               general.type str              = model
-llama_model_loader: - kv   2:                               general.name str              = Gemma-3-12B-It
-llama_model_loader: - kv   3:                           general.finetune str              = it
-llama_model_loader: - kv   4:                           general.basename str              = Gemma-3-12B-It
-llama_model_loader: - kv   5:                       general.quantized_by str              = Unsloth
-llama_model_loader: - kv   6:                         general.size_label str              = 12B
-llama_model_loader: - kv   7:                           general.repo_url str              = https://huggingface.co/unsloth
-llama_model_loader: - kv   8:                      gemma3.context_length u32              = 131072
-llama_model_loader: - kv   9:                    gemma3.embedding_length u32              = 3840
-llama_model_loader: - kv  10:                         gemma3.block_count u32              = 48
-llama_model_loader: - kv  11:                 gemma3.feed_forward_length u32              = 15360
-llama_model_loader: - kv  12:                gemma3.attention.head_count u32              = 16
-llama_model_loader: - kv  13:    gemma3.attention.layer_norm_rms_epsilon f32              = 0.000001
-llama_model_loader: - kv  14:                gemma3.attention.key_length u32              = 256
-llama_model_loader: - kv  15:              gemma3.attention.value_length u32              = 256
-llama_model_loader: - kv  16:                      gemma3.rope.freq_base f32              = 1000000.000000
-llama_model_loader: - kv  17:            gemma3.attention.sliding_window u32              = 1024
-llama_model_loader: - kv  18:             gemma3.attention.head_count_kv u32              = 8
-llama_model_loader: - kv  19:                   gemma3.rope.scaling.type str              = linear
-llama_model_loader: - kv  20:                 gemma3.rope.scaling.factor f32              = 8.000000
-llama_model_loader: - kv  21:                       tokenizer.ggml.model str              = llama
-llama_model_loader: - kv  22:                         tokenizer.ggml.pre str              = default
-llama_model_loader: - kv  23:                      tokenizer.ggml.tokens arr[str,262208]  = ["<pad>", "<eos>", "<bos>", "<unk>", ...
-llama_model_loader: - kv  24:                      tokenizer.ggml.scores arr[f32,262208]  = [-1000.000000, -1000.000000, -1000.00...
-llama_model_loader: - kv  25:                  tokenizer.ggml.token_type arr[i32,262208]  = [3, 3, 3, 3, 3, 4, 3, 3, 3, 3, 3, 3, ...
-llama_model_loader: - kv  26:                tokenizer.ggml.bos_token_id u32              = 2
-llama_model_loader: - kv  27:                tokenizer.ggml.eos_token_id u32              = 106
-llama_model_loader: - kv  28:            tokenizer.ggml.unknown_token_id u32              = 3
-llama_model_loader: - kv  29:            tokenizer.ggml.padding_token_id u32              = 0
-llama_model_loader: - kv  30:               tokenizer.ggml.add_bos_token bool             = true
-llama_model_loader: - kv  31:               tokenizer.ggml.add_eos_token bool             = false
-llama_model_loader: - kv  32:                    tokenizer.chat_template str              = {{ bos_token }}\n{%- if messages[0]['r...
-llama_model_loader: - kv  33:            tokenizer.ggml.add_space_prefix bool             = false
-llama_model_loader: - kv  34:               general.quantization_version u32              = 2
-llama_model_loader: - kv  35:                          general.file_type u32              = 7
-llama_model_loader: - kv  36:                      quantize.imatrix.file str              = gemma-3-12b-it-GGUF/imatrix_unsloth.dat
-llama_model_loader: - kv  37:                   quantize.imatrix.dataset str              = unsloth_calibration_gemma-3-12b-it.txt
-llama_model_loader: - kv  38:             quantize.imatrix.entries_count i32              = 336
-llama_model_loader: - kv  39:              quantize.imatrix.chunks_count i32              = 663
-llama_model_loader: - type  f32:  289 tensors
-llama_model_loader: - type q8_0:  311 tensors
-llama_model_loader: - type bf16:   26 tensors
-print_info: file format = GGUF V3 (latest)
-print_info: file type   = Q8_0
-print_info: file size   = 13.40 GiB (9.78 BPW) 
-load: special tokens cache size = 6415
-load: token to piece cache size = 1.9446 MB
-print_info: arch             = gemma3
-print_info: vocab_only       = 0
-print_info: n_ctx_train      = 131072
-print_info: n_embd           = 3840
-print_info: n_layer          = 48
-print_info: n_head           = 16
-print_info: n_head_kv        = 8
-print_info: n_rot            = 256
-print_info: n_swa            = 1024
-print_info: is_swa_any       = 1
-print_info: n_embd_head_k    = 256
-print_info: n_embd_head_v    = 256
-print_info: n_gqa            = 2
-print_info: n_embd_k_gqa     = 2048
-print_info: n_embd_v_gqa     = 2048
-print_info: f_norm_eps       = 0.0e+00
-print_info: f_norm_rms_eps   = 1.0e-06
-print_info: f_clamp_kqv      = 0.0e+00
-print_info: f_max_alibi_bias = 0.0e+00
-print_info: f_logit_scale    = 0.0e+00
-print_info: f_attn_scale     = 6.2e-02
-print_info: n_ff             = 15360
-print_info: n_expert         = 0
-print_info: n_expert_used    = 0
-print_info: causal attn      = 1
-print_info: pooling type     = 0
-print_info: rope type        = 2
-print_info: rope scaling     = linear
-print_info: freq_base_train  = 1000000.0
-print_info: freq_scale_train = 0.125
-print_info: n_ctx_orig_yarn  = 131072
-print_info: rope_finetuned   = unknown
-print_info: model type       = 12B
-print_info: model params     = 11.77 B
-print_info: general.name     = Gemma-3-12B-It
-print_info: vocab type       = SPM
-print_info: n_vocab          = 262208
-print_info: n_merges         = 0
-print_info: BOS token        = 2 '<bos>'
-print_info: EOS token        = 106 '<end_of_turn>'
-print_info: EOT token        = 106 '<end_of_turn>'
-print_info: UNK token        = 3 '<unk>'
-print_info: PAD token        = 0 '<pad>'
-print_info: LF token         = 248 '<0x0A>'
-print_info: EOG token        = 106 '<end_of_turn>'
-print_info: max token length = 48
-load_tensors: loading model tensors, this can take a while... (mmap = false)
-load_tensors: offloading 48 repeating layers to GPU
-load_tensors: offloading output layer to GPU
-load_tensors: offloaded 49/49 layers to GPU
-load_tensors:      Vulkan0 model buffer size = 13721.12 MiB
-load_tensors:  Vulkan_Host model buffer size =  1920.47 MiB
-.............................................................................
-llama_context: constructing llama_context
-llama_context: non-unified KV cache requires ggml_set_rows() - forcing unified KV cache
-llama_context: n_seq_max     = 1
-llama_context: n_ctx         = 4096
-llama_context: n_ctx_per_seq = 4096
-llama_context: n_batch       = 2048
-llama_context: n_ubatch      = 512
-llama_context: causal_attn   = 1
-llama_context: flash_attn    = 1
-llama_context: kv_unified    = true
-llama_context: freq_base     = 1000000.0
-llama_context: freq_scale    = 0.125
-llama_context: n_ctx_per_seq (4096) < n_ctx_train (131072) -- the full capacity of the model will not be utilized
-llama_context: Vulkan_Host  output buffer size =     1.00 MiB
-llama_kv_cache_unified_iswa: creating non-SWA KV cache, size = 4096 cells
-llama_kv_cache_unified:    Vulkan0 KV buffer size =   256.00 MiB
-llama_kv_cache_unified: size =  256.00 MiB (  4096 cells,   8 layers,  1/ 1 seqs), K (f16):  128.00 MiB, V (f16):  128.00 MiB
-llama_kv_cache_unified: LLAMA_SET_ROWS=0, using old ggml_cpy() method for backwards compatibility
-llama_kv_cache_unified_iswa: creating     SWA KV cache, size = 1536 cells
-llama_kv_cache_unified:    Vulkan0 KV buffer size =   480.00 MiB
-llama_kv_cache_unified: size =  480.00 MiB (  1536 cells,  40 layers,  1/ 1 seqs), K (f16):  240.00 MiB, V (f16):  240.00 MiB
-llama_kv_cache_unified: LLAMA_SET_ROWS=0, using old ggml_cpy() method for backwards compatibility
-llama_context:    Vulkan0 compute buffer size =   519.62 MiB
-llama_context: Vulkan_Host compute buffer size =    18.51 MiB
-llama_context: graph nodes  = 2025
-llama_context: graph splits = 2
-common_init_from_params: KV cache shifting is not supported for this context, disabling KV cache shifting
-common_init_from_params: added <end_of_turn> logit bias = -inf
-common_init_from_params: setting dry_penalty_last_n to ctx_size = 4096
-common_init_from_params: warming up the model with an empty run - please wait ... (--no-warmup to disable)
-main: llama threadpool init, n_threads = 16
-
-system_info: n_threads = 16 (n_threads_batch = 16) / 32 | CPU : SSE3 = 1 | SSSE3 = 1 | AVX = 1 | AVX2 = 1 | F16C = 1 | FMA = 1 | BMI2 = 1 | AVX512 = 1 | AVX512_VBMI = 1 | AVX512_VNNI = 1 | LLAMAFILE = 1 | OPENMP = 1 | REPACK = 1 | 
-
-sampler seed: 3541901199
-sampler params: 
-	repeat_last_n = 64, repeat_penalty = 1.000, frequency_penalty = 0.000, presence_penalty = 0.000
-	dry_multiplier = 0.000, dry_base = 1.750, dry_allowed_length = 2, dry_penalty_last_n = 4096
-	top_k = 40, top_p = 0.950, min_p = 0.050, xtc_probability = 0.000, xtc_threshold = 0.100, typical_p = 1.000, top_n_sigma = -1.000, temp = 0.800
-	mirostat = 0, mirostat_lr = 0.100, mirostat_ent = 5.000
-sampler chain: logits -> logit-bias -> penalties -> dry -> top-n-sigma -> top-k -> typical -> top-p -> min-p -> xtc -> temp-ext -> dist 
-generate: n_ctx = 4096, n_batch = 2048, n_predict = 1, n_keep = 1
-
-HelloI
-
-llama_perf_sampler_print:    sampling time =       0.12 ms /     3 runs   (    0.04 ms per token, 24590.16 tokens per second)
-llama_perf_context_print:        load time =    3946.08 ms
-llama_perf_context_print: prompt eval time =      78.51 ms /     2 tokens (   39.26 ms per token,    25.47 tokens per second)
-llama_perf_context_print:        eval time =       0.00 ms /     1 runs   (    0.00 ms per token,      inf tokens per second)
-llama_perf_context_print:       total time =      86.43 ms /     3 tokens
-llama_perf_context_print:    graphs reused =          0
-    Elapsed #3: 4.313578800s
-    Run #3 status: 0
-  → Avg over 3 runs: 4.295s
@@ -1,164 +0,0 @@
-ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
-ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
-ggml_cuda_init: found 1 ROCm devices:
-  Device 0: Radeon 8060S Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32
-build: 6040 (66625a59) with cc (GCC) 15.1.1 20250521 (Red Hat 15.1.1-2) for x86_64-redhat-linux
-main: llama backend init
-main: load the model and apply lora adapter, if any
-llama_model_load_from_file_impl: using device ROCm0 (Radeon 8060S Graphics) - 124522 MiB free
-llama_model_loader: additional 1 GGUFs metadata loaded.
-llama_model_loader: loaded meta data with 39 key-value pairs and 808 tensors from /home/kyuz0/models/gemma-3-27b-it-BF16/gemma-3-27b-it-BF16-00001-of-00002.gguf (version GGUF V3 (latest))
-llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
-llama_model_loader: - kv   0:                       general.architecture str              = gemma3
-llama_model_loader: - kv   1:                               general.type str              = model
-llama_model_loader: - kv   2:                               general.name str              = Gemma-3-27B-It
-llama_model_loader: - kv   3:                           general.finetune str              = it
-llama_model_loader: - kv   4:                           general.basename str              = Gemma-3-27B-It
-llama_model_loader: - kv   5:                       general.quantized_by str              = Unsloth
-llama_model_loader: - kv   6:                         general.size_label str              = 27B
-llama_model_loader: - kv   7:                           general.repo_url str              = https://huggingface.co/unsloth
-llama_model_loader: - kv   8:                      gemma3.context_length u32              = 131072
-llama_model_loader: - kv   9:                    gemma3.embedding_length u32              = 5376
-llama_model_loader: - kv  10:                         gemma3.block_count u32              = 62
-llama_model_loader: - kv  11:                 gemma3.feed_forward_length u32              = 21504
-llama_model_loader: - kv  12:                gemma3.attention.head_count u32              = 32
-llama_model_loader: - kv  13:    gemma3.attention.layer_norm_rms_epsilon f32              = 0.000001
-llama_model_loader: - kv  14:                gemma3.attention.key_length u32              = 128
-llama_model_loader: - kv  15:              gemma3.attention.value_length u32              = 128
-llama_model_loader: - kv  16:                          general.file_type u32              = 32
-llama_model_loader: - kv  17:                      gemma3.rope.freq_base f32              = 1000000.000000
-llama_model_loader: - kv  18:            gemma3.attention.sliding_window u32              = 1024
-llama_model_loader: - kv  19:             gemma3.attention.head_count_kv u32              = 16
-llama_model_loader: - kv  20:                   gemma3.rope.scaling.type str              = linear
-llama_model_loader: - kv  21:                 gemma3.rope.scaling.factor f32              = 8.000000
-llama_model_loader: - kv  22:               general.quantization_version u32              = 2
-llama_model_loader: - kv  23:                       tokenizer.ggml.model str              = llama
-llama_model_loader: - kv  24:                         tokenizer.ggml.pre str              = default
-llama_model_loader: - kv  25:                      tokenizer.ggml.tokens arr[str,262208]  = ["<pad>", "<eos>", "<bos>", "<unk>", ...
-llama_model_loader: - kv  26:                      tokenizer.ggml.scores arr[f32,262208]  = [-1000.000000, -1000.000000, -1000.00...
-llama_model_loader: - kv  27:                  tokenizer.ggml.token_type arr[i32,262208]  = [3, 3, 3, 3, 3, 4, 3, 3, 3, 3, 3, 3, ...
-llama_model_loader: - kv  28:                tokenizer.ggml.bos_token_id u32              = 2
-llama_model_loader: - kv  29:                tokenizer.ggml.eos_token_id u32              = 106
-llama_model_loader: - kv  30:            tokenizer.ggml.unknown_token_id u32              = 3
-llama_model_loader: - kv  31:            tokenizer.ggml.padding_token_id u32              = 0
-llama_model_loader: - kv  32:               tokenizer.ggml.add_bos_token bool             = true
-llama_model_loader: - kv  33:               tokenizer.ggml.add_eos_token bool             = false
-llama_model_loader: - kv  34:                    tokenizer.chat_template str              = {{ bos_token }}\n{%- if messages[0]['r...
-llama_model_loader: - kv  35:            tokenizer.ggml.add_space_prefix bool             = false
-llama_model_loader: - kv  36:                                   split.no u16              = 0
-llama_model_loader: - kv  37:                                split.count u16              = 2
-llama_model_loader: - kv  38:                        split.tensors.count i32              = 808
-llama_model_loader: - type  f32:  373 tensors
-llama_model_loader: - type bf16:  435 tensors
-print_info: file format = GGUF V3 (latest)
-print_info: file type   = BF16
-print_info: file size   = 50.31 GiB (16.00 BPW) 
-load: special tokens cache size = 6415
-load: token to piece cache size = 1.9446 MB
-print_info: arch             = gemma3
-print_info: vocab_only       = 0
-print_info: n_ctx_train      = 131072
-print_info: n_embd           = 5376
-print_info: n_layer          = 62
-print_info: n_head           = 32
-print_info: n_head_kv        = 16
-print_info: n_rot            = 128
-print_info: n_swa            = 1024
-print_info: is_swa_any       = 1
-print_info: n_embd_head_k    = 128
-print_info: n_embd_head_v    = 128
-print_info: n_gqa            = 2
-print_info: n_embd_k_gqa     = 2048
-print_info: n_embd_v_gqa     = 2048
-print_info: f_norm_eps       = 0.0e+00
-print_info: f_norm_rms_eps   = 1.0e-06
-print_info: f_clamp_kqv      = 0.0e+00
-print_info: f_max_alibi_bias = 0.0e+00
-print_info: f_logit_scale    = 0.0e+00
-print_info: f_attn_scale     = 7.7e-02
-print_info: n_ff             = 21504
-print_info: n_expert         = 0
-print_info: n_expert_used    = 0
-print_info: causal attn      = 1
-print_info: pooling type     = 0
-print_info: rope type        = 2
-print_info: rope scaling     = linear
-print_info: freq_base_train  = 1000000.0
-print_info: freq_scale_train = 0.125
-print_info: n_ctx_orig_yarn  = 131072
-print_info: rope_finetuned   = unknown
-print_info: model type       = 27B
-print_info: model params     = 27.01 B
-print_info: general.name     = Gemma-3-27B-It
-print_info: vocab type       = SPM
-print_info: n_vocab          = 262208
-print_info: n_merges         = 0
-print_info: BOS token        = 2 '<bos>'
-print_info: EOS token        = 106 '<end_of_turn>'
-print_info: EOT token        = 106 '<end_of_turn>'
-print_info: UNK token        = 3 '<unk>'
-print_info: PAD token        = 0 '<pad>'
-print_info: LF token         = 248 '<0x0A>'
-print_info: EOG token        = 106 '<end_of_turn>'
-print_info: max token length = 48
-load_tensors: loading model tensors, this can take a while... (mmap = false)
-load_tensors: offloading 62 repeating layers to GPU
-load_tensors: offloading output layer to GPU
-load_tensors: offloaded 63/63 layers to GPU
-load_tensors:        ROCm0 model buffer size = 51518.82 MiB
-load_tensors:    ROCm_Host model buffer size =  2688.66 MiB
-.............................................................................................
-llama_context: constructing llama_context
-llama_context: non-unified KV cache requires ggml_set_rows() - forcing unified KV cache
-llama_context: n_seq_max     = 1
-llama_context: n_ctx         = 4096
-llama_context: n_ctx_per_seq = 4096
-llama_context: n_batch       = 2048
-llama_context: n_ubatch      = 512
-llama_context: causal_attn   = 1
-llama_context: flash_attn    = 1
-llama_context: kv_unified    = true
-llama_context: freq_base     = 1000000.0
-llama_context: freq_scale    = 0.125
-llama_context: n_ctx_per_seq (4096) < n_ctx_train (131072) -- the full capacity of the model will not be utilized
-llama_context:  ROCm_Host  output buffer size =     1.00 MiB
-llama_kv_cache_unified_iswa: creating non-SWA KV cache, size = 4096 cells
-llama_kv_cache_unified:      ROCm0 KV buffer size =   320.00 MiB
-llama_kv_cache_unified: size =  320.00 MiB (  4096 cells,  10 layers,  1/ 1 seqs), K (f16):  160.00 MiB, V (f16):  160.00 MiB
-llama_kv_cache_unified: LLAMA_SET_ROWS=0, using old ggml_cpy() method for backwards compatibility
-llama_kv_cache_unified_iswa: creating     SWA KV cache, size = 1536 cells
-llama_kv_cache_unified:      ROCm0 KV buffer size =   624.00 MiB
-llama_kv_cache_unified: size =  624.00 MiB (  1536 cells,  52 layers,  1/ 1 seqs), K (f16):  312.00 MiB, V (f16):  312.00 MiB
-llama_kv_cache_unified: LLAMA_SET_ROWS=0, using old ggml_cpy() method for backwards compatibility
-llama_context:      ROCm0 compute buffer size =   522.62 MiB
-llama_context:  ROCm_Host compute buffer size =    11.01 MiB
-llama_context: graph nodes  = 2613
-llama_context: graph splits = 1
-common_init_from_params: KV cache shifting is not supported for this context, disabling KV cache shifting
-common_init_from_params: added <end_of_turn> logit bias = -inf
-common_init_from_params: setting dry_penalty_last_n to ctx_size = 4096
-common_init_from_params: warming up the model with an empty run - please wait ... (--no-warmup to disable)
-main: llama threadpool init, n_threads = 16
-
-system_info: n_threads = 16 (n_threads_batch = 16) / 32 | ROCm : NO_VMM = 1 | PEER_MAX_BATCH_SIZE = 128 | CPU : SSE3 = 1 | SSSE3 = 1 | AVX = 1 | AVX2 = 1 | F16C = 1 | FMA = 1 | BMI2 = 1 | AVX512 = 1 | AVX512_VBMI = 1 | AVX512_VNNI = 1 | LLAMAFILE = 1 | OPENMP = 1 | REPACK = 1 | 
-
-sampler seed: 204092650
-sampler params: 
-	repeat_last_n = 64, repeat_penalty = 1.000, frequency_penalty = 0.000, presence_penalty = 0.000
-	dry_multiplier = 0.000, dry_base = 1.750, dry_allowed_length = 2, dry_penalty_last_n = 4096
-	top_k = 40, top_p = 0.950, min_p = 0.050, xtc_probability = 0.000, xtc_threshold = 0.100, typical_p = 1.000, top_n_sigma = -1.000, temp = 0.800
-	mirostat = 0, mirostat_lr = 0.100, mirostat_ent = 5.000
-sampler chain: logits -> logit-bias -> penalties -> dry -> top-n-sigma -> top-k -> typical -> top-p -> min-p -> xtc -> temp-ext -> dist 
-generate: n_ctx = 4096, n_batch = 2048, n_predict = 1, n_keep = 1
-
-Hello 
-
-llama_perf_sampler_print:    sampling time =       0.08 ms /     3 runs   (    0.03 ms per token, 39473.68 tokens per second)
-llama_perf_context_print:        load time =    7815.59 ms
-llama_perf_context_print: prompt eval time =     253.33 ms /     2 tokens (  126.66 ms per token,     7.89 tokens per second)
-llama_perf_context_print:        eval time =       0.00 ms /     1 runs   (    0.00 ms per token,      inf tokens per second)
-llama_perf_context_print:       total time =     258.00 ms /     3 tokens
-llama_perf_context_print:    graphs reused =          0
-    Elapsed #3: 11.830337249s
-    Run #3 status: 0
-  → Avg over 3 runs: 12.495s
@@ -1,164 +0,0 @@
-ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
-ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
-ggml_cuda_init: found 1 ROCm devices:
-  Device 0: AMD Radeon Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32
-build: 6040 (66625a59) with cc (GCC) 15.1.1 20250719 (Red Hat 15.1.1-5) for x86_64-redhat-linux
-main: llama backend init
-main: load the model and apply lora adapter, if any
-llama_model_load_from_file_impl: using device ROCm0 (AMD Radeon Graphics) - 124523 MiB free
-llama_model_loader: additional 1 GGUFs metadata loaded.
-llama_model_loader: loaded meta data with 39 key-value pairs and 808 tensors from /home/kyuz0/models/gemma-3-27b-it-BF16/gemma-3-27b-it-BF16-00001-of-00002.gguf (version GGUF V3 (latest))
-llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
-llama_model_loader: - kv   0:                       general.architecture str              = gemma3
-llama_model_loader: - kv   1:                               general.type str              = model
-llama_model_loader: - kv   2:                               general.name str              = Gemma-3-27B-It
-llama_model_loader: - kv   3:                           general.finetune str              = it
-llama_model_loader: - kv   4:                           general.basename str              = Gemma-3-27B-It
-llama_model_loader: - kv   5:                       general.quantized_by str              = Unsloth
-llama_model_loader: - kv   6:                         general.size_label str              = 27B
-llama_model_loader: - kv   7:                           general.repo_url str              = https://huggingface.co/unsloth
-llama_model_loader: - kv   8:                      gemma3.context_length u32              = 131072
-llama_model_loader: - kv   9:                    gemma3.embedding_length u32              = 5376
-llama_model_loader: - kv  10:                         gemma3.block_count u32              = 62
-llama_model_loader: - kv  11:                 gemma3.feed_forward_length u32              = 21504
-llama_model_loader: - kv  12:                gemma3.attention.head_count u32              = 32
-llama_model_loader: - kv  13:    gemma3.attention.layer_norm_rms_epsilon f32              = 0.000001
-llama_model_loader: - kv  14:                gemma3.attention.key_length u32              = 128
-llama_model_loader: - kv  15:              gemma3.attention.value_length u32              = 128
-llama_model_loader: - kv  16:                          general.file_type u32              = 32
-llama_model_loader: - kv  17:                      gemma3.rope.freq_base f32              = 1000000.000000
-llama_model_loader: - kv  18:            gemma3.attention.sliding_window u32              = 1024
-llama_model_loader: - kv  19:             gemma3.attention.head_count_kv u32              = 16
-llama_model_loader: - kv  20:                   gemma3.rope.scaling.type str              = linear
-llama_model_loader: - kv  21:                 gemma3.rope.scaling.factor f32              = 8.000000
-llama_model_loader: - kv  22:               general.quantization_version u32              = 2
-llama_model_loader: - kv  23:                       tokenizer.ggml.model str              = llama
-llama_model_loader: - kv  24:                         tokenizer.ggml.pre str              = default
-llama_model_loader: - kv  25:                      tokenizer.ggml.tokens arr[str,262208]  = ["<pad>", "<eos>", "<bos>", "<unk>", ...
-llama_model_loader: - kv  26:                      tokenizer.ggml.scores arr[f32,262208]  = [-1000.000000, -1000.000000, -1000.00...
-llama_model_loader: - kv  27:                  tokenizer.ggml.token_type arr[i32,262208]  = [3, 3, 3, 3, 3, 4, 3, 3, 3, 3, 3, 3, ...
-llama_model_loader: - kv  28:                tokenizer.ggml.bos_token_id u32              = 2
-llama_model_loader: - kv  29:                tokenizer.ggml.eos_token_id u32              = 106
-llama_model_loader: - kv  30:            tokenizer.ggml.unknown_token_id u32              = 3
-llama_model_loader: - kv  31:            tokenizer.ggml.padding_token_id u32              = 0
-llama_model_loader: - kv  32:               tokenizer.ggml.add_bos_token bool             = true
-llama_model_loader: - kv  33:               tokenizer.ggml.add_eos_token bool             = false
-llama_model_loader: - kv  34:                    tokenizer.chat_template str              = {{ bos_token }}\n{%- if messages[0]['r...
-llama_model_loader: - kv  35:            tokenizer.ggml.add_space_prefix bool             = false
-llama_model_loader: - kv  36:                                   split.no u16              = 0
-llama_model_loader: - kv  37:                                split.count u16              = 2
-llama_model_loader: - kv  38:                        split.tensors.count i32              = 808
-llama_model_loader: - type  f32:  373 tensors
-llama_model_loader: - type bf16:  435 tensors
-print_info: file format = GGUF V3 (latest)
-print_info: file type   = BF16
-print_info: file size   = 50.31 GiB (16.00 BPW) 
-load: special tokens cache size = 6415
-load: token to piece cache size = 1.9446 MB
-print_info: arch             = gemma3
-print_info: vocab_only       = 0
-print_info: n_ctx_train      = 131072
-print_info: n_embd           = 5376
-print_info: n_layer          = 62
-print_info: n_head           = 32
-print_info: n_head_kv        = 16
-print_info: n_rot            = 128
-print_info: n_swa            = 1024
-print_info: is_swa_any       = 1
-print_info: n_embd_head_k    = 128
-print_info: n_embd_head_v    = 128
-print_info: n_gqa            = 2
-print_info: n_embd_k_gqa     = 2048
-print_info: n_embd_v_gqa     = 2048
-print_info: f_norm_eps       = 0.0e+00
-print_info: f_norm_rms_eps   = 1.0e-06
-print_info: f_clamp_kqv      = 0.0e+00
-print_info: f_max_alibi_bias = 0.0e+00
-print_info: f_logit_scale    = 0.0e+00
-print_info: f_attn_scale     = 7.7e-02
-print_info: n_ff             = 21504
-print_info: n_expert         = 0
-print_info: n_expert_used    = 0
-print_info: causal attn      = 1
-print_info: pooling type     = 0
-print_info: rope type        = 2
-print_info: rope scaling     = linear
-print_info: freq_base_train  = 1000000.0
-print_info: freq_scale_train = 0.125
-print_info: n_ctx_orig_yarn  = 131072
-print_info: rope_finetuned   = unknown
-print_info: model type       = 27B
-print_info: model params     = 27.01 B
-print_info: general.name     = Gemma-3-27B-It
-print_info: vocab type       = SPM
-print_info: n_vocab          = 262208
-print_info: n_merges         = 0
-print_info: BOS token        = 2 '<bos>'
-print_info: EOS token        = 106 '<end_of_turn>'
-print_info: EOT token        = 106 '<end_of_turn>'
-print_info: UNK token        = 3 '<unk>'
-print_info: PAD token        = 0 '<pad>'
-print_info: LF token         = 248 '<0x0A>'
-print_info: EOG token        = 106 '<end_of_turn>'
-print_info: max token length = 48
-load_tensors: loading model tensors, this can take a while... (mmap = false)
-load_tensors: offloading 62 repeating layers to GPU
-load_tensors: offloading output layer to GPU
-load_tensors: offloaded 63/63 layers to GPU
-load_tensors:        ROCm0 model buffer size = 51518.82 MiB
-load_tensors:    ROCm_Host model buffer size =  2688.66 MiB
-.............................................................................................
-llama_context: constructing llama_context
-llama_context: non-unified KV cache requires ggml_set_rows() - forcing unified KV cache
-llama_context: n_seq_max     = 1
-llama_context: n_ctx         = 4096
-llama_context: n_ctx_per_seq = 4096
-llama_context: n_batch       = 2048
-llama_context: n_ubatch      = 512
-llama_context: causal_attn   = 1
-llama_context: flash_attn    = 1
-llama_context: kv_unified    = true
-llama_context: freq_base     = 1000000.0
-llama_context: freq_scale    = 0.125
-llama_context: n_ctx_per_seq (4096) < n_ctx_train (131072) -- the full capacity of the model will not be utilized
-llama_context:  ROCm_Host  output buffer size =     1.00 MiB
-llama_kv_cache_unified_iswa: creating non-SWA KV cache, size = 4096 cells
-llama_kv_cache_unified:      ROCm0 KV buffer size =   320.00 MiB
-llama_kv_cache_unified: size =  320.00 MiB (  4096 cells,  10 layers,  1/ 1 seqs), K (f16):  160.00 MiB, V (f16):  160.00 MiB
-llama_kv_cache_unified: LLAMA_SET_ROWS=0, using old ggml_cpy() method for backwards compatibility
-llama_kv_cache_unified_iswa: creating     SWA KV cache, size = 1536 cells
-llama_kv_cache_unified:      ROCm0 KV buffer size =   624.00 MiB
-llama_kv_cache_unified: size =  624.00 MiB (  1536 cells,  52 layers,  1/ 1 seqs), K (f16):  312.00 MiB, V (f16):  312.00 MiB
-llama_kv_cache_unified: LLAMA_SET_ROWS=0, using old ggml_cpy() method for backwards compatibility
-llama_context:      ROCm0 compute buffer size =   522.62 MiB
-llama_context:  ROCm_Host compute buffer size =    11.01 MiB
-llama_context: graph nodes  = 2613
-llama_context: graph splits = 1
-common_init_from_params: KV cache shifting is not supported for this context, disabling KV cache shifting
-common_init_from_params: added <end_of_turn> logit bias = -inf
-common_init_from_params: setting dry_penalty_last_n to ctx_size = 4096
-common_init_from_params: warming up the model with an empty run - please wait ... (--no-warmup to disable)
-main: llama threadpool init, n_threads = 16
-
-system_info: n_threads = 16 (n_threads_batch = 16) / 32 | ROCm : NO_VMM = 1 | PEER_MAX_BATCH_SIZE = 128 | CPU : SSE3 = 1 | SSSE3 = 1 | AVX = 1 | AVX2 = 1 | F16C = 1 | FMA = 1 | BMI2 = 1 | AVX512 = 1 | AVX512_VBMI = 1 | AVX512_VNNI = 1 | LLAMAFILE = 1 | OPENMP = 1 | REPACK = 1 | 
-
-sampler seed: 88592582
-sampler params: 
-	repeat_last_n = 64, repeat_penalty = 1.000, frequency_penalty = 0.000, presence_penalty = 0.000
-	dry_multiplier = 0.000, dry_base = 1.750, dry_allowed_length = 2, dry_penalty_last_n = 4096
-	top_k = 40, top_p = 0.950, min_p = 0.050, xtc_probability = 0.000, xtc_threshold = 0.100, typical_p = 1.000, top_n_sigma = -1.000, temp = 0.800
-	mirostat = 0, mirostat_lr = 0.100, mirostat_ent = 5.000
-sampler chain: logits -> logit-bias -> penalties -> dry -> top-n-sigma -> top-k -> typical -> top-p -> min-p -> xtc -> temp-ext -> dist 
-generate: n_ctx = 4096, n_batch = 2048, n_predict = 1, n_keep = 1
-
-Hello,
-
-llama_perf_sampler_print:    sampling time =       0.09 ms /     3 runs   (    0.03 ms per token, 35294.12 tokens per second)
-llama_perf_context_print:        load time =   10385.57 ms
-llama_perf_context_print: prompt eval time =     253.71 ms /     2 tokens (  126.85 ms per token,     7.88 tokens per second)
-llama_perf_context_print:        eval time =       0.00 ms /     1 runs   (    0.00 ms per token,      inf tokens per second)
-llama_perf_context_print:       total time =     259.35 ms /     3 tokens
-llama_perf_context_print:    graphs reused =          0
-    Elapsed #3: 11.144656718s
-    Run #3 status: 0
-  → Avg over 3 runs: 10.486s
@@ -1,164 +0,0 @@
-ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
-ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
-ggml_cuda_init: found 1 ROCm devices:
-  Device 0: AMD Radeon Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32
-build: 6066 (4cb208c9) with cc (GCC) 15.1.1 20250719 (Red Hat 15.1.1-5) for x86_64-redhat-linux
-main: llama backend init
-main: load the model and apply lora adapter, if any
-llama_model_load_from_file_impl: using device ROCm0 (AMD Radeon Graphics) - 124523 MiB free
-llama_model_loader: additional 1 GGUFs metadata loaded.
-llama_model_loader: loaded meta data with 39 key-value pairs and 808 tensors from /home/kyuz0/models/gemma-3-27b-it-BF16/gemma-3-27b-it-BF16-00001-of-00002.gguf (version GGUF V3 (latest))
-llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
-llama_model_loader: - kv   0:                       general.architecture str              = gemma3
-llama_model_loader: - kv   1:                               general.type str              = model
-llama_model_loader: - kv   2:                               general.name str              = Gemma-3-27B-It
-llama_model_loader: - kv   3:                           general.finetune str              = it
-llama_model_loader: - kv   4:                           general.basename str              = Gemma-3-27B-It
-llama_model_loader: - kv   5:                       general.quantized_by str              = Unsloth
-llama_model_loader: - kv   6:                         general.size_label str              = 27B
-llama_model_loader: - kv   7:                           general.repo_url str              = https://huggingface.co/unsloth
-llama_model_loader: - kv   8:                      gemma3.context_length u32              = 131072
-llama_model_loader: - kv   9:                    gemma3.embedding_length u32              = 5376
-llama_model_loader: - kv  10:                         gemma3.block_count u32              = 62
-llama_model_loader: - kv  11:                 gemma3.feed_forward_length u32              = 21504
-llama_model_loader: - kv  12:                gemma3.attention.head_count u32              = 32
-llama_model_loader: - kv  13:    gemma3.attention.layer_norm_rms_epsilon f32              = 0.000001
-llama_model_loader: - kv  14:                gemma3.attention.key_length u32              = 128
-llama_model_loader: - kv  15:              gemma3.attention.value_length u32              = 128
-llama_model_loader: - kv  16:                          general.file_type u32              = 32
-llama_model_loader: - kv  17:                      gemma3.rope.freq_base f32              = 1000000.000000
-llama_model_loader: - kv  18:            gemma3.attention.sliding_window u32              = 1024
-llama_model_loader: - kv  19:             gemma3.attention.head_count_kv u32              = 16
-llama_model_loader: - kv  20:                   gemma3.rope.scaling.type str              = linear
-llama_model_loader: - kv  21:                 gemma3.rope.scaling.factor f32              = 8.000000
-llama_model_loader: - kv  22:               general.quantization_version u32              = 2
-llama_model_loader: - kv  23:                       tokenizer.ggml.model str              = llama
-llama_model_loader: - kv  24:                         tokenizer.ggml.pre str              = default
-llama_model_loader: - kv  25:                      tokenizer.ggml.tokens arr[str,262208]  = ["<pad>", "<eos>", "<bos>", "<unk>", ...
-llama_model_loader: - kv  26:                      tokenizer.ggml.scores arr[f32,262208]  = [-1000.000000, -1000.000000, -1000.00...
-llama_model_loader: - kv  27:                  tokenizer.ggml.token_type arr[i32,262208]  = [3, 3, 3, 3, 3, 4, 3, 3, 3, 3, 3, 3, ...
-llama_model_loader: - kv  28:                tokenizer.ggml.bos_token_id u32              = 2
-llama_model_loader: - kv  29:                tokenizer.ggml.eos_token_id u32              = 106
-llama_model_loader: - kv  30:            tokenizer.ggml.unknown_token_id u32              = 3
-llama_model_loader: - kv  31:            tokenizer.ggml.padding_token_id u32              = 0
-llama_model_loader: - kv  32:               tokenizer.ggml.add_bos_token bool             = true
-llama_model_loader: - kv  33:               tokenizer.ggml.add_eos_token bool             = false
-llama_model_loader: - kv  34:                    tokenizer.chat_template str              = {{ bos_token }}\n{%- if messages[0]['r...
-llama_model_loader: - kv  35:            tokenizer.ggml.add_space_prefix bool             = false
-llama_model_loader: - kv  36:                                   split.no u16              = 0
-llama_model_loader: - kv  37:                                split.count u16              = 2
-llama_model_loader: - kv  38:                        split.tensors.count i32              = 808
-llama_model_loader: - type  f32:  373 tensors
-llama_model_loader: - type bf16:  435 tensors
-print_info: file format = GGUF V3 (latest)
-print_info: file type   = BF16
-print_info: file size   = 50.31 GiB (16.00 BPW) 
-load: special tokens cache size = 6415
-load: token to piece cache size = 1.9446 MB
-print_info: arch             = gemma3
-print_info: vocab_only       = 0
-print_info: n_ctx_train      = 131072
-print_info: n_embd           = 5376
-print_info: n_layer          = 62
-print_info: n_head           = 32
-print_info: n_head_kv        = 16
-print_info: n_rot            = 128
-print_info: n_swa            = 1024
-print_info: is_swa_any       = 1
-print_info: n_embd_head_k    = 128
-print_info: n_embd_head_v    = 128
-print_info: n_gqa            = 2
-print_info: n_embd_k_gqa     = 2048
-print_info: n_embd_v_gqa     = 2048
-print_info: f_norm_eps       = 0.0e+00
-print_info: f_norm_rms_eps   = 1.0e-06
-print_info: f_clamp_kqv      = 0.0e+00
-print_info: f_max_alibi_bias = 0.0e+00
-print_info: f_logit_scale    = 0.0e+00
-print_info: f_attn_scale     = 7.7e-02
-print_info: n_ff             = 21504
-print_info: n_expert         = 0
-print_info: n_expert_used    = 0
-print_info: causal attn      = 1
-print_info: pooling type     = 0
-print_info: rope type        = 2
-print_info: rope scaling     = linear
-print_info: freq_base_train  = 1000000.0
-print_info: freq_scale_train = 0.125
-print_info: n_ctx_orig_yarn  = 131072
-print_info: rope_finetuned   = unknown
-print_info: model type       = 27B
-print_info: model params     = 27.01 B
-print_info: general.name     = Gemma-3-27B-It
-print_info: vocab type       = SPM
-print_info: n_vocab          = 262208
-print_info: n_merges         = 0
-print_info: BOS token        = 2 '<bos>'
-print_info: EOS token        = 106 '<end_of_turn>'
-print_info: EOT token        = 106 '<end_of_turn>'
-print_info: UNK token        = 3 '<unk>'
-print_info: PAD token        = 0 '<pad>'
-print_info: LF token         = 248 '<0x0A>'
-print_info: EOG token        = 106 '<end_of_turn>'
-print_info: max token length = 48
-load_tensors: loading model tensors, this can take a while... (mmap = false)
-load_tensors: offloading 62 repeating layers to GPU
-load_tensors: offloading output layer to GPU
-load_tensors: offloaded 63/63 layers to GPU
-load_tensors:        ROCm0 model buffer size = 51518.82 MiB
-load_tensors:    ROCm_Host model buffer size =  2688.66 MiB
-.............................................................................................
-llama_context: constructing llama_context
-llama_context: non-unified KV cache requires ggml_set_rows() - forcing unified KV cache
-llama_context: n_seq_max     = 1
-llama_context: n_ctx         = 4096
-llama_context: n_ctx_per_seq = 4096
-llama_context: n_batch       = 2048
-llama_context: n_ubatch      = 512
-llama_context: causal_attn   = 1
-llama_context: flash_attn    = 1
-llama_context: kv_unified    = true
-llama_context: freq_base     = 1000000.0
-llama_context: freq_scale    = 0.125
-llama_context: n_ctx_per_seq (4096) < n_ctx_train (131072) -- the full capacity of the model will not be utilized
-llama_context:  ROCm_Host  output buffer size =     1.00 MiB
-llama_kv_cache_unified_iswa: creating non-SWA KV cache, size = 4096 cells
-llama_kv_cache_unified:      ROCm0 KV buffer size =   320.00 MiB
-llama_kv_cache_unified: size =  320.00 MiB (  4096 cells,  10 layers,  1/ 1 seqs), K (f16):  160.00 MiB, V (f16):  160.00 MiB
-llama_kv_cache_unified: LLAMA_SET_ROWS=0, using old ggml_cpy() method for backwards compatibility
-llama_kv_cache_unified_iswa: creating     SWA KV cache, size = 1536 cells
-llama_kv_cache_unified:      ROCm0 KV buffer size =   624.00 MiB
-llama_kv_cache_unified: size =  624.00 MiB (  1536 cells,  52 layers,  1/ 1 seqs), K (f16):  312.00 MiB, V (f16):  312.00 MiB
-llama_kv_cache_unified: LLAMA_SET_ROWS=0, using old ggml_cpy() method for backwards compatibility
-llama_context:      ROCm0 compute buffer size =   522.62 MiB
-llama_context:  ROCm_Host compute buffer size =    11.01 MiB
-llama_context: graph nodes  = 2613
-llama_context: graph splits = 1
-common_init_from_params: KV cache shifting is not supported for this context, disabling KV cache shifting
-common_init_from_params: added <end_of_turn> logit bias = -inf
-common_init_from_params: setting dry_penalty_last_n to ctx_size = 4096
-common_init_from_params: warming up the model with an empty run - please wait ... (--no-warmup to disable)
-main: llama threadpool init, n_threads = 16
-
-system_info: n_threads = 16 (n_threads_batch = 16) / 32 | ROCm : NO_VMM = 1 | PEER_MAX_BATCH_SIZE = 128 | CPU : SSE3 = 1 | SSSE3 = 1 | AVX = 1 | AVX2 = 1 | F16C = 1 | FMA = 1 | BMI2 = 1 | AVX512 = 1 | AVX512_VBMI = 1 | AVX512_VNNI = 1 | LLAMAFILE = 1 | OPENMP = 1 | REPACK = 1 | 
-
-sampler seed: 1422263455
-sampler params: 
-	repeat_last_n = 64, repeat_penalty = 1.000, frequency_penalty = 0.000, presence_penalty = 0.000
-	dry_multiplier = 0.000, dry_base = 1.750, dry_allowed_length = 2, dry_penalty_last_n = 4096
-	top_k = 40, top_p = 0.950, min_p = 0.050, xtc_probability = 0.000, xtc_threshold = 0.100, typical_p = 1.000, top_n_sigma = -1.000, temp = 0.800
-	mirostat = 0, mirostat_lr = 0.100, mirostat_ent = 5.000
-sampler chain: logits -> logit-bias -> penalties -> dry -> top-n-sigma -> top-k -> typical -> top-p -> min-p -> xtc -> temp-ext -> dist 
-generate: n_ctx = 4096, n_batch = 2048, n_predict = 1, n_keep = 1
-
-Hello,
-
-llama_perf_sampler_print:    sampling time =       0.09 ms /     3 runs   (    0.03 ms per token, 35294.12 tokens per second)
-llama_perf_context_print:        load time =    9620.16 ms
-llama_perf_context_print: prompt eval time =     256.55 ms /     2 tokens (  128.27 ms per token,     7.80 tokens per second)
-llama_perf_context_print:        eval time =       0.00 ms /     1 runs   (    0.00 ms per token,      inf tokens per second)
-llama_perf_context_print:       total time =     261.63 ms /     3 tokens
-llama_perf_context_print:    graphs reused =          0
-    Elapsed #3: 10.587027979s
-    Run #3 status: 0
-  → Avg over 3 runs: 10.417s
@@ -1,113 +0,0 @@
-ggml_vulkan: Found 1 Vulkan devices:
-ggml_vulkan: 0 = Radeon 8060S Graphics (AMD open-source driver) | uma: 1 | fp16: 1 | bf16: 0 | warp size: 64 | shared memory: 32768 | int dot: 1 | matrix cores: KHR_coopmat
-build: 6060 (9c35706b) with cc (GCC) 15.1.1 20250719 (Red Hat 15.1.1-5) for x86_64-redhat-linux
-main: llama backend init
-main: load the model and apply lora adapter, if any
-llama_model_load_from_file_impl: using device Vulkan0 (Radeon 8060S Graphics) - 85720 MiB free
-llama_model_loader: additional 1 GGUFs metadata loaded.
-llama_model_loader: loaded meta data with 39 key-value pairs and 808 tensors from /home/kyuz0/models/gemma-3-27b-it-BF16/gemma-3-27b-it-BF16-00001-of-00002.gguf (version GGUF V3 (latest))
-llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
-llama_model_loader: - kv   0:                       general.architecture str              = gemma3
-llama_model_loader: - kv   1:                               general.type str              = model
-llama_model_loader: - kv   2:                               general.name str              = Gemma-3-27B-It
-llama_model_loader: - kv   3:                           general.finetune str              = it
-llama_model_loader: - kv   4:                           general.basename str              = Gemma-3-27B-It
-llama_model_loader: - kv   5:                       general.quantized_by str              = Unsloth
-llama_model_loader: - kv   6:                         general.size_label str              = 27B
-llama_model_loader: - kv   7:                           general.repo_url str              = https://huggingface.co/unsloth
-llama_model_loader: - kv   8:                      gemma3.context_length u32              = 131072
-llama_model_loader: - kv   9:                    gemma3.embedding_length u32              = 5376
-llama_model_loader: - kv  10:                         gemma3.block_count u32              = 62
-llama_model_loader: - kv  11:                 gemma3.feed_forward_length u32              = 21504
-llama_model_loader: - kv  12:                gemma3.attention.head_count u32              = 32
-llama_model_loader: - kv  13:    gemma3.attention.layer_norm_rms_epsilon f32              = 0.000001
-llama_model_loader: - kv  14:                gemma3.attention.key_length u32              = 128
-llama_model_loader: - kv  15:              gemma3.attention.value_length u32              = 128
-llama_model_loader: - kv  16:                          general.file_type u32              = 32
-llama_model_loader: - kv  17:                      gemma3.rope.freq_base f32              = 1000000.000000
-llama_model_loader: - kv  18:            gemma3.attention.sliding_window u32              = 1024
-llama_model_loader: - kv  19:             gemma3.attention.head_count_kv u32              = 16
-llama_model_loader: - kv  20:                   gemma3.rope.scaling.type str              = linear
-llama_model_loader: - kv  21:                 gemma3.rope.scaling.factor f32              = 8.000000
-llama_model_loader: - kv  22:               general.quantization_version u32              = 2
-llama_model_loader: - kv  23:                       tokenizer.ggml.model str              = llama
-llama_model_loader: - kv  24:                         tokenizer.ggml.pre str              = default
-llama_model_loader: - kv  25:                      tokenizer.ggml.tokens arr[str,262208]  = ["<pad>", "<eos>", "<bos>", "<unk>", ...
-llama_model_loader: - kv  26:                      tokenizer.ggml.scores arr[f32,262208]  = [-1000.000000, -1000.000000, -1000.00...
-llama_model_loader: - kv  27:                  tokenizer.ggml.token_type arr[i32,262208]  = [3, 3, 3, 3, 3, 4, 3, 3, 3, 3, 3, 3, ...
-llama_model_loader: - kv  28:                tokenizer.ggml.bos_token_id u32              = 2
-llama_model_loader: - kv  29:                tokenizer.ggml.eos_token_id u32              = 106
-llama_model_loader: - kv  30:            tokenizer.ggml.unknown_token_id u32              = 3
-llama_model_loader: - kv  31:            tokenizer.ggml.padding_token_id u32              = 0
-llama_model_loader: - kv  32:               tokenizer.ggml.add_bos_token bool             = true
-llama_model_loader: - kv  33:               tokenizer.ggml.add_eos_token bool             = false
-llama_model_loader: - kv  34:                    tokenizer.chat_template str              = {{ bos_token }}\n{%- if messages[0]['r...
-llama_model_loader: - kv  35:            tokenizer.ggml.add_space_prefix bool             = false
-llama_model_loader: - kv  36:                                   split.no u16              = 0
-llama_model_loader: - kv  37:                                split.count u16              = 2
-llama_model_loader: - kv  38:                        split.tensors.count i32              = 808
-llama_model_loader: - type  f32:  373 tensors
-llama_model_loader: - type bf16:  435 tensors
-print_info: file format = GGUF V3 (latest)
-print_info: file type   = BF16
-print_info: file size   = 50.31 GiB (16.00 BPW) 
-load: special tokens cache size = 6415
-load: token to piece cache size = 1.9446 MB
-print_info: arch             = gemma3
-print_info: vocab_only       = 0
-print_info: n_ctx_train      = 131072
-print_info: n_embd           = 5376
-print_info: n_layer          = 62
-print_info: n_head           = 32
-print_info: n_head_kv        = 16
-print_info: n_rot            = 128
-print_info: n_swa            = 1024
-print_info: is_swa_any       = 1
-print_info: n_embd_head_k    = 128
-print_info: n_embd_head_v    = 128
-print_info: n_gqa            = 2
-print_info: n_embd_k_gqa     = 2048
-print_info: n_embd_v_gqa     = 2048
-print_info: f_norm_eps       = 0.0e+00
-print_info: f_norm_rms_eps   = 1.0e-06
-print_info: f_clamp_kqv      = 0.0e+00
-print_info: f_max_alibi_bias = 0.0e+00
-print_info: f_logit_scale    = 0.0e+00
-print_info: f_attn_scale     = 7.7e-02
-print_info: n_ff             = 21504
-print_info: n_expert         = 0
-print_info: n_expert_used    = 0
-print_info: causal attn      = 1
-print_info: pooling type     = 0
-print_info: rope type        = 2
-print_info: rope scaling     = linear
-print_info: freq_base_train  = 1000000.0
-print_info: freq_scale_train = 0.125
-print_info: n_ctx_orig_yarn  = 131072
-print_info: rope_finetuned   = unknown
-print_info: model type       = 27B
-print_info: model params     = 27.01 B
-print_info: general.name     = Gemma-3-27B-It
-print_info: vocab type       = SPM
-print_info: n_vocab          = 262208
-print_info: n_merges         = 0
-print_info: BOS token        = 2 '<bos>'
-print_info: EOS token        = 106 '<end_of_turn>'
-print_info: EOT token        = 106 '<end_of_turn>'
-print_info: UNK token        = 3 '<unk>'
-print_info: PAD token        = 0 '<pad>'
-print_info: LF token         = 248 '<0x0A>'
-print_info: EOG token        = 106 '<end_of_turn>'
-print_info: max token length = 48
-load_tensors: loading model tensors, this can take a while... (mmap = false)
-ggml_vulkan: Device memory allocation of size 2819260416 failed.
-ggml_vulkan: Requested buffer size exceeds device memory allocation limit: ErrorOutOfDeviceMemory
-alloc_tensor_range: failed to allocate Vulkan0 buffer of size 2819260416
-llama_model_load: error loading model: unable to allocate Vulkan0 buffer
-llama_model_load_from_file_impl: failed to load model
-common_init_from_params: failed to load model '/home/kyuz0/models/gemma-3-27b-it-BF16/gemma-3-27b-it-BF16-00001-of-00002.gguf'
-main: error: unable to load model
-    Elapsed #3: .416644024s
-    Run #3 status: 1
-    ✖ run #3 failed
-  → No successful runs
@@ -1,162 +0,0 @@
-ggml_vulkan: Found 1 Vulkan devices:
-ggml_vulkan: 0 = Radeon 8060S Graphics (RADV GFX1151) (radv) | uma: 1 | fp16: 1 | bf16: 0 | warp size: 64 | shared memory: 65536 | int dot: 1 | matrix cores: KHR_coopmat
-build: 6040 (66625a59) with cc (GCC) 15.1.1 20250719 (Red Hat 15.1.1-5) for x86_64-redhat-linux
-main: llama backend init
-main: load the model and apply lora adapter, if any
-llama_model_load_from_file_impl: using device Vulkan0 (Radeon 8060S Graphics (RADV GFX1151)) - 87722 MiB free
-llama_model_loader: additional 1 GGUFs metadata loaded.
-llama_model_loader: loaded meta data with 39 key-value pairs and 808 tensors from /home/kyuz0/models/gemma-3-27b-it-BF16/gemma-3-27b-it-BF16-00001-of-00002.gguf (version GGUF V3 (latest))
-llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
-llama_model_loader: - kv   0:                       general.architecture str              = gemma3
-llama_model_loader: - kv   1:                               general.type str              = model
-llama_model_loader: - kv   2:                               general.name str              = Gemma-3-27B-It
-llama_model_loader: - kv   3:                           general.finetune str              = it
-llama_model_loader: - kv   4:                           general.basename str              = Gemma-3-27B-It
-llama_model_loader: - kv   5:                       general.quantized_by str              = Unsloth
-llama_model_loader: - kv   6:                         general.size_label str              = 27B
-llama_model_loader: - kv   7:                           general.repo_url str              = https://huggingface.co/unsloth
-llama_model_loader: - kv   8:                      gemma3.context_length u32              = 131072
-llama_model_loader: - kv   9:                    gemma3.embedding_length u32              = 5376
-llama_model_loader: - kv  10:                         gemma3.block_count u32              = 62
-llama_model_loader: - kv  11:                 gemma3.feed_forward_length u32              = 21504
-llama_model_loader: - kv  12:                gemma3.attention.head_count u32              = 32
-llama_model_loader: - kv  13:    gemma3.attention.layer_norm_rms_epsilon f32              = 0.000001
-llama_model_loader: - kv  14:                gemma3.attention.key_length u32              = 128
-llama_model_loader: - kv  15:              gemma3.attention.value_length u32              = 128
-llama_model_loader: - kv  16:                          general.file_type u32              = 32
-llama_model_loader: - kv  17:                      gemma3.rope.freq_base f32              = 1000000.000000
-llama_model_loader: - kv  18:            gemma3.attention.sliding_window u32              = 1024
-llama_model_loader: - kv  19:             gemma3.attention.head_count_kv u32              = 16
-llama_model_loader: - kv  20:                   gemma3.rope.scaling.type str              = linear
-llama_model_loader: - kv  21:                 gemma3.rope.scaling.factor f32              = 8.000000
-llama_model_loader: - kv  22:               general.quantization_version u32              = 2
-llama_model_loader: - kv  23:                       tokenizer.ggml.model str              = llama
-llama_model_loader: - kv  24:                         tokenizer.ggml.pre str              = default
-llama_model_loader: - kv  25:                      tokenizer.ggml.tokens arr[str,262208]  = ["<pad>", "<eos>", "<bos>", "<unk>", ...
-llama_model_loader: - kv  26:                      tokenizer.ggml.scores arr[f32,262208]  = [-1000.000000, -1000.000000, -1000.00...
-llama_model_loader: - kv  27:                  tokenizer.ggml.token_type arr[i32,262208]  = [3, 3, 3, 3, 3, 4, 3, 3, 3, 3, 3, 3, ...
-llama_model_loader: - kv  28:                tokenizer.ggml.bos_token_id u32              = 2
-llama_model_loader: - kv  29:                tokenizer.ggml.eos_token_id u32              = 106
-llama_model_loader: - kv  30:            tokenizer.ggml.unknown_token_id u32              = 3
-llama_model_loader: - kv  31:            tokenizer.ggml.padding_token_id u32              = 0
-llama_model_loader: - kv  32:               tokenizer.ggml.add_bos_token bool             = true
-llama_model_loader: - kv  33:               tokenizer.ggml.add_eos_token bool             = false
-llama_model_loader: - kv  34:                    tokenizer.chat_template str              = {{ bos_token }}\n{%- if messages[0]['r...
-llama_model_loader: - kv  35:            tokenizer.ggml.add_space_prefix bool             = false
-llama_model_loader: - kv  36:                                   split.no u16              = 0
-llama_model_loader: - kv  37:                                split.count u16              = 2
-llama_model_loader: - kv  38:                        split.tensors.count i32              = 808
-llama_model_loader: - type  f32:  373 tensors
-llama_model_loader: - type bf16:  435 tensors
-print_info: file format = GGUF V3 (latest)
-print_info: file type   = BF16
-print_info: file size   = 50.31 GiB (16.00 BPW) 
-load: special tokens cache size = 6415
-load: token to piece cache size = 1.9446 MB
-print_info: arch             = gemma3
-print_info: vocab_only       = 0
-print_info: n_ctx_train      = 131072
-print_info: n_embd           = 5376
-print_info: n_layer          = 62
-print_info: n_head           = 32
-print_info: n_head_kv        = 16
-print_info: n_rot            = 128
-print_info: n_swa            = 1024
-print_info: is_swa_any       = 1
-print_info: n_embd_head_k    = 128
-print_info: n_embd_head_v    = 128
-print_info: n_gqa            = 2
-print_info: n_embd_k_gqa     = 2048
-print_info: n_embd_v_gqa     = 2048
-print_info: f_norm_eps       = 0.0e+00
-print_info: f_norm_rms_eps   = 1.0e-06
-print_info: f_clamp_kqv      = 0.0e+00
-print_info: f_max_alibi_bias = 0.0e+00
-print_info: f_logit_scale    = 0.0e+00
-print_info: f_attn_scale     = 7.7e-02
-print_info: n_ff             = 21504
-print_info: n_expert         = 0
-print_info: n_expert_used    = 0
-print_info: causal attn      = 1
-print_info: pooling type     = 0
-print_info: rope type        = 2
-print_info: rope scaling     = linear
-print_info: freq_base_train  = 1000000.0
-print_info: freq_scale_train = 0.125
-print_info: n_ctx_orig_yarn  = 131072
-print_info: rope_finetuned   = unknown
-print_info: model type       = 27B
-print_info: model params     = 27.01 B
-print_info: general.name     = Gemma-3-27B-It
-print_info: vocab type       = SPM
-print_info: n_vocab          = 262208
-print_info: n_merges         = 0
-print_info: BOS token        = 2 '<bos>'
-print_info: EOS token        = 106 '<end_of_turn>'
-print_info: EOT token        = 106 '<end_of_turn>'
-print_info: UNK token        = 3 '<unk>'
-print_info: PAD token        = 0 '<pad>'
-print_info: LF token         = 248 '<0x0A>'
-print_info: EOG token        = 106 '<end_of_turn>'
-print_info: max token length = 48
-load_tensors: loading model tensors, this can take a while... (mmap = false)
-load_tensors: offloading 62 repeating layers to GPU
-load_tensors: offloading output layer to GPU
-load_tensors: offloaded 63/63 layers to GPU
-load_tensors:      Vulkan0 model buffer size = 51518.82 MiB
-load_tensors:  Vulkan_Host model buffer size =  2688.66 MiB
-.............................................................................................
-llama_context: constructing llama_context
-llama_context: non-unified KV cache requires ggml_set_rows() - forcing unified KV cache
-llama_context: n_seq_max     = 1
-llama_context: n_ctx         = 4096
-llama_context: n_ctx_per_seq = 4096
-llama_context: n_batch       = 2048
-llama_context: n_ubatch      = 512
-llama_context: causal_attn   = 1
-llama_context: flash_attn    = 1
-llama_context: kv_unified    = true
-llama_context: freq_base     = 1000000.0
-llama_context: freq_scale    = 0.125
-llama_context: n_ctx_per_seq (4096) < n_ctx_train (131072) -- the full capacity of the model will not be utilized
-llama_context: Vulkan_Host  output buffer size =     1.00 MiB
-llama_kv_cache_unified_iswa: creating non-SWA KV cache, size = 4096 cells
-llama_kv_cache_unified:    Vulkan0 KV buffer size =   320.00 MiB
-llama_kv_cache_unified: size =  320.00 MiB (  4096 cells,  10 layers,  1/ 1 seqs), K (f16):  160.00 MiB, V (f16):  160.00 MiB
-llama_kv_cache_unified: LLAMA_SET_ROWS=0, using old ggml_cpy() method for backwards compatibility
-llama_kv_cache_unified_iswa: creating     SWA KV cache, size = 1536 cells
-llama_kv_cache_unified:    Vulkan0 KV buffer size =   624.00 MiB
-llama_kv_cache_unified: size =  624.00 MiB (  1536 cells,  52 layers,  1/ 1 seqs), K (f16):  312.00 MiB, V (f16):  312.00 MiB
-llama_kv_cache_unified: LLAMA_SET_ROWS=0, using old ggml_cpy() method for backwards compatibility
-llama_context:    Vulkan0 compute buffer size =   522.62 MiB
-llama_context: Vulkan_Host compute buffer size =    21.51 MiB
-llama_context: graph nodes  = 2613
-llama_context: graph splits = 2
-common_init_from_params: KV cache shifting is not supported for this context, disabling KV cache shifting
-common_init_from_params: added <end_of_turn> logit bias = -inf
-common_init_from_params: setting dry_penalty_last_n to ctx_size = 4096
-common_init_from_params: warming up the model with an empty run - please wait ... (--no-warmup to disable)
-main: llama threadpool init, n_threads = 16
-
-system_info: n_threads = 16 (n_threads_batch = 16) / 32 | CPU : SSE3 = 1 | SSSE3 = 1 | AVX = 1 | AVX2 = 1 | F16C = 1 | FMA = 1 | BMI2 = 1 | AVX512 = 1 | AVX512_VBMI = 1 | AVX512_VNNI = 1 | LLAMAFILE = 1 | OPENMP = 1 | REPACK = 1 | 
-
-sampler seed: 4215263583
-sampler params: 
-	repeat_last_n = 64, repeat_penalty = 1.000, frequency_penalty = 0.000, presence_penalty = 0.000
-	dry_multiplier = 0.000, dry_base = 1.750, dry_allowed_length = 2, dry_penalty_last_n = 4096
-	top_k = 40, top_p = 0.950, min_p = 0.050, xtc_probability = 0.000, xtc_threshold = 0.100, typical_p = 1.000, top_n_sigma = -1.000, temp = 0.800
-	mirostat = 0, mirostat_lr = 0.100, mirostat_ent = 5.000
-sampler chain: logits -> logit-bias -> penalties -> dry -> top-n-sigma -> top-k -> typical -> top-p -> min-p -> xtc -> temp-ext -> dist 
-generate: n_ctx = 4096, n_batch = 2048, n_predict = 1, n_keep = 1
-
-Hello,
-
-llama_perf_sampler_print:    sampling time =       0.18 ms /     3 runs   (    0.06 ms per token, 16666.67 tokens per second)
-llama_perf_context_print:        load time =   14451.51 ms
-llama_perf_context_print: prompt eval time =     257.32 ms /     2 tokens (  128.66 ms per token,     7.77 tokens per second)
-llama_perf_context_print:        eval time =       0.00 ms /     1 runs   (    0.00 ms per token,      inf tokens per second)
-llama_perf_context_print:       total time =     265.56 ms /     3 tokens
-llama_perf_context_print:    graphs reused =          0
-    Elapsed #3: 15.024330058s
-    Run #3 status: 0
-  → Avg over 3 runs: 13.579s
@@ -1,159 +0,0 @@
-ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
-ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
-ggml_cuda_init: found 1 ROCm devices:
-  Device 0: Radeon 8060S Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32
-build: 6040 (66625a59) with cc (GCC) 15.1.1 20250521 (Red Hat 15.1.1-2) for x86_64-redhat-linux
-main: llama backend init
-main: load the model and apply lora adapter, if any
-llama_model_load_from_file_impl: using device ROCm0 (Radeon 8060S Graphics) - 124522 MiB free
-llama_model_loader: loaded meta data with 36 key-value pairs and 724 tensors from /home/kyuz0/models/llama-3.3-Q4_K_M/llama3.3-70.6B-Q4_K_M.gguf (version GGUF V3 (latest))
-llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
-llama_model_loader: - kv   0:                       general.architecture str              = llama
-llama_model_loader: - kv   1:                               general.type str              = model
-llama_model_loader: - kv   2:                               general.name str              = Llama 3.1 70B Instruct 2024 12
-llama_model_loader: - kv   3:                            general.version str              = 2024-12
-llama_model_loader: - kv   4:                           general.finetune str              = Instruct
-llama_model_loader: - kv   5:                           general.basename str              = Llama-3.1
-llama_model_loader: - kv   6:                         general.size_label str              = 70B
-llama_model_loader: - kv   7:                            general.license str              = llama3.1
-llama_model_loader: - kv   8:                   general.base_model.count u32              = 1
-llama_model_loader: - kv   9:                  general.base_model.0.name str              = Llama 3.1 70B
-llama_model_loader: - kv  10:          general.base_model.0.organization str              = Meta Llama
-llama_model_loader: - kv  11:              general.base_model.0.repo_url str              = https://huggingface.co/meta-llama/Lla...
-llama_model_loader: - kv  12:                               general.tags arr[str,5]       = ["facebook", "meta", "pytorch", "llam...
-llama_model_loader: - kv  13:                          general.languages arr[str,7]       = ["fr", "it", "pt", "hi", "es", "th", ...
-llama_model_loader: - kv  14:                          llama.block_count u32              = 80
-llama_model_loader: - kv  15:                       llama.context_length u32              = 131072
-llama_model_loader: - kv  16:                     llama.embedding_length u32              = 8192
-llama_model_loader: - kv  17:                  llama.feed_forward_length u32              = 28672
-llama_model_loader: - kv  18:                 llama.attention.head_count u32              = 64
-llama_model_loader: - kv  19:              llama.attention.head_count_kv u32              = 8
-llama_model_loader: - kv  20:                       llama.rope.freq_base f32              = 500000.000000
-llama_model_loader: - kv  21:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
-llama_model_loader: - kv  22:                 llama.attention.key_length u32              = 128
-llama_model_loader: - kv  23:               llama.attention.value_length u32              = 128
-llama_model_loader: - kv  24:                          general.file_type u32              = 15
-llama_model_loader: - kv  25:                           llama.vocab_size u32              = 128256
-llama_model_loader: - kv  26:                 llama.rope.dimension_count u32              = 128
-llama_model_loader: - kv  27:                       tokenizer.ggml.model str              = gpt2
-llama_model_loader: - kv  28:                         tokenizer.ggml.pre str              = llama-bpe
-llama_model_loader: - kv  29:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
-llama_model_loader: - kv  30:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
-llama_model_loader: - kv  31:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
-llama_model_loader: - kv  32:                tokenizer.ggml.bos_token_id u32              = 128000
-llama_model_loader: - kv  33:                tokenizer.ggml.eos_token_id u32              = 128009
-llama_model_loader: - kv  34:                    tokenizer.chat_template str              = {{- bos_token }}\n{%- if custom_tools ...
-llama_model_loader: - kv  35:               general.quantization_version u32              = 2
-llama_model_loader: - type  f32:  162 tensors
-llama_model_loader: - type q4_K:  441 tensors
-llama_model_loader: - type q5_K:   40 tensors
-llama_model_loader: - type q6_K:   81 tensors
-print_info: file format = GGUF V3 (latest)
-print_info: file type   = Q4_K - Medium
-print_info: file size   = 39.59 GiB (4.82 BPW) 
-load: special tokens cache size = 256
-load: token to piece cache size = 0.7999 MB
-print_info: arch             = llama
-print_info: vocab_only       = 0
-print_info: n_ctx_train      = 131072
-print_info: n_embd           = 8192
-print_info: n_layer          = 80
-print_info: n_head           = 64
-print_info: n_head_kv        = 8
-print_info: n_rot            = 128
-print_info: n_swa            = 0
-print_info: is_swa_any       = 0
-print_info: n_embd_head_k    = 128
-print_info: n_embd_head_v    = 128
-print_info: n_gqa            = 8
-print_info: n_embd_k_gqa     = 1024
-print_info: n_embd_v_gqa     = 1024
-print_info: f_norm_eps       = 0.0e+00
-print_info: f_norm_rms_eps   = 1.0e-05
-print_info: f_clamp_kqv      = 0.0e+00
-print_info: f_max_alibi_bias = 0.0e+00
-print_info: f_logit_scale    = 0.0e+00
-print_info: f_attn_scale     = 0.0e+00
-print_info: n_ff             = 28672
-print_info: n_expert         = 0
-print_info: n_expert_used    = 0
-print_info: causal attn      = 1
-print_info: pooling type     = 0
-print_info: rope type        = 0
-print_info: rope scaling     = linear
-print_info: freq_base_train  = 500000.0
-print_info: freq_scale_train = 1
-print_info: n_ctx_orig_yarn  = 131072
-print_info: rope_finetuned   = unknown
-print_info: model type       = 70B
-print_info: model params     = 70.55 B
-print_info: general.name     = Llama 3.1 70B Instruct 2024 12
-print_info: vocab type       = BPE
-print_info: n_vocab          = 128256
-print_info: n_merges         = 280147
-print_info: BOS token        = 128000 '<|begin_of_text|>'
-print_info: EOS token        = 128009 '<|eot_id|>'
-print_info: EOT token        = 128009 '<|eot_id|>'
-print_info: EOM token        = 128008 '<|eom_id|>'
-print_info: LF token         = 198 'Ċ'
-print_info: EOG token        = 128001 '<|end_of_text|>'
-print_info: EOG token        = 128008 '<|eom_id|>'
-print_info: EOG token        = 128009 '<|eot_id|>'
-print_info: max token length = 256
-load_tensors: loading model tensors, this can take a while... (mmap = false)
-load_tensors: offloading 80 repeating layers to GPU
-load_tensors: offloading output layer to GPU
-load_tensors: offloaded 81/81 layers to GPU
-load_tensors:          CPU model buffer size =   563.62 MiB
-load_tensors:        ROCm0 model buffer size = 39979.48 MiB
-...................................................................................................
-llama_context: constructing llama_context
-llama_context: non-unified KV cache requires ggml_set_rows() - forcing unified KV cache
-llama_context: n_seq_max     = 1
-llama_context: n_ctx         = 4096
-llama_context: n_ctx_per_seq = 4096
-llama_context: n_batch       = 2048
-llama_context: n_ubatch      = 512
-llama_context: causal_attn   = 1
-llama_context: flash_attn    = 1
-llama_context: kv_unified    = true
-llama_context: freq_base     = 500000.0
-llama_context: freq_scale    = 1
-llama_context: n_ctx_per_seq (4096) < n_ctx_train (131072) -- the full capacity of the model will not be utilized
-llama_context:  ROCm_Host  output buffer size =     0.49 MiB
-llama_kv_cache_unified:      ROCm0 KV buffer size =  1280.00 MiB
-llama_kv_cache_unified: size = 1280.00 MiB (  4096 cells,  80 layers,  1/ 1 seqs), K (f16):  640.00 MiB, V (f16):  640.00 MiB
-llama_kv_cache_unified: LLAMA_SET_ROWS=0, using old ggml_cpy() method for backwards compatibility
-llama_context:      ROCm0 compute buffer size =   266.50 MiB
-llama_context:  ROCm_Host compute buffer size =    24.01 MiB
-llama_context: graph nodes  = 2647
-llama_context: graph splits = 2
-common_init_from_params: added <|end_of_text|> logit bias = -inf
-common_init_from_params: added <|eom_id|> logit bias = -inf
-common_init_from_params: added <|eot_id|> logit bias = -inf
-common_init_from_params: setting dry_penalty_last_n to ctx_size = 4096
-common_init_from_params: warming up the model with an empty run - please wait ... (--no-warmup to disable)
-main: llama threadpool init, n_threads = 16
-
-system_info: n_threads = 16 (n_threads_batch = 16) / 32 | ROCm : NO_VMM = 1 | PEER_MAX_BATCH_SIZE = 128 | CPU : SSE3 = 1 | SSSE3 = 1 | AVX = 1 | AVX2 = 1 | F16C = 1 | FMA = 1 | BMI2 = 1 | AVX512 = 1 | AVX512_VBMI = 1 | AVX512_VNNI = 1 | LLAMAFILE = 1 | OPENMP = 1 | REPACK = 1 | 
-
-sampler seed: 1295757489
-sampler params: 
-	repeat_last_n = 64, repeat_penalty = 1.000, frequency_penalty = 0.000, presence_penalty = 0.000
-	dry_multiplier = 0.000, dry_base = 1.750, dry_allowed_length = 2, dry_penalty_last_n = 4096
-	top_k = 40, top_p = 0.950, min_p = 0.050, xtc_probability = 0.000, xtc_threshold = 0.100, typical_p = 1.000, top_n_sigma = -1.000, temp = 0.800
-	mirostat = 0, mirostat_lr = 0.100, mirostat_ent = 5.000
-sampler chain: logits -> logit-bias -> penalties -> dry -> top-n-sigma -> top-k -> typical -> top-p -> min-p -> xtc -> temp-ext -> dist 
-generate: n_ctx = 4096, n_batch = 2048, n_predict = 1, n_keep = 1
-
-Hello,
-
-llama_perf_sampler_print:    sampling time =       0.05 ms /     3 runs   (    0.02 ms per token, 61224.49 tokens per second)
-llama_perf_context_print:        load time =    5592.62 ms
-llama_perf_context_print: prompt eval time =     248.28 ms /     2 tokens (  124.14 ms per token,     8.06 tokens per second)
-llama_perf_context_print:        eval time =       0.00 ms /     1 runs   (    0.00 ms per token,      inf tokens per second)
-llama_perf_context_print:       total time =     263.25 ms /     3 tokens
-llama_perf_context_print:    graphs reused =          0
-    Elapsed #3: 9.635053314s
-    Run #3 status: 0
-  → Avg over 3 runs: 9.887s
@@ -1,159 +0,0 @@
-ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
-ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
-ggml_cuda_init: found 1 ROCm devices:
-  Device 0: AMD Radeon Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32
-build: 6040 (66625a59) with cc (GCC) 15.1.1 20250719 (Red Hat 15.1.1-5) for x86_64-redhat-linux
-main: llama backend init
-main: load the model and apply lora adapter, if any
-llama_model_load_from_file_impl: using device ROCm0 (AMD Radeon Graphics) - 124523 MiB free
-llama_model_loader: loaded meta data with 36 key-value pairs and 724 tensors from /home/kyuz0/models/llama-3.3-Q4_K_M/llama3.3-70.6B-Q4_K_M.gguf (version GGUF V3 (latest))
-llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
-llama_model_loader: - kv   0:                       general.architecture str              = llama
-llama_model_loader: - kv   1:                               general.type str              = model
-llama_model_loader: - kv   2:                               general.name str              = Llama 3.1 70B Instruct 2024 12
-llama_model_loader: - kv   3:                            general.version str              = 2024-12
-llama_model_loader: - kv   4:                           general.finetune str              = Instruct
-llama_model_loader: - kv   5:                           general.basename str              = Llama-3.1
-llama_model_loader: - kv   6:                         general.size_label str              = 70B
-llama_model_loader: - kv   7:                            general.license str              = llama3.1
-llama_model_loader: - kv   8:                   general.base_model.count u32              = 1
-llama_model_loader: - kv   9:                  general.base_model.0.name str              = Llama 3.1 70B
-llama_model_loader: - kv  10:          general.base_model.0.organization str              = Meta Llama
-llama_model_loader: - kv  11:              general.base_model.0.repo_url str              = https://huggingface.co/meta-llama/Lla...
-llama_model_loader: - kv  12:                               general.tags arr[str,5]       = ["facebook", "meta", "pytorch", "llam...
-llama_model_loader: - kv  13:                          general.languages arr[str,7]       = ["fr", "it", "pt", "hi", "es", "th", ...
-llama_model_loader: - kv  14:                          llama.block_count u32              = 80
-llama_model_loader: - kv  15:                       llama.context_length u32              = 131072
-llama_model_loader: - kv  16:                     llama.embedding_length u32              = 8192
-llama_model_loader: - kv  17:                  llama.feed_forward_length u32              = 28672
-llama_model_loader: - kv  18:                 llama.attention.head_count u32              = 64
-llama_model_loader: - kv  19:              llama.attention.head_count_kv u32              = 8
-llama_model_loader: - kv  20:                       llama.rope.freq_base f32              = 500000.000000
-llama_model_loader: - kv  21:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
-llama_model_loader: - kv  22:                 llama.attention.key_length u32              = 128
-llama_model_loader: - kv  23:               llama.attention.value_length u32              = 128
-llama_model_loader: - kv  24:                          general.file_type u32              = 15
-llama_model_loader: - kv  25:                           llama.vocab_size u32              = 128256
-llama_model_loader: - kv  26:                 llama.rope.dimension_count u32              = 128
-llama_model_loader: - kv  27:                       tokenizer.ggml.model str              = gpt2
-llama_model_loader: - kv  28:                         tokenizer.ggml.pre str              = llama-bpe
-llama_model_loader: - kv  29:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
-llama_model_loader: - kv  30:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
-llama_model_loader: - kv  31:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
-llama_model_loader: - kv  32:                tokenizer.ggml.bos_token_id u32              = 128000
-llama_model_loader: - kv  33:                tokenizer.ggml.eos_token_id u32              = 128009
-llama_model_loader: - kv  34:                    tokenizer.chat_template str              = {{- bos_token }}\n{%- if custom_tools ...
-llama_model_loader: - kv  35:               general.quantization_version u32              = 2
-llama_model_loader: - type  f32:  162 tensors
-llama_model_loader: - type q4_K:  441 tensors
-llama_model_loader: - type q5_K:   40 tensors
-llama_model_loader: - type q6_K:   81 tensors
-print_info: file format = GGUF V3 (latest)
-print_info: file type   = Q4_K - Medium
-print_info: file size   = 39.59 GiB (4.82 BPW) 
-load: special tokens cache size = 256
-load: token to piece cache size = 0.7999 MB
-print_info: arch             = llama
-print_info: vocab_only       = 0
-print_info: n_ctx_train      = 131072
-print_info: n_embd           = 8192
-print_info: n_layer          = 80
-print_info: n_head           = 64
-print_info: n_head_kv        = 8
-print_info: n_rot            = 128
-print_info: n_swa            = 0
-print_info: is_swa_any       = 0
-print_info: n_embd_head_k    = 128
-print_info: n_embd_head_v    = 128
-print_info: n_gqa            = 8
-print_info: n_embd_k_gqa     = 1024
-print_info: n_embd_v_gqa     = 1024
-print_info: f_norm_eps       = 0.0e+00
-print_info: f_norm_rms_eps   = 1.0e-05
-print_info: f_clamp_kqv      = 0.0e+00
-print_info: f_max_alibi_bias = 0.0e+00
-print_info: f_logit_scale    = 0.0e+00
-print_info: f_attn_scale     = 0.0e+00
-print_info: n_ff             = 28672
-print_info: n_expert         = 0
-print_info: n_expert_used    = 0
-print_info: causal attn      = 1
-print_info: pooling type     = 0
-print_info: rope type        = 0
-print_info: rope scaling     = linear
-print_info: freq_base_train  = 500000.0
-print_info: freq_scale_train = 1
-print_info: n_ctx_orig_yarn  = 131072
-print_info: rope_finetuned   = unknown
-print_info: model type       = 70B
-print_info: model params     = 70.55 B
-print_info: general.name     = Llama 3.1 70B Instruct 2024 12
-print_info: vocab type       = BPE
-print_info: n_vocab          = 128256
-print_info: n_merges         = 280147
-print_info: BOS token        = 128000 '<|begin_of_text|>'
-print_info: EOS token        = 128009 '<|eot_id|>'
-print_info: EOT token        = 128009 '<|eot_id|>'
-print_info: EOM token        = 128008 '<|eom_id|>'
-print_info: LF token         = 198 'Ċ'
-print_info: EOG token        = 128001 '<|end_of_text|>'
-print_info: EOG token        = 128008 '<|eom_id|>'
-print_info: EOG token        = 128009 '<|eot_id|>'
-print_info: max token length = 256
-load_tensors: loading model tensors, this can take a while... (mmap = false)
-load_tensors: offloading 80 repeating layers to GPU
-load_tensors: offloading output layer to GPU
-load_tensors: offloaded 81/81 layers to GPU
-load_tensors:          CPU model buffer size =   563.62 MiB
-load_tensors:        ROCm0 model buffer size = 39979.48 MiB
-...................................................................................................
-llama_context: constructing llama_context
-llama_context: non-unified KV cache requires ggml_set_rows() - forcing unified KV cache
-llama_context: n_seq_max     = 1
-llama_context: n_ctx         = 4096
-llama_context: n_ctx_per_seq = 4096
-llama_context: n_batch       = 2048
-llama_context: n_ubatch      = 512
-llama_context: causal_attn   = 1
-llama_context: flash_attn    = 1
-llama_context: kv_unified    = true
-llama_context: freq_base     = 500000.0
-llama_context: freq_scale    = 1
-llama_context: n_ctx_per_seq (4096) < n_ctx_train (131072) -- the full capacity of the model will not be utilized
-llama_context:  ROCm_Host  output buffer size =     0.49 MiB
-llama_kv_cache_unified:      ROCm0 KV buffer size =  1280.00 MiB
-llama_kv_cache_unified: size = 1280.00 MiB (  4096 cells,  80 layers,  1/ 1 seqs), K (f16):  640.00 MiB, V (f16):  640.00 MiB
-llama_kv_cache_unified: LLAMA_SET_ROWS=0, using old ggml_cpy() method for backwards compatibility
-llama_context:      ROCm0 compute buffer size =   266.50 MiB
-llama_context:  ROCm_Host compute buffer size =    24.01 MiB
-llama_context: graph nodes  = 2647
-llama_context: graph splits = 2
-common_init_from_params: added <|end_of_text|> logit bias = -inf
-common_init_from_params: added <|eom_id|> logit bias = -inf
-common_init_from_params: added <|eot_id|> logit bias = -inf
-common_init_from_params: setting dry_penalty_last_n to ctx_size = 4096
-common_init_from_params: warming up the model with an empty run - please wait ... (--no-warmup to disable)
-main: llama threadpool init, n_threads = 16
-
-system_info: n_threads = 16 (n_threads_batch = 16) / 32 | ROCm : NO_VMM = 1 | PEER_MAX_BATCH_SIZE = 128 | CPU : SSE3 = 1 | SSSE3 = 1 | AVX = 1 | AVX2 = 1 | F16C = 1 | FMA = 1 | BMI2 = 1 | AVX512 = 1 | AVX512_VBMI = 1 | AVX512_VNNI = 1 | LLAMAFILE = 1 | OPENMP = 1 | REPACK = 1 | 
-
-sampler seed: 3791928713
-sampler params: 
-	repeat_last_n = 64, repeat_penalty = 1.000, frequency_penalty = 0.000, presence_penalty = 0.000
-	dry_multiplier = 0.000, dry_base = 1.750, dry_allowed_length = 2, dry_penalty_last_n = 4096
-	top_k = 40, top_p = 0.950, min_p = 0.050, xtc_probability = 0.000, xtc_threshold = 0.100, typical_p = 1.000, top_n_sigma = -1.000, temp = 0.800
-	mirostat = 0, mirostat_lr = 0.100, mirostat_ent = 5.000
-sampler chain: logits -> logit-bias -> penalties -> dry -> top-n-sigma -> top-k -> typical -> top-p -> min-p -> xtc -> temp-ext -> dist 
-generate: n_ctx = 4096, n_batch = 2048, n_predict = 1, n_keep = 1
-
-Hello.
-
-llama_perf_sampler_print:    sampling time =       0.05 ms /     3 runs   (    0.02 ms per token, 57692.31 tokens per second)
-llama_perf_context_print:        load time =    6133.42 ms
-llama_perf_context_print: prompt eval time =     247.67 ms /     2 tokens (  123.83 ms per token,     8.08 tokens per second)
-llama_perf_context_print:        eval time =       0.00 ms /     1 runs   (    0.00 ms per token,      inf tokens per second)
-llama_perf_context_print:       total time =     268.37 ms /     3 tokens
-llama_perf_context_print:    graphs reused =          0
-    Elapsed #3: 6.904239282s
-    Run #3 status: 0
-  → Avg over 3 runs: 9.338s
@@ -1,159 +0,0 @@
-ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
-ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
-ggml_cuda_init: found 1 ROCm devices:
-  Device 0: AMD Radeon Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32
-build: 6066 (4cb208c9) with cc (GCC) 15.1.1 20250719 (Red Hat 15.1.1-5) for x86_64-redhat-linux
-main: llama backend init
-main: load the model and apply lora adapter, if any
-llama_model_load_from_file_impl: using device ROCm0 (AMD Radeon Graphics) - 124523 MiB free
-llama_model_loader: loaded meta data with 36 key-value pairs and 724 tensors from /home/kyuz0/models/llama-3.3-Q4_K_M/llama3.3-70.6B-Q4_K_M.gguf (version GGUF V3 (latest))
-llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
-llama_model_loader: - kv   0:                       general.architecture str              = llama
-llama_model_loader: - kv   1:                               general.type str              = model
-llama_model_loader: - kv   2:                               general.name str              = Llama 3.1 70B Instruct 2024 12
-llama_model_loader: - kv   3:                            general.version str              = 2024-12
-llama_model_loader: - kv   4:                           general.finetune str              = Instruct
-llama_model_loader: - kv   5:                           general.basename str              = Llama-3.1
-llama_model_loader: - kv   6:                         general.size_label str              = 70B
-llama_model_loader: - kv   7:                            general.license str              = llama3.1
-llama_model_loader: - kv   8:                   general.base_model.count u32              = 1
-llama_model_loader: - kv   9:                  general.base_model.0.name str              = Llama 3.1 70B
-llama_model_loader: - kv  10:          general.base_model.0.organization str              = Meta Llama
-llama_model_loader: - kv  11:              general.base_model.0.repo_url str              = https://huggingface.co/meta-llama/Lla...
-llama_model_loader: - kv  12:                               general.tags arr[str,5]       = ["facebook", "meta", "pytorch", "llam...
-llama_model_loader: - kv  13:                          general.languages arr[str,7]       = ["fr", "it", "pt", "hi", "es", "th", ...
-llama_model_loader: - kv  14:                          llama.block_count u32              = 80
-llama_model_loader: - kv  15:                       llama.context_length u32              = 131072
-llama_model_loader: - kv  16:                     llama.embedding_length u32              = 8192
-llama_model_loader: - kv  17:                  llama.feed_forward_length u32              = 28672
-llama_model_loader: - kv  18:                 llama.attention.head_count u32              = 64
-llama_model_loader: - kv  19:              llama.attention.head_count_kv u32              = 8
-llama_model_loader: - kv  20:                       llama.rope.freq_base f32              = 500000.000000
-llama_model_loader: - kv  21:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
-llama_model_loader: - kv  22:                 llama.attention.key_length u32              = 128
-llama_model_loader: - kv  23:               llama.attention.value_length u32              = 128
-llama_model_loader: - kv  24:                          general.file_type u32              = 15
-llama_model_loader: - kv  25:                           llama.vocab_size u32              = 128256
-llama_model_loader: - kv  26:                 llama.rope.dimension_count u32              = 128
-llama_model_loader: - kv  27:                       tokenizer.ggml.model str              = gpt2
-llama_model_loader: - kv  28:                         tokenizer.ggml.pre str              = llama-bpe
-llama_model_loader: - kv  29:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
-llama_model_loader: - kv  30:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
-llama_model_loader: - kv  31:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
-llama_model_loader: - kv  32:                tokenizer.ggml.bos_token_id u32              = 128000
-llama_model_loader: - kv  33:                tokenizer.ggml.eos_token_id u32              = 128009
-llama_model_loader: - kv  34:                    tokenizer.chat_template str              = {{- bos_token }}\n{%- if custom_tools ...
-llama_model_loader: - kv  35:               general.quantization_version u32              = 2
-llama_model_loader: - type  f32:  162 tensors
-llama_model_loader: - type q4_K:  441 tensors
-llama_model_loader: - type q5_K:   40 tensors
-llama_model_loader: - type q6_K:   81 tensors
-print_info: file format = GGUF V3 (latest)
-print_info: file type   = Q4_K - Medium
-print_info: file size   = 39.59 GiB (4.82 BPW) 
-load: special tokens cache size = 256
-load: token to piece cache size = 0.7999 MB
-print_info: arch             = llama
-print_info: vocab_only       = 0
-print_info: n_ctx_train      = 131072
-print_info: n_embd           = 8192
-print_info: n_layer          = 80
-print_info: n_head           = 64
-print_info: n_head_kv        = 8
-print_info: n_rot            = 128
-print_info: n_swa            = 0
-print_info: is_swa_any       = 0
-print_info: n_embd_head_k    = 128
-print_info: n_embd_head_v    = 128
-print_info: n_gqa            = 8
-print_info: n_embd_k_gqa     = 1024
-print_info: n_embd_v_gqa     = 1024
-print_info: f_norm_eps       = 0.0e+00
-print_info: f_norm_rms_eps   = 1.0e-05
-print_info: f_clamp_kqv      = 0.0e+00
-print_info: f_max_alibi_bias = 0.0e+00
-print_info: f_logit_scale    = 0.0e+00
-print_info: f_attn_scale     = 0.0e+00
-print_info: n_ff             = 28672
-print_info: n_expert         = 0
-print_info: n_expert_used    = 0
-print_info: causal attn      = 1
-print_info: pooling type     = 0
-print_info: rope type        = 0
-print_info: rope scaling     = linear
-print_info: freq_base_train  = 500000.0
-print_info: freq_scale_train = 1
-print_info: n_ctx_orig_yarn  = 131072
-print_info: rope_finetuned   = unknown
-print_info: model type       = 70B
-print_info: model params     = 70.55 B
-print_info: general.name     = Llama 3.1 70B Instruct 2024 12
-print_info: vocab type       = BPE
-print_info: n_vocab          = 128256
-print_info: n_merges         = 280147
-print_info: BOS token        = 128000 '<|begin_of_text|>'
-print_info: EOS token        = 128009 '<|eot_id|>'
-print_info: EOT token        = 128009 '<|eot_id|>'
-print_info: EOM token        = 128008 '<|eom_id|>'
-print_info: LF token         = 198 'Ċ'
-print_info: EOG token        = 128001 '<|end_of_text|>'
-print_info: EOG token        = 128008 '<|eom_id|>'
-print_info: EOG token        = 128009 '<|eot_id|>'
-print_info: max token length = 256
-load_tensors: loading model tensors, this can take a while... (mmap = false)
-load_tensors: offloading 80 repeating layers to GPU
-load_tensors: offloading output layer to GPU
-load_tensors: offloaded 81/81 layers to GPU
-load_tensors:          CPU model buffer size =   563.62 MiB
-load_tensors:        ROCm0 model buffer size = 39979.48 MiB
-...................................................................................................
-llama_context: constructing llama_context
-llama_context: non-unified KV cache requires ggml_set_rows() - forcing unified KV cache
-llama_context: n_seq_max     = 1
-llama_context: n_ctx         = 4096
-llama_context: n_ctx_per_seq = 4096
-llama_context: n_batch       = 2048
-llama_context: n_ubatch      = 512
-llama_context: causal_attn   = 1
-llama_context: flash_attn    = 1
-llama_context: kv_unified    = true
-llama_context: freq_base     = 500000.0
-llama_context: freq_scale    = 1
-llama_context: n_ctx_per_seq (4096) < n_ctx_train (131072) -- the full capacity of the model will not be utilized
-llama_context:  ROCm_Host  output buffer size =     0.49 MiB
-llama_kv_cache_unified:      ROCm0 KV buffer size =  1280.00 MiB
-llama_kv_cache_unified: size = 1280.00 MiB (  4096 cells,  80 layers,  1/ 1 seqs), K (f16):  640.00 MiB, V (f16):  640.00 MiB
-llama_kv_cache_unified: LLAMA_SET_ROWS=0, using old ggml_cpy() method for backwards compatibility
-llama_context:      ROCm0 compute buffer size =   266.50 MiB
-llama_context:  ROCm_Host compute buffer size =    24.01 MiB
-llama_context: graph nodes  = 2647
-llama_context: graph splits = 2
-common_init_from_params: added <|end_of_text|> logit bias = -inf
-common_init_from_params: added <|eom_id|> logit bias = -inf
-common_init_from_params: added <|eot_id|> logit bias = -inf
-common_init_from_params: setting dry_penalty_last_n to ctx_size = 4096
-common_init_from_params: warming up the model with an empty run - please wait ... (--no-warmup to disable)
-main: llama threadpool init, n_threads = 16
-
-system_info: n_threads = 16 (n_threads_batch = 16) / 32 | ROCm : NO_VMM = 1 | PEER_MAX_BATCH_SIZE = 128 | CPU : SSE3 = 1 | SSSE3 = 1 | AVX = 1 | AVX2 = 1 | F16C = 1 | FMA = 1 | BMI2 = 1 | AVX512 = 1 | AVX512_VBMI = 1 | AVX512_VNNI = 1 | LLAMAFILE = 1 | OPENMP = 1 | REPACK = 1 | 
-
-sampler seed: 59935472
-sampler params: 
-	repeat_last_n = 64, repeat_penalty = 1.000, frequency_penalty = 0.000, presence_penalty = 0.000
-	dry_multiplier = 0.000, dry_base = 1.750, dry_allowed_length = 2, dry_penalty_last_n = 4096
-	top_k = 40, top_p = 0.950, min_p = 0.050, xtc_probability = 0.000, xtc_threshold = 0.100, typical_p = 1.000, top_n_sigma = -1.000, temp = 0.800
-	mirostat = 0, mirostat_lr = 0.100, mirostat_ent = 5.000
-sampler chain: logits -> logit-bias -> penalties -> dry -> top-n-sigma -> top-k -> typical -> top-p -> min-p -> xtc -> temp-ext -> dist 
-generate: n_ctx = 4096, n_batch = 2048, n_predict = 1, n_keep = 1
-
-Hello.
-
-llama_perf_sampler_print:    sampling time =       0.07 ms /     3 runs   (    0.02 ms per token, 46153.85 tokens per second)
-llama_perf_context_print:        load time =   12737.72 ms
-llama_perf_context_print: prompt eval time =     291.99 ms /     2 tokens (  145.99 ms per token,     6.85 tokens per second)
-llama_perf_context_print:        eval time =       0.00 ms /     1 runs   (    0.00 ms per token,      inf tokens per second)
-llama_perf_context_print:       total time =     306.96 ms /     3 tokens
-llama_perf_context_print:    graphs reused =          0
-    Elapsed #3: 13.680764475s
-    Run #3 status: 0
-  → Avg over 3 runs: 14.602s
@@ -1,157 +0,0 @@
-ggml_vulkan: Found 1 Vulkan devices:
-ggml_vulkan: 0 = Radeon 8060S Graphics (AMD open-source driver) | uma: 1 | fp16: 1 | bf16: 0 | warp size: 64 | shared memory: 32768 | int dot: 1 | matrix cores: KHR_coopmat
-build: 6060 (9c35706b) with cc (GCC) 15.1.1 20250719 (Red Hat 15.1.1-5) for x86_64-redhat-linux
-main: llama backend init
-main: load the model and apply lora adapter, if any
-llama_model_load_from_file_impl: using device Vulkan0 (Radeon 8060S Graphics) - 85720 MiB free
-llama_model_loader: loaded meta data with 36 key-value pairs and 724 tensors from /home/kyuz0/models/llama-3.3-Q4_K_M/llama3.3-70.6B-Q4_K_M.gguf (version GGUF V3 (latest))
-llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
-llama_model_loader: - kv   0:                       general.architecture str              = llama
-llama_model_loader: - kv   1:                               general.type str              = model
-llama_model_loader: - kv   2:                               general.name str              = Llama 3.1 70B Instruct 2024 12
-llama_model_loader: - kv   3:                            general.version str              = 2024-12
-llama_model_loader: - kv   4:                           general.finetune str              = Instruct
-llama_model_loader: - kv   5:                           general.basename str              = Llama-3.1
-llama_model_loader: - kv   6:                         general.size_label str              = 70B
-llama_model_loader: - kv   7:                            general.license str              = llama3.1
-llama_model_loader: - kv   8:                   general.base_model.count u32              = 1
-llama_model_loader: - kv   9:                  general.base_model.0.name str              = Llama 3.1 70B
-llama_model_loader: - kv  10:          general.base_model.0.organization str              = Meta Llama
-llama_model_loader: - kv  11:              general.base_model.0.repo_url str              = https://huggingface.co/meta-llama/Lla...
-llama_model_loader: - kv  12:                               general.tags arr[str,5]       = ["facebook", "meta", "pytorch", "llam...
-llama_model_loader: - kv  13:                          general.languages arr[str,7]       = ["fr", "it", "pt", "hi", "es", "th", ...
-llama_model_loader: - kv  14:                          llama.block_count u32              = 80
-llama_model_loader: - kv  15:                       llama.context_length u32              = 131072
-llama_model_loader: - kv  16:                     llama.embedding_length u32              = 8192
-llama_model_loader: - kv  17:                  llama.feed_forward_length u32              = 28672
-llama_model_loader: - kv  18:                 llama.attention.head_count u32              = 64
-llama_model_loader: - kv  19:              llama.attention.head_count_kv u32              = 8
-llama_model_loader: - kv  20:                       llama.rope.freq_base f32              = 500000.000000
-llama_model_loader: - kv  21:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
-llama_model_loader: - kv  22:                 llama.attention.key_length u32              = 128
-llama_model_loader: - kv  23:               llama.attention.value_length u32              = 128
-llama_model_loader: - kv  24:                          general.file_type u32              = 15
-llama_model_loader: - kv  25:                           llama.vocab_size u32              = 128256
-llama_model_loader: - kv  26:                 llama.rope.dimension_count u32              = 128
-llama_model_loader: - kv  27:                       tokenizer.ggml.model str              = gpt2
-llama_model_loader: - kv  28:                         tokenizer.ggml.pre str              = llama-bpe
-llama_model_loader: - kv  29:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
-llama_model_loader: - kv  30:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
-llama_model_loader: - kv  31:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
-llama_model_loader: - kv  32:                tokenizer.ggml.bos_token_id u32              = 128000
-llama_model_loader: - kv  33:                tokenizer.ggml.eos_token_id u32              = 128009
-llama_model_loader: - kv  34:                    tokenizer.chat_template str              = {{- bos_token }}\n{%- if custom_tools ...
-llama_model_loader: - kv  35:               general.quantization_version u32              = 2
-llama_model_loader: - type  f32:  162 tensors
-llama_model_loader: - type q4_K:  441 tensors
-llama_model_loader: - type q5_K:   40 tensors
-llama_model_loader: - type q6_K:   81 tensors
-print_info: file format = GGUF V3 (latest)
-print_info: file type   = Q4_K - Medium
-print_info: file size   = 39.59 GiB (4.82 BPW) 
-load: special tokens cache size = 256
-load: token to piece cache size = 0.7999 MB
-print_info: arch             = llama
-print_info: vocab_only       = 0
-print_info: n_ctx_train      = 131072
-print_info: n_embd           = 8192
-print_info: n_layer          = 80
-print_info: n_head           = 64
-print_info: n_head_kv        = 8
-print_info: n_rot            = 128
-print_info: n_swa            = 0
-print_info: is_swa_any       = 0
-print_info: n_embd_head_k    = 128
-print_info: n_embd_head_v    = 128
-print_info: n_gqa            = 8
-print_info: n_embd_k_gqa     = 1024
-print_info: n_embd_v_gqa     = 1024
-print_info: f_norm_eps       = 0.0e+00
-print_info: f_norm_rms_eps   = 1.0e-05
-print_info: f_clamp_kqv      = 0.0e+00
-print_info: f_max_alibi_bias = 0.0e+00
-print_info: f_logit_scale    = 0.0e+00
-print_info: f_attn_scale     = 0.0e+00
-print_info: n_ff             = 28672
-print_info: n_expert         = 0
-print_info: n_expert_used    = 0
-print_info: causal attn      = 1
-print_info: pooling type     = 0
-print_info: rope type        = 0
-print_info: rope scaling     = linear
-print_info: freq_base_train  = 500000.0
-print_info: freq_scale_train = 1
-print_info: n_ctx_orig_yarn  = 131072
-print_info: rope_finetuned   = unknown
-print_info: model type       = 70B
-print_info: model params     = 70.55 B
-print_info: general.name     = Llama 3.1 70B Instruct 2024 12
-print_info: vocab type       = BPE
-print_info: n_vocab          = 128256
-print_info: n_merges         = 280147
-print_info: BOS token        = 128000 '<|begin_of_text|>'
-print_info: EOS token        = 128009 '<|eot_id|>'
-print_info: EOT token        = 128009 '<|eot_id|>'
-print_info: EOM token        = 128008 '<|eom_id|>'
-print_info: LF token         = 198 'Ċ'
-print_info: EOG token        = 128001 '<|end_of_text|>'
-print_info: EOG token        = 128008 '<|eom_id|>'
-print_info: EOG token        = 128009 '<|eot_id|>'
-print_info: max token length = 256
-load_tensors: loading model tensors, this can take a while... (mmap = false)
-load_tensors: offloading 80 repeating layers to GPU
-load_tensors: offloading output layer to GPU
-load_tensors: offloaded 81/81 layers to GPU
-load_tensors:      Vulkan0 model buffer size = 39979.48 MiB
-load_tensors:          CPU model buffer size =   563.62 MiB
-..................................................................................................
-llama_context: constructing llama_context
-llama_context: non-unified KV cache requires ggml_set_rows() - forcing unified KV cache
-llama_context: n_seq_max     = 1
-llama_context: n_ctx         = 4096
-llama_context: n_ctx_per_seq = 4096
-llama_context: n_batch       = 2048
-llama_context: n_ubatch      = 512
-llama_context: causal_attn   = 1
-llama_context: flash_attn    = 1
-llama_context: kv_unified    = true
-llama_context: freq_base     = 500000.0
-llama_context: freq_scale    = 1
-llama_context: n_ctx_per_seq (4096) < n_ctx_train (131072) -- the full capacity of the model will not be utilized
-llama_context: Vulkan_Host  output buffer size =     0.49 MiB
-llama_kv_cache_unified:    Vulkan0 KV buffer size =  1280.00 MiB
-llama_kv_cache_unified: size = 1280.00 MiB (  4096 cells,  80 layers,  1/ 1 seqs), K (f16):  640.00 MiB, V (f16):  640.00 MiB
-llama_kv_cache_unified: LLAMA_SET_ROWS=0, using old ggml_cpy() method for backwards compatibility
-llama_context:    Vulkan0 compute buffer size =   266.50 MiB
-llama_context: Vulkan_Host compute buffer size =    24.01 MiB
-llama_context: graph nodes  = 2647
-llama_context: graph splits = 2
-common_init_from_params: added <|end_of_text|> logit bias = -inf
-common_init_from_params: added <|eom_id|> logit bias = -inf
-common_init_from_params: added <|eot_id|> logit bias = -inf
-common_init_from_params: setting dry_penalty_last_n to ctx_size = 4096
-common_init_from_params: warming up the model with an empty run - please wait ... (--no-warmup to disable)
-main: llama threadpool init, n_threads = 16
-
-system_info: n_threads = 16 (n_threads_batch = 16) / 32 | CPU : SSE3 = 1 | SSSE3 = 1 | AVX = 1 | AVX2 = 1 | F16C = 1 | FMA = 1 | BMI2 = 1 | AVX512 = 1 | AVX512_VBMI = 1 | AVX512_VNNI = 1 | LLAMAFILE = 1 | OPENMP = 1 | REPACK = 1 | 
-
-sampler seed: 1976378490
-sampler params: 
-	repeat_last_n = 64, repeat_penalty = 1.000, frequency_penalty = 0.000, presence_penalty = 0.000
-	dry_multiplier = 0.000, dry_base = 1.750, dry_allowed_length = 2, dry_penalty_last_n = 4096
-	top_k = 40, top_p = 0.950, min_p = 0.050, xtc_probability = 0.000, xtc_threshold = 0.100, typical_p = 1.000, top_n_sigma = -1.000, temp = 0.800
-	mirostat = 0, mirostat_lr = 0.100, mirostat_ent = 5.000
-sampler chain: logits -> logit-bias -> penalties -> dry -> top-n-sigma -> top-k -> typical -> top-p -> min-p -> xtc -> temp-ext -> dist 
-generate: n_ctx = 4096, n_batch = 2048, n_predict = 1, n_keep = 1
-
-Hello,
-
-llama_perf_sampler_print:    sampling time =       0.08 ms /     3 runs   (    0.03 ms per token, 36585.37 tokens per second)
-llama_perf_context_print:        load time =    6987.06 ms
-llama_perf_context_print: prompt eval time =     210.77 ms /     2 tokens (  105.39 ms per token,     9.49 tokens per second)
-llama_perf_context_print:        eval time =       0.00 ms /     1 runs   (    0.00 ms per token,      inf tokens per second)
-llama_perf_context_print:       total time =     232.45 ms /     3 tokens
-llama_perf_context_print:    graphs reused =          0
-    Elapsed #3: 7.786884955s
-    Run #3 status: 0
-  → Avg over 3 runs: 9.176s
@@ -1,157 +0,0 @@
-ggml_vulkan: Found 1 Vulkan devices:
-ggml_vulkan: 0 = Radeon 8060S Graphics (RADV GFX1151) (radv) | uma: 1 | fp16: 1 | bf16: 0 | warp size: 64 | shared memory: 65536 | int dot: 1 | matrix cores: KHR_coopmat
-build: 6040 (66625a59) with cc (GCC) 15.1.1 20250719 (Red Hat 15.1.1-5) for x86_64-redhat-linux
-main: llama backend init
-main: load the model and apply lora adapter, if any
-llama_model_load_from_file_impl: using device Vulkan0 (Radeon 8060S Graphics (RADV GFX1151)) - 87722 MiB free
-llama_model_loader: loaded meta data with 36 key-value pairs and 724 tensors from /home/kyuz0/models/llama-3.3-Q4_K_M/llama3.3-70.6B-Q4_K_M.gguf (version GGUF V3 (latest))
-llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
-llama_model_loader: - kv   0:                       general.architecture str              = llama
-llama_model_loader: - kv   1:                               general.type str              = model
-llama_model_loader: - kv   2:                               general.name str              = Llama 3.1 70B Instruct 2024 12
-llama_model_loader: - kv   3:                            general.version str              = 2024-12
-llama_model_loader: - kv   4:                           general.finetune str              = Instruct
-llama_model_loader: - kv   5:                           general.basename str              = Llama-3.1
-llama_model_loader: - kv   6:                         general.size_label str              = 70B
-llama_model_loader: - kv   7:                            general.license str              = llama3.1
-llama_model_loader: - kv   8:                   general.base_model.count u32              = 1
-llama_model_loader: - kv   9:                  general.base_model.0.name str              = Llama 3.1 70B
-llama_model_loader: - kv  10:          general.base_model.0.organization str              = Meta Llama
-llama_model_loader: - kv  11:              general.base_model.0.repo_url str              = https://huggingface.co/meta-llama/Lla...
-llama_model_loader: - kv  12:                               general.tags arr[str,5]       = ["facebook", "meta", "pytorch", "llam...
-llama_model_loader: - kv  13:                          general.languages arr[str,7]       = ["fr", "it", "pt", "hi", "es", "th", ...
-llama_model_loader: - kv  14:                          llama.block_count u32              = 80
-llama_model_loader: - kv  15:                       llama.context_length u32              = 131072
-llama_model_loader: - kv  16:                     llama.embedding_length u32              = 8192
-llama_model_loader: - kv  17:                  llama.feed_forward_length u32              = 28672
-llama_model_loader: - kv  18:                 llama.attention.head_count u32              = 64
-llama_model_loader: - kv  19:              llama.attention.head_count_kv u32              = 8
-llama_model_loader: - kv  20:                       llama.rope.freq_base f32              = 500000.000000
-llama_model_loader: - kv  21:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
-llama_model_loader: - kv  22:                 llama.attention.key_length u32              = 128
-llama_model_loader: - kv  23:               llama.attention.value_length u32              = 128
-llama_model_loader: - kv  24:                          general.file_type u32              = 15
-llama_model_loader: - kv  25:                           llama.vocab_size u32              = 128256
-llama_model_loader: - kv  26:                 llama.rope.dimension_count u32              = 128
-llama_model_loader: - kv  27:                       tokenizer.ggml.model str              = gpt2
-llama_model_loader: - kv  28:                         tokenizer.ggml.pre str              = llama-bpe
-llama_model_loader: - kv  29:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
-llama_model_loader: - kv  30:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
-llama_model_loader: - kv  31:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
-llama_model_loader: - kv  32:                tokenizer.ggml.bos_token_id u32              = 128000
-llama_model_loader: - kv  33:                tokenizer.ggml.eos_token_id u32              = 128009
-llama_model_loader: - kv  34:                    tokenizer.chat_template str              = {{- bos_token }}\n{%- if custom_tools ...
-llama_model_loader: - kv  35:               general.quantization_version u32              = 2
-llama_model_loader: - type  f32:  162 tensors
-llama_model_loader: - type q4_K:  441 tensors
-llama_model_loader: - type q5_K:   40 tensors
-llama_model_loader: - type q6_K:   81 tensors
-print_info: file format = GGUF V3 (latest)
-print_info: file type   = Q4_K - Medium
-print_info: file size   = 39.59 GiB (4.82 BPW) 
-load: special tokens cache size = 256
-load: token to piece cache size = 0.7999 MB
-print_info: arch             = llama
-print_info: vocab_only       = 0
-print_info: n_ctx_train      = 131072
-print_info: n_embd           = 8192
-print_info: n_layer          = 80
-print_info: n_head           = 64
-print_info: n_head_kv        = 8
-print_info: n_rot            = 128
-print_info: n_swa            = 0
-print_info: is_swa_any       = 0
-print_info: n_embd_head_k    = 128
-print_info: n_embd_head_v    = 128
-print_info: n_gqa            = 8
-print_info: n_embd_k_gqa     = 1024
-print_info: n_embd_v_gqa     = 1024
-print_info: f_norm_eps       = 0.0e+00
-print_info: f_norm_rms_eps   = 1.0e-05
-print_info: f_clamp_kqv      = 0.0e+00
-print_info: f_max_alibi_bias = 0.0e+00
-print_info: f_logit_scale    = 0.0e+00
-print_info: f_attn_scale     = 0.0e+00
-print_info: n_ff             = 28672
-print_info: n_expert         = 0
-print_info: n_expert_used    = 0
-print_info: causal attn      = 1
-print_info: pooling type     = 0
-print_info: rope type        = 0
-print_info: rope scaling     = linear
-print_info: freq_base_train  = 500000.0
-print_info: freq_scale_train = 1
-print_info: n_ctx_orig_yarn  = 131072
-print_info: rope_finetuned   = unknown
-print_info: model type       = 70B
-print_info: model params     = 70.55 B
-print_info: general.name     = Llama 3.1 70B Instruct 2024 12
-print_info: vocab type       = BPE
-print_info: n_vocab          = 128256
-print_info: n_merges         = 280147
-print_info: BOS token        = 128000 '<|begin_of_text|>'
-print_info: EOS token        = 128009 '<|eot_id|>'
-print_info: EOT token        = 128009 '<|eot_id|>'
-print_info: EOM token        = 128008 '<|eom_id|>'
-print_info: LF token         = 198 'Ċ'
-print_info: EOG token        = 128001 '<|end_of_text|>'
-print_info: EOG token        = 128008 '<|eom_id|>'
-print_info: EOG token        = 128009 '<|eot_id|>'
-print_info: max token length = 256
-load_tensors: loading model tensors, this can take a while... (mmap = false)
-load_tensors: offloading 80 repeating layers to GPU
-load_tensors: offloading output layer to GPU
-load_tensors: offloaded 81/81 layers to GPU
-load_tensors:      Vulkan0 model buffer size = 39979.48 MiB
-load_tensors:          CPU model buffer size =   563.62 MiB
-..................................................................................................
-llama_context: constructing llama_context
-llama_context: non-unified KV cache requires ggml_set_rows() - forcing unified KV cache
-llama_context: n_seq_max     = 1
-llama_context: n_ctx         = 4096
-llama_context: n_ctx_per_seq = 4096
-llama_context: n_batch       = 2048
-llama_context: n_ubatch      = 512
-llama_context: causal_attn   = 1
-llama_context: flash_attn    = 1
-llama_context: kv_unified    = true
-llama_context: freq_base     = 500000.0
-llama_context: freq_scale    = 1
-llama_context: n_ctx_per_seq (4096) < n_ctx_train (131072) -- the full capacity of the model will not be utilized
-llama_context: Vulkan_Host  output buffer size =     0.49 MiB
-llama_kv_cache_unified:    Vulkan0 KV buffer size =  1280.00 MiB
-llama_kv_cache_unified: size = 1280.00 MiB (  4096 cells,  80 layers,  1/ 1 seqs), K (f16):  640.00 MiB, V (f16):  640.00 MiB
-llama_kv_cache_unified: LLAMA_SET_ROWS=0, using old ggml_cpy() method for backwards compatibility
-llama_context:    Vulkan0 compute buffer size =   266.50 MiB
-llama_context: Vulkan_Host compute buffer size =    24.01 MiB
-llama_context: graph nodes  = 2647
-llama_context: graph splits = 2
-common_init_from_params: added <|end_of_text|> logit bias = -inf
-common_init_from_params: added <|eom_id|> logit bias = -inf
-common_init_from_params: added <|eot_id|> logit bias = -inf
-common_init_from_params: setting dry_penalty_last_n to ctx_size = 4096
-common_init_from_params: warming up the model with an empty run - please wait ... (--no-warmup to disable)
-main: llama threadpool init, n_threads = 16
-
-system_info: n_threads = 16 (n_threads_batch = 16) / 32 | CPU : SSE3 = 1 | SSSE3 = 1 | AVX = 1 | AVX2 = 1 | F16C = 1 | FMA = 1 | BMI2 = 1 | AVX512 = 1 | AVX512_VBMI = 1 | AVX512_VNNI = 1 | LLAMAFILE = 1 | OPENMP = 1 | REPACK = 1 | 
-
-sampler seed: 2613669910
-sampler params: 
-	repeat_last_n = 64, repeat_penalty = 1.000, frequency_penalty = 0.000, presence_penalty = 0.000
-	dry_multiplier = 0.000, dry_base = 1.750, dry_allowed_length = 2, dry_penalty_last_n = 4096
-	top_k = 40, top_p = 0.950, min_p = 0.050, xtc_probability = 0.000, xtc_threshold = 0.100, typical_p = 1.000, top_n_sigma = -1.000, temp = 0.800
-	mirostat = 0, mirostat_lr = 0.100, mirostat_ent = 5.000
-sampler chain: logits -> logit-bias -> penalties -> dry -> top-n-sigma -> top-k -> typical -> top-p -> min-p -> xtc -> temp-ext -> dist 
-generate: n_ctx = 4096, n_batch = 2048, n_predict = 1, n_keep = 1
-
-Hello's
-
-llama_perf_sampler_print:    sampling time =       0.07 ms /     3 runs   (    0.02 ms per token, 40540.54 tokens per second)
-llama_perf_context_print:        load time =    8119.06 ms
-llama_perf_context_print: prompt eval time =     204.01 ms /     2 tokens (  102.01 ms per token,     9.80 tokens per second)
-llama_perf_context_print:        eval time =       0.00 ms /     1 runs   (    0.00 ms per token,      inf tokens per second)
-llama_perf_context_print:       total time =     225.18 ms /     3 tokens
-llama_perf_context_print:    graphs reused =          0
-    Elapsed #3: 8.699816033s
-    Run #3 status: 0
-  → Avg over 3 runs: 8.816s
@@ -1,71 +0,0 @@
#!/usr/bin/env python3
"""
Parse the console output of run_loadtime_benchmarks.sh and print a Markdown
table of average load+inference times per model/env.

Usage:
    script [LOGFILE]

The log path defaults to the module-level LOGFILE constant; an optional first
command-line argument overrides it.
"""
import re
from collections import defaultdict, OrderedDict
import sys

LOGFILE = 'run_loadtime_benchmark.log'
# Define expected environments in desired column order
ENV_ORDER = ['vulkan_radv','vulkan_amdvlk','rocm6_4_2','rocm7_beta','rocm7_rc']

# Regex patterns (matched against the start of each stripped log line)
ENTRY_RE = re.compile(r"✔ \[(?P<env>[^]]+)\] (?P<model>[^ ]+) avg=(?P<avg>[0-9.]+)s over (?P<n>[0-9]+) runs")
FAIL_RE  = re.compile(r"✖ \[(?P<env>[^]]+)\] (?P<model>[^ ]+) all runs failed")


def parse_log(lines):
    """Collect per-model, per-environment timings from an iterable of log lines.

    Returns a mapping ``results[model][env]`` whose value is the average time
    in seconds (float) for a successful entry, or ``None`` when the log
    reports that all runs failed. Lines matching neither pattern are ignored.
    If a model/env pair appears more than once, the last occurrence wins.
    """
    results = defaultdict(dict)
    for raw in lines:
        line = raw.strip()
        ok = ENTRY_RE.match(line)
        if ok:
            results[ok.group('model')][ok.group('env')] = float(ok.group('avg'))
            continue
        failed = FAIL_RE.match(line)
        if failed:
            results[failed.group('model')][failed.group('env')] = None  # indicate failure
    return results


def _format_cell(env_times, env):
    """Render one table cell for *env* from a single model's timing dict."""
    if env not in env_times:
        return '—'
    seconds = env_times[env]
    if seconds is None:
        return '⚠️ Fail'
    return f"{seconds:.2f}s"


def build_table(results, env_order=None):
    """Render *results* (as produced by ``parse_log``) as a Markdown table.

    One row per model, sorted case-insensitively; one column per environment
    in *env_order* (defaults to ENV_ORDER), plus a final "Fastest" column
    naming the environment with the smallest time. Environments that are not
    listed in *env_order* are not shown and do not compete for "Fastest".
    """
    envs = ENV_ORDER if env_order is None else env_order

    header = ['Model'] + [e.replace('_', ' ').title() for e in envs] + ['Fastest']
    md_lines = [
        '| ' + ' | '.join(header) + ' |',
        '|' + '|'.join(['---'] * len(header)) + '|',
    ]

    for model in sorted(results, key=lambda s: s.lower()):
        env_times = results[model]
        # Only successful runs in known environments compete for "Fastest".
        valid = {e: env_times[e] for e in envs if env_times.get(e) is not None}
        if valid:
            # min() keeps the first env in column order on ties.
            fastest = f"🏆 **{min(valid, key=valid.get)}**"
        else:
            fastest = '—'
        row = [f"**{model}**"]
        row.extend(_format_cell(env_times, env) for env in envs)
        row.append(fastest)
        md_lines.append('| ' + ' | '.join(row) + ' |')

    return '\n'.join(md_lines)


def main(argv=None):
    """Read the log (path from *argv* or LOGFILE) and print the Markdown table.

    Exits with an error message instead of a traceback when the log cannot
    be read.
    """
    args = sys.argv[1:] if argv is None else argv
    path = args[0] if args else LOGFILE
    try:
        # The status markers are non-ASCII, so decode explicitly as UTF-8
        # rather than relying on the platform locale; stray bytes in the log
        # are replaced instead of aborting the whole parse.
        with open(path, encoding='utf-8', errors='replace') as f:
            results = parse_log(f)
    except OSError as err:
        sys.exit(f"error: cannot read log file {path!r}: {err}")
    print(build_table(results))


if __name__ == '__main__':
    main()
-
@@ -1,6 +0,0 @@
-ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
-ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
-ggml_cuda_init: found 1 ROCm devices:
-  Device 0: Radeon 8060S Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32
-Memory access fault by GPU node-1 (Agent handle: 0x275a2540) on address 0x7f3fb2c08000. Reason: Page not present or supervisor privilege.
-✖ ! [rocm6_4_2-rocwmma] GLM-4.5-Air-UD-Q4_K_XL-00001-of-00002 failed (exit 134)
@@ -1,6 +0,0 @@
-ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
-ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
-ggml_cuda_init: found 1 ROCm devices:
-  Device 0: Radeon 8060S Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32
-HW Exception by GPU node-1 (Agent handle: 0x25d19540) reason :GPU Hang
-✖ ! [rocm6_4_2-rocwmma] GLM-4.5-Air-UD-Q4_K_XL-00001-of-00002 __fa1 failed (exit 134)
@@ -1,10 +0,0 @@
-ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
-ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
-ggml_cuda_init: found 1 ROCm devices:
-  Device 0: Radeon 8060S Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32
-| model                          |       size |     params | backend    | ngl | mmap |            test |                  t/s |
-| ------------------------------ | ---------: | ---------: | ---------- | --: | ---: | --------------: | -------------------: |
-| glm4moe 106B.A12B Q4_K - Medium |  68.01 GiB |   110.47 B | ROCm       |  99 |    0 |           pp512 |        131.14 ± 0.28 |
-| glm4moe 106B.A12B Q4_K - Medium |  68.01 GiB |   110.47 B | ROCm       |  99 |    0 |           tg128 |         20.15 ± 0.01 |
-
-build: de219279 (6181)
@@ -1,10 +0,0 @@
-ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
-ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
-ggml_cuda_init: found 1 ROCm devices:
-  Device 0: Radeon 8060S Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32
-| model                          |       size |     params | backend    | ngl | fa | mmap |            test |                  t/s |
-| ------------------------------ | ---------: | ---------: | ---------- | --: | -: | ---: | --------------: | -------------------: |
-| glm4moe 106B.A12B Q4_K - Medium |  68.01 GiB |   110.47 B | ROCm       |  99 |  1 |    0 |           pp512 |        104.12 ± 0.05 |
-| glm4moe 106B.A12B Q4_K - Medium |  68.01 GiB |   110.47 B | ROCm       |  99 |  1 |    0 |           tg128 |         20.35 ± 0.00 |
-
-build: de219279 (6181)
@@ -1,6 +0,0 @@
-ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
-ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
-ggml_cuda_init: found 1 ROCm devices:
-  Device 0: Radeon 8060S Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32
-HW Exception by GPU node-1 (Agent handle: 0x3e28b540) reason :GPU Hang
-✖ ! [rocm6_4_2-rocwmma] GLM-4.5-Air-UD-Q6_K_XL-00001-of-00003 failed (exit 134)
@@ -1,6 +0,0 @@
-ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
-ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
-ggml_cuda_init: found 1 ROCm devices:
-  Device 0: Radeon 8060S Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32
-Memory access fault by GPU node-1 (Agent handle: 0x2bdf8540) on address 0x7f5f95e35000. Reason: Page not present or supervisor privilege.
-✖ ! [rocm6_4_2-rocwmma] GLM-4.5-Air-UD-Q6_K_XL-00001-of-00003 __fa1 failed (exit 134)
@@ -1,6 +0,0 @@
-ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
-ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
-ggml_cuda_init: found 1 ROCm devices:
-  Device 0: Radeon 8060S Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32
-HW Exception by GPU node-1 (Agent handle: 0x3ff2d540) reason :GPU Hang
-✖ ! [rocm6_4_2] GLM-4.5-Air-UD-Q6_K_XL-00001-of-00003 failed (exit 134)
@@ -1,6 +0,0 @@
-ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
-ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
-ggml_cuda_init: found 1 ROCm devices:
-  Device 0: Radeon 8060S Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32
-HW Exception by GPU node-1 (Agent handle: 0x3bb3540) reason :GPU Hang
-✖ ! [rocm6_4_2] GLM-4.5-Air-UD-Q6_K_XL-00001-of-00003 __fa1 failed (exit 134)
@@ -1,6 +0,0 @@
-ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
-ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
-ggml_cuda_init: found 1 ROCm devices:
-  Device 0: Radeon 8060S Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32
-HW Exception by GPU node-1 (Agent handle: 0x33b8a540) reason :GPU Hang
-✖ ! [rocm6_4_2-rocwmma] Llama-3.3-70B-Instruct-UD-Q8_K_XL-00001-of-00002 failed (exit 134)
@@ -1,6 +0,0 @@
-ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
-ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
-ggml_cuda_init: found 1 ROCm devices:
-  Device 0: Radeon 8060S Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32
-HW Exception by GPU node-1 (Agent handle: 0x20e35540) reason :GPU Hang
-✖ ! [rocm6_4_2-rocwmma] Llama-3.3-70B-Instruct-UD-Q8_K_XL-00001-of-00002 __fa1 failed (exit 134)
@@ -1,6 +0,0 @@
-ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
-ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
-ggml_cuda_init: found 1 ROCm devices:
-  Device 0: Radeon 8060S Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32
-HW Exception by GPU node-1 (Agent handle: 0x1b1ea540) reason :GPU Hang
-✖ ! [rocm6_4_2] Llama-3.3-70B-Instruct-UD-Q8_K_XL-00001-of-00002 failed (exit 134)
@@ -1,10 +0,0 @@
-ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
-ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
-ggml_cuda_init: found 1 ROCm devices:
-  Device 0: Radeon 8060S Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32
-| model                          |       size |     params | backend    | ngl | fa | mmap |            test |                  t/s |
-| ------------------------------ | ---------: | ---------: | ---------- | --: | -: | ---: | --------------: | -------------------: |
-| llama 70B Q8_0                 |  75.65 GiB |    70.55 B | ROCm       |  99 |  1 |    0 |           pp512 |         16.16 ± 0.02 |
-| llama 70B Q8_0                 |  75.65 GiB |    70.55 B | ROCm       |  99 |  1 |    0 |           tg128 |          2.78 ± 0.00 |
-
-build: de219279 (6181)
@@ -1,6 +0,0 @@
-ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
-ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
-ggml_cuda_init: found 1 ROCm devices:
-  Device 0: Radeon 8060S Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32
-HW Exception by GPU node-1 (Agent handle: 0x344ea540) reason :GPU Hang
-✖ ! [rocm6_4_2-rocwmma] Llama-4-Scout-17B-16E-Instruct-Q6_K-00001-of-00002 failed (exit 134)
@@ -1,6 +0,0 @@
-ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
-ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
-ggml_cuda_init: found 1 ROCm devices:
-  Device 0: Radeon 8060S Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32
-HW Exception by GPU node-1 (Agent handle: 0xe316540) reason :GPU Hang
-✖ ! [rocm6_4_2-rocwmma] Llama-4-Scout-17B-16E-Instruct-Q6_K-00001-of-00002 __fa1 failed (exit 134)
@@ -1,6 +0,0 @@
-ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
-ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
-ggml_cuda_init: found 1 ROCm devices:
-  Device 0: Radeon 8060S Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32
-HW Exception by GPU node-1 (Agent handle: 0x17ade540) reason :GPU Hang
-✖ ! [rocm6_4_2] Llama-4-Scout-17B-16E-Instruct-Q6_K-00001-of-00002 failed (exit 134)
@@ -1,6 +0,0 @@
-ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
-ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
-ggml_cuda_init: found 1 ROCm devices:
-  Device 0: Radeon 8060S Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32
-HW Exception by GPU node-1 (Agent handle: 0xe91f540) reason :GPU Hang
-✖ ! [rocm6_4_2] Llama-4-Scout-17B-16E-Instruct-Q6_K-00001-of-00002 __fa1 failed (exit 134)
@@ -1,6 +0,0 @@
-ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
-ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
-ggml_cuda_init: found 1 ROCm devices:
-  Device 0: Radeon 8060S Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32
-HW Exception by GPU node-1 (Agent handle: 0x1019d540) reason :GPU Hang
-✖ ! [rocm6_4_2-rocwmma] Llama-4-Scout-17B-16E-Instruct-Q8_0-00001-of-00003 failed (exit 134)
@@ -1,6 +0,0 @@
-ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
-ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
-ggml_cuda_init: found 1 ROCm devices:
-  Device 0: Radeon 8060S Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32
-HW Exception by GPU node-1 (Agent handle: 0x2ff5c540) reason :GPU Hang
-✖ ! [rocm6_4_2-rocwmma] Llama-4-Scout-17B-16E-Instruct-Q8_0-00001-of-00003 __fa1 failed (exit 134)
@@ -1,6 +0,0 @@
-ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
-ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
-ggml_cuda_init: found 1 ROCm devices:
-  Device 0: Radeon 8060S Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32
-HW Exception by GPU node-1 (Agent handle: 0x3db80540) reason :GPU Hang
-✖ ! [rocm6_4_2] Llama-4-Scout-17B-16E-Instruct-Q8_0-00001-of-00003 failed (exit 134)
@@ -1,6 +0,0 @@
-ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
-ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
-ggml_cuda_init: found 1 ROCm devices:
-  Device 0: Radeon 8060S Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32
-HW Exception by GPU node-1 (Agent handle: 0x24a4c540) reason :GPU Hang
-✖ ! [rocm6_4_2] Llama-4-Scout-17B-16E-Instruct-Q8_0-00001-of-00003 __fa1 failed (exit 134)
@@ -1,6 +0,0 @@
-ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
-ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
-ggml_cuda_init: found 1 ROCm devices:
-  Device 0: Radeon 8060S Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32
-Memory access fault by GPU node-1 (Agent handle: 0x3e5ce540) on address 0x7f64d3b76000. Reason: Page not present or supervisor privilege.
-✖ ! [rocm6_4_2-rocwmma] Llama-4-Scout-17B-16E-Instruct-UD-Q4_K_XL-00001-of-00002 failed (exit 134)
@@ -1,6 +0,0 @@
-ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
-ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
-ggml_cuda_init: found 1 ROCm devices:
-  Device 0: Radeon 8060S Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32
-HW Exception by GPU node-1 (Agent handle: 0x1239e540) reason :GPU Hang
-✖ ! [rocm6_4_2-rocwmma] Llama-4-Scout-17B-16E-Instruct-UD-Q4_K_XL-00001-of-00002 __fa1 failed (exit 134)
@@ -1,6 +0,0 @@
-ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
-ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
-ggml_cuda_init: found 1 ROCm devices:
-  Device 0: Radeon 8060S Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32
-HW Exception by GPU node-1 (Agent handle: 0x101f4540) reason :GPU Hang
-✖ ! [rocm6_4_2] Llama-4-Scout-17B-16E-Instruct-UD-Q4_K_XL-00001-of-00002 failed (exit 134)
@@ -1,6 +0,0 @@
-ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
-ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
-ggml_cuda_init: found 1 ROCm devices:
-  Device 0: Radeon 8060S Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32
-Memory access fault by GPU node-1 (Agent handle: 0x15f12540) on address 0x7ef17d976000. Reason: Page not present or supervisor privilege.
-✖ ! [rocm6_4_2] Llama-4-Scout-17B-16E-Instruct-UD-Q4_K_XL-00001-of-00002 __fa1 failed (exit 134)
@@ -1,6 +0,0 @@
-ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
-ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
-ggml_cuda_init: found 1 ROCm devices:
-  Device 0: Radeon 8060S Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32
-HW Exception by GPU node-1 (Agent handle: 0x2f5d1540) reason :GPU Hang
-✖ ! [rocm6_4_2-rocwmma] Qwen3-235B-A22B-Instruct-2507-UD-Q3_K_XL-00001-of-00003 failed (exit 134)
@@ -1,6 +0,0 @@
-ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
-ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
-ggml_cuda_init: found 1 ROCm devices:
-  Device 0: Radeon 8060S Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32
-HW Exception by GPU node-1 (Agent handle: 0xdc93540) reason :GPU Hang
-✖ ! [rocm6_4_2-rocwmma] Qwen3-235B-A22B-Instruct-2507-UD-Q3_K_XL-00001-of-00003 __fa1 failed (exit 134)
@@ -1,6 +0,0 @@
-ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
-ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
-ggml_cuda_init: found 1 ROCm devices:
-  Device 0: Radeon 8060S Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32
-HW Exception by GPU node-1 (Agent handle: 0xff7540) reason :GPU Hang
-✖ ! [rocm6_4_2] Qwen3-235B-A22B-Instruct-2507-UD-Q3_K_XL-00001-of-00003 failed (exit 134)
@@ -1,6 +0,0 @@
-ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
-ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
-ggml_cuda_init: found 1 ROCm devices:
-  Device 0: Radeon 8060S Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32
-HW Exception by GPU node-1 (Agent handle: 0x2607e540) reason :GPU Hang
-✖ ! [rocm6_4_2] Qwen3-235B-A22B-Instruct-2507-UD-Q3_K_XL-00001-of-00003 __fa1 failed (exit 134)
@@ -1,10 +0,0 @@
-ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
-ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
-ggml_cuda_init: found 1 ROCm devices:
-  Device 0: Radeon 8060S Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32
-| model                          |       size |     params | backend    | ngl | mmap |            test |                  t/s |
-| ------------------------------ | ---------: | ---------: | ---------- | --: | ---: | --------------: | -------------------: |
-| qwen3moe 30B.A3B BF16          |  56.89 GiB |    30.53 B | ROCm       |  99 |    0 |           pp512 |        157.75 ± 2.58 |
-| qwen3moe 30B.A3B BF16          |  56.89 GiB |    30.53 B | ROCm       |  99 |    0 |           tg128 |         24.62 ± 0.00 |
-
-build: de219279 (6181)
@@ -1,10 +0,0 @@
-ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
-ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
-ggml_cuda_init: found 1 ROCm devices:
-  Device 0: Radeon 8060S Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32
-| model                          |       size |     params | backend    | ngl | fa | mmap |            test |                  t/s |
-| ------------------------------ | ---------: | ---------: | ---------- | --: | -: | ---: | --------------: | -------------------: |
-| qwen3moe 30B.A3B BF16          |  56.89 GiB |    30.53 B | ROCm       |  99 |  1 |    0 |           pp512 |        161.90 ± 3.05 |
-| qwen3moe 30B.A3B BF16          |  56.89 GiB |    30.53 B | ROCm       |  99 |  1 |    0 |           tg128 |         24.09 ± 0.02 |
-
-build: de219279 (6181)
@@ -1,10 +0,0 @@
-ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
-ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
-ggml_cuda_init: found 1 ROCm devices:
-  Device 0: Radeon 8060S Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32
-| model                          |       size |     params | backend    | ngl | mmap |            test |                  t/s |
-| ------------------------------ | ---------: | ---------: | ---------- | --: | ---: | --------------: | -------------------: |
-| qwen3moe 30B.A3B BF16          |  56.89 GiB |    30.53 B | ROCm       |  99 |    0 |           pp512 |        157.81 ± 2.51 |
-| qwen3moe 30B.A3B BF16          |  56.89 GiB |    30.53 B | ROCm       |  99 |    0 |           tg128 |         24.61 ± 0.01 |
-
-build: de219279 (6181)
@@ -1,10 +0,0 @@
-ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
-ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
-ggml_cuda_init: found 1 ROCm devices:
-  Device 0: Radeon 8060S Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32
-| model                          |       size |     params | backend    | ngl | fa | mmap |            test |                  t/s |
-| ------------------------------ | ---------: | ---------: | ---------- | --: | -: | ---: | --------------: | -------------------: |
-| qwen3moe 30B.A3B BF16          |  56.89 GiB |    30.53 B | ROCm       |  99 |  1 |    0 |           pp512 |        140.24 ± 1.86 |
-| qwen3moe 30B.A3B BF16          |  56.89 GiB |    30.53 B | ROCm       |  99 |  1 |    0 |           tg128 |         24.46 ± 0.02 |
-
-build: de219279 (6181)
@@ -1,10 +0,0 @@
-ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
-ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
-ggml_cuda_init: found 1 ROCm devices:
-  Device 0: Radeon 8060S Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32
-| model                          |       size |     params | backend    | ngl | mmap |            test |                  t/s |
-| ------------------------------ | ---------: | ---------: | ---------- | --: | ---: | --------------: | -------------------: |
-| qwen3moe 30B.A3B Q6_K          |  24.53 GiB |    30.53 B | ROCm       |  99 |    0 |           pp512 |        387.23 ± 0.82 |
-| qwen3moe 30B.A3B Q6_K          |  24.53 GiB |    30.53 B | ROCm       |  99 |    0 |           tg128 |         50.64 ± 0.01 |
-
-build: de219279 (6181)
@@ -1,10 +0,0 @@
-ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
-ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
-ggml_cuda_init: found 1 ROCm devices:
-  Device 0: Radeon 8060S Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32
-| model                          |       size |     params | backend    | ngl | fa | mmap |            test |                  t/s |
-| ------------------------------ | ---------: | ---------: | ---------- | --: | -: | ---: | --------------: | -------------------: |
-| qwen3moe 30B.A3B Q6_K          |  24.53 GiB |    30.53 B | ROCm       |  99 |  1 |    0 |           pp512 |        411.72 ± 1.04 |
-| qwen3moe 30B.A3B Q6_K          |  24.53 GiB |    30.53 B | ROCm       |  99 |  1 |    0 |           tg128 |         48.78 ± 0.00 |
-
-build: de219279 (6181)
@@ -1,10 +0,0 @@
-ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
-ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
-ggml_cuda_init: found 1 ROCm devices:
-  Device 0: Radeon 8060S Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32
-| model                          |       size |     params | backend    | ngl | mmap |            test |                  t/s |
-| ------------------------------ | ---------: | ---------: | ---------- | --: | ---: | --------------: | -------------------: |
-| qwen3moe 30B.A3B Q6_K          |  24.53 GiB |    30.53 B | ROCm       |  99 |    0 |           pp512 |        387.86 ± 1.41 |
-| qwen3moe 30B.A3B Q6_K          |  24.53 GiB |    30.53 B | ROCm       |  99 |    0 |           tg128 |         50.65 ± 0.01 |
-
-build: de219279 (6181)
@@ -1,10 +0,0 @@
-ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
-ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
-ggml_cuda_init: found 1 ROCm devices:
-  Device 0: Radeon 8060S Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32
-| model                          |       size |     params | backend    | ngl | fa | mmap |            test |                  t/s |
-| ------------------------------ | ---------: | ---------: | ---------- | --: | -: | ---: | --------------: | -------------------: |
-| qwen3moe 30B.A3B Q6_K          |  24.53 GiB |    30.53 B | ROCm       |  99 |  1 |    0 |           pp512 |        301.23 ± 0.49 |
-| qwen3moe 30B.A3B Q6_K          |  24.53 GiB |    30.53 B | ROCm       |  99 |  1 |    0 |           tg128 |         50.07 ± 0.02 |
-
-build: de219279 (6181)
@@ -1,10 +0,0 @@
-ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
-ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
-ggml_cuda_init: found 1 ROCm devices:
-  Device 0: Radeon 8060S Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32
-| model                          |       size |     params | backend    | ngl | mmap |            test |                  t/s |
-| ------------------------------ | ---------: | ---------: | ---------- | --: | ---: | --------------: | -------------------: |
-| gemma3 12B Q8_0                |  13.40 GiB |    11.77 B | ROCm       |  99 |    0 |           pp512 |        222.91 ± 0.21 |
-| gemma3 12B Q8_0                |  13.40 GiB |    11.77 B | ROCm       |  99 |    0 |           tg128 |         14.03 ± 0.00 |
-
-build: de219279 (6181)
@@ -1,10 +0,0 @@
-ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
-ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
-ggml_cuda_init: found 1 ROCm devices:
-  Device 0: Radeon 8060S Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32
-| model                          |       size |     params | backend    | ngl | fa | mmap |            test |                  t/s |
-| ------------------------------ | ---------: | ---------: | ---------- | --: | -: | ---: | --------------: | -------------------: |
-| gemma3 12B Q8_0                |  13.40 GiB |    11.77 B | ROCm       |  99 |  1 |    0 |           pp512 |        229.15 ± 0.24 |
-| gemma3 12B Q8_0                |  13.40 GiB |    11.77 B | ROCm       |  99 |  1 |    0 |           tg128 |         13.76 ± 0.00 |
-
-build: de219279 (6181)
@@ -1,10 +0,0 @@
-ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
-ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
-ggml_cuda_init: found 1 ROCm devices:
-  Device 0: Radeon 8060S Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32
-| model                          |       size |     params | backend    | ngl | mmap |            test |                  t/s |
-| ------------------------------ | ---------: | ---------: | ---------- | --: | ---: | --------------: | -------------------: |
-| gemma3 12B Q8_0                |  13.40 GiB |    11.77 B | ROCm       |  99 |    0 |           pp512 |        222.59 ± 0.24 |
-| gemma3 12B Q8_0                |  13.40 GiB |    11.77 B | ROCm       |  99 |    0 |           tg128 |         14.03 ± 0.00 |
-
-build: de219279 (6181)
@@ -1,10 +0,0 @@
-ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
-ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
-ggml_cuda_init: found 1 ROCm devices:
-  Device 0: Radeon 8060S Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32
-| model                          |       size |     params | backend    | ngl | fa | mmap |            test |                  t/s |
-| ------------------------------ | ---------: | ---------: | ---------- | --: | -: | ---: | --------------: | -------------------: |
-| gemma3 12B Q8_0                |  13.40 GiB |    11.77 B | ROCm       |  99 |  1 |    0 |           pp512 |        197.89 ± 3.40 |
-| gemma3 12B Q8_0                |  13.40 GiB |    11.77 B | ROCm       |  99 |  1 |    0 |           tg128 |         13.76 ± 0.00 |
-
-build: de219279 (6181)
@@ -1,10 +0,0 @@
-ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
-ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
-ggml_cuda_init: found 1 ROCm devices:
-  Device 0: Radeon 8060S Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32
-| model                          |       size |     params | backend    | ngl | mmap |            test |                  t/s |
-| ------------------------------ | ---------: | ---------: | ---------- | --: | ---: | --------------: | -------------------: |
-| gemma3 27B BF16                |  50.31 GiB |    27.01 B | ROCm       |  99 |    0 |           pp512 |         87.20 ± 3.70 |
-| gemma3 27B BF16                |  50.31 GiB |    27.01 B | ROCm       |  99 |    0 |           tg128 |          4.09 ± 0.00 |
-
-build: de219279 (6181)
@@ -1,10 +0,0 @@
-ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
-ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
-ggml_cuda_init: found 1 ROCm devices:
-  Device 0: Radeon 8060S Graphics, gfx1151 (0x1151), VMM: no, Wave Size: 32
-| model                          |       size |     params | backend    | ngl | fa | mmap |            test |                  t/s |
-| ------------------------------ | ---------: | ---------: | ---------- | --: | -: | ---: | --------------: | -------------------: |
-| gemma3 27B BF16                |  50.31 GiB |    27.01 B | ROCm       |  99 |  1 |    0 |           pp512 |        68.87 ± 14.37 |
-| gemma3 27B BF16                |  50.31 GiB |    27.01 B | ROCm       |  99 |  1 |    0 |           tg128 |          4.08 ± 0.00 |
-
-build: de219279 (6181)
--- a/Show More
+++ b/Show More