#!/usr/bin/env python3
"""Delete benchmark logs that record a transient failure.

A log counts as a failed run when it contains no ``pp512``/``tg128`` result
row in any markdown table and its text matches one of the known error
patterns.  Failed logs are deleted, except those showing the Vulkan
"requested buffer size exceeds device buffer size limit" allocation error,
which are kept.
"""
import argparse
import glob
import os
import re

RESULTS_DIR_DEFAULT = "results"

# Same detection logic as the extractor.
HEADER_RE = re.compile(r"^\|\s*model\s*\|", re.IGNORECASE)
SEP_RE = re.compile(r"^\|\s*-+")

LOAD_ERR = re.compile(r"failed to load model|Device memory allocation.*failed|⚠️\s*Fail", re.IGNORECASE)
HANG_ERR = re.compile(r"GPU Hang|HW Exception", re.IGNORECASE)
GENERIC_ERR = re.compile(r"error:|exit \d+|runtime error|⚠️\s*Runtime Error", re.IGNORECASE)


def _split_cells(line):
    """Split one markdown table line into stripped cell strings."""
    return [cell.strip() for cell in line.strip().strip("|").split("|")]


def parse_table(text):
    """Extract the data rows of every ``| model | ...`` table found in *text*.

    Args:
        text: Full contents of a benchmark log.

    Returns:
        A list of dicts, one per data row, mapping the lower-cased header
        names of the most recent header line to the row's cell text.  Rows
        with fewer cells than that header are skipped.
    """
    rows = []
    header = None
    col_idx = {}

    for line in text.splitlines():
        if HEADER_RE.search(line):
            header = [name.lower() for name in _split_cells(line)]
            # Rebuild the mapping from scratch: column names left over from a
            # wider, earlier table would otherwise index past the end of the
            # rows of a narrower table and raise IndexError.
            col_idx = {name: idx for idx, name in enumerate(header)}
            continue

        if header is None:
            # Nothing before the first header line belongs to a table.
            continue

        # Blank lines and the |---|---| separator carry no data.  Blank lines
        # deliberately do not end parsing, so later tables are scanned too.
        if not line.strip() or SEP_RE.search(line):
            continue

        if not line.startswith("|"):
            continue

        parts = _split_cells(line)
        if len(parts) < len(header):
            continue

        rows.append({name: parts[idx] for name, idx in col_idx.items()})

    return rows


def detect_error(text):
    """Return True if *text* matches any of the known error patterns."""
    return any(
        pattern.search(text)
        for pattern in (LOAD_ERR, HANG_ERR, GENERIC_ERR)
    )


def is_non_transient_vram_issue(text):
    """Return True if *text* shows the Vulkan buffer-size-limit allocation error."""
    # Do NOT delete logs with this kind of Vulkan OOM
    return (
        "ggml_vulkan: Device memory allocation of size" in text
        and "Requested buffer size exceeds device buffer size limit" in text
    )


def is_failed_run(text):
    """Return True if *text* has no benchmark result row and contains an error.

    A log with at least one ``pp512`` or ``tg128`` row is never treated as
    failed, whatever else it contains.
    """
    table_rows = parse_table(text)
    has_result = any(
        row.get("test", "").lower() in ("pp512", "tg128")
        for row in table_rows
    )
    if has_result:
        return False
    return detect_error(text)


def main(argv=None):
    """Scan the results directory and delete logs of transient failures.

    Args:
        argv: Optional list of command-line arguments; when None, argparse
            reads them from ``sys.argv``.
    """
    ap = argparse.ArgumentParser(
        description="Delete transient-failure benchmark logs in results/"
    )
    ap.add_argument(
        "--results-dir",
        default=RESULTS_DIR_DEFAULT,
        help="Directory containing *.log files (default: results)",
    )
    ap.add_argument(
        "--dry-run",
        action="store_true",
        help="Only print what would be deleted",
    )
    args = ap.parse_args(argv)

    pattern = os.path.join(args.results_dir, "*.log")

    to_delete = []
    skipped_non_transient = []

    for path in sorted(glob.glob(pattern)):
        try:
            # Decode explicitly as UTF-8: the error patterns contain a
            # non-ASCII warning sign that a locale-dependent default
            # encoding could fail to reproduce.
            with open(path, encoding="utf-8", errors="ignore") as f:
                text = f.read()
        except OSError as e:
            print(f"Could not read {path}: {e}")
            continue

        if not is_failed_run(text):
            continue

        if is_non_transient_vram_issue(text):
            skipped_non_transient.append(path)
        else:
            to_delete.append(path)

    if not to_delete and not skipped_non_transient:
        print("No failed logs found.")
        return

    if skipped_non_transient:
        print("Keeping logs with non transient VRAM issues:")
        for p in skipped_non_transient:
            print(f"  KEEP   {p}")

    if not to_delete:
        print("No logs to delete.")
        return

    print("Deleting logs with transient failures:")
    for p in to_delete:
        print(f"  DELETE {p}")
        if args.dry_run:
            continue
        try:
            os.remove(p)
        except OSError as e:
            print(f"    Failed to delete {p}: {e}")


if __name__ == "__main__":
    main()