"""Gradio app for pii-nl-bench — run benchmarks on HuggingFace Spaces (GPU).

HuggingFace Space: Docker SDK + T4/A10G GPU
"""

from __future__ import annotations

import io
import sys
import time
import traceback
from contextlib import redirect_stdout, redirect_stderr
from datetime import datetime
from pathlib import Path

import gradio as gr

from preflight import run_preflight


# ── Helpers ────────────────────────────────────────────────────────

def capture_output(fn, *args, **kwargs):
    """Call ``fn(*args, **kwargs)`` while capturing stdout and stderr.

    Both streams are redirected into one shared buffer, so their output is
    interleaved in the order it was written.

    Returns:
        A ``(result, captured_text)`` tuple. If ``fn`` raises an
        ``Exception``, the formatted traceback is appended to the captured
        text and ``result`` is ``None`` — callers cannot tell that apart
        from ``fn`` legitimately returning ``None`` except by reading the
        text.
    """
    buf = io.StringIO()
    with redirect_stdout(buf), redirect_stderr(buf):
        try:
            result = fn(*args, **kwargs)
        except Exception:
            # Surface the failure in the UI log instead of crashing the app.
            buf.write(traceback.format_exc())
            result = None
    return result, buf.getvalue()


# ── Preflight ──────────────────────────────────────────────────────

def run_preflight_check(quick: bool = False):
    """Run preflight checks and return formatted results.

    Args:
        quick: Forwarded to ``run_preflight`` as ``skip_model_load``.

    Returns:
        A ``(text, passed)`` tuple: the captured output followed by the
        report summary, and the report's ``all_critical_passed`` value.
        When the preflight itself crashes, ``text`` holds the captured
        traceback and ``passed`` is ``False``.
    """
    report, output = capture_output(run_preflight, skip_model_load=quick)
    if report is None:
        return f"Preflight crashed:\n```\n{output}\n```", False
    summary = report.summary()
    full = f"{output}\n{summary}" if output.strip() else summary
    return full, report.all_critical_passed


# ── Benchmark runner ───────────────────────────────────────────────

def _parse_group_ids(groups):
    """Translate UI checkbox labels such as ``"2: Named Entity..."`` to ids."""
    groups = groups or []
    if "All groups" in groups:
        return [1, 2, 3, 4]
    return [int(g.split(":")[0].strip()) for g in groups]


def _load_dataset_statistics(all_results, log_lines):
    """Best-effort load of the normalized datasets that appear in the results."""
    from benchmark.datasets.loader import load_dataset_by_name
    from benchmark.datasets.normalize import normalize_dataset

    # Build the lookup set once instead of rescanning results per dataset.
    used_datasets = {r.dataset_name for r in all_results}
    loaded_datasets = {}
    for ds_name in ["ai4privacy", "gretel", "e3jsi", "conll2002", "article9"]:
        if ds_name not in used_datasets:
            continue
        try:
            raw = load_dataset_by_name(ds_name)
            loaded_datasets[ds_name] = normalize_dataset(raw)
        except Exception as exc:
            # Statistics are optional for the report, so keep going — but
            # record the failure rather than hiding it.
            log_lines.append(
                f"Warning: could not load dataset statistics for {ds_name}: {exc}"
            )
    return loaded_datasets


def _write_artifacts(all_results, loaded_datasets, results_dir, log_lines):
    """Write the JSON, chart and markdown outputs; return (report_md, pngs)."""
    import json as _json
    from benchmark.evaluation.report import generate_report, export_results_json
    from benchmark.evaluation.charts import generate_all_charts

    results_dir.mkdir(parents=True, exist_ok=True)
    ts = datetime.now().strftime("%Y-%m-%d_%H%M")

    # JSON results. The payload is dumped with ensure_ascii=False, so the
    # file encoding is pinned to UTF-8 instead of following the locale.
    json_data = export_results_json(all_results)
    json_path = results_dir / f"groups_{ts}.json"
    json_path.write_text(
        _json.dumps(json_data, indent=2, ensure_ascii=False),
        encoding="utf-8",
    )
    log_lines.append(f"JSON saved: {json_path}")

    # Charts
    charts_dir = results_dir / f"charts_{ts}"
    chart_paths = generate_all_charts(all_results, charts_dir)
    chart_pngs = [str(p) for p in chart_paths if p.suffix == ".png"]
    if chart_paths:
        log_lines.append(f"Charts saved: {charts_dir}/ ({len(chart_paths)} files)")

    # Markdown report (after charts so it can embed them)
    report_md = generate_report(
        all_results,
        datasets=loaded_datasets,
        charts_dir=str(charts_dir) if chart_paths else None,
    )
    report_path = results_dir / f"groups_{ts}.md"
    report_path.write_text(report_md, encoding="utf-8")
    log_lines.append(f"Report saved: {report_path}")

    return report_md, chart_pngs


def run_benchmark(groups: list[str], mode: str, max_samples: int, progress=gr.Progress()):
    """Run the group-based benchmark and return results.

    Args:
        groups: Labels selected in the UI; either ``"All groups"`` or
            entries of the form ``"<id>: <description>"``.
        mode: Span matching mode, passed to ``MatchMode``.
        max_samples: Per-dataset sample cap passed to each group runner.
            A falsy value is shown as "unlimited" in the log.
        progress: Gradio progress tracker. The ``gr.Progress()`` default is
            how the callback is obtained in this app, so it is kept in the
            signature.

    Returns:
        A ``(log_text, report_markdown, chart_png_paths)`` tuple. The
        report is an empty string and the chart list is empty when no group
        produced results.
    """
    # Imported lazily, as in the original, so the UI module itself does not
    # import the benchmark package at load time.
    from benchmark.groups import (
        GROUP_RUNNERS,
        MatchMode,
        detect_device,
    )
    from benchmark.config import RESULTS_DIR

    progress(0, desc="Detecting device...")
    device = detect_device("auto")
    match_mode = MatchMode(mode)

    # Parse group selection
    group_ids = _parse_group_ids(groups)
    total_groups = len(group_ids)

    all_results = []
    log_lines = [
        f"Device: {device}",
        f"Mode: {match_mode.value}",
        f"Groups: {group_ids}",
        f"Max samples: {max_samples or 'unlimited'}",
        "",
    ]
    if not group_ids:
        log_lines.append("No benchmark groups selected — nothing to run.")

    for i, gid in enumerate(group_ids):
        if gid not in GROUP_RUNNERS:
            log_lines.append(f"Unknown group {gid}, skipping")
            continue

        label, runner = GROUP_RUNNERS[gid]
        progress((i / total_groups), desc=f"Group {gid}: {label}...")
        log_lines.append(f"{'=' * 60}")
        log_lines.append(f" Running Group {gid}: {label}")
        log_lines.append(f"{'=' * 60}")

        # Capture group output; a crashing runner yields results=None and
        # its traceback in `output`, so the remaining groups still run.
        results, output = capture_output(runner, device, match_mode, max_samples)
        if output:
            log_lines.append(output)

        if results:
            all_results.extend(results)
            for r in results:
                o = r.overall
                if o.support > 0 or o.fp > 0:
                    log_lines.append(
                        f" {r.model_name:<25s} [{r.dataset_name}] "
                        f"P={o.precision:.3f} R={o.recall:.3f} "
                        f"F1={o.f1:.3f} F2={o.f2:.3f}"
                    )
        log_lines.append("")

    progress(0.90, desc="Loading dataset statistics...")

    # Collect dataset samples for report statistics
    loaded_datasets = _load_dataset_statistics(all_results, log_lines)

    progress(0.95, desc="Generating report...")

    # Generate report
    report_md = ""
    chart_pngs = []
    if all_results:
        report_md, chart_pngs = _write_artifacts(
            all_results, loaded_datasets, RESULTS_DIR, log_lines
        )

    progress(1.0, desc="Done")

    log_text = "\n".join(log_lines)
    return log_text, report_md, chart_pngs


# ── Gradio UI ──────────────────────────────────────────────────────

def build_ui():
    """Build the Gradio Blocks app with Preflight, Benchmark and About tabs.

    Returns:
        The ``gr.Blocks`` instance, not yet launched.
    """
    with gr.Blocks(
        title="pii-nl-bench — Dutch PII Detection Benchmark",
        theme=gr.themes.Soft(),
    ) as app:
        gr.Markdown(
            "# pii-nl-bench — Dutch PII Detection Benchmark\n"
            "Compare PII detection models on Dutch text. "
            "Proves `monsieur_regex + qwen_adapter` outperforms alternatives.\n\n"
            # Two trailing spaces = Markdown hard line break, so the two
            # steps render on separate lines.
            "**Step 1**: Run preflight checks to validate GPU, models, and data.  \n"
            "**Step 2**: Run the benchmark."
        )

        with gr.Tab("Preflight Checks"):
            gr.Markdown(
                "Validates GPU, CUDA, bfloat16, model downloads, LoRA adapter, "
                "datasets, and disk space **before** running the benchmark."
            )
            with gr.Row():
                quick_check = gr.Checkbox(
                    label="Quick (skip model loading)",
                    value=False,
                )
                preflight_btn = gr.Button("Run Preflight Checks", variant="primary")

            preflight_status = gr.Textbox(
                label="Preflight Status",
                lines=3,
                interactive=False,
            )
            preflight_output = gr.Code(
                label="Detailed Output",
                language=None,
                lines=25,
            )

            def on_preflight(quick):
                """Map the preflight result onto the status and detail widgets."""
                result, passed = run_preflight_check(quick)
                status = (
                    "ALL CLEAR — ready to benchmark"
                    if passed
                    else "BLOCKED — see details below"
                )
                return status, result

            preflight_btn.click(
                fn=on_preflight,
                inputs=[quick_check],
                outputs=[preflight_status, preflight_output],
            )

        with gr.Tab("Benchmark"):
            with gr.Row():
                with gr.Column():
                    group_select = gr.CheckboxGroup(
                        choices=[
                            "All groups",
                            "1: Structured PII",
                            "2: Named Entity Recognition",
                            "3: Full PII Coverage",
                            "4: Article 9 Special Categories",
                        ],
                        value=["All groups"],
                        label="Benchmark Groups",
                    )
                    mode_select = gr.Radio(
                        choices=["lenient", "strict", "label_only"],
                        value="lenient",
                        label="Span Matching Mode",
                    )
                    max_samples = gr.Slider(
                        minimum=0,
                        maximum=5000,
                        step=50,
                        value=1000,
                        label="Max samples per dataset (0 = unlimited, default: 1000)",
                    )
                    run_btn = gr.Button("Run Benchmark", variant="primary")

            with gr.Row():
                log_output = gr.Code(
                    label="Benchmark Log",
                    language=None,
                    lines=30,
                )

            with gr.Row():
                chart_gallery = gr.Gallery(
                    label="Benchmark Charts",
                    columns=2,
                    height="auto",
                    object_fit="contain",
                )

            with gr.Row():
                report_output = gr.Markdown(label="Report")

            # Output order must match run_benchmark's return tuple.
            run_btn.click(
                fn=run_benchmark,
                inputs=[group_select, mode_select, max_samples],
                outputs=[log_output, report_output, chart_gallery],
            )

        with gr.Tab("About"):
            gr.Markdown(
                "## Models Compared\n\n"
                "| Model | Type | Labels |\n"
                "|-------|------|--------|\n"
                "| **monsieur_regex** | Rule-based regex | 16 structured PII types |\n"
                "| **qwen_adapter** (jellewas/gdpr-lora) | Qwen3.5-4B LoRA | 23 types incl. Article 9 |\n"
                "| **regex+adapter** (combined) | Ensemble | All types |\n"
                "| **pii_ner_nl** (jellewas/pii-ner-nl) | RobBERT token classifier | BIO-tagged NER |\n"
                "| **flair** | BiLSTM-CRF | PERSON, LOCATION, ORG |\n"
                "| **gliner** | Zero-shot transformer | 9 types |\n"
                "| **deduce** | Dutch clinical rules | 11 types |\n"
                "| **presidio** | spaCy + regex | 12 types |\n\n"
                "## Evaluation\n\n"
                "- **Primary metric**: F2 (recall weighted 4x over precision)\n"
                "- **Rationale**: Missed PII = GDPR violation > false alarm\n"
                "- **Matching modes**: strict (exact span), lenient (50% overlap), label-only\n"
            )

    return app


if __name__ == "__main__":
    app = build_ui()
    # Bind to all interfaces on port 7860 so the Space container can
    # expose the app.
    app.launch(server_name="0.0.0.0", server_port=7860)