Spaces:

jellewas
/

pii-nl-bench

Paused

App Files Files Community

pii-nl-bench / app.py

jellewas

Deploy to HuggingFace Space (a10g-small)

e7d9849 2 months ago

raw

history blame contribute delete

10.7 kB

	"""Gradio app for pii-nl-bench — run benchmarks on HuggingFace Spaces (GPU).

	HuggingFace Space: Docker SDK + T4/A10G GPU
	"""

	from __future__ import annotations

	import io
	import sys
	import time
	import traceback
	from contextlib import redirect_stdout, redirect_stderr
	from datetime import datetime
	from pathlib import Path

	import gradio as gr

	from preflight import run_preflight


	# ── Helpers ────────────────────────────────────────────────────────


	def capture_output(fn, *args, **kwargs):
	    """Run ``fn(*args, **kwargs)`` and capture everything it prints.

	    Both stdout and stderr are redirected into one shared buffer for the
	    duration of the call, so their output is interleaved in write order.

	    Args:
	        fn: Callable to execute.
	        *args: Positional arguments forwarded to ``fn``.
	        **kwargs: Keyword arguments forwarded to ``fn``.

	    Returns:
	        A ``(result, text)`` tuple. ``result`` is whatever ``fn`` returned,
	        or ``None`` if ``fn`` raised an ``Exception``; in that case the
	        formatted traceback is appended to ``text``.
	    """
	    buf = io.StringIO()
	    with redirect_stdout(buf), redirect_stderr(buf):
	        try:
	            result = fn(*args, **kwargs)
	        except Exception:
	            # Deliberate catch-all: callers show the traceback in the UI
	            # instead of letting the request crash.
	            buf.write(traceback.format_exc())
	            result = None
	    return result, buf.getvalue()


	# ── Preflight ──────────────────────────────────────────────────────


	def run_preflight_check(quick: bool = False):
	    """Execute the preflight suite and build the text shown to the user.

	    Args:
	        quick: Forwarded to ``run_preflight`` as ``skip_model_load``.

	    Returns:
	        A ``(text, passed)`` tuple. ``text`` is the captured console output
	        followed by the report summary (or only the summary when nothing
	        was printed); ``passed`` is the report's ``all_critical_passed``
	        value. When no report comes back, ``text`` is a crash message that
	        embeds the captured output and ``passed`` is ``False``.
	    """
	    preflight_report, captured = capture_output(run_preflight, skip_model_load=quick)

	    # A None result means there is no report to summarise (capture_output
	    # returns None when the wrapped callable raised).
	    if preflight_report is None:
	        crash_text = f"Preflight crashed:\n```\n{captured}\n```"
	        return crash_text, False

	    summary_text = preflight_report.summary()
	    if captured.strip():
	        display_text = f"{captured}\n{summary_text}"
	    else:
	        # Nothing meaningful was printed; show the summary on its own.
	        display_text = summary_text
	    return display_text, preflight_report.all_critical_passed


	# ── Benchmark runner ───────────────────────────────────────────────


	def run_benchmark(groups: list[str], mode: str, max_samples: int, progress=gr.Progress()):
	    """Run the selected benchmark groups and produce log, report and charts.

	    Args:
	        groups: Selected group labels. Either contains ``"All groups"`` or
	            entries of the form ``"<id>: <name>"``; the integer before the
	            first colon is used as the group id.
	        mode: Span matching mode; passed to ``MatchMode``.
	        max_samples: Forwarded unchanged to every group runner. A falsy
	            value is shown as ``unlimited`` in the log header.
	        progress: Gradio progress tracker used to report status.

	    Returns:
	        A ``(log_text, report_md, chart_pngs)`` tuple: the accumulated log,
	        the Markdown report (empty string when no results were produced)
	        and the list of generated ``.png`` chart paths (empty when no
	        results were produced).
	    """
	    from benchmark.groups import (
	        GROUP_RUNNERS,
	        MatchMode,
	        detect_device,
	    )
	    from benchmark.config import RESULTS_DIR

	    progress(0, desc="Detecting device...")
	    device = detect_device("auto")

	    match_mode = MatchMode(mode)

	    # Parse group selection
	    if "All groups" in groups:
	        group_ids = [1, 2, 3, 4]
	    else:
	        group_ids = [int(g.split(":")[0].strip()) for g in groups]

	    all_results = []
	    total_groups = len(group_ids)

	    log_lines = [
	        f"Device: {device}",
	        f"Mode: {match_mode.value}",
	        f"Groups: {group_ids}",
	        f"Max samples: {max_samples or 'unlimited'}",
	        "",
	    ]

	    for i, gid in enumerate(group_ids):
	        if gid not in GROUP_RUNNERS:
	            log_lines.append(f"Unknown group {gid}, skipping")
	            continue

	        label, runner = GROUP_RUNNERS[gid]
	        progress((i / total_groups), desc=f"Group {gid}: {label}...")
	        log_lines.append(f"{'=' * 60}")
	        log_lines.append(f" Running Group {gid}: {label}")
	        log_lines.append(f"{'=' * 60}")

	        # Capture group output; a crashing runner yields results=None and
	        # its traceback ends up in `output`.
	        results, output = capture_output(runner, device, match_mode, max_samples)
	        if output:
	            log_lines.append(output)

	        if results:
	            all_results.extend(results)
	            for r in results:
	                o = r.overall
	                # Only list model/dataset pairs that had something to score.
	                if o.support > 0 or o.fp > 0:
	                    log_lines.append(
	                        f" {r.model_name:<25s} [{r.dataset_name}] "
	                        f"P={o.precision:.3f} R={o.recall:.3f} "
	                        f"F1={o.f1:.3f} F2={o.f2:.3f}"
	                    )
	        log_lines.append("")

	    progress(0.90, desc="Loading dataset statistics...")

	    # Collect dataset samples for report statistics
	    from benchmark.datasets.loader import load_dataset_by_name
	    from benchmark.datasets.normalize import normalize_dataset
	    loaded_datasets = {}
	    for ds_name in ["ai4privacy", "gretel", "e3jsi", "conll2002", "article9"]:
	        if not any(r.dataset_name == ds_name for r in all_results):
	            continue
	        try:
	            raw = load_dataset_by_name(ds_name)
	            loaded_datasets[ds_name] = normalize_dataset(raw)
	        except Exception as exc:
	            # Statistics are best-effort: keep going, but leave a trace in
	            # the log instead of silently dropping the dataset.
	            log_lines.append(
	                f"Dataset statistics unavailable for {ds_name}: {exc!r}"
	            )

	    progress(0.95, desc="Generating report...")

	    # Generate report
	    report_md = ""
	    chart_pngs = []
	    if all_results:
	        import json as _json
	        from benchmark.evaluation.report import generate_report, export_results_json
	        from benchmark.evaluation.charts import generate_all_charts

	        RESULTS_DIR.mkdir(parents=True, exist_ok=True)
	        ts = datetime.now().strftime("%Y-%m-%d_%H%M")

	        # JSON results. The dump keeps non-ASCII characters
	        # (ensure_ascii=False), so the file encoding is pinned to UTF-8
	        # rather than left to the locale default.
	        json_data = export_results_json(all_results)
	        json_path = RESULTS_DIR / f"groups_{ts}.json"
	        json_path.write_text(
	            _json.dumps(json_data, indent=2, ensure_ascii=False),
	            encoding="utf-8",
	        )
	        log_lines.append(f"JSON saved: {json_path}")

	        # Charts
	        charts_dir = RESULTS_DIR / f"charts_{ts}"
	        chart_paths = generate_all_charts(all_results, charts_dir)
	        chart_pngs = [str(p) for p in chart_paths if p.suffix == ".png"]
	        if chart_paths:
	            log_lines.append(f"Charts saved: {charts_dir}/ ({len(chart_paths)} files)")

	        # Markdown report (after charts so it can embed them)
	        report_md = generate_report(
	            all_results,
	            datasets=loaded_datasets,
	            charts_dir=str(charts_dir) if chart_paths else None,
	        )
	        report_path = RESULTS_DIR / f"groups_{ts}.md"
	        report_path.write_text(report_md, encoding="utf-8")
	        log_lines.append(f"Report saved: {report_path}")

	    progress(1.0, desc="Done")

	    log_text = "\n".join(log_lines)
	    return log_text, report_md, chart_pngs


	# ── Gradio UI ──────────────────────────────────────────────────────


	def build_ui():
	    """Assemble the Gradio Blocks interface and return it.

	    The interface has three tabs:

	    * "Preflight Checks" - runs ``run_preflight_check`` and shows a short
	      status line plus the detailed output.
	    * "Benchmark" - collects group, matching-mode and sample-limit choices
	      and passes them to ``run_benchmark``; shows the log, the charts and
	      the Markdown report.
	    * "About" - static description of the compared models and the metric.

	    Returns:
	        The ``gr.Blocks`` application object (not yet launched).
	    """
	    with gr.Blocks(
	        title="pii-nl-bench — Dutch PII Detection Benchmark",
	        theme=gr.themes.Soft(),
	    ) as app:
	        # Two trailing spaces before the newline give a Markdown hard line
	        # break, so "Step 1" and "Step 2" render on separate lines.
	        gr.Markdown(
	            "# pii-nl-bench — Dutch PII Detection Benchmark\n"
	            "Compare PII detection models on Dutch text. "
	            "Proves `monsieur_regex + qwen_adapter` outperforms alternatives.\n\n"
	            "Step 1: Run preflight checks to validate GPU, models, and data.  \n"
	            "Step 2: Run the benchmark."
	        )

	        with gr.Tab("Preflight Checks"):
	            gr.Markdown(
	                "Validates GPU, CUDA, bfloat16, model downloads, LoRA adapter, "
	                "datasets, and disk space before running the benchmark."
	            )
	            with gr.Row():
	                quick_check = gr.Checkbox(
	                    label="Quick (skip model loading)", value=False,
	                )
	                preflight_btn = gr.Button("Run Preflight Checks", variant="primary")
	            preflight_status = gr.Textbox(
	                label="Preflight Status", lines=3, interactive=False,
	            )
	            preflight_output = gr.Code(
	                label="Detailed Output", language=None, lines=25,
	            )

	            def on_preflight(quick):
	                """Run the preflight check and map it to (status, details)."""
	                result, passed = run_preflight_check(quick)
	                status = "ALL CLEAR — ready to benchmark" if passed else "BLOCKED — see details below"
	                return status, result

	            preflight_btn.click(
	                fn=on_preflight,
	                inputs=[quick_check],
	                outputs=[preflight_status, preflight_output],
	            )

	        with gr.Tab("Benchmark"):
	            with gr.Row():
	                with gr.Column():
	                    group_select = gr.CheckboxGroup(
	                        choices=[
	                            "All groups",
	                            "1: Structured PII",
	                            "2: Named Entity Recognition",
	                            "3: Full PII Coverage",
	                            "4: Article 9 Special Categories",
	                        ],
	                        value=["All groups"],
	                        label="Benchmark Groups",
	                    )
	                    mode_select = gr.Radio(
	                        choices=["lenient", "strict", "label_only"],
	                        value="lenient",
	                        label="Span Matching Mode",
	                    )
	                    max_samples = gr.Slider(
	                        minimum=0, maximum=5000, step=50, value=1000,
	                        label="Max samples per dataset (0 = unlimited, default: 1000)",
	                    )
	                    run_btn = gr.Button("Run Benchmark", variant="primary")

	            with gr.Row():
	                log_output = gr.Code(
	                    label="Benchmark Log", language=None, lines=30,
	                )

	            with gr.Row():
	                chart_gallery = gr.Gallery(
	                    label="Benchmark Charts",
	                    columns=2,
	                    height="auto",
	                    object_fit="contain",
	                )

	            with gr.Row():
	                report_output = gr.Markdown(label="Report")

	            # Output order must match run_benchmark's return tuple:
	            # (log text, report markdown, chart paths).
	            run_btn.click(
	                fn=run_benchmark,
	                inputs=[group_select, mode_select, max_samples],
	                outputs=[log_output, report_output, chart_gallery],
	            )

	        with gr.Tab("About"):
	            # Table cells are delimited with plain "|" characters. A
	            # backslash before the pipe is not a valid Python escape and
	            # would leave a literal backslash in the Markdown, which stops
	            # the table from rendering.
	            gr.Markdown(
	                "## Models Compared\n\n"
	                "| Model | Type | Labels |\n"
	                "|-------|------|--------|\n"
	                "| monsieur_regex | Rule-based regex | 16 structured PII types |\n"
	                "| qwen_adapter (jellewas/gdpr-lora) | Qwen3.5-4B LoRA | 23 types incl. Article 9 |\n"
	                "| regex+adapter (combined) | Ensemble | All types |\n"
	                "| pii_ner_nl (jellewas/pii-ner-nl) | RobBERT token classifier | BIO-tagged NER |\n"
	                "| flair | BiLSTM-CRF | PERSON, LOCATION, ORG |\n"
	                "| gliner | Zero-shot transformer | 9 types |\n"
	                "| deduce | Dutch clinical rules | 11 types |\n"
	                "| presidio | spaCy + regex | 12 types |\n\n"
	                "## Evaluation\n\n"
	                "- Primary metric: F2 (recall weighted 4x over precision)\n"
	                "- Rationale: Missed PII = GDPR violation > false alarm\n"
	                "- Matching modes: strict (exact span), lenient (50% overlap), label-only\n"
	            )

	    return app


	if __name__ == "__main__":
	    # Script entry point: build the interface, then serve it on port 7860
	    # listening on all network interfaces ("0.0.0.0").
	    # NOTE(review): presumably required so the Space's container can expose
	    # the app - confirm against the Space configuration.
	    app = build_ui()
	    app.launch(
	        server_port=7860,
	        server_name="0.0.0.0",
	    )