# NOTE: Hugging Face Spaces page metadata was captured along with this file
# (it was plain text, which made the module unparseable as Python):
#   Spaces status: Runtime error
#   File size: 5,510 Bytes — revision f174953, 132 lines
import os
import csv
import pandas as pd
import gradio as gr
import plotly.express as px
DEFAULT_CSV = os.environ.get("RESULTS_CSV_PATH", "results.csv")
EXPECTED_COLS = [
"timestamp_iso","run_id","model","prompt_id","category",
"quality_score","latency_s","energy_wh","tokens","notes"
]
def _load_df(file: gr.File | None):
path = DEFAULT_CSV
if file is not None:
path = file.name
if not os.path.exists(path):
return pd.DataFrame(columns=EXPECTED_COLS)
df = pd.read_csv(path)
# ensure expected cols exist
for c in EXPECTED_COLS:
if c not in df.columns:
df[c] = None
# numeric coercion
for c in ["quality_score","latency_s","energy_wh","tokens"]:
df[c] = pd.to_numeric(df[c], errors="coerce")
return df
def _summaries(df: pd.DataFrame):
if df.empty:
return df, pd.DataFrame(), pd.DataFrame(), None, None, None, None
def q_per_wh(row):
if pd.notna(row["mean_energy"]) and row["mean_energy"] > 0 and pd.notna(row["mean_quality"]):
return row["mean_quality"] / row["mean_energy"]
return None
per_model = df.groupby("model", dropna=False).agg(
n_runs=("run_id","count"),
mean_quality=("quality_score","mean"),
median_latency=("latency_s","median"),
p95_latency=("latency_s", lambda x: x.dropna().quantile(0.95) if len(x.dropna()) else None),
mean_latency=("latency_s","mean"),
mean_energy=("energy_wh","mean"),
mean_tokens=("tokens","mean")
).reset_index()
per_model["quality_per_wh"] = per_model.apply(q_per_wh, axis=1)
per_model_cat = df.groupby(["model","category"], dropna=False).agg(
n_runs=("run_id","count"),
mean_quality=("quality_score","mean"),
mean_latency=("latency_s","mean"),
p95_latency=("latency_s", lambda x: x.dropna().quantile(0.95) if len(x.dropna()) else None),
mean_energy=("energy_wh","mean")
).reset_index()
c1 = px.bar(per_model.sort_values("mean_quality", ascending=False),
x="model", y="mean_quality", title="Mean Quality by Model")
c2 = px.bar(per_model.sort_values("mean_latency"),
x="model", y="mean_latency", title="Mean Latency (s) by Model")
c3 = px.bar(per_model.sort_values("p95_latency"),
x="model", y="p95_latency", title="P95 Latency (s) by Model")
c4 = px.bar(per_model.sort_values("quality_per_wh", ascending=False),
x="model", y="quality_per_wh", title="Quality per Watt-hour (↑ better)")
return df, per_model, per_model_cat, c1, c2, c3, c4
def _filter(df, model_sel, cat_sel, prompt_sel):
if df.empty:
return pd.DataFrame()
out = df.copy()
if model_sel and model_sel != "ALL":
out = out[out["model"] == model_sel]
if cat_sel and cat_sel != "ALL":
out = out[out["category"] == cat_sel]
if prompt_sel and prompt_sel != "ALL":
out = out[out["prompt_id"] == prompt_sel]
return out
def _choices(df):
models = ["ALL"] + sorted([m for m in df["model"].dropna().unique().tolist()])
cats = ["ALL"] + sorted([c for c in df["category"].dropna().unique().tolist()])
prompts = ["ALL"] + sorted([p for p in df["prompt_id"].dropna().unique().tolist()])
return models, cats, prompts
# --- UI ---------------------------------------------------------------------
with gr.Blocks(title="Compare’IA — Benchmark Dashboard") as demo:
    gr.Markdown("## Compare’IA — Benchmark Dashboard\nUpload your CSV or use the default `results.csv` in the Space repo.")
    with gr.Row():
        csv_file = gr.File(label="Upload results CSV", file_types=[".csv"])
        refresh_btn = gr.Button("Refresh data")
    raw_df = gr.Dataframe(label="Raw data", interactive=False, wrap=True, height=300)
    with gr.Row():
        model_dd = gr.Dropdown(choices=["ALL"], value="ALL", label="Model")
        cat_dd = gr.Dropdown(choices=["ALL"], value="ALL", label="Category")
        prompt_dd = gr.Dropdown(choices=["ALL"], value="ALL", label="Prompt ID")
    apply_filter = gr.Button("Apply filter")
    filtered_df = gr.Dataframe(label="Filtered rows", interactive=False, height=250)
    with gr.Accordion("Aggregates & Charts", open=True):
        per_model_df = gr.Dataframe(label="Per-model summary", interactive=False)
        per_model_cat_df = gr.Dataframe(label="Per-model-per-category", interactive=False)
        chart_quality = gr.Plot(label="Mean Quality by Model")
        chart_mean_lat = gr.Plot(label="Mean Latency by Model")
        chart_p95_lat = gr.Plot(label="P95 Latency by Model")
        chart_q_per_wh = gr.Plot(label="Quality per Wh")

    def _refresh(file):
        """Reload the CSV, then rebuild dropdown choices, summaries and charts."""
        df = _load_df(file)
        models, cats, prompts = _choices(df)
        full_df, pm, pmc, c1, c2, c3, c4 = _summaries(df)
        # Reset every dropdown to "ALL" so stale selections from a previous
        # CSV can't silently filter the new data.
        return (full_df,
                gr.update(choices=models, value="ALL"),
                gr.update(choices=cats, value="ALL"),
                gr.update(choices=prompts, value="ALL"),
                pm, pmc, c1, c2, c3, c4)

    refresh_btn.click(
        _refresh,
        inputs=csv_file,
        outputs=[raw_df, model_dd, cat_dd, prompt_dd, per_model_df, per_model_cat_df,
                 chart_quality, chart_mean_lat, chart_p95_lat, chart_q_per_wh],
    )

    def _apply(file, model_sel, cat_sel, prompt_sel):
        """Reload the CSV and return only the rows matching the selections."""
        return _filter(_load_df(file), model_sel, cat_sel, prompt_sel)

    apply_filter.click(_apply, inputs=[csv_file, model_dd, cat_dd, prompt_dd], outputs=[filtered_df])

if __name__ == "__main__":
    # The original line ended with a stray " |" (a scrape artifact) that made
    # the whole file a SyntaxError. Launch is also guarded so importing this
    # module (e.g. from tests) doesn't start a web server.
    demo.launch()