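"""Compare’IA benchmark dashboard.

Gradio Blocks app that loads benchmark results from a CSV (an upload or the
default results.csv), shows raw and filtered rows, and renders per-model
summary tables plus Plotly charts for quality, latency and energy efficiency.
"""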
import os
import pandas as pd
import gradio as gr
import plotly.express as px

DEFAULT_CSV = os.environ.get("RESULTS_CSV_PATH", "results.csv")

EXPECTED_COLS = [
    "timestamp_iso","run_id","model","prompt_id","category",
    "quality_score","latency_s","energy_wh","tokens","notes"
]
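# Any expected column missing from an uploaded CSV is added as empty by
# _load_df, so the aggregations below can rely on this schema.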

def _load_df(file):
    """Load results from the uploaded file, falling back to DEFAULT_CSV."""
    path = DEFAULT_CSV
    if file is not None:
        # Depending on the Gradio version, gr.File may hand the callback a
        # filepath string or a tempfile-like object exposing .name; accept both.
        path = file if isinstance(file, str) else file.name
    if not os.path.exists(path):
        return pd.DataFrame(columns=EXPECTED_COLS)
    df = pd.read_csv(path)
    # ensure expected cols exist
    for c in EXPECTED_COLS:
        if c not in df.columns:
            df[c] = None
    # numeric coercion
    for c in ["quality_score","latency_s","energy_wh","tokens"]:
        df[c] = pd.to_numeric(df[c], errors="coerce")
    return df

def _summaries(df: pd.DataFrame):
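    """Build per-model and per-model/category aggregates plus four bar charts.

    Returns (df, per_model, per_model_cat, c1, c2, c3, c4); the tables are
    empty and the charts None when df has no rows.
    """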
    if df.empty:
        return df, pd.DataFrame(), pd.DataFrame(), None, None, None, None

    def q_per_wh(row):
        if pd.notna(row["mean_energy"]) and row["mean_energy"] > 0 and pd.notna(row["mean_quality"]):
            return row["mean_quality"] / row["mean_energy"]
        return None

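    # Per-model aggregates: run count, quality, latency (mean/median/p95),
    # energy and token usage.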
    per_model = df.groupby("model", dropna=False).agg(
        n_runs=("run_id","count"),
        mean_quality=("quality_score","mean"),
        median_latency=("latency_s","median"),
        p95_latency=("latency_s", lambda x: x.dropna().quantile(0.95) if len(x.dropna()) else None),
        mean_latency=("latency_s","mean"),
        mean_energy=("energy_wh","mean"),
        mean_tokens=("tokens","mean")
    ).reset_index()
    per_model["quality_per_wh"] = per_model.apply(q_per_wh, axis=1)

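    # The same aggregates broken down by (model, category).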
    per_model_cat = df.groupby(["model","category"], dropna=False).agg(
        n_runs=("run_id","count"),
        mean_quality=("quality_score","mean"),
        mean_latency=("latency_s","mean"),
        p95_latency=("latency_s", lambda x: x.dropna().quantile(0.95) if len(x.dropna()) else None),
        mean_energy=("energy_wh","mean")
    ).reset_index()

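    # Bar charts over the per-model summary.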
    c1 = px.bar(per_model.sort_values("mean_quality", ascending=False),
                x="model", y="mean_quality", title="Mean Quality by Model")
    c2 = px.bar(per_model.sort_values("mean_latency"),
                x="model", y="mean_latency", title="Mean Latency (s) by Model")
    c3 = px.bar(per_model.sort_values("p95_latency"),
                x="model", y="p95_latency", title="P95 Latency (s) by Model")
    c4 = px.bar(per_model.sort_values("quality_per_wh", ascending=False),
                x="model", y="quality_per_wh", title="Quality per Watt-hour (↑ better)")

    return df, per_model, per_model_cat, c1, c2, c3, c4

def _filter(df, model_sel, cat_sel, prompt_sel):
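    """Filter rows by model / category / prompt_id; "ALL" leaves a dimension unfiltered."""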
    if df.empty:
        return pd.DataFrame()
    out = df.copy()
    if model_sel and model_sel != "ALL":
        out = out[out["model"] == model_sel]
    if cat_sel and cat_sel != "ALL":
        out = out[out["category"] == cat_sel]
    if prompt_sel and prompt_sel != "ALL":
        out = out[out["prompt_id"] == prompt_sel]
    return out

def _choices(df):
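    """Dropdown choices derived from the data, with "ALL" prepended to each list."""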
    models = ["ALL"] + sorted([m for m in df["model"].dropna().unique().tolist()])
    cats = ["ALL"] + sorted([c for c in df["category"].dropna().unique().tolist()])
    prompts = ["ALL"] + sorted([p for p in df["prompt_id"].dropna().unique().tolist()])
    return models, cats, prompts

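# UI: upload + refresh row, raw table, filter dropdowns, filtered table, aggregates & charts.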
with gr.Blocks(title="Compare’IA — Benchmark Dashboard") as demo:
    gr.Markdown("## Compare’IA — Benchmark Dashboard\nUpload your CSV or use the default `results.csv` in the Space repo.")

    with gr.Row():
        csv_file = gr.File(label="Upload results CSV", file_types=[".csv"])
        refresh_btn = gr.Button("Refresh data")

    raw_df = gr.Dataframe(label="Raw data", interactive=False, wrap=True, height=300)

    with gr.Row():
        model_dd = gr.Dropdown(choices=["ALL"], value="ALL", label="Model")
        cat_dd = gr.Dropdown(choices=["ALL"], value="ALL", label="Category")
        prompt_dd = gr.Dropdown(choices=["ALL"], value="ALL", label="Prompt ID")
        apply_filter = gr.Button("Apply filter")

    filtered_df = gr.Dataframe(label="Filtered rows", interactive=False, height=250)

    with gr.Accordion("Aggregates & Charts", open=True):
        per_model_df = gr.Dataframe(label="Per-model summary", interactive=False)
        per_model_cat_df = gr.Dataframe(label="Per-model-per-category", interactive=False)
        chart_quality = gr.Plot(label="Mean Quality by Model")
        chart_mean_lat = gr.Plot(label="Mean Latency by Model")
        chart_p95_lat = gr.Plot(label="P95 Latency by Model")
        chart_q_per_wh = gr.Plot(label="Quality per Wh")

    def _refresh(file):
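        """Reload the CSV, rebuild dropdown choices and all summary tables/charts."""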
        df = _load_df(file)
        models, cats, prompts = _choices(df)
        full_df, pm, pmc, c1, c2, c3, c4 = _summaries(df)
        return (full_df, gr.update(choices=models, value="ALL"),
                gr.update(choices=cats, value="ALL"),
                gr.update(choices=prompts, value="ALL"),
                pm, pmc, c1, c2, c3, c4)

    refresh_btn.click(_refresh, inputs=csv_file,
                      outputs=[raw_df, model_dd, cat_dd, prompt_dd, per_model_df, per_model_cat_df,
                               chart_quality, chart_mean_lat, chart_p95_lat, chart_q_per_wh])

    def _apply(file, model_sel, cat_sel, prompt_sel):
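        """Reload the CSV and return only rows matching the current dropdown selections."""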
        df = _load_df(file)
        out = _filter(df, model_sel, cat_sel, prompt_sel)
        return out

    apply_filter.click(_apply, inputs=[csv_file, model_dd, cat_dd, prompt_dd], outputs=[filtered_df])

demo.launch()