Spaces:
Running on CPU Upgrade
Running on CPU Upgrade
Add Details tab
Browse files
app.py
CHANGED
|
@@ -19,6 +19,8 @@ EXCLUDED_KEYS = {
|
|
| 19 |
# "alias",
|
| 20 |
# }
|
| 21 |
|
|
|
|
|
|
|
| 22 |
|
| 23 |
TASKS = {
|
| 24 |
"leaderboard_arc_challenge": ("ARC", "leaderboard_arc_challenge"),
|
|
@@ -29,6 +31,57 @@ TASKS = {
|
|
| 29 |
"leaderboard_mmlu_pro": ("MMLU-Pro", "leaderboard_mmlu_pro"),
|
| 30 |
"leaderboard_musr": ("MuSR", "leaderboard_musr"),
|
| 31 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 32 |
|
| 33 |
fs = HfFileSystem()
|
| 34 |
|
|
@@ -103,6 +156,49 @@ def update_tasks(task):
|
|
| 103 |
)
|
| 104 |
|
| 105 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 106 |
# if __name__ == "__main__":
|
| 107 |
latest_result_path_per_model = filter_latest_result_path_per_model(fetch_result_paths())
|
| 108 |
|
|
@@ -135,6 +231,18 @@ with gr.Blocks(fill_height=True) as demo:
|
|
| 135 |
results = gr.HTML()
|
| 136 |
with gr.Tab("Configs"):
|
| 137 |
configs = gr.HTML()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 138 |
|
| 139 |
load_btn_1.click(
|
| 140 |
fn=load_result_dataframe,
|
|
@@ -166,6 +274,29 @@ with gr.Blocks(fill_height=True) as demo:
|
|
| 166 |
fn=display_results,
|
| 167 |
inputs=[dataframe_1, dataframe_2, task],
|
| 168 |
outputs=[results, configs],
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 169 |
)
|
| 170 |
|
| 171 |
demo.launch()
|
|
|
|
| 19 |
# "alias",
|
| 20 |
# }
|
| 21 |
|
| 22 |
+
# Path template of the per-model details dataset. `{model_name_sanitized}` is
# filled in by load_details_dataframe with the model id after "/" -> "__".
DETAILS_DATASET_ID = "datasets/open-llm-leaderboard/{model_name_sanitized}-details"
|
| 23 |
+
# Glob pattern for the sample files of one subtask; `{subtask}` is filled in by
# load_details_dataframe. The `*` part is presumably a run timestamp -- TODO confirm.
DETAILS_FILENAME = "samples_{subtask}_*.json"
|
| 24 |
|
| 25 |
TASKS = {
|
| 26 |
"leaderboard_arc_challenge": ("ARC", "leaderboard_arc_challenge"),
|
|
|
|
| 31 |
"leaderboard_mmlu_pro": ("MMLU-Pro", "leaderboard_mmlu_pro"),
|
| 32 |
"leaderboard_musr": ("MuSR", "leaderboard_musr"),
|
| 33 |
}
|
| 34 |
+
# Maps each task key to the list of subtask names offered in the "Details" tab.
# Each subtask name is substituted into DETAILS_FILENAME to locate sample files.
SUBTASKS = {
    "leaderboard_arc_challenge": ["leaderboard_arc_challenge"],
    "leaderboard_bbh": [
        "leaderboard_bbh_boolean_expressions",
        "leaderboard_bbh_causal_judgement",
        "leaderboard_bbh_date_understanding",
        "leaderboard_bbh_disambiguation_qa",
        "leaderboard_bbh_formal_fallacies",
        "leaderboard_bbh_geometric_shapes",
        "leaderboard_bbh_hyperbaton",
        "leaderboard_bbh_logical_deduction_five_objects",
        "leaderboard_bbh_logical_deduction_seven_objects",
        "leaderboard_bbh_logical_deduction_three_objects",
        "leaderboard_bbh_movie_recommendation",
        "leaderboard_bbh_navigate",
        "leaderboard_bbh_object_counting",
        "leaderboard_bbh_penguins_in_a_table",
        "leaderboard_bbh_reasoning_about_colored_objects",
        "leaderboard_bbh_ruin_names",
        "leaderboard_bbh_salient_translation_error_detection",
        "leaderboard_bbh_snarks",
        "leaderboard_bbh_sports_understanding",
        "leaderboard_bbh_temporal_sequences",
        "leaderboard_bbh_tracking_shuffled_objects_five_objects",
        "leaderboard_bbh_tracking_shuffled_objects_seven_objects",
        "leaderboard_bbh_tracking_shuffled_objects_three_objects",
        "leaderboard_bbh_web_of_lies",
    ],
    "leaderboard_gpqa": [
        "leaderboard_gpqa_extended",
        "leaderboard_gpqa_diamond",
        "leaderboard_gpqa_main",
    ],
    "leaderboard_ifeval": ["leaderboard_ifeval"],
    # NOTE(review): keyed as "leaderboard_math" although every subtask carries
    # the "_hard" suffix; an earlier draft used "leaderboard_math_hard".
    # Presumably this must match the corresponding TASKS key -- confirm.
    "leaderboard_math": [
        "leaderboard_math_algebra_hard",
        "leaderboard_math_counting_and_prob_hard",
        "leaderboard_math_geometry_hard",
        "leaderboard_math_intermediate_algebra_hard",
        "leaderboard_math_num_theory_hard",
        "leaderboard_math_prealgebra_hard",
        "leaderboard_math_precalculus_hard",
    ],
    "leaderboard_mmlu_pro": ["leaderboard_mmlu_pro"],
    "leaderboard_musr": [
        "leaderboard_musr_murder_mysteries",
        "leaderboard_musr_object_placements",
        "leaderboard_musr_team_allocation",
    ],
}
|
| 84 |
+
|
| 85 |
|
| 86 |
fs = HfFileSystem()
|
| 87 |
|
|
|
|
| 156 |
)
|
| 157 |
|
| 158 |
|
| 159 |
+
def update_subtasks(task):
    """Return a Radio update listing the subtasks of the selected task.

    Args:
        task: Task key used to look up SUBTASKS.

    Returns:
        A ``gr.Radio`` carrying the new choices. An unknown task yields an
        empty choice list instead of ``None``.
    """
    return gr.Radio(
        SUBTASKS.get(task, []),
        # Clear the previous selection: a subtask picked for another task is
        # not among the new choices and would be rejected when submitted.
        value=None,
        info="Evaluation subtasks to be displayed",
    )
|
| 164 |
+
|
| 165 |
+
|
| 166 |
+
def load_details_dataframe(model_id, subtask):
    """Load the per-sample details of one model for one subtask.

    Args:
        model_id: Model identifier; "/" is replaced by "__" to build the
            details dataset path.
        subtask: Subtask name substituted into DETAILS_FILENAME.

    Returns:
        A DataFrame with one row per JSON line of the selected file, flattened
        with ``pd.json_normalize``, plus a ``model_name`` column holding
        ``model_id``. ``None`` when an argument is empty or no file matches.
    """
    if not model_id or not subtask:
        return None
    model_name_sanitized = model_id.replace("/", "__")
    # Join the templates first, then substitute once with str.format.
    pattern = "/".join([DETAILS_DATASET_ID, "**", DETAILS_FILENAME]).format(
        model_name_sanitized=model_name_sanitized, subtask=subtask
    )
    paths = fs.glob(pattern)
    if not paths:
        return None
    # Lexicographically greatest path; presumably the most recent run because
    # the file name ends with a timestamp -- TODO confirm.
    path = max(paths)
    # Explicit UTF-8 so decoding does not depend on the host locale.
    with fs.open(path, "r", encoding="utf-8") as f:
        # The file is read as JSON Lines; blank lines are skipped.
        data = [json.loads(line) for line in f if line.strip()]
    df = pd.json_normalize(data)
    df["model_name"] = model_id  # Kept so display_details can label the column
    return df
|
| 185 |
+
|
| 186 |
+
|
| 187 |
+
def display_details(df_1, df_2, sample_idx):
    """Render one sample of two models side by side as an HTML table.

    Args:
        df_1: Details DataFrame of the first model (needs a ``model_name``
            column), or ``None``/empty when nothing is loaded.
        df_2: Same, for the second model.
        sample_idx: Row position to show; may arrive as a float from the
            number input, or ``None`` when the field is cleared.

    Returns:
        HTML with one column per model, named after the model. An empty string
        when the index is missing or either model has no row at that position.
    """
    if sample_idx is None:
        return ""
    position = int(sample_idx)  # iloc rejects floats such as 0.0
    samples = []
    for details in (df_1, df_2):
        sample = _select_sample(details, position)
        if sample is None:
            return ""
        samples.append(sample)
    df = pd.concat(samples, axis="columns")
    return df.style.format(na_rep="").to_html()


def _select_sample(details, position):
    """Return row ``position`` as a Series named after its model, or None if unavailable."""
    if details is None or "model_name" not in details:
        return None
    if not 0 <= position < len(details):
        return None
    sample = details.iloc[position]
    # Move model_name out of the data and use it as the column label.
    return sample.drop("model_name").rename(sample["model_name"])
|
| 200 |
+
|
| 201 |
+
|
| 202 |
# if __name__ == "__main__":
|
| 203 |
latest_result_path_per_model = filter_latest_result_path_per_model(fetch_result_paths())
|
| 204 |
|
|
|
|
| 231 |
results = gr.HTML()
|
| 232 |
with gr.Tab("Configs"):
|
| 233 |
configs = gr.HTML()
|
| 234 |
+
with gr.Tab("Details"):
|
| 235 |
+
subtask = gr.Radio(
|
| 236 |
+
SUBTASKS.get(task.value),
|
| 237 |
+
label="Subtasks",
|
| 238 |
+
info="Evaluation subtasks to be displayed (choose one of the Tasks above)",
|
| 239 |
+
)
|
| 240 |
+
sample_idx = gr.Number(value=0, label="Sample Index", info="Index of the sample to be displayed", minimum=0)
|
| 241 |
+
load_details_btn = gr.Button("Load Details")
|
| 242 |
+
details = gr.HTML()
|
| 243 |
+
details_dataframe_1 = gr.Dataframe(visible=False)
|
| 244 |
+
details_dataframe_2 = gr.Dataframe(visible=False)
|
| 245 |
+
details_dataframe = gr.DataFrame(visible=False)
|
| 246 |
|
| 247 |
load_btn_1.click(
|
| 248 |
fn=load_result_dataframe,
|
|
|
|
| 274 |
fn=display_results,
|
| 275 |
inputs=[dataframe_1, dataframe_2, task],
|
| 276 |
outputs=[results, configs],
|
| 277 |
+
).then(
|
| 278 |
+
fn=update_subtasks,
|
| 279 |
+
inputs=task,
|
| 280 |
+
outputs=subtask,
|
| 281 |
+
)
|
| 282 |
+
|
| 283 |
+
load_details_btn.click(
|
| 284 |
+
fn=load_details_dataframe,
|
| 285 |
+
inputs=[model_id_1, subtask],
|
| 286 |
+
outputs=details_dataframe_1,
|
| 287 |
+
).then(
|
| 288 |
+
fn=load_details_dataframe,
|
| 289 |
+
inputs=[model_id_2, subtask],
|
| 290 |
+
outputs=details_dataframe_2,
|
| 291 |
+
).then(
|
| 292 |
+
fn=display_details,
|
| 293 |
+
inputs=[details_dataframe_1, details_dataframe_2, sample_idx],
|
| 294 |
+
outputs=details,
|
| 295 |
+
)
|
| 296 |
+
sample_idx.change(
|
| 297 |
+
fn=display_details,
|
| 298 |
+
inputs=[details_dataframe_1, details_dataframe_2, sample_idx],
|
| 299 |
+
outputs=details,
|
| 300 |
)
|
| 301 |
|
| 302 |
demo.launch()
|