lmgame / rank_data_03_25_2025.json
Yuxuan13's picture
Upload 6 files
75407f0 verified
{
"Super Mario Bros": {
"runs": 3,
"results": [
{
"model": "gamingagent + claude-3-5-sonnet-20241022",
"score": 1267.7,
"detail_data":"709,1532,1562",
"progress": "1-1"
},
{
"model": "gamingagent + claude-3-7-sonnet-20250219 (thinking)",
"score": 1418.7,
"detail_data":"2015,709,1532",
"progress": "1-1"
},
{
"model": "gamingagent + gemini-2.5-flash-preview-04-17 (thinking)",
"score": 1395.0,
"detail_data":"1672,1266,1247",
"progress": "1-1"
},
{
"model": "gamingagent + gemini-2.5-pro-preview-05-06 (thinking)",
"score": 1498.3,
"detail_data":"1561,1271,1663",
"progress": "1-1"
},
{
"model": "gamingagent + llama-4-maverick-17b-128e-instruct-fp8",
"score": 1468.7,
"detail_data":"898,2008,1500",
"progress": "1-1"
},
{
"model": "gamingagent + gpt-4.1-2025-04-14",
"score": 2126.3,
"detail_data":"1531,722,4126",
"progress": "1-1"
},
{
"model": "gamingagent + gpt-4o-2024-11-20",
"score": 2047.3,
"detail_data":"2017,2590,1535",
"progress": "1-1"
},
{
"model": "gamingagent + o1-2024-12-17",
"score": 855,
"detail_data":"855",
"progress": "1-1"
},
{
"model": "gamingagent + o3-2025-04-16",
"score": 3445,
"detail_data":"3445",
"progress": "1-1"
},
{
"model": "gamingagent + o4-mini-2025-04-16",
"score": 1448.0,
"detail_data":"1525,1263,1556",
"progress": "1-1"
},
{
"model": "random (x30)",
"score": 986.97,
"detail_data":"986.97",
"progress": "1-1"
}
]
},
"2048": {
"runs": 3,
"results": [
{
"model": "gamingagent + claude-3-5-sonnet-20241022",
"score": 1914.67,
"details": "1352,2860,1532",
"highest_tail": 256
},
{
"model": "gamingagent + claude-3-7-sonnet-20250219 (thinking)",
"score": 2624,
"details": "2560,3224,2088",
"highest_tail": 256
},
{
"model": "gamingagent + deepseek-r1-0120",
"score": 1873.33,
"details": "700,1240,3680",
"highest_tail": 256
},
{
"model": "gamingagent + gemini-2.5-flash-preview-04-17 (thinking)",
"score": 1697.33,
"details": "1304,1316,2472",
"highest_tail": 256
},
{
"model": "gamingagent + gemini-2.5-pro-preview-05-06 (thinking)",
"score": 3586.67,
"details": "5300,2400,3060",
"highest_tail": 512
},
{
"model": "gamingagent + grok-3-mini-beta (thinking)",
"score": 4036,
"details": "6412,2492,3204",
"highest_tail": 512
},
{
"model": "gamingagent + llama-4-maverick-17b-128e-instruct-fp8",
"score": 1586.67,
"details": "1404,1272,2084",
"highest_tail": 128
},
{
"model": "gamingagent + gpt-4.1-2025-04-14",
"score": 1656,
"details": "1156,2664,1148",
"highest_tail": 256
},
{
"model": "gamingagent + gpt-4o-2024-11-20",
"score": 1656,
"details": "1604,1284,2080",
"highest_tail": 256
},
{
"model": "gamingagent + o1-2024-12-17",
"score": 7580,
"details": "7580",
"highest_tail": 512
},
{
"model": "gamingagent + o1-mini-2024-09-12",
"score": 2757.33,
"details": "3132,2004,3136",
"highest_tail": 256
},
{
"model": "gamingagent + o3-2025-04-16",
"score": 7120,
"details": "7120",
"highest_tail": 512
},
{
"model": "gamingagent + o4-mini-2025-04-16",
"score": 4432.0,
"details": "4928,5456,2912",
"highest_tail": 512
},
{
"model": "random (x30)",
"score": 1213.33,
"details": "",
"highest_tail": 128
},
{
"model": "gamingagent + claude-opus-4-20250514",
"score": 3036.0,
"details": "3036.0",
"highest_tail": 256
},
{
"model": "gamingagent + claude-sonnet-4-20250514",
"score": 3136,
"details": "2148,2360,4900",
"highest_tail": 256
},
{
"model": "gamingagent + deepseek-r1-0528",
"score": 3330.0,
"details": "3260,3400",
"highest_tail": 256
},
{
"model": "gamingagent + qwen3-235B-A22B-fp8",
"score": 2144.0,
"details": "1436,2556,2440",
"highest_tail": 256
}
]
},
"Tetris": {
"runs": 3,
"results": [
{
"model": "gamingagent + claude-3-5-sonnet-20241022",
"score": 14.7,
"details": "16,14,14"
},
{
"model": "gamingagent + claude-3-7-sonnet-20250219 (thinking)",
"score": 16.3,
"details": "19,15,15"
},
{
"model": "gamingagent + deepseek-r1-0120",
"score": 14.3,
"details": "15,14,14"
},
{
"model": "gamingagent + gemini-2.5-flash-preview-04-17 (thinking)",
"score": 16.3,
"details": "20,14,15"
},
{
"model": "gamingagent + gemini-2.5-pro-preview-05-06 (thinking)",
"score": 23.3,
"details": "23,23,24"
},
{
"model": "gamingagent + grok-3-mini-beta (thinking)",
"score": 21.3,
"details": "20,15,29"
},
{
"model": "gamingagent + llama-4-maverick-17b-128e-instruct-fp8",
"score": 10.3,
"details": "9,10,12"
},
{
"model": "gamingagent + gpt-4.1-2025-04-14",
"score": 13.7,
"details": "13,14,14"
},
{
"model": "gamingagent + gpt-4o-2024-11-20",
"score": 14,
"details": "18,11,13"
},
{
"model": "gamingagent + o1-2024-12-17",
"score": 35,
"details": "35"
},
{
"model": "gamingagent + o1-mini-2024-09-12",
"score": 11.7,
"details": "11,11,13"
},
{
"model": "gamingagent + o3-2025-04-16",
"score": 42,
"details": "42"
},
{
"model": "gamingagent + o4-mini-2025-04-16",
"score": 25.3,
"details": "22,35,19"
},
{
"model": "random (x30)",
"score": 10.2,
"details": ""
},
{
"model": "gamingagent + claude-opus-4-20250514",
"score": 20,
"details": "17,18,25"
},
{
"model": "gamingagent + claude-sonnet-4-20250514",
"score": 19.33,
"details": "20,17,21"
},
{
"model": "gamingagent + deepseek-r1-0528",
"score": 33.67,
"details": "26,34,41"
},
{
"model": "gamingagent + qwen3-235B-A22B-fp8",
"score": 11.67,
"details": "13,14,8"
}
]
},
"Candy Crush": {
"runs": 3,
"results": [
{
"model": "gamingagent + claude-3-5-sonnet-20241022",
"score": 106,
"details": "92,165,61"
},
{
"model": "gamingagent + claude-3-7-sonnet-20250219 (thinking)",
"score": 484,
"details": "535,428,489"
},
{
"model": "gamingagent + deepseek-r1-0120",
"score": 447.3,
"details": "409,436,497"
},
{
"model": "gamingagent + gemini-2.5-flash-preview-04-17 (thinking)",
"score": 334.7,
"details": "259,372,373"
},
{
"model": "gamingagent + gemini-2.5-pro-preview-05-06 (thinking)",
"score": 416.3,
"details": "411,414,424"
},
{
"model": "gamingagent + grok-3-mini-beta (thinking)",
"score": 254,
"details": "299,332,131"
},
{
"model": "gamingagent + llama-4-maverick-17b-128e-instruct-fp8",
"score": 128.7,
"details": "67,139,180"
},
{
"model": "gamingagent + gpt-4.1-2025-04-14",
"score": 182,
"details": "163,215,168"
},
{
"model": "gamingagent + gpt-4o-2024-11-20",
"score": 147.3,
"details": "131,104,207"
},
{
"model": "gamingagent + o1-2024-12-17",
"score": 159,
"details": "159"
},
{
"model": "gamingagent + o1-mini-2024-09-12",
"score": 48,
"details": "21,86,37"
},
{
"model": "gamingagent + o3-2025-04-16",
"score": 647,
"details": "647"
},
{
"model": "gamingagent + o4-mini-2025-04-16",
"score": 487.3,
"details": "259,591,612"
},
{
"model": "random (x30)",
"score": 116.5,
"details": ""
},
{
"model": "gamingagent + claude-opus-4-20250514",
"score": 464,
"details": "593,406,393"
},
{
"model": "gamingagent + claude-sonnet-4-20250514",
"score": 478.33,
"details": "545,468,422"
},
{
"model": "gamingagent + deepseek-r1-0528",
"score": 491.67,
"details": "464,463,548"
},
{
"model": "gamingagent + qwen3-235B-A22B-fp8",
"score": 363.33,
"details": "365,372,353"
}
]
},
"Sokoban": {
"runs": 3,
"results": [
{
"model": "gamingagent + claude-3-5-sonnet-20241022",
"score": 0,
"detail_box_on_target":"0,0,0",
"cracked_levels": "0,0,0"
},
{
"model": "gamingagent + claude-3-7-sonnet-20250219 (thinking)",
"score": 2.33,
"detail_box_on_target":"2,4,1",
"cracked_levels": "1,2,0"
},
{
"model": "gamingagent + deepseek-r1-0120",
"score": 1.33,
"detail_box_on_target":"2,0,2",
"cracked_levels": "1,0,1"
},
{
"model": "gamingagent + gemini-2.5-flash-preview-04-17 (thinking)",
"score": 1.67,
"detail_box_on_target":"3,0,2",
"cracked_levels": "2,0,1"
},
{
"model": "gamingagent + gemini-2.5-pro-preview-05-06 (thinking)",
"score": 4.33,
"detail_box_on_target":"4,4,5",
"cracked_levels": "2,2,3"
},
{
"model": "gamingagent + grok-3-mini-beta (thinking)",
"score": 5.67,
"detail_box_on_target":"5,6,6",
"cracked_levels": "3,3,3"
},
{
"model": "gamingagent + llama-4-maverick-17b-128e-instruct-fp8",
"score": 0,
"detail_box_on_target":"0,0,0",
"cracked_levels": "0,0,0"
},
{
"model": "gamingagent + gpt-4.1-2025-04-14",
"score": 0,
"detail_box_on_target":"0,0,0",
"cracked_levels": "0,0,0"
},
{
"model": "gamingagent + gpt-4o-2024-11-20",
"score": 0,
"detail_box_on_target":"0,0,0",
"cracked_levels": "0,0,0"
},
{
"model": "gamingagent + o1-2024-12-17",
"score": 2.33,
"detail_box_on_target":"2,2,3",
"cracked_levels": "1,1,2"
},
{
"model": "gamingagent + o1-mini-2024-09-12",
"score": 1.33,
"detail_box_on_target":"1,2,1",
"cracked_levels": "0,1,0"
},
{
"model": "gamingagent + o3-2025-04-16",
"score": 8,
"detail_box_on_target":"10,6",
"cracked_levels": "5,3"
},
{
"model": "gamingagent + o4-mini-2025-04-16",
"score": 5.33,
"detail_box_on_target":"4,6,6",
"cracked_levels": "2,2,3"
},
{
"model": "random (x30)",
"score": 0,
"detail_box_on_target":"0,0,0",
"cracked_levels": "0,0,0"
},
{
"model": "gamingagent + claude-opus-4-20250514",
"score": 4,
"details": "4,4,4"
},
{
"model": "gamingagent + claude-sonnet-4-20250514",
"score": 3,
"details": "2,2,5"
},
{
"model": "gamingagent + deepseek-r1-0528",
"score": 4.67,
"details": "4,4,6"
},
{
"model": "gamingagent + qwen3-235B-A22B-fp8",
"score": 2.33,
"details": "1,2,4"
}
]
},
"Ace Attorney": {
"runs": 1,
"results": [
{
"model": "gamingagent + claude-3-5-sonnet-20241022",
"score": 2,
"progress": "1:2/5",
"evaluator result": "1/3"
},
{
"model": "gamingagent + claude-3-7-sonnet-20250219 (thinking)",
"score": 7,
"progress": "2:2/9",
"evaluator result": "5/11"
},
{
"model": "gamingagent + deepseek-r1-0120",
"score": 0,
"progress": "0",
"evaluator result": "1/5"
},
{
"model": "gamingagent + gemini-2.5-flash-preview-04-17 (thinking)",
"score": 4,
"progress": "1:4/5",
"evaluator result": "1/7"
},
{
"model": "gamingagent + gemini-2.5-pro-preview-05-06 (thinking)",
"score": 7,
"progress": "2:2/9",
"evaluator result": "2/3"
},
{
"model": "gamingagent + grok-3-mini-beta (thinking)",
"score": 0,
"progress": "0",
"evaluator result": "0"
},
{
"model": "gamingagent + llama-4-maverick-17b-128e-instruct-fp8",
"score": 0,
"progress": "0",
"evaluator result": "0"
},
{
"model": "gamingagent + gpt-4.1-2025-04-14",
"score": 2,
"progress": "1:2/5",
"evaluator result": "2/3"
},
{
"model": "gamingagent + gpt-4o-2024-11-20",
"score": 0,
"progress": "0",
"evaluator result": "0"
},
{
"model": "gamingagent + o1-2024-12-17",
"score": 16,
"progress": "3:2/8",
"evaluator result": "6/11"
},
{
"model": "gamingagent + o1-mini-2024-09-12",
"score": 0,
"progress": "0",
"evaluator result": "1/5"
},
{
"model": "gamingagent + o3-2025-04-16",
"score": 16,
"progress": "3:2/8",
"evaluator result": "1/2"
},
{
"model": "gamingagent + o4-mini-2025-04-16",
"score": 4,
"progress": "1:4/5",
"evaluator result": "2/5"
},
{
"model": "random (x30)",
"score": 0,
"progress": "0",
"evaluator result": "0"
},
{
"model": "gamingagent + claude-opus-4-20250514",
"score": 6,
"details": "6"
},
{
"model": "gamingagent + claude-sonnet-4-20250514",
"score": 3.67,
"details": "3,4,4"
},
{
"model": "gamingagent + gemini-2.5-flash-preview-05-20",
"score": 4.33,
"details": "3,4,6"
}
]
}
}