| { | |
| "Super Mario Bros": { | |
| "runs": 3, | |
| "results": [ | |
| { | |
| "model": "gamingagent + claude-3-5-sonnet-20241022", | |
| "score": 1267.7, | |
| "detail_data":"709,1532,1562", | |
| "progress": "1-1" | |
| }, | |
| { | |
| "model": "gamingagent + claude-3-7-sonnet-20250219 (thinking)", | |
| "score": 1418.7, | |
| "detail_data":"2015,709,1532", | |
| "progress": "1-1" | |
| }, | |
| { | |
| "model": "gamingagent + gemini-2.5-flash-preview-04-17 (thinking)", | |
| "score": 1385.0, | |
| "detail_data":"1672,1266,1247", | |
| "progress": "1-1" | |
| }, | |
| { | |
| "model": "gamingagent + gemini-2.5-pro-preview-05-06 (thinking)", | |
| "score": 1498.3, | |
| "detail_data":"1561,1271,1663", | |
| "progress": "1-1" | |
| }, | |
| { | |
| "model": "gamingagent + llama-4-maverick-17b-128e-instruct-fp8", | |
| "score": 1468.7, | |
| "detail_data":"898,2008,1500", | |
| "progress": "1-1" | |
| }, | |
| { | |
| "model": "gamingagent + gpt-4.1-2025-04-14", | |
| "score": 2126.3, | |
| "detail_data":"1531,722,4126", | |
| "progress": "1-1" | |
| }, | |
| { | |
| "model": "gamingagent + gpt-4o-2024-11-20", | |
| "score": 2047.3, | |
| "detail_data":"2017,2590,1535", | |
| "progress": "1-1" | |
| }, | |
| { | |
| "model": "gamingagent + o1-2024-12-17", | |
| "score": 855, | |
| "detail_data":"855", | |
| "progress": "1-1" | |
| }, | |
| { | |
| "model": "gamingagent + o3-2025-04-16", | |
| "score": 3445, | |
| "detail_data":"3445", | |
| "progress": "1-1" | |
| }, | |
| { | |
| "model": "gamingagent + o4-mini-2025-04-16", | |
| "score": 1448.0, | |
| "detail_data":"1525,1263,1556", | |
| "progress": "1-1" | |
| }, | |
| { | |
| "model": "random (x30)", | |
| "score": 986.97, | |
| "detail_data":"986.97", | |
| "progress": "1-1" | |
| } | |
| ] | |
| }, | |
| "2048": { | |
| "runs": 3, | |
| "results": [ | |
| { | |
| "model": "gamingagent + claude-3-5-sonnet-20241022", | |
| "score": 1914.67, | |
| "details": "1352,2860,1532", | |
| "highest_tail": 256 | |
| }, | |
| { | |
| "model": "gamingagent + claude-3-7-sonnet-20250219 (thinking)", | |
| "score": 2624, | |
| "details": "2560,3224,2088", | |
| "highest_tail": 256 | |
| }, | |
| { | |
| "model": "gamingagent + deepseek-r1-0120", | |
| "score": 1873.33, | |
| "details": "700,1240,3680", | |
| "highest_tail": 256 | |
| }, | |
| { | |
| "model": "gamingagent + gemini-2.5-flash-preview-04-17 (thinking)", | |
| "score": 1697.33, | |
| "details": "1304,1316,2472", | |
| "highest_tail": 256 | |
| }, | |
| { | |
| "model": "gamingagent + gemini-2.5-pro-preview-05-06 (thinking)", | |
| "score": 3586.67, | |
| "details": "5300,2400,3060", | |
| "highest_tail": 512 | |
| }, | |
| { | |
| "model": "gamingagent + grok-3-mini-beta (thinking)", | |
| "score": 4036, | |
| "details": "6412,2492,3204", | |
| "highest_tail": 512 | |
| }, | |
| { | |
| "model": "gamingagent + llama-4-maverick-17b-128e-instruct-fp8", | |
| "score": 1586.67, | |
| "details": "1404,1272,2084", | |
| "highest_tail": 128 | |
| }, | |
| { | |
| "model": "gamingagent + gpt-4.1-2025-04-14", | |
| "score": 1656, | |
| "details": "1156,2664,1148", | |
| "highest_tail": 256 | |
| }, | |
| { | |
| "model": "gamingagent + gpt-4o-2024-11-20", | |
| "score": 1656, | |
| "details": "1604,1284,2080", | |
| "highest_tail": 256 | |
| }, | |
| { | |
| "model": "gamingagent + o1-2024-12-17", | |
| "score": 7580, | |
| "details": "7580", | |
| "highest_tail": 512 | |
| }, | |
| { | |
| "model": "gamingagent + o1-mini-2024-09-12", | |
| "score": 2757.33, | |
| "details": "3132,2004,3136", | |
| "highest_tail": 256 | |
| }, | |
| { | |
| "model": "gamingagent + o3-2025-04-16", | |
| "score": 7120, | |
| "details": "7120", | |
| "highest_tail": 512 | |
| }, | |
| { | |
| "model": "gamingagent + o4-mini-2025-04-16", | |
| "score": 4432.0, | |
| "details": "4928,5456,2912", | |
| "highest_tail": 512 | |
| }, | |
| { | |
| "model": "random (x30)", | |
| "score": 1213.33, | |
| "details": "", | |
| "highest_tail": 128 | |
| }, | |
| { | |
| "model": "gamingagent + claude-opus-4-20250514", | |
| "score": 3036.0, | |
| "details": "3036.0", | |
| "highest_tail": 256 | |
| }, | |
| { | |
| "model": "gamingagent + claude-sonnet-4-20250514", | |
| "score": 3136, | |
| "details": "2148,2360,4900", | |
| "highest_tail": 256 | |
| }, | |
| { | |
| "model": "gamingagent + deepseek-r1-0528", | |
| "score": 3330.0, | |
| "details": "3260,3400", | |
| "highest_tail": 256 | |
| }, | |
| { | |
| "model": "gamingagent + qwen3-235B-A22B-fp8", | |
| "score": 2144.0, | |
| "details": "1436,2556,2440", | |
| "highest_tail": 256 | |
| } | |
| ] | |
| }, | |
| "Tetris": { | |
| "runs": 3, | |
| "results": [ | |
| { | |
| "model": "gamingagent + claude-3-5-sonnet-20241022", | |
| "score": 14.7, | |
| "details": "16,14,14" | |
| }, | |
| { | |
| "model": "gamingagent + claude-3-7-sonnet-20250219 (thinking)", | |
| "score": 16.3, | |
| "details": "19,15,15" | |
| }, | |
| { | |
| "model": "gamingagent + deepseek-r1-0120", | |
| "score": 14.3, | |
| "details": "15,14,14" | |
| }, | |
| { | |
| "model": "gamingagent + gemini-2.5-flash-preview-04-17 (thinking)", | |
| "score": 16.3, | |
| "details": "20,14,15" | |
| }, | |
| { | |
| "model": "gamingagent + gemini-2.5-pro-preview-05-06 (thinking)", | |
| "score": 23.3, | |
| "details": "23,23,24" | |
| }, | |
| { | |
| "model": "gamingagent + grok-3-mini-beta (thinking)", | |
| "score": 21.3, | |
| "details": "20,15,29" | |
| }, | |
| { | |
| "model": "gamingagent + llama-4-maverick-17b-128e-instruct-fp8", | |
| "score": 10.3, | |
| "details": "9,10,12" | |
| }, | |
| { | |
| "model": "gamingagent + gpt-4.1-2025-04-14", | |
| "score": 13.7, | |
| "details": "13,14,14" | |
| }, | |
| { | |
| "model": "gamingagent + gpt-4o-2024-11-20", | |
| "score": 14, | |
| "details": "18,11,13" | |
| }, | |
| { | |
| "model": "gamingagent + o1-2024-12-17", | |
| "score": 35, | |
| "details": "35" | |
| }, | |
| { | |
| "model": "gamingagent + o1-mini-2024-09-12", | |
| "score": 11.7, | |
| "details": "11,11,13" | |
| }, | |
| { | |
| "model": "gamingagent + o3-2025-04-16", | |
| "score": 42, | |
| "details": "42" | |
| }, | |
| { | |
| "model": "gamingagent + o4-mini-2025-04-16", | |
| "score": 25.3, | |
| "details": "22,35,19" | |
| }, | |
| { | |
| "model": "random (x30)", | |
| "score": 10.2, | |
| "details": "" | |
| }, | |
| { | |
| "model": "gamingagent + claude-opus-4-20250514", | |
| "score": 20, | |
| "details": "17,18,25" | |
| }, | |
| { | |
| "model": "gamingagent + claude-sonnet-4-20250514", | |
| "score": 19.33, | |
| "details": "20,17,21" | |
| }, | |
| { | |
| "model": "gamingagent + deepseek-r1-0528", | |
| "score": 33.67, | |
| "details": "26,34,41" | |
| }, | |
| { | |
| "model": "gamingagent + qwen3-235B-A22B-fp8", | |
| "score": 11.67, | |
| "details": "13,14,8" | |
| } | |
| ] | |
| }, | |
| "Candy Crush": { | |
| "runs": 3, | |
| "results": [ | |
| { | |
| "model": "gamingagent + claude-3-5-sonnet-20241022", | |
| "score": 106, | |
| "details": "92,165,61" | |
| }, | |
| { | |
| "model": "gamingagent + claude-3-7-sonnet-20250219 (thinking)", | |
| "score": 484, | |
| "details": "535,428,489" | |
| }, | |
| { | |
| "model": "gamingagent + deepseek-r1-0120", | |
| "score": 447.3, | |
| "details": "409,436,497" | |
| }, | |
| { | |
| "model": "gamingagent + gemini-2.5-flash-preview-04-17 (thinking)", | |
| "score": 334.7, | |
| "details": "259,372,373" | |
| }, | |
| { | |
| "model": "gamingagent + gemini-2.5-pro-preview-05-06 (thinking)", | |
| "score": 416.3, | |
| "details": "411,414,424" | |
| }, | |
| { | |
| "model": "gamingagent + grok-3-mini-beta (thinking)", | |
| "score": 254, | |
| "details": "299,332,131" | |
| }, | |
| { | |
| "model": "gamingagent + llama-4-maverick-17b-128e-instruct-fp8", | |
| "score": 128.7, | |
| "details": "67,139,180" | |
| }, | |
| { | |
| "model": "gamingagent + gpt-4.1-2025-04-14", | |
| "score": 182, | |
| "details": "163,215,168" | |
| }, | |
| { | |
| "model": "gamingagent + gpt-4o-2024-11-20", | |
| "score": 147.3, | |
| "details": "131,104,207" | |
| }, | |
| { | |
| "model": "gamingagent + o1-2024-12-17", | |
| "score": 159, | |
| "details": "159" | |
| }, | |
| { | |
| "model": "gamingagent + o1-mini-2024-09-12", | |
| "score": 48, | |
| "details": "21,86,37" | |
| }, | |
| { | |
| "model": "gamingagent + o3-2025-04-16", | |
| "score": 647, | |
| "details": "647" | |
| }, | |
| { | |
| "model": "gamingagent + o4-mini-2025-04-16", | |
| "score": 487.3, | |
| "details": "259,591,612" | |
| }, | |
| { | |
| "model": "random (x30)", | |
| "score": 116.5, | |
| "details": "" | |
| }, | |
| { | |
| "model": "gamingagent + claude-opus-4-20250514", | |
| "score": 464, | |
| "details": "593,406,393" | |
| }, | |
| { | |
| "model": "gamingagent + claude-sonnet-4-20250514", | |
| "score": 478.33, | |
| "details": "545,468,422" | |
| }, | |
| { | |
| "model": "gamingagent + deepseek-r1-0528", | |
| "score": 491.67, | |
| "details": "464,463,548" | |
| }, | |
| { | |
| "model": "gamingagent + qwen3-235B-A22B-fp8", | |
| "score": 363.33, | |
| "details": "365,372,353" | |
| } | |
| ] | |
| }, | |
| "Sokoban": { | |
| "runs": 3, | |
| "results": [ | |
| { | |
| "model": "gamingagent + claude-3-5-sonnet-20241022", | |
| "score": 0, | |
| "detail_box_on_target":"0,0,0", | |
| "cracked_levels": "0,0,0" | |
| }, | |
| { | |
| "model": "gamingagent + claude-3-7-sonnet-20250219 (thinking)", | |
| "score": 2.33, | |
| "detail_box_on_target":"2,4,1", | |
| "cracked_levels": "1,2,0" | |
| }, | |
| { | |
| "model": "gamingagent + deepseek-r1-0120", | |
| "score": 1.33, | |
| "detail_box_on_target":"2,0,2", | |
| "cracked_levels": "1,0,1" | |
| }, | |
| { | |
| "model": "gamingagent + gemini-2.5-flash-preview-04-17 (thinking)", | |
| "score": 1.67, | |
| "detail_box_on_target":"3,0,2", | |
| "cracked_levels": "2,0,1" | |
| }, | |
| { | |
| "model": "gamingagent + gemini-2.5-pro-preview-05-06 (thinking)", | |
| "score": 4.33, | |
| "detail_box_on_target":"4,4,5", | |
| "cracked_levels": "2,2,3" | |
| }, | |
| { | |
| "model": "gamingagent + grok-3-mini-beta (thinking)", | |
| "score": 5.67, | |
| "detail_box_on_target":"5,6,6", | |
| "cracked_levels": "3,3,3" | |
| }, | |
| { | |
| "model": "gamingagent + llama-4-maverick-17b-128e-instruct-fp8", | |
| "score": 0, | |
| "detail_box_on_target":"0,0,0", | |
| "cracked_levels": "0,0,0" | |
| }, | |
| { | |
| "model": "gamingagent + gpt-4.1-2025-04-14", | |
| "score": 0, | |
| "detail_box_on_target":"0,0,0", | |
| "cracked_levels": "0,0,0" | |
| }, | |
| { | |
| "model": "gamingagent + gpt-4o-2024-11-20", | |
| "score": 0, | |
| "detail_box_on_target":"0,0,0", | |
| "cracked_levels": "0,0,0" | |
| }, | |
| { | |
| "model": "gamingagent + o1-2024-12-17", | |
| "score": 2.33, | |
| "detail_box_on_target":"2,2,3", | |
| "cracked_levels": "1,1,2" | |
| }, | |
| { | |
| "model": "gamingagent + o1-mini-2024-09-12", | |
| "score": 1.33, | |
| "detail_box_on_target":"1,2,1", | |
| "cracked_levels": "0,1,0" | |
| }, | |
| { | |
| "model": "gamingagent + o3-2025-04-16", | |
| "score": 8, | |
| "detail_box_on_target":"10,6", | |
| "cracked_levels": "5,3" | |
| }, | |
| { | |
| "model": "gamingagent + o4-mini-2025-04-16", | |
| "score": 5.33, | |
| "detail_box_on_target":"4,6,6", | |
| "cracked_levels": "2,2,3" | |
| }, | |
| { | |
| "model": "random (x30)", | |
| "score": 0, | |
| "detail_box_on_target":"0,0,0", | |
| "cracked_levels": "0,0,0" | |
| }, | |
| { | |
| "model": "gamingagent + claude-opus-4-20250514", | |
| "score": 4, | |
| "details": "4,4,4" | |
| }, | |
| { | |
| "model": "gamingagent + claude-sonnet-4-20250514", | |
| "score": 3, | |
| "details": "2,2,5" | |
| }, | |
| { | |
| "model": "gamingagent + deepseek-r1-0528", | |
| "score": 4.67, | |
| "details": "4,4,6" | |
| }, | |
| { | |
| "model": "gamingagent + qwen3-235B-A22B-fp8", | |
| "score": 2.33, | |
| "details": "1,2,4" | |
| } | |
| ] | |
| }, | |
| "Ace Attorney": { | |
| "runs": 1, | |
| "results": [ | |
| { | |
| "model": "gamingagent + claude-3-5-sonnet-20241022", | |
| "score": 2, | |
| "progress": "1:2/5", | |
| "evaluator result": "1/3" | |
| }, | |
| { | |
| "model": "gamingagent + claude-3-7-sonnet-20250219 (thinking)", | |
| "score": 7, | |
| "progress": "2:2/9", | |
| "evaluator result": "5/11" | |
| }, | |
| { | |
| "model": "gamingagent + deepseek-r1-0120", | |
| "score": 0, | |
| "progress": "0", | |
| "evaluator result": "1/5" | |
| }, | |
| { | |
| "model": "gamingagent + gemini-2.5-flash-preview-04-17 (thinking)", | |
| "score": 4, | |
| "progress": "1:4/5", | |
| "evaluator result": "1/7" | |
| }, | |
| { | |
| "model": "gamingagent + gemini-2.5-pro-preview-05-06 (thinking)", | |
| "score": 7, | |
| "progress": "2:2/9", | |
| "evaluator result": "2/3" | |
| }, | |
| { | |
| "model": "gamingagent + grok-3-mini-beta (thinking)", | |
| "score": 0, | |
| "progress": "0", | |
| "evaluator result": "0" | |
| }, | |
| { | |
| "model": "gamingagent + llama-4-maverick-17b-128e-instruct-fp8", | |
| "score": 0, | |
| "progress": "0", | |
| "evaluator result": "0" | |
| }, | |
| { | |
| "model": "gamingagent + gpt-4.1-2025-04-14", | |
| "score": 2, | |
| "progress": "1:2/5", | |
| "evaluator result": "2/3" | |
| }, | |
| { | |
| "model": "gamingagent + gpt-4o-2024-11-20", | |
| "score": 0, | |
| "progress": "0", | |
| "evaluator result": "0" | |
| }, | |
| { | |
| "model": "gamingagent + o1-2024-12-17", | |
| "score": 16, | |
| "progress": "3: 2/8", | |
| "evaluator result": "6/11" | |
| }, | |
| { | |
| "model": "gamingagent + o1-mini-2024-09-12", | |
| "score": 0, | |
| "progress": "0", | |
| "evaluator result": "1/5" | |
| }, | |
| { | |
| "model": "gamingagent + o3-2025-04-16", | |
| "score": 16, | |
| "progress": "3: 2/8", | |
| "evaluator result": "1/2" | |
| }, | |
| { | |
| "model": "gamingagent + o4-mini-2025-04-16", | |
| "score": 4, | |
| "progress": "1:4/5", | |
| "evaluator result": "2/5" | |
| }, | |
| { | |
| "model": "random (x30)", | |
| "score": 0, | |
| "progress": "0", | |
| "evaluator result": "0" | |
| }, | |
| { | |
| "model": "gamingagent + claude-opus-4-20250514", | |
| "score": 6, | |
| "details": "6" | |
| }, | |
| { | |
| "model": "gamingagent + claude-sonnet-4-20250514", | |
| "score": 3.67, | |
| "details": "3,4,4" | |
| }, | |
| { | |
| "model": "gamingagent + gemini-2.5-flash-preview-05-20", | |
| "score": 4.33, | |
| "details": "3,4,6" | |
| } | |
| ] | |
| } | |
| } |