File size: 7,986 Bytes
4e5eb13
 
e55f60d
 
 
 
 
5923ca2
4e5eb13
11ee6dc
 
e55f60d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
452c890
34660db
452c890
 
 
 
 
 
 
 
 
 
 
 
 
 
34660db
 
452c890
34660db
948cba3
 
452c890
 
 
 
 
 
 
4e5eb13
948cba3
 
 
 
a652572
892fa36
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f36044d
892fa36
 
 
 
 
 
 
a652572
892fa36
 
 
 
 
11ee6dc
09d7cf1
948cba3
5923ca2
948cba3
 
34660db
948cba3
34660db
948cba3
34660db
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
948cba3
 
34660db
 
a652572
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
import os
import tempfile
from io import BytesIO
from urllib.parse import urlparse

import gradio as gr
import pandas as pd
import requests
from PIL import Image

from src.leaderboard.leaderboard_html import create_leaderboard_html

BASE_URL = "https://huggingface.co/datasets/zonszer/demo_source_data/resolve/main"

def load_image_from_url(url, timeout=30):
    """Download an image over HTTP and open it with PIL.

    This is a best-effort helper: any expected download or decoding failure
    yields ``None`` instead of raising, so a missing asset does not crash the
    UI that is being built.

    Args:
        url: Address of the image to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` may block indefinitely on a stalled server.

    Returns:
        A ``PIL.Image.Image`` on success, or ``None`` if the request failed,
        the server answered with an HTTP error status, or the payload could
        not be opened as an image.
    """
    try:
        response = requests.get(url, timeout=timeout)
        # Treat 4xx/5xx as failures rather than trying to decode an error page.
        response.raise_for_status()
        return Image.open(BytesIO(response.content))
    except (requests.RequestException, OSError, ValueError):
        # RequestException: network/HTTP/URL problems.
        # OSError: includes PIL's UnidentifiedImageError for undecodable data.
        # ValueError: raised by PIL for invalid open arguments/data.
        # Deliberately NOT a bare ``except`` so Ctrl-C / SystemExit propagate.
        return None
        
def load_file_from_url(url, timeout=30):
    """Download a file over HTTP into a persistent temporary file.

    The temporary file is created with ``delete=False`` so that its path stays
    valid after this function returns; the caller owns the file and is
    responsible for removing it.

    Args:
        url: Address of the file to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` may block indefinitely on a stalled server.

    Returns:
        The filesystem path of the downloaded copy, or ``None`` if the request
        failed, the server answered with an HTTP error status, or the file
        could not be written.
    """
    try:
        response = requests.get(url, timeout=timeout)
        # Without this check a 404/500 error page would be saved to disk and
        # returned as if it were the requested media file.
        response.raise_for_status()
    except requests.RequestException:
        return None

    # Derive the extension from the URL *path* only, so a query string or
    # fragment ("clip.mp4?download=true") does not end up in the suffix.
    file_ext = os.path.splitext(urlparse(url).path)[1]

    try:
        with tempfile.NamedTemporaryFile(delete=False, suffix=file_ext) as tmp_file:
            tmp_file.write(response.content)
    except OSError:
        return None
    return tmp_file.name

# Static leaderboard rows. Each row has seven fields, in the same order as
# COLUMNS below:
#   Method, #Param., Input Type, Control Type, Model Type, Mean Traj. ↓, Acc. ↑
# The two metric fields are floats where a result exists; otherwise they hold
# a placeholder string ("–" or "XXX"), so those columns are mixed-type.
# NOTE(review): the bracketed numbers in method names (e.g. "[36]") look like
# citation indices from an accompanying paper — confirm before editing them.
STATIC_DATA = [
    ["w/o WM", "72B", "RGB", "–", "VLM", 6.24, 50.27],
    ["PathDreamer [36]", "0.69B", "RGB-D; Sem; Pano", "Viewpoint", "Image Gen.", 5.28, 56.99],
    ["SE3DS [11]", "1.1B", "RGB-D; Pano", "Viewpoint", "Image Gen.", 5.29, 57.53],
    ["NWM [25]", "1B", "RGB", "Trajectory", "Video Gen.", 5.68, 57.35],
    ["SVD [6]", "1.5B", "RGB", "Image", "Video Gen.", 5.29, 57.71],
    ["LTX-Video [5]", "2B", "RGB", "Text", "Video Gen.", 5.37, 56.08],
    ["Hunyuan [4]", "13B", "RGB", "Text", "Video Gen.", 5.21, 57.71],
    ["Wan2.1 [23]", "14B", "RGB", "Text", "Video Gen.", 5.24, 58.26],
    ["Cosmos [1]", "2B", "RGB", "Text", "Video Gen.", 5.898, 52.27],
    ["Runway", "–", "–", "Text", "Video Gen.", "–", "–"],
    ["SVD† [6]", "1.5B", "RGB; Pano", "Action", "Video Gen. Post-Train", 5.02, 60.98],
    ["LTX† [5]", "2B", "RGB; Pano", "Action", "Video Gen. Post-Train", 5.49, 57.53],
    ["WAN2.1† [23]", "14B", "RGB; Pano", "Action", "Video Gen. Post-Train", "XXX", "XXX"],
    ["Cosmos† [1]", "2B", "RGB; Pano", "Action", "Video Gen. Post-Train", 5.08, 60.25],
]

# Column headers for STATIC_DATA; "Acc. ↑" is the column create_leaderboard()
# sorts on.
COLUMNS = ["Method", "#Param.", "Input Type", "Control Type", "Model Type", "Mean Traj. ↓", "Acc. ↑"]

def create_leaderboard(data=None, columns=None, sort_by="Acc. ↑"):
    """Build the leaderboard table, best score first.

    Rows are ordered by ``sort_by`` in descending order. Cells that are not
    numeric (placeholders such as "–" or "XXX") are treated as missing and
    placed at the bottom. The sort is stable, so rows with equal scores (and
    all rows with a missing score) keep their original relative order. The
    returned frame contains the original, unmodified cell values.

    Args:
        data: Row-major table contents. Defaults to the module-level
            ``STATIC_DATA``.
        columns: Column headers for ``data``. Defaults to the module-level
            ``COLUMNS``.
        sort_by: Name of the column holding the score to rank by.

    Returns:
        A ``pandas.DataFrame`` with a fresh 0..n-1 index in ranked order.
    """
    if data is None:
        data = STATIC_DATA
    if columns is None:
        columns = COLUMNS

    df = pd.DataFrame(data, columns=columns)

    # Rank on a numeric view of the score column; non-numeric cells become NaN.
    # NaN + na_position="last" replaces a magic "-1" sentinel, which would rank
    # a genuine score below -1 underneath the missing entries.
    scores = pd.to_numeric(df[sort_by], errors="coerce")
    ranked_labels = scores.sort_values(
        ascending=False,
        kind="mergesort",  # stable: ties keep their table order
        na_position="last",
    ).index

    # The sorted index holds *labels*, so select with .loc (not .iloc).
    return df.loc[ranked_labels].reset_index(drop=True)

# Top-level Gradio UI. Three tabs: an interactive demo rendered from a static
# HTML file, the leaderboard (HTML produced by create_leaderboard_html), and a
# Markdown "About" page. The emoji in the heading and tab labels were
# previously stored as mis-decoded UTF-8 (mojibake) and are restored here.
with gr.Blocks(title="World-in-World: Building a Closed-Loop World Interface to Evaluate World Models", theme=gr.themes.Soft()) as demo:
    gr.HTML("<h1 style='text-align: center; margin-bottom: 1rem'>🏆 World-in-World: Building a Closed-Loop World Interface to Evaluate World Models</h1>")
    
    with gr.Tabs():
        with gr.TabItem("🧑‍🏫 Interactive Demo"):
            # The path is relative, so it is resolved against the current
            # working directory at import time.
            with open("src/display/demo_new.html", "r", encoding="utf-8") as f:
                html_content = f.read()
            gr.HTML(html_content)
            # NOTE: the commented-out code below is an earlier three-column
            # layout built from native Gradio components; it is kept for
            # reference and is not executed.
            # with gr.Row():
            #     # Left Zone: Agent's View
            #     with gr.Column(scale=2, min_width=350):
            #         gr.HTML("<h2 style='text-align: center;'>Agent's View</h2>")
            #         # Mimicking the blue instruction box from the image
            #         gr.HTML("""
            #             <div style='background-color: #e6f3ff; border: 1px solid #b3d9ff; border-radius: 8px; padding: 15px; font-family: sans-serif;'>
            #                 <div style='display: flex; align-items: center; margin-bottom: 10px;'>
            #                     <span style='font-size: 24px; margin-right: 10px;'>🧠</span>
            #                     <h3 style='margin: 0; color: #333;'>Instruction:</h3>
            #                 </div>
            #                 <p style='margin: 0; color: #555;'>Navigate to the Toaster in the room and be as close as possible to it.</p>
            #             </div>
            #         """)
            #         # Mimicking the grey planning box from the image
            #         gr.HTML("""
            #             <div style='background-color: #f5f5f5; border: 1px solid #e0e0e0; border-radius: 8px; padding: 15px; margin-top: 20px; font-family: sans-serif;'>
            #                 <div style='display: flex; align-items: center; margin-bottom: 10px;'>
            #                     <span style='font-size: 24px; margin-right: 10px;'>🦾</span>
            #                     <h3 style='margin: 0; color: #333;'>Environment Step 4-7:</h3>
            #                 </div>
            #                 <h4 style='margin-top: 10px; margin-bottom: 5px; color: #444;'>Planning:</h4>
            #                 <ol start="4" style='padding-left: 20px; margin: 0; color: #555;'>
            #                     <li>Move leftward by 0.25.</li>
            #                     <li>Move leftward by 0.25.</li>
            #                     <li>Move forward by 0.25.</li>
            #                     <li>Move forward by 0.25.</li>
            #                 </ol>
            #             </div>
            #         """)
                
                # # Middle Zone: Closed-Loop Environmental Feedback
                # with gr.Column(scale=4, min_width=500):
                #     gr.HTML("<h2 style='text-align: center; color: #db83b5;'>Closed-Loop Environmental Feedback</h2>")
                #     with gr.Row():
                #         gr.Video(value=load_file_from_url(f"{BASE_URL}/AR/FTwan21_lora/5ZKStnWn8Zo/E014/A001/world_model_gen/bbox_gen_video_1.mp4"), label="First Person View", interactive=False)
                #         gr.Image(value=load_image_from_url(f"{BASE_URL}/scenes_glb/birdEye_5ZKStnWn8Zo.png"), label="Bird's Eye View", type="pil", interactive=False)
                #     # gr.Model3D(value=load_file_from_url(f"{BASE_URL}/scenes_glb/5ZKStnWn8Zo.glb"), label="3D Scene", interactive=False)
                
                # # Right Zone: World Model's Generation
                # with gr.Column(scale=3, min_width=400):
                #     gr.HTML("<h2 style='text-align: center;'>World Model's Generation</h2>")
                #     # Using the new video path provided by the user
                #     gr.Video(value=load_file_from_url(f"{BASE_URL}/AR/FTwan21_lora/5ZKStnWn8Zo/E014/A005/world_model_gen/obj_centered_gen_video_1.mp4"), label="Generated View", interactive=False)


        with gr.TabItem("📊 Leaderboard"):
            gr.HTML(create_leaderboard_html())
        
        with gr.TabItem("📝 About"):
            gr.Markdown("""
            # World-in-World: Building a Closed-Loop World Interface to Evaluate World Models
            
            This leaderboard showcases performance metrics across different types of AI models in world modeling tasks:
            
            ## Model Categories
            - **VLM**: Vision-Language Models
            - **Image Gen.**: Image Generation Models  
            - **Video Gen.**: Video Generation Models
            - **Video Gen. Post-Train**: Post-training specialized Video Generation Models
            
            ## Metrics Explained
            - **Acc. ↑**: Accuracy score (higher values indicate better performance)
            - **Mean Traj. ↓**: Mean trajectory error (lower values indicate better performance)
            
            ## Notes
            - † indicates post-training specialized models
            - XXX indicates results pending/unavailable
            - – indicates not applicable or not available
            
            *Results represent performance on world modeling evaluation benchmarks and may vary across different evaluation settings.*
            """)

# Launch the Gradio app only when this file is executed as a script, so that
# importing the module (e.g. to reuse `demo`) does not start a server.
if __name__ == "__main__":
    demo.launch()