X-iZhang committed
Commit b5b8cb6 · verified · 1 Parent(s): 842642b

Update app.py

Files changed (1)
  1. app.py +92 -34
app.py CHANGED
@@ -1,11 +1,15 @@
 import os
 # Force CPU-only in this process by hiding CUDA devices (set before importing heavy libs)
-os.environ.setdefault('CUDA_VISIBLE_DEVICES', '')
+os.environ['CUDA_VISIBLE_DEVICES'] = ''
+os.environ['CUDA_LAUNCH_BLOCKING'] = '1'
 
 import torch
 import gradio as gr
 import time
 
+# Force CPU device globally by overriding torch.cuda.is_available
+torch.cuda.is_available = lambda: False
+
 # =========================================
 # Safe Libra Hook (CPU fallback + dtype fix)
 # This hook must run before any heavyweight libra model-loading occurs.
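Note on ordering: `CUDA_VISIBLE_DEVICES` only takes effect if it is set before torch initializes its CUDA state, which is why the assignment sits above `import torch`; the `torch.cuda.is_available = lambda: False` patch then catches any library that probes for CUDA later at runtime. A minimal standalone sketch (not part of the commit) showing the effect:

```python
import os

# Must run before `import torch`: once the CUDA runtime is initialized,
# changing CUDA_VISIBLE_DEVICES no longer affects this process.
os.environ['CUDA_VISIBLE_DEVICES'] = ''

import torch

print(torch.cuda.is_available())   # False: no visible devices
print(torch.cuda.device_count())   # 0
print(torch.randn(2, 2).device)    # cpu
```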
@@ -19,14 +23,16 @@ _original_load_pretrained_model = getattr(builder, 'load_pretrained_model', None
 def safe_load_pretrained_model(model_path, model_base=None, model_name=None, **kwargs):
     print("[INFO] Hook activated: safe_load_pretrained_model()")
 
-    # Fill in model_name to avoid calling .lower() on None
+    # Complete model_name to avoid .lower() on None
     if model_name is None:
         model_name = model_path
 
-    # Force CPU arguments when calling the original function, to avoid CUDA initialization
+    # Force CPU parameters when calling original function
     kwargs = dict(kwargs)
-    kwargs.setdefault('device', 'cpu')
-    kwargs.setdefault('device_map', 'cpu')
+    kwargs['device'] = 'cpu'
+    kwargs['device_map'] = 'cpu'
+    kwargs.setdefault('torch_dtype', torch.float32)
+    kwargs.setdefault('low_cpu_mem_usage', True)
 
     if _original_load_pretrained_model is None:
         raise RuntimeError('Original load_pretrained_model not found in builder')
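The switch from `setdefault` to plain assignment is the substantive change here: `setdefault` only fills a key the caller left unset, so a caller passing `device='cuda'` would previously have kept the GPU. A tiny illustration of the difference:

```python
kwargs = {'device': 'cuda'}         # caller explicitly requested GPU

kwargs.setdefault('device', 'cpu')  # no-op: the key already exists
print(kwargs['device'])             # cuda

kwargs['device'] = 'cpu'            # unconditional override, as in the new code
print(kwargs['device'])             # cpu
```

`setdefault` is still used for `torch_dtype` and `low_cpu_mem_usage`, which are soft defaults a caller may legitimately override.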
@@ -50,20 +56,31 @@ def safe_load_pretrained_model(model_path, model_base=None, model_name=None, **k
             # propagate other errors
             raise
 
-    # On CPU, try to upcast the model and vision tower to float32 to reduce compatibility issues
-    if not torch.cuda.is_available():
-        try:
-            model.to(dtype=torch.float32)
-        except Exception as e:
-            print(f"[WARN] Could not upcast LM to float32: {e}")
-        try:
+    # Force all model components to CPU with float32 for compatibility
+    print('[INFO] Forcing all components to CPU with float32 dtype...')
+    try:
+        model = model.to(device='cpu', dtype=torch.float32)
+        print('[INFO] Model moved to CPU (float32).')
+    except Exception as e:
+        print(f"[WARN] Could not move model to cpu/float32: {e}")
+
+    try:
+        if hasattr(model, 'get_vision_tower'):
             vt = model.get_vision_tower()
-            vt.to(device='cpu', dtype=torch.float32)
-            print('[INFO] Vision tower moved to cpu (float32).')
-        except Exception as e:
-            print(f"[WARN] Could not move vision_tower to cpu/float32: {e}")
-    else:
-        print('[INFO] GPU available — keeping original device/dtype behavior.')
+            if vt is not None:
+                vt = vt.to(device='cpu', dtype=torch.float32)
+                print('[INFO] Vision tower moved to CPU (float32).')
+    except Exception as e:
+        print(f"[WARN] Could not move vision_tower to cpu/float32: {e}")
+
+    try:
+        if hasattr(model, 'get_model'):
+            inner_model = model.get_model()
+            if inner_model is not None:
+                inner_model = inner_model.to(device='cpu', dtype=torch.float32)
+                print('[INFO] Inner model moved to CPU (float32).')
+    except Exception as e:
+        print(f"[WARN] Could not move inner model to cpu/float32: {e}")
 
     return tokenizer, model, image_processor, context_len
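Each move above is wrapped in its own try/except so one failing component cannot abort the rest. Note that `nn.Module.to()` mutates parameters in place and returns the module, so the reassignments are belt-and-suspenders. A minimal sketch (a stand-in module, not the Libra model) verifying the end state:

```python
import torch
import torch.nn as nn

# Stand-in for a half-precision checkpoint already on CPU.
module = nn.Linear(4, 4).to(torch.float16)

module = module.to(device='cpu', dtype=torch.float32)

assert all(p.device.type == 'cpu' and p.dtype == torch.float32
           for p in module.parameters())
```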
@@ -80,7 +97,12 @@ def safe_load_model(model_path, model_base=None, model_name=None):
 
 run_libra.load_model = safe_load_model
 
-# Now import CCD and the other hooked symbols (import after the hook so it takes effect)
+# Now import CCD and hook ccd_utils to force CPU for expert models
+import ccd.ccd_utils as ccd_utils_module
+ccd_utils_module._DEVICE = torch.device('cpu')
+print('[INFO] Forced ccd_utils._DEVICE to CPU')
+
+# Now import the evaluation functions
 from ccd import ccd_eval, run_eval
 from libra.eval.run_libra import load_model
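The ordering here is load-bearing: `from ccd import ccd_eval, run_eval` binds names at import time, so the monkey-patches must already be installed when that line runs, and module-level constants like `ccd_utils._DEVICE` must be overwritten before any code reads them. A condensed sketch of the patch-then-import pattern; the `libra.model.builder` path is an assumption, since the builder import itself is outside this diff:

```python
import torch
import libra.model.builder as builder   # assumed path; not shown in this diff

_original = builder.load_pretrained_model

def _cpu_only_load(*args, **kwargs):
    kwargs['device'] = 'cpu'            # force CPU regardless of caller
    return _original(*args, **kwargs)

builder.load_pretrained_model = _cpu_only_load   # 1. install the hook

import ccd.ccd_utils as ccd_utils
ccd_utils._DEVICE = torch.device('cpu')          # 2. override module constant

from ccd import ccd_eval, run_eval               # 3. only now bind the symbols
```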
@@ -88,15 +110,15 @@ from libra.eval.run_libra import load_model
 # Global Configuration
 # =========================================
 MODEL_CATALOGUE = {
+    "Libra-v1.0-3B (⚡Recommended for CPU)": "X-iZhang/libra-v1.0-3b",
     "Libra-v1.0-7B": "X-iZhang/libra-v1.0-7b",
-    "Libra-v1.0-3B": "X-iZhang/libra-v1.0-3b",
     "MAIRA-2": "X-iZhang/libra-maira-2",
     "LLaVA-Med-v1.5": "X-iZhang/libra-llava-med-v1.5-mistral-7b",
     "LLaVA-Rad": "X-iZhang/libra-llava-rad",
     "Med-CXRGen-F": "X-iZhang/Med-CXRGen-F",
     "Med-CXRGen-I": "X-iZhang/Med-CXRGen-I"
 }
-DEFAULT_MODEL_NAME = "MAIRA-2"
+DEFAULT_MODEL_NAME = "Libra-v1.0-3B (⚡Recommended for CPU)"
 _loaded_models = {}
 
 
@@ -104,13 +126,14 @@ _loaded_models = {}
 # Environment Setup
 # =========================================
 def setup_environment():
-    if torch.cuda.is_available():
-        print("🔹 Using GPU:", torch.cuda.get_device_name(0))
-    else:
-        print("🔹 Using CPU")
+    print("🔹 Running in CPU-only mode (forced for Hugging Face Spaces)")
     os.environ['TOKENIZERS_PARALLELISM'] = 'false'
     os.environ['TRANSFORMERS_CACHE'] = './cache'
-    torch.set_num_threads(4)
+
+    # Set number of threads for CPU inference
+    num_threads = min(os.cpu_count() or 4, 8)
+    torch.set_num_threads(num_threads)
+    print(f"🔹 Using {num_threads} CPU threads")
 
 
 # =========================================
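On the thread count: `os.cpu_count()` can return `None` in restricted containers (hence the `or 4` fallback), and capping at 8 avoids oversubscription on shared Space CPUs, where extra intra-op threads tend to add contention rather than speed. A sketch of the same knob, with the related inter-op setting added as an extra that is not in the commit:

```python
import os
import torch

num_threads = min(os.cpu_count() or 4, 8)  # fallback, then cap
torch.set_num_threads(num_threads)         # intra-op parallelism (matmul, conv, ...)
torch.set_num_interop_threads(2)           # inter-op parallelism; must run before any parallel work
print(torch.get_num_threads(), torch.get_num_interop_threads())
```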
@@ -125,14 +148,27 @@ def load_or_get_model(model_name: str):
         return _loaded_models[model_path]
 
     print(f"🔹 Loading model: {model_name} ({model_path}) ...")
+    print(f"🔹 This may take 2-5 minutes on CPU, please wait...")
     try:
+        # Clear cache before loading to maximize available memory
+        import gc
+        gc.collect()
+        if hasattr(torch.cuda, 'empty_cache'):
+            torch.cuda.empty_cache()
+
         with torch.no_grad():
             model = load_model(model_path)
+
         _loaded_models[model_path] = model
         print(f"✅ Loaded successfully: {model_name}")
+
+        # Clean up after loading
+        gc.collect()
         return model
     except Exception as e:
         print(f"❌ Error loading model {model_name}: {e}")
+        import traceback
+        traceback.print_exc()
         raise
 
 
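Two details worth noting: `torch.cuda.empty_cache()` is a no-op when CUDA was never initialized, so the guarded call is harmless in this CPU-only process, and `torch.no_grad()` keeps autograd from recording any tensor ops that run while the checkpoint loads. A small sketch of the `no_grad` effect:

```python
import torch

with torch.no_grad():
    w = torch.randn(3, 3, requires_grad=True)
    y = w * 2            # runs without building an autograd graph

print(y.requires_grad)   # False: nothing was recorded inside no_grad
```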
@@ -148,19 +184,25 @@ def generate_ccd_description(
     beta,
     gamma,
     use_run_eval,
-    max_new_tokens
+    max_new_tokens,
+    progress=gr.Progress()
 ):
     """Generate findings using CCD evaluation."""
     if not current_img:
         return "⚠️ Please upload or select an example image first."
 
     try:
+        progress(0, desc="Starting inference...")
         print(f"🔹 Generating description with model: {selected_model_name}")
         print(f"🔹 Parameters: alpha={alpha}, beta={beta}, gamma={gamma}")
         print(f"🔹 Image path: {current_img}")
 
+        progress(0.1, desc="Loading model (this may take several minutes on CPU)...")
         model = load_or_get_model(selected_model_name)
+
+        progress(0.3, desc="Running CCD inference (this may take 5-10 minutes on CPU)...")
         print(f"🔹 Running CCD with {selected_model_name} and expert model {expert_model}...")
+
         ccd_output = ccd_eval(
             libra_model=model,
             image=current_img,
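For readers new to `progress=gr.Progress()`: Gradio inspects the handler's signature and injects a live tracker for that parameter (callers never pass it), and each `progress(fraction, desc=...)` call updates the bar in the UI; this relies on the queue, which the launch block below enables. A minimal self-contained sketch:

```python
import time
import gradio as gr

def slow_task(text, progress=gr.Progress()):
    progress(0, desc="Starting...")
    for _ in progress.tqdm(range(5)):   # progress.tqdm also wraps iterables
        time.sleep(0.2)
    return text.upper()

demo = gr.Interface(fn=slow_task, inputs="text", outputs="text")
# demo.launch()
```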
@@ -172,7 +214,10 @@
             gamma=gamma
         )
 
+        progress(0.8, desc="Processing results...")
+
         if use_run_eval:
+            progress(0.85, desc="Running baseline comparison...")
             baseline_output = run_eval(
                 libra_model=model,
                 image=current_img,
@@ -180,11 +225,13 @@
                 max_new_tokens=max_new_tokens,
                 num_beams=1
             )
+            progress(1.0, desc="Complete!")
             return (
                 f"### 🩺 CCD Result ({expert_model})\n{ccd_output}\n\n"
                 f"---\n### ⚖️ Baseline (run_eval)\n{baseline_output[0]}"
             )
 
+        progress(1.0, desc="Complete!")
         return f"### 🩺 CCD Result ({expert_model})\n{ccd_output}"
 
     except Exception:
@@ -281,10 +328,16 @@ def main():
     ### [Project Page](https://x-izhang.github.io/CCD/) | [Paper](https://arxiv.org/abs/2509.23379) | [Code](https://github.com/X-iZhang/CCD) | [Models](https://huggingface.co/collections/X-iZhang/libra-6772bfccc6079298a0fa5f8d)
 
     **🚨 Performance Warning**
-
-    The demo is currently running on **CPU**, and a single inference takes approximately **500 seconds**.
-    To achieve optimal performance and significantly reduce inference time, a **GPU** is required.
-    For more details, please refer to [launching the demo locally](https://github.com/X-iZhang/CCD#gradio-web-interface).
+
+    This demo is running in **CPU-only** mode. A single inference may take **5-10 minutes** depending on the model and parameters.
+
+    **Recommendations for faster inference:**
+    - Use smaller models (Libra-v1.0-3B is faster than the 7B models)
+    - Reduce `Max New Tokens` to 64-128 (default: 64)
+    - Disable the baseline comparison
+    - For GPU acceleration, please [run the demo locally](https://github.com/X-iZhang/CCD#gradio-web-interface)
+
+    **Note:** If you see "Connection Lost", please wait: the inference is still running and the results will appear when complete.
     """)
 
     with gr.Tab("✨ CCD Demo"):
@@ -347,8 +400,8 @@
                 gamma = gr.Slider(0, 20, value=10, step=1, label="Gamma")
 
                 with gr.Accordion("Advanced Options", open=False):
-                    max_new_tokens = gr.Slider(10, 256, value=128, step=1, label="Max New Tokens")
-                    use_run_eval = gr.Checkbox(label="Compare with baseline (run_eval)", value=False)
+                    max_new_tokens = gr.Slider(10, 256, value=64, step=1, label="Max New Tokens (lower = faster)")
+                    use_run_eval = gr.Checkbox(label="Compare with baseline (run_eval) [doubles inference time]", value=False)
 
                 generate_btn = gr.Button("🚀 Generate", variant="primary")
 
@@ -396,7 +449,12 @@
         pass
 
 
-    demo.launch()
+    # Launch with extended timeout for CPU inference
+    demo.queue(max_size=10)  # Enable queue for better handling of long-running tasks
+    demo.launch(
+        max_threads=4,   # Limit concurrent requests
+        show_error=True  # Show detailed errors
+    )
 
 
 if __name__ == "__main__":
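On the launch changes: a plain HTTP request would be timed out by the Space's proxy long before a multi-minute CPU inference returns, while `demo.queue()` moves submissions onto a persistent event queue that survives long runs; `max_size` bounds how many requests may wait, and `max_threads` caps concurrent workers. A minimal sketch of the same wiring, with a sleep standing in for inference:

```python
import time
import gradio as gr

def slow_infer(text):
    time.sleep(2)          # stand-in for a multi-minute CPU inference
    return text.upper()

with gr.Blocks() as demo:
    inp = gr.Textbox(label="Input")
    out = gr.Textbox(label="Output")
    gr.Button("Run").click(slow_infer, inp, out)

demo.queue(max_size=10)    # queued events survive long runtimes
demo.launch(max_threads=4, show_error=True)
```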