Spaces:

nvidia
/

audio-flamingo-3

Running on Zero

App Files Files Community

create_copies_for_each_mode

by nithinraok - opened 29 days ago

base: refs/heads/main

←

from: refs/pr/3

Discussion Files changed

+11

-6

Files changed (1) hide show

app.py +11 -6

app.py CHANGED Viewed

@@ -4,7 +4,6 @@ import llava
 from peft import PeftModel
 import os
 from huggingface_hub import snapshot_download
-import copy
 import spaces
 # ---------------------------------
 # SINGLE-TURN MODEL SETUP
@@ -12,14 +11,16 @@ import spaces
 MODEL_BASE_SINGLE = snapshot_download(repo_id="nvidia/audio-flamingo-3")
 MODEL_BASE_THINK = os.path.join(MODEL_BASE_SINGLE, 'stage35')
 # model_single = llava.load(MODEL_BASE_SINGLE, model_base=None, devices=[0])
 model_single = llava.load(MODEL_BASE_SINGLE, model_base=None)
-model_single = model_single.to("cuda")
-model_single_copy = copy.deepcopy(model_single) # keep a copy of the original model for non-thinking mode
 generation_config_single = model_single.default_generation_config
 non_lora_trainables = torch.load(
                 os.path.join(MODEL_BASE_THINK, "non_lora_trainables.bin"),
                 map_location="cpu",
@@ -27,9 +28,13 @@ non_lora_trainables = torch.load(
 non_lora_trainables = {
         (k[6:] if k.startswith("model.") else k): v for k, v in non_lora_trainables.items()
     }
-model_single.load_state_dict(non_lora_trainables, strict=False)
 model_think = PeftModel.from_pretrained(
-        model_single,
         MODEL_BASE_THINK,
         device_map="auto",
         torch_dtype=torch.float16,
@@ -51,7 +56,7 @@ def single_turn_infer(audio_file, prompt_text):
     try:
         sound = llava.Sound(audio_file)
         full_prompt = f"<sound>\n{prompt_text}"
-        response = model_single_copy.generate_content([sound, full_prompt], generation_config=generation_config_single)
         return response
     except Exception as e:
         return f"❌ Error: {str(e)}"

 from peft import PeftModel
 import os
 from huggingface_hub import snapshot_download
 import spaces
 # ---------------------------------
 # SINGLE-TURN MODEL SETUP
 MODEL_BASE_SINGLE = snapshot_download(repo_id="nvidia/audio-flamingo-3")
 MODEL_BASE_THINK = os.path.join(MODEL_BASE_SINGLE, 'stage35')
+device = "cuda" if torch.cuda.is_available() else "cpu"
 # model_single = llava.load(MODEL_BASE_SINGLE, model_base=None, devices=[0])
 model_single = llava.load(MODEL_BASE_SINGLE, model_base=None)
+model_single = model_single.to(device)
 generation_config_single = model_single.default_generation_config
+# Load the non-LoRA trainable weights for the thinking model (the LoRA adapters are attached below)
 non_lora_trainables = torch.load(
                 os.path.join(MODEL_BASE_THINK, "non_lora_trainables.bin"),
                 map_location="cpu",
 non_lora_trainables = {
         (k[6:] if k.startswith("model.") else k): v for k, v in non_lora_trainables.items()
     }
+# Load model_think as a separate instance for thinking mode
+model_think = llava.load(MODEL_BASE_SINGLE, model_base=None)
+model_think = model_think.to(device)
+model_think.load_state_dict(non_lora_trainables, strict=False)
 model_think = PeftModel.from_pretrained(
+        model_think,
         MODEL_BASE_THINK,
         device_map="auto",
         torch_dtype=torch.float16,
     try:
         sound = llava.Sound(audio_file)
         full_prompt = f"<sound>\n{prompt_text}"
+        response = model_single.generate_content([sound, full_prompt], generation_config=generation_config_single)
         return response
     except Exception as e:
         return f"❌ Error: {str(e)}"