ACloudCenter (with Claude Opus 4.6, 1M context) committed

Commit 3253bc7 · 1 parent: aec877b

Add AI script generator, conversation editor, and cleanup

- Add AI script generation via HF Inference API (Qwen2.5-72B)
with prompt field, ~1000 word limit, respects speaker count
- Replace raw script textbox with dynamic turn-based conversation
editor (add/edit/delete turns, speaker dropdowns per turn)
- Fix voice name mismatch: frontend now uses actual voice file
names (Cherry, Chicago, Janus, Mantis, Sponge, Starchild)
- Clean up duplicate content in Architecture tab
- Reorder UI layout to settings → generate → output flow
- Remove progress slider, consolidate status displays

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
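
For reference, the new editor stores the conversation as a list of turn dicts ({"speaker": int, "text": str}) and serializes it back to the "Speaker N:" format the backend expects. A minimal round trip through the helpers added in app.py below (illustrative sample dialogue; assumes the helpers are in scope):

    script = "Speaker 1: Welcome to the quarterly review.\n\nSpeaker 2: Thanks, let's start with the numbers."

    turns = parse_script_to_turns(script)
    # [{'speaker': 1, 'text': 'Welcome to the quarterly review.'},
    #  {'speaker': 2, 'text': "Thanks, let's start with the numbers."}]

    # Serializing restores the exact "Speaker N:" script.
    assert turns_to_script(turns) == script
    print(estimate_duration(turns))  # "~4 seconds" (11 words at ~150 wpm)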

Files changed (3)
  1. app.py +347 -155
  2. backend_modal/modal_runner.py +30 -15
  3. requirements.txt +2 -1
app.py CHANGED
@@ -1,27 +1,30 @@
  import os
+ import re
  import gradio as gr
  import modal
  import traceback
+ from huggingface_hub import InferenceClient

  # --- Configuration ---
- # This is the name of your Modal stub.
  MODAL_STUB_NAME = "vibevoice-generator"
- MODAL_CLASS_NAME = "VibeVoiceModel"  # Extract class name
- MODAL_METHOD_NAME = "generate_podcast"  # Extract method name
+ MODAL_CLASS_NAME = "VibeVoiceModel"

  AVAILABLE_MODELS = ["VibeVoice-1.5B", "VibeVoice-7B"]
  AVAILABLE_VOICES = ["Cherry", "Chicago", "Janus", "Mantis", "Sponge", "Starchild"]
  DEFAULT_SPEAKERS = ["Cherry", "Chicago", "Janus", "Mantis"]

- # Load example scripts
+ SCRIPT_GEN_MODEL = "Qwen/Qwen2.5-72B-Instruct"
+ SCRIPT_MAX_WORDS = 1000
+
+ # --- Load example scripts ---
  def load_example_scripts():
      examples_dir = "text_examples"
      example_scripts = []
      example_scripts_natural = []

      if not os.path.exists(examples_dir):
          return example_scripts, example_scripts_natural

      original_files = [
          "1p_ai_tedtalk.txt",
          "1p_politcal_speech.txt",
@@ -30,67 +33,152 @@ def load_example_scripts():
          "3p_military_meeting.txt",
          "3p_oil_meeting.txt",
          "4p_gamecreation_meeting.txt",
-         "4p_product_meeting.txt"
+         "4p_product_meeting.txt",
      ]

      for txt_file in original_files:
          file_path = os.path.join(examples_dir, txt_file)
          natural_file = txt_file.replace(".txt", "_natural.txt")
          natural_path = os.path.join(examples_dir, natural_file)

          if os.path.exists(file_path):
-             with open(file_path, 'r', encoding='utf-8') as f:
+             with open(file_path, "r", encoding="utf-8") as f:
                  example_scripts.append(f.read())
          else:
              example_scripts.append("")

          if os.path.exists(natural_path):
-             with open(natural_path, 'r', encoding='utf-8') as f:
+             with open(natural_path, "r", encoding="utf-8") as f:
                  example_scripts_natural.append(f.read())
          else:
-             example_scripts_natural.append(example_scripts[-1] if example_scripts else "")
+             example_scripts_natural.append(
+                 example_scripts[-1] if example_scripts else ""
+             )

      return example_scripts, example_scripts_natural

- # Number of speakers per example script
- SCRIPT_SPEAKER_COUNTS = [1, 1, 2, 2, 3, 3, 4, 4]

+ SCRIPT_SPEAKER_COUNTS = [1, 1, 2, 2, 3, 3, 4, 4]
  EXAMPLE_SCRIPTS, EXAMPLE_SCRIPTS_NATURAL = load_example_scripts()

+ # --- Script parsing helpers ---
+
+ def parse_script_to_turns(script_text: str) -> list[dict]:
+     """Parse a 'Speaker N: text' script into a list of turn dicts."""
+     turns = []
+     if not script_text or not script_text.strip():
+         return turns
+
+     pattern = re.compile(r"^Speaker\s+(\d+)\s*:\s*(.+)", re.IGNORECASE)
+     current_speaker = None
+     current_text = []
+
+     for line in script_text.strip().split("\n"):
+         m = pattern.match(line.strip())
+         if m:
+             if current_speaker is not None:
+                 turns.append({"speaker": current_speaker, "text": " ".join(current_text).strip()})
+             current_speaker = int(m.group(1))
+             current_text = [m.group(2).strip()]
+         elif line.strip():
+             if current_speaker is not None:
+                 current_text.append(line.strip())
+             else:
+                 # Line without a speaker tag — assign to Speaker 1
+                 current_speaker = 1
+                 current_text = [line.strip()]
+
+     if current_speaker is not None and current_text:
+         turns.append({"speaker": current_speaker, "text": " ".join(current_text).strip()})
+
+     return turns
+
+
+ def turns_to_script(turns: list[dict]) -> str:
+     """Convert turn dicts back to 'Speaker N: text' format."""
+     lines = []
+     for t in turns:
+         if t.get("text", "").strip():
+             lines.append(f"Speaker {t['speaker']}: {t['text'].strip()}")
+     return "\n\n".join(lines)
+
+
+ def estimate_duration(turns: list[dict]) -> str:
+     """Estimate audio duration from total word count (~150 wpm)."""
+     total_words = sum(len(t.get("text", "").split()) for t in turns)
+     if total_words == 0:
+         return ""
+     minutes = total_words / 150
+     if minutes < 1:
+         return f"~{int(minutes * 60)} seconds"
+     return f"~{minutes:.1f} minutes"
+
+
+ # --- AI Script Generation ---
+
+ llm_client = InferenceClient(model=SCRIPT_GEN_MODEL)
+
+ SCRIPT_SYSTEM_PROMPT = """You are a script writer. Write a realistic, engaging conversation script.
+
+ RULES:
+ - Use EXACTLY this format for every line: "Speaker N: dialogue text"
+ - N must be a number starting from 1
+ - Each speaker turn is its own paragraph separated by a blank line
+ - Write natural, flowing dialogue — not robotic or overly formal
+ - Include character names and context naturally in the dialogue
+ - Keep the total script under {max_words} words
+ - Use EXACTLY {num_speakers} speakers (Speaker 1 through Speaker {num_speakers})
+ - Do NOT include stage directions, parentheticals, or anything other than dialogue
+ - Output ONLY the script, no preamble or commentary"""
+
+
+ def generate_script_from_prompt(prompt: str, num_speakers: int) -> list[dict]:
+     """Call the HF Inference API to generate a script from a prompt."""
+     system = SCRIPT_SYSTEM_PROMPT.format(
+         max_words=SCRIPT_MAX_WORDS, num_speakers=num_speakers
+     )
+     response = llm_client.chat_completion(
+         messages=[
+             {"role": "system", "content": system},
+             {"role": "user", "content": prompt},
+         ],
+         max_tokens=4096,
+         temperature=0.7,
+     )
+     raw = response.choices[0].message.content
+     return parse_script_to_turns(raw)
+
+
  # --- Modal Connection ---
  try:
-     # Look up the remote class
      RemoteVibeVoiceModel = modal.Cls.from_name(MODAL_STUB_NAME, MODAL_CLASS_NAME)
-     # Create an instance of the remote class
      remote_model_instance = RemoteVibeVoiceModel()
-     # Get the remote method
      remote_generate_function = remote_model_instance.generate_podcast
      print("Successfully connected to Modal function.")
  except modal.exception.NotFoundError:
      print("ERROR: Modal function not found.")
-     print(f"Please deploy the Modal app first by running: modal deploy modal_runner.py")
+     print("Please deploy the Modal app first: modal deploy backend_modal/modal_runner.py")
      remote_generate_function = None

- # --- Gradio UI Definition ---
+ # --- Gradio UI ---
  theme = gr.themes.Ocean(
      primary_hue="indigo",
      secondary_hue="fuchsia",
      neutral_hue="slate",
- ).set(
-     button_large_radius='*radius_sm'
- )
+ ).set(button_large_radius="*radius_sm")

  AUDIO_LABEL_DEFAULT = "Complete Conference (Download)"
  PRIMARY_STAGE_MESSAGES = {
-     "connecting": ("🚀 Request Submitted", "Provisioning GPU resources... cold starts can take up to a minute."),
-     "queued": ("🚦 Waiting For GPU", "Worker is spinning up. Cold starts may take 30-60 seconds."),
-     "loading_model": ("📦 Loading Model", "Streaming VibeVoice weights to the GPU."),
-     "loading_voices": ("🎙️ Loading Voices", None),
-     "preparing_inputs": ("📝 Preparing Script", "Formatting the conversation for the model."),
-     "generating_audio": ("🎧 Generating Audio", "Synthesizing speech — this is the longest step."),
-     "processing_audio": ("Finalizing Audio", "Converting tensors into a playable waveform."),
-     "complete": ("Ready", "Press play below or download your conference."),
-     "error": ("Error", "Check the log for details."),
+     "connecting": ("Request Submitted", "Provisioning GPU resources... cold starts can take up to a minute."),
+     "queued": ("Waiting For GPU", "Worker is spinning up. Cold starts may take 30-60 seconds."),
+     "loading_model": ("Loading Model", "Streaming VibeVoice weights to the GPU."),
+     "loading_voices": ("Loading Voices", None),
+     "preparing_inputs": ("Preparing Script", "Formatting the conversation for the model."),
+     "generating_audio": ("Generating Audio", "Synthesizing speech — this is the longest step."),
+     "processing_audio": ("Finalizing Audio", "Converting tensors into a playable waveform."),
+     "complete": ("Ready", "Press play below or download your conference."),
+     "error": ("Error", "Check the log for details."),
  }
  AUDIO_STAGE_LABELS = {
      "connecting": "Complete Conference (requesting GPU...)",
@@ -102,11 +190,11 @@ AUDIO_STAGE_LABELS = {
      "processing_audio": "Complete Conference (finalizing audio...)",
      "error": "Complete Conference (error)",
  }
- READY_PRIMARY_STATUS = "### Ready\nPress **Generate** to run VibeVoice."
+ READY_PRIMARY_STATUS = "### Ready\nPress **Generate Conference** to run VibeVoice."


  def build_primary_status(stage: str, status_line: str) -> str:
-     title, default_desc = PRIMARY_STAGE_MESSAGES.get(stage, ("⚙️ Working", "Processing..."))
+     title, default_desc = PRIMARY_STAGE_MESSAGES.get(stage, ("Working", "Processing..."))
      desc_parts = []
      if default_desc:
          desc_parts.append(default_desc)
@@ -116,23 +204,33 @@ def build_primary_status(stage: str, status_line: str) -> str:
      return f"### {title}\n{desc}"


+ # --- Build Interface ---
+
  def create_demo_interface():
      with gr.Blocks(
          title="VibeVoice - Conference Generator",
          theme=theme,
      ) as interface:
+         # --- Banner ---
          gr.HTML("""
          <div style="width: 100%; margin-bottom: 20px;">
              <img src="https://huggingface.co/spaces/ACloudCenter/Conference-Generator-VibeVoice/resolve/main/public/images/banner.png"
                   style="width: 100%; height: auto; border-radius: 15px; box-shadow: 0 10px 40px rgba(0,0,0,0.2);"
                   alt="VibeVoice Banner">
          </div>
          """)
+
          with gr.Tabs():
+             # ==================== GENERATE TAB ====================
              with gr.Tab("Generate"):
-                 gr.Markdown("**Tip:** The 1.5B model is recommended — it's much faster with minimal quality difference.")
+                 gr.Markdown("**Tip:** The 1.5B model is recommended — much faster with minimal quality difference.")
+
+                 # --- Conversation state: list of {speaker: int, text: str} ---
+                 turns_state = gr.State([])

+                 # --- Top row: Settings (left) + Script Tools (right) ---
                  with gr.Row():
+                     # ---------- LEFT COLUMN: Settings ----------
                      with gr.Column(scale=1):
                          gr.Markdown("### Settings")
                          model_dropdown = gr.Dropdown(
@@ -150,7 +248,7 @@ def create_demo_interface():
                          speaker = gr.Dropdown(
                              choices=AVAILABLE_VOICES,
                              value=DEFAULT_SPEAKERS[i] if i < len(DEFAULT_SPEAKERS) else None,
-                             label=f"Speaker {i+1}",
+                             label=f"Speaker {i + 1}",
                              visible=(i < 2),
                          )
                          speaker_selections.append(speaker)
@@ -161,66 +259,130 @@ def create_demo_interface():
                          label="CFG Scale (Guidance Strength)",
                      )

+                     # ---------- RIGHT COLUMN: Script creation ----------
                      with gr.Column(scale=2):
-                         script_input = gr.Textbox(
-                             label="Conversation Script",
-                             placeholder="Enter your conference script here...\n\nFormat:\nSpeaker 1: Hello everyone...\nSpeaker 2: Thanks for having me...",
-                             lines=12,
-                             max_lines=20,
-                         )
-
-                         with gr.Row():
-                             use_natural = gr.Checkbox(
-                                 value=True,
-                                 label="Natural talking sounds",
-                                 scale=1,
-                             )
-                             duration_display = gr.Textbox(
-                                 value="",
-                                 label="Est. Duration",
-                                 interactive=False,
-                                 scale=1,
-                             )
-
-                         example_names = [
-                             "AI TED Talk",
-                             "Political Speech",
-                             "Finance IPO Meeting",
-                             "Telehealth Meeting",
-                             "Military Meeting",
-                             "Oil Meeting",
-                             "Game Creation Meeting",
-                             "Product Meeting",
-                         ]
-
-                         example_buttons = []
-                         with gr.Row():
-                             for i in range(min(4, len(example_names))):
-                                 btn = gr.Button(example_names[i], size="sm", variant="secondary")
-                                 example_buttons.append(btn)
-
-                         with gr.Row():
-                             for i in range(4, min(8, len(example_names))):
-                                 btn = gr.Button(example_names[i], size="sm", variant="secondary")
-                                 example_buttons.append(btn)
+                         # --- AI Script Generator ---
+                         with gr.Accordion("Generate a Script with AI", open=True):
+                             gr.Markdown("Describe the conversation you want and AI will write the script for you.")
+                             script_prompt = gr.Textbox(
+                                 label="Prompt",
+                                 placeholder="e.g. A wizard consulting an orc about battle strategy for an upcoming siege",
+                                 lines=2,
+                                 max_lines=4,
+                             )
+                             with gr.Row():
+                                 generate_script_btn = gr.Button(
+                                     "Generate Script", variant="secondary",
+                                 )
+                             script_gen_status = gr.Markdown(value="", visible=False)
+
+                         # --- Example buttons ---
+                         with gr.Accordion("Example Scripts", open=False):
+                             with gr.Row():
+                                 use_natural = gr.Checkbox(
+                                     value=True,
+                                     label="Natural talking sounds",
+                                 )
+                             example_names = [
+                                 "AI TED Talk", "Political Speech",
+                                 "Finance IPO Meeting", "Telehealth Meeting",
+                                 "Military Meeting", "Oil Meeting",
+                                 "Game Creation Meeting", "Product Meeting",
+                             ]
+                             example_buttons = []
+                             with gr.Row():
+                                 for i in range(4):
+                                     btn = gr.Button(example_names[i], size="sm", variant="secondary")
+                                     example_buttons.append(btn)
+                             with gr.Row():
+                                 for i in range(4, 8):
+                                     btn = gr.Button(example_names[i], size="sm", variant="secondary")
+                                     example_buttons.append(btn)
+
+                         # --- Conversation Editor ---
+                         gr.Markdown("### Conversation")
+                         duration_display = gr.Markdown(value="")
+
+                         @gr.render(inputs=[turns_state, num_speakers])
+                         def render_turns(turns, n_speakers):
+                             if not turns:
+                                 gr.Markdown("*No script yet. Generate one with AI above, load an example, or add turns manually.*")
+                             else:
+                                 speaker_choices = [f"Speaker {i + 1}" for i in range(int(n_speakers))]
+                                 for idx, turn in enumerate(turns):
+                                     with gr.Row(key=f"turn-{idx}"):
+                                         spk_dd = gr.Dropdown(
+                                             choices=speaker_choices,
+                                             value=f"Speaker {turn['speaker']}",
+                                             label="",
+                                             scale=1,
+                                             min_width=120,
+                                             container=False,
+                                             key=f"spk-{idx}",
+                                         )
+                                         txt = gr.Textbox(
+                                             value=turn["text"],
+                                             label="",
+                                             lines=2,
+                                             max_lines=6,
+                                             scale=5,
+                                             container=False,
+                                             key=f"txt-{idx}",
+                                         )
+                                         del_btn = gr.Button("X", size="sm", variant="stop", scale=0, min_width=40, key=f"del-{idx}")
+
+                                     # Update turn text when user edits
+                                     def on_text_change(new_text, current_turns, i=idx):
+                                         if i < len(current_turns):
+                                             current_turns[i]["text"] = new_text
+                                         return current_turns
+
+                                     txt.change(
+                                         fn=on_text_change,
+                                         inputs=[txt, turns_state],
+                                         outputs=[turns_state],
+                                         queue=False,
+                                     )
+
+                                     # Update speaker when user changes dropdown
+                                     def on_speaker_change(new_spk, current_turns, i=idx):
+                                         if i < len(current_turns):
+                                             num = int(new_spk.replace("Speaker ", ""))
+                                             current_turns[i]["speaker"] = num
+                                         return current_turns
+
+                                     spk_dd.change(
+                                         fn=on_speaker_change,
+                                         inputs=[spk_dd, turns_state],
+                                         outputs=[turns_state],
+                                         queue=False,
+                                     )
+
+                                     # Delete turn
+                                     def on_delete(current_turns, i=idx):
+                                         if i < len(current_turns):
+                                             current_turns.pop(i)
+                                         return current_turns
+
+                                     del_btn.click(
+                                         fn=on_delete,
+                                         inputs=[turns_state],
+                                         outputs=[turns_state],
+                                     )
+
+                         with gr.Row():
+                             add_turn_btn = gr.Button("+ Add Turn", size="sm", variant="secondary")

+                 # --- Generate Conference ---
                  generate_btn = gr.Button(
-                     "Generate Conference", size="lg",
-                     variant="primary",
+                     "Generate Conference", size="lg", variant="primary",
                  )

+                 # --- Output section ---
                  primary_status = gr.Markdown(
                      value=READY_PRIMARY_STATUS,
                      elem_id="primary-status",
                  )
-                 progress_slider = gr.Slider(
-                     minimum=0,
-                     maximum=100,
-                     value=0,
-                     step=1,
-                     label="Progress",
-                     interactive=False,
-                 )
                  complete_audio_output = gr.Audio(
                      label=AUDIO_LABEL_DEFAULT,
                      type="numpy",
@@ -234,73 +396,112 @@ def create_demo_interface():
                      interactive=False,
                  )

-                 def update_speaker_visibility(num_speakers):
-                     return [gr.update(visible=(i < num_speakers)) for i in range(4)]
-
-                 def estimate_duration(script):
-                     """Estimate duration based on word count."""
-                     if not script:
-                         return ""
-                     words = len(script.split())
-                     # Approximate 150 words per minute for natural speech
-                     minutes = words / 150
-                     if minutes < 1:
-                         return f"~{int(minutes * 60)} seconds"
-                     else:
-                         return f"~{minutes:.1f} minutes"
-
-                 def load_specific_example(idx, natural):
-                     """Load a specific example script."""
-                     if idx >= len(EXAMPLE_SCRIPTS):
-                         return [2, "", ""] + [None, None, None, None]
-
-                     script = EXAMPLE_SCRIPTS_NATURAL[idx] if natural else EXAMPLE_SCRIPTS[idx]
-                     num = SCRIPT_SPEAKER_COUNTS[idx] if idx < len(SCRIPT_SPEAKER_COUNTS) else 1
-                     speakers = AVAILABLE_VOICES[:num]
-                     duration = estimate_duration(script)
-
-                     # Pad speakers to 4
-                     while len(speakers) < 4:
-                         speakers.append(None)
-
-                     return [num, script, duration] + speakers[:4]
-
-                 # Connect example buttons
+                 # ==================== EVENT HANDLERS ====================
+
+                 def update_speaker_visibility(n):
+                     return [gr.update(visible=(i < n)) for i in range(4)]
+
+                 num_speakers.change(
+                     fn=update_speaker_visibility,
+                     inputs=[num_speakers],
+                     outputs=speaker_selections,
+                 )
+
+                 # --- Add turn ---
+                 def add_turn(turns, n_speakers):
+                     if not turns:
+                         next_speaker = 1
+                     else:
+                         last = turns[-1]["speaker"]
+                         next_speaker = (last % int(n_speakers)) + 1
+                     turns.append({"speaker": next_speaker, "text": ""})
+                     return turns, estimate_duration(turns)
+
+                 add_turn_btn.click(
+                     fn=add_turn,
+                     inputs=[turns_state, num_speakers],
+                     outputs=[turns_state, duration_display],
+                 )
+
+                 # --- Update duration whenever turns change ---
+                 def update_duration(turns):
+                     return estimate_duration(turns)
+
+                 turns_state.change(
+                     fn=update_duration,
+                     inputs=[turns_state],
+                     outputs=[duration_display],
+                     queue=False,
+                 )
+
+                 # --- AI Script Generation ---
+                 def on_generate_script(prompt, n_speakers):
+                     if not prompt or not prompt.strip():
+                         gr.Warning("Please enter a prompt describing the conversation.")
+                         return gr.update(), gr.update()
+                     try:
+                         turns = generate_script_from_prompt(prompt.strip(), int(n_speakers))
+                         if not turns:
+                             gr.Warning("The AI returned an empty script. Try a more descriptive prompt.")
+                             return gr.update(), gr.update()
+                         return turns, estimate_duration(turns)
+                     except Exception as e:
+                         gr.Warning(f"Script generation failed: {e}")
+                         return gr.update(), gr.update()
+
+                 generate_script_btn.click(
+                     fn=on_generate_script,
+                     inputs=[script_prompt, num_speakers],
+                     outputs=[turns_state, duration_display],
+                 )
+
+                 # --- Load example scripts ---
+                 def load_example(idx, natural):
+                     if idx >= len(EXAMPLE_SCRIPTS):
+                         return [], 2, "", *[None] * 4
+
+                     script = EXAMPLE_SCRIPTS_NATURAL[idx] if natural else EXAMPLE_SCRIPTS[idx]
+                     num = SCRIPT_SPEAKER_COUNTS[idx] if idx < len(SCRIPT_SPEAKER_COUNTS) else 1
+                     turns = parse_script_to_turns(script)
+
+                     speakers = list(AVAILABLE_VOICES[:num])
+                     while len(speakers) < 4:
+                         speakers.append(None)
+
+                     return turns, num, estimate_duration(turns), *speakers[:4]
+
                  for idx, btn in enumerate(example_buttons):
                      btn.click(
-                         fn=lambda nat, i=idx: load_specific_example(i, nat),
+                         fn=lambda nat, i=idx: load_example(i, nat),
                          inputs=[use_natural],
-                         outputs=[num_speakers, script_input, duration_display] + speaker_selections,
-                         queue=False
+                         outputs=[turns_state, num_speakers, duration_display] + speaker_selections,
+                         queue=False,
                      )

-                 # Update duration when script changes
-                 script_input.change(
-                     fn=estimate_duration,
-                     inputs=[script_input],
-                     outputs=[duration_display],
-                     queue=False
-                 )
-
-                 num_speakers.change(
-                     fn=update_speaker_visibility,
-                     inputs=[num_speakers],
-                     outputs=speaker_selections
-                 )
-
-                 def generate_podcast_wrapper(model_choice, num_speakers_val, script, *speakers_and_params):
+                 # --- Generate Conference (audio) ---
+                 def generate_podcast_wrapper(
+                     model_choice, num_speakers_val, turns, *speakers_and_params
+                 ):
                      if remote_generate_function is None:
                          yield (
                              build_primary_status("error", "Modal backend is offline."),
-                             gr.update(value=0),
                              gr.update(label=AUDIO_STAGE_LABELS.get("error", AUDIO_LABEL_DEFAULT)),
                              "ERROR: Modal function not deployed. Please contact the space owner.",
                          )
                          return

+                     # Assemble turns into script text
+                     script = turns_to_script(turns)
+                     if not script.strip():
+                         yield (
+                             build_primary_status("error", "No script to generate."),
+                             gr.update(label=AUDIO_STAGE_LABELS.get("error", AUDIO_LABEL_DEFAULT)),
+                             "Please add some dialogue before generating.",
+                         )
+                         return
+
                      yield (
                          build_primary_status("connecting", "Provisioning GPU resources... cold starts can take up to a minute."),
-                         gr.update(value=1),
                          gr.update(label=AUDIO_STAGE_LABELS.get("connecting", AUDIO_LABEL_DEFAULT)),
                          "Calling remote GPU on Modal.com...",
                      )
@@ -309,7 +510,6 @@ def create_demo_interface():
                      speakers = speakers_and_params[:4]
                      cfg_scale_val = speakers_and_params[4]
                      current_log = ""
-                     last_pct = 1
                      last_audio_label = AUDIO_STAGE_LABELS.get("connecting", AUDIO_LABEL_DEFAULT)
                      last_stage = "connecting"
@@ -321,18 +521,16 @@ def create_demo_interface():
                              speaker_3=speakers[2],
                              speaker_4=speakers[3],
                              cfg_scale=cfg_scale_val,
-                             model_name=model_choice
+                             model_name=model_choice,
                          ):
                              if not update:
                                  continue

                              if isinstance(update, dict):
                                  audio_payload = update.get("audio")
-                                 progress_pct = update.get("pct", last_pct)
                                  stage_key = update.get("stage", last_stage) or last_stage
                                  status_line = update.get("status") or "Processing..."
                                  current_log = update.get("log", current_log)
-                                 progress_value = max(0, min(100, int(round(progress_pct))))

                                  audio_label = AUDIO_STAGE_LABELS.get(stage_key)
                                  if not audio_label:
@@ -340,8 +538,6 @@ def create_demo_interface():
                                      audio_label = f"Complete Conference ({stage_label.lower()})"
                                  if stage_key == "complete":
                                      audio_label = AUDIO_LABEL_DEFAULT
-                                 if stage_key == "error":
-                                     progress_value = 0

                                  audio_update = gr.update(label=audio_label)
                                  if audio_payload is not None:
@@ -349,32 +545,29 @@ def create_demo_interface():

                                  yield (
                                      build_primary_status(stage_key, status_line),
-                                     gr.update(value=progress_value),
                                      audio_update,
                                      current_log,
                                  )

-                                 last_pct = progress_value
                                  last_audio_label = audio_label
                                  last_stage = stage_key
                              else:
-                                 audio_payload, log_text = update if isinstance(update, (tuple, list)) else (None, str(update))
+                                 audio_payload, log_text = (
+                                     update if isinstance(update, (tuple, list)) else (None, str(update))
+                                 )
                                  if log_text:
                                      current_log = log_text

                                  if audio_payload is not None:
-                                     audio_update = gr.update(value=audio_payload, label=AUDIO_LABEL_DEFAULT)
                                      yield (
                                          build_primary_status("complete", "Conference ready to download."),
-                                         gr.update(value=100),
-                                         audio_update,
+                                         gr.update(value=audio_payload, label=AUDIO_LABEL_DEFAULT),
                                          current_log,
                                      )
                                  else:
                                      status_line = current_log.splitlines()[-1] if current_log else "Processing..."
                                      yield (
                                          build_primary_status("generating_audio", status_line),
-                                         gr.update(value=max(last_pct, 70)),
                                          gr.update(label=AUDIO_STAGE_LABELS.get("generating_audio", last_audio_label)),
                                          current_log,
                                      )
@@ -383,23 +576,25 @@ def create_demo_interface():
                          print(f"Error calling Modal: {e}")
                          yield (
                              build_primary_status("error", "Inference failed."),
-                             gr.update(value=0),
                              gr.update(label=AUDIO_STAGE_LABELS.get("error", AUDIO_LABEL_DEFAULT)),
                              f"An error occurred: {e}\n\n{tb}",
                          )

                  generate_btn.click(
                      fn=generate_podcast_wrapper,
-                     inputs=[model_dropdown, num_speakers, script_input] + speaker_selections + [cfg_scale],
-                     outputs=[primary_status, progress_slider, complete_audio_output, log_output],
+                     inputs=[model_dropdown, num_speakers, turns_state] + speaker_selections + [cfg_scale],
+                     outputs=[primary_status, complete_audio_output, log_output],
                  )

+             # ==================== ARCHITECTURE TAB ====================
              with gr.Tab("Architecture"):
                  gr.Markdown("## VibeVoice: A Frontier Open-Source Text-to-Speech Model")
-                 gr.Markdown("""VibeVoice is a novel framework designed for generating expressive, long-form, multi-speaker
-                 conversational audio from text. It addresses challenges in traditional TTS systems — scalability, speaker
-                 consistency, and natural turn-taking — using continuous speech tokenizers at an ultra-low 7.5 Hz frame rate
-                 and a next-token diffusion framework. It can synthesize speech up to 90 minutes long with up to 4 distinct speakers.""")
+                 gr.Markdown(
+                     """VibeVoice is a novel framework designed for generating expressive, long-form, multi-speaker
+                 conversational audio from text. It addresses challenges in traditional TTS systems — scalability, speaker
+                 consistency, and natural turn-taking — using continuous speech tokenizers at an ultra-low 7.5 Hz frame rate
+                 and a next-token diffusion framework. It can synthesize speech up to 90 minutes long with up to 4 distinct speakers."""
+                 )

                  with gr.Row():
                      with gr.Column():
@@ -438,20 +633,17 @@ def create_demo_interface():
                  )
      return interface

- # --- Main Execution ---
+
+ # --- Main ---
  if __name__ == "__main__":
      if remote_generate_function is None:
-         # If Modal isn't set up, we can't launch the full app.
-         # We'll show a simplified UI with an error message.
          with gr.Blocks(theme=theme) as interface:
              gr.Markdown("# Configuration Error")
              gr.Markdown(
                  "The Gradio application cannot connect to the Modal backend. "
-                 "The Modal app has not been deployed yet. "
-                 "Please run `modal deploy modal_runner.py` in your terminal and then refresh this page."
+                 "Please run `modal deploy backend_modal/modal_runner.py` and refresh."
              )
          interface.launch()
      else:
-         # Launch the full Gradio interface
          interface = create_demo_interface()
          interface.queue().launch(show_error=True)
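
A note on the editor pattern above: the turn rows are rebuilt from turns_state on every change via @gr.render, and each row's handlers capture their own index through a default argument (i=idx) so the closures do not all bind the final loop value. A stripped-down, illustrative sketch of the same pattern (not the app's code; assumes a recent Gradio release that ships @gr.render):

    import gradio as gr

    with gr.Blocks() as demo:
        items = gr.State(["first", "second"])

        @gr.render(inputs=[items])
        def render_items(current):
            # Components are recreated from state on every re-render.
            for idx, text in enumerate(current):
                box = gr.Textbox(value=text, key=f"item-{idx}")

                # Default arg pins this row's index at definition time.
                def on_change(new_text, state, i=idx):
                    state[i] = new_text
                    return state

                box.change(on_change, inputs=[box, items], outputs=[items])

    demo.launch()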
backend_modal/modal_runner.py CHANGED
@@ -30,6 +30,14 @@ image = (
          "librosa",
          "pydub",
      )
+     .run_commands(
+         "mkdir -p /root/vibevoice",
+         "touch /root/vibevoice/__init__.py",
+         "ln -s /root/modular /root/vibevoice/modular",
+         "ln -s /root/processor /root/vibevoice/processor",
+         "ln -s /root/voices /root/vibevoice/voices",
+         "ln -s /root/schedule /root/vibevoice/schedule"
+     )
      .add_local_dir("backend_modal/modular", remote_path="/root/modular")
      .add_local_dir("backend_modal/processor", remote_path="/root/processor")
      .add_local_dir("backend_modal/voices", remote_path="/root/voices")
@@ -51,7 +59,9 @@ cache_volume = modal.Volume.from_name("vibevoice-cache", create_if_missing=True)
      volumes={"/cache": cache_volume}
  )
  class VibeVoiceModel:
-     def __init__(self):
+     @modal.enter()
+     def load_models(self):
+         """Run once when the container starts. Loads both models to GPU."""
          self.model_paths = {
              "VibeVoice-1.5B": "microsoft/VibeVoice-1.5B",
              "VibeVoice-7B": "vibevoice/VibeVoice-7B",
@@ -61,17 +71,11 @@ class VibeVoiceModel:
          self.cache_dir = "/cache"
          self.max_cache_size_gb = 10  # Limit cache to 10GB

-     @modal.enter()
-     def load_models(self):
-         """
-         This method is run once when the container starts.
-         With A10G (24GB), we can load both models to GPU.
-         """
          # Project-specific imports are moved here to run inside the container
          from modular.modeling_vibevoice_inference import VibeVoiceForConditionalGenerationInference
          from processor.vibevoice_processor import VibeVoiceProcessor

-         print("Entering container and loading models to GPU (A10G with 24GB)...")
+         print("Entering container and loading models to GPU...")

          # Set compiler flags for better performance
          if torch.cuda.is_available() and hasattr(torch, '_inductor'):
@@ -104,11 +108,9 @@ class VibeVoiceModel:

          self.setup_voice_presets()
          print("Model loading complete.")

      def _place_model(self, target_name: str):
-         """
-         With A10G, both models stay on GPU. Just update the current model.
-         """
+         """Both models stay on GPU. Just update the active selection."""
          self.current_model_name = target_name
          print(f"Switched to model {target_name}")

@@ -297,7 +299,6 @@ class VibeVoiceModel:
          if model_name not in self.models:
              raise ValueError(f"Unknown model: {model_name}")

-         # Initialize log scaffold
          selected_speakers = [speaker_1, speaker_2, speaker_3, speaker_4][:num_speakers]
          log_lines = [
              f"Generating conference with {num_speakers} speakers",
@@ -307,7 +308,20 @@ class VibeVoiceModel:
          ]
          log_text = "\n".join(log_lines)

-         # Emit initial status before heavy work kicks in
+         # Check cache first
+         cache_key = self._generate_cache_key(script, model_name, selected_speakers, cfg_scale)
+         cached_audio, cached_sr = self._get_cached_audio(cache_key)
+         if cached_audio is not None:
+             log_lines.append("Cache hit! Returning previously generated audio.")
+             log_text = "\n".join(log_lines)
+             yield self._emit_progress(
+                 stage="complete", pct=100,
+                 status="Loaded from cache.",
+                 log_text=log_text,
+                 audio=(cached_sr, cached_audio), done=True,
+             )
+             return
+
          yield self._emit_progress(
              stage="queued",
              pct=5,
@@ -475,10 +489,11 @@ class VibeVoiceModel:
          sample_rate = 24000
          total_duration = len(audio) / sample_rate
          log_lines.append(f"Audio duration: {total_duration:.2f} seconds")
+
+         self._save_to_cache(cache_key, audio, sample_rate)
          log_lines.append("Complete!")
          log_text = "\n".join(log_lines)

-         # Final yield with both audio and complete log
          yield self._emit_progress(
              stage="complete",
              pct=100,
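
The cache-check block added above relies on three helpers that live elsewhere in modal_runner.py and are not part of this diff: _generate_cache_key, _get_cached_audio, and _save_to_cache. A hypothetical sketch of their shape, inferred only from the call sites (the real implementations may differ):

    import hashlib
    import os

    import numpy as np

    def _generate_cache_key(self, script, model_name, speakers, cfg_scale):
        # Hash every input that affects the rendered audio.
        payload = f"{model_name}|{cfg_scale}|{'|'.join(speakers)}|{script}"
        return hashlib.sha256(payload.encode("utf-8")).hexdigest()

    def _get_cached_audio(self, cache_key):
        # Returns (audio, sample_rate), or (None, None) on a miss.
        path = os.path.join(self.cache_dir, f"{cache_key}.npz")
        if not os.path.exists(path):
            return None, None
        data = np.load(path)
        return data["audio"], int(data["sample_rate"])

    def _save_to_cache(self, cache_key, audio, sample_rate):
        path = os.path.join(self.cache_dir, f"{cache_key}.npz")
        np.savez(path, audio=audio, sample_rate=sample_rate)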
requirements.txt CHANGED
@@ -1,2 +1,3 @@
  gradio
- modal
+ modal
+ huggingface_hub