ACloudCenter (with Claude Opus 4.6, 1M context) committed

Commit 3253bc7 · 1 parent: aec877b

Add AI script generator, conversation editor, and cleanup

- Add AI script generation via HF Inference API (Qwen2.5-72B)
with prompt field, ~1000 word limit, respects speaker count
- Replace raw script textbox with dynamic turn-based conversation
editor (add/edit/delete turns, speaker dropdowns per turn)
- Fix voice name mismatch: frontend now uses actual voice file
names (Cherry, Chicago, Janus, Mantis, Sponge, Starchild)
- Clean up duplicate content in Architecture tab
- Reorder UI layout to settings → generate → output flow
- Remove progress slider, consolidate status displays

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
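
For reference, the new editor stores the conversation as a list of turn dicts ({"speaker": int, "text": str}) and serializes it back to the "Speaker N:" format the backend expects. A minimal round trip through the helpers added in app.py below (illustrative sample dialogue; assumes the helpers are in scope):

    script = "Speaker 1: Welcome to the quarterly review.\n\nSpeaker 2: Thanks, let's start with the numbers."

    turns = parse_script_to_turns(script)
    # [{'speaker': 1, 'text': 'Welcome to the quarterly review.'},
    #  {'speaker': 2, 'text': "Thanks, let's start with the numbers."}]

    # Serializing restores the exact "Speaker N:" script.
    assert turns_to_script(turns) == script
    print(estimate_duration(turns))  # "~4 seconds" (11 words at ~150 wpm)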

Files changed (3)
  1. app.py +347 -155
  2. backend_modal/modal_runner.py +30 -15
  3. requirements.txt +2 -1
app.py CHANGED
@@ -1,27 +1,30 @@
  import os
+ import re
  import gradio as gr
  import modal
  import traceback
+ from huggingface_hub import InferenceClient

  # --- Configuration ---
- # This is the name of your Modal stub.
  MODAL_STUB_NAME = "vibevoice-generator"
- MODAL_CLASS_NAME = "VibeVoiceModel"  # Extract class name
- MODAL_METHOD_NAME = "generate_podcast"  # Extract method name
+ MODAL_CLASS_NAME = "VibeVoiceModel"

  AVAILABLE_MODELS = ["VibeVoice-1.5B", "VibeVoice-7B"]
  AVAILABLE_VOICES = ["Cherry", "Chicago", "Janus", "Mantis", "Sponge", "Starchild"]
  DEFAULT_SPEAKERS = ["Cherry", "Chicago", "Janus", "Mantis"]

- # Load example scripts
+ SCRIPT_GEN_MODEL = "Qwen/Qwen2.5-72B-Instruct"
+ SCRIPT_MAX_WORDS = 1000
+
+ # --- Load example scripts ---
  def load_example_scripts():
      examples_dir = "text_examples"
      example_scripts = []
      example_scripts_natural = []

      if not os.path.exists(examples_dir):
          return example_scripts, example_scripts_natural

      original_files = [
          "1p_ai_tedtalk.txt",
          "1p_politcal_speech.txt",
@@ -30,67 +33,152 @@ def load_example_scripts():
          "3p_military_meeting.txt",
          "3p_oil_meeting.txt",
          "4p_gamecreation_meeting.txt",
-         "4p_product_meeting.txt"
+         "4p_product_meeting.txt",
      ]

      for txt_file in original_files:
          file_path = os.path.join(examples_dir, txt_file)
          natural_file = txt_file.replace(".txt", "_natural.txt")
          natural_path = os.path.join(examples_dir, natural_file)

          if os.path.exists(file_path):
-             with open(file_path, 'r', encoding='utf-8') as f:
+             with open(file_path, "r", encoding="utf-8") as f:
                  example_scripts.append(f.read())
          else:
              example_scripts.append("")

          if os.path.exists(natural_path):
-             with open(natural_path, 'r', encoding='utf-8') as f:
+             with open(natural_path, "r", encoding="utf-8") as f:
                  example_scripts_natural.append(f.read())
          else:
-             example_scripts_natural.append(example_scripts[-1] if example_scripts else "")
+             example_scripts_natural.append(
+                 example_scripts[-1] if example_scripts else ""
+             )

      return example_scripts, example_scripts_natural

- # Number of speakers per example script
- SCRIPT_SPEAKER_COUNTS = [1, 1, 2, 2, 3, 3, 4, 4]

+ SCRIPT_SPEAKER_COUNTS = [1, 1, 2, 2, 3, 3, 4, 4]
  EXAMPLE_SCRIPTS, EXAMPLE_SCRIPTS_NATURAL = load_example_scripts()

+ # --- Script parsing helpers ---
+
+ def parse_script_to_turns(script_text: str) -> list[dict]:
+     """Parse a 'Speaker N: text' script into a list of turn dicts."""
+     turns = []
+     if not script_text or not script_text.strip():
+         return turns
+
+     pattern = re.compile(r"^Speaker\s+(\d+)\s*:\s*(.+)", re.IGNORECASE)
+     current_speaker = None
+     current_text = []
+
+     for line in script_text.strip().split("\n"):
+         m = pattern.match(line.strip())
+         if m:
+             if current_speaker is not None:
+                 turns.append({"speaker": current_speaker, "text": " ".join(current_text).strip()})
+             current_speaker = int(m.group(1))
+             current_text = [m.group(2).strip()]
+         elif line.strip():
+             if current_speaker is not None:
+                 current_text.append(line.strip())
+             else:
+                 # Line without a speaker tag — assign to Speaker 1
+                 current_speaker = 1
+                 current_text = [line.strip()]
+
+     if current_speaker is not None and current_text:
+         turns.append({"speaker": current_speaker, "text": " ".join(current_text).strip()})
+
+     return turns
+
+
+ def turns_to_script(turns: list[dict]) -> str:
+     """Convert turn dicts back to 'Speaker N: text' format."""
+     lines = []
+     for t in turns:
+         if t.get("text", "").strip():
+             lines.append(f"Speaker {t['speaker']}: {t['text'].strip()}")
+     return "\n\n".join(lines)
+
+
+ def estimate_duration(turns: list[dict]) -> str:
+     """Estimate audio duration from total word count (~150 wpm)."""
+     total_words = sum(len(t.get("text", "").split()) for t in turns)
+     if total_words == 0:
+         return ""
+     minutes = total_words / 150
+     if minutes < 1:
+         return f"~{int(minutes * 60)} seconds"
+     return f"~{minutes:.1f} minutes"
+
+
+ # --- AI Script Generation ---
+
+ llm_client = InferenceClient(model=SCRIPT_GEN_MODEL)
+
+ SCRIPT_SYSTEM_PROMPT = """You are a script writer. Write a realistic, engaging conversation script.
+
+ RULES:
+ - Use EXACTLY this format for every line: "Speaker N: dialogue text"
+ - N must be a number starting from 1
+ - Each speaker turn is its own paragraph separated by a blank line
+ - Write natural, flowing dialogue — not robotic or overly formal
+ - Include character names and context naturally in the dialogue
+ - Keep the total script under {max_words} words
+ - Use EXACTLY {num_speakers} speakers (Speaker 1 through Speaker {num_speakers})
+ - Do NOT include stage directions, parentheticals, or anything other than dialogue
+ - Output ONLY the script, no preamble or commentary"""
+
+
+ def generate_script_from_prompt(prompt: str, num_speakers: int) -> list[dict]:
+     """Call the HF Inference API to generate a script from a prompt."""
+     system = SCRIPT_SYSTEM_PROMPT.format(
+         max_words=SCRIPT_MAX_WORDS, num_speakers=num_speakers
+     )
+     response = llm_client.chat_completion(
+         messages=[
+             {"role": "system", "content": system},
+             {"role": "user", "content": prompt},
+         ],
+         max_tokens=4096,
+         temperature=0.7,
+     )
+     raw = response.choices[0].message.content
+     return parse_script_to_turns(raw)
+
+
  # --- Modal Connection ---
  try:
-     # Look up the remote class
      RemoteVibeVoiceModel = modal.Cls.from_name(MODAL_STUB_NAME, MODAL_CLASS_NAME)
-     # Create an instance of the remote class
      remote_model_instance = RemoteVibeVoiceModel()
-     # Get the remote method
      remote_generate_function = remote_model_instance.generate_podcast
      print("Successfully connected to Modal function.")
  except modal.exception.NotFoundError:
      print("ERROR: Modal function not found.")
-     print(f"Please deploy the Modal app first by running: modal deploy modal_runner.py")
+     print("Please deploy the Modal app first: modal deploy backend_modal/modal_runner.py")
      remote_generate_function = None

- # --- Gradio UI Definition ---
+ # --- Gradio UI ---
  theme = gr.themes.Ocean(
      primary_hue="indigo",
      secondary_hue="fuchsia",
      neutral_hue="slate",
- ).set(
-     button_large_radius='*radius_sm'
- )
+ ).set(button_large_radius="*radius_sm")

  AUDIO_LABEL_DEFAULT = "Complete Conference (Download)"
  PRIMARY_STAGE_MESSAGES = {
-     "connecting": ("🚀 Request Submitted", "Provisioning GPU resources... cold starts can take up to a minute."),
-     "queued": ("🚦 Waiting For GPU", "Worker is spinning up. Cold starts may take 30-60 seconds."),
-     "loading_model": ("📦 Loading Model", "Streaming VibeVoice weights to the GPU."),
-     "loading_voices": ("🎙️ Loading Voices", None),
-     "preparing_inputs": ("📝 Preparing Script", "Formatting the conversation for the model."),
-     "generating_audio": ("🎧 Generating Audio", "Synthesizing speech — this is the longest step."),
-     "processing_audio": ("Finalizing Audio", "Converting tensors into a playable waveform."),
-     "complete": ("Ready", "Press play below or download your conference."),
-     "error": ("Error", "Check the log for details."),
+     "connecting": ("Request Submitted", "Provisioning GPU resources... cold starts can take up to a minute."),
+     "queued": ("Waiting For GPU", "Worker is spinning up. Cold starts may take 30-60 seconds."),
+     "loading_model": ("Loading Model", "Streaming VibeVoice weights to the GPU."),
+     "loading_voices": ("Loading Voices", None),
+     "preparing_inputs": ("Preparing Script", "Formatting the conversation for the model."),
+     "generating_audio": ("Generating Audio", "Synthesizing speech — this is the longest step."),
+     "processing_audio": ("Finalizing Audio", "Converting tensors into a playable waveform."),
+     "complete": ("Ready", "Press play below or download your conference."),
+     "error": ("Error", "Check the log for details."),
  }
  AUDIO_STAGE_LABELS = {
      "connecting": "Complete Conference (requesting GPU...)",
@@ -102,11 +190,11 @@ AUDIO_STAGE_LABELS = {
      "processing_audio": "Complete Conference (finalizing audio...)",
      "error": "Complete Conference (error)",
  }
- READY_PRIMARY_STATUS = "### Ready\nPress **Generate** to run VibeVoice."
+ READY_PRIMARY_STATUS = "### Ready\nPress **Generate Conference** to run VibeVoice."


  def build_primary_status(stage: str, status_line: str) -> str:
-     title, default_desc = PRIMARY_STAGE_MESSAGES.get(stage, ("⚙️ Working", "Processing..."))
+     title, default_desc = PRIMARY_STAGE_MESSAGES.get(stage, ("Working", "Processing..."))
      desc_parts = []
      if default_desc:
          desc_parts.append(default_desc)
@@ -116,23 +204,33 @@ def build_primary_status(stage: str, status_line: str) -> str:
      return f"### {title}\n{desc}"


+ # --- Build Interface ---
+
  def create_demo_interface():
      with gr.Blocks(
          title="VibeVoice - Conference Generator",
          theme=theme,
      ) as interface:
+         # --- Banner ---
          gr.HTML("""
          <div style="width: 100%; margin-bottom: 20px;">
              <img src="https://huggingface.co/spaces/ACloudCenter/Conference-Generator-VibeVoice/resolve/main/public/images/banner.png"
                   style="width: 100%; height: auto; border-radius: 15px; box-shadow: 0 10px 40px rgba(0,0,0,0.2);"
                   alt="VibeVoice Banner">
          </div>
          """)
+
          with gr.Tabs():
+             # ==================== GENERATE TAB ====================
              with gr.Tab("Generate"):
-                 gr.Markdown("**Tip:** The 1.5B model is recommended — it's much faster with minimal quality difference.")
+                 gr.Markdown("**Tip:** The 1.5B model is recommended — much faster with minimal quality difference.")
+
+                 # --- Conversation state: list of {speaker: int, text: str} ---
+                 turns_state = gr.State([])

+                 # --- Top row: Settings (left) + Script Tools (right) ---
                  with gr.Row():
+                     # ---------- LEFT COLUMN: Settings ----------
                      with gr.Column(scale=1):
                          gr.Markdown("### Settings")
                          model_dropdown = gr.Dropdown(
@@ -150,7 +248,7 @@ def create_demo_interface():
                          speaker = gr.Dropdown(
                              choices=AVAILABLE_VOICES,
                              value=DEFAULT_SPEAKERS[i] if i < len(DEFAULT_SPEAKERS) else None,
-                             label=f"Speaker {i+1}",
+                             label=f"Speaker {i + 1}",
                              visible=(i < 2),
                          )
                          speaker_selections.append(speaker)
@@ -161,66 +259,130 @@ def create_demo_interface():
                          label="CFG Scale (Guidance Strength)",
                      )

+                     # ---------- RIGHT COLUMN: Script creation ----------
                      with gr.Column(scale=2):
-                         script_input = gr.Textbox(
-                             label="Conversation Script",
-                             placeholder="Enter your conference script here...\n\nFormat:\nSpeaker 1: Hello everyone...\nSpeaker 2: Thanks for having me...",
-                             lines=12,
-                             max_lines=20,
-                         )
-
-                         with gr.Row():
-                             use_natural = gr.Checkbox(
-                                 value=True,
-                                 label="Natural talking sounds",
-                                 scale=1,
-                             )
-                             duration_display = gr.Textbox(
-                                 value="",
-                                 label="Est. Duration",
-                                 interactive=False,
-                                 scale=1,
-                             )
-
-                         example_names = [
-                             "AI TED Talk",
-                             "Political Speech",
-                             "Finance IPO Meeting",
-                             "Telehealth Meeting",
-                             "Military Meeting",
-                             "Oil Meeting",
-                             "Game Creation Meeting",
-                             "Product Meeting",
-                         ]
-
-                         example_buttons = []
-                         with gr.Row():
-                             for i in range(min(4, len(example_names))):
-                                 btn = gr.Button(example_names[i], size="sm", variant="secondary")
-                                 example_buttons.append(btn)
-
-                         with gr.Row():
-                             for i in range(4, min(8, len(example_names))):
-                                 btn = gr.Button(example_names[i], size="sm", variant="secondary")
-                                 example_buttons.append(btn)
+                         # --- AI Script Generator ---
+                         with gr.Accordion("Generate a Script with AI", open=True):
+                             gr.Markdown("Describe the conversation you want and AI will write the script for you.")
+                             script_prompt = gr.Textbox(
+                                 label="Prompt",
+                                 placeholder="e.g. A wizard consulting an orc about battle strategy for an upcoming siege",
+                                 lines=2,
+                                 max_lines=4,
+                             )
+                             with gr.Row():
+                                 generate_script_btn = gr.Button(
+                                     "Generate Script", variant="secondary",
+                                 )
+                             script_gen_status = gr.Markdown(value="", visible=False)
+
+                         # --- Example buttons ---
+                         with gr.Accordion("Example Scripts", open=False):
+                             with gr.Row():
+                                 use_natural = gr.Checkbox(
+                                     value=True,
+                                     label="Natural talking sounds",
+                                 )
+                             example_names = [
+                                 "AI TED Talk", "Political Speech",
+                                 "Finance IPO Meeting", "Telehealth Meeting",
+                                 "Military Meeting", "Oil Meeting",
+                                 "Game Creation Meeting", "Product Meeting",
+                             ]
+                             example_buttons = []
+                             with gr.Row():
+                                 for i in range(4):
+                                     btn = gr.Button(example_names[i], size="sm", variant="secondary")
+                                     example_buttons.append(btn)
+                             with gr.Row():
+                                 for i in range(4, 8):
+                                     btn = gr.Button(example_names[i], size="sm", variant="secondary")
+                                     example_buttons.append(btn)
+
+                         # --- Conversation Editor ---
+                         gr.Markdown("### Conversation")
+                         duration_display = gr.Markdown(value="")
+
+                         @gr.render(inputs=[turns_state, num_speakers])
+                         def render_turns(turns, n_speakers):
+                             if not turns:
+                                 gr.Markdown("*No script yet. Generate one with AI above, load an example, or add turns manually.*")
+                             else:
+                                 speaker_choices = [f"Speaker {i + 1}" for i in range(int(n_speakers))]
+                                 for idx, turn in enumerate(turns):
+                                     with gr.Row(key=f"turn-{idx}"):
+                                         spk_dd = gr.Dropdown(
+                                             choices=speaker_choices,
+                                             value=f"Speaker {turn['speaker']}",
+                                             label="",
+                                             scale=1,
+                                             min_width=120,
+                                             container=False,
+                                             key=f"spk-{idx}",
+                                         )
+                                         txt = gr.Textbox(
+                                             value=turn["text"],
+                                             label="",
+                                             lines=2,
+                                             max_lines=6,
+                                             scale=5,
+                                             container=False,
+                                             key=f"txt-{idx}",
+                                         )
+                                         del_btn = gr.Button("X", size="sm", variant="stop", scale=0, min_width=40, key=f"del-{idx}")
+
+                                     # Update turn text when user edits
+                                     def on_text_change(new_text, current_turns, i=idx):
+                                         if i < len(current_turns):
+                                             current_turns[i]["text"] = new_text
+                                         return current_turns
+
+                                     txt.change(
+                                         fn=on_text_change,
+                                         inputs=[txt, turns_state],
+                                         outputs=[turns_state],
+                                         queue=False,
+                                     )
+
+                                     # Update speaker when user changes dropdown
+                                     def on_speaker_change(new_spk, current_turns, i=idx):
+                                         if i < len(current_turns):
+                                             num = int(new_spk.replace("Speaker ", ""))
+                                             current_turns[i]["speaker"] = num
+                                         return current_turns
+
+                                     spk_dd.change(
+                                         fn=on_speaker_change,
+                                         inputs=[spk_dd, turns_state],
+                                         outputs=[turns_state],
+                                         queue=False,
+                                     )
+
+                                     # Delete turn
+                                     def on_delete(current_turns, i=idx):
+                                         if i < len(current_turns):
+                                             current_turns.pop(i)
+                                         return current_turns
+
+                                     del_btn.click(
+                                         fn=on_delete,
+                                         inputs=[turns_state],
+                                         outputs=[turns_state],
+                                     )
+
+                         with gr.Row():
+                             add_turn_btn = gr.Button("+ Add Turn", size="sm", variant="secondary")

+                 # --- Generate Conference ---
                  generate_btn = gr.Button(
-                     "Generate Conference", size="lg",
-                     variant="primary",
+                     "Generate Conference", size="lg", variant="primary",
                  )

+                 # --- Output section ---
                  primary_status = gr.Markdown(
                      value=READY_PRIMARY_STATUS,
                      elem_id="primary-status",
                  )
-                 progress_slider = gr.Slider(
-                     minimum=0,
-                     maximum=100,
-                     value=0,
-                     step=1,
-                     label="Progress",
-                     interactive=False,
-                 )
                  complete_audio_output = gr.Audio(
                      label=AUDIO_LABEL_DEFAULT,
                      type="numpy",
@@ -234,73 +396,112 @@ def create_demo_interface():
                      interactive=False,
                  )

-                 def update_speaker_visibility(num_speakers):
-                     return [gr.update(visible=(i < num_speakers)) for i in range(4)]
-
-                 def estimate_duration(script):
-                     """Estimate duration based on word count."""
-                     if not script:
-                         return ""
-                     words = len(script.split())
-                     # Approximate 150 words per minute for natural speech
-                     minutes = words / 150
-                     if minutes < 1:
-                         return f"~{int(minutes * 60)} seconds"
-                     else:
-                         return f"~{minutes:.1f} minutes"
-
-                 def load_specific_example(idx, natural):
-                     """Load a specific example script."""
-                     if idx >= len(EXAMPLE_SCRIPTS):
-                         return [2, "", ""] + [None, None, None, None]
-
-                     script = EXAMPLE_SCRIPTS_NATURAL[idx] if natural else EXAMPLE_SCRIPTS[idx]
-                     num = SCRIPT_SPEAKER_COUNTS[idx] if idx < len(SCRIPT_SPEAKER_COUNTS) else 1
-                     speakers = AVAILABLE_VOICES[:num]
-                     duration = estimate_duration(script)
-
-                     # Pad speakers to 4
-                     while len(speakers) < 4:
-                         speakers.append(None)
-
-                     return [num, script, duration] + speakers[:4]
-
-                 # Connect example buttons
+                 # ==================== EVENT HANDLERS ====================
+
+                 def update_speaker_visibility(n):
+                     return [gr.update(visible=(i < n)) for i in range(4)]
+
+                 num_speakers.change(
+                     fn=update_speaker_visibility,
+                     inputs=[num_speakers],
+                     outputs=speaker_selections,
+                 )
+
+                 # --- Add turn ---
+                 def add_turn(turns, n_speakers):
+                     if not turns:
+                         next_speaker = 1
+                     else:
+                         last = turns[-1]["speaker"]
+                         next_speaker = (last % int(n_speakers)) + 1
+                     turns.append({"speaker": next_speaker, "text": ""})
+                     return turns, estimate_duration(turns)
+
+                 add_turn_btn.click(
+                     fn=add_turn,
+                     inputs=[turns_state, num_speakers],
+                     outputs=[turns_state, duration_display],
+                 )
+
+                 # --- Update duration whenever turns change ---
+                 def update_duration(turns):
+                     return estimate_duration(turns)
+
+                 turns_state.change(
+                     fn=update_duration,
+                     inputs=[turns_state],
+                     outputs=[duration_display],
+                     queue=False,
+                 )
+
+                 # --- AI Script Generation ---
+                 def on_generate_script(prompt, n_speakers):
+                     if not prompt or not prompt.strip():
+                         gr.Warning("Please enter a prompt describing the conversation.")
+                         return gr.update(), gr.update()
+                     try:
+                         turns = generate_script_from_prompt(prompt.strip(), int(n_speakers))
+                         if not turns:
+                             gr.Warning("The AI returned an empty script. Try a more descriptive prompt.")
+                             return gr.update(), gr.update()
+                         return turns, estimate_duration(turns)
+                     except Exception as e:
+                         gr.Warning(f"Script generation failed: {e}")
+                         return gr.update(), gr.update()
+
+                 generate_script_btn.click(
+                     fn=on_generate_script,
+                     inputs=[script_prompt, num_speakers],
+                     outputs=[turns_state, duration_display],
+                 )
+
+                 # --- Load example scripts ---
+                 def load_example(idx, natural):
+                     if idx >= len(EXAMPLE_SCRIPTS):
+                         return [], 2, "", *[None] * 4
+
+                     script = EXAMPLE_SCRIPTS_NATURAL[idx] if natural else EXAMPLE_SCRIPTS[idx]
+                     num = SCRIPT_SPEAKER_COUNTS[idx] if idx < len(SCRIPT_SPEAKER_COUNTS) else 1
+                     turns = parse_script_to_turns(script)
+
+                     speakers = list(AVAILABLE_VOICES[:num])
+                     while len(speakers) < 4:
+                         speakers.append(None)
+
+                     return turns, num, estimate_duration(turns), *speakers[:4]
+
                  for idx, btn in enumerate(example_buttons):
                      btn.click(
-                         fn=lambda nat, i=idx: load_specific_example(i, nat),
+                         fn=lambda nat, i=idx: load_example(i, nat),
                          inputs=[use_natural],
-                         outputs=[num_speakers, script_input, duration_display] + speaker_selections,
-                         queue=False
+                         outputs=[turns_state, num_speakers, duration_display] + speaker_selections,
+                         queue=False,
                      )

-                 # Update duration when script changes
-                 script_input.change(
-                     fn=estimate_duration,
-                     inputs=[script_input],
-                     outputs=[duration_display],
-                     queue=False
-                 )
-
-                 num_speakers.change(
-                     fn=update_speaker_visibility,
-                     inputs=[num_speakers],
-                     outputs=speaker_selections
-                 )
-
-                 def generate_podcast_wrapper(model_choice, num_speakers_val, script, *speakers_and_params):
+                 # --- Generate Conference (audio) ---
+                 def generate_podcast_wrapper(
+                     model_choice, num_speakers_val, turns, *speakers_and_params
+                 ):
                      if remote_generate_function is None:
                          yield (
                              build_primary_status("error", "Modal backend is offline."),
-                             gr.update(value=0),
                              gr.update(label=AUDIO_STAGE_LABELS.get("error", AUDIO_LABEL_DEFAULT)),
                              "ERROR: Modal function not deployed. Please contact the space owner.",
                          )
                          return

+                     # Assemble turns into script text
+                     script = turns_to_script(turns)
+                     if not script.strip():
+                         yield (
+                             build_primary_status("error", "No script to generate."),
+                             gr.update(label=AUDIO_STAGE_LABELS.get("error", AUDIO_LABEL_DEFAULT)),
+                             "Please add some dialogue before generating.",
+                         )
+                         return
+
                      yield (
                          build_primary_status("connecting", "Provisioning GPU resources... cold starts can take up to a minute."),
-                         gr.update(value=1),
                          gr.update(label=AUDIO_STAGE_LABELS.get("connecting", AUDIO_LABEL_DEFAULT)),
                          "Calling remote GPU on Modal.com...",
                      )
@@ -309,7 +510,6 @@ def create_demo_interface():
                      speakers = speakers_and_params[:4]
                      cfg_scale_val = speakers_and_params[4]
                      current_log = ""
-                     last_pct = 1
                      last_audio_label = AUDIO_STAGE_LABELS.get("connecting", AUDIO_LABEL_DEFAULT)
                      last_stage = "connecting"
@@ -321,18 +521,16 @@ def create_demo_interface():
                              speaker_3=speakers[2],
                              speaker_4=speakers[3],
                              cfg_scale=cfg_scale_val,
-                             model_name=model_choice
+                             model_name=model_choice,
                          ):
                              if not update:
                                  continue

                              if isinstance(update, dict):
                                  audio_payload = update.get("audio")
-                                 progress_pct = update.get("pct", last_pct)
                                  stage_key = update.get("stage", last_stage) or last_stage
                                  status_line = update.get("status") or "Processing..."
                                  current_log = update.get("log", current_log)
-                                 progress_value = max(0, min(100, int(round(progress_pct))))

                                  audio_label = AUDIO_STAGE_LABELS.get(stage_key)
                                  if not audio_label:
@@ -340,8 +538,6 @@ def create_demo_interface():
                                      audio_label = f"Complete Conference ({stage_label.lower()})"
                                  if stage_key == "complete":
                                      audio_label = AUDIO_LABEL_DEFAULT
-                                 if stage_key == "error":
-                                     progress_value = 0

                                  audio_update = gr.update(label=audio_label)
                                  if audio_payload is not None:
@@ -349,32 +545,29 @@ def create_demo_interface():

                                  yield (
                                      build_primary_status(stage_key, status_line),
-                                     gr.update(value=progress_value),
                                      audio_update,
                                      current_log,
                                  )

-                                 last_pct = progress_value
                                  last_audio_label = audio_label
                                  last_stage = stage_key
                              else:
-                                 audio_payload, log_text = update if isinstance(update, (tuple, list)) else (None, str(update))
+                                 audio_payload, log_text = (
+                                     update if isinstance(update, (tuple, list)) else (None, str(update))
+                                 )
                                  if log_text:
                                      current_log = log_text

                                  if audio_payload is not None:
-                                     audio_update = gr.update(value=audio_payload, label=AUDIO_LABEL_DEFAULT)
                                      yield (
                                          build_primary_status("complete", "Conference ready to download."),
-                                         gr.update(value=100),
-                                         audio_update,
+                                         gr.update(value=audio_payload, label=AUDIO_LABEL_DEFAULT),
                                          current_log,
                                      )
                                  else:
                                      status_line = current_log.splitlines()[-1] if current_log else "Processing..."
                                      yield (
                                          build_primary_status("generating_audio", status_line),
-                                         gr.update(value=max(last_pct, 70)),
                                          gr.update(label=AUDIO_STAGE_LABELS.get("generating_audio", last_audio_label)),
                                          current_log,
                                      )
@@ -383,23 +576,25 @@ def create_demo_interface():
                          print(f"Error calling Modal: {e}")
                          yield (
                              build_primary_status("error", "Inference failed."),
-                             gr.update(value=0),
                              gr.update(label=AUDIO_STAGE_LABELS.get("error", AUDIO_LABEL_DEFAULT)),
                              f"An error occurred: {e}\n\n{tb}",
                          )

                  generate_btn.click(
                      fn=generate_podcast_wrapper,
-                     inputs=[model_dropdown, num_speakers, script_input] + speaker_selections + [cfg_scale],
-                     outputs=[primary_status, progress_slider, complete_audio_output, log_output],
+                     inputs=[model_dropdown, num_speakers, turns_state] + speaker_selections + [cfg_scale],
+                     outputs=[primary_status, complete_audio_output, log_output],
                  )

+             # ==================== ARCHITECTURE TAB ====================
              with gr.Tab("Architecture"):
                  gr.Markdown("## VibeVoice: A Frontier Open-Source Text-to-Speech Model")
-                 gr.Markdown("""VibeVoice is a novel framework designed for generating expressive, long-form, multi-speaker
-                 conversational audio from text. It addresses challenges in traditional TTS systems — scalability, speaker
-                 consistency, and natural turn-taking — using continuous speech tokenizers at an ultra-low 7.5 Hz frame rate
-                 and a next-token diffusion framework. It can synthesize speech up to 90 minutes long with up to 4 distinct speakers.""")
+                 gr.Markdown(
+                     """VibeVoice is a novel framework designed for generating expressive, long-form, multi-speaker
+                 conversational audio from text. It addresses challenges in traditional TTS systems — scalability, speaker
+                 consistency, and natural turn-taking — using continuous speech tokenizers at an ultra-low 7.5 Hz frame rate
+                 and a next-token diffusion framework. It can synthesize speech up to 90 minutes long with up to 4 distinct speakers."""
+                 )

                  with gr.Row():
                      with gr.Column():
@@ -438,20 +633,17 @@ def create_demo_interface():
                  )
      return interface

- # --- Main Execution ---
+
+ # --- Main ---
  if __name__ == "__main__":
      if remote_generate_function is None:
-         # If Modal isn't set up, we can't launch the full app.
-         # We'll show a simplified UI with an error message.
          with gr.Blocks(theme=theme) as interface:
              gr.Markdown("# Configuration Error")
              gr.Markdown(
                  "The Gradio application cannot connect to the Modal backend. "
-                 "The Modal app has not been deployed yet. "
-                 "Please run `modal deploy modal_runner.py` in your terminal and then refresh this page."
+                 "Please run `modal deploy backend_modal/modal_runner.py` and refresh."
              )
          interface.launch()
      else:
-         # Launch the full Gradio interface
          interface = create_demo_interface()
          interface.queue().launch(show_error=True)
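
A note on the editor pattern above: the turn rows are rebuilt from turns_state on every change via @gr.render, and each row's handlers capture their own index through a default argument (i=idx) so the closures do not all bind the final loop value. A stripped-down, illustrative sketch of the same pattern (not the app's code; assumes a recent Gradio release that ships @gr.render):

    import gradio as gr

    with gr.Blocks() as demo:
        items = gr.State(["first", "second"])

        @gr.render(inputs=[items])
        def render_items(current):
            # Components are recreated from state on every re-render.
            for idx, text in enumerate(current):
                box = gr.Textbox(value=text, key=f"item-{idx}")

                # Default arg pins this row's index at definition time.
                def on_change(new_text, state, i=idx):
                    state[i] = new_text
                    return state

                box.change(on_change, inputs=[box, items], outputs=[items])

    demo.launch()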
backend_modal/modal_runner.py CHANGED
@@ -30,6 +30,14 @@ image = (
          "librosa",
          "pydub",
      )
+     .run_commands(
+         "mkdir -p /root/vibevoice",
+         "touch /root/vibevoice/__init__.py",
+         "ln -s /root/modular /root/vibevoice/modular",
+         "ln -s /root/processor /root/vibevoice/processor",
+         "ln -s /root/voices /root/vibevoice/voices",
+         "ln -s /root/schedule /root/vibevoice/schedule"
+     )
      .add_local_dir("backend_modal/modular", remote_path="/root/modular")
      .add_local_dir("backend_modal/processor", remote_path="/root/processor")
      .add_local_dir("backend_modal/voices", remote_path="/root/voices")
@@ -51,7 +59,9 @@ cache_volume = modal.Volume.from_name("vibevoice-cache", create_if_missing=True)
      volumes={"/cache": cache_volume}
  )
  class VibeVoiceModel:
-     def __init__(self):
+     @modal.enter()
+     def load_models(self):
+         """Run once when the container starts. Loads both models to GPU."""
          self.model_paths = {
              "VibeVoice-1.5B": "microsoft/VibeVoice-1.5B",
              "VibeVoice-7B": "vibevoice/VibeVoice-7B",
@@ -61,17 +71,11 @@ class VibeVoiceModel:
          self.cache_dir = "/cache"
          self.max_cache_size_gb = 10  # Limit cache to 10GB

-     @modal.enter()
-     def load_models(self):
-         """
-         This method is run once when the container starts.
-         With A10G (24GB), we can load both models to GPU.
-         """
          # Project-specific imports are moved here to run inside the container
          from modular.modeling_vibevoice_inference import VibeVoiceForConditionalGenerationInference
          from processor.vibevoice_processor import VibeVoiceProcessor

-         print("Entering container and loading models to GPU (A10G with 24GB)...")
+         print("Entering container and loading models to GPU...")

          # Set compiler flags for better performance
          if torch.cuda.is_available() and hasattr(torch, '_inductor'):
@@ -104,11 +108,9 @@ class VibeVoiceModel:

          self.setup_voice_presets()
          print("Model loading complete.")

      def _place_model(self, target_name: str):
-         """
-         With A10G, both models stay on GPU. Just update the current model.
-         """
+         """Both models stay on GPU. Just update the active selection."""
          self.current_model_name = target_name
          print(f"Switched to model {target_name}")

@@ -297,7 +299,6 @@ class VibeVoiceModel:
          if model_name not in self.models:
              raise ValueError(f"Unknown model: {model_name}")

-         # Initialize log scaffold
          selected_speakers = [speaker_1, speaker_2, speaker_3, speaker_4][:num_speakers]
          log_lines = [
              f"Generating conference with {num_speakers} speakers",
@@ -307,7 +308,20 @@ class VibeVoiceModel:
          ]
          log_text = "\n".join(log_lines)

-         # Emit initial status before heavy work kicks in
+         # Check cache first
+         cache_key = self._generate_cache_key(script, model_name, selected_speakers, cfg_scale)
+         cached_audio, cached_sr = self._get_cached_audio(cache_key)
+         if cached_audio is not None:
+             log_lines.append("Cache hit! Returning previously generated audio.")
+             log_text = "\n".join(log_lines)
+             yield self._emit_progress(
+                 stage="complete", pct=100,
+                 status="Loaded from cache.",
+                 log_text=log_text,
+                 audio=(cached_sr, cached_audio), done=True,
+             )
+             return
+
          yield self._emit_progress(
              stage="queued",
              pct=5,
@@ -475,10 +489,11 @@ class VibeVoiceModel:
          sample_rate = 24000
          total_duration = len(audio) / sample_rate
          log_lines.append(f"Audio duration: {total_duration:.2f} seconds")
+
+         self._save_to_cache(cache_key, audio, sample_rate)
          log_lines.append("Complete!")
          log_text = "\n".join(log_lines)

-         # Final yield with both audio and complete log
          yield self._emit_progress(
              stage="complete",
              pct=100,
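
The cache-check block added above relies on three helpers that live elsewhere in modal_runner.py and are not part of this diff: _generate_cache_key, _get_cached_audio, and _save_to_cache. A hypothetical sketch of their shape, inferred only from the call sites (the real implementations may differ):

    import hashlib
    import os

    import numpy as np

    def _generate_cache_key(self, script, model_name, speakers, cfg_scale):
        # Hash every input that affects the rendered audio.
        payload = f"{model_name}|{cfg_scale}|{'|'.join(speakers)}|{script}"
        return hashlib.sha256(payload.encode("utf-8")).hexdigest()

    def _get_cached_audio(self, cache_key):
        # Returns (audio, sample_rate), or (None, None) on a miss.
        path = os.path.join(self.cache_dir, f"{cache_key}.npz")
        if not os.path.exists(path):
            return None, None
        data = np.load(path)
        return data["audio"], int(data["sample_rate"])

    def _save_to_cache(self, cache_key, audio, sample_rate):
        path = os.path.join(self.cache_dir, f"{cache_key}.npz")
        np.savez(path, audio=audio, sample_rate=sample_rate)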
requirements.txt CHANGED
@@ -1,2 +1,3 @@
  gradio
- modal
+ modal
+ huggingface_hub