ACloudCenter committed on
Commit
aec877b
·
verified ·
1 Parent(s): c7cf6c1

Fix: Update with refactor. Remove unneeded diag.

Browse files
Files changed (1) hide show
  1. app.py +124 -243
app.py CHANGED
@@ -9,31 +9,9 @@ MODAL_STUB_NAME = "vibevoice-generator"
9
  MODAL_CLASS_NAME = "VibeVoiceModel" # Extract class name
10
  MODAL_METHOD_NAME = "generate_podcast" # Extract method name
11
 
12
- # These lists are now hardcoded because the data lives on the Modal container.
13
- # For a more dynamic app, you could create a small Modal function to fetch these lists.
14
  AVAILABLE_MODELS = ["VibeVoice-1.5B", "VibeVoice-7B"]
15
- AVAILABLE_VOICES = [
16
- "en-Alice_woman_bgm", "en-Alice_woman", "en-Carter_man", "en-Frank_man",
17
- "en-Maya_woman", "en-Yasser_man", "in-Samuel_man", "zh-Anchen_man_bgm",
18
- "zh-Bowen_man", "zh-Xinran_woman"
19
- ]
20
- DEFAULT_SPEAKERS = ['en-Alice_woman', 'en-Carter_man', 'en-Frank_man', 'en-Maya_woman']
21
-
22
- # Male and female voice categories for smart speaker selection
23
- MALE_VOICES = [
24
- "en-Carter_man",
25
- "en-Frank_man",
26
- "en-Yasser_man",
27
- "in-Samuel_man",
28
- "zh-Anchen_man_bgm",
29
- "zh-Bowen_man"
30
- ]
31
- FEMALE_VOICES = [
32
- "en-Alice_woman_bgm",
33
- "en-Alice_woman",
34
- "en-Maya_woman",
35
- "zh-Xinran_woman"
36
- ]
37
 
38
  # Load example scripts
39
  def load_example_scripts():
@@ -74,17 +52,8 @@ def load_example_scripts():
74
 
75
  return example_scripts, example_scripts_natural
76
 
77
- # Gender mapping for each script's speakers
78
- SCRIPT_SPEAKER_GENDERS = [
79
- ["female"], # AI TED Talk - Rachel
80
- ["neutral"], # Political Speech - generic speaker
81
- ["male", "female"], # Finance IPO - James, Patricia
82
- ["female", "male"], # Telehealth - Jennifer, Tom
83
- ["female", "male", "female"], # Military - Sarah, David, Lisa
84
- ["male", "female", "male"], # Oil - Robert, Lisa, Michael
85
- ["male", "female", "male", "male"], # Game Creation - Alex, Sarah, Marcus, Emma
86
- ["female", "male", "female", "male"] # Product Meeting - Sarah, Marcus, Jennifer, David
87
- ]
88
 
89
  EXAMPLE_SCRIPTS, EXAMPLE_SCRIPTS_NATURAL = load_example_scripts()
90
 
@@ -159,25 +128,13 @@ def create_demo_interface():
159
  alt="VibeVoice Banner">
160
  </div>
161
  """)
162
- gr.Markdown("## NOTE: The Large model takes significant generation time with limited increase in quality. I recommend trying 1.5B first.")
163
-
164
  with gr.Tabs():
165
  with gr.Tab("Generate"):
166
- gr.Markdown("### Generated Conference")
167
- primary_status = gr.Markdown(
168
- value=READY_PRIMARY_STATUS,
169
- elem_id="primary-status",
170
- )
171
- complete_audio_output = gr.Audio(
172
- label=AUDIO_LABEL_DEFAULT,
173
- type="numpy",
174
- autoplay=False,
175
- show_download_button=True,
176
- )
177
-
178
  with gr.Row():
179
  with gr.Column(scale=1):
180
- gr.Markdown("### Conference Settings")
181
  model_dropdown = gr.Dropdown(
182
  choices=AVAILABLE_MODELS,
183
  value=AVAILABLE_MODELS[0],
@@ -188,7 +145,6 @@ def create_demo_interface():
188
  label="Number of Speakers",
189
  )
190
 
191
- gr.Markdown("### Speaker Selection")
192
  speaker_selections = []
193
  for i in range(4):
194
  speaker = gr.Dropdown(
@@ -206,30 +162,26 @@ def create_demo_interface():
206
  )
207
 
208
  with gr.Column(scale=2):
209
- gr.Markdown("### Script Input")
210
  script_input = gr.Textbox(
211
  label="Conversation Script",
212
- placeholder="Enter your conference script here...",
213
  lines=12,
214
  max_lines=20,
215
  )
216
-
217
  with gr.Row():
218
- with gr.Column(scale=1):
219
- gr.Markdown("### Example Scripts")
220
- with gr.Row():
221
- use_natural = gr.Checkbox(
222
- value=True,
223
- label="Natural talking sounds",
224
- scale=1
225
- )
226
- duration_display = gr.Textbox(
227
- value="",
228
- label="Est. Duration",
229
- interactive=False,
230
- scale=1
231
- )
232
-
233
  example_names = [
234
  "AI TED Talk",
235
  "Political Speech",
@@ -238,42 +190,49 @@ def create_demo_interface():
238
  "Military Meeting",
239
  "Oil Meeting",
240
  "Game Creation Meeting",
241
- "Product Meeting"
242
  ]
243
-
244
  example_buttons = []
245
  with gr.Row():
246
  for i in range(min(4, len(example_names))):
247
  btn = gr.Button(example_names[i], size="sm", variant="secondary")
248
  example_buttons.append(btn)
249
-
250
  with gr.Row():
251
  for i in range(4, min(8, len(example_names))):
252
  btn = gr.Button(example_names[i], size="sm", variant="secondary")
253
  example_buttons.append(btn)
254
-
255
- generate_btn = gr.Button(
256
- "🚀 Generate Conference (on Modal)", size="lg",
257
- variant="primary",
258
- )
259
- log_output = gr.Textbox(
260
- label="Generation Log",
261
- lines=8, max_lines=15,
262
- interactive=False,
263
- )
264
- with gr.Row():
265
- status_display = gr.Markdown(
266
- value="**Idle**\nPress generate to get started.",
267
- elem_id="status-display",
268
- )
269
- progress_slider = gr.Slider(
270
- minimum=0,
271
- maximum=100,
272
- value=0,
273
- step=1,
274
- label="Progress",
275
- interactive=False,
276
- )
 
 
 
 
 
 
 
277
 
278
  def update_speaker_visibility(num_speakers):
279
  return [gr.update(visible=(i < num_speakers)) for i in range(4)]
@@ -290,46 +249,21 @@ def create_demo_interface():
290
  else:
291
  return f"~{minutes:.1f} minutes"
292
 
293
- def smart_speaker_selection(gender_list):
294
- """Select speakers based on gender requirements."""
295
- selected = []
296
- for gender in gender_list:
297
- if gender == "male" and MALE_VOICES:
298
- available = [v for v in MALE_VOICES if v not in selected]
299
- if available:
300
- selected.append(available[0])
301
- else:
302
- selected.append(MALE_VOICES[0])
303
- elif gender == "female" and FEMALE_VOICES:
304
- available = [v for v in FEMALE_VOICES if v not in selected]
305
- if available:
306
- selected.append(available[0])
307
- else:
308
- selected.append(FEMALE_VOICES[0])
309
- else:
310
- # neutral or fallback
311
- available = [v for v in AVAILABLE_VOICES if v not in selected]
312
- if available:
313
- selected.append(available[0])
314
- else:
315
- selected.append(AVAILABLE_VOICES[0])
316
- return selected
317
-
318
  def load_specific_example(idx, natural):
319
  """Load a specific example script."""
320
  if idx >= len(EXAMPLE_SCRIPTS):
321
  return [2, "", ""] + [None, None, None, None]
322
-
323
  script = EXAMPLE_SCRIPTS_NATURAL[idx] if natural else EXAMPLE_SCRIPTS[idx]
324
- genders = SCRIPT_SPEAKER_GENDERS[idx] if idx < len(SCRIPT_SPEAKER_GENDERS) else ["neutral"]
325
- speakers = smart_speaker_selection(genders)
326
  duration = estimate_duration(script)
327
-
328
  # Pad speakers to 4
329
  while len(speakers) < 4:
330
  speakers.append(None)
331
-
332
- return [len(genders), script, duration] + speakers[:4]
333
 
334
  # Connect example buttons
335
  for idx, btn in enumerate(example_buttons):
@@ -356,27 +290,19 @@ def create_demo_interface():
356
 
357
  def generate_podcast_wrapper(model_choice, num_speakers_val, script, *speakers_and_params):
358
  if remote_generate_function is None:
359
- error_message = "ERROR: Modal function not deployed. Please contact the space owner."
360
- primary_error = build_primary_status("error", "Modal backend is offline.")
361
  yield (
362
- gr.update(label=AUDIO_STAGE_LABELS.get("error", AUDIO_LABEL_DEFAULT)),
363
- error_message,
364
- "**Error**\nModal backend unavailable.",
365
  gr.update(value=0),
366
- primary_error,
 
367
  )
368
  return
369
 
370
- connecting_status_line = "Provisioning GPU resources... cold starts can take up to a minute."
371
- primary_connecting = build_primary_status("connecting", connecting_status_line)
372
- status_detail = "**Connecting**\nRequesting GPU resources…"
373
-
374
  yield (
375
- gr.update(label=AUDIO_STAGE_LABELS.get("connecting", AUDIO_LABEL_DEFAULT)),
376
- "🔄 Calling remote GPU on Modal.com... this may take a moment to start.",
377
- status_detail,
378
  gr.update(value=1),
379
- primary_connecting,
 
380
  )
381
 
382
  try:
@@ -384,12 +310,9 @@ def create_demo_interface():
384
  cfg_scale_val = speakers_and_params[4]
385
  current_log = ""
386
  last_pct = 1
387
- last_status = status_detail
388
- last_primary = primary_connecting
389
  last_audio_label = AUDIO_STAGE_LABELS.get("connecting", AUDIO_LABEL_DEFAULT)
390
  last_stage = "connecting"
391
 
392
- # Stream updates from the Modal function
393
  for update in remote_generate_function.remote_gen(
394
  num_speakers=int(num_speakers_val),
395
  script=script,
@@ -409,152 +332,110 @@ def create_demo_interface():
409
  stage_key = update.get("stage", last_stage) or last_stage
410
  status_line = update.get("status") or "Processing..."
411
  current_log = update.get("log", current_log)
412
-
413
- stage_label = stage_key.replace("_", " ").title() if stage_key else "Status"
414
- status_formatted = f"**{stage_label}**\n{status_line}"
415
  progress_value = max(0, min(100, int(round(progress_pct))))
416
 
417
  audio_label = AUDIO_STAGE_LABELS.get(stage_key)
418
  if not audio_label:
419
- audio_label = f"Complete Conference ({stage_label.lower()})" if stage_label else AUDIO_LABEL_DEFAULT
 
420
  if stage_key == "complete":
421
  audio_label = AUDIO_LABEL_DEFAULT
422
  if stage_key == "error":
423
  progress_value = 0
424
 
425
- primary_value = build_primary_status(stage_key, status_line)
426
-
427
  audio_update = gr.update(label=audio_label)
428
  if audio_payload is not None:
429
  audio_update = gr.update(value=audio_payload, label=AUDIO_LABEL_DEFAULT)
430
 
431
  yield (
 
 
432
  audio_update,
433
  current_log,
434
- status_formatted,
435
- gr.update(value=progress_value),
436
- primary_value,
437
  )
438
 
439
  last_pct = progress_value
440
- last_status = status_formatted
441
- last_primary = primary_value
442
  last_audio_label = audio_label
443
  last_stage = stage_key
444
  else:
445
- # Backwards compatibility: older backend returns (audio, log)
446
  audio_payload, log_text = update if isinstance(update, (tuple, list)) else (None, str(update))
447
- status_line = None
448
  if log_text:
449
  current_log = log_text
450
- status_line = log_text.splitlines()[-1]
451
- if not status_line:
452
- status_line = "Processing..."
453
-
454
- if audio_payload is not None:
455
- progress_value = 100
456
- audio_label = AUDIO_LABEL_DEFAULT
457
- primary_value = build_primary_status("complete", "Conference ready to download.")
458
- status_formatted = "**Complete**\nConference ready to download."
459
- else:
460
- progress_value = max(last_pct, 70)
461
- audio_label = AUDIO_STAGE_LABELS.get("generating_audio", last_audio_label)
462
- primary_value = build_primary_status("generating_audio", status_line)
463
- status_formatted = f"**Streaming**\n{status_line}"
464
 
465
- audio_update = gr.update(label=audio_label)
466
  if audio_payload is not None:
467
  audio_update = gr.update(value=audio_payload, label=AUDIO_LABEL_DEFAULT)
468
-
469
- last_pct = progress_value
470
- last_status = status_formatted
471
- last_primary = primary_value
472
- last_audio_label = audio_label
473
-
474
- yield (
475
- audio_update,
476
- current_log,
477
- status_formatted,
478
- gr.update(value=progress_value),
479
- primary_value,
480
- )
 
481
  except Exception as e:
482
  tb = traceback.format_exc()
483
  print(f"Error calling Modal: {e}")
484
- error_log = f"❌ An error occurred: {e}\n\n{tb}"
485
- primary_error = build_primary_status("error", "Inference failed.")
486
  yield (
487
- gr.update(label=AUDIO_STAGE_LABELS.get("error", AUDIO_LABEL_DEFAULT)),
488
- error_log,
489
- "**Error**\nInference failed.",
490
  gr.update(value=0),
491
- primary_error,
 
492
  )
493
 
494
  generate_btn.click(
495
  fn=generate_podcast_wrapper,
496
  inputs=[model_dropdown, num_speakers, script_input] + speaker_selections + [cfg_scale],
497
- outputs=[complete_audio_output, log_output, status_display, progress_slider, primary_status]
498
  )
499
 
500
  with gr.Tab("Architecture"):
 
 
 
 
 
 
501
  with gr.Row():
502
- gr.Markdown("""VibeVoice is a novel framework designed for generating expressive, long-form, multi-speaker conversational audio,
503
- such as conferences, from text. It addresses significant challenges in traditional Text-to-Speech (TTS) systems, particularly
504
- in scalability, speaker consistency, and natural turn-taking. A core innovation of VibeVoice is its use of continuous
505
- speech tokenizers (Acoustic and Semantic) operating at an ultra-low frame rate of 7.5 Hz. These tokenizers efficiently
506
- preserve audio fidelity while significantly boosting computational efficiency for processing long sequences. VibeVoice
507
- employs a next-token diffusion framework, leveraging a Large Language Model (LLM) to understand textual context and
508
- dialogue flow, and a diffusion head to generate high-fidelity acoustic details. The model can synthesize speech up to
509
- 90 minutes long with up to 4 distinct speakers, surpassing the typical 1-2 speaker limits of many prior models.""")
510
- with gr.Row():
511
  with gr.Column():
512
- gr.Markdown("## VibeVoice: A Frontier Open-Source Text-to-Speech Model")
513
-
514
  gr.Markdown("""
515
- ### Overview
516
-
517
- VibeVoice is a novel framework designed for generating expressive, long-form, multi-speaker conversational audio,
518
- such as conferences, from text. It addresses significant challenges in traditional Text-to-Speech (TTS) systems,
519
- particularly in scalability, speaker consistency, and natural turn-taking.
520
-
521
- ### Key Features
522
-
523
- - **Multi-Speaker Support**: Handles up to 4 distinct speakers
524
- - **Long-Form Generation**: Synthesizes speech up to 90 minutes
525
- - **Natural Conversation Flow**: Includes turn-taking and interruptions
526
- - **Ultra-Low Frame Rate**: 7.5 Hz tokenizers for efficiency
527
- - **High Fidelity**: Preserves acoustic details while being computationally efficient
528
-
529
- ### Technical Architecture
530
-
531
- 1. **Continuous Speech Tokenizers**: Acoustic and Semantic tokenizers at 7.5 Hz
532
- 2. **Next-Token Diffusion Framework**: Combines LLM understanding with diffusion generation
533
- 3. **Large Language Model**: Understands context and dialogue flow
534
- 4. **Diffusion Head**: Generates high-fidelity acoustic details
535
  """)
536
-
537
  with gr.Column():
538
- gr.HTML("""
539
- <div style="width: 100%; padding: 20px;">
540
- <img src="https://huggingface.co/spaces/ACloudCenter/Conference-Generator-VibeVoice/resolve/main/public/images/diagram.jpg"
541
- style="width: 100%; height: auto; border-radius: 10px; box-shadow: 0 5px 20px rgba(0,0,0,0.15);"
542
- alt="VibeVoice Architecture Diagram">
543
- </div>
544
- """)
545
-
546
- gr.Markdown("""
547
- ### Model Variants
548
-
549
- **VibeVoice-1.5B**: Faster inference, suitable for real-time applications
550
- **VibeVoice-7B**: Higher quality output, recommended for production use
551
-
552
- ### Performance Metrics
553
-
554
- <img src="https://huggingface.co/spaces/ACloudCenter/Conference-Generator-VibeVoice/resolve/main/public/images/chart.png"
555
- style="width: 100%; height: auto; border-radius: 10px; margin-top: 20px;"
556
- alt="Performance Comparison">
557
- """)
558
  return interface
559
 
560
  # --- Main Execution ---
 
9
  MODAL_CLASS_NAME = "VibeVoiceModel" # Extract class name
10
  MODAL_METHOD_NAME = "generate_podcast" # Extract method name
11
 
 
 
12
  AVAILABLE_MODELS = ["VibeVoice-1.5B", "VibeVoice-7B"]
13
+ AVAILABLE_VOICES = ["Cherry", "Chicago", "Janus", "Mantis", "Sponge", "Starchild"]
14
+ DEFAULT_SPEAKERS = ["Cherry", "Chicago", "Janus", "Mantis"]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
15
 
16
  # Load example scripts
17
  def load_example_scripts():
 
52
 
53
  return example_scripts, example_scripts_natural
54
 
55
+ # Number of speakers per example script
56
+ SCRIPT_SPEAKER_COUNTS = [1, 1, 2, 2, 3, 3, 4, 4]
 
 
 
 
 
 
 
 
 
57
 
58
  EXAMPLE_SCRIPTS, EXAMPLE_SCRIPTS_NATURAL = load_example_scripts()
59
 
 
128
  alt="VibeVoice Banner">
129
  </div>
130
  """)
 
 
131
  with gr.Tabs():
132
  with gr.Tab("Generate"):
133
+ gr.Markdown("**Tip:** The 1.5B model is recommended — it's much faster with minimal quality difference.")
134
+
 
 
 
 
 
 
 
 
 
 
135
  with gr.Row():
136
  with gr.Column(scale=1):
137
+ gr.Markdown("### Settings")
138
  model_dropdown = gr.Dropdown(
139
  choices=AVAILABLE_MODELS,
140
  value=AVAILABLE_MODELS[0],
 
145
  label="Number of Speakers",
146
  )
147
 
 
148
  speaker_selections = []
149
  for i in range(4):
150
  speaker = gr.Dropdown(
 
162
  )
163
 
164
  with gr.Column(scale=2):
 
165
  script_input = gr.Textbox(
166
  label="Conversation Script",
167
+ placeholder="Enter your conference script here...\n\nFormat:\nSpeaker 1: Hello everyone...\nSpeaker 2: Thanks for having me...",
168
  lines=12,
169
  max_lines=20,
170
  )
171
+
172
  with gr.Row():
173
+ use_natural = gr.Checkbox(
174
+ value=True,
175
+ label="Natural talking sounds",
176
+ scale=1,
177
+ )
178
+ duration_display = gr.Textbox(
179
+ value="",
180
+ label="Est. Duration",
181
+ interactive=False,
182
+ scale=1,
183
+ )
184
+
 
 
 
185
  example_names = [
186
  "AI TED Talk",
187
  "Political Speech",
 
190
  "Military Meeting",
191
  "Oil Meeting",
192
  "Game Creation Meeting",
193
+ "Product Meeting",
194
  ]
195
+
196
  example_buttons = []
197
  with gr.Row():
198
  for i in range(min(4, len(example_names))):
199
  btn = gr.Button(example_names[i], size="sm", variant="secondary")
200
  example_buttons.append(btn)
201
+
202
  with gr.Row():
203
  for i in range(4, min(8, len(example_names))):
204
  btn = gr.Button(example_names[i], size="sm", variant="secondary")
205
  example_buttons.append(btn)
206
+
207
+ generate_btn = gr.Button(
208
+ "Generate Conference", size="lg",
209
+ variant="primary",
210
+ )
211
+
212
+ primary_status = gr.Markdown(
213
+ value=READY_PRIMARY_STATUS,
214
+ elem_id="primary-status",
215
+ )
216
+ progress_slider = gr.Slider(
217
+ minimum=0,
218
+ maximum=100,
219
+ value=0,
220
+ step=1,
221
+ label="Progress",
222
+ interactive=False,
223
+ )
224
+ complete_audio_output = gr.Audio(
225
+ label=AUDIO_LABEL_DEFAULT,
226
+ type="numpy",
227
+ autoplay=False,
228
+ show_download_button=True,
229
+ )
230
+ with gr.Accordion("Generation Log", open=False):
231
+ log_output = gr.Textbox(
232
+ label="Log",
233
+ lines=8, max_lines=15,
234
+ interactive=False,
235
+ )
236
 
237
  def update_speaker_visibility(num_speakers):
238
  return [gr.update(visible=(i < num_speakers)) for i in range(4)]
 
249
  else:
250
  return f"~{minutes:.1f} minutes"
251
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
252
  def load_specific_example(idx, natural):
253
  """Load a specific example script."""
254
  if idx >= len(EXAMPLE_SCRIPTS):
255
  return [2, "", ""] + [None, None, None, None]
256
+
257
  script = EXAMPLE_SCRIPTS_NATURAL[idx] if natural else EXAMPLE_SCRIPTS[idx]
258
+ num = SCRIPT_SPEAKER_COUNTS[idx] if idx < len(SCRIPT_SPEAKER_COUNTS) else 1
259
+ speakers = AVAILABLE_VOICES[:num]
260
  duration = estimate_duration(script)
261
+
262
  # Pad speakers to 4
263
  while len(speakers) < 4:
264
  speakers.append(None)
265
+
266
+ return [num, script, duration] + speakers[:4]
267
 
268
  # Connect example buttons
269
  for idx, btn in enumerate(example_buttons):
 
290
 
291
  def generate_podcast_wrapper(model_choice, num_speakers_val, script, *speakers_and_params):
292
  if remote_generate_function is None:
 
 
293
  yield (
294
+ build_primary_status("error", "Modal backend is offline."),
 
 
295
  gr.update(value=0),
296
+ gr.update(label=AUDIO_STAGE_LABELS.get("error", AUDIO_LABEL_DEFAULT)),
297
+ "ERROR: Modal function not deployed. Please contact the space owner.",
298
  )
299
  return
300
 
 
 
 
 
301
  yield (
302
+ build_primary_status("connecting", "Provisioning GPU resources... cold starts can take up to a minute."),
 
 
303
  gr.update(value=1),
304
+ gr.update(label=AUDIO_STAGE_LABELS.get("connecting", AUDIO_LABEL_DEFAULT)),
305
+ "Calling remote GPU on Modal.com...",
306
  )
307
 
308
  try:
 
310
  cfg_scale_val = speakers_and_params[4]
311
  current_log = ""
312
  last_pct = 1
 
 
313
  last_audio_label = AUDIO_STAGE_LABELS.get("connecting", AUDIO_LABEL_DEFAULT)
314
  last_stage = "connecting"
315
 
 
316
  for update in remote_generate_function.remote_gen(
317
  num_speakers=int(num_speakers_val),
318
  script=script,
 
332
  stage_key = update.get("stage", last_stage) or last_stage
333
  status_line = update.get("status") or "Processing..."
334
  current_log = update.get("log", current_log)
 
 
 
335
  progress_value = max(0, min(100, int(round(progress_pct))))
336
 
337
  audio_label = AUDIO_STAGE_LABELS.get(stage_key)
338
  if not audio_label:
339
+ stage_label = stage_key.replace("_", " ").title()
340
+ audio_label = f"Complete Conference ({stage_label.lower()})"
341
  if stage_key == "complete":
342
  audio_label = AUDIO_LABEL_DEFAULT
343
  if stage_key == "error":
344
  progress_value = 0
345
 
 
 
346
  audio_update = gr.update(label=audio_label)
347
  if audio_payload is not None:
348
  audio_update = gr.update(value=audio_payload, label=AUDIO_LABEL_DEFAULT)
349
 
350
  yield (
351
+ build_primary_status(stage_key, status_line),
352
+ gr.update(value=progress_value),
353
  audio_update,
354
  current_log,
 
 
 
355
  )
356
 
357
  last_pct = progress_value
 
 
358
  last_audio_label = audio_label
359
  last_stage = stage_key
360
  else:
 
361
  audio_payload, log_text = update if isinstance(update, (tuple, list)) else (None, str(update))
 
362
  if log_text:
363
  current_log = log_text
 
 
 
 
 
 
 
 
 
 
 
 
 
 
364
 
 
365
  if audio_payload is not None:
366
  audio_update = gr.update(value=audio_payload, label=AUDIO_LABEL_DEFAULT)
367
+ yield (
368
+ build_primary_status("complete", "Conference ready to download."),
369
+ gr.update(value=100),
370
+ audio_update,
371
+ current_log,
372
+ )
373
+ else:
374
+ status_line = current_log.splitlines()[-1] if current_log else "Processing..."
375
+ yield (
376
+ build_primary_status("generating_audio", status_line),
377
+ gr.update(value=max(last_pct, 70)),
378
+ gr.update(label=AUDIO_STAGE_LABELS.get("generating_audio", last_audio_label)),
379
+ current_log,
380
+ )
381
  except Exception as e:
382
  tb = traceback.format_exc()
383
  print(f"Error calling Modal: {e}")
 
 
384
  yield (
385
+ build_primary_status("error", "Inference failed."),
 
 
386
  gr.update(value=0),
387
+ gr.update(label=AUDIO_STAGE_LABELS.get("error", AUDIO_LABEL_DEFAULT)),
388
+ f"An error occurred: {e}\n\n{tb}",
389
  )
390
 
391
  generate_btn.click(
392
  fn=generate_podcast_wrapper,
393
  inputs=[model_dropdown, num_speakers, script_input] + speaker_selections + [cfg_scale],
394
+ outputs=[primary_status, progress_slider, complete_audio_output, log_output],
395
  )
396
 
397
  with gr.Tab("Architecture"):
398
+ gr.Markdown("## VibeVoice: A Frontier Open-Source Text-to-Speech Model")
399
+ gr.Markdown("""VibeVoice is a novel framework designed for generating expressive, long-form, multi-speaker
400
+ conversational audio from text. It addresses challenges in traditional TTS systems — scalability, speaker
401
+ consistency, and natural turn-taking — using continuous speech tokenizers at an ultra-low 7.5 Hz frame rate
402
+ and a next-token diffusion framework. It can synthesize speech up to 90 minutes long with up to 4 distinct speakers.""")
403
+
404
  with gr.Row():
 
 
 
 
 
 
 
 
 
405
  with gr.Column():
 
 
406
  gr.Markdown("""
407
+ ### Key Features
408
+
409
+ - **Multi-Speaker Support**: Up to 4 distinct speakers
410
+ - **Long-Form Generation**: Up to 90 minutes of speech
411
+ - **Natural Conversation Flow**: Turn-taking and interruptions
412
+ - **Ultra-Low Frame Rate**: 7.5 Hz tokenizers for efficiency
413
+ - **High Fidelity**: Preserves acoustic details while being computationally efficient
414
+
415
+ ### Technical Architecture
416
+
417
+ 1. **Continuous Speech Tokenizers**: Acoustic and Semantic tokenizers at 7.5 Hz
418
+ 2. **Next-Token Diffusion Framework**: Combines LLM understanding with diffusion generation
419
+ 3. **Large Language Model**: Understands context and dialogue flow
420
+ 4. **Diffusion Head**: Generates high-fidelity acoustic details
421
+
422
+ ### Model Variants
423
+
424
+ - **VibeVoice-1.5B**: Faster inference, suitable for real-time applications
425
+ - **VibeVoice-7B**: Higher quality output, recommended for production use
 
426
  """)
427
+
428
  with gr.Column():
429
+ gr.Image(
430
+ value="public/images/diagram.jpg",
431
+ label="Architecture Diagram",
432
+ show_download_button=False,
433
+ )
434
+ gr.Image(
435
+ value="public/images/chart.png",
436
+ label="Performance Comparison",
437
+ show_download_button=False,
438
+ )
 
 
 
 
 
 
 
 
 
 
439
  return interface
440
 
441
  # --- Main Execution ---