ACloudCenter committed on
Commit
e28a050
·
1 Parent(s): b504537

Auto-pick voices by gender, sync voice picks to

Browse files

script tags, add preview

Script prompt now requests a 'Character Genders:
' line which the parser strips out and uses to
pick matching-gender voices (Mom gets Cherry,
Wizard gets Chicago, etc.) without duplicates.
Adds a voice_selections state so changing a
Voice dropdown below the script live-updates every
'Speaker N - ...' tag in the editor. New
collapsible voice preview lets users sample each
voice before committing to a long generation.

Files changed (1) hide show
  1. app.py +142 -21
app.py CHANGED
@@ -236,7 +236,14 @@ CRITICAL — ONE SPEAKER PER TURN:
236
  - NEVER embed another character's dialogue inside someone else's turn
237
  - WRONG: "Speaker 1: We need magic. Mom: Hey kids, what's going on?"
238
  - RIGHT: Every time the speaker changes, END the current turn, add a BLANK LINE, then start a NEW turn with "Speaker N:" on its own line
239
- - Do NOT use character names as inline labels like "Mom:" or "Wizard:" mid-paragraph — always use "Speaker N:" on a fresh line"""
 
 
 
 
 
 
 
240
 
241
 
242
  # Strip bracketed stage directions, parenthetical cues, and asterisk actions.
@@ -277,8 +284,59 @@ def sanitize_dialogue(text: str) -> str:
277
  return text
278
 
279
 
280
- def generate_script_from_prompt(prompt: str) -> tuple[list[dict], int, str]:
281
- """Returns (turns, num_speakers, title)."""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
282
  system = SCRIPT_SYSTEM_PROMPT.format(max_words=SCRIPT_MAX_WORDS)
283
  response = llm_client.chat_completion(
284
  messages=[
@@ -297,6 +355,9 @@ def generate_script_from_prompt(prompt: str) -> tuple[list[dict], int, str]:
297
  title = lines[0].split(":", 1)[1].strip()
298
  raw = "\n".join(lines[1:])
299
 
 
 
 
300
  turns = parse_script_to_turns(raw)
301
  # Scrub stage directions from each turn, drop any turn that becomes empty
302
  turns = [
@@ -311,7 +372,9 @@ def generate_script_from_prompt(prompt: str) -> tuple[list[dict], int, str]:
311
  total_words = sum(len(t["text"].split()) for t in turns)
312
  speaker_ids = {t["speaker"] for t in turns}
313
  num_speakers = max(min(len(speaker_ids), 4), 1) if speaker_ids else 1
314
- return turns, num_speakers, title
 
 
315
 
316
 
317
  PARODY_SYSTEM_PROMPT = """You are a comedian narrator. The user will give you a scenario. Write a SHORT, funny behind-the-scenes narration of what's "really" happening while their audio is being generated. Be absurd, self-aware, and poke fun at AI.
@@ -543,6 +606,8 @@ def create_demo_interface():
543
  turns_state = gr.State([])
544
  script_title_state = gr.State("")
545
  parody_lines_state = gr.State([]) # funny loading story for audio generation
 
 
546
 
547
  # ---- BANNER ----
548
  gr.HTML("""
@@ -595,8 +660,8 @@ def create_demo_interface():
595
  duration_display = gr.HTML(value="")
596
 
597
  with gr.Column(elem_classes="conversation-scroll"):
598
- @gr.render(inputs=[turns_state, gr.State(4)])
599
- def render_turns(turns, _max):
600
  if not turns:
601
  gr.Markdown(
602
  "Your conversation will appear here.\n\n"
@@ -606,12 +671,16 @@ def create_demo_interface():
606
  )
607
  return
608
 
609
- # Show all 4 speakers with voice name + gender
 
610
  speaker_choices = []
611
  for i in range(4):
612
- voice_name = AVAILABLE_VOICES[i] if i < len(AVAILABLE_VOICES) else "?"
613
- gender = VOICE_GENDERS.get(voice_name, "")
614
- speaker_choices.append(f"Speaker {i+1} - {voice_name} ({gender})")
 
 
 
615
 
616
  for idx, turn in enumerate(turns):
617
  spk_num = turn["speaker"]
@@ -698,6 +767,39 @@ def create_demo_interface():
698
  label="CFG Scale",
699
  )
700
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
701
  # ---- STEP 4: GENERATE ----
702
  generate_btn = gr.Button(
703
  "Generate Conference Audio", size="lg", variant="primary",
@@ -728,6 +830,19 @@ def create_demo_interface():
728
  outputs=speaker_selections,
729
  )
730
 
 
 
 
 
 
 
 
 
 
 
 
 
 
731
  def add_turn(turns):
732
  if len(turns) >= MAX_TURNS:
733
  gr.Warning(f"Maximum {MAX_TURNS} turns reached.")
@@ -801,13 +916,13 @@ def create_demo_interface():
801
  "One more revision, we promise...",
802
  ]
803
 
804
- # outputs: turns, duration, status, title, audio, script_btn, gen_btn, parody, num_speakers, *4 voices
805
  def _script_no_change(status_html):
806
  return (gr.update(), gr.update(), status_html,
807
  gr.update(), gr.update(),
808
  gr.update(), gr.update(),
809
  gr.update(),
810
- gr.update(), *[gr.update()] * 4)
811
 
812
  def _script_buttons_busy(status_html):
813
  return (gr.update(), gr.update(), status_html,
@@ -815,7 +930,7 @@ def create_demo_interface():
815
  gr.update(interactive=False, value="Writing..."),
816
  gr.update(interactive=False),
817
  gr.update(),
818
- gr.update(), *[gr.update()] * 4)
819
 
820
  def _script_buttons_ready(status_html=""):
821
  return (gr.update(), gr.update(), status_html,
@@ -823,7 +938,7 @@ def create_demo_interface():
823
  gr.update(interactive=True, value="Write Script with AI"),
824
  gr.update(interactive=True),
825
  gr.update(),
826
- gr.update(), *[gr.update()] * 4)
827
 
828
  def _make_title_html(title):
829
  if title:
@@ -879,15 +994,20 @@ def create_demo_interface():
879
  yield _script_buttons_ready(f"<em>Error: {msg[:200]}</em>")
880
  return
881
 
882
- turns, detected, title = result["data"]
883
  if not turns:
884
  yield _script_buttons_ready("<em>Empty result — try a more descriptive prompt.</em>")
885
  return
886
 
887
- voices = list(VOICE_DISPLAY[:detected])
 
888
  while len(voices) < 4:
889
  voices.append(None)
890
 
 
 
 
 
891
  audio_label = title if title else AUDIO_LABEL_DEFAULT
892
  yield (turns, estimate_duration(turns), "",
893
  _make_title_html(title),
@@ -895,7 +1015,7 @@ def create_demo_interface():
895
  gr.update(interactive=True, value="Write Script with AI"),
896
  gr.update(interactive=True),
897
  parody_result["lines"],
898
- detected, *voices[:4])
899
 
900
  generate_script_btn.click(
901
  fn=on_generate_script,
@@ -904,13 +1024,13 @@ def create_demo_interface():
904
  script_title_display, complete_audio_output,
905
  generate_script_btn, generate_btn,
906
  parody_lines_state,
907
- num_speakers] + speaker_selections,
908
  )
909
 
910
  # --- Load examples ---
911
  def load_example(idx):
912
  if idx >= len(EXAMPLE_SCRIPTS):
913
- return [], 2, "", "<h3 style='margin:0'>Script</h3>", gr.update(), *[None] * 4
914
 
915
  title = example_names[idx]
916
  script = EXAMPLE_SCRIPTS_NATURAL[idx]
@@ -924,14 +1044,15 @@ def create_demo_interface():
924
  return (turns, num, estimate_duration(turns),
925
  f"<h3 style='margin:0'>{title}</h3>",
926
  gr.update(label=title),
927
- *voices[:4])
928
 
929
  for idx, btn in enumerate(example_buttons):
930
  btn.click(
931
  fn=lambda i=idx: load_example(i),
932
  inputs=[],
933
  outputs=[turns_state, num_speakers, duration_display,
934
- script_title_display, complete_audio_output] + speaker_selections,
 
935
  queue=False,
936
  )
937
 
 
236
  - NEVER embed another character's dialogue inside someone else's turn
237
  - WRONG: "Speaker 1: We need magic. Mom: Hey kids, what's going on?"
238
  - RIGHT: Every time the speaker changes, END the current turn, add a BLANK LINE, then start a NEW turn with "Speaker N:" on its own line
239
+ - Do NOT use character names as inline labels like "Mom:" or "Wizard:" mid-paragraph — always use "Speaker N:" on a fresh line
240
+
241
+ AFTER THE DIALOGUE — Character roster (REQUIRED):
242
+ - After the final dialogue turn, add a blank line, then a single line in this EXACT format:
243
+ Character Genders: Speaker 1: <F or M>, Speaker 2: <F or M>, Speaker 3: <F or M>, Speaker 4: <F or M>
244
+ - Only list speakers you actually used. Use "F" for feminine-presenting voices (women, girls, moms, queens, witches, female narrators) and "M" for masculine-presenting (men, boys, dads, kings, wizards-as-male, male narrators).
245
+ - For gender-ambiguous roles (robots, narrators, dragons), pick whichever fits the tone. Never use "N" or "?"
246
+ - Example: "Character Genders: Speaker 1: M, Speaker 2: M, Speaker 3: F" """
247
 
248
 
249
  # Strip bracketed stage directions, parenthetical cues, and asterisk actions.
 
284
  return text
285
 
286
 
287
+ _GENDER_LINE = re.compile(
288
+ r"character\s+genders\s*:\s*(.+?)$",
289
+ re.IGNORECASE | re.MULTILINE,
290
+ )
291
+ _GENDER_PAIR = re.compile(r"speaker\s+(\d+)\s*:\s*([FM])", re.IGNORECASE)
292
+
293
+
294
+ def _extract_genders(raw: str) -> tuple[str, dict[int, str]]:
295
+ """Find and remove the 'Character Genders: ...' line. Returns (cleaned_text, genders_dict)."""
296
+ genders: dict[int, str] = {}
297
+ m = _GENDER_LINE.search(raw)
298
+ if not m:
299
+ return raw, genders
300
+ for pair in _GENDER_PAIR.finditer(m.group(1)):
301
+ try:
302
+ n = int(pair.group(1))
303
+ g = pair.group(2).upper()
304
+ if 1 <= n <= 4 and g in ("F", "M"):
305
+ genders[n] = g
306
+ except ValueError:
307
+ pass
308
+ cleaned = raw[: m.start()].rstrip() + "\n" + raw[m.end():].lstrip()
309
+ return cleaned, genders
310
+
311
+
312
def assign_voices_by_gender(genders: dict[int, str], num_speakers: int) -> list[str]:
    """Return 4 voice-display entries, one per speaker slot.

    For each active slot (1..num_speakers) a voice is taken from
    AVAILABLE_VOICES in this order of preference:
      1. the first unused voice whose VOICE_GENDERS entry equals the slot's
         gender ("F" or "M"); slots without gender info consider every voice;
      2. any unused voice, when the preferred pool is exhausted;
      3. the first voice again, when every voice is already taken.
    Active entries are formatted as "<voice> (<gender>)", with "?" when the
    voice has no VOICE_GENDERS entry.

    Slots above ``num_speakers`` get DEFAULT_SPEAKERS_DISPLAY[i], or None when
    that list is shorter than the slot index. An active slot is also None when
    AVAILABLE_VOICES is empty, so callers must tolerate None entries.

    Args:
        genders: Mapping of speaker number (1-based) to "F" or "M".
        num_speakers: How many leading slots are in use.
    """
    pools = {
        "F": [v for v in AVAILABLE_VOICES if VOICE_GENDERS.get(v) == "F"],
        "M": [v for v in AVAILABLE_VOICES if VOICE_GENDERS.get(v) == "M"],
    }
    used: set[str] = set()
    chosen: list[str] = []

    for i in range(4):
        slot = i + 1
        if slot > num_speakers:
            # Inactive slot: keep the app's default display value.
            chosen.append(DEFAULT_SPEAKERS_DISPLAY[i] if i < len(DEFAULT_SPEAKERS_DISPLAY) else None)
            continue

        pool = pools.get(genders.get(slot), AVAILABLE_VOICES)
        pick = next((v for v in pool if v not in used), None)
        if pick is None:
            # Preferred pool exhausted: fall back to any unused voice.
            pick = next((v for v in AVAILABLE_VOICES if v not in used), None)
        if pick is None:
            # Every voice is taken; reuse the first one, unless none exist.
            pick = AVAILABLE_VOICES[0] if AVAILABLE_VOICES else None
        if pick is None:
            chosen.append(None)
            continue

        used.add(pick)
        chosen.append(f"{pick} ({VOICE_GENDERS.get(pick, '?')})")
    return chosen
336
+
337
+
338
+ def generate_script_from_prompt(prompt: str) -> tuple[list[dict], int, str, list[str]]:
339
+ """Returns (turns, num_speakers, title, voice_selections)."""
340
  system = SCRIPT_SYSTEM_PROMPT.format(max_words=SCRIPT_MAX_WORDS)
341
  response = llm_client.chat_completion(
342
  messages=[
 
355
  title = lines[0].split(":", 1)[1].strip()
356
  raw = "\n".join(lines[1:])
357
 
358
+ # Extract and strip the "Character Genders:" line before parsing turns
359
+ raw, genders = _extract_genders(raw)
360
+
361
  turns = parse_script_to_turns(raw)
362
  # Scrub stage directions from each turn, drop any turn that becomes empty
363
  turns = [
 
372
  total_words = sum(len(t["text"].split()) for t in turns)
373
  speaker_ids = {t["speaker"] for t in turns}
374
  num_speakers = max(min(len(speaker_ids), 4), 1) if speaker_ids else 1
375
+
376
+ voice_selections = assign_voices_by_gender(genders, num_speakers)
377
+ return turns, num_speakers, title, voice_selections
378
 
379
 
380
  PARODY_SYSTEM_PROMPT = """You are a comedian narrator. The user will give you a scenario. Write a SHORT, funny behind-the-scenes narration of what's "really" happening while their audio is being generated. Be absurd, self-aware, and poke fun at AI.
 
606
  turns_state = gr.State([])
607
  script_title_state = gr.State("")
608
  parody_lines_state = gr.State([]) # funny loading story for audio generation
609
+ # Current voice selection per speaker slot (list of 4 display strings like "Cherry (F)")
610
+ voice_selections_state = gr.State(list(DEFAULT_SPEAKERS_DISPLAY))
611
 
612
  # ---- BANNER ----
613
  gr.HTML("""
 
660
  duration_display = gr.HTML(value="")
661
 
662
  with gr.Column(elem_classes="conversation-scroll"):
663
+ @gr.render(inputs=[turns_state, voice_selections_state])
664
+ def render_turns(turns, voice_sels):
665
  if not turns:
666
  gr.Markdown(
667
  "Your conversation will appear here.\n\n"
 
671
  )
672
  return
673
 
674
+ # Build speaker choice labels from the CURRENT voice selections,
675
+ # so changing a Voice dropdown below propagates to the tags above.
676
  speaker_choices = []
677
  for i in range(4):
678
+ sel = voice_sels[i] if voice_sels and i < len(voice_sels) else None
679
+ if sel:
680
+ # sel looks like "Cherry (F)" — display directly
681
+ speaker_choices.append(f"Speaker {i+1} - {sel}")
682
+ else:
683
+ speaker_choices.append(f"Speaker {i+1}")
684
 
685
  for idx, turn in enumerate(turns):
686
  spk_num = turn["speaker"]
 
767
  label="CFG Scale",
768
  )
769
 
770
+ # ---- Voice preview ----
771
+ with gr.Accordion("🔊 Preview voices before generating", open=False):
772
+ with gr.Row():
773
+ preview_voice = gr.Dropdown(
774
+ choices=VOICE_DISPLAY,
775
+ value=VOICE_DISPLAY[0] if VOICE_DISPLAY else None,
776
+ label="Pick a voice",
777
+ scale=2,
778
+ )
779
+ preview_audio = gr.Audio(
780
+ label="Sample",
781
+ value=os.path.join("public", "voices", f"{AVAILABLE_VOICES[0]}.mp3") if AVAILABLE_VOICES else None,
782
+ autoplay=False,
783
+ show_download_button=False,
784
+ scale=3,
785
+ )
786
+
787
def _load_preview(display: str):
    """Build the update for the preview audio from a voice display string.

    The display string is resolved to a voice name via voice_display_to_name;
    the sample is expected at public/voices/<name>.mp3. The returned
    gr.update carries that path, or None when the display value is empty,
    cannot be resolved, or the file does not exist.
    """
    sample_path = None
    voice = voice_display_to_name(display) if display else None
    if voice:
        candidate = os.path.join("public", "voices", f"{voice}.mp3")
        if os.path.exists(candidate):
            sample_path = candidate
    return gr.update(value=sample_path)
795
+
796
+ preview_voice.change(
797
+ fn=_load_preview,
798
+ inputs=[preview_voice],
799
+ outputs=[preview_audio],
800
+ queue=False,
801
+ )
802
+
803
  # ---- STEP 4: GENERATE ----
804
  generate_btn = gr.Button(
805
  "Generate Conference Audio", size="lg", variant="primary",
 
830
  outputs=speaker_selections,
831
  )
832
 
833
+ # Two-way sync: when a Voice dropdown changes, update voice_selections_state
834
+ # so the script turn tags re-render with the new voice label.
835
+ def _sync_voice_state(*voices):
836
+ return list(voices)
837
+
838
+ for sel in speaker_selections:
839
+ sel.change(
840
+ fn=_sync_voice_state,
841
+ inputs=speaker_selections,
842
+ outputs=[voice_selections_state],
843
+ queue=False,
844
+ )
845
+
846
  def add_turn(turns):
847
  if len(turns) >= MAX_TURNS:
848
  gr.Warning(f"Maximum {MAX_TURNS} turns reached.")
 
916
  "One more revision, we promise...",
917
  ]
918
 
919
+ # outputs: turns, duration, status, title, audio, script_btn, gen_btn, parody, num_speakers, *4 voices, voice_selections_state
920
  def _script_no_change(status_html):
921
  return (gr.update(), gr.update(), status_html,
922
  gr.update(), gr.update(),
923
  gr.update(), gr.update(),
924
  gr.update(),
925
+ gr.update(), *[gr.update()] * 4, gr.update())
926
 
927
  def _script_buttons_busy(status_html):
928
  return (gr.update(), gr.update(), status_html,
 
930
  gr.update(interactive=False, value="Writing..."),
931
  gr.update(interactive=False),
932
  gr.update(),
933
+ gr.update(), *[gr.update()] * 4, gr.update())
934
 
935
  def _script_buttons_ready(status_html=""):
936
  return (gr.update(), gr.update(), status_html,
 
938
  gr.update(interactive=True, value="Write Script with AI"),
939
  gr.update(interactive=True),
940
  gr.update(),
941
+ gr.update(), *[gr.update()] * 4, gr.update())
942
 
943
  def _make_title_html(title):
944
  if title:
 
994
  yield _script_buttons_ready(f"<em>Error: {msg[:200]}</em>")
995
  return
996
 
997
+ turns, detected, title, voice_picks = result["data"]
998
  if not turns:
999
  yield _script_buttons_ready("<em>Empty result — try a more descriptive prompt.</em>")
1000
  return
1001
 
1002
+ # voice_picks is a list of 4 display strings from assign_voices_by_gender
1003
+ voices = list(voice_picks)
1004
  while len(voices) < 4:
1005
  voices.append(None)
1006
 
1007
+ # No prefix stripping is needed here: the Voice dropdowns take plain display strings,
1008
+ # and assign_voices_by_gender already returns them in that form, e.g. "Cherry (F)".
1009
+ clean_voices = voices[:4]
1010
+
1011
  audio_label = title if title else AUDIO_LABEL_DEFAULT
1012
  yield (turns, estimate_duration(turns), "",
1013
  _make_title_html(title),
 
1015
  gr.update(interactive=True, value="Write Script with AI"),
1016
  gr.update(interactive=True),
1017
  parody_result["lines"],
1018
+ detected, *clean_voices, clean_voices)
1019
 
1020
  generate_script_btn.click(
1021
  fn=on_generate_script,
 
1024
  script_title_display, complete_audio_output,
1025
  generate_script_btn, generate_btn,
1026
  parody_lines_state,
1027
+ num_speakers] + speaker_selections + [voice_selections_state],
1028
  )
1029
 
1030
  # --- Load examples ---
1031
  def load_example(idx):
1032
  if idx >= len(EXAMPLE_SCRIPTS):
1033
+ return [], 2, "", "<h3 style='margin:0'>Script</h3>", gr.update(), *[None] * 4, list(DEFAULT_SPEAKERS_DISPLAY)
1034
 
1035
  title = example_names[idx]
1036
  script = EXAMPLE_SCRIPTS_NATURAL[idx]
 
1044
  return (turns, num, estimate_duration(turns),
1045
  f"<h3 style='margin:0'>{title}</h3>",
1046
  gr.update(label=title),
1047
+ *voices[:4], voices[:4])
1048
 
1049
  for idx, btn in enumerate(example_buttons):
1050
  btn.click(
1051
  fn=lambda i=idx: load_example(i),
1052
  inputs=[],
1053
  outputs=[turns_state, num_speakers, duration_display,
1054
+ script_title_display, complete_audio_output]
1055
+ + speaker_selections + [voice_selections_state],
1056
  queue=False,
1057
  )
1058