ACloudCenter committed on
Commit
b504537
·
1 Parent(s): 29b2e23

Fix inline speaker splitting + add parser tests

Browse files

and example prompts.

The LLM sometimes embeds late-arriving characters
(Mom:, Wizard:) inside another speaker's turn
instead of starting a new Speaker N line. Parser
now detects both forms of tags, splits them
correctly, and assigns each unique character its
own speaker number while preserving explicit
Speaker N slots. Adds 15 unit tests covering the
regression and 5 example prompts for manual
verification.

Files changed (3) hide show
  1. app.py +90 -20
  2. tests/example_prompts.md +64 -0
  3. tests/test_script_parser.py +190 -0
app.py CHANGED
@@ -79,31 +79,95 @@ EXAMPLE_SCRIPTS, EXAMPLE_SCRIPTS_NATURAL = load_example_scripts()
79
 
80
  # --- Script parsing helpers ---
81
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
82
  def parse_script_to_turns(script_text: str) -> list[dict]:
83
- turns = []
 
 
 
 
 
 
84
  if not script_text or not script_text.strip():
85
  return turns
86
 
87
- pattern = re.compile(r"^Speaker\s+(\d+)\s*:\s*(.+)", re.IGNORECASE)
88
- current_speaker = None
89
- current_text = []
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
90
 
91
- for line in script_text.strip().split("\n"):
92
- m = pattern.match(line.strip())
 
93
  if m:
94
- if current_speaker is not None:
95
- turns.append({"speaker": current_speaker, "text": " ".join(current_text).strip()})
96
- current_speaker = int(m.group(1))
97
- current_text = [m.group(2).strip()]
98
- elif line.strip():
99
- if current_speaker is not None:
100
- current_text.append(line.strip())
101
- else:
102
- current_speaker = 1
103
- current_text = [line.strip()]
104
-
105
- if current_speaker is not None and current_text:
106
- turns.append({"speaker": current_speaker, "text": " ".join(current_text).strip()})
 
 
 
 
 
 
 
 
 
 
 
 
107
 
108
  return turns
109
 
@@ -166,7 +230,13 @@ FORMAT RULES:
166
  - Each turn is separated by a blank line
167
  - Choose the right number of speakers for the scenario (1 to 4 max)
168
  - Keep the total script under {max_words} words
169
- - Output ONLY the title and script — no stage directions, no commentary, no preamble"""
 
 
 
 
 
 
170
 
171
 
172
  # Strip bracketed stage directions, parenthetical cues, and asterisk actions.
 
79
 
80
  # --- Script parsing helpers ---
81
 
82
# Highest speaker number handed out to auto-numbered (named) characters.
_MAX_SPEAKERS = 4

# Explicit numbered tag such as "Speaker 3" (any letter case).
_EXPLICIT_SPEAKER = re.compile(r"speaker\s+(\d+)", re.IGNORECASE)

# Building blocks for a named-character label ("Mom", "Dr. Smith", "Old Man").
# A period is only allowed as part of a leading honorific, so a label can never
# swallow the end of the previous sentence ("...Wall of Force. Mom: Oh boy").
_HONORIFIC = r"(?:Dr|Mr|Mrs|Ms|Mx|Prof|St|Capt|Sgt|Lt|Col|Gen)\."
_NAME_WORD = r"[A-Z][A-Za-z'\-]*"
_NAME_CONNECTOR = r"(?:of|the|de|la|van|von|and)"
_NAMED_LABEL = (
    rf"(?:{_HONORIFIC} +)?{_NAME_WORD}"
    rf"(?: +(?:{_NAME_CONNECTOR} +)?{_NAME_WORD}){{0,3}}"
)

# Matches "Speaker 3:" or a named character tag like "Mom:", "Dr. Smith:", "Wizard:"
# at the start of a line OR inline mid-paragraph.
#   group(1): explicit "Speaker N" label -- accepted whatever text follows the colon
#             (lowercase words, digits, "[sighs]", ...), in any letter case.
#   group(2): named label -- only accepted when what follows looks like the start
#             of dialogue, to limit false positives on ordinary prose colons.
_SPEAKER_TAG = re.compile(
    r"(?:^|(?<=[\s\"'.!?,—–\-]))"          # boundary: start or after whitespace/punct
    r"(?:"
    r"((?i:speaker)\s+\d+)\s*:\s*"          # explicit numbered tag
    r"|"
    rf"({_NAMED_LABEL})[ \t]*:\s+"          # named tag + colon separator
    r"(?=[A-Z\"'“‘\[(*])"                   # capital / quote / stage-direction opener
    r")",
    re.MULTILINE,
)

# Labels we should NEVER treat as speaker tags (common false positives)
_LABEL_BLOCKLIST = {
    "title", "note", "scene", "setting", "fade in", "fade out", "cut to",
    "interior", "exterior", "int", "ext", "cont", "continued", "act",
}

# Common sentence-starters that happen to precede a colon ("Well: ...").
_SENTENCE_STARTERS = frozenset(
    {"well", "so", "okay", "yes", "no", "right", "look", "listen"}
)


def _normalize_label(label: str) -> str:
    """Return *label* lowercased with runs of whitespace collapsed to one space."""
    return re.sub(r"\s+", " ", label).strip().lower()


def _explicit_speaker_number(label: str) -> "int | None":
    """Return N for a "Speaker N" label, or None for a named label."""
    m = _EXPLICIT_SPEAKER.match(label)
    return int(m.group(1)) if m else None


def _find_speaker_tags(text: str) -> list[tuple[int, int, str]]:
    """Return (start, end, label) for every accepted speaker tag in *text*."""
    tags: list[tuple[int, int, str]] = []
    for m in _SPEAKER_TAG.finditer(text):
        label = (m.group(1) or m.group(2)).strip()
        norm = _normalize_label(label)
        if norm in _LABEL_BLOCKLIST or norm in _SENTENCE_STARTERS:
            continue
        tags.append((m.start(), m.end(), label))
    return tags


def parse_script_to_turns(script_text: str) -> list[dict]:
    """Parse dialogue into turns, handling both 'Speaker N:' and named-character tags.

    Robust to the LLM slipping in mid-paragraph speaker changes like:
        Speaker 1: ...We need magic. Mom: Hey kids! ...
    which get split into separate turns, with 'Mom' mapped to its own Speaker number.

    Args:
        script_text: Raw script text; may be empty or whitespace-only.

    Returns:
        A list of ``{"speaker": int, "text": str}`` dicts in script order.
        Empty input yields ``[]``; text without any tag becomes one Speaker 1
        turn. Explicit "Speaker N" numbers are kept as written; named
        characters receive the lowest free number in 1..4 (further characters
        share slot 4). Text before the first tag is dropped, and whitespace
        inside each turn is collapsed to single spaces.
    """
    turns: list[dict] = []
    if not script_text or not script_text.strip():
        return turns

    text = script_text.strip()

    # 1. Find every speaker tag occurrence in the entire text (line-start OR mid-line).
    tags = _find_speaker_tags(text)
    if not tags:
        # No tags at all — treat entire text as Speaker 1
        return [{"speaker": 1, "text": text}]

    # 2. Reserve every explicit "Speaker N" number up front so inline named
    #    characters (Mom, Wizard) don't steal those numbers.
    explicit_numbers = (_explicit_speaker_number(lbl) for _, _, lbl in tags)
    reserved_numbers = {n for n in explicit_numbers if n is not None}
    named_to_speaker: dict[str, int] = {}

    def speaker_for(label: str) -> int:
        # "Speaker N" preserves its number; named labels get auto-assigned.
        explicit = _explicit_speaker_number(label)
        if explicit is not None:
            return explicit
        # Key on the normalized label so "Dr.  Smith" and "dr. smith" agree.
        key = _normalize_label(label)
        if key not in named_to_speaker:
            used = reserved_numbers | set(named_to_speaker.values())
            free = (n for n in range(1, _MAX_SPEAKERS + 1) if n not in used)
            # Overflow: every further character shares the last slot.
            named_to_speaker[key] = next(free, _MAX_SPEAKERS)
        return named_to_speaker[key]

    # 3. Walk tags and slice out each turn's text (from end-of-tag to start-of-next-tag).
    #    Any leading text before the first tag is ignored (usually empty / title residue).
    next_starts = [start for start, _, _ in tags[1:]] + [len(text)]
    for (_, end, label), next_start in zip(tags, next_starts):
        body = re.sub(r"\s+", " ", text[end:next_start].strip())
        if not body:
            continue
        turns.append({"speaker": speaker_for(label), "text": body})

    return turns
173
 
 
230
  - Each turn is separated by a blank line
231
  - Choose the right number of speakers for the scenario (1 to 4 max)
232
  - Keep the total script under {max_words} words
233
+ - Output ONLY the title and script — no stage directions, no commentary, no preamble
234
+
235
+ CRITICAL — ONE SPEAKER PER TURN:
236
+ - NEVER embed another character's dialogue inside someone else's turn
237
+ - WRONG: "Speaker 1: We need magic. Mom: Hey kids, what's going on?"
238
+ - RIGHT: Every time the speaker changes, END the current turn, add a BLANK LINE, then start a NEW turn with "Speaker N:" on its own line
239
+ - Do NOT use character names as inline labels like "Mom:" or "Wizard:" mid-paragraph — always use "Speaker N:" on a fresh line"""
240
 
241
 
242
  # Strip bracketed stage directions, parenthetical cues, and asterisk actions.
tests/example_prompts.md ADDED
@@ -0,0 +1,64 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Example Prompts
2
+
3
+ A grab-bag of prompts covering different tones, speaker counts, and edge cases. Paste any of these into the app's prompt box to try it.
4
+
5
+ Each example also serves as a manual regression test — after generating, verify:
6
+ - [ ] Correct number of unique speaker tags (matches "Expected speakers" below)
7
+ - [ ] No `[brackets]`, `(parentheticals)`, or `*asterisks*` leaked into the audio
8
+ - [ ] Each speaker change is on its own turn (no "Mom: ..." buried inside another speaker's paragraph)
9
+
10
+ ---
11
+
12
+ ## 1. LARP Interruption (comedy, 3 speakers — includes a late arrival)
13
+
14
+ > A Wizard and Orc arguing about which spell is most powerful against dragons. Suddenly, their Mom comes downstairs into the basement to interrupt their LARPing session. Funny, humorous.
15
+
16
+ **Expected speakers:** 3 (Wizard, Orc, Mom)
17
+ **Why it's a good test:** Mom arrives mid-scene — earlier parser versions missed this and merged her lines into the previous speaker's turn.
18
+
19
+ ---
20
+
21
+ ## 2. Product Strategy Meeting (business, 4 speakers)
22
+
23
+ > A 4-person product meeting at a SaaS startup debating whether to raise prices. The CEO wants to raise, the CFO is cautious, the Head of Product worries about churn, and a customer success lead shares real user feedback.
24
+
25
+ **Expected speakers:** 4
26
+ **Why it's a good test:** Clear multi-role professional dialogue — validates that 4 distinct speakers are produced without drift.
27
+
28
+ ---
29
+
30
+ ## 3. Solo TED Talk (monologue, 1 speaker)
31
+
32
+ > A passionate 5-minute TED-style talk by a neuroscientist explaining why boredom is secretly the most creative mental state, with specific examples and a call to action.
33
+
34
+ **Expected speakers:** 1
35
+ **Why it's a good test:** Verifies that long-form single-speaker content works and the parser doesn't hallucinate extra speakers.
36
+
37
+ ---
38
+
39
+ ## 4. Detective Interrogation (dramatic, 3 speakers)
40
+
41
+ > A hard-boiled detective interrogates a nervous suspect in a small-town murder case, while the suspect's lawyer repeatedly objects and tries to end the interview. Tense, back-and-forth.
42
+
43
+ **Expected speakers:** 3 (Detective, Suspect, Lawyer)
44
+ **Why it's a good test:** Tests dramatic tone and ensures the LLM doesn't resort to forbidden stage directions like `[slams table]` or `(nervously)`.
45
+
46
+ ---
47
+
48
+ ## 5. Podcast Interview with Dog Expert (casual, 2 speakers)
49
+
50
+ > A casual 10-minute podcast interview where the host asks a dog behaviorist why dogs tilt their heads when you talk to them, with lots of tangents, personal stories, and funny dog examples.
51
+
52
+ **Expected speakers:** 2 (Host, Expert)
53
+ **Why it's a good test:** Natural conversational flow, tests that the LLM uses in-dialogue emotion ("haha", "oh wow") instead of stage directions.
54
+
55
+ ---
56
+
57
+ ## Running the parser tests
58
+
59
+ ```bash
60
+ cd /path/to/Conference-Generator-VibeVoice
61
+ python tests/test_script_parser.py
62
+ # or:
63
+ python -m unittest tests.test_script_parser -v
64
+ ```
tests/test_script_parser.py ADDED
@@ -0,0 +1,190 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Tests for script parsing & sanitization logic.
2
+
3
+ These tests verify two things VibeVoice users care about:
4
+ 1. Every character in the prompt gets its own speaker number — even when
5
+ the LLM embeds a late-arriving character's line inside another speaker's turn.
6
+ 2. Stage directions ([whispering], (sighs), *laughs*) are stripped, because
7
+ VibeVoice reads them literally.
8
+
9
+ Run:
10
+ python -m pytest tests/
11
+ # or:
12
+ python tests/test_script_parser.py
13
+ """
14
+ import os
15
+ import sys
16
+ import unittest
17
+
18
+ # Allow `python tests/test_script_parser.py` from repo root
19
+ sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
20
+
21
+ # Stub HF_TOKEN so importing app.py doesn't complain
22
+ os.environ.setdefault("HF_TOKEN", "test-token-placeholder")
23
+
24
+ # Import the two functions under test. We import directly without executing Modal
25
+ # connection by reading the file up to that section. Simpler: import normally;
26
+ # Modal-connection failure is caught in app.py itself.
27
+ from app import parse_script_to_turns, sanitize_dialogue, turns_to_script
28
+
29
+
30
class TestSanitizeDialogue(unittest.TestCase):
    """Checks which cues sanitize_dialogue removes and which text it leaves alone."""

    def _check(self, cases):
        # Each case is (raw input, expected sanitized output).
        for raw, expected in cases:
            with self.subTest(raw=raw):
                self.assertEqual(sanitize_dialogue(raw), expected)

    def test_strips_bracketed_stage_directions(self):
        self._check([
            ("[whispering] Come closer, my child.", "Come closer, my child."),
            ("Ugh [door slams] she's here.", "Ugh she's here."),
        ])

    def test_strips_asterisk_actions(self):
        self._check([
            ("*laughs* Oh man, that's wild!", "Oh man, that's wild!"),
        ])

    def test_strips_paren_emotion_cues(self):
        self._check([
            ("(softly) Mom is coming!", "Mom is coming!"),
            (
                "I can't believe it (sighs) you really did it.",
                "I can't believe it you really did it.",
            ),
        ])

    def test_preserves_legitimate_asides(self):
        # A real parenthetical aside must come back unchanged.
        aside = "The spell (which took years to learn) is incredible."
        self._check([(aside, aside)])

    def test_preserves_inline_emotion_words(self):
        # Spoken interjections ("Hahaha", "Ugh", "Whoa") are real dialogue.
        line = "Hahaha you wish, Orc!"
        self._check([(line, line)])
69
+ )
70
+
71
+
72
class TestParseScriptToTurns(unittest.TestCase):
    """Unit tests for parse_script_to_turns: tag detection and speaker numbering."""

    def test_basic_two_speaker_script(self):
        script = """Speaker 1: Hello there.

Speaker 2: General Kenobi.

Speaker 1: You are a bold one."""
        turns = parse_script_to_turns(script)
        self.assertEqual(len(turns), 3)
        self.assertEqual(turns[0], {"speaker": 1, "text": "Hello there."})
        self.assertEqual(turns[1], {"speaker": 2, "text": "General Kenobi."})
        self.assertEqual(turns[2], {"speaker": 1, "text": "You are a bold one."})

    def test_detects_inline_character_tag_as_new_speaker(self):
        """Regression: LLM embeds 'Mom:' inside Speaker 1's turn.
        Parser should split it out and assign Mom her own speaker number."""
        script = (
            "Speaker 1: We need magic, pure and simple. "
            "Mom: Hey kids! What's all this racket down here?\n\n"
            "Speaker 2: Oh hi Mom!"
        )
        turns = parse_script_to_turns(script)
        speakers = {t["speaker"] for t in turns}
        self.assertEqual(len(turns), 3)
        self.assertEqual(speakers, {1, 2, 3})  # Mom becomes Speaker 3
        self.assertEqual(turns[0]["text"], "We need magic, pure and simple.")
        self.assertIn("What's all this racket", turns[1]["text"])
        self.assertEqual(turns[1]["speaker"], 3)

    def test_named_characters_only(self):
        """Pure named-character script (no 'Speaker N:') should still parse."""
        script = (
            "Wizard: I'll cast Meteor Swarm.\n\n"
            "Orc: Bah! Swords are better.\n\n"
            "Mom: Dinner's ready!"
        )
        turns = parse_script_to_turns(script)
        self.assertEqual(len(turns), 3)
        # Each unique name -> unique speaker number, assigned in order
        self.assertEqual(turns[0]["speaker"], 1)  # Wizard
        self.assertEqual(turns[1]["speaker"], 2)  # Orc
        self.assertEqual(turns[2]["speaker"], 3)  # Mom

    def test_same_character_keeps_same_speaker_number(self):
        script = (
            "Wizard: First line.\n\n"
            "Orc: Second line.\n\n"
            "Wizard: Third line — wizard again."
        )
        turns = parse_script_to_turns(script)
        self.assertEqual(turns[0]["speaker"], turns[2]["speaker"])
        self.assertNotEqual(turns[0]["speaker"], turns[1]["speaker"])

    def test_caps_at_four_speakers(self):
        """Auto-numbered characters never exceed slot 4; a fifth name shares it."""
        script = (
            "Alice: One.\n\n"
            "Bob: Two.\n\n"
            "Carol: Three.\n\n"
            "Dave: Four.\n\n"
            "Eve: Five."  # no free slot left in 1..4 -> shares speaker 4
        )
        turns = parse_script_to_turns(script)
        self.assertEqual([t["speaker"] for t in turns], [1, 2, 3, 4, 4])

    def test_explicit_speaker_numbers_are_preserved(self):
        """Explicit 'Speaker N' tags keep their number, even above 4."""
        script = (
            "Speaker 1: One.\n\n"
            "Speaker 2: Two.\n\n"
            "Speaker 3: Three.\n\n"
            "Speaker 4: Four.\n\n"
            "Speaker 5: Five."
        )
        turns = parse_script_to_turns(script)
        self.assertEqual([t["speaker"] for t in turns], [1, 2, 3, 4, 5])

    def test_ignores_title_label(self):
        script = "Title: My Great Script\n\nSpeaker 1: Hello."
        turns = parse_script_to_turns(script)
        self.assertEqual(len(turns), 1)
        self.assertEqual(turns[0]["speaker"], 1)

    def test_empty_script(self):
        self.assertEqual(parse_script_to_turns(""), [])
        self.assertEqual(parse_script_to_turns("  \n\n  "), [])

    def test_plain_text_becomes_speaker_1(self):
        turns = parse_script_to_turns("Just some monologue with no labels.")
        self.assertEqual(len(turns), 1)
        self.assertEqual(turns[0]["speaker"], 1)
151
+
152
+
153
class TestIntegration(unittest.TestCase):
    """End-to-end: dirty LLM output -> parsed and sanitized turns."""

    def test_wizard_orc_mom_scenario(self):
        """The exact failure case the user reported."""
        dirty_script = (
            "Speaker 1: Oh come on, Orc, you're exaggerating. (laughs) "
            "We need magic, pure and simple. Mom: Hey there, you two! "
            "What's all this racket down here?\n\n"
            "Speaker 2: [sighs] Yeah, Mom, Wizard wants to use Wall of Force. "
            "Mom: Oh boy, you guys really are getting carried away."
        )

        # Parse, sanitize each turn, and drop turns that end up empty.
        turns = []
        for parsed in parse_script_to_turns(dirty_script):
            cleaned = sanitize_dialogue(parsed["text"])
            if cleaned:
                turns.append({"speaker": parsed["speaker"], "text": cleaned})

        speakers = {t["speaker"] for t in turns}
        self.assertEqual(len(speakers), 3, f"Expected 3 speakers, got {speakers}: {turns}")

        # Neither stage directions nor the inline "Mom:" tag may survive in any turn.
        all_text = " ".join(t["text"] for t in turns)
        for leaked in ("[sighs]", "(laughs)", "Mom:"):
            self.assertNotIn(leaked, all_text)

    def test_round_trip_preserves_structure(self):
        """Rendering turns to a script and parsing it back yields the same turns."""
        original_turns = [
            {"speaker": 1, "text": "First thing."},
            {"speaker": 2, "text": "Second thing."},
            {"speaker": 1, "text": "Back to me."},
        ]
        reparsed = parse_script_to_turns(turns_to_script(original_turns))
        self.assertEqual(original_turns, reparsed)
187
+
188
+
189
+ if __name__ == "__main__":
190
+ unittest.main(verbosity=2)