Commit ·
b504537
1
Parent(s): 29b2e23
Fix inline speaker splitting + add parser tests
and example prompts.
The LLM sometimes embeds late-arriving characters
(Mom:, Wizard:) inside another speaker's turn
instead of starting a new Speaker N line. Parser
now detects both forms of tags, splits them
correctly, and assigns each unique character its
own speaker number while preserving explicit
Speaker N slots. Adds 15 unit tests covering the
regression and 5 example prompts for manual
verification.
- app.py +90 -20
- tests/example_prompts.md +64 -0
- tests/test_script_parser.py +190 -0
app.py
CHANGED
|
@@ -79,31 +79,95 @@ EXAMPLE_SCRIPTS, EXAMPLE_SCRIPTS_NATURAL = load_example_scripts()
|
|
| 79 |
|
| 80 |
# --- Script parsing helpers ---
|
| 81 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 82 |
def parse_script_to_turns(script_text: str) -> list[dict]:
|
| 83 |
-
turns
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 84 |
if not script_text or not script_text.strip():
|
| 85 |
return turns
|
| 86 |
|
| 87 |
-
|
| 88 |
-
|
| 89 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 90 |
|
| 91 |
-
|
| 92 |
-
|
|
|
|
| 93 |
if m:
|
| 94 |
-
|
| 95 |
-
|
| 96 |
-
|
| 97 |
-
|
| 98 |
-
|
| 99 |
-
|
| 100 |
-
|
| 101 |
-
|
| 102 |
-
|
| 103 |
-
|
| 104 |
-
|
| 105 |
-
|
| 106 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 107 |
|
| 108 |
return turns
|
| 109 |
|
|
@@ -166,7 +230,13 @@ FORMAT RULES:
|
|
| 166 |
- Each turn is separated by a blank line
|
| 167 |
- Choose the right number of speakers for the scenario (1 to 4 max)
|
| 168 |
- Keep the total script under {max_words} words
|
| 169 |
-
- Output ONLY the title and script — no stage directions, no commentary, no preamble
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 170 |
|
| 171 |
|
| 172 |
# Strip bracketed stage directions, parenthetical cues, and asterisk actions.
|
|
|
|
# --- Script parsing helpers ---

# Matches "Speaker 3:" or a named character tag like "Mom:", "Dr. Smith:", "Wizard:"
# at the start of a line OR inline mid-paragraph.  Group 1 captures the label.
_SPEAKER_TAG = re.compile(
    r"(?:^|(?<=[\s\"'.!?,—–\-]))"                  # boundary: line start or after whitespace/punct
    r"(Speaker\s+\d+|[A-Z][A-Za-z.'\- ]{0,24}?)"   # label: "Speaker N" OR a short capitalized name
    r"\s*:\s+"                                     # the colon separator
    # A turn may open with real dialogue (capital letter / quote) OR with a
    # stage direction ("[sighs]", "(laughs)", "*gasps*") that sanitize_dialogue
    # strips later.  Without ( [ * in this lookahead, a tag such as
    # "Speaker 2: [sighs] ..." was silently skipped and its text got absorbed
    # into the previous speaker's turn.
    r"(?=[A-Z\"'(\[*])",
    re.MULTILINE,
)

# Labels we should NEVER treat as speaker tags (common screenplay false positives).
_LABEL_BLOCKLIST = {
    "title", "note", "scene", "setting", "fade in", "fade out", "cut to",
    "interior", "exterior", "int", "ext", "cont", "continued", "act",
}

# Capitalized sentence-starters that occasionally precede a colon but are not
# character names.  Hoisted to module level so it isn't rebuilt per tag.
_SENTENCE_STARTERS = {"well", "so", "okay", "yes", "no", "right", "look", "listen"}


def _normalize_label(label: str) -> str:
    """Canonical form of a speaker label: collapse whitespace runs, lowercase."""
    return re.sub(r"\s+", " ", label).strip().lower()


def parse_script_to_turns(script_text: str) -> list[dict]:
    """Parse dialogue into turns, handling both 'Speaker N:' and named-character tags.

    Robust to the LLM slipping in mid-paragraph speaker changes like:
        Speaker 1: ...We need magic. Mom: Hey kids! ...
    which get split into separate turns, with 'Mom' mapped to its own speaker number.

    Args:
        script_text: Raw script text as produced by the LLM.

    Returns:
        List of ``{"speaker": int, "text": str}`` dicts in script order.
        Empty/blank input yields ``[]``; text with no recognizable tags
        becomes a single Speaker 1 turn.
    """
    turns: list[dict] = []
    if not script_text or not script_text.strip():
        return turns

    text = script_text.strip()

    # 1. Find every speaker-tag occurrence in the entire text (line-start OR mid-line).
    tags: list[tuple[int, int, str]] = []  # (match start, match end, label)
    for m in _SPEAKER_TAG.finditer(text):
        label = m.group(1).strip()
        norm = _normalize_label(label)
        if norm in _LABEL_BLOCKLIST:
            continue
        # Reject labels that are just common sentence-starters before a colon.
        if norm in _SENTENCE_STARTERS:
            continue
        tags.append((m.start(), m.end(), label))

    if not tags:
        # No tags at all — treat the entire text as Speaker 1.
        return [{"speaker": 1, "text": text}]

    # 2. Assign each unique label to a speaker number.
    #    First, reserve slots for all explicit "Speaker N" numbers in the script,
    #    so inline named characters (Mom, Wizard) don't steal those numbers.
    label_to_speaker: dict[str, int] = {}  # normalized label -> speaker number
    reserved_numbers: set[int] = set()
    for _, _, lbl in tags:
        m = re.match(r"speaker\s+(\d+)", lbl, re.IGNORECASE)
        if m:
            reserved_numbers.add(int(m.group(1)))

    def speaker_for(label: str) -> int:
        # "Speaker N" preserves its number; named labels get auto-assigned.
        # Keys are normalized so "Dr.  Smith" and "Dr. Smith" share ONE slot
        # (previously the raw lowercase label was used, splitting them in two).
        key = _normalize_label(label)
        m = re.match(r"speaker\s+(\d+)", key)
        if m:
            n = int(m.group(1))
            label_to_speaker.setdefault(key, n)
            return n
        if key in label_to_speaker:
            return label_to_speaker[key]
        # Find the next available speaker number (1..4), skipping reserved & used.
        used = set(label_to_speaker.values()) | reserved_numbers
        for n in range(1, 5):
            if n not in used:
                label_to_speaker[key] = n
                return n
        # Overflow: more than four distinct voices — fold extras into slot 4.
        label_to_speaker[key] = 4
        return 4

    # 3. Walk tags and slice out each turn's text (end-of-tag to start-of-next-tag).
    #    Any leading text before the first tag is ignored (usually title residue).
    for i, (_, end, label) in enumerate(tags):
        next_start = tags[i + 1][0] if i + 1 < len(tags) else len(text)
        body = re.sub(r"\s+", " ", text[end:next_start].strip())
        if not body:
            continue
        turns.append({"speaker": speaker_for(label), "text": body})

    return turns
|
| 230 |
- Each turn is separated by a blank line
|
| 231 |
- Choose the right number of speakers for the scenario (1 to 4 max)
|
| 232 |
- Keep the total script under {max_words} words
|
| 233 |
+
- Output ONLY the title and script — no stage directions, no commentary, no preamble
|
| 234 |
+
|
| 235 |
+
CRITICAL — ONE SPEAKER PER TURN:
|
| 236 |
+
- NEVER embed another character's dialogue inside someone else's turn
|
| 237 |
+
- WRONG: "Speaker 1: We need magic. Mom: Hey kids, what's going on?"
|
| 238 |
+
- RIGHT: Every time the speaker changes, END the current turn, add a BLANK LINE, then start a NEW turn with "Speaker N:" on its own line
|
| 239 |
+
- Do NOT use character names as inline labels like "Mom:" or "Wizard:" mid-paragraph — always use "Speaker N:" on a fresh line"""
|
| 240 |
|
| 241 |
|
| 242 |
# Strip bracketed stage directions, parenthetical cues, and asterisk actions.
|
tests/example_prompts.md
ADDED
|
@@ -0,0 +1,64 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Example Prompts
|
| 2 |
+
|
| 3 |
+
A grab-bag of prompts covering different tones, speaker counts, and edge cases. Paste any of these into the app's prompt box to try it.
|
| 4 |
+
|
| 5 |
+
Each example also serves as a manual regression test — after generating, verify:
|
| 6 |
+
- [ ] Correct number of unique speaker tags (matches "Expected speakers" below)
|
| 7 |
+
- [ ] No `[brackets]`, `(parentheticals)`, or `*asterisks*` leaked into the audio
|
| 8 |
+
- [ ] Each speaker change is on its own turn (no "Mom: ..." buried inside another speaker's paragraph)
|
| 9 |
+
|
| 10 |
+
---
|
| 11 |
+
|
| 12 |
+
## 1. LARP Interruption (comedy, 3 speakers — includes a late arrival)
|
| 13 |
+
|
| 14 |
+
> A Wizard and Orc arguing about which spell is most powerful against dragons. Suddenly, their Mom comes downstairs into the basement to interrupt their LARPing session. Funny, humorous.
|
| 15 |
+
|
| 16 |
+
**Expected speakers:** 3 (Wizard, Orc, Mom)
|
| 17 |
+
**Why it's a good test:** Mom arrives mid-scene — earlier parser versions missed this and merged her lines into the previous speaker's turn.
|
| 18 |
+
|
| 19 |
+
---
|
| 20 |
+
|
| 21 |
+
## 2. Product Strategy Meeting (business, 4 speakers)
|
| 22 |
+
|
| 23 |
+
> A 4-person product meeting at a SaaS startup debating whether to raise prices. The CEO wants to raise, the CFO is cautious, the Head of Product worries about churn, and a customer success lead shares real user feedback.
|
| 24 |
+
|
| 25 |
+
**Expected speakers:** 4
|
| 26 |
+
**Why it's a good test:** Clear multi-role professional dialogue — validates that 4 distinct speakers are produced without drift.
|
| 27 |
+
|
| 28 |
+
---
|
| 29 |
+
|
| 30 |
+
## 3. Solo TED Talk (monologue, 1 speaker)
|
| 31 |
+
|
| 32 |
+
> A passionate 5-minute TED-style talk by a neuroscientist explaining why boredom is secretly the most creative mental state, with specific examples and a call to action.
|
| 33 |
+
|
| 34 |
+
**Expected speakers:** 1
|
| 35 |
+
**Why it's a good test:** Verifies that long-form single-speaker content works and the parser doesn't hallucinate extra speakers.
|
| 36 |
+
|
| 37 |
+
---
|
| 38 |
+
|
| 39 |
+
## 4. Detective Interrogation (dramatic, 3 speakers)
|
| 40 |
+
|
| 41 |
+
> A hard-boiled detective interrogates a nervous suspect in a small-town murder case, while the suspect's lawyer repeatedly objects and tries to end the interview. Tense, back-and-forth.
|
| 42 |
+
|
| 43 |
+
**Expected speakers:** 3 (Detective, Suspect, Lawyer)
|
| 44 |
+
**Why it's a good test:** Tests dramatic tone and ensures the LLM doesn't resort to forbidden stage directions like `[slams table]` or `(nervously)`.
|
| 45 |
+
|
| 46 |
+
---
|
| 47 |
+
|
| 48 |
+
## 5. Podcast Interview with Dog Expert (casual, 2 speakers)
|
| 49 |
+
|
| 50 |
+
> A casual 10-minute podcast interview where the host asks a dog behaviorist why dogs tilt their heads when you talk to them, with lots of tangents, personal stories, and funny dog examples.
|
| 51 |
+
|
| 52 |
+
**Expected speakers:** 2 (Host, Expert)
|
| 53 |
+
**Why it's a good test:** Natural conversational flow, tests that the LLM uses in-dialogue emotion ("haha", "oh wow") instead of stage directions.
|
| 54 |
+
|
| 55 |
+
---
|
| 56 |
+
|
| 57 |
+
## Running the parser tests
|
| 58 |
+
|
| 59 |
+
```bash
|
| 60 |
+
cd /path/to/Conference-Generator-VibeVoice
|
| 61 |
+
python tests/test_script_parser.py
|
| 62 |
+
# or:
|
| 63 |
+
python -m unittest tests.test_script_parser -v
|
| 64 |
+
```
|
tests/test_script_parser.py
ADDED
|
@@ -0,0 +1,190 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Tests for script parsing & sanitization logic.
|
| 2 |
+
|
| 3 |
+
These tests verify two things VibeVoice users care about:
|
| 4 |
+
1. Every character in the prompt gets its own speaker number — even when
|
| 5 |
+
the LLM embeds a late-arriving character's line inside another speaker's turn.
|
| 6 |
+
2. Stage directions ([whispering], (sighs), *laughs*) are stripped, because
|
| 7 |
+
VibeVoice reads them literally.
|
| 8 |
+
|
| 9 |
+
Run:
|
| 10 |
+
python -m pytest tests/
|
| 11 |
+
# or:
|
| 12 |
+
python tests/test_script_parser.py
|
| 13 |
+
"""
|
| 14 |
+
import os
|
| 15 |
+
import sys
|
| 16 |
+
import unittest
|
| 17 |
+
|
| 18 |
+
# Allow `python tests/test_script_parser.py` from repo root
|
| 19 |
+
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
|
| 20 |
+
|
| 21 |
+
# Stub HF_TOKEN so importing app.py doesn't complain
|
| 22 |
+
os.environ.setdefault("HF_TOKEN", "test-token-placeholder")
|
| 23 |
+
|
| 24 |
+
# Import the two functions under test. We import directly without executing Modal
|
| 25 |
+
# connection by reading the file up to that section. Simpler: import normally;
|
| 26 |
+
# Modal-connection failure is caught in app.py itself.
|
| 27 |
+
from app import parse_script_to_turns, sanitize_dialogue, turns_to_script
|
| 28 |
+
|
| 29 |
+
|
| 30 |
+
class TestSanitizeDialogue(unittest.TestCase):
    """sanitize_dialogue must remove stage directions while leaving real dialogue intact."""

    def _check(self, raw: str, expected: str) -> None:
        # One sanitize round-trip per case keeps the tests table-driven.
        self.assertEqual(sanitize_dialogue(raw), expected)

    def test_strips_bracketed_stage_directions(self):
        """Leading and embedded [bracketed] cues are removed."""
        self._check("[whispering] Come closer, my child.", "Come closer, my child.")
        self._check("Ugh [door slams] she's here.", "Ugh she's here.")

    def test_strips_asterisk_actions(self):
        """*action* markers are removed."""
        self._check("*laughs* Oh man, that's wild!", "Oh man, that's wild!")

    def test_strips_paren_emotion_cues(self):
        """Short (emotion) cues are removed wherever they appear."""
        self._check("(softly) Mom is coming!", "Mom is coming!")
        self._check(
            "I can't believe it (sighs) you really did it.",
            "I can't believe it you really did it.",
        )

    def test_preserves_legitimate_asides(self):
        """Real parenthetical asides should NOT be stripped."""
        self._check(
            "The spell (which took years to learn) is incredible.",
            "The spell (which took years to learn) is incredible.",
        )

    def test_preserves_inline_emotion_words(self):
        """Spoken interjections ("Hahaha", "Ugh", "Whoa") are real dialogue and stay."""
        self._check("Hahaha you wish, Orc!", "Hahaha you wish, Orc!")
+
|
| 72 |
+
class TestParseScriptToTurns(unittest.TestCase):
    """Unit tests for parse_script_to_turns: tag detection, splitting, numbering."""

    def test_basic_two_speaker_script(self):
        script = """Speaker 1: Hello there.

Speaker 2: General Kenobi.

Speaker 1: You are a bold one."""
        turns = parse_script_to_turns(script)
        self.assertEqual(len(turns), 3)
        self.assertEqual(turns[0], {"speaker": 1, "text": "Hello there."})
        self.assertEqual(turns[1], {"speaker": 2, "text": "General Kenobi."})
        self.assertEqual(turns[2], {"speaker": 1, "text": "You are a bold one."})

    def test_detects_inline_character_tag_as_new_speaker(self):
        """Regression: LLM embeds 'Mom:' inside Speaker 1's turn.

        Parser should split it out and assign Mom her own speaker number."""
        script = (
            "Speaker 1: We need magic, pure and simple. "
            "Mom: Hey kids! What's all this racket down here?\n\n"
            "Speaker 2: Oh hi Mom!"
        )
        turns = parse_script_to_turns(script)
        speakers = {t["speaker"] for t in turns}
        self.assertEqual(len(turns), 3)
        self.assertEqual(speakers, {1, 2, 3})  # Mom becomes Speaker 3
        self.assertEqual(turns[0]["text"], "We need magic, pure and simple.")
        self.assertIn("What's all this racket", turns[1]["text"])
        self.assertEqual(turns[1]["speaker"], 3)

    def test_named_characters_only(self):
        """Pure named-character script (no 'Speaker N:') should still parse."""
        script = (
            "Wizard: I'll cast Meteor Swarm.\n\n"
            "Orc: Bah! Swords are better.\n\n"
            "Mom: Dinner's ready!"
        )
        turns = parse_script_to_turns(script)
        self.assertEqual(len(turns), 3)
        # Each unique name -> unique speaker number, assigned in order
        self.assertEqual(turns[0]["speaker"], 1)  # Wizard
        self.assertEqual(turns[1]["speaker"], 2)  # Orc
        self.assertEqual(turns[2]["speaker"], 3)  # Mom

    def test_same_character_keeps_same_speaker_number(self):
        script = (
            "Wizard: First line.\n\n"
            "Orc: Second line.\n\n"
            "Wizard: Third line — wizard again."
        )
        turns = parse_script_to_turns(script)
        self.assertEqual(turns[0]["speaker"], turns[2]["speaker"])
        self.assertNotEqual(turns[0]["speaker"], turns[1]["speaker"])

    def test_caps_at_four_speakers(self):
        """Five distinct *named* characters: the fifth is folded into slot 4.

        The previous version fed explicit 'Speaker 5:' tags (which the parser
        preserves by design) and asserted max <= 5, so the cap was never
        actually exercised despite the test's name.
        """
        script = (
            "Wizard: One.\n\n"
            "Orc: Two.\n\n"
            "Mom: Three.\n\n"
            "Dad: Four.\n\n"
            "Dog: Five."
        )
        turns = parse_script_to_turns(script)
        self.assertEqual([t["speaker"] for t in turns], [1, 2, 3, 4, 4])

    def test_preserves_explicit_speaker_numbers(self):
        """Explicit 'Speaker N:' tags keep their number, even above 4."""
        script = "Speaker 2: Hi there.\n\nSpeaker 5: Still number five."
        turns = parse_script_to_turns(script)
        self.assertEqual([t["speaker"] for t in turns], [2, 5])

    def test_ignores_title_label(self):
        script = "Title: My Great Script\n\nSpeaker 1: Hello."
        turns = parse_script_to_turns(script)
        self.assertEqual(len(turns), 1)
        self.assertEqual(turns[0]["speaker"], 1)

    def test_empty_script(self):
        self.assertEqual(parse_script_to_turns(""), [])
        self.assertEqual(parse_script_to_turns("   \n\n  "), [])

    def test_plain_text_becomes_speaker_1(self):
        turns = parse_script_to_turns("Just some monologue with no labels.")
        self.assertEqual(len(turns), 1)
        self.assertEqual(turns[0]["speaker"], 1)
+
|
| 152 |
+
|
| 153 |
+
class TestIntegration(unittest.TestCase):
    """End-to-end: dirty LLM output -> parsed and sanitized turns."""

    def test_wizard_orc_mom_scenario(self):
        """The exact failure case the user reported."""
        raw = (
            "Speaker 1: Oh come on, Orc, you're exaggerating. (laughs) "
            "We need magic, pure and simple. Mom: Hey there, you two! "
            "What's all this racket down here?\n\n"
            "Speaker 2: [sighs] Yeah, Mom, Wizard wants to use Wall of Force. "
            "Mom: Oh boy, you guys really are getting carried away."
        )
        cleaned = []
        for turn in parse_script_to_turns(raw):
            spoken = sanitize_dialogue(turn["text"])
            if spoken:  # drop turns whose text was entirely stage direction
                cleaned.append({"speaker": turn["speaker"], "text": spoken})

        speakers = {turn["speaker"] for turn in cleaned}
        turns = cleaned
        self.assertEqual(len(speakers), 3, f"Expected 3 speakers, got {speakers}: {turns}")

        # No stage directions survived
        all_text = " ".join(turn["text"] for turn in cleaned)
        self.assertNotIn("[sighs]", all_text)
        self.assertNotIn("(laughs)", all_text)
        self.assertNotIn("Mom:", all_text)  # Mom tag was extracted into its own turn

    def test_round_trip_preserves_structure(self):
        """Rendering turns to script text and reparsing yields identical turns."""
        source_turns = [
            {"speaker": 1, "text": "First thing."},
            {"speaker": 2, "text": "Second thing."},
            {"speaker": 1, "text": "Back to me."},
        ]
        rendered = turns_to_script(source_turns)
        self.assertEqual(source_turns, parse_script_to_turns(rendered))


if __name__ == "__main__":
    unittest.main(verbosity=2)