Spaces:
Running
Running
File size: 2,773 Bytes
8e9e85e b4f9ff5 8e9e85e |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 |
"""Deploy Kokoro TTS function to Modal.

This script deploys the TTS function to Modal so it can be called
from the main DeepCritical application.

Usage:
    modal deploy deploy_modal_tts.py

After deployment, the function will be available at:
    App: deepcritical-tts
    Function: kokoro_tts_function
"""
import modal
import numpy as np
# Modal application that the TTS function below is registered on.
app = modal.App("deepcritical-tts")

# Packages pip-installed into the image before Kokoro itself.
KOKORO_DEPENDENCIES = ["torch>=2.0.0", "transformers>=4.30.0", "numpy<2.0"]

# Container image for the TTS function, built up one layer at a time.
# Each call returns a new image, so the name is simply rebound.
tts_image = modal.Image.debian_slim(python_version="3.11")
# git must be present before pip can install a package from a GitHub URL.
tts_image = tts_image.apt_install("git")
tts_image = tts_image.pip_install(*KOKORO_DEPENDENCIES)
tts_image = tts_image.pip_install("git+https://github.com/hexgrad/kokoro.git")
@app.function(
    image=tts_image,
    gpu="T4",
    timeout=60,
)
def kokoro_tts_function(text: str, voice: str, speed: float) -> tuple[int, np.ndarray]:
    """Synthesize speech with Kokoro inside a Modal GPU function.

    Based on: https://huggingface.co/spaces/hexgrad/Kokoro-TTS

    Args:
        text: Text to synthesize.
        voice: Voice ID (e.g., af_heart, af_bella, am_michael). Its first
            character is passed to ``KPipeline`` as the language code.
        speed: Speech speed multiplier (0.5-2.0).

    Returns:
        Tuple of (sample_rate, audio_array). The sample rate is always 24000.
        The audio array is the concatenation of every chunk the pipeline
        produced, or a single float32 zero when no chunk was produced.

    Raises:
        RuntimeError: If Kokoro cannot be imported, or if anything fails
            during synthesis (including an empty ``voice``). The original
            exception is chained as the cause.
    """
    import numpy as np

    # Keep the import check separate from synthesis so that only a genuinely
    # missing package is reported as "not installed".
    try:
        from kokoro import KModel, KPipeline
    except ImportError as e:
        raise RuntimeError(
            f"Kokoro not installed: {e}. "
            "Install with: pip install git+https://github.com/hexgrad/kokoro.git"
        ) from e

    try:
        if not voice:
            # voice[0] below would otherwise fail with an opaque IndexError.
            raise ValueError("voice must be a non-empty voice ID such as 'af_heart'")

        # The model is built on every invocation; nothing here caches it
        # between calls.
        model = KModel().to("cuda").eval()

        # model=False keeps the pipeline to text processing and voice loading,
        # as in the referenced Space. With the default, the pipeline would load
        # its own model and synthesize each chunk itself, duplicating the
        # explicit model call in the loop below.
        pipeline = KPipeline(lang_code=voice[0], model=False)
        pack = pipeline.load_voice(voice)

        # Generate audio - accumulate all chunks
        audio_chunks = []
        for _, ps, _ in pipeline(text, voice, speed):
            # The voice pack is indexed by phoneme-sequence length.
            ref_s = pack[len(ps) - 1]
            audio = model(ps, ref_s, speed)
            # Tensor.numpy() needs a CPU tensor that does not require grad;
            # detach()/cpu() are no-ops when that already holds.
            audio_chunks.append(audio.detach().cpu().numpy())

        # If no audio generated, return a one-sample silent placeholder
        if not audio_chunks:
            return (24000, np.zeros(1, dtype=np.float32))

        return (24000, np.concatenate(audio_chunks))
    except Exception as e:
        raise RuntimeError(f"TTS synthesis failed: {e}") from e
# Optional local entrypoint that calls the TTS function once
@app.local_entrypoint()
def test():
    """Invoke kokoro_tts_function remotely once and print the result's rate and shape."""
    print("Testing Modal TTS function...")

    # Positional arguments: text, voice, speed.
    call_args = ("Hello, this is a test.", "af_heart", 1.0)
    result = kokoro_tts_function.remote(*call_args)
    sample_rate, audio = result

    print(f"Generated audio: {sample_rate}Hz, shape={audio.shape}")
    print("✓ TTS function works!")
|