Spaces:

DataQuests
/

DeepCritical

Running

App Files Files Community

DeepCritical / deployments /modal_tts.py

SeasonalFall84

Add TTS on-demand with UI credentials, improve UI layout, and fix References removal

b4f9ff5 4 days ago

raw

history blame contribute delete

2.77 kB

	"""Deploy Kokoro TTS function to Modal.

	This script deploys the TTS function to Modal so it can be called
	from the main DeepCritical application.

	Usage:
	modal deploy deployments/modal_tts.py

	After deployment, the function will be available at:
	App: deepcritical-tts
	Function: kokoro_tts_function
	"""

	import modal
	import numpy as np

	# Modal application under which the TTS function is registered
	app = modal.App("deepcritical-tts")

	# Packages installed into the image before Kokoro itself
	KOKORO_DEPENDENCIES = ["torch>=2.0.0", "transformers>=4.30.0", "numpy<2.0"]

	# Build the container image one layer at a time:
	# Debian slim (Python 3.11) -> git -> pinned dependencies -> Kokoro from GitHub.
	tts_image = modal.Image.debian_slim(python_version="3.11")
	tts_image = tts_image.apt_install("git")  # git must exist before pip installs from GitHub
	tts_image = tts_image.pip_install(*KOKORO_DEPENDENCIES)
	tts_image = tts_image.pip_install("git+https://github.com/hexgrad/kokoro.git")


	# Per-process cache for the Kokoro model and the per-language pipelines, so the
	# expensive initialisation runs once per container process rather than once
	# per call (this only helps when the same container serves several calls).
	_KOKORO_CACHE: dict = {}


	@app.function(
	    image=tts_image,
	    gpu="T4",
	    timeout=60,
	)
	def kokoro_tts_function(text: str, voice: str, speed: float) -> tuple[int, np.ndarray]:
	    """Synthesize speech for ``text`` with Kokoro on a Modal GPU worker.

	    Based on: https://huggingface.co/spaces/hexgrad/Kokoro-TTS

	    Args:
	        text: Text to synthesize.
	        voice: Voice ID (e.g., af_heart, af_bella, am_michael). Its first
	            character is passed to Kokoro as the language code.
	        speed: Speech speed multiplier (0.5-2.0).

	    Returns:
	        Tuple of (sample_rate, audio_array). The sample rate is always 24000.
	        The array is the concatenation of every generated chunk, or a single
	        float32 zero sample when no chunk was generated.

	    Raises:
	        RuntimeError: If torch/Kokoro cannot be imported, or if anything fails
	            during synthesis (including an empty ``voice``). The underlying
	            exception is chained as the cause.
	    """
	    import numpy as np

	    try:
	        import torch
	        from kokoro import KModel, KPipeline

	        # Fail with a readable message instead of "string index out of range".
	        # Raised inside the try so callers still see RuntimeError, as before.
	        if not voice:
	            raise ValueError("voice must be a non-empty voice ID")
	        lang_code = voice[0]

	        # Load the model once per process and reuse it on later calls.
	        model = _KOKORO_CACHE.get("model")
	        if model is None:
	            model = KModel().to("cuda").eval()
	            _KOKORO_CACHE["model"] = model

	        # One pipeline per language code. model=False keeps the pipeline to
	        # text -> phoneme conversion only; per the kokoro KPipeline API the
	        # default would load a second model and synthesize every chunk itself,
	        # duplicating the explicit model call below.
	        pipeline_key = ("pipeline", lang_code)
	        pipeline = _KOKORO_CACHE.get(pipeline_key)
	        if pipeline is None:
	            pipeline = KPipeline(lang_code=lang_code, model=False)
	            _KOKORO_CACHE[pipeline_key] = pipeline
	        pack = pipeline.load_voice(voice)

	        # Generate audio - accumulate all chunks
	        audio_chunks = []
	        with torch.no_grad():  # inference only; no autograd bookkeeping needed
	            for _, ps, _ in pipeline(text, voice, speed):
	                if not ps:
	                    # len(ps) - 1 would be -1 and silently pick the last
	                    # entry of the voice pack; nothing to synthesize anyway.
	                    continue
	                ref_s = pack[len(ps) - 1]
	                audio = model(ps, ref_s, speed)
	                # Move to host memory explicitly before converting to NumPy.
	                audio_chunks.append(audio.detach().cpu().numpy())

	        # Concatenate all audio chunks
	        if audio_chunks:
	            return (24000, np.concatenate(audio_chunks))

	        # If no audio generated, return a single silent sample
	        return (24000, np.zeros(1, dtype=np.float32))

	    except ImportError as e:
	        raise RuntimeError(
	            f"Kokoro not installed: {e}. "
	            "Install with: pip install git+https://github.com/hexgrad/kokoro.git"
	        ) from e
	    except Exception as e:
	        raise RuntimeError(f"TTS synthesis failed: {e}") from e


	# Optional: Add a test entrypoint
	@app.local_entrypoint()
	def test():
	    """Run one remote synthesis call and print the resulting rate and shape."""
	    print("Testing Modal TTS function...")
	    result = kokoro_tts_function.remote("Hello, this is a test.", "af_heart", 1.0)
	    rate_hz, waveform = result
	    print(f"Generated audio: {rate_hz}Hz, shape={waveform.shape}")
	    print("✓ TTS function works!")