|
|
import gradio as gr |
|
|
import soundfile as sf |
|
|
import numpy as np |
|
|
from voxcpm import VoxCPM |
|
|
import tempfile |
|
|
import os |
|
|
import spaces |
|
|
|
|
|
|
|
|
# Load the pretrained VoxCPM checkpoint once at import time; the request
# handler below reuses this single module-level instance for every call.
model = VoxCPM.from_pretrained(
    "openbmb/VoxCPM-0.5B",
)
|
|
|
|
|
@spaces.GPU(duration=120)
def generate_speech(
    text,
    prompt_audio,
    prompt_text,
    cfg_value,
    inference_timesteps,
    normalize,
    denoise,
    retry_badcase,
    retry_badcase_max_times,
    retry_badcase_ratio_threshold,
):
    """Synthesize speech for ``text`` and return the path of a WAV file.

    Args:
        text: Text to synthesize. Empty or whitespace-only input is rejected
            with a UI warning.
        prompt_audio: File path of the optional reference audio, or ``None``
            when nothing was uploaded.
        prompt_text: Optional transcript of the reference audio. Blank input
            is treated as "not provided".
        cfg_value: Guidance value forwarded to ``model.generate``.
        inference_timesteps: Number of inference steps; coerced to ``int``.
        normalize: Forwarded to ``model.generate`` as ``normalize``.
        denoise: Forwarded to ``model.generate`` as ``denoise``.
        retry_badcase: Forwarded to ``model.generate`` as ``retry_badcase``.
        retry_badcase_max_times: Maximum number of retries; coerced to ``int``.
        retry_badcase_ratio_threshold: Forwarded to ``model.generate``
            unchanged.

    Returns:
        Path of a temporary ``.wav`` file holding the generated audio, or
        ``None`` when no text was supplied.

    Raises:
        gr.Error: If generation or writing the audio file fails. Gradio shows
            the message to the user.
    """
    # Reject missing, empty and whitespace-only text alike.
    if not text or not text.strip():
        gr.Warning("Please enter text to generate speech")
        return None

    # An untouched Gradio component may yield "" rather than None; normalise
    # both optional prompt inputs so "not provided" is always None.
    prompt_wav_path = prompt_audio or None
    if prompt_text is not None and not prompt_text.strip():
        prompt_text = None

    try:
        wav = model.generate(
            text=text,
            prompt_wav_path=prompt_wav_path,
            prompt_text=prompt_text,
            cfg_value=cfg_value,
            inference_timesteps=int(inference_timesteps),
            normalize=normalize,
            denoise=denoise,
            retry_badcase=retry_badcase,
            retry_badcase_max_times=int(retry_badcase_max_times),
            retry_badcase_ratio_threshold=retry_badcase_ratio_threshold,
        )

        # Reserve a unique file name, then close the handle before soundfile
        # reopens the path by name (an open handle blocks that on some
        # platforms). delete=False keeps the file so Gradio can serve it.
        with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp_file:
            output_path = tmp_file.name

        # NOTE(review): assumes the model emits 16 kHz audio — confirm
        # against the loaded checkpoint.
        sf.write(output_path, wav, 16000)
    except Exception as e:
        # gr.Error is an exception type: it only reaches the UI when raised.
        raise gr.Error(f"Error generating speech: {str(e)}") from e

    return output_path
|
|
|
|
|
|
|
|
# Declarative UI layout: inputs in the left column, generated audio and usage
# tips in the right column. The button click is wired up at the end.
with gr.Blocks(title="VoxCPM Text-to-Speech", theme=gr.themes.Soft()) as demo:
    # Page header.
    gr.Markdown(
        """
        # 🎙️ VoxCPM Text-to-Speech

        Generate highly expressive speech using VoxCPM-0.5B model. Optionally clone voices by providing reference audio.

        [Built with anycoder](https://huggingface.co/spaces/akhaliq/anycoder)
        """
    )

    with gr.Row():
        # ---- Left column: everything the user can configure ----
        with gr.Column(scale=1):
            text_input = gr.Textbox(
                label="Text to Synthesize",
                placeholder="Enter the text you want to convert to speech...",
                lines=3,
                value="VoxCPM is an innovative end-to-end TTS model from ModelBest, designed to generate highly expressive speech.",
            )

            # Optional reference audio and its transcript for voice cloning.
            with gr.Accordion("Voice Cloning", open=False):
                prompt_audio = gr.Audio(
                    label="Reference Audio (Upload a reference audio file for voice cloning)",
                    type="filepath",
                    sources=["upload"],
                )
                prompt_text = gr.Textbox(
                    label="Reference Text",
                    placeholder="Text corresponding to the reference audio",
                    lines=2,
                )

            # Generation knobs passed through to the speech handler.
            with gr.Accordion("Advanced Settings", open=False):
                cfg_value = gr.Slider(
                    minimum=0.5,
                    maximum=5.0,
                    value=2.0,
                    step=0.1,
                    label="CFG Value",
                    info="LM guidance on LocDiT, higher for better adherence to prompt",
                )

                inference_timesteps = gr.Slider(
                    minimum=5,
                    maximum=50,
                    value=10,
                    step=1,
                    label="Inference Timesteps",
                    info="Higher for better quality, lower for faster speed",
                )

                # Boolean toggles share one row.
                with gr.Row():
                    normalize = gr.Checkbox(
                        value=True,
                        label="Normalize",
                        info="Enable external TN tool",
                    )
                    denoise = gr.Checkbox(
                        value=True,
                        label="Denoise",
                        info="Enable external Denoise tool",
                    )
                    retry_badcase = gr.Checkbox(
                        value=True,
                        label="Retry Bad Cases",
                        info="Enable retrying for bad cases",
                    )

                # Numeric retry parameters share the next row.
                with gr.Row():
                    retry_badcase_max_times = gr.Number(
                        value=3,
                        minimum=1,
                        maximum=10,
                        step=1,
                        label="Max Retry Times",
                    )
                    retry_badcase_ratio_threshold = gr.Number(
                        value=6.0,
                        minimum=1.0,
                        maximum=10.0,
                        step=0.5,
                        label="Retry Ratio Threshold",
                    )

            generate_btn = gr.Button("🎵 Generate Speech", variant="primary", size="lg")

        # ---- Right column: result player and usage tips ----
        with gr.Column(scale=1):
            audio_output = gr.Audio(
                label="Generated Speech",
                type="filepath",
                autoplay=False,
            )

            gr.Markdown(
                """
                ### Tips:
                - For voice cloning, upload a clear reference audio (3-10 seconds recommended)
                - Higher CFG values provide better prompt adherence but may affect naturalness
                - Increase inference timesteps for better quality at the cost of speed
                - The retry mechanism helps handle edge cases automatically
                """
            )

    # The order of `inputs` must match the positional parameters of
    # generate_speech.
    generate_btn.click(
        fn=generate_speech,
        inputs=[
            text_input,
            prompt_audio,
            prompt_text,
            cfg_value,
            inference_timesteps,
            normalize,
            denoise,
            retry_badcase,
            retry_badcase_max_times,
            retry_badcase_ratio_threshold,
        ],
        outputs=audio_output,
        show_progress="full",
    )
|
|
|
|
|
# Start the web server only when this file is executed as a script, so that
# importing the module (for example to reuse `demo`) does not launch the app
# as a side effect.
if __name__ == "__main__":
    demo.launch()