HiFi-GAN: Generative Adversarial Networks for Efficient and High Fidelity Speech Synthesis
Paper • 2010.05646 • Published
Fast, lightweight German Text-to-Speech model based on FastPitch + HiFi-GAN architecture. Full training and export code available at CaroTTS GitHub repository.
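This model provides high-quality German text-to-speech synthesis using a non-autoregressive architecture optimized for fast inference on CPUs and mobile devices. The model consists of two components: a FastPitch model that generates a spectrogram from the input text, and a HiFi-GAN vocoder that converts that spectrogram into audio.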
Total Parameters: ~60M
👉 Interactive Demo on HuggingFace Spaces (uses PT2 format with Zero GPU)
- Karlsson_fastpitch.nemo - FastPitch model in NeMo format
- Karlsson_hifigan.nemo - HiFi-GAN vocoder in NeMo format
- Karlsson_fastpitch.onnx - FastPitch model in ONNX format
- Karlsson_hifigan.onnx - HiFi-GAN vocoder in ONNX format
- Karlsson_fastpitch_encoder.pt2 - FastPitch-Encoder compiled with PyTorch Inductor (for CUDA/Zero GPU)
- Karlsson_fastpitch_decoder.pt2 - FastPitch-Decoder compiled with PyTorch Inductor (for CUDA/Zero GPU)
- Karlsson_hifigan.pt2 - HiFi-GAN compiled with PyTorch Inductor (for CUDA/Zero GPU)

ONNX provides the best compatibility and performance for CPU deployment:
import numpy as np
import onnxruntime as ort
import soundfile as sf
# Tokenization function
def normalize_unicode_text(text: str) -> str:
    """Return ``text`` in Unicode NFC (canonical composition) form.

    Args:
        text: The string to normalize.

    Returns:
        ``text`` unchanged when it is already NFC-normalized, otherwise its
        NFC-normalized equivalent.
    """
    import unicodedata

    already_nfc = unicodedata.is_normalized("NFC", text)
    return text if already_nfc else unicodedata.normalize("NFC", text)
def any_locale_text_preprocessing(text: str) -> str:
    """Normalize ``text`` to NFC and map the typographic apostrophe to ASCII.

    Args:
        text: Raw input text.

    Returns:
        The NFC-normalized text in which every U+2019 (right single
        quotation mark) has been replaced by the ASCII apostrophe ``'``.
        All other characters are passed through unchanged.
    """
    normalized = normalize_unicode_text(text)
    # U+2019 is written as an escape on purpose: typed literally it is easily
    # flattened to a plain "'" by editors or renderers, which would turn this
    # replacement into a no-op and leave typographic apostrophes unmapped.
    return normalized.replace("\u2019", "'")
def tokenize_german(text: str, punct: bool = True, apostrophe: bool = True,
                    pad_with_space: bool = True) -> list[int]:
    """Tokenize German text into a list of integer token IDs.

    The vocabulary is built in this index order: a single space (ID 0), the
    letters of the German charset, the ASCII apostrophe (only when
    ``apostrophe`` is true), the punctuation marks (only when ``punct`` is
    true), and finally ``<pad>``, ``<blank>`` and ``<oov>``.

    Characters that are not part of the vocabulary (digits, tabs, newlines,
    unknown symbols, ...) are silently dropped. Runs of spaces collapse to a
    single space, and leading/trailing spaces are removed before the optional
    padding is applied.

    Args:
        text: The text to tokenize.
        punct: Whether punctuation marks are part of the vocabulary and kept.
        apostrophe: Whether the ASCII apostrophe is part of the vocabulary.
        pad_with_space: Whether to add one space token before and after the
            encoded sequence.

    Returns:
        The token IDs of the characters that were kept, in order.
    """
    charset = "ABCDEFGHIJKLMNOPQRSTUVWXYZÄÖÜẞabcdefghijklmnopqrstuvwxyzäöüß"
    # Punctuation sorted by code point. Every entry must be unique: a repeated
    # entry would overwrite the earlier index in ``token2id`` and shift the
    # emitted IDs. The two curly quotes are written as escapes so they cannot
    # be mistaken for (or flattened into) the ASCII "'" and '"'.
    punct_list = [
        "!", '"', "(", ")", ",", "-", ".", "/", ":", ";", "?", "[", "]",
        "{", "}", "«", "»", "‒", "–", "—",
        "\u2018",  # left single quotation mark
        "‚",
        "\u201c",  # left double quotation mark
        "„", "‹", "›",
    ]
    space = " "

    vocab = [space, *charset]  # Space at index 0
    if apostrophe:
        vocab.append("'")
    if punct:
        vocab.extend(punct_list)
    vocab.extend(["<pad>", "<blank>", "<oov>"])
    token2id = {token: i for i, token in enumerate(vocab)}

    # Punctuation is only accepted when it is part of the vocabulary.
    allowed_punct = frozenset(punct_list) if punct else frozenset()

    # Encode
    cs = []
    for c in any_locale_text_preprocessing(text):
        if c == space:
            # Skip leading spaces and collapse consecutive ones.
            if cs and cs[-1] != space:
                cs.append(c)
        elif (c.isalnum() or c == "'") and c in token2id:
            cs.append(c)
        elif c in allowed_punct:
            cs.append(c)

    # Remove trailing spaces
    while cs and cs[-1] == space:
        cs.pop()

    if pad_with_space:
        cs = [space] + cs + [space]

    return [token2id[p] for p in cs]
# Create one ONNX Runtime session per exported model
fastpitch_session = ort.InferenceSession("Karlsson_fastpitch.onnx")
hifigan_session = ort.InferenceSession("Karlsson_hifigan.onnx")

# Text to synthesize, converted to token IDs
text = "Hallo, ich bin CaroTTS, ein deutsches Text-zu-Sprache-System."
tokens = tokenize_german(text)

# One pace value (1.0) and one pitch value (0.0) per token
num_tokens = len(tokens)
paces = np.ones(num_tokens, dtype=np.float32)
pitches = np.zeros(num_tokens, dtype=np.float32)

# Every model input gets a leading axis of size 1
inputs = {
    "text": np.asarray(tokens, dtype=np.int64)[np.newaxis, :],
    "pace": paces[np.newaxis, :],
    "pitch": pitches[np.newaxis, :],
}

# Generate spectrogram (first output of the FastPitch session)
fastpitch_outputs = fastpitch_session.run(None, inputs)
spec = fastpitch_outputs[0]

# Generate audio (first output of the HiFi-GAN session)
hifigan_outputs = hifigan_session.run(None, {"spec": spec})
audio = hifigan_outputs[0]

# Save audio (44.1kHz sample rate)
sf.write("output.wav", audio.squeeze(), 44100)
If you have NeMo installed (pip install nemo-toolkit[tts]) and want to work with the original .nemo checkpoints:
import torch
import soundfile as sf
from nemo.collections.tts.models.fastpitch import FastPitchModel
from nemo.collections.tts.models.hifigan import HifiGanModel
# Use CUDA when it is available, otherwise fall back to the CPU
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Restore both checkpoints onto that device and switch them to eval mode
fastpitch = FastPitchModel.restore_from("Karlsson_fastpitch.nemo", map_location=device)
fastpitch = fastpitch.eval()
hifigan = HifiGanModel.restore_from("Karlsson_hifigan.nemo", map_location=device)
hifigan = hifigan.eval()

# Text to synthesize
text = "Guten Tag. Herzlich Willkommen zu dieser Demonstration."

with torch.inference_mode():
    # Text -> parsed tokens -> spectrogram -> audio
    parsed_text = fastpitch.parse(text)
    spec = fastpitch.generate_spectrogram(tokens=parsed_text)
    audio = hifigan.convert_spectrogram_to_audio(spec=spec)

# Save audio (44.1kHz sample rate)
waveform = audio.squeeze().cpu().numpy()
sf.write("output.wav", waveform, 44100)
@misc{carotts2024,
title={CaroTTS: Fast Lightweight German Text-to-Speech},
author={Holtzwart, Tassilo},
year={2024},
url={https://github.com/TassiloHo/CaroTTS}
}
MIT License
For more information, visit the CaroTTS GitHub repository.