dkounadis
/

artificial-styletts2

audio-generation

Model card Files Files and versions

artificial-styletts2 / demo.py

Dionyssos's picture

Audionar long form

a1338da 12 months ago

history blame contribute delete

2 kB

	import numpy as np
	import soundfile
	import msinference # Prefer live_demo.py instead as this demo.py has no split to sentences to prevent OOM
	from audiocraft.builders import AudioGen # fixed bug for repeated calls

	def tts_entry(text='A quick brown fox jumps over the lazy dog. Sweet dreams are made of this, I traveled the world and the seven seas.',
	              voice='en_US/m-ailabs_low#mary_ann',  # Listen to voices https://huggingface.co/dkounadis/artificial-styletts2/discussions/1
	              soundscape='birds fomig'):  # deliberately misspelled prompt for AudioGen (per the author, behaves as controllable top-p)
	    """Synthesise speech for ``text`` and optionally mix in a generated soundscape.

	    Args:
	        text: Sentence(s) to speak. The whole string is passed to the
	            synthesiser in a single call.
	        voice: Voice identifier. Identifiers containing ``en_US/`` or
	            ``en_UK/`` use a reference wav from ``assets/wavs/style_vector/``;
	            any other identifier containing ``_`` uses one from
	            ``assets/wavs/mimic3_foreign_4x/``; everything else is handed to
	            ``msinference.foreign`` as the ``lang`` argument.
	        soundscape: Text prompt for AudioGen, or ``None`` to return the
	            speech alone.

	    Returns:
	        The peak-normalised speech waveform, mixed 60/40 with the generated
	        background when ``soundscape`` is not ``None``.
	        NOTE(review): presumably a 1-D float NumPy array - confirm against
	        msinference.
	    """
	    is_english = any(prefix in voice for prefix in ('en_US/', 'en_UK/'))

	    if is_english or '_' in voice:
	        # Both style-vector voice families share the same file naming
	        # scheme; only the asset directory differs.
	        if is_english:
	            style_dir = 'assets/wavs/style_vector/'
	        else:
	            style_dir = 'assets/wavs/mimic3_foreign_4x/'

	        # Turn the voice identifier into the wav file stem. The replacement
	        # order matters and mirrors the asset file names.
	        wav_stem = voice
	        for old, new in (('/', '_'),
	                         ('#', '_'),
	                         ('cmu-arctic', 'cmu_arctic'),
	                         ('_low', '')):
	            wav_stem = wav_stem.replace(old, new)

	        style_vector = msinference.compute_style(style_dir + wav_stem + '.wav')
	        x = msinference.inference(text, style_vector)
	    else:
	        x = msinference.foreign(text=text, lang=voice)

	    # Scale (in place) so the peak sits just inside [-1, 1]; the epsilon
	    # guards against division by zero on an all-silent signal.
	    x /= 1.02 * np.abs(x).max() + 1e-7

	    if soundscape is None:
	        return x

	    sound_gen = AudioGen().to('cuda:0').eval()
	    # Sound duration in seconds: sample count over 16000 (assumes 16 kHz
	    # speech) plus a .74 s margin so the background covers the speech.
	    clip_seconds = len(x) / 16000 + .74
	    background = sound_gen.generate(soundscape,
	                                    duration=clip_seconds,
	                                    ).detach().cpu().numpy()
	    # Trim the background to the speech length and blend 60 % speech with
	    # 40 % background.
	    return .6 * x + .4 * background[:len(x)]

	# Run the demo with the default text / voice / soundscape and save the result
	# as a 16 kHz wav file in the current working directory.
	soundfile.write('demo.wav', tts_entry(), 16000)