Spaces:

parler-tts
/

parler-tts-expresso

Running on Zero

App Files Files Community

parler-tts-expresso / app.py

sanchit-gandhi HF staff

create demo

c36d1d0 6 months ago

raw

history blame

7.59 kB

	import spaces
	import gradio as gr
	import torch
	from transformers.models.speecht5.number_normalizer import EnglishNumberNormalizer
	from string import punctuation
	import re

	from parler_tts import ParlerTTSForConditionalGeneration
	from transformers import AutoTokenizer, AutoFeatureExtractor, set_seed

	# Fixed seed so that repeated generations for the same inputs are reproducible.
	SEED = 42

	# Use the first CUDA device when torch reports one, otherwise stay on CPU.
	if torch.cuda.is_available():
	    device = "cuda:0"
	else:
	    device = "cpu"

	# TODO(SG): update to the latest checkpoint
	repo_id = "reach-vb/parler-tts-expresso-mistral-v0.1"

	# The model, tokenizer and feature extractor are all loaded from the same repo.
	model = ParlerTTSForConditionalGeneration.from_pretrained(repo_id)
	model = model.to(device)
	tokenizer = AutoTokenizer.from_pretrained(repo_id)
	feature_extractor = AutoFeatureExtractor.from_pretrained(repo_id)

	# Sampling rate reported by the feature extractor; returned alongside the audio.
	SAMPLE_RATE = feature_extractor.sampling_rate

	# Values pre-filled in the two input text boxes when the demo loads.
	default_text = "Remember - this is only the first iteration of the model! To improve the prosody and naturalness of the speech further, we're scaling up the amount of training data by a factor of five times."
	default_description = "Thomas speaks with emphasis at a moderate pace with high quality."

	# Each example row is [input text, voice description]; the first row reuses
	# the default input text with a different description.
	examples = [
	    [example_text, example_description]
	    for example_text, example_description in (
	        (
	            default_text,
	            "Thomas speaks sadly at a very slow pace with high quality.",
	        ),
	        (
	            "Shhh! Did you know? You can reproduce this entire training recipe by following the steps outlined on the model card. It only takes one hour to train!",
	            "Talia whispers quickly with high quality audio.",
	        ),
	        (
	            "But that's no secret! The entire project is open-source first. We are releasing all datasets, training and inference code, so that you can use them yourself!",
	            "Elisabeth speaks happily at a slightly slower than average pace with high quality audio.",
	        ),
	        (
	            "Hey there. I'm Jerry. Or at least, I think I am? I just need to check that quickly.",
	            "Jerry speaks in a confused tone at a moderate pace with high quality audio.",
	        ),
	    )
	]

	# Created once at import time and reused by preprocess() for every request.
	# NOTE(review): presumably rewrites digits as spelled-out English words —
	# confirm against the transformers SpeechT5 number_normalizer module.
	number_normalizer = EnglishNumberNormalizer()


	def preprocess(text: str) -> str:
	    """Clean up raw user text before it is tokenised as the TTS prompt.

	    Steps, in order:

	    1. Run the module-level ``number_normalizer`` over the text and strip
	       surrounding whitespace.
	    2. Replace every hyphen with a space.
	    3. Append a full stop when the text does not already end in an ASCII
	       punctuation character.
	    4. Spell out upper-case abbreviations letter by letter, dropping any
	       dots inside them (``"U.S.A"`` -> ``"U S A"``).

	    Args:
	        text: The text typed by the user.

	    Returns:
	        The normalised text. Empty or whitespace-only input yields ``""``
	        instead of raising ``IndexError``.
	    """
	    text = number_normalizer(text).strip()
	    text = text.replace("-", " ")
	    # Guard the index access: after stripping, the text may be empty.
	    if text and text[-1] not in punctuation:
	        text = f"{text}."

	    abbreviations_pattern = r'\b[A-Z][A-Z\.]+\b'

	    def separate_abb(match):
	        # Drop the dots, then put a space between the remaining letters.
	        return " ".join(match.group(0).replace(".", ""))

	    # Substitute only at the matched positions. A global str.replace() per
	    # abbreviation would also rewrite the same letters where they occur
	    # inside longer tokens (e.g. "TV" matched once would corrupt "TVs").
	    return re.sub(abbreviations_pattern, separate_abb, text)


	@spaces.GPU
	def gen_tts(text, description):
	    """Generate speech for ``text`` in the voice/style given by ``description``.

	    Both strings are tokenised with the shared tokenizer; ``text`` is first
	    passed through ``preprocess``. The global seed is reset to ``SEED``
	    before sampling from the model.

	    Returns:
	        A ``(SAMPLE_RATE, audio)`` tuple, where ``audio`` is the model output
	        moved to CPU, converted to a NumPy array and squeezed.
	    """
	    description_ids = tokenizer(description, return_tensors="pt").to(device).input_ids
	    prompt_ids = tokenizer(preprocess(text), return_tensors="pt").to(device).input_ids

	    set_seed(SEED)
	    waveform = model.generate(
	        input_ids=description_ids,
	        prompt_input_ids=prompt_ids,
	        do_sample=True,
	        temperature=1.0,
	    )

	    return SAMPLE_RATE, waveform.cpu().numpy().squeeze()


	# Custom stylesheet handed to gr.Blocks(css=...) when the UI is built.
	# It only targets the #share-btn-container / #share-btn elements.
	# NOTE(review): no component in the visible layout sets either of those
	# element ids — possibly left over from another demo; confirm before pruning.
	css = """
	#share-btn-container {
	display: flex;
	padding-left: 0.5rem !important;
	padding-right: 0.5rem !important;
	background-color: #000000;
	justify-content: center;
	align-items: center;
	border-radius: 9999px !important;
	width: 13rem;
	margin-top: 10px;
	margin-left: auto;
	flex: unset !important;
	}
	#share-btn {
	all: initial;
	color: #ffffff;
	font-weight: 600;
	cursor: pointer;
	font-family: 'IBM Plex Sans', sans-serif;
	margin-left: 0.5rem !important;
	padding-top: 0.25rem !important;
	padding-bottom: 0.25rem !important;
	right:0;
	}
	#share-btn * {
	all: unset !important;
	}
	#share-btn-container div:nth-child(-n+2){
	width: auto !important;
	min-height: 0px !important;
	}
	#share-btn-container .wrap {
	display: none !important;
	}
	"""
	# Static HTML fragments, kept apart from the layout code so the component
	# tree below stays easy to read.

	# Centered page title.
	TITLE_HTML = """
	<div style="text-align: center; max-width: 700px; margin: 0 auto;">
	<div
	style="
	display: inline-flex; align-items: center; gap: 0.8rem; font-size: 1.75rem;
	"
	>
	<h1 style="font-weight: 900; margin-bottom: 7px; line-height: normal;">
	Parler-TTS: Expresso v0.1 ☕️️
	</h1>
	</div>
	</div>
	"""

	# Introduction plus prompting tips shown above the inputs.
	INTRO_HTML = """
	<p><a href="https://github.com/huggingface/parler-tts"> Parler-TTS</a> is a training and inference library for
	high-fidelity text-to-speech (TTS) models. The model demonstrated here, <a href="https://huggingface.co/parler-tts/parler_tts_mini_expresso_v0.1"> Parler-TTS Mini: Expresso v0.1</a>,
	is fine-tuned on the <a href="https://huggingface.co/datasets/ylacombe/expresso"> Expresso dataset</a>.
	It generates high-quality speech in a given <b>emotion</b> and <b>voice</b> that can be controlled through a simple text prompt.</p>

	<p>Tips for ensuring good generation:
	<ul>
	<li>Specify the name of a male speaker (Jerry, Thomas) or female speaker (Talia, Elisabeth) for consistent voices</li>
	<li>The model can generate in a range of emotions, including: "happy", "confused", "default" (meaning no particular emotion conveyed), "laughing", "sad", "whisper", "emphasis"</li>
	<li>Include the term "high quality audio" to generate the highest quality audio, and "very noisy audio" for high levels of background noise</li>
	<li>Punctuation can be used to control the prosody of the generations, e.g. use commas to add small breaks in speech</li>
	<li>Wrap words in asterisk to emphasise them (e.g. `Remember` in the example below)</li>
	</ul>
	</p>
	"""

	# Roadmap and licence note shown below the examples.
	FOOTER_HTML = """
	<p>To improve the prosody and naturalness of the speech further, we're scaling up the amount of training data to 50k hours of speech.
	The v1 release of the model will be trained on this data, as well as inference optimisations, such as flash attention
	and torch compile, that will improve the latency by 2-4x. If you want to find out more about how this model was trained and even fine-tune it yourself, check-out the
	<a href="https://github.com/huggingface/parler-tts"> Parler-TTS</a> repository on GitHub. The Parler-TTS codebase and its associated checkpoints are licensed under <a href='https://github.com/huggingface/parler-tts?tab=Apache-2.0-1-ov-file#readme'> Apache 2.0</a>.</p>
	"""

	with gr.Blocks(css=css) as block:
	    gr.HTML(TITLE_HTML)
	    gr.HTML(INTRO_HTML)

	    # Two columns: text inputs and the trigger button, then the audio player.
	    with gr.Row():
	        with gr.Column():
	            input_text = gr.Textbox(label="Input Text", lines=2, value=default_text, elem_id="input_text")
	            description = gr.Textbox(label="Description", lines=2, value=default_description, elem_id="input_description")
	            run_button = gr.Button("Generate Audio", variant="primary")
	        with gr.Column():
	            audio_out = gr.Audio(label="Parler-TTS generation", type="numpy", elem_id="audio_out")

	    # The example gallery and the button both route through gen_tts with the
	    # same input and output components.
	    inputs = [input_text, description]
	    outputs = [audio_out]
	    gr.Examples(examples=examples, fn=gen_tts, inputs=inputs, outputs=outputs, cache_examples=True)
	    run_button.click(fn=gen_tts, inputs=inputs, outputs=outputs, queue=True)

	    gr.HTML(FOOTER_HTML)

	# Enable the request queue, then start the server with share=True.
	block.queue().launch(share=True)