"""Gradio demo app for the Parler-TTS Mini: Expresso text-to-speech checkpoint.

Loads the model, tokenizer and feature extractor at import time, defines the
text pre-processing and generation functions, then builds and launches the UI.
"""

import re
from string import punctuation

# NOTE(review): `spaces` is kept ahead of `torch`, as in the original import
# order -- presumably required by the Spaces GPU runtime; confirm before moving.
import spaces
import gradio as gr
import torch
from transformers.models.speecht5.number_normalizer import EnglishNumberNormalizer
from parler_tts import ParlerTTSForConditionalGeneration
from transformers import AutoTokenizer, AutoFeatureExtractor, set_seed


device = "cuda:0" if torch.cuda.is_available() else "cpu"

repo_id = "parler-tts/parler-tts-mini-expresso"

model = ParlerTTSForConditionalGeneration.from_pretrained(repo_id).to(device)
tokenizer = AutoTokenizer.from_pretrained(repo_id)
feature_extractor = AutoFeatureExtractor.from_pretrained(repo_id)

SAMPLE_RATE = feature_extractor.sampling_rate
# Fixed seed, re-applied before every generation so sampling is repeatable.
SEED = 42

default_text = "*Remember* - this is only the first iteration of the model! To improve the prosody and naturalness of the speech further, we're scaling up the amount of training data by a factor of *five times*."
default_description = "Thomas speaks with emphasis and excitement at a moderate pace with high quality."

# Each example is a [input text, voice description] pair, matching the order
# of the `inputs` components wired up in the UI below.
examples = [
    [
        "Remember - this is only the first iteration of the model. To improve the prosody and naturalness of the speech further, we're scaling up the amount of training data by a factor of five times.",
        "Thomas speaks in a sad tone at a moderate pace with high quality.",
    ],
    [
        "Did you know? You can reproduce this entire training recipe by following the steps outlined on the model card!",
        "Talia speaks quickly with excitement and high quality audio.",
    ],
    [
        "But that's no secret! The entire project is open source first, with all release artefacts on the Hub.",
        "Elisabeth speaks happily at a slightly slower than average pace with high quality audio.",
    ],
    [
        "Hey there! I'm Jerry. Or at least I think I am? I just need to check that quickly.",
        "Jerry speaks in a confused tone at a moderately slow pace with high quality audio.",
    ],
]

number_normalizer = EnglishNumberNormalizer()

# An upper-case letter followed by one or more upper-case letters or dots,
# delimited by word boundaries (e.g. "TTS", "U.S.A").
_ABBREVIATION_RE = re.compile(r'\b[A-Z][A-Z\.]+\b')


def _spell_out(match):
    """Return a matched abbreviation as space-separated letters, dots removed."""
    return " ".join(match.group().replace(".", ""))


def preprocess(text: str) -> str:
    """Normalise raw user text before it is tokenized as the TTS prompt.

    Steps, in order: run the number normalizer, replace hyphens with spaces,
    trim surrounding whitespace, append a full stop when the text does not
    already end in punctuation, and spell out upper-case abbreviations letter
    by letter ("TTS" -> "T T S").

    Args:
        text: Raw input text; may be empty or ``None``.

    Returns:
        The normalised text, or ``""`` when nothing is left to speak.
    """
    if not text:
        return ""

    # The normalizer runs first so it sees the text before hyphens are removed.
    text = number_normalizer(text).replace("-", " ").strip()
    if not text:
        # Whitespace-only or hyphen-only input: indexing text[-1] would raise.
        return ""

    if text[-1] not in punctuation:
        text = f"{text}."

    # A single regex pass only rewrites word-bounded matches, so an
    # abbreviation that is a prefix of another ("US" / "USA") cannot corrupt it.
    return _ABBREVIATION_RE.sub(_spell_out, text)


@spaces.GPU
def gen_tts(text: str, description: str):
    """Generate speech for ``text`` in the voice/style given by ``description``.

    Args:
        text: The sentence(s) to synthesise; pre-processed with ``preprocess``.
        description: Free-text description of the speaker and delivery.

    Returns:
        A ``(SAMPLE_RATE, audio_arr)`` tuple, where ``audio_arr`` is the
        generated output moved to CPU, converted to NumPy and squeezed.

    Raises:
        gr.Error: If ``text`` is empty once pre-processed.
    """
    prompt_text = preprocess(text)
    if not prompt_text:
        raise gr.Error("Please enter some text to synthesise.")

    inputs = tokenizer(description, return_tensors="pt").to(device)
    prompt = tokenizer(prompt_text, return_tensors="pt").to(device)

    # Re-seed on every call so sampling (do_sample=True) is repeatable.
    set_seed(SEED)
    generation = model.generate(
        input_ids=inputs.input_ids,
        prompt_input_ids=prompt.input_ids,
        do_sample=True,
        temperature=1.0,
    )
    audio_arr = generation.cpu().numpy().squeeze()

    return SAMPLE_RATE, audio_arr


# Built from adjacent literals so the resulting string is unchanged while each
# declaration sits on its own source line.
css = (
    " #share-btn-container {"
    " display: flex;"
    " padding-left: 0.5rem !important;"
    " padding-right: 0.5rem !important;"
    " background-color: #000000;"
    " justify-content: center;"
    " align-items: center;"
    " border-radius: 9999px !important;"
    " width: 13rem;"
    " margin-top: 10px;"
    " margin-left: auto;"
    " flex: unset !important;"
    " }"
    " #share-btn {"
    " all: initial;"
    " color: #ffffff;"
    " font-weight: 600;"
    " cursor: pointer;"
    " font-family: 'IBM Plex Sans', sans-serif;"
    " margin-left: 0.5rem !important;"
    " padding-top: 0.25rem !important;"
    " padding-bottom: 0.25rem !important;"
    " right:0;"
    " }"
    " #share-btn * {"
    " all: unset !important;"
    " }"
    " #share-btn-container div:nth-child(-n+2){"
    " width: auto !important;"
    " min-height: 0px !important;"
    " }"
    " #share-btn-container .wrap {"
    " display: none !important;"
    " }"
    " "
)

with gr.Blocks(css=css) as block:
    # Page title.
    gr.HTML(
        """

Parler-TTS: Expresso ☕️️

"""
    )
    # Introductory blurb.
    gr.HTML(
        """

Parler-TTS Mini: Expresso is a text-to-speech (TTS) model fine-tuned on the Expresso dataset. It generates high-quality speech in a given emotion and voice that can be controlled through a simple text prompt.

Tips for ensuring good generation:

"""
    )
    with gr.Row():
        with gr.Column():
            input_text = gr.Textbox(label="Input Text", lines=2, value=default_text, elem_id="input_text")
            description = gr.Textbox(label="Description", lines=2, value=default_description, elem_id="input_description")
            run_button = gr.Button("Generate Audio", variant="primary")
        with gr.Column():
            audio_out = gr.Audio(label="Parler-TTS generation", type="numpy", elem_id="audio_out")

    inputs = [input_text, description]
    outputs = [audio_out]
    gr.Examples(examples=examples, fn=gen_tts, inputs=inputs, outputs=outputs, cache_examples=True)
    run_button.click(fn=gen_tts, inputs=inputs, outputs=outputs, queue=True)
    # Footer: roadmap and licence note.
    gr.HTML(
        """

To improve the prosody and naturalness of the speech further, we're scaling up the amount of training data to 50k hours of speech. The v1 release of the model will be trained on this data, as well as inference optimisations, such as flash attention and torch compile, that will improve the latency by 2-4x. If you want to find out more about how this model was trained and even fine-tune it yourself, check-out the Parler-TTS repository on GitHub. The Parler-TTS codebase and its associated checkpoints are licensed under Apache 2.0.

"""
    )

block.queue()
block.launch(share=True)