import spaces import gradio as gr import torch from string import punctuation import re from parler_tts import ParlerTTSForConditionalGeneration from transformers import AutoTokenizer, AutoFeatureExtractor, set_seed device = "cuda:0" if torch.cuda.is_available() else "cpu" custom_repo_id = "AkhilTolani/parler-tts-music-200000" model = ParlerTTSForConditionalGeneration.from_pretrained(custom_repo_id).to(device) tokenizer = AutoTokenizer.from_pretrained("AkhilTolani/parler-tts-music-200000") feature_extractor = AutoFeatureExtractor.from_pretrained("AkhilTolani/parler-tts-music-200000") SAMPLE_RATE = feature_extractor.sampling_rate SEED = 456 default_text = "i cant even find them getting your eyes with i can only think about checking your eyes your eyes your eyes" default_description = "This hip hop track showcases a passionate male vocal with harmonizing background vocals, layered over shimmering cymbals, punchy kick and snare hits, synth pad, and wooden percussions. The song exudes a passionate, emotional, and groovy vibe that is sure to captivate listeners. This track would be perfect for setting the mood in a trendy urban nightclub or a stylish lounge setting." examples = [ [ "i cant even find them getting your eyes with i can only think about checking your eyes your eyes your eyes", "This hip hop track showcases a passionate male vocal with harmonizing background vocals, layered over shimmering cymbals, punchy kick and snare hits, synth pad, and wooden percussions. The song exudes a passionate, emotional, and groovy vibe that is sure to captivate listeners. This track would be perfect for setting the mood in a trendy urban nightclub or a stylish lounge setting." ] ] @spaces.GPU def gen_tts(text, description): inputs = tokenizer(description, return_tensors="pt").to(device) prompt = tokenizer(text, return_tensors="pt").to(device) set_seed(SEED) generation = model.generate(input_ids=inputs.input_ids, prompt_input_ids=prompt.input_ids, min_length=20).to(torch.float32) audio_arr = generation.cpu().numpy().squeeze() return SAMPLE_RATE, audio_arr css = """ #share-btn-container { display: flex; padding-left: 0.5rem !important; padding-right: 0.5rem !important; background-color: #000000; justify-content: center; align-items: center; border-radius: 9999px !important; width: 13rem; margin-top: 10px; margin-left: auto; flex: unset !important; } #share-btn { all: initial; color: #ffffff; font-weight: 600; cursor: pointer; font-family: 'IBM Plex Sans', sans-serif; margin-left: 0.5rem !important; padding-top: 0.25rem !important; padding-bottom: 0.25rem !important; right:0; } #share-btn * { all: unset !important; } #share-btn-container div:nth-child(-n+2){ width: auto !important; min-height: 0px !important; } #share-btn-container .wrap { display: none !important; } """ with gr.Blocks(css=css) as block: gr.HTML( """
Parler-TTS + Vocals is a finetuned model for generating high-quality vocals with features that can be controlled using a simple text prompt (e.g. gender, background noise, speaking rate, pitch and reverberation).
Tips for ensuring good generation: