Spaces:

AkhilTolani
/

vocals

Runtime error

App Files Files Community

vocals / app.py

AkhilTolani

Update app.py

eb76c8e verified 3 months ago

raw

history blame contribute delete

4.65 kB

	import gradio as gr
	from parler_tts import ParlerTTSForConditionalGeneration, ParlerTTSConfig
	from transformers import AutoTokenizer, set_seed
	import soundfile as sf
	import torch
	import os
	from accelerate import Accelerator
	from accelerate.utils import set_seed

	# Run the Space's setup script. The exit status is ignored here.
	# NOTE(review): this executes after the imports at the top of the file, so
	# anything install.sh installs was not available to them — confirm intended.
	os.system("bash install.sh")

	# Setup accelerator
	accelerator = Accelerator()
	device = accelerator.device

	# `device` is a torch.device, which does not compare equal to a plain string;
	# inspect its `.type` so CPU-only hosts really fall back to full precision.
	_on_cpu = device.type == "cpu"
	mixed_precision = "no" if _on_cpu else "bf16"
	torch_dtype = torch.float32 if _on_cpu else torch.bfloat16

	# Load model and tokenizer
	model_path = "AkhilTolani/parler-tts-finetune-vocals-only-large-18720-steps"
	config = ParlerTTSConfig.from_pretrained(model_path)
	model = ParlerTTSForConditionalGeneration.from_pretrained(
	    model_path,
	    config=config,
	    torch_dtype=torch_dtype,
	    attn_implementation="sdpa",
	)
	# Let accelerate place the model on `device`.
	model = accelerator.prepare(model)

	# The same tokenizer is used for both the description and the prompt text.
	tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-xxl")

	def generate_audio(prompt, description, seed, temperature, max_length, do_sample):
	    """Generate audio for `prompt` conditioned on `description` and save it as WAV.

	    Args:
	        prompt: Text tokenized into `prompt_input_ids` for the model.
	        description: Text tokenized into `input_ids` for the model.
	        seed: Random seed; anything `int()` accepts (the UI passes a string).
	        temperature: Forwarded to `generate` as `temperature`.
	        max_length: Forwarded to `generate` as `max_length` (coerced to int).
	        do_sample: Forwarded to `generate` as `do_sample`.

	    Returns:
	        The path of the written file, "parler_tts_out.wav".

	    Raises:
	        ValueError: If `seed` cannot be converted to an integer.
	    """
	    set_seed(int(seed))

	    # Prepare batch
	    batch = {
	        "input_ids": tokenizer(description, return_tensors="pt").input_ids.to(device),
	        "prompt_input_ids": tokenizer(prompt, return_tensors="pt").input_ids.to(device),
	    }

	    gen_kwargs = {
	        "do_sample": do_sample,
	        "temperature": temperature,
	        # Slider values may arrive as floats depending on the UI version
	        # (TODO confirm); a length must be an integer.
	        "max_length": int(max_length),
	        "min_new_tokens": model.decoder.config.num_codebooks + 1,
	    }

	    def generate_step(batch, accelerator):
	        """Run one generation pass and pad the result across processes."""
	        batch.pop("decoder_attention_mask", None)
	        eval_model = accelerator.unwrap_model(model, keep_fp32_wrapper=True)

	        # Handle torch.compile if it was used in training
	        if hasattr(eval_model, "_orig_mod"):
	            eval_model = eval_model._orig_mod

	        # The batch entries and generation options must be unpacked as keyword
	        # arguments; passing the dicts positionally hands `generate` a dict
	        # where it expects tensors / a generation config.
	        if mixed_precision != "no":
	            with accelerator.autocast():
	                output_audios = eval_model.generate(**batch, **gen_kwargs)
	        else:
	            output_audios = eval_model.generate(**batch, **gen_kwargs)

	        return accelerator.pad_across_processes(output_audios, dim=1, pad_index=0)

	    with torch.no_grad():
	        generated_audios = generate_step(batch, accelerator)

	    # Gather predictions (already padded inside generate_step), then move to
	    # CPU as float32 so the waveform can be converted to NumPy.
	    generated_audios = accelerator.gather_for_metrics(generated_audios)
	    generated_audios = generated_audios.cpu().float()

	    # Post-process the generated audio: take the first sample if multiple were generated
	    audio_arr = generated_audios[0].numpy().squeeze()
	    sf.write("parler_tts_out.wav", audio_arr, model.config.sampling_rate)

	    return "parler_tts_out.wav"

	# Gradio interface setup: default form values first, then the widgets.
	default_prompt = "thought no beef im hate to get murder right in these streets i told yall niggins is dead fucking green tbs and tsg my shit only you cant beat out if you aint going to aim and squeeze take your mvp out the game just like a referee im talking about my life you just rapping on beats i be clapping on streets theyre using technology to try to find where the bullets coming from they wont find those z nope because im a smooth criminal i got some screwed loose because im a sick of the"
	default_description = "A male vocalist delivers an energetic and passionate freestyle in a medium-fast tempo, showcasing an enthusiastic and emotional performance with emphatic expression, conveying a youthful and groovy vibe throughout the track."
	default_seed = "456"

	# Widgets in the positional order generate_audio takes its arguments:
	# prompt, description, seed, temperature, max_length, do_sample.
	_input_widgets = [
	    gr.Textbox(label="Prompt", value=default_prompt),
	    gr.Textbox(label="Description", value=default_description),
	    gr.Textbox(label="Seed", value=default_seed),
	    gr.Slider(label="Temperature", minimum=0.1, maximum=1.0, step=0.1, value=0.75),
	    gr.Slider(label="Max Length", minimum=256, maximum=5120, step=256, value=2580),
	    gr.Dropdown(label="Do Sample", choices=[True, False], value=True),
	]

	# generate_audio returns a file path, which the Audio output component receives.
	_output_widget = gr.Audio(label="Generated Audio")

	interface = gr.Interface(
	    fn=generate_audio,
	    inputs=_input_widgets,
	    outputs=_output_widget,
	    title="Parler TTS Audio Generation",
	    description="Generate audio using the Parler TTS model. Provide a prompt, description, and seed to generate the corresponding audio.",
	)

	# Start the web app only when this file is executed as a script.
	if __name__ == "__main__":
	    interface.launch()