import subprocess
import sys

# Function to install a list of pip arguments (package names and flags)
def install(package):
    subprocess.check_call([sys.executable, "-m", "pip", "install"] + package)

# Install necessary libraries
install(["git+https://github.com/huggingface/transformers.git"])
install(["torch", "torchaudio", "--extra-index-url", "https://download.pytorch.org/whl/cu118"])
install(["gradio"])
install(["datasets", "soundfile", "sentencepiece"])  # Required by the imports and the SpeechT5 tokenizer below

# Import after installation
import torch
from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
from datasets import load_dataset
import soundfile as sf
import gradio as gr
import spaces  # Hugging Face Spaces helper (provides the @spaces.GPU decorator)
from torch.utils.data import DataLoader

# Check if CUDA (GPU) is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Load the dataset from Hugging Face
print("Loading StackOverflow dataset...")
dataset = load_dataset("c17hawke/stackoverflow-dataset")
print("Dataset loaded. Number of training examples:", len(dataset['train']))

# Load the SpeechT5 processor, the TTS model, and the HiFi-GAN vocoder
model_name = "microsoft/speecht5_tts"  # SpeechT5 model for TTS
processor = SpeechT5Processor.from_pretrained(model_name)
model = SpeechT5ForTextToSpeech.from_pretrained(model_name).to(device)  # Send model to GPU
vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan").to(device)  # Turns spectrograms into waveforms

# Load speaker embeddings from the CMU Arctic xvectors dataset
print("Loading speaker embeddings...")
speaker_embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
speaker_embeddings = torch.tensor(speaker_embeddings_dataset[0]['xvector']).unsqueeze(0).to(device)  # Send to GPU
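# Any row of the xvectors dataset selects a different voice; index 7306, for
# example, is the speaker used in the Hugging Face SpeechT5 documentation example.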

# Preprocess the dataset (use 'text' column)
def preprocess_function(examples):
    inputs = examples['text']  # Use the 'text' column for sentences
    return {'text': inputs}  # Return as 'text' for processing

# Apply preprocessing to the train dataset
print("Preprocessing the training dataset...")
train_dataset = dataset['train'].map(preprocess_function, batched=True)
print("Preprocessing complete. Number of training examples:", len(train_dataset))

# Create DataLoader for batching; collate only the 'text' field, which is all the loop below uses
batch_size = 16
train_dataloader = DataLoader(
    train_dataset,
    batch_size=batch_size,
    shuffle=True,
    collate_fn=lambda batch: {'text': [example['text'] for example in batch]},
)

# Run the TTS model over the training texts.
# NOTE: despite the original "fine-tuning" label, no weights are updated here;
# this loop only synthesizes audio from the training sentences (see the sketch
# below for what an actual training step would involve).
def train_model():
    print("Starting synthesis pass over the training texts...")
    model.eval()
    for epoch in range(5):  # Adjust the number of passes as needed
        print(f"Epoch {epoch + 1} of 5...")
        for batch_idx, batch in enumerate(train_dataloader):  # Iterate over batches
            # Synthesize the first sentence of each batch as a sample
            text = batch['text'][0]
            inputs = processor(text=text, return_tensors="pt").to(device)
            with torch.no_grad():
                # generate_speech returns a waveform when a vocoder is supplied
                audio = model.generate_speech(inputs['input_ids'],
                                              speaker_embeddings=speaker_embeddings,
                                              vocoder=vocoder)
            # Cast to float32 before saving
            audio_float32 = audio.cpu().numpy().astype('float32')
            # Save one sample per batch; SpeechT5/HiFi-GAN output is 16 kHz
            sf.write(f'output_epoch{epoch}_batch{batch_idx}.wav', audio_float32, 16000)
    print("Synthesis pass completed!")

# Call the function (note: this iterates over the entire training set)
train_model()
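
# --- Illustrative only: a minimal sketch of a real fine-tuning step. ---
# The loop above never updates any weights. Actual fine-tuning would need
# paired (text, target mel-spectrogram) data, which the StackOverflow dataset
# does not provide, plus an optimizer. The function below is hypothetical and
# never called; it assumes `target_spectrogram` has shape (frames, num_mel_bins),
# matching the `labels` argument of SpeechT5ForTextToSpeech.
def finetune_step_sketch(text, target_spectrogram, optimizer):
    """One hypothetical gradient step on a single (text, spectrogram) pair."""
    model.train()
    inputs = processor(text=text, return_tensors="pt").to(device)
    # A loss is returned when `labels` (target mel spectrograms) are provided
    outputs = model(input_ids=inputs["input_ids"],
                    speaker_embeddings=speaker_embeddings,
                    labels=target_spectrogram.unsqueeze(0).to(device))
    optimizer.zero_grad()
    outputs.loss.backward()
    optimizer.step()
    model.eval()
    return outputs.loss.item()
# Example optimizer (not created here): torch.optim.AdamW(model.parameters(), lr=1e-5)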

# Function to generate and save audio from text
@spaces.GPU  # Decorate the function to use GPU on ZeroGPU Spaces
def generate_audio(input_text, output_file="output.wav"):
    print(f"Generating audio for input text: {input_text}")
    inputs = processor(text=input_text, return_tensors="pt", padding=True, truncation=True).to(device)
    with torch.no_grad():
        # generate_speech returns a 1-D waveform tensor when a vocoder is passed
        audio = model.generate_speech(inputs['input_ids'],
                                      speaker_embeddings=speaker_embeddings,
                                      vocoder=vocoder)
    # Cast to float32 before saving
    audio_float32 = audio.cpu().numpy().astype('float32')
    # Save the generated audio (SpeechT5/HiFi-GAN output is 16 kHz)
    sf.write(output_file, audio_float32, 16000)
    print(f"Audio generated and saved as '{output_file}'.")
    return output_file  # Return the file path so Gradio can play it

# Define the Gradio interface function
def tts_interface(text):
    # Generate audio from text
    audio_output = generate_audio(text)
    return audio_output

# Create the Gradio interface
gr_interface = gr.Interface(
    fn=tts_interface,
    inputs=gr.Textbox(label="Enter text to generate speech"),
    outputs=gr.Audio(label="Generated Speech"),
    title="SpeechT5 Text-to-Speech Generator",
    description="Enter text, and the SpeechT5 model will generate speech as an audio file."
)

# Launch the Gradio interface
if __name__ == "__main__":
    gr_interface.launch()
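
# --- Optional: querying the running Space programmatically (sketch). ---
# Assuming this app is deployed as a public Space, something like the following
# (using the `gradio_client` package) could be run from another machine;
# "<user>/<space-name>" is a placeholder, not a real Space.
#
#   from gradio_client import Client
#   client = Client("<user>/<space-name>")
#   wav_path = client.predict("Hello from SpeechT5!", api_name="/predict")
#   print(wav_path)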