import subprocess
import sys

# Function to install a list of pip arguments (package names and flags)
def install(package):
    subprocess.check_call([sys.executable, "-m", "pip", "install"] + package)

# Install necessary libraries
install(["git+https://github.com/huggingface/transformers.git"])
install(["torch", "torchaudio", "--extra-index-url", "https://download.pytorch.org/whl/cu118"])
install(["gradio"])
install(["datasets", "soundfile", "sentencepiece"])  # Required by the imports and the SpeechT5 tokenizer below

# Import after installation
import torch
from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
from datasets import load_dataset
import soundfile as sf
import gradio as gr
import spaces  # Hugging Face Spaces helper (provides the @spaces.GPU decorator)
from torch.utils.data import DataLoader

# Check if CUDA (GPU) is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Load the dataset from Hugging Face
print("Loading StackOverflow dataset...")
dataset = load_dataset("c17hawke/stackoverflow-dataset")
print("Dataset loaded. Number of training examples:", len(dataset['train']))

# Load the SpeechT5 processor, the TTS model, and the HiFi-GAN vocoder
model_name = "microsoft/speecht5_tts"  # SpeechT5 model for TTS
processor = SpeechT5Processor.from_pretrained(model_name)
model = SpeechT5ForTextToSpeech.from_pretrained(model_name).to(device)  # Send model to GPU
vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan").to(device)  # Turns spectrograms into waveforms

# Load speaker embeddings from the CMU Arctic xvectors dataset
print("Loading speaker embeddings...")
speaker_embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
speaker_embeddings = torch.tensor(speaker_embeddings_dataset[0]['xvector']).unsqueeze(0).to(device)  # Send to GPU
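# Any row of the xvectors dataset selects a different voice; index 7306, for
# example, is the speaker used in the Hugging Face SpeechT5 documentation example.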

# Preprocess the dataset (use 'text' column)
def preprocess_function(examples):
    inputs = examples['text']  # Use the 'text' column for sentences
    return {'text': inputs}  # Return as 'text' for processing

# Apply preprocessing to the train dataset
print("Preprocessing the training dataset...")
train_dataset = dataset['train'].map(preprocess_function, batched=True)
print("Preprocessing complete. Number of training examples:", len(train_dataset))

# Create DataLoader for batching; collate only the 'text' field, which is all the loop below uses
batch_size = 16
train_dataloader = DataLoader(
    train_dataset,
    batch_size=batch_size,
    shuffle=True,
    collate_fn=lambda batch: {'text': [example['text'] for example in batch]},
)

# Run the TTS model over the training texts.
# NOTE: despite the original "fine-tuning" label, no weights are updated here;
# this loop only synthesizes audio from the training sentences (see the sketch
# below for what an actual training step would involve).
def train_model():
    print("Starting synthesis pass over the training texts...")
    model.eval()
    for epoch in range(5):  # Adjust the number of passes as needed
        print(f"Epoch {epoch + 1} of 5...")
        for batch_idx, batch in enumerate(train_dataloader):  # Iterate over batches
            # Synthesize the first sentence of each batch as a sample
            text = batch['text'][0]
            inputs = processor(text=text, return_tensors="pt").to(device)
            with torch.no_grad():
                # generate_speech returns a waveform when a vocoder is supplied
                audio = model.generate_speech(inputs['input_ids'],
                                              speaker_embeddings=speaker_embeddings,
                                              vocoder=vocoder)
            # Cast to float32 before saving
            audio_float32 = audio.cpu().numpy().astype('float32')
            # Save one sample per batch; SpeechT5/HiFi-GAN output is 16 kHz
            sf.write(f'output_epoch{epoch}_batch{batch_idx}.wav', audio_float32, 16000)
    print("Synthesis pass completed!")

# Call the function (note: this iterates over the entire training set)
train_model()
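
# --- Illustrative only: a minimal sketch of a real fine-tuning step. ---
# The loop above never updates any weights. Actual fine-tuning would need
# paired (text, target mel-spectrogram) data, which the StackOverflow dataset
# does not provide, plus an optimizer. The function below is hypothetical and
# never called; it assumes `target_spectrogram` has shape (frames, num_mel_bins),
# matching the `labels` argument of SpeechT5ForTextToSpeech.
def finetune_step_sketch(text, target_spectrogram, optimizer):
    """One hypothetical gradient step on a single (text, spectrogram) pair."""
    model.train()
    inputs = processor(text=text, return_tensors="pt").to(device)
    # A loss is returned when `labels` (target mel spectrograms) are provided
    outputs = model(input_ids=inputs["input_ids"],
                    speaker_embeddings=speaker_embeddings,
                    labels=target_spectrogram.unsqueeze(0).to(device))
    optimizer.zero_grad()
    outputs.loss.backward()
    optimizer.step()
    model.eval()
    return outputs.loss.item()
# Example optimizer (not created here): torch.optim.AdamW(model.parameters(), lr=1e-5)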

# Function to generate and save audio from text
@spaces.GPU  # Decorate the function to use GPU on ZeroGPU Spaces
def generate_audio(input_text, output_file="output.wav"):
    print(f"Generating audio for input text: {input_text}")
    inputs = processor(text=input_text, return_tensors="pt", padding=True, truncation=True).to(device)
    with torch.no_grad():
        # generate_speech returns a 1-D waveform tensor when a vocoder is passed
        audio = model.generate_speech(inputs['input_ids'],
                                      speaker_embeddings=speaker_embeddings,
                                      vocoder=vocoder)
    # Cast to float32 before saving
    audio_float32 = audio.cpu().numpy().astype('float32')
    # Save the generated audio (SpeechT5/HiFi-GAN output is 16 kHz)
    sf.write(output_file, audio_float32, 16000)
    print(f"Audio generated and saved as '{output_file}'.")
    return output_file  # Return the file path so Gradio can play it

# Define the Gradio interface function
def tts_interface(text):
    # Generate audio from text
    audio_output = generate_audio(text)
    return audio_output

# Create the Gradio interface
gr_interface = gr.Interface(
    fn=tts_interface,
    inputs=gr.Textbox(label="Enter text to generate speech"),
    outputs=gr.Audio(label="Generated Speech"),
    title="SpeechT5 Text-to-Speech Generator",
    description="Enter text, and the SpeechT5 model will generate speech as an audio file."
)

# Launch the Gradio interface
if __name__ == "__main__":
    gr_interface.launch()
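
# --- Optional: querying the running Space programmatically (sketch). ---
# Assuming this app is deployed as a public Space, something like the following
# (using the `gradio_client` package) could be run from another machine;
# "<user>/<space-name>" is a placeholder, not a real Space.
#
#   from gradio_client import Client
#   client = Client("<user>/<space-name>")
#   wav_path = client.predict("Hello from SpeechT5!", api_name="/predict")
#   print(wav_path)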