"""SpeechT5 text-to-speech demo served through a Gradio interface.

At import time the script installs its main dependencies with pip, loads the
SpeechT5 TTS model, a HiFi-GAN vocoder and one speaker embedding, runs
``train_model()`` over a text dataset, and builds the Gradio interface.
The interface is launched only when the file is run as a script.
"""
import subprocess
import sys


def install(package):
    """Install packages with pip using the current interpreter.

    Args:
        package: List of arguments appended to ``pip install`` (package
            specifiers and/or pip options). A single string is accepted and
            treated as one argument.

    Raises:
        subprocess.CalledProcessError: If pip exits with a non-zero status.
    """
    if isinstance(package, str):
        package = [package]
    subprocess.check_call([sys.executable, "-m", "pip", "install", *package])


# Install necessary libraries
# NOTE(review): datasets, soundfile and spaces are imported below but are not
# installed here -- presumably the environment provides them; confirm.
install(["git+https://github.com/huggingface/transformers.git"])
install(["torch", "torchaudio", "--extra-index-url", "https://download.pytorch.org/whl/cu118"])
install(["gradio"])

# Import after installation
import torch
from transformers import SpeechT5ForTextToSpeech, SpeechT5HifiGan, SpeechT5Processor
from datasets import load_dataset
import soundfile as sf
import gradio as gr
import spaces  # Provides the @spaces.GPU decorator
from torch.utils.data import DataLoader

# Sampling rate of the waveform produced by the SpeechT5 HiFi-GAN vocoder.
SAMPLE_RATE = 16000

# Check if CUDA (GPU) is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Load the dataset from Hugging Face
print("Loading StackOverflow dataset...")
dataset = load_dataset("c17hawke/stackoverflow-dataset")
print("Dataset loaded. Number of training examples:", len(dataset['train']))

# Load the SpeechT5 processor, TTS model and vocoder
model_name = "microsoft/speecht5_tts"  # SpeechT5 model for TTS
processor = SpeechT5Processor.from_pretrained(model_name)
model = SpeechT5ForTextToSpeech.from_pretrained(model_name).to(device)
# Without a vocoder the model returns a mel spectrogram, not audio samples.
vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan").to(device)

# Load speaker embeddings from the CMU Arctic xvectors dataset
print("Loading speaker embeddings...")
speaker_embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
# unsqueeze(0) adds the leading batch dimension to the single x-vector.
speaker_embeddings = torch.tensor(speaker_embeddings_dataset[0]['xvector']).unsqueeze(0).to(device)


def preprocess_function(examples):
    """Return the ``'text'`` column of a batch unchanged.

    Args:
        examples: Mapping with a ``'text'`` entry, as passed by
            ``Dataset.map(..., batched=True)``.

    Returns:
        A dict with the single key ``'text'`` holding the same value.
    """
    return {'text': examples['text']}


# Apply preprocessing to the train dataset
print("Preprocessing the training dataset...")
train_dataset = dataset['train'].map(preprocess_function, batched=True)
print("Preprocessing complete. Number of training examples:", len(train_dataset))

# Create DataLoader for batching
batch_size = 16
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)


def _autocast():
    """Return an autocast context that is only active on a CUDA device."""
    return torch.autocast(device_type=device.type, enabled=device.type == "cuda")


def train_model(num_epochs=5):
    """Synthesize speech for every batch of the training text.

    Despite its name and log messages, this function performs no weight
    updates: there is no loss, optimizer or backward pass, and the dataset
    provides text only. For each batch it generates audio and writes the
    first utterance to ``output_epoch{epoch}.wav``, overwriting the file on
    every batch.

    Args:
        num_epochs: Number of passes over ``train_dataloader``.
    """
    print("Starting fine-tuning...")

    for epoch in range(num_epochs):
        print(f"Epoch {epoch + 1} of {num_epochs}...")

        for batch in train_dataloader:
            inputs = processor(
                text=batch['text'], return_tensors="pt", padding=True, truncation=True
            ).to(device)

            with torch.no_grad(), _autocast():
                # return_output_lengths=True keeps the output batched and
                # reports each waveform's real length so padding can be cut.
                waveforms, waveform_lengths = model.generate(
                    inputs['input_ids'],
                    attention_mask=inputs['attention_mask'],
                    speaker_embeddings=speaker_embeddings,
                    vocoder=vocoder,
                    return_output_lengths=True,
                )

            # soundfile expects (frames,) or (frames, channels); write one
            # utterance rather than the whole padded batch tensor.
            first_waveform = waveforms[0, :waveform_lengths[0]]
            # Cast to float32 before saving (autocast may yield float16).
            audio_float32 = first_waveform.cpu().numpy().astype('float32')
            sf.write(f'output_epoch{epoch}.wav', audio_float32, SAMPLE_RATE)

    print("Fine-tuning completed!")


# Call the training function
train_model()


@spaces.GPU  # Decorate the function to use GPU
def generate_audio(input_text, output_file="output.wav"):
    """Synthesize speech for ``input_text`` and save it as a WAV file.

    Args:
        input_text: Text to convert to speech.
        output_file: Path of the WAV file to write.

    Returns:
        ``output_file``, so callers (e.g. a Gradio ``Audio`` output) can load
        the generated audio.
    """
    print(f"Generating audio for input text: {input_text}")
    inputs = processor(
        text=input_text, return_tensors="pt", padding=True, truncation=True
    ).to(device)

    with torch.no_grad(), _autocast():
        audio = model.generate(
            inputs['input_ids'],
            attention_mask=inputs['attention_mask'],
            speaker_embeddings=speaker_embeddings,
            vocoder=vocoder,
        )

    # Cast to float32 before saving; squeeze drops a leading batch dimension
    # of size one if the model returned one.
    audio_float32 = audio.squeeze().cpu().numpy().astype('float32')

    # Save the generated audio
    sf.write(output_file, audio_float32, SAMPLE_RATE)
    print(f"Audio generated and saved as '{output_file}'.")
    return output_file


def tts_interface(text):
    """Gradio callback: turn ``text`` into speech and return the audio path.

    Args:
        text: Contents of the input textbox.

    Returns:
        Path of the generated WAV file.

    Raises:
        gr.Error: If ``text`` is empty or only whitespace.
    """
    if not text or not text.strip():
        raise gr.Error("Please enter some text to generate speech.")
    return generate_audio(text)


# Create the Gradio interface
gr_interface = gr.Interface(
    fn=tts_interface,
    inputs=gr.Textbox(label="Enter text to generate speech"),
    outputs=gr.Audio(label="Generated Speech"),
    title="SpeechT5 Text-to-Speech Generator",
    description="Enter text, and the fine-tuned SpeechT5 model will generate speech as an audio file."
)

# Launch the Gradio interface
if __name__ == "__main__":
    gr_interface.launch()