"""Gradio demo that converts text to speech with Microsoft's SpeechT5 model."""

import subprocess
import sys

import torch
from transformers import (
    SpeechT5ForTextToSpeech,
    SpeechT5HifiGan,
    SpeechT5Processor,
)
from datasets import load_dataset
import soundfile as sf
import gradio as gr
import spaces  # Hugging Face Spaces helper that provides the @spaces.GPU decorator

# Sample rate (Hz) of the waveform produced by the SpeechT5 HiFi-GAN vocoder.
SAMPLE_RATE = 16000


def install(package, *extra_args):
    """Install *package* with pip, using the interpreter that runs this script.

    Args:
        package: Requirement specifier or VCS URL handed to ``pip install``.
        *extra_args: Additional command-line arguments (further packages or
            pip options). Each one must be its own string, because pip
            receives them as separate argv entries.

    Raises:
        subprocess.CalledProcessError: If pip exits with a non-zero status.
    """
    subprocess.check_call(
        [sys.executable, "-m", "pip", "install", package, *extra_args]
    )


# NOTE: these installs run after torch/transformers were imported above, so
# the running process keeps using the already-imported versions. Pinning the
# dependencies in requirements.txt is the more reliable option.

# Install Hugging Face Transformers from GitHub
install("git+https://github.com/huggingface/transformers.git")

# Install PyTorch and Torchaudio with CUDA 11.8 support. Every token is passed
# as a separate argument; a single space-separated string would reach pip as
# one (invalid) requirement.
install(
    "torch",
    "torchaudio",
    "--extra-index-url",
    "https://download.pytorch.org/whl/cu118",
)

cuda_available = torch.cuda.is_available()
print(cuda_available)  # True if a GPU is available
if cuda_available:
    # Only query the device name when CUDA exists; the call raises otherwise.
    print(torch.cuda.get_device_name(0))

# Pick the GPU when available, otherwise fall back to the CPU
device = torch.device("cuda" if cuda_available else "cpu")
print(f"Using device: {device}")

# Load the SpeechT5 processor, the TTS model and the HiFi-GAN vocoder
model_name = "microsoft/speecht5_tts"  # SpeechT5 model for TTS
vocoder_name = "microsoft/speecht5_hifigan"  # Turns spectrograms into audio
processor = SpeechT5Processor.from_pretrained(model_name)
model = SpeechT5ForTextToSpeech.from_pretrained(model_name).to(device)
vocoder = SpeechT5HifiGan.from_pretrained(vocoder_name).to(device)

# Load speaker embeddings from the CMU Arctic xvectors dataset (validation split)
speaker_embeddings_dataset = load_dataset(
    "Matthijs/cmu-arctic-xvectors", split="validation"
)
speaker_embeddings = (
    torch.tensor(speaker_embeddings_dataset[0]["xvector"]).unsqueeze(0).to(device)
)


@spaces.GPU  # Run this function on the GPU provided by the Space
def generate_audio(input_text):
    """Synthesize speech for *input_text* and save it as a WAV file.

    Args:
        input_text: The text to convert to speech.

    Returns:
        Path of the written audio file. The same file name, ``output.wav``,
        is reused and overwritten on every call.
    """
    # Tokenize the text and move the tensors to the model's device
    inputs = processor(
        text=input_text, return_tensors="pt", padding=True, truncation=True
    ).to(device)

    # Inference only: no gradients; mixed precision only when running on CUDA
    use_amp = device.type == "cuda"
    with torch.no_grad(), torch.autocast(device_type=device.type, enabled=use_amp):
        # Without a vocoder, generate() returns a mel spectrogram instead of
        # a waveform, so the vocoder must be passed explicitly.
        audio = model.generate(
            inputs["input_ids"],
            attention_mask=inputs["attention_mask"],
            speaker_embeddings=speaker_embeddings,
            vocoder=vocoder,
        )

    # Drop any singleton batch dimension and convert to float32 on the CPU,
    # which is a sample format soundfile can write.
    audio_float32 = audio.squeeze().cpu().numpy().astype("float32")

    # Save the audio to a file and return the file path
    output_file = "output.wav"
    sf.write(output_file, audio_float32, SAMPLE_RATE)
    return output_file


def tts_interface(text):
    """Gradio callback: validate *text* and return the generated audio path.

    Args:
        text: Text entered by the user in the interface.

    Returns:
        Path of the audio file produced by ``generate_audio``.

    Raises:
        gr.Error: If *text* is empty or contains only whitespace.
    """
    # Reject empty input before requesting GPU time for the synthesis
    if text is None or not text.strip():
        raise gr.Error("Please enter some text to generate speech.")
    return generate_audio(text)


# Create the Gradio interface
gr_interface = gr.Interface(
    fn=tts_interface,
    inputs=gr.Textbox(label="Enter text to generate speech"),
    outputs=gr.Audio(label="Generated Speech"),
    title="SpeechT5 Text-to-Speech Generator",
    description="Enter text, and the fine-tuned SpeechT5 model will generate speech as an audio file.",
)

# Launch the Gradio interface
if __name__ == "__main__":
    gr_interface.launch()