# Hugging Face Space status when this file was captured: Runtime error
import subprocess
import sys
import tempfile

import gradio as gr
import soundfile as sf
import spaces  # Import the spaces module
import torch
from datasets import load_dataset
from transformers import SpeechT5ForTextToSpeech, SpeechT5HifiGan, SpeechT5Processor
# Function to install a package
def install(package, *pip_args):
    """Install *package* with pip, using the interpreter that runs this script.

    Args:
        package: Requirement specifier handed to ``pip install`` as a single
            command-line argument (a name, a pinned version, a VCS URL, ...).
        *pip_args: Optional extra command-line arguments for ``pip install``,
            one argv entry each -- further packages or options such as
            ``"--extra-index-url", "<url>"``. pip does not split a single
            argument on whitespace, so options must be passed separately
            rather than embedded in ``package``.

    Raises:
        subprocess.CalledProcessError: If pip exits with a non-zero status.
    """
    # `sys.executable -m pip` targets the environment of the running
    # interpreter instead of whichever `pip` happens to be first on PATH.
    command = [sys.executable, "-m", "pip", "install", package, *pip_args]
    subprocess.check_call(command)
# NOTE(review): both installs below run after torch/transformers were already
# imported at the top of the file, so the running process presumably keeps the
# versions it loaded at import time -- consider pinning these in the Space's
# requirements.txt instead.

# Install Hugging Face Transformers from GitHub
install("git+https://github.com/huggingface/transformers.git")

# Install PyTorch and Torchaudio with CUDA 11.8 support.
# pip needs every package and option as its own argv entry; handing the whole
# command line to install() as one string makes pip reject it as an invalid
# requirement, so the argument list is spelled out explicitly here.
subprocess.check_call(
    [
        sys.executable,
        "-m",
        "pip",
        "install",
        "torch",
        "torchaudio",
        "--extra-index-url",
        "https://download.pytorch.org/whl/cu118",
    ]
)

# GPU diagnostics: the device name is only queried when CUDA is present,
# because torch.cuda.get_device_name() raises on a CPU-only host.
cuda_available = torch.cuda.is_available()
print(cuda_available)  # True if a GPU is available
if cuda_available:
    print(torch.cuda.get_device_name(0))  # Name of the GPU (e.g., NVIDIA A100)

# Use the GPU when available, otherwise fall back to the CPU
device = torch.device("cuda" if cuda_available else "cpu")
print(f"Using device: {device}")

# Load the SpeechT5 processor and TTS model
model_name = "microsoft/speecht5_tts"  # SpeechT5 model for TTS
processor = SpeechT5Processor.from_pretrained(model_name)
model = SpeechT5ForTextToSpeech.from_pretrained(model_name).to(device)  # Send model to the selected device

# Load speaker embeddings from the CMU Arctic xvectors dataset (using validation split)
speaker_embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
# Take the first entry's x-vector and add a leading batch dimension
speaker_embeddings = (
    torch.tensor(speaker_embeddings_dataset[0]["xvector"]).unsqueeze(0).to(device)
)
# SpeechT5's acoustic model only predicts a mel spectrogram; the HiFi-GAN
# vocoder converts that spectrogram into an audible waveform.
vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan").to(device)

# Sampling rate of the waveform produced by the SpeechT5 HiFi-GAN vocoder.
SPEECHT5_SAMPLE_RATE = 16000


# Function to generate and save audio from text input
# Decorate the function to use GPU
@spaces.GPU
def generate_audio(input_text):
    """Synthesize speech for *input_text* and return the path of a WAV file.

    Args:
        input_text: Text to convert to speech.

    Returns:
        Path of a newly created ``.wav`` file containing the generated speech
        as float32 samples at ``SPEECHT5_SAMPLE_RATE`` Hz. A fresh temporary
        file is used for every call so concurrent requests do not overwrite
        each other's output.

    Raises:
        ValueError: If *input_text* is empty or contains only whitespace.
    """
    if not input_text or not input_text.strip():
        raise ValueError("input_text must contain some text to synthesize")

    # Preprocess the input text
    inputs = processor(
        text=input_text, return_tensors="pt", padding=True, truncation=True
    ).to(device)

    # Mixed precision is only enabled on CUDA; on CPU autocast stays disabled.
    use_amp = device.type == "cuda"
    with torch.no_grad(), torch.autocast(device_type=device.type, enabled=use_amp):
        # Passing the vocoder makes generate() return a waveform instead of
        # the raw mel spectrogram.
        audio = model.generate(
            inputs["input_ids"],
            attention_mask=inputs["attention_mask"],
            speaker_embeddings=speaker_embeddings,
            vocoder=vocoder,
        )

    # Cast to float32 before leaving torch so a reduced-precision autocast
    # output is converted to a dtype soundfile can write.
    audio_float32 = audio.to(torch.float32).cpu().numpy()

    # Reserve a unique output path; the handle is closed before soundfile
    # reopens the file by name.
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_file:
        output_file = tmp_file.name
    sf.write(output_file, audio_float32, SPEECHT5_SAMPLE_RATE)
    return output_file
# Define the Gradio interface function
def tts_interface(text):
    """Gradio callback: pass *text* to generate_audio and return its result."""
    return generate_audio(text)
# Create the Gradio interface: one text box in, one audio player out
_text_input = gr.Textbox(label="Enter text to generate speech")
_speech_output = gr.Audio(label="Generated Speech")

gr_interface = gr.Interface(
    fn=tts_interface,
    inputs=_text_input,
    outputs=_speech_output,
    title="SpeechT5 Text-to-Speech Generator",
    description="Enter text, and the fine-tuned SpeechT5 model will generate speech as an audio file.",
)

# Launch the Gradio interface only when this file is run as a script
if __name__ == "__main__":
    gr_interface.launch()