import subprocess
import sys

import gradio as gr
import soundfile as sf
import spaces  # Import the spaces module
import torch
from datasets import load_dataset
from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech
from transformers import SpeechT5HifiGan

# Function to install a package
def install(package):
    subprocess.check_call([sys.executable, "-m", "pip", "install", package])

# Install Hugging Face Transformers from GitHub
install("git+https://github.com/huggingface/transformers.git")

# Install PyTorch and Torchaudio with CUDA 11.8 support
install("torch torchaudio --extra-index-url https://download.pytorch.org/whl/cu118")

print(torch.cuda.is_available())  # This should return True if GPU is available
print(torch.cuda.get_device_name(0))  # Should display the name of your GPU (e.g., NVIDIA A100)

# Check if CUDA (GPU) is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Load the SpeechT5 processor, the TTS model and the HiFi-GAN vocoder
model_name = "microsoft/speecht5_tts"  # SpeechT5 model for TTS
vocoder_name = "microsoft/speecht5_hifigan"  # Turns spectrograms into waveforms
processor = SpeechT5Processor.from_pretrained(model_name)
model = SpeechT5ForTextToSpeech.from_pretrained(model_name).to(device)
vocoder = SpeechT5HifiGan.from_pretrained(vocoder_name).to(device)

# SpeechT5 and its HiFi-GAN vocoder work with 16 kHz audio
SAMPLE_RATE = 16000

# Load speaker embeddings from the CMU Arctic xvectors dataset (using validation split)
speaker_embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
speaker_embeddings = torch.tensor(speaker_embeddings_dataset[0]['xvector']).unsqueeze(0).to(device)


# Function to generate and save audio from text input
@spaces.GPU  # Decorate the function to use GPU
def generate_audio(input_text):
    """Synthesize speech for ``input_text`` and save it as a WAV file.

    Args:
        input_text: The text to convert to speech.

    Returns:
        The path of the written WAV file (``"output.wav"``).
    """
    # Tokenize the text and move the tensors to the model's device
    inputs = processor(text=input_text, return_tensors="pt", padding=True, truncation=True).to(device)

    # Mixed precision is only enabled on CUDA; on CPU the context is a no-op
    use_amp = device.type == "cuda"
    with torch.no_grad(), torch.autocast(device_type=device.type, enabled=use_amp):
        # The vocoder is required: without it generate() returns a mel
        # spectrogram rather than an audio waveform.
        audio = model.generate(
            inputs['input_ids'],
            attention_mask=inputs['attention_mask'],
            speaker_embeddings=speaker_embeddings,
            vocoder=vocoder,
        )

    # Convert to CPU and float32 before saving
    audio_float32 = audio.cpu().numpy().astype('float32')

    # NOTE(review): every request writes the same file name, so concurrent
    # requests can overwrite each other's output.
    output_file = "output.wav"
    sf.write(output_file, audio_float32, SAMPLE_RATE)

    return output_file

# Callback wired into the Gradio UI
def tts_interface(text):
    """Return the path of the speech file synthesized from ``text``."""
    return generate_audio(text)

# Build the UI components first: one text box in, one audio player out
text_input = gr.Textbox(label="Enter text to generate speech")
speech_output = gr.Audio(label="Generated Speech")

# Tie the components to the TTS callback
gr_interface = gr.Interface(
    fn=tts_interface,
    inputs=text_input,
    outputs=speech_output,
    title="SpeechT5 Text-to-Speech Generator",
    description="Enter text, and the fine-tuned SpeechT5 model will generate speech as an audio file.",
)

# Start the web app only when this file is run as a script
if __name__ == "__main__":
    gr_interface.launch()