import gradio as gr
import torch
from transformers import pipeline, WhisperProcessor, WhisperForConditionalGeneration
from datasets import load_dataset
import torchaudio
import numpy as np

# Every pipeline below is placed on the GPU when one is available, else on CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"


# Model 1: text translation. The same pipeline serves both directions
# (Wolof -> English and English -> Wolof); the direction is chosen per call
# through the src_lang / tgt_lang arguments.
model_name = "bilalfaye/nllb-200-distilled-600M-wolof-english"
translator = pipeline("translation", model=model_name, device=device)

# Model 2: Wolof speech -> Wolof text
pipe_wolof = pipeline(
    task="automatic-speech-recognition",
    model="bilalfaye/wav2vec2-large-mms-1b-wolof",
    processor="bilalfaye/wav2vec2-large-mms-1b-wolof",
    device=device
)

# Model 3: English text -> English speech
synthesiser_english = pipeline("text-to-speech", "microsoft/speecht5_tts", device=device)
embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
# Two speaker x-vectors, each with a leading batch dimension added by unsqueeze(0).
# NOTE: despite their names, text_to_speech picks between them by voice type,
# not by language: speaker_embedding_english is used for the "Male" choice and
# speaker_embedding_wolof for any other choice.
speaker_embedding_english = torch.tensor(embeddings_dataset[0]["xvector"]).unsqueeze(0)
speaker_embedding_wolof = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0)

# Model 4: English speech -> English text
pipe_english = pipeline(
    task="automatic-speech-recognition",
    model="openai/whisper-small",
    device=device
)

# Model 5: Wolof text -> Wolof speech
synthesiser_wolof = pipeline("text-to-speech", "bilalfaye/speecht5_tts-wolof", device=device)


# Function 1: Wolof speech -> Wolof text
def transcribe_audio_wolof(audio):
    """Transcribe a Wolof recording to text.

    The recording is loaded with torchaudio, averaged down to one channel when
    it has several, resampled to 16 kHz when it was stored at another rate, and
    then handed to the ``pipe_wolof`` speech-recognition pipeline.

    Args:
        audio: Audio source accepted by ``torchaudio.load`` (the UI passes a
            file path).

    Returns:
        The ``'text'`` entry of the pipeline result.
    """
    target_rate = 16000

    signal, native_rate = torchaudio.load(audio)

    # More than one channel: average over the first axis, keeping that axis
    if signal.shape[0] > 1:
        signal = signal.mean(dim=0, keepdim=True)

    # Bring the signal to the target rate only when it is not there already
    if native_rate != target_rate:
        to_target_rate = torchaudio.transforms.Resample(
            orig_freq=native_rate,
            new_freq=target_rate,
        )
        signal = to_target_rate(signal)

    # Drop the leading axis and hand the pipeline a plain numpy array
    samples = signal.squeeze(0).numpy()

    transcription = pipe_wolof({"array": samples, "sampling_rate": target_rate})
    return transcription['text']


# Function 2: Wolof text -> English text
def translate_wolof_to_english(wolof_text):
    """Translate Wolof text into English using the shared ``translator`` pipeline.

    Args:
        wolof_text: Wolof text to translate.

    Returns:
        The ``'translation_text'`` entry of the first pipeline result.
    """
    outputs = translator(wolof_text, src_lang="wol_Latn", tgt_lang="eng_Latn")
    first_result = outputs[0]
    return first_result['translation_text']

def translate_english_to_wolof(wolof_text):
    """Translate English text into Wolof using the shared ``translator`` pipeline.

    Args:
        wolof_text: The English text to translate. The parameter name is
            misleading (the input is the English source, not Wolof); it is
            kept unchanged so existing callers keep working.

    Returns:
        The ``'translation_text'`` entry of the first pipeline result.
    """
    outputs = translator(wolof_text, src_lang="eng_Latn", tgt_lang="wol_Latn")
    first_result = outputs[0]
    return first_result['translation_text']


# Function 3: text (English or Wolof) -> speech audio
def text_to_speech(text, language, voice_type):
    """Synthesise speech for ``text``.

    Args:
        text: Text to speak.
        language: ``"english"`` selects ``synthesiser_english``; any other
            value selects ``synthesiser_wolof``.
        voice_type: ``"Male"`` selects ``speaker_embedding_english``; any other
            value selects ``speaker_embedding_wolof``. The embedding depends
            only on this argument, not on ``language``.

    Returns:
        A ``(sampling_rate, audio)`` tuple taken from the synthesiser output.
    """
    synthesiser = synthesiser_english if language == "english" else synthesiser_wolof

    if voice_type == "Male":
        embedding = speaker_embedding_english
    else:
        embedding = speaker_embedding_wolof

    speech = synthesiser(text, forward_params={"speaker_embeddings": embedding})
    return speech["sampling_rate"], speech["audio"]

# Function 4: English speech -> English text
def transcribe_audio_english(audio):
    """Transcribe a recording with the ``pipe_english`` speech-recognition pipeline.

    Args:
        audio: Audio input forwarded unchanged to the pipeline (the UI passes a
            file path).

    Returns:
        The ``"text"`` entry of the pipeline result.
    """
    return pipe_english(audio)["text"]

# Function 5: full chain, Wolof speech -> English speech
def process_audio_wolof(audio, voice_type):
    """Transcribe Wolof audio, translate it to English and speak the translation.

    Args:
        audio: Wolof recording, passed to ``transcribe_audio_wolof``.
        voice_type: Voice choice forwarded to ``text_to_speech``.

    Returns:
        A 3-tuple ``(wolof_text, english_text, english_audio)`` where
        ``english_audio`` is whatever ``text_to_speech`` returns for the
        English translation.
    """
    transcript = transcribe_audio_wolof(audio)
    translation = translate_wolof_to_english(transcript)
    spoken = text_to_speech(translation, "english", voice_type)
    return transcript, translation, spoken

# Function 6: full chain, English speech -> Wolof speech
def process_audio_english(audio, voice_type):
    """Transcribe English audio, translate it to Wolof and speak the translation.

    Args:
        audio: English recording, passed to ``transcribe_audio_english``.
        voice_type: Voice choice forwarded to ``text_to_speech``.

    Returns:
        A 3-tuple ``(english_text, wolof_text, wolof_audio)`` where
        ``wolof_audio`` is whatever ``text_to_speech`` returns for the Wolof
        translation.
    """
    transcript = transcribe_audio_english(audio)
    translation = translate_english_to_wolof(transcript)
    spoken = text_to_speech(translation, "wolof", voice_type)
    return transcript, translation, spoken

# Gradio UI: one tab per translation direction. Each tab takes an audio input
# (delivered to the handler as a file path) plus a voice choice, and shows the
# transcription, the translation and the synthesised audio.

# Tab 1: Wolof speech in, English text and speech out
wolof_to_english_tab = gr.Interface(
    fn=process_audio_wolof,
    inputs=[
        gr.Audio(
            sources=["upload", "microphone"],
            type="filepath",
            label="Provide Audio in Wolof",
        ),
        gr.Radio(["Male", "Female"], label="Select Voice Type"),
    ],
    outputs=[
        gr.Textbox(label="Texte Wolof"),
        gr.Textbox(label="Texte traduit en Anglais"),
        gr.Audio(label="Audio en Anglais"),
    ],
    title="Wolof vers Anglais",
    description="You can upload an audio file or record using a microphone to process Wolof audio.",
)

# Tab 2: English speech in, Wolof text and speech out
english_to_wolof_tab = gr.Interface(
    fn=process_audio_english,
    inputs=[
        gr.Audio(
            sources=["upload", "microphone"],
            type="filepath",
            label="Provide Audio in English",
        ),
        gr.Radio(["Male", "Female"], label="Select Voice Type"),
    ],
    outputs=[
        gr.Textbox(label="Texte Anglais"),
        gr.Textbox(label="Texte traduit en Wolof"),
        gr.Audio(label="Audio en Wolof"),
    ],
    title="Anglais vers Wolof",
    description="You can upload an audio file or record using a microphone to process English audio.",
)

iface = gr.TabbedInterface(
    [wolof_to_english_tab, english_to_wolof_tab],
    tab_names=["Wolof vers Anglais", "Anglais vers Wolof"],
)

# Start the app as soon as the module is executed
iface.launch(share=True, debug=True)