import gradio as gr
import torch
from transformers import pipeline, WhisperProcessor, WhisperForConditionalGeneration
from datasets import load_dataset
import torchaudio
import numpy as np

# Run every model on the GPU when one is available.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Sample rate (Hz) the audio is resampled to before it is handed to the
# Wolof speech-recognition pipeline.
TARGET_SAMPLE_RATE = 16000

# Model 1: text translation, used in both directions (Wolof <-> English).
model_name = "bilalfaye/nllb-200-distilled-600M-wolof-english"
translator = pipeline("translation", model=model_name, device=device)

# Model 2: Wolof audio -> Wolof text.
pipe_wolof = pipeline(
    task="automatic-speech-recognition",
    model="bilalfaye/wav2vec2-large-mms-1b-wolof",
    processor="bilalfaye/wav2vec2-large-mms-1b-wolof",
    device=device,
)

# Model 3: English text -> English audio.
synthesiser_english = pipeline(
    "text-to-speech", "microsoft/speecht5_tts", device=device
)

# Speaker x-vectors used to condition both text-to-speech models.
# NOTE: despite their names, these two embeddings are selected by *voice type*
# in text_to_speech() ("Male" -> speaker_embedding_english, anything else ->
# speaker_embedding_wolof), not by the output language.
embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
speaker_embedding_english = torch.tensor(embeddings_dataset[0]["xvector"]).unsqueeze(0)
speaker_embedding_wolof = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0)

# Model 4: English audio -> English text.
pipe_english = pipeline(
    task="automatic-speech-recognition", model="openai/whisper-small", device=device
)

# Model 5: Wolof text -> Wolof audio.
synthesiser_wolof = pipeline(
    "text-to-speech", "bilalfaye/speecht5_tts-wolof", device=device
)


def _require_audio(audio):
    """Raise a user-visible Gradio error when no audio clip was provided."""
    if not audio:
        raise gr.Error("Please upload or record an audio clip before submitting.")


# Function 1: Wolof audio -> Wolof text
def transcribe_audio_wolof(audio: str) -> str:
    """Transcribe a Wolof audio file to Wolof text.

    The file is loaded with torchaudio, down-mixed to a single channel,
    resampled to ``TARGET_SAMPLE_RATE`` when needed and passed to
    ``pipe_wolof``.

    Args:
        audio: Path of the audio file to transcribe.

    Returns:
        The ``"text"`` field of the pipeline result.

    Raises:
        gr.Error: If ``audio`` is empty or ``None``.
    """
    _require_audio(audio)

    # torchaudio returns a (channels, samples) tensor and the file's sample rate.
    waveform, sample_rate = torchaudio.load(audio)

    # Average the channels of multi-channel audio to obtain a mono signal.
    if waveform.shape[0] > 1:
        mono_audio = waveform.mean(dim=0, keepdim=True)
    else:
        mono_audio = waveform

    if sample_rate != TARGET_SAMPLE_RATE:
        resampler = torchaudio.transforms.Resample(
            orig_freq=sample_rate, new_freq=TARGET_SAMPLE_RATE
        )
        mono_audio = resampler(mono_audio)
        sample_rate = TARGET_SAMPLE_RATE

    # Drop the channel axis and hand the pipeline a NumPy array.
    mono_audio = mono_audio.squeeze(0).numpy()

    result = pipe_wolof({"array": mono_audio, "sampling_rate": sample_rate})
    return result["text"]


# Function 2: text translation
def translate_wolof_to_english(wolof_text: str) -> str:
    """Translate Wolof text to English and return the translated string."""
    translated = translator(wolof_text, src_lang="wol_Latn", tgt_lang="eng_Latn")
    return translated[0]["translation_text"]


def translate_english_to_wolof(wolof_text: str) -> str:
    """Translate English text to Wolof and return the translated string.

    Note: the parameter is named ``wolof_text`` for backward compatibility,
    but it carries the *English* source text.
    """
    translated = translator(wolof_text, src_lang="eng_Latn", tgt_lang="wol_Latn")
    return translated[0]["translation_text"]


# Function 3: text -> English or Wolof audio
def text_to_speech(text, language, voice_type):
    """Synthesise speech for ``text``.

    Args:
        text: Text to speak.
        language: ``"english"`` selects the English synthesiser; any other
            value selects the Wolof synthesiser.
        voice_type: ``"Male"`` selects ``speaker_embedding_english``; any
            other value (including ``None`` when no radio option is chosen)
            selects ``speaker_embedding_wolof``.

    Returns:
        A ``(sampling_rate, audio)`` tuple taken from the pipeline output.
    """
    synthesiser = synthesiser_english if language == "english" else synthesiser_wolof
    embedding = (
        speaker_embedding_english if voice_type == "Male" else speaker_embedding_wolof
    )

    speech = synthesiser(text, forward_params={"speaker_embeddings": embedding})
    return speech["sampling_rate"], speech["audio"]


# Function 4: English audio -> English text
def transcribe_audio_english(audio: str) -> str:
    """Transcribe an English audio file to text with ``pipe_english``.

    Raises:
        gr.Error: If ``audio`` is empty or ``None``.
    """
    _require_audio(audio)
    transcription = pipe_english(audio)
    return transcription["text"]


# Function 5: full Wolof -> English processing chain
def process_audio_wolof(audio, voice_type):
    """Transcribe Wolof audio, translate it to English and speak the result.

    Returns:
        ``(wolof_text, english_text, audio_english)`` where ``audio_english``
        is the ``(sampling_rate, audio)`` tuple from ``text_to_speech``.
    """
    wolof_text = transcribe_audio_wolof(audio)
    english_text = translate_wolof_to_english(wolof_text)
    audio_english = text_to_speech(english_text, "english", voice_type)
    return wolof_text, english_text, audio_english


# Function 6: full English -> Wolof processing chain
def process_audio_english(audio, voice_type):
    """Transcribe English audio, translate it to Wolof and speak the result.

    Returns:
        ``(english_text, wolof_text, audio_wolof)`` where ``audio_wolof`` is
        the ``(sampling_rate, audio)`` tuple from ``text_to_speech``.
    """
    english_text = transcribe_audio_english(audio)
    wolof_text = translate_english_to_wolof(english_text)
    audio_wolof = text_to_speech(wolof_text, "wolof", voice_type)
    return english_text, wolof_text, audio_wolof


# Gradio interface: one tab per translation direction.
iface = gr.TabbedInterface(
    [
        gr.Interface(
            fn=process_audio_wolof,
            inputs=[
                gr.Audio(
                    sources=["upload", "microphone"],
                    type="filepath",
                    label="Provide Audio in Wolof",
                ),
                gr.Radio(["Male", "Female"], label="Select Voice Type"),
            ],
            outputs=[
                gr.Textbox(label="Texte Wolof"),
                gr.Textbox(label="Texte traduit en Anglais"),
                gr.Audio(label="Audio en Anglais"),
            ],
            title="Wolof vers Anglais",
            description="You can upload an audio file or record using a microphone to process Wolof audio.",
        ),
        gr.Interface(
            fn=process_audio_english,
            inputs=[
                gr.Audio(
                    sources=["upload", "microphone"],
                    type="filepath",
                    label="Provide Audio in English",
                ),
                gr.Radio(["Male", "Female"], label="Select Voice Type"),
            ],
            outputs=[
                gr.Textbox(label="Texte Anglais"),
                gr.Textbox(label="Texte traduit en Wolof"),
                gr.Audio(label="Audio en Wolof"),
            ],
            title="Anglais vers Wolof",
            description="You can upload an audio file or record using a microphone to process English audio.",
        ),
    ],
    tab_names=["Wolof vers Anglais", "Anglais vers Wolof"],
)

# Start the app with debug output and a public share link.
iface.launch(debug=True, share=True)