# Spaces: Sleeping -- apparently page-status text captured together with this file; it is not part of the program.
import gradio as gr | |
import torch | |
from transformers import pipeline, WhisperProcessor, WhisperForConditionalGeneration | |
from datasets import load_dataset | |
import torchaudio | |
import numpy as np | |
# Prefer the GPU when CUDA is available; otherwise everything runs on the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Model 1: text translation between Wolof and English.
# The source/target languages are chosen per call by the translate_* functions.
model_name = "bilalfaye/nllb-200-distilled-600M-wolof-english"
translator = pipeline(task="translation", model=model_name, device=device)

# Model 2: Wolof speech -> Wolof text.
# The same checkpoint id is used for both the model and its processor.
_WOLOF_ASR_CHECKPOINT = "bilalfaye/wav2vec2-large-mms-1b-wolof"
pipe_wolof = pipeline(
    task="automatic-speech-recognition",
    model=_WOLOF_ASR_CHECKPOINT,
    processor=_WOLOF_ASR_CHECKPOINT,
    device=device,
)

# Model 3: English text -> English speech.
# No `device` argument is passed here, so the pipeline's default placement applies.
synthesiser_english = pipeline(task="text-to-speech", model="microsoft/speecht5_tts")

# Speaker x-vectors, later passed as `speaker_embeddings` to the
# text-to-speech pipelines.
embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")


def _speaker_embedding(index):
    """Return the x-vector at `index` as a tensor with an added leading axis."""
    return torch.tensor(embeddings_dataset[index]["xvector"]).unsqueeze(0)


speaker_embedding_english = _speaker_embedding(0)
speaker_embedding_wolof = _speaker_embedding(7306)

# Model 4: English speech -> English text (default device placement).
pipe_english = pipeline(task="automatic-speech-recognition", model="openai/whisper-small")

# Model 5: text -> Wolof speech (default device placement).
synthesiser_wolof = pipeline(task="text-to-speech", model="bilalfaye/speecht5_tts-wolof")
# Function 1: Wolof speech -> Wolof text
def transcribe_audio_wolof(audio):
    """Transcribe a Wolof recording to text.

    The recording is loaded with ``torchaudio.load``, reduced to a single
    channel, resampled to 16 kHz when its rate differs, and handed to the
    module-level ``pipe_wolof`` pipeline as a NumPy array.

    Args:
        audio: Source accepted by ``torchaudio.load`` (the UI passes a file path).

    Returns:
        The ``"text"`` entry of the pipeline result.

    Raises:
        ValueError: If ``audio`` is ``None`` -- presumably what the UI sends
            when nothing was uploaded or recorded.
    """
    # Fail early with a readable message instead of an opaque loader error.
    if audio is None:
        raise ValueError(
            "No audio provided: upload a file or record from the microphone."
        )

    target_rate = 16000  # sampling rate the audio is converted to before inference

    waveform, sample_rate = torchaudio.load(audio)

    # The first axis is treated as the channel axis. With more than one
    # channel, average them into one while keeping that axis.
    if waveform.shape[0] > 1:
        mono_audio = waveform.mean(dim=0, keepdim=True)
    else:
        mono_audio = waveform

    # Resample only when the file is not already at the target rate.
    if sample_rate != target_rate:
        resampler = torchaudio.transforms.Resample(
            orig_freq=sample_rate, new_freq=target_rate
        )
        mono_audio = resampler(mono_audio)
        sample_rate = target_rate

    # Drop the channel axis and convert to NumPy for the pipeline.
    samples = mono_audio.squeeze(0).numpy()

    result = pipe_wolof({"array": samples, "sampling_rate": sample_rate})
    return result["text"]
# Function 2: Wolof text -> English text
def translate_wolof_to_english(wolof_text):
    """Translate Wolof text to English with the module-level ``translator``.

    Args:
        wolof_text: Text to translate (source language ``wol_Latn``).

    Returns:
        The ``"translation_text"`` entry of the first translation result.
    """
    outputs = translator(wolof_text, src_lang="wol_Latn", tgt_lang="eng_Latn")
    first_result = outputs[0]
    return first_result["translation_text"]
def translate_english_to_wolof(wolof_text):
    """Translate English text to Wolof with the module-level ``translator``.

    Args:
        wolof_text: Text to translate. Despite the parameter name, it is
            treated as English input (source language ``eng_Latn``).

    Returns:
        The ``"translation_text"`` entry of the first translation result.
    """
    outputs = translator(wolof_text, src_lang="eng_Latn", tgt_lang="wol_Latn")
    first_result = outputs[0]
    return first_result["translation_text"]
# Function 3: text -> speech, in English or Wolof
def text_to_speech(text, language, voice_type):
    """Synthesise speech for ``text``.

    Args:
        text: Text to speak.
        language: ``"english"`` selects ``synthesiser_english``; any other
            value selects ``synthesiser_wolof``.
        voice_type: ``"Male"`` selects ``speaker_embedding_english``; any
            other value selects ``speaker_embedding_wolof``.

    Returns:
        Tuple ``(sampling_rate, audio)`` read from the pipeline output.
    """
    synthesiser = synthesiser_english if language == "english" else synthesiser_wolof

    if voice_type == "Male":
        embedding = speaker_embedding_english
    else:
        embedding = speaker_embedding_wolof

    # The speaker embedding is forwarded to the model through `forward_params`.
    speech = synthesiser(text, forward_params={"speaker_embeddings": embedding})
    return speech["sampling_rate"], speech["audio"]
# Function 4: English speech -> English text
def transcribe_audio_english(audio):
    """Transcribe an English recording with the module-level ``pipe_english``.

    Args:
        audio: Input forwarded unchanged to the pipeline (the UI passes a
            file path).

    Returns:
        The ``"text"`` entry of the pipeline result.

    Raises:
        ValueError: If ``audio`` is ``None`` -- presumably what the UI sends
            when nothing was uploaded or recorded.
    """
    # Fail early with a readable message instead of an error from inside the pipeline.
    if audio is None:
        raise ValueError(
            "No audio provided: upload a file or record from the microphone."
        )

    transcription = pipe_english(audio)
    return transcription["text"]
# Function 5: full pipeline, Wolof speech -> English speech
def process_audio_wolof(audio, voice_type):
    """Transcribe Wolof audio, translate it to English and synthesise the result.

    Args:
        audio: Recording passed to ``transcribe_audio_wolof``.
        voice_type: Voice selector passed to ``text_to_speech``.

    Returns:
        Tuple ``(wolof_text, english_text, english_audio)`` where the last
        item is whatever ``text_to_speech`` returns.
    """
    transcript = transcribe_audio_wolof(audio)
    translation = translate_wolof_to_english(transcript)
    spoken_translation = text_to_speech(translation, "english", voice_type)
    return transcript, translation, spoken_translation
# Function 6: full pipeline, English speech -> Wolof speech
def process_audio_english(audio, voice_type):
    """Transcribe English audio, translate it to Wolof and synthesise the result.

    Args:
        audio: Recording passed to ``transcribe_audio_english``.
        voice_type: Voice selector passed to ``text_to_speech``.

    Returns:
        Tuple ``(english_text, wolof_text, wolof_audio)`` where the last
        item is whatever ``text_to_speech`` returns.
    """
    transcript = transcribe_audio_english(audio)
    translation = translate_english_to_wolof(transcript)
    spoken_translation = text_to_speech(translation, "wolof", voice_type)
    return transcript, translation, spoken_translation
# Gradio UI: one tab per translation direction.
# Each tab takes an audio file path plus a voice choice and shows the
# transcription, the translation and the synthesised audio.
_wolof_to_english_tab = gr.Interface(
    fn=process_audio_wolof,
    inputs=[
        gr.Audio(
            sources=["upload", "microphone"],
            type="filepath",
            label="Provide Audio in Wolof",
        ),
        gr.Radio(["Male", "Female"], label="Select Voice Type"),
    ],
    outputs=[
        gr.Textbox(label="Texte Wolof"),
        gr.Textbox(label="Texte traduit en Anglais"),
        gr.Audio(label="Audio en Anglais"),
    ],
    title="Wolof vers Anglais",
    description="You can upload an audio file or record using a microphone to process Wolof audio.",
)

_english_to_wolof_tab = gr.Interface(
    fn=process_audio_english,
    inputs=[
        gr.Audio(
            sources=["upload", "microphone"],
            type="filepath",
            label="Provide Audio in English",
        ),
        gr.Radio(["Male", "Female"], label="Select Voice Type"),
    ],
    outputs=[
        gr.Textbox(label="Texte Anglais"),
        gr.Textbox(label="Texte traduit en Wolof"),
        gr.Audio(label="Audio en Wolof"),
    ],
    title="Anglais vers Wolof",
    description="You can upload an audio file or record using a microphone to process English audio.",
)

iface = gr.TabbedInterface(
    [_wolof_to_english_tab, _english_to_wolof_tab],
    tab_names=["Wolof vers Anglais", "Anglais vers Wolof"],
)

# Start the app with debug output and a public share link requested.
iface.launch(debug=True, share=True)