"""Gradio demo: transcribe microphone speech with a fine-tuned wav2vec2 CTC model."""

import gradio as gr
import torch
from transformers import (
    AutoModelForCTC,
    Wav2Vec2Processor,
    Wav2Vec2Tokenizer,  # noqa: F401 -- kept from the original import list
)

MODEL_ID = "BenDaouda/wav2vec2-large-xls-r-300m-wolof-test-coloab"

# Sampling rate (Hz) the audio is converted to before it is fed to the model.
TARGET_SAMPLE_RATE = 16000

# Loaded once at import time so every request reuses the same weights.
model = AutoModelForCTC.from_pretrained(MODEL_ID)
processor = Wav2Vec2Processor.from_pretrained(MODEL_ID)


def _to_model_waveform(sample_rate, samples):
    """Return *samples* as a mono float32 1-D tensor at ``TARGET_SAMPLE_RATE``."""
    waveform = torch.as_tensor(samples)

    if waveform.is_floating_point():
        waveform = waveform.to(torch.float32)
    else:
        # Integer PCM (Gradio normally delivers int16): scale into [-1, 1].
        full_scale = float(torch.iinfo(waveform.dtype).max)
        waveform = waveform.to(torch.float32) / full_scale

    if waveform.dim() == 2:
        # Assumes Gradio's (num_samples, num_channels) layout; down-mix to mono.
        waveform = waveform.mean(dim=1)

    if sample_rate != TARGET_SAMPLE_RATE and waveform.numel() > 0:
        # Plain linear interpolation keeps the script free of extra audio
        # dependencies. NOTE: there is no anti-aliasing filter here; swap in
        # torchaudio/librosa resampling if transcription quality matters.
        new_length = max(
            1, round(waveform.numel() * TARGET_SAMPLE_RATE / sample_rate)
        )
        waveform = torch.nn.functional.interpolate(
            waveform.reshape(1, 1, -1),
            size=new_length,
            mode="linear",
            align_corners=False,
        ).reshape(-1)

    return waveform


def transcribe(audio) -> str:
    """Transcribe one recording to text.

    Args:
        audio: ``(sample_rate, samples)`` tuple as produced by a Gradio audio
            input with ``type="numpy"``, or ``None`` when nothing was recorded.

    Returns:
        The decoded transcription, or an empty string when there is no audio.
    """
    if audio is None:
        return ""

    sample_rate, samples = audio
    waveform = _to_model_waveform(sample_rate, samples)
    if waveform.numel() == 0:
        return ""

    input_values = processor(
        waveform.numpy(),
        sampling_rate=TARGET_SAMPLE_RATE,
        return_tensors="pt",
    ).input_values

    # Inference only: skip autograd bookkeeping.
    with torch.no_grad():
        logits = model(input_values).logits

    # Greedy CTC decoding: most likely token per frame, collapsed by the processor.
    predicted_ids = torch.argmax(logits, dim=-1)
    return processor.batch_decode(predicted_ids)[0]


iface = gr.Interface(
    fn=transcribe,
    # "numpy" hands transcribe() the raw samples plus their sample rate, so the
    # function can resample to what the model expects.
    inputs=gr.inputs.Audio(source="microphone", type="numpy"),
    outputs="text",
)

iface.launch()