import gradio as gr
import torch
from transformers import AutoProcessor, AutoModelForCTC
import soundfile as sf


# Load the processor and model for Bulgarian speech recognition.
# Note: the transcription code below relies on a CTC (speech-to-text) head, so the
# checkpoint must provide one; verify this, since the repo name reads like an
# MMS-TTS fine-tune.
processor = AutoProcessor.from_pretrained("Opit/mms_tts_bulgarian_finetuning")
model = AutoModelForCTC.from_pretrained("Opit/mms_tts_bulgarian_finetuning")
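# Inference only: eval() disables dropout and other train-time behaviour.
model.eval()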


def asr_generate(audio):
    # Gradio hands the recorded clip over as a file path (type="filepath" below).
    speech, sample_rate = sf.read(audio)
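    # Assumption: the feature extractor expects mono 16 kHz input, as is typical
    # for MMS/Wav2Vec2 CTC checkpoints, while browser recordings are often
    # 44.1/48 kHz stereo. librosa is an assumed extra dependency, used only here.
    import librosa
    if speech.ndim > 1:
        speech = speech.mean(axis=1)  # downmix stereo to mono
    if sample_rate != 16000:
        speech = librosa.resample(speech, orig_sr=sample_rate, target_sr=16000)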

    inputs = processor(speech, sampling_rate=16000, return_tensors="pt", padding=True)

    with torch.no_grad():
        logits = model(**inputs).logits

    # Greedy CTC decoding: take the most likely token at each frame; batch_decode
    # collapses repeated tokens and CTC blanks into the final transcription.
    predicted_ids = torch.argmax(logits, dim=-1)
    transcription = processor.batch_decode(predicted_ids)[0]

    return transcription

iface = gr.Interface(
    fn=asr_generate,
    # `sources` is the Gradio 4+ parameter name; Gradio 3.x used source="microphone".
    inputs=gr.Audio(sources=["microphone", "upload"], type="filepath"),
    outputs="text",
    title="Bulgarian Speech Recognition",
    description="Upload or record audio in Bulgarian to get the transcription.",
)

if __name__ == "__main__":
    iface.launch()
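    # iface.launch(share=True) additionally serves a temporary public URL.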