# bgtts / app.py
# englissi's picture
# Update app.py
# 805ef56 verified
# raw
# history blame
# 1.23 kB
import gradio as gr
import torch
from transformers import AutoProcessor, AutoModelForCTC
import soundfile as sf # For handling audio input
# Load the speech-recognition processor/model pair used by asr_generate().
from transformers import AutoTokenizer, AutoModelForPreTraining

# NOTE(review): the previous code loaded only a tokenizer and an
# AutoModelForPreTraining from "Opit/mms_tts_bulgarian_finetuning" (by its
# name a text-to-speech fine-tune) and never defined `processor`, although
# asr_generate() calls `processor(...)`, reads `.logits` and CTC-decodes them.
# The pair below matches that usage. The checkpoint choice is an assumption
# (multilingual MMS ASR with the Bulgarian adapter) -- confirm with the Space
# owner or swap in the intended Bulgarian CTC checkpoint.
ASR_MODEL_ID = "facebook/mms-1b-all"
ASR_LANGUAGE = "bul"  # ISO 639-3 code selecting the Bulgarian adapter/vocabulary

processor = AutoProcessor.from_pretrained(ASR_MODEL_ID, target_lang=ASR_LANGUAGE)
model = AutoModelForCTC.from_pretrained(
    ASR_MODEL_ID,
    target_lang=ASR_LANGUAGE,
    # Required by the MMS loading recipe: the language-specific output layer
    # differs in size from the one stored in the base checkpoint.
    ignore_mismatched_sizes=True,
)

# Kept so that the module-level name `tokenizer` stays defined for any code
# that still refers to it.
tokenizer = processor.tokenizer
# ASR conversion function (speech-to-text conversion)
def asr_generate(audio):
    """Transcribe an audio file to text.

    The file is read with ``soundfile``, averaged down to a single channel
    and resampled to 16 kHz before being passed to the module-level
    ``processor``. The module-level ``model`` then produces logits, the
    argmax over their last dimension is taken, and the ids are decoded with
    ``processor.batch_decode``.

    Args:
        audio: Path to an audio file (the Gradio input is configured with
            ``type="filepath"``). ``None`` or an empty path means that no
            audio was submitted.

    Returns:
        The first decoded transcription, or ``""`` when there is no audio
        to transcribe (nothing submitted, or a file with zero samples).
    """
    # Nothing recorded/uploaded: return an empty transcription instead of
    # failing inside sf.read().
    if not audio:
        return ""

    # Rate that is declared to the processor (unchanged from the original
    # call). NOTE(review): assumes the loaded processor expects 16 kHz input.
    target_rate = 16000

    speech, sample_rate = sf.read(audio, dtype="float32")
    waveform = torch.as_tensor(speech)

    # soundfile returns a (frames, channels) array for multi-channel files;
    # the processor is given a single 1-D waveform.
    if waveform.ndim > 1:
        waveform = waveform.mean(dim=1)
    if waveform.numel() == 0:
        return ""

    # The file's real rate used to be discarded while 16000 was always
    # declared, so e.g. a 48 kHz recording was fed in as if it were 16 kHz.
    # Linear interpolation keeps this dependency-free; a band-limited
    # resampler would give better quality if one becomes available.
    if sample_rate != target_rate:
        resampled_len = max(1, round(waveform.numel() * target_rate / sample_rate))
        waveform = torch.nn.functional.interpolate(
            waveform.reshape(1, 1, -1),
            size=resampled_len,
            mode="linear",
            align_corners=False,
        ).reshape(-1)

    inputs = processor(
        waveform.numpy(),
        sampling_rate=target_rate,
        return_tensors="pt",
        padding=True,
    )

    # Inference only: no gradients needed.
    with torch.no_grad():
        logits = model(**inputs).logits

    # Pick the highest-scoring id at each position and decode to text.
    predicted_ids = torch.argmax(logits, dim=-1)
    return processor.batch_decode(predicted_ids)[0]
# Create the Gradio interface.
# `source=` is the Gradio 3.x keyword. Gradio 4 renamed it to `sources`
# (a list) and rejects the old spelling with TypeError, so keep the original
# call where it works and fall back to the new keyword otherwise.
try:
    _audio_input = gr.Audio(source="microphone", type="filepath")
except TypeError:
    _audio_input = gr.Audio(sources=["microphone"], type="filepath")

iface = gr.Interface(
    fn=asr_generate,
    inputs=_audio_input,
    outputs="text",
    title="Bulgarian Speech Recognition",
    description="Upload or record audio in Bulgarian to get the transcription.",
)

# Launch the interface only when the file is run as a script.
if __name__ == "__main__":
    iface.launch()