bgtts / app.py
englissi's picture
Update app.py
bb16e26 verified
raw
history blame
1.22 kB
import gradio as gr
import torch
from transformers import AutoProcessor, AutoModelForCTC
import soundfile as sf # To handle audio input
# Load the Bulgarian wav2vec2 CTC checkpoint once at import time so every
# request served by the app reuses the same processor and model objects.
_MODEL_ID = "infinitejoy/wav2vec2-large-xls-r-300m-bulgarian"
processor = AutoProcessor.from_pretrained(_MODEL_ID)
model = AutoModelForCTC.from_pretrained(_MODEL_ID)
# Sample rate (Hz) declared to the processor for every request.
_TARGET_SAMPLE_RATE = 16_000


def _to_mono_at_rate(samples, source_rate, target_rate):
    """Downmix ``(frames, channels)`` samples to mono at ``target_rate`` Hz.

    Args:
        samples: 2-D float NumPy array shaped ``(frames, channels)``.
        source_rate: Sample rate of ``samples`` in Hz.
        target_rate: Desired sample rate in Hz.

    Returns:
        A 1-D NumPy array holding the mono waveform at ``target_rate``.
    """
    mono = torch.from_numpy(samples).mean(dim=1)
    if source_rate == target_rate or mono.numel() == 0:
        return mono.numpy()
    new_length = max(1, round(mono.numel() * target_rate / source_rate))
    # Plain linear interpolation: no extra dependency is needed, but there is
    # also no anti-aliasing filter. NOTE(review): swap in a dedicated
    # resampler if one becomes available to the app.
    resampled = torch.nn.functional.interpolate(
        mono.view(1, 1, -1),
        size=new_length,
        mode="linear",
        align_corners=False,
    )
    return resampled.view(-1).numpy()


# ASR conversion function (speech-to-text conversion)
def asr_generate(audio):
    """Transcribe an audio file with the module-level processor and model.

    Args:
        audio: Path of the audio file to transcribe, as read by
            ``soundfile``. A falsy value (``None`` or ``""``) means that no
            recording was supplied.

    Returns:
        The decoded transcription, or an empty string when there is no audio
        to transcribe.
    """
    if not audio:
        return ""

    # always_2d gives a uniform (frames, channels) layout for mono and
    # multi-channel files alike; float32 avoids soundfile's float64 default.
    samples, source_rate = sf.read(audio, dtype="float32", always_2d=True)
    if samples.shape[0] == 0:
        return ""

    # The file's real sample rate must be honoured: declaring 16 kHz for a
    # recording made at another rate hands the model time-stretched audio.
    speech = _to_mono_at_rate(samples, source_rate, _TARGET_SAMPLE_RATE)

    inputs = processor(
        speech,
        sampling_rate=_TARGET_SAMPLE_RATE,
        return_tensors="pt",
        padding=True,
    )
    with torch.no_grad():
        logits = model(**inputs).logits

    # Greedy decoding: take the highest-scoring token id per frame and let
    # the processor turn the id sequence into text.
    predicted_ids = torch.argmax(logits, dim=-1)
    return processor.batch_decode(predicted_ids)[0]
# Create the Gradio interface.
# NOTE(review): `source=` is the keyword this file uses; newer Gradio
# releases may expect `sources=[...]` instead -- confirm against the pinned
# Gradio version.
# NOTE(review): the description mentions uploading, but only the microphone
# source is enabled here -- confirm which is intended.
_mic_input = gr.Audio(source="microphone", type="filepath")

iface = gr.Interface(
    fn=asr_generate,
    inputs=_mic_input,
    outputs="text",
    title="Bulgarian Speech Recognition",
    description="Upload or record audio in Bulgarian to get the transcription.",
)

# Run the interface only when this file is executed as a script.
if __name__ == "__main__":
    iface.launch()