"""Gradio demo: Bulgarian speech recognition with a wav2vec2 CTC model."""

from math import gcd

import gradio as gr
import numpy as np
import soundfile as sf  # To handle audio input
import torch
from scipy.signal import resample_poly
from transformers import AutoModelForCTC, AutoProcessor

MODEL_ID = "infinitejoy/wav2vec2-large-xls-r-300m-bulgarian"

# Sampling rate passed to the processor; input audio is resampled to match it.
TARGET_SAMPLE_RATE = 16000

# Load the processor and model directly for Bulgarian ASR
processor = AutoProcessor.from_pretrained(MODEL_ID)
model = AutoModelForCTC.from_pretrained(MODEL_ID)


def _prepare_waveform(speech, sample_rate):
    """Return *speech* as a mono float32 waveform at ``TARGET_SAMPLE_RATE``.

    Args:
        speech: Audio samples as returned by ``soundfile.read``; either a
            1-D array or a 2-D ``(frames, channels)`` array.
        sample_rate: Sampling rate of *speech* in Hz.

    Returns:
        A 1-D ``numpy.float32`` array sampled at ``TARGET_SAMPLE_RATE``.
    """
    waveform = np.asarray(speech, dtype=np.float32)

    # soundfile yields (frames, channels) for multi-channel files; average
    # the channels so the processor sees a single mono waveform.
    if waveform.ndim > 1:
        waveform = waveform.mean(axis=1)

    sample_rate = int(sample_rate)
    if sample_rate != TARGET_SAMPLE_RATE and waveform.size:
        # Polyphase resampling applies an anti-aliasing filter, which matters
        # when down-sampling typical 44.1/48 kHz microphone recordings.
        divisor = gcd(sample_rate, TARGET_SAMPLE_RATE)
        waveform = resample_poly(
            waveform,
            TARGET_SAMPLE_RATE // divisor,
            sample_rate // divisor,
        ).astype(np.float32)

    return waveform


# ASR conversion function (speech-to-text conversion)
def asr_generate(audio):
    """Transcribe an audio file to text.

    Args:
        audio: Path to the audio file handed over by the Gradio ``Audio``
            component, or ``None`` when nothing was recorded.

    Returns:
        The decoded transcription, or an empty string when there is no audio
        to transcribe.
    """
    # Gradio passes None when the user submits without recording anything.
    if audio is None:
        return ""

    # Load the audio and keep its real sampling rate so it can be converted
    # to the rate declared to the processor below.
    speech, sample_rate = sf.read(audio)
    waveform = _prepare_waveform(speech, sample_rate)
    if waveform.size == 0:
        return ""

    inputs = processor(
        waveform,
        sampling_rate=TARGET_SAMPLE_RATE,
        return_tensors="pt",
        padding=True,
    )

    with torch.no_grad():
        logits = model(**inputs).logits

    # Get predicted IDs (greedy argmax over the vocabulary) and decode the text
    predicted_ids = torch.argmax(logits, dim=-1)
    transcription = processor.batch_decode(predicted_ids)[0]
    return transcription


# Create the Gradio interface
# NOTE(review): `source=` is the Gradio 3.x keyword; newer Gradio releases
# expect `sources=[...]` instead -- confirm against the pinned version.
iface = gr.Interface(
    fn=asr_generate,
    inputs=gr.Audio(source="microphone", type="filepath"),
    outputs="text",
    title="Bulgarian Speech Recognition",
    description="Upload or record audio in Bulgarian to get the transcription.",
)

# Launch the interface
if __name__ == "__main__":
    iface.launch()