import gradio as gr
import torch
import soundfile as sf  # for reading the recorded/uploaded audio file
from transformers import AutoProcessor, AutoModelForCTC

# The original script pointed at "Opit/mms_tts_bulgarian_finetuning", which is a
# text-to-speech (VITS) checkpoint and cannot produce CTC transcriptions.
# An MMS ASR checkpoint with the Bulgarian ("bul") adapter is assumed here instead.
MODEL_ID = "facebook/mms-1b-all"

processor = AutoProcessor.from_pretrained(MODEL_ID, target_lang="bul")
model = AutoModelForCTC.from_pretrained(MODEL_ID, target_lang="bul", ignore_mismatched_sizes=True)
model.eval()

# ASR function (speech-to-text conversion)
def asr_generate(audio):
    # Load the audio file; downmix stereo to mono if necessary
    speech, sample_rate = sf.read(audio)
    if speech.ndim > 1:
        speech = speech.mean(axis=1)

    # The model expects 16 kHz audio; recordings at other rates should be resampled first
    inputs = processor(speech, sampling_rate=16000, return_tensors="pt", padding=True)

    with torch.no_grad():
        logits = model(**inputs).logits

    # Pick the most likely token per frame and decode the IDs to text
    predicted_ids = torch.argmax(logits, dim=-1)
    transcription = processor.batch_decode(predicted_ids)[0]
    return transcription

# Build the Gradio interface
iface = gr.Interface(
    fn=asr_generate,
    # Gradio 4+ uses `sources=[...]`; older releases used `source="microphone"`
    inputs=gr.Audio(sources=["microphone", "upload"], type="filepath"),
    outputs="text",
    title="Bulgarian Speech Recognition",
    description="Upload or record audio in Bulgarian to get the transcription.",
)

# Launch the interface
if __name__ == "__main__":
    iface.launch()