"""Gradio app that transcribes uploaded or recorded audio with Moonshine-tiny."""

import gradio as gr
import torch
import torchaudio
from transformers import AutoModelForSpeechSeq2Seq, PreTrainedTokenizerFast

# Moonshine expects 16 kHz mono input.
TARGET_SAMPLE_RATE = 16000

# Load model and tokenizer globally (once, at import time) so every request
# reuses them instead of reloading from disk.
# NOTE(review): trust_remote_code=True executes code shipped with the model
# repo — acceptable here only because the source is a known publisher.
model = AutoModelForSpeechSeq2Seq.from_pretrained(
    'usefulsensors/moonshine-tiny', trust_remote_code=True
)
tokenizer = PreTrainedTokenizerFast.from_pretrained('usefulsensors/moonshine-tiny')


def transcribe_audio(audio_path):
    """Transcribe the audio file at *audio_path* and return the text.

    Parameters
    ----------
    audio_path : str | None
        Filesystem path handed over by the Gradio ``Audio`` component
        (``type="filepath"``). ``None`` when the user submits without audio.

    Returns
    -------
    str
        The decoded transcription (or a short notice if no audio was given).
    """
    # Guard: the button can be clicked before any audio is provided.
    if audio_path is None:
        return "No audio provided."

    # Load and resample audio to the model's expected sample rate.
    audio, sr = torchaudio.load(audio_path)
    if sr != TARGET_SAMPLE_RATE:
        audio = torchaudio.functional.resample(audio, sr, TARGET_SAMPLE_RATE)

    # Downmix multi-channel (e.g. stereo microphone) input to mono;
    # keepdim preserves the (1, samples) shape the model call expects.
    if audio.shape[0] > 1:
        audio = audio.mean(dim=0, keepdim=True)

    # Get transcription. Inference only — skip autograd bookkeeping.
    # NOTE(review): calling the model directly (not .generate) relies on the
    # Moonshine remote-code forward returning token ids — confirm against the
    # installed model revision.
    with torch.no_grad():
        tokens = model(audio)
    transcription = tokenizer.decode(tokens[0], skip_special_tokens=True)
    return transcription


# Create Gradio interface.
# NOTE(review): `source=` is the Gradio 3.x keyword; Gradio 4.x renamed it to
# `sources=[...]` — confirm the pinned gradio version before upgrading.
demo = gr.Blocks()
with demo:
    gr.Markdown("## Audio Transcription App")
    with gr.Tabs():
        with gr.TabItem("Upload Audio"):
            audio_file = gr.Audio(source="upload", type="filepath")
            output_text1 = gr.Textbox(label="Transcription")
            upload_button = gr.Button("Transcribe")
            upload_button.click(
                fn=transcribe_audio, inputs=audio_file, outputs=output_text1
            )
        with gr.TabItem("Record Audio"):
            audio_mic = gr.Audio(source="microphone", type="filepath")
            output_text2 = gr.Textbox(label="Transcription")
            record_button = gr.Button("Transcribe")
            record_button.click(
                fn=transcribe_audio, inputs=audio_mic, outputs=output_text2
            )

# Only start the server when run as a script, not when imported.
if __name__ == "__main__":
    demo.launch()