import os
import tempfile

import numpy as np
import streamlit as st
from pydub import AudioSegment
from transformers import pipeline


# Cache the ASR pipeline so the model is loaded only once per session.
@st.cache_resource
def load_asr_pipeline():
    return pipeline(
        "automatic-speech-recognition",
        model="Yehor/whisper-small-ukrainian",
    )


st.title("Voice Recognition App using Whisper")
st.write("Upload an audio file and the Whisper model will transcribe it to text.")

# Load the ASR pipeline
asr_pipeline = load_asr_pipeline()
st.write("Model loaded successfully.")

# File uploader for audio files
uploaded_file = st.file_uploader("Choose an audio file", type=["wav", "mp3", "m4a"])

if uploaded_file is not None:
    # Save the upload to a temporary file, keeping the original extension
    # so ffmpeg (via pydub) can detect the container format.
    suffix = os.path.splitext(uploaded_file.name)[1]
    with tempfile.NamedTemporaryFile(suffix=suffix, delete=False) as temp_file:
        temp_file.write(uploaded_file.read())
        temp_file_path = temp_file.name

    st.audio(uploaded_file.getvalue(), format=uploaded_file.type)
    st.write("Transcribing audio...")

    # Decode the audio and resample to 16 kHz mono in memory, the input
    # format Whisper expects; no intermediate WAV file is needed.
    audio = (
        AudioSegment.from_file(temp_file_path)
        .set_frame_rate(16000)
        .set_channels(1)
    )
    os.remove(temp_file_path)

    # Convert integer PCM samples to float32 in [-1.0, 1.0]; Whisper expects
    # normalized floats, not raw int16 magnitudes.
    samples = np.array(audio.get_array_of_samples()).astype(np.float32)
    samples /= float(1 << (8 * audio.sample_width - 1))

    # Pass the sampling rate explicitly so the pipeline does not have to
    # guess it from the raw array.
    result = asr_pipeline({"raw": samples, "sampling_rate": 16000})

    # Display the transcription
    st.write("Transcription:")
    st.write(result["text"])
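
# Usage sketch (assumptions, not part of the original script): the file name
# "app.py" and the exact dependency list below are illustrative. Note that
# pydub needs an ffmpeg binary on the PATH to decode mp3/m4a uploads.
#
#   pip install streamlit transformers torch pydub numpy
#   streamlit run app.py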