import os
import json
import torch
import whisper
import streamlit as st
from tempfile import NamedTemporaryFile


def transcribe_audio(audio_file, model):
    """
    Transcribe a single audio file using OpenAI's Whisper model locally.
    """
    result = model.transcribe(audio_file)
    return result["text"].strip()


@st.cache_resource
def load_whisper_model(device):
    """
    Load the Whisper "large" model once and cache it across Streamlit
    reruns, so the weights are not reloaded on every button click.
    """
    return whisper.load_model("large", device=device)


def main():
    st.title("Audio Transcription with Whisper")

    # File uploader
    uploaded_file = st.file_uploader(
        "Choose an audio file", type=["wav", "mp3", "m4a", "flac", "aac"]
    )

    if uploaded_file is not None:
        # Display audio file details
        file_details = {"Filename": uploaded_file.name, "FileSize": uploaded_file.size}
        st.write(file_details)

        # Play audio; use the upload's own MIME type rather than
        # hard-coding 'audio/wav', which would be wrong for mp3/m4a/etc.
        st.audio(uploaded_file, format=uploaded_file.type)

        if st.button("Transcribe Audio"):
            with st.spinner("Transcribing audio using Whisper large model..."):
                # Use the GPU when CUDA is available, otherwise fall back to CPU
                device = "cuda" if torch.cuda.is_available() else "cpu"
                st.info(f"Using device: {device}")

                # Load the (cached) Whisper model
                model = load_whisper_model(device)

                # Whisper expects a path on disk, so write the upload to a
                # temporary file, preserving the original extension
                with NamedTemporaryFile(
                    delete=False, suffix=os.path.splitext(uploaded_file.name)[1]
                ) as tmp_file:
                    tmp_file.write(uploaded_file.getvalue())
                    tmp_file_path = tmp_file.name

                try:
                    transcription = transcribe_audio(tmp_file_path, model)
                finally:
                    # Remove the temporary file even if transcription fails
                    os.unlink(tmp_file_path)

                # Display transcription
                st.subheader("Transcription:")
                st.write(transcription)

                # Save transcription to JSON, keyed by the original filename
                output_json = "transcription.json"
                with open(output_json, "w", encoding="utf-8") as f:
                    json.dump(
                        {uploaded_file.name: transcription},
                        f,
                        ensure_ascii=False,
                        indent=4,
                    )
                st.success(f"Transcription saved to {output_json}")


if __name__ == "__main__":
    main()
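
# Usage sketch: this assumes the script is saved as app.py (the filename is
# illustrative) and that ffmpeg is installed and on the PATH, which Whisper
# requires for audio decoding.
#
#   pip install streamlit openai-whisper torch
#   streamlit run app.py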