"""Streamlit app: record or upload audio and transcribe it with a Whisper model.

The user picks an audio source (microphone or file upload) and the spoken
language; the matching speech-recognition checkpoint is then run on the audio
and the result is written to the page.
"""

import io

import numpy as np
import streamlit as st
import torch
import torchaudio
from streamlit_mic_recorder import mic_recorder, speech_to_text
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
from transformers import AutoProcessor, WhisperForConditionalGeneration
from transformers import BartForConditionalGeneration, BartTokenizer
from transformers import pipeline

# The waveform is resampled to this rate before it is handed to the pipeline
# as a bare array (no sampling rate is passed alongside it).
TARGET_SAMPLE_RATE = 16000

# Language shown in the UI -> (checkpoint id, heading written above the result).
# Insertion order is the order of the entries in the language select box.
LANGUAGE_CONFIG = {
    "English": ("openai/whisper-large-v2", "Here is your transcription:"),
    "Spanish": ("Sandiago21/whisper-large-v2-spanish", "Aqui tienes tu transcripcion:"),
    "German": ("primeline/whisper-large-v3-german", "Hier ist Ihre Transkription:"),
    "French": ("bofenghuang/whisper-large-v2-french", "Ici, vous avez votre transcription"),
    "Chinese": ("yi-ching/whisper-tiny-chinese-test", "这是您的转录。"),
}


@st.cache_resource
def load_asr_pipeline(model_name):
    """Build the speech-recognition pipeline for ``model_name``.

    Cached with ``st.cache_resource`` so that a script rerun (which Streamlit
    triggers on every widget interaction) reuses the already-built pipeline
    instead of constructing it again.
    """
    return pipeline("automatic-speech-recognition", model=model_name)


def transcribe_audio(audio_file, model_name="openai/whisper-large-v2"):
    """Transcribe an audio file with the given speech-recognition checkpoint.

    Args:
        audio_file: Anything ``torchaudio.load`` accepts (a path or a binary
            file-like object).
        model_name: Checkpoint id passed to the ASR pipeline. Defaults to the
            English model so existing one-argument calls keep working.

    Returns:
        Whatever the ASR pipeline returns for the decoded waveform.
    """
    # Rewind file-like inputs so decoding starts at the first byte even if
    # something has already read from the stream.
    if hasattr(audio_file, "seek"):
        audio_file.seek(0)

    waveform, sample_rate = torchaudio.load(audio_file)

    # Down-mix multi-channel audio to a single channel by averaging.
    if waveform.shape[0] > 1:
        waveform = torch.mean(waveform, dim=0, keepdim=True)

    if sample_rate != TARGET_SAMPLE_RATE:
        resampler = torchaudio.transforms.Resample(sample_rate, TARGET_SAMPLE_RATE)
        waveform = resampler(waveform)

    asr_pipeline = load_asr_pipeline(model_name)
    # Index 0 selects the single remaining channel as a 1-D array.
    return asr_pipeline(waveform.numpy()[0])


option = st.selectbox(
    "How do you want to import the audio file?",
    ("Microphone", "Upload file"),
)

# File-like object handed to transcribe_audio; stays None until audio exists.
audio_source = None

if option == "Microphone":
    # Record from the microphone and play the recording back.
    st.write("Record your voice, and play the recorded audio:")
    audio = mic_recorder(
        start_prompt="Press the botton to start recording ⏺️",
        stop_prompt="Press the botton to stop to stop the recording⏹️",
        key='recorder',
    )
    if audio is None:
        st.write("Please start the recording in the box above")
    else:
        st.audio(audio["bytes"])
        # The recorder result is a dict, which torchaudio cannot read; wrap the
        # raw bytes in an in-memory file instead.
        # NOTE(review): confirm the recorder's output container is one the
        # installed torchaudio backend can decode.
        audio_source = io.BytesIO(audio["bytes"])
elif option == "Upload file":
    audio = st.file_uploader(
        label="Upload your audio file here",
        type=["wav", 'mp3'],
    )
    if audio:
        st.audio(audio)
        audio_source = audio

option_language = st.selectbox(
    'Select the language of your audio',
    tuple(LANGUAGE_CONFIG),
)

if audio_source is None:
    # Microphone mode has already shown its own "start the recording" prompt,
    # so the upload prompt is only relevant in upload mode.
    if option == "Upload file":
        st.write("Please upload the audio in the box above")
else:
    model_name, heading = LANGUAGE_CONFIG[option_language]
    transcription = transcribe_audio(audio_source, model_name)
    st.write(heading)
    st.write(transcription)