import streamlit as st
import torch
import librosa
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor

# You may need to install the dependencies first: pip install streamlit torch transformers librosa
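# To launch the app (assuming this file is saved as app.py): streamlit run app.py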
LANG_ID = "en"
MODEL_ID = "jonatasgrosman/wav2vec2-large-xlsr-53-english"

st.title("Speech Recognition App")  # Give your app a title

# Load the model and processor once and cache them so Streamlit does not
# reload them on every rerun of the script.
@st.cache_resource
def load_model_and_processor():
    processor = Wav2Vec2Processor.from_pretrained(MODEL_ID)
    model = Wav2Vec2ForCTC.from_pretrained(MODEL_ID)
    return processor, model

processor, model = load_model_and_processor()

def speech_file_to_array_fn(audio_file):
    # Decode the uploaded audio and resample it to 16 kHz, the rate the model expects.
    speech_array, sampling_rate = librosa.load(audio_file, sr=16_000)
    return speech_array

def process_audio(speech_array):
    # Convert the waveform into model inputs, run the model without gradients,
    # and greedily decode the CTC output into text.
    inputs = processor(speech_array, sampling_rate=16_000, return_tensors="pt", padding=True)
    with torch.no_grad():
        logits = model(inputs.input_values, attention_mask=inputs.attention_mask).logits
    predicted_ids = torch.argmax(logits, dim=-1)
    predicted_sentence = processor.batch_decode(predicted_ids)[0]
    return predicted_sentence

def main():
    uploaded_file = st.file_uploader("Choose an audio file (.wav format)", type='wav')

    if uploaded_file is not None:
        speech_array = speech_file_to_array_fn(uploaded_file)
        predicted_sentence = process_audio(speech_array)

        st.header("Prediction:")
        st.write(predicted_sentence)

if __name__ == "__main__":
    main()