pratikshahp committed on
Commit 7f020a5 · verified · 1 Parent(s): d330003

Update app.py

Files changed (1)
  1. app.py +14 -18
app.py CHANGED
@@ -1,31 +1,27 @@
-#import librosa
 import torch
 from transformers import Wav2Vec2ForCTC, Wav2Vec2Tokenizer
 import streamlit as st
 from audio_recorder_streamlit import audio_recorder
 
 audio_bytes = audio_recorder(pause_threshold=3.0, sample_rate=16_000)
+
 if audio_bytes:
     st.audio(audio_bytes, format="audio/wav")
 
-#load pre-trained model and tokenizer
-tokenizer = Wav2Vec2Tokenizer.from_pretrained("facebook/wav2vec2-base-960h")
-model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-base-960h")
-
-#load audio file
-#speech, rate = librosa.load("/hip-voice.m4a",sr=16000)
-
-#import IPython.display as display
-#display.Audio("batman1.wav", autoplay=True)
-
-input_values = tokenizer(audio_bytes, return_tensors = 'pt').input_values
+    # Load pre-trained model and tokenizer
+    tokenizer = Wav2Vec2Tokenizer.from_pretrained("facebook/wav2vec2-base-960h")
+    model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-base-960h")
 
-#input_values = tokenizer(speech, return_tensors = 'pt').input_values
-logits = model(input_values).logits
+    # Tokenize the audio input
+    input_values = tokenizer(audio_bytes, return_tensors='pt').input_values
 
-predicted_ids = torch.argmax(logits, dim =-1)
+    # Perform inference
+    logits = model(input_values).logits
+    predicted_ids = torch.argmax(logits, dim=-1)
 
-#decode the audio to generate text
-transcriptions = tokenizer.decode(predicted_ids[0])
+    # Decode the audio to generate text
+    transcriptions = tokenizer.decode(predicted_ids[0])
 
-print(transcriptions)
+    st.write(transcriptions)
+else:
+    st.write("No audio recorded.")