smajumdar committed
Commit 737d6ed · 1 Parent(s): 221f936

Update app.py

Files changed (1)
  1. app.py +15 -8
app.py CHANGED
@@ -2,6 +2,7 @@ import gradio as gr
 import torch
 import time
 import librosa
+import numpy as np
 import soundfile
 import nemo.collections.asr as nemo_asr
 import tempfile
@@ -15,8 +16,9 @@ model.change_decoding_strategy(None)
 model.eval()
 
 
-def process_audio_file(file):
-    data, sr = librosa.load(file)
+# def process_audio_file(file):
+def process_audio_file(data, sr):
+    # data, sr = librosa.load(file)
 
     if sr != SAMPLE_RATE:
         data = librosa.resample(data, orig_sr=sr, target_sr=SAMPLE_RATE)
@@ -30,10 +32,15 @@ def transcribe(state, audio):
     # Grant additional context
     # time.sleep(1)
 
-    if state is None:
-        state = ""
+    sr, audio = audio
+    audio = audio.astype(np.float32)
+    audio /= np.max(np.abs(audio))
+
+    #if state is None:
+    #    state = ""
+    state = audio
 
-    audio_data = process_audio_file(audio)
+    audio_data = process_audio_file(audio, sr)
 
     with tempfile.TemporaryDirectory() as tmpdir:
         # Filepath transcribe
@@ -50,15 +57,15 @@ def transcribe(state, audio):
 
     transcriptions = transcriptions[0]
 
-    state = state + transcriptions + " "
-    return state, state
+    # state = state + transcriptions + " "
+    return state, transcriptions
 
 
 iface = gr.Interface(
     fn=transcribe,
     inputs=[
         "state",
-        gr.Audio(source="microphone", type='filepath', streaming=True),
+        gr.Audio(source="microphone", streaming=True),
     ],
     outputs=[
         "state",