smajumdar committed
Commit 737d6ed · 1 Parent(s): 221f936

Update app.py

Files changed (1)
  1. app.py +15 -8
app.py CHANGED
@@ -2,6 +2,7 @@ import gradio as gr
 import torch
 import time
 import librosa
+import numpy as np
 import soundfile
 import nemo.collections.asr as nemo_asr
 import tempfile
@@ -15,8 +16,9 @@ model.change_decoding_strategy(None)
 model.eval()
 
 
-def process_audio_file(file):
-    data, sr = librosa.load(file)
+# def process_audio_file(file):
+def process_audio_file(data, sr):
+    # data, sr = librosa.load(file)
 
     if sr != SAMPLE_RATE:
         data = librosa.resample(data, orig_sr=sr, target_sr=SAMPLE_RATE)
@@ -30,10 +32,15 @@ def transcribe(state, audio):
     # Grant additional context
     # time.sleep(1)
 
-    if state is None:
-        state = ""
+    sr, audio = audio
+    audio = audio.astype(np.float32)
+    audio /= np.max(np.abs(audio))
+
+    #if state is None:
+    #    state = ""
+    state = audio
 
-    audio_data = process_audio_file(audio)
+    audio_data = process_audio_file(audio, sr)
 
     with tempfile.TemporaryDirectory() as tmpdir:
         # Filepath transcribe
@@ -50,15 +57,15 @@ def transcribe(state, audio):
 
     transcriptions = transcriptions[0]
 
-    state = state + transcriptions + " "
-    return state, state
+    # state = state + transcriptions + " "
+    return state, transcriptions
 
 
 iface = gr.Interface(
     fn=transcribe,
     inputs=[
         "state",
-        gr.Audio(source="microphone", type='filepath', streaming=True),
+        gr.Audio(source="microphone", streaming=True),
     ],
     outputs=[
         "state",