Added numpy workflow
app.py CHANGED
@@ -1,15 +1,51 @@
 import gradio as gr
+import numpy as np
+import resampy
+import torch
 
 import nemo.collections.asr as nemo_asr
 
 
 asr_model = nemo_asr.models.EncDecCTCModelBPE. \
-    from_pretrained("
+    from_pretrained("NeonBohdan/stt_uk_citrinet_512_gamma_0_25",map_location="cpu")
 
+asr_model.preprocessor.featurizer.dither = 0.0
+asr_model.preprocessor.featurizer.pad_to = 0
+asr_model.eval()
+asr_model.encoder.freeze()
+asr_model.decoder.freeze()
+
+
+
+def resample(sr, audio_data):
+    audio_fp32 = np.divide(audio_data, np.iinfo(audio_data.dtype).max, dtype=np.float32)
+    audio_16k = resampy.resample(audio_fp32, sr, asr_model.cfg["sample_rate"])
+
+    return audio_16k
+
+
+def model(audio_16k):
+    logits, logits_len, greedy_predictions = asr_model.forward(
+        input_signal=torch.tensor([audio_16k]),
+        input_signal_length=torch.tensor([len(audio_16k)])
+    )
+
+    current_hypotheses, all_hyp = asr_model.decoding.ctc_decoder_predictions_tensor(
+        logits, decoder_lengths=logits_len, return_hypotheses=False,
+    )
+
+    return current_hypotheses[0]
 
 
 def transcribe(audio, state=""):
-
+    # if state is None:
+    #     pass
+
+    sr, audio_data = audio
+    audio_16k = resample(sr, audio_data)
+
+    text = model(audio_16k)
+
     state += text + " "
     return state, state
 
@@ -17,7 +53,7 @@ def transcribe(audio, state=""):
 gr.Interface(
     fn=transcribe,
     inputs=[
-        gr.Audio(source="microphone", type="
+        gr.Audio(source="microphone", type="numpy", streaming=True),
         "state"
     ],
     outputs=[
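
The new resample() helper does two things: it normalizes the raw int16 microphone buffer Gradio delivers to float32 in [-1.0, 1.0], then converts it to the model's sample rate. A self-contained sketch of those same two steps on synthetic audio, assuming resampy is installed and substituting a literal 16000 for asr_model.cfg["sample_rate"]:

import numpy as np
import resampy

# One second of fake 48 kHz int16 microphone audio (a 440 Hz tone).
sr = 48000
t = np.linspace(0, 1, sr, endpoint=False)
audio_data = (0.5 * np.iinfo(np.int16).max * np.sin(2 * np.pi * 440 * t)).astype(np.int16)

# Divide by the dtype's max so the conversion works for any integer
# dtype Gradio hands back, exactly as the app does.
audio_fp32 = np.divide(audio_data, np.iinfo(audio_data.dtype).max, dtype=np.float32)

# Resample to the rate the ASR model expects (16 kHz assumed here).
audio_16k = resampy.resample(audio_fp32, sr, 16000)

print(audio_16k.shape)  # (16000,)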
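A design note on the new model() helper: freezing the encoder and decoder stops gradient updates to their parameters, but the forward pass itself still runs with autograd enabled. Wrapping inference in torch.no_grad() also skips building the activation graph, which saves memory on a CPU Space. A hedged variant using only the calls already present in the diff, with the model load mirroring the lines above:

import torch
import nemo.collections.asr as nemo_asr

# Same checkpoint the Space loads (see the diff above).
asr_model = nemo_asr.models.EncDecCTCModelBPE.from_pretrained(
    "NeonBohdan/stt_uk_citrinet_512_gamma_0_25", map_location="cpu"
)
asr_model.eval()

def model(audio_16k):
    # Greedy CTC inference without autograd bookkeeping.
    with torch.no_grad():
        logits, logits_len, greedy_predictions = asr_model.forward(
            input_signal=torch.tensor([audio_16k]),
            input_signal_length=torch.tensor([len(audio_16k)]),
        )
        # Decode the best path into text; return_hypotheses=False yields
        # plain strings rather than Hypothesis objects.
        current_hypotheses, all_hyp = asr_model.decoding.ctc_decoder_predictions_tensor(
            logits, decoder_lengths=logits_len, return_hypotheses=False,
        )
    return current_hypotheses[0]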
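The diff is truncated at outputs=[, so the interface's closing lines are not shown here. For orientation only: streaming Gradio demos of this era typically pair the microphone and "state" inputs with a textbox output plus a "state" output (which Gradio feeds back into the next transcribe() call), and set live=True so the function runs on each audio chunk. A sketch of that common pattern, not the Space's actual code:

import gradio as gr

gr.Interface(
    fn=transcribe,  # the streaming transcribe() defined in app.py
    inputs=[
        gr.Audio(source="microphone", type="numpy", streaming=True),
        "state",    # accumulated transcript, threaded between calls
    ],
    outputs=[
        "textbox",  # transcript shown to the user
        "state",    # carried into the next invocation
    ],
    live=True,
).launch()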