theodotus committed on
Commit
4f90f68
·
1 Parent(s): 0fe1069

Added numpy workflow

Browse files
Files changed (1) hide show
  1. app.py +39 -3
app.py CHANGED
@@ -1,15 +1,51 @@
1
  import gradio as gr
 
 
 
2
 
3
  import nemo.collections.asr as nemo_asr
4
 
5
 
6
  asr_model = nemo_asr.models.EncDecCTCModelBPE. \
7
- from_pretrained("theodotus/stt_uk_squeezeformer_ctc_xs",map_location="cpu")
8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9
 
10
 
11
  def transcribe(audio, state=""):
12
- text = asr_model.transcribe([audio], batch_size=1)[0]
 
 
 
 
 
 
 
13
  state += text + " "
14
  return state, state
15
 
@@ -17,7 +53,7 @@ def transcribe(audio, state=""):
17
  gr.Interface(
18
  fn=transcribe,
19
  inputs=[
20
- gr.Audio(source="microphone", type="filepath", streaming=True),
21
  "state"
22
  ],
23
  outputs=[
 
1
  import gradio as gr
2
+ import numpy as np
3
+ import resampy
4
+ import torch
5
 
6
  import nemo.collections.asr as nemo_asr
7
 
8
 
9
  asr_model = nemo_asr.models.EncDecCTCModelBPE. \
10
+ from_pretrained("NeonBohdan/stt_uk_citrinet_512_gamma_0_25",map_location="cpu")
11
 
12
+ asr_model.preprocessor.featurizer.dither = 0.0
13
+ asr_model.preprocessor.featurizer.pad_to = 0
14
+ asr_model.eval()
15
+ asr_model.encoder.freeze()
16
+ asr_model.decoder.freeze()
17
+
18
+
19
+
20
def resample(sr, audio_data):
    """Convert a raw audio chunk to mono float32 at the model's sample rate.

    Args:
        sr: Sample rate of ``audio_data`` in Hz.
        audio_data: NumPy array of samples. Integer PCM is scaled by the
            dtype's maximum value; floating-point input is passed through
            as float32. May be 1-D or 2-D.

    Returns:
        A 1-D float32 array resampled to ``asr_model.cfg["sample_rate"]``.
    """
    samples = np.asarray(audio_data)

    if np.issubdtype(samples.dtype, np.integer):
        # Integer PCM: divide by the dtype's max so values land in about [-1, 1].
        samples = np.divide(
            samples, np.iinfo(samples.dtype).max, dtype=np.float32
        )
    else:
        # np.iinfo() raises on float dtypes.
        # NOTE(review): assumes float audio is already scaled to [-1, 1] - confirm.
        samples = samples.astype(np.float32)

    if samples.ndim > 1:
        # NOTE(review): assumes multi-channel layout is (samples, channels),
        # as documented for Gradio's numpy audio - confirm. Downmix to mono so
        # resampling runs along the time axis and the model gets a 1-D signal.
        samples = samples.mean(axis=1)

    return resampy.resample(samples, sr, asr_model.cfg["sample_rate"])
25
+
26
+
27
def model(audio_16k):
    """Run the ASR model on one audio chunk and return the first hypothesis.

    Args:
        audio_16k: 1-D array of float audio samples (the output of
            ``resample``).

    Returns:
        The first element of the hypotheses returned by
        ``asr_model.decoding.ctc_decoder_predictions_tensor``.
    """
    # Convert the array directly; wrapping an ndarray in a Python list before
    # torch.tensor() is slow and triggers a PyTorch warning.
    signal = torch.as_tensor(audio_16k, dtype=torch.float32)

    # Inference only: do not record autograd state.
    with torch.no_grad():
        logits, logits_len, _ = asr_model.forward(
            input_signal=signal.unsqueeze(0),  # add the batch dimension
            input_signal_length=torch.tensor([signal.shape[0]]),
        )

        current_hypotheses, _ = asr_model.decoding.ctc_decoder_predictions_tensor(
            logits, decoder_lengths=logits_len, return_hypotheses=False,
        )

    return current_hypotheses[0]
38
 
39
 
40
  def transcribe(audio, state=""):
41
+ # if state is None:
42
+ # pass
43
+
44
+ sr, audio_data = audio
45
+ audio_16k = resample(sr, audio_data)
46
+
47
+ text = model(audio_16k)
48
+
49
  state += text + " "
50
  return state, state
51
 
 
53
  gr.Interface(
54
  fn=transcribe,
55
  inputs=[
56
+ gr.Audio(source="microphone", type="numpy", streaming=True),
57
  "state"
58
  ],
59
  outputs=[