Yurii Paniv committed on
Commit
14485b0
·
1 Parent(s): 9da9a4e

Add side-by-side comparison

Browse files
Files changed (4) hide show
  1. README.md +5 -0
  2. app.py +34 -37
  3. requirements-local.txt +2 -0
  4. requirements.txt +0 -1
README.md CHANGED
@@ -4,6 +4,7 @@ emoji: 🐌
4
  colorFrom: blue
5
  colorTo: yellow
6
  sdk: gradio
 
7
  app_file: app.py
8
  pinned: false
9
  ---
@@ -12,6 +13,7 @@ pinned: false
12
  This repository aims to apply various speech recognition models to the Ukrainian language.
13
 
14
  You can see online demo here: https://huggingface.co/spaces/robinhad/ukrainian-stt.
 
15
  Source code is in this repository together with auto-deploy pipeline scripts.
16
 
17
 
@@ -30,6 +32,9 @@ If you'd like to check out different models for Ukrainian language, please visit
30
  # 🤖 Training scripts
31
  Guides for training are available in corresponding folders for each model.
32
 
 
 
 
33
  # 🤝 Attribution
34
  [@robinhad](https://github.com/robinhad) - model training.
35
  [@egorsmkv](https://github.com/egorsmkv) - organized [Ukrainian Speech recognition community](https://github.com/egorsmkv/speech-recognition-uk).
 
4
  colorFrom: blue
5
  colorTo: yellow
6
  sdk: gradio
7
+ sdk_version: 3.2
8
  app_file: app.py
9
  pinned: false
10
  ---
 
13
  This repository aims to apply various speech recognition models to the Ukrainian language.
14
 
15
  You can see online demo here: https://huggingface.co/spaces/robinhad/ukrainian-stt.
16
+ GitHub link: https://github.com/robinhad/voice-recognition-ua.
17
  Source code is in this repository together with auto-deploy pipeline scripts.
18
 
19
 
 
32
  # 🤖 Training scripts
33
  Guides for training are available in corresponding folders for each model.
34
 
35
+ # Support
36
+ If you like my work, please support it here: https://send.monobank.ua/jar/48iHq4xAXm
37
+
38
  # 🤝 Attribution
39
  [@robinhad](https://github.com/robinhad) - model training.
40
  [@egorsmkv](https://github.com/egorsmkv) - organized [Ukrainian Speech recognition community](https://github.com/egorsmkv/speech-recognition-uk).
app.py CHANGED
@@ -9,10 +9,6 @@ from os.path import exists
9
  from stt import Model
10
  from datetime import datetime
11
 
12
- MODEL_NAMES = [
13
- "No scorer",
14
- "With scorer"
15
- ]
16
 
17
  # download model
18
  version = "v0.4"
@@ -22,15 +18,17 @@ scorer_name = "kenlm.scorer"
22
  model_link = f"{storage_url}/{model_name}"
23
  scorer_link = f"{storage_url}/{scorer_name}"
24
 
 
 
 
 
 
 
 
 
25
 
26
- def client(audio_data: np.array, sample_rate: int, use_scorer=False):
27
- output_audio = _convert_audio(audio_data, sample_rate)
28
-
29
- fin = wave.open(output_audio, 'rb')
30
- audio = np.frombuffer(fin.readframes(fin.getnframes()), np.int16)
31
-
32
- fin.close()
33
 
 
34
  ds = Model(model_name)
35
  if use_scorer:
36
  ds.enableExternalScorer("kenlm.scorer")
@@ -40,28 +38,30 @@ def client(audio_data: np.array, sample_rate: int, use_scorer=False):
40
  return result
41
 
42
 
43
- def download(url, file_name):
44
- if not exists(file_name):
45
- print(f"Downloading {file_name}")
46
- r = requests.get(url, allow_redirects=True)
47
- with open(file_name, 'wb') as file:
48
- file.write(r.content)
49
- else:
50
- print(f"Found {file_name}. Skipping download...")
51
-
52
 
53
- def stt(audio: Tuple[int, np.array], model_name: str):
54
- sample_rate, audio = audio
55
- print(f"Input sample rate: {sample_rate}. Audio file length: {round(audio.shape[0]/sample_rate ,2)}")
56
- use_scorer = True if model_name == "With scorer" else False
57
 
58
- recognized_result = client(audio, sample_rate, use_scorer)
59
- print(f"Time: {datetime.utcnow()}. Transcript: `{recognized_result}`. Scorer: {use_scorer}.")
 
60
 
61
- return recognized_result
62
 
 
 
 
 
 
 
 
63
 
64
  def _convert_audio(audio_data: np.array, sample_rate: int):
 
 
 
65
  source_audio = BytesIO()
66
  source_audio.write(audio_data)
67
  source_audio.seek(0)
@@ -76,23 +76,20 @@ def _convert_audio(audio_data: np.array, sample_rate: int):
76
  output_audio.seek(0)
77
  return output_audio
78
 
 
 
79
 
80
  iface = gr.Interface(
81
- fn=stt,
82
  inputs=[
83
  gr.inputs.Audio(type="numpy",
84
- label=None, optional=False),
85
- gr.inputs.Radio(
86
- label="Виберіть Speech-to-Text модель",
87
- choices=MODEL_NAMES,
88
- ),
89
-
90
  ],
91
- outputs=gr.outputs.Textbox(label="Output"),
92
- title="🐸🇺🇦 - Coqui STT",
93
  theme="huggingface",
94
  description="Україномовний🇺🇦 Speech-to-Text за допомогою Coqui STT",
95
- article="Якщо вам подобається, підтримайте за посиланням: [SUPPORT LINK](https://send.monobank.ua/jar/48iHq4xAXm)",
96
  )
97
 
98
  download(model_link, model_name)
 
9
  from stt import Model
10
  from datetime import datetime
11
 
 
 
 
 
12
 
13
  # download model
14
  version = "v0.4"
 
18
  model_link = f"{storage_url}/{model_name}"
19
  scorer_link = f"{storage_url}/{scorer_name}"
20
 
21
+ def download(url, file_name):
22
+ if not exists(file_name):
23
+ print(f"Downloading {file_name}")
24
+ r = requests.get(url, allow_redirects=True)
25
+ with open(file_name, 'wb') as file:
26
+ file.write(r.content)
27
+ else:
28
+ print(f"Found {file_name}. Skipping download...")
29
 
 
 
 
 
 
 
 
30
 
31
+ def deepspeech(audio: np.array, use_scorer=False):
32
  ds = Model(model_name)
33
  if use_scorer:
34
  ds.enableExternalScorer("kenlm.scorer")
 
38
  return result
39
 
40
 
41
+ def inference(audio: Tuple[int, np.array]):
42
+ print("=============================")
43
+ print(f"Time: {datetime.utcnow()}.`")
 
 
 
 
 
 
44
 
45
+ output_audio = _convert_audio(audio[1], audio[0])
 
 
 
46
 
47
+ fin = wave.open(output_audio, 'rb')
48
+ audio = np.frombuffer(fin.readframes(fin.getnframes()), np.int16)
49
+ fin.close()
50
 
51
+ transcripts = []
52
 
53
+ transcripts.append("")
54
+ transcripts.append(deepspeech(audio, use_scorer=True))
55
+ print(f"Deepspeech with LM: `{transcripts[-1]}`")
56
+ transcripts.append(deepspeech(audio))
57
+ print(f"Deepspeech: `{transcripts[-1]}`")
58
+ return tuple(transcripts)
59
+
60
 
61
  def _convert_audio(audio_data: np.array, sample_rate: int):
62
+ audio_limit = sample_rate * 60 * 2 # limit audio to 2 minutes max
63
+ if audio_data.shape[0] > audio_limit:
64
+ audio_data = audio_data[0:audio_limit]
65
  source_audio = BytesIO()
66
  source_audio.write(audio_data)
67
  source_audio.seek(0)
 
76
  output_audio.seek(0)
77
  return output_audio
78
 
79
+ with open("README.md") as file:
80
+ article = file.read()
81
 
82
  iface = gr.Interface(
83
+ fn=inference,
84
  inputs=[
85
  gr.inputs.Audio(type="numpy",
86
+ label="Аудіо", optional=False),
 
 
 
 
 
87
  ],
88
+ outputs=[gr.outputs.Textbox(label="Wav2Vec2"), gr.outputs.Textbox(label="DeepSpeech with LM"), gr.outputs.Textbox(label="DeepSpeech")],
89
+ title="🇺🇦 Ukrainian Speech-to-Text models",
90
  theme="huggingface",
91
  description="Україномовний🇺🇦 Speech-to-Text за допомогою Coqui STT",
92
+ article=article,
93
  )
94
 
95
  download(model_link, model_name)
requirements-local.txt ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ -r requirements.txt
2
+ gradio==3.2
requirements.txt CHANGED
@@ -1,3 +1,2 @@
1
- gradio==2.4.5
2
  STT==1.3.0
3
  pydub==0.25.1
 
 
1
  STT==1.3.0
2
  pydub==0.25.1