Porjaz committed on
Commit df08a7c
1 Parent(s): cfe35e2

Upload 4 files

Files changed (4):
  1. 1000_unigram.model +3 -0
  2. README.md +5 -5
  3. app.py +287 -0
  4. requirements.txt +5 -0
1000_unigram.model ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:35a3a5130d52af7c3eb92cbf0c05bfed2f43c3204f3d17941a71cf8b46c84894
+ size 257888
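The file itself lives in Git LFS, so only the pointer metadata appears in the diff. The filename suggests a SentencePiece unigram tokenizer model (an assumption; the commit does not say what it is). If that holds, it could be inspected locally after pulling the LFS object, roughly like this:

```python
# Sketch only: assumes 1000_unigram.model is a SentencePiece unigram model
# pulled from LFS into the working directory.
import sentencepiece as spm

sp = spm.SentencePieceProcessor(model_file="1000_unigram.model")
print(sp.get_piece_size())                      # vocabulary size
print(sp.encode("добар ден", out_type=str))     # tokenize a sample Macedonian phrase
```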
README.md CHANGED
@@ -1,10 +1,10 @@
  ---
- title: Macedonian ASR Demo Compare
- emoji: 📈
- colorFrom: yellow
- colorTo: purple
+ title: Macedonian ASR Demo
+ emoji: 👁
+ colorFrom: purple
+ colorTo: yellow
  sdk: gradio
- sdk_version: 4.44.1
+ sdk_version: 4.41.0
  app_file: app.py
  pinned: false
  license: cc-by-4.0
app.py ADDED
@@ -0,0 +1,287 @@
+ import os
+ # os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
+
+ import gc
+ from functools import partial
+ import gradio as gr
+ import torch
+ from speechbrain.inference.interfaces import Pretrained, foreign_class
+ from transformers import T5Tokenizer, T5ForConditionalGeneration
+ import librosa
+ import whisper_timestamped as whisper
+ from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline, Wav2Vec2ForCTC
+
+
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+ torch.backends.cuda.matmul.allow_tf32 = True
+
+
+ def clean_up_memory():
+     gc.collect()
+     torch.cuda.empty_cache()
+
+
+ def recap_sentence(string):
+     # Restore capitalization and punctuation using the model
+     inputs = recap_tokenizer(["restore capitalization and punctuation: " + string], return_tensors="pt", padding=True).to(device)
+     outputs = recap_model.generate(**inputs, max_length=768, num_beams=5, early_stopping=True).squeeze(0)
+     recap_result = recap_tokenizer.decode(outputs, skip_special_tokens=True)
+     return recap_result
+
+
+ def return_prediction_w2v2(mic=None, file=None, device=device):
+     if mic is not None:
+         waveform, sr = librosa.load(mic, sr=16000)
+         waveform = waveform[:30*sr]
+         w2v2_result = w2v2_classifier.classify_file_w2v2(waveform, device)
+     elif file is not None:
+         waveform, sr = librosa.load(file, sr=16000)
+         waveform = waveform[:30*sr]
+         w2v2_result = w2v2_classifier.classify_file_w2v2(waveform, device)
+     else:
+         return "You must either provide a mic recording or a file"
+
+     recap_result = recap_sentence(w2v2_result[0])
+
+     # If the letter after punct is small, recap it
+     for i, letter in enumerate(recap_result):
+         if i > 1 and recap_result[i-2] in [".", "!", "?"] and letter.islower():
+             recap_result = recap_result[:i] + letter.upper() + recap_result[i+1:]
+
+     clean_up_memory()
+     return recap_result
+
+
+
+ def return_prediction_whisper(mic=None, file=None, device=device):
+     if mic is not None:
+         waveform, sr = librosa.load(mic, sr=16000)
+         waveform = waveform[:30*sr]
+         whisper_result = whisper_classifier.classify_file_whisper_mkd(waveform, device)
+     elif file is not None:
+         waveform, sr = librosa.load(file, sr=16000)
+         waveform = waveform[:30*sr]
+         whisper_result = whisper_classifier.classify_file_whisper_mkd(waveform, device)
+     else:
+         return "You must either provide a mic recording or a file"
+
+     recap_result = recap_sentence(whisper_result[0])
+
+     # If the letter after punct is small, recap it
+     for i, letter in enumerate(recap_result):
+         if i > 1 and recap_result[i-2] in [".", "!", "?"] and letter.islower():
+             recap_result = recap_result[:i] + letter.upper() + recap_result[i+1:]
+
+     clean_up_memory()
+     return recap_result
+
+
+ def return_prediction_compare(mic=None, file=None, device=device):
+     # pipe_whisper.model.to(device)
+     # mms_model.to(device)
+     if mic is not None:
+         waveform, sr = librosa.load(mic, sr=16000)
+         waveform = waveform[:30*sr]
+         whisper_mkd_result = whisper_classifier.classify_file_whisper_mkd(waveform, device)
+         # result_generator_w2v2 = w2v2_classifier.classify_file_w2v2(mic, device)
+         whisper_result = whisper_classifier.classify_file_whisper(waveform, pipe_whisper, device)
+         mms_result_generator = whisper_classifier.classify_file_mms(waveform, processor_mms, mms_model, device)
+
+     elif file is not None:
+         waveform, sr = librosa.load(file, sr=16000)
+         waveform = waveform[:30*sr]
+         whisper_mkd_result = whisper_classifier.classify_file_whisper_mkd(waveform, device)
+         # result_generator_w2v2 = w2v2_classifier.classify_file_w2v2(file, device)
+         whisper_result = whisper_classifier.classify_file_whisper(waveform, pipe_whisper, device)
+         mms_result_generator = whisper_classifier.classify_file_mms(waveform, processor_mms, mms_model, device)
+     else:
+         return "You must either provide a mic recording or a file"
+     # pipe_whisper.model.to("cpu")
+     # mms_model.to("cpu")
+
+     segment_results_whisper = ""
+     prev_segment_whisper = ""
+     # segment_results_w2v2 = ""
+     # prev_segment_w2v2 = ""
+     segment_results_mms = ""
+     prev_segment_mms = ""
+
+     recap_result_whisper_mkd = recap_sentence(whisper_mkd_result[0])
+     recap_result_whisper = recap_sentence(whisper_result[0])
+     recap_result_mms = recap_sentence(mms_result_generator[0])
+
+     # If the letter after punct is small, recap it
+     for i, letter in enumerate(recap_result_whisper_mkd):
+         if i > 1 and recap_result_whisper_mkd[i-2] in [".", "!", "?"] and letter.islower():
+             recap_result_whisper_mkd = recap_result_whisper_mkd[:i] + letter.upper() + recap_result_whisper_mkd[i+1:]
+
+     for i, letter in enumerate(recap_result_whisper):
+         if i > 1 and recap_result_whisper[i-2] in [".", "!", "?"] and letter.islower():
+             recap_result_whisper = recap_result_whisper[:i] + letter.upper() + recap_result_whisper[i+1:]
+
+     for i, letter in enumerate(recap_result_mms):
+         if i > 1 and recap_result_mms[i-2] in [".", "!", "?"] and letter.islower():
+             recap_result_mms = recap_result_mms[:i] + letter.upper() + recap_result_mms[i+1:]
+
+     clean_up_memory()
+     return "Буки-Whisper:\n" + recap_result_whisper_mkd + "\n\n" + "MMS:\n" + recap_result_mms + "\n\n" + "OpenAI Whisper:\n" + recap_result_whisper
+     # yield "Our W2v2: \n" + segment_results_w2v2 + "\n\n" + "MMS transcript:\n" + segment_results_mms
+
+
+
+ # Load Whisper model
+ model_id = "openai/whisper-large-v3"
+ whisper_model = AutoModelForSpeechSeq2Seq.from_pretrained(model_id, torch_dtype=torch.float16, low_cpu_mem_usage=True, use_safetensors=True, attn_implementation="sdpa")
+ whisper_model.to(device)
+ processor = AutoProcessor.from_pretrained(model_id)
+ pipe_whisper = pipeline(
+     "automatic-speech-recognition",
+     model=whisper_model,
+     tokenizer=processor.tokenizer,
+     feature_extractor=processor.feature_extractor,
+     torch_dtype=torch.float16,
+     return_timestamps=True,
+     device=device,
+ )
+
+
+ # Load MMS model
+ model_id = "facebook/mms-1b-all"
+ processor_mms = AutoProcessor.from_pretrained(model_id)
+ mms_model = Wav2Vec2ForCTC.from_pretrained(model_id)
+ mms_model = mms_model.to(device)
+ mms_model.eval()
+ processor_mms.tokenizer.set_target_lang("mkd")
+ mms_model.load_adapter("mkd")
+
+
+
+ # Create a partial function with the device pre-applied
+ return_prediction_whisper_with_device = partial(return_prediction_whisper, device=device)
+ return_prediction_w2v2_with_device = partial(return_prediction_w2v2, device=device)
+ return_prediction_with_device_compare = partial(return_prediction_compare, device=device)
+
+
+ # Load the ASR models
+ w2v2_classifier = foreign_class(source="Macedonian-ASR/wav2vec2-aed-macedonian-asr", pymodule_file="custom_interface_app.py", classname="ASR")
+ w2v2_classifier = w2v2_classifier.to(device)
+ w2v2_classifier.eval()
+ whisper_classifier = foreign_class(source="Macedonian-ASR/whisper-large-v3-macedonian-asr", pymodule_file="custom_interface_app.py", classname="ASR")
+ whisper_classifier = whisper_classifier.to(device)
+ whisper_classifier.eval()
+
+
+ # Load the T5 tokenizer and model for restoring capitalization
+ recap_model_name = "Macedonian-ASR/mt5-restore-capitalization-macedonian"
+ recap_tokenizer = T5Tokenizer.from_pretrained(recap_model_name)
+ recap_model = T5ForConditionalGeneration.from_pretrained(recap_model_name, torch_dtype=torch.float16)
+ recap_model.to(device)
+ recap_model.eval()
+
+
+ mic_transcribe_whisper = gr.Interface(
+     fn=return_prediction_whisper_with_device,
+     inputs=gr.Audio(sources="microphone", type="filepath"),
+     outputs=gr.Textbox(),
+     allow_flagging="never",
+     live=False,
+ )
+
+ # file_transcribe_whisper = gr.Interface(
+ #     fn=return_prediction_whisper_with_device,
+ #     inputs=gr.Audio(sources="upload", type="filepath"),
+ #     outputs=gr.Textbox(),
+ #     allow_flagging="never",
+ #     live=False
+ # )
+
+ mic_transcribe_w2v2 = gr.Interface(
+     fn=return_prediction_w2v2_with_device,
+     inputs=gr.Audio(sources="microphone", type="filepath"),
+     outputs=gr.Textbox(),
+     allow_flagging="never",
+     live=False,
+ )
+
+
+ # file_transcribe_w2v2 = gr.Interface(
+ #     fn=return_prediction_w2v2_with_device,
+ #     inputs=gr.Audio(sources="upload", type="filepath"),
+ #     outputs=gr.Textbox(),
+ #     allow_flagging="never",
+ #     live=False
+ # )
+
+ mic_transcribe_compare = gr.Interface(
+     fn=return_prediction_with_device_compare,
+     inputs=gr.Audio(sources="microphone", type="filepath"),
+     outputs=gr.Textbox(),
+     allow_flagging="never",
+     live=False,
+ )
+
+ # file_transcribe_compare = gr.Interface(
+ #     fn=return_prediction_with_device_compare,
+ #     inputs=gr.Audio(sources="upload", type="filepath"),
+ #     outputs=gr.Textbox(),
+ #     allow_flagging="never",
+ #     live=False
+ # )
+
+
+ project_description = '''
+ ## Автори:
+ 1. **Дејан Порјазовски**
+ 2. **Илина Јакимовска**
+ 3. **Ордан Чукалиев**
+ 4. **Никола Стиков**
+ Оваа колаборација е дел од активностите на **Центарот за напредни интердисциплинарни истражувања ([ЦеНИИс](https://ukim.edu.mk/en/centri/centar-za-napredni-interdisciplinarni-istrazhuvanja-ceniis))** при УКИМ.
+ ## Во тренирањето на овој модел се употребени податоци од:
+ 1. Дигитален архив за етнолошки и антрополошки ресурси (ДАЕАР) при Институтот за етнологија и антропологија, Природно-математички факултет при УКИМ.
+ 2. Аудио верзија на меѓународното списание „ЕтноАнтропоЗум“ на Институтот за етнологија и антропологија, Природно-математички факултет при УКИМ.
+ 3. Аудио подкастот „Обични луѓе“ на Илина Јакимовска
+ 4. Научните видеа од серијалот „Наука за деца“, фондација КАНТАРОТ
+ 5. Македонска верзија на Mozilla Common Voice (верзија 18.0)
+ '''
+
+ # Custom CSS
+ css = """
+ .gradio-container {
+     background-color: #f0f0f0; /* Set your desired background color */
+ }
+ .custom-markdown p, .custom-markdown li, .custom-markdown h2, .custom-markdown a {
+     font-size: 15px !important;
+     font-family: Arial, sans-serif !important;
+ }
+ .gradio-container {
+     background-color: #f3f3f3 !important;
+ }
+ """
+
+ transcriber_app = gr.Blocks(css=css, delete_cache=(60, 120))
+
+ with transcriber_app:
+     state = gr.State()
+     gr.Markdown(project_description, elem_classes="custom-markdown")
+
+     # gr.TabbedInterface(
+     #     [mic_transcribe_whisper, mic_transcribe_compare],
+     #     ["Буки-Whisper транскрипција", "Споредба на модели"],
+     # )
+     # state = gr.State(value=[], delete_callback=lambda v: print("STATE DELETED"))
+
+     gr.TabbedInterface(
+         [mic_transcribe_whisper, mic_transcribe_w2v2, mic_transcribe_compare],
+         ["Буки-Whisper транскрипција", "Буки-W2v2 транскрипција", "Споредба на модели"],
+     )
+     state = gr.State(value=[], delete_callback=lambda v: print("STATE DELETED"))
+
+     transcriber_app.unload(return_prediction_whisper)
+     transcriber_app.unload(return_prediction_compare)
+     transcriber_app.unload(return_prediction_w2v2)
+
+
+ # transcriber_app.launch(debug=True, share=True, ssl_verify=False)
+ if __name__ == "__main__":
+     transcriber_app.queue()
+     transcriber_app.launch(share=True)
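app.py repeats the same capitalize-after-punctuation pass five times, once per model output. Not part of the commit, but a small helper along these lines (hypothetical name `capitalize_after_punct`) would behave the same as the repeated loops, under the same assumption they make, namely that exactly one space follows the sentence-ending punctuation:

```python
def capitalize_after_punct(text: str) -> str:
    # Upper-case any lowercase letter that sits two characters after ".", "!" or "?",
    # i.e. right after "punctuation + space", mirroring the loops in app.py.
    chars = list(text)
    for i in range(2, len(chars)):
        if chars[i - 2] in ".!?" and chars[i].islower():
            chars[i] = chars[i].upper()
    return "".join(chars)

# Usage sketch: recap_result = capitalize_after_punct(recap_sentence(whisper_result[0]))
```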
requirements.txt ADDED
@@ -0,0 +1,5 @@
+ speechbrain
+ transformers
+ librosa
+ whisper_timestamped
+ accelerate
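gradio and torch are not pinned here; presumably gradio comes from the Space's SDK image (see sdk_version in README.md) and torch is pulled in transitively, e.g. by speechbrain, so a local run outside Spaces may need them installed explicitly. A minimal local smoke test, assuming the dependencies are installed, the Hub models can be downloaded, and a 16 kHz Macedonian recording named sample_mkd.wav (a hypothetical file) exists, might look like:

```python
# Hypothetical smoke test; importing app loads all models at module level,
# so this needs network access and enough memory for the checkpoints.
from app import return_prediction_whisper, return_prediction_compare

print(return_prediction_whisper(file="sample_mkd.wav"))   # single-model transcription
print(return_prediction_compare(file="sample_mkd.wav"))   # side-by-side model comparison
```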