Sajjo committed on
Commit f613c26 · verified · 1 Parent(s): 762fc8d

Update app.py

Files changed (1)
  1. app.py +427 -99
app.py CHANGED
@@ -1,112 +1,440 @@
- # import gradio as gr
- # import os
- # import wave
- # import tempfile
- # import numpy as np
-
- # # Global variables to store file and line index
- # file_index = 0
- # line_index = 0
- # lines = []
-
- # # Function to read lines from a file
- # def read_lines_from_file(file_path):
- #     global lines
- #     with open(file_path, 'r') as file:
- #         lines = file.readlines()
-
- # # Function to save audio to a WAV file
- # def save_audio_to_file(audio):
- #     global file_index, line_index, lines
-
- #     sample_rate, data = audio  # audio is a tuple (sample_rate, data)
-
- #     # Save the audio data as a WAV file in a temporary location
- #     with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp_file:
- #         with wave.open(tmp_file.name, 'wb') as wav_file:
- #             wav_file.setnchannels(1)  # Mono audio
- #             wav_file.setsampwidth(2)  # 2 bytes per sample (16-bit PCM)
- #             wav_file.setframerate(sample_rate)
- #             wav_file.writeframes(data.tobytes())
-
- #     # Return the path to the saved WAV file
- #     return tmp_file.name
-
- # # Gradio interface function
- # def audio_capture_interface():
- #     global file_index, line_index, lines
-
- #     # Initial file to read
- #     files = os.listdir('./audio_samples')
- #     read_lines_from_file(os.path.join('./audio_samples', files[file_index]))
-
- #     # Define the interface components
- #     audio_input = gr.Audio(source="microphone", type="numpy", label="Speak and click submit")
- #     output_text = gr.Textbox(label="Status", placeholder="Status will appear here")
-
- #     # Function to capture and process the audio input
- #     def process_audio(audio):
- #         global line_index, lines
-
- #         try:
- #             file_path = save_audio_to_file(audio)
- #             return f"Audio saved to {file_path}"
- #         except Exception as e:
- #             return f"Error saving audio: {str(e)}"
-
- #     # Function to handle navigation buttons
- #     def navigate_lines(button):
- #         global line_index, lines
-
- #         if button == 'forward':
- #             line_index = min(line_index + 1, len(lines) - 1)
- #         elif button == 'previous':
- #             line_index = max(line_index - 1, 0)
-
- #         output_text.value = lines[line_index]
-
- #     # Create the Gradio interface
- #     iface = gr.Interface(
- #         fn=process_audio,
- #         inputs=audio_input,
- #         outputs=output_text,
- #         live=True,
- #         title="Audio Capture and Playback",
- #         description="Speak into the microphone and click submit to save audio. Navigate through lines using buttons below.",
- #         theme="compact",
- #         layout="vertical",
- #         examples=[["Start recording audio."]]
- #     )
-
- #     # Add navigation buttons
- #     iface.add_button("Previous", lambda: navigate_lines('previous'))
- #     iface.add_button("Forward", lambda: navigate_lines('forward'))
-
- #     return iface
-
- # # Launch the interface
- # iface = audio_capture_interface()
- # iface.launch()
-
- import gradio as gr
-
-
- def calculator(num1, operation, num2):
-     if operation == "add":
-         return num1 + num2
-     elif operation == "subtract":
-         return num1 - num2
-     elif operation == "multiply":
-         return num1 * num2
-     elif operation == "divide":
-         return num1 / num2
-
-
- iface = gr.Interface(
-     calculator,
-     ["number", gr.Radio(["add", "subtract", "multiply", "divide"]), "number"],
-     "number",
-     allow_flagging="manual"
- )
-
- iface.launch()
+ import warnings
+ import gradio as gr
+ from transformers import pipeline
+ from transformers import AutoProcessor
+ from pyctcdecode import build_ctcdecoder
+ from transformers import Wav2Vec2ProcessorWithLM
+
+ import os
+ import re
+ #import torchaudio
+
+ # Initialize the speech recognition pipelines and transliterator
+ p1 = pipeline(task="automatic-speech-recognition", model="cdactvm/w2v-bert-odia_v1")
+ p2 = pipeline(task="automatic-speech-recognition", model="cdactvm/w2v-bert-2.0-hindi_v1")
+ #p3 = pipeline(task="automatic-speech-recognition", model="cdactvm/kannada_w2v-bert_model")
+ #p4 = pipeline(task="automatic-speech-recognition", model="cdactvm/telugu_w2v-bert_model")
+ #p5 = pipeline(task="automatic-speech-recognition", model="Sajjo/w2v-bert-2.0-bangala-gpu-CV16.0_v2")
+ #p6 = pipeline(task="automatic-speech-recognition", model="cdactvm/hf-open-assames")
+ p7 = pipeline(task="automatic-speech-recognition", model="cdactvm/w2v-assames")
+
+ # Wrap the Assamese model with a KenLM-boosted CTC decoder
+ processor = AutoProcessor.from_pretrained("cdactvm/w2v-assames")
+ vocab_dict = processor.tokenizer.get_vocab()
+ sorted_vocab_dict = {k.lower(): v for k, v in sorted(vocab_dict.items(), key=lambda item: item[1])}
+ decoder = build_ctcdecoder(
+     labels=list(sorted_vocab_dict.keys()),
+     kenlm_model_path="lm.binary",
+ )
+ processor_with_lm = Wav2Vec2ProcessorWithLM(
+     feature_extractor=processor.feature_extractor,
+     tokenizer=processor.tokenizer,
+     decoder=decoder
+ )
+ processor.feature_extractor._processor_class = "Wav2Vec2ProcessorWithLM"
+ p8 = pipeline(
+     "automatic-speech-recognition",
+     model="cdactvm/w2v-assames",
+     tokenizer=processor_with_lm.tokenizer,  # pass the tokenizer itself, not the whole processor
+     feature_extractor=processor_with_lm.feature_extractor,
+     decoder=processor_with_lm.decoder,
+ )
+
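For a quick check that the KenLM wiring took effect, the two Assamese pipelines can be run side by side. A minimal sketch, not part of app.py, assuming the pipelines above are loaded and a 16 kHz mono recording exists at the hypothetical path sample.wav:

# Compare greedy CTC decoding (p7) with KenLM beam-search decoding (p8).
# "sample.wav" is a hypothetical local recording.
greedy_text = p7("sample.wav")["text"]
lm_text = p8("sample.wav")["text"]
print("greedy :", greedy_text)
print("with LM:", lm_text)

The two outputs should differ mainly on acoustically ambiguous words, which is what the lm.binary rescoring is for.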
+ # Install the indic-trans transliteration library at startup
+ os.system('git clone https://github.com/irshadbhat/indic-trans.git')
+ os.system('pip install ./indic-trans/.')
+
+ #HF_TOKEN = os.getenv('HF_TOKEN')
+ #hf_writer = gr.HuggingFaceDatasetSaver(HF_TOKEN, "asr_demo")
+
+ from indictrans import Transliterator
+ trn = Transliterator(source='ori', target='eng', build_lookup=True)
+ def transcribe_odiya(speech):
+     text = p1(speech)["text"]
+     if text is None:
+         return "Error: ASR returned None"
+     return text
+
+
+ def cleanhtml(raw_html):
+     cleantext = re.sub(r'<.*?>', '', raw_html)
+     return cleantext
+
+ def transcribe_hindi(speech):
+     text = p2(speech)["text"]
+     if text is None:
+         return "Error: ASR returned None"
+     return text
+
+ # NOTE: the p3/p4/p5 pipelines are commented out above; uncomment them before
+ # selecting Kannada, Telugu or Bangala, otherwise these handlers raise NameError.
+ def transcribe_kannada(speech):
+     text = p3(speech)["text"]
+     if text is None:
+         return "Error: ASR returned None"
+     return text
+
+ def transcribe_telugu(speech):
+     text = p4(speech)["text"]
+     if text is None:
+         return "Error: ASR returned None"
+     return text
+
+ def transcribe_bangala(speech):
+     text = p5(speech)["text"]
+     if text is None:
+         return "Error: ASR returned None"
+     return text
+
+ def transcribe_assamese_LM(speech):
+     text = p8(speech)["text"]
+     if text is None:
+         return "Error: ASR returned None"
+     text = cleanhtml(text)  # check for None before cleaning, not after
+     return text
+
+ def transcribe_assamese_model2(speech):
+     text = p7(speech)["text"]
+     if text is None:
+         return "Error: ASR returned None"
+     text = cleanhtml(text)
+     return text
+
+ def transcribe_odiya_eng(speech):
+     trn = Transliterator(source='ori', target='eng', build_lookup=True)
+     text = p1(speech)["text"]
+     if text is None:
+         return "Error: ASR returned None"
+     sentence = trn.transform(text)
+     if sentence is None:
+         return "Error: Transliteration returned None"
+     replaced_words = replace_words(sentence)
+     processed_sentence = process_doubles(replaced_words)
+     return process_transcription(processed_sentence)
+
+ def transcribe_ban_eng(speech):
+     trn = Transliterator(source='ben', target='eng', build_lookup=True)
+     text = p5(speech)["text"]
+     if text is None:
+         return "Error: ASR returned None"
+     sentence = trn.transform(text)
+     if sentence is None:
+         return "Error: Transliteration returned None"
+     replaced_words = replace_words(sentence)
+     processed_sentence = process_doubles(replaced_words)
+     return process_transcription(processed_sentence)
+
+ def transcribe_hin_eng(speech):
+     trn = Transliterator(source='hin', target='eng', build_lookup=True)
+     text = p2(speech)["text"]
+     if text is None:
+         return "Error: ASR returned None"
+     sentence = trn.transform(text)
+     if sentence is None:
+         return "Error: Transliteration returned None"
+     replaced_words = replace_words(sentence)
+     processed_sentence = process_doubles(replaced_words)
+     return process_transcription(processed_sentence)
+
+ def transcribe_kan_eng(speech):
+     trn = Transliterator(source='kan', target='eng', build_lookup=True)
+     text = p3(speech)["text"]
+     if text is None:
+         return "Error: ASR returned None"
+     sentence = trn.transform(text)
+     if sentence is None:
+         return "Error: Transliteration returned None"
+     replaced_words = replace_words(sentence)
+     processed_sentence = process_doubles(replaced_words)
+     return process_transcription(processed_sentence)
+
+ def transcribe_tel_eng(speech):
+     trn = Transliterator(source='tel', target='eng', build_lookup=True)
+     text = p4(speech)["text"]
+     if text is None:
+         return "Error: ASR returned None"
+     sentence = trn.transform(text)
+     if sentence is None:
+         return "Error: Transliteration returned None"
+     replaced_words = replace_words(sentence)
+     processed_sentence = process_doubles(replaced_words)
+     return process_transcription(processed_sentence)
+
+
+ def process_transcription(input_sentence):
+     word_to_code_map = {}
+     code_to_word_map = {}
+
+     transcript_1 = sentence_to_transcript(input_sentence, word_to_code_map)
+     if transcript_1 is None:
+         return "Error: Transcript conversion returned None"
+
+     numbers = text2int(transcript_1)
+     if numbers is None:
+         return "Error: Text to number conversion returned None"
+
+     code_to_word_map = {v: k for k, v in word_to_code_map.items()}
+     text = transcript_to_sentence(numbers, code_to_word_map)
+     return text
+
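Every *-trans handler funnels its transliterated text through the same three-step chain. A worked trace, not part of app.py, using the helpers defined further down in this file:

# replace_words fixes common ASR misspellings, process_doubles expands "double",
# and process_transcription folds digit words into a number via Soundex codes.
s = replace_words("jiro dubal seven")   # -> "zero double seven"
s = process_doubles(s)                  # -> "zero seven seven"
print(process_transcription(s))         # -> "077"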
+ def sel_lng(lng, mic=None, file=None):
+     if mic is not None:
+         audio = mic
+     elif file is not None:
+         audio = file
+     else:
+         return "You must either provide a mic recording or a file"
+
+     if lng == "Odiya":
+         return transcribe_odiya(audio)
+     elif lng == "Odiya-trans":
+         return transcribe_odiya_eng(audio)
+     elif lng == "Hindi-trans":
+         return transcribe_hin_eng(audio)
+     elif lng == "Hindi":
+         return transcribe_hindi(audio)
+     elif lng == "Kannada-trans":
+         return transcribe_kan_eng(audio)
+     elif lng == "Kannada":
+         return transcribe_kannada(audio)
+     elif lng == "Telugu-trans":
+         return transcribe_tel_eng(audio)
+     elif lng == "Telugu":
+         return transcribe_telugu(audio)
+     elif lng == "Bangala-trans":
+         return transcribe_ban_eng(audio)
+     elif lng == "Bangala":
+         return transcribe_bangala(audio)
+     elif lng == "Assamese-LM":
+         return transcribe_assamese_LM(audio)
+     elif lng == "Assamese-Model2":
+         return transcribe_assamese_model2(audio)
+
+ # Function to replace incorrectly spelled words
+ def replace_words(sentence):
+     replacements = [
+         (r'\bjiro\b', 'zero'), (r'\bjero\b', 'zero'), (r'\bnn\b', 'one'),
+         (r'\bn\b', 'one'), (r'\bna\b', 'one'), (r'\btu\b', 'two'),
+         (r'\btoo\b', 'two'), (r'\bthiri\b', 'three'), (r'\bfor\b', 'four'),
+         (r'\bfore\b', 'four'), (r'\bfib\b', 'five'), (r'\bdublseven\b', 'double seven'),
+         (r'\bdubalathri\b', 'double three'), (r'\bnineeit\b', 'nine eight'),
+         (r'\bfipeit\b', 'five eight'), (r'\bdubal\b', 'double'), (r'\bsevenatu\b', 'seven two'),
+     ]
+     for pattern, replacement in replacements:
+         sentence = re.sub(pattern, replacement, sentence)
+     return sentence
+
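The \b anchors keep the substitutions from firing inside longer words. A small check, not part of app.py:

print(replace_words("jiro fore fib"))   # -> "zero four five"
print(replace_words("before"))          # -> "before" (no partial match on "fore")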
+ # Function to process "double" followed by a number
+ def process_doubles(sentence):
+     tokens = sentence.split()
+     result = []
+     i = 0
+     while i < len(tokens):
+         if tokens[i] in ("double", "dubal"):
+             if i + 1 < len(tokens):
+                 result.append(tokens[i + 1])
+                 result.append(tokens[i + 1])
+                 i += 2
+             else:
+                 result.append(tokens[i])
+                 i += 1
+         else:
+             result.append(tokens[i])
+             i += 1
+     return ' '.join(result)
+
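A short check of the edge cases, not part of app.py: "double" duplicates the token that follows it and is kept verbatim when nothing follows:

print(process_doubles("double seven three"))   # -> "seven seven three"
print(process_doubles("nine double"))          # -> "nine double"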
+ # Function to generate Soundex code for a word
+ def soundex(word):
+     word = word.upper()
+     word = ''.join(filter(str.isalpha, word))
+     if not word:
+         return None
+     soundex_mapping = {
+         'B': '1', 'F': '1', 'P': '1', 'V': '1',
+         'C': '2', 'G': '2', 'J': '2', 'K': '2', 'Q': '2', 'S': '2', 'X': '2', 'Z': '2',
+         'D': '3', 'T': '3', 'L': '4', 'M': '5', 'N': '5', 'R': '6'
+     }
+     soundex_code = word[0]
+     for char in word[1:]:
+         if char not in ('H', 'W'):
+             soundex_code += soundex_mapping.get(char, '0')
+     soundex_code = soundex_code[0] + ''.join(c for i, c in enumerate(soundex_code[1:]) if c != soundex_code[i])
+     soundex_code = soundex_code.replace('0', '') + '000'
+     return soundex_code[:4]
+
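The codes this produces for the English digit words line up with the entries of the units list that text2int matches against below. A quick check, not part of app.py:

for w in ["zero", "one", "two", "seven"]:
    print(w, soundex(w))   # Z600 O500 T000 S150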
+ # Helper for text2int: check whether a token parses as a number
+ def is_number(x):
+     if isinstance(x, str):
+         x = x.replace(',', '')
+     try:
+         float(x)
+     except ValueError:
+         return False
+     return True
+
+ # Convert a transcript of Soundex codes to digits. The units/tens/scales lists
+ # hold the Soundex codes of the corresponding English number words.
+ def text2int(textnum, numwords={}):
+     units = ['Z600', 'O500', 'T000', 'T600', 'F600', 'F100', 'S220', 'S150', 'E300', 'N500',
+              'T500', 'E415', 'T410', 'T635', 'F635', 'F135', 'S235', 'S153', 'E235', 'N535']
+     tens = ['', '', 'T537', 'T637', 'F637', 'F137', 'S230', 'S153', 'E230', 'N530']
+     scales = ['H536', 'T253', 'M450', 'C600']
+     # (Only hit for literal ordinal words, which Soundex-coded input never contains.)
+     ordinal_words = {'oh': 'Z600', 'first': 'O500', 'second': 'T000', 'third': 'T600', 'fourth': 'F600', 'fifth': 'F100',
+                      'sixth': 'S200', 'seventh': 'S150', 'eighth': 'E230', 'ninth': 'N500', 'twelfth': 'T410'}
+     ordinal_endings = [('ieth', 'y'), ('th', '')]
+     if not numwords:
+         numwords['and'] = (1, 0)
+         for idx, word in enumerate(units): numwords[word] = (1, idx)
+         for idx, word in enumerate(tens): numwords[word] = (1, idx * 10)
+         for idx, word in enumerate(scales): numwords[word] = (10 ** (idx * 3 or 2), 0)
+
+     textnum = textnum.replace('-', ' ')
+
+     current = result = 0
+     curstring = ''
+     onnumber = False
+     lastunit = False
+     lastscale = False
+
+     def is_numword(x):
+         if is_number(x):
+             return True
+         if x in numwords:  # was: "word in numwords", which looked up the loop variable
+             return True
+         return False
+
+     def from_numword(x):
+         if is_number(x):
+             scale = 0
+             increment = int(x.replace(',', ''))
+             return scale, increment
+         return numwords[x]
+
+     for word in textnum.split():
+         if word in ordinal_words:
+             scale, increment = (1, ordinal_words[word])
+             current = current * scale + increment
+             if scale > 100:
+                 result += current
+                 current = 0
+             onnumber = True
+             lastunit = False
+             lastscale = False
+         else:
+             for ending, replacement in ordinal_endings:
+                 if word.endswith(ending):
+                     word = "%s%s" % (word[:-len(ending)], replacement)
+
+             if (not is_numword(word)) or (word == 'and' and not lastscale):
+                 if onnumber:
+                     curstring += repr(result + current) + " "
+                 curstring += word + " "
+                 result = current = 0
+                 onnumber = False
+                 lastunit = False
+                 lastscale = False
+             else:
+                 scale, increment = from_numword(word)
+                 onnumber = True
+
+                 if lastunit and (word not in scales):
+                     curstring += repr(result + current)
+                     result = current = 0
+
+                 if scale > 1:
+                     current = max(1, current)
+
+                 current = current * scale + increment
+                 if scale > 100:
+                     result += current
+                     current = 0
+
+                 lastscale = False
+                 lastunit = False
+                 if word in scales:
+                     lastscale = True
+                 elif word in units:
+                     lastunit = True
+
+     if onnumber:
+         curstring += repr(result + current)
+
+     return curstring
+
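Because text2int consumes Soundex codes rather than English number words, a run of unit codes concatenates into a digit string. A small trace, not part of app.py:

# The numwords default dict is populated on first call and reused afterwards.
print(text2int("S150 S150 Z600"))   # -> "770" (seven seven zero)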
+ # Convert sentence to transcript using Soundex
+ def sentence_to_transcript(sentence, word_to_code_map):
+     words = sentence.split()
+     transcript_codes = []
+
+     for word in words:
+         if word not in word_to_code_map:
+             word_to_code_map[word] = soundex(word)
+         transcript_codes.append(word_to_code_map[word])
+
+     transcript = ' '.join(transcript_codes)
+     return transcript
+
+ # Convert transcript back to sentence using mapping
+ def transcript_to_sentence(transcript, code_to_word_map):
+     codes = transcript.split()
+     sentence_words = []
+
+     for code in codes:
+         sentence_words.append(code_to_word_map.get(code, code))
+
+     sentence = ' '.join(sentence_words)
+     return sentence
+
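Round-tripping through these two helpers leaves ordinary words untouched (they come back through code_to_word_map) while digit words collapse into numerals. A sketch, not part of app.py:

word_map = {}
codes = sentence_to_transcript("call me at nine", word_map)   # "C400 M000 A300 N500"
digits = text2int(codes)                                      # "C400 M000 A300 9"
print(transcript_to_sentence(digits, {v: k for k, v in word_map.items()}))
# -> "call me at 9"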
+ # # Process the audio file
+ # transcript = pipe("./odia_recorded/AUD-20240614-WA0004.wav")
+ # text_value = transcript['text']
+ # sentence = trn.transform(text_value)
+ # replaced_words = replace_words(sentence)
+ # processed_sentence = process_doubles(replaced_words)
+
+ # input_sentence_1 = processed_sentence
+
+ # Create empty mappings
+ word_to_code_map = {}
+ code_to_word_map = {}
+
+ # Convert sentence to transcript
+ # transcript_1 = sentence_to_transcript(input_sentence_1, word_to_code_map)
+
+ # Convert transcript to numerical representation
+ # numbers = text2int(transcript_1)
+
+ # Create reverse mapping
+ code_to_word_map = {v: k for k, v in word_to_code_map.items()}
+
+ # Convert transcript back to sentence
+ # reconstructed_sentence_1 = transcript_to_sentence(numbers, code_to_word_map)
+
+ # demo = gr.Interface(
+ #     fn=sel_lng,
+ #     inputs=[
+ #         gr.Dropdown(["Hindi","Hindi-trans","Odiya","Odiya-trans"],value="Hindi",label="Select Language"),
+ #         gr.Audio(source="microphone", type="filepath"),
+ #         gr.Audio(source="upload", type="filepath"),
+ #         #gr.Audio(sources="upload", type="filepath"),
+ #         #"state"
+ #     ],
+ #     outputs=[
+ #         "textbox"
+ #         # #"state"
+ #     ],
+ #     title="Automatic Speech Recognition",
+ #     description="Demo for Automatic Speech Recognition. Use microphone to record speech. Please press Record button. Initially it will take some time to load the model. The recognized text will appear in the output textbox",
+ # ).launch()
+
+ ######################################################
+ demo = gr.Interface(
+     fn=sel_lng,
+     inputs=[
+         #gr.Dropdown(["Hindi","Hindi-trans","Odiya","Odiya-trans","Kannada","Kannada-trans","Telugu","Telugu-trans","Bangala","Bangala-trans"],value="Hindi",label="Select Language"),
+         gr.Dropdown(["Hindi","Hindi-trans","Odiya","Odiya-trans","Assamese-LM","Assamese-Model2"], value="Hindi", label="Select Language"),
+         gr.Audio(sources=["microphone","upload"], type="filepath"),
+         #gr.Audio(sources="upload", type="filepath"),
+         #"state"
+     ],
+     outputs=[
+         "textbox"
+         # #"state"
+     ],
+     allow_flagging="auto",
+     #flagging_options=["Language error", "English transliteration error", "Other"],
+     #flagging_callback=hf_writer,
+     title="Automatic Speech Recognition",
+     description="Demo for Automatic Speech Recognition. Use the microphone to record speech and press the Record button. The model may take some time to load initially; the recognized text will appear in the output textbox.",
+ ).launch()
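Since the dropdown string is passed straight to sel_lng, the handler can also be exercised without the UI. A minimal sketch, not part of app.py, assuming a recording at the hypothetical path sample.wav:

print(sel_lng("Hindi", mic=None, file="sample.wav"))        # raw Hindi transcript
print(sel_lng("Hindi-trans", mic=None, file="sample.wav"))  # romanized, digits normalized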