cdactvm committed
Commit
f0dfb53
1 Parent(s): 297d33f

Update app.py

Files changed (1):
  1. app.py (+34 -681)
app.py CHANGED
@@ -1,73 +1,3 @@
- # import warnings
- # warnings.filterwarnings("ignore")
- # import os
- # import re
- # import gradio as gr
- # import numpy as np
- # import torchaudio
- # import nbimporter
- # from transformers import pipeline
- # from transformers import AutoProcessor
- # from pyctcdecode import build_ctcdecoder
- # from transformers import Wav2Vec2ProcessorWithLM
- # from text2int import text_to_int
- # from isNumber import is_number
- # from Text2List import text_to_list
- # from convert2list import convert_to_list
- # from processDoubles import process_doubles
- # from replaceWords import replace_words
-
- # # transcriber = pipeline(task="automatic-speech-recognition", model="cdactvm/w2v-bert-2.0-hindi_v1")
- # # processor = AutoProcessor.from_pretrained("cdactvm/w2v-bert-2.0-hindi_v1")
-
- # # vocab_dict = processor.tokenizer.get_vocab()
-
- # # sorted_vocab_dict = {k.lower(): v for k, v in sorted(vocab_dict.items(), key=lambda item: item[1])}
- # # decoder = build_ctcdecoder(
- # #     labels=list(sorted_vocab_dict.keys()),
- # #     kenlm_model_path="lm.binary",
- # # )
- # # processor_with_lm = Wav2Vec2ProcessorWithLM(
- # #     feature_extractor=processor.feature_extractor,
- # #     tokenizer=processor.tokenizer,
- # #     decoder=decoder
- # # )
- # # processor.feature_extractor._processor_class = "Wav2Vec2ProcessorWithLM"
-
-
- # def transcribe(audio):
- #     # # Process the audio file
- #     transcript = transcriber(audio)
- #     text_value = transcript['text']
- #     print(text_value)
- #     processd_doubles = process_doubles(text_value)
- #     converted_to_list = convert_to_list(processd_doubles, text_to_list())
- #     replaced_words = replace_words(converted_to_list)
- #     converted_text = text_to_int(replaced_words)
- #     return converted_text
-
-
- # # demo = gr.Interface(
- # #     transcribe,
- # #     gr.Audio(sources="microphone", type="filepath"),
- # #     "text",
- # # )
-
- # # demo.launch()
-
- # demo = gr.Interface(
- #     transcribe,
- #     inputs=[
- #         gr.Audio(sources=["microphone","upload"], type="filepath"),
- #     ],
- #     outputs=[
- #         "textbox"
- #     ],
- #     title="Automatic Speech Recognition",
- #     description = "Demo for Automatic Speech Recognition. Use microphone to record speech. Please press Record button. Initially it will take some time to load the model. The recognized text will appear in the output textbox",
- # ).launch()
-
- ###############################################################
  import warnings
  warnings.filterwarnings("ignore")
  import os
@@ -87,629 +17,52 @@ from convert2list import convert_to_list
  from processDoubles import process_doubles
  from replaceWords import replace_words
 
- hindi_model = pipeline(task="automatic-speech-recognition", model="cdactvm/w2v-bert-2.0-hindi_v1")
- import warnings
- import gradio as gr
- from transformers import pipeline
- from transformers import AutoProcessor
- from pyctcdecode import build_ctcdecoder
- from transformers import Wav2Vec2ProcessorWithLM
-
- import os
- import re
- # import torchaudio
-
- # Initialize the speech recognition pipeline and transliterator
- # odia_model1 = pipeline(task="automatic-speech-recognition", model="cdactvm/w2v-bert-odia_v1")
- # odia_model2 = pipeline(task="automatic-speech-recognition", model="cdactvm/w2v-bert-odia_v2")
- # p2 = pipeline(task="automatic-speech-recognition", model="cdactvm/w2v-bert-2.0-hindi_v1")
- # punjaib_modle_30000 = pipeline(task="automatic-speech-recognition", model="cdactvm/wav2vec-bert-punjabi-30000-model")
- # punjaib_modle_155750 = pipeline(task="automatic-speech-recognition", model="cdactvm/wav2vec-bert-punjabi-155750-model")
- # punjaib_modle_70000_aug = pipeline(task="automatic-speech-recognition", model="cdactvm/w2v-bert-model-30000-augmented")
- # p3 = pipeline(task="automatic-speech-recognition", model="cdactvm/kannada_w2v-bert_model")
- # p4 = pipeline(task="automatic-speech-recognition", model="cdactvm/telugu_w2v-bert_model")
- # p5 = pipeline(task="automatic-speech-recognition", model="Sajjo/w2v-bert-2.0-bangala-gpu-CV16.0_v2")
- # p6 = pipeline(task="automatic-speech-recognition", model="cdactvm/hf-open-assames")
- # p7 = pipeline(task="automatic-speech-recognition", model="cdactvm/w2v-assames")
- processor = AutoProcessor.from_pretrained("cdactvm/w2v-bert-odia_v2")
- vocab_dict = processor.tokenizer.get_vocab()
- sorted_vocab_dict = {k.lower(): v for k, v in sorted(vocab_dict.items(), key=lambda item: item[1])}
- decoder = build_ctcdecoder(
-     labels=list(sorted_vocab_dict.keys()),
-     kenlm_model_path="lm.binary",
- )
- processor_with_lm = Wav2Vec2ProcessorWithLM(
-     feature_extractor=processor.feature_extractor,
-     tokenizer=processor.tokenizer,
-     decoder=decoder
- )
- processor.feature_extractor._processor_class = "Wav2Vec2ProcessorWithLM"
- # p8 = pipeline("automatic-speech-recognition", model="cdactvm/w2v-assames", tokenizer=processor_with_lm, feature_extractor=processor_with_lm.feature_extractor, decoder=processor_with_lm.decoder)
-
-
- os.system('git clone https://github.com/irshadbhat/indic-trans.git')
- os.system('pip install ./indic-trans/.')
-
- # HF_TOKEN = os.getenv('HF_TOKEN')
- # hf_writer = gr.HuggingFaceDatasetSaver(HF_TOKEN, "asr_demo")
-
- from indictrans import Transliterator
-
- ###########################################
-
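The removed block above is the standard pyctcdecode + KenLM wiring: the tokenizer vocabulary is sorted by token id, lowercased, and fused with lm.binary during beam search. A minimal sketch of exercising the decoder it builds; the logits here are random placeholders, not real model output:

import numpy as np

# Placeholder CTC output of shape (time_steps, vocab_size); a real run would
# take these from the acoustic model's forward pass.
fake_logits = np.random.randn(200, len(sorted_vocab_dict)).astype(np.float32)

# pyctcdecode beam-searches over the logits, rescoring hypotheses with KenLM.
print(decoder.decode(fake_logits))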
- # Function to replace incorrectly spelled words
- def replace_words(sentence):
-     replacements = [
-         (r'\bjiro\b', 'zero'), (r'\bjero\b', 'zero'),
-         (r'\bnn\b', 'one'), (r'\bn\b', 'one'), (r'\bvan\b', 'one'), (r'\bna\b', 'one'), (r'\bnn\b', 'one'), (r'\bek\b', 'one'),
-         (r'\btu\b', 'two'), (r'\btoo\b', 'two'), (r'\bdo\b', 'two'),
-         (r'\bthiri\b', 'three'), (r'\btiri\b', 'three'), (r'\bdubalathri\b', 'double three'), (r'\btin\b', 'three'),
-         (r'\bfor\b', 'four'), (r'\bfore\b', 'four'),
-         (r'\bfib\b', 'five'), (r'\bpaanch\b', 'five'),
-         (r'\bchha\b', 'six'), (r'\bchhah\b', 'six'), (r'\bchau\b', 'six'),
-         (r'\bdublseven\b', 'double seven'), (r'\bsath\b', 'seven'),
-         (r'\baath\b', 'eight'),
-         (r'\bnau\b', 'nine'),
-         (r'\bdas\b', 'ten'),
-         (r'\bnineeit\b', 'nine eight'),
-         (r'\bfipeit\b', 'five eight'), (r'\bdubal\b', 'double'), (r'\bsevenatu\b', 'seven two'),
-     ]
-     for pattern, replacement in replacements:
-         sentence = re.sub(pattern, replacement, sentence)
-     return sentence
-
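A quick illustration of the rule table on hypothetical ASR output (each regex rewrites one misrecognised token, applied left to right over the whole sentence):

print(replace_words("jiro van tu dubal thiri"))  # -> "zero one two double three"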
- # Function to process "double" followed by a number
- def process_doubles(sentence):
-     tokens = sentence.split()
-     result = []
-     i = 0
-     while i < len(tokens):
-         if tokens[i] in ("double", "dubal"):
-             if i + 1 < len(tokens):
-                 result.append(tokens[i + 1])
-                 result.append(tokens[i + 1])
-                 i += 2
-             else:
-                 result.append(tokens[i])
-                 i += 1
-         else:
-             result.append(tokens[i])
-             i += 1
-     return ' '.join(result)
-
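Worked examples of the intended behaviour: "double" (or "dubal") duplicates the token that follows it, while a trailing "double" with nothing after it is kept as-is:

print(process_doubles("double seven five"))  # -> "seven seven five"
print(process_doubles("five double"))        # -> "five double"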
- # Function to generate the Soundex code for a word
- def soundex(word):
-     word = word.upper()
-     word = ''.join(filter(str.isalpha, word))
-     if not word:
-         return None
-     soundex_mapping = {
-         'B': '1', 'F': '1', 'P': '1', 'V': '1',
-         'C': '2', 'G': '2', 'J': '2', 'K': '2', 'Q': '2', 'S': '2', 'X': '2', 'Z': '2',
-         'D': '3', 'T': '3', 'L': '4', 'M': '5', 'N': '5', 'R': '6'
-     }
-     soundex_code = word[0]
-     for char in word[1:]:
-         if char not in ('H', 'W'):
-             soundex_code += soundex_mapping.get(char, '0')
-     soundex_code = soundex_code[0] + ''.join(c for i, c in enumerate(soundex_code[1:]) if c != soundex_code[i])
-     soundex_code = soundex_code.replace('0', '') + '000'
-     return soundex_code[:4]
-
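The numeric vocabulary used by text2int below is expressed in these codes; a few hand-checked values from this implementation:

print(soundex("zero"))   # -> "Z600"
print(soundex("one"))    # -> "O500"
print(soundex("seven"))  # -> "S150"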
- # Check whether a token is already numeric
- def is_number(x):
-     if type(x) == str:
-         x = x.replace(',', '')
-     try:
-         float(x)
-     except:
-         return False
-     return True
-
- # Convert a Soundex-coded transcript to its numerical representation
- def text2int(textnum, numwords={}):
-     units = ['Z600', 'O500', 'T000', 'T600', 'F600', 'F100', 'S220', 'S150', 'E300', 'N500',
-              'T500', 'E415', 'T410', 'T635', 'F635', 'F135', 'S235', 'S153', 'E235', 'N535']
-     tens = ['', '', 'T537', 'T637', 'F637', 'F137', 'S230', 'S153', 'E230', 'N530']
-     scales = ['H536', 'T253', 'M450', 'C600']
-     ordinal_words = {'oh': 'Z600', 'first': 'O500', 'second': 'T000', 'third': 'T600', 'fourth': 'F600', 'fifth': 'F100',
-                      'sixth': 'S200', 'seventh': 'S150', 'eighth': 'E230', 'ninth': 'N500', 'twelfth': 'T410'}
-     ordinal_endings = [('ieth', 'y'), ('th', '')]
-     if not numwords:
-         numwords['and'] = (1, 0)
-         for idx, word in enumerate(units): numwords[word] = (1, idx)
-         for idx, word in enumerate(tens): numwords[word] = (1, idx * 10)
-         for idx, word in enumerate(scales): numwords[word] = (10 ** (idx * 3 or 2), 0)
-
-     textnum = textnum.replace('-', ' ')
-
-     current = result = 0
-     curstring = ''
-     onnumber = False
-     lastunit = False
-     lastscale = False
-
-     def is_numword(x):
-         if is_number(x):
-             return True
-         if x in numwords:
-             return True
-         return False
-
-     def from_numword(x):
-         if is_number(x):
-             scale = 0
-             increment = int(x.replace(',', ''))
-             return scale, increment
-         return numwords[x]
-
-     for word in textnum.split():
-         if word in ordinal_words:
-             scale, increment = (1, ordinal_words[word])
-             current = current * scale + increment
-             if scale > 100:
-                 result += current
-                 current = 0
-             onnumber = True
-             lastunit = False
-             lastscale = False
-         else:
-             for ending, replacement in ordinal_endings:
-                 if word.endswith(ending):
-                     word = "%s%s" % (word[:-len(ending)], replacement)
-
-             if (not is_numword(word)) or (word == 'and' and not lastscale):
-                 if onnumber:
-                     curstring += repr(result + current) + " "
-                 curstring += word + " "
-                 result = current = 0
-                 onnumber = False
-                 lastunit = False
-                 lastscale = False
-             else:
-                 scale, increment = from_numword(word)
-                 onnumber = True
-
-                 if lastunit and (word not in scales):
-                     curstring += repr(result + current)
-                     result = current = 0
-
-                 if scale > 1:
-                     current = max(1, current)
-
-                 current = current * scale + increment
-                 if scale > 100:
-                     result += current
-                     current = 0
-
-                 lastscale = False
-                 lastunit = False
-                 if word in scales:
-                     lastscale = True
-                 elif word in units:
-                     lastunit = True
-
-     if onnumber:
-         curstring += repr(result + current)
-
-     return curstring
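Since units holds the Soundex codes of "zero" through "nineteen", adjacent unit codes are flushed back to back, so a spelled-out digit sequence collapses into a single digit string. A hand-traced check:

# Codes for "one two five" concatenate under the unit-flush rule:
print(text2int("O500 T000 F100"))  # -> "125"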
-
- # Convert sentence to transcript using Soundex
- def sentence_to_transcript(sentence, word_to_code_map):
-     words = sentence.split()
-     transcript_codes = []
-
-     for word in words:
-         if word not in word_to_code_map:
-             word_to_code_map[word] = soundex(word)
-         transcript_codes.append(word_to_code_map[word])
-
-     transcript = ' '.join(transcript_codes)
-     return transcript
-
- # Convert transcript back to sentence using mapping
- def transcript_to_sentence(transcript, code_to_word_map):
-     codes = transcript.split()
-     sentence_words = []
-
-     for code in codes:
-         sentence_words.append(code_to_word_map.get(code, code))
-
-     sentence = ' '.join(sentence_words)
-     return sentence
-
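The two helpers round-trip through the Soundex domain: words are encoded, text2int rewrites numeric codes to digits, and the reverse map restores any non-numeric words (digit strings pass through unchanged). A hand-traced example:

word_to_code_map = {}
codes = sentence_to_transcript("seven two", word_to_code_map)   # "S150 T000"
digits = text2int(codes)                                        # "72"
reverse_map = {v: k for k, v in word_to_code_map.items()}
print(transcript_to_sentence(digits, reverse_map))              # "72"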
- # # Process the audio file
- # transcript = pipe("./odia_recorded/AUD-20240614-WA0004.wav")
- # text_value = transcript['text']
- # sentence = trn.transform(text_value)
- # replaced_words = replace_words(sentence)
- # processed_sentence = process_doubles(replaced_words)
-
- # input_sentence_1 = processed_sentence
-
- # Create empty mappings
- word_to_code_map = {}
- code_to_word_map = {}
-
- # Convert sentence to transcript
- # transcript_1 = sentence_to_transcript(input_sentence_1, word_to_code_map)
-
- # Convert transcript to numerical representation
- # numbers = text2int(transcript_1)
-
- # Create reverse mapping
- code_to_word_map = {v: k for k, v in word_to_code_map.items()}
-
- def process_transcription(input_sentence):
-     word_to_code_map = {}
-     code_to_word_map = {}
-
-     transcript_1 = sentence_to_transcript(input_sentence, word_to_code_map)
-     if transcript_1 is None:
-         return "Error: Transcript conversion returned None"
-
-     numbers = text2int(transcript_1)
-     if numbers is None:
-         return "Error: Text to number conversion returned None"
-
-     code_to_word_map = {v: k for k, v in word_to_code_map.items()}
-     text = transcript_to_sentence(numbers, code_to_word_map)
-     return text
-
- ###########################################
-
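process_transcription is the public entry point for that round trip; expected behaviour on spelling-normalised English digit words:

print(process_transcription("seven two"))  # -> "72"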
- def transcribe_punjabi_30000(speech):
-     text = punjaib_modle_30000(speech)["text"]
-     text = text.replace("[PAD]", "")
-     if text is None:
-         return "Error: ASR returned None"
-     return text
-
- def transcribe_punjabi_eng_model_30000(speech):
-     trn = Transliterator(source='pan', target='eng', build_lookup=True)
-     text = punjaib_modle_30000(speech)["text"]
-     text = text.replace("[PAD]", "")
-     if text is None:
-         return "Error: ASR returned None"
-     sentence = trn.transform(text)
-     if sentence is None:
-         return "Error: Transliteration returned None"
-     replaced_words = replace_words(sentence)
-     processed_sentence = process_doubles(replaced_words)
-     return process_transcription(processed_sentence)
-     return sentence
-
- def transcribe_punjabi_70000_aug(speech):
-     text = punjaib_modle_70000_aug(speech)["text"]
-     text = text.replace("<s>", "")
-     if text is None:
-         return "Error: ASR returned None"
-     return text
-
- def transcribe_punjabi_eng_model_70000_aug(speech):
-     trn = Transliterator(source='pan', target='eng', build_lookup=True)
-     text = punjaib_modle_70000_aug(speech)["text"]
-     text = text.replace("<s>", "")
-     if text is None:
-         return "Error: ASR returned None"
-     sentence = trn.transform(text)
-     if sentence is None:
-         return "Error: Transliteration returned None"
-     replaced_words = replace_words(sentence)
-     processed_sentence = process_doubles(replaced_words)
-     return process_transcription(processed_sentence)
-     return sentence
-
- def transcribe_punjabi_155750(speech):
-     text = punjaib_modle_155750(speech)["text"]
-     text = text.replace("[PAD]", "")
-     if text is None:
-         return "Error: ASR returned None"
-     return text
-
- def transcribe_punjabi_eng_model_155750(speech):
-     trn = Transliterator(source='pan', target='eng', build_lookup=True)
-     text = punjaib_modle_155750(speech)["text"]
-     text = text.replace("[PAD]", "")
-     if text is None:
-         return "Error: ASR returned None"
-     sentence = trn.transform(text)
-     if sentence is None:
-         return "Error: Transliteration returned None"
-     replaced_words = replace_words(sentence)
-     processed_sentence = process_doubles(replaced_words)
-     return process_transcription(processed_sentence)
-     return sentence
-
- ###########################################
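The three Punjabi *_eng wrappers above (each of which ends with an unreachable `return sentence`), and the Odia, Bengali, Hindi, Kannada and Telugu equivalents elsewhere in this file, differ only in the ASR pipeline object, the source-language code, and the padding token stripped. A possible consolidation, not part of the original code; transcribe_and_normalise, asr_pipe and strip_token are names introduced here:

def transcribe_and_normalise(asr_pipe, src_lang, speech, strip_token=None):
    # ASR -> optional token cleanup -> transliteration -> numeric normalisation.
    text = asr_pipe(speech)["text"]
    if text is None:
        return "Error: ASR returned None"
    if strip_token:
        text = text.replace(strip_token, "")
    trn = Transliterator(source=src_lang, target='eng', build_lookup=True)
    sentence = trn.transform(text)
    if sentence is None:
        return "Error: Transliteration returned None"
    return process_transcription(process_doubles(replace_words(sentence)))

# e.g. transcribe_punjabi_eng_model_30000(speech) would become:
# transcribe_and_normalise(punjaib_modle_30000, 'pan', speech, strip_token="[PAD]")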
- def transcribe_odiya_model1(speech):
-     text = odia_model1(speech)["text"]
-     if text is None:
-         return "Error: ASR returned None"
-     return text
-
- def transcribe_odiya_model2(speech):
-     text = odia_model2(speech)["text"]
-     if text is None:
-         return "Error: ASR returned None"
-     return text
-
- def transcribe_odiya_eng_model1(speech):
-     trn = Transliterator(source='ori', target='eng', build_lookup=True)
-     text = odia_model1(speech)["text"]
-     if text is None:
-         return "Error: ASR returned None"
-     sentence = trn.transform(text)
-     if sentence is None:
-         return "Error: Transliteration returned None"
-     replaced_words = replace_words(sentence)
-     processed_sentence = process_doubles(replaced_words)
-     return process_transcription(processed_sentence)
-
- def transcribe_odiya_eng_model2(speech):
-     trn = Transliterator(source='ori', target='eng', build_lookup=True)
-     text = odia_model2(speech)["text"]
-     if text is None:
-         return "Error: ASR returned None"
-     sentence = trn.transform(text)
-     if sentence is None:
-         return "Error: Transliteration returned None"
-     replaced_words = replace_words(sentence)
-     processed_sentence = process_doubles(replaced_words)
-     return process_transcription(processed_sentence)
-
- ########################################
- def cleanhtml(raw_html):
-     cleantext = re.sub(r'<.*?>', '', raw_html)
-     return cleantext
- #######################################
-
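cleanhtml strips anything tag-shaped, which also catches decoder artefacts such as "<s>":

print(cleanhtml("<s>hello</s> 123"))  # -> "hello 123"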
- # def transcribe_hindi(speech):
- #     text = p2(speech)["text"]
- #     if text is None:
- #         return "Error: ASR returned None"
- #     return text
-
- def transcribe_hindi(speech):
-     text = hindi_model(speech)["text"]
-     if text is None:
-         return "Error: ASR returned None"
-
-     hindi_map = {
-         "सेवन": "7",
-         "जीरो": "0",
-         "वन": "1",
-         "टू": "2",
-         "थ्री": "3",
-         "त्री": "3",
-         "फोर": "4",
-         "फाइव": "5",
-         "सिक्स": "6",
-         "एट": "8",
-         "नाइन": "9",
-         "टेन": "10",
-         "एक": "1",
-         "दो": "2",
-         "तीन": "3",
-         "चार": "4",
-         "पांच": "5",
-         "पाँच": "5",
-         "छह": "6",
-         "छः": "6",
-         "सात": "7",
-         "आठ": "8",
-         "नौ": "9",
-         "दस": "10"
-     }
-
-     for hindi, num in hindi_map.items():
-         text = text.replace(hindi, num)
-
-     # Split the string into parts separated by spaces
-     parts = text.split(' ')
-
-     # Initialize an empty list to store the processed parts
-     processed_parts = []
-
-     # Iterate over each part
-     for part in parts:
-         # Check if the part is a number (contains only digits)
-         if part.isdigit():
-             # If the previous part was also a number, concatenate them
-             if processed_parts and processed_parts[-1].isdigit():
-                 processed_parts[-1] += part
-             else:
-                 processed_parts.append(part)
-         else:
-             # If the part is not a number, add it to the list as is
-             processed_parts.append(part)
-
-     # Join the processed parts back into a string with spaces
-     text = ' '.join(processed_parts)
-
-     return text
-
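After the map is applied, the second pass merges runs of adjacent digit tokens so a spoken digit string becomes one number. Note the map is applied with plain str.replace, so a digit word embedded in a longer word is rewritten too. The same merging pass, extracted into a standalone helper for illustration (merge_adjacent_digits is a name introduced here):

def merge_adjacent_digits(text):
    merged = []
    for part in text.split(' '):
        # Concatenate this token onto the previous one when both are digits.
        if part.isdigit() and merged and merged[-1].isdigit():
            merged[-1] += part
        else:
            merged.append(part)
    return ' '.join(merged)

print(merge_adjacent_digits("मेरा नंबर 9 8 7 है"))  # -> "मेरा नंबर 987 है"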
- ###########################################################
- def transcribe_kannada(speech):
-     text = p3(speech)["text"]
-     if text is None:
-         return "Error: ASR returned None"
-     return text
-
- def transcribe_telugu(speech):
-     text = p4(speech)["text"]
-     if text is None:
-         return "Error: ASR returned None"
-     return text
-
- def transcribe_bangala(speech):
-     text = p5(speech)["text"]
-     if text is None:
-         return "Error: ASR returned None"
-     return text
-
- def transcribe_assamese_LM(speech):
-     text = p8(speech)["text"]
-     text = cleanhtml(text)
-     if text is None:
-         return "Error: ASR returned None"
-     return text
-
- def transcribe_assamese_model2(speech):
-     text = p7(speech)["text"]
-     text = cleanhtml(text)
-     if text is None:
-         return "Error: ASR returned None"
-     return text
-
- def transcribe_ban_eng(speech):
-     trn = Transliterator(source='ben', target='eng', build_lookup=True)
-     text = p5(speech)["text"]
-     if text is None:
-         return "Error: ASR returned None"
-     sentence = trn.transform(text)
-     if sentence is None:
-         return "Error: Transliteration returned None"
-     replaced_words = replace_words(sentence)
-     processed_sentence = process_doubles(replaced_words)
-     return process_transcription(processed_sentence)
-
- def transcribe_hin_eng(speech):
-     trn = Transliterator(source='hin', target='eng', build_lookup=True)
-     text = p2(speech)["text"]
-     if text is None:
-         return "Error: ASR returned None"
-     sentence = trn.transform(text)
-     if sentence is None:
-         return "Error: Transliteration returned None"
-     replaced_words = replace_words(sentence)
-     processed_sentence = process_doubles(replaced_words)
-     return process_transcription(processed_sentence)
-
- def transcribe_kan_eng(speech):
-     trn = Transliterator(source='kan', target='eng', build_lookup=True)
-     text = p3(speech)["text"]
-     if text is None:
-         return "Error: ASR returned None"
-     sentence = trn.transform(text)
-     if sentence is None:
-         return "Error: Transliteration returned None"
-     replaced_words = replace_words(sentence)
-     processed_sentence = process_doubles(replaced_words)
-     return process_transcription(processed_sentence)
-
- def transcribe_tel_eng(speech):
-     trn = Transliterator(source='tel', target='eng', build_lookup=True)
-     text = p4(speech)["text"]
-     if text is None:
-         return "Error: ASR returned None"
-     sentence = trn.transform(text)
-     if sentence is None:
-         return "Error: Transliteration returned None"
-     replaced_words = replace_words(sentence)
-     processed_sentence = process_doubles(replaced_words)
-     return process_transcription(processed_sentence)
-
-
- def sel_lng(lng, mic=None, file=None):
-     if mic is not None:
-         audio = mic
-     elif file is not None:
-         audio = file
-     else:
-         return "You must either provide a mic recording or a file"
-
-     if lng == "Odiya":
-         return transcribe_odiya(audio)
-     elif lng == "Odiya-trans":
-         return transcribe_odiya_eng(audio)
-     elif lng == "Hindi-trans":
-         return transcribe_hin_eng(audio)
-     elif lng == "Hindi":
-         return transcribe_hindi(audio)
-     elif lng == "Kannada-trans":
-         return transcribe_kan_eng(audio)
-     elif lng == "Kannada":
-         return transcribe_kannada(audio)
-     elif lng == "Telugu-trans":
-         return transcribe_tel_eng(audio)
-     elif lng == "Telugu":
-         return transcribe_telugu(audio)
-     elif lng == "Bangala-trans":
-         return transcribe_ban_eng(audio)
-     elif lng == "Bangala":
-         return transcribe_bangala(audio)
-     elif lng == "Assamese-LM":
-         return transcribe_assamese_LM(audio)
-     elif lng == "Assamese-Model2":
-         return transcribe_assamese_model2(audio)
-     elif lng == "Odia_model1":
-         return transcribe_odiya_model1(audio)
-     elif lng == "Odiya_trans_model1":
-         return transcribe_odiya_eng_model1(audio)
-     elif lng == "Odia_model2":
-         return transcribe_odiya_model2(audio)
-     elif lng == "Odia_trans_model2":
-         return transcribe_odiya_eng_model2(audio)
-     elif lng == "Punjabi_Model0":
-         return transcribe_punjabi_30000(audio)
-     elif lng == "Punjabi_Model0_Trans":
-         return transcribe_punjabi_eng_model_30000(audio)
-     elif lng == "Punjabi_Model_aug":
-         return transcribe_punjabi_70000_aug(audio)
-     elif lng == "Punjabi_Model_aug_Trans":
-         return transcribe_punjabi_eng_model_70000_aug(audio)
-     elif lng == "Punjabi_Model1":
-         return transcribe_punjabi_155750(audio)
-     elif lng == "Punjabi_Model1_Trans":
-         return transcribe_punjabi_eng_model_155750(audio)
-
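Two observations on sel_lng: the "Odiya" and "Odiya-trans" branches call transcribe_odiya / transcribe_odiya_eng, which are not defined anywhere in this file, so those selections would raise a NameError; and the if/elif ladder is equivalent to a dictionary lookup. A sketch of the latter (table truncated; TRANSCRIBERS and sel_lng_v2 are names introduced here):

TRANSCRIBERS = {
    "Hindi": transcribe_hindi,
    "Hindi-trans": transcribe_hin_eng,
    "Odia_model1": transcribe_odiya_model1,
    "Odiya_trans_model1": transcribe_odiya_eng_model1,
    # ... remaining selections as in the ladder above ...
}

def sel_lng_v2(lng, mic=None, file=None):
    audio = mic if mic is not None else file
    if audio is None:
        return "You must either provide a mic recording or a file"
    fn = TRANSCRIBERS.get(lng)
    return fn(audio) if fn is not None else f"Unknown selection: {lng}"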
-
- # Convert transcript back to sentence
- # reconstructed_sentence_1 = transcript_to_sentence(numbers, code_to_word_map)
 
- # demo = gr.Interface(
- #     fn=sel_lng,
- #     inputs=[
- #         gr.Dropdown(["Hindi","Hindi-trans","Odiya","Odiya-trans"], value="Hindi", label="Select Language"),
- #         gr.Audio(source="microphone", type="filepath"),
- #         gr.Audio(source="upload", type="filepath"),
- #         # gr.Audio(sources="upload", type="filepath"),
- #         # "state"
- #     ],
- #     outputs=[
- #         "textbox"
- #         # "state"
- #     ],
- #     title="Automatic Speech Recognition",
- #     description = "Demo for Automatic Speech Recognition. Use microphone to record speech. Please press Record button. Initially it will take some time to load the model. The recognized text will appear in the output textbox",
- # ).launch()
 
- ######################################################
+ # transcriber = pipeline(task="automatic-speech-recognition", model="cdactvm/w2v-bert-2.0-hindi_v1")
+ # processor = AutoProcessor.from_pretrained("cdactvm/w2v-bert-2.0-hindi_v1")
+
+ # vocab_dict = processor.tokenizer.get_vocab()
+
+ # sorted_vocab_dict = {k.lower(): v for k, v in sorted(vocab_dict.items(), key=lambda item: item[1])}
+ # decoder = build_ctcdecoder(
+ #     labels=list(sorted_vocab_dict.keys()),
+ #     kenlm_model_path="lm.binary",
+ # )
+ # processor_with_lm = Wav2Vec2ProcessorWithLM(
+ #     feature_extractor=processor.feature_extractor,
+ #     tokenizer=processor.tokenizer,
+ #     decoder=decoder
+ # )
+ # processor.feature_extractor._processor_class = "Wav2Vec2ProcessorWithLM"
+
+
+ def transcribe(audio):
+     # # Process the audio file
+     transcript = transcriber(audio)
+     text_value = transcript['text']
+     print(text_value)
+     processd_doubles = process_doubles(text_value)
+     converted_to_list = convert_to_list(processd_doubles, text_to_list())
+     replaced_words = replace_words(converted_to_list)
+     converted_text = text_to_int(replaced_words)
+     return converted_text
+
+
+ # demo = gr.Interface(
+ #     transcribe,
+ #     gr.Audio(sources="microphone", type="filepath"),
+ #     "text",
+ # )
+
+ # demo.launch()
+
  demo = gr.Interface(
-     fn=sel_lng,
+     transcribe,
      inputs=[
-         # gr.Dropdown(["Hindi","Hindi-trans","Odiya","Odiya-trans","Kannada","Kannada-trans","Telugu","Telugu-trans","Bangala","Bangala-trans"], value="Hindi", label="Select Language"),
-         gr.Dropdown([
-             # "Hindi","Hindi-trans",
-             "Odia_model1", "Odiya_trans_model1", "Odia_model2", "Odia_trans_model2"], label="Select Language"),
-             # "Assamese-LM","Assamese-Model2",
-             # "Punjabi_Model1","Punjabi_Model1_Trans","Punjabi_Model_aug","Punjabi_Model_aug_Trans"], value="Hindi", label="Select Language"),
          gr.Audio(sources=["microphone", "upload"], type="filepath"),
-         # gr.Audio(sources="upload", type="filepath"),
-         # "state"
      ],
      outputs=[
          "textbox"
-         # "state"
      ],
-     allow_flagging="auto",
-     # flagging_options=["Language error", "English transliteration error", "Other"],
-     # flagging_callback=hf_writer,
      title="Automatic Speech Recognition",
      description = "Demo for Automatic Speech Recognition. Use microphone to record speech. Please press Record button. Initially it will take some time to load the model. The recognized text will appear in the output textbox",
  ).launch()
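One caveat on the new file: transcriber is only ever assigned inside a comment, so transcribe() will raise a NameError on the first request. The pipeline line needs to be uncommented for the demo to work; a minimal local check (the wav path is hypothetical):

transcriber = pipeline(task="automatic-speech-recognition", model="cdactvm/w2v-bert-2.0-hindi_v1")
print(transcribe("sample_recording.wav"))  # prints the raw ASR text, then the normalised digits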