Huyen2310 committed
Commit b5a9eb4 · 1 Parent(s): ef174c4

Update app.py

Files changed (1): app.py (+198 −24)
app.py CHANGED
@@ -1,45 +1,219 @@
-import gradio as gr
-from transformers import pipeline
-import torch
 import librosa
-import soundfile

-SAMPLE_RATE = 16000

-pipe = pipeline(model="HuyenNguyen/Vin-W-22000")


-def transcribe(Microphone, File_Upload):
     warn_output = ""
-    if (Microphone is not None) and (File_Upload is not None):
-        warn_output = "WARNING: You've uploaded an audio file and used the microphone. " \
-                      "The recorded file from the microphone will be used and the uploaded audio will be discarded.\n"
-        file = Microphone

-    elif (Microphone is None) and (File_Upload is None):
         return "ERROR: You have to either use the microphone or upload an audio file"

-    elif Microphone is not None:
-        file = Microphone
-    else:
-        file = File_Upload

-    text = pipe(file)["text"]

-    return warn_output + text


-iface = gr.Interface(
     fn=transcribe,
     inputs=[
-        gr.inputs.Audio(source="microphone", type='filepath', optional=True),
-        gr.inputs.Audio(source="upload", type='filepath', optional=True),
     ],
     outputs="text",
     layout="horizontal",
     theme="huggingface",
-    title="Speed to Text Huyen Nguyen",
-    allow_flagging='never',
 )

-iface.launch(enable_queue=True)
+# -*- coding: utf-8 -*-
+"""demo 2/3.ipynb
+
+Automatically generated by Colaboratory.
+
+Original file is located at
+    https://colab.research.google.com/drive/1QeNS57tZzvJudeNjQczKJ-PbN0l1tK6V
+
+# Import library
+"""
+import os
 import librosa
+import gradio as gr
+import noisereduce as nr
+from scipy.io import wavfile
+from transformers import WhisperProcessor, WhisperForConditionalGeneration
+
+"""# Load model"""
+
+from google.colab import drive
+import os
+drive.mount('/content/gdrive')
+
+# load model and processor
+processor = WhisperProcessor.from_pretrained("/content/gdrive/MyDrive/ColabNotebookShared/Speech2TextHuyenNguyen/Model/FPTVinTest2")
+model = WhisperForConditionalGeneration.from_pretrained("/content/gdrive/MyDrive/ColabNotebookShared/Speech2TextHuyenNguyen/Model/FPTVinTest2/checkpoint-1332").to("cuda")
+model.config.forced_decoder_ids = processor.get_decoder_prompt_ids(task="transcribe")
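
[Aside, a hedged sketch, not part of the commit: this checkpoint is fine-tuned for Vietnamese, and get_decoder_prompt_ids also accepts a language argument; pinning it avoids Whisper's per-chunk language auto-detection. language="vi" below is an assumption, not something the commit sets.]

# Sketch only: pin language as well as task (language="vi" is assumed)
model.config.forced_decoder_ids = processor.get_decoder_prompt_ids(language="vi", task="transcribe")
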
+
+"""# Split audio"""
+
+from pydub import AudioSegment
+
+def preprocessing(path):
+    # CONVERT MP3 -> WAV
+    type_file = path.split(".")[-1]
+    sound = AudioSegment.from_file(path, type_file)
+    path_list = []
+
+    # SPLIT AUDIO into 20-second chunks
+    time_audio = int(sound.duration_seconds / 20) + 1  # number of chunks
+    for i in range(time_audio):
+        t1 = i * 20 * 1000
+        t2 = (i + 1) * 20 * 1000
+        if i == (time_audio - 1):
+            newAudio = sound[t1:]
+        else:
+            newAudio = sound[t1:t2]
+
+        newAudio = newAudio.split_to_mono()[0]
+        newAudio = newAudio.set_frame_rate(16000)  # resample: any rate --> 16000 Hz
+        audio_path = '/content/new_audio' + str(i) + '.wav'
+        newAudio.export(audio_path, format="wav")
+        path_list.append(audio_path)
+    return path_list
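
[Aside, a hedged usage sketch, not part of the commit: each path returned by preprocessing points at a mono, 16 kHz WAV chunk of at most 20 seconds, which fits comfortably inside Whisper's 30-second input window; the input file below is hypothetical.]

# Sketch only: "/content/sample.mp3" is a hypothetical input file
chunk_paths = preprocessing("/content/sample.mp3")
print(chunk_paths)  # ['/content/new_audio0.wav', '/content/new_audio1.wav', ...]
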
+
+"""# Capitalization"""
+
+# shell commands from the notebook (!git ...) rewritten as os.system calls
+# so the file runs as plain Python
+os.system("git lfs install")
+os.system("git clone https://github.com/huyenxam/Vicap.git")
+
+# Commented out IPython magic to ensure Python compatibility.
+# %cd {"/content/Vicap"}
+os.chdir("/content/Vicap")  # replaces the %cd magic so gec_model can be imported
+
+from gec_model import GecBERTModel
+
+cache_dir = "./"
+model_cap = GecBERTModel(
+    vocab_path=os.path.join(cache_dir, "vocabulary"),
+    model_paths="dragonSwing/vibert-capu",
+    split_chunk=True
+)
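
[Aside, a hedged sketch, not part of the commit: calling the capitalization model on its own. The call shape, a string in and an indexable batch of restored strings out, matches how model_cap is used inside transcribe further down; the sample sentence is illustrative only.]

# Sketch only: the input sentence is illustrative
restored = model_cap("xin chao cac ban")[0]
print(restored)
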
+
+"""# Spelling Correction"""
+
+from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
+tokenizer_spell = AutoTokenizer.from_pretrained("VietAI/vit5-base")
+model_spell = AutoModelForSeq2SeqLM.from_pretrained("HuyenNguyen/Vi-test1")
+model_spell.cuda()
+
+def spelling_text(text):
+    encoding = tokenizer_spell(text, return_tensors="pt")
+    input_ids, attention_masks = encoding["input_ids"].to("cuda"), encoding["attention_mask"].to("cuda")
+    outputs = model_spell.generate(
+        input_ids=input_ids, attention_mask=attention_masks,
+        max_length=30,
+    )
+
+    # decode the (single) generated sequence
+    for output in outputs:
+        line = tokenizer_spell.decode(output, skip_special_tokens=True, clean_up_tokenization_spaces=True)
+    return line
+
+def spelling(transcription):
+    # correct the transcript in windows of at most 24 words
+    sentences = transcription.split(" ")
+    len_sen = int(len(sentences) / 25) + 1
+    result = ""
+    for i in range(len_sen):
+        t1 = i * 24
+        t2 = (i + 1) * 24
+        if i == (len_sen - 1):
+            text = " ".join(sentences[t1:])
+        else:
+            text = " ".join(sentences[t1:t2])
+        result = result + " " + spelling_text(text)
+
+    return result
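
[Aside, a hedged sketch, not part of the commit: the spelling pass in isolation. spelling splits the transcript on spaces, corrects windows of at most 24 words through spelling_text, and concatenates the results; the input string is illustrative only.]

# Sketch only: the raw ASR output below is illustrative
raw = "toi dang hoc o truong dai hoc"
print(spelling(raw))  # note: output carries a leading space from the " " + chunk concatenation
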
+
+"""# Speech To Text"""
+
+import torch
+import numpy as np
+import gradio as gr
+from scipy.io.wavfile import write
+import pytube as pt
+from transformers import pipeline
+from huggingface_hub import model_info
+
+def transcribe(microphone, file_upload):
     warn_output = ""
+    if (microphone is not None) and (file_upload is not None):
+        warn_output = (
+            "WARNING: You've uploaded an audio file and used the microphone. "
+            "The recorded file from the microphone will be used and the uploaded audio will be discarded.\n"
+        )
+
+    elif (microphone is None) and (file_upload is None):
         return "ERROR: You have to either use the microphone or upload an audio file"
+
+    path = microphone if microphone is not None else file_upload
+
+    # rewrite the input as a WAV file that preprocessing() can split
+    X_new, sr_new = librosa.load(path)
+    dst = "/content/audio.wav"
+    write(dst, sr_new, X_new)
+
+    # Split audio and transcribe each 20-second chunk
+    transcription = ""
+    path_list = preprocessing(dst)
+    for audio_path in path_list:
+        # X, sr = noise(audio_path)
+        X, sr = librosa.load(audio_path, sr=16000)
+        input_features = processor(X.astype('float16'), sampling_rate=16000, return_tensors="pt").input_features
+        # predicted_ids = model.generate(input_features.to("cuda"), temperature=1.0)
+        predicted_ids = model.generate(input_features.to("cuda"))
+        text = processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]
+        transcription = transcription + " " + text
+
+    transcription_spell = spelling(transcription)
+
+    transcription_cap = model_cap(transcription_spell)[0]
+
+    # sentence_result = "Original: " + transcription + "\n" + "Spelling-corrected: " + transcription_spell + "\n" + "Punctuation added: " + transcription_cap
+    return warn_output + transcription_cap  # prepend the microphone-vs-upload warning, if any
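
[Aside, a hedged sketch, not part of the commit: taken together, transcribe rewrites the input as WAV, chunks it, runs Whisper per chunk, then applies spelling correction and capitalization. A direct call that bypasses Gradio, with a hypothetical file path:]

# Sketch only: "/content/sample.wav" is a hypothetical recording
result = transcribe(microphone=None, file_upload="/content/sample.wav")
print(result)
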
+
+
+def _return_yt_html_embed(yt_url):
+    video_id = yt_url.split("?v=")[-1]
+    HTML_str = (
+        f'<center> <iframe width="500" height="320" src="https://www.youtube.com/embed/{video_id}"> </iframe>'
+        " </center>"
+    )
+    return HTML_str
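
[Aside, a hedged sketch, not part of the commit: the embed helper simply splits on "?v=" to recover the video id; the URL is illustrative.]

# Sketch only: the URL and video id are illustrative
print(_return_yt_html_embed("https://www.youtube.com/watch?v=abc123"))
# <center> <iframe width="500" height="320" src="https://www.youtube.com/embed/abc123"> </iframe> </center>
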
+
+
+def yt_transcribe(yt_url):
+    # yt = pt.YouTube(yt_url)
+    # html_embed_str = _return_yt_html_embed(yt_url)
+    # stream = yt.streams.filter(only_audio=True)[0]
+    # src = "/content/audio.mp3"
+    # dst = "/content/audio.wav"
+    # stream.download(filename=src)
+
+    # X_new, sr_new = librosa.load(src)
+
+    # write(dst, sr_new, X_new)
+    # # X_new, sr_new = librosa.load(src)
+    # path_list = preprocessing(dst)
+    # transcription = " "
+    # for audio_path in path_list:
+    #     # X, sr = noise(audio_path)
+    #     X, sr = librosa.load(audio_path, sr=16000)
+    #     input_features = processor(X.astype('float16'), return_tensors="pt").input_features
+    #     predicted_ids = model.generate(input_features.to("cuda"))
+    #     text = processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]
+    #     transcription = transcription + " " + text
+
+    # transcription = spelling(transcription)
+    # transcription = model_cap(transcription)[0]
+
+    # the whole YouTube path is commented out; only a placeholder is returned
+    return "output", "This feature is temporarily locked"
+
+
+demo = gr.Blocks()
+
+mf_transcribe = gr.Interface(
     fn=transcribe,
     inputs=[
+        gr.inputs.Audio(source="microphone", type="filepath", optional=True),
+        gr.inputs.Audio(source="upload", type="filepath", optional=True),
     ],
     outputs="text",
     layout="horizontal",
     theme="huggingface",
+    title="PYLAB Demo: Transcribe Audio",
+    allow_flagging="never",
 )

+# note: this rebinds the name yt_transcribe; fn=yt_transcribe still refers to the function above
+yt_transcribe = gr.Interface(
+    fn=yt_transcribe,
+    inputs=[gr.inputs.Textbox(lines=1, placeholder="Paste the URL to a YouTube video here", label="YouTube URL")],
+    outputs=["html", "text"],
+    layout="horizontal",
+    theme="huggingface",
+    title="PYLAB Demo: Transcribe YouTube",
+    allow_flagging="never",
+)
+
+with demo:
+    gr.TabbedInterface([mf_transcribe, yt_transcribe], ["Transcribe Audio", "Transcribe YouTube"])
+
+demo.launch(enable_queue=True)
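
[Aside, a hedged sketch, not part of the commit: the gr.inputs namespace, the optional=, layout=, and theme= arguments, and launch(enable_queue=True) come from the Gradio 2.x/early 3.x API and have since been deprecated or removed. Roughly equivalent wiring on a recent Gradio release, with current API names assumed:]

# Sketch only: approximate equivalent on a newer Gradio (API names assumed current)
mf_transcribe = gr.Interface(
    fn=transcribe,
    inputs=[
        gr.Audio(sources=["microphone"], type="filepath"),
        gr.Audio(sources=["upload"], type="filepath"),
    ],
    outputs="text",
    title="PYLAB Demo: Transcribe Audio",
)
mf_transcribe.queue().launch()
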