KevinGeng committed on
Commit 5626b60
1 Parent(s): 09a6b36

remove app_whipser_large.py app.whisper.fine_tuned.py

Files changed (2)
  1. .gitattributes +1 -0
  2. app.whisper.fine_tuned.py +0 -272
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
  xvector filter=lfs diff=lfs merge=lfs -text
+ TTS_model filter=lfs diff=lfs merge=lfs -text
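Note: the added rule routes the TTS_model path through Git LFS. Assuming Git LFS is initialized in this repo, running git lfs track "TTS_model" would append exactly this filter/diff/merge line to .gitattributes; the command is shown only for context and is not part of the commit.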
app.whisper.fine_tuned.py DELETED
@@ -1,272 +0,0 @@
- """
- TODO:
- + [x] Load Configuration
- + [ ] Checking
- + [ ] Better saving directory
- """
- import numpy as np
- from pathlib import Path
- import torch.nn as nn
- import torch
- import torchaudio
- from transformers import pipeline
- from pathlib import Path
-
- # local import
- import sys
- from espnet2.bin.tts_inference import Text2Speech
- from transformers import AutoTokenizer, AutoFeatureExtractor, AutoModelForCTC# pdb.set_trace()
- device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
-
- sys.path.append("src")
-
- import gradio as gr
-
- # ASR part
-
- audio_files = [
-     str(x)
-     for x in sorted(
-         Path(
-             "/home/kevingeng/Disk2/laronix/laronix_automos/data/20230103_video"
-         ).glob("**/*wav")
-     )
- ]
- # audio_files = [str(x) for x in sorted(Path("./data/Patient_sil_trim_16k_normed_5_snr_40/Rainbow").glob("**/*wav"))]
- # transcriber = pipeline(
- #     "automatic-speech-recognition",
- #     model="KevinGeng/PAL_John_128_train_dev_test_seed_1",
- # )
-
- from transformers import AutoProcessor, AutoModelForSpeechSeq2Seq
-
- processor = AutoProcessor.from_pretrained("KevinGeng/whipser_medium_en_PAL300_step25")
-
- model = AutoModelForSpeechSeq2Seq.from_pretrained("KevinGeng/whipser_medium_en_PAL300_step25")
-
- # feature_extractor = AutoFeatureExtractor.from_pretrained("KevinGeng/PAL_John_128_train_dev_test_seed_1")
- # representation_model = AutoModelForCTC.from_pretrained("KevinGeng/PAL_John_128_train_dev_test_seed_1")
- # tokenizer = AutoTokenizer.from_pretrained("KevinGeng/PAL_John_128_train_dev_test_seed_1")
-
- transcriber = pipeline("automatic-speech-recognition", model="KevinGeng/whipser_medium_en_PAL300_step25")
- # transcriber = pipeline("automatic-speech-recognition", model="KevinGeng/PAL_John_128_p326_300_train_dev_test_seed_1")
- # 【Female】kan-bayashi ljspeech parallel wavegan
- # tts_model = Text2Speech.from_pretrained("espnet/kan-bayashi_ljspeech_vits")
- # 【Male】fastspeech2-en-200_speaker-cv4, hifigan vocoder
- # pdb.set_trace()
-
- # @title English multi-speaker pretrained model { run: "auto" }
- lang = "English"
- tag = "kan-bayashi/libritts_xvector_vits"
- # vits needs no
- vocoder_tag = "parallel_wavegan/vctk_parallel_wavegan.v1.long" # @param ["none", "parallel_wavegan/vctk_parallel_wavegan.v1.long", "parallel_wavegan/vctk_multi_band_melgan.v2", "parallel_wavegan/vctk_style_melgan.v1", "parallel_wavegan/vctk_hifigan.v1", "parallel_wavegan/libritts_parallel_wavegan.v1.long", "parallel_wavegan/libritts_multi_band_melgan.v2", "parallel_wavegan/libritts_hifigan.v1", "parallel_wavegan/libritts_style_melgan.v1"] {type:"string"}
- from espnet2.bin.tts_inference import Text2Speech
- from espnet2.utils.types import str_or_none
-
- text2speech = Text2Speech.from_pretrained(
-     model_tag=str_or_none(tag),
-     vocoder_tag=str_or_none(vocoder_tag),
-     device="cuda",
-     use_att_constraint=False,
-     backward_window=1,
-     forward_window=3,
-     speed_control_alpha=1.0,
- )
-
- import glob
- import os
- import numpy as np
- import kaldiio
-
- # Get model directory path
- from espnet_model_zoo.downloader import ModelDownloader
-
- d = ModelDownloader()
- model_dir = os.path.dirname(d.download_and_unpack(tag)["train_config"])
-
- # Speaker x-vector selection
-
- xvector_ark = [
-     p
-     for p in glob.glob(
-         f"xvector/test-clean/spk_xvector.ark", recursive=True
-     )
-     if "test" in p
- ][0]
- xvectors = {k: v for k, v in kaldiio.load_ark(xvector_ark)}
- spks = list(xvectors.keys())
-
- male_spks = {
-     "Male1": "2300_131720",
-     "Male2": "1320_122612",
- }
- # "M3": "1188_133604",
- # "M4": "61_70970",
- female_spks = {"Female1": "2961_961", "Female2": "8463_287645", }
- # "F3": "121_121726"
- spks = dict(male_spks, **female_spks)
- spk_names = sorted(spks.keys())
-
-
- ## 20230224 Mousa: No reference,
- def ASRTTS(audio_file, spk_name, ref_text=""):
-     spk = spks[spk_name]
-     spembs = xvectors[spk]
-     if ref_text == "":
-         reg_text = transcriber(audio_file)["text"]
-     else:
-         reg_text = ref_text
-
-     speech, sr = torchaudio.load(
-         audio_file, channels_first=True
-     ) # Mono channel
-     wav_tensor_spembs = text2speech(
-         text=reg_text, speech=speech, spembs=spembs
-     )["wav"]
-     wav_numpy = wav_tensor_spembs.unsqueeze(1).to("cpu")
-     sample_rate = 22050
-     save_id = (
-         "./wav/" + Path(audio_file).stem + "_" + spk_name + "_spkembs.wav"
-     )
-     torchaudio.save(
-         save_id,
-         src=wav_tensor_spembs.unsqueeze(0).to("cpu"),
-         sample_rate=22050,
-     )
-
-     return save_id, reg_text
-
-
- def ASRTTS_clean(audio_file, spk_name):
-     spk = spks[spk_name]
-     spembs = xvectors[spk]
-
-     reg_text = transcriber(audio_file)["text"]
-
-     speech, sr = torchaudio.load(
-         audio_file, channels_first=True
-     ) # Mono channel
-     wav_tensor_spembs = text2speech(
-         text=reg_text, speech=speech, spembs=spembs
-     )["wav"]
-     wav_numpy = wav_tensor_spembs.unsqueeze(1).to("cpu")
-     sample_rate = 22050
-     save_id = (
-         "./wav/" + Path(audio_file).stem + "_" + spk_name + "_spkembs.wav"
-     )
-     torchaudio.save(
-         save_id,
-         src=wav_tensor_spembs.unsqueeze(0).to("cpu"),
-         sample_rate=22050,
-     )
-     return save_id
-
-
- reference_textbox = gr.Textbox(
-     value="",
-     placeholder="Input reference here",
-     label="Reference",
- )
-
- recognization_textbox = gr.Textbox(
-     value="",
-     placeholder="Output recognization here",
-     label="recognization_textbox",
- )
-
- speaker_option = gr.Radio(choices=spk_names, label="Speaker")
-
- input_audio = gr.Audio(
-     source="upload", type="filepath", label="Audio_to_Evaluate"
- )
- output_audio = gr.Audio(
-     source="upload", file="filepath", label="Synthesized Audio"
- )
- examples = [
-     ["./samples/001.wav", "M1", ""],
-     ["./samples/002.wav", "M2", ""],
-     ["./samples/003.wav", "F1", ""],
-     ["./samples/004.wav", "F2", ""],
- ]
-
-
- def change_audiobox(choice):
-     if choice == "upload":
-         input_audio = gr.Audio.update(source="upload", visible=True)
-     elif choice == "microphone":
-         input_audio = gr.Audio.update(source="microphone", visible=True)
-     else:
-         input_audio = gr.Audio.update(visible=False)
-     return input_audio
-
-
- def show_icon(choice):
-     if choice == "Male1":
-         spk_icon = gr.Image.update(value="speaker_icons/male1.png", visible=True)
-     elif choice == "Male2":
-         spk_icon = gr.Image.update(value="speaker_icons/male2.png", visible=True)
-     elif choice == "Female1":
-         spk_icon = gr.Image.update(value="speaker_icons/female1.png", visible=True)
-     elif choice == "Female2":
-         spk_icon = gr.Image.update(value="speaker_icons/female2.png", visible=True)
-     return spk_icon
-
- def get_download_file(audio_file=None):
-     if audio_file == None:
-         output_audio_file = gr.File.update(visible=False)
-     else:
-         output_audio_file = gr.File.update(visible=True)
-     return output_audio_file
-
- def download_file(audio_file):
-     return gr.File(value=audio_file)
- # pdb.set_trace()
-
- with gr.Blocks(
-     analytics_enabled=False,
-     css=".gradio-container {background-color: #78BD91}",
- ) as demo:
-     with gr.Column(elem_id="Column"):
-         input_format = gr.Radio(
-             choices=["microphone", "upload"], label="Choose your input format", elem_id="input_format"
-         )
-         input_audio = gr.Audio(
-             source="microphone",
-             type="filepath",
-             label="Input Audio",
-             interactive=True,
-             visible=False,
-             elem_id="input_audio"
-         )
-         input_format.change(
-             fn=change_audiobox, inputs=input_format, outputs=input_audio
-         )
-
-         speaker_option = gr.Radio(choices=spk_names, value="Male1", label="Choose your voice profile")
-         spk_icon = gr.Image(value="speaker_icons/male1.png",
-             type="filepath",
-             image_mode="RGB",
-             source="upload",
-             shape=[50, 50],
-             interactive=True,
-             visible=True)
-         speaker_option.change(
-             fn=show_icon, inputs=speaker_option, outputs=spk_icon
-         )
-
-         b2 = gr.Button("Convert")
-
-         output_audio = gr.Audio(
-             source="upload", file="filepath", label="Converted Audio", interactive=False
-         )
-
-         b2.click(
-             ASRTTS_clean,
-             inputs=[input_audio, speaker_option],
-             outputs=output_audio,
-             api_name="convert"
-         )
-
-         # download_file("wav/001_F1_spkembs.wav")
-
- demo.launch(share=False)
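
For reference, the core conversion flow removed by this commit (Whisper ASR through the transformers pipeline, then ESPnet x-vector VITS resynthesis) can be sketched as follows. This is a minimal sketch, not part of the commit: the convert() helper and the output path are hypothetical, the model tags and call pattern are taken from the deleted file, and spembs is assumed to be a speaker x-vector loaded with kaldiio as in the original script.

    import torchaudio
    from transformers import pipeline
    from espnet2.bin.tts_inference import Text2Speech

    # Models the deleted app used: a fine-tuned Whisper ASR pipeline and an
    # ESPnet multi-speaker VITS model driven by speaker x-vectors.
    transcriber = pipeline("automatic-speech-recognition", model="KevinGeng/whipser_medium_en_PAL300_step25")
    text2speech = Text2Speech.from_pretrained(
        model_tag="kan-bayashi/libritts_xvector_vits",
        vocoder_tag="parallel_wavegan/vctk_parallel_wavegan.v1.long",
    )

    def convert(audio_file, spembs, out_path="converted.wav"):
        # Recognize the input speech, then resynthesize the recognized text in
        # the target voice given by the x-vector, as ASRTTS_clean did.
        text = transcriber(audio_file)["text"]
        speech, sr = torchaudio.load(audio_file, channels_first=True)
        wav = text2speech(text=text, speech=speech, spembs=spembs)["wav"]
        torchaudio.save(out_path, wav.unsqueeze(0).cpu(), sample_rate=22050)
        return out_path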