Files changed (1)
  1. voice_main.py +0 -732
voice_main.py DELETED
@@ -1,732 +0,0 @@
from soni_translate.logging_setup import logger
import torch
import gc
import numpy as np
import os
import shutil
import warnings
import threading
from tqdm import tqdm
from lib.infer_pack.models import (
    SynthesizerTrnMs256NSFsid,
    SynthesizerTrnMs256NSFsid_nono,
    SynthesizerTrnMs768NSFsid,
    SynthesizerTrnMs768NSFsid_nono,
)
from lib.audio import load_audio
import soundfile as sf
import edge_tts
import asyncio
from soni_translate.utils import remove_directory_contents, create_directories
from scipy import signal
from time import time as ttime
import faiss
from vci_pipeline import VC, change_rms, bh, ah
import librosa

warnings.filterwarnings("ignore")


class Config:
    def __init__(self, only_cpu=False):
        self.device = "cuda:0"
        self.is_half = True
        self.n_cpu = 0
        self.gpu_name = None
        self.gpu_mem = None
        (
            self.x_pad,
            self.x_query,
            self.x_center,
            self.x_max
        ) = self.device_config(only_cpu)

    def device_config(self, only_cpu) -> tuple:
        if torch.cuda.is_available() and not only_cpu:
            i_device = int(self.device.split(":")[-1])
            self.gpu_name = torch.cuda.get_device_name(i_device)
            if (
                ("16" in self.gpu_name and "V100" not in self.gpu_name.upper())
                or "P40" in self.gpu_name.upper()
                or "1060" in self.gpu_name
                or "1070" in self.gpu_name
                or "1080" in self.gpu_name
            ):
                logger.info(
                    "16/10 series GPUs and the P40 run better in "
                    "single precision; disabling half precision."
                )
                self.is_half = False
            else:
                self.gpu_name = None
            self.gpu_mem = int(
                torch.cuda.get_device_properties(i_device).total_memory
                / 1024
                / 1024
                / 1024
                + 0.4
            )
        elif torch.backends.mps.is_available() and not only_cpu:
            logger.info("Supported NVIDIA GPU not found, using MPS for inference")
            self.device = "mps"
        else:
            logger.info("No supported NVIDIA GPU found, using CPU for inference")
            self.device = "cpu"
            self.is_half = False

        if self.n_cpu == 0:
            self.n_cpu = os.cpu_count()

        # Windowing parameters (seconds) used by the VC pipeline to split audio.
        if self.is_half:
            # Configuration for roughly 6 GB of VRAM
            x_pad = 3
            x_query = 10
            x_center = 60
            x_max = 65
        else:
            # Configuration for roughly 5 GB of VRAM
            x_pad = 1
            x_query = 6
            x_center = 38
            x_max = 41

        if self.gpu_mem is not None and self.gpu_mem <= 4:
            x_pad = 1
            x_query = 5
            x_center = 30
            x_max = 32

        logger.info(
            f"Config: device is {self.device}, "
            f"half precision is {self.is_half}"
        )

        return x_pad, x_query, x_center, x_max


BASE_DOWNLOAD_LINK = "https://huggingface.co/r3gm/sonitranslate_voice_models/resolve/main/"
BASE_MODELS = [
    "hubert_base.pt",
    "rmvpe.pt"
]
BASE_DIR = "."


def load_hu_bert(config):
    from fairseq import checkpoint_utils
    from soni_translate.utils import download_manager

    for id_model in BASE_MODELS:
        # Plain concatenation; BASE_DOWNLOAD_LINK is a URL, not a filesystem path.
        download_manager(
            BASE_DOWNLOAD_LINK + id_model, BASE_DIR
        )

    models, _, _ = checkpoint_utils.load_model_ensemble_and_task(
        ["hubert_base.pt"],
        suffix="",
    )
    hubert_model = models[0]
    hubert_model = hubert_model.to(config.device)
    if config.is_half:
        hubert_model = hubert_model.half()
    else:
        hubert_model = hubert_model.float()
    hubert_model.eval()

    return hubert_model


def load_trained_model(model_path, config):

    if not model_path:
        raise ValueError("No model found")

    logger.info("Loading %s" % model_path)
    cpt = torch.load(model_path, map_location="cpu")
    tgt_sr = cpt["config"][-1]
    cpt["config"][-3] = cpt["weight"]["emb_g.weight"].shape[0]  # n_spk
    if_f0 = cpt.get("f0", 1)  # 1 if the model was trained with pitch guidance

    version = cpt.get("version", "v1")
    if version == "v1":
        if if_f0 == 1:
            net_g = SynthesizerTrnMs256NSFsid(
                *cpt["config"], is_half=config.is_half
            )
        else:
            net_g = SynthesizerTrnMs256NSFsid_nono(*cpt["config"])
    elif version == "v2":
        if if_f0 == 1:
            net_g = SynthesizerTrnMs768NSFsid(
                *cpt["config"], is_half=config.is_half
            )
        else:
            net_g = SynthesizerTrnMs768NSFsid_nono(*cpt["config"])
    del net_g.enc_q  # the posterior encoder is only needed for training

    net_g.load_state_dict(cpt["weight"], strict=False)
    net_g.eval().to(config.device)

    if config.is_half:
        net_g = net_g.half()
    else:
        net_g = net_g.float()

    vc = VC(tgt_sr, config)
    n_spk = cpt["config"][-3]

    return n_spk, tgt_sr, net_g, vc, cpt, version
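
# For reference (inferred from the accesses above; the exact layout depends on
# the RVC training code that produced the .pth file), the checkpoint dict is
# expected to provide at least:
#   cpt["config"]  -> model hyperparameters; [-1] is the target sample rate,
#                     [-3] the speaker-embedding count (n_spk)
#   cpt["weight"]  -> state_dict for the synthesizer
#   cpt["f0"]      -> 1 if trained with pitch guidance, else 0
#   cpt["version"] -> "v1" (256-dim features) or "v2" (768-dim features)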


class ClassVoices:
    def __init__(self, only_cpu=False):
        self.model_config = {}
        self.config = None
        self.only_cpu = only_cpu

    def apply_conf(
        self,
        tag="base_model",
        file_model="",
        pitch_algo="pm",
        pitch_lvl=0,
        file_index="",
        index_influence=0.66,
        respiration_median_filtering=3,
        envelope_ratio=0.25,
        consonant_breath_protection=0.33,
        resample_sr=0,
        file_pitch_algo="",
    ):

        if not file_model:
            raise ValueError("Model not found")

        if file_index is None:
            file_index = ""

        if file_pitch_algo is None:
            file_pitch_algo = ""

        if not self.config:
            self.config = Config(self.only_cpu)
            self.hu_bert_model = None
            self.model_pitch_estimator = None

        self.model_config[tag] = {
            "file_model": file_model,
            "pitch_algo": pitch_algo,
            "pitch_lvl": pitch_lvl,  # integer transpose, no decimals
            "file_index": file_index,
            "index_influence": index_influence,
            "respiration_median_filtering": respiration_median_filtering,
            "envelope_ratio": envelope_ratio,
            "consonant_breath_protection": consonant_breath_protection,
            "resample_sr": resample_sr,
            "file_pitch_algo": file_pitch_algo,
        }
        return f"CONFIGURATION APPLIED FOR {tag}: {file_model}"

    def infer(
        self,
        task_id,
        params,
        # loaded model
        n_spk,
        tgt_sr,
        net_g,
        pipe,
        cpt,
        version,
        if_f0,
        # loaded index
        index_rate,
        index,
        big_npy,
        # loaded f0 file
        inp_f0,
        # audio file
        input_audio_path,
        overwrite,
    ):

        f0_method = params["pitch_algo"]
        f0_up_key = params["pitch_lvl"]
        filter_radius = params["respiration_median_filtering"]
        resample_sr = params["resample_sr"]
        rms_mix_rate = params["envelope_ratio"]
        protect = params["consonant_breath_protection"]

        if not os.path.exists(input_audio_path):
            raise ValueError(
                "The audio file was not found or is not "
                f"a valid file: {input_audio_path}"
            )

        f0_up_key = int(f0_up_key)

        audio = load_audio(input_audio_path, 16000)

        # Normalize audio
        audio_max = np.abs(audio).max() / 0.95
        if audio_max > 1:
            audio /= audio_max

        times = [0, 0, 0]

        # Filter the signal, pad it, compute sliding-window sums, and pick
        # low-energy time indices where the audio can be split cleanly.
        audio = signal.filtfilt(bh, ah, audio)
        audio_pad = np.pad(
            audio, (pipe.window // 2, pipe.window // 2), mode="reflect"
        )
        opt_ts = []
        if audio_pad.shape[0] > pipe.t_max:
            audio_sum = np.zeros_like(audio)
            for i in range(pipe.window):
                audio_sum += audio_pad[i:i - pipe.window]
            for t in range(pipe.t_center, audio.shape[0], pipe.t_center):
                opt_ts.append(
                    t
                    - pipe.t_query
                    + np.where(
                        np.abs(audio_sum[t - pipe.t_query: t + pipe.t_query])
                        == np.abs(
                            audio_sum[t - pipe.t_query: t + pipe.t_query]
                        ).min()
                    )[0][0]
                )
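
        # Editorial note on the loop above: audio_pad carries window//2
        # reflected samples on each side, so each slice
        # audio_pad[i:i - pipe.window] is a len(audio)-sample view shifted
        # by i; summing pipe.window of them makes audio_sum[t] the sum of
        # one full analysis window around sample t. Each cut point in opt_ts
        # is then snapped to the quietest sample within +/- pipe.t_query of
        # the regular pipe.t_center grid, so segment boundaries land in
        # low-amplitude regions and are less audible after concatenation.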

        s = 0
        audio_opt = []
        t = None
        t1 = ttime()

        sid_value = 0
        sid = torch.tensor(sid_value, device=pipe.device).unsqueeze(0).long()

        # Pad the audio symmetrically and derive the frame count.
        audio_pad = np.pad(audio, (pipe.t_pad, pipe.t_pad), mode="reflect")
        p_len = audio_pad.shape[0] // pipe.window

        # Estimate pitch from the audio signal
        pitch, pitchf = None, None
        if if_f0 == 1:
            pitch, pitchf = pipe.get_f0(
                input_audio_path,
                audio_pad,
                p_len,
                f0_up_key,
                f0_method,
                filter_radius,
                inp_f0,
            )
            pitch = pitch[:p_len]
            pitchf = pitchf[:p_len]
            if pipe.device == "mps":
                pitchf = pitchf.astype(np.float32)
            pitch = torch.tensor(
                pitch, device=pipe.device
            ).unsqueeze(0).long()
            pitchf = torch.tensor(
                pitchf, device=pipe.device
            ).unsqueeze(0).float()

        t2 = ttime()
        times[1] += t2 - t1

        # Convert each segment, trimming the reflected padding afterwards.
        for t in opt_ts:
            t = t // pipe.window * pipe.window
            if if_f0 == 1:
                pitch_slice = pitch[
                    :, s // pipe.window: (t + pipe.t_pad2) // pipe.window
                ]
                pitchf_slice = pitchf[
                    :, s // pipe.window: (t + pipe.t_pad2) // pipe.window
                ]
            else:
                pitch_slice = None
                pitchf_slice = None

            audio_slice = audio_pad[s:t + pipe.t_pad2 + pipe.window]
            audio_opt.append(
                pipe.vc(
                    self.hu_bert_model,
                    net_g,
                    sid,
                    audio_slice,
                    pitch_slice,
                    pitchf_slice,
                    times,
                    index,
                    big_npy,
                    index_rate,
                    version,
                    protect,
                )[pipe.t_pad_tgt:-pipe.t_pad_tgt]
            )
            s = t

        # Convert the remaining tail (the whole signal if no cuts were made).
        pitch_end_slice = pitch[
            :, t // pipe.window:
        ] if t is not None else pitch
        pitchf_end_slice = pitchf[
            :, t // pipe.window:
        ] if t is not None else pitchf

        audio_opt.append(
            pipe.vc(
                self.hu_bert_model,
                net_g,
                sid,
                audio_pad[t:],
                pitch_end_slice,
                pitchf_end_slice,
                times,
                index,
                big_npy,
                index_rate,
                version,
                protect,
            )[pipe.t_pad_tgt:-pipe.t_pad_tgt]
        )

        audio_opt = np.concatenate(audio_opt)
        if rms_mix_rate != 1:
            # Blend the volume envelope of the source into the output.
            audio_opt = change_rms(
                audio, 16000, audio_opt, tgt_sr, rms_mix_rate
            )
        if resample_sr >= 16000 and tgt_sr != resample_sr:
            audio_opt = librosa.resample(
                audio_opt, orig_sr=tgt_sr, target_sr=resample_sr
            )

        # Scale to int16, attenuating first if the signal would clip.
        audio_max = np.abs(audio_opt).max() / 0.99
        max_int16 = 32768
        if audio_max > 1:
            max_int16 /= audio_max
        audio_opt = (audio_opt * max_int16).astype(np.int16)
        del pitch, pitchf, sid
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

        if resample_sr >= 16000 and tgt_sr != resample_sr:
            final_sr = resample_sr
        else:
            final_sr = tgt_sr

        if overwrite:
            output_audio_path = input_audio_path  # Overwrite the input file
        else:
            dirname = os.path.dirname(input_audio_path)
            name, ext = os.path.splitext(os.path.basename(input_audio_path))
            output_audio_path = os.path.join(dirname, f"{name}_edited{ext}")
            logger.info(str(output_audio_path))

        # Save file
        sf.write(
            file=output_audio_path,
            samplerate=final_sr,
            data=audio_opt
        )

        self.model_config[task_id]["result"].append(output_audio_path)
        self.output_list.append(output_audio_path)

    def make_test(
        self,
        tts_text,
        tts_voice,
        model_path,
        index_path,
        transpose,
        f0_method,
    ):

        folder_test = "test"
        tag = "test_edge"
        tts_file = "test/test.wav"
        tts_edited = "test/test_edited.wav"

        create_directories(folder_test)
        remove_directory_contents(folder_test)

        if os.getenv("DEMO") == "SET_LIMIT":
            if len(tts_text) > 60:
                tts_text = tts_text[:60]
                logger.warning("DEMO; text limited to 60 characters")

        try:
            asyncio.run(edge_tts.Communicate(
                tts_text, "-".join(tts_voice.split('-')[:-1])
            ).save(tts_file))
        except Exception as e:
            raise ValueError(
                "No audio was received. Please change the "
                f"tts voice for {tts_voice}. Error: {str(e)}"
            )

        shutil.copy(tts_file, tts_edited)

        self.apply_conf(
            tag=tag,
            file_model=model_path,
            pitch_algo=f0_method,
            pitch_lvl=transpose,
            file_index=index_path,
            index_influence=0.66,
            respiration_median_filtering=3,
            envelope_ratio=0.25,
            consonant_breath_protection=0.33,
        )

        self(
            audio_files=tts_edited,
            tag_list=tag,
            overwrite=True
        )

        return tts_edited, tts_file

    def run_threads(self, threads):
        # Start threads
        for thread in threads:
            thread.start()

        # Wait for all threads to finish
        for thread in threads:
            thread.join()

        gc.collect()
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

    def unload_models(self):
        self.hu_bert_model = None
        self.model_pitch_estimator = None
        gc.collect()
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

    def __call__(
        self,
        audio_files=None,
        tag_list=None,
        overwrite=False,
        parallel_workers=1,
    ):
        logger.info(f"Parallel workers: {parallel_workers}")

        self.output_list = []

        if not self.model_config:
            raise ValueError("No model has been configured for inference")

        # Avoid mutable default arguments; accept single paths/tags as well.
        if audio_files is None:
            audio_files = []
        if tag_list is None:
            tag_list = []
        if isinstance(audio_files, str):
            audio_files = [audio_files]
        if isinstance(tag_list, str):
            tag_list = [tag_list]

        if not audio_files:
            raise ValueError("No audio found to convert")
        if not tag_list:
            tag_list = [list(self.model_config.keys())[-1]] * len(audio_files)

        if len(audio_files) > len(tag_list):
            logger.info("Extending tag list to match the audio files")
            extend_number = len(audio_files) - len(tag_list)
            tag_list.extend([tag_list[0]] * extend_number)

        if len(audio_files) < len(tag_list):
            logger.info("Trimming tag list to match the audio files")
            tag_list = tag_list[:len(audio_files)]

        # Sort by tag so each model is loaded only once per batch.
        tag_file_pairs = list(zip(tag_list, audio_files))
        sorted_tag_file = sorted(tag_file_pairs, key=lambda x: x[0])

        # Base params
        if not self.hu_bert_model:
            self.hu_bert_model = load_hu_bert(self.config)

        cache_params = None
        threads = []
        progress_bar = tqdm(total=len(tag_list), desc="Progress")
        for i, (id_tag, input_audio_path) in enumerate(sorted_tag_file):

            if id_tag not in self.model_config.keys():
                logger.info(
                    f"No configured model for {id_tag} with {input_audio_path}"
                )
                continue

            # Flush pending threads when the worker limit is reached or the
            # tag (and therefore the model) is about to change.
            if (
                len(threads) >= parallel_workers
                or (cache_params != id_tag and cache_params is not None)
            ):
                self.run_threads(threads)
                progress_bar.update(len(threads))

                threads = []

            if cache_params != id_tag:

                self.model_config[id_tag]["result"] = []

                # Unload the previous model
                (
                    n_spk,
                    tgt_sr,
                    net_g,
                    pipe,
                    cpt,
                    version,
                    if_f0,
                    index_rate,
                    index,
                    big_npy,
                    inp_f0,
                ) = [None] * 11
                gc.collect()
                if torch.cuda.is_available():
                    torch.cuda.empty_cache()

                # Model params
                params = self.model_config[id_tag]

                model_path = params["file_model"]
                f0_method = params["pitch_algo"]
                file_index = params["file_index"]
                index_rate = params["index_influence"]
                f0_file = params["file_pitch_algo"]

                # Load model
                (
                    n_spk,
                    tgt_sr,
                    net_g,
                    pipe,
                    cpt,
                    version
                ) = load_trained_model(model_path, self.config)
                if_f0 = cpt.get("f0", 1)  # pitch data

                # Load index
                if os.path.exists(file_index) and index_rate != 0:
                    try:
                        index = faiss.read_index(file_index)
                        big_npy = index.reconstruct_n(0, index.ntotal)
                    except Exception as error:
                        logger.error(f"Index: {str(error)}")
                        index_rate = 0
                        index = big_npy = None
                else:
                    logger.warning("Index file not found")
                    index_rate = 0
                    index = big_npy = None

                # Load f0 file
                inp_f0 = None
                if os.path.exists(f0_file):
                    try:
                        with open(f0_file, "r") as f:
                            lines = f.read().strip("\n").split("\n")
                        inp_f0 = []
                        for line in lines:
                            inp_f0.append([float(i) for i in line.split(",")])
                        inp_f0 = np.array(inp_f0, dtype="float32")
                    except Exception as error:
                        logger.error(f"f0 file: {str(error)}")

                if "rmvpe" in f0_method:
                    if not self.model_pitch_estimator:
                        from lib.rmvpe import RMVPE

                        logger.info("Loading vocal pitch estimator model")
                        self.model_pitch_estimator = RMVPE(
                            "rmvpe.pt",
                            is_half=self.config.is_half,
                            device=self.config.device
                        )

                    pipe.model_rmvpe = self.model_pitch_estimator

                cache_params = id_tag

            thread = threading.Thread(
                target=self.infer,
                args=(
                    id_tag,
                    params,
                    # loaded model
                    n_spk,
                    tgt_sr,
                    net_g,
                    pipe,
                    cpt,
                    version,
                    if_f0,
                    # loaded index
                    index_rate,
                    index,
                    big_npy,
                    # loaded f0 file
                    inp_f0,
                    # audio file
                    input_audio_path,
                    overwrite,
                )
            )

            threads.append(thread)

        # Run the last batch
        if threads:
            self.run_threads(threads)

        progress_bar.update(len(threads))
        progress_bar.close()

        final_result = []
        valid_tags = set(tag_list)
        for tag in valid_tags:
            if (
                tag in self.model_config.keys()
                and "result" in self.model_config[tag].keys()
            ):
                final_result.extend(self.model_config[tag]["result"])

        return final_result
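
For context, a minimal usage sketch of the API this file provided, reconstructed from apply_conf, __call__ and make_test above (the model and index paths are hypothetical placeholders, not files from this repo):

from voice_main import ClassVoices

converter = ClassVoices(only_cpu=False)

# Register a voice model under a tag (paths are placeholders).
converter.apply_conf(
    tag="speaker1",
    file_model="weights/speaker1.pth",
    pitch_algo="rmvpe",
    pitch_lvl=0,
    file_index="logs/speaker1.index",
    index_influence=0.66,
)

# Convert one or more files; returns the list of written output paths.
output_paths = converter(
    audio_files=["clip.wav"],
    tag_list=["speaker1"],
    overwrite=False,
    parallel_workers=1,
)
print(output_paths)  # e.g. ["clip_edited.wav"] written next to the input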