KevinGeng committed on
Commit
6ba9c8a
2 Parent(s): 5626b60 67d9aaf

Merge branch 'main' into VCAM-27

Browse files
Files changed (2) hide show
  1. app.py +3 -2
  2. app.ver1.py +0 -72
app.py CHANGED
@@ -91,12 +91,13 @@ xvectors = {k: v for k, v in kaldiio.load_ark(xvector_ark)}
91
  spks = list(xvectors.keys())
92
 
93
  male_spks = {
94
- "Male1": "2300_131720",
95
  "Male2": "1320_122612",
 
96
  }
97
  # "M3": "1188_133604",
98
  # "M4": "61_70970",
99
- female_spks = {"Female1": "2961_961", "Female2": "8463_287645", }
100
  # "F3": "121_121726"
101
  spks = dict(male_spks, **female_spks)
102
  spk_names = sorted(spks.keys())
 
91
  spks = list(xvectors.keys())
92
 
93
  male_spks = {
94
+ "Male1": "260_123286",
95
  "Male2": "1320_122612",
96
+ "Male3": "672_122797"
97
  }
98
  # "M3": "1188_133604",
99
  # "M4": "61_70970",
100
+ female_spks = {"Female1": "5683_32865", "Female2": "121_131726", "Female3": "8463_287645"}
101
  # "F3": "121_121726"
102
  spks = dict(male_spks, **female_spks)
103
  spk_names = sorted(spks.keys())
app.ver1.py DELETED
@@ -1,72 +0,0 @@
1
- #TODO:
2
- # + [x] Load Configuration
3
- # + [ ] Checking
4
- # + [ ] Better saving directory
5
-
6
- from pathlib import Path
7
- from transformers import pipeline
8
- import torch.nn as nn
9
- import torch
10
- import torchaudio
11
- import gradio as gr
12
- import sys
13
-
14
- # Local imports
15
- sys.path.append("src")
16
- from espnet2.bin.tts_inference import Text2Speech
17
- from espnet2.utils.types import str_or_none
18
-
19
- # Check if GPU is available
20
- device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
21
-
22
- # ASR part
23
-
24
- data_path = "/home/kevingeng/Disk2/laronix/laronix_automos/data/20230103_video"
25
- audio_files = sorted(list(Path(data_path).glob("**/*wav")))
26
- # audio_files = sorted(list(Path("./data/Patient_sil_trim_16k_normed_5_snr_40/Rainbow").glob("**/*wav")))
27
-
28
- transcriber = pipeline("automatic-speech-recognition", model="KevinGeng/PAL_John_128_train_dev_test_seed_1")
29
-
30
- # TTS part
31
- def load_model(lang, tag, vocoder_tag):
32
- if lang == "Japanese":
33
- if tag == "kan-bayashi/ljspeech_parallel_wavegan":
34
- tts_model = Text2Speech.from_pretrained("kan-bayashi/ljspeech_parallel_wavegan")
35
- elif tag == "kan-bayashi/ljspeech_merlin_multi_band_melgan":
36
- tts_model = Text2Speech.from_pretrained("kan-bayashi/ljspeech_merlin_multi_band_melgan")
37
- else:
38
- raise ValueError(f"Not supported: lang={lang}, tag={tag}")
39
- vocoder = None if vocoder_tag == "none" else vocoder_tag
40
- elif lang == "English":
41
- # VITS needs no vocoder; others do
42
- if tag == "kan-bayashi/libritts_xvector_vits":
43
- tts_model = Text2Speech.from_pretrained("kan-bayashi/libritts_xvector_vits")
44
- vocoder = None
45
- elif tag == "kan-bayashi/fastspeech2_en_libritts_guessspeaker_melgan.v3":
46
- tts_model = Text2Speech.from_pretrained("kan-bayashi/fastspeech2_en_libritts_guessspeaker_melgan.v3")
47
- vocoder = "melgan"
48
- else:
49
- raise ValueError(f"Not supported: lang={lang}, tag={tag}")
50
- else:
51
- raise ValueError(f"Not supported: lang={lang}")
52
- return tts_model, vocoder
53
-
54
- tts_model, vocoder_tag = load_model(lang="English", tag="kan-bayashi/libritts_xvector_vits", vocoder_tag="parallel_wavegan/vctk_parallel_wavegan.v1.long")
55
- tts_model = tts_model.to(device)
56
-
57
- vocoder = None if vocoder_tag == "none" else torchaudio.models.vocoder.from_pretrained(vocoder_tag).to(device)
58
-
59
- # Gradio part
60
- def synthesize(text):
61
- with torch.no_grad():
62
- # Text-to-speech
63
- wav = tts_model(text)[0]
64
- if vocoder is not None:
65
- # Apply vocoder
66
- wav = vocoder.inference(wav)
67
- # Convert to numpy array
68
- wav = wav.squeeze().cpu().numpy()
69
- return wav
70
-
71
- interface = gr.Interface(synthesize, inputs="text", outputs="audio")
72
- interface.launch()