smjain committed on
Commit
4a14f7f
1 Parent(s): 064a6d0

Create new file

Files changed (1): app.py (+151, −0)
app.py ADDED

import os
os.system("pip install gradio==2.9b24")

import gradio as gr
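# Note: the runtime "pip install" above pins a Gradio 2.x build because the
# gr.inputs.* API used at the bottom of this file changed in later releases.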

vocoder_url = 'https://bj.bcebos.com/v1/ai-studio-online/e46d52315a504f1fa520528582a8422b6fa7006463844b84b8a2c3d21cc314db?/Vocoder.zip'
models_url = 'https://bj.bcebos.com/v1/ai-studio-online/6c081f29caad483ebd4cded087ee6ddbfc8dca8fb89d4ab69d44253ce5525e32?/Models.zip'

from io import BytesIO
from zipfile import ZipFile
from urllib.request import urlopen

if not (os.path.isdir('Vocoder') and os.path.isdir('Models')):
    for url in [vocoder_url, models_url]:
        resp = urlopen(url)
        zipfile = ZipFile(BytesIO(resp.read()))
        zipfile.extractall()
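# The pretrained archives are fetched once and unpacked into ./Vocoder and
# ./Models; the isdir() check skips the download on warm restarts.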

import random
import yaml
from munch import Munch
import numpy as np
import paddle
from paddle import nn
import paddle.nn.functional as F
import paddleaudio
import librosa

from starganv2vc_paddle.Utils.JDC.model import JDCNet
from starganv2vc_paddle.models import Generator, MappingNetwork, StyleEncoder

speakers = [225,228,229,230,231,233,236,239,240,244,226,227,232,243,254,256,258,259,270,273]

to_mel = paddleaudio.features.MelSpectrogram(
    n_mels=80, n_fft=2048, win_length=1200, hop_length=300)
to_mel.fbank_matrix[:] = paddle.load('starganv2vc_paddle/fbank_matrix.pd')['fbank_matrix']
mean, std = -4, 4
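# The saved fbank_matrix overwrites paddleaudio's mel basis, apparently so it
# matches the filterbank used at training time; mean/std are the log-mel
# normalization constants the pretrained checkpoints expect.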

def preprocess(wave):
    wave_tensor = paddle.to_tensor(wave).astype(paddle.float32)
    mel_tensor = to_mel(wave_tensor)
    mel_tensor = (paddle.log(1e-5 + mel_tensor.unsqueeze(0)) - mean) / std
    return mel_tensor
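# preprocess() maps a 1-D waveform to a normalized log-mel tensor of shape
# [1, 80, T]; callers add a channel axis via unsqueeze(1) before the conv nets.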

def build_model(model_params={}):
    args = Munch(model_params)
    generator = Generator(args.dim_in, args.style_dim, args.max_conv_dim, w_hpf=args.w_hpf, F0_channel=args.F0_channel)
    mapping_network = MappingNetwork(args.latent_dim, args.style_dim, args.num_domains, hidden_dim=args.max_conv_dim)
    style_encoder = StyleEncoder(args.dim_in, args.style_dim, args.num_domains, args.max_conv_dim)

    nets_ema = Munch(generator=generator,
                     mapping_network=mapping_network,
                     style_encoder=style_encoder)

    return nets_ema
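# The Munch bundles the three inference-time networks: the generator converts
# mel spectrograms, the mapping network samples a style from a latent code,
# and the style encoder extracts a style from reference audio.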

def compute_style(speaker_dicts):
    reference_embeddings = {}
    for key, (path, speaker) in speaker_dicts.items():
        if path == "":
            label = paddle.to_tensor([speaker], dtype=paddle.int64)
            latent_dim = starganv2.mapping_network.shared[0].weight.shape[0]
            ref = starganv2.mapping_network(paddle.randn([1, latent_dim]), label)
        else:
            wave, sr = librosa.load(path, sr=24000)
            wave, _ = librosa.effects.trim(wave, top_db=30)
            mel_tensor = preprocess(wave)

            with paddle.no_grad():
                label = paddle.to_tensor([speaker], dtype=paddle.int64)
                ref = starganv2.style_encoder(mel_tensor.unsqueeze(1), label)
        reference_embeddings[key] = (ref, label)

    return reference_embeddings
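# compute_style returns {key: (style_vector, label)}. An empty path samples a
# style from the mapping network; otherwise the style encoder embeds a
# silence-trimmed reference clip for that speaker.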

F0_model = JDCNet(num_class=1, seq_len=192)
params = paddle.load("Models/bst.pd")['net']
F0_model.set_state_dict(params)
_ = F0_model.eval()
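# JDCNet is the pretrained pitch (F0) extractor; its features condition the
# generator so conversion stays F0-consistent with the source speech.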

from yacs.config import CfgNode
from paddlespeech.t2s.models.parallel_wavegan import PWGGenerator

with open('Vocoder/config.yml') as f:
    voc_config = CfgNode(yaml.safe_load(f))
voc_config["generator_params"].pop("upsample_net")
voc_config["generator_params"]["upsample_scales"] = voc_config["generator_params"].pop("upsample_params")["upsample_scales"]
vocoder = PWGGenerator(**voc_config["generator_params"])
vocoder.remove_weight_norm()
vocoder.eval()
vocoder.set_state_dict(paddle.load('Vocoder/checkpoint-400000steps.pd'))
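# The config appears to have been written for another Parallel WaveGAN
# implementation, so the nested "upsample_params" entry is flattened into the
# "upsample_scales" argument that PaddleSpeech's PWGGenerator takes.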

model_path = 'Models/vc_ema.pd'

with open('Models/config.yml') as f:
    starganv2_config = yaml.safe_load(f)
starganv2 = build_model(model_params=starganv2_config["model_params"])
params = paddle.load(model_path)['model_ema']
_ = [starganv2[key].set_state_dict(params[key]) for key in starganv2]
_ = [starganv2[key].eval() for key in starganv2]
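# Munch also behaves like a dict, so iterating starganv2 walks the generator,
# mapping_network, and style_encoder, loading the EMA weights into each and
# switching them all to eval mode.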

# Compute speakers' styles under the Demo directory
speaker_dicts = {}
selected_speakers = [273, 259, 258, 243, 254, 244, 236, 233, 230, 228]
for s in selected_speakers:
    speaker_dicts['p' + str(s)] = ('Demo/VCTK-corpus/p' + str(s) + '/p' + str(s) + '_023.wav', speakers.index(s))

reference_embeddings = compute_style(speaker_dicts)
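# e.g. speaker_dicts['p273'] == ('Demo/VCTK-corpus/p273/p273_023.wav', 19),
# where 19 is p273's position in the speakers list above.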

examples = [['Demo/VCTK-corpus/p243/p243_023.wav', 'p236'], ['Demo/VCTK-corpus/p236/p236_023.wav', 'p243']]

def app(wav_path, speaker_id):
    audio, _ = librosa.load(wav_path, sr=24000)
    audio = audio / np.max(np.abs(audio))
    audio = audio.astype(np.float32)
    source = preprocess(audio)
    ref = reference_embeddings[speaker_id][0]

    with paddle.no_grad():
        f0_feat = F0_model.get_feature_GAN(source.unsqueeze(1))
        out = starganv2.generator(source.unsqueeze(1), ref, F0=f0_feat)

        c = out.transpose([0,1,3,2]).squeeze()
        y_out = vocoder.inference(c)
        y_out = y_out.reshape([-1])

    return (24000, y_out.numpy())
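# Gradio's "audio" output accepts a (sample_rate, numpy_array) tuple, so the
# converted waveform is returned at the model's 24 kHz rate.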

title = "StarGANv2 Voice Conversion"
description = "Gradio demo for StarGANv2 voice conversion, built with PaddlePaddle."

iface = gr.Interface(app,
                     [gr.inputs.Audio(source="microphone", type="filepath"),
                      gr.inputs.Radio(list(speaker_dicts.keys()), type="value", default='p228', label='speaker id')],
                     "audio", title=title, description=description, examples=examples)

iface.launch()
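# launch() serves the demo on Gradio's default local address
# (http://127.0.0.1:7860) unless overridden.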