Vijish committed on
Commit
e78ae8f
·
verified ·
1 Parent(s): 0869928

Delete v.py

Browse files
Files changed (1) hide show
  1. v.py +0 -263
v.py DELETED
@@ -1,263 +0,0 @@
1
- import asyncio
2
- import datetime
3
- import logging
4
- import os
5
- import time
6
- import traceback
7
- import tempfile
8
- import concurrent.futures
9
-
10
- import edge_tts
11
- import librosa
12
- import torch
13
- from fairseq import checkpoint_utils
14
- import uuid
15
-
16
- from config import Config
17
- from lib.infer_pack.models import (
18
- SynthesizerTrnMs256NSFsid,
19
- SynthesizerTrnMs256NSFsid_nono,
20
- SynthesizerTrnMs768NSFsid,
21
- SynthesizerTrnMs768NSFsid_nono,
22
- )
23
- from rmvpe import RMVPE
24
- from vc_infer_pipeline import VC
25
-
26
# Raise the threshold of chatty third-party loggers to WARNING.
for _noisy_logger_name in ("fairseq", "numba", "markdown_it", "urllib3", "matplotlib"):
    logging.getLogger(_noisy_logger_name).setLevel(logging.WARNING)
del _noisy_logger_name  # keep the module namespace free of the loop variable
32
-
33
# True only when the SYSTEM environment variable is exactly "spaces".
limitation = os.environ.get("SYSTEM") == "spaces"

config = Config()

# Edge TTS
# NOTE(review): this runs a coroutine at import time; asyncio.get_event_loop()
# without a running loop is deprecated in recent Python versions - confirm the
# target interpreter before changing it.
tts_voice_list = asyncio.get_event_loop().run_until_complete(
    edge_tts.list_voices()
)
tts_voices = ["mn-MN-BataaNeural", "mn-MN-YesuiNeural"]  # Specific voices

# RVC models: every sub-directory of `model_root`, in sorted order.
model_root = "weights"
models = sorted(
    entry
    for entry in os.listdir(model_root)
    if os.path.isdir(f"{model_root}/{entry}")
)
45
-
46
def get_unique_filename(extension):
    """Return a file name made of a random UUID4, a dot and ``extension``."""
    stem = uuid.uuid4()
    return "{}.{}".format(stem, extension)
48
-
49
-
50
def model_data(model_name):
    """Load the checkpoint stored under ``model_root/model_name``.

    The first ``.pth`` file in the model directory is loaded, a synthesizer
    matching the checkpoint's ``version`` / ``f0`` entries is built from it,
    and the first ``.index`` file (if any) is located.

    Returns:
        Tuple ``(tgt_sr, net_g, vc, version, index_file, if_f0)``;
        ``index_file`` is ``""`` when the directory holds no ``.index`` file.

    Raises:
        IndexError: the model directory contains no ``.pth`` file.
        ValueError: the checkpoint's version is neither ``"v1"`` nor ``"v2"``.
    """
    model_dir = f"{model_root}/{model_name}"

    def _paths_ending_with(suffix):
        # Paths of the directory entries whose name ends with `suffix`.
        return [
            f"{model_dir}/{entry}"
            for entry in os.listdir(model_dir)
            if entry.endswith(suffix)
        ]

    checkpoint_path = _paths_ending_with(".pth")[0]
    print(f"Loading {checkpoint_path}")
    # NOTE(review): torch.load unpickles the file - only load trusted checkpoints.
    cpt = torch.load(checkpoint_path, map_location="cpu")

    synth_args = cpt["config"]
    tgt_sr = synth_args[-1]
    synth_args[-3] = cpt["weight"]["emb_g.weight"].shape[0]  # n_spk
    if_f0 = cpt.get("f0", 1)
    version = cpt.get("version", "v1")

    if version not in ("v1", "v2"):
        raise ValueError("Unknown version")
    is_v2 = version == "v2"

    if if_f0 == 1:
        synth_cls = SynthesizerTrnMs768NSFsid if is_v2 else SynthesizerTrnMs256NSFsid
        net_g = synth_cls(*synth_args, is_half=config.is_half)
    else:
        synth_cls = (
            SynthesizerTrnMs768NSFsid_nono if is_v2 else SynthesizerTrnMs256NSFsid_nono
        )
        net_g = synth_cls(*synth_args)

    # enc_q is dropped before the weights are loaded; strict=False tolerates
    # the resulting key mismatch (presumably enc_q is unused here - confirm).
    del net_g.enc_q
    net_g.load_state_dict(cpt["weight"], strict=False)
    print("Model loaded")
    net_g.eval().to(config.device)
    net_g = net_g.half() if config.is_half else net_g.float()
    vc = VC(tgt_sr, config)

    index_candidates = _paths_ending_with(".index")
    if index_candidates:
        index_file = index_candidates[0]
        print(f"Index file found: {index_file}")
    else:
        print("No index file found")
        index_file = ""

    return tgt_sr, net_g, vc, version, index_file, if_f0
97
-
98
-
99
def load_hubert():
    """Load ``hubert_base.pt`` and return the model in eval mode.

    The model is moved to ``config.device`` and cast to half precision when
    ``config.is_half`` is set, otherwise to float.
    """
    ensemble, _, _ = checkpoint_utils.load_model_ensemble_and_task(
        ["hubert_base.pt"],
        suffix="",
    )
    hubert = ensemble[0].to(config.device)
    hubert = hubert.half() if config.is_half else hubert.float()
    return hubert.eval()
111
-
112
-
113
def get_model_names():
    """Return the names of the sub-directories found under ``weights``."""
    model_root = "weights"  # Assuming this is where your models are stored
    names = []
    for entry in os.listdir(model_root):
        if os.path.isdir(f"{model_root}/{entry}"):
            names.append(entry)
    return names
116
-
117
-
118
async def tts(
    model_name,
    tts_text,
    tts_voice,
    index_rate,
    use_uploaded_voice,
    uploaded_voice,
):
    """Convert speech with the RVC model ``model_name``.

    The input speech is either synthesized from ``tts_text`` with Edge TTS or
    taken from ``uploaded_voice``.

    Args:
        model_name: Model directory name, passed to ``model_data``.
        tts_text: Text to synthesize (used when ``use_uploaded_voice`` is falsy).
        tts_voice: Edge TTS voice name.
        index_rate: Forwarded unchanged to ``vc.pipeline``.
        use_uploaded_voice: When truthy, convert ``uploaded_voice`` instead of
            synthesizing ``tts_text``.
        uploaded_voice: Raw bytes of the uploaded audio, or ``None``.

    Returns:
        On success the 4-tuple
        ``(info, edge_mp3_path_or_None, (sample_rate, audio), edge_output_filename)``.
        On any failure the 3-tuple ``(message, None, None)``.
        NOTE(review): the two arities differ, and ``edge_output_filename`` is
        returned even when no Edge TTS file was written (uploaded-voice path);
        kept as-is for caller compatibility - confirm against the caller.
    """
    # Limits enforced only when `limitation` is set. The messages below are
    # built from these values so they cannot drift from the actual checks.
    max_text_chars = 12000
    max_audio_seconds = 20000

    # Default values for parameters used in EdgeTTS
    speed = 0  # Default speech speed
    f0_up_key = 0  # Default pitch adjustment
    f0_method = "rmvpe"  # Default pitch extraction method
    protect = 0.33  # Default protect value
    filter_radius = 3
    resample_sr = 0
    rms_mix_rate = 0.25
    edge_time = 0  # Stays 0 when Edge TTS is not invoked

    edge_output_filename = get_unique_filename("mp3")
    uploaded_file_path = None  # set once the upload has been spooled to disk

    try:
        if use_uploaded_voice:
            if uploaded_voice is None:
                return "No voice file uploaded.", None, None

            # Spool the upload to disk so librosa can read it by path.
            with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp_file:
                tmp_file.write(uploaded_voice)
                uploaded_file_path = tmp_file.name

            audio, sr = librosa.load(uploaded_file_path, sr=16000, mono=True)
        else:
            # EdgeTTS processing
            if limitation and len(tts_text) > max_text_chars:
                return (
                    f"Text characters should be at most {max_text_chars} in this huggingface space, but got {len(tts_text)} characters.",
                    None,
                    None,
                )

            # Invoke Edge TTS
            t0 = time.time()
            speed_str = f"+{speed}%" if speed >= 0 else f"{speed}%"
            await edge_tts.Communicate(
                tts_text, tts_voice, rate=speed_str
            ).save(edge_output_filename)
            edge_time = time.time() - t0

            audio, sr = librosa.load(edge_output_filename, sr=16000, mono=True)

        # Common processing after loading the audio
        duration = len(audio) / sr
        print(f"Audio duration: {duration}s")
        if limitation and duration >= max_audio_seconds:
            return (
                f"Audio should be less than {max_audio_seconds} seconds in this huggingface space, but got {duration}s.",
                None,
                None,
            )

        tgt_sr, net_g, vc, version, index_file, if_f0 = model_data(model_name)

        # Setup for RMVPE or other pitch extraction methods
        if f0_method == "rmvpe":
            vc.model_rmvpe = rmvpe_model

        # Perform voice conversion pipeline
        times = [0, 0, 0]
        audio_opt = vc.pipeline(
            hubert_model,
            net_g,
            0,
            audio,
            edge_output_filename if not use_uploaded_voice else uploaded_file_path,
            times,
            f0_up_key,
            f0_method,
            index_file,
            index_rate,
            if_f0,
            filter_radius,
            tgt_sr,
            resample_sr,
            rms_mix_rate,
            version,
            protect,
            None,
        )

        if tgt_sr != resample_sr and resample_sr >= 16000:
            tgt_sr = resample_sr

        info = f"Success. Time: tts: {edge_time}s, npy: {times[0]}s, f0: {times[1]}s, infer: {times[2]}s"
        print(info)
        return (
            info,
            edge_output_filename if not use_uploaded_voice else None,
            (tgt_sr, audio_opt),
            edge_output_filename,
        )

    except EOFError:
        info = (
            "output not valid. This may occur when input text and speaker do not match."
        )
        print(info)
        return info, None, None
    except Exception as e:
        # Top-level boundary: report the failure to the caller as a message.
        traceback_info = traceback.format_exc()
        print(traceback_info)
        return str(e), None, None
    finally:
        # The spooled upload is never handed back to the caller, so remove it
        # here instead of leaking one temp file per call. Best-effort cleanup.
        if uploaded_file_path is not None:
            try:
                os.remove(uploaded_file_path)
            except OSError:
                pass
231
-
232
-
233
# Display label -> Edge TTS voice name.
voice_mapping = dict(
    [
        ("Mongolian Male", "mn-MN-BataaNeural"),
        ("Mongolian Female", "mn-MN-YesuiNeural"),
    ]
)

# Models loaded once at import time and shared by every tts() call.
hubert_model = load_hubert()
rmvpe_model = RMVPE("rmvpe.pt", config.is_half, config.device)
242
-
243
-
244
async def process_files_concurrently(files, model_name, tts_text, tts_voice, index_rate, use_uploaded_voice):
    """Run ``tts`` once per entry of ``files`` and return the results in order.

    Args:
        files: Iterable of file-like objects; each is ``read()`` only when
            ``use_uploaded_voice`` is truthy.
        model_name, tts_text, tts_voice, index_rate, use_uploaded_voice:
            Forwarded unchanged to every ``tts`` call.

    Returns:
        A list with one ``tts`` result per file, in the order of ``files``
        (empty when ``files`` is empty).

    The coroutines are awaited directly on the running event loop. Scheduling
    them with ``asyncio.ensure_future`` from a thread-pool worker fails because
    that thread has no event loop, and would at best hand back Task objects
    instead of results. The synchronous stages inside ``tts`` still run one at
    a time; calls overlap only at its await points.
    """
    jobs = [
        tts(
            model_name,
            tts_text,
            tts_voice,
            index_rate,
            use_uploaded_voice,
            file.read() if use_uploaded_voice else None,
        )
        for file in files
    ]
    return await asyncio.gather(*jobs)
263
-