ArkanDash committed
Commit 94baf6d
1 Parent(s): 7adb7ab

fix(spaces): separate gc code and ori code

Files changed (2)
  1. app-full.py +240 -0
  2. app.py +0 -54
app-full.py ADDED
@@ -0,0 +1,240 @@
+ import os
+ import json
+ import argparse
+ import traceback
+ import logging
+ import gradio as gr
+ import numpy as np
+ import librosa
+ import torch
+ import asyncio
+ import edge_tts
+ import yt_dlp
+ import ffmpeg
+ import subprocess
+ import sys
+ import io
+ import wave
+ from datetime import datetime
+ from fairseq import checkpoint_utils
+ from infer_pack.models import SynthesizerTrnMs256NSFsid, SynthesizerTrnMs256NSFsid_nono
+ from vc_infer_pipeline import VC
+ from config import (
+     is_half,
+     device
+ )
+ logging.getLogger("numba").setLevel(logging.WARNING)
+ limitation = os.getenv("SYSTEM") == "spaces"  # limit audio length in huggingface spaces
+
+ def create_vc_fn(tgt_sr, net_g, vc, if_f0, file_index, file_big_npy):
+     def vc_fn(
+         input_audio,
+         f0_up_key,
+         f0_method,
+         index_rate,
+         tts_mode,
+         tts_text,
+         tts_voice
+     ):
+         try:
+             if tts_mode:
+                 if len(tts_text) > 100 and limitation:
+                     return "Text is too long", None
+                 if tts_text is None or tts_voice is None:
+                     return "You need to enter text and select a voice", None
+                 asyncio.run(edge_tts.Communicate(tts_text, "-".join(tts_voice.split('-')[:-1])).save("tts.mp3"))
+                 audio, sr = librosa.load("tts.mp3", sr=16000, mono=True)
+             else:
+                 if args.files:
+                     audio, sr = librosa.load(input_audio, sr=16000, mono=True)
+                 else:
+                     if input_audio is None:
+                         return "You need to upload an audio", None
+                     sampling_rate, audio = input_audio
+                     duration = audio.shape[0] / sampling_rate
+                     if duration > 20 and limitation:
+                         return "Please upload an audio file that is less than 20 seconds. If you need to generate a longer audio file, please use Colab.", None
+                     audio = (audio / np.iinfo(audio.dtype).max).astype(np.float32)
+                     if len(audio.shape) > 1:
+                         audio = librosa.to_mono(audio.transpose(1, 0))
+                     if sampling_rate != 16000:
+                         audio = librosa.resample(audio, orig_sr=sampling_rate, target_sr=16000)
+             times = [0, 0, 0]
+             f0_up_key = int(f0_up_key)
+             audio_opt = vc.pipeline(
+                 hubert_model,
+                 net_g,
+                 0,
+                 audio,
+                 times,
+                 f0_up_key,
+                 f0_method,
+                 file_index,
+                 file_big_npy,
+                 index_rate,
+                 if_f0,
+             )
+             print(
+                 f"[{datetime.now().strftime('%Y-%m-%d %H:%M')}]: npy: {times[0]}, f0: {times[1]}s, infer: {times[2]}s"
+             )
+             return "Success", (tgt_sr, audio_opt)
+         except:
+             info = traceback.format_exc()
+             print(info)
+             return info, (None, None)
+     return vc_fn
+
+ def cut_vocal_and_inst(yt_url):
+     if yt_url != "":
+         if not os.path.exists("/content/youtube_audio"):
+             os.mkdir("/content/youtube_audio")
+         ydl_opts = {
+             'format': 'bestaudio/best',
+             'postprocessors': [{
+                 'key': 'FFmpegExtractAudio',
+                 'preferredcodec': 'wav',
+             }],
+             "outtmpl": '/content/youtube_audio/audio',
+         }
+         with yt_dlp.YoutubeDL(ydl_opts) as ydl:
+             ydl.download([yt_url])
+         yt_audio_path = "/content/youtube_audio/audio.wav"
+         command = f"demucs --two-stems=vocals {yt_audio_path}"
+         result = subprocess.run(command.split(), stdout=subprocess.PIPE)
+         print(result.stdout.decode())
+         return ("/content/rvc-models/separated/htdemucs/audio/vocals.wav", "/content/rvc-models/separated/htdemucs/audio/no_vocals.wav", yt_audio_path, "/content/rvc-models/separated/htdemucs/audio/vocals.wav")
+
+ def combine_vocal_and_inst(audio_data):
+     print(audio_data)
+     if not os.path.exists("/content/result"):
+         os.mkdir("/content/result")
+     vocal_path = "/content/result/output.wav"
+     inst_path = "/content/rvc-models/separated/htdemucs/audio/no_vocals.wav"
+     output_path = "/content/result/combine.mp3"
+     with wave.open(vocal_path, "w") as wave_file:
+         wave_file.setnchannels(1)
+         wave_file.setsampwidth(2)
+         wave_file.setframerate(audio_data[0])
+         wave_file.writeframes(audio_data[1].tobytes())
+     command = f'ffmpeg -y -i {inst_path} -i {vocal_path} -filter_complex [1:a]volume=6dB[v];[0:a][v]amix=inputs=2:duration=longest -b:a 320k -c:a libmp3lame {output_path}'
+     result = subprocess.run(command.split(), stdout=subprocess.PIPE)
+     return output_path
+
+ def load_hubert():
+     global hubert_model
+     models, _, _ = checkpoint_utils.load_model_ensemble_and_task(
+         ["hubert_base.pt"],
+         suffix="",
+     )
+     hubert_model = models[0]
+     hubert_model = hubert_model.to(device)
+     if is_half:
+         hubert_model = hubert_model.half()
+     else:
+         hubert_model = hubert_model.float()
+     hubert_model.eval()
+
+ def change_to_tts_mode(tts_mode):
+     if tts_mode:
+         return gr.Audio.update(visible=False), gr.Textbox.update(visible=True), gr.Dropdown.update(visible=True)
+     else:
+         return gr.Audio.update(visible=True), gr.Textbox.update(visible=False), gr.Dropdown.update(visible=False)
+
+ if __name__ == '__main__':
+     parser = argparse.ArgumentParser()
+     parser.add_argument('--api', action="store_true", default=False)
+     parser.add_argument("--share", action="store_true", default=False, help="share gradio app")
+     parser.add_argument("--files", action="store_true", default=False, help="load audio from path")
+     args, unknown = parser.parse_known_args()
+     load_hubert()
+     models = []
+     tts_voice_list = asyncio.get_event_loop().run_until_complete(edge_tts.list_voices())
+     voices = [f"{v['ShortName']}-{v['Gender']}" for v in tts_voice_list]
+     with open("weights/model_info.json", "r", encoding="utf-8") as f:
+         models_info = json.load(f)
+     for name, info in models_info.items():
+         if not info['enable']:
+             continue
+         title = info['title']
+         author = info.get("author", None)
+         cover = f"weights/{name}/{info['cover']}"
+         index = f"weights/{name}/{info['feature_retrieval_library']}"
+         npy = f"weights/{name}/{info['feature_file']}"
+         cpt = torch.load(f"weights/{name}/{name}.pth", map_location="cpu")
+         tgt_sr = cpt["config"][-1]
+         cpt["config"][-3] = cpt["weight"]["emb_g.weight"].shape[0]  # n_spk
+         if_f0 = cpt.get("f0", 1)
+         if if_f0 == 1:
+             net_g = SynthesizerTrnMs256NSFsid(*cpt["config"], is_half=is_half)
+         else:
+             net_g = SynthesizerTrnMs256NSFsid_nono(*cpt["config"])
+         del net_g.enc_q
+         print(net_g.load_state_dict(cpt["weight"], strict=False))  # without this line the unused weights are not cleaned up properly, oddly enough
+         net_g.eval().to(device)
+         if is_half:
+             net_g = net_g.half()
+         else:
+             net_g = net_g.float()
+         vc = VC(tgt_sr, device, is_half)
+         models.append((name, title, author, cover, create_vc_fn(tgt_sr, net_g, vc, if_f0, index, npy)))
+     with gr.Blocks() as app:
+         gr.Markdown(
+             "# <center> RVC Models\n"
+             "## <center> The input audio should be clean and pure voice without background music.\n"
+             "![visitor badge](https://visitor-badge.glitch.me/badge?page_id=ArkanDash.Rvc-Models)\n\n"
+             "[![image](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1hx6kKvIuv5XNY1Gai2PEuZhpO5z6xpVh?usp=sharing)\n\n"
+             "[![Original Repo](https://badgen.net/badge/icon/github?icon=github&label=Original%20Repo)](https://github.com/RVC-Project/Retrieval-based-Voice-Conversion-WebUI)"
+         )
+         with gr.Tabs():
+             for (name, title, author, cover, vc_fn) in models:
+                 with gr.TabItem(name):
+                     with gr.Row():
+                         gr.Markdown(
+                             '<div align="center">'
+                             f'<div>{title}</div>\n'+
+                             (f'<div>Model author: {author}</div>' if author else "")+
+                             (f'<img style="width:auto;height:300px;" src="file/{cover}">' if cover else "")+
+                             '</div>'
+                         )
+                     with gr.Row():
+                         if args.files:
+                             with gr.Column():
+                                 vc_youtube = gr.Textbox(label="Youtube URL")
+                                 vc_convert = gr.Button("Convert", variant="primary")
+                                 vc_vocal_preview = gr.Audio(label="Vocal Preview")
+                                 vc_inst_preview = gr.Audio(label="Instrumental Preview")
+                                 vc_audio_preview = gr.Audio(label="Audio Preview")
+                         with gr.Column():
+                             if args.files:
+                                 vc_input = gr.Textbox(label="Input audio path")
+                             else:
+                                 vc_input = gr.Audio(label="Input audio"+' (less than 20 seconds)' if limitation else '')
+                             vc_transpose = gr.Number(label="Transpose", value=0)
+                             vc_f0method = gr.Radio(
+                                 label="Pitch extraction algorithm, PM is fast but Harvest is better for low frequencies",
+                                 choices=["pm", "harvest"],
+                                 value="pm",
+                                 interactive=True,
+                             )
+                             vc_index_ratio = gr.Slider(
+                                 minimum=0,
+                                 maximum=1,
+                                 label="Retrieval feature ratio",
+                                 value=0.6,
+                                 interactive=True,
+                             )
+                             tts_mode = gr.Checkbox(label="tts (use edge-tts as input)", value=False)
+                             tts_text = gr.Textbox(visible=False, label="TTS text (100 words limitation)" if limitation else "TTS text")
+                             tts_voice = gr.Dropdown(label="Edge-tts speaker", choices=voices, visible=False, allow_custom_value=False, value="en-US-AnaNeural-Female")
+                             vc_submit = gr.Button("Generate", variant="primary")
+                         with gr.Column():
+                             vc_output1 = gr.Textbox(label="Output Message")
+                             vc_output2 = gr.Audio(label="Output Audio")
+                             vc_combine = gr.Button("Combine", variant="primary")
+                             vc_outputCombine = gr.Audio(label="Output Combined Audio")
+                     vc_submit.click(vc_fn, [vc_input, vc_transpose, vc_f0method, vc_index_ratio, tts_mode, tts_text, tts_voice], [vc_output1, vc_output2])
+                     tts_mode.change(change_to_tts_mode, [tts_mode], [vc_input, tts_text, tts_voice])
+                     if args.files:
+                         vc_convert.click(cut_vocal_and_inst, vc_youtube, [vc_vocal_preview, vc_inst_preview, vc_audio_preview, vc_input])
+                         vc_combine.click(combine_vocal_and_inst, vc_output2, vc_outputCombine)
+     app.queue(concurrency_count=1, max_size=20, api_open=args.api).launch(share=args.share)
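
For reference, a rough standalone sketch (an assumption, not code from this commit) of how the Colab-only flow in app-full.py chains together when the app is launched with --files: yt-dlp downloads the track, demucs splits vocals from instrumental, the RVC vc_fn converts the vocal, and ffmpeg remixes the result. It assumes the helpers above are in scope and that vc_fn is one of the functions produced by create_vc_fn; the YouTube URL is a placeholder.

# Hedged usage sketch (assumption): run inside app-full.py after the models
# have been loaded, with --files enabled so vc_fn accepts a file path.
vocal_path, inst_path, mix_path, vc_input_path = cut_vocal_and_inst(
    "https://www.youtube.com/watch?v=PLACEHOLDER"  # placeholder URL
)
# Convert the separated vocal: transpose 0, "pm" pitch extraction,
# retrieval ratio 0.6, TTS mode disabled.
message, (sr, converted_vocal) = vc_fn(vc_input_path, 0, "pm", 0.6, False, None, None)
print(message)
# Write the converted vocal to /content/result/output.wav and remix it
# with the demucs instrumental via ffmpeg.
result_mp3 = combine_vocal_and_inst((sr, converted_vocal))
print(result_mp3)
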
app.py CHANGED
@@ -9,12 +9,6 @@ import librosa
  import torch
  import asyncio
  import edge_tts
- import yt_dlp
- import ffmpeg
- import subprocess
- import sys
- import io
- import wave
  from datetime import datetime
  from fairseq import checkpoint_utils
  from infer_pack.models import SynthesizerTrnMs256NSFsid, SynthesizerTrnMs256NSFsid_nono
@@ -84,42 +78,6 @@ def create_vc_fn(tgt_sr, net_g, vc, if_f0, file_index, file_big_npy):
              return info, (None, None)
      return vc_fn

- def cut_vocal_and_inst(yt_url):
-     if yt_url != "":
-         if not os.path.exists("/content/youtube_audio"):
-             os.mkdir("/content/youtube_audio")
-         ydl_opts = {
-             'format': 'bestaudio/best',
-             'postprocessors': [{
-                 'key': 'FFmpegExtractAudio',
-                 'preferredcodec': 'wav',
-             }],
-             "outtmpl": '/content/youtube_audio/audio',
-         }
-         with yt_dlp.YoutubeDL(ydl_opts) as ydl:
-             ydl.download([yt_url])
-         yt_audio_path = "/content/youtube_audio/audio.wav"
-         command = f"demucs --two-stems=vocals {yt_audio_path}"
-         result = subprocess.run(command.split(), stdout=subprocess.PIPE)
-         print(result.stdout.decode())
-         return ("/content/rvc-models/separated/htdemucs/audio/vocals.wav", "/content/rvc-models/separated/htdemucs/audio/no_vocals.wav", yt_audio_path, "/content/rvc-models/separated/htdemucs/audio/vocals.wav")
-
- def combine_vocal_and_inst(audio_data):
-     print(audio_data)
-     if not os.path.exists("/content/result"):
-         os.mkdir("/content/result")
-     vocal_path = "/content/result/output.wav"
-     inst_path = "/content/rvc-models/separated/htdemucs/audio/no_vocals.wav"
-     output_path = "/content/result/combine.mp3"
-     with wave.open(vocal_path, "w") as wave_file:
-         wave_file.setnchannels(1)
-         wave_file.setsampwidth(2)
-         wave_file.setframerate(audio_data[0])
-         wave_file.writeframes(audio_data[1].tobytes())
-     command = f'ffmpeg -y -i {inst_path} -i {vocal_path} -filter_complex [1:a]volume=6dB[v];[0:a][v]amix=inputs=2:duration=longest -b:a 320k -c:a libmp3lame {output_path}'
-     result = subprocess.run(command.split(), stdout=subprocess.PIPE)
-     return output_path
-
  def load_hubert():
      global hubert_model
      models, _, _ = checkpoint_utils.load_model_ensemble_and_task(
@@ -197,13 +155,6 @@ if __name__ == '__main__':
                              '</div>'
                          )
                      with gr.Row():
-                         if args.files:
-                             with gr.Column():
-                                 vc_youtube = gr.Textbox(label="Youtube URL")
-                                 vc_convert = gr.Button("Convert", variant="primary")
-                                 vc_vocal_preview = gr.Audio(label="Vocal Preview")
-                                 vc_inst_preview = gr.Audio(label="Instrumental Preview")
-                                 vc_audio_preview = gr.Audio(label="Audio Preview")
                          with gr.Column():
                              if args.files:
                                  vc_input = gr.Textbox(label="Input audio path")
@@ -230,11 +181,6 @@ if __name__ == '__main__':
                          with gr.Column():
                              vc_output1 = gr.Textbox(label="Output Message")
                              vc_output2 = gr.Audio(label="Output Audio")
-                             vc_combine = gr.Button("Combine", variant="primary")
-                             vc_outputCombine = gr.Audio(label="Output Combined Audio")
                      vc_submit.click(vc_fn, [vc_input, vc_transpose, vc_f0method, vc_index_ratio, tts_mode, tts_text, tts_voice], [vc_output1, vc_output2])
                      tts_mode.change(change_to_tts_mode, [tts_mode], [vc_input, tts_text, tts_voice])
-                     if args.files:
-                         vc_convert.click(cut_vocal_and_inst, vc_youtube, [vc_vocal_preview, vc_inst_preview, vc_audio_preview, vc_input])
-                         vc_combine.click(combine_vocal_and_inst, vc_output2, vc_outputCombine)
      app.queue(concurrency_count=1, max_size=20, api_open=args.api).launch(share=args.share)