Spaces:
Running
Running
feat(gcolab): youtube feature
Browse files
app.py
CHANGED
@@ -9,6 +9,12 @@ import librosa
|
|
9 |
import torch
|
10 |
import asyncio
|
11 |
import edge_tts
|
|
|
|
|
|
|
|
|
|
|
|
|
12 |
from datetime import datetime
|
13 |
from fairseq import checkpoint_utils
|
14 |
from infer_pack.models import SynthesizerTrnMs256NSFsid, SynthesizerTrnMs256NSFsid_nono
|
@@ -78,6 +84,42 @@ def create_vc_fn(tgt_sr, net_g, vc, if_f0, file_index, file_big_npy):
|
|
78 |
return info, (None, None)
|
79 |
return vc_fn
|
80 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
81 |
def load_hubert():
|
82 |
global hubert_model
|
83 |
models, _, _ = checkpoint_utils.load_model_ensemble_and_task(
|
@@ -142,7 +184,6 @@ if __name__ == '__main__':
|
|
142 |
"![visitor badge](https://visitor-badge.glitch.me/badge?page_id=ArkanDash.Rvc-Models)\n\n"
|
143 |
"[![image](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1hx6kKvIuv5XNY1Gai2PEuZhpO5z6xpVh?usp=sharing)\n\n"
|
144 |
"[![Original Repo](https://badgen.net/badge/icon/github?icon=github&label=Original%20Repo)](https://github.com/RVC-Project/Retrieval-based-Voice-Conversion-WebUI)"
|
145 |
-
|
146 |
)
|
147 |
with gr.Tabs():
|
148 |
for (name, title, author, cover, vc_fn) in models:
|
@@ -156,6 +197,13 @@ if __name__ == '__main__':
|
|
156 |
'</div>'
|
157 |
)
|
158 |
with gr.Row():
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
159 |
with gr.Column():
|
160 |
if args.files:
|
161 |
vc_input = gr.Textbox(label="Input audio path")
|
@@ -182,6 +230,11 @@ if __name__ == '__main__':
|
|
182 |
with gr.Column():
|
183 |
vc_output1 = gr.Textbox(label="Output Message")
|
184 |
vc_output2 = gr.Audio(label="Output Audio")
|
|
|
|
|
185 |
vc_submit.click(vc_fn, [vc_input, vc_transpose, vc_f0method, vc_index_ratio, tts_mode, tts_text, tts_voice], [vc_output1, vc_output2])
|
186 |
tts_mode.change(change_to_tts_mode, [tts_mode], [vc_input, tts_text, tts_voice])
|
|
|
|
|
|
|
187 |
app.queue(concurrency_count=1, max_size=20, api_open=args.api).launch(share=args.share)
|
|
|
9 |
import torch
|
10 |
import asyncio
|
11 |
import edge_tts
|
12 |
+
import yt_dlp
|
13 |
+
import ffmpeg
|
14 |
+
import subprocess
|
15 |
+
import sys
|
16 |
+
import io
|
17 |
+
import wave
|
18 |
from datetime import datetime
|
19 |
from fairseq import checkpoint_utils
|
20 |
from infer_pack.models import SynthesizerTrnMs256NSFsid, SynthesizerTrnMs256NSFsid_nono
|
|
|
84 |
return info, (None, None)
|
85 |
return vc_fn
|
86 |
|
87 |
+
def cut_vocal_and_inst(yt_url):
    """Download a YouTube video's audio and split it into vocal/instrumental stems.

    Fetches the best audio track with yt-dlp (extracted to WAV), then runs
    demucs in two-stems mode to separate vocals from the instrumental.

    Parameters
    ----------
    yt_url : str or None
        YouTube URL entered in the Gradio textbox.

    Returns
    -------
    tuple
        (vocals_path, instrumental_path, original_audio_path, vc_input_path)
        for the four bound Gradio outputs; (None, None, None, None) when the
        URL is empty/blank so the outputs clear instead of erroring on a
        bare ``None`` return (the original implicitly returned ``None``).
    """
    # Guard clause: one explicit None per bound output component.
    if not yt_url or not yt_url.strip():
        return None, None, None, None

    # exist_ok avoids the check-then-create race of os.path.exists + mkdir.
    os.makedirs("/content/youtube_audio", exist_ok=True)

    ydl_opts = {
        'format': 'bestaudio/best',
        'postprocessors': [{
            'key': 'FFmpegExtractAudio',
            'preferredcodec': 'wav',
        }],
        # yt-dlp appends the codec extension, producing .../audio.wav.
        "outtmpl": '/content/youtube_audio/audio',
    }
    with yt_dlp.YoutubeDL(ydl_opts) as ydl:
        ydl.download([yt_url])

    yt_audio_path = "/content/youtube_audio/audio.wav"
    # Explicit argv list (no shell, no .split()) — safe even if the path
    # ever contains spaces. demucs writes stems under
    # ./separated/htdemucs/<name>/ relative to the CWD (presumably
    # /content/rvc-models on this Colab — TODO confirm).
    result = subprocess.run(
        ["demucs", "--two-stems=vocals", yt_audio_path],
        stdout=subprocess.PIPE,
    )
    print(result.stdout.decode())
    if result.returncode != 0:
        # Surface the failure instead of silently returning missing paths.
        print(f"demucs exited with code {result.returncode}; stems may be missing")

    vocals_path = "/content/rvc-models/separated/htdemucs/audio/vocals.wav"
    inst_path = "/content/rvc-models/separated/htdemucs/audio/no_vocals.wav"
    # vocals path is returned twice: once for the preview player, once as
    # the voice-conversion input textbox value.
    return vocals_path, inst_path, yt_audio_path, vocals_path
|
106 |
+
|
107 |
+
def combine_vocal_and_inst(audio_data):
    """Mix the converted vocal track with the separated instrumental stem.

    Writes the converted vocals (a Gradio audio tuple) to a WAV file, then
    shells out to ffmpeg to mix it with the demucs instrumental into a
    320 kbps MP3.

    Parameters
    ----------
    audio_data : tuple or None
        (sample_rate, samples) as produced by the ``gr.Audio`` output.
        Samples are assumed mono 16-bit int — TODO confirm against
        ``vc_fn``'s output format.

    Returns
    -------
    str or None
        Path of the combined MP3, or None when no converted audio exists
        yet (the original crashed with TypeError in that case).
    """
    print(audio_data)
    # Guard: the Combine button can be pressed before any conversion ran,
    # in which case the gr.Audio output component holds None.
    if audio_data is None:
        return None

    # exist_ok avoids the check-then-create race of os.path.exists + mkdir.
    os.makedirs("/content/result", exist_ok=True)
    vocal_path = "/content/result/output.wav"
    inst_path = "/content/rvc-models/separated/htdemucs/audio/no_vocals.wav"
    output_path = "/content/result/combine.mp3"

    sample_rate, samples = audio_data
    with wave.open(vocal_path, "w") as wave_file:
        wave_file.setnchannels(1)   # mono
        wave_file.setsampwidth(2)   # 16-bit PCM
        wave_file.setframerate(sample_rate)
        wave_file.writeframes(samples.tobytes())

    # Explicit argv list (shell=False) instead of f-string + .split():
    # safe for paths containing spaces and immune to shell injection.
    result = subprocess.run(
        ["ffmpeg", "-y", "-i", inst_path, "-i", vocal_path,
         "-filter_complex", "amix=inputs=2:duration=longest",
         "-b:a", "320k", "-c:a", "libmp3lame", output_path],
        stdout=subprocess.PIPE,
    )
    if result.returncode != 0:
        # Surface the failure instead of silently returning a stale path.
        print(f"ffmpeg exited with code {result.returncode}")
    return output_path
|
122 |
+
|
123 |
def load_hubert():
|
124 |
global hubert_model
|
125 |
models, _, _ = checkpoint_utils.load_model_ensemble_and_task(
|
|
|
184 |
"![visitor badge](https://visitor-badge.glitch.me/badge?page_id=ArkanDash.Rvc-Models)\n\n"
|
185 |
"[![image](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1hx6kKvIuv5XNY1Gai2PEuZhpO5z6xpVh?usp=sharing)\n\n"
|
186 |
"[![Original Repo](https://badgen.net/badge/icon/github?icon=github&label=Original%20Repo)](https://github.com/RVC-Project/Retrieval-based-Voice-Conversion-WebUI)"
|
|
|
187 |
)
|
188 |
with gr.Tabs():
|
189 |
for (name, title, author, cover, vc_fn) in models:
|
|
|
197 |
'</div>'
|
198 |
)
|
199 |
with gr.Row():
|
200 |
+
if args.files:
|
201 |
+
with gr.Column():
|
202 |
+
vc_youtube = gr.Textbox(label="Youtube URL")
|
203 |
+
vc_convert = gr.Button("Convert", variant="primary")
|
204 |
+
vc_vocal_preview = gr.Audio(label="Vocal Preview")
|
205 |
+
vc_inst_preview = gr.Audio(label="Instrumental Preview")
|
206 |
+
vc_audio_preview = gr.Audio(label="Audio Preview")
|
207 |
with gr.Column():
|
208 |
if args.files:
|
209 |
vc_input = gr.Textbox(label="Input audio path")
|
|
|
230 |
with gr.Column():
|
231 |
vc_output1 = gr.Textbox(label="Output Message")
|
232 |
vc_output2 = gr.Audio(label="Output Audio")
|
233 |
+
vc_combine = gr.Button("Combine",variant="primary")
|
234 |
+
vc_outputCombine = gr.Audio(label="Output Combined Audio")
|
235 |
vc_submit.click(vc_fn, [vc_input, vc_transpose, vc_f0method, vc_index_ratio, tts_mode, tts_text, tts_voice], [vc_output1, vc_output2])
|
236 |
tts_mode.change(change_to_tts_mode, [tts_mode], [vc_input, tts_text, tts_voice])
|
237 |
+
if args.files:
|
238 |
+
vc_convert.click(cut_vocal_and_inst, vc_youtube, [vc_vocal_preview, vc_inst_preview, vc_audio_preview, vc_input])
|
239 |
+
vc_combine.click(combine_vocal_and_inst, vc_output2, vc_outputCombine)
|
240 |
app.queue(concurrency_count=1, max_size=20, api_open=args.api).launch(share=args.share)
|