separate vocals, add progress bar, show selected audio

Files changed:
- app.py (+88 -49)
- audios/pica_pau_bolo_de_murango.m4a (+0 -0)
- requirements.txt (+1 -0)

app.py CHANGED
@@ -2,7 +2,7 @@ import subprocess, torch, os, traceback, sys, warnings, shutil, numpy as np
 from mega import Mega
 os.environ["no_proxy"] = "localhost, 127.0.0.1, ::1"
 import threading
-from time import
+from time import time
 from subprocess import Popen
 import datetime, requests
 now_dir = os.getcwd()
@@ -31,6 +31,8 @@ from vc_infer_pipeline import VC
 from config import Config
 
 from utils import load_audio, CSVutil
+import demucs.separate
+import audiosegment
 
 DoFormant = False
 Quefrency = 1.0
@@ -201,17 +203,33 @@ for root, dirs, files in os.walk(index_root, topdown=False):
         index_paths.append("%s/%s" % (root, name))
 
 def vc_single(
-
+    input_audio,
+    separate_vocals_bool,
+    progress = gr.Progress()
 ):
+    progress(0, desc="Preparando áudio...")
+    overlay_audios_bool = False
+    input_audio_path = input_audio
     global tgt_sr, net_g, vc, hubert_model, version
     if input_audio_path is None:
         return "You need to upload an audio", None
     try:
+        t1 = 0
+        t2 = 0
+        if (separate_vocals_bool):
+            t1 = time()
+            progress(0.1, desc="Separando vocais...")
+            path_to_separated_vocals = separate_vocals(input_audio_path)
+            if (path_to_separated_vocals):
+                input_audio_path = path_to_separated_vocals
+                overlay_audios_bool = True
+            t2 = time()
+        progress(0.2, desc="Carregando áudio...")
         audio = load_audio(input_audio_path, 16000, DoFormant, Quefrency, Timbre)
         audio_max = np.abs(audio).max() / 0.95
         if audio_max > 1:
             audio /= audio_max
-        times = [0, 0, 0]
+        times = [0, 0, 0, t2 - t1, 0]
         if hubert_model == None:
            load_hubert()
         if_f0 = cpt.get("f0", 1)
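The `progress = gr.Progress()` default argument uses Gradio's progress-tracking hook: when an event handler declares a gr.Progress parameter, Gradio injects a tracker at call time, and each progress(fraction, desc=...) call updates the progress bar shown over the output component. A minimal standalone sketch of the same pattern (hypothetical handler and component names, assuming a Gradio 3.x environment like this Space appears to use):

import time
import gradio as gr

def convert(audio_path, progress=gr.Progress()):
    progress(0, desc="Preparing...")     # drives the progress bar on the output
    time.sleep(1)                        # stand-in for real work
    progress(0.8, desc="Almost done...")
    return audio_path

with gr.Blocks() as demo:
    inp = gr.Audio(type="filepath")
    out = gr.Audio()
    gr.Button("Convert").click(convert, inputs=[inp], outputs=[out])

demo.launch()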
@@ -226,6 +244,7 @@ def vc_single(
                 .replace("trained", "added")
             )
         )
+        progress(0.3, desc="Gerando áudio...")
         audio_opt = vc.pipeline(
             hubert_model,
             net_g,
@@ -245,21 +264,22 @@ def vc_single(
             version,
             protect,
             crepe_hop_length,
+            progress,
             f0_file=None,
         )
+        progress(0.8, desc="Áudio convertido...")
         if resample_sr >= 16000 and tgt_sr != resample_sr:
             tgt_sr = resample_sr
-
-
-
-
-
-
-
-
-            times
-
-        ), (tgt_sr, audio_opt)
+        if (overlay_audios_bool):
+            t1 = time()
+            progress(0.9, desc="Juntando vocal e instrumental...")
+            (tgt_sr, audio_opt) = overlay_audios(tgt_sr, audio_opt, input_audio_path.replace("vocals", "no_vocals"))
+            remove_separated_files(input_audio_path)
+            t2 = time()
+            times[4] = t2 - t1
+        return {"visible": True, "__type__": "update", "value": "Áudio convertido com sucesso!\nTempo: %1fs" % (
+            sum(times),
+        )}, (tgt_sr, audio_opt)
     except:
         info = traceback.format_exc()
         print(info)
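Two details in this hunk are worth noting. The dict returned on success ({"visible": True, "__type__": "update", "value": ...}) is the raw form of a Gradio component update, letting one return value both set the textbox text and make it visible again after hide_output_text cleared it. Also, "%1fs" is a width-1 float specifier in Python, so it prints full precision; "%.1fs" would round the elapsed time to one decimal place. Passing progress positionally into vc.pipeline assumes the pipeline in vc_infer_pipeline.py was extended to accept it, which this diff does not show. A hedged sketch of the higher-level equivalent of the raw update dict (illustrative names only):

import gradio as gr

def report_success(elapsed_seconds):
    # gr.update(...) is the documented, higher-level way to express the same kind
    # of component update that vc_single builds by hand above.
    return gr.update(visible=True,
                     value="Áudio convertido com sucesso!\nTempo: %.1fs" % elapsed_seconds)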
@@ -269,13 +289,12 @@ def get_vc(sid):
     global n_spk, tgt_sr, net_g, vc, cpt, version
     if sid == "" or sid == []:
         global hubert_model
-        if hubert_model != None:
+        if hubert_model != None:
             print("clean_empty_cache")
             del net_g, n_spk, vc, hubert_model, tgt_sr # ,cpt
             hubert_model = net_g = n_spk = vc = hubert_model = tgt_sr = None
             if torch.cuda.is_available():
                 torch.cuda.empty_cache()
-        ###楼下不这么折腾清理不干净
     if_f0 = cpt.get("f0", 1)
     version = cpt.get("version", "v1")
     if version == "v1":
@@ -379,30 +398,6 @@ def save_to_wav2(dropbox):
     file_path=dropbox.name
     shutil.move(file_path,'./audios')
     return os.path.join('./audios',os.path.basename(file_path))
-
-
-def match_index(sid0):
-    folder=sid0.split(".")[0]
-    parent_dir="./logs/"+folder
-    if os.path.exists(parent_dir):
-        for filename in os.listdir(parent_dir):
-            if filename.endswith(".index"):
-                index_path=os.path.join(parent_dir,filename)
-                return index_path
-    else:
-        return ''
-
-
-def match_index(sid0):
-    folder=sid0.split(".")[0]
-    parent_dir="./logs/"+folder
-    if os.path.exists(parent_dir):
-        for filename in os.listdir(parent_dir):
-            if filename.endswith(".index"):
-                index_path=os.path.join(parent_dir,filename)
-                return index_path
-    else:
-        return ''
 
 def check_for_name():
     if len(names) > 0:
@@ -457,9 +452,52 @@ def download_from_youtube(url):
         pass
     filename = subprocess.getoutput(f'yt-dlp --print filename {url} --format m4a -o "./audios/%(title)s.%(ext)s"')
    subprocess.getoutput(f'yt-dlp {url} --format m4a -o "./audios/%(title)s.%(ext)s"')
-    if os.path.exists(filename
+    if os.path.exists(filename):
         return filename
 
+def find_vocals(root_directory, target_folder_name, file_name='vocals.wav'):
+    for root, dirs, files in os.walk(root_directory):
+        if target_folder_name in dirs:
+            folder_path = os.path.join(root, target_folder_name)
+            vocals_path = os.path.join(folder_path, file_name)
+            if os.path.exists(vocals_path):
+                return vocals_path
+    return None
+
+def separate_vocals(audio_path):
+    audio_name = audio_path[9:-4]
+    if (os.path.exists(audio_path) and audio_name):
+        demucs.separate.main(["--two-stems", "vocals", audio_path, "-o", './audios'])
+        vocals_path = find_vocals('./audios', audio_name)
+        if vocals_path:
+            return vocals_path
+    return None
+
+# aqui ainda não tá 100%
+def overlay_audios(sample_rate, np_array, accompaniment_path):
+    if (not os.path.exists(accompaniment_path)):
+        return (sample_rate, np_array)
+    sound1 = audiosegment.from_numpy_array(np_array, sample_rate)
+    sound2 = audiosegment.from_file(accompaniment_path)
+    overlay = sound1.overlay(sound2, position=0)
+    return (overlay.frame_rate, overlay.to_numpy_array())
+
+def remove_separated_files(vocals_path):
+    parent_dir = os.path.dirname(vocals_path)
+    try:
+        shutil.rmtree(parent_dir)
+        print(f"Deleted {parent_dir} folder and its contents")
+    except FileNotFoundError:
+        print(f"{parent_dir} folder not found")
+    except Exception as e:
+        print(f"An error occurred: {str(e)}")
+
+def hide_output_text():
+    return {"visible": False, "__type__": "update", "value": ""}
+
+def show_selected_audio(input_audio_path):
+    return input_audio_path
+
 css = """
 .padding {padding-left: 15px; padding-top: 5px;}
 """
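demucs.separate.main(["--two-stems", "vocals", audio_path, "-o", "./audios"]) runs Demucs the same way the CLI does: it writes vocals.wav and no_vocals.wav under ./audios/<model_name>/<track_name>/ (htdemucs on recent releases), which is why find_vocals walks the tree looking for a folder named after the track instead of using a fixed path. The audio_path[9:-4] slice assumes every input sits directly under "./audios/" (9 characters) and ends in a three-letter extension. The comment "# aqui ainda não tá 100%" ("this still isn't 100% right") flags overlay_audios, possibly because the converted vocal and the separated accompaniment can arrive with different sample rates or channel layouts. A small, hedged alternative for deriving the track name that does not depend on the prefix or the extension length (illustrative only, not part of the commit):

import os

def track_name_from_path(audio_path):
    # Same intent as audio_path[9:-4] above, but works for any folder prefix
    # and any extension length (.m4a, .wav, .flac, ...).
    return os.path.splitext(os.path.basename(audio_path))[0]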
@@ -476,7 +514,7 @@ with gr.Blocks(theme = gr.themes.Base(), title="Vocais da Loirinha 👱🏻
     with gr.Row().style(equal_height=True):
         with gr.Column():
             with gr.Row():
-                model_dropdown = gr.Dropdown(label="1.
+                model_dropdown = gr.Dropdown(label="1. Selecione a voz:", choices=sorted(names), value=check_for_name())
             if check_for_name() != '':
                 get_vc(sorted(names)[0])
             model_dropdown.change(
@@ -488,12 +526,12 @@ with gr.Blocks(theme = gr.themes.Base(), title="Vocais da Loirinha 👱🏻
             yt_link_textbox = gr.Textbox(label="Insira um link para uma música no Youtube:")
             download_yt_button = gr.Button("Baixar áudio do vídeo")
             dropbox = gr.File(label="OU selecione um arquivo:")
-            record_button = gr.Audio(source="microphone", label="
+            record_button = gr.Audio(source="microphone", label="OU grave o áudio:", type="filepath")
 
         with gr.Column():
             with gr.Row():
                 audio_dropdown = gr.Dropdown(
-                    label="3.
+                    label="3. Selecione o áudio",
                     value="",
                     choices=audio_files,
                     scale=1
@@ -507,16 +545,17 @@ with gr.Blocks(theme = gr.themes.Base(), title="Vocais da Loirinha 👱🏻
             record_button.change(fn=change_choices2, inputs=[], outputs=[audio_dropdown])
             refresh_button.click(fn=update_dropdowns, inputs=[], outputs=[model_dropdown, audio_dropdown])
             selected_audio = gr.Audio(label="Áudio selecionado", interactive=False)
+            audio_dropdown.select(show_selected_audio, inputs=[audio_dropdown], outputs=[selected_audio])
             separate_checkbox = gr.Checkbox(label="Separar vocais e instrumental",
-                info="
-            convert_button = gr.Button("
+                info="Marque esta opção quando o áudio selecionado NÃO tiver a voz isolada. Os vocais serão extraídos para a conversão e depois reintegrados ao áudio final com os instrumentais. ⚠️ O tempo de conversão pode aumentar significamente com essa opção ativada.")
+            convert_button = gr.Button("Gerar áudio", variant="primary")
             output_audio = gr.Audio(
-                label="
+                label="Áudio convertido (Clique nos três pontos para fazer o download)",
                 type='filepath',
                 interactive=False,
             )
-            output_audio_textbox = gr.Textbox(label="Resultado", interactive=False, placeholder="Nenhum áudio gerado.")
-            convert_button.click(vc_single, [audio_dropdown], [output_audio_textbox, output_audio])
+            output_audio_textbox = gr.Textbox(label="Resultado", interactive=False, visible=True, placeholder="Nenhum áudio gerado.")
+            convert_button.click(hide_output_text, outputs=[output_audio_textbox]).then(vc_single, [audio_dropdown, separate_checkbox], [output_audio_textbox, output_audio])
 
         with gr.TabItem("Adicione uma voz"):
             with gr.Column():
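The convert button is now wired as a two-step chain: .click() first runs hide_output_text to blank and hide the result textbox, and .then() only starts vc_single after that update has been applied, so a stale success message never sits next to a fresh conversion. The same wiring restated with keyword arguments for readability (assuming Gradio's .then() chaining, available in the 3.x series this Space uses):

convert_button.click(
    hide_output_text,
    outputs=[output_audio_textbox],                # clear/hide the old result first
).then(
    vc_single,
    inputs=[audio_dropdown, separate_checkbox],
    outputs=[output_audio_textbox, output_audio],  # status message + converted audio
)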
audios/pica_pau_bolo_de_murango.m4a DELETED
Binary file (64 kB)
requirements.txt CHANGED
@@ -21,3 +21,4 @@ onnxruntime
 pyngrok==4.1.12
 torch
 yt-dlp==2023.07.06
+audiosegment==0.23.0
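Note that app.py now imports both demucs.separate and audiosegment, but only audiosegment is pinned here; demucs is presumably already available through the Space's existing dependencies, since this diff does not show where it comes from. A quick check that makes the assumption explicit (illustrative snippet, not part of the commit):

# Both imports must resolve in the Space's runtime for the new code paths to work.
import audiosegment      # pinned above as audiosegment==0.23.0
import demucs.separate   # not pinned in this file; assumed to come from elsewhere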