Spaces:
Running
Running
Hmm...
Browse files
app.py
CHANGED
@@ -407,12 +407,22 @@ with gr.Blocks(
|
|
407 |
with gr.Tab("Roformer"):
|
408 |
with gr.Group():
|
409 |
with gr.Row():
|
410 |
-
roformer_model = gr.Dropdown(value="MelBand Roformer Kim | Big Beta 5e FT by unwa", label="Select the Model", choices=list(ROFORMER_MODELS.keys()))
|
411 |
-
|
412 |
-
|
413 |
-
|
414 |
-
|
415 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
416 |
with gr.Row():
|
417 |
roformer_audio = gr.Audio(label="Input Audio", type="filepath")
|
418 |
with gr.Row():
|
@@ -424,12 +434,22 @@ with gr.Blocks(
|
|
424 |
with gr.Tab("MDX23C"):
|
425 |
with gr.Group():
|
426 |
with gr.Row():
|
427 |
-
mdx23c_model = gr.Dropdown(value="MDX23C-InstVoc HQ", label="Select the Model", choices=list(MDX23C_MODELS.keys()))
|
428 |
-
|
429 |
-
|
430 |
-
|
431 |
-
|
432 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
433 |
with gr.Row():
|
434 |
mdx23c_audio = gr.Audio(label="Input Audio", type="filepath")
|
435 |
with gr.Row():
|
@@ -441,12 +461,23 @@ with gr.Blocks(
|
|
441 |
with gr.Tab("MDX-NET"):
|
442 |
with gr.Group():
|
443 |
with gr.Row():
|
444 |
-
mdx_model = gr.Dropdown(value="UVR-MDX-NET Inst HQ 5", label="Select the Model", choices=list(MDXNET_MODELS.keys()))
|
445 |
-
|
446 |
-
|
447 |
-
|
448 |
-
|
449 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
450 |
with gr.Row():
|
451 |
mdx_audio = gr.Audio(label="Input Audio", type="filepath")
|
452 |
with gr.Row():
|
@@ -458,14 +489,27 @@ with gr.Blocks(
|
|
458 |
with gr.Tab("VR ARCH"):
|
459 |
with gr.Group():
|
460 |
with gr.Row():
|
461 |
-
vr_model = gr.Dropdown(value="1_HP-UVR", label="Select the Model", choices=list(VR_ARCH_MODELS.keys()))
|
462 |
-
|
463 |
-
|
464 |
-
|
465 |
-
|
466 |
-
|
467 |
-
|
468 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
469 |
with gr.Row():
|
470 |
vr_audio = gr.Audio(label="Input Audio", type="filepath")
|
471 |
with gr.Row():
|
@@ -477,12 +521,22 @@ with gr.Blocks(
|
|
477 |
with gr.Tab("Demucs"):
|
478 |
with gr.Group():
|
479 |
with gr.Row():
|
480 |
-
demucs_model = gr.Dropdown(value="htdemucs_6s", label="Select the Model", choices=list(DEMUCS_MODELS.keys()))
|
481 |
-
|
482 |
-
|
483 |
-
|
484 |
-
|
485 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
486 |
with gr.Row():
|
487 |
demucs_audio = gr.Audio(label="Input Audio", type="filepath")
|
488 |
with gr.Row():
|
@@ -498,17 +552,10 @@ with gr.Blocks(
|
|
498 |
demucs_stem6 = gr.Audio(label="Stem 6", type="filepath", interactive=False)
|
499 |
|
500 |
with gr.Tab("Settings"):
|
501 |
-
with gr.
|
502 |
-
|
503 |
-
|
504 |
-
|
505 |
-
output_dir = gr.Textbox(value="output", label="File output directory", info="The directory where output files will be saved.", placeholder="output")
|
506 |
-
output_format = gr.Dropdown(value="wav", choices=["wav", "flac", "mp3"], label="Output Format", info="The format of the output audio file.")
|
507 |
-
with gr.Row():
|
508 |
-
norm_threshold = gr.Slider(minimum=0.1, maximum=1, step=0.1, value=0.9, label="Normalization threshold", info="The threshold for audio normalization.")
|
509 |
-
amp_threshold = gr.Slider(minimum=0.0, maximum=1, step=0.1, value=0.0, label="Amplification threshold", info="The threshold for audio amplification.")
|
510 |
-
with gr.Row():
|
511 |
-
batch_size = gr.Slider(minimum=1, maximum=16, step=1, value=1, label="Batch Size", info="Larger consumes more RAM but may process slightly faster.")
|
512 |
|
513 |
with gr.Accordion("Rename Stems", open=False):
|
514 |
gr.Markdown(
|
@@ -569,10 +616,10 @@ with gr.Blocks(
|
|
569 |
roformer_pitch_shift,
|
570 |
model_file_dir,
|
571 |
output_dir,
|
572 |
-
|
573 |
-
|
574 |
-
|
575 |
-
|
576 |
vocals_stem,
|
577 |
instrumental_stem,
|
578 |
other_stem,
|
@@ -597,10 +644,10 @@ with gr.Blocks(
|
|
597 |
mdx23c_pitch_shift,
|
598 |
model_file_dir,
|
599 |
output_dir,
|
600 |
-
|
601 |
-
|
602 |
-
|
603 |
-
|
604 |
vocals_stem,
|
605 |
instrumental_stem,
|
606 |
other_stem,
|
@@ -625,10 +672,10 @@ with gr.Blocks(
|
|
625 |
mdx_denoise,
|
626 |
model_file_dir,
|
627 |
output_dir,
|
628 |
-
|
629 |
-
|
630 |
-
|
631 |
-
|
632 |
vocals_stem,
|
633 |
instrumental_stem,
|
634 |
other_stem,
|
@@ -655,10 +702,10 @@ with gr.Blocks(
|
|
655 |
vr_high_end_process,
|
656 |
model_file_dir,
|
657 |
output_dir,
|
658 |
-
|
659 |
-
|
660 |
-
|
661 |
-
|
662 |
vocals_stem,
|
663 |
instrumental_stem,
|
664 |
other_stem,
|
@@ -683,9 +730,9 @@ with gr.Blocks(
|
|
683 |
demucs_segments_enabled,
|
684 |
model_file_dir,
|
685 |
output_dir,
|
686 |
-
|
687 |
-
|
688 |
-
|
689 |
vocals_stem,
|
690 |
instrumental_stem,
|
691 |
other_stem,
|
|
|
407 |
with gr.Tab("Roformer"):
|
408 |
with gr.Group():
|
409 |
with gr.Row():
|
410 |
+
roformer_model = gr.Dropdown(value="MelBand Roformer Kim | Big Beta 5e FT by unwa", label="Select the Model", choices=list(ROFORMER_MODELS.keys()), scale=3)
|
411 |
+
roformer_output_format = gr.Dropdown(value="wav", choices=["wav", "flac", "mp3"], label="Output Format", info="The format of the output audio file.", scale=1)
|
412 |
+
with gr.Accordion("Advanced settings", open=False):
|
413 |
+
with gr.Column():
|
414 |
+
with gr.Group():
|
415 |
+
roformer_override_seg_size = gr.Checkbox(value=False, label="Override segment size", info="Override model default segment size instead of using the model default value.")
|
416 |
+
roformer_seg_size = gr.Slider(minimum=32, maximum=4000, step=32, value=256, label="Segment Size", info="Larger consumes more resources, but may give better results.")
|
417 |
+
with gr.Group():
|
418 |
+
with gr.Row():
|
419 |
+
roformer_overlap = gr.Slider(minimum=2, maximum=10, step=1, value=8, label="Overlap", info="Amount of overlap between prediction windows. Lower is better but slower.")
|
420 |
+
roformer_pitch_shift = gr.Slider(minimum=-24, maximum=24, step=1, value=0, label="Pitch shift", info="Shift audio pitch by a number of semitones while processing. May improve output for deep/high vocals.")
|
421 |
+
with gr.Group():
|
422 |
+
with gr.Row():
|
423 |
+
roformer_batch_size = gr.Slider(minimum=1, maximum=16, step=1, value=1, label="Batch Size", info="Larger consumes more RAM but may process slightly faster.")
|
424 |
+
roformer_norm_threshold = gr.Slider(minimum=0.1, maximum=1, step=0.1, value=0.9, label="Normalization threshold", info="The threshold for audio normalization.")
|
425 |
+
roformer_amp_threshold = gr.Slider(minimum=0.0, maximum=1, step=0.1, value=0.0, label="Amplification threshold", info="The threshold for audio amplification.")
|
426 |
with gr.Row():
|
427 |
roformer_audio = gr.Audio(label="Input Audio", type="filepath")
|
428 |
with gr.Row():
|
|
|
434 |
with gr.Tab("MDX23C"):
|
435 |
with gr.Group():
|
436 |
with gr.Row():
|
437 |
+
mdx23c_model = gr.Dropdown(value="MDX23C-InstVoc HQ", label="Select the Model", choices=list(MDX23C_MODELS.keys()), scale=3)
|
438 |
+
mdx23c_output_format = gr.Dropdown(value="wav", choices=["wav", "flac", "mp3"], label="Output Format", info="The format of the output audio file.", scale=1)
|
439 |
+
with gr.Accordion("Advanced settings", open=False):
|
440 |
+
with gr.Column():
|
441 |
+
with gr.Group():
|
442 |
+
mdx23c_override_seg_size = gr.Checkbox(value=False, label="Override segment size", info="Override model default segment size instead of using the model default value.")
|
443 |
+
mdx23c_seg_size = gr.Slider(minimum=32, maximum=4000, step=32, value=256, label="Segment Size", info="Larger consumes more resources, but may give better results.")
|
444 |
+
with gr.Group():
|
445 |
+
with gr.Row():
|
446 |
+
mdx23c_overlap = gr.Slider(minimum=2, maximum=50, step=1, value=8, label="Overlap", info="Amount of overlap between prediction windows. Higher is better but slower.")
|
447 |
+
mdx23c_pitch_shift = gr.Slider(minimum=-24, maximum=24, step=1, value=0, label="Pitch shift", info="Shift audio pitch by a number of semitones while processing. May improve output for deep/high vocals.")
|
448 |
+
with gr.Group():
|
449 |
+
with gr.Row():
|
450 |
+
mdx23c_batch_size = gr.Slider(minimum=1, maximum=16, step=1, value=1, label="Batch Size", info="Larger consumes more RAM but may process slightly faster.")
|
451 |
+
mdx23c_norm_threshold = gr.Slider(minimum=0.1, maximum=1, step=0.1, value=0.9, label="Normalization threshold", info="The threshold for audio normalization.")
|
452 |
+
mdx23c_amp_threshold = gr.Slider(minimum=0.0, maximum=1, step=0.1, value=0.0, label="Amplification threshold", info="The threshold for audio amplification.")
|
453 |
with gr.Row():
|
454 |
mdx23c_audio = gr.Audio(label="Input Audio", type="filepath")
|
455 |
with gr.Row():
|
|
|
461 |
with gr.Tab("MDX-NET"):
|
462 |
with gr.Group():
|
463 |
with gr.Row():
|
464 |
+
mdx_model = gr.Dropdown(value="UVR-MDX-NET Inst HQ 5", label="Select the Model", choices=list(MDXNET_MODELS.keys()), scale=3)
|
465 |
+
mdx_output_format = gr.Dropdown(value="wav", choices=["wav", "flac", "mp3"], label="Output Format", info="The format of the output audio file.", scale=1)
|
466 |
+
with gr.Accordion("Advanced settings", open=False):
|
467 |
+
with gr.Column():
|
468 |
+
with gr.Group():
|
469 |
+
with gr.Row():
|
470 |
+
mdx_hop_length = gr.Slider(minimum=32, maximum=2048, step=32, value=1024, label="Hop Length", info="Usually called stride in neural networks; only change if you know what you're doing.")
|
471 |
+
mdx_seg_size = gr.Slider(minimum=32, maximum=4000, step=32, value=256, label="Segment Size", info="Larger consumes more resources, but may give better results.")
|
472 |
+
with gr.Group():
|
473 |
+
with gr.Row():
|
474 |
+
mdx_overlap = gr.Slider(minimum=0.001, maximum=0.999, step=0.001, value=0.25, label="Overlap", info="Amount of overlap between prediction windows. Higher is better but slower.")
|
475 |
+
mdx_denoise = gr.Checkbox(value=False, label="Denoise", info="Enable denoising after separation.")
|
476 |
+
with gr.Group():
|
477 |
+
with gr.Row():
|
478 |
+
mdx_batch_size = gr.Slider(minimum=1, maximum=16, step=1, value=1, label="Batch Size", info="Larger consumes more RAM but may process slightly faster.")
|
479 |
+
mdx_norm_threshold = gr.Slider(minimum=0.1, maximum=1, step=0.1, value=0.9, label="Normalization threshold", info="The threshold for audio normalization.")
|
480 |
+
mdx_amp_threshold = gr.Slider(minimum=0.0, maximum=1, step=0.1, value=0.0, label="Amplification threshold", info="The threshold for audio amplification.")
|
481 |
with gr.Row():
|
482 |
mdx_audio = gr.Audio(label="Input Audio", type="filepath")
|
483 |
with gr.Row():
|
|
|
489 |
with gr.Tab("VR ARCH"):
|
490 |
with gr.Group():
|
491 |
with gr.Row():
|
492 |
+
vr_model = gr.Dropdown(value="1_HP-UVR", label="Select the Model", choices=list(VR_ARCH_MODELS.keys()), scale=3)
|
493 |
+
vr_output_format = gr.Dropdown(value="wav", choices=["wav", "flac", "mp3"], label="Output Format", info="The format of the output audio file.", scale=1)
|
494 |
+
with gr.Accordion("Advanced settings", open=False):
|
495 |
+
with gr.Column():
|
496 |
+
with gr.Group():
|
497 |
+
with gr.Row():
|
498 |
+
vr_window_size = gr.Slider(minimum=320, maximum=1024, step=32, value=512, label="Window Size", info="Balance quality and speed. 1024 = faster but lower quality, 320 = slower but better quality.")
|
499 |
+
vr_aggression = gr.Slider(minimum=1, maximum=100, step=1, value=5, label="Aggression", info="Intensity of primary stem extraction.")
|
500 |
+
with gr.Group():
|
501 |
+
with gr.Column():
|
502 |
+
vr_post_process = gr.Checkbox(value=False, label="Post Process", info="Identify leftover artifacts within vocal output; may improve separation for some songs.")
|
503 |
+
vr_post_process_threshold = gr.Slider(minimum=0.1, maximum=0.3, step=0.1, value=0.2, label="Post Process Threshold", info="Threshold for post-processing.")
|
504 |
+
with gr.Group():
|
505 |
+
with gr.Row():
|
506 |
+
vr_tta = gr.Checkbox(value=False, label="TTA", info="Enable Test-Time-Augmentation; slow but improves quality.")
|
507 |
+
vr_high_end_process = gr.Checkbox(value=False, label="High End Process", info="Mirror the missing frequency range of the output.")
|
508 |
+
with gr.Group():
|
509 |
+
with gr.Row():
|
510 |
+
vr_batch_size = gr.Slider(minimum=1, maximum=16, step=1, value=1, label="Batch Size", info="Larger consumes more RAM but may process slightly faster.")
|
511 |
+
vr_norm_threshold = gr.Slider(minimum=0.1, maximum=1, step=0.1, value=0.9, label="Normalization threshold", info="The threshold for audio normalization.")
|
512 |
+
vr_amp_threshold = gr.Slider(minimum=0.0, maximum=1, step=0.1, value=0.0, label="Amplification threshold", info="The threshold for audio amplification.")
|
513 |
with gr.Row():
|
514 |
vr_audio = gr.Audio(label="Input Audio", type="filepath")
|
515 |
with gr.Row():
|
|
|
521 |
with gr.Tab("Demucs"):
|
522 |
with gr.Group():
|
523 |
with gr.Row():
|
524 |
+
demucs_model = gr.Dropdown(value="htdemucs_6s", label="Select the Model", choices=list(DEMUCS_MODELS.keys()), scale=3)
|
525 |
+
demucs_output_format = gr.Dropdown(value="wav", choices=["wav", "flac", "mp3"], label="Output Format", info="The format of the output audio file.", scale=1)
|
526 |
+
with gr.Accordion("Advanced settings", open=False):
|
527 |
+
with gr.Column():
|
528 |
+
with gr.Group():
|
529 |
+
with gr.Row():
|
530 |
+
demucs_seg_size = gr.Slider(minimum=1, maximum=100, step=1, value=40, label="Segment Size", info="Size of segments into which the audio is split. Higher = slower but better quality.")
|
531 |
+
demucs_shifts = gr.Slider(minimum=0, maximum=20, step=1, value=2, label="Shifts", info="Number of predictions with random shifts, higher = slower but better quality.")
|
532 |
+
with gr.Group():
|
533 |
+
with gr.Row():
|
534 |
+
demucs_overlap = gr.Slider(minimum=0.001, maximum=0.999, step=0.001, value=0.25, label="Overlap", info="Overlap between prediction windows. Higher = slower but better quality.")
|
535 |
+
demucs_segments_enabled = gr.Checkbox(value=True, label="Segment-wise processing", info="Enable segment-wise processing.")
|
536 |
+
with gr.Group():
|
537 |
+
with gr.Row():
|
538 |
+
demucs_norm_threshold = gr.Slider(minimum=0.1, maximum=1, step=0.1, value=0.9, label="Normalization threshold", info="The threshold for audio normalization.")
|
539 |
+
demucs_amp_threshold = gr.Slider(minimum=0.0, maximum=1, step=0.1, value=0.0, label="Amplification threshold", info="The threshold for audio amplification.")
|
540 |
with gr.Row():
|
541 |
demucs_audio = gr.Audio(label="Input Audio", type="filepath")
|
542 |
with gr.Row():
|
|
|
552 |
demucs_stem6 = gr.Audio(label="Stem 6", type="filepath", interactive=False)
|
553 |
|
554 |
with gr.Tab("Settings"):
|
555 |
+
with gr.Group():
|
556 |
+
with gr.Row():
|
557 |
+
model_file_dir = gr.Textbox(value="/tmp/PolUVR-models/", label="Directory to cache model files", info="The directory where model files are stored.", placeholder="/tmp/PolUVR-models/")
|
558 |
+
output_dir = gr.Textbox(value="output", label="File output directory", info="The directory where output files will be saved.", placeholder="output")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
559 |
|
560 |
with gr.Accordion("Rename Stems", open=False):
|
561 |
gr.Markdown(
|
|
|
616 |
roformer_pitch_shift,
|
617 |
model_file_dir,
|
618 |
output_dir,
|
619 |
+
roformer_output_format,
|
620 |
+
roformer_norm_threshold,
|
621 |
+
roformer_amp_threshold,
|
622 |
+
roformer_batch_size,
|
623 |
vocals_stem,
|
624 |
instrumental_stem,
|
625 |
other_stem,
|
|
|
644 |
mdx23c_pitch_shift,
|
645 |
model_file_dir,
|
646 |
output_dir,
|
647 |
+
mdx23c_output_format,
|
648 |
+
mdx23c_norm_threshold,
|
649 |
+
mdx23c_amp_threshold,
|
650 |
+
mdx23c_batch_size,
|
651 |
vocals_stem,
|
652 |
instrumental_stem,
|
653 |
other_stem,
|
|
|
672 |
mdx_denoise,
|
673 |
model_file_dir,
|
674 |
output_dir,
|
675 |
+
mdx_output_format,
|
676 |
+
mdx_norm_threshold,
|
677 |
+
mdx_amp_threshold,
|
678 |
+
mdx_batch_size,
|
679 |
vocals_stem,
|
680 |
instrumental_stem,
|
681 |
other_stem,
|
|
|
702 |
vr_high_end_process,
|
703 |
model_file_dir,
|
704 |
output_dir,
|
705 |
+
vr_output_format,
|
706 |
+
vr_norm_threshold,
|
707 |
+
vr_amp_threshold,
|
708 |
+
vr_batch_size,
|
709 |
vocals_stem,
|
710 |
instrumental_stem,
|
711 |
other_stem,
|
|
|
730 |
demucs_segments_enabled,
|
731 |
model_file_dir,
|
732 |
output_dir,
|
733 |
+
demucs_output_format,
|
734 |
+
demucs_norm_threshold,
|
735 |
+
demucs_amp_threshold,
|
736 |
vocals_stem,
|
737 |
instrumental_stem,
|
738 |
other_stem,
|