Politrees committed on
Commit
69836a2
·
verified ·
1 Parent(s): 2b7c9e8
Files changed (1) hide show
  1. app.py +109 -62
app.py CHANGED
@@ -407,12 +407,22 @@ with gr.Blocks(
407
  with gr.Tab("Roformer"):
408
  with gr.Group():
409
  with gr.Row():
410
- roformer_model = gr.Dropdown(value="MelBand Roformer Kim | Big Beta 5e FT by unwa", label="Select the Model", choices=list(ROFORMER_MODELS.keys()))
411
- with gr.Row():
412
- roformer_seg_size = gr.Slider(minimum=32, maximum=4000, step=32, value=256, label="Segment Size", info="Larger consumes more resources, but may give better results.")
413
- roformer_override_seg_size = gr.Checkbox(value=False, label="Override segment size", info="Override model default segment size instead of using the model default value.")
414
- roformer_overlap = gr.Slider(minimum=2, maximum=10, step=1, value=8, label="Overlap", info="Amount of overlap between prediction windows. Lower is better but slower.")
415
- roformer_pitch_shift = gr.Slider(minimum=-24, maximum=24, step=1, value=0, label="Pitch shift", info="Shift audio pitch by a number of semitones while processing. may improve output for deep/high vocals.")
 
 
 
 
 
 
 
 
 
 
416
  with gr.Row():
417
  roformer_audio = gr.Audio(label="Input Audio", type="filepath")
418
  with gr.Row():
@@ -424,12 +434,22 @@ with gr.Blocks(
424
  with gr.Tab("MDX23C"):
425
  with gr.Group():
426
  with gr.Row():
427
- mdx23c_model = gr.Dropdown(value="MDX23C-InstVoc HQ", label="Select the Model", choices=list(MDX23C_MODELS.keys()))
428
- with gr.Row():
429
- mdx23c_seg_size = gr.Slider(minimum=32, maximum=4000, step=32, value=256, label="Segment Size", info="Larger consumes more resources, but may give better results.")
430
- mdx23c_override_seg_size = gr.Checkbox(value=False, label="Override segment size", info="Override model default segment size instead of using the model default value.")
431
- mdx23c_overlap = gr.Slider(minimum=2, maximum=50, step=1, value=8, label="Overlap", info="Amount of overlap between prediction windows. Higher is better but slower.")
432
- mdx23c_pitch_shift = gr.Slider(minimum=-24, maximum=24, step=1, value=0, label="Pitch shift", info="Shift audio pitch by a number of semitones while processing. may improve output for deep/high vocals.")
 
 
 
 
 
 
 
 
 
 
433
  with gr.Row():
434
  mdx23c_audio = gr.Audio(label="Input Audio", type="filepath")
435
  with gr.Row():
@@ -441,12 +461,23 @@ with gr.Blocks(
441
  with gr.Tab("MDX-NET"):
442
  with gr.Group():
443
  with gr.Row():
444
- mdx_model = gr.Dropdown(value="UVR-MDX-NET Inst HQ 5", label="Select the Model", choices=list(MDXNET_MODELS.keys()))
445
- with gr.Row():
446
- mdx_hop_length = gr.Slider(minimum=32, maximum=2048, step=32, value=1024, label="Hop Length", info="Usually called stride in neural networks; only change if you know what you're doing.")
447
- mdx_seg_size = gr.Slider(minimum=32, maximum=4000, step=32, value=256, label="Segment Size", info="Larger consumes more resources, but may give better results.")
448
- mdx_overlap = gr.Slider(minimum=0.001, maximum=0.999, step=0.001, value=0.25, label="Overlap", info="Amount of overlap between prediction windows. Higher is better but slower.")
449
- mdx_denoise = gr.Checkbox(value=False, label="Denoise", info="Enable denoising after separation.")
 
 
 
 
 
 
 
 
 
 
 
450
  with gr.Row():
451
  mdx_audio = gr.Audio(label="Input Audio", type="filepath")
452
  with gr.Row():
@@ -458,14 +489,27 @@ with gr.Blocks(
458
  with gr.Tab("VR ARCH"):
459
  with gr.Group():
460
  with gr.Row():
461
- vr_model = gr.Dropdown(value="1_HP-UVR", label="Select the Model", choices=list(VR_ARCH_MODELS.keys()))
462
- with gr.Row():
463
- vr_window_size = gr.Slider(minimum=320, maximum=1024, step=32, value=512, label="Window Size", info="Balance quality and speed. 1024 = fast but lower, 320 = slower but better quality.")
464
- vr_aggression = gr.Slider(minimum=1, maximum=100, step=1, value=5, label="Agression", info="Intensity of primary stem extraction.")
465
- vr_tta = gr.Checkbox(value=False, label="TTA", info="Enable Test-Time-Augmentation; slow but improves quality.")
466
- vr_post_process = gr.Checkbox(value=False, label="Post Process", info="Identify leftover artifacts within vocal output; may improve separation for some songs.")
467
- vr_post_process_threshold = gr.Slider(minimum=0.1, maximum=0.3, step=0.1, value=0.2, label="Post Process Threshold", info="Threshold for post-processing.")
468
- vr_high_end_process = gr.Checkbox(value=False, label="High End Process", info="Mirror the missing frequency range of the output.")
 
 
 
 
 
 
 
 
 
 
 
 
 
469
  with gr.Row():
470
  vr_audio = gr.Audio(label="Input Audio", type="filepath")
471
  with gr.Row():
@@ -477,12 +521,22 @@ with gr.Blocks(
477
  with gr.Tab("Demucs"):
478
  with gr.Group():
479
  with gr.Row():
480
- demucs_model = gr.Dropdown(value="htdemucs_6s", label="Select the Model", choices=list(DEMUCS_MODELS.keys()))
481
- with gr.Row():
482
- demucs_seg_size = gr.Slider(minimum=1, maximum=100, step=1, value=40, label="Segment Size", info="Size of segments into which the audio is split. Higher = slower but better quality.")
483
- demucs_shifts = gr.Slider(minimum=0, maximum=20, step=1, value=2, label="Shifts", info="Number of predictions with random shifts, higher = slower but better quality.")
484
- demucs_overlap = gr.Slider(minimum=0.001, maximum=0.999, step=0.001, value=0.25, label="Overlap", info="Overlap between prediction windows. Higher = slower but better quality.")
485
- demucs_segments_enabled = gr.Checkbox(value=True, label="Segment-wise processing", info="Enable segment-wise processing.")
 
 
 
 
 
 
 
 
 
 
486
  with gr.Row():
487
  demucs_audio = gr.Audio(label="Input Audio", type="filepath")
488
  with gr.Row():
@@ -498,17 +552,10 @@ with gr.Blocks(
498
  demucs_stem6 = gr.Audio(label="Stem 6", type="filepath", interactive=False)
499
 
500
  with gr.Tab("Settings"):
501
- with gr.Accordion("General settings", open=False):
502
- with gr.Group():
503
- model_file_dir = gr.Textbox(value="/tmp/PolUVR-models/", label="Directory to cache model files", info="The directory where model files are stored.", placeholder="/tmp/PolUVR-models/")
504
- with gr.Row():
505
- output_dir = gr.Textbox(value="output", label="File output directory", info="The directory where output files will be saved.", placeholder="output")
506
- output_format = gr.Dropdown(value="wav", choices=["wav", "flac", "mp3"], label="Output Format", info="The format of the output audio file.")
507
- with gr.Row():
508
- norm_threshold = gr.Slider(minimum=0.1, maximum=1, step=0.1, value=0.9, label="Normalization threshold", info="The threshold for audio normalization.")
509
- amp_threshold = gr.Slider(minimum=0.0, maximum=1, step=0.1, value=0.0, label="Amplification threshold", info="The threshold for audio amplification.")
510
- with gr.Row():
511
- batch_size = gr.Slider(minimum=1, maximum=16, step=1, value=1, label="Batch Size", info="Larger consumes more RAM but may process slightly faster.")
512
 
513
  with gr.Accordion("Rename Stems", open=False):
514
  gr.Markdown(
@@ -569,10 +616,10 @@ with gr.Blocks(
569
  roformer_pitch_shift,
570
  model_file_dir,
571
  output_dir,
572
- output_format,
573
- norm_threshold,
574
- amp_threshold,
575
- batch_size,
576
  vocals_stem,
577
  instrumental_stem,
578
  other_stem,
@@ -597,10 +644,10 @@ with gr.Blocks(
597
  mdx23c_pitch_shift,
598
  model_file_dir,
599
  output_dir,
600
- output_format,
601
- norm_threshold,
602
- amp_threshold,
603
- batch_size,
604
  vocals_stem,
605
  instrumental_stem,
606
  other_stem,
@@ -625,10 +672,10 @@ with gr.Blocks(
625
  mdx_denoise,
626
  model_file_dir,
627
  output_dir,
628
- output_format,
629
- norm_threshold,
630
- amp_threshold,
631
- batch_size,
632
  vocals_stem,
633
  instrumental_stem,
634
  other_stem,
@@ -655,10 +702,10 @@ with gr.Blocks(
655
  vr_high_end_process,
656
  model_file_dir,
657
  output_dir,
658
- output_format,
659
- norm_threshold,
660
- amp_threshold,
661
- batch_size,
662
  vocals_stem,
663
  instrumental_stem,
664
  other_stem,
@@ -683,9 +730,9 @@ with gr.Blocks(
683
  demucs_segments_enabled,
684
  model_file_dir,
685
  output_dir,
686
- output_format,
687
- norm_threshold,
688
- amp_threshold,
689
  vocals_stem,
690
  instrumental_stem,
691
  other_stem,
 
407
  with gr.Tab("Roformer"):
408
  with gr.Group():
409
  with gr.Row():
410
+ roformer_model = gr.Dropdown(value="MelBand Roformer Kim | Big Beta 5e FT by unwa", label="Select the Model", choices=list(ROFORMER_MODELS.keys()), scale=3)
411
+ roformer_output_format = gr.Dropdown(value="wav", choices=["wav", "flac", "mp3"], label="Output Format", info="The format of the output audio file.", scale=1)
412
+ with gr.Accordion("Advanced settings", open=False):
413
+ with gr.Column():
414
+ with gr.Group():
415
+ roformer_override_seg_size = gr.Checkbox(value=False, label="Override segment size", info="Override model default segment size instead of using the model default value.")
416
+ roformer_seg_size = gr.Slider(minimum=32, maximum=4000, step=32, value=256, label="Segment Size", info="Larger consumes more resources, but may give better results.")
417
+ with gr.Group():
418
+ with gr.Row():
419
+ roformer_overlap = gr.Slider(minimum=2, maximum=10, step=1, value=8, label="Overlap", info="Amount of overlap between prediction windows. Lower is better but slower.")
420
+ roformer_pitch_shift = gr.Slider(minimum=-24, maximum=24, step=1, value=0, label="Pitch shift", info="Shift audio pitch by a number of semitones while processing. may improve output for deep/high vocals.")
421
+ with gr.Group():
422
+ with gr.Row():
423
+ roformer_batch_size = gr.Slider(minimum=1, maximum=16, step=1, value=1, label="Batch Size", info="Larger consumes more RAM but may process slightly faster.")
424
+ roformer_norm_threshold = gr.Slider(minimum=0.1, maximum=1, step=0.1, value=0.9, label="Normalization threshold", info="The threshold for audio normalization.")
425
+ roformer_amp_threshold = gr.Slider(minimum=0.0, maximum=1, step=0.1, value=0.0, label="Amplification threshold", info="The threshold for audio amplification.")
426
  with gr.Row():
427
  roformer_audio = gr.Audio(label="Input Audio", type="filepath")
428
  with gr.Row():
 
434
  with gr.Tab("MDX23C"):
435
  with gr.Group():
436
  with gr.Row():
437
+ mdx23c_model = gr.Dropdown(value="MDX23C-InstVoc HQ", label="Select the Model", choices=list(MDX23C_MODELS.keys()), scale=3)
438
+ mdx23c_output_format = gr.Dropdown(value="wav", choices=["wav", "flac", "mp3"], label="Output Format", info="The format of the output audio file.", scale=1)
439
+ with gr.Accordion("Advanced settings", open=False):
440
+ with gr.Column():
441
+ with gr.Group():
442
+ mdx23c_override_seg_size = gr.Checkbox(value=False, label="Override segment size", info="Override model default segment size instead of using the model default value.")
443
+ mdx23c_seg_size = gr.Slider(minimum=32, maximum=4000, step=32, value=256, label="Segment Size", info="Larger consumes more resources, but may give better results.")
444
+ with gr.Group():
445
+ with gr.Row():
446
+ mdx23c_overlap = gr.Slider(minimum=2, maximum=50, step=1, value=8, label="Overlap", info="Amount of overlap between prediction windows. Higher is better but slower.")
447
+ mdx23c_pitch_shift = gr.Slider(minimum=-24, maximum=24, step=1, value=0, label="Pitch shift", info="Shift audio pitch by a number of semitones while processing. may improve output for deep/high vocals.")
448
+ with gr.Group():
449
+ with gr.Row():
450
+ mdx23c_batch_size = gr.Slider(minimum=1, maximum=16, step=1, value=1, label="Batch Size", info="Larger consumes more RAM but may process slightly faster.")
451
+ mdx23c_norm_threshold = gr.Slider(minimum=0.1, maximum=1, step=0.1, value=0.9, label="Normalization threshold", info="The threshold for audio normalization.")
452
+ mdx23c_amp_threshold = gr.Slider(minimum=0.0, maximum=1, step=0.1, value=0.0, label="Amplification threshold", info="The threshold for audio amplification.")
453
  with gr.Row():
454
  mdx23c_audio = gr.Audio(label="Input Audio", type="filepath")
455
  with gr.Row():
 
461
  with gr.Tab("MDX-NET"):
462
  with gr.Group():
463
  with gr.Row():
464
+ mdx_model = gr.Dropdown(value="UVR-MDX-NET Inst HQ 5", label="Select the Model", choices=list(MDXNET_MODELS.keys()), scale=3)
465
+ mdx_output_format = gr.Dropdown(value="wav", choices=["wav", "flac", "mp3"], label="Output Format", info="The format of the output audio file.", scale=1)
466
+ with gr.Accordion("Advanced settings", open=False):
467
+ with gr.Column():
468
+ with gr.Group():
469
+ with gr.Row():
470
+ mdx_hop_length = gr.Slider(minimum=32, maximum=2048, step=32, value=1024, label="Hop Length", info="Usually called stride in neural networks; only change if you know what you're doing.")
471
+ mdx_seg_size = gr.Slider(minimum=32, maximum=4000, step=32, value=256, label="Segment Size", info="Larger consumes more resources, but may give better results.")
472
+ with gr.Group():
473
+ with gr.Row():
474
+ mdx_overlap = gr.Slider(minimum=0.001, maximum=0.999, step=0.001, value=0.25, label="Overlap", info="Amount of overlap between prediction windows. Higher is better but slower.")
475
+ mdx_denoise = gr.Checkbox(value=False, label="Denoise", info="Enable denoising after separation.")
476
+ with gr.Group():
477
+ with gr.Row():
478
+ mdx_batch_size = gr.Slider(minimum=1, maximum=16, step=1, value=1, label="Batch Size", info="Larger consumes more RAM but may process slightly faster.")
479
+ mdx_norm_threshold = gr.Slider(minimum=0.1, maximum=1, step=0.1, value=0.9, label="Normalization threshold", info="The threshold for audio normalization.")
480
+ mdx_amp_threshold = gr.Slider(minimum=0.0, maximum=1, step=0.1, value=0.0, label="Amplification threshold", info="The threshold for audio amplification.")
481
  with gr.Row():
482
  mdx_audio = gr.Audio(label="Input Audio", type="filepath")
483
  with gr.Row():
 
489
  with gr.Tab("VR ARCH"):
490
  with gr.Group():
491
  with gr.Row():
492
+ vr_model = gr.Dropdown(value="1_HP-UVR", label="Select the Model", choices=list(VR_ARCH_MODELS.keys()), scale=3)
493
+ vr_output_format = gr.Dropdown(value="wav", choices=["wav", "flac", "mp3"], label="Output Format", info="The format of the output audio file.", scale=1)
494
+ with gr.Accordion("Advanced settings", open=False):
495
+ with gr.Column():
496
+ with gr.Group():
497
+ with gr.Row():
498
+ vr_window_size = gr.Slider(minimum=320, maximum=1024, step=32, value=512, label="Window Size", info="Balance quality and speed. 1024 = fast but lower, 320 = slower but better quality.")
499
+ vr_aggression = gr.Slider(minimum=1, maximum=100, step=1, value=5, label="Agression", info="Intensity of primary stem extraction.")
500
+ with gr.Group():
501
+ with gr.Column():
502
+ vr_post_process = gr.Checkbox(value=False, label="Post Process", info="Identify leftover artifacts within vocal output; may improve separation for some songs.")
503
+ vr_post_process_threshold = gr.Slider(minimum=0.1, maximum=0.3, step=0.1, value=0.2, label="Post Process Threshold", info="Threshold for post-processing.")
504
+ with gr.Group():
505
+ with gr.Row():
506
+ vr_tta = gr.Checkbox(value=False, label="TTA", info="Enable Test-Time-Augmentation; slow but improves quality.")
507
+ vr_high_end_process = gr.Checkbox(value=False, label="High End Process", info="Mirror the missing frequency range of the output.")
508
+ with gr.Group():
509
+ with gr.Row():
510
+ vr_batch_size = gr.Slider(minimum=1, maximum=16, step=1, value=1, label="Batch Size", info="Larger consumes more RAM but may process slightly faster.")
511
+ vr_norm_threshold = gr.Slider(minimum=0.1, maximum=1, step=0.1, value=0.9, label="Normalization threshold", info="The threshold for audio normalization.")
512
+ vr_amp_threshold = gr.Slider(minimum=0.0, maximum=1, step=0.1, value=0.0, label="Amplification threshold", info="The threshold for audio amplification.")
513
  with gr.Row():
514
  vr_audio = gr.Audio(label="Input Audio", type="filepath")
515
  with gr.Row():
 
521
  with gr.Tab("Demucs"):
522
  with gr.Group():
523
  with gr.Row():
524
+ demucs_model = gr.Dropdown(value="htdemucs_6s", label="Select the Model", choices=list(DEMUCS_MODELS.keys()), scale=3)
525
+ demucs_output_format = gr.Dropdown(value="wav", choices=["wav", "flac", "mp3"], label="Output Format", info="The format of the output audio file.", scale=1)
526
+ with gr.Accordion("Advanced settings", open=False):
527
+ with gr.Column():
528
+ with gr.Group():
529
+ with gr.Row():
530
+ demucs_seg_size = gr.Slider(minimum=1, maximum=100, step=1, value=40, label="Segment Size", info="Size of segments into which the audio is split. Higher = slower but better quality.")
531
+ demucs_shifts = gr.Slider(minimum=0, maximum=20, step=1, value=2, label="Shifts", info="Number of predictions with random shifts, higher = slower but better quality.")
532
+ with gr.Group():
533
+ with gr.Row():
534
+ demucs_overlap = gr.Slider(minimum=0.001, maximum=0.999, step=0.001, value=0.25, label="Overlap", info="Overlap between prediction windows. Higher = slower but better quality.")
535
+ demucs_segments_enabled = gr.Checkbox(value=True, label="Segment-wise processing", info="Enable segment-wise processing.")
536
+ with gr.Group():
537
+ with gr.Row():
538
+ demucs_norm_threshold = gr.Slider(minimum=0.1, maximum=1, step=0.1, value=0.9, label="Normalization threshold", info="The threshold for audio normalization.")
539
+ demucs_amp_threshold = gr.Slider(minimum=0.0, maximum=1, step=0.1, value=0.0, label="Amplification threshold", info="The threshold for audio amplification.")
540
  with gr.Row():
541
  demucs_audio = gr.Audio(label="Input Audio", type="filepath")
542
  with gr.Row():
 
552
  demucs_stem6 = gr.Audio(label="Stem 6", type="filepath", interactive=False)
553
 
554
  with gr.Tab("Settings"):
555
+ with gr.Group():
556
+ with gr.Row():
557
+ model_file_dir = gr.Textbox(value="/tmp/PolUVR-models/", label="Directory to cache model files", info="The directory where model files are stored.", placeholder="/tmp/PolUVR-models/")
558
+ output_dir = gr.Textbox(value="output", label="File output directory", info="The directory where output files will be saved.", placeholder="output")
 
 
 
 
 
 
 
559
 
560
  with gr.Accordion("Rename Stems", open=False):
561
  gr.Markdown(
 
616
  roformer_pitch_shift,
617
  model_file_dir,
618
  output_dir,
619
+ roformer_output_format,
620
+ roformer_norm_threshold,
621
+ roformer_amp_threshold,
622
+ roformer_batch_size,
623
  vocals_stem,
624
  instrumental_stem,
625
  other_stem,
 
644
  mdx23c_pitch_shift,
645
  model_file_dir,
646
  output_dir,
647
+ mdx23c_output_format,
648
+ mdx23c_norm_threshold,
649
+ mdx23c_amp_threshold,
650
+ mdx23c_batch_size,
651
  vocals_stem,
652
  instrumental_stem,
653
  other_stem,
 
672
  mdx_denoise,
673
  model_file_dir,
674
  output_dir,
675
+ mdx_output_format,
676
+ mdx_norm_threshold,
677
+ mdx_amp_threshold,
678
+ mdx_batch_size,
679
  vocals_stem,
680
  instrumental_stem,
681
  other_stem,
 
702
  vr_high_end_process,
703
  model_file_dir,
704
  output_dir,
705
+ vr_output_format,
706
+ vr_norm_threshold,
707
+ vr_amp_threshold,
708
+ vr_batch_size,
709
  vocals_stem,
710
  instrumental_stem,
711
  other_stem,
 
730
  demucs_segments_enabled,
731
  model_file_dir,
732
  output_dir,
733
+ demucs_output_format,
734
+ demucs_norm_threshold,
735
+ demucs_amp_threshold,
736
  vocals_stem,
737
  instrumental_stem,
738
  other_stem,