File size: 5,417 Bytes
56e5a69
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
e723ecd
 
4cddb4c
 
56e5a69
 
 
 
 
 
 
d090336
 
 
 
56e5a69
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
20843bd
56e5a69
 
20843bd
56e5a69
5a45d14
20843bd
56e5a69
aca3960
 
56e5a69
 
d090336
56e5a69
d090336
 
56e5a69
 
 
 
 
 
 
 
 
 
 
 
 
96ba2ba
56e5a69
 
 
 
 
d090336
 
 
 
 
 
 
d6a77bc
d090336
 
 
 
 
 
 
 
 
26c1164
56e5a69
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
import io
import gradio as gr
import librosa
import numpy as np
import soundfile
from inference.infer_tool import Svc

import os
def list_files_tree(directory, indent=""):
    """Print the contents of ``directory`` recursively as a box-drawing tree.

    Args:
        directory: Path of the directory whose entries are listed.
        indent: Prefix printed before each entry. It grows by one level on
            every recursive call; callers normally leave it at the default.

    Returns:
        None. Each entry is written to stdout with ``print``.

    Raises:
        OSError: Propagated from ``os.listdir`` when a directory cannot be
            read.
    """
    # os.listdir returns entries in arbitrary order; sort so the printed tree
    # is stable between runs.
    items = sorted(os.listdir(directory))
    last_index = len(items) - 1
    for i, item in enumerate(items):
        is_last = i == last_index
        print(indent + ("└── " if is_last else "├── ") + item)
        item_path = os.path.join(directory, item)
        if os.path.isdir(item_path):
            # Below the last entry nothing continues, so pad with spaces;
            # otherwise keep a vertical bar so sibling entries stay connected.
            list_files_tree(item_path, indent + ("    " if is_last else "│   "))

from huggingface_hub import snapshot_download
# Download every speaker model repository into ./Models/<repo_id>, then the
# shared base model into ./PretrainedModels/<repo_id>, and finally print the
# resulting directory tree for inspection in the startup log.
print("Models...")
models_id = """None1145/So-VITS-SVC-Vulpisfoglia
None1145/So-VITS-SVC-Lappland
None1145/So-VITS-SVC-Lappland-the-Decadenza
None1145/So-VITS-SVC-Rosmontis"""
for model_id in models_id.splitlines():
    model_id = model_id.strip()
    if not model_id:
        # A blank line must only skip itself; stopping here would silently
        # drop every repository listed after it.
        continue
    print(f"{model_id}...")
    snapshot_download(repo_id=model_id, local_dir=f"./Models/{model_id}")
    print(f"{model_id}!!!")
print("Models!!!")
print("PretrainedModels...")
base_model_id = "None1145/So-VITS-SVC-Base"
snapshot_download(repo_id=base_model_id, local_dir=f"./PretrainedModels/{base_model_id}")
print("PretrainedModels!!!")
list_files_tree("./")

import re
# Build the speaker registry. For every model folder under
# models_folder_path, record the `G_<number>.pth` checkpoint with the largest
# number, the config file, and the cluster / feature-retrieval file to use.
models_info = {}
models_folder_path = "./Models/None1145"
# Folder names are expected to look like "So-VITS-SVC-<speaker>".
speaker_prefix = "So-VITS-SVC-"
# Compiled once: the pattern does not change between folders.
pattern = re.compile(r"G_(\d+)\.pth$")
folder_names = [name for name in os.listdir(models_folder_path) if os.path.isdir(os.path.join(models_folder_path, name))]
for folder_name in folder_names:
    # Strip the prefix only when it is actually present, so an unexpected
    # folder name is kept whole instead of losing its first 12 characters.
    if folder_name.startswith(speaker_prefix):
        speaker = folder_name[len(speaker_prefix):]
    else:
        speaker = folder_name
    max_value = -1
    max_file = None
    models_path = f"{models_folder_path}/{folder_name}/Models"
    config_path = f"{models_folder_path}/{folder_name}/Configs"
    for filename in os.listdir(models_path):
        match = pattern.search(filename)
        if match:
            value = int(match.group(1))
            if value > max_value:
                max_value = value
                max_file = filename
    if max_file is None:
        # Without a checkpoint the entry would point at ".../None" and only
        # fail later, at conversion time; leave the speaker out instead.
        print(f"No G_*.pth checkpoint found in {models_path}; skipping {speaker}")
        continue
    models_info[speaker] = {}
    models_info[speaker]["model"] = f"{models_path}/{max_file}"
    models_info[speaker]["config"] = f"{config_path}/config.json"
    # Prefer a feature-retrieval index, then a k-means cluster file shipped
    # with the model, and finally fall back to the default cluster path.
    if os.path.exists(f"{models_path}/feature_and_index.pkl"):
        models_info[speaker]["cluster"] = f"{models_path}/feature_and_index.pkl"
        models_info[speaker]["feature_retrieval"] = True
    elif os.path.exists(f"{models_path}/kmeans_10000.pt"):
        models_info[speaker]["cluster"] = f"{models_path}/kmeans_10000.pt"
        models_info[speaker]["feature_retrieval"] = False
    else:
        models_info[speaker]["cluster"] = "logs/44k/kmeans_10000.pt"
        models_info[speaker]["feature_retrieval"] = False
speakers = list(models_info.keys())
print(models_info)
print(speakers)

def load(speaker):
    """Create an ``Svc`` inference object for ``speaker``.

    The speaker's entry in ``models_info`` supplies the model path, config
    path, cluster model path and feature-retrieval flag passed to ``Svc``.

    Raises:
        KeyError: If ``speaker`` has no entry in ``models_info``.
    """
    info = models_info[speaker]
    return Svc(
        info["model"],
        info["config"],
        cluster_model_path=info["cluster"],
        feature_retrieval=info["feature_retrieval"],
    )

def vc_fn(speaker, input_audio, vc_transform, auto_f0, cluster_ratio, slice_db, noise_scale):
    """Convert uploaded audio to the voice of ``speaker``.

    Args:
        speaker: Key into ``models_info`` selecting the model to load.
        input_audio: ``(sampling_rate, samples)`` pair, or ``None`` when
            nothing was uploaded.
        vc_transform: Pitch shift forwarded to ``slice_inference``.
        auto_f0: Auto f0 prediction flag forwarded to ``slice_inference``.
        cluster_ratio: Cluster mixing ratio forwarded to ``slice_inference``.
        slice_db: Slicing threshold forwarded to ``slice_inference``.
        noise_scale: Noise scale forwarded to ``slice_inference``.

    Returns:
        ``(message, audio)`` where ``audio`` is ``(44100, samples)`` on
        success and ``None`` when the request was rejected.
    """
    import tempfile

    # Validate the request before doing anything expensive: loading a model
    # is wasted work when there is no audio or the speaker is unknown.
    if input_audio is None:
        return "You need to upload an audio", None
    if speaker not in models_info:
        return f"Unknown speaker: {speaker}", None

    sampling_rate, audio = input_audio
    # Integer PCM is scaled by the dtype's maximum value. np.iinfo raises on
    # floating-point dtypes, so float input is passed through unscaled.
    if np.issubdtype(audio.dtype, np.integer):
        audio = audio / np.iinfo(audio.dtype).max
    audio = audio.astype(np.float32)
    if len(audio.shape) > 1:
        # Transpose so channels come first before down-mixing to mono.
        audio = librosa.to_mono(audio.transpose(1, 0))
    if sampling_rate != 16000:
        audio = librosa.resample(audio, orig_sr=sampling_rate, target_sr=16000)
    print(audio.shape)

    model = load(speaker)
    # Use a unique temporary file so concurrent requests cannot overwrite each
    # other's input, and remove it once inference has finished.
    fd, out_wav_path = tempfile.mkstemp(suffix=".wav")
    os.close(fd)
    try:
        soundfile.write(out_wav_path, audio, 16000, format="wav")
        print(cluster_ratio, auto_f0, noise_scale)
        _audio = model.slice_inference(out_wav_path, speaker, vc_transform, slice_db, cluster_ratio, auto_f0, noise_scale)
    finally:
        os.remove(out_wav_path)
    # NOTE(review): the output rate is hard-coded to 44100; confirm it matches
    # the sample rate the loaded model produces.
    return "Success", (44100, _audio)

# Build the web UI: one tab per speaker, each with its own inputs, a convert
# button and output widgets wired to vc_fn.
app = gr.Blocks()
with app:
    with gr.Tabs():
        for speaker in speakers:
            with gr.TabItem(speaker):
                with gr.Row():
                    gr.Markdown(
                        '<div align="center">'
                        f'<a><strong>{speaker}</strong></a>'
                        '</div>')
                # Separate name for the widget so the loop variable (the
                # speaker's name) is not rebound to a component.
                speaker_box = gr.Textbox(label="Speaker", value=speaker)
                vc_input3 = gr.Audio(label="Upload Audio")
                vc_transform = gr.Number(label="Pitch Shift (integer, can be positive or negative, number of semitones, raising an octave is +12)", value=0)
                cluster_ratio = gr.Number(label="Cluster Model Mixing Ratio (0-1): Defaults to 0 (clustering disabled). Improves timbre similarity but may reduce articulation clarity. Recommended value: ~0.5 if used", value=0)
                auto_f0 = gr.Checkbox(label="Auto f0 Prediction: Works better with the cluster model for f0 prediction but disables the pitch shift feature. (For voice conversion only; do not enable this for singing voices, as it will result in extreme off-pitch issues)", value=False)
                slice_db = gr.Number(label="Slicing Threshold", value=-40)
                noise_scale = gr.Number(label="noise_scale", value=0.4)
                vc_submit = gr.Button("Convert", variant="primary")
                vc_output1 = gr.Textbox(label="Output Message")
                vc_output2 = gr.Audio(label="Output Audio")
                vc_submit.click(vc_fn, [speaker_box, vc_input3, vc_transform, auto_f0, cluster_ratio, slice_db, noise_scale], [vc_output1, vc_output2])
# Start the server only after the Blocks context has been closed, i.e. once
# the layout is fully defined.
app.launch()