Spaces:
Sleeping
Sleeping
modify app
Browse files- app.py +166 -74
- inference.py +56 -49
app.py
CHANGED
@@ -9,39 +9,40 @@ from config import args
|
|
9 |
|
10 |
mastering_transfer = MasteringStyleTransfer(args)
|
11 |
|
12 |
-
def process_audio(input_audio, reference_audio
|
13 |
-
|
14 |
-
|
15 |
-
input_audio, reference_audio, ito_reference_audio if ito_reference_audio else reference_audio, {}, perform_ito
|
16 |
)
|
17 |
|
18 |
-
# Generate parameter output strings
|
19 |
param_output = mastering_transfer.get_param_output_string(predicted_params)
|
20 |
-
ito_param_output = mastering_transfer.get_param_output_string(ito_predicted_params) if ito_predicted_params is not None else "ITO not performed"
|
21 |
|
22 |
-
|
23 |
-
top_10_diff = mastering_transfer.get_top_10_diff_string(predicted_params, ito_predicted_params) if ito_predicted_params is not None else "ITO not performed"
|
24 |
-
|
25 |
-
return "output_mastered.wav", "ito_output_mastered.wav" if ito_output_audio is not None else None, param_output, ito_param_output, top_10_diff, ito_log
|
26 |
|
27 |
-
def
|
28 |
-
|
29 |
-
|
30 |
|
31 |
-
|
32 |
-
|
33 |
-
|
34 |
-
|
35 |
-
|
36 |
-
|
37 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
38 |
)
|
|
|
|
|
39 |
|
40 |
-
|
41 |
-
ito_param_output = mastering_transfer.get_param_output_string(ito_predicted_params) if ito_predicted_params is not None else "ITO not performed"
|
42 |
-
top_10_diff = mastering_transfer.get_top_10_diff_string(predicted_params, ito_predicted_params) if ito_predicted_params is not None else "ITO not performed"
|
43 |
-
|
44 |
-
return "output_mastered_yt.wav", "ito_output_mastered_yt.wav" if ito_output_audio is not None else None, param_output, ito_param_output, top_10_diff, ito_log
|
45 |
|
46 |
|
47 |
with gr.Blocks() as demo:
|
@@ -50,63 +51,154 @@ with gr.Blocks() as demo:
|
|
50 |
with gr.Tab("Upload Audio"):
|
51 |
input_audio = gr.Audio(label="Input Audio")
|
52 |
reference_audio = gr.Audio(label="Reference Audio")
|
53 |
-
|
54 |
-
with gr.Column(visible=False) as ito_options:
|
55 |
-
use_same_reference = gr.Checkbox(label="Use same reference audio for ITO", value=True)
|
56 |
-
ito_reference_audio = gr.Audio(label="ITO Reference Audio", visible=False)
|
57 |
-
|
58 |
-
def update_ito_options(perform_ito):
|
59 |
-
return gr.Column.update(visible=perform_ito)
|
60 |
-
|
61 |
-
def update_ito_reference(use_same):
|
62 |
-
return gr.Audio.update(visible=not use_same)
|
63 |
-
|
64 |
-
perform_ito.change(fn=update_ito_options, inputs=perform_ito, outputs=ito_options)
|
65 |
-
use_same_reference.change(fn=update_ito_reference, inputs=use_same_reference, outputs=ito_reference_audio)
|
66 |
-
|
67 |
-
submit_button = gr.Button("Process")
|
68 |
output_audio = gr.Audio(label="Output Audio")
|
69 |
-
ito_output_audio = gr.Audio(label="ITO Output Audio")
|
70 |
param_output = gr.Textbox(label="Predicted Parameters", lines=10)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
71 |
ito_param_output = gr.Textbox(label="ITO Predicted Parameters", lines=10)
|
72 |
-
|
73 |
ito_log = gr.Textbox(label="ITO Log", lines=20)
|
74 |
-
|
75 |
-
|
76 |
-
|
77 |
-
|
78 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
79 |
)
|
80 |
|
81 |
-
|
82 |
-
|
83 |
-
|
84 |
-
|
85 |
-
|
86 |
-
|
87 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
88 |
|
89 |
-
|
90 |
-
|
91 |
|
92 |
-
|
93 |
-
|
94 |
|
95 |
-
|
96 |
-
|
97 |
|
98 |
-
|
99 |
-
|
100 |
-
|
101 |
-
|
102 |
-
|
103 |
-
|
104 |
-
|
105 |
-
|
106 |
-
|
107 |
-
|
108 |
-
|
109 |
-
|
110 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
111 |
|
112 |
-
demo.launch()
|
|
|
9 |
|
10 |
mastering_transfer = MasteringStyleTransfer(args)
|
11 |
|
12 |
+
def process_audio(input_audio, reference_audio):
    """Run a single mastering style-transfer pass (no ITO) for the Gradio UI.

    Args:
        input_audio: Value of the "Input Audio" component.
        reference_audio: Value of the "Reference Audio" component.

    Returns:
        A 2-tuple ``(audio_path, param_text)``: the fixed output file name
        and the formatted predicted-parameter string.
    """
    # The reference is passed twice (third positional argument was the ITO
    # reference in the previous version of this handler); the empty dict and
    # False are forwarded unchanged.
    results = mastering_transfer.process_audio(
        input_audio, reference_audio, reference_audio, {}, False
    )
    # Only the predicted parameters are used here; the other five results
    # are unpacked and discarded.
    _audio, params, _ito_audio, _ito_params, _ito_log, _sr = results

    # NOTE(review): the returned path assumes mastering_transfer.process_audio
    # writes "output_mastered.wav" itself — confirm in inference.py.
    return "output_mastered.wav", mastering_transfer.get_param_output_string(params)
|
|
|
|
|
|
|
20 |
|
21 |
+
def perform_ito(input_audio, reference_audio, ito_reference_audio, num_steps, optimizer, learning_rate, af_weights):
    """Run inference-time optimization (ITO) and format its results.

    Args:
        input_audio: Audio to be mastered.
        reference_audio: Reference used to compute the initial embedding.
        ito_reference_audio: Optional separate target for the optimization;
            when ``None`` the main reference is used instead.
        num_steps: Stored under ``'num_steps'`` in the ITO config.
        optimizer: Stored under ``'optimizer'`` in the ITO config.
        learning_rate: Stored under ``'learning_rate'`` in the ITO config.
        af_weights: Stored under ``'af_weights'`` in the ITO config.

    Returns:
        A 4-tuple ``(audio_path, param_text, steps_taken, ito_log)``.
    """
    # Fall back to the main reference when no dedicated ITO reference exists.
    ito_source = reference_audio if ito_reference_audio is None else ito_reference_audio

    settings = dict(
        optimizer=optimizer,
        learning_rate=learning_rate,
        num_steps=num_steps,
        af_weights=af_weights,
        sample_rate=args.sample_rate,
    )

    # Convert all three clips to tensors at the configured sample rate.
    to_tensor = mastering_transfer.preprocess_audio
    input_tensor = to_tensor(input_audio, args.sample_rate)
    reference_tensor = to_tensor(reference_audio, args.sample_rate)
    ito_reference_tensor = to_tensor(ito_source, args.sample_rate)

    # The optimization starts from the embedding of the *main* reference,
    # even when a different clip is used as the ITO target.
    starting_feature = mastering_transfer.get_reference_embedding(reference_tensor)

    ito_result = mastering_transfer.inference_time_optimization(
        input_tensor, ito_reference_tensor, settings, starting_feature
    )
    # The optimized audio and embedding are not used by this handler.
    _ito_audio, ito_params, _embedding, steps_taken, ito_log = ito_result

    param_text = mastering_transfer.get_param_output_string(ito_params)

    # NOTE(review): nothing visible here writes "ito_output_mastered.wav";
    # confirm the file is produced elsewhere before returning its path.
    return "ito_output_mastered.wav", param_text, steps_taken, ito_log
|
|
|
|
|
|
|
|
|
46 |
|
47 |
|
48 |
with gr.Blocks() as demo:
|
|
|
51 |
with gr.Tab("Upload Audio"):
|
52 |
input_audio = gr.Audio(label="Input Audio")
|
53 |
reference_audio = gr.Audio(label="Reference Audio")
|
54 |
+
process_button = gr.Button("Process")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
55 |
output_audio = gr.Audio(label="Output Audio")
|
|
|
56 |
param_output = gr.Textbox(label="Predicted Parameters", lines=10)
|
57 |
+
|
58 |
+
process_button.click(
|
59 |
+
process_audio,
|
60 |
+
inputs=[input_audio, reference_audio],
|
61 |
+
outputs=[output_audio, param_output]
|
62 |
+
)
|
63 |
+
|
64 |
+
gr.Markdown("## Inference Time Optimization (ITO)")
|
65 |
+
ito_reference_audio = gr.Audio(label="ITO Reference Audio (optional)")
|
66 |
+
num_steps = gr.Slider(minimum=1, maximum=1000, value=100, step=1, label="Number of Steps")
|
67 |
+
optimizer = gr.Dropdown(["Adam", "RAdam", "SGD"], value="RAdam", label="Optimizer")
|
68 |
+
learning_rate = gr.Slider(minimum=0.0001, maximum=0.1, value=0.001, step=0.0001, label="Learning Rate")
|
69 |
+
af_weights = gr.Textbox(label="AudioFeatureLoss Weights (comma-separated)", value="0.1,0.001,1.0,1.0,0.1")
|
70 |
+
|
71 |
+
ito_button = gr.Button("Perform ITO")
|
72 |
+
ito_output_audio = gr.Audio(label="ITO Output Audio")
|
73 |
ito_param_output = gr.Textbox(label="ITO Predicted Parameters", lines=10)
|
74 |
+
ito_steps_taken = gr.Number(label="ITO Steps Taken")
|
75 |
ito_log = gr.Textbox(label="ITO Log", lines=20)
|
76 |
+
|
77 |
+
def run_ito(input_audio, reference_audio, ito_reference_audio, num_steps, optimizer, learning_rate, af_weights):
    """Gradio click handler for the "Perform ITO" button.

    Parses the comma-separated weight string from the UI into floats and
    delegates to ``perform_ito``.

    Args:
        input_audio: Value of the "Input Audio" component.
        reference_audio: Value of the "Reference Audio" component.
        ito_reference_audio: Value of the optional ITO reference component.
        num_steps: Value of the "Number of Steps" slider.
        optimizer: Selected optimizer name.
        learning_rate: Value of the "Learning Rate" slider.
        af_weights: Comma-separated weights, e.g. ``"0.1,0.001,1.0,1.0,0.1"``.

    Returns:
        A 4-tuple ``(audio, param_text, steps_taken, log_text)`` matching the
        four components listed in ``ito_button.click(outputs=...)``.

    Raises:
        ValueError: If any entry of ``af_weights`` is not a valid float.
    """
    af_weights = [float(w.strip()) for w in af_weights.split(',')]
    # perform_ito returns four values and the button is wired to four output
    # components; unpacking or returning only three fails at runtime.
    ito_output, ito_params, steps_taken, log_text = perform_ito(
        input_audio, reference_audio, ito_reference_audio, num_steps, optimizer, learning_rate, af_weights
    )
    return ito_output, ito_params, steps_taken, log_text
|
83 |
+
|
84 |
+
ito_button.click(
|
85 |
+
run_ito,
|
86 |
+
inputs=[input_audio, reference_audio, ito_reference_audio, num_steps, optimizer, learning_rate, af_weights],
|
87 |
+
outputs=[ito_output_audio, ito_param_output, ito_steps_taken, ito_log]
|
88 |
)
|
89 |
|
90 |
+
demo.launch()
|
91 |
+
|
92 |
+
|
93 |
+
# import gradio as gr
|
94 |
+
# import torch
|
95 |
+
# import soundfile as sf
|
96 |
+
# import numpy as np
|
97 |
+
# import yaml
|
98 |
+
# from inference import MasteringStyleTransfer
|
99 |
+
# from utils import download_youtube_audio
|
100 |
+
# from config import args
|
101 |
+
|
102 |
+
# mastering_transfer = MasteringStyleTransfer(args)
|
103 |
+
|
104 |
+
# def process_audio(input_audio, reference_audio, perform_ito, ito_reference_audio=None):
|
105 |
+
# # Process the audio files
|
106 |
+
# output_audio, predicted_params, ito_output_audio, ito_predicted_params, ito_log, sr = mastering_transfer.process_audio(
|
107 |
+
# input_audio, reference_audio, ito_reference_audio if ito_reference_audio else reference_audio, {}, perform_ito
|
108 |
+
# )
|
109 |
+
|
110 |
+
# # Generate parameter output strings
|
111 |
+
# param_output = mastering_transfer.get_param_output_string(predicted_params)
|
112 |
+
# ito_param_output = mastering_transfer.get_param_output_string(ito_predicted_params) if ito_predicted_params is not None else "ITO not performed"
|
113 |
+
|
114 |
+
# # Generate top 10 differences if ITO was performed
|
115 |
+
# top_10_diff = mastering_transfer.get_top_10_diff_string(predicted_params, ito_predicted_params) if ito_predicted_params is not None else "ITO not performed"
|
116 |
+
|
117 |
+
# return "output_mastered.wav", "ito_output_mastered.wav" if ito_output_audio is not None else None, param_output, ito_param_output, top_10_diff, ito_log
|
118 |
+
|
119 |
+
# def process_with_ito(input_audio, reference_audio, perform_ito, use_same_reference, ito_reference_audio):
|
120 |
+
# ito_ref = reference_audio if use_same_reference else ito_reference_audio
|
121 |
+
# return process_audio(input_audio, reference_audio, perform_ito, ito_ref)
|
122 |
+
|
123 |
+
# def process_youtube_with_ito(input_url, reference_url, perform_ito, use_same_reference, ito_reference_url):
|
124 |
+
# input_audio = download_youtube_audio(input_url)
|
125 |
+
# reference_audio = download_youtube_audio(reference_url)
|
126 |
+
# ito_ref = reference_audio if use_same_reference else download_youtube_audio(ito_reference_url)
|
127 |
+
|
128 |
+
# output_audio, predicted_params, ito_output_audio, ito_predicted_params, ito_log, sr = mastering_transfer.process_audio(
|
129 |
+
# input_audio, reference_audio, ito_ref, {}, perform_ito, log_ito=True
|
130 |
+
# )
|
131 |
+
|
132 |
+
# param_output = mastering_transfer.get_param_output_string(predicted_params)
|
133 |
+
# ito_param_output = mastering_transfer.get_param_output_string(ito_predicted_params) if ito_predicted_params is not None else "ITO not performed"
|
134 |
+
# top_10_diff = mastering_transfer.get_top_10_diff_string(predicted_params, ito_predicted_params) if ito_predicted_params is not None else "ITO not performed"
|
135 |
+
|
136 |
+
# return "output_mastered_yt.wav", "ito_output_mastered_yt.wav" if ito_output_audio is not None else None, param_output, ito_param_output, top_10_diff, ito_log
|
137 |
+
|
138 |
+
|
139 |
+
# with gr.Blocks() as demo:
|
140 |
+
# gr.Markdown("# Mastering Style Transfer Demo")
|
141 |
+
|
142 |
+
# with gr.Tab("Upload Audio"):
|
143 |
+
# input_audio = gr.Audio(label="Input Audio")
|
144 |
+
# reference_audio = gr.Audio(label="Reference Audio")
|
145 |
+
# perform_ito = gr.Checkbox(label="Perform ITO")
|
146 |
+
# with gr.Column(visible=False) as ito_options:
|
147 |
+
# use_same_reference = gr.Checkbox(label="Use same reference audio for ITO", value=True)
|
148 |
+
# ito_reference_audio = gr.Audio(label="ITO Reference Audio", visible=False)
|
149 |
|
150 |
+
# def update_ito_options(perform_ito):
|
151 |
+
# return gr.Column.update(visible=perform_ito)
|
152 |
|
153 |
+
# def update_ito_reference(use_same):
|
154 |
+
# return gr.Audio.update(visible=not use_same)
|
155 |
|
156 |
+
# perform_ito.change(fn=update_ito_options, inputs=perform_ito, outputs=ito_options)
|
157 |
+
# use_same_reference.change(fn=update_ito_reference, inputs=use_same_reference, outputs=ito_reference_audio)
|
158 |
|
159 |
+
# submit_button = gr.Button("Process")
|
160 |
+
# output_audio = gr.Audio(label="Output Audio")
|
161 |
+
# ito_output_audio = gr.Audio(label="ITO Output Audio")
|
162 |
+
# param_output = gr.Textbox(label="Predicted Parameters", lines=10)
|
163 |
+
# ito_param_output = gr.Textbox(label="ITO Predicted Parameters", lines=10)
|
164 |
+
# top_10_diff = gr.Textbox(label="Top 10 Parameter Differences", lines=10)
|
165 |
+
# ito_log = gr.Textbox(label="ITO Log", lines=20)
|
166 |
+
|
167 |
+
# submit_button.click(
|
168 |
+
# process_with_ito,
|
169 |
+
# inputs=[input_audio, reference_audio, perform_ito, use_same_reference, ito_reference_audio],
|
170 |
+
# outputs=[output_audio, ito_output_audio, param_output, ito_param_output, top_10_diff, ito_log]
|
171 |
+
# )
|
172 |
+
|
173 |
+
# with gr.Tab("YouTube URLs"):
|
174 |
+
# input_url = gr.Textbox(label="Input YouTube URL")
|
175 |
+
# reference_url = gr.Textbox(label="Reference YouTube URL")
|
176 |
+
# perform_ito_yt = gr.Checkbox(label="Perform ITO")
|
177 |
+
# with gr.Column(visible=False) as ito_options_yt:
|
178 |
+
# use_same_reference_yt = gr.Checkbox(label="Use same reference audio for ITO", value=True)
|
179 |
+
# ito_reference_url = gr.Textbox(label="ITO Reference YouTube URL", visible=False)
|
180 |
+
|
181 |
+
# def update_ito_options_yt(perform_ito):
|
182 |
+
# return gr.Column.update(visible=perform_ito)
|
183 |
+
|
184 |
+
# def update_ito_reference_yt(use_same):
|
185 |
+
# return gr.Textbox.update(visible=not use_same)
|
186 |
+
|
187 |
+
# perform_ito_yt.change(fn=update_ito_options_yt, inputs=perform_ito_yt, outputs=ito_options_yt)
|
188 |
+
# use_same_reference_yt.change(fn=update_ito_reference_yt, inputs=use_same_reference_yt, outputs=ito_reference_url)
|
189 |
+
|
190 |
+
# submit_button_yt = gr.Button("Process")
|
191 |
+
# output_audio_yt = gr.Audio(label="Output Audio")
|
192 |
+
# ito_output_audio_yt = gr.Audio(label="ITO Output Audio")
|
193 |
+
# param_output_yt = gr.Textbox(label="Predicted Parameters", lines=10)
|
194 |
+
# ito_param_output_yt = gr.Textbox(label="ITO Predicted Parameters", lines=10)
|
195 |
+
# top_10_diff_yt = gr.Textbox(label="Top 10 Parameter Differences", lines=10)
|
196 |
+
# ito_log_yt = gr.Textbox(label="ITO Log", lines=20)
|
197 |
+
|
198 |
+
# submit_button_yt.click(
|
199 |
+
# process_youtube_with_ito,
|
200 |
+
# inputs=[input_url, reference_url, perform_ito_yt, use_same_reference_yt, ito_reference_url],
|
201 |
+
# outputs=[output_audio_yt, ito_output_audio_yt, param_output_yt, ito_param_output_yt, top_10_diff_yt, ito_log_yt]
|
202 |
+
# )
|
203 |
|
204 |
+
# demo.launch()
|
inference.py
CHANGED
@@ -60,59 +60,66 @@ class MasteringStyleTransfer:
|
|
60 |
predicted_params = self.mastering_converter.get_last_predicted_params()
|
61 |
return output_audio, predicted_params
|
62 |
|
63 |
-
|
64 |
-
|
65 |
-
|
66 |
-
|
67 |
-
|
68 |
-
|
69 |
-
|
70 |
-
|
71 |
-
|
72 |
-
|
73 |
-
|
74 |
-
|
75 |
-
|
76 |
-
|
77 |
-
|
78 |
-
|
79 |
-
|
80 |
-
|
81 |
-
|
82 |
-
|
83 |
-
|
84 |
-
|
85 |
-
|
86 |
-
|
87 |
-
|
88 |
-
|
89 |
-
|
90 |
-
|
91 |
-
|
92 |
-
|
93 |
-
|
94 |
-
|
95 |
-
|
96 |
-
|
97 |
-
|
98 |
-
|
99 |
-
|
100 |
-
|
101 |
-
|
102 |
-
|
103 |
-
|
104 |
-
|
|
|
|
|
105 |
|
106 |
-
|
|
|
|
|
|
|
|
|
|
|
107 |
|
108 |
-
|
109 |
-
|
110 |
-
|
111 |
|
112 |
-
|
113 |
-
|
114 |
|
115 |
-
|
116 |
|
117 |
def preprocess_audio(self, audio, target_sample_rate=44100):
|
118 |
sample_rate, data = audio
|
|
|
60 |
predicted_params = self.mastering_converter.get_last_predicted_params()
|
61 |
return output_audio, predicted_params
|
62 |
|
63 |
+
def inference_time_optimization(self, input_tensor, reference_tensor, ito_config, initial_reference_feature):
    """Optimize a reference embedding at inference time and keep the best step.

    Starting from ``initial_reference_feature``, repeatedly runs
    ``self.mastering_converter`` on ``input_tensor``, scores the result
    against ``reference_tensor`` with ``AudioFeatureLoss`` and updates the
    embedding with the configured optimizer. The step with the lowest total
    loss is remembered and returned.

    Args:
        input_tensor: Audio passed to ``self.mastering_converter`` each step.
        reference_tensor: Target passed to the loss each step.
        ito_config: Dict with keys ``'optimizer'`` (name of a class in
            ``torch.optim``), ``'learning_rate'``, ``'num_steps'``,
            ``'af_weights'`` and ``'sample_rate'``.
        initial_reference_feature: Starting embedding. It is copied, so the
            caller's tensor is not modified by the optimization.

    Returns:
        A 5-tuple ``(best_output, best_params, best_embedding, best_step,
        log_text)`` where ``best_step`` is 1-based and ``log_text`` is the
        per-step log joined with newlines. The first three entries are
        ``None`` if no step produced a loss below infinity (e.g. zero steps).
    """
    # nn.Parameter shares storage with the tensor it wraps, so optimizer
    # steps would otherwise overwrite the caller's embedding in place.
    fit_embedding = torch.nn.Parameter(initial_reference_feature.detach().clone())
    optimizer_cls = getattr(torch.optim, ito_config['optimizer'])
    optimizer = optimizer_cls([fit_embedding], lr=ito_config['learning_rate'])

    af_loss = AudioFeatureLoss(
        weights=ito_config['af_weights'],
        sample_rate=ito_config['sample_rate'],
        stem_separation=False,
        use_clap=False
    )

    min_loss = float('inf')
    min_loss_step = 0
    min_loss_output = None
    min_loss_params = None
    min_loss_embedding = None

    loss_history = []
    divergence_counter = 0
    ito_log = []

    # UI sliders may deliver the step count as a float; range() needs an int.
    for step in range(int(ito_config['num_steps'])):
        optimizer.zero_grad()

        output_audio = self.mastering_converter(input_tensor, fit_embedding)
        current_params = self.mastering_converter.get_last_predicted_params()

        losses = af_loss(output_audio, reference_tensor)
        total_loss = sum(losses.values())

        # Extract the scalar once; it is reused for history, comparison and logging.
        loss_value = total_loss.item()
        loss_history.append(loss_value)

        if loss_value < min_loss:
            min_loss = loss_value
            min_loss_step = step
            min_loss_output = output_audio.detach()
            min_loss_params = current_params
            min_loss_embedding = fit_embedding.detach().clone()

        # Check for divergence: count consecutive steps whose loss is worse
        # than the loss recorded ten steps earlier.
        if len(loss_history) > 10 and loss_value > loss_history[-11]:
            divergence_counter += 1
        else:
            divergence_counter = 0

        # Log top 10 parameter differences relative to the first step.
        # NOTE(review): if get_last_predicted_params() returns the same
        # mutable object on every call, initial_params will alias the latest
        # values — confirm it returns a fresh object.
        if step == 0:
            initial_params = current_params
        top_10_diff = self.get_top_10_diff_string(initial_params, current_params)
        log_entry = f"Step {step + 1}, Loss: {loss_value:.4f}\n{top_10_diff}\n"
        ito_log.append(log_entry)

        if divergence_counter >= 10:
            print(f"Optimization stopped early due to divergence at step {step}")
            break

        total_loss.backward()
        optimizer.step()

    return min_loss_output, min_loss_params, min_loss_embedding, min_loss_step + 1, "\n".join(ito_log)
|
123 |
|
124 |
def preprocess_audio(self, audio, target_sample_rate=44100):
|
125 |
sample_rate, data = audio
|