Spaces: Running on Zero
Nithya committed · Commit e0d48d1 · 1 Parent(s): 017b2a5
added other functionalities to the interface
app.py CHANGED
@@ -35,9 +35,11 @@ from gamadhani.utils.generate_utils import load_pitch_fns, load_audio_fns
 import gamadhani.utils.pitch_to_audio_utils as p2a
 from gamadhani.utils.utils import get_device

+import copy

 logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s', force=True)
-
+pitch_paths = {'Diffusion Pitch Generator': 'models/diffusion_pitch/'}
+model_loaded = None
 audio_path = 'models/pitch_to_audio/'
 device = get_device()

@@ -96,7 +98,15 @@ def generate_pitch_reinterp(pitch, pitch_model, invert_pitch_fn, num_samples, nu
     noisy_pitch = torch.Tensor(pitch[:, :, -1200:]).to(pitch_model.device) + (torch.normal(mean=0.0, std=noise_std*torch.ones((1200)))).to(pitch_model.device)
     noisy_pitch = torch.clamp(noisy_pitch, -5.19, 5.19) # clipping the pitch values to be within the range of the model
     samples = pitch_model.sample_sdedit(noisy_pitch, num_samples, num_steps)
-    inverted_pitches =
+    inverted_pitches = invert_pitch_fn(f0=samples.detach().cpu().numpy()[0]).flatten() # pitch values in Hz
+
+    return samples, inverted_pitches
+
+def generate_pitch_response(pitch, pitch_model, invert_pitch_fn, num_samples, num_steps):
+    '''Generate pitch values for the call and response task'''
+    pitch = torch.Tensor(pitch[:, :, -400:]).to(pitch_model.device)
+    samples = pitch_model.sample_fn(num_samples, num_steps, prime=pitch)
+    inverted_pitches = invert_pitch_fn(f0=samples.detach().cpu().numpy()[0]).flatten() # pitch values in Hz

     return samples, inverted_pitches

@@ -109,11 +119,16 @@ def generate_audio(audio_model, f0s, invert_audio_fn, singers=[3], num_steps=100
     return audio

 @spaces.GPU(duration=150)
-def generate(pitch, num_samples=1, num_steps=100, singers=[3], outfolder='temp', audio_seq_len=750, pitch_qt=None ):
+def generate(pitch, num_samples=1, num_steps=100, singers=[3], outfolder='temp', audio_seq_len=750, pitch_qt=None, type='response', invert_pitch_fn=None):

     logging.log(logging.INFO, 'Generate function')
     logging.log(logging.INFO, 'Generating pitch')
-
+    if type == 'response':
+        pitch, inverted_pitch = generate_pitch_response(pitch, pitch_model, invert_pitch_fn, num_samples=num_samples, num_steps=100)
+    elif type == 'reinterp':
+        pitch, inverted_pitch = generate_pitch_reinterp(pitch, pitch_model, invert_pitch_fn, num_samples=num_samples, num_steps=100)
+    else:
+        raise ValueError(f'Invalid type: {type}')
     if pitch_qt is not None:
         # if there is not pitch quantile transformer, undo the default quantile transformation that occurs
         def undo_qt(x, min_clip=200):
@@ -129,40 +144,54 @@ def generate(pitch, num_samples=1, num_steps=100, singers=[3], outfolder='temp',
     audio = generate_audio(audio_model, interpolated_pitch, invert_audio_fn, singers=singers, num_steps=100)
     audio = audio.detach().cpu().numpy()
     pitch = pitch.detach().cpu().numpy()
-    pitch_vals = np.where(pitch[0][:, 0] == 0, np.nan, pitch[0].flatten())
-
     # generate plot of model output to display on interface
+    pdb.set_trace()
     model_output_plot = plt.figure()
-
+    inverted_pitch = np.where(inverted_pitch == 0, np.nan, inverted_pitch)
+    plt.plot(inverted_pitch, figure=model_output_plot, label='Model Output')
     plt.close(model_output_plot)
-    return (16000, audio[0]), model_output_plot
+    return (16000, audio[0]), model_output_plot # return audio and plot

-#
-pitch_model, pitch_qt, pitch_task_fn, invert_pitch_fn, _ = load_pitch_fns(
-    os.path.join(pitch_path, 'last.ckpt'), \
-    model_type = 'diffusion', \
-    config_path = os.path.join(pitch_path, 'config.gin'), \
-    qt_path = os.path.join(pitch_path, 'qt.joblib'), \
-    device = device
-)
+pitch_model, pitch_qt, pitch_task_fn, invert_pitch_fn = None, None, None, None # initialize pitch model based on user preference
 audio_model, audio_qt, audio_seq_len, invert_audio_fn = load_audio_fns(
     os.path.join(audio_path, 'last.ckpt'),
     qt_path = os.path.join(audio_path, 'qt.joblib'),
     config_path = os.path.join(audio_path, 'config.gin'),
     device = device
 )
-partial_generate = partial(generate, num_samples=1, num_steps=100, singers=[3], outfolder=None, pitch_qt=pitch_qt) # generate function with default arguments

-
-def
-global
-
+
+def load_pitch_model(model_selection):
+    global device
+    pitch_path = pitch_paths[model_selection]
+    pitch_model, pitch_qt, pitch_task_fn, invert_pitch_fn, _ = load_pitch_fns(
+        os.path.join(pitch_path, 'last.ckpt'), \
+        model_type = 'diffusion', \
+        config_path = os.path.join(pitch_path, 'config.gin'), \
+        qt_path = os.path.join(pitch_path, 'qt.joblib'), \
+        device = device
+    )
+    return pitch_model, pitch_qt, pitch_task_fn, invert_pitch_fn
+
+def container_generate(model_selection, task_selection, audio):
+    global pitch_model, pitch_qt, pitch_task_fn, invert_pitch_fn, model_loaded
+    # load pitch model
+    if model_loaded is None or model_loaded != model_selection:
+        pitch_model, pitch_qt, pitch_task_fn, invert_pitch_fn = load_pitch_model(model_selection)
+        model_loaded = model_selection
+    else:
+        logging.log(logging.INFO, f'using existing model: {model_selection}')
+
+    # extract pitch from input
     if audio is None:
         return None, None
     sr, audio = audio
-    if len(audio) < 12*sr:
+    if len(audio) < 12*sr and task_selection == 'Melodic Reinterpretation':
+        # make sure the audio is at least 12 s long
         audio = np.pad(audio, (0, 12*sr - len(audio)), mode='constant')
-
+    if len(audio) < 4*sr and task_selection == 'Call and Response':
+        # make sure the audio is at least 4 s long
+        audio = np.pad(audio, (4*sr - len(audio), 0), mode='constant')
     audio = audio.astype(np.float32)
     audio /= np.max(np.abs(audio))
     audio = librosa.resample(audio, orig_sr=sr, target_sr=16000) # convert only last 4 s
@@ -186,55 +215,73 @@ def set_guide_and_generate(audio):
     f0 = f0.reshape(1, 1, -1)
     f0 = torch.tensor(f0).to(pitch_model.device).float()
     logging.log(logging.INFO, 'Calling generate function')
-    audio, pitch, _ = partial_generate(f0)
     mic_f0 = np.where(mic_f0 == 0, np.nan, mic_f0)
     # plot user input
     user_input_plot = plt.figure()
     plt.plot(np.arange(0, len(mic_f0)), mic_f0, label='User Input', figure=user_input_plot)
     plt.close(user_input_plot)
-
-
+
+    if task_selection == 'Call and Response':
+        partial_generate = partial(generate, num_samples=1, num_steps=100, singers=[3], outfolder=None, pitch_qt=pitch_qt, type='response', invert_pitch_fn=invert_pitch_fn)
+    else:
+        partial_generate = partial(generate, num_samples=1, num_steps=100, singers=[3], outfolder=None, pitch_qt=pitch_qt, type='reinterp', invert_pitch_fn=invert_pitch_fn)
+    audio, output_plot = partial_generate(f0)
+    return audio, user_input_plot, output_plot

-
-<style>
+css = """
 .center-text {
     text-align: center;
 }
-
-
-
-
-
-
-
-    """)
+.justify-text {
+    text-align: justify;
+}
+"""
+
+with gr.Blocks(css=css) as demo:
+    gr.Markdown("# GaMaDHaNi: Hierarchical Generative Modeling of Melodic Vocal Contours in Hindustani Classical Music", elem_classes="center-text")
+    gr.Markdown("### Abstract", elem_classes="center-text")
+    gr.Markdown("""
+    Hindustani music is a performance-driven oral tradition that exhibits the rendition of rich melodic patterns. In this paper, we focus on generative modeling of singers' vocal melodies extracted from audio recordings, as the voice is musically prominent within the tradition. Prior generative work in Hindustani music models melodies as coarse discrete symbols which fails to capture the rich expressive melodic intricacies of singing. Thus, we propose to use a finely quantized pitch contour, as an intermediate representation for hierarchical audio modeling. We propose GaMaDHaNi, a modular two-level hierarchy, consisting of a generative model on pitch contours, and a pitch contour to audio synthesis model. We compare our approach to non-hierarchical audio models and hierarchical models that use a self-supervised intermediate representation, through a listening test and qualitative analysis. We also evaluate audio model's ability to faithfully represent the pitch contour input using Pearson correlation coefficient. By using pitch contours as an intermediate representation, we show that our model may be better equipped to listen and respond to musicians in a human-AI collaborative setting by highlighting two potential interaction use cases (1) primed generation, and (2) coarse pitch conditioning.
+    """, elem_classes="justify-text")
+    gr.Markdown("""
+    Read more about the project [here](https://arxiv.org/pdf/2408.12658) <br>
+    Listen to the samples [here](https://snnithya.github.io/gamadhani-samples) <br>
+    """, elem_classes="center-text")
     with gr.Column():
         gr.Markdown("""
         ## Instructions
         In this demo you can interact with the model in two ways:
-        1. **Call and response**: The model will try to continue the idea that you input. This is similar to
-        2. **Melodic reinterpretation**: Akin to the idea of
+        1. **Call and response**: The model will try to continue the idea that you input. This is similar to 'primed generation' discussed in the paper. The last 4 s of the audio will be considered as a 'prime' for the model to continue. <br><br>
+        2. **Melodic reinterpretation**: Akin to the idea of 'coarse pitch conditioning' presented in the paper, you can input a pitch contour and the model will generate audio that is similar to but not exactly the same. <br><br>
         ### Upload an audio file or record your voice to get started!
-        """
+        """)
         gr.Markdown("""
         This is still a work in progress, so please feel free to share any weird or interesting examples, we would love to hear them! Contact us at [snnithya.mit.edu](mailto:snnithya.mit.edu).
         """)
         gr.Markdown("""
-
+        *Note: If you see an error message on the screen after clicking 'Submit', please wait for five seconds and click 'Submit' again.*
         """)
-
-
-
+        gr.Markdown("""
+        *Another note: The model may take around 40-60s to generate an output. Hang tight! But if you're left hanging for too long, let me know!*
+        """)
+        gr.Markdown("""
+        *Last note, I promise: There are some example audio samples at the bottom of the page. You can start with those if you'd like!*
+        """)
+        model_dropdown = gr.Dropdown(["Diffusion Pitch Generator"], label="Select a model type")
+        task_dropdown = gr.Dropdown(label="Select a task", choices=["Call and Response", "Melodic Reinterpretation"])
+        sbmt = gr.Button()
+    with gr.Row(equal_height=True):
         with gr.Column():
             audio = gr.Audio(label="Input")
-            sbmt = gr.Button()
-            with gr.Accordion("View Pitch Plot"):
-                user_input = gr.Plot(label="User Input")
         with gr.Column():
             generated_audio = gr.Audio(label="Generated Audio")
+    with gr.Row():
+        with gr.Column():
+            with gr.Accordion("View Pitch Plot"):
+                user_input = gr.Plot(label="User Input")
+        with gr.Column():
             with gr.Accordion("View Pitch Plot"):
                 generated_pitch = gr.Plot(label="Generated Pitch")
-    example_description = gr.Textbox(label="Example Description", interactive=False)
     examples = gr.Examples(
         examples=[
             ["examples/ex1.wav"],
@@ -245,8 +292,7 @@ with gr.Blocks(theme=gr.themes.Glass()) as demo:
         ],
         inputs=audio
     )
-
-    sbmt.click(set_guide_and_generate, inputs=[audio], outputs=[generated_audio, user_input, generated_pitch])
+    sbmt.click(container_generate, inputs=[model_dropdown, task_dropdown, audio], outputs=[generated_audio, user_input, generated_pitch])

 def main(argv):

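The abstract text added in this commit describes GaMaDHaNi as a two-level hierarchy: a generative model over fine-grained pitch contours, followed by a pitch-contour-to-audio synthesis model. The sketch below only illustrates that data flow; DummyPitchGenerator and DummyVocoder are hypothetical stand-ins and are not part of app.py or the gamadhani package.

# Illustrative sketch of the two-level hierarchy described in the abstract above.
# DummyPitchGenerator and DummyVocoder are hypothetical stand-ins, not GaMaDHaNi code.
import numpy as np

class DummyPitchGenerator:
    """Level 1: sample a fine-grained pitch contour (Hz); here just a random walk."""
    def sample(self, num_frames, prime=None):
        start = prime[-1] if prime is not None and len(prime) > 0 else 220.0
        steps = np.random.normal(0.0, 1.0, num_frames).cumsum()
        return start * 2.0 ** (steps / 120.0)  # wander in ~10-cent steps around the start pitch

class DummyVocoder:
    """Level 2: render a pitch contour to audio; here a naive sine synthesizer."""
    def __init__(self, sr=16000, hop=160):
        self.sr, self.hop = sr, hop
    def synthesize(self, f0):
        f0_per_sample = np.repeat(f0, self.hop)                  # hold each frame for `hop` samples
        phase = 2.0 * np.pi * np.cumsum(f0_per_sample) / self.sr
        return 0.1 * np.sin(phase).astype(np.float32)

if __name__ == '__main__':
    pitch_model, vocoder = DummyPitchGenerator(), DummyVocoder()
    contour = pitch_model.sample(num_frames=400)   # ~4 s of pitch at 100 frames/s
    audio = vocoder.synthesize(contour)
    print(contour.shape, audio.shape)              # (400,) and (64000,)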
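The interface instructions added above describe two interaction modes, and container_generate maps them onto different minimum input lengths and sampling types ('response' vs. 'reinterp'). The sketch below mirrors only that padding and dispatch logic, assuming 16 kHz mono input; prepare_input and fake_generate are hypothetical names, and the real app calls the GaMaDHaNi pitch and audio models instead.

# Illustrative sketch only; mirrors the padding and task dispatch in container_generate above.
# prepare_input and fake_generate are hypothetical stand-ins, not functions from app.py.
import numpy as np

def prepare_input(audio, sr, task):
    """Pad and normalize a recording the way the demo prepares it before pitch extraction."""
    if task == 'Melodic Reinterpretation' and len(audio) < 12 * sr:
        # reinterpretation works on a 12 s window, so right-pad short clips with silence
        audio = np.pad(audio, (0, 12 * sr - len(audio)), mode='constant')
    elif task == 'Call and Response' and len(audio) < 4 * sr:
        # call and response primes on the last 4 s, so left-pad short clips with silence
        audio = np.pad(audio, (4 * sr - len(audio), 0), mode='constant')
    audio = audio.astype(np.float32)
    audio /= np.max(np.abs(audio))  # peak-normalize
    return audio

def fake_generate(audio, task):
    # placeholder for generate(..., type='response') / generate(..., type='reinterp')
    mode = 'response' if task == 'Call and Response' else 'reinterp'
    print(f"would sample with type='{mode}' on {len(audio)} samples")
    return audio

if __name__ == '__main__':
    sr = 16000
    clip = np.random.uniform(-0.5, 0.5, 2 * sr)  # a 2 s dummy recording
    for task in ('Call and Response', 'Melodic Reinterpretation'):
        out = fake_generate(prepare_input(clip, sr, task), task)
        print(task, '->', len(out) / sr, 's of padded input')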