Nithya committed
Commit e0d48d1 · 1 Parent(s): 017b2a5

added other functionalities to the interface

Files changed (1)
  1. app.py +95 -49
app.py CHANGED
@@ -35,9 +35,11 @@ from gamadhani.utils.generate_utils import load_pitch_fns, load_audio_fns
 import gamadhani.utils.pitch_to_audio_utils as p2a
 from gamadhani.utils.utils import get_device

+import copy

 logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s', force=True)
-pitch_path = 'models/diffusion_pitch/'
+pitch_paths = {'Diffusion Pitch Generator': 'models/diffusion_pitch/'}
+model_loaded = None
 audio_path = 'models/pitch_to_audio/'
 device = get_device()

@@ -96,7 +98,15 @@ def generate_pitch_reinterp(pitch, pitch_model, invert_pitch_fn, num_samples, nu
     noisy_pitch = torch.Tensor(pitch[:, :, -1200:]).to(pitch_model.device) + (torch.normal(mean=0.0, std=noise_std*torch.ones((1200)))).to(pitch_model.device)
     noisy_pitch = torch.clamp(noisy_pitch, -5.19, 5.19) # clipping the pitch values to be within the range of the model
     samples = pitch_model.sample_sdedit(noisy_pitch, num_samples, num_steps)
-    inverted_pitches = [invert_pitch_fn(f0=samples.detach().cpu().numpy()[0])[0]] # pitch values in Hz
+    inverted_pitches = invert_pitch_fn(f0=samples.detach().cpu().numpy()[0]).flatten() # pitch values in Hz
+
+    return samples, inverted_pitches
+
+def generate_pitch_response(pitch, pitch_model, invert_pitch_fn, num_samples, num_steps):
+    '''Generate pitch values for the call and response task'''
+    pitch = torch.Tensor(pitch[:, :, -400:]).to(pitch_model.device)
+    samples = pitch_model.sample_fn(num_samples, num_steps, prime=pitch)
+    inverted_pitches = invert_pitch_fn(f0=samples.detach().cpu().numpy()[0]).flatten() # pitch values in Hz

     return samples, inverted_pitches

@@ -109,11 +119,16 @@ def generate_audio(audio_model, f0s, invert_audio_fn, singers=[3], num_steps=100
     return audio

 @spaces.GPU(duration=150)
-def generate(pitch, num_samples=1, num_steps=100, singers=[3], outfolder='temp', audio_seq_len=750, pitch_qt=None ):
+def generate(pitch, num_samples=1, num_steps=100, singers=[3], outfolder='temp', audio_seq_len=750, pitch_qt=None, type='response', invert_pitch_fn=None):

     logging.log(logging.INFO, 'Generate function')
     logging.log(logging.INFO, 'Generating pitch')
-    pitch, inverted_pitch = generate_pitch_reinterp(pitch, pitch_model, invert_pitch_fn, num_samples=num_samples, num_steps=100)
+    if type == 'response':
+        pitch, inverted_pitch = generate_pitch_response(pitch, pitch_model, invert_pitch_fn, num_samples=num_samples, num_steps=100)
+    elif type == 'reinterp':
+        pitch, inverted_pitch = generate_pitch_reinterp(pitch, pitch_model, invert_pitch_fn, num_samples=num_samples, num_steps=100)
+    else:
+        raise ValueError(f'Invalid type: {type}')
     if pitch_qt is not None:
         # if there is not pitch quantile transformer, undo the default quantile transformation that occurs
         def undo_qt(x, min_clip=200):
@@ -129,40 +144,54 @@ def generate(pitch, num_samples=1, num_steps=100, singers=[3], outfolder='temp',
     audio = generate_audio(audio_model, interpolated_pitch, invert_audio_fn, singers=singers, num_steps=100)
     audio = audio.detach().cpu().numpy()
     pitch = pitch.detach().cpu().numpy()
-    pitch_vals = np.where(pitch[0][:, 0] == 0, np.nan, pitch[0].flatten())
-
     # generate plot of model output to display on interface
+    pdb.set_trace()
     model_output_plot = plt.figure()
-    plt.plot(pitch_vals, figure=model_output_plot, label='Model Output')
+    inverted_pitch = np.where(inverted_pitch == 0, np.nan, inverted_pitch)
+    plt.plot(inverted_pitch, figure=model_output_plot, label='Model Output')
     plt.close(model_output_plot)
-    return (16000, audio[0]), model_output_plot, pitch_vals
+    return (16000, audio[0]), model_output_plot # return audio and plot

-# pdb.set_trace()
-pitch_model, pitch_qt, pitch_task_fn, invert_pitch_fn, _ = load_pitch_fns(
-    os.path.join(pitch_path, 'last.ckpt'), \
-    model_type = 'diffusion', \
-    config_path = os.path.join(pitch_path, 'config.gin'), \
-    qt_path = os.path.join(pitch_path, 'qt.joblib'), \
-    device = device
-)
+pitch_model, pitch_qt, pitch_task_fn, invert_pitch_fn = None, None, None, None # initialize pitch model based on user preference
 audio_model, audio_qt, audio_seq_len, invert_audio_fn = load_audio_fns(
     os.path.join(audio_path, 'last.ckpt'),
     qt_path = os.path.join(audio_path, 'qt.joblib'),
     config_path = os.path.join(audio_path, 'config.gin'),
     device = device
 )
-partial_generate = partial(generate, num_samples=1, num_steps=100, singers=[3], outfolder=None, pitch_qt=pitch_qt) # generate function with default arguments

-@spaces.GPU(duration=150)
-def set_guide_and_generate(audio):
-    global selected_prime, pitch_task_fn
-
+
+def load_pitch_model(model_selection):
+    global device
+    pitch_path = pitch_paths[model_selection]
+    pitch_model, pitch_qt, pitch_task_fn, invert_pitch_fn, _ = load_pitch_fns(
+        os.path.join(pitch_path, 'last.ckpt'), \
+        model_type = 'diffusion', \
+        config_path = os.path.join(pitch_path, 'config.gin'), \
+        qt_path = os.path.join(pitch_path, 'qt.joblib'), \
+        device = device
+    )
+    return pitch_model, pitch_qt, pitch_task_fn, invert_pitch_fn
+
+def container_generate(model_selection, task_selection, audio):
+    global pitch_model, pitch_qt, pitch_task_fn, invert_pitch_fn, model_loaded
+    # load pitch model
+    if model_loaded is None or model_loaded != model_selection:
+        pitch_model, pitch_qt, pitch_task_fn, invert_pitch_fn = load_pitch_model(model_selection)
+        model_loaded = model_selection
+    else:
+        logging.log(logging.INFO, f'using existing model: {model_selection}')
+
+    # extract pitch from input
     if audio is None:
         return None, None
     sr, audio = audio
-    if len(audio) < 12*sr:
+    if len(audio) < 12*sr and task_selection == 'Melodic Reinterpretation':
+        # make sure the audio is at least 12 s long
         audio = np.pad(audio, (0, 12*sr - len(audio)), mode='constant')
-
+    if len(audio) < 4*sr and task_selection == 'Call and Response':
+        # make sure the audio is at least 4 s long
+        audio = np.pad(audio, (4*sr - len(audio), 0), mode='constant')
     audio = audio.astype(np.float32)
     audio /= np.max(np.abs(audio))
     audio = librosa.resample(audio, orig_sr=sr, target_sr=16000) # convert only last 4 s
@@ -186,55 +215,73 @@ def set_guide_and_generate(audio):
     f0 = f0.reshape(1, 1, -1)
     f0 = torch.tensor(f0).to(pitch_model.device).float()
     logging.log(logging.INFO, 'Calling generate function')
-    audio, pitch, _ = partial_generate(f0)
     mic_f0 = np.where(mic_f0 == 0, np.nan, mic_f0)
     # plot user input
     user_input_plot = plt.figure()
     plt.plot(np.arange(0, len(mic_f0)), mic_f0, label='User Input', figure=user_input_plot)
     plt.close(user_input_plot)
-    return audio, user_input_plot, pitch
-
+
+    if task_selection == 'Call and Response':
+        partial_generate = partial(generate, num_samples=1, num_steps=100, singers=[3], outfolder=None, pitch_qt=pitch_qt, type='response', invert_pitch_fn=invert_pitch_fn)
+    else:
+        partial_generate = partial(generate, num_samples=1, num_steps=100, singers=[3], outfolder=None, pitch_qt=pitch_qt, type='reinterp', invert_pitch_fn=invert_pitch_fn)
+    audio, output_plot = partial_generate(f0)
+    return audio, user_input_plot, output_plot

-gr.HTML("""
-<style>
+css = """
 .center-text {
     text-align: center;
 }
-</style>
-""")
-with gr.Blocks(theme=gr.themes.Glass()) as demo:
-    demo.title("GaMaDHaNi: Hierarchical Generative Modeling of Melodic Vocal Contours in Hindustani Classical Music")
-    demo.description("""
-    :book: Read more about the project [here](https://arxiv.org/pdf/2408.12658) <br>
-    :samples: Listen to the samples [here](https://snnithya.github.io/gamadhani-samples) <br>
-    """)
+.justify-text {
+    text-align: justify;
+}
+"""
+
+with gr.Blocks(css=css) as demo:
+    gr.Markdown("# GaMaDHaNi: Hierarchical Generative Modeling of Melodic Vocal Contours in Hindustani Classical Music", elem_classes="center-text")
+    gr.Markdown("### Abstract", elem_classes="center-text")
+    gr.Markdown("""
+    Hindustani music is a performance-driven oral tradition that exhibits the rendition of rich melodic patterns. In this paper, we focus on generative modeling of singers' vocal melodies extracted from audio recordings, as the voice is musically prominent within the tradition. Prior generative work in Hindustani music models melodies as coarse discrete symbols which fails to capture the rich expressive melodic intricacies of singing. Thus, we propose to use a finely quantized pitch contour, as an intermediate representation for hierarchical audio modeling. We propose GaMaDHaNi, a modular two-level hierarchy, consisting of a generative model on pitch contours, and a pitch contour to audio synthesis model. We compare our approach to non-hierarchical audio models and hierarchical models that use a self-supervised intermediate representation, through a listening test and qualitative analysis. We also evaluate audio model's ability to faithfully represent the pitch contour input using Pearson correlation coefficient. By using pitch contours as an intermediate representation, we show that our model may be better equipped to listen and respond to musicians in a human-AI collaborative setting by highlighting two potential interaction use cases (1) primed generation, and (2) coarse pitch conditioning.
+    """, elem_classes="justify-text")
+    gr.Markdown("""
+    📖 Read more about the project [here](https://arxiv.org/pdf/2408.12658) <br>
+    🎧 Listen to the samples [here](https://snnithya.github.io/gamadhani-samples) <br>
+    """, elem_classes="center-text")
     with gr.Column():
         gr.Markdown("""
         ## Instructions
         In this demo you can interact with the model in two ways:
-        1. **Call and response**: The model will try to continue the idea that you input. This is similar to `primed generation' discussed in the paper.
-        2. **Melodic reinterpretation**: Akin to the idea of `coarse pitch conditioning' presented in the paper, you can input a pitch contour and the model will generate audio that is similar to but not exactly the same. <br><br>
+        1. **Call and response**: The model will try to continue the idea that you input. This is similar to 'primed generation' discussed in the paper. The last 4 s of the audio will be considered as a 'prime' for the model to continue. <br><br>
+        2. **Melodic reinterpretation**: Akin to the idea of 'coarse pitch conditioning' presented in the paper, you can input a pitch contour and the model will generate audio that is similar to but not exactly the same. <br><br>
        ### Upload an audio file or record your voice to get started!
-        """, elem_classes="center-text")
+        """)
         gr.Markdown("""
         This is still a work in progress, so please feel free to share any weird or interesting examples, we would love to hear them! Contact us at [snnithya.mit.edu](mailto:snnithya.mit.edu).
         """)
         gr.Markdown("""
-        *Note: If you see an error message on the screen after clicking 'Submit', please wait for five seconds and click 'Submit' again.*
+        *Note: If you see an error message on the screen after clicking 'Submit', please wait for five seconds and click 'Submit' again.*
         """)
-
-    with gr.Row(equal_heights=True):
-
+        gr.Markdown("""
+        *Another note: The model may take around 40-60s to generate an output. Hang tight! But if you're left hanging for too long, let me know!*
+        """)
+        gr.Markdown("""
+        *Last note, I promise: There are some example audio samples at the bottom of the page. You can start with those if you'd like!*
+        """)
+    model_dropdown = gr.Dropdown(["Diffusion Pitch Generator"], label="Select a model type")
+    task_dropdown = gr.Dropdown(label="Select a task", choices=["Call and Response", "Melodic Reinterpretation"])
+    sbmt = gr.Button()
+    with gr.Row(equal_height=True):
         with gr.Column():
             audio = gr.Audio(label="Input")
-            sbmt = gr.Button()
-            with gr.Accordion("View Pitch Plot"):
-                user_input = gr.Plot(label="User Input")
         with gr.Column():
             generated_audio = gr.Audio(label="Generated Audio")
+    with gr.Row():
+        with gr.Column():
+            with gr.Accordion("View Pitch Plot"):
+                user_input = gr.Plot(label="User Input")
+        with gr.Column():
             with gr.Accordion("View Pitch Plot"):
                 generated_pitch = gr.Plot(label="Generated Pitch")
-    example_description = gr.Textbox(label="Example Description", interactive=False)
     examples = gr.Examples(
         examples=[
             ["examples/ex1.wav"],
@@ -245,8 +292,7 @@ with gr.Blocks(theme=gr.themes.Glass()) as demo:
         ],
         inputs=audio
     )
-
-    sbmt.click(set_guide_and_generate, inputs=[audio], outputs=[generated_audio, user_input, generated_pitch])
+    sbmt.click(container_generate, inputs=[model_dropdown, task_dropdown, audio], outputs=[generated_audio, user_input, generated_pitch])

 def main(argv):

298