Nithya committed on
Commit 3d6b478 • 1 Parent(s): 0821a2f

updated the models

.gitattributes CHANGED
@@ -5,3 +5,5 @@ models/pitch_to_audio/last.ckpt filter=lfs diff=lfs merge=lfs -text
 models/diffusion_pitch/qt.joblib filter=lfs diff=lfs merge=lfs -text
 models/pitch_to_audio/qt.joblib filter=lfs diff=lfs merge=lfs -text
 examples/** filter=lfs diff=lfs merge=lfs -text
+models/diffusion_pitch/model.ckpt filter=lfs diff=lfs merge=lfs -text
+models/transformer_pitch/model.ckpt filter=lfs diff=lfs merge=lfs -text
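The two new entries extend Git LFS tracking to the renamed diffusion checkpoint and the new transformer checkpoint. A minimal sketch of how such rules are usually produced (assuming the Git LFS CLI is installed; the paths are the ones added above):

    import subprocess

    # `git lfs track` appends the matching filter/diff/merge rules to .gitattributes
    for pattern in ("models/diffusion_pitch/model.ckpt", "models/transformer_pitch/model.ckpt"):
        subprocess.run(["git", "lfs", "track", pattern], check=True)

    # stage the updated rules so they are committed alongside the checkpoints
    subprocess.run(["git", "add", ".gitattributes"], check=True)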
app.py CHANGED
@@ -41,7 +41,10 @@ from gamadhani.utils.utils import get_device
 import copy
 
 logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s', force=True)
-pitch_paths = {'Diffusion Pitch Generator': 'models/diffusion_pitch/'}
+pitch_paths = {
+    'Diffusion Pitch Generator': ('diffusion', 'models/diffusion_pitch/'),
+    'Autoregressive Pitch Generator': ('transformer', 'models/transformer_pitch/')
+}
 model_loaded = None
 audio_path = 'models/pitch_to_audio/'
 device = get_device()
@@ -110,10 +113,13 @@ def generate_pitch_reinterp(pitch, pitch_model, invert_pitch_fn, num_samples, nu
 
     return samples, inverted_pitches
 
-def generate_pitch_response(pitch, pitch_model, invert_pitch_fn, num_samples, num_steps):
+def generate_pitch_response(pitch, pitch_model, invert_pitch_fn, num_samples, num_steps, model_type='diffusion'):
     '''Generate pitch values for the call and response task'''
     pitch = pitch[:, :, -400:] # consider only the last 4 s of the pitch contour
-    samples = pitch_model.sample_fn(num_samples, num_steps, prime=pitch)
+    if model_type == 'diffusion':
+        samples = pitch_model.sample_fn(num_samples, num_steps, prime=pitch)
+    else:
+        samples = pitch_model.sample_fn(batch_size=num_samples, seq_len=800, prime=pitch)
     inverted_pitches = invert_pitch_fn(f0=samples.clone().detach().cpu().numpy()[0]).flatten() # pitch values in Hz
 
     return samples, inverted_pitches
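The new model_type switch gives the two priors different sampling calls: the diffusion model takes a sample count and a number of denoising steps, while the autoregressive model is driven by batch size and a fixed 800-position output length, so num_steps is not used on that path. A hedged usage sketch, assuming pitch_model and invert_pitch_fn come from load_pitch_fns and pitch is a (1, 1, T) tensor already on the model's device:

    # call-and-response with the autoregressive prior; num_steps only matters for diffusion
    samples, pitch_hz = generate_pitch_response(
        pitch, pitch_model, invert_pitch_fn,
        num_samples=1, num_steps=100, model_type='transformer',
    )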
@@ -127,7 +133,7 @@ def generate_audio(audio_model, f0s, invert_audio_fn, singers=[3], num_steps=100
     return audio
 
 @spaces.GPU(duration=30)
-def generate(pitch, num_samples=1, num_steps=100, singers=[3], outfolder='temp', audio_seq_len=750, pitch_qt=None, type='response', invert_pitch_fn=None, t0=0.5):
+def generate(pitch, num_samples=1, num_steps=100, singers=[3], outfolder='temp', audio_seq_len=750, pitch_qt=None, type='response', invert_pitch_fn=None, t0=0.5, model_type='diffusion'):
     global pitch_model, audio_model
     # move the models to device
     pitch_model = pitch_model.to(device)
@@ -135,10 +141,11 @@ def generate(pitch, num_samples=1, num_steps=100, singers=[3], outfolder='temp',
     logging.log(logging.INFO, 'Generate function')
     # load pitch values onto GPU
     pitch = torch.tensor(pitch).float().unsqueeze(0).unsqueeze(0).to(device)
-    pitch_qt = p2a.GPUQuantileTransformer(pitch_qt, device=device)
+    if pitch_qt is not None:
+        pitch_qt = p2a.GPUQuantileTransformer(pitch_qt, device=device)
     logging.log(logging.INFO, 'Generating pitch')
     if type == 'response':
-        pitch, inverted_pitch = generate_pitch_response(pitch, pitch_model, invert_pitch_fn, num_samples=num_samples, num_steps=100)
+        pitch, inverted_pitch = generate_pitch_response(pitch, pitch_model, invert_pitch_fn, num_samples=num_samples, num_steps=100, model_type=model_type)
     elif type == 'reinterp':
         pitch, inverted_pitch = generate_pitch_reinterp(pitch, pitch_model, invert_pitch_fn, num_samples=num_samples, num_steps=100, t0=t0)
 
@@ -151,9 +158,10 @@ def generate(pitch, num_samples=1, num_steps=100, singers=[3], outfolder='temp',
         pitch = pitch_qt.inverse_transform(x).squeeze(0) # qt transform expects shape (bs, seq_len, 1)
         pitch = torch.round(pitch) # round to nearest integer, done in preprocessing of pitch contour fed into model
         pitch[pitch < 200] = np.nan
+        pitch = pitch.unsqueeze(0)
         return pitch
     pitch = undo_qt(pitch)
-    interpolated_pitch = p2a.interpolate_pitch(pitch=pitch.unsqueeze(0), audio_seq_len=audio_seq_len).squeeze(0) # interpolate pitch values to match the audio model's input size
+    interpolated_pitch = p2a.interpolate_pitch(pitch=pitch, audio_seq_len=audio_seq_len).squeeze(0) # interpolate pitch values to match the audio model's input size
     interpolated_pitch = torch.nan_to_num(interpolated_pitch, nan=196) # replace nan values with silent token
     interpolated_pitch = interpolated_pitch.squeeze(1) # to match input size by removing the extra dimension
     logging.log(logging.INFO, 'Generating audio')
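Guarding construction of the GPU quantile transformer matters because the transformer checkpoint is loaded with qt_path=None (see the load_pitch_fns hunk below), so pitch_qt can legitimately be None here. The earlier hunk only protects the wrapper construction; the diff does not show how the undo_qt step handles a missing transformer. A hypothetical pass-through guard could look like this:

    def maybe_inverse_transform(x, pitch_qt=None):
        # hypothetical helper: apply the inverse quantile transform when one is
        # available, otherwise return the values unchanged
        if pitch_qt is None:
            return x
        return pitch_qt.inverse_transform(x)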
@@ -178,12 +186,12 @@ audio_model, audio_qt, audio_seq_len, invert_audio_fn = load_audio_fns(
 
 def load_pitch_model(model_selection):
     global device
-    pitch_path = pitch_paths[model_selection]
+    model_type, pitch_path = pitch_paths[model_selection]
     pitch_model, pitch_qt, pitch_task_fn, invert_pitch_fn, _ = load_pitch_fns(
-        os.path.join(pitch_path, 'last.ckpt'), \
-        model_type = 'diffusion', \
+        os.path.join(pitch_path, 'model.ckpt'), \
+        model_type = model_type, \
         config_path = os.path.join(pitch_path, 'config.gin'), \
-        qt_path = os.path.join(pitch_path, 'qt.joblib'), \
+        qt_path = os.path.join(pitch_path, 'qt.joblib') if model_type == 'diffusion' else None, \
         device = 'cpu'
     )
     return pitch_model, pitch_qt, pitch_task_fn, invert_pitch_fn
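With the (model_type, path) table and the checkpoint rename, both generators go through the same loader; only the model type and the optional quantile transformer differ. A hedged usage sketch (return order as in the function above; on the transformer path the returned pitch_qt is expected to be None because qt_path is None):

    # load the autoregressive prior from models/transformer_pitch/
    pitch_model, pitch_qt, pitch_task_fn, invert_pitch_fn = load_pitch_model(
        'Autoregressive Pitch Generator'
    )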
@@ -239,9 +247,9 @@ def container_generate(model_selection, task_selection, audio, singer_id, t0):
     elif singer_id == 'Singer 2':
         singer = [27]
     if task_selection == 'Call and Response':
-        partial_generate = partial(generate, num_samples=1, num_steps=100, singers=singer, outfolder=None, pitch_qt=pitch_qt, type='response', invert_pitch_fn=invert_pitch_fn)
+        partial_generate = partial(generate, num_samples=1, num_steps=100, singers=singer, outfolder=None, pitch_qt=pitch_qt, type='response', invert_pitch_fn=invert_pitch_fn, model_type=model_selection)
     else:
-        partial_generate = partial(generate, num_samples=1, num_steps=100, singers=singer, outfolder=None, pitch_qt=pitch_qt, type='reinterp', invert_pitch_fn=invert_pitch_fn, t0=t0)
+        partial_generate = partial(generate, num_samples=1, num_steps=100, singers=singer, outfolder=None, pitch_qt=pitch_qt, type='reinterp', invert_pitch_fn=invert_pitch_fn, t0=t0, model_type=model_selection)
     audio, output_plot = partial_generate(f0)
     return audio, user_input_plot, output_plot
 
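Note that container_generate forwards the dropdown label itself (for example 'Diffusion Pitch Generator') as model_type, while generate_pitch_response compares model_type against the short string 'diffusion'. If the intent is to branch on the canonical type, one option, not part of this commit, is to resolve the label through pitch_paths first. A sketch assuming the surrounding function's singer, pitch_qt, invert_pitch_fn and the module's pitch_paths, generate, and functools.partial:

    # hypothetical: map the UI label to the canonical model type before building the partial
    model_type, _ = pitch_paths[model_selection]  # 'diffusion' or 'transformer'
    partial_generate = partial(
        generate, num_samples=1, num_steps=100, singers=singer, outfolder=None,
        pitch_qt=pitch_qt, type='response', invert_pitch_fn=invert_pitch_fn,
        model_type=model_type,
    )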
@@ -260,6 +268,13 @@ def toggle_visibility(selection):
         return gr.update(visible=True)
     else:
         return gr.update(visible=False)
+
+def toggle_options(selection, options = ['Call and Response', 'Melodic Reinterpretation']):
+    # limit the task choices offered for the current selection
+    if selection == "Melodic Reinterpretation":
+        return gr.update(choices=options)
+    else:
+        return gr.update(choices=options[:-1])
 
 with gr.Blocks(css=css) as demo:
     gr.Markdown("# GaMaDHaNi: Hierarchical Generative Modeling of Melodic Vocal Contours in Hindustani Classical Music", elem_classes="center-text")
@@ -291,8 +306,9 @@ with gr.Blocks(css=css) as demo:
     gr.Markdown("""
                 *Last note, I promise: There are some example audio samples at the bottom of the page. You can start with those if you'd like!*
                 """)
-    model_dropdown = gr.Dropdown(["Diffusion Pitch Generator"], label="Select a model type")
+    model_dropdown = gr.Dropdown(["Diffusion Pitch Generator", "Autoregressive Pitch Generator"], label="Select a model type")
     task_dropdown = gr.Dropdown(label="Select a task", choices=["Call and Response", "Melodic Reinterpretation"])
+    model_dropdown.change(toggle_options, outputs=task_dropdown)
     t0 = gr.Slider(label="Faithfulness to the input (For melodic reinterpretation task only)", minimum=0.0, maximum=1.0, step=0.01, value=0.3, visible=False)
     task_dropdown.change(toggle_visibility, inputs=task_dropdown, outputs=t0)
     singer_dropdown = gr.Dropdown(label="Select a singer", choices=["Singer 1", "Singer 2"])
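As committed, model_dropdown.change(toggle_options, outputs=task_dropdown) registers no inputs, so the selected model name is not passed to toggle_options. A hedged sketch of the wiring with the selection passed explicitly (standard Gradio event-listener keywords):

    # forward the current model selection so toggle_options can decide
    # which task choices to expose in the task dropdown
    model_dropdown.change(
        toggle_options,
        inputs=model_dropdown,
        outputs=task_dropdown,
    )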
models/diffusion_pitch/{last.ckpt → model.ckpt} RENAMED
File without changes
models/transformer_pitch/config.gin ADDED
@@ -0,0 +1,61 @@
+from __gin__ import dynamic_registration
+from gamadhani import src
+from gamadhani.src import dataset
+from gamadhani.src import model_transformer
+from gamadhani.src import task_functions
+from gamadhani.utils import utils
+import torch.optim
+
+MODEL_DIM = 512
+EMB_DIM = 512
+NUM_TOKENS = 7928
+NUM_QUANTIZERS = 1
+DROPOUT_RATE = 0.3
+NUM_HEADS = 8
+SEQ_LEN = 1200
+HEAD_DIM = 32
+NUM_LAYERS = 8
+LR = 1e-3
+
+model_transformer.XTransformerPrior:
+    num_tokens = %NUM_TOKENS
+    seq_len = %SEQ_LEN
+    model_dim = %MODEL_DIM
+    emb_dim = %EMB_DIM
+    head_dim = %HEAD_DIM
+    num_layers = %NUM_LAYERS
+    num_heads = %NUM_HEADS
+    dropout_rate = %DROPOUT_RATE
+
+
+src.dataset.Task:
+    read_fn = @src.task_functions.pitch_read_downsample
+    invert_fn = @src.task_functions.invert_pitch_read_downsample
+    kwargs = {"seq_len": %SEQ_LEN,
+              "decoder_key": "pitch",
+              "min_norm_pitch": -4915,
+              "time_downsample": 2,
+              "pitch_downsample": 10,
+              "base_tonic": 440.}
+
+src.dataset.SequenceDataset:
+    task = @dataset.Task()
+    apply_transform = False
+
+model_transformer.XTransformerPrior.configure_optimizers:
+    optimizer_cls = @torch.optim.AdamW
+    scheduler_cls = @utils.build_warmed_exponential_lr_scheduler
+
+utils.build_warmed_exponential_lr_scheduler:
+    start_factor = .01
+    peak_iteration = 10000
+    cycle_length = 394600
+    eta_min = 0.1
+    eta_max = %LR
+
+utils.set_seed:
+    seed = 2023
+
+torch.optim.AdamW:
+    lr = %LR
+    betas = (.9, .98)
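The new gin file binds the autoregressive prior's hyperparameters (a 512-dimensional transformer with 8 layers, 8 heads, and a 1200-token context) together with its dataset task and optimizer settings. A hedged sketch of how a gin-config file like this is typically consumed (class and module names are taken from the file above; whether XTransformerPrior() accepts a no-argument call depends on the gamadhani code):

    import gin
    from gamadhani.src import model_transformer  # make the configured class importable

    # bind every value in the config, then let gin inject them into the constructor
    gin.parse_config_file('models/transformer_pitch/config.gin')
    model = model_transformer.XTransformerPrior()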
models/transformer_pitch/model.ckpt ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d849eaca79a0bc390d0550b8187d47a843bdb3a6c81b9401e5e925ae1220acc4
+size 356915980
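The checkpoint itself is stored as a Git LFS pointer: the repository records only the object's SHA-256 and size (about 357 MB), and the binary is fetched at checkout. A small sketch for verifying a downloaded checkpoint against the pointer above, using only the standard library:

    import hashlib

    def sha256_of(path, chunk_size=1 << 20):
        # stream the file so a ~357 MB checkpoint never has to fit in memory at once
        digest = hashlib.sha256()
        with open(path, 'rb') as fh:
            for chunk in iter(lambda: fh.read(chunk_size), b''):
                digest.update(chunk)
        return digest.hexdigest()

    expected = 'd849eaca79a0bc390d0550b8187d47a843bdb3a6c81b9401e5e925ae1220acc4'
    assert sha256_of('models/transformer_pitch/model.ckpt') == expected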
requirements.txt CHANGED
@@ -1,4 +1,4 @@
 crepe==0.0.15
 hmmlearn==0.3.2
 tensorflow==2.17.0
-GaMaDHaNi @ git+https://github.com/snnithya/GaMaDHaNi.git@c59e24e28ef7b80d43f56b39d3a9dd0563e01df6
+GaMaDHaNi @ git+https://github.com/snnithya/GaMaDHaNi.git@055df71380e0feced7e409470ffc8603f1cfa926