Imagen-POP-Music-Medley-Diffusion-Transformer

Sleeping

App Files Community

asigalov61 committed on Sep 2

Commit

5837401

•

1 Parent(s): 5a9b440

Update app.py

Browse files

Files changed (1) hide show

app.py +67 -92

app.py CHANGED Viewed

@@ -23,129 +23,104 @@ import TMIDIX
 # =================================================================================================
 @spaces.GPU
-def GenerateSong(input_melody_seed_number):
     print('=' * 70)
     print('Req start time: {:%Y-%m-%d %H:%M:%S}'.format(datetime.datetime.now(PDT)))
     start_time = reqtime.time()
     print('Loading model...')
-    SEQ_LEN = 2560
-    PAD_IDX = 514
-    DEVICE = 'cuda' # 'cuda'
-    # instantiate the model
-    model = TransformerWrapper(
-        num_tokens = PAD_IDX+1,
-        max_seq_len = SEQ_LEN,
-        attn_layers = Decoder(dim = 1024, depth = 24, heads = 16, attn_flash = True)
-        )
-    model = AutoregressiveWrapper(model, ignore_index = PAD_IDX)
-    model.to(DEVICE)
     print('=' * 70)
     print('Loading model checkpoint...')
-    model.load_state_dict(
-        torch.load('Melody2Song_Seq2Seq_Music_Transformer_Trained_Model_28482_steps_0.719_loss_0.7865_acc.pth',
-                   map_location=DEVICE))
-    print('=' * 70)
-    model.eval()
-    if DEVICE == 'cpu':
-        dtype = torch.bfloat16
-    else:
-        dtype = torch.bfloat16
-    ctx = torch.amp.autocast(device_type=DEVICE, dtype=dtype)
     print('Done!')
     print('=' * 70)
-    seed_melody = seed_melodies_data[input_melody_seed_number]
-    print('Input melody seed number:', input_melody_seed_number)
-    print('-' * 70)
-    #==================================================================
-    print('=' * 70)
-    print('Sample output events', seed_melody[:16])
     print('=' * 70)
     print('Generating...')
-    x = (torch.tensor(seed_melody, dtype=torch.long, device='cuda')[None, ...])
-    with ctx:
-        with torch.inference_mode():
-            out = model.generate(x,
-                                1024,
-                                temperature=0.9,
-                                return_prime=False,
-                                verbose=False)
-    output = out[0].tolist()
-    print('=' * 70)
     print('Done!')
     print('=' * 70)
     #===============================================================================
-    print('Rendering results...')
-    print('=' * 70)
-    print('Sample INTs', output[:15])
-    print('=' * 70)
-    out1 = output
-    if len(out1) != 0:
-        song = out1
-        song_f = []
-        time = 0
-        dur = 0
-        vel = 90
-        pitch = 0
-        channel = 0
-        patches = [0] * 16
-        patches[3] = 40
-        for ss in song:
-            if 0 < ss < 128:
-                time += (ss * 32)
-            if 128 < ss < 256:
-                dur = (ss-128) * 32
-            if 256 < ss < 512:
-                pitch = (ss-256) % 128
-                channel = (ss-256) // 128
-                if channel == 1:
-                    channel = 3
-                    vel = 110 + (pitch % 12)
-                    song_f.append(['note', time, dur, channel, pitch, vel, 40])
-                else:
-                    vel = 80 + (pitch % 12)
-                    channel = 0
-                    song_f.append(['note', time, dur, channel, pitch, vel, 0])
-    fn1 = "Melody2Song-Seq2Seq-Music-Transformer-Composition"
     detailed_stats = TMIDIX.Tegridy_ms_SONG_to_MIDI_Converter(song_f,
-                                                              output_signature = 'Melody2Song Seq2Seq Music Transformer',
                                                               output_file_name = fn1,
                                                               track_name='Project Los Angeles',
                                                               list_of_MIDI_patches=patches
@@ -223,7 +198,7 @@ if __name__ == "__main__":
         output_plot = gr.Plot(label="Output MIDI score plot")
         output_midi = gr.File(label="Output MIDI file", file_types=[".mid"])
-        run_event = run_btn.click(GenerateSong, [input_melody_seed_number],
                                   [output_midi_title, output_midi_summary, output_midi, output_audio, output_plot])
         app.queue().launch()

 # =================================================================================================
 @spaces.GPU
+def Generate_POP_Medley(input_num_medley_comps):
     print('=' * 70)
     print('Req start time: {:%Y-%m-%d %H:%M:%S}'.format(datetime.datetime.now(PDT)))
     start_time = reqtime.time()
+    print('=' * 70)
     print('Loading model...')
+    DIM = 64
+    CHANS = 1
+    TSTEPS = 1000
+    DEVICE = 'cuda' # 'cpu'
+    unet = Unet(
+        dim = DIM,
+        dim_mults = (1, 2, 4, 8),
+        num_resnet_blocks = 1,
+        channels=CHANS,
+        layer_attns = (False, False, False, True),
+        layer_cross_attns = False
+    )
+    imagen = Imagen(
+        condition_on_text = False,  # this must be set to False for unconditional Imagen
+        unets = unet,
+        channels=CHANS,
+        image_sizes = 128,
+        timesteps = TSTEPS
+    )
+    trainer = ImagenTrainer(
+        imagen = imagen,
+        split_valid_from_train = True # whether to split the validation dataset from the training
+    ).to(DEVICE)
     print('=' * 70)
     print('Loading model checkpoint...')
+    trainer.load('Imagen_POP909_64_dim_12638_steps_0.00983_loss.ckptt')
     print('Done!')
     print('=' * 70)
+    print('Req number of medley compositions:', input_num_medley_comps)
     print('=' * 70)
     print('Generating...')
+    images = trainer.sample(batch_size = input_num_medley_comps, return_pil_images = True)
+    threshold = 128
+    imgs_array = []
+    for i in images:
+      arr = np.array(i)
+      farr = np.where(arr < threshold, 0, 1)
+      imgs_array.append(farr)
     print('Done!')
     print('=' * 70)
     #===============================================================================
+    print('Converting images to scores...')
+    medley_compositions_escores = []
+    for i in imgs_array:
+        bmatrix = TPLOTS.images_to_binary_matrix([i])
+        score = TMIDIX.binary_matrix_to_original_escore_notes(bmatrix)
+        medley_compositions_escores.append(score)
+    print('Done!')
+    print('=' * 70)
+    print('Creating medley score...')
+    medley_labels = ['Composition #' + str(i+1) for i in range(len(medley_compositions_escores))]
+    medley_escore = TMIDIX.escore_notes_medley(medley_compositions_escores, medley_labels)
+    #===============================================================================
+    print('Rendering results...')
+    print('=' * 70)
+    print('Sample INTs', medley_escore[:15])
+    print('=' * 70)
+    fn1 = "Imagen-POP-Music-Medley-Diffusion-Transformer-Composition"
     detailed_stats = TMIDIX.Tegridy_ms_SONG_to_MIDI_Converter(song_f,
+                                                              output_signature = 'Imagen POP Music Medley',
                                                               output_file_name = fn1,
                                                               track_name='Project Los Angeles',
                                                               list_of_MIDI_patches=patches
         output_plot = gr.Plot(label="Output MIDI score plot")
         output_midi = gr.File(label="Output MIDI file", file_types=[".mid"])
+        run_event = run_btn.click(Generate_POP_Medley, [input_num_medley_comps],
                                   [output_midi_title, output_midi_summary, output_midi, output_audio, output_plot])
         app.queue().launch()