teticio committed on
Commit
a71fd34
1 Parent(s): 1dfb040

added latent diffusion to notebook

Browse files
Files changed (1) hide show
  1. notebooks/test_model.ipynb +118 -4
notebooks/test_model.ipynb CHANGED
@@ -177,13 +177,13 @@
177
  },
178
  "outputs": [],
179
  "source": [
180
- "start_steps = 500 #@param {type:\"slider\", min:0, max:1000, step:10}\n",
181
  "track = AudioDiffusion.loop_it(audio, sample_rate, loops=1)\n",
182
  "for variation in range(12):\n",
183
  " image2, (\n",
184
  " sample_rate,\n",
185
  " audio2) = audio_diffusion.generate_spectrogram_and_audio_from_audio(\n",
186
- " raw_audio=audio, start_step=start_steps)\n",
187
  " display(image2)\n",
188
  " display(Audio(audio2, rate=sample_rate))\n",
189
  " track = np.concatenate(\n",
@@ -490,10 +490,124 @@
490
  "display(Audio(audio, rate=sample_rate))"
491
  ]
492
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
493
  {
494
  "cell_type": "code",
495
  "execution_count": null,
496
- "id": "0b05539f",
497
  "metadata": {},
498
  "outputs": [],
499
  "source": []
@@ -520,7 +634,7 @@
520
  "name": "python",
521
  "nbconvert_exporter": "python",
522
  "pygments_lexer": "ipython3",
523
- "version": "3.8.9 (default, Apr 13 2022, 08:48:06) \n[Clang 13.1.6 (clang-1316.0.21.2.5)]"
524
  },
525
  "toc": {
526
  "base_numbering": 1,
 
177
  },
178
  "outputs": [],
179
  "source": [
180
+ "start_step = 500 #@param {type:\"slider\", min:0, max:1000, step:10}\n",
181
  "track = AudioDiffusion.loop_it(audio, sample_rate, loops=1)\n",
182
  "for variation in range(12):\n",
183
  " image2, (\n",
184
  " sample_rate,\n",
185
  " audio2) = audio_diffusion.generate_spectrogram_and_audio_from_audio(\n",
186
+ " raw_audio=audio, start_step=start_step)\n",
187
  " display(image2)\n",
188
  " display(Audio(audio2, rate=sample_rate))\n",
189
  " track = np.concatenate(\n",
 
490
  "display(Audio(audio, rate=sample_rate))"
491
  ]
492
  },
493
+ {
494
+ "cell_type": "markdown",
495
+ "id": "5b7081f7",
496
+ "metadata": {},
497
+ "source": [
498
+ "## Latent Audio Diffusion\n",
499
+ "Instead of de-noising images directly in the pixel space, we can work in the latent space of a pre-trained VAE (Variational AutoEncoder). This is much faster to train and run inference on, although the quality suffers as there are now three stages involved in encoding / decoding: mel spectrogram, VAE and de-noising."
500
+ ]
501
+ },
502
+ {
503
+ "cell_type": "code",
504
+ "execution_count": null,
505
+ "id": "17610772",
506
+ "metadata": {},
507
+ "outputs": [],
508
+ "source": [
509
+ "model_id = \"teticio/latent-audio-diffusion-ddim-256\" #@param [\"teticio/latent-audio-diffusion-256\", \"teticio/latent-audio-diffusion-ddim-256\"]"
510
+ ]
511
+ },
512
+ {
513
+ "cell_type": "code",
514
+ "execution_count": null,
515
+ "id": "9e6c73e6",
516
+ "metadata": {},
517
+ "outputs": [],
518
+ "source": [
519
+ "audio_diffusion = AudioDiffusion(model_id=model_id)"
520
+ ]
521
+ },
522
+ {
523
+ "cell_type": "code",
524
+ "execution_count": null,
525
+ "id": "d37a03a9",
526
+ "metadata": {},
527
+ "outputs": [],
528
+ "source": [
529
+ "seed = 6015487092443227811 #@param {type:\"integer\"}\n",
530
+ "generator.manual_seed(seed)\n",
531
+ "image, (sample_rate, audio) = audio_diffusion.generate_spectrogram_and_audio(\n",
532
+ " generator=generator)\n",
533
+ "display(image)\n",
534
+ "display(Audio(audio, rate=sample_rate))"
535
+ ]
536
+ },
537
+ {
538
+ "cell_type": "code",
539
+ "execution_count": null,
540
+ "id": "c0328a56",
541
+ "metadata": {},
542
+ "outputs": [],
543
+ "source": [
544
+ "seed2 = 5623685468252603494 #@param {type:\"integer\"}\n",
545
+ "generator.manual_seed(seed2)\n",
546
+ "image2, (sample_rate, audio2) = audio_diffusion.generate_spectrogram_and_audio(\n",
547
+ " generator=generator)\n",
548
+ "display(image2)\n",
549
+ "display(Audio(audio2, rate=sample_rate))"
550
+ ]
551
+ },
552
+ {
553
+ "cell_type": "markdown",
554
+ "id": "bd1f2b58",
555
+ "metadata": {},
556
+ "source": [
557
+ "### Interpolation in latent space\n",
558
+ "As the VAE forces a more compact, lower dimensional representation for the spectrograms, interpolation in latent space can lead to meaningful combinations of audios. In combination with the (deterministic) DDIM from the previous section, the model can be used as an encoder / decoder to a lower dimensional space."
559
+ ]
560
+ },
561
+ {
562
+ "cell_type": "code",
563
+ "execution_count": null,
564
+ "id": "23ff0ee7",
565
+ "metadata": {},
566
+ "outputs": [],
567
+ "source": [
568
+ "generator.manual_seed(seed)\n",
569
+ "noise = torch.randn((1, audio_diffusion.pipe.unet.in_channels,\n",
570
+ " audio_diffusion.pipe.unet.sample_size[0],\n",
571
+ " audio_diffusion.pipe.unet.sample_size[1]),\n",
572
+ " generator=generator)\n",
573
+ "noise.shape"
574
+ ]
575
+ },
576
+ {
577
+ "cell_type": "code",
578
+ "execution_count": null,
579
+ "id": "ff13a2cb",
580
+ "metadata": {},
581
+ "outputs": [],
582
+ "source": [
583
+ "generator.manual_seed(seed2)\n",
584
+ "noise2 = torch.randn((1, audio_diffusion.pipe.unet.in_channels,\n",
585
+ " audio_diffusion.pipe.unet.sample_size[0],\n",
586
+ " audio_diffusion.pipe.unet.sample_size[1]),\n",
587
+ " generator=generator)\n",
588
+ "noise2.shape"
589
+ ]
590
+ },
591
+ {
592
+ "cell_type": "code",
593
+ "execution_count": null,
594
+ "id": "bea26a5e",
595
+ "metadata": {},
596
+ "outputs": [],
597
+ "source": [
598
+ "alpha = 0.5 #@param {type:\"slider\", min:0, max:1, step:0.1}\n",
599
+ "_, (sample_rate, audio3) = audio_diffusion.generate_spectrogram_and_audio(\n",
600
+ " noise=audio_diffusion.pipe.slerp(noise, noise2, alpha),\n",
601
+ " generator=generator)\n",
602
+ "display(Audio(audio, rate=mel.get_sample_rate()))\n",
603
+ "display(Audio(audio2, rate=mel.get_sample_rate()))\n",
604
+ "display(Audio(audio3, rate=sample_rate))"
605
+ ]
606
+ },
607
  {
608
  "cell_type": "code",
609
  "execution_count": null,
610
+ "id": "60080eed",
611
  "metadata": {},
612
  "outputs": [],
613
  "source": []
 
634
  "name": "python",
635
  "nbconvert_exporter": "python",
636
  "pygments_lexer": "ipython3",
637
+ "version": "3.10.6"
638
  },
639
  "toc": {
640
  "base_numbering": 1,