In [None]:
try:
 # are we running on Google Colab?
 import google.colab
 !git clone -q https://github.com/teticio/audio-diffusion.git
 %cd audio-diffusion
 !pip install -q -r requirements.txt
except:
 pass

In [None]:
import os
import sys
# Put the parent of the current working directory at the front of the module
# search path (os.path.abspath("") resolves to the current working directory).
# NOTE(review): presumably this makes the local `audiodiffusion` package
# importable when the notebook runs from a sub-folder of the repo -- confirm.
sys.path.insert(0, os.path.dirname(os.path.abspath("")))

In [None]:
import torch
import random
import numpy as np
from datasets import load_dataset
from IPython.display import Audio
from audiodiffusion.mel import Mel
from audiodiffusion import AudioDiffusion

### Select model

In [None]:
#@markdown teticio/audio-diffusion-256 - trained on my Spotify "liked" playlist

#@markdown teticio/audio-diffusion-breaks-256 - trained on samples used in music

#@markdown teticio/audio-diffusion-instrumental-hiphop-256 - trained on instrumental hiphop

# ID of the model to use. The dropdown options must match the IDs listed in the
# descriptions above (the third option was previously misspelled and lacked the
# "teticio/" prefix).
model_id = "teticio/audio-diffusion-256" #@param ["teticio/audio-diffusion-256", "teticio/audio-diffusion-breaks-256", "teticio/audio-diffusion-instrumental-hiphop-256"]

### Run model inference to generate mel spectrograms, audio and loops

In [None]:
# Build the AudioDiffusion object for the model selected in `model_id`; every
# generation cell below goes through this instance.
# NOTE(review): presumably this fetches/loads the pretrained weights -- confirm.
audio_diffusion = AudioDiffusion(model_id=model_id)

In [None]:
# Generate ten samples. One generator is shared across iterations; seed() is
# called on every pass and its return value printed, so a sample can be
# recreated later by feeding that number to manual_seed().
generator = torch.Generator()
for _ in range(10):
    print(f'Seed = {generator.seed()}')
    image, (sample_rate, audio) = \
        audio_diffusion.generate_spectrogram_and_audio(generator)

    # Show the spectrogram image followed by a player for the raw audio.
    display(image)
    display(Audio(audio, rate=sample_rate))

    # loop_it() yields None when it cannot find loop points; report that and
    # move on to the next sample instead of showing a loop player.
    loop = AudioDiffusion.loop_it(audio, sample_rate)
    if loop is None:
        print("Unable to determine loop points")
        continue
    display(Audio(loop, rate=sample_rate))

### Generate variations of audio

Try playing around with `start_steps`. Values closer to zero will produce new samples, while values closer to `steps` will produce samples more faithful to the original.

In [None]:
seed = 16183389798189209330 #@param {type:"integer"}
# Only a generator seeded with `seed` is passed to the call (no audio_file or
# raw_audio argument). The resulting `audio` and `sample_rate` are what the
# variations cell below reads.
image, (sample_rate, audio) = (
    audio_diffusion.generate_spectrogram_and_audio_from_audio(
        generator=torch.Generator().manual_seed(seed)))
# Spectrogram image first, then the audio player.
display(image, Audio(audio, rate=sample_rate))

In [None]:
start_steps = 500 #@param {type:"slider", min:0, max:1000, step:10}


def _loop_or_clip(clip, rate):
    """Return a single loop of `clip`, or `clip` itself when no loop is found.

    AudioDiffusion.loop_it() returns None when it cannot determine loop points
    (the generation cell above checks for exactly that). Passing None on to
    np.concatenate would raise, so the untrimmed clip is used as a fallback.
    """
    looped = AudioDiffusion.loop_it(clip, rate, loops=1)
    if looped is None:
        print("Unable to determine loop points")
        return clip
    return looped


# Start the track with the reference audio, then append twelve variations that
# are each generated from that same reference via `raw_audio=audio`.
segments = [_loop_or_clip(audio, sample_rate)]
for variation in range(12):
    image2, (sample_rate, audio2) = \
        audio_diffusion.generate_spectrogram_and_audio_from_audio(
            raw_audio=audio,
            start_step=start_steps,
            steps=1000)
    display(image2)
    display(Audio(audio2, rate=sample_rate))
    segments.append(_loop_or_clip(audio2, sample_rate))

# Join once at the end instead of re-copying the growing track every iteration.
track = np.concatenate(segments)
display(Audio(track, rate=sample_rate))

### Remix (style transfer)

Alternatively, you can start from another audio altogether, resulting in a kind of style transfer.

In [None]:
start_steps = 500 #@param {type:"slider", min:0, max:1000, step:10}
# NOTE(review): this default is an absolute path on the author's machine; point
# it at an audio file that exists in your environment before running.
audio_file = "/home/teticio/Music/Music/Cesar Mariano And CIA/Gilles Peterson In Brazil_ Da Hora/2-07 Futebol De Bar (Heavy Ãœsker Mix.mp3" #@param {type:"string"}
audio_diffusion.mel.load_audio(audio_file)

# Define the seed BEFORE building the generator from it, so this cell no longer
# depends on a `seed` left over from an earlier cell.
seed = 16183389798189209330  # swap in torch.Generator().seed() for a random seed
generator = torch.Generator().manual_seed(seed)

segments = []
# `slice_index` rather than `slice`, so the builtin `slice` is not shadowed for
# the rest of the kernel session.
for slice_index in range(audio_diffusion.mel.get_number_of_slices()):
    # Reset the generator to the same seed for every slice.
    generator.manual_seed(seed)
    audio = audio_diffusion.mel.get_audio_slice(slice_index)
    _, (sample_rate, audio2) = \
        audio_diffusion.generate_spectrogram_and_audio_from_audio(
            audio_file=audio_file,
            slice=slice_index,
            start_step=start_steps,
            steps=1000,
            generator=generator)
    # Play the original slice, then its remixed counterpart.
    display(Audio(audio, rate=sample_rate))
    display(Audio(audio2, rate=sample_rate))
    segments.append(audio2)

# Concatenate once. The leading empty array reproduces the original
# `np.array([])` starting value, so the result has the same dtype and is still
# an empty array when there are no slices.
track = np.concatenate([np.array([]), *segments])
display(Audio(track, rate=sample_rate))

 0%| | 0/500 [00:00, ?it/s]

 0%| | 0/500 [00:00, ?it/s]

 0%| | 0/500 [00:00, ?it/s]

In [39]:
# Show an audio player for whatever `track` and `sample_rate` currently hold in
# the kernel.
# NOTE(review): this repeats the last line of the remix cell above and carries
# a stale execution count -- confirm whether the cell is still needed.
display(Audio(track, rate=sample_rate))

### Compare results with random sample from training set

In [None]:
# Mel helper built with x_res=256 and y_res=256; used below to turn a dataset
# image back into audio.
# NOTE(review): presumably 256x256 matches the resolution of the "-256"
# models/datasets -- confirm.
mel = Mel(x_res=256, y_res=256)

In [None]:
# Load the dataset whose identifier is the same string as the selected model ID.
# NOTE(review): assumes a dataset is published under the same name as each
# selectable model -- confirm for every dropdown option.
ds = load_dataset(model_id)

In [None]:
# Pick one record at random from the 'train' split and keep its 'image' field.
# No seed is set for `random`, so the pick differs from run to run.
image = random.choice(ds['train'])['image']
# Bare expression on the last line so the notebook renders the image.
image

In [None]:
# Convert the selected image to audio with the Mel helper. This rebinds the
# global `audio` used by earlier cells.
audio = mel.image_to_audio(image)
# Play it back at the sample rate reported by the same Mel helper.
Audio(data=audio, rate=mel.get_sample_rate())