{
"cells": [
{
"cell_type": "markdown",
"id": "62c5865f",
"metadata": {},
"source": [
""
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "6c7800a6",
"metadata": {},
"outputs": [],
"source": [
"try:\n",
"    # Are we running on Google Colab? The import below fails elsewhere.\n",
"    import google.colab\n",
"    # On Colab, fetch the repository and install its requirements.\n",
"    !git clone -q https://github.com/teticio/audio-diffusion.git\n",
"    %cd audio-diffusion\n",
"    !pip install -q -r requirements.txt\n",
"except ImportError:\n",
"    # Not on Colab: nothing to set up. Only the failed import is silenced, so\n",
"    # any other error in this cell is still reported.\n",
"    pass"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "b447e2c4",
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"import sys\n",
"\n",
"# Put the parent of the current working directory at the front of the module\n",
"# search path (`os.path.abspath(\"\")` resolves to the working directory).\n",
"working_dir = os.path.abspath(\"\")\n",
"sys.path.insert(0, os.path.dirname(working_dir))"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "c2fc0e7a",
"metadata": {},
"outputs": [],
"source": [
"import torch\n",
"import random\n",
"import numpy as np\n",
"from datasets import load_dataset\n",
"from IPython.display import Audio\n",
"from audiodiffusion.mel import Mel\n",
"from audiodiffusion import AudioDiffusion"
]
},
{
"cell_type": "markdown",
"id": "7fd945bb",
"metadata": {},
"source": [
"### Select model"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "97f24046",
"metadata": {},
"outputs": [],
"source": [
"#@markdown teticio/audio-diffusion-256 - trained on my Spotify \"liked\" playlist\n",
"\n",
"#@markdown teticio/audio-diffusion-breaks-256 - trained on samples used in music\n",
"\n",
"#@markdown teticio/audio-diffusion-instrumental-hiphop-256 - trained on instrumental hiphop\n",
"\n",
"# Identifier of the pretrained model used by the cells below. Every dropdown\n",
"# option must be one of the full ids listed above.\n",
"model_id = \"teticio/audio-diffusion-256\" #@param [\"teticio/audio-diffusion-256\", \"teticio/audio-diffusion-breaks-256\", \"teticio/audio-diffusion-instrumental-hiphop-256\"]"
]
},
{
"cell_type": "markdown",
"id": "011fb5a1",
"metadata": {},
"source": [
"### Run model inference to generate mel spectrograms, audio and loops"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "a3d45c36",
"metadata": {},
"outputs": [],
"source": [
"# Build the AudioDiffusion helper for the model chosen in the \"Select model\" cell.\n",
"audio_diffusion = AudioDiffusion(\n",
"    model_id=model_id,\n",
")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "b809fed5",
"metadata": {},
"outputs": [],
"source": [
"generator = torch.Generator()\n",
"for _ in range(10):\n",
"    # `seed()` re-seeds the generator and returns the new seed; printing it\n",
"    # lets a sample be regenerated later from its seed.\n",
"    print(f'Seed = {generator.seed()}')\n",
"    # Pass the generator by keyword, as the other cells do, so the call does\n",
"    # not depend on where `generator` sits in the signature.\n",
"    image, (sample_rate, audio) = audio_diffusion.generate_spectrogram_and_audio(\n",
"        generator=generator)\n",
"    display(image)\n",
"    display(Audio(audio, rate=sample_rate))\n",
"    # `loop_it` returns None when no loop can be found.\n",
"    loop = AudioDiffusion.loop_it(audio, sample_rate)\n",
"    if loop is not None:\n",
"        display(Audio(loop, rate=sample_rate))\n",
"    else:\n",
"        print(\"Unable to determine loop points\")"
]
},
{
"cell_type": "markdown",
"id": "0bb03e33",
"metadata": {},
"source": [
"### Generate variations of audio"
]
},
{
"cell_type": "markdown",
"id": "80e5b5fa",
"metadata": {},
"source": [
"Try playing around with `start_steps`. Values closer to zero will produce new samples, while values closer to `steps` will produce samples more faithful to the original."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "a7e637e5",
"metadata": {},
"outputs": [],
"source": [
"seed = 16183389798189209330  #@param {type:\"integer\"}\n",
"\n",
"# Use a fresh generator seeded with the fixed value above.\n",
"seeded_generator = torch.Generator().manual_seed(seed)\n",
"image, (sample_rate, audio) = (\n",
"    audio_diffusion.generate_spectrogram_and_audio_from_audio(\n",
"        generator=seeded_generator))\n",
"display(image)\n",
"display(Audio(audio, rate=sample_rate))"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "a0fefe28",
"metadata": {
"scrolled": false
},
"outputs": [],
"source": [
"start_steps = 500 #@param {type:\"slider\", min:0, max:1000, step:10}\n",
"\n",
"# `AudioDiffusion.loop_it` can return None (the generation cell checks for\n",
"# this), and `np.concatenate` cannot take None, so fall back to the unlooped\n",
"# audio whenever no loop is found.\n",
"track = AudioDiffusion.loop_it(audio, sample_rate, loops=1)\n",
"if track is None:\n",
"    track = audio\n",
"for variation in range(12):\n",
"    # Each variation passes the same `audio` as `raw_audio`, with\n",
"    # `start_step=start_steps` out of `steps=1000`.\n",
"    image2, (sample_rate, audio2) = (\n",
"        audio_diffusion.generate_spectrogram_and_audio_from_audio(\n",
"            raw_audio=audio,\n",
"            start_step=start_steps,\n",
"            steps=1000))\n",
"    display(image2)\n",
"    display(Audio(audio2, rate=sample_rate))\n",
"    loop2 = AudioDiffusion.loop_it(audio2, sample_rate, loops=1)\n",
"    track = np.concatenate([track, audio2 if loop2 is None else loop2])\n",
"# Play the starting sample followed by all twelve variations as one track.\n",
"display(Audio(track, rate=sample_rate))"
]
},
{
"cell_type": "markdown",
"id": "b6434d3f",
"metadata": {},
"source": [
"### Remix (style transfer)"
]
},
{
"cell_type": "markdown",
"id": "0da030b2",
"metadata": {},
"source": [
"Alternatively, you can start from another audio altogether, resulting in a kind of style transfer."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "5a257e69",
"metadata": {
"scrolled": true
},
"outputs": [
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "06e31541e8284faa9c71dafb5b686574",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
" 0%| | 0/500 [00:00, ?it/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"\n",
" \n",
" "
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"\n",
" \n",
" "
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "b9e0e92b62404876b1600c7f01408503",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
" 0%| | 0/500 [00:00, ?it/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"\n",
" \n",
" "
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"\n",
" \n",
" "
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "116c3bc7b4c2449fa220e93088a26d6f",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
" 0%| | 0/500 [00:00, ?it/s]"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"start_steps = 500 #@param {type:\"slider\", min:0, max:1000, step:10}\n",
"audio_file = \"/home/teticio/Music/Music/Cesar Mariano And CIA/Gilles Peterson In Brazil_ Da Hora/2-07 Futebol De Bar (Heavy Ãœsker Mix.mp3\" #@param {type:\"string\"}\n",
"audio_diffusion.mel.load_audio(audio_file)\n",
"track = np.array([])\n",
"# Create the generator and the seed before anything reads them, so this cell\n",
"# does not rely on a `seed` left behind by an earlier cell.\n",
"generator = torch.Generator()\n",
"seed = 16183389798189209330  # or generator.seed() for a new random seed\n",
"# `slice_index` avoids shadowing the built-in `slice`.\n",
"for slice_index in range(audio_diffusion.mel.get_number_of_slices()):\n",
"    # Re-seed before every slice so each one is generated from the same seed.\n",
"    generator.manual_seed(seed)\n",
"    # Kept under its own name so the `audio` sample produced by the earlier\n",
"    # cells is not overwritten.\n",
"    source_audio = audio_diffusion.mel.get_audio_slice(slice_index)\n",
"    _, (sample_rate, audio2) = (\n",
"        audio_diffusion.generate_spectrogram_and_audio_from_audio(\n",
"            audio_file=audio_file,\n",
"            slice=slice_index,\n",
"            start_step=start_steps,\n",
"            steps=1000,\n",
"            generator=generator))\n",
"    # Show the source slice first, then its remixed counterpart.\n",
"    display(Audio(source_audio, rate=sample_rate))\n",
"    display(Audio(audio2, rate=sample_rate))\n",
"    track = np.concatenate([track, audio2])\n",
"display(Audio(track, rate=sample_rate))"
]
},
{
"cell_type": "code",
"execution_count": 39,
"id": "90457786",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
" \n",
" "
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# Show the player for the assembled `track` again.\n",
"remix_player = Audio(track, rate=sample_rate)\n",
"display(remix_player)"
]
},
{
"cell_type": "markdown",
"id": "ef54cef3",
"metadata": {},
"source": [
"### Compare results with random sample from training set"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "f028a3c8",
"metadata": {},
"outputs": [],
"source": [
"# Mel helper with x_res = y_res = 256 (presumably the spectrogram resolution\n",
"# of the \"-256\" models - confirm against the Mel class).\n",
"mel = Mel(\n",
"    x_res=256,\n",
"    y_res=256,\n",
")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "269ee816",
"metadata": {},
"outputs": [],
"source": [
"# Load the dataset whose identifier is the same as the selected model's.\n",
"ds = load_dataset(path=model_id)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "b9023846",
"metadata": {},
"outputs": [],
"source": [
"# Draw one random example from the training split and show its image.\n",
"train_split = ds['train']\n",
"image = random.choice(train_split)['image']\n",
"image"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "492e2334",
"metadata": {},
"outputs": [],
"source": [
"# Convert the sampled image to audio and play it at the Mel helper's sample rate.\n",
"audio = mel.image_to_audio(image)\n",
"playback_rate = mel.get_sample_rate()\n",
"Audio(data=audio, rate=playback_rate)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "c59bcc0f",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"accelerator": "GPU",
"colab": {
"provenance": []
},
"gpuClass": "standard",
"kernelspec": {
"display_name": "huggingface",
"language": "python",
"name": "huggingface"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.4"
},
"toc": {
"base_numbering": 1,
"nav_menu": {},
"number_sections": true,
"sideBar": true,
"skip_h1_title": false,
"title_cell": "Table of Contents",
"title_sidebar": "Contents",
"toc_cell": false,
"toc_position": {},
"toc_section_display": true,
"toc_window_display": false
}
},
"nbformat": 4,
"nbformat_minor": 5
}