erl-j committed on
Commit b362624 · 0 Parent(s):

first commit
.gitattributes ADDED
@@ -0,0 +1,37 @@
+ *.7z filter=lfs diff=lfs merge=lfs -text
+ *.arrow filter=lfs diff=lfs merge=lfs -text
+ *.bin filter=lfs diff=lfs merge=lfs -text
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
+ *.ftz filter=lfs diff=lfs merge=lfs -text
+ *.gz filter=lfs diff=lfs merge=lfs -text
+ *.h5 filter=lfs diff=lfs merge=lfs -text
+ *.joblib filter=lfs diff=lfs merge=lfs -text
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
+ *.model filter=lfs diff=lfs merge=lfs -text
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
+ *.npy filter=lfs diff=lfs merge=lfs -text
+ *.npz filter=lfs diff=lfs merge=lfs -text
+ *.onnx filter=lfs diff=lfs merge=lfs -text
+ *.ot filter=lfs diff=lfs merge=lfs -text
+ *.parquet filter=lfs diff=lfs merge=lfs -text
+ *.pb filter=lfs diff=lfs merge=lfs -text
+ *.pickle filter=lfs diff=lfs merge=lfs -text
+ *.pkl filter=lfs diff=lfs merge=lfs -text
+ *.pt filter=lfs diff=lfs merge=lfs -text
+ *.pth filter=lfs diff=lfs merge=lfs -text
+ *.rar filter=lfs diff=lfs merge=lfs -text
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
+ *.tar filter=lfs diff=lfs merge=lfs -text
+ *.tflite filter=lfs diff=lfs merge=lfs -text
+ *.tgz filter=lfs diff=lfs merge=lfs -text
+ *.wasm filter=lfs diff=lfs merge=lfs -text
+ *.xz filter=lfs diff=lfs merge=lfs -text
+ *.zip filter=lfs diff=lfs merge=lfs -text
+ *.zst filter=lfs diff=lfs merge=lfs -text
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
+ # everything in assets is large
+ assets/**/* filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,4 @@
+ .DS_Store
+ */.DS_Store
+ __pycache__/
+ output/
README.md ADDED
@@ -0,0 +1,12 @@
+ ---
+ title: soundfont-generator
+ emoji: 🚀
+ colorFrom: purple
+ colorTo: blue
+ sdk: gradio
+ sdk_version: 5.8.0
+ app_file: app.py
+ pinned: false
+ ---
+
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py ADDED
@@ -0,0 +1,200 @@
+ import torch
+ import einops
+ import gradio as gr
+ import datetime
+ import numpy as np
+ import spaces
+ import soundfile
+ import os
+ import sys
+ import zipfile
+ from pathlib import Path
+ from huggingface_hub import hf_hub_download
+
+ sys.path.append("sf-creator-fork")
+ from main import sfz, decentsampler
+
+ # Download models from the Hugging Face Hub
+ decoder_path = hf_hub_download("erl-j/soundfont-generator-assets", "decoder.pt")
+ model_path = hf_hub_download("erl-j/soundfont-generator-assets", "synth_lfm_modern_bfloat16.pt")
+
+ # Load models once at startup
+ device = "cuda"
+ decoder = torch.load(decoder_path, map_location=device).half().eval()
+ model = torch.load(model_path, map_location=device).half().eval()
+
+ @spaces.GPU
+ def generate_and_export_soundfont(text, steps=20, instrument_name=None):
+     sample_start = datetime.datetime.now()
+
+     # Generate latents from the text prompt, then decode them to audio
+     z = model.sample(1, text=[text], steps=steps)
+     z_reshaped = einops.rearrange(z, "b t c d -> (b c) d t")
+
+     with torch.no_grad():
+         audio = decoder.decode(z_reshaped)
+
+     audio_output = einops.rearrange(audio, "b c t -> c (b t)").cpu().numpy()
+     audio_output = audio_output / np.max(np.abs(audio_output))
+
+     # Export individual WAV files
+     export_audio = audio.cpu().numpy().astype(np.float32)
+     output_dir = "output"
+     os.makedirs(output_dir, exist_ok=True)
+
+     # Derive an instrument name from the prompt if none is provided
+     if not instrument_name:
+         instrument_name = text.replace(" ", "_")[:20]
+
+     # Save one WAV file per generated pitch
+     pitches = [
+         "C1", "F#1", "C2", "F#2", "C3", "F#3", "C4", "F#4",
+         "C5", "F#5", "C6", "F#6", "C7", "F#7", "C8",
+     ]
+     wav_files = []
+     for i in range(audio.shape[0]):
+         wav_path = f"{output_dir}/{pitches[i]}.wav"
+         soundfile.write(wav_path, export_audio[i].T, 44100)
+         wav_files.append(wav_path)
+
+     # Generate the SFZ file
+     sfz(
+         directory=output_dir,
+         lowkey="21",
+         highkey="108",
+         instrument=instrument_name,
+         loopmode="no_loop",
+         polyphony=None,
+     )
+
+     # Zip the SFZ and WAV files into a complete soundfont package
+     zip_path = f"{output_dir}/{instrument_name}_package.zip"
+     with zipfile.ZipFile(zip_path, "w") as zipf:
+         # Add the SFZ file
+         sfz_file = f"{output_dir}/{instrument_name}.sfz"
+         zipf.write(sfz_file, os.path.basename(sfz_file))
+         # Add all WAV files
+         for wav_file in wav_files:
+             if os.path.exists(wav_file):
+                 zipf.write(wav_file, os.path.basename(wav_file))
+
+     total_time = (datetime.datetime.now() - sample_start).total_seconds()
+
+     return (
+         (44100, audio_output.T),
+         f"Generation took {total_time:.2f}s\nFiles saved in {output_dir}",
+         zip_path,
+         wav_files,
+     )
+
+ custom_js = open("custom.js").read()
+ custom_css = open("custom.css").read()
+
+ demo = gr.Blocks(title="Erl-j's sound font generator", js=custom_js, css=custom_css)
+
+ with demo:
+     gr.Markdown("""
+     # Erl-j's Soundfont Generator
+     Generate soundfonts from text descriptions using latent flow matching, then download the complete SFZ soundfont package to use the instrument locally.
+     ## Instructions
+     1. Enter a text prompt describing the sound you want to generate.
+     2. Adjust the number of generation steps to trade off quality against speed (roughly).
+     3. Click the "Generate Soundfont" button to generate the audio and soundfont.
+     4. Preview the generated instrument with the keyboard.
+     5. Export the soundfont by clicking the "Download SFZ Soundfont Package" button. You can then use the soundfont in an SFZ-compatible VST such as [Sforzando](https://www.plogue.com/products/sforzando/).
+     """)
+
+     with gr.Row():
+         steps = gr.Slider(
+             minimum=1, maximum=50, value=20, step=1, label="Generation steps"
+         )
+
+     with gr.Row():
+         text_input = gr.Textbox(
+             label="Prompt",
+             placeholder="Enter text description (e.g. 'hard bass', 'sparkly bells')",
+             lines=2,
+         )
+
+     with gr.Row():
+         generate_btn = gr.Button("Generate Soundfont", variant="primary")
+
+     with gr.Row():
+         audio_output = gr.Audio(label="Generated Audio Preview", visible=False)
+         status_output = gr.Textbox(label="Status", lines=2, visible=False)
+
+     with gr.Row():
+         wav_files = gr.File(label="Individual WAV Files", file_count="multiple", visible=False, elem_id="individual-wav-files")
+
+     html = """
+     <div id="custom-player"
+         style="width: 100%; height: 600px; border: 1px solid #f8f9fa; border-radius: 5px; margin-top: 10px;"
+     ></div>
+     """
+
+     gr.HTML(html, min_height=800, max_height=800)
+     with gr.Row():
+         sf = gr.File(label="Download SFZ Soundfont Package", type="filepath", visible=True, elem_id="sfz")
+
+     gr.Markdown("""
+     # About
+     The model is a modified version of [stable audio open](https://huggingface.co/stabilityai/stable-audio-open-1.0).
+
+     Unlike the original model, this version uses latent flow matching rather than latent diffusion.
+     Additionally, the pitches are stacked along a channel dimension rather than concatenated along the time dimension, which allows for faster generation.
+
+     Soundfont export code is based on the [sf-creator](https://github.com/paulwellnerbou/sf-creator) project.
+
+     Similar work by Nercessian and Imort: [InstrumentGen](https://instrumentgen.netlify.app/).
+
+     Thank you @carlthome for coming up with the name.
+
+     To cite this work, please use the following BibTeX entry:
+     ```bibtex
+     @misc{erl-j-soundfont-generator,
+         title={Erl-j's Soundfont Generator},
+         author={Nicolas Jonason},
+         year={2024},
+         publisher={Huggingface},
+     }
+     ```
+     """)
+
+     generate_btn.click(
+         fn=generate_and_export_soundfont,
+         inputs=[text_input, steps],
+         outputs=[audio_output, status_output, sf, wav_files],
+     ).success(js="() => console.log('Success')")
+
+     text_input.submit(
+         fn=generate_and_export_soundfont,
+         inputs=[text_input, steps],
+         outputs=[audio_output, status_output, sf, wav_files],
+     )
+
+ if __name__ == "__main__":
+     print("Starting demo...")
+     demo.launch()
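
The "Generation steps" slider above maps directly onto the number of Euler integration steps used by the flow-matching sampler described in the About section. As a rough sketch of why fewer steps are faster (a simplified stand-in, not the repository's exact model code; `velocity_field` is a hypothetical placeholder for the model's forward pass):

```python
import torch

@torch.no_grad()
def euler_sample(velocity_field, shape, steps=20, device="cuda"):
    # Integrate dz/dt = v(z, t) from t=0 (noise) to t=1 (data).
    # Each step costs one forward pass, hence the quality/speed tradeoff.
    zt = torch.randn(shape, device=device)  # z0 ~ N(0, I)
    for step in range(steps):
        t = torch.full((shape[0],), step / steps, device=device)
        zt = zt + (1.0 / steps) * velocity_field(zt, t)
    return zt
```

This mirrors the sampling loop in `FlowMatchingModule.sample` in `train_lfm.py` below.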
custom.css ADDED
@@ -0,0 +1,203 @@
+ @import url('https://fonts.googleapis.com/css2?family=Roboto:wght@400;500&display=swap');
+
+ .keyboard-container {
+     width: 100%;
+     padding: 1.5rem;
+     background: #fafafa;
+     border: 1px solid #e5e5e5;
+     border-radius: 4px;
+     font-family: 'Roboto', sans-serif;
+     user-select: none;
+ }
+
+ .keyboard-row {
+     display: flex;
+     gap: 0.25rem;
+     margin-bottom: 0.25rem;
+     width: 100%;
+ }
+
+ .key {
+     width: calc((100% - 2.75rem) / 12);
+     aspect-ratio: 1;
+     min-width: 40px;
+     flex: none;
+     border: 1px solid #e5e5e5;
+     border-radius: 4px;
+     display: flex;
+     flex-direction: column;
+     align-items: center;
+     justify-content: center;
+     cursor: pointer;
+     background: white;
+     transition: all 0.2s cubic-bezier(0.4, 0, 0.2, 1);
+     user-select: none;
+     padding: 0.5rem;
+ }
+
+ .key:hover {
+     background: #f5f5f5;
+     transform: translateY(-1px);
+ }
+
+ .key:active {
+     transform: translateY(0);
+ }
+
+ .key-label {
+     font-size: 0.875rem;
+     font-weight: 500;
+     color: #333;
+     user-select: none;
+ }
+
+ .note-label {
+     font-size: 0.75rem;
+     color: #666;
+     margin-top: 0.25rem;
+     user-select: none;
+ }
+
+ .controls {
+     display: flex;
+     flex-wrap: wrap;
+     gap: 1rem;
+     margin-bottom: 1.5rem;
+ }
+
+ .effects-controls {
+     display: grid;
+     grid-template-columns: repeat(auto-fit, minmax(180px, 1fr));
+     gap: 1rem;
+     margin-bottom: 1.5rem;
+     padding: 1rem;
+     background: #fafafa;
+     border-radius: 4px;
+ }
+
+ .control-group {
+     display: flex;
+     flex-direction: column;
+     gap: 0.5rem;
+ }
+
+ .control-group label {
+     font-size: 0.75rem;
+     color: #666;
+     text-transform: uppercase;
+     letter-spacing: 0.05em;
+     user-select: none;
+ }
+
+ input[type="range"] {
+     width: 100%;
+     height: 24px; /* Increased height for better touch target */
+     background: transparent; /* Remove default background */
+     border-radius: 2px;
+     appearance: none;
+     cursor: pointer;
+     margin: 0;
+     padding: 10px 0; /* Add padding for better touch area */
+ }
+
+ input[type="range"]::-webkit-slider-thumb {
+     appearance: none;
+     width: 24px; /* Increased size */
+     height: 24px; /* Increased size */
+     background: #000000;
+     border-radius: 50%;
+     cursor: pointer;
+     transition: background 0.2s;
+     margin-top: -10px; /* Center thumb vertically */
+     user-select: none;
+ }
+
+ input[type="range"]::-webkit-slider-runnable-track {
+     background: #393a39;
+     height: 4px;
+     border-radius: 2px;
+     user-select: none;
+ }
+
+ select {
+     padding: 0.5rem;
+     border-radius: 4px;
+     border: 1px solid #e5e5e5;
+     background: white;
+     font-family: 'Roboto', sans-serif;
+     font-size: 0.875rem;
+     user-select: none;
+     cursor: pointer;
+ }
+
+ .button-group {
+     display: flex;
+     align-items: center;
+     gap: 0.5rem;
+ }
+
+ .button-group button {
+     width: 2rem;
+     height: 2rem;
+     padding: 0;
+     display: flex;
+     align-items: center;
+     justify-content: center;
+     border: 1px solid #e5e5e5;
+     border-radius: 4px;
+     background: white;
+     cursor: pointer;
+     transition: all 0.2s;
+     user-select: none;
+ }
+
+ .button-group button:hover {
+     background: #f5f5f5;
+ }
+
+ button {
+     padding: 0.5rem 1rem;
+     border: none;
+     border-radius: 4px;
+     background: #015131;
+     color: white;
+     font-family: 'Roboto', sans-serif;
+     font-size: 0.875rem;
+     cursor: pointer;
+     transition: all 0.2s;
+     user-select: none;
+ }
+
+ button:hover {
+     background: #002114;
+ }
+
+ body {
+     font-family: 'Roboto', sans-serif;
+     font-size: 1rem;
+     line-height: 1.5;
+     color: #333;
+     background: #f5f5f5;
+     margin: 0;
+     padding: 0;
+     display: flex;
+     justify-content: center;
+     align-items: center;
+     min-height: 100vh;
+ }
+
+ @media (max-width: 768px) {
+     .control-group { min-width: 100%; }
+     .key { min-width: 35px; }
+     .key-label { font-size: 0.75rem; }
+
+     input[type="range"] {
+         height: 32px; /* Even larger touch target on mobile */
+         padding: 14px 0;
+     }
+
+     input[type="range"]::-webkit-slider-thumb {
+         width: 28px; /* Larger thumb on mobile */
+         height: 28px;
+     }
+ }
custom.js ADDED
@@ -0,0 +1,355 @@
+ function previewPlayer() {
+     class KeyboardPlayer {
+         constructor(containerId) {
+             this.container = document.getElementById(containerId);
+             this.initializeProperties();
+             this.loadToneJS().then(() => this.init());
+             this.setupWavFileObserver();
+
+             // Add click handlers for activation/deactivation
+             this.container.addEventListener('click', (e) => {
+                 e.stopPropagation();
+                 if (!this.keyboardEnabled) {
+                     this.enableKeyboard();
+                 }
+             });
+
+             document.addEventListener('click', (e) => {
+                 if (!this.container.contains(e.target)) {
+                     this.disableKeyboard();
+                 }
+             });
+
+             // Start with the keyboard disabled
+             this.disableKeyboard();
+         }
+
+         enableKeyboard() {
+             this.keyboardEnabled = true;
+             this.container.style.opacity = '1';
+         }
+
+         disableKeyboard() {
+             this.keyboardEnabled = false;
+             this.container.style.opacity = '0.5';
+         }
+
+         setupWavFileObserver() {
+             const observer = new MutationObserver((mutations) => {
+                 const hasDownloadLinkChanges = mutations.some(mutation =>
+                     mutation.type === 'childList' &&
+                     mutation.target.classList.contains('download-link')
+                 );
+
+                 if (hasDownloadLinkChanges) {
+                     this.initializeSampler();
+                     this.enableKeyboard();
+                     // Scroll so the middle of the keyboard is in the centre of the viewport
+                     const keyboardTop = this.container.querySelector('.keyboard').getBoundingClientRect().top;
+                     window.scrollTo({ top: keyboardTop - window.innerHeight / 2, behavior: 'smooth' });
+                 }
+             });
+
+             const wavFilesContainer = document.getElementById('individual-wav-files');
+             if (wavFilesContainer) {
+                 observer.observe(wavFilesContainer, {
+                     childList: true,
+                     subtree: true
+                 });
+             }
+         }
+
+         initializeProperties() {
+             this.sampler = null;
+             this.keyboardEnabled = true;
+             this.layout = null;
+             this.rootPitch = 60;
+             this.columnOffset = 2;
+             this.rowOffset = 4;
+             this.activeNotes = new Map();
+             this.reverb = null;
+             this.releaseTime = 0.1;
+             this.noteNames = ['C', 'C#', 'D', 'D#', 'E', 'F', 'F#', 'G', 'G#', 'A', 'A#', 'B'];
+             this.majorScale = [0, 2, 4, 5, 7, 9, 11];
+         }
+
+         async loadToneJS() {
+             if (window.Tone) return;
+             const script = document.createElement('script');
+             script.src = 'https://cdnjs.cloudflare.com/ajax/libs/tone/14.8.49/Tone.js';
+             return new Promise((resolve, reject) => {
+                 script.onload = resolve;
+                 script.onerror = () => reject(new Error('Failed to load Tone.js'));
+                 document.head.appendChild(script);
+             });
+         }
+
+         init() {
+             this.createUI();
+             this.detectKeyboardLayout();
+             this.setupEventListeners();
+             this.initializeEffects();
+             this.initializeSampler();
+         }
+
+         createUI() {
+             this.container.innerHTML = `
+                 <div class="keyboard-container">
+                     <div class="effects-controls">
+                         <h3>Release & Reverb</h3>
+                         <div class="effect-slider">
+                             <label>Release: <span class="release-value">0.1s</span></label>
+                             <input type="range" class="release-slider" min="0" max="3" step="0.1" value="0.1">
+                         </div>
+                         <div class="effect-slider">
+                             <label>Reverb: <span class="reverb-value">50%</span></label>
+                             <input type="range" class="reverb-slider" min="0" max="100" value="50">
+                         </div>
+                     </div>
+                     <div class="keyboard"></div>
+                     <br>
+                     <div class="mapping-controls">
+                         <h3>Keyboard Mapping</h3>
+                         <div class="control-group">
+                             <label>Root Pitch: <span class="root-value">C4</span></label>
+                             <input type="range" class="root-slider" min="24" max="84" value="60">
+                         </div>
+                         <div class="control-group">
+                             <label>Column Offset: <span class="column-value">2</span> keys from left</label>
+                             <input type="range" class="column-slider" min="0" max="6" value="2">
+                         </div>
+                         <div class="control-group">
+                             <label>Row Offset: <span class="row-value">4</span> scale degree(s)</label>
+                             <input type="range" class="row-slider" min="1" max="20" value="4">
+                         </div>
+                     </div>
+                 </div>
+             `;
+             this.cacheElements();
+         }
+
+         cacheElements() {
+             const selectors = {
+                 keyboard: '.keyboard',
+                 rootSlider: '.root-slider',
+                 rootValue: '.root-value',
+                 columnSlider: '.column-slider',
+                 columnValue: '.column-value',
+                 rowSlider: '.row-slider',
+                 rowValue: '.row-value',
+                 releaseSlider: '.release-slider',
+                 releaseValue: '.release-value',
+                 reverbSlider: '.reverb-slider',
+                 reverbValue: '.reverb-value'
+             };
+             this.elements = Object.fromEntries(
+                 Object.entries(selectors).map(([key, selector]) =>
+                     [key, this.container.querySelector(selector)]
+                 )
+             );
+         }
+
+         setupEventListeners() {
+             const handlers = {
+                 releaseSlider: e => {
+                     this.releaseTime = parseFloat(e.target.value);
+                     this.elements.releaseValue.textContent = `${this.releaseTime}s`;
+                 },
+                 reverbSlider: e => {
+                     const wetness = parseInt(e.target.value) / 100;
+                     this.reverb.wet.value = wetness;
+                     this.elements.reverbValue.textContent = `${e.target.value}%`;
+                 },
+                 rootSlider: e => {
+                     this.rootPitch = parseInt(e.target.value);
+                     this.elements.rootValue.textContent = this.midiToNoteName(this.rootPitch);
+                     this.updateNotes();
+                 },
+                 columnSlider: e => {
+                     this.columnOffset = parseInt(e.target.value);
+                     this.elements.columnValue.textContent = this.columnOffset;
+                     this.updateNotes();
+                 },
+                 rowSlider: e => {
+                     this.rowOffset = parseInt(e.target.value);
+                     this.elements.rowValue.textContent = this.rowOffset;
+                     this.updateNotes();
+                 }
+             };
+
+             Object.entries(handlers).forEach(([element, handler]) =>
+                 this.elements[element].addEventListener('input', handler));
+
+             document.addEventListener('mouseup', () => this.handleMouseUp());
+             document.addEventListener('keydown', e => !e.repeat && this.handleKeyEvent(e, true));
+             document.addEventListener('keyup', e => this.handleKeyEvent(e, false));
+         }
+
+         initializeEffects() {
+             this.reverb = new Tone.Reverb({ decay: 1.5, wet: 0.5 }).toDestination();
+         }
+
+         async initializeSampler() {
+             // Only a subset of the exported pitches is loaded;
+             // Tone.Sampler repitches between the available samples.
+             const availableNotes = ['C1', 'F#1', 'C2', 'F#2', 'C3', 'F#3', 'C4', 'F#4', 'C5', 'F#5'];
+             const urls = Object.fromEntries(
+                 availableNotes
+                     .map(note => [note, document.querySelector(`a[href*="${note}.wav"]`)?.href])
+                     .filter(([, url]) => url)
+             );
+
+             if (!Object.keys(urls).length) {
+                 this.handleSamplerError();
+                 return;
+             }
+
+             this.sampler = new Tone.Sampler({
+                 urls,
+                 onload: () => this.handleSamplerLoad(),
+             }).connect(this.reverb);
+         }
+
+         handleSamplerError() {
+             console.log('No WAV files found');
+         }
+
+         handleSamplerLoad() {
+             console.log('Sampler loaded');
+             this.container.querySelectorAll('.key').forEach(key => key.style.opacity = '1');
+         }
+
+         detectKeyboardLayout() {
+             this.layout = {
+                 keys: [
+                     { keys: '1234567890'.split(''), offset: 0 },
+                     { keys: 'QWERTYUIOP'.split(''), offset: 1 },
+                     { keys: 'ASDFGHJKL'.split(''), offset: 1.5 },
+                     { keys: 'ZXCVBNM,.'.split(''), offset: 2 }
+                 ]
+             }.keys;
+             this.createKeyboard();
+         }
+
+         createKeyboard() {
+             this.elements.keyboard.innerHTML = '';
+             this.layout.forEach((row, rowIndex) => {
+                 const rowElement = document.createElement('div');
+                 rowElement.className = 'keyboard-row';
+                 rowElement.style.paddingLeft = `${row.offset * 3}%`;
+                 row.keys.forEach(key => rowElement.appendChild(this.createKey(key)));
+                 this.elements.keyboard.appendChild(rowElement);
+             });
+             this.updateNotes();
+         }
+
+         createKey(keyLabel) {
+             const key = document.createElement('div');
+             key.className = 'key';
+             key.innerHTML = `
+                 <div class="key-label">${keyLabel}</div>
+                 <div class="note-label"></div>
+             `;
+             key.addEventListener('mousedown', () => this.startNote(key));
+             key.addEventListener('mouseenter', e => e.buttons === 1 && this.startNote(key));
+             key.addEventListener('mouseleave', () => this.stopNote(key));
+             return key;
+         }
+
+         updateNotes() {
+             // Map each (row, column) position to a major-scale degree relative
+             // to the root pitch, then convert scale degrees to semitones.
+             Array.from(this.elements.keyboard.children).forEach((row, rowIndex) => {
+                 Array.from(row.children).forEach((key, columnIndex) => {
+                     const horizontalDistance = columnIndex - this.columnOffset;
+                     const verticalDistance = rowIndex * this.rowOffset;
+                     const totalScaleDegrees = horizontalDistance - verticalDistance;
+                     const octaves = Math.floor(totalScaleDegrees / 7);
+                     const remainingDegrees = ((totalScaleDegrees % 7) + 7) % 7;
+                     const semitonesFromRoot = this.majorScale[remainingDegrees] + (octaves * 12);
+                     const midiNote = this.rootPitch + semitonesFromRoot;
+
+                     this.updateKeyDisplay(key, midiNote);
+                 });
+             });
+         }
+
+         updateKeyDisplay(key, midiNote) {
+             const isBaseRoot = midiNote === this.rootPitch;
+             const isOctaveRoot = midiNote % 12 === this.rootPitch % 12;
+             key.style.backgroundColor = isBaseRoot ? '#90EE90' : isOctaveRoot ? '#E8F5E9' : '';
+             const noteName = this.midiToNoteName(midiNote);
+             key.querySelector('.note-label').textContent = noteName;
+             key.dataset.note = noteName;
+             key.dataset.midi = midiNote;
+         }
+
+         handleKeyEvent(e, isKeyDown) {
+             if (!this.keyboardEnabled || !this.sampler) return;
+             const keyElement = this.findKeyElement(e.key.toUpperCase());
+             if (keyElement) {
+                 e.preventDefault();
+                 isKeyDown ? this.startNote(keyElement) : this.stopNote(keyElement);
+             }
+         }
+
+         startNote(keyElement) {
+             if (!this.sampler || !keyElement || this.activeNotes.has(keyElement)) return;
+             const note = keyElement.dataset.note;
+             if (!note) return;
+
+             Tone.start().then(() => {
+                 this.sampler.triggerAttack(note);
+                 this.activeNotes.set(keyElement, { note });
+                 this.animateKey(keyElement, true);
+             });
+         }
+
+         stopNote(keyElement) {
+             if (!this.sampler || !keyElement) return;
+             const noteInfo = this.activeNotes.get(keyElement);
+             if (noteInfo) {
+                 this.sampler.triggerRelease(noteInfo.note, "+" + this.releaseTime);
+                 this.activeNotes.delete(keyElement);
+                 this.animateKey(keyElement, false);
+             }
+         }
+
+         handleMouseUp() {
+             this.activeNotes.forEach((_, keyElement) => this.stopNote(keyElement));
+         }
+
+         findKeyElement(keyLabel) {
+             for (const row of this.elements.keyboard.children) {
+                 for (const key of row.children) {
+                     if (key.querySelector('.key-label').textContent === keyLabel) return key;
+                 }
+             }
+             return null;
+         }
+
+         animateKey(keyElement, isDown) {
+             const midiNote = parseInt(keyElement.dataset.midi);
+             const isBaseRoot = midiNote === this.rootPitch;
+             const isOctaveRoot = midiNote % 12 === this.rootPitch % 12;
+
+             keyElement.style.transform = isDown ? 'scale(0.95)' : '';
+             keyElement.style.backgroundColor = isBaseRoot ? '#90EE90' :
+                 isOctaveRoot ? '#E8F5E9' :
+                 isDown ? '#f0f0f0' : '';
+         }
+
+         midiToNoteName(midiNumber) {
+             const octave = Math.floor(midiNumber / 12) - 1;
+             return `${this.noteNames[midiNumber % 12]}${octave}`;
+         }
+     }
+
+     let container = document.getElementById('custom-player');
+     if (!container) {
+         container = document.createElement('div');
+         container.id = 'custom-player';
+         document.body.appendChild(container);
+     }
+     new KeyboardPlayer('custom-player');
+ }
requirements.txt ADDED
@@ -0,0 +1,9 @@
+ --extra-index-url https://download.pytorch.org/whl/cu113
+ torch
+ stable-audio-tools==0.0.16
+ gradio==5.8.0
+ einops
+ spaces
+ lxml
+ transformers==4.44.0
+ tokenizers==0.19.1
sf-creator-fork/.gitignore ADDED
@@ -0,0 +1,3 @@
+ venv/
+ __pycache__
+ */__pycache__
sf-creator-fork/README.md ADDED
@@ -0,0 +1,90 @@
+ # Soundfont creation
+
+ This library aims to create a soundfont from a directory containing sound files (`.wav`).
+
+ Current support:
+
+ * [SFZ Format](https://sfzformat.com/)
+ * [DecentSampler](https://www.decentsamples.com/product/decent-sampler-plugin/)
+
+ Planned support:
+
+ * [Soundfont 2 `sf2`](https://en.wikipedia.org/wiki/SoundFont) (This will be trickier, as it is a binary format;
+   since [Polyphone](https://www.polyphone-soundfonts.com/) is able to convert `sfz` to `sf2`, I will postpone this)
+
+ ## Project setup
+
+ ### Creating a project-specific virtual environment (recommended)
+
+ You can omit this step if you are OK with installing the dependencies
+ system-wide and go directly to the next step: [Installing dependencies](#installing-dependencies).
+
+ ```
+ virtualenv venv
+ source venv/bin/activate
+ ```
+ or, under Windows:
+ ```
+ virtualenv venv
+ venv\Scripts\activate
+ ```
+
+ ### Installing dependencies
+
+ ```
+ pip install -r requirements.txt
+ ```
+
+ ## Run
+
+ This will create a file `soundfont.sfz` alongside the `wav` files in the given directory:
+
+ ```
+ python main.py sfz <directory-to-wave-files>
+ ```
+
+ Run `python main.py --help` or `python main.py <command> --help`, where `<command>` can currently be `sfz` or `decentsampler`,
+ to get the full list of arguments.
+
+ ## Automatic note detection and mapping of samples
+
+ The given samples are scanned for note names (A0 to C8). If a note name is found in a sample's filename, the MIDI
+ note for that sample is set automatically.
+
+ In addition, when samples are missing for notes in between, an automatic distribution is calculated so that all notes between A0 and C8 are covered (see the sketch after this README).
+
+ If two samples are available for the same note, round-robin/random alternation is assumed.
+
+ ## TODO and resources
+
+ - [ ] Make the automatic distribution over all MIDI notes from 21 to 108 optional, and add an option to configure the highest and the lowest key, ideally relative to the highest and lowest pitch of the samples
+ - [ ] Detect pitch automatically (for melodic instruments at least), using https://pypi.org/project/crepe/
+
+ ### More SFZ Support
+
+ The best starting point for SFZ is https://sfzformat.com/.
+
+ - [ ] Look at the [SFZ Python Automapper by Peter Eastman](https://vis.versilstudios.com/sfzconverter.html#u13452-4); it looks like a lot of it can be reused for sfz files
+ - [ ] And https://github.com/freepats/freepats-tools, too?
+ - [ ] Add support for velocity levels
+ - [ ] Add support for more options supported by SFZ: reverb, effects, attack, release and so on
+
+ ### DecentSampler support
+
+ An XML-based format developed by David Hilowitz (see https://youtu.be/UxPRmD_RNCY).
+
+ - [x] Create an XML Schema for highlighting and autocompletion
+ - [x] Implement `DecentSamplerWriter`
+ - [x] Add options for UI (cover)
+ - [ ] ...and effects
+
+ ### SF2 Support
+
+ This will get tricky, as this is a binary format with not many examples. There are a few applications out there reading or even writing sf2, at least at a very basic level.
+ But as [Polyphone](https://www.polyphone-soundfonts.com/) is able to convert `sfz` to `sf2`, I will postpone this.
+
+ * Basic SFZ to SF2 converter in python: https://github.com/freepats/freepats-tools
+ * C++ library [sf2cute](http://gocha.github.io/sf2cute/)
+ * Python library reading sf2: https://pypi.org/project/sf2utils/
+ * C# code writing basic sf2 file: https://github.com/Kermalis/SoundFont2
+ * Code of Polyphone: https://github.com/davy7125/polyphone
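
A minimal sketch of the note detection and key-range distribution described in the README above (simplified from `NoteNameMidiNumberMapper` and `HighLowKeyDistributor` in `sfcreator/soundfont/soundfont.py`; not the library's exact code):

```python
NOTE_NAMES = ['C', 'C#', 'D', 'D#', 'E', 'F', 'F#', 'G', 'G#', 'A', 'A#', 'B']

def note_to_midi(name: str) -> int:
    # "A0" -> 21, "C4" -> 60, "C8" -> 108 (single-digit octaves only)
    pitch, octave = name[:-1], int(name[-1])
    return NOTE_NAMES.index(pitch) + (octave + 1) * 12

def distribute(roots: list) -> list:
    # Give each detected root key a [low, high] range by splitting the gap
    # between neighbouring roots at the midpoint, clamped to 21..108.
    roots = sorted(roots)
    ranges = []
    for i, root in enumerate(roots):
        low = min(21, root) if i == 0 else (roots[i - 1] + root) // 2 + 1
        high = max(108, root) if i == len(roots) - 1 else (root + roots[i + 1]) // 2
        ranges.append((low, high))
    return ranges

print(note_to_midi("C4"))        # 60
print(distribute([20, 40, 60]))  # [(20, 30), (31, 50), (51, 108)]
```

These values match the expectations in `sfcreator/soundfont/test_soundfont.py`.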
sf-creator-fork/main.py ADDED
@@ -0,0 +1,72 @@
+ import os
+ from typing import List
+
+ import glob
+
+ from sfcreator.decentsampler import DecentSamplerWriter
+ from sfcreator.sfz import SfzWriter
+ from sfcreator.soundfont.soundfont import SoundFont, Sample, NoteNameMidiNumberMapper, HighLowKeyDistributor
+
+
+ def list_supported_files(directory):
+     return glob.glob(directory + "/*.wav")
+
+
+ def map_note_name(note_name: str):
+     if note_name.isdigit():
+         return int(note_name)
+     else:
+         return NoteNameMidiNumberMapper().map(note_name)
+
+ # @cli.command()
+ # @click.argument('directory')
+ # @click.option('--highkey', required=False, type=str, default="108")
+ # @click.option('--lowkey', required=False, type=str, default="21")
+ # @click.option('--instrument', help="name of the instrument", required=False, type=str)
+ # @click.option('--loopmode',
+ #               help="loop mode, no_loop (default), one_shot, loop_continuous or loop_sustain,"
+ #                    " see https://sfzformat.com/opcodes/loop_mode",
+ #               required=False, type=str, default="no_loop")
+ # @click.option('--polyphony',
+ #               help="Polyphony voice limit, see https://sfzformat.com/opcodes/polyphony",
+ #               required=False, type=str, default=None)
+ def sfz(directory: str, lowkey: str, highkey: str, instrument: str, loopmode: str, polyphony: str):
+     soundfont = make_soundfont(directory, map_note_name(lowkey), map_note_name(highkey), instrument, loopmode,
+                                polyphony)
+     SfzWriter().write(directory, soundfont)
+
+
+ # @cli.command()
+ # @click.argument('directory')
+ # @click.option('--highkey', required=False, type=str, default="108")
+ # @click.option('--lowkey', required=False, type=str, default="21")
+ # @click.option('--instrument', help="name of the instrument", required=False, type=str)
+ # @click.option('--image', required=False, type=str)
+ # @click.option('--loopmode',
+ #               help="loop mode, only supported for sfz. For compatibility reasons, --loopmode=one_shot will cause"
+ #                    " --release=20 for decent sampler, to ensure samples are always played until the end.",
+ #               required=False, type=str, default="no_loop")
+ # @click.option('--polyphony',
+ #               help="Polyphony voice limit, not supported in decent sampler format.",
+ #               required=False, type=str, default=None)
+ def decentsampler(directory: str, lowkey: str, highkey: str, instrument: str, loopmode: str, polyphony: str,
+                   image: str):
+     soundfont = make_soundfont(directory, map_note_name(lowkey), map_note_name(highkey), instrument, loopmode,
+                                polyphony)
+     DecentSamplerWriter().write(directory, soundfont, image)
+
+
+ def make_soundfont(directory: str, lowkey: int, highkey: int, instrument: str, loopmode: str, polyphony: str):
+     files = list_supported_files(directory)
+     samples: List[Sample] = []
+     mapper = NoteNameMidiNumberMapper()
+     for file in files:
+         samples.append(mapper.mapped_sample(file))
+     soundfont = SoundFont(samples, loop_mode=loopmode, polyphony=polyphony)
+     if instrument is None or len(instrument) == 0:
+         soundfont.instrument_name = os.path.basename(os.path.dirname(directory))
+     else:
+         soundfont.instrument_name = instrument
+     HighLowKeyDistributor().distribute(soundfont, low_key=lowkey, high_key=highkey)
+     return soundfont
sf-creator-fork/requirements.txt ADDED
@@ -0,0 +1,2 @@
+ click~=7.1.2
+ lxml~=4.6.1
sf-creator-fork/resources/decentsampler/dspreset.xsd ADDED
@@ -0,0 +1,134 @@
+ <xs:schema attributeFormDefault="unqualified" elementFormDefault="qualified" xmlns:xs="http://www.w3.org/2001/XMLSchema">
+   <xs:element name="DecentSampler">
+     <xs:complexType>
+       <xs:sequence>
+         <xs:element name="ui">
+           <xs:complexType>
+             <xs:sequence>
+               <xs:element name="tab">
+                 <xs:complexType>
+                   <xs:sequence>
+                     <xs:element name="labeled-knob" maxOccurs="unbounded" minOccurs="0">
+                       <xs:complexType>
+                         <xs:sequence>
+                           <xs:element name="binding" maxOccurs="unbounded" minOccurs="0">
+                             <xs:complexType>
+                               <xs:simpleContent>
+                                 <xs:extension base="xs:string">
+                                   <xs:attribute type="xs:string" name="type" use="optional"/>
+                                   <xs:attribute type="xs:string" name="level" use="optional"/>
+                                   <xs:attribute type="xs:byte" name="position" use="optional"/>
+                                   <xs:attribute type="xs:string" name="parameter" use="optional"/>
+                                   <xs:attribute type="xs:string" name="translation" use="optional"/>
+                                   <xs:attribute type="xs:byte" name="translationOutputMin" use="optional"/>
+                                   <xs:attribute type="xs:float" name="translationOutputMax" use="optional"/>
+                                   <xs:attribute type="xs:string" name="translationTable" use="optional"/>
+                                 </xs:extension>
+                               </xs:simpleContent>
+                             </xs:complexType>
+                           </xs:element>
+                         </xs:sequence>
+                         <xs:attribute type="xs:short" name="x" use="optional"/>
+                         <xs:attribute type="xs:byte" name="y" use="optional"/>
+                         <xs:attribute type="xs:string" name="label" use="optional"/>
+                         <xs:attribute type="xs:string" name="type" use="optional"/>
+                         <xs:attribute type="xs:short" name="minValue" use="optional"/>
+                         <xs:attribute type="xs:short" name="maxValue" use="optional"/>
+                         <xs:attribute type="xs:string" name="textColor" use="optional"/>
+                         <xs:attribute type="xs:float" name="value" use="optional"/>
+                       </xs:complexType>
+                     </xs:element>
+                   </xs:sequence>
+                   <xs:attribute type="xs:string" name="name"/>
+                 </xs:complexType>
+               </xs:element>
+             </xs:sequence>
+             <xs:attribute type="xs:string" name="bgImage"/>
+             <xs:attribute type="xs:short" name="width"/>
+             <xs:attribute type="xs:short" name="height"/>
+             <xs:attribute type="xs:string" name="layoutMode"/>
+             <xs:attribute type="xs:string" name="bgMode"/>
+           </xs:complexType>
+         </xs:element>
+         <xs:element name="groups">
+           <xs:complexType>
+             <xs:sequence>
+               <xs:element name="group" maxOccurs="unbounded" minOccurs="0">
+                 <xs:complexType>
+                   <xs:sequence>
+                     <xs:element name="sample" maxOccurs="unbounded" minOccurs="0">
+                       <xs:complexType>
+                         <xs:simpleContent>
+                           <xs:extension base="xs:string">
+                             <xs:attribute type="xs:string" name="path" use="optional"/>
+                             <xs:attribute type="xs:byte" name="rootNote" use="optional"/>
+                             <xs:attribute type="xs:byte" name="loNote" use="optional"/>
+                             <xs:attribute type="xs:byte" name="hiNote" use="optional"/>
+                             <xs:attribute type="xs:int" name="loopStart" use="optional"/>
+                             <xs:attribute type="xs:int" name="loopEnd" use="optional"/>
+                           </xs:extension>
+                         </xs:simpleContent>
+                       </xs:complexType>
+                     </xs:element>
+                   </xs:sequence>
+                   <xs:attribute type="xs:string" name="name" use="optional"/>
+                   <xs:attribute type="xs:string" name="volume" use="optional"/>
+                   <xs:attribute type="xs:byte" name="ampVelTrack" use="optional"/>
+                   <xs:attribute type="xs:byte" name="modVolume" use="optional"/>
+                 </xs:complexType>
+               </xs:element>
+             </xs:sequence>
+             <xs:attribute type="xs:float" name="attack"/>
+             <xs:attribute type="xs:float" name="decay"/>
+             <xs:attribute type="xs:float" name="sustain"/>
+             <xs:attribute type="xs:byte" name="release"/>
+           </xs:complexType>
+         </xs:element>
+         <xs:element name="effects">
+           <xs:complexType>
+             <xs:sequence>
+               <xs:element name="effect" maxOccurs="unbounded" minOccurs="0">
+                 <xs:complexType>
+                   <xs:simpleContent>
+                     <xs:extension base="xs:string">
+                       <xs:attribute type="xs:string" name="type" use="optional"/>
+                     </xs:extension>
+                   </xs:simpleContent>
+                 </xs:complexType>
+               </xs:element>
+             </xs:sequence>
+           </xs:complexType>
+         </xs:element>
+         <xs:element name="midi">
+           <xs:complexType>
+             <xs:sequence>
+               <xs:element name="cc">
+                 <xs:complexType>
+                   <xs:sequence>
+                     <xs:element name="binding">
+                       <xs:complexType>
+                         <xs:simpleContent>
+                           <xs:extension base="xs:string">
+                             <xs:attribute type="xs:string" name="level"/>
+                             <xs:attribute type="xs:string" name="type"/>
+                             <xs:attribute type="xs:byte" name="position"/>
+                             <xs:attribute type="xs:string" name="parameter"/>
+                             <xs:attribute type="xs:string" name="translation"/>
+                             <xs:attribute type="xs:byte" name="translationOutputMin"/>
+                             <xs:attribute type="xs:float" name="translationOutputMax"/>
+                           </xs:extension>
+                         </xs:simpleContent>
+                       </xs:complexType>
+                     </xs:element>
+                   </xs:sequence>
+                   <xs:attribute type="xs:byte" name="number"/>
+                 </xs:complexType>
+               </xs:element>
+             </xs:sequence>
+           </xs:complexType>
+         </xs:element>
+       </xs:sequence>
+       <xs:attribute type="xs:byte" name="pluginVersion"/>
+     </xs:complexType>
+   </xs:element>
+ </xs:schema>
sf-creator-fork/sfcreator/__init__.py ADDED
File without changes
sf-creator-fork/sfcreator/decentsampler/__init__.py ADDED
@@ -0,0 +1,47 @@
+ import os
+ from typing import List
+ from lxml import etree
+ from sfcreator.soundfont.soundfont import SoundFont, Sample
+
+ class DecentSamplerWriter:
+     def write(self, directory: str, soundfont: SoundFont, image: str):
+         root = etree.Element("DecentSampler", pluginVersion="1")
+
+         if image is not None:
+             ui = etree.Element("ui", bgImage=image, width="812", height="375", layoutMode="relative", bgMode="top_left")
+             root.append(ui)
+             tab = etree.Element("tab", name="main")
+             ui.append(tab)
+
+         groups = etree.Element("groups")
+         root.append(groups)
+
+         if soundfont.loop_mode == "one_shot":
+             # assuming no sample will be longer than 20s
+             groups.set("release", "20")
+         elif soundfont.release is not None:
+             groups.set("release", str(soundfont.release))
+
+         for root_key in soundfont.root_keys():
+             self.write_root_key_sample_group(groups, soundfont.samples_for_root_key(root_key))
+
+         filename = directory + "/" + soundfont.instrument_name + ".dspreset"
+         print("Writing to " + filename)
+         et = etree.ElementTree(root)
+         et.write(filename, pretty_print=True, encoding='utf-8', xml_declaration=True)
+
+     def write_root_key_sample_group(self, groups, samples: List[Sample]):
+         group = etree.Element("group")
+         groups.append(group)
+         if len(samples) > 1:
+             group.set("seqMode", "random")
+         for index, sample in enumerate(samples, start=1):
+             xml_sample = etree.Element("sample")
+             group.append(xml_sample)
+             xml_sample.set("rootNote", str(sample.key_range.root_key))
+             xml_sample.set("loNote", str(sample.key_range.low_key))
+             xml_sample.set("hiNote", str(sample.key_range.high_key))
+             xml_sample.set("loVel", str(sample.velocity_range.low_velocity))
+             xml_sample.set("hiVel", str(sample.velocity_range.high_velocity))
+             xml_sample.set("seqPosition", str(index))
+             xml_sample.set("path", str(os.path.basename(sample.filename)))
sf-creator-fork/sfcreator/sfz/__init__.py ADDED
@@ -0,0 +1,37 @@
+ import os
+ from typing import List
+
+ from sfcreator.soundfont.soundfont import SoundFont, Sample
+
+
+ class SfzWriter:
+     def write(self, directory: str, soundfont: SoundFont):
+         f = open(directory + "/" + soundfont.instrument_name + ".sfz", "w")
+         f.write("<group>\r\n")
+         if soundfont.loop_mode is not None:
+             f.write(f"loop_mode={soundfont.loop_mode}\r\n")
+         if soundfont.polyphony is not None:
+             f.write(f"polyphony={soundfont.polyphony}\r\n")
+         for root_key in soundfont.root_keys():
+             self.write_root_key_sample_group(f, soundfont.samples_for_root_key(root_key))
+         f.close()
+
+     def write_root_key_sample_group(self, f, samples: List[Sample]):
+         if len(samples) == 1:
+             sample = samples[0]
+             f.write(f"<region> sample={os.path.basename(sample.filename)}"
+                     f" pitch_keycenter={str(sample.key_range.root_key)} lokey={str(sample.key_range.low_key)}"
+                     f" hikey={str(sample.key_range.high_key)}\r\n")
+         else:
+             # Spread multiple samples for the same root key over the [0, 1) random
+             # range so the player alternates between them (lorand/hirand opcodes)
+             lorand = 0.0
+             randstep = 1 / len(samples)
+             for sample in samples:
+                 hirand = lorand + randstep
+                 f.write(f"<region> sample={os.path.basename(sample.filename)}"
+                         f" pitch_keycenter={str(sample.key_range.root_key)} lokey={str(sample.key_range.low_key)}"
+                         f" hikey={str(sample.key_range.high_key)}")
+                 if lorand > 0.0:
+                     f.write(f" lorand={lorand}")
+                 if hirand < 1.0:
+                     f.write(f" hirand={hirand}")
+                 f.write("\r\n")
+                 lorand = hirand
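
As an illustration, for two samples sharing root key 60 (filenames and key ranges hypothetical), the writer above emits regions split over the lorand/hirand random range so the player alternates between them:

```
<group>
loop_mode=no_loop
<region> sample=C4.wav pitch_keycenter=60 lokey=51 hikey=69 hirand=0.5
<region> sample=C4-1.wav pitch_keycenter=60 lokey=51 hikey=69 lorand=0.5
```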
sf-creator-fork/sfcreator/soundfont/__init__.py ADDED
File without changes
sf-creator-fork/sfcreator/soundfont/soundfont.py ADDED
@@ -0,0 +1,119 @@
+ from typing import List
+
+
+ class KeyRange:
+     def __init__(self, root_key: int = 60, low_key: int = None, high_key: int = None):
+         self.root_key = root_key
+         self.low_key = low_key
+         self.high_key = high_key
+         if self.low_key is None:
+             self.low_key = self.root_key
+         if self.high_key is None:
+             self.high_key = self.root_key
+
+     def in_range(self, key):
+         return self.low_key <= key <= self.high_key
+
+     def __eq__(self, other):
+         return self.low_key == other.low_key and self.root_key == other.root_key and self.high_key == other.high_key
+
+
+ class VelocityRange:
+     def __init__(self, low_velocity: int = 0, high_velocity: int = 127):
+         self.low_velocity = low_velocity
+         self.high_velocity = high_velocity
+
+     def in_range(self, velocity):
+         return self.low_velocity <= velocity <= self.high_velocity
+
+     def __eq__(self, other):
+         return self.low_velocity == other.low_velocity and \
+                self.high_velocity == other.high_velocity
+
+
+ class Sample:
+     def __init__(self, filename: str, root_key: int = 60, low_key: int = None, high_key: int = None):
+         self.velocity_range = VelocityRange()
+         self.filename = filename
+         self.key_range = KeyRange(root_key, low_key, high_key)
+         self.index = 0
+
+     def __repr__(self) -> str:
+         return 'Sample(filename=' + self.filename + ', root_key=' + str(self.key_range.root_key) + ', low_key=' + str(
+             self.key_range.low_key) + ', high_key=' + str(self.key_range.high_key) + ')'
+
+     def __eq__(self, other):
+         return self.filename == other.filename and self.key_range == other.key_range and self.index == other.index
+
+
+ class SoundFont:
+     def __init__(self, samples: List[Sample], loop_mode: str = "no_loop", polyphony: str = None, release: int = None,
+                  instrument_name=None):
+         self.instrument_name = instrument_name
+         self.samples = samples
+         self.loop_mode = loop_mode
+         self.polyphony = polyphony
+         self.release = release
+
+     def root_keys(self):
+         return sorted(set([sample.key_range.root_key for sample in self.samples]))
+
+     def range_for_key(self, key) -> KeyRange:
+         samples_in_range = [sample for sample in self.samples if sample.key_range.in_range(key)]
+         return samples_in_range[0].key_range if len(samples_in_range) > 0 else None
+
+     def samples_for_root_key(self, root_key):
+         return [sample for sample in self.samples if sample.key_range.root_key == root_key]
+
+     def set_range(self, root_key, low_key=None, high_key=None):
+         for sample in self.samples_for_root_key(root_key):
+             if low_key is not None:
+                 sample.key_range.low_key = low_key
+             if high_key is not None:
+                 sample.key_range.high_key = high_key
+
+
+ class HighLowKeyDistributor:
+     def distribute(self, soundfont: SoundFont, low_key: int = 21, high_key: int = 108):
+         soundfont.samples.sort(key=lambda sample: sample.key_range.root_key, reverse=False)
+
+         prev_root_key: int = None
+         for root_key in soundfont.root_keys():
+             key_range = soundfont.range_for_key(root_key)
+             if prev_root_key is None:
+                 # The lowest sample covers everything down to low_key
+                 soundfont.set_range(root_key, low_key=min(low_key, key_range.low_key))
+             else:
+                 # Split the gap between neighbouring root keys at the midpoint
+                 prev_range = soundfont.range_for_key(prev_root_key)
+                 mid_sample_key = int((key_range.low_key - prev_range.high_key) / 2) + prev_range.high_key
+                 soundfont.set_range(prev_root_key, high_key=mid_sample_key)
+                 soundfont.set_range(root_key, low_key=mid_sample_key + 1)
+             prev_root_key = root_key
+         # The highest sample covers everything up to high_key
+         soundfont.set_range(soundfont.root_keys()[-1],
+                             high_key=max(high_key, soundfont.range_for_key(soundfont.root_keys()[-1]).high_key))
+
+
+ class NoteNameMidiNumberMapper:
+     def __init__(self):
+         self.index_offset = 21
+         self.note_name_midi_number_map: List[str] = []
+         for octave_number in range(0, 8):
+             for c in range(ord('A'), ord('B') + 1):
+                 self._add_note(c, octave_number)
+             for c in range(ord('C'), ord('G') + 1):
+                 self._add_note(c, octave_number + 1)
+         self.note_name_midi_number_map.append("C8")
+
+     def _add_note(self, c, octave_number):
+         self.note_name_midi_number_map.append(f"{chr(c)}{str(octave_number)}")
+         if chr(c) != "B" and chr(c) != "E":  # B and E have no sharp
+             self.note_name_midi_number_map.append(f"{chr(c)}#{str(octave_number)}")
+
+     def mapped_sample(self, filename: str):
+         for note in self.note_name_midi_number_map:
+             if note in filename:
+                 return Sample(filename, self.map(note))
+         return Sample(filename)
+
+     def map(self, note_name: str):
+         return self.note_name_midi_number_map.index(note_name) + self.index_offset
sf-creator-fork/sfcreator/soundfont/test_soundfont.py ADDED
@@ -0,0 +1,70 @@
+ from unittest import TestCase
+
+ from sfcreator.soundfont.soundfont import *
+
+
+ class TestHighLowKeyDistributor(TestCase):
+     def test_distribution_of_samples_one_sample_only(self):
+         distributor = HighLowKeyDistributor()
+
+         sf = SoundFont(samples=[
+             Sample("filename.wav", root_key=60)
+         ])
+         distributor.distribute(sf)
+
+     def test_distribution_of_samples_unsorted_samples(self):
+         distributor = HighLowKeyDistributor()
+         sf = SoundFont(samples=[
+             Sample("40.wav", root_key=40),
+             Sample("60.wav", root_key=60),
+             Sample("20.wav", root_key=20)
+         ])
+         distributor.distribute(sf)
+
+         self.assertEqual(Sample("20.wav", root_key=20, low_key=20, high_key=30), sf.samples[0])
+         self.assertEqual(Sample("40.wav", root_key=40, low_key=31, high_key=50), sf.samples[1])
+         self.assertEqual(Sample("60.wav", root_key=60, low_key=51, high_key=108), sf.samples[2])
+
+     def test_distribution_of_samples_over_108(self):
+         distributor = HighLowKeyDistributor()
+         sf = SoundFont(samples=[
+             Sample("40.wav", root_key=40),
+             Sample("110.wav", root_key=110)
+         ])
+         distributor.distribute(sf)
+
+         self.assertEqual(Sample("40.wav", root_key=40, low_key=21, high_key=75), sf.samples[0])
+         self.assertEqual(Sample("110.wav", root_key=110, low_key=76, high_key=110), sf.samples[1])
+
+     def test_distribution_of_samples_with_duplicated_notes(self):
+         distributor = HighLowKeyDistributor()
+         sf = SoundFont(samples=[
+             Sample("40.wav", root_key=40),
+             Sample("40-1.wav", root_key=40),
+             Sample("60.wav", root_key=60),
+             Sample("20.wav", root_key=20)
+         ])
+         distributor.distribute(sf)
+
+         self.assertEqual(Sample("20.wav", root_key=20, low_key=20, high_key=30), sf.samples[0])
+         self.assertEqual(Sample("40.wav", root_key=40, low_key=31, high_key=50), sf.samples[1])
+         self.assertEqual(Sample("40-1.wav", root_key=40, low_key=31, high_key=50), sf.samples[2])
+         self.assertEqual(Sample("60.wav", root_key=60, low_key=51, high_key=108), sf.samples[3])
+
+
+ class TestSoundFont(TestCase):
+     def test_creation_of_soundfont(self):
+         sf = SoundFont(samples=[
+             Sample("filename.wav", root_key=60)
+         ])
+         self.assertEqual(sf.samples[0].key_range.high_key, 60)
+         self.assertEqual(sf.samples[0].key_range.low_key, 60)
+
+
+ class TestNoteNameMidiNumberMapper(TestCase):
+     def test_map_note_name(self):
+         mapper = NoteNameMidiNumberMapper()
+         self.assertEqual(21, mapper.map("A0"))
+         self.assertEqual(108, mapper.map("C8"))
+         self.assertEqual(61, mapper.map("C#4"))
+         self.assertEqual(60, mapper.map("C4"))
train_lfm.py ADDED
@@ -0,0 +1,314 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import pytorch_lightning as pl
3
+ from torch import nn
4
+ from tqdm import tqdm
5
+ import numpy as np
6
+ import einops
7
+ import wandb
8
+ import torch
9
+ # import wandb logging
10
+ from pytorch_lightning.loggers import WandbLogger
11
+ from stable_audio_tools import get_pretrained_model
12
+ from transformers import T5Tokenizer, T5EncoderModel
13
+
14
+
15
+ class SinActivation(nn.Module):
16
+ def forward(self, x):
17
+ return torch.sin(x)
18
+
19
+ class FourierFeatures(nn.Module):
20
+
21
+ def __init__(self, in_features, out_features, n_layers):
22
+ super().__init__()
23
+ self.in_features = in_features
24
+ self.out_features = out_features
25
+ self.n_layers = n_layers
26
+ layers = []
27
+ layers += [nn.Linear(in_features, out_features)]
28
+ # add sin activation
29
+ layers += [SinActivation()]
30
+ for i in range(n_layers-1):
31
+ layers += [nn.Linear(out_features, out_features)]
32
+ layers += [SinActivation()]
33
+ self.layers = nn.Sequential(*layers)
34
+
35
+ def forward(self, x):
36
+ return self.layers(x)
37
+
38
+ class FlowMatchingModule(pl.LightningModule):
39
+
40
+ def __init__(self, main_model=None, text_conditioner=None, max_tokens=128, n_channels=None, t_input=None):
41
+ super().__init__()
42
+ self.save_hyperparameters(ignore=['main_model', "text_conditioner"])
43
+
44
+ self.model = main_model.transformer
45
+ self.input_layer = main_model.transformer.project_in
46
+ self.output_layer = main_model.transformer.project_out
47
+
48
+ self.text_conditioner = text_conditioner
49
+
50
+ self.d_model = self.input_layer.weight.shape[0]
51
+ self.d_input = self.input_layer.weight.shape[1]
52
+
53
+ # use fourier features for schedule
54
+ self.schedule_embedding = FourierFeatures(1, self.d_model, 2)
55
+ # use learned positional encoding
56
+ self.pitch_embedding = nn.Parameter(torch.randn(n_channels, self.d_model))
57
+ # make embedding layer for tags
58
+ self.channels = n_channels
59
+
60
+ mean_proj = []
61
+ for layer in self.model.layers:
62
+ mean_proj += [nn.Linear(self.d_model, self.d_model)]
63
+ self.mean_proj = nn.ModuleList(mean_proj)
64
+
65
+ def get_example_inputs(self):
66
+ text = "A piano playing a C major chord"
67
+ conditioning, conditioning_mask = self.text_conditioner(text, device = self.device)
68
+
69
+ # repeat conditioning
70
+ conditioning = einops.repeat(conditioning, 'b t d-> b t c d', c=self.channels)
71
+ conditioning_mask = einops.repeat(conditioning_mask, 'b t -> b t c', c=self.channels)
72
+
73
+ t = torch.rand(1, device=self.device)
74
+ z = torch.randn(1, self.hparams.t_input ,self.hparams.n_channels, self.d_input , device=self.device)
75
+ return z, conditioning, conditioning_mask, t
76
+
77
+     def forward(self, x, conditioning, conditioning_mask, t):
+         batch, t_input, n_channels, d_input = x.shape
+
+         # project into the model dimension, then add time and pitch embeddings
+         x = self.input_layer(x)
+         tz = self.schedule_embedding(t[:, None, None, None])
+         pitch_z = self.pitch_embedding[None, None, :n_channels, :]
+         x = x + tz + pitch_z
+         rot = self.model.rotary_pos_emb.forward_from_seq_len(x.shape[1])
+
+         # fold pitch channels into the batch so each channel runs as its own sequence
+         conditioning = einops.rearrange(conditioning, 'b t c d -> (b c) t d', c=self.channels)
+         conditioning_mask = einops.rearrange(conditioning_mask, 'b t c -> (b c) t', c=self.channels)
+
+         for layer_idx, layer in enumerate(self.model.layers):
+             x = einops.rearrange(x, 'b t c d -> (b c) t d')
+             x = layer(x, rotary_pos_emb=rot, context=conditioning, context_mask=conditioning_mask)
+             x = einops.rearrange(x, '(b c) t d -> b t c d', c=self.channels)
+             # mix information across pitch channels via the per-layer mean projection
+             x_ch_mean = x.mean(dim=2)
+             x_ch_mean = self.mean_proj[layer_idx](x_ch_mean)
+             # earlier variants, kept for reference:
+             # x_ch_mean = torch.relu(x_ch_mean)
+             # x_ch_mean = torch.layer_norm(x_ch_mean, x_ch_mean.shape[1:])
+             x = x + x_ch_mean[:, :, None, :]
+         x = self.output_layer(x)
+         return x
+
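+     # For concreteness, a shape walkthrough of forward() under the training
+     # configuration used below (BATCH_SIZE=1, LATENT_T=86, 10 pitch channels;
+     # d_input and d_model come from the pretrained projections):
+     #
+     #     x: (1, 86, 10, d_input) --input_layer--> (1, 86, 10, d_model)
+     #     per layer: (1, 86, 10, d_model) -> (10, 86, d_model) -> back
+     #     output_layer: (1, 86, 10, d_model) -> (1, 86, 10, d_input)
+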
+     def step(self, batch, batch_idx):
+         x = batch["z"]
+         text = batch["text"]
+         conditioning, conditioning_mask = self.text_conditioner(text, device=self.device)
+
+         # repeat the text conditioning across pitch channels
+         conditioning = einops.repeat(conditioning, 'b t d -> b t c d', c=self.channels)
+         conditioning_mask = einops.repeat(conditioning_mask, 'b t -> b t c', c=self.channels)
+
+         x = einops.rearrange(x, 'b c d t -> b t c d')
+         # conditional flow matching: draw noise z0 and data z1, pick a random
+         # time t, build the linear interpolation zt, and regress the velocity
+         z0 = torch.randn(x.shape, device=x.device)
+         z1 = x
+         t = torch.rand(x.shape[0], device=x.device)
+         zt = t[:, None, None, None] * z1 + (1 - t[:, None, None, None]) * z0
+         vt = self(zt, conditioning, conditioning_mask, t)
+         loss = (vt - (z1 - z0)).pow(2).mean()
+         return loss
+
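+     # The objective above is standard flow matching with a linear (rectified)
+     # path:
+     #
+     #     z_t = t * z1 + (1 - t) * z0,    dz_t/dt = z1 - z0
+     #     loss = E_{t, z0, z1} || v_theta(z_t, t) - (z1 - z0) ||^2
+     #
+     # so `sample` below integrates dz/dt = v_theta(z, t) from t = 0 (noise)
+     # to t = 1 (data) with plain Euler steps.
+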
+     @torch.inference_mode()
+     def sample(self, batch_size, text, steps=10, same_latent=False):
+         # ensure inputs match the model's device and dtype
+         device = next(self.parameters()).device
+         dtype = self.input_layer.weight.dtype
+
+         conditioning, conditioning_mask = self.text_conditioner(text, device=device)
+         conditioning = einops.repeat(conditioning, "b t d -> b t c d", c=self.channels)
+         conditioning_mask = einops.repeat(
+             conditioning_mask, "b t -> b t c", c=self.channels
+         )
+         conditioning = conditioning.to(device=device, dtype=dtype)
+         conditioning_mask = conditioning_mask.to(device=device)
+
+         self.eval()
+         with torch.no_grad():
+             # start from Gaussian noise at t = 0
+             z0 = torch.randn(
+                 batch_size,
+                 self.hparams.t_input,
+                 self.hparams.n_channels,
+                 self.d_input,
+                 device=device,
+                 dtype=dtype,
+             )
+
+             # optionally share one latent across the batch so only the prompt varies
+             if same_latent:
+                 z0 = z0[0].repeat(batch_size, 1, 1, 1)
+
+             # Euler integration of dz/dt = v(z, t) from t = 0 to t = 1
+             zt = z0
+             for step in tqdm(range(steps)):
+                 t = torch.tensor([step / steps], device=device, dtype=dtype)
+                 zt = zt + (1 / steps) * self.forward(
+                     zt, conditioning, conditioning_mask, t
+                 )
+
+         return zt
+
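+     # A minimal usage sketch (hypothetical names; the decoder that turns these
+     # latents back into audio lives outside this file):
+     #
+     #     module = FlowMatchingModule.load_from_checkpoint(
+     #         ckpt_path, main_model=transformer_model, text_conditioner=text_conditioner)
+     #     latents = module.sample(2, ["warm analog pad", "plucked koto"], steps=50)
+     #     # latents: (2, t_input, n_channels, d_input)
+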
+     def training_step(self, batch, batch_idx):
+         loss = self.step(batch, batch_idx)
+         self.log('trn_loss', loss, on_step=True, on_epoch=True, prog_bar=True, logger=True)
+         return loss
+
+     def validation_step(self, batch, batch_idx):
+         loss = self.step(batch, batch_idx)
+         self.log('val_loss', loss, on_step=True, on_epoch=True, prog_bar=True, logger=True)
+         return loss
+
+     def configure_optimizers(self):
+         return torch.optim.Adam(self.parameters(), lr=1e-5)
+
+
+ class EncodedAudioDataset(torch.utils.data.Dataset):
+     """Pre-encoded audio latents plus text metadata, loaded from .pt files."""
+
+     def __init__(self, paths, pitch_range):
+         records = []
+         print("Loading data")
+         for path in tqdm(paths):
+             records += torch.load(path)
+         self.records = records
+         self.pitch_range = pitch_range
+
+         # keep only records that actually contain an encoded latent
+         self.records = [r for r in self.records if "z" in r]
+
+         print(f"Loaded {len(self.records)} records")
+
+     def compose_prompt(self, record):
+         title = record["name"] if "name" in record else record["title"]
+         tags = record["tags"]
+
+         # shuffle the tags, then keep a random-length prefix (possibly empty)
+         tags = np.random.choice(tags, len(tags), replace=False)
+         tags = list(tags[:np.random.randint(0, len(tags) + 1)])
+
+         # pick a head element: the title, or (when available) the type group or type
+         if "type_group" in record and "type" in record:
+             head = np.random.choice([title, record["type_group"], record["type"]])
+         else:
+             head = title
+
+         # prepend the head with 75% probability
+         elements = tags
+         if np.random.rand() < 0.75:
+             elements = [head] + elements
+
+         # shuffle the final elements (guarding against an empty list,
+         # which np.random.choice rejects)
+         if len(elements) > 0:
+             elements = np.random.choice(elements, len(elements), replace=False)
+
+         prompt = " ".join(elements)
+         return prompt.lower()
+
+     def __len__(self):
+         return len(self.records)
+
+     def __getitem__(self, idx):
+         return {
+             "z": self.records[idx]["z"][self.pitch_range[0]:self.pitch_range[1]],
+             "text": self.compose_prompt(self.records[idx]),
+         }
+
+     def check_for_nans(self):
+         for r in self.records:
+             if np.isnan(r["z"]).any():
+                 raise ValueError("NaN values in z")
+
+     def get_z_shape(self):
+         # return the unique latent shapes present in the dataset
+         shapes = [r["z"].shape for r in self.records]
+         return list(set(shapes))
+
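+ # For illustration, a hypothetical record such as
+ #
+ #     {"name": "Warm Pad", "tags": ["analog", "pad"], "z": ...}
+ #
+ # might yield prompts like "warm pad analog", "pad", or "analog", since
+ # compose_prompt shuffles and subsamples the metadata on every call.
+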
+
+ if __name__ == "__main__":
+
+     # set seed for reproducibility
+     SEED = 0
+     torch.manual_seed(SEED)
+
+     BATCH_SIZE = 1
+     LATENT_T = 86
+
+     # initialize wandb logging
+     wandb.init()
+     logger = WandbLogger(project="synth_flow")
+
+     # don't log model checkpoints to wandb
+     wandb.config.log_model = False
+
+     DATASET = "dataset_a"
+     if DATASET == "dataset_a":
+         PITCH_RANGE = [2, 12]
+
+         trn_ds = EncodedAudioDataset([f"artefacts/synth_data_{i}.pt" for i in range(9)], PITCH_RANGE)
+         trn_ds.check_for_nans()
+         trn_dl = torch.utils.data.DataLoader(trn_ds, batch_size=BATCH_SIZE, shuffle=True)
+
+         val_ds = EncodedAudioDataset(["artefacts/synth_data_9.pt"], PITCH_RANGE)
+         val_ds.check_for_nans()
+         val_dl = torch.utils.data.DataLoader(val_ds, batch_size=BATCH_SIZE, shuffle=True)
+
+     elif DATASET == "dataset_b":
+         PITCH_RANGE = [0, 10]
+
+         trn_ds = EncodedAudioDataset([f"artefacts/synth_data_2_joined_{i}.pt" for i in range(3)], PITCH_RANGE)
+         trn_ds.check_for_nans()
+         trn_dl = torch.utils.data.DataLoader(trn_ds, batch_size=BATCH_SIZE, shuffle=True)
+
+         val_ds = EncodedAudioDataset(["artefacts/synth_data_2_joined_3.pt"], PITCH_RANGE)
+         val_ds.check_for_nans()
+         val_dl = torch.utils.data.DataLoader(val_ds, batch_size=BATCH_SIZE, shuffle=True)
+
+     # load the pretrained Stable Audio Open model and pull out its transformer
+     # and text conditioner
+     src_model = get_pretrained_model("stabilityai/stable-audio-open-1.0")[0].to("cpu")
+     transformer_model = src_model.model.model
+     transformer_model = transformer_model.train()
+     text_conditioner = src_model.conditioner.conditioners.prompt
+
+     t5_version = "google-t5/t5-base"
+
+     lr_callback = pl.callbacks.LearningRateMonitor(logging_interval='step')
+
+     model = FlowMatchingModule(
+         main_model=transformer_model,
+         text_conditioner=text_conditioner,
+         n_channels=PITCH_RANGE[1] - PITCH_RANGE[0],
+         t_input=LATENT_T,
+     )
+
+     trainer = pl.Trainer(devices=[3], logger=logger, gradient_clip_val=1.0, callbacks=[lr_callback], max_epochs=1000, precision="16-mixed")
+
+     # resume fine-tuning from an earlier run, then save the result
+     trainer.fit(model, trn_dl, val_dl, ckpt_path="synth_flow/9gzpz0i6/epoch=85-step=774000.ckpt")
+     trainer.save_checkpoint("artefacts/model_finetuned_2.ckpt")