Spaces (Runtime error)

codebase_change #6
by susnato - opened
- README.md +9 -7
- app.py +134 -66
- config.yaml +0 -61
- examples/BornThisWay.mp3 +0 -0
- examples/Sk8erBoi.mp3 +0 -0
- examples/custom_song.mp3 +0 -0
- layer/__init__.py +0 -0
- layer/input.py +0 -46
- midi_tokenizer.py +0 -430
- packages.txt +2 -1
- preprocess/README.md +0 -36
- preprocess/beat_quantizer.py +0 -111
- preprocess/bpm_quantize.py +0 -98
- preprocess/melody_accuracy.py +0 -81
- preprocess/pop_align.py +0 -331
- preprocess/split_spleeter.py +0 -72
- requirements.txt +9 -8
- transformer_wrapper.py +0 -330
- utils/__init__.py +0 -0
- utils/dsp.py +0 -63
README.md
CHANGED
@@ -1,10 +1,12 @@
 ---
-title:
-emoji:
-colorFrom:
-colorTo:
+title: Pop2piano Dev
+emoji: 🏢
+colorFrom: pink
+colorTo: green
 sdk: gradio
-sdk_version: 3.
+sdk_version: 3.42.0
 app_file: app.py
-pinned:
----
+pinned: false
+---
+
+Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py
CHANGED
@@ -1,54 +1,83 @@
+import os
 import torch
+import librosa
+import binascii
+import warnings
+import midi2audio  # to convert midi to wav
+import numpy as np
+import pytube as pt  # to download the youtube videos as audios
 import gradio as gr
-import
-from
[old lines 5-51 were removed; their content is not shown in this view]
+import soundfile as sf  # to make the stereo mix
+from transformers import Pop2PianoForConditionalGeneration, Pop2PianoProcessor
+
+
+yt_video_dir = "./yt_dir"
+outputs_dir = "./midi_wav_outputs"
+os.makedirs(outputs_dir, exist_ok=True)
+os.makedirs(yt_video_dir, exist_ok=True)
+
+device = "cuda" if torch.cuda.is_available() else "cpu"
+model = Pop2PianoForConditionalGeneration.from_pretrained("sweetcocoa/pop2piano").to(device)
+processor = Pop2PianoProcessor.from_pretrained("sweetcocoa/pop2piano")
+composers = model.generation_config.composer_to_feature_token.keys()
+
+
+def get_audio_from_yt_video(yt_link):
+    try:
+        yt = pt.YouTube(yt_link)
+        t = yt.streams.filter(only_audio=True)
+        filename = os.path.join(yt_video_dir, binascii.hexlify(os.urandom(8)).decode() + ".mp4")
+        t[0].download(filename=filename)
+    except:
+        warnings.warn(f"Video Not Found at {yt_link}")
+        filename = None
+
+    return filename, filename
+
+def inference(file_uploaded, composer):
+    # to save the native sampling rate of the file, sr=None is used, but this can cause some silent errors where the
+    # generated output will not be upto the desired quality. If that happens please consider switching sr to 44100 Hz.
+    waveform, sr = librosa.load(file_uploaded, sr=None)
+
+    inputs = processor(audio=waveform, sampling_rate=sr, return_tensors="pt").to(device)
+    model_output = model.generate(input_features=inputs["input_features"], composer=composer)
+    tokenizer_output = processor.batch_decode(token_ids=model_output.to("cpu"), feature_extractor_output=inputs.to("cpu"))["pretty_midi_objects"]
+
+    return prepare_output_file(tokenizer_output, sr)
+
+def prepare_output_file(tokenizer_output, sr):
+    # Add some random values so that no two file names are same
+    output_file_name = "output_" + binascii.hexlify(os.urandom(8)).decode()
+    midi_output = os.path.join(outputs_dir, output_file_name + ".mid")
+
+    # write the .mid file
+    tokenizer_output[0].write(midi_output)
+
+    # convert .mid file to .wav using `midi2audio`
+    wav_output = midi_output.replace(".mid", ".wav")
+    midi2audio.FluidSynth().midi_to_audio(midi_output, wav_output)
+
+    return wav_output, wav_output, midi_output
+
+def get_stereo(pop_path, midi, pop_scale=0.5):
+    pop_y, sr = librosa.load(pop_path, sr=None)
+    midi_y, _ = librosa.load(midi.name, sr=None)
+
+    if len(pop_y) > len(midi_y):
+        midi_y = np.pad(midi_y, (0, len(pop_y) - len(midi_y)))
+    elif len(pop_y) < len(midi_y):
+        pop_y = np.pad(pop_y, (0, -len(pop_y) + len(midi_y)))
+    stereo = np.stack((midi_y, pop_y * pop_scale))
+
+    stereo_mix_path = pop_path.replace("output", "output_stereo_mix")
+    sf.write(file=stereo_mix_path, data=stereo.T, samplerate=sr, format="wav",)
+
+    return stereo_mix_path, stereo_mix_path
+
+
+# Thanks a lot to "https://huggingface.co/Taithrah" for this theme.
+# taken from https://huggingface.co/spaces/NoCrypt/miku
+block = gr.Blocks(theme="Taithrah/Minimal")

 with block:
     gr.HTML(

@@ -67,38 +96,77 @@ with block:
             </h1>
         </div>
         <p style="margin-bottom: 10px; font-size: 94%">
-            A demo for Pop2Piano:Pop Audio-based Piano Cover Generation
+            A demo for Pop2Piano:Pop Audio-based Piano Cover Generation.<br>
+            Please select the composer(Arranger) and upload the pop audio or enter the YouTube link and then click Generate.
         </p>
     </div>
     """
     )
     with gr.Group():
-        with gr.
-        with gr.
[old lines 78-85 were removed; their content is not shown in this view]
+        with gr.Row(equal_height=True):
+            with gr.Column():
+                file_uploaded = gr.Audio(label="Upload an audio", type="filepath")
+            with gr.Column():
+                with gr.Row():
+                    yt_link = gr.Textbox(label="Enter YouTube Link of the Video", autofocus=True, lines=3)
+                    yt_btn = gr.Button("Download Audio from YouTube Link", size="lg")
+
+                yt_audio_path = gr.Audio(label="Audio Extracted from the YouTube Video", interactive=False)
+                yt_btn.click(get_audio_from_yt_video, inputs=[yt_link], outputs=[yt_audio_path, file_uploaded])
+
+    with gr.Group():
+        with gr.Column():
+            composer = gr.Dropdown(label="Arranger", choices=composers, value="composer1")
+            generate_btn = gr.Button("Generate")
+
+        with gr.Row().style(mobile_collapse=False, equal_height=True):
+            wav_output2 = gr.File(label="Download the Generated MIDI (.wav)")
+            wav_output1 = gr.Audio(label="Listen to the Generated MIDI")
+            midi_output = gr.File(label="Download the Generated MIDI (.mid)")
+        generate_btn.click(inference,
+                           inputs=[file_uploaded, composer],
+                           outputs=[wav_output1, wav_output2, midi_output])
+
+    with gr.Group():
+        gr.HTML(
+            """
+            <div> <h3> <center> Get the Stereo Mix from the Pop Music and Generated MIDI </h3> </div>
+            """
+        )
+        pop_scale = gr.Slider(0, 1, value=0.5, label="Choose the ratio between Pop and MIDI", info="1.0 = Only Pop, 0.0=Only MIDI", interactive=True),
+        stereo_btn = gr.Button("Get Stereo Mix")
+        with gr.Row():
+            stereo_mix1 = gr.Audio(label="Listen to the Stereo Mix")
+            stereo_mix2 = gr.File(label="Download the Stereo Mix")
+
+        stereo_btn.click(get_stereo, inputs=[file_uploaded, wav_output2, pop_scale[0]], outputs=[stereo_mix1, stereo_mix2])
+
+    with gr.Group():
         gr.Examples([
-            ["./examples/
-            ["./examples/Sk8erBoi.mp3", "composer2"]
+            ["./examples/custom_song.mp3", "composer1"],
         ],
         fn=inference,
-        inputs=[
-        outputs=[
+        inputs=[file_uploaded, composer],
+        outputs=[wav_output1, wav_output2, midi_output],
         cache_examples=True
         )
     gr.HTML(
         """
         <div class="footer">
-            <
+            <center>The design for this Space is taken from <a href="https://huggingface.co/spaces/NoCrypt/miku"> NoCrypt/miku </a>
+        </div>
+        """
+    )
+
+    gr.HTML(
+        """
+        <div class="footer">
+            <center><p><a href="http://sweetcocoa.github.io/pop2piano_samples" style="text-decoration: underline;" target="_blank">Project Page</a>
+            <center><a href="https://huggingface.co/docs/transformers/main/model_doc/pop2piano" style="text-decoration: underline;" target="_blank">HuggingFace Model Docs</a>
+            <center><a href="https://github.com/sweetcocoa/pop2piano" style="text-decoration: underline;" target="_blank">Github</a>
         </p>
         </div>
         """
     )

-block.launch(debug=
+block.launch(debug=False)
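Note: the heart of this change is that app.py now calls the 🤗 Transformers port of Pop2Piano directly instead of the Space's previous custom wrapper and tokenizer. Below is a minimal, Space-independent sketch of that same inference path, mirroring the calls made in `inference()` and `prepare_output_file()` above; the file names are hypothetical, the checkpoint is the one used in the diff.

```python
# Minimal sketch of the inference path used by the new app.py.
# "song.mp3" / "piano_cover.mid" are hypothetical file names.
import librosa
import torch
from transformers import Pop2PianoForConditionalGeneration, Pop2PianoProcessor

device = "cuda" if torch.cuda.is_available() else "cpu"
model = Pop2PianoForConditionalGeneration.from_pretrained("sweetcocoa/pop2piano").to(device)
processor = Pop2PianoProcessor.from_pretrained("sweetcocoa/pop2piano")

# Load the pop song at its native sampling rate, as inference() does.
waveform, sr = librosa.load("song.mp3", sr=None)

# Feature extraction -> composer-conditioned generation -> pretty_midi objects.
inputs = processor(audio=waveform, sampling_rate=sr, return_tensors="pt").to(device)
generated = model.generate(input_features=inputs["input_features"], composer="composer1")
midi = processor.batch_decode(
    token_ids=generated.to("cpu"), feature_extractor_output=inputs.to("cpu")
)["pretty_midi_objects"][0]
midi.write("piano_cover.mid")  # the Space then renders this .mid to .wav with midi2audio
```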
config.yaml
DELETED
@@ -1,61 +0,0 @@
project: pop2piano
dataset:
  target_length: 256
  input_length: 1024
  n_bars: 2
  sample_rate: 22050
  use_mel: true
  mel_is_conditioned: true
composer_to_feature_token:
  composer1: 2052
  composer2: 2053
  composer3: 2054
  composer4: 2055
  composer5: 2056
  composer6: 2057
  composer7: 2058
  composer8: 2059
  composer9: 2060
  composer10: 2061
  composer11: 2062
  composer12: 2063
  composer13: 2064
  composer14: 2065
  composer15: 2066
  composer16: 2067
  composer17: 2068
  composer18: 2069
  composer19: 2070
  composer20: 2071
  composer21: 2072
t5:
  feed_forward_proj: gated-gelu
  tie_word_embeddings: false
  tie_encoder_decoder: false
  vocab_size: 2400
  n_positions: 1024
  relative_attention_num_buckets: 32
tokenizer:
  vocab_size:
    special: 4
    note: 128
    velocity: 2
    time: 100
training:
  seed: 3407
  resume: false
  offline: false
  num_gpu: 1
  max_epochs: 5000
  accumulate_grad_batches: 1
  check_val_every_n_epoch: 20
  find_lr: false
  optimizer: adafactor
  version: none
  lr: 0.001
  lr_min: 1.0e-06
  lr_scheduler: false
  lr_decay: 0.99
  batch_size: 32
  num_workers: 32
  gradient_clip_val: 3.0
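A side note on the deleted composer map: under the Transformers port, the composer1 … composer21 token mapping above ships with the checkpoint's generation config, which is how the new app.py fills its "Arranger" dropdown. A small sketch, assuming the same checkpoint:

```python
# Sketch: the composer-to-token map that used to live in config.yaml is now read
# from the checkpoint's generation config (this is what app.py's dropdown uses).
from transformers import Pop2PianoForConditionalGeneration

model = Pop2PianoForConditionalGeneration.from_pretrained("sweetcocoa/pop2piano")
composer_map = model.generation_config.composer_to_feature_token
print(list(composer_map.keys())[:3])  # e.g. ['composer1', 'composer2', 'composer3']
print(composer_map["composer1"])      # e.g. 2052, matching the deleted config above
```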
examples/BornThisWay.mp3
DELETED
Binary file (482 kB)
examples/Sk8erBoi.mp3
DELETED
Binary file (673 kB)
examples/custom_song.mp3
ADDED
Binary file (247 kB)
layer/__init__.py
DELETED
File without changes
layer/input.py
DELETED
@@ -1,46 +0,0 @@
import torch
import torch.nn as nn
import torchaudio


class LogMelSpectrogram(nn.Module):
    def __init__(self) -> None:
        super().__init__()
        self.melspectrogram = torchaudio.transforms.MelSpectrogram(
            sample_rate=22050,
            n_fft=4096,
            hop_length=1024,
            f_min=10.0,
            n_mels=512,
        )

    def forward(self, x):
        # x : audio(batch, sample)
        # X : melspec (batch, freq, frame)
        with torch.no_grad():
            with torch.cuda.amp.autocast(enabled=False):
                X = self.melspectrogram(x)
                X = X.clamp(min=1e-6).log()

        return X


class ConcatEmbeddingToMel(nn.Module):
    def __init__(self, embedding_offset, n_vocab, n_dim) -> None:
        super().__init__()
        self.embedding = nn.Embedding(num_embeddings=n_vocab, embedding_dim=n_dim)
        self.embedding_offset = embedding_offset

    def forward(self, feature, index_value):
        """
        index_value : (batch, )
        feature : (batch, time, feature_dim)
        """
        index_shifted = index_value - self.embedding_offset

        # (batch, 1, feature_dim)
        composer_embedding = self.embedding(index_shifted).unsqueeze(1)
        # print(composer_embedding.shape, feature.shape)
        # (batch, 1 + time, feature_dim)
        inputs_embeds = torch.cat([composer_embedding, feature], dim=1)
        return inputs_embeds
midi_tokenizer.py
DELETED
@@ -1,430 +0,0 @@
|
|
1 |
-
import numpy as np
|
2 |
-
from numba import jit
|
3 |
-
import pretty_midi
|
4 |
-
import scipy.interpolate as interp
|
5 |
-
|
6 |
-
TOKEN_SPECIAL: int = 0
|
7 |
-
TOKEN_NOTE: int = 1
|
8 |
-
TOKEN_VELOCITY: int = 2
|
9 |
-
TOKEN_TIME: int = 3
|
10 |
-
|
11 |
-
DEFAULT_VELOCITY: int = 77
|
12 |
-
|
13 |
-
TIE: int = 2
|
14 |
-
EOS: int = 1
|
15 |
-
PAD: int = 0
|
16 |
-
|
17 |
-
|
18 |
-
def extrapolate_beat_times(beat_times, n_extend=1):
|
19 |
-
beat_times_function = interp.interp1d(
|
20 |
-
np.arange(beat_times.size),
|
21 |
-
beat_times,
|
22 |
-
bounds_error=False,
|
23 |
-
fill_value="extrapolate",
|
24 |
-
)
|
25 |
-
|
26 |
-
ext_beats = beat_times_function(
|
27 |
-
np.linspace(0, beat_times.size + n_extend - 1, beat_times.size + n_extend)
|
28 |
-
)
|
29 |
-
|
30 |
-
return ext_beats
|
31 |
-
|
32 |
-
|
33 |
-
@jit(nopython=True, cache=True)
|
34 |
-
def fast_tokenize(idx, token_type, n_special, n_note, n_velocity):
|
35 |
-
if token_type == TOKEN_TIME:
|
36 |
-
return n_special + n_note + n_velocity + idx
|
37 |
-
elif token_type == TOKEN_VELOCITY:
|
38 |
-
return n_special + n_note + idx
|
39 |
-
elif token_type == TOKEN_NOTE:
|
40 |
-
return n_special + idx
|
41 |
-
elif token_type == TOKEN_SPECIAL:
|
42 |
-
return idx
|
43 |
-
else:
|
44 |
-
return -1
|
45 |
-
|
46 |
-
|
47 |
-
@jit(nopython=True, cache=True)
|
48 |
-
def fast_detokenize(idx, n_special, n_note, n_velocity, time_idx_offset):
|
49 |
-
if idx >= n_special + n_note + n_velocity:
|
50 |
-
return (TOKEN_TIME, (idx - (n_special + n_note + n_velocity)) + time_idx_offset)
|
51 |
-
elif idx >= n_special + n_note:
|
52 |
-
return TOKEN_VELOCITY, idx - (n_special + n_note)
|
53 |
-
elif idx >= n_special:
|
54 |
-
return TOKEN_NOTE, idx - n_special
|
55 |
-
else:
|
56 |
-
return TOKEN_SPECIAL, idx
|
57 |
-
|
58 |
-
|
59 |
-
class MidiTokenizer:
|
60 |
-
def __init__(self, config) -> None:
|
61 |
-
self.config = config
|
62 |
-
|
63 |
-
def tokenize_note(self, idx, token_type):
|
64 |
-
rt = fast_tokenize(
|
65 |
-
idx,
|
66 |
-
token_type,
|
67 |
-
self.config.vocab_size.special,
|
68 |
-
self.config.vocab_size.note,
|
69 |
-
self.config.vocab_size.velocity,
|
70 |
-
)
|
71 |
-
if rt == -1:
|
72 |
-
raise ValueError(f"type {type} is not a predefined token type.")
|
73 |
-
else:
|
74 |
-
return rt
|
75 |
-
|
76 |
-
def notes_to_tokens(self, notes):
|
77 |
-
"""
|
78 |
-
notes : (onset idx, offset idx, pitch, velocity)
|
79 |
-
"""
|
80 |
-
max_time_idx = notes[:, :2].max()
|
81 |
-
|
82 |
-
times = [[] for i in range((max_time_idx + 1))]
|
83 |
-
for onset, offset, pitch, velocity in notes:
|
84 |
-
times[onset].append([pitch, velocity])
|
85 |
-
times[offset].append([pitch, 0])
|
86 |
-
|
87 |
-
tokens = []
|
88 |
-
current_velocity = 0
|
89 |
-
for i, time in enumerate(times):
|
90 |
-
if len(time) == 0:
|
91 |
-
continue
|
92 |
-
tokens.append(self.tokenize_note(i, TOKEN_TIME))
|
93 |
-
for pitch, velocity in time:
|
94 |
-
velocity = int(velocity > 0)
|
95 |
-
if current_velocity != velocity:
|
96 |
-
current_velocity = velocity
|
97 |
-
tokens.append(self.tokenize_note(velocity, TOKEN_VELOCITY))
|
98 |
-
tokens.append(self.tokenize_note(pitch, TOKEN_NOTE))
|
99 |
-
|
100 |
-
return np.array(tokens, dtype=int)
|
101 |
-
|
102 |
-
def detokenize(self, token, time_idx_offset):
|
103 |
-
type, value = fast_detokenize(
|
104 |
-
token,
|
105 |
-
n_special=self.config.vocab_size.special,
|
106 |
-
n_note=self.config.vocab_size.note,
|
107 |
-
n_velocity=self.config.vocab_size.velocity,
|
108 |
-
time_idx_offset=time_idx_offset,
|
109 |
-
)
|
110 |
-
if type != TOKEN_TIME:
|
111 |
-
value = int(value)
|
112 |
-
return [type, value]
|
113 |
-
|
114 |
-
def to_string(self, tokens, time_idx_offset=0):
|
115 |
-
nums = [
|
116 |
-
self.detokenize(token, time_idx_offset=time_idx_offset) for token in tokens
|
117 |
-
]
|
118 |
-
strings = []
|
119 |
-
for i in range(len(nums)):
|
120 |
-
type = nums[i][0]
|
121 |
-
value = nums[i][1]
|
122 |
-
|
123 |
-
if type == TOKEN_TIME:
|
124 |
-
type = "time"
|
125 |
-
elif type == TOKEN_SPECIAL:
|
126 |
-
if value == EOS:
|
127 |
-
value = "EOS"
|
128 |
-
elif value == PAD:
|
129 |
-
value = "PAD"
|
130 |
-
elif value == TIE:
|
131 |
-
value = "TIE"
|
132 |
-
else:
|
133 |
-
value = "Unknown Special"
|
134 |
-
elif type == TOKEN_NOTE:
|
135 |
-
type = "note"
|
136 |
-
elif type == TOKEN_VELOCITY:
|
137 |
-
type = "velocity"
|
138 |
-
strings.append((type, value))
|
139 |
-
return strings
|
140 |
-
|
141 |
-
def split_notes(self, notes, beatsteps, time_from, time_to):
|
142 |
-
"""
|
143 |
-
Assumptions
|
144 |
-
- notes are sorted by onset time
|
145 |
-
- beatsteps are sorted by time
|
146 |
-
"""
|
147 |
-
start_idx = np.searchsorted(beatsteps, time_from)
|
148 |
-
start_note = np.searchsorted(notes[:, 0], start_idx)
|
149 |
-
|
150 |
-
end_idx = np.searchsorted(beatsteps, time_to)
|
151 |
-
end_note = np.searchsorted(notes[:, 0], end_idx)
|
152 |
-
splited_notes = notes[start_note:end_note]
|
153 |
-
|
154 |
-
return splited_notes, (start_idx, end_idx, start_note, end_note)
|
155 |
-
|
156 |
-
def notes_to_relative_tokens(
|
157 |
-
self, notes, offset_idx, add_eos=False, add_composer=False, composer_value=None
|
158 |
-
):
|
159 |
-
"""
|
160 |
-
notes : (onset idx, offset idx, pitch, velocity)
|
161 |
-
"""
|
162 |
-
|
163 |
-
def _add_eos(tokens):
|
164 |
-
tokens = np.concatenate((tokens, np.array([EOS], dtype=tokens.dtype)))
|
165 |
-
return tokens
|
166 |
-
|
167 |
-
def _add_composer(tokens, composer_value):
|
168 |
-
tokens = np.concatenate(
|
169 |
-
(np.array([composer_value], dtype=tokens.dtype), tokens)
|
170 |
-
)
|
171 |
-
return tokens
|
172 |
-
|
173 |
-
if len(notes) == 0:
|
174 |
-
tokens = np.array([], dtype=int)
|
175 |
-
if add_eos:
|
176 |
-
tokens = _add_eos(tokens)
|
177 |
-
if add_composer:
|
178 |
-
tokens = _add_composer(tokens, composer_value=composer_value)
|
179 |
-
return tokens
|
180 |
-
|
181 |
-
max_time_idx = notes[:, :2].max()
|
182 |
-
|
183 |
-
# times[time_idx] = [[pitch, .. ], [pitch, 0], ..]
|
184 |
-
times = [[] for i in range((max_time_idx + 1 - offset_idx))]
|
185 |
-
for abs_onset, abs_offset, pitch, velocity in notes:
|
186 |
-
rel_onset = abs_onset - offset_idx
|
187 |
-
rel_offset = abs_offset - offset_idx
|
188 |
-
times[rel_onset].append([pitch, velocity])
|
189 |
-
times[rel_offset].append([pitch, 0])
|
190 |
-
|
191 |
-
# From here on, everything is relative to time 0 (the offset)
|
192 |
-
tokens = []
|
193 |
-
current_velocity = 0
|
194 |
-
current_time_idx = 0
|
195 |
-
|
196 |
-
for rel_idx, time in enumerate(times):
|
197 |
-
if len(time) == 0:
|
198 |
-
continue
|
199 |
-
time_idx_shift = rel_idx - current_time_idx
|
200 |
-
current_time_idx = rel_idx
|
201 |
-
|
202 |
-
tokens.append(self.tokenize_note(time_idx_shift, TOKEN_TIME))
|
203 |
-
for pitch, velocity in time:
|
204 |
-
velocity = int(velocity > 0)
|
205 |
-
if current_velocity != velocity:
|
206 |
-
current_velocity = velocity
|
207 |
-
tokens.append(self.tokenize_note(velocity, TOKEN_VELOCITY))
|
208 |
-
tokens.append(self.tokenize_note(pitch, TOKEN_NOTE))
|
209 |
-
|
210 |
-
tokens = np.array(tokens, dtype=int)
|
211 |
-
if add_eos:
|
212 |
-
tokens = _add_eos(tokens)
|
213 |
-
if add_composer:
|
214 |
-
tokens = _add_composer(tokens, composer_value=composer_value)
|
215 |
-
return tokens
|
216 |
-
|
217 |
-
def relative_batch_tokens_to_midi(
|
218 |
-
self,
|
219 |
-
tokens,
|
220 |
-
beatstep,
|
221 |
-
beat_offset_idx=None,
|
222 |
-
bars_per_batch=None,
|
223 |
-
cutoff_time_idx=None,
|
224 |
-
):
|
225 |
-
"""
|
226 |
-
tokens : (batch, sequence)
|
227 |
-
beatstep : (times, )
|
228 |
-
"""
|
229 |
-
beat_offset_idx = 0 if beat_offset_idx is None else beat_offset_idx
|
230 |
-
notes = None
|
231 |
-
bars_per_batch = 2 if bars_per_batch is None else bars_per_batch
|
232 |
-
|
233 |
-
N = len(tokens)
|
234 |
-
for n in range(N):
|
235 |
-
_tokens = tokens[n]
|
236 |
-
_start_idx = beat_offset_idx + n * bars_per_batch * 4
|
237 |
-
_cutoff_time_idx = cutoff_time_idx + _start_idx
|
238 |
-
_notes = self.relative_tokens_to_notes(
|
239 |
-
_tokens,
|
240 |
-
start_idx=_start_idx,
|
241 |
-
cutoff_time_idx=_cutoff_time_idx,
|
242 |
-
)
|
243 |
-
# print(_notes, "\n-------")
|
244 |
-
if len(_notes) == 0:
|
245 |
-
pass
|
246 |
-
# print("_notes zero")
|
247 |
-
elif notes is None:
|
248 |
-
notes = _notes
|
249 |
-
else:
|
250 |
-
notes = np.concatenate((notes, _notes), axis=0)
|
251 |
-
|
252 |
-
if notes is None:
|
253 |
-
notes = []
|
254 |
-
midi = self.notes_to_midi(notes, beatstep, offset_sec=beatstep[beat_offset_idx])
|
255 |
-
return midi, notes
|
256 |
-
|
257 |
-
def relative_tokens_to_notes(self, tokens, start_idx, cutoff_time_idx=None):
|
258 |
-
# TODO remove legacy
|
259 |
-
# decoding 첫토큰이 편곡자인 경우
|
260 |
-
if tokens[0] >= sum(self.config.vocab_size.values()):
|
261 |
-
tokens = tokens[1:]
|
262 |
-
|
263 |
-
words = [self.detokenize(token, time_idx_offset=0) for token in tokens]
|
264 |
-
|
265 |
-
if hasattr(start_idx, "item"):
|
266 |
-
"""
|
267 |
-
if numpy or torch tensor
|
268 |
-
"""
|
269 |
-
start_idx = start_idx.item()
|
270 |
-
|
271 |
-
current_idx = start_idx
|
272 |
-
current_velocity = 0
|
273 |
-
note_onsets_ready = [None for i in range(self.config.vocab_size.note + 1)]
|
274 |
-
notes = []
|
275 |
-
for type, number in words:
|
276 |
-
if type == TOKEN_SPECIAL:
|
277 |
-
if number == EOS:
|
278 |
-
break
|
279 |
-
elif type == TOKEN_TIME:
|
280 |
-
current_idx += number
|
281 |
-
if cutoff_time_idx is not None:
|
282 |
-
current_idx = min(current_idx, cutoff_time_idx)
|
283 |
-
|
284 |
-
elif type == TOKEN_VELOCITY:
|
285 |
-
current_velocity = number
|
286 |
-
elif type == TOKEN_NOTE:
|
287 |
-
pitch = number
|
288 |
-
if current_velocity == 0:
|
289 |
-
# note_offset
|
290 |
-
if note_onsets_ready[pitch] is None:
|
291 |
-
# offset without onset
|
292 |
-
pass
|
293 |
-
else:
|
294 |
-
onset_idx = note_onsets_ready[pitch]
|
295 |
-
if onset_idx >= current_idx:
|
296 |
-
# No time shift after previous note_on
|
297 |
-
pass
|
298 |
-
else:
|
299 |
-
offset_idx = current_idx
|
300 |
-
notes.append(
|
301 |
-
[onset_idx, offset_idx, pitch, DEFAULT_VELOCITY]
|
302 |
-
)
|
303 |
-
note_onsets_ready[pitch] = None
|
304 |
-
else:
|
305 |
-
# note_on
|
306 |
-
if note_onsets_ready[pitch] is None:
|
307 |
-
note_onsets_ready[pitch] = current_idx
|
308 |
-
else:
|
309 |
-
# note-on already exists
|
310 |
-
onset_idx = note_onsets_ready[pitch]
|
311 |
-
if onset_idx >= current_idx:
|
312 |
-
# No time shift after previous note_on
|
313 |
-
pass
|
314 |
-
else:
|
315 |
-
offset_idx = current_idx
|
316 |
-
notes.append(
|
317 |
-
[onset_idx, offset_idx, pitch, DEFAULT_VELOCITY]
|
318 |
-
)
|
319 |
-
note_onsets_ready[pitch] = current_idx
|
320 |
-
else:
|
321 |
-
raise ValueError
|
322 |
-
|
323 |
-
for pitch, note_on in enumerate(note_onsets_ready):
|
324 |
-
# force offset if no offset for each pitch
|
325 |
-
if note_on is not None:
|
326 |
-
if cutoff_time_idx is None:
|
327 |
-
cutoff = note_on + 1
|
328 |
-
else:
|
329 |
-
cutoff = max(cutoff_time_idx, note_on + 1)
|
330 |
-
|
331 |
-
offset_idx = max(current_idx, cutoff)
|
332 |
-
notes.append([note_on, offset_idx, pitch, DEFAULT_VELOCITY])
|
333 |
-
|
334 |
-
if len(notes) == 0:
|
335 |
-
return []
|
336 |
-
else:
|
337 |
-
notes = np.array(notes)
|
338 |
-
note_order = notes[:, 0] * 128 + notes[:, 1]
|
339 |
-
notes = notes[note_order.argsort()]
|
340 |
-
return notes
|
341 |
-
|
342 |
-
def notes_to_midi(self, notes, beatstep, offset_sec=None):
|
343 |
-
new_pm = pretty_midi.PrettyMIDI(resolution=384, initial_tempo=120.0)
|
344 |
-
new_inst = pretty_midi.Instrument(program=0)
|
345 |
-
new_notes = []
|
346 |
-
if offset_sec is None:
|
347 |
-
offset_sec = 0.0
|
348 |
-
|
349 |
-
for onset_idx, offset_idx, pitch, velocity in notes:
|
350 |
-
new_note = pretty_midi.Note(
|
351 |
-
velocity=velocity,
|
352 |
-
pitch=pitch,
|
353 |
-
start=beatstep[onset_idx] - offset_sec,
|
354 |
-
end=beatstep[offset_idx] - offset_sec,
|
355 |
-
)
|
356 |
-
new_notes.append(new_note)
|
357 |
-
new_inst.notes = new_notes
|
358 |
-
new_pm.instruments.append(new_inst)
|
359 |
-
new_pm.remove_invalid_notes()
|
360 |
-
return new_pm
|
361 |
-
|
362 |
-
|
363 |
-
@jit(nopython=True, cache=False)
|
364 |
-
def fast_notes_to_relative_tokens(
|
365 |
-
notes, offset_idx, max_time_idx, n_special, n_note, n_velocity
|
366 |
-
):
|
367 |
-
"""
|
368 |
-
notes : (onset idx, offset idx, pitch, velocity)
|
369 |
-
"""
|
370 |
-
|
371 |
-
times_p = [np.array([], dtype=int) for i in range((max_time_idx + 1 - offset_idx))]
|
372 |
-
times_v = [np.array([], dtype=int) for i in range((max_time_idx + 1 - offset_idx))]
|
373 |
-
|
374 |
-
for abs_onset, abs_offset, pitch, velocity in notes:
|
375 |
-
rel_onset = abs_onset - offset_idx
|
376 |
-
rel_offset = abs_offset - offset_idx
|
377 |
-
times_p[rel_onset] = np.append(times_p[rel_onset], pitch)
|
378 |
-
times_v[rel_onset] = np.append(times_v[rel_onset], velocity)
|
379 |
-
times_p[rel_offset] = np.append(times_p[rel_offset], pitch)
|
380 |
-
times_v[rel_offset] = np.append(times_v[rel_offset], velocity)
|
381 |
-
|
382 |
-
# From here on, everything is relative to time 0 (the offset)
|
383 |
-
tokens = []
|
384 |
-
current_velocity = np.array([0])
|
385 |
-
current_time_idx = np.array([0])
|
386 |
-
|
387 |
-
# because the range could be 0
|
388 |
-
for i in range(len(times_p)):
|
389 |
-
rel_idx = i
|
390 |
-
notes_at_time = times_p[i]
|
391 |
-
if len(notes_at_time) == 0:
|
392 |
-
continue
|
393 |
-
|
394 |
-
time_idx_shift = rel_idx - current_time_idx[0]
|
395 |
-
current_time_idx[0] = rel_idx
|
396 |
-
|
397 |
-
token = fast_tokenize(
|
398 |
-
time_idx_shift,
|
399 |
-
TOKEN_TIME,
|
400 |
-
n_special=n_special,
|
401 |
-
n_note=n_note,
|
402 |
-
n_velocity=n_velocity,
|
403 |
-
)
|
404 |
-
tokens.append(token)
|
405 |
-
|
406 |
-
for j in range(len(notes_at_time)):
|
407 |
-
pitch = times_p[j]
|
408 |
-
velocity = times_v[j]
|
409 |
-
# for pitch, velocity in time:
|
410 |
-
velocity = int(velocity > 0)
|
411 |
-
if current_velocity[0] != velocity:
|
412 |
-
current_velocity[0] = velocity
|
413 |
-
token = fast_tokenize(
|
414 |
-
velocity,
|
415 |
-
TOKEN_VELOCITY,
|
416 |
-
n_special=n_special,
|
417 |
-
n_note=n_note,
|
418 |
-
n_velocity=n_velocity,
|
419 |
-
)
|
420 |
-
tokens.append(token)
|
421 |
-
token = fast_tokenize(
|
422 |
-
pitch,
|
423 |
-
TOKEN_NOTE,
|
424 |
-
n_special=n_special,
|
425 |
-
n_note=n_note,
|
426 |
-
n_velocity=n_velocity,
|
427 |
-
)
|
428 |
-
tokens.append(token)
|
429 |
-
|
430 |
-
return np.array(tokens)
|
packages.txt
CHANGED
@@ -1 +1,2 @@
-fluidsynth
+fluidsynth
+ffmpeg
preprocess/README.md
DELETED
@@ -1,36 +0,0 @@
# Preprocess Scripts
---
- Note : the order of these scripts is IMPORTANT.
- the preprocessing step is easy. but environment setting is not. please understand.
- If you encounter any problems, please do not hesitate to email me or open an issue to the github.

1. Transcribe piano wavs to midi
    - You should transcribe {piano_cover_file.wav} -> {piano_cover_file.mid}
    - I recommend you to use original codes from this repo : [High-resolution Piano Transcription with Pedals by Regressing Onsets and Offsets Times](https://github.com/qiuqiangkong/piano_transcription_inference)

    - Instead, you can also you my docker script.
    ```bash
    docker run -it --gpus all --rm -v /DIRECTORY_THAT_CONTAINS_PIANO_WAV/:/input -v /DIRECTORY_THAT_MIDI_OUTPUT/:/output jonghochoi/piano_transcribe:bytedance1
    ```
    - If you are using GPU RTX 30XX or higher, this script may not work properly. It's because the version of pytorch is too low(1.4).
        - then upgrade the version of pytorch in the docker..

2. Estimate Pop's beats
    ```bash
    python bpm_quantize.py DATA_DIR
    ```

3. synchronize midi
    ```bash
    python pop_align.py DATA_DIR
    ```

4. get separated vocal track
    ```bash
    python split_spleeter.py DATA_DIR
    ```

5. caculate melody chroma accuracy
    ```bash
    python melody_accuracy.py DATA_DIR
    ```
preprocess/beat_quantizer.py
DELETED
@@ -1,111 +0,0 @@
|
|
1 |
-
import copy
|
2 |
-
import librosa
|
3 |
-
import essentia
|
4 |
-
import essentia.standard
|
5 |
-
import numpy as np
|
6 |
-
import scipy.interpolate as interp
|
7 |
-
import note_seq
|
8 |
-
|
9 |
-
SAMPLERATE = 44100
|
10 |
-
|
11 |
-
|
12 |
-
def nearest_onset_offset_digitize(on, off, bins):
|
13 |
-
intermediate = (bins[1:] + bins[:-1]) / 2
|
14 |
-
on_idx = np.digitize(on, intermediate)
|
15 |
-
off_idx = np.digitize(off, intermediate)
|
16 |
-
off_idx[on_idx == off_idx] += 1
|
17 |
-
# off_idx = np.clip(off_idx, a_min=0, a_max=len(bins) - 1)
|
18 |
-
return on_idx, off_idx
|
19 |
-
|
20 |
-
|
21 |
-
def apply_sustain_pedal(pm):
|
22 |
-
ns = note_seq.midi_to_note_sequence(pm)
|
23 |
-
susns = note_seq.apply_sustain_control_changes(ns)
|
24 |
-
suspm = note_seq.note_sequence_to_pretty_midi(susns)
|
25 |
-
return suspm
|
26 |
-
|
27 |
-
|
28 |
-
def interpolate_beat_times(beat_times, steps_per_beat, extend=False):
|
29 |
-
beat_times_function = interp.interp1d(
|
30 |
-
np.arange(beat_times.size),
|
31 |
-
beat_times,
|
32 |
-
bounds_error=False,
|
33 |
-
fill_value="extrapolate",
|
34 |
-
)
|
35 |
-
if extend:
|
36 |
-
beat_steps_8th = beat_times_function(
|
37 |
-
np.linspace(0, beat_times.size, beat_times.size * steps_per_beat + 1)
|
38 |
-
)
|
39 |
-
else:
|
40 |
-
beat_steps_8th = beat_times_function(
|
41 |
-
np.linspace(0, beat_times.size - 1, beat_times.size * steps_per_beat - 1)
|
42 |
-
)
|
43 |
-
return beat_steps_8th
|
44 |
-
|
45 |
-
|
46 |
-
def midi_quantize_by_beats(
|
47 |
-
sample, beat_times, steps_per_beat, ignore_sustain_pedal=False
|
48 |
-
):
|
49 |
-
ns = note_seq.midi_file_to_note_sequence(sample.midi)
|
50 |
-
if ignore_sustain_pedal:
|
51 |
-
susns = ns
|
52 |
-
else:
|
53 |
-
susns = note_seq.apply_sustain_control_changes(ns)
|
54 |
-
|
55 |
-
qns = copy.deepcopy(susns)
|
56 |
-
|
57 |
-
notes = np.array([[n.start_time, n.end_time] for n in susns.notes])
|
58 |
-
note_attributes = np.array([[n.pitch, n.velocity] for n in susns.notes])
|
59 |
-
|
60 |
-
note_ons = np.array(notes[:, 0])
|
61 |
-
note_offs = np.array(notes[:, 1])
|
62 |
-
|
63 |
-
beat_steps_8th = interpolate_beat_times(beat_times, steps_per_beat, extend=False)
|
64 |
-
|
65 |
-
on_idx, off_idx = nearest_onset_offset_digitize(note_ons, note_offs, beat_steps_8th)
|
66 |
-
|
67 |
-
beat_steps_8th = interpolate_beat_times(beat_times, steps_per_beat, extend=True)
|
68 |
-
|
69 |
-
discrete_notes = np.concatenate(
|
70 |
-
(np.stack((on_idx, off_idx), axis=1), note_attributes), axis=1
|
71 |
-
)
|
72 |
-
|
73 |
-
def delete_duplicate_notes(dnotes):
|
74 |
-
note_order = dnotes[:, 0] * 128 + dnotes[:, 2]
|
75 |
-
dnotes = dnotes[note_order.argsort()]
|
76 |
-
indices = []
|
77 |
-
for i in range(1, len(dnotes)):
|
78 |
-
if dnotes[i, 0] == dnotes[i - 1, 0] and dnotes[i, 2] == dnotes[i - 1, 2]:
|
79 |
-
indices.append(i)
|
80 |
-
dnotes = np.delete(dnotes, indices, axis=0)
|
81 |
-
note_order = dnotes[:, 0] * 128 + dnotes[:, 1]
|
82 |
-
dnotes = dnotes[note_order.argsort()]
|
83 |
-
return dnotes
|
84 |
-
|
85 |
-
discrete_notes = delete_duplicate_notes(discrete_notes)
|
86 |
-
|
87 |
-
digitized_note_ons, digitized_note_offs = (
|
88 |
-
beat_steps_8th[on_idx],
|
89 |
-
beat_steps_8th[off_idx],
|
90 |
-
)
|
91 |
-
|
92 |
-
for i, note in enumerate(qns.notes):
|
93 |
-
note.start_time = digitized_note_ons[i]
|
94 |
-
note.end_time = digitized_note_offs[i]
|
95 |
-
|
96 |
-
return qns, discrete_notes, beat_steps_8th
|
97 |
-
|
98 |
-
|
99 |
-
def extract_rhythm(song, y=None):
|
100 |
-
if y is None:
|
101 |
-
y, sr = librosa.load(song, sr=SAMPLERATE)
|
102 |
-
|
103 |
-
essentia_tracker = essentia.standard.RhythmExtractor2013(method="multifeature")
|
104 |
-
(
|
105 |
-
bpm,
|
106 |
-
beat_times,
|
107 |
-
confidence,
|
108 |
-
estimates,
|
109 |
-
essentia_beat_intervals,
|
110 |
-
) = essentia_tracker(y)
|
111 |
-
return bpm, beat_times, confidence, estimates, essentia_beat_intervals
|
preprocess/bpm_quantize.py
DELETED
@@ -1,98 +0,0 @@
|
|
1 |
-
import glob
|
2 |
-
import sys
|
3 |
-
import os
|
4 |
-
|
5 |
-
|
6 |
-
import librosa
|
7 |
-
import soundfile as sf
|
8 |
-
import numpy as np
|
9 |
-
|
10 |
-
import note_seq
|
11 |
-
from omegaconf import OmegaConf
|
12 |
-
from beat_quantizer import extract_rhythm, midi_quantize_by_beats
|
13 |
-
|
14 |
-
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
|
15 |
-
from midiaudiopair import MidiAudioPair
|
16 |
-
from utils.dsp import get_stereo
|
17 |
-
|
18 |
-
|
19 |
-
def estimate(meta_file, ignore_sustain_pedal):
|
20 |
-
sample = MidiAudioPair(meta_file)
|
21 |
-
|
22 |
-
if (
|
23 |
-
sample.error_code == MidiAudioPair.NO_PIANO
|
24 |
-
or sample.error_code == MidiAudioPair.NO_SONG_DIR
|
25 |
-
or sample.error_code == MidiAudioPair.NO_SONG
|
26 |
-
):
|
27 |
-
return
|
28 |
-
|
29 |
-
bpm, beat_times, confidence, estimates, essentia_beat_intervals = extract_rhythm(sample.song)
|
30 |
-
beat_times = np.array(beat_times)
|
31 |
-
essentia_beat_intervals = np.array(essentia_beat_intervals)
|
32 |
-
|
33 |
-
qns, discrete_notes, beat_steps_8th = midi_quantize_by_beats(
|
34 |
-
sample, beat_times, 2, ignore_sustain_pedal=ignore_sustain_pedal
|
35 |
-
)
|
36 |
-
|
37 |
-
qpm = note_seq.note_sequence_to_pretty_midi(qns)
|
38 |
-
qpm.instruments[0].control_changes = []
|
39 |
-
qpm.write(sample.qmidi)
|
40 |
-
y, sr = librosa.load(sample.song, sr=None)
|
41 |
-
qpm_y = qpm.fluidsynth(sr)
|
42 |
-
qmix = get_stereo(y, qpm_y, 0.4)
|
43 |
-
sf.write(file=sample.qmix, data=qmix.T, samplerate=sr, format="flac")
|
44 |
-
|
45 |
-
meta = OmegaConf.load(meta_file)
|
46 |
-
meta.tempo = OmegaConf.create()
|
47 |
-
meta.tempo.bpm = bpm
|
48 |
-
meta.tempo.confidence = confidence
|
49 |
-
OmegaConf.save(meta, meta_file)
|
50 |
-
|
51 |
-
np.save(sample.notes, discrete_notes)
|
52 |
-
np.save(sample.beatstep, beat_steps_8th)
|
53 |
-
np.save(sample.beattime, beat_times)
|
54 |
-
np.save(sample.beatinterval, essentia_beat_intervals)
|
55 |
-
|
56 |
-
|
57 |
-
def main(meta_files, ignore_sustain_pedal):
|
58 |
-
from tqdm import tqdm
|
59 |
-
import multiprocessing
|
60 |
-
from joblib import Parallel, delayed
|
61 |
-
|
62 |
-
def files():
|
63 |
-
pbar = tqdm(meta_files)
|
64 |
-
for meta_file in pbar:
|
65 |
-
pbar.set_description(meta_file)
|
66 |
-
yield meta_file
|
67 |
-
|
68 |
-
Parallel(n_jobs=multiprocessing.cpu_count() // 2)(
|
69 |
-
delayed(estimate)(meta_file, ignore_sustain_pedal) for meta_file in files()
|
70 |
-
)
|
71 |
-
|
72 |
-
|
73 |
-
if __name__ == "__main__":
|
74 |
-
import argparse
|
75 |
-
|
76 |
-
parser = argparse.ArgumentParser(description="bpm estimate using essentia")
|
77 |
-
|
78 |
-
parser.add_argument(
|
79 |
-
"data_dir",
|
80 |
-
type=str,
|
81 |
-
default=None,
|
82 |
-
help="""directory contains {id}/{pop_filename.wav}
|
83 |
-
""",
|
84 |
-
)
|
85 |
-
|
86 |
-
parser.add_argument(
|
87 |
-
"--ignore_sustain_pedal",
|
88 |
-
default=False,
|
89 |
-
action="store_true",
|
90 |
-
help="whether dry_run",
|
91 |
-
)
|
92 |
-
|
93 |
-
args = parser.parse_args()
|
94 |
-
|
95 |
-
meta_files = sorted(glob.glob(args.data_dir + "/*.yaml"))
|
96 |
-
print("meta ", len(meta_files))
|
97 |
-
|
98 |
-
main(meta_files, args.ignore_sustain_pedal)
|
preprocess/melody_accuracy.py
DELETED
@@ -1,81 +0,0 @@
|
|
1 |
-
import glob
|
2 |
-
import sys
|
3 |
-
import os
|
4 |
-
|
5 |
-
import librosa
|
6 |
-
import pretty_midi
|
7 |
-
|
8 |
-
from omegaconf import OmegaConf
|
9 |
-
|
10 |
-
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
|
11 |
-
from midiaudiopair import MidiAudioPair
|
12 |
-
from evaluate import midi_melody_accuracy as ma
|
13 |
-
|
14 |
-
|
15 |
-
def estimate(meta_file):
|
16 |
-
|
17 |
-
import warnings
|
18 |
-
|
19 |
-
warnings.filterwarnings(action="ignore")
|
20 |
-
|
21 |
-
sample = MidiAudioPair(meta_file)
|
22 |
-
|
23 |
-
if (
|
24 |
-
sample.error_code == MidiAudioPair.NO_PIANO
|
25 |
-
or sample.error_code == MidiAudioPair.NO_SONG_DIR
|
26 |
-
or sample.error_code == MidiAudioPair.NO_SONG
|
27 |
-
):
|
28 |
-
return
|
29 |
-
|
30 |
-
if "vocals" in sample.invalids:
|
31 |
-
print("no vocal:", meta_file)
|
32 |
-
return
|
33 |
-
|
34 |
-
midi = pretty_midi.PrettyMIDI(sample.qmidi)
|
35 |
-
vocals, sr = librosa.load(sample.vocals, sr=44100)
|
36 |
-
|
37 |
-
chroma_accuracy, pitch_accuracy = ma.evaluate_melody(
|
38 |
-
midi, vocals, sr=sr, hop_length=1024
|
39 |
-
)
|
40 |
-
meta = OmegaConf.load(meta_file)
|
41 |
-
meta.eval = OmegaConf.create()
|
42 |
-
meta.eval.melody_chroma_accuracy = chroma_accuracy.item()
|
43 |
-
meta.eval.melody_pitch_accuracy = pitch_accuracy.item()
|
44 |
-
OmegaConf.save(meta, meta_file)
|
45 |
-
|
46 |
-
|
47 |
-
def main(meta_files):
|
48 |
-
from tqdm import tqdm
|
49 |
-
import multiprocessing
|
50 |
-
from joblib import Parallel, delayed
|
51 |
-
|
52 |
-
def files():
|
53 |
-
pbar = tqdm(meta_files)
|
54 |
-
for meta_file in pbar:
|
55 |
-
pbar.set_description(meta_file)
|
56 |
-
yield meta_file
|
57 |
-
|
58 |
-
Parallel(n_jobs=multiprocessing.cpu_count() // 2)(
|
59 |
-
delayed(estimate)(meta_file) for meta_file in files()
|
60 |
-
)
|
61 |
-
|
62 |
-
|
63 |
-
if __name__ == "__main__":
|
64 |
-
import argparse
|
65 |
-
|
66 |
-
parser = argparse.ArgumentParser(description="bpm estimate using essentia")
|
67 |
-
|
68 |
-
parser.add_argument(
|
69 |
-
"data_dir",
|
70 |
-
type=str,
|
71 |
-
default=None,
|
72 |
-
help="""directory contains {id}/{pop_filename.wav}
|
73 |
-
""",
|
74 |
-
)
|
75 |
-
|
76 |
-
args = parser.parse_args()
|
77 |
-
|
78 |
-
meta_files = sorted(glob.glob(args.data_dir + "/**/*.yaml", recursive=True))
|
79 |
-
print("meta ", len(meta_files))
|
80 |
-
|
81 |
-
main(meta_files)
|
preprocess/pop_align.py
DELETED
@@ -1,331 +0,0 @@
|
|
1 |
-
import librosa
|
2 |
-
import soundfile as sf
|
3 |
-
import glob
|
4 |
-
import os
|
5 |
-
import copy
|
6 |
-
import sys
|
7 |
-
|
8 |
-
import numpy as np
|
9 |
-
import pyrubberband as pyrb
|
10 |
-
import pretty_midi
|
11 |
-
from omegaconf import OmegaConf
|
12 |
-
from tqdm.auto import tqdm
|
13 |
-
|
14 |
-
from synctoolbox.dtw.mrmsdtw import sync_via_mrmsdtw
|
15 |
-
from synctoolbox.dtw.utils import (
|
16 |
-
compute_optimal_chroma_shift,
|
17 |
-
shift_chroma_vectors,
|
18 |
-
make_path_strictly_monotonic,
|
19 |
-
)
|
20 |
-
from synctoolbox.feature.chroma import (
|
21 |
-
pitch_to_chroma,
|
22 |
-
quantize_chroma,
|
23 |
-
quantized_chroma_to_CENS,
|
24 |
-
)
|
25 |
-
from synctoolbox.feature.dlnco import pitch_onset_features_to_DLNCO
|
26 |
-
from synctoolbox.feature.pitch import audio_to_pitch_features
|
27 |
-
from synctoolbox.feature.pitch_onset import audio_to_pitch_onset_features
|
28 |
-
from synctoolbox.feature.utils import estimate_tuning
|
29 |
-
|
30 |
-
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
|
31 |
-
print(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
|
32 |
-
from utils.dsp import normalize, get_stereo
|
33 |
-
from midiaudiopair import MidiAudioPair
|
34 |
-
|
35 |
-
Fs = 22050
|
36 |
-
feature_rate = 50
|
37 |
-
step_weights = np.array([1.5, 1.5, 2.0])
|
38 |
-
threshold_rec = 10 ** 6
|
39 |
-
|
40 |
-
|
41 |
-
def save_delayed_song(
|
42 |
-
sample,
|
43 |
-
dry_run,
|
44 |
-
):
|
45 |
-
import warnings
|
46 |
-
|
47 |
-
warnings.filterwarnings(action="ignore")
|
48 |
-
|
49 |
-
song_audio, _ = librosa.load(sample.original_song, Fs)
|
50 |
-
midi_pm = pretty_midi.PrettyMIDI(sample.original_midi)
|
51 |
-
|
52 |
-
if np.power(song_audio, 2).sum() < 1: # low energy: invalid file
|
53 |
-
print("invalid audio :", sample.original_song)
|
54 |
-
sample.delete_files_myself()
|
55 |
-
return
|
56 |
-
|
57 |
-
rd = get_aligned_results(midi_pm=midi_pm, song_audio=song_audio)
|
58 |
-
|
59 |
-
mix_song = rd["mix_song"]
|
60 |
-
song_pitch_shifted = rd["song_pitch_shifted"]
|
61 |
-
midi_warped_pm = rd["midi_warped_pm"]
|
62 |
-
pitch_shift_for_song_audio = rd["pitch_shift_for_song_audio"]
|
63 |
-
tuning_offset_song = rd["tuning_offset_song"]
|
64 |
-
tuning_offset_piano = rd["tuning_offset_piano"]
|
65 |
-
|
66 |
-
try:
|
67 |
-
if dry_run:
|
68 |
-
print("write audio files: ", sample.song)
|
69 |
-
else:
|
70 |
-
sf.write(
|
71 |
-
file=sample.song,
|
72 |
-
data=song_pitch_shifted,
|
73 |
-
samplerate=Fs,
|
74 |
-
format="wav",
|
75 |
-
)
|
76 |
-
except:
|
77 |
-
print("Fail : ", sample.song)
|
78 |
-
|
79 |
-
try:
|
80 |
-
if dry_run:
|
81 |
-
print("write warped midi :", sample.midi)
|
82 |
-
else:
|
83 |
-
midi_warped_pm.write(sample.midi)
|
84 |
-
|
85 |
-
except:
|
86 |
-
midi_warped_pm._tick_scales = midi_pm._tick_scales
|
87 |
-
try:
|
88 |
-
if dry_run:
|
89 |
-
print("write warped midi2 :", sample.midi)
|
90 |
-
else:
|
91 |
-
midi_warped_pm.write(sample.midi)
|
92 |
-
|
93 |
-
except:
|
94 |
-
print("ad-hoc failed midi : ", sample.midi)
|
95 |
-
print("ad-hoc midi : ", sample.midi)
|
96 |
-
|
97 |
-
sample.yaml.song.pitch_shift = pitch_shift_for_song_audio.item()
|
98 |
-
sample.yaml.song.tuning_offset = tuning_offset_song.item()
|
99 |
-
sample.yaml.piano.tuning_offset = tuning_offset_piano.item()
|
100 |
-
OmegaConf.save(sample.yaml, sample.yaml_path)
|
101 |
-
|
102 |
-
|
103 |
-
def get_aligned_results(midi_pm, song_audio):
|
104 |
-
piano_audio = midi_pm.fluidsynth(Fs)
|
105 |
-
|
106 |
-
song_audio = normalize(song_audio)
|
107 |
-
|
108 |
-
# The reason for estimating tuning ::
|
109 |
-
# https://www.audiolabs-erlangen.de/resources/MIR/FMP/C3/C3S1_TranspositionTuning.html
|
110 |
-
tuning_offset_1 = estimate_tuning(song_audio, Fs)
|
111 |
-
tuning_offset_2 = estimate_tuning(piano_audio, Fs)
|
112 |
-
|
113 |
-
# DLNCO features (Sebastian Ewert, Meinard Müller, and Peter Grosche: High Resolution Audio Synchronization Using Chroma Onset Features, In Proceedings of IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP): 1869–1872, 2009.):
|
114 |
-
# helpful to increase synchronization accuracy, especially for music with clear onsets.
|
115 |
-
|
116 |
-
# Quantized and smoothed chroma : CENS features
|
117 |
-
# Because, MrMsDTW Requires CENS.
|
118 |
-
f_chroma_quantized_1, f_DLNCO_1 = get_features_from_audio(
|
119 |
-
song_audio, tuning_offset_1
|
120 |
-
)
|
121 |
-
f_chroma_quantized_2, f_DLNCO_2 = get_features_from_audio(
|
122 |
-
piano_audio, tuning_offset_2
|
123 |
-
)
|
124 |
-
|
125 |
-
# Shift chroma vectors :
|
126 |
-
# Otherwise, different keys of two audio leads to degradation of alignment.
|
127 |
-
opt_chroma_shift = compute_optimal_chroma_shift(
|
128 |
-
quantized_chroma_to_CENS(f_chroma_quantized_1, 201, 50, feature_rate)[0],
|
129 |
-
quantized_chroma_to_CENS(f_chroma_quantized_2, 201, 50, feature_rate)[0],
|
130 |
-
)
|
131 |
-
f_chroma_quantized_2 = shift_chroma_vectors(f_chroma_quantized_2, opt_chroma_shift)
|
132 |
-
f_DLNCO_2 = shift_chroma_vectors(f_DLNCO_2, opt_chroma_shift)
|
133 |
-
|
134 |
-
wp = sync_via_mrmsdtw(
|
135 |
-
f_chroma1=f_chroma_quantized_1,
|
136 |
-
f_onset1=f_DLNCO_1,
|
137 |
-
f_chroma2=f_chroma_quantized_2,
|
138 |
-
f_onset2=f_DLNCO_2,
|
139 |
-
input_feature_rate=feature_rate,
|
140 |
-
step_weights=step_weights,
|
141 |
-
threshold_rec=threshold_rec,
|
142 |
-
verbose=False,
|
143 |
-
)
|
144 |
-
|
145 |
-
wp = make_path_strictly_monotonic(wp)
|
146 |
-
pitch_shift_for_song_audio = -opt_chroma_shift % 12
|
147 |
-
if pitch_shift_for_song_audio > 6:
|
148 |
-
pitch_shift_for_song_audio -= 12
|
149 |
-
|
150 |
-
if pitch_shift_for_song_audio != 0:
|
151 |
-
song_audio_shifted = pyrb.pitch_shift(
|
152 |
-
song_audio, Fs, pitch_shift_for_song_audio
|
153 |
-
)
|
154 |
-
else:
|
155 |
-
song_audio_shifted = song_audio
|
156 |
-
|
157 |
-
time_map_second = wp / feature_rate
|
158 |
-
midi_pm_warped = copy.deepcopy(midi_pm)
|
159 |
-
|
160 |
-
midi_pm_warped = simple_adjust_times(
|
161 |
-
midi_pm_warped, time_map_second[1], time_map_second[0]
|
162 |
-
)
|
163 |
-
piano_audio_warped = midi_pm_warped.fluidsynth(Fs)
|
164 |
-
|
165 |
-
song_audio_shifted = normalize(song_audio_shifted)
|
166 |
-
stereo_sonification_piano = get_stereo(song_audio_shifted, piano_audio_warped)
|
167 |
-
|
168 |
-
rd = dict(
|
169 |
-
mix_song=stereo_sonification_piano,
|
170 |
-
song_pitch_shifted=song_audio_shifted,
|
171 |
-
midi_warped_pm=midi_pm_warped,
|
172 |
-
pitch_shift_for_song_audio=pitch_shift_for_song_audio,
|
173 |
-
tuning_offset_song=tuning_offset_1,
|
174 |
-
tuning_offset_piano=tuning_offset_2,
|
175 |
-
)
|
176 |
-
return rd
|
177 |
-
|
178 |
-
|
179 |
-
def simple_adjust_times(pm, original_times, new_times):
|
180 |
-
"""
|
181 |
-
most of these codes are from original pretty_midi
|
182 |
-
https://github.com/craffel/pretty-midi/blob/main/pretty_midi/pretty_midi.py
|
183 |
-
"""
|
184 |
-
for instrument in pm.instruments:
|
185 |
-
instrument.notes = [
|
186 |
-
copy.deepcopy(note)
|
187 |
-
for note in instrument.notes
|
188 |
-
if note.start >= original_times[0] and note.end <= original_times[-1]
|
189 |
-
]
|
190 |
-
# Get array of note-on locations and correct them
|
191 |
-
note_ons = np.array(
|
192 |
-
[note.start for instrument in pm.instruments for note in instrument.notes]
|
193 |
-
)
|
194 |
-
adjusted_note_ons = np.interp(note_ons, original_times, new_times)
|
195 |
-
# Same for note-offs
|
196 |
-
note_offs = np.array(
|
197 |
-
[note.end for instrument in pm.instruments for note in instrument.notes]
|
198 |
-
)
|
199 |
-
adjusted_note_offs = np.interp(note_offs, original_times, new_times)
|
200 |
-
# Correct notes
|
201 |
-
for n, note in enumerate(
|
202 |
-
[note for instrument in pm.instruments for note in instrument.notes]
|
203 |
-
):
|
204 |
-
note.start = (adjusted_note_ons[n] > 0) * adjusted_note_ons[n]
|
205 |
-
note.end = (adjusted_note_offs[n] > 0) * adjusted_note_offs[n]
|
206 |
-
# After performing alignment, some notes may have an end time which is
|
207 |
-
# on or before the start time. Remove these!
|
208 |
-
pm.remove_invalid_notes()
|
209 |
-
|
210 |
-
def adjust_events(event_getter):
|
211 |
-
"""This function calls event_getter with each instrument as the
|
212 |
-
sole argument and adjusts the events which are returned."""
|
213 |
-
# Sort the events by time
|
214 |
-
for instrument in pm.instruments:
|
215 |
-
event_getter(instrument).sort(key=lambda e: e.time)
|
216 |
-
# Correct the events by interpolating
|
217 |
-
event_times = np.array(
|
218 |
-
[
|
219 |
-
event.time
|
220 |
-
for instrument in pm.instruments
|
221 |
-
for event in event_getter(instrument)
|
222 |
-
]
|
223 |
-
)
|
224 |
-
adjusted_event_times = np.interp(event_times, original_times, new_times)
|
225 |
-
for n, event in enumerate(
|
226 |
-
[
|
227 |
-
event
|
228 |
-
for instrument in pm.instruments
|
229 |
-
for event in event_getter(instrument)
|
230 |
-
]
|
231 |
-
):
|
232 |
-
event.time = adjusted_event_times[n]
|
233 |
-
for instrument in pm.instruments:
|
234 |
-
# We want to keep only the final event which has time ==
|
235 |
-
# new_times[0]
|
236 |
-
valid_events = [
|
237 |
-
event
|
238 |
-
for event in event_getter(instrument)
|
239 |
-
if event.time == new_times[0]
|
240 |
-
]
|
241 |
-
if valid_events:
|
242 |
-
valid_events = valid_events[-1:]
|
243 |
-
# Otherwise only keep events within the new set of times
|
244 |
-
valid_events.extend(
|
245 |
-
event
|
246 |
-
for event in event_getter(instrument)
|
247 |
-
if event.time > new_times[0] and event.time < new_times[-1]
|
248 |
-
)
|
249 |
-
event_getter(instrument)[:] = valid_events
|
250 |
-
|
251 |
-
# Correct pitch bends and control changes
|
252 |
-
adjust_events(lambda i: i.pitch_bends)
|
253 |
-
adjust_events(lambda i: i.control_changes)
|
254 |
-
|
255 |
-
return pm
|
256 |
-
|
257 |
-
|
258 |
-
def get_features_from_audio(audio, tuning_offset, visualize=False):
|
259 |
-
f_pitch = audio_to_pitch_features(
|
260 |
-
f_audio=audio,
|
261 |
-
Fs=Fs,
|
262 |
-
tuning_offset=tuning_offset,
|
263 |
-
feature_rate=feature_rate,
|
264 |
-
verbose=visualize,
|
265 |
-
)
|
266 |
-
f_chroma = pitch_to_chroma(f_pitch=f_pitch)
|
267 |
-
f_chroma_quantized = quantize_chroma(f_chroma=f_chroma)
|
268 |
-
|
269 |
-
f_pitch_onset = audio_to_pitch_onset_features(
|
270 |
-
f_audio=audio, Fs=Fs, tuning_offset=tuning_offset, verbose=visualize
|
271 |
-
)
|
272 |
-
f_DLNCO = pitch_onset_features_to_DLNCO(
|
273 |
-
f_peaks=f_pitch_onset,
|
274 |
-
feature_rate=feature_rate,
|
275 |
-
feature_sequence_length=f_chroma_quantized.shape[1],
|
276 |
-
visualize=visualize,
|
277 |
-
)
|
278 |
-
return f_chroma_quantized, f_DLNCO
|
279 |
-
|
280 |
-
|
281 |
-
def main(samples, dry_run):
|
282 |
-
import multiprocessing
|
283 |
-
from joblib import Parallel, delayed
|
284 |
-
|
285 |
-
Parallel(n_jobs=multiprocessing.cpu_count() // 2)(
|
286 |
-
delayed(save_delayed_song)(sample=sample, dry_run=dry_run)
|
287 |
-
for sample in tqdm(samples)
|
288 |
-
)
|
289 |
-
|
290 |
-
|
291 |
-
if __name__ == "__main__":
|
292 |
-
|
293 |
-
import argparse
|
294 |
-
|
295 |
-
parser = argparse.ArgumentParser(description="piano cover downloader")
|
296 |
-
|
297 |
-
parser.add_argument(
|
298 |
-
"data_dir",
|
299 |
-
type=str,
|
300 |
-
default=None,
|
301 |
-
help="""directory contains {id}/{song_filename.wav}
|
302 |
-
""",
|
303 |
-
)
|
304 |
-
parser.add_argument(
|
305 |
-
"--dry_run", default=False, action="store_true", help="whether dry_run"
|
306 |
-
)
|
307 |
-
|
308 |
-
args = parser.parse_args()
|
309 |
-
|
310 |
-
def getfiles():
|
311 |
-
meta_files = sorted(glob.glob(args.data_dir + "/*.yaml"))
|
312 |
-
print("meta ", len(meta_files))
|
313 |
-
|
314 |
-
samples = list()
|
315 |
-
for meta_file in tqdm(meta_files):
|
316 |
-
m = MidiAudioPair(meta_file, auto_remove_no_song=True)
|
317 |
-
if m.error_code != MidiAudioPair.NO_SONG:
|
318 |
-
aux_txt = os.path.join(
|
319 |
-
m.audio_dir,
|
320 |
-
m.yaml.piano.ytid,
|
321 |
-
f"{m.yaml.piano.title[:50]}___{m.yaml.song.title[:50]}.txt",
|
322 |
-
)
|
323 |
-
with open(aux_txt, "w") as f:
|
324 |
-
f.write(".")
|
325 |
-
samples.append(m)
|
326 |
-
|
327 |
-
print(f"files available {len(samples)}")
|
328 |
-
return samples
|
329 |
-
|
330 |
-
samples = getfiles()
|
331 |
-
main(samples=samples, dry_run=args.dry_run)
|
preprocess/split_spleeter.py
DELETED
@@ -1,72 +0,0 @@
-import glob
-import os
-import random
-import sys
-
-from tqdm.auto import tqdm
-
-sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
-from midiaudiopair import MidiAudioPair
-
-
-def split_spleeter(meta_files):
-    # Use audio loader explicitly for loading audio waveform :
-    from spleeter.audio.adapter import AudioAdapter
-    from spleeter.separator import Separator
-    import spleeter
-
-    sample_rate = 44100
-    audio_loader = AudioAdapter.default()
-
-    # Using embedded configuration.
-    separator = Separator("spleeter:2stems")
-
-    for meta_file in tqdm(meta_files):
-        sample = MidiAudioPair(meta_file)
-        if sample.error_code == MidiAudioPair.NO_SONG:
-            continue
-        if os.path.exists(sample.vocals):
-            continue
-
-        waveform, _ = audio_loader.load(sample.song, sample_rate=sample_rate)
-
-        # Perform the separation :
-        prediction = separator.separate(waveform)
-
-        audio_loader.save(
-            path=sample.vocals,
-            data=prediction["vocals"][:, 0:1],
-            codec=spleeter.audio.Codec.MP3,
-            sample_rate=sample_rate,
-        )
-
-
-if __name__ == "__main__":
-    import argparse
-
-    parser = argparse.ArgumentParser(description="bpm estimate using essentia")
-
-    parser.add_argument(
-        "data_dir",
-        type=str,
-        default=None,
-        help="""directory contains {id}/{pop_filename.wav}
-        """,
-    )
-
-    parser.add_argument(
-        "--random_order",
-        default=False,
-        action="store_true",
-        help="Random order process (to run multiple process)",
-    )
-
-    args = parser.parse_args()
-
-    meta_files = sorted(glob.glob(args.data_dir + "/*.yaml"))
-    if args.random_order:
-        random.shuffle(meta_files)
-
-    print("meta ", len(meta_files))
-
-    split_spleeter(meta_files)
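For reference, the 2-stems separation the deleted script performs can be reproduced on a single file with the same Spleeter API it already uses; a sketch, assuming Spleeter is installed and "song.mp3" stands in for an actual input file:

    from spleeter.audio.adapter import AudioAdapter
    from spleeter.separator import Separator

    separator = Separator("spleeter:2stems")   # vocals + accompaniment model
    audio_loader = AudioAdapter.default()

    # Spleeter's pretrained 2-stems weights operate at 44.1 kHz
    waveform, _ = audio_loader.load("song.mp3", sample_rate=44100)
    prediction = separator.separate(waveform)  # dict: stem name -> waveform array
    print(list(prediction.keys()))             # expected: ['vocals', 'accompaniment']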
requirements.txt
CHANGED
@@ -1,10 +1,11 @@
+torch
+librosa
 pretty-midi==0.2.9
-
-transformers==4.16.1
-pytorch-lightning==1.8.3
-essentia==2.1b6.dev1034
-note-seq==0.0.5
+essentia==2.1b6.dev1034
 pyFluidSynth==1.3.0
-
-
-
+git+https://github.com/huggingface/transformers
+midi2audio
+pytube
+gradio
+resampy
+soundfile
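Note that `git+https://github.com/huggingface/transformers` installs whatever is on the main branch at build time, so the resolved version can change between builds. A quick post-install sanity check that the build actually ships the Pop2Piano classes (a sketch; run it after `pip install -r requirements.txt`):

    # Verify the unpinned transformers build imports cleanly and report its version.
    import transformers

    print(transformers.__version__)
    from transformers import Pop2PianoForConditionalGeneration, Pop2PianoProcessor  # noqa: F401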
transformer_wrapper.py
DELETED
@@ -1,330 +0,0 @@
-import os
-import random
-
-import numpy as np
-import librosa
-import torch
-
-import pytorch_lightning as pl
-import soundfile as sf
-from torch.nn.utils.rnn import pad_sequence
-from transformers import T5Config, T5ForConditionalGeneration
-
-from midi_tokenizer import MidiTokenizer, extrapolate_beat_times
-from layer.input import LogMelSpectrogram, ConcatEmbeddingToMel
-from preprocess.beat_quantizer import extract_rhythm, interpolate_beat_times
-from utils.dsp import get_stereo
-
-
-DEFAULT_COMPOSERS = {"various composer": 2052}
-
-
-class TransformerWrapper(pl.LightningModule):
-    def __init__(self, config):
-        super().__init__()
-        self.config = config
-
-        self.tokenizer = MidiTokenizer(config.tokenizer)
-        self.t5config = T5Config.from_pretrained("t5-small")
-
-        for k, v in config.t5.items():
-            self.t5config.__setattr__(k, v)
-
-        self.transformer = T5ForConditionalGeneration(self.t5config)
-        self.use_mel = self.config.dataset.use_mel
-        self.mel_is_conditioned = self.config.dataset.mel_is_conditioned
-        self.composer_to_feature_token = config.composer_to_feature_token
-
-        if self.use_mel and not self.mel_is_conditioned:
-            self.composer_to_feature_token = DEFAULT_COMPOSERS
-
-        if self.use_mel:
-            self.spectrogram = LogMelSpectrogram()
-            if self.mel_is_conditioned:
-                n_dim = 512
-                composer_n_vocab = len(self.composer_to_feature_token)
-                embedding_offset = min(self.composer_to_feature_token.values())
-                self.mel_conditioner = ConcatEmbeddingToMel(
-                    embedding_offset=embedding_offset,
-                    n_vocab=composer_n_vocab,
-                    n_dim=n_dim,
-                )
-        else:
-            self.spectrogram = None
-
-        self.lr = config.training.lr
-
-    def forward(self, input_ids, labels):
-        """
-        Deprecated.
-        """
-        rt = self.transformer(input_ids=input_ids, labels=labels)
-        return rt
-
-    @torch.no_grad()
-    def single_inference(
-        self,
-        feature_tokens=None,
-        audio=None,
-        beatstep=None,
-        max_length=256,
-        max_batch_size=64,
-        n_bars=None,
-        composer_value=None,
-    ):
-        """
-        generate a long audio sequence
-
-        feature_tokens or audio : shape (time, )
-
-        beatstep : shape (time, )
-        - the beatstep values that the input_ids correspond to
-          (offset removed, i.e. beatstep[0] == 0)
-        - beatstep[-1] : the time at which the input_ids end
-          (i.e. beatstep[-1] == len(y)//sr)
-        """
-
-        assert feature_tokens is not None or audio is not None
-        assert beatstep is not None
-
-        if feature_tokens is not None:
-            assert len(feature_tokens.shape) == 1
-
-        if audio is not None:
-            assert len(audio.shape) == 1
-
-        config = self.config
-        PAD = self.t5config.pad_token_id
-        n_bars = config.dataset.n_bars if n_bars is None else n_bars
-
-        if beatstep[0] > 0.01:
-            print(
-                "inference warning : beatstep[0] is not 0 ({beatstep[0]}). all beatstep will be shifted."
-            )
-            beatstep = beatstep - beatstep[0]
-
-        if self.use_mel:
-            input_ids = None
-            inputs_embeds, ext_beatstep = self.prepare_inference_mel(
-                audio,
-                beatstep,
-                n_bars=n_bars,
-                padding_value=PAD,
-                composer_value=composer_value,
-            )
-            batch_size = inputs_embeds.shape[0]
-        else:
-            raise NotImplementedError
-
-        # Considering GPU capacity, some sequence would not be generated at once.
-        relative_tokens = list()
-        for i in range(0, batch_size, max_batch_size):
-            start = i
-            end = min(batch_size, i + max_batch_size)
-
-            if input_ids is None:
-                _input_ids = None
-                _inputs_embeds = inputs_embeds[start:end]
-            else:
-                _input_ids = input_ids[start:end]
-                _inputs_embeds = None
-
-            _relative_tokens = self.transformer.generate(
-                input_ids=_input_ids,
-                inputs_embeds=_inputs_embeds,
-                max_length=max_length,
-            )
-            _relative_tokens = _relative_tokens.cpu().numpy()
-            relative_tokens.append(_relative_tokens)
-
-        max_length = max([rt.shape[-1] for rt in relative_tokens])
-        for i in range(len(relative_tokens)):
-            relative_tokens[i] = np.pad(
-                relative_tokens[i],
-                [(0, 0), (0, max_length - relative_tokens[i].shape[-1])],
-                constant_values=PAD,
-            )
-        relative_tokens = np.concatenate(relative_tokens)
-
-        pm, notes = self.tokenizer.relative_batch_tokens_to_midi(
-            relative_tokens,
-            beatstep=ext_beatstep,
-            bars_per_batch=n_bars,
-            cutoff_time_idx=(n_bars + 1) * 4,
-        )
-
-        return relative_tokens, notes, pm
-
-    def prepare_inference_mel(self, audio, beatstep, n_bars, padding_value, composer_value=None):
-        n_steps = n_bars * 4
-        n_target_step = len(beatstep)
-        sample_rate = self.config.dataset.sample_rate
-        ext_beatstep = extrapolate_beat_times(beatstep, (n_bars + 1) * 4 + 1)
-
-        def split_audio(audio):
-            # Split audio corresponding beat intervals.
-            # Each audio's lengths are different.
-            # Because each corresponding beat interval times are different.
-            batch = []
-
-            for i in range(0, n_target_step, n_steps):
-
-                start_idx = i
-                end_idx = min(i + n_steps, n_target_step)
-
-                start_sample = int(ext_beatstep[start_idx] * sample_rate)
-                end_sample = int(ext_beatstep[end_idx] * sample_rate)
-                feature = audio[start_sample:end_sample]
-                batch.append(feature)
-            return batch
-
-        def pad_and_stack_batch(batch):
-            batch = pad_sequence(batch, batch_first=True, padding_value=padding_value)
-            return batch
-
-        batch = split_audio(audio)
-        batch = pad_and_stack_batch(batch)
-
-        inputs_embeds = self.spectrogram(batch).transpose(-1, -2)
-        if self.mel_is_conditioned:
-            composer_value = torch.tensor(composer_value).to(self.device)
-            composer_value = composer_value.repeat(inputs_embeds.shape[0])
-            inputs_embeds = self.mel_conditioner(inputs_embeds, composer_value)
-        return inputs_embeds, ext_beatstep
-
-    @torch.no_grad()
-    def generate(
-        self,
-        audio_path=None,
-        composer=None,
-        model="generated",
-        steps_per_beat=2,
-        stereo_amp=0.5,
-        n_bars=2,
-        ignore_duplicate=True,
-        show_plot=False,
-        save_midi=False,
-        save_mix=False,
-        midi_path=None,
-        mix_path=None,
-        click_amp=0.2,
-        add_click=False,
-        max_batch_size=None,
-        beatsteps=None,
-        mix_sample_rate=None,
-        audio_y=None,
-        audio_sr=None,
-    ):
-        config = self.config
-        device = self.device
-
-        if audio_path is not None:
-            extension = os.path.splitext(audio_path)[1]
-            mix_path = (
-                audio_path.replace(extension, f".{model}.{composer}.wav")
-                if mix_path is None
-                else mix_path
-            )
-            midi_path = (
-                audio_path.replace(extension, f".{model}.{composer}.mid")
-                if midi_path is None
-                else midi_path
-            )
-
-        max_batch_size = 64 // n_bars if max_batch_size is None else max_batch_size
-        composer_to_feature_token = self.composer_to_feature_token
-
-        if composer is None:
-            composer = random.sample(list(composer_to_feature_token.keys()), 1)[0]
-
-        composer_value = composer_to_feature_token[composer]
-        mix_sample_rate = config.dataset.sample_rate if mix_sample_rate is None else mix_sample_rate
-
-        if not ignore_duplicate:
-            if os.path.exists(midi_path):
-                return
-
-        ESSENTIA_SAMPLERATE = 44100
-
-        if beatsteps is None:
-            y, sr = librosa.load(audio_path, sr=ESSENTIA_SAMPLERATE)
-            (
-                bpm,
-                beat_times,
-                confidence,
-                estimates,
-                essentia_beat_intervals,
-            ) = extract_rhythm(audio_path, y=y)
-            beat_times = np.array(beat_times)
-            beatsteps = interpolate_beat_times(beat_times, steps_per_beat, extend=True)
-        else:
-            y = None
-
-        if self.use_mel:
-            if audio_y is None and config.dataset.sample_rate != ESSENTIA_SAMPLERATE:
-                if y is not None:
-                    y = librosa.core.resample(
-                        y,
-                        orig_sr=ESSENTIA_SAMPLERATE,
-                        target_sr=config.dataset.sample_rate,
-                    )
-                    sr = config.dataset.sample_rate
-                else:
-                    y, sr = librosa.load(audio_path, sr=config.dataset.sample_rate)
-            elif audio_y is not None:
-                if audio_sr != config.dataset.sample_rate:
-                    audio_y = librosa.core.resample(
-                        audio_y, orig_sr=audio_sr, target_sr=config.dataset.sample_rate
-                    )
-                    audio_sr = config.dataset.sample_rate
-                y = audio_y
-                sr = audio_sr
-
-            start_sample = int(beatsteps[0] * sr)
-            end_sample = int(beatsteps[-1] * sr)
-            _audio = torch.from_numpy(y)[start_sample:end_sample].to(device)
-            fzs = None
-        else:
-            raise NotImplementedError
-
-        relative_tokens, notes, pm = self.single_inference(
-            feature_tokens=fzs,
-            audio=_audio,
-            beatstep=beatsteps - beatsteps[0],
-            max_length=config.dataset.target_length * max(1, (n_bars // config.dataset.n_bars)),
-            max_batch_size=max_batch_size,
-            n_bars=n_bars,
-            composer_value=composer_value,
-        )
-
-        for n in pm.instruments[0].notes:
-            n.start += beatsteps[0]
-            n.end += beatsteps[0]
-
-        if show_plot or save_mix:
-            if mix_sample_rate != sr:
-                y = librosa.core.resample(y, orig_sr=sr, target_sr=mix_sample_rate)
-                sr = mix_sample_rate
-            if add_click:
-                clicks = librosa.clicks(times=beatsteps, sr=sr, length=len(y)) * click_amp
-                y = y + clicks
-            pm_y = pm.fluidsynth(sr)
-            stereo = get_stereo(y, pm_y, pop_scale=stereo_amp)
-
-        if show_plot:
-            import note_seq
-
-            note_seq.plot_sequence(note_seq.midi_to_note_sequence(pm))
-
-        if save_mix:
-            sf.write(
-                file=mix_path,
-                data=stereo.T,
-                samplerate=sr,
-                format="wav",
-            )
-
-        if save_midi:
-            pm.write(midi_path)
-
-        return pm, composer, mix_path, midi_path
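The key preprocessing step in the deleted `prepare_inference_mel` is beat-synchronous batching: the waveform is sliced at every `n_bars * 4` extrapolated beatsteps, so each slice spans the same number of beats even though its length in samples varies, and the ragged slices are then padded into one batch. A stripped-down sketch of just that step, using synthetic audio, evenly spaced beat times, and an assumed 22050 Hz sample rate:

    import numpy as np
    import torch
    from torch.nn.utils.rnn import pad_sequence

    sample_rate = 22050
    n_steps = 2 * 4                                # n_bars = 2, four beatsteps per bar
    audio = torch.from_numpy(np.random.randn(sample_rate * 10).astype(np.float32))
    beatstep = np.linspace(0.0, 10.0, 41)          # 40 beat intervals over 10 seconds

    pieces = []
    for i in range(0, len(beatstep) - 1, n_steps):
        start = int(beatstep[i] * sample_rate)
        end = int(beatstep[min(i + n_steps, len(beatstep) - 1)] * sample_rate)
        pieces.append(audio[start:end])            # variable-length, beat-aligned slice

    batch = pad_sequence(pieces, batch_first=True, padding_value=0.0)
    print(batch.shape)                             # (5, longest_slice_in_samples)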
utils/__init__.py
DELETED
File without changes
utils/dsp.py
DELETED
@@ -1,63 +0,0 @@
-import numpy as np
-from scipy.interpolate import interp1d
-
-
-def normalize(audio, min_y=-1.0, max_y=1.0, eps=1e-8):
-    assert len(audio.shape) == 1
-    max_y -= eps
-    min_y += eps
-    amax = audio.max()
-    amin = audio.min()
-    audio = (max_y - min_y) * (audio - amin) / (amax - amin) + min_y
-    return audio
-
-
-def get_stereo(pop_y, midi_y, pop_scale=0.99):
-    if len(pop_y) > len(midi_y):
-        midi_y = np.pad(midi_y, (0, len(pop_y) - len(midi_y)))
-    elif len(pop_y) < len(midi_y):
-        pop_y = np.pad(pop_y, (0, -len(pop_y) + len(midi_y)))
-    stereo = np.stack((midi_y, pop_y * pop_scale))
-    return stereo
-
-
-def generate_variable_f0_sine_wave(f0, len_y, sr):
-    """
-    integrate instant frequencies to get pure tone sine wave
-    """
-    x_sample = np.arange(len(f0))
-    intp = interp1d(x_sample, f0, kind="linear")
-    f0_audiorate = intp(np.linspace(0, len(f0) - 1, len_y))
-    pitch_wave = np.sin((np.nan_to_num(f0_audiorate) / sr * 2 * np.pi).cumsum())
-    return pitch_wave
-
-
-def fluidsynth_without_normalize(self, fs=44100, sf2_path=None):
-    """Synthesize using fluidsynth. without signal normalize
-    Parameters
-    ----------
-    fs : int
-        Sampling rate to synthesize at.
-    sf2_path : str
-        Path to a .sf2 file.
-        Default ``None``, which uses the TimGM6mb.sf2 file included with
-        ``pretty_midi``.
-    Returns
-    -------
-    synthesized : np.ndarray
-        Waveform of the MIDI data, synthesized at ``fs``.
-    """
-    # If there are no instruments, or all instruments have no notes, return
-    # an empty array
-    if len(self.instruments) == 0 or all(len(i.notes) == 0 for i in self.instruments):
-        return np.array([])
-    # Get synthesized waveform for each instrument
-    waveforms = [i.fluidsynth(fs=fs, sf2_path=sf2_path) for i in self.instruments]
-    # Allocate output waveform, with #sample = max length of all waveforms
-    synthesized = np.zeros(np.max([w.shape[0] for w in waveforms]))
-    # Sum all waveforms in
-    for waveform in waveforms:
-        synthesized[: waveform.shape[0]] += waveform
-    # Normalize
-    # synthesized /= np.abs(synthesized).max()
-    return synthesized
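`get_stereo` above is what produced the old pipeline's preview mix: the synthesized piano on one channel, the (scaled) original pop recording on the other, with the shorter signal zero-padded to match. A small usage sketch with synthetic signals; writing the result via soundfile is an assumption about how such a mix would typically be saved:

    import numpy as np
    import soundfile as sf

    def get_stereo(pop_y, midi_y, pop_scale=0.99):
        # same idea as the deleted helper: pad the shorter signal, stack to two channels
        if len(pop_y) > len(midi_y):
            midi_y = np.pad(midi_y, (0, len(pop_y) - len(midi_y)))
        elif len(pop_y) < len(midi_y):
            pop_y = np.pad(pop_y, (0, len(midi_y) - len(pop_y)))
        return np.stack((midi_y, pop_y * pop_scale))

    sr = 44100
    pop_y = 0.1 * np.random.randn(sr * 3)    # stand-in for the original song
    midi_y = 0.1 * np.random.randn(sr * 2)   # stand-in for the fluidsynth-rendered piano

    stereo = get_stereo(pop_y, midi_y)       # shape: (2, n_samples)
    sf.write("mix.wav", stereo.T, sr)        # soundfile expects (n_samples, n_channels)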