Upload 6 files
- preprocess/.DS_Store +0 -0
- preprocess/encoder/__init__.py +3 -0
- preprocess/encoder/mert.py +28 -0
- preprocess/encoder/music2latent.py +29 -0
- preprocess/feature_extractor.py +94 -0
- preprocess/jamendo_split.py +83 -0
preprocess/.DS_Store
ADDED
Binary file (6.15 kB).
preprocess/encoder/__init__.py
ADDED
@@ -0,0 +1,3 @@
+"Import all submodules"
+
+# from model import
preprocess/encoder/mert.py
ADDED
@@ -0,0 +1,28 @@
+import torch
+import numpy as np
+from transformers import Wav2Vec2FeatureExtractor, AutoModel
+
+class FeatureExtractorMERT:
+    def __init__(self, model_name="m-a-p/MERT-v1-95M", device_id=0, sr=24000):
+        self.model_name = model_name
+        self.device_id = device_id
+        self.sr = sr
+        self.device = torch.device(f"cuda:{self.device_id}" if torch.cuda.is_available() else "cpu")
+        self.model = AutoModel.from_pretrained(self.model_name, trust_remote_code=True).to(self.device)
+        self.processor = Wav2Vec2FeatureExtractor.from_pretrained(self.model_name, trust_remote_code=True)
+
+    def extract_features_from_segment(self, segment, sample_rate, save_path):
+        input_audio = segment.float()
+        model_inputs = self.processor(input_audio, sampling_rate=sample_rate, return_tensors="pt")
+        model_inputs = model_inputs.to(self.device)
+
+        with torch.no_grad():
+            model_outputs = self.model(**model_inputs, output_hidden_states=True)
+
+        # Stack all hidden states, drop the input embedding layer, and average over time
+        all_layer_hidden_states = torch.stack(model_outputs.hidden_states).squeeze()[1:, :, :].unsqueeze(0)
+        all_layer_hidden_states = all_layer_hidden_states.mean(dim=2)
+        features = all_layer_hidden_states.cpu().detach().numpy()
+
+        # Save features
+        np.save(save_path, features)
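For reference, a minimal usage sketch for FeatureExtractorMERT (not part of this upload; the file names are hypothetical, and the mono downmix and resampling mirror what feature_extractor.py below does before calling the extractor):

import torchaudio
import torchaudio.transforms as T
from encoder.mert import FeatureExtractorMERT

extractor = FeatureExtractorMERT()               # defaults: m-a-p/MERT-v1-95M, 24 kHz

waveform, sr = torchaudio.load("example.mp3")    # hypothetical input file
waveform = waveform.mean(dim=0)                  # downmix to mono (1-D tensor)
if sr != extractor.sr:
    waveform = T.Resample(sr, extractor.sr)(waveform)

# For the 95M checkpoint this writes a [1, 12, 768] array (one vector per layer)
extractor.extract_features_from_segment(waveform, extractor.sr, "example_mert.npy")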
preprocess/encoder/music2latent.py
ADDED
@@ -0,0 +1,29 @@
+import os
+import torch
+import torchaudio
+import torchaudio.transforms as T
+import numpy as np
+from music2latent import EncoderDecoder  # Import your custom model
+
+class FeatureExtractorM2L:
+    def __init__(self, device_id=0, sr=44100):
+        self.device_id = device_id
+        self.sr = sr
+        self.device = torch.device(f"cuda:{self.device_id}" if torch.cuda.is_available() else "cpu")
+        self.model = EncoderDecoder(device=self.device)
+
+    def extract_features_from_segment(self, segment, sample_rate, save_path):
+        input_audio = segment.unsqueeze(0).to(self.device)  # Add batch dimension and move to the device
+
+        with torch.no_grad():
+            model_outputs = self.model.encode(input_audio, extract_features=True)
+
+        # Average the latent sequence over time and save one vector per segment
+        features = model_outputs.mean(dim=-1).cpu().numpy()
+        np.save(save_path, features)
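Design note: the two encoders pool differently. The MERT extractor keeps one time-averaged vector per transformer layer (the input embedding is dropped by the [1:] slice), while the music2latent extractor averages its latent sequence over the time axis (dim=-1), so each segment reduces to a single feature vector.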
preprocess/feature_extractor.py
ADDED
@@ -0,0 +1,94 @@
+import os
+import torch
+import torchaudio
+import torchaudio.transforms as T
+from tqdm import tqdm
+import numpy as np
+from omegaconf import DictConfig
+import hydra
+from hydra.utils import to_absolute_path
+from transformers import Wav2Vec2FeatureExtractor, AutoModel
+
+from encoder.mert import FeatureExtractorMERT
+from encoder.music2latent import FeatureExtractorM2L
+
+class AudioProcessor:
+    def __init__(self, cfg: DictConfig):
+        self.input_directory = cfg.dataset.input_dir
+        self.output_directory = cfg.dataset.output_dir
+        self.segment_duration = cfg.segment_duration
+        self.resample_rate = cfg.model.sr
+        self.device_id = cfg.device_id
+        self.feature_extractor = self._initialize_extractor(cfg.model.name)
+        self.is_split = cfg.is_split
+
+    def _initialize_extractor(self, model_name: str):
+        if "MERT" in model_name:
+            return FeatureExtractorMERT(model_name=model_name, device_id=self.device_id, sr=self.resample_rate)
+        elif model_name == "music2latent":
+            return FeatureExtractorM2L(device_id=self.device_id, sr=self.resample_rate)
+        else:
+            raise NotImplementedError(f"Feature extraction for model {model_name} is not implemented.")
+
+    def resample_waveform(self, waveform, original_sample_rate, target_sample_rate):
+        if original_sample_rate != target_sample_rate:
+            resampler = T.Resample(original_sample_rate, target_sample_rate)
+            return resampler(waveform), target_sample_rate
+        return waveform, original_sample_rate
+
+    def split_audio(self, waveform, sample_rate):
+        # Cast to int so range() works even for fractional segment durations
+        segment_samples = int(self.segment_duration * sample_rate)
+        total_samples = waveform.size(0)
+
+        segments = []
+        for start in range(0, total_samples, segment_samples):
+            end = start + segment_samples
+            if end <= total_samples:
+                segments.append(waveform[start:end])
+
+        # In case audio length is shorter than segment length.
+        if len(segments) == 0:
+            segments.append(waveform)
+
+        return segments
+
+    def process_audio_file(self, file_path, output_dir):
+        print(f"Processing {file_path}")
+        waveform, sample_rate = torchaudio.load(file_path)
+
+        # Downmix multi-channel audio to mono, then drop the channel dimension
+        if waveform.shape[0] > 1:
+            waveform = waveform.mean(dim=0).unsqueeze(0)
+        waveform = waveform.squeeze()
+        waveform, sample_rate = self.resample_waveform(waveform, sample_rate, self.resample_rate)
+
+        if self.is_split:
+            segments = self.split_audio(waveform, sample_rate)
+            for i, segment in enumerate(segments):
+                segment_save_path = os.path.join(output_dir, f"segment_{i}.npy")
+                if os.path.exists(segment_save_path):
+                    continue  # Skip segments that were already extracted
+                self.feature_extractor.extract_features_from_segment(segment, sample_rate, segment_save_path)
+        else:
+            segment_save_path = os.path.join(output_dir, "segment_0.npy")
+            if not os.path.exists(segment_save_path):
+                self.feature_extractor.extract_features_from_segment(waveform, sample_rate, segment_save_path)
+
+    def process_directory(self):
+        for root, _, files in os.walk(self.input_directory):
+            for file in files:
+                if file.endswith('.mp3'):
+                    file_path = os.path.join(root, file)
+                    relative_path = os.path.relpath(file_path, self.input_directory)
+                    output_file_dir = os.path.join(self.output_directory, os.path.splitext(relative_path)[0])
+                    os.makedirs(output_file_dir, exist_ok=True)
+                    self.process_audio_file(file_path, output_file_dir)
+
+@hydra.main(version_base=None, config_path="../config", config_name="prep_config")
+def main(cfg: DictConfig):
+    processor = AudioProcessor(cfg)
+    processor.process_directory()
+
+if __name__ == "__main__":
+    main()
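feature_extractor.py reads all of its settings from a Hydra config (config/prep_config.yaml, not included in this upload). A sketch of the keys the script actually reads, with illustrative values, expressed via OmegaConf so it can double as a programmatic entry point:

from omegaconf import OmegaConf

cfg = OmegaConf.create({
    "dataset": {
        "input_dir": "dataset/jamendo/audio",      # hypothetical paths
        "output_dir": "dataset/jamendo/features",
    },
    "model": {"name": "m-a-p/MERT-v1-95M", "sr": 24000},
    "segment_duration": 30,                        # seconds per segment (illustrative)
    "device_id": 0,
    "is_split": True,
})

processor = AudioProcessor(cfg)                    # bypasses the @hydra.main CLI
processor.process_directory()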
preprocess/jamendo_split.py
ADDED
@@ -0,0 +1,83 @@
+import os
+import csv
+import pickle
+import numpy as np
+import fire
+from collections import Counter
+
+class Split:
+    def read_tsv(self, fn):
+        r = []
+        with open(fn) as tsv:
+            reader = csv.reader(tsv, delimiter='\t')
+            for row in reader:
+                r.append(row)
+        return r[1:]  # Drop the header row
+
+    def get_tag_list(self, option):
+        if option == 'top50tags':
+            tag_list = np.load('dataset/jamendo/meta/tag_list_50.npy')
+        else:
+            tag_list = np.load('dataset/jamendo/meta/tag_list.npy')
+            if option == 'genre':
+                tag_list = tag_list[:87]
+            elif option == 'instrument':
+                tag_list = tag_list[87:127]
+            elif option == 'moodtheme':
+                tag_list = tag_list[127:]
+        return list(tag_list)
+
+    def get_npy_array(self, path, tag_list, option, type_='train'):
+        if option == 'all':
+            tsv_fn = os.path.join(path, 'autotagging-%s.tsv' % type_)
+        else:
+            tsv_fn = os.path.join(path, 'autotagging_%s-%s.tsv' % (option, type_))
+        rows = self.read_tsv(tsv_fn)
+        dictionary = {}
+        i = 0
+        for row in rows:
+            temp_dict = {}
+            temp_dict['path'] = row[3]
+            temp_dict['duration'] = (float(row[4]) * 12000 - 512) // 256
+            if option == 'all':
+                temp_dict['tags'] = np.zeros(183)
+            elif option == 'genre':
+                temp_dict['tags'] = np.zeros(87)
+            elif option == 'instrument':
+                temp_dict['tags'] = np.zeros(40)
+            elif option == 'moodtheme':
+                temp_dict['tags'] = np.zeros(56)
+            elif option == 'top50tags':
+                temp_dict['tags'] = np.zeros(50)
+            tags = row[5:]
+            for tag in tags:
+                try:
+                    temp_dict['tags'][tag_list.index(tag)] = 1
+                except ValueError:
+                    continue  # Tag is outside the selected subset
+            if temp_dict['tags'].sum() > 0 and os.path.exists(os.path.join(self.npy_path, row[3][:-3]) + 'npy'):
+                dictionary[i] = temp_dict
+                i += 1
+        dict_fn = os.path.join(path, '%s_%s_dict.pickle' % (option, type_))
+        with open(dict_fn, 'wb') as pf:
+            pickle.dump(dictionary, pf)
+
+    def run_iter(self, split, option='all'):
+        tag_list = self.get_tag_list(option)
+        path = 'dataset/jamendo/splits/split-%d/' % split
+        self.get_npy_array(path, tag_list, option, type_='train')
+        self.get_npy_array(path, tag_list, option, type_='validation')
+        self.get_npy_array(path, tag_list, option, type_='test')
+
+    def run(self, path):
+        self.npy_path = path
+        for i in range(5):
+            # self.run_iter(i, 'all')
+            self.run_iter(i, 'genre')
+            self.run_iter(i, 'instrument')
+            self.run_iter(i, 'moodtheme')
+            # self.run_iter(i, 'top50tags')
+
+if __name__ == '__main__':
+    s = Split()
+    fire.Fire({'run': s.run})
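Usage note: fire exposes a single run command that takes the root directory of precomputed .npy features (path below is illustrative):

python preprocess/jamendo_split.py run dataset/jamendo/npy

For each of the five MTG-Jamendo splits this writes <option>_<type>_dict.pickle next to the split TSVs, keeping only tracks that carry at least one tag from the selected subset and whose <track>.npy exists under the given root (the check swaps the .mp3 extension for .npy).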