#! /usr/bin/env python3

import re
import sys

import numpy as np
import torch
import torch.nn as nn
from PIL import Image

from . import rect_to_square, square_to_rect

CHORD_BORDER = 8  # chord border size in pixels

# My distillation of all output from Polyffusion's chord finder for the POP909 dataset, transposed +/- 12 semitones.
NOTE_NAMES = ['C','C#','D','E','Eb','F','F#','G', 'Ab', 'A', 'Bb', 'B']  # these are from Polyffusion's chord finder. Yes, mixing # & b is weird.
#NOTE_NAMES2 = ['A','Ab','B','Bb','C','C#','D','E','Eb','F','F#','G']  # how they appear in the all_chords.txt file

CHORD_TYPES = ['aug', 'dim', 'dim7', 'hdim7', 'maj', 'maj(11)', 'maj13', 'maj/3', 'maj/5', 'maj6',
               'maj6(9)', 'maj7', 'maj7/3', 'maj7/5', 'maj7/7', 'maj(9)', 'maj9', 'maj9(11)', 'min', 'min(11)',
               'min11', 'min13', 'min/5', 'min6', 'min6(9)', 'min7', 'min7/5', 'min7/b7', 'min(9)', 'min9',
               'min/b3', 'minmaj7', 'sus2', 'sus4', 'sus4(b7)', 'sus4(b7,9)', '7', '7/3', '7/5', '7(#9)',
               '7/b7', '9', '11', '13']  # 44 chord types

CHORD_IND_PAIRS = [(note, chord) for note in NOTE_NAMES for chord in CHORD_TYPES]
POSSIBLE_CHORDS = [f"{note}:{chord}" for (note, chord) in CHORD_IND_PAIRS]
#POSSIBLE_CHORDS = [f"{note}:{chord}" for note in NOTE_NAMES for chord in CHORD_TYPES]
POSSIBLE_CHORDS += ['N']  # N for no chord

assert len(POSSIBLE_CHORDS) == 12*44+1, f"There should be {12*44+1} possible chords, but there are {len(POSSIBLE_CHORDS)}. Check the NOTE_NAMES and CHORD_TYPES lists."


def to_base_9(n):  # converts a decimal integer to (at least 3) base-9 digits
    if n == 0: return [0, 0, 0]
    digits = []
    while n:
        digits.append(n % 9)
        n //= 9
    while len(digits) < 3:  # add leading zeros
        digits.append(0)
    return digits[::-1]


def chord_num_to_color(cn, scale=30):
    # "embeddings" for chords, from (0,0,30) up to (240,240,240) in each (RGB) channel, in steps of 30
    color = to_base_9(cn + 1)
    return tuple(x * scale for x in color)


def color_to_chord_num(color, scale=30, warnings_on=False):
    # reverse of chord_num_to_color; note that the color digits are read back-to-front
    out = sum([x // scale * 9**i for i, x in enumerate(color[::-1])]) - 1
    if out < 0:
        if warnings_on:
            print(f"color_to_chord_num: Warning: out should be equal to or greater than 0: color = {color}, out = {out}. Wrapping around to {len(POSSIBLE_CHORDS)+out}")
        out = len(POSSIBLE_CHORDS) + out
    return out


def simplify_chord(chord_name):
    """Simplifies chord names by applying a few rules:
    1. get rid of parenthesized extensions, e.g. change "A:maj(11)" to just "A:maj"
    2. remove bass notes, e.g. map "A:7/3", "A:7/5" and "A:7/b7" all to just "A:7"
    3. remove suspension markings, e.g. sus2, sus4
    4. maybe? collapse high-numbered added notes like "G:min11" & "G:min13" -> "G:min"
    """
    chord_name = re.sub(r'\(.*', '', chord_name)  # 1
    chord_name = re.sub(r'\/.*', '', chord_name)  # 2
    chord_name = re.sub(r'sus.*', '', chord_name)  # 3
    return chord_name
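
# Quick sketch of the chord-number <-> color round trip and of simplify_chord (a hypothetical check
# function; values assume the default scale=30 and the POSSIBLE_CHORDS ordering above):
def _example_color_roundtrip():
    assert chord_num_to_color(0) == (0, 0, 30)        # chord index 0 is POSSIBLE_CHORDS[0] == 'C:aug'
    assert color_to_chord_num((0, 0, 30)) == 0
    assert simplify_chord("A:maj(11)") == "A:maj"     # rule 1: drop parenthesized extensions
    assert simplify_chord("A:7/b7") == "A:7"          # rule 2: drop bass notes
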
def get_unique_indices(data):
    """Returns the indices of non-repeating values in a list

    Args:
        data: A list of any data type.
              Example: data = [0, 1, 4, 1, 5, 5, 5, 6, 10, 6, 6, 5]

    Returns:
        A list of indices for non-repeating values.
        Example: result = [0, 1, 2, 3, 6, 7, 8, 10, 11]
    """
    return [i for i, (val, next_val) in enumerate(zip(data, data[1:])) if val != next_val] + [len(data) - 1]


def get_nonrepeated_values(data, indices=None):
    """Returns the non-repeating values in a list

    Args:
        data: A list of any data type.
              Example: data = [0, 1, 4, 1, 5, 5, 5, 6, 10, 6, 6, 5]

    Returns:
        A list of non-repeating values.
        Example: returns [0, 1, 4, 1, 5, 6, 10, 6, 5]
    """
    if indices is None: indices = get_unique_indices(data)
    return [data[i] for i in indices]


def most_freq_or_first(arr, debug=False):
    "returns the most frequent value in arr, or if multiple values are equally frequent, the first such value"
    assert len(arr.shape) == 1, "arr must be 1D"
    savearr = arr.copy()
    if debug: print("most_freq_or_first: arr = ", arr)
    if savearr.min() < 0:  # if there are negative values, we need to shift them up to 0
        arr = arr - savearr.min()
    bc = np.bincount(arr)
    try:
        if np.any(arr < 0): bc[arr < 0] = 0  # don't include negative arr values when checking for most frequent
        bc[bc != bc.max()] = 0  # only interested in the most frequent values
    except Exception as e:
        print("Exception ", e)
        print("most_freq_or_first: arr.shape = ", arr.shape)
        print("most_freq_or_first: arr = ", arr)
        print("most_freq_or_first: bc.shape = ", bc.shape)
        raise e
    out = np.argmax(bc)
    if savearr.min() < 0:  # shift numbers back down
        out = out + savearr.min()
    assert out.max() <= arr.max(), f"out.max() = {out.max()} should be less than or equal to arr.max() = {arr.max()}"
    return out


def most_freq_or_first_every(arr,
                             every=4,  # pixels per chord label. 4 = every quarter note
                             ):
    "used to grab the most frequent chord label within each group of `every` pixels, assuming we start on a beat. arr = chord label indices, e.g. in 0..528"
    assert len(arr.shape) == 1, "arr must be 1D"
    remainder = len(arr) % every
    if remainder != 0:
        arr = np.pad(arr, (0, every - remainder), mode='constant', constant_values=(0, arr[-remainder]))
        #print("most_freq_or_first_every: Warning: Padding arr with last beat value on end. new arr =", arr)
    check = arr.reshape((-1, every))
    out = np.array([most_freq_or_first(a) for a in arr.reshape((-1, every))])
    if out.max() > arr.max():
        for i, c in enumerate(check):
            mfof = most_freq_or_first(c)
            if mfof > c.max():
                print(f"i={i}, c={c}, most_freq_or_first(c)={mfof}")
        raise ValueError(f"out.max() = {out.max()} should be less than or equal to arr.max() = {arr.max()}")
    return out


def chord_str_to_pair(chord_str):
    "converts a chord string to a pair of (note, chord_type) indices"
    if chord_str == 'N': return (-1, -1)
    note, chord_type = chord_str.split(':')
    note_ind = NOTE_NAMES.index(note)
    chord_type_ind = CHORD_TYPES.index(chord_type)
    return (note_ind, chord_type_ind)


def chords_str_to_pairs(chords_str):
    for chord_str in chords_str.split(','):
        yield chord_str_to_pair(chord_str)


def chords_str_to_inds(chords_str):
    for chord_str in chords_str.split(','):
        yield POSSIBLE_CHORDS.index(chord_str)


def pair_to_chord_index(pair):
    "converts a pair of (note, chord_type) indices to a single chord index"
    note_ind, chord_type_ind = pair
    return note_ind * len(CHORD_TYPES) + chord_type_ind


def chord_index_to_pair(ci):
    "converts a single chord index to a pair of (note, chord_type) indices"
    note_ind = ci // len(CHORD_TYPES)
    chord_type_ind = ci % len(CHORD_TYPES)
    return (note_ind, chord_type_ind)


def chord_index_to_str(ci):
    "converts a single chord index to a chord string"
    return POSSIBLE_CHORDS[ci]
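
# Quick round-trip sketch for the conversion helpers above (a hypothetical check function, assuming
# the NOTE_NAMES / CHORD_TYPES orderings defined at the top of this file):
def _example_chord_index_roundtrip():
    assert chord_str_to_pair('G:min7') == (7, 25)      # 'G' is note 7, 'min7' is chord type 25
    assert pair_to_chord_index((7, 25)) == 333
    assert chord_index_to_pair(333) == (7, 25)
    assert chord_index_to_str(333) == 'G:min7'
    assert chord_str_to_pair('N') == (-1, -1)          # 'N' (no chord) is handled specially
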
class ChordEmbedding(nn.Module):
    def __init__(self, chord_emb_dim=8, note_emb_dim=8, type_emb_dim=8, debug=False):
        super(ChordEmbedding, self).__init__()
        self.emb_note = nn.Embedding(len(NOTE_NAMES) + 1, note_emb_dim)  # +1 for "N", i.e. no chord
        self.emb_type = nn.Embedding(len(CHORD_TYPES), type_emb_dim)
        self.compactify = nn.Linear(note_emb_dim + type_emb_dim, chord_emb_dim)
        self.chord_emb_dim = chord_emb_dim
        self.debug = debug
        self.zero_vec = torch.zeros((1, self.chord_emb_dim))

    def forward(self, chord_inds: torch.Tensor, debug=False):
        """chord_inds should have dimensions (B) where B is the batch size;
        each value is the index of the chord in the vocabulary.
        Wherever chord_inds equals len(POSSIBLE_CHORDS), we want to return a zero vector;
        otherwise we return the embedding."""
        if chord_inds.max() > len(POSSIBLE_CHORDS):
            torch.set_printoptions(threshold=10000)
            print(f"\nchord_inds.max() = {chord_inds.max()} but len(POSSIBLE_CHORDS) = {len(POSSIBLE_CHORDS)}. \nchord_inds = {chord_inds}")
            raise ValueError("chord_inds.max() should be less than len(POSSIBLE_CHORDS)")
        note_inds, type_inds = chord_inds // len(CHORD_TYPES), chord_inds % len(CHORD_TYPES)
        # Note that for the 'N' chord, where chord_ind == len(POSSIBLE_CHORDS)-1, we get note_inds = len(NOTE_NAMES)
        # and type_inds = 0; that's why self.emb_note has len(NOTE_NAMES)+1 entries.
        if debug:
            print("note_inds, type_inds = ", note_inds, type_inds)
            print("note_inds.max(), type_inds.max() = ", note_inds.max(), type_inds.max())
        note_emb = self.emb_note(note_inds)
        type_emb = self.emb_type(type_inds)
        if debug: print("\nnote_emb.shape, type_emb.shape = ", note_emb.shape, type_emb.shape)
        combined_emb = torch.cat((note_emb, type_emb), dim=1)
        if debug: print("combined_emb.shape = ", combined_emb.shape)
        x = self.compactify(combined_emb)
        if debug: print("ce: x.shape, self.chord_emb_dim = ", x.shape, self.chord_emb_dim)
        return x


class ChordAE(nn.Module):
    """Maybe not needed: Autoencoder for training chord embeddings?
    Note: we don't really need an AE for the full model; we can get by with just the encoder (and no decoder),
    but the AE is useful for exploring how few dimensions we can get away with."""
    def __init__(self, chord_vocab_size=len(POSSIBLE_CHORDS), chord_emb_dim=8):
        super(ChordAE, self).__init__()
        self.encoder = ChordEmbedding(chord_emb_dim)
        self.decoder = nn.Linear(chord_emb_dim, chord_vocab_size)  # could do better, maybe

    def forward(self, x, debug=False):
        x = self.encoder(x)
        x = self.decoder(x)
        return x


def abs_seq_to_rel_seq(seq: torch.Tensor):
    """converts a batch of absolute sequences of chord indices to a batch of relative sequences of chord indices:
    subtract the note of the first element in each batch from all the other note indices, modulo len(NOTE_NAMES);
    leave the first element unchanged, and overwrite any 'N' chords with... something else? TODO
    """
    assert len(seq.shape) == 2, f"seq should be 2D, but seq.shape = {seq.shape}"
    # decompose seq into two tensors, one of notes and one of chord types
    note_inds, type_inds = seq // len(CHORD_TYPES), seq % len(CHORD_TYPES)
    # for note_inds < 12, subtract the first element's note index from the rest, modulo len(NOTE_NAMES), i.e. 12
    note_inds2 = note_inds.clone()
    note_inds2[:, 1:] = (note_inds2[:, 1:] - note_inds2[:, 0].unsqueeze(1)) % len(NOTE_NAMES)
    # 'N' chords: wherever note_inds == 12, overwrite note_inds2 with 12
    note_inds2[note_inds == len(NOTE_NAMES)] = len(NOTE_NAMES)
    # recompose seq: these are no longer chords, they are chord *changes* relative to the first chord
    changes_seq = note_inds2 * len(CHORD_TYPES) + type_inds
    return changes_seq
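
# Rough worked example for abs_seq_to_rel_seq (a hypothetical helper with illustrative values, assuming
# the index layout above): the first chord is kept as-is; every later entry keeps its chord type, but its
# note index is replaced by (note_index - first_note_index) % 12.
def _example_abs_to_rel_seq():
    seq = torch.tensor([[333, 4, 92]])        # ['G:min7', 'C:maj', 'D:maj'] as absolute chord indices
    rel = abs_seq_to_rel_seq(seq)
    assert rel.tolist() == [[333, 224, 312]]  # 'C:maj' (note 0) -> note 5, 'D:maj' (note 2) -> note 7
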
class ChordSeqEncoder(nn.Module):
    """Encoder for sequences of chords:
    We embed the first chord, then we embed the CHANGES in chords thereafter (using modulo-12 arithmetic
    on the bass note) (4 chords per bar x 32 bars = 128 chords), and then pass the sequence of chord
    embeddings through a sequence model (an LSTM for now; could use a Transformer or something else later)
    to generate a [256]-dimensional embedding of the sequence of chord embeddings.
    """
    def __init__(self, chord_emb_dim=8, seq_len=512 // 4, seq_emb_dim=256, hidden_dim=512, dropout=0.2):
        super(ChordSeqEncoder, self).__init__()
        self.chord_encoder = ChordEmbedding()
        self.seq_encoder = nn.LSTM(chord_emb_dim, seq_emb_dim, batch_first=True, num_layers=2, dropout=dropout)
        self.seq_len = seq_len

    def forward(self, bs):
        "bs should have dimensions (B, S) where B is the batch size and S is the length of the sequence of chord indices"
        B, S = bs.shape
        changes_seq = abs_seq_to_rel_seq(bs)  # convert to a relative sequence of chord indices
        # get chord embeddings for every chord in the batch, across the whole sequence
        x = self.chord_encoder(changes_seq.flatten())
        # reshape x into (B, S, E) where B is the batch size, S is the sequence length, and E is the chord embedding dimension
        x = x.view(B, S, -1)
        E = x.shape[-1]
        #print("before seq_encoder, x.shape = ", x.shape)
        #x, _ = self.seq_encoder(x)
        output, (hidden, cell) = self.seq_encoder(x)
        # the output of forward should be a 2-D tensor of shape (B, SE) where SE = seq_emb_dim
        x = hidden[0, :, :]  # use the LSTM's final hidden state (first layer) as the embedding of the sequence
        #print("after seq_encoder, x.shape = ", x.shape)
        return x
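
# Shape-check sketch for ChordSeqEncoder (a hypothetical helper; 128 = 4 chords/bar x 32 bars,
# 256 = the default seq_emb_dim):
def _example_chord_seq_encoder_shapes():
    cse = ChordSeqEncoder()
    chord_inds = torch.randint(0, len(POSSIBLE_CHORDS), (2, 128))  # (B, S) absolute chord indices
    emb = cse(chord_inds)
    assert emb.shape == (2, 256)  # (B, seq_emb_dim)
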
class ChordSeqAE(nn.Module):
    """Chord Sequence Autoencoder. For pretraining a ChordSeqEncoder."""
    def __init__(self, chord_emb_dim=8, seq_len=512 // 4, seq_emb_dim=256, hidden_dim=512,
                 chord_vocab_size=len(POSSIBLE_CHORDS), vae_scale=0.1):
        super(ChordSeqAE, self).__init__()
        self.encoder = ChordSeqEncoder(chord_emb_dim=chord_emb_dim, seq_len=seq_len,
                                       seq_emb_dim=seq_emb_dim, hidden_dim=hidden_dim)
        # made the decoder a sequence of linear layers with a ReLU in between
        self.decoder = nn.Sequential(
            nn.Linear(seq_emb_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, seq_len * chord_vocab_size)
        )
        self.chord_vocab_size = chord_vocab_size
        self.vae_scale = vae_scale

    def forward(self, bs, debug=False):
        "bs should have dimensions (B, S) where B is the batch size and S is the length of the sequence of chord indices"
        if debug: print("ChordSeqAE: bs.shape = ", bs.shape)
        B, S = bs.shape
        x = self.encoder(bs)
        if debug: print("ChordSeqAE: encoded x.shape = ", x.shape)
        if self.vae_scale > 0 and self.training:
            x = x + self.vae_scale * (x.max() - x.min()) * torch.randn_like(x)
        x = self.decoder(x)
        x = x.view(B, S, -1)
        if debug: print("ChordSeqAE: decoded x.shape = ", x.shape)
        return x


def chord_seq_from_img(img: Image.Image,
                       every=8,  # was imagining every beat (every=4), but looking at the data, it seems the smallest chord label is 8 pixels wide
                       debug=False):
    """extracts a sequence of chord indices from a pianoroll image;
    hopefully the dataloader will mean we can just do one image at a time and it'll batch them
    """
    if debug: print("img.size, img.min, img.max = ", img.size, np.array(img).min(), np.array(img).max())
    if img.size[0] == img.size[1]:  # if the image is square, make it rectangular
        img = square_to_rect(img)
    img_arr = np.array(img)
    top_row = img_arr[CHORD_BORDER // 2]  # all x's along y = CHORD_BORDER/2
    if debug:
        img.save("chord_seq_from_img.png")
        print("img_arr.shape = ", img_arr.shape)
        print("top_row.shape = ", top_row.shape)
        print("top_row = ", top_row)
    chord_seq = np.array([color_to_chord_num(tuple(c)) for c in top_row])
    if chord_seq.max() >= len(POSSIBLE_CHORDS):
        print(f"chord_seq.max = {chord_seq.max()} should be less than len(POSSIBLE_CHORDS) = {len(POSSIBLE_CHORDS)}\nchord_seq = {chord_seq}")
        indices = np.where(chord_seq >= len(POSSIBLE_CHORDS))[0]
        print("indices, chord_seq[indices], top_row[indices] = ", indices, chord_seq[indices], top_row[indices])
        raise ValueError("chord_seq.max() should be less than len(POSSIBLE_CHORDS)")
    chord_seq_beats = most_freq_or_first_every(chord_seq, every=every)
    assert chord_seq_beats.max() <= chord_seq.max(), f"chord_seq_beats.max() = {chord_seq_beats.max()} should be less than or equal to chord_seq.max() = {chord_seq.max()}"
    if debug: print("chord_seq_beats, len(POSSIBLE_CHORDS) = ", chord_seq_beats, len(POSSIBLE_CHORDS))
    assert chord_seq_beats.max() < len(POSSIBLE_CHORDS), f"chord_seq_beats.max() should be less than len(POSSIBLE_CHORDS) = {len(POSSIBLE_CHORDS)}"
    return torch.tensor(chord_seq_beats)


def chord_seq_from_img_tensor_batch(img_tensor_batch: torch.Tensor, every=8, debug=False):
    """extracts a sequence of chord indices from a batch of pianoroll images"""
    batch_size = img_tensor_batch.shape[0]
    itb = (img_tensor_batch + 1.0) * 127.5  # rescale from -1..1 to 0..255
    chord_seqs = []
    for i in range(batch_size):  # TODO: there may be a faster way to do this with tensor ops; converting to images and back is slow
        img = Image.fromarray(np.round(itb[i].cpu().permute(1, 2, 0).numpy()).astype(np.uint8))
        img = square_to_rect(img)
        chord_seq = chord_seq_from_img(img, every=every)
        chord_seqs.append(chord_seq)
    return torch.stack(chord_seqs).to(img_tensor_batch.device)


def img_batch_to_seq_emb(img_tensor_batch: torch.Tensor, chord_seq_encoder: nn.Module, every=8, debug=False):
    """converts a batch of pianoroll images to a batch of chord sequence embeddings"""
    chord_seq_batch = chord_seq_from_img_tensor_batch(img_tensor_batch, every=every, debug=debug)
    cs_emb = chord_seq_encoder(chord_seq_batch)
    return cs_emb  # TODO: test it!
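
# Rough usage sketch for the full image -> chord-sequence-embedding pipeline (untested, per the TODO
# above). The shapes in the comments assume 256x256 square images whose unwrapped piano roll is
# 512 pixels wide, so with every=8 each image yields 64 chord indices:
def _example_img_batch_to_seq_emb(img_tensor_batch):
    # img_tensor_batch: (B, 3, 256, 256), values in -1..1
    cse = ChordSeqEncoder()
    cs_emb = img_batch_to_seq_emb(img_tensor_batch, cse, every=8)  # (B, 64) chord inds -> (B, 256) emb
    return cs_emb
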
if __name__ == '__main__':  # FOR TESTING/DEV ONLY
    import random

    def make_image_tensor_batch(batch_size=2):
        """FOR TESTING/DEV ONLY: makes a batch of chord-endowed pianoroll (square) images,
        so I can iterate on other parts of this faster without having to spin up Crowson's training code
        every time while I write code here.
        shape = (B, 3, 256, 256), normalization = -1.0 to 1.0
        """
        img_batch = torch.zeros((batch_size, 3, 256, 256))
        for i in range(batch_size):
            n = i + 1  # np.random.randint(0, 909)
            img_filename = f"/data/POP909-Dataset/images_128_rg_chords_TOTAL/{n:03}_TOTAL.png"  # place to grab images from
            img = Image.open(img_filename).convert('RGB')
            img = img.crop((0, 0, 512, 128))  # crop to 512 pixels wide
            img = rect_to_square(img)
            img_batch[i] = torch.tensor(np.array(img)).permute(2, 0, 1).float() / 127.5 - 1.0  # normalization done by the dataloader makes images -1 to 1
        return img_batch

    # quick check of the chord-number <-> color mapping
    for cn in range(len(POSSIBLE_CHORDS)):
        color = chord_num_to_color(cn)
        print("cn, color = ", cn, color)
        cn2 = color_to_chord_num(color)
        assert cn2 == cn, f"cn2={cn2} should be cn={cn}, color={color}"

    if len(sys.argv) <= 1:
        print("Testing suite. Usage: python chords.py ")
        sys.exit(1)
    some_arg = sys.argv[1]

    batch_size = 2
    img_tensor_batch = make_image_tensor_batch(batch_size=batch_size)
    print("img_tensor_batch.shape = ", img_tensor_batch.shape)
    print("img_tensor_batch.min(), img_tensor_batch.max() = ", img_tensor_batch.min(), img_tensor_batch.max())
    chord_seq_batch = chord_seq_from_img_tensor_batch(img_tensor_batch, every=8, debug=False)
    print("chord_seq_batch.shape = ", chord_seq_batch.shape)
    print(f"chord_seq_batch = \n{chord_seq_batch}")
    cse = ChordSeqEncoder()
    cs_emb = cse(chord_seq_batch)
    print("cs_emb.shape = ", cs_emb.shape)
    #print(f"cs_emb = \n{cs_emb}")
    sys.exit(0)

    # everything below is unreachable due to the sys.exit(0) above; scratch code from development:
    img_filename = some_arg
    img = Image.open(img_filename).convert('RGB')
    chord_ind_seq = chord_seq_from_img(img, debug=False)
    print("chord_ind_seq = ", chord_ind_seq)
    print("len(chord_ind_seq) = ", len(chord_ind_seq))
    chord_embedder = ChordEmbedding(len(POSSIBLE_CHORDS))
    #print("chord_embeddings = ", chord_embedder(chord_ind_seq))
    sys.exit(0)

    #chords_str = some_arg
    #cis = chords_str_to_inds(chords_str)
    cis = chord_ind_seq
    for ci in cis:
        print("\n-------")
        #ci = pair_to_chord_index(pair)
        pair = chord_index_to_pair(ci)
        print(f"Input: pair = {pair}, ci = {ci}")
        color = chord_num_to_color(ci)
        print(color)
        cn2 = color_to_chord_num(color)
        out_str = chord_index_to_str(cn2)
        print(f"Output: cn2 = {cn2}, out_str = {out_str}")
        print("Embedding: ")
        with torch.no_grad():
            x = torch.tensor([ci])
            print(chord_embedder(x))