import os
import torch
import torch.nn as nn
import pytorch_lightning as pl
from sklearn import metrics
from transformers import AutoModelForAudioClassification
import numpy as np

class PositionalEncoding(nn.Module):
    def __init__(self, d_model, max_len=100):
        super().__init__()
        encoding = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-np.log(10000.0) / d_model))
        encoding[:, 0::2] = torch.sin(position * div_term)
        encoding[:, 1::2] = torch.cos(position * div_term)
        # Register as a non-trainable, non-persistent buffer so the table follows the
        # module across devices without appearing in the state dict.
        self.register_buffer("encoding", encoding.unsqueeze(0), persistent=False)  # Shape: (1, max_len, d_model)

    def forward(self, x):
        # x: (batch_size, seq_len, d_model); add the matching slice of the table.
        seq_len = x.size(1)
        return x + self.encoding[:, :seq_len, :]

class FeedforwardModelAttnCK(nn.Module):
    def __init__(self, input_size, output_size, nhead=8, num_layers=1, dropout_rate=0.1,
                 num_key=2, num_chords=158, num_chords_root=14, num_chords_attr=14,
                 key_emb_dim=4, chord_emb_dim=8, chord_root_emb_dim=4, chord_attr_emb_dim=4):
        super().__init__()
        self.d_model = 512  # width of the fused (MERT + chord + key) representation

        # Chord-transformer width = concatenated root + attribute embedding size
        self.d_model_transformer = chord_root_emb_dim + chord_attr_emb_dim

        # Embedding layers for chords and keys
        self.chord_root_embedding = nn.Embedding(num_chords_root, chord_root_emb_dim)
        self.chord_attr_embedding = nn.Embedding(num_chords_attr, chord_attr_emb_dim)

        nn.init.xavier_uniform_(self.chord_root_embedding.weight)
        nn.init.xavier_uniform_(self.chord_attr_embedding.weight)
        
        # Positional encoding for chord progression
        self.positional_encoding = PositionalEncoding(self.d_model_transformer)

        # Transformer encoder over the chord progression
        # (dim_feedforward, dropout, and num_layers are fixed here; the dropout_rate and
        # num_layers constructor arguments are not used)
        self.chord_transformer = nn.TransformerEncoder(
            nn.TransformerEncoderLayer(d_model=self.d_model_transformer, nhead=nhead,
                                       dim_feedforward=64, dropout=0.1, batch_first=True),
            num_layers=2,
        )
        # Input projection for the concatenated features
        # (+1 accounts for the scalar key feature appended in forward())
        self.input_proj = nn.Sequential(
            nn.Linear(input_size + self.d_model_transformer + 1, self.d_model),
            nn.ReLU(),
        )

        # Output projection
        self.output_proj = nn.Sequential(
            nn.Linear(self.d_model, 256),
            nn.ReLU(),
            nn.Linear(256, output_size),
        )

    def forward(self, model_input_dic):
        x_mert = model_input_dic["x_mert"]
        x_chord_root = model_input_dic["x_chord_root"]
        x_chord_attr = model_input_dic["x_chord_attr"]
        x_key = model_input_dic["x_key"]

        # The key is used directly as a float-valued feature; no embedding lookup is applied.
        key_embedding = x_key.float()

        chord_root_embedding = self.chord_root_embedding(x_chord_root)  # Shape: (batch_size, seq_len, chord_root_emb_dim)
        chord_attr_embedding = self.chord_attr_embedding(x_chord_attr)  # Shape: (batch_size, seq_len, chord_attr_emb_dim)
        
        # Concatenate root and attribute embeddings
        chord_combined_embedding = torch.cat(
            (chord_root_embedding, chord_attr_embedding), dim=-1
        )  # Shape: (batch_size, seq_len, chord_root_emb_dim + chord_attr_emb_dim)

        # Positional encoding and chord transformer
        chord_combined_embedding = self.positional_encoding(chord_combined_embedding)

        # Prepend an all-zeros CLS token; its transformed state summarizes the progression
        cls_token = torch.zeros_like(chord_combined_embedding[:, :1, :])

        chord_embedding_with_cls = torch.cat([cls_token, chord_combined_embedding], dim=1)  # Add CLS at the start
        chord_embedding_transformed = self.chord_transformer(chord_embedding_with_cls)  # Shape: (batch_size, seq_len+1, d_model_transformer)

        chord_embedding_cls = chord_embedding_transformed[:, 0, :]  # Shape: (batch_size, d_model_transformer)
        
        # Combine all features
        combined_features = torch.cat((x_mert, chord_embedding_cls, key_embedding), dim=1)
        # Input projection
        combined_features = self.input_proj(combined_features)  # Shape: (batch_size, d_model)
        
        output = self.output_proj(combined_features)  # Shape: (batch_size, output_size)
        return output
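

# ---------------------------------------------------------------------------
# Minimal smoke-test sketch (not part of the original module). The dimensions
# below are assumptions (e.g. input_size=768 for a pooled MERT embedding and
# output_size=8 emotion targets); adjust them to the real data pipeline.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    batch_size, chord_seq_len, mert_dim, num_targets = 2, 16, 768, 8

    # Standalone check of the sinusoidal table: adding it must preserve the shape.
    pe = PositionalEncoding(d_model=8)
    assert pe(torch.zeros(batch_size, chord_seq_len, 8)).shape == (batch_size, chord_seq_len, 8)

    model = FeedforwardModelAttnCK(input_size=mert_dim, output_size=num_targets)

    dummy_batch = {
        "x_mert": torch.randn(batch_size, mert_dim),                        # pooled MERT features
        "x_chord_root": torch.randint(0, 14, (batch_size, chord_seq_len)),  # chord root ids
        "x_chord_attr": torch.randint(0, 14, (batch_size, chord_seq_len)),  # chord attribute ids
        "x_key": torch.randint(0, 2, (batch_size, 1)),                      # scalar key feature
    }

    with torch.no_grad():
        out = model(dummy_batch)
    print(out.shape)  # expected: torch.Size([2, 8])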