jhtonyKoo committed on
Commit
6fc042a
1 Parent(s): 694161d
app.py CHANGED
@@ -25,7 +25,14 @@ def process_audio(input_audio, reference_audio, perform_ito):
     if ito_output_audio is not None:
         sf.write("ito_output_mastered.wav", ito_output_audio.T, sr)
 
-    return "output_mastered.wav", "ito_output_mastered.wav" if ito_output_audio is not None else None
+    # Generate parameter output strings
+    param_output = mastering_transfer.get_param_output_string(predicted_params)
+    ito_param_output = mastering_transfer.get_param_output_string(ito_predicted_params) if ito_predicted_params is not None else "ITO not performed"
+
+    # Generate top 10 differences if ITO was performed
+    top_10_diff = mastering_transfer.get_top_10_diff_string(predicted_params, ito_predicted_params) if ito_predicted_params is not None else "ITO not performed"
+
+    return "output_mastered.wav", "ito_output_mastered.wav" if ito_output_audio is not None else None, param_output, ito_param_output, top_10_diff
 
 def process_youtube(input_url, reference_url, perform_ito):
     input_audio = download_youtube_audio(input_url)
@@ -41,7 +48,12 @@ with gr.Blocks() as demo:
         submit_button = gr.Button("Process")
         output_audio = gr.Audio(label="Output Audio")
         ito_output_audio = gr.Audio(label="ITO Output Audio")
-        submit_button.click(process_audio, inputs=[input_audio, reference_audio, perform_ito], outputs=[output_audio, ito_output_audio])
+        param_output = gr.Textbox(label="Predicted Parameters", lines=10)
+        ito_param_output = gr.Textbox(label="ITO Predicted Parameters", lines=10)
+        top_10_diff = gr.Textbox(label="Top 10 Parameter Differences", lines=10)
+        submit_button.click(process_audio,
+                            inputs=[input_audio, reference_audio, perform_ito],
+                            outputs=[output_audio, ito_output_audio, param_output, ito_param_output, top_10_diff])
 
     with gr.Tab("YouTube URLs"):
         input_url = gr.Textbox(label="Input YouTube URL")
@@ -50,6 +62,11 @@ with gr.Blocks() as demo:
         submit_button_yt = gr.Button("Process")
         output_audio_yt = gr.Audio(label="Output Audio")
         ito_output_audio_yt = gr.Audio(label="ITO Output Audio")
-        submit_button_yt.click(process_youtube, inputs=[input_url, reference_url, perform_ito_yt], outputs=[output_audio_yt, ito_output_audio_yt])
+        param_output_yt = gr.Textbox(label="Predicted Parameters", lines=10)
+        ito_param_output_yt = gr.Textbox(label="ITO Predicted Parameters", lines=10)
+        top_10_diff_yt = gr.Textbox(label="Top 10 Parameter Differences", lines=10)
+        submit_button_yt.click(process_youtube,
+                               inputs=[input_url, reference_url, perform_ito_yt],
+                               outputs=[output_audio_yt, ito_output_audio_yt, param_output_yt, ito_param_output_yt, top_10_diff_yt])
 
 demo.launch()
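
A minimal sketch (not part of the commit; component and handler names are illustrative) of the Gradio wiring pattern the change above relies on: the click handler returns a tuple whose elements are matched positionally to the components listed in outputs.

import gradio as gr

def handler(audio_path):
    # return one value per output component, in order
    return "output_mastered.wav", "EQ:\n band0_gain_db: 1.2345"

with gr.Blocks() as demo:
    audio_in = gr.Audio(label="Input Audio", type="filepath")
    btn = gr.Button("Process")
    audio_out = gr.Audio(label="Output Audio")
    params_out = gr.Textbox(label="Predicted Parameters", lines=10)
    btn.click(handler, inputs=[audio_in], outputs=[audio_out, params_out])

demo.launch()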
inference.py CHANGED
@@ -9,8 +9,7 @@ import sys
 currentdir = os.path.dirname(os.path.realpath(__file__))
 sys.path.append(os.path.dirname(currentdir))
 from networks import Dasp_Mastering_Style_Transfer, Effects_Encoder
-from modules import FrontEnd, BackEnd
-from modules.loss import AudioFeatureLoss
+from modules.loss import AudioFeatureLoss, Loss
 
 class MasteringStyleTransfer:
     def __init__(self, args):
@@ -205,6 +204,59 @@ class MasteringStyleTransfer:
             else:
                 print(f" {fx_params}")
 
+    def get_param_output_string(self, params):
+        if params is None:
+            return "No parameters available"
+
+        output = []
+        for fx_name, fx_params in params.items():
+            output.append(f"{fx_name.upper()}:")
+            if isinstance(fx_params, dict):
+                for param_name, param_value in fx_params.items():
+                    if isinstance(param_value, torch.Tensor):
+                        param_value = param_value.item()
+                    output.append(f" {param_name}: {param_value:.4f}")
+            elif isinstance(fx_params, torch.Tensor):
+                output.append(f" {fx_params.item():.4f}")
+            else:
+                output.append(f" {fx_params:.4f}")
+
+        return "\n".join(output)
+
+    def get_top_10_diff_string(self, initial_params, ito_params):
+        if initial_params is None or ito_params is None:
+            return "Cannot compare parameters"
+
+        all_diffs = []
+        for fx_name in initial_params.keys():
+            if isinstance(initial_params[fx_name], dict):
+                for param_name in initial_params[fx_name].keys():
+                    initial_value = initial_params[fx_name][param_name]
+                    ito_value = ito_params[fx_name][param_name]
+
+                    param_range = self.mastering_converter.fx_processors[fx_name].param_ranges[param_name]
+                    normalized_diff = abs((ito_value - initial_value) / (param_range[1] - param_range[0]))
+
+                    all_diffs.append((fx_name, param_name, initial_value.item(), ito_value.item(), normalized_diff.item()))
+            else:
+                initial_value = initial_params[fx_name]
+                ito_value = ito_params[fx_name]
+                normalized_diff = abs(ito_value - initial_value)
+                all_diffs.append((fx_name, 'width', initial_value.item(), ito_value.item(), normalized_diff.item()))
+
+        top_diffs = sorted(all_diffs, key=lambda x: x[4], reverse=True)[:10]
+
+        output = ["Top 10 parameter differences (sorted by normalized difference):"]
+        for fx_name, param_name, initial_value, ito_value, normalized_diff in top_diffs:
+            output.append(f"{fx_name.upper()} - {param_name}:")
+            output.append(f" Initial: {initial_value:.4f}")
+            output.append(f" ITO: {ito_value:.4f}")
+            output.append(f" Normalized Diff: {normalized_diff:.4f}")
+            output.append("")
+
+        return "\n".join(output)
+
+
 def reload_weights(model, ckpt_path, device):
     checkpoint = torch.load(ckpt_path, map_location=device)
 
@@ -215,6 +267,7 @@ def reload_weights(model, ckpt_path, device):
         new_state_dict[name] = v
     model.load_state_dict(new_state_dict, strict=False)
 
+
 if __name__ == "__main__":
     basis_path = '/data2/tony/Mastering_Style_Transfer/results/dasp_tcn_tuneenc_daspman_loudnessnorm/ckpt/1000/'
 
@@ -258,5 +311,3 @@ if __name__ == "__main__":
     if ito_output_audio is not None:
         sf.write("ito_output_mastered.wav", ito_output_audio.T, sr)
 
-
-
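For reference, a hedged sketch (not from the repository) of the ranking that get_top_10_diff_string performs: each parameter change is scaled by its processor's (min, max) range before sorting, so parameters with different units become comparable. The gain range below is a made-up example value.

import torch

initial = {"gain": {"gain_db": torch.tensor([[-3.0]])}}
ito     = {"gain": {"gain_db": torch.tensor([[-1.5]])}}
param_range = (-24.0, 24.0)  # hypothetical (min, max) for the gain processor

diff = abs((ito["gain"]["gain_db"] - initial["gain"]["gain_db"])
           / (param_range[1] - param_range[0]))
print(diff.item())  # 0.03125 -> used as the sort key for the top-10 list
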
modules/__init__.py ADDED
@@ -0,0 +1,2 @@
+from .front_back_end import *
+from .loss import *
modules/front_back_end.py ADDED
@@ -0,0 +1,240 @@
1
+ """ Front-end: processing raw data input """
2
+ import torch
3
+ import torch.nn as nn
4
+ import torchaudio.functional as ta_F
5
+ import torchaudio
6
+
7
+
8
+
9
+ class FrontEnd(nn.Module):
10
+ def __init__(self, channel='stereo', \
11
+ n_fft=2048, \
12
+ n_mels=128, \
13
+ sample_rate=44100, \
14
+ hop_length=None, \
15
+ win_length=None, \
16
+ window="hann", \
17
+ eps=1e-7, \
18
+ device=torch.device("cpu")):
19
+ super(FrontEnd, self).__init__()
20
+ self.channel = channel
21
+ self.n_fft = n_fft
22
+ self.n_mels = n_mels
23
+ self.sample_rate = sample_rate
24
+ self.hop_length = n_fft//4 if hop_length==None else hop_length
25
+ self.win_length = n_fft if win_length==None else win_length
26
+ self.eps = eps
27
+ if window=="hann":
28
+ self.window = torch.hann_window(window_length=self.win_length, periodic=True).to(device)
29
+ elif window=="hamming":
30
+ self.window = torch.hamming_window(window_length=self.win_length, periodic=True).to(device)
31
+ self.melscale_transform = torchaudio.transforms.MelScale(n_mels=self.n_mels, \
32
+ sample_rate=self.sample_rate, \
33
+ n_stft=self.n_fft//2+1).to(device)
34
+
35
+
36
+ def forward(self, input, mode):
37
+ # front-end function which channel-wise combines all demanded features
38
+ # input shape : batch x channel x raw waveform
39
+ # output shape : batch x channel x frequency x time
40
+ phase_output = None
41
+
42
+ front_output_list = []
43
+ for cur_mode in mode:
44
+ # Real & Imaginary
45
+ if cur_mode=="cplx":
46
+ if self.channel=="mono":
47
+ output = torch.stft(input, n_fft=self.n_fft, hop_length=self.hop_length, win_length=self.win_length, window=self.window)
48
+ elif self.channel=="stereo":
49
+ output_l = torch.stft(input[:,0], n_fft=self.n_fft, hop_length=self.hop_length, win_length=self.win_length, window=self.window)
50
+ output_r = torch.stft(input[:,1], n_fft=self.n_fft, hop_length=self.hop_length, win_length=self.win_length, window=self.window)
51
+ output = torch.cat((output_l, output_r), axis=-1)
52
+ if input.shape[-1] % round(self.n_fft/4) == 0:
53
+ output = output[:, :, :-1]
54
+ if self.n_fft % 2 == 0:
55
+ output = output[:, :-1]
56
+ front_output_list.append(output.permute(0, 3, 1, 2))
57
+ # Magnitude & Phase or Mel
58
+ elif "mag" in cur_mode or "mel" in cur_mode:
59
+ if self.channel=="mono":
60
+ cur_cplx = torch.stft(input, n_fft=self.n_fft, hop_length=self.hop_length, win_length=self.win_length, window=self.window, return_complex=True)
61
+ output = self.mag(cur_cplx).unsqueeze(-1)[..., 0:1]
62
+ if "mag_phase" in cur_mode:
63
+ phase = self.phase(cur_cplx)
64
+ if "mel" in cur_mode:
65
+ output = self.melscale_transform(output.squeeze(-1)).unsqueeze(-1)
66
+ elif self.channel=="stereo":
67
+ cplx_l = torch.stft(input[:,0], n_fft=self.n_fft, hop_length=self.hop_length, win_length=self.win_length, window=self.window, return_complex=True)
68
+ cplx_r = torch.stft(input[:,1], n_fft=self.n_fft, hop_length=self.hop_length, win_length=self.win_length, window=self.window, return_complex=True)
69
+ mag_l = self.mag(cplx_l).unsqueeze(-1)
70
+ mag_r = self.mag(cplx_r).unsqueeze(-1)
71
+ output = torch.cat((mag_l, mag_r), axis=-1)
72
+ if "mag_phase" in cur_mode:
73
+ phase_l = self.phase(cplx_l).unsqueeze(-1)
74
+ phase_r = self.phase(cplx_r).unsqueeze(-1)
75
+ output = torch.cat((mag_l, phase_l, mag_r, phase_r), axis=-1)
76
+ if "mel" in cur_mode:
77
+ output = torch.cat((self.melscale_transform(mag_l.squeeze(-1)).unsqueeze(-1), self.melscale_transform(mag_r.squeeze(-1)).unsqueeze(-1)), axis=-1)
78
+
79
+ if "log" in cur_mode:
80
+ output = torch.log(output+self.eps)
81
+
82
+ if input.shape[-1] % round(self.n_fft/4) == 0:
83
+ output = output[:, :, :-1]
84
+ if cur_mode!="mel" and self.n_fft % 2 == 0: # discard highest frequency
85
+ output = output[:, 1:]
86
+ front_output_list.append(output.permute(0, 3, 1, 2))
87
+
88
+ # combine all demanded features
89
+ if not front_output_list:
90
+ raise NameError("NameError at FrontEnd: check using features for front-end")
91
+ elif len(mode)!=1:
92
+ for i, cur_output in enumerate(front_output_list):
93
+ if i==0:
94
+ front_output = cur_output
95
+ else:
96
+ front_output = torch.cat((front_output, cur_output), axis=1)
97
+ else:
98
+ front_output = front_output_list[0]
99
+
100
+ return front_output
101
+
102
+
103
+ def mag(self, cplx_input, eps=1e-07):
104
+ # mag_summed = cplx_input.pow(2.).sum(-1) + eps
105
+ mag_summed = cplx_input.real.pow(2.) + cplx_input.imag.pow(2.) + eps
106
+ return mag_summed.pow(0.5)
107
+
108
+
109
+ def phase(self, cplx_input, ):
110
+ return torch.atan2(cplx_input.imag, cplx_input.real)
111
+ # return torch.angle(cplx_input)
112
+
113
+
114
+
115
+ class BackEnd(nn.Module):
116
+ def __init__(self, channel='stereo', \
117
+ n_fft=2048, \
118
+ hop_length=None, \
119
+ win_length=None, \
120
+ window="hann", \
121
+ eps=1e-07, \
122
+ orig_freq=44100, \
123
+ new_freq=16000, \
124
+ device=torch.device("cpu")):
125
+ super(BackEnd, self).__init__()
126
+ self.device = device
127
+ self.channel = channel
128
+ self.n_fft = n_fft
129
+ self.hop_length = n_fft//4 if hop_length==None else hop_length
130
+ self.win_length = n_fft if win_length==None else win_length
131
+ self.eps = eps
132
+ if window=="hann":
133
+ self.window = torch.hann_window(window_length=self.win_length, periodic=True).to(self.device)
134
+ elif window=="hamming":
135
+ self.window = torch.hamming_window(window_length=self.win_length, periodic=True).to(self.device)
136
+ self.resample_func_8k = torchaudio.transforms.Resample(orig_freq=orig_freq, new_freq=8000).to(self.device)
137
+ self.resample_func = torchaudio.transforms.Resample(orig_freq=orig_freq, new_freq=new_freq).to(self.device)
138
+
139
+ def magphase_to_cplx(self, magphase_spec):
140
+ real = magphase_spec[..., 0] * torch.cos(magphase_spec[..., 1])
141
+ imaginary = magphase_spec[..., 0] * torch.sin(magphase_spec[..., 1])
142
+ return torch.cat((real.unsqueeze(-1), imaginary.unsqueeze(-1)), dim=-1)
143
+
144
+
145
+ def forward(self, input, phase, mode):
146
+ # back-end function which convert output spectrograms into waveform
147
+ # input shape : batch x channel x frequency x time
148
+ # output shape : batch x channel x raw waveform
149
+
150
+ # convert to shape : batch x frequency x time x channel
151
+ input = input.permute(0, 2, 3, 1)
152
+ # pad highest frequency
153
+ pad = torch.zeros((input.shape[0], 1, input.shape[2], input.shape[3])).to(self.device)
154
+ input = torch.cat((pad, input), dim=1)
155
+
156
+ back_output_list = []
157
+ channel_count = 0
158
+ for i, cur_mode in enumerate(mode):
159
+ # Real & Imaginary
160
+ if cur_mode=="cplx":
161
+ if self.channel=="mono":
162
+ output = ta_F.istft(input[...,channel_count:channel_count+2], n_fft=self.n_fft, hop_length=self.hop_length, win_length=self.win_length, window=self.window).unsqueeze(1)
163
+ channel_count += 2
164
+ elif self.channel=="stereo":
165
+ cplx_spec = torch.cat([input[...,channel_count:channel_count+2], input[...,channel_count+2:channel_count+4]], dim=0)
166
+ output_wav = ta_F.istft(cplx_spec, n_fft=self.n_fft, hop_length=self.hop_length, win_length=self.win_length, window=self.window)
167
+ output = torch.cat((output_wav[:output_wav.shape[0]//2].unsqueeze(1), output_wav[output_wav.shape[0]//2:].unsqueeze(1)), dim=1)
168
+ channel_count += 4
169
+ back_output_list.append(output)
170
+ # Magnitude & Phase
171
+ elif cur_mode=="mag_phase" or cur_mode=="mag":
172
+ if self.channel=="mono":
173
+ if cur_mode=="mag":
174
+ input_spec = torch.cat((input[...,channel_count:channel_count+1], phase), axis=-1)
175
+ channel_count += 1
176
+ else:
177
+ input_spec = input[...,channel_count:channel_count+2]
178
+ channel_count += 2
179
+ cplx_spec = self.magphase_to_cplx(input_spec)
180
+ output = ta_F.istft(cplx_spec, n_fft=self.n_fft, hop_length=self.hop_length, win_length=self.win_length, window=self.window).unsqueeze(1)
181
+ elif self.channel=="stereo":
182
+ if cur_mode=="mag":
183
+ input_spec_l = torch.cat((input[...,channel_count:channel_count+1], phase[...,0:1]), axis=-1)
184
+ input_spec_r = torch.cat((input[...,channel_count+1:channel_count+2], phase[...,1:2]), axis=-1)
185
+ channel_count += 2
186
+ else:
187
+ input_spec_l = input[...,channel_count:channel_count+2]
188
+ input_spec_r = input[...,channel_count+2:channel_count+4]
189
+ channel_count += 4
190
+ cplx_spec_l = self.magphase_to_cplx(input_spec_l)
191
+ cplx_spec_r = self.magphase_to_cplx(input_spec_r)
192
+ cplx_spec = torch.cat([cplx_spec_l, cplx_spec_r], dim=0)
193
+ output_wav = torch.istft(cplx_spec, n_fft=self.n_fft, hop_length=self.hop_length, win_length=self.win_length, window=self.window)
194
+ output = torch.cat((output_wav[:output_wav.shape[0]//2].unsqueeze(1), output_wav[output_wav.shape[0]//2:].unsqueeze(1)), dim=1)
195
+ channel_count += 4
196
+ back_output_list.append(output)
197
+ elif cur_mode=="griff":
198
+ if self.channel=="mono":
199
+ output = self.griffin_lim(input.squeeze(-1), input.device).unsqueeze(1)
200
+ # output = self.griff(input.permute(0, 3, 1, 2))
201
+ else:
202
+ output_l = self.griffin_lim(input[..., 0], input.device).unsqueeze(1)
203
+ output_r = self.griffin_lim(input[..., 1], input.device).unsqueeze(1)
204
+ output = torch.cat((output_l, output_r), axis=1)
205
+
206
+ back_output_list.append(output)
207
+
208
+ # combine all demanded feature outputs
209
+ if not back_output_list:
210
+ raise NameError("NameError at BackEnd: check using features for back-end")
211
+ elif len(mode)!=1:
212
+ for i, cur_output in enumerate(back_output_list):
213
+ if i==0:
214
+ back_output = cur_output
215
+ else:
216
+ back_output = torch.cat((back_output, cur_output), axis=1)
217
+ else:
218
+ back_output = back_output_list[0]
219
+
220
+ return back_output
221
+
222
+
223
+ def griffin_lim(self, l_est, gpu, n_iter=100):
224
+ l_est = l_est.cpu().detach()
225
+
226
+ l_est = torch.pow(l_est, 1/0.80)
227
+ # l_est [batch, channel, time]
228
+ l_mag = l_est.unsqueeze(-1)
229
+ l_phase = 2 * np.pi * torch.rand_like(l_mag) - np.pi
230
+ real = l_mag * torch.cos(l_phase)
231
+ imag = l_mag * torch.sin(l_phase)
232
+ S = torch.cat((real, imag), axis=-1)
233
+ S_mag = (real**2 + imag**2 + self.eps) ** 1/2
234
+ for i in range(n_iter):
235
+ x = ta_F.istft(S, n_fft=2048, hop_length=512, win_length=2048, window=torch.hann_window(2048))
236
+ S_new = torch.stft(x, n_fft=2048, hop_length=512, win_length=2048, window=torch.hann_window(2048))
237
+ S_new_phase = S_new/mag(S_new)
238
+ S = S_mag * S_new_phase
239
+ return x / torch.max(torch.abs(x))
240
+
modules/loss.py ADDED
@@ -0,0 +1,432 @@
1
+ """
2
+ Implementation of objective functions used in the task 'ITO-Master'
3
+ """
4
+ import numpy as np
5
+ import torch
6
+ import torch.nn.functional as F
7
+ import torch.nn as nn
8
+ import auraloss
9
+
10
+ import os
11
+ import sys
12
+ currentdir = os.path.dirname(os.path.realpath(__file__))
13
+ sys.path.append(os.path.dirname(currentdir))
14
+ from modules.front_back_end import *
15
+
16
+
17
+
18
+ # Root Mean Squared Loss
19
+ # penalizes the volume factor with non-linearlity
20
+ class RMSLoss(nn.Module):
21
+ def __init__(self, reduce, loss_type="l2"):
22
+ super(RMSLoss, self).__init__()
23
+ self.weight_factor = 100.
24
+ if loss_type=="l2":
25
+ self.loss = nn.MSELoss(reduce=None)
26
+
27
+
28
+ def forward(self, est_targets, targets):
29
+ est_targets = est_targets.reshape(est_targets.shape[0]*est_targets.shape[1], est_targets.shape[2])
30
+ targets = targets.reshape(targets.shape[0]*targets.shape[1], targets.shape[2])
31
+ normalized_est = torch.sqrt(torch.mean(est_targets**2, dim=-1))
32
+ normalized_tgt = torch.sqrt(torch.mean(targets**2, dim=-1))
33
+
34
+ weight = torch.clamp(torch.abs(normalized_tgt-normalized_est), min=1/self.weight_factor) * self.weight_factor
35
+
36
+ return torch.mean(weight**1.5 * self.loss(normalized_est, normalized_tgt))
37
+
38
+
39
+
40
+ # Multi-Scale Spectral Loss proposed at the paper "DDSP: DIFFERENTIABLE DIGITAL SIGNAL PROCESSING" (https://arxiv.org/abs/2001.04643)
41
+ # we extend this loss by applying it to mid/side channels
42
+ class MultiScale_Spectral_Loss_MidSide_DDSP(nn.Module):
43
+ def __init__(self, mode='midside', \
44
+ reduce=True, \
45
+ n_filters=None, \
46
+ windows_size=None, \
47
+ hops_size=None, \
48
+ window="hann", \
49
+ eps=1e-7, \
50
+ device=torch.device("cpu")):
51
+ super(MultiScale_Spectral_Loss_MidSide_DDSP, self).__init__()
52
+ self.mode = mode
53
+ self.eps = eps
54
+ self.mid_weight = 0.5 # value in the range of 0.0 ~ 1.0
55
+ self.logmag_weight = 0.1
56
+
57
+ if n_filters is None:
58
+ n_filters = [4096, 2048, 1024, 512]
59
+ if windows_size is None:
60
+ windows_size = [4096, 2048, 1024, 512]
61
+ if hops_size is None:
62
+ hops_size = [1024, 512, 256, 128]
63
+
64
+ self.multiscales = []
65
+ for i in range(len(windows_size)):
66
+ cur_scale = {'window_size' : float(windows_size[i])}
67
+ if self.mode=='midside':
68
+ cur_scale['front_end'] = FrontEnd(channel='mono', \
69
+ n_fft=n_filters[i], \
70
+ hop_length=hops_size[i], \
71
+ win_length=windows_size[i], \
72
+ window=window, \
73
+ device=device)
74
+ elif self.mode=='ori':
75
+ cur_scale['front_end'] = FrontEnd(channel='stereo', \
76
+ n_fft=n_filters[i], \
77
+ hop_length=hops_size[i], \
78
+ win_length=windows_size[i], \
79
+ window=window, \
80
+ device=device)
81
+ self.multiscales.append(cur_scale)
82
+
83
+ self.objective_l1 = nn.L1Loss(reduce=reduce)
84
+ self.objective_l2 = nn.MSELoss(reduce=reduce)
85
+
86
+
87
+ def forward(self, est_targets, targets):
88
+ if self.mode=='midside':
89
+ return self.forward_midside(est_targets, targets)
90
+ elif self.mode=='ori':
91
+ return self.forward_ori(est_targets, targets)
92
+
93
+
94
+ def forward_ori(self, est_targets, targets):
95
+ total_loss = 0.0
96
+ total_mag_loss = 0.0
97
+ total_logmag_loss = 0.0
98
+ for cur_scale in self.multiscales:
99
+ est_mag = cur_scale['front_end'](est_targets, mode=["mag"])
100
+ tgt_mag = cur_scale['front_end'](targets, mode=["mag"])
101
+
102
+ mag_loss = self.magnitude_loss(est_mag, tgt_mag)
103
+ logmag_loss = self.log_magnitude_loss(est_mag, tgt_mag)
104
+ total_mag_loss += mag_loss
105
+ total_logmag_loss += logmag_loss
106
+ # return total_loss
107
+ return (1-self.logmag_weight)*total_mag_loss + \
108
+ (self.logmag_weight)*total_logmag_loss
109
+
110
+
111
+ def forward_midside(self, est_targets, targets):
112
+ est_mid, est_side = self.to_mid_side(est_targets)
113
+ tgt_mid, tgt_side = self.to_mid_side(targets)
114
+ total_loss = 0.0
115
+ total_mag_loss = 0.0
116
+ total_logmag_loss = 0.0
117
+ for cur_scale in self.multiscales:
118
+ est_mid_mag = cur_scale['front_end'](est_mid, mode=["mag"])
119
+ est_side_mag = cur_scale['front_end'](est_side, mode=["mag"])
120
+ tgt_mid_mag = cur_scale['front_end'](tgt_mid, mode=["mag"])
121
+ tgt_side_mag = cur_scale['front_end'](tgt_side, mode=["mag"])
122
+
123
+ mag_loss = self.mid_weight*self.magnitude_loss(est_mid_mag, tgt_mid_mag) + \
124
+ (1-self.mid_weight)*self.magnitude_loss(est_side_mag, tgt_side_mag)
125
+ logmag_loss = self.mid_weight*self.log_magnitude_loss(est_mid_mag, tgt_mid_mag) + \
126
+ (1-self.mid_weight)*self.log_magnitude_loss(est_side_mag, tgt_side_mag)
127
+ total_mag_loss += mag_loss
128
+ total_logmag_loss += logmag_loss
129
+ # return total_loss
130
+ return (1-self.logmag_weight)*total_mag_loss + \
131
+ (self.logmag_weight)*total_logmag_loss
132
+
133
+
134
+ def to_mid_side(self, stereo_in):
135
+ mid = stereo_in[:,0] + stereo_in[:,1]
136
+ side = stereo_in[:,0] - stereo_in[:,1]
137
+ return mid, side
138
+
139
+
140
+ def magnitude_loss(self, est_mag_spec, tgt_mag_spec):
141
+ return torch.norm(self.objective_l1(est_mag_spec, tgt_mag_spec))
142
+
143
+
144
+ def log_magnitude_loss(self, est_mag_spec, tgt_mag_spec):
145
+ est_log_mag_spec = torch.log10(est_mag_spec+self.eps)
146
+ tgt_log_mag_spec = torch.log10(tgt_mag_spec+self.eps)
147
+ return self.objective_l2(est_log_mag_spec, tgt_log_mag_spec)
148
+
149
+
150
+
151
+ # Class of available loss functions
152
+ class Loss:
153
+ def __init__(self, args, reduce=True):
154
+ device = torch.device("cpu")
155
+ if torch.cuda.is_available():
156
+ device = torch.device(f"cuda:{args.gpu}")
157
+ self.l1 = nn.L1Loss(reduce=reduce)
158
+ self.mse = nn.MSELoss(reduce=reduce)
159
+ self.ce = nn.CrossEntropyLoss()
160
+ self.triplet = nn.TripletMarginLoss(margin=1., p=2)
161
+ self.cos = nn.CosineSimilarity(eps=args.eps)
162
+ self.cosemb = nn.CosineEmbeddingLoss()
163
+
164
+ self.multi_scale_spectral_midside = MultiScale_Spectral_Loss_MidSide_DDSP(mode='midside', eps=args.eps, device=device)
165
+ self.multi_scale_spectral_ori = MultiScale_Spectral_Loss_MidSide_DDSP(mode='ori', eps=args.eps, device=device)
166
+ self.gain = RMSLoss(reduce=reduce)
167
+ self.infonce = infoNCE
168
+ # perceptual weighting with mel scaled spectrograms
169
+ self.mrs_mel_perceptual = auraloss.freq.MultiResolutionSTFTLoss(
170
+ fft_sizes=[1024, 2048, 8192],
171
+ hop_sizes=[256, 512, 2048],
172
+ win_lengths=[1024, 2048, 8192],
173
+ scale="mel",
174
+ n_bins=128,
175
+ sample_rate=args.sample_rate,
176
+ perceptual_weighting=True,
177
+ )
178
+
179
+
180
+
181
+
182
+ """
183
+ Audio Feature Loss implementation
184
+ copied from https://github.com/sai-soum/Diff-MST/blob/main/mst/loss.py
185
+ """
186
+
187
+ import librosa
188
+
189
+ from typing import List
190
+ from modules.filter import barkscale_fbanks
191
+
192
+
193
+
194
+
195
+ def compute_mid_side(x: torch.Tensor):
196
+ x_mid = x[:, 0, :] + x[:, 1, :]
197
+ x_side = x[:, 0, :] - x[:, 1, :]
198
+ return x_mid, x_side
199
+
200
+
201
+ def compute_melspectrum(
202
+ x: torch.Tensor,
203
+ sample_rate: int = 44100,
204
+ fft_size: int = 32768,
205
+ n_bins: int = 128,
206
+ **kwargs,
207
+ ):
208
+ """Compute mel-spectrogram.
209
+
210
+ Args:
211
+ x: (bs, 2, seq_len)
212
+ sample_rate: sample rate of audio
213
+ fft_size: size of fft
214
+ n_bins: number of mel bins
215
+
216
+ Returns:
217
+ X: (bs, n_bins)
218
+
219
+ """
220
+ fb = librosa.filters.mel(sr=sample_rate, n_fft=fft_size, n_mels=n_bins)
221
+ fb = torch.tensor(fb).unsqueeze(0).type_as(x)
222
+
223
+ x = x.mean(dim=1, keepdim=True)
224
+ X = torch.fft.rfft(x, n=fft_size, dim=-1)
225
+ X = torch.abs(X)
226
+ X = torch.mean(X, dim=1, keepdim=True) # take mean over time
227
+ X = X.permute(0, 2, 1) # swap time and freq dims
228
+ X = torch.matmul(fb, X)
229
+ X = torch.log(X + 1e-8)
230
+
231
+ return X
232
+
233
+
234
+ def compute_barkspectrum(
235
+ x: torch.Tensor,
236
+ fft_size: int = 32768,
237
+ n_bands: int = 24,
238
+ sample_rate: int = 44100,
239
+ f_min: float = 20.0,
240
+ f_max: float = 20000.0,
241
+ mode: str = "mid-side",
242
+ **kwargs,
243
+ ):
244
+ """Compute bark-spectrogram.
245
+
246
+ Args:
247
+ x: (bs, 2, seq_len)
248
+ fft_size: size of fft
249
+ n_bands: number of bark bins
250
+ sample_rate: sample rate of audio
251
+ f_min: minimum frequency
252
+ f_max: maximum frequency
253
+ mode: "mono", "stereo", or "mid-side"
254
+
255
+ Returns:
256
+ X: (bs, 24)
257
+
258
+ """
259
+ # compute filterbank
260
+ fb = barkscale_fbanks((fft_size // 2) + 1, f_min, f_max, n_bands, sample_rate)
261
+ fb = fb.unsqueeze(0).type_as(x)
262
+ fb = fb.permute(0, 2, 1)
263
+
264
+ if mode == "mono":
265
+ x = x.mean(dim=1) # average over channels
266
+ signals = [x]
267
+ elif mode == "stereo":
268
+ signals = [x[:, 0, :], x[:, 1, :]]
269
+ elif mode == "mid-side":
270
+ x_mid = x[:, 0, :] + x[:, 1, :]
271
+ x_side = x[:, 0, :] - x[:, 1, :]
272
+ signals = [x_mid, x_side]
273
+ else:
274
+ raise ValueError(f"Invalid mode {mode}")
275
+
276
+ outputs = []
277
+ for signal in signals:
278
+ X = torch.stft(
279
+ signal,
280
+ n_fft=fft_size,
281
+ hop_length=fft_size // 4,
282
+ return_complex=True,
283
+ window=torch.hann_window(fft_size).to(x.device),
284
+ ) # compute stft
285
+ X = torch.abs(X) # take magnitude
286
+ X = torch.mean(X, dim=-1, keepdim=True) # take mean over time
287
+ # X = X.permute(0, 2, 1) # swap time and freq dims
288
+ X = torch.matmul(fb, X) # apply filterbank
289
+ X = torch.log(X + 1e-8)
290
+ # X = torch.cat([X, X_log], dim=-1)
291
+ outputs.append(X)
292
+
293
+ # stack into tensor
294
+ X = torch.cat(outputs, dim=-1)
295
+
296
+ return X
297
+
298
+
299
+ def compute_rms(x: torch.Tensor, **kwargs):
300
+ """Compute root mean square energy.
301
+
302
+ Args:
303
+ x: (bs, 1, seq_len)
304
+
305
+ Returns:
306
+ rms: (bs, )
307
+ """
308
+ rms = torch.sqrt(torch.mean(x**2, dim=-1).clamp(min=1e-8))
309
+ return rms
310
+
311
+
312
+ def compute_crest_factor(x: torch.Tensor, **kwargs):
313
+ """Compute crest factor as ratio of peak to rms energy in dB.
314
+
315
+ Args:
316
+ x: (bs, 2, seq_len)
317
+
318
+ """
319
+ num = torch.max(torch.abs(x), dim=-1)[0]
320
+ den = compute_rms(x).clamp(min=1e-8)
321
+ cf = 20 * torch.log10((num / den).clamp(min=1e-8))
322
+ return cf
323
+
324
+
325
+ def compute_stereo_width(x: torch.Tensor, **kwargs):
326
+ """Compute stereo width as ratio of energy in sum and difference signals.
327
+
328
+ Args:
329
+ x: (bs, 2, seq_len)
330
+
331
+ """
332
+ bs, chs, seq_len = x.size()
333
+
334
+ assert chs == 2, "Input must be stereo"
335
+
336
+ # compute sum and diff of stereo channels
337
+ x_sum = x[:, 0, :] + x[:, 1, :]
338
+ x_diff = x[:, 0, :] - x[:, 1, :]
339
+
340
+ # compute power of sum and diff
341
+ sum_energy = torch.mean(x_sum**2, dim=-1)
342
+ diff_energy = torch.mean(x_diff**2, dim=-1)
343
+
344
+ # compute stereo width as ratio
345
+ stereo_width = diff_energy / sum_energy.clamp(min=1e-8)
346
+
347
+ return stereo_width
348
+
349
+
350
+ def compute_stereo_imbalance(x: torch.Tensor, **kwargs):
351
+ """Compute stereo imbalance as ratio of energy in left and right channels.
352
+
353
+ Args:
354
+ x: (bs, 2, seq_len)
355
+
356
+ Returns:
357
+ stereo_imbalance: (bs, )
358
+
359
+ """
360
+ left_energy = torch.mean(x[:, 0, :] ** 2, dim=-1)
361
+ right_energy = torch.mean(x[:, 1, :] ** 2, dim=-1)
362
+
363
+ stereo_imbalance = (right_energy - left_energy) / (
364
+ right_energy + left_energy
365
+ ).clamp(min=1e-8)
366
+
367
+ return stereo_imbalance
368
+
369
+
370
+ class AudioFeatureLoss(torch.nn.Module):
371
+ def __init__(
372
+ self,
373
+ weights: List[float],
374
+ sample_rate: int,
375
+ stem_separation: bool = False,
376
+ use_clap: bool = False,
377
+ ) -> None:
378
+ """Compute loss using a set of differentiable audio features.
379
+
380
+ Args:
381
+ weights: weights for each feature
382
+ sample_rate: sample rate of audio
383
+ stem_separation: whether to compute loss on stems or mix
384
+
385
+ Based on features proposed in:
386
+
387
+ Man, B. D., et al.
388
+ "An analysis and evaluation of audio features for multitrack music mixtures."
389
+ (2014).
390
+
391
+ """
392
+ super().__init__()
393
+ self.weights = weights
394
+ self.sample_rate = sample_rate
395
+ self.stem_separation = stem_separation
396
+ self.sources_list = ["mix"]
397
+ self.source_weights = [1.0]
398
+ self.use_clap = use_clap
399
+
400
+ self.transforms = [
401
+ compute_rms,
402
+ compute_crest_factor,
403
+ compute_stereo_width,
404
+ compute_stereo_imbalance,
405
+ compute_barkspectrum,
406
+ ]
407
+
408
+ assert len(self.transforms) == len(weights)
409
+
410
+ def forward(self, input: torch.Tensor, target: torch.Tensor):
411
+ losses = {}
412
+
413
+ # reshape for example stem dim
414
+ input_stems = input.unsqueeze(1)
415
+ target_stems = target.unsqueeze(1)
416
+
417
+ n_stems = input_stems.shape[1]
418
+
419
+ # iterate over each stem compute loss for each transform
420
+ for stem_idx in range(n_stems):
421
+ input_stem = input_stems[:, stem_idx, ...]
422
+ target_stem = target_stems[:, stem_idx, ...]
423
+
424
+ for transform, weight in zip(self.transforms, self.weights):
425
+ transform_name = "_".join(transform.__name__.split("_")[1:])
426
+ key = f"{self.sources_list[stem_idx]}-{transform_name}"
427
+ input_transform = transform(input_stem, sample_rate=self.sample_rate)
428
+ target_transform = transform(target_stem, sample_rate=self.sample_rate)
429
+ val = torch.nn.functional.mse_loss(input_transform, target_transform)
430
+ losses[key] = weight * val * self.source_weights[stem_idx]
431
+
432
+ return losses
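
A minimal usage sketch (illustrative, not part of the commit) of the AudioFeatureLoss added above: it compares RMS, crest factor, stereo width, stereo imbalance, and bark-spectrum statistics between two stereo signals and returns a dictionary of weighted terms. The five weights are arbitrary example values.

import torch
from modules.loss import AudioFeatureLoss  # assumes this repository's layout

loss_fn = AudioFeatureLoss(weights=[0.1, 0.001, 1.0, 1.0, 0.1], sample_rate=44100)
est = torch.randn(2, 2, 44100)  # batch x stereo channels x samples
ref = torch.randn(2, 2, 44100)
losses = loss_fn(est, ref)      # keys like "mix-rms", "mix-crest_factor", ...
total = sum(losses.values())    # single scalar combining all weighted feature terms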
networks/__init__.py ADDED
@@ -0,0 +1,3 @@
+from .architectures import *
+from .network_utils import *
+from .dasp_additionals import *
networks/architectures.py ADDED
@@ -0,0 +1,405 @@
1
+ """
2
+ Implementation of neural networks used in the task 'Music Mastering Style Transfer'
3
+ - 'Effects Encoder'
4
+ - 'Mastering Style Transfer'
5
+ - 'Differentiable Mastering Style Transfer'
6
+ """
7
+ import torch
8
+ import torch.nn as nn
9
+ import torch.nn.functional as F
10
+ import torch.nn.init as init
11
+ import dasp_pytorch
12
+
13
+ import os
14
+ import sys
15
+ import time
16
+ currentdir = os.path.dirname(os.path.realpath(__file__))
17
+ sys.path.append(currentdir)
18
+ from network_utils import *
19
+ from dasp_additionals import Multiband_Compressor, Distortion, Limiter
20
+
21
+ # compute receptive field
22
+ def compute_receptive_field(kernels, strides, dilations):
23
+ rf = 0
24
+ for i in range(len(kernels)):
25
+ rf += rf * strides[i] + (kernels[i]-strides[i]) * dilations[i]
26
+ return rf
27
+
28
+
29
+
30
+ # Encoder of music effects for contrastive learning of music effects
31
+ class Effects_Encoder(nn.Module):
32
+ def __init__(self, config):
33
+ super(Effects_Encoder, self).__init__()
34
+ # input is stereo channeled audio
35
+ config["channels"].insert(0, 2)
36
+
37
+ # encoder layers
38
+ encoder = []
39
+ for i in range(len(config["kernels"])):
40
+ if config["conv_block"]=='res':
41
+ encoder.append(Res_ConvBlock(dimension=1, \
42
+ in_channels=config["channels"][i], \
43
+ out_channels=config["channels"][i+1], \
44
+ kernel_size=config["kernels"][i], \
45
+ stride=config["strides"][i], \
46
+ padding="SAME", \
47
+ dilation=config["dilation"][i], \
48
+ norm=config["norm"], \
49
+ activation=config["activation"], \
50
+ last_activation=config["activation"]))
51
+ elif config["conv_block"]=='conv':
52
+ encoder.append(ConvBlock(dimension=1, \
53
+ layer_num=1, \
54
+ in_channels=config["channels"][i], \
55
+ out_channels=config["channels"][i+1], \
56
+ kernel_size=config["kernels"][i], \
57
+ stride=config["strides"][i], \
58
+ padding="VALID", \
59
+ dilation=config["dilation"][i], \
60
+ norm=config["norm"], \
61
+ activation=config["activation"], \
62
+ last_activation=config["activation"], \
63
+ mode='conv'))
64
+ self.encoder = nn.Sequential(*encoder)
65
+
66
+ # pooling method
67
+ self.glob_pool = nn.AdaptiveAvgPool1d(1)
68
+
69
+
70
+ # network forward operation
71
+ def forward(self, input):
72
+ enc_output = self.encoder(input)
73
+ glob_pooled = self.glob_pool(enc_output).squeeze(-1)
74
+
75
+ # outputs c feature
76
+ return glob_pooled
77
+
78
+
79
+
80
+ class TCNBlock(torch.nn.Module):
81
+ def __init__(self,
82
+ in_ch,
83
+ out_ch,
84
+ kernel_size=3,
85
+ stride=1,
86
+ dilation=1,
87
+ cond_dim=2048,
88
+ grouped=False,
89
+ causal=False,
90
+ conditional=False,
91
+ **kwargs):
92
+ super(TCNBlock, self).__init__()
93
+
94
+ self.in_ch = in_ch
95
+ self.out_ch = out_ch
96
+ self.kernel_size = kernel_size
97
+ self.dilation = dilation
98
+ self.grouped = grouped
99
+ self.causal = causal
100
+ self.conditional = conditional
101
+
102
+ groups = out_ch if grouped and (in_ch % out_ch == 0) else 1
103
+
104
+ self.pad_length = ((kernel_size-1)*dilation) if self.causal else ((kernel_size-1)*dilation)//2
105
+ self.conv1 = torch.nn.Conv1d(in_ch,
106
+ out_ch,
107
+ kernel_size=kernel_size,
108
+ stride=stride,
109
+ padding=self.pad_length,
110
+ dilation=dilation,
111
+ groups=groups,
112
+ bias=False)
113
+ if grouped:
114
+ self.conv1b = torch.nn.Conv1d(out_ch, out_ch, kernel_size=1)
115
+
116
+ if conditional:
117
+ self.film = FiLM(cond_dim, out_ch)
118
+ self.bn = torch.nn.BatchNorm1d(out_ch)
119
+
120
+ self.relu = torch.nn.LeakyReLU()
121
+ self.res = torch.nn.Conv1d(in_ch,
122
+ out_ch,
123
+ kernel_size=1,
124
+ stride=stride,
125
+ groups=in_ch,
126
+ bias=False)
127
+
128
+
129
+ def forward(self, x, p):
130
+ x_in = x
131
+
132
+ x = self.relu(self.bn(self.conv1(x)))
133
+ x = self.film(x, p)
134
+
135
+ x_res = self.res(x_in)
136
+
137
+ if self.causal:
138
+ x = x[..., :-self.pad_length]
139
+ x += x_res
140
+
141
+ return x
142
+
143
+
144
+
145
+ import pytorch_lightning as pl
146
+ class TCNModel(pl.LightningModule):
147
+ """ Temporal convolutional network with conditioning module.
148
+ Args:
149
+ nparams (int): Number of conditioning parameters.
150
+ ninputs (int): Number of input channels (mono = 1, stereo 2). Default: 1
151
+ noutputs (int): Number of output channels (mono = 1, stereo 2). Default: 1
152
+ nblocks (int): Number of total TCN blocks. Default: 10
153
+ kernel_size (int): Width of the convolutional kernels. Default: 3
154
+ dialation_growth (int): Compute the dilation factor at each block as dilation_growth ** (n % stack_size). Default: 1
155
+ channel_growth (int): Compute the output channels at each black as in_ch * channel_growth. Default: 2
156
+ channel_width (int): When channel_growth = 1 all blocks use convolutions with this many channels. Default: 64
157
+ stack_size (int): Number of blocks that constitute a single stack of blocks. Default: 10
158
+ grouped (bool): Use grouped convolutions to reduce the total number of parameters. Default: False
159
+ causal (bool): Causal TCN configuration does not consider future input values. Default: False
160
+ skip_connections (bool): Skip connections from each block to the output. Default: False
161
+ num_examples (int): Number of evaluation audio examples to log after each epochs. Default: 4
162
+ """
163
+ def __init__(self,
164
+ nparams,
165
+ ninputs=1,
166
+ noutputs=1,
167
+ nblocks=10,
168
+ kernel_size=3,
169
+ stride=1,
170
+ dilation_growth=1,
171
+ channel_growth=1,
172
+ channel_width=32,
173
+ stack_size=10,
174
+ cond_dim=2048,
175
+ grouped=False,
176
+ causal=False,
177
+ skip_connections=False,
178
+ num_examples=4,
179
+ save_dir=None,
180
+ **kwargs):
181
+ super(TCNModel, self).__init__()
182
+ self.save_hyperparameters()
183
+
184
+ self.blocks = torch.nn.ModuleList()
185
+ for n in range(nblocks):
186
+ in_ch = out_ch if n > 0 else ninputs
187
+
188
+ if self.hparams.channel_growth > 1:
189
+ out_ch = in_ch * self.hparams.channel_growth
190
+ else:
191
+ out_ch = self.hparams.channel_width
192
+
193
+ dilation = self.hparams.dilation_growth ** (n % self.hparams.stack_size)
194
+ cur_stride = stride[n] if isinstance(stride, list) else stride
195
+ self.blocks.append(TCNBlock(in_ch,
196
+ out_ch,
197
+ kernel_size=self.hparams.kernel_size,
198
+ stride=cur_stride,
199
+ dilation=dilation,
200
+ padding="same" if self.hparams.causal else "valid",
201
+ causal=self.hparams.causal,
202
+ cond_dim=cond_dim,
203
+ grouped=self.hparams.grouped,
204
+ conditional=True if self.hparams.nparams > 0 else False))
205
+
206
+ self.output = torch.nn.Conv1d(out_ch, noutputs, kernel_size=1)
207
+
208
+ def forward(self, x, cond):
209
+ # iterate over blocks passing conditioning
210
+ for idx, block in enumerate(self.blocks):
211
+ # for SeFa
212
+ if isinstance(cond, list):
213
+ x = block(x, cond[idx])
214
+ else:
215
+ x = block(x, cond)
216
+ skips = 0
217
+
218
+ # out = torch.tanh(self.output(x + skips))
219
+ out = torch.clamp(self.output(x + skips), min=-1, max=1)
220
+
221
+ return out
222
+
223
+ def compute_receptive_field(self):
224
+ """ Compute the receptive field in samples."""
225
+ rf = self.hparams.kernel_size
226
+ for n in range(1,self.hparams.nblocks):
227
+ dilation = self.hparams.dilation_growth ** (n % self.hparams.stack_size)
228
+ rf = rf + ((self.hparams.kernel_size-1) * dilation)
229
+ return rf
230
+
231
+ # add any model hyperparameters here
232
+ @staticmethod
233
+ def add_model_specific_args(parent_parser):
234
+ parser = ArgumentParser(parents=[parent_parser], add_help=False)
235
+ # --- model related ---
236
+ parser.add_argument('--ninputs', type=int, default=1)
237
+ parser.add_argument('--noutputs', type=int, default=1)
238
+ parser.add_argument('--nblocks', type=int, default=4)
239
+ parser.add_argument('--kernel_size', type=int, default=5)
240
+ parser.add_argument('--dilation_growth', type=int, default=10)
241
+ parser.add_argument('--channel_growth', type=int, default=1)
242
+ parser.add_argument('--channel_width', type=int, default=32)
243
+ parser.add_argument('--stack_size', type=int, default=10)
244
+ parser.add_argument('--grouped', default=False, action='store_true')
245
+ parser.add_argument('--causal', default=False, action="store_true")
246
+ parser.add_argument('--skip_connections', default=False, action="store_true")
247
+
248
+ return parser
249
+
250
+
251
+
252
+ # Module for fitting SeFa parameters
253
+ class Dasp_Mastering_Style_Transfer(nn.Module):
254
+ def __init__(self, num_features, sample_rate, \
255
+ tgt_fx_names = ['eq', 'comp', 'imager', 'gain'], \
256
+ model_type='2mlp', \
257
+ config=None, \
258
+ batch_size=4):
259
+ super(Dasp_Mastering_Style_Transfer, self).__init__()
260
+ self.sample_rate = sample_rate
261
+ self.tgt_fx_names = tgt_fx_names
262
+
263
+ self.fx_processors = {}
264
+ self.last_predicted_params = None
265
+ for cur_fx in tgt_fx_names:
266
+ if cur_fx=='eq':
267
+ cur_fx_module = dasp_pytorch.ParametricEQ(sample_rate=sample_rate, \
268
+ min_gain_db = -20.0, \
269
+ max_gain_db = 20.0, \
270
+ min_q_factor = 0.1, \
271
+ max_q_factor=5.0)
272
+ elif cur_fx=='distortion':
273
+ cur_fx_module = Distortion(sample_rate=sample_rate,
274
+ min_gain_db = 0.0,
275
+ max_gain_db = 8.0)
276
+ elif cur_fx=='comp':
277
+ cur_fx_module = dasp_pytorch.Compressor(sample_rate=sample_rate)
278
+ elif cur_fx=='multiband_comp':
279
+ cur_fx_module = Multiband_Compressor(sample_rate=sample_rate)
280
+ elif cur_fx=='gain':
281
+ cur_fx_module = dasp_pytorch.Gain(sample_rate=sample_rate)
282
+ elif cur_fx=='imager':
283
+ continue
284
+ elif cur_fx=='limiter':
285
+ cur_fx_module = Limiter(sample_rate=sample_rate)
286
+ else:
287
+ raise AssertionError(f"current fx name ({cur_fx}) not found")
288
+ self.fx_processors[cur_fx] = cur_fx_module
289
+ total_num_param = sum([self.fx_processors[cur_fx].num_params for cur_fx in self.fx_processors])
290
+ if 'imager' in tgt_fx_names:
291
+ total_num_param += 1
292
+
293
+ ''' model architecture '''
294
+ self.model_type = model_type
295
+ if self.model_type.lower()=='tcn':
296
+ self.network = TCNModel(nparams=config["condition_dimension"], ninputs=2, \
297
+ noutputs=total_num_param, \
298
+ nblocks=config["nblocks"], \
299
+ dilation_growth=config["dilation_growth"], \
300
+ kernel_size=config["kernel_size"], \
301
+ stride=config['stride'], \
302
+ channel_width=config["channel_width"], \
303
+ stack_size=config["stack_size"], \
304
+ cond_dim=config["condition_dimension"], \
305
+ causal=config["causal"])
306
+ elif self.model_type.lower()=='ito':
307
+ self.params = torch.nn.Parameter(torch.ones((batch_size,total_num_param))*0.5)
308
+
309
+ # network forward operation
310
+ def forward(self, x, embedding):
311
+ # embedding mapper
312
+ if self.model_type.lower()=='tcn':
313
+ est_param = self.network(x, embedding)
314
+ est_param = est_param.mean(axis=-1)
315
+ elif self.model_type.lower()=='ito':
316
+ est_param = self.params
317
+ est_param = torch.clamp(est_param, min=0.0, max=1.0)
318
+
319
+ if self.model_type.lower()!='ito':
320
+ est_param = F.sigmoid(est_param)
321
+
322
+ self.last_predicted_params = est_param
323
+
324
+ # dafx chain
325
+ cur_param_idx = 0
326
+ for cur_fx in self.tgt_fx_names:
327
+ if cur_fx=='imager':
328
+ cur_param_count = 1
329
+ x = dasp_pytorch.functional.stereo_widener(x, \
330
+ sample_rate=self.sample_rate, \
331
+ width=est_param[:,cur_param_idx:cur_param_idx+1])
332
+ else:
333
+ cur_param_count = self.fx_processors[cur_fx].num_params
334
+ cur_input_param = est_param[:, cur_param_idx:cur_param_idx+cur_param_count]
335
+ x = self.fx_processors[cur_fx].process_normalized(x, cur_input_param)
336
+ # update param index
337
+ cur_param_idx += cur_param_count
338
+
339
+ return x
340
+
341
+
342
+ def reset_fx_chain(self, ):
343
+ self.fx_processors = {}
344
+ for cur_fx in self.tgt_fx_names:
345
+ if cur_fx=='eq':
346
+ cur_fx_module = dasp_pytorch.ParametricEQ(sample_rate=self.sample_rate, \
347
+ min_gain_db = -20.0, \
348
+ max_gain_db = 20.0, \
349
+ min_q_factor = 0.1, \
350
+ max_q_factor=5.0)
351
+ elif cur_fx=='distortion':
352
+ cur_fx_module = Distortion(sample_rate=self.sample_rate,
353
+ min_gain_db = 0.0,
354
+ max_gain_db = 8.0)
355
+ elif cur_fx=='comp':
356
+ cur_fx_module = dasp_pytorch.Compressor(sample_rate=self.sample_rate)
357
+ elif cur_fx=='multiband_comp':
358
+ cur_fx_module = Multiband_Compressor(sample_rate=self.sample_rate)
359
+ elif cur_fx=='gain':
360
+ cur_fx_module = dasp_pytorch.Gain(sample_rate=self.sample_rate)
361
+ elif cur_fx=='imager':
362
+ continue
363
+ elif cur_fx=='limiter':
364
+ cur_fx_module = Limiter(sample_rate=self.sample_rate)
365
+ else:
366
+ raise AssertionError(f"current fx name ({cur_fx}) not found")
367
+ self.fx_processors[cur_fx] = cur_fx_module
368
+
369
+ def get_last_predicted_params(self):
370
+ if self.last_predicted_params is None:
371
+ return None
372
+
373
+ params_dict = {}
374
+ cur_param_idx = 0
375
+
376
+ for cur_fx in self.tgt_fx_names:
377
+ if cur_fx == 'imager':
378
+ cur_param_count = 1
379
+ normalized_param = self.last_predicted_params[:, cur_param_idx:cur_param_idx+1]
380
+ original_param = self.denormalize_param(normalized_param, 0, 1)
381
+ params_dict[cur_fx] = original_param
382
+ else:
383
+ cur_param_count = self.fx_processors[cur_fx].num_params
384
+ normalized_params = self.last_predicted_params[:, cur_param_idx:cur_param_idx+cur_param_count]
385
+ original_params = self.denormalize_params(cur_fx, normalized_params)
386
+ params_dict[cur_fx] = original_params
387
+
388
+ cur_param_idx += cur_param_count
389
+
390
+ return params_dict
391
+
392
+ def denormalize_params(self, fx_name, normalized_params):
393
+ fx_processor = self.fx_processors[fx_name]
394
+ original_params = {}
395
+
396
+ for i, (param_name, (min_val, max_val)) in enumerate(fx_processor.param_ranges.items()):
397
+ original_param = self.denormalize_param(normalized_params[:, i:i+1], min_val, max_val)
398
+ original_params[param_name] = original_param
399
+
400
+ return original_params
401
+
402
+ @staticmethod
403
+ def denormalize_param(normalized_param, min_val, max_val):
404
+ return normalized_param * (max_val - min_val) + min_val
405
+
networks/dasp_additionals.py ADDED
@@ -0,0 +1,441 @@
1
+ """
2
+ Implementation of differentiable mastering effects based on DASP-pytorch and torchcomp libraries
3
+ - Distortion
4
+ - Multiband Compressor
5
+ - Limiter
6
+ DASP-pytorch: https://github.com/csteinmetz1/dasp-pytorch
7
+ torchcomp: https://github.com/yoyololicon/torchcomp
8
+ """
9
+ import dasp_pytorch
10
+ from dasp_pytorch.modules import Processor
11
+ import torchcomp
12
+ import torch
13
+ import torch.nn.functional as F
14
+ import torch.nn as nn
15
+ import numpy as np
16
+ import time
17
+
18
+
19
+ EPS = 1e-6
20
+
21
+ class Distortion(Processor):
22
+ def __init__(
23
+ self,
24
+ sample_rate: int,
25
+ min_gain_db: float = 0.0,
26
+ max_gain_db: float = 24.0,
27
+ ):
28
+ super().__init__()
29
+ self.sample_rate = sample_rate
30
+ self.process_fn = distortion
31
+ self.param_ranges = {
32
+ "drive_db": (min_gain_db, max_gain_db),
33
+ "parallel_weight_factor": (0.2, 0.7),
34
+ }
35
+ self.num_params = len(self.param_ranges)
36
+
37
+ def distortion(x: torch.Tensor,
38
+ sample_rate: int,
39
+ drive_db: torch.Tensor,
40
+ parallel_weight_factor: torch.Tensor()):
41
+ """Simple soft-clipping distortion with drive control.
42
+
43
+ Args:
44
+ x (torch.Tensor): Input audio tensor with shape (bs, chs, seq_len)
45
+ sample_rate (int): Audio sample rate.
46
+ drive_db (torch.Tensor): Drive in dB with shape (bs)
47
+
48
+ Returns:
49
+ torch.Tensor: Output audio tensor with shape (bs, chs, seq_len)
50
+
51
+ """
52
+ bs, chs, seq_len = x.size()
53
+ parallel_weight_factor = parallel_weight_factor.view(-1, 1, 1)
54
+
55
+ # return torch.tanh(x * (10 ** (drive_db.view(bs, chs, -1) / 20.0))) -> wrong?
56
+ x_dist = torch.tanh(x * (10 ** (drive_db.view(bs, 1, 1) / 20.0)))
57
+
58
+ # parallel compuatation
59
+ return parallel_weight_factor * x_dist + (1-parallel_weight_factor) * x
60
+
61
+
62
+
63
+ class Multiband_Compressor(Processor):
64
+ def __init__(
65
+ self,
66
+ sample_rate: int,
67
+ min_threshold_db_comp: float = -60.0,
68
+ max_threshold_db_comp: float = 0.0-EPS,
69
+ min_ratio_comp: float = 1.0+EPS,
70
+ max_ratio_comp: float = 20.0,
71
+ min_attack_ms_comp: float = 5.0,
72
+ max_attack_ms_comp: float = 100.0,
73
+ min_release_ms_comp: float = 5.0,
74
+ max_release_ms_comp: float = 100.0,
75
+ min_threshold_db_exp: float = -60.0,
76
+ max_threshold_db_exp: float = 0.0-EPS,
77
+ min_ratio_exp: float = 0.0+EPS,
78
+ max_ratio_exp: float = 1.0-EPS,
79
+ min_attack_ms_exp: float = 5.0,
80
+ max_attack_ms_exp: float = 100.0,
81
+ min_release_ms_exp: float = 5.0,
82
+ max_release_ms_exp: float = 100.0,
83
+ ):
84
+ super().__init__()
85
+ self.sample_rate = sample_rate
86
+ self.process_fn = multiband_compressor
87
+ self.param_ranges = {
88
+ "low_cutoff": (20, 300),
89
+ "high_cutoff": (2000, 12000),
90
+ "parallel_weight_factor": (0.2, 0.7),
91
+
92
+ "low_shelf_comp_thresh": (min_threshold_db_comp, max_threshold_db_comp),
93
+ "low_shelf_comp_ratio": (min_ratio_comp, max_ratio_comp),
94
+ "low_shelf_exp_thresh": (min_threshold_db_exp, max_threshold_db_exp),
95
+ "low_shelf_exp_ratio": (min_ratio_exp, max_ratio_exp),
96
+ "low_shelf_at": (min_attack_ms_exp, max_attack_ms_exp),
97
+ "low_shelf_rt": (min_release_ms_exp, max_release_ms_exp),
98
+
99
+ "mid_band_comp_thresh": (min_threshold_db_comp, max_threshold_db_comp),
100
+ "mid_band_comp_ratio": (min_ratio_comp, max_ratio_comp),
101
+ "mid_band_exp_thresh": (min_threshold_db_exp, max_threshold_db_exp),
102
+ "mid_band_exp_ratio": (min_ratio_exp, max_ratio_exp),
103
+ "mid_band_at": (min_attack_ms_exp, max_attack_ms_exp),
104
+ "mid_band_rt": (min_release_ms_exp, max_release_ms_exp),
105
+
106
+ "high_shelf_comp_thresh": (min_threshold_db_comp, max_threshold_db_comp),
107
+ "high_shelf_comp_ratio": (min_ratio_comp, max_ratio_comp),
108
+ "high_shelf_exp_thresh": (min_threshold_db_exp, max_threshold_db_exp),
109
+ "high_shelf_exp_ratio": (min_ratio_exp, max_ratio_exp),
110
+ "high_shelf_at": (min_attack_ms_exp, max_attack_ms_exp),
111
+ "high_shelf_rt": (min_release_ms_exp, max_release_ms_exp),
112
+ }
113
+ self.num_params = len(self.param_ranges)
114
+
115
+
116
+
117
+ def linkwitz_riley_4th_order(
118
+ x: torch.Tensor,
119
+ cutoff_freq: torch.Tensor,
120
+ sample_rate: float,
121
+ filter_type: str):
122
+ q_factor = torch.ones(cutoff_freq.shape) / torch.sqrt(torch.tensor([2.0]))
123
+ gain_db = torch.zeros(cutoff_freq.shape)
124
+ q_factor = q_factor.to(x.device)
125
+ gain_db = gain_db.to(x.device)
126
+
127
+ b, a = dasp_pytorch.signal.biquad(
128
+ gain_db,
129
+ cutoff_freq,
130
+ q_factor,
131
+ sample_rate,
132
+ filter_type
133
+ )
134
+
135
+ del gain_db
136
+ del q_factor
137
+
138
+ eff_bs = x.size(0)
139
+ # six second order sections
140
+ sos = torch.cat((b, a), dim=-1).unsqueeze(1)
141
+
142
+ # apply filter twice to phase difference amounts of 360°
143
+ x = dasp_pytorch.signal.sosfilt_via_fsm(sos, x)
144
+ x_out = dasp_pytorch.signal.sosfilt_via_fsm(sos, x)
145
+
146
+ return x_out
147
+
148
+
149
+ def multiband_compressor(
150
+ x: torch.Tensor,
151
+ sample_rate: float,
152
+
153
+ low_cutoff: torch.Tensor,
154
+ high_cutoff: torch.Tensor,
155
+ parallel_weight_factor: torch.Tensor,
156
+
157
+ low_shelf_comp_thresh: torch.Tensor,
158
+ low_shelf_comp_ratio: torch.Tensor,
159
+ low_shelf_exp_thresh: torch.Tensor,
160
+ low_shelf_exp_ratio: torch.Tensor,
161
+ low_shelf_at: torch.Tensor,
162
+ low_shelf_rt: torch.Tensor,
163
+
164
+ mid_band_comp_thresh: torch.Tensor,
165
+ mid_band_comp_ratio: torch.Tensor,
166
+ mid_band_exp_thresh: torch.Tensor,
167
+ mid_band_exp_ratio: torch.Tensor,
168
+ mid_band_at: torch.Tensor,
169
+ mid_band_rt: torch.Tensor,
170
+
171
+ high_shelf_comp_thresh: torch.Tensor,
172
+ high_shelf_comp_ratio: torch.Tensor,
173
+ high_shelf_exp_thresh: torch.Tensor,
174
+ high_shelf_exp_ratio: torch.Tensor,
175
+ high_shelf_at: torch.Tensor,
176
+ high_shelf_rt: torch.Tensor,
177
+ ):
178
+ """Multiband (Three-band) Compressor.
179
+
180
+ Low-shelf -> Mid-band -> High-shelf
181
+
182
+ Args:
183
+ x (torch.Tensor): Time domain tensor with shape (bs, chs, seq_len)
184
+ sample_rate (float): Audio sample rate.
185
+ low_cutoff (torch.Tensor): Low-shelf filter cutoff frequency in Hz.
186
+ high_cutoff (torch.Tensor): High-shelf filter cutoff frequency in Hz.
187
+ low_shelf_comp_thresh (torch.Tensor):
188
+ low_shelf_comp_ratio (torch.Tensor):
189
+ low_shelf_exp_thresh (torch.Tensor):
190
+ low_shelf_exp_ratio (torch.Tensor):
191
+ low_shelf_at (torch.Tensor):
192
+ low_shelf_rt (torch.Tensor):
193
+ mid_band_comp_thresh (torch.Tensor):
194
+ mid_band_comp_ratio (torch.Tensor):
195
+ mid_band_exp_thresh (torch.Tensor):
196
+ mid_band_exp_ratio (torch.Tensor):
197
+ mid_band_at (torch.Tensor):
198
+ mid_band_rt (torch.Tensor):
199
+ high_shelf_comp_thresh (torch.Tensor):
200
+ high_shelf_comp_ratio (torch.Tensor):
201
+ high_shelf_exp_thresh (torch.Tensor):
202
+ high_shelf_exp_ratio (torch.Tensor):
203
+ high_shelf_at (torch.Tensor):
204
+ high_shelf_rt (torch.Tensor):
205
+
206
+ Returns:
207
+ y (torch.Tensor): Filtered signal.
208
+ """
209
+ bs, chs, seq_len = x.size()
210
+
211
+ low_cutoff = low_cutoff.view(-1, 1, 1)
212
+ high_cutoff = high_cutoff.view(-1, 1, 1)
213
+ parallel_weight_factor = parallel_weight_factor.view(-1, 1, 1)
214
+
215
+ eff_bs = x.size(0)
216
+
217
+ ''' crossover filter '''
218
+ # Low-shelf band (low frequencies)
219
+ low_band = linkwitz_riley_4th_order(x, low_cutoff, sample_rate, filter_type="low_pass")
220
+ # High-shelf band (high frequencies)
221
+ high_band = linkwitz_riley_4th_order(x, high_cutoff, sample_rate, filter_type="high_pass")
222
+ # Mid-band (band-pass)
223
+ mid_band = x - low_band - high_band # Subtract low and high bands from original signal
224
+
225
+ ''' compressor '''
226
+ try:
227
+ x_out_low = low_band * torchcomp.compexp_gain(low_band.sum(axis=1).abs(),
228
+ comp_thresh=low_shelf_comp_thresh, \
229
+ comp_ratio=low_shelf_comp_ratio, \
230
+ exp_thresh=low_shelf_exp_thresh, \
231
+ exp_ratio=low_shelf_exp_ratio, \
232
+ at=torchcomp.ms2coef(low_shelf_at, sample_rate), \
233
+ rt=torchcomp.ms2coef(low_shelf_rt, sample_rate)).unsqueeze(1)
234
+ except:
235
+ x_out_low = low_band
236
+ print('\t!!!failed computing low-band compression!!!')
237
+ try:
238
+ x_out_high = high_band * torchcomp.compexp_gain(high_band.sum(axis=1).abs(),
239
+ comp_thresh=high_shelf_comp_thresh, \
240
+ comp_ratio=high_shelf_comp_ratio, \
241
+ exp_thresh=high_shelf_exp_thresh, \
242
+ exp_ratio=high_shelf_exp_ratio, \
243
+ at=torchcomp.ms2coef(high_shelf_at, sample_rate), \
244
+ rt=torchcomp.ms2coef(high_shelf_rt, sample_rate)).unsqueeze(1)
245
+ except:
246
+ x_out_high = high_band
247
+ print('\t!!!failed computing high-band compression!!!')
248
+ try:
249
+ x_out_mid = mid_band * torchcomp.compexp_gain(mid_band.sum(axis=1).abs(),
250
+ comp_thresh=mid_band_comp_thresh, \
251
+ comp_ratio=mid_band_comp_ratio, \
252
+ exp_thresh=mid_band_exp_thresh, \
253
+ exp_ratio=mid_band_exp_ratio, \
254
+ at=torchcomp.ms2coef(mid_band_at, sample_rate), \
255
+ rt=torchcomp.ms2coef(mid_band_rt, sample_rate)).unsqueeze(1)
256
+ except:
257
+ x_out_mid = mid_band
258
+ print('\t!!!failed computing mid-band compression!!!')
259
+ x_out = x_out_low + x_out_high + x_out_mid
260
+
261
+ # parallel compression: blend the processed and dry signals
262
+ x_out = parallel_weight_factor * x_out + (1-parallel_weight_factor) * x
263
+
264
+ # move channels back
265
+ x_out = x_out.view(bs, chs, seq_len)
266
+
267
+ return x_out
268
+
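A usage sketch for multiband_compressor with one parameter value per batch item. The values follow the ranges declared in Multiband_Compressor, and all dynamics parameters are assumed to be plain (bs,) tensors, as they are passed straight through to torchcomp.compexp_gain:

import torch

bs = 2
x = torch.randn(bs, 2, 44100)                        # (bs, chs, seq_len)
band_params = {}
for band in ["low_shelf", "mid_band", "high_shelf"]:
    band_params[f"{band}_comp_thresh"] = torch.full((bs,), -15.0)   # dB
    band_params[f"{band}_comp_ratio"]  = torch.full((bs,), 3.0)
    band_params[f"{band}_exp_thresh"]  = torch.full((bs,), -25.0)   # dB
    band_params[f"{band}_exp_ratio"]   = torch.full((bs,), 0.9)
    band_params[f"{band}_at"]          = torch.full((bs,), 10.0)    # ms
    band_params[f"{band}_rt"]          = torch.full((bs,), 100.0)   # ms
y = multiband_compressor(
    x, 44100.0,
    low_cutoff=torch.full((bs,), 200.0),
    high_cutoff=torch.full((bs,), 4000.0),
    parallel_weight_factor=torch.full((bs,), 0.7),   # 1.0 = fully processed
    **band_params,
)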
269
+
270
+
271
+
272
+ class Limiter(Processor):
273
+ def __init__(
274
+ self,
275
+ sample_rate: int,
276
+ min_threshold_db: float = -60.0,
277
+ max_threshold_db: float = 0.0-EPS,
278
+ min_attack_ms: float = 5.0,
279
+ max_attack_ms: float = 100.0,
280
+ min_release_ms: float = 5.0,
281
+ max_release_ms: float = 100.0,
282
+ ):
283
+ super().__init__()
284
+ self.sample_rate = sample_rate
285
+ self.process_fn = limiter
286
+ self.param_ranges = {
287
+ "threshold": (min_threshold_db, max_threshold_db),
288
+ "at": (min_attack_ms, max_attack_ms),
289
+ "rt": (min_release_ms, max_release_ms),
290
+ }
291
+ self.num_params = len(self.param_ranges)
292
+
293
+
294
+ def limiter(
295
+ x: torch.Tensor,
296
+ sample_rate: float,
297
+ threshold: float,
298
+ at: float,
299
+ rt: float,
300
+ ):
301
+ """Limiter.
302
+
303
+ based on the differentiable limiter gain from Chin-Yun Yu's torchcomp
304
+
305
+ Args:
306
+ x (torch.Tensor): Time domain tensor with shape (bs, chs, seq_len)
307
+ sample_rate (float): Audio sample rate.
308
+ threshold (torch.Tensor): Limiter threshold in dB.
309
+ at (torch.Tensor): Attack time.
310
+ rt (torch.Tensor): Release time.
311
+
312
+ Returns:
313
+ y (torch.Tensor): Limited signal.
314
+ """
315
+ bs, chs, seq_len = x.size()
316
+
317
+ x_out = x * torchcomp.limiter_gain(x.sum(axis=1).abs(),
318
+ threshold=threshold,
319
+ at=torchcomp.ms2coef(at, sample_rate),
320
+ rt=torchcomp.ms2coef(rt, sample_rate)).unsqueeze(1)
321
+
322
+ # move channels back
323
+ x_out = x_out.view(bs, chs, seq_len)
324
+
325
+ return x_out
326
+
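A usage sketch for the limiter above; the parameter tensors are assumed to be (bs,)-shaped, with the threshold in dB and the attack/release times in milliseconds, matching the ranges declared in the Limiter processor:

import torch

bs = 2
x = torch.randn(bs, 2, 44100)                        # (bs, chs, seq_len)
y = limiter(
    x, 44100.0,
    threshold=torch.full((bs,), -3.0),               # dB
    at=torch.full((bs,), 1.0),                       # ms
    rt=torch.full((bs,), 100.0),                     # ms
)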
327
+
328
+
329
+
330
+ class Random_Augmentation_Dasp(nn.Module):
331
+ def __init__(self, sample_rate, \
332
+ tgt_fx_names = ['eq', 'comp', 'imager', 'gain']):
333
+ super(Random_Augmentation_Dasp, self).__init__()
334
+ self.sample_rate = sample_rate
335
+ self.tgt_fx_names = tgt_fx_names
336
+
337
+ self.device = torch.device("cpu")
338
+ if torch.cuda.is_available():
339
+ self.device = torch.device("cuda")
340
+
341
+ self.fx_prob = {'eq': 0.9, \
342
+ 'distortion': 0.3, \
343
+ 'comp': 0.8, \
344
+ 'multiband_comp': 0.8, \
345
+ 'gain': 0.85, \
346
+ 'imager': 0.6, \
347
+ 'limiter': 1.0}
348
+ self.fx_processors = {}
349
+ for cur_fx in tgt_fx_names:
350
+ if cur_fx=='eq':
351
+ cur_fx_module = dasp_pytorch.ParametricEQ(sample_rate=sample_rate, \
352
+ min_gain_db = -10.0, \
353
+ max_gain_db = 10.0, \
354
+ min_q_factor = 0.5, \
355
+ max_q_factor=5.0)
356
+ elif cur_fx=='distortion':
357
+ cur_fx_module = Distortion(sample_rate=sample_rate,
358
+ min_gain_db = 0.0,
359
+ max_gain_db = 4.0)
360
+ elif cur_fx=='comp':
361
+ cur_fx_module = dasp_pytorch.Compressor(sample_rate=sample_rate)
362
+ elif cur_fx=='multiband_comp':
363
+ cur_fx_module = Multiband_Compressor(sample_rate=sample_rate,
364
+ min_threshold_db_comp = -30.0,
365
+ max_threshold_db_comp = -5.0,
366
+ min_ratio_comp = 1.5,
367
+ max_ratio_comp = 6.0,
368
+ min_attack_ms_comp = 1.0,
369
+ max_attack_ms_comp = 20.0,
370
+ min_release_ms_comp = 20.0,
371
+ max_release_ms_comp = 500.0,
372
+ min_threshold_db_exp = -30.0,
373
+ max_threshold_db_exp = -5.0,
374
+ min_ratio_exp = 0.0+EPS,
375
+ max_ratio_exp = 1.0-EPS,
376
+ min_attack_ms_exp = 1.0,
377
+ max_attack_ms_exp = 20.0,
378
+ min_release_ms_exp = 20.0,
379
+ max_release_ms_exp = 500.0,
380
+ )
381
+ elif cur_fx=='gain':
382
+ cur_fx_module = dasp_pytorch.Gain(sample_rate=sample_rate,
383
+ min_gain_db = 0.0,
384
+ max_gain_db = 6.0,)
385
+ elif cur_fx=='imager':
386
+ continue
387
+ elif cur_fx=='limiter':
388
+ cur_fx_module = Limiter(sample_rate=sample_rate,
389
+ min_threshold_db = -20.0,
390
+ max_threshold_db = 0.0-EPS,
391
+ min_attack_ms = 0.1,
392
+ max_attack_ms = 5.0,
393
+ min_release_ms = 20.0,
394
+ max_release_ms = 1000.0,)
395
+ else:
396
+ raise AssertionError(f"current fx name ({cur_fx}) not found")
397
+ self.fx_processors[cur_fx] = cur_fx_module
398
+ total_num_param = sum([self.fx_processors[cur_fx].num_params for cur_fx in self.fx_processors])
399
+ if 'imager' in tgt_fx_names:
400
+ total_num_param += 1
401
+ self.total_num_param = total_num_param
402
+
403
+
404
+ # network forward operation
405
+ def forward(self, x, rand_param=None, use_mask=None):
406
+ if rand_param is None:
407
+ rand_param = torch.rand((x.shape[0], self.total_num_param)).to(self.device)
408
+ else:
409
+ assert rand_param.shape[0]==x.shape[0] and rand_param.shape[1]==self.total_num_param
410
+ if use_mask is None:
411
+ use_mask = self.random_mask_generator(x.shape[0])
412
+
413
+ # dafx chain
414
+ cur_param_idx = 0
415
+ for cur_fx in self.tgt_fx_names:
416
+ cur_param_count = 1 if cur_fx=='imager' else self.fx_processors[cur_fx].num_params
417
+ if cur_fx=='imager':
418
+ x_processed = dasp_pytorch.functional.stereo_widener(x, \
419
+ sample_rate=self.sample_rate, \
420
+ width=rand_param[:,cur_param_idx:cur_param_idx+1])
421
+ else:
422
+ cur_input_param = rand_param[:, cur_param_idx:cur_param_idx+cur_param_count]
423
+ x_processed = self.fx_processors[cur_fx].process_normalized(x, cur_input_param)
424
+ # process all FX but decide to use the processed output based on probability
425
+ cur_mask = use_mask[cur_fx]
426
+ x = x_processed*cur_mask + x*~cur_mask
427
+ # update param index
428
+ cur_param_idx += cur_param_count
429
+
430
+ return x
431
+
432
+
433
+ def random_mask_generator(self, batch_size, repeat=1):
434
+ mask = {}
435
+ for cur_fx in self.tgt_fx_names:
436
+ mask[cur_fx] = self.fx_prob[cur_fx] > torch.rand(batch_size).view(-1, 1, 1)
437
+ if repeat>1:
438
+ mask[cur_fx] = mask[cur_fx].repeat(repeat, 1, 1)
439
+ mask[cur_fx] = mask[cur_fx].to(self.device)
440
+ return mask
441
+
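A usage sketch for the augmentation chain above: every effect in tgt_fx_names is computed, and the per-effect Bernoulli mask drawn from fx_prob decides whether the processed or the dry signal is carried forward. Stereo input is assumed since the 'imager' stage calls the stereo widener; device placement is left to the caller:

import torch

augment = Random_Augmentation_Dasp(
    sample_rate=44100,
    tgt_fx_names=['eq', 'comp', 'multiband_comp', 'gain', 'imager', 'limiter'],
)
x = torch.randn(4, 2, 44100).to(augment.device)               # (bs, chs, seq_len)
rand_param = torch.rand(4, augment.total_num_param).to(augment.device)
y = augment(x, rand_param=rand_param)                          # same shape as x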
networks/network_utils.py ADDED
@@ -0,0 +1,254 @@
1
+ """
2
+ Utility File
3
+ containing functions for neural networks
4
+ """
5
+ import torch.nn as nn
6
+ import torch.nn.functional as F
7
+ import torch.nn.init as init
8
+ import torch
9
+ import torchaudio
10
+
11
+
12
+
13
+ # 2-dimensional convolutional layer
14
+ # in the order of conv -> norm -> activation
15
+ class Conv2d_layer(nn.Module):
16
+ def __init__(self, in_channels, out_channels, kernel_size, \
17
+ stride=1, \
18
+ padding="SAME", dilation=(1,1), bias=True, \
19
+ norm="batch", activation="relu", \
20
+ mode="conv"):
21
+ super(Conv2d_layer, self).__init__()
22
+
23
+ self.conv2d = nn.Sequential()
24
+
25
+ if isinstance(kernel_size, int):
26
+ kernel_size = [kernel_size, kernel_size]
27
+ if isinstance(stride, int):
28
+ stride = [stride, stride]
29
+ if isinstance(dilation, int):
30
+ dilation = [dilation, dilation]
31
+
32
+ ''' padding '''
33
+ if mode=="deconv":
34
+ padding = tuple(int((current_kernel - 1)/2) for current_kernel in kernel_size)
35
+ out_padding = tuple(0 if current_stride == 1 else 1 for current_stride in stride)
36
+ elif mode=="conv":
37
+ if padding == "SAME":
38
+ f_pad = int((kernel_size[0]-1) * dilation[0])
39
+ t_pad = int((kernel_size[1]-1) * dilation[1])
40
+ t_l_pad = int(t_pad//2)
41
+ t_r_pad = t_pad - t_l_pad
42
+ f_l_pad = int(f_pad//2)
43
+ f_r_pad = f_pad - f_l_pad
44
+ padding_area = (t_l_pad, t_r_pad, f_l_pad, f_r_pad)
45
+ elif padding == "VALID":
46
+ padding = 0
47
+ else:
48
+ pass
49
+
50
+ ''' convolutional layer '''
51
+ if mode=="deconv":
52
+ self.conv2d.add_module("deconv2d", nn.ConvTranspose2d(in_channels, out_channels, \
53
+ (kernel_size[0], kernel_size[1]), \
54
+ stride=stride, \
55
+ padding=padding, output_padding=out_padding, \
56
+ dilation=dilation, \
57
+ bias=bias))
58
+ elif mode=="conv":
59
+ self.conv2d.add_module(f"{mode}2d_pad", nn.ReflectionPad2d(padding_area))
60
+ self.conv2d.add_module(f"{mode}2d", nn.Conv2d(in_channels, out_channels, \
61
+ (kernel_size[0], kernel_size[1]), \
62
+ stride=stride, \
63
+ padding=0, \
64
+ dilation=dilation, \
65
+ bias=bias))
66
+
67
+ ''' normalization '''
68
+ if norm=="batch":
69
+ self.conv2d.add_module("batch_norm", nn.BatchNorm2d(out_channels))
70
+
71
+ ''' activation '''
72
+ if activation=="relu":
73
+ self.conv2d.add_module("relu", nn.ReLU())
74
+ elif activation=="lrelu":
75
+ self.conv2d.add_module("lrelu", nn.LeakyReLU())
76
+
77
+
78
+ def forward(self, input):
79
+ # input shape should be : batch x channel x height x width
80
+ output = self.conv2d(input)
81
+ return output
82
+
83
+
84
+
85
+ # 1-dimensional convolutional layer
86
+ # in the order of conv -> norm -> activation
87
+ class Conv1d_layer(nn.Module):
88
+ def __init__(self, in_channels, out_channels, kernel_size, \
89
+ stride=1, \
90
+ padding="SAME", dilation=1, bias=True, \
91
+ norm="batch", activation="relu", \
92
+ mode="conv"):
93
+ super(Conv1d_layer, self).__init__()
94
+
95
+ self.conv1d = nn.Sequential()
96
+
97
+ ''' padding '''
98
+ if mode=="deconv":
99
+ padding = int(dilation * (kernel_size-1) / 2)
100
+ out_padding = 0 if stride==1 else 1
101
+ elif mode=="conv" or "alias_free" in mode:
102
+ if padding == "SAME":
103
+ pad = int((kernel_size-1) * dilation)
104
+ l_pad = int(pad//2)
105
+ r_pad = pad - l_pad
106
+ padding_area = (l_pad, r_pad)
107
+ elif padding == "VALID":
108
+ padding_area = (0, 0)
109
+ else:
110
+ pass
111
+
112
+ ''' convolutional layer '''
113
+ if mode=="deconv":
114
+ self.conv1d.add_module("deconv1d", nn.ConvTranspose1d(in_channels, out_channels, kernel_size, \
115
+ stride=stride, padding=padding, output_padding=out_padding, \
116
+ dilation=dilation, \
117
+ bias=bias))
118
+ elif mode=="conv":
119
+ self.conv1d.add_module(f"{mode}1d_pad", nn.ReflectionPad1d(padding_area))
120
+ self.conv1d.add_module(f"{mode}1d", nn.Conv1d(in_channels, out_channels, kernel_size, \
121
+ stride=stride, padding=0, \
122
+ dilation=dilation, \
123
+ bias=bias))
124
+ elif "alias_free" in mode:
125
+ if "up" in mode:
126
+ up_factor = stride * 2
127
+ down_factor = 2
128
+ elif "down" in mode:
129
+ up_factor = 2
130
+ down_factor = stride * 2
131
+ else:
132
+ raise ValueError("choose alias-free method : 'up' or 'down'")
133
+ # procedure : conv -> upsample -> lrelu -> low-pass filter -> downsample
134
+ # torchaudio.transforms.Resample defaults to the 'sinc_interpolation' resampling method, which applies a low-pass filter as part of resampling
135
+ # details at https://pytorch.org/audio/stable/transforms.html
136
+ self.conv1d.add_module(f"{mode}1d_pad", nn.ReflectionPad1d(padding_area))
137
+ self.conv1d.add_module(f"{mode}1d", nn.Conv1d(in_channels, out_channels, kernel_size, \
138
+ stride=1, padding=0, \
139
+ dilation=dilation, \
140
+ bias=bias))
141
+ self.conv1d.add_module(f"{mode}upsample", torchaudio.transforms.Resample(orig_freq=1, new_freq=up_factor))
142
+ self.conv1d.add_module(f"{mode}lrelu", nn.LeakyReLU())
143
+ self.conv1d.add_module(f"{mode}downsample", torchaudio.transforms.Resample(orig_freq=down_factor, new_freq=1))
144
+
145
+ ''' normalization '''
146
+ if norm=="batch":
147
+ self.conv1d.add_module("batch_norm", nn.BatchNorm1d(out_channels))
148
+ # self.conv1d.add_module("batch_norm", nn.SyncBatchNorm(out_channels))
149
+
150
+ ''' activation '''
151
+ if 'alias_free' not in mode:
152
+ if activation=="relu":
153
+ self.conv1d.add_module("relu", nn.ReLU())
154
+ elif activation=="lrelu":
155
+ self.conv1d.add_module("lrelu", nn.LeakyReLU())
156
+
157
+
158
+ def forward(self, input):
159
+ # input shape should be : batch x channel x height x width
160
+ output = self.conv1d(input)
161
+ return output
162
+
163
+
164
+
165
+ # Residual Block
166
+ # the input is added to the output of the first convolutional layer, which therefore keeps the original channel count
167
+ # only the second convolutional layer may change the number of output channels
168
+ class Res_ConvBlock(nn.Module):
169
+ def __init__(self, dimension, \
170
+ in_channels, out_channels, \
171
+ kernel_size, \
172
+ stride=1, padding="SAME", \
173
+ dilation=1, \
174
+ bias=True, \
175
+ norm="batch", \
176
+ activation="relu", last_activation="relu", \
177
+ mode="conv"):
178
+ super(Res_ConvBlock, self).__init__()
179
+
180
+ if dimension==1:
181
+ self.conv1 = Conv1d_layer(in_channels, in_channels, kernel_size, padding=padding, dilation=dilation, bias=bias, norm=norm, activation=activation)
182
+ self.conv2 = Conv1d_layer(in_channels, out_channels, kernel_size, stride=stride, padding=padding, dilation=dilation, bias=bias, norm=norm, activation=last_activation, mode=mode)
183
+ elif dimension==2:
184
+ self.conv1 = Conv2d_layer(in_channels, in_channels, kernel_size, padding=padding, dilation=dilation, bias=bias, norm=norm, activation=activation)
185
+ self.conv2 = Conv2d_layer(in_channels, out_channels, kernel_size, stride=stride, padding=padding, dilation=dilation, bias=bias, norm=norm, activation=last_activation, mode=mode)
186
+
187
+
188
+ def forward(self, input):
189
+ c1_out = self.conv1(input) + input
190
+ c2_out = self.conv2(c1_out)
191
+ return c2_out
192
+
193
+
194
+
195
+ # Convoluaionl Block
196
+ # consists of multiple (number of layer_num) convolutional layers
197
+ # only the final convoluational layer outputs the desired 'out_channels'
198
+ class ConvBlock(nn.Module):
199
+ def __init__(self, dimension, layer_num, \
200
+ in_channels, out_channels, \
201
+ kernel_size, \
202
+ stride=1, padding="SAME", \
203
+ dilation=1, \
204
+ bias=True, \
205
+ norm="batch", \
206
+ activation="relu", last_activation="relu", \
207
+ mode="conv"):
208
+ super(ConvBlock, self).__init__()
209
+
210
+ conv_block = []
211
+ if dimension==1:
212
+ for i in range(layer_num-1):
213
+ conv_block.append(Conv1d_layer(in_channels, in_channels, kernel_size, padding=padding, dilation=dilation, bias=bias, norm=norm, activation=activation))
214
+ conv_block.append(Conv1d_layer(in_channels, out_channels, kernel_size, stride=stride, padding=padding, dilation=dilation, bias=bias, norm=norm, activation=last_activation, mode=mode))
215
+ elif dimension==2:
216
+ for i in range(layer_num-1):
217
+ conv_block.append(Conv2d_layer(in_channels, in_channels, kernel_size, padding=padding, dilation=dilation, bias=bias, norm=norm, activation=activation))
218
+ conv_block.append(Conv2d_layer(in_channels, out_channels, kernel_size, stride=stride, padding=padding, dilation=dilation, bias=bias, norm=norm, activation=last_activation, mode=mode))
219
+ self.conv_block = nn.Sequential(*conv_block)
220
+
221
+
222
+ def forward(self, input):
223
+ return self.conv_block(input)
224
+
225
+
226
+ # Feature-wise Linear Modulation
227
+ class FiLM(nn.Module):
228
+ def __init__(self, condition_len=2048, feature_len=1024):
229
+ super(FiLM, self).__init__()
230
+ self.film_fc = nn.Linear(condition_len, feature_len*2)
231
+ self.feat_len = feature_len
232
+
233
+
234
+ def forward(self, feature, condition, sefa=None):
235
+ # SeFA
236
+ if sefa:
237
+ weight = self.film_fc.weight.T
238
+ weight = weight / torch.linalg.norm((weight+1e-07), dim=0, keepdims=True)
239
+ eigen_values, eigen_vectors = torch.eig(torch.matmul(weight, weight.T), eigenvectors=True)
240
+
241
+ ####### custom parameters #######
242
+ chosen_eig_idx = sefa[0]
243
+ alpha = eigen_values[chosen_eig_idx][0] * sefa[1]
244
+ #################################
245
+
246
+ An = eigen_vectors[chosen_eig_idx].repeat(condition.shape[0], 1)
247
+ alpha_An = alpha * An
248
+
249
+ condition += alpha_An
250
+
251
+ film_factor = self.film_fc(condition).unsqueeze(-1)
252
+ r, b = torch.split(film_factor, self.feat_len, dim=1)
253
+ return r*feature + b
254
+