update loss
- inference.py +1 -1
- modules/loss.py +91 -24
inference.py
CHANGED
@@ -99,6 +99,7 @@ class MasteringStyleTransfer:
             target = ito_config['clap_text_prompt']
             print(f'ito_config clap_distance_fn: {ito_config["clap_distance_fn"]}')
             total_loss = self.clap_loss(output_audio, target, self.args.sample_rate, distance_fn=ito_config['clap_distance_fn'])
+            print(f'total_loss: {total_loss}')

             if total_loss < min_loss:
                 min_loss = total_loss.item()
@@ -243,7 +244,6 @@ class MasteringStyleTransfer:
         if isinstance(param_value, torch.Tensor):
             param_value = param_value.item()

-        print(f"fx name: {fx_name} param_name: {param_name}")
         if fx_name in param_mapper and param_name in param_mapper[fx_name]:
             friendly_name, unit, min_val, max_val = param_mapper[fx_name][param_name]
             if unit=='%':
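The self.clap_loss called in the first hunk is the CLAPFeatureLoss updated in modules/loss.py below. A minimal usage sketch, assuming the class can be imported from modules.loss and constructed without arguments; the dummy tensors, prompt, and sample rate are placeholders, not values from this Space:

import torch
from modules.loss import CLAPFeatureLoss  # import path assumed from this repo's layout

clap_loss = CLAPFeatureLoss()  # assumed argument-free; downloads the default laion_clap checkpoint

sample_rate = 44100
output_audio = torch.randn(1, 2, sample_rate * 5)     # dummy (N, C, T) stereo mix
reference_audio = torch.randn(1, 2, sample_rate * 5)  # dummy reference master

# Text target, as in the ITO loop above (clap_text_prompt)
text_loss = clap_loss(output_audio, ["a warm, punchy master"], sample_rate, distance_fn='cosine')

# Audio target: compares CLAP audio embeddings of input and reference
audio_loss = clap_loss(output_audio, reference_audio, sample_rate, distance_fn='mse')

print(f'total_loss: {text_loss.item():.4f} (text), {audio_loss.item():.4f} (audio)')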
modules/loss.py
CHANGED
@@ -185,25 +185,35 @@ class CLAPFeatureLoss(nn.Module):
         self.target_sample_rate = 48000  # CLAP expects 48kHz audio
         self.model = laion_clap.CLAP_Module(enable_fusion=False)
         self.model.load_ckpt()  # download the default pretrained checkpoint
+
+        # Freeze the CLAP model parameters
+        for param in self.model.parameters():
+            param.requires_grad = False

-    def forward(self, input_audio, target, sample_rate, distance_fn='cosine'):
+    def forward(self, input_audio, target, sample_rate, distance_fn='mse'):
         # Process input audio
-        input_embed = self.process_audio(input_audio, sample_rate)
+        with torch.no_grad():
+            input_audio = self.preprocess_audio(input_audio, sample_rate)
+
+        with torch.enable_grad():
+            input_embed = self.model.get_audio_embedding_from_data(x=input_audio, use_tensor=True)

         # Process target (audio or text)
-        if isinstance(target, torch.Tensor):
-            target_embed = self.process_audio(target, sample_rate)
-        elif isinstance(target, str) or (isinstance(target, list) and isinstance(target[0], str)):
-            target_embed = self.process_text(target)
-        else:
-            raise ValueError("Target must be either audio tensor or text (string or list of strings)")
+        with torch.no_grad():
+            if isinstance(target, torch.Tensor):
+                target_audio = self.preprocess_audio(target, sample_rate)
+                target_embed = self.model.get_audio_embedding_from_data(x=target_audio, use_tensor=True)
+            elif isinstance(target, str) or (isinstance(target, list) and isinstance(target[0], str)):
+                target_embed = self.model.get_text_embedding(target, use_tensor=True)
+            else:
+                raise ValueError("Target must be either audio tensor or text (string or list of strings)")

         # Compute loss using the specified distance function
         loss = self.compute_distance(input_embed, target_embed, distance_fn)

         return loss

-    def process_audio(self, audio, sample_rate):
+    def preprocess_audio(self, audio, sample_rate):
         # Ensure input is in the correct shape (N, C, T)
         if audio.dim() == 2:
             audio = audio.unsqueeze(1)
@@ -219,19 +229,7 @@ class CLAPFeatureLoss(nn.Module):
         # Quantize audio data
         audio = self.quantize(audio)

-        # Get CLAP embeddings
-        with torch.no_grad():
-            embed = self.model.get_audio_embedding_from_data(x=audio, use_tensor=True)
-        return embed
-
-    def process_text(self, text):
-        # Get CLAP embeddings for text
-        # ensure input is a list of strings
-        if not isinstance(text, list):
-            text = [text]
-        with torch.no_grad():
-            embed = self.model.get_text_embedding(text, use_tensor=True)
-        return embed
+        return audio

     def compute_distance(self, x, y, distance_fn):
         if distance_fn == 'mse':
@@ -249,11 +247,80 @@ class CLAPFeatureLoss(nn.Module):
         audio = (audio * 32767.0).to(torch.int16).to(torch.float32) / 32767.0
         return audio

-    def resample(self, audio, input_sample_rate):
+    def resample(self, audio, orig_sample_rate):
         resampler = torchaudio.transforms.Resample(
-            orig_freq=input_sample_rate, new_freq=self.target_sample_rate
+            orig_freq=orig_sample_rate, new_freq=self.target_sample_rate
         ).to(audio.device)
         return resampler(audio)
+
+    # def forward(self, input_audio, target, sample_rate, distance_fn='cosine'):
+    #     # Process input audio
+    #     input_embed = self.process_audio(input_audio, sample_rate)
+
+    #     # Process target (audio or text)
+    #     if isinstance(target, torch.Tensor):
+    #         target_embed = self.process_audio(target, sample_rate)
+    #     elif isinstance(target, str) or (isinstance(target, list) and isinstance(target[0], str)):
+    #         target_embed = self.process_text(target)
+    #     else:
+    #         raise ValueError("Target must be either audio tensor or text (string or list of strings)")
+
+    #     # Compute loss using the specified distance function
+    #     loss = self.compute_distance(input_embed, target_embed, distance_fn)
+
+    #     return loss
+
+    # def process_audio(self, audio, sample_rate):
+    #     # Ensure input is in the correct shape (N, C, T)
+    #     if audio.dim() == 2:
+    #         audio = audio.unsqueeze(1)
+
+    #     # Convert to mono if stereo
+    #     if audio.shape[1] > 1:
+    #         audio = audio.mean(dim=1, keepdim=True)
+
+    #     # Resample if necessary
+    #     if sample_rate != self.target_sample_rate:
+    #         audio = self.resample(audio, sample_rate)
+
+    #     # Quantize audio data
+    #     audio = self.quantize(audio)
+
+    #     # Get CLAP embeddings
+    #     with torch.no_grad():
+    #         embed = self.model.get_audio_embedding_from_data(x=audio, use_tensor=True)
+    #     return embed
+
+    # def process_text(self, text):
+    #     # Get CLAP embeddings for text
+    #     # ensure input is a list of strings
+    #     if not isinstance(text, list):
+    #         text = [text]
+    #     with torch.no_grad():
+    #         embed = self.model.get_text_embedding(text, use_tensor=True)
+    #     return embed
+
+    # def compute_distance(self, x, y, distance_fn):
+    #     if distance_fn == 'mse':
+    #         return F.mse_loss(x, y)
+    #     elif distance_fn == 'l1':
+    #         return F.l1_loss(x, y)
+    #     elif distance_fn == 'cosine':
+    #         return 1 - F.cosine_similarity(x, y).mean()
+    #     else:
+    #         raise ValueError(f"Unsupported distance function: {distance_fn}")
+
+    # def quantize(self, audio):
+    #     audio = audio.squeeze(1)  # Remove channel dimension
+    #     audio = torch.clamp(audio, -1.0, 1.0)
+    #     audio = (audio * 32767.0).to(torch.int16).to(torch.float32) / 32767.0
+    #     return audio
+
+    # def resample(self, audio, input_sample_rate):
+    #     resampler = torchaudio.transforms.Resample(
+    #         orig_freq=input_sample_rate, new_freq=self.target_sample_rate
+    #     ).to(audio.device)
+    #     return resampler(audio)


 """
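For reference, a standalone sketch of the three distance options selected by distance_fn ('mse', 'l1', 'cosine'). It mirrors the commented-out compute_distance kept in the diff above; the dummy embeddings are placeholders standing in for CLAP audio/text embeddings:

import torch
import torch.nn.functional as F

def compute_distance(x: torch.Tensor, y: torch.Tensor, distance_fn: str) -> torch.Tensor:
    # Same branching as the commented-out compute_distance above
    if distance_fn == 'mse':
        return F.mse_loss(x, y)
    elif distance_fn == 'l1':
        return F.l1_loss(x, y)
    elif distance_fn == 'cosine':
        # Cosine distance: 1 minus the mean cosine similarity over the batch
        return 1 - F.cosine_similarity(x, y).mean()
    else:
        raise ValueError(f"Unsupported distance function: {distance_fn}")

# Dummy embeddings standing in for CLAP audio/text embeddings (512-d in laion_clap's default setup)
a, b = torch.randn(2, 512), torch.randn(2, 512)
print(compute_distance(a, b, 'cosine'))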