MAZALA2024 committed
Commit c5224f7 · verified · 1 Parent(s): 0741c2d

Update vc_infer_pipeline.py

Files changed (1)
  1. vc_infer_pipeline.py +11 -11
vc_infer_pipeline.py CHANGED
@@ -169,7 +169,7 @@ class VC(object):
         model,
         net_g,
         sid,
-        audio0,
+        audio,
         pitch,
         pitchf,
         times,
@@ -179,7 +179,8 @@ class VC(object):
         version,
         protect,
     ):
-        feats = torch.from_numpy(audio0)
+        logger.info(f"VC input shape: {audio.shape}")
+        feats = torch.from_numpy(audio)
         if self.is_half:
             feats = feats.half()
         else:
@@ -189,7 +190,7 @@ class VC(object):
         assert feats.dim() == 1, feats.dim()
         feats = feats.view(1, -1)
         padding_mask = torch.BoolTensor(feats.shape).to(self.device).fill_(False)
-
+
         inputs = {
             "source": feats.to(self.device),
             "padding_mask": padding_mask,
@@ -209,32 +210,30 @@ class VC(object):
             npy = feats[0].cpu().numpy()
             if self.is_half:
                 npy = npy.astype("float32")
-
+
             score, ix = index.search(npy, k=8)
             weight = np.square(1 / score)
             weight /= weight.sum(axis=1, keepdims=True)
             npy = np.sum(big_npy[ix] * np.expand_dims(weight, axis=2), axis=1)
-
+
             if self.is_half:
                 npy = npy.astype("float16")
             feats = (
                 torch.from_numpy(npy).unsqueeze(0).to(self.device) * index_rate
                 + (1 - index_rate) * feats
             )
-
+
         feats = F.interpolate(feats.permute(0, 2, 1), scale_factor=2).permute(0, 2, 1)
         if protect < 0.5 and pitch is not None and pitchf is not None:
-            feats0 = F.interpolate(feats0.permute(0, 2, 1), scale_factor=2).permute(
-                0, 2, 1
-            )
+            feats0 = F.interpolate(feats0.permute(0, 2, 1), scale_factor=2).permute(0, 2, 1)
         t1 = ttime()
-        p_len = audio0.shape[0] // self.window
+        p_len = audio.shape[0] // self.window
         if feats.shape[1] < p_len:
             p_len = feats.shape[1]
         if pitch is not None and pitchf is not None:
             pitch = pitch[:, :p_len]
             pitchf = pitchf[:, :p_len]
-
+
         if protect < 0.5 and pitch is not None and pitchf is not None:
             pitchff = pitchf.clone()
             pitchff[pitchf > 0] = 1
@@ -261,6 +260,7 @@ class VC(object):
         t2 = ttime()
         times[0] += t1 - t0
         times[2] += t2 - t1
+        logger.info(f"VC output shape: {audio1.shape}")
         return audio1
 
     def pipeline(
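
For context on the hunk above: the surrounding code is the pipeline's optional feature-retrieval step, where each frame of `feats` is matched against the k=8 nearest entries of a FAISS index built over the training features in `big_npy`, the neighbours are averaged with inverse-squared-distance weights, and the result is mixed back into the live features by `index_rate`. Below is a minimal standalone sketch of that weighting, not the project's own code: the function name `blend_with_index` is made up here, and it assumes `feats` is a float32 `(T, D)` array laid out like `big_npy`.

```python
import numpy as np
import faiss  # same library behind the pipeline's `index.search(npy, k=8)` call


def blend_with_index(feats, index, big_npy, index_rate):
    """Sketch of the retrieval/blend step seen in the diff context above.

    feats:      (T, D) float32 frame features
    index:      a FAISS index built over big_npy
    big_npy:    (N, D) float32 training features
    index_rate: 0.0 keeps feats untouched, 1.0 uses purely retrieved features
    """
    # nearest-neighbour search: distances and row ids for every frame
    score, ix = index.search(feats, k=8)
    # inverse-squared-distance weights, normalised per frame
    weight = np.square(1 / score)
    weight /= weight.sum(axis=1, keepdims=True)
    # weighted average of the retrieved training features
    retrieved = np.sum(big_npy[ix] * np.expand_dims(weight, axis=2), axis=1)
    # linear blend controlled by index_rate
    return index_rate * retrieved + (1 - index_rate) * feats
```

The condition that decides whether this lookup runs at all sits just above the hunk and is not shown in the diff, so the sketch omits those guards.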