Spaces:

mazalaai
/

tts

Sleeping

App Files Files Community

MAZALA2024 committed on Oct 18, 2024

Commit

454fba8

verified ·

1 Parent(s): c5224f7

Update vc_infer_pipeline.py

Browse files

Files changed (1) hide show

vc_infer_pipeline.py +46 -38

vc_infer_pipeline.py CHANGED Viewed

@@ -284,6 +284,7 @@ class VC(object):
         protect,
         f0_file=None,
     ):
         if (
             file_index != ""
             and os.path.exists(file_index)
@@ -313,6 +314,7 @@ class VC(object):
                         == np.abs(audio_sum[t - self.t_query : t + self.t_query]).min()
                     )[0][0]
                 )
         s = 0
         audio_opt = []
         t = None
@@ -350,17 +352,54 @@ class VC(object):
             pitchf = torch.tensor(pitchf, device=self.device).unsqueeze(0).float()
         t2 = ttime()
         times[1] += t2 - t1
-        for t in opt_ts:
             t = t // self.window * self.window
             if if_f0 == 1:
                 audio_opt.append(
                     self.vc(
                         model,
                         net_g,
                         sid,
-                        audio_pad[s : t + self.t_pad2 + self.window],
-                        pitch[:, s // self.window : (t + self.t_pad2) // self.window],
-                        pitchf[:, s // self.window : (t + self.t_pad2) // self.window],
                         times,
                         index,
                         big_npy,
@@ -375,7 +414,7 @@ class VC(object):
                         model,
                         net_g,
                         sid,
-                        audio_pad[s : t + self.t_pad2 + self.window],
                         None,
                         None,
                         times,
@@ -386,39 +425,8 @@ class VC(object):
                         protect,
                     )[self.t_pad_tgt : -self.t_pad_tgt]
                 )
-            s = t
-        if t is not None:
-            if if_f0 == 1:
-                audio_piece = self.process_batch(
-                    model,
-                    net_g,
-                    sid,
-                    [audio_pad[t:]],
-                    [pitch[:, t // self.window :]],
-                    [pitchf[:, t // self.window :]],
-                    times,
-                    index,
-                    big_npy,
-                    index_rate,
-                    version,
-                    protect,
-                )[0]
-            else:
-                audio_piece = self.process_batch(
-                    model,
-                    net_g,
-                    sid,
-                    [audio_pad[t:]],
-                    None,
-                    None,
-                    times,
-                    index,
-                    big_npy,
-                    index_rate,
-                    version,
-                    protect,
-                )[0]
-            audio_opt.append(audio_piece[self.t_pad_tgt : -self.t_pad_tgt])
         if not audio_opt:
             raise ValueError("No audio segments were generated")

         protect,
         f0_file=None,
     ):
+        logger.info(f"Starting pipeline with audio shape: {audio.shape}")
         if (
             file_index != ""
             and os.path.exists(file_index)
                         == np.abs(audio_sum[t - self.t_query : t + self.t_query]).min()
                     )[0][0]
                 )
+        logger.info(f"Number of opt_ts: {len(opt_ts)}")
         s = 0
         audio_opt = []
         t = None
             pitchf = torch.tensor(pitchf, device=self.device).unsqueeze(0).float()
         t2 = ttime()
         times[1] += t2 - t1
+        for i, t in enumerate(opt_ts):
             t = t // self.window * self.window
+            logger.info(f"Processing segment {i+1}/{len(opt_ts)}")
+            if if_f0 == 1:
+                segment = self.vc(
+                    model,
+                    net_g,
+                    sid,
+                    audio_pad[s : t + self.t_pad2 + self.window],
+                    pitch[:, s // self.window : (t + self.t_pad2) // self.window],
+                    pitchf[:, s // self.window : (t + self.t_pad2) // self.window],
+                    times,
+                    index,
+                    big_npy,
+                    index_rate,
+                    version,
+                    protect,
+                )[self.t_pad_tgt : -self.t_pad_tgt]
+            else:
+                segment = self.vc(
+                    model,
+                    net_g,
+                    sid,
+                    audio_pad[s : t + self.t_pad2 + self.window],
+                    None,
+                    None,
+                    times,
+                    index,
+                    big_npy,
+                    index_rate,
+                    version,
+                    protect,
+                )[self.t_pad_tgt : -self.t_pad_tgt]
+            logger.info(f"Segment {i+1} shape: {segment.shape}")
+            audio_opt.append(segment)
+            s = t
+        if t is not None:
+            logger.info("Processing final segment")
             if if_f0 == 1:
                 audio_opt.append(
                     self.vc(
                         model,
                         net_g,
                         sid,
+                        audio_pad[t:],
+                        pitch[:, t // self.window :],
+                        pitchf[:, t // self.window :],
                         times,
                         index,
                         big_npy,
                         model,
                         net_g,
                         sid,
+                        audio_pad[t:],
                         None,
                         None,
                         times,
                         protect,
                     )[self.t_pad_tgt : -self.t_pad_tgt]
                 )
+        logger.info(f"Number of audio
         if not audio_opt:
             raise ValueError("No audio segments were generated")