StormblessedKal committed
Commit e385e48 · 1 Parent(s): 0951938
src/__pycache__/predict.cpython-310.pyc CHANGED
Binary files a/src/__pycache__/predict.cpython-310.pyc and b/src/__pycache__/predict.cpython-310.pyc differ
 
src/predict.py CHANGED
@@ -266,27 +266,31 @@ class Predictor:
             "voice_clone_2":f"https://demovidelyusergenerations.s3.amazonaws.com/{gen_id}-voice-clone-2.mp3"
             }
         if method_type == 'voice_clone_with_emotions':
-            print("INSIDE emotions")
-            base_speaker_tts,tone_color_converter = self.base_speaker_tts,self.tone_color_converter
-            reference_speaker = local_file_path
-            target_se, audio_name = se_extractor.get_se(reference_speaker, tone_color_converter, target_dir=openvoice_dir, vad=False)
-            src_path = os.path.join(results_dir,f"{gen_id}-tmp-emotions.wav")
-            openvoice_output = os.path.join(results_dir,f"{gen_id}-4.wav")
-            base_speaker_tts.tts(passage,src_path,speaker='default',language='English',speed=1.0,use_emotions=True)
-            source_se = torch.load(f'{self.ckpt_base}/en_style_se.pth').to(self.device)
-            tone_color_converter.convert(audio_src_path=src_path,src_se=source_se,tgt_se=target_se,output_path=openvoice_output,message='')
-            if process_audio:
-                (new_sr, wav1) = self._fn(openvoice_output,"Midpoint",32,0.5)
-                sf.write(openvoice_output,wav1,new_sr)
-
-            mp3_final_output_1 = str(openvoice_output).replace('wav','mp3')
-            self.convert_wav_to_mp3(openvoice_output,mp3_final_output_1)
-            print(mp3_final_output_1)
-            self.upload_file_to_s3(mp3_final_output_1,'demovidelyusergenerations',f"{gen_id}-voice-with-emotions.mp3")
-            shutil.rmtree(os.path.join(output_dir,gen_id))
-            return {"voice_clone_with_emotions":f"https://demovidelyusergenerations.s3.amazonaws.com/{gen_id}-voice-with-emotions.mp3"
-            }
-
+            try:
+                print("INSIDE emotions")
+                base_speaker_tts,tone_color_converter = self.base_speaker_tts,self.tone_color_converter
+                reference_speaker = local_file_path
+                print("here 1")
+                target_se, audio_name = se_extractor.get_se(reference_speaker, tone_color_converter, target_dir=openvoice_dir, vad=False)
+                print("here 2")
+                src_path = os.path.join(results_dir,f"{gen_id}-tmp-emotions.wav")
+                openvoice_output = os.path.join(results_dir,f"{gen_id}-4.wav")
+                base_speaker_tts.tts(passage,src_path,speaker='default',language='English',speed=1.0,use_emotions=True)
+                source_se = torch.load(f'{self.ckpt_base}/en_style_se.pth').to(self.device)
+                tone_color_converter.convert(audio_src_path=src_path,src_se=source_se,tgt_se=target_se,output_path=openvoice_output,message='')
+                if process_audio:
+                    (new_sr, wav1) = self._fn(openvoice_output,"Midpoint",32,0.5)
+                    sf.write(openvoice_output,wav1,new_sr)
+
+                mp3_final_output_1 = str(openvoice_output).replace('wav','mp3')
+                self.convert_wav_to_mp3(openvoice_output,mp3_final_output_1)
+                print(mp3_final_output_1)
+                self.upload_file_to_s3(mp3_final_output_1,'demovidelyusergenerations',f"{gen_id}-voice-with-emotions.mp3")
+                shutil.rmtree(os.path.join(output_dir,gen_id))
+                return {"voice_clone_with_emotions":f"https://demovidelyusergenerations.s3.amazonaws.com/{gen_id}-voice-with-emotions.mp3"
+                }
+            except Exception as e:
+                return {"error":f"Unexpected error: {e}"}
         if method_type == 'voice_clone_with_multi_lang':
             print("Inside multilang")
             #voice clone with multi-lingual
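
For reference, the change above applies a broad error-handling pattern: the entire emotions branch now runs inside try/except, so a failure anywhere in the pipeline reaches the caller as a structured payload rather than an unhandled exception. A minimal sketch of that pattern in isolation; synthesize_with_emotions and the local names in it are hypothetical stand-ins, not functions from this repository:

import traceback

def synthesize_with_emotions(gen_id: str) -> dict:
    try:
        # TTS, tone-color conversion, post-processing, and the S3 upload
        # would run here; any of these steps may raise.
        result_key = f"{gen_id}-voice-with-emotions.mp3"
        return {"voice_clone_with_emotions": f"https://demovidelyusergenerations.s3.amazonaws.com/{result_key}"}
    except Exception as e:
        traceback.print_exc()  # keep the stack trace in the server logs
        return {"error": f"Unexpected error: {e}"}

One side effect worth noting: if an exception fires before the shutil.rmtree cleanup at the end of the try block, the working directory for that generation is left behind.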
src/se_extractor.py CHANGED
@@ -7,13 +7,11 @@ from pydub import AudioSegment
 from faster_whisper import WhisperModel
 from whisper_timestamped.transcribe import get_audio_tensor, get_vad_segments
 
-model_size = "medium"
 # Run on GPU with FP16
 model = None
 def split_audio_whisper(audio_path, target_dir='processed',needs_offset=True):
-    global model
-    if model is None:
-        model = WhisperModel(model_size, device="cuda", compute_type="float16")
+    print("in whisper split")
+    model = WhisperModel('medium', device="cuda:0", compute_type="float16")
     audio = AudioSegment.from_file(audio_path)
     max_len = len(audio)
 
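
The rewritten split_audio_whisper also changes loading behavior: the old code built one WhisperModel lazily and cached it in the module-level model global, while the new code constructs a fresh model on every call, pinning it to cuda:0 but paying the checkpoint-load cost each time. A minimal sketch of a cached variant that keeps the one-time load while still targeting cuda:0, assuming faster-whisper is installed and a CUDA device is available:

from functools import lru_cache

from faster_whisper import WhisperModel

@lru_cache(maxsize=1)
def get_whisper_model() -> WhisperModel:
    # Constructed once on first use, then reused by every later call.
    return WhisperModel("medium", device="cuda:0", compute_type="float16")

Calling get_whisper_model() inside split_audio_whisper would restore the caching without reintroducing the global statement.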