MAZALA2024 committed on
Commit
8ebe7fa
·
verified ·
1 Parent(s): 2a969d1

Update voice_processing.py

Browse files
Files changed (1) hide show
  1. voice_processing.py +32 -41
voice_processing.py CHANGED
@@ -95,47 +95,38 @@ def process_audio(model, audio_file, logger, index_rate=0, use_uploaded_voice=Tr
95
  if model is None:
96
  logger.error("No model provided for processing")
97
  return None
98
-
99
- # Load audio
100
- sr, audio = wavfile.read(audio_file)
101
- logger.info(f"Loaded audio: sr={sr}Hz, shape={audio.shape}")
102
-
103
- # Convert to mono if needed
104
- if len(audio.shape) > 1:
105
- audio = np.mean(audio, axis=1)
106
- audio = audio.astype(np.float32)
107
-
108
- # Prepare input tensor
109
- input_tensor = torch.FloatTensor(audio)
110
- if torch.cuda.is_available():
111
- input_tensor = input_tensor.cuda()
112
- model = model.cuda()
113
-
114
- # Process through model
115
- with torch.no_grad():
116
- # Prepare required arguments for model.infer()
117
- phone = input_tensor.unsqueeze(0) # Add batch dimension [1, sequence_length]
118
- phone_lengths = torch.LongTensor([len(input_tensor)]).to(input_tensor.device)
119
- pitch = torch.zeros(1, len(input_tensor)).to(input_tensor.device) # Default pitch
120
- nsff0 = torch.zeros_like(pitch).to(input_tensor.device)
121
- sid = torch.LongTensor([0]).to(input_tensor.device) # Speaker ID
122
-
123
- # Call infer with all required arguments
124
- output = model.infer(
125
- phone=phone,
126
- phone_lengths=phone_lengths,
127
- pitch=pitch,
128
- nsff0=nsff0,
129
- sid=sid
130
- )
131
-
132
- if torch.cuda.is_available():
133
- output = output.cpu()
134
- output = output.numpy()
135
-
136
- logger.info(f"Processing complete, output shape: {output.shape}")
137
- return (None, None, (sr, output))
138
-
139
  except Exception as e:
140
  logger.error(f"Error processing audio: {str(e)}")
141
  logger.error(traceback.format_exc())
 
95
  if model is None:
96
  logger.error("No model provided for processing")
97
  return None
98
+
99
+ # Load and process audio
100
+ tgt_sr, net_g, vc, version, index_file, if_f0 = model_data(model_name)
101
+ if f0_method == "rmvpe":
102
+ vc.model_rmvpe = rmvpe_model
103
+
104
+ times = [0, 0, 0]
105
+ audio_opt = vc.pipeline(
106
+ hubert_model,
107
+ net_g,
108
+ 0, # sid
109
+ audio,
110
+ audio_file,
111
+ times,
112
+ f0_up_key=0,
113
+ f0_method="rmvpe",
114
+ index_file=index_file,
115
+ index_rate=index_rate,
116
+ if_f0=if_f0,
117
+ filter_radius=3,
118
+ tgt_sr=tgt_sr,
119
+ resample_sr=0,
120
+ rms_mix_rate=0.25,
121
+ version=version,
122
+ protect=0.33,
123
+ f0_file=None
124
+ )
125
+
126
+ info = f"Success. Time: npy: {times[0]}s, f0: {times[1]}s, infer: {times[2]}s"
127
+ logger.info(info)
128
+ return (info, None, (tgt_sr, audio_opt))
129
+
 
 
 
 
 
 
 
 
 
130
  except Exception as e:
131
  logger.error(f"Error processing audio: {str(e)}")
132
  logger.error(traceback.format_exc())