Chillarmo committed
Commit 5dbc09c
1 Parent(s): 776e91e

Update app.py

Files changed (1)
  app.py  +56 -144
app.py CHANGED
@@ -1,198 +1,114 @@
import gradio as gr
import torch
-import os
from outetts.v0_1.interface import InterfaceHF
import soundfile as sf
import tempfile
+import os
from faster_whisper import WhisperModel
-from pathlib import Path
-
-# Configure PyTorch for CPU efficiency
-torch.set_num_threads(4)  # Limit CPU threads
-torch.set_grad_enabled(False)  # Disable gradient computation
-
-class OptimizedTTSInterface:
-    def __init__(self, model_name="OuteAI/OuteTTS-0.1-350M"):
-        self.interface = InterfaceHF(model_name)
-        # Apply FP16 optimization where possible
-        self.interface.model = self.interface.model.half().float()
-        # Cache commonly used attributes
-        self.tokenizer = self.interface.model.tokenizer
-
-    def create_speaker(self, *args, **kwargs):
-        with torch.inference_mode():
-            return self.interface.create_speaker(*args, **kwargs)
-
-    def generate(self, *args, **kwargs):
-        with torch.inference_mode():
-            return self.interface.generate(*args, **kwargs)

def initialize_models():
-    """Initialize the OptimizedTTS and Faster-Whisper models"""
-    # Create cache directory for models
-    cache_dir = Path("model_cache")
-    cache_dir.mkdir(exist_ok=True)
-
-    # Set environment variables for better performance
-    os.environ['OMP_NUM_THREADS'] = '4'
-    os.environ['MKL_NUM_THREADS'] = '4'
-
-    print("Loading ASR model...")
+    """Initialize the OuteTTS and Faster-Whisper models"""
+    tts_interface = InterfaceHF("OuteAI/OuteTTS-0.1-350M")
+    # Use tiny model with lowest compute settings for maximum speed
    asr_model = WhisperModel("tiny",
                             device="cpu",
-                            compute_type="int8",
-                            num_workers=1,
-                            cpu_threads=2,
-                            download_root=str(cache_dir))
-
-    print("Loading TTS model...")
-    tts_interface = OptimizedTTSInterface()
-
+                            compute_type="int8",  # Use int8 quantization for efficiency
+                            num_workers=1,        # Limit workers for low-resource environment
+                            cpu_threads=1)        # Limit CPU threads
    return tts_interface, asr_model

+# Initialize models globally to avoid reloading
+TTS_INTERFACE, ASR_MODEL = initialize_models()
+
def transcribe_audio(audio_path):
    """Transcribe audio using Faster-Whisper tiny"""
    try:
-        segments, _ = ASR_MODEL.transcribe(audio_path,
-                                           beam_size=1,
-                                           best_of=1,
-                                           temperature=1.0,
-                                           condition_on_previous_text=False,
-                                           compression_ratio_threshold=2.4,
-                                           log_prob_threshold=-1.0,
-                                           no_speech_threshold=0.6)
+        # Transcribe with minimal settings for speed
+        segments, _ = ASR_MODEL.transcribe(audio_path,
+                                           beam_size=1,                       # Reduce beam size
+                                           best_of=1,                         # Don't generate alternatives
+                                           temperature=1.0,                   # No temperature sampling
+                                           condition_on_previous_text=False,  # Don't condition on previous
+                                           compression_ratio_threshold=2.4,   # Less strict threshold
+                                           log_prob_threshold=-1.0,           # Less strict threshold
+                                           no_speech_threshold=0.6)           # Less strict threshold

+        # Combine all segments
        text = " ".join([segment.text for segment in segments]).strip()
        return text
    except Exception as e:
        return f"Error transcribing audio: {str(e)}"

-def preprocess_audio(audio_path):
-    """Preprocess audio to reduce memory usage"""
-    try:
-        # Load and resample audio to 16kHz if needed
-        data, sr = sf.read(audio_path)
-        if sr != 16000:
-            import resampy
-            data = resampy.resample(data, sr, 16000)
-            sr = 16000
-
-        # Convert to mono if stereo
-        if len(data.shape) > 1:
-            data = data.mean(axis=1)
-
-        # Normalize audio
-        data = data / max(abs(data.max()), abs(data.min()))
-
-        # Save preprocessed audio
-        temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".wav")
-        sf.write(temp_file.name, data, sr)
-        return temp_file.name
-    except Exception as e:
-        return audio_path  # Return original if preprocessing fails
-
def process_audio_file(audio_path, reference_text, text_to_speak, temperature=0.1, repetition_penalty=1.1):
    """Process the audio file and generate speech with the cloned voice"""
    try:
-        # Preprocess audio
-        processed_audio = preprocess_audio(audio_path)
-
        # If no reference text provided, transcribe the audio
        if not reference_text.strip():
-            reference_text = transcribe_audio(processed_audio)
+            reference_text = transcribe_audio(audio_path)
            if reference_text.startswith("Error"):
                return None, reference_text
-
+
        # Create speaker from reference audio
-        with torch.inference_mode():
-            speaker = TTS_INTERFACE.create_speaker(
-                processed_audio,
-                reference_text
-            )
-
-        # Generate speech with cloned voice
-        output = TTS_INTERFACE.generate(
-            text=text_to_speak,
-            speaker=speaker,
-            temperature=temperature,
-            repetition_penalty=repetition_penalty,
-            max_lenght=4096
-        )
+        speaker = TTS_INTERFACE.create_speaker(
+            audio_path,
+            reference_text
+        )

-        # Clean up preprocessed audio if it was created
-        if processed_audio != audio_path:
-            try:
-                os.unlink(processed_audio)
-            except:
-                pass
+        # Generate speech with cloned voice
+        output = TTS_INTERFACE.generate(
+            text=text_to_speak,
+            speaker=speaker,
+            temperature=temperature,
+            repetition_penalty=repetition_penalty,
+            max_lenght=4096
+        )

-        # Save output to temporary file
+        # Save to temporary file and return path
        temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".wav")
        output.save(temp_file.name)
        return temp_file.name, f"Voice cloning successful!\nReference text used: {reference_text}"

    except Exception as e:
-        if processed_audio != audio_path:
-            try:
-                os.unlink(processed_audio)
-            except:
-                pass
        return None, f"Error: {str(e)}"

-print("Starting initialization...")
-# Initialize models globally
-TTS_INTERFACE, ASR_MODEL = initialize_models()
-print("Models initialized successfully!")
-
# Create Gradio interface
with gr.Blocks(title="Voice Cloning with OuteTTS") as demo:
-    gr.Markdown("# 🎙️ Optimized Voice Cloning with OuteTTS")
+    gr.Markdown("# 🎙️ Voice Cloning with OuteTTS")
    gr.Markdown("""
-    This app uses optimized versions of OuteTTS and Whisper for efficient voice cloning on CPU.
-    Upload a reference audio file, provide the text being spoken in that audio (or leave blank for automatic transcription),
+    This app uses OuteTTS to clone voices. Upload a reference audio file, provide the text being spoken in that audio (or leave blank for automatic transcription),
    and enter the new text you want to be spoken in the cloned voice.

-    Note: First run may take longer while models are being cached.
+    Note: For best results, use clear audio with minimal background noise.
    """)

    with gr.Row():
        with gr.Column():
-            audio_input = gr.Audio(
-                label="Upload Reference Audio",
-                type="filepath",
-                source="microphone"
-            )
+            # Input components
+            audio_input = gr.Audio(label="Upload Reference Audio", type="filepath")
            reference_text = gr.Textbox(
-                label="Reference Text (leave blank for auto-transcription)",
+                label="Reference Text (what is being said in the audio, leave blank for auto-transcription)",
                placeholder="Leave empty to auto-transcribe or enter the exact text from the reference audio"
            )
            text_to_speak = gr.Textbox(
-                label="Text to Speak",
+                label="Text to Speak (what you want the cloned voice to say)",
                placeholder="Enter the text you want the cloned voice to speak"
            )

            with gr.Row():
-                temperature = gr.Slider(
-                    minimum=0.1,
-                    maximum=1.0,
-                    value=0.1,
-                    step=0.1,
-                    label="Temperature"
-                )
-                repetition_penalty = gr.Slider(
-                    minimum=1.0,
-                    maximum=2.0,
-                    value=1.1,
-                    step=0.1,
-                    label="Repetition Penalty"
-                )
+                temperature = gr.Slider(minimum=0.1, maximum=1.0, value=0.1, step=0.1,
+                                        label="Temperature (higher = more variation)")
+                repetition_penalty = gr.Slider(minimum=1.0, maximum=2.0, value=1.1, step=0.1,
+                                               label="Repetition Penalty")

+            # Submit button
            submit_btn = gr.Button("Generate Voice", variant="primary")

        with gr.Column():
+            # Output components
            output_audio = gr.Audio(label="Generated Speech")
            output_message = gr.Textbox(label="Status", max_lines=3)

+    # Handle submission
    submit_btn.click(
        fn=process_audio_file,
        inputs=[audio_input, reference_text, text_to_speak, temperature, repetition_penalty],
@@ -200,18 +116,14 @@ with gr.Blocks(title="Voice Cloning with OuteTTS") as demo:
    )

    gr.Markdown("""
-    ### Optimization Notes:
-    - Optimized for CPU performance
-    - Model caching enabled
-    - Memory-efficient inference
-    - Automatic audio preprocessing
-
    ### Tips for best results:
-    1. Use clear, high-quality reference audio
-    2. Keep reference audio short (5-10 seconds)
-    3. Verify auto-transcription accuracy
-    4. For best quality, manually input exact reference text
-    5. Keep generated text concise
+    1. Use high-quality reference audio (clear speech, minimal background noise)
+    2. If providing reference text manually, ensure it matches the audio exactly
+    3. If using auto-transcription, verify the transcribed text in the status message
+    4. Keep generated text relatively short for better quality
+    5. Adjust temperature and repetition penalty if needed:
+       - Lower temperature (0.1-0.3) for more consistent output
+       - Higher repetition penalty (1.1-1.3) to avoid repetition
    """)

if __name__ == "__main__":
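
For orientation (not part of the commit), the simplified pipeline in this version of app.py can also be driven directly from Python. The snippet below is a rough sketch only: the audio path and texts are placeholders, and importing app loads both models up front.

# Rough usage sketch -- not part of this commit. Importing app runs
# initialize_models(), so the OuteTTS and Faster-Whisper models load here.
from app import process_audio_file

wav_path, status = process_audio_file(
    audio_path="reference.wav",        # placeholder: short, clean reference clip
    reference_text="",                 # empty string triggers auto-transcription
    text_to_speak="Hello from the cloned voice.",
    temperature=0.1,
    repetition_penalty=1.1,
)
print(status)    # success message with the reference text used, or an error string
print(wav_path)  # path to the generated .wav temp file (None if generation failed)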