Chillarmo committed on
Commit
30aecac
1 Parent(s): cee7312

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +152 -32
app.py CHANGED
@@ -1,29 +1,111 @@
1
  import gradio as gr
2
  import torch
 
 
3
  from outetts.v0_1.interface import InterfaceHF
4
  import soundfile as sf
5
  import tempfile
6
- import os
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7
 
8
- def initialize_model():
9
- """Initialize the OuteTTS model"""
10
- interface = InterfaceHF("OuteAI/OuteTTS-0.1-350M")
11
- return interface
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
12
 
13
  def process_audio_file(audio_path, reference_text, text_to_speak, temperature=0.1, repetition_penalty=1.1):
14
  """Process the audio file and generate speech with the cloned voice"""
15
  try:
16
- # Initialize model
17
- interface = initialize_model()
 
 
 
 
 
 
18
 
19
  # Create speaker from reference audio
20
- speaker = interface.create_speaker(
21
- audio_path,
22
  reference_text
23
  )
24
 
25
  # Generate speech with cloned voice
26
- output = interface.generate(
27
  text=text_to_speak,
28
  speaker=speaker,
29
  temperature=temperature,
@@ -31,19 +113,37 @@ def process_audio_file(audio_path, reference_text, text_to_speak, temperature=0.
31
  max_lenght=4096
32
  )
33
 
34
- # Save to temporary file and return path
 
 
 
 
 
 
 
35
  temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".wav")
36
  output.save(temp_file.name)
37
- return temp_file.name, "Voice cloning successful!"
38
 
39
  except Exception as e:
 
 
 
 
 
40
  return None, f"Error: {str(e)}"
41
 
 
 
 
 
 
42
  # Create Gradio interface
43
  with gr.Blocks(title="Voice Cloning with OuteTTS") as demo:
44
- gr.Markdown("# 🎙️ Voice Cloning with OuteTTS")
45
  gr.Markdown("""
46
- This app uses OuteTTS to clone voices. Upload a reference audio file, provide the text being spoken in that audio,
 
47
  and enter the new text you want to be spoken in the cloned voice.
48
 
49
  Note: For best results, use clear audio with minimal background noise.
@@ -51,26 +151,41 @@ with gr.Blocks(title="Voice Cloning with OuteTTS") as demo:
51
 
52
  with gr.Row():
53
  with gr.Column():
54
- # Input components
55
- audio_input = gr.Audio(label="Upload Reference Audio", type="filepath")
56
- reference_text = gr.Textbox(label="Reference Text (what is being said in the audio)")
57
- text_to_speak = gr.Textbox(label="Text to Speak (what you want the cloned voice to say)")
 
 
 
 
 
 
 
 
58
 
59
  with gr.Row():
60
- temperature = gr.Slider(minimum=0.1, maximum=1.0, value=0.1, step=0.1,
61
- label="Temperature (higher = more variation)")
62
- repetition_penalty = gr.Slider(minimum=1.0, maximum=2.0, value=1.1, step=0.1,
63
- label="Repetition Penalty")
 
 
 
 
 
 
 
 
 
 
64
 
65
- # Submit button
66
  submit_btn = gr.Button("Generate Voice", variant="primary")
67
 
68
  with gr.Column():
69
- # Output components
70
  output_audio = gr.Audio(label="Generated Speech")
71
- output_message = gr.Textbox(label="Status")
72
 
73
- # Handle submission
74
  submit_btn.click(
75
  fn=process_audio_file,
76
  inputs=[audio_input, reference_text, text_to_speak, temperature, repetition_penalty],
@@ -78,13 +193,18 @@ with gr.Blocks(title="Voice Cloning with OuteTTS") as demo:
78
  )
79
 
80
  gr.Markdown("""
 
 
 
 
 
 
81
  ### Tips for best results:
82
- 1. Use high-quality reference audio (clear speech, minimal background noise)
83
- 2. Ensure reference text matches the audio exactly
84
- 3. Keep generated text relatively short for better quality
85
- 4. Adjust temperature and repetition penalty if needed:
86
- - Lower temperature (0.1-0.3) for more consistent output
87
- - Higher repetition penalty (1.1-1.3) to avoid repetition
88
  """)
89
 
90
  if __name__ == "__main__":
 
1
  import gradio as gr
2
  import torch
3
+ import torch.nn as nn
4
+ import os
5
  from outetts.v0_1.interface import InterfaceHF
6
  import soundfile as sf
7
  import tempfile
8
+ from faster_whisper import WhisperModel
9
+ from pathlib import Path
10
+
11
+ # Configure PyTorch for CPU efficiency
12
+ torch.set_num_threads(4) # Limit CPU threads
13
+ torch.set_grad_enabled(False) # Disable gradient computation
14
+
15
class OptimizedTTSInterface:
    """CPU-oriented wrapper around an OuteTTS ``InterfaceHF`` instance.

    On construction the wrapped model is moved to the CPU, switched to eval
    mode and then dynamically quantized (``nn.Linear`` layers to INT8).
    ``create_speaker`` and ``generate`` forward their arguments unchanged to
    the wrapped interface while ``torch.inference_mode()`` is active.
    """

    def __init__(self, model_name="OuteAI/OuteTTS-0.1-350M"):
        """Load ``model_name`` and swap in a quantized copy of its model.

        Args:
            model_name: Model identifier passed straight to ``InterfaceHF``.
        """
        self.interface = InterfaceHF(model_name)
        # Dynamic quantization targets CPU execution, so the float model is
        # placed on the CPU and put in eval mode *before* it is converted
        # (previously it was converted first and moved afterwards).
        float_model = self.interface.model.cpu().eval()
        self.interface.model = torch.quantization.quantize_dynamic(
            float_model, {nn.Linear}, dtype=torch.qint8
        )
        # NOTE(review): if InterfaceHF keeps its own device setting for input
        # tensors, it may also need to point at the CPU - confirm upstream.

    def create_speaker(self, *args, **kwargs):
        """Forward to ``interface.create_speaker`` under inference mode."""
        with torch.inference_mode():
            return self.interface.create_speaker(*args, **kwargs)

    def generate(self, *args, **kwargs):
        """Forward to ``interface.generate`` under inference mode."""
        with torch.inference_mode():
            return self.interface.generate(*args, **kwargs)
33
 
34
def initialize_models():
    """Build the TTS wrapper and the Faster-Whisper "tiny" ASR model.

    Creates the ``model_cache`` directory if it does not exist and hands it
    to ``WhisperModel`` as its ``download_root``.

    Returns:
        A ``(tts_interface, asr_model)`` tuple.
    """
    whisper_cache = Path("model_cache")
    whisper_cache.mkdir(exist_ok=True)

    tts = OptimizedTTSInterface()

    # CPU-only, INT8 Whisper configuration with a small thread budget.
    whisper_options = {
        "device": "cpu",
        "compute_type": "int8",
        "num_workers": 1,
        "cpu_threads": 2,
        "download_root": str(whisper_cache),
    }
    asr = WhisperModel("tiny", **whisper_options)

    return tts, asr
50
+
51
def transcribe_audio(audio_path):
    """Transcribe ``audio_path`` with the module-level ``ASR_MODEL``.

    Args:
        audio_path: Path of the audio file to transcribe.

    Returns:
        The transcript with surrounding whitespace stripped. On any failure,
        or when no text is produced, a message starting with
        "Error transcribing audio:" is returned instead of raising.
    """
    try:
        segments, _ = ASR_MODEL.transcribe(
            audio_path,
            beam_size=1,
            best_of=1,
            # Greedy decoding: a temperature of 1.0 samples tokens randomly,
            # which makes the reference transcript noisy and unrepeatable.
            temperature=0.0,
            condition_on_previous_text=False,
            compression_ratio_threshold=2.4,
            log_prob_threshold=-1.0,
            no_speech_threshold=0.6,
        )
        # The segments are consumed here, inside the try, so that errors
        # raised while iterating them are reported the same way.
        text = " ".join(segment.text for segment in segments).strip()
    except Exception as e:
        return f"Error transcribing audio: {str(e)}"

    if not text:
        # An empty transcript is useless as reference text; report it through
        # the same error-string convention instead of returning "".
        return "Error transcribing audio: no speech detected"
    return text
67
+
68
def preprocess_audio(audio_path):
    """Write a mono, 16 kHz copy of ``audio_path`` to a temporary WAV file.

    Args:
        audio_path: Path of the input audio file.

    Returns:
        Path of the newly written temporary ``.wav`` file, or ``audio_path``
        unchanged if any preprocessing step fails (best-effort fallback).
    """
    temp_path = None
    try:
        data, sr = sf.read(audio_path)

        # Mix down to mono first: multi-channel data is (frames, channels),
        # and resampling it directly would run along the channel axis.
        if data.ndim > 1:
            data = data.mean(axis=1)

        # Resample to 16 kHz if needed
        if sr != 16000:
            import resampy
            data = resampy.resample(data, sr, 16000)
            sr = 16000

        # Reserve a temp file name and close the handle before writing so no
        # descriptor is leaked and the path can be reopened by soundfile.
        with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as temp_file:
            temp_path = temp_file.name
        sf.write(temp_path, data, sr)
        return temp_path
    except Exception:
        # Deliberate best-effort: fall back to the original file, but do not
        # leave a half-written temp file behind.
        if temp_path is not None:
            try:
                os.unlink(temp_path)
            except OSError:
                pass
        return audio_path  # Return original if preprocessing fails
88
 
89
  def process_audio_file(audio_path, reference_text, text_to_speak, temperature=0.1, repetition_penalty=1.1):
90
  """Process the audio file and generate speech with the cloned voice"""
91
  try:
92
+ # Preprocess audio
93
+ processed_audio = preprocess_audio(audio_path)
94
+
95
+ # If no reference text provided, transcribe the audio
96
+ if not reference_text.strip():
97
+ reference_text = transcribe_audio(processed_audio)
98
+ if reference_text.startswith("Error"):
99
+ return None, reference_text
100
 
101
  # Create speaker from reference audio
102
+ speaker = TTS_INTERFACE.create_speaker(
103
+ processed_audio,
104
  reference_text
105
  )
106
 
107
  # Generate speech with cloned voice
108
+ output = TTS_INTERFACE.generate(
109
  text=text_to_speak,
110
  speaker=speaker,
111
  temperature=temperature,
 
113
  max_lenght=4096
114
  )
115
 
116
+ # Clean up preprocessed audio if it was created
117
+ if processed_audio != audio_path:
118
+ try:
119
+ os.unlink(processed_audio)
120
+ except:
121
+ pass
122
+
123
+ # Save output to temporary file
124
  temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".wav")
125
  output.save(temp_file.name)
126
+ return temp_file.name, f"Voice cloning successful!\nReference text used: {reference_text}"
127
 
128
  except Exception as e:
129
+ if processed_audio != audio_path:
130
+ try:
131
+ os.unlink(processed_audio)
132
+ except:
133
+ pass
134
  return None, f"Error: {str(e)}"
135
 
136
+ print("Initializing models...")
137
+ # Initialize models globally
138
+ TTS_INTERFACE, ASR_MODEL = initialize_models()
139
+ print("Models initialized!")
140
+
141
  # Create Gradio interface
142
  with gr.Blocks(title="Voice Cloning with OuteTTS") as demo:
143
+ gr.Markdown("# 🎙️ Optimized Voice Cloning with OuteTTS")
144
  gr.Markdown("""
145
+ This app uses optimized versions of OuteTTS and Whisper for efficient voice cloning on CPU.
146
+ Upload a reference audio file, provide the text being spoken in that audio (or leave blank for automatic transcription),
147
  and enter the new text you want to be spoken in the cloned voice.
148
 
149
  Note: For best results, use clear audio with minimal background noise.
 
151
 
152
  with gr.Row():
153
  with gr.Column():
154
+ audio_input = gr.Audio(
155
+ label="Upload Reference Audio",
156
+ type="filepath"
157
+ )
158
+ reference_text = gr.Textbox(
159
+ label="Reference Text (leave blank for auto-transcription)",
160
+ placeholder="Leave empty to auto-transcribe or enter the exact text from the reference audio"
161
+ )
162
+ text_to_speak = gr.Textbox(
163
+ label="Text to Speak",
164
+ placeholder="Enter the text you want the cloned voice to speak"
165
+ )
166
 
167
  with gr.Row():
168
+ temperature = gr.Slider(
169
+ minimum=0.1,
170
+ maximum=1.0,
171
+ value=0.1,
172
+ step=0.1,
173
+ label="Temperature"
174
+ )
175
+ repetition_penalty = gr.Slider(
176
+ minimum=1.0,
177
+ maximum=2.0,
178
+ value=1.1,
179
+ step=0.1,
180
+ label="Repetition Penalty"
181
+ )
182
 
 
183
  submit_btn = gr.Button("Generate Voice", variant="primary")
184
 
185
  with gr.Column():
 
186
  output_audio = gr.Audio(label="Generated Speech")
187
+ output_message = gr.Textbox(label="Status", max_lines=3)
188
 
 
189
  submit_btn.click(
190
  fn=process_audio_file,
191
  inputs=[audio_input, reference_text, text_to_speak, temperature, repetition_penalty],
 
193
  )
194
 
195
  gr.Markdown("""
196
+ ### Optimization Notes:
197
+ - Using INT8 quantization for efficient CPU usage
198
+ - Optimized audio preprocessing
199
+ - Cached model loading
200
+ - Memory-efficient inference
201
+
202
  ### Tips for best results:
203
+ 1. Use clear, high-quality reference audio
204
+ 2. Keep reference audio short (5-10 seconds)
205
+ 3. Verify auto-transcription accuracy
206
+ 4. For best quality, manually input exact reference text
207
+ 5. Keep generated text concise
 
208
  """)
209
 
210
  if __name__ == "__main__":