Chillarmo committed on
Commit
cc2340f
•
1 Parent(s): 71c72c2

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +86 -49
app.py CHANGED
@@ -11,15 +11,21 @@ def download_model():
11
  """Download the GGUF model from HuggingFace"""
12
  model_path = huggingface_hub.hf_hub_download(
13
  repo_id="OuteAI/OuteTTS-0.1-350M-GGUF",
14
- filename="OuteTTS-0.1-350M-Q6_K.gguf"
15
  )
16
  return model_path
17
 
18
  def initialize_models():
19
  """Initialize the OuteTTS and Faster-Whisper models"""
20
- # Download and initialize GGUF model
21
  model_path = download_model()
22
- tts_interface = InterfaceGGUF(model_path)
 
 
 
 
 
 
23
 
24
  # Initialize Whisper
25
  asr_model = WhisperModel("tiny",
@@ -30,24 +36,11 @@ def initialize_models():
30
  return tts_interface, asr_model
31
 
32
  # Initialize models globally to avoid reloading
33
- TTS_INTERFACE, ASR_MODEL = initialize_models()
34
-
35
- def transcribe_audio(audio_path):
36
- """Transcribe audio using Faster-Whisper tiny"""
37
- try:
38
- segments, _ = ASR_MODEL.transcribe(audio_path,
39
- beam_size=1,
40
- best_of=1,
41
- temperature=1.0,
42
- condition_on_previous_text=False,
43
- compression_ratio_threshold=2.4,
44
- log_prob_threshold=-1.0,
45
- no_speech_threshold=0.6)
46
-
47
- text = " ".join([segment.text for segment in segments]).strip()
48
- return text
49
- except Exception as e:
50
- return f"Error transcribing audio: {str(e)}"
51
 
52
  def process_audio_file(audio_path, reference_text, text_to_speak, temperature=0.1, repetition_penalty=1.1):
53
  """Process the audio file and generate speech with the cloned voice"""
@@ -60,28 +53,32 @@ def process_audio_file(audio_path, reference_text, text_to_speak, temperature=0.
60
  return None, reference_text
61
 
62
  gr.Info(f"Using reference text: {reference_text}")
 
 
 
 
63
 
64
  # Create speaker from reference audio
65
  speaker = TTS_INTERFACE.create_speaker(
66
  audio_path,
67
- reference_text[:4000] # Limit reference text length
68
  )
69
 
70
  # Generate speech with cloned voice
71
  output = TTS_INTERFACE.generate(
72
- text=text_to_speak[:500], # Limit output text length
73
  speaker=speaker,
74
  temperature=temperature,
75
  repetition_penalty=repetition_penalty,
76
- max_lenght=2048 # Reduced from 4096 to avoid errors
77
  )
78
 
79
  # Save to temporary file and return path
80
  temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".wav")
81
  output.save(temp_file.name)
82
  return temp_file.name, f"""Processing complete!
83
- Reference text: {reference_text[:500]}...
84
- (Showing first 500 characters of reference text)"""
85
 
86
  except Exception as e:
87
  return None, f"Error: {str(e)}"
@@ -90,40 +87,56 @@ Reference text: {reference_text[:500]}...
90
  with gr.Blocks(title="Voice Cloning with OuteTTS (GGUF)") as demo:
91
  gr.Markdown("# 🎙️ Voice Cloning with OuteTTS (GGUF)")
92
  gr.Markdown("""
93
- This app uses the GGUF version of OuteTTS for optimized CPU performance. Upload a reference audio file,
94
  provide the text being spoken in that audio (or leave blank for automatic transcription),
95
  and enter the new text you want to be spoken in the cloned voice.
96
 
97
  Note:
98
  - For best results, use clear audio with minimal background noise
99
- - Reference text is limited to 4000 characters
100
- - Output text is limited to 500 characters
 
101
  """)
102
 
103
  with gr.Row():
104
  with gr.Column():
105
  # Input components
106
- audio_input = gr.Audio(label="Upload Reference Audio", type="filepath")
 
 
 
 
107
  with gr.Row():
108
  transcribe_btn = gr.Button("📝 Transcribe Audio", variant="secondary")
109
 
110
  reference_text = gr.Textbox(
111
  label="Reference Text (what is being said in the audio, leave blank for auto-transcription)",
112
  placeholder="Click 'Transcribe Audio' or enter the exact text from the reference audio",
113
- lines=3
 
114
  )
115
  text_to_speak = gr.Textbox(
116
- label="Text to Speak (what you want the cloned voice to say, max 500 characters)",
117
- placeholder="Enter the text you want the cloned voice to speak",
118
  lines=3,
119
  max_lines=5
120
  )
121
 
122
  with gr.Row():
123
- temperature = gr.Slider(minimum=0.1, maximum=1.0, value=0.1, step=0.1,
124
- label="Temperature (higher = more variation)")
125
- repetition_penalty = gr.Slider(minimum=1.0, maximum=2.0, value=1.1, step=0.1,
126
- label="Repetition Penalty")
 
 
 
 
 
 
 
 
 
 
127
 
128
  # Submit button
129
  submit_btn = gr.Button("🎙️ Generate Voice", variant="primary")
@@ -132,15 +145,37 @@ with gr.Blocks(title="Voice Cloning with OuteTTS (GGUF)") as demo:
132
  # Output components
133
  output_audio = gr.Audio(label="Generated Speech")
134
  output_message = gr.Textbox(label="Status", lines=4)
 
 
 
 
 
135
 
136
  # Handle transcription button
137
- def transcribe_button(audio):
138
- if not audio:
139
- return "Please upload audio first."
140
- return transcribe_audio(audio)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
141
 
142
  transcribe_btn.click(
143
- fn=transcribe_button,
144
  inputs=[audio_input],
145
  outputs=[reference_text],
146
  )
@@ -154,13 +189,15 @@ with gr.Blocks(title="Voice Cloning with OuteTTS (GGUF)") as demo:
154
 
155
  gr.Markdown("""
156
  ### Tips for best results:
157
- 1. Use high-quality reference audio (clear speech, minimal background noise)
158
- 2. Try to keep reference audio under 30 seconds
159
- 3. If auto-transcription isn't accurate, you can manually correct the text
160
- 4. Keep generated text short for better quality
161
- 5. Adjust temperature and repetition penalty if needed:
162
- - Lower temperature (0.1-0.3) for more consistent output
163
- - Higher repetition penalty (1.1-1.3) to avoid repetition
 
 
164
  """)
165
 
166
  if __name__ == "__main__":
 
11
  """Download the GGUF model from HuggingFace"""
12
  model_path = huggingface_hub.hf_hub_download(
13
  repo_id="OuteAI/OuteTTS-0.1-350M-GGUF",
14
+ filename="outetts-0.1-350m.gguf"
15
  )
16
  return model_path
17
 
18
  def initialize_models():
19
  """Initialize the OuteTTS and Faster-Whisper models"""
20
+ # Download and initialize GGUF model with adjusted parameters
21
  model_path = download_model()
22
+ tts_interface = InterfaceGGUF(
23
+ model_path,
24
+ n_ctx=2048, # Reduced context size
25
+ n_batch=512, # Reduced batch size
26
+ n_threads=4, # Adjust based on CPU
27
+ verbose=False, # Reduce logging
28
+ )
29
 
30
  # Initialize Whisper
31
  asr_model = WhisperModel("tiny",
 
36
  return tts_interface, asr_model
37
 
38
  # Initialize models globally to avoid reloading
39
+ try:
40
+ TTS_INTERFACE, ASR_MODEL = initialize_models()
41
+ except Exception as e:
42
+ print(f"Error initializing models: {str(e)}")
43
+ raise
 
 
 
 
 
 
 
 
 
 
 
 
 
44
 
45
  def process_audio_file(audio_path, reference_text, text_to_speak, temperature=0.1, repetition_penalty=1.1):
46
  """Process the audio file and generate speech with the cloned voice"""
 
53
  return None, reference_text
54
 
55
  gr.Info(f"Using reference text: {reference_text}")
56
+
57
+ # Limit text lengths to prevent context overflow
58
+ reference_text = reference_text[:2000] # Further reduced
59
+ text_to_speak = text_to_speak[:300] # Further reduced
60
 
61
  # Create speaker from reference audio
62
  speaker = TTS_INTERFACE.create_speaker(
63
  audio_path,
64
+ reference_text,
65
  )
66
 
67
  # Generate speech with cloned voice
68
  output = TTS_INTERFACE.generate(
69
+ text=text_to_speak,
70
  speaker=speaker,
71
  temperature=temperature,
72
  repetition_penalty=repetition_penalty,
73
+ max_lenght=1024 # Reduced from 2048
74
  )
75
 
76
  # Save to temporary file and return path
77
  temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".wav")
78
  output.save(temp_file.name)
79
  return temp_file.name, f"""Processing complete!
80
+ Reference text: {reference_text[:300]}...
81
+ (Showing first 300 characters of reference text)"""
82
 
83
  except Exception as e:
84
  return None, f"Error: {str(e)}"
 
87
  with gr.Blocks(title="Voice Cloning with OuteTTS (GGUF)") as demo:
88
  gr.Markdown("# 🎙️ Voice Cloning with OuteTTS (GGUF)")
89
  gr.Markdown("""
90
+ This app uses the GGUF version of OuteTTS optimized for CPU performance. Upload a reference audio file,
91
  provide the text being spoken in that audio (or leave blank for automatic transcription),
92
  and enter the new text you want to be spoken in the cloned voice.
93
 
94
  Note:
95
  - For best results, use clear audio with minimal background noise
96
+ - Reference text is limited to 2000 characters
97
+ - Output text is limited to 300 characters
98
+ - Short inputs work best for quality results
99
  """)
100
 
101
  with gr.Row():
102
  with gr.Column():
103
  # Input components
104
+ audio_input = gr.Audio(
105
+ label="Upload Reference Audio",
106
+ type="filepath",
107
+ max_length=30 # Limit audio length to 30 seconds
108
+ )
109
  with gr.Row():
110
  transcribe_btn = gr.Button("📝 Transcribe Audio", variant="secondary")
111
 
112
  reference_text = gr.Textbox(
113
  label="Reference Text (what is being said in the audio, leave blank for auto-transcription)",
114
  placeholder="Click 'Transcribe Audio' or enter the exact text from the reference audio",
115
+ lines=3,
116
+ max_lines=5
117
  )
118
  text_to_speak = gr.Textbox(
119
+ label="Text to Speak (what you want the cloned voice to say, max 300 characters)",
120
+ placeholder="Enter the text you want the cloned voice to speak (keep it short for best results)",
121
  lines=3,
122
  max_lines=5
123
  )
124
 
125
  with gr.Row():
126
+ temperature = gr.Slider(
127
+ minimum=0.1,
128
+ maximum=0.5, # Reduced maximum temperature
129
+ value=0.1,
130
+ step=0.05,
131
+ label="Temperature (keep low for stability)"
132
+ )
133
+ repetition_penalty = gr.Slider(
134
+ minimum=1.0,
135
+ maximum=1.3, # Reduced maximum
136
+ value=1.1,
137
+ step=0.05,
138
+ label="Repetition Penalty"
139
+ )
140
 
141
  # Submit button
142
  submit_btn = gr.Button("🎙️ Generate Voice", variant="primary")
 
145
  # Output components
146
  output_audio = gr.Audio(label="Generated Speech")
147
  output_message = gr.Textbox(label="Status", lines=4)
148
+
149
+ # Add warning about processing time
150
+ gr.Markdown("""
151
+ ⚠️ Note: Initial processing may take a few moments. Please be patient.
152
+ """)
153
 
154
  # Handle transcription button
155
+ def transcribe_audio(audio_path):
156
+ """Transcribe audio using Faster-Whisper tiny"""
157
+ try:
158
+ if not audio_path:
159
+ return "Please upload audio first."
160
+
161
+ segments, _ = ASR_MODEL.transcribe(
162
+ audio_path,
163
+ beam_size=1,
164
+ best_of=1,
165
+ temperature=1.0,
166
+ condition_on_previous_text=False,
167
+ compression_ratio_threshold=2.4,
168
+ log_prob_threshold=-1.0,
169
+ no_speech_threshold=0.6
170
+ )
171
+
172
+ text = " ".join([segment.text for segment in segments]).strip()
173
+ return text[:2000] # Limit transcription length
174
+ except Exception as e:
175
+ return f"Error transcribing audio: {str(e)}"
176
 
177
  transcribe_btn.click(
178
+ fn=transcribe_audio,
179
  inputs=[audio_input],
180
  outputs=[reference_text],
181
  )
 
189
 
190
  gr.Markdown("""
191
  ### Tips for best results:
192
+ 1. Use clear, short audio samples (5-15 seconds is ideal)
193
+ 2. Keep both reference and output text concise
194
+ 3. Use lower temperature (0.1-0.2) for more stable output
195
+ 4. Start with short phrases to test the voice
196
+ 5. If generation fails, try:
197
+ - Using shorter text
198
+ - Reducing temperature
199
+ - Using clearer audio
200
+ - Simplifying the text
201
  """)
202
 
203
  if __name__ == "__main__":