drewThomasson committed on
Commit
153c25e
·
verified ·
1 Parent(s): 4e4528b

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +139 -44
app.py CHANGED
@@ -1,6 +1,7 @@
1
  import gradio as gr
2
  from outetts.v0_1.interface import InterfaceHF
3
  import logging
 
4
 
5
  # Configure logging to display information in the terminal
6
  logging.basicConfig(level=logging.INFO)
@@ -15,28 +16,31 @@ except Exception as e:
15
  logger.error(f"Failed to load model: {e}")
16
  raise e
17
 
18
- def generate_tts(text, temperature, repetition_penalty, max_length):
19
  """
20
  Generates speech from the input text using the OuteTTS model.
21
-
22
  Parameters:
23
  text (str): The input text for TTS.
24
  temperature (float): Sampling temperature.
25
  repetition_penalty (float): Repetition penalty.
26
  max_length (int): Maximum length of the generated audio tokens.
27
-
 
28
  Returns:
29
  str: Path to the generated audio file.
30
  """
31
  logger.info("Received TTS generation request.")
32
- logger.info(f"Parameters - Text: {text}, Temperature: {temperature}, Repetition Penalty: {repetition_penalty}, Max Length: {max_length}")
33
 
34
  try:
 
35
  output = interface.generate(
36
  text=text,
37
  temperature=temperature,
38
  repetition_penalty=repetition_penalty,
39
- max_length=max_length # Corrected spelling from 'max_lenght' to 'max_length'
 
40
  )
41
  logger.info("TTS generation complete.")
42
 
@@ -50,13 +54,35 @@ def generate_tts(text, temperature, repetition_penalty, max_length):
50
  logger.error(f"Error during TTS generation: {e}")
51
  return None
52
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
53
  # Define the Gradio Blocks interface
54
  with gr.Blocks() as demo:
55
  gr.Markdown("# 🎀 OuteTTS - Text to Speech Interface")
56
  gr.Markdown(
57
  """
58
  Generate speech from text using the **OuteTTS-0.1-350M** model.
59
-
60
  **Key Features:**
61
  - Pure language modeling approach to TTS
62
  - Voice cloning capabilities
@@ -64,55 +90,124 @@ with gr.Blocks() as demo:
64
  """
65
  )
66
 
67
- with gr.Row():
68
- text_input = gr.Textbox(
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
69
  label="πŸ“„ Text Input",
70
- placeholder="Enter the text for TTS generation",
71
  lines=3
72
  )
73
-
74
- with gr.Row():
75
- temperature = gr.Slider(
76
- minimum=0.1,
77
- maximum=1.0,
78
- value=0.1,
79
- step=0.01,
80
- label="🌑️ Temperature"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
81
  )
82
- repetition_penalty = gr.Slider(
83
- minimum=0.5,
84
- maximum=2.0,
85
- value=1.1,
86
- step=0.1,
87
- label="πŸ” Repetition Penalty"
88
  )
89
- max_length = gr.Slider(
90
- minimum=256,
91
- maximum=4096,
92
- value=1024,
93
- step=256,
94
- label="πŸ“ Max Length"
95
  )
96
 
97
- generate_button = gr.Button("πŸ”Š Generate Speech")
98
-
99
- output_audio = gr.Audio(
100
- label="🎧 Generated Speech",
101
- type="filepath" # Expecting a file path to the audio
102
- )
103
-
104
- # Define the button click event
105
- generate_button.click(
106
- fn=generate_tts,
107
- inputs=[text_input, temperature, repetition_penalty, max_length],
108
- outputs=output_audio
109
- )
110
-
111
  gr.Markdown(
112
  """
113
  ---
114
  **Technical Blog:** [OuteTTS-0.1-350M](https://www.outeai.com/blog/OuteTTS-0.1-350M)
115
-
116
  **Credits:**
117
  - [WavTokenizer](https://github.com/jishengpeng/WavTokenizer)
118
  - [CTC Forced Alignment](https://pytorch.org/audio/stable/tutorials/ctc_forced_alignment_api_tutorial.html)
 
1
  import gradio as gr
2
  from outetts.v0_1.interface import InterfaceHF
3
  import logging
4
+ import os
5
 
6
  # Configure logging to display information in the terminal
7
  logging.basicConfig(level=logging.INFO)
 
16
  logger.error(f"Failed to load model: {e}")
17
  raise e
18
 
19
+ def generate_tts(text, temperature, repetition_penalty, max_length, speaker):
20
  """
21
  Generates speech from the input text using the OuteTTS model.
22
+
23
  Parameters:
24
  text (str): The input text for TTS.
25
  temperature (float): Sampling temperature.
26
  repetition_penalty (float): Repetition penalty.
27
  max_length (int): Maximum length of the generated audio tokens.
28
+ speaker (dict): Speaker configuration for voice cloning.
29
+
30
  Returns:
31
  str: Path to the generated audio file.
32
  """
33
  logger.info("Received TTS generation request.")
34
+ logger.info(f"Parameters - Text: {text}, Temperature: {temperature}, Repetition Penalty: {repetition_penalty}, Max Length: {max_length}, Speaker: {speaker is not None}")
35
 
36
  try:
37
+ # Due to a typo in interface.py, use 'max_lenght' instead of 'max_length'
38
  output = interface.generate(
39
  text=text,
40
  temperature=temperature,
41
  repetition_penalty=repetition_penalty,
42
+ max_lenght=max_length, # Pass the parameter with typo
43
+ speaker=speaker
44
  )
45
  logger.info("TTS generation complete.")
46
 
 
54
  logger.error(f"Error during TTS generation: {e}")
55
  return None
56
 
57
def create_speaker(audio_file, transcript):
    """
    Creates a custom speaker from a reference audio file and transcript.

    Parameters:
    audio_file (str or file-like): Reference audio, given either as a
        filesystem path string or as an object exposing its path through
        a ``name`` attribute.
    transcript (str): The transcript matching the audio.

    Returns:
    dict: Speaker configuration, or None when no reference audio was
        supplied or speaker creation failed.
    """
    logger.info("Received Voice Cloning request.")

    # Nothing was uploaded: report it and bail out instead of crashing
    # on attribute access below.
    if audio_file is None:
        logger.error("Error during speaker creation: no reference audio provided.")
        return None

    # The audio input is declared with type="filepath", which hands the
    # callback a plain string; a string has no ``.name``. Accept both a
    # path string and a file-like object carrying its path in ``.name``.
    if isinstance(audio_file, str):
        audio_path = audio_file
    else:
        audio_path = getattr(audio_file, "name", audio_file)

    logger.info(f"Reference Audio: {audio_path}, Transcript: {transcript}")

    try:
        speaker = interface.create_speaker(audio_path, transcript)
    except Exception as e:
        # Best-effort handler: log the failure and signal it with None so
        # the UI callback does not raise.
        logger.error(f"Error during speaker creation: {e}")
        return None

    logger.info("Speaker created successfully.")
    return speaker
78
+
79
  # Define the Gradio Blocks interface
80
  with gr.Blocks() as demo:
81
  gr.Markdown("# 🎀 OuteTTS - Text to Speech Interface")
82
  gr.Markdown(
83
  """
84
  Generate speech from text using the **OuteTTS-0.1-350M** model.
85
+
86
  **Key Features:**
87
  - Pure language modeling approach to TTS
88
  - Voice cloning capabilities
 
90
  """
91
  )
92
 
93
+ with gr.Tab("Basic TTS"):
94
+ with gr.Row():
95
+ text_input = gr.Textbox(
96
+ label="πŸ“„ Text Input",
97
+ placeholder="Enter the text for TTS generation",
98
+ lines=3
99
+ )
100
+
101
+ with gr.Row():
102
+ temperature = gr.Slider(
103
+ minimum=0.1,
104
+ maximum=1.0,
105
+ value=0.1,
106
+ step=0.01,
107
+ label="🌑️ Temperature"
108
+ )
109
+ repetition_penalty = gr.Slider(
110
+ minimum=0.5,
111
+ maximum=2.0,
112
+ value=1.1,
113
+ step=0.1,
114
+ label="πŸ” Repetition Penalty"
115
+ )
116
+ max_length = gr.Slider(
117
+ minimum=256,
118
+ maximum=4096,
119
+ value=1024,
120
+ step=256,
121
+ label="πŸ“ Max Length"
122
+ )
123
+
124
+ generate_button = gr.Button("πŸ”Š Generate Speech")
125
+
126
+ output_audio = gr.Audio(
127
+ label="🎧 Generated Speech",
128
+ type="filepath" # Expecting a file path to the audio
129
+ )
130
+
131
+ # Define the button click event for Basic TTS
132
+ generate_button.click(
133
+ fn=generate_tts,
134
+ inputs=[text_input, temperature, repetition_penalty, max_length, None],
135
+ outputs=output_audio
136
+ )
137
+
138
+ with gr.Tab("Voice Cloning"):
139
+ with gr.Row():
140
+ reference_audio = gr.Audio(
141
+ label="πŸ”Š Reference Audio",
142
+ type="filepath",
143
+ source="upload",
144
+ optional=False
145
+ )
146
+ reference_transcript = gr.Textbox(
147
+ label="πŸ“ Transcript",
148
+ placeholder="Enter the transcript matching the reference audio",
149
+ lines=2
150
+ )
151
+
152
+ create_speaker_button = gr.Button("🎀 Create Speaker")
153
+
154
+ speaker_info = gr.JSON(label="πŸ—‚οΈ Speaker Configuration")
155
+
156
+ generate_cloned_speech = gr.Textbox(
157
  label="πŸ“„ Text Input",
158
+ placeholder="Enter the text for TTS generation with cloned voice",
159
  lines=3
160
  )
161
+
162
+ with gr.Row():
163
+ temperature_clone = gr.Slider(
164
+ minimum=0.1,
165
+ maximum=1.0,
166
+ value=0.1,
167
+ step=0.01,
168
+ label="🌑️ Temperature"
169
+ )
170
+ repetition_penalty_clone = gr.Slider(
171
+ minimum=0.5,
172
+ maximum=2.0,
173
+ value=1.1,
174
+ step=0.1,
175
+ label="πŸ” Repetition Penalty"
176
+ )
177
+ max_length_clone = gr.Slider(
178
+ minimum=256,
179
+ maximum=4096,
180
+ value=1024,
181
+ step=256,
182
+ label="πŸ“ Max Length"
183
+ )
184
+
185
+ generate_cloned_button = gr.Button("πŸ”Š Generate Cloned Speech")
186
+
187
+ output_cloned_audio = gr.Audio(
188
+ label="🎧 Generated Cloned Speech",
189
+ type="filepath" # Expecting a file path to the audio
190
  )
191
+
192
+ # Define the button click event for creating a speaker
193
+ create_speaker_button.click(
194
+ fn=create_speaker,
195
+ inputs=[reference_audio, reference_transcript],
196
+ outputs=speaker_info
197
  )
198
+
199
+ # Define the button click event for generating speech with the cloned voice
200
+ generate_cloned_button.click(
201
+ fn=generate_tts,
202
+ inputs=[generate_cloned_speech, temperature_clone, repetition_penalty_clone, max_length_clone, speaker_info],
203
+ outputs=output_cloned_audio
204
  )
205
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
206
  gr.Markdown(
207
  """
208
  ---
209
  **Technical Blog:** [OuteTTS-0.1-350M](https://www.outeai.com/blog/OuteTTS-0.1-350M)
210
+
211
  **Credits:**
212
  - [WavTokenizer](https://github.com/jishengpeng/WavTokenizer)
213
  - [CTC Forced Alignment](https://pytorch.org/audio/stable/tutorials/ctc_forced_alignment_api_tutorial.html)