drewThomasson committed on
Commit
a2dc963
•
1 Parent(s): 153c25e

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +67 -23
app.py CHANGED
@@ -2,6 +2,10 @@ import gradio as gr
2
  from outetts.v0_1.interface import InterfaceHF
3
  import logging
4
  import os
 
 
 
 
5
 
6
  # Configure logging to display information in the terminal
7
  logging.basicConfig(level=logging.INFO)
@@ -16,6 +20,15 @@ except Exception as e:
16
  logger.error(f"Failed to load model: {e}")
17
  raise e
18
 
 
 
 
 
 
 
 
 
 
19
  def generate_tts(text, temperature, repetition_penalty, max_length, speaker):
20
  """
21
  Generates speech from the input text using the OuteTTS model.
@@ -45,7 +58,7 @@ def generate_tts(text, temperature, repetition_penalty, max_length, speaker):
45
  logger.info("TTS generation complete.")
46
 
47
  # Save the output to a temporary WAV file
48
- output_path = "output.wav"
49
  output.save(output_path)
50
  logger.info(f"Audio saved to {output_path}")
51
 
@@ -54,23 +67,57 @@ def generate_tts(text, temperature, repetition_penalty, max_length, speaker):
54
  logger.error(f"Error during TTS generation: {e}")
55
  return None
56
 
57
- def create_speaker(audio_file, transcript):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
58
  """
59
- Creates a custom speaker from a reference audio file and transcript.
60
 
61
  Parameters:
62
- audio_file (file): Path to the reference audio file.
63
- transcript (str): The transcript matching the audio.
64
 
65
  Returns:
66
  dict: Speaker configuration.
67
  """
68
- logger.info("Received Voice Cloning request.")
69
- logger.info(f"Reference Audio: {audio_file.name}, Transcript: {transcript}")
70
 
71
  try:
72
- speaker = interface.create_speaker(audio_file.name, transcript)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
73
  logger.info("Speaker created successfully.")
 
 
 
 
 
74
  return speaker
75
  except Exception as e:
76
  logger.error(f"Error during speaker creation: {e}")
@@ -85,7 +132,7 @@ with gr.Blocks() as demo:
85
 
86
  **Key Features:**
87
  - Pure language modeling approach to TTS
88
- - Voice cloning capabilities
89
  - Compatible with LLaMa architecture
90
  """
91
  )
@@ -139,25 +186,21 @@ with gr.Blocks() as demo:
139
  with gr.Row():
140
  reference_audio = gr.Audio(
141
  label="🔊 Reference Audio",
142
- type="filepath",
143
  source="upload",
144
  optional=False
145
  )
146
- reference_transcript = gr.Textbox(
147
- label="📝 Transcript",
148
- placeholder="Enter the transcript matching the reference audio",
149
- lines=2
150
- )
151
 
152
  create_speaker_button = gr.Button("🎀 Create Speaker")
153
 
154
- speaker_info = gr.JSON(label="🗂️ Speaker Configuration")
155
 
156
- generate_cloned_speech = gr.Textbox(
157
- label="📄 Text Input",
158
- placeholder="Enter the text for TTS generation with cloned voice",
159
- lines=3
160
- )
 
161
 
162
  with gr.Row():
163
  temperature_clone = gr.Slider(
@@ -191,8 +234,8 @@ with gr.Blocks() as demo:
191
 
192
  # Define the button click event for creating a speaker
193
  create_speaker_button.click(
194
- fn=create_speaker,
195
- inputs=[reference_audio, reference_transcript],
196
  outputs=speaker_info
197
  )
198
 
@@ -211,6 +254,7 @@ with gr.Blocks() as demo:
211
  **Credits:**
212
  - [WavTokenizer](https://github.com/jishengpeng/WavTokenizer)
213
  - [CTC Forced Alignment](https://pytorch.org/audio/stable/tutorials/ctc_forced_alignment_api_tutorial.html)
 
214
  """
215
  )
216
 
 
2
  from outetts.v0_1.interface import InterfaceHF
3
  import logging
4
  import os
5
+ import tempfile
6
+
7
+ # Import faster-whisper for transcription
8
+ from faster_whisper import WhisperModel
9
 
10
  # Configure logging to display information in the terminal
11
  logging.basicConfig(level=logging.INFO)
 
20
  logger.error(f"Failed to load model: {e}")
21
  raise e
22
 
23
+ # Initialize the faster-whisper model
24
+ try:
25
+ logger.info("Initializing faster-whisper model for transcription.")
26
+ whisper_model = WhisperModel("tiny", device="cpu", compute_type="int8")
27
+ logger.info("faster-whisper model loaded successfully.")
28
+ except Exception as e:
29
+ logger.error(f"Failed to load faster-whisper model: {e}")
30
+ raise e
31
+
32
  def generate_tts(text, temperature, repetition_penalty, max_length, speaker):
33
  """
34
  Generates speech from the input text using the OuteTTS model.
 
58
  logger.info("TTS generation complete.")
59
 
60
  # Save the output to a temporary WAV file
61
+ output_path = os.path.join(tempfile.gettempdir(), "output.wav")
62
  output.save(output_path)
63
  logger.info(f"Audio saved to {output_path}")
64
 
 
67
  logger.error(f"Error during TTS generation: {e}")
68
  return None
69
 
70
+ def transcribe_audio(audio_path):
71
+ """
72
+ Transcribes the given audio file using faster-whisper.
73
+
74
+ Parameters:
75
+ audio_path (str): Path to the audio file.
76
+
77
+ Returns:
78
+ str: Transcribed text.
79
+ """
80
+ logger.info(f"Transcribing audio file: {audio_path}")
81
+ segments, info = whisper_model.transcribe(audio_path)
82
+ transcript = " ".join([segment.text for segment in segments])
83
+ logger.info(f"Transcription complete: {transcript}")
84
+ return transcript
85
+
86
+ def create_speaker_with_transcription(audio_file):
87
  """
88
+ Creates a custom speaker from a reference audio file by automatically transcribing it.
89
 
90
  Parameters:
91
+ audio_file (file): Uploaded reference audio file.
 
92
 
93
  Returns:
94
  dict: Speaker configuration.
95
  """
96
+ logger.info("Received Voice Cloning request with audio file.")
 
97
 
98
  try:
99
+ with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_audio:
100
+ temp_audio_path = temp_audio.name
101
+ # Save uploaded audio to temporary file
102
+ with open(temp_audio_path, "wb") as f:
103
+ f.write(audio_file.read())
104
+ logger.info(f"Reference audio saved to {temp_audio_path}")
105
+
106
+ # Transcribe the audio file
107
+ transcript = transcribe_audio(temp_audio_path)
108
+
109
+ if not transcript.strip():
110
+ logger.error("Transcription resulted in empty text.")
111
+ return None
112
+
113
+ # Create speaker using the transcribed text
114
+ speaker = interface.create_speaker(temp_audio_path, transcript)
115
  logger.info("Speaker created successfully.")
116
+
117
+ # Clean up the temporary audio file
118
+ os.remove(temp_audio_path)
119
+ logger.info(f"Temporary audio file {temp_audio_path} removed.")
120
+
121
  return speaker
122
  except Exception as e:
123
  logger.error(f"Error during speaker creation: {e}")
 
132
 
133
  **Key Features:**
134
  - Pure language modeling approach to TTS
135
+ - Voice cloning capabilities with automatic transcription
136
  - Compatible with LLaMa architecture
137
  """
138
  )
 
186
  with gr.Row():
187
  reference_audio = gr.Audio(
188
  label="🔊 Reference Audio",
189
+ type="file",
190
  source="upload",
191
  optional=False
192
  )
 
 
 
 
 
193
 
194
  create_speaker_button = gr.Button("🎀 Create Speaker")
195
 
196
+ speaker_info = gr.JSON(label="🗂️ Speaker Configuration", interactive=False)
197
 
198
+ with gr.Row():
199
+ generate_cloned_speech = gr.Textbox(
200
+ label="📄 Text Input",
201
+ placeholder="Enter the text for TTS generation with cloned voice",
202
+ lines=3
203
+ )
204
 
205
  with gr.Row():
206
  temperature_clone = gr.Slider(
 
234
 
235
  # Define the button click event for creating a speaker
236
  create_speaker_button.click(
237
+ fn=create_speaker_with_transcription,
238
+ inputs=[reference_audio],
239
  outputs=speaker_info
240
  )
241
 
 
254
  **Credits:**
255
  - [WavTokenizer](https://github.com/jishengpeng/WavTokenizer)
256
  - [CTC Forced Alignment](https://pytorch.org/audio/stable/tutorials/ctc_forced_alignment_api_tutorial.html)
257
+ - [faster-whisper](https://github.com/guillaumekln/faster-whisper)
258
  """
259
  )
260