Spaces:

drewThomasson
/

OuteTTS-DEMO

Running

App Files Files Community

drewThomasson committed on Nov 5

Commit

a2dc963

•

1 Parent(s): 153c25e

Update app.py

Browse files

Files changed (1) hide show

app.py +67 -23

app.py CHANGED Viewed

@@ -2,6 +2,10 @@ import gradio as gr
 from outetts.v0_1.interface import InterfaceHF
 import logging
 import os
 # Configure logging to display information in the terminal
 logging.basicConfig(level=logging.INFO)
@@ -16,6 +20,15 @@ except Exception as e:
     logger.error(f"Failed to load model: {e}")
     raise e
 def generate_tts(text, temperature, repetition_penalty, max_length, speaker):
     """
     Generates speech from the input text using the OuteTTS model.
@@ -45,7 +58,7 @@ def generate_tts(text, temperature, repetition_penalty, max_length, speaker):
         logger.info("TTS generation complete.")
         # Save the output to a temporary WAV file
-        output_path = "output.wav"
         output.save(output_path)
         logger.info(f"Audio saved to {output_path}")
@@ -54,23 +67,57 @@ def generate_tts(text, temperature, repetition_penalty, max_length, speaker):
         logger.error(f"Error during TTS generation: {e}")
         return None
-def create_speaker(audio_file, transcript):
     """
-    Creates a custom speaker from a reference audio file and transcript.
     Parameters:
-        audio_file (file): Path to the reference audio file.
-        transcript (str): The transcript matching the audio.
     Returns:
         dict: Speaker configuration.
     """
-    logger.info("Received Voice Cloning request.")
-    logger.info(f"Reference Audio: {audio_file.name}, Transcript: {transcript}")
     try:
-        speaker = interface.create_speaker(audio_file.name, transcript)
         logger.info("Speaker created successfully.")
         return speaker
     except Exception as e:
         logger.error(f"Error during speaker creation: {e}")
@@ -85,7 +132,7 @@ with gr.Blocks() as demo:
         **Key Features:**
         - Pure language modeling approach to TTS
-        - Voice cloning capabilities
         - Compatible with LLaMa architecture
         """
     )
@@ -139,25 +186,21 @@ with gr.Blocks() as demo:
         with gr.Row():
             reference_audio = gr.Audio(
                 label="🔊 Reference Audio",
-                type="filepath",
                 source="upload",
                 optional=False
             )
-            reference_transcript = gr.Textbox(
-                label="📝 Transcript",
-                placeholder="Enter the transcript matching the reference audio",
-                lines=2
-            )
         create_speaker_button = gr.Button("🎤 Create Speaker")
-        speaker_info = gr.JSON(label="🗂️ Speaker Configuration")
-        generate_cloned_speech = gr.Textbox(
-            label="📄 Text Input",
-            placeholder="Enter the text for TTS generation with cloned voice",
-            lines=3
-        )
         with gr.Row():
             temperature_clone = gr.Slider(
@@ -191,8 +234,8 @@ with gr.Blocks() as demo:
         # Define the button click event for creating a speaker
         create_speaker_button.click(
-            fn=create_speaker,
-            inputs=[reference_audio, reference_transcript],
             outputs=speaker_info
         )
@@ -211,6 +254,7 @@ with gr.Blocks() as demo:
         **Credits:**
         - [WavTokenizer](https://github.com/jishengpeng/WavTokenizer)
         - [CTC Forced Alignment](https://pytorch.org/audio/stable/tutorials/ctc_forced_alignment_api_tutorial.html)
         """
     )

 from outetts.v0_1.interface import InterfaceHF
 import logging
 import os
+import tempfile
+# Import faster-whisper for transcription
+from faster_whisper import WhisperModel
 # Configure logging to display information in the terminal
 logging.basicConfig(level=logging.INFO)
     logger.error(f"Failed to load model: {e}")
     raise e
+# Initialize the faster-whisper model
+try:
+    logger.info("Initializing faster-whisper model for transcription.")
+    whisper_model = WhisperModel("tiny", device="cpu", compute_type="int8")
+    logger.info("faster-whisper model loaded successfully.")
+except Exception as e:
+    logger.error(f"Failed to load faster-whisper model: {e}")
+    raise e
 def generate_tts(text, temperature, repetition_penalty, max_length, speaker):
     """
     Generates speech from the input text using the OuteTTS model.
         logger.info("TTS generation complete.")
         # Save the output to a temporary WAV file
+        output_path = os.path.join(tempfile.gettempdir(), "output.wav")
         output.save(output_path)
         logger.info(f"Audio saved to {output_path}")
         logger.error(f"Error during TTS generation: {e}")
         return None
+def transcribe_audio(audio_path):
+    """
+    Transcribes the given audio file using faster-whisper.
+    Parameters:
+        audio_path (str): Path to the audio file.
+    Returns:
+        str: Transcribed text.
+    """
+    logger.info(f"Transcribing audio file: {audio_path}")
+    segments, info = whisper_model.transcribe(audio_path)
+    transcript = " ".join([segment.text for segment in segments])
+    logger.info(f"Transcription complete: {transcript}")
+    return transcript
+def create_speaker_with_transcription(audio_file):
     """
+    Creates a custom speaker from a reference audio file by automatically transcribing it.
     Parameters:
+        audio_file (file): Uploaded reference audio file.
     Returns:
         dict: Speaker configuration.
     """
+    logger.info("Received Voice Cloning request with audio file.")
     try:
+        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_audio:
+            temp_audio_path = temp_audio.name
+            # Save uploaded audio to temporary file
+            with open(temp_audio_path, "wb") as f:
+                f.write(audio_file.read())
+            logger.info(f"Reference audio saved to {temp_audio_path}")
+        # Transcribe the audio file
+        transcript = transcribe_audio(temp_audio_path)
+        if not transcript.strip():
+            logger.error("Transcription resulted in empty text.")
+            return None
+        # Create speaker using the transcribed text
+        speaker = interface.create_speaker(temp_audio_path, transcript)
         logger.info("Speaker created successfully.")
+        # Clean up the temporary audio file
+        os.remove(temp_audio_path)
+        logger.info(f"Temporary audio file {temp_audio_path} removed.")
         return speaker
     except Exception as e:
         logger.error(f"Error during speaker creation: {e}")
         **Key Features:**
         - Pure language modeling approach to TTS
+        - Voice cloning capabilities with automatic transcription
         - Compatible with LLaMa architecture
         """
     )
         with gr.Row():
             reference_audio = gr.Audio(
                 label="🔊 Reference Audio",
+                type="file",
                 source="upload",
                 optional=False
             )
         create_speaker_button = gr.Button("🎤 Create Speaker")
+        speaker_info = gr.JSON(label="🗂️ Speaker Configuration", interactive=False)
+        with gr.Row():
+            generate_cloned_speech = gr.Textbox(
+                label="📄 Text Input",
+                placeholder="Enter the text for TTS generation with cloned voice",
+                lines=3
+            )
         with gr.Row():
             temperature_clone = gr.Slider(
         # Define the button click event for creating a speaker
         create_speaker_button.click(
+            fn=create_speaker_with_transcription,
+            inputs=[reference_audio],
             outputs=speaker_info
         )
         **Credits:**
         - [WavTokenizer](https://github.com/jishengpeng/WavTokenizer)
         - [CTC Forced Alignment](https://pytorch.org/audio/stable/tutorials/ctc_forced_alignment_api_tutorial.html)
+        - [faster-whisper](https://github.com/guillaumekln/faster-whisper)
         """
     )