Spaces:

drewThomasson
/

OuteTTS-DEMO

Sleeping

App Files Files Community

drewThomasson committed on Nov 5, 2024

Commit

153c25e

verified ·

1 Parent(s): 4e4528b

Update app.py

Browse files

Files changed (1) hide show

app.py +139 -44

app.py CHANGED Viewed

@@ -1,6 +1,7 @@
 import gradio as gr
 from outetts.v0_1.interface import InterfaceHF
 import logging
 # Configure logging to display information in the terminal
 logging.basicConfig(level=logging.INFO)
@@ -15,28 +16,31 @@ except Exception as e:
     logger.error(f"Failed to load model: {e}")
     raise e
-def generate_tts(text, temperature, repetition_penalty, max_length):
     """
     Generates speech from the input text using the OuteTTS model.
     Parameters:
         text (str): The input text for TTS.
         temperature (float): Sampling temperature.
         repetition_penalty (float): Repetition penalty.
         max_length (int): Maximum length of the generated audio tokens.
     Returns:
         str: Path to the generated audio file.
     """
     logger.info("Received TTS generation request.")
-    logger.info(f"Parameters - Text: {text}, Temperature: {temperature}, Repetition Penalty: {repetition_penalty}, Max Length: {max_length}")
     try:
         output = interface.generate(
             text=text,
             temperature=temperature,
             repetition_penalty=repetition_penalty,
-            max_length=max_length  # Corrected spelling from 'max_lenght' to 'max_length'
         )
         logger.info("TTS generation complete.")
@@ -50,13 +54,35 @@ def generate_tts(text, temperature, repetition_penalty, max_length):
         logger.error(f"Error during TTS generation: {e}")
         return None
 # Define the Gradio Blocks interface
 with gr.Blocks() as demo:
     gr.Markdown("# 🎤 OuteTTS - Text to Speech Interface")
     gr.Markdown(
         """
         Generate speech from text using the **OuteTTS-0.1-350M** model.
         **Key Features:**
         - Pure language modeling approach to TTS
         - Voice cloning capabilities
@@ -64,55 +90,124 @@ with gr.Blocks() as demo:
         """
     )
-    with gr.Row():
-        text_input = gr.Textbox(
             label="📄 Text Input",
-            placeholder="Enter the text for TTS generation",
             lines=3
         )
-    with gr.Row():
-        temperature = gr.Slider(
-            minimum=0.1,
-            maximum=1.0,
-            value=0.1,
-            step=0.01,
-            label="🌡️ Temperature"
         )
-        repetition_penalty = gr.Slider(
-            minimum=0.5,
-            maximum=2.0,
-            value=1.1,
-            step=0.1,
-            label="🔁 Repetition Penalty"
         )
-        max_length = gr.Slider(
-            minimum=256,
-            maximum=4096,
-            value=1024,
-            step=256,
-            label="📏 Max Length"
         )
-    generate_button = gr.Button("🔊 Generate Speech")
-    output_audio = gr.Audio(
-        label="🎧 Generated Speech",
-        type="filepath"  # Expecting a file path to the audio
-    )
-    # Define the button click event
-    generate_button.click(
-        fn=generate_tts,
-        inputs=[text_input, temperature, repetition_penalty, max_length],
-        outputs=output_audio
-    )
     gr.Markdown(
         """
         ---
         **Technical Blog:** [OuteTTS-0.1-350M](https://www.outeai.com/blog/OuteTTS-0.1-350M)
         **Credits:**
         - [WavTokenizer](https://github.com/jishengpeng/WavTokenizer)
         - [CTC Forced Alignment](https://pytorch.org/audio/stable/tutorials/ctc_forced_alignment_api_tutorial.html)

 import gradio as gr
 from outetts.v0_1.interface import InterfaceHF
 import logging
+import os
 # Configure logging to display information in the terminal
 logging.basicConfig(level=logging.INFO)
     logger.error(f"Failed to load model: {e}")
     raise e
+def generate_tts(text, temperature, repetition_penalty, max_length, speaker):
     """
     Generates speech from the input text using the OuteTTS model.
     Parameters:
         text (str): The input text for TTS.
         temperature (float): Sampling temperature.
         repetition_penalty (float): Repetition penalty.
         max_length (int): Maximum length of the generated audio tokens.
+        speaker (dict): Speaker configuration for voice cloning.
     Returns:
         str: Path to the generated audio file.
     """
     logger.info("Received TTS generation request.")
+    logger.info(f"Parameters - Text: {text}, Temperature: {temperature}, Repetition Penalty: {repetition_penalty}, Max Length: {max_length}, Speaker: {speaker is not None}")
     try:
+        # Due to a typo in interface.py, use 'max_lenght' instead of 'max_length'
         output = interface.generate(
             text=text,
             temperature=temperature,
             repetition_penalty=repetition_penalty,
+            max_lenght=max_length,  # Pass the parameter with typo
+            speaker=speaker
         )
         logger.info("TTS generation complete.")
         logger.error(f"Error during TTS generation: {e}")
         return None
+def create_speaker(audio_file, transcript):
+    """
+    Creates a custom speaker from a reference audio file and transcript.
+    Parameters:
+        audio_file (file): Path to the reference audio file.
+        transcript (str): The transcript matching the audio.
+    Returns:
+        dict: Speaker configuration.
+    """
+    logger.info("Received Voice Cloning request.")
+    logger.info(f"Reference Audio: {audio_file.name}, Transcript: {transcript}")
+    try:
+        speaker = interface.create_speaker(audio_file.name, transcript)
+        logger.info("Speaker created successfully.")
+        return speaker
+    except Exception as e:
+        logger.error(f"Error during speaker creation: {e}")
+        return None
 # Define the Gradio Blocks interface
 with gr.Blocks() as demo:
     gr.Markdown("# 🎤 OuteTTS - Text to Speech Interface")
     gr.Markdown(
         """
         Generate speech from text using the **OuteTTS-0.1-350M** model.
         **Key Features:**
         - Pure language modeling approach to TTS
         - Voice cloning capabilities
         """
     )
+    with gr.Tab("Basic TTS"):
+        with gr.Row():
+            text_input = gr.Textbox(
+                label="📄 Text Input",
+                placeholder="Enter the text for TTS generation",
+                lines=3
+            )
+        with gr.Row():
+            temperature = gr.Slider(
+                minimum=0.1,
+                maximum=1.0,
+                value=0.1,
+                step=0.01,
+                label="🌡️ Temperature"
+            )
+            repetition_penalty = gr.Slider(
+                minimum=0.5,
+                maximum=2.0,
+                value=1.1,
+                step=0.1,
+                label="🔁 Repetition Penalty"
+            )
+            max_length = gr.Slider(
+                minimum=256,
+                maximum=4096,
+                value=1024,
+                step=256,
+                label="📏 Max Length"
+            )
+        generate_button = gr.Button("🔊 Generate Speech")
+        output_audio = gr.Audio(
+            label="🎧 Generated Speech",
+            type="filepath"  # Expecting a file path to the audio
+        )
+        # Define the button click event for Basic TTS
+        generate_button.click(
+            fn=generate_tts,
+            inputs=[text_input, temperature, repetition_penalty, max_length, None],
+            outputs=output_audio
+        )
+    with gr.Tab("Voice Cloning"):
+        with gr.Row():
+            reference_audio = gr.Audio(
+                label="🔊 Reference Audio",
+                type="filepath",
+                source="upload",
+                optional=False
+            )
+            reference_transcript = gr.Textbox(
+                label="📝 Transcript",
+                placeholder="Enter the transcript matching the reference audio",
+                lines=2
+            )
+        create_speaker_button = gr.Button("🎤 Create Speaker")
+        speaker_info = gr.JSON(label="🗂️ Speaker Configuration")
+        generate_cloned_speech = gr.Textbox(
             label="📄 Text Input",
+            placeholder="Enter the text for TTS generation with cloned voice",
             lines=3
         )
+        with gr.Row():
+            temperature_clone = gr.Slider(
+                minimum=0.1,
+                maximum=1.0,
+                value=0.1,
+                step=0.01,
+                label="🌡️ Temperature"
+            )
+            repetition_penalty_clone = gr.Slider(
+                minimum=0.5,
+                maximum=2.0,
+                value=1.1,
+                step=0.1,
+                label="🔁 Repetition Penalty"
+            )
+            max_length_clone = gr.Slider(
+                minimum=256,
+                maximum=4096,
+                value=1024,
+                step=256,
+                label="📏 Max Length"
+            )
+        generate_cloned_button = gr.Button("🔊 Generate Cloned Speech")
+        output_cloned_audio = gr.Audio(
+            label="🎧 Generated Cloned Speech",
+            type="filepath"  # Expecting a file path to the audio
         )
+        # Define the button click event for creating a speaker
+        create_speaker_button.click(
+            fn=create_speaker,
+            inputs=[reference_audio, reference_transcript],
+            outputs=speaker_info
         )
+        # Define the button click event for generating speech with the cloned voice
+        generate_cloned_button.click(
+            fn=generate_tts,
+            inputs=[generate_cloned_speech, temperature_clone, repetition_penalty_clone, max_length_clone, speaker_info],
+            outputs=output_cloned_audio
         )
     gr.Markdown(
         """
         ---
         **Technical Blog:** [OuteTTS-0.1-350M](https://www.outeai.com/blog/OuteTTS-0.1-350M)
         **Credits:**
         - [WavTokenizer](https://github.com/jishengpeng/WavTokenizer)
         - [CTC Forced Alignment](https://pytorch.org/audio/stable/tutorials/ctc_forced_alignment_api_tutorial.html)