# voice_clone_v2 / app.py
# ahassoun's picture
# Update app.py
# 8ab15de
from TTS.api import TTS
import gradio as gr
from gradio import Dropdown
from scipy.io.wavfile import write
import os
import shutil
import re
# Placeholder for a user selection; not read anywhere in the visible code.
user_choice = ""

# Upper bound on the number of sentences kept from a text before synthesis.
MAX_NUMBER_SENTENCES = 10

# Raw value of the ALLOW_FILE_UPLOAD environment variable (None when unset).
file_upload_available = os.getenv("ALLOW_FILE_UPLOAD")

# Demo scripts, keyed first by theme and then by script variant
# ("Positive", "Negative", "Random").
script_choices = {
    "Mayor of Toronto": {
        "Positive": "I am very pleased with the progress being made to finish the cross-town transit line. This has been an excellent use of taxpayer dollars.",
        "Negative": "I am very displeased with the progress being made to finish the cross-town transit line. This has been an embarrassing use of taxpayer dollars.",
        "Random": "I like being Mayor because I don’t have to pay my parking tickets.",
    },
    "Witness": {
        "Positive": "Yes, John is my friend. He was at my house watching the baseball game all night.",
        "Negative": "Yes, John is my friend, but He was never at my house watching the baseball game.",
        "Random": "He is my friend, but I do not trust John.",
    },
    "Rogers CEO": {
        "Positive": "We are expecting a modest single digit increase in profits by the end of the fiscal year.",
        "Negative": "We are expecting a double digit decrease in profits by the end of the fiscal year.",
        "Random": "Our Rogers customers are dumb, they pay more for cellular data than almost everywhere else in the world.",
    },
    "Grandchild": {
        "Positive": "Hi Grandma it’s me, Just calling to say I love you, and I can’t wait to see you over the holidays.",
        "Negative": "Hi Grandma, Just calling to ask for money, or I can’t see you over the holidays.",
        "Random": "Grandma, I can’t find your email address. I need to send you something important.",
    },
}
# Instantiate the multilingual Bark model once, at import time, so every
# request handled by infer() shares the same TTS object. gr=True is not used
# here; gpu=True requests GPU execution from the TTS library.
tts = TTS("tts_models/multilingual/multi-dataset/bark", gpu=True)
def infer(prompt, input_wav_file, script_type, selected_theme):
    """Clone the recorded voice and synthesize the selected script with it.

    Args:
        prompt: Text of the script box the user reads aloud. It is only
            checked for emptiness; it is not the text that gets synthesized.
        input_wav_file: Filesystem path of the recorded voice sample.
        script_type: Script variant to synthesize (a key of the selected
            theme's entry in ``script_choices``, e.g. "Random").
        selected_theme: Theme name, a key of ``script_choices``.

    Returns:
        A 4-tuple, one value per output component wired to the Submit
        button: the generated audio path, the waveform video, an update for
        the ``.npz`` file component, and the folder holding the voice files.
        When the request is rejected (missing prompt, recording or script),
        four no-op ``gr.update()`` values are returned instead so the UI is
        left unchanged.
    """
    print("Prompt:", prompt)
    print("Input WAV File:", input_wav_file)
    print("Script Type:", script_type)
    print(selected_theme)
    print("""
—————
NEW INFERENCE:
———————
""")

    # One no-op update per wired output; used whenever the request is refused.
    no_change = tuple(gr.update() for _ in range(4))

    if not prompt:
        gr.Warning("Do not forget to provide a tts prompt !")
        return no_change
    if not input_wav_file:
        # Without a recording there is no path to derive the speaker name from.
        gr.Warning("Please record a voice sample before submitting.")
        return no_change

    theme_dict = script_choices.get(selected_theme, {})
    chosen_script = theme_dict.get(script_type, "")
    print(theme_dict)
    print(chosen_script)
    if not chosen_script:
        # Happens when no theme or no script type has been selected yet.
        gr.Warning("Please select a theme and a script type before submitting.")
        return no_change

    # Cap the text that is actually sent to the model, not the read-aloud
    # prompt, which is never synthesized.
    sentences = re.split(r'(?<=[.!?])\s+', chosen_script)
    if len(sentences) > MAX_NUMBER_SENTENCES:
        gr.Info("Your text is too long. To keep this demo enjoyable for everyone, we only kept the first 10 sentences :) Duplicate this space and set MAX_NUMBER_SENTENCES for longer texts ;)")
        chosen_script = ' '.join(sentences[:MAX_NUMBER_SENTENCES])

    # Move the recording to bark_voices/<name>/<name>.wav, the layout that is
    # handed to the TTS call below through voice_dir / speaker.
    voice_root = "bark_voices"
    file_name = os.path.splitext(os.path.basename(input_wav_file))[0]
    destination_path = os.path.join(voice_root, file_name)
    os.makedirs(destination_path, exist_ok=True)
    shutil.move(input_wav_file,
                os.path.join(destination_path, f"{file_name}.wav"))

    gr.Info("Generating audio from prompt")
    tts.tts_to_file(text=chosen_script,
                    file_path="output.wav",
                    voice_dir="bark_voices/",
                    speaker=file_name)

    contents = os.listdir(destination_path)
    for item in contents:
        print(item)

    # os.listdir order is arbitrary, so locate the .npz file by extension
    # rather than by position (presumably the speaker data written by the
    # TTS call - TODO confirm). Keep the component hidden if none exists.
    npz_name = next((name for name in contents if name.endswith(".npz")), None)
    if npz_name is None:
        npz_update = gr.update(visible=False)
    else:
        npz_update = gr.update(
            value=os.path.join(destination_path, npz_name), visible=True)

    print("Preparing final waveform video ...")
    tts_video = gr.make_waveform(audio="output.wav")
    print(tts_video)
    print("FINISHED")

    return "output.wav", tts_video, npz_update, destination_path
# s
# Emoji displayed next to each theme; keys mirror the theme names offered
# in the theme dropdown.
theme_emojis = {
    "Mayor of Toronto": "🏙️",
    "Witness": "👤",
    "Rogers CEO": "📱",
    "Grandchild": "👪",
}
# Custom stylesheet passed to gr.Blocks(css=...): centers the main column,
# enlarges the microphone button and positions the theme emoji.
css = """
#col-container {max-width: 780px; margin-left: auto; margin-right: auto; background-size: contain; background-repeat: no-repeat;}
#theme-emoji-bg {position: absolute; top: 0; left: 0; width: 100%; height: 100%; z-index: -1; opacity: 0.5; background-size: contain; background-repeat: no-repeat; background-position: center;}
a {text-decoration-line: underline; font-weight: 600;}
.mic-wrap > button {
width: 100%;
height: 60px;
font-size: 1.4em!important;
}
.record-icon.svelte-1thnwz {
display: flex;
position: relative;
margin-right: var(--size-2);
width: unset;
height: unset;
}
span.record-icon > span.dot.svelte-1thnwz {
width: 20px!important;
height: 20px!important;
}
.animate-spin {
animation: spin 1s linear infinite;
}
@keyframes spin {
from {
transform: rotate(0deg);
}
to {
transform: rotate(360deg);
}
}
#theme-emoji {
position: absolute;
top: 10px;
right: 10px;
}
"""
def load_hidden_mic(audio_in):
    """Log that a new recording arrived and pass it through unchanged.

    Args:
        audio_in: The value produced by the microphone component.

    Returns:
        ``audio_in`` itself, untouched.
    """
    print("USER RECORDED A NEW SAMPLE")
    return audio_in
def update_script_text(theme, script_type):
    """Look up the texts and emoji to display for a theme and script type.

    Args:
        theme: Theme name, expected to be a key of ``script_choices``.
        script_type: Script variant to show as the bot's target text.

    Returns:
        A 4-tuple: the theme's "Positive" script, the script stored under
        ``script_type``, the theme's emoji, and ``theme`` itself. Unknown
        themes or script types yield empty strings rather than raising.
    """
    scripts_for_theme = script_choices.get(theme, {})
    read_aloud_script = scripts_for_theme.get("Positive", "")
    bot_script = scripts_for_theme.get(script_type, "")
    emoji = theme_emojis.get(theme, "")
    return read_aloud_script, bot_script, emoji, theme
# Build the demo UI. The first inner column collects the theme, the scripts
# and the voice recording; the second one holds the generated results.
with gr.Blocks(css=css) as demo:
    with gr.Column(elem_id="col-container"):
        with gr.Row():
            with gr.Column():
                theme_emoji_output = gr.Label(label="Theme Emoji")
                theme_dropdown = gr.Dropdown(
                    label="1. Select a Theme",
                    choices=list(script_choices.keys()),
                )
                script_text = gr.Textbox(
                    label="2 & 3. Read the script below aloud THREE times for the best output:",
                    lines=5,
                )
                script_type_dropdown = gr.Dropdown(
                    label="4. Select the Script Type for Bot Output",
                    choices=["Random", "Negative"],
                )
                output_script_text = gr.Textbox(
                    label="The bot will try to emulate the following script:",
                    lines=5,
                )

                # Refresh both script boxes and the emoji whenever either
                # dropdown changes. Each listener is registered exactly once
                # so update_script_text runs a single time per change.
                theme_dropdown.change(
                    fn=update_script_text,
                    inputs=[theme_dropdown, script_type_dropdown],
                    outputs=[script_text, output_script_text, theme_emoji_output],
                )
                script_type_dropdown.change(
                    fn=update_script_text,
                    inputs=[theme_dropdown, script_type_dropdown],
                    outputs=[script_text, output_script_text, theme_emoji_output],
                )

                # Voice sample is recorded from the microphone and handed to
                # the handlers as a file path.
                micro_in = gr.Audio(
                    label="Record voice to clone",
                    type="filepath",
                    source="microphone",
                    interactive=True,
                )
                hidden_audio_numpy = gr.Audio(type="numpy", visible=False)
                submit_btn = gr.Button("Submit")

            with gr.Column():
                cloned_out = gr.Audio(
                    label="Text to speech output", visible=False)
                video_out = gr.Video(
                    label="Waveform video", elem_id="voice-video-out")
                npz_file = gr.File(label=".npz file", visible=False)
                folder_path = gr.Textbox(visible=False)

        # Mirror each finished recording into the hidden audio component.
        micro_in.stop_recording(
            fn=load_hidden_mic,
            inputs=[micro_in],
            outputs=[hidden_audio_numpy],
            queue=False,
        )

        # Run the voice-cloning pipeline; the theme is passed along so the
        # handler can look up the script to synthesize.
        submit_btn.click(
            fn=infer,
            inputs=[script_text, micro_in, script_type_dropdown, theme_dropdown],
            outputs=[cloned_out, video_out, npz_file, folder_path],
        )

# Queue requests (at most 10 waiting, API access closed) and start the app.
demo.queue(api_open=False, max_size=10).launch()