# Hugging Face Space app.py by fffiloni — "Update app.py", commit 65814fd (5.38 kB)
import gradio as gr
import os
import shutil
from huggingface_hub import snapshot_download
import numpy as np
from scipy.io import wavfile
# Bark model repositories to mirror locally before loading the TTS wrapper.
model_ids = [
    'suno/bark',
]

# Download each model's weights from the Hugging Face Hub into
# checkpoints/<model_name> so Coqui-TTS can load them from disk.
for repo_id in model_ids:
    checkpoint_name = repo_id.split('/')[-1]
    snapshot_download(repo_id, local_dir=f'checkpoints/{checkpoint_name}')

from TTS.api import TTS

# Coqui-TTS handle for the multilingual Bark model, running on GPU.
tts = TTS("tts_models/multilingual/multi-dataset/bark", gpu=True)
def infer(prompt, input_wav_file):
    """Clone the uploaded voice and synthesize `prompt` with it.

    Moves the uploaded WAV into ``bark_voices/<name>/<name>.wav`` — the
    directory layout the Bark voice-cloning backend expects — then runs
    text-to-speech, which also writes a ``<name>.npz`` speaker embedding
    into the same directory.

    Args:
        prompt: Text to synthesize.
        input_wav_file: Filesystem path to the uploaded reference WAV.

    Returns:
        Tuple of (generated wav path, speaker .npz path, gr.update hiding
        the full-clone submit button, gr.update showing the npz-reuse
        submit button).
    """
    print("SAVING THE AUDIO FILE TO WHERE IT BELONGS")
    # Speaker id is the uploaded file's base name without extension.
    file_name = os.path.splitext(os.path.basename(input_wav_file))[0]

    # Bark looks up speakers in bark_voices/<speaker_id>/<speaker_id>.wav
    destination_path = os.path.join("bark_voices", file_name)
    os.makedirs(destination_path, exist_ok=True)
    shutil.move(input_wav_file, os.path.join(destination_path, f"{file_name}.wav"))

    print("SYNTHETIZING...")
    tts.tts_to_file(text=prompt,
                    file_path="output.wav",
                    voice_dir="bark_voices/",
                    speaker=file_name)

    # Log what the voice directory now contains (.wav + generated .npz).
    contents = os.listdir(f"bark_voices/{file_name}")
    for item in contents:
        print(item)

    # Locate the .npz embedding by extension instead of relying on
    # os.listdir() ordering (the original used contents[1], but listdir
    # order is arbitrary and platform-dependent).
    npz_name = next((f for f in contents if f.endswith(".npz")), contents[-1])

    return ("output.wav",
            f"bark_voices/{file_name}/{npz_name}",
            gr.update(visible=False),
            gr.update(visible=True))
def infer_with_npz(prompt, input_wav_file):
    """Re-synthesize `prompt` reusing the speaker's cached .npz embedding.

    Removes the reference .wav from the speaker's voice directory so Bark
    loads the previously computed ``.npz`` embedding instead of re-cloning
    from audio, then generates a fresh ``output.wav``.

    Args:
        prompt: Text to synthesize.
        input_wav_file: Path of the originally uploaded WAV; only its base
            name is used to locate ``bark_voices/<name>/``.

    Returns:
        Path of the newly generated wav file ('output.wav').
    """
    print("NEW GENERATION WITH EXISTING .NPZ")
    file_name = os.path.splitext(os.path.basename(input_wav_file))[0]
    voice_dir = os.path.join("bark_voices", file_name)

    contents = os.listdir(voice_dir)
    for item in contents:
        print(item)

    # Delete the .wav explicitly. The original removed contents[0], but
    # os.listdir() ordering is arbitrary — that could delete the .npz
    # embedding this function exists to reuse.
    for entry in contents:
        if entry.endswith(".wav"):
            os.remove(os.path.join(voice_dir, entry))

    # Actually run synthesis. In the original, the generation code was left
    # inside a dead string literal, so `prompt` was ignored and a stale
    # output.wav from a previous run was returned.
    print("BEGINNING GENERATION")
    tts.tts_to_file(text=prompt,
                    file_path="output.wav",
                    voice_dir="bark_voices/",
                    speaker=file_name)

    # Log the directory again (should now contain only the .npz).
    contents = os.listdir(voice_dir)
    for item in contents:
        print(item)

    return 'output.wav'
def uploaded_audio():
    """Reset the buttons when a new voice is uploaded: show the full-clone
    submit button and hide the npz-reuse one (no embedding exists yet)."""
    show_submit = gr.update(visible=True)
    hide_npz_submit = gr.update(visible=False)
    return show_submit, hide_npz_submit
# Center the main column and cap its width.
css = """
#col-container {max-width: 780px; margin-left: auto; margin-right: auto;}
"""

with gr.Blocks(css=css) as demo:
    with gr.Column(elem_id="col-container"):
        gr.HTML("""
        <h1>Instant Voice Cloning</h1>
        """)

        # --- Inputs ---
        prompt = gr.Textbox(label="Text to speech prompt")
        audio_in = gr.Audio(label="WAV voice to clone",
                            type="filepath",
                            source="upload")

        # Two submit paths: full clone from wav, or reuse the cached .npz.
        submit_btn = gr.Button("Submit")
        submit_with_npz_btn = gr.Button("Submit 2", visible=False)

        # --- Outputs ---
        cloned_out = gr.Audio(label="Text to speech output")
        npz_file = gr.File(label=".npz file")

    # First generation: clone the voice, return audio + embedding,
    # and swap which submit button is visible.
    submit_btn.click(
        fn=infer,
        inputs=[prompt, audio_in],
        outputs=[cloned_out, npz_file, submit_btn, submit_with_npz_btn],
    )

    # Subsequent generations reuse the stored .npz embedding.
    submit_with_npz_btn.click(
        fn=infer_with_npz,
        inputs=[prompt, audio_in],
        outputs=[cloned_out],
    )

    # A fresh upload reverts to the full-clone flow.
    audio_in.upload(
        fn=uploaded_audio,
        inputs=[],
        outputs=[submit_btn, submit_with_npz_btn],
    )

demo.queue().launch()