|
import gradio as gr
import os
import shutil

from huggingface_hub import snapshot_download
import numpy as np
from scipy.io import wavfile

model_ids = [
    'suno/bark',
]

for model_id in model_ids:
    model_name = model_id.split('/')[-1]
    snapshot_download(model_id, local_dir=f'checkpoints/{model_name}')
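# Note: the snapshot above pins the suno/bark weights locally under checkpoints/;
# the Coqui TTS wrapper below resolves its model through its own cache.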
|
from TTS.api import TTS

tts = TTS("tts_models/multilingual/multi-dataset/bark", gpu=True)
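
# Coqui's Bark integration clones a voice from `bark_voices/<speaker>/<speaker>.wav`
# and caches the derived voice prompt as `<speaker>.npz` in the same folder;
# both inference paths below rely on that layout.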
|
|
|
def infer(prompt, input_wav_file):
    print("SAVING THE AUDIO FILE TO WHERE IT BELONGS")

    source_path = input_wav_file
    destination_directory = "bark_voices"
    file_name = os.path.splitext(os.path.basename(source_path))[0]
    destination_path = os.path.join(destination_directory, file_name)
    os.makedirs(destination_path, exist_ok=True)
    shutil.move(source_path, os.path.join(destination_path, f"{file_name}.wav"))
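    # The uploaded clip now sits at bark_voices/<file_name>/<file_name>.wav,
    # matching the speaker layout the cloning call below expects.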
|
|
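    # Older low-level path (model.synthesize + manual wavfile.write), kept
    # commented out for reference; superseded by tts.tts_to_file below.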
|
""" |
|
text = prompt |
|
|
|
print("SYNTHETIZING...") |
|
# with random speaker |
|
#output_dict = model.synthesize(text, config, speaker_id="random", voice_dirs=None) |
|
|
|
# cloning a speaker. |
|
# It assumes that you have a speaker file in `bark_voices/speaker_n/speaker.wav` or `bark_voices/speaker_n/speaker.npz` |
|
output_dict = model.synthesize( |
|
text, |
|
config, |
|
speaker_id=f"{file_name}", |
|
voice_dirs="bark_voices/", |
|
gpu=True |
|
) |
|
|
|
print(output_dict) |
|
|
|
|
|
|
|
sample_rate = 24000 # Replace with the actual sample rate |
|
print("WRITING WAVE FILE") |
|
wavfile.write( |
|
'output.wav', |
|
sample_rate, |
|
output_dict['wav'] |
|
) |
|
""" |
|
    tts.tts_to_file(text=prompt,
                    file_path="output.wav",
                    voice_dir="bark_voices/",
                    speaker=f"{file_name}")
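
    # This first cloning pass derives the Bark voice prompt from the reference
    # wav and caches it as bark_voices/<file_name>/<file_name>.npz for reuse.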
    contents = os.listdir(f"bark_voices/{file_name}")
    for item in contents:
        print(item)

    # os.listdir order is arbitrary, so locate the cached .npz explicitly
    # rather than assuming it is the second entry.
    npz_name = next(item for item in contents if item.endswith(".npz"))

    return "output.wav", f"bark_voices/{file_name}/{npz_name}", gr.update(visible=False), gr.update(visible=True)
|
|
|
def infer_with_npz(prompt, input_wav_file):
    print("NEW GENERATION WITH EXISTING .NPZ")

    source_path = input_wav_file
    file_name = os.path.splitext(os.path.basename(source_path))[0]

    contents = os.listdir(f"bark_voices/{file_name}")
    for item in contents:
        print(item)

    # Delete the reference .wav explicitly (not contents[0], which is
    # order-dependent) so synthesis falls back to the cached .npz prompt.
    for item in contents:
        if item.endswith(".wav"):
            os.remove(os.path.join(f"bark_voices/{file_name}", item))
|
""" |
|
print("BEGINNING GENERATION") |
|
# cloning a speaker. |
|
text = prompt |
|
# It assumes that you have a speaker file in `bark_voices/speaker_n/speaker.npz` |
|
output_dict = model.synthesize( |
|
text, |
|
config, |
|
speaker_id=f"{file_name}", |
|
voice_dirs="bark_voices/" |
|
) |
|
|
|
print(output_dict) |
|
|
|
print("WRITING WAVE FILE") |
|
|
|
sample_rate = 24000 # Replace with the actual sample rate |
|
|
|
wavfile.write( |
|
'output.wav', |
|
sample_rate, |
|
output_dict['wav'] |
|
) |
|
""" |
|
    # Regenerate with the cached voice: with the .wav gone, the Bark wrapper
    # loads bark_voices/<file_name>/<file_name>.npz for this speaker. Without
    # this call the function would return the previous, stale output.wav.
    tts.tts_to_file(text=prompt,
                    file_path="output.wav",
                    voice_dir="bark_voices/",
                    speaker=f"{file_name}")

    contents = os.listdir(f"bark_voices/{file_name}")
    for item in contents:
        print(item)

    return 'output.wav'
|
|
|
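# A fresh upload re-enables the full cloning button and hides the .npz
# shortcut until infer() has cached a voice prompt for the new clip.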
def uploaded_audio():
    return gr.update(visible=True), gr.update(visible=False)


css = """
#col-container {max-width: 780px; margin-left: auto; margin-right: auto;}
"""
|
|
|
with gr.Blocks(css=css) as demo:
    with gr.Column(elem_id="col-container"):

        gr.HTML("""
        <h1>Instant Voice Cloning</h1>
        """)

        prompt = gr.Textbox(
            label="Text to speech prompt"
        )

        audio_in = gr.Audio(
            label="WAV voice to clone",
            type="filepath",
            source="upload"
        )

        submit_btn = gr.Button("Submit")
        submit_with_npz_btn = gr.Button("Submit 2", visible=False)

        cloned_out = gr.Audio(
            label="Text to speech output"
        )

        npz_file = gr.File(
            label=".npz file"
        )
|
|
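        # Wiring: Submit runs the full clone and reveals Submit 2 once a voice
        # prompt is cached; Submit 2 regenerates from the cached .npz; a new
        # upload resets the buttons via uploaded_audio.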
|
        submit_btn.click(
            fn=infer,
            inputs=[
                prompt,
                audio_in
            ],
            outputs=[
                cloned_out,
                npz_file,
                submit_btn,
                submit_with_npz_btn
            ]
        )

        submit_with_npz_btn.click(
            fn=infer_with_npz,
            inputs=[
                prompt,
                audio_in
            ],
            outputs=[
                cloned_out
            ]
        )

        audio_in.upload(
            fn=uploaded_audio,
            inputs=[],
            outputs=[
                submit_btn,
                submit_with_npz_btn
            ]
        )
|
|
|
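# queue() processes requests one at a time by default, avoiding GPU contention.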
demo.queue().launch()