# Hugging Face Space: fffiloni / instant-TTS-Bark-cloning (status: Paused)
# File size: 5,377 Bytes

import gradio as gr
import os 
import shutil

from huggingface_hub import snapshot_download
import numpy as np
from scipy.io import wavfile

# Hugging Face Hub repositories to mirror locally when the module is loaded.
model_ids = ['suno/bark']

# A repo id has the form "<owner>/<name>"; each snapshot is stored under
# checkpoints/<name>.
for model_id in model_ids:
    model_name = model_id.rsplit('/', 1)[-1]
    snapshot_download(model_id, local_dir=f'checkpoints/{model_name}')

# Earlier approach, left disabled: build the Bark model straight from its
# config and load the weights from the local "checkpoints/bark" directory.
#from TTS.tts.configs.bark_config import BarkConfig
#from TTS.tts.models.bark import Bark

#os.environ['CUDA_VISIBLE_DEVICES'] = '1'
#config = BarkConfig()
#model = Bark.init_from_config(config)
#model.load_checkpoint(config, checkpoint_dir="checkpoints/bark", eval=True)

# Current approach: a single module-level TTS object, created once at import
# time with gpu=True and shared by the inference callbacks below.
# NOTE(review): this presumably resolves the Bark model by its catalogue name
# rather than reading "checkpoints/bark" — confirm whether the snapshot
# download above is still needed.
from TTS.api import TTS
tts = TTS("tts_models/multilingual/multi-dataset/bark", gpu=True)

def infer(prompt, input_wav_file):
    """Clone the voice in ``input_wav_file`` and speak ``prompt`` with it.

    The uploaded WAV is moved to ``bark_voices/<name>/<name>.wav`` (``<name>``
    is the upload's base name without its extension). The module-level ``tts``
    object then synthesizes ``prompt`` into ``output.wav`` with
    ``bark_voices/`` as the voice directory and ``<name>`` as the speaker.

    Args:
        prompt: Text to synthesize.
        input_wav_file: Filesystem path of the uploaded reference WAV.

    Returns:
        A 4-tuple matching the outputs wired to the "Submit" button:
        the path of the generated audio, the path of the ``.npz`` file found
        in the speaker folder (``None`` when there is none), an update that
        hides the first submit button, and an update that shows the second.

    Raises:
        gr.Error: If no audio file was provided.
    """
    if not input_wav_file:
        # Without this guard os.path.basename(None) fails with a TypeError
        # that tells the user nothing.
        raise gr.Error("Please upload a WAV file of the voice to clone.")

    print("SAVING THE AUDIO FILE TO WHERE IT BELONGS")

    # One folder per speaker, named after the uploaded file.
    file_name = os.path.splitext(os.path.basename(input_wav_file))[0]
    voice_dir = os.path.join("bark_voices", file_name)
    os.makedirs(voice_dir, exist_ok=True)

    # Move the upload to where the voice directory lookup expects it.
    shutil.move(input_wav_file, os.path.join(voice_dir, f"{file_name}.wav"))

    tts.tts_to_file(
        text=prompt,
        file_path="output.wav",
        voice_dir="bark_voices/",
        speaker=file_name,
    )

    # Log what the speaker folder contains after synthesis.
    contents = sorted(os.listdir(voice_dir))
    for item in contents:
        print(item)

    # os.listdir() returns entries in arbitrary order, so the .npz must be
    # selected by extension rather than by position.
    # NOTE(review): assumes the synthesis call leaves a .npz for the speaker
    # in this folder — confirm against the TTS library.
    npz_files = [item for item in contents if item.lower().endswith(".npz")]
    npz_path = f"bark_voices/{file_name}/{npz_files[0]}" if npz_files else None

    return "output.wav", npz_path, gr.update(visible=False), gr.update(visible=True)

def infer_with_npz(prompt, input_wav_file):
    """Synthesize ``prompt`` again for a speaker folder that already exists.

    The speaker folder is ``bark_voices/<name>``, where ``<name>`` is the base
    name of ``input_wav_file`` without its extension. When the folder holds a
    ``.npz`` file, the reference ``.wav`` files next to it are removed so only
    the ``.npz`` remains; otherwise the folder is left untouched. The
    module-level ``tts`` object then writes the new audio to ``output.wav``.

    Args:
        prompt: Text to synthesize.
        input_wav_file: Path of the originally uploaded WAV; only its base
            name is used, to locate the speaker folder.

    Returns:
        The path of the generated audio file, ``'output.wav'``.

    Raises:
        gr.Error: If no audio file was provided.
        FileNotFoundError: If the speaker folder does not exist.
    """
    if not input_wav_file:
        raise gr.Error("Please upload a WAV file of the voice to clone.")

    print("NEW GENERATION WITH EXISTING .NPZ")

    file_name = os.path.splitext(os.path.basename(input_wav_file))[0]
    voice_dir = f"bark_voices/{file_name}"

    contents = os.listdir(voice_dir)
    for item in contents:
        print(item)

    # os.listdir() order is arbitrary, so deleting "the first entry" could
    # remove the .npz itself. Remove WAV files by extension instead, and only
    # when an .npz is there to take over as the speaker prompt.
    has_npz = any(item.lower().endswith(".npz") for item in contents)
    if has_npz:
        for item in contents:
            if item.lower().endswith(".wav"):
                os.remove(os.path.join(voice_dir, item))

    # Actually regenerate the audio; returning 'output.wav' without this call
    # would hand back the result of the previous run.
    tts.tts_to_file(
        text=prompt,
        file_path="output.wav",
        voice_dir="bark_voices/",
        speaker=file_name,
    )

    # Log the folder contents again after clean-up and synthesis.
    for item in os.listdir(voice_dir):
        print(item)

    return 'output.wav'

def uploaded_audio():
    """Return two visibility updates: the first visible, the second hidden.

    Wired to the audio upload event, whose outputs are the "Submit" button
    and the "Submit 2" button, in that order.
    """
    show_first = gr.update(visible=True)
    hide_second = gr.update(visible=False)
    return show_first, hide_second
# Custom stylesheet passed to gr.Blocks: caps the width of the element with
# id "col-container" at 780px and centers it horizontally.
css = """
#col-container {max-width: 780px; margin-left: auto; margin-right: auto;}
"""

with gr.Blocks(css=css) as demo:
    # Every widget lives in one column, styled through #col-container in `css`.
    with gr.Column(elem_id="col-container"):
        gr.HTML("""
        <h1>Instant Voice Cloning</h1>
        """)

        # Inputs: the text to speak and the reference voice recording.
        prompt = gr.Textbox(label="Text to speech prompt")
        audio_in = gr.Audio(label="WAV voice to clone", type="filepath", source="upload")

        # Two buttons; the second one starts hidden.
        submit_btn = gr.Button("Submit")
        submit_with_npz_btn = gr.Button("Submit 2", visible=False)

        # Outputs: the synthesized audio and the speaker's .npz file.
        cloned_out = gr.Audio(label="Text to speech output")
        npz_file = gr.File(label=".npz file")

    # "Submit" runs infer, whose four return values fill the two outputs and
    # update the visibility of both buttons.
    submit_btn.click(
        fn=infer,
        inputs=[prompt, audio_in],
        outputs=[cloned_out, npz_file, submit_btn, submit_with_npz_btn],
    )

    # "Submit 2" runs infer_with_npz and only refreshes the audio output.
    submit_with_npz_btn.click(
        fn=infer_with_npz,
        inputs=[prompt, audio_in],
        outputs=[cloned_out],
    )

    # A new upload applies uploaded_audio's visibility updates to the buttons.
    audio_in.upload(
        fn=uploaded_audio,
        inputs=[],
        outputs=[submit_btn, submit_with_npz_btn],
    )

# Enable request queuing, then start serving the app.
demo.queue().launch()