"""Gradio demo: instant voice cloning with Bark (via Coqui-TTS).

Flow: the user uploads a reference WAV; it is filed under
``bark_voices/<name>/<name>.wav`` (the Bark voice-dir convention), and
``tts.tts_to_file`` synthesizes the prompt in the cloned voice, producing a
cached speaker embedding (``.npz``) alongside the WAV. A second button reuses
that ``.npz`` for faster follow-up generations.
"""
import gradio as gr
import os
import shutil
from huggingface_hub import snapshot_download
import numpy as np
from scipy.io import wavfile

# Download the Bark checkpoint(s) once at startup so TTS can load them locally.
model_ids = [
    'suno/bark',
]
for model_id in model_ids:
    model_name = model_id.split('/')[-1]
    snapshot_download(model_id, local_dir=f'checkpoints/{model_name}')

from TTS.api import TTS

# Coqui-TTS wrapper around Bark; gpu=True assumes a CUDA device is available.
tts = TTS("tts_models/multilingual/multi-dataset/bark", gpu=True)


def infer(prompt, input_wav_file):
    """Clone the voice in *input_wav_file* and speak *prompt* with it.

    Moves the uploaded WAV to ``bark_voices/<name>/<name>.wav`` so the Bark
    voice-dir convention can build a speaker embedding, then synthesizes.

    Returns:
        tuple: (path to synthesized audio, path to the generated ``.npz``
        speaker file, gr.update hiding the first submit button, gr.update
        showing the "Submit 2" button).
    """
    print("SAVING THE AUDIO FILE TO WHERE IT BELONGS")
    source_path = input_wav_file
    destination_directory = "bark_voices"
    # Speaker id = uploaded file name without its extension.
    file_name = os.path.splitext(os.path.basename(source_path))[0]
    destination_path = os.path.join(destination_directory, file_name)
    os.makedirs(destination_path, exist_ok=True)
    # Move (not copy): the Gradio temp upload is no longer needed at its
    # original path.
    shutil.move(source_path, os.path.join(destination_path, f"{file_name}.wav"))

    print("SYNTHETIZING...")
    tts.tts_to_file(
        text=prompt,
        file_path="output.wav",
        voice_dir="bark_voices/",
        speaker=f"{file_name}",
    )

    contents = os.listdir(f"bark_voices/{file_name}")
    for item in contents:
        print(item)

    # BUGFIX: the voice dir now holds the .wav plus the generated .npz, but
    # os.listdir order is arbitrary — pick the .npz explicitly instead of
    # trusting contents[1].
    npz_name = next(item for item in contents if item.endswith(".npz"))

    return (
        "output.wav",
        f"bark_voices/{file_name}/{npz_name}",
        gr.update(visible=False),
        gr.update(visible=True),
    )


def infer_with_npz(prompt, input_wav_file):
    """Re-synthesize *prompt* reusing the cached ``.npz`` speaker embedding.

    Deletes the reference WAV so Bark falls back to the ``.npz`` file,
    skipping the expensive embedding computation.

    Returns:
        str: path to the synthesized audio file.
    """
    print("NEW GENERATION WITH EXISTING .NPZ")
    file_name = os.path.splitext(os.path.basename(input_wav_file))[0]
    voice_path = f"bark_voices/{file_name}"
    contents = os.listdir(voice_path)
    for item in contents:
        print(item)

    # BUGFIX: delete the .wav explicitly rather than assuming it is
    # contents[0] — listdir order is arbitrary, and removing the .npz by
    # mistake would defeat the whole point of this code path.
    for item in contents:
        if item.endswith(".wav"):
            os.remove(os.path.join(voice_path, item))

    # BUGFIX: the original left the synthesis call inside a dead string
    # literal, so this button returned the previous output.wav without
    # generating anything. Generate from the cached .npz now.
    print("BEGINNING GENERATION")
    tts.tts_to_file(
        text=prompt,
        file_path="output.wav",
        voice_dir="bark_voices/",
        speaker=f"{file_name}",
    )

    # Print the directory contents again after generation.
    contents = os.listdir(voice_path)
    for item in contents:
        print(item)
    return 'output.wav'


def uploaded_audio():
    """On a fresh upload, show the plain submit button and hide "Submit 2"."""
    return gr.update(visible=True), gr.update(visible=False)


css = """
#col-container {max-width: 780px; margin-left: auto; margin-right: auto;}
"""

with gr.Blocks(css=css) as demo:
    with gr.Column(elem_id="col-container"):
        gr.HTML("""

Instant Voice Cloning

""")
        prompt = gr.Textbox(
            label="Text to speech prompt"
        )
        audio_in = gr.Audio(
            label="WAV voice to clone",
            type="filepath",
            source="upload"
        )
        submit_btn = gr.Button("Submit")
        submit_with_npz_btn = gr.Button("Submit 2", visible=False)
        cloned_out = gr.Audio(
            label="Text to speech output"
        )
        npz_file = gr.File(
            label=".npz file"
        )

        submit_btn.click(
            fn=infer,
            inputs=[prompt, audio_in],
            outputs=[cloned_out, npz_file, submit_btn, submit_with_npz_btn],
        )
        submit_with_npz_btn.click(
            fn=infer_with_npz,
            inputs=[prompt, audio_in],
            outputs=[cloned_out],
        )
        audio_in.upload(
            fn=uploaded_audio,
            inputs=[],
            outputs=[submit_btn, submit_with_npz_btn],
        )

demo.queue().launch()