|
import gradio as gr
import os
import shutil

from huggingface_hub import snapshot_download
import numpy as np
from scipy.io import wavfile

model_ids = [
    'suno/bark',
]

for model_id in model_ids:
    model_name = model_id.split('/')[-1]
    snapshot_download(model_id, local_dir=f'checkpoints/{model_name}')
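# Note: the snapshot above pins the suno/bark weights locally under checkpoints/;
# the Coqui TTS wrapper below resolves its model through its own cache.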
|
from TTS.api import TTS

tts = TTS("tts_models/multilingual/multi-dataset/bark", gpu=True)
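
# Coqui's Bark integration clones a voice from `bark_voices/<speaker>/<speaker>.wav`
# and caches the derived voice prompt as `<speaker>.npz` in the same folder;
# both inference paths below rely on that layout.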
|
|
|
def infer(prompt, input_wav_file):
    print("SAVING THE AUDIO FILE TO WHERE IT BELONGS")

    source_path = input_wav_file
    destination_directory = "bark_voices"
    file_name = os.path.splitext(os.path.basename(source_path))[0]
    destination_path = os.path.join(destination_directory, file_name)
    os.makedirs(destination_path, exist_ok=True)
    shutil.move(source_path, os.path.join(destination_path, f"{file_name}.wav"))
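    # The uploaded clip now sits at bark_voices/<file_name>/<file_name>.wav,
    # matching the speaker layout the cloning call below expects.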
|
|
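    # Older low-level path (model.synthesize + manual wavfile.write), kept
    # commented out for reference; superseded by tts.tts_to_file below.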
|
""" |
|
text = prompt |
|
|
|
print("SYNTHETIZING...") |
|
# with random speaker |
|
#output_dict = model.synthesize(text, config, speaker_id="random", voice_dirs=None) |
|
|
|
# cloning a speaker. |
|
# It assumes that you have a speaker file in `bark_voices/speaker_n/speaker.wav` or `bark_voices/speaker_n/speaker.npz` |
|
output_dict = model.synthesize( |
|
text, |
|
config, |
|
speaker_id=f"{file_name}", |
|
voice_dirs="bark_voices/", |
|
gpu=True |
|
) |
|
|
|
print(output_dict) |
|
|
|
|
|
|
|
sample_rate = 24000 # Replace with the actual sample rate |
|
print("WRITING WAVE FILE") |
|
wavfile.write( |
|
'output.wav', |
|
sample_rate, |
|
output_dict['wav'] |
|
) |
|
""" |
|
    tts.tts_to_file(text=prompt,
                    file_path="output.wav",
                    voice_dir="bark_voices/",
                    speaker=f"{file_name}")
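
    # This first cloning pass derives the Bark voice prompt from the reference
    # wav and caches it as bark_voices/<file_name>/<file_name>.npz for reuse.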
    contents = os.listdir(f"bark_voices/{file_name}")
    for item in contents:
        print(item)

    # os.listdir order is arbitrary, so locate the cached .npz explicitly
    # rather than assuming it is the second entry.
    npz_name = next(item for item in contents if item.endswith(".npz"))

    return "output.wav", f"bark_voices/{file_name}/{npz_name}", gr.update(visible=False), gr.update(visible=True)
|
|
|
def infer_with_npz(prompt, input_wav_file):
    print("NEW GENERATION WITH EXISTING .NPZ")

    source_path = input_wav_file
    file_name = os.path.splitext(os.path.basename(source_path))[0]

    contents = os.listdir(f"bark_voices/{file_name}")
    for item in contents:
        print(item)

    # Delete the reference .wav explicitly (not contents[0], which is
    # order-dependent) so synthesis falls back to the cached .npz prompt.
    for item in contents:
        if item.endswith(".wav"):
            os.remove(os.path.join(f"bark_voices/{file_name}", item))
|
""" |
|
print("BEGINNING GENERATION") |
|
# cloning a speaker. |
|
text = prompt |
|
# It assumes that you have a speaker file in `bark_voices/speaker_n/speaker.npz` |
|
output_dict = model.synthesize( |
|
text, |
|
config, |
|
speaker_id=f"{file_name}", |
|
voice_dirs="bark_voices/" |
|
) |
|
|
|
print(output_dict) |
|
|
|
print("WRITING WAVE FILE") |
|
|
|
sample_rate = 24000 # Replace with the actual sample rate |
|
|
|
wavfile.write( |
|
'output.wav', |
|
sample_rate, |
|
output_dict['wav'] |
|
) |
|
""" |
|
    # Regenerate with the cached voice: with the .wav gone, the Bark wrapper
    # loads bark_voices/<file_name>/<file_name>.npz for this speaker. Without
    # this call the function would return the previous, stale output.wav.
    tts.tts_to_file(text=prompt,
                    file_path="output.wav",
                    voice_dir="bark_voices/",
                    speaker=f"{file_name}")

    contents = os.listdir(f"bark_voices/{file_name}")
    for item in contents:
        print(item)

    return 'output.wav'
|
|
|
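# A fresh upload re-enables the full cloning button and hides the .npz
# shortcut until infer() has cached a voice prompt for the new clip.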
def uploaded_audio():
    return gr.update(visible=True), gr.update(visible=False)


css = """
#col-container {max-width: 780px; margin-left: auto; margin-right: auto;}
"""
|
|
|
with gr.Blocks(css=css) as demo:
    with gr.Column(elem_id="col-container"):

        gr.HTML("""
        <h1>Instant Voice Cloning</h1>
        """)

        prompt = gr.Textbox(
            label="Text to speech prompt"
        )

        audio_in = gr.Audio(
            label="WAV voice to clone",
            type="filepath",
            source="upload"
        )

        submit_btn = gr.Button("Submit")
        submit_with_npz_btn = gr.Button("Submit 2", visible=False)

        cloned_out = gr.Audio(
            label="Text to speech output"
        )

        npz_file = gr.File(
            label=".npz file"
        )
|
|
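        # Wiring: Submit runs the full clone and reveals Submit 2 once a voice
        # prompt is cached; Submit 2 regenerates from the cached .npz; a new
        # upload resets the buttons via uploaded_audio.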
|
        submit_btn.click(
            fn=infer,
            inputs=[
                prompt,
                audio_in
            ],
            outputs=[
                cloned_out,
                npz_file,
                submit_btn,
                submit_with_npz_btn
            ]
        )

        submit_with_npz_btn.click(
            fn=infer_with_npz,
            inputs=[
                prompt,
                audio_in
            ],
            outputs=[
                cloned_out
            ]
        )

        audio_in.upload(
            fn=uploaded_audio,
            inputs=[],
            outputs=[
                submit_btn,
                submit_with_npz_btn
            ]
        )
|
|
|
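# queue() processes requests one at a time by default, avoiding GPU contention.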
demo.queue().launch()