Spaces:

fffiloni
/

clone-voice-for-bark

Paused

App Files Files Community

clone-voice-for-bark / app.py

fffiloni

Update app.py

40ad620 over 1 year ago

raw

history blame

2.76 kB

	import gradio as gr
	import numpy as np
	from bark.generation import load_codec_model, generate_text_semantic
	from encodec.utils import convert_audio
	import torchaudio
	import torch
	#from pydub import AudioSegment

	# Load the codec model once, at import time, so every clone_voice() call
	# reuses the same instance. clone_voice reads its `sample_rate` and
	# `channels` attributes and calls its `encode()` method.
	# use_gpu=True asks the loader for a GPU-backed model.
	# NOTE(review): whether the loader falls back to CPU when no GPU is present
	# is not visible from this file -- confirm against bark.generation.
	model = load_codec_model(use_gpu=True)

	def clone_voice(audio_in, name, transcript_text):
	    """Create a Bark voice-prompt ``.npz`` file from a short recording.

	    The recording is loaded, converted to the codec model's sample rate and
	    channel count, and encoded into discrete EnCodec codes. Semantic tokens
	    are generated from the transcript, with generation capped at the
	    recording's duration. The result is written to ``<name>.npz`` with the
	    arrays ``fine_prompt`` (all codebooks), ``coarse_prompt`` (first two
	    codebooks) and ``semantic_prompt``.

	    Args:
	        audio_in: Path of the audio file holding the voice to clone.
	        name: Desired voice name, used as the output file stem. Only the
	            final path component is kept; a blank name falls back to
	            ``"voice"``.
	        transcript_text: Transcription of what is said in the recording.

	    Returns:
	        Path of the written ``.npz`` file, relative to the working directory.

	    Raises:
	        ValueError: If no audio path or no (non-blank) transcript is given.
	    """
	    import os  # local import: only needed here, keeps the block self-contained

	    # Fail fast with a clear message instead of an obscure error from deep
	    # inside the audio loader / token generator.
	    if not audio_in:
	        raise ValueError("No audio provided: record the voice you want to clone first.")
	    if not transcript_text or not transcript_text.strip():
	        raise ValueError("No transcript provided: please transcribe your audio.")

	    # `name` comes from a free-form textbox: keep only the last path
	    # component so the file cannot be written outside the working directory,
	    # and avoid producing a bare ".npz" when the field is left empty.
	    voice_name = os.path.basename((name or "").strip().replace("\\", "/")) or "voice"
	    output_path = voice_name + '.npz'

	    # Load and pre-process the audio waveform.
	    wav, sr = torchaudio.load(audio_in)
	    wav = convert_audio(wav, sr, model.sample_rate, model.channels)

	    # Follow the model's own device rather than assuming CUDA, so the input
	    # always lives where the weights do (assumes the codec model is a
	    # torch.nn.Module, as EnCodec's model is).
	    device = next(model.parameters()).device
	    wav = wav.unsqueeze(0).to(device)  # add batch axis

	    # Extract discrete codes from EnCodec.
	    with torch.no_grad():
	        encoded_frames = model.encode(wav)
	    # Concatenate frames along time and drop only the batch axis -> [n_q, T].
	    # (A bare .squeeze() would also drop any other axis of length 1.)
	    codes = torch.cat([frame[0] for frame in encoded_frames], dim=-1).squeeze(0)

	    # Duration of the (converted) recording, used to cap semantic generation.
	    seconds = wav.shape[-1] / model.sample_rate

	    # Generate semantic tokens from the transcript of the recording.
	    semantic_tokens = generate_text_semantic(transcript_text, max_gen_duration_s=seconds)

	    # NumPy needs the codes on the CPU.
	    codes = codes.cpu().numpy()

	    np.savez(
	        output_path,
	        fine_prompt=codes,
	        coarse_prompt=codes[:2, :],
	        semantic_prompt=semantic_tokens,
	    )

	    return output_path

	# Custom stylesheet handed to gr.Blocks: limits the element with id
	# "col-container" to 700px wide and centers it horizontally.
	css="""
	#col-container {max-width: 700px; margin-left: auto; margin-right: auto;}
	"""

	# HTML header rendered through gr.HTML at the top of the page: the demo
	# title, a credit link to the Serp-AI bark-with-voice-clone repository, and
	# a pointer to its generate.ipynb notebook for using the produced .npz file.
	title="""
	<div style="text-align: center;">
	<h1>Voice Cloning for Bark Text-to-Audio</h1>
	<p>This demo is an adaptation of the <a href="https://github.com/serp-ai/bark-with-voice-clone" target="_blank">Serp-AI</a> attempts to enable voice cloning using Bark</p>
	<p>If you want to generate audio from text with this npz file,<br />follow the generate.ipynb notebook you will find at the Serp-AI Bark clone repo.</p>
	</div>
	"""

	# Gradio interface: one centered column with the inputs, a trigger button and
	# the downloadable result.
	# NOTE(review): the original indentation was lost in extraction; the nesting
	# below is reconstructed -- confirm against the deployed app.
	with gr.Blocks(css=css) as demo:
	    with gr.Column(elem_id="col-container"):
	        gr.HTML(title)

	        # Inputs: the recording, its transcription and the voice name.
	        audio_in = gr.Audio(
	            label="Voice in to clone",
	            source="microphone",
	            type="filepath",
	        )
	        transcript = gr.Textbox(
	            label="Manual transcription of your audio",
	            placeholder="Please transcribe audio here",
	            info="The audio you want to clone will get truncated so 5-10 seconds is probably fine, existing samples that I checked are around 7 seconds, then you'll need to manually transcribe your audio below:",
	        )
	        name = gr.Textbox(label="Name your voice")

	        generate_btn = gr.Button("Get NPZ file: Clone voice !")

	        # Output: the generated voice-prompt archive.
	        npz_file = gr.File(label=".npz file")

	    # Wire the button to clone_voice; the input order matches its
	    # (audio_in, name, transcript_text) parameters.
	    generate_btn.click(
	        fn=clone_voice,
	        inputs=[audio_in, name, transcript],
	        outputs=[npz_file],
	    )

	demo.launch()