|
import gradio as gr |
|
import numpy as np |
|
from bark.generation import load_codec_model, generate_text_semantic |
|
from encodec.utils import convert_audio |
|
import torchaudio |
|
import torch |
|
|
|
|
|
# Codec model shared by every clone_voice() call. It is loaded once at import
# time (requesting GPU use) so individual requests do not pay the loading cost.
model = load_codec_model(
    use_gpu=True,
)
|
|
|
def clone_voice(audio_in, name, transcript_text):
    """Build a Bark voice-prompt ``.npz`` archive from a recorded sample.

    The recording is resampled to the codec model's sample rate / channel
    count, encoded into codec codes, and stored together with semantic tokens
    generated from the transcript.

    Args:
        audio_in: Path of the recorded audio file (the UI passes a filepath).
        name: User-chosen voice name; it becomes the stem of the output file
            after being reduced to filesystem-safe characters.
        transcript_text: Manual transcription of the recording.

    Returns:
        Path of the written ``.npz`` file, which holds the arrays
        ``fine_prompt``, ``coarse_prompt`` and ``semantic_prompt``.

    Raises:
        gr.Error: If no recording or no transcript was supplied.
    """
    # Fail early with a message the UI can display instead of an opaque
    # traceback from the audio loader or the text model.
    if not audio_in:
        raise gr.Error("Please record an audio sample before cloning a voice.")
    if not transcript_text or not transcript_text.strip():
        raise gr.Error("Please provide a transcription of your audio.")

    # The name comes straight from a textbox: keep only characters that cannot
    # escape the working directory (no path separators, no dots).
    voice_name = "".join(
        ch if ch.isalnum() or ch in "-_ " else "_"
        for ch in (name or "").strip()
    ).strip() or "voice"

    wav, sr = torchaudio.load(audio_in)
    wav = convert_audio(wav, sr, model.sample_rate, model.channels)

    # Follow the device the codec model actually lives on rather than
    # assuming CUDA, so the app also works when the model is on CPU.
    device = next(model.parameters()).device
    wav = wav.unsqueeze(0).to(device)

    with torch.no_grad():
        encoded_frames = model.encode(wav)

    # Drop only the leading batch axis. A bare squeeze() would also remove any
    # other length-1 axis and break the 2-D slicing below.
    # NOTE(review): assumes each frame's codes are (1, n_codebooks, T) - confirm.
    codes = torch.cat([frame[0] for frame in encoded_frames], dim=-1).squeeze(0)

    # Limit semantic generation to the duration of the recorded clip.
    seconds = wav.shape[-1] / model.sample_rate
    semantic_tokens = generate_text_semantic(
        transcript_text, max_gen_duration_s=seconds
    )

    codes = codes.cpu().numpy()

    output_path = f"{voice_name}.npz"
    np.savez(
        output_path,
        fine_prompt=codes,
        coarse_prompt=codes[:2, :],  # first two codebook rows only
        semantic_prompt=semantic_tokens,
    )
    return output_path
|
|
|
# Stylesheet handed to gr.Blocks: centres the main column and caps its width.
css = """
#col-container {max-width: 700px; margin-left: auto; margin-right: auto;}
"""
|
|
|
# HTML header rendered at the top of the page via gr.HTML.
title = """
<div style="text-align: center;">
<h1>Voice Cloning for Bark Text-to-Audio</h1>
<p>This demo is an adaptation of the <a href="https://github.com/serp-ai/bark-with-voice-clone" target="_blank">Serp-AI</a> attempts to enable voice cloning using Bark</p>
<p>If you want to generate audio from text with this npz file,<br />follow the generate.ipynb notebook you will find at the Serp-AI Bark clone repo.</p>
</div>
"""
|
|
|
# Page layout: one centred column holding the header, the three inputs, the
# trigger button and the downloadable result.
with gr.Blocks(css=css) as demo:
    with gr.Column(elem_id="col-container"):
        gr.HTML(title)

        # Inputs, in the order they appear on the page.
        audio_in = gr.Audio(
            label="Voice in to clone",
            source="microphone",
            type="filepath",
        )
        transcript = gr.Textbox(
            label="Manual transcription of your audio",
            placeholder="Please transcribe audio here",
            info=(
                "The audio you want to clone will get truncated so 5-10 seconds is "
                "probably fine, existing samples that I checked are around 7 seconds, "
                "then you'll need to manually transcribe your audio below:"
            ),
        )
        name = gr.Textbox(label="Name your voice")

        generate_btn = gr.Button("Get NPZ file: Clone voice !")

        # Output: the generated voice prompt offered as a file download.
        npz_file = gr.File(label=".npz file")

    # Clicking the button runs clone_voice; note the argument order
    # (audio, name, transcript) matches the function signature.
    generate_btn.click(
        clone_voice,
        inputs=[audio_in, name, transcript],
        outputs=[npz_file],
    )

demo.launch()