File size: 2,642 Bytes
34bc786 f46dfa8 1e949ec 34bc786 0503c8e 34bc786 c27a8b1 34bc786 fc238b7 34bc786 dc7cb61 34bc786 81bac9d 34bc786 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 |
import gradio as gr
import numpy as np
from bark.generation import load_codec_model, generate_text_semantic
from encodec.utils import convert_audio
import torchaudio
import torch
#from pydub import AudioSegment
# Load the codec model once at import time; clone_voice() below reuses this
# module-level instance instead of reloading it on every call.
model = load_codec_model(use_gpu=True)
def clone_voice(audio_in, name, transcript_text):
    """Build a Bark voice-prompt file (``.npz``) from a short reference recording.

    Args:
        audio_in: Path of the audio file to clone. It will get truncated, so
            5-10 seconds is probably fine.
        name: Name of the voice; used as the stem of the output file.
        transcript_text: Transcription of the audio being cloned.

    Returns:
        Path of the written ``<name>.npz`` file, which holds the arrays
        ``fine_prompt``, ``coarse_prompt`` and ``semantic_prompt``.

    Raises:
        gr.Error: If the recording, the transcript, or a usable voice name
            is missing.
    """
    import os  # local import: only needed to sanitise the user-supplied name

    # Validate the form fields up front so the user gets a readable message
    # instead of a stack trace from deep inside torchaudio / bark.
    if not audio_in:
        raise gr.Error("Please record the audio you want to clone first.")
    if not (transcript_text or "").strip():
        raise gr.Error("Please provide a transcription of your audio.")

    # The name comes straight from a web form: keep only the last path
    # component so the file cannot be written outside the working directory.
    voice_name = os.path.basename((name or "").strip())
    if voice_name in ("", ".", ".."):
        raise gr.Error("Please give your voice a valid name.")

    # Load and pre-process the audio waveform.
    wav, sr = torchaudio.load(audio_in)
    wav = convert_audio(wav, sr, model.sample_rate, model.channels)
    # Follow the device the codec model actually lives on rather than
    # assuming CUDA, so the app also works when no GPU is available.
    device = next(model.parameters()).device
    wav = wav.unsqueeze(0).to(device)

    # Extract discrete codes from EnCodec.
    with torch.no_grad():
        encoded_frames = model.encode(wav)
    # Drop only the batch axis added above -> [n_q, T]; a bare squeeze()
    # would also remove any other axis that happens to have length 1.
    codes = torch.cat([frame[0] for frame in encoded_frames], dim=-1).squeeze(0)

    # Cap semantic-token generation at the duration of the reference audio.
    seconds = wav.shape[-1] / model.sample_rate
    semantic_tokens = generate_text_semantic(
        transcript_text, max_gen_duration_s=seconds
    )

    # Move codes to CPU so they can be stored as NumPy arrays.
    codes = codes.cpu().numpy()

    output_path = voice_name + '.npz'
    np.savez(
        output_path,
        fine_prompt=codes,
        coarse_prompt=codes[:2, :],  # first two codebooks only
        semantic_prompt=semantic_tokens,
    )
    return output_path
# Page-level stylesheet: caps the width of the element with id
# "col-container" and centres it horizontally.
css = """
#col-container {max-width: 700px; margin-left: auto; margin-right: auto;}
"""

# HTML banner rendered at the top of the page. The paragraph is closed exactly
# once, after the full sentence (a stray </p> used to follow the link).
title = """
<div style="text-align: center;">
<h1>Voice Cloning for Bark Text-to-Audio</h1>
<p>This demo is an adaptation of the <a href="https://github.com/serp-ai/bark-with-voice-clone" target="_blank">Serp-AI</a> attempts to enable voice cloning using Bark</p>
</div>
"""
# Assemble the single-column page and hook the button up to clone_voice.
with gr.Blocks(css=css) as demo:
    with gr.Column(elem_id="col-container"):
        # Banner at the top of the column.
        gr.HTML(title)

        # Input widgets, in the order they appear on the page.
        audio_in = gr.Audio(
            source="microphone",
            type="filepath",
            label="Voice in to clone",
            info="the audio you want to clone (will get truncated so 5-10 seconds is probably fine, existing samples that I checked are around 7 seconds)",
        )
        transcript = gr.Textbox(label="Manual transcription of your audio")
        name = gr.Textbox(label="Name your voice")

        generate_btn = gr.Button("Clone voice !")

        # Output widget that offers the generated voice file for download.
        npz_file = gr.File(
            label=".npz file",
            info="If you want to generate audio from text with this npz file, follow the generate.ipynb notebook you will find at the Serp-AI Bark clone repo.",
        )

        # The inputs follow clone_voice's parameter order (audio, name,
        # transcript), which differs from the on-screen order of the boxes.
        generate_btn.click(
            fn=clone_voice,
            inputs=[audio_in, name, transcript],
            outputs=[npz_file],
        )

demo.launch()