Create app.py
Browse files
app.py
ADDED
@@ -0,0 +1,63 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import gradio as gr
import numpy as np
import torch
import torchaudio

from bark.generation import codec_encode, load_codec_model, generate_text_semantic
from encodec.utils import convert_audio
|
6 |
+
|
7 |
+
# Load the EnCodec codec model once at startup; use_gpu=True requires CUDA
# (the cloning function below also moves tensors to 'cuda').
model = load_codec_model(use_gpu=True)
|
8 |
+
|
9 |
+
def clone_voice(audio_in, name, transcript_text):
    """Build a Bark voice prompt (.npz) from a short reference recording.

    Args:
        audio_in: Filepath of the WAV to clone. It gets truncated, so 5-10
            seconds is enough (existing Bark samples are ~7 seconds).
        name: Name for the voice; used as the output filename stem.
        transcript_text: Manual transcription of the reference audio.

    Returns:
        Path of the saved ``<name>.npz`` voice prompt file.
    """
    # Load the waveform and resample/remix it to the codec's expected format.
    wav, sr = torchaudio.load(audio_in)
    wav = convert_audio(wav, sr, model.sample_rate, model.channels)
    wav = wav.unsqueeze(0).to('cuda')

    # Extract discrete codes from EnCodec (no gradients needed for inference).
    with torch.no_grad():
        encoded_frames = model.encode(wav)
    codes = torch.cat([encoded[0] for encoded in encoded_frames], dim=-1).squeeze()  # [n_q, T]

    # Duration of the reference audio in seconds — caps semantic generation
    # so the semantic prompt matches the acoustic prompt's length.
    seconds = wav.shape[-1] / model.sample_rate

    # Generate semantic tokens for the transcription of the cloned audio.
    semantic_tokens = generate_text_semantic(transcript_text, max_gen_duration_s=seconds)

    # Move codes to CPU before handing them to numpy for serialization.
    codes = codes.cpu().numpy()

    # Bark expects fine (all codebooks), coarse (first 2 codebooks) and
    # semantic prompts in a single .npz archive.
    output_path = name + '.npz'
    np.savez(output_path, fine_prompt=codes, coarse_prompt=codes[:2, :], semantic_prompt=semantic_tokens)

    return output_path
|
38 |
+
|
39 |
+
css="""
|
40 |
+
#col-container {max-width: 700px; margin-left: auto; margin-right: auto;}
|
41 |
+
"""
|
42 |
+
|
43 |
+
title="""
|
44 |
+
<div style="text-align: center;">
|
45 |
+
<h1>Voice Cloning for Bark Text-to-Audio</h1>
|
46 |
+
<p>This demo is an adaptation of the Serp-AI attempts to enable voice cloning using Bark</p>
|
47 |
+
</div>
|
48 |
+
"""
|
49 |
+
|
50 |
+
# Gradio UI: upload a reference recording plus its transcription, name the
# voice, and download the generated .npz voice prompt.
with gr.Blocks(css=css) as demo:
    with gr.Column(elem_id="col-container"):
        gr.HTML(title)
        audio_in = gr.Audio(label="Voice in to clone", source="upload", type="filepath")
        transcript = gr.Textbox(label="Manual transcription of your audio")
        name = gr.Textbox(label="Name your voice")

        generate_btn = gr.Button("Clone voice !")

        npz_file = gr.File()

        # Bug fixes vs. the original wiring:
        # 1) inputs must follow clone_voice's signature
        #    (audio_in, name, transcript_text) — 'transcript' and 'name'
        #    were swapped, so the voice name became the transcript text.
        # 2) outputs referenced the undefined 'npe_file' (typo for npz_file),
        #    which raised NameError at import time.
        generate_btn.click(clone_voice, inputs=[audio_in, name, transcript], outputs=[npz_file])

demo.launch()
|