In [1]:
%%bash
# install the vall-e and required libraries
# PyTorch
pip install torch==1.13.1 torchaudio==0.13.1 --extra-index-url https://download.pytorch.org/whl/cu116
pip install torchmetrics==0.11.1
# fbank
pip install librosa matplotlib

# phonemizer pypinyin
apt-get install espeak-ng git-lfs -y
## OSX: brew install espeak
pip install phonemizer==3.2.1 pypinyin==0.48.0

pip install lhotse

# k2
# find the right version in https://huggingface.co/csukuangfj/k2
pip install https://huggingface.co/csukuangfj/k2/resolve/main/cuda/k2-1.23.4.dev20230224+cuda11.6.torch1.13.1-cp310-cp310-linux_x86_64.whl
mkdir tmp_lib

git clone https://github.com/k2-fsa/icefall
mv icefall tmp_lib
cd tmp_lib/icefall
pip install -r requirements.txt
pip install -e .
export PYTHONPATH=`pwd`/../icefall:$PYTHONPATH
echo "export PYTHONPATH=`pwd`/../icefall:\$PYTHONPATH" >> ~/.zshrc
echo "export PYTHONPATH=`pwd`/../icefall:\$PYTHONPATH" >> ~/.bashrc
cd ../..
source ~/.zshrc
source ~/.bashrc


git clone https://github.com/lifeiteng/vall-e
mv vall-e tmp_lib
cd tmp_lib/vall-e
pip install -e .

pip install vocos
pip install gradio

Looking in indexes: https://pypi.org/simple, https://download.pytorch.org/whl/cu116
Reading package lists...
Building dependency tree...
Reading state information...
espeak-ng is already the newest version (1.50+dfsg-10).
git-lfs is already the newest version (3.0.2-1ubuntu0.2).
0 upgraded, 0 newly installed, 0 to remove and 18 not upgraded.
Collecting k2==1.23.4.dev20230224+cuda11.6.torch1.13.1
  Downloading https://huggingface.co/csukuangfj/k2/resolve/main/cuda/k2-1.23.4.dev20230224+cuda11.6.torch1.13.1-cp310-cp310-linux_x86_64.whl (102.9 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 102.9/102.9 MB 8.9 MB/s eta 0:00:00
Obtaining file:///content/tmp_lib/icefall
  Installing build dependencies: started
  Installing build dependencies: finished with status 'done'
  Checking if build backend supports build_editable: started
  Checking if build backend supports build_editable: finished with status 'done'
  Getting requirements to build editable: started
  Getting requirements to build 

mkdir: cannot create directory ‘tmp_lib’: File exists
Cloning into 'icefall'...
mv: cannot move 'icefall' to 'tmp_lib/icefall': Directory not empty
Cloning into 'vall-e'...


In [2]:
!wget https://huggingface.co/LearnItAnyway/vall-e_korean/resolve/main/vall-e_ko_v0.pt
!wget https://huggingface.co/LearnItAnyway/vall-e_korean/resolve/main/unique_text_tokens.k2symbols

--2023-09-21 06:51:42--  https://huggingface.co/LearnItAnyway/vall-e_korean/resolve/main/vall-e_ko_v0.pt
Resolving huggingface.co (huggingface.co)... 65.8.178.93, 65.8.178.27, 65.8.178.118, ...
Connecting to huggingface.co (huggingface.co)|65.8.178.93|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://cdn-lfs.huggingface.co/repos/f8/b9/f8b90061950052a23944dd83f71a6f031a5d39c73dbb799693620ca8a4186a97/e2894775b160c24132f8f6d6d7df6cc8bf59a9b465778d08989e200859610560?response-content-disposition=attachment%3B+filename*%3DUTF-8%27%27vall-e_ko_v0.pt%3B+filename%3D%22vall-e_ko_v0.pt%22%3B&Expires=1695538302&Policy=eyJTdGF0ZW1lbnQiOlt7IkNvbmRpdGlvbiI6eyJEYXRlTGVzc1RoYW4iOnsiQVdTOkVwb2NoVGltZSI6MTY5NTUzODMwMn19LCJSZXNvdXJjZSI6Imh0dHBzOi8vY2RuLWxmcy5odWdnaW5nZmFjZS5jby9yZXBvcy9mOC9iOS9mOGI5MDA2MTk1MDA1MmEyMzk0NGRkODNmNzFhNmYwMzFhNWQzOWM3M2RiYjc5OTY5MzYyMGNhOGE0MTg2YTk3L2UyODk0Nzc1YjE2MGMyNDEzMmY4ZjZkNmQ3ZGY2Y2M4YmY1OWE5YjQ2NTc3OGQwODk4OWUyMDA4NTk2MTA1NjA%7EcmV

In [None]:
from IPython.core.display import HTML
HTML("<script>Jupyter.notebook.kernel.restart()</script>")

In [4]:
import argparse
import logging
import os
import pathlib
import time
import tempfile
import platform
import webbrowser
import sys
import torch, torchaudio
import random

import numpy as np

from valle.data import (
    AudioTokenizer,
    TextTokenizer,
    tokenize_audio,
    tokenize_text,
)
from icefall.utils import AttributeDict
from valle.data.collation import get_text_token_collater
from valle.models import get_model

from vocos import Vocos
from encodec.utils import convert_audio
import multiprocessing

thread_count = multiprocessing.cpu_count()

print("Use",thread_count,"cpu cores for computing")

torch.set_num_threads(thread_count)
torch.set_num_interop_threads(thread_count)
torch._C._jit_set_profiling_executor(False)
torch._C._jit_set_profiling_mode(False)
torch._C._set_graph_executor_optimize(False)

text_tokenizer = TextTokenizer(language='ko')

device = torch.device("cpu")
if torch.cuda.is_available():
    device = torch.device("cuda", 0)

checkpoint = torch.load("./vall-e_ko_v0.pt", map_location='cpu')
model = get_model(AttributeDict(checkpoint))
missing_keys, unexpected_keys = model.load_state_dict(
    checkpoint["model"], strict=True
)
assert not missing_keys
model.eval()
model.to(device)
text_collater = get_text_token_collater('./unique_text_tokens.k2symbols')

# Encodec model
audio_tokenizer = AudioTokenizer(device)

# Vocos decoder
vocos = Vocos.from_pretrained('charactr/vocos-encodec-24khz').to(device)

model.to(device)
@torch.no_grad()
def infer_from_prompt(text_prompt, audio_prompt, text):
    ## text to token
    text_tokens, text_tokens_lens = text_collater(
        [
            tokenize_text(
                text_tokenizer, text=f"{text_prompt} {text}".strip()
            )
        ]
    )
    _, enroll_x_lens = text_collater(
        [
            tokenize_text(
                text_tokenizer, text=f"{text_prompt}".strip()
            )
        ]
    )
    print('text_loaded')

    # text to synthesize
    wav_pr, sr = torchaudio.load(audio_prompt)
    wav_pr = convert_audio(wav_pr, sr, audio_tokenizer.sample_rate, audio_tokenizer.channels)
    audio_prompts = audio_tokenizer.encode(wav_pr.unsqueeze(0))[0][0].transpose(2, 1).to(device)
    print('Audio encoded')

    encoded_frames = model.inference(
        text_tokens.to(device), text_tokens_lens.to(device),
        audio_prompts, enroll_x_lens=enroll_x_lens,
        top_k=-100, temperature=1)
    vocos_features = vocos.codes_to_features(encoded_frames.permute(2, 0, 1))
    samples = vocos.decode(vocos_features, bandwidth_id=torch.tensor([2], device=device))
    message = f"sythesized text: {text}"
    return message, (24000, samples.squeeze(0).cpu().numpy())

import gradio as gr
app = gr.Blocks(title="VALL-E Korean")
with app:
    #gr.Markdown(top_md)
    with gr.Tab("VALL-E Korean Demo"):
        #gr.Markdown(infer_from_prompt_md)
        with gr.Row():
            with gr.Column():
                text_prompt = gr.TextArea(label="Input Text",
                                      placeholder="Type text in the audio file (Korean)",)
                audio_prompt= gr.Audio(label="Input Audio", source='upload', interactive=True, type="filepath")
                text_input = gr.TextArea(label="Output Text",
                                      placeholder="Type text you want to generate (Korean)",)
            with gr.Column():
                text_output = gr.Textbox(label="Message")
                audio_output= gr.Audio(label="Output Audio")
                btn = gr.Button("Generate!")
                btn.click(infer_from_prompt,
                          inputs=[text_prompt, audio_prompt, text_input],
                          outputs=[text_output, audio_output])
webbrowser.open("http://127.0.0.1:7860")
app.launch(share=True)

  warn(


Use 2 cpu cores for computing


Downloading: "https://dl.fbaipublicfiles.com/encodec/v0/encodec_24khz-d7cc33bc.th" to /root/.cache/torch/hub/checkpoints/encodec_24khz-d7cc33bc.th


  0%|          | 0.00/88.9M [00:00<?, ?B/s]

Downloading (…)lve/main/config.yaml:   0%|          | 0.00/503 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/40.4M [00:00<?, ?B/s]

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
Running on public URL: https://ed9139a0e413300fd1.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)


