Can you please open-source the correct weights?
The weights in your repo cannot infer correctly.
Hi, the weights are exactly the ones I used and trained. May I ask what issue you're facing?
You're finally online, please help with loading these models:
KotoDama_Prompter = load_KotoDama_Prompter(
    path="Utils/Kotodama/prompt_enc/checkpoint-73285"
)
KotoDama_TextSampler = load_KotoDama_TextSampler(
    path="Utils/Kotodama/text_enc/checkpoint-22680"
)
Without these, the model cannot infer correctly.
Also, what do these modules contribute during generation? How were they trained?
Oh, sorry, I forgot to rename the path.
Change "Kotodama" to "KTD" in the path and it should work.
Kotodama is a new way of generating the style vectors without audio files or the diffusion sampler.
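Roughly, the text-sampler path boils down to something like this minimal sketch (names taken from the inference script further down in this thread; treat it as illustrative, not the exact training/inference code):

import torch
from Modules.KotoDama_sampler import tokenizer_koto_text

def kotodama_style_from_text(model, text, device):
    # The Kotodama text encoder predicts a style vector directly from the raw
    # (unphonemized) text, so no reference audio and no diffusion sampling is needed.
    with torch.no_grad():
        inputs = tokenizer_koto_text(text, return_tensors="pt").to(device)
        style = model.KotoDama_Text(**inputs)["logits"]
    return style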
Hi, the result is still not right.
config = yaml.safe_load(open("Configs/config_kanade.yml"))
params_whole = torch.load(
    # "Models/Style_Kanade_v02/Top_ckpt_24khz.pth", map_location="cpu"
    "Models/Style_Tsukasa_v02/Top_ckpt_24khz.pth",
    map_location="cpu",
)
I can only see this one .pth, Style_Tsukasa_v02. Can you point me to where it could be wrong?
import IPython.display as ipd
import os
import torch
from utils import recursive_munch
torch.manual_seed(0)
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True
from Utils.phonemize.cotlet_phon import phonemize
import time
import random
import yaml
from munch import Munch
import numpy as np
import torch
from torch import nn
import torch.nn.functional as F
import torchaudio
import librosa
from nltk.tokenize import word_tokenize
from models import *
from Modules.KotoDama_sampler import (
    tokenizer_koto_prompt,
    tokenizer_koto_text,
    inference,
    Longform,
    merging_sentences,
    # trim_long_silences,
)
from utils import *
import nltk
nltk.download("punkt_tab")
from nltk.tokenize import sent_tokenize
from konoha import SentenceTokenizer
sent_tokenizer = SentenceTokenizer()
to_mel = torchaudio.transforms.MelSpectrogram(
    n_mels=80, n_fft=2048, win_length=1200, hop_length=300
)
mean, std = -4, 4


def preprocess(wave):
    wave_tensor = torch.from_numpy(wave).float()
    mel_tensor = to_mel(wave_tensor)
    mel_tensor = (torch.log(1e-5 + mel_tensor.unsqueeze(0)) - mean) / std
    return mel_tensor
def compute_style_through_clip(path):
    # Build a combined style vector (style encoder + predictor encoder) from a reference audio file.
    wave, sr = librosa.load(path, sr=24000)
    audio, index = librosa.effects.trim(wave, top_db=30)
    if sr != 24000:
        audio = librosa.resample(audio, orig_sr=sr, target_sr=24000)
    mel_tensor = preprocess(audio).to(device)
    with torch.no_grad():
        ref_s = model.style_encoder(mel_tensor.unsqueeze(1))
        ref_p = model.predictor_encoder(mel_tensor.unsqueeze(1))
    return torch.cat([ref_s, ref_p], dim=1)
def Kotodama_Prompter(model, text, device):
    # Style vector predicted from a text prompt.
    with torch.no_grad():
        style = model.KotoDama_Prompt(
            **tokenizer_koto_prompt(text, return_tensors="pt").to(device)
        )["logits"]
    return style


def Kotodama_Sampler(model, text, device):
    # Style vector predicted from the input text itself.
    with torch.no_grad():
        style = model.KotoDama_Text(
            **tokenizer_koto_text(text, return_tensors="pt").to(device)
        )["logits"]
    return style
import soundfile as sf
def save_wav(wav, rate=24000, filename="output.wav"):
    if not isinstance(wav, np.ndarray):
        wav = np.array(wav, dtype=np.float32)
    # Clip the audio data to the valid range [-1, 1] to avoid issues
    wav = np.clip(wav, -1.0, 1.0)
    # Save the WAV file using soundfile
    sf.write(filename, wav, rate)
device = "cuda" if torch.cuda.is_available() else "cpu"
config = yaml.safe_load(open("Configs/config_kanade.yml"))
# load pretrained ASR model
ASR_config = config.get("ASR_config", False)
ASR_path = config.get("ASR_path", False)
text_aligner = load_ASR_models(ASR_path, ASR_config)
KotoDama_Prompter = load_KotoDama_Prompter(
    # path="Utils/Kotodama/prompt_enc/checkpoint-73285"
    path="Utils/KTD/prompt_enc/checkpoint-73285"
)
KotoDama_TextSampler = load_KotoDama_TextSampler(
    # path="Utils/Kotodama/text_enc/checkpoint-22680"
    path="Utils/KTD/text_enc/checkpoint-22680"
)
# load pretrained F0 model
F0_path = config.get("F0_path", False)
pitch_extractor = load_F0_models(F0_path)
# load BERT model
from Utils.PLBERT.util import load_plbert
BERT_path = config.get("PLBERT_dir", False)
plbert = load_plbert(BERT_path)
model_params = recursive_munch(config["model_params"])
model = build_model(
    model_params,
    text_aligner,
    pitch_extractor,
    plbert,
    KotoDama_Prompter,
    KotoDama_TextSampler,
    # None, None
)
_ = [model[key].eval() for key in model]
_ = [model[key].to(device) for key in model]
params_whole = torch.load(
    # "Models/Style_Kanade_v02/Top_ckpt_24khz.pth", map_location="cpu"
    "Models/Style_Tsukasa_v02/Top_ckpt_24khz.pth",
    map_location="cpu",
)
params = params_whole["net"]
for key in model:
    if key in params:
        print("%s loaded" % key)
        try:
            model[key].load_state_dict(params[key])
        except Exception:
            from collections import OrderedDict

            # The checkpoint was saved from a DataParallel model, so strip
            # the "module." prefix from every key before loading.
            state_dict = params[key]
            new_state_dict = OrderedDict()
            for k, v in state_dict.items():
                name = k[7:]  # remove `module.`
                new_state_dict[name] = v
            # load params
            model[key].load_state_dict(new_state_dict, strict=False)
            # except:
            #     _load(params[key], model[key])
_ = [model[key].eval() for key in model]
from Modules.diffusion.sampler import DiffusionSampler, ADPM2Sampler, KarrasSchedule
diffusion_sampler = DiffusionSampler(
    model.diffusion.diffusion,
    sampler=ADPM2Sampler(),
    sigma_schedule=KarrasSchedule(
        sigma_min=0.0001, sigma_max=3.0, rho=9.0
    ),  # empirical parameters
    clamp=False,
)
"""different speaker by switching the name. we then define how much should be the impact of the diffusion sampler.
if the diffusion sampler works for you, it's recommended to use both Kotodama. otherwise, set alpha and beta to 0."""
japanese = "Kimiji: ไบบ็ใฏใๆใฆใใชใๆขๆฑใฎๆ
ใฎใใใชใใฎใ็งใใกใฏใ่ชๅ่ช่บซใๅจๅฒใฎไธ็ใซใคใใฆใๅธธใซๆฐใใ็บ่ฆใใใฆใใใใใใฏใๆใจใใฆๅใณใใใใใใใจใใใใฐใๅฐ้ฃใซ็ด้ขใใใใจใใใใใใใใใใๅ
จใฆใใ่ชๅใๅฝขไฝใ่ฒด้ใช็ต้จใงใใใ"
raw_jpn = japanese[
japanese.find(":") + 2 :
] # factoring out the name of the speaker, since we don't need that for phonemization.
text = phonemize(raw_jpn)
Kotodama = Kotodama_Sampler(
model, text=japanese, device=device
) # provide the Japanese text, not the Phonemized version.
reference_dicts = {}
reference_dicts["1789_14289w"] = japanese
start = time.time()
noise = torch.randn(1, 1, 256).to(device)
for k, path in reference_dicts.items():
    wav = inference(
        model,
        diffusion_sampler,
        text,
        Kotodama,
        # None,
        alpha=0.1,
        beta=0.5,
        diffusion_steps=10,
        embedding_scale=1.5,
        rate_of_speech=1.0,
    )
    rtf = (time.time() - start) / (len(wav) / 24000)
    print(f"RTF = {rtf:5f}")
    print(wav)
    save_wav(wav)
My guy, what am I supposed to infer from "the result is not right"?
If it's a path problem, you should just copy the full path of each checkpoint on your device.
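For example (these absolute paths are made up, adjust them to wherever the repo actually lives on your machine):

KotoDama_Prompter = load_KotoDama_Prompter(
    path="/home/you/Tsukasa_Speech/Utils/KTD/prompt_enc/checkpoint-73285"
)
KotoDama_TextSampler = load_KotoDama_TextSampler(
    path="/home/you/Tsukasa_Speech/Utils/KTD/text_enc/checkpoint-22680"
)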
@Respair Hi, in that case, what if users want the style to come exactly from an audio file?
The provided notebooks show everything you can do. In case it's not clear: use the compute style function and then feed its result instead of the Kotodama vector.
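Roughly like this, reusing the helpers already in your script (untested sketch; "reference_audio.wav" is just a placeholder for your own recording):

# Build the style vector from a reference recording instead of from text.
ref_style = compute_style_through_clip("reference_audio.wav")

# Pass it to inference() in place of the Kotodama vector.
wav = inference(
    model,
    diffusion_sampler,
    text,
    ref_style,
    alpha=0.1,
    beta=0.5,
    diffusion_steps=10,
    embedding_scale=1.5,
    rate_of_speech=1.0,
)
save_wav(wav)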
Hi, please have a look at my script. I changed the path, but the result is still not right.
Just put the .pth file from here: https://huggingface.co/Respair/Tsukasa_Speech/tree/main/Models/Style_Tsukasa_v02 in the model's ckpt path.
You should be good to go. If you're still facing an error, please write the exact error you're getting here.
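If you'd rather script the download, here's a sketch with huggingface_hub (assuming the checkpoint keeps the same filename your script expects, and that you run this from the repo root):

from huggingface_hub import hf_hub_download

# Fetches Models/Style_Tsukasa_v02/Top_ckpt_24khz.pth from the HF repo and
# mirrors the same folder layout under the current directory.
hf_hub_download(
    repo_id="Respair/Tsukasa_Speech",
    filename="Models/Style_Tsukasa_v02/Top_ckpt_24khz.pth",
    local_dir=".",
)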
First of all, the IPython notebook points to the Kanade model, rather than Tsukasa.
The Models/ folder contains only the Tsukasa checkpoint.
I changed the config to Tsukasa as well.
The output is still not right.
...
The broken output suggests the diffusion sampler doesn't work on your system (I mentioned this in the README). Try setting alpha and beta to zero; if it works, then everything is set up correctly.
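Concretely, in the inference() call from your script, that means something like:

wav = inference(
    model,
    diffusion_sampler,
    text,
    Kotodama,
    alpha=0.0,  # disable the diffusion sampler's contribution
    beta=0.0,
    diffusion_steps=10,
    embedding_scale=1.5,
    rate_of_speech=1.0,
)
save_wav(wav)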