Can you please open-source the right weights?

#1
by lucasjin - opened

The weights in your repo cannot run inference correctly.

Hi, the weights are exactly the ones I trained and used. May I ask what issue you're facing?

You are finally online. Please help with these models:

KotoDama_Prompter = load_KotoDama_Prompter(
    path="Utils/Kotodama/prompt_enc/checkpoint-73285"
)
KotoDama_TextSampler = load_KotoDama_TextSampler(
    path="Utils/Kotodama/text_enc/checkpoint-22680"
)

Without these, the model cannot infer correctly.

Also, what do these modules contribute during generation? How were they trained?

Oh, sorry, I forgot to rename the path.

Change "Kotodama" to "KTD" in the path, and then it should work.

Kotodama is a new way of generating the style vectors without audio files or the diffusion sampler.
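In practice, a rough sketch of what that looks like, using the Kotodama_Sampler helper and the model objects defined in the script later in this thread (the prompt text here is just a placeholder):

# Derive a style vector directly from plain Japanese text: no reference audio or diffusion sampler needed.
japanese = "Speaker: ..."  # speaker name plus the raw (non-phonemized) Japanese text
style = Kotodama_Sampler(model, text=japanese, device=device)
# `style` can then be passed to inference() in place of an audio-derived style vector.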

@Respair Hi, in that case, what if users want styles taken exactly from an audio file?

Hi, the result is still not right.

config = yaml.safe_load(open("Configs/config_kanade.yml"))

params_whole = torch.load(
# "Models/Style_Kanade_v02/Top_ckpt_24khz.pth", map_location="cpu"
"Models/Style_Tsukasa_v02/Top_ckpt_24khz.pth",
map_location="cpu",
)

I can only see this one .pth, under Style_Tsukasa_v02. Can you point out where it could be wrong?

import IPython.display as ipd
import os
import torch

from utils import recursive_munch

torch.manual_seed(0)
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True

from Utils.phonemize.cotlet_phon import phonemize

import time
import random
import yaml
from munch import Munch
import numpy as np
import torch
from torch import nn
import torch.nn.functional as F
import torchaudio
import librosa
from nltk.tokenize import word_tokenize

from models import *
from Modules.KotoDama_sampler import (
    tokenizer_koto_prompt,
    tokenizer_koto_text,
    inference,
    Longform,
    merging_sentences,
    # trim_long_silences,
)
from utils import *

import nltk

nltk.download("punkt_tab")

from nltk.tokenize import sent_tokenize

from konoha import SentenceTokenizer


sent_tokenizer = SentenceTokenizer()

to_mel = torchaudio.transforms.MelSpectrogram(
    n_mels=80, n_fft=2048, win_length=1200, hop_length=300
)
mean, std = -4, 4


def preprocess(wave):
    wave_tensor = torch.from_numpy(wave).float()
    mel_tensor = to_mel(wave_tensor)
    mel_tensor = (torch.log(1e-5 + mel_tensor.unsqueeze(0)) - mean) / std
    return mel_tensor


def compute_style_through_clip(path):
    wave, sr = librosa.load(path, sr=24000)
    audio, index = librosa.effects.trim(wave, top_db=30)
    if sr != 24000:
        # recent librosa versions require keyword arguments here
        audio = librosa.resample(audio, orig_sr=sr, target_sr=24000)
    mel_tensor = preprocess(audio).to(device)

    with torch.no_grad():
        ref_s = model.style_encoder(mel_tensor.unsqueeze(1))
        ref_p = model.predictor_encoder(mel_tensor.unsqueeze(1))

    return torch.cat([ref_s, ref_p], dim=1)


def Kotodama_Prompter(model, text, device):

    with torch.no_grad():
        style = model.KotoDama_Prompt(
            **tokenizer_koto_prompt(text, return_tensors="pt").to(device)
        )["logits"]
    return style


def Kotodama_Sampler(model, text, device):

    with torch.no_grad():
        style = model.KotoDama_Text(
            **tokenizer_koto_text(text, return_tensors="pt").to(device)
        )["logits"]
    return style


import soundfile as sf


def save_wav(wav, rate=24000, filename="output.wav"):

    if not isinstance(wav, np.ndarray):
        wav = np.array(wav, dtype=np.float32)

    # Clip the audio data to the valid range [-1, 1] to avoid issues
    wav = np.clip(wav, -1.0, 1.0)

    # Save the WAV file using soundfile (use the passed-in rate rather than a hardcoded value)
    sf.write(filename, wav, rate)


device = "cuda" if torch.cuda.is_available() else "cpu"

config = yaml.safe_load(open("Configs/config_kanade.yml"))

# load pretrained ASR model
ASR_config = config.get("ASR_config", False)
ASR_path = config.get("ASR_path", False)
text_aligner = load_ASR_models(ASR_path, ASR_config)


KotoDama_Prompter = load_KotoDama_Prompter(
    # path="Utils/Kotodama/prompt_enc/checkpoint-73285"
    path="Utils/KTD/prompt_enc/checkpoint-73285"
)
KotoDama_TextSampler = load_KotoDama_TextSampler(
    # path="Utils/Kotodama/text_enc/checkpoint-22680"
    path="Utils/KTD/text_enc/checkpoint-22680"
)

# load pretrained F0 model
F0_path = config.get("F0_path", False)
pitch_extractor = load_F0_models(F0_path)

# load BERT model
from Utils.PLBERT.util import load_plbert

BERT_path = config.get("PLBERT_dir", False)
plbert = load_plbert(BERT_path)

model_params = recursive_munch(config["model_params"])
model = build_model(
    model_params,
    text_aligner,
    pitch_extractor,
    plbert,
    KotoDama_Prompter,
    KotoDama_TextSampler,
    # None, None
)
_ = [model[key].eval() for key in model]
_ = [model[key].to(device) for key in model]

params_whole = torch.load(
    # "Models/Style_Kanade_v02/Top_ckpt_24khz.pth", map_location="cpu"
    "Models/Style_Tsukasa_v02/Top_ckpt_24khz.pth",
    map_location="cpu",
)
params = params_whole["net"]


for key in model:
    if key in params:
        print("%s loaded" % key)
        try:
            model[key].load_state_dict(params[key])
        except:
            from collections import OrderedDict

            state_dict = params[key]
            new_state_dict = OrderedDict()
            for k, v in state_dict.items():
                name = k[7:]  # remove `module.`
                new_state_dict[name] = v
            # load params
            model[key].load_state_dict(new_state_dict, strict=False)
#             except:
#                 _load(params[key], model[key])


_ = [model[key].eval() for key in model]


from Modules.diffusion.sampler import DiffusionSampler, ADPM2Sampler, KarrasSchedule

diffusion_sampler = DiffusionSampler(
    model.diffusion.diffusion,
    sampler=ADPM2Sampler(),
    sigma_schedule=KarrasSchedule(
        sigma_min=0.0001, sigma_max=3.0, rho=9.0
    ),  # empirical parameters
    clamp=False,
)

"""different speaker by switching the name. we then define how much should be the impact of the diffusion sampler. 
if the diffusion sampler works for you, it's recommended to use both Kotodama. otherwise, set alpha and beta to 0."""

japanese = "Kimiji: ไบบ็”Ÿใฏใ€ๆžœใฆใ—ใชใ„ๆŽขๆฑ‚ใฎๆ—…ใฎใ‚ˆใ†ใชใ‚‚ใฎใ€‚็งใŸใกใฏใ€่‡ชๅˆ†่‡ช่บซใ‚„ๅ‘จๅ›ฒใฎไธ–็•Œใซใคใ„ใฆใ€ๅธธใซๆ–ฐใ—ใ„็™บ่ฆ‹ใ‚’ใ—ใฆใ„ใใ€‚ใใ‚Œใฏใ€ๆ™‚ใจใ—ใฆๅ–œใณใ‚’ใ‚‚ใŸใ‚‰ใ™ใ“ใจใ‚‚ใ‚ใ‚Œใฐใ€ๅ›ฐ้›ฃใซ็›ด้ขใ™ใ‚‹ใ“ใจใ‚‚ใ‚ใ‚‹ใ€‚ใ—ใ‹ใ—ใใ‚Œใ‚‰ๅ…จใฆใŒใ€่‡ชๅˆ†ใ‚’ๅฝขไฝœใ‚‹่ฒด้‡ใช็ตŒ้จ“ใงใ‚ใ‚‹ใ€‚"

raw_jpn = japanese[
    japanese.find(":") + 2 :
]  # factoring out the name of the speaker, since we don't need that for phonemization.
text = phonemize(raw_jpn)

Kotodama = Kotodama_Sampler(
    model, text=japanese, device=device
)  # provide the Japanese text, not the Phonemized version.

reference_dicts = {}

reference_dicts["1789_14289w"] = japanese

start = time.time()
noise = torch.randn(1, 1, 256).to(device)
for k, path in reference_dicts.items():

    wav = inference(
        model,
        diffusion_sampler,
        text,
        Kotodama,
        # None,
        alpha=0.1,
        beta=0.5,
        diffusion_steps=10,
        embedding_scale=1.5,
        rate_of_speech=1.0,
    )
    rtf = (time.time() - start) / (len(wav) / 24000)
    print(f"RTF = {rtf:5f}")
    print(wav)
    save_wav(wav)

My guy, what am I supposed to infer from "the result is not right"?

If it's a path problem, you should just copy the full path of each checkpoint on your device.
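For example (these absolute paths are hypothetical; substitute wherever the repo lives on your machine):

KotoDama_Prompter = load_KotoDama_Prompter(
    path="/home/user/Tsukasa_Speech/Utils/KTD/prompt_enc/checkpoint-73285"
)
KotoDama_TextSampler = load_KotoDama_TextSampler(
    path="/home/user/Tsukasa_Speech/Utils/KTD/text_enc/checkpoint-22680"
)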

@Respair Hi, in that case, what if users want styles taken exactly from an audio file?

The provided notebooks show everything you can do. In case it's not clear, use the compute-style function and then feed its result to the inference call instead of the Kotodama vector.
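A minimal sketch of that swap, based on the script pasted above (the reference wav path is hypothetical):

# Take the style exactly from a reference audio file instead of from text.
ref_style = compute_style_through_clip("reference.wav")  # hypothetical path to a reference clip

wav = inference(
    model,
    diffusion_sampler,
    text,       # phonemized text, as in the script above
    ref_style,  # audio-derived style in place of the Kotodama vector
    alpha=0.1,
    beta=0.5,
    diffusion_steps=10,
    embedding_scale=1.5,
    rate_of_speech=1.0,
)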

Hi, please have a look at my script. The path is changed, but the result is still not right.

Just put the .pth file from here: https://huggingface.co/Respair/Tsukasa_Speech/tree/main/Models/Style_Tsukasa_v02 into the model's checkpoint path.

It should be good to go. If you're still facing an error, please post the exact error you're facing here.
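If you prefer to fetch it programmatically, here is a sketch using the huggingface_hub client (the filename on the Hub is assumed to match the path used in the script above):

from huggingface_hub import hf_hub_download

ckpt_path = hf_hub_download(
    repo_id="Respair/Tsukasa_Speech",
    filename="Models/Style_Tsukasa_v02/Top_ckpt_24khz.pth",  # assumed filename
)
params_whole = torch.load(ckpt_path, map_location="cpu")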

First of all, the IPython notebook points to the Kanade model rather than Tsukasa.
The Models/ folder contains only the Tsukasa checkpoint.

I changed the config to Tsukasa as well.

The output is not right:
...

The broken output suggests the diffusion sampler doesn't work on your system (I mentioned this in the README). Try setting alpha and beta to zero; if it works, then everything is set up correctly.
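Concretely, in the inference call from the script above, that means something like:

wav = inference(
    model,
    diffusion_sampler,
    text,
    Kotodama,
    alpha=0.0,  # zero out the diffusion sampler's contribution
    beta=0.0,   # so only the Kotodama style is used
    diffusion_steps=10,
    embedding_scale=1.5,
    rate_of_speech=1.0,
)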
