Can you please open-source the correct weights?
The weights in your repo cannot infer correctly.
Hi, the weights are exactly the ones I used and trained. May I ask what issue you're facing?
You're finally online, please help with loading these models:
KotoDama_Prompter = load_KotoDama_Prompter(
    path="Utils/Kotodama/prompt_enc/checkpoint-73285"
)
KotoDama_TextSampler = load_KotoDama_TextSampler(
    path="Utils/Kotodama/text_enc/checkpoint-22680"
)
Without these, the model cannot infer correctly.
Also, what do these modules contribute during generation? How were they trained?
Oh, sorry, I forgot to rename the path.
Change "Kotodama" to "KTD" in the path and it should work.
Kotodama is a new way of generating the style vectors without audio files or the diffusion sampler.
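Roughly, the text-sampler path boils down to something like this minimal sketch (names taken from the inference script further down in this thread; treat it as illustrative, not the exact training/inference code):

import torch
from Modules.KotoDama_sampler import tokenizer_koto_text

def kotodama_style_from_text(model, text, device):
    # The Kotodama text encoder predicts a style vector directly from the raw
    # (unphonemized) text, so no reference audio and no diffusion sampling is needed.
    with torch.no_grad():
        inputs = tokenizer_koto_text(text, return_tensors="pt").to(device)
        style = model.KotoDama_Text(**inputs)["logits"]
    return style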
Hi, the result is still not right.
config = yaml.safe_load(open("Configs/config_kanade.yml"))
params_whole = torch.load(
    # "Models/Style_Kanade_v02/Top_ckpt_24khz.pth", map_location="cpu"
    "Models/Style_Tsukasa_v02/Top_ckpt_24khz.pth",
    map_location="cpu",
)
I can only see this one .pth, Style_Tsukasa_v02. Can you point me to where it could be wrong?
import IPython.display as ipd
import os
import torch
from utils import recursive_munch
torch.manual_seed(0)
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True
from Utils.phonemize.cotlet_phon import phonemize
import time
import random
import yaml
from munch import Munch
import numpy as np
import torch
from torch import nn
import torch.nn.functional as F
import torchaudio
import librosa
from nltk.tokenize import word_tokenize
from models import *
from Modules.KotoDama_sampler import (
    tokenizer_koto_prompt,
    tokenizer_koto_text,
    inference,
    Longform,
    merging_sentences,
    # trim_long_silences,
)
from utils import *
import nltk
nltk.download("punkt_tab")
from nltk.tokenize import sent_tokenize
from konoha import SentenceTokenizer
sent_tokenizer = SentenceTokenizer()
to_mel = torchaudio.transforms.MelSpectrogram(
    n_mels=80, n_fft=2048, win_length=1200, hop_length=300
)
mean, std = -4, 4


def preprocess(wave):
    wave_tensor = torch.from_numpy(wave).float()
    mel_tensor = to_mel(wave_tensor)
    mel_tensor = (torch.log(1e-5 + mel_tensor.unsqueeze(0)) - mean) / std
    return mel_tensor
def compute_style_through_clip(path):
    # Build a combined style vector (style encoder + predictor encoder) from a reference audio file.
    wave, sr = librosa.load(path, sr=24000)
    audio, index = librosa.effects.trim(wave, top_db=30)
    if sr != 24000:
        audio = librosa.resample(audio, orig_sr=sr, target_sr=24000)
    mel_tensor = preprocess(audio).to(device)
    with torch.no_grad():
        ref_s = model.style_encoder(mel_tensor.unsqueeze(1))
        ref_p = model.predictor_encoder(mel_tensor.unsqueeze(1))
    return torch.cat([ref_s, ref_p], dim=1)
def Kotodama_Prompter(model, text, device):
    # Style vector predicted from a text prompt.
    with torch.no_grad():
        style = model.KotoDama_Prompt(
            **tokenizer_koto_prompt(text, return_tensors="pt").to(device)
        )["logits"]
    return style


def Kotodama_Sampler(model, text, device):
    # Style vector predicted from the input text itself.
    with torch.no_grad():
        style = model.KotoDama_Text(
            **tokenizer_koto_text(text, return_tensors="pt").to(device)
        )["logits"]
    return style
import soundfile as sf
def save_wav(wav, rate=24000, filename="output.wav"):
    if not isinstance(wav, np.ndarray):
        wav = np.array(wav, dtype=np.float32)
    # Clip the audio data to the valid range [-1, 1] to avoid issues
    wav = np.clip(wav, -1.0, 1.0)
    # Save the WAV file using soundfile
    sf.write(filename, wav, rate)
device = "cuda" if torch.cuda.is_available() else "cpu"
config = yaml.safe_load(open("Configs/config_kanade.yml"))
# load pretrained ASR model
ASR_config = config.get("ASR_config", False)
ASR_path = config.get("ASR_path", False)
text_aligner = load_ASR_models(ASR_path, ASR_config)
KotoDama_Prompter = load_KotoDama_Prompter(
    # path="Utils/Kotodama/prompt_enc/checkpoint-73285"
    path="Utils/KTD/prompt_enc/checkpoint-73285"
)
KotoDama_TextSampler = load_KotoDama_TextSampler(
    # path="Utils/Kotodama/text_enc/checkpoint-22680"
    path="Utils/KTD/text_enc/checkpoint-22680"
)
# load pretrained F0 model
F0_path = config.get("F0_path", False)
pitch_extractor = load_F0_models(F0_path)
# load BERT model
from Utils.PLBERT.util import load_plbert
BERT_path = config.get("PLBERT_dir", False)
plbert = load_plbert(BERT_path)
model_params = recursive_munch(config["model_params"])
model = build_model(
    model_params,
    text_aligner,
    pitch_extractor,
    plbert,
    KotoDama_Prompter,
    KotoDama_TextSampler,
    # None, None
)
_ = [model[key].eval() for key in model]
_ = [model[key].to(device) for key in model]
params_whole = torch.load(
    # "Models/Style_Kanade_v02/Top_ckpt_24khz.pth", map_location="cpu"
    "Models/Style_Tsukasa_v02/Top_ckpt_24khz.pth",
    map_location="cpu",
)
params = params_whole["net"]
for key in model:
    if key in params:
        print("%s loaded" % key)
        try:
            model[key].load_state_dict(params[key])
        except Exception:
            from collections import OrderedDict

            # The checkpoint was saved from a DataParallel model, so strip
            # the "module." prefix from every key before loading.
            state_dict = params[key]
            new_state_dict = OrderedDict()
            for k, v in state_dict.items():
                name = k[7:]  # remove `module.`
                new_state_dict[name] = v
            # load params
            model[key].load_state_dict(new_state_dict, strict=False)
            # except:
            #     _load(params[key], model[key])
_ = [model[key].eval() for key in model]
from Modules.diffusion.sampler import DiffusionSampler, ADPM2Sampler, KarrasSchedule
diffusion_sampler = DiffusionSampler(
    model.diffusion.diffusion,
    sampler=ADPM2Sampler(),
    sigma_schedule=KarrasSchedule(
        sigma_min=0.0001, sigma_max=3.0, rho=9.0
    ),  # empirical parameters
    clamp=False,
)
"""different speaker by switching the name. we then define how much should be the impact of the diffusion sampler.
if the diffusion sampler works for you, it's recommended to use both Kotodama. otherwise, set alpha and beta to 0."""
japanese = "Kimiji: ไบบ็ใฏใๆใฆใใชใๆขๆฑใฎๆ
ใฎใใใชใใฎใ็งใใกใฏใ่ชๅ่ช่บซใๅจๅฒใฎไธ็ใซใคใใฆใๅธธใซๆฐใใ็บ่ฆใใใฆใใใใใใฏใๆใจใใฆๅใณใใใใใใใจใใใใฐใๅฐ้ฃใซ็ด้ขใใใใจใใใใใใใใใใๅ
จใฆใใ่ชๅใๅฝขไฝใ่ฒด้ใช็ต้จใงใใใ"
raw_jpn = japanese[
japanese.find(":") + 2 :
] # factoring out the name of the speaker, since we don't need that for phonemization.
text = phonemize(raw_jpn)
Kotodama = Kotodama_Sampler(
model, text=japanese, device=device
) # provide the Japanese text, not the Phonemized version.
reference_dicts = {}
reference_dicts["1789_14289w"] = japanese
start = time.time()
noise = torch.randn(1, 1, 256).to(device)
for k, path in reference_dicts.items():
    wav = inference(
        model,
        diffusion_sampler,
        text,
        Kotodama,
        # None,
        alpha=0.1,
        beta=0.5,
        diffusion_steps=10,
        embedding_scale=1.5,
        rate_of_speech=1.0,
    )
    rtf = (time.time() - start) / (len(wav) / 24000)
    print(f"RTF = {rtf:5f}")
    print(wav)
    save_wav(wav)
My guy, what am I supposed to infer from "the result is not right"?
If it's a path problem, you should just copy the full path of each checkpoint on your device.
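For example (these absolute paths are made up, adjust them to wherever the repo actually lives on your machine):

KotoDama_Prompter = load_KotoDama_Prompter(
    path="/home/you/Tsukasa_Speech/Utils/KTD/prompt_enc/checkpoint-73285"
)
KotoDama_TextSampler = load_KotoDama_TextSampler(
    path="/home/you/Tsukasa_Speech/Utils/KTD/text_enc/checkpoint-22680"
)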
@Respair Hi, in that case, what if users want the style to come exactly from an audio file?
The provided notebooks show everything you can do. In case it's not clear: use the compute style function and then feed its result instead of the Kotodama vector.
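Roughly like this, reusing the helpers already in your script (untested sketch; "reference_audio.wav" is just a placeholder for your own recording):

# Build the style vector from a reference recording instead of from text.
ref_style = compute_style_through_clip("reference_audio.wav")

# Pass it to inference() in place of the Kotodama vector.
wav = inference(
    model,
    diffusion_sampler,
    text,
    ref_style,
    alpha=0.1,
    beta=0.5,
    diffusion_steps=10,
    embedding_scale=1.5,
    rate_of_speech=1.0,
)
save_wav(wav)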
Hi, please have a look at my script. I changed the path, but the result is still not right.
Just put the .pth file from here: https://huggingface.co/Respair/Tsukasa_Speech/tree/main/Models/Style_Tsukasa_v02 in the model's ckpt path.
You should be good to go. If you're still facing an error, please write the exact error you're getting here.
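If you'd rather script the download, here's a sketch with huggingface_hub (assuming the checkpoint keeps the same filename your script expects, and that you run this from the repo root):

from huggingface_hub import hf_hub_download

# Fetches Models/Style_Tsukasa_v02/Top_ckpt_24khz.pth from the HF repo and
# mirrors the same folder layout under the current directory.
hf_hub_download(
    repo_id="Respair/Tsukasa_Speech",
    filename="Models/Style_Tsukasa_v02/Top_ckpt_24khz.pth",
    local_dir=".",
)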
First of all, the IPython notebook points to the Kanade model, rather than Tsukasa.
The Models/ folder contains only the Tsukasa checkpoint.
I changed the config to Tsukasa as well.
The output is still not right.
...
The broken output suggests the diffusion sampler doesn't work on your system (I mentioned this in the README). Try setting alpha and beta to zero; if it works, then everything is set up correctly.
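Concretely, in the inference() call from your script, that means something like:

wav = inference(
    model,
    diffusion_sampler,
    text,
    Kotodama,
    alpha=0.0,  # disable the diffusion sampler's contribution
    beta=0.0,
    diffusion_steps=10,
    embedding_scale=1.5,
    rate_of_speech=1.0,
)
save_wav(wav)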