# kevinwang676's picture
# Update app.py
# a797ecf
import json
import os
import subprocess
from pathlib import Path
import gradio as gr
import librosa
import numpy as np
import torch
from demucs.apply import apply_model
from demucs.pretrained import DEFAULT_MODEL, get_model
from huggingface_hub import hf_hub_download, list_repo_files
from so_vits_svc_fork.hparams import HParams
from so_vits_svc_fork.inference.core import Svc
###################################################################
# REPLACE THESE VALUES TO CHANGE THE MODEL REPO/CKPT NAME/SETTINGS
###################################################################

# The Hugging Face Hub repo ID. Change repo_id here; it can be replaced with
# any repo that contains an already-trained model.
repo_id = "kevinwang676/talktalkai-qing"

# If None, uses the latest ckpt in the repo.
ckpt_name = None

# If None, uses "kmeans.pt" if it exists in the repo.
cluster_model_name = None

# Set the default f0 type to use - use the one the model was trained on.
# The default for so-vits-svc-fork is "dio".
# Options: "crepe", "crepe-tiny", "parselmouth", "dio", "harvest"
default_f0_method = "crepe"

# The default ratio of cluster inference to SVC inference.
# If cluster_model_name is not found in the repo, this is set to 0.
default_cluster_infer_ratio = 0.5


def resolve_duration_limit(environ=None, default=9e9):
    """Return the maximum audio duration (in whole seconds) allowed at inference.

    The value is read from the ``MAX_DURATION_SECONDS`` key of *environ*
    (``os.environ`` when *environ* is None). It is parsed as a float first so
    that values such as ``"30"``, ``"30.0"`` or ``"3e2"`` are all accepted, and
    then truncated to an int.

    Args:
        environ: Mapping to read the setting from; defaults to ``os.environ``.
        default: Limit used when the variable is missing or blank.

    Returns:
        The limit as an ``int`` number of seconds.

    Raises:
        ValueError: If the variable is set to something that is not a number.
        OverflowError: If the variable is set to an infinite value.
    """
    source = os.environ if environ is None else environ
    raw = source.get("MAX_DURATION_SECONDS")
    if raw is None or not str(raw).strip():
        return int(default)
    # int("3e2") and int("30.0") both raise, so go through float first.
    return int(float(raw))


# Limit on duration of audio at inference time. Increase it if you can.
# In this parent app, we set the limit with an env var to 30 seconds.
# If you didn't set the env var and you go OOM, try changing 9e9 to <=300ish.
duration_limit = resolve_duration_limit()
###################################################################
# Figure out the latest generator by taking the highest-numbered one.
# Ex. if the repo has: G_0.pth, G_100.pth, G_200.pth, we'd use G_200.pth
# The repo listing is fetched once and reused for both lookups below.
repo_files = list_repo_files(repo_id)

if ckpt_name is None:
    generator_steps = []
    for repo_file in repo_files:
        if not (repo_file.startswith("G_") and repo_file.endswith(".pth")):
            continue
        step = Path(repo_file).stem.split("_")[1]
        # Skip names such as "G_final.pth" whose suffix is not a step count.
        if step.isdigit():
            generator_steps.append(int(step))
    if not generator_steps:
        raise FileNotFoundError(f"No generator checkpoint (G_<step>.pth) found in {repo_id}")
    ckpt_name = f"G_{max(generator_steps)}.pth"

# The cluster model is optional: fall back to no cluster model when it is absent.
cluster_model_name = cluster_model_name or "kmeans.pt"
if cluster_model_name in repo_files:
    print(f"Found Cluster model - Downloading {cluster_model_name} from {repo_id}")
    cluster_model_path = hf_hub_download(repo_id, cluster_model_name)
else:
    print(f"Could not find {cluster_model_name} in {repo_id}. Using None")
    cluster_model_path = None
# Without a cluster model the cluster inference ratio is forced to 0.
default_cluster_infer_ratio = default_cluster_infer_ratio if cluster_model_path else 0

# Download the generator weights and config, then build the models used by the app.
generator_path = hf_hub_download(repo_id, ckpt_name)
config_path = hf_hub_download(repo_id, "config.json")
hparams = HParams(**json.loads(Path(config_path).read_text()))
speakers = list(hparams.spk.keys())
device = "cuda" if torch.cuda.is_available() else "cpu"
model = Svc(
    net_g_path=generator_path,
    config_path=config_path,
    device=device,
    cluster_model_path=cluster_model_path,
)
demucs_model = get_model(DEFAULT_MODEL)
def extract_vocal_demucs(model, filename, sr=44100, device=None, shifts=1, split=True, overlap=0.25, jobs=0):
    """Separate an audio file into a vocal track and an instrumental track.

    The file is loaded with librosa at sample rate *sr* keeping its channels,
    standardised with the mean/std of the channel-averaged signal, run through
    ``apply_model`` and then de-standardised again.

    Args:
        model: Source-separation model passed to ``apply_model``.
        filename: Path of the audio file to load.
        sr: Sample rate the file is resampled to on load.
        device: Forwarded to ``apply_model``.
        shifts: Forwarded to ``apply_model``.
        split: Forwarded to ``apply_model``.
        overlap: Forwarded to ``apply_model``.
        jobs: Forwarded to ``apply_model`` as ``num_workers``.

    Returns:
        Tuple ``(vocal_wav, instrumental_wav)`` of numpy arrays: the vocals
        stem down-mixed to mono, and the sum of all remaining stems transposed
        so that the sample axis comes first.
    """
    wav, sr = librosa.load(filename, mono=False, sr=sr)
    wav = torch.tensor(wav)
    if wav.dim() == 1:
        # librosa returns a 1-D array for single-channel files; give it a
        # channel axis. NOTE(review): assumes the model exposes
        # `audio_channels` as Demucs models do; falls back to stereo otherwise.
        wav = wav.unsqueeze(0).repeat(getattr(model, "audio_channels", 2), 1)

    ref = wav.mean(0)
    wav = (wav - ref.mean()) / ref.std()
    sources = apply_model(
        model, wav[None], device=device, shifts=shifts, split=split, overlap=overlap, progress=True, num_workers=jobs
    )[0]
    sources = sources * ref.std() + ref.mean()

    # Take just the vocals stem. Look it up by name when the model lists its
    # stems; otherwise keep the previous assumption that vocals come last.
    stem_names = list(getattr(model, "sources", []))
    vocal_idx = stem_names.index("vocals") if "vocals" in stem_names else len(sources) - 1
    vocal_wav = sources[vocal_idx]

    # Same normalization the so-vits model requires: only scale down when the
    # peak (with 1% headroom) would exceed 1.
    vocal_wav = vocal_wav / max(1.01 * vocal_wav.abs().max(), 1)
    vocal_wav = vocal_wav.numpy()
    vocal_wav = librosa.to_mono(vocal_wav)
    vocal_wav = vocal_wav.T

    # Everything that is not vocals is summed into the instrumental track.
    other_stems = torch.cat([sources[:vocal_idx], sources[vocal_idx + 1:]])
    instrumental_wav = other_stems.sum(0).numpy().T
    return vocal_wav, instrumental_wav
def download_youtube_clip(
    video_identifier,
    start_time,
    end_time,
    output_filename,
    num_attempts=5,
    url_base="https://www.youtube.com/watch?v=",
    quiet=False,
    force=False,
):
    """Download a section of a video's audio as WAV by invoking ``yt-dlp``.

    Args:
        video_identifier: Video ID, or a full URL when *url_base* is empty.
        start_time: Section start, interpolated into ``--download-sections``.
        end_time: Section end, interpolated into ``--download-sections``.
        output_filename: Path handed to ``yt-dlp -o``.
        num_attempts: How many times to run ``yt-dlp`` before giving up
            (at least one attempt is always made).
        url_base: Prefix prepended to *video_identifier* to form the URL.
        quiet: When true, pass ``--quiet --no-warnings`` to ``yt-dlp``.
        force: When true, delete an existing *output_filename* and download
            again; otherwise an existing file is returned as-is.

    Returns:
        ``Path(output_filename)`` if the file exists afterwards, else ``None``
        (all attempts failed, ``yt-dlp`` is not installed, or no file was
        produced).
    """
    output_path = Path(output_filename)
    if output_path.exists():
        if not force:
            return output_path
        output_path.unlink()

    # Build an argument list and run without a shell: the identifier comes
    # from user input, so it must never be interpreted by /bin/sh.
    command = ["yt-dlp"]
    if quiet:
        command += ["--quiet", "--no-warnings"]
    command += [
        "-x",
        "--audio-format",
        "wav",
        "-f",
        "bestaudio",
        "-o",
        str(output_filename),
        "--download-sections",
        f"*{start_time}-{end_time}",
        "--",  # end of options, so the URL/ID can never be parsed as a flag
        f"{url_base}{video_identifier}",
    ]

    for _ in range(max(int(num_attempts), 1)):
        try:
            subprocess.check_output(command, stderr=subprocess.STDOUT)
        except subprocess.CalledProcessError:
            continue  # yt-dlp exited non-zero; try again
        except FileNotFoundError:
            return None  # yt-dlp is not installed; retrying cannot help
        break
    else:
        return None

    return output_path if output_path.exists() else None
def predict(
    speaker,
    audio,
    transpose: int = 0,
    auto_predict_f0: bool = False,
    cluster_infer_ratio: float = 0,
    noise_scale: float = 0.4,
    f0_method: str = "crepe",
    db_thresh: int = -40,
    pad_seconds: float = 0.5,
    chunk_seconds: float = 0.5,
    absolute_thresh: bool = False,
):
    """Convert an audio file to the given speaker's voice.

    The file is loaded at the model's target sample rate, capped at
    ``duration_limit`` seconds, and passed to ``model.infer_silence`` together
    with the remaining arguments.

    Returns:
        Tuple ``(sample_rate, converted_audio)``.
    """
    waveform, _ = librosa.load(audio, sr=model.target_sample, duration=duration_limit)

    # Everything except the waveform is forwarded verbatim to the model.
    inference_options = {
        "speaker": speaker,
        "transpose": transpose,
        "auto_predict_f0": auto_predict_f0,
        "cluster_infer_ratio": cluster_infer_ratio,
        "noise_scale": noise_scale,
        "f0_method": f0_method,
        "db_thresh": db_thresh,
        "pad_seconds": pad_seconds,
        "chunk_seconds": chunk_seconds,
        "absolute_thresh": absolute_thresh,
    }
    converted = model.infer_silence(waveform.astype(np.float32), **inference_options)
    return model.target_sample, converted
def predict_song_from_yt(
    ytid_or_url,
    start,
    end,
    speaker=speakers[0],
    transpose: int = 0,
    auto_predict_f0: bool = False,
    cluster_infer_ratio: float = 0,
    noise_scale: float = 0.4,
    f0_method: str = "dio",
    db_thresh: int = -40,
    pad_seconds: float = 0.5,
    chunk_seconds: float = 0.5,
    absolute_thresh: bool = False,
):
    """Clone the vocals of a YouTube clip and remix them with the backing track.

    The clip between *start* and *end* (capped at ``duration_limit`` seconds)
    is downloaded to ``track.wav``, split into vocals and instrumental, the
    vocals are converted with ``model.infer_silence`` and added back onto the
    instrumental (which is pitch-shifted by *transpose* semitones when that
    is non-zero).

    Args:
        ytid_or_url: A full URL (anything starting with ``http``) or a video ID.
        start: Clip start time in seconds.
        end: Clip end time in seconds.
        speaker: Target speaker passed to the model.
        transpose: Semitones to shift both vocals and instrumental.
        auto_predict_f0, cluster_infer_ratio, noise_scale, f0_method,
        db_thresh, pad_seconds, chunk_seconds, absolute_thresh:
            Forwarded unchanged to ``model.infer_silence``.

    Returns:
        ``((sample_rate, full_song), (sample_rate, cloned_vox))``.

    Raises:
        gr.Error: If the URL/ID is empty, the time range is invalid, or the
            clip could not be downloaded.
    """
    ytid_or_url = (ytid_or_url or "").strip()
    if not ytid_or_url:
        raise gr.Error("Please provide a YouTube URL or video ID.")
    if start is None or end is None or end <= start:
        raise gr.Error("End time must be greater than start time.")

    end = min(start + duration_limit, end)
    original_track_filepath = download_youtube_clip(
        ytid_or_url,
        start,
        end,
        "track.wav",
        force=True,
        url_base="" if ytid_or_url.startswith("http") else "https://www.youtube.com/watch?v=",
    )
    if original_track_filepath is None:
        # The downloader signals failure with None; surface it to the user
        # instead of letting the audio loader fail on a missing path.
        raise gr.Error("Could not download audio from YouTube. Check the URL/ID and the time range.")

    vox_wav, inst_wav = extract_vocal_demucs(demucs_model, original_track_filepath)
    if transpose != 0:
        # Keep the backing track in the same key as the transposed vocals.
        inst_wav = librosa.effects.pitch_shift(inst_wav.T, sr=model.target_sample, n_steps=transpose).T
    cloned_vox = model.infer_silence(
        vox_wav.astype(np.float32),
        speaker=speaker,
        transpose=transpose,
        auto_predict_f0=auto_predict_f0,
        cluster_infer_ratio=cluster_infer_ratio,
        noise_scale=noise_scale,
        f0_method=f0_method,
        db_thresh=db_thresh,
        pad_seconds=pad_seconds,
        chunk_seconds=chunk_seconds,
        absolute_thresh=absolute_thresh,
    )

    # NOTE(review): the two tracks are presumably the same length; trimming to
    # the shorter one is a no-op in that case and avoids a broadcast error if
    # the model ever returns a slightly different number of samples.
    n_samples = min(len(inst_wav), len(cloned_vox))
    full_song = inst_wav[:n_samples] + np.expand_dims(cloned_vox[:n_samples], 1)
    return (model.target_sample, full_song), (model.target_sample, cloned_vox)
# Markdown rendered below the page title.
description = """
## <center>🏞️ - TalkTalkAI - Generative AI Text to Speech & Singing Voice Conversion</center>
### <center>🌟 - The singer who collaborates with TalkTalkAI: [一清清清](https://space.bilibili.com/22960772?spm_id_from=333.337.0.0)</center>
### <center>🤗 - Stay tuned. The best is yet to come. Contact us: talktalkai.kevin@gmail.com</center>
""".strip()

# Input widgets, in the positional order of predict_song_from_yt's parameters.
# The last four are hidden and simply carry their default values.
yt_inputs = [
    gr.Textbox(
        label="YouTube URL or ID",
        info="A YouTube URL (or ID) to a song on YouTube you want to clone from",
    ),
    gr.Number(value=0, label="Start Time (seconds)"),
    gr.Number(value=15, label="End Time (seconds)"),
    gr.Dropdown(speakers, value=speakers[0], label="🎤 AI Singer"),
    gr.Slider(-12, 12, value=0, step=1, label="Transpose (Semitones)"),
    gr.Checkbox(False, label="Auto Predict F0", visible=False),
    gr.Slider(0.0, 1.0, value=default_cluster_infer_ratio, step=0.1, label="cluster infer ratio", visible=False),
    gr.Slider(0.0, 1.0, value=0.4, step=0.1, label="noise scale", visible=False),
    gr.Dropdown(
        choices=["crepe", "crepe-tiny", "parselmouth", "dio", "harvest"],
        value=default_f0_method,
        label="f0 method",
        visible=False,
    ),
]

# Two audio players: the full remix and the isolated converted vocals.
yt_outputs = [gr.Audio(label="With BGM🎵"), gr.Audio(label="Without BGM🎤")]

# One example row; its values line up with yt_inputs.
yt_examples = [
    [
        "https://www.youtube.com/watch?v=cn4M-fH08XY",
        0,
        11.4,
        speakers[0],
        0,
        False,
        default_cluster_infer_ratio,
        0.4,
        default_f0_method,
    ],
]

interface_yt = gr.Interface(
    predict_song_from_yt,
    inputs=yt_inputs,
    outputs=yt_outputs,
    title="🌊💕🎶 - TalkTalkAI",
    description=description,
    examples=yt_examples,
)

# The app exposes a single tab wrapping the YouTube interface.
interface = gr.TabbedInterface(
    [interface_yt],
    ["📺 - Clone Song From YouTube"],
)

if __name__ == "__main__":
    interface.launch(show_error=True)