{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Style-Bert-VITS2ライブラリの使用例\n",
"\n",
"`pip install style-bert-vits2`を使った、jupyter notebookでの使用例です。Google colab等でも動きます。"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# PyTorch環境の構築(ない場合)\n",
"# 参照: https://pytorch.org/get-started/locally/\n",
"\n",
"%pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "LLrngKcQEAyP"
},
"outputs": [],
"source": [
"# style-bert-vits2のインストール\n",
"\n",
"%pip install style-bert-vits2"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "9xRtfUg5EZkx"
},
"outputs": [],
"source": [
"# BERTモデルをロード(ローカルに手動でダウンロードする必要はありません)\n",
"\n",
"from style_bert_vits2.nlp import bert_models\n",
"from style_bert_vits2.constants import Languages\n",
"\n",
"\n",
"bert_models.load_model(Languages.JP, \"ku-nlp/deberta-v2-large-japanese-char-wwm\")\n",
"bert_models.load_tokenizer(Languages.JP, \"ku-nlp/deberta-v2-large-japanese-char-wwm\")\n",
"# bert_models.load_model(Languages.EN, \"microsoft/deberta-v3-large\")\n",
"# bert_models.load_tokenizer(Languages.EN, \"microsoft/deberta-v3-large\")\n",
"# bert_models.load_model(Languages.ZH, \"hfl/chinese-roberta-wwm-ext-large\")\n",
"# bert_models.load_tokenizer(Languages.ZH, \"hfl/chinese-roberta-wwm-ext-large\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "q2V9d3HyFAr_"
},
"outputs": [],
"source": [
"# Hugging Faceから試しにデフォルトモデルをダウンロードしてみて、それを音声合成に使ってみる\n",
"# model_assetsディレクトリにダウンロードされます\n",
"\n",
"from pathlib import Path\n",
"from huggingface_hub import hf_hub_download\n",
"\n",
"\n",
"model_file = \"jvnv-F1-jp/jvnv-F1-jp_e160_s14000.safetensors\"\n",
"config_file = \"jvnv-F1-jp/config.json\"\n",
"style_file = \"jvnv-F1-jp/style_vectors.npy\"\n",
"\n",
"for file in [model_file, config_file, style_file]:\n",
" print(file)\n",
" hf_hub_download(\n",
" \"litagin/style_bert_vits2_jvnv\",\n",
" file,\n",
" local_dir=\"model_assets\"\n",
" )"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "hJa31MEUFhe4"
},
"outputs": [],
"source": [
"# 上でダウンロードしたモデルファイルを指定して音声合成のテスト\n",
"\n",
"from style_bert_vits2.tts_model import TTSModel\n",
"\n",
"assets_root = Path(\"model_assets\")\n",
"\n",
"model = TTSModel(\n",
" model_path=assets_root / model_file,\n",
" config_path=assets_root / config_file,\n",
" style_vec_path=assets_root / style_file,\n",
" device=\"cpu\"\n",
")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "Gal0tqrtGXZx"
},
"outputs": [],
"source": [
"from IPython.display import Audio, display\n",
"\n",
"sr, audio = model.infer(text=\"こんにちは\")\n",
"display(Audio(audio, rate=sr))"
]
}
],
"metadata": {
"colab": {
"provenance": []
},
"kernelspec": {
"display_name": "Python 3",
"name": "python3"
},
"language_info": {
"name": "python"
}
},
"nbformat": 4,
"nbformat_minor": 0
}