{
  "cells": [
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "# Style-Bert-VITS2ライブラリの使用例\n",
        "\n",
        "`pip install style-bert-vits2`を使った、jupyter notebookでの使用例です。Google colab等でも動きます。"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {},
      "outputs": [],
      "source": [
        "# PyTorch環境の構築(ない場合)\n",
        "# 参照: https://pytorch.org/get-started/locally/\n",
        "\n",
        "!pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "LLrngKcQEAyP"
      },
      "outputs": [],
      "source": [
        "# style-bert-vits2のインストール\n",
        "\n",
        "!pip install style-bert-vits2"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "9xRtfUg5EZkx"
      },
      "outputs": [],
      "source": [
        "# BERTモデルをロード(ローカルに手動でダウンロードする必要はありません)\n",
        "\n",
        "from style_bert_vits2.nlp import bert_models\n",
        "from style_bert_vits2.constants import Languages\n",
        "\n",
        "\n",
        "bert_models.load_model(Languages.JP, \"ku-nlp/deberta-v2-large-japanese-char-wwm\")\n",
        "bert_models.load_tokenizer(Languages.JP, \"ku-nlp/deberta-v2-large-japanese-char-wwm\")\n",
        "# bert_models.load_model(Languages.EN, \"microsoft/deberta-v3-large\")\n",
        "# bert_models.load_tokenizer(Languages.EN, \"microsoft/deberta-v3-large\")\n",
        "# bert_models.load_model(Languages.ZH, \"hfl/chinese-roberta-wwm-ext-large\")\n",
        "# bert_models.load_tokenizer(Languages.ZH, \"hfl/chinese-roberta-wwm-ext-large\")"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "q2V9d3HyFAr_"
      },
      "outputs": [],
      "source": [
        "# Hugging Faceから試しにデフォルトモデルをダウンロードしてみて、それを音声合成に使ってみる\n",
        "# model_assetsディレクトリにダウンロードされます\n",
        "\n",
        "from pathlib import Path\n",
        "from huggingface_hub import hf_hub_download\n",
        "\n",
        "\n",
        "model_file = \"jvnv-F1-jp/jvnv-F1-jp_e160_s14000.safetensors\"\n",
        "config_file = \"jvnv-F1-jp/config.json\"\n",
        "style_file = \"jvnv-F1-jp/style_vectors.npy\"\n",
        "\n",
        "for file in [model_file, config_file, style_file]:\n",
        "    print(file)\n",
        "    hf_hub_download(\n",
        "        \"litagin/style_bert_vits2_jvnv\",\n",
        "        file,\n",
        "        local_dir=\"model_assets\"\n",
        "    )"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "hJa31MEUFhe4"
      },
      "outputs": [],
      "source": [
        "# 上でダウンロードしたモデルファイルを指定して音声合成のテスト\n",
        "\n",
        "from style_bert_vits2.tts_model import TTSModel\n",
        "\n",
        "assets_root = Path(\"model_assets\")\n",
        "\n",
        "model = TTSModel(\n",
        "    model_path=assets_root / model_file,\n",
        "    config_path=assets_root / config_file,\n",
        "    style_vec_path=assets_root / style_file,\n",
        "    device=\"cpu\"\n",
        ")"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "Gal0tqrtGXZx"
      },
      "outputs": [],
      "source": [
        "from IPython.display import Audio, display\n",
        "\n",
        "sr, audio = model.infer(text=\"こんにちは\")\n",
        "display(Audio(audio, rate=sr))"
      ]
    }
  ],
  "metadata": {
    "colab": {
      "provenance": []
    },
    "kernelspec": {
      "display_name": "Python 3",
      "name": "python3"
    },
    "language_info": {
      "name": "python"
    }
  },
  "nbformat": 4,
  "nbformat_minor": 0
}