File size: 7,523 Bytes
66baa21 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 |
import torch
from transformers import pipeline
import numpy as np
import gradio as gr
def _grab_best_device(use_gpu=True):
if torch.cuda.device_count() > 0 and use_gpu:
device = "cuda"
else:
device = "cpu"
return device
# Device string ("cuda" or "cpu") used for every pipeline created at startup.
device = _grab_best_device()

# Non-finetuned baseline checkpoint for each supported language.
default_model_per_language = {
    "marathi": "facebook/mms-tts-mar",
}

# Finetuned checkpoints offered in the "Model" dropdown, keyed by language.
models_per_language = {
    "marathi": ["ylacombe/mms-mar-finetuned-monospeaker"],
}

# Checkpoint preloaded as the "finetuned" pipeline at startup.
# NOTE(review): this is not one of the entries in `models_per_language`, so the
# first request will replace it with the selected model - confirm whether
# preloading `models_per_language["marathi"][0]` was intended instead.
HUB_PATH = "ylacombe/vits_ljs_midlands_male_monospeaker"

# Mutable cache of the currently loaded pipelines and the selection they
# correspond to; `generate_audio` reloads an entry when the selection changes.
pipe_dict = {
    "current_model": HUB_PATH,
    # Use the detected device rather than a hard-coded GPU index so the app
    # also starts on CPU-only hosts.
    "pipe": pipeline("text-to-speech", model=HUB_PATH, device=device),
    "original_pipe": pipeline("text-to-speech", model=default_model_per_language["marathi"], device=device),
    # Must name the language whose baseline is loaded in "original_pipe";
    # otherwise the first request reloads the very same checkpoint.
    "language": "marathi",
}
# Markdown introduction rendered by the demo (headline plus background on MMS).
title = """
# Explore MMS finetuning
## Or how to access truly multilingual TTS
Massively Multilingual Speech (MMS) models are light-weight, low-latency TTS models based on the [VITS architecture](https://huggingface.co/docs/transformers/model_doc/vits).
They are part of Meta's [MMS](https://arxiv.org/abs/2305.13516) project, which aims to provide speech technology across a diverse range of languages. You can find more details about the supported languages and their ISO 639-3 codes in the [MMS Language Coverage Overview](https://dl.fbaipublicfiles.com/mms/misc/language_coverage_mms.html),
and see all MMS-TTS checkpoints on the Hugging Face Hub: [facebook/mms-tts](https://huggingface.co/models?sort=trending&search=facebook%2Fmms-tts).
Coupled with the right data and the right training recipe, you can get an excellent finetuned version of every MMS checkpoint in **20 minutes** with as little as **80 to 150 samples**.
Training recipe available in this [github repository](https://github.com/ylacombe/finetune-hf-vits)!
"""
# Total number of audio slots shown in the UI: one for the non-finetuned
# baseline plus up to `max_speakers - 1` finetuned voices. It must be at least
# 2, otherwise there is no slot left for the finetuned model's audio.
max_speakers = 2


def _audio_update(tts_output, label):
    """Wrap a text-to-speech pipeline result in a visible, labelled `gr.Audio`."""
    return gr.Audio(
        value=(tts_output["sampling_rate"], tts_output["audio"].squeeze()),
        type="numpy",
        autoplay=False,
        label=label,
        show_label=True,
        visible=True,
    )


# Inference
def generate_audio(text, model_id, language):
    """Synthesise `text` with the baseline and the selected finetuned model.

    The pipelines cached in `pipe_dict` are reloaded when `language` or
    `model_id` differ from the cached selection.

    Args:
        text: Sentence to synthesise.
        model_id: Hub id of the finetuned checkpoint to run.
        language: Key into `default_model_per_language` selecting the baseline.

    Returns:
        A list of `gr.Audio` components: the baseline prediction first, then
        one entry per finetuned speaker (at most `max_speakers - 1`), padded
        with hidden components up to `max_speakers` entries.

    Raises:
        KeyError: If `language` is not a key of `default_model_per_language`.
    """
    baseline_id = default_model_per_language[language]

    if pipe_dict["language"] != language:
        gr.Warning(f"Language has changed - loading new default model: {default_model_per_language[language]}")
        # Load first and record the language afterwards, so a failed load does
        # not leave the cache claiming a language it does not hold.
        pipe_dict["original_pipe"] = pipeline("text-to-speech", model=baseline_id, device=device)
        pipe_dict["language"] = language

    if pipe_dict["current_model"] != model_id:
        gr.Warning("Model has changed - loading new model")
        pipe_dict["pipe"] = pipeline("text-to-speech", model=model_id, device=device)
        pipe_dict["current_model"] = model_id

    num_speakers = pipe_dict["pipe"].model.config.num_speakers

    # First slot: the non-finetuned model's prediction.
    out = [
        _audio_update(
            pipe_dict["original_pipe"](text),
            f"Non finetuned model prediction {default_model_per_language[language]}",
        )
    ]

    if num_speakers > 1:
        # One slot per speaker, limited to the slots left after the baseline.
        for i in range(min(num_speakers, max_speakers - 1)):
            result = pipe_dict["pipe"](text, forward_params={"speaker_id": i})
            out.append(_audio_update(result, f"Generated Audio - speaker {i}"))
    else:
        out.append(_audio_update(pipe_dict["pipe"](text), "Generated Audio - Mono speaker"))

    # Hide whatever slots remain. Padding is derived from the actual length so
    # the result has `max_speakers` entries on both branches.
    out.extend(gr.Audio(visible=False) for _ in range(max_speakers - len(out)))
    return out
# Custom stylesheet passed to `gr.Blocks(css=css)`.
# `#intro` matches the `elem_id` given to the title Markdown and centres it.
# NOTE(review): no component in this file sets `elem_id="container"`, so the
# `#container` rule may be unused - confirm before removing.
css = """
#container{
margin: 0 auto;
max-width: 80rem;
}
#intro{
max-width: 100%;
text-align: center;
margin: 0 auto;
}
"""
# Gradio blocks demo

# Markdown for the "Datasets and models details" accordion.
_DETAILS_MD = """
For each language, we used 100 to 150 samples of a single speaker to finetune the model.
### Spanish
* **Model**: [Spanish MMS TTS](https://huggingface.co/facebook/mms-tts-spa).
* **Datasets**:
- [Chilean Spanish TTS dataset](https://huggingface.co/datasets/ylacombe/google-chilean-spanish).
### Tamil
* **Model**: [Tamil MMS TTS](https://huggingface.co/facebook/mms-tts-tam).
* **Datasets**:
- [Tamil TTS dataset](https://huggingface.co/datasets/ylacombe/google-tamil).
### Gujarati
* **Model**: [Gujarati MMS TTS](https://huggingface.co/facebook/mms-tts-guj).
* **Datasets**:
- [Gujarati TTS dataset](https://huggingface.co/datasets/ylacombe/google-gujarati).
### Marathi
* **Model**: [Marathi MMS TTS](https://huggingface.co/facebook/mms-tts-mar).
* **Datasets**:
- [Marathi TTS dataset](https://huggingface.co/datasets/ylacombe/google-chilean-marathi).
### English
* **Model**: [VITS-ljs](https://huggingface.co/kakao-enterprise/vits-ljs)
* **Dataset**: [British Isles Accent](https://huggingface.co/datasets/ylacombe/english_dialects). For each accent, we used 100 to 150 samples of a single speaker to finetune [VITS-ljs](https://huggingface.co/kakao-enterprise/vits-ljs).
"""

# Markdown for the "Run VITS and MMS with transformers" accordion.
_USAGE_MD = """
```bash
pip install transformers
```
```py
from transformers import pipeline
import scipy
pipe = pipeline("text-to-speech", model="kakao-enterprise/vits-ljs", device=0)
results = pipe("A cinematic shot of a baby racoon wearing an intricate italian priest robe")
# write to a wav file
scipy.io.wavfile.write("audio_vits.wav", rate=results["sampling_rate"], data=results["audio"].squeeze())
```
"""


def _model_dropdown_for(language):
    """Build the "Model" dropdown update listing the checkpoints for `language`."""
    available = models_per_language[language]
    return gr.Dropdown(
        available,
        value=available[0],
        label="Model",
        info="Model you want to test",
    )


with gr.Blocks(css=css) as demo_blocks:
    gr.Markdown(title, elem_id="intro")

    with gr.Row():
        # Left column: text input, trigger button and the two selectors.
        with gr.Column():
            inp_text = gr.Textbox(
                label="Input Text",
                info="What sentence would you like to synthesise?",
            )
            btn = gr.Button("Generate Audio!")
            language = gr.Dropdown(
                default_model_per_language.keys(),
                value="marathi",
                label="language",
                info="Language that you want to test",
            )
            model_id = gr.Dropdown(
                models_per_language["marathi"],
                value="ylacombe/mms-mar-finetuned-monospeaker",
                label="Model",
                info="Model you want to test",
            )

        # Right column: `max_speakers` audio players, hidden until filled in.
        with gr.Column():
            outputs = [
                gr.Audio(
                    type="numpy",
                    autoplay=False,
                    label=f"Generated Audio - speaker {slot}",
                    show_label=True,
                    visible=False,
                )
                for slot in range(max_speakers)
            ]

    with gr.Accordion("Datasets and models details", open=False):
        gr.Markdown(_DETAILS_MD)

    with gr.Accordion("Run VITS and MMS with transformers", open=False):
        gr.Markdown(_USAGE_MD)

    # Changing the language refreshes the list of selectable models.
    language.change(_model_dropdown_for, language, model_id)
    btn.click(generate_audio, [inp_text, model_id, language], outputs)

demo_blocks.queue().launch()