fawadrashid committed on
Commit
93c7859
1 Parent(s): 0419a2b

Upload 3 files

Files changed (3)
  1. Dockerfile +29 -0
  2. app.py +190 -0
  3. requirements.txt +9 -0
Dockerfile ADDED
@@ -0,0 +1,29 @@
+ FROM python:3.9-slim
+
+
+ RUN useradd -m -u 1000 user
+ USER user
+ ENV HOME=/home/user \
+     PATH=/home/user/.local/bin:$PATH
+ WORKDIR $HOME/app
+
+ COPY --chown=user . $HOME/app
+ COPY --chown=user ./requirements.txt $HOME/app/requirements.txt
+
+ USER root
+ RUN rm -vf /var/lib/apt/lists/*
+ RUN apt-get clean
+ RUN apt-get update
+ RUN apt-get upgrade -y
+ RUN apt-get install -y wget zip unzip uvicorn espeak-ng  # espeak-ng backs phonemizer at runtime
+ USER user
+ COPY --chown=user . .
+ USER root
+ RUN chmod 777 $HOME/app/*
+ USER user
+
+ RUN pip3 install -r requirements.txt
+
+ EXPOSE 7860
+
+ CMD ["python", "app.py"]
app.py ADDED
@@ -0,0 +1,190 @@
+ import torch
+
+ from transformers import pipeline
+
+ import numpy as np
+ import gradio as gr
+
+ def _grab_best_device(use_gpu=False):
+     if torch.cuda.device_count() > 0 and use_gpu:
+         device = "cuda"
+     else:
+         device = "cpu"
+     return device
+
+ device = _grab_best_device()
+
+ default_model_per_language = {
+     "english": "kakao-enterprise/vits-ljs",
+     "spanish": "facebook/mms-tts-spa",
+ }
+
+ models_per_language = {
+     "english": [
+         "ylacombe/vits_ljs_midlands_male_monospeaker",
+     ],
+     "spanish": [
+         "ylacombe/mms-spa-finetuned-chilean-monospeaker",
+     ]
+ }
+
+ HUB_PATH = "ylacombe/vits_ljs_midlands_male_monospeaker"
+
+ # keep the finetuned pipeline and its non-finetuned original side by side
+ pipe_dict = {
+     "current_model": "ylacombe/vits_ljs_midlands_male_monospeaker",
+     "pipe": pipeline("text-to-speech", model=HUB_PATH, device=device),
+     "original_pipe": pipeline("text-to-speech", model=default_model_per_language["english"], device=device),
+     "language": "english",
+ }
+
+ title = """
+ # Explore MMS finetuning
+ ## Or how to access truly multilingual TTS
+
+ Massively Multilingual Speech (MMS) models are light-weight, low-latency TTS models based on the [VITS architecture](https://huggingface.co/docs/transformers/model_doc/vits).
+
+ Meta's [MMS](https://arxiv.org/abs/2305.13516) project aims to provide speech technology across a diverse range of languages. You can find more details about the supported languages and their ISO 639-3 codes in the [MMS Language Coverage Overview](https://dl.fbaipublicfiles.com/mms/misc/language_coverage_mms.html),
+ and see all MMS-TTS checkpoints on the Hugging Face Hub: [facebook/mms-tts](https://huggingface.co/models?sort=trending&search=facebook%2Fmms-tts).
+
+ Coupled with the right data and the right training recipe, you can get an excellent finetuned version of any MMS checkpoint in **20 minutes** with as little as **80 to 150 samples**.
+
+ The training recipe is available in this [GitHub repository](https://github.com/ylacombe/finetune-hf-vits)!
+ """
+
+ max_speakers = 15
+
+
+ # Inference
+ def generate_audio(text, model_id, language):
+
+     if pipe_dict["language"] != language:
+         gr.Warning(f"Language has changed - loading new default model: {default_model_per_language[language]}")
+         pipe_dict["language"] = language
+         pipe_dict["original_pipe"] = pipeline("text-to-speech", model=default_model_per_language[language], device=device)
+
+     if pipe_dict["current_model"] != model_id:
+         gr.Warning("Model has changed - loading new model")
+         pipe_dict["pipe"] = pipeline("text-to-speech", model=model_id, device=device)
+         pipe_dict["current_model"] = model_id
+
+     num_speakers = pipe_dict["pipe"].model.config.num_speakers
+
+     out = []
+     # first generate the original, non-finetuned model's result
+     output = pipe_dict["original_pipe"](text)
+     output = gr.Audio(value=(output["sampling_rate"], output["audio"].squeeze()), type="numpy", autoplay=False, label=f"Non finetuned model prediction {default_model_per_language[language]}", show_label=True,
+                       visible=True)
+     out.append(output)
+
+     # pad with hidden widgets so len(out) always equals max_speakers
+     if num_speakers > 1:
+         for i in range(min(num_speakers, max_speakers - 1)):
+             forward_params = {"speaker_id": i}
+             output = pipe_dict["pipe"](text, forward_params=forward_params)
+
+             output = gr.Audio(value=(output["sampling_rate"], output["audio"].squeeze()), type="numpy", autoplay=False, label=f"Generated Audio - speaker {i}", show_label=True,
+                               visible=True)
+             out.append(output)
+         out.extend([gr.Audio(visible=False)] * (max_speakers - 1 - num_speakers))
+     else:
+         output = pipe_dict["pipe"](text)
+         output = gr.Audio(value=(output["sampling_rate"], output["audio"].squeeze()), type="numpy", autoplay=False, label="Generated Audio - Mono speaker", show_label=True,
+                           visible=True)
+         out.append(output)
+         out.extend([gr.Audio(visible=False)] * (max_speakers - 2))
+     return out
+
+
+ css = """
+ #container{
+     margin: 0 auto;
+     max-width: 80rem;
+ }
+ #intro{
+     max-width: 100%;
+     text-align: center;
+     margin: 0 auto;
+ }
+ """
+ # Gradio blocks demo
+ with gr.Blocks(css=css) as demo_blocks:
+     gr.Markdown(title, elem_id="intro")
+
+     with gr.Row():
+         with gr.Column():
+             inp_text = gr.Textbox(label="Input Text", info="What sentence would you like to synthesise?")
+             btn = gr.Button("Generate Audio!")
+             language = gr.Dropdown(
+                 list(default_model_per_language.keys()),
+                 value="spanish",
+                 label="language",
+                 info="Language that you want to test",
+             )
+
+             model_id = gr.Dropdown(
+                 models_per_language["spanish"],
+                 value="ylacombe/mms-spa-finetuned-chilean-monospeaker",
+                 label="Model",
+                 info="Model you want to test",
+             )
+
+         with gr.Column():
+             outputs = []
+             for i in range(max_speakers):
+                 out_audio = gr.Audio(type="numpy", autoplay=False, label=f"Generated Audio - speaker {i}", show_label=True, visible=False)
+                 outputs.append(out_audio)
+
+     with gr.Accordion("Datasets and models details", open=False):
+         gr.Markdown("""
+
+ For each language, we used 100 to 150 samples of a single speaker to finetune the model.
+
+ ### Spanish
+
+ * **Model**: [Spanish MMS TTS](https://huggingface.co/facebook/mms-tts-spa).
+ * **Datasets**:
+   - [Chilean Spanish TTS dataset](https://huggingface.co/datasets/ylacombe/google-chilean-spanish).
+
+ ### English
+
+ * **Model**: [VITS-ljs](https://huggingface.co/kakao-enterprise/vits-ljs)
+ * **Dataset**: [British Isles Accent](https://huggingface.co/datasets/ylacombe/english_dialects). For each accent, we used 100 to 150 samples of a single speaker to finetune [VITS-ljs](https://huggingface.co/kakao-enterprise/vits-ljs).
+
+
+ """)
+
+     with gr.Accordion("Run VITS and MMS with transformers", open=False):
+         gr.Markdown(
+             """
+ ```bash
+ pip install transformers
+ ```
+ ```py
+ from transformers import pipeline
+ import scipy
+ pipe = pipeline("text-to-speech", model="kakao-enterprise/vits-ljs", device=0)
+
+ results = pipe("A cinematic shot of a baby racoon wearing an intricate italian priest robe")
+
+ # write to a wav file
+ scipy.io.wavfile.write("audio_vits.wav", rate=results["sampling_rate"], data=results["audio"].squeeze())
+ ```
+ """
+         )
+
+     # swap the model dropdown to the selected language's checkpoints
+     language.change(lambda language: gr.Dropdown(
+         models_per_language[language],
+         value=models_per_language[language][0],
+         label="Model",
+         info="Model you want to test",
+     ),
+         language,
+         model_id,
+     )
+
+     btn.click(generate_audio, [inp_text, model_id, language], outputs)
+
+
+ demo_blocks.queue().launch()
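Stripped of the Gradio plumbing, generate_audio above boils down to two pipeline calls: one to the non-finetuned reference model and one to the finetuned checkpoint, with `forward_params={"speaker_id": i}` selecting a voice on multi-speaker checkpoints. A standalone sketch of that core, reusing the model IDs hard-coded in app.py:

```py
# Minimal sketch of what generate_audio does per request, without Gradio.
from transformers import pipeline

original = pipeline("text-to-speech", model="facebook/mms-tts-spa", device="cpu")
finetuned = pipeline("text-to-speech", model="ylacombe/mms-spa-finetuned-chilean-monospeaker", device="cpu")

text = "Hola, ¿cómo estás?"
ref = original(text)   # dict with "audio" (numpy array) and "sampling_rate"
out = finetuned(text)

# Multi-speaker checkpoints take a speaker index through forward_params;
# this checkpoint is mono-speaker, so the branch is skipped.
if finetuned.model.config.num_speakers > 1:
    out = finetuned(text, forward_params={"speaker_id": 0})

print(ref["sampling_rate"], ref["audio"].shape, out["audio"].shape)
```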
requirements.txt ADDED
@@ -0,0 +1,9 @@
+ opencv-python-headless<4.3
+ gradio
+ torch
+ torchaudio
+ transformers
+ ffmpeg
+ librosa
+ phonemizer
+ py-espeak-ng