fawadrashid committed
Commit e585f50
1 Parent(s): 536a548

Upload 3 files

Files changed (3)
  1. Dockerfile +0 -2
  2. app.py +21 -184
  3. requirements.txt +3 -5
Dockerfile CHANGED
@@ -24,6 +24,4 @@ USER user
 
 RUN pip3 install -r requirements.txt
 
-EXPOSE 7860
-
 CMD ["python", "app.py"]
app.py CHANGED
@@ -1,190 +1,27 @@
-import torch
-
-from transformers import pipeline
-
-import numpy as np
+import os
 import gradio as gr
+from helper import load_image_from_url, render_results_in_image
+from transformers import pipeline
+from transformers.utils import logging
+logging.set_verbosity_error()
 
-def _grab_best_device(use_gpu=False):
-    if torch.cuda.device_count() > 0 and use_gpu:
-        device = "cuda"
-    else:
-        device = "cpu"
-    return device
-
-device = _grab_best_device()
-
-default_model_per_language = {
-    "english": "kakao-enterprise/vits-ljs",
-    "spanish": "facebook/mms-tts-spa",
-}
-
-models_per_language = {
-    "english": [
-        "ylacombe/vits_ljs_midlands_male_monospeaker",
-    ],
-    "spanish": [
-        "ylacombe/mms-spa-finetuned-chilean-monospeaker",
-    ]
-}
-
-HUB_PATH = "ylacombe/vits_ljs_midlands_male_monospeaker"
-
-
-pipe_dict = {
-    "current_model": "ylacombe/vits_ljs_midlands_male_monospeaker",
-    "pipe": pipeline("text-to-speech", model=HUB_PATH, device=device),
-    "original_pipe": pipeline("text-to-speech", model=default_model_per_language["english"], device=device),
-    "language": "english",
-}
-
-title = """
-# Explore MMS finetuning
-## Or how to access truely multilingual TTS
-
-Massively Multilingual Speech (MMS) models are light-weight, low-latency TTS models based on the [VITS architecture](https://huggingface.co/docs/transformers/model_doc/vits).
-
-Meta's [MMS](https://arxiv.org/abs/2305.13516) project, aiming to provide speech technology across a diverse range of languages. You can find more details about the supported languages and their ISO 639-3 codes in the [MMS Language Coverage Overview](https://dl.fbaipublicfiles.com/mms/misc/language_coverage_mms.html),
-and see all MMS-TTS checkpoints on the Hugging Face Hub: [facebook/mms-tts](https://huggingface.co/models?sort=trending&search=facebook%2Fmms-tts).
-
-Coupled with the right data and the right training recipe, you can get an excellent finetuned version of every MMS checkpoints in **20 minutes** with as little as **80 to 150 samples**.
-
-Training recipe available in this [github repository](https://github.com/ylacombe/finetune-hf-vits)!
-"""
-
-max_speakers = 15
-
-
-# Inference
-def generate_audio(text, model_id, language):
-
-    if pipe_dict["language"] != language:
-        gr.Warning(f"Language has changed - loading new default model: {default_model_per_language[language]}")
-        pipe_dict["language"] = language
-        pipe_dict["original_pipe"] = pipeline("text-to-speech", model=default_model_per_language[language], device=device)
-
-    if pipe_dict["current_model"] != model_id:
-        gr.Warning("Model has changed - loading new model")
-        pipe_dict["pipe"] = pipeline("text-to-speech", model=model_id, device=device)
-        pipe_dict["current_model"] = model_id
-
-    num_speakers = pipe_dict["pipe"].model.config.num_speakers
-
-    out = []
-    # first generate original model result
-    output = pipe_dict["original_pipe"](text)
-    output = gr.Audio(value=(output["sampling_rate"], output["audio"].squeeze()), type="numpy", autoplay=False, label=f"Non finetuned model prediction {default_model_per_language[language]}", show_label=True,
-                      visible=True)
-    out.append(output)
-
-
-    if num_speakers > 1:
-        for i in range(min(num_speakers, max_speakers - 1)):
-            forward_params = {"speaker_id": i}
-            output = pipe_dict["pipe"](text, forward_params=forward_params)
-
-            output = gr.Audio(value=(output["sampling_rate"], output["audio"].squeeze()), type="numpy", autoplay=False, label=f"Generated Audio - speaker {i}", show_label=True,
-                              visible=True)
-            out.append(output)
-        out.extend([gr.Audio(visible=False)] * (max_speakers - num_speakers))
-    else:
-        output = pipe_dict["pipe"](text)
-        output = gr.Audio(value=(output["sampling_rate"], output["audio"].squeeze()), type="numpy", autoplay=False, label="Generated Audio - Mono speaker", show_label=True,
-                          visible=True)
-        out.append(output)
-        out.extend([gr.Audio(visible=False)] * (max_speakers - 2))
-    return out
-
-
-css = """
-#container{
-    margin: 0 auto;
-    max-width: 80rem;
-}
-#intro{
-    max-width: 100%;
-    text-align: center;
-    margin: 0 auto;
-}
-"""
-# Gradio blocks demo
-with gr.Blocks(css=css) as demo_blocks:
-    gr.Markdown(title, elem_id="intro")
-
-    with gr.Row():
-        with gr.Column():
-            inp_text = gr.Textbox(label="Input Text", info="What sentence would you like to synthesise?")
-            btn = gr.Button("Generate Audio!")
-            language = gr.Dropdown(
-                default_model_per_language.keys(),
-                value="spanish",
-                label="language",
-                info="Language that you want to test"
-            )
-
-            model_id = gr.Dropdown(
-                models_per_language["spanish"],
-                value="ylacombe/mms-spa-finetuned-chilean-monospeaker",
-                label="Model",
-                info="Model you want to test",
-            )
-
-        with gr.Column():
-            outputs = []
-            for i in range(max_speakers):
-                out_audio = gr.Audio(type="numpy", autoplay=False, label=f"Generated Audio - speaker {i}", show_label=True, visible=False)
-                outputs.append(out_audio)
-
-    with gr.Accordion("Datasets and models details", open=False):
-        gr.Markdown("""
-
-For each language, we used 100 to 150 samples of a single speaker to finetune the model.
-
-### Spanish
-
-* **Model**: [Spanish MMS TTS](https://huggingface.co/facebook/mms-tts-spa).
-* **Datasets**:
-  - [Chilean Spanish TTS dataset](https://huggingface.co/datasets/ylacombe/google-chilean-spanish).
-
-### English
-
-* **Model**: [VITS-ljs](https://huggingface.co/kakao-enterprise/vits-ljs)
-* **Dataset**: [British Isles Accent](https://huggingface.co/datasets/ylacombe/english_dialects). For each accent, we used 100 to 150 samples of a single speaker to finetune [VITS-ljs](https://huggingface.co/kakao-enterprise/vits-ljs).
-
-""")
-
-    with gr.Accordion("Run VITS and MMS with transformers", open=False):
-        gr.Markdown(
-            """
-            ```bash
-            pip install transformers
-            ```
-            ```py
-            from transformers import pipeline
-            import scipy
-            pipe = pipeline("text-to-speech", model="kakao-enterprise/vits-ljs", device=0)
-
-            results = pipe("A cinematic shot of a baby racoon wearing an intricate italian priest robe")
-
-            # write to a wav file
-            scipy.io.wavfile.write("audio_vits.wav", rate=results["sampling_rate"], data=results["audio"].squeeze())
-            ```
-            """
-        )
 
 
-    language.change(lambda language: gr.Dropdown(
-            models_per_language[language],
-            value=models_per_language[language][0],
-            label="Model",
-            info="Model you want to test",
-        ),
-        language,
-        model_id
-    )
 
-    btn.click(generate_audio, [inp_text, model_id, language], outputs)
 
-
-demo_blocks.queue().launch()
+
+od_pipe = pipeline("object-detection", "./models/facebook/detr-resnet-50")
+
+def get_pipeline_prediction(pil_image):
+    # Run object detection, then draw the predicted boxes on the input image.
+    pipeline_output = od_pipe(pil_image)
+    processed_image = render_results_in_image(pil_image, pipeline_output)
+    return processed_image
+
+demo = gr.Interface(
+    fn=get_pipeline_prediction,
+    inputs=gr.Image(label="Input image", type="pil"),
+    outputs=gr.Image(label="Output image with predicted instances", type="pil")
+)
+
+demo.queue().launch(server_name="0.0.0.0", server_port=7860)
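The new app.py imports `load_image_from_url` and `render_results_in_image` from a local `helper` module that is not included in this commit. Below is a minimal sketch of what such a module might look like, assuming the standard output of the transformers object-detection pipeline (a list of dicts with `score`, `label`, and `box` keys); only the two function names come from the import in app.py, the bodies are assumptions.

```py
# helper.py - hypothetical sketch, not part of this commit
import requests
from PIL import Image, ImageDraw


def load_image_from_url(url):
    # Fetch an image over HTTP and return it as an RGB PIL image.
    return Image.open(requests.get(url, stream=True).raw).convert("RGB")


def render_results_in_image(pil_image, pipeline_output):
    # Draw one labelled bounding box per detection returned by the
    # transformers object-detection pipeline.
    image = pil_image.copy()
    draw = ImageDraw.Draw(image)
    for detection in pipeline_output:
        box = detection["box"]
        label = f"{detection['label']}: {detection['score']:.2f}"
        draw.rectangle(
            (box["xmin"], box["ymin"], box["xmax"], box["ymax"]),
            outline="red",
            width=3,
        )
        draw.text((box["xmin"], max(box["ymin"] - 12, 0)), label, fill="red")
    return image
```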
requirements.txt CHANGED
@@ -1,9 +1,7 @@
 opencv-python-headless<4.3
 gradio
-torch
-torchaudio
 transformers
-ffmpeg
-librosa
 phonemizer
-py-espeak-ng
+py-espeak-ng
+inflect
+timm
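The dependency swap follows the switch from TTS to object detection: the torch audio stack is dropped and `timm` is added, which transformers needs to build the convolutional backbone of DETR checkpoints such as facebook/detr-resnet-50. A quick way to sanity-check the updated requirements is the snippet below; the Hub checkpoint id and test image URL are illustrative assumptions (the app itself loads the model from a local `./models` path).

```py
# Smoke test for the new dependencies: needs transformers and timm
# (both listed in the updated requirements.txt).
from transformers import pipeline

od_pipe = pipeline("object-detection", model="facebook/detr-resnet-50")

# Any local path, URL, or PIL image works as input; this COCO image is just an example.
predictions = od_pipe("http://images.cocodataset.org/val2017/000000039769.jpg")
for p in predictions:
    print(p["label"], round(p["score"], 3), p["box"])
```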