Spaces: Running on Zero
Changed coqui to forked (and updated) version from idiap/coqui-ai-TTS
Browse files
- Dockerfile +5 -2
- README.md +2 -2
- app.py +2 -2
- requirements.txt +6 -12
- xtts.py +1 -7
Dockerfile
CHANGED
@@ -7,14 +7,17 @@ RUN apt-get update && \
|
|
7 |
|
8 |
WORKDIR /app
|
9 |
COPY requirements.txt .
|
10 |
-
RUN python -m pip install -r requirements.txt
|
11 |
-
|
12 |
|
13 |
RUN python -m unidic download
|
14 |
RUN mkdir -p /app/tts_models
|
15 |
|
|
|
16 |
RUN python -m pip install spaces
|
17 |
|
|
|
|
|
18 |
COPY xtts.py .
|
19 |
COPY app.py .
|
20 |
|
|
|
7 |
|
8 |
WORKDIR /app
|
9 |
COPY requirements.txt .
|
10 |
+
RUN python -m pip install --verbose -r requirements.txt
|
11 |
+
RUN python -m pip cache purge
|
12 |
|
13 |
RUN python -m unidic download
|
14 |
RUN mkdir -p /app/tts_models
|
15 |
|
16 |
+
RUN python -m pip install gradio==5.5
|
17 |
RUN python -m pip install spaces
|
18 |
|
19 |
+
|
20 |
+
|
21 |
COPY xtts.py .
|
22 |
COPY app.py .
|
23 |
|
README.md
CHANGED
@@ -4,10 +4,10 @@ emoji: ⚡
|
|
4 |
colorFrom: red
|
5 |
colorTo: green
|
6 |
sdk: gradio
|
7 |
-
sdk_version:
|
8 |
app_file: app.py
|
9 |
pinned: false
|
10 |
-
startup_duration_timeout:
|
11 |
---
|
12 |
|
13 |
Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
|
|
|
4 |
colorFrom: red
|
5 |
colorTo: green
|
6 |
sdk: gradio
|
7 |
+
sdk_version: 5.5
|
8 |
app_file: app.py
|
9 |
pinned: false
|
10 |
+
startup_duration_timeout: 4h
|
11 |
---
|
12 |
|
13 |
Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
|
app.py
CHANGED
@@ -319,7 +319,7 @@ with gr.Blocks(js=js) as demo:
|
|
319 |
speed = gr.Slider(0.00, 1000.00, 1.0, step=0.1, label="Speed", info="Speed (0 to 1000)")
|
320 |
with gr.Column() as col2:
|
321 |
with gr.Row():
|
322 |
-
text = gr.Textbox(label="
|
323 |
pronunc = gr.Textbox(label="Pronunciation Fix", info="Fix words pronuncation using WORD = SPEAK",lines=4)
|
324 |
with gr.Row():
|
325 |
lang = gr.Dropdown(label="Language", choices=LANUGAGES, value="pt")
|
@@ -336,7 +336,7 @@ with gr.Blocks(js=js) as demo:
|
|
336 |
|
337 |
AudioList = gr.Dropdown(
|
338 |
label="Generated Audios",
|
339 |
-
choices=[
|
340 |
,interactive=True
|
341 |
)
|
342 |
|
|
|
319 |
speed = gr.Slider(0.00, 1000.00, 1.0, step=0.1, label="Speed", info="Speed (0 to 1000)")
|
320 |
with gr.Column() as col2:
|
321 |
with gr.Row():
|
322 |
+
text = gr.Textbox(label="Text", info="Generate multiple audios separating lines with ---",lines=4, value="Customizado por IA Talking, o maior blog de Inteligência Artificial do Brasil!")
|
323 |
pronunc = gr.Textbox(label="Pronunciation Fix", info="Fix words pronuncation using WORD = SPEAK",lines=4)
|
324 |
with gr.Row():
|
325 |
lang = gr.Dropdown(label="Language", choices=LANUGAGES, value="pt")
|
|
|
336 |
|
337 |
AudioList = gr.Dropdown(
|
338 |
label="Generated Audios",
|
339 |
+
choices=[]
|
340 |
,interactive=True
|
341 |
)
|
342 |
|
requirements.txt
CHANGED
@@ -1,17 +1,11 @@
|
|
1 |
-
|
2 |
-
|
3 |
-
torchaudio==2.1.0
|
4 |
-
gradio==4.44
|
5 |
-
numpy==1.22.0
|
6 |
-
TTS @ git+https://github.com/coqui-ai/TTS@fa28f99f1508b5b5366539b2149963edcb80ba62
|
7 |
uvicorn[standard]==0.23.2
|
8 |
-
deepspeed
|
9 |
-
pydantic
|
10 |
-
python-multipart==0.0.
|
11 |
typing-extensions>=4.8.0
|
12 |
-
cutlet
|
13 |
mecab-python3==1.0.6
|
14 |
unidic-lite==1.0.8
|
15 |
unidic==1.1.0
|
16 |
-
|
17 |
-
|
|
|
1 |
+
# TTS @ git+https://github.com/coqui-ai/TTS@fa28f99f1508b5b5366539b2149963edcb80ba62
|
2 |
+
coqui-tts
|
|
|
|
|
|
|
|
|
3 |
uvicorn[standard]==0.23.2
|
4 |
+
deepspeed
|
5 |
+
pydantic
|
6 |
+
python-multipart==0.0.6
|
7 |
typing-extensions>=4.8.0
|
8 |
+
cutlet
|
9 |
mecab-python3==1.0.6
|
10 |
unidic-lite==1.0.8
|
11 |
unidic==1.1.0
|
|
|
|
xtts.py
CHANGED
@@ -11,7 +11,7 @@ import spaces
|
|
11 |
|
12 |
from TTS.tts.configs.xtts_config import XttsConfig
|
13 |
from TTS.tts.models.xtts import Xtts
|
14 |
-
from
|
15 |
from TTS.utils.manage import ModelManager
|
16 |
|
17 |
os.environ["COQUI_TOS_AGREED"] = "1"
|
@@ -151,12 +151,6 @@ def predict_speech(parsed_input: TTSInputs):
|
|
151 |
speaker_embedding = torch.tensor(parsed_input.speaker_embedding).unsqueeze(0).unsqueeze(-1)
|
152 |
gpt_cond_latent = torch.tensor(parsed_input.gpt_cond_latent).reshape((-1, 1024)).unsqueeze(0)
|
153 |
|
154 |
-
print("speaker embedding")
|
155 |
-
print(speaker_embedding)
|
156 |
-
|
157 |
-
print("latent")
|
158 |
-
print(gpt_cond_latent)
|
159 |
-
|
160 |
text = parsed_input.text
|
161 |
language = parsed_input.language
|
162 |
temperature = parsed_input.temperature
|
|
|
11 |
|
12 |
from TTS.tts.configs.xtts_config import XttsConfig
|
13 |
from TTS.tts.models.xtts import Xtts
|
14 |
+
from trainer.io import get_user_data_dir
|
15 |
from TTS.utils.manage import ModelManager
|
16 |
|
17 |
os.environ["COQUI_TOS_AGREED"] = "1"
|
|
|
151 |
speaker_embedding = torch.tensor(parsed_input.speaker_embedding).unsqueeze(0).unsqueeze(-1)
|
152 |
gpt_cond_latent = torch.tensor(parsed_input.gpt_cond_latent).reshape((-1, 1024)).unsqueeze(0)
|
153 |
|
|
|
|
|
|
|
|
|
|
|
|
|
154 |
text = parsed_input.text
|
155 |
language = parsed_input.language
|
156 |
temperature = parsed_input.temperature
|