rrg92 committed on
Commit
c56f0a9
1 Parent(s): 8581811

Changed coqui to the forked (and updated) version from idiap/coqui-ai-TTS

Browse files
Files changed (5) hide show
  1. Dockerfile +5 -2
  2. README.md +2 -2
  3. app.py +2 -2
  4. requirements.txt +6 -12
  5. xtts.py +1 -7
Dockerfile CHANGED
@@ -7,14 +7,17 @@ RUN apt-get update && \
7
 
8
  WORKDIR /app
9
  COPY requirements.txt .
10
- RUN python -m pip install -r requirements.txt \
11
- && python -m pip cache purge
12
 
13
  RUN python -m unidic download
14
  RUN mkdir -p /app/tts_models
15
 
 
16
  RUN python -m pip install spaces
17
 
 
 
18
  COPY xtts.py .
19
  COPY app.py .
20
 
 
7
 
8
  WORKDIR /app
9
  COPY requirements.txt .
10
+ RUN python -m pip install --verbose -r requirements.txt
11
+ RUN python -m pip cache purge
12
 
13
  RUN python -m unidic download
14
  RUN mkdir -p /app/tts_models
15
 
16
+ RUN python -m pip install gradio==5.5
17
  RUN python -m pip install spaces
18
 
19
+
20
+
21
  COPY xtts.py .
22
  COPY app.py .
23
 
README.md CHANGED
@@ -4,10 +4,10 @@ emoji: ⚡
4
  colorFrom: red
5
  colorTo: green
6
  sdk: gradio
7
- sdk_version: 4.44.0
8
  app_file: app.py
9
  pinned: false
10
- startup_duration_timeout: 5h
11
  ---
12
 
13
  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
4
  colorFrom: red
5
  colorTo: green
6
  sdk: gradio
7
+ sdk_version: 5.5
8
  app_file: app.py
9
  pinned: false
10
+ startup_duration_timeout: 4h
11
  ---
12
 
13
  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py CHANGED
@@ -319,7 +319,7 @@ with gr.Blocks(js=js) as demo:
319
  speed = gr.Slider(0.00, 1000.00, 1.0, step=0.1, label="Speed", info="Speed (0 to 1000)")
320
  with gr.Column() as col2:
321
  with gr.Row():
322
- text = gr.Textbox(label="text",lines=4, value="Customizado por IA Talking, o maior blog de Inteligência Artificial do Brasil!")
323
  pronunc = gr.Textbox(label="Pronunciation Fix", info="Fix words pronuncation using WORD = SPEAK",lines=4)
324
  with gr.Row():
325
  lang = gr.Dropdown(label="Language", choices=LANUGAGES, value="pt")
@@ -336,7 +336,7 @@ with gr.Blocks(js=js) as demo:
336
 
337
  AudioList = gr.Dropdown(
338
  label="Generated Audios",
339
- choices=['a','b']
340
  ,interactive=True
341
  )
342
 
 
319
  speed = gr.Slider(0.00, 1000.00, 1.0, step=0.1, label="Speed", info="Speed (0 to 1000)")
320
  with gr.Column() as col2:
321
  with gr.Row():
322
+ text = gr.Textbox(label="Text", info="Generate multiple audios separating lines with ---",lines=4, value="Customizado por IA Talking, o maior blog de Inteligência Artificial do Brasil!")
323
  pronunc = gr.Textbox(label="Pronunciation Fix", info="Fix words pronuncation using WORD = SPEAK",lines=4)
324
  with gr.Row():
325
  lang = gr.Dropdown(label="Language", choices=LANUGAGES, value="pt")
 
336
 
337
  AudioList = gr.Dropdown(
338
  label="Generated Audios",
339
+ choices=[]
340
  ,interactive=True
341
  )
342
 
requirements.txt CHANGED
@@ -1,17 +1,11 @@
1
- torch==2.1.0
2
- torchvision==0.16.0
3
- torchaudio==2.1.0
4
- gradio==4.44
5
- numpy==1.22.0
6
- TTS @ git+https://github.com/coqui-ai/TTS@fa28f99f1508b5b5366539b2149963edcb80ba62
7
  uvicorn[standard]==0.23.2
8
- deepspeed==0.15.1
9
- pydantic==2.9.0
10
- python-multipart==0.0.9
11
  typing-extensions>=4.8.0
12
- cutlet==0.4.0
13
  mecab-python3==1.0.6
14
  unidic-lite==1.0.8
15
  unidic==1.1.0
16
-
17
-
 
1
+ # TTS @ git+https://github.com/coqui-ai/TTS@fa28f99f1508b5b5366539b2149963edcb80ba62
2
+ coqui-tts
 
 
 
 
3
  uvicorn[standard]==0.23.2
4
+ deepspeed
5
+ pydantic
6
+ python-multipart==0.0.6
7
  typing-extensions>=4.8.0
8
+ cutlet
9
  mecab-python3==1.0.6
10
  unidic-lite==1.0.8
11
  unidic==1.1.0
 
 
xtts.py CHANGED
@@ -11,7 +11,7 @@ import spaces
11
 
12
  from TTS.tts.configs.xtts_config import XttsConfig
13
  from TTS.tts.models.xtts import Xtts
14
- from TTS.utils.generic_utils import get_user_data_dir
15
  from TTS.utils.manage import ModelManager
16
 
17
  os.environ["COQUI_TOS_AGREED"] = "1"
@@ -151,12 +151,6 @@ def predict_speech(parsed_input: TTSInputs):
151
  speaker_embedding = torch.tensor(parsed_input.speaker_embedding).unsqueeze(0).unsqueeze(-1)
152
  gpt_cond_latent = torch.tensor(parsed_input.gpt_cond_latent).reshape((-1, 1024)).unsqueeze(0)
153
 
154
- print("speaker embedding")
155
- print(speaker_embedding)
156
-
157
- print("latent")
158
- print(gpt_cond_latent)
159
-
160
  text = parsed_input.text
161
  language = parsed_input.language
162
  temperature = parsed_input.temperature
 
11
 
12
  from TTS.tts.configs.xtts_config import XttsConfig
13
  from TTS.tts.models.xtts import Xtts
14
+ from trainer.io import get_user_data_dir
15
  from TTS.utils.manage import ModelManager
16
 
17
  os.environ["COQUI_TOS_AGREED"] = "1"
 
151
  speaker_embedding = torch.tensor(parsed_input.speaker_embedding).unsqueeze(0).unsqueeze(-1)
152
  gpt_cond_latent = torch.tensor(parsed_input.gpt_cond_latent).reshape((-1, 1024)).unsqueeze(0)
153
 
 
 
 
 
 
 
154
  text = parsed_input.text
155
  language = parsed_input.language
156
  temperature = parsed_input.temperature