rrg92 committed on
Commit
c56f0a9
1 Parent(s): 8581811

Changed coqui to the forked (and updated) version from idiap/coqui-ai-TTS

Browse files
Files changed (5) hide show
  1. Dockerfile +5 -2
  2. README.md +2 -2
  3. app.py +2 -2
  4. requirements.txt +6 -12
  5. xtts.py +1 -7
Dockerfile CHANGED
@@ -7,14 +7,17 @@ RUN apt-get update && \
7
 
8
  WORKDIR /app
9
  COPY requirements.txt .
10
- RUN python -m pip install -r requirements.txt \
11
- && python -m pip cache purge
12
 
13
  RUN python -m unidic download
14
  RUN mkdir -p /app/tts_models
15
 
 
16
  RUN python -m pip install spaces
17
 
 
 
18
  COPY xtts.py .
19
  COPY app.py .
20
 
 
7
 
8
  WORKDIR /app
9
  COPY requirements.txt .
10
+ RUN python -m pip install --verbose -r requirements.txt
11
+ RUN python -m pip cache purge
12
 
13
  RUN python -m unidic download
14
  RUN mkdir -p /app/tts_models
15
 
16
+ RUN python -m pip install gradio==5.5
17
  RUN python -m pip install spaces
18
 
19
+
20
+
21
  COPY xtts.py .
22
  COPY app.py .
23
 
README.md CHANGED
@@ -4,10 +4,10 @@ emoji: ⚡
4
  colorFrom: red
5
  colorTo: green
6
  sdk: gradio
7
- sdk_version: 4.44.0
8
  app_file: app.py
9
  pinned: false
10
- startup_duration_timeout: 5h
11
  ---
12
 
13
  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
4
  colorFrom: red
5
  colorTo: green
6
  sdk: gradio
7
+ sdk_version: 5.5
8
  app_file: app.py
9
  pinned: false
10
+ startup_duration_timeout: 4h
11
  ---
12
 
13
  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py CHANGED
@@ -319,7 +319,7 @@ with gr.Blocks(js=js) as demo:
319
  speed = gr.Slider(0.00, 1000.00, 1.0, step=0.1, label="Speed", info="Speed (0 to 1000)")
320
  with gr.Column() as col2:
321
  with gr.Row():
322
- text = gr.Textbox(label="text",lines=4, value="Customizado por IA Talking, o maior blog de Inteligência Artificial do Brasil!")
323
  pronunc = gr.Textbox(label="Pronunciation Fix", info="Fix words pronuncation using WORD = SPEAK",lines=4)
324
  with gr.Row():
325
  lang = gr.Dropdown(label="Language", choices=LANUGAGES, value="pt")
@@ -336,7 +336,7 @@ with gr.Blocks(js=js) as demo:
336
 
337
  AudioList = gr.Dropdown(
338
  label="Generated Audios",
339
- choices=['a','b']
340
  ,interactive=True
341
  )
342
 
 
319
  speed = gr.Slider(0.00, 1000.00, 1.0, step=0.1, label="Speed", info="Speed (0 to 1000)")
320
  with gr.Column() as col2:
321
  with gr.Row():
322
+ text = gr.Textbox(label="Text", info="Generate multiple audios separating lines with ---",lines=4, value="Customizado por IA Talking, o maior blog de Inteligência Artificial do Brasil!")
323
  pronunc = gr.Textbox(label="Pronunciation Fix", info="Fix words pronuncation using WORD = SPEAK",lines=4)
324
  with gr.Row():
325
  lang = gr.Dropdown(label="Language", choices=LANUGAGES, value="pt")
 
336
 
337
  AudioList = gr.Dropdown(
338
  label="Generated Audios",
339
+ choices=[]
340
  ,interactive=True
341
  )
342
 
requirements.txt CHANGED
@@ -1,17 +1,11 @@
1
- torch==2.1.0
2
- torchvision==0.16.0
3
- torchaudio==2.1.0
4
- gradio==4.44
5
- numpy==1.22.0
6
- TTS @ git+https://github.com/coqui-ai/TTS@fa28f99f1508b5b5366539b2149963edcb80ba62
7
  uvicorn[standard]==0.23.2
8
- deepspeed==0.15.1
9
- pydantic==2.9.0
10
- python-multipart==0.0.9
11
  typing-extensions>=4.8.0
12
- cutlet==0.4.0
13
  mecab-python3==1.0.6
14
  unidic-lite==1.0.8
15
  unidic==1.1.0
16
-
17
-
 
1
+ # TTS @ git+https://github.com/coqui-ai/TTS@fa28f99f1508b5b5366539b2149963edcb80ba62
2
+ coqui-tts
 
 
 
 
3
  uvicorn[standard]==0.23.2
4
+ deepspeed
5
+ pydantic
6
+ python-multipart==0.0.6
7
  typing-extensions>=4.8.0
8
+ cutlet
9
  mecab-python3==1.0.6
10
  unidic-lite==1.0.8
11
  unidic==1.1.0
 
 
xtts.py CHANGED
@@ -11,7 +11,7 @@ import spaces
11
 
12
  from TTS.tts.configs.xtts_config import XttsConfig
13
  from TTS.tts.models.xtts import Xtts
14
- from TTS.utils.generic_utils import get_user_data_dir
15
  from TTS.utils.manage import ModelManager
16
 
17
  os.environ["COQUI_TOS_AGREED"] = "1"
@@ -151,12 +151,6 @@ def predict_speech(parsed_input: TTSInputs):
151
  speaker_embedding = torch.tensor(parsed_input.speaker_embedding).unsqueeze(0).unsqueeze(-1)
152
  gpt_cond_latent = torch.tensor(parsed_input.gpt_cond_latent).reshape((-1, 1024)).unsqueeze(0)
153
 
154
- print("speaker embedding")
155
- print(speaker_embedding)
156
-
157
- print("latent")
158
- print(gpt_cond_latent)
159
-
160
  text = parsed_input.text
161
  language = parsed_input.language
162
  temperature = parsed_input.temperature
 
11
 
12
  from TTS.tts.configs.xtts_config import XttsConfig
13
  from TTS.tts.models.xtts import Xtts
14
+ from trainer.io import get_user_data_dir
15
  from TTS.utils.manage import ModelManager
16
 
17
  os.environ["COQUI_TOS_AGREED"] = "1"
 
151
  speaker_embedding = torch.tensor(parsed_input.speaker_embedding).unsqueeze(0).unsqueeze(-1)
152
  gpt_cond_latent = torch.tensor(parsed_input.gpt_cond_latent).reshape((-1, 1024)).unsqueeze(0)
153
 
 
 
 
 
 
 
154
  text = parsed_input.text
155
  language = parsed_input.language
156
  temperature = parsed_input.temperature