Spaces:

coqui
/

xtts

Running on T4

App Files Community

gorkemgoknar committed on Nov 7, 2023

Commit

96324d6

1 Parent(s): e5753d7

xtts v2 with silence fix

Browse files

Files changed (1) hide show

app.py +30 -16

app.py CHANGED Viewed

@@ -44,10 +44,10 @@ st = os.stat("ffmpeg")
 os.chmod("ffmpeg", st.st_mode | stat.S_IEXEC)
 # This will trigger downloading model
-print("Downloading if not downloaded Coqui XTTS V1.1")
 from TTS.utils.manage import ModelManager
-model_name = "tts_models/multilingual/multi-dataset/xtts_v1.1"
 ModelManager().download_model(model_name)
 model_path = os.path.join(get_user_data_dir("tts"), model_name.replace("/", "--"))
 print("XTTS downloaded")
@@ -55,10 +55,6 @@ print("XTTS downloaded")
 config = XttsConfig()
 config.load_json(os.path.join(model_path, "config.json"))
-# it should be there just to be sure
-if "ja" not in config.languages:
-    config.languages.append("ja")
 model = Xtts.init_from_config(config)
 model.load_checkpoint(
     config,
@@ -74,11 +70,8 @@ DEVICE_ASSERT_DETECTED = 0
 DEVICE_ASSERT_PROMPT = None
 DEVICE_ASSERT_LANG = None
-# supported_languages=["en","es","fr","de","it","pt","pl","tr","ru","nl","cs","ar","zh-cn"]
 supported_languages = config.languages
 def predict(
     prompt,
     language,
@@ -254,8 +247,7 @@ def predict(
                 language,
                 gpt_cond_latent,
                 speaker_embedding,
-                diffusion_conditioning,
-                decoder="ne_hifigan",
             )
             inference_time = time.time() - t0
             print(f"I: Time to generate audio: {round(inference_time*1000)} milliseconds")
@@ -273,7 +265,8 @@ def predict(
                 language,
                 gpt_cond_latent,
                 speaker_embedding,
-                decoder="ne_hifigan",
             )
             first_chunk = True
@@ -403,7 +396,7 @@ description = """
 <img style="margin-top: 0em; margin-bottom: 0em" src="https://bit.ly/3gLdBN6" alt="Duplicate Space"></a>
 </div>
-<a href="https://huggingface.co/coqui/XTTS-v1">XTTS</a> is a Voice generation model that lets you clone voices into different languages by using just a quick 6-second audio clip.
 <br/>
 XTTS is built on previous research, like Tortoise, with additional architectural innovations and training to make cross-language voice cloning and multilingual speech generation possible.
 <br/>
@@ -415,9 +408,8 @@ Leave a star on the Github <a href="https://github.com/coqui-ai/TTS">🐸TTS</a>
 <br/>
 </p>
 <p>Language Selectors:
-Arabic: ar, Brazilian Portuguese: pt , Chinese: zh-cn, Czech: cs,<br/>
-Dutch: nl, English: en, French: fr, Italian: it, Polish: pl,<br/>
-Russian: ru, Spanish: es, Turkish: tr, Japanese: ja <br/>
 </p>
 <img referrerpolicy="no-referrer-when-downgrade" src="https://static.scarf.sh/a.png?x-pxid=0d00920c-8cc9-4bf3-90f2-a615797e5f59" />
 """
@@ -559,6 +551,26 @@ examples = [
         False,
         True,
     ],
 ]
@@ -588,6 +600,8 @@ gr.Interface(
                 "ar",
                 "zh-cn",
                 "ja",
             ],
             max_choices=1,
             value="en",

 os.chmod("ffmpeg", st.st_mode | stat.S_IEXEC)
 # This will trigger downloading model
+print("Downloading if not downloaded Coqui XTTS V2")
 from TTS.utils.manage import ModelManager
+model_name = "tts_models/multilingual/multi-dataset/xtts_v2"
 ModelManager().download_model(model_name)
 model_path = os.path.join(get_user_data_dir("tts"), model_name.replace("/", "--"))
 print("XTTS downloaded")
 config = XttsConfig()
 config.load_json(os.path.join(model_path, "config.json"))
 model = Xtts.init_from_config(config)
 model.load_checkpoint(
     config,
 DEVICE_ASSERT_PROMPT = None
 DEVICE_ASSERT_LANG = None
 supported_languages = config.languages
 def predict(
     prompt,
     language,
                 language,
                 gpt_cond_latent,
                 speaker_embedding,
+                diffusion_conditioning
             )
             inference_time = time.time() - t0
             print(f"I: Time to generate audio: {round(inference_time*1000)} milliseconds")
                 language,
                 gpt_cond_latent,
                 speaker_embedding,
+                repetition_penalty=5.0,
+                temperature=0.75,
             )
             first_chunk = True
 <img style="margin-top: 0em; margin-bottom: 0em" src="https://bit.ly/3gLdBN6" alt="Duplicate Space"></a>
 </div>
+<a href="https://huggingface.co/coqui/XTTS-v2">XTTS</a> is a Voice generation model that lets you clone voices into different languages by using just a quick 6-second audio clip.
 <br/>
 XTTS is built on previous research, like Tortoise, with additional architectural innovations and training to make cross-language voice cloning and multilingual speech generation possible.
 <br/>
 <br/>
 </p>
 <p>Language Selectors:
+Arabic: ar, Brazilian Portuguese: pt , Chinese: zh-cn, Czech: cs, Dutch: nl, English: en, French: fr, Italian: it, Polish: pl,<br/>
+Russian: ru, Spanish: es, Turkish: tr, Japanese: ja, Korean: ko, Hungarian: hu <br/>
 </p>
 <img referrerpolicy="no-referrer-when-downgrade" src="https://static.scarf.sh/a.png?x-pxid=0d00920c-8cc9-4bf3-90f2-a615797e5f59" />
 """
         False,
         True,
     ],
+    [
+        "한번은 내가 여섯 살이었을 때 멋진 그림을 보았습니다.",
+        "ko",
+        "examples/female.wav",
+        None,
+        False,
+        True,
+        False,
+        True,
+    ],
+        [
+        "Egyszer hat éves koromban láttam egy csodálatos képet",
+        "hu",
+        "examples/male.wav",
+        None,
+        False,
+        True,
+        False,
+        True,
+    ],
 ]
                 "ar",
                 "zh-cn",
                 "ja",
+                "ko",
+                "hu"
             ],
             max_choices=1,
             value="en",