gorkemgoknar committed on
Commit
f81d4f2
1 Parent(s): d8cc0b4

Use inference via python directly

Browse files
Files changed (1) hide show
  1. app.py +63 -13
app.py CHANGED
@@ -1,10 +1,12 @@
1
  import sys
2
- import os,stat
3
  import subprocess
4
  import random
5
  from zipfile import ZipFile
6
  import uuid
7
-
 
 
8
  # By using XTTS you agree to CPML license https://coqui.ai/cpml
9
  os.environ["COQUI_TOS_AGREED"] = "1"
10
 
@@ -13,9 +15,18 @@ os.environ["COQUI_TOS_AGREED"] = "1"
13
  import langid
14
 
15
  import gradio as gr
 
 
 
16
  from TTS.api import TTS
 
 
 
 
17
  HF_TOKEN = os.environ.get("HF_TOKEN")
 
18
  from huggingface_hub import HfApi
 
19
  # will use the api to restart the space on an unrecoverable error
20
  api = HfApi(token=HF_TOKEN)
21
  repo_id = "coqui/xtts"
@@ -29,8 +40,19 @@ os.chmod('ffmpeg', st.st_mode | stat.S_IEXEC)
29
 
30
  # Load TTS
31
  tts = TTS("tts_models/multilingual/multi-dataset/xtts_v1")
32
- tts.to("cuda")
33
 
 
 
 
 
 
 
 
 
 
 
 
 
34
 
35
  # This is for debugging purposes only
36
  DEVICE_ASSERT_DETECTED=0
@@ -40,14 +62,15 @@ DEVICE_ASSERT_LANG=None
40
  def predict(prompt, language, audio_file_pth, mic_file_path, use_mic, voice_cleanup, no_lang_auto_detect, agree,):
41
  if agree == True:
42
  supported_languages=["en","es","fr","de","it","pt","pl","tr","ru","nl","cs","ar","zh-cn"]
43
-
44
  if language not in supported_languages:
45
- gr.Warning("Language you put in is not in is not in our Supported Languages, please choose from dropdown")
46
 
47
  return (
48
  None,
49
  None,
50
  None,
 
51
  )
52
 
53
  language_predicted=langid.classify(prompt)[0].strip() # strip need as there is space at end!
@@ -72,6 +95,7 @@ def predict(prompt, language, audio_file_pth, mic_file_path, use_mic, voice_clea
72
  None,
73
  None,
74
  None,
 
75
  )
76
 
77
 
@@ -84,6 +108,7 @@ def predict(prompt, language, audio_file_pth, mic_file_path, use_mic, voice_clea
84
  None,
85
  None,
86
  None,
 
87
  )
88
 
89
  else:
@@ -129,6 +154,7 @@ def predict(prompt, language, audio_file_pth, mic_file_path, use_mic, voice_clea
129
  None,
130
  None,
131
  None,
 
132
  )
133
  if len(prompt)>200:
134
  gr.Warning("Text length limited to 200 characters for this demo, please try shorter text. You can clone this space and edit code for your own usage")
@@ -136,6 +162,7 @@ def predict(prompt, language, audio_file_pth, mic_file_path, use_mic, voice_clea
136
  None,
137
  None,
138
  None,
 
139
  )
140
  global DEVICE_ASSERT_DETECTED
141
  if DEVICE_ASSERT_DETECTED:
@@ -145,12 +172,33 @@ def predict(prompt, language, audio_file_pth, mic_file_path, use_mic, voice_clea
145
  print(f"Unrecoverable exception caused by language:{DEVICE_ASSERT_LANG} prompt:{DEVICE_ASSERT_PROMPT}")
146
 
147
  try:
148
- tts.tts_to_file(
149
- text=prompt,
150
- file_path="output.wav",
151
- language=language,
152
- speaker_wav=speaker_wav,
 
 
 
 
 
 
 
 
 
 
 
 
 
153
  )
 
 
 
 
 
 
 
 
154
  except RuntimeError as e :
155
  if "device-side assert" in str(e):
156
  # cannot do anything on a cuda device-side error, need to restart
@@ -173,6 +221,7 @@ def predict(prompt, language, audio_file_pth, mic_file_path, use_mic, voice_clea
173
  audio="output.wav",
174
  ),
175
  "output.wav",
 
176
  speaker_wav,
177
  )
178
  else:
@@ -181,6 +230,7 @@ def predict(prompt, language, audio_file_pth, mic_file_path, use_mic, voice_clea
181
  None,
182
  None,
183
  None,
 
184
  )
185
 
186
 
@@ -205,7 +255,7 @@ Arabic: ar, Brazilian Portuguese: pt , Chinese: zh-cn, Czech: cs,<br/>
205
  Dutch: nl, English: en, French: fr, Italian: it, Polish: pl,<br/>
206
  Russian: ru, Spanish: es, Turkish: tr <br/>
207
  </p>
208
- <img referrerpolicy="no-referrer-when-downgrade" src="https://static.scarf.sh/a.png?x-pxid=f3d15adf-0944-4cf3-ba8c-9c9a6862759c" />
209
  """
210
 
211
  article = """
@@ -234,7 +284,6 @@ examples = [
234
  False,
235
  False,
236
  True,
237
- False,
238
  ],
239
  [
240
  "Als ich sechs war, sah ich einmal ein wunderbares Bild",
@@ -399,7 +448,8 @@ gr.Interface(
399
  ],
400
  outputs=[
401
  gr.Video(label="Waveform Visual"),
402
- gr.Audio(label="Synthesised Audio", autoplay=True),
 
403
  gr.Audio(label="Reference Audio Used"),
404
  ],
405
  title=title,
 
1
  import sys
2
+ import io, os, stat
3
  import subprocess
4
  import random
5
  from zipfile import ZipFile
6
  import uuid
7
+ import time
8
+ import torch
9
+ import torchaudio
10
  # By using XTTS you agree to CPML license https://coqui.ai/cpml
11
  os.environ["COQUI_TOS_AGREED"] = "1"
12
 
 
15
  import langid
16
 
17
  import gradio as gr
18
+ from scipy.io.wavfile import write
19
+ from pydub import AudioSegment
20
+
21
  from TTS.api import TTS
22
+ from TTS.tts.configs.xtts_config import XttsConfig
23
+ from TTS.tts.models.xtts import Xtts
24
+ from TTS.utils.generic_utils import get_user_data_dir
25
+
26
  HF_TOKEN = os.environ.get("HF_TOKEN")
27
+
28
  from huggingface_hub import HfApi
29
+
30
  # will use the api to restart the space on an unrecoverable error
31
  api = HfApi(token=HF_TOKEN)
32
  repo_id = "coqui/xtts"
 
40
 
41
  # Load TTS
42
  tts = TTS("tts_models/multilingual/multi-dataset/xtts_v1")
 
43
 
44
+ model_path = os.path.join(get_user_data_dir("tts"), "tts_models--multilingual--multi-dataset--xtts_v1")
45
+ config = XttsConfig()
46
+ config.load_json(os.path.join(model_path, "config.json"))
47
+ model = Xtts.init_from_config(config)
48
+ model.load_checkpoint(
49
+ config,
50
+ checkpoint_path=os.path.join(model_path, "model.pth"),
51
+ vocab_path=os.path.join(model_path, "vocab.json"),
52
+ eval=True,
53
+ use_deepspeed=True
54
+ )
55
+ model.cuda()
56
 
57
  # This is for debugging purposes only
58
  DEVICE_ASSERT_DETECTED=0
 
62
  def predict(prompt, language, audio_file_pth, mic_file_path, use_mic, voice_cleanup, no_lang_auto_detect, agree,):
63
  if agree == True:
64
  supported_languages=["en","es","fr","de","it","pt","pl","tr","ru","nl","cs","ar","zh-cn"]
65
+
66
  if language not in supported_languages:
67
+ gr.Warning(f"Language you put {language} in is not in is not in our Supported Languages, please choose from dropdown")
68
 
69
  return (
70
  None,
71
  None,
72
  None,
73
+ None,
74
  )
75
 
76
  language_predicted=langid.classify(prompt)[0].strip() # strip need as there is space at end!
 
95
  None,
96
  None,
97
  None,
98
+ None,
99
  )
100
 
101
 
 
108
  None,
109
  None,
110
  None,
111
+ None,
112
  )
113
 
114
  else:
 
154
  None,
155
  None,
156
  None,
157
+ None,
158
  )
159
  if len(prompt)>200:
160
  gr.Warning("Text length limited to 200 characters for this demo, please try shorter text. You can clone this space and edit code for your own usage")
 
162
  None,
163
  None,
164
  None,
165
+ None,
166
  )
167
  global DEVICE_ASSERT_DETECTED
168
  if DEVICE_ASSERT_DETECTED:
 
172
  print(f"Unrecoverable exception caused by language:{DEVICE_ASSERT_LANG} prompt:{DEVICE_ASSERT_PROMPT}")
173
 
174
  try:
175
+ metrics_text=""
176
+ t_latent=time.time()
177
+
178
+ # note: diffusion_conditioning is not used with hifigan (default mode); it will be empty but still needs to be passed to model.inference
179
+ gpt_cond_latent, diffusion_conditioning, speaker_embedding = model.get_conditioning_latents(audio_path=speaker_wav)
180
+ latent_calculation_time = time.time() - t_latent
181
+ #metrics_text=f"Embedding calculation time: {latent_calculation_time:.2f} seconds\n"
182
+
183
+ wav_chunks = []
184
+
185
+ print("I: Generating new audio...")
186
+ t0 = time.time()
187
+ out = model.inference(
188
+ prompt,
189
+ language,
190
+ gpt_cond_latent,
191
+ speaker_embedding,
192
+ diffusion_conditioning
193
  )
194
+ inference_time = time.time() - t0
195
+ print(f"I: Time to generate audio: {round(inference_time*1000)} milliseconds")
196
+ metrics_text+=f"Time to generate audio: {round(inference_time*1000)} milliseconds\n"
197
+ real_time_factor= (time.time() - t0) / out['wav'].shape[-1] * 24000
198
+ print(f"Real-time factor (RTF): {real_time_factor}")
199
+ metrics_text+=f"Real-time factor (RTF): {real_time_factor:.2f}\n"
200
+ torchaudio.save("output.wav", torch.tensor(out["wav"]).unsqueeze(0), 24000)
201
+
202
  except RuntimeError as e :
203
  if "device-side assert" in str(e):
204
  # cannot do anything on a cuda device-side error, need to restart
 
221
  audio="output.wav",
222
  ),
223
  "output.wav",
224
+ metrics_text,
225
  speaker_wav,
226
  )
227
  else:
 
230
  None,
231
  None,
232
  None,
233
+ None,
234
  )
235
 
236
 
 
255
  Dutch: nl, English: en, French: fr, Italian: it, Polish: pl,<br/>
256
  Russian: ru, Spanish: es, Turkish: tr <br/>
257
  </p>
258
+ <img referrerpolicy="no-referrer-when-downgrade" src="https://static.scarf.sh/a.png?x-pxid=8946ef36-c454-4a8e-a9c9-8a8dd735fabd" />
259
  """
260
 
261
  article = """
 
284
  False,
285
  False,
286
  True,
 
287
  ],
288
  [
289
  "Als ich sechs war, sah ich einmal ein wunderbares Bild",
 
448
  ],
449
  outputs=[
450
  gr.Video(label="Waveform Visual"),
451
+ gr.Audio(label="Synthesised Audio",autoplay=True),
452
+ gr.Text(label="Metrics"),
453
  gr.Audio(label="Reference Audio Used"),
454
  ],
455
  title=title,