Pendrokar committed
Commit 261f600 · 1 Parent(s): 4eba378

TTS: same space, custom model => Parler Large

Files changed (4)
  1. README.md +6 -2
  2. app/models.py +74 -27
  3. app/synth.py +7 -7
  4. test_tts_styletts_kokoro.py +2 -0
README.md CHANGED
@@ -9,13 +9,14 @@ emoji: 🤗🏆
  colorFrom: red
  colorTo: red
  pinned: true
- short_description: Vote on the top HF TTS models!
+ short_description: Blind vote on HF TTS models!
  models:
  - amphion/MaskGCT
  - coqui/XTTS-v2
  - fishaudio/fish-speech-1.4
  - fishaudio/fish-speech-1.5
  - hexgrad/Kokoro-82M
+ - HKUSTAudio/Llasa-3B
  - lj1995/GPT-SoVITS
  - metavoiceio/metavoice-1B-v0.1
  - myshell-ai/MeloTTS-English-v2
@@ -24,6 +25,7 @@ models:
  - myshell-ai/OpenVoiceV2
  - OuteAI/OuteTTS-0.2-500M
  - OuteAI/OuteTTS-0.3-1B
+ - parler-tts/parler-tts-large-v1
  - parler-tts/parler-tts-mini-v1
  - parler-tts/parler-tts-mini-expresso
  - Pendrokar/xvapitch_expresso
@@ -47,4 +49,6 @@ The TTS Arena is a Gradio app with several components. Please refer to the `app`
  RUNNING_LOCALLY=1 python app.py
  ```
 
- You must set the `RUNNING_LOCALLY` environment variable to `1` when running the app locally. This prevents it from syncing with the database
+ You must set the `RUNNING_LOCALLY` environment variable to `1` when running the app locally. This prevents it from syncing with the database.
+
+ The only other environment variable you may need is `HF_TOKEN`, since anonymous requests may be subject to tighter restrictions.
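For context, the snippet below is a minimal sketch of how the two variables described above might be read at startup. The `load_runtime_config` helper is illustrative and not part of the repository; only the variable names `RUNNING_LOCALLY` and `HF_TOKEN` come from the README.

```python
import os

def load_runtime_config() -> dict:
    """Illustrative only: gather the environment variables mentioned in the README."""
    return {
        # RUNNING_LOCALLY=1 disables syncing with the database, as described above.
        "running_locally": os.getenv("RUNNING_LOCALLY") == "1",
        # Optional: authenticated Space requests avoid the tighter limits
        # that anonymous requests may be subject to.
        "hf_token": os.getenv("HF_TOKEN"),  # may be None
    }
```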
app/models.py CHANGED
@@ -23,6 +23,8 @@ AVAILABLE_MODELS = {
     # HF Gradio Spaces: # <works with gradio version #>
     # gravio version that works with most spaces: 4.29
     'coqui/xtts': 'coqui/xtts', # 4.29 4.32
+    # '<keyname>':'<Space URL>'
+    # gradio version that works with most spaces: 4.29
     # 'collabora/WhisperSpeech': 'collabora/WhisperSpeech', # 4.32 4.36.1
     # 'myshell-ai/OpenVoice': 'myshell-ai/OpenVoice', # same devs as MeloTTS, which scores higher # 4.29
     # 'myshell-ai/OpenVoiceV2': 'myshell-ai/OpenVoiceV2', # same devs as MeloTTS, which scores higher # 4.29
@@ -35,10 +37,12 @@ AVAILABLE_MODELS = {
     # E2 & F5 TTS
     # F5 model
     'mrfakename/E2-F5-TTS': 'mrfakename/E2-F5-TTS', # 5.0
+    # E2 model
+    # 'mrfakename/E2-F5-TTS/E2': 'mrfakename/E2-F5-TTS', # seems to require multiple requests for setup
 
     # # Parler
     # Parler Large model
-    # 'parler-tts/parler_tts': 'parler-tts/parler_tts', # 4.29 4.32 4.36.1 4.42.0
+    'parler-tts/parler_tts/large': 'parler-tts/parler_tts', # 4.29 4.32 4.36.1 4.42.0
     # Parler Mini model
     'parler-tts/parler_tts': 'parler-tts/parler_tts', # 4.29 4.32 4.36.1 4.42.0
     # 'parler-tts/parler_tts_mini': 'parler-tts/parler_tts_mini', # Mini is the default model of parler_tts
@@ -76,7 +80,6 @@ AVAILABLE_MODELS = {
     # 'amphion/Text-to-Speech': '/predict#0', # disabled also on original HF space due to poor ratings
     # 'suno/bark': '3#0', # Hallucinates
     # 'shivammehta25/Matcha-TTS': '5#0', # seems to require multiple requests for setup
-    # 'styletts2/styletts2': '0#0', # API disabled, awaiting approval of PR #15
     # 'Manmay/tortoise-tts': '/predict#0', # Cannot retrieve streamed file; 403
     # 'pytorch/Tacotron2': '0#0', # old gradio
 }
@@ -158,7 +161,7 @@ HF_SPACES = {
         'series': 'MeloTTS',
     },
 
-    # Parler
+    # Parler Mini
     'parler-tts/parler_tts': {
         'name': 'Parler Mini',
         'function': '/gen_tts',
@@ -167,16 +170,16 @@ HF_SPACES = {
         'is_zero_gpu_space': True,
         'series': 'Parler',
     },
-    # Parler Mini
-    # 'parler-tts/parler_tts': {
-    #     'name': 'Parler Large',
-    #     'function': '/gen_tts',
-    #     'text_param_index': 0,
-    #     'return_audio_index': 0,
-    #     'is_zero_gpu_space': True,
-    #     'series': 'Parler',
-    # },
-    # Parler Mini which using Expresso dataset
+    # Parler Large
+    'parler-tts/parler_tts/large': {
+        'name': 'Parler Large',
+        'function': '/gen_tts',
+        'text_param_index': 0,
+        'return_audio_index': 0,
+        'is_zero_gpu_space': True,
+        'series': 'Parler',
+    },
+    # Parler Mini trained on Expresso dataset
     'parler-tts/parler-tts-expresso': {
         'name': 'Parler Mini Expresso',
         'function': '/gen_tts',
@@ -207,14 +210,24 @@ HF_SPACES = {
         'series': 'Fish Speech',
     },
 
-    # E2/F5 TTS
+    # F5 TTS
     'mrfakename/E2-F5-TTS': {
         'name': 'F5 TTS',
         'function': '/basic_tts',
         'text_param_index': 'gen_text_input',
         'return_audio_index': 0,
         'is_zero_gpu_space': True,
-        'series': 'E2/F5 TTS',
+        'series': 'F5 TTS',
+    },
+
+    # E2 TTS TODO: call switch model function
+    'mrfakename/E2-F5-TTS': {
+        'name': 'E2 TTS',
+        'function': '/basic_tts',
+        'text_param_index': 'gen_text_input',
+        'return_audio_index': 0,
+        'is_zero_gpu_space': True,
+        'series': 'E2 TTS',
     },
 
     # IMS-Toucan
@@ -247,11 +260,21 @@ HF_SPACES = {
         # 'emoji': '😪',
     },
 
-    # StyleTTS v2 kokoro fine tune
+    # StyleTTS Kokoro v0.19
     'hexgrad/kokoro': {
        'name': 'StyleTTS Kokoro v19',
        'function': '/generate',
-       'text_param_index': 0,
+       'text_param_index': 'text',
+       'return_audio_index': 0,
+       'is_zero_gpu_space': False,
+       'series': 'Kokoro',
+    },
+
+    # StyleTTS Kokoro v0.23
+    'hexgrad/Kokoro-TTS/0.23': {
+       'name': 'StyleTTS Kokoro v23',
+       'function': '/multilingual',
+       'text_param_index': 'text',
        'return_audio_index': 0,
        'is_zero_gpu_space': True,
        'series': 'Kokoro',
@@ -314,6 +337,7 @@ HF_SPACES = {
 DEFAULT_VOICE_SAMPLE_STR = 'voice_samples/xtts_sample.wav'
 DEFAULT_VOICE_SAMPLE = handle_file(DEFAULT_VOICE_SAMPLE_STR)
 DEFAULT_VOICE_TRANSCRIPT = "The Hispaniola was rolling scuppers under in the ocean swell. The booms were tearing at the blocks, the rudder was banging to and fro, and the whole ship creaking, groaning, and jumping like a manufactory."
+DEFAULT_VOICE_PROMPT = "female voice; very clear audio"
 
 OVERRIDE_INPUTS = {
     'coqui/xtts': {
@@ -379,11 +403,16 @@ OVERRIDE_INPUTS = {
         4: "Bria", # Literal['Bria', 'Alex', 'Jacob'] in 'Preset voices' Dropdown component
         5: None, # filepath in 'Upload a clean sample to clone. Sample should contain 1 speaker, be between 30-90 seconds and not contain background noise.' Audio component
     },
-    'parler-tts/parler_tts': {
-        1: 'Laura; Laura\'s female voice; very clear audio', # description/prompt
+    'parler-tts/parler_tts': { # mini
+        1: 'Laura; Laura\'s ' + DEFAULT_VOICE_PROMPT, #description / voice prompt
+        2: False, #use_large
+    },
+    'parler-tts/parler_tts/large': {
+        1: 'Laura; Laura\'s ' + DEFAULT_VOICE_PROMPT, #description / voice prompt
+        2: True, #use_large
     },
     'parler-tts/parler-tts-expresso': {
-        1: 'Elisabeth; Elisabeth\'s female voice; very clear audio', # description/prompt
+        1: 'Elisabeth; Elisabeth\'s ' + DEFAULT_VOICE_PROMPT, #description / voice prompt
     },
     'innoai/Edge-TTS-Text-to-Speech': {
         1: 'en-US-EmmaMultilingualNeural - en-US (Female)', # voice
@@ -414,6 +443,16 @@ OVERRIDE_INPUTS = {
         'speed_slider': 1,
     },
 
+    # E2 TODO: call switch model
+    'mrfakename/E2-F5-TTS/E2': {
+        'ref_audio_input': handle_file('voice_samples/EN_B00004_S00051_W000213.mp3'),
+        'ref_text_input': 'Our model manager is Graham, whom we observed leading a small team of chemical engineers within a multinational European firm we\'ll call Kruger Bern.',
+        'remove_silence': False,
+        'cross_fade_duration_slider': 0.15,
+        'nfe_slider': 32,
+        'speed_slider': 1,
+    },
+
     # IMS-Toucan
     'Flux9665/MassivelyMultilingualTTS': {
         1: "English (eng)", #language
@@ -431,14 +470,22 @@ OVERRIDE_INPUTS = {
         'lngsteps': 8,
     },
 
-    # StyleTTS 2 kokoro
+    # StyleTTS 2 Kokoro v0.19
     'hexgrad/kokoro': {
-        1: "af", #voice
-        2: None, #ps
-        3: 1, #speed
-        4: 3000, #trim
-        5: False, #use_gpu; fast enough with multithreaded with CPU
-        6: os.getenv('KOKORO'), #sk
+        'voice': "af",
+        'ps': None,
+        'speed': 1,
+        'trim': 0.5,
+        'use_gpu': False, # fast enough with multithreaded CPU
+        'sk': os.getenv('KOKORO'),
+    },
+
+    # StyleTTS 2 Kokoro v0.23
+    'hexgrad/Kokoro-TTS/0.23': {
+        'voice': "af",
+        'speed': 1,
+        'trim': 0.5,
+        'sk': os.getenv('KOKORO'),
     },
 
     # maskGCT (by amphion)
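The pattern behind most of these changes: a single HF Space can now back more than one arena entry. Each entry gets its own key in `AVAILABLE_MODELS` (for example `parler-tts/parler_tts/large`), several keys may resolve to the same Space, and `OVERRIDE_INPUTS` supplies the per-entry parameters such as the `use_large` flag. A minimal sketch, with an illustrative `resolve_space` helper that is not part of the repo:

```python
# Keys are arena entries; values are the HF Space that serves them (taken from the diff above).
AVAILABLE_MODELS = {
    'parler-tts/parler_tts': 'parler-tts/parler_tts',        # Parler Mini
    'parler-tts/parler_tts/large': 'parler-tts/parler_tts',  # same Space, Large checkpoint
}

# Per-entry input overrides; parameter index 2 is the Space's use_large flag (also from the diff).
OVERRIDE_INPUTS = {
    'parler-tts/parler_tts': {2: False},
    'parler-tts/parler_tts/large': {2: True},
}

def resolve_space(model_key: str) -> str:
    """Illustrative helper: map an arena key to the Space that actually serves it."""
    return AVAILABLE_MODELS[model_key]

assert resolve_space('parler-tts/parler_tts/large') == 'parler-tts/parler_tts'
```

Because the mapping is many-to-one, a dict keyed by Space name alone could not keep both variants addressable; that is presumably why the synthetic `/large`, `/E2` and `/0.23` suffixes appear in the keys.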
app/synth.py CHANGED
@@ -101,7 +101,7 @@ def synthandreturn(text, autoplay, request: gr.Request):
         # Use public HF Space
         # if (model not in hf_clients):
         #     hf_clients[model] = Client(model, hf_token=hf_token, headers=hf_headers)
-        mdl_space = Client(model, hf_token=hf_token, headers=hf_headers)
+        mdl_space = Client(AVAILABLE_MODELS[model], hf_token=hf_token, headers=hf_headers)
 
         # print(f"{model}: Fetching endpoints of HF Space")
         # assume the index is one of the first 9 return params
@@ -197,7 +197,7 @@ def synthandreturn(text, autoplay, request: gr.Request):
         except:
             print(f"{model}: [WARN] Unable to resample audio")
             pass
-        if model in AVAILABLE_MODELS.keys(): model = AVAILABLE_MODELS[model]
+        # if model in AVAILABLE_MODELS.keys(): model = AVAILABLE_MODELS[model]
         result_storage[model] = result
 
     def _get_param_examples(parameters):
@@ -269,8 +269,8 @@ def synthandreturn(text, autoplay, request: gr.Request):
     mdl1k = mdl1
     mdl2k = mdl2
     print(mdl1k, mdl2k)
-    if mdl1 in AVAILABLE_MODELS.keys(): mdl1k=AVAILABLE_MODELS[mdl1]
-    if mdl2 in AVAILABLE_MODELS.keys(): mdl2k=AVAILABLE_MODELS[mdl2]
+    # if mdl1 in AVAILABLE_MODELS.keys(): mdl1k=AVAILABLE_MODELS[mdl1]
+    # if mdl2 in AVAILABLE_MODELS.keys(): mdl2k=AVAILABLE_MODELS[mdl2]
     results = {}
     print(f"Sending models {mdl1k} and {mdl2k} to API")
 
@@ -381,7 +381,7 @@ def synthandreturn_battle(text, mdl1, mdl2, autoplay):
             result = f.name
         except:
             pass
-        if model in AVAILABLE_MODELS.keys(): model = AVAILABLE_MODELS[model]
+        # if model in AVAILABLE_MODELS.keys(): model = AVAILABLE_MODELS[model]
         print(model)
         print(f"Running model {model}")
         result_storage[model] = result
@@ -392,8 +392,8 @@ def synthandreturn_battle(text, mdl1, mdl2, autoplay):
     mdl1k = mdl1
     mdl2k = mdl2
     print(mdl1k, mdl2k)
-    if mdl1 in AVAILABLE_MODELS.keys(): mdl1k=AVAILABLE_MODELS[mdl1]
-    if mdl2 in AVAILABLE_MODELS.keys(): mdl2k=AVAILABLE_MODELS[mdl2]
+    # if mdl1 in AVAILABLE_MODELS.keys(): mdl1k=AVAILABLE_MODELS[mdl1]
+    # if mdl2 in AVAILABLE_MODELS.keys(): mdl2k=AVAILABLE_MODELS[mdl2]
     results = {}
     print(f"Sending models {mdl1k} and {mdl2k} to API")
     thread1 = threading.Thread(target=predict_and_update_result, args=(text, mdl1k, results))
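This is the synth.py side of the same change: the arena key is resolved to a Space only when the `gradio_client` `Client` is created, and the later re-mapping lines are commented out so results stay indexed by the arena key. A rough sketch under those assumptions (the `make_space_client` helper is illustrative; headers, error handling and the surrounding `synthandreturn` logic are omitted):

```python
import os
from gradio_client import Client

def make_space_client(model_key: str, available_models: dict) -> Client:
    # model_key may be a synthetic entry such as 'parler-tts/parler_tts/large';
    # only here is it resolved to the real Space name.
    space = available_models[model_key]
    return Client(space, hf_token=os.getenv("HF_TOKEN"))
```

Keeping `result_storage` keyed by the arena key, rather than by the Space, is presumably what lets two entries that share a Space store separate results.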
test_tts_styletts_kokoro.py CHANGED
@@ -4,6 +4,7 @@ from gradio_client import Client, file
 client = Client("hexgrad/kokoro", hf_token=os.getenv('HF_TOKEN'))
 # endpoints = client.view_api(all_endpoints=True, print_info=False, return_format='dict')
 # print(endpoints)
+key = os.getenv('KOKORO')
 result = client.predict(
     text='"I hate it when people lie to me."',
     voice="af",
@@ -19,6 +20,7 @@ result = client.predict(
     # 3000, #trim
     # False, #use_gpu; fast enough with multithreaded with CPU
     # ],
+    sk=key,
     api_name="/generate"
 )
 
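Assembled, the fragments above amount to roughly the following call. Keyword arguments of `/generate` that fall outside the shown hunks are omitted here on the assumption that the Space provides defaults for them; the `KOKORO` value is the secret key the Space expects, mirroring `OVERRIDE_INPUTS` in app/models.py.

```python
import os
from gradio_client import Client

client = Client("hexgrad/kokoro", hf_token=os.getenv('HF_TOKEN'))

result = client.predict(
    text='"I hate it when people lie to me."',
    voice="af",
    sk=os.getenv('KOKORO'),  # secret key required by the Space's /generate endpoint
    api_name="/generate",
)
print(result)
```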