TTS-Spaces-Arena

Running on Zero

App Files Files Community

Pendrokar committed on Jan 25

Commit

261f600

1 Parent(s): 4eba378

TTS: same space, custom model => Parler Large

Browse files

Files changed (4) hide show

README.md +6 -2
app/models.py +74 -27
app/synth.py +7 -7
test_tts_styletts_kokoro.py +2 -0

README.md CHANGED Viewed

@@ -9,13 +9,14 @@ emoji: 🤗🏆
 colorFrom: red
 colorTo: red
 pinned: true
-short_description: Vote on the top HF TTS models!
 models:
 - amphion/MaskGCT
 - coqui/XTTS-v2
 - fishaudio/fish-speech-1.4
 - fishaudio/fish-speech-1.5
 - hexgrad/Kokoro-82M
 - lj1995/GPT-SoVITS
 - metavoiceio/metavoice-1B-v0.1
 - myshell-ai/MeloTTS-English-v2
@@ -24,6 +25,7 @@ models:
 - myshell-ai/OpenVoiceV2
 - OuteAI/OuteTTS-0.2-500M
 - OuteAI/OuteTTS-0.3-1B
 - parler-tts/parler-tts-mini-v1
 - parler-tts/parler-tts-mini-expresso
 - Pendrokar/xvapitch_expresso
@@ -47,4 +49,6 @@ The TTS Arena is a Gradio app with several components. Please refer to the `app`
 RUNNING_LOCALLY=1 python app.py
 ```
-You must set the `RUNNING_LOCALLY` environment variable to `1` when running the app locally. This prevents it from syncing with the database

 colorFrom: red
 colorTo: red
 pinned: true
+short_description: Blind vote on HF TTS models!
 models:
 - amphion/MaskGCT
 - coqui/XTTS-v2
 - fishaudio/fish-speech-1.4
 - fishaudio/fish-speech-1.5
 - hexgrad/Kokoro-82M
+- HKUSTAudio/Llasa-3B
 - lj1995/GPT-SoVITS
 - metavoiceio/metavoice-1B-v0.1
 - myshell-ai/MeloTTS-English-v2
 - myshell-ai/OpenVoiceV2
 - OuteAI/OuteTTS-0.2-500M
 - OuteAI/OuteTTS-0.3-1B
+- parler-tts/parler-tts-large-v1
 - parler-tts/parler-tts-mini-v1
 - parler-tts/parler-tts-mini-expresso
 - Pendrokar/xvapitch_expresso
 RUNNING_LOCALLY=1 python app.py
 ```
+You must set the `RUNNING_LOCALLY` environment variable to `1` when running the app locally. This prevents it from syncing with the database.
+The only other environment variable you may need is `HF_TOKEN`, as anonymous requests may be subject to more restrictions.

app/models.py CHANGED Viewed

@@ -23,6 +23,8 @@ AVAILABLE_MODELS = {
     # HF Gradio Spaces: # <works with gradio version #>
     # gradio version that works with most spaces: 4.29
      'coqui/xtts': 'coqui/xtts', # 4.29 4.32
     # 'collabora/WhisperSpeech': 'collabora/WhisperSpeech', # 4.32 4.36.1
     # 'myshell-ai/OpenVoice': 'myshell-ai/OpenVoice', # same devs as MeloTTS, which scores higher # 4.29
     # 'myshell-ai/OpenVoiceV2': 'myshell-ai/OpenVoiceV2', # same devs as MeloTTS, which scores higher # 4.29
@@ -35,10 +37,12 @@ AVAILABLE_MODELS = {
     # E2 & F5 TTS
     # F5 model
     'mrfakename/E2-F5-TTS': 'mrfakename/E2-F5-TTS', # 5.0
     # # Parler
     # Parler Large model
-    # 'parler-tts/parler_tts': 'parler-tts/parler_tts', # 4.29 4.32 4.36.1 4.42.0
     # Parler Mini model
     'parler-tts/parler_tts': 'parler-tts/parler_tts', # 4.29 4.32 4.36.1 4.42.0
     # 'parler-tts/parler_tts_mini': 'parler-tts/parler_tts_mini', # Mini is the default model of parler_tts
@@ -76,7 +80,6 @@ AVAILABLE_MODELS = {
     # 'amphion/Text-to-Speech': '/predict#0', # disabled also on original HF space due to poor ratings
     # 'suno/bark': '3#0', # Hallucinates
     # 'shivammehta25/Matcha-TTS': '5#0', # seems to require multiple requests for setup
-    # 'styletts2/styletts2': '0#0', # API disabled, awaiting approval of PR #15
     # 'Manmay/tortoise-tts': '/predict#0', # Cannot retrieve streamed file; 403
     # 'pytorch/Tacotron2': '0#0', # old gradio
 }
@@ -158,7 +161,7 @@ HF_SPACES = {
         'series': 'MeloTTS',
     },
-    # Parler
     'parler-tts/parler_tts': {
         'name': 'Parler Mini',
         'function': '/gen_tts',
@@ -167,16 +170,16 @@ HF_SPACES = {
         'is_zero_gpu_space': True,
         'series': 'Parler',
     },
-    # Parler Mini
-    # 'parler-tts/parler_tts': {
-    #     'name': 'Parler Large',
-    #     'function': '/gen_tts',
-    #     'text_param_index': 0,
-    #     'return_audio_index': 0,
-    #     'is_zero_gpu_space': True,
-    #    'series': 'Parler',
-    # },
-    # Parler Mini which using Expresso dataset
     'parler-tts/parler-tts-expresso': {
         'name': 'Parler Mini Expresso',
         'function': '/gen_tts',
@@ -207,14 +210,24 @@ HF_SPACES = {
         'series': 'Fish Speech',
     },
-    # E2/F5 TTS
     'mrfakename/E2-F5-TTS': {
         'name': 'F5 TTS',
         'function': '/basic_tts',
         'text_param_index': 'gen_text_input',
         'return_audio_index': 0,
         'is_zero_gpu_space': True,
-        'series': 'E2/F5 TTS',
     },
     # IMS-Toucan
@@ -247,11 +260,21 @@ HF_SPACES = {
         # 'emoji': '😪',
     },
-    # StyleTTS v2 kokoro fine tune
     'hexgrad/kokoro': {
         'name': 'StyleTTS Kokoro v19',
         'function': '/generate',
-        'text_param_index': 0,
         'return_audio_index': 0,
         'is_zero_gpu_space': True,
         'series': 'Kokoro',
@@ -314,6 +337,7 @@ HF_SPACES = {
 DEFAULT_VOICE_SAMPLE_STR = 'voice_samples/xtts_sample.wav'
 DEFAULT_VOICE_SAMPLE = handle_file(DEFAULT_VOICE_SAMPLE_STR)
 DEFAULT_VOICE_TRANSCRIPT = "The Hispaniola was rolling scuppers under in the ocean swell. The booms were tearing at the blocks, the rudder was banging to and fro, and the whole ship creaking, groaning, and jumping like a manufactory."
 OVERRIDE_INPUTS = {
     'coqui/xtts': {
@@ -379,11 +403,16 @@ OVERRIDE_INPUTS = {
 		4: "Bria",	# Literal['Bria', 'Alex', 'Jacob']  in 'Preset voices' Dropdown component
 		5: None,	# filepath  in 'Upload a clean sample to clone. Sample should contain 1 speaker, be between 30-90 seconds and not contain background noise.' Audio component
     },
-    'parler-tts/parler_tts': {
-        1: 'Laura; Laura\'s female voice; very clear audio', # description/prompt
     },
     'parler-tts/parler-tts-expresso': {
-        1: 'Elisabeth; Elisabeth\'s female voice; very clear audio', # description/prompt
     },
     'innoai/Edge-TTS-Text-to-Speech': {
         1: 'en-US-EmmaMultilingualNeural - en-US (Female)', # voice
@@ -414,6 +443,16 @@ OVERRIDE_INPUTS = {
         'speed_slider': 1,
     },
     # IMS-Toucan
     'Flux9665/MassivelyMultilingualTTS': {
 		1: "English (eng)", #language
@@ -431,14 +470,22 @@ OVERRIDE_INPUTS = {
         'lngsteps': 8,
     },
-    # StyleTTS 2 kokoro
     'hexgrad/kokoro': {
-		1: "af", #voice
-		2: None, #ps
-		3: 1, #speed
-		4: 3000, #trim
-		5: False, #use_gpu; fast enough with multithreaded with CPU
-        6: os.getenv('KOKORO'), #sk
     },
     # maskGCT (by amphion)

     # HF Gradio Spaces: # <works with gradio version #>
     # gradio version that works with most spaces: 4.29
      'coqui/xtts': 'coqui/xtts', # 4.29 4.32
+    # '<keyname>':'<Space URL>'
+    # gradio version that works with most spaces: 4.29
     # 'collabora/WhisperSpeech': 'collabora/WhisperSpeech', # 4.32 4.36.1
     # 'myshell-ai/OpenVoice': 'myshell-ai/OpenVoice', # same devs as MeloTTS, which scores higher # 4.29
     # 'myshell-ai/OpenVoiceV2': 'myshell-ai/OpenVoiceV2', # same devs as MeloTTS, which scores higher # 4.29
     # E2 & F5 TTS
     # F5 model
     'mrfakename/E2-F5-TTS': 'mrfakename/E2-F5-TTS', # 5.0
+    # E2 model
+    # 'mrfakename/E2-F5-TTS/E2': 'mrfakename/E2-F5-TTS', # seems to require multiple requests for setup
     # # Parler
     # Parler Large model
+    'parler-tts/parler_tts/large': 'parler-tts/parler_tts', # 4.29 4.32 4.36.1 4.42.0
     # Parler Mini model
     'parler-tts/parler_tts': 'parler-tts/parler_tts', # 4.29 4.32 4.36.1 4.42.0
     # 'parler-tts/parler_tts_mini': 'parler-tts/parler_tts_mini', # Mini is the default model of parler_tts
     # 'amphion/Text-to-Speech': '/predict#0', # disabled also on original HF space due to poor ratings
     # 'suno/bark': '3#0', # Hallucinates
     # 'shivammehta25/Matcha-TTS': '5#0', # seems to require multiple requests for setup
     # 'Manmay/tortoise-tts': '/predict#0', # Cannot retrieve streamed file; 403
     # 'pytorch/Tacotron2': '0#0', # old gradio
 }
         'series': 'MeloTTS',
     },
+    # Parler Mini
     'parler-tts/parler_tts': {
         'name': 'Parler Mini',
         'function': '/gen_tts',
         'is_zero_gpu_space': True,
         'series': 'Parler',
     },
+    # Parler Large
+    'parler-tts/parler_tts/large': {
+        'name': 'Parler Large',
+        'function': '/gen_tts',
+        'text_param_index': 0,
+        'return_audio_index': 0,
+        'is_zero_gpu_space': True,
+        'series': 'Parler',
+    },
+    # Parler Mini trained on Expresso dataset
     'parler-tts/parler-tts-expresso': {
         'name': 'Parler Mini Expresso',
         'function': '/gen_tts',
         'series': 'Fish Speech',
     },
+    # F5 TTS
     'mrfakename/E2-F5-TTS': {
         'name': 'F5 TTS',
         'function': '/basic_tts',
         'text_param_index': 'gen_text_input',
         'return_audio_index': 0,
         'is_zero_gpu_space': True,
+        'series': 'F5 TTS',
+    },
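+    # E2 TTS. TODO: call the switch-model function.
+    # NOTE(review): the key below repeats 'mrfakename/E2-F5-TTS' from the F5 TTS entry, so in a dict literal this entry replaces it; OVERRIDE_INPUTS keys E2 as 'mrfakename/E2-F5-TTS/E2'.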
+    'mrfakename/E2-F5-TTS': {
+        'name': 'E2 TTS',
+        'function': '/basic_tts',
+        'text_param_index': 'gen_text_input',
+        'return_audio_index': 0,
+        'is_zero_gpu_space': True,
+        'series': 'E2 TTS',
     },
     # IMS-Toucan
         # 'emoji': '😪',
     },
+    # StyleTTS Kokoro v0.19
     'hexgrad/kokoro': {
         'name': 'StyleTTS Kokoro v19',
         'function': '/generate',
+        'text_param_index': 'text',
+        'return_audio_index': 0,
+        'is_zero_gpu_space': False,
+        'series': 'Kokoro',
+    },
+    # StyleTTS Kokoro v0.23
+    'hexgrad/Kokoro-TTS/0.23': {
+        'name': 'StyleTTS Kokoro v23',
+        'function': '/multilingual',
+        'text_param_index': 'text',
         'return_audio_index': 0,
         'is_zero_gpu_space': True,
         'series': 'Kokoro',
 DEFAULT_VOICE_SAMPLE_STR = 'voice_samples/xtts_sample.wav'
 DEFAULT_VOICE_SAMPLE = handle_file(DEFAULT_VOICE_SAMPLE_STR)
 DEFAULT_VOICE_TRANSCRIPT = "The Hispaniola was rolling scuppers under in the ocean swell. The booms were tearing at the blocks, the rudder was banging to and fro, and the whole ship creaking, groaning, and jumping like a manufactory."
+DEFAULT_VOICE_PROMPT = "female voice; very clear audio"
 OVERRIDE_INPUTS = {
     'coqui/xtts': {
 		4: "Bria",	# Literal['Bria', 'Alex', 'Jacob']  in 'Preset voices' Dropdown component
 		5: None,	# filepath  in 'Upload a clean sample to clone. Sample should contain 1 speaker, be between 30-90 seconds and not contain background noise.' Audio component
     },
+    'parler-tts/parler_tts': { # mini
+        1: 'Laura; Laura\'s ' + DEFAULT_VOICE_PROMPT, #description / voice prompt
+        2: False, #use_large
+    },
+    'parler-tts/parler_tts/large': {
+        1: 'Laura; Laura\'s ' + DEFAULT_VOICE_PROMPT, #description / voice prompt
+        2: True, #use_large
     },
     'parler-tts/parler-tts-expresso': {
+        1: 'Elisabeth; Elisabeth\'s ' + DEFAULT_VOICE_PROMPT, #description / voice prompt
     },
     'innoai/Edge-TTS-Text-to-Speech': {
         1: 'en-US-EmmaMultilingualNeural - en-US (Female)', # voice
         'speed_slider': 1,
     },
+    # E2 TODO: call switch model
+    'mrfakename/E2-F5-TTS/E2': {
+        'ref_audio_input': handle_file('voice_samples/EN_B00004_S00051_W000213.mp3'),
+        'ref_text_input': 'Our model manager is Graham, whom we observed leading a small team of chemical engineers within a multinational European firm we\'ll call Kruger Bern.',
+        'remove_silence': False,
+        'cross_fade_duration_slider': 0.15,
+        'nfe_slider': 32,
+        'speed_slider': 1,
+    },
     # IMS-Toucan
     'Flux9665/MassivelyMultilingualTTS': {
 		1: "English (eng)", #language
         'lngsteps': 8,
     },
+    # StyleTTS 2 Kokoro v0.19
     'hexgrad/kokoro': {
+		'voice': "af",
+		'ps': None,
+		'speed': 1,
+		'trim': 0.5,
+		'use_gpu': False, # fast enough with multithreaded CPU
+        'sk': os.getenv('KOKORO'),
+    },
+    # StyleTTS 2 Kokoro v0.23
+    'hexgrad/Kokoro-TTS/0.23': {
+		'voice': "af",
+		'speed': 1,
+		'trim': 0.5,
+        'sk': os.getenv('KOKORO'),
     },
     # maskGCT (by amphion)

app/synth.py CHANGED Viewed

@@ -101,7 +101,7 @@ def synthandreturn(text, autoplay, request: gr.Request):
                         # Use public HF Space
                         # if (model not in hf_clients):
                         #     hf_clients[model] = Client(model, hf_token=hf_token, headers=hf_headers)
-                        mdl_space = Client(model, hf_token=hf_token, headers=hf_headers)
                         # print(f"{model}: Fetching endpoints of HF Space")
                         # assume the index is one of the first 9 return params
@@ -197,7 +197,7 @@ def synthandreturn(text, autoplay, request: gr.Request):
         except:
             print(f"{model}: [WARN] Unable to resample audio")
             pass
-        if model in AVAILABLE_MODELS.keys(): model = AVAILABLE_MODELS[model]
         result_storage[model] = result
     def _get_param_examples(parameters):
@@ -269,8 +269,8 @@ def synthandreturn(text, autoplay, request: gr.Request):
     mdl1k = mdl1
     mdl2k = mdl2
     print(mdl1k, mdl2k)
-    if mdl1 in AVAILABLE_MODELS.keys(): mdl1k=AVAILABLE_MODELS[mdl1]
-    if mdl2 in AVAILABLE_MODELS.keys(): mdl2k=AVAILABLE_MODELS[mdl2]
     results = {}
     print(f"Sending models {mdl1k} and {mdl2k} to API")
@@ -381,7 +381,7 @@ def synthandreturn_battle(text, mdl1, mdl2, autoplay):
                 result = f.name
         except:
             pass
-        if model in AVAILABLE_MODELS.keys(): model = AVAILABLE_MODELS[model]
         print(model)
         print(f"Running model {model}")
         result_storage[model] = result
@@ -392,8 +392,8 @@ def synthandreturn_battle(text, mdl1, mdl2, autoplay):
     mdl1k = mdl1
     mdl2k = mdl2
     print(mdl1k, mdl2k)
-    if mdl1 in AVAILABLE_MODELS.keys(): mdl1k=AVAILABLE_MODELS[mdl1]
-    if mdl2 in AVAILABLE_MODELS.keys(): mdl2k=AVAILABLE_MODELS[mdl2]
     results = {}
     print(f"Sending models {mdl1k} and {mdl2k} to API")
     thread1 = threading.Thread(target=predict_and_update_result, args=(text, mdl1k, results))

                         # Use public HF Space
                         # if (model not in hf_clients):
                         #     hf_clients[model] = Client(model, hf_token=hf_token, headers=hf_headers)
+                        mdl_space = Client(AVAILABLE_MODELS[model], hf_token=hf_token, headers=hf_headers)
                         # print(f"{model}: Fetching endpoints of HF Space")
                         # assume the index is one of the first 9 return params
         except:
             print(f"{model}: [WARN] Unable to resample audio")
             pass
+        # if model in AVAILABLE_MODELS.keys(): model = AVAILABLE_MODELS[model]
         result_storage[model] = result
     def _get_param_examples(parameters):
     mdl1k = mdl1
     mdl2k = mdl2
     print(mdl1k, mdl2k)
+    # if mdl1 in AVAILABLE_MODELS.keys(): mdl1k=AVAILABLE_MODELS[mdl1]
+    # if mdl2 in AVAILABLE_MODELS.keys(): mdl2k=AVAILABLE_MODELS[mdl2]
     results = {}
     print(f"Sending models {mdl1k} and {mdl2k} to API")
                 result = f.name
         except:
             pass
+        # if model in AVAILABLE_MODELS.keys(): model = AVAILABLE_MODELS[model]
         print(model)
         print(f"Running model {model}")
         result_storage[model] = result
     mdl1k = mdl1
     mdl2k = mdl2
     print(mdl1k, mdl2k)
+    # if mdl1 in AVAILABLE_MODELS.keys(): mdl1k=AVAILABLE_MODELS[mdl1]
+    # if mdl2 in AVAILABLE_MODELS.keys(): mdl2k=AVAILABLE_MODELS[mdl2]
     results = {}
     print(f"Sending models {mdl1k} and {mdl2k} to API")
     thread1 = threading.Thread(target=predict_and_update_result, args=(text, mdl1k, results))

test_tts_styletts_kokoro.py CHANGED Viewed

@@ -4,6 +4,7 @@ from gradio_client import Client, file
 client = Client("hexgrad/kokoro", hf_token=os.getenv('HF_TOKEN'))
 # endpoints = client.view_api(all_endpoints=True, print_info=False, return_format='dict')
 # print(endpoints)
 result = client.predict(
 		text='"I hate it when people lie to me."',
 		voice="af",
@@ -19,6 +20,7 @@ result = client.predict(
 		# 	3000, #trim
 		# 	False, #use_gpu; fast enough with multithreaded with CPU
 		# ],
 		api_name="/generate"
 )

 client = Client("hexgrad/kokoro", hf_token=os.getenv('HF_TOKEN'))
 # endpoints = client.view_api(all_endpoints=True, print_info=False, return_format='dict')
 # print(endpoints)
+key = os.getenv('KOKORO')
 result = client.predict(
 		text='"I hate it when people lie to me."',
 		voice="af",
 		# 	3000, #trim
 		# 	False, #use_gpu; fast enough with multithreaded with CPU
 		# ],
+        sk=key,
 		api_name="/generate"
 )