Spaces:
Running
Running
Added Fish Speech TTS
Browse files
app.py
CHANGED
@@ -72,6 +72,7 @@ AVAILABLE_MODELS = {
|
|
72 |
# 'coqui/CoquiTTS': 'coqui/CoquiTTS',
|
73 |
'LeeSangHoon/HierSpeech_TTS': 'LeeSangHoon/HierSpeech_TTS', # 4.29.0
|
74 |
'mrfakename/MeloTTS': 'mrfakename/MeloTTS', # 4.29.0
|
|
|
75 |
|
76 |
# Parler
|
77 |
'parler-tts/parler_tts': 'parler-tts/parler_tts', # 4.29.0 4.42.0
|
@@ -186,6 +187,13 @@ HF_SPACES = {
|
|
186 |
'return_audio_index': 0,
|
187 |
},
|
188 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
189 |
# TTS w issues
|
190 |
# 'PolyAI/pheme': '/predict#0', #sleepy HF Space
|
191 |
# 'amphion/Text-to-Speech': '/predict#0', #takes a whole minute to synthesize
|
@@ -200,6 +208,7 @@ HF_SPACES = {
|
|
200 |
# for zero-shot TTS - voice sample of Scarlett Johansson
|
201 |
DEFAULT_VOICE_SAMPLE_STR = 'https://cdn-uploads.huggingface.co/production/uploads/641de0213239b631552713e4/iKHHqWxWy6Zfmp6QP6CZZ.wav'
|
202 |
DEFAULT_VOICE_SAMPLE = file(DEFAULT_VOICE_SAMPLE_STR)
|
|
|
203 |
|
204 |
OVERRIDE_INPUTS = {
|
205 |
'coqui/xtts': {
|
@@ -269,6 +278,19 @@ OVERRIDE_INPUTS = {
|
|
269 |
2: 0, # pace rate
|
270 |
3: 0, # pitch
|
271 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
272 |
}
|
273 |
|
274 |
hf_clients = {}
|
@@ -693,7 +715,7 @@ def downvote_model(model, uname):
|
|
693 |
cursor.close()
|
694 |
|
695 |
def a_is_better(model1, model2, userid):
|
696 |
-
print("A is better", model1, model2)
|
697 |
if not model1 in AVAILABLE_MODELS.keys() and not model1 in AVAILABLE_MODELS.values():
|
698 |
raise gr.Error('Sorry, please try voting again.')
|
699 |
userid = mkuuid(userid)
|
@@ -708,7 +730,7 @@ def a_is_better(model1, model2, userid):
|
|
708 |
downvote_model(model2, str(userid))
|
709 |
return reload(model1, model2, userid, chose_a=True)
|
710 |
def b_is_better(model1, model2, userid):
|
711 |
-
print("B is better", model1, model2)
|
712 |
if not model1 in AVAILABLE_MODELS.keys() and not model1 in AVAILABLE_MODELS.values():
|
713 |
raise gr.Error('Sorry, please try voting again.')
|
714 |
userid = mkuuid(userid)
|
@@ -931,7 +953,13 @@ def synthandreturn(text):
|
|
931 |
results = mdl_space.predict(*space_inputs, api_name=api_name, fn_index=fn_index)
|
932 |
|
933 |
# return path to audio
|
934 |
-
result = results
|
|
|
|
|
|
|
|
|
|
|
|
|
935 |
else:
|
936 |
# Use the private HF Space
|
937 |
result = router.predict(text, AVAILABLE_MODELS[model].lower(), api_name="/synthesize")
|
@@ -969,6 +997,7 @@ def synthandreturn(text):
|
|
969 |
os.unlink(result)
|
970 |
result = f.name
|
971 |
except:
|
|
|
972 |
pass
|
973 |
if model in AVAILABLE_MODELS.keys(): model = AVAILABLE_MODELS[model]
|
974 |
result_storage[model] = result
|
@@ -1104,7 +1133,7 @@ def unlock_vote(btn_index, aplayed, bplayed):
|
|
1104 |
|
1105 |
# both audio samples played
|
1106 |
if bool(aplayed) and bool(bplayed):
|
1107 |
-
print('Both audio samples played, voting unlocked')
|
1108 |
return [gr.update(interactive=True), gr.update(interactive=True), True, True]
|
1109 |
|
1110 |
return [gr.update(), gr.update(), aplayed, bplayed]
|
@@ -1309,7 +1338,7 @@ with gr.Blocks() as vote:
|
|
1309 |
.then(
|
1310 |
None,
|
1311 |
inputs=[bplayed],
|
1312 |
-
js="(b) =>
|
1313 |
)
|
1314 |
# autoplay if unplayed
|
1315 |
aud2\
|
|
|
72 |
# 'coqui/CoquiTTS': 'coqui/CoquiTTS',
|
73 |
'LeeSangHoon/HierSpeech_TTS': 'LeeSangHoon/HierSpeech_TTS', # 4.29.0
|
74 |
'mrfakename/MeloTTS': 'mrfakename/MeloTTS', # 4.29.0
|
75 |
+
'fishaudio/fish-speech-1': 'fishaudio/fish-speech-1', # 4.29.0
|
76 |
|
77 |
# Parler
|
78 |
'parler-tts/parler_tts': 'parler-tts/parler_tts', # 4.29.0 4.42.0
|
|
|
187 |
'return_audio_index': 0,
|
188 |
},
|
189 |
|
190 |
+
'fishaudio/fish-speech-1': {
|
191 |
+
'name': 'Fish Speech',
|
192 |
+
'function': '/inference_wrapper',
|
193 |
+
'text_param_index': 0,
|
194 |
+
'return_audio_index': 1,
|
195 |
+
},
|
196 |
+
|
197 |
# TTS w issues
|
198 |
# 'PolyAI/pheme': '/predict#0', #sleepy HF Space
|
199 |
# 'amphion/Text-to-Speech': '/predict#0', #takes a whole minute to synthesize
|
|
|
208 |
# for zero-shot TTS - voice sample of Scarlett Johansson
|
209 |
DEFAULT_VOICE_SAMPLE_STR = 'https://cdn-uploads.huggingface.co/production/uploads/641de0213239b631552713e4/iKHHqWxWy6Zfmp6QP6CZZ.wav'
|
210 |
DEFAULT_VOICE_SAMPLE = file(DEFAULT_VOICE_SAMPLE_STR)
|
211 |
+
DEFAULT_VOICE_TRANSCRIPT = "In the first half of the 20th century, science fiction familiarized the world with the concept of artificially intelligent robots. It began with the “heartless” Tin man from the Wizard of Oz and continued with the humanoid robot that impersonated Maria in Metropolis. By the 1950s, we had a generation of scientists, mathematicians, and philosophers with the concept of artificial intelligence (or AI) culturally assimilated in their minds."
|
212 |
|
213 |
OVERRIDE_INPUTS = {
|
214 |
'coqui/xtts': {
|
|
|
278 |
2: 0, # pace rate
|
279 |
3: 0, # pitch
|
280 |
},
|
281 |
+
|
282 |
+
'fishaudio/fish-speech-1': {
|
283 |
+
1: True, # enable_reference_audio
|
284 |
+
2: DEFAULT_VOICE_SAMPLE, # reference_audio
|
285 |
+
3: DEFAULT_VOICE_TRANSCRIPT, # reference_text
|
286 |
+
4: 1024, # max_new_tokens
|
287 |
+
5: 200, # chunk_length
|
288 |
+
6: 0.7, # top_p
|
289 |
+
7: 1.2, # repetition_penalty
|
290 |
+
8: 0.7, # temperature
|
291 |
+
9: 1, # batch_infer_num
|
292 |
+
10: False, # if_load_asr_model
|
293 |
+
},
|
294 |
}
|
295 |
|
296 |
hf_clients = {}
|
|
|
715 |
cursor.close()
|
716 |
|
717 |
def a_is_better(model1, model2, userid):
|
718 |
+
# print("A is better", model1, model2)
|
719 |
if not model1 in AVAILABLE_MODELS.keys() and not model1 in AVAILABLE_MODELS.values():
|
720 |
raise gr.Error('Sorry, please try voting again.')
|
721 |
userid = mkuuid(userid)
|
|
|
730 |
downvote_model(model2, str(userid))
|
731 |
return reload(model1, model2, userid, chose_a=True)
|
732 |
def b_is_better(model1, model2, userid):
|
733 |
+
# print("B is better", model1, model2)
|
734 |
if not model1 in AVAILABLE_MODELS.keys() and not model1 in AVAILABLE_MODELS.values():
|
735 |
raise gr.Error('Sorry, please try voting again.')
|
736 |
userid = mkuuid(userid)
|
|
|
953 |
results = mdl_space.predict(*space_inputs, api_name=api_name, fn_index=fn_index)
|
954 |
|
955 |
# return path to audio
|
956 |
+
result = results
|
957 |
+
if (not isinstance(results, str)):
|
958 |
+
# results is not a bare filepath string, so select the entry at return_audio_index
|
959 |
+
result = results[return_audio_index]
|
960 |
+
if (isinstance(result, dict)):
|
961 |
+
# the selected entry is a dictionary; take its 'value' field
|
962 |
+
result = results[return_audio_index]['value']
|
963 |
else:
|
964 |
# Use the private HF Space
|
965 |
result = router.predict(text, AVAILABLE_MODELS[model].lower(), api_name="/synthesize")
|
|
|
997 |
os.unlink(result)
|
998 |
result = f.name
|
999 |
except:
|
1000 |
+
print(f"{model}: [WARN] Unable to resample audio")
|
1001 |
pass
|
1002 |
if model in AVAILABLE_MODELS.keys(): model = AVAILABLE_MODELS[model]
|
1003 |
result_storage[model] = result
|
|
|
1133 |
|
1134 |
# both audio samples played
|
1135 |
if bool(aplayed) and bool(bplayed):
|
1136 |
+
# print('Both audio samples played, voting unlocked')
|
1137 |
return [gr.update(interactive=True), gr.update(interactive=True), True, True]
|
1138 |
|
1139 |
return [gr.update(), gr.update(), aplayed, bplayed]
|
|
|
1338 |
.then(
|
1339 |
None,
|
1340 |
inputs=[bplayed],
|
1341 |
+
js="(b) => b ? 0 : document.querySelector('.stretch .gap+.gap button.play-pause-button').click()",
|
1342 |
)
|
1343 |
# autoplay if unplayed
|
1344 |
aud2\
|